"""Validate updated JAO data collection results. Compares old vs new column selection and validates transformations. """ import sys from pathlib import Path import polars as pl # Add src to path sys.path.insert(0, str(Path(__file__).parent.parent / 'src')) def main(): """Validate updated JAO collection.""" print("\n" + "=" * 80) print("JAO COLLECTION UPDATE VALIDATION") print("=" * 80) # Load updated data updated_cnec = pl.read_parquet("data/raw/sample_updated/jao_cnec_sample.parquet") updated_maxbex = pl.read_parquet("data/raw/sample_updated/jao_maxbex_sample.parquet") updated_lta = pl.read_parquet("data/raw/sample_updated/jao_lta_sample.parquet") # Load original data (if exists) try: original_cnec = pl.read_parquet("data/raw/sample/jao_cnec_sample.parquet") has_original = True except: has_original = False original_cnec = None print("\n## 1. COLUMN COUNT COMPARISON") print("-" * 80) if has_original: print(f"Original CNEC columns: {original_cnec.shape[1]}") print(f"Updated CNEC columns: {updated_cnec.shape[1]}") print(f"Reduction: {original_cnec.shape[1] - updated_cnec.shape[1]} columns removed") print(f"Reduction %: {100 * (original_cnec.shape[1] - updated_cnec.shape[1]) / original_cnec.shape[1]:.1f}%") else: print(f"Updated CNEC columns: {updated_cnec.shape[1]}") print("(Original data not available for comparison)") print("\n## 2. NEW COLUMNS VALIDATION") print("-" * 80) new_cols_expected = ['fuaf', 'frm', 'shadow_price_log'] for col in new_cols_expected: if col in updated_cnec.columns: print(f"[OK] {col}: PRESENT") # Stats col_data = updated_cnec[col] null_count = col_data.null_count() null_pct = 100 * null_count / len(col_data) print(f" - Records: {len(col_data)}") print(f" - Nulls: {null_count} ({null_pct:.1f}%)") print(f" - Min: {col_data.min():.4f}") print(f" - Max: {col_data.max():.4f}") print(f" - Mean: {col_data.mean():.4f}") else: print(f"[FAIL] {col}: MISSING") print("\n## 3. REMOVED COLUMNS VALIDATION") print("-" * 80) removed_cols_expected = ['hubFrom', 'hubTo', 'f0all', 'amr', 'lta_margin'] all_removed = True for col in removed_cols_expected: if col in updated_cnec.columns: print(f"[FAIL] {col}: STILL PRESENT (should be removed)") all_removed = False else: print(f"[OK] {col}: Removed") if all_removed: print("\n[OK] All expected columns successfully removed") print("\n## 4. SHADOW PRICE LOG TRANSFORM VALIDATION") print("-" * 80) if 'shadow_price' in updated_cnec.columns and 'shadow_price_log' in updated_cnec.columns: sp = updated_cnec['shadow_price'] sp_log = updated_cnec['shadow_price_log'] print(f"Shadow price (original):") print(f" - Range: [{sp.min():.2f}, {sp.max():.2f}] EUR/MW") print(f" - 99th percentile: {sp.quantile(0.99):.2f} EUR/MW") print(f" - Values >1000: {(sp > 1000).sum()} (should be uncapped)") print(f"\nShadow price (log-transformed):") print(f" - Range: [{sp_log.min():.4f}, {sp_log.max():.4f}]") print(f" - Mean: {sp_log.mean():.4f}") print(f" - Std: {sp_log.std():.4f}") # Verify log transform correctness import numpy as np manual_log = (sp + 1).log() max_diff = (sp_log - manual_log).abs().max() if max_diff < 0.001: print(f"\n[OK] Log transform verified correct (max diff: {max_diff:.6f})") else: print(f"\n[WARN] Log transform may have issues (max diff: {max_diff:.6f})") print("\n## 5. DATA QUALITY CHECKS") print("-" * 80) # Check RAM clipping if 'ram' in updated_cnec.columns and 'fmax' in updated_cnec.columns: ram = updated_cnec['ram'] fmax = updated_cnec['fmax'] negative_ram = (ram < 0).sum() ram_exceeds_fmax = (ram > fmax).sum() print(f"RAM quality:") print(f" - Negative values: {negative_ram} (should be 0)") print(f" - RAM > fmax: {ram_exceeds_fmax} (should be 0)") if negative_ram == 0 and ram_exceeds_fmax == 0: print(f" [OK] RAM properly clipped to [0, fmax]") else: print(f" [WARN] RAM clipping may have issues") # Check PTDF clipping ptdf_cols = [col for col in updated_cnec.columns if col.startswith('ptdf_')] if ptdf_cols: ptdf_issues = 0 for col in ptdf_cols: ptdf_data = updated_cnec[col] out_of_range = ((ptdf_data < -1.5) | (ptdf_data > 1.5)).sum() if out_of_range > 0: ptdf_issues += 1 print(f"\nPTDF quality:") print(f" - Columns checked: {len(ptdf_cols)}") print(f" - Columns with out-of-range values: {ptdf_issues}") if ptdf_issues == 0: print(f" [OK] All PTDFs properly clipped to [-1.5, +1.5]") else: print(f" [WARN] Some PTDFs have out-of-range values") print("\n## 6. LTA DATA VALIDATION") print("-" * 80) print(f"LTA records: {updated_lta.shape[0]}") print(f"LTA columns: {updated_lta.shape[1]}") print(f"LTA columns: {', '.join(updated_lta.columns[:10])}...") # Check if LTA has actual data (not all zeros) numeric_cols = [col for col in updated_lta.columns if updated_lta[col].dtype in [pl.Float64, pl.Float32, pl.Int64, pl.Int32]] if numeric_cols: # Check if any numeric column has non-zero values has_data = False for col in numeric_cols[:5]: # Check first 5 numeric columns if updated_lta[col].sum() != 0: has_data = True break if has_data: print(f"[OK] LTA contains actual allocation data") else: print(f"[WARN] LTA data may be all zeros") print("\n## 7. FILE SIZE COMPARISON") print("-" * 80) updated_cnec_size = Path("data/raw/sample_updated/jao_cnec_sample.parquet").stat().st_size updated_maxbex_size = Path("data/raw/sample_updated/jao_maxbex_sample.parquet").stat().st_size updated_lta_size = Path("data/raw/sample_updated/jao_lta_sample.parquet").stat().st_size print(f"Updated CNEC file: {updated_cnec_size / 1024:.1f} KB") print(f"Updated MaxBEX file: {updated_maxbex_size / 1024:.1f} KB") print(f"Updated LTA file: {updated_lta_size / 1024:.1f} KB") print(f"Total: {(updated_cnec_size + updated_maxbex_size + updated_lta_size) / 1024:.1f} KB") if has_original: original_cnec_size = Path("data/raw/sample/jao_cnec_sample.parquet").stat().st_size reduction = 100 * (original_cnec_size - updated_cnec_size) / original_cnec_size print(f"\nCNEC file size reduction: {reduction:.1f}%") print("\n" + "=" * 80) print("VALIDATION COMPLETE") print("=" * 80) if __name__ == "__main__": main()