| """Validate updated JAO data collection results. | |
| Compares old vs new column selection and validates transformations. | |
| """ | |
import sys
from pathlib import Path

import polars as pl

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
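# Note: the checks below only use polars directly; the src/ path entry exists
# for any project-local imports this validation may rely on in the wider repo.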


def main():
    """Validate updated JAO collection."""
    print("\n" + "=" * 80)
    print("JAO COLLECTION UPDATE VALIDATION")
    print("=" * 80)

    # Load updated data
    updated_cnec = pl.read_parquet("data/raw/sample_updated/jao_cnec_sample.parquet")
    updated_maxbex = pl.read_parquet("data/raw/sample_updated/jao_maxbex_sample.parquet")
    updated_lta = pl.read_parquet("data/raw/sample_updated/jao_lta_sample.parquet")

    # Load original data (if it exists); a missing pre-update sample is expected
    # and simply disables the before/after comparison.
    try:
        original_cnec = pl.read_parquet("data/raw/sample/jao_cnec_sample.parquet")
        has_original = True
    except Exception:
        has_original = False
        original_cnec = None

| print("\n## 1. COLUMN COUNT COMPARISON") | |
| print("-" * 80) | |
| if has_original: | |
| print(f"Original CNEC columns: {original_cnec.shape[1]}") | |
| print(f"Updated CNEC columns: {updated_cnec.shape[1]}") | |
| print(f"Reduction: {original_cnec.shape[1] - updated_cnec.shape[1]} columns removed") | |
| print(f"Reduction %: {100 * (original_cnec.shape[1] - updated_cnec.shape[1]) / original_cnec.shape[1]:.1f}%") | |
| else: | |
| print(f"Updated CNEC columns: {updated_cnec.shape[1]}") | |
| print("(Original data not available for comparison)") | |
| print("\n## 2. NEW COLUMNS VALIDATION") | |
| print("-" * 80) | |
| new_cols_expected = ['fuaf', 'frm', 'shadow_price_log'] | |
| for col in new_cols_expected: | |
| if col in updated_cnec.columns: | |
| print(f"[OK] {col}: PRESENT") | |
| # Stats | |
| col_data = updated_cnec[col] | |
| null_count = col_data.null_count() | |
| null_pct = 100 * null_count / len(col_data) | |
| print(f" - Records: {len(col_data)}") | |
| print(f" - Nulls: {null_count} ({null_pct:.1f}%)") | |
| print(f" - Min: {col_data.min():.4f}") | |
| print(f" - Max: {col_data.max():.4f}") | |
| print(f" - Mean: {col_data.mean():.4f}") | |
| else: | |
| print(f"[FAIL] {col}: MISSING") | |
| print("\n## 3. REMOVED COLUMNS VALIDATION") | |
| print("-" * 80) | |
| removed_cols_expected = ['hubFrom', 'hubTo', 'f0all', 'amr', 'lta_margin'] | |
| all_removed = True | |
| for col in removed_cols_expected: | |
| if col in updated_cnec.columns: | |
| print(f"[FAIL] {col}: STILL PRESENT (should be removed)") | |
| all_removed = False | |
| else: | |
| print(f"[OK] {col}: Removed") | |
| if all_removed: | |
| print("\n[OK] All expected columns successfully removed") | |
| print("\n## 4. SHADOW PRICE LOG TRANSFORM VALIDATION") | |
| print("-" * 80) | |
| if 'shadow_price' in updated_cnec.columns and 'shadow_price_log' in updated_cnec.columns: | |
| sp = updated_cnec['shadow_price'] | |
| sp_log = updated_cnec['shadow_price_log'] | |
| print(f"Shadow price (original):") | |
| print(f" - Range: [{sp.min():.2f}, {sp.max():.2f}] EUR/MW") | |
| print(f" - 99th percentile: {sp.quantile(0.99):.2f} EUR/MW") | |
| print(f" - Values >1000: {(sp > 1000).sum()} (should be uncapped)") | |
| print(f"\nShadow price (log-transformed):") | |
| print(f" - Range: [{sp_log.min():.4f}, {sp_log.max():.4f}]") | |
| print(f" - Mean: {sp_log.mean():.4f}") | |
| print(f" - Std: {sp_log.std():.4f}") | |
| # Verify log transform correctness | |
| import numpy as np | |
| manual_log = (sp + 1).log() | |
| max_diff = (sp_log - manual_log).abs().max() | |
| if max_diff < 0.001: | |
| print(f"\n[OK] Log transform verified correct (max diff: {max_diff:.6f})") | |
| else: | |
| print(f"\n[WARN] Log transform may have issues (max diff: {max_diff:.6f})") | |
| print("\n## 5. DATA QUALITY CHECKS") | |
| print("-" * 80) | |
| # Check RAM clipping | |
| if 'ram' in updated_cnec.columns and 'fmax' in updated_cnec.columns: | |
| ram = updated_cnec['ram'] | |
| fmax = updated_cnec['fmax'] | |
| negative_ram = (ram < 0).sum() | |
| ram_exceeds_fmax = (ram > fmax).sum() | |
| print(f"RAM quality:") | |
| print(f" - Negative values: {negative_ram} (should be 0)") | |
| print(f" - RAM > fmax: {ram_exceeds_fmax} (should be 0)") | |
| if negative_ram == 0 and ram_exceeds_fmax == 0: | |
| print(f" [OK] RAM properly clipped to [0, fmax]") | |
| else: | |
| print(f" [WARN] RAM clipping may have issues") | |
    # Check PTDF clipping
    ptdf_cols = [col for col in updated_cnec.columns if col.startswith('ptdf_')]
    if ptdf_cols:
        ptdf_issues = 0
        for col in ptdf_cols:
            ptdf_data = updated_cnec[col]
            out_of_range = ((ptdf_data < -1.5) | (ptdf_data > 1.5)).sum()
            if out_of_range > 0:
                ptdf_issues += 1
        print("\nPTDF quality:")
        print(f" - Columns checked: {len(ptdf_cols)}")
        print(f" - Columns with out-of-range values: {ptdf_issues}")
        if ptdf_issues == 0:
            print(" [OK] All PTDFs properly clipped to [-1.5, +1.5]")
        else:
            print(" [WARN] Some PTDFs have out-of-range values")

| print("\n## 6. LTA DATA VALIDATION") | |
| print("-" * 80) | |
| print(f"LTA records: {updated_lta.shape[0]}") | |
| print(f"LTA columns: {updated_lta.shape[1]}") | |
| print(f"LTA columns: {', '.join(updated_lta.columns[:10])}...") | |
| # Check if LTA has actual data (not all zeros) | |
| numeric_cols = [col for col in updated_lta.columns | |
| if updated_lta[col].dtype in [pl.Float64, pl.Float32, pl.Int64, pl.Int32]] | |
| if numeric_cols: | |
| # Check if any numeric column has non-zero values | |
| has_data = False | |
| for col in numeric_cols[:5]: # Check first 5 numeric columns | |
| if updated_lta[col].sum() != 0: | |
| has_data = True | |
| break | |
| if has_data: | |
| print(f"[OK] LTA contains actual allocation data") | |
| else: | |
| print(f"[WARN] LTA data may be all zeros") | |
| print("\n## 7. FILE SIZE COMPARISON") | |
| print("-" * 80) | |
| updated_cnec_size = Path("data/raw/sample_updated/jao_cnec_sample.parquet").stat().st_size | |
| updated_maxbex_size = Path("data/raw/sample_updated/jao_maxbex_sample.parquet").stat().st_size | |
| updated_lta_size = Path("data/raw/sample_updated/jao_lta_sample.parquet").stat().st_size | |
| print(f"Updated CNEC file: {updated_cnec_size / 1024:.1f} KB") | |
| print(f"Updated MaxBEX file: {updated_maxbex_size / 1024:.1f} KB") | |
| print(f"Updated LTA file: {updated_lta_size / 1024:.1f} KB") | |
| print(f"Total: {(updated_cnec_size + updated_maxbex_size + updated_lta_size) / 1024:.1f} KB") | |
| if has_original: | |
| original_cnec_size = Path("data/raw/sample/jao_cnec_sample.parquet").stat().st_size | |
| reduction = 100 * (original_cnec_size - updated_cnec_size) / original_cnec_size | |
| print(f"\nCNEC file size reduction: {reduction:.1f}%") | |
| print("\n" + "=" * 80) | |
| print("VALIDATION COMPLETE") | |
| print("=" * 80) | |
| if __name__ == "__main__": | |
| main() | |