"""Validate unified JAO data and engineered features. Checks: 1. Timeline: hourly, no gaps, sorted 2. Feature completeness: null percentages 3. Data leakage: future data not in historical features 4. Summary statistics Author: Claude Date: 2025-11-06 """ import polars as pl from pathlib import Path print("\n" + "=" * 80) print("JAO DATA VALIDATION") print("=" * 80) # ========================================================================= # 1. Load datasets # ========================================================================= print("\nLoading datasets...") unified_path = Path('data/processed/unified_jao_24month.parquet') cnec_path = Path('data/processed/cnec_hourly_24month.parquet') features_path = Path('data/processed/features_jao_24month.parquet') unified = pl.read_parquet(unified_path) cnec = pl.read_parquet(cnec_path) features = pl.read_parquet(features_path) print(f" Unified JAO: {unified.shape}") print(f" CNEC hourly: {cnec.shape}") print(f" Features: {features.shape}") # ========================================================================= # 2. Timeline Validation # ========================================================================= print("\n" + "-" * 80) print("[1/4] TIMELINE VALIDATION") print("-" * 80) # Check sorted is_sorted = unified['mtu'].is_sorted() print(f" Timeline sorted: {'[PASS]' if is_sorted else '[FAIL]'}") # Check for gaps (should be hourly) time_diffs = unified['mtu'].diff().drop_nulls() most_common_diff = time_diffs.mode()[0] hourly_expected = most_common_diff.total_seconds() == 3600 print(f" Most common time diff: {most_common_diff}") print(f" Hourly intervals: {'[PASS]' if hourly_expected else '[FAIL]'}") # Date range min_date = unified['mtu'].min() max_date = unified['mtu'].max() print(f" Date range: {min_date} to {max_date}") print(f" Total hours: {len(unified):,}") # Expected: Oct 2023 to Sept 2025 = ~24 months # After deduplication: 17,544 hours (729.75 days = ~24 months) expected_days = (max_date - min_date).days + 1 print(f" Days covered: {expected_days} (~{expected_days / 30:.1f} months)") # ========================================================================= # 3. 
# =========================================================================
# 3. Feature completeness
# =========================================================================
print("\n" + "-" * 80)
print("[2/4] FEATURE COMPLETENESS")
print("-" * 80)

# Count features by category (prefix conventions from the feature-engineering step)
cnec_t1_cols = [c for c in features.columns if c.startswith('cnec_t1_')]
cnec_t2_cols = [c for c in features.columns if c.startswith('cnec_t2_')]
lta_cols = [c for c in features.columns if c.startswith('lta_')]
temporal_cols = [c for c in features.columns if c in [
    'hour', 'day', 'month', 'weekday', 'year', 'is_weekend',
    'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
    'weekday_sin', 'weekday_cos',
]]
target_cols = [c for c in features.columns if c.startswith('target_')]

print(f"  Tier-1 CNEC features: {len(cnec_t1_cols)}")
print(f"  Tier-2 CNEC features: {len(cnec_t2_cols)}")
print(f"  LTA features: {len(lta_cols)}")
print(f"  Temporal features: {len(temporal_cols)}")
print(f"  Target variables: {len(target_cols)}")
print(f"  Total features: {features.shape[1] - 1} (excluding mtu)")

# Null percentages by category
print("\n  Null percentages:")
for label, cols in [
    ('Tier-1 CNEC', cnec_t1_cols),
    ('Tier-2 CNEC', cnec_t2_cols),
    ('LTA', lta_cols),
    ('Temporal', temporal_cols),
    ('Targets', target_cols),
]:
    if not cols:  # avoid division by zero for empty categories
        print(f"    {label}: no columns found")
        continue
    nulls = features.select(cols).null_count().sum_horizontal()[0]
    cells = len(features) * len(cols)
    print(f"    {label}: {nulls / cells * 100:.2f}% nulls")

# Overall null percentage
total_nulls = features.null_count().sum_horizontal()[0]
total_cells = len(features) * len(features.columns)
overall_null_pct = total_nulls / total_cells * 100
print(f"\n  Overall null percentage: {overall_null_pct:.2f}%")
if overall_null_pct < 60:
    print("  Completeness: [PASS] (<60% nulls)")
else:
    print(f"  Completeness: [WARNING] ({overall_null_pct:.1f}% nulls, threshold 60%)")

# =========================================================================
# 4. Data leakage check
# =========================================================================
print("\n" + "-" * 80)
print("[3/4] DATA LEAKAGE CHECK")
print("-" * 80)

# LTA values are future covariates (known in advance), so the border_* columns
# of the unified table should contain NO nulls
lta_null_count = (
    unified
    .select([c for c in unified.columns if c.startswith('border_')])
    .null_count()
    .sum_horizontal()[0]
)
print(f"  LTA nulls: {lta_null_count}")
if lta_null_count == 0:
    print("  LTA future covariates: [PASS] (no nulls)")
else:
    print(f"  LTA future covariates: [WARNING] ({lta_null_count} nulls)")

# Historical features are lagged; shift() creates nulls at the start of the
# series. This only confirms lag features exist by their '_L<n>' suffix; the
# sketch below spot-checks that the nulls sit at the beginning only.
has_lag_features = any('_L' in c for c in features.columns)
if has_lag_features:
    print("  Historical lag features: [PRESENT] (nulls expected at start)")
else:
    print("  Historical lag features: [WARNING] (no lag features found)")
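# --- Optional sketch: verify lag nulls sit only at the series start ------
# A hypothetical follow-up to the presence check above: for a sample of lag
# columns, confirm all nulls are leading rows (pure shift artefacts) rather
# than holes mid-series. Assumes the '_L<n>' suffix convention used above.
for col in [c for c in features.columns if '_L' in c][:5]:
    null_mask = features[col].is_null()
    n_nulls = int(null_mask.sum())
    # If the first n_nulls rows are all null, every null is a leading one.
    leading_only = bool(null_mask.head(n_nulls).all()) if n_nulls else True
    status = '[OK]' if leading_only else '[WARNING] nulls beyond series start'
    print(f"  {col}: {n_nulls} nulls {status}")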
# =========================================================================
# 5. Summary statistics
# =========================================================================
print("\n" + "-" * 80)
print("[4/4] SUMMARY STATISTICS")
print("-" * 80)

print("\nUnified JAO Data:")
print(f"  Rows: {len(unified):,}")
print(f"  Columns: {len(unified.columns)}")
# MaxBEX and LTA both live in border_* columns; keep the two filters disjoint
# (assumes LTA columns carry 'lta' in their name, as the MaxBEX filter implies)
maxbex_cols = [c for c in unified.columns if c.startswith('border_') and 'lta' not in c.lower()]
lta_border_cols = [c for c in unified.columns if c.startswith('border_') and 'lta' in c.lower()]
print(f"  MaxBEX borders: {len(maxbex_cols)}")
print(f"  LTA borders: {len(lta_border_cols)}")
print(f"  Net Positions: {len([c for c in unified.columns if c.startswith('netpos_')])}")

print("\nCNEC Hourly Data:")
print(f"  Total CNEC records: {len(cnec):,}")
print(f"  Unique CNECs: {cnec['cnec_eic'].n_unique()}")
print(f"  Unique timestamps: {cnec['mtu'].n_unique():,}")
print(f"  CNECs per timestamp: {len(cnec) / cnec['mtu'].n_unique():.1f} avg")

print("\nFeature Engineering:")
print(f"  Total features: {features.shape[1] - 1}")
print(f"  Feature rows: {len(features):,}")
print(f"  File size: {features_path.stat().st_size / (1024 ** 2):.2f} MB")

# =========================================================================
# Validation summary
# =========================================================================
print("\n" + "=" * 80)
print("VALIDATION SUMMARY")
print("=" * 80)

checks_passed = 0
total_checks = 4

# Timeline check
if is_sorted and hourly_expected:
    print("  [PASS] Timeline validation PASSED")
    checks_passed += 1
else:
    print("  [FAIL] Timeline validation FAILED")

# Feature completeness check
if overall_null_pct < 60:
    print("  [PASS] Feature completeness PASSED")
    checks_passed += 1
else:
    print("  [WARNING] Feature completeness WARNING (high nulls)")

# Data leakage check
if lta_null_count == 0 and has_lag_features:
    print("  [PASS] Data leakage check PASSED")
    checks_passed += 1
else:
    print("  [WARNING] Data leakage check WARNING")

# Data consistency: features must align row-for-row with the unified timeline
if len(unified) == len(features):
    print("  [PASS] Data consistency PASSED")
    checks_passed += 1
else:
    print("  [FAIL] Data consistency FAILED (row mismatch)")

print(f"\nChecks passed: {checks_passed}/{total_checks}")
if checks_passed == total_checks:
    print("\n[SUCCESS] All validation checks PASSED")
elif checks_passed >= total_checks - 1:
    print("\n[WARNING] Minor issues detected")
else:
    print("\n[FAILURE] Critical issues detected")

print("=" * 80)
print()
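# --- Optional sketch: non-zero exit status for pipeline gating -----------
# Not part of the original script. If this validator runs under CI or an
# orchestrator, exiting non-zero on critical failures lets downstream steps
# stop early; the threshold mirrors the [FAILURE] branch above.
import sys

if checks_passed < total_checks - 1:
    sys.exit(1)  # critical issues detected: signal failure to the caller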