# (scrape artifact removed: "Spaces:" / "Sleeping" page-status lines from the capture)
| """Validate unified JAO data and engineered features. | |
| Checks: | |
| 1. Timeline: hourly, no gaps, sorted | |
| 2. Feature completeness: null percentages | |
| 3. Data leakage: future data not in historical features | |
| 4. Summary statistics | |
| Author: Claude | |
| Date: 2025-11-06 | |
| """ | |
| import polars as pl | |
| from pathlib import Path | |
| print("\n" + "=" * 80) | |
| print("JAO DATA VALIDATION") | |
| print("=" * 80) | |
| # ========================================================================= | |
| # 1. Load datasets | |
| # ========================================================================= | |
| print("\nLoading datasets...") | |
| unified_path = Path('data/processed/unified_jao_24month.parquet') | |
| cnec_path = Path('data/processed/cnec_hourly_24month.parquet') | |
| features_path = Path('data/processed/features_jao_24month.parquet') | |
| unified = pl.read_parquet(unified_path) | |
| cnec = pl.read_parquet(cnec_path) | |
| features = pl.read_parquet(features_path) | |
| print(f" Unified JAO: {unified.shape}") | |
| print(f" CNEC hourly: {cnec.shape}") | |
| print(f" Features: {features.shape}") | |
# =========================================================================
# 2. Timeline Validation
# =========================================================================
print("\n" + "-" * 80)
print("[1/4] TIMELINE VALIDATION")
print("-" * 80)

# 'mtu' (market time unit) is the hourly timestamp index of the dataset.
is_sorted = unified['mtu'].is_sorted()
print(f" Timeline sorted: {'[PASS]' if is_sorted else '[FAIL]'}")

# Gap detection: every consecutive pair of timestamps must be exactly one
# hour apart. Checking only the *most common* diff (as before) lets a
# timeline with isolated gaps pass, so also count irregular steps.
time_diffs = unified['mtu'].diff().drop_nulls()
most_common_diff = time_diffs.mode()[0]
n_irregular = int((time_diffs.dt.total_seconds() != 3600).sum())
hourly_expected = most_common_diff.total_seconds() == 3600 and n_irregular == 0
print(f" Most common time diff: {most_common_diff}")
print(f" Irregular intervals: {n_irregular}")
print(f" Hourly intervals: {'[PASS]' if hourly_expected else '[FAIL]'}")

# Date range covered by the unified frame.
min_date = unified['mtu'].min()
max_date = unified['mtu'].max()
print(f" Date range: {min_date} to {max_date}")
print(f" Total hours: {len(unified):,}")

# Expected: Oct 2023 to Sept 2025 = ~24 months
# After deduplication: 17,544 hours (729.75 days = ~24 months)
expected_days = (max_date - min_date).days + 1
print(f" Days covered: {expected_days} (~{expected_days / 30:.1f} months)")
# =========================================================================
# 3. Feature Completeness
# =========================================================================
print("\n" + "-" * 80)
print("[2/4] FEATURE COMPLETENESS")
print("-" * 80)

# Feature columns are grouped by naming convention (prefix / known names).
cnec_t1_cols = [c for c in features.columns if c.startswith('cnec_t1_')]
cnec_t2_cols = [c for c in features.columns if c.startswith('cnec_t2_')]
lta_cols = [c for c in features.columns if c.startswith('lta_')]
temporal_cols = [c for c in features.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
target_cols = [c for c in features.columns if c.startswith('target_')]

print(f" Tier-1 CNEC features: {len(cnec_t1_cols)}")
print(f" Tier-2 CNEC features: {len(cnec_t2_cols)}")
print(f" LTA features: {len(lta_cols)}")
print(f" Temporal features: {len(temporal_cols)}")
print(f" Target variables: {len(target_cols)}")
print(f" Total features: {features.shape[1] - 1} (excluding mtu)")


def _null_pct(df, cols):
    """Percentage of null cells across *cols* of *df*; 0.0 when *cols* is empty.

    Guards against ZeroDivisionError when a feature category has no columns
    (the previous inline arithmetic would crash in that case).
    """
    if not cols:
        return 0.0
    null_cells = df.select(cols).null_count().sum_horizontal()[0]
    return null_cells / (len(df) * len(cols)) * 100


print("\n Null percentages:")
print(f" Tier-1 CNEC: {_null_pct(features, cnec_t1_cols):.2f}% nulls")
print(f" Tier-2 CNEC: {_null_pct(features, cnec_t2_cols):.2f}% nulls")
print(f" LTA: {_null_pct(features, lta_cols):.2f}% nulls")
print(f" Temporal: {_null_pct(features, temporal_cols):.2f}% nulls")
print(f" Targets: {_null_pct(features, target_cols):.2f}% nulls")

# Overall null percentage across every cell of the feature frame.
total_nulls = features.null_count().sum_horizontal()[0]
total_cells = len(features) * len(features.columns)
overall_null_pct = total_nulls / total_cells * 100
print(f"\n Overall null percentage: {overall_null_pct:.2f}%")

# 60% is the accepted ceiling: lagged features are null at the start of the
# series, so a high-but-bounded null rate is expected rather than alarming.
if overall_null_pct < 60:
    print(" Completeness: [PASS] (<60% nulls)")
else:
    # The old message printed ">{pct}% nulls", which misread as a bound;
    # report the actual percentage against the 60% threshold instead.
    print(f" Completeness: [WARNING] ({overall_null_pct:.1f}% nulls, threshold 60%)")
# =========================================================================
# 4. Data Leakage Check
# =========================================================================
print("\n" + "-" * 80)
print("[3/4] DATA LEAKAGE CHECK")
print("-" * 80)

# LTA values are treated as future covariates (known in advance), so they
# should contain no nulls anywhere in the timeline.
# NOTE(review): LTA columns are identified here by the 'border_' prefix in
# the unified frame, while the features frame uses 'lta_' — confirm this
# matches the unified schema and is not counting non-LTA border columns.
_lta_cols = [c for c in unified.columns if c.startswith('border_')]
# Guard the empty-column case: sum_horizontal over zero columns would fail.
lta_null_count = unified.select(_lta_cols).null_count().sum_horizontal()[0] if _lta_cols else 0
print(f" LTA nulls: {lta_null_count}")
if lta_null_count == 0:
    print(" LTA future covariates: [PASS] (no nulls)")
else:
    print(f" LTA future covariates: [WARNING] ({lta_null_count} nulls)")

# Historical features are lagged (shift creates nulls at the series start);
# nulls at the beginning are expected and are NOT leakage. Lag features are
# recognized by an '_L' marker in the column name.
has_lag_features = any('_L' in c for c in features.columns)
if has_lag_features:
    print(" Historical lag features: [PRESENT] (nulls expected at start)")
else:
    print(" Historical lag features: [WARNING] (no lag features found)")
# =========================================================================
# 5. Summary Statistics
# =========================================================================
print("\n" + "-" * 80)
print("[4/4] SUMMARY STATISTICS")
print("-" * 80)

print("\nUnified JAO Data:")
print(f" Rows: {len(unified):,}")
print(f" Columns: {len(unified.columns)}")
# NOTE(review): the two counts below overlap — 'LTA borders' counts every
# 'border_*' column, which by name is a superset of the MaxBEX selection
# above. Confirm the unified schema naming (border_* vs lta_*) and tighten.
print(f" MaxBEX borders: {len([c for c in unified.columns if 'border_' in c and 'lta' not in c.lower()])}")
print(f" LTA borders: {len([c for c in unified.columns if c.startswith('border_')])}")
print(f" Net Positions: {len([c for c in unified.columns if c.startswith('netpos_')])}")

print("\nCNEC Hourly Data:")
print(f" Total CNEC records: {len(cnec):,}")
print(f" Unique CNECs: {cnec['cnec_eic'].n_unique()}")
print(f" Unique timestamps: {cnec['mtu'].n_unique():,}")
print(f" CNECs per timestamp: {len(cnec) / cnec['mtu'].n_unique():.1f} avg")

print("\nFeature Engineering:")
print(f" Total features: {features.shape[1] - 1}")
print(f" Feature rows: {len(features):,}")
print(f" File size: {features_path.stat().st_size / (1024**2):.2f} MB")
# =========================================================================
# Validation Summary
# =========================================================================
print("\n" + "=" * 80)
print("VALIDATION SUMMARY")
print("=" * 80)

checks_passed = 0
total_checks = 4

# 1. Timeline: sorted and strictly hourly (flags set in section [1/4]).
if is_sorted and hourly_expected:
    print(" [PASS] Timeline validation PASSED")
    checks_passed += 1
else:
    print(" [FAIL] Timeline validation FAILED")

# 2. Completeness: overall null rate below the 60% ceiling (section [2/4]).
if overall_null_pct < 60:
    print(" [PASS] Feature completeness PASSED")
    checks_passed += 1
else:
    print(" [WARNING] Feature completeness WARNING (high nulls)")

# 3. Leakage: future covariates fully populated AND lag features present
#    (section [3/4]).
if lta_null_count == 0 and has_lag_features:
    print(" [PASS] Data leakage check PASSED")
    checks_passed += 1
else:
    print(" [WARNING] Data leakage check WARNING")

# 4. Consistency: feature frame aligned row-for-row with the unified frame.
if len(unified) == len(features):
    print(" [PASS] Data consistency PASSED")
    checks_passed += 1
else:
    print(" [FAIL] Data consistency FAILED (row mismatch)")

print(f"\nChecks passed: {checks_passed}/{total_checks}")
if checks_passed == total_checks:
    print("\n[SUCCESS] All validation checks PASSED")
elif checks_passed >= total_checks - 1:
    print("\n[WARNING] Minor issues detected")
else:
    print("\n[FAILURE] Critical issues detected")
print("=" * 80)
print()