"""Validate unified JAO data and engineered features.
Checks:
1. Timeline: hourly, no gaps, sorted
2. Feature completeness: null percentages
3. Data leakage: future data not in historical features
4. Summary statistics
Author: Claude
Date: 2025-11-06
"""
import polars as pl
from pathlib import Path
print("\n" + "=" * 80)
print("JAO DATA VALIDATION")
print("=" * 80)
# =========================================================================
# 1. Load datasets
# =========================================================================
print("\nLoading datasets...")
unified_path = Path('data/processed/unified_jao_24month.parquet')
cnec_path = Path('data/processed/cnec_hourly_24month.parquet')
features_path = Path('data/processed/features_jao_24month.parquet')
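# Defensive sketch (not part of the original flow): fail fast with a clear
# message if any expected parquet file is missing.
for p in (unified_path, cnec_path, features_path):
    if not p.exists():
        raise FileNotFoundError(f"Expected parquet file not found: {p}")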
unified = pl.read_parquet(unified_path)
cnec = pl.read_parquet(cnec_path)
features = pl.read_parquet(features_path)
print(f" Unified JAO: {unified.shape}")
print(f" CNEC hourly: {cnec.shape}")
print(f" Features: {features.shape}")
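# Added sanity check: the timeline validation below assumes 'mtu' is a
# datetime column (schema assumption from the upstream pipeline).
assert unified['mtu'].dtype.is_temporal(), "'mtu' must be a temporal dtype"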
# =========================================================================
# 2. Timeline Validation
# =========================================================================
print("\n" + "-" * 80)
print("[1/4] TIMELINE VALIDATION")
print("-" * 80)
# Check sorted
is_sorted = unified['mtu'].is_sorted()
print(f" Timeline sorted: {'[PASS]' if is_sorted else '[FAIL]'}")
# Check for gaps (should be hourly)
time_diffs = unified['mtu'].diff().drop_nulls()
most_common_diff = time_diffs.mode()[0]
hourly_expected = most_common_diff.total_seconds() == 3600
print(f" Most common time diff: {most_common_diff}")
print(f" Hourly intervals: {'[PASS]' if hourly_expected else '[FAIL]'}")
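# Added diagnostic sketch: count intervals that deviate from the modal step;
# DST shifts in local-time data or true gaps would surface here.
n_irregular = (time_diffs != most_common_diff).sum()
print(f" Irregular intervals: {n_irregular}")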
# Date range
min_date = unified['mtu'].min()
max_date = unified['mtu'].max()
print(f" Date range: {min_date} to {max_date}")
print(f" Total hours: {len(unified):,}")
# Expected: Oct 2023 to Sept 2025 = ~24 months
# After deduplication: 17,544 hours (731 days = ~24 months)
days_covered = (max_date - min_date).days + 1
print(f" Days covered: {days_covered} (~{days_covered / 30.44:.1f} months)")  # 30.44 = avg days per month
# =========================================================================
# 3. Feature Completeness
# =========================================================================
print("\n" + "-" * 80)
print("[2/4] FEATURE COMPLETENESS")
print("-" * 80)
# Count features by category
cnec_t1_cols = [c for c in features.columns if c.startswith('cnec_t1_')]
cnec_t2_cols = [c for c in features.columns if c.startswith('cnec_t2_')]
lta_cols = [c for c in features.columns if c.startswith('lta_')]
temporal_cols = [
    c for c in features.columns
    if c in {
        'hour', 'day', 'month', 'weekday', 'year', 'is_weekend',
        'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
        'weekday_sin', 'weekday_cos',
    }
]
target_cols = [c for c in features.columns if c.startswith('target_')]
print(f" Tier-1 CNEC features: {len(cnec_t1_cols)}")
print(f" Tier-2 CNEC features: {len(cnec_t2_cols)}")
print(f" LTA features: {len(lta_cols)}")
print(f" Temporal features: {len(temporal_cols)}")
print(f" Target variables: {len(target_cols)}")
print(f" Total features: {features.shape[1] - 1} (excluding mtu)")
# Null percentages by category (loop replaces one copy-pasted block per
# category and guards against division by zero for empty categories)
print("\n Null percentages:")
for label, cols in [
    ('Tier-1 CNEC', cnec_t1_cols),
    ('Tier-2 CNEC', cnec_t2_cols),
    ('LTA', lta_cols),
    ('Temporal', temporal_cols),
    ('Targets', target_cols),
]:
    if not cols:
        print(f"   {label}: no columns found")
        continue
    nulls = features.select(cols).null_count().sum_horizontal()[0]
    cells = len(features) * len(cols)
    print(f"   {label}: {nulls / cells * 100:.2f}% nulls")
# Overall null percentage
total_nulls = features.null_count().sum_horizontal()[0]
total_cells = len(features) * len(features.columns)
overall_null_pct = total_nulls / total_cells * 100
print(f"\n Overall null percentage: {overall_null_pct:.2f}%")
if overall_null_pct < 60:
    print(" Completeness: [PASS] (<60% nulls)")
else:
    print(f" Completeness: [WARNING] ({overall_null_pct:.1f}% nulls, threshold 60%)")
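# Diagnostic sketch (added): list the five columns with the highest null
# share to help triage; informational only, does not affect pass/fail.
null_counts_row = features.null_count().row(0)
worst_cols = sorted(zip(features.columns, null_counts_row), key=lambda t: t[1], reverse=True)[:5]
for name, n_null in worst_cols:
    print(f"   {name}: {n_null / len(features) * 100:.1f}% nulls")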
# =========================================================================
# 4. Data Leakage Check
# =========================================================================
print("\n" + "-" * 80)
print("[3/4] DATA LEAKAGE CHECK")
print("-" * 80)
# LTA values are future covariates known in advance, so they should have
# NO nulls. LTA columns are picked out by 'lta' in the name, matching the
# MaxBEX/LTA split used in the summary section (assumed naming convention).
lta_cols_unified = [c for c in unified.columns if 'lta' in c.lower()]
lta_null_count = (
    unified.select(lta_cols_unified).null_count().sum_horizontal()[0]
    if lta_cols_unified else 0
)
print(f" LTA nulls: {lta_null_count}")
if lta_null_count == 0:
print(" LTA future covariates: [PASS] (no nulls)")
else:
print(f" LTA future covariates: [WARNING] ({lta_null_count} nulls)")
# Historical features should have lags (shift creates nulls at start)
# Check that lag features have nulls ONLY at the beginning
has_lag_features = any('_L' in c for c in features.columns)
if has_lag_features:
print(" Historical lag features: [PRESENT] (nulls expected at start)")
else:
print(" Historical lag features: [WARNING] (no lag features found)")
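# Added sketch: the comment above says lag nulls should sit only at the
# start of the series; spot-check that on one lag column ('_L' is the
# script's own naming heuristic for lags).
lag_cols = [c for c in features.columns if '_L' in c]
if lag_cols:
    s = features[lag_cols[0]]
    k = s.null_count()
    # nulls confined to the start iff no nulls remain after skipping k rows
    confined = s.slice(k).null_count() == 0
    print(f" Lag nulls confined to start ({lag_cols[0]}): {'[PASS]' if confined else '[WARNING]'}")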
# =========================================================================
# 5. Summary Statistics
# =========================================================================
print("\n" + "-" * 80)
print("[4/4] SUMMARY STATISTICS")
print("-" * 80)
print("\nUnified JAO Data:")
print(f" Rows: {len(unified):,}")
print(f" Columns: {len(unified.columns)}")
print(f" MaxBEX borders: {len([c for c in unified.columns if 'border_' in c and 'lta' not in c.lower()])}")
print(f" LTA borders: {len([c for c in unified.columns if 'lta' in c.lower()])}")
print(f" Net Positions: {len([c for c in unified.columns if c.startswith('netpos_')])}")
print("\nCNEC Hourly Data:")
print(f" Total CNEC records: {len(cnec):,}")
print(f" Unique CNECs: {cnec['cnec_eic'].n_unique()}")
print(f" Unique timestamps: {cnec['mtu'].n_unique():,}")
print(f" CNECs per timestamp: {len(cnec) / cnec['mtu'].n_unique():.1f} avg")
print("\nFeature Engineering:")
print(f" Total features: {features.shape[1] - 1}")
print(f" Feature rows: {len(features):,}")
print(f" File size: {features_path.stat().st_size / (1024**2):.2f} MB")
# =========================================================================
# Validation Summary
# =========================================================================
print("\n" + "=" * 80)
print("VALIDATION SUMMARY")
print("=" * 80)
checks_passed = 0
total_checks = 4
# Timeline check
if is_sorted and hourly_expected:
print(" [PASS] Timeline validation PASSED")
checks_passed += 1
else:
print(" [FAIL] Timeline validation FAILED")
# Feature completeness check
if overall_null_pct < 60:
print(" [PASS] Feature completeness PASSED")
checks_passed += 1
else:
print(" [WARNING] Feature completeness WARNING (high nulls)")
# Data leakage check
if lta_null_count == 0 and has_lag_features:
print(" [PASS] Data leakage check PASSED")
checks_passed += 1
else:
print(" [WARNING] Data leakage check WARNING")
# Overall data quality
if len(unified) == len(features):
print(" [PASS] Data consistency PASSED")
checks_passed += 1
else:
print(" [FAIL] Data consistency FAILED (row mismatch)")
print(f"\nChecks passed: {checks_passed}/{total_checks}")
if checks_passed == total_checks:
print("\n[SUCCESS] All validation checks PASSED")
elif checks_passed >= total_checks - 1:
print("\n[WARNING] Minor issues detected")
else:
print("\n[FAILURE] Critical issues detected")
print("=" * 80)
print()
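# Sketch: exit non-zero when checks fail so CI can gate on this script
# (assumed convention; the original script always exits 0). Mirrors the
# "minor issues" tolerance used in the summary above.
import sys
sys.exit(0 if checks_passed >= total_checks - 1 else 1)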