"""Validate unified JAO data and engineered features.
Checks:
1. Timeline: hourly, no gaps, sorted
2. Feature completeness: null percentages
3. Data leakage: future data not in historical features
4. Summary statistics
Author: Claude
Date: 2025-11-06
"""
import polars as pl
from pathlib import Path
print("\n" + "=" * 80)
print("JAO DATA VALIDATION")
print("=" * 80)
# =========================================================================
# 1. Load datasets
# =========================================================================
print("\nLoading datasets...")
unified_path = Path('data/processed/unified_jao_24month.parquet')
cnec_path = Path('data/processed/cnec_hourly_24month.parquet')
features_path = Path('data/processed/features_jao_24month.parquet')
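# Defensive sketch (not part of the original flow): fail fast with a clear
# message if any expected parquet file is missing.
for p in (unified_path, cnec_path, features_path):
    if not p.exists():
        raise FileNotFoundError(f"Expected parquet file not found: {p}")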
unified = pl.read_parquet(unified_path)
cnec = pl.read_parquet(cnec_path)
features = pl.read_parquet(features_path)
print(f" Unified JAO: {unified.shape}")
print(f" CNEC hourly: {cnec.shape}")
print(f" Features: {features.shape}")
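# Added sanity check: the timeline validation below assumes 'mtu' is a
# datetime column (schema assumption from the upstream pipeline).
assert unified['mtu'].dtype.is_temporal(), "'mtu' must be a temporal dtype"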
# =========================================================================
# 2. Timeline Validation
# =========================================================================
print("\n" + "-" * 80)
print("[1/4] TIMELINE VALIDATION")
print("-" * 80)
# Check sorted
is_sorted = unified['mtu'].is_sorted()
print(f" Timeline sorted: {'[PASS]' if is_sorted else '[FAIL]'}")
# Check for gaps (should be hourly)
time_diffs = unified['mtu'].diff().drop_nulls()
most_common_diff = time_diffs.mode()[0]
hourly_expected = most_common_diff.total_seconds() == 3600
print(f" Most common time diff: {most_common_diff}")
print(f" Hourly intervals: {'[PASS]' if hourly_expected else '[FAIL]'}")
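# Added diagnostic sketch: count intervals that deviate from the modal step;
# DST shifts in local-time data or true gaps would surface here.
n_irregular = (time_diffs != most_common_diff).sum()
print(f" Irregular intervals: {n_irregular}")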
# Date range
min_date = unified['mtu'].min()
max_date = unified['mtu'].max()
print(f" Date range: {min_date} to {max_date}")
print(f" Total hours: {len(unified):,}")
# Expected: Oct 2023 to Sept 2025 = ~24 months
# After deduplication: 17,544 hours (731 days = ~24 months)
days_covered = (max_date - min_date).days + 1
print(f" Days covered: {days_covered} (~{days_covered / 30.44:.1f} months)")  # 30.44 = avg days per month
# =========================================================================
# 3. Feature Completeness
# =========================================================================
print("\n" + "-" * 80)
print("[2/4] FEATURE COMPLETENESS")
print("-" * 80)
# Count features by category
cnec_t1_cols = [c for c in features.columns if c.startswith('cnec_t1_')]
cnec_t2_cols = [c for c in features.columns if c.startswith('cnec_t2_')]
lta_cols = [c for c in features.columns if c.startswith('lta_')]
temporal_cols = [
    c for c in features.columns
    if c in {
        'hour', 'day', 'month', 'weekday', 'year', 'is_weekend',
        'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
        'weekday_sin', 'weekday_cos',
    }
]
target_cols = [c for c in features.columns if c.startswith('target_')]
print(f" Tier-1 CNEC features: {len(cnec_t1_cols)}")
print(f" Tier-2 CNEC features: {len(cnec_t2_cols)}")
print(f" LTA features: {len(lta_cols)}")
print(f" Temporal features: {len(temporal_cols)}")
print(f" Target variables: {len(target_cols)}")
print(f" Total features: {features.shape[1] - 1} (excluding mtu)")
# Null percentages by category (loop replaces one copy-pasted block per
# category and guards against division by zero for empty categories)
print("\n Null percentages:")
for label, cols in [
    ('Tier-1 CNEC', cnec_t1_cols),
    ('Tier-2 CNEC', cnec_t2_cols),
    ('LTA', lta_cols),
    ('Temporal', temporal_cols),
    ('Targets', target_cols),
]:
    if not cols:
        print(f"   {label}: no columns found")
        continue
    nulls = features.select(cols).null_count().sum_horizontal()[0]
    cells = len(features) * len(cols)
    print(f"   {label}: {nulls / cells * 100:.2f}% nulls")
# Overall null percentage
total_nulls = features.null_count().sum_horizontal()[0]
total_cells = len(features) * len(features.columns)
overall_null_pct = total_nulls / total_cells * 100
print(f"\n Overall null percentage: {overall_null_pct:.2f}%")
if overall_null_pct < 60:
    print(" Completeness: [PASS] (<60% nulls)")
else:
    print(f" Completeness: [WARNING] ({overall_null_pct:.1f}% nulls, threshold 60%)")
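# Diagnostic sketch (added): list the five columns with the highest null
# share to help triage; informational only, does not affect pass/fail.
null_counts_row = features.null_count().row(0)
worst_cols = sorted(zip(features.columns, null_counts_row), key=lambda t: t[1], reverse=True)[:5]
for name, n_null in worst_cols:
    print(f"   {name}: {n_null / len(features) * 100:.1f}% nulls")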
# =========================================================================
# 4. Data Leakage Check
# =========================================================================
print("\n" + "-" * 80)
print("[3/4] DATA LEAKAGE CHECK")
print("-" * 80)
# LTA values are future covariates known in advance, so they should have
# NO nulls. LTA columns are picked out by 'lta' in the name, matching the
# MaxBEX/LTA split used in the summary section (assumed naming convention).
lta_cols_unified = [c for c in unified.columns if 'lta' in c.lower()]
lta_null_count = (
    unified.select(lta_cols_unified).null_count().sum_horizontal()[0]
    if lta_cols_unified else 0
)
print(f" LTA nulls: {lta_null_count}")
if lta_null_count == 0:
print(" LTA future covariates: [PASS] (no nulls)")
else:
print(f" LTA future covariates: [WARNING] ({lta_null_count} nulls)")
# Historical features should have lags (shift creates nulls at start)
# Check that lag features have nulls ONLY at the beginning
has_lag_features = any('_L' in c for c in features.columns)
if has_lag_features:
print(" Historical lag features: [PRESENT] (nulls expected at start)")
else:
print(" Historical lag features: [WARNING] (no lag features found)")
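# Added sketch: the comment above says lag nulls should sit only at the
# start of the series; spot-check that on one lag column ('_L' is the
# script's own naming heuristic for lags).
lag_cols = [c for c in features.columns if '_L' in c]
if lag_cols:
    s = features[lag_cols[0]]
    k = s.null_count()
    # nulls confined to the start iff no nulls remain after skipping k rows
    confined = s.slice(k).null_count() == 0
    print(f" Lag nulls confined to start ({lag_cols[0]}): {'[PASS]' if confined else '[WARNING]'}")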
# =========================================================================
# 5. Summary Statistics
# =========================================================================
print("\n" + "-" * 80)
print("[4/4] SUMMARY STATISTICS")
print("-" * 80)
print("\nUnified JAO Data:")
print(f" Rows: {len(unified):,}")
print(f" Columns: {len(unified.columns)}")
print(f" MaxBEX borders: {len([c for c in unified.columns if 'border_' in c and 'lta' not in c.lower()])}")
print(f" LTA borders: {len([c for c in unified.columns if 'lta' in c.lower()])}")
print(f" Net Positions: {len([c for c in unified.columns if c.startswith('netpos_')])}")
print("\nCNEC Hourly Data:")
print(f" Total CNEC records: {len(cnec):,}")
print(f" Unique CNECs: {cnec['cnec_eic'].n_unique()}")
print(f" Unique timestamps: {cnec['mtu'].n_unique():,}")
print(f" CNECs per timestamp: {len(cnec) / cnec['mtu'].n_unique():.1f} avg")
print("\nFeature Engineering:")
print(f" Total features: {features.shape[1] - 1}")
print(f" Feature rows: {len(features):,}")
print(f" File size: {features_path.stat().st_size / (1024**2):.2f} MB")
# =========================================================================
# Validation Summary
# =========================================================================
print("\n" + "=" * 80)
print("VALIDATION SUMMARY")
print("=" * 80)
checks_passed = 0
total_checks = 4
# Timeline check
if is_sorted and hourly_expected:
print(" [PASS] Timeline validation PASSED")
checks_passed += 1
else:
print(" [FAIL] Timeline validation FAILED")
# Feature completeness check
if overall_null_pct < 60:
print(" [PASS] Feature completeness PASSED")
checks_passed += 1
else:
print(" [WARNING] Feature completeness WARNING (high nulls)")
# Data leakage check
if lta_null_count == 0 and has_lag_features:
print(" [PASS] Data leakage check PASSED")
checks_passed += 1
else:
print(" [WARNING] Data leakage check WARNING")
# Overall data quality
if len(unified) == len(features):
print(" [PASS] Data consistency PASSED")
checks_passed += 1
else:
print(" [FAIL] Data consistency FAILED (row mismatch)")
print(f"\nChecks passed: {checks_passed}/{total_checks}")
if checks_passed == total_checks:
print("\n[SUCCESS] All validation checks PASSED")
elif checks_passed >= total_checks - 1:
print("\n[WARNING] Minor issues detected")
else:
print("\n[FAILURE] Critical issues detected")
print("=" * 80)
print()
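# Sketch: exit non-zero when checks fail so CI can gate on this script
# (assumed convention; the original script always exits 0). Mirrors the
# "minor issues" tolerance used in the summary above.
import sys
sys.exit(0 if checks_passed >= total_checks - 1 else 1)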