"""FBMC Flow Forecasting - Unified JAO Data Exploration
Objective: Explore unified 24-month JAO data and engineered features
This notebook explores:
1. Unified JAO dataset (MaxBEX + CNEC + LTA + NetPos)
2. Engineered features (726 features across 5 categories)
3. Feature completeness and validation
4. Key statistics and distributions
Usage:
marimo edit notebooks/02_unified_jao_exploration.py
"""
import marimo
__generated_with = "0.17.2"
app = marimo.App(width="medium")
@app.cell
def _():
    import marimo as mo
    import polars as pl
    import altair as alt
    from pathlib import Path
    return Path, alt, mo, pl
@app.cell
def _(mo):
    mo.md(
        r"""
        # Unified JAO Data Exploration (24 Months)

        **Date Range**: October 2023 - October 2025 (24 months)

        ## Data Pipeline Overview

        1. **Raw JAO Data** (4 datasets)
            - MaxBEX: Maximum Bilateral Exchange capacity (TARGET)
            - CNEC/PTDF: Critical constraints with power transfer distribution factors
            - LTA: Long Term Allocations (future covariates)
            - Net Positions: Domain boundaries (min/max per zone)
        2. **Data Unification** → `unified_jao_24month.parquet` (illustrated in the sketch cell below)
            - Deduplicated NetPos (removed 1,152 duplicate timestamps)
            - Forward-filled LTA gaps (710 missing hours)
            - Broadcast daily CNEC to hourly
            - Sorted timeline (hourly, 17,544 records)
        3. **Feature Engineering** → `features_jao_24month.parquet`
            - 726 features across 5 categories
            - Tier-1 CNEC: 274 features
            - Tier-2 CNEC: 390 features
            - LTA: 40 features
            - Temporal: 12 features
            - Targets: 10 features
        """
    )
    return
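@app.cell
def _(pl):
    # Illustrative sketch only (not part of the pipeline): how duplicate timestamps
    # can be dropped and hourly gaps forward-filled with polars, mirroring the
    # NetPos deduplication and LTA gap-filling steps described above. The tiny
    # frame and the `lta_demo_border` column are assumptions for demonstration,
    # not the real unified data.
    from datetime import datetime
    _demo = pl.DataFrame({
        "mtu": [
            datetime(2023, 10, 1, 0),
            datetime(2023, 10, 1, 0),  # duplicate timestamp
            datetime(2023, 10, 1, 3),  # 2-hour gap after deduplication
        ],
        "lta_demo_border": [100.0, 100.0, 130.0],
    })
    _filled = (
        _demo.unique(subset="mtu", keep="first")  # deduplicate timestamps
        .sort("mtu")
        .upsample(time_column="mtu", every="1h")  # insert the missing hourly rows
        .with_columns(pl.col("lta_demo_border").forward_fill())  # fill gaps with last known value
    )
    _filled
    return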
@app.cell
def _(Path, pl):
    # Load unified datasets
    print("Loading unified JAO datasets...")
    processed_dir = Path('data/processed')
    unified_jao = pl.read_parquet(processed_dir / 'unified_jao_24month.parquet')
    cnec_hourly = pl.read_parquet(processed_dir / 'cnec_hourly_24month.parquet')
    features_jao = pl.read_parquet(processed_dir / 'features_jao_24month.parquet')
    print(f"[OK] Unified JAO: {unified_jao.shape}")
    print(f"[OK] CNEC hourly: {cnec_hourly.shape}")
    print(f"[OK] Features: {features_jao.shape}")
    return features_jao, unified_jao
@app.cell
def _(features_jao, mo, unified_jao):
    # Dataset overview
    mo.md(f"""
    ## Dataset Overview

    ### 1. Unified JAO Dataset

    - **Shape**: {unified_jao.shape[0]:,} rows × {unified_jao.shape[1]} columns
    - **Date Range**: {unified_jao['mtu'].min()} to {unified_jao['mtu'].max()}
    - **Timeline Sorted**: {unified_jao['mtu'].is_sorted()}
    - **Null Percentage**: {(unified_jao.null_count().sum_horizontal()[0] / (len(unified_jao) * len(unified_jao.columns)) * 100):.2f}%

    ### 2. Engineered Features

    - **Shape**: {features_jao.shape[0]:,} rows × {features_jao.shape[1]} columns
    - **Total Features**: {features_jao.shape[1] - 1} (excluding mtu timestamp)
    - **Null Percentage**: {(features_jao.null_count().sum_horizontal()[0] / (len(features_jao) * len(features_jao.columns)) * 100):.2f}%
    - _Note: High nulls expected due to sparse CNEC binding patterns and lag features_
    """)
    return
@app.cell
def _(mo):
    mo.md("""## 1. Unified JAO Dataset Structure""")
    return
@app.cell
def _(mo, unified_jao):
    # Show sample of unified data (stack heading and table so both render as cell output)
    mo.vstack([
        mo.md("""### Sample Data (First 20 Rows)"""),
        mo.ui.table(unified_jao.head(20).to_pandas(), page_size=10),
    ])
    return
@app.cell
def _(mo, unified_jao):
    # Column breakdown: MaxBEX borders use the `border_` prefix, LTA borders the `lta_` prefix
    maxbex_cols = [c for c in unified_jao.columns if c.startswith('border_')]
    lta_cols = [c for c in unified_jao.columns if c.startswith('lta_')]
    netpos_cols = [c for c in unified_jao.columns if c.startswith('netpos_')]
    mo.md(f"""
    ### Column Breakdown

    - **Timestamp**: 1 column (`mtu`)
    - **MaxBEX Borders**: {len(maxbex_cols)} columns
    - **LTA Borders**: {len(lta_cols)} columns
    - **Net Positions**: {len(netpos_cols)} columns (if present)
    - **Total**: {unified_jao.shape[1]} columns
    """)
    return
@app.cell
def _(mo):
    mo.md("""### Timeline Validation""")
    return
@app.cell
def _(alt, pl, unified_jao):
    # Timeline validation
    time_diffs = unified_jao['mtu'].diff().drop_nulls()
    # Most common time diff
    most_common = time_diffs.mode()[0]
    is_hourly = most_common.total_seconds() == 3600
    # Create histogram of time diffs
    time_diff_hours = time_diffs.map_elements(lambda x: x.total_seconds() / 3600, return_dtype=pl.Float64)
    time_diff_df = pl.DataFrame({
        'time_diff_hours': time_diff_hours
    })
    timeline_chart = alt.Chart(time_diff_df.to_pandas()).mark_bar().encode(
        x=alt.X('time_diff_hours:Q', bin=alt.Bin(maxbins=50), title='Time Difference (hours)'),
        y=alt.Y('count()', title='Count'),
        tooltip=['time_diff_hours:Q', 'count()']
    ).properties(
        title='Timeline Gaps Distribution',
        width=800,
        height=300
    )
    timeline_chart
    return is_hourly, most_common
@app.cell
def _(is_hourly, mo, most_common):
    # Report the validation result (assign first so the cell's last expression renders)
    if is_hourly:
        timeline_msg = mo.md(f"""
        ✅ **Timeline Validation: PASS**

        - Most common time diff: {most_common} (1 hour)
        - Timeline is properly sorted and hourly
        """)
    else:
        timeline_msg = mo.md(f"""
        ⚠️ **Timeline Validation: WARNING**

        - Most common time diff: {most_common}
        - Expected: 1 hour
        """)
    timeline_msg
    return
@app.cell
def _(mo):
    mo.md("""## 2. Feature Engineering Results""")
    return
@app.cell
def _(features_jao, mo, pl):
    # Feature category breakdown
    tier1_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_')]
    tier2_cols = [c for c in features_jao.columns if c.startswith('cnec_t2_')]
    lta_feat_cols = [c for c in features_jao.columns if c.startswith('lta_')]
    temporal_cols = [c for c in features_jao.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
    target_cols = [c for c in features_jao.columns if c.startswith('target_')]
    # Create summary table
    feature_summary = pl.DataFrame({
        'Category': ['Tier-1 CNEC', 'Tier-2 CNEC', 'LTA', 'Temporal', 'Targets', 'TOTAL'],
        'Features': [len(tier1_cols), len(tier2_cols), len(lta_feat_cols), len(temporal_cols), len(target_cols), features_jao.shape[1] - 1],
        'Null %': [
            f"{(features_jao.select(tier1_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(tier1_cols)) * 100):.2f}%" if tier1_cols else "N/A",
            f"{(features_jao.select(tier2_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(tier2_cols)) * 100):.2f}%" if tier2_cols else "N/A",
            f"{(features_jao.select(lta_feat_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(lta_feat_cols)) * 100):.2f}%" if lta_feat_cols else "N/A",
            f"{(features_jao.select(temporal_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(temporal_cols)) * 100):.2f}%" if temporal_cols else "N/A",
            f"{(features_jao.select(target_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(target_cols)) * 100):.2f}%" if target_cols else "N/A",
            f"{(features_jao.null_count().sum_horizontal()[0] / (len(features_jao) * len(features_jao.columns)) * 100):.2f}%"
        ]
    })
    mo.ui.table(feature_summary.to_pandas())
    return lta_feat_cols, target_cols, temporal_cols, tier1_cols, tier2_cols
@app.cell
def _(mo):
    mo.md("""### Sample Features (First 20 Rows)""")
    return


@app.cell
def _(features_jao, mo):
    # Show first 10 columns only (too many to display all)
    mo.ui.table(features_jao.select(features_jao.columns[:10]).head(20).to_pandas(), page_size=10)
    return
@app.cell
def _(mo):
    mo.md("""## 3. LTA Features (Future Covariates)""")
    return


@app.cell
def _(lta_feat_cols, mo):
    # LTA features analysis
    mo.md(f"""
    **LTA Features**: {len(lta_feat_cols)} features

    LTA (Long Term Allocations) are **future covariates** - known years in advance via auctions.
    These should have **0% nulls** since they're available for the entire forecast horizon.
    """)
    return
@app.cell
def _(alt, features_jao):
    # Plot LTA total allocated over time
    lta_chart_data = features_jao.select(['mtu', 'lta_total_allocated']).sort('mtu')
    lta_chart = alt.Chart(lta_chart_data.to_pandas()).mark_line().encode(
        x=alt.X('mtu:T', title='Date'),
        y=alt.Y('lta_total_allocated:Q', title='Total LTA Allocated (MW)'),
        tooltip=['mtu:T', 'lta_total_allocated:Q']
    ).properties(
        title='LTA Total Allocated Capacity Over Time',
        width=800,
        height=400
    ).interactive()
    lta_chart
    return
@app.cell
def _(features_jao, lta_feat_cols, mo):
    # LTA statistics (stack heading and table so both render as cell output)
    lta_stats = features_jao.select(lta_feat_cols[:5]).describe()
    mo.vstack([
        mo.md("""### LTA Sample Statistics (First 5 Features)"""),
        mo.ui.table(lta_stats.to_pandas()),
    ])
    return
@app.cell
def _(mo):
    mo.md("""## 4. Temporal Features""")
    return
@app.cell
def _(features_jao, mo, temporal_cols):
    # Show temporal features (stack the note and table so both render as cell output)
    mo.vstack([
        mo.md(f"""
        **Temporal Features**: {len(temporal_cols)} features

        Cyclic encoding for hour, month, and weekday to capture periodicity (see the sketch cell below).
        """),
        mo.ui.table(features_jao.select(['mtu'] + temporal_cols).head(24).to_pandas()),
    ])
    return
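@app.cell
def _(pl):
    # Illustrative sketch only: how a cyclic (sin/cos) encoding of the hour can be
    # built, mirroring the `hour_sin`/`hour_cos` features listed above. The small
    # frame is synthetic; the actual pipeline derives these from the `mtu` timestamp.
    import math
    _hours = pl.DataFrame({"hour": list(range(24))})
    _encoded = _hours.with_columns(
        (2 * math.pi * pl.col("hour") / 24).sin().alias("hour_sin"),
        (2 * math.pi * pl.col("hour") / 24).cos().alias("hour_cos"),
    )
    _encoded
    return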
@app.cell
def _(alt, features_jao, pl):
    # Hourly distribution
    hour_dist = features_jao.group_by('hour').agg(pl.len().alias('count')).sort('hour')
    hour_chart = alt.Chart(hour_dist.to_pandas()).mark_bar().encode(
        x=alt.X('hour:O', title='Hour of Day'),
        y=alt.Y('count:Q', title='Count'),
        tooltip=['hour:O', 'count:Q']
    ).properties(
        title='Distribution by Hour of Day',
        width=800,
        height=300
    )
    hour_chart
    return
@app.cell
def _(mo):
    mo.md("""## 5. CNEC Features (Historical)""")
    return
@app.cell
def _(features_jao, mo, tier1_cols, tier2_cols):
    # CNEC features overview (stack the note and the sample Tier-1 table so both render)
    mo.vstack([
        mo.md(f"""
        **CNEC Features**: {len(tier1_cols) + len(tier2_cols)} total

        - **Tier-1 CNECs**: {len(tier1_cols)} features (top 58 most critical CNECs)
        - **Tier-2 CNECs**: {len(tier2_cols)} features (next 150 CNECs)

        High null percentage is **expected** due to:

        1. Sparse binding patterns (not all CNECs bind every hour)
        2. Lag features create nulls at timeline start (see the sketch cell below)
        3. Pivoting creates sparse constraint matrices
        """),
        mo.ui.table(features_jao.select(['mtu'] + tier1_cols[:5]).head(20).to_pandas(), page_size=10),
    ])
    return
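@app.cell
def _(pl):
    # Illustrative sketch only: why lag features produce nulls at the start of the
    # timeline. `shift(n)` leaves the first n rows null, so every lagged CNEC
    # feature is missing for its first n hours. Column names here are made up
    # for the demo, not taken from the real feature set.
    _demo = pl.DataFrame({"shadow_price": [0.0, 12.5, 7.3, 0.0, 4.1]})
    _lagged = _demo.with_columns(
        pl.col("shadow_price").shift(1).alias("shadow_price_lag1"),  # 1 leading null
        pl.col("shadow_price").shift(3).alias("shadow_price_lag3"),  # 3 leading nulls
    )
    _lagged
    return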
@app.cell
def _(alt, features_jao, pl, tier1_cols):
    # Binding frequency for sample Tier-1 CNECs (assign first so the chart renders)
    binding_cols = [c for c in tier1_cols if 'binding_' in c][:10]
    if binding_cols:
        binding_freq = pl.DataFrame({
            'cnec': [c.replace('cnec_t1_binding_', '') for c in binding_cols],
            'binding_rate': [features_jao[c].mean() for c in binding_cols]
        })
        binding_chart = alt.Chart(binding_freq.to_pandas()).mark_bar().encode(
            x=alt.X('binding_rate:Q', title='Binding Frequency (0-1)'),
            y=alt.Y('cnec:N', sort='-x', title='CNEC'),
            tooltip=['cnec:N', alt.Tooltip('binding_rate:Q', format='.2%')]
        ).properties(
            title='Binding Frequency - Sample Tier-1 CNECs',
            width=800,
            height=300
        )
    else:
        binding_chart = None
    binding_chart
    return
@app.cell
def _(mo):
    mo.md("""## 6. Target Variables""")
    return
@app.cell
def _(features_jao, mo, target_cols):
    # Show target variables (MaxBEX borders); stack the note and table so both render
    target_table = (
        mo.ui.table(features_jao.select(['mtu'] + target_cols).head(20).to_pandas(), page_size=10)
        if target_cols
        else mo.md("_No target columns found._")
    )
    mo.vstack([
        mo.md(f"""
        **Target Variables**: {len(target_cols)} features

        Sample MaxBEX borders for forecasting (first 10 borders):
        """),
        target_table,
    ])
    return
@app.cell
def _(alt, features_jao, target_cols):
    # Plot sample target variable over time (assign first so the chart renders)
    if target_cols:
        sample_target = target_cols[0]
        target_chart_data = features_jao.select(['mtu', sample_target]).sort('mtu')
        target_chart = alt.Chart(target_chart_data.to_pandas()).mark_line().encode(
            x=alt.X('mtu:T', title='Date'),
            y=alt.Y(f'{sample_target}:Q', title='Capacity (MW)'),
            tooltip=['mtu:T', f'{sample_target}:Q']
        ).properties(
            title=f'Target Variable Over Time: {sample_target}',
            width=800,
            height=400
        ).interactive()
    else:
        target_chart = None
    target_chart
    return
@app.cell
def _(mo):
    mo.md(
        """
        ## 7. Data Quality Summary

        Final validation checks:
        """
    )
    return
@app.cell
def _(features_jao, is_hourly, lta_feat_cols, mo, pl, unified_jao):
    # Data quality checks
    checks = []
    # Check 1: Timeline sorted and hourly
    checks.append({
        'Check': 'Timeline sorted & hourly',
        'Status': 'PASS' if is_hourly else 'FAIL',
        'Details': f'Most common diff: {unified_jao["mtu"].diff().drop_nulls().mode()[0]}'
    })
    # Check 2: No nulls in unified dataset
    unified_nulls = unified_jao.null_count().sum_horizontal()[0]
    checks.append({
        'Check': 'Unified data completeness',
        'Status': 'PASS' if unified_nulls == 0 else 'WARNING',
        'Details': f'{unified_nulls} nulls ({(unified_nulls / (len(unified_jao) * len(unified_jao.columns)) * 100):.2f}%)'
    })
    # Check 3: LTA features have no nulls (future covariates)
    lta_nulls = features_jao.select(lta_feat_cols).null_count().sum_horizontal()[0] if lta_feat_cols else 0
    checks.append({
        'Check': 'LTA future covariates complete',
        'Status': 'PASS' if lta_nulls == 0 else 'FAIL',
        'Details': f'{lta_nulls} nulls in {len(lta_feat_cols)} LTA features'
    })
    # Check 4: Data consistency (same row count)
    checks.append({
        'Check': 'Data consistency',
        'Status': 'PASS' if len(unified_jao) == len(features_jao) else 'FAIL',
        'Details': f'Unified: {len(unified_jao):,} rows, Features: {len(features_jao):,} rows'
    })
    checks_df = pl.DataFrame(checks)
    mo.ui.table(checks_df.to_pandas())
    return (checks,)
@app.cell
def _(checks, mo):
    # Overall status (assign first so the summary renders as cell output)
    all_pass = all(c['Status'] == 'PASS' for c in checks)
    if all_pass:
        overall_status = mo.md("""
        ✅ **All validation checks PASSED**

        Data is ready for model training and inference!
        """)
    else:
        failed = [c['Check'] for c in checks if c['Status'] == 'FAIL']
        warnings = [c['Check'] for c in checks if c['Status'] == 'WARNING']
        status = "⚠️ **Some checks failed or have warnings**\n\n"
        if failed:
            status += f"**Failed**: {', '.join(failed)}\n\n"
        if warnings:
            status += f"**Warnings**: {', '.join(warnings)}"
        overall_status = mo.md(status)
    overall_status
    return
@app.cell
def _(mo):
    mo.md(
        """
        ## Next Steps

        ✅ **JAO Data Collection & Unification: COMPLETE**

        - 24 months of data (Oct 2023 - Oct 2025)
        - 17,544 hourly records
        - 726 features engineered

        **Remaining Work:**

        1. Collect weather data (OpenMeteo, 52 grid points)
        2. Collect ENTSO-E data (generation, flows, outages)
        3. Complete remaining feature scaffolding (NetPos lags, MaxBEX lags, system aggregates)
        4. Integrate all data sources
        5. Begin zero-shot Chronos 2 inference

        ---

        **Data Files**:

        - `data/processed/unified_jao_24month.parquet` (5.59 MB)
        - `data/processed/cnec_hourly_24month.parquet` (4.57 MB)
        - `data/processed/features_jao_24month.parquet` (0.60 MB)
        """
    )
    return
@app.cell
def _(mo, unified_jao):
    # Display the full unified JAO dataset (stack heading and table so both render)
    mo.vstack([
        mo.md("## Unified JAO Dataset"),
        mo.ui.table(unified_jao.to_pandas(), page_size=20),
    ])
    return
@app.cell
def _(features_jao, mo, unified_jao):
    # Show the actual structure with timestamp (stacked so every element renders)
    display_df = unified_jao.select(['mtu'] + [c for c in unified_jao.columns if c != 'mtu'][:10]).head(10)
    mo.vstack([
        mo.md("### Unified JAO Dataset Structure"),
        mo.ui.table(display_df.to_pandas()),
        mo.md(f"""
        **Dataset Info:**

        - **Total columns**: {len(unified_jao.columns)}
        - **Timestamp column**: `mtu` (Market Time Unit)
        - **Date range**: {unified_jao['mtu'].min()} to {unified_jao['mtu'].max()}
        """),
        # Show the 726 features dataset separately
        mo.md("### Features Dataset (726 engineered features)"),
        mo.ui.table(features_jao.select(['mtu'] + features_jao.columns[1:11]).head(10).to_pandas()),
    ])
    return
@app.cell
def _(features_jao, mo, pl, unified_jao):
    # Show actual column counts. Underscore-prefixed locals avoid redefining the
    # tier/LTA/temporal/target column lists already defined in the summary cell above.
    _tier1_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_')]
    _tier2_cols = [c for c in features_jao.columns if c.startswith('cnec_t2_')]
    _lta_feat_cols = [c for c in features_jao.columns if c.startswith('lta_')]
    _temporal_cols = [c for c in features_jao.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
    _target_cols = [c for c in features_jao.columns if c.startswith('target_')]
    _feature_breakdown = pl.DataFrame({
        'Category': ['Tier-1 CNEC', 'Tier-2 CNEC', 'LTA', 'Temporal', 'Targets', 'TOTAL'],
        'Count': [len(_tier1_cols), len(_tier2_cols), len(_lta_feat_cols), len(_temporal_cols), len(_target_cols), len(features_jao.columns) - 1]
    })
    # Print the first 20 actual column names from features_jao to the console output
    for _i, _col in enumerate(features_jao.columns[:20]):
        print(f"{_i + 1:3d}. {_col}")
    mo.vstack([
        mo.md(f"""
        ### Dataset Column Counts

        **unified_jao**: {len(unified_jao.columns)} columns
        - Raw unified data (MaxBEX, LTA, NetPos)

        **features_jao**: {len(features_jao.columns)} columns
        - Engineered features (726 + timestamp)
        """),
        mo.md("### Feature Breakdown in features_jao dataset:"),
        mo.ui.table(_feature_breakdown.to_pandas()),
        mo.md("### First 20 column names in features_jao (see console output)"),
    ])
    return
@app.cell
def _(features_jao, mo, pl):
    # Check CNEC Tier-1 binding values without redefining variables from earlier cells
    _cnec_t1_binding_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_binding_')]
    if _cnec_t1_binding_cols:
        # Sample of binding values for the first five binding columns
        _sample_bindings = features_jao.select(['mtu'] + _cnec_t1_binding_cols[:5]).head(20)
        # Unique values and value counts for the first binding column
        _first_col = _cnec_t1_binding_cols[0]
        _unique_vals = features_jao[_first_col].unique().sort()
        print(f"Unique values in {_first_col}: {_unique_vals.to_list()}")
        _val_counts = features_jao.group_by(_first_col).agg(pl.len().alias('count')).sort('count', descending=True)
        _binding_view = mo.vstack([
            mo.md("### Sample CNEC Tier-1 Binding Values (First 5 CNECs)"),
            mo.ui.table(_sample_bindings.to_pandas(), page_size=10),
            mo.md(f"### Value Counts for {_first_col}"),
            mo.ui.table(_val_counts.to_pandas()),
        ])
    else:
        _binding_view = mo.md("_No CNEC Tier-1 binding columns found._")
    _binding_view
    return
if __name__ == "__main__":
    app.run()