"""FBMC Flow Forecasting - Unified JAO Data Exploration
Objective: Explore unified 24-month JAO data and engineered features
This notebook explores:
1. Unified JAO dataset (MaxBEX + CNEC + LTA + NetPos)
2. Engineered features (726 features across 5 categories)
3. Feature completeness and validation
4. Key statistics and distributions
Usage:
marimo edit notebooks/02_unified_jao_exploration.py
"""
import marimo
__generated_with = "0.17.2"
app = marimo.App(width="medium")
@app.cell
def _():
    import marimo as mo
    import polars as pl
    import altair as alt
    from pathlib import Path
    return Path, alt, mo, pl
@app.cell
def _(mo):
    mo.md(
        r"""
        # Unified JAO Data Exploration (24 Months)

        **Date Range**: October 2023 - October 2025 (24 months)

        ## Data Pipeline Overview

        1. **Raw JAO Data** (4 datasets)
            - MaxBEX: Maximum Bilateral Exchange capacity (TARGET)
            - CNEC/PTDF: Critical constraints with power transfer distribution factors
            - LTA: Long Term Allocations (future covariates)
            - Net Positions: Domain boundaries (min/max per zone)
        2. **Data Unification** → `unified_jao_24month.parquet` (illustrated in the sketch cell below)
            - Deduplicated NetPos (removed 1,152 duplicate timestamps)
            - Forward-filled LTA gaps (710 missing hours)
            - Broadcast daily CNEC to hourly
            - Sorted timeline (hourly, 17,544 records)
        3. **Feature Engineering** → `features_jao_24month.parquet`
            - 726 features across 5 categories
            - Tier-1 CNEC: 274 features
            - Tier-2 CNEC: 390 features
            - LTA: 40 features
            - Temporal: 12 features
            - Targets: 10 features
        """
    )
    return
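@app.cell
def _(pl):
    # Illustrative sketch only (not part of the pipeline): how duplicate timestamps
    # can be dropped and hourly gaps forward-filled with polars, mirroring the
    # NetPos deduplication and LTA gap-filling steps described above. The tiny
    # frame and the `lta_demo_border` column are assumptions for demonstration,
    # not the real unified data.
    from datetime import datetime
    _demo = pl.DataFrame({
        "mtu": [
            datetime(2023, 10, 1, 0),
            datetime(2023, 10, 1, 0),  # duplicate timestamp
            datetime(2023, 10, 1, 3),  # 2-hour gap after deduplication
        ],
        "lta_demo_border": [100.0, 100.0, 130.0],
    })
    _filled = (
        _demo.unique(subset="mtu", keep="first")  # deduplicate timestamps
        .sort("mtu")
        .upsample(time_column="mtu", every="1h")  # insert the missing hourly rows
        .with_columns(pl.col("lta_demo_border").forward_fill())  # fill gaps with last known value
    )
    _filled
    return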
@app.cell
def _(Path, pl):
    # Load unified datasets
    print("Loading unified JAO datasets...")
    processed_dir = Path('data/processed')
    unified_jao = pl.read_parquet(processed_dir / 'unified_jao_24month.parquet')
    cnec_hourly = pl.read_parquet(processed_dir / 'cnec_hourly_24month.parquet')
    features_jao = pl.read_parquet(processed_dir / 'features_jao_24month.parquet')
    print(f"[OK] Unified JAO: {unified_jao.shape}")
    print(f"[OK] CNEC hourly: {cnec_hourly.shape}")
    print(f"[OK] Features: {features_jao.shape}")
    return features_jao, unified_jao
@app.cell
def _(features_jao, mo, unified_jao):
    # Dataset overview
    mo.md(f"""
    ## Dataset Overview

    ### 1. Unified JAO Dataset

    - **Shape**: {unified_jao.shape[0]:,} rows × {unified_jao.shape[1]} columns
    - **Date Range**: {unified_jao['mtu'].min()} to {unified_jao['mtu'].max()}
    - **Timeline Sorted**: {unified_jao['mtu'].is_sorted()}
    - **Null Percentage**: {(unified_jao.null_count().sum_horizontal()[0] / (len(unified_jao) * len(unified_jao.columns)) * 100):.2f}%

    ### 2. Engineered Features

    - **Shape**: {features_jao.shape[0]:,} rows × {features_jao.shape[1]} columns
    - **Total Features**: {features_jao.shape[1] - 1} (excluding mtu timestamp)
    - **Null Percentage**: {(features_jao.null_count().sum_horizontal()[0] / (len(features_jao) * len(features_jao.columns)) * 100):.2f}%
    - _Note: High nulls expected due to sparse CNEC binding patterns and lag features_
    """)
    return
@app.cell
def _(mo):
    mo.md("""## 1. Unified JAO Dataset Structure""")
    return
@app.cell
def _(mo, unified_jao):
    # Show sample of unified data (stack heading and table so both render as cell output)
    mo.vstack([
        mo.md("""### Sample Data (First 20 Rows)"""),
        mo.ui.table(unified_jao.head(20).to_pandas(), page_size=10),
    ])
    return
@app.cell
def _(mo, unified_jao):
    # Column breakdown: MaxBEX borders use the `border_` prefix, LTA borders the `lta_` prefix
    maxbex_cols = [c for c in unified_jao.columns if c.startswith('border_')]
    lta_cols = [c for c in unified_jao.columns if c.startswith('lta_')]
    netpos_cols = [c for c in unified_jao.columns if c.startswith('netpos_')]
    mo.md(f"""
    ### Column Breakdown

    - **Timestamp**: 1 column (`mtu`)
    - **MaxBEX Borders**: {len(maxbex_cols)} columns
    - **LTA Borders**: {len(lta_cols)} columns
    - **Net Positions**: {len(netpos_cols)} columns (if present)
    - **Total**: {unified_jao.shape[1]} columns
    """)
    return
@app.cell
def _(mo):
    mo.md("""### Timeline Validation""")
    return
@app.cell
def _(alt, pl, unified_jao):
    # Timeline validation
    time_diffs = unified_jao['mtu'].diff().drop_nulls()
    # Most common time diff
    most_common = time_diffs.mode()[0]
    is_hourly = most_common.total_seconds() == 3600
    # Create histogram of time diffs
    time_diff_hours = time_diffs.map_elements(lambda x: x.total_seconds() / 3600, return_dtype=pl.Float64)
    time_diff_df = pl.DataFrame({
        'time_diff_hours': time_diff_hours
    })
    timeline_chart = alt.Chart(time_diff_df.to_pandas()).mark_bar().encode(
        x=alt.X('time_diff_hours:Q', bin=alt.Bin(maxbins=50), title='Time Difference (hours)'),
        y=alt.Y('count()', title='Count'),
        tooltip=['time_diff_hours:Q', 'count()']
    ).properties(
        title='Timeline Gaps Distribution',
        width=800,
        height=300
    )
    timeline_chart
    return is_hourly, most_common
@app.cell
def _(is_hourly, mo, most_common):
    # Report the validation result (assign first so the cell's last expression renders)
    if is_hourly:
        timeline_msg = mo.md(f"""
        ✅ **Timeline Validation: PASS**

        - Most common time diff: {most_common} (1 hour)
        - Timeline is properly sorted and hourly
        """)
    else:
        timeline_msg = mo.md(f"""
        ⚠️ **Timeline Validation: WARNING**

        - Most common time diff: {most_common}
        - Expected: 1 hour
        """)
    timeline_msg
    return
@app.cell
def _(mo):
    mo.md("""## 2. Feature Engineering Results""")
    return
@app.cell
def _(features_jao, mo, pl):
    # Feature category breakdown
    tier1_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_')]
    tier2_cols = [c for c in features_jao.columns if c.startswith('cnec_t2_')]
    lta_feat_cols = [c for c in features_jao.columns if c.startswith('lta_')]
    temporal_cols = [c for c in features_jao.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
    target_cols = [c for c in features_jao.columns if c.startswith('target_')]
    # Create summary table
    feature_summary = pl.DataFrame({
        'Category': ['Tier-1 CNEC', 'Tier-2 CNEC', 'LTA', 'Temporal', 'Targets', 'TOTAL'],
        'Features': [len(tier1_cols), len(tier2_cols), len(lta_feat_cols), len(temporal_cols), len(target_cols), features_jao.shape[1] - 1],
        'Null %': [
            f"{(features_jao.select(tier1_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(tier1_cols)) * 100):.2f}%" if tier1_cols else "N/A",
            f"{(features_jao.select(tier2_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(tier2_cols)) * 100):.2f}%" if tier2_cols else "N/A",
            f"{(features_jao.select(lta_feat_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(lta_feat_cols)) * 100):.2f}%" if lta_feat_cols else "N/A",
            f"{(features_jao.select(temporal_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(temporal_cols)) * 100):.2f}%" if temporal_cols else "N/A",
            f"{(features_jao.select(target_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(target_cols)) * 100):.2f}%" if target_cols else "N/A",
            f"{(features_jao.null_count().sum_horizontal()[0] / (len(features_jao) * len(features_jao.columns)) * 100):.2f}%"
        ]
    })
    mo.ui.table(feature_summary.to_pandas())
    return lta_feat_cols, target_cols, temporal_cols, tier1_cols, tier2_cols
@app.cell
def _(mo):
    mo.md("""### Sample Features (First 20 Rows)""")
    return


@app.cell
def _(features_jao, mo):
    # Show first 10 columns only (too many to display all)
    mo.ui.table(features_jao.select(features_jao.columns[:10]).head(20).to_pandas(), page_size=10)
    return
@app.cell
def _(mo):
    mo.md("""## 3. LTA Features (Future Covariates)""")
    return


@app.cell
def _(lta_feat_cols, mo):
    # LTA features analysis
    mo.md(f"""
    **LTA Features**: {len(lta_feat_cols)} features

    LTA (Long Term Allocations) are **future covariates** - known years in advance via auctions.
    These should have **0% nulls** since they're available for the entire forecast horizon.
    """)
    return
@app.cell
def _(alt, features_jao):
    # Plot LTA total allocated over time
    lta_chart_data = features_jao.select(['mtu', 'lta_total_allocated']).sort('mtu')
    lta_chart = alt.Chart(lta_chart_data.to_pandas()).mark_line().encode(
        x=alt.X('mtu:T', title='Date'),
        y=alt.Y('lta_total_allocated:Q', title='Total LTA Allocated (MW)'),
        tooltip=['mtu:T', 'lta_total_allocated:Q']
    ).properties(
        title='LTA Total Allocated Capacity Over Time',
        width=800,
        height=400
    ).interactive()
    lta_chart
    return
@app.cell
def _(features_jao, lta_feat_cols, mo):
    # LTA statistics (stack heading and table so both render as cell output)
    lta_stats = features_jao.select(lta_feat_cols[:5]).describe()
    mo.vstack([
        mo.md("""### LTA Sample Statistics (First 5 Features)"""),
        mo.ui.table(lta_stats.to_pandas()),
    ])
    return
@app.cell
def _(mo):
    mo.md("""## 4. Temporal Features""")
    return
@app.cell
def _(features_jao, mo, temporal_cols):
    # Show temporal features (stack the note and table so both render as cell output)
    mo.vstack([
        mo.md(f"""
        **Temporal Features**: {len(temporal_cols)} features

        Cyclic encoding for hour, month, and weekday to capture periodicity (see the sketch cell below).
        """),
        mo.ui.table(features_jao.select(['mtu'] + temporal_cols).head(24).to_pandas()),
    ])
    return
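@app.cell
def _(pl):
    # Illustrative sketch only: how a cyclic (sin/cos) encoding of the hour can be
    # built, mirroring the `hour_sin`/`hour_cos` features listed above. The small
    # frame is synthetic; the actual pipeline derives these from the `mtu` timestamp.
    import math
    _hours = pl.DataFrame({"hour": list(range(24))})
    _encoded = _hours.with_columns(
        (2 * math.pi * pl.col("hour") / 24).sin().alias("hour_sin"),
        (2 * math.pi * pl.col("hour") / 24).cos().alias("hour_cos"),
    )
    _encoded
    return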
@app.cell
def _(alt, features_jao, pl):
    # Hourly distribution
    hour_dist = features_jao.group_by('hour').agg(pl.len().alias('count')).sort('hour')
    hour_chart = alt.Chart(hour_dist.to_pandas()).mark_bar().encode(
        x=alt.X('hour:O', title='Hour of Day'),
        y=alt.Y('count:Q', title='Count'),
        tooltip=['hour:O', 'count:Q']
    ).properties(
        title='Distribution by Hour of Day',
        width=800,
        height=300
    )
    hour_chart
    return
@app.cell
def _(mo):
    mo.md("""## 5. CNEC Features (Historical)""")
    return
@app.cell
def _(features_jao, mo, tier1_cols, tier2_cols):
    # CNEC features overview (stack the note and the sample Tier-1 table so both render)
    mo.vstack([
        mo.md(f"""
        **CNEC Features**: {len(tier1_cols) + len(tier2_cols)} total

        - **Tier-1 CNECs**: {len(tier1_cols)} features (top 58 most critical CNECs)
        - **Tier-2 CNECs**: {len(tier2_cols)} features (next 150 CNECs)

        High null percentage is **expected** due to:

        1. Sparse binding patterns (not all CNECs bind every hour)
        2. Lag features create nulls at timeline start (see the sketch cell below)
        3. Pivoting creates sparse constraint matrices
        """),
        mo.ui.table(features_jao.select(['mtu'] + tier1_cols[:5]).head(20).to_pandas(), page_size=10),
    ])
    return
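@app.cell
def _(pl):
    # Illustrative sketch only: why lag features produce nulls at the start of the
    # timeline. `shift(n)` leaves the first n rows null, so every lagged CNEC
    # feature is missing for its first n hours. Column names here are made up
    # for the demo, not taken from the real feature set.
    _demo = pl.DataFrame({"shadow_price": [0.0, 12.5, 7.3, 0.0, 4.1]})
    _lagged = _demo.with_columns(
        pl.col("shadow_price").shift(1).alias("shadow_price_lag1"),  # 1 leading null
        pl.col("shadow_price").shift(3).alias("shadow_price_lag3"),  # 3 leading nulls
    )
    _lagged
    return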
@app.cell
def _(alt, features_jao, pl, tier1_cols):
    # Binding frequency for sample Tier-1 CNECs (assign first so the chart renders)
    binding_cols = [c for c in tier1_cols if 'binding_' in c][:10]
    if binding_cols:
        binding_freq = pl.DataFrame({
            'cnec': [c.replace('cnec_t1_binding_', '') for c in binding_cols],
            'binding_rate': [features_jao[c].mean() for c in binding_cols]
        })
        binding_chart = alt.Chart(binding_freq.to_pandas()).mark_bar().encode(
            x=alt.X('binding_rate:Q', title='Binding Frequency (0-1)'),
            y=alt.Y('cnec:N', sort='-x', title='CNEC'),
            tooltip=['cnec:N', alt.Tooltip('binding_rate:Q', format='.2%')]
        ).properties(
            title='Binding Frequency - Sample Tier-1 CNECs',
            width=800,
            height=300
        )
    else:
        binding_chart = None
    binding_chart
    return
@app.cell
def _(mo):
    mo.md("""## 6. Target Variables""")
    return
@app.cell
def _(features_jao, mo, target_cols):
    # Show target variables (MaxBEX borders); stack the note and table so both render
    target_table = (
        mo.ui.table(features_jao.select(['mtu'] + target_cols).head(20).to_pandas(), page_size=10)
        if target_cols
        else mo.md("_No target columns found._")
    )
    mo.vstack([
        mo.md(f"""
        **Target Variables**: {len(target_cols)} features

        Sample MaxBEX borders for forecasting (first 10 borders):
        """),
        target_table,
    ])
    return
@app.cell
def _(alt, features_jao, target_cols):
    # Plot sample target variable over time (assign first so the chart renders)
    if target_cols:
        sample_target = target_cols[0]
        target_chart_data = features_jao.select(['mtu', sample_target]).sort('mtu')
        target_chart = alt.Chart(target_chart_data.to_pandas()).mark_line().encode(
            x=alt.X('mtu:T', title='Date'),
            y=alt.Y(f'{sample_target}:Q', title='Capacity (MW)'),
            tooltip=['mtu:T', f'{sample_target}:Q']
        ).properties(
            title=f'Target Variable Over Time: {sample_target}',
            width=800,
            height=400
        ).interactive()
    else:
        target_chart = None
    target_chart
    return
@app.cell
def _(mo):
    mo.md(
        """
        ## 7. Data Quality Summary

        Final validation checks:
        """
    )
    return
@app.cell
def _(features_jao, is_hourly, lta_feat_cols, mo, pl, unified_jao):
    # Data quality checks
    checks = []
    # Check 1: Timeline sorted and hourly
    checks.append({
        'Check': 'Timeline sorted & hourly',
        'Status': 'PASS' if is_hourly else 'FAIL',
        'Details': f'Most common diff: {unified_jao["mtu"].diff().drop_nulls().mode()[0]}'
    })
    # Check 2: No nulls in unified dataset
    unified_nulls = unified_jao.null_count().sum_horizontal()[0]
    checks.append({
        'Check': 'Unified data completeness',
        'Status': 'PASS' if unified_nulls == 0 else 'WARNING',
        'Details': f'{unified_nulls} nulls ({(unified_nulls / (len(unified_jao) * len(unified_jao.columns)) * 100):.2f}%)'
    })
    # Check 3: LTA features have no nulls (future covariates)
    lta_nulls = features_jao.select(lta_feat_cols).null_count().sum_horizontal()[0] if lta_feat_cols else 0
    checks.append({
        'Check': 'LTA future covariates complete',
        'Status': 'PASS' if lta_nulls == 0 else 'FAIL',
        'Details': f'{lta_nulls} nulls in {len(lta_feat_cols)} LTA features'
    })
    # Check 4: Data consistency (same row count)
    checks.append({
        'Check': 'Data consistency',
        'Status': 'PASS' if len(unified_jao) == len(features_jao) else 'FAIL',
        'Details': f'Unified: {len(unified_jao):,} rows, Features: {len(features_jao):,} rows'
    })
    checks_df = pl.DataFrame(checks)
    mo.ui.table(checks_df.to_pandas())
    return (checks,)
@app.cell
def _(checks, mo):
    # Overall status (assign first so the summary renders as cell output)
    all_pass = all(c['Status'] == 'PASS' for c in checks)
    if all_pass:
        overall_status = mo.md("""
        ✅ **All validation checks PASSED**

        Data is ready for model training and inference!
        """)
    else:
        failed = [c['Check'] for c in checks if c['Status'] == 'FAIL']
        warnings = [c['Check'] for c in checks if c['Status'] == 'WARNING']
        status = "⚠️ **Some checks failed or have warnings**\n\n"
        if failed:
            status += f"**Failed**: {', '.join(failed)}\n\n"
        if warnings:
            status += f"**Warnings**: {', '.join(warnings)}"
        overall_status = mo.md(status)
    overall_status
    return
@app.cell
def _(mo):
    mo.md(
        """
        ## Next Steps

        ✅ **JAO Data Collection & Unification: COMPLETE**

        - 24 months of data (Oct 2023 - Oct 2025)
        - 17,544 hourly records
        - 726 features engineered

        **Remaining Work:**

        1. Collect weather data (OpenMeteo, 52 grid points)
        2. Collect ENTSO-E data (generation, flows, outages)
        3. Complete remaining feature scaffolding (NetPos lags, MaxBEX lags, system aggregates)
        4. Integrate all data sources
        5. Begin zero-shot Chronos 2 inference

        ---

        **Data Files**:

        - `data/processed/unified_jao_24month.parquet` (5.59 MB)
        - `data/processed/cnec_hourly_24month.parquet` (4.57 MB)
        - `data/processed/features_jao_24month.parquet` (0.60 MB)
        """
    )
    return
@app.cell
def _(mo, unified_jao):
    # Display the full unified JAO dataset (stack heading and table so both render)
    mo.vstack([
        mo.md("## Unified JAO Dataset"),
        mo.ui.table(unified_jao.to_pandas(), page_size=20),
    ])
    return
@app.cell
def _(features_jao, mo, unified_jao):
    # Show the actual structure with timestamp (stacked so every element renders)
    display_df = unified_jao.select(['mtu'] + [c for c in unified_jao.columns if c != 'mtu'][:10]).head(10)
    mo.vstack([
        mo.md("### Unified JAO Dataset Structure"),
        mo.ui.table(display_df.to_pandas()),
        mo.md(f"""
        **Dataset Info:**

        - **Total columns**: {len(unified_jao.columns)}
        - **Timestamp column**: `mtu` (Market Time Unit)
        - **Date range**: {unified_jao['mtu'].min()} to {unified_jao['mtu'].max()}
        """),
        # Show the 726 features dataset separately
        mo.md("### Features Dataset (726 engineered features)"),
        mo.ui.table(features_jao.select(['mtu'] + features_jao.columns[1:11]).head(10).to_pandas()),
    ])
    return
@app.cell
def _(features_jao, mo, pl, unified_jao):
    # Show actual column counts. Underscore-prefixed locals avoid redefining the
    # tier/LTA/temporal/target column lists already defined in the summary cell above.
    _tier1_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_')]
    _tier2_cols = [c for c in features_jao.columns if c.startswith('cnec_t2_')]
    _lta_feat_cols = [c for c in features_jao.columns if c.startswith('lta_')]
    _temporal_cols = [c for c in features_jao.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
    _target_cols = [c for c in features_jao.columns if c.startswith('target_')]
    _feature_breakdown = pl.DataFrame({
        'Category': ['Tier-1 CNEC', 'Tier-2 CNEC', 'LTA', 'Temporal', 'Targets', 'TOTAL'],
        'Count': [len(_tier1_cols), len(_tier2_cols), len(_lta_feat_cols), len(_temporal_cols), len(_target_cols), len(features_jao.columns) - 1]
    })
    # Print the first 20 actual column names from features_jao to the console output
    for _i, _col in enumerate(features_jao.columns[:20]):
        print(f"{_i + 1:3d}. {_col}")
    mo.vstack([
        mo.md(f"""
        ### Dataset Column Counts

        **unified_jao**: {len(unified_jao.columns)} columns
        - Raw unified data (MaxBEX, LTA, NetPos)

        **features_jao**: {len(features_jao.columns)} columns
        - Engineered features (726 + timestamp)
        """),
        mo.md("### Feature Breakdown in features_jao dataset:"),
        mo.ui.table(_feature_breakdown.to_pandas()),
        mo.md("### First 20 column names in features_jao (see console output)"),
    ])
    return
@app.cell
def _(features_jao, mo, pl):
    # Check CNEC Tier-1 binding values without redefining variables from earlier cells
    _cnec_t1_binding_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_binding_')]
    if _cnec_t1_binding_cols:
        # Sample of binding values for the first five binding columns
        _sample_bindings = features_jao.select(['mtu'] + _cnec_t1_binding_cols[:5]).head(20)
        # Unique values and value counts for the first binding column
        _first_col = _cnec_t1_binding_cols[0]
        _unique_vals = features_jao[_first_col].unique().sort()
        print(f"Unique values in {_first_col}: {_unique_vals.to_list()}")
        _val_counts = features_jao.group_by(_first_col).agg(pl.len().alias('count')).sort('count', descending=True)
        _binding_view = mo.vstack([
            mo.md("### Sample CNEC Tier-1 Binding Values (First 5 CNECs)"),
            mo.ui.table(_sample_bindings.to_pandas(), page_size=10),
            mo.md(f"### Value Counts for {_first_col}"),
            mo.ui.table(_val_counts.to_pandas()),
        ])
    else:
        _binding_view = mo.md("_No CNEC Tier-1 binding columns found._")
    _binding_view
    return
if __name__ == "__main__":
    app.run()