"""FBMC Flow Forecasting - Unified JAO Data Exploration Objective: Explore unified 24-month JAO data and engineered features This notebook explores: 1. Unified JAO dataset (MaxBEX + CNEC + LTA + NetPos) 2. Engineered features (726 features across 5 categories) 3. Feature completeness and validation 4. Key statistics and distributions Usage: marimo edit notebooks/02_unified_jao_exploration.py """ import marimo __generated_with = "0.17.2" app = marimo.App(width="medium") @app.cell def _(): import marimo as mo import polars as pl import altair as alt from pathlib import Path import numpy as np return Path, alt, mo, pl @app.cell def _(mo): mo.md( r""" # Unified JAO Data Exploration (24 Months) **Date Range**: October 2023 - October 2025 (24 months) ## Data Pipeline Overview: 1. **Raw JAO Data** (4 datasets) - MaxBEX: Maximum Bilateral Exchange capacity (TARGET) - CNEC/PTDF: Critical constraints with power transfer factors - LTA: Long Term Allocations (future covariates) - Net Positions: Domain boundaries (min/max per zone) 2. **Data Unification** → `unified_jao_24month.parquet` - Deduplicated NetPos (removed 1,152 duplicate timestamps) - Forward-filled LTA gaps (710 missing hours) - Broadcast daily CNEC to hourly - Sorted timeline (hourly, 17,544 records) 3. **Feature Engineering** → `features_jao_24month.parquet` - 726 features across 5 categories - Tier-1 CNEC: 274 features - Tier-2 CNEC: 390 features - LTA: 40 features - Temporal: 12 features - Targets: 10 features """ ) return @app.cell def _(Path, pl): # Load unified datasets print("Loading unified JAO datasets...") processed_dir = Path('data/processed') unified_jao = pl.read_parquet(processed_dir / 'unified_jao_24month.parquet') cnec_hourly = pl.read_parquet(processed_dir / 'cnec_hourly_24month.parquet') features_jao = pl.read_parquet(processed_dir / 'features_jao_24month.parquet') print(f"[OK] Unified JAO: {unified_jao.shape}") print(f"[OK] CNEC hourly: {cnec_hourly.shape}") print(f"[OK] Features: {features_jao.shape}") return features_jao, unified_jao @app.cell def _(features_jao, mo, unified_jao): # Dataset overview mo.md(f""" ## Dataset Overview ### 1. Unified JAO Dataset - **Shape**: {unified_jao.shape[0]:,} rows × {unified_jao.shape[1]} columns - **Date Range**: {unified_jao['mtu'].min()} to {unified_jao['mtu'].max()} - **Timeline Sorted**: {unified_jao['mtu'].is_sorted()} - **Null Percentage**: {(unified_jao.null_count().sum_horizontal()[0] / (len(unified_jao) * len(unified_jao.columns)) * 100):.2f}% ### 2. Engineered Features - **Shape**: {features_jao.shape[0]:,} rows × {features_jao.shape[1]} columns - **Total Features**: {features_jao.shape[1] - 1} (excluding mtu timestamp) - **Null Percentage**: {(features_jao.null_count().sum_horizontal()[0] / (len(features_jao) * len(features_jao.columns)) * 100):.2f}% - _Note: High nulls expected due to sparse CNEC binding patterns and lag features_ """) return @app.cell def _(mo): mo.md("""## 1. Unified JAO Dataset Structure""") return @app.cell def _(mo, unified_jao): # Show sample of unified data mo.md("""### Sample Data (First 20 Rows)""") mo.ui.table(unified_jao.head(20).to_pandas(), page_size=10) return @app.cell def _(mo, unified_jao): # Column breakdown maxbex_cols = [c for c in unified_jao.columns if 'border_' in c and not c.startswith('lta')] lta_cols = [c for c in unified_jao.columns if c.startswith('border_')] netpos_cols = [c for c in unified_jao.columns if c.startswith('netpos_')] mo.md(f""" ### Column Breakdown - **Timestamp**: 1 column (`mtu`) - **MaxBEX Borders**: {len(maxbex_cols)} columns - **LTA Borders**: {len(lta_cols)} columns - **Net Positions**: {len(netpos_cols)} columns (if present) - **Total**: {unified_jao.shape[1]} columns """) return @app.cell def _(mo): mo.md("""### Timeline Validation""") return @app.cell def _(alt, pl, unified_jao): # Timeline validation time_diffs = unified_jao['mtu'].diff().drop_nulls() # Most common time diff most_common = time_diffs.mode()[0] is_hourly = most_common.total_seconds() == 3600 # Create histogram of time diffs time_diff_hours = time_diffs.map_elements(lambda x: x.total_seconds() / 3600, return_dtype=pl.Float64) time_diff_df = pl.DataFrame({ 'time_diff_hours': time_diff_hours }) timeline_chart = alt.Chart(time_diff_df.to_pandas()).mark_bar().encode( x=alt.X('time_diff_hours:Q', bin=alt.Bin(maxbins=50), title='Time Difference (hours)'), y=alt.Y('count()', title='Count'), tooltip=['time_diff_hours:Q', 'count()'] ).properties( title='Timeline Gaps Distribution', width=800, height=300 ) timeline_chart return is_hourly, most_common @app.cell def _(is_hourly, mo, most_common): if is_hourly: mo.md(f""" ✅ **Timeline Validation: PASS** - Most common time diff: {most_common} (1 hour) - Timeline is properly sorted and hourly """) else: mo.md(f""" ⚠️ **Timeline Validation: WARNING** - Most common time diff: {most_common} - Expected: 1 hour """) return @app.cell def _(mo): mo.md("""## 2. Feature Engineering Results""") return @app.cell def _(features_jao, mo, pl): # Feature category breakdown tier1_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_')] tier2_cols = [c for c in features_jao.columns if c.startswith('cnec_t2_')] lta_feat_cols = [c for c in features_jao.columns if c.startswith('lta_')] temporal_cols = [c for c in features_jao.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']] target_cols = [c for c in features_jao.columns if c.startswith('target_')] # Create summary table feature_summary = pl.DataFrame({ 'Category': ['Tier-1 CNEC', 'Tier-2 CNEC', 'LTA', 'Temporal', 'Targets', 'TOTAL'], 'Features': [len(tier1_cols), len(tier2_cols), len(lta_feat_cols), len(temporal_cols), len(target_cols), features_jao.shape[1] - 1], 'Null %': [ f"{(features_jao.select(tier1_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(tier1_cols)) * 100):.2f}%" if tier1_cols else "N/A", f"{(features_jao.select(tier2_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(tier2_cols)) * 100):.2f}%" if tier2_cols else "N/A", f"{(features_jao.select(lta_feat_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(lta_feat_cols)) * 100):.2f}%" if lta_feat_cols else "N/A", f"{(features_jao.select(temporal_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(temporal_cols)) * 100):.2f}%" if temporal_cols else "N/A", f"{(features_jao.select(target_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(target_cols)) * 100):.2f}%" if target_cols else "N/A", f"{(features_jao.null_count().sum_horizontal()[0] / (len(features_jao) * len(features_jao.columns)) * 100):.2f}%" ] }) mo.ui.table(feature_summary.to_pandas()) return lta_feat_cols, target_cols, temporal_cols, tier1_cols, tier2_cols @app.cell def _(mo): mo.md("""### Sample Features (First 20 Rows)""") return @app.cell def _(features_jao, mo): # Show first 10 columns only (too many to display all) mo.ui.table(features_jao.select(features_jao.columns[:10]).head(20).to_pandas(), page_size=10) return @app.cell def _(mo): mo.md("""## 3. LTA Features (Future Covariates)""") return @app.cell def _(lta_feat_cols, mo): # LTA features analysis mo.md(f""" **LTA Features**: {len(lta_feat_cols)} features LTA (Long Term Allocations) are **future covariates** - known years in advance via auctions. These should have **0% nulls** since they're available for the entire forecast horizon. """) return @app.cell def _(alt, features_jao): # Plot LTA total allocated over time lta_chart_data = features_jao.select(['mtu', 'lta_total_allocated']).sort('mtu') lta_chart = alt.Chart(lta_chart_data.to_pandas()).mark_line().encode( x=alt.X('mtu:T', title='Date'), y=alt.Y('lta_total_allocated:Q', title='Total LTA Allocated (MW)'), tooltip=['mtu:T', 'lta_total_allocated:Q'] ).properties( title='LTA Total Allocated Capacity Over Time', width=800, height=400 ).interactive() lta_chart return @app.cell def _(features_jao, lta_feat_cols, mo): # LTA statistics lta_stats = features_jao.select(lta_feat_cols[:5]).describe() mo.md("""### LTA Sample Statistics (First 5 Features)""") mo.ui.table(lta_stats.to_pandas()) return @app.cell def _(mo): mo.md("""## 4. Temporal Features""") return @app.cell def _(features_jao, mo, temporal_cols): # Show temporal features mo.md(f""" **Temporal Features**: {len(temporal_cols)} features Cyclic encoding for hour, month, and weekday to capture periodicity. """) mo.ui.table(features_jao.select(['mtu'] + temporal_cols).head(24).to_pandas()) return @app.cell def _(alt, features_jao, pl): # Hourly distribution hour_dist = features_jao.group_by('hour').agg(pl.len().alias('count')).sort('hour') hour_chart = alt.Chart(hour_dist.to_pandas()).mark_bar().encode( x=alt.X('hour:O', title='Hour of Day'), y=alt.Y('count:Q', title='Count'), tooltip=['hour:O', 'count:Q'] ).properties( title='Distribution by Hour of Day', width=800, height=300 ) hour_chart return @app.cell def _(mo): mo.md("""## 5. CNEC Features (Historical)""") return @app.cell def _(features_jao, mo, tier1_cols, tier2_cols): # CNEC features overview mo.md(f""" **CNEC Features**: {len(tier1_cols) + len(tier2_cols)} total - **Tier-1 CNECs**: {len(tier1_cols)} features (top 58 most critical CNECs) - **Tier-2 CNECs**: {len(tier2_cols)} features (next 150 CNECs) High null percentage is **expected** due to: 1. Sparse binding patterns (not all CNECs bind every hour) 2. Lag features create nulls at timeline start 3. Pivoting creates sparse constraint matrices """) # Sample Tier-1 features mo.ui.table(features_jao.select(['mtu'] + tier1_cols[:5]).head(20).to_pandas(), page_size=10) return @app.cell def _(alt, features_jao, pl, tier1_cols): # Binding frequency for sample Tier-1 CNECs binding_cols = [c for c in tier1_cols if 'binding_' in c][:10] if binding_cols: binding_freq = pl.DataFrame({ 'cnec': [c.replace('cnec_t1_binding_', '') for c in binding_cols], 'binding_rate': [features_jao[c].mean() for c in binding_cols] }) binding_chart = alt.Chart(binding_freq.to_pandas()).mark_bar().encode( x=alt.X('binding_rate:Q', title='Binding Frequency (0-1)'), y=alt.Y('cnec:N', sort='-x', title='CNEC'), tooltip=['cnec:N', alt.Tooltip('binding_rate:Q', format='.2%')] ).properties( title='Binding Frequency - Sample Tier-1 CNECs', width=800, height=300 ) binding_chart else: None return @app.cell def _(mo): mo.md("""## 6. Target Variables""") return @app.cell def _(features_jao, mo, target_cols): # Show target variables (MaxBEX borders) mo.md(f""" **Target Variables**: {len(target_cols)} features Sample MaxBEX borders for forecasting (first 10 borders): """) if target_cols: mo.ui.table(features_jao.select(['mtu'] + target_cols).head(20).to_pandas(), page_size=10) return @app.cell def _(alt, features_jao, target_cols): # Plot sample target variable over time if target_cols: sample_target = target_cols[0] target_chart_data = features_jao.select(['mtu', sample_target]).sort('mtu') target_chart = alt.Chart(target_chart_data.to_pandas()).mark_line().encode( x=alt.X('mtu:T', title='Date'), y=alt.Y(f'{sample_target}:Q', title='Capacity (MW)'), tooltip=['mtu:T', f'{sample_target}:Q'] ).properties( title=f'Target Variable Over Time: {sample_target}', width=800, height=400 ).interactive() target_chart else: None return @app.cell def _(mo): mo.md( """ ## 7. Data Quality Summary Final validation checks: """ ) return @app.cell def _(features_jao, is_hourly, lta_feat_cols, mo, pl, unified_jao): # Data quality checks checks = [] # Check 1: Timeline sorted and hourly checks.append({ 'Check': 'Timeline sorted & hourly', 'Status': 'PASS' if is_hourly else 'FAIL', 'Details': f'Most common diff: {unified_jao["mtu"].diff().drop_nulls().mode()[0]}' }) # Check 2: No nulls in unified dataset unified_nulls = unified_jao.null_count().sum_horizontal()[0] checks.append({ 'Check': 'Unified data completeness', 'Status': 'PASS' if unified_nulls == 0 else 'WARNING', 'Details': f'{unified_nulls} nulls ({(unified_nulls / (len(unified_jao) * len(unified_jao.columns)) * 100):.2f}%)' }) # Check 3: LTA features have no nulls (future covariates) lta_nulls = features_jao.select(lta_feat_cols).null_count().sum_horizontal()[0] if lta_feat_cols else 0 checks.append({ 'Check': 'LTA future covariates complete', 'Status': 'PASS' if lta_nulls == 0 else 'FAIL', 'Details': f'{lta_nulls} nulls in {len(lta_feat_cols)} LTA features' }) # Check 4: Data consistency (same row count) checks.append({ 'Check': 'Data consistency', 'Status': 'PASS' if len(unified_jao) == len(features_jao) else 'FAIL', 'Details': f'Unified: {len(unified_jao):,} rows, Features: {len(features_jao):,} rows' }) checks_df = pl.DataFrame(checks) mo.ui.table(checks_df.to_pandas()) return (checks,) @app.cell def _(checks, mo): # Overall status all_pass = all(c['Status'] == 'PASS' for c in checks) if all_pass: mo.md(""" ✅ **All validation checks PASSED** Data is ready for model training and inference! """) else: failed = [c['Check'] for c in checks if c['Status'] == 'FAIL'] warnings = [c['Check'] for c in checks if c['Status'] == 'WARNING'] status = "⚠️ **Some checks failed or have warnings**\n\n" if failed: status += f"**Failed**: {', '.join(failed)}\n\n" if warnings: status += f"**Warnings**: {', '.join(warnings)}" mo.md(status) return @app.cell def _(mo): mo.md( """ ## Next Steps ✅ **JAO Data Collection & Unification: COMPLETE** - 24 months of data (Oct 2023 - Oct 2025) - 17,544 hourly records - 726 features engineered **Remaining Work:** 1. Collect weather data (OpenMeteo, 52 grid points) 2. Collect ENTSO-E data (generation, flows, outages) 3. Complete remaining feature scaffolding (NetPos lags, MaxBEX lags, system aggregates) 4. Integrate all data sources 5. Begin zero-shot Chronos 2 inference --- **Data Files**: - `data/processed/unified_jao_24month.parquet` (5.59 MB) - `data/processed/cnec_hourly_24month.parquet` (4.57 MB) - `data/processed/features_jao_24month.parquet` (0.60 MB) """ ) return @app.cell def _(mo, unified_jao): # Display the unified JAO dataset mo.md("## Unified JAO Dataset") mo.ui.table(unified_jao.to_pandas(), page_size=20) return @app.cell def _(features_jao, mo, unified_jao): # Show the actual structure with timestamp mo.md("### Unified JAO Dataset Structure") display_df = unified_jao.select(['mtu'] + [c for c in unified_jao.columns if c != 'mtu'][:10]).head(10) mo.ui.table(display_df.to_pandas()) mo.md(f""" **Dataset Info:** - **Total columns**: {len(unified_jao.columns)} - **Timestamp column**: `mtu` (Market Time Unit) - **Date range**: {unified_jao['mtu'].min()} to {unified_jao['mtu'].max()} """) # Show the 726 features dataset separately mo.md("### Features Dataset (726 engineered features)") mo.ui.table(features_jao.select(['mtu'] + features_jao.columns[1:11]).head(10).to_pandas()) return @app.cell def _(features_jao, mo, pl, unified_jao): # Show actual column counts mo.md(f""" ### Dataset Column Counts **unified_jao**: {len(unified_jao.columns)} columns - Raw unified data (MaxBEX, LTA, NetPos) **features_jao**: {len(features_jao.columns)} columns - Engineered features (726 + timestamp) """) # Show all column categories in features dataset tier1_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_')] tier2_cols = [c for c in features_jao.columns if c.startswith('cnec_t2_')] lta_feat_cols = [c for c in features_jao.columns if c.startswith('lta_')] temporal_cols = [c for c in features_jao.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']] target_cols = [c for c in features_jao.columns if c.startswith('target_')] feature_breakdown = pl.DataFrame({ 'Category': ['Tier-1 CNEC', 'Tier-2 CNEC', 'LTA', 'Temporal', 'Targets', 'TOTAL'], 'Count': [len(tier1_cols), len(tier2_cols), len(lta_feat_cols), len(temporal_cols), len(target_cols), len(features_jao.columns)] }) mo.md("### Feature Breakdown in features_jao dataset:") mo.ui.table(feature_breakdown.to_pandas()) # Show first 20 actual column names from features_jao mo.md("### First 20 column names in features_jao:") for i, col in enumerate(features_jao.columns[:]): print(f"{i+1:3d}. {col}") return lta_feat_cols, target_cols, temporal_cols, tier1_cols, tier2_cols @app.cell def _(features_jao, mo, pl): # Check CNEC Tier-1 binding values without redefining variables _cnec_t1_binding_cols = [c for c in features_jao.columns if c.startswith('target_border')] if _cnec_t1_binding_cols: # Show sample of binding values _sample_bindings = features_jao.select(['mtu'] + _cnec_t1_binding_cols[:5]).head(20) mo.md("### Sample CNEC Tier-1 Binding Values (First 5 CNECs)") mo.ui.table(_sample_bindings.to_pandas(), page_size=10) # Check unique values in first binding column _first_col = _cnec_t1_binding_cols[0] _unique_vals = features_jao[_first_col].unique().sort() mo.md(f"### Unique Values in {_first_col}") print(f"Unique values: {_unique_vals.to_list()}") # Value counts for first column _val_counts = features_jao.group_by(_first_col).agg(pl.len().alias('count')).sort('count', descending=True) mo.ui.table(_val_counts.to_pandas()) return if __name__ == "__main__": app.run()