"""FBMC Flow Forecasting - Data Exploration Notebook Day 1 Objective: Explore downloaded JAO FBMC data structure and identify patterns. Usage: marimo edit notebooks/01_data_exploration.py """ import marimo __generated_with = "0.17.2" app = marimo.App(width="medium") @app.cell def _(): import marimo as mo import polars as pl import altair as alt from pathlib import Path import sys # Add src to path for imports sys.path.insert(0, str(Path.cwd().parent / "src")) return Path, alt, mo, pl @app.cell def _(mo): mo.md( r""" # FBMC Flow Forecasting - Sample Data Exploration **MVP Objective**: Zero-shot electricity cross-border capacity forecasting ## Sample Data Goals: 1. Load 1-week JAO sample data (Sept 23-30, 2025) 2. Inspect MaxBEX structure (TARGET VARIABLE) 3. Inspect CNECs + PTDFs structure (from Active Constraints) 4. Identify binding CNECs in sample period 5. Validate data completeness ## Data Sources (1-week sample): - **MaxBEX**: Maximum Bilateral Exchange capacity (TARGET) - 208 hours × 132 borders - **CNECs/PTDFs**: Active constraints with PTDFs for all zones - 813 CNECs × 40 columns _Note: This is a 1-week sample for API testing. Full 24-month collection pending._ """ ) return @app.cell def _(Path): # Configuration DATA_DIR = Path("data/raw/sample") RESULTS_DIR = Path("results/visualizations") # Expected sample data files (1-week: Sept 23-30, 2025) MAXBEX_FILE = DATA_DIR / "maxbex_sample_sept2025.parquet" CNECS_FILE = DATA_DIR / "cnecs_sample_sept2025.parquet" return CNECS_FILE, MAXBEX_FILE @app.cell def _(CNECS_FILE, MAXBEX_FILE, mo): # Check data availability data_status = { "MaxBEX (TARGET)": MAXBEX_FILE.exists(), "CNECs/PTDFs": CNECS_FILE.exists(), } if all(data_status.values()): mo.md(""" ✅ **Sample data files found - ready for exploration!** - MaxBEX: 208 hours × 132 borders - CNECs/PTDFs: 813 records × 40 columns """) else: missing = [k for k, v in data_status.items() if not v] mo.md( f""" ⚠️ **Missing data files**: {', '.join(missing)} **Next Steps:** 1. Run sample collection: `python scripts/collect_sample_data.py` 2. Return here for exploration """ ) return (data_status,) @app.cell def _(data_status, mo): # Only proceed if data exists if not all(data_status.values()): mo.stop(True, mo.md("⚠️ Data not available - stopping notebook")) return @app.cell def _(CNECS_FILE, MAXBEX_FILE, pl): # Load sample data print("Loading JAO sample datasets...") maxbex_df = pl.read_parquet(MAXBEX_FILE) cnecs_df = pl.read_parquet(CNECS_FILE) print(f"[OK] MaxBEX (TARGET): {maxbex_df.shape}") print(f"[OK] CNECs/PTDFs: {cnecs_df.shape}") return cnecs_df, maxbex_df @app.cell def _(cnecs_df, maxbex_df, mo): mo.md( f""" ## Dataset Overview (1-Week Sample: Sept 23-30, 2025) ### MaxBEX Data (TARGET VARIABLE) - **Shape**: {maxbex_df.shape[0]:,} rows × {maxbex_df.shape[1]} columns - **Description**: Maximum Bilateral Exchange capacity across all FBMC Core borders - **Border Directions**: {maxbex_df.shape[1]} (e.g., AT>BE, DE>FR, etc.) - **Format**: Wide format - each column is a border direction ### CNECs/PTDFs Data (Active Constraints) - **Shape**: {cnecs_df.shape[0]:,} rows × {cnecs_df.shape[1]} columns - **Description**: Critical Network Elements with Contingencies + Power Transfer Distribution Factors - **Key Fields**: tso, cnec_name, shadow_price, ram, ptdf_AT, ptdf_BE, etc. - **Unique CNECs**: {cnecs_df['cnec_name'].n_unique() if 'cnec_name' in cnecs_df.columns else 'N/A'} """ ) return @app.cell def _(mo): mo.md("""## 1. MaxBEX DataFrame (TARGET VARIABLE)""") return @app.cell def _(maxbex_df, mo): # Display MaxBEX dataframe mo.ui.table(maxbex_df.head(20).to_pandas()) return @app.cell def _(mo): mo.md( r""" ### Understanding MaxBEX: Commercial vs Physical Capacity **What is MaxBEX?** - MaxBEX = **Max**imum **B**ilateral **Ex**change capacity - Represents commercial hub-to-hub trading capacity between zone pairs - NOT the same as physical interconnector ratings **Why 132 Border Directions?** - FBMC Core has 12 bidding zones (AT, BE, CZ, DE-LU, FR, HR, HU, NL, PL, RO, SI, SK) - MaxBEX exists for ALL zone pairs: 12 × 11 = 132 bidirectional combinations - This includes "virtual borders" (zone pairs without physical interconnectors) **Virtual Borders Explained:** - Example: FR→HU exchange capacity exists despite no physical FR-HU interconnector - Power flows through AC grid network via intermediate countries (DE, AT, CZ) - PTDFs (Power Transfer Distribution Factors) quantify how each zone-pair exchange affects every CNEC - MaxBEX is the result of optimization: maximize zone-to-zone exchange subject to ALL network constraints **Network Physics:** - A 1000 MW export from FR to HU physically affects transmission lines in: - Germany (DE): Power flows through DE grid - Austria (AT): Power flows through AT grid - Czech Republic (CZ): Power flows through CZ grid - Each CNEC has PTDFs for all zones, capturing these network sensitivities - MaxBEX capacity is limited by the most constraining CNEC in the network **Interpretation:** - Physical borders (e.g., DE→FR): Limited by interconnector capacity + network constraints - Virtual borders (e.g., FR→HU): Limited purely by network constraints (CNECs + PTDFs) - All MaxBEX values are simultaneously feasible (network-secure commercial capacity) """ ) return @app.cell def _(maxbex_df, mo): mo.md(f""" ### Key Borders Statistics Showing capacity ranges for major borders: """) # Select key borders for statistics table stats_key_borders = ['DE>FR', 'FR>DE', 'DE>NL', 'NL>DE', 'AT>DE', 'DE>AT', 'BE>NL', 'NL>BE'] available_borders = [b for b in stats_key_borders if b in maxbex_df.columns] # Get statistics and round to 1 decimal place stats_df = maxbex_df.select(available_borders).describe() stats_df_rounded = stats_df.with_columns([ pl.col(col).round(1) for col in stats_df.columns if col != 'statistic' ]) mo.ui.table(stats_df_rounded.to_pandas()) return @app.cell def _(alt, maxbex_df, pl): # MaxBEX Time Series Visualization using Polars # Select borders for time series chart timeseries_borders = ['DE>FR', 'FR>DE', 'DE>NL', 'NL>DE', 'AT>DE', 'DE>AT'] available_timeseries = [b for b in timeseries_borders if b in maxbex_df.columns] # Add row number and unpivot to long format using Polars maxbex_with_hour = maxbex_df.select(available_timeseries).with_row_index(name='hour') maxbex_plot = maxbex_with_hour.unpivot( index=['hour'], on=available_timeseries, variable_name='border', value_name='capacity_MW' ) chart_maxbex = alt.Chart(maxbex_plot.to_pandas()).mark_line().encode( x=alt.X('hour:Q', title='Hour'), y=alt.Y('capacity_MW:Q', title='Capacity (MW)'), color=alt.Color('border:N', title='Border'), tooltip=['hour:Q', 'border:N', 'capacity_MW:Q'] ).properties( title='MaxBEX Capacity Over Time (Key Borders)', width=800, height=400 ).interactive() chart_maxbex return @app.cell def _(mo): mo.md("""### MaxBEX Capacity Heatmap (All Zone Pairs)""") return @app.cell def _(alt, maxbex_df, pl): # Create heatmap of average MaxBEX capacity across all zone pairs using Polars # Parse border names into from/to zones with mean capacity zones = ['AT', 'BE', 'CZ', 'DE', 'FR', 'HR', 'HU', 'NL', 'PL', 'RO', 'SI', 'SK'] heatmap_data = [] for heatmap_col in maxbex_df.columns: if '>' in heatmap_col: from_zone, to_zone = heatmap_col.split('>') heatmap_mean_capacity = maxbex_df[heatmap_col].mean() heatmap_data.append({ 'from_zone': from_zone, 'to_zone': to_zone, 'avg_capacity': heatmap_mean_capacity }) heatmap_df = pl.DataFrame(heatmap_data) # Create heatmap heatmap = alt.Chart(heatmap_df.to_pandas()).mark_rect().encode( x=alt.X('from_zone:N', title='From Zone', sort=zones), y=alt.Y('to_zone:N', title='To Zone', sort=zones), color=alt.Color('avg_capacity:Q', scale=alt.Scale(scheme='viridis'), title='Avg Capacity (MW)'), tooltip=['from_zone:N', 'to_zone:N', alt.Tooltip('avg_capacity:Q', format='.0f', title='Capacity (MW)')] ).properties( title='Average MaxBEX Capacity: All 132 Zone Pairs', width=600, height=600 ) heatmap return @app.cell def _(mo): mo.md("""### Physical vs Virtual Borders Analysis""") return @app.cell def _(alt, maxbex_df, pl): # Identify physical vs virtual borders based on typical interconnector patterns # Physical borders: adjacent countries with known interconnectors physical_borders = [ 'AT>DE', 'DE>AT', 'AT>CZ', 'CZ>AT', 'AT>HU', 'HU>AT', 'AT>SI', 'SI>AT', 'BE>FR', 'FR>BE', 'BE>NL', 'NL>BE', 'BE>DE', 'DE>BE', 'CZ>DE', 'DE>CZ', 'CZ>PL', 'PL>CZ', 'CZ>SK', 'SK>CZ', 'DE>FR', 'FR>DE', 'DE>NL', 'NL>DE', 'DE>PL', 'PL>DE', 'FR>DE', 'DE>FR', 'HR>HU', 'HU>HR', 'HR>SI', 'SI>HR', 'HU>RO', 'RO>HU', 'HU>SK', 'SK>HU', 'PL>SK', 'SK>PL', 'RO>SI', 'SI>RO' # May be virtual ] # Calculate statistics for comparison using Polars comparison_data = [] for comparison_col in maxbex_df.columns: if '>' in comparison_col: comparison_mean_capacity = maxbex_df[comparison_col].mean() border_type = 'Physical' if comparison_col in physical_borders else 'Virtual' comparison_data.append({ 'border': comparison_col, 'type': border_type, 'avg_capacity': comparison_mean_capacity }) comparison_df = pl.DataFrame(comparison_data) # Box plot comparison box_plot = alt.Chart(comparison_df.to_pandas()).mark_boxplot(extent='min-max').encode( x=alt.X('type:N', title='Border Type'), y=alt.Y('avg_capacity:Q', title='Average Capacity (MW)'), color=alt.Color('type:N', scale=alt.Scale(domain=['Physical', 'Virtual'], range=['#1f77b4', '#ff7f0e'])) ).properties( title='MaxBEX Capacity Distribution: Physical vs Virtual Borders', width=400, height=400 ) # Summary statistics summary = comparison_df.group_by('type').agg([ pl.col('avg_capacity').mean().alias('mean_capacity'), pl.col('avg_capacity').median().alias('median_capacity'), pl.col('avg_capacity').min().alias('min_capacity'), pl.col('avg_capacity').max().alias('max_capacity'), pl.len().alias('count') ]) box_plot return comparison_df, summary @app.cell def _(mo, summary): return mo.vstack([ mo.md("**Border Type Statistics:**"), mo.ui.table(summary.to_pandas()) ]) @app.cell def _(mo): mo.md("""## 2. CNECs/PTDFs DataFrame""") return @app.cell def _(cnecs_df, mo): # Display CNECs dataframe mo.ui.table(cnecs_df.head(20).to_pandas()) return @app.cell def _(alt, cnecs_df, pl): # Top Binding CNECs by Shadow Price top_cnecs = ( cnecs_df .group_by('cnec_name') .agg([ pl.col('shadow_price').mean().alias('avg_shadow_price'), pl.col('ram').mean().alias('avg_ram'), pl.len().alias('count') ]) .sort('avg_shadow_price', descending=True) .head(15) ) chart_cnecs = alt.Chart(top_cnecs.to_pandas()).mark_bar().encode( x=alt.X('avg_shadow_price:Q', title='Average Shadow Price (€/MWh)'), y=alt.Y('cnec_name:N', sort='-x', title='CNEC'), tooltip=['cnec_name:N', 'avg_shadow_price:Q', 'avg_ram:Q', 'count:Q'], color=alt.Color('avg_shadow_price:Q', scale=alt.Scale(scheme='reds')) ).properties( title='Top 15 CNECs by Average Shadow Price', width=800, height=400 ) chart_cnecs return @app.cell def _(alt, cnecs_df): # Shadow Price Distribution chart_shadow = alt.Chart(cnecs_df.to_pandas()).mark_bar().encode( x=alt.X('shadow_price:Q', bin=alt.Bin(maxbins=50), title='Shadow Price (€/MWh)'), y=alt.Y('count()', title='Count'), tooltip=['shadow_price:Q', 'count()'] ).properties( title='Shadow Price Distribution', width=800, height=300 ) chart_shadow return @app.cell def _(alt, cnecs_df, pl): # TSO Distribution tso_counts = ( cnecs_df .group_by('tso') .agg(pl.len().alias('count')) .sort('count', descending=True) ) chart_tso = alt.Chart(tso_counts.to_pandas()).mark_bar().encode( x=alt.X('count:Q', title='Number of Active Constraints'), y=alt.Y('tso:N', sort='-x', title='TSO'), tooltip=['tso:N', 'count:Q'], color=alt.value('steelblue') ).properties( title='Active Constraints by TSO', width=800, height=400 ) chart_tso return @app.cell def _(mo): mo.md("""### CNEC Network Impact Analysis""") return @app.cell def _(alt, cnecs_df, pl): # Analyze which zones are most affected by top CNECs # Select top 10 most binding CNECs top_10_cnecs = ( cnecs_df .group_by('cnec_name') .agg(pl.col('shadow_price').mean().alias('avg_shadow_price')) .sort('avg_shadow_price', descending=True) .head(10) .get_column('cnec_name') .to_list() ) # Get PTDF columns for impact analysis impact_ptdf_cols = [c for c in cnecs_df.columns if c.startswith('ptdf_')] # Calculate average absolute PTDF impact for top CNECs impact_data = [] for cnec in top_10_cnecs: cnec_data = cnecs_df.filter(pl.col('cnec_name') == cnec) for ptdf_col in impact_ptdf_cols: zone = ptdf_col.replace('ptdf_', '') avg_abs_ptdf = cnec_data[ptdf_col].abs().mean() impact_data.append({ 'cnec_name': cnec[:40], # Truncate long names 'zone': zone, 'avg_abs_ptdf': avg_abs_ptdf }) impact_df = pl.DataFrame(impact_data) # Create heatmap showing CNEC-zone impact impact_heatmap = alt.Chart(impact_df.to_pandas()).mark_rect().encode( x=alt.X('zone:N', title='Zone'), y=alt.Y('cnec_name:N', title='CNEC (Top 10 by Shadow Price)'), color=alt.Color('avg_abs_ptdf:Q', scale=alt.Scale(scheme='reds'), title='Avg |PTDF|'), tooltip=['cnec_name:N', 'zone:N', alt.Tooltip('avg_abs_ptdf:Q', format='.4f')] ).properties( title='Network Impact: Which Zones Affect Each CNEC?', width=600, height=400 ) impact_heatmap return @app.cell def _(cnecs_df, mo): mo.md("## 3. PTDF Analysis") # Extract PTDF columns ptdf_cols = [c for c in cnecs_df.columns if c.startswith('ptdf_')] mo.md(f"**PTDF Zones**: {len(ptdf_cols)} zones - {', '.join([c.replace('ptdf_', '') for c in ptdf_cols])}") return (ptdf_cols,) @app.cell def _(cnecs_df, ptdf_cols): # PTDF Statistics ptdf_stats = cnecs_df.select(ptdf_cols).describe() ptdf_stats return @app.cell def _(mo): mo.md( """ ## Data Quality Validation Checking for completeness, missing values, and data integrity: """ ) return @app.cell def _(cnecs_df, maxbex_df, mo, pl): # Calculate data completeness def check_completeness(df, name): total_cells = df.shape[0] * df.shape[1] null_cells = df.null_count().sum_horizontal()[0] completeness = (1 - null_cells / total_cells) * 100 return { 'Dataset': name, 'Total Cells': total_cells, 'Null Cells': null_cells, 'Completeness %': f"{completeness:.2f}%" } completeness_report = [ check_completeness(maxbex_df, 'MaxBEX (TARGET)'), check_completeness(cnecs_df, 'CNECs/PTDFs') ] mo.ui.table(pl.DataFrame(completeness_report).to_pandas()) return (completeness_report,) @app.cell def _(completeness_report, mo): # Validation check all_complete = all( float(r['Completeness %'].rstrip('%')) >= 95.0 for r in completeness_report ) if all_complete: mo.md("✅ **All datasets meet >95% completeness threshold**") else: mo.md("⚠️ **Some datasets below 95% completeness - investigate missing data**") return @app.cell def _(mo): mo.md( """ ## Next Steps After data exploration completion: 1. **Day 2**: Feature engineering (75-85 features) 2. **Day 3**: Zero-shot inference with Chronos 2 3. **Day 4**: Performance evaluation and analysis 4. **Day 5**: Documentation and handover --- **Note**: This notebook will be exported to JupyterLab format (.ipynb) for analyst handover. """ ) return if __name__ == "__main__": app.run()