Spaces:

evgueni-p
/

fbmc-chronos2

Sleeping

File size: 17,655 Bytes

"""FBMC Flow Forecasting - Data Exploration Notebook

Day 1 Objective: Explore downloaded JAO FBMC data structure and identify patterns.

Usage:
    marimo edit notebooks/01_data_exploration.py
"""

import marimo

__generated_with = "0.17.2"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import polars as pl
    import altair as alt
    from pathlib import Path
    import sys

    # Add src to path for imports
    sys.path.insert(0, str(Path.cwd().parent / "src"))
    return Path, alt, mo, pl


@app.cell
def _(mo):
    mo.md(
        r"""
    # FBMC Flow Forecasting - Sample Data Exploration

    **MVP Objective**: Zero-shot electricity cross-border capacity forecasting

    ## Sample Data Goals:
    1. Load 1-week JAO sample data (Sept 23-30, 2025)
    2. Inspect MaxBEX structure (TARGET VARIABLE)
    3. Inspect CNECs + PTDFs structure (from Active Constraints)
    4. Identify binding CNECs in sample period
    5. Validate data completeness

    ## Data Sources (1-week sample):
    - **MaxBEX**: Maximum Bilateral Exchange capacity (TARGET) - 208 hours × 132 borders
    - **CNECs/PTDFs**: Active constraints with PTDFs for all zones - 813 CNECs × 40 columns

    _Note: This is a 1-week sample for API testing. Full 24-month collection pending._
    """
    )
    return


@app.cell
def _(Path):
    # Configuration
    DATA_DIR = Path("data/raw/sample")
    RESULTS_DIR = Path("results/visualizations")

    # Expected sample data files (1-week: Sept 23-30, 2025)
    MAXBEX_FILE = DATA_DIR / "maxbex_sample_sept2025.parquet"
    CNECS_FILE = DATA_DIR / "cnecs_sample_sept2025.parquet"
    return CNECS_FILE, MAXBEX_FILE


@app.cell
def _(CNECS_FILE, MAXBEX_FILE, mo):
    # Check data availability
    data_status = {
        "MaxBEX (TARGET)": MAXBEX_FILE.exists(),
        "CNECs/PTDFs": CNECS_FILE.exists(),
    }

    if all(data_status.values()):
        mo.md("""
        ✅ **Sample data files found - ready for exploration!**

        - MaxBEX: 208 hours × 132 borders
        - CNECs/PTDFs: 813 records × 40 columns
        """)
    else:
        missing = [k for k, v in data_status.items() if not v]
        mo.md(
            f"""
            ⚠️ **Missing data files**: {', '.join(missing)}

            **Next Steps:**
            1. Run sample collection: `python scripts/collect_sample_data.py`
            2. Return here for exploration
            """
        )
    return (data_status,)


@app.cell
def _(data_status, mo):
    # Only proceed if data exists
    if not all(data_status.values()):
        mo.stop(True, mo.md("⚠️ Data not available - stopping notebook"))
    return


@app.cell
def _(CNECS_FILE, MAXBEX_FILE, pl):
    # Load sample data
    print("Loading JAO sample datasets...")

    maxbex_df = pl.read_parquet(MAXBEX_FILE)
    cnecs_df = pl.read_parquet(CNECS_FILE)

    print(f"[OK] MaxBEX (TARGET): {maxbex_df.shape}")
    print(f"[OK] CNECs/PTDFs: {cnecs_df.shape}")
    return cnecs_df, maxbex_df


@app.cell
def _(cnecs_df, maxbex_df, mo):
    mo.md(
        f"""
    ## Dataset Overview (1-Week Sample: Sept 23-30, 2025)

    ### MaxBEX Data (TARGET VARIABLE)
    - **Shape**: {maxbex_df.shape[0]:,} rows × {maxbex_df.shape[1]} columns
    - **Description**: Maximum Bilateral Exchange capacity across all FBMC Core borders
    - **Border Directions**: {maxbex_df.shape[1]} (e.g., AT>BE, DE>FR, etc.)
    - **Format**: Wide format - each column is a border direction

    ### CNECs/PTDFs Data (Active Constraints)
    - **Shape**: {cnecs_df.shape[0]:,} rows × {cnecs_df.shape[1]} columns
    - **Description**: Critical Network Elements with Contingencies + Power Transfer Distribution Factors
    - **Key Fields**: tso, cnec_name, shadow_price, ram, ptdf_AT, ptdf_BE, etc.
    - **Unique CNECs**: {cnecs_df['cnec_name'].n_unique() if 'cnec_name' in cnecs_df.columns else 'N/A'}
    """
    )
    return


@app.cell
def _(mo):
    mo.md("""## 1. MaxBEX DataFrame (TARGET VARIABLE)""")
    return


@app.cell
def _(maxbex_df, mo):
    # Display MaxBEX dataframe
    mo.ui.table(maxbex_df.head(20).to_pandas())
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ### Understanding MaxBEX: Commercial vs Physical Capacity

    **What is MaxBEX?**
    - MaxBEX = **Max**imum **B**ilateral **Ex**change capacity
    - Represents commercial hub-to-hub trading capacity between zone pairs
    - NOT the same as physical interconnector ratings

    **Why 132 Border Directions?**
    - FBMC Core has 12 bidding zones (AT, BE, CZ, DE-LU, FR, HR, HU, NL, PL, RO, SI, SK)
    - MaxBEX exists for ALL zone pairs: 12 × 11 = 132 bidirectional combinations
    - This includes "virtual borders" (zone pairs without physical interconnectors)

    **Virtual Borders Explained:**
    - Example: FR→HU exchange capacity exists despite no physical FR-HU interconnector
    - Power flows through AC grid network via intermediate countries (DE, AT, CZ)
    - PTDFs (Power Transfer Distribution Factors) quantify how each zone-pair exchange affects every CNEC
    - MaxBEX is the result of optimization: maximize zone-to-zone exchange subject to ALL network constraints

    **Network Physics:**
    - A 1000 MW export from FR to HU physically affects transmission lines in:
      - Germany (DE): Power flows through DE grid
      - Austria (AT): Power flows through AT grid
      - Czech Republic (CZ): Power flows through CZ grid
    - Each CNEC has PTDFs for all zones, capturing these network sensitivities
    - MaxBEX capacity is limited by the most constraining CNEC in the network

    **Interpretation:**
    - Physical borders (e.g., DE→FR): Limited by interconnector capacity + network constraints
    - Virtual borders (e.g., FR→HU): Limited purely by network constraints (CNECs + PTDFs)
    - All MaxBEX values are simultaneously feasible (network-secure commercial capacity)
    """
    )
    return


@app.cell
def _(maxbex_df, mo):
    mo.md(f"""
    ### Key Borders Statistics
    Showing capacity ranges for major borders:
    """)

    # Select key borders for statistics table
    stats_key_borders = ['DE>FR', 'FR>DE', 'DE>NL', 'NL>DE', 'AT>DE', 'DE>AT', 'BE>NL', 'NL>BE']
    available_borders = [b for b in stats_key_borders if b in maxbex_df.columns]

    # Get statistics and round to 1 decimal place
    stats_df = maxbex_df.select(available_borders).describe()
    stats_df_rounded = stats_df.with_columns([
        pl.col(col).round(1) for col in stats_df.columns if col != 'statistic'
    ])

    mo.ui.table(stats_df_rounded.to_pandas())
    return


@app.cell
def _(alt, maxbex_df, pl):
    # MaxBEX Time Series Visualization using Polars

    # Select borders for time series chart
    timeseries_borders = ['DE>FR', 'FR>DE', 'DE>NL', 'NL>DE', 'AT>DE', 'DE>AT']
    available_timeseries = [b for b in timeseries_borders if b in maxbex_df.columns]

    # Add row number and unpivot to long format using Polars
    maxbex_with_hour = maxbex_df.select(available_timeseries).with_row_index(name='hour')

    maxbex_plot = maxbex_with_hour.unpivot(
        index=['hour'],
        on=available_timeseries,
        variable_name='border',
        value_name='capacity_MW'
    )

    chart_maxbex = alt.Chart(maxbex_plot.to_pandas()).mark_line().encode(
        x=alt.X('hour:Q', title='Hour'),
        y=alt.Y('capacity_MW:Q', title='Capacity (MW)'),
        color=alt.Color('border:N', title='Border'),
        tooltip=['hour:Q', 'border:N', 'capacity_MW:Q']
    ).properties(
        title='MaxBEX Capacity Over Time (Key Borders)',
        width=800,
        height=400
    ).interactive()

    chart_maxbex
    return


@app.cell
def _(mo):
    mo.md("""### MaxBEX Capacity Heatmap (All Zone Pairs)""")
    return


@app.cell
def _(alt, maxbex_df, pl):
    # Create heatmap of average MaxBEX capacity across all zone pairs using Polars

    # Parse border names into from/to zones with mean capacity
    zones = ['AT', 'BE', 'CZ', 'DE', 'FR', 'HR', 'HU', 'NL', 'PL', 'RO', 'SI', 'SK']
    heatmap_data = []

    for heatmap_col in maxbex_df.columns:
        if '>' in heatmap_col:
            from_zone, to_zone = heatmap_col.split('>')
            heatmap_mean_capacity = maxbex_df[heatmap_col].mean()
            heatmap_data.append({
                'from_zone': from_zone,
                'to_zone': to_zone,
                'avg_capacity': heatmap_mean_capacity
            })

    heatmap_df = pl.DataFrame(heatmap_data)

    # Create heatmap
    heatmap = alt.Chart(heatmap_df.to_pandas()).mark_rect().encode(
        x=alt.X('from_zone:N', title='From Zone', sort=zones),
        y=alt.Y('to_zone:N', title='To Zone', sort=zones),
        color=alt.Color('avg_capacity:Q',
                       scale=alt.Scale(scheme='viridis'),
                       title='Avg Capacity (MW)'),
        tooltip=['from_zone:N', 'to_zone:N', alt.Tooltip('avg_capacity:Q', format='.0f', title='Capacity (MW)')]
    ).properties(
        title='Average MaxBEX Capacity: All 132 Zone Pairs',
        width=600,
        height=600
    )

    heatmap
    return


@app.cell
def _(mo):
    mo.md("""### Physical vs Virtual Borders Analysis""")
    return


@app.cell
def _(alt, maxbex_df, pl):
    # Identify physical vs virtual borders based on typical interconnector patterns
    # Physical borders: adjacent countries with known interconnectors
    physical_borders = [
        'AT>DE', 'DE>AT', 'AT>CZ', 'CZ>AT', 'AT>HU', 'HU>AT', 'AT>SI', 'SI>AT',
        'BE>FR', 'FR>BE', 'BE>NL', 'NL>BE', 'BE>DE', 'DE>BE',
        'CZ>DE', 'DE>CZ', 'CZ>PL', 'PL>CZ', 'CZ>SK', 'SK>CZ',
        'DE>FR', 'FR>DE', 'DE>NL', 'NL>DE', 'DE>PL', 'PL>DE',
        'FR>DE', 'DE>FR',
        'HR>HU', 'HU>HR', 'HR>SI', 'SI>HR',
        'HU>RO', 'RO>HU', 'HU>SK', 'SK>HU',
        'PL>SK', 'SK>PL',
        'RO>SI', 'SI>RO'  # May be virtual
    ]

    # Calculate statistics for comparison using Polars
    comparison_data = []
    for comparison_col in maxbex_df.columns:
        if '>' in comparison_col:
            comparison_mean_capacity = maxbex_df[comparison_col].mean()
            border_type = 'Physical' if comparison_col in physical_borders else 'Virtual'
            comparison_data.append({
                'border': comparison_col,
                'type': border_type,
                'avg_capacity': comparison_mean_capacity
            })

    comparison_df = pl.DataFrame(comparison_data)

    # Box plot comparison
    box_plot = alt.Chart(comparison_df.to_pandas()).mark_boxplot(extent='min-max').encode(
        x=alt.X('type:N', title='Border Type'),
        y=alt.Y('avg_capacity:Q', title='Average Capacity (MW)'),
        color=alt.Color('type:N', scale=alt.Scale(domain=['Physical', 'Virtual'],
                                                    range=['#1f77b4', '#ff7f0e']))
    ).properties(
        title='MaxBEX Capacity Distribution: Physical vs Virtual Borders',
        width=400,
        height=400
    )

    # Summary statistics
    summary = comparison_df.group_by('type').agg([
        pl.col('avg_capacity').mean().alias('mean_capacity'),
        pl.col('avg_capacity').median().alias('median_capacity'),
        pl.col('avg_capacity').min().alias('min_capacity'),
        pl.col('avg_capacity').max().alias('max_capacity'),
        pl.len().alias('count')
    ])

    box_plot
    return comparison_df, summary


@app.cell
def _(mo, summary):
    return mo.vstack([
        mo.md("**Border Type Statistics:**"),
        mo.ui.table(summary.to_pandas())
    ])


@app.cell
def _(mo):
    mo.md("""## 2. CNECs/PTDFs DataFrame""")
    return


@app.cell
def _(cnecs_df, mo):
    # Display CNECs dataframe
    mo.ui.table(cnecs_df.head(20).to_pandas())
    return


@app.cell
def _(alt, cnecs_df, pl):
    # Top Binding CNECs by Shadow Price
    top_cnecs = (
        cnecs_df
        .group_by('cnec_name')
        .agg([
            pl.col('shadow_price').mean().alias('avg_shadow_price'),
            pl.col('ram').mean().alias('avg_ram'),
            pl.len().alias('count')
        ])
        .sort('avg_shadow_price', descending=True)
        .head(15)
    )

    chart_cnecs = alt.Chart(top_cnecs.to_pandas()).mark_bar().encode(
        x=alt.X('avg_shadow_price:Q', title='Average Shadow Price (€/MWh)'),
        y=alt.Y('cnec_name:N', sort='-x', title='CNEC'),
        tooltip=['cnec_name:N', 'avg_shadow_price:Q', 'avg_ram:Q', 'count:Q'],
        color=alt.Color('avg_shadow_price:Q', scale=alt.Scale(scheme='reds'))
    ).properties(
        title='Top 15 CNECs by Average Shadow Price',
        width=800,
        height=400
    )

    chart_cnecs
    return


@app.cell
def _(alt, cnecs_df):
    # Shadow Price Distribution
    chart_shadow = alt.Chart(cnecs_df.to_pandas()).mark_bar().encode(
        x=alt.X('shadow_price:Q', bin=alt.Bin(maxbins=50), title='Shadow Price (€/MWh)'),
        y=alt.Y('count()', title='Count'),
        tooltip=['shadow_price:Q', 'count()']
    ).properties(
        title='Shadow Price Distribution',
        width=800,
        height=300
    )

    chart_shadow
    return


@app.cell
def _(alt, cnecs_df, pl):
    # TSO Distribution
    tso_counts = (
        cnecs_df
        .group_by('tso')
        .agg(pl.len().alias('count'))
        .sort('count', descending=True)
    )

    chart_tso = alt.Chart(tso_counts.to_pandas()).mark_bar().encode(
        x=alt.X('count:Q', title='Number of Active Constraints'),
        y=alt.Y('tso:N', sort='-x', title='TSO'),
        tooltip=['tso:N', 'count:Q'],
        color=alt.value('steelblue')
    ).properties(
        title='Active Constraints by TSO',
        width=800,
        height=400
    )

    chart_tso
    return


@app.cell
def _(mo):
    mo.md("""### CNEC Network Impact Analysis""")
    return


@app.cell
def _(alt, cnecs_df, pl):
    # Analyze which zones are most affected by top CNECs
    # Select top 10 most binding CNECs
    top_10_cnecs = (
        cnecs_df
        .group_by('cnec_name')
        .agg(pl.col('shadow_price').mean().alias('avg_shadow_price'))
        .sort('avg_shadow_price', descending=True)
        .head(10)
        .get_column('cnec_name')
        .to_list()
    )

    # Get PTDF columns for impact analysis
    impact_ptdf_cols = [c for c in cnecs_df.columns if c.startswith('ptdf_')]

    # Calculate average absolute PTDF impact for top CNECs
    impact_data = []
    for cnec in top_10_cnecs:
        cnec_data = cnecs_df.filter(pl.col('cnec_name') == cnec)
        for ptdf_col in impact_ptdf_cols:
            zone = ptdf_col.replace('ptdf_', '')
            avg_abs_ptdf = cnec_data[ptdf_col].abs().mean()
            impact_data.append({
                'cnec_name': cnec[:40],  # Truncate long names
                'zone': zone,
                'avg_abs_ptdf': avg_abs_ptdf
            })

    impact_df = pl.DataFrame(impact_data)

    # Create heatmap showing CNEC-zone impact
    impact_heatmap = alt.Chart(impact_df.to_pandas()).mark_rect().encode(
        x=alt.X('zone:N', title='Zone'),
        y=alt.Y('cnec_name:N', title='CNEC (Top 10 by Shadow Price)'),
        color=alt.Color('avg_abs_ptdf:Q',
                       scale=alt.Scale(scheme='reds'),
                       title='Avg |PTDF|'),
        tooltip=['cnec_name:N', 'zone:N', alt.Tooltip('avg_abs_ptdf:Q', format='.4f')]
    ).properties(
        title='Network Impact: Which Zones Affect Each CNEC?',
        width=600,
        height=400
    )

    impact_heatmap
    return


@app.cell
def _(cnecs_df, mo):
    mo.md("## 3. PTDF Analysis")

    # Extract PTDF columns
    ptdf_cols = [c for c in cnecs_df.columns if c.startswith('ptdf_')]

    mo.md(f"**PTDF Zones**: {len(ptdf_cols)} zones - {', '.join([c.replace('ptdf_', '') for c in ptdf_cols])}")
    return (ptdf_cols,)


@app.cell
def _(cnecs_df, ptdf_cols):
    # PTDF Statistics
    ptdf_stats = cnecs_df.select(ptdf_cols).describe()
    ptdf_stats
    return


@app.cell
def _(mo):
    mo.md(
        """
    ## Data Quality Validation

    Checking for completeness, missing values, and data integrity:
    """
    )
    return


@app.cell
def _(cnecs_df, maxbex_df, mo, pl):
    # Calculate data completeness
    def check_completeness(df, name):
        total_cells = df.shape[0] * df.shape[1]
        null_cells = df.null_count().sum_horizontal()[0]
        completeness = (1 - null_cells / total_cells) * 100

        return {
            'Dataset': name,
            'Total Cells': total_cells,
            'Null Cells': null_cells,
            'Completeness %': f"{completeness:.2f}%"
        }

    completeness_report = [
        check_completeness(maxbex_df, 'MaxBEX (TARGET)'),
        check_completeness(cnecs_df, 'CNECs/PTDFs')
    ]

    mo.ui.table(pl.DataFrame(completeness_report).to_pandas())
    return (completeness_report,)


@app.cell
def _(completeness_report, mo):
    # Validation check
    all_complete = all(
        float(r['Completeness %'].rstrip('%')) >= 95.0
        for r in completeness_report
    )

    if all_complete:
        mo.md("✅ **All datasets meet >95% completeness threshold**")
    else:
        mo.md("⚠️ **Some datasets below 95% completeness - investigate missing data**")
    return


@app.cell
def _(mo):
    mo.md(
        """
    ## Next Steps

    After data exploration completion:

    1. **Day 2**: Feature engineering (75-85 features)
    2. **Day 3**: Zero-shot inference with Chronos 2
    3. **Day 4**: Performance evaluation and analysis
    4. **Day 5**: Documentation and handover

    ---

    **Note**: This notebook will be exported to JupyterLab format (.ipynb) for analyst handover.
    """
    )
    return


if __name__ == "__main__":
    app.run()