Evgueni Poloukarov and Claude committed
Commit dcc56de · 1 Parent(s): c366480

fix: correct Polish border targets - use directional flows (132 targets)


Critical bug fix for feature engineering pipeline:

PROBLEM:
- Polish border targets were effectively ALL ZEROS (e.g., 0.00000028 MW)
- Expected: thousands of MW for interconnection flows
- Root cause: targets created from MaxBEX capacity columns instead of actual flows

SOLUTION:
- Fixed engineer_jao_features.py to use directional flow columns (CZ>PL, PL>CZ, etc.)
- Each direction is now a separate target (NOT netted):
* target_border_CZ_PL = Flow from CZ to PL (CZ>PL column)
* target_border_PL_CZ = Flow from PL to CZ (PL>CZ column)
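The naming rule above can be sketched as follows (a minimal illustration assuming JAO-style directional column names like `CZ>PL`; `directional_target_name` is a hypothetical helper for this example, not a function in the repo):

```python
def directional_target_name(col: str) -> str:
    """Map a directional flow column (e.g. 'CZ>PL') to its target column name."""
    from_country, to_country = col.split('>')
    # Each direction stays a separate target (not netted)
    return f'target_border_{from_country}_{to_country}'

print(directional_target_name('CZ>PL'))  # target_border_CZ_PL
print(directional_target_name('PL>CZ'))  # target_border_PL_CZ
```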

RESULTS:
- Before: 38 MaxBEX targets (all Polish borders = 0)
- After: 132 directional targets (all Polish borders valid)
- target_border_CZ_PL: Mean=3,482 MW, Range=[144, 5,699] MW
- target_border_PL_CZ: Mean=2,698 MW, Range=[0, 4,631] MW

VALIDATION:
- Added forecast_validation_oct2024.py Marimo notebook
- Interactive visualization of forecast vs actuals for all borders
- Data leakage detection (fails if MAE=0)
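The leakage check relies on a simple invariant: a genuine out-of-sample forecast never matches the actuals exactly. A minimal sketch of the idea (`check_no_leakage` is a hypothetical helper on plain lists; the notebook's actual check operates on Polars columns):

```python
def check_no_leakage(forecasts, actuals):
    """Raise if forecasts match actuals exactly (MAE == 0), which signals leakage."""
    mae = sum(abs(f - a) for f, a in zip(forecasts, actuals)) / len(forecasts)
    if mae == 0:
        raise ValueError('DATA LEAKAGE DETECTED: forecasts perfectly match actuals (MAE=0)')
    return mae

print(check_no_leakage([3500.0, 3400.0], [3482.0, 3390.0]))  # 14.0
```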

DEPLOYMENT:
- Regenerated unified features: 17,544 rows x 2,647 columns
- Uploaded to HuggingFace: evgueni-p/fbmc-features-24month
- HF Space will automatically use corrected dataset

Co-Authored-By: Claude <[email protected]>

notebooks/forecast_validation_oct2024.py ADDED
@@ -0,0 +1,307 @@
+import marimo
+
+__generated_with = "0.9.30"
+app = marimo.App(width="medium")
+
+
+@app.cell
+def imports_and_setup():
+    """Import libraries and set up paths."""
+    import marimo as mo
+    import polars as pl
+    import altair as alt
+    from pathlib import Path
+    from datetime import datetime
+
+    # Set up absolute paths
+    project_root = Path(__file__).parent.parent
+
+    return mo, pl, alt, Path, datetime, project_root
+
+
+@app.cell
+def load_september_2025_data(pl, datetime, project_root):
+    """Load September 2025 forecast results and actuals."""
+
+    # Load actuals from HuggingFace dataset (ground truth)
+    print('[INFO] Loading actuals from HuggingFace dataset...')
+    from datasets import load_dataset
+    import os
+
+    dataset = load_dataset('evgueni-p/fbmc-features-24month', split='train', token=os.environ.get('HF_TOKEN'))
+    df_actuals_full = pl.from_arrow(dataset.data.table)
+    print(f'[INFO] HF dataset loaded: {df_actuals_full.shape}')
+
+    # Load forecast results
+    forecast_path = project_root / 'results' / 'september_2025_forecast_504h.parquet'
+
+    if not forecast_path.exists():
+        raise FileNotFoundError(f'Forecast file not found: {forecast_path}. Run September 2025 forecast first.')
+
+    df_forecast_full = pl.read_parquet(forecast_path)
+    print(f'[INFO] Forecast loaded: {df_forecast_full.shape}')
+    print(f'[INFO] Forecast dates: {df_forecast_full["timestamp"].min()} to {df_forecast_full["timestamp"].max()}')
+
+    # Filter actuals to the validation window (Aug 18 - Sept 15, 2025)
+    start_date = datetime(2025, 8, 18)  # 2 weeks before forecast
+    end_date = datetime(2025, 9, 16)    # Through end of forecast period
+
+    df_actuals_filtered = df_actuals_full.filter(
+        (pl.col('timestamp') >= start_date) &
+        (pl.col('timestamp') < end_date)
+    )
+
+    print(f'[INFO] Actuals filtered: {df_actuals_filtered.shape[0]} hours (Aug 18 - Sept 15, 2025)')
+
+    return df_actuals_full, df_actuals_filtered, df_forecast_full, start_date, end_date
+
+
+@app.cell
+def prepare_unified_dataframe(pl, datetime, df_actuals_filtered, df_forecast_full):
+    """Prepare unified dataframe with forecast and actual pairs for all borders."""
+
+    # Extract border names from forecast columns
+    forecast_cols_list = [col for col in df_forecast_full.columns if col.endswith('_median')]
+    border_names_list = [col.replace('_median', '') for col in forecast_cols_list]
+
+    print(f'[INFO] Processing {len(border_names_list)} borders...')
+
+    # Start with timestamp from actuals
+    df_unified_data = df_actuals_filtered.select('timestamp')
+
+    # Add actual and forecast for each border
+    for border in border_names_list:
+        actual_col_source = f'target_border_{border}'
+        forecast_col_source = f'{border}_median'
+
+        # Add actuals
+        if actual_col_source in df_actuals_filtered.columns:
+            df_unified_data = df_unified_data.with_columns(
+                df_actuals_filtered[actual_col_source].alias(f'actual_{border}')
+            )
+        else:
+            print(f'[WARNING] Actual column missing: {actual_col_source}')
+            df_unified_data = df_unified_data.with_columns(pl.lit(None).alias(f'actual_{border}'))
+
+        # Add forecasts (join on timestamp)
+        if forecast_col_source in df_forecast_full.columns:
+            df_forecast_subset = df_forecast_full.select(['timestamp', forecast_col_source])
+            df_unified_data = df_unified_data.join(
+                df_forecast_subset,
+                on='timestamp',
+                how='left'
+            ).rename({forecast_col_source: f'forecast_{border}'})
+        else:
+            print(f'[WARNING] Forecast column missing: {forecast_col_source}')
+            df_unified_data = df_unified_data.with_columns(pl.lit(None).alias(f'forecast_{border}'))
+
+    print(f'[INFO] Unified data prepared: {df_unified_data.shape}')
+
+    # Validate no data leakage - check that forecasts don't perfectly match actuals
+    sample_border = border_names_list[0]
+    forecast_col_check = f'forecast_{sample_border}'
+    actual_col_check = f'actual_{sample_border}'
+
+    if forecast_col_check in df_unified_data.columns and actual_col_check in df_unified_data.columns:
+        _forecast_start_check = datetime(2025, 9, 2)
+        _df_forecast_check = df_unified_data.filter(pl.col('timestamp') >= _forecast_start_check)
+
+        if len(_df_forecast_check) > 0:
+            mae_check = (_df_forecast_check[forecast_col_check] - _df_forecast_check[actual_col_check]).abs().mean()
+            if mae_check == 0:
+                raise ValueError(f'DATA LEAKAGE DETECTED: Forecasts perfectly match actuals (MAE=0) for {sample_border}!')
+
+            print('[INFO] Data leakage check passed - forecasts differ from actuals')
+
+    return df_unified_data, border_names_list
+
+
+@app.cell
+def create_border_selector(mo, border_names_list):
+    """Create interactive border selection dropdown."""
+
+    border_selector_widget = mo.ui.dropdown(
+        options={border: border for border in sorted(border_names_list)},
+        value='AT_CZ',
+        label='Select Border:'
+    )
+
+    return border_selector_widget,
+
+
+@app.cell
+def display_border_selector(mo, border_selector_widget):
+    """Display the border selector UI."""
+    mo.md(f"""
+    ## Forecast Validation: September 2025
+
+    **Select a border to view:**
+    {border_selector_widget}
+
+    Chart shows:
+    - **2 weeks historical** (Aug 18-31, 2025): Actual flows only
+    - **2 weeks forecast** (Sept 2-15, 2025): Forecast vs Actual comparison
+    - **Context**: 504 hours (21 days)
+    """)
+
+
+@app.cell
+def filter_data_for_selected_border(pl, datetime, df_unified_data, border_selector_widget):
+    """Filter data for the selected border."""
+
+    selected_border_name = border_selector_widget.value
+
+    # Extract columns for selected border
+    actual_col_name = f'actual_{selected_border_name}'
+    forecast_col_name = f'forecast_{selected_border_name}'
+
+    # Forecast period starts Sept 2, 2025
+    forecast_start = datetime(2025, 9, 2)
+
+    # Check if columns exist
+    if actual_col_name not in df_unified_data.columns:
+        df_selected_border = None
+        print(f'[ERROR] Actual column {actual_col_name} not found')
+    else:
+        df_selected_border = df_unified_data.select([
+            'timestamp',
+            pl.col(actual_col_name).alias('actual'),
+            pl.col(forecast_col_name).alias('forecast') if forecast_col_name in df_unified_data.columns else pl.lit(None).alias('forecast')
+        ])
+
+        # Add period marker (historical vs forecast)
+        df_selected_border = df_selected_border.with_columns(
+            pl.when(pl.col('timestamp') >= forecast_start)
+            .then(pl.lit('Forecast Period'))
+            .otherwise(pl.lit('Historical'))
+            .alias('period')
+        )
+
+    return df_selected_border, selected_border_name, forecast_start
+
+
+@app.cell
+def create_time_series_chart(alt, df_selected_border, selected_border_name, forecast_start):
+    """Create Altair time series visualization."""
+
+    if df_selected_border is None:
+        chart_time_series = alt.Chart().mark_text(text='No data available', size=20)
+    else:
+        # Convert to pandas for Altair (CLAUDE.md Rule #37)
+        df_plot = df_selected_border.to_pandas()
+
+        # Create base chart
+        base = alt.Chart(df_plot).encode(
+            x=alt.X('timestamp:T', title='Date', axis=alt.Axis(format='%b %d'))
+        )
+
+        # Actual line (blue, solid)
+        line_actual = base.mark_line(color='blue', strokeWidth=2).encode(
+            y=alt.Y('actual:Q', title='Flow (MW)', scale=alt.Scale(zero=False)),
+            tooltip=[
+                alt.Tooltip('timestamp:T', title='Time', format='%Y-%m-%d %H:%M'),
+                alt.Tooltip('actual:Q', title='Actual (MW)', format='.1f')
+            ]
+        )
+
+        # Forecast line (orange, dashed) - only for forecast period
+        df_plot_forecast = df_plot[df_plot['period'] == 'Forecast Period']
+
+        if len(df_plot_forecast) > 0 and df_plot_forecast['forecast'].notna().any():
+            line_forecast = alt.Chart(df_plot_forecast).mark_line(
+                color='orange',
+                strokeWidth=2,
+                strokeDash=[5, 5]
+            ).encode(
+                x=alt.X('timestamp:T'),
+                y=alt.Y('forecast:Q'),
+                tooltip=[
+                    alt.Tooltip('timestamp:T', title='Time', format='%Y-%m-%d %H:%M'),
+                    alt.Tooltip('forecast:Q', title='Forecast (MW)', format='.1f'),
+                    alt.Tooltip('actual:Q', title='Actual (MW)', format='.1f')
+                ]
+            )
+        else:
+            line_forecast = alt.Chart().mark_point()  # Empty chart
+
+        # Vertical line at forecast start (ISO string so inline data is JSON-serializable)
+        rule_forecast_start = alt.Chart(
+            alt.Data(values=[{'x': forecast_start.isoformat()}])
+        ).mark_rule(color='red', strokeDash=[3, 3], strokeWidth=1).encode(
+            x='x:T'
+        )
+
+        # Combine layers
+        chart_time_series = (line_actual + line_forecast + rule_forecast_start).properties(
+            width=800,
+            height=400,
+            title=f'Border: {selected_border_name} | Hourly Flows (Aug 18 - Sept 15, 2025)'
+        ).configure_axis(
+            labelFontSize=12,
+            titleFontSize=14
+        ).configure_title(
+            fontSize=16
+        )
+
+    return chart_time_series,
+
+
+@app.cell
+def calculate_summary_statistics(pl, df_selected_border, selected_border_name, forecast_start):
+    """Calculate summary statistics for the selected border."""
+
+    if df_selected_border is None:
+        stats_summary_text = 'No data available'
+    else:
+        # Filter to forecast period only
+        df_forecast_period = df_selected_border.filter(
+            pl.col('timestamp') >= forecast_start
+        )
+
+        if len(df_forecast_period) == 0 or df_forecast_period['forecast'].is_null().all():
+            stats_summary_text = 'No forecast data available for this period'
+        else:
+            # Calculate MAE
+            mae_value = (
+                (df_forecast_period['forecast'] - df_forecast_period['actual']).abs().mean()
+            )
+
+            # Forecast variation
+            forecast_values = df_forecast_period['forecast'].drop_nulls()
+            unique_count = forecast_values.n_unique()
+            std_value = forecast_values.std()
+
+            # Actual variation (for reference)
+            actual_values = df_forecast_period['actual'].drop_nulls()
+            actual_std = actual_values.std()
+
+            stats_summary_text = f"""
+### Forecast Quality Statistics
+
+**Border**: {selected_border_name}
+**Period**: September 2-15, 2025 (336 hours)
+**Context**: 504 hours (21 days)
+
+**Accuracy Metrics:**
+- **MAE**: {mae_value:.2f} MW
+- Forecast variation: {unique_count} unique values, StdDev = {std_value:.2f} MW
+- Actual variation: StdDev = {actual_std:.2f} MW
+
+**Interpretation:**
+- MAE < 50 MW: Excellent
+- MAE 50-100 MW: Good
+- MAE > 100 MW: Needs improvement
+"""
+
+    return stats_summary_text,
+
+
+@app.cell
+def display_chart_and_stats(mo, chart_time_series, stats_summary_text):
+    """Display the chart and statistics."""
+    mo.vstack([
+        chart_time_series,
+        mo.md(stats_summary_text)
+    ])
+
+
+if __name__ == "__main__":
+    app.run()
src/feature_engineering/engineer_jao_features.py CHANGED
@@ -598,11 +598,24 @@ def engineer_jao_features(
                      feat_temporal, feat_system, feat_regional, feat_pca, feat_lags]:
         all_features = all_features.join(feat_df, on='mtu', how='left')
 
-    # Add target variable (ALL MaxBEX borders - 38 Core FBMC borders)
-    maxbex_cols = [c for c in unified.columns if c.startswith('border_') and 'lta' not in c.lower()]
-    for col in maxbex_cols:  # Use ALL Core FBMC borders (38 total)
+    # Add target variables from DIRECTIONAL FLOWS (not MaxBEX capacity)
+    # JAO provides directional flows (CZ>PL, PL>CZ, etc.) as separate columns
+    # Create targets for EACH DIRECTION as separate forecast targets
+    # Example: target_border_CZ_PL = flow from CZ to PL (CZ>PL column)
+    #          target_border_PL_CZ = flow from PL to CZ (PL>CZ column)
+
+    # Find all directional flow columns
+    directional_cols = [c for c in unified.columns if '>' in c]
+
+    print(f"\n[INFO] Creating targets for {len(directional_cols)} directional flows...")
+
+    # Create one target per directional flow (e.g., CZ>PL becomes target_border_CZ_PL)
+    for col in sorted(directional_cols):
+        from_country, to_country = col.split('>')
+        # Target name: target_border_{FROM}_{TO} (preserves direction)
+        target_name = f'target_border_{from_country}_{to_country}'
         all_features = all_features.with_columns([
-            unified[col].alias(f'target_{col}')
+            unified[col].alias(target_name)
         ])
 
     # Remove duplicates if any