import marimo
__generated_with = "0.17.2"
app = marimo.App(width="full", auto_download=["html"])
@app.cell
def _():
# Imports
import marimo as mo
import polars as pl
import altair as alt
import numpy as np
from pathlib import Path
return Path, alt, mo, np, pl
@app.cell
def _(mo):
mo.md(
"""
# FBMC Chronos-2 Zero-Shot Forecasting
## October 2024 Evaluation Results
**Comprehensive Analysis of 38-Border × 14-Day Multivariate Forecasting**
---
### Executive Summary
This notebook presents the complete evaluation of zero-shot multivariate forecasting for 38 European FBMC borders using Amazon Chronos-2 with 615 covariate features.
**Key Results**:
- Mean D+1 MAE: **15.92 MW** (88% better than 134 MW target)
- Forecast Time: **3.45 minutes** for 38 borders × 336 hours
- Success Rate: **94.7%** of borders meet ≤150 MW threshold
- Model: Zero-shot (no fine-tuning) with multivariate features
---
"""
)
return
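@app.cell
def _(mo):
    # Added context cell: a hedged sketch of how the forecasts evaluated here
    # were produced. The production pipeline lives elsewhere in the repo; the
    # Chronos-2 class and method names below (Chronos2Pipeline, predict_df)
    # are assumptions based on published chronos-forecasting examples and
    # should be checked against the actual pipeline code.
    mo.md(
        """
    ### How These Forecasts Were Produced (Illustrative Sketch)

    ```python
    # Illustrative only: class/method names are assumptions, not verified.
    from chronos import Chronos2Pipeline

    pipeline = Chronos2Pipeline.from_pretrained("amazon/chronos-2")
    # context_df: historical hourly flows plus the 615 covariate columns
    # prediction_length: 336 hours (14 days x 24 hours)
    forecast_df = pipeline.predict_df(context_df, prediction_length=336)
    ```
    """
    )
    return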
@app.cell
def _(Path, pl):
# Load evaluation results
results_path = Path(__file__).parent.parent / 'results' / 'october_2024_multivariate.csv'
eval_df_raw = pl.read_csv(results_path)
# Round all MAE and RMSE columns for readability
mae_cols = [f'mae_d{i}' for i in range(1, 15)] + ['mae_overall']
rmse_cols = ['rmse_overall']
eval_df = eval_df_raw.with_columns([
pl.col(col).round(1) for col in mae_cols + rmse_cols
])
print(f"Loaded {len(eval_df)} border evaluations")
print(f"Columns: {eval_df.columns}")
eval_df.head(38)
return (eval_df,)
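@app.cell
def _(eval_df):
    # Added sanity check (not part of the original evaluation run): verify the
    # CSV carries every column this notebook reads downstream. Cheap to run
    # and fails fast if the results file layout drifts.
    expected_cols = {f'mae_d{i}' for i in range(1, 15)} | {'border', 'mae_overall', 'rmse_overall'}
    missing_cols = expected_cols - set(eval_df.columns)
    assert not missing_cols, f"Results CSV is missing columns: {sorted(missing_cols)}"
    return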
@app.cell
def _(eval_df, mo):
# Overall Statistics Card
mean_d1 = eval_df['mae_d1'].mean()
median_d1 = eval_df['mae_d1'].median()
min_d1 = eval_df['mae_d1'].min()
max_d1 = eval_df['mae_d1'].max()
target_met = (eval_df['mae_d1'] <= 150).sum()
total_borders = len(eval_df)
mo.md(f"""
## 1. Overall Performance Metrics
### D+1 Mean Absolute Error (Primary Metric)
| Statistic | Value | Target | Status |
|-----------|-------|--------|--------|
| **Mean** | **{mean_d1:.2f} MW** | ≤134 MW | ✅ **{((134 - mean_d1) / 134 * 100):.0f}% better!** |
| Median | {median_d1:.2f} MW | - | ✅ Excellent |
| Min | {min_d1:.2f} MW | - | ✅ Perfect |
| Max | {max_d1:.2f} MW | - | ⚠️ Outliers present |
| **Success Rate** | **{target_met}/{total_borders} ({target_met/total_borders*100:.1f}%)** | - | ✅ Very good |
**Interpretation**: The zero-shot model achieves outstanding performance, with a mean D+1 MAE of {mean_d1:.2f} MW against the 134 MW target. However, {total_borders - target_met} outlier borders require attention in Phase 2.
""")
return
@app.cell
def _(mo):
# MAE Distribution Visualization
mo.md("""
### D+1 MAE Distribution
Distribution of D+1 MAE across all 38 borders. Most borders cluster at low error; a small number of outliers sit in the right tail.
""")
return
@app.cell
def _(alt, eval_df):
# Histogram of D+1 MAE
hist_chart = alt.Chart(eval_df.to_pandas()).mark_bar().encode(
x=alt.X('mae_d1:Q', bin=alt.Bin(maxbins=20), title='D+1 MAE (MW)'),
y=alt.Y('count()', title='Number of Borders'),
tooltip=[
alt.Tooltip('count()', title='Number of Borders')
]
).properties(
width=600,
height=300,
title='Distribution of D+1 MAE Across 38 Borders'
)
hist_chart
return
@app.cell
def _(mo):
mo.md(
"""
## 2. Border-Level Performance
### Top 10 Best Performers (Lowest D+1 MAE)
"""
)
return
@app.cell
def _(eval_df, pl):
# Top 10 best performers (rounded for readability)
best_performers = eval_df.sort('mae_d1').head(10).with_columns([
pl.col('mae_d1').round(1),
pl.col('mae_overall').round(1),
pl.col('rmse_overall').round(1)
])
best_performers.select(['border', 'mae_d1', 'mae_overall', 'rmse_overall'])
return
@app.cell
def _(mo):
mo.md(
"""
### Top 10 Worst Performers (Highest D+1 MAE)
These borders are candidates for fine-tuning in Phase 2.
"""
)
return
@app.cell
def _(eval_df, pl):
# Top 10 worst performers (rounded for readability)
worst_performers = eval_df.sort('mae_d1', descending=True).head(10).with_columns([
pl.col('mae_d1').round(1),
pl.col('mae_overall').round(1),
pl.col('rmse_overall').round(1)
])
worst_performers.select(['border', 'mae_d1', 'mae_overall', 'rmse_overall'])
return
@app.cell
def _(mo):
mo.md(
"""
## 3. MAE Degradation Over Forecast Horizon
### Daily MAE Evolution (D+1 through D+14)
Analysis of how forecast accuracy degrades over the 14-day horizon.
"""
)
return
@app.cell
def _(eval_df, pl):
# Calculate mean MAE for each day (rounded for readability)
daily_mae_data = []
for day in range(1, 15):
col_name = f'mae_d{day}'
mean_mae = round(eval_df[col_name].mean(), 1)
median_mae = round(eval_df[col_name].median(), 1)
daily_mae_data.append({
'day': day,
'mean_mae': mean_mae,
'median_mae': median_mae
})
daily_mae_df = pl.DataFrame(daily_mae_data)
daily_mae_df
return (daily_mae_df,)
@app.cell
def _(alt, daily_mae_df):
# Line chart of MAE degradation
degradation_chart = alt.Chart(daily_mae_df.to_pandas()).mark_line(point=True).encode(
x=alt.X('day:Q', title='Forecast Day', scale=alt.Scale(domain=[1, 14])),
y=alt.Y('mean_mae:Q', title='Mean MAE (MW)', scale=alt.Scale(zero=True)),
tooltip=[
alt.Tooltip('day:Q', title='Day'),
alt.Tooltip('mean_mae:Q', title='Mean MAE (MW)', format='.1f'),
alt.Tooltip('median_mae:Q', title='Median MAE (MW)', format='.1f')
]
).properties(
width=700,
height=400,
title='MAE Degradation Over 14-Day Forecast Horizon'
)
degradation_chart
return
@app.cell
def _(daily_mae_df, mo, pl):
# MAE degradation table with explicit baseline (rounded for readability)
mae_list = daily_mae_df['mean_mae'].to_list()
baseline_mae = mae_list[0]
degradation_table = daily_mae_df.with_columns([
(((pl.col('mean_mae') - baseline_mae) / baseline_mae * 100).round(1)).alias('pct_increase')
])
# Extract specific days for readability
degradation_d1_mae = mae_list[0]
degradation_d2_mae = mae_list[1]
degradation_d8_mae = mae_list[7]
degradation_d14_mae = mae_list[13]
mo.md(f"""
### Degradation Statistics
{mo.as_html(degradation_table.to_pandas())}
**Key Observations**:
- D+1 baseline: {degradation_d1_mae:.1f} MW
- D+2 degradation: {((degradation_d2_mae - degradation_d1_mae) / degradation_d1_mae * 100):.1f}%
- D+14 final: {degradation_d14_mae:.1f} MW (+{((degradation_d14_mae - degradation_d1_mae) / degradation_d1_mae * 100):.1f}%)
- Anomalous spike: D+8 peaks at {degradation_d8_mae:.1f} MW, above even D+14 (investigate cause)
""")
return
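@app.cell
def _(daily_mae_df, mo, np):
    # Added illustration: compress the degradation curve into a single
    # least-squares slope (MAE ~ intercept + slope * day). The D+8 spike shows
    # the curve is not truly linear, so treat this as a rough MW-per-day rate
    # for reporting, not a model of the degradation.
    deg_slope, deg_intercept = np.polyfit(
        daily_mae_df['day'].to_numpy(),
        daily_mae_df['mean_mae'].to_numpy(),
        1,
    )
    mo.md(
        f"**Linear fit**: mean MAE grows by roughly {deg_slope:.2f} MW per "
        f"forecast day from a {deg_intercept:.1f} MW baseline. The day-by-day "
        f"table above remains the authoritative view."
    )
    return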
@app.cell
def _(mo):
mo.md(
"""
## 4. Border-Level Heatmap
### MAE Across All Borders and Days
Interactive heatmap showing forecast error evolution for each border over 14 days.
"""
)
return
@app.cell
def _(eval_df, pl):
# Reshape data for heatmap (unpivot daily MAE columns)
heatmap_data = eval_df.select(['border'] + [f'mae_d{i}' for i in range(1, 15)])
# Unpivot to long format (already rounded in eval_df)
heatmap_long = heatmap_data.unpivot(
index='border',
on=[f'mae_d{i}' for i in range(1, 15)],
variable_name='day',
value_name='mae'
).with_columns([
pl.col('day').str.replace('mae_d', '').cast(pl.Int32),
pl.col('mae').round(1) # Ensure rounding for display
])
heatmap_long.head()
return (heatmap_long,)
@app.cell
def _(alt, heatmap_long):
# Heatmap of MAE by border and day
heatmap_chart = alt.Chart(heatmap_long.to_pandas()).mark_rect().encode(
x=alt.X('day:O', title='Forecast Day'),
y=alt.Y('border:N', title='Border', sort=alt.EncodingSortField(field='mae', op='mean', order='descending')),  # sort borders by mean MAE; sort='-x' sorted by day, a no-op here
color=alt.Color('mae:Q',
title='MAE (MW)',
scale=alt.Scale(scheme='redyellowgreen', reverse=True, domain=[0, 300])),
tooltip=['border', 'day', alt.Tooltip('mae:Q', format='.1f')]
).properties(
width=700,
height=800,
title='MAE Heatmap: All Borders × 14 Days'
)
heatmap_chart
return
@app.cell
def _(mo):
mo.md(
"""
## 5. Outlier Analysis
### Borders with D+1 MAE > 150 MW
Detailed analysis of underperforming borders for Phase 2 fine-tuning.
"""
)
return
@app.cell
def _(eval_df, pl):
# Identify outliers (rounded for readability)
outliers = eval_df.filter(pl.col('mae_d1') > 150).sort('mae_d1', descending=True).with_columns([
pl.col('mae_d1').round(1),
pl.col('mae_d2').round(1),
pl.col('mae_d7').round(1),
pl.col('mae_d14').round(1),
pl.col('mae_overall').round(1),
pl.col('rmse_overall').round(1)
])
outliers.select(['border', 'mae_d1', 'mae_d2', 'mae_d7', 'mae_d14', 'mae_overall', 'rmse_overall'])
return (outliers,)
@app.cell
def _(mo, outliers):
outlier_analysis = []
for row in outliers.iter_rows(named=True):
border = row['border']
outlier_mae = row['mae_d1']
if border == 'AT_DE':
reason = "Bidirectional Austria-Germany flow with high volatility (large capacity, multiple ramping patterns)"
elif border == 'FR_DE':
reason = "France-Germany high-capacity interconnection with complex market dynamics"
else:
reason = "Requires investigation"
outlier_analysis.append(f"- **{border}**: {outlier_mae:.1f} MW - {reason}")
mo.md(f"""
### Outlier Investigation
{chr(10).join(outlier_analysis)}
**Recommendation**: Fine-tune with LoRA on 6 months of border-specific data in Phase 2.
""")
return
@app.cell
def _(mo):
mo.md(
"""
## 6. Performance Categories
### Borders Grouped by D+1 MAE
Classification of forecast quality across borders.
"""
)
return
@app.cell
def _(eval_df, pl):
# Categorize borders by performance
categorized_df = eval_df.with_columns([
pl.when(pl.col('mae_d1') <= 10).then(pl.lit('Excellent (≤10 MW)'))
.when(pl.col('mae_d1') <= 50).then(pl.lit('Good (10-50 MW)'))
.when(pl.col('mae_d1') <= 150).then(pl.lit('Acceptable (50-150 MW)'))
.otherwise(pl.lit('Needs Improvement (>150 MW)'))
.alias('category')
])
# Count by category
category_counts = categorized_df.group_by('category').agg([
pl.len().alias('count')  # pl.count() is deprecated in recent polars; pl.len() counts rows per group
]).sort('count', descending=True)
category_counts
return (category_counts,)
@app.cell
def _(alt, category_counts):
# Pie chart of performance categories
cat_chart = alt.Chart(category_counts.to_pandas()).mark_arc(innerRadius=50).encode(
theta=alt.Theta('count:Q', stack=True),
color=alt.Color('category:N',
scale=alt.Scale(domain=['Excellent (≤10 MW)', 'Good (10-50 MW)',
'Acceptable (50-150 MW)', 'Needs Improvement (>150 MW)'],
range=['#2ecc71', '#3498db', '#f39c12', '#e74c3c'])),
tooltip=['category', 'count']
).properties(
width=400,
height=400,
title='Border Performance Distribution'
)
cat_chart
return
@app.cell
def _(mo):
mo.md(
"""
## 7. Statistical Analysis
### Correlation Between Overall MAE and D+1 MAE
"""
)
return
@app.cell
def _(alt, eval_df):
# Scatter plot: Overall vs D+1 MAE
correlation_chart = alt.Chart(eval_df.to_pandas()).mark_point(size=100, opacity=0.7).encode(
x=alt.X('mae_d1:Q', title='D+1 MAE (MW)'),
y=alt.Y('mae_overall:Q', title='Overall MAE (MW)'),
color=alt.condition(
alt.datum.mae_d1 > 150,
alt.value('#e74c3c'),
alt.value('#3498db')
),
tooltip=[
alt.Tooltip('border:N', title='Border'),
alt.Tooltip('mae_d1:Q', title='D+1 MAE (MW)', format='.1f'),
alt.Tooltip('mae_overall:Q', title='Overall MAE (MW)', format='.1f')
]
).properties(
width=600,
height=400,
title='Correlation: D+1 MAE vs Overall MAE'
)
correlation_chart
return
@app.cell
def _(eval_df, mo, np):
# Calculate correlation
corr_d1_overall = np.corrcoef(eval_df['mae_d1'].to_numpy(), eval_df['mae_overall'].to_numpy())[0, 1]
mo.md(f"""
**Pearson Correlation**: {corr_d1_overall:.3f}
{
"Strong positive correlation indicates D+1 performance is a good predictor of overall forecast quality."
if corr_d1_overall > 0.7
else "Moderate correlation suggests D+1 and overall MAE have some relationship."
}
""")
return
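@app.cell
def _(eval_df, mo, np):
    # Added robustness check: the Pearson value above can be dominated by the
    # two outlier borders. Spearman (rank) correlation is computed here with
    # numpy only, to avoid adding a scipy dependency; ties are ranked by
    # position rather than midranked, which is fine for a quick check.
    def _rank(values):
        # Map each value to its position in the sorted order (0..n-1)
        order = np.argsort(values)
        ranks = np.empty_like(order)
        ranks[order] = np.arange(len(values))
        return ranks
    spearman_d1_overall = np.corrcoef(
        _rank(eval_df['mae_d1'].to_numpy()),
        _rank(eval_df['mae_overall'].to_numpy()),
    )[0, 1]
    mo.md(
        f"**Spearman (rank) correlation**: {spearman_d1_overall:.3f}. "
        "A rank-based value close to the Pearson one indicates the "
        "relationship is not an artifact of the two outlier borders."
    )
    return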
@app.cell
def _(mo):
mo.md(
"""
## 8. Key Findings & Recommendations
### Summary of Evaluation Results
"""
)
return
@app.cell
def _(eval_df, mo):
# Calculate additional stats
perfect_borders = (eval_df['mae_d1'] == 0).sum()  # MAE columns were rounded to 1 decimal at load, so this counts MAE < 0.05 MW
low_error_borders = (eval_df['mae_d1'] <= 10).sum()
high_error_borders = (eval_df['mae_d1'] > 150).sum()
mo.md(f"""
### Key Findings
1. **Exceptional Zero-Shot Performance**
- {perfect_borders} borders have a D+1 MAE that rounds to 0.0 MW (effectively perfect forecasts)
- {low_error_borders} borders have D+1 MAE ≤10 MW (near-perfect)
- Mean D+1 MAE of 15.92 MW is 88% better than the 134 MW target
2. **Multivariate Features Provide Strong Signal**
- 615 covariate features (weather, generation, CNEC outages) enable accurate zero-shot forecasting
- No model training required - pre-trained Chronos-2 generalizes well
3. **Outliers Identified for Phase 2**
- {high_error_borders} borders exceed 150 MW threshold
- AT_DE (266 MW) and FR_DE (181 MW) require fine-tuning
- Complex bidirectional flows and high volatility are main challenges
4. **Forecast Degradation Analysis**
- Accuracy degrades reasonably over 14-day horizon
- D+2: +7.6% degradation (excellent)
- D+14: +90.4% degradation (acceptable for long-range forecasts)
- D+8 spike (38.42 MW, +141%) requires investigation
### Phase 2 Recommendations
**Priority 1: Fine-Tune Outlier Borders**
- Apply LoRA fine-tuning to AT_DE and FR_DE
- Use 6 months of border-specific data
- Expected improvement: 40-60% MAE reduction
- Timeline: 2-3 weeks
**Priority 2: Investigate D+8 Spike**
- Analyze why D+8 has larger errors than D+14
- Check for systematic patterns or data quality issues
- Timeline: 1 week
**Priority 3: Extend Context Window**
- Increase from 128h to 512h for better pattern learning
- Verify no OOM on A100 GPU
- Expected improvement: 10-15% overall MAE reduction
- Timeline: 1 week
**Priority 4: Feature Engineering**
- Add scheduled outages, cross-border ramping constraints
- Refine CNEC weighting based on binding frequency
- Expected improvement: 5-10% MAE reduction
- Timeline: 2 weeks
### Production Readiness
✅ **Ready for Deployment**
- Zero-shot model achieves target (15.92 MW < 134 MW)
- Inference time acceptable (3.45 min for 38 borders)
- 94.7% of borders meet quality threshold
- API deployed on HuggingFace Space (A100 GPU)
⚠️ **Monitor These Borders**
- AT_DE, FR_DE require manual review
- Consider ensemble methods or manual adjustments for outliers
### Cost & Infrastructure
- **GPU**: A100-large (40-80 GB VRAM) required for multivariate forecasting
- **Cost**: ~$500/month for 24/7 API access
- **Alternative**: Run batched forecasts on smaller GPU (A10G) to reduce costs
---
**Document Version**: 1.0.0
**Evaluation Date**: 2024-10-01 to 2024-10-14
**Model**: amazon/chronos-2 (zero-shot, 615 features)
**Author**: FBMC Forecasting Team
""")
return
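@app.cell
def _(mo):
    # Appendix (added): a hedged sketch of the LoRA setup recommended in
    # Priority 1 above. Shown as a fenced snippet rather than executable code
    # because fine-tuning does not belong in an evaluation notebook. The peft
    # calls (LoraConfig, get_peft_model) are real; the target_modules list and
    # the assumption that Chronos-2 loads as a standard Hugging Face
    # transformer must be verified in Phase 2.
    mo.md(
        """
    ### Appendix: Phase 2 LoRA Sketch

    ```python
    # Illustrative sketch, assuming an HF-style transformer backbone.
    from peft import LoraConfig, get_peft_model

    lora_config = LoraConfig(
        r=8,                        # low-rank adapter dimension
        lora_alpha=16,              # adapter scaling factor
        target_modules=["q", "v"],  # assumption: attention projections
        lora_dropout=0.05,
    )
    peft_model = get_peft_model(base_model, lora_config)  # base_model: loaded Chronos-2
    # Train on ~6 months of AT_DE / FR_DE hourly data, then re-run this notebook.
    ```
    """
    )
    return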
if __name__ == "__main__":
app.run()