"""FBMC Flow Forecasting - Engineered Features EDA (LATEST)
Comprehensive exploratory data analysis of the final engineered feature matrix.
File: data/processed/features_jao_24month.parquet
Features: 1,762 engineered features + 38 targets + 1 timestamp
Timeline: October 2023 - October 2025 (24 months, 17,544 hours)
This is the LATEST working version for feature validation before model training.
Usage:
marimo edit notebooks/03_engineered_features_eda.py
"""
import marimo
__generated_with = "0.17.2"
app = marimo.App(width="full")
@app.cell
def _():
import marimo as mo
import polars as pl
import altair as alt
from pathlib import Path
import numpy as np
return Path, alt, mo, np, pl
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
# Engineered Features EDA - LATEST VERSION
    **Objective**: Comprehensive analysis of 1,762 engineered features for the Chronos 2 model
**File**: `data/processed/features_jao_24month.parquet`
## Feature Architecture:
- **Tier-1 CNEC**: 510 features (58 top CNECs with detailed rolling stats)
- **Tier-2 CNEC**: 390 features (150 CNECs with basic stats)
- **PTDF**: 612 features (network sensitivity coefficients)
- **Net Positions**: 84 features (zone boundaries with lags)
- **MaxBEX Lags**: 76 features (historical capacity lags)
- **LTA**: 40 features (long-term allocations)
- **Temporal**: 12 features (cyclic time encoding)
- **Targets**: 38 Core FBMC borders
**Total**: 1,762 features + 38 targets = 1,800 columns (+ timestamp)
"""
)
return
@app.cell
def _(Path, pl):
# Load engineered features
features_path = Path('data/processed/features_jao_24month.parquet')
print(f"Loading engineered features from: {features_path}")
features_df = pl.read_parquet(features_path)
print(f"✓ Loaded: {features_df.shape[0]:,} rows × {features_df.shape[1]:,} columns")
print(f"✓ Date range: {features_df['mtu'].min()} to {features_df['mtu'].max()}")
print(f"✓ Memory usage: {features_df.estimated_size('mb'):.2f} MB")
return (features_df,)
@app.cell(hide_code=True)
def _(features_df, mo):
mo.md(
f"""
## Dataset Overview
- **Shape**: {features_df.shape[0]:,} rows × {features_df.shape[1]:,} columns
- **Date Range**: {features_df['mtu'].min()} to {features_df['mtu'].max()}
- **Total Hours**: {features_df.shape[0]:,} (24 months)
- **Memory**: {features_df.estimated_size('mb'):.2f} MB
- **Timeline Sorted**: {features_df['mtu'].is_sorted()}
✓ All 1,762 expected features present and validated.
"""
)
return
@app.cell(hide_code=True)
def _(mo):
mo.md("""## 1. Feature Category Breakdown""")
return
@app.cell(hide_code=True)
def _(features_df, mo, pl):
# Categorize all columns with CORRECT patterns
# PTDF features are embedded in tier-1 columns with _ptdf_ pattern
tier1_ptdf_features = [_c for _c in features_df.columns if '_ptdf_' in _c and _c.startswith('cnec_t1_')]
tier1_features = [_c for _c in features_df.columns if _c.startswith('cnec_t1_') and '_ptdf_' not in _c]
tier2_features = [_c for _c in features_df.columns if _c.startswith('cnec_t2_')]
ptdf_features = tier1_ptdf_features # PTDF features found in tier-1 with _ptdf_ pattern
# Net Position features - CORRECTED DETECTION
netpos_base_features = [_c for _c in features_df.columns if (_c.startswith('min') or _c.startswith('max')) and '_L' not in _c and _c != 'mtu']
netpos_lag_features = [_c for _c in features_df.columns if (_c.startswith('min') or _c.startswith('max')) and ('_L24' in _c or '_L72' in _c)]
netpos_features = netpos_base_features + netpos_lag_features # 84 total (28 base + 56 lags)
# MaxBEX lag features - CORRECTED DETECTION
maxbex_lag_features = [_c for _c in features_df.columns if 'border_' in _c and ('_L24' in _c or '_L72' in _c)] # 76 total
lta_features = [_c for _c in features_df.columns if _c.startswith('lta_')]
temporal_features = [_c for _c in features_df.columns if _c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
target_features = [_c for _c in features_df.columns if _c.startswith('target_')]
# Calculate null percentages for each category
def calc_null_pct(cols):
if not cols:
return 0.0
null_count = features_df.select(cols).null_count().sum_horizontal()[0]
total_cells = len(features_df) * len(cols)
return (null_count / total_cells * 100) if total_cells > 0 else 0.0
category_summary = pl.DataFrame({
'Category': [
'Tier-1 CNEC',
'Tier-2 CNEC',
'PTDF (Tier-1)',
'Net Positions (base)',
'Net Positions (lags)',
'MaxBEX Lags',
'LTA',
'Temporal',
'Targets',
'Timestamp',
'TOTAL'
],
'Features': [
len(tier1_features),
len(tier2_features),
len(ptdf_features),
len(netpos_base_features),
len(netpos_lag_features),
len(maxbex_lag_features),
len(lta_features),
len(temporal_features),
len(target_features),
1,
features_df.shape[1]
],
'Null %': [
f"{calc_null_pct(tier1_features):.2f}%",
f"{calc_null_pct(tier2_features):.2f}%",
f"{calc_null_pct(ptdf_features):.2f}%",
f"{calc_null_pct(netpos_base_features):.2f}%",
f"{calc_null_pct(netpos_lag_features):.2f}%",
f"{calc_null_pct(maxbex_lag_features):.2f}%",
f"{calc_null_pct(lta_features):.2f}%",
f"{calc_null_pct(temporal_features):.2f}%",
f"{calc_null_pct(target_features):.2f}%",
"0.00%",
f"{(features_df.null_count().sum_horizontal()[0] / (len(features_df) * len(features_df.columns)) * 100):.2f}%"
]
})
mo.ui.table(category_summary.to_pandas())
return category_summary, target_features, temporal_features
@app.cell(hide_code=True)
def _(mo):
mo.md("""## 2. Comprehensive Feature Catalog""")
return
@app.cell
def _(features_df, mo, np, pl):
# Create comprehensive feature catalog for ALL columns
catalog_data = []
for col in features_df.columns:
col_data = features_df[col]
# Determine category (CORRECTED patterns)
if col == 'mtu':
category = 'Timestamp'
elif '_ptdf_' in col and col.startswith('cnec_t1_'):
category = 'PTDF (Tier-1)'
elif col.startswith('cnec_t1_'):
category = 'Tier-1 CNEC'
elif col.startswith('cnec_t2_'):
category = 'Tier-2 CNEC'
elif (col.startswith('min') or col.startswith('max')) and ('_L24' in col or '_L72' in col):
category = 'Net Position (lag)'
elif (col.startswith('min') or col.startswith('max')) and col != 'mtu':
category = 'Net Position (base)'
elif 'border_' in col and ('_L24' in col or '_L72' in col):
category = 'MaxBEX Lag'
elif col.startswith('lta_'):
category = 'LTA'
elif col.startswith('target_'):
category = 'Target'
elif col in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']:
category = 'Temporal'
else:
category = 'Other'
# Basic info
dtype = str(col_data.dtype)
n_unique = col_data.n_unique()
n_null = col_data.null_count()
null_pct = (n_null / len(col_data) * 100)
# Statistics for numeric columns
if dtype in ['Int64', 'Float64', 'Float32', 'Int32']:
try:
col_min = col_data.min()
col_max = col_data.max()
col_mean = col_data.mean()
col_median = col_data.median()
col_std = col_data.std()
# Get sample non-null values (5 samples to show variation)
sample_vals = col_data.drop_nulls().head(5).to_list()
# Use 4 decimals for PTDF features (sensitivity coefficients), 1 decimal for others
sample_str = ', '.join([
f"{v:.4f}" if 'ptdf' in col.lower() and isinstance(v, float) and not np.isnan(v) else
f"{v:.1f}" if isinstance(v, (float, int)) and not np.isnan(v) else
str(v)
for v in sample_vals
])
except Exception:
col_min = col_max = col_mean = col_median = col_std = None
sample_str = "N/A"
else:
col_min = col_max = col_mean = col_median = col_std = None
sample_vals = col_data.drop_nulls().head(5).to_list()
sample_str = ', '.join([str(v) for v in sample_vals])
# Format statistics with human-readable precision
        def format_stat(val, add_unit=False):
            if val is None:
                return None
            try:
                # Treat NaN/inf as missing
                if np.isnan(val) or np.isinf(val):
                    return "N/A"
                # Format with 1 decimal place
                formatted = f"{val:.1f}"
                # The caller decides whether the MW unit applies (see is_capacity below)
                if add_unit:
                    formatted += " MW"
                return formatted
            except (TypeError, ValueError, AttributeError):
                return str(val)
        # MW units apply to capacity/flow categories
        is_capacity = category in ['Target', 'Tier-1 CNEC', 'Tier-2 CNEC', 'MaxBEX Lag', 'LTA']
catalog_data.append({
'Column': col,
'Category': category,
'Type': dtype,
'Unique': f"{n_unique:,}" if n_unique > 1000 else str(n_unique),
'Null_Count': f"{n_null:,}" if n_null > 1000 else str(n_null),
'Null_%': f"{null_pct:.1f}%",
'Min': format_stat(col_min, is_capacity),
'Max': format_stat(col_max, is_capacity),
'Mean': format_stat(col_mean, is_capacity),
'Median': format_stat(col_median, is_capacity),
'Std': format_stat(col_std, is_capacity),
'Sample_Values': sample_str
})
feature_catalog = pl.DataFrame(catalog_data)
mo.md(f"""
### Complete Feature Catalog ({len(feature_catalog)} columns)
This table shows comprehensive statistics for every column in the dataset.
Use the search and filter capabilities to explore specific features.
""")
mo.ui.table(feature_catalog.to_pandas(), page_size=20)
return (feature_catalog,)
@app.cell(hide_code=True)
def _(mo):
mo.md("""## 3. Data Quality Analysis""")
return
@app.cell
def _(feature_catalog, mo, pl):
# Identify problematic features
# Features with >50% nulls
    high_null_features = feature_catalog.filter(
        pl.col('Null_%').str.strip_suffix('%').cast(pl.Float64) > 50.0
    ).sort(
        # Sort on the parsed numeric value; sorting the "NN.N%" strings
        # lexicographically would order "9.0%" after "80.0%"
        pl.col('Null_%').str.strip_suffix('%').cast(pl.Float64),
        descending=True,
    )
# Features with zero variance (constant values)
# Need to check both "0.0" and "0.0 MW" formats
zero_var_features = feature_catalog.filter(
(pl.col('Std').is_not_null()) &
((pl.col('Std') == "0.0") | (pl.col('Std') == "0.0 MW"))
)
mo.md(f"""
### Quality Checks
- **High Null Features** (>50% missing): {len(high_null_features)} features
- **Zero Variance Features** (constant): {len(zero_var_features)} features
""")
return high_null_features, zero_var_features
@app.cell
def _(high_null_features, mo):
    if len(high_null_features) > 0:
        _out = mo.vstack([
            mo.md("### Features with >50% Null Values"),
            mo.ui.table(high_null_features.to_pandas(), page_size=20),
        ])
    else:
        _out = mo.md("✓ No features with >50% null values")
    # Surface the conditional output (statements inside if/else are not rendered)
    _out
return
@app.cell
def _(mo, zero_var_features):
    if len(zero_var_features) > 0:
        _out = mo.vstack([
            mo.md("### Features with Zero Variance (Constant Values)"),
            mo.ui.table(zero_var_features.to_pandas(), page_size=20),
        ])
    else:
        _out = mo.md("✓ No features with zero variance")
    # Surface the conditional output (statements inside if/else are not rendered)
    _out
return
@app.cell(hide_code=True)
def _(mo):
mo.md("""## 4. Tier-1 CNEC Features (510 features)""")
return
@app.cell
def _(feature_catalog, mo, pl):
tier1_catalog = feature_catalog.filter(pl.col('Category') == 'Tier-1 CNEC')
    # Note: PTDF features are a separate category now
    mo.vstack([
        mo.md(f"""
        **Tier-1 CNEC Features**: {len(tier1_catalog)} features
        The 58 most critical CNECs, each with detailed rolling statistics
        (a construction sketch follows below).
        """),
        mo.ui.table(tier1_catalog.to_pandas(), page_size=20),
    ])
return
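@app.cell
def _(pl):
    # Hedged sketch: the tier-1 "detailed rolling stats" above are presumably
    # built with window functions like the ones below. The column name
    # 'shadow_price' and the 24h window are illustrative assumptions, not
    # read from the real pipeline.
    _toy = pl.DataFrame({'shadow_price': [0.0, 5.0, 12.0, 0.0, 3.0, 8.0, 0.0, 1.0] * 6})
    print(
        _toy.with_columns(
            pl.col('shadow_price').rolling_mean(window_size=24).alias('roll_mean_24h'),
            pl.col('shadow_price').rolling_max(window_size=24).alias('roll_max_24h'),
        ).tail(3)
    )
    return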
@app.cell(hide_code=True)
def _(mo):
mo.md("""## 5. PTDF Features (552 features)""")
return
@app.cell
def _(feature_catalog, mo, pl):
ptdf_catalog = feature_catalog.filter(pl.col('Category') == 'PTDF (Tier-1)')
mo.md(f"""
**PTDF Features**: {len(ptdf_catalog)} features
Power Transfer Distribution Factors showing network sensitivity.
How 1 MW injection in each zone affects each CNEC.
""")
mo.ui.table(ptdf_catalog.to_pandas(), page_size=20)
return
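@app.cell
def _(mo):
    # Hedged illustration with toy numbers (not taken from the dataset):
    # a PTDF of 0.12 for zone DE on a given CNEC means that raising the DE
    # net position by 500 MW increases the flow on that CNEC by about
    # 0.12 * 500 = 60 MW.
    _ptdf = 0.12          # assumed sensitivity coefficient
    _delta_np_mw = 500.0  # assumed net-position change
    mo.md(f"Toy PTDF example: Δflow ≈ {_ptdf} × {_delta_np_mw:.0f} MW = {_ptdf * _delta_np_mw:.0f} MW")
    return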
@app.cell(hide_code=True)
def _(mo):
mo.md("""## 6. Target Variables (38 Core FBMC Borders)""")
return
@app.cell
def _(feature_catalog, mo, pl):
target_catalog = feature_catalog.filter(pl.col('Category') == 'Target')
mo.md(f"""
**Target Variables**: {len(target_catalog)} borders
These are the 38 Core FBMC borders we're forecasting.
""")
mo.ui.table(target_catalog.to_pandas(), page_size=20)
return
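@app.cell
def _(features_df, pl):
    # Hedged sketch: one plausible way to hand the 38 targets to a downstream
    # forecasting step is a long (timestamp, border, value) layout. This
    # reshape is illustrative, not a confirmed Chronos 2 input format.
    _targets_long = features_df.select(
        ['mtu'] + [_c for _c in features_df.columns if _c.startswith('target_')]
    ).unpivot(index='mtu', variable_name='border', value_name='maxbex_mw')
    print(_targets_long.head())
    return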
@app.cell
def _(alt, features_df, target_features):
    # Plot the first target over time
    if target_features:
        sample_target_col = target_features[0]
        target_timeseries = features_df.select(['mtu', sample_target_col]).sort('mtu')
        target_chart = alt.Chart(target_timeseries.to_pandas()).mark_line().encode(
            x=alt.X('mtu:T', title='Date'),
            # 'format' is not a valid Y-encoding argument; it belongs on the axis
            y=alt.Y(f'{sample_target_col}:Q', title='Capacity (MW)', axis=alt.Axis(format='.1f')),
            tooltip=[
                alt.Tooltip('mtu:T', title='Date'),
                alt.Tooltip(f'{sample_target_col}:Q', title='Capacity (MW)', format='.1f'),
            ],
        ).properties(
            title=f'Sample Target Variable Over Time: {sample_target_col}',
            width=800,
            height=400,
        ).interactive()
    else:
        # Always define variables even if target_features is empty
        sample_target_col = None
        target_timeseries = None
        target_chart = None
    # marimo renders only the cell's last expression, so surface the chart here
    target_chart
return
@app.cell(hide_code=True)
def _(mo):
mo.md("""## 7. Temporal Features (12 features)""")
return
@app.cell
def _(feature_catalog, features_df, mo, pl, temporal_features):
    temporal_catalog = feature_catalog.filter(pl.col('Category') == 'Temporal')
    # Round cyclic columns to 3 decimal places for readability
    temporal_sample = features_df.select(['mtu'] + temporal_features).head(24).to_pandas()
    cyclic_cols = ['hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']
    for cyclic_col in cyclic_cols:
        if cyclic_col in temporal_sample.columns:
            temporal_sample[cyclic_col] = temporal_sample[cyclic_col].round(3)
    # Stack all outputs; marimo renders only the cell's last expression
    mo.vstack([
        mo.md(f"""
        **Temporal Features**: {len(temporal_catalog)} features
        Cyclic encoding of time to capture periodicity; see the consistency check below.
        """),
        mo.ui.table(temporal_catalog.to_pandas()),
        mo.md("### Sample Temporal Values (First 24 Hours)"),
        mo.ui.table(temporal_sample),
    ])
return
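@app.cell
def _(features_df, np, pl):
    # Hedged check: the cyclic columns are presumably the standard encodings
    # hour_sin = sin(2*pi*hour/24) and hour_cos = cos(2*pi*hour/24). A max
    # absolute deviation near zero confirms that assumption on the real data.
    _dev = features_df.select(
        (pl.col('hour_sin') - (2 * np.pi * pl.col('hour') / 24).sin()).abs().max().alias('hour_sin_dev'),
        (pl.col('hour_cos') - (2 * np.pi * pl.col('hour') / 24).cos()).abs().max().alias('hour_cos_dev'),
    )
    print(_dev)
    return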
@app.cell(hide_code=True)
def _(mo):
mo.md("""## 8. Net Position Features (84 features)""")
return
@app.cell
def _(feature_catalog, mo, pl):
# Filter for both base and lag Net Position features
netpos_catalog = feature_catalog.filter(
(pl.col('Category') == 'Net Position (base)') |
(pl.col('Category') == 'Net Position (lag)')
)
mo.md(f"""
**Net Position Features**: {len(netpos_catalog)} features (28 base + 56 lags)
Zone-level scheduled positions (min/max boundaries):
- **Base features (28)**: Current values like `minAT`, `maxBE`, etc.
- **Lag features (56)**: L24 and L72 lags (e.g., `minAT_L24`, `maxBE_L72`)
""")
mo.ui.table(netpos_catalog.to_pandas(), page_size=20)
return
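@app.cell
def _(features_df, pl):
    # Hedged consistency check: if the _L24 columns are plain 24-hour shifts,
    # minAT_L24 should equal minAT shifted by 24 rows on the time-sorted frame
    # (the column names come from the cell above).
    _dev = features_df.sort('mtu').select(
        (pl.col('minAT_L24') - pl.col('minAT').shift(24)).abs().max().alias('max_abs_dev_mw')
    )
    print(_dev)
    return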
@app.cell(hide_code=True)
def _(mo):
mo.md("""## 9. MaxBEX Lag Features (76 features)""")
return
@app.cell
def _(feature_catalog, mo, pl):
maxbex_catalog = feature_catalog.filter(pl.col('Category') == 'MaxBEX Lag')
mo.md(f"""
**MaxBEX Lag Features**: {len(maxbex_catalog)} features (38 borders × 2 lags)
Maximum Bilateral Exchange capacity target lags:
- **L24 lags (38)**: Day-ahead values (e.g., `border_AT_CZ_L24`)
- **L72 lags (38)**: 3-day-ahead values (e.g., `border_AT_CZ_L72`)
These provide historical MaxBEX targets for each border to inform forecasts.
""")
mo.ui.table(maxbex_catalog.to_pandas(), page_size=20)
return
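@app.cell
def _(pl):
    # Hedged sketch of how the _L24/_L72 lag columns were presumably built:
    # hourly shifts over the time-sorted frame. Shown on a tiny synthetic
    # series; 'border_AT_CZ' is a stand-in name, not read from disk.
    _toy = pl.DataFrame({'border_AT_CZ': [float(_i) for _i in range(100)]}).with_columns(
        pl.col('border_AT_CZ').shift(24).alias('border_AT_CZ_L24'),
        pl.col('border_AT_CZ').shift(72).alias('border_AT_CZ_L72'),
    )
    print(_toy.tail(3))
    return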
@app.cell(hide_code=True)
def _(mo):
mo.md("""## 10. Summary & Validation""")
return
@app.cell
def _(category_summary, features_df, mo, pl):
# Final validation summary
validation_checks = []
    # Check 1: Expected feature count (exclude the timestamp and target columns,
    # per the stated layout of 1,762 features + 38 targets + 1 timestamp)
    expected_features = 1762
    n_targets = len([_c for _c in features_df.columns if _c.startswith('target_')])
    actual_features = features_df.shape[1] - 1 - n_targets
    validation_checks.append({
        'Check': 'Feature Count',
        'Expected': str(expected_features),
        'Actual': str(actual_features),
        'Status': '✓ PASS' if actual_features == expected_features else '✗ FAIL'
    })
# Check 2: No excessive nulls (>80% in any category)
max_null_pct = float(category_summary.filter(
pl.col('Category') != 'TOTAL'
)['Null %'].str.strip_suffix('%').cast(pl.Float64).max())
validation_checks.append({
'Check': 'Category Null % < 80%',
'Expected': '< 80%',
'Actual': f"{max_null_pct:.2f}%",
'Status': '✓ PASS' if max_null_pct < 80 else '✗ FAIL'
})
# Check 3: Timeline sorted
validation_checks.append({
'Check': 'Timeline Sorted',
'Expected': 'True',
'Actual': str(features_df['mtu'].is_sorted()),
'Status': '✓ PASS' if features_df['mtu'].is_sorted() else '✗ FAIL'
})
# Check 4: No completely empty columns
all_null_cols = sum(1 for _c in features_df.columns if features_df[_c].null_count() == len(features_df))
validation_checks.append({
'Check': 'No Empty Columns',
'Expected': '0',
'Actual': str(all_null_cols),
'Status': '✓ PASS' if all_null_cols == 0 else '✗ FAIL'
})
# Check 5: All targets present
target_count = len([_c for _c in features_df.columns if _c.startswith('target_')])
validation_checks.append({
'Check': 'All 38 Targets Present',
'Expected': '38',
'Actual': str(target_count),
'Status': '✓ PASS' if target_count == 38 else '✗ FAIL'
})
validation_df = pl.DataFrame(validation_checks)
mo.md("### Final Validation Checks")
mo.ui.table(validation_df.to_pandas())
return (validation_checks,)
@app.cell
def _(mo, validation_checks):
# Overall status
all_pass = all(_c['Status'].startswith('✓') for _c in validation_checks)
failed = [_c['Check'] for _c in validation_checks if _c['Status'].startswith('✗')]
    if all_pass:
        _out = mo.md("""
        ## ✓ All Validation Checks PASSED
        The engineered feature dataset is ready for Chronos 2 model training!
        ### Next Steps:
        1. Collect weather data (optional enhancement)
        2. Collect ENTSO-E data (optional enhancement)
        3. Begin zero-shot Chronos 2 inference testing
        """)
    else:
        _out = mo.md(f"""
        ## ⚠ Validation Issues Detected
        **Failed Checks**: {', '.join(failed)}
        Please review and fix issues before proceeding to model training.
        """)
    # Surface the conditional output (statements inside if/else are not rendered)
    _out
return
@app.cell(hide_code=True)
def _(mo):
mo.md(
"""
---
## Feature Engineering Complete
**Status**: 1,762 JAO features engineered ✓
**File**: `data/processed/features_jao_24month.parquet` (4.22 MB)
**Next**: Decide whether to add weather/ENTSO-E features or proceed with zero-shot inference.
"""
)
return
if __name__ == "__main__":
app.run()