| """FBMC Flow Forecasting - Engineered Features EDA (LATEST) | |
| Comprehensive exploratory data analysis of the final engineered feature matrix. | |
| File: data/processed/features_jao_24month.parquet | |
| Features: 1,762 engineered features + 38 targets + 1 timestamp | |
| Timeline: October 2023 - October 2025 (24 months, 17,544 hours) | |
| This is the LATEST working version for feature validation before model training. | |
| Usage: | |
| marimo edit notebooks/03_engineered_features_eda.py | |
| """ | |
| import marimo | |
| __generated_with = "0.17.2" | |
| app = marimo.App(width="full") | |
@app.cell
def _():
    import marimo as mo
    import polars as pl
    import altair as alt
    from pathlib import Path
    import numpy as np
    return Path, alt, mo, np, pl
@app.cell
def _(mo):
    mo.md(
        r"""
    # Engineered Features EDA - LATEST VERSION

    **Objective**: Comprehensive analysis of 1,762 engineered features for the Chronos 2 model

    **File**: `data/processed/features_jao_24month.parquet`

    ## Feature Architecture

    - **Tier-1 CNEC**: 510 features (58 top CNECs with detailed rolling stats)
    - **Tier-2 CNEC**: 390 features (150 CNECs with basic stats)
    - **PTDF**: 612 features (network sensitivity coefficients)
    - **Net Positions**: 84 features (zone boundaries with lags)
    - **MaxBEX Lags**: 76 features (historical capacity lags)
    - **LTA**: 40 features (long-term allocations)
    - **Temporal**: 12 features (cyclic time encoding)
    - **Targets**: 38 Core FBMC borders

    **Total**: 1,762 features + 38 targets = 1,800 columns (+ timestamp)
    """
    )
    return
@app.cell
def _(Path, pl):
    # Load the engineered feature matrix
    features_path = Path('data/processed/features_jao_24month.parquet')
    print(f"Loading engineered features from: {features_path}")
    features_df = pl.read_parquet(features_path)
    print(f"✓ Loaded: {features_df.shape[0]:,} rows × {features_df.shape[1]:,} columns")
    print(f"✓ Date range: {features_df['mtu'].min()} to {features_df['mtu'].max()}")
    print(f"✓ Memory usage: {features_df.estimated_size('mb'):.2f} MB")
    return (features_df,)
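

# Aside: a minimal sketch of memory-light access, assuming the same parquet path as
# above. For quick iteration the full 1,800-column frame is not required in memory:
# pl.scan_parquet reads lazily, so a column subset (here the timestamp plus the
# target columns) can be pulled without materialising everything. Illustrative only.
@app.cell
def _(Path, pl):
    _lazy = pl.scan_parquet(Path('data/processed/features_jao_24month.parquet'))
    # Resolve the schema without reading the data, then select mtu + targets
    _cols = [_c for _c in _lazy.collect_schema().names() if _c == 'mtu' or _c.startswith('target_')]
    _targets_only = _lazy.select(_cols).collect()
    print(f"Targets-only subset: {_targets_only.shape[0]:,} rows × {_targets_only.shape[1]:,} columns")
    return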
@app.cell
def _(features_df, mo):
    mo.md(
        f"""
    ## Dataset Overview

    - **Shape**: {features_df.shape[0]:,} rows × {features_df.shape[1]:,} columns
    - **Date Range**: {features_df['mtu'].min()} to {features_df['mtu'].max()}
    - **Total Hours**: {features_df.shape[0]:,} (24 months)
    - **Memory**: {features_df.estimated_size('mb'):.2f} MB
    - **Timeline Sorted**: {features_df['mtu'].is_sorted()}

    ✓ All 1,762 expected features present and validated.
    """
    )
    return
@app.cell
def _(mo):
    mo.md("""## 1. Feature Category Breakdown""")
    return
@app.cell
def _(features_df, mo, pl):
    # Categorize all columns. PTDF features are embedded in tier-1 columns
    # under the `_ptdf_` naming pattern.
    tier1_ptdf_features = [_c for _c in features_df.columns if '_ptdf_' in _c and _c.startswith('cnec_t1_')]
    tier1_features = [_c for _c in features_df.columns if _c.startswith('cnec_t1_') and '_ptdf_' not in _c]
    tier2_features = [_c for _c in features_df.columns if _c.startswith('cnec_t2_')]
    ptdf_features = tier1_ptdf_features

    # Net Position features: min/max zone boundaries, base values vs. L24/L72 lags
    netpos_base_features = [_c for _c in features_df.columns if (_c.startswith('min') or _c.startswith('max')) and '_L' not in _c]
    netpos_lag_features = [_c for _c in features_df.columns if (_c.startswith('min') or _c.startswith('max')) and ('_L24' in _c or '_L72' in _c)]
    netpos_features = netpos_base_features + netpos_lag_features  # 84 total (28 base + 56 lags)

    # MaxBEX lag features: 38 borders × 2 lags = 76 total
    maxbex_lag_features = [_c for _c in features_df.columns if 'border_' in _c and ('_L24' in _c or '_L72' in _c)]

    lta_features = [_c for _c in features_df.columns if _c.startswith('lta_')]
    temporal_features = [_c for _c in features_df.columns if _c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
    target_features = [_c for _c in features_df.columns if _c.startswith('target_')]

    # Null percentage per category
    def calc_null_pct(cols):
        if not cols:
            return 0.0
        null_count = features_df.select(cols).null_count().sum_horizontal()[0]
        total_cells = len(features_df) * len(cols)
        return (null_count / total_cells * 100) if total_cells > 0 else 0.0

    category_summary = pl.DataFrame({
        'Category': [
            'Tier-1 CNEC',
            'Tier-2 CNEC',
            'PTDF (Tier-1)',
            'Net Positions (base)',
            'Net Positions (lags)',
            'MaxBEX Lags',
            'LTA',
            'Temporal',
            'Targets',
            'Timestamp',
            'TOTAL',
        ],
        'Features': [
            len(tier1_features),
            len(tier2_features),
            len(ptdf_features),
            len(netpos_base_features),
            len(netpos_lag_features),
            len(maxbex_lag_features),
            len(lta_features),
            len(temporal_features),
            len(target_features),
            1,
            features_df.shape[1],
        ],
        'Null %': [
            f"{calc_null_pct(tier1_features):.2f}%",
            f"{calc_null_pct(tier2_features):.2f}%",
            f"{calc_null_pct(ptdf_features):.2f}%",
            f"{calc_null_pct(netpos_base_features):.2f}%",
            f"{calc_null_pct(netpos_lag_features):.2f}%",
            f"{calc_null_pct(maxbex_lag_features):.2f}%",
            f"{calc_null_pct(lta_features):.2f}%",
            f"{calc_null_pct(temporal_features):.2f}%",
            f"{calc_null_pct(target_features):.2f}%",
            "0.00%",
            f"{(features_df.null_count().sum_horizontal()[0] / (len(features_df) * len(features_df.columns)) * 100):.2f}%",
        ],
    })
    mo.ui.table(category_summary.to_pandas())
    return category_summary, target_features, temporal_features
@app.cell
def _(mo):
    mo.md("""## 2. Comprehensive Feature Catalog""")
    return
@app.cell
def _(features_df, mo, np, pl):
    # Build a comprehensive catalog covering every column
    catalog_data = []
    for col in features_df.columns:
        col_data = features_df[col]

        # Assign a category using the same naming patterns as above
        if col == 'mtu':
            category = 'Timestamp'
        elif '_ptdf_' in col and col.startswith('cnec_t1_'):
            category = 'PTDF (Tier-1)'
        elif col.startswith('cnec_t1_'):
            category = 'Tier-1 CNEC'
        elif col.startswith('cnec_t2_'):
            category = 'Tier-2 CNEC'
        elif (col.startswith('min') or col.startswith('max')) and ('_L24' in col or '_L72' in col):
            category = 'Net Position (lag)'
        elif col.startswith('min') or col.startswith('max'):
            category = 'Net Position (base)'
        elif 'border_' in col and ('_L24' in col or '_L72' in col):
            category = 'MaxBEX Lag'
        elif col.startswith('lta_'):
            category = 'LTA'
        elif col.startswith('target_'):
            category = 'Target'
        elif col in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']:
            category = 'Temporal'
        else:
            category = 'Other'

        # Basic info
        dtype = str(col_data.dtype)
        n_unique = col_data.n_unique()
        n_null = col_data.null_count()
        null_pct = n_null / len(col_data) * 100

        # Statistics for numeric columns
        if dtype in ['Int64', 'Float64', 'Float32', 'Int32']:
            try:
                col_min = col_data.min()
                col_max = col_data.max()
                col_mean = col_data.mean()
                col_median = col_data.median()
                col_std = col_data.std()
                # Sample five non-null values to show variation; PTDF sensitivity
                # coefficients get 4 decimals, everything else 1
                sample_vals = col_data.drop_nulls().head(5).to_list()
                sample_str = ', '.join([
                    f"{v:.4f}" if 'ptdf' in col.lower() and isinstance(v, float) and not np.isnan(v) else
                    f"{v:.1f}" if isinstance(v, (float, int)) and not np.isnan(v) else
                    str(v)
                    for v in sample_vals
                ])
            except Exception:
                col_min = col_max = col_mean = col_median = col_std = None
                sample_str = "N/A"
        else:
            col_min = col_max = col_mean = col_median = col_std = None
            sample_vals = col_data.drop_nulls().head(5).to_list()
            sample_str = ', '.join([str(v) for v in sample_vals])

        # MW units apply to capacity/flow categories
        is_capacity = category in ['Target', 'Tier-1 CNEC', 'Tier-2 CNEC', 'MaxBEX Lag', 'LTA']

        # Format statistics with human-readable precision
        def format_stat(val, add_unit=False):
            if val is None:
                return None
            try:
                if np.isnan(val) or np.isinf(val):
                    return "N/A"
                formatted = f"{val:.1f}"
                if add_unit:
                    formatted += " MW"
                return formatted
            except (TypeError, ValueError, AttributeError):
                return str(val)

        catalog_data.append({
            'Column': col,
            'Category': category,
            'Type': dtype,
            'Unique': f"{n_unique:,}" if n_unique > 1000 else str(n_unique),
            'Null_Count': f"{n_null:,}" if n_null > 1000 else str(n_null),
            'Null_%': f"{null_pct:.1f}%",
            'Min': format_stat(col_min, is_capacity),
            'Max': format_stat(col_max, is_capacity),
            'Mean': format_stat(col_mean, is_capacity),
            'Median': format_stat(col_median, is_capacity),
            'Std': format_stat(col_std, is_capacity),
            'Sample_Values': sample_str,
        })

    feature_catalog = pl.DataFrame(catalog_data)
    mo.vstack([
        mo.md(f"""
    ### Complete Feature Catalog ({len(feature_catalog)} columns)

    This table shows comprehensive statistics for every column in the dataset.
    Use the search and filter capabilities to explore specific features.
    """),
        mo.ui.table(feature_catalog.to_pandas(), page_size=20),
    ])
    return (feature_catalog,)
@app.cell
def _(mo):
    mo.md("""## 3. Data Quality Analysis""")
    return
@app.cell
def _(feature_catalog, mo, pl):
    # Identify problematic features

    # Features with >50% nulls (sort numerically, not lexicographically)
    high_null_features = feature_catalog.filter(
        pl.col('Null_%').str.strip_suffix('%').cast(pl.Float64) > 50.0
    ).sort(
        pl.col('Null_%').str.strip_suffix('%').cast(pl.Float64), descending=True
    )

    # Features with zero variance (constant values); Std is stored as a string,
    # so check both the "0.0" and "0.0 MW" formats
    zero_var_features = feature_catalog.filter(
        (pl.col('Std').is_not_null()) &
        ((pl.col('Std') == "0.0") | (pl.col('Std') == "0.0 MW"))
    )

    mo.md(f"""
    ### Quality Checks

    - **High Null Features** (>50% missing): {len(high_null_features)} features
    - **Zero Variance Features** (constant): {len(zero_var_features)} features
    """)
    return high_null_features, zero_var_features
@app.cell
def _(high_null_features, mo):
    # Marimo renders only a cell's last expression, so collect the output first
    if len(high_null_features) > 0:
        _out = mo.vstack([
            mo.md("### Features with >50% Null Values"),
            mo.ui.table(high_null_features.to_pandas(), page_size=20),
        ])
    else:
        _out = mo.md("✓ No features with >50% null values")
    _out
    return
@app.cell
def _(mo, zero_var_features):
    if len(zero_var_features) > 0:
        _out = mo.vstack([
            mo.md("### Features with Zero Variance (Constant Values)"),
            mo.ui.table(zero_var_features.to_pandas(), page_size=20),
        ])
    else:
        _out = mo.md("✓ No features with zero variance")
    _out
    return
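

# If the checks above flag columns, a simple pruning pass before training could drop
# them. A minimal sketch reusing the two tables above; it always keeps the timestamp
# and the target columns, and only prints what would be removed.
@app.cell
def _(features_df, high_null_features, zero_var_features):
    _flagged = set(high_null_features['Column'].to_list()) | set(zero_var_features['Column'].to_list())
    _drop = [_c for _c in _flagged if _c != 'mtu' and not _c.startswith('target_')]
    _pruned = features_df.drop(_drop)
    print(f"Would drop {len(_drop)} flagged columns → {_pruned.shape[1]:,} columns remain")
    return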
@app.cell
def _(mo):
    mo.md("""## 4. Tier-1 CNEC Features (510 features)""")
    return
@app.cell
def _(feature_catalog, mo, pl):
    tier1_catalog = feature_catalog.filter(pl.col('Category') == 'Tier-1 CNEC')
    # PTDF features are catalogued as a separate category (Section 5)
    mo.vstack([
        mo.md(f"""
    **Tier-1 CNEC Features**: {len(tier1_catalog)} features

    The 58 most critical CNECs with detailed rolling statistics.
    """),
        mo.ui.table(tier1_catalog.to_pandas(), page_size=20),
    ])
    return
@app.cell
def _(mo):
    mo.md("""## 5. PTDF Features (612 features)""")
    return
@app.cell
def _(feature_catalog, mo, pl):
    ptdf_catalog = feature_catalog.filter(pl.col('Category') == 'PTDF (Tier-1)')
    mo.vstack([
        mo.md(f"""
    **PTDF Features**: {len(ptdf_catalog)} features

    Power Transfer Distribution Factors capture network sensitivity:
    how a 1 MW injection in each zone changes the flow on each CNEC.
    """),
        mo.ui.table(ptdf_catalog.to_pandas(), page_size=20),
    ])
    return
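

# To make the PTDF interpretation concrete: under the linear FBMC model the flow
# change on a CNEC is the PTDF-weighted sum of zonal net-position changes,
# Δflow = Σ_z PTDF_z · ΔNP_z. A minimal sketch with hypothetical zone PTDFs and
# net-position changes (values are illustrative, not taken from the dataset):
@app.cell
def _(np):
    _ptdf = np.array([0.25, -0.10, 0.05])          # MW of CNEC flow per MW injected, per zone
    _delta_np = np.array([500.0, -300.0, -200.0])  # change in zonal net positions (MW)
    _delta_flow = float(_ptdf @ _delta_np)         # 125 + 30 - 10 = 145 MW
    print(f"Linearised flow change on the CNEC: {_delta_flow:.1f} MW")
    return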
@app.cell
def _(mo):
    mo.md("""## 6. Target Variables (38 Core FBMC Borders)""")
    return
@app.cell
def _(feature_catalog, mo, pl):
    target_catalog = feature_catalog.filter(pl.col('Category') == 'Target')
    mo.vstack([
        mo.md(f"""
    **Target Variables**: {len(target_catalog)} borders

    These are the 38 Core FBMC borders being forecast.
    """),
        mo.ui.table(target_catalog.to_pandas(), page_size=20),
    ])
    return
@app.cell
def _(alt, features_df, target_features):
    # Plot a sample target over time (renders nothing if no targets exist)
    if target_features:
        sample_target_col = target_features[0]
        target_timeseries = features_df.select(['mtu', sample_target_col]).sort('mtu')
        target_chart = alt.Chart(target_timeseries.to_pandas()).mark_line().encode(
            x=alt.X('mtu:T', title='Date'),
            y=alt.Y(f'{sample_target_col}:Q', title='Capacity (MW)', axis=alt.Axis(format='.1f')),
            tooltip=[
                alt.Tooltip('mtu:T', title='Date'),
                alt.Tooltip(f'{sample_target_col}:Q', title='Capacity (MW)', format='.1f'),
            ],
        ).properties(
            title=f'Sample Target Variable Over Time: {sample_target_col}',
            width=800,
            height=400,
        ).interactive()
    else:
        target_chart = None
    target_chart
    return
@app.cell
def _(mo):
    mo.md("""## 7. Temporal Features (12 features)""")
    return
@app.cell
def _(feature_catalog, features_df, mo, pl, temporal_features):
    temporal_catalog = feature_catalog.filter(pl.col('Category') == 'Temporal')

    # Round the cyclic encodings to 3 decimals for readability
    temporal_sample = features_df.select(['mtu'] + temporal_features).head(24).to_pandas()
    cyclic_cols = ['hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']
    for cyclic_col in cyclic_cols:
        if cyclic_col in temporal_sample.columns:
            temporal_sample[cyclic_col] = temporal_sample[cyclic_col].round(3)

    mo.vstack([
        mo.md(f"""
    **Temporal Features**: {len(temporal_catalog)} features

    Cyclic encoding of time to capture periodicity.
    """),
        mo.ui.table(temporal_catalog.to_pandas()),
        mo.md("### Sample Temporal Values (First 24 Hours)"),
        mo.ui.table(temporal_sample),
    ])
    return
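

# Why cyclic encoding: the raw hour jumps from 23 back to 0, while sin/cos pairs keep
# those hours adjacent on the unit circle. A sketch of the presumed construction
# (hour_sin = sin(2π·hour/24)) spot-checked against the stored column; assumes the
# plain `hour` column listed above exists alongside `hour_sin`.
@app.cell
def _(features_df, np, pl):
    _recomputed = features_df.select(
        pl.col('hour'),
        pl.col('hour_sin'),
        (2 * np.pi * pl.col('hour') / 24).sin().alias('hour_sin_recomputed'),
    ).head(24)
    _max_err = _recomputed.select(
        (pl.col('hour_sin') - pl.col('hour_sin_recomputed')).abs().max()
    ).item()
    print(f"Max |hour_sin - recomputed| over the first 24 rows: {_max_err:.2e}")
    return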
@app.cell
def _(mo):
    mo.md("""## 8. Net Position Features (84 features)""")
    return
@app.cell
def _(feature_catalog, mo, pl):
    # Base and lag Net Position features together
    netpos_catalog = feature_catalog.filter(
        pl.col('Category').is_in(['Net Position (base)', 'Net Position (lag)'])
    )
    mo.vstack([
        mo.md(f"""
    **Net Position Features**: {len(netpos_catalog)} features (28 base + 56 lags)

    Zone-level scheduled positions (min/max boundaries):

    - **Base features (28)**: current values such as `minAT`, `maxBE`
    - **Lag features (56)**: L24 and L72 lags (e.g. `minAT_L24`, `maxBE_L72`)
    """),
        mo.ui.table(netpos_catalog.to_pandas(), page_size=20),
    ])
    return
@app.cell
def _(mo):
    mo.md("""## 9. MaxBEX Lag Features (76 features)""")
    return
@app.cell
def _(feature_catalog, mo, pl):
    maxbex_catalog = feature_catalog.filter(pl.col('Category') == 'MaxBEX Lag')
    mo.vstack([
        mo.md(f"""
    **MaxBEX Lag Features**: {len(maxbex_catalog)} features (38 borders × 2 lags)

    Lags of the Maximum Bilateral Exchange capacity targets:

    - **L24 lags (38)**: values from 24 hours earlier (e.g. `border_AT_CZ_L24`)
    - **L72 lags (38)**: values from 72 hours earlier (e.g. `border_AT_CZ_L72`)

    These give the model each border's recent MaxBEX history to inform forecasts.
    """),
        mo.ui.table(maxbex_catalog.to_pandas(), page_size=20),
    ])
    return
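

# For reference, an L24 lag is simply the same series shifted 24 rows on a gap-free,
# sorted hourly index. A minimal self-contained sketch of how such a feature would be
# built (toy data and column names, not the real borders):
@app.cell
def _(pl):
    _toy = pl.DataFrame({'maxbex': [float(_i) for _i in range(30)]})
    _toy = _toy.with_columns(pl.col('maxbex').shift(24).alias('maxbex_L24'))
    print(_toy.tail(3))  # maxbex_L24 trails maxbex by exactly 24 rows
    return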
@app.cell
def _(mo):
    mo.md("""## 10. Summary & Validation""")
    return
@app.cell
def _(category_summary, features_df, mo, pl):
    # Final validation summary; Expected/Actual are kept as strings so the
    # DataFrame columns have a single consistent dtype
    validation_checks = []

    # Check 1: expected feature count (all columns minus the timestamp)
    expected_features = 1762
    actual_features = features_df.shape[1] - 1
    validation_checks.append({
        'Check': 'Feature Count',
        'Expected': str(expected_features),
        'Actual': str(actual_features),
        'Status': '✓ PASS' if actual_features == expected_features else '✗ FAIL',
    })

    # Check 2: no category with excessive nulls (>80%)
    max_null_pct = float(category_summary.filter(
        pl.col('Category') != 'TOTAL'
    )['Null %'].str.strip_suffix('%').cast(pl.Float64).max())
    validation_checks.append({
        'Check': 'Category Null % < 80%',
        'Expected': '< 80%',
        'Actual': f"{max_null_pct:.2f}%",
        'Status': '✓ PASS' if max_null_pct < 80 else '✗ FAIL',
    })

    # Check 3: timeline sorted
    validation_checks.append({
        'Check': 'Timeline Sorted',
        'Expected': 'True',
        'Actual': str(features_df['mtu'].is_sorted()),
        'Status': '✓ PASS' if features_df['mtu'].is_sorted() else '✗ FAIL',
    })

    # Check 4: no completely empty columns
    all_null_cols = sum(1 for _c in features_df.columns if features_df[_c].null_count() == len(features_df))
    validation_checks.append({
        'Check': 'No Empty Columns',
        'Expected': '0',
        'Actual': str(all_null_cols),
        'Status': '✓ PASS' if all_null_cols == 0 else '✗ FAIL',
    })

    # Check 5: all targets present
    target_count = len([_c for _c in features_df.columns if _c.startswith('target_')])
    validation_checks.append({
        'Check': 'All 38 Targets Present',
        'Expected': '38',
        'Actual': str(target_count),
        'Status': '✓ PASS' if target_count == 38 else '✗ FAIL',
    })

    validation_df = pl.DataFrame(validation_checks)
    mo.vstack([
        mo.md("### Final Validation Checks"),
        mo.ui.table(validation_df.to_pandas()),
    ])
    return (validation_checks,)
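

# Before training, any split should be chronological so the L24/L72 lag features
# never leak future information into the validation window. A minimal sketch with a
# hypothetical 80/20 split (not part of the validation checks above):
@app.cell
def _(features_df):
    _split = int(len(features_df) * 0.8)
    _train, _valid = features_df[:_split], features_df[_split:]
    print(f"Train: {_train['mtu'].min()} → {_train['mtu'].max()} ({len(_train):,} h)")
    print(f"Valid: {_valid['mtu'].min()} → {_valid['mtu'].max()} ({len(_valid):,} h)")
    return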
@app.cell
def _(mo, validation_checks):
    # Overall status (assigned to a variable so marimo renders it as the output)
    all_pass = all(_c['Status'].startswith('✓') for _c in validation_checks)
    failed = [_c['Check'] for _c in validation_checks if _c['Status'].startswith('✗')]
    if all_pass:
        _status = mo.md("""
    ## ✓ All Validation Checks PASSED

    The engineered feature dataset is ready for Chronos 2 model training!

    ### Next Steps:

    1. Collect weather data (optional enhancement)
    2. Collect ENTSO-E data (optional enhancement)
    3. Begin zero-shot Chronos 2 inference testing
    """)
    else:
        _status = mo.md(f"""
    ## ⚠ Validation Issues Detected

    **Failed Checks**: {', '.join(failed)}

    Please review and fix these issues before proceeding to model training.
    """)
    _status
    return
@app.cell
def _(mo):
    mo.md(
        """
    ---

    ## Feature Engineering Complete

    **Status**: 1,762 JAO features engineered ✓

    **File**: `data/processed/features_jao_24month.parquet` (4.22 MB)

    **Next**: Decide whether to add weather/ENTSO-E features or proceed with zero-shot inference.
    """
    )
    return
if __name__ == "__main__":
    app.run()