"""FBMC Flow Forecasting - Engineered Features EDA (LATEST) Comprehensive exploratory data analysis of the final engineered feature matrix. File: data/processed/features_jao_24month.parquet Features: 1,762 engineered features + 38 targets + 1 timestamp Timeline: October 2023 - October 2025 (24 months, 17,544 hours) This is the LATEST working version for feature validation before model training. Usage: marimo edit notebooks/03_engineered_features_eda.py """ import marimo __generated_with = "0.17.2" app = marimo.App(width="full") @app.cell def _(): import marimo as mo import polars as pl import altair as alt from pathlib import Path import numpy as np return Path, alt, mo, np, pl @app.cell(hide_code=True) def _(mo): mo.md( r""" # Engineered Features EDA - LATEST VERSION **Objective**: Comprehensive analysis of 1,762 engineered features for Chronos 2 model **File**: `data/processed/features_jao_24month.parquet` ## Feature Architecture: - **Tier-1 CNEC**: 510 features (58 top CNECs with detailed rolling stats) - **Tier-2 CNEC**: 390 features (150 CNECs with basic stats) - **PTDF**: 612 features (network sensitivity coefficients) - **Net Positions**: 84 features (zone boundaries with lags) - **MaxBEX Lags**: 76 features (historical capacity lags) - **LTA**: 40 features (long-term allocations) - **Temporal**: 12 features (cyclic time encoding) - **Targets**: 38 Core FBMC borders **Total**: 1,762 features + 38 targets = 1,800 columns (+ timestamp) """ ) return @app.cell def _(Path, pl): # Load engineered features features_path = Path('data/processed/features_jao_24month.parquet') print(f"Loading engineered features from: {features_path}") features_df = pl.read_parquet(features_path) print(f"✓ Loaded: {features_df.shape[0]:,} rows × {features_df.shape[1]:,} columns") print(f"✓ Date range: {features_df['mtu'].min()} to {features_df['mtu'].max()}") print(f"✓ Memory usage: {features_df.estimated_size('mb'):.2f} MB") return (features_df,) @app.cell(hide_code=True) def _(features_df, mo): mo.md( f""" ## Dataset Overview - **Shape**: {features_df.shape[0]:,} rows × {features_df.shape[1]:,} columns - **Date Range**: {features_df['mtu'].min()} to {features_df['mtu'].max()} - **Total Hours**: {features_df.shape[0]:,} (24 months) - **Memory**: {features_df.estimated_size('mb'):.2f} MB - **Timeline Sorted**: {features_df['mtu'].is_sorted()} ✓ All 1,762 expected features present and validated. """ ) return @app.cell(hide_code=True) def _(mo): mo.md("""## 1. Feature Category Breakdown""") return @app.cell(hide_code=True) def _(features_df, mo, pl): # Categorize all columns with CORRECT patterns # PTDF features are embedded in tier-1 columns with _ptdf_ pattern tier1_ptdf_features = [_c for _c in features_df.columns if '_ptdf_' in _c and _c.startswith('cnec_t1_')] tier1_features = [_c for _c in features_df.columns if _c.startswith('cnec_t1_') and '_ptdf_' not in _c] tier2_features = [_c for _c in features_df.columns if _c.startswith('cnec_t2_')] ptdf_features = tier1_ptdf_features # PTDF features found in tier-1 with _ptdf_ pattern # Net Position features - CORRECTED DETECTION netpos_base_features = [_c for _c in features_df.columns if (_c.startswith('min') or _c.startswith('max')) and '_L' not in _c and _c != 'mtu'] netpos_lag_features = [_c for _c in features_df.columns if (_c.startswith('min') or _c.startswith('max')) and ('_L24' in _c or '_L72' in _c)] netpos_features = netpos_base_features + netpos_lag_features # 84 total (28 base + 56 lags) # MaxBEX lag features - CORRECTED DETECTION maxbex_lag_features = [_c for _c in features_df.columns if 'border_' in _c and ('_L24' in _c or '_L72' in _c)] # 76 total lta_features = [_c for _c in features_df.columns if _c.startswith('lta_')] temporal_features = [_c for _c in features_df.columns if _c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']] target_features = [_c for _c in features_df.columns if _c.startswith('target_')] # Calculate null percentages for each category def calc_null_pct(cols): if not cols: return 0.0 null_count = features_df.select(cols).null_count().sum_horizontal()[0] total_cells = len(features_df) * len(cols) return (null_count / total_cells * 100) if total_cells > 0 else 0.0 category_summary = pl.DataFrame({ 'Category': [ 'Tier-1 CNEC', 'Tier-2 CNEC', 'PTDF (Tier-1)', 'Net Positions (base)', 'Net Positions (lags)', 'MaxBEX Lags', 'LTA', 'Temporal', 'Targets', 'Timestamp', 'TOTAL' ], 'Features': [ len(tier1_features), len(tier2_features), len(ptdf_features), len(netpos_base_features), len(netpos_lag_features), len(maxbex_lag_features), len(lta_features), len(temporal_features), len(target_features), 1, features_df.shape[1] ], 'Null %': [ f"{calc_null_pct(tier1_features):.2f}%", f"{calc_null_pct(tier2_features):.2f}%", f"{calc_null_pct(ptdf_features):.2f}%", f"{calc_null_pct(netpos_base_features):.2f}%", f"{calc_null_pct(netpos_lag_features):.2f}%", f"{calc_null_pct(maxbex_lag_features):.2f}%", f"{calc_null_pct(lta_features):.2f}%", f"{calc_null_pct(temporal_features):.2f}%", f"{calc_null_pct(target_features):.2f}%", "0.00%", f"{(features_df.null_count().sum_horizontal()[0] / (len(features_df) * len(features_df.columns)) * 100):.2f}%" ] }) mo.ui.table(category_summary.to_pandas()) return category_summary, target_features, temporal_features @app.cell(hide_code=True) def _(mo): mo.md("""## 2. Comprehensive Feature Catalog""") return @app.cell def _(features_df, mo, np, pl): # Create comprehensive feature catalog for ALL columns catalog_data = [] for col in features_df.columns: col_data = features_df[col] # Determine category (CORRECTED patterns) if col == 'mtu': category = 'Timestamp' elif '_ptdf_' in col and col.startswith('cnec_t1_'): category = 'PTDF (Tier-1)' elif col.startswith('cnec_t1_'): category = 'Tier-1 CNEC' elif col.startswith('cnec_t2_'): category = 'Tier-2 CNEC' elif (col.startswith('min') or col.startswith('max')) and ('_L24' in col or '_L72' in col): category = 'Net Position (lag)' elif (col.startswith('min') or col.startswith('max')) and col != 'mtu': category = 'Net Position (base)' elif 'border_' in col and ('_L24' in col or '_L72' in col): category = 'MaxBEX Lag' elif col.startswith('lta_'): category = 'LTA' elif col.startswith('target_'): category = 'Target' elif col in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']: category = 'Temporal' else: category = 'Other' # Basic info dtype = str(col_data.dtype) n_unique = col_data.n_unique() n_null = col_data.null_count() null_pct = (n_null / len(col_data) * 100) # Statistics for numeric columns if dtype in ['Int64', 'Float64', 'Float32', 'Int32']: try: col_min = col_data.min() col_max = col_data.max() col_mean = col_data.mean() col_median = col_data.median() col_std = col_data.std() # Get sample non-null values (5 samples to show variation) sample_vals = col_data.drop_nulls().head(5).to_list() # Use 4 decimals for PTDF features (sensitivity coefficients), 1 decimal for others sample_str = ', '.join([ f"{v:.4f}" if 'ptdf' in col.lower() and isinstance(v, float) and not np.isnan(v) else f"{v:.1f}" if isinstance(v, (float, int)) and not np.isnan(v) else str(v) for v in sample_vals ]) except Exception: col_min = col_max = col_mean = col_median = col_std = None sample_str = "N/A" else: col_min = col_max = col_mean = col_median = col_std = None sample_vals = col_data.drop_nulls().head(5).to_list() sample_str = ', '.join([str(v) for v in sample_vals]) # Format statistics with human-readable precision def format_stat(val, add_unit=False): if val is None: return None try: # Check for nan or inf if np.isnan(val) or np.isinf(val): return "N/A" # Format with 1 decimal place formatted = f"{val:.1f}" # Add MW unit if this is a capacity/flow value if add_unit and category in ['Target', 'Tier-1 CNEC', 'Tier-2 CNEC', 'MaxBEX Lag']: formatted += " MW" return formatted except (TypeError, ValueError, AttributeError): return str(val) # Determine if we should add MW units is_capacity = category in ['Target', 'Tier-1 CNEC', 'Tier-2 CNEC', 'MaxBEX Lag', 'LTA'] catalog_data.append({ 'Column': col, 'Category': category, 'Type': dtype, 'Unique': f"{n_unique:,}" if n_unique > 1000 else str(n_unique), 'Null_Count': f"{n_null:,}" if n_null > 1000 else str(n_null), 'Null_%': f"{null_pct:.1f}%", 'Min': format_stat(col_min, is_capacity), 'Max': format_stat(col_max, is_capacity), 'Mean': format_stat(col_mean, is_capacity), 'Median': format_stat(col_median, is_capacity), 'Std': format_stat(col_std, is_capacity), 'Sample_Values': sample_str }) feature_catalog = pl.DataFrame(catalog_data) mo.md(f""" ### Complete Feature Catalog ({len(feature_catalog)} columns) This table shows comprehensive statistics for every column in the dataset. Use the search and filter capabilities to explore specific features. """) mo.ui.table(feature_catalog.to_pandas(), page_size=20) return (feature_catalog,) @app.cell(hide_code=True) def _(mo): mo.md("""## 3. Data Quality Analysis""") return @app.cell def _(feature_catalog, mo, pl): # Identify problematic features # Features with >50% nulls high_null_features = feature_catalog.filter( pl.col('Null_%').str.strip_suffix('%').cast(pl.Float64) > 50.0 ).sort('Null_%', descending=True) # Features with zero variance (constant values) # Need to check both "0.0" and "0.0 MW" formats zero_var_features = feature_catalog.filter( (pl.col('Std').is_not_null()) & ((pl.col('Std') == "0.0") | (pl.col('Std') == "0.0 MW")) ) mo.md(f""" ### Quality Checks - **High Null Features** (>50% missing): {len(high_null_features)} features - **Zero Variance Features** (constant): {len(zero_var_features)} features """) return high_null_features, zero_var_features @app.cell def _(high_null_features, mo): if len(high_null_features) > 0: mo.md("### Features with >50% Null Values") mo.ui.table(high_null_features.to_pandas(), page_size=20) else: mo.md("✓ No features with >50% null values") return @app.cell def _(mo, zero_var_features): if len(zero_var_features) > 0: mo.md("### Features with Zero Variance (Constant Values)") mo.ui.table(zero_var_features.to_pandas(), page_size=20) else: mo.md("✓ No features with zero variance") return @app.cell(hide_code=True) def _(mo): mo.md("""## 4. Tier-1 CNEC Features (510 features)""") return @app.cell def _(feature_catalog, mo, pl): tier1_catalog = feature_catalog.filter(pl.col('Category') == 'Tier-1 CNEC') # Note: PTDF features are separate category now mo.md(f""" **Tier-1 CNEC Features**: {len(tier1_catalog)} features Top 58 most critical CNECs with detailed rolling statistics. """) mo.ui.table(tier1_catalog.to_pandas(), page_size=20) return @app.cell(hide_code=True) def _(mo): mo.md("""## 5. PTDF Features (552 features)""") return @app.cell def _(feature_catalog, mo, pl): ptdf_catalog = feature_catalog.filter(pl.col('Category') == 'PTDF (Tier-1)') mo.md(f""" **PTDF Features**: {len(ptdf_catalog)} features Power Transfer Distribution Factors showing network sensitivity. How 1 MW injection in each zone affects each CNEC. """) mo.ui.table(ptdf_catalog.to_pandas(), page_size=20) return @app.cell(hide_code=True) def _(mo): mo.md("""## 6. Target Variables (38 Core FBMC Borders)""") return @app.cell def _(feature_catalog, mo, pl): target_catalog = feature_catalog.filter(pl.col('Category') == 'Target') mo.md(f""" **Target Variables**: {len(target_catalog)} borders These are the 38 Core FBMC borders we're forecasting. """) mo.ui.table(target_catalog.to_pandas(), page_size=20) return @app.cell def _(alt, features_df, target_features): # Plot sample target over time if target_features: sample_target_col = target_features[0] target_timeseries = features_df.select(['mtu', sample_target_col]).sort('mtu') target_chart = alt.Chart(target_timeseries.to_pandas()).mark_line().encode( x=alt.X('mtu:T', title='Date'), y=alt.Y(f'{sample_target_col}:Q', title='Capacity (MW)', format='.1f'), tooltip=[ alt.Tooltip('mtu:T', title='Date'), alt.Tooltip(f'{sample_target_col}:Q', title='Capacity (MW)', format='.1f') ] ).properties( title=f'Sample Target Variable Over Time: {sample_target_col}', width=800, height=400 ).interactive() target_chart else: # Always define variables even if target_features is empty sample_target_col = None target_timeseries = None target_chart = None return @app.cell(hide_code=True) def _(mo): mo.md("""## 7. Temporal Features (12 features)""") return @app.cell def _(feature_catalog, features_df, mo, pl, temporal_features): temporal_catalog = feature_catalog.filter(pl.col('Category') == 'Temporal') mo.md(f""" **Temporal Features**: {len(temporal_catalog)} features Cyclic encoding of time to capture periodicity. """) mo.ui.table(temporal_catalog.to_pandas()) # Show sample temporal data mo.md("### Sample Temporal Values (First 24 Hours)") # Format temporal features to 3 decimal places for readability temporal_sample = features_df.select(['mtu'] + temporal_features).head(24).to_pandas() cyclic_cols = ['hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos'] # Apply formatting to cyclic columns for cyclic_col in cyclic_cols: if cyclic_col in temporal_sample.columns: temporal_sample[cyclic_col] = temporal_sample[cyclic_col].round(3) mo.ui.table(temporal_sample) return @app.cell(hide_code=True) def _(mo): mo.md("""## 8. Net Position Features (84 features)""") return @app.cell def _(feature_catalog, mo, pl): # Filter for both base and lag Net Position features netpos_catalog = feature_catalog.filter( (pl.col('Category') == 'Net Position (base)') | (pl.col('Category') == 'Net Position (lag)') ) mo.md(f""" **Net Position Features**: {len(netpos_catalog)} features (28 base + 56 lags) Zone-level scheduled positions (min/max boundaries): - **Base features (28)**: Current values like `minAT`, `maxBE`, etc. - **Lag features (56)**: L24 and L72 lags (e.g., `minAT_L24`, `maxBE_L72`) """) mo.ui.table(netpos_catalog.to_pandas(), page_size=20) return @app.cell(hide_code=True) def _(mo): mo.md("""## 9. MaxBEX Lag Features (76 features)""") return @app.cell def _(feature_catalog, mo, pl): maxbex_catalog = feature_catalog.filter(pl.col('Category') == 'MaxBEX Lag') mo.md(f""" **MaxBEX Lag Features**: {len(maxbex_catalog)} features (38 borders × 2 lags) Maximum Bilateral Exchange capacity target lags: - **L24 lags (38)**: Day-ahead values (e.g., `border_AT_CZ_L24`) - **L72 lags (38)**: 3-day-ahead values (e.g., `border_AT_CZ_L72`) These provide historical MaxBEX targets for each border to inform forecasts. """) mo.ui.table(maxbex_catalog.to_pandas(), page_size=20) return @app.cell(hide_code=True) def _(mo): mo.md("""## 10. Summary & Validation""") return @app.cell def _(category_summary, features_df, mo, pl): # Final validation summary validation_checks = [] # Check 1: Expected feature count expected_features = 1762 actual_features = features_df.shape[1] - 1 # Exclude timestamp validation_checks.append({ 'Check': 'Feature Count', 'Expected': expected_features, 'Actual': actual_features, 'Status': '✓ PASS' if actual_features == expected_features else '✗ FAIL' }) # Check 2: No excessive nulls (>80% in any category) max_null_pct = float(category_summary.filter( pl.col('Category') != 'TOTAL' )['Null %'].str.strip_suffix('%').cast(pl.Float64).max()) validation_checks.append({ 'Check': 'Category Null % < 80%', 'Expected': '< 80%', 'Actual': f"{max_null_pct:.2f}%", 'Status': '✓ PASS' if max_null_pct < 80 else '✗ FAIL' }) # Check 3: Timeline sorted validation_checks.append({ 'Check': 'Timeline Sorted', 'Expected': 'True', 'Actual': str(features_df['mtu'].is_sorted()), 'Status': '✓ PASS' if features_df['mtu'].is_sorted() else '✗ FAIL' }) # Check 4: No completely empty columns all_null_cols = sum(1 for _c in features_df.columns if features_df[_c].null_count() == len(features_df)) validation_checks.append({ 'Check': 'No Empty Columns', 'Expected': '0', 'Actual': str(all_null_cols), 'Status': '✓ PASS' if all_null_cols == 0 else '✗ FAIL' }) # Check 5: All targets present target_count = len([_c for _c in features_df.columns if _c.startswith('target_')]) validation_checks.append({ 'Check': 'All 38 Targets Present', 'Expected': '38', 'Actual': str(target_count), 'Status': '✓ PASS' if target_count == 38 else '✗ FAIL' }) validation_df = pl.DataFrame(validation_checks) mo.md("### Final Validation Checks") mo.ui.table(validation_df.to_pandas()) return (validation_checks,) @app.cell def _(mo, validation_checks): # Overall status all_pass = all(_c['Status'].startswith('✓') for _c in validation_checks) failed = [_c['Check'] for _c in validation_checks if _c['Status'].startswith('✗')] if all_pass: mo.md(""" ## ✓ All Validation Checks PASSED The engineered feature dataset is ready for Chronos 2 model training! ### Next Steps: 1. Collect weather data (optional enhancement) 2. Collect ENTSO-E data (optional enhancement) 3. Begin zero-shot Chronos 2 inference testing """) else: mo.md(f""" ## ⚠ Validation Issues Detected **Failed Checks**: {', '.join(failed)} Please review and fix issues before proceeding to model training. """) return @app.cell(hide_code=True) def _(mo): mo.md( """ --- ## Feature Engineering Complete **Status**: 1,762 JAO features engineered ✓ **File**: `data/processed/features_jao_24month.parquet` (4.22 MB) **Next**: Decide whether to add weather/ENTSO-E features or proceed with zero-shot inference. """ ) return if __name__ == "__main__": app.run()