Spaces:
Sleeping
fix: correct Polish border targets - use directional flows (132 targets)
Critical bug fix for the feature engineering pipeline:
PROBLEM:
- Polish border targets were ALL ZEROS (e.g., 0.00000028 MW)
- Expected: thousands of MW for interconnection flows
- Root cause: targets created from MaxBEX capacity columns instead of actual flows
SOLUTION:
- Fixed engineer_jao_features.py to use directional flow columns (CZ>PL, PL>CZ, etc.)
- Each direction is now a separate target (NOT netted):
* target_border_CZ_PL = Flow from CZ to PL (CZ>PL column)
* target_border_PL_CZ = Flow from PL to CZ (PL>CZ column)
RESULTS:
- Before: 38 MaxBEX targets (all Polish borders = 0)
- After: 132 directional targets (all Polish borders valid)
- target_border_CZ_PL: Mean=3,482 MW, Range=[144, 5,699] MW
- target_border_PL_CZ: Mean=2,698 MW, Range=[0, 4,631] MW
VALIDATION:
- Added forecast_validation_oct2024.py Marimo notebook
- Interactive visualization of forecast vs actuals for all borders
- Data leakage detection (fails if MAE=0)
DEPLOYMENT:
- Regenerated unified features: 17,544 rows x 2,647 columns
- Uploaded to HuggingFace: evgueni-p/fbmc-features-24month
- HF Space will automatically use corrected dataset
Co-Authored-By: Claude <[email protected]>
|
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import marimo

# Marimo version this notebook was generated with.
__generated_with = "0.9.30"

# Notebook application container; every @app.cell function below registers with it.
app = marimo.App(width="medium")
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@app.cell
def imports_and_setup():
    """Import libraries and set up paths.

    Returns the library handles (mo, pl, alt), Path, datetime, and the
    resolved project root for use by downstream cells.
    """
    import marimo as mo
    import polars as pl
    import altair as alt
    from pathlib import Path
    from datetime import datetime

    # Set up absolute paths.
    # parent.parent assumes this notebook file sits one directory below the
    # project root — TODO confirm against the repository layout.
    project_root = Path(__file__).parent.parent

    return mo, pl, alt, Path, datetime, project_root
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@app.cell
def load_september_2025_data(pl, project_root, datetime):
    """Load September 2025 forecast results and actuals.

    FIX: `datetime` was referenced in the body but missing from the cell's
    parameter list; marimo injects dependencies via parameters, so the cell
    raised NameError at runtime.

    Returns the full HF actuals table, the actuals filtered to the
    Aug 18 - Sept 15 window, the forecast table, and the window bounds.
    """

    # Load actuals from HuggingFace dataset (ground truth)
    print('[INFO] Loading actuals from HuggingFace dataset...')
    from datasets import load_dataset
    import os

    dataset = load_dataset('evgueni-p/fbmc-features-24month', split='train', token=os.environ.get('HF_TOKEN'))
    df_actuals_full = pl.from_arrow(dataset.data.table)
    print(f'[INFO] HF dataset loaded: {df_actuals_full.shape}')

    # Load forecast results
    forecast_path = project_root / 'results' / 'september_2025_forecast_504h.parquet'

    if not forecast_path.exists():
        raise FileNotFoundError(f'Forecast file not found: {forecast_path}. Run September 2025 forecast first.')

    df_forecast_full = pl.read_parquet(forecast_path)
    print(f'[INFO] Forecast loaded: {df_forecast_full.shape}')
    print(f'[INFO] Forecast dates: {df_forecast_full["timestamp"].min()} to {df_forecast_full["timestamp"].max()}')

    # Filter actuals to September 2025 period (Aug 18 - Sept 15)
    start_date = datetime(2025, 8, 18)  # 2 weeks before forecast
    end_date = datetime(2025, 9, 16)    # exclusive bound: through end of forecast period

    df_actuals_filtered = df_actuals_full.filter(
        (pl.col('timestamp') >= start_date) &
        (pl.col('timestamp') < end_date)
    )

    print(f'[INFO] Actuals filtered: {df_actuals_filtered.shape[0]} hours (Aug 18 - Sept 15, 2025)')

    return df_actuals_full, df_actuals_filtered, df_forecast_full, start_date, end_date
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@app.cell
def prepare_unified_dataframe(pl, datetime, df_actuals_filtered, df_forecast_full):
    """Prepare unified dataframe with forecast and actual pairs for all borders.

    FIXES:
    - `datetime` was used in the leakage check but missing from the cell's
      parameter list (NameError under marimo's dependency injection).
    - `border_names_list[0]` raised IndexError when the forecast file had no
      `*_median` columns; the leakage check is now guarded.
    """

    # Extract border names from forecast columns
    forecast_cols_list = [col for col in df_forecast_full.columns if col.endswith('_median')]
    border_names_list = [col.replace('_median', '') for col in forecast_cols_list]

    print(f'[INFO] Processing {len(border_names_list)} borders...')

    # Start with timestamp from actuals
    df_unified_data = df_actuals_filtered.select('timestamp')

    # Add actual and forecast for each border
    for border in border_names_list:
        actual_col_source = f'target_border_{border}'
        forecast_col_source = f'{border}_median'

        # Add actuals
        if actual_col_source in df_actuals_filtered.columns:
            df_unified_data = df_unified_data.with_columns(
                df_actuals_filtered[actual_col_source].alias(f'actual_{border}')
            )
        else:
            print(f'[WARNING] Actual column missing: {actual_col_source}')
            df_unified_data = df_unified_data.with_columns(pl.lit(None).alias(f'actual_{border}'))

        # Add forecasts (join on timestamp)
        if forecast_col_source in df_forecast_full.columns:
            df_forecast_subset = df_forecast_full.select(['timestamp', forecast_col_source])
            df_unified_data = df_unified_data.join(
                df_forecast_subset,
                on='timestamp',
                how='left'
            ).rename({forecast_col_source: f'forecast_{border}'})
        else:
            print(f'[WARNING] Forecast column missing: {forecast_col_source}')
            df_unified_data = df_unified_data.with_columns(pl.lit(None).alias(f'forecast_{border}'))

    print(f'[INFO] Unified data prepared: {df_unified_data.shape}')

    # Validate no data leakage - check that forecasts don't perfectly match actuals.
    # Guarded: skip the check entirely when no borders were found.
    if border_names_list:
        sample_border = border_names_list[0]
        forecast_col_check = f'forecast_{sample_border}'
        actual_col_check = f'actual_{sample_border}'

        if forecast_col_check in df_unified_data.columns and actual_col_check in df_unified_data.columns:
            _forecast_start_check = datetime(2025, 9, 2)
            _df_forecast_check = df_unified_data.filter(pl.col('timestamp') >= _forecast_start_check)

            if len(_df_forecast_check) > 0:
                mae_check = (_df_forecast_check[forecast_col_check] - _df_forecast_check[actual_col_check]).abs().mean()
                if mae_check == 0:
                    raise ValueError(f'DATA LEAKAGE DETECTED: Forecasts perfectly match actuals (MAE=0) for {sample_border}!')

                print('[INFO] Data leakage check passed - forecasts differ from actuals')

    return df_unified_data, border_names_list
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
@app.cell
def create_border_selector(mo, border_names_list):
    """Create interactive border selection dropdown.

    FIX: the default value was hard-coded to 'AT_CZ'; mo.ui.dropdown raises
    when the default is not among the options, so datasets without that
    border broke the notebook. Fall back to the first available border.
    """

    _sorted_borders = sorted(border_names_list)
    # Prefer AT_CZ when present, otherwise the first border (or None if empty).
    _default_border = 'AT_CZ' if 'AT_CZ' in _sorted_borders else (_sorted_borders[0] if _sorted_borders else None)

    border_selector_widget = mo.ui.dropdown(
        options={border: border for border in _sorted_borders},
        value=_default_border,
        label='Select Border:'
    )

    return border_selector_widget,
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@app.cell
def display_border_selector(mo, border_selector_widget):
    """Display the border selector UI."""
    # The mo.md expression is the cell's last statement, so marimo renders it
    # as the cell output; the embedded widget stays interactive.
    mo.md(f"""
    ## Forecast Validation: September 2025

    **Select a border to view:**
    {border_selector_widget}

    Chart shows:
    - **2 weeks historical** (Aug 18-31, 2025): Actual flows only
    - **2 weeks forecast** (Sept 2-15, 2025): Forecast vs Actual comparison
    - **Context**: 504 hours (21 days)
    """)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
@app.cell
def filter_data_for_selected_border(pl, datetime, df_unified_data, border_selector_widget, start_date):
    """Filter data for the selected border.

    FIXES:
    - `datetime` was used for `forecast_start` but missing from the cell's
      parameter list (NameError under marimo's dependency injection).
    - When the actual column was missing, `df_selected_border` was set to
      None and the unconditional `.with_columns(...)` that followed raised
      AttributeError; the period tagging now runs only when data exists.

    NOTE(review): `start_date` is unused in the body — presumably kept as a
    cell dependency so this cell re-runs with the data window; confirm
    before removing.
    """

    selected_border_name = border_selector_widget.value

    # Extract columns for selected border
    actual_col_name = f'actual_{selected_border_name}'
    forecast_col_name = f'forecast_{selected_border_name}'

    # Boundary between historical context and the forecast horizon.
    forecast_start = datetime(2025, 9, 2)

    # Check if columns exist
    if actual_col_name not in df_unified_data.columns:
        df_selected_border = None
        print(f'[ERROR] Actual column {actual_col_name} not found')
    else:
        df_selected_border = df_unified_data.select([
            'timestamp',
            pl.col(actual_col_name).alias('actual'),
            pl.col(forecast_col_name).alias('forecast') if forecast_col_name in df_unified_data.columns else pl.lit(None).alias('forecast')
        ])

        # Add period marker (historical vs forecast)
        df_selected_border = df_selected_border.with_columns(
            pl.when(pl.col('timestamp') >= forecast_start)
            .then(pl.lit('Forecast Period'))
            .otherwise(pl.lit('Historical'))
            .alias('period')
        )

    return df_selected_border, selected_border_name, forecast_start
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
@app.cell
def create_time_series_chart(alt, df_selected_border, selected_border_name, forecast_start):
    """Create Altair time series visualization.

    Layers: solid blue actuals over the whole window, dashed orange
    forecasts over the forecast period only, and a red rule marking the
    forecast start. Returns the composed chart.
    """

    if df_selected_border is None:
        # Placeholder chart when the selected border had no actuals column.
        chart_time_series = alt.Chart().mark_text(text='No data available', size=20)
    else:
        # Convert to pandas for Altair (CLAUDE.md Rule #37)
        df_plot = df_selected_border.to_pandas()

        # Create base chart with the shared time axis
        base = alt.Chart(df_plot).encode(
            x=alt.X('timestamp:T', title='Date', axis=alt.Axis(format='%b %d'))
        )

        # Actual line (blue, solid)
        line_actual = base.mark_line(color='blue', strokeWidth=2).encode(
            y=alt.Y('actual:Q', title='Flow (MW)', scale=alt.Scale(zero=False)),
            tooltip=[
                alt.Tooltip('timestamp:T', title='Time', format='%Y-%m-%d %H:%M'),
                alt.Tooltip('actual:Q', title='Actual (MW)', format='.1f')
            ]
        )

        # Forecast line (orange, dashed) - only for forecast period
        df_plot_forecast = df_plot[df_plot['period'] == 'Forecast Period']

        # Only draw the forecast layer when at least one non-null value exists.
        if len(df_plot_forecast) > 0 and df_plot_forecast['forecast'].notna().any():
            line_forecast = alt.Chart(df_plot_forecast).mark_line(
                color='orange',
                strokeWidth=2,
                strokeDash=[5, 5]
            ).encode(
                x=alt.X('timestamp:T'),
                y=alt.Y('forecast:Q'),
                tooltip=[
                    alt.Tooltip('timestamp:T', title='Time', format='%Y-%m-%d %H:%M'),
                    alt.Tooltip('forecast:Q', title='Forecast (MW)', format='.1f'),
                    alt.Tooltip('actual:Q', title='Actual (MW)', format='.1f')
                ]
            )
        else:
            line_forecast = alt.Chart().mark_point()  # Empty chart

        # Vertical rule at the forecast start boundary
        rule_forecast_start = alt.Chart(
            alt.Data(values=[{'x': forecast_start}])
        ).mark_rule(color='red', strokeDash=[3, 3], strokeWidth=1).encode(
            x='x:T'
        )

        # Combine layers; configure_* must come after layering in Altair
        chart_time_series = (line_actual + line_forecast + rule_forecast_start).properties(
            width=800,
            height=400,
            title=f'Border: {selected_border_name} | Hourly Flows (Aug 18 - Sept 15, 2025)'
        ).configure_axis(
            labelFontSize=12,
            titleFontSize=14
        ).configure_title(
            fontSize=16
        )

    return chart_time_series,
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
@app.cell
def calculate_summary_statistics(pl, df_selected_border, selected_border_name, forecast_start):
    """Calculate summary statistics for the selected border.

    FIX: polars `Series.std()` returns None for a single non-null sample
    (ddof=1) and `Series.mean()` returns None for an all-null series; the
    `:.2f` f-string formats below then raised TypeError. None results are
    replaced with NaN so the summary always renders.
    """

    if df_selected_border is None:
        stats_summary_text = 'No data available'
    else:
        # Filter to forecast period only
        df_forecast_period = df_selected_border.filter(
            pl.col('timestamp') >= forecast_start
        )

        if len(df_forecast_period) == 0 or df_forecast_period['forecast'].is_null().all():
            stats_summary_text = 'No forecast data available for this period'
        else:
            # Calculate MAE (polars mean skips nulls)
            mae_value = (
                (df_forecast_period['forecast'] - df_forecast_period['actual']).abs().mean()
            )

            # Forecast variation
            forecast_values = df_forecast_period['forecast'].drop_nulls()
            unique_count = forecast_values.n_unique()
            std_value = forecast_values.std()

            # Actual variation (for reference)
            actual_values = df_forecast_period['actual'].drop_nulls()
            actual_std = actual_values.std()

            # Guard against None from mean()/std() so float formatting cannot raise.
            mae_value = float('nan') if mae_value is None else mae_value
            std_value = float('nan') if std_value is None else std_value
            actual_std = float('nan') if actual_std is None else actual_std

            stats_summary_text = f"""
### Forecast Quality Statistics

**Border**: {selected_border_name}
**Period**: September 2-15, 2025 (336 hours)
**Context**: 504 hours (21 days)

**Accuracy Metrics:**
- **MAE**: {mae_value:.2f} MW
- Forecast variation: {unique_count} unique values, StdDev = {std_value:.2f} MW
- Actual variation: StdDev = {actual_std:.2f} MW

**Interpretation:**
- MAE < 50 MW: Excellent
- MAE 50-100 MW: Good
- MAE > 100 MW: Needs improvement
"""

    return stats_summary_text,
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
@app.cell
def display_chart_and_stats(mo, chart_time_series, stats_summary_text):
    """Render the time-series chart stacked above its statistics summary."""
    # Last expression of the cell -> marimo displays the stacked layout.
    mo.vstack([chart_time_series, mo.md(stats_summary_text)])
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
if __name__ == "__main__":
    # Allow running the notebook as a plain script: marimo executes the cell DAG.
    app.run()
|
|
@@ -598,11 +598,24 @@ def engineer_jao_features(
|
|
| 598 |
feat_temporal, feat_system, feat_regional, feat_pca, feat_lags]:
|
| 599 |
all_features = all_features.join(feat_df, on='mtu', how='left')
|
| 600 |
|
| 601 |
-
# Add target
|
| 602 |
-
|
| 603 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
all_features = all_features.with_columns([
|
| 605 |
-
unified[col].alias(
|
| 606 |
])
|
| 607 |
|
| 608 |
# Remove duplicates if any
|
|
|
|
| 598 |
feat_temporal, feat_system, feat_regional, feat_pca, feat_lags]:
|
| 599 |
all_features = all_features.join(feat_df, on='mtu', how='left')
|
| 600 |
|
| 601 |
+
# Add target variables from DIRECTIONAL FLOWS (not MaxBEX capacity)
|
| 602 |
+
# JAO provides directional flows (CZ>PL, PL>CZ, etc.) as separate columns
|
| 603 |
+
# Create targets for EACH DIRECTION as separate forecast targets
|
| 604 |
+
# Example: target_border_CZ_PL = flow from CZ to PL (CZ>PL column)
|
| 605 |
+
# target_border_PL_CZ = flow from PL to CZ (PL>CZ column)
|
| 606 |
+
|
| 607 |
+
# Find all directional flow columns
|
| 608 |
+
directional_cols = [c for c in unified.columns if '>' in c]
|
| 609 |
+
|
| 610 |
+
print(f"\n[INFO] Creating targets for {len(directional_cols)} directional flows...")
|
| 611 |
+
|
| 612 |
+
# Create one target per directional flow (e.g., CZ>PL becomes target_border_CZ_PL)
|
| 613 |
+
for col in sorted(directional_cols):
|
| 614 |
+
from_country, to_country = col.split('>')
|
| 615 |
+
# Target name: target_border_{FROM}_{TO} (preserves direction)
|
| 616 |
+
target_name = f'target_border_{from_country}_{to_country}'
|
| 617 |
all_features = all_features.with_columns([
|
| 618 |
+
unified[col].alias(target_name)
|
| 619 |
])
|
| 620 |
|
| 621 |
# Remove duplicates if any
|