# Repository path: fbmc-chronos2 / scripts / collect_openmeteo_24month.py
# Page metadata captured along with the file (not part of the script):
#   Evgueni Poloukarov
#   feat: complete weather feature engineering with simplified approach (375 features)
#   7aa0336 - 5.13 kB
"""
Collect 24-Month Weather Data from OpenMeteo
=============================================
Collects hourly weather data from OpenMeteo Historical API for the full
24-month period (Oct 2023 - Sept 2025) across 52 strategic grid points.
7 Weather Variables:
- temperature_2m: Air temperature at 2m (C)
- windspeed_10m: Wind speed at 10m (m/s)
- windspeed_100m: Wind speed at 100m (m/s) - for wind generation
- winddirection_100m: Wind direction at 100m (degrees)
- shortwave_radiation: Solar radiation (W/m2) - for solar generation
- cloudcover: Cloud cover percentage
- surface_pressure: Surface air pressure (hPa)
Collection Strategy:
- 52 grid points (covering all FBMC zones + neighbors)
- 2-week chunks (1.0 API call each)
- 270 requests/minute (45% of 600 limit)
- Estimated runtime: ~5 minutes
Output: data/raw/weather_24month.parquet
Size: ~50-80 MB (52 points × 7 vars × 17,544 hours, i.e. 731 days)
Features: 364 (52 × 7) when engineered
"""
import sys
from pathlib import Path
# Add src to path
sys.path.append(str(Path(__file__).parent.parent))
from src.data_collection.collect_openmeteo import OpenMeteoCollector
# Collection window: 24 months, October 2023 through September 2025.
START_DATE, END_DATE = '2023-10-01', '2025-09-30'

# Parquet destination, resolved relative to the directory two levels above
# this file: <root>/data/raw/weather_24month.parquet
OUTPUT_DIR = Path(__file__).parent.parent.joinpath('data', 'raw')
OUTPUT_FILE = OUTPUT_DIR.joinpath('weather_24month.parquet')
# Startup banner summarising the run. Empty entries produce blank lines,
# exactly like a bare print().
banner_lines = (
    "="*80,
    "24-MONTH WEATHER DATA COLLECTION",
    "="*80,
    "",
    "Period: October 2023 - September 2025 (24 months)",
    "Grid points: 52 strategic locations across FBMC",
    "Variables: 7 weather parameters",
    "Estimated runtime: ~5 minutes",
    "",
)
for banner_line in banner_lines:
    print(banner_line)
# Build the collector with conservative throttling settings.
print("Initializing OpenMeteo collector...")
collector_options = {
    "requests_per_minute": 270,  # 45% of 600 limit
    "chunk_days": 14,  # 1.0 API call per request
}
collector = OpenMeteoCollector(**collector_options)
print("[OK] Collector initialized")
print()
# Columns that identify a reading (when/where) rather than carry a weather
# measurement; every other column is counted as a weather variable below.
_METADATA_COLUMNS = ('timestamp', 'grid_point', 'location_name', 'latitude', 'longitude')


def _eprint(*args):
    """Print to stderr so failure text shares a stream with the traceback."""
    print(*args, file=sys.stderr)


def _report_success(df):
    """Print the summary for a non-empty collection result.

    Reports the output path, frame shape, timestamp span, number of distinct
    grid points, number of non-metadata columns, overall completeness, and
    the null count of every column that contains at least one null.

    NOTE(review): the methods used here (null_count, sum_horizontal,
    n_unique) match the polars DataFrame API -- presumably `df` is a polars
    frame; confirm against OpenMeteoCollector.collect_all.
    """
    print()
    print("="*80)
    print("COLLECTION SUCCESS")
    print("="*80)
    print()
    print(f"Output: {OUTPUT_FILE}")
    print(f"Shape: {df.shape[0]:,} rows x {df.shape[1]} columns")
    print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    print(f"Grid points: {df['grid_point'].n_unique()}")
    weather_columns = [c for c in df.columns if c not in _METADATA_COLUMNS]
    print(f"Weather variables: {len(weather_columns)}")
    print()

    # Data quality summary: nulls as a share of all cells in the frame.
    null_count_total = df.null_count().sum_horizontal()[0]
    null_pct = (null_count_total / (df.shape[0] * df.shape[1])) * 100
    print(f"Data completeness: {100 - null_pct:.2f}%")
    if null_pct > 0:
        print()
        print("Missing data by column:")
        for col in df.columns:
            null_count = df[col].null_count()
            if null_count > 0:
                pct = (null_count / len(df)) * 100
                print(f" - {col}: {null_count:,} ({pct:.2f}%)")


def _report_next_steps():
    """Print the static follow-up checklist shown after a successful run."""
    print()
    print("="*80)
    print("NEXT STEPS")
    print("="*80)
    print()
    print("1. Implement weather feature engineering:")
    print(" - Create src/feature_engineering/engineer_weather_features.py")
    print(" - Engineer ~364 features (52 grid points x 7 variables)")
    print(" - Add spatial aggregation (zone-level averages)")
    print()
    print("2. Expected features:")
    print(" - Grid-level: temp_{grid_point}, wind_{grid_point}, solar_{grid_point}, etc.")
    print(" - Zone-level: temp_avg_{zone}, wind_avg_{zone}, solar_avg_{zone}, etc.")
    print(" - Lags: Previous 1h, 6h, 12h, 24h for key variables")
    print()
    print("3. Final unified features:")
    print(" - JAO: 1,698")
    print(" - ENTSO-E: 296")
    print(" - Weather: 364")
    print(" - Total: ~2,358 features")
    print()


# Run collection.
# Exit codes: 0 on success, 1 when nothing was collected or an error
# occurred, 130 when interrupted by the user.
try:
    # Whether collect_all creates missing parent directories is not visible
    # from this script, so create them up front: otherwise a missing
    # data/raw could only surface after the whole collection has run.
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    df = collector.collect_all(
        start_date=START_DATE,
        end_date=END_DATE,
        output_path=OUTPUT_FILE,
    )

    if df.is_empty():
        # Flush progress output first so it stays ahead of the error text
        # when both streams are redirected to the same destination.
        sys.stdout.flush()
        _eprint()
        _eprint("[ERROR] No weather data collected")
        _eprint()
        _eprint("Possible causes:")
        _eprint(" - OpenMeteo API access issues")
        _eprint(" - Rate limit exceeded")
        _eprint(" - Network connectivity problems")
        _eprint()
        # SystemExit is not an Exception subclass, so the handlers below
        # do not intercept it.
        sys.exit(1)

    _report_success(df)
    _report_next_steps()
    print("[OK] Weather data collection COMPLETE!")
except KeyboardInterrupt:
    print()
    print()
    print("="*80)
    print("COLLECTION INTERRUPTED")
    print("="*80)
    print()
    print("Collection was stopped by user.")
    print()
    print("NOTE: OpenMeteo collection does NOT have checkpoint/resume capability")
    print(" (collection completes in ~5 minutes, so not needed)")
    print()
    print("To restart: Run this script again")
    print()
    sys.exit(130)
except Exception as e:
    # Top-level boundary of the script: report the failure and exit non-zero.
    # The banner goes to stderr, the same stream traceback.print_exc() uses,
    # so the message and the traceback appear together and in order.
    sys.stdout.flush()
    _eprint()
    _eprint()
    _eprint("="*80)
    _eprint("COLLECTION FAILED")
    _eprint("="*80)
    _eprint()
    _eprint(f"Error: {e}")
    _eprint()
    import traceback
    traceback.print_exc()
    _eprint()
    sys.exit(1)