""" Collect 24-Month Weather Data from OpenMeteo ============================================= Collects hourly weather data from OpenMeteo Historical API for the full 24-month period (Oct 2023 - Sept 2025) across 52 strategic grid points. 7 Weather Variables: - temperature_2m: Air temperature at 2m (C) - windspeed_10m: Wind speed at 10m (m/s) - windspeed_100m: Wind speed at 100m (m/s) - for wind generation - winddirection_100m: Wind direction at 100m (degrees) - shortwave_radiation: Solar radiation (W/m2) - for solar generation - cloudcover: Cloud cover percentage - surface_pressure: Surface air pressure (hPa) Collection Strategy: - 52 grid points (covering all FBMC zones + neighbors) - 2-week chunks (1.0 API call each) - 270 requests/minute (45% of 600 limit) - Estimated runtime: ~5 minutes Output: data/raw/weather_24month.parquet Size: ~50-80 MB (52 points × 7 vars × 17,520 hours) Features: 364 (52 × 7) when engineered """ import sys from pathlib import Path # Add src to path sys.path.append(str(Path(__file__).parent.parent)) from src.data_collection.collect_openmeteo import OpenMeteoCollector # Date range: Oct 2023 - Sept 2025 (24 months) START_DATE = '2023-10-01' END_DATE = '2025-09-30' # Output file OUTPUT_DIR = Path(__file__).parent.parent / 'data' / 'raw' OUTPUT_FILE = OUTPUT_DIR / 'weather_24month.parquet' print("="*80) print("24-MONTH WEATHER DATA COLLECTION") print("="*80) print() print("Period: October 2023 - September 2025 (24 months)") print("Grid points: 52 strategic locations across FBMC") print("Variables: 7 weather parameters") print("Estimated runtime: ~5 minutes") print() # Initialize collector with safe rate limiting print("Initializing OpenMeteo collector...") collector = OpenMeteoCollector( requests_per_minute=270, # 45% of 600 limit chunk_days=14 # 1.0 API call per request ) print("[OK] Collector initialized") print() # Run collection try: df = collector.collect_all( start_date=START_DATE, end_date=END_DATE, output_path=OUTPUT_FILE ) if not df.is_empty(): print() print("="*80) print("COLLECTION SUCCESS") print("="*80) print() print(f"Output: {OUTPUT_FILE}") print(f"Shape: {df.shape[0]:,} rows x {df.shape[1]} columns") print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}") print(f"Grid points: {df['grid_point'].n_unique()}") print(f"Weather variables: {len([c for c in df.columns if c not in ['timestamp', 'grid_point', 'location_name', 'latitude', 'longitude']])}") print() # Data quality summary null_count_total = df.null_count().sum_horizontal()[0] null_pct = (null_count_total / (df.shape[0] * df.shape[1])) * 100 print(f"Data completeness: {100 - null_pct:.2f}%") if null_pct > 0: print() print("Missing data by column:") for col in df.columns: null_count = df[col].null_count() if null_count > 0: pct = (null_count / len(df)) * 100 print(f" - {col}: {null_count:,} ({pct:.2f}%)") print() print("="*80) print("NEXT STEPS") print("="*80) print() print("1. Implement weather feature engineering:") print(" - Create src/feature_engineering/engineer_weather_features.py") print(" - Engineer ~364 features (52 grid points x 7 variables)") print(" - Add spatial aggregation (zone-level averages)") print() print("2. Expected features:") print(" - Grid-level: temp_{grid_point}, wind_{grid_point}, solar_{grid_point}, etc.") print(" - Zone-level: temp_avg_{zone}, wind_avg_{zone}, solar_avg_{zone}, etc.") print(" - Lags: Previous 1h, 6h, 12h, 24h for key variables") print() print("3. Final unified features:") print(" - JAO: 1,698") print(" - ENTSO-E: 296") print(" - Weather: 364") print(" - Total: ~2,358 features") print() print("[OK] Weather data collection COMPLETE!") else: print() print("[ERROR] No weather data collected") print() print("Possible causes:") print(" - OpenMeteo API access issues") print(" - Rate limit exceeded") print(" - Network connectivity problems") print() sys.exit(1) except KeyboardInterrupt: print() print() print("="*80) print("COLLECTION INTERRUPTED") print("="*80) print() print("Collection was stopped by user.") print() print("NOTE: OpenMeteo collection does NOT have checkpoint/resume capability") print(" (collection completes in ~5 minutes, so not needed)") print() print("To restart: Run this script again") print() sys.exit(130) except Exception as e: print() print() print("="*80) print("COLLECTION FAILED") print("="*80) print() print(f"Error: {e}") print() import traceback traceback.print_exc() print() sys.exit(1)