Spaces:
Sleeping
Sleeping
Evgueni Poloukarov
feat: complete weather feature engineering with simplified approach (375 features)
7aa0336
| """ | |
| Collect 24-Month Weather Data from OpenMeteo | |
| ============================================= | |
| Collects hourly weather data from OpenMeteo Historical API for the full | |
| 24-month period (Oct 2023 - Sept 2025) across 52 strategic grid points. | |
| 7 Weather Variables: | |
| - temperature_2m: Air temperature at 2m (C) | |
| - windspeed_10m: Wind speed at 10m (m/s) | |
| - windspeed_100m: Wind speed at 100m (m/s) - for wind generation | |
| - winddirection_100m: Wind direction at 100m (degrees) | |
| - shortwave_radiation: Solar radiation (W/m2) - for solar generation | |
| - cloudcover: Cloud cover percentage | |
| - surface_pressure: Surface air pressure (hPa) | |
| Collection Strategy: | |
| - 52 grid points (covering all FBMC zones + neighbors) | |
| - 2-week chunks (1.0 API call each) | |
| - 270 requests/minute (45% of 600 limit) | |
| - Estimated runtime: ~5 minutes | |
| Output: data/raw/weather_24month.parquet | |
| Size: ~50-80 MB (52 points × 7 vars × 17,544 hours; 731 days incl. 29 Feb 2024) | |
| Features: 364 (52 × 7) when engineered | |
| """ | |
| import sys | |
| from pathlib import Path | |
| # Add the directory two levels above this script to sys.path so the `src` package is importable | |
| sys.path.append(str(Path(__file__).parent.parent)) | |
| from src.data_collection.collect_openmeteo import OpenMeteoCollector | |
# Collection window (24 months): October 2023 through September 2025.
START_DATE = '2023-10-01'
END_DATE = '2025-09-30'

# Parquet destination: <two directories above this script>/data/raw/.
OUTPUT_DIR = Path(__file__).parent.parent.joinpath('data', 'raw')
OUTPUT_FILE = OUTPUT_DIR.joinpath('weather_24month.parquet')
# Announce the run parameters before any collection work starts.
for _banner_line in (
    "=" * 80,
    "24-MONTH WEATHER DATA COLLECTION",
    "=" * 80,
    "",
    "Period: October 2023 - September 2025 (24 months)",
    "Grid points: 52 strategic locations across FBMC",
    "Variables: 7 weather parameters",
    "Estimated runtime: ~5 minutes",
    "",
):
    print(_banner_line)

# Build the collector with conservative rate limiting:
#   - 14-day chunks (1.0 API call per request)
#   - 270 requests/minute (45% of the 600 limit)
print("Initializing OpenMeteo collector...")
collector = OpenMeteoCollector(
    chunk_days=14,
    requests_per_minute=270,
)
print("[OK] Collector initialized")
print()
# Run the collection and report the outcome.
#
# Exit status:
#   1   - nothing was collected, or an unexpected exception was raised
#   130 - interrupted by the user (Ctrl+C)
# The sys.exit() calls inside the try body are not intercepted by the handlers
# below: SystemExit does not derive from Exception.
_divider = "=" * 80

try:
    df = collector.collect_all(
        start_date=START_DATE,
        end_date=END_DATE,
        output_path=OUTPUT_FILE,
    )

    # Guard clause: an empty frame means the collection produced nothing.
    if df.is_empty():
        for _line in (
            "",
            "[ERROR] No weather data collected",
            "",
            "Possible causes:",
            " - OpenMeteo API access issues",
            " - Rate limit exceeded",
            " - Network connectivity problems",
            "",
        ):
            print(_line)
        sys.exit(1)

    # --- Success summary -------------------------------------------------
    for _line in ("", _divider, "COLLECTION SUCCESS", _divider, ""):
        print(_line)

    print(f"Output: {OUTPUT_FILE}")

    row_count, column_count = df.shape
    print(f"Shape: {row_count:,} rows x {column_count} columns")

    timestamps = df['timestamp']
    print(f"Date range: {timestamps.min()} to {timestamps.max()}")
    print(f"Grid points: {df['grid_point'].n_unique()}")

    # Every column that is not an identifier/coordinate counts as a weather variable.
    metadata_columns = ('timestamp', 'grid_point', 'location_name', 'latitude', 'longitude')
    weather_columns = [name for name in df.columns if name not in metadata_columns]
    print(f"Weather variables: {len(weather_columns)}")
    print()

    # --- Data quality summary --------------------------------------------
    # Total nulls over all cells, expressed as a percentage of the frame.
    total_nulls = df.null_count().sum_horizontal()[0]
    null_pct = total_nulls / (row_count * column_count) * 100
    print(f"Data completeness: {100 - null_pct:.2f}%")

    if null_pct > 0:
        print()
        print("Missing data by column:")
        for column_name in df.columns:
            missing = df[column_name].null_count()
            if missing <= 0:
                continue
            missing_pct = missing / len(df) * 100
            print(f" - {column_name}: {missing:,} ({missing_pct:.2f}%)")

    # --- Follow-up guidance ----------------------------------------------
    # Plain strings on purpose: the {grid_point}/{zone} braces are printed literally.
    for _line in (
        "",
        _divider,
        "NEXT STEPS",
        _divider,
        "",
        "1. Implement weather feature engineering:",
        " - Create src/feature_engineering/engineer_weather_features.py",
        " - Engineer ~364 features (52 grid points x 7 variables)",
        " - Add spatial aggregation (zone-level averages)",
        "",
        "2. Expected features:",
        " - Grid-level: temp_{grid_point}, wind_{grid_point}, solar_{grid_point}, etc.",
        " - Zone-level: temp_avg_{zone}, wind_avg_{zone}, solar_avg_{zone}, etc.",
        " - Lags: Previous 1h, 6h, 12h, 24h for key variables",
        "",
        "3. Final unified features:",
        " - JAO: 1,698",
        " - ENTSO-E: 296",
        " - Weather: 364",
        " - Total: ~2,358 features",
        "",
        "[OK] Weather data collection COMPLETE!",
    ):
        print(_line)

except KeyboardInterrupt:
    # User pressed Ctrl+C: explain that a restart is required, then exit 130.
    for _line in (
        "",
        "",
        _divider,
        "COLLECTION INTERRUPTED",
        _divider,
        "",
        "Collection was stopped by user.",
        "",
        "NOTE: OpenMeteo collection does NOT have checkpoint/resume capability",
        " (collection completes in ~5 minutes, so not needed)",
        "",
        "To restart: Run this script again",
        "",
    ):
        print(_line)
    sys.exit(130)

except Exception as e:
    # Any other failure: show the message and full traceback, then exit 1.
    for _line in ("", "", _divider, "COLLECTION FAILED", _divider, ""):
        print(_line)
    print(f"Error: {e}")
    print()
    import traceback
    traceback.print_exc()
    print()
    sys.exit(1)