""" Collect ENTSOE 1-week sample data for Sept 23-30, 2025 Collects generation by type for all 12 Core FBMC zones: - Wind, Solar, Thermal, Hydro, Nuclear generation Matches the JAO sample period for integrated analysis. """ import os import sys from pathlib import Path from datetime import datetime, timedelta import pandas as pd from entsoe import EntsoePandasClient from dotenv import load_dotenv # Add src to path sys.path.insert(0, str(Path(__file__).parent.parent / "src")) # Load API key load_dotenv() API_KEY = os.getenv('ENTSOE_API_KEY') if not API_KEY: print("[ERROR] ENTSOE_API_KEY not found in .env file") print("Please add: ENTSOE_API_KEY=your_key_here") sys.exit(1) # Initialize client client = EntsoePandasClient(api_key=API_KEY) # Core FBMC zones (12 total) FBMC_ZONES = { 'AT': '10YAT-APG------L', # Austria 'BE': '10YBE----------2', # Belgium 'CZ': '10YCZ-CEPS-----N', # Czech Republic 'DE_LU': '10Y1001A1001A83F', # Germany-Luxembourg 'FR': '10YFR-RTE------C', # France 'HR': '10YHR-HEP------M', # Croatia 'HU': '10YHU-MAVIR----U', # Hungary 'NL': '10YNL----------L', # Netherlands 'PL': '10YPL-AREA-----S', # Poland 'RO': '10YRO-TEL------P', # Romania 'SI': '10YSI-ELES-----O', # Slovenia 'SK': '10YSK-SEPS-----K', # Slovakia } # Generation types mapping (ENTSOE API codes) GENERATION_TYPES = { 'B16': 'solar', # Solar 'B19': 'wind_offshore', # Wind offshore 'B18': 'wind_onshore', # Wind onshore 'B01': 'biomass', # Biomass 'B10': 'hydro_pumped', # Hydro pumped storage 'B11': 'hydro_run', # Hydro run-of-river 'B12': 'hydro_reservoir', # Hydro reservoir 'B14': 'nuclear', # Nuclear 'B02': 'fossil_brown_coal', # Fossil brown coal/lignite 'B05': 'fossil_coal', # Fossil hard coal 'B04': 'fossil_gas', # Fossil gas 'B03': 'fossil_oil', # Fossil oil } # Sample period: Sept 23-30, 2025 (matches JAO sample) START_DATE = pd.Timestamp('2025-09-23', tz='UTC') END_DATE = pd.Timestamp('2025-09-30', tz='UTC') print("=" * 70) print("ENTSOE 1-Week Sample Data Collection") print("=" * 70) print(f"Period: {START_DATE.date()} to {END_DATE.date()}") print(f"Zones: {len(FBMC_ZONES)} Core FBMC zones") print(f"Duration: 7 days = 168 hours") print() # Collect data all_generation = [] for zone_code, zone_eic in FBMC_ZONES.items(): print(f"\n[{zone_code}] Collecting generation data...") try: # Query generation by type gen_df = client.query_generation( zone_eic, start=START_DATE, end=END_DATE, psr_type=None # Get all generation types ) # Add zone identifier gen_df['zone'] = zone_code # Reshape: generation types as columns if isinstance(gen_df, pd.DataFrame): # Already in correct format all_generation.append(gen_df) print(f" [OK] Collected {len(gen_df)} rows") else: print(f" [WARNING] Unexpected format: {type(gen_df)}") except Exception as e: print(f" [ERROR] {e}") continue if not all_generation: print("\n[ERROR] No data collected - check API key and zone codes") sys.exit(1) # Combine all zones print("\n" + "=" * 70) print("Processing collected data...") combined_df = pd.concat(all_generation, axis=0) # Reset index to make timestamp a column combined_df = combined_df.reset_index() if 'index' in combined_df.columns: combined_df = combined_df.rename(columns={'index': 'timestamp'}) print(f" Combined shape: {combined_df.shape}") print(f" Columns: {list(combined_df.columns)}") # Save to parquet output_dir = Path("data/raw/sample") output_dir.mkdir(parents=True, exist_ok=True) output_file = output_dir / "entsoe_sample_sept2025.parquet" combined_df.to_parquet(output_file, index=False) print(f"\n[SUCCESS] Saved to: {output_file}") print(f" File size: {output_file.stat().st_size / 1024:.1f} KB") print() print("=" * 70) print("ENTSOE Sample Collection Complete") print("=" * 70) print("\nNext: Add ENTSOE exploration to Marimo notebook")