fbmc-chronos2 / scripts /collect_lta_netpos_24month.py
Evgueni Poloukarov
feat: complete Phase 1 ENTSO-E asset-specific outage validation
27cb60a
raw
history blame
7.11 kB
"""Collect LTA and Net Positions data for 24 months (Oct 2023 - Sept 2025)."""
import sys
from pathlib import Path
from datetime import datetime, timedelta
import polars as pl
import time
from requests.exceptions import HTTPError
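# Make the project's src/ directory importable. The path is resolved against the
# current working directory, so run this script from the repository root.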
sys.path.insert(0, str(Path.cwd() / 'src'))
from data_collection.collect_jao import JAOCollector
def _query_lta_with_backoff(collector, window_start, window_end, *,
                            request_delay, max_retries, base_delay):
    """Fetch LTA data for one date window, retrying on HTTP 429 responses.

    Progress is reported on stdout, continuing the line the caller started.

    Args:
        collector: Object exposing ``client.query_lta(start, end)``.
        window_start: First day of the window (naive timestamp).
        window_end: Last day of the window (naive timestamp).
        request_delay: Seconds to sleep before every request.
        max_retries: Maximum number of attempts for this window.
        base_delay: Backoff in seconds after the first 429; doubled after
            each further 429.

    Returns:
        A ``(succeeded, frame)`` tuple. ``succeeded`` is False when the
        request failed. ``frame`` is a polars DataFrame, or None when the
        request failed or returned no rows.
    """
    import pandas as pd

    for attempt in range(max_retries):
        try:
            # Pace every request (retries included) to stay under the API limit.
            time.sleep(request_delay)

            pd_start = pd.Timestamp(window_start, tz='UTC')
            pd_end = pd.Timestamp(window_end, tz='UTC')
            df = collector.client.query_lta(pd_start, pd_end)

            if df is None or df.empty:
                print("No data")
                return True, None

            # CRITICAL: reset_index() turns the pandas index (the datetime /
            # mtu, per the original note) into a regular column so it
            # survives the conversion to polars.
            frame = pl.from_pandas(df.reset_index())
            print(f"{len(df):,} records")
            return True, frame

        except HTTPError as e:
            # HTTPError.response may be None, so read the status defensively.
            status = getattr(e.response, 'status_code', None)
            if status != 429:
                # Other HTTP error - don't retry.
                print(f"Failed: {e}")
                return False, None
            if attempt == max_retries - 1:
                # Out of attempts: give up now instead of sleeping through a
                # backoff that no retry would follow.
                print(f"Rate limited (429). Failed after {max_retries} attempts")
                return False, None
            wait_time = base_delay * (2 ** attempt)
            print(f"Rate limited (429), waiting {wait_time}s... ", end="", flush=True)
            time.sleep(wait_time)
            print(f"Retrying ({attempt + 2}/{max_retries})...", end=" ", flush=True)

        except Exception as e:
            # Non-HTTP error - don't retry.
            print(f"Failed: {e}")
            return False, None

    # Only reached when max_retries <= 0 (no attempt was made).
    return False, None


def collect_lta_monthly(collector, start_date, end_date, *,
                        request_delay=1.0, max_retries=5, base_delay=60):
    """Collect LTA data month by month (API doesn't support long ranges).

    The range is split into calendar-month windows. The first window starts
    at ``start_date`` and the last one is clipped to ``end_date``. Each
    window is queried separately and the non-empty results are concatenated.

    Rate limiting (JAO API limit noted by the original author: 100
    requests/minute):
    - ``request_delay`` seconds between requests (1 s is about 60 req/min).
    - Exponential backoff on HTTP 429, starting at ``base_delay`` seconds.

    A window that fails is reported and skipped; the remaining windows are
    still collected, and a warning listing the skipped windows is printed.

    Args:
        collector: Object exposing ``client.query_lta(start, end)``.
        start_date: First day to collect (anything ``pd.Timestamp`` accepts).
        end_date: Last day to collect (anything ``pd.Timestamp`` accepts).
        request_delay: Seconds to sleep before every request.
        max_retries: Maximum attempts per window when rate limited.
        base_delay: Initial backoff in seconds after a 429 response.

    Returns:
        A polars DataFrame with all collected rows, or None when no window
        returned data.
    """
    import pandas as pd

    end_ts = pd.Timestamp(end_date)

    # Build the list of (window_start, window_end) pairs up front so the
    # progress counter shows the real total rather than a hard-coded 24.
    windows = []
    cursor = pd.Timestamp(start_date)
    while cursor <= end_ts:
        next_month_start = cursor.replace(day=1) + pd.DateOffset(months=1)
        # Last day of the cursor's month, but never past the final end date.
        window_end = min(next_month_start - timedelta(days=1), end_ts)
        windows.append((cursor, window_end))
        cursor = next_month_start

    total = len(windows)
    all_lta_data = []
    failed_windows = []

    for month_count, (window_start, window_end) in enumerate(windows, start=1):
        print(
            f" Month {month_count}/{total}: {window_start.date()} to {window_end.date()}...",
            end=" ",
            flush=True,
        )
        succeeded, frame = _query_lta_with_backoff(
            collector,
            window_start,
            window_end,
            request_delay=request_delay,
            max_retries=max_retries,
            base_delay=base_delay,
        )
        if not succeeded:
            failed_windows.append(f"{window_start.date()} to {window_end.date()}")
        elif frame is not None:
            all_lta_data.append(frame)

    if failed_windows:
        # Make gaps obvious: the per-month lines scroll away quickly.
        print(
            f"\n [WARNING] {len(failed_windows)} month(s) failed and are missing "
            f"from the result: {'; '.join(failed_windows)}"
        )

    # Combine all monthly data
    if not all_lta_data:
        return None

    combined = pl.concat(all_lta_data, how='vertical')
    print(f"\n Combined: {len(combined):,} total records")
    return combined
def _print_banner(title):
    """Print *title* between two 80-character rules, preceded by a blank line."""
    rule = "=" * 80
    print("\n" + rule)
    print(title)
    print(rule)


def _file_size_mb(path):
    """Return the on-disk size of *path* in mebibytes."""
    return path.stat().st_size / (1024**2)


def _report_output(path, produced):
    """Print the summary entry for one output file.

    Args:
        path: Expected output file.
        produced: True when this run collected data for the file.
    """
    if not path.exists():
        print(f" [MISSING] {path}")
    elif produced:
        print(f" [OK] {path}")
        print(f" Size: {_file_size_mb(path):.2f} MB")
    else:
        # The file exists but this run did not produce data for it, so it
        # must be left over from an earlier run. Don't report it as created.
        print(f" [STALE] {path} (left over from an earlier run; not updated by this run)")


def main():
    """Collect LTA and Net Positions for complete 24-month period.

    Runs the two collections independently: a failure in one is printed,
    with its traceback, and does not prevent the other from running. A
    summary at the end lists each expected output file as OK, STALE (present
    on disk but not produced by this run) or MISSING.
    """
    import traceback

    rule = "=" * 80

    _print_banner("JAO LTA + NET POSITIONS COLLECTION - 24 MONTHS")
    print("Period: October 2023 - September 2025")
    print(rule)
    print()

    # Initialize collector
    collector = JAOCollector()

    # Date range (matches Phase 1 SPARSE collection)
    start_date = '2023-10-01'
    end_date = '2025-09-30'

    # Output directory
    output_dir = Path('data/raw/phase1_24month')
    output_dir.mkdir(parents=True, exist_ok=True)

    start_time = datetime.now()

    # =========================================================================
    # DATASET 1: LTA (Long Term Allocations)
    # =========================================================================
    _print_banner("DATASET 1/2: LTA (Long Term Allocations)")
    print("Collecting monthly (API limitation)...")
    print()

    lta_output = output_dir / 'jao_lta.parquet'
    lta_produced = False
    try:
        lta_df = collect_lta_monthly(collector, start_date, end_date)
        if lta_df is not None:
            # Save to parquet
            lta_df.write_parquet(lta_output)
            lta_produced = True
            print(f"\n[OK] LTA collection successful: {len(lta_df):,} records")
            print(f"[OK] Saved to: {lta_output}")
            print(f"[OK] File size: {_file_size_mb(lta_output):.2f} MB")
        else:
            print("\n[WARNING] LTA collection returned no data")
    except Exception as e:
        # Top-level boundary: report and carry on with the next dataset.
        print(f"\n[ERROR] LTA collection failed: {e}")
        traceback.print_exc()

    # =========================================================================
    # DATASET 2: NET POSITIONS (Domain Boundaries)
    # =========================================================================
    _print_banner("DATASET 2/2: NET POSITIONS (Domain Boundaries)")
    print()

    netpos_output = output_dir / 'jao_net_positions.parquet'
    netpos_produced = False
    try:
        # NOTE(review): the collector is handed output_path and is presumably
        # responsible for writing the file itself - confirm in JAOCollector.
        netpos_df = collector.collect_net_positions_sample(
            start_date=start_date,
            end_date=end_date,
            output_path=netpos_output,
        )
        if netpos_df is not None:
            netpos_produced = True
            print(f"\n[OK] Net Positions collection successful: {len(netpos_df):,} records")
        else:
            print("\n[WARNING] Net Positions collection returned no data")
    except Exception as e:
        # Top-level boundary: report and continue to the summary.
        print(f"\n[ERROR] Net Positions collection failed: {e}")
        traceback.print_exc()

    # =========================================================================
    # SUMMARY
    # =========================================================================
    elapsed = datetime.now() - start_time

    _print_banner("COLLECTION COMPLETE")
    print(f"Total time: {elapsed}")
    print()
    print("Files created:")
    _report_output(lta_output, lta_produced)
    _report_output(netpos_output, netpos_produced)
    print(rule)
# Script entry point: run the collection only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()