Spaces:
Sleeping
Sleeping
| """Collect LTA and Net Positions data for 24 months (Oct 2023 - Sept 2025).""" | |
| import sys | |
| from pathlib import Path | |
| from datetime import datetime, timedelta | |
| import polars as pl | |
| import time | |
| from requests.exceptions import HTTPError | |
| # Add src to path | |
| sys.path.insert(0, str(Path.cwd() / 'src')) | |
| from data_collection.collect_jao import JAOCollector | |
| def collect_lta_monthly(collector, start_date, end_date): | |
| """Collect LTA data month by month (API doesn't support long ranges). | |
| Implements JAO API rate limiting: | |
| - 100 requests/minute limit | |
| - 1 second between requests (60 req/min with safety margin) | |
| - Exponential backoff on 429 errors | |
| """ | |
| import pandas as pd | |
| all_lta_data = [] | |
| # Generate monthly date ranges | |
| current_start = pd.Timestamp(start_date) | |
| end_ts = pd.Timestamp(end_date) | |
| month_count = 0 | |
| while current_start <= end_ts: | |
| # Calculate month end | |
| if current_start.month == 12: | |
| current_end = current_start.replace(year=current_start.year + 1, month=1, day=1) - timedelta(days=1) | |
| else: | |
| current_end = current_start.replace(month=current_start.month + 1, day=1) - timedelta(days=1) | |
| # Don't go past final end date | |
| if current_end > end_ts: | |
| current_end = end_ts | |
| month_count += 1 | |
| print(f" Month {month_count}/24: {current_start.date()} to {current_end.date()}...", end=" ", flush=True) | |
| # Retry logic with exponential backoff | |
| max_retries = 5 | |
| base_delay = 60 # Start with 60s on 429 error | |
| for attempt in range(max_retries): | |
| try: | |
| # Rate limiting: 1 second between all requests | |
| time.sleep(1) | |
| # Query LTA for this month | |
| pd_start = pd.Timestamp(current_start, tz='UTC') | |
| pd_end = pd.Timestamp(current_end, tz='UTC') | |
| df = collector.client.query_lta(pd_start, pd_end) | |
| if df is not None and not df.empty: | |
| # CRITICAL: Reset index to preserve datetime (mtu) as column | |
| all_lta_data.append(pl.from_pandas(df.reset_index())) | |
| print(f"{len(df):,} records") | |
| else: | |
| print("No data") | |
| # Success - break retry loop | |
| break | |
| except HTTPError as e: | |
| if e.response.status_code == 429: | |
| # Rate limited - exponential backoff | |
| wait_time = base_delay * (2 ** attempt) | |
| print(f"Rate limited (429), waiting {wait_time}s... ", end="", flush=True) | |
| time.sleep(wait_time) | |
| if attempt < max_retries - 1: | |
| print(f"Retrying ({attempt + 2}/{max_retries})...", end=" ", flush=True) | |
| else: | |
| print(f"Failed after {max_retries} attempts") | |
| else: | |
| # Other HTTP error - don't retry | |
| print(f"Failed: {e}") | |
| break | |
| except Exception as e: | |
| # Non-HTTP error | |
| print(f"Failed: {e}") | |
| break | |
| # Move to next month | |
| if current_start.month == 12: | |
| current_start = current_start.replace(year=current_start.year + 1, month=1, day=1) | |
| else: | |
| current_start = current_start.replace(month=current_start.month + 1, day=1) | |
| # Combine all monthly data | |
| if all_lta_data: | |
| combined = pl.concat(all_lta_data, how='vertical') | |
| print(f"\n Combined: {len(combined):,} total records") | |
| return combined | |
| else: | |
| return None | |
| def main(): | |
| """Collect LTA and Net Positions for complete 24-month period.""" | |
| print("\n" + "=" * 80) | |
| print("JAO LTA + NET POSITIONS COLLECTION - 24 MONTHS") | |
| print("=" * 80) | |
| print("Period: October 2023 - September 2025") | |
| print("=" * 80) | |
| print() | |
| # Initialize collector | |
| collector = JAOCollector() | |
| # Date range (matches Phase 1 SPARSE collection) | |
| start_date = '2023-10-01' | |
| end_date = '2025-09-30' | |
| # Output directory | |
| output_dir = Path('data/raw/phase1_24month') | |
| output_dir.mkdir(parents=True, exist_ok=True) | |
| start_time = datetime.now() | |
| # ========================================================================= | |
| # DATASET 1: LTA (Long Term Allocations) | |
| # ========================================================================= | |
| print("\n" + "=" * 80) | |
| print("DATASET 1/2: LTA (Long Term Allocations)") | |
| print("=" * 80) | |
| print("Collecting monthly (API limitation)...") | |
| print() | |
| lta_output = output_dir / 'jao_lta.parquet' | |
| try: | |
| lta_df = collect_lta_monthly(collector, start_date, end_date) | |
| if lta_df is not None: | |
| # Save to parquet | |
| lta_df.write_parquet(lta_output) | |
| print(f"\n[OK] LTA collection successful: {len(lta_df):,} records") | |
| print(f"[OK] Saved to: {lta_output}") | |
| print(f"[OK] File size: {lta_output.stat().st_size / (1024**2):.2f} MB") | |
| else: | |
| print(f"\n[WARNING] LTA collection returned no data") | |
| except Exception as e: | |
| print(f"\n[ERROR] LTA collection failed: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| # ========================================================================= | |
| # DATASET 2: NET POSITIONS (Domain Boundaries) | |
| # ========================================================================= | |
| print("\n" + "=" * 80) | |
| print("DATASET 2/2: NET POSITIONS (Domain Boundaries)") | |
| print("=" * 80) | |
| print() | |
| netpos_output = output_dir / 'jao_net_positions.parquet' | |
| try: | |
| netpos_df = collector.collect_net_positions_sample( | |
| start_date=start_date, | |
| end_date=end_date, | |
| output_path=netpos_output | |
| ) | |
| if netpos_df is not None: | |
| print(f"\n[OK] Net Positions collection successful: {len(netpos_df):,} records") | |
| else: | |
| print(f"\n[WARNING] Net Positions collection returned no data") | |
| except Exception as e: | |
| print(f"\n[ERROR] Net Positions collection failed: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| # ========================================================================= | |
| # SUMMARY | |
| # ========================================================================= | |
| elapsed = datetime.now() - start_time | |
| print("\n" + "=" * 80) | |
| print("COLLECTION COMPLETE") | |
| print("=" * 80) | |
| print(f"Total time: {elapsed}") | |
| print() | |
| print("Files created:") | |
| if lta_output.exists(): | |
| print(f" [OK] {lta_output}") | |
| print(f" Size: {lta_output.stat().st_size / (1024**2):.2f} MB") | |
| else: | |
| print(f" [MISSING] {lta_output}") | |
| if netpos_output.exists(): | |
| print(f" [OK] {netpos_output}") | |
| print(f" Size: {netpos_output.stat().st_size / (1024**2):.2f} MB") | |
| else: | |
| print(f" [MISSING] {netpos_output}") | |
| print("=" * 80) | |
| if __name__ == '__main__': | |
| main() | |