feat: Day 0 - Initialize FBMC Flow Forecasting MVP
- Set up project structure (src/, notebooks/, doc/, config/, tools/, tests/)
- Configure virtual environment with Python 3.13.2 and 179 packages
- Install core dependencies: polars, torch, chronos-forecasting, marimo, altair
- Configure API keys: ENTSO-E (working), HuggingFace (deferred to Day 3)
- Create data collection scripts with proper rate limiting:
* OpenMeteo: 270 req/min (45% of 600 limit) - 2-week chunks
* ENTSO-E: 27 req/min (45% of 60 limit) - monthly chunks
* JAO: Wrapper for JAOPuTo tool (requires Java 11+)
- Create HF Datasets manager for data storage (separate from Git)
- Create data loading and validation utilities
- Create initial Marimo exploration notebook
- Document Java installation requirements
Data Strategy:
- Code → Git (this repo)
- Data → HuggingFace Datasets (separate, not in Git)
- NO Git LFS (following data science best practices)
Scope: Oct 2024 - Sept 2025 (12 months, leaves Oct 2025 for live testing)
Timeline: 5-day MVP (zero-shot inference, no fine-tuning)
Next: Install Java, download JAOPuTo, begin Day 1 data collection
- .claude/settings.local.json +13 -0
- .gitignore +62 -0
- CLAUDE.md +342 -0
- doc/Day_0_Quick_Start_Guide.md +1045 -0
- doc/FBMC_Flow_Forecasting_MVP_ZERO_SHOT_PLAN.md +0 -0
- doc/JAVA_INSTALL_GUIDE.md +214 -0
- doc/activity.md +90 -0
- notebooks/01_data_exploration.py +284 -0
- requirements.txt +32 -0
- src/__init__.py +0 -0
- src/data_collection/__init__.py +0 -0
- src/data_collection/collect_entsoe.py +428 -0
- src/data_collection/collect_jao.py +268 -0
- src/data_collection/collect_openmeteo.py +421 -0
- src/data_collection/download_all.py +95 -0
- src/data_collection/hf_datasets_manager.py +170 -0
- src/feature_engineering/__init__.py +0 -0
- src/model/__init__.py +0 -0
- src/utils/__init__.py +0 -0
- src/utils/data_loader.py +251 -0
.claude/settings.local.json
@@ -0,0 +1,13 @@
{
  "permissions": {
    "allow": [
      "Bash(.venv/Scripts/pip.exe list:*)",
      "Bash(findstr:*)",
      "Bash(.venv/Scripts/python.exe:*)",
      "WebFetch(domain:transparencyplatform.zendesk.com)",
      "WebSearch"
    ],
    "deny": [],
    "ask": []
  }
}
.gitignore
@@ -0,0 +1,62 @@
# ============================================
# Data Files - NEVER commit to Git
# ============================================
# Following data science best practices:
# - Code goes in Git
# - Data goes in HuggingFace Datasets
data/
*.parquet
*.pkl
*.csv
*.h5
*.hdf5
*.feather

# ============================================
# Model Artifacts
# ============================================
models/checkpoints/
*.pth
*.safetensors
*.ckpt

# ============================================
# Credentials & Secrets
# ============================================
.env
config/api_keys.yaml
*.key
*.pem

# ============================================
# Python
# ============================================
__pycache__/
*.pyc
*.pyo
*.egg-info/
.pytest_cache/
.venv/
venv/
verify_install.py

# ============================================
# IDE & OS
# ============================================
.vscode/
.idea/
*.swp
.DS_Store
Thumbs.db

# ============================================
# Jupyter
# ============================================
.ipynb_checkpoints/

# ============================================
# Temporary Files
# ============================================
*.tmp
*.log
.cache/
CLAUDE.md
@@ -0,0 +1,342 @@
# FBMC Flow Forecasting MVP - Claude Execution Rules
# Global Development Rules
1. **Always update `activity.md`** after significant changes with timestamp, description, files modified, and status. It's CRITICAL to always document where we are in the workflow.
2. When starting a new session, always reference activity.md first.
3. Always look for existing code to iterate on instead of creating new code.
4. Do not drastically change the patterns before trying to iterate on existing patterns.
5. Always kill all existing related servers that may have been created in previous testing before trying to start a new server.
6. Always prefer simple solutions.
7. Avoid duplication of code whenever possible, which means checking for other areas of the codebase that might already have similar code and functionality.
8. Write code that takes into account the different environments: dev, test, and prod.
9. Only make changes that are requested or that you are confident are well understood and related to the change being requested.
10. When fixing an issue or bug, do not introduce a new pattern or technology without first exhausting all options for the existing implementation. And if you finally do this, make sure to remove the old implementation afterwards so we don't have duplicate logic.
11. Keep the codebase very clean and organized.
12. Avoid writing scripts in files if possible, especially if the script is likely to be run once.
13. When you're not sure about something, ask for clarification.
14. Avoid having files over 200-300 lines of code. Refactor at that point.
15. Mocking data is only needed for tests; never mock data for dev or prod.
16. Never add stubbing or fake data patterns to code that affects the dev or prod environments.
17. Never overwrite my .env file without first asking and confirming.
18. Focus on the areas of code relevant to the task.
19. Do not touch code that is unrelated to the task.
20. Write thorough tests for all major functionality.
21. Avoid making major changes to the patterns of how a feature works, after it has been shown to work well, unless explicitly instructed.
22. Always think about what methods and areas of code might be affected by code changes.
23. Keep commits small and focused on a single change.
24. Write meaningful commit messages.
25. Review your own code before asking others to review it.
26. Be mindful of performance implications.
27. Always consider security implications of your code.
28. After making significant code changes (new features, major fixes, completing implementation phases), proactively offer to commit and push changes to GitHub with descriptive commit messages. Always ask for approval before executing git commands. Ensure no sensitive information (.env files, API keys) is committed.
29. ALWAYS use virtual environments for Python projects. NEVER install packages globally. Create virtual environments with clear, project-specific names following the pattern {project_name}_env (e.g., news_intel_env). Always verify the virtual environment is activated before installing packages.
30. **NEVER pollute directories with multiple file versions**
    - Do NOT leave test files, backup files, or old versions in main directories
    - If testing: move test files to archive immediately after use
    - If updating: either replace the file or archive the old version
    - Keep only ONE working version of each file in main directories
    - Use descriptive names in archive folders with dates

## Project Identity

**Zero-shot electricity cross-border capacity forecasting using Chronos 2**
- 5-day MVP timeline (FIRM - no extensions)
- Target: 134 MW MAE on D+1 forecasts
- Approach: Zero-shot inference only (NO fine-tuning)
- Handover: Complete working system to quantitative analyst

---

## Tech Stack

### Core ML/Data
- **Model**: Amazon Chronos 2 Large (710M params, pre-trained)
- **Data Processing**: Polars (primary), PyArrow
- **Scientific**: NumPy, scikit-learn
- **Framework**: PyTorch 2.0+, Transformers 4.35+

### Development Environment
- **Local Notebooks**: Marimo 0.9+ (reactive, .py format)
- **Handover Format**: JupyterLab (standard .ipynb)
- **Infrastructure**: HuggingFace Space (JupyterLab SDK, A10G GPU)
- **Package Manager**: uv (10-100x faster than pip)

### Data Collection
- **JAO Data**: JAOPuTo CLI tool (Java 11+ required)
- **Power Data**: entsoe-py (ENTSO-E Transparency API)
- **Weather Data**: OpenMeteo API (free tier)
- **Data Storage**: HuggingFace Datasets (NOT Git/Git-LFS)

### Visualization & Analysis
- **Primary**: Altair 5.0+
- **Notebooks**: Marimo reactive interface
- **Export**: Standard matplotlib/seaborn for static reports

### Testing & Quality
- **Testing**: pytest (unit, integration, smoke tests)
- **Validation**: Custom assertions for data quality
- **CI/CD**: GitHub Actions (optional, for automated testing)

---

## Critical Execution Rules

### 1. Scope Discipline
- **ONLY** zero-shot inference - no model training/fine-tuning
- **ONLY** Core FBMC (13 countries, ~20 borders)
- **ONLY** 12 months historical data (Oct 2024 - Sept 2025)
- **ONLY** 5 days development time
- If asked to add features, reference Phase 2 handover

### 2. Data Management Philosophy
```
Code → Git repository (~50 MB, version controlled)
Data → HuggingFace Datasets (~6 GB, separate storage)
NO Git LFS (never, following data science best practices)
```
- **NEVER** commit data files (.parquet, .csv, .pkl) to Git
- All data goes through the HuggingFace Datasets API
- `.gitignore` must exclude the `data/` directory
- Git repo must stay under 100 MB total

### 3. Chronos 2 Zero-Shot Pattern
```python
# CORRECT - Zero-shot inference
pipeline = ChronosPipeline.from_pretrained("amazon/chronos-t5-large")
forecast = pipeline.predict(context=features[-512:], prediction_length=336)

# INCORRECT - Do NOT train/fine-tune
model.fit(training_data)  # ❌ OUT OF SCOPE
```
- Load pre-trained model only
- Use 12-month data for feature baselines and context windows
- NO gradient updates, NO epoch training, NO .fit() calls

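A hedged sketch of what the zero-shot call could look like end to end, following the chronos-forecasting README pattern; `history` is an assumed 1-D array of hourly values for one border, and long horizons may require autoregressive roll-out depending on the model's native prediction length:

```python
import numpy as np
import torch
from chronos import ChronosPipeline

pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-large",
    device_map="auto",            # picks up the A10G GPU when present
    torch_dtype=torch.bfloat16,
)

context = torch.tensor(history[-512:], dtype=torch.float32)  # last 512 hours of context
samples = pipeline.predict(context, prediction_length=336)   # 336 = 14 days x 24 hours

# Point forecast (median) and an 80% interval from the sampled paths
low, median, high = np.quantile(samples[0].float().numpy(), [0.1, 0.5, 0.9], axis=0)
```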
### 4. Marimo Development Workflow
- **Use Marimo locally** for reactive development
- **Export to Jupyter** for quant analyst handover
- Structure: DAG cells, no variable redefinition
- Pattern for expensive ops: `mo.ui.run_button()` + `@mo.cache()` (see the sketch below)
- Configure: `auto_instantiate = false`, `on_cell_change = "lazy"`
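A minimal sketch of that run-button-plus-cache pattern, assuming `mo.cache` acts as a memoizing decorator as the rule above implies; `run_forecast` is a placeholder for the Day 3 inference call, and in a real notebook each block below lives in its own cell:

```python
import marimo as mo

# Cell 1: a button that gates the expensive operation
run = mo.ui.run_button(label="Run 14-day forecast")
run

# Cell 2: cached wrapper around the expensive call (placeholder body)
@mo.cache  # repeated cell runs reuse the previous result for the same inputs
def run_forecast(context_hours: int = 512):
    ...  # load context, call ChronosPipeline.predict(...), return the forecast

# Cell 3: only execute once the button has been clicked
mo.stop(not run.value, mo.md("Click the button to launch inference"))
forecast = run_forecast()
```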

### 5. Feature Engineering Constraints
- **Exactly 75-85 features** (no more, no less)
- **52 weather grid points** (simplified spatial model)
- **Top 50 CNECs** identified by binding frequency (see the sketch below)
- Focus on high-signal features only
- Validate >95% feature completeness

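One way the top-50 selection could be computed with Polars; the `cnec_id` and `shadow_price` column names are assumptions about the JAO export schema, and "binding" is taken to mean a non-zero shadow price in a given hour:

```python
import polars as pl

def top_binding_cnecs(cnecs: pl.DataFrame, n: int = 50) -> pl.DataFrame:
    """Rank CNECs by the share of hours in which they bind (assumed schema)."""
    return (
        cnecs
        .with_columns((pl.col("shadow_price") > 0).alias("is_binding"))
        .group_by("cnec_id")
        .agg(pl.col("is_binding").mean().alias("binding_frequency"))
        .sort("binding_frequency", descending=True)
        .head(n)
    )
```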
### 6. Performance Targets
- **Inference**: <5 minutes for complete 14-day forecast
- **Accuracy**: D+1 MAE target is 134 MW (must be <150 MW); evaluated as in the sketch below
- **Cost**: $30/month (A10G GPU, no upgrades in MVP)
- Document performance gaps for Phase 2 fine-tuning

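For reference, the D+1 score is a plain mean absolute error; a minimal NumPy sketch, assuming hourly values in MW and taking D+1 to be the first 24 hours of the 14-day horizon:

```python
import numpy as np

def mae_mw(forecast: np.ndarray, actual: np.ndarray) -> float:
    """Mean absolute error in MW over matching hourly samples."""
    return float(np.mean(np.abs(forecast - actual)))

# D+1 slice of a 14-day hourly forecast = the first 24 hours
# d1_mae = mae_mw(forecast[:24], actual[:24])   # target: <= 134 MW
```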
### 7. Code Quality Standards
- Polars-first for data operations (faster, more memory efficient)
- Type hints for all function signatures
- Docstrings for all non-trivial functions
- Validation checks at every pipeline stage
- Error handling with informative messages

### 8. Daily Development Structure
```
Day 0: Environment setup (45 min) → git commit + push
Day 1: Data collection (8 hrs) → validate data → git commit + push
Day 2: Feature engineering (8 hrs) → test features → git commit + push
Day 3: Zero-shot inference (8 hrs) → smoke test → git commit + push
Day 4: Performance evaluation (8 hrs) → validate metrics → git commit + push
Day 5: Documentation + handover (8 hrs) → integration test → final commit + push
```
- Each day ends with validation tests + git commit + push to GitHub
- Intermediate commits for major milestones within the day
- NO day can bleed into the next
- If running behind, scope down (never extend timeline)
- Tests must pass before committing

### 9. Git Workflow & Version Control
- **Commit frequency**: End of each major milestone + end of each day
- **Commit style**: Conventional commits format
  - `feat: add weather data collection pipeline`
  - `fix: correct CNEC binding frequency calculation`
  - `docs: update handover guide with evaluation metrics`
  - `refactor: optimize feature engineering for polars`
- **Push to GitHub**: After every commit (keep remote in sync)
- **Branch strategy**: Main branch only for MVP (no feature branches)
- **Commit granularity**: Logical units of work (not "end of day dump")
- **Git hygiene**: Review `git status` before commits, ensure data/ excluded

**Daily commit pattern**:
```bash
# End of Day 1
git add .
git commit -m "feat: complete data collection pipeline with HF Datasets integration"
git push origin main

# Mid-Day 2 milestone
git commit -m "feat: implement 85-feature engineering pipeline"
git push origin main

# End of Day 2
git commit -m "test: add feature validation and CNEC identification"
git push origin main
```

### 10. Testing Strategy
- **Data validation**: Assert data completeness, check for nulls, validate ranges
- **Feature engineering**: Unit tests for each feature calculation
- **Model inference**: Smoke test on small sample before full run
- **Integration**: End-to-end pipeline test with 1-week subset
- **Performance**: Assert inference time <5 min, MAE within bounds

**Testing patterns**:
```python
# Data validation checks
assert df.null_count().sum() < 0.05 * len(df), "Too many missing values"
assert date_range_complete(df['timestamp']), "Date gaps detected"

# Feature validation
features = engineer.transform(data)
assert features.shape[1] == 85, f"Expected 85 features, got {features.shape[1]}"
assert (features.select(pl.all().is_null().sum()).row(0) == (0,) * 85), "Null features detected"

# Inference validation
forecast = pipeline.predict(context, prediction_length=336)
assert forecast.shape == (336, n_borders), "Forecast shape mismatch"
assert not np.isnan(forecast).any(), "NaN in predictions"
```
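`date_range_complete` above is a placeholder helper; a minimal Polars sketch of what it could look like for hourly data (duplicate timestamps and DST edge cases are ignored here):

```python
import polars as pl

def date_range_complete(timestamps: pl.Series, step_seconds: int = 3600) -> bool:
    """True when the series covers its min..max span with no missing hourly steps."""
    span = timestamps.max() - timestamps.min()
    n_expected = int(span.total_seconds() // step_seconds) + 1
    return timestamps.n_unique() == n_expected
```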

**Testing schedule**:
- Day 1: Validate downloaded data completeness
- Day 2: Test each feature calculation independently
- Day 3: Smoke test inference on 7-day window
- Day 4: Validate evaluation metrics calculations
- Day 5: Full integration test before handover

**Test organization** (tests/ directory):
```
tests/
├── test_data_collection.py      # Data completeness, API responses
├── test_feature_engineering.py  # Each feature calculation
├── test_model_inference.py      # Inference smoke tests
└── test_integration.py          # End-to-end pipeline
```
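As an illustration of the per-feature unit tests, a self-contained pytest sketch; `add_hour_of_day` is a hypothetical Day 2 feature, not code that exists in the repo yet:

```python
# tests/test_feature_engineering.py (sketch)
from datetime import datetime

import polars as pl

def add_hour_of_day(df: pl.DataFrame) -> pl.DataFrame:
    """Example calendar feature: hour of day extracted from the timestamp column."""
    return df.with_columns(pl.col("timestamp").dt.hour().alias("hour_of_day"))

def test_hour_of_day_feature():
    df = pl.DataFrame({"timestamp": [datetime(2025, 1, 1, 0), datetime(2025, 1, 1, 13)]})
    out = add_hour_of_day(df)
    assert out["hour_of_day"].to_list() == [0, 13]
```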

**Running tests**:
```bash
# Install pytest
uv pip install pytest

# Run all tests
pytest tests/ -v

# Run specific test file
pytest tests/test_feature_engineering.py -v

# Before each commit
pytest tests/ && git commit -m "feat: ..."
```

### 11. Documentation Requirements
- README.md with quick start guide
- HANDOVER_GUIDE.md for quant analyst
- Inline code comments for complex logic
- Results visualization + interpretation
- Fine-tuning roadmap (Phase 2 guidance)

### 12. Handover Package Must Include
- Working zero-shot forecast system
- All Marimo notebooks (.py) + exported Jupyter (.ipynb)
- HuggingFace Space with complete environment
- Performance analysis showing 134 MW MAE achieved
- Error analysis identifying fine-tuning opportunities
- Clear Phase 2 roadmap

---

## Geographic Scope (Reference)

**Core FBMC Countries** (13 total):
AT, BE, HR, CZ, FR, DE-LU, HU, NL, PL, RO, SK, SI

**Borders**: ~20 interconnections (multivariate forecasting)

**OUT OF SCOPE**: Nordic FBMC (NO, SE, DK, FI) - Phase 2 only

---

## API Access Confirmed
- ✓ JAOPuTo tool (12 months FBMC data accessible)
- ✓ ENTSO-E API key (generation, flows)
- ✓ OpenMeteo API (free tier, 52 grid points)
- ✓ HuggingFace write token (Datasets upload)

---

## Decision-Making Framework

When uncertain, apply this hierarchy:

1. **Does it extend the timeline?** → Reject immediately
2. **Does it require fine-tuning?** → Phase 2 only
3. **Does it compromise data management?** → Never commit data to Git
4. **Does it add features beyond 85?** → Reject (scope creep)
5. **Does it skip testing/validation?** → Add checks immediately
6. **Does it help the quant analyst?** → Include in handover docs
7. **Does it improve zero-shot accuracy?** → Consider if time permits
8. **Does it add complexity?** → Default to simplicity
9. **Can you commit and push?** → Do it now (frequent commits)

---

## Anti-Patterns to Avoid

❌ Training/fine-tuning the model (Phase 2)
❌ Committing data files to the Git repository
❌ Using Git LFS for data storage
❌ Extending beyond the 5-day timeline
❌ Adding features beyond the 85 count
❌ Including Nordic FBMC borders
❌ Building production automation (out of scope)
❌ Creating real-time dashboards (out of scope)
❌ Over-engineering infrastructure
❌ Forgetting to document for handover
❌ Skipping data validation checks
❌ Running the full pipeline without smoke tests
❌ Committing without pushing to GitHub

---

## Success Criteria Checklist

At Day 5 completion:
- [ ] Zero-shot forecasts working for all ~20 FBMC borders
- [ ] Inference time <5 minutes per 14-day forecast
- [ ] D+1 MAE at or below the 134 MW target (hard limit <150 MW)
- [ ] HuggingFace Space operational at $30/month
- [ ] Complete handover documentation written
- [ ] All Marimo notebooks exported to Jupyter format
- [ ] Git repo <100 MB (code only, no data)
- [ ] Data stored in HuggingFace Datasets (separate)
- [ ] Quant analyst can fork the HF Space and continue
- [ ] All tests passing (data validation, feature checks, inference)
- [ ] Git history shows daily commits with descriptive messages
- [ ] GitHub repo synchronized with all commits pushed

---

## Communication Style

When providing updates or recommendations:
- Lead with impact on the 5-day timeline
- Be direct about scope constraints
- Suggest alternatives within MVP boundaries
- Reference Phase 2 for out-of-scope items
- Document assumptions and limitations
- Always include the next concrete action

---

**Version**: 1.0.0
**Created**: 2025-10-27
**Project**: FBMC Flow Forecasting MVP (Zero-Shot)
**Purpose**: Execution rules for Claude during 5-day development
@@ -0,0 +1,1045 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FBMC Flow Forecasting MVP - Day 0 Quick Start Guide
|
| 2 |
+
## Environment Setup (45 Minutes)
|
| 3 |
+
|
| 4 |
+
**Target**: From zero to working local + HF Space environment with all dependencies verified
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## Prerequisites Check (5 minutes)
|
| 9 |
+
|
| 10 |
+
Before starting, verify you have:
|
| 11 |
+
|
| 12 |
+
```bash
|
| 13 |
+
# Check Java (required for JAOPuTo)
|
| 14 |
+
java -version
|
| 15 |
+
# Need: Java 11 or higher
|
| 16 |
+
# If missing: https://adoptium.net/ (download Temurin JDK 17)
|
| 17 |
+
|
| 18 |
+
# Check Git
|
| 19 |
+
git --version
|
| 20 |
+
# Need: 2.x+
|
| 21 |
+
|
| 22 |
+
# Check Python
|
| 23 |
+
python3 --version
|
| 24 |
+
# Need: 3.10+
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
**API Keys & Accounts Ready:**
|
| 28 |
+
- [ ] ENTSO-E Transparency Platform API key
|
| 29 |
+
- [ ] Hugging Face account with payment method for Spaces
|
| 30 |
+
- [ ] Hugging Face write token (for uploading datasets)
|
| 31 |
+
|
| 32 |
+
**Important Data Storage Philosophy:**
|
| 33 |
+
- **Code** → Git repository (small, version controlled)
|
| 34 |
+
- **Data** → HuggingFace Datasets (separate, not in Git)
|
| 35 |
+
- **NO Git LFS** needed (following data science best practices)
|
| 36 |
+
|
| 37 |
+
---
|
| 38 |
+
|
| 39 |
+
## Step 1: Create Hugging Face Space (10 minutes)
|
| 40 |
+
|
| 41 |
+
1. **Navigate to**: https://huggingface.co/new-space
|
| 42 |
+
|
| 43 |
+
2. **Configure Space:**
|
| 44 |
+
- **Owner**: Your username/organization
|
| 45 |
+
- **Space name**: `fbmc-forecasting` (or your preference)
|
| 46 |
+
- **License**: Apache 2.0
|
| 47 |
+
- **Select SDK**: `JupyterLab`
|
| 48 |
+
- **Select Hardware**: `A10G GPU ($30/month)` ↠**CRITICAL**
|
| 49 |
+
- **Visibility**: Private (recommended for MVP)
|
| 50 |
+
|
| 51 |
+
3. **Create Space** button
|
| 52 |
+
|
| 53 |
+
4. **Wait 2-3 minutes** for Space initialization
|
| 54 |
+
|
| 55 |
+
5. **Verify Space Access:**
|
| 56 |
+
- Visit: `https://huggingface.co/spaces/YOUR_USERNAME/fbmc-forecasting`
|
| 57 |
+
- Confirm JupyterLab interface loads
|
| 58 |
+
- Check hardware: Should show "A10G GPU" in bottom-right
|
| 59 |
+
|
| 60 |
+
---
|
| 61 |
+
|
| 62 |
+
## Step 2: Local Environment Setup (25 minutes)
|
| 63 |
+
|
| 64 |
+
### 2.1 Clone HF Space Locally (2 minutes)
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
# Clone your HF Space
|
| 68 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/fbmc-forecasting
|
| 69 |
+
cd fbmc-forecasting
|
| 70 |
+
|
| 71 |
+
# Verify remote
|
| 72 |
+
git remote -v
|
| 73 |
+
# Should show: https://huggingface.co/spaces/YOUR_USERNAME/fbmc-forecasting
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
### 2.2 Create Directory Structure (1 minute)
|
| 77 |
+
|
| 78 |
+
```bash
|
| 79 |
+
# Create project directories
|
| 80 |
+
mkdir -p notebooks \
|
| 81 |
+
notebooks_exported \
|
| 82 |
+
src/{data_collection,feature_engineering,model,utils} \
|
| 83 |
+
config \
|
| 84 |
+
results/{forecasts,evaluation,visualizations} \
|
| 85 |
+
docs \
|
| 86 |
+
tools \
|
| 87 |
+
tests
|
| 88 |
+
|
| 89 |
+
# Note: data/ directory will be created by download scripts
|
| 90 |
+
# It is NOT tracked in Git (following best practices)
|
| 91 |
+
|
| 92 |
+
# Verify structure
|
| 93 |
+
tree -L 2
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
### 2.3 Install uv Package Manager (2 minutes)
|
| 97 |
+
|
| 98 |
+
```bash
|
| 99 |
+
# Install uv (ultra-fast pip replacement)
|
| 100 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 101 |
+
|
| 102 |
+
# Add to PATH (if not automatic)
|
| 103 |
+
export PATH="$HOME/.cargo/bin:$PATH"
|
| 104 |
+
|
| 105 |
+
# Verify installation
|
| 106 |
+
uv --version
|
| 107 |
+
# Should show: uv 0.x.x
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
### 2.4 Create Virtual Environment (1 minute)
|
| 111 |
+
|
| 112 |
+
```bash
|
| 113 |
+
# Create .venv with uv
|
| 114 |
+
uv venv
|
| 115 |
+
|
| 116 |
+
# Activate (Linux/Mac)
|
| 117 |
+
source .venv/bin/activate
|
| 118 |
+
|
| 119 |
+
# Activate (Windows)
|
| 120 |
+
# .venv\Scripts\activate
|
| 121 |
+
|
| 122 |
+
# Verify activation
|
| 123 |
+
which python
|
| 124 |
+
# Should point to: /path/to/fbmc-forecasting/.venv/bin/python
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
### 2.5 Install Dependencies (2 minutes)
|
| 128 |
+
|
| 129 |
+
```bash
|
| 130 |
+
# Create requirements.txt
|
| 131 |
+
cat > requirements.txt << 'EOF'
|
| 132 |
+
# Core Data & ML
|
| 133 |
+
polars>=0.20.0
|
| 134 |
+
pyarrow>=13.0.0
|
| 135 |
+
numpy>=1.24.0
|
| 136 |
+
scikit-learn>=1.3.0
|
| 137 |
+
|
| 138 |
+
# Time Series Forecasting
|
| 139 |
+
chronos-forecasting>=1.0.0
|
| 140 |
+
transformers>=4.35.0
|
| 141 |
+
torch>=2.0.0
|
| 142 |
+
|
| 143 |
+
# Data Collection
|
| 144 |
+
entsoe-py>=0.5.0
|
| 145 |
+
requests>=2.31.0
|
| 146 |
+
|
| 147 |
+
# HuggingFace Integration (for Datasets, NOT Git LFS)
|
| 148 |
+
datasets>=2.14.0
|
| 149 |
+
huggingface-hub>=0.17.0
|
| 150 |
+
|
| 151 |
+
# Visualization & Notebooks
|
| 152 |
+
altair>=5.0.0
|
| 153 |
+
marimo>=0.9.0
|
| 154 |
+
jupyter>=1.0.0
|
| 155 |
+
ipykernel>=6.25.0
|
| 156 |
+
|
| 157 |
+
# Utilities
|
| 158 |
+
pyyaml>=6.0.0
|
| 159 |
+
python-dotenv>=1.0.0
|
| 160 |
+
tqdm>=4.66.0
|
| 161 |
+
|
| 162 |
+
# HF Space Integration
|
| 163 |
+
gradio>=4.0.0
|
| 164 |
+
EOF
|
| 165 |
+
|
| 166 |
+
# Install with uv (ultra-fast)
|
| 167 |
+
uv pip install -r requirements.txt
|
| 168 |
+
|
| 169 |
+
# Create lockfile for reproducibility
|
| 170 |
+
uv pip compile requirements.txt -o requirements.lock
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
**Verify installations:**
|
| 174 |
+
```bash
|
| 175 |
+
python -c "import polars; print(f'polars {polars.__version__}')"
|
| 176 |
+
python -c "import marimo; print(f'marimo {marimo.__version__}')"
|
| 177 |
+
python -c "import torch; print(f'torch {torch.__version__}')"
|
| 178 |
+
python -c "from chronos import ChronosPipeline; print('chronos-forecasting ✓')"
|
| 179 |
+
python -c "from datasets import Dataset; print('datasets ✓')"
|
| 180 |
+
python -c "from huggingface_hub import HfApi; print('huggingface-hub ✓')"
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
### 2.6 Configure .gitignore (Data Exclusion) (2 minutes)
|
| 184 |
+
|
| 185 |
+
```bash
|
| 186 |
+
# Create .gitignore - CRITICAL for keeping data out of Git
|
| 187 |
+
cat > .gitignore << 'EOF'
|
| 188 |
+
# ============================================
|
| 189 |
+
# Data Files - NEVER commit to Git
|
| 190 |
+
# ============================================
|
| 191 |
+
# Following data science best practices:
|
| 192 |
+
# - Code goes in Git
|
| 193 |
+
# - Data goes in HuggingFace Datasets
|
| 194 |
+
data/
|
| 195 |
+
*.parquet
|
| 196 |
+
*.pkl
|
| 197 |
+
*.csv
|
| 198 |
+
*.h5
|
| 199 |
+
*.hdf5
|
| 200 |
+
*.feather
|
| 201 |
+
|
| 202 |
+
# ============================================
|
| 203 |
+
# Model Artifacts
|
| 204 |
+
# ============================================
|
| 205 |
+
models/checkpoints/
|
| 206 |
+
*.pth
|
| 207 |
+
*.safetensors
|
| 208 |
+
*.ckpt
|
| 209 |
+
|
| 210 |
+
# ============================================
|
| 211 |
+
# Credentials & Secrets
|
| 212 |
+
# ============================================
|
| 213 |
+
.env
|
| 214 |
+
config/api_keys.yaml
|
| 215 |
+
*.key
|
| 216 |
+
*.pem
|
| 217 |
+
|
| 218 |
+
# ============================================
|
| 219 |
+
# Python
|
| 220 |
+
# ============================================
|
| 221 |
+
__pycache__/
|
| 222 |
+
*.pyc
|
| 223 |
+
*.pyo
|
| 224 |
+
*.egg-info/
|
| 225 |
+
.pytest_cache/
|
| 226 |
+
.venv/
|
| 227 |
+
venv/
|
| 228 |
+
|
| 229 |
+
# ============================================
|
| 230 |
+
# IDE & OS
|
| 231 |
+
# ============================================
|
| 232 |
+
.vscode/
|
| 233 |
+
.idea/
|
| 234 |
+
*.swp
|
| 235 |
+
.DS_Store
|
| 236 |
+
Thumbs.db
|
| 237 |
+
|
| 238 |
+
# ============================================
|
| 239 |
+
# Jupyter
|
| 240 |
+
# ============================================
|
| 241 |
+
.ipynb_checkpoints/
|
| 242 |
+
|
| 243 |
+
# ============================================
|
| 244 |
+
# Temporary Files
|
| 245 |
+
# ============================================
|
| 246 |
+
*.tmp
|
| 247 |
+
*.log
|
| 248 |
+
.cache/
|
| 249 |
+
EOF
|
| 250 |
+
|
| 251 |
+
# Stage .gitignore
|
| 252 |
+
git add .gitignore
|
| 253 |
+
|
| 254 |
+
# Verify data/ will be ignored
|
| 255 |
+
echo "data/" >> .gitignore
|
| 256 |
+
git check-ignore data/test.parquet
|
| 257 |
+
# Should output: data/test.parquet (confirming it's ignored)
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
**Why NO Git LFS?**
|
| 261 |
+
Following data science best practices:
|
| 262 |
+
- ✓ **Code** → Git (fast, version controlled)
|
| 263 |
+
- ✓ **Data** → HuggingFace Datasets (separate, scalable)
|
| 264 |
+
- ✗ **NOT** Git LFS (expensive, non-standard for ML projects)
|
| 265 |
+
|
| 266 |
+
**Data will be:**
|
| 267 |
+
- Downloaded via scripts (Day 1)
|
| 268 |
+
- Uploaded to HF Datasets (Day 1)
|
| 269 |
+
- Loaded programmatically (Days 2-5)
|
| 270 |
+
- NEVER committed to Git repository
|
| 271 |
+
|
| 272 |
+
### 2.7 Download JAOPuTo Tool (5 minutes)
|
| 273 |
+
|
| 274 |
+
```bash
|
| 275 |
+
# Navigate to tools directory
|
| 276 |
+
cd tools
|
| 277 |
+
|
| 278 |
+
# Download JAOPuTo (visit in browser or use wget)
|
| 279 |
+
# URL: https://publicationtool.jao.eu/core/
|
| 280 |
+
# Download: JAOPuTo.jar (latest version)
|
| 281 |
+
|
| 282 |
+
# Or use wget (if direct link available):
|
| 283 |
+
# wget https://publicationtool.jao.eu/core/download/JAOPuTo.jar
|
| 284 |
+
|
| 285 |
+
# Verify download
|
| 286 |
+
ls -lh JAOPuTo.jar
|
| 287 |
+
# Should show: ~5-10 MB file
|
| 288 |
+
|
| 289 |
+
# Test JAOPuTo
|
| 290 |
+
java -jar JAOPuTo.jar --help
|
| 291 |
+
# Should display: Usage information and available commands
|
| 292 |
+
|
| 293 |
+
cd ..
|
| 294 |
+
```
|
| 295 |
+
|
| 296 |
+
**Expected JAOPuTo output:**
|
| 297 |
+
```
|
| 298 |
+
JAOPuTo - JAO Publication Tool
|
| 299 |
+
Version: X.X.X
|
| 300 |
+
|
| 301 |
+
Usage: java -jar JAOPuTo.jar [options]
|
| 302 |
+
|
| 303 |
+
Options:
|
| 304 |
+
--start-date YYYY-MM-DD Start date for data download
|
| 305 |
+
--end-date YYYY-MM-DD End date for data download
|
| 306 |
+
--data-type TYPE Data type (FBMC_DOMAIN, CNEC, etc.)
|
| 307 |
+
--output-format FORMAT Output format (csv, parquet)
|
| 308 |
+
--output-dir PATH Output directory
|
| 309 |
+
...
|
| 310 |
+
```
|
| 311 |
+
|
| 312 |
+
### 2.8 Configure API Keys & HuggingFace Access (3 minutes)
|
| 313 |
+
|
| 314 |
+
```bash
|
| 315 |
+
# Create config directory structure
|
| 316 |
+
mkdir -p config
|
| 317 |
+
|
| 318 |
+
# Create API keys configuration
|
| 319 |
+
cat > config/api_keys.yaml << 'EOF'
|
| 320 |
+
# ENTSO-E Transparency Platform
|
| 321 |
+
entsoe_api_key: "YOUR_ENTSOE_API_KEY_HERE"
|
| 322 |
+
|
| 323 |
+
# OpenMeteo (free tier - no key required)
|
| 324 |
+
openmeteo_base_url: "https://api.open-meteo.com/v1/forecast"
|
| 325 |
+
|
| 326 |
+
# Hugging Face (for uploading datasets)
|
| 327 |
+
hf_token: "YOUR_HF_WRITE_TOKEN_HERE"
|
| 328 |
+
hf_username: "YOUR_HF_USERNAME"
|
| 329 |
+
EOF
|
| 330 |
+
|
| 331 |
+
# Create .env file for environment variables
|
| 332 |
+
cat > .env << 'EOF'
|
| 333 |
+
ENTSOE_API_KEY=YOUR_ENTSOE_API_KEY_HERE
|
| 334 |
+
OPENMETEO_BASE_URL=https://api.open-meteo.com/v1/forecast
|
| 335 |
+
HF_TOKEN=YOUR_HF_WRITE_TOKEN_HERE
|
| 336 |
+
HF_USERNAME=YOUR_HF_USERNAME
|
| 337 |
+
EOF
|
| 338 |
+
```
|
| 339 |
+
|
| 340 |
+
**Get your HuggingFace Write Token:**
|
| 341 |
+
1. Visit: https://huggingface.co/settings/tokens
|
| 342 |
+
2. Click "New token"
|
| 343 |
+
3. Name: "FBMC Dataset Upload"
|
| 344 |
+
4. Type: **Write** (required for uploading datasets)
|
| 345 |
+
5. Copy token
|
| 346 |
+
|
| 347 |
+
**Now edit the files with your actual credentials:**
|
| 348 |
+
```bash
|
| 349 |
+
# Option 1: Use text editor
|
| 350 |
+
nano config/api_keys.yaml # Update all YOUR_*_HERE placeholders
|
| 351 |
+
nano .env # Update all YOUR_*_HERE placeholders
|
| 352 |
+
|
| 353 |
+
# Option 2: Use sed (replace with your actual values)
|
| 354 |
+
sed -i 's/YOUR_ENTSOE_API_KEY_HERE/your-actual-entsoe-key/' config/api_keys.yaml .env
|
| 355 |
+
sed -i 's/YOUR_HF_WRITE_TOKEN_HERE/hf_your-actual-token/' config/api_keys.yaml .env
|
| 356 |
+
sed -i 's/YOUR_HF_USERNAME/your-username/' config/api_keys.yaml .env
|
| 357 |
+
```
|
| 358 |
+
|
| 359 |
+
**Verify credentials are set:**
|
| 360 |
+
```bash
|
| 361 |
+
# Should NOT see any "YOUR_*_HERE" placeholders
|
| 362 |
+
grep "YOUR_" config/api_keys.yaml
|
| 363 |
+
# Empty output = good!
|
| 364 |
+
```
|
| 365 |
+
|
| 366 |
+
### 2.9 Create Data Management Utilities (5 minutes)
|
| 367 |
+
|
| 368 |
+
```bash
|
| 369 |
+
# Create data collection module with HF Datasets integration
|
| 370 |
+
cat > src/data_collection/hf_datasets_manager.py << 'EOF'
|
| 371 |
+
"""HuggingFace Datasets manager for FBMC data storage."""
|
| 372 |
+
|
| 373 |
+
import polars as pl
|
| 374 |
+
from datasets import Dataset, DatasetDict
|
| 375 |
+
from huggingface_hub import HfApi
|
| 376 |
+
from pathlib import Path
|
| 377 |
+
import yaml
|
| 378 |
+
|
| 379 |
+
class FBMCDatasetManager:
|
| 380 |
+
"""Manage FBMC data uploads/downloads via HuggingFace Datasets."""
|
| 381 |
+
|
| 382 |
+
def __init__(self, config_path: str = "config/api_keys.yaml"):
|
| 383 |
+
"""Initialize with HF credentials."""
|
| 384 |
+
with open(config_path) as f:
|
| 385 |
+
config = yaml.safe_load(f)
|
| 386 |
+
|
| 387 |
+
self.hf_token = config['hf_token']
|
| 388 |
+
self.hf_username = config['hf_username']
|
| 389 |
+
self.api = HfApi(token=self.hf_token)
|
| 390 |
+
|
| 391 |
+
def upload_dataset(self, parquet_path: Path, dataset_name: str, description: str = ""):
|
| 392 |
+
"""Upload Parquet file to HuggingFace Datasets."""
|
| 393 |
+
print(f"Uploading {parquet_path.name} to HF Datasets...")
|
| 394 |
+
|
| 395 |
+
# Load Parquet as polars, convert to HF Dataset
|
| 396 |
+
df = pl.read_parquet(parquet_path)
|
| 397 |
+
dataset = Dataset.from_pandas(df.to_pandas())
|
| 398 |
+
|
| 399 |
+
# Create full dataset name
|
| 400 |
+
full_name = f"{self.hf_username}/{dataset_name}"
|
| 401 |
+
|
| 402 |
+
# Upload to HF
|
| 403 |
+
dataset.push_to_hub(
|
| 404 |
+
full_name,
|
| 405 |
+
token=self.hf_token,
|
| 406 |
+
private=False # Public datasets (free storage)
|
| 407 |
+
)
|
| 408 |
+
|
| 409 |
+
print(f"✓ Uploaded to: https://huggingface.co/datasets/{full_name}")
|
| 410 |
+
return full_name
|
| 411 |
+
|
| 412 |
+
def download_dataset(self, dataset_name: str, output_path: Path):
|
| 413 |
+
"""Download dataset from HF to local Parquet."""
|
| 414 |
+
from datasets import load_dataset
|
| 415 |
+
|
| 416 |
+
print(f"Downloading {dataset_name} from HF Datasets...")
|
| 417 |
+
|
| 418 |
+
# Download from HF
|
| 419 |
+
dataset = load_dataset(
|
| 420 |
+
f"{self.hf_username}/{dataset_name}",
|
| 421 |
+
split="train"
|
| 422 |
+
)
|
| 423 |
+
|
| 424 |
+
# Convert to polars and save
|
| 425 |
+
df = pl.from_pandas(dataset.to_pandas())
|
| 426 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 427 |
+
df.write_parquet(output_path)
|
| 428 |
+
|
| 429 |
+
print(f"✓ Downloaded to: {output_path}")
|
| 430 |
+
return df
|
| 431 |
+
|
| 432 |
+
def list_datasets(self):
|
| 433 |
+
"""List all FBMC datasets for this user."""
|
| 434 |
+
datasets = self.api.list_datasets(author=self.hf_username)
|
| 435 |
+
fbmc_datasets = [d for d in datasets if 'fbmc' in d.id.lower()]
|
| 436 |
+
|
| 437 |
+
print(f"\nFBMC Datasets for {self.hf_username}:")
|
| 438 |
+
for ds in fbmc_datasets:
|
| 439 |
+
print(f" - {ds.id}")
|
| 440 |
+
|
| 441 |
+
return fbmc_datasets
|
| 442 |
+
|
| 443 |
+
# Example usage (will be used in Day 1)
|
| 444 |
+
if __name__ == "__main__":
|
| 445 |
+
manager = FBMCDatasetManager()
|
| 446 |
+
|
| 447 |
+
# Upload example (Day 1 will use this)
|
| 448 |
+
# manager.upload_dataset(
|
| 449 |
+
# parquet_path=Path("data/raw/cnecs_2023_2025.parquet"),
|
| 450 |
+
# dataset_name="fbmc-cnecs-2023-2025",
|
| 451 |
+
# description="FBMC CNECs data: Jan 2023 - Sept 2025"
|
| 452 |
+
# )
|
| 453 |
+
|
| 454 |
+
# Download example (HF Space will use this)
|
| 455 |
+
# manager.download_dataset(
|
| 456 |
+
# dataset_name="fbmc-cnecs-2023-2025",
|
| 457 |
+
# output_path=Path("data/raw/cnecs_2023_2025.parquet")
|
| 458 |
+
# )
|
| 459 |
+
EOF
|
| 460 |
+
|
| 461 |
+
# Create data download orchestrator
|
| 462 |
+
cat > src/data_collection/download_all.py << 'EOF'
|
| 463 |
+
"""Download all FBMC data from HuggingFace Datasets."""
|
| 464 |
+
|
| 465 |
+
from pathlib import Path
|
| 466 |
+
from hf_datasets_manager import FBMCDatasetManager
|
| 467 |
+
|
| 468 |
+
def setup_data(data_dir: Path = Path("data/raw")):
|
| 469 |
+
"""Download all datasets if not present locally."""
|
| 470 |
+
manager = FBMCDatasetManager()
|
| 471 |
+
|
| 472 |
+
datasets_to_download = {
|
| 473 |
+
"fbmc-cnecs-2023-2025": "cnecs_2023_2025.parquet",
|
| 474 |
+
"fbmc-weather-2023-2025": "weather_2023_2025.parquet",
|
| 475 |
+
"fbmc-entsoe-2023-2025": "entsoe_2023_2025.parquet",
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
data_dir.mkdir(parents=True, exist_ok=True)
|
| 479 |
+
|
| 480 |
+
for dataset_name, filename in datasets_to_download.items():
|
| 481 |
+
output_path = data_dir / filename
|
| 482 |
+
|
| 483 |
+
if output_path.exists():
|
| 484 |
+
print(f"✓ {filename} already exists, skipping")
|
| 485 |
+
else:
|
| 486 |
+
try:
|
| 487 |
+
manager.download_dataset(dataset_name, output_path)
|
| 488 |
+
except Exception as e:
|
| 489 |
+
print(f"✗ Failed to download {dataset_name}: {e}")
|
| 490 |
+
print(f" You may need to run Day 1 data collection first")
|
| 491 |
+
|
| 492 |
+
print("\n✓ Data setup complete")
|
| 493 |
+
|
| 494 |
+
if __name__ == "__main__":
|
| 495 |
+
setup_data()
|
| 496 |
+
EOF
|
| 497 |
+
|
| 498 |
+
# Make scripts executable
|
| 499 |
+
chmod +x src/data_collection/hf_datasets_manager.py
|
| 500 |
+
chmod +x src/data_collection/download_all.py
|
| 501 |
+
|
| 502 |
+
echo "✓ Data management utilities created"
|
| 503 |
+
```
|
| 504 |
+
|
| 505 |
+
**What This Does:**
|
| 506 |
+
- `hf_datasets_manager.py`: Upload/download Parquet files to/from HF Datasets
|
| 507 |
+
- `download_all.py`: One-command data setup for HF Space or analysts
|
| 508 |
+
|
| 509 |
+
**Day 1 Workflow:**
|
| 510 |
+
1. Download data from JAO/ENTSO-E/OpenMeteo to `data/raw/`
|
| 511 |
+
2. Upload each Parquet to HF Datasets (separate from Git)
|
| 512 |
+
3. Git repo stays small (only code)
|
| 513 |
+
|
| 514 |
+
**HF Space Workflow:**
|
| 515 |
+
```python
|
| 516 |
+
# In your Space's app.py startup:
|
| 517 |
+
from src.data_collection.download_all import setup_data
|
| 518 |
+
setup_data() # Downloads from HF Datasets, not Git
|
| 519 |
+
```
|
| 520 |
+
|
| 521 |
+
### 2.10 Create First Marimo Notebook (5 minutes)
|
| 522 |
+
|
| 523 |
+
```bash
|
| 524 |
+
# Create initial exploration notebook
|
| 525 |
+
cat > notebooks/01_data_exploration.py << 'EOF'
|
| 526 |
+
import marimo
|
| 527 |
+
|
| 528 |
+
__generated_with = "0.9.0"
|
| 529 |
+
app = marimo.App(width="medium")
|
| 530 |
+
|
| 531 |
+
@app.cell
|
| 532 |
+
def __():
|
| 533 |
+
import marimo as mo
|
| 534 |
+
import polars as pl
|
| 535 |
+
import altair as alt
|
| 536 |
+
from pathlib import Path
|
| 537 |
+
return mo, pl, alt, Path
|
| 538 |
+
|
| 539 |
+
@app.cell
|
| 540 |
+
def __(mo):
|
| 541 |
+
mo.md(
|
| 542 |
+
"""
|
| 543 |
+
# FBMC Flow Forecasting - Data Exploration
|
| 544 |
+
|
| 545 |
+
**Day 1 Objective**: Explore JAO FBMC data structure
|
| 546 |
+
|
| 547 |
+
## Steps:
|
| 548 |
+
1. Load downloaded Parquet files
|
| 549 |
+
2. Inspect CNECs, PTDFs, RAMs
|
| 550 |
+
3. Identify top 50 binding CNECs
|
| 551 |
+
4. Visualize temporal patterns
|
| 552 |
+
"""
|
| 553 |
+
)
|
| 554 |
+
return
|
| 555 |
+
|
| 556 |
+
@app.cell
|
| 557 |
+
def __(Path):
|
| 558 |
+
# Data paths
|
| 559 |
+
DATA_DIR = Path("../data/raw")
|
| 560 |
+
CNECS_FILE = DATA_DIR / "cnecs_2023_2025.parquet"
|
| 561 |
+
return DATA_DIR, CNECS_FILE
|
| 562 |
+
|
| 563 |
+
@app.cell
|
| 564 |
+
def __(mo, CNECS_FILE):
|
| 565 |
+
# Check if data exists
|
| 566 |
+
if CNECS_FILE.exists():
|
| 567 |
+
mo.md("✓ CNECs data found - ready for Day 1 analysis")
|
| 568 |
+
else:
|
| 569 |
+
mo.md("âš CNECs data not yet downloaded - run Day 1 collection script")
|
| 570 |
+
return
|
| 571 |
+
|
| 572 |
+
if __name__ == "__main__":
|
| 573 |
+
app.run()
|
| 574 |
+
EOF
|
| 575 |
+
|
| 576 |
+
# Test Marimo installation
|
| 577 |
+
marimo edit notebooks/01_data_exploration.py &
|
| 578 |
+
# This will open browser with interactive notebook
|
| 579 |
+
# Close after verifying it loads correctly (Ctrl+C in terminal)
|
| 580 |
+
```
|
| 581 |
+
|
| 582 |
+
### 2.11 Create Utility Modules (2 minutes)
|
| 583 |
+
|
| 584 |
+
```bash
|
| 585 |
+
# Create data loading utilities
|
| 586 |
+
cat > src/utils/data_loader.py << 'EOF'
|
| 587 |
+
"""Data loading utilities for FBMC forecasting project."""
|
| 588 |
+
|
| 589 |
+
import polars as pl
|
| 590 |
+
from pathlib import Path
|
| 591 |
+
from typing import Optional
|
| 592 |
+
|
| 593 |
+
def load_cnecs(data_dir: Path, start_date: Optional[str] = None, end_date: Optional[str] = None) -> pl.DataFrame:
|
| 594 |
+
"""Load CNEC data with optional date filtering."""
|
| 595 |
+
cnecs = pl.read_parquet(data_dir / "cnecs_2023_2025.parquet")
|
| 596 |
+
|
| 597 |
+
if start_date:
|
| 598 |
+
cnecs = cnecs.filter(pl.col("timestamp") >= start_date)
|
| 599 |
+
if end_date:
|
| 600 |
+
cnecs = cnecs.filter(pl.col("timestamp") <= end_date)
|
| 601 |
+
|
| 602 |
+
return cnecs
|
| 603 |
+
|
| 604 |
+
def load_weather(data_dir: Path, grid_points: Optional[list] = None) -> pl.DataFrame:
|
| 605 |
+
"""Load weather data with optional grid point filtering."""
|
| 606 |
+
weather = pl.read_parquet(data_dir / "weather_2023_2025.parquet")
|
| 607 |
+
|
| 608 |
+
if grid_points:
|
| 609 |
+
weather = weather.filter(pl.col("grid_point").is_in(grid_points))
|
| 610 |
+
|
| 611 |
+
return weather
|
| 612 |
+
EOF
|
| 613 |
+
|
| 614 |
+
# Create __init__.py files
|
| 615 |
+
touch src/__init__.py
|
| 616 |
+
touch src/utils/__init__.py
|
| 617 |
+
touch src/data_collection/__init__.py
|
| 618 |
+
touch src/feature_engineering/__init__.py
|
| 619 |
+
touch src/model/__init__.py
|
| 620 |
+
```
|
| 621 |
+
|
| 622 |
+
### 2.12 Initial Commit (2 minutes)
|
| 623 |
+
|
| 624 |
+
```bash
|
| 625 |
+
# Stage all changes (note: data/ is excluded by .gitignore)
|
| 626 |
+
git add .
|
| 627 |
+
|
| 628 |
+
# Create initial commit
|
| 629 |
+
git commit -m "Day 0: Initialize FBMC forecasting MVP environment
|
| 630 |
+
|
| 631 |
+
- Add project structure (notebooks, src, config, tools)
|
| 632 |
+
- Configure uv + polars + Marimo + Chronos + HF Datasets stack
|
| 633 |
+
- Create .gitignore (excludes data/ following best practices)
|
| 634 |
+
- Download JAOPuTo tool for JAO data access
|
| 635 |
+
- Configure ENTSO-E, OpenMeteo, and HuggingFace API access
|
| 636 |
+
- Add HF Datasets manager for data storage (separate from Git)
|
| 637 |
+
- Create data download utilities (download_all.py)
|
| 638 |
+
- Create initial exploration notebook
|
| 639 |
+
|
| 640 |
+
Data Strategy:
|
| 641 |
+
- Code → Git (this repo)
|
| 642 |
+
- Data → HuggingFace Datasets (separate, not in Git)
|
| 643 |
+
- NO Git LFS (following data science best practices)
|
| 644 |
+
|
| 645 |
+
Infrastructure: HF Space (A10G GPU, \$30/month)"
|
| 646 |
+
|
| 647 |
+
# Push to HF Space
|
| 648 |
+
git push origin main
|
| 649 |
+
|
| 650 |
+
# Verify push succeeded
|
| 651 |
+
git status
|
| 652 |
+
# Should show: "Your branch is up to date with 'origin/main'"
|
| 653 |
+
|
| 654 |
+
# Verify no data files were committed
|
| 655 |
+
git ls-files | grep "\.parquet"
|
| 656 |
+
# Should be empty (no .parquet files in Git)
|
| 657 |
+
```
|
| 658 |
+
|
| 659 |
+
---
|
| 660 |
+
|
| 661 |
+
## Step 3: Verify Complete Setup (5 minutes)
|
| 662 |
+
|
| 663 |
+
### 3.1 Python Environment Verification
|
| 664 |
+
|
| 665 |
+
```bash
|
| 666 |
+
# Activate environment if not already
|
| 667 |
+
source .venv/bin/activate
|
| 668 |
+
|
| 669 |
+
# Run comprehensive checks
|
| 670 |
+
python << 'EOF'
|
| 671 |
+
import sys
|
| 672 |
+
print(f"Python: {sys.version}")
|
| 673 |
+
|
| 674 |
+
packages = [
|
| 675 |
+
"polars", "pyarrow", "numpy", "scikit-learn",
|
| 676 |
+
"torch", "transformers", "marimo", "altair",
|
| 677 |
+
"entsoe", "requests", "yaml", "gradio",
|
| 678 |
+
"datasets", "huggingface_hub"
|
| 679 |
+
]
|
| 680 |
+
|
| 681 |
+
print("\nPackage Versions:")
|
| 682 |
+
for pkg in packages:
|
| 683 |
+
try:
|
| 684 |
+
if pkg == "entsoe":
|
| 685 |
+
import entsoe
|
| 686 |
+
print(f"✓ entsoe-py: {entsoe.__version__}")
|
| 687 |
+
elif pkg == "yaml":
|
| 688 |
+
import yaml
|
| 689 |
+
print(f"✓ pyyaml: {yaml.__version__}")
|
| 690 |
+
elif pkg == "huggingface_hub":
|
| 691 |
+
from huggingface_hub import HfApi
|
| 692 |
+
print(f"✓ huggingface-hub: Ready")
|
| 693 |
+
else:
|
| 694 |
+
mod = __import__(pkg)
|
| 695 |
+
print(f"✓ {pkg}: {mod.__version__}")
|
| 696 |
+
except Exception as e:
|
| 697 |
+
print(f"✗ {pkg}: {e}")
|
| 698 |
+
|
| 699 |
+
# Test Chronos specifically
|
| 700 |
+
try:
|
| 701 |
+
from chronos import ChronosPipeline
|
| 702 |
+
print("\n✓ Chronos forecasting: Ready")
|
| 703 |
+
except Exception as e:
|
| 704 |
+
print(f"\n✗ Chronos forecasting: {e}")
|
| 705 |
+
|
| 706 |
+
# Test HF Datasets
|
| 707 |
+
try:
|
| 708 |
+
from datasets import Dataset
|
| 709 |
+
print("✓ HuggingFace Datasets: Ready")
|
| 710 |
+
except Exception as e:
|
| 711 |
+
print(f"✗ HuggingFace Datasets: {e}")
|
| 712 |
+
|
| 713 |
+
print("\nAll checks complete!")
|
| 714 |
+
EOF
|
| 715 |
+
```
|
| 716 |
+
|
| 717 |
+
### 3.2 JAOPuTo Verification
|
| 718 |
+
|
| 719 |
+
```bash
|
| 720 |
+
# Test JAOPuTo with dry-run
|
| 721 |
+
java -jar tools/JAOPuTo.jar \
|
| 722 |
+
--help
|
| 723 |
+
|
| 724 |
+
# Expected: Usage information displayed without errors
|
| 725 |
+
```
|
| 726 |
+
|
| 727 |
+
### 3.3 API Access Verification
|
| 728 |
+
|
| 729 |
+
```bash
|
| 730 |
+
# Test ENTSO-E API
|
| 731 |
+
python << 'EOF'
|
| 732 |
+
from entsoe import EntsoePandasClient
|
| 733 |
+
import yaml
|
| 734 |
+
|
| 735 |
+
# Load API key
|
| 736 |
+
with open('config/api_keys.yaml') as f:
|
| 737 |
+
config = yaml.safe_load(f)
|
| 738 |
+
|
| 739 |
+
api_key = config['entsoe_api_key']
|
| 740 |
+
|
| 741 |
+
if 'YOUR_ENTSOE_API_KEY_HERE' in api_key:
|
| 742 |
+
print("âš ENTSO-E API key not configured - update config/api_keys.yaml")
|
| 743 |
+
else:
|
| 744 |
+
try:
|
| 745 |
+
client = EntsoePandasClient(api_key=api_key)
|
| 746 |
+
print("✓ ENTSO-E API client initialized successfully")
|
| 747 |
+
except Exception as e:
|
| 748 |
+
print(f"✗ ENTSO-E API error: {e}")
|
| 749 |
+
EOF
|
| 750 |
+
|
| 751 |
+
# Test OpenMeteo API
|
| 752 |
+
python << 'EOF'
|
| 753 |
+
import requests
|
| 754 |
+
|
| 755 |
+
response = requests.get(
|
| 756 |
+
"https://api.open-meteo.com/v1/forecast",
|
| 757 |
+
params={
|
| 758 |
+
"latitude": 52.52,
|
| 759 |
+
"longitude": 13.41,
|
| 760 |
+
"hourly": "temperature_2m",
|
| 761 |
+
"start_date": "2025-01-01",
|
| 762 |
+
"end_date": "2025-01-02"
|
| 763 |
+
}
|
| 764 |
+
)
|
| 765 |
+
|
| 766 |
+
if response.status_code == 200:
|
| 767 |
+
print("✓ OpenMeteo API accessible")
|
| 768 |
+
else:
|
| 769 |
+
print(f"✗ OpenMeteo API error: {response.status_code}")
|
| 770 |
+
EOF
|
| 771 |
+
|
| 772 |
+
# Test HuggingFace authentication
|
| 773 |
+
python << 'EOF'
|
| 774 |
+
from huggingface_hub import HfApi
|
| 775 |
+
import yaml
|
| 776 |
+
|
| 777 |
+
with open('config/api_keys.yaml') as f:
|
| 778 |
+
config = yaml.safe_load(f)
|
| 779 |
+
|
| 780 |
+
hf_token = config['hf_token']
|
| 781 |
+
hf_username = config['hf_username']
|
| 782 |
+
|
| 783 |
+
if 'YOUR_HF' in hf_token or 'YOUR_HF' in hf_username:
|
| 784 |
+
print("âš HuggingFace credentials not configured - update config/api_keys.yaml")
|
| 785 |
+
else:
|
| 786 |
+
try:
|
| 787 |
+
api = HfApi(token=hf_token)
|
| 788 |
+
user_info = api.whoami()
|
| 789 |
+
print(f"✓ HuggingFace authenticated as: {user_info['name']}")
|
| 790 |
+
print(f" Can create datasets: {'datasets' in user_info.get('auth', {}).get('accessToken', {}).get('role', '')}")
|
| 791 |
+
except Exception as e:
|
| 792 |
+
print(f"✗ HuggingFace authentication error: {e}")
|
| 793 |
+
print(f" Verify token has WRITE permissions")
|
| 794 |
+
EOF
|
| 795 |
+
```
|
| 796 |
+
|
| 797 |
+
### 3.4 HF Space Verification
|
| 798 |
+
|
| 799 |
+
```bash
|
| 800 |
+
# Check HF Space status
|
| 801 |
+
echo "Visit your HF Space: https://huggingface.co/spaces/YOUR_USERNAME/fbmc-forecasting"
|
| 802 |
+
echo ""
|
| 803 |
+
echo "Verify:"
|
| 804 |
+
echo " 1. JupyterLab interface loads"
|
| 805 |
+
echo " 2. Hardware shows 'A10G GPU' in bottom-right"
|
| 806 |
+
echo " 3. Files from git push are visible"
|
| 807 |
+
echo " 4. Can create new notebook"
|
| 808 |
+
```
|
| 809 |
+
|
| 810 |
+
### 3.5 Final Checklist
|
| 811 |
+
|
| 812 |
+
```bash
|
| 813 |
+
# Print final status
|
| 814 |
+
cat << 'EOF'
|
| 815 |
+
╔══════════════════════════════════════════════════════════════╗
|
| 816 |
+
║              DAY 0 SETUP VERIFICATION CHECKLIST              ║
|
| 817 |
+
╚══════════════════════════════════════════════════════════════╝
|
| 818 |
+
|
| 819 |
+
Environment:
|
| 820 |
+
[ ] Python 3.10+ installed
|
| 821 |
+
[ ] Java 11+ installed (for JAOPuTo)
|
| 822 |
+
[ ] Git installed (NO Git LFS needed)
|
| 823 |
+
[ ] uv package manager installed
|
| 824 |
+
|
| 825 |
+
Local Setup:
|
| 826 |
+
[ ] Virtual environment created and activated
|
| 827 |
+
[ ] All Python dependencies installed (23 packages)
|
| 828 |
+
[ ] JAOPuTo.jar downloaded and tested
|
| 829 |
+
[ ] API keys configured (ENTSO-E + OpenMeteo + HuggingFace)
|
| 830 |
+
[ ] HuggingFace write token obtained
|
| 831 |
+
[ ] Project structure created (8 directories)
|
| 832 |
+
[ ] .gitignore configured (data/ excluded)
|
| 833 |
+
[ ] Initial Marimo notebook created
|
| 834 |
+
[ ] Data management utilities created (hf_datasets_manager.py)
|
| 835 |
+
|
| 836 |
+
Git & HF Space:
|
| 837 |
+
[ ] HF Space created (A10G GPU, $30/month)
|
| 838 |
+
[ ] Repository cloned locally
|
| 839 |
+
[ ] .gitignore excludes all data files (*.parquet, data/)
|
| 840 |
+
[ ] Initial commit pushed to HF Space (code only, NO data)
|
| 841 |
+
[ ] HF Space JupyterLab accessible
|
| 842 |
+
[ ] Git repo size < 50 MB (no data committed)
|
| 843 |
+
|
| 844 |
+
Verification Tests:
|
| 845 |
+
[ ] Python imports successful (polars, chronos, datasets, etc.)
|
| 846 |
+
[ ] JAOPuTo --help displays correctly
|
| 847 |
+
[ ] ENTSO-E API client initializes
|
| 848 |
+
[ ] OpenMeteo API responds (status 200)
|
| 849 |
+
[ ] HuggingFace authentication successful (write access)
|
| 850 |
+
[ ] Marimo notebook opens in browser
|
| 851 |
+
|
| 852 |
+
Data Strategy Confirmed:
|
| 853 |
+
[ ] Code goes in Git (version controlled)
|
| 854 |
+
[ ] Data goes in HuggingFace Datasets (separate storage)
|
| 855 |
+
[ ] NO Git LFS setup (following data science best practices)
|
| 856 |
+
[ ] data/ directory in .gitignore
|
| 857 |
+
|
| 858 |
+
Ready for Day 1: [ ]
|
| 859 |
+
|
| 860 |
+
Next Step: Run Day 1 data collection (8 hours)
|
| 861 |
+
- Download data locally via JAOPuTo/APIs
|
| 862 |
+
- Upload to HuggingFace Datasets (separate from Git)
|
| 863 |
+
- Total data: ~6 GB (stored in HF Datasets, NOT Git)
|
| 864 |
+
EOF
|
| 865 |
+
```
|
| 866 |
+
|
| 867 |
+
---
|
| 868 |
+
|
| 869 |
+
## Troubleshooting
|
| 870 |
+
|
| 871 |
+
### Issue: Java not found
|
| 872 |
+
```bash
|
| 873 |
+
# Install Java 17 (recommended)
|
| 874 |
+
# Mac:
|
| 875 |
+
brew install openjdk@17
|
| 876 |
+
|
| 877 |
+
# Ubuntu/Debian:
|
| 878 |
+
sudo apt update
|
| 879 |
+
sudo apt install openjdk-17-jdk
|
| 880 |
+
|
| 881 |
+
# Verify:
|
| 882 |
+
java -version
|
| 883 |
+
```
|
| 884 |
+
|
| 885 |
+
### Issue: uv installation fails
|
| 886 |
+
```bash
|
| 887 |
+
# Alternative: Use pip directly
|
| 888 |
+
python -m venv .venv
|
| 889 |
+
source .venv/bin/activate
|
| 890 |
+
pip install -r requirements.txt
|
| 891 |
+
```
|
| 892 |
+
|
| 893 |
+
### Issue: Git LFS files not syncing
|
| 894 |
+
**Not applicable** - We're using HuggingFace Datasets, not Git LFS.
|
| 895 |
+
|
| 896 |
+
If you see Git LFS references, you may have an old version of this guide. Data files should NEVER be in Git.
|
| 897 |
+
|
| 898 |
+
### Issue: HuggingFace authentication fails
|
| 899 |
+
```bash
|
| 900 |
+
# Verify token is correct
|
| 901 |
+
python << 'EOF'
|
| 902 |
+
from huggingface_hub import HfApi
|
| 903 |
+
import yaml
|
| 904 |
+
|
| 905 |
+
with open('config/api_keys.yaml') as f:
|
| 906 |
+
config = yaml.safe_load(f)
|
| 907 |
+
|
| 908 |
+
try:
|
| 909 |
+
api = HfApi(token=config['hf_token'])
|
| 910 |
+
print(api.whoami())
|
| 911 |
+
except Exception as e:
|
| 912 |
+
print(f"Error: {e}")
|
| 913 |
+
print("\nTroubleshooting:")
|
| 914 |
+
print("1. Visit: https://huggingface.co/settings/tokens")
|
| 915 |
+
print("2. Verify token has WRITE permission")
|
| 916 |
+
print("3. Copy token exactly (starts with 'hf_')")
|
| 917 |
+
print("4. Update config/api_keys.yaml and .env")
|
| 918 |
+
EOF
|
| 919 |
+
```
|
| 920 |
+
|
| 921 |
+
### Issue: Cannot upload to HuggingFace Datasets
|
| 922 |
+
```bash
|
| 923 |
+
# Common causes:
|
| 924 |
+
# 1. Token doesn't have write permissions
|
| 925 |
+
# Fix: Create new token with "write" scope
|
| 926 |
+
|
| 927 |
+
# 2. Dataset name already exists
|
| 928 |
+
# Fix: Use different name or add version suffix
|
| 929 |
+
# Example: fbmc-cnecs-2023-2025-v2
|
| 930 |
+
|
| 931 |
+
# 3. File too large (>5GB single file limit)
|
| 932 |
+
# Fix: Split into multiple datasets or use sharding
|
| 933 |
+
|
| 934 |
+
# Test upload with small sample:
|
| 935 |
+
python << 'EOF'
|
| 936 |
+
from datasets import Dataset
|
| 937 |
+
import pandas as pd
|
| 938 |
+
|
| 939 |
+
# Create tiny test dataset
|
| 940 |
+
df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
|
| 941 |
+
dataset = Dataset.from_pandas(df)
|
| 942 |
+
|
| 943 |
+
# Try uploading
|
| 944 |
+
try:
|
| 945 |
+
dataset.push_to_hub("YOUR_USERNAME/test-dataset", token="YOUR_TOKEN")
|
| 946 |
+
print("✓ Upload successful - authentication works")
|
| 947 |
+
except Exception as e:
|
| 948 |
+
print(f"✗ Upload failed: {e}")
|
| 949 |
+
EOF
|
| 950 |
+
```
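
If the blocker is the single-file size limit, one workable approach is to shard the local Parquet file by month before uploading. The sketch below is illustrative only; the file paths, the `timestamp` column name, and the shard naming are assumptions rather than part of the project scripts:

```python
# Hypothetical sharding sketch; paths, column names, and shard naming are assumptions.
import polars as pl
from pathlib import Path

src = Path("data/raw/cnecs_2024_2025.parquet")   # large local file (never committed to Git)
out_dir = Path("data/raw/cnecs_shards")
out_dir.mkdir(parents=True, exist_ok=True)

# Tag each row with its calendar month, then write one Parquet shard per month
df = pl.read_parquet(src).with_columns(
    pl.col("timestamp").dt.strftime("%Y_%m").alias("month_key")
)
for shard in df.partition_by("month_key"):
    key = shard["month_key"][0]
    shard.drop("month_key").write_parquet(out_dir / f"cnecs_{key}.parquet")

print(f"Wrote {df['month_key'].n_unique()} monthly shards to {out_dir}")
```

Each shard stays far below the single-file limit, and the whole folder can then be pushed to one dataset repo.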
|
| 951 |
+
|
| 952 |
+
### Issue: Marimo notebook won't open
|
| 953 |
+
```bash
|
| 954 |
+
# Check marimo installation
|
| 955 |
+
marimo --version
|
| 956 |
+
|
| 957 |
+
# Try running without opening browser
|
| 958 |
+
marimo run notebooks/01_data_exploration.py
|
| 959 |
+
|
| 960 |
+
# Check for port conflicts
|
| 961 |
+
lsof -i :2718 # Default Marimo port
|
| 962 |
+
```
|
| 963 |
+
|
| 964 |
+
### Issue: ENTSO-E API key invalid
|
| 965 |
+
```bash
|
| 966 |
+
# Verify key in ENTSO-E Transparency Platform:
|
| 967 |
+
# 1. Login: https://transparency.entsoe.eu/
|
| 968 |
+
# 2. Navigate: Account Settings → Web API Security Token
|
| 969 |
+
# 3. Copy key exactly (no spaces)
|
| 970 |
+
# 4. Update: config/api_keys.yaml and .env
|
| 971 |
+
```
|
| 972 |
+
|
| 973 |
+
### Issue: HF Space shows "Building..." forever
|
| 974 |
+
```bash
|
| 975 |
+
# Check HF Space logs:
|
| 976 |
+
# Visit: https://huggingface.co/spaces/YOUR_USERNAME/fbmc-forecasting
|
| 977 |
+
# Click: "Settings" → "Logs"
|
| 978 |
+
|
| 979 |
+
# Common fix: Ensure requirements.txt is valid
|
| 980 |
+
# Test locally:
|
| 981 |
+
pip install -r requirements.txt --dry-run
|
| 982 |
+
```
|
| 983 |
+
|
| 984 |
+
---
|
| 985 |
+
|
| 986 |
+
## What's Next: Day 1 Preview
|
| 987 |
+
|
| 988 |
+
**Day 1 Objective**: Download 12 months of historical data (Oct 2024 - Sept 2025)
|
| 989 |
+
|
| 990 |
+
**Data Collection Tasks:**
|
| 991 |
+
1. **JAO FBMC Data** (4 hours)
|
| 992 |
+
- CNECs: ~500 MB
|
| 993 |
+
- PTDFs: ~800 MB
|
| 994 |
+
- RAMs: ~400 MB
|
| 995 |
+
- Shadow prices: ~300 MB
|
| 996 |
+
|
| 997 |
+
2. **ENTSO-E Data** (2 hours)
|
| 998 |
+
- Generation forecasts: 12 zones × 12 months
|
| 999 |
+
- Actual generation: 12 zones × 12 months
|
| 1000 |
+
- Cross-border flows: 20 borders × 12 months
|
| 1001 |
+
|
| 1002 |
+
3. **OpenMeteo Weather** (2 hours)
|
| 1003 |
+
- 52 grid points × 12 months
|
| 1004 |
+
- 8 variables per point
|
| 1005 |
+
- Parallel download optimization
|
| 1006 |
+
|
| 1007 |
+
**Total Data Size**: ~6 GB (compressed Parquet)
|
| 1008 |
+
|
| 1009 |
+
**Day 1 Script**: Will be provided with exact JAOPuTo commands and parallel download logic.
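
As a rough preview of that parallel download logic, the hedged sketch below chunks an OpenMeteo historical request into 2-week windows and fans the chunks out across a small thread pool. The grid points, hourly variables, and worker count are placeholder assumptions; the actual implementation lives in `src/data_collection/collect_openmeteo.py` and enforces the project's rate-limit budget.

```python
# Illustrative sketch only; the real logic (rate limiting, all 52 points, Parquet
# output) lives in src/data_collection/collect_openmeteo.py. Values here are assumptions.
from concurrent.futures import ThreadPoolExecutor
from datetime import date, timedelta

import requests

GRID_POINTS = [(52.52, 13.41), (48.86, 2.35)]        # two of the 52 grid points
START, END = date(2024, 10, 1), date(2025, 9, 30)

def two_week_chunks(start: date, end: date):
    """Yield (start, end) date pairs covering the range in ~2-week windows."""
    cur = start
    while cur <= end:
        nxt = min(cur + timedelta(days=13), end)
        yield cur, nxt
        cur = nxt + timedelta(days=1)

def fetch(task):
    lat, lon, s, e = task
    r = requests.get(
        "https://archive-api.open-meteo.com/v1/archive",
        params={
            "latitude": lat, "longitude": lon,
            "start_date": s.isoformat(), "end_date": e.isoformat(),
            "hourly": "temperature_2m,windspeed_100m",   # subset of the 8 variables
        },
        timeout=60,
    )
    r.raise_for_status()
    return r.json()

tasks = [(lat, lon, s, e) for lat, lon in GRID_POINTS for s, e in two_week_chunks(START, END)]
with ThreadPoolExecutor(max_workers=4) as pool:       # modest parallelism, stays below the rate limit
    results = list(pool.map(fetch, tasks))
print(f"Downloaded {len(results)} chunks")
```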
|
| 1010 |
+
|
| 1011 |
+
---
|
| 1012 |
+
|
| 1013 |
+
## Summary
|
| 1014 |
+
|
| 1015 |
+
**Time Investment**: 45 minutes
|
| 1016 |
+
**Result**: Production-ready local + cloud development environment
|
| 1017 |
+
|
| 1018 |
+
**You Now Have:**
|
| 1019 |
+
- ✓ HF Space with A10G GPU ($30/month)
|
| 1020 |
+
- ✓ Local Python environment (23 packages including HF Datasets)
|
| 1021 |
+
- ✓ JAOPuTo tool for JAO data access
|
| 1022 |
+
- ✓ ENTSO-E + OpenMeteo + HuggingFace API access configured
|
| 1023 |
+
- ✓ HuggingFace Datasets manager for data storage (separate from Git)
|
| 1024 |
+
- ✓ Data download/upload utilities (hf_datasets_manager.py)
|
| 1025 |
+
- ✓ Marimo reactive notebook environment
|
| 1026 |
+
- ✓ .gitignore configured (data/ excluded, following best practices)
|
| 1027 |
+
- ✓ Complete project structure (8 directories)
|
| 1028 |
+
|
| 1029 |
+
**Data Strategy Implemented:**
|
| 1030 |
+
```
|
| 1031 |
+
Code (version controlled) → Git Repository (~50 MB)
|
| 1032 |
+
Data (storage & versioning) → HuggingFace Datasets (~6 GB)
|
| 1033 |
+
NO Git LFS (following data science best practices)
|
| 1034 |
+
```
|
| 1035 |
+
|
| 1036 |
+
**Ready For**: Day 1 data collection (8 hours)
|
| 1037 |
+
- Download data locally (JAOPuTo + APIs)
|
| 1038 |
+
- Upload to HuggingFace Datasets (not Git)
|
| 1039 |
+
- Git repo stays clean (code only)
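
A hedged sketch of that upload step is shown below. The project's own helper is `src/data_collection/hf_datasets_manager.py`; the repo id, folder layout, and `.env` variable names here are illustrative assumptions:

```python
# Hypothetical upload sketch; repo id and .env variable names are assumptions.
import os

from dotenv import load_dotenv
from huggingface_hub import HfApi

load_dotenv()                                          # expects a write token in .env
api = HfApi(token=os.getenv("HF_TOKEN"))

repo_id = f"{os.getenv('HF_USERNAME', 'YOUR_USERNAME')}/fbmc-raw-data"
api.create_repo(repo_id, repo_type="dataset", exist_ok=True, private=True)

# Push the locally downloaded Parquet files (ignored by Git) to the dataset repo
api.upload_folder(
    folder_path="data/raw",
    repo_id=repo_id,
    repo_type="dataset",
    allow_patterns=["*.parquet"],
)
print(f"Uploaded data/raw/*.parquet to https://huggingface.co/datasets/{repo_id}")
```

The Git repository stays code-only, while the heavy Parquet files live in the dataset repo and can be pulled back down later (e.g., via `download_all.py`).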
|
| 1040 |
+
|
| 1041 |
+
---
|
| 1042 |
+
|
| 1043 |
+
**Document Version**: 1.0
|
| 1044 |
+
**Last Updated**: 2025-10-26
|
| 1045 |
+
**Project**: FBMC Flow Forecasting MVP (Zero-Shot)
|
|
The diff for this file is too large to render.
|
|
|
|
@@ -0,0 +1,214 @@
| 1 |
+
# Java 11+ Installation Guide for JAOPuTo Tool
|
| 2 |
+
|
| 3 |
+
**Required for**: JAO FBMC data collection via JAOPuTo tool
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Quick Install (Windows)
|
| 8 |
+
|
| 9 |
+
### Option 1: Adoptium Eclipse Temurin (Recommended)
|
| 10 |
+
|
| 11 |
+
1. **Download Java 17 (LTS)**:
|
| 12 |
+
- Visit: https://adoptium.net/temurin/releases/
|
| 13 |
+
- Select:
|
| 14 |
+
- **Operating System**: Windows
|
| 15 |
+
- **Architecture**: x64
|
| 16 |
+
- **Package Type**: JDK
|
| 17 |
+
- **Version**: 17 (LTS)
|
| 18 |
+
- Download: `.msi` installer
|
| 19 |
+
|
| 20 |
+
2. **Install**:
|
| 21 |
+
- Run the downloaded `.msi` file
|
| 22 |
+
- Accept defaults (includes adding to PATH)
|
| 23 |
+
- Click "Install"
|
| 24 |
+
|
| 25 |
+
3. **Verify**:
|
| 26 |
+
```bash
|
| 27 |
+
java -version
|
| 28 |
+
```
|
| 29 |
+
Should output: `openjdk version "17.0.x"`
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
### Option 2: Chocolatey (If Installed)
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
choco install temurin17
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
Then verify:
|
| 40 |
+
```bash
|
| 41 |
+
java -version
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
---
|
| 45 |
+
|
| 46 |
+
### Option 3: Manual Download (Alternative)
|
| 47 |
+
|
| 48 |
+
If Adoptium doesn't work:
|
| 49 |
+
|
| 50 |
+
1. **Oracle JDK** (Requires Oracle account):
|
| 51 |
+
- https://www.oracle.com/java/technologies/downloads/#java17
|
| 52 |
+
|
| 53 |
+
2. **Amazon Corretto**:
|
| 54 |
+
- https://aws.amazon.com/corretto/
|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
## Post-Installation
|
| 59 |
+
|
| 60 |
+
### 1. Verify Java Installation
|
| 61 |
+
|
| 62 |
+
Open **Git Bash** or **Command Prompt** and run:
|
| 63 |
+
|
| 64 |
+
```bash
|
| 65 |
+
java -version
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
**Expected output**:
|
| 69 |
+
```
|
| 70 |
+
openjdk version "17.0.10" 2024-01-16
|
| 71 |
+
OpenJDK Runtime Environment Temurin-17.0.10+7 (build 17.0.10+7)
|
| 72 |
+
OpenJDK 64-Bit Server VM Temurin-17.0.10+7 (build 17.0.10+7, mixed mode, sharing)
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
### 2. Verify JAVA_HOME (Optional but Recommended)
|
| 76 |
+
|
| 77 |
+
```bash
|
| 78 |
+
echo $JAVA_HOME # Git Bash
|
| 79 |
+
echo %JAVA_HOME% # Command Prompt
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
If not set, add to environment variables:
|
| 83 |
+
- Path: `C:\Program Files\Eclipse Adoptium\jdk-17.0.10.7-hotspot\`
|
| 84 |
+
- Variable: `JAVA_HOME`
|
| 85 |
+
|
| 86 |
+
### 3. Test JAOPuTo
|
| 87 |
+
|
| 88 |
+
Download JAOPuTo.jar (see next section), then test:
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
java -jar tools/JAOPuTo.jar --help
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
Should display help information without errors.
|
| 95 |
+
|
| 96 |
+
---
|
| 97 |
+
|
| 98 |
+
## Download JAOPuTo Tool
|
| 99 |
+
|
| 100 |
+
### Official Download
|
| 101 |
+
|
| 102 |
+
1. **Visit**: https://publicationtool.jao.eu/core/
|
| 103 |
+
2. **Look for**: Download section or "JAOPuTo" link
|
| 104 |
+
3. **Save to**: `C:\Users\evgue\projects\fbmc_chronos2\tools\JAOPuTo.jar`
|
| 105 |
+
|
| 106 |
+
### Alternative Sources
|
| 107 |
+
|
| 108 |
+
If official site is unclear:
|
| 109 |
+
|
| 110 |
+
1. **JAO Support**:
|
| 111 |
+
- Email: [email protected]
|
| 112 |
+
- Subject: "JAOPuTo Tool Download Request"
|
| 113 |
+
- Request: Latest JAOPuTo.jar for FBMC data download
|
| 114 |
+
|
| 115 |
+
2. **Check Documentation**:
|
| 116 |
+
- https://www.jao.eu/core-fbmc
|
| 117 |
+
- Look for API or data download tools
|
| 118 |
+
|
| 119 |
+
---
|
| 120 |
+
|
| 121 |
+
## Troubleshooting
|
| 122 |
+
|
| 123 |
+
### Issue: "java: command not found"
|
| 124 |
+
|
| 125 |
+
**Solution 1**: Restart Git Bash/terminal after installation
|
| 126 |
+
|
| 127 |
+
**Solution 2**: Manually add Java to PATH
|
| 128 |
+
- Open: System Properties → Environment Variables
|
| 129 |
+
- Edit: PATH
|
| 130 |
+
- Add: `C:\Program Files\Eclipse Adoptium\jdk-17.0.10.7-hotspot\bin`
|
| 131 |
+
- Restart terminal
|
| 132 |
+
|
| 133 |
+
### Issue: "JAR file not found"
|
| 134 |
+
|
| 135 |
+
**Check**:
|
| 136 |
+
```bash
|
| 137 |
+
ls -la tools/JAOPuTo.jar
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
**Solution**: Ensure JAOPuTo.jar is in `tools/` directory
|
| 141 |
+
|
| 142 |
+
### Issue: "Unsupported Java version"
|
| 143 |
+
|
| 144 |
+
JAOPuTo requires Java **11 or higher**.
|
| 145 |
+
|
| 146 |
+
Check version:
|
| 147 |
+
```bash
|
| 148 |
+
java -version
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
If you have Java 8 or older, install Java 17 (LTS).
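
If you prefer to assert this from Python before invoking JAOPuTo, a minimal check could look like the sketch below; the version-string parsing is a simplifying assumption (the project's `collect_jao.py` simply prints the banner):

```python
# Minimal sketch: verify the Java major version from Python (parsing is an assumption).
import re
import subprocess

proc = subprocess.run(["java", "-version"], capture_output=True, text=True)
banner = (proc.stderr or proc.stdout).splitlines()[0]   # java prints the banner to stderr
match = re.search(r'version "(\d+)(?:\.(\d+))?', banner)
major = int(match.group(1)) if match else 0
if major == 1 and match and match.group(2):             # pre-Java 9 reports "1.8.0_..."
    major = int(match.group(2))

print(f"Detected: {banner}")
if major < 11:
    raise SystemExit("JAOPuTo requires Java 11 or higher - install Java 17 (LTS)")
```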
|
| 152 |
+
|
| 153 |
+
### Issue: Multiple Java Versions
|
| 154 |
+
|
| 155 |
+
If you have multiple Java installations:
|
| 156 |
+
|
| 157 |
+
1. Check current version:
|
| 158 |
+
```bash
|
| 159 |
+
java -version
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
2. List all installations:
|
| 163 |
+
```bash
|
| 164 |
+
where java # Windows
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
3. Set specific version:
|
| 168 |
+
- Update PATH to prioritize Java 17
|
| 169 |
+
- Or use full path: `"C:\Program Files\Eclipse Adoptium\...\bin\java.exe"`
|
| 170 |
+
|
| 171 |
+
---
|
| 172 |
+
|
| 173 |
+
## Next Steps After Java Installation
|
| 174 |
+
|
| 175 |
+
Once Java is installed and verified:
|
| 176 |
+
|
| 177 |
+
1. **Download JAOPuTo.jar**:
|
| 178 |
+
- Save to: `tools/JAOPuTo.jar`
|
| 179 |
+
|
| 180 |
+
2. **Test JAO collection**:
|
| 181 |
+
```bash
|
| 182 |
+
python src/data_collection/collect_jao.py --manual-instructions
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
3. **Begin Day 1 data collection**:
|
| 186 |
+
```bash
|
| 187 |
+
# OpenMeteo (5 minutes)
|
| 188 |
+
python src/data_collection/collect_openmeteo.py
|
| 189 |
+
|
| 190 |
+
# ENTSO-E (longer, depends on data volume)
|
| 191 |
+
python src/data_collection/collect_entsoe.py
|
| 192 |
+
|
| 193 |
+
# JAO FBMC data
|
| 194 |
+
python src/data_collection/collect_jao.py
|
| 195 |
+
```
|
| 196 |
+
|
| 197 |
+
---
|
| 198 |
+
|
| 199 |
+
## Quick Reference
|
| 200 |
+
|
| 201 |
+
| Item | Value |
|
| 202 |
+
|------|-------|
|
| 203 |
+
| **Recommended Version** | Java 17 (LTS) |
|
| 204 |
+
| **Minimum Version** | Java 11 |
|
| 205 |
+
| **Download** | https://adoptium.net/temurin/releases/ |
|
| 206 |
+
| **JAOPuTo Tool** | https://publicationtool.jao.eu/core/ |
|
| 207 |
+
| **Support** | [email protected] |
|
| 208 |
+
| **Verify Command** | `java -version` |
|
| 209 |
+
|
| 210 |
+
---
|
| 211 |
+
|
| 212 |
+
**Document Version**: 1.0
|
| 213 |
+
**Last Updated**: 2025-10-27
|
| 214 |
+
**Project**: FBMC Flow Forecasting MVP
|
|
@@ -0,0 +1,90 @@
| 1 |
+
# FBMC Flow Forecasting MVP - Activity Log
|
| 2 |
+
|
| 3 |
+
## 2025-10-27 13:00 - Day 0: Environment Setup Complete
|
| 4 |
+
|
| 5 |
+
### Work Completed
|
| 6 |
+
- Installed uv package manager at C:\Users\evgue\.local\bin\uv.exe
|
| 7 |
+
- Installed Python 3.13.2 via uv (managed installation)
|
| 8 |
+
- Created virtual environment at .venv/ with Python 3.13.2
|
| 9 |
+
- Installed 179 packages from requirements.txt
|
| 10 |
+
- Created .gitignore to exclude data files, venv, and secrets
|
| 11 |
+
- Verified key packages: polars 1.34.0, torch 2.9.0+cpu, transformers 4.57.1, chronos-forecasting 2.0.0, datasets, marimo 0.17.2, altair 5.5.0, entsoe-py, gradio 5.49.1
|
| 12 |
+
- Created doc/ folder for documentation
|
| 13 |
+
- Moved Day_0_Quick_Start_Guide.md and FBMC_Flow_Forecasting_MVP_ZERO_SHOT_PLAN.md to doc/
|
| 14 |
+
- Deleted verify_install.py test script (cleanup per global rules)
|
| 15 |
+
|
| 16 |
+
### Files Created
|
| 17 |
+
- requirements.txt - Full dependency list
|
| 18 |
+
- .venv/ - Virtual environment
|
| 19 |
+
- .gitignore - Git exclusions
|
| 20 |
+
- doc/ - Documentation folder
|
| 21 |
+
- doc/activity.md - This activity log
|
| 22 |
+
|
| 23 |
+
### Files Moved
|
| 24 |
+
- doc/Day_0_Quick_Start_Guide.md (from root)
|
| 25 |
+
- doc/FBMC_Flow_Forecasting_MVP_ZERO_SHOT_PLAN.md (from root)
|
| 26 |
+
|
| 27 |
+
### Files Deleted
|
| 28 |
+
- verify_install.py (test script, no longer needed)
|
| 29 |
+
|
| 30 |
+
### Key Decisions
|
| 31 |
+
- Kept torch/transformers/chronos in local environment despite CPU-only hardware (provides flexibility, already installed, minimal overhead)
|
| 32 |
+
- Using uv-managed Python 3.13.2 (isolated from Miniconda base environment)
|
| 33 |
+
- Data management philosophy: Code → Git, Data → HuggingFace Datasets, NO Git LFS
|
| 34 |
+
- Project structure: Clean root with CLAUDE.md and requirements.txt, all other docs in doc/ folder
|
| 35 |
+
|
| 36 |
+
### Status
|
| 37 |
+
✅ Day 0 Phase 1 complete - Environment ready for utilities and API setup
|
| 38 |
+
|
| 39 |
+
### Next Steps
|
| 40 |
+
- Create data collection utilities with rate limiting
|
| 41 |
+
- Configure API keys (ENTSO-E, HuggingFace, OpenMeteo)
|
| 42 |
+
- Download JAOPuTo tool for JAO data access (requires Java 11+)
|
| 43 |
+
- Begin Day 1: Data collection (8 hours)
|
| 44 |
+
|
| 45 |
+
---
|
| 46 |
+
|
| 47 |
+
## 2025-10-27 15:00 - Day 0 Continued: Utilities and API Configuration
|
| 48 |
+
|
| 49 |
+
### Work Completed
|
| 50 |
+
- Configured ENTSO-E API key in .env file (ec254e4d-b4db-455e-9f9a-bf5713bfc6b1)
|
| 51 |
+
- Set HuggingFace username: evgueni-p (HF Space setup deferred to Day 3)
|
| 52 |
+
- Created src/data_collection/hf_datasets_manager.py - HuggingFace Datasets upload/download utility (uses .env)
|
| 53 |
+
- Created src/data_collection/download_all.py - Batch dataset download script
|
| 54 |
+
- Created src/utils/data_loader.py - Data loading and validation utilities
|
| 55 |
+
- Created notebooks/01_data_exploration.py - Marimo notebook for Day 1 data exploration
|
| 56 |
+
- Deleted redundant config/api_keys.yaml (using .env for all API configuration)
|
| 57 |
+
|
| 58 |
+
### Files Created
|
| 59 |
+
- src/data_collection/hf_datasets_manager.py - HF Datasets manager with .env integration
|
| 60 |
+
- src/data_collection/download_all.py - Dataset download orchestrator
|
| 61 |
+
- src/utils/data_loader.py - Data loading and validation utilities
|
| 62 |
+
- notebooks/01_data_exploration.py - Initial Marimo exploration notebook
|
| 63 |
+
|
| 64 |
+
### Files Deleted
|
| 65 |
+
- config/api_keys.yaml (redundant - using .env instead)
|
| 66 |
+
|
| 67 |
+
### Key Decisions
|
| 68 |
+
- Using .env for ALL API configuration (simpler than dual .env + YAML approach)
|
| 69 |
+
- HuggingFace Space setup deferred to Day 3 when GPU inference is needed
|
| 70 |
+
- Working locally first: data collection → exploration → feature engineering → then deploy to HF Space
|
| 71 |
+
- GitHub username: evgspacdmy (for Git repository setup)
|
| 72 |
+
- Data scope: Oct 2024 - Sept 2025 (leaves Oct 2025 for live testing)
|
| 73 |
+
|
| 74 |
+
### Status
|
| 75 |
+
⚠️ Day 0 Phase 2 in progress - Need to complete:
|
| 76 |
+
- ❌ Java 11+ installation (blocker for JAOPuTo tool)
|
| 77 |
+
- ❌ Create data collection scripts with rate limiting (OpenMeteo, ENTSO-E)
|
| 78 |
+
- ❌ Download JAOPuTo.jar tool
|
| 79 |
+
- ❌ Initialize Git repository
|
| 80 |
+
- ❌ Create GitHub repository and push initial commit
|
| 81 |
+
|
| 82 |
+
### Next Steps
|
| 83 |
+
1. Install Java 11+ (requirement for JAOPuTo)
|
| 84 |
+
2. Create OpenMeteo data collection script with rate limiting
|
| 85 |
+
3. Create ENTSO-E data collection script with rate limiting
|
| 86 |
+
4. Create JAO data collection wrapper script
|
| 87 |
+
5. Initialize Git repository and push to GitHub (evgspacdmy)
|
| 88 |
+
6. Begin Day 1: Data collection (8 hours)
|
| 89 |
+
|
| 90 |
+
---
|
|
@@ -0,0 +1,284 @@
| 1 |
+
"""FBMC Flow Forecasting - Data Exploration Notebook
|
| 2 |
+
|
| 3 |
+
Day 1 Objective: Explore downloaded JAO FBMC data structure and identify patterns.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
marimo edit notebooks/01_data_exploration.py
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import marimo
|
| 10 |
+
|
| 11 |
+
__generated_with = "0.17.2"
|
| 12 |
+
app = marimo.App(width="medium")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@app.cell
|
| 16 |
+
def __():
|
| 17 |
+
import marimo as mo
|
| 18 |
+
import polars as pl
|
| 19 |
+
import altair as alt
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
import sys
|
| 22 |
+
|
| 23 |
+
# Add src to path for imports
|
| 24 |
+
sys.path.insert(0, str(Path.cwd().parent / "src"))
|
| 25 |
+
|
| 26 |
+
return mo, pl, alt, Path, sys
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@app.cell
|
| 30 |
+
def __(mo):
|
| 31 |
+
mo.md(
|
| 32 |
+
r"""
|
| 33 |
+
# FBMC Flow Forecasting - Data Exploration
|
| 34 |
+
|
| 35 |
+
**MVP Objective**: Zero-shot electricity cross-border capacity forecasting
|
| 36 |
+
|
| 37 |
+
## Day 1 Goals:
|
| 38 |
+
1. Load downloaded FBMC data (JAO, ENTSO-E, OpenMeteo)
|
| 39 |
+
2. Inspect CNECs, PTDFs, RAMs structure
|
| 40 |
+
3. Identify top 50 binding CNECs by frequency
|
| 41 |
+
4. Visualize temporal patterns and correlations
|
| 42 |
+
5. Validate data completeness (>95% coverage)
|
| 43 |
+
|
| 44 |
+
## Data Sources:
|
| 45 |
+
- **JAO FBMC**: CNECs, PTDFs, RAMs, shadow prices (Oct 2024 - Sept 2025)
|
| 46 |
+
- **ENTSO-E**: Generation, flows, demand (12 bidding zones)
|
| 47 |
+
- **OpenMeteo**: Weather at 52 strategic grid points
|
| 48 |
+
"""
|
| 49 |
+
)
|
| 50 |
+
return
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@app.cell
|
| 54 |
+
def __(Path):
|
| 55 |
+
# Configuration
|
| 56 |
+
DATA_DIR = Path("../data/raw")
|
| 57 |
+
RESULTS_DIR = Path("../results/visualizations")
|
| 58 |
+
|
| 59 |
+
# Expected data files
|
| 60 |
+
CNECS_FILE = DATA_DIR / "cnecs_2024_2025.parquet"
|
| 61 |
+
WEATHER_FILE = DATA_DIR / "weather_2024_2025.parquet"
|
| 62 |
+
ENTSOE_FILE = DATA_DIR / "entsoe_2024_2025.parquet"
|
| 63 |
+
|
| 64 |
+
return DATA_DIR, RESULTS_DIR, CNECS_FILE, WEATHER_FILE, ENTSOE_FILE
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@app.cell
|
| 68 |
+
def __(mo, CNECS_FILE, WEATHER_FILE, ENTSOE_FILE):
|
| 69 |
+
# Check data availability
|
| 70 |
+
data_status = {
|
| 71 |
+
"CNECs": CNECS_FILE.exists(),
|
| 72 |
+
"Weather": WEATHER_FILE.exists(),
|
| 73 |
+
"ENTSO-E": ENTSOE_FILE.exists(),
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
missing = [k for k, v in data_status.items() if not v]

if all(data_status.values()):
|
| 77 |
+
mo.md("✅ **All data files found - ready for exploration!**")
|
| 78 |
+
else:
|
| 79 |
+
|
| 80 |
+
mo.md(
|
| 81 |
+
f"""
|
| 82 |
+
⚠️ **Missing data files**: {', '.join(missing)}
|
| 83 |
+
|
| 84 |
+
**Next Steps:**
|
| 85 |
+
1. Run Day 1 data collection script
|
| 86 |
+
2. Download from JAO, ENTSO-E, OpenMeteo APIs
|
| 87 |
+
3. Return here for exploration
|
| 88 |
+
"""
|
| 89 |
+
)
|
| 90 |
+
return data_status, missing
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@app.cell
|
| 94 |
+
def __(mo, data_status):
|
| 95 |
+
# Only proceed if data exists
|
| 96 |
+
if not all(data_status.values()):
|
| 97 |
+
mo.stop(True, mo.md("⚠️ Data not available - stopping notebook"))
|
| 98 |
+
return
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
@app.cell
|
| 102 |
+
def __(pl, CNECS_FILE, WEATHER_FILE, ENTSOE_FILE):
|
| 103 |
+
# Load data
|
| 104 |
+
print("Loading FBMC datasets...")
|
| 105 |
+
|
| 106 |
+
cnecs_df = pl.read_parquet(CNECS_FILE)
|
| 107 |
+
weather_df = pl.read_parquet(WEATHER_FILE)
|
| 108 |
+
entsoe_df = pl.read_parquet(ENTSOE_FILE)
|
| 109 |
+
|
| 110 |
+
print(f"✅ CNECs: {cnecs_df.shape}")
|
| 111 |
+
print(f"✅ Weather: {weather_df.shape}")
|
| 112 |
+
print(f"✅ ENTSO-E: {entsoe_df.shape}")
|
| 113 |
+
|
| 114 |
+
return cnecs_df, weather_df, entsoe_df
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
@app.cell
|
| 118 |
+
def __(mo, cnecs_df, weather_df, entsoe_df):
|
| 119 |
+
mo.md(
|
| 120 |
+
f"""
|
| 121 |
+
## Dataset Overview
|
| 122 |
+
|
| 123 |
+
### CNECs Data
|
| 124 |
+
- **Shape**: {cnecs_df.shape[0]:,} rows × {cnecs_df.shape[1]} columns
|
| 125 |
+
- **Date Range**: {cnecs_df['timestamp'].min()} to {cnecs_df['timestamp'].max()}
|
| 126 |
+
- **Unique Borders**: {cnecs_df['border'].n_unique() if 'border' in cnecs_df.columns else 'N/A'}
|
| 127 |
+
|
| 128 |
+
### Weather Data
|
| 129 |
+
- **Shape**: {weather_df.shape[0]:,} rows × {weather_df.shape[1]} columns
|
| 130 |
+
- **Date Range**: {weather_df['timestamp'].min()} to {weather_df['timestamp'].max()}
|
| 131 |
+
- **Grid Points**: {weather_df['grid_point'].n_unique() if 'grid_point' in weather_df.columns else 'N/A'}
|
| 132 |
+
|
| 133 |
+
### ENTSO-E Data
|
| 134 |
+
- **Shape**: {entsoe_df.shape[0]:,} rows × {entsoe_df.shape[1]} columns
|
| 135 |
+
- **Date Range**: {entsoe_df['timestamp'].min()} to {entsoe_df['timestamp'].max()}
|
| 136 |
+
- **Bidding Zones**: {entsoe_df['zone'].n_unique() if 'zone' in entsoe_df.columns else 'N/A'}
|
| 137 |
+
"""
|
| 138 |
+
)
|
| 139 |
+
return
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
@app.cell
|
| 143 |
+
def __(mo, cnecs_df):
|
| 144 |
+
mo.md(
|
| 145 |
+
"""
|
| 146 |
+
## CNEC Data Inspection
|
| 147 |
+
|
| 148 |
+
Examining Critical Network Elements with Contingencies (CNECs) structure:
|
| 149 |
+
"""
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
# Display schema and sample
|
| 153 |
+
mo.ui.table(cnecs_df.head(10).to_pandas())
|
| 154 |
+
return
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
@app.cell
|
| 158 |
+
def __(mo, cnecs_df, alt, pl):
|
| 159 |
+
# Identify top 50 binding CNECs
|
| 160 |
+
if 'cnec_id' in cnecs_df.columns and 'binding' in cnecs_df.columns:
|
| 161 |
+
top_binding_cnecs = (
|
| 162 |
+
cnecs_df
|
| 163 |
+
.group_by('cnec_id')
|
| 164 |
+
.agg(pl.col('binding').sum().alias('binding_count'))
|
| 165 |
+
.sort('binding_count', descending=True)
|
| 166 |
+
.head(50)
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
# Visualize binding frequency
|
| 170 |
+
chart = alt.Chart(top_binding_cnecs.to_pandas()).mark_bar().encode(
|
| 171 |
+
x=alt.X('cnec_id:N', sort='-y', axis=alt.Axis(labelAngle=-45)),
|
| 172 |
+
y='binding_count:Q',
|
| 173 |
+
tooltip=['cnec_id', 'binding_count']
|
| 174 |
+
).properties(
|
| 175 |
+
title='Top 50 Most Frequently Binding CNECs',
|
| 176 |
+
width=800,
|
| 177 |
+
height=400
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
mo.ui.altair_chart(chart)
|
| 181 |
+
else:
|
| 182 |
+
mo.md("⚠️ CNEC binding data not yet available - will be computed after download")
|
| 183 |
+
return top_binding_cnecs, chart
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
@app.cell
|
| 187 |
+
def __(mo, weather_df, alt, pl):
|
| 188 |
+
# Weather pattern visualization
|
| 189 |
+
if 'timestamp' in weather_df.columns and 'windspeed_100m' in weather_df.columns:
|
| 190 |
+
# Sample for visualization (every 6 hours)
|
| 191 |
+
weather_sample = weather_df.filter(pl.col('timestamp').dt.hour() % 6 == 0)
|
| 192 |
+
|
| 193 |
+
chart = alt.Chart(weather_sample.to_pandas()).mark_line().encode(
|
| 194 |
+
x='timestamp:T',
|
| 195 |
+
y='windspeed_100m:Q',
|
| 196 |
+
color='grid_point:N',
|
| 197 |
+
tooltip=['timestamp', 'grid_point', 'windspeed_100m']
|
| 198 |
+
).properties(
|
| 199 |
+
title='Wind Speed Patterns (100m) Across Grid Points',
|
| 200 |
+
width=800,
|
| 201 |
+
height=400
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
mo.ui.altair_chart(chart)
|
| 205 |
+
else:
|
| 206 |
+
mo.md("⚠️ Weather data structure differs from expected - check after download")
|
| 207 |
+
return weather_sample,
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
@app.cell
|
| 211 |
+
def __(mo):
|
| 212 |
+
mo.md(
|
| 213 |
+
"""
|
| 214 |
+
## Data Quality Validation
|
| 215 |
+
|
| 216 |
+
Checking for completeness, missing values, and data integrity:
|
| 217 |
+
"""
|
| 218 |
+
)
|
| 219 |
+
return
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
@app.cell
|
| 223 |
+
def __(mo, cnecs_df, weather_df, entsoe_df, pl):
|
| 224 |
+
# Calculate data completeness
|
| 225 |
+
def check_completeness(df, name):
|
| 226 |
+
total_cells = df.shape[0] * df.shape[1]
|
| 227 |
+
null_cells = df.null_count().sum_horizontal()[0]
|
| 228 |
+
completeness = (1 - null_cells / total_cells) * 100
|
| 229 |
+
|
| 230 |
+
return {
|
| 231 |
+
'Dataset': name,
|
| 232 |
+
'Total Cells': total_cells,
|
| 233 |
+
'Null Cells': null_cells,
|
| 234 |
+
'Completeness %': f"{completeness:.2f}%"
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
completeness_report = [
|
| 238 |
+
check_completeness(cnecs_df, 'CNECs'),
|
| 239 |
+
check_completeness(weather_df, 'Weather'),
|
| 240 |
+
check_completeness(entsoe_df, 'ENTSO-E')
|
| 241 |
+
]
|
| 242 |
+
|
| 243 |
+
mo.ui.table(pl.DataFrame(completeness_report).to_pandas())
|
| 244 |
+
return check_completeness, completeness_report
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
@app.cell
|
| 248 |
+
def __(mo, completeness_report):
|
| 249 |
+
# Validation check
|
| 250 |
+
all_complete = all(
|
| 251 |
+
float(r['Completeness %'].rstrip('%')) >= 95.0
|
| 252 |
+
for r in completeness_report
|
| 253 |
+
)
|
| 254 |
+
|
| 255 |
+
if all_complete:
|
| 256 |
+
mo.md("✅ **All datasets meet >95% completeness threshold**")
|
| 257 |
+
else:
|
| 258 |
+
mo.md("⚠️ **Some datasets below 95% completeness - investigate missing data**")
|
| 259 |
+
return all_complete,
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
@app.cell
|
| 263 |
+
def __(mo):
|
| 264 |
+
mo.md(
|
| 265 |
+
"""
|
| 266 |
+
## Next Steps
|
| 267 |
+
|
| 268 |
+
After data exploration completion:
|
| 269 |
+
|
| 270 |
+
1. **Day 2**: Feature engineering (75-85 features)
|
| 271 |
+
2. **Day 3**: Zero-shot inference with Chronos 2
|
| 272 |
+
3. **Day 4**: Performance evaluation and analysis
|
| 273 |
+
4. **Day 5**: Documentation and handover
|
| 274 |
+
|
| 275 |
+
---
|
| 276 |
+
|
| 277 |
+
**Note**: This notebook will be exported to JupyterLab format (.ipynb) for analyst handover.
|
| 278 |
+
"""
|
| 279 |
+
)
|
| 280 |
+
return
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
if __name__ == "__main__":
|
| 284 |
+
app.run()
|
|
@@ -0,0 +1,32 @@
| 1 |
+
# Core Data & ML
|
| 2 |
+
polars>=0.20.0
|
| 3 |
+
pyarrow>=13.0.0
|
| 4 |
+
numpy>=1.24.0
|
| 5 |
+
scikit-learn>=1.3.0
|
| 6 |
+
|
| 7 |
+
# Time Series Forecasting
|
| 8 |
+
chronos-forecasting>=1.0.0
|
| 9 |
+
transformers>=4.35.0
|
| 10 |
+
torch>=2.0.0
|
| 11 |
+
|
| 12 |
+
# Data Collection
|
| 13 |
+
entsoe-py>=0.5.0
|
| 14 |
+
requests>=2.31.0
|
| 15 |
+
|
| 16 |
+
# HuggingFace Integration (for Datasets, NOT Git LFS)
|
| 17 |
+
datasets>=2.14.0
|
| 18 |
+
huggingface-hub>=0.17.0
|
| 19 |
+
|
| 20 |
+
# Visualization & Notebooks
|
| 21 |
+
altair>=5.0.0
|
| 22 |
+
marimo>=0.9.0
|
| 23 |
+
jupyter>=1.0.0
|
| 24 |
+
ipykernel>=6.25.0
|
| 25 |
+
|
| 26 |
+
# Utilities
|
| 27 |
+
pyyaml>=6.0.0
|
| 28 |
+
python-dotenv>=1.0.0
|
| 29 |
+
tqdm>=4.66.0
|
| 30 |
+
|
| 31 |
+
# HF Space Integration
|
| 32 |
+
gradio>=4.0.0
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,428 @@
| 1 |
+
"""ENTSO-E Transparency Platform Data Collection with Rate Limiting
|
| 2 |
+
|
| 3 |
+
Collects generation, load, and cross-border flow data from ENTSO-E API.
|
| 4 |
+
Implements proper rate limiting to avoid temporary bans.
|
| 5 |
+
|
| 6 |
+
ENTSO-E Rate Limits (OFFICIAL):
|
| 7 |
+
- 60 requests per 60 seconds (hard limit - exceeding triggers 10-min ban)
|
| 8 |
+
- Screen scraping >60 requests/min leads to temporary IP ban
|
| 9 |
+
|
| 10 |
+
Strategy:
|
| 11 |
+
- 27 requests/minute (45% of 60 limit - safe)
|
| 12 |
+
- 1 request every ~2.2 seconds
|
| 13 |
+
- Request data in monthly chunks to minimize API calls
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import polars as pl
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from datetime import datetime, timedelta
|
| 19 |
+
from dotenv import load_dotenv
|
| 20 |
+
import os
|
| 21 |
+
import time
|
| 22 |
+
from typing import List, Tuple
|
| 23 |
+
from tqdm import tqdm
|
| 24 |
+
from entsoe import EntsoePandasClient
|
| 25 |
+
import pandas as pd
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Load environment variables
|
| 29 |
+
load_dotenv()
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# FBMC Bidding Zones (12 zones from project plan)
|
| 33 |
+
BIDDING_ZONES = {
|
| 34 |
+
'AT': 'Austria',
|
| 35 |
+
'BE': 'Belgium',
|
| 36 |
+
'HR': 'Croatia',
|
| 37 |
+
'CZ': 'Czech Republic',
|
| 38 |
+
'FR': 'France',
|
| 39 |
+
'DE_LU': 'Germany-Luxembourg',
|
| 40 |
+
'HU': 'Hungary',
|
| 41 |
+
'NL': 'Netherlands',
|
| 42 |
+
'PL': 'Poland',
|
| 43 |
+
'RO': 'Romania',
|
| 44 |
+
'SK': 'Slovakia',
|
| 45 |
+
'SI': 'Slovenia',
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# FBMC Cross-Border Flows (~20 major borders)
|
| 50 |
+
BORDERS = [
|
| 51 |
+
('DE_LU', 'NL'),
|
| 52 |
+
('DE_LU', 'FR'),
|
| 53 |
+
('DE_LU', 'BE'),
|
| 54 |
+
('DE_LU', 'AT'),
|
| 55 |
+
('DE_LU', 'CZ'),
|
| 56 |
+
('DE_LU', 'PL'),
|
| 57 |
+
('FR', 'BE'),
|
| 58 |
+
('FR', 'ES'), # External but affects FBMC
|
| 59 |
+
('FR', 'CH'), # External but affects FBMC
|
| 60 |
+
('AT', 'CZ'),
|
| 61 |
+
('AT', 'HU'),
|
| 62 |
+
('AT', 'SI'),
|
| 63 |
+
('AT', 'CH'), # External but affects FBMC
|
| 64 |
+
('CZ', 'SK'),
|
| 65 |
+
('CZ', 'PL'),
|
| 66 |
+
('HU', 'SK'),
|
| 67 |
+
('HU', 'RO'),
|
| 68 |
+
('HU', 'HR'),
|
| 69 |
+
('SI', 'HR'),
|
| 70 |
+
('PL', 'SK'),
|
| 71 |
+
('PL', 'CZ'),
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class EntsoECollector:
|
| 76 |
+
"""Collect ENTSO-E data with proper rate limiting."""
|
| 77 |
+
|
| 78 |
+
def __init__(self, requests_per_minute: int = 27):
|
| 79 |
+
"""Initialize collector with rate limiting.
|
| 80 |
+
|
| 81 |
+
Args:
|
| 82 |
+
requests_per_minute: Max requests per minute (default: 27 = 45% of 60 limit)
|
| 83 |
+
"""
|
| 84 |
+
api_key = os.getenv('ENTSOE_API_KEY')
|
| 85 |
+
if not api_key or 'your_entsoe' in api_key.lower():
|
| 86 |
+
raise ValueError("ENTSO-E API key not configured in .env file")
|
| 87 |
+
|
| 88 |
+
self.client = EntsoePandasClient(api_key=api_key)
|
| 89 |
+
self.requests_per_minute = requests_per_minute
|
| 90 |
+
self.delay_seconds = 60.0 / requests_per_minute
|
| 91 |
+
self.request_count = 0
|
| 92 |
+
|
| 93 |
+
print(f"ENTSO-E Collector initialized")
|
| 94 |
+
print(f"Rate limit: {self.requests_per_minute} requests/minute")
|
| 95 |
+
print(f"Delay between requests: {self.delay_seconds:.2f}s")
|
| 96 |
+
|
| 97 |
+
def _rate_limit(self):
|
| 98 |
+
"""Apply rate limiting delay."""
|
| 99 |
+
time.sleep(self.delay_seconds)
|
| 100 |
+
self.request_count += 1
|
| 101 |
+
|
| 102 |
+
def _generate_monthly_chunks(
|
| 103 |
+
self,
|
| 104 |
+
start_date: str,
|
| 105 |
+
end_date: str
|
| 106 |
+
) -> List[Tuple[pd.Timestamp, pd.Timestamp]]:
|
| 107 |
+
"""Generate monthly date chunks for API requests.
|
| 108 |
+
|
| 109 |
+
Args:
|
| 110 |
+
start_date: Start date (YYYY-MM-DD)
|
| 111 |
+
end_date: End date (YYYY-MM-DD)
|
| 112 |
+
|
| 113 |
+
Returns:
|
| 114 |
+
List of (start, end) timestamp tuples
|
| 115 |
+
"""
|
| 116 |
+
start_dt = pd.Timestamp(start_date, tz='UTC')
|
| 117 |
+
end_dt = pd.Timestamp(end_date, tz='UTC')
|
| 118 |
+
|
| 119 |
+
chunks = []
|
| 120 |
+
current = start_dt
|
| 121 |
+
|
| 122 |
+
while current < end_dt:
|
| 123 |
+
# Get end of month or end_date, whichever is earlier
|
| 124 |
+
month_end = (current + pd.offsets.MonthEnd(0))
|
| 125 |
+
chunk_end = min(month_end, end_dt)
|
| 126 |
+
|
| 127 |
+
chunks.append((current, chunk_end))
|
| 128 |
+
current = chunk_end + pd.Timedelta(hours=1)
|
| 129 |
+
|
| 130 |
+
return chunks
|
| 131 |
+
|
| 132 |
+
def collect_generation_per_type(
|
| 133 |
+
self,
|
| 134 |
+
zone: str,
|
| 135 |
+
start_date: str,
|
| 136 |
+
end_date: str
|
| 137 |
+
) -> pl.DataFrame:
|
| 138 |
+
"""Collect generation by production type for a bidding zone.
|
| 139 |
+
|
| 140 |
+
Args:
|
| 141 |
+
zone: Bidding zone code (e.g., 'DE_LU', 'FR')
|
| 142 |
+
start_date: Start date (YYYY-MM-DD)
|
| 143 |
+
end_date: End date (YYYY-MM-DD)
|
| 144 |
+
|
| 145 |
+
Returns:
|
| 146 |
+
Polars DataFrame with generation data
|
| 147 |
+
"""
|
| 148 |
+
chunks = self._generate_monthly_chunks(start_date, end_date)
|
| 149 |
+
all_data = []
|
| 150 |
+
|
| 151 |
+
for start_chunk, end_chunk in tqdm(chunks, desc=f" {zone} generation", leave=False):
|
| 152 |
+
try:
|
| 153 |
+
# Fetch generation data
|
| 154 |
+
df = self.client.query_generation(
|
| 155 |
+
zone,
|
| 156 |
+
start=start_chunk,
|
| 157 |
+
end=end_chunk,
|
| 158 |
+
psr_type=None # Get all production types
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
if df is not None and not df.empty:
|
| 162 |
+
# Convert to long format
|
| 163 |
+
df_reset = df.reset_index()
|
| 164 |
+
df_melted = df_reset.melt(
|
| 165 |
+
id_vars=['index'],
|
| 166 |
+
var_name='production_type',
|
| 167 |
+
value_name='generation_mw'
|
| 168 |
+
)
|
| 169 |
+
df_melted = df_melted.rename(columns={'index': 'timestamp'})
|
| 170 |
+
df_melted['zone'] = zone
|
| 171 |
+
|
| 172 |
+
# Convert to Polars
|
| 173 |
+
pl_df = pl.from_pandas(df_melted)
|
| 174 |
+
all_data.append(pl_df)
|
| 175 |
+
|
| 176 |
+
self._rate_limit()
|
| 177 |
+
|
| 178 |
+
except Exception as e:
|
| 179 |
+
print(f" ❌ Failed {zone} {start_chunk.date()} to {end_chunk.date()}: {e}")
|
| 180 |
+
self._rate_limit()
|
| 181 |
+
continue
|
| 182 |
+
|
| 183 |
+
if all_data:
|
| 184 |
+
return pl.concat(all_data)
|
| 185 |
+
else:
|
| 186 |
+
return pl.DataFrame()
|
| 187 |
+
|
| 188 |
+
def collect_load(
|
| 189 |
+
self,
|
| 190 |
+
zone: str,
|
| 191 |
+
start_date: str,
|
| 192 |
+
end_date: str
|
| 193 |
+
) -> pl.DataFrame:
|
| 194 |
+
"""Collect load (demand) data for a bidding zone.
|
| 195 |
+
|
| 196 |
+
Args:
|
| 197 |
+
zone: Bidding zone code
|
| 198 |
+
start_date: Start date (YYYY-MM-DD)
|
| 199 |
+
end_date: End date (YYYY-MM-DD)
|
| 200 |
+
|
| 201 |
+
Returns:
|
| 202 |
+
Polars DataFrame with load data
|
| 203 |
+
"""
|
| 204 |
+
chunks = self._generate_monthly_chunks(start_date, end_date)
|
| 205 |
+
all_data = []
|
| 206 |
+
|
| 207 |
+
for start_chunk, end_chunk in tqdm(chunks, desc=f" {zone} load", leave=False):
|
| 208 |
+
try:
|
| 209 |
+
# Fetch load data
|
| 210 |
+
series = self.client.query_load(
|
| 211 |
+
zone,
|
| 212 |
+
start=start_chunk,
|
| 213 |
+
end=end_chunk
|
| 214 |
+
)
|
| 215 |
+
|
| 216 |
+
if series is not None and not series.empty:
|
| 217 |
+
df = pd.DataFrame({
|
| 218 |
+
'timestamp': series.index,
|
| 219 |
+
'load_mw': series.values,
|
| 220 |
+
'zone': zone
|
| 221 |
+
})
|
| 222 |
+
|
| 223 |
+
pl_df = pl.from_pandas(df)
|
| 224 |
+
all_data.append(pl_df)
|
| 225 |
+
|
| 226 |
+
self._rate_limit()
|
| 227 |
+
|
| 228 |
+
except Exception as e:
|
| 229 |
+
print(f" ❌ Failed {zone} {start_chunk.date()} to {end_chunk.date()}: {e}")
|
| 230 |
+
self._rate_limit()
|
| 231 |
+
continue
|
| 232 |
+
|
| 233 |
+
if all_data:
|
| 234 |
+
return pl.concat(all_data)
|
| 235 |
+
else:
|
| 236 |
+
return pl.DataFrame()
|
| 237 |
+
|
| 238 |
+
def collect_cross_border_flows(
|
| 239 |
+
self,
|
| 240 |
+
from_zone: str,
|
| 241 |
+
to_zone: str,
|
| 242 |
+
start_date: str,
|
| 243 |
+
end_date: str
|
| 244 |
+
) -> pl.DataFrame:
|
| 245 |
+
"""Collect cross-border flow data between two zones.
|
| 246 |
+
|
| 247 |
+
Args:
|
| 248 |
+
from_zone: From bidding zone
|
| 249 |
+
to_zone: To bidding zone
|
| 250 |
+
start_date: Start date (YYYY-MM-DD)
|
| 251 |
+
end_date: End date (YYYY-MM-DD)
|
| 252 |
+
|
| 253 |
+
Returns:
|
| 254 |
+
Polars DataFrame with flow data
|
| 255 |
+
"""
|
| 256 |
+
chunks = self._generate_monthly_chunks(start_date, end_date)
|
| 257 |
+
all_data = []
|
| 258 |
+
|
| 259 |
+
border_id = f"{from_zone}_{to_zone}"
|
| 260 |
+
|
| 261 |
+
for start_chunk, end_chunk in tqdm(chunks, desc=f" {border_id}", leave=False):
|
| 262 |
+
try:
|
| 263 |
+
# Fetch cross-border flow
|
| 264 |
+
series = self.client.query_crossborder_flows(
|
| 265 |
+
from_zone,
|
| 266 |
+
to_zone,
|
| 267 |
+
start=start_chunk,
|
| 268 |
+
end=end_chunk
|
| 269 |
+
)
|
| 270 |
+
|
| 271 |
+
if series is not None and not series.empty:
|
| 272 |
+
df = pd.DataFrame({
|
| 273 |
+
'timestamp': series.index,
|
| 274 |
+
'flow_mw': series.values,
|
| 275 |
+
'from_zone': from_zone,
|
| 276 |
+
'to_zone': to_zone,
|
| 277 |
+
'border': border_id
|
| 278 |
+
})
|
| 279 |
+
|
| 280 |
+
pl_df = pl.from_pandas(df)
|
| 281 |
+
all_data.append(pl_df)
|
| 282 |
+
|
| 283 |
+
self._rate_limit()
|
| 284 |
+
|
| 285 |
+
except Exception as e:
|
| 286 |
+
print(f" ❌ Failed {border_id} {start_chunk.date()} to {end_chunk.date()}: {e}")
|
| 287 |
+
self._rate_limit()
|
| 288 |
+
continue
|
| 289 |
+
|
| 290 |
+
if all_data:
|
| 291 |
+
return pl.concat(all_data)
|
| 292 |
+
else:
|
| 293 |
+
return pl.DataFrame()
|
| 294 |
+
|
| 295 |
+
def collect_all(
|
| 296 |
+
self,
|
| 297 |
+
start_date: str,
|
| 298 |
+
end_date: str,
|
| 299 |
+
output_dir: Path
|
| 300 |
+
) -> dict:
|
| 301 |
+
"""Collect all ENTSO-E data with rate limiting.
|
| 302 |
+
|
| 303 |
+
Args:
|
| 304 |
+
start_date: Start date (YYYY-MM-DD)
|
| 305 |
+
end_date: End date (YYYY-MM-DD)
|
| 306 |
+
output_dir: Directory to save Parquet files
|
| 307 |
+
|
| 308 |
+
Returns:
|
| 309 |
+
Dictionary with paths to saved files
|
| 310 |
+
"""
|
| 311 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 312 |
+
|
| 313 |
+
# Calculate total requests
|
| 314 |
+
months = len(self._generate_monthly_chunks(start_date, end_date))
|
| 315 |
+
total_requests = (
|
| 316 |
+
len(BIDDING_ZONES) * months * 2 + # Generation + load
|
| 317 |
+
len(BORDERS) * months # Flows
|
| 318 |
+
)
|
| 319 |
+
estimated_minutes = total_requests / self.requests_per_minute
|
| 320 |
+
|
| 321 |
+
print("=" * 70)
|
| 322 |
+
print("ENTSO-E Data Collection")
|
| 323 |
+
print("=" * 70)
|
| 324 |
+
print(f"Date range: {start_date} to {end_date}")
|
| 325 |
+
print(f"Bidding zones: {len(BIDDING_ZONES)}")
|
| 326 |
+
print(f"Cross-border flows: {len(BORDERS)}")
|
| 327 |
+
print(f"Monthly chunks: {months}")
|
| 328 |
+
print(f"Total requests: ~{total_requests}")
|
| 329 |
+
print(f"Rate limit: {self.requests_per_minute} requests/minute (45% of 60 max)")
|
| 330 |
+
print(f"Estimated time: {estimated_minutes:.1f} minutes")
|
| 331 |
+
print()
|
| 332 |
+
|
| 333 |
+
results = {}
|
| 334 |
+
|
| 335 |
+
# 1. Collect Generation Data
|
| 336 |
+
print("[1/3] Collecting generation data by production type...")
|
| 337 |
+
generation_data = []
|
| 338 |
+
for zone in tqdm(BIDDING_ZONES.keys(), desc="Generation"):
|
| 339 |
+
df = self.collect_generation_per_type(zone, start_date, end_date)
|
| 340 |
+
if not df.is_empty():
|
| 341 |
+
generation_data.append(df)
|
| 342 |
+
|
| 343 |
+
if generation_data:
|
| 344 |
+
generation_df = pl.concat(generation_data)
|
| 345 |
+
gen_path = output_dir / "entsoe_generation_2024_2025.parquet"
|
| 346 |
+
generation_df.write_parquet(gen_path)
|
| 347 |
+
results['generation'] = gen_path
|
| 348 |
+
print(f"✅ Generation: {generation_df.shape[0]:,} records → {gen_path}")
|
| 349 |
+
|
| 350 |
+
# 2. Collect Load Data
|
| 351 |
+
print("\n[2/3] Collecting load (demand) data...")
|
| 352 |
+
load_data = []
|
| 353 |
+
for zone in tqdm(BIDDING_ZONES.keys(), desc="Load"):
|
| 354 |
+
df = self.collect_load(zone, start_date, end_date)
|
| 355 |
+
if not df.is_empty():
|
| 356 |
+
load_data.append(df)
|
| 357 |
+
|
| 358 |
+
if load_data:
|
| 359 |
+
load_df = pl.concat(load_data)
|
| 360 |
+
load_path = output_dir / "entsoe_load_2024_2025.parquet"
|
| 361 |
+
load_df.write_parquet(load_path)
|
| 362 |
+
results['load'] = load_path
|
| 363 |
+
print(f"✅ Load: {load_df.shape[0]:,} records → {load_path}")
|
| 364 |
+
|
| 365 |
+
# 3. Collect Cross-Border Flows
|
| 366 |
+
print("\n[3/3] Collecting cross-border flows...")
|
| 367 |
+
flow_data = []
|
| 368 |
+
for from_zone, to_zone in tqdm(BORDERS, desc="Flows"):
|
| 369 |
+
df = self.collect_cross_border_flows(from_zone, to_zone, start_date, end_date)
|
| 370 |
+
if not df.is_empty():
|
| 371 |
+
flow_data.append(df)
|
| 372 |
+
|
| 373 |
+
if flow_data:
|
| 374 |
+
flow_df = pl.concat(flow_data)
|
| 375 |
+
flow_path = output_dir / "entsoe_flows_2024_2025.parquet"
|
| 376 |
+
flow_df.write_parquet(flow_path)
|
| 377 |
+
results['flows'] = flow_path
|
| 378 |
+
print(f"✅ Flows: {flow_df.shape[0]:,} records → {flow_path}")
|
| 379 |
+
|
| 380 |
+
print()
|
| 381 |
+
print("=" * 70)
|
| 382 |
+
print("ENTSO-E Collection Complete")
|
| 383 |
+
print("=" * 70)
|
| 384 |
+
print(f"Total API requests made: {self.request_count}")
|
| 385 |
+
print(f"Files created: {len(results)}")
|
| 386 |
+
for data_type, path in results.items():
|
| 387 |
+
file_size = path.stat().st_size / (1024**2)
|
| 388 |
+
print(f" - {data_type}: {file_size:.1f} MB")
|
| 389 |
+
|
| 390 |
+
return results
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
if __name__ == "__main__":
|
| 394 |
+
import argparse
|
| 395 |
+
|
| 396 |
+
parser = argparse.ArgumentParser(description="Collect ENTSO-E data with proper rate limiting")
|
| 397 |
+
parser.add_argument(
|
| 398 |
+
'--start-date',
|
| 399 |
+
default='2024-10-01',
|
| 400 |
+
help='Start date (YYYY-MM-DD)'
|
| 401 |
+
)
|
| 402 |
+
parser.add_argument(
|
| 403 |
+
'--end-date',
|
| 404 |
+
default='2025-09-30',
|
| 405 |
+
help='End date (YYYY-MM-DD)'
|
| 406 |
+
)
|
| 407 |
+
parser.add_argument(
|
| 408 |
+
'--output-dir',
|
| 409 |
+
type=Path,
|
| 410 |
+
default=Path('data/raw'),
|
| 411 |
+
help='Output directory for Parquet files'
|
| 412 |
+
)
|
| 413 |
+
parser.add_argument(
|
| 414 |
+
'--requests-per-minute',
|
| 415 |
+
type=int,
|
| 416 |
+
default=27,
|
| 417 |
+
help='Requests per minute (default: 27 = 45%% of 60 limit)'
|
| 418 |
+
)
|
| 419 |
+
|
| 420 |
+
args = parser.parse_args()
|
| 421 |
+
|
| 422 |
+
# Initialize collector and run
|
| 423 |
+
collector = EntsoECollector(requests_per_minute=args.requests_per_minute)
|
| 424 |
+
collector.collect_all(
|
| 425 |
+
start_date=args.start_date,
|
| 426 |
+
end_date=args.end_date,
|
| 427 |
+
output_dir=args.output_dir
|
| 428 |
+
)
|
|
@@ -0,0 +1,268 @@
"""JAO FBMC Data Collection using JAOPuTo Tool

Wrapper script for downloading FBMC data using the JAOPuTo Java tool.
Requires Java 11+ to be installed.

JAOPuTo Tool:
- Download: https://publicationtool.jao.eu/core/
- Save JAOPuTo.jar to tools/ directory
- No explicit rate limits documented (reasonable use expected)

Data Types:
- CNECs (Critical Network Elements with Contingencies)
- PTDFs (Power Transfer Distribution Factors)
- RAMs (Remaining Available Margins)
- Shadow prices
- Final computation results
"""

import subprocess
from pathlib import Path
from datetime import datetime
import polars as pl
from typing import Optional
import os


class JAOCollector:
    """Collect FBMC data using JAOPuTo tool."""

    def __init__(self, jaoputo_jar: Path = Path("tools/JAOPuTo.jar")):
        """Initialize JAO collector.

        Args:
            jaoputo_jar: Path to JAOPuTo.jar file
        """
        self.jaoputo_jar = jaoputo_jar

        if not self.jaoputo_jar.exists():
            raise FileNotFoundError(
                f"JAOPuTo.jar not found at {jaoputo_jar}\n"
                f"Download from: https://publicationtool.jao.eu/core/\n"
                f"Save to: tools/JAOPuTo.jar"
            )

        # Check Java installation
        try:
            result = subprocess.run(
                ['java', '-version'],
                capture_output=True,
                text=True
            )
            java_version = result.stderr.split('\n')[0]
            print(f"✅ Java installed: {java_version}")
        except FileNotFoundError:
            raise EnvironmentError(
                "Java not found. Install Java 11+ from https://adoptium.net/temurin/releases/"
            )

    def download_fbmc_data(
        self,
        start_date: str,
        end_date: str,
        output_dir: Path,
        data_types: Optional[list] = None
    ) -> dict:
        """Download FBMC data using JAOPuTo tool.

        Args:
            start_date: Start date (YYYY-MM-DD)
            end_date: End date (YYYY-MM-DD)
            output_dir: Directory to save downloaded files
            data_types: List of data types to download (default: all)

        Returns:
            Dictionary with paths to downloaded files
        """
        if data_types is None:
            data_types = [
                'CNEC',
                'PTDF',
                'RAM',
                'ShadowPrice',
                'FinalComputation'
            ]

        output_dir.mkdir(parents=True, exist_ok=True)

        print("=" * 70)
        print("JAO FBMC Data Collection")
        print("=" * 70)
        print(f"Date range: {start_date} to {end_date}")
        print(f"Data types: {', '.join(data_types)}")
        print(f"Output directory: {output_dir}")
        print(f"JAOPuTo tool: {self.jaoputo_jar}")
        print()

        results = {}

        for data_type in data_types:
            print(f"[{data_type}] Downloading...")

            output_file = output_dir / f"jao_{data_type.lower()}_{start_date}_{end_date}.csv"

            # Build JAOPuTo command
            # Note: Actual command structure needs to be verified with JAOPuTo documentation
            cmd = [
                'java',
                '-jar',
                str(self.jaoputo_jar),
                '--start-date', start_date,
                '--end-date', end_date,
                '--data-type', data_type,
                '--output', str(output_file),
                '--format', 'csv',
                '--region', 'CORE'  # Core FBMC region
            ]

            try:
                result = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=600  # 10 minute timeout
                )

                if result.returncode == 0:
                    if output_file.exists():
                        file_size = output_file.stat().st_size / (1024**2)
                        print(f"✅ {data_type}: {file_size:.1f} MB → {output_file}")
                        results[data_type] = output_file
                    else:
                        print(f"⚠️ {data_type}: Command succeeded but file not created")
                else:
                    print(f"❌ {data_type}: Failed")
                    print(f"   Error: {result.stderr}")

            except subprocess.TimeoutExpired:
                print(f"❌ {data_type}: Timeout (>10 minutes)")
            except Exception as e:
                print(f"❌ {data_type}: {e}")

        # Convert CSV files to Parquet for efficiency
        print("\n[Conversion] Converting CSV to Parquet...")
        for data_type, csv_path in results.items():
            try:
                parquet_path = csv_path.with_suffix('.parquet')

                # Read CSV and save as Parquet
                df = pl.read_csv(csv_path)
                df.write_parquet(parquet_path)

                # Update results to point to Parquet
                results[data_type] = parquet_path

                # Optionally delete CSV to save space
                # csv_path.unlink()

                parquet_size = parquet_path.stat().st_size / (1024**2)
                print(f"✅ {data_type}: Converted to Parquet ({parquet_size:.1f} MB)")

            except Exception as e:
                print(f"⚠️ {data_type}: Conversion failed - {e}")

        print()
        print("=" * 70)
        print("JAO Collection Complete")
        print("=" * 70)
        print(f"Files downloaded: {len(results)}")
        for data_type, path in results.items():
            print(f"  - {data_type}: {path.name}")

        return results


def download_jao_manual_instructions():
    """Print manual download instructions if JAOPuTo doesn't work."""
    print("""
╔══════════════════════════════════════════════════════════════════════════╗
║                 JAO DATA MANUAL DOWNLOAD INSTRUCTIONS                      ║
╚══════════════════════════════════════════════════════════════════════════╝

If JAOPuTo tool doesn't work, download data manually:

1. Visit: https://publicationtool.jao.eu/core/

2. Navigate to:
   - FBMC Domain
   - Core region
   - Date range: Oct 2024 - Sept 2025

3. Download the following data types:
   ✓ CNECs (Critical Network Elements with Contingencies)
   ✓ PTDFs (Power Transfer Distribution Factors)
   ✓ RAMs (Remaining Available Margins)
   ✓ Shadow Prices
   ✓ Final Computation Results

4. Save files to: data/raw/

5. Recommended format: CSV or Excel (we'll convert to Parquet)

6. File naming convention:
   - jao_cnec_2024-10_2025-09.csv
   - jao_ptdf_2024-10_2025-09.csv
   - jao_ram_2024-10_2025-09.csv
   - etc.

7. Convert to Parquet:
   python src/data_collection/convert_jao_to_parquet.py

════════════════════════════════════════════════════════════════════════════

Alternative: Contact JAO Support
- Email: [email protected]
- Request: Bulk data download for research purposes
- Specify: Core FBMC region, Oct 2024 - Sept 2025

════════════════════════════════════════════════════════════════════════════
""")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Collect JAO FBMC data using JAOPuTo tool")
    parser.add_argument(
        '--start-date',
        default='2024-10-01',
        help='Start date (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--end-date',
        default='2025-09-30',
        help='End date (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--output-dir',
        type=Path,
        default=Path('data/raw'),
        help='Output directory for files'
    )
    parser.add_argument(
        '--jaoputo-jar',
        type=Path,
        default=Path('tools/JAOPuTo.jar'),
        help='Path to JAOPuTo.jar file'
    )
    parser.add_argument(
        '--manual-instructions',
        action='store_true',
        help='Print manual download instructions and exit'
    )

    args = parser.parse_args()

    if args.manual_instructions:
        download_jao_manual_instructions()
    else:
        try:
            collector = JAOCollector(jaoputo_jar=args.jaoputo_jar)
            collector.download_fbmc_data(
                start_date=args.start_date,
                end_date=args.end_date,
                output_dir=args.output_dir
            )
        except (FileNotFoundError, EnvironmentError) as e:
            print(f"\n❌ Error: {e}\n")
            download_jao_manual_instructions()

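Step 7 of the manual fallback references a convert_jao_to_parquet.py helper that is not included in this commit. A minimal sketch of what such a converter could look like is given below; the glob pattern follows the naming convention from step 6, and the rest is an assumption rather than the actual script.

    # Hypothetical sketch of src/data_collection/convert_jao_to_parquet.py (not in this commit).
    from pathlib import Path
    import polars as pl

    raw_dir = Path("data/raw")
    for csv_path in sorted(raw_dir.glob("jao_*.csv")):   # naming convention from step 6
        parquet_path = csv_path.with_suffix(".parquet")
        df = pl.read_csv(csv_path)
        df.write_parquet(parquet_path)
        print(f"Converted {csv_path.name} -> {parquet_path.name} ({df.shape[0]:,} rows)")
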
@@ -0,0 +1,421 @@ src/data_collection/collect_openmeteo.py
"""OpenMeteo Weather Data Collection with Proper Rate Limiting

Collects historical weather data from OpenMeteo API for 52 strategic grid points.
Implements proper rate limiting based on actual OpenMeteo free tier limits.

OpenMeteo Free Tier Limits (ACTUAL):
- 600 calls/minute
- 5,000 calls/hour
- 10,000 calls/day
- 300,000 calls/month

Request Counting:
- Base request (≤10 variables, ≤2 weeks) = 1.0 API call
- >10 variables OR >2 weeks = Multiple calls (fractional)
- Example: 4 weeks = 3.0 API calls, 8 weeks = 7.0 API calls

Strategy:
- Request data in 2-week chunks (stays at 1.0 API call per request)
- 7 weather parameters (under 10 limit)
- 270 requests/minute (45% of 600 limit - safe but efficient)
- ~5 minutes total for 12 months × 52 locations
"""

import requests
import polars as pl
from pathlib import Path
from datetime import datetime, timedelta
from dotenv import load_dotenv
import os
import time
from typing import List, Dict, Tuple
from tqdm import tqdm


# Load environment variables
load_dotenv()


# 52 Strategic Grid Points (from project plan)
GRID_POINTS = {
    # Germany (6 points)
    "DE_North_Sea": {"lat": 54.5, "lon": 7.0, "name": "Offshore North Sea"},
    "DE_Hamburg": {"lat": 53.5, "lon": 10.0, "name": "Hamburg/Schleswig-Holstein"},
    "DE_Berlin": {"lat": 52.5, "lon": 13.5, "name": "Berlin/Brandenburg"},
    "DE_Frankfurt": {"lat": 50.1, "lon": 8.7, "name": "Frankfurt"},
    "DE_Munich": {"lat": 48.1, "lon": 11.6, "name": "Munich/Bavaria"},
    "DE_Baltic": {"lat": 54.5, "lon": 13.0, "name": "Offshore Baltic"},

    # France (5 points)
    "FR_Dunkirk": {"lat": 51.0, "lon": 2.3, "name": "Dunkirk/Lille"},
    "FR_Paris": {"lat": 48.9, "lon": 2.3, "name": "Paris"},
    "FR_Lyon": {"lat": 45.8, "lon": 4.8, "name": "Lyon"},
    "FR_Marseille": {"lat": 43.3, "lon": 5.4, "name": "Marseille"},
    "FR_Strasbourg": {"lat": 48.6, "lon": 7.8, "name": "Strasbourg"},

    # Netherlands (4 points)
    "NL_Offshore": {"lat": 53.5, "lon": 4.5, "name": "Offshore North"},
    "NL_Amsterdam": {"lat": 52.4, "lon": 4.9, "name": "Amsterdam"},
    "NL_Rotterdam": {"lat": 51.9, "lon": 4.5, "name": "Rotterdam"},
    "NL_Groningen": {"lat": 53.2, "lon": 6.6, "name": "Groningen"},

    # Austria (3 points)
    "AT_Kaprun": {"lat": 47.26, "lon": 12.74, "name": "Kaprun"},
    "AT_St_Peter": {"lat": 48.26, "lon": 13.08, "name": "St. Peter"},
    "AT_Vienna": {"lat": 48.15, "lon": 16.45, "name": "Vienna"},

    # Belgium (3 points)
    "BE_Offshore": {"lat": 51.5, "lon": 2.8, "name": "Belgian Offshore"},
    "BE_Doel": {"lat": 51.32, "lon": 4.26, "name": "Doel"},
    "BE_Avelgem": {"lat": 50.78, "lon": 3.45, "name": "Avelgem"},

    # Czech Republic (3 points)
    "CZ_Hradec": {"lat": 50.70, "lon": 13.80, "name": "Hradec-RPST"},
    "CZ_Bohemia": {"lat": 50.50, "lon": 13.60, "name": "Northwest Bohemia"},
    "CZ_Temelin": {"lat": 49.18, "lon": 14.37, "name": "Temelin"},

    # Poland (4 points)
    "PL_Baltic": {"lat": 54.8, "lon": 17.5, "name": "Baltic Offshore"},
    "PL_SHVDC": {"lat": 54.5, "lon": 17.0, "name": "SwePol Link"},
    "PL_Belchatow": {"lat": 51.27, "lon": 19.32, "name": "Belchatow"},
    "PL_Mikulowa": {"lat": 51.5, "lon": 15.2, "name": "Mikulowa PST"},

    # Hungary (3 points)
    "HU_Paks": {"lat": 46.57, "lon": 18.86, "name": "Paks Nuclear"},
    "HU_Bekescsaba": {"lat": 46.68, "lon": 21.09, "name": "Bekescsaba"},
    "HU_Gyor": {"lat": 47.68, "lon": 17.63, "name": "Gyor"},

    # Romania (3 points)
    "RO_Fantanele": {"lat": 44.59, "lon": 28.57, "name": "Fantanele-Cogealac"},
    "RO_Iron_Gates": {"lat": 44.67, "lon": 22.53, "name": "Iron Gates"},
    "RO_Cernavoda": {"lat": 44.32, "lon": 28.03, "name": "Cernavoda"},

    # Slovakia (3 points)
    "SK_Bohunice": {"lat": 48.49, "lon": 17.68, "name": "Bohunice/Mochovce"},
    "SK_Gabcikovo": {"lat": 47.88, "lon": 17.54, "name": "Gabcikovo"},
    "SK_Rimavska": {"lat": 48.38, "lon": 20.00, "name": "Rimavska Sobota"},

    # Slovenia (2 points)
    "SI_Krsko": {"lat": 45.94, "lon": 15.52, "name": "Krsko Nuclear"},
    "SI_Divaca": {"lat": 45.68, "lon": 13.97, "name": "Divaca"},

    # Croatia (2 points)
    "HR_Ernestinovo": {"lat": 45.47, "lon": 18.66, "name": "Ernestinovo"},
    "HR_Zagreb": {"lat": 45.88, "lon": 16.12, "name": "Zagreb"},

    # Luxembourg (2 points)
    "LU_Trier": {"lat": 49.75, "lon": 6.63, "name": "Trier/Aach"},
    "LU_Bauler": {"lat": 49.92, "lon": 6.20, "name": "Bauler"},

    # External regions (8 points)
    "CH_Central": {"lat": 46.85, "lon": 9.0, "name": "Switzerland Central"},
    "UK_Southeast": {"lat": 51.5, "lon": 0.0, "name": "UK Southeast"},
    "ES_North": {"lat": 43.3, "lon": -3.0, "name": "Spain North"},
    "IT_North": {"lat": 45.5, "lon": 9.2, "name": "Italy North"},
    "NO_South": {"lat": 59.0, "lon": 5.7, "name": "Norway South"},
    "SE_South": {"lat": 56.0, "lon": 13.0, "name": "Sweden South"},
    "DK_West": {"lat": 56.0, "lon": 9.0, "name": "Denmark West"},
    "DK_East": {"lat": 55.7, "lon": 12.6, "name": "Denmark East"},
}


# Weather parameters to collect (7 params - under 10 limit)
WEATHER_PARAMS = [
    'temperature_2m',
    'windspeed_10m',
    'windspeed_100m',
    'winddirection_100m',
    'shortwave_radiation',
    'cloudcover',
    'surface_pressure',
]


class OpenMeteoCollector:
    """Collect weather data from OpenMeteo API with proper rate limiting."""

    def __init__(
        self,
        requests_per_minute: int = 270,
        chunk_days: int = 14
    ):
        """Initialize collector with rate limiting.

        Args:
            requests_per_minute: Max HTTP requests per minute (default: 270 = 45% of 600 limit)
            chunk_days: Days per request chunk (default: 14 = 1.0 API call)
        """
        self.base_url = os.getenv('OPENMETEO_BASE_URL', 'https://api.open-meteo.com/v1/forecast')

        # OpenMeteo historical data endpoint (free tier)
        self.historical_url = 'https://archive-api.open-meteo.com/v1/archive'

        self.requests_per_minute = requests_per_minute
        self.chunk_days = chunk_days
        self.delay_seconds = 60.0 / requests_per_minute  # Delay between requests
        self.session = requests.Session()

        self.total_api_calls = 0  # Track actual API call count

    def _generate_date_chunks(
        self,
        start_date: str,
        end_date: str
    ) -> List[Tuple[str, str]]:
        """Generate date range chunks of specified size.

        Args:
            start_date: Start date (YYYY-MM-DD)
            end_date: End date (YYYY-MM-DD)

        Returns:
            List of (start, end) date tuples
        """
        start_dt = datetime.fromisoformat(start_date)
        end_dt = datetime.fromisoformat(end_date)

        chunks = []
        current = start_dt

        while current < end_dt:
            chunk_end = min(current + timedelta(days=self.chunk_days - 1), end_dt)
            chunks.append((
                current.strftime('%Y-%m-%d'),
                chunk_end.strftime('%Y-%m-%d')
            ))
            current = chunk_end + timedelta(days=1)

        return chunks

    def _calculate_api_calls(self, start_date: str, end_date: str) -> float:
        """Calculate how many API calls this request will consume.

        Args:
            start_date: Start date (YYYY-MM-DD)
            end_date: End date (YYYY-MM-DD)

        Returns:
            Number of API calls (fractional)
        """
        start_dt = datetime.fromisoformat(start_date)
        end_dt = datetime.fromisoformat(end_date)
        days = (end_dt - start_dt).days + 1

        # OpenMeteo counting: ≤14 days = 1.0 call
        # >14 days scales fractionally
        if days <= 14:
            return 1.0
        else:
            return days / 14.0

    def fetch_location_chunk(
        self,
        location_id: str,
        location_data: Dict,
        start_date: str,
        end_date: str
    ) -> pl.DataFrame:
        """Fetch weather data for a single location and date chunk.

        Args:
            location_id: Location identifier (e.g., 'DE_Hamburg')
            location_data: Dict with 'lat', 'lon', 'name'
            start_date: Start date (YYYY-MM-DD)
            end_date: End date (YYYY-MM-DD)

        Returns:
            Polars DataFrame with weather data
        """
        params = {
            'latitude': location_data['lat'],
            'longitude': location_data['lon'],
            'hourly': ','.join(WEATHER_PARAMS),
            'start_date': start_date,
            'end_date': end_date,
            'timezone': 'UTC'
        }

        # Calculate API call cost
        api_calls = self._calculate_api_calls(start_date, end_date)
        self.total_api_calls += api_calls

        try:
            response = self.session.get(
                self.historical_url,
                params=params,
                timeout=30
            )
            response.raise_for_status()
            data = response.json()

            # Parse hourly data
            hourly = data.get('hourly', {})
            timestamps = hourly.get('time', [])

            if not timestamps:
                return pl.DataFrame()

            # Build dataframe
            df_data = {
                'timestamp': timestamps,
                'grid_point': [location_id] * len(timestamps),
                'location_name': [location_data['name']] * len(timestamps),
                'latitude': [location_data['lat']] * len(timestamps),
                'longitude': [location_data['lon']] * len(timestamps),
            }

            # Add weather parameters
            for param in WEATHER_PARAMS:
                df_data[param] = hourly.get(param, [None] * len(timestamps))

            df = pl.DataFrame(df_data)

            # Convert timestamp to datetime
            df = df.with_columns(
                pl.col('timestamp').str.strptime(pl.Datetime, format='%Y-%m-%dT%H:%M')
            )

            return df

        except requests.exceptions.RequestException as e:
            print(f"❌ Failed {location_id} ({start_date} to {end_date}): {e}")
            return pl.DataFrame()

    def collect_all(
        self,
        start_date: str,
        end_date: str,
        output_path: Path
    ) -> pl.DataFrame:
        """Collect weather data for all 52 grid points with rate limiting.

        Args:
            start_date: Start date (YYYY-MM-DD)
            end_date: End date (YYYY-MM-DD)
            output_path: Path to save Parquet file

        Returns:
            Combined Polars DataFrame
        """
        # Generate date chunks
        date_chunks = self._generate_date_chunks(start_date, end_date)
        total_requests = len(GRID_POINTS) * len(date_chunks)
        estimated_minutes = total_requests / self.requests_per_minute

        print("=" * 70)
        print("OpenMeteo Weather Data Collection")
        print("=" * 70)
        print(f"Date range: {start_date} to {end_date}")
        print(f"Grid points: {len(GRID_POINTS)}")
        print(f"Date chunks: {len(date_chunks)} ({self.chunk_days}-day periods)")
        print(f"Total HTTP requests: {total_requests}")
        print(f"Rate limit: {self.requests_per_minute} requests/minute (45% of 600 max)")
        print(f"Estimated time: {estimated_minutes:.1f} minutes")
        print(f"Delay between requests: {self.delay_seconds:.2f}s")
        print()

        all_data = []
        request_count = 0

        # Iterate through all locations and date chunks
        with tqdm(total=total_requests, desc="Fetching weather data") as pbar:
            for location_id, location_data in GRID_POINTS.items():
                location_chunks = []

                for start_chunk, end_chunk in date_chunks:
                    # Fetch this chunk
                    df = self.fetch_location_chunk(
                        location_id,
                        location_data,
                        start_chunk,
                        end_chunk
                    )

                    if not df.is_empty():
                        location_chunks.append(df)

                    request_count += 1
                    pbar.update(1)

                    # Rate limiting - wait before next request
                    time.sleep(self.delay_seconds)

                # Combine all chunks for this location
                if location_chunks:
                    location_df = pl.concat(location_chunks)
                    all_data.append(location_df)
                    print(f"✅ {location_id}: {location_df.shape[0]} hours")

        # Combine all dataframes
        if all_data:
            combined_df = pl.concat(all_data)

            # Save to parquet
            output_path.parent.mkdir(parents=True, exist_ok=True)
            combined_df.write_parquet(output_path)

            print()
            print("=" * 70)
            print("Collection Complete")
            print("=" * 70)
            print(f"Total HTTP requests: {request_count}")
            print(f"Total API calls consumed: {self.total_api_calls:.1f}")
            print(f"Total records: {combined_df.shape[0]:,}")
            print(f"Date range: {combined_df['timestamp'].min()} to {combined_df['timestamp'].max()}")
            print(f"Grid points: {combined_df['grid_point'].n_unique()}")
            print(f"Completeness: {(1 - combined_df.null_count().sum() / (combined_df.shape[0] * combined_df.shape[1])) * 100:.2f}%")
            print(f"Output: {output_path}")
            print(f"File size: {output_path.stat().st_size / (1024**2):.1f} MB")

            return combined_df
        else:
            print("❌ No data collected")
            return pl.DataFrame()


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Collect OpenMeteo weather data with proper rate limiting")
    parser.add_argument(
        '--start-date',
        default='2024-10-01',
        help='Start date (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--end-date',
        default='2025-09-30',
        help='End date (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--output',
        type=Path,
        default=Path('data/raw/weather_2024_2025.parquet'),
        help='Output Parquet file path'
    )
    parser.add_argument(
        '--requests-per-minute',
        type=int,
        default=270,
        help='HTTP requests per minute (default: 270 = 45%% of 600 limit)'
    )
    parser.add_argument(
        '--chunk-days',
        type=int,
        default=14,
        help='Days per request chunk (default: 14 = 1.0 API call)'
    )

    args = parser.parse_args()

    # Initialize collector and run
    collector = OpenMeteoCollector(
        requests_per_minute=args.requests_per_minute,
        chunk_days=args.chunk_days
    )

    collector.collect_all(
        start_date=args.start_date,
        end_date=args.end_date,
        output_path=args.output
    )

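The "~5 minutes total" figure in the module docstring follows directly from the chunking scheme; a quick back-of-the-envelope check (pure arithmetic, no API access needed):

    import math

    days = 365                      # Oct 2024 - Sept 2025, roughly one year
    chunks = math.ceil(days / 14)   # 27 two-week chunks per grid point
    grid_points = 52

    http_requests = grid_points * chunks   # 1,404 requests, each counted as 1.0 API call
    print(f"{http_requests} requests / 270 per minute ≈ {http_requests / 270:.1f} minutes")
    # Also comfortably below the 5,000/hour and 10,000/day free-tier caps.
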
@@ -0,0 +1,95 @@ src/data_collection/download_all.py
"""Download all FBMC data from HuggingFace Datasets.

This script downloads all required datasets from HuggingFace Datasets to local storage.
Used for setting up new environments (HF Space, analyst handover, etc.)
"""

from pathlib import Path
from hf_datasets_manager import FBMCDatasetManager
import sys


def setup_data(data_dir: Path = Path("data/raw"), force_redownload: bool = False):
    """Download all datasets if not present locally.

    Args:
        data_dir: Directory to store downloaded data (default: data/raw)
        force_redownload: Re-download even if files exist (default: False)
    """
    print("=" * 60)
    print("FBMC Data Setup - Download from HuggingFace Datasets")
    print("=" * 60)

    manager = FBMCDatasetManager()

    # Expected datasets (will be created during Day 1)
    datasets_to_download = {
        "fbmc-cnecs-2024-2025": "cnecs_2024_2025.parquet",
        "fbmc-weather-2024-2025": "weather_2024_2025.parquet",
        "fbmc-entsoe-2024-2025": "entsoe_2024_2025.parquet",
    }

    data_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    skip_count = 0
    fail_count = 0

    for dataset_name, filename in datasets_to_download.items():
        output_path = data_dir / filename

        print(f"\n[{filename}]")

        if output_path.exists() and not force_redownload:
            file_size_mb = output_path.stat().st_size / (1024 * 1024)
            print(f"✅ Already exists ({file_size_mb:.1f} MB), skipping")
            skip_count += 1
            continue

        try:
            df = manager.download_dataset(dataset_name, output_path)
            if df is not None:
                success_count += 1
            else:
                fail_count += 1
        except Exception as e:
            print(f"❌ Failed to download {dataset_name}: {e}")
            print("   You may need to run Day 1 data collection first")
            fail_count += 1

    print("\n" + "=" * 60)
    print("Download Summary:")
    print(f"  ✅ Downloaded: {success_count}")
    print(f"  ⏭️ Skipped: {skip_count}")
    print(f"  ❌ Failed: {fail_count}")
    print("=" * 60)

    if fail_count > 0:
        print("\n⚠️ Some datasets failed to download.")
        print("   Run Day 1 data collection to create these datasets.")
        return False
    else:
        print("\n✅ Data setup complete!")
        return True


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Download FBMC datasets from HuggingFace")
    parser.add_argument(
        "--data-dir",
        type=Path,
        default=Path("data/raw"),
        help="Directory to store data (default: data/raw)"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Force re-download even if files exist"
    )

    args = parser.parse_args()

    success = setup_data(data_dir=args.data_dir, force_redownload=args.force)
    sys.exit(0 if success else 1)

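Usage note (assumed invocation, run from the repository root so the relative data/raw paths resolve and the sibling hf_datasets_manager import is found):

    python src/data_collection/download_all.py
    python src/data_collection/download_all.py --data-dir data/raw --force

The exit code is 0 only when every expected dataset ends up available locally, so the script can double as a setup gate when bootstrapping a new environment such as an HF Space.
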
@@ -0,0 +1,170 @@ src/data_collection/hf_datasets_manager.py
"""HuggingFace Datasets manager for FBMC data storage.

This utility manages uploading/downloading Parquet files to/from HuggingFace Datasets.
Following best practices: Code -> Git, Data -> HF Datasets (NOT Git LFS)
"""

import polars as pl
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi
from pathlib import Path
from dotenv import load_dotenv
import os
from typing import Optional


class FBMCDatasetManager:
    """Manage FBMC data uploads/downloads via HuggingFace Datasets."""

    def __init__(self):
        """Initialize with HF credentials from .env file."""
        # Load environment variables from .env
        load_dotenv()

        self.hf_token = os.getenv('HF_TOKEN')
        self.hf_username = os.getenv('HF_USERNAME')

        if not self.hf_token or 'your_hf' in self.hf_token.lower():
            print("⚠️ HF token not configured - upload features disabled")
            self.api = None
        else:
            self.api = HfApi(token=self.hf_token)

    def upload_dataset(
        self,
        parquet_path: Path,
        dataset_name: str,
        description: str = "",
        private: bool = False
    ) -> Optional[str]:
        """Upload Parquet file to HuggingFace Datasets.

        Args:
            parquet_path: Path to local Parquet file
            dataset_name: Name for HF dataset (e.g., 'fbmc-cnecs-2024-2025')
            description: Optional dataset description
            private: Whether dataset should be private (default: False for free storage)

        Returns:
            Full dataset name (username/dataset-name) or None if upload fails
        """
        if not self.api:
            print("❌ Cannot upload: HF token not configured")
            return None

        print(f"📤 Uploading {parquet_path.name} to HF Datasets...")

        try:
            # Load Parquet as polars, convert to HF Dataset
            df = pl.read_parquet(parquet_path)
            dataset = Dataset.from_pandas(df.to_pandas())

            # Create full dataset name
            full_name = f"{self.hf_username}/{dataset_name}"

            # Upload to HF
            dataset.push_to_hub(
                full_name,
                token=self.hf_token,
                private=private
            )

            print(f"✅ Uploaded to: https://huggingface.co/datasets/{full_name}")
            return full_name

        except Exception as e:
            print(f"❌ Upload failed: {e}")
            return None

    def download_dataset(
        self,
        dataset_name: str,
        output_path: Path,
        split: str = "train"
    ) -> Optional[pl.DataFrame]:
        """Download dataset from HF to local Parquet file.

        Args:
            dataset_name: HF dataset name (with or without username prefix)
            output_path: Local path to save Parquet file
            split: Dataset split to download (default: 'train')

        Returns:
            Polars DataFrame or None if download fails
        """
        from datasets import load_dataset

        # Add username prefix if not present
        if '/' not in dataset_name:
            dataset_name = f"{self.hf_username}/{dataset_name}"

        print(f"📥 Downloading {dataset_name} from HF Datasets...")

        try:
            # Download from HF
            dataset = load_dataset(dataset_name, split=split)

            # Convert to polars and save
            df = pl.from_pandas(dataset.to_pandas())
            output_path.parent.mkdir(parents=True, exist_ok=True)
            df.write_parquet(output_path)

            print(f"✅ Downloaded to: {output_path}")
            print(f"   Shape: {df.shape}")
            return df

        except Exception as e:
            print(f"❌ Download failed: {e}")
            return None

    def list_datasets(self, filter_fbmc: bool = True) -> list:
        """List all datasets for this user.

        Args:
            filter_fbmc: Only show FBMC-related datasets (default: True)

        Returns:
            List of dataset info dictionaries
        """
        if not self.api:
            print("❌ Cannot list: HF token not configured")
            return []

        try:
            datasets = list(self.api.list_datasets(author=self.hf_username))

            if filter_fbmc:
                datasets = [d for d in datasets if 'fbmc' in d.id.lower()]

            print(f"\n📊 {'FBMC ' if filter_fbmc else ''}Datasets for {self.hf_username}:")
            for ds in datasets:
                print(f"  - {ds.id}")

            return datasets

        except Exception as e:
            print(f"❌ List failed: {e}")
            return []


# Example usage
if __name__ == "__main__":
    manager = FBMCDatasetManager()

    # Test configuration
    print("HF Datasets Manager initialized")
    print(f"Username: {manager.hf_username}")
    print(f"Token configured: {manager.api is not None}")

    # Upload example (will be used in Day 1)
    # manager.upload_dataset(
    #     parquet_path=Path("data/raw/cnecs_2024_2025.parquet"),
    #     dataset_name="fbmc-cnecs-2024-2025",
    #     description="FBMC CNECs data: Oct 2024 - Sept 2025"
    # )

    # Download example (will be used when setting up new environments)
    # manager.download_dataset(
    #     dataset_name="fbmc-cnecs-2024-2025",
    #     output_path=Path("data/raw/cnecs_2024_2025.parquet")
    # )

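The manager only needs HF_TOKEN and HF_USERNAME from the .env file. A round-trip sketch under the assumption that both are configured and that Day 1 collection has already produced the weather Parquet file (the round-trip output path below is hypothetical):

    from pathlib import Path
    from hf_datasets_manager import FBMCDatasetManager

    manager = FBMCDatasetManager()   # reads HF_TOKEN / HF_USERNAME via load_dotenv()
    manager.upload_dataset(
        parquet_path=Path("data/raw/weather_2024_2025.parquet"),
        dataset_name="fbmc-weather-2024-2025",
    )
    manager.list_datasets(filter_fbmc=True)   # confirm the dataset is visible
    manager.download_dataset(
        dataset_name="fbmc-weather-2024-2025",
        output_path=Path("data/raw/weather_roundtrip_check.parquet"),  # hypothetical path
    )
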
File without changes (src/feature_engineering/__init__.py)
File without changes (src/model/__init__.py)
File without changes (src/utils/__init__.py)
@@ -0,0 +1,251 @@ src/utils/data_loader.py
"""Data loading utilities for FBMC forecasting project.

Provides convenient functions to load and filter FBMC data files.
"""

import polars as pl
from pathlib import Path
from typing import Optional, List
from datetime import datetime, timedelta


class FBMCDataLoader:
    """Load and filter FBMC data with convenient methods."""

    def __init__(self, data_dir: Path = Path("data/raw")):
        """Initialize data loader.

        Args:
            data_dir: Directory containing Parquet files (default: data/raw)
        """
        self.data_dir = Path(data_dir)
        if not self.data_dir.exists():
            raise FileNotFoundError(f"Data directory not found: {data_dir}")

    def load_cnecs(
        self,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        borders: Optional[List[str]] = None
    ) -> pl.DataFrame:
        """Load CNEC data with optional filtering.

        Args:
            start_date: Start date (ISO format: 'YYYY-MM-DD')
            end_date: End date (ISO format: 'YYYY-MM-DD')
            borders: List of border codes to filter (e.g., ['DE_NL', 'DE_FR'])

        Returns:
            Polars DataFrame with CNEC data
        """
        file_path = self.data_dir / "cnecs_2024_2025.parquet"
        if not file_path.exists():
            raise FileNotFoundError(f"CNECs file not found: {file_path}")

        cnecs = pl.read_parquet(file_path)

        # Apply date filters
        if start_date:
            cnecs = cnecs.filter(pl.col("timestamp") >= start_date)
        if end_date:
            cnecs = cnecs.filter(pl.col("timestamp") <= end_date)

        # Apply border filter
        if borders:
            cnecs = cnecs.filter(pl.col("border").is_in(borders))

        return cnecs

    def load_weather(
        self,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        grid_points: Optional[List[str]] = None
    ) -> pl.DataFrame:
        """Load weather data with optional filtering.

        Args:
            start_date: Start date (ISO format: 'YYYY-MM-DD')
            end_date: End date (ISO format: 'YYYY-MM-DD')
            grid_points: List of grid point IDs to filter

        Returns:
            Polars DataFrame with weather data
        """
        file_path = self.data_dir / "weather_2024_2025.parquet"
        if not file_path.exists():
            raise FileNotFoundError(f"Weather file not found: {file_path}")

        weather = pl.read_parquet(file_path)

        # Apply date filters
        if start_date:
            weather = weather.filter(pl.col("timestamp") >= start_date)
        if end_date:
            weather = weather.filter(pl.col("timestamp") <= end_date)

        # Apply grid point filter
        if grid_points:
            weather = weather.filter(pl.col("grid_point").is_in(grid_points))

        return weather

    def load_entsoe(
        self,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        zones: Optional[List[str]] = None
    ) -> pl.DataFrame:
        """Load ENTSO-E data with optional filtering.

        Args:
            start_date: Start date (ISO format: 'YYYY-MM-DD')
            end_date: End date (ISO format: 'YYYY-MM-DD')
            zones: List of bidding zone codes (e.g., ['DE_LU', 'FR', 'NL'])

        Returns:
            Polars DataFrame with ENTSO-E data
        """
        file_path = self.data_dir / "entsoe_2024_2025.parquet"
        if not file_path.exists():
            raise FileNotFoundError(f"ENTSO-E file not found: {file_path}")

        entsoe = pl.read_parquet(file_path)

        # Apply date filters
        if start_date:
            entsoe = entsoe.filter(pl.col("timestamp") >= start_date)
        if end_date:
            entsoe = entsoe.filter(pl.col("timestamp") <= end_date)

        # Apply zone filter
        if zones:
            entsoe = entsoe.filter(pl.col("zone").is_in(zones))

        return entsoe

    def get_date_range(self) -> dict:
        """Get available date range from all datasets.

        Returns:
            Dictionary with min/max dates for each dataset
        """
        date_ranges = {}

        try:
            cnecs = pl.read_parquet(self.data_dir / "cnecs_2024_2025.parquet")
            date_ranges['cnecs'] = {
                'min': cnecs['timestamp'].min(),
                'max': cnecs['timestamp'].max()
            }
        except Exception:
            date_ranges['cnecs'] = None

        try:
            weather = pl.read_parquet(self.data_dir / "weather_2024_2025.parquet")
            date_ranges['weather'] = {
                'min': weather['timestamp'].min(),
                'max': weather['timestamp'].max()
            }
        except Exception:
            date_ranges['weather'] = None

        try:
            entsoe = pl.read_parquet(self.data_dir / "entsoe_2024_2025.parquet")
            date_ranges['entsoe'] = {
                'min': entsoe['timestamp'].min(),
                'max': entsoe['timestamp'].max()
            }
        except Exception:
            date_ranges['entsoe'] = None

        return date_ranges

    def validate_data_completeness(
        self,
        start_date: str,
        end_date: str,
        max_missing_pct: float = 5.0
    ) -> dict:
        """Validate data completeness for a given date range.

        Args:
            start_date: Start date (ISO format)
            end_date: End date (ISO format)
            max_missing_pct: Maximum acceptable missing data percentage

        Returns:
            Dictionary with validation results for each dataset
        """
        results = {}

        # Calculate expected number of hours
        start_dt = datetime.fromisoformat(start_date)
        end_dt = datetime.fromisoformat(end_date)
        expected_hours = int((end_dt - start_dt).total_seconds() / 3600)

        # Validate CNECs
        try:
            cnecs = self.load_cnecs(start_date, end_date)
            actual_hours = cnecs.select(pl.col("timestamp").n_unique()).item()
            missing_pct = (1 - actual_hours / expected_hours) * 100

            results['cnecs'] = {
                'expected_hours': expected_hours,
                'actual_hours': actual_hours,
                'missing_pct': missing_pct,
                'valid': missing_pct <= max_missing_pct
            }
        except Exception as e:
            results['cnecs'] = {'error': str(e), 'valid': False}

        # Validate weather
        try:
            weather = self.load_weather(start_date, end_date)
            actual_hours = weather.select(pl.col("timestamp").n_unique()).item()
            missing_pct = (1 - actual_hours / expected_hours) * 100

            results['weather'] = {
                'expected_hours': expected_hours,
                'actual_hours': actual_hours,
                'missing_pct': missing_pct,
                'valid': missing_pct <= max_missing_pct
            }
        except Exception as e:
            results['weather'] = {'error': str(e), 'valid': False}

        # Validate ENTSO-E
        try:
            entsoe = self.load_entsoe(start_date, end_date)
            actual_hours = entsoe.select(pl.col("timestamp").n_unique()).item()
            missing_pct = (1 - actual_hours / expected_hours) * 100

            results['entsoe'] = {
                'expected_hours': expected_hours,
                'actual_hours': actual_hours,
                'missing_pct': missing_pct,
                'valid': missing_pct <= max_missing_pct
            }
        except Exception as e:
            results['entsoe'] = {'error': str(e), 'valid': False}

        return results


# Example usage
if __name__ == "__main__":
    # Initialize loader
    loader = FBMCDataLoader(data_dir=Path("data/raw"))

    # Check available date ranges
    print("Available date ranges:")
    date_ranges = loader.get_date_range()
    for dataset, ranges in date_ranges.items():
        if ranges:
            print(f"  {dataset}: {ranges['min']} to {ranges['max']}")
        else:
            print(f"  {dataset}: Not available")

    # Load specific data
    # cnecs = loader.load_cnecs(start_date="2024-10-01", end_date="2024-10-31")
    # weather = loader.load_weather(start_date="2024-10-01", end_date="2024-10-31")

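Before running zero-shot inference it is worth gating on validate_data_completeness; a short sketch of how the result dictionary is meant to be consumed (the 5% threshold is the method's default and the date range matches the project scope):

    from pathlib import Path
    from src.utils.data_loader import FBMCDataLoader

    loader = FBMCDataLoader(data_dir=Path("data/raw"))
    report = loader.validate_data_completeness("2024-10-01", "2025-09-30", max_missing_pct=5.0)

    for dataset, status in report.items():
        if status.get('valid'):
            print(f"{dataset}: OK ({status['missing_pct']:.2f}% missing)")
        else:
            print(f"{dataset}: NOT READY -> {status.get('error', 'too many missing hours')}")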