Evgueni Poloukarov and Claude committed
Commit 44b73f4 · 1 Parent(s): af88e60

feat: implement zero-shot inference pipeline for Day 3


Created complete inference infrastructure:

**New modules (src/inference/)** (usage sketch after this list):
- data_fetcher.py: DataFetcher class for preparing Chronos 2 input
* Loads unified features from HF Dataset or local parquet
* Identifies 615 future covariates from metadata
* Prepares context windows (configurable length)
* Formats data for predict_df() API
* Handles multivariate forecasting (38 borders)

- chronos_pipeline.py: ChronosForecaster class for inference
* Loads Chronos 2 Large (710M params) with GPU support
* Zero-shot inference via predict_df() API
* Probabilistic forecasts (mean, median, quantiles)
* Performance benchmarking utilities
* Parquet export functionality
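
A minimal sketch of how the two modules compose for the full 14-day, all-border run (illustrative only: the as-of date offset and output path are placeholders; the class and method names come from the modules added in this commit):

```python
# Assumes src/ is on sys.path, as done in scripts/test_inference_pipeline.py.
from datetime import timedelta

from inference.data_fetcher import DataFetcher
from inference.chronos_pipeline import ChronosForecaster

# Prepare context and future-covariate frames for all 38 borders.
fetcher = DataFetcher(use_local=True, context_length=512)
fetcher.load_data()
_, max_date = fetcher.get_available_dates()
forecast_date = max_date - timedelta(days=30)  # illustrative as-of date

context_df, future_df = fetcher.prepare_inference_data(
    forecast_date=forecast_date,
    prediction_length=336,  # 14 days
)

# Zero-shot inference with Chronos 2 Large, then export to parquet.
forecaster = ChronosForecaster(model_name="amazon/chronos-2-large", device="auto")
forecaster.load_model()
forecasts = forecaster.predict(
    context_df=context_df,
    future_df=future_df,
    prediction_length=336,
    num_samples=100,
)
forecaster.save_forecasts(forecasts, "data/evaluation/forecast_14d.parquet")  # illustrative path
```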

**Testing**:
- scripts/test_inference_pipeline.py: Comprehensive smoke test
* Tests data loading, model loading, inference
* Validates output quality and performance
* Estimates 14-day forecast time
* Single border × 7 days test case
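
The smoke test inserts `src/` into `sys.path` itself, so it is meant to be run directly from the repository root; every check is assertion-based, so any failure surfaces as a traceback and a non-zero exit.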

**HuggingFace Space**:
- Fixed BUILD_ERROR by adding jupyterlab to requirements
- Space rebuild in progress (commit a7e66e0)

**Status**: Ready for local testing and smoke test execution

Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

scripts/test_inference_pipeline.py ADDED
@@ -0,0 +1,177 @@
"""
Smoke test for zero-shot inference pipeline

Tests:
1. Data loading and preparation
2. Chronos 2 model loading
3. Inference on single border (7 days)
4. Output validation
5. Performance metrics
"""

import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from inference.data_fetcher import DataFetcher
from inference.chronos_pipeline import ChronosForecaster
from datetime import datetime, timedelta
import torch
import pandas as pd

def main():
    print("="*60)
    print("FBMC Chronos 2 Zero-Shot Inference - Smoke Test")
    print("="*60)

    # Step 1: Check environment
    print("\n[1] Checking environment...")
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        print("Running on CPU (inference will be slower)")

    # Step 2: Initialize DataFetcher
    print("\n[2] Initializing DataFetcher...")
    fetcher = DataFetcher(
        use_local=True,      # Use local files for testing
        context_length=512   # Use 512 hours context
    )

    # Step 3: Load data
    print("\n[3] Loading unified features...")
    fetcher.load_data()

    # Get available date range
    min_date, max_date = fetcher.get_available_dates()
    print(f"Available data: {min_date} to {max_date}")

    # Select forecast date (use last month as test)
    forecast_date = max_date - timedelta(days=30)
    print(f"Test forecast date: {forecast_date}")

    # Step 4: Prepare inference data (single border, 7 days)
    print("\n[4] Preparing inference data (1 border, 7 days)...")
    test_border = fetcher.target_borders[0]  # Use first border
    print(f"Test border: {test_border}")

    context_df, future_df = fetcher.prepare_inference_data(
        forecast_date=forecast_date,
        prediction_length=168,  # 7 days
        borders=[test_border]
    )

    print(f"Context shape: {context_df.shape}")
    print(f"Future shape: {future_df.shape}")

    # Step 5: Validate prepared data
    print("\n[5] Validating prepared data...")
    assert 'timestamp' in context_df.columns, "Missing timestamp column"
    assert 'border' in context_df.columns, "Missing border column"
    assert 'target' in context_df.columns, "Missing target column"
    assert len(context_df) > 0, "Empty context data"
    assert len(future_df) > 0, "Empty future data"
    print("[+] Data validation passed!")

    # Check for NaN values
    context_nulls = context_df.isnull().sum().sum()
    future_nulls = future_df.isnull().sum().sum()
    print(f"Context NaN count: {context_nulls}")
    print(f"Future NaN count: {future_nulls}")

    if context_nulls > 0 or future_nulls > 0:
        print("[!] Warning: Data contains NaN values (will be handled by model)")

    # Step 6: Initialize Chronos 2 forecaster
    print("\n[6] Initializing Chronos 2 forecaster...")
    forecaster = ChronosForecaster(
        model_name="amazon/chronos-2-large",
        device="auto"  # Will use GPU if available
    )

    # Step 7: Load model
    print("\n[7] Loading Chronos 2 Large model...")
    print("(This may take a few minutes on first load)")
    forecaster.load_model()
    print("[+] Model loaded successfully!")

    # Step 8: Run inference
    print("\n[8] Running zero-shot inference...")
    print(f"Forecasting {test_border} for 7 days (168 hours)")

    forecasts = forecaster.predict_single_border(
        border=test_border,
        context_df=context_df,
        future_df=future_df,
        prediction_length=168,
        num_samples=100  # 100 samples for probabilistic forecast
    )

    print(f"[+] Inference complete! Forecast shape: {forecasts.shape}")

    # Step 9: Validate forecasts
    print("\n[9] Validating forecasts...")
    assert len(forecasts) > 0, "Empty forecasts"
    assert 'timestamp' in forecasts.columns or forecasts.index.name == 'timestamp', "Missing timestamp"

    # Check for reasonable values
    if 'mean' in forecasts.columns:
        mean_forecast = forecasts['mean']
        print("Forecast statistics:")
        print(f"  Mean: {mean_forecast.mean():.2f} MW")
        print(f"  Min: {mean_forecast.min():.2f} MW")
        print(f"  Max: {mean_forecast.max():.2f} MW")
        print(f"  Std: {mean_forecast.std():.2f} MW")

        # Sanity check: values should be reasonable for power capacity
        assert mean_forecast.min() >= 0, "Negative forecasts detected"
        assert mean_forecast.max() < 20000, "Unreasonably high forecasts"
        print("[+] Forecast validation passed!")

    # Step 10: Benchmark performance
    print("\n[10] Benchmarking inference performance...")
    metrics = forecaster.benchmark_inference(
        context_df=context_df,
        future_df=future_df,
        prediction_length=168
    )

    print("Performance metrics:")
    for key, value in metrics.items():
        print(f"  {key}: {value}")

    # Check if we meet the 5-minute target (for 14 days)
    # by scaling the 7-day timing to a 14-day estimate
    estimated_14d_time = metrics['inference_time_sec'] * (336 / 168)
    print(f"\nEstimated time for 14-day forecast: {estimated_14d_time:.1f}s ({estimated_14d_time/60:.1f} min)")

    if estimated_14d_time < 300:  # 5 minutes
        print("[+] Performance target met! (<5 min for 14 days)")
    else:
        print("[!] Warning: May not meet 5-minute target for 14 days")

    # Step 11: Save test forecasts
    print("\n[11] Saving test forecasts...")
    output_path = "data/evaluation/smoke_test_forecast.parquet"
    forecaster.save_forecasts(forecasts, output_path)
    print(f"[+] Saved to: {output_path}")

    # Summary
    print("\n" + "="*60)
    print("SMOKE TEST SUMMARY")
    print("="*60)
    print("[+] All tests passed!")
    print(f"[+] Border: {test_border}")
    print(f"[+] Forecast length: 168 hours (7 days)")
    print(f"[+] Inference time: {metrics['inference_time_sec']:.1f}s")
    print(f"[+] Output shape: {forecasts.shape}")
    print("\n[+] Ready for full inference run!")
    print("="*60)

if __name__ == "__main__":
    main()
src/inference/__init__.py ADDED
@@ -0,0 +1,6 @@
"""Zero-shot inference pipeline for FBMC flow forecasting"""

from .data_fetcher import DataFetcher
from .chronos_pipeline import ChronosForecaster

__all__ = ['DataFetcher', 'ChronosForecaster']
src/inference/chronos_pipeline.py ADDED
@@ -0,0 +1,280 @@
"""
Chronos 2 Zero-Shot Inference Pipeline

Handles:
1. Loading Chronos 2 Large model (710M params)
2. Running zero-shot inference using predict_df() API
3. GPU/CPU device mapping
4. Saving predictions to parquet
"""

from pathlib import Path
from typing import Optional, Dict, List
import pandas as pd
import torch
from datetime import datetime
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ChronosForecaster:
    """
    Zero-shot forecaster using Chronos 2 Large model.

    Features:
    - Multivariate forecasting (multiple borders simultaneously)
    - Covariate support (615 future covariates)
    - Large context window (up to 8,192 hours)
    - DataFrame API for easy data handling
    """

    def __init__(
        self,
        model_name: str = "amazon/chronos-2-large",
        device: str = "auto",
        torch_dtype: str = "float32"
    ):
        """
        Initialize Chronos 2 forecaster.

        Args:
            model_name: HuggingFace model name (default: chronos-2-large)
            device: Device to run on ('auto', 'cuda', 'cpu')
            torch_dtype: Torch dtype ('float32', 'float16', 'bfloat16')
        """
        self.model_name = model_name
        self.device = self._resolve_device(device)
        self.torch_dtype = self._resolve_dtype(torch_dtype)
        self.pipeline = None

        logger.info("ChronosForecaster initialized:")
        logger.info(f"  Model: {model_name}")
        logger.info(f"  Device: {self.device}")
        logger.info(f"  Dtype: {self.torch_dtype}")

    def _resolve_device(self, device: str) -> str:
        """Resolve device string to actual device."""
        if device == "auto":
            return "cuda" if torch.cuda.is_available() else "cpu"
        return device

    def _resolve_dtype(self, dtype_str: str) -> torch.dtype:
        """Resolve dtype string to torch dtype."""
        dtype_map = {
            "float32": torch.float32,
            "float16": torch.float16,
            "bfloat16": torch.bfloat16
        }
        return dtype_map.get(dtype_str, torch.float32)

    def load_model(self):
        """Load Chronos 2 model from HuggingFace."""
        if self.pipeline is not None:
            logger.info("Model already loaded")
            return

        logger.info(f"Loading {self.model_name}...")
        logger.info("This may take a few minutes on first load...")

        try:
            from chronos import Chronos2Pipeline

            # Load with device_map for GPU support
            self.pipeline = Chronos2Pipeline.from_pretrained(
                self.model_name,
                device_map=self.device if self.device == "cuda" else None,
                torch_dtype=self.torch_dtype
            )

            # Move to device if not using device_map
            if self.device == "cpu":
                self.pipeline = self.pipeline.to(self.device)

            logger.info(f"Model loaded successfully on {self.device}")

            # Print GPU info if available
            if self.device == "cuda":
                gpu_name = torch.cuda.get_device_name(0)
                gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
                logger.info(f"GPU: {gpu_name} ({gpu_memory:.1f} GB VRAM)")

        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise

    def predict(
        self,
        context_df: pd.DataFrame,
        future_df: pd.DataFrame,
        prediction_length: int = 336,
        id_column: str = "border",
        timestamp_column: str = "timestamp",
        num_samples: int = 100
    ) -> pd.DataFrame:
        """
        Run zero-shot inference using Chronos 2.

        Args:
            context_df: Historical data (timestamp, border, target, features)
            future_df: Future covariates (timestamp, border, future_covariates)
            prediction_length: Number of hours to forecast
            id_column: Column name for border ID
            timestamp_column: Column name for timestamp
            num_samples: Number of samples for probabilistic forecast

        Returns:
            forecasts_df: DataFrame with predictions (timestamp, border, mean, median, q10, q90)
        """
        if self.pipeline is None:
            self.load_model()

        logger.info("Running zero-shot inference...")
        logger.info(f"Context shape: {context_df.shape}")
        logger.info(f"Future shape: {future_df.shape}")
        logger.info(f"Prediction length: {prediction_length} hours")
        logger.info(f"Borders: {context_df[id_column].nunique()}")

        try:
            # Run inference
            forecasts = self.pipeline.predict_df(
                context_df=context_df,
                future_df=future_df,
                prediction_length=prediction_length,
                id_column=id_column,
                timestamp_column=timestamp_column,
                num_samples=num_samples
            )

            logger.info(f"Inference complete! Forecast shape: {forecasts.shape}")

            # Add metadata
            forecasts['forecast_date'] = context_df[timestamp_column].max()
            forecasts['model'] = self.model_name

            return forecasts

        except Exception as e:
            logger.error(f"Inference failed: {e}")
            raise

    def predict_single_border(
        self,
        border: str,
        context_df: pd.DataFrame,
        future_df: pd.DataFrame,
        prediction_length: int = 336,
        num_samples: int = 100
    ) -> pd.DataFrame:
        """
        Run inference for a single border (useful for testing).

        Args:
            border: Border name (e.g., 'AT_CZ')
            context_df: Historical data
            future_df: Future covariates
            prediction_length: Hours to forecast
            num_samples: Samples for probabilistic forecast

        Returns:
            forecasts_df: Predictions for single border
        """
        logger.info(f"Running inference for border: {border}")

        # Filter for single border
        context_border = context_df[context_df['border'] == border].copy()
        future_border = future_df[future_df['border'] == border].copy()

        # Run prediction
        forecasts = self.predict(
            context_df=context_border,
            future_df=future_border,
            prediction_length=prediction_length,
            num_samples=num_samples
        )

        return forecasts

    def save_forecasts(
        self,
        forecasts: pd.DataFrame,
        output_path: str,
        include_metadata: bool = True
    ):
        """
        Save forecasts to parquet file.

        Args:
            forecasts: Forecast DataFrame
            output_path: Path to save parquet file
            include_metadata: Include model metadata
        """
        logger.info(f"Saving forecasts to: {output_path}")

        # Create output directory if needed
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Add metadata
        if include_metadata:
            forecasts = forecasts.copy()
            forecasts['saved_at'] = datetime.now()

        # Save to parquet
        forecasts.to_parquet(output_path, index=False)

        logger.info(f"Saved {len(forecasts)} rows to {output_path}")

    def benchmark_inference(
        self,
        context_df: pd.DataFrame,
        future_df: pd.DataFrame,
        prediction_length: int = 336
    ) -> Dict[str, float]:
        """
        Benchmark inference speed and memory usage.

        Args:
            context_df: Historical data
            future_df: Future covariates
            prediction_length: Hours to forecast

        Returns:
            metrics: Dict with inference_time_sec, gpu_memory_mb
        """
        import time

        logger.info("Benchmarking inference performance...")

        # Record start time and memory
        start_time = time.time()
        if self.device == "cuda":
            torch.cuda.reset_peak_memory_stats()

        # Run inference
        _ = self.predict(
            context_df=context_df,
            future_df=future_df,
            prediction_length=prediction_length
        )

        # Record end time and memory
        end_time = time.time()
        inference_time = end_time - start_time

        metrics = {
            'inference_time_sec': inference_time,
            'borders': context_df['border'].nunique(),
            'prediction_length': prediction_length
        }

        if self.device == "cuda":
            peak_memory = torch.cuda.max_memory_allocated() / 1e6  # MB
            metrics['gpu_memory_mb'] = peak_memory

        logger.info(f"Inference time: {inference_time:.2f}s")
        if 'gpu_memory_mb' in metrics:
            logger.info(f"Peak GPU memory: {metrics['gpu_memory_mb']:.1f} MB")

        return metrics
src/inference/data_fetcher.py ADDED
@@ -0,0 +1,252 @@
"""
Data Fetcher for Zero-Shot Inference

Prepares data for Chronos 2 inference by:
1. Loading unified features from HuggingFace Dataset
2. Identifying future covariates from metadata
3. Preparing context window (historical data)
4. Preparing future covariates for forecast horizon
5. Formatting data for Chronos 2 predict_df() API
"""

from pathlib import Path
from typing import Tuple, List, Optional
import pandas as pd
import polars as pl
from datetime import datetime, timedelta
from datasets import load_dataset
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DataFetcher:
    """
    Fetches and prepares data for zero-shot Chronos 2 inference.

    Handles:
    - Loading unified features (2,553 features)
    - Identifying future covariates (615 features)
    - Creating context windows for each border
    - Extending future covariates into forecast horizon
    """

    def __init__(
        self,
        dataset_name: str = "evgueni-p/fbmc-features-24month",
        local_features_path: Optional[str] = None,
        local_metadata_path: Optional[str] = None,
        context_length: int = 512,
        use_local: bool = False
    ):
        """
        Initialize DataFetcher.

        Args:
            dataset_name: HuggingFace dataset name
            local_features_path: Path to local features parquet file
            local_metadata_path: Path to local metadata CSV
            context_length: Number of hours to use as context (default: 512)
            use_local: If True, load from local files instead of HF Dataset
        """
        self.dataset_name = dataset_name
        self.local_features_path = local_features_path or "data/processed/features_unified_24month.parquet"
        self.local_metadata_path = local_metadata_path or "data/processed/features_unified_metadata.csv"
        self.context_length = context_length
        self.use_local = use_local

        # Will be loaded lazily
        self.features_df: Optional[pl.DataFrame] = None
        self.metadata_df: Optional[pd.DataFrame] = None
        self.future_covariate_cols: Optional[List[str]] = None
        self.target_borders: Optional[List[str]] = None

    def load_data(self):
        """Load unified features and metadata."""
        logger.info("Loading unified features and metadata...")

        if self.use_local:
            # Load from local files
            logger.info(f"Loading features from: {self.local_features_path}")
            self.features_df = pl.read_parquet(self.local_features_path)

            logger.info(f"Loading metadata from: {self.local_metadata_path}")
            self.metadata_df = pd.read_csv(self.local_metadata_path)
        else:
            # Load from HuggingFace Dataset
            logger.info(f"Loading features from HF Dataset: {self.dataset_name}")
            dataset = load_dataset(self.dataset_name, split="train")
            self.features_df = pl.from_pandas(dataset.to_pandas())

            # Try to load metadata from HF Dataset
            try:
                metadata_dataset = load_dataset(self.dataset_name, data_files="metadata.csv", split="train")
                self.metadata_df = metadata_dataset.to_pandas()
            except:
                logger.warning("Could not load metadata from HF Dataset, falling back to local")
                self.metadata_df = pd.read_csv(self.local_metadata_path)

        # Ensure timestamp column is datetime
        if 'timestamp' in self.features_df.columns:
            self.features_df = self.features_df.with_columns(
                pl.col('timestamp').str.to_datetime()
            )

        logger.info(f"Loaded {len(self.features_df)} rows, {len(self.features_df.columns)} columns")
        logger.info(f"Date range: {self.features_df['timestamp'].min()} to {self.features_df['timestamp'].max()}")

        # Identify future covariates
        self._identify_future_covariates()

        # Identify target borders
        self._identify_target_borders()

    def _identify_future_covariates(self):
        """Identify columns that are future covariates from metadata."""
        logger.info("Identifying future covariates from metadata...")

        # Filter for future covariates
        future_cov_meta = self.metadata_df[
            self.metadata_df['is_future_covariate'] == True
        ]

        self.future_covariate_cols = future_cov_meta['feature_name'].tolist()

        logger.info(f"Found {len(self.future_covariate_cols)} future covariates")
        logger.info(f"Categories: {future_cov_meta['category'].value_counts().to_dict()}")

    def _identify_target_borders(self):
        """Identify target borders from NTC columns."""
        logger.info("Identifying target borders...")

        # Find all ntc_actual_* columns
        ntc_cols = [col for col in self.features_df.columns if col.startswith('ntc_actual_')]

        # Extract border names
        self.target_borders = [col.replace('ntc_actual_', '') for col in ntc_cols]

        logger.info(f"Found {len(self.target_borders)} target borders")
        logger.info(f"Borders: {', '.join(self.target_borders[:5])}...")

    def prepare_inference_data(
        self,
        forecast_date: datetime,
        prediction_length: int = 336,  # 14 days
        borders: Optional[List[str]] = None
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Prepare context and future data for Chronos 2 inference.

        Args:
            forecast_date: The date to forecast from (as-of date)
            prediction_length: Number of hours to forecast (default: 336 = 14 days)
            borders: List of borders to forecast (default: all borders)

        Returns:
            context_df: Historical data (timestamp, border, target, all features)
            future_df: Future covariates (timestamp, border, future covariates only)
        """
        if self.features_df is None:
            self.load_data()

        borders = borders or self.target_borders

        logger.info(f"Preparing inference data for {len(borders)} borders")
        logger.info(f"Forecast date: {forecast_date}")
        logger.info(f"Context length: {self.context_length} hours")
        logger.info(f"Prediction length: {prediction_length} hours")

        # Extract context window (historical data)
        context_start = forecast_date - timedelta(hours=self.context_length)
        context_df = self.features_df.filter(
            (pl.col('timestamp') >= context_start) &
            (pl.col('timestamp') < forecast_date)
        )

        logger.info(f"Context window: {context_df['timestamp'].min()} to {context_df['timestamp'].max()}")
        logger.info(f"Context rows: {len(context_df)}")

        # Prepare context data for each border
        context_dfs = []
        for border in borders:
            ntc_col = f'ntc_actual_{border}'

            if ntc_col not in context_df.columns:
                logger.warning(f"Border {border} not found in features, skipping")
                continue

            # Select: timestamp, target, all features
            border_context = context_df.select([
                'timestamp',
                pl.lit(border).alias('border'),
                pl.col(ntc_col).alias('target'),
                *[col for col in context_df.columns if col not in ['timestamp', ntc_col]]
            ])

            context_dfs.append(border_context)

        # Combine all borders
        context_combined = pl.concat(context_dfs)

        logger.info(f"Combined context shape: {context_combined.shape}")

        # Prepare future covariates
        # For MVP: Use last known values or simple forward-fill
        # TODO: In production, fetch fresh weather forecasts, generate temporal features
        logger.info("Preparing future covariates...")

        future_dfs = []
        for border in borders:
            # Create future timestamps
            future_timestamps = pd.date_range(
                start=forecast_date,
                periods=prediction_length,
                freq='H'
            )

            # Get last known values of future covariates
            last_row = context_df.filter(pl.col('timestamp') == context_df['timestamp'].max())

            # Extract future covariate values
            future_values = last_row.select(self.future_covariate_cols)

            # Repeat for all future timestamps
            future_border_df = pl.DataFrame({
                'timestamp': future_timestamps,
                'border': [border] * len(future_timestamps)
            })

            # Add future covariate values (forward-fill from last known)
            for col in self.future_covariate_cols:
                if col in future_values.columns:
                    value = future_values[col][0]
                    future_border_df = future_border_df.with_columns(
                        pl.lit(value).alias(col)
                    )

            future_dfs.append(future_border_df)

        # Combine all borders
        future_combined = pl.concat(future_dfs)

        logger.info(f"Future covariates shape: {future_combined.shape}")

        # Convert to pandas for Chronos 2
        context_pd = context_combined.to_pandas()
        future_pd = future_combined.to_pandas()

        logger.info("Data preparation complete!")
        logger.info(f"Context: {context_pd.shape}, Future: {future_pd.shape}")

        return context_pd, future_pd

    def get_available_dates(self) -> Tuple[datetime, datetime]:
        """Get the available date range in the dataset."""
        if self.features_df is None:
            self.load_data()

        min_date = self.features_df['timestamp'].min()
        max_date = self.features_df['timestamp'].max()

        return min_date, max_date