Spaces:

evgueni-p
/

fbmc-chronos2

Sleeping

File size: 7,527 Bytes

27cb60a

"""Mask missing October 27-31, 2023 LTA data using forward fill from October 26.

Missing data: October 27-31, 2023 (~145 records, 0.5% of dataset)
Strategy: Forward fill LTA values from October 26, 2023
Rationale: LTA (Long Term Allocations) change infrequently, forward fill is conservative
"""
import sys
from pathlib import Path
from datetime import datetime, timedelta
import polars as pl

def main():
    """Forward fill missing October 27-31, 2023 LTA data."""

    print("\n" + "=" * 80)
    print("OCTOBER 27-31, 2023 LTA MASKING")
    print("=" * 80)
    print("Strategy: Forward fill from October 26, 2023")
    print("Missing data: ~145 records (0.5% of dataset)")
    print("=" * 80)
    print()

    # =========================================================================
    # 1. Load existing LTA data
    # =========================================================================
    lta_path = Path('data/raw/phase1_24month/jao_lta.parquet')

    if not lta_path.exists():
        print(f"[ERROR] LTA file not found: {lta_path}")
        return

    print("Loading existing LTA data...")
    lta_df = pl.read_parquet(lta_path)
    print(f"  Current records: {len(lta_df):,}")
    print(f"  Columns: {lta_df.columns}")
    print()

    # Backup existing file
    backup_path = lta_path.with_name('jao_lta.parquet.backup3')
    lta_df.write_parquet(backup_path)
    print(f"Backup created: {backup_path}")
    print()

    # =========================================================================
    # 2. Identify October 26, 2023 data (source for forward fill)
    # =========================================================================
    print("Extracting October 26, 2023 data...")

    # Use 'mtu' (Market Time Unit) timestamp column
    time_col = 'mtu'

    if time_col not in lta_df.columns:
        print(f"[ERROR] No 'mtu' timestamp column found. Available columns: {lta_df.columns}")
        return

    print(f"  Using timestamp column: '{time_col}'")

    # Convert to datetime if string
    if lta_df[time_col].dtype == pl.Utf8:
        lta_df = lta_df.with_columns([
            pl.col(time_col).str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S").alias(time_col)
        ])

    # Filter October 26, 2023 data
    oct_26_data = lta_df.filter(
        (pl.col(time_col).dt.year() == 2023) &
        (pl.col(time_col).dt.month() == 10) &
        (pl.col(time_col).dt.day() == 26)
    )

    print(f"  October 26, 2023 records: {len(oct_26_data)}")

    if len(oct_26_data) == 0:
        print("[ERROR] No October 26, 2023 data found to use for masking")
        return

    print()

    # =========================================================================
    # 3. Generate masked records for October 27-31, 2023
    # =========================================================================
    print("Generating masked records for October 27-31, 2023...")

    all_masked_records = []
    missing_days = [27, 28, 29, 30, 31]

    for day in missing_days:
        # Create masked records by copying Oct 26 data and updating timestamp
        masked_day = oct_26_data.clone()

        # Calculate time delta (1 day, 2 days, etc.)
        days_delta = day - 26

        # Update timestamp (preserve dtype)
        masked_day = masked_day.with_columns([
            (pl.col(time_col) + pl.duration(days=days_delta)).cast(lta_df[time_col].dtype).alias(time_col)
        ])

        # Add masking flag
        masked_day = masked_day.with_columns([
            pl.lit(True).alias('is_masked'),
            pl.lit('forward_fill_oct26').alias('masking_method')
        ])

        all_masked_records.append(masked_day)
        print(f"  Day {day}: {len(masked_day)} records (forward filled from Oct 26)")

    # Combine all masked records
    masked_df = pl.concat(all_masked_records, how='vertical')
    print(f"\n  Total masked records: {len(masked_df):,}")
    print()

    # =========================================================================
    # 4. Add masking flags to existing data
    # =========================================================================
    print("Adding masking flags to existing data...")

    # Add is_masked=False and masking_method=None to existing records
    lta_df = lta_df.with_columns([
        pl.lit(False).alias('is_masked'),
        pl.lit(None).cast(pl.Utf8).alias('masking_method')
    ])

    # =========================================================================
    # 5. Merge and validate
    # =========================================================================
    print("Merging masked records with existing data...")

    # Combine
    complete_df = pl.concat([lta_df, masked_df], how='vertical')

    # Sort by timestamp
    complete_df = complete_df.sort(time_col)

    # Deduplicate based on timestamp (October recovery created duplicates)
    initial_count = len(complete_df)
    complete_df = complete_df.unique(subset=['mtu'])
    deduped = initial_count - len(complete_df)

    if deduped > 0:
        print(f"  Removed {deduped} duplicate timestamps from October recovery merge")

    print()
    print("=" * 80)
    print("MASKING COMPLETE")
    print("=" * 80)
    print(f"Original records: {len(lta_df):,}")
    print(f"Masked records: {len(masked_df):,}")
    print(f"Total records: {len(complete_df):,}")
    print()

    # Count masked records
    masked_count = complete_df.filter(pl.col('is_masked') == True).height
    print(f"Masked data: {masked_count:,} records ({masked_count/len(complete_df)*100:.2f}%)")
    print()

    # =========================================================================
    # 6. Save complete dataset
    # =========================================================================
    print("Saving complete dataset...")
    complete_df.write_parquet(lta_path)
    print(f"  File: {lta_path}")
    print(f"  Size: {lta_path.stat().st_size / (1024**2):.2f} MB")
    print(f"  Backup: {backup_path}")
    print()

    # =========================================================================
    # 7. Validation
    # =========================================================================
    print("=" * 80)
    print("VALIDATION")
    print("=" * 80)

    # Check date continuity for October 2023
    oct_2023 = complete_df.filter(
        (pl.col(time_col).dt.year() == 2023) &
        (pl.col(time_col).dt.month() == 10)
    )

    unique_days = oct_2023.select(pl.col(time_col).dt.day().unique().sort()).to_series().to_list()
    expected_days = list(range(1, 32))  # 1-31

    missing_days_final = set(expected_days) - set(unique_days)

    if missing_days_final:
        print(f"[WARNING] October 2023 still missing days: {sorted(missing_days_final)}")
    else:
        print("[OK] October 2023 date continuity: Complete (days 1-31)")

    # Check masked records
    masked_oct = complete_df.filter(
        (pl.col(time_col).dt.year() == 2023) &
        (pl.col(time_col).dt.month() == 10) &
        (pl.col(time_col).dt.day().is_in([27, 28, 29, 30, 31])) &
        (pl.col('is_masked') == True)
    )

    print(f"[OK] Masked October 27-31, 2023: {len(masked_oct):,} records")

    # Overall data range
    min_date = complete_df.select(pl.col(time_col).min()).item()
    max_date = complete_df.select(pl.col(time_col).max()).item()
    print(f"[OK] Data range: {min_date} to {max_date}")

    print("=" * 80)
    print()
    print("SUCCESS: October 2023 LTA data masked with forward fill")
    print()

if __name__ == '__main__':
    main()