# fbmc-chronos2/scripts/inspect_sample_data.py
# Author: Evgueni Poloukarov
# Commit: feat: complete Phase 1 ENTSO-E asset-specific outage validation
# Revision: 27cb60a
# (repository page links: raw, history, blame)
# Size: 3.83 kB
"""
Inspect JAO Sample Data Structure
Quick visual inspection of MaxBEX and CNECs/PTDFs data
"""
import sys
from contextlib import redirect_stdout
from pathlib import Path

import polars as pl
# Report destination. The inspection output is written to a UTF-8 text file
# instead of the console to avoid encoding issues when printing the tables.
output_file = Path('data/raw/sample/data_inspection.txt')

# Sample parquet files to inspect.
maxbex_path = Path('data/raw/sample/maxbex_sample_sept2025.parquet')
cnecs_path = Path('data/raw/sample/cnecs_sample_sept2025.parquet')

# Horizontal rule used to frame every section heading in the report.
_RULE = "=" * 80


def _print_banner(title, *, leading_blank=True):
    """Print ``title`` framed by two 80-character rules.

    Args:
        title: Heading text placed between the two rules.
        leading_blank: When true, an empty line is emitted before the top rule.
    """
    print(("\n" if leading_blank else "") + _RULE)
    print(title)
    print(_RULE)


# The context managers guarantee that the report file is flushed and closed
# and that the previous ``sys.stdout`` is restored even if loading or
# inspecting the data raises (e.g. a missing parquet file or column).
with open(output_file, 'w', encoding='utf-8') as _report, redirect_stdout(_report):
    _print_banner("JAO SAMPLE DATA INSPECTION", leading_blank=False)

    # ========================================================================
    # 1. MaxBEX DATA (TARGET VARIABLE)
    # ========================================================================
    _print_banner("1. MaxBEX DATA (TARGET VARIABLE)")
    maxbex_df = pl.read_parquet(maxbex_path)
    print(f"\nShape: {maxbex_df.shape[0]} rows x {maxbex_df.shape[1]} columns")
    print("\nColumn names (first 20 border directions):")
    print(maxbex_df.columns[:20])
    print("\nDataFrame Schema:")
    print(maxbex_df.schema)
    print("\nFirst 5 rows:")
    print(maxbex_df.head(5))
    print("\nBasic Statistics (first 10 borders):")
    print(maxbex_df.select(maxbex_df.columns[:10]).describe())

    # Check for nulls: null_count() returns a single-row frame holding one
    # count per column, so summing that row gives the grand total.
    null_counts = maxbex_df.null_count()
    total_nulls = sum(null_counts.row(0))
    print(f"\nNull Values: {total_nulls} total across all columns")

    # ========================================================================
    # 2. CNECs/PTDFs DATA
    # ========================================================================
    _print_banner("2. CNECs/PTDFs DATA (Active Constraints)")
    cnecs_df = pl.read_parquet(cnecs_path)
    print(f"\nShape: {cnecs_df.shape[0]} rows x {cnecs_df.shape[1]} columns")
    print("\nColumn names:")
    print(cnecs_df.columns)
    print("\nDataFrame Schema:")
    print(cnecs_df.schema)
    print("\nFirst 5 rows:")
    print(cnecs_df.head(5))
    print("\nBasic Statistics (numeric columns):")

    # Select every numeric column, whatever its width or signedness, rather
    # than only Float64/Int64, so narrower numeric columns are not dropped.
    numeric_cols = [
        name for name, dtype in cnecs_df.schema.items() if dtype.is_numeric()
    ]
    if numeric_cols:
        print(cnecs_df.select(numeric_cols).describe())
    else:
        # Avoid calling describe() on a frame with no columns.
        print("(no numeric columns)")

    # Check for nulls (same approach as for the MaxBEX frame).
    null_counts_cnecs = cnecs_df.null_count()
    total_nulls_cnecs = sum(null_counts_cnecs.row(0))
    print(f"\nNull Values: {total_nulls_cnecs} total across all columns")

    # ========================================================================
    # 3. KEY INSIGHTS
    # ========================================================================
    _print_banner("3. KEY INSIGHTS")

    # Columns whose name carries the 'ptdf_' prefix; computed once and reused
    # for both the count and the listing below.
    ptdf_cols = [c for c in cnecs_df.columns if c.startswith('ptdf_')]

    print("\nMaxBEX Data:")
    # NOTE(review): the "Time series format" and "Data type" lines below are
    # static descriptions; they are not derived from the loaded frame.
    print(" - Time series format: Index is datetime")
    print(f" - Border directions: {maxbex_df.shape[1]} total")
    print(" - Wide format: Each column = one border direction")
    print(" - Data type: All float64 (MW capacity values)")

    print("\nCNECs/PTDFs Data:")
    print(f" - Unique CNECs: {cnecs_df['cnec_name'].n_unique()}")
    print(f" - Unique TSOs: {cnecs_df['tso'].n_unique()}")
    print(f" - PTDF columns: {len(ptdf_cols)}")
    print(f" - Has shadow prices: {'shadow_price' in cnecs_df.columns}")
    print(f" - Has RAM values: {'ram' in cnecs_df.columns}")

    # Show sample CNEC names. maintain_order=True keeps first-appearance
    # order so the listed sample is reproducible between runs.
    print("\nSample CNEC names (first 10):")
    sample_names = cnecs_df['cnec_name'].unique(maintain_order=True).head(10)
    for position, name in enumerate(sample_names, start=1):
        print(f" {position}. {name}")

    # Show PTDF column names
    print(f"\nPTDF columns ({len(ptdf_cols)} zones):")
    print(f" {ptdf_cols}")

    _print_banner("INSPECTION COMPLETE")

# stdout is back on its previous stream here; tell the user where to look.
print(f"[OK] Data inspection saved to: {output_file}")
print(f" View with: cat {output_file}")