Spaces:
Sleeping
Sleeping
"""
Inspect JAO Sample Data Structure
Quick visual inspection of MaxBEX and CNECs/PTDFs data

Reads the two sample parquet files and writes a plain-text report
(shapes, schemas, head rows, summary statistics, null counts and a few
derived counts) to ``data/raw/sample/data_inspection.txt``.
"""
import sys
from contextlib import redirect_stdout
from pathlib import Path

import polars as pl

# Report destination and the sample data inputs.
output_file = Path('data/raw/sample/data_inspection.txt')
maxbex_path = Path('data/raw/sample/maxbex_sample_sept2025.parquet')
cnecs_path = Path('data/raw/sample/cnecs_sample_sept2025.parquet')

SEPARATOR = "=" * 80

# Redirect output to a UTF-8 file to avoid console encoding issues.
# The context managers close the file and restore the previous stdout even
# if reading or printing fails part-way through.
with open(output_file, 'w', encoding='utf-8') as report, redirect_stdout(report):
    print(SEPARATOR)
    print("JAO SAMPLE DATA INSPECTION")
    print(SEPARATOR)

    # ========================================================================
    # 1. MaxBEX DATA (TARGET VARIABLE)
    # ========================================================================
    print("\n" + SEPARATOR)
    print("1. MaxBEX DATA (TARGET VARIABLE)")
    print(SEPARATOR)

    maxbex_df = pl.read_parquet(maxbex_path)

    print(f"\nShape: {maxbex_df.shape[0]} rows x {maxbex_df.shape[1]} columns")
    print("\nColumn names (first 20 border directions):")
    print(maxbex_df.columns[:20])
    print("\nDataFrame Schema:")
    print(maxbex_df.schema)
    print("\nFirst 5 rows:")
    print(maxbex_df.head(5))
    print("\nBasic Statistics (first 10 borders):")
    print(maxbex_df.select(maxbex_df.columns[:10]).describe())

    # null_count() yields a one-row frame; add up that row across columns.
    null_counts = maxbex_df.null_count()
    total_nulls = sum(null_counts[col][0] for col in maxbex_df.columns)
    print(f"\nNull Values: {total_nulls} total across all columns")

    # ========================================================================
    # 2. CNECs/PTDFs DATA
    # ========================================================================
    print("\n" + SEPARATOR)
    print("2. CNECs/PTDFs DATA (Active Constraints)")
    print(SEPARATOR)

    cnecs_df = pl.read_parquet(cnecs_path)

    print(f"\nShape: {cnecs_df.shape[0]} rows x {cnecs_df.shape[1]} columns")
    print("\nColumn names:")
    print(cnecs_df.columns)
    print("\nDataFrame Schema:")
    print(cnecs_df.schema)
    print("\nFirst 5 rows:")
    print(cnecs_df.head(5))
    print("\nBasic Statistics (numeric columns):")

    # Accept every numeric dtype, not only Float64/Int64, so narrower or
    # unsigned columns are not silently left out of the statistics.
    numeric_cols = [
        name for name, dtype in cnecs_df.schema.items() if dtype.is_numeric()
    ]
    if numeric_cols:
        print(cnecs_df.select(numeric_cols).describe())
    else:
        # Nothing to describe; avoid calling describe() on an empty selection.
        print("(no numeric columns)")

    null_counts_cnecs = cnecs_df.null_count()
    total_nulls_cnecs = sum(null_counts_cnecs[col][0] for col in cnecs_df.columns)
    print(f"\nNull Values: {total_nulls_cnecs} total across all columns")

    # ========================================================================
    # 3. KEY INSIGHTS
    # ========================================================================
    print("\n" + SEPARATOR)
    print("3. KEY INSIGHTS")
    print(SEPARATOR)

    # Computed once and reused for both the count and the listing below.
    ptdf_cols = [c for c in cnecs_df.columns if c.startswith('ptdf_')]

    # NOTE: the lines without placeholders are fixed descriptive text; they
    # are not derived from or checked against the loaded data.
    print("\nMaxBEX Data:")
    print(" - Time series format: Index is datetime")
    print(f" - Border directions: {maxbex_df.shape[1]} total")
    print(" - Wide format: Each column = one border direction")
    print(" - Data type: All float64 (MW capacity values)")

    print("\nCNECs/PTDFs Data:")
    print(f" - Unique CNECs: {cnecs_df['cnec_name'].n_unique()}")
    print(f" - Unique TSOs: {cnecs_df['tso'].n_unique()}")
    print(f" - PTDF columns: {len(ptdf_cols)}")
    print(f" - Has shadow prices: {'shadow_price' in cnecs_df.columns}")
    print(f" - Has RAM values: {'ram' in cnecs_df.columns}")

    # Show sample CNEC names in order of first appearance so that repeated
    # runs over the same file list the same ten names.
    print("\nSample CNEC names (first 10):")
    sample_names = cnecs_df['cnec_name'].unique(maintain_order=True).head(10)
    for i, name in enumerate(sample_names, start=1):
        print(f" {i}. {name}")

    # Show PTDF column names
    print(f"\nPTDF columns ({len(ptdf_cols)} zones):")
    print(f" {ptdf_cols}")

    print("\n" + SEPARATOR)
    print("INSPECTION COMPLETE")
    print(SEPARATOR)

# The report file is closed and stdout restored here; tell the user where
# the report went.
print(f"[OK] Data inspection saved to: {output_file}")
print(f" View with: cat {output_file}")