"""Status checking utilities.""" import os import json import pandas as pd from data.manager import ( ENTITIES_PATH, MODEL_OUTPUT_DIR, AUDIO_DIR ) from data.loader import get_hf_dataset, HF_DATASET_NAME, is_dataset_loaded, get_train_dataframe, get_test_dataframe def check_setup_status(): """Check the status of various setup components""" _hf_dataset = get_hf_dataset() # Check if dataset is loaded and has train/test splits train_data_available = False test_data_available = False train_count = 0 test_count = 0 if _hf_dataset is not None: if 'train' in _hf_dataset: train_data_available = True train_count = len(_hf_dataset['train']) if 'test' in _hf_dataset: test_data_available = True test_count = len(_hf_dataset['test']) status = { "entities_extracted": os.path.exists(ENTITIES_PATH), "model_trained": os.path.exists(MODEL_OUTPUT_DIR), "train_data": train_data_available, "test_data": test_data_available, "train_count": train_count, "test_count": test_count, "audio_files": os.path.exists(AUDIO_DIR) and len([f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')]) > 0, "audio_from_hf": False, "audio_count_hf": 0, } # Check if audio is available from HF dataset if _hf_dataset is None: try: from datasets import load_dataset hf_token = os.getenv("HF_TOKEN") _hf_dataset = load_dataset(HF_DATASET_NAME, token=hf_token) except: pass if _hf_dataset is not None: # Check if audio column exists without iterating (which would trigger decoding) # Even accessing .features might trigger torchcodec requirement, so wrap carefully audio_count = 0 try: # Try to check features - this might trigger torchcodec ImportError has_train_audio = False has_test_audio = False try: if 'train' in _hf_dataset: features = _hf_dataset['train'].features has_train_audio = 'audio' in features if features else False except ImportError as e: if 'torchcodec' in str(e).lower(): # Can't check - torchcodec not available has_train_audio = False else: raise try: if 'test' in _hf_dataset: features = _hf_dataset['test'].features has_test_audio = 'audio' in features if features else False except ImportError as e: if 'torchcodec' in str(e).lower(): # Can't check - torchcodec not available has_test_audio = False else: raise if has_train_audio: status["audio_from_hf"] = True try: audio_count = len(_hf_dataset['train']) except: pass if has_test_audio: status["audio_from_hf"] = True try: audio_count += len(_hf_dataset['test']) except: pass if audio_count > 0: status["audio_count_hf"] = audio_count except ImportError as e: if 'torchcodec' in str(e).lower(): # torchcodec not installed - can't check audio features # Don't set audio_from_hf to avoid false positives pass else: raise except Exception as e: # Other errors - don't fail, just skip audio check if 'torchcodec' not in str(e).lower(): print(f"⚠ Warning checking audio features: {e}") # Check ESPnet try: from espnet2.bin.s2t_inference import Speech2Text status["espnet_installed"] = True except ImportError: status["espnet_installed"] = False # Check Flash Attention (optional optimization) try: import flash_attn status["flash_attention_available"] = True except ImportError: status["flash_attention_available"] = False # Get entity count if exists if status["entities_extracted"]: try: with open(ENTITIES_PATH, 'r') as f: data = json.load(f) status["entity_count"] = len(data.get('entities', [])) except: status["entity_count"] = 0 else: status["entity_count"] = 0 return status def get_data_loading_status(): """Get formatted data loading status for Load Data tab""" status = check_setup_status() status_text = """ ### šŸ“Š Current Data Status """ # Train data status if status['train_data']: status_text += f"āœ… **Training Data**: Available from dataset ({status['train_count']:,} samples)\n" else: status_text += "āŒ **Training Data**: Not loaded from dataset\n" # Test data status if status['test_data']: status_text += f"āœ… **Test Data**: Available from dataset ({status['test_count']:,} samples)\n" else: status_text += "āŒ **Test Data**: Not loaded from dataset\n" # Audio files status - show HF dataset status if available if status['audio_from_hf']: status_text += f"āœ… **Audio Files**: Available from HF Dataset ({status['audio_count_hf']:,} files)\n" status_text += " Audio files are loaded directly from the dataset on-demand.\n" elif status['audio_files']: try: audio_count = len([f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')]) status_text += f"āœ… **Audio Files**: Available locally ({audio_count:,} files)\n" except: status_text += "āœ… **Audio Files**: Available locally\n" else: status_text += "āš ļø **Audio Files**: Not loaded (will be loaded on-demand from HF Dataset if available)\n" # Check if data was auto-loaded from HF if status['train_data'] or status['test_data']: status_text += f"\nšŸ’” **Note**: Data is automatically loaded from `{HF_DATASET_NAME}` on startup.\n" if status['audio_from_hf']: status_text += "Audio files are accessed directly from the HF dataset (no local storage needed).\n" else: status_text += f"\nšŸ’” **Note**: Data will be automatically loaded from `{HF_DATASET_NAME}` on startup.\n" status_text += "If it hasn't loaded yet, you can manually load it below.\n" return status_text def get_status_display(): """Get formatted status display with project insights""" status = check_setup_status() # Calculate project insights train_samples = status.get('train_count', 0) test_samples = status.get('test_count', 0) audio_count = 0 avg_transcription_length = 0 vocab_size = 0 # Try to get vocabulary stats from training data if available if status['train_data']: try: train_df = get_train_dataframe() if 'Transcription' in train_df.columns: avg_length = train_df['Transcription'].str.len().mean() avg_transcription_length = avg_length # Estimate vocabulary size all_words = set() for text in train_df['Transcription'].dropna(): all_words.update(text.lower().split()) vocab_size = len(all_words) except: pass # Check audio files - prefer HF dataset count if status['audio_from_hf']: audio_count = status['audio_count_hf'] elif status['audio_files']: try: audio_count = len([f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')]) except: audio_count = 0 else: audio_count = 0 status_text = """

šŸļø Caribbean Voices Hackathon

OWSM v3.1 Training & Inference Platform

šŸ“ˆ Project Insights

""" # Add colorful insight cards if train_samples > 0: status_text += f"""
{train_samples:,}
Training Samples
""" if test_samples > 0: status_text += f"""
{test_samples:,}
Test Samples
""" if audio_count > 0: status_text += f"""
{audio_count:,}
Audio Files
""" if vocab_size > 0: status_text += f"""
{vocab_size:,}
Unique Words
""" if status['entities_extracted']: status_text += f"""
{status['entity_count']}
Caribbean Entities
""" status_text += """

šŸ”§ System Status

""" # Data Files Status status_text += """

šŸ“ Data Files

""" status_text += f"
{'āœ…' if status['train_data'] else 'āŒ'} Training Data: {'Available from dataset' if status['train_data'] else 'Not loaded'}" if train_samples > 0: status_text += f" ({train_samples:,} samples)" status_text += "
\n" status_text += f"
{'āœ…' if status['test_data'] else 'āŒ'} Test Data: {'Available from dataset' if status['test_data'] else 'Not loaded'}" if test_samples > 0: status_text += f" ({test_samples:,} samples)" status_text += "
\n" # Audio files status - show HF dataset status if available if status['audio_from_hf']: status_text += f"
āœ… Audio Files: Available from HF Dataset ({audio_count:,} files)
\n" elif status['audio_files']: status_text += f"
āœ… Audio Files: Available locally ({audio_count:,} files)
\n" else: status_text += f"
āš ļø Audio Files: Not Loaded
\n" status_text += "
\n" # OWSM Setup Status status_text += """

šŸ¤– OWSM Setup

""" status_text += f"
{'āœ…' if status['espnet_installed'] else 'āŒ'} ESPnet: {'Installed' if status['espnet_installed'] else 'Not Installed'}
\n" flash_attn_status = status.get('flash_attention_available', False) if flash_attn_status: status_text += f"
āœ… Flash Attention: Available (A10G GPU optimized)
\n" else: status_text += f"
āš ļø Flash Attention: Not Available (should be installed for A10G GPU)
\n" status_text += f"
{'āœ…' if status['entities_extracted'] else 'āŒ'} Entities: {'Extracted' if status['entities_extracted'] else 'Not Extracted'}" if status['entities_extracted']: status_text += f" ({status['entity_count']} entities)" status_text += "
\n" status_text += f"
{'āœ…' if status['model_trained'] else 'āŒ'} Model: {'Trained' if status['model_trained'] else 'Not Trained'}
\n" status_text += "
\n" status_text += """

šŸš€ Quick Navigation

Jump to the tools you need to work with the Caribbean Voices dataset:

""" return status_text