# Migrate all data loading to use the Hugging Face dataset directly
# (commit e3aec0d, shaun3141)
"""Status checking utilities."""
import os
import json
import pandas as pd
from data.manager import (
ENTITIES_PATH, MODEL_OUTPUT_DIR, AUDIO_DIR
)
from data.loader import get_hf_dataset, HF_DATASET_NAME, is_dataset_loaded, get_train_dataframe, get_test_dataframe
def _split_has_audio(dataset, split):
    """Return True if *split* exists in *dataset* and exposes an 'audio' feature.

    Accessing ``.features`` may raise ImportError when torchcodec is not
    installed; that case is treated as "cannot tell" and reported as False
    to avoid false positives.
    """
    try:
        if split in dataset:
            features = dataset[split].features
            return bool(features) and 'audio' in features
    except ImportError as e:
        if 'torchcodec' in str(e).lower():
            return False
        raise
    return False


def check_setup_status():
    """Check the status of various setup components.

    Returns:
        dict: Status flags and counts:
            - entities_extracted (bool): the entities JSON file exists
            - model_trained (bool): the model output directory exists
            - train_data / test_data (bool): split present in the HF dataset
            - train_count / test_count (int): samples per split (0 if absent)
            - audio_files (bool): local ``.wav`` files exist in AUDIO_DIR
            - audio_from_hf (bool): the HF dataset exposes an 'audio' column
            - audio_count_hf (int): samples carrying audio in the HF dataset
            - espnet_installed (bool): espnet2 is importable
            - flash_attention_available (bool): flash_attn is importable
            - entity_count (int): extracted entity count (0 if unavailable)
    """
    hf_dataset = get_hf_dataset()

    # Dataset split availability and sizes.
    train_data_available = False
    test_data_available = False
    train_count = 0
    test_count = 0
    if hf_dataset is not None:
        if 'train' in hf_dataset:
            train_data_available = True
            train_count = len(hf_dataset['train'])
        if 'test' in hf_dataset:
            test_data_available = True
            test_count = len(hf_dataset['test'])

    status = {
        "entities_extracted": os.path.exists(ENTITIES_PATH),
        "model_trained": os.path.exists(MODEL_OUTPUT_DIR),
        "train_data": train_data_available,
        "test_data": test_data_available,
        "train_count": train_count,
        "test_count": test_count,
        "audio_files": os.path.exists(AUDIO_DIR)
        and any(f.endswith('.wav') for f in os.listdir(AUDIO_DIR)),
        "audio_from_hf": False,
        "audio_count_hf": 0,
    }

    # If no cached dataset, try loading it directly (best-effort: the dataset
    # may be private or the network unavailable; the audio check is then skipped).
    if hf_dataset is None:
        try:
            from datasets import load_dataset
            hf_dataset = load_dataset(HF_DATASET_NAME, token=os.getenv("HF_TOKEN"))
        except Exception:
            hf_dataset = None

    if hf_dataset is not None:
        # Check for an 'audio' column without iterating the dataset, which
        # would trigger decoding (and hence a torchcodec requirement).
        try:
            audio_count = 0
            if _split_has_audio(hf_dataset, 'train'):
                status["audio_from_hf"] = True
                audio_count += len(hf_dataset['train'])
            if _split_has_audio(hf_dataset, 'test'):
                status["audio_from_hf"] = True
                audio_count += len(hf_dataset['test'])
            if audio_count > 0:
                status["audio_count_hf"] = audio_count
        except ImportError as e:
            # torchcodec missing: cannot check audio features; leave
            # audio_from_hf False to avoid false positives.
            if 'torchcodec' not in str(e).lower():
                raise
        except Exception as e:
            # Other errors - don't fail the whole status check, just skip audio.
            if 'torchcodec' not in str(e).lower():
                print(f"⚠ Warning checking audio features: {e}")

    # Optional dependencies.
    try:
        from espnet2.bin.s2t_inference import Speech2Text  # noqa: F401
        status["espnet_installed"] = True
    except ImportError:
        status["espnet_installed"] = False

    # Flash Attention (optional optimization).
    try:
        import flash_attn  # noqa: F401
        status["flash_attention_available"] = True
    except ImportError:
        status["flash_attention_available"] = False

    # Entity count from the extracted-entities JSON, if present.
    status["entity_count"] = 0
    if status["entities_extracted"]:
        try:
            with open(ENTITIES_PATH, 'r') as f:
                data = json.load(f)
            status["entity_count"] = len(data.get('entities', []))
        except (OSError, ValueError, AttributeError):
            # Unreadable, malformed, or non-dict JSON: report zero entities.
            status["entity_count"] = 0
    return status
def get_data_loading_status():
    """Get formatted data loading status for the Load Data tab.

    Returns:
        str: Markdown summary of train/test/audio availability, including
        a note about automatic loading from the HF dataset.
    """
    status = check_setup_status()
    status_text = """
### 📊 Current Data Status
"""

    # Train data status
    if status['train_data']:
        status_text += f"✅ **Training Data**: Available from dataset ({status['train_count']:,} samples)\n"
    else:
        status_text += "❌ **Training Data**: Not loaded from dataset\n"

    # Test data status
    if status['test_data']:
        status_text += f"✅ **Test Data**: Available from dataset ({status['test_count']:,} samples)\n"
    else:
        status_text += "❌ **Test Data**: Not loaded from dataset\n"

    # Audio files status - prefer the HF dataset figure when available.
    if status['audio_from_hf']:
        status_text += f"✅ **Audio Files**: Available from HF Dataset ({status['audio_count_hf']:,} files)\n"
        status_text += "   Audio files are loaded directly from the dataset on-demand.\n"
    elif status['audio_files']:
        try:
            audio_count = len([f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')])
            status_text += f"✅ **Audio Files**: Available locally ({audio_count:,} files)\n"
        except OSError:
            # Directory vanished or is unreadable; still report availability.
            status_text += "✅ **Audio Files**: Available locally\n"
    else:
        status_text += "⚠️ **Audio Files**: Not loaded (will be loaded on-demand from HF Dataset if available)\n"

    # Note about automatic loading from the HF dataset.
    if status['train_data'] or status['test_data']:
        status_text += f"\n💡 **Note**: Data is automatically loaded from `{HF_DATASET_NAME}` on startup.\n"
        if status['audio_from_hf']:
            status_text += "Audio files are accessed directly from the HF dataset (no local storage needed).\n"
    else:
        status_text += f"\n💡 **Note**: Data will be automatically loaded from `{HF_DATASET_NAME}` on startup.\n"
        status_text += "If it hasn't loaded yet, you can manually load it below.\n"
    return status_text
def get_status_display():
    """Get formatted status display with project insights.

    Builds an HTML fragment with insight cards (sample counts, audio files,
    vocabulary size, entities) followed by a system-status section and a
    quick-navigation footer.

    Returns:
        str: HTML markup for the homepage status panel.
    """
    status = check_setup_status()

    # --- Project insight metrics ---
    train_samples = status.get('train_count', 0)
    test_samples = status.get('test_count', 0)
    audio_count = 0
    vocab_size = 0

    # Vocabulary stats from the training transcriptions (best-effort: these
    # numbers are decorative and must never break the status page).
    if status['train_data']:
        try:
            train_df = get_train_dataframe()
            if 'Transcription' in train_df.columns:
                # Estimate vocabulary size from lowercased whitespace tokens.
                all_words = set()
                for text in train_df['Transcription'].dropna():
                    all_words.update(text.lower().split())
                vocab_size = len(all_words)
        except Exception:
            pass

    # Audio count - prefer the HF dataset figure over local files.
    if status['audio_from_hf']:
        audio_count = status['audio_count_hf']
    elif status['audio_files']:
        try:
            audio_count = len([f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')])
        except OSError:
            audio_count = 0

    status_text = """
<div class="homepage-header">
    <h1>🏝️ Caribbean Voices Hackathon</h1>
    <p class="subtitle">OWSM v3.1 Training & Inference Platform</p>
</div>
<div class="insights-section">
    <h2>📈 Project Insights</h2>
    <div class="insights-grid">
"""

    # Colorful insight cards (only for non-zero metrics).
    if train_samples > 0:
        status_text += f"""
        <div class="insight-card card-blue">
            <div class="insight-number">{train_samples:,}</div>
            <div class="insight-label">Training Samples</div>
        </div>
"""
    if test_samples > 0:
        status_text += f"""
        <div class="insight-card card-green">
            <div class="insight-number">{test_samples:,}</div>
            <div class="insight-label">Test Samples</div>
        </div>
"""
    if audio_count > 0:
        status_text += f"""
        <div class="insight-card card-purple">
            <div class="insight-number">{audio_count:,}</div>
            <div class="insight-label">Audio Files</div>
        </div>
"""
    if vocab_size > 0:
        status_text += f"""
        <div class="insight-card card-orange">
            <div class="insight-number">{vocab_size:,}</div>
            <div class="insight-label">Unique Words</div>
        </div>
"""
    if status['entities_extracted']:
        status_text += f"""
        <div class="insight-card card-teal">
            <div class="insight-number">{status['entity_count']}</div>
            <div class="insight-label">Caribbean Entities</div>
        </div>
"""

    status_text += """
    </div>
</div>
<div class="status-section">
    <h2>🔧 System Status</h2>
    <div class="status-grid">
"""

    # --- Data Files status group ---
    status_text += """
        <div class="status-group">
            <h3>📁 Data Files</h3>
"""
    status_text += f"<div class='status-item'>{'✅' if status['train_data'] else '❌'} <strong>Training Data:</strong> {'Available from dataset' if status['train_data'] else 'Not loaded'}"
    if train_samples > 0:
        status_text += f" ({train_samples:,} samples)"
    status_text += "</div>\n"

    status_text += f"<div class='status-item'>{'✅' if status['test_data'] else '❌'} <strong>Test Data:</strong> {'Available from dataset' if status['test_data'] else 'Not loaded'}"
    if test_samples > 0:
        status_text += f" ({test_samples:,} samples)"
    status_text += "</div>\n"

    # Audio files line - prefer the HF dataset status when available.
    if status['audio_from_hf']:
        status_text += f"<div class='status-item'>✅ <strong>Audio Files:</strong> Available from HF Dataset ({audio_count:,} files)</div>\n"
    elif status['audio_files']:
        status_text += f"<div class='status-item'>✅ <strong>Audio Files:</strong> Available locally ({audio_count:,} files)</div>\n"
    else:
        status_text += "<div class='status-item'>⚠️ <strong>Audio Files:</strong> Not Loaded</div>\n"
    status_text += "</div>\n"

    # --- OWSM Setup status group ---
    status_text += """
        <div class="status-group">
            <h3>🤖 OWSM Setup</h3>
"""
    status_text += f"<div class='status-item'>{'✅' if status['espnet_installed'] else '❌'} <strong>ESPnet:</strong> {'Installed' if status['espnet_installed'] else 'Not Installed'}</div>\n"

    if status.get('flash_attention_available', False):
        status_text += "<div class='status-item'>✅ <strong>Flash Attention:</strong> Available (A10G GPU optimized)</div>\n"
    else:
        status_text += "<div class='status-item'>⚠️ <strong>Flash Attention:</strong> Not Available (should be installed for A10G GPU)</div>\n"

    status_text += f"<div class='status-item'>{'✅' if status['entities_extracted'] else '❌'} <strong>Entities:</strong> {'Extracted' if status['entities_extracted'] else 'Not Extracted'}"
    if status['entities_extracted']:
        status_text += f" ({status['entity_count']} entities)"
    status_text += "</div>\n"

    status_text += f"<div class='status-item'>{'✅' if status['model_trained'] else '❌'} <strong>Model:</strong> {'Trained' if status['model_trained'] else 'Not Trained'}</div>\n"
    status_text += "</div>\n"

    status_text += """
    </div>
</div>
<div class="workflow-section">
    <h2>🚀 Quick Navigation</h2>
    <p class="workflow-description">Jump to the tools you need to work with the Caribbean Voices dataset:</p>
</div>
"""
    return status_text