|
|
"""Status checking utilities.""" |
|
|
import os |
|
|
import json |
|
|
import pandas as pd |
|
|
from data.manager import ( |
|
|
ENTITIES_PATH, MODEL_OUTPUT_DIR, AUDIO_DIR |
|
|
) |
|
|
from data.loader import get_hf_dataset, HF_DATASET_NAME, is_dataset_loaded, get_train_dataframe, get_test_dataframe |
|
|
|
|
|
|
|
|
def _split_has_audio(dataset, split):
    """Return True if *split* exists in *dataset* and declares an 'audio' feature.

    Accessing ``.features`` can trigger lazy audio-decoder imports inside the
    HF datasets library; a torchcodec-related ImportError is treated as
    "audio unavailable" (returns False), while any other ImportError is
    re-raised for the caller to handle.
    """
    try:
        if split in dataset:
            features = dataset[split].features
            # ``features`` may be None for some dataset builders.
            return 'audio' in features if features else False
    except ImportError as e:
        if 'torchcodec' in str(e).lower():
            return False
        raise
    return False


def check_setup_status():
    """Check the status of various setup components.

    Returns:
        dict: Flags and counts describing data availability (train/test
        splits, audio files — local or from the HF dataset), optional
        dependencies (ESPnet, flash-attn), extracted-entity count, and
        trained-model presence.
    """
    _hf_dataset = get_hf_dataset()

    train_data_available = False
    test_data_available = False
    train_count = 0
    test_count = 0

    if _hf_dataset is not None:
        if 'train' in _hf_dataset:
            train_data_available = True
            train_count = len(_hf_dataset['train'])
        if 'test' in _hf_dataset:
            test_data_available = True
            test_count = len(_hf_dataset['test'])

    status = {
        "entities_extracted": os.path.exists(ENTITIES_PATH),
        "model_trained": os.path.exists(MODEL_OUTPUT_DIR),
        "train_data": train_data_available,
        "test_data": test_data_available,
        "train_count": train_count,
        "test_count": test_count,
        # True only when the local audio dir exists AND holds at least one .wav.
        "audio_files": os.path.exists(AUDIO_DIR) and len([f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')]) > 0,
        "audio_from_hf": False,
        "audio_count_hf": 0,
    }

    # Fall back to loading the dataset directly when the cached handle is
    # absent. Best effort: network/auth/import failures simply leave the
    # dataset unavailable for the audio checks below.
    if _hf_dataset is None:
        try:
            from datasets import load_dataset
            hf_token = os.getenv("HF_TOKEN")
            _hf_dataset = load_dataset(HF_DATASET_NAME, token=hf_token)
        except Exception:
            pass

    if _hf_dataset is not None:
        audio_count = 0
        try:
            has_train_audio = _split_has_audio(_hf_dataset, 'train')
            has_test_audio = _split_has_audio(_hf_dataset, 'test')

            if has_train_audio:
                status["audio_from_hf"] = True
                try:
                    audio_count = len(_hf_dataset['train'])
                except Exception:
                    pass
            if has_test_audio:
                status["audio_from_hf"] = True
                try:
                    audio_count += len(_hf_dataset['test'])
                except Exception:
                    pass

            if audio_count > 0:
                status["audio_count_hf"] = audio_count
        except ImportError as e:
            # torchcodec is an optional decoder dependency; its absence only
            # means audio can't be probed. Other ImportErrors propagate.
            if 'torchcodec' in str(e).lower():
                pass
            else:
                raise
        except Exception as e:
            if 'torchcodec' not in str(e).lower():
                print(f"β Warning checking audio features: {e}")

    # Optional-dependency probes: import success is the status signal.
    try:
        from espnet2.bin.s2t_inference import Speech2Text
        status["espnet_installed"] = True
    except ImportError:
        status["espnet_installed"] = False

    try:
        import flash_attn
        status["flash_attention_available"] = True
    except ImportError:
        status["flash_attention_available"] = False

    # Count extracted entities; any read/parse failure reports 0.
    if status["entities_extracted"]:
        try:
            with open(ENTITIES_PATH, 'r') as f:
                data = json.load(f)
            status["entity_count"] = len(data.get('entities', []))
        except Exception:
            status["entity_count"] = 0
    else:
        status["entity_count"] = 0

    return status
|
|
|
|
|
|
|
|
def get_data_loading_status():
    """Get formatted data loading status for Load Data tab.

    Summarises check_setup_status() as Markdown: train/test availability
    (with sample counts), the audio source (HF dataset vs. local .wav
    files), and a note about automatic loading on startup.

    Returns:
        str: Markdown-formatted status text.
    """
    # NOTE(review): the status-mark characters in the strings below appear
    # mojibake'd in this file (likely intended as check/cross/warning
    # emoji) -- verify rendering after any re-encoding.
    status = check_setup_status()

    status_text = """
### π Current Data Status

"""

    if status['train_data']:
        status_text += f"β **Training Data**: Available from dataset ({status['train_count']:,} samples)\n"
    else:
        status_text += "β **Training Data**: Not loaded from dataset\n"

    if status['test_data']:
        status_text += f"β **Test Data**: Available from dataset ({status['test_count']:,} samples)\n"
    else:
        status_text += "β **Test Data**: Not loaded from dataset\n"

    # Audio source: HF-dataset audio takes precedence over local files.
    if status['audio_from_hf']:
        status_text += f"β **Audio Files**: Available from HF Dataset ({status['audio_count_hf']:,} files)\n"
        status_text += "   Audio files are loaded directly from the dataset on-demand.\n"
    elif status['audio_files']:
        try:
            audio_count = len([f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')])
            status_text += f"β **Audio Files**: Available locally ({audio_count:,} files)\n"
        except OSError:
            # Directory vanished or became unreadable between the existence
            # check and the listing; report availability without a count.
            status_text += "β **Audio Files**: Available locally\n"
    else:
        status_text += "β οΈ **Audio Files**: Not loaded (will be loaded on-demand from HF Dataset if available)\n"

    # Startup hint: phrased in past tense once any split has loaded.
    if status['train_data'] or status['test_data']:
        status_text += f"\nπ‘ **Note**: Data is automatically loaded from `{HF_DATASET_NAME}` on startup.\n"
        if status['audio_from_hf']:
            status_text += "Audio files are accessed directly from the HF dataset (no local storage needed).\n"
    else:
        status_text += f"\nπ‘ **Note**: Data will be automatically loaded from `{HF_DATASET_NAME}` on startup.\n"
        status_text += "If it hasn't loaded yet, you can manually load it below.\n"

    return status_text
|
|
|
|
|
|
|
|
def get_status_display():
    """Get formatted status display with project insights.

    Builds the homepage HTML: a header, "insight" cards for non-zero
    metrics (sample counts, audio files, vocabulary size, entity count),
    a system-status section (data files, OWSM setup), and a quick
    navigation footer.

    Returns:
        str: HTML fragment summarising project/data/setup state.
    """
    # NOTE(review): several status-mark characters in the strings below
    # appear mojibake'd in this file (likely check/cross/warning emoji) --
    # verify rendering after any re-encoding.
    status = check_setup_status()

    # Derived metrics for the insight cards.
    train_samples = status.get('train_count', 0)
    test_samples = status.get('test_count', 0)
    audio_count = 0
    vocab_size = 0

    # Vocabulary stat from the train split. Best effort: any failure leaves
    # vocab_size at 0, which simply hides the card below.
    if status['train_data']:
        try:
            train_df = get_train_dataframe()
            if 'Transcription' in train_df.columns:
                # Rough vocabulary size: unique lowercased whitespace tokens.
                all_words = set()
                for text in train_df['Transcription'].dropna():
                    all_words.update(text.lower().split())
                vocab_size = len(all_words)
        except Exception:
            pass

    # Audio count: HF-dataset audio takes precedence over local .wav files.
    if status['audio_from_hf']:
        audio_count = status['audio_count_hf']
    elif status['audio_files']:
        try:
            audio_count = len([f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav')])
        except OSError:
            audio_count = 0
    else:
        audio_count = 0

    status_text = """
<div class="homepage-header">
    <h1>ποΈ Caribbean Voices Hackathon</h1>
    <p class="subtitle">OWSM v3.1 Training & Inference Platform</p>
</div>

<div class="insights-section">
    <h2>π Project Insights</h2>
    <div class="insights-grid">
"""

    # One insight card per non-zero metric; zero-valued cards are omitted.
    if train_samples > 0:
        status_text += f"""
<div class="insight-card card-blue">
    <div class="insight-number">{train_samples:,}</div>
    <div class="insight-label">Training Samples</div>
</div>
"""

    if test_samples > 0:
        status_text += f"""
<div class="insight-card card-green">
    <div class="insight-number">{test_samples:,}</div>
    <div class="insight-label">Test Samples</div>
</div>
"""

    if audio_count > 0:
        status_text += f"""
<div class="insight-card card-purple">
    <div class="insight-number">{audio_count:,}</div>
    <div class="insight-label">Audio Files</div>
</div>
"""

    if vocab_size > 0:
        status_text += f"""
<div class="insight-card card-orange">
    <div class="insight-number">{vocab_size:,}</div>
    <div class="insight-label">Unique Words</div>
</div>
"""

    if status['entities_extracted']:
        status_text += f"""
<div class="insight-card card-teal">
    <div class="insight-number">{status['entity_count']}</div>
    <div class="insight-label">Caribbean Entities</div>
</div>
"""

    # Close the insights grid and open the system-status section.
    status_text += """
    </div>
</div>

<div class="status-section">
    <h2>π§ System Status</h2>
    <div class="status-grid">
"""

    # --- Data files group ---
    status_text += """
<div class="status-group">
    <h3>π Data Files</h3>
"""
    status_text += f"<div class='status-item'>{'β ' if status['train_data'] else 'β'} <strong>Training Data:</strong> {'Available from dataset' if status['train_data'] else 'Not loaded'}"
    if train_samples > 0:
        status_text += f" ({train_samples:,} samples)"
    status_text += "</div>\n"

    status_text += f"<div class='status-item'>{'β ' if status['test_data'] else 'β'} <strong>Test Data:</strong> {'Available from dataset' if status['test_data'] else 'Not loaded'}"
    if test_samples > 0:
        status_text += f" ({test_samples:,} samples)"
    status_text += "</div>\n"

    if status['audio_from_hf']:
        status_text += f"<div class='status-item'>β <strong>Audio Files:</strong> Available from HF Dataset ({audio_count:,} files)</div>\n"
    elif status['audio_files']:
        status_text += f"<div class='status-item'>β <strong>Audio Files:</strong> Available locally ({audio_count:,} files)</div>\n"
    else:
        status_text += "<div class='status-item'>β οΈ <strong>Audio Files:</strong> Not Loaded</div>\n"
    status_text += "</div>\n"

    # --- OWSM setup group ---
    status_text += """
<div class="status-group">
    <h3>π€ OWSM Setup</h3>
"""
    status_text += f"<div class='status-item'>{'β ' if status['espnet_installed'] else 'β'} <strong>ESPnet:</strong> {'Installed' if status['espnet_installed'] else 'Not Installed'}</div>\n"
    flash_attn_status = status.get('flash_attention_available', False)
    if flash_attn_status:
        status_text += "<div class='status-item'>β <strong>Flash Attention:</strong> Available (A10G GPU optimized)</div>\n"
    else:
        status_text += "<div class='status-item'>β οΈ <strong>Flash Attention:</strong> Not Available (should be installed for A10G GPU)</div>\n"
    status_text += f"<div class='status-item'>{'β ' if status['entities_extracted'] else 'β'} <strong>Entities:</strong> {'Extracted' if status['entities_extracted'] else 'Not Extracted'}"
    if status['entities_extracted']:
        status_text += f" ({status['entity_count']} entities)"
    status_text += "</div>\n"
    status_text += f"<div class='status-item'>{'β ' if status['model_trained'] else 'β'} <strong>Model:</strong> {'Trained' if status['model_trained'] else 'Not Trained'}</div>\n"
    status_text += "</div>\n"

    # Close the status section and append the navigation footer.
    status_text += """
    </div>
</div>

<div class="workflow-section">
    <h2>π Quick Navigation</h2>
    <p class="workflow-description">Jump to the tools you need to work with the Caribbean Voices dataset:</p>
</div>
"""

    return status_text
|
|
|
|
|
|