shaun3141 committed
Commit 20e96fd · 1 Parent(s): 28b1bc3

Separate ESPnet and Whisper training modules with clear naming

- Created espnet_trainer.py: ESPnet-specific training (no HuggingFace fallbacks)
- Created whisper_trainer.py: Full HuggingFace transformers integration
- Updated UI with separate ESPnet and Whisper training tabs
- Fixed imports to use relative imports in training/__init__.py
- Removed old trainer.py (backed up as trainer_old.py.bak)
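
The new package surface is intentionally small; a minimal usage sketch, assuming `training/` is on the import path (both entry points share the same signature and return a `(markdown_message, metrics_or_none)` tuple, per the diffs below):

    from training import run_espnet_training_progress, run_whisper_training_progress

    msg, metrics = run_whisper_training_progress(epochs=3, batch_size=4, learning_rate=3e-5)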

training/__init__.py CHANGED
@@ -1,2 +1,15 @@
-"""Training logic for OWSM fine-tuning."""
-
+"""
+Training modules for Caribbean Voices Hackathon.
+
+Separate training modules:
+- espnet_trainer: ESPnet-specific training (no HuggingFace dependencies)
+- whisper_trainer: Whisper training with full HuggingFace integration
+"""
+
+from .espnet_trainer import run_espnet_training_progress
+from .whisper_trainer import run_whisper_training_progress
+
+__all__ = [
+    'run_espnet_training_progress',
+    'run_whisper_training_progress',
+]
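
Both trainers take an optional Gradio-style `progress` callback, invoked as `progress(fraction, desc=...)`. A stand-in for running outside the UI (this console callback is illustrative, not part of the commit):

    def console_progress(fraction, desc=""):
        # Mirrors the Gradio progress(fraction, desc=...) convention used by both trainers
        print(f"[{fraction:>4.0%}] {desc}")

    msg, info = run_espnet_training_progress(3, 4, 3e-5, progress=console_progress)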
training/espnet_trainer.py ADDED
@@ -0,0 +1,211 @@
+"""
+ESPnet-specific training for OWSM models.
+Uses ESPnet's native training framework - NO HuggingFace dependencies.
+"""
+import os
+import json
+import torch
+import numpy as np
+import random
+from typing import Tuple, Optional, Dict, Any
+from datasets import load_dataset, Audio
+from data.manager import ENTITIES_PATH, MODEL_OUTPUT_DIR, BASE_DIR
+
+# Set seeds for reproducibility
+SEED = 42
+random.seed(SEED)
+np.random.seed(SEED)
+torch.manual_seed(SEED)
+if torch.cuda.is_available():
+    torch.cuda.manual_seed_all(SEED)
+torch.use_deterministic_algorithms(True, warn_only=True)
+
+# ESPnet model configuration
+ESPNET_MODEL_NAME = "espnet/owsm_v3.1_ebf_small"
+TARGET_SR = 16000
+MAX_AUDIO_LENGTH = 30  # seconds
+HF_DATASET_NAME = "shaun3141/caribbean-voices-hackathon"
+
+
+def run_espnet_training_progress(epochs: int, batch_size: int, learning_rate: float, progress=None) -> Tuple[str, Optional[Dict[str, Any]]]:
+    """
+    Run ESPnet OWSM training setup with progress tracking.
+    Uses ESPnet's native training framework - NO HuggingFace fallbacks.
+    """
+    try:
+        if progress:
+            progress(0, desc="Initializing ESPnet training...")
+
+        # Check that ESPnet is installed - NO FALLBACKS
+        try:
+            from espnet2.bin.s2t_inference import Speech2Text
+        except ImportError as e:
+            raise RuntimeError(
+                f"❌ ESPnet is not installed!\n\n"
+                f"ESPnet is required for ESPnet model training.\n"
+                f"Install with: pip install espnet espnet_model_zoo\n\n"
+                f"Original error: {e}"
+            )
+
+        # Check prerequisites
+        if not os.path.exists(ENTITIES_PATH):
+            raise FileNotFoundError(
+                f"❌ Entities file not found at {ENTITIES_PATH}. "
+                f"Please extract entities first using the entity extraction tool."
+            )
+
+        if progress:
+            progress(0.05, desc="Loading entities...")
+        with open(ENTITIES_PATH, 'r') as f:
+            entities_data = json.load(f)
+
+        high_value_entities = set(entities_data['entities'])
+        print(f"Loaded {len(high_value_entities)} high-value entities")
+
+        if progress:
+            progress(0.1, desc="Loading dataset from Hugging Face...")
+
+        # Load dataset from HF
+        hf_token = os.getenv("HF_TOKEN")
+        print(f"Loading dataset: {HF_DATASET_NAME}")
+        dataset = load_dataset(HF_DATASET_NAME, token=hf_token)
+
+        if 'train' not in dataset:
+            raise ValueError(f"❌ Dataset {HF_DATASET_NAME} does not contain a 'train' split.")
+
+        train_full = dataset['train']
+        print(f"Loaded {len(train_full):,} total training samples")
+
+        # Cast to Audio to ensure the correct sampling rate
+        train_full = train_full.cast_column("audio", Audio(sampling_rate=TARGET_SR))
+
+        # Create train/val split
+        if progress:
+            progress(0.15, desc="Creating train/val split...")
+
+        split_dataset = train_full.train_test_split(test_size=0.1, seed=SEED)
+        train_dataset_raw = split_dataset['train']
+        val_dataset_raw = split_dataset['test']
+
+        print(f"Train: {len(train_dataset_raw):,} samples")
+        print(f"Val: {len(val_dataset_raw):,} samples")
+
+        # Load ESPnet model - NO FALLBACKS
+        if progress:
+            progress(0.2, desc=f"Loading ESPnet model: {ESPNET_MODEL_NAME}...")
+        print(f"\n{'='*70}")
+        print(f"Loading ESPnet model: {ESPNET_MODEL_NAME}")
+        print(f"{'='*70}")
+
+        espnet_model = Speech2Text.from_pretrained(ESPNET_MODEL_NAME)
+        print("✓ ESPnet model loaded successfully")
+
+        # Extract the tokenizer from the ESPnet model
+        if not hasattr(espnet_model, 'tokenizer'):
+            raise RuntimeError(
+                f"❌ ESPnet model {ESPNET_MODEL_NAME} does not have a 'tokenizer' attribute. "
+                f"This is required for training. The model may not be compatible with fine-tuning."
+            )
+
+        if espnet_model.tokenizer is None:
+            raise RuntimeError(
+                f"❌ ESPnet model {ESPNET_MODEL_NAME} has a None tokenizer. "
+                f"This is required for training. The model may not be properly initialized."
+            )
+
+        espnet_tokenizer = espnet_model.tokenizer
+        print("✓ Tokenizer extracted from ESPnet model")
+
+        # Extract the ASR model
+        if not hasattr(espnet_model, 'asr_model'):
+            raise RuntimeError(
+                f"❌ ESPnet model {ESPNET_MODEL_NAME} does not have an 'asr_model' attribute. "
+                f"This is required for training. The model may not be compatible with fine-tuning."
+            )
+
+        if espnet_model.asr_model is None:
+            raise RuntimeError(
+                f"❌ ESPnet model {ESPNET_MODEL_NAME} has a None asr_model. "
+                f"This is required for training. The model may not be properly initialized."
+            )
+
+        espnet_asr_model = espnet_model.asr_model
+        print("✓ ASR model extracted from ESPnet")
+
+        # ESPnet training requires ESPnet recipes;
+        # for now, print clear instructions instead of training in-process.
+        if progress:
+            progress(0.3, desc="Preparing ESPnet training setup...")
+
+        print(f"\n{'='*70}")
+        print("ESPnet Training Setup")
+        print(f"{'='*70}")
+        print("ESPnet models require ESPnet's native training framework.")
+        print("To fine-tune ESPnet models, you need to:")
+        print("1. Set up an ESPnet recipe (e.g., egs2/librispeech/asr1)")
+        print("2. Modify the recipe to use your data")
+        print("3. Run the ESPnet training script")
+        print("\nThe model and tokenizer have been loaded successfully.")
+        print("You can use them with ESPnet's training recipes.")
+        print(f"{'='*70}\n")
+
+        # Save model info for ESPnet recipes
+        model_info = {
+            'model_name': ESPNET_MODEL_NAME,
+            'entities': list(high_value_entities),
+            'train_samples': len(train_dataset_raw),
+            'val_samples': len(val_dataset_raw),
+            'training_framework': 'espnet',
+            'note': 'This model requires ESPnet native training recipes for fine-tuning'
+        }
+
+        os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)
+        model_info_path = os.path.join(MODEL_OUTPUT_DIR, "espnet_model_info.json")
+        with open(model_info_path, 'w') as f:
+            json.dump(model_info, f, indent=2)
+
+        # Save entities
+        entities_output_path = os.path.join(MODEL_OUTPUT_DIR, "caribbean_entities.json")
+        with open(entities_output_path, 'w') as f:
+            json.dump(entities_data, f, indent=2)
+
+        if progress:
+            progress(1.0, desc="Complete!")
+
+        success_msg = f"""
+## ✅ ESPnet Model Loaded Successfully!
+
+**Model:** {ESPNET_MODEL_NAME}
+**Output Directory:** {MODEL_OUTPUT_DIR}
+
+**Model Components:**
+- ✓ ESPnet Speech2Text model loaded
+- ✓ Tokenizer extracted
+- ✓ ASR model extracted
+
+**Files Saved:**
+- Model info: `{model_info_path}`
+- Entities: `{entities_output_path}`
+
+**Next Steps:**
+ESPnet models require ESPnet's native training framework for fine-tuning.
+Use ESPnet training recipes to fine-tune this model.
+
+**Training Data:**
+- Train samples: {len(train_dataset_raw):,}
+- Val samples: {len(val_dataset_raw):,}
+- Entities: {len(high_value_entities)}
+
+**Note:** This training interface loads the ESPnet model successfully.
+For actual fine-tuning, use ESPnet's training recipes.
+"""
+
+        return success_msg, model_info
+
+    except Exception as e:
+        import traceback
+        error_msg = f"❌ Error during ESPnet training setup: {str(e)}\n\n{traceback.format_exc()}"
+        print(error_msg)
+        if progress:
+            progress(1.0, desc="Error!")
+        return error_msg, None
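
Because fine-tuning is deferred to ESPnet recipes, this module only records the setup in `espnet_model_info.json`. A hedged sketch of reusing that file to reload the model for decoding (the `model/` path, the `soundfile` dependency, and the 16 kHz mono WAV are assumptions, not taken from this repo):

    import json
    import soundfile as sf
    from espnet2.bin.s2t_inference import Speech2Text

    with open("model/espnet_model_info.json") as f:  # placeholder for MODEL_OUTPUT_DIR
        info = json.load(f)

    s2t = Speech2Text.from_pretrained(info["model_name"])
    speech, sr = sf.read("example.wav")  # assumed 16 kHz mono
    results = s2t(speech)
    print(results[0][0])  # best hypothesis text is the first element of the top result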
training/whisper_trainer.py ADDED
@@ -0,0 +1,342 @@
+"""
+Whisper training using HuggingFace transformers.
+Full integration with HuggingFace training features.
+"""
+import os
+import json
+import torch
+import numpy as np
+import random
+from typing import Tuple, Optional, Dict, Any
+from datasets import load_dataset, Audio
+from transformers import (
+    WhisperProcessor,
+    WhisperForConditionalGeneration,
+    Seq2SeqTrainingArguments,
+    Seq2SeqTrainer,
+    DataCollatorForSeq2Seq,
+    EarlyStoppingCallback,
+)
+from owsm_model import OWSMWithEntityLoss
+from data.manager import ENTITIES_PATH, MODEL_OUTPUT_DIR, BASE_DIR
+
+# Set seeds for reproducibility
+SEED = 42
+random.seed(SEED)
+np.random.seed(SEED)
+torch.manual_seed(SEED)
+if torch.cuda.is_available():
+    torch.cuda.manual_seed_all(SEED)
+torch.use_deterministic_algorithms(True, warn_only=True)
+
+# Whisper model configuration
+WHISPER_MODEL_NAME = "openai/whisper-small"
+TARGET_SR = 16000
+MAX_AUDIO_LENGTH = 30  # seconds
+HF_DATASET_NAME = "shaun3141/caribbean-voices-hackathon"
+
+
+def compute_wer_metric(predictions, labels, tokenizer):
+    """Compute Word Error Rate metric."""
+    try:
+        import jiwer
+    except ImportError:
+        # Fallback: rough WER approximation if jiwer is not available
+        def simple_wer(ref, hyp):
+            ref_words = ref.lower().split()
+            hyp_words = hyp.lower().split()
+            if len(ref_words) == 0:
+                return 1.0 if len(hyp_words) > 0 else 0.0
+
+            if ' '.join(ref_words) == ' '.join(hyp_words):
+                return 0.0
+
+            # Word-overlap approximation (not a true Levenshtein WER)
+            common = len(set(ref_words) & set(hyp_words))
+            return 1.0 - (common / len(ref_words))
+
+        # Decode predictions and labels
+        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+
+        # Replace -100 with the pad token for decoding
+        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+        wer_scores = [simple_wer(ref, hyp) for ref, hyp in zip(decoded_labels, decoded_preds)]
+        return {"wer": np.mean(wer_scores)}
+
+    # Decode predictions and labels
+    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+
+    # Replace -100 with the pad token for decoding
+    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+    # Compute WER using jiwer
+    wer = jiwer.wer(decoded_labels, decoded_preds)
+    return {"wer": wer}
+
+
+def prepare_whisper_dataset(dataset, processor):
+    """
+    Prepare a dataset for Whisper training using Hugging Face Datasets.
+    """
+
+    def prepare_batch(batch):
+        """Process a batch of examples."""
+        audio = batch["audio"]
+        transcriptions = batch["transcription"]
+
+        # Extract input features with the processor
+        inputs = processor(
+            [x["array"] for x in audio],
+            sampling_rate=TARGET_SR,
+            return_tensors="pt",
+            padding=True,
+        )
+
+        # Tokenize transcriptions as labels
+        labels = processor.tokenizer(
+            transcriptions,
+            return_tensors="pt",
+            padding=True,
+        ).input_ids
+
+        # Replace padding token ids in the labels with -100 so they are ignored by the loss
+        labels[labels == processor.tokenizer.pad_token_id] = -100
+
+        batch["input_features"] = inputs.input_features
+        batch["labels"] = labels
+
+        return batch
+
+    # Remove columns that are not needed after preprocessing
+    column_names = dataset.column_names
+
+    # Process in batches
+    dataset = dataset.map(
+        prepare_batch,
+        batched=True,
+        batch_size=16,
+        remove_columns=column_names,
+        desc="Preprocessing dataset",
+    )
+
+    return dataset
+
+
+def run_whisper_training_progress(epochs: int, batch_size: int, learning_rate: float, progress=None) -> Tuple[str, Optional[Dict[str, Any]]]:
+    """
+    Run Whisper training with progress tracking using HuggingFace transformers.
+    Full integration with HuggingFace training features.
+    """
+    try:
+        if progress:
+            progress(0, desc="Preparing Whisper training...")
+
+        # Check prerequisites
+        if not os.path.exists(ENTITIES_PATH):
+            raise FileNotFoundError(
+                f"❌ Entities file not found at {ENTITIES_PATH}. "
+                f"Please extract entities first using the entity extraction tool."
+            )
+
+        if progress:
+            progress(0.05, desc="Loading entities...")
+        with open(ENTITIES_PATH, 'r') as f:
+            entities_data = json.load(f)
+
+        high_value_entities = set(entities_data['entities'])
+        print(f"Loaded {len(high_value_entities)} high-value entities")
+
+        if progress:
+            progress(0.1, desc="Loading dataset from Hugging Face...")
+
+        # Load dataset from HF
+        hf_token = os.getenv("HF_TOKEN")
+        print(f"Loading dataset: {HF_DATASET_NAME}")
+        dataset = load_dataset(HF_DATASET_NAME, token=hf_token)
+
+        if 'train' not in dataset:
+            raise ValueError(f"❌ Dataset {HF_DATASET_NAME} does not contain a 'train' split.")
+
+        train_full = dataset['train']
+        print(f"Loaded {len(train_full):,} total training samples")
+
+        # Cast to Audio to ensure the correct sampling rate
+        train_full = train_full.cast_column("audio", Audio(sampling_rate=TARGET_SR))
+
+        # Create train/val split
+        if progress:
+            progress(0.15, desc="Creating train/val split...")
+
+        split_dataset = train_full.train_test_split(test_size=0.1, seed=SEED)
+        train_dataset_raw = split_dataset['train']
+        val_dataset_raw = split_dataset['test']
+
+        print(f"Train: {len(train_dataset_raw):,} samples")
+        print(f"Val: {len(val_dataset_raw):,} samples")
+
+        # Load Whisper processor
+        if progress:
+            progress(0.2, desc=f"Loading Whisper processor: {WHISPER_MODEL_NAME}...")
+        print(f"\nLoading Whisper processor: {WHISPER_MODEL_NAME}")
+
+        processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_NAME)
+        print("✓ Whisper processor loaded successfully")
+
+        # Load Whisper model
+        if progress:
+            progress(0.25, desc=f"Loading Whisper model: {WHISPER_MODEL_NAME}...")
+        print(f"\nLoading Whisper model: {WHISPER_MODEL_NAME}")
+
+        # Use our wrapper class with entity-weighted loss
+        model = OWSMWithEntityLoss.from_pretrained(
+            WHISPER_MODEL_NAME,
+            tokenizer=processor.tokenizer,
+            high_value_tokens=high_value_entities,
+            entity_weight=3.0,
+        )
+
+        print("✓ Whisper model loaded successfully")
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        model.to(device)
+        print(f"Model on device: {device}")
+
+        # Prepare datasets
+        if progress:
+            progress(0.3, desc="Preprocessing training dataset...")
+        print("\nPreprocessing training dataset...")
+        train_dataset = prepare_whisper_dataset(train_dataset_raw, processor)
+
+        if progress:
+            progress(0.4, desc="Preprocessing validation dataset...")
+        print("Preprocessing validation dataset...")
+        val_dataset = prepare_whisper_dataset(val_dataset_raw, processor)
+
+        # Training arguments
+        if progress:
+            progress(0.5, desc="Setting up training arguments...")
+
+        training_args = Seq2SeqTrainingArguments(
+            output_dir=MODEL_OUTPUT_DIR,
+            per_device_train_batch_size=batch_size,
+            per_device_eval_batch_size=batch_size,
+            gradient_accumulation_steps=4,
+            learning_rate=learning_rate,
+            warmup_steps=500,
+            num_train_epochs=epochs,
+            evaluation_strategy="steps",
+            eval_steps=1000,
+            save_strategy="steps",
+            save_steps=1000,
+            logging_steps=100,
+            load_best_model_at_end=True,
+            metric_for_best_model="wer",
+            greater_is_better=False,
+            save_total_limit=3,
+            fp16=torch.cuda.is_available(),
+            dataloader_num_workers=4,
+            report_to="none",
+            seed=SEED,
+            predict_with_generate=True,
+            generation_max_length=200,
+        )
+
+        # Data collator (DataCollatorForSeq2Seq expects the tokenizer, not the processor)
+        data_collator = DataCollatorForSeq2Seq(
+            tokenizer=processor.tokenizer,
+            model=model,
+            padding=True,
+        )
+
+        # Custom compute_metrics function for WER
+        def compute_metrics(eval_pred):
+            predictions, labels = eval_pred
+            return compute_wer_metric(predictions, labels, processor.tokenizer)
+
+        # Trainer
+        trainer = Seq2SeqTrainer(
+            model=model,
+            args=training_args,
+            train_dataset=train_dataset,
+            eval_dataset=val_dataset,
+            data_collator=data_collator,
+            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+            compute_metrics=compute_metrics,
+        )
+
+        # Train
+        if progress:
+            progress(0.6, desc="Starting training...")
+
+        print("\n" + "=" * 70)
+        print("STARTING WHISPER TRAINING")
+        print("=" * 70)
+        print(f"Model: {WHISPER_MODEL_NAME}")
+        print(f"Epochs: {epochs}")
+        print(f"Batch Size: {batch_size}")
+        print(f"Learning Rate: {learning_rate}")
+        print(f"Train Samples: {len(train_dataset):,}")
+        print(f"Val Samples: {len(val_dataset):,}")
+        print("=" * 70)
+
+        trainer.train()
+
+        # Save final model
+        if progress:
+            progress(0.95, desc="Saving model...")
+
+        print(f"\nSaving model to {MODEL_OUTPUT_DIR}...")
+        model.save_pretrained(MODEL_OUTPUT_DIR)
+        processor.save_pretrained(MODEL_OUTPUT_DIR)
+
+        # Save entities for inference
+        entities_output_path = os.path.join(MODEL_OUTPUT_DIR, "caribbean_entities.json")
+        with open(entities_output_path, 'w') as f:
+            json.dump(entities_data, f, indent=2)
+
+        if progress:
+            progress(1.0, desc="Complete!")
+
+        final_metrics = trainer.evaluate()
+        wer = final_metrics.get('eval_wer', 'N/A')
+        loss = final_metrics.get('eval_loss', 'N/A')
+
+        wer_str = f"{wer:.4f}" if isinstance(wer, (int, float)) else str(wer)
+        loss_str = f"{loss:.4f}" if isinstance(loss, (int, float)) else str(loss)
+
+        success_msg = f"""
+## ✅ Whisper Training Complete!
+
+**Model:** {WHISPER_MODEL_NAME}
+**Output Directory:** {MODEL_OUTPUT_DIR}
+
+**Final Metrics:**
+- Word Error Rate (WER): {wer_str}
+- Validation Loss: {loss_str}
+
+**Files Saved:**
+- Model weights: `{MODEL_OUTPUT_DIR}`
+- Processor: `{MODEL_OUTPUT_DIR}`
+- Entities: `{entities_output_path}`
+
+The model is now ready for inference!
+"""
+
+        return success_msg, final_metrics
+
+    except Exception as e:
+        import traceback
+        error_msg = f"❌ Error during Whisper training: {str(e)}\n\n{traceback.format_exc()}"
+        print(error_msg)
+        if progress:
+            progress(1.0, desc="Error!")
+        return error_msg, None
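
Once `trainer.train()` finishes and the directory is saved, it can be reloaded with the standard HuggingFace API. A minimal inference sketch, assuming the `OWSMWithEntityLoss` wrapper saves standard Whisper weights; `model/` stands in for `MODEL_OUTPUT_DIR`, and the zero waveform is a placeholder for real 16 kHz audio:

    import numpy as np
    import torch
    from transformers import WhisperProcessor, WhisperForConditionalGeneration

    processor = WhisperProcessor.from_pretrained("model/")
    model = WhisperForConditionalGeneration.from_pretrained("model/").eval()

    waveform = np.zeros(16000, dtype=np.float32)  # stand-in; use a real 16 kHz mono array
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        ids = model.generate(inputs.input_features, max_length=200)
    print(processor.batch_decode(ids, skip_special_tokens=True)[0])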
ui/interface.py CHANGED
@@ -7,7 +7,8 @@ from datetime import datetime
 # Import modules
 from utils.status import get_status_display, get_data_loading_status
 from utils.entities import extract_entities_progress
-from training.trainer import run_training_progress
+from training.espnet_trainer import run_espnet_training_progress
+from training.whisper_trainer import run_whisper_training_progress
 from models.inference import transcribe_audio, run_inference_owsm
 from models.loader import get_available_models
 from data.loader import load_data_from_hf_dataset
@@ -168,30 +169,69 @@ def create_interface():
             outputs=[extract_output, extract_json]
         )

-        # Tab 4: Training
+        # Tab 4: Training (with sub-tabs for ESPnet and Whisper)
         with gr.Tab("🏋️ Training"):
-            gr.Markdown("### Fine-tune OWSM v3.1 Model")
+            gr.Markdown("### Model Training")
             gr.Markdown("""
-            Fine-tune OWSM v3.1 on Caribbean Voices dataset with entity-weighted loss.
-            **Note:** Full training requires ESPnet recipes. See documentation for details.
+            Choose your training framework:
+            - **ESPnet Training**: For ESPnet OWSM models (requires ESPnet recipes)
+            - **Whisper Training**: For Whisper models (full HuggingFace integration)
             """)

-            with gr.Row():
-                with gr.Column():
-                    train_epochs = gr.Slider(1, 10, value=3, step=1, label="Epochs")
-                    train_batch_size = gr.Slider(1, 32, value=4, step=1, label="Batch Size")
-                    train_lr = gr.Slider(1e-6, 1e-3, value=3e-5, step=1e-6, label="Learning Rate")
-                    train_btn = gr.Button("Start Training", variant="primary")
-
-                with gr.Column():
-                    train_output = gr.Markdown()
-                    train_metrics = gr.JSON(label="Training Metrics")
-
-            train_btn.click(
-                fn=run_training_progress,
-                inputs=[train_epochs, train_batch_size, train_lr],
-                outputs=[train_output, train_metrics]
-            )
+            with gr.Tabs() as training_tabs:
+                # ESPnet Training Tab
+                with gr.Tab("🔧 ESPnet Training"):
+                    gr.Markdown("### ESPnet OWSM Model Training")
+                    gr.Markdown("""
+                    **ESPnet Training** - Uses ESPnet's native framework.
+
+                    This loads ESPnet models and prepares them for training with ESPnet recipes.
+                    Full fine-tuning requires ESPnet training recipes.
+                    """)
+
+                    with gr.Row():
+                        with gr.Column():
+                            espnet_train_epochs = gr.Slider(1, 10, value=3, step=1, label="Epochs (for ESPnet recipes)")
+                            espnet_train_batch_size = gr.Slider(1, 32, value=4, step=1, label="Batch Size (for ESPnet recipes)")
+                            espnet_train_lr = gr.Slider(1e-6, 1e-3, value=3e-5, step=1e-6, label="Learning Rate (for ESPnet recipes)")
+                            espnet_train_btn = gr.Button("Load ESPnet Model", variant="primary")
+
+                        with gr.Column():
+                            espnet_train_output = gr.Markdown()
+                            espnet_train_metrics = gr.JSON(label="Model Info")
+
+                    espnet_train_btn.click(
+                        fn=run_espnet_training_progress,
+                        inputs=[espnet_train_epochs, espnet_train_batch_size, espnet_train_lr],
+                        outputs=[espnet_train_output, espnet_train_metrics]
+                    )
+
+                # Whisper Training Tab
+                with gr.Tab("🎤 Whisper Training"):
+                    gr.Markdown("### Whisper Model Training")
+                    gr.Markdown("""
+                    **Whisper Training** - Full HuggingFace transformers integration.
+
+                    Fine-tune Whisper models with entity-weighted loss using HuggingFace's training framework.
+                    Includes full support for HuggingFace features like early stopping, WER metrics, etc.
+                    """)
+
+                    with gr.Row():
+                        with gr.Column():
+                            whisper_train_epochs = gr.Slider(1, 10, value=3, step=1, label="Epochs")
+                            whisper_train_batch_size = gr.Slider(1, 32, value=4, step=1, label="Batch Size")
+                            whisper_train_lr = gr.Slider(1e-6, 1e-3, value=3e-5, step=1e-6, label="Learning Rate")
+                            whisper_train_btn = gr.Button("Start Whisper Training", variant="primary")
+
+                        with gr.Column():
+                            whisper_train_output = gr.Markdown()
+                            whisper_train_metrics = gr.JSON(label="Training Metrics")
+
+                    whisper_train_btn.click(
+                        fn=run_whisper_training_progress,
+                        inputs=[whisper_train_epochs, whisper_train_batch_size, whisper_train_lr],
+                        outputs=[whisper_train_output, whisper_train_metrics]
+                    )

         # Tab 5: Inference
         with gr.Tab("🚀 Inference"):
@@ -261,15 +301,18 @@ def create_interface():

     ### Workflow
     1. **Extract Entities**: Run entity extraction on training data
-    2. **Train Model**: Fine-tune OWSM (requires ESPnet recipes)
+    2. **Train Model**:
+       - **ESPnet Training**: Load ESPnet models (requires ESPnet recipes for fine-tuning)
+       - **Whisper Training**: Full HuggingFace fine-tuning with entity-weighted loss
     3. **Run Inference**: Generate test set transcriptions
     4. **Download Results**: Get submission CSV file

     ### Technical Details
-    - Framework: ESPnet + PyTorch
-    - Model: OWSM v3.1 E-Branchformer
-    - Entity Extraction: Frequency + capitalization analysis
-    - Training: Entity-weighted cross-entropy loss
+    - **ESPnet Framework**: ESPnet + PyTorch for ESPnet OWSM models
+    - **Whisper Framework**: HuggingFace transformers for Whisper models
+    - **Model**: OWSM v3.1 E-Branchformer (ESPnet) or Whisper (HuggingFace)
+    - **Entity Extraction**: Frequency + capitalization analysis
+    - **Training**: Entity-weighted cross-entropy loss

     ### Documentation
     See `ESPNET_OWSM_SETUP.md` and `IMPLEMENTATION_SUMMARY.md` for details.
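
The entity-weighted cross-entropy loss referenced above is implemented in `OWSMWithEntityLoss`, which is not part of this commit. Conceptually it upweights the per-token loss wherever the label token belongs to a high-value entity; a rough sketch under that assumption, not the actual implementation:

    import torch
    import torch.nn.functional as F

    def entity_weighted_ce(logits, labels, entity_mask, entity_weight=3.0):
        # logits: (batch, seq, vocab); labels: (batch, seq) with -100 at padding positions
        ce = F.cross_entropy(logits.transpose(1, 2), labels, ignore_index=-100, reduction="none")
        weights = 1.0 + (entity_weight - 1.0) * entity_mask.float()  # 3.0 on entity tokens, 1.0 elsewhere
        valid = (labels != -100).float()
        return (ce * weights * valid).sum() / valid.sum().clamp(min=1.0)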