"""Entity extraction utilities.""" import os import json import pandas as pd from extract_entities import extract_entities_from_transcripts from data.manager import ENTITIES_PATH from data.loader import get_train_dataframe def extract_entities_progress(progress=None): """Extract Caribbean entities from training data with progress tracking""" try: if progress: progress(0, desc="Starting entity extraction...") if progress: progress(0.2, desc="Loading training data from dataset...") try: train_df = get_train_dataframe() except ValueError as e: return f"❌ {str(e)}", "{}" if progress: progress(0.4, desc=f"Analyzing {len(train_df):,} transcripts...") # Run extraction entities = extract_entities_from_transcripts( train_df, min_frequency=50, min_frequency_multiword=20, capitalization_threshold=0.7, verbose=False # Suppress prints in Gradio app ) if progress: progress(0.9, desc="Saving entities...") # Save to JSON entities_list = sorted(list(entities)) single_word = sorted([e for e in entities if ' ' not in e]) multi_word = sorted([e for e in entities if ' ' in e]) output_data = { 'entities': entities_list, 'single_word_entities': single_word, 'multi_word_entities': multi_word, 'count': len(entities_list), 'count_single_word': len(single_word), 'count_multi_word': len(multi_word), 'extraction_params': { 'min_frequency': 50, 'min_frequency_multiword': 20, 'capitalization_threshold': 0.7 } } with open(ENTITIES_PATH, 'w') as f: json.dump(output_data, f, indent=2) if progress: progress(1.0, desc="Complete!") top_single = single_word[:15] top_multi = multi_word[:15] summary = f""" ## ✅ Entity Extraction Complete **Total Entities:** {len(entities_list)} ({len(single_word)} single-word + {len(multi_word)} multi-word) **Top 15 Single-Word Entities:** {', '.join(top_single) if top_single else 'None'} **Top 15 Multi-Word Entities:** {', '.join(top_multi) if top_multi else 'None'} **Saved to:** `{ENTITIES_PATH}` """ return summary, json.dumps(output_data, indent=2) except Exception as e: import traceback error_msg = f"❌ Error: {str(e)}\n\n{traceback.format_exc()}" return error_msg, "{}"