"""
Extract high-value Caribbean entities from training transcripts.

This builds a gazetteer purely from the competition dataset (no external data).
Supports both single-word and multi-word entity extraction.
"""

import re
from typing import Dict, List, Sequence, Set, Tuple

import pandas as pd


def extract_ngrams(words: List[str], n: int) -> List[Tuple[str, ...]]:
    """Extract n-grams from a list of words.
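
    Example:
        >>> extract_ngrams(["Port", "of", "Spain"], 2)
        [('Port', 'of'), ('of', 'Spain')]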
    """
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]


def is_phrase_capitalized(phrase_words: Sequence[str]) -> bool:
    """Check whether every word in a phrase starts with an uppercase letter.
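
    Example:
        >>> is_phrase_capitalized(("Port", "Spain"))
        True
        >>> is_phrase_capitalized(("port", "of"))
        False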
    """
    return all(word[0].isupper() if word else False for word in phrase_words)


def extract_entities_from_transcripts(train_df: pd.DataFrame,
                                      min_frequency: int = 50,
                                      min_frequency_multiword: int = 20,
                                      capitalization_threshold: float = 0.7,
                                      verbose: bool = True) -> Set[str]:
    """
    Extract high-value entities from training transcripts based on:
    1. Frequency (appears at least min_frequency times)
    2. Capitalization pattern (capitalized/ALL-CAPS most of the time)
    3. Multi-word phrase detection (bigrams and trigrams)
    4. Proximity to known Caribbean keywords (optional signal; collected
       but not applied as a filter)

    Args:
        train_df: DataFrame with a 'transcription' (or 'Transcription') column
        min_frequency: Minimum occurrences for single-word entities
        min_frequency_multiword: Minimum occurrences for multi-word entities
        capitalization_threshold: Minimum ratio of capitalized occurrences (0-1)
        verbose: Print progress and statistics

    Returns:
        Set of lowercase entity strings (single words and multi-word phrases).
    """
    # Keywords whose neighborhood suggests a Caribbean context; used only for
    # the optional 'near_caribbean' signal below.
    caribbean_keywords = {
        "caribbean", "bbc", "report", "london", "port", "prime", "minister",
        "trinidad", "tobago", "jamaica", "guyana", "haiti", "barbados",
        "antigua", "dominica", "grenada", "montserrat", "lucia", "kitts",
        "nevis", "suriname", "caricom", "west", "indies",
    }

    # Words that pass the frequency/capitalization filters but are not useful
    # gazetteer entries.
    EXCLUDED_WORDS = {
        # Single letters that tokenize as words
        "i", "u",
        # Honorifics
        "mr", "mrs", "ms", "dr", "sir", "madam",
        # Days of the week
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
        # Months
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
        # Generic political/institutional terms
        "minister", "assembly", "council", "department", "secretariat",
        "secretary", "parliament", "congress", "labour", "republic", "states",
        "attorney", "association",
        # Demonyms and compass/region adjectives
        "british", "american", "cuban", "haitian", "guyanese", "jamaican",
        "trinidadian", "dominican", "african", "european", "indian", "dutch",
        "french", "eastern", "latin", "south", "west", "north",
    }

    # Frequent capitalized phrases that are not named entities.
    EXCLUDED_PHRASE_PATTERNS = {
        # Relative time expressions
        "last week", "this week", "next week", "last year", "this year", "next year",
        "last month", "this month", "next month",
        # Generic titles
        "prime minister", "foreign minister", "finance minister",
        # Determiner + common noun
        "the report", "the government", "the country",
    }

    # Per-token and per-phrase statistics, keyed by lowercased form.
    word_stats: Dict[str, Dict[str, int]] = {}
    phrase_stats: Dict[str, Dict[str, int]] = {}

    if verbose:
        print("\n[1/3] Analyzing single words and multi-word phrases...")

    # Tolerate either capitalization of the column name.
    transcription_col = 'transcription' if 'transcription' in train_df.columns else 'Transcription'

    for transcription in train_df[transcription_col]:
        if pd.isna(transcription):
            continue

        # Tokenize into purely alphabetic words, preserving original casing.
        words = re.findall(r'\b[A-Za-z]+\b', str(transcription))

        for word in words:
            word_lower = word.lower()

            if word_lower not in word_stats:
                word_stats[word_lower] = {
                    'total': 0,
                    'capitalized': 0,
                    'allcaps': 0,
                    'near_caribbean': 0,
                }

            word_stats[word_lower]['total'] += 1

            # Track casing: ALL-CAPS acronyms (length > 1) vs. Title-case words.
            if word.isupper() and len(word) > 1:
                word_stats[word_lower]['allcaps'] += 1
            elif word[0].isupper():
                word_stats[word_lower]['capitalized'] += 1
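
        # Optional proximity signal from point 4 of the docstring. A minimal
        # sketch: the +/-5-word window size is an assumption, not from the
        # original spec, and the counts are collected but not yet used to
        # filter entities.
        lowered = [w.lower() for w in words]
        keyword_positions = [i for i, w in enumerate(lowered) if w in caribbean_keywords]
        for i, w_lower in enumerate(lowered):
            if any(abs(i - k) <= 5 for k in keyword_positions):
                word_stats[w_lower]['near_caribbean'] += 1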

        # Count bigrams and trigrams, tracking how often each occurrence is
        # fully capitalized.
        for n in (2, 3):
            for phrase in extract_ngrams(words, n):
                phrase_lower = ' '.join(w.lower() for w in phrase)

                if phrase_lower not in phrase_stats:
                    phrase_stats[phrase_lower] = {
                        'total': 0,
                        'capitalized': 0,
                    }

                phrase_stats[phrase_lower]['total'] += 1

                if is_phrase_capitalized(phrase):
                    phrase_stats[phrase_lower]['capitalized'] += 1

    if verbose:
        print(f" - Analyzed {len(word_stats):,} unique words")
        print(f" - Analyzed {len(phrase_stats):,} unique phrases")

    if verbose:
        print("\n[2/3] Filtering single-word entities...")

    single_word_entities = set()

    for word_lower, stats in word_stats.items():
        # Skip single characters.
        if len(word_lower) < 2:
            continue

        # Skip common words that masquerade as entities.
        if word_lower in EXCLUDED_WORDS:
            continue

        # Require a minimum frequency.
        if stats['total'] < min_frequency:
            continue

        # Require the word to be capitalized (or ALL-CAPS) most of the time.
        capitalized_ratio = (stats['capitalized'] + stats['allcaps']) / stats['total']
        if capitalized_ratio < capitalization_threshold:
            continue

        single_word_entities.add(word_lower)

    if verbose:
        print(f" - Found {len(single_word_entities)} single-word entities")

    if verbose:
        print("\n[3/3] Filtering multi-word entities...")

    multiword_entities = set()

    for phrase_lower, stats in phrase_stats.items():
        # Skip known non-entity phrases.
        if phrase_lower in EXCLUDED_PHRASE_PATTERNS:
            continue

        # Require a minimum frequency (lower than for single words, since
        # exact multi-word repeats are rarer).
        if stats['total'] < min_frequency_multiword:
            continue

        # Require the whole phrase to be capitalized most of the time.
        capitalized_ratio = stats['capitalized'] / stats['total']
        if capitalized_ratio < capitalization_threshold:
            continue

        # Keep the phrase only if it contains an already-accepted single-word
        # entity or matches a common place-name/title pattern. Note that the
        # exact bigram "prime minister" is excluded above, but longer phrases
        # containing it are kept here.
        words_in_phrase = phrase_lower.split()

        has_known_entity = any(w in single_word_entities for w in words_in_phrase)
        is_common_pattern = any(pattern in phrase_lower for pattern in
                                ['port of', 'puerto', 'st ', ' st', 'prime minister', 'saint'])

        if has_known_entity or is_common_pattern:
            multiword_entities.add(phrase_lower)

    if verbose:
        print(f" - Found {len(multiword_entities)} multi-word entities")

    all_entities = single_word_entities | multiword_entities

    if verbose:
        print(f"\n✓ Total entities: {len(all_entities)} ({len(single_word_entities)} single + {len(multiword_entities)} multi-word)")

        # Show the most frequent accepted entities as a sanity check.
        if single_word_entities:
            entity_freqs = [(w, word_stats[w]['total']) for w in single_word_entities]
            entity_freqs.sort(key=lambda x: x[1], reverse=True)

            print("\nTop 20 single-word entities:")
            for word, freq in entity_freqs[:20]:
                stats = word_stats[word]
                cap_ratio = (stats['capitalized'] + stats['allcaps']) / stats['total']
                print(f"  {word:25s} | freq={freq:5d} | cap_ratio={cap_ratio:.2f}")

        if multiword_entities:
            phrase_freqs = [(p, phrase_stats[p]['total']) for p in multiword_entities]
            phrase_freqs.sort(key=lambda x: x[1], reverse=True)

            print("\nTop 20 multi-word entities:")
            for phrase, freq in phrase_freqs[:20]:
                stats = phrase_stats[phrase]
                cap_ratio = stats['capitalized'] / stats['total']
                print(f"  {phrase:25s} | freq={freq:5d} | cap_ratio={cap_ratio:.2f}")

    return all_entities
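

if __name__ == "__main__":
    # Minimal usage sketch. The CSV path below is a hypothetical placeholder;
    # point it at the competition's training file, which is expected to have
    # a 'transcription' column.
    train_df = pd.read_csv("data/train.csv")  # hypothetical path
    entities = extract_entities_from_transcripts(train_df)
    print(f"\nExtracted {len(entities)} entities")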