Fix entity extraction column name mismatch
Browse files
The HF dataset uses lowercase column names ('transcription', 'id') while the local CSV uses
capitalized names ('Transcription', 'ID'). Updated extract_entities.py to support both naming
conventions by checking which column exists before accessing it.
This fixes the KeyError: 'Transcription' error in the Extract Entities tab.
- extract_entities.py +5 -2
extract_entities.py
CHANGED
|
@@ -31,7 +31,7 @@ def extract_entities_from_transcripts(train_df: pd.DataFrame,
|
|
| 31 |
4. Proximity to known Caribbean keywords (optional filter)
|
| 32 |
|
| 33 |
Args:
|
| 34 |
-
train_df: DataFrame with 'Transcription' column
|
| 35 |
min_frequency: Minimum occurrences for single-word entities
|
| 36 |
min_frequency_multiword: Minimum occurrences for multi-word entities
|
| 37 |
capitalization_threshold: Minimum ratio of capitalized occurrences (0-1)
|
|
@@ -84,7 +84,10 @@ def extract_entities_from_transcripts(train_df: pd.DataFrame,
|
|
| 84 |
if verbose:
|
| 85 |
print("\n[1/3] Analyzing single words and multi-word phrases...")
|
| 86 |
|
| 87 |
-
for transcription in train_df['Transcription']:
|
|
|
|
|
|
|
|
|
|
| 88 |
if pd.isna(transcription):
|
| 89 |
continue
|
| 90 |
|
|
|
|
| 31 |
4. Proximity to known Caribbean keywords (optional filter)
|
| 32 |
|
| 33 |
Args:
|
| 34 |
+
train_df: DataFrame with 'transcription' column (lowercase)
|
| 35 |
min_frequency: Minimum occurrences for single-word entities
|
| 36 |
min_frequency_multiword: Minimum occurrences for multi-word entities
|
| 37 |
capitalization_threshold: Minimum ratio of capitalized occurrences (0-1)
|
|
|
|
| 84 |
if verbose:
|
| 85 |
print("\n[1/3] Analyzing single words and multi-word phrases...")
|
| 86 |
|
| 87 |
+
# Support both 'Transcription' (CSV) and 'transcription' (HF dataset)
|
| 88 |
+
transcription_col = 'transcription' if 'transcription' in train_df.columns else 'Transcription'
|
| 89 |
+
|
| 90 |
+
for transcription in train_df[transcription_col]:
|
| 91 |
if pd.isna(transcription):
|
| 92 |
continue
|
| 93 |
|