shaun3141 committed on
Commit
ef24863
·
1 Parent(s): aceac12

Fix entity extraction column name mismatch

Browse files

The HF dataset uses lowercase column names ('transcription', 'id') while the local CSV uses
capitalized names ('Transcription', 'ID'). Updated extract_entities.py to support both naming
conventions by checking which column exists before accessing it.

This fixes the "KeyError: 'Transcription'" raised in the Extract Entities tab.

Files changed (1) hide show
  1. extract_entities.py +5 -2
extract_entities.py CHANGED
@@ -31,7 +31,7 @@ def extract_entities_from_transcripts(train_df: pd.DataFrame,
31
  4. Proximity to known Caribbean keywords (optional filter)
32
 
33
  Args:
34
- train_df: DataFrame with 'Transcription' column
35
  min_frequency: Minimum occurrences for single-word entities
36
  min_frequency_multiword: Minimum occurrences for multi-word entities
37
  capitalization_threshold: Minimum ratio of capitalized occurrences (0-1)
@@ -84,7 +84,10 @@ def extract_entities_from_transcripts(train_df: pd.DataFrame,
84
  if verbose:
85
  print("\n[1/3] Analyzing single words and multi-word phrases...")
86
 
87
- for transcription in train_df['Transcription']:
 
 
 
88
  if pd.isna(transcription):
89
  continue
90
 
 
31
  4. Proximity to known Caribbean keywords (optional filter)
32
 
33
  Args:
34
+ train_df: DataFrame with a 'transcription' (HF dataset) or 'Transcription' (local CSV) column
35
  min_frequency: Minimum occurrences for single-word entities
36
  min_frequency_multiword: Minimum occurrences for multi-word entities
37
  capitalization_threshold: Minimum ratio of capitalized occurrences (0-1)
 
84
  if verbose:
85
  print("\n[1/3] Analyzing single words and multi-word phrases...")
86
 
87
+ # Support both 'Transcription' (CSV) and 'transcription' (HF dataset)
88
+ transcription_col = 'transcription' if 'transcription' in train_df.columns else 'Transcription'
89
+
90
+ for transcription in train_df[transcription_col]:
91
  if pd.isna(transcription):
92
  continue
93