mj-new committed
Commit 37d493c
Parent(s): ceb2b55

Updated leaderboard code and requirements

Files changed:
- app.py +167 -52
- constants.py +2 -1
- requirements.txt +1 -1
- utils.py +87 -15
app.py
CHANGED

@@ -2,10 +2,12 @@ import os
import streamlit as st
import pandas as pd
from constants import BIGOS_INFO, PELCRA_INFO, ANALYSIS_INFO, ABOUT_INFO, INSPECTION_INFO, COMPARISON_INFO
- from utils import read_latest_results, basic_stats_per_dimension, retrieve_asr_systems_meta_from_the_catalog, box_plot_per_dimension, box_plot_per_dimension_with_colors, get_total_audio_duration, check_impact_of_normalization, calculate_wer_per_meta_category, calculate_wer_per_audio_feature
+ from utils import read_latest_results, basic_stats_per_dimension, retrieve_asr_systems_meta_from_the_catalog, box_plot_per_dimension,box_plot_per_dimension_subsets, box_plot_per_dimension_with_colors, get_total_audio_duration, check_impact_of_normalization, calculate_wer_per_meta_category, calculate_wer_per_audio_feature
from app_utils import calculate_height_to_display, filter_dataframe
import matplotlib.pyplot as plt
import numpy as np
+ import statsmodels.api as sm
+ import seaborn as sns

hf_token = os.getenv('HF_TOKEN')
if hf_token is None:

@@ -185,7 +187,7 @@ def create_radar_plot(df, enable_labels, systems, metric, norm_type, ref_type='o
st.pyplot(fig)

with about:
- st.title("
+ st.title("AMU Polish ASR Leaderboard")
st.markdown(ABOUT_INFO, unsafe_allow_html=True)

# Table - evaluated systems # TODO - change to concatenated table

@@ -196,6 +198,13 @@ with about:
#print("ASR systems available in the eval results for dataset {}: ".format(dataset), evaluated_systems_list )

df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
+ # drop columns "Included in BIGOS benchmark"
+ df_evaluated_systems = df_evaluated_systems.drop(columns=["Included in BIGOS benchmark"])
+ # drop empty rows
+ df_evaluated_systems = df_evaluated_systems.dropna(how='all')
+ # drop empty columns
+ df_evaluated_systems = df_evaluated_systems.dropna(axis=1, how='all')
+
codename_to_shortname_mapping = dict(zip(df_evaluated_systems["Codename"],df_evaluated_systems["Shortname"]))
#print(codename_to_shortname_mapping)

@@ -203,14 +212,32 @@ with about:

df_evaluated_systems_types_and_count = df_evaluated_systems["Type"].value_counts().reset_index()
df_evaluated_systems_types_and_count.columns = ["Type", "Count"]
- st.
+ st.subheader("Evaluated systems:")

st.dataframe(df_evaluated_systems_types_and_count, hide_index=True, use_container_width=False)

- st.header("Detalied info about evaluated ASR systems")
-
#TODO - add info who created the system (company, institution, team, etc.)
-
+ # Split into separate tables for free and commercial systems
+ free_systems = df_evaluated_systems[df_evaluated_systems['Type'] == 'free']
+ commercial_systems = df_evaluated_systems[df_evaluated_systems['Type'] == 'commercial']
+
+ st.subheader("Free systems:")
+ # drop empty columns
+ free_systems = free_systems.dropna(axis=1, how='all')
+ # drop empty rows
+ free_systems = free_systems.dropna(how='all')
+
+ # do not display index
+ st.dataframe(free_systems, hide_index=True, height = h_df_systems, use_container_width=True)
+
+ st.subheader("Commercial systems:")
+ # drop empty columns
+ commercial_systems = commercial_systems.dropna(axis=1, how='all')
+ # do not display index
+ # drop empty rows
+ commercial_systems = commercial_systems.dropna(how='all')
+
+ st.dataframe(commercial_systems, hide_index=True, height = h_df_systems, use_container_width=True)

# Table - evaluation datasets
# Table - evaluation metrics

@@ -223,6 +250,8 @@ with about:
# List - TODOs

with lead_bigos:
+ st.title("BIGOS Leaderboard")
+ st.markdown(BIGOS_INFO, unsafe_allow_html=True)

# configuration for tab
dataset = "amu-cai/pl-asr-bigos-v2-secret"

@@ -257,17 +286,17 @@ with lead_bigos:
# save sample to tsv
df_per_dataset_with_asr_systems_meta.sample(5).to_csv("sample.tsv", sep="\t", index=False)

+ ########### EVALUATION PARAMETERS PRESENTATION ################
+ st.title("ASR leaderboard for dataset: {} {}".format(dataset_short_name, dataset_version))
+
# MOST IMPORTANT RESULTS
analysis_dim = "system"
metric = "WER"
- st.subheader("
- fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]","System", "Type")
+ st.subheader("Leaderboard - Median {} per ASR {} across all subsets of {} dataset".format(metric, analysis_dim, dataset_short_name))
+ fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim, metric + "[%]","System", "Type")
st.pyplot(fig, clear_figure=True, use_container_width=True)

-
- ########### EVALUATION PARAMETERS PRESENTATION ################
- st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
- st.markdown(BIGOS_INFO, unsafe_allow_html=True)
+ st.header("Benchmark details")
st.markdown("**Evaluation date:** {}".format(eval_date))
st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))

@@ -301,7 +330,6 @@ with lead_bigos:
h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )

-
##################### PER SUBSET ANALYSIS #########################
analysis_dim = "subset"
metric = "WER"

@@ -311,7 +339,7 @@ with lead_bigos:
st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )

st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
- fig =
+ fig = box_plot_per_dimension_subsets(df_per_dataset, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim +' of dataset ' + dataset_short_name , metric + " (%)", "system")
st.pyplot(fig, clear_figure=True, use_container_width=True)

### IMPACT OF NORMALIZATION ON ERROR RATES #####

@@ -395,16 +423,14 @@ with lead_pelcra:

df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")

-
+ # MOST IMPORTANT RESULTS
analysis_dim = "system"
metric = "WER"
- st.subheader("
+ st.subheader("Leaderboard - Median {} per ASR {} across all subsets of {} dataset".format(metric, analysis_dim, dataset_short_name))
fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]","System", "Type")
st.pyplot(fig, clear_figure=True, use_container_width=True)
-
-
- st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
- st.markdown(BIGOS_INFO, unsafe_allow_html=True)
+
+ st.header("Benchmark details")
st.markdown("**Evaluation date:** {}".format(eval_date))
st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))

@@ -447,7 +473,7 @@ with lead_pelcra:
st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )

st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
- fig =
+ fig = box_plot_per_dimension_subsets(df_per_dataset, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim +' of dataset ' + dataset_short_name , metric + " (%)", "system")
st.pyplot(fig, clear_figure=True, use_container_width=True)

### IMPACT OF NORMALIZATION ON ERROR RATES #####

@@ -502,6 +528,13 @@ with analysis:

dataset = st.selectbox("Select Dataset", datasets, index=datasets.index('amu-cai/pl-asr-bigos-v2-secret'), key="select_dataset_scenarios")

+ if dataset == "amu-cai/pl-asr-bigos-v2-secret":
+ dataset_short_name = "BIGOS"
+ elif dataset == "pelcra/pl-asr-pelcra-for-bigos-secret":
+ dataset_short_name = "PELCRA"
+ else:
+ dataset_short_name = "UNKNOWN"
+
# read the latest results for the selected dataset
print("Reading the latest results for dataset: ", dataset)
df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)

@@ -547,7 +580,7 @@ with analysis:
st.subheader("Best and worst systems for dataset {}".format(dataset))
df_best_worse_systems = pd.DataFrame(data, columns=header)
# do not display index
- st.dataframe(df_best_worse_systems)
+ st.dataframe(df_best_worse_systems, hide_index=True)

st.subheader("Comparison of average WER for best systems")
df_per_dataset_best_systems = df_per_dataset_with_asr_systems_meta[df_per_dataset_with_asr_systems_meta['system'].isin([free_system_with_best_wer, commercial_system_with_best_wer])]

@@ -602,21 +635,74 @@ with analysis:
# Y is thw average WER
# make each point a different color
# provide legend with system names
- fig, ax = plt.subplots()
+ fig, ax = plt.subplots(figsize=(10, 7))
+
+ # Define larger jitter for close points
+ jitter_x = 5
+ jitter_y = 0.2
+
+ # Alternate marker shapes to distinguish overlapping points
+ marker_styles = ['o', 's', 'D', '^', 'v', '<', '>'] # Circle, square, diamond, and other shapes
+ marker_dict = {system: marker_styles[i % len(marker_styles)] for i, system in enumerate(free_systems_wer['system'].unique())}
+
for system in free_systems_wer['system'].unique():
subset = free_systems_wer[free_systems_wer['system'] == system]
-
-
+ marker_style = marker_dict[system]
+
+ # Scatter plot with distinct marker shapes for each system
+ ax.scatter(
+ subset['Parameters [M]'] + jitter_x * (np.random.rand(len(subset)) - 0.5), # Apply jitter to x for overlap
+ subset['WER'] + jitter_y * (np.random.rand(len(subset)) - 0.5), # Apply jitter to y for overlap
+ label=system, s=100, alpha=0.7, edgecolor='black', marker=marker_style
+ )
+
+ # Add text annotations with dynamic positioning to avoid overlap with y-axis
for i, point in subset.iterrows():
-
-
-
-
-
-
-
+ # Adjust position to avoid overlap with y-axis
+ x_offset = 10 if point['Parameters [M]'] < 50 else -10 if i % 2 == 1 else 10 # Push right if close to y-axis
+ y_offset = -0.5 if i % 2 == 0 else 0.5 # Alternate vertical offset
+
+ ax.annotate(
+ point['system'],
+ (point['Parameters [M]'], point['WER']),
+ textcoords="offset points",
+ xytext=(x_offset, y_offset),
+ ha='right' if x_offset < 0 else 'left',
+ fontsize=10,
+ bbox=dict(boxstyle="round,pad=0.3", edgecolor='white', facecolor='white', alpha=0.7)
+ )
+
+ # Set axis labels and title
+ ax.set_xlabel('Model Size [M Parameters]', fontsize=12)
+ ax.set_ylabel('WER (%)', fontsize=12)
+ ax.set_title(f'WER vs. Model Size for Dataset {dataset_short_name}', fontsize=14, pad=20)
+
+ # Adjust legend settings to fit outside the main plot area
+ ax.legend(
+ title='System', bbox_to_anchor=(0.8, 1), loc='upper left',
+ fontsize=8, title_fontsize=9, frameon=True, shadow=False, facecolor='white')
+ #)
+
+ # Add grid lines and minor ticks for better readability
+ ax.grid(True, linestyle='--', alpha=0.5)
+ ax.minorticks_on()
+ ax.tick_params(which='both', direction='in', top=True, right=True)
+
+
+ # increase granularity of y-axis to 20 points per whole range
+ # Set y-axis limits: lower bound at 0, upper bound to next highest multiple of 5
+ y_min = 0
+ y_max = ax.get_ylim()[1] # Get the current maximum y value
+ y_max_rounded = np.ceil(y_max / 5) * 5 # Round y_max up to the next highest multiple of 5
+ ax.set_ylim(y_min, y_max_rounded)
+
+ # Improve layout spacing
+ plt.tight_layout()
+
+ # Display the plot
st.pyplot(fig)

+
##################################################################################################################################################
# WER per audio duration

@@ -653,11 +739,7 @@ with analysis:
# print dataframe in streamlit
st.dataframe(df_per_sample_wer_audio_pivot)

- #
- # each system should have a different color
- # the size of the point should be proportional to the number of samples in the bucket
- # the x axis should be the audio duration bucket
- # the y axis should be the average WER
+ # create scatter plot with WER in function of audio duration
fig, ax = plt.subplots()
for system in selected_systems:
subset = df_per_sample_wer_audio[df_per_sample_wer_audio['system'] == system]

@@ -678,7 +760,7 @@ with analysis:
audio_feature_to_analyze = 'speech_rate_words'
audio_feature_unit = ' [words/s]'
metric = 'WER'
- metric_unit = '
+ metric_unit = ' (%)'
no_of_buckets = 10
# calculate average WER per audio duration bucket for the best and worse commercial and free systems
selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer]

@@ -688,24 +770,57 @@ with analysis:
# print dataframe in streamlit
st.dataframe(df_per_sample_wer_feature_pivot)

- #
-
- # the size of the point should be proportional to the number of samples in the bucket
- # the x axis should be the audio duration bucket
- # the y axis should be the average WER
- fig, ax = plt.subplots()
- for system in selected_systems:
- subset = df_per_sample_wer_feature[df_per_sample_wer_feature['system'] == system]
- ax.scatter(subset[audio_feature_to_analyze], subset[metric], label=system, s=subset['number_of_samples']*0.5)
- ax.set_xlabel(audio_feature_to_analyze.replace('_',' ').capitalize() + audio_feature_unit)
- ax.set_ylabel(metric + metric_unit)
- ax.set_title('WER in function of speech rate.'.format(audio_feature_to_analyze))
+ # Set a threshold to remove outliers - here we use the 97th percentile of WER
+ threshold = df_per_sample_wer_feature[metric].quantile(0.97)

- #
-
+ # Remove data points with WER greater than the threshold
+ filtered_df = df_per_sample_wer_feature[df_per_sample_wer_feature[metric] <= threshold]
+
+ # Create figure and axis with larger size
+ fig, ax = plt.subplots(figsize=(10, 7))
+
+ # Scatter plot for each system
+ for system in selected_systems:
+ subset = filtered_df[filtered_df['system'] == system]
+ ax.scatter(subset[audio_feature_to_analyze],
+ subset[metric],
+ label=system,
+ s=subset['number_of_samples'] * 0.5,
+ alpha=0.6) # Set alpha for better visibility of overlapping points
+
+ # Adding a trend line using LOWESS
+ lowess = sm.nonparametric.lowess
+ trend = lowess(subset[metric], subset[audio_feature_to_analyze], frac=0.3) # Adjust frac to control smoothing
+ ax.plot(trend[:, 0], trend[:, 1], label=f'{system} Trend', linestyle='-', linewidth=2)
+
+ # Set axis labels with improved formatting for readability
+ ax.set_xlabel(audio_feature_to_analyze.replace('_', ' ').capitalize() + ' ' + audio_feature_unit )
+ ax.set_ylabel(metric + ' ' + metric_unit )
+
+ # Set an improved title that is more informative
+ ax.set_title('Word Error Rate (WER) vs Speech Rate\nBest Performing Free and Paid Systems', fontsize=14)
+
+ # increase granularity of y-axis to 20 points per whole range
+ # Set y-axis limits: lower bound at 0, upper bound to next highest multiple of 5
+ y_min = 0
+ y_max = ax.get_ylim()[1] # Get the current maximum y value
+ y_max_rounded = np.ceil(y_max / 5) * 5 # Round y_max up to the next highest multiple of 5
+ ax.set_ylim(y_min, y_max_rounded)
+
+ # Add a grid to improve readability and alignment
+ ax.grid(True, linestyle='--', alpha=0.7)
+
+ # Place legend outside the plot area to prevent overlapping with data points
+ ax.legend(title='System', loc='upper right', bbox_to_anchor=(0.95, 1))
+
+ # Add tight layout to improve spacing between elements
+ fig.tight_layout()
+
+ # Display the plot
st.pyplot(fig)


+
################################################################################################################################################
# WER PER GENDER

constants.py
CHANGED

@@ -1,7 +1,8 @@
ABOUT_INFO = "Polish ASR leaderboard by [AMU-CAI team](https://huggingface.co/amu-cai) aims to provide comprehensive overview of performance of ASR/STT systems for Polish. <br>\
The leaderboard currently supports [BIGOS V2](https://huggingface.co/datasets/amu-cai/pl-asr-bigos-v2) and [PELCRA for BIGOS](https://huggingface.co/datasets/pelcra/pl-asr-pelcra-for-bigos) datasets.<br>\
+ If you want to add your system or dataset to the leaderboard, please contact Michał Junczyk ([email protected]) or open a pull request on [GitHub](https://github.com/goodmike31/pl-asr-bigos-tools) <br>\
To learn more please read blog post [here](https://huggingface.co/blog/michaljunczyk/introducing-polish-asr-leaderboard).<br> \
- If you use this work, please
+ If you use this work, please cite it as follows: <br> \
```@misc{amu_cai_pl_asr_leaderboard, \
author = {Michał Junczyk}, \
title = {{AMU Polish ASR Leaderboard}}, \
requirements.txt
CHANGED

@@ -1,2 +1,2 @@
seaborn
-
+ statsmodels
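For context, the statsmodels dependency added above appears to back the LOWESS trend lines introduced in app.py (sm.nonparametric.lowess with frac=0.3). A minimal, self-contained sketch of that call pattern, using purely illustrative data rather than leaderboard results:

import numpy as np
import statsmodels.api as sm

# Illustrative inputs only: x could be a speech rate [words/s], y a per-bucket WER [%]
x = np.linspace(0.5, 5.0, 50)
y = 20 + 3 * np.sin(x) + np.random.rand(50)

# lowess returns an (N, 2) array of (x, smoothed y) pairs sorted by x;
# frac controls the width of the smoothing window (app.py uses frac=0.3)
trend = sm.nonparametric.lowess(y, x, frac=0.3)
print(trend[:5])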
utils.py
CHANGED

@@ -9,22 +9,10 @@ from datasets import Dataset
from huggingface_hub import hf_hub_download
import matplotlib.patches as mpatches
import matplotlib as mpl
+ from constants import asr_systems_colors_mapping
+ from matplotlib.lines import Line2D


- asr_systems_colors_mapping = {
- 'azure': '#1f77b4', # Blue
- 'google': '#2ca02c', # Green
- 'wav2vec2': '#d62728', # Red
- 'nemo': '#9467bd', # Purple
- 'assemblyai': '#8c564b', # Brown
- 'mms': '#e377c2', # Pink
- 'google_v2': '#7f7f7f', # Gray
- 'whisper_cloud': '#bcbd22', # Olive
- 'whisper_local': '#ff7f0e', # Orange
-
- # Add or override other systems and their colors
- }
-
def download_tsv_from_google_sheet(sheet_url):
# Modify the Google Sheet URL to export it as TSV
tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')

@@ -181,7 +169,7 @@ def filter_bottom_outliers(df_input, metric, min_threshold):

def box_plot_per_dimension(df_input, metric, dimension, title, xlabel, ylabel):
# Box plot for WER per dataset
- fig, ax = plt.subplots(figsize=(
+ fig, ax = plt.subplots(figsize=(12, 8))

# generate box plot without outliers
sns.boxplot(x=dimension, y=metric, data=df_input, order=df_input.groupby(dimension)[metric].median().sort_values().index, showfliers=False)

@@ -193,6 +181,90 @@ def box_plot_per_dimension(df_input, metric, dimension, title, xlabel, ylabel):
#return figure
return plt

+ def box_plot_per_dimension_subsets(df_input, metric, dimension, title, xlabel, ylabel, category_column, y_limit=100):
+ """
+ Plots a box plot with individual data points colored and marked by a specified category.
+
+ Parameters:
+ - df_input (pd.DataFrame): Input DataFrame containing data to plot.
+ - metric (str): Column name for the metric to plot on the y-axis.
+ - dimension (str): Column name for the dimension (x-axis categories).
+ - title (str): Title of the plot.
+ - xlabel (str): Label for the x-axis.
+ - ylabel (str): Label for the y-axis.
+ - category_column (str): Column name to use for differentiating data points by color and marker.
+ - y_limit (float, optional): Maximum value for the y-axis to limit extreme outliers.
+
+ Returns:
+ - fig: The matplotlib figure object.
+ """
+
+ # Set up the figure and axis with a larger size for readability
+ fig, ax = plt.subplots(figsize=(14, 8))
+
+ # Create a sorted order for the dimension based on the median values of the metric
+ order = df_input.groupby(dimension)[metric].median().sort_values().index
+
+ # Generate box plot without showing extreme outliers
+ boxplot = sns.boxplot(
+ x=dimension, y=metric, data=df_input,
+ order=order, showfliers=False, width=0.6, ax=ax,
+ color="white"
+ )
+
+ # Make the box plots transparent by adjusting the facecolor of each box
+ for patch in boxplot.artists:
+ patch.set_facecolor("white")
+ patch.set_alpha(0.2) # Set transparency
+
+ # Define category-specific colors and marker styles
+ categories = df_input[category_column].unique()
+ markers = ['o', 's', '^', 'D', 'X', 'P', '*'] # Different marker styles
+ colors = sns.color_palette("Set2", len(categories)) # Use a color palette with distinct colors
+ category_style_map = {category: {'color': colors[i % len(colors)], 'marker': markers[i % len(markers)]}
+ for i, category in enumerate(categories)}
+
+ # Overlay individual data points with category-specific colors and markers
+ for category, style in category_style_map.items():
+ # Filter data for each category
+ category_data = df_input[(df_input[category_column] == category) & (df_input[metric] <= y_limit)]
+ sns.stripplot(
+ x=dimension, y=metric, data=category_data,
+ order=order, color=style['color'], marker=style['marker'],
+ size=5, jitter=True, alpha=1, ax=ax
+ )
+
+ # Set title and axis labels
+ ax.set_title(title)
+ ax.set_xlabel(xlabel)
+ ax.set_ylabel(ylabel)
+ ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
+
+ # Add gridlines for easier comparison
+ plt.grid(axis='y', linestyle='--', alpha=0.5)
+
+ # Set y-axis limit to improve readability
+ # Calculate the y-axis maximum as the next multiple of 5 above the data’s max value
+ # Make sure the max value does not contain any extreme outliers. Threhold at 98th percentile
+ max_value = df_input[metric].quantile(0.99)
+
+ y_max = (int(max_value / 5) + 1) * 5
+
+ # Set y-axis ticks with evenly spaced intervals of 5
+ ax.set_yticks(range(0, y_max + 1, 5))
+ ax.set_ylim(0, y_max)
+
+ # Create a custom legend with unique entries for each category
+ legend_handles = [
+ Line2D([0], [0], marker=style['marker'], color='w', markerfacecolor=style['color'], markersize=8, label=category)
+ for category, style in category_style_map.items()
+ ]
+ ax.legend(handles=legend_handles, title=category_column, bbox_to_anchor=(1.05, 1), loc='upper left')
+
+ # Return the updated figure
+ return fig
+
+
def box_plot_per_dimension_with_colors(df_input, metric, dimension, title, xlabel, ylabel, system_col, type_col):
# Create a figure and axis object
fig, ax = plt.subplots(figsize=(12, 8))
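For reference, a minimal usage sketch of the new box_plot_per_dimension_subsets helper, mirroring the per-subset call added in app.py. The DataFrame below is purely illustrative (assumed columns: subset, system, WER), and the script assumes it is run from the Space's repo root so utils.py and its imports resolve:

import pandas as pd
from utils import box_plot_per_dimension_subsets

# Illustrative per-dataset results frame; in the app this comes from read_latest_results()
df_per_dataset = pd.DataFrame({
    "subset": ["read", "read", "spontaneous", "spontaneous"],
    "system": ["whisper_local", "azure", "whisper_local", "azure"],
    "WER": [8.4, 10.1, 21.7, 25.3],
})

# Same argument order as the call in app.py: data, metric, dimension, title, xlabel, ylabel, category_column
fig = box_plot_per_dimension_subsets(
    df_per_dataset, "WER", "subset",
    "WER per subset for dataset BIGOS",
    "subset of dataset BIGOS", "WER (%)", "system")
fig.savefig("wer_per_subset.png")  # or st.pyplot(fig) inside the Streamlit app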