Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
feat: refactor the data loading function
Browse files
app.py
CHANGED
|
@@ -77,41 +77,54 @@ def restart_space():
|
|
| 77 |
from dataclasses import dataclass
|
| 78 |
import pandas as pd
|
| 79 |
from typing import Optional
|
|
|
|
|
|
|
| 80 |
@dataclass
|
| 81 |
class LeaderboardDataStore:
|
| 82 |
raw_data: Optional[list]
|
| 83 |
-
|
| 84 |
original_df_long_doc: Optional[pd.DataFrame]
|
| 85 |
leaderboard_df_qa: Optional[pd.DataFrame]
|
| 86 |
leaderboard_df_long_doc: Optional[pd.DataFrame]
|
| 87 |
reranking_models: Optional[list]
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
def update_metric_qa(
|
| 117 |
metric: str,
|
|
@@ -173,9 +186,9 @@ with demo:
|
|
| 173 |
# select reranking models
|
| 174 |
with gr.Column():
|
| 175 |
selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
|
| 176 |
-
leaderboard_table = get_leaderboard_table(data["AIR-Bench_24.04"].leaderboard_df_qa, types_qa)
|
| 177 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
| 178 |
-
hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].
|
| 179 |
|
| 180 |
set_listeners(
|
| 181 |
"qa",
|
|
@@ -212,11 +225,11 @@ with demo:
|
|
| 212 |
selected_noreranker = get_noreranking_dropdown()
|
| 213 |
lb_df_retriever = data["AIR-Bench_24.04"].leaderboard_df_qa[data["AIR-Bench_24.04"].leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
|
| 214 |
lb_df_retriever = reset_rank(lb_df_retriever)
|
| 215 |
-
lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
|
| 216 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
| 217 |
-
hidden_lb_df_retriever = data["AIR-Bench_24.04"].
|
| 218 |
hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
|
| 219 |
-
hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, types_qa, visible=False)
|
| 220 |
|
| 221 |
set_listeners(
|
| 222 |
"qa",
|
|
@@ -254,11 +267,11 @@ with demo:
|
|
| 254 |
selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
|
| 255 |
with gr.Column(scale=1):
|
| 256 |
search_bar_reranker = gr.Textbox(show_label=False, visible=False)
|
| 257 |
-
lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
|
| 258 |
-
hidden_lb_df_reranker = data["AIR-Bench_24.04"].
|
| 259 |
hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
|
| 260 |
hidden_lb_table_reranker = get_leaderboard_table(
|
| 261 |
-
hidden_lb_df_reranker, types_qa, visible=False
|
| 262 |
)
|
| 263 |
|
| 264 |
set_listeners(
|
|
@@ -316,12 +329,12 @@ with demo:
|
|
| 316 |
selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
|
| 317 |
|
| 318 |
lb_table = get_leaderboard_table(
|
| 319 |
-
data["AIR-Bench_24.04"].leaderboard_df_long_doc, types_long_doc
|
| 320 |
)
|
| 321 |
|
| 322 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
| 323 |
hidden_lb_table_for_search = get_leaderboard_table(
|
| 324 |
-
data["AIR-Bench_24.04"].original_df_long_doc, types_long_doc, visible=False
|
| 325 |
)
|
| 326 |
|
| 327 |
set_listeners(
|
|
@@ -366,9 +379,9 @@ with demo:
|
|
| 366 |
]
|
| 367 |
hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
|
| 368 |
lb_table_retriever_long_doc = get_leaderboard_table(
|
| 369 |
-
lb_df_retriever_long_doc, types_long_doc)
|
| 370 |
hidden_lb_table_retriever_long_doc = get_leaderboard_table(
|
| 371 |
-
hidden_lb_db_retriever_long_doc, types_long_doc, visible=False
|
| 372 |
)
|
| 373 |
|
| 374 |
set_listeners(
|
|
@@ -408,11 +421,11 @@ with demo:
|
|
| 408 |
selected_rerankings_reranker_ldoc = get_reranking_dropdown(reranking_models_reranker_ldoc)
|
| 409 |
with gr.Column(scale=1):
|
| 410 |
search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
|
| 411 |
-
lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
|
| 412 |
hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].original_df_long_doc[data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
|
| 413 |
hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
|
| 414 |
hidden_lb_table_reranker_ldoc = get_leaderboard_table(
|
| 415 |
-
hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
|
| 416 |
)
|
| 417 |
|
| 418 |
set_listeners(
|
|
|
|
| 77 |
from dataclasses import dataclass
|
| 78 |
import pandas as pd
|
| 79 |
from typing import Optional
|
| 80 |
+
|
| 81 |
+
|
| 82 |
@dataclass
class LeaderboardDataStore:
    """Hold everything loaded for a single benchmark version.

    Every field defaults to ``None`` so that an empty store can be created
    with ``LeaderboardDataStore()`` and populated afterwards; passing the
    values positionally (in declaration order) keeps working as before.
    """

    # Raw evaluation results the data frames below are derived from.
    raw_data: Optional[list] = None
    # Unfiltered QA frame built from ``raw_data``.
    raw_qa_df: Optional[pd.DataFrame] = None
    # Unfiltered long-doc frame built from ``raw_data``.
    original_df_long_doc: Optional[pd.DataFrame] = None
    # QA frame prepared for display in the leaderboard table.
    leaderboard_df_qa: Optional[pd.DataFrame] = None
    # Long-doc frame prepared for display in the leaderboard table.
    leaderboard_df_long_doc: Optional[pd.DataFrame] = None
    # Reranking model names offered in the reranking dropdown.
    reranking_models: Optional[list] = None
    # Column types passed to the table component alongside the QA frame.
    types_qa: Optional[list] = None
    # Column types passed to the table component alongside the long-doc frame.
    types_long_doc: Optional[list] = None
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _build_leaderboard_view(source_df, task: str):
    """Return ``(display_df, column_types)`` for *task* from an unfiltered frame.

    Rows flagged by ``COL_NAME_IS_ANONYMOUS`` are removed, only the default
    columns reported by ``get_default_cols`` are kept, and the revision and
    timestamp columns are dropped. ``source_df`` itself is not modified.
    """
    shown_columns, column_types = get_default_cols(
        task, source_df.columns, add_fix_cols=True)
    public_rows = source_df[~source_df[COL_NAME_IS_ANONYMOUS]]
    # Non-mutating drop: avoids calling ``inplace=True`` on a frame that was
    # itself produced by chained indexing.
    display_df = public_rows[shown_columns].drop(
        columns=[COL_NAME_REVISION, COL_NAME_TIMESTAMP])
    return display_df, column_types


def load_eval_results(file_path: str) -> dict:
    """Load the leaderboard data for every supported benchmark version.

    Args:
        file_path: Directory that contains one sub-directory per benchmark
            version; results are read from ``f"{file_path}/{version}"``.

    Returns:
        A dict mapping each version name to a populated
        ``LeaderboardDataStore``.
    """
    output = {}
    versions = ("AIR-Bench_24.04",)
    for version in versions:
        raw_data = get_raw_eval_results(f"{file_path}/{version}")
        raw_qa_df = get_leaderboard_df(
            raw_data, task='qa', metric=DEFAULT_METRIC_QA)
        original_df_long_doc = get_leaderboard_df(
            raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
        print(f'raw data: {len(raw_data)}')
        print(f'QA data loaded: {raw_qa_df.shape}')
        print(f'Long-Doc data loaded: {len(original_df_long_doc)}')

        # NOTE(review): the original carried a commented-out NaN filter on the
        # benchmark columns at this point; it is still not applied.
        leaderboard_df_qa, types_qa = _build_leaderboard_view(raw_qa_df, 'qa')
        leaderboard_df_long_doc, types_long_doc = _build_leaderboard_view(
            original_df_long_doc, 'long-doc')

        output[version] = LeaderboardDataStore(
            raw_data=raw_data,
            raw_qa_df=raw_qa_df,
            original_df_long_doc=original_df_long_doc,
            leaderboard_df_qa=leaderboard_df_qa,
            leaderboard_df_long_doc=leaderboard_df_long_doc,
            # Unique reranking model names, sorted for a stable dropdown order.
            reranking_models=sorted(
                {eval_result.reranking_model for eval_result in raw_data}),
            types_qa=types_qa,
            types_long_doc=types_long_doc,
        )
    return output
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Leaderboard data for every benchmark version, keyed by version name
# (e.g. data["AIR-Bench_24.04"]); the UI code below reads from this mapping.
data = load_eval_results(EVAL_RESULTS_PATH)
|
| 128 |
|
| 129 |
def update_metric_qa(
|
| 130 |
metric: str,
|
|
|
|
| 186 |
# select reranking models
|
| 187 |
with gr.Column():
|
| 188 |
selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
|
| 189 |
+
leaderboard_table = get_leaderboard_table(data["AIR-Bench_24.04"].leaderboard_df_qa, data["AIR-Bench_24.04"].types_qa)
|
| 190 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
| 191 |
+
hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].raw_qa_df, data["AIR-Bench_24.04"].types_qa, visible=False)
|
| 192 |
|
| 193 |
set_listeners(
|
| 194 |
"qa",
|
|
|
|
| 225 |
selected_noreranker = get_noreranking_dropdown()
|
| 226 |
lb_df_retriever = data["AIR-Bench_24.04"].leaderboard_df_qa[data["AIR-Bench_24.04"].leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
|
| 227 |
lb_df_retriever = reset_rank(lb_df_retriever)
|
| 228 |
+
lb_table_retriever = get_leaderboard_table(lb_df_retriever, data["AIR-Bench_24.04"].types_qa)
|
| 229 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
| 230 |
+
hidden_lb_df_retriever = data["AIR-Bench_24.04"].raw_qa_df[data["AIR-Bench_24.04"].raw_qa_df[COL_NAME_RERANKING_MODEL] == "NoReranker"]
|
| 231 |
hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
|
| 232 |
+
hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, data["AIR-Bench_24.04"].types_qa, visible=False)
|
| 233 |
|
| 234 |
set_listeners(
|
| 235 |
"qa",
|
|
|
|
| 267 |
selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
|
| 268 |
with gr.Column(scale=1):
|
| 269 |
search_bar_reranker = gr.Textbox(show_label=False, visible=False)
|
| 270 |
+
lb_table_reranker = get_leaderboard_table(lb_df_reranker, data["AIR-Bench_24.04"].types_qa)
|
| 271 |
+
hidden_lb_df_reranker = data["AIR-Bench_24.04"].raw_qa_df[data["AIR-Bench_24.04"].raw_qa_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
|
| 272 |
hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
|
| 273 |
hidden_lb_table_reranker = get_leaderboard_table(
|
| 274 |
+
hidden_lb_df_reranker, data["AIR-Bench_24.04"].types_qa, visible=False
|
| 275 |
)
|
| 276 |
|
| 277 |
set_listeners(
|
|
|
|
| 329 |
selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
|
| 330 |
|
| 331 |
lb_table = get_leaderboard_table(
|
| 332 |
+
data["AIR-Bench_24.04"].leaderboard_df_long_doc, data["AIR-Bench_24.04"].types_long_doc
|
| 333 |
)
|
| 334 |
|
| 335 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
| 336 |
hidden_lb_table_for_search = get_leaderboard_table(
|
| 337 |
+
data["AIR-Bench_24.04"].original_df_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
|
| 338 |
)
|
| 339 |
|
| 340 |
set_listeners(
|
|
|
|
| 379 |
]
|
| 380 |
hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
|
| 381 |
lb_table_retriever_long_doc = get_leaderboard_table(
|
| 382 |
+
lb_df_retriever_long_doc, data["AIR-Bench_24.04"].types_long_doc)
|
| 383 |
hidden_lb_table_retriever_long_doc = get_leaderboard_table(
|
| 384 |
+
hidden_lb_db_retriever_long_doc, data["AIR-Bench_24.04"].types_long_doc, visible=False
|
| 385 |
)
|
| 386 |
|
| 387 |
set_listeners(
|
|
|
|
| 421 |
selected_rerankings_reranker_ldoc = get_reranking_dropdown(reranking_models_reranker_ldoc)
|
| 422 |
with gr.Column(scale=1):
|
| 423 |
search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
|
| 424 |
+
lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc)
|
| 425 |
hidden_lb_df_reranker_ldoc = data["AIR-Bench_24.04"].original_df_long_doc[data["AIR-Bench_24.04"].original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
|
| 426 |
hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
|
| 427 |
hidden_lb_table_reranker_ldoc = get_leaderboard_table(
|
| 428 |
+
hidden_lb_df_reranker_ldoc, data["AIR-Bench_24.04"].types_long_doc, visible=False
|
| 429 |
)
|
| 430 |
|
| 431 |
set_listeners(
|