Add model positions in the ranking

- app.py +35 -9
- src/display/utils.py +2 -0
- src/leaderboard/read_evals.py +5 -2
app.py CHANGED

@@ -108,10 +108,23 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
 
+    sorted_dataframe = dataframe.sort_values(by="Avg. Combined Performance ⬆️", ascending=False)
+
+    sorted_dataframe = sorted_dataframe.reset_index(drop=True)
+    sorted_dataframe["rank"] = sorted_dataframe.index + 1
+
+    # add a medal next to the model name for the top three ranks
+    sorted_dataframe["Model"] = sorted_dataframe.apply(
+        lambda row: f"{row['Model']} 🥇" if row["rank"] == 1 else
+                    (f"{row['Model']} 🥈" if row["rank"] == 2 else
+                    (f"{row['Model']} 🥉" if row["rank"] == 3 else row["Model"])),
+        axis=1
+    )
+
     field_list = fields(AutoEvalColumn)
 
     return Leaderboard(
-        value=dataframe,
+        value=sorted_dataframe,
         datatype=[c.type for c in field_list],
         #select_columns=SelectColumns(
         #    default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],

@@ -144,6 +157,18 @@ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
 
     sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False)
 
+    # add a rank column based on the sorted position
+    sorted_dataframe = sorted_dataframe.reset_index(drop=True)
+    sorted_dataframe["rank"] = sorted_dataframe.index + 1
+
+    # add a medal next to the model name for the top three ranks
+    sorted_dataframe["Model"] = sorted_dataframe.apply(
+        lambda row: f"{row['Model']} 🥇" if row["rank"] == 1 else
+                    (f"{row['Model']} 🥈" if row["rank"] == 2 else
+                    (f"{row['Model']} 🥉" if row["rank"] == 3 else row["Model"])),
+        axis=1
+    )
+
     pd.set_option('display.max_colwidth', None)
     #print("========================", dataframe['Model'])
 

@@ -153,7 +178,8 @@ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
 
     return Leaderboard(
         value=sorted_dataframe,
-        datatype=[c.type for c in field_list],
+        #datatype=[c.type for c in field_list],
+        datatype=[c.type for c in field_list] + [int],
         #select_columns=SelectColumns(
         #    default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
         #    cant_deselect=[c.name for c in field_list if c.never_hidden],

@@ -211,7 +237,7 @@ download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
 # Load leaderboard data
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-print(LEADERBOARD_DF.columns.tolist())
+#print(LEADERBOARD_DF.columns.tolist())
 
 # Prepare the main interface
 demo = gr.Blocks(css=custom_css)

@@ -242,8 +268,8 @@ with demo:
 
             leaderboard = init_leaderboard(
                 LEADERBOARD_DF,
-                default_selection=['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
-                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
+                default_selection=['rank', 'FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
+                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['rank', 'FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
             )
 
         with gr.TabItem("📈 Charts"):

@@ -269,8 +295,8 @@ with demo:
 
                 leaderboard = update_task_leaderboard(
                     LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Prompt Std": "Prompt Std", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
-                    default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
-                    hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id']]
+                    default_selection=['rank', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
+                    hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['rank', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id']]
                 )
 
     # About tab

@@ -289,10 +315,10 @@ with demo:
                         f"{task} Best Prompt": "Best Prompt",
                         f"{task} Best Prompt Id": "Best Prompt Id",
                         task: "Combined Performance"}),
-                    default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt',
+                    default_selection=['rank', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt',
                                        'Best Prompt Id'],
                     hidden_columns=[col for col in LEADERBOARD_DF.columns if
-                                    col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std',
+                                    col not in ['rank', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std',
                                                 'Best Prompt', 'Best Prompt Id']]
                 )
 
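The same sort/rank/medal logic is duplicated in init_leaderboard and update_task_leaderboard above. A minimal standalone sketch of what it does, factored into one helper; the helper name add_rank_and_medals and the toy data are illustrative, not part of this commit:

# Sketch of the ranking/medal logic added above, assuming a pandas DataFrame
# with a "Model" column and a numeric score column.
import pandas as pd

MEDALS = {1: "🥇", 2: "🥈", 3: "🥉"}

def add_rank_and_medals(df: pd.DataFrame, score_col: str) -> pd.DataFrame:
    # Sort by the score, then derive a 1-based rank from the row position.
    out = df.sort_values(by=score_col, ascending=False).reset_index(drop=True)
    out["rank"] = out.index + 1
    # Append a medal to the model name for the top three ranks.
    out["Model"] = out.apply(
        lambda row: f"{row['Model']} {MEDALS[row['rank']]}" if row["rank"] in MEDALS else row["Model"],
        axis=1,
    )
    return out

if __name__ == "__main__":
    toy = pd.DataFrame({"Model": ["a", "b", "c", "d"],
                        "Combined Performance": [0.71, 0.83, 0.65, 0.79]})
    print(add_rank_and_medals(toy, "Combined Performance"))
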
src/display/utils.py CHANGED

@@ -25,6 +25,8 @@ auto_eval_column_dict = []
 # Init
 #auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 
+auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent("rank", "str", True, never_hidden=True)])
+
 auto_eval_column_dict.append(["fewshot_symbol", ColumnContent, ColumnContent("FS", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["is_5fewshot", ColumnContent, ColumnContent("IS_FS", "bool", True)])
 
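The ColumnContent entries above are constructed as ColumnContent(name, type, displayed_by_default, never_hidden=...). A sketch of a dataclass shape consistent with those calls, inferred from the diff rather than taken from the actual definition in src/display/utils.py:

# Inferred shape of ColumnContent, matching the constructor calls shown above;
# the real definition may have additional fields.
from dataclasses import dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                      # column header shown in the leaderboard, e.g. "rank"
    type: str                      # display type, e.g. "str", "bool", "number"
    displayed_by_default: bool = True
    hidden: bool = False
    never_hidden: bool = False     # never_hidden=True keeps "rank", "FS", "Model" always visible
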
src/leaderboard/read_evals.py CHANGED

@@ -2,7 +2,7 @@ import glob
 import json
 import math
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 import dateutil
 import numpy as np

@@ -34,6 +34,7 @@ class EvalResult:
     num_params: int = 0
     date: str = ""  # submission date of request file
     still_on_hub: bool = False
+    rank: int = field(default=0)  # 👈 new field with default = 0
 
     @classmethod
     def init_from_json_file(self, json_filepath):

@@ -117,7 +118,8 @@ class EvalResult:
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture,
-            num_params=num_params
+            num_params=num_params,
+            rank = 0
         )
 
     '''

@@ -164,6 +166,7 @@ class EvalResult:
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            "rank": self.rank
         }
 
         for task in Tasks:
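A condensed sketch of the new EvalResult field and the matching to_dict entry. Only the fields visible in this diff are kept, the dictionary keys are simplified stand-ins for the real AutoEvalColumn names, and rank: int = field(default=0) behaves the same as a plain rank: int = 0 default:

# Condensed sketch of the rank change in EvalResult; other fields and the
# AutoEvalColumn-based keys of the real to_dict are omitted.
from dataclasses import dataclass, field

@dataclass
class EvalResult:
    num_params: int = 0
    date: str = ""                 # submission date of request file
    still_on_hub: bool = False
    rank: int = field(default=0)   # same effect as "rank: int = 0"

    def to_dict(self) -> dict:
        # Mirrors the new "rank" entry added to the real to_dict above.
        return {
            "params": self.num_params,
            "still_on_hub": self.still_on_hub,
            "rank": self.rank,     # 0 at load time; app.py recomputes it after sorting
        }

print(EvalResult(num_params=7).to_dict())
# {'params': 7, 'still_on_hub': False, 'rank': 0}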