Commit d66a6a3 · committed by yangzhitao · Parent: 60906bd

refactor: enhance leaderboard functionality and improve code structure

- Introduced Pydantic models for better data validation in leaderboard evaluations (a minimal sketch of the pattern follows this list).
- Refactored leaderboard DataFrame initialization for improved readability and maintainability.
- Updated Gradio components to use the new structure.
- Added the tabulate dependency to support DataFrame-to-Markdown logging.
- Removed deprecated read_evals_orig.py file to streamline the codebase.
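
For readers skimming the diff, a minimal sketch of the validation pattern this commit adopts. It is illustrative only and not part of the commit; the Task field names mirror src/about.py below, everything else is invented:

    from pydantic import BaseModel, ValidationError

    class Task(BaseModel):
        # Mirrors the new src/about.py: each leaderboard task is a validated record.
        benchmark: str
        metric: str
        col_name: str

    Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")  # ok
    try:
        Task(benchmark="MindCube", metric=None, col_name="MindCube(acc)")  # wrong type
    except ValidationError as err:
        print(err)  # rejected at load time instead of failing later in the DataFrame code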

.vscode/cspell.json CHANGED
@@ -2,6 +2,7 @@
   "words": [
     "accs",
     "changethis",
+    "checkboxgroup",
     "evals",
     "initialisation",
     "modelcard",
.vscode/settings.json CHANGED
@@ -7,5 +7,6 @@
       "source.fixAll.ruff": "always",
       "source.organizeImports.ruff": "always"
     }
-  }
+  },
+  "cursorpyright.analysis.typeCheckingMode": "basic"
 }
app.py CHANGED
@@ -1,9 +1,11 @@
 import gradio as gr
+import gradio.components as grc
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
 from huggingface_hub import snapshot_download
 from rich import print
+from rich.markdown import Markdown
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -33,6 +35,7 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=settings.REPO_ID)
 
+
 print("///// --- Settings --- /////", settings.model_dump())
 
 # Space initialisation
@@ -77,28 +80,38 @@ LEADERBOARD_DF = get_leaderboard_df(
 def init_leaderboard(dataframe: pd.DataFrame) -> Leaderboard:
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+    print("///// --- dataframe.head() --- /////", Markdown(dataframe.head().to_markdown() or "No data"))
+    selected_columns = SelectColumns(
+        default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+        cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+        label="Select Columns to Display:",
+    )
+    search_columns = [AutoEvalColumn.model.name, AutoEvalColumn.license.name]
+    hidden_columns = [c.name for c in fields(AutoEvalColumn) if c.hidden]
+    filter_columns = [
+        ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+        ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+        ColumnFilter(
+            AutoEvalColumn.params.name,
+            type="slider",
+            min=0.01,
+            max=150,
+            label="Select the number of parameters (B)",
+        ),
+        ColumnFilter(
+            AutoEvalColumn.still_on_hub.name,
+            type="boolean",  # pyright: ignore[reportArgumentType]
+            label="Deleted/incomplete",
+            default=False,
+        ),
+    ]
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
-        ],
+        select_columns=selected_columns,
+        search_columns=search_columns,
+        hide_columns=hidden_columns,
+        filter_columns=filter_columns,  # pyright: ignore[reportArgumentType]
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
@@ -127,7 +140,7 @@ with demo:
                         open=False,
                     ):
                         with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
+                            finished_eval_table = grc.Dataframe(
                                 value=finished_eval_queue_df,
                                 headers=EVAL_COLS,
                                 datatype=EVAL_TYPES,
@@ -138,7 +151,7 @@ with demo:
                        open=False,
                    ):
                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
+                            running_eval_table = grc.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
@@ -150,7 +163,7 @@ with demo:
                        open=False,
                    ):
                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
+                            pending_eval_table = grc.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
pyproject.toml CHANGED
@@ -25,6 +25,7 @@ dependencies = [
     "pydantic>=2.11.10",
     "pydantic-settings>=2.11.0",
     "rich>=14.2.0",
+    "tabulate>=0.9.0",
 ]
 
 [dependency-groups]
src/about.py CHANGED
@@ -1,20 +1,49 @@
-from dataclasses import dataclass
 from enum import Enum
+from typing import Annotated
 
+from pydantic import BaseModel, Field
 
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
+
+class Task(BaseModel):
+    benchmark: Annotated[str, Field(description="The benchmark name")]
+    metric: Annotated[str, Field(description="The metric name")]
+    col_name: Annotated[str, Field(description="The column name")]
 
 
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+
+    # acc
+    task1_1 = Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
+    task2_1 = Task(benchmark="MMSI", metric="acc", col_name="MMSI(acc)")
+    task3_1 = Task(benchmark="Omni", metric="acc", col_name="Omni(acc)")
+    task4_1 = Task(benchmark="Core", metric="acc", col_name="Core(acc)")
+    task5_1 = Task(benchmark="SpatialViz", metric="acc", col_name="SpatialViz(acc)")
+    task6_1 = Task(benchmark="STARE", metric="acc", col_name="STARE(acc)")
+    task7_1 = Task(benchmark="SITEBench", metric="acc", col_name="SITEBench(acc)")
+    task8_1 = Task(benchmark="VSI (MCQ)", metric="acc", col_name="VSI (MCQ)(acc)")
+
+    # caa
+    task1_2 = Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")
+    task2_2 = Task(benchmark="MMSI", metric="caa", col_name="MMSI(caa)")
+    task3_2 = Task(benchmark="Omni", metric="caa", col_name="Omni(caa)")
+    task4_2 = Task(benchmark="Core", metric="caa", col_name="Core(caa)")
+    task5_2 = Task(benchmark="SpatialViz", metric="caa", col_name="SpatialViz(caa)")
+    task6_2 = Task(benchmark="STARE", metric="caa", col_name="STARE(caa)")
+    task7_2 = Task(benchmark="SITEBench", metric="caa", col_name="SITEBench(caa)")
+    task8_2 = Task(benchmark="VSI (MCQ)", metric="caa", col_name="VSI (MCQ)(caa)")
+
+    # rand
+    task1_3 = Task(benchmark="MindCube", metric="rand", col_name="MindCube(rand)")
+    task2_3 = Task(benchmark="MMSI", metric="rand", col_name="MMSI(rand)")
+    task3_3 = Task(benchmark="Omni", metric="rand", col_name="Omni(rand)")
+    task4_3 = Task(benchmark="Core", metric="rand", col_name="Core(rand)")
+    task5_3 = Task(benchmark="SpatialViz", metric="rand", col_name="SpatialViz(rand)")
+    task6_3 = Task(benchmark="STARE", metric="rand", col_name="STARE(rand)")
+    task7_3 = Task(benchmark="SITEBench", metric="rand", col_name="SITEBench(rand)")
+    task8_3 = Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
 
 
 NUM_FEWSHOT = 0  # Change with your few shot
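
A hedged usage sketch of the enum above (assumes the src.about module exactly as defined in this diff; the filtering itself is illustrative, not code from the repo):

    from src.about import Tasks

    # Each member wraps a validated Task; display code reads col_name,
    # the result parser matches files on (benchmark, metric).
    acc_tasks = [t.value for t in Tasks if t.value.metric == "acc"]
    print([t.col_name for t in acc_tasks])  # ['MindCube(acc)', 'MMSI(acc)', ..., 'VSI (MCQ)(acc)']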
src/display/css_html_js.py CHANGED
@@ -2,6 +2,7 @@ from pathlib import Path
 
 custom_css = Path("src/assets/css/custom.css")
 
+# FIXME: seems deprecated
 get_window_url_params = """
 function(url_params) {
     const params = new URLSearchParams(window.location.search);
src/display/formatting.py CHANGED
@@ -1,27 +1,33 @@
-def model_hyperlink(link, model_name):
+import typing
+
+if typing.TYPE_CHECKING:
+    import pandas as pd
+
+
+def model_hyperlink(link: str, model_name: str) -> str:
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
-def make_clickable_model(model_name):
+def make_clickable_model(model_name: str) -> str:
     link = f"https://huggingface.co/{model_name}"
     return model_hyperlink(link, model_name)
 
 
-def styled_error(error):
+def styled_error(error: str) -> str:
     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
 
 
-def styled_warning(warn):
+def styled_warning(warn: str) -> str:
     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
 
 
-def styled_message(message):
+def styled_message(message: str) -> str:
     return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
 
 
-def has_no_nan_values(df, columns):
-    return df[columns].notna().all(axis=1)
+def has_no_nan_values(df: "pd.DataFrame", columns: list[str]) -> "pd.Series":
+    return df.loc[:, columns].notna().all(axis=1)
 
 
-def has_nan_values(df, columns):
-    return df[columns].isna().any(axis=1)
+def has_nan_values(df: "pd.DataFrame", columns: list[str]) -> "pd.Series":
+    return df.loc[:, columns].isna().any(axis=1)
src/display/utils.py CHANGED
@@ -1,63 +1,112 @@
-from dataclasses import dataclass, make_dataclass
+"""Based on https://huggingface.co/spaces/demo-leaderboard-backend/leaderboard/blob/main/src/display/utils.py
+
+Enhanced with Pydantic models.
+"""
+
 from enum import Enum
+from typing import Literal, Union
+
+from pydantic import BaseModel, ConfigDict, create_model
+from typing_extensions import Self
 
 from src.about import Tasks
 
 
-def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+def fields(
+    raw_class: Union[
+        type["_AutoEvalColumnBase"],
+        "_AutoEvalColumnBase",
+        type["EvalQueueColumnCls"],
+        "EvalQueueColumnCls",
+    ],
+) -> list["ColumnContent"]:
+    return [v.default for k, v in raw_class.model_fields.items() if k[:2] != "__" and k[-2:] != "__"]
 
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
 # when a modif is needed
-@dataclass
-class ColumnContent:
+class ColumnContent(BaseModel):
     name: str
-    type: str
-    displayed_by_default: bool
+    type: Literal["str", "number", "bool", "markdown"]
+    displayed_by_default: bool | Literal["Original"] = False
     hidden: bool = False
     never_hidden: bool = False
 
-
-# Leaderboard columns
-auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-# Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+    @classmethod
+    def new(
+        cls,
+        name: str,
+        type: Literal["str", "number", "bool", "markdown"],
+        displayed_by_default: bool | Literal["Original"] = False,
+        *,
+        hidden: bool = False,
+        never_hidden: bool = False,
+    ) -> Self:
+        return cls(
+            name=name,
+            type=type,
+            displayed_by_default=displayed_by_default,
+            hidden=hidden,
+            never_hidden=never_hidden,
+        )
+
+
+class _AutoEvalColumnBase(BaseModel):
+    model_config: ConfigDict = ConfigDict(extra="forbid", frozen=True)
+
+    model_type_symbol: ColumnContent = ColumnContent(
+        name="T", type="str", displayed_by_default=True, never_hidden=True
+    )
+    model: ColumnContent = ColumnContent.new("Model", "markdown", True, never_hidden=True)
+    average: ColumnContent = ColumnContent.new("Average ⬆️", "number", True)
+
+    model_type: ColumnContent = ColumnContent.new("Type", "str")
+    architecture: ColumnContent = ColumnContent.new("Architecture", "str")
+    weight_type: ColumnContent = ColumnContent.new("Weight type", "str", hidden=True)
+    precision: ColumnContent = ColumnContent.new("Precision", "str")
+    license: ColumnContent = ColumnContent.new("Hub License", "str")
+    params: ColumnContent = ColumnContent.new("#Params (B)", "number")
+    likes: ColumnContent = ColumnContent.new("Hub ❤️", "number")
+    still_on_hub: ColumnContent = ColumnContent.new("Available on the hub", "bool")
+    revision: ColumnContent = ColumnContent.new("Model sha", "str")
+
+
+# We use create_model to dynamically fill the scores from Tasks
+field_definitions = {
+    task.name: (
+        ColumnContent,
+        ColumnContent.new(task.value.col_name, "number", True),
+    )
+    for task in Tasks
+}
+AutoEvalColumnCls: type[_AutoEvalColumnBase] = create_model(  # pyright: ignore[reportCallIssue]
+    '_AutoEvalColumnCls',
+    __base__=_AutoEvalColumnBase,
+    **field_definitions,  # pyright: ignore[reportArgumentType]
+)
+
+
+AutoEvalColumn = AutoEvalColumnCls()
 
 
 # For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn:  # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
+class EvalQueueColumnCls(BaseModel):  # Queue column
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    model: ColumnContent = ColumnContent.new("model", "markdown", True)
+    revision: ColumnContent = ColumnContent.new("revision", "str", True)
+    private: ColumnContent = ColumnContent.new("private", "bool", True)
+    precision: ColumnContent = ColumnContent.new("precision", "str", True)
+    weight_type: ColumnContent = ColumnContent.new("weight_type", "str", "Original")
+    status: ColumnContent = ColumnContent.new("status", "str", True)
+
+
+EvalQueueColumn = EvalQueueColumnCls()
 
 
 # All the model information that we might need
-@dataclass
-class ModelDetails:
+class ModelDetails(BaseModel):
     name: str
     display_name: str = ""
     symbol: str = ""  # emoji
@@ -87,17 +136,18 @@ class ModelType(Enum):
 
 
 class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
+    Adapter = ModelDetails(name="Adapter")
+    Original = ModelDetails(name="Original")
+    Delta = ModelDetails(name="Delta")
 
 
 class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    Unknown = ModelDetails("?")
+    float16 = ModelDetails(name="float16")
+    bfloat16 = ModelDetails(name="bfloat16")
+    Unknown = ModelDetails(name="?")
 
-    def from_str(precision):
+    @classmethod
+    def from_str(cls, precision):
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
@@ -106,9 +156,9 @@ class Precision(Enum):
 
 
 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if not c.hidden]
 
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+EVAL_COLS: list[str] = [c.name for c in fields(EvalQueueColumnCls)]
+EVAL_TYPES: list[Literal["str", "number", "bool", "markdown"]] = [c.type for c in fields(EvalQueueColumnCls)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS: list[str] = [t.value.col_name for t in Tasks]
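
The dynamic column class above relies on pydantic.create_model; a standalone sketch of that mechanism with toy names (ColumnBase, anli, logiqa are assumptions, not the leaderboard classes):

    from pydantic import BaseModel, create_model

    class ColumnBase(BaseModel):
        model: str = "Model"

    # (type, default) tuples become ordinary fields with defaults on the generated class.
    extra = {"anli": (str, "ANLI(acc)"), "logiqa": (str, "LogiQA(acc)")}
    Columns = create_model("Columns", __base__=ColumnBase, **extra)

    cols = Columns()
    print(cols.anli, cols.logiqa)      # ANLI(acc) LogiQA(acc)
    print(list(Columns.model_fields))  # ['model', 'anli', 'logiqa']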
src/envs.py CHANGED
@@ -23,14 +23,17 @@ class Settings(BaseSettings):
     ]
 
     @computed_field
+    @cached_property
     def REPO_ID(self) -> str:
         return (Path(self.OWNER) / "leaderboard").as_posix()
 
     @computed_field
+    @cached_property
     def QUEUE_REPO(self) -> str:
         return (Path(self.OWNER) / "requests").as_posix()
 
     @computed_field
+    @cached_property
     def RESULTS_REPO(self) -> str:
         return (Path(self.OWNER) / "results").as_posix()
 
@@ -42,18 +45,22 @@ class Settings(BaseSettings):
     # Local caches
 
     @computed_field
+    @cached_property
     def EVAL_REQUESTS_PATH(self) -> str:
         return (Path(self.CACHE_PATH) / "eval-queue").as_posix()
 
     @computed_field
+    @cached_property
     def EVAL_RESULTS_PATH(self) -> str:
         return (Path(self.CACHE_PATH) / "eval-results").as_posix()
 
     @computed_field
+    @cached_property
     def EVAL_REQUESTS_PATH_BACKEND(self) -> str:
         return (Path(self.CACHE_PATH) / "eval-queue-bk").as_posix()
 
     @computed_field
+    @cached_property
     def EVAL_RESULTS_PATH_BACKEND(self) -> str:
         return (Path(self.CACHE_PATH) / "eval-results-bk").as_posix()
 
@@ -63,5 +70,5 @@ class Settings(BaseSettings):
         return HfApi(token=self.TOKEN)
 
 
-settings = Settings()
+settings = Settings()  # pyright: ignore[reportCallIssue]
 API = settings.API
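
The @computed_field/@cached_property stacking added above is the standard Pydantic v2 way to expose a derived, cached value; a minimal sketch with an assumed toy model (Paths, owner, repo_id are illustrative names):

    from functools import cached_property
    from pathlib import Path

    from pydantic import BaseModel, computed_field

    class Paths(BaseModel):
        owner: str = "demo-org"

        @computed_field  # included in model_dump()/serialization
        @cached_property  # computed once, then cached on the instance
        def repo_id(self) -> str:
            return (Path(self.owner) / "leaderboard").as_posix()

    p = Paths()
    print(p.repo_id)       # demo-org/leaderboard
    print(p.model_dump())  # {'owner': 'demo-org', 'repo_id': 'demo-org/leaderboard'}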
src/leaderboard/read_evals.py CHANGED
@@ -1,11 +1,17 @@
+"""Based on https://huggingface.co/spaces/demo-leaderboard-backend/leaderboard/blob/main/src/leaderboard/read_evals.py
+
+Enhanced with Pydantic models.
+"""
+
 import glob
 import json
 import os
-from dataclasses import dataclass
-from typing import Any
+from pathlib import Path
+from typing import Annotated, Any
 
-import dateutil
+import dateutil.parser
 import numpy as np
+from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Self
 
 from src.display.formatting import make_clickable_model
@@ -13,16 +19,35 @@ from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, Weigh
 from src.submission.check_validity import is_model_on_hub
 
 
-@dataclass
-class EvalResult:
+class EvalResultJson(BaseModel):
+    """Model of the eval result json file."""
+
+    model_config: ConfigDict = ConfigDict(extra="allow", frozen=True)
+
+    config: "EvalResultJson_Config"
+    results: dict[str, dict[str, float | None]]
+
+
+class EvalResultJson_Config(BaseModel):
+    """`config` in the eval result json file."""
+
+    model_config: ConfigDict = ConfigDict(extra="allow", frozen=True)
+
+    model_dtype: Annotated[str, Field(..., description="The model precision. e.g. torch.bfloat16")]
+    model_name: Annotated[str, Field(..., description="The model name. e.g. Qwen/Qwen2.5-3B")]
+    model_sha: Annotated[str, Field(description="The model sha. e.g. 3aab1f1954e9cc14eb9509a215f9e5ca08227a9b")] = ""
+    model_args: Annotated[str | None, Field(description="The model args.")] = None
+
+
+class EvalResult(BaseModel):
     """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
 
     eval_name: str  # org_model_precision (uid)
     full_model: str  # org/model (path on hub)
-    org: str
+    org: str | None
     model: str
     revision: str  # commit hash, "" if main
-    results: dict
+    results: dict[str, float]
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original  # Original or Adapter
@@ -36,16 +61,14 @@ class EvalResult:
     @classmethod
     def init_from_json_file(cls, json_filepath: str) -> Self:
         """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        config = data.get("config")
+        data = EvalResultJson.model_validate_json(Path(json_filepath).read_text())
+        config = data.config
 
         # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
+        precision = Precision.from_str(config.model_dtype)
 
         # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
+        org_and_model = config.model_name or config.model_args or ""
         org_and_model = org_and_model.split("/", 1)
 
         if len(org_and_model) == 1:
@@ -59,38 +82,38 @@ class EvalResult:
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, config.model_sha or "main", trust_remote_code=True, test_tokenizer=False
         )
-        architecture = "?"
+        architecture: str = "?"
         if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
+            architectures: list[str] | None = getattr(model_config, "architectures", None)
             if architectures:
                 architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
+        results: dict[str, float] = {}
+        for t in Tasks:
+            task = t.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            accs = np.array([v.get(task.metric, None) for k, v in data.results.items() if task.benchmark == k])
             if accs.size == 0 or any(acc is None for acc in accs):
                 continue
 
             mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return cls(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision=config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture,
-        )
+            results[task.benchmark] = float(mean_acc)
+
+        return cls.model_validate({
+            "eval_name": result_key,
+            "full_model": full_model,
+            "org": org,
+            "model": model,
+            "results": results,
+            "precision": precision,
+            "revision": config.model_sha or "",
+            "still_on_hub": still_on_hub,
+            "architecture": architecture,
+        })
 
     def update_with_request_file(self, requests_path: str) -> None:
         """Finds the relevant request file for the current model and updates info with it"""
@@ -135,7 +158,7 @@ class EvalResult:
         return data_dict
 
 
-def get_request_file_for_model(requests_path, model_name, precision):
+def get_request_file_for_model(requests_path, model_name, precision) -> str:
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
         requests_path,
@@ -166,7 +189,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         # Sort the files by date
         try:
             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
+        except dateutil.parser.ParserError:
            files = [files[-1]]
 
        for file in files:
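
The old json.load + .get() chain is replaced by schema validation; a hedged sketch of the same idea against a hand-written payload (field names mirror EvalResultJson above, the payload itself is invented):

    from pydantic import BaseModel, ConfigDict

    class Config(BaseModel):
        # protected_namespaces=() silences the Pydantic warning about model_* field names
        model_config = ConfigDict(extra="allow", protected_namespaces=())
        model_dtype: str
        model_name: str
        model_sha: str = ""

    class ResultFile(BaseModel):
        config: Config
        results: dict[str, dict[str, float | None]]

    raw = '{"config": {"model_dtype": "torch.bfloat16", "model_name": "org/model"}, "results": {"MindCube": {"acc": 0.5}}}'
    data = ResultFile.model_validate_json(raw)  # raises ValidationError if the file is malformed
    print(data.config.model_name, data.results["MindCube"]["acc"])  # org/model 0.5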
src/leaderboard/read_evals_orig.py DELETED
@@ -1,194 +0,0 @@
-import glob
-import json
-import os
-from dataclasses import dataclass
-
-import dateutil
-import numpy as np
-
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
-from src.submission.check_validity import is_model_on_hub
-
-
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
-
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
-    model: str
-    revision: str  # commit hash, "" if main
-    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    still_on_hub: bool = False
-
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        config = data.get("config")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any(acc is None for acc in accs):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision=config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture,
-        )
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file) as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(
-                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
-            )
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum(v for v in self.results.values() if v is not None) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-        return data_dict
-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file) as f:
-            req_content = json.load(f)
-            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
-                request_file = tmp_request_file
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any(not f.endswith(".json") for f in files):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict()  # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-
-    return results
src/populate.py CHANGED
@@ -23,7 +23,12 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(
+    results_path: str,
+    requests_path: str,
+    cols: list[str],
+    benchmark_cols: list[str],
+) -> pd.DataFrame:
     """
     Creates a sorted leaderboard DataFrame from evaluation results.
 
@@ -52,14 +57,14 @@
 
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
+    df = df.loc[:, cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 
 
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+def get_evaluation_queue_df(save_path: str, cols: list[str]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """
     Creates separate DataFrames for different evaluation queue statuses.
 
@@ -72,7 +77,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
        cols (list): List of column names to include in the final DataFrames
 
     Returns:
-        list[pd.DataFrame]: A list containing three DataFrames in order:
+        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three DataFrames in order:
         1. df_finished: Evaluations with status "FINISHED*" or "PENDING_NEW_EVAL"
         2. df_running: Evaluations with status "RUNNING"
        3. df_pending: Evaluations with status "PENDING" or "RERUN"
@@ -120,4 +125,4 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+    return df_finished.loc[:, cols], df_running.loc[:, cols], df_pending.loc[:, cols]
src/submission/check_validity.py CHANGED
@@ -1,19 +1,23 @@
 import json
 import os
+import typing
 from collections import defaultdict
 
-import huggingface_hub
+import huggingface_hub.errors
 from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
+if typing.TYPE_CHECKING:
+    from transformers.configuration_utils import PretrainedConfig
+
 
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:
         card = ModelCard.load(repo_id)
-    except huggingface_hub.utils.EntryNotFoundError:
+    except huggingface_hub.errors.EntryNotFoundError:
         return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
 
     # Enforce license metadata
@@ -32,8 +36,16 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
 
 def is_model_on_hub(
-    model_name: str, revision: str, token: str | None = None, trust_remote_code=False, test_tokenizer=False
-) -> tuple[bool, str]:
+    model_name: str,
+    revision: str,
+    token: str | None = None,
+    trust_remote_code=False,
+    test_tokenizer=False,
+) -> tuple[
+    bool,
+    str | None,
+    "PretrainedConfig | None",
+]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
         config = AutoConfig.from_pretrained(
uv.lock CHANGED
@@ -687,6 +687,7 @@ dependencies = [
     { name = "python-dotenv" },
     { name = "rich" },
     { name = "sentencepiece" },
+    { name = "tabulate" },
     { name = "tokenizers" },
     { name = "tqdm" },
     { name = "transformers" },
@@ -715,6 +716,7 @@ requires-dist = [
     { name = "python-dotenv", specifier = ">=1.2.1" },
     { name = "rich", specifier = ">=14.2.0" },
     { name = "sentencepiece" },
+    { name = "tabulate", specifier = ">=0.9.0" },
     { name = "tokenizers", specifier = ">=0.15.0" },
     { name = "tqdm" },
     { name = "transformers" },
@@ -1305,6 +1307,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" },
 ]
 
+[[package]]
+name = "tabulate"
+version = "0.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" },
+]
+
 [[package]]
 name = "tokenizers"
 version = "0.22.1"