Commit d66a6a3 · committed by yangzhitao · Parent: 60906bd

refactor: enhance leaderboard functionality and improve code structure

- Introduced Pydantic models for better data validation in leaderboard evaluations (a minimal sketch of the pattern follows this list).
- Refactored leaderboard DataFrame initialization for improved readability and maintainability.
- Updated Gradio components to use the new structure.
- Added the tabulate dependency to support DataFrame-to-Markdown logging.
- Removed deprecated read_evals_orig.py file to streamline the codebase.
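
For readers skimming the diff, a minimal sketch of the validation pattern this commit adopts. It is illustrative only and not part of the commit; the Task field names mirror src/about.py below, everything else is invented:

    from pydantic import BaseModel, ValidationError

    class Task(BaseModel):
        # Mirrors the new src/about.py: each leaderboard task is a validated record.
        benchmark: str
        metric: str
        col_name: str

    Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")  # ok
    try:
        Task(benchmark="MindCube", metric=None, col_name="MindCube(acc)")  # wrong type
    except ValidationError as err:
        print(err)  # rejected at load time instead of failing later in the DataFrame code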

.vscode/cspell.json CHANGED
@@ -2,6 +2,7 @@
   "words": [
     "accs",
     "changethis",
+    "checkboxgroup",
     "evals",
     "initialisation",
     "modelcard",
.vscode/settings.json CHANGED
@@ -7,5 +7,6 @@
       "source.fixAll.ruff": "always",
       "source.organizeImports.ruff": "always"
     }
-  }
+  },
+  "cursorpyright.analysis.typeCheckingMode": "basic"
 }
app.py CHANGED
@@ -1,9 +1,11 @@
 import gradio as gr
+import gradio.components as grc
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
 from huggingface_hub import snapshot_download
 from rich import print
+from rich.markdown import Markdown
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -33,6 +35,7 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=settings.REPO_ID)
 
+
 print("///// --- Settings --- /////", settings.model_dump())
 
 # Space initialisation
@@ -77,28 +80,38 @@ LEADERBOARD_DF = get_leaderboard_df(
 def init_leaderboard(dataframe: pd.DataFrame) -> Leaderboard:
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+    print("///// --- dataframe.head() --- /////", Markdown(dataframe.head().to_markdown() or "No data"))
+    selected_columns = SelectColumns(
+        default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+        cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+        label="Select Columns to Display:",
+    )
+    search_columns = [AutoEvalColumn.model.name, AutoEvalColumn.license.name]
+    hidden_columns = [c.name for c in fields(AutoEvalColumn) if c.hidden]
+    filter_columns = [
+        ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+        ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+        ColumnFilter(
+            AutoEvalColumn.params.name,
+            type="slider",
+            min=0.01,
+            max=150,
+            label="Select the number of parameters (B)",
+        ),
+        ColumnFilter(
+            AutoEvalColumn.still_on_hub.name,
+            type="boolean",  # pyright: ignore[reportArgumentType]
+            label="Deleted/incomplete",
+            default=False,
+        ),
+    ]
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
-        ],
+        select_columns=selected_columns,
+        search_columns=search_columns,
+        hide_columns=hidden_columns,
+        filter_columns=filter_columns,  # pyright: ignore[reportArgumentType]
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
@@ -127,7 +140,7 @@ with demo:
                         open=False,
                     ):
                         with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
+                            finished_eval_table = grc.Dataframe(
                                 value=finished_eval_queue_df,
                                 headers=EVAL_COLS,
                                 datatype=EVAL_TYPES,
@@ -138,7 +151,7 @@ with demo:
                        open=False,
                    ):
                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
+                            running_eval_table = grc.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
@@ -150,7 +163,7 @@ with demo:
                        open=False,
                    ):
                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
+                            pending_eval_table = grc.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
pyproject.toml CHANGED
@@ -25,6 +25,7 @@ dependencies = [
     "pydantic>=2.11.10",
     "pydantic-settings>=2.11.0",
     "rich>=14.2.0",
+    "tabulate>=0.9.0",
 ]
 
 [dependency-groups]
src/about.py CHANGED
@@ -1,20 +1,49 @@
-from dataclasses import dataclass
 from enum import Enum
+from typing import Annotated
 
+from pydantic import BaseModel, Field
 
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
+
+class Task(BaseModel):
+    benchmark: Annotated[str, Field(description="The benchmark name")]
+    metric: Annotated[str, Field(description="The metric name")]
+    col_name: Annotated[str, Field(description="The column name")]
 
 
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+
+    # acc
+    task1_1 = Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
+    task2_1 = Task(benchmark="MMSI", metric="acc", col_name="MMSI(acc)")
+    task3_1 = Task(benchmark="Omni", metric="acc", col_name="Omni(acc)")
+    task4_1 = Task(benchmark="Core", metric="acc", col_name="Core(acc)")
+    task5_1 = Task(benchmark="SpatialViz", metric="acc", col_name="SpatialViz(acc)")
+    task6_1 = Task(benchmark="STARE", metric="acc", col_name="STARE(acc)")
+    task7_1 = Task(benchmark="SITEBench", metric="acc", col_name="SITEBench(acc)")
+    task8_1 = Task(benchmark="VSI (MCQ)", metric="acc", col_name="VSI (MCQ)(acc)")
+
+    # caa
+    task1_2 = Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")
+    task2_2 = Task(benchmark="MMSI", metric="caa", col_name="MMSI(caa)")
+    task3_2 = Task(benchmark="Omni", metric="caa", col_name="Omni(caa)")
+    task4_2 = Task(benchmark="Core", metric="caa", col_name="Core(caa)")
+    task5_2 = Task(benchmark="SpatialViz", metric="caa", col_name="SpatialViz(caa)")
+    task6_2 = Task(benchmark="STARE", metric="caa", col_name="STARE(caa)")
+    task7_2 = Task(benchmark="SITEBench", metric="caa", col_name="SITEBench(caa)")
+    task8_2 = Task(benchmark="VSI (MCQ)", metric="caa", col_name="VSI (MCQ)(caa)")
+
+    # rand
+    task1_3 = Task(benchmark="MindCube", metric="rand", col_name="MindCube(rand)")
+    task2_3 = Task(benchmark="MMSI", metric="rand", col_name="MMSI(rand)")
+    task3_3 = Task(benchmark="Omni", metric="rand", col_name="Omni(rand)")
+    task4_3 = Task(benchmark="Core", metric="rand", col_name="Core(rand)")
+    task5_3 = Task(benchmark="SpatialViz", metric="rand", col_name="SpatialViz(rand)")
+    task6_3 = Task(benchmark="STARE", metric="rand", col_name="STARE(rand)")
+    task7_3 = Task(benchmark="SITEBench", metric="rand", col_name="SITEBench(rand)")
+    task8_3 = Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
 
 
 NUM_FEWSHOT = 0  # Change with your few shot
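
A hedged usage sketch of the enum above (assumes the src.about module exactly as defined in this diff; the filtering itself is illustrative, not code from the repo):

    from src.about import Tasks

    # Each member wraps a validated Task; display code reads col_name,
    # the result parser matches files on (benchmark, metric).
    acc_tasks = [t.value for t in Tasks if t.value.metric == "acc"]
    print([t.col_name for t in acc_tasks])  # ['MindCube(acc)', 'MMSI(acc)', ..., 'VSI (MCQ)(acc)']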
src/display/css_html_js.py CHANGED
@@ -2,6 +2,7 @@ from pathlib import Path
 
 custom_css = Path("src/assets/css/custom.css")
 
+# FIXME: seems deprecated
 get_window_url_params = """
 function(url_params) {
     const params = new URLSearchParams(window.location.search);
src/display/formatting.py CHANGED
@@ -1,27 +1,33 @@
-def model_hyperlink(link, model_name):
+import typing
+
+if typing.TYPE_CHECKING:
+    import pandas as pd
+
+
+def model_hyperlink(link: str, model_name: str) -> str:
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
-def make_clickable_model(model_name):
+def make_clickable_model(model_name: str) -> str:
     link = f"https://huggingface.co/{model_name}"
     return model_hyperlink(link, model_name)
 
 
-def styled_error(error):
+def styled_error(error: str) -> str:
     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
 
 
-def styled_warning(warn):
+def styled_warning(warn: str) -> str:
     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
 
 
-def styled_message(message):
+def styled_message(message: str) -> str:
     return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
 
 
-def has_no_nan_values(df, columns):
-    return df[columns].notna().all(axis=1)
+def has_no_nan_values(df: "pd.DataFrame", columns: list[str]) -> "pd.Series":
+    return df.loc[:, columns].notna().all(axis=1)
 
 
-def has_nan_values(df, columns):
-    return df[columns].isna().any(axis=1)
+def has_nan_values(df: "pd.DataFrame", columns: list[str]) -> "pd.Series":
+    return df.loc[:, columns].isna().any(axis=1)
src/display/utils.py CHANGED
@@ -1,63 +1,112 @@
-from dataclasses import dataclass, make_dataclass
+"""Based on https://huggingface.co/spaces/demo-leaderboard-backend/leaderboard/blob/main/src/display/utils.py
+
+Enhanced with Pydantic models.
+"""
+
 from enum import Enum
+from typing import Literal, Union
+
+from pydantic import BaseModel, ConfigDict, create_model
+from typing_extensions import Self
 
 from src.about import Tasks
 
 
-def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+def fields(
+    raw_class: Union[
+        type["_AutoEvalColumnBase"],
+        "_AutoEvalColumnBase",
+        type["EvalQueueColumnCls"],
+        "EvalQueueColumnCls",
+    ],
+) -> list["ColumnContent"]:
+    return [v.default for k, v in raw_class.model_fields.items() if k[:2] != "__" and k[-2:] != "__"]
 
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
 # when a modif is needed
-@dataclass
-class ColumnContent:
+class ColumnContent(BaseModel):
     name: str
-    type: str
-    displayed_by_default: bool
+    type: Literal["str", "number", "bool", "markdown"]
+    displayed_by_default: bool | Literal["Original"] = False
     hidden: bool = False
     never_hidden: bool = False
 
-
-# Leaderboard columns
-auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-# Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+    @classmethod
+    def new(
+        cls,
+        name: str,
+        type: Literal["str", "number", "bool", "markdown"],
+        displayed_by_default: bool | Literal["Original"] = False,
+        *,
+        hidden: bool = False,
+        never_hidden: bool = False,
+    ) -> Self:
+        return cls(
+            name=name,
+            type=type,
+            displayed_by_default=displayed_by_default,
+            hidden=hidden,
+            never_hidden=never_hidden,
+        )
+
+
+class _AutoEvalColumnBase(BaseModel):
+    model_config: ConfigDict = ConfigDict(extra="forbid", frozen=True)
+
+    model_type_symbol: ColumnContent = ColumnContent(
+        name="T", type="str", displayed_by_default=True, never_hidden=True
+    )
+    model: ColumnContent = ColumnContent.new("Model", "markdown", True, never_hidden=True)
+    average: ColumnContent = ColumnContent.new("Average ⬆️", "number", True)
+
+    model_type: ColumnContent = ColumnContent.new("Type", "str")
+    architecture: ColumnContent = ColumnContent.new("Architecture", "str")
+    weight_type: ColumnContent = ColumnContent.new("Weight type", "str", hidden=True)
+    precision: ColumnContent = ColumnContent.new("Precision", "str")
+    license: ColumnContent = ColumnContent.new("Hub License", "str")
+    params: ColumnContent = ColumnContent.new("#Params (B)", "number")
+    likes: ColumnContent = ColumnContent.new("Hub ❤️", "number")
+    still_on_hub: ColumnContent = ColumnContent.new("Available on the hub", "bool")
+    revision: ColumnContent = ColumnContent.new("Model sha", "str")
+
+
+# We use create_model to dynamically fill the scores from Tasks
+field_definitions = {
+    task.name: (
+        ColumnContent,
+        ColumnContent.new(task.value.col_name, "number", True),
+    )
+    for task in Tasks
+}
+AutoEvalColumnCls: type[_AutoEvalColumnBase] = create_model(  # pyright: ignore[reportCallIssue]
+    '_AutoEvalColumnCls',
+    __base__=_AutoEvalColumnBase,
+    **field_definitions,  # pyright: ignore[reportArgumentType]
+)
+
+
+AutoEvalColumn = AutoEvalColumnCls()
 
 
 # For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn:  # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
+class EvalQueueColumnCls(BaseModel):  # Queue column
+    model_config = ConfigDict(extra="forbid", frozen=True)
+
+    model: ColumnContent = ColumnContent.new("model", "markdown", True)
+    revision: ColumnContent = ColumnContent.new("revision", "str", True)
+    private: ColumnContent = ColumnContent.new("private", "bool", True)
+    precision: ColumnContent = ColumnContent.new("precision", "str", True)
+    weight_type: ColumnContent = ColumnContent.new("weight_type", "str", "Original")
+    status: ColumnContent = ColumnContent.new("status", "str", True)
+
+
+EvalQueueColumn = EvalQueueColumnCls()
 
 
 # All the model information that we might need
-@dataclass
-class ModelDetails:
+class ModelDetails(BaseModel):
     name: str
     display_name: str = ""
     symbol: str = ""  # emoji
@@ -87,17 +136,18 @@ class ModelType(Enum):
 
 
 class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
+    Adapter = ModelDetails(name="Adapter")
+    Original = ModelDetails(name="Original")
+    Delta = ModelDetails(name="Delta")
 
 
 class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    Unknown = ModelDetails("?")
+    float16 = ModelDetails(name="float16")
+    bfloat16 = ModelDetails(name="bfloat16")
+    Unknown = ModelDetails(name="?")
 
-    def from_str(precision):
+    @classmethod
+    def from_str(cls, precision):
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
@@ -106,9 +156,9 @@ class Precision(Enum):
 
 
 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if not c.hidden]
 
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+EVAL_COLS: list[str] = [c.name for c in fields(EvalQueueColumnCls)]
+EVAL_TYPES: list[Literal["str", "number", "bool", "markdown"]] = [c.type for c in fields(EvalQueueColumnCls)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS: list[str] = [t.value.col_name for t in Tasks]
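
The dynamic column class above relies on pydantic.create_model; a standalone sketch of that mechanism with toy names (ColumnBase, anli, logiqa are assumptions, not the leaderboard classes):

    from pydantic import BaseModel, create_model

    class ColumnBase(BaseModel):
        model: str = "Model"

    # (type, default) tuples become ordinary fields with defaults on the generated class.
    extra = {"anli": (str, "ANLI(acc)"), "logiqa": (str, "LogiQA(acc)")}
    Columns = create_model("Columns", __base__=ColumnBase, **extra)

    cols = Columns()
    print(cols.anli, cols.logiqa)      # ANLI(acc) LogiQA(acc)
    print(list(Columns.model_fields))  # ['model', 'anli', 'logiqa']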
src/envs.py CHANGED
@@ -23,14 +23,17 @@ class Settings(BaseSettings):
     ]
 
     @computed_field
+    @cached_property
     def REPO_ID(self) -> str:
         return (Path(self.OWNER) / "leaderboard").as_posix()
 
     @computed_field
+    @cached_property
     def QUEUE_REPO(self) -> str:
         return (Path(self.OWNER) / "requests").as_posix()
 
     @computed_field
+    @cached_property
     def RESULTS_REPO(self) -> str:
         return (Path(self.OWNER) / "results").as_posix()
 
@@ -42,18 +45,22 @@ class Settings(BaseSettings):
     # Local caches
 
     @computed_field
+    @cached_property
     def EVAL_REQUESTS_PATH(self) -> str:
         return (Path(self.CACHE_PATH) / "eval-queue").as_posix()
 
     @computed_field
+    @cached_property
     def EVAL_RESULTS_PATH(self) -> str:
         return (Path(self.CACHE_PATH) / "eval-results").as_posix()
 
     @computed_field
+    @cached_property
     def EVAL_REQUESTS_PATH_BACKEND(self) -> str:
         return (Path(self.CACHE_PATH) / "eval-queue-bk").as_posix()
 
     @computed_field
+    @cached_property
     def EVAL_RESULTS_PATH_BACKEND(self) -> str:
         return (Path(self.CACHE_PATH) / "eval-results-bk").as_posix()
 
@@ -63,5 +70,5 @@ class Settings(BaseSettings):
         return HfApi(token=self.TOKEN)
 
 
-settings = Settings()
+settings = Settings()  # pyright: ignore[reportCallIssue]
 API = settings.API
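
The @computed_field/@cached_property stacking added above is the standard Pydantic v2 way to expose a derived, cached value; a minimal sketch with an assumed toy model (Paths, owner, repo_id are illustrative names):

    from functools import cached_property
    from pathlib import Path

    from pydantic import BaseModel, computed_field

    class Paths(BaseModel):
        owner: str = "demo-org"

        @computed_field  # included in model_dump()/serialization
        @cached_property  # computed once, then cached on the instance
        def repo_id(self) -> str:
            return (Path(self.owner) / "leaderboard").as_posix()

    p = Paths()
    print(p.repo_id)       # demo-org/leaderboard
    print(p.model_dump())  # {'owner': 'demo-org', 'repo_id': 'demo-org/leaderboard'}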
src/leaderboard/read_evals.py CHANGED
@@ -1,11 +1,17 @@
+"""Based on https://huggingface.co/spaces/demo-leaderboard-backend/leaderboard/blob/main/src/leaderboard/read_evals.py
+
+Enhanced with Pydantic models.
+"""
+
 import glob
 import json
 import os
-from dataclasses import dataclass
-from typing import Any
+from pathlib import Path
+from typing import Annotated, Any
 
-import dateutil
+import dateutil.parser
 import numpy as np
+from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Self
 
 from src.display.formatting import make_clickable_model
@@ -13,16 +19,35 @@ from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, Weigh
 from src.submission.check_validity import is_model_on_hub
 
 
-@dataclass
-class EvalResult:
+class EvalResultJson(BaseModel):
+    """Model of the eval result json file."""
+
+    model_config: ConfigDict = ConfigDict(extra="allow", frozen=True)
+
+    config: "EvalResultJson_Config"
+    results: dict[str, dict[str, float | None]]
+
+
+class EvalResultJson_Config(BaseModel):
+    """`config` in the eval result json file."""
+
+    model_config: ConfigDict = ConfigDict(extra="allow", frozen=True)
+
+    model_dtype: Annotated[str, Field(..., description="The model precision. e.g. torch.bfloat16")]
+    model_name: Annotated[str, Field(..., description="The model name. e.g. Qwen/Qwen2.5-3B")]
+    model_sha: Annotated[str, Field(description="The model sha. e.g. 3aab1f1954e9cc14eb9509a215f9e5ca08227a9b")] = ""
+    model_args: Annotated[str | None, Field(description="The model args.")] = None
+
+
+class EvalResult(BaseModel):
     """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
 
     eval_name: str  # org_model_precision (uid)
     full_model: str  # org/model (path on hub)
-    org: str
+    org: str | None
     model: str
     revision: str  # commit hash, "" if main
-    results: dict
+    results: dict[str, float]
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original  # Original or Adapter
@@ -36,16 +61,14 @@ class EvalResult:
     @classmethod
     def init_from_json_file(cls, json_filepath: str) -> Self:
         """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        config = data.get("config")
+        data = EvalResultJson.model_validate_json(Path(json_filepath).read_text())
+        config = data.config
 
         # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
+        precision = Precision.from_str(config.model_dtype)
 
         # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
+        org_and_model = config.model_name or config.model_args or ""
         org_and_model = org_and_model.split("/", 1)
 
         if len(org_and_model) == 1:
@@ -59,38 +82,38 @@ class EvalResult:
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, config.model_sha or "main", trust_remote_code=True, test_tokenizer=False
         )
-        architecture = "?"
+        architecture: str = "?"
         if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
+            architectures: list[str] | None = getattr(model_config, "architectures", None)
             if architectures:
                 architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
+        results: dict[str, float] = {}
+        for t in Tasks:
+            task = t.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            accs = np.array([v.get(task.metric, None) for k, v in data.results.items() if task.benchmark == k])
             if accs.size == 0 or any(acc is None for acc in accs):
                 continue
 
             mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return cls(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision=config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture,
-        )
+            results[task.benchmark] = float(mean_acc)
+
+        return cls.model_validate({
+            "eval_name": result_key,
+            "full_model": full_model,
+            "org": org,
+            "model": model,
+            "results": results,
+            "precision": precision,
+            "revision": config.model_sha or "",
+            "still_on_hub": still_on_hub,
+            "architecture": architecture,
+        })
 
     def update_with_request_file(self, requests_path: str) -> None:
         """Finds the relevant request file for the current model and updates info with it"""
@@ -135,7 +158,7 @@ class EvalResult:
         return data_dict
 
 
-def get_request_file_for_model(requests_path, model_name, precision):
+def get_request_file_for_model(requests_path, model_name, precision) -> str:
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
         requests_path,
@@ -166,7 +189,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         # Sort the files by date
         try:
             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
+        except dateutil.parser.ParserError:
            files = [files[-1]]
 
        for file in files:
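
The old json.load + .get() chain is replaced by schema validation; a hedged sketch of the same idea against a hand-written payload (field names mirror EvalResultJson above, the payload itself is invented):

    from pydantic import BaseModel, ConfigDict

    class Config(BaseModel):
        # protected_namespaces=() silences the Pydantic warning about model_* field names
        model_config = ConfigDict(extra="allow", protected_namespaces=())
        model_dtype: str
        model_name: str
        model_sha: str = ""

    class ResultFile(BaseModel):
        config: Config
        results: dict[str, dict[str, float | None]]

    raw = '{"config": {"model_dtype": "torch.bfloat16", "model_name": "org/model"}, "results": {"MindCube": {"acc": 0.5}}}'
    data = ResultFile.model_validate_json(raw)  # raises ValidationError if the file is malformed
    print(data.config.model_name, data.results["MindCube"]["acc"])  # org/model 0.5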
src/leaderboard/read_evals_orig.py DELETED
@@ -1,194 +0,0 @@
-import glob
-import json
-import os
-from dataclasses import dataclass
-
-import dateutil
-import numpy as np
-
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
-from src.submission.check_validity import is_model_on_hub
-
-
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
-
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
-    model: str
-    revision: str  # commit hash, "" if main
-    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    still_on_hub: bool = False
-
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-
-        config = data.get("config")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any(acc is None for acc in accs):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision=config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture,
-        )
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file) as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(
-                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
-            )
-
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum(v for v in self.results.values() if v is not None) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-        }
-
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
-        return data_dict
-
-
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file) as f:
-            req_content = json.load(f)
-            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
-                request_file = tmp_request_file
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any(not f.endswith(".json") for f in files):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict()  # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-
-    return results
src/populate.py CHANGED
@@ -23,7 +23,12 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(
+    results_path: str,
+    requests_path: str,
+    cols: list[str],
+    benchmark_cols: list[str],
+) -> pd.DataFrame:
     """
     Creates a sorted leaderboard DataFrame from evaluation results.
 
@@ -52,14 +57,14 @@
 
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
+    df = df.loc[:, cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 
 
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+def get_evaluation_queue_df(save_path: str, cols: list[str]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """
     Creates separate DataFrames for different evaluation queue statuses.
 
@@ -72,7 +77,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
        cols (list): List of column names to include in the final DataFrames
 
     Returns:
-        list[pd.DataFrame]: A list containing three DataFrames in order:
+        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three DataFrames in order:
         1. df_finished: Evaluations with status "FINISHED*" or "PENDING_NEW_EVAL"
         2. df_running: Evaluations with status "RUNNING"
        3. df_pending: Evaluations with status "PENDING" or "RERUN"
@@ -120,4 +125,4 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+    return df_finished.loc[:, cols], df_running.loc[:, cols], df_pending.loc[:, cols]
src/submission/check_validity.py CHANGED
@@ -1,19 +1,23 @@
 import json
 import os
+import typing
 from collections import defaultdict
 
-import huggingface_hub
+import huggingface_hub.errors
 from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
+if typing.TYPE_CHECKING:
+    from transformers.configuration_utils import PretrainedConfig
+
 
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:
         card = ModelCard.load(repo_id)
-    except huggingface_hub.utils.EntryNotFoundError:
+    except huggingface_hub.errors.EntryNotFoundError:
         return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
 
     # Enforce license metadata
@@ -32,8 +36,16 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
 
 def is_model_on_hub(
-    model_name: str, revision: str, token: str | None = None, trust_remote_code=False, test_tokenizer=False
-) -> tuple[bool, str]:
+    model_name: str,
+    revision: str,
+    token: str | None = None,
+    trust_remote_code=False,
+    test_tokenizer=False,
+) -> tuple[
+    bool,
+    str | None,
+    "PretrainedConfig | None",
+]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
         config = AutoConfig.from_pretrained(
uv.lock CHANGED
@@ -687,6 +687,7 @@ dependencies = [
     { name = "python-dotenv" },
     { name = "rich" },
     { name = "sentencepiece" },
+    { name = "tabulate" },
     { name = "tokenizers" },
     { name = "tqdm" },
     { name = "transformers" },
@@ -715,6 +716,7 @@ requires-dist = [
     { name = "python-dotenv", specifier = ">=1.2.1" },
     { name = "rich", specifier = ">=14.2.0" },
     { name = "sentencepiece" },
+    { name = "tabulate", specifier = ">=0.9.0" },
     { name = "tokenizers", specifier = ">=0.15.0" },
     { name = "tqdm" },
     { name = "transformers" },
@@ -1305,6 +1307,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" },
 ]
 
+[[package]]
+name = "tabulate"
+version = "0.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" },
+]
+
 [[package]]
 name = "tokenizers"
 version = "0.22.1"