Commit 60906bd
yangzhitao committed
Parent(s): fe8ec74

chores: update configurations
Files changed:
- .env.example (+1, -0)
- .vscode/cspell.json (+2, -0)
- app.py (+4, -3)
- pyproject.toml (+2, -0)
- src/envs.py (+1, -1)
- src/leaderboard/read_evals.py (+12, -10)
- src/leaderboard/read_evals_orig.py (+194, -0)
- src/populate.py (+65, -2)
- src/submission/submit.py (+3, -1)
- uv.lock (+4, -0)
.env.example CHANGED
@@ -1 +1,2 @@
 HF_TOKEN=changethis
+HF_HOME=.
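For context: HF_HOME points huggingface_hub's cache at the Space's working directory. The sketch below is illustrative only (ExampleEnv is a made-up class; the Space's real settings live in src/envs.py) and shows how both variables in this .env file could be loaded through pydantic-settings, which the project already depends on.

from pydantic_settings import BaseSettings, SettingsConfigDict


class ExampleEnv(BaseSettings):
    # Hypothetical settings class reading the .env file shown above.
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    HF_TOKEN: str = "changethis"
    HF_HOME: str = "."  # huggingface_hub resolves its cache under HF_HOME


print(ExampleEnv().model_dump())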
.vscode/cspell.json CHANGED
@@ -1,6 +1,8 @@
 {
     "words": [
+        "accs",
         "changethis",
+        "evals",
         "initialisation",
         "modelcard",
         "sentencepiece"
app.py CHANGED
@@ -1,7 +1,9 @@
 import gradio as gr
+import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
 from huggingface_hub import snapshot_download
+from rich import print
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -31,10 +33,10 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=settings.REPO_ID)
 
+print("///// --- Settings --- /////", settings.model_dump())
 
 # Space initialisation
 try:
-    print(settings.EVAL_REQUESTS_PATH)
     snapshot_download(
         repo_id=settings.QUEUE_REPO,
         local_dir=settings.EVAL_REQUESTS_PATH,
@@ -46,7 +48,6 @@ try:
 except Exception:
     restart_space()
 try:
-    print(settings.EVAL_RESULTS_PATH)
     snapshot_download(
         repo_id=settings.RESULTS_REPO,
         local_dir=settings.EVAL_RESULTS_PATH,
@@ -73,7 +74,7 @@ LEADERBOARD_DF = get_leaderboard_df(
 ) = get_evaluation_queue_df(settings.EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-def init_leaderboard(dataframe):
+def init_leaderboard(dataframe: pd.DataFrame) -> Leaderboard:
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
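For context: app.py now routes output through rich's print and dumps the settings once at startup instead of printing individual paths. A minimal sketch of that pattern (DemoSettings is invented for illustration; the Space uses its own Settings object from src.envs):

from pydantic import BaseModel
from rich import print  # shadows the builtin print, as app.py now does


class DemoSettings(BaseModel):
    REPO_ID: str = "org/space"
    QUEUE_REPO: str = "org/requests-dataset"


# rich pretty-prints the dict returned by model_dump()
print("///// --- Settings --- /////", DemoSettings().model_dump())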
pyproject.toml CHANGED
@@ -22,7 +22,9 @@ dependencies = [
     "tokenizers>=0.15.0",
     "sentencepiece",
     "python-dotenv>=1.2.1",
+    "pydantic>=2.11.10",
     "pydantic-settings>=2.11.0",
+    "rich>=14.2.0",
 ]
 
 [dependency-groups]
src/envs.py CHANGED
@@ -19,7 +19,7 @@ class Settings(BaseSettings):
     # Change to your org - don't forget to create a results and request dataset, with the correct format!
     OWNER: Annotated[
         str,
-        Field("y-playground
+        Field("y-playground"),
     ]
 
     @computed_field
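For context: the hunk above completes the previously unterminated Field default for OWNER. As a quick illustration of how a default supplied via Annotated[..., Field(...)] behaves in pydantic v2 (OwnerConfig is a made-up class, not the project's Settings):

from typing import Annotated

from pydantic import BaseModel, Field


class OwnerConfig(BaseModel):
    OWNER: Annotated[str, Field("y-playground")]  # Field's first positional argument is the default


assert OwnerConfig().OWNER == "y-playground"
assert OwnerConfig(OWNER="my-org").OWNER == "my-org"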
src/leaderboard/read_evals.py CHANGED
@@ -2,9 +2,11 @@ import glob
 import json
 import os
 from dataclasses import dataclass
+from typing import Any
 
 import dateutil
 import numpy as np
+from typing_extensions import Self
 
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
@@ -32,7 +34,7 @@ class EvalResult:
     still_on_hub: bool = False
 
     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(cls, json_filepath: str) -> Self:
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -78,7 +80,7 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
-        return self(
+        return cls(
            eval_name=result_key,
            full_model=full_model,
            org=org,
@@ -90,25 +92,25 @@ class EvalResult:
             architecture=architecture,
         )
 
-    def update_with_request_file(self, requests_path):
+    def update_with_request_file(self, requests_path: str) -> None:
         """Finds the relevant request file for the current model and updates info with it"""
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
         try:
             with open(request_file) as f:
-                request = json.load(f)
+                request: dict[str, Any] = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", ""))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
-        except Exception:
+        except Exception as e:
             print(
-                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}. Error: {e}"
             )
 
-    def to_dict(self):
+    def to_dict(self) -> dict:
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum(v for v in self.results.values() if v is not None) / len(Tasks)
         data_dict = {
@@ -154,7 +156,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
 
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
+    model_result_filepaths: list[str] = []
 
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
@@ -170,7 +172,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
 
-    eval_results = {}
+    eval_results: dict[str, EvalResult] = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
@@ -183,7 +185,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         else:
             eval_results[eval_name] = eval_result
 
-    results = []
+    results: list[EvalResult] = []
     for v in eval_results.values():
         try:
             v.to_dict()  # we test if the dict version is complete
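For context: the read_evals.py changes are type annotations only; the notable pattern is annotating the classmethod constructor with typing_extensions.Self so the return type follows the class (or subclass) it is called on. A standalone sketch of that pattern, with an invented Record class rather than the leaderboard's EvalResult:

import json
from dataclasses import dataclass

from typing_extensions import Self


@dataclass
class Record:
    name: str
    score: float

    @classmethod
    def from_json_file(cls, path: str) -> Self:
        # Self resolves to Record here, or to the subclass when called on one
        with open(path) as fp:
            data = json.load(fp)
        return cls(name=data["name"], score=data["score"])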
src/leaderboard/read_evals_orig.py ADDED
@@ -0,0 +1,194 @@
+import glob
+import json
+import os
+from dataclasses import dataclass
+
+import dateutil
+import numpy as np
+
+from src.display.formatting import make_clickable_model
+from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
+from src.submission.check_validity import is_model_on_hub
+
+
+@dataclass
+class EvalResult:
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
+    model: str
+    revision: str  # commit hash, "" if main
+    results: dict
+    precision: Precision = Precision.Unknown
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
+    license: str = "?"
+    likes: int = 0
+    num_params: int = 0
+    date: str = ""  # submission date of request file
+    still_on_hub: bool = False
+
+    @classmethod
+    def init_from_json_file(self, json_filepath):
+        """Inits the result from the specific model result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        config = data.get("config")
+
+        # Precision
+        precision = Precision.from_str(config.get("model_dtype"))
+
+        # Get model and org
+        org_and_model = config.get("model_name", config.get("model_args", None))
+        org_and_model = org_and_model.split("/", 1)
+
+        if len(org_and_model) == 1:
+            org = None
+            model = org_and_model[0]
+            result_key = f"{model}_{precision.value.name}"
+        else:
+            org = org_and_model[0]
+            model = org_and_model[1]
+            result_key = f"{org}_{model}_{precision.value.name}"
+        full_model = "/".join(org_and_model)
+
+        still_on_hub, _, model_config = is_model_on_hub(
+            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+        )
+        architecture = "?"
+        if model_config is not None:
+            architectures = getattr(model_config, "architectures", None)
+            if architectures:
+                architecture = ";".join(architectures)
+
+        # Extract results available in this file (some results are split in several files)
+        results = {}
+        for task in Tasks:
+            task = task.value
+
+            # We average all scores of a given metric (not all metrics are present in all files)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            if accs.size == 0 or any(acc is None for acc in accs):
+                continue
+
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
+
+        return self(
+            eval_name=result_key,
+            full_model=full_model,
+            org=org,
+            model=model,
+            results=results,
+            precision=precision,
+            revision=config.get("model_sha", ""),
+            still_on_hub=still_on_hub,
+            architecture=architecture,
+        )
+
+    def update_with_request_file(self, requests_path):
+        """Finds the relevant request file for the current model and updates info with it"""
+        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+
+        try:
+            with open(request_file) as f:
+                request = json.load(f)
+            self.model_type = ModelType.from_str(request.get("model_type", ""))
+            self.weight_type = WeightType[request.get("weight_type", "Original")]
+            self.license = request.get("license", "?")
+            self.likes = request.get("likes", 0)
+            self.num_params = request.get("params", 0)
+            self.date = request.get("submitted_time", "")
+        except Exception:
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
+
+    def to_dict(self):
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
+        average = sum(v for v in self.results.values() if v is not None) / len(Tasks)
+        data_dict = {
+            "eval_name": self.eval_name,  # not a column, just a save name,
+            AutoEvalColumn.precision.name: self.precision.value.name,
+            AutoEvalColumn.model_type.name: self.model_type.value.name,
+            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.architecture.name: self.architecture,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.revision.name: self.revision,
+            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.likes.name: self.likes,
+            AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+        }
+
+        for task in Tasks:
+            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+
+        return data_dict
+
+
+def get_request_file_for_model(requests_path, model_name, precision):
+    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+    request_files = os.path.join(
+        requests_path,
+        f"{model_name}_eval_request_*.json",
+    )
+    request_files = glob.glob(request_files)
+
+    # Select correct request file (precision)
+    request_file = ""
+    request_files = sorted(request_files, reverse=True)
+    for tmp_request_file in request_files:
+        with open(tmp_request_file) as f:
+            req_content = json.load(f)
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
+                request_file = tmp_request_file
+    return request_file
+
+
+def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
+
+    for root, _, files in os.walk(results_path):
+        # We should only have json files in model results
+        if len(files) == 0 or any(not f.endswith(".json") for f in files):
+            continue
+
+        # Sort the files by date
+        try:
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except dateutil.parser._parser.ParserError:
+            files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    eval_results = {}
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result.update_with_request_file(requests_path)
+
+        # Store results of same eval together
+        eval_name = eval_result.eval_name
+        if eval_name in eval_results.keys():
+            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        else:
+            eval_results[eval_name] = eval_result
+
+    results = []
+    for v in eval_results.values():
+        try:
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
+            continue
+
+    return results
src/populate.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Data population utilities for leaderboard and evaluation queue management.
+
+This module provides functions to create and populate pandas DataFrames from evaluation
+results and submission data. It handles data processing for both the main leaderboard
+display and the evaluation queue status tracking.
+
+Key Functions:
+    get_leaderboard_df: Creates a sorted leaderboard DataFrame from evaluation results
+    get_evaluation_queue_df: Creates separate DataFrames for different evaluation statuses
+
+The module processes JSON files containing evaluation results and submission metadata,
+applies formatting transformations, and filters data based on completion status.
+"""
+
 import json
 import os
 
@@ -9,7 +24,29 @@ from src.leaderboard.read_evals import get_raw_eval_results
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """
+    """
+    Creates a sorted leaderboard DataFrame from evaluation results.
+
+    This function processes raw evaluation data from JSON files and creates a pandas
+    DataFrame suitable for leaderboard display. The resulting DataFrame is sorted by
+    average performance scores in descending order and filtered to exclude incomplete
+    evaluations.
+
+    Args:
+        results_path (str): Path to the directory containing evaluation result files
+        requests_path (str): Path to the directory containing evaluation request files
+        cols (list): List of column names to include in the final DataFrame
+        benchmark_cols (list): List of benchmark column names used for filtering
+
+    Returns:
+        pd.DataFrame: A sorted and filtered DataFrame containing leaderboard data.
+            Rows are sorted by average score (descending) and filtered to
+            exclude entries with missing benchmark results.
+
+    Note:
+        The function automatically rounds numeric values to 2 decimal places and
+        filters out any entries that have NaN values in the specified benchmark columns.
+    """
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
 
@@ -23,7 +60,33 @@
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """
+    """
+    Creates separate DataFrames for different evaluation queue statuses.
+
+    This function scans a directory for evaluation submission files (both individual
+    JSON files and files within subdirectories) and categorizes them by their status.
+    It returns three separate DataFrames: finished, running, and pending evaluations.
+
+    Args:
+        save_path (str): Path to the directory containing evaluation submission files
+        cols (list): List of column names to include in the final DataFrames
+
+    Returns:
+        list[pd.DataFrame]: A list containing three DataFrames in order:
+            1. df_finished: Evaluations with status "FINISHED*" or "PENDING_NEW_EVAL"
+            2. df_running: Evaluations with status "RUNNING"
+            3. df_pending: Evaluations with status "PENDING" or "RERUN"
+
+    Note:
+        The function processes both individual JSON files and JSON files within
+        subdirectories (excluding markdown files). Model names are automatically
+        converted to clickable links, and revision defaults to "main" if not specified.
+
+        Status categorization:
+        - FINISHED: Any status starting with "FINISHED" or "PENDING_NEW_EVAL"
+        - RUNNING: Status equals "RUNNING"
+        - PENDING: Status equals "PENDING" or "RERUN"
+    """
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
 
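For context: the new get_evaluation_queue_df docstring documents how submissions are bucketed by status. A small hypothetical sketch of that categorization (records and columns are invented; the real function also reads JSON files from disk and formats model links):

import pandas as pd

all_evals = [
    {"model": "org/model-a", "status": "FINISHED"},
    {"model": "org/model-b", "status": "RUNNING"},
    {"model": "org/model-c", "status": "PENDING"},
]

finished = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
running = [e for e in all_evals if e["status"] == "RUNNING"]
pending = [e for e in all_evals if e["status"] in ("PENDING", "RERUN")]

df_finished, df_running, df_pending = (pd.DataFrame(x) for x in (finished, running, pending))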
src/submission/submit.py CHANGED
@@ -59,7 +59,9 @@ def add_new_eval(
         return styled_error(f'Base model "{base_model}" {error}')
 
     if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(
+        model_on_hub, error, _ = is_model_on_hub(
+            model_name=model, revision=revision, token=settings.TOKEN, test_tokenizer=True
+        )
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
 
uv.lock CHANGED
@@ -681,9 +681,11 @@ dependencies = [
     { name = "matplotlib" },
     { name = "numpy" },
     { name = "pandas" },
+    { name = "pydantic" },
     { name = "pydantic-settings" },
     { name = "python-dateutil" },
     { name = "python-dotenv" },
+    { name = "rich" },
     { name = "sentencepiece" },
     { name = "tokenizers" },
     { name = "tqdm" },
@@ -707,9 +709,11 @@ requires-dist = [
     { name = "matplotlib" },
     { name = "numpy" },
     { name = "pandas" },
+    { name = "pydantic", specifier = ">=2.11.10" },
     { name = "pydantic-settings", specifier = ">=2.11.0" },
     { name = "python-dateutil" },
     { name = "python-dotenv", specifier = ">=1.2.1" },
+    { name = "rich", specifier = ">=14.2.0" },
     { name = "sentencepiece" },
     { name = "tokenizers", specifier = ">=0.15.0" },
     { name = "tqdm" },