"""
Data population utilities for leaderboard and evaluation queue management.
This module provides functions to create and populate pandas DataFrames from evaluation
results and submission data. It handles data processing for both the main leaderboard
display and the evaluation queue status tracking.
Key Functions:
get_leaderboard_df: Creates a sorted leaderboard DataFrame from evaluation results
get_evaluation_queue_df: Creates separate DataFrames for different evaluation statuses
The module processes JSON files containing evaluation results and submission metadata,
applies formatting transformations, and filters data based on completion status.
"""
import json
import os

import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results


def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """
    Creates a sorted leaderboard DataFrame from evaluation results.

    This function processes raw evaluation data from JSON files and creates a pandas
    DataFrame suitable for leaderboard display. The resulting DataFrame is sorted by
    average performance scores in descending order and filtered to exclude incomplete
    evaluations.

    Args:
        results_path (str): Path to the directory containing evaluation result files
        requests_path (str): Path to the directory containing evaluation request files
        cols (list): List of column names to include in the final DataFrame
        benchmark_cols (list): List of benchmark column names used for filtering

    Returns:
        pd.DataFrame: A sorted and filtered DataFrame containing leaderboard data.
            Rows are sorted by average score (descending) and filtered to
            exclude entries with missing benchmark results.

    Note:
        The function automatically rounds numeric values to 2 decimal places and
        filters out any entries that have NaN values in the specified benchmark columns.
    """
    raw_data = get_raw_eval_results(results_path, requests_path)
    all_data_json = [v.to_dict() for v in raw_data]

    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    df = df[cols].round(decimals=2)

    # Filter out entries for which any of the benchmarks has not been produced yet
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df
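
# Usage sketch: how the leaderboard DataFrame might be built by the app. The path values and
# the COLS / BENCHMARK_COLS names are illustrative assumptions, not definitions from this
# module; in practice they would come from the project's display/column configuration.
#
#   leaderboard_df = get_leaderboard_df(
#       results_path="./eval-results",      # assumed local directory of result JSON files
#       requests_path="./eval-queue",       # assumed local directory of request JSON files
#       cols=COLS,                          # assumed list of display column names
#       benchmark_cols=BENCHMARK_COLS,      # assumed list of benchmark column names
#   )
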
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
    """
    Creates separate DataFrames for different evaluation queue statuses.

    This function scans a directory for evaluation submission files (both individual
    JSON files and files within subdirectories) and categorizes them by their status.
    It returns three separate DataFrames: finished, running, and pending evaluations.

    Args:
        save_path (str): Path to the directory containing evaluation submission files
        cols (list): List of column names to include in the final DataFrames

    Returns:
        list[pd.DataFrame]: Three DataFrames in order:
            1. df_finished: Evaluations whose status starts with "FINISHED" or equals
               "PENDING_NEW_EVAL"
            2. df_running: Evaluations with status "RUNNING"
            3. df_pending: Evaluations with status "PENDING" or "RERUN"

    Note:
        The function processes both individual JSON files and JSON files within
        subdirectories (excluding markdown files). Model names are automatically
        converted to clickable links, and revision defaults to "main" if not specified.

        Status categorization:
            - FINISHED: any status starting with "FINISHED", or "PENDING_NEW_EVAL"
            - RUNNING: status equals "RUNNING"
            - PENDING: status equals "PENDING" or "RERUN"
    """
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")

            all_evals.append(data)
        elif ".md" not in entry:
            # This is a folder: read every (non-hidden) submission file inside it
            sub_dir = os.path.join(save_path, entry)
            sub_entries = [
                e for e in os.listdir(sub_dir) if os.path.isfile(os.path.join(sub_dir, e)) and not e.startswith(".")
            ]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
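
# Usage sketch: how the queue DataFrames might be consumed by the app UI. The save path and
# the EVAL_COLS name are illustrative assumptions; in practice they would come from the
# project's envs/column configuration rather than from this module.
#
#   finished_df, running_df, pending_df = get_evaluation_queue_df(
#       save_path="./eval-queue",   # assumed local directory of submission JSON files
#       cols=EVAL_COLS,             # assumed list of queue display column names
#   )
#   print(len(finished_df), len(running_df), len(pending_df))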