MMTU leaderboard init
Ubuntu committed · commit d8eb71c · 1 parent: 2b848e2

Files changed:
- README.md +1 -1
- app.py +52 -130
- results.json +82 -0
- src/about.py +25 -7
- src/display/utils.py +14 -13
- src/envs.py +7 -7
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: MMTU Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
app.py
CHANGED
@@ -10,6 +10,7 @@ from src.about import (
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
+    SUBMIT_INTRODUCTION,
     TITLE,
 )
 from src.display.css_html_js import custom_css
@@ -24,38 +25,43 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API,
+from src.envs import API, REPO_ID, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+# from src.submission.submit import add_new_eval


 def restart_space():
     API.restart_space(repo_id=REPO_ID)

-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+# ### Space initialisation
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
+# try:
+#     print(EVAL_RESULTS_PATH)
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
+
+def get_df():
+    df = pd.read_json("results.json")
+    df.sort_values(by="Overall", ascending=False, inplace=True)
+    return df
+
+
+LEADERBOARD_DF = get_df()
+
+# (
+#     finished_eval_queue_df,
+#     running_eval_queue_df,
+#     pending_eval_queue_df,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -68,22 +74,22 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
+        search_columns=[AutoEvalColumn.model.name],
+        # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        # filter_columns=[
+        #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+        #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+        #     ColumnFilter(
+        #         AutoEvalColumn.params.name,
+        #         type="slider",
+        #         min=0.01,
+        #         max=150,
+        #         label="Select the number of parameters (B)",
+        #     ),
+        #     ColumnFilter(
+        #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+        #     ),
+        # ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
@@ -95,101 +101,17 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅
+        with gr.TabItem("🏅 MMTU", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)

         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("🚀 Submit here!
-            with gr.Row():
-                gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-            with gr.Column():
-                with gr.Accordion(
-                    f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                    open=False,
-                ):
-                    with gr.Row():
-                        finished_eval_table = gr.components.Dataframe(
-                            value=finished_eval_queue_df,
-                            headers=EVAL_COLS,
-                            datatype=EVAL_TYPES,
-                            row_count=5,
-                        )
-                with gr.Accordion(
-                    f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                    open=False,
-                ):
-                    with gr.Row():
-                        running_eval_table = gr.components.Dataframe(
-                            value=running_eval_queue_df,
-                            headers=EVAL_COLS,
-                            datatype=EVAL_TYPES,
-                            row_count=5,
-                        )
-
-                with gr.Accordion(
-                    f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                    open=False,
-                ):
-                    with gr.Row():
-                        pending_eval_table = gr.components.Dataframe(
-                            value=pending_eval_queue_df,
-                            headers=EVAL_COLS,
-                            datatype=EVAL_TYPES,
-                            row_count=5,
-                        )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
+        with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
+            gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

     with gr.Row():
-        with gr.Accordion("📙 Citation", open=
+        with gr.Accordion("📙 Citation", open=True):
             citation_button = gr.Textbox(
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
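Note: the app.py hunks above show only fragments of `init_leaderboard`. For context, below is a minimal sketch of how the full function likely reads after this commit, assuming the standard Hugging Face `gradio_leaderboard` template this Space is built from (the `Leaderboard`, `SelectColumns`, and `fields` helpers are template names that do not appear in the diff itself):

```python
# Sketch only — reconstructed from the leaderboard template, not part of this commit.
from gradio_leaderboard import Leaderboard, SelectColumns

from src.display.utils import AutoEvalColumn, fields  # template helpers assumed to exist


def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        # Only the model name is searchable; hide_columns and filter_columns are
        # commented out in this commit because the MMTU results table is small.
        search_columns=[AutoEvalColumn.model.name],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
```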
results.json
ADDED
@@ -0,0 +1,82 @@
+[
+    {
+        "Model": "o4-mini (2024-11-20)",
+        "Model type": "Reasoning",
+        "Model size": "unknown",
+        "Overall": 0.639,
+        "Data Source": "MMTU",
+        "Date": "2025-06-10"
+    },
+    {
+        "Model": "Deepseek-R1",
+        "Model type": "Reasoning",
+        "Model size": "unknown",
+        "Overall": 0.596,
+        "Data Source": "MMTU",
+        "Date": "2025-06-10"
+    },
+    {
+        "Model": "Deepseek-V3",
+        "Model type": "Chat",
+        "Model size": "unknown",
+        "Overall": 0.517,
+        "Data Source": "MMTU",
+        "Date": "2025-06-10"
+    },
+    {
+        "Model": "GPT-4o (2024-11-20)",
+        "Model type": "Chat",
+        "Model size": "unknown",
+        "Overall": 0.491,
+        "Data Source": "MMTU",
+        "Date": "2025-06-10"
+    },
+    {
+        "Model": "Llama-3.3-70B",
+        "Model type": "Chat",
+        "Model size": "70B",
+        "Overall": 0.438,
+        "Data Source": "MMTU",
+        "Date": "2025-06-10"
+    },
+    {
+        "Model": "Mistral-Large-2411",
+        "Model type": "Chat",
+        "Model size": "123B",
+        "Overall": 0.430,
+        "Data Source": "MMTU",
+        "Date": "2025-06-10"
+    },
+    {
+        "Model": "Llama-3.1-8B",
+        "Model type": "Chat",
+        "Model size": "8B",
+        "Overall": 0.259,
+        "Data Source": "MMTU",
+        "Date": "2025-06-10"
+    },
+    {
+        "Model": "Mistral-Small-2503",
+        "Model type": "Chat",
+        "Model size": "70B",
+        "Overall": 0.426,
+        "Data Source": "MMTU",
+        "Date": "2025-06-10"
+    },
+    {
+        "Model": "Llama-3.3-8B-Instruct",
+        "Model type": "Chat",
+        "Model size": "24B",
+        "Overall": 0.402,
+        "Data Source": "MMTU",
+        "Date": "2025-06-10"
+    },
+    {
+        "Model": "GPT-4o-mini (2024-07-18)",
+        "Model type": "Chat",
+        "Model size": "unknown",
+        "Overall": 0.386,
+        "Data Source": "MMTU",
+        "Date": "2025-06-10"
+    }
+]
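Note: `app.py` loads this file directly into the leaderboard DataFrame, so every record must carry the columns declared for `AutoEvalColumn` in `src/display/utils.py` below. A small, hypothetical sanity check (not part of the commit) that new result entries could be run through before they are added:

```python
import json

# Column names as declared for AutoEvalColumn in src/display/utils.py.
EXPECTED_KEYS = {"Model", "Overall", "Data Source", "Date", "Model type", "Model size"}

with open("results.json") as f:
    records = json.load(f)

for record in records:
    missing = EXPECTED_KEYS - record.keys()
    extra = record.keys() - EXPECTED_KEYS
    assert not missing, f"{record.get('Model', '?')}: missing keys {missing}"
    assert not extra, f"{record.get('Model', '?')}: unexpected keys {extra}"
    # Overall is reported as a fraction, e.g. 0.639 for o4-mini.
    assert 0.0 <= record["Overall"] <= 1.0
```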
src/about.py
CHANGED
@@ -21,20 +21,28 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">MMTU leaderboard</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
+|[**🤗 Dataset**](https://huggingface.co/datasets/MMTU-benchmark/MMTU) |[**🛠️ GitHub**](https://github.com/MMTU-Benchmark/MMTU/tree/main) |[**🏆 Leaderboard**](https://huggingface.co/spaces/MMTU-benchmark/mmtu-leaderboard) |[**📖 Paper**](https://arxiv.org/abs/2506.05587) |
+
+Tables and table-based use cases play a crucial role in many real-world applications, such as spreadsheets, databases, and computational notebooks, which traditionally require expert-level users like data engineers, analysts, and database administrators to operate. Although LLMs have shown remarkable progress in working with tables, comprehensive benchmarking of such capabilities remains limited, often narrowly focusing on tasks like NL-to-SQL and Table-QA, while overlooking the broader spectrum of real-world tasks that professional users face today.
+
+We introduce **MMTU**, a large-scale benchmark with over **30K questions** across **25 real-world table tasks**, designed to comprehensively evaluate models' ability to understand, reason over, and manipulate real tables at the expert level. These tasks are drawn from decades' worth of computer science research on tabular data, with a focus on complex table tasks faced by professional users. We show that MMTU requires a combination of skills -- including table understanding, reasoning, and coding -- that remain challenging for today's frontier models, where even frontier reasoning models like OpenAI o4-mini and DeepSeek R1 score only around 60%, suggesting significant room for improvement. Our evaluation code is available on [GitHub](https://github.com/MMTU-Benchmark/MMTU/tree/main).
+
 """

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
+Please visit the [**🤗 Dataset**](https://huggingface.co/datasets/MMTU-benchmark/MMTU) and the [**📖 Paper**](https://arxiv.org/abs/2506.05587) for the full list of tasks and their descriptions.
+
 """

+SUBMIT_INTRODUCTION = """# Submit to the MMTU Leaderboard
+## ⚠ Please note that you need to submit a JSONL file with your model's output.
+
+You can generate an output file using the evaluation script provided in our GitHub repository. For your convenience, the script and detailed instructions are available at https://github.com/MMTU-Benchmark/MMTU. After generating the file, please send us an email at [email protected] with the output file attached.
+"""
+
 EVALUATION_QUEUE_TEXT = """
@@ -67,6 +75,16 @@ Make sure you have followed the above steps first.
 If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """

-CITATION_BUTTON_LABEL = "Copy the following snippet to cite
-CITATION_BUTTON_TEXT =
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite MMTU"
+CITATION_BUTTON_TEXT = \
+r"""
+@misc{xing2025mmtumassivemultitasktable,
+      title={MMTU: A Massive Multi-Task Table Understanding and Reasoning Benchmark},
+      author={Junjie Xing and Yeye He and Mengyu Zhou and Haoyu Dong and Shi Han and Lingjiao Chen and Dongmei Zhang and Surajit Chaudhuri and H. V. Jagadish},
+      year={2025},
+      eprint={2506.05587},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI},
+      url={https://arxiv.org/abs/2506.05587},
+}
 """
src/display/utils.py
CHANGED
@@ -23,22 +23,23 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall", "number", True)])
+# for task in Tasks:
+#     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
+auto_eval_column_dict.append(["data_source", ColumnContent, ColumnContent("Data Source", "str", True)])
+auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("Date", "str", True)])
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Model type", "str", False)])
+auto_eval_column_dict.append(["model_size", ColumnContent, ColumnContent("Model size", "str", False)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
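Note: each `auto_eval_column_dict` entry becomes a field of the dynamically built `AutoEvalColumn` dataclass, with its `ColumnContent` stored as the field default on the class, which is how `app.py` reads column names and types without instantiating anything. A minimal, self-contained sketch of that mechanism, assuming the template's `ColumnContent` definition:

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:  # assumed to match the template's definition
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


# Two of the entries added by this commit, in the same [attr_name, type, default] shape.
auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["overall", ColumnContent, ColumnContent("Overall", "number", True)],
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

# The ColumnContent defaults live on the generated class itself.
print(AutoEvalColumn.model.name)    # -> "Model"
print(AutoEvalColumn.overall.type)  # -> "number"
```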
src/envs.py
CHANGED
@@ -6,20 +6,20 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

-OWNER = "
+OWNER = "MMTU-benchmark" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------

 REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+# QUEUE_REPO = f"{OWNER}/requests"
+# RESULTS_REPO = f"{OWNER}/results"

 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")

 # Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+# EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+# EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+# EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+# EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

 API = HfApi(token=TOKEN)