{
"suite_config": {
"name": "asta-bench",
"version": "1.0.0-dev1",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "arxivdigestables_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "sqa_dev",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "litqa2_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "paper_finder_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/macro_avg",
"tags": [
"lit"
]
},
{
"name": "discoverybench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "core_bench_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "evaluate_task_questions/accuracy",
"tags": [
"code"
]
},
{
"name": "ds1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "e2e_discovery_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "super_validation",
"path": "astabench/super_validation",
"primary_metric": "check_super_execution/entrypoints",
"tags": [
"code"
]
}
]
},
{
"name": "test",
"tasks": [
{
"name": "paper_finder_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/macro_avg",
"tags": [
"lit"
]
},
{
"name": "sqa_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "arxivdigestables_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "litqa2_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "discoverybench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "core_bench_test",
"path": "astabench/core_bench_test",
"primary_metric": "evaluate_task_questions/accuracy",
"tags": [
"code"
]
},
{
"name": "ds1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "e2e_discovery_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "super_test",
"path": "astabench/super_test",
"primary_metric": "check_super_execution/entrypoints",
"tags": [
"code"
]
}
]
}
]
},
"split": "validation",
"results": [
{
"task_name": "sqa_dev",
"metrics": [
{
"name": "global_avg/mean",
"value": 0.6215245045241414
},
{
"name": "global_avg/stderr",
"value": 0.02088486499225903
},
{
"name": "ingredient_recall/mean",
"value": 0.6029178145087237
},
{
"name": "ingredient_recall/stderr",
"value": 0.026215888361291618
},
{
"name": "answer_precision/mean",
"value": 0.7960436785436785
},
{
"name": "answer_precision/stderr",
"value": 0.027692773517249983
},
{
"name": "citation_precision/mean",
"value": 0.697849041353826
},
{
"name": "citation_precision/stderr",
"value": 0.026784164936602798
},
{
"name": "citation_recall/mean",
"value": 0.3892874836903378
},
{
"name": "citation_recall/stderr",
"value": 0.015094770200171756
}
],
"model_costs": [
1.3829150000000001,
0.9759700000000001,
2.2324650000000004,
0.76631,
0.9277900000000001,
2.6388600000000006,
0.8114100000000002,
2.3263174999999996,
2.5423725,
1.2398675000000001,
1.7387300000000003,
1.2176599999999997,
0.564655,
0.9726750000000001,
0.7675700000000001,
1.5198850000000002,
1.4726625000000002,
2.1937650000000004,
0.6907700000000001,
1.39835,
1.2598175,
2.5373550000000002,
2.19239,
1.2508875000000006,
2.2650550000000007,
1.6047725,
0.6525125000000003,
1.4262200000000003,
1.0533299999999999,
1.7252375,
1.407145,
1.5408700000000004,
2.8073224999999993,
1.0448125000000006,
1.7037300000000004,
0.8650500000000001,
1.0171225000000002,
0.5697925000000001,
2.7851025,
1.0551425,
2.9213775,
1.7772975000000004,
1.2753225000000001,
0.8108325000000001,
0.6958375000000001,
0.8840950000000003,
1.2028724999999998,
1.2490475000000003,
2.4272,
1.95026,
1.5352475,
2.11181,
2.3612249999999997,
1.8619225000000004,
0.7431075000000001,
1.5189675000000002,
1.089575,
1.6103700000000003,
1.4201450000000002,
2.397835,
1.469175,
1.0723550000000004,
0.7964050000000003,
3.3733175,
4.197085,
4.2637675,
1.2982124999999998,
0.66146,
1.1130475000000002,
2.4393974999999997,
2.582,
1.7381725000000001,
0.415025,
1.6777325,
1.0507825000000002,
2.4627125000000003,
1.017005,
1.9210250000000002,
1.5009025000000003,
0.8283125000000001,
2.9854425,
0.4633375000000001,
0.397685,
1.2803425,
3.0388200000000003,
1.2610875000000004,
1.798365,
3.427287500000001,
0.29307750000000005,
0.37101249999999997,
2.8046925000000003,
0.35557000000000005,
3.5481700000000007,
1.1073975,
1.5280825,
1.1714900000000001,
3.1791275000000003,
3.8214725000000005,
1.8440275,
1.730515,
1.9350675000000002,
1.6592125000000002,
1.9227124999999998,
1.202885,
1.2688150000000002,
0.8819875000000001,
0.6989325,
1.965635,
1.7467800000000002,
1.6940625000000002
]
}
],
"submission": {
"submit_time": "2025-06-09T20:55:35.869831Z",
"username": "miked-ai",
"agent_name": "Basic ReAct",
"agent_description": null,
"agent_url": null,
"logs_url": "hf://datasets/allenai/asta-bench-internal-submissions/1.0.0-dev1/validation/miked-ai_Basic_ReAct__task_tools__report_editor__2025-06-09T20-55-35",
"logs_url_public": null,
"summary_url": null
}
}