{ "suite_config": { "name": "asta-bench", "version": "1.0.0-dev1", "splits": [ { "name": "validation", "tasks": [ { "name": "arxivdigestables_validation", "path": "astabench/arxivdigestables_validation", "primary_metric": "score_tables/mean", "tags": [ "lit" ] }, { "name": "sqa_dev", "path": "astabench/sqa_dev", "primary_metric": "global_avg/mean", "tags": [ "lit" ] }, { "name": "litqa2_validation", "path": "astabench/litqa2_validation", "primary_metric": "is_correct/accuracy", "tags": [ "lit" ] }, { "name": "paper_finder_validation", "path": "astabench/paper_finder_validation", "primary_metric": "score_paper_finder/macro_avg", "tags": [ "lit" ] }, { "name": "discoverybench_validation", "path": "astabench/discoverybench_validation", "primary_metric": "score_discoverybench/mean", "tags": [ "data" ] }, { "name": "core_bench_validation", "path": "astabench/core_bench_validation", "primary_metric": "evaluate_task_questions/accuracy", "tags": [ "code" ] }, { "name": "ds1000_validation", "path": "astabench/ds1000_validation", "primary_metric": "ds1000_scorer/accuracy", "tags": [ "code" ] }, { "name": "e2e_discovery_validation", "path": "astabench/e2e_discovery_validation", "primary_metric": "score_rubric/accuracy", "tags": [ "discovery" ] }, { "name": "super_validation", "path": "astabench/super_validation", "primary_metric": "check_super_execution/entrypoints", "tags": [ "code" ] } ] }, { "name": "test", "tasks": [ { "name": "paper_finder_test", "path": "astabench/paper_finder_test", "primary_metric": "score_paper_finder/macro_avg", "tags": [ "lit" ] }, { "name": "sqa_test", "path": "astabench/sqa_test", "primary_metric": "global_avg/mean", "tags": [ "lit" ] }, { "name": "arxivdigestables_test", "path": "astabench/arxivdigestables_test", "primary_metric": "score_tables/mean", "tags": [ "lit" ] }, { "name": "litqa2_test", "path": "astabench/litqa2_test", "primary_metric": "is_correct/accuracy", "tags": [ "lit" ] }, { "name": "discoverybench_test", "path": "astabench/discoverybench_test", "primary_metric": "score_discoverybench/mean", "tags": [ "data" ] }, { "name": "core_bench_test", "path": "astabench/core_bench_test", "primary_metric": "evaluate_task_questions/accuracy", "tags": [ "code" ] }, { "name": "ds1000_test", "path": "astabench/ds1000_test", "primary_metric": "ds1000_scorer/accuracy", "tags": [ "code" ] }, { "name": "e2e_discovery_test", "path": "astabench/e2e_discovery_test", "primary_metric": "score_rubric/accuracy", "tags": [ "discovery" ] }, { "name": "super_test", "path": "astabench/super_test", "primary_metric": "check_super_execution/entrypoints", "tags": [ "code" ] } ] } ] }, "split": "validation", "results": [ { "task_name": "sqa_dev", "metrics": [ { "name": "global_avg/mean", "value": 0.6215245045241414 }, { "name": "global_avg/stderr", "value": 0.02088486499225903 }, { "name": "ingredient_recall/mean", "value": 0.6029178145087237 }, { "name": "ingredient_recall/stderr", "value": 0.026215888361291618 }, { "name": "answer_precision/mean", "value": 0.7960436785436785 }, { "name": "answer_precision/stderr", "value": 0.027692773517249983 }, { "name": "citation_precision/mean", "value": 0.697849041353826 }, { "name": "citation_precision/stderr", "value": 0.026784164936602798 }, { "name": "citation_recall/mean", "value": 0.3892874836903378 }, { "name": "citation_recall/stderr", "value": 0.015094770200171756 } ], "model_costs": [ 1.3829150000000001, 0.9759700000000001, 2.2324650000000004, 0.76631, 0.9277900000000001, 2.6388600000000006, 0.8114100000000002, 2.3263174999999996, 2.5423725, 1.2398675000000001, 1.7387300000000003, 1.2176599999999997, 0.564655, 0.9726750000000001, 0.7675700000000001, 1.5198850000000002, 1.4726625000000002, 2.1937650000000004, 0.6907700000000001, 1.39835, 1.2598175, 2.5373550000000002, 2.19239, 1.2508875000000006, 2.2650550000000007, 1.6047725, 0.6525125000000003, 1.4262200000000003, 1.0533299999999999, 1.7252375, 1.407145, 1.5408700000000004, 2.8073224999999993, 1.0448125000000006, 1.7037300000000004, 0.8650500000000001, 1.0171225000000002, 0.5697925000000001, 2.7851025, 1.0551425, 2.9213775, 1.7772975000000004, 1.2753225000000001, 0.8108325000000001, 0.6958375000000001, 0.8840950000000003, 1.2028724999999998, 1.2490475000000003, 2.4272, 1.95026, 1.5352475, 2.11181, 2.3612249999999997, 1.8619225000000004, 0.7431075000000001, 1.5189675000000002, 1.089575, 1.6103700000000003, 1.4201450000000002, 2.397835, 1.469175, 1.0723550000000004, 0.7964050000000003, 3.3733175, 4.197085, 4.2637675, 1.2982124999999998, 0.66146, 1.1130475000000002, 2.4393974999999997, 2.582, 1.7381725000000001, 0.415025, 1.6777325, 1.0507825000000002, 2.4627125000000003, 1.017005, 1.9210250000000002, 1.5009025000000003, 0.8283125000000001, 2.9854425, 0.4633375000000001, 0.397685, 1.2803425, 3.0388200000000003, 1.2610875000000004, 1.798365, 3.427287500000001, 0.29307750000000005, 0.37101249999999997, 2.8046925000000003, 0.35557000000000005, 3.5481700000000007, 1.1073975, 1.5280825, 1.1714900000000001, 3.1791275000000003, 3.8214725000000005, 1.8440275, 1.730515, 1.9350675000000002, 1.6592125000000002, 1.9227124999999998, 1.202885, 1.2688150000000002, 0.8819875000000001, 0.6989325, 1.965635, 1.7467800000000002, 1.6940625000000002 ] } ], "submission": { "submit_time": "2025-06-09T20:55:35.869831Z", "username": "miked-ai", "agent_name": "Basic ReAct", "agent_description": null, "agent_url": null, "logs_url": "hf://datasets/allenai/asta-bench-internal-submissions/1.0.0-dev1/validation/miked-ai_Basic_ReAct__task_tools__report_editor__2025-06-09T20-55-35", "logs_url_public": null, "summary_url": null } }