xhluca committed
Commit · 271f965
Parent(s): 28a9c15

add human annotation

Browse files:
- annotations.csv +0 -0
- demo.py +54 -0
annotations.csv ADDED
The diff for this file is too large to render. See raw diff.
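The annotation schema is not visible here, but the demo.py changes below read this file with csv.DictReader, index rows by benchmark, model_name, and task_id, and display trajectory_success, trajectory_side_effect, and trajectory_looping. A minimal sketch of loading the file; the example row values are hypothetical, only the field names come from the commit:

import csv

with open("annotations.csv", "r") as f:
    rows = list(csv.DictReader(f))

# hypothetical example row (field names from demo.py, values illustrative):
# {'benchmark': 'webarena', 'model_name': 'gpt-4o', 'task_id': 'webarena.0',
#  'trajectory_success': '1', 'trajectory_side_effect': '0', 'trajectory_looping': '0'}
print(rows[0])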
demo.py CHANGED
@@ -1,4 +1,6 @@
 import ast
+import csv
+from textwrap import dedent
 import pyparsing as pp
 from dataclasses import dataclass
 from typing import Any
@@ -474,16 +476,54 @@ def get_message_from_rule_based(judgment):
 
     return output
 
+def records_to_dict(records, key_order: list = ['benchmark', 'model_name', 'task_id']):
+    """
+    Convert a list of records to a nested dict, with key order
+    The depth of the dict is determined by the number of keys in key_order.
+    """
+
+    result = {}
+
+    for record in records:
+        # get the keys in the order of key_order
+        keys = [record[key] for key in key_order]
+        # create a nested dict
+        d = result
+        for key in keys[:-1]:
+            if key not in d:
+                d[key] = {}
+            d = d[key]
+        # set the value
+        d[keys[-1]] = record
+
+    return result
+
+def format_annotation(annotation):
+    annotation_str = dedent(f"""
+    Success: {annotation['trajectory_success']}
+    Side Effect: {annotation['trajectory_side_effect']}
+    Looping: {annotation['trajectory_looping']}
+    """)
+    return annotation_str.strip()
+
 
 base_traj_dir = "trajectories/cleaned"
 base_screenshot_dir = "trajectories/screenshots"
 base_judgments_dir = "trajectories/judgments"
+annotations_path = "./annotations.csv"
 
 base_traj_dir = Path(base_traj_dir)
 base_screenshot_dir = Path(base_screenshot_dir)
 
 hl_action_parser = _build_highlevel_action_parser()
 
+# load annotations as records via csv
+with open(annotations_path, "r") as f:
+    annotations = list(csv.DictReader(f))
+annotations_dict = records_to_dict(annotations, key_order=['benchmark', 'model_name', 'task_id'])
+
+# convert the annotations to a dict, with key order
+
 with gr.Blocks(title="AgentRewardBench Demo") as demo:
     gr.Markdown(
         """
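records_to_dict builds one dict level per key in key_order, so the loaded annotations can be looked up as annotations_dict[benchmark][model_name][task_id]. A quick sketch of the shape it produces; the benchmark and model names below are illustrative, not taken from the commit:

records = [
    {"benchmark": "webarena", "model_name": "gpt-4o", "task_id": "webarena.0",
     "trajectory_success": "1", "trajectory_side_effect": "0", "trajectory_looping": "0"},
    {"benchmark": "webarena", "model_name": "gpt-4o", "task_id": "webarena.1",
     "trajectory_success": "0", "trajectory_side_effect": "0", "trajectory_looping": "1"},
]
nested = records_to_dict(records)

# one level per key in key_order: benchmark -> model_name -> task_id -> record
assert nested["webarena"]["gpt-4o"]["webarena.1"]["trajectory_looping"] == "1"

# format_annotation renders a record as the three-line summary shown in the UI
print(format_annotation(nested["webarena"]["gpt-4o"]["webarena.0"]))
# Success: 1
# Side Effect: 0
# Looping: 0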
@@ -552,6 +592,20 @@ with gr.Blocks(title="AgentRewardBench Demo") as demo:
         value=default_judges,
     )
 
+    # get annotation for the task from annotations_dict
+    @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd])
+    def render_annotation(benchmark, agent, task_id):
+        bench_full = benchmarks_inverse[benchmark]
+        agent_full = agents_inverse[agent]
+        task_full = tasks_dict[bench_full]
+        task_id_full = f"{task_full}.{task_id}"
+        # get the annotation
+        annotation = annotations_dict[bench_full][agent_full][task_id_full]
+        annotation_str = format_annotation(annotation)
+
+        gr.Textbox(label="Expert Annotation", value=annotation_str, lines=3)
+
+
     @gr.render(inputs=[benchmark_dd, model_dd, task_id_dd, judge_dd])
     def render_judge(benchmark, agent, task_id, judge_choices):
         # load judgments
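The new render_annotation callback relies on Gradio's dynamic rendering: components created inside a function decorated with @gr.render(inputs=[...]) are destroyed and rebuilt whenever any listed input changes. A self-contained sketch of that pattern, assuming a recent Gradio version with @gr.render support; the component names and data here are illustrative:

import gradio as gr

# stand-in for annotations_dict, keyed the same way as in the demo
annotations = {("webarena", "0"): "Success: 1\nSide Effect: 0\nLooping: 0"}

with gr.Blocks() as sketch:
    benchmark_dd = gr.Dropdown(choices=["webarena"], value="webarena", label="Benchmark")
    task_id_dd = gr.Dropdown(choices=["0"], value="0", label="Task ID")

    # the decorated function re-runs, and its components are re-created,
    # every time one of the listed inputs changes
    @gr.render(inputs=[benchmark_dd, task_id_dd])
    def show_annotation(benchmark, task_id):
        gr.Textbox(label="Expert Annotation", value=annotations[(benchmark, task_id)], lines=3)

sketch.launch()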