Spaces:

Stremly
/

uitars

Running

App Files Community

Stremly committed on Jun 27

Commit

9a0d6b1

verified ·

1 Parent(s): 15c1569

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -63

app.py CHANGED Viewed

@@ -62,72 +62,72 @@ def navigate(screenshot, task: str, platform: str, history):
     # ───────────────────── normalise history input ──────────────────────────
-    try:
-        messages=[]
-        if isinstance(history, str):
-            try:
-                messages= ast.literal_eval(history)
-            except Exception as exc:
-                raise ValueError("`history` must be a JSON/Python list: " + str(exc))
-        else:
-            messages = history
-        prompt_header = (
-                "You are a GUI agent. You are given a task and your action history, with screenshots."
-                "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nleft_double(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nright_single(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\ndrag(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', end_box='\u003c|box_start|\u003e(x3, y3)\u003c|box_end|\u003e')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
-                f"## User Instruction\n{task}"
-            )
-        current = {"role":"user","content":[{"type":"text","text":prompt_header},{"type": "image_url", "image_url":screenshot}]}
-        messages.append(current)
-        # ─────────────────────────── model forward ─────────────────────────────
-        images, videos = process_vision_info(messages)
-        i=0
-        for message in messages:
-            if message['role'] == 'user' and isinstance(message.get('content'), list):
-                for item in message['content']:
-                    if item.get('type') == 'image_url' and isinstance(item.get('image_url'), str):
-                        item['image_url'] = images[i]
-                        i+=1
-        text = processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
         )
-        print("\nimages\n:",images)
-        print("\ntext\n",text)
-        print("\nmessages\n",messages)
-        inputs = processor(
-            text=[text],
-            images=images,
-            videos=videos,
-            padding=True,
-            return_tensors="pt",
-        ).to("cuda")
-        generated = model.generate(**inputs, max_new_tokens=128)
-        trimmed = [
-            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated)
-        ]
-        raw_out = processor.batch_decode(
-            trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )[0]
-        # ─────── draw predicted click for quick visual verification (optional) ──────
-        try:
-            actions = ast.literal_eval(raw_out)
-            for act in actions if isinstance(actions, list) else [actions]:
-                pos = act.get("position")
-                if pos and isinstance(pos, list) and len(pos) == 2:
-                    screenshot = draw_point(screenshot, pos)
-        except Exception:
-            # decoding failed → just return original screenshot
-            pass
-        return screenshot, raw_out, messages
 # ────────────────────────── Gradio interface ───────────────────────────────

     # ───────────────────── normalise history input ──────────────────────────
+    messages=[]
+    if isinstance(history, str):
+        try:
+            messages= ast.literal_eval(history)
+        except Exception as exc:
+            raise ValueError("`history` must be a JSON/Python list: " + str(exc))
+    else:
+        messages = history
+    prompt_header = (
+            "You are a GUI agent. You are given a task and your action history, with screenshots."
+            "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nleft_double(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\nright_single(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e')\ndrag(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', end_box='\u003c|box_start|\u003e(x3, y3)\u003c|box_end|\u003e')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='\u003c|box_start|\u003e(x1, y1)\u003c|box_end|\u003e', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.\n\n"
+            f"## User Instruction\n{task}"
         )
+    current = {"role":"user","content":[{"type":"text","text":prompt_header},{"type": "image_url", "image_url":screenshot}]}
+    messages.append(current)
+    # ─────────────────────────── model forward ─────────────────────────────
+    images, videos = process_vision_info(messages)
+    i=0
+    for message in messages:
+        if message['role'] == 'user' and isinstance(message.get('content'), list):
+            for item in message['content']:
+                if item.get('type') == 'image_url' and isinstance(item.get('image_url'), str):
+                    item['image_url'] = images[i]
+                    i+=1
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    print("\nimages\n:",images)
+    print("\ntext\n",text)
+    print("\nmessages\n",messages)
+    inputs = processor(
+        text=[text],
+        images=images,
+        videos=videos,
+        padding=True,
+        return_tensors="pt",
+    ).to("cuda")
+    generated = model.generate(**inputs, max_new_tokens=128)
+    trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated)
+    ]
+    raw_out = processor.batch_decode(
+        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+    # ─────── draw predicted click for quick visual verification (optional) ──────
+    try:
+        actions = ast.literal_eval(raw_out)
+        for act in actions if isinstance(actions, list) else [actions]:
+            pos = act.get("position")
+            if pos and isinstance(pos, list) and len(pos) == 2:
+                screenshot = draw_point(screenshot, pos)
+    except Exception:
+        # decoding failed → just return original screenshot
+        pass
+    return screenshot, raw_out, messages
 # ────────────────────────── Gradio interface ───────────────────────────────