Spaces:

Stremly
/

uitars

Runtime error

App Files Community

Abs6187 committed on Jul 8

Commit

3b19b54

1 Parent(s): 7860e5b

Added New UI

Browse files

Files changed (1) hide show

app.py +11 -27

app.py CHANGED Viewed

@@ -8,7 +8,6 @@ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info # Make sure this file is in your repository
 # --- Model and Processor Initialization ---
-# This setup is standard and remains unchanged.
 _MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "ByteDance-Seed/UI-TARS-1.5-7B",
     device_map="auto",
@@ -28,7 +27,6 @@ def draw_point(image: Image.Image, point=None, radius: int = 15):
     if point and isinstance(point, list) and len(point) == 2:
         x, y = point[0] * img.width, point[1] * img.height
         draw = ImageDraw.Draw(img)
-        # Draw a larger ellipse for better visibility on high-res screens
         draw.ellipse(
             (x - radius, y - radius, x + radius, y + radius), fill="rgba(255, 0, 0, 180)", outline="white", width=2
         )
@@ -38,24 +36,23 @@ def draw_point(image: Image.Image, point=None, radius: int = 15):
 def navigate(screenshot, task: str):
     """Runs a single inference step of the GUI reasoning model."""
     if not screenshot or not task:
-        # Added basic validation to prevent errors with empty inputs
         return None, "Please provide both a screenshot and a task.", []
     messages = []
-    # --- KEY CHANGE: Refined Prompt for Concise Reasoning ---
-    # The 'Note' section is updated to guide the model towards a shorter, more direct "Thought" process.
     prompt_header = (
-        "You are a GUI agent. You are given a task and a screenshot. Your goal is to determine the next action.\n\n"
-        "## Output Format\n```\nThought: ...\nAction: ...\n```\n\n"
         "## Action Space\n"
         "click(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
         "type(content='...')\n"
         "scroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='...')\n"
         "finished(content='...')\n\n"
-        "## Note\n"
-        "- In the `Thought` part, briefly state your reasoning in a single, direct sentence.\n"
-        "- Always use 'win' instead of 'meta' for hotkeys.\n\n"
         f"## User Instruction\n{task}"
     )
@@ -78,7 +75,6 @@ def navigate(screenshot, task: str):
     try:
         if "Action:" in raw_out:
             action_part = raw_out.split("Action:")[1].strip()
-            # The model sometimes wraps its output in ```, so we remove it.
             if action_part.startswith("```") and action_part.endswith("```"):
                 action_part = action_part[3:-3].strip()
@@ -96,8 +92,7 @@ def navigate(screenshot, task: str):
     return screenshot, raw_out, messages
-# --- KEY CHANGE: Enhanced Gradio UI ---
-# The interface is rebuilt using gr.Blocks for a cleaner layout and better user guidance.
 with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 90% !important;}") as demo:
     gr.Markdown(
         """
@@ -116,21 +111,11 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 90% !i
             )
             submit_btn = gr.Button("Analyze Action", variant="primary")
-            gr.Examples(
-                examples=[
-                    ["examples/google.png", "Search for 'latest AI news'"],
-                    ["examples/github.png", "Find the search bar and type 'Qwen'"],
-                    ["examples/figma.png", "Select the blue rectangle on the canvas"],
-                ],
-                inputs=[screenshot_in, task_in],
-                label="Example Use Cases"
-            )
         with gr.Column(scale=2):
-            screenshot_out = gr.Image(label="Result: Screenshot with Click Point", interactive=False)
             with gr.Accordion("Model Output Details", open=False):
-                raw_out = gr.Textbox(label="Full Model Output (Thought & Action)", interactive=False)
-                history_out = gr.JSON(label="Conversation History for Debugging", interactive=False)
     submit_btn.click(
         fn=navigate,
@@ -146,7 +131,6 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 90% !i
     )
 if __name__ == "__main__":
-    # To run this, you'll need to create an 'examples' directory with the sample images.
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,

 from qwen_vl_utils import process_vision_info # Make sure this file is in your repository
 # --- Model and Processor Initialization ---
 _MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "ByteDance-Seed/UI-TARS-1.5-7B",
     device_map="auto",
     if point and isinstance(point, list) and len(point) == 2:
         x, y = point[0] * img.width, point[1] * img.height
         draw = ImageDraw.Draw(img)
         draw.ellipse(
             (x - radius, y - radius, x + radius, y + radius), fill="rgba(255, 0, 0, 180)", outline="white", width=2
         )
 def navigate(screenshot, task: str):
     """Runs a single inference step of the GUI reasoning model."""
     if not screenshot or not task:
         return None, "Please provide both a screenshot and a task.", []
     messages = []
     prompt_header = (
+        "You are a precise GUI agent. Your task is to analyze a screenshot and a user instruction, then output your thought process and the next action. "
+        "You MUST ONLY output a 'Thought' and an 'Action'. Do not add any other text.\n\n"
+        "## Output Format\n"
+        "```\n"
+        "Thought: [Your brief, single-sentence reasoning here.]\n"
+        "Action: [The specific action to take, e.g., click(...) or type(...)]\n"
+        "```\n\n"
         "## Action Space\n"
         "click(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
         "type(content='...')\n"
         "scroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='...')\n"
         "finished(content='...')\n\n"
         f"## User Instruction\n{task}"
     )
     try:
         if "Action:" in raw_out:
             action_part = raw_out.split("Action:")[1].strip()
             if action_part.startswith("```") and action_part.endswith("```"):
                 action_part = action_part[3:-3].strip()
     return screenshot, raw_out, messages
+# --- UI Definition with Fixes ---
 with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 90% !important;}") as demo:
     gr.Markdown(
         """
             )
             submit_btn = gr.Button("Analyze Action", variant="primary")
         with gr.Column(scale=2):
+            screenshot_out = gr.Image(label="Result: Screenshot with Click Point")
             with gr.Accordion("Model Output Details", open=False):
+                raw_out = gr.Textbox(label="Full Model Output (Thought & Action)")
+                history_out = gr.JSON(label="Conversation History for Debugging")
     submit_btn.click(
         fn=navigate,
     )
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,