Abs6187 committed on
Commit
3b19b54
·
1 Parent(s): 7860e5b

Added New UI

Files changed (1):
  1. app.py (+11 -27)
app.py CHANGED
@@ -8,7 +8,6 @@ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info # Make sure this file is in your repository
 
 # --- Model and Processor Initialization ---
-# This setup is standard and remains unchanged.
 _MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "ByteDance-Seed/UI-TARS-1.5-7B",
     device_map="auto",
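The hunk cuts off the `from_pretrained` call, so the remaining arguments are not visible in this diff. For context, a typical way to finish this initialization looks like the sketch below, where the `torch_dtype` value and the `_PROCESSOR` variable are assumptions, not code from this commit:

```python
# Sketch only: the elided arguments are assumptions, not this commit's code.
_MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "ByteDance-Seed/UI-TARS-1.5-7B",
    device_map="auto",
    torch_dtype="auto",  # assumption: a dtype is usually pinned here
)
_PROCESSOR = AutoProcessor.from_pretrained("ByteDance-Seed/UI-TARS-1.5-7B")
```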
@@ -28,7 +27,6 @@ def draw_point(image: Image.Image, point=None, radius: int = 15):
     if point and isinstance(point, list) and len(point) == 2:
         x, y = point[0] * img.width, point[1] * img.height
         draw = ImageDraw.Draw(img)
-        # Draw a larger ellipse for better visibility on high-res screens
         draw.ellipse(
             (x - radius, y - radius, x + radius, y + radius), fill="rgba(255, 0, 0, 180)", outline="white", width=2
         )
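`draw_point` expects coordinates normalized to [0, 1] and scales them by the image size before drawing. A minimal usage sketch (the file name and point values are made up, and the function is assumed to return the annotated copy, as the elided lines suggest):

```python
from PIL import Image

img = Image.open("screenshot.png")            # hypothetical input file
marked = draw_point(img, point=[0.42, 0.17])  # 42% across, 17% down
marked.save("screenshot_marked.png")
```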
@@ -38,24 +36,23 @@ def draw_point(image: Image.Image, point=None, radius: int = 15):
 def navigate(screenshot, task: str):
     """Runs a single inference step of the GUI reasoning model."""
     if not screenshot or not task:
-        # Added basic validation to prevent errors with empty inputs
         return None, "Please provide both a screenshot and a task.", []
 
     messages = []
 
-    # --- KEY CHANGE: Refined Prompt for Concise Reasoning ---
-    # The 'Note' section is updated to guide the model towards a shorter, more direct "Thought" process.
     prompt_header = (
-        "You are a GUI agent. You are given a task and a screenshot. Your goal is to determine the next action.\n\n"
-        "## Output Format\n```\nThought: ...\nAction: ...\n```\n\n"
+        "You are a precise GUI agent. Your task is to analyze a screenshot and a user instruction, then output your thought process and the next action. "
+        "You MUST ONLY output a 'Thought' and an 'Action'. Do not add any other text.\n\n"
+        "## Output Format\n"
+        "```\n"
+        "Thought: [Your brief, single-sentence reasoning here.]\n"
+        "Action: [The specific action to take, e.g., click(...) or type(...)]\n"
+        "```\n\n"
         "## Action Space\n"
         "click(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
         "type(content='...')\n"
         "scroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='...')\n"
         "finished(content='...')\n\n"
-        "## Note\n"
-        "- In the `Thought` part, briefly state your reasoning in a single, direct sentence.\n"
-        "- Always use 'win' instead of 'meta' for hotkeys.\n\n"
         f"## User Instruction\n{task}"
     )
 
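The model call itself (old lines 62-77) falls outside this hunk. Presumably it follows the standard Qwen2.5-VL inference pattern; the sketch below shows that pattern using this file's names, where `_PROCESSOR`, the message layout, and `max_new_tokens` are assumptions rather than the commit's actual code:

```python
# Sketch of the elided step, assuming the standard Qwen2.5-VL flow;
# _PROCESSOR and max_new_tokens are assumptions.
messages.append({
    "role": "user",
    "content": [
        {"type": "image", "image": screenshot},
        {"type": "text", "text": prompt_header},
    ],
})
text = _PROCESSOR.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = _PROCESSOR(
    text=[text], images=image_inputs, videos=video_inputs,
    padding=True, return_tensors="pt",
).to(_MODEL.device)
generated = _MODEL.generate(**inputs, max_new_tokens=256)
trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated)]
raw_out = _PROCESSOR.batch_decode(trimmed, skip_special_tokens=True)[0]
```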
@@ -78,7 +75,6 @@ def navigate(screenshot, task: str):
     try:
         if "Action:" in raw_out:
             action_part = raw_out.split("Action:")[1].strip()
-            # The model sometimes wraps its output in ```, so we remove it.
             if action_part.startswith("```") and action_part.endswith("```"):
                 action_part = action_part[3:-3].strip()
 
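What this hunk does not show is how a normalized point is recovered from the cleaned `action_part`. One straightforward approach is a regex over the `(x, y)` pair; in the sketch below the divisor is an assumption (UI-TARS-style models often emit coordinates on a 0-1000 grid, but this diff does not confirm the convention):

```python
import re

# Sketch: pull "(x, y)" out of, e.g.,
#   click(start_box='<|box_start|>(491, 327)<|box_end|>')
# Dividing by 1000 to reach draw_point's [0, 1] range is an
# assumption about the model's coordinate convention.
match = re.search(r"\((\d+(?:\.\d+)?),\s*(\d+(?:\.\d+)?)\)", action_part)
if match:
    x, y = float(match.group(1)), float(match.group(2))
    point = [x / 1000, y / 1000]
```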
@@ -96,8 +92,7 @@ def navigate(screenshot, task: str):
 
     return screenshot, raw_out, messages
 
-# --- KEY CHANGE: Enhanced Gradio UI ---
-# The interface is rebuilt using gr.Blocks for a cleaner layout and better user guidance.
+# --- UI Definition with Fixes ---
 with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 90% !important;}") as demo:
     gr.Markdown(
         """
@@ -116,21 +111,11 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 90% !i
             )
             submit_btn = gr.Button("Analyze Action", variant="primary")
 
-            gr.Examples(
-                examples=[
-                    ["examples/google.png", "Search for 'latest AI news'"],
-                    ["examples/github.png", "Find the search bar and type 'Qwen'"],
-                    ["examples/figma.png", "Select the blue rectangle on the canvas"],
-                ],
-                inputs=[screenshot_in, task_in],
-                label="Example Use Cases"
-            )
-
         with gr.Column(scale=2):
-            screenshot_out = gr.Image(label="Result: Screenshot with Click Point", interactive=False)
+            screenshot_out = gr.Image(label="Result: Screenshot with Click Point")
             with gr.Accordion("Model Output Details", open=False):
-                raw_out = gr.Textbox(label="Full Model Output (Thought & Action)", interactive=False)
+                raw_out = gr.Textbox(label="Full Model Output (Thought & Action)")
-                history_out = gr.JSON(label="Conversation History for Debugging", interactive=False)
+                history_out = gr.JSON(label="Conversation History for Debugging")
 
     submit_btn.click(
         fn=navigate,
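The `submit_btn.click` call is truncated by the hunk boundary. Given `navigate`'s two inputs and three return values, the full wiring presumably reads as below (inferred from the component names above, not the verbatim elided code):

```python
submit_btn.click(
    fn=navigate,
    inputs=[screenshot_in, task_in],
    outputs=[screenshot_out, raw_out, history_out],
)
```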
@@ -146,7 +131,6 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 90% !i
     )
 
 if __name__ == "__main__":
-    # To run this, you'll need to create an 'examples' directory with the sample images.
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,