Abs6187 committed
Commit f737f3c · 1 Parent(s): fec7485

Updated app.py

Files changed (1)
  1. app.py +71 -117
app.py CHANGED
@@ -1,141 +1,95 @@
-# app.py
 import spaces
-import ast
 import torch
 from PIL import Image, ImageDraw
 import gradio as gr
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info  # Make sure this file is in your repository

-# --- Model and Processor Initialization ---
-_MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    "ByteDance-Seed/UI-TARS-1.5-7B",
-    device_map="auto",
-    torch_dtype=torch.float16
-)
-_PROCESSOR = AutoProcessor.from_pretrained(
-    "ByteDance-Seed/UI-TARS-1.5-7B",
-    size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},
-    use_fast=True,
-)
-model = _MODEL
-processor = _PROCESSOR

-def draw_point(image: Image.Image, point=None, radius: int = 15):
-    """Overlays a larger, more visible red dot on the screenshot."""
     img = image.copy()
-    if point and isinstance(point, list) and len(point) == 2:
-        x, y = point[0] * img.width, point[1] * img.height
-        draw = ImageDraw.Draw(img)
-        draw.ellipse(
-            (x - radius, y - radius, x + radius, y + radius), fill="rgba(255, 0, 0, 180)", outline="white", width=2
-        )
     return img

 @spaces.GPU
-def navigate(screenshot, task: str):
-    """Runs a single inference step of the GUI reasoning model."""
     if not screenshot or not task:
-        return None, "Please provide both a screenshot and a task.", []

-    messages = []
-
     prompt_header = (
-        "You are a precise GUI agent. Your task is to analyze a screenshot and a user instruction, then output your thought process and the next action. "
-        "You MUST ONLY output a 'Thought' and an 'Action'. Do not add any other text.\n\n"
-        "## Output Format\n"
-        "```\n"
-        "Thought: [Your brief, single-sentence reasoning here.]\n"
-        "Action: [The specific action to take, e.g., click(...) or type(...)]\n"
-        "```\n\n"
-        "## Action Space\n"
-        "click(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
-        "type(content='...')\n"
-        "scroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='...')\n"
-        "finished(content='...')\n\n"
         f"## User Instruction\n{task}"
     )

-    content = [
-        {"type": "text", "text": prompt_header},
-        {"type": "image_url", "image_url": screenshot}
-    ]
-    messages.append({"role": "user", "content": content})
-
-    images, videos = process_vision_info(messages)
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
-        text=[text], images=images, videos=videos, padding=True, return_tensors="pt"
-    ).to("cuda")
-
-    generated = model.generate(**inputs, max_new_tokens=256)
-    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated)]
-    raw_out = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

     try:
-        if "Action:" in raw_out:
-            action_part = raw_out.split("Action:")[1].strip()
-            if action_part.startswith("```") and action_part.endswith("```"):
-                action_part = action_part[3:-3].strip()
-
-            action_dict = ast.literal_eval(action_part)
-
-            box_str = action_dict.get("start_box")
-            if box_str and isinstance(box_str, str) and "( " in box_str:
-                coords_part = box_str.split('( ')[1].split(' )')[0]
-                x_str, y_str = coords_part.split(', ')
-                pos = [float(x_str), float(y_str)]
-                screenshot = draw_point(screenshot, pos)
-    except (Exception, SyntaxError) as e:
-        print(f"Could not parse action or draw point: {e}")
-        pass
-
-    return screenshot, raw_out, messages
-
-# --- UI Definition with Fixes ---
-with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 90% !important;}") as demo:
-    gr.Markdown(
-        """
-        # ✨ Enhanced UI-Tars Navigation Demo
-        **Upload a screenshot and provide a task to see how the AI plans its next action.**
-        The model will analyze the image and your instruction, then output its thought process and the specific action it would take. A red dot will indicate the target location for clicks or scrolls.
-        """
-    )
-    with gr.Row():
-        with gr.Column(scale=1):
-            screenshot_in = gr.Image(type="pil", label="Screenshot")
-            task_in = gr.Textbox(
-                lines=2,
-                placeholder="e.g., Click on the 'Sign In' button.",
-                label="Task Instruction",
-            )
-            submit_btn = gr.Button("Analyze Action", variant="primary")
-
-        with gr.Column(scale=2):
-            # --- FIX APPLIED HERE ---
-            # The 'interactive' argument has been removed from all output components
-            # to ensure compatibility with the execution environment's Gradio version.
-            screenshot_out = gr.Image(label="Result: Screenshot with Click Point")
-            with gr.Accordion("Model Output Details", open=False):
-                raw_out = gr.Textbox(label="Full Model Output (Thought & Action)")
-                history_out = gr.JSON(label="Conversation History for Debugging")

-    submit_btn.click(
-        fn=navigate,
-        inputs=[screenshot_in, task_in],
-        outputs=[screenshot_out, raw_out, history_out],
-    )

-    gr.Markdown(
-        """
-        ---
-        *Model: [ByteDance-Seed/UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B)*
-        """
-    )

 if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,
-    )
 
 
 import spaces
 import torch
 from PIL import Image, ImageDraw
 import gradio as gr
+import re
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

+@spaces.GPU
+def load_model_and_processor():
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        "ByteDance-Seed/UI-TARS-1.5-7B",
+        device_map="auto",
+        torch_dtype=torch.float16
+    )
+    processor = AutoProcessor.from_pretrained(
+        "ByteDance-Seed/UI-TARS-1.5-7B",
+        use_fast=True,
+    )
+    return model, processor
+
+model, processor = load_model_and_processor()

+def draw_point(image: Image.Image, point_str: str, radius: int = 10):
     img = image.copy()
+    try:
+        coord_regex = r'click\(.*?<\|box_start\|>\s*\((\s*[\d.]+)\s*,\s*([\d.]+)\s*\).*?\)'
+        match = re.search(coord_regex, point_str)
+
+        if match:
+            x_norm, y_norm = float(match.group(1)), float(match.group(2))
+            x = x_norm * img.width
+            y = y_norm * img.height
+
+            draw = ImageDraw.Draw(img)
+            draw.ellipse(
+                (x - radius, y - radius, x + radius, y + radius),
+                fill="red",
+                outline="white",
+                width=2
+            )
+    except Exception:
+        pass
     return img

 @spaces.GPU
+def navigate(screenshot: Image.Image, task: str):
     if not screenshot or not task:
+        raise gr.Error("Please provide both a screenshot and a task.")

     prompt_header = (
+        "You are a GUI agent. You are given a task and your action history, with screenshots. "
+        "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part. Always use 'win' instead of 'meta' key\n\n"
         f"## User Instruction\n{task}"
     )

+    messages = [{"role": "user", "content": [{"type": "text", "text": prompt_header}, {"type": "image", "image": screenshot}]}]
+
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    inputs = processor(text=[text], images=[screenshot], return_tensors="pt").to(model.device)

+    generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
+
+    response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
     try:
+        action_text = response.split('[/INST]')[-1].strip()
+    except IndexError:
+        action_text = response

+    output_image = draw_point(screenshot, action_text)

+    return output_image, action_text
+
+demo = gr.Interface(
+    fn=navigate,
+    inputs=[
+        gr.Image(type="pil", label="Screenshot"),
+        gr.Textbox(
+            lines=1,
+            placeholder="e.g. Search the weather for New York",
+            label="Task",
+        )
+    ],
+    outputs=[
+        gr.Image(label="With Click Point"),
+        gr.Textbox(label="Raw Action Output"),
+    ],
+    title="UI-Tars Navigation Demo",
+    description="Upload a UI screenshot, describe a task, and see the AI-predicted next action. This model helps automate GUI interactions.",
+    allow_flagging="never",
+)

 if __name__ == "__main__":
+    demo.launch()
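
For reference, the coordinate-parsing path added in draw_point() can be exercised on its own, without loading the model. The sketch below is illustrative only: it reuses the regex from this commit and, like the committed code, treats the (x, y) values in a click(...) action as normalized coordinates; the sample action string and the 1280x720 canvas are made-up stand-ins for a real model response and a real screenshot.

import re
from PIL import Image, ImageDraw

# Regex copied from the draw_point() added in this commit.
COORD_REGEX = r'click\(.*?<\|box_start\|>\s*\((\s*[\d.]+)\s*,\s*([\d.]+)\s*\).*?\)'

# Hypothetical model output; a real response would also contain a "Thought: ..." line.
sample_action = "Action: click(start_box='<|box_start|>(0.42, 0.17)<|box_end|>')"

match = re.search(COORD_REGEX, sample_action)
if match:
    x_norm, y_norm = float(match.group(1)), float(match.group(2))
    # Scale the normalized point to pixels, as draw_point() does with the uploaded screenshot.
    img = Image.new("RGB", (1280, 720), "white")
    x, y = x_norm * img.width, y_norm * img.height
    ImageDraw.Draw(img).ellipse((x - 10, y - 10, x + 10, y + 10), fill="red", outline="white", width=2)
    print(f"normalized ({x_norm}, {y_norm}) -> pixel ({x:.0f}, {y:.0f})")  # pixel (538, 122)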