Abs6187 committed
Commit 4871e0c · 1 parent: 6c894b7

Updated app.py

Files changed (1): app.py (+48, -54)
app.py CHANGED
@@ -1,3 +1,4 @@
+# app.py
 import spaces
 import ast
 import torch
@@ -7,7 +8,7 @@ import base64
 from io import BytesIO
 
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info
+from qwen_vl_utils import process_vision_info  # include this file in your repo if not pip-installable
 
 _MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "ByteDance-Seed/UI-TARS-1.5-7B",
@@ -17,7 +18,7 @@ _MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
 _PROCESSOR = AutoProcessor.from_pretrained(
     "ByteDance-Seed/UI-TARS-1.5-7B",
-    size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},
+    size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},  # sane resolution bounds
     use_fast=True,
 )
 
@@ -38,44 +39,32 @@ def draw_point(image: Image.Image, point=None, radius: int = 5):
 
 @spaces.GPU
 def navigate(screenshot, task: str):
-    """Run one inference step on the GUI-reasoning model."""
+    """Run one inference step on the GUI-reasoning model.
+
+    Args:
+        screenshot (PIL.Image): Latest UI screenshot.
+        task (str): Natural-language task description.
+    """
     messages = []
 
     prompt_header = (
-        "🤖 **GUI Agent Instructions**\n\n"
-        "You're an intelligent agent solving UI tasks through:\n"
-        "1. Visual understanding of screenshots\n"
-        "2. Natural language task interpretation\n"
-        "3. Action sequence generation\n\n"
-        "## Action Reference\n"
-        "```\n"
-        "| Action Type | Syntax |\n"
-        "|--------------------|-------------------------|\n"
-        "| Click Button | click(start_box='(x,y)') |\n"
-        "| Double-Click | left_double(start_box='(x,y)') |\n"
-        "| Drag Element | drag(start_box='(x1,y1)', end_box='(x2,y2)') |\n"
-        "| Hotkey Input | hotkey(key='Ctrl+A') |\n"
-        "| Text Input | type(content='Search term') |\n"
-        "| Scroll Action | scroll(start_box='(x,y)', direction='down') |\n"
-        "| Wait & Recheck | wait() |\n"
-        "| Task Completion | finished(content='Result') |\n"
-        "```"
-        "\n**Note:**\n"
-        "1. Use 'win' key instead of 'meta' in hotkey commands\n"
-        "2. Include position coordinates in all spatial actions\n"
-        "3. Keep 'Thought' concise - max 3 sentence strategy\n"
-        f"\n**Task:**\n{task}"
+        "You are a GUI agent. You are given a task and your action history, with screenshots. "
+        "You need to perform the next action to complete the task.\n\n"
+        "## Output Format\n```\nThought: ...\nAction: ...\n```\n\n"
+        "## Action Space\n\n"
+        "click(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
+        "left_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
+        "right_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
+        "drag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\n"
+        "hotkey(key='')\n"
+        "type(content='')  # If you want to submit your input, use \"\\n\" at the end of `content`.\n"
+        "scroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\n"
+        "wait()  # Sleep for 5s and take a screenshot to check for any changes.\n"
+        "finished(content='xxx')  # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n"
+        "## Note\n"
+        "- Use English in `Thought` part.\n"
+        "- Write a small plan and finally summarize your next action (with its target element) "
+        "in one sentence in `Thought` part. Always use 'win' instead of 'meta' key\n\n"
+        f"## User Instruction\n{task}"
     )
 
     current = {
         "role": "user",
         "content": [
             {"type": "text", "text": prompt_header},
             {"type": "image_url", "image_url": screenshot},
         ],
     }
     messages.append(current)
 
+    # ─────────────────────────── model forward ──────────────────────────────
     images, videos = process_vision_info(messages)
     text = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
@@ -90,43 +79,48 @@ def navigate(screenshot, task: str):
 
     generated = model.generate(**inputs, max_new_tokens=128)
     trimmed = [
         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated)
     ]
     raw_out = processor.batch_decode(
         trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]
 
+    # ─────── draw the predicted click point for quick visual verification ───
     try:
         actions = ast.literal_eval(raw_out)
         for act in (actions if isinstance(actions, list) else [actions]):
             pos = act.get("position")
             if pos and isinstance(pos, list) and len(pos) == 2:
                 screenshot = draw_point(screenshot, pos)
     except Exception:
+        # decoding failed → just return the original screenshot
         pass
 
     return screenshot, raw_out, messages
 
 
-demo = gr.Blocks()
-with gr.Row():
-    gr.Image(type="pil", label="🖼️ Screenshot Input").style(width=400)
-    gr.Textbox(
-        lines=1,
-        placeholder="e.g., 'Book a flight to Paris'",
-        label="🔍 Task Description"
-    ).style(width=400)
-
-with gr.Row():
-    gr.Image(label="📍 Click Point Visualization").style(width=400)
-    gr.Textbox(label="📝 Action Response").style(width=400)
-
-with gr.Row():
-    gr.JSON(label="📜 Conversation History").style(width=800)
+# ────────────────────────── Gradio interface ────────────────────────────────
+demo = gr.Interface(
+    fn=navigate,
+    inputs=[
+        gr.Image(type="pil", label="Screenshot"),
+        gr.Textbox(
+            lines=1,
+            placeholder="e.g. Search the weather for New York",
+            label="Task",
+        ),
+    ],
+    outputs=[
+        gr.Image(label="With Click Point"),
+        gr.Textbox(label="Raw Action Output"),
+        gr.JSON(label="Updated Conversation History"),
+    ],
+    title="UI-TARS Navigation Demo",
+)
 
 demo.launch(
     server_name="0.0.0.0",
     server_port=7860,
-    share=False,
-    ssr_mode=False
+    share=False,  # or True if you need a public link
+    ssr_mode=False,  # turn off experimental SSR so the process blocks
 )
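
The `ast.literal_eval` call above is fragile: the prompt explicitly requests `Thought: ...\nAction: ...` text, which is not a Python literal, so in practice the `except` branch swallows the output and no click point is drawn. Below is a minimal parsing sketch for that format; the `parse_action` helper and its regexes are illustrative assumptions, not part of this commit.

```python
import re

# Hypothetical helper (not in the commit): pull the action name and the first
# "(x, y)" pair out of a "Thought: ...\nAction: ..." completion.
_ACTION_RE = re.compile(r"Action:\s*(\w+)\((.*)\)", re.DOTALL)
_COORD_RE = re.compile(r"\((\d+),\s*(\d+)\)")

def parse_action(raw_out: str) -> dict | None:
    match = _ACTION_RE.search(raw_out)
    if not match:
        return None  # model did not follow the expected format
    name, args = match.group(1), match.group(2)
    coords = _COORD_RE.search(args)
    position = [int(coords.group(1)), int(coords.group(2))] if coords else None
    return {"action": name, "position": position}
```

With such a helper, the `try` block could call `parse_action(raw_out)` and feed the resulting `position` to `draw_point`, keeping the blanket `except` as a last-resort fallback.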
 
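Relatedly, Qwen2.5-VL-derived models such as UI-TARS ground coordinates in the pixel space of the processor-resized image, so on large screenshots the drawn point can appear offset. The sketch below maps a predicted point back to the original resolution by mirroring the multiple-of-28 resize that the `size=` bounds above imply; the helper name and the exact resize rule are assumptions to verify against the installed processor.

```python
import math

def rescale_point(x: int, y: int, orig_w: int, orig_h: int,
                  min_pixels: int = 100 * 28 * 28,
                  max_pixels: int = 16384 * 28 * 28) -> tuple[int, int]:
    """Hypothetical helper: map a point predicted on the resized image back
    onto the original screenshot (assumes Qwen-style "smart resize")."""
    # Dimensions the processor would produce: multiples of 28 within the budget.
    h_bar = max(28, round(orig_h / 28) * 28)
    w_bar = max(28, round(orig_w / 28) * 28)
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((orig_h * orig_w) / max_pixels)
        h_bar = math.floor(orig_h / beta / 28) * 28
        w_bar = math.floor(orig_w / beta / 28) * 28
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (orig_h * orig_w))
        h_bar = math.ceil(orig_h * beta / 28) * 28
        w_bar = math.ceil(orig_w * beta / 28) * 28
    # Scale the prediction from resized space back to original pixels.
    return round(x * orig_w / w_bar), round(y * orig_h / h_bar)
```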
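Finally, a running Space can be exercised outside the browser with `gradio_client`. A usage sketch, assuming a local launch and Gradio's default `/predict` route for an `Interface` (the screenshot path is a placeholder):

```python
from gradio_client import Client, handle_file

client = Client("http://localhost:7860")  # or the Space URL
image_out, raw_action, history = client.predict(
    handle_file("screenshot.png"),      # placeholder path to a UI screenshot
    "Search the weather for New York",  # the task description
    api_name="/predict",
)
print(raw_action)
```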