import base64, os, json
from typing import Optional

import torch
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
# ---- Hugging Face Spaces GPU decorator (safe fallback when not on Spaces) ----
try:
    import spaces
    GPU_DECORATOR = spaces.GPU
except Exception:
    def GPU_DECORATOR(fn):  # no-op locally
        return fn
from qwen_vl_utils import process_vision_info  # noqa: F401 (kept for parity if used elsewhere)
from datasets import load_dataset  # noqa: F401
from transformers import AutoProcessor
from gui_actor.constants import chat_template  # noqa: F401
from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
from gui_actor.inference import inference

MAX_PIXELS = 3200 * 1800


def resize_image(image, resize_to_pixels=MAX_PIXELS):
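    """Rescale the image so its total pixel count approximately matches resize_to_pixels, preserving aspect ratio."""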
    image_width, image_height = image.size
    if (resize_to_pixels is not None) and ((image_width * image_height) != resize_to_pixels):
        resize_ratio = (resize_to_pixels / (image_width * image_height)) ** 0.5
        image_width_resized, image_height_resized = int(image_width * resize_ratio), int(image_height * resize_ratio)
        image = image.resize((image_width_resized, image_height_resized))
    return image


def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)):
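    """Draw a semi-transparent circle outline at `point` (pixel coordinates) and return an RGB copy of the image."""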
    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    overlay_draw = ImageDraw.Draw(overlay)
    x, y = point
    overlay_draw.ellipse(
        [(x - radius, y - radius), (x + radius, y + radius)],
        outline=color,
        width=5
    )
    image = image.convert('RGBA')
    combined = Image.alpha_composite(image, overlay)
    combined = combined.convert('RGB')
    return combined


def get_attn_map(image, attn_scores, n_width, n_height):
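    """Render the pointer attention scores as a 'jet' heatmap blended over the screenshot."""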
    w, h = image.size
    scores = np.array(attn_scores[0]).reshape(n_height, n_width)
    scores_norm = (scores - scores.min()) / (scores.max() - scores.min() + 1e-8)
    score_map = Image.fromarray((scores_norm * 255).astype(np.uint8)).resize((w, h), resample=Image.NEAREST)
    colormap = plt.get_cmap('jet')
    colored_score_map = colormap(np.array(score_map) / 255.0)[:, :, :3]
    colored_overlay = Image.fromarray((colored_score_map * 255).astype(np.uint8))
    blended = Image.blend(image, colored_overlay, alpha=0.3)
    return blended


# ----------------------------
# Model/device init for Spaces
# ----------------------------
def _pick_gpu_dtype() -> torch.dtype:
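    """Return bfloat16 on Ampere-or-newer GPUs, float16 on older GPUs, float32 on CPU."""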
    if not torch.cuda.is_available():
        return torch.float32
    major, minor = torch.cuda.get_device_capability()
    # Ampere (8.x) / Hopper (9.x) support bf16 well
    return torch.bfloat16 if major >= 8 else torch.float16


# Global holders initialized in load_model()
model = None
tokenizer = None
data_processor = None


# Called once at import time (see the call below) so the model is ready before the first request.
def load_model():
    """
    Allocates the GPU on Spaces and loads the model on the right device/dtype.
    Runs once at startup.
    """
    global model, tokenizer, data_processor
    model_name_or_path = "microsoft/GUI-Actor-3B-Qwen2.5-VL"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    dtype = _pick_gpu_dtype()
    # Enable some healthy defaults on GPU
    if device.startswith("cuda"):
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.set_grad_enabled(False)
    data_processor = AutoProcessor.from_pretrained(model_name_or_path)
    tokenizer = data_processor.tokenizer
    # Use SDPA attention to avoid flash-attn dependency
    attn_impl = "sdpa"
    model_local = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
        model_name_or_path,
        torch_dtype=dtype,
        attn_implementation=attn_impl,
    ).eval()
    # Move to device explicitly (avoid accelerate unless you need sharding)
    model_local.to(device)
    model = model_local
    return f"Loaded {model_name_or_path} on {device} with dtype={dtype} (attn={attn_impl})"


# Trigger model loading on import so Spaces allocates GPU immediately
_ = load_model()


@GPU_DECORATOR  # request GPU scheduling on Spaces for the inference call; no-op locally
def process(image, instruction):
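    """Ground a natural-language instruction on a GUI screenshot.

    Returns a tuple of (screenshot with the predicted point drawn, predicted
    (x, y) as normalized coordinates, attention-map overlay); on inference
    failure it returns the input image, an error string, and None.
    """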
    # Safety: ensure model is loaded
    if model is None:
        _ = load_model()
    # Gradio may hand back RGBA images (e.g. PNGs with alpha); normalize to RGB
    # so the PIL blending/compositing below behaves consistently.
    image = image.convert("RGB")
    # Resize if needed
    w, h = image.size
    if w * h > MAX_PIXELS:
        image = resize_image(image)
        w, h = image.size
    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": (
                        "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, "
                        "your task is to locate the screen element that corresponds to the instruction. "
                        "Output a PyAutoGUI action with a special token that points to the correct location."
                    ),
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": instruction},
            ],
        },
    ]
    device = next(model.parameters()).device
    try:
        pred = inference(
            conversation,
            model,
            tokenizer,
            data_processor,
            use_placeholder=True,
            topk=3
        )
    except Exception as e:
        print("inference error:", e)
        return image, f"Error: {e}", None
    px, py = pred["topk_points"][0]
    output_coord = f"({px:.4f}, {py:.4f})"
    img_with_point = draw_point(image, (px * w, py * h))
    n_width, n_height = pred["n_width"], pred["n_height"]
    attn_scores = pred["attn_scores"]
    att_map = get_attn_map(image, attn_scores, n_width, n_height)
    return img_with_point, output_coord, att_map


# ----------------------------
# Gradio UI
# ----------------------------
title = "GUI-Actor"
header = """
<div align="center">
<h1 style="padding-bottom: 10px; padding-top: 10px;">🎯 <strong>GUI-Actor</strong>: Coordinate-Free Visual Grounding for GUI Agents</h1>
<div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
<a href="https://microsoft.github.io/GUI-Actor/">🌐 Project Page</a> | <a href="https://arxiv.org/abs/2403.12968">📄 arXiv Paper</a> | <a href="https://github.com/microsoft/GUI-Actor">💻 Github Repo</a><br/>
</div>
</div>
"""
theme = "soft"
css = """#anno-img .mask {opacity: 0.5; transition: all 0.2s ease-in-out;}
#anno-img .mask.active {opacity: 0.7}"""


with gr.Blocks(title=title, css=css, theme=theme) as demo:
    gr.Markdown(header)
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type='pil', label='Upload image')
            input_instruction = gr.Textbox(label='Instruction', placeholder='Type your (low-level) instruction here')
            submit_button = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_with_point = gr.Image(type='pil', label='Image with Point (red circle)')
            with gr.Accordion('Detailed prediction'):
                pred_xy = gr.Textbox(label='Predicted Coordinates', placeholder='(x, y)')
                att_map = gr.Image(type='pil', label='Attention Map')
    submit_button.click(
        fn=process,
        inputs=[input_image, input_instruction],
        outputs=[image_with_point, pred_xy, att_map],
        queue=True,
        api_name="predict",
    )


# Version-agnostic Gradio startup (works across 3.x/4.x/5.x)
# Try newer/older signatures, fall back gracefully.
# Queue (GPU scheduling needed on Spaces)
try:
    demo.queue(concurrency_count=1, max_size=4)
except TypeError:
    try:
        demo.queue(max_size=4)
    except TypeError:
        demo.queue()

# Launch
try:
    demo.launch(share=False, max_threads=1, max_queue_size=4)
except TypeError:
    try:
        demo.launch(share=False, max_queue_size=4)
    except TypeError:
        demo.launch(share=False)
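
# Local usage (a rough sketch, assuming this file is saved as app.py and the
# gui_actor package plus the model weights are available):
#   pip install torch transformers gradio qwen-vl-utils datasets matplotlib pillow
#   python app.py
# Gradio then serves the UI at http://127.0.0.1:7860 by default.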