diff --git a/droidrun/agent/droid/droid_agent.py b/droidrun/agent/droid/droid_agent.py index b66b822..bea9057 100644 --- a/droidrun/agent/droid/droid_agent.py +++ b/droidrun/agent/droid/droid_agent.py @@ -260,7 +260,10 @@ class DroidAgent(Workflow): self.app_opener_llm = None self.structured_output_llm = None - if self.config.logging.save_trajectory != "none": + if ( + not self._using_external_agent + and self.config.logging.save_trajectory != "none" + ): self.trajectory = Trajectory( goal=self.shared_state.instruction, base_path=self.config.logging.trajectory_path, @@ -371,6 +374,64 @@ class DroidAgent(Workflow): if self.trajectory_writer: await self.trajectory_writer.start() + # ── 0. External agent — early exit ──────────────────────────── + if self._using_external_agent: + agent_name = self.config.agent.name + + # Load the agent module + agent_module = load_agent(agent_name) + if not agent_module: + from droidrun.agent.external import list_agents + + available = list_agents() + if available: + agents_str = ", ".join(available) + raise ValueError( + f"Failed to load external agent '{agent_name}'.\n" + f"Available agents: {agents_str}" + ) + raise ValueError( + f"External agent '{agent_name}' not found.\n" + "No external agents are currently installed.\n" + "Run: droidrun run --help to see available agents." + ) + + # Resolve config + agent_config = self.config.external_agents.get(agent_name) + if not agent_config: + raise ValueError( + f"No configuration found for agent '{agent_name}'.\n\n" + "Add to your config.yaml:\n\n" + " external_agents:\n" + f" {agent_name}:\n" + ' api_key: "your-api-key"\n' + ' model: "model-name"\n' + " # ... 
any settings your agent needs" + ) + + final_config = {**agent_module["config"], **agent_config} + + # Resolve device serial and get raw AdbDevice + device_serial = self.resolved_device_config.serial + if device_serial is None: + devices = await adb.list() + if not devices: + raise ValueError("No connected Android devices found.") + device_serial = devices[0].serial + + adb_device = await adb.device(serial=device_serial) + + logger.info(f"🤖 Using external agent: {agent_name}") + + result = await agent_module["run"]( + device=adb_device, + instruction=self.shared_state.instruction, + config=final_config, + max_steps=self.config.agent.max_steps, + ) + + return FinalizeEvent(success=result["success"], reason=result["reason"]) + # ── 1. Create driver ────────────────────────────────────────── if self.config.agent.reasoning: vision_enabled = self.config.agent.manager.vision @@ -530,33 +591,6 @@ class DroidAgent(Workflow): # ── 6. Fetch device date once ───────────────────────────────── self.shared_state.device_date = await driver.get_date() - # ── 7. 
External agent mode ──────────────────────────────────── - if self._using_external_agent: - agent_name = self.config.agent.name - agent_module = load_agent(agent_name) - if not agent_module: - raise ValueError(f"Failed to load external agent: {agent_name}") - - agent_config = self.config.external_agents.get(agent_name) - if not agent_config: - raise ValueError( - f"No config found for agent '{agent_name}' in external_agents section" - ) - - final_config = {**agent_module["config"], **agent_config} - - logger.info(f"🤖 Using external agent: {agent_name}") - - result = await agent_module["run"]( - driver=self.driver, - action_ctx=self.action_ctx, - instruction=self.shared_state.instruction, - config=final_config, - max_steps=self.config.agent.max_steps, - ) - - return FinalizeEvent(success=result["success"], reason=result["reason"]) - if self.config.logging.save_trajectory != "none": self.trajectory_writer.write(self.trajectory, stage="init") diff --git a/droidrun/agent/external/__init__.py b/droidrun/agent/external/__init__.py index 4885a8e..cf57193 100644 --- a/droidrun/agent/external/__init__.py +++ b/droidrun/agent/external/__init__.py @@ -1,28 +1,71 @@ -"""External agent loader - dynamic imports.""" +"""External agent loader — dynamic imports. + +External agents are self-contained modules that receive raw ADB access +via ``async_adbutils.AdbDevice``. They bring their own LLM client, prompts, +parsing, and action loop — zero imports from ``droidrun``. + +An external agent can be either: +- A single file: ``droidrun/agent/external/my_agent.py`` +- A package: ``droidrun/agent/external/my_agent/__init__.py`` + +Required contract:: + + from async_adbutils import AdbDevice + + async def run( + device: AdbDevice, # raw ADB, already connected + instruction: str, # the task + config: dict, # from external_agents. 
in config.yaml + max_steps: int, # step limit + ) -> dict: # {"success": bool, "reason": str, "steps": int} + +Optional: ``DEFAULT_CONFIG: dict`` — merged under the user's config. +""" + +from __future__ import annotations import importlib import logging -from typing import Any, Callable, Dict, Optional, TypedDict +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, TypedDict logger = logging.getLogger("droidrun") +_EXTERNAL_DIR = Path(__file__).parent + class ExternalAgentModule(TypedDict): - """Type for loaded external agent module.""" + """Type for a loaded external agent module.""" run: Callable config: Dict[str, Any] -def load_agent(name: str) -> Optional[ExternalAgentModule]: - """ - Dynamically load an external agent by name. - - Args: - name: Agent module name (e.g., "mai_ui", "autoglm") +def list_agents() -> List[str]: + """Discover available external agents by scanning the external/ directory. Returns: - Dict with 'run' function and 'config' defaults, or None if failed. + Sorted list of agent names (module stems or package directory names). + """ + agents: list[str] = [] + for item in _EXTERNAL_DIR.iterdir(): + if item.name.startswith(("_", ".")): + continue + if item.is_file() and item.suffix == ".py": + agents.append(item.stem) + elif item.is_dir() and (item / "__init__.py").exists(): + agents.append(item.name) + return sorted(agents) + + +def load_agent(name: str) -> Optional[ExternalAgentModule]: + """Dynamically load an external agent by name. + + Args: + name: Agent module name (e.g., ``"my_agent"``). + + Returns: + Dict with ``run`` function and ``config`` defaults, or *None* on failure. 
""" try: module = importlib.import_module(f"droidrun.agent.external.{name}") diff --git a/droidrun/agent/external/autoglm.py b/droidrun/agent/external/autoglm.py deleted file mode 100644 index e523a84..0000000 --- a/droidrun/agent/external/autoglm.py +++ /dev/null @@ -1,1119 +0,0 @@ -""" -Open-AutoGLM External Agent - Full Implementation. - -This module implements the Open-AutoGLM phone agent protocol, matching the original -implementation from https://github.com/ArtificialZeng/Open-AutoGLM - -Key features: -- Stateful conversation history across steps -- Full system prompts (Chinese + English) with 14 actions and 18 rules -- AST-based safe action parsing -- OpenAI-compatible message format -- Tool wrappers matching original DeviceFactory interface -- Timing delays matching original implementation -""" - -import ast -import asyncio -import base64 -import json -import logging -import re -import time -from dataclasses import dataclass, field -from datetime import datetime -from typing import Any, Callable, Dict, List, Optional, Tuple - -from droidrun.agent.utils.chat_utils import to_chat_messages -from droidrun.agent.utils.inference import acall_with_retries -from droidrun.agent.utils.llm_picker import load_llm - -logger = logging.getLogger("droidrun") - -# ============================================================================= -# System Prompts (exact copy from Open-AutoGLM) -# ============================================================================= - - -def get_system_prompt_zh() -> str: - """Get Chinese system prompt with current date (matches original prompts_zh.py).""" - from datetime import datetime - - today = datetime.today() - weekday_names = [ - "星期一", - "星期二", - "星期三", - "星期四", - "星期五", - "星期六", - "星期日", - ] - weekday = weekday_names[today.weekday()] - formatted_date = today.strftime("%Y年%m月%d日") + " " + weekday - return ( - "今天的日期是: " - + formatted_date - + """ -你是一个智能体分析专家,可以根据操作历史和当前状态图执行一系列操作来完成任务。 -你必须严格按照要求输出以下格式: -{think} -{action} - -其中: 
-- {think} 是对你为什么选择这个操作的简短推理说明。 -- {action} 是本次执行的具体操作指令,必须严格遵循下方定义的指令格式。 - -操作指令及其作用如下: -- do(action="Launch", app="xxx") - Launch是启动目标app的操作,这比通过主屏幕导航更快。此操作完成后,您将自动收到结果状态的截图。 -- do(action="Tap", element=[x,y]) - Tap是点击操作,点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序,或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。 -- do(action="Tap", element=[x,y], message="重要操作") - 基本功能同Tap,点击涉及财产、支付、隐私等敏感按钮时触发。 -- do(action="Type", text="xxx") - Type是输入操作,在当前聚焦的输入框中输入文本。使用此操作前,请确保输入框已被聚焦(先点击它)。输入的文本将像使用键盘输入一样输入。重要提示:手机可能正在使用 ADB 键盘,该键盘不会像普通键盘那样占用屏幕空间。要确认键盘已激活,请查看屏幕底部是否显示 'ADB Keyboard {ON}' 类似的文本,或者检查输入框是否处于激活/高亮状态。不要仅仅依赖视觉上的键盘显示。自动清除文本:当你使用输入操作时,输入框中现有的任何文本(包括占位符文本和实际输入)都会在输入新文本前自动清除。你无需在输入前手动清除文本——直接使用输入操作输入所需文本即可。操作完成后,你将自动收到结果状态的截图。 -- do(action="Type_Name", text="xxx") - Type_Name是输入人名的操作,基本功能同Type。 -- do(action="Interact") - Interact是当有多个满足条件的选项时而触发的交互操作,询问用户如何选择。 -- do(action="Swipe", start=[x1,y1], end=[x2,y2]) - Swipe是滑动操作,通过从起始坐标拖动到结束坐标来执行滑动手势。可用于滚动内容、在屏幕之间导航、下拉通知栏以及项目栏或进行基于手势的导航。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。滑动持续时间会自动调整以实现自然的移动。此操作完成后,您将自动收到结果状态的截图。 -- do(action="Note", message="True") - 记录当前页面内容以便后续总结。 -- do(action="Call_API", instruction="xxx") - 总结或评论当前页面或已记录的内容。 -- do(action="Long Press", element=[x,y]) - Long Pres是长按操作,在屏幕上的特定点长按指定时间。可用于触发上下文菜单、选择文本或激活长按交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的屏幕截图。 -- do(action="Double Tap", element=[x,y]) - Double Tap在屏幕上的特定点快速连续点按两次。使用此操作可以激活双击交互,如缩放、选择文本或打开项目。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。 -- do(action="Take_over", message="xxx") - Take_over是接管操作,表示在登录和验证阶段需要用户协助。 -- do(action="Back") - 导航返回到上一个屏幕或关闭当前对话框。相当于按下 Android 的返回按钮。使用此操作可以从更深的屏幕返回、关闭弹出窗口或退出当前上下文。此操作完成后,您将自动收到结果状态的截图。 -- do(action="Home") - Home是回到系统桌面的操作,相当于按下 Android 主屏幕按钮。使用此操作可退出当前应用并返回启动器,或从已知状态启动新任务。此操作完成后,您将自动收到结果状态的截图。 -- do(action="Wait", duration="x seconds") - 等待页面加载,x为需要等待多少秒。 -- finish(message="xxx") - finish是结束任务的操作,表示准确完整完成任务,message是终止信息。 - -必须遵循的规则: -1. 
在执行任何操作前,先检查当前app是否是目标app,如果不是,先执行 Launch。 -2. 如果进入到了无关页面,先执行 Back。如果执行Back后页面没有变化,请点击页面左上角的返回键进行返回,或者右上角的X号关闭。 -3. 如果页面未加载出内容,最多连续 Wait 三次,否则执行 Back重新进入。 -4. 如果页面显示网络问题,需要重新加载,请点击重新加载。 -5. 如果当前页面找不到目标联系人、商品、店铺等信息,可以尝试 Swipe 滑动查找。 -6. 遇到价格区间、时间区间等筛选条件,如果没有完全符合的,可以放宽要求。 -7. 在做小红书总结类任务时一定要筛选图文笔记。 -8. 购物车全选后再点击全选可以把状态设为全不选,在做购物车任务时,如果购物车里已经有商品被选中时,你需要点击全选后再点击取消全选,再去找需要购买或者删除的商品。 -9. 在做外卖任务时,如果相应店铺购物车里已经有其他商品你需要先把购物车清空再去购买用户指定的外卖。 -10. 在做点外卖任务时,如果用户需要点多个外卖,请尽量在同一店铺进行购买,如果无法找到可以下单,并说明某个商品未找到。 -11. 请严格遵循用户意图执行任务,用户的特殊要求可以执行多次搜索,滑动查找。比如(i)用户要求点一杯咖啡,要咸的,你可以直接搜索咸咖啡,或者搜索咖啡后滑动查找咸的咖啡,比如海盐咖啡。(ii)用户要找到XX群,发一条消息,你可以先搜索XX群,找不到结果后,将"群"字去掉,搜索XX重试。(iii)用户要找到宠物友好的餐厅,你可以搜索餐厅,找到筛选,找到设施,选择可带宠物,或者直接搜索可带宠物,必要时可以使用AI搜索。 -12. 在选择日期时,如果原滑动方向与预期日期越来越远,请向反方向滑动查找。 -13. 执行任务过程中如果有多个可选择的项目栏,请逐个查找每个项目栏,直到完成任务,一定不要在同一项目栏多次查找,从而陷入死循环。 -14. 在执行下一步操作前请一定要检查上一步的操作是否生效,如果点击没生效,可能因为app反应较慢,请先稍微等待一下,如果还是不生效请调整一下点击位置重试,如果仍然不生效请跳过这一步继续任务,并在finish message说明点击不生效。 -15. 在执行任务中如果遇到滑动不生效的情况,请调整一下起始点位置,增大滑动距离重试,如果还是不生效,有可能是已经滑到底了,请继续向反方向滑动,直到顶部或底部,如果仍然没有符合要求的结果,请跳过这一步继续任务,并在finish message说明但没找到要求的项目。 -16. 在做游戏任务时如果在战斗页面如果有自动战斗一定要开启自动战斗,如果多轮历史状态相似要检查自动战斗是否开启。 -17. 如果没有合适的搜索结果,可能是因为搜索页面不对,请返回到搜索页面的上一级尝试重新搜索,如果尝试三次返回上一级搜索后仍然没有符合要求的结果,执行 finish(message="原因")。 -18. 在结束任务前请一定要仔细检查任务是否完整准确的完成,如果出现错选、漏选、多选的情况,请返回之前的步骤进行纠正。 -""" - ) - - -def get_system_prompt_en() -> str: - """Get English system prompt with current date (matches original prompts_en.py).""" - from datetime import datetime - - today = datetime.today() - formatted_date = today.strftime("%Y-%m-%d, %A") - return ( - "The current date: " - + formatted_date - + """ -# Setup -You are a professional Android operation agent assistant that can fulfill the user's high-level instructions. Given a screenshot of the Android interface at each step, you first analyze the situation, then plan the best course of action using Python-style pseudo-code. - -# More details about the code -Your response format must be structured as follows: - -Think first: Use ... 
to analyze the current screen, identify key elements, and determine the most efficient action. -Provide the action: Use ... to return a single line of pseudo-code representing the operation. - -Your output should STRICTLY follow the format: - -[Your thought] - - -[Your operation code] - - -- **Tap** - Perform a tap action on a specified screen area. The element is a list of 2 integers, representing the coordinates of the tap point. - **Example**: - - do(action="Tap", element=[x,y]) - -- **Type** - Enter text into the currently focused input field. - **Example**: - - do(action="Type", text="Hello World") - -- **Swipe** - Perform a swipe action with start point and end point. - **Examples**: - - do(action="Swipe", start=[x1,y1], end=[x2,y2]) - -- **Long Press** - Perform a long press action on a specified screen area. - You can add the element to the action to specify the long press area. The element is a list of 2 integers, representing the coordinates of the long press point. - **Example**: - - do(action="Long Press", element=[x,y]) - -- **Launch** - Launch an app. Try to use launch action when you need to launch an app. Check the instruction to choose the right app before you use this action. - **Example**: - - do(action="Launch", app="Settings") - -- **Back** - Press the Back button to navigate to the previous screen. - **Example**: - - do(action="Back") - -- **Finish** - Terminate the program and optionally print a message. - **Example**: - - finish(message="Task completed.") - - - -REMEMBER: -- Think before you act: Always analyze the current UI and the best course of action before executing any step, and output in part. -- Only ONE LINE of action in part per response: Each step must contain exactly one line of executable code. -- Generate execution code strictly according to format requirements. -""" - ) - - -def get_system_prompt(lang: str = "cn") -> str: - """ - Get system prompt by language. 
- - Args: - lang: Language code, 'cn' for Chinese (default), 'en' for English. - - Returns: - System prompt string with current date. - """ - if lang == "cn": - return get_system_prompt_zh() - else: - return get_system_prompt_en() - - -# ============================================================================= -# Default Configuration (agent-specific only, NOT LLM) -# ============================================================================= - -DEFAULT_CONFIG: Dict[str, Any] = { - # Agent-specific settings only - LLM must be provided by user - "lang": "cn", # cn (18 detailed rules) or en (minimal rules) - "stream": True, -} - - -# ============================================================================= -# Timing Configuration (matches original Open-AutoGLM) -# ============================================================================= - - -@dataclass -class ActionTimingConfig: - """Configuration for action handler timing delays.""" - - keyboard_switch_delay: float = 1.0 - text_clear_delay: float = 1.0 - text_input_delay: float = 1.0 - keyboard_restore_delay: float = 1.0 - - -@dataclass -class DeviceTimingConfig: - """Configuration for device operation timing delays.""" - - default_tap_delay: float = 1.0 - default_double_tap_delay: float = 1.0 - double_tap_interval: float = 0.1 - default_long_press_delay: float = 1.0 - default_swipe_delay: float = 1.0 - default_back_delay: float = 1.0 - default_home_delay: float = 1.0 - default_launch_delay: float = 1.0 - - -@dataclass -class TimingConfig: - """Master timing configuration.""" - - action: ActionTimingConfig = field(default_factory=ActionTimingConfig) - device: DeviceTimingConfig = field(default_factory=DeviceTimingConfig) - - -TIMING_CONFIG = TimingConfig() - - -# ============================================================================= -# Screenshot Data Class (matches original Open-AutoGLM) -# ============================================================================= - - -@dataclass -class 
Screenshot: - """Represents a captured screenshot (matches original interface).""" - - base64_data: str - width: int - height: int - is_sensitive: bool = False - - -# ============================================================================= -# Device Factory Wrapper (wraps DroidRun tools to match original interface) -# ============================================================================= - - -class DeviceFactoryWrapper: - """ - Wraps DroidRun Tools to provide the same interface as original Open-AutoGLM's - DeviceFactory. All methods are async but match the original signatures and - return types. - """ - - def __init__(self, tools, loop: asyncio.AbstractEventLoop): - """ - Initialize wrapper. - - Args: - tools: DroidRun Tools instance - loop: Event loop for running async operations - """ - self.tools = tools - self.loop = loop - self._current_app = "System Home" - - async def get_screenshot(self, timeout: int = 10) -> Screenshot: - """ - Get screenshot matching original interface. 
- - Returns: - Screenshot object with base64_data, width, height, is_sensitive - """ - try: - _, screenshot_bytes = await self.tools.take_screenshot() - if screenshot_bytes: - base64_data = base64.b64encode(screenshot_bytes).decode("utf-8") - return Screenshot( - base64_data=base64_data, - width=self.tools.screen_width or 1080, - height=self.tools.screen_height or 2400, - is_sensitive=False, - ) - except Exception as e: - logger.warning(f"Screenshot failed: {e}") - - # Return fallback black image - return self._create_fallback_screenshot() - - def _create_fallback_screenshot(self, is_sensitive: bool = False) -> Screenshot: - """Create a black fallback image when screenshot fails.""" - # Create a minimal black PNG (1x1 pixel) - # In production, you might want a full-size black image - width = self.tools.screen_width or 1080 - height = self.tools.screen_height or 2400 - - try: - from io import BytesIO - - from PIL import Image - - black_img = Image.new("RGB", (width, height), color="black") - buffered = BytesIO() - black_img.save(buffered, format="PNG") - base64_data = base64.b64encode(buffered.getvalue()).decode("utf-8") - except ImportError: - # Minimal 1x1 black PNG if PIL not available - base64_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==" - - return Screenshot( - base64_data=base64_data, - width=width, - height=height, - is_sensitive=is_sensitive, - ) - - async def get_current_app(self) -> str: - """ - Get current app name. 
- - Returns: - App name string (or "System Home" if unknown) - """ - # Try to get from tools state - if hasattr(self.tools, "current_package") and self.tools.current_package: - return self.tools.current_package - - # Try to extract from clickable elements - if ( - hasattr(self.tools, "clickable_elements_cache") - and self.tools.clickable_elements_cache - ): - first_elem = ( - self.tools.clickable_elements_cache[0] - if self.tools.clickable_elements_cache - else {} - ) - pkg = first_elem.get("package", "") - if pkg: - return pkg - - return self._current_app - - async def tap(self, x: int, y: int, delay: Optional[float] = None) -> None: - """ - Tap at coordinates with post-action delay. - - Args: - x: X coordinate (pixels) - y: Y coordinate (pixels) - delay: Delay after tap (default: 1.0s) - """ - if delay is None: - delay = TIMING_CONFIG.device.default_tap_delay - - await self.tools.tap_by_coordinates(x, y) - await asyncio.sleep(delay) - - async def double_tap(self, x: int, y: int, delay: Optional[float] = None) -> None: - """ - Double tap at coordinates. - - Args: - x: X coordinate (pixels) - y: Y coordinate (pixels) - delay: Delay after double tap (default: 1.0s) - """ - if delay is None: - delay = TIMING_CONFIG.device.default_double_tap_delay - - await self.tools.tap_by_coordinates(x, y) - await asyncio.sleep(TIMING_CONFIG.device.double_tap_interval) - await self.tools.tap_by_coordinates(x, y) - await asyncio.sleep(delay) - - async def long_press( - self, x: int, y: int, duration_ms: int = 3000, delay: Optional[float] = None - ) -> None: - """ - Long press at coordinates. 
- - Args: - x: X coordinate (pixels) - y: Y coordinate (pixels) - duration_ms: Press duration in milliseconds - delay: Delay after long press (default: 1.0s) - """ - if delay is None: - delay = TIMING_CONFIG.device.default_long_press_delay - - # Long press = swipe from same point to same point - await self.tools.swipe(x, y, x, y, duration_ms=duration_ms) - await asyncio.sleep(delay) - - async def swipe( - self, - start_x: int, - start_y: int, - end_x: int, - end_y: int, - duration_ms: Optional[int] = None, - delay: Optional[float] = None, - ) -> None: - """ - Swipe from start to end coordinates. - - Args: - start_x, start_y: Starting coordinates - end_x, end_y: Ending coordinates - duration_ms: Swipe duration (auto-calculated if None) - delay: Delay after swipe (default: 1.0s) - """ - if delay is None: - delay = TIMING_CONFIG.device.default_swipe_delay - - if duration_ms is None: - # Calculate duration based on distance (matches original) - dist_sq = (start_x - end_x) ** 2 + (start_y - end_y) ** 2 - duration_ms = int(dist_sq / 1000) - duration_ms = max(1000, min(duration_ms, 2000)) # Clamp 1000-2000ms - - await self.tools.swipe(start_x, start_y, end_x, end_y, duration_ms=duration_ms) - await asyncio.sleep(delay) - - async def back(self, delay: Optional[float] = None) -> None: - """Press back button.""" - if delay is None: - delay = TIMING_CONFIG.device.default_back_delay - - await self.tools.press_key(4) # KEYCODE_BACK - await asyncio.sleep(delay) - - async def home(self, delay: Optional[float] = None) -> None: - """Press home button.""" - if delay is None: - delay = TIMING_CONFIG.device.default_home_delay - - await self.tools.press_key(3) # KEYCODE_HOME - await asyncio.sleep(delay) - - async def launch_app(self, app_name: str, delay: Optional[float] = None) -> bool: - """ - Launch an app by name. 
- - Args: - app_name: App name or package name - - Returns: - True if launched successfully - """ - if delay is None: - delay = TIMING_CONFIG.device.default_launch_delay - - try: - await self.tools.start_app(app_name) - await asyncio.sleep(delay) - return True - except Exception as e: - logger.warning(f"Failed to launch {app_name}: {e}") - return False - - async def type_text(self, text: str) -> None: - """ - Type text with keyboard handling. - - Matches original behavior: - 1. Switch to ADB keyboard (handled by DroidRun portal) - 2. Clear existing text - 3. Type new text - 4. Restore keyboard (handled by DroidRun portal) - """ - # DroidRun's input_text with clear=True handles all this - await self.tools.input_text(text, index=-1, clear=True) - await asyncio.sleep(TIMING_CONFIG.action.text_input_delay) - - async def clear_text(self) -> None: - """Clear text in focused field.""" - # Type empty string with clear flag - await self.tools.input_text("", index=-1, clear=True) - await asyncio.sleep(TIMING_CONFIG.action.text_clear_delay) - - -# ============================================================================= -# Message Builder (matches original Open-AutoGLM) -# ============================================================================= - - -class MessageBuilder: - """Helper class for building OpenAI-compatible conversation messages.""" - - @staticmethod - def create_system_message(content: str) -> Dict[str, Any]: - """Create a system message.""" - return {"role": "system", "content": content} - - @staticmethod - def create_user_message( - text: str, image_base64: Optional[str] = None - ) -> Dict[str, Any]: - """ - Create a user message with optional image in OpenAI format. - - Image comes first, then text (matches original). 
- """ - content: List[Dict[str, Any]] = [] - - if image_base64: - content.append( - { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{image_base64}"}, - } - ) - - content.append({"type": "text", "text": text}) - - return {"role": "user", "content": content} - - @staticmethod - def create_assistant_message(content: str) -> Dict[str, Any]: - """Create an assistant message.""" - return {"role": "assistant", "content": content} - - @staticmethod - def remove_images_from_message(message: Dict[str, Any]) -> Dict[str, Any]: - """Remove image content from a message to save context space.""" - if isinstance(message.get("content"), list): - message["content"] = [ - item for item in message["content"] if item.get("type") == "text" - ] - return message - - @staticmethod - def build_screen_info(current_app: str, **extra_info) -> str: - """Build screen info JSON string.""" - info = {"current_app": current_app, **extra_info} - return json.dumps(info, ensure_ascii=False) - - -# ============================================================================= -# Action Parsing (matches original Open-AutoGLM) -# ============================================================================= - - -@dataclass -class ActionResult: - """Result of an action execution (matches original Open-AutoGLM).""" - - success: bool - should_finish: bool - message: Optional[str] = None - requires_confirmation: bool = False - - -def parse_action(response: str) -> Dict[str, Any]: - """ - Parse action from model response. - - Matches original Open-AutoGLM handler.py parse_action exactly. 
- """ - logger.debug(f"Parsing action: {response}") - try: - response = response.strip() - - # Special case for Type/Type_Name - extract text directly (matches original) - if response.startswith('do(action="Type"') or response.startswith( - 'do(action="Type_Name"' - ): - text = response.split("text=", 1)[1][1:-2] - action = {"_metadata": "do", "action": "Type", "text": text} - return action - - elif response.startswith("do"): - # Use AST parsing instead of eval for safety - try: - # Escape special characters (newlines, tabs, etc.) for valid Python syntax - response = response.replace("\n", "\\n") - response = response.replace("\r", "\\r") - response = response.replace("\t", "\\t") - - tree = ast.parse(response, mode="eval") - if not isinstance(tree.body, ast.Call): - raise ValueError("Expected a function call") - - call = tree.body - # Extract keyword arguments safely - action: Dict[str, Any] = {"_metadata": "do"} - for keyword in call.keywords: - key = keyword.arg - value = ast.literal_eval(keyword.value) - action[key] = value - - return action - except (SyntaxError, ValueError) as e: - raise ValueError(f"Failed to parse do() action: {e}") from e - - elif response.startswith("finish"): - action = { - "_metadata": "finish", - "message": response.replace("finish(message=", "")[1:-2], - } - else: - raise ValueError(f"Failed to parse action: {response}") - return action - except Exception as e: - raise ValueError(f"Failed to parse action: {e}") from e - - -def do(**kwargs) -> Dict[str, Any]: - """Helper function for creating 'do' actions.""" - kwargs["_metadata"] = "do" - return kwargs - - -def finish(**kwargs) -> Dict[str, Any]: - """Helper function for creating 'finish' actions.""" - kwargs["_metadata"] = "finish" - return kwargs - - -def parse_response(content: str) -> Tuple[str, str]: - """ - Parse the model response into thinking and action parts. - - Matches original Open-AutoGLM parsing rules. 
- """ - # Rule 1: Check for finish(message= - if "finish(message=" in content: - parts = content.split("finish(message=", 1) - thinking = parts[0].strip() - action = "finish(message=" + parts[1] - return thinking, action - - # Rule 2: Check for do(action= - if "do(action=" in content: - parts = content.split("do(action=", 1) - thinking = parts[0].strip() - action = "do(action=" + parts[1] - return thinking, action - - # Rule 3: Fallback to legacy XML tag parsing - if "" in content: - parts = content.split("", 1) - thinking = parts[0].replace("", "").replace("", "").strip() - action = parts[1].replace("", "").strip() - return thinking, action - - # Rule 4: No markers found, return content as action - return "", content - - -# ============================================================================= -# Action Handler (matches original Open-AutoGLM) -# ============================================================================= - - -class ActionHandler: - """ - Handles execution of actions from AI model output. - - Matches original Open-AutoGLM ActionHandler interface. - - Args: - device: DeviceFactoryWrapper instance for device operations. - confirmation_callback: Optional callback for sensitive action confirmation. - Should return True to proceed, False to cancel. - takeover_callback: Optional callback for takeover requests (login, captcha). 
- """ - - def __init__( - self, - device: DeviceFactoryWrapper, - confirmation_callback: Optional[Callable[[str], bool]] = None, - takeover_callback: Optional[Callable[[str], None]] = None, - ): - self.device = device - self.confirmation_callback = confirmation_callback or self._default_confirmation - self.takeover_callback = takeover_callback or self._default_takeover - - def _convert_relative_to_absolute( - self, element: List[int], screen_width: int, screen_height: int - ) -> Tuple[int, int]: - """Convert relative coordinates (0-1000) to absolute pixels.""" - x = int(element[0] / 1000 * screen_width) - y = int(element[1] / 1000 * screen_height) - return x, y - - async def execute( - self, action: Dict[str, Any], screen_width: int, screen_height: int - ) -> ActionResult: - """ - Execute an action from the AI model. - - Args: - action: The action dictionary from the model. - screen_width: Current screen width in pixels. - screen_height: Current screen height in pixels. - - Returns: - ActionResult indicating success and whether to finish. 
- """ - action_type = action.get("_metadata") - - if action_type == "finish": - return ActionResult( - success=True, should_finish=True, message=action.get("message") - ) - - if action_type != "do": - return ActionResult( - success=False, - should_finish=True, - message=f"Unknown action type: {action_type}", - ) - - action_name = action.get("action") - handler_method = self._get_handler(action_name) - - if handler_method is None: - return ActionResult( - success=False, - should_finish=False, - message=f"Unknown action: {action_name}", - ) - - try: - return await handler_method(action, screen_width, screen_height) - except Exception as e: - return ActionResult( - success=False, should_finish=False, message=f"Action failed: {e}" - ) - - def _get_handler(self, action_name: str) -> Optional[Callable]: - """Get the handler method for an action.""" - handlers = { - "Launch": self._handle_launch, - "Tap": self._handle_tap, - "Type": self._handle_type, - "Type_Name": self._handle_type, - "Swipe": self._handle_swipe, - "Back": self._handle_back, - "Home": self._handle_home, - "Double Tap": self._handle_double_tap, - "Long Press": self._handle_long_press, - "Wait": self._handle_wait, - "Take_over": self._handle_takeover, - "Note": self._handle_note, - "Call_API": self._handle_call_api, - "Interact": self._handle_interact, - } - return handlers.get(action_name) - - async def _handle_launch( - self, action: Dict, width: int, height: int - ) -> ActionResult: - """Handle app launch action.""" - app_name = action.get("app") - if not app_name: - return ActionResult(False, False, "No app name specified") - - success = await self.device.launch_app(app_name) - if success: - return ActionResult(True, False) - return ActionResult(False, False, f"App not found: {app_name}") - - async def _handle_tap(self, action: Dict, width: int, height: int) -> ActionResult: - """Handle tap action.""" - element = action.get("element") - if not element: - return ActionResult(False, False, "No element 
coordinates") - - x, y = self._convert_relative_to_absolute(element, width, height) - - # Check for sensitive operation - if "message" in action: - if not self.confirmation_callback(action["message"]): - return ActionResult( - success=False, - should_finish=True, - message="User cancelled sensitive operation", - ) - - await self.device.tap(x, y) - return ActionResult(True, False) - - async def _handle_type(self, action: Dict, width: int, height: int) -> ActionResult: - """Handle text input action.""" - text = action.get("text", "") - await self.device.type_text(text) - return ActionResult(True, False) - - async def _handle_swipe( - self, action: Dict, width: int, height: int - ) -> ActionResult: - """Handle swipe action.""" - start = action.get("start") - end = action.get("end") - - if not start or not end: - return ActionResult(False, False, "Missing swipe coordinates") - - start_x, start_y = self._convert_relative_to_absolute(start, width, height) - end_x, end_y = self._convert_relative_to_absolute(end, width, height) - - await self.device.swipe(start_x, start_y, end_x, end_y) - return ActionResult(True, False) - - async def _handle_back(self, action: Dict, width: int, height: int) -> ActionResult: - """Handle back button action.""" - await self.device.back() - return ActionResult(True, False) - - async def _handle_home(self, action: Dict, width: int, height: int) -> ActionResult: - """Handle home button action.""" - await self.device.home() - return ActionResult(True, False) - - async def _handle_double_tap( - self, action: Dict, width: int, height: int - ) -> ActionResult: - """Handle double tap action.""" - element = action.get("element") - if not element: - return ActionResult(False, False, "No element coordinates") - - x, y = self._convert_relative_to_absolute(element, width, height) - await self.device.double_tap(x, y) - return ActionResult(True, False) - - async def _handle_long_press( - self, action: Dict, width: int, height: int - ) -> ActionResult: - 
"""Handle long press action.""" - element = action.get("element") - if not element: - return ActionResult(False, False, "No element coordinates") - - x, y = self._convert_relative_to_absolute(element, width, height) - await self.device.long_press(x, y) - return ActionResult(True, False) - - async def _handle_wait(self, action: Dict, width: int, height: int) -> ActionResult: - """Handle wait action.""" - duration_str = action.get("duration", "1 seconds") - try: - duration = float(duration_str.replace("seconds", "").strip()) - except ValueError: - duration = 1.0 - - await asyncio.sleep(duration) - return ActionResult(True, False) - - async def _handle_takeover( - self, action: Dict, width: int, height: int - ) -> ActionResult: - """Handle takeover request (login, captcha, etc.).""" - message = action.get("message", "User intervention required") - self.takeover_callback(message) - return ActionResult(True, False) - - async def _handle_note(self, action: Dict, width: int, height: int) -> ActionResult: - """Handle note action (placeholder for content recording).""" - # This action is typically used for recording page content - # Implementation depends on specific requirements - return ActionResult(True, False) - - async def _handle_call_api( - self, action: Dict, width: int, height: int - ) -> ActionResult: - """Handle API call action (placeholder for summarization).""" - # This action is typically used for content summarization - # Implementation depends on specific requirements - return ActionResult(True, False) - - async def _handle_interact( - self, action: Dict, width: int, height: int - ) -> ActionResult: - """Handle interaction request (user choice needed).""" - # This action signals that user input is needed - return ActionResult(True, False, message="User interaction required") - - @staticmethod - def _default_confirmation(message: str) -> bool: - """Default confirmation callback using console input.""" - response = input(f"Sensitive operation: 
{message}\nConfirm? (Y/N): ") - return response.upper() == "Y" - - @staticmethod - def _default_takeover(message: str) -> None: - """Default takeover callback using console input.""" - input(f"{message}\nPress Enter after completing manual operation...") - - -# ============================================================================= -# Main Entry Point -# ============================================================================= - - -async def run( - tools, - instruction: str, - config: Dict[str, Any], - max_steps: int = 15, - confirmation_callback: Optional[Callable[[str], bool]] = None, - takeover_callback: Optional[Callable[[str], None]] = None, -) -> Dict[str, Any]: - """ - Run AutoGLM agent matching original Open-AutoGLM implementation. - - Args: - tools: DroidRun Tools instance - instruction: Task to complete - config: Configuration dictionary: - llm: Dict passed directly to load_llm() - REQUIRED - provider: LLM provider (required, e.g. "OpenAILike") - model: Model name (required, e.g. "autoglm-phone-9b") - + any other params for load_llm (temperature, base_url, etc.) - lang: "cn" (detailed rules) or "en" (minimal) - default: "cn" - stream: Enable streaming - default: True - max_steps: Max iterations - confirmation_callback: Optional callback for sensitive action confirmation - takeover_callback: Optional callback for takeover requests - - Returns: - {"success": bool, "reason": str, "steps": int} - """ - # Validate LLM config - must be provided by user - llm_cfg = config.get("llm") - if not llm_cfg or not isinstance(llm_cfg, dict): - raise ValueError( - "AutoGLM requires 'llm' configuration. " - "Please configure external_agents.autoglm.llm in your config.yaml" - ) - - if "provider" not in llm_cfg: - raise ValueError( - "AutoGLM requires 'llm.provider' to be specified. " - "Example: provider: OpenAILike" - ) - - if "model" not in llm_cfg: - raise ValueError( - "AutoGLM requires 'llm.model' to be specified. 
" - "Example: model: autoglm-phone-9b" - ) - - # Load LLM - pass config directly to load_llm - llm_cfg = dict(llm_cfg) # Copy to avoid mutating - provider = llm_cfg.pop("provider") - llm = load_llm(provider, **llm_cfg) - - # Agent-specific configuration (defaults from DEFAULT_CONFIG) - lang = config.get("lang", DEFAULT_CONFIG["lang"]) - stream = config.get("stream", DEFAULT_CONFIG["stream"]) - - # Get system prompt with date (matches original) - system_prompt = get_system_prompt(lang=lang) - - # Create device wrapper - loop = asyncio.get_running_loop() - device = DeviceFactoryWrapper(tools, loop) - - # Stateful conversation context (matches original) - context: List[Dict[str, Any]] = [] - - logger.info(f"🤖 AutoGLM: {instruction}") - - for step in range(max_steps): - step_start = time.time() - logger.info(f"📍 Step {step + 1}/{max_steps}") - - # Get current screen state - await tools.get_state() - w = tools.screen_width or 1080 - h = tools.screen_height or 2400 - - # Get screenshot (matches original interface) - screenshot = await device.get_screenshot() - - # Get current app (matches original) - current_app = await device.get_current_app() - - # Build screen info (matches original format) - screen_info = MessageBuilder.build_screen_info(current_app) - - # Build messages (matches original flow) - if step == 0: - # First step: system message + user message with task + screen info - context.append(MessageBuilder.create_system_message(system_prompt)) - text_content = f"{instruction}\n\n{screen_info}" - context.append( - MessageBuilder.create_user_message(text_content, screenshot.base64_data) - ) - else: - # Subsequent steps: user message with screen info - text_content = f"** Screen Info **\n\n{screen_info}" - context.append( - MessageBuilder.create_user_message(text_content, screenshot.base64_data) - ) - - # Convert to LlamaIndex format and call LLM - try: - response = await acall_with_retries( - llm, - to_chat_messages(context), - stream=stream, - ) - response_text = 
str(response) - except Exception as e: - logger.error(f"LLM call failed: {e}") - return {"success": False, "reason": f"LLM error: {e}", "steps": step + 1} - - # Parse thinking and action (matches original) - thinking, action_str = parse_response(response_text) - - # Only log thinking if not streaming (streaming already printed it) - if thinking and not stream: - logger.info(f"💭 {thinking[:200]}{'...' if len(thinking) > 200 else ''}") - - # Remove image from current user message BEFORE adding assistant (matches original agent.py:205) - context[-1] = MessageBuilder.remove_images_from_message(context[-1]) - - # Add assistant response to context (matches original format) - context.append( - MessageBuilder.create_assistant_message( - f"{thinking}{action_str}" - ) - ) - - # Parse action - try: - action = parse_action(action_str) - except ValueError as e: - logger.warning(f"Failed to parse action: {e}") - action = {"_metadata": "finish", "message": action_str} - - action_name = action.get("action", action.get("_metadata", "unknown")) - # Log action with key details - if action_name in ("Tap", "Double Tap", "Long Press"): - coords = action.get("element", []) - logger.info(f"⚡ {action_name} {coords}") - elif action_name == "Swipe": - start, end = action.get("start", []), action.get("end", []) - logger.info(f"⚡ {action_name} {start} → {end}") - elif action_name == "Type": - text = action.get("text", "")[:30] - logger.info( - f"⚡ {action_name}: \"{text}{'...' 
if len(action.get('text', '')) > 30 else ''}\"" - ) - elif action_name == "Launch": - logger.info(f"⚡ {action_name}: {action.get('app', '')}") - elif action_name == "finish": - logger.info(f"⚡ {action_name}: {action.get('message', '')[:50]}") - else: - logger.info(f"⚡ {action_name}") - - # Create action handler and execute (matches original interface) - handler = ActionHandler( - device=device, - confirmation_callback=confirmation_callback, - takeover_callback=takeover_callback, - ) - - result = await handler.execute(action, screenshot.width, screenshot.height) - - step_time = time.time() - step_start - logger.debug(f" ⏱️ {step_time:.1f}s") - - # Check if finished - if result.should_finish: - reason = result.message or action.get("message", "Task completed") - logger.info(f"✅ Done ({step + 1} steps): {reason}") - return {"success": result.success, "reason": reason, "steps": step + 1} - - logger.warning(f"⚠️ Max steps ({max_steps}) reached") - return {"success": False, "reason": "Max steps reached", "steps": max_steps} diff --git a/droidrun/agent/external/mai_ui.py b/droidrun/agent/external/mai_ui.py deleted file mode 100644 index 6d6892e..0000000 --- a/droidrun/agent/external/mai_ui.py +++ /dev/null @@ -1,819 +0,0 @@ -"""MAI-UI External Agent - Exact implementation matching MAI-UI prompts and behavior. - -This agent replicates MAI-UI's exact prompts, message building, and trajectory -management while using DroidRun's AdbTools for execution. 
-""" - -import asyncio -import base64 -import copy -import json -import logging -import re -from dataclasses import dataclass, field -from io import BytesIO -from typing import Any, Dict, List, Optional, Tuple - -from PIL import Image -from jinja2 import Template - -from droidrun.agent.oneflows.app_starter_workflow import AppStarter -from droidrun.agent.utils.chat_utils import to_chat_messages -from droidrun.agent.utils.inference import acall_with_retries -from droidrun.agent.utils.llm_picker import load_llm - -logger = logging.getLogger("droidrun") - -# ============================================================================= -# Constants -# ============================================================================= - -SCALE_FACTOR = 999 - -# ============================================================================= -# Default Configuration (agent-specific only, NOT LLM) -# ============================================================================= - -DEFAULT_CONFIG: Dict[str, Any] = { - # Agent-specific settings matching MAI-UI defaults - "history_n": 3, # Number of history steps with images - # Note: vision is always True for MAI-UI (screenshot-based agent) -} - - -# ============================================================================= -# Trajectory Memory (matches MAI-UI's unified_memory.py) -# ============================================================================= - - -@dataclass -class TrajStep: - """ - Single step in an agent's trajectory. 
- - Attributes: - screenshot_bytes: Screenshot as PNG bytes - prediction: Raw LLM response text - action: Parsed action dictionary - thought: Extracted thinking/reasoning - step_index: Index of this step - structured_action: {"action_json": action} for history reconstruction - ask_user_response: Response from user when ask_user action was used - """ - - screenshot_bytes: bytes - prediction: str - action: Dict[str, Any] - thought: str - step_index: int - structured_action: Dict[str, Any] - ask_user_response: Optional[str] = None - - -@dataclass -class TrajMemory: - """ - Container for complete trajectory. - - Attributes: - task_goal: The instruction/goal for this trajectory - steps: List of trajectory steps - """ - - task_goal: str - steps: List[TrajStep] = field(default_factory=list) - - -# ============================================================================= -# System Prompt (exact MAI-UI prompt with MCP template - renders without MCP when no tools passed) -# ============================================================================= - -# fmt: off -MAI_MOBILE_SYS_PROMPT_TEMPLATE = Template( - "You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n" - "\n" - "## Output Format\n" - "For each function call, return the thinking process in tags, and a json object with function name and arguments within XML tags:\n" - "```\n" - "\n" - "...\n" - "\n" - "\n" - "{\"name\": \"mobile_use\", \"arguments\": }\n" - "\n" - "```\n" - "\n" - "## Action Space\n" - "\n" - "{\"action\": \"click\", \"coordinate\": [x, y]}\n" - "{\"action\": \"long_press\", \"coordinate\": [x, y]}\n" - "{\"action\": \"type\", \"text\": \"\"}\n" - "{\"action\": \"swipe\", \"direction\": \"up or down or left or right\", \"coordinate\": [x, y]} # \"coordinate\" is optional. 
Use the \"coordinate\" if you want to swipe a specific UI element.\n" - "{\"action\": \"open\", \"text\": \"app_name\"}\n" - "{\"action\": \"drag\", \"start_coordinate\": [x1, y1], \"end_coordinate\": [x2, y2]}\n" - "{\"action\": \"system_button\", \"button\": \"button_name\"} # Options: back, home, menu, enter \n" - "{\"action\": \"wait\"}\n" - "{\"action\": \"terminate\", \"status\": \"success or fail\"} \n" - "{\"action\": \"answer\", \"text\": \"xxx\"} # Use escape characters \\', \\\", and \\n in text part to ensure we can parse the text in normal python string format.\n" - "{\"action\": \"ask_user\", \"text\": \"xxx\"} # you can ask user for more information to complete the task.\n" - "{\"action\": \"double_click\", \"coordinate\": [x, y]}\n" - "\n" - "{% if tools %}" - "## MCP Tools\n" - "You are also provided with MCP tools, you can use them to complete the task.\n" - "{{ tools }}\n" - "\n" - "If you want to use MCP tools, you must output as the following format:\n" - "```\n" - "\n" - "...\n" - "\n" - "\n" - "{\"name\": , \"arguments\": }\n" - "\n" - "```\n" - "{% endif %}" - "## Note\n" - "- Available Apps: `{{ apps_list }}`.\n" - "- Write a small plan and finally summarize your next action (with its target element) in one sentence in part." -) -# fmt: on - - -# ============================================================================= -# Parsing Functions (matches MAI-UI's parsing) -# ============================================================================= - - -def parse_tagged_text(text: str) -> Dict[str, Any]: - """ - Parse text containing and tags. - - Handles both standard format and thinking model format (). 
- - Args: - text: Raw model output - - Returns: - Dictionary with "thinking" and "tool_call" keys - """ - text = text.strip() - - # Handle thinking model output format (uses instead of ) - if "" in text and "" not in text: - text = text.replace("", "") - text = "" + text - - result: Dict[str, Any] = { - "thinking": None, - "tool_call": None, - } - - # Extract thinking content - think_pattern = r"(.*?)" - think_match = re.search(think_pattern, text, re.DOTALL) - if think_match: - result["thinking"] = think_match.group(1).strip() - - # Extract tool_call content - call_pattern = r"(.*?)" - call_match = re.search(call_pattern, text, re.DOTALL) - if call_match: - try: - result["tool_call"] = json.loads(call_match.group(1).strip()) - except json.JSONDecodeError: - result["tool_call"] = None - - return result - - -def parse_action(text: str) -> Dict[str, Any]: - """ - Parse model output into structured action format. - - Normalizes coordinates from SCALE_FACTOR (0-999) to 0-1 range, - matching MAI-UI's parse_action_to_structure_output behavior. 
- - Args: - text: Raw model output - - Returns: - Dictionary with "thinking" and "action_json" keys - - Raises: - ValueError: If parsing fails - """ - parsed = parse_tagged_text(text) - - if not parsed["tool_call"]: - raise ValueError("No valid tool_call found in response") - - action = parsed["tool_call"].get("arguments", {}) - - # Normalize coordinates from SCALE_FACTOR range to [0, 1] - # This matches MAI-UI's parse_action_to_structure_output behavior - for coord_key in ["coordinate", "start_coordinate", "end_coordinate"]: - if coord_key in action: - coordinates = action[coord_key] - if len(coordinates) == 2: - point_x, point_y = coordinates - elif len(coordinates) == 4: - # Handle bounding box format (x1, y1, x2, y2) -> center point - x1, y1, x2, y2 = coordinates - point_x = (x1 + x2) / 2 - point_y = (y1 + y2) / 2 - else: - raise ValueError( - f"Invalid {coord_key} format: expected 2 or 4 values, got {len(coordinates)}" - ) - action[coord_key] = [point_x / SCALE_FACTOR, point_y / SCALE_FACTOR] - - return { - "thinking": parsed["thinking"], - "action_json": action, - } - - -# ============================================================================= -# Helper Functions -# ============================================================================= - - -def pil_to_base64(image: Image.Image) -> str: - """Convert PIL Image to base64 string.""" - buffer = BytesIO() - image.save(buffer, format="PNG") - return base64.b64encode(buffer.getvalue()).decode("utf-8") - - -def bytes_to_base64(image_bytes: bytes) -> str: - image = Image.open(BytesIO(image_bytes)) - if image.mode != "RGB": - image = image.convert("RGB") - buffer = BytesIO() - image.save(buffer, format="PNG") - return base64.b64encode(buffer.getvalue()).decode("utf-8") - - -async def resolve_app_name(tools, app_name: str) -> str: - """ - Resolve friendly app name to package name. 
- - Args: - tools: DroidRun Tools instance - app_name: Friendly app name (e.g., "Settings", "Chrome") - - Returns: - Package name (e.g., "com.android.settings") - """ - try: - apps = await tools.get_apps(include_system=True) - - # Try exact label match (case-insensitive) - for app in apps: - if app.get("label", "").lower() == app_name.lower(): - return app["package"] - - # Try partial match - for app in apps: - if app_name.lower() in app.get("label", "").lower(): - return app["package"] - - # Return as-is (might already be a package name) - return app_name - - except Exception as e: - logger.warning(f"Failed to resolve app name '{app_name}': {e}") - return app_name - - -async def get_available_apps(tools) -> str: - """ - Get list of available apps for the prompt. - - Returns: - Formatted string of app names - """ - try: - apps = await tools.get_apps(include_system=False) - app_names = [app.get("label", app.get("package", "")) for app in apps[:30]] - return json.dumps(app_names) - except Exception: - # Fallback to generic list - return '["Settings", "Chrome", "Camera", "Files", "Contacts", "Messages", "Phone", "Calendar", "Clock", "Calculator"]' - - -# ============================================================================= -# Message Building (matches MAI-UI's _build_messages) -# ============================================================================= - - -def mem2response(step: TrajStep) -> str: - """ - Reconstruct assistant response from trajectory step. - - Converts stored action back to the format the LLM expects in history. 
- - Args: - step: Trajectory step - - Returns: - Formatted response string with and tags - """ - thinking = step.thought or "" - structured_action = step.structured_action - - if not structured_action: - return f"\n{thinking}\n\n\n{{}}\n" - - action_json = copy.deepcopy(structured_action.get("action_json", {})) - - # Convert normalized coordinates back to SCALE_FACTOR range for history - # NOTE: Original MAI-UI only converts "coordinate", NOT start_coordinate/end_coordinate - # This matches the behavior in mai_naivigation_agent.py mem2response() - if "coordinate" in action_json: - coords = action_json["coordinate"] - if len(coords) == 2: - # Coordinates are stored normalized (0-1), convert to 0-999 - action_json["coordinate"] = [ - int(coords[0] * SCALE_FACTOR), - int(coords[1] * SCALE_FACTOR), - ] - - tool_call_dict = { - "name": "mobile_use", - "arguments": action_json, - } - tool_call_json = json.dumps(tool_call_dict, separators=(",", ":")) - - return f"\n{thinking}\n\n\n{tool_call_json}\n" - - -def build_messages( - instruction: str, - system_prompt: str, - traj_memory: TrajMemory, - current_screenshot_bytes: bytes, - history_n: int = 3, -) -> List[Dict[str, Any]]: - """ - Build multi-turn messages matching MAI-UI's format. - - Message structure: - 1. System prompt - 2. User instruction - 3. For each history step: - - Image (only for last history_n-1 steps) - - Assistant response - 4. 
Current screenshot - - Args: - instruction: Task instruction - system_prompt: System prompt text - traj_memory: Trajectory memory with history - current_screenshot_bytes: Current screenshot as bytes - history_n: Number of history images to include - - Returns: - List of message dictionaries - """ - messages = [ - { - "role": "system", - "content": [{"type": "text", "text": system_prompt}], - }, - { - "role": "user", - "content": [{"type": "text", "text": instruction}], - }, - ] - - steps = traj_memory.steps - image_idx = 0 - - if len(steps) > 0: - # Calculate which steps get images (last history_n - 1 steps) - start_image_idx = max(0, len(steps) - (history_n - 1)) - - # Collect history images - history_images = [] - for i, step in enumerate(steps): - if i >= start_image_idx: - history_images.append(step.screenshot_bytes) - - for history_idx, step in enumerate(steps): - should_include_image = history_idx >= start_image_idx - - if should_include_image and image_idx < len(history_images): - # Add image before assistant response - encoded = bytes_to_base64(history_images[image_idx]) - messages.append( - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{encoded}" - }, - } - ], - } - ) - image_idx += 1 - - # Add assistant response - history_response = mem2response(step) - messages.append( - { - "role": "assistant", - "content": [{"type": "text", "text": history_response}], - } - ) - - # Add ask_user_response if present (matches MAI-UI behavior) - if step.ask_user_response: - messages.append( - { - "role": "user", - "content": [{"type": "text", "text": step.ask_user_response}], - } - ) - - # Add current screenshot - current_encoded = bytes_to_base64(current_screenshot_bytes) - messages.append( - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{current_encoded}"}, - } - ], - } - ) - - return messages - - -# 
============================================================================= -# Action Execution -# ============================================================================= - - -async def execute_action( - tools, - action: Dict[str, Any], - screen_width: int, - screen_height: int, - llm=None, -) -> Tuple[bool, str]: - """ - Execute a MAI-UI action using DroidRun tools. - - Args: - tools: DroidRun Tools instance - action: Parsed action dictionary with normalized coordinates (0-1 range) - screen_width: Screen width in pixels - screen_height: Screen height in pixels - llm: LLM instance for intelligent app opening (AppStarter workflow) - - Returns: - Tuple of (success, result_message) - """ - action_type = action.get("action", "") - w, h = screen_width, screen_height - - try: - if action_type == "click": - # Coordinates are normalized (0-1), convert to pixels - coord = action.get("coordinate", [0, 0]) - x = int(coord[0] * w) - y = int(coord[1] * h) - result = await tools.tap_by_coordinates(x, y) - return True, f"click ({x},{y}): {result}" - - elif action_type == "long_press": - coord = action.get("coordinate", [0, 0]) - x = int(coord[0] * w) - y = int(coord[1] * h) - await tools.swipe(x, y, x, y, 1000) - return True, f"long_press ({x},{y})" - - elif action_type == "double_click": - coord = action.get("coordinate", [0, 0]) - x = int(coord[0] * w) - y = int(coord[1] * h) - await tools.tap_by_coordinates(x, y) - await asyncio.sleep(0.1) - await tools.tap_by_coordinates(x, y) - return True, f"double_click ({x},{y})" - - elif action_type == "type": - text = action.get("text", "") - result = await tools.input_text(text) - return True, ( - f"type '{text[:30]}...': {result}" - if len(text) > 30 - else f"type '{text}': {result}" - ) - - elif action_type == "swipe": - direction = action.get("direction", "up") - # Default to center if no coordinate provided (0.5, 0.5 normalized) - coord = action.get("coordinate", [0.5, 0.5]) - - # Start position (normalized to pixels) - sx 
= int(coord[0] * w) - sy = int(coord[1] * h) - - # Direction offsets (proportional to screen size) - offsets = { - "up": (0, -h // 3), - "down": (0, h // 3), - "left": (-w // 3, 0), - "right": (w // 3, 0), - } - dx, dy = offsets.get(direction, (0, 0)) - - # Clamp end coordinates to screen bounds - ex = max(0, min(w - 1, sx + dx)) - ey = max(0, min(h - 1, sy + dy)) - - await tools.swipe(sx, sy, ex, ey, 300) - return True, f"swipe {direction} from ({sx},{sy}) to ({ex},{ey})" - - elif action_type == "drag": - start_coord = action.get("start_coordinate", [0, 0]) - end_coord = action.get("end_coordinate", [0, 0]) - - sx = int(start_coord[0] * w) - sy = int(start_coord[1] * h) - ex = int(end_coord[0] * w) - ey = int(end_coord[1] * h) - - # Use longer duration for drag semantics - await tools.swipe(sx, sy, ex, ey, 2000) - return True, f"drag from ({sx},{sy}) to ({ex},{ey})" - - elif action_type == "open": - app_name = action.get("text", "") - if llm is not None: - # Use intelligent LLM-based app matching via AppStarter - workflow = AppStarter( - tools=tools, - llm=llm, - timeout=60, - verbose=False, - ) - result = await workflow.run(app_description=app_name) - await asyncio.sleep(1) - return True, f"open '{app_name}': {result}" - else: - # Fallback to simple name matching - package = await resolve_app_name(tools, app_name) - result = await tools.start_app(package) - return True, f"open '{app_name}' ({package}): {result}" - - elif action_type == "system_button": - button = action.get("button", "back") - keycodes = { - "back": 4, - "home": 3, - "enter": 66, - "menu": 82, - } - keycode = keycodes.get(button, 4) - result = await tools.press_key(keycode) - return True, f"{button}: {result}" - - elif action_type == "wait": - await asyncio.sleep(1.0) - return True, "wait 1s" - - elif action_type == "terminate": - # Handled in main loop - return True, "terminate" - - elif action_type == "answer": - # Handled in main loop - return True, f"answer: {action.get('text', '')}" - - elif 
action_type == "ask_user": - # Handled in main loop - returns special marker - return True, "ask_user" - - else: - return False, f"unknown action: {action_type}" - - except Exception as e: - logger.error(f"Action execution failed: {e}") - return False, f"error: {e}" - - -# ============================================================================= -# Main Run Function -# ============================================================================= - - -async def run( - tools, - instruction: str, - config: Dict[str, Any], - max_steps: int = 15, -) -> Dict[str, Any]: - """ - Run MAI-UI agent with exact MAI-UI behavior. - - Args: - tools: DroidRun Tools instance (AdbTools) - instruction: Task to complete - config: Configuration dictionary: - llm: Dict passed directly to load_llm() with: - provider: LLM provider (default: "OpenAI") - model, temperature, base_url, api_key, max_tokens, top_p, top_k, etc. - history_n: Number of history images (default: 3) - vision: Whether to use screenshots (default: True) - max_steps: Maximum iterations - - Returns: - Dictionary with: success, reason, steps, answer (if answer action) - """ - # Validate LLM config - must be provided by user - llm_cfg = config.get("llm") - if not llm_cfg or not isinstance(llm_cfg, dict): - raise ValueError( - "MAI-UI requires 'llm' configuration. " - "Please configure external_agents.mai_ui.llm in your config.yaml" - ) - - if "provider" not in llm_cfg: - raise ValueError( - "MAI-UI requires 'llm.provider' to be specified. " - "Example: provider: OpenAI" - ) - - if "model" not in llm_cfg: - raise ValueError( - "MAI-UI requires 'llm.model' to be specified. 
" "Example: model: mai-ui-8b" - ) - - # Load LLM - pass config directly to load_llm - llm_cfg = dict(llm_cfg) # Copy to avoid mutating - provider = llm_cfg.pop("provider") - llm = load_llm(provider, **llm_cfg) - - # Agent-specific configuration (defaults from DEFAULT_CONFIG) - history_n = config.get("history_n", DEFAULT_CONFIG["history_n"]) - - # Initialize trajectory memory - traj_memory = TrajMemory(task_goal=instruction) - - # Get available apps for prompt and render system prompt - # No MCP tools passed - the MCP section will not appear in the prompt - apps_list = await get_available_apps(tools) - system_prompt = MAI_MOBILE_SYS_PROMPT_TEMPLATE.render( - apps_list=apps_list, - tools=None, # No MCP tools - section won't render - ) - - logger.info(f"MAI-UI agent starting: {instruction}") - - for step in range(max_steps): - logger.info(f"Step {step + 1}/{max_steps}") - - # Get screen dimensions - try: - await tools.get_state() - w, h = tools.screen_width, tools.screen_height - except Exception as e: - logger.error(f"Failed to get state: {e}") - w, h = 1080, 2400 # Fallback dimensions - - # Take screenshot (MAI-UI is vision-based, always requires screenshots) - try: - _, screenshot_bytes = await tools.take_screenshot() - except Exception as e: - logger.error(f"Failed to take screenshot: {e}") - continue - - if not screenshot_bytes: - logger.error("No screenshot available") - continue - - # Build messages - messages = build_messages( - instruction=instruction, - system_prompt=system_prompt, - traj_memory=traj_memory, - current_screenshot_bytes=screenshot_bytes, - history_n=history_n, - ) - - # Call LLM - try: - response = await acall_with_retries(llm, to_chat_messages(messages)) - response_text = str(response) - except Exception as e: - logger.error(f"LLM call failed: {e}") - continue - - # Parse response - try: - parsed = parse_action(response_text) - thinking = parsed["thinking"] or "" - action_json = parsed["action_json"] - except ValueError as e: - 
logger.error(f"Failed to parse response: {e}") - logger.debug(f"Raw response: {response_text[:500]}") - continue - - logger.info(f"Thinking: {thinking[:150]}...") - logger.info(f"Action: {action_json.get('action', 'unknown')}") - - # Check for terminal actions first (before storing step) - action_type = action_json.get("action", "") - - if action_type == "terminate": - # Store step before returning - traj_step = TrajStep( - screenshot_bytes=screenshot_bytes, - prediction=response_text, - action=action_json, - thought=thinking, - step_index=step, - structured_action={"action_json": action_json}, - ) - traj_memory.steps.append(traj_step) - - success = action_json.get("status") == "success" - reason = action_json.get("message", "Task terminated") - logger.info(f"Terminated: success={success}, reason={reason}") - return {"success": success, "reason": reason, "steps": step + 1} - - if action_type == "answer": - # Store step before returning - traj_step = TrajStep( - screenshot_bytes=screenshot_bytes, - prediction=response_text, - action=action_json, - thought=thinking, - step_index=step, - structured_action={"action_json": action_json}, - ) - traj_memory.steps.append(traj_step) - - answer_text = action_json.get("text", "") - logger.info(f"Answer: {answer_text}") - return { - "success": True, - "reason": "Task completed with answer", - "steps": step + 1, - "answer": answer_text, - } - - if action_type == "ask_user": - # Get user input via stdin - question = action_json.get("text", "Please provide input:") - logger.info(f"🤖 Agent asks: {question}") - user_response = input("Your response: ").strip() - logger.info(f"User response: {user_response}") - - # Store step with ask_user_response - traj_step = TrajStep( - screenshot_bytes=screenshot_bytes, - prediction=response_text, - action=action_json, - thought=thinking, - step_index=step, - structured_action={"action_json": action_json}, - ask_user_response=user_response, - ) - traj_memory.steps.append(traj_step) - - # Continue 
to next iteration (no device action needed) - await asyncio.sleep(0.5) - continue - - # Store step in trajectory (for non-terminal, non-ask_user actions) - # action_json already has normalized coordinates (0-1) from parse_action - traj_step = TrajStep( - screenshot_bytes=screenshot_bytes, - prediction=response_text, - action=action_json, - thought=thinking, - step_index=step, - structured_action={"action_json": action_json}, - ) - traj_memory.steps.append(traj_step) - - # Execute action - success, result_msg = await execute_action(tools, action_json, w, h, llm) - logger.info(f"Execution: {result_msg}") - - # Brief pause between steps - await asyncio.sleep(0.5) - - # Max steps reached - return {"success": False, "reason": "Max steps reached", "steps": max_steps} diff --git a/droidrun/cli/main.py b/droidrun/cli/main.py index 1617a78..de76719 100644 --- a/droidrun/cli/main.py +++ b/droidrun/cli/main.py @@ -34,8 +34,9 @@ from droidrun.portal import ( ping_portal_tcp, setup_portal, ) -from droidrun.telemetry import print_telemetry_message +from droidrun.agent.external import list_agents from droidrun.agent.utils.llm_picker import load_llm +from droidrun.telemetry import print_telemetry_message # Suppress all warnings warnings.filterwarnings("ignore") @@ -68,6 +69,7 @@ async def run_command( command: str, config_path: str | None = None, device: str | None = None, + agent: str | None = None, provider: str | None = None, model: str | None = None, steps: int | None = None, @@ -140,6 +142,8 @@ async def run_command( config.agent.fast_agent.vision = fast_agent_vision # Agent overrides + if agent is not None: + config.agent.name = agent if steps is not None: config.agent.max_steps = steps if reasoning is not None: @@ -314,10 +318,21 @@ def cli(): pass +_available_agents = list_agents() + + @cli.command() @click.argument("command", type=str) @click.option("--config", "-c", help="Path to custom config file", default=None) @click.option("--device", "-d", help="Device serial 
number or IP address", default=None) +@click.option( + "--agent", + "-a", + type=click.Choice(_available_agents) if _available_agents else None, + help="External agent to use" + + (f" [{', '.join(_available_agents)}]" if _available_agents else " (none available)"), + default=None, +) @click.option( "--provider", "-p", @@ -377,6 +392,7 @@ async def run( command: str, config: str | None, device: str | None, + agent: str | None, provider: str | None, model: str | None, steps: int | None, @@ -399,6 +415,7 @@ async def run( command=command, config_path=config, device=device, + agent=agent, provider=provider, model=model, steps=steps, diff --git a/droidrun/config_example.yaml b/droidrun/config_example.yaml index 0f8ab3b..919842e 100644 --- a/droidrun/config_example.yaml +++ b/droidrun/config_example.yaml @@ -1,12 +1,10 @@ # DroidRun Configuration File # This file is auto-generated. Edit values as needed. -_version: 4 +_version: 5 # === Agent Settings === agent: - # Agent to use: "droidrun" (native) or external: "mai_ui", "autoglm" - name: droidrun # Maximum number of steps per task max_steps: 15 # Enable planning with reasoning mode @@ -214,48 +212,10 @@ mcp: # enabled: true # === External Agent Settings === -# External agents are selected via agent.name above. -# Set agent.name to "mai_ui" or "autoglm" to use external agents. -# Settings below are merged with agent-specific defaults. +# External agents coming soon. See docs for how to add custom agents. 
def migrate(config: Dict[str, Any]) -> Dict[str, Any]:
    """Remove the legacy ``mai_ui`` and ``autoglm`` external agents from *config*.

    The mapping is modified in place and the same object is returned.

    * ``external_agents``: the ``mai_ui`` and ``autoglm`` entries are dropped;
      any other (user-added) entries are left untouched.
    * ``agent.name``: removed only when it names one of the dropped agents.

    A section that is missing, or whose value is not a mapping (for example
    ``None``), is skipped rather than raising, so a sparse or partially
    hand-edited config does not abort the migration.

    Args:
        config: Parsed configuration mapping to migrate.

    Returns:
        The same ``config`` object, after migration.
    """
    legacy_agents = ("mai_ui", "autoglm")

    # Remove only the known legacy agent configs, leave user-added ones intact.
    external_agents = config.get("external_agents")
    if isinstance(external_agents, dict):
        for legacy_name in legacy_agents:
            external_agents.pop(legacy_name, None)

    # Reset agent.name only if it pointed at a removed agent. The isinstance
    # guard mirrors the one above: calling .get() on a non-mapping value
    # (e.g. None) would raise AttributeError.
    agent = config.get("agent")
    if isinstance(agent, dict) and agent.get("name") in legacy_agents:
        agent.pop("name", None)

    return config