refactor!: redesign external agent system to use raw ADB access

External agents now receive a raw async_adbutils.AdbDevice instead of internal DroidRun tools. This makes them fully self-contained — zero imports from droidrun, just copy-paste a file or folder and it works.

- Remove mai_ui and autoglm agents (depended on internals)
- Pass raw AdbDevice directly, skip Portal/driver/registry setup
- Add --agent CLI flag with dynamic discovery from external/ dir
- Add list_agents() for runtime agent enumeration
- Add config migration v005 to clean up legacy agent entries
- Update config_example.yaml to remove old agent references

BREAKING CHANGE: external agent run() signature changed from (tools, instruction, config, max_steps) to (device, instruction, config, max_steps) where device is an async_adbutils.AdbDevice
2026-05-23 07:40:37 +00:00 · 2026-03-25 21:42:28 +11:00
parent e3cc1ec154
commit eb047e7c38
8 changed files with 162 additions and 2025 deletions
@@ -260,7 +260,10 @@ class DroidAgent(Workflow):
            self.app_opener_llm = None
            self.structured_output_llm = None

-        if self.config.logging.save_trajectory != "none":
+        if (
+            not self._using_external_agent
+            and self.config.logging.save_trajectory != "none"
+        ):
            self.trajectory = Trajectory(
                goal=self.shared_state.instruction,
                base_path=self.config.logging.trajectory_path,
@@ -371,6 +374,64 @@ class DroidAgent(Workflow):
        if self.trajectory_writer:
            await self.trajectory_writer.start()

+        # ── 0. External agent — early exit ────────────────────────────
+        if self._using_external_agent:
+            agent_name = self.config.agent.name
+
+            # Load the agent module
+            agent_module = load_agent(agent_name)
+            if not agent_module:
+                from droidrun.agent.external import list_agents
+
+                available = list_agents()
+                if available:
+                    agents_str = ", ".join(available)
+                    raise ValueError(
+                        f"Failed to load external agent '{agent_name}'.\n"
+                        f"Available agents: {agents_str}"
+                    )
+                raise ValueError(
+                    f"External agent '{agent_name}' not found.\n"
+                    "No external agents are currently installed.\n"
+                    "Run: droidrun run --help  to see available agents."
+                )
+
+            # Resolve config
+            agent_config = self.config.external_agents.get(agent_name)
+            if not agent_config:
+                raise ValueError(
+                    f"No configuration found for agent '{agent_name}'.\n\n"
+                    "Add to your config.yaml:\n\n"
+                    "  external_agents:\n"
+                    f"    {agent_name}:\n"
+                    '      api_key: "your-api-key"\n'
+                    '      model: "model-name"\n'
+                    "      # ... any settings your agent needs"
+                )
+
+            final_config = {**agent_module["config"], **agent_config}
+
+            # Resolve device serial and get raw AdbDevice
+            device_serial = self.resolved_device_config.serial
+            if device_serial is None:
+                devices = await adb.list()
+                if not devices:
+                    raise ValueError("No connected Android devices found.")
+                device_serial = devices[0].serial
+
+            adb_device = await adb.device(serial=device_serial)
+
+            logger.info(f"🤖 Using external agent: {agent_name}")
+
+            result = await agent_module["run"](
+                device=adb_device,
+                instruction=self.shared_state.instruction,
+                config=final_config,
+                max_steps=self.config.agent.max_steps,
+            )
+
+            return FinalizeEvent(success=result["success"], reason=result["reason"])
+
        # ── 1. Create driver ──────────────────────────────────────────
        if self.config.agent.reasoning:
            vision_enabled = self.config.agent.manager.vision
@@ -530,33 +591,6 @@ class DroidAgent(Workflow):
        # ── 6. Fetch device date once ─────────────────────────────────
        self.shared_state.device_date = await driver.get_date()

-        # ── 7. External agent mode ────────────────────────────────────
-        if self._using_external_agent:
-            agent_name = self.config.agent.name
-            agent_module = load_agent(agent_name)
-            if not agent_module:
-                raise ValueError(f"Failed to load external agent: {agent_name}")
-
-            agent_config = self.config.external_agents.get(agent_name)
-            if not agent_config:
-                raise ValueError(
-                    f"No config found for agent '{agent_name}' in external_agents section"
-                )
-
-            final_config = {**agent_module["config"], **agent_config}
-
-            logger.info(f"🤖 Using external agent: {agent_name}")
-
-            result = await agent_module["run"](
-                driver=self.driver,
-                action_ctx=self.action_ctx,
-                instruction=self.shared_state.instruction,
-                config=final_config,
-                max_steps=self.config.agent.max_steps,
-            )
-
-            return FinalizeEvent(success=result["success"], reason=result["reason"])
-
        if self.config.logging.save_trajectory != "none":
            self.trajectory_writer.write(self.trajectory, stage="init")

@@ -1,28 +1,71 @@
-"""External agent loader - dynamic imports."""
+"""External agent loader — dynamic imports.
+
+External agents are self-contained modules that receive raw ADB access
+via ``async_adbutils.AdbDevice``. They bring their own LLM client, prompts,
+parsing, and action loop — zero imports from ``droidrun``.
+
+An external agent can be either:
+- A single file: ``droidrun/agent/external/my_agent.py``
+- A package:     ``droidrun/agent/external/my_agent/__init__.py``
+
+Required contract::
+
+    from async_adbutils import AdbDevice
+
+    async def run(
+        device: AdbDevice,       # raw ADB, already connected
+        instruction: str,        # the task
+        config: dict,            # from external_agents.<name> in config.yaml
+        max_steps: int,          # step limit
+    ) -> dict:                   # {"success": bool, "reason": str, "steps": int}
+
+Optional: ``DEFAULT_CONFIG: dict`` — merged under the user's config.
+"""
+
+from __future__ import annotations

 import importlib
 import logging
-from typing import Any, Callable, Dict, Optional, TypedDict
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, TypedDict

 logger = logging.getLogger("droidrun")

+_EXTERNAL_DIR = Path(__file__).parent
+

 class ExternalAgentModule(TypedDict):
-    """Type for loaded external agent module."""
+    """Type for a loaded external agent module."""

    run: Callable
    config: Dict[str, Any]


-def load_agent(name: str) -> Optional[ExternalAgentModule]:
-    """
-    Dynamically load an external agent by name.
-
-    Args:
-        name: Agent module name (e.g., "mai_ui", "autoglm")
+def list_agents() -> List[str]:
+    """Discover available external agents by scanning the external/ directory.

    Returns:
-        Dict with 'run' function and 'config' defaults, or None if failed.
+        Sorted list of agent names (module stems or package directory names).
+    """
+    agents: list[str] = []
+    for item in _EXTERNAL_DIR.iterdir():
+        if item.name.startswith(("_", ".")):
+            continue
+        if item.is_file() and item.suffix == ".py":
+            agents.append(item.stem)
+        elif item.is_dir() and (item / "__init__.py").exists():
+            agents.append(item.name)
+    return sorted(agents)
+
+
+def load_agent(name: str) -> Optional[ExternalAgentModule]:
+    """Dynamically load an external agent by name.
+
+    Args:
+        name: Agent module name (e.g., ``"my_agent"``).
+
+    Returns:
+        Dict with ``run`` function and ``config`` defaults, or *None* on failure.
    """
    try:
        module = importlib.import_module(f"droidrun.agent.external.{name}")
@@ -1,819 +0,0 @@
-"""MAI-UI External Agent - Exact implementation matching MAI-UI prompts and behavior.
-
-This agent replicates MAI-UI's exact prompts, message building, and trajectory
-management while using DroidRun's AdbTools for execution.
-"""
-
-import asyncio
-import base64
-import copy
-import json
-import logging
-import re
-from dataclasses import dataclass, field
-from io import BytesIO
-from typing import Any, Dict, List, Optional, Tuple
-
-from PIL import Image
-from jinja2 import Template
-
-from droidrun.agent.oneflows.app_starter_workflow import AppStarter
-from droidrun.agent.utils.chat_utils import to_chat_messages
-from droidrun.agent.utils.inference import acall_with_retries
-from droidrun.agent.utils.llm_picker import load_llm
-
-logger = logging.getLogger("droidrun")
-
-# =============================================================================
-# Constants
-# =============================================================================
-
-SCALE_FACTOR = 999
-
-# =============================================================================
-# Default Configuration (agent-specific only, NOT LLM)
-# =============================================================================
-
-DEFAULT_CONFIG: Dict[str, Any] = {
-    # Agent-specific settings matching MAI-UI defaults
-    "history_n": 3,  # Number of history steps with images
-    # Note: vision is always True for MAI-UI (screenshot-based agent)
-}
-
-
-# =============================================================================
-# Trajectory Memory (matches MAI-UI's unified_memory.py)
-# =============================================================================
-
-
-@dataclass
-class TrajStep:
-    """
-    Single step in an agent's trajectory.
-
-    Attributes:
-        screenshot_bytes: Screenshot as PNG bytes
-        prediction: Raw LLM response text
-        action: Parsed action dictionary
-        thought: Extracted thinking/reasoning
-        step_index: Index of this step
-        structured_action: {"action_json": action} for history reconstruction
-        ask_user_response: Response from user when ask_user action was used
-    """
-
-    screenshot_bytes: bytes
-    prediction: str
-    action: Dict[str, Any]
-    thought: str
-    step_index: int
-    structured_action: Dict[str, Any]
-    ask_user_response: Optional[str] = None
-
-
-@dataclass
-class TrajMemory:
-    """
-    Container for complete trajectory.
-
-    Attributes:
-        task_goal: The instruction/goal for this trajectory
-        steps: List of trajectory steps
-    """
-
-    task_goal: str
-    steps: List[TrajStep] = field(default_factory=list)
-
-
-# =============================================================================
-# System Prompt (exact MAI-UI prompt with MCP template - renders without MCP when no tools passed)
-# =============================================================================
-
-# fmt: off
-MAI_MOBILE_SYS_PROMPT_TEMPLATE = Template(
-    "You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n"
-    "\n"
-    "## Output Format\n"
-    "For each function call, return the thinking process in <thinking> </thinking> tags, and a json object with function name and arguments within <tool_call></tool_call> XML tags:\n"
-    "```\n"
-    "<thinking>\n"
-    "...\n"
-    "</thinking>\n"
-    "<tool_call>\n"
-    "{\"name\": \"mobile_use\", \"arguments\": <args-json-object>}\n"
-    "</tool_call>\n"
-    "```\n"
-    "\n"
-    "## Action Space\n"
-    "\n"
-    "{\"action\": \"click\", \"coordinate\": [x, y]}\n"
-    "{\"action\": \"long_press\", \"coordinate\": [x, y]}\n"
-    "{\"action\": \"type\", \"text\": \"\"}\n"
-    "{\"action\": \"swipe\", \"direction\": \"up or down or left or right\", \"coordinate\": [x, y]} # \"coordinate\" is optional. Use the \"coordinate\" if you want to swipe a specific UI element.\n"
-    "{\"action\": \"open\", \"text\": \"app_name\"}\n"
-    "{\"action\": \"drag\", \"start_coordinate\": [x1, y1], \"end_coordinate\": [x2, y2]}\n"
-    "{\"action\": \"system_button\", \"button\": \"button_name\"} # Options: back, home, menu, enter \n"
-    "{\"action\": \"wait\"}\n"
-    "{\"action\": \"terminate\", \"status\": \"success or fail\"} \n"
-    "{\"action\": \"answer\", \"text\": \"xxx\"} # Use escape characters \\', \\\", and \\n in text part to ensure we can parse the text in normal python string format.\n"
-    "{\"action\": \"ask_user\", \"text\": \"xxx\"} # you can ask user for more information to complete the task.\n"
-    "{\"action\": \"double_click\", \"coordinate\": [x, y]}\n"
-    "\n"
-    "{% if tools %}"
-    "## MCP Tools\n"
-    "You are also provided with MCP tools, you can use them to complete the task.\n"
-    "{{ tools }}\n"
-    "\n"
-    "If you want to use MCP tools, you must output as the following format:\n"
-    "```\n"
-    "<thinking>\n"
-    "...\n"
-    "</thinking>\n"
-    "<tool_call>\n"
-    "{\"name\": <function-name>, \"arguments\": <args-json-object>}\n"
-    "</tool_call>\n"
-    "```\n"
-    "{% endif %}"
-    "## Note\n"
-    "- Available Apps: `{{ apps_list }}`.\n"
-    "- Write a small plan and finally summarize your next action (with its target element) in one sentence in <thinking></thinking> part."
-)
-# fmt: on
-
-
-# =============================================================================
-# Parsing Functions (matches MAI-UI's parsing)
-# =============================================================================
-
-
-def parse_tagged_text(text: str) -> Dict[str, Any]:
-    """
-    Parse text containing <thinking> and <tool_call> tags.
-
-    Handles both standard format and thinking model format (</think>).
-
-    Args:
-        text: Raw model output
-
-    Returns:
-        Dictionary with "thinking" and "tool_call" keys
-    """
-    text = text.strip()
-
-    # Handle thinking model output format (uses </think> instead of </thinking>)
-    if "</think>" in text and "</thinking>" not in text:
-        text = text.replace("</think>", "</thinking>")
-        text = "<thinking>" + text
-
-    result: Dict[str, Any] = {
-        "thinking": None,
-        "tool_call": None,
-    }
-
-    # Extract thinking content
-    think_pattern = r"<thinking>(.*?)</thinking>"
-    think_match = re.search(think_pattern, text, re.DOTALL)
-    if think_match:
-        result["thinking"] = think_match.group(1).strip()
-
-    # Extract tool_call content
-    call_pattern = r"<tool_call>(.*?)</tool_call>"
-    call_match = re.search(call_pattern, text, re.DOTALL)
-    if call_match:
-        try:
-            result["tool_call"] = json.loads(call_match.group(1).strip())
-        except json.JSONDecodeError:
-            result["tool_call"] = None
-
-    return result
-
-
-def parse_action(text: str) -> Dict[str, Any]:
-    """
-    Parse model output into structured action format.
-
-    Normalizes coordinates from SCALE_FACTOR (0-999) to 0-1 range,
-    matching MAI-UI's parse_action_to_structure_output behavior.
-
-    Args:
-        text: Raw model output
-
-    Returns:
-        Dictionary with "thinking" and "action_json" keys
-
-    Raises:
-        ValueError: If parsing fails
-    """
-    parsed = parse_tagged_text(text)
-
-    if not parsed["tool_call"]:
-        raise ValueError("No valid tool_call found in response")
-
-    action = parsed["tool_call"].get("arguments", {})
-
-    # Normalize coordinates from SCALE_FACTOR range to [0, 1]
-    # This matches MAI-UI's parse_action_to_structure_output behavior
-    for coord_key in ["coordinate", "start_coordinate", "end_coordinate"]:
-        if coord_key in action:
-            coordinates = action[coord_key]
-            if len(coordinates) == 2:
-                point_x, point_y = coordinates
-            elif len(coordinates) == 4:
-                # Handle bounding box format (x1, y1, x2, y2) -> center point
-                x1, y1, x2, y2 = coordinates
-                point_x = (x1 + x2) / 2
-                point_y = (y1 + y2) / 2
-            else:
-                raise ValueError(
-                    f"Invalid {coord_key} format: expected 2 or 4 values, got {len(coordinates)}"
-                )
-            action[coord_key] = [point_x / SCALE_FACTOR, point_y / SCALE_FACTOR]
-
-    return {
-        "thinking": parsed["thinking"],
-        "action_json": action,
-    }
-
-
-# =============================================================================
-# Helper Functions
-# =============================================================================
-
-
-def pil_to_base64(image: Image.Image) -> str:
-    """Convert PIL Image to base64 string."""
-    buffer = BytesIO()
-    image.save(buffer, format="PNG")
-    return base64.b64encode(buffer.getvalue()).decode("utf-8")
-
-
-def bytes_to_base64(image_bytes: bytes) -> str:
-    image = Image.open(BytesIO(image_bytes))
-    if image.mode != "RGB":
-        image = image.convert("RGB")
-    buffer = BytesIO()
-    image.save(buffer, format="PNG")
-    return base64.b64encode(buffer.getvalue()).decode("utf-8")
-
-
-async def resolve_app_name(tools, app_name: str) -> str:
-    """
-    Resolve friendly app name to package name.
-
-    Args:
-        tools: DroidRun Tools instance
-        app_name: Friendly app name (e.g., "Settings", "Chrome")
-
-    Returns:
-        Package name (e.g., "com.android.settings")
-    """
-    try:
-        apps = await tools.get_apps(include_system=True)
-
-        # Try exact label match (case-insensitive)
-        for app in apps:
-            if app.get("label", "").lower() == app_name.lower():
-                return app["package"]
-
-        # Try partial match
-        for app in apps:
-            if app_name.lower() in app.get("label", "").lower():
-                return app["package"]
-
-        # Return as-is (might already be a package name)
-        return app_name
-
-    except Exception as e:
-        logger.warning(f"Failed to resolve app name '{app_name}': {e}")
-        return app_name
-
-
-async def get_available_apps(tools) -> str:
-    """
-    Get list of available apps for the prompt.
-
-    Returns:
-        Formatted string of app names
-    """
-    try:
-        apps = await tools.get_apps(include_system=False)
-        app_names = [app.get("label", app.get("package", "")) for app in apps[:30]]
-        return json.dumps(app_names)
-    except Exception:
-        # Fallback to generic list
-        return '["Settings", "Chrome", "Camera", "Files", "Contacts", "Messages", "Phone", "Calendar", "Clock", "Calculator"]'
-
-
-# =============================================================================
-# Message Building (matches MAI-UI's _build_messages)
-# =============================================================================
-
-
-def mem2response(step: TrajStep) -> str:
-    """
-    Reconstruct assistant response from trajectory step.
-
-    Converts stored action back to the format the LLM expects in history.
-
-    Args:
-        step: Trajectory step
-
-    Returns:
-        Formatted response string with <thinking> and <tool_call> tags
-    """
-    thinking = step.thought or ""
-    structured_action = step.structured_action
-
-    if not structured_action:
-        return f"<thinking>\n{thinking}\n</thinking>\n<tool_call>\n{{}}\n</tool_call>"
-
-    action_json = copy.deepcopy(structured_action.get("action_json", {}))
-
-    # Convert normalized coordinates back to SCALE_FACTOR range for history
-    # NOTE: Original MAI-UI only converts "coordinate", NOT start_coordinate/end_coordinate
-    # This matches the behavior in mai_naivigation_agent.py mem2response()
-    if "coordinate" in action_json:
-        coords = action_json["coordinate"]
-        if len(coords) == 2:
-            # Coordinates are stored normalized (0-1), convert to 0-999
-            action_json["coordinate"] = [
-                int(coords[0] * SCALE_FACTOR),
-                int(coords[1] * SCALE_FACTOR),
-            ]
-
-    tool_call_dict = {
-        "name": "mobile_use",
-        "arguments": action_json,
-    }
-    tool_call_json = json.dumps(tool_call_dict, separators=(",", ":"))
-
-    return f"<thinking>\n{thinking}\n</thinking>\n<tool_call>\n{tool_call_json}\n</tool_call>"
-
-
-def build_messages(
-    instruction: str,
-    system_prompt: str,
-    traj_memory: TrajMemory,
-    current_screenshot_bytes: bytes,
-    history_n: int = 3,
-) -> List[Dict[str, Any]]:
-    """
-    Build multi-turn messages matching MAI-UI's format.
-
-    Message structure:
-    1. System prompt
-    2. User instruction
-    3. For each history step:
-       - Image (only for last history_n-1 steps)
-       - Assistant response
-    4. Current screenshot
-
-    Args:
-        instruction: Task instruction
-        system_prompt: System prompt text
-        traj_memory: Trajectory memory with history
-        current_screenshot_bytes: Current screenshot as bytes
-        history_n: Number of history images to include
-
-    Returns:
-        List of message dictionaries
-    """
-    messages = [
-        {
-            "role": "system",
-            "content": [{"type": "text", "text": system_prompt}],
-        },
-        {
-            "role": "user",
-            "content": [{"type": "text", "text": instruction}],
-        },
-    ]
-
-    steps = traj_memory.steps
-    image_idx = 0
-
-    if len(steps) > 0:
-        # Calculate which steps get images (last history_n - 1 steps)
-        start_image_idx = max(0, len(steps) - (history_n - 1))
-
-        # Collect history images
-        history_images = []
-        for i, step in enumerate(steps):
-            if i >= start_image_idx:
-                history_images.append(step.screenshot_bytes)
-
-        for history_idx, step in enumerate(steps):
-            should_include_image = history_idx >= start_image_idx
-
-            if should_include_image and image_idx < len(history_images):
-                # Add image before assistant response
-                encoded = bytes_to_base64(history_images[image_idx])
-                messages.append(
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": f"data:image/png;base64,{encoded}"
-                                },
-                            }
-                        ],
-                    }
-                )
-                image_idx += 1
-
-            # Add assistant response
-            history_response = mem2response(step)
-            messages.append(
-                {
-                    "role": "assistant",
-                    "content": [{"type": "text", "text": history_response}],
-                }
-            )
-
-            # Add ask_user_response if present (matches MAI-UI behavior)
-            if step.ask_user_response:
-                messages.append(
-                    {
-                        "role": "user",
-                        "content": [{"type": "text", "text": step.ask_user_response}],
-                    }
-                )
-
-    # Add current screenshot
-    current_encoded = bytes_to_base64(current_screenshot_bytes)
-    messages.append(
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {"url": f"data:image/png;base64,{current_encoded}"},
-                }
-            ],
-        }
-    )
-
-    return messages
-
-
-# =============================================================================
-# Action Execution
-# =============================================================================
-
-
-async def execute_action(
-    tools,
-    action: Dict[str, Any],
-    screen_width: int,
-    screen_height: int,
-    llm=None,
-) -> Tuple[bool, str]:
-    """
-    Execute a MAI-UI action using DroidRun tools.
-
-    Args:
-        tools: DroidRun Tools instance
-        action: Parsed action dictionary with normalized coordinates (0-1 range)
-        screen_width: Screen width in pixels
-        screen_height: Screen height in pixels
-        llm: LLM instance for intelligent app opening (AppStarter workflow)
-
-    Returns:
-        Tuple of (success, result_message)
-    """
-    action_type = action.get("action", "")
-    w, h = screen_width, screen_height
-
-    try:
-        if action_type == "click":
-            # Coordinates are normalized (0-1), convert to pixels
-            coord = action.get("coordinate", [0, 0])
-            x = int(coord[0] * w)
-            y = int(coord[1] * h)
-            result = await tools.tap_by_coordinates(x, y)
-            return True, f"click ({x},{y}): {result}"
-
-        elif action_type == "long_press":
-            coord = action.get("coordinate", [0, 0])
-            x = int(coord[0] * w)
-            y = int(coord[1] * h)
-            await tools.swipe(x, y, x, y, 1000)
-            return True, f"long_press ({x},{y})"
-
-        elif action_type == "double_click":
-            coord = action.get("coordinate", [0, 0])
-            x = int(coord[0] * w)
-            y = int(coord[1] * h)
-            await tools.tap_by_coordinates(x, y)
-            await asyncio.sleep(0.1)
-            await tools.tap_by_coordinates(x, y)
-            return True, f"double_click ({x},{y})"
-
-        elif action_type == "type":
-            text = action.get("text", "")
-            result = await tools.input_text(text)
-            return True, (
-                f"type '{text[:30]}...': {result}"
-                if len(text) > 30
-                else f"type '{text}': {result}"
-            )
-
-        elif action_type == "swipe":
-            direction = action.get("direction", "up")
-            # Default to center if no coordinate provided (0.5, 0.5 normalized)
-            coord = action.get("coordinate", [0.5, 0.5])
-
-            # Start position (normalized to pixels)
-            sx = int(coord[0] * w)
-            sy = int(coord[1] * h)
-
-            # Direction offsets (proportional to screen size)
-            offsets = {
-                "up": (0, -h // 3),
-                "down": (0, h // 3),
-                "left": (-w // 3, 0),
-                "right": (w // 3, 0),
-            }
-            dx, dy = offsets.get(direction, (0, 0))
-
-            # Clamp end coordinates to screen bounds
-            ex = max(0, min(w - 1, sx + dx))
-            ey = max(0, min(h - 1, sy + dy))
-
-            await tools.swipe(sx, sy, ex, ey, 300)
-            return True, f"swipe {direction} from ({sx},{sy}) to ({ex},{ey})"
-
-        elif action_type == "drag":
-            start_coord = action.get("start_coordinate", [0, 0])
-            end_coord = action.get("end_coordinate", [0, 0])
-
-            sx = int(start_coord[0] * w)
-            sy = int(start_coord[1] * h)
-            ex = int(end_coord[0] * w)
-            ey = int(end_coord[1] * h)
-
-            # Use longer duration for drag semantics
-            await tools.swipe(sx, sy, ex, ey, 2000)
-            return True, f"drag from ({sx},{sy}) to ({ex},{ey})"
-
-        elif action_type == "open":
-            app_name = action.get("text", "")
-            if llm is not None:
-                # Use intelligent LLM-based app matching via AppStarter
-                workflow = AppStarter(
-                    tools=tools,
-                    llm=llm,
-                    timeout=60,
-                    verbose=False,
-                )
-                result = await workflow.run(app_description=app_name)
-                await asyncio.sleep(1)
-                return True, f"open '{app_name}': {result}"
-            else:
-                # Fallback to simple name matching
-                package = await resolve_app_name(tools, app_name)
-                result = await tools.start_app(package)
-                return True, f"open '{app_name}' ({package}): {result}"
-
-        elif action_type == "system_button":
-            button = action.get("button", "back")
-            keycodes = {
-                "back": 4,
-                "home": 3,
-                "enter": 66,
-                "menu": 82,
-            }
-            keycode = keycodes.get(button, 4)
-            result = await tools.press_key(keycode)
-            return True, f"{button}: {result}"
-
-        elif action_type == "wait":
-            await asyncio.sleep(1.0)
-            return True, "wait 1s"
-
-        elif action_type == "terminate":
-            # Handled in main loop
-            return True, "terminate"
-
-        elif action_type == "answer":
-            # Handled in main loop
-            return True, f"answer: {action.get('text', '')}"
-
-        elif action_type == "ask_user":
-            # Handled in main loop - returns special marker
-            return True, "ask_user"
-
-        else:
-            return False, f"unknown action: {action_type}"
-
-    except Exception as e:
-        logger.error(f"Action execution failed: {e}")
-        return False, f"error: {e}"
-
-
-# =============================================================================
-# Main Run Function
-# =============================================================================
-
-
-async def run(
-    tools,
-    instruction: str,
-    config: Dict[str, Any],
-    max_steps: int = 15,
-) -> Dict[str, Any]:
-    """
-    Run MAI-UI agent with exact MAI-UI behavior.
-
-    Args:
-        tools: DroidRun Tools instance (AdbTools)
-        instruction: Task to complete
-        config: Configuration dictionary:
-            llm: Dict passed directly to load_llm() with:
-                provider: LLM provider (default: "OpenAI")
-                model, temperature, base_url, api_key, max_tokens, top_p, top_k, etc.
-            history_n: Number of history images (default: 3)
-            vision: Whether to use screenshots (default: True)
-        max_steps: Maximum iterations
-
-    Returns:
-        Dictionary with: success, reason, steps, answer (if answer action)
-    """
-    # Validate LLM config - must be provided by user
-    llm_cfg = config.get("llm")
-    if not llm_cfg or not isinstance(llm_cfg, dict):
-        raise ValueError(
-            "MAI-UI requires 'llm' configuration. "
-            "Please configure external_agents.mai_ui.llm in your config.yaml"
-        )
-
-    if "provider" not in llm_cfg:
-        raise ValueError(
-            "MAI-UI requires 'llm.provider' to be specified. "
-            "Example: provider: OpenAI"
-        )
-
-    if "model" not in llm_cfg:
-        raise ValueError(
-            "MAI-UI requires 'llm.model' to be specified. " "Example: model: mai-ui-8b"
-        )
-
-    # Load LLM - pass config directly to load_llm
-    llm_cfg = dict(llm_cfg)  # Copy to avoid mutating
-    provider = llm_cfg.pop("provider")
-    llm = load_llm(provider, **llm_cfg)
-
-    # Agent-specific configuration (defaults from DEFAULT_CONFIG)
-    history_n = config.get("history_n", DEFAULT_CONFIG["history_n"])
-
-    # Initialize trajectory memory
-    traj_memory = TrajMemory(task_goal=instruction)
-
-    # Get available apps for prompt and render system prompt
-    # No MCP tools passed - the MCP section will not appear in the prompt
-    apps_list = await get_available_apps(tools)
-    system_prompt = MAI_MOBILE_SYS_PROMPT_TEMPLATE.render(
-        apps_list=apps_list,
-        tools=None,  # No MCP tools - section won't render
-    )
-
-    logger.info(f"MAI-UI agent starting: {instruction}")
-
-    for step in range(max_steps):
-        logger.info(f"Step {step + 1}/{max_steps}")
-
-        # Get screen dimensions
-        try:
-            await tools.get_state()
-            w, h = tools.screen_width, tools.screen_height
-        except Exception as e:
-            logger.error(f"Failed to get state: {e}")
-            w, h = 1080, 2400  # Fallback dimensions
-
-        # Take screenshot (MAI-UI is vision-based, always requires screenshots)
-        try:
-            _, screenshot_bytes = await tools.take_screenshot()
-        except Exception as e:
-            logger.error(f"Failed to take screenshot: {e}")
-            continue
-
-        if not screenshot_bytes:
-            logger.error("No screenshot available")
-            continue
-
-        # Build messages
-        messages = build_messages(
-            instruction=instruction,
-            system_prompt=system_prompt,
-            traj_memory=traj_memory,
-            current_screenshot_bytes=screenshot_bytes,
-            history_n=history_n,
-        )
-
-        # Call LLM
-        try:
-            response = await acall_with_retries(llm, to_chat_messages(messages))
-            response_text = str(response)
-        except Exception as e:
-            logger.error(f"LLM call failed: {e}")
-            continue
-
-        # Parse response
-        try:
-            parsed = parse_action(response_text)
-            thinking = parsed["thinking"] or ""
-            action_json = parsed["action_json"]
-        except ValueError as e:
-            logger.error(f"Failed to parse response: {e}")
-            logger.debug(f"Raw response: {response_text[:500]}")
-            continue
-
-        logger.info(f"Thinking: {thinking[:150]}...")
-        logger.info(f"Action: {action_json.get('action', 'unknown')}")
-
-        # Check for terminal actions first (before storing step)
-        action_type = action_json.get("action", "")
-
-        if action_type == "terminate":
-            # Store step before returning
-            traj_step = TrajStep(
-                screenshot_bytes=screenshot_bytes,
-                prediction=response_text,
-                action=action_json,
-                thought=thinking,
-                step_index=step,
-                structured_action={"action_json": action_json},
-            )
-            traj_memory.steps.append(traj_step)
-
-            success = action_json.get("status") == "success"
-            reason = action_json.get("message", "Task terminated")
-            logger.info(f"Terminated: success={success}, reason={reason}")
-            return {"success": success, "reason": reason, "steps": step + 1}
-
-        if action_type == "answer":
-            # Store step before returning
-            traj_step = TrajStep(
-                screenshot_bytes=screenshot_bytes,
-                prediction=response_text,
-                action=action_json,
-                thought=thinking,
-                step_index=step,
-                structured_action={"action_json": action_json},
-            )
-            traj_memory.steps.append(traj_step)
-
-            answer_text = action_json.get("text", "")
-            logger.info(f"Answer: {answer_text}")
-            return {
-                "success": True,
-                "reason": "Task completed with answer",
-                "steps": step + 1,
-                "answer": answer_text,
-            }
-
-        if action_type == "ask_user":
-            # Get user input via stdin
-            question = action_json.get("text", "Please provide input:")
-            logger.info(f"🤖 Agent asks: {question}")
-            user_response = input("Your response: ").strip()
-            logger.info(f"User response: {user_response}")
-
-            # Store step with ask_user_response
-            traj_step = TrajStep(
-                screenshot_bytes=screenshot_bytes,
-                prediction=response_text,
-                action=action_json,
-                thought=thinking,
-                step_index=step,
-                structured_action={"action_json": action_json},
-                ask_user_response=user_response,
-            )
-            traj_memory.steps.append(traj_step)
-
-            # Continue to next iteration (no device action needed)
-            await asyncio.sleep(0.5)
-            continue
-
-        # Store step in trajectory (for non-terminal, non-ask_user actions)
-        # action_json already has normalized coordinates (0-1) from parse_action
-        traj_step = TrajStep(
-            screenshot_bytes=screenshot_bytes,
-            prediction=response_text,
-            action=action_json,
-            thought=thinking,
-            step_index=step,
-            structured_action={"action_json": action_json},
-        )
-        traj_memory.steps.append(traj_step)
-
-        # Execute action
-        success, result_msg = await execute_action(tools, action_json, w, h, llm)
-        logger.info(f"Execution: {result_msg}")
-
-        # Brief pause between steps
-        await asyncio.sleep(0.5)
-
-    # Max steps reached
-    return {"success": False, "reason": "Max steps reached", "steps": max_steps}
@@ -34,8 +34,9 @@ from droidrun.portal import (
    ping_portal_tcp,
    setup_portal,
 )
-from droidrun.telemetry import print_telemetry_message
+from droidrun.agent.external import list_agents
 from droidrun.agent.utils.llm_picker import load_llm
+from droidrun.telemetry import print_telemetry_message

 # Suppress all warnings
 warnings.filterwarnings("ignore")
@@ -68,6 +69,7 @@ async def run_command(
    command: str,
    config_path: str | None = None,
    device: str | None = None,
+    agent: str | None = None,
    provider: str | None = None,
    model: str | None = None,
    steps: int | None = None,
@@ -140,6 +142,8 @@ async def run_command(
                config.agent.fast_agent.vision = fast_agent_vision

        # Agent overrides
+        if agent is not None:
+            config.agent.name = agent
        if steps is not None:
            config.agent.max_steps = steps
        if reasoning is not None:
@@ -314,10 +318,21 @@ def cli():
    pass


+_available_agents = list_agents()
+
+
@cli.command()
@click.argument("command", type=str)
@click.option("--config", "-c", help="Path to custom config file", default=None)
@click.option("--device", "-d", help="Device serial number or IP address", default=None)
+@click.option(
+    "--agent",
+    "-a",
+    type=click.Choice(_available_agents) if _available_agents else None,
+    help="External agent to use"
+    + (f" [{', '.join(_available_agents)}]" if _available_agents else " (none available)"),
+    default=None,
+)
@click.option(
    "--provider",
    "-p",
@@ -377,6 +392,7 @@ async def run(
    command: str,
    config: str | None,
    device: str | None,
+    agent: str | None,
    provider: str | None,
    model: str | None,
    steps: int | None,
@@ -399,6 +415,7 @@ async def run(
            command=command,
            config_path=config,
            device=device,
+            agent=agent,
            provider=provider,
            model=model,
            steps=steps,
@@ -1,12 +1,10 @@
 # DroidRun Configuration File
 # This file is auto-generated. Edit values as needed.

-_version: 4
+_version: 5

 # === Agent Settings ===
 agent:
-  # Agent to use: "droidrun" (native) or external: "mai_ui", "autoglm"
-  name: droidrun
  # Maximum number of steps per task
  max_steps: 15
  # Enable planning with reasoning mode
@@ -214,48 +212,10 @@ mcp:
    #   enabled: true

 # === External Agent Settings ===
-# External agents are selected via agent.name above.
-# Set agent.name to "mai_ui" or "autoglm" to use external agents.
-# Settings below are merged with agent-specific defaults.
+# Bundled external agents are coming soon. See docs for how to add custom agents.
+# Use: droidrun run "task" --agent <name>
 #
-# Example:
-#   agent:
-#     name: mai_ui
-#     max_steps: 20
-#
-# Optional overrides via external_agent section:
-# external_agent:
-#   llm:
-#     base_url: http://custom:8000/v1
-
-# External agent configurations (reference settings)
-external_agents:
-  # MAI-UI - Alibaba's GUI agent foundation model
-  # https://github.com/Tongyi-MAI/MAI-UI
-  # Requires vLLM server: vllm serve Tongyi-MAI/MAI-UI-8B
-  mai_ui:
-    llm:
-      provider: OpenAILike
-      model: Tongyi-MAI/MAI-UI-8B  # or mai-ui-2b, mai-ui-32b, mai-ui-235b-a22b
-      api_base: https://enjoyed-placed-theaters-survival.trycloudflare.com/v1
-      api_key: EMPTY
-      temperature: 0.0
-      max_tokens: 2048
-      top_p: 1.0
-      top_k: -1
-    history_n: 3  # Number of history steps with images
-
-  # AutoGLM - Open-AutoGLM phone agent
-  # https://github.com/zai-org/Open-AutoGLM/
-  autoglm:
-    llm:
-      provider: OpenAILike
-      model: autoglm-phone-9b
-      api_base: http://localhost:8000/v1
-      api_key: EMPTY
-      temperature: 0.0
-      top_p: 0.85
-      frequency_penalty: 0.2
-      max_tokens: 3000
-    lang: en      # cn or en
-    stream: true
+# external_agents:
+#   my_agent:
+#     api_key: "your-api-key"
+#     model: "model-name"
@@ -6,7 +6,7 @@ import pkgutil
 from pathlib import Path


-CURRENT_VERSION = 4
+CURRENT_VERSION = 5


 def get_migrations() -> List:
@@ -0,0 +1,21 @@
+"""Migration v5: Remove legacy external agent configs (mai_ui, autoglm)."""
+
+from typing import Any, Dict
+
+VERSION = 5
+
+
+def migrate(config: Dict[str, Any]) -> Dict[str, Any]:
+    """Remove legacy mai_ui and autoglm settings from ``config``.
+
+    Mutates ``config`` in place and returns it; other agents are left intact.
+    """
+    external_agents = config.get("external_agents")
+    if isinstance(external_agents, dict):
+        external_agents.pop("mai_ui", None)
+        external_agents.pop("autoglm", None)
+    # "agent" may be absent or None (e.g. an empty section); only a dict has a name.
+    agent = config.get("agent")
+    if isinstance(agent, dict) and agent.get("name") in ("mai_ui", "autoglm"):
+        agent.pop("name", None)
+    return config