refactor!: redesign external agent system to use raw ADB access

External agents now receive a raw async_adbutils.AdbDevice instead of
internal DroidRun tools. This makes them fully self-contained — zero
imports from droidrun, just copy-paste a file or folder and it works.

- Remove mai_ui and autoglm agents (depended on internals)
- Pass raw AdbDevice directly, skip Portal/driver/registry setup
- Add --agent CLI flag with dynamic discovery from external/ dir
- Add list_agents() for runtime agent enumeration
- Add config migration v005 to clean up legacy agent entries
- Update config_example.yaml to remove old agent references

BREAKING CHANGE: external agent run() signature changed from
(tools, instruction, config, max_steps) to
(device, instruction, config, max_steps) where device is an
async_adbutils.AdbDevice
This commit is contained in:
johnmalek312
2026-03-25 21:42:28 +11:00
parent e3cc1ec154
commit eb047e7c38
8 changed files with 162 additions and 2025 deletions
+62 -28
View File
@@ -260,7 +260,10 @@ class DroidAgent(Workflow):
self.app_opener_llm = None
self.structured_output_llm = None
if self.config.logging.save_trajectory != "none":
if (
not self._using_external_agent
and self.config.logging.save_trajectory != "none"
):
self.trajectory = Trajectory(
goal=self.shared_state.instruction,
base_path=self.config.logging.trajectory_path,
@@ -371,6 +374,64 @@ class DroidAgent(Workflow):
if self.trajectory_writer:
await self.trajectory_writer.start()
# ── 0. External agent — early exit ────────────────────────────
if self._using_external_agent:
agent_name = self.config.agent.name
# Load the agent module
agent_module = load_agent(agent_name)
if not agent_module:
from droidrun.agent.external import list_agents
available = list_agents()
if available:
agents_str = ", ".join(available)
raise ValueError(
f"Failed to load external agent '{agent_name}'.\n"
f"Available agents: {agents_str}"
)
raise ValueError(
f"External agent '{agent_name}' not found.\n"
"No external agents are currently installed.\n"
"Run: droidrun run --help to see available agents."
)
# Resolve config
agent_config = self.config.external_agents.get(agent_name)
if not agent_config:
raise ValueError(
f"No configuration found for agent '{agent_name}'.\n\n"
"Add to your config.yaml:\n\n"
" external_agents:\n"
f" {agent_name}:\n"
' api_key: "your-api-key"\n'
' model: "model-name"\n'
" # ... any settings your agent needs"
)
final_config = {**agent_module["config"], **agent_config}
# Resolve device serial and get raw AdbDevice
device_serial = self.resolved_device_config.serial
if device_serial is None:
devices = await adb.list()
if not devices:
raise ValueError("No connected Android devices found.")
device_serial = devices[0].serial
adb_device = await adb.device(serial=device_serial)
logger.info(f"🤖 Using external agent: {agent_name}")
result = await agent_module["run"](
device=adb_device,
instruction=self.shared_state.instruction,
config=final_config,
max_steps=self.config.agent.max_steps,
)
return FinalizeEvent(success=result["success"], reason=result["reason"])
# ── 1. Create driver ──────────────────────────────────────────
if self.config.agent.reasoning:
vision_enabled = self.config.agent.manager.vision
@@ -530,33 +591,6 @@ class DroidAgent(Workflow):
# ── 6. Fetch device date once ─────────────────────────────────
self.shared_state.device_date = await driver.get_date()
# ── 7. External agent mode ────────────────────────────────────
if self._using_external_agent:
agent_name = self.config.agent.name
agent_module = load_agent(agent_name)
if not agent_module:
raise ValueError(f"Failed to load external agent: {agent_name}")
agent_config = self.config.external_agents.get(agent_name)
if not agent_config:
raise ValueError(
f"No config found for agent '{agent_name}' in external_agents section"
)
final_config = {**agent_module["config"], **agent_config}
logger.info(f"🤖 Using external agent: {agent_name}")
result = await agent_module["run"](
driver=self.driver,
action_ctx=self.action_ctx,
instruction=self.shared_state.instruction,
config=final_config,
max_steps=self.config.agent.max_steps,
)
return FinalizeEvent(success=result["success"], reason=result["reason"])
if self.config.logging.save_trajectory != "none":
self.trajectory_writer.write(self.trajectory, stage="init")
+53 -10
View File
@@ -1,28 +1,71 @@
"""External agent loader - dynamic imports."""
"""External agent loader dynamic imports.
External agents are self-contained modules that receive raw ADB access
via ``async_adbutils.AdbDevice``. They bring their own LLM client, prompts,
parsing, and action loop — zero imports from ``droidrun``.
An external agent can be either:
- A single file: ``droidrun/agent/external/my_agent.py``
- A package: ``droidrun/agent/external/my_agent/__init__.py``
Required contract::
from async_adbutils import AdbDevice
async def run(
device: AdbDevice, # raw ADB, already connected
instruction: str, # the task
config: dict, # from external_agents.<name> in config.yaml
max_steps: int, # step limit
) -> dict: # {"success": bool, "reason": str, "steps": int}
Optional: ``DEFAULT_CONFIG: dict`` — merged under the user's config.
"""
from __future__ import annotations
import importlib
import logging
from typing import Any, Callable, Dict, Optional, TypedDict
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, TypedDict
logger = logging.getLogger("droidrun")
_EXTERNAL_DIR = Path(__file__).parent
class ExternalAgentModule(TypedDict):
"""Type for loaded external agent module."""
"""Type for a loaded external agent module."""
run: Callable
config: Dict[str, Any]
def load_agent(name: str) -> Optional[ExternalAgentModule]:
"""
Dynamically load an external agent by name.
Args:
name: Agent module name (e.g., "mai_ui", "autoglm")
def list_agents() -> List[str]:
"""Discover available external agents by scanning the external/ directory.
Returns:
Dict with 'run' function and 'config' defaults, or None if failed.
Sorted list of agent names (module stems or package directory names).
"""
agents: list[str] = []
for item in _EXTERNAL_DIR.iterdir():
if item.name.startswith(("_", ".")):
continue
if item.is_file() and item.suffix == ".py":
agents.append(item.stem)
elif item.is_dir() and (item / "__init__.py").exists():
agents.append(item.name)
return sorted(agents)
def load_agent(name: str) -> Optional[ExternalAgentModule]:
"""Dynamically load an external agent by name.
Args:
name: Agent module name (e.g., ``"my_agent"``).
Returns:
Dict with ``run`` function and ``config`` defaults, or *None* on failure.
"""
try:
module = importlib.import_module(f"droidrun.agent.external.{name}")
-1119
View File
File diff suppressed because it is too large Load Diff
-819
View File
@@ -1,819 +0,0 @@
"""MAI-UI External Agent - Exact implementation matching MAI-UI prompts and behavior.
This agent replicates MAI-UI's exact prompts, message building, and trajectory
management while using DroidRun's AdbTools for execution.
"""
import asyncio
import base64
import copy
import json
import logging
import re
from dataclasses import dataclass, field
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple
from PIL import Image
from jinja2 import Template
from droidrun.agent.oneflows.app_starter_workflow import AppStarter
from droidrun.agent.utils.chat_utils import to_chat_messages
from droidrun.agent.utils.inference import acall_with_retries
from droidrun.agent.utils.llm_picker import load_llm
logger = logging.getLogger("droidrun")
# =============================================================================
# Constants
# =============================================================================
SCALE_FACTOR = 999
# =============================================================================
# Default Configuration (agent-specific only, NOT LLM)
# =============================================================================
DEFAULT_CONFIG: Dict[str, Any] = {
# Agent-specific settings matching MAI-UI defaults
"history_n": 3, # Number of history steps with images
# Note: vision is always True for MAI-UI (screenshot-based agent)
}
# =============================================================================
# Trajectory Memory (matches MAI-UI's unified_memory.py)
# =============================================================================
@dataclass
class TrajStep:
"""
Single step in an agent's trajectory.
Attributes:
screenshot_bytes: Screenshot as PNG bytes
prediction: Raw LLM response text
action: Parsed action dictionary
thought: Extracted thinking/reasoning
step_index: Index of this step
structured_action: {"action_json": action} for history reconstruction
ask_user_response: Response from user when ask_user action was used
"""
screenshot_bytes: bytes
prediction: str
action: Dict[str, Any]
thought: str
step_index: int
structured_action: Dict[str, Any]
ask_user_response: Optional[str] = None
@dataclass
class TrajMemory:
"""
Container for complete trajectory.
Attributes:
task_goal: The instruction/goal for this trajectory
steps: List of trajectory steps
"""
task_goal: str
steps: List[TrajStep] = field(default_factory=list)
# =============================================================================
# System Prompt (exact MAI-UI prompt with MCP template - renders without MCP when no tools passed)
# =============================================================================
# fmt: off
MAI_MOBILE_SYS_PROMPT_TEMPLATE = Template(
"You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n"
"\n"
"## Output Format\n"
"For each function call, return the thinking process in <thinking> </thinking> tags, and a json object with function name and arguments within <tool_call></tool_call> XML tags:\n"
"```\n"
"<thinking>\n"
"...\n"
"</thinking>\n"
"<tool_call>\n"
"{\"name\": \"mobile_use\", \"arguments\": <args-json-object>}\n"
"</tool_call>\n"
"```\n"
"\n"
"## Action Space\n"
"\n"
"{\"action\": \"click\", \"coordinate\": [x, y]}\n"
"{\"action\": \"long_press\", \"coordinate\": [x, y]}\n"
"{\"action\": \"type\", \"text\": \"\"}\n"
"{\"action\": \"swipe\", \"direction\": \"up or down or left or right\", \"coordinate\": [x, y]} # \"coordinate\" is optional. Use the \"coordinate\" if you want to swipe a specific UI element.\n"
"{\"action\": \"open\", \"text\": \"app_name\"}\n"
"{\"action\": \"drag\", \"start_coordinate\": [x1, y1], \"end_coordinate\": [x2, y2]}\n"
"{\"action\": \"system_button\", \"button\": \"button_name\"} # Options: back, home, menu, enter \n"
"{\"action\": \"wait\"}\n"
"{\"action\": \"terminate\", \"status\": \"success or fail\"} \n"
"{\"action\": \"answer\", \"text\": \"xxx\"} # Use escape characters \\', \\\", and \\n in text part to ensure we can parse the text in normal python string format.\n"
"{\"action\": \"ask_user\", \"text\": \"xxx\"} # you can ask user for more information to complete the task.\n"
"{\"action\": \"double_click\", \"coordinate\": [x, y]}\n"
"\n"
"{% if tools %}"
"## MCP Tools\n"
"You are also provided with MCP tools, you can use them to complete the task.\n"
"{{ tools }}\n"
"\n"
"If you want to use MCP tools, you must output as the following format:\n"
"```\n"
"<thinking>\n"
"...\n"
"</thinking>\n"
"<tool_call>\n"
"{\"name\": <function-name>, \"arguments\": <args-json-object>}\n"
"</tool_call>\n"
"```\n"
"{% endif %}"
"## Note\n"
"- Available Apps: `{{ apps_list }}`.\n"
"- Write a small plan and finally summarize your next action (with its target element) in one sentence in <thinking></thinking> part."
)
# fmt: on
# =============================================================================
# Parsing Functions (matches MAI-UI's parsing)
# =============================================================================
def parse_tagged_text(text: str) -> Dict[str, Any]:
"""
Parse text containing <thinking> and <tool_call> tags.
Handles both standard format and thinking model format (</think>).
Args:
text: Raw model output
Returns:
Dictionary with "thinking" and "tool_call" keys
"""
text = text.strip()
# Handle thinking model output format (uses </think> instead of </thinking>)
if "</think>" in text and "</thinking>" not in text:
text = text.replace("</think>", "</thinking>")
text = "<thinking>" + text
result: Dict[str, Any] = {
"thinking": None,
"tool_call": None,
}
# Extract thinking content
think_pattern = r"<thinking>(.*?)</thinking>"
think_match = re.search(think_pattern, text, re.DOTALL)
if think_match:
result["thinking"] = think_match.group(1).strip()
# Extract tool_call content
call_pattern = r"<tool_call>(.*?)</tool_call>"
call_match = re.search(call_pattern, text, re.DOTALL)
if call_match:
try:
result["tool_call"] = json.loads(call_match.group(1).strip())
except json.JSONDecodeError:
result["tool_call"] = None
return result
def parse_action(text: str) -> Dict[str, Any]:
"""
Parse model output into structured action format.
Normalizes coordinates from SCALE_FACTOR (0-999) to 0-1 range,
matching MAI-UI's parse_action_to_structure_output behavior.
Args:
text: Raw model output
Returns:
Dictionary with "thinking" and "action_json" keys
Raises:
ValueError: If parsing fails
"""
parsed = parse_tagged_text(text)
if not parsed["tool_call"]:
raise ValueError("No valid tool_call found in response")
action = parsed["tool_call"].get("arguments", {})
# Normalize coordinates from SCALE_FACTOR range to [0, 1]
# This matches MAI-UI's parse_action_to_structure_output behavior
for coord_key in ["coordinate", "start_coordinate", "end_coordinate"]:
if coord_key in action:
coordinates = action[coord_key]
if len(coordinates) == 2:
point_x, point_y = coordinates
elif len(coordinates) == 4:
# Handle bounding box format (x1, y1, x2, y2) -> center point
x1, y1, x2, y2 = coordinates
point_x = (x1 + x2) / 2
point_y = (y1 + y2) / 2
else:
raise ValueError(
f"Invalid {coord_key} format: expected 2 or 4 values, got {len(coordinates)}"
)
action[coord_key] = [point_x / SCALE_FACTOR, point_y / SCALE_FACTOR]
return {
"thinking": parsed["thinking"],
"action_json": action,
}
# =============================================================================
# Helper Functions
# =============================================================================
def pil_to_base64(image: Image.Image) -> str:
"""Convert PIL Image to base64 string."""
buffer = BytesIO()
image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
def bytes_to_base64(image_bytes: bytes) -> str:
image = Image.open(BytesIO(image_bytes))
if image.mode != "RGB":
image = image.convert("RGB")
buffer = BytesIO()
image.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
async def resolve_app_name(tools, app_name: str) -> str:
"""
Resolve friendly app name to package name.
Args:
tools: DroidRun Tools instance
app_name: Friendly app name (e.g., "Settings", "Chrome")
Returns:
Package name (e.g., "com.android.settings")
"""
try:
apps = await tools.get_apps(include_system=True)
# Try exact label match (case-insensitive)
for app in apps:
if app.get("label", "").lower() == app_name.lower():
return app["package"]
# Try partial match
for app in apps:
if app_name.lower() in app.get("label", "").lower():
return app["package"]
# Return as-is (might already be a package name)
return app_name
except Exception as e:
logger.warning(f"Failed to resolve app name '{app_name}': {e}")
return app_name
async def get_available_apps(tools) -> str:
"""
Get list of available apps for the prompt.
Returns:
Formatted string of app names
"""
try:
apps = await tools.get_apps(include_system=False)
app_names = [app.get("label", app.get("package", "")) for app in apps[:30]]
return json.dumps(app_names)
except Exception:
# Fallback to generic list
return '["Settings", "Chrome", "Camera", "Files", "Contacts", "Messages", "Phone", "Calendar", "Clock", "Calculator"]'
# =============================================================================
# Message Building (matches MAI-UI's _build_messages)
# =============================================================================
def mem2response(step: TrajStep) -> str:
"""
Reconstruct assistant response from trajectory step.
Converts stored action back to the format the LLM expects in history.
Args:
step: Trajectory step
Returns:
Formatted response string with <thinking> and <tool_call> tags
"""
thinking = step.thought or ""
structured_action = step.structured_action
if not structured_action:
return f"<thinking>\n{thinking}\n</thinking>\n<tool_call>\n{{}}\n</tool_call>"
action_json = copy.deepcopy(structured_action.get("action_json", {}))
# Convert normalized coordinates back to SCALE_FACTOR range for history
# NOTE: Original MAI-UI only converts "coordinate", NOT start_coordinate/end_coordinate
# This matches the behavior in mai_naivigation_agent.py mem2response()
if "coordinate" in action_json:
coords = action_json["coordinate"]
if len(coords) == 2:
# Coordinates are stored normalized (0-1), convert to 0-999
action_json["coordinate"] = [
int(coords[0] * SCALE_FACTOR),
int(coords[1] * SCALE_FACTOR),
]
tool_call_dict = {
"name": "mobile_use",
"arguments": action_json,
}
tool_call_json = json.dumps(tool_call_dict, separators=(",", ":"))
return f"<thinking>\n{thinking}\n</thinking>\n<tool_call>\n{tool_call_json}\n</tool_call>"
def build_messages(
instruction: str,
system_prompt: str,
traj_memory: TrajMemory,
current_screenshot_bytes: bytes,
history_n: int = 3,
) -> List[Dict[str, Any]]:
"""
Build multi-turn messages matching MAI-UI's format.
Message structure:
1. System prompt
2. User instruction
3. For each history step:
- Image (only for last history_n-1 steps)
- Assistant response
4. Current screenshot
Args:
instruction: Task instruction
system_prompt: System prompt text
traj_memory: Trajectory memory with history
current_screenshot_bytes: Current screenshot as bytes
history_n: Number of history images to include
Returns:
List of message dictionaries
"""
messages = [
{
"role": "system",
"content": [{"type": "text", "text": system_prompt}],
},
{
"role": "user",
"content": [{"type": "text", "text": instruction}],
},
]
steps = traj_memory.steps
image_idx = 0
if len(steps) > 0:
# Calculate which steps get images (last history_n - 1 steps)
start_image_idx = max(0, len(steps) - (history_n - 1))
# Collect history images
history_images = []
for i, step in enumerate(steps):
if i >= start_image_idx:
history_images.append(step.screenshot_bytes)
for history_idx, step in enumerate(steps):
should_include_image = history_idx >= start_image_idx
if should_include_image and image_idx < len(history_images):
# Add image before assistant response
encoded = bytes_to_base64(history_images[image_idx])
messages.append(
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{encoded}"
},
}
],
}
)
image_idx += 1
# Add assistant response
history_response = mem2response(step)
messages.append(
{
"role": "assistant",
"content": [{"type": "text", "text": history_response}],
}
)
# Add ask_user_response if present (matches MAI-UI behavior)
if step.ask_user_response:
messages.append(
{
"role": "user",
"content": [{"type": "text", "text": step.ask_user_response}],
}
)
# Add current screenshot
current_encoded = bytes_to_base64(current_screenshot_bytes)
messages.append(
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{current_encoded}"},
}
],
}
)
return messages
# =============================================================================
# Action Execution
# =============================================================================
async def execute_action(
tools,
action: Dict[str, Any],
screen_width: int,
screen_height: int,
llm=None,
) -> Tuple[bool, str]:
"""
Execute a MAI-UI action using DroidRun tools.
Args:
tools: DroidRun Tools instance
action: Parsed action dictionary with normalized coordinates (0-1 range)
screen_width: Screen width in pixels
screen_height: Screen height in pixels
llm: LLM instance for intelligent app opening (AppStarter workflow)
Returns:
Tuple of (success, result_message)
"""
action_type = action.get("action", "")
w, h = screen_width, screen_height
try:
if action_type == "click":
# Coordinates are normalized (0-1), convert to pixels
coord = action.get("coordinate", [0, 0])
x = int(coord[0] * w)
y = int(coord[1] * h)
result = await tools.tap_by_coordinates(x, y)
return True, f"click ({x},{y}): {result}"
elif action_type == "long_press":
coord = action.get("coordinate", [0, 0])
x = int(coord[0] * w)
y = int(coord[1] * h)
await tools.swipe(x, y, x, y, 1000)
return True, f"long_press ({x},{y})"
elif action_type == "double_click":
coord = action.get("coordinate", [0, 0])
x = int(coord[0] * w)
y = int(coord[1] * h)
await tools.tap_by_coordinates(x, y)
await asyncio.sleep(0.1)
await tools.tap_by_coordinates(x, y)
return True, f"double_click ({x},{y})"
elif action_type == "type":
text = action.get("text", "")
result = await tools.input_text(text)
return True, (
f"type '{text[:30]}...': {result}"
if len(text) > 30
else f"type '{text}': {result}"
)
elif action_type == "swipe":
direction = action.get("direction", "up")
# Default to center if no coordinate provided (0.5, 0.5 normalized)
coord = action.get("coordinate", [0.5, 0.5])
# Start position (normalized to pixels)
sx = int(coord[0] * w)
sy = int(coord[1] * h)
# Direction offsets (proportional to screen size)
offsets = {
"up": (0, -h // 3),
"down": (0, h // 3),
"left": (-w // 3, 0),
"right": (w // 3, 0),
}
dx, dy = offsets.get(direction, (0, 0))
# Clamp end coordinates to screen bounds
ex = max(0, min(w - 1, sx + dx))
ey = max(0, min(h - 1, sy + dy))
await tools.swipe(sx, sy, ex, ey, 300)
return True, f"swipe {direction} from ({sx},{sy}) to ({ex},{ey})"
elif action_type == "drag":
start_coord = action.get("start_coordinate", [0, 0])
end_coord = action.get("end_coordinate", [0, 0])
sx = int(start_coord[0] * w)
sy = int(start_coord[1] * h)
ex = int(end_coord[0] * w)
ey = int(end_coord[1] * h)
# Use longer duration for drag semantics
await tools.swipe(sx, sy, ex, ey, 2000)
return True, f"drag from ({sx},{sy}) to ({ex},{ey})"
elif action_type == "open":
app_name = action.get("text", "")
if llm is not None:
# Use intelligent LLM-based app matching via AppStarter
workflow = AppStarter(
tools=tools,
llm=llm,
timeout=60,
verbose=False,
)
result = await workflow.run(app_description=app_name)
await asyncio.sleep(1)
return True, f"open '{app_name}': {result}"
else:
# Fallback to simple name matching
package = await resolve_app_name(tools, app_name)
result = await tools.start_app(package)
return True, f"open '{app_name}' ({package}): {result}"
elif action_type == "system_button":
button = action.get("button", "back")
keycodes = {
"back": 4,
"home": 3,
"enter": 66,
"menu": 82,
}
keycode = keycodes.get(button, 4)
result = await tools.press_key(keycode)
return True, f"{button}: {result}"
elif action_type == "wait":
await asyncio.sleep(1.0)
return True, "wait 1s"
elif action_type == "terminate":
# Handled in main loop
return True, "terminate"
elif action_type == "answer":
# Handled in main loop
return True, f"answer: {action.get('text', '')}"
elif action_type == "ask_user":
# Handled in main loop - returns special marker
return True, "ask_user"
else:
return False, f"unknown action: {action_type}"
except Exception as e:
logger.error(f"Action execution failed: {e}")
return False, f"error: {e}"
# =============================================================================
# Main Run Function
# =============================================================================
async def run(
tools,
instruction: str,
config: Dict[str, Any],
max_steps: int = 15,
) -> Dict[str, Any]:
"""
Run MAI-UI agent with exact MAI-UI behavior.
Args:
tools: DroidRun Tools instance (AdbTools)
instruction: Task to complete
config: Configuration dictionary:
llm: Dict passed directly to load_llm() with:
provider: LLM provider (default: "OpenAI")
model, temperature, base_url, api_key, max_tokens, top_p, top_k, etc.
history_n: Number of history images (default: 3)
vision: Whether to use screenshots (default: True)
max_steps: Maximum iterations
Returns:
Dictionary with: success, reason, steps, answer (if answer action)
"""
# Validate LLM config - must be provided by user
llm_cfg = config.get("llm")
if not llm_cfg or not isinstance(llm_cfg, dict):
raise ValueError(
"MAI-UI requires 'llm' configuration. "
"Please configure external_agents.mai_ui.llm in your config.yaml"
)
if "provider" not in llm_cfg:
raise ValueError(
"MAI-UI requires 'llm.provider' to be specified. "
"Example: provider: OpenAI"
)
if "model" not in llm_cfg:
raise ValueError(
"MAI-UI requires 'llm.model' to be specified. " "Example: model: mai-ui-8b"
)
# Load LLM - pass config directly to load_llm
llm_cfg = dict(llm_cfg) # Copy to avoid mutating
provider = llm_cfg.pop("provider")
llm = load_llm(provider, **llm_cfg)
# Agent-specific configuration (defaults from DEFAULT_CONFIG)
history_n = config.get("history_n", DEFAULT_CONFIG["history_n"])
# Initialize trajectory memory
traj_memory = TrajMemory(task_goal=instruction)
# Get available apps for prompt and render system prompt
# No MCP tools passed - the MCP section will not appear in the prompt
apps_list = await get_available_apps(tools)
system_prompt = MAI_MOBILE_SYS_PROMPT_TEMPLATE.render(
apps_list=apps_list,
tools=None, # No MCP tools - section won't render
)
logger.info(f"MAI-UI agent starting: {instruction}")
for step in range(max_steps):
logger.info(f"Step {step + 1}/{max_steps}")
# Get screen dimensions
try:
await tools.get_state()
w, h = tools.screen_width, tools.screen_height
except Exception as e:
logger.error(f"Failed to get state: {e}")
w, h = 1080, 2400 # Fallback dimensions
# Take screenshot (MAI-UI is vision-based, always requires screenshots)
try:
_, screenshot_bytes = await tools.take_screenshot()
except Exception as e:
logger.error(f"Failed to take screenshot: {e}")
continue
if not screenshot_bytes:
logger.error("No screenshot available")
continue
# Build messages
messages = build_messages(
instruction=instruction,
system_prompt=system_prompt,
traj_memory=traj_memory,
current_screenshot_bytes=screenshot_bytes,
history_n=history_n,
)
# Call LLM
try:
response = await acall_with_retries(llm, to_chat_messages(messages))
response_text = str(response)
except Exception as e:
logger.error(f"LLM call failed: {e}")
continue
# Parse response
try:
parsed = parse_action(response_text)
thinking = parsed["thinking"] or ""
action_json = parsed["action_json"]
except ValueError as e:
logger.error(f"Failed to parse response: {e}")
logger.debug(f"Raw response: {response_text[:500]}")
continue
logger.info(f"Thinking: {thinking[:150]}...")
logger.info(f"Action: {action_json.get('action', 'unknown')}")
# Check for terminal actions first (before storing step)
action_type = action_json.get("action", "")
if action_type == "terminate":
# Store step before returning
traj_step = TrajStep(
screenshot_bytes=screenshot_bytes,
prediction=response_text,
action=action_json,
thought=thinking,
step_index=step,
structured_action={"action_json": action_json},
)
traj_memory.steps.append(traj_step)
success = action_json.get("status") == "success"
reason = action_json.get("message", "Task terminated")
logger.info(f"Terminated: success={success}, reason={reason}")
return {"success": success, "reason": reason, "steps": step + 1}
if action_type == "answer":
# Store step before returning
traj_step = TrajStep(
screenshot_bytes=screenshot_bytes,
prediction=response_text,
action=action_json,
thought=thinking,
step_index=step,
structured_action={"action_json": action_json},
)
traj_memory.steps.append(traj_step)
answer_text = action_json.get("text", "")
logger.info(f"Answer: {answer_text}")
return {
"success": True,
"reason": "Task completed with answer",
"steps": step + 1,
"answer": answer_text,
}
if action_type == "ask_user":
# Get user input via stdin
question = action_json.get("text", "Please provide input:")
logger.info(f"🤖 Agent asks: {question}")
user_response = input("Your response: ").strip()
logger.info(f"User response: {user_response}")
# Store step with ask_user_response
traj_step = TrajStep(
screenshot_bytes=screenshot_bytes,
prediction=response_text,
action=action_json,
thought=thinking,
step_index=step,
structured_action={"action_json": action_json},
ask_user_response=user_response,
)
traj_memory.steps.append(traj_step)
# Continue to next iteration (no device action needed)
await asyncio.sleep(0.5)
continue
# Store step in trajectory (for non-terminal, non-ask_user actions)
# action_json already has normalized coordinates (0-1) from parse_action
traj_step = TrajStep(
screenshot_bytes=screenshot_bytes,
prediction=response_text,
action=action_json,
thought=thinking,
step_index=step,
structured_action={"action_json": action_json},
)
traj_memory.steps.append(traj_step)
# Execute action
success, result_msg = await execute_action(tools, action_json, w, h, llm)
logger.info(f"Execution: {result_msg}")
# Brief pause between steps
await asyncio.sleep(0.5)
# Max steps reached
return {"success": False, "reason": "Max steps reached", "steps": max_steps}
+18 -1
View File
@@ -34,8 +34,9 @@ from droidrun.portal import (
ping_portal_tcp,
setup_portal,
)
from droidrun.telemetry import print_telemetry_message
from droidrun.agent.external import list_agents
from droidrun.agent.utils.llm_picker import load_llm
from droidrun.telemetry import print_telemetry_message
# Suppress all warnings
warnings.filterwarnings("ignore")
@@ -68,6 +69,7 @@ async def run_command(
command: str,
config_path: str | None = None,
device: str | None = None,
agent: str | None = None,
provider: str | None = None,
model: str | None = None,
steps: int | None = None,
@@ -140,6 +142,8 @@ async def run_command(
config.agent.fast_agent.vision = fast_agent_vision
# Agent overrides
if agent is not None:
config.agent.name = agent
if steps is not None:
config.agent.max_steps = steps
if reasoning is not None:
@@ -314,10 +318,21 @@ def cli():
pass
_available_agents = list_agents()
@cli.command()
@click.argument("command", type=str)
@click.option("--config", "-c", help="Path to custom config file", default=None)
@click.option("--device", "-d", help="Device serial number or IP address", default=None)
@click.option(
"--agent",
"-a",
type=click.Choice(_available_agents) if _available_agents else None,
help="External agent to use"
+ (f" [{', '.join(_available_agents)}]" if _available_agents else " (none available)"),
default=None,
)
@click.option(
"--provider",
"-p",
@@ -377,6 +392,7 @@ async def run(
command: str,
config: str | None,
device: str | None,
agent: str | None,
provider: str | None,
model: str | None,
steps: int | None,
@@ -399,6 +415,7 @@ async def run(
command=command,
config_path=config,
device=device,
agent=agent,
provider=provider,
model=model,
steps=steps,
+7 -47
View File
@@ -1,12 +1,10 @@
# DroidRun Configuration File
# This file is auto-generated. Edit values as needed.
_version: 4
_version: 5
# === Agent Settings ===
agent:
# Agent to use: "droidrun" (native) or external: "mai_ui", "autoglm"
name: droidrun
# Maximum number of steps per task
max_steps: 15
# Enable planning with reasoning mode
@@ -214,48 +212,10 @@ mcp:
# enabled: true
# === External Agent Settings ===
# External agents are selected via agent.name above.
# Set agent.name to "mai_ui" or "autoglm" to use external agents.
# Settings below are merged with agent-specific defaults.
# External agents coming soon. See docs for how to add custom agents.
# Use: droidrun run "task" --agent <name>
#
# Example:
# agent:
# name: mai_ui
# max_steps: 20
#
# Optional overrides via external_agent section:
# external_agent:
# llm:
# base_url: http://custom:8000/v1
# External agent configurations (reference settings)
external_agents:
# MAI-UI - Alibaba's GUI agent foundation model
# https://github.com/Tongyi-MAI/MAI-UI
# Requires vLLM server: vllm serve Tongyi-MAI/MAI-UI-8B
mai_ui:
llm:
provider: OpenAILike
model: Tongyi-MAI/MAI-UI-8B # or mai-ui-2b, mai-ui-32b, mai-ui-235b-a22b
api_base: https://enjoyed-placed-theaters-survival.trycloudflare.com/v1
api_key: EMPTY
temperature: 0.0
max_tokens: 2048
top_p: 1.0
top_k: -1
history_n: 3 # Number of history steps with images
# AutoGLM - Open-AutoGLM phone agent
# https://github.com/zai-org/Open-AutoGLM/
autoglm:
llm:
provider: OpenAILike
model: autoglm-phone-9b
api_base: http://localhost:8000/v1
api_key: EMPTY
temperature: 0.0
top_p: 0.85
frequency_penalty: 0.2
max_tokens: 3000
lang: en # cn or en
stream: true
# external_agents:
# my_agent:
# api_key: "your-api-key"
# model: "model-name"
@@ -6,7 +6,7 @@ import pkgutil
from pathlib import Path
CURRENT_VERSION = 4
CURRENT_VERSION = 5
def get_migrations() -> List:
@@ -0,0 +1,21 @@
"""Migration v5: Remove legacy external agent configs (mai_ui, autoglm)."""
from typing import Any, Dict
VERSION = 5
def migrate(config: Dict[str, Any]) -> Dict[str, Any]:
"""Remove legacy mai_ui and autoglm entries, reset name if it was one of those."""
# Remove only the known legacy agent configs, leave user-added ones intact
external_agents = config.get("external_agents", {})
if isinstance(external_agents, dict):
external_agents.pop("mai_ui", None)
external_agents.pop("autoglm", None)
# Reset agent.name only if it was a removed agent
agent = config.get("agent", {})
if agent.get("name") in ("mai_ui", "autoglm"):
agent.pop("name", None)
return config