mirror of
https://github.com/droidrun/droidrun.git
synced 2026-05-23 07:40:37 +00:00
refactor!: redesign external agent system to use raw ADB access
External agents now receive a raw async_adbutils.AdbDevice instead of internal DroidRun tools. This makes them fully self-contained — zero imports from droidrun, just copy-paste a file or folder and it works.

- Remove mai_ui and autoglm agents (depended on internals)
- Pass raw AdbDevice directly, skip Portal/driver/registry setup
- Add --agent CLI flag with dynamic discovery from external/ dir
- Add list_agents() for runtime agent enumeration
- Add config migration v005 to clean up legacy agent entries
- Update config_example.yaml to remove old agent references

BREAKING CHANGE: external agent run() signature changed from (tools, instruction, config, max_steps) to (device, instruction, config, max_steps), where device is an async_adbutils.AdbDevice.
This commit is contained in:
@@ -260,7 +260,10 @@ class DroidAgent(Workflow):
|
||||
self.app_opener_llm = None
|
||||
self.structured_output_llm = None
|
||||
|
||||
if self.config.logging.save_trajectory != "none":
|
||||
if (
|
||||
not self._using_external_agent
|
||||
and self.config.logging.save_trajectory != "none"
|
||||
):
|
||||
self.trajectory = Trajectory(
|
||||
goal=self.shared_state.instruction,
|
||||
base_path=self.config.logging.trajectory_path,
|
||||
@@ -371,6 +374,64 @@ class DroidAgent(Workflow):
|
||||
if self.trajectory_writer:
|
||||
await self.trajectory_writer.start()
|
||||
|
||||
# ── 0. External agent — early exit ────────────────────────────
|
||||
if self._using_external_agent:
|
||||
agent_name = self.config.agent.name
|
||||
|
||||
# Load the agent module
|
||||
agent_module = load_agent(agent_name)
|
||||
if not agent_module:
|
||||
from droidrun.agent.external import list_agents
|
||||
|
||||
available = list_agents()
|
||||
if available:
|
||||
agents_str = ", ".join(available)
|
||||
raise ValueError(
|
||||
f"Failed to load external agent '{agent_name}'.\n"
|
||||
f"Available agents: {agents_str}"
|
||||
)
|
||||
raise ValueError(
|
||||
f"External agent '{agent_name}' not found.\n"
|
||||
"No external agents are currently installed.\n"
|
||||
"Run: droidrun run --help to see available agents."
|
||||
)
|
||||
|
||||
# Resolve config
|
||||
agent_config = self.config.external_agents.get(agent_name)
|
||||
if not agent_config:
|
||||
raise ValueError(
|
||||
f"No configuration found for agent '{agent_name}'.\n\n"
|
||||
"Add to your config.yaml:\n\n"
|
||||
" external_agents:\n"
|
||||
f" {agent_name}:\n"
|
||||
' api_key: "your-api-key"\n'
|
||||
' model: "model-name"\n'
|
||||
" # ... any settings your agent needs"
|
||||
)
|
||||
|
||||
final_config = {**agent_module["config"], **agent_config}
|
||||
|
||||
# Resolve device serial and get raw AdbDevice
|
||||
device_serial = self.resolved_device_config.serial
|
||||
if device_serial is None:
|
||||
devices = await adb.list()
|
||||
if not devices:
|
||||
raise ValueError("No connected Android devices found.")
|
||||
device_serial = devices[0].serial
|
||||
|
||||
adb_device = await adb.device(serial=device_serial)
|
||||
|
||||
logger.info(f"🤖 Using external agent: {agent_name}")
|
||||
|
||||
result = await agent_module["run"](
|
||||
device=adb_device,
|
||||
instruction=self.shared_state.instruction,
|
||||
config=final_config,
|
||||
max_steps=self.config.agent.max_steps,
|
||||
)
|
||||
|
||||
return FinalizeEvent(success=result["success"], reason=result["reason"])
|
||||
|
||||
# ── 1. Create driver ──────────────────────────────────────────
|
||||
if self.config.agent.reasoning:
|
||||
vision_enabled = self.config.agent.manager.vision
|
||||
@@ -530,33 +591,6 @@ class DroidAgent(Workflow):
|
||||
# ── 6. Fetch device date once ─────────────────────────────────
|
||||
self.shared_state.device_date = await driver.get_date()
|
||||
|
||||
# ── 7. External agent mode ────────────────────────────────────
|
||||
if self._using_external_agent:
|
||||
agent_name = self.config.agent.name
|
||||
agent_module = load_agent(agent_name)
|
||||
if not agent_module:
|
||||
raise ValueError(f"Failed to load external agent: {agent_name}")
|
||||
|
||||
agent_config = self.config.external_agents.get(agent_name)
|
||||
if not agent_config:
|
||||
raise ValueError(
|
||||
f"No config found for agent '{agent_name}' in external_agents section"
|
||||
)
|
||||
|
||||
final_config = {**agent_module["config"], **agent_config}
|
||||
|
||||
logger.info(f"🤖 Using external agent: {agent_name}")
|
||||
|
||||
result = await agent_module["run"](
|
||||
driver=self.driver,
|
||||
action_ctx=self.action_ctx,
|
||||
instruction=self.shared_state.instruction,
|
||||
config=final_config,
|
||||
max_steps=self.config.agent.max_steps,
|
||||
)
|
||||
|
||||
return FinalizeEvent(success=result["success"], reason=result["reason"])
|
||||
|
||||
if self.config.logging.save_trajectory != "none":
|
||||
self.trajectory_writer.write(self.trajectory, stage="init")
|
||||
|
||||
|
||||
Vendored
+53
-10
@@ -1,28 +1,71 @@
|
||||
"""External agent loader - dynamic imports."""
|
||||
"""External agent loader — dynamic imports.
|
||||
|
||||
External agents are self-contained modules that receive raw ADB access
|
||||
via ``async_adbutils.AdbDevice``. They bring their own LLM client, prompts,
|
||||
parsing, and action loop — zero imports from ``droidrun``.
|
||||
|
||||
An external agent can be either:
|
||||
- A single file: ``droidrun/agent/external/my_agent.py``
|
||||
- A package: ``droidrun/agent/external/my_agent/__init__.py``
|
||||
|
||||
Required contract::
|
||||
|
||||
from async_adbutils import AdbDevice
|
||||
|
||||
async def run(
|
||||
device: AdbDevice, # raw ADB, already connected
|
||||
instruction: str, # the task
|
||||
config: dict, # from external_agents.<name> in config.yaml
|
||||
max_steps: int, # step limit
|
||||
) -> dict: # {"success": bool, "reason": str, "steps": int}
|
||||
|
||||
Optional: ``DEFAULT_CONFIG: dict`` — merged under the user's config.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
from typing import Any, Callable, Dict, Optional, TypedDict
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional, TypedDict
|
||||
|
||||
logger = logging.getLogger("droidrun")
|
||||
|
||||
_EXTERNAL_DIR = Path(__file__).parent
|
||||
|
||||
|
||||
class ExternalAgentModule(TypedDict):
    """Type for a loaded external agent module.

    This is the shape ``load_agent`` is annotated to return (or ``None``).
    """

    # The agent's async entry point; see the module docstring for the
    # required ``run(device, instruction, config, max_steps)`` contract.
    run: Callable
    # The agent's default settings; the module docstring describes the
    # optional ``DEFAULT_CONFIG`` as "merged under the user's config".
    config: Dict[str, Any]
|
||||
|
||||
|
||||
def load_agent(name: str) -> Optional[ExternalAgentModule]:
|
||||
"""
|
||||
Dynamically load an external agent by name.
|
||||
|
||||
Args:
|
||||
name: Agent module name (e.g., "mai_ui", "autoglm")
|
||||
def list_agents() -> List[str]:
    """Enumerate the external agents found next to this module.

    An entry of the ``external/`` directory counts as an agent when it is
    either a ``.py`` file or a directory that contains ``__init__.py``.
    Entries whose name starts with ``_`` or ``.`` are ignored.

    Returns:
        Sorted list of agent names (module stems or package directory names).
    """
    visible_entries = (
        entry
        for entry in _EXTERNAL_DIR.iterdir()
        if not entry.name.startswith(("_", "."))
    )

    names: list[str] = []
    for entry in visible_entries:
        is_module = entry.is_file() and entry.suffix == ".py"
        if is_module:
            # Single-file agent: the name is the file name without ".py".
            names.append(entry.stem)
            continue
        is_package = entry.is_dir() and (entry / "__init__.py").exists()
        if is_package:
            # Package agent: the name is the directory name.
            names.append(entry.name)

    names.sort()
    return names
|
||||
|
||||
|
||||
def load_agent(name: str) -> Optional[ExternalAgentModule]:
|
||||
"""Dynamically load an external agent by name.
|
||||
|
||||
Args:
|
||||
name: Agent module name (e.g., ``"my_agent"``).
|
||||
|
||||
Returns:
|
||||
Dict with ``run`` function and ``config`` defaults, or *None* on failure.
|
||||
"""
|
||||
try:
|
||||
module = importlib.import_module(f"droidrun.agent.external.{name}")
|
||||
|
||||
Vendored
-1119
File diff suppressed because it is too large
Load Diff
Vendored
-819
@@ -1,819 +0,0 @@
|
||||
"""MAI-UI External Agent - Exact implementation matching MAI-UI prompts and behavior.
|
||||
|
||||
This agent replicates MAI-UI's exact prompts, message building, and trajectory
|
||||
management while using DroidRun's AdbTools for execution.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from io import BytesIO
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from PIL import Image
|
||||
from jinja2 import Template
|
||||
|
||||
from droidrun.agent.oneflows.app_starter_workflow import AppStarter
|
||||
from droidrun.agent.utils.chat_utils import to_chat_messages
|
||||
from droidrun.agent.utils.inference import acall_with_retries
|
||||
from droidrun.agent.utils.llm_picker import load_llm
|
||||
|
||||
logger = logging.getLogger("droidrun")
|
||||
|
||||
# =============================================================================
|
||||
# Constants
|
||||
# =============================================================================
|
||||
|
||||
# Upper bound of the coordinate grid used in model output: parse_action()
# divides coordinates by this value to normalise them, and mem2response()
# multiplies by it when rebuilding history.
SCALE_FACTOR = 999
|
||||
|
||||
# =============================================================================
|
||||
# Default Configuration (agent-specific only, NOT LLM)
|
||||
# =============================================================================
|
||||
|
||||
# Defaults for agent-specific settings only. LLM settings are deliberately not
# defaulted here: run() raises ValueError when the config has no "llm" section.
DEFAULT_CONFIG: Dict[str, Any] = {
    # Agent-specific settings matching MAI-UI defaults
    "history_n": 3,  # Number of history steps with images
    # Note: vision is always True for MAI-UI (screenshot-based agent)
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Trajectory Memory (matches MAI-UI's unified_memory.py)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@dataclass
class TrajStep:
    """One recorded step of an agent trajectory.

    Each step keeps the screenshot the model saw, what the model answered,
    and the action that was parsed from that answer.
    """

    # Screenshot as PNG bytes.
    screenshot_bytes: bytes
    # Raw LLM response text.
    prediction: str
    # Parsed action dictionary.
    action: Dict[str, Any]
    # Extracted thinking/reasoning.
    thought: str
    # Index of this step.
    step_index: int
    # {"action_json": action}, used to reconstruct the history.
    structured_action: Dict[str, Any]
    # Response from the user when the ask_user action was used.
    ask_user_response: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class TrajMemory:
    """Holds a complete trajectory: the goal plus every step taken so far."""

    # The instruction/goal for this trajectory.
    task_goal: str
    # Recorded trajectory steps; every instance starts with its own empty list.
    steps: List[TrajStep] = field(default_factory=list)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# System Prompt (exact MAI-UI prompt with MCP template - renders without MCP when no tools passed)
|
||||
# =============================================================================
|
||||
|
||||
# fmt: off
|
||||
# Jinja2 template for the system prompt. Two variables are interpolated:
#   - ``tools``: when truthy, the "## MCP Tools" section is rendered; otherwise
#     that whole section is omitted.
#   - ``apps_list``: inserted into the "Available Apps" note.
# The text is prompt content sent to the model, so it is kept verbatim.
MAI_MOBILE_SYS_PROMPT_TEMPLATE = Template(
    "You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n"
    "\n"
    "## Output Format\n"
    "For each function call, return the thinking process in <thinking> </thinking> tags, and a json object with function name and arguments within <tool_call></tool_call> XML tags:\n"
    "```\n"
    "<thinking>\n"
    "...\n"
    "</thinking>\n"
    "<tool_call>\n"
    "{\"name\": \"mobile_use\", \"arguments\": <args-json-object>}\n"
    "</tool_call>\n"
    "```\n"
    "\n"
    "## Action Space\n"
    "\n"
    "{\"action\": \"click\", \"coordinate\": [x, y]}\n"
    "{\"action\": \"long_press\", \"coordinate\": [x, y]}\n"
    "{\"action\": \"type\", \"text\": \"\"}\n"
    "{\"action\": \"swipe\", \"direction\": \"up or down or left or right\", \"coordinate\": [x, y]} # \"coordinate\" is optional. Use the \"coordinate\" if you want to swipe a specific UI element.\n"
    "{\"action\": \"open\", \"text\": \"app_name\"}\n"
    "{\"action\": \"drag\", \"start_coordinate\": [x1, y1], \"end_coordinate\": [x2, y2]}\n"
    "{\"action\": \"system_button\", \"button\": \"button_name\"} # Options: back, home, menu, enter \n"
    "{\"action\": \"wait\"}\n"
    "{\"action\": \"terminate\", \"status\": \"success or fail\"} \n"
    "{\"action\": \"answer\", \"text\": \"xxx\"} # Use escape characters \\', \\\", and \\n in text part to ensure we can parse the text in normal python string format.\n"
    "{\"action\": \"ask_user\", \"text\": \"xxx\"} # you can ask user for more information to complete the task.\n"
    "{\"action\": \"double_click\", \"coordinate\": [x, y]}\n"
    "\n"
    "{% if tools %}"
    "## MCP Tools\n"
    "You are also provided with MCP tools, you can use them to complete the task.\n"
    "{{ tools }}\n"
    "\n"
    "If you want to use MCP tools, you must output as the following format:\n"
    "```\n"
    "<thinking>\n"
    "...\n"
    "</thinking>\n"
    "<tool_call>\n"
    "{\"name\": <function-name>, \"arguments\": <args-json-object>}\n"
    "</tool_call>\n"
    "```\n"
    "{% endif %}"
    "## Note\n"
    "- Available Apps: `{{ apps_list }}`.\n"
    "- Write a small plan and finally summarize your next action (with its target element) in one sentence in <thinking></thinking> part."
)
|
||||
# fmt: on
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Parsing Functions (matches MAI-UI's parsing)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def parse_tagged_text(text: str) -> Dict[str, Any]:
    """
    Parse text containing <thinking> and <tool_call> tags.

    Handles both the standard format and the thinking-model format, where the
    reasoning is closed with ``</think>`` (with or without an opening
    ``<think>`` tag).

    Args:
        text: Raw model output

    Returns:
        Dictionary with "thinking" and "tool_call" keys. "thinking" is the
        stripped reasoning text or None. "tool_call" is the decoded JSON
        object, or None when the tag is missing, the payload is not valid
        JSON, or the payload is not a JSON object.
    """
    text = text.strip()

    # Handle thinking model output format (uses </think> instead of </thinking>)
    if "</think>" in text and "</thinking>" not in text:
        # Normalise both tags so an explicit opening <think> does not end up
        # inside the extracted thought.
        text = text.replace("<think>", "<thinking>").replace("</think>", "</thinking>")
        # Some models emit only the closing tag; the reasoning then starts at
        # the very beginning of the output.
        if "<thinking>" not in text:
            text = "<thinking>" + text

    result: Dict[str, Any] = {
        "thinking": None,
        "tool_call": None,
    }

    # Extract thinking content
    think_match = re.search(r"<thinking>(.*?)</thinking>", text, re.DOTALL)
    if think_match:
        result["thinking"] = think_match.group(1).strip()

    # Extract tool_call content
    call_match = re.search(r"<tool_call>(.*?)</tool_call>", text, re.DOTALL)
    if call_match:
        try:
            payload = json.loads(call_match.group(1).strip())
        except json.JSONDecodeError:
            payload = None
        # Valid JSON that is not an object (list, string, number) is not a
        # usable tool call; callers read it with dict methods.
        result["tool_call"] = payload if isinstance(payload, dict) else None

    return result
|
||||
|
||||
|
||||
def parse_action(text: str) -> Dict[str, Any]:
    """
    Parse model output into structured action format.

    Normalizes coordinates from SCALE_FACTOR (0-999) to 0-1 range,
    matching MAI-UI's parse_action_to_structure_output behavior.

    Args:
        text: Raw model output

    Returns:
        Dictionary with "thinking" and "action_json" keys

    Raises:
        ValueError: If no tool call is found, if the tool call or its
            "arguments" is not a JSON object, or if a coordinate field is
            not a sequence of 2 or 4 numbers.
    """
    parsed = parse_tagged_text(text)
    tool_call = parsed["tool_call"]

    if not tool_call:
        raise ValueError("No valid tool_call found in response")
    if not isinstance(tool_call, dict):
        raise ValueError("tool_call must be a JSON object")

    action = tool_call.get("arguments", {})
    if not isinstance(action, dict):
        raise ValueError("tool_call 'arguments' must be a JSON object")

    # Normalize coordinates from SCALE_FACTOR range to [0, 1]
    # This matches MAI-UI's parse_action_to_structure_output behavior
    for coord_key in ("coordinate", "start_coordinate", "end_coordinate"):
        if coord_key not in action:
            continue

        coordinates = action[coord_key]
        if not isinstance(coordinates, (list, tuple)):
            raise ValueError(f"Invalid {coord_key} format: expected a list of numbers")

        try:
            if len(coordinates) == 2:
                point_x, point_y = coordinates
            elif len(coordinates) == 4:
                # Handle bounding box format (x1, y1, x2, y2) -> center point
                x1, y1, x2, y2 = coordinates
                point_x = (x1 + x2) / 2
                point_y = (y1 + y2) / 2
            else:
                raise ValueError(
                    f"Invalid {coord_key} format: expected 2 or 4 values, got {len(coordinates)}"
                )
            normalized = [point_x / SCALE_FACTOR, point_y / SCALE_FACTOR]
        except TypeError as err:
            # Non-numeric entries (e.g. strings or null) surface as the
            # documented ValueError instead of a TypeError.
            raise ValueError(f"Invalid {coord_key} format: values must be numbers") from err

        action[coord_key] = normalized

    return {
        "thinking": parsed["thinking"],
        "action_json": action,
    }
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def pil_to_base64(image: Image.Image) -> str:
    """Encode a PIL image as PNG and return the base64 text of those bytes."""
    png_stream = BytesIO()
    image.save(png_stream, format="PNG")
    png_bytes = png_stream.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode("utf-8")
|
||||
|
||||
|
||||
def bytes_to_base64(image_bytes: bytes) -> str:
    """Re-encode raw image bytes as an RGB PNG and return it as base64 text.

    The image is decoded with PIL, converted to RGB when it is in any other
    mode, saved as PNG, and the PNG bytes are base64-encoded.
    """
    decoded = Image.open(BytesIO(image_bytes))
    rgb_image = decoded if decoded.mode == "RGB" else decoded.convert("RGB")

    png_stream = BytesIO()
    rgb_image.save(png_stream, format="PNG")
    png_bytes = png_stream.getvalue()
    return base64.b64encode(png_bytes).decode("utf-8")
|
||||
|
||||
|
||||
async def resolve_app_name(tools, app_name: str) -> str:
    """
    Resolve friendly app name to package name.

    Matching is case-insensitive and ignores surrounding whitespace. An exact
    label match wins over a partial (substring) match. Apps without a label
    or without a package are skipped rather than aborting the lookup.

    Args:
        tools: DroidRun Tools instance
        app_name: Friendly app name (e.g., "Settings", "Chrome")

    Returns:
        Package name (e.g., "com.android.settings"), or ``app_name`` unchanged
        when nothing matches, the query is empty, or the app list cannot be
        fetched.
    """
    wanted = app_name.strip().lower()
    if not wanted:
        # An empty query is a substring of every label and would otherwise
        # resolve to whichever app happens to be listed first.
        return app_name

    try:
        apps = await tools.get_apps(include_system=True)

        # (lower-cased label, package) pairs; a label may be missing or None.
        candidates = [
            ((app.get("label") or "").lower(), app["package"])
            for app in apps
            if app.get("package")
        ]

        # Try exact label match (case-insensitive)
        for label, package in candidates:
            if label == wanted:
                return package

        # Try partial match
        for label, package in candidates:
            if wanted in label:
                return package

        # Return as-is (might already be a package name)
        return app_name

    except Exception as e:
        # Best effort: resolution failures must not abort the action.
        logger.warning(f"Failed to resolve app name '{app_name}': {e}")
        return app_name
|
||||
|
||||
|
||||
async def get_available_apps(tools) -> str:
    """
    Get list of available apps for the prompt.

    Only the first 30 non-system apps reported by ``tools.get_apps`` are
    considered. Each app is represented by its label, falling back to its
    package name when the label is missing or empty; apps with neither are
    left out.

    Args:
        tools: DroidRun Tools instance

    Returns:
        JSON array (as a string) of app names. If the app list cannot be
        fetched, a generic hard-coded list is returned instead.
    """
    try:
        apps = await tools.get_apps(include_system=False)
        app_names = []
        for app in apps[:30]:
            # `or` (not a .get default) so a label that is present but None
            # or "" still falls back to the package name.
            name = app.get("label") or app.get("package") or ""
            if name:
                app_names.append(name)
        return json.dumps(app_names)
    except Exception as e:
        # Best effort: the prompt still needs some app list, so log and
        # fall back to a generic one instead of failing the run.
        logger.warning(f"Failed to list available apps, using generic list: {e}")
        return '["Settings", "Chrome", "Camera", "Files", "Contacts", "Messages", "Phone", "Calendar", "Clock", "Calculator"]'
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Message Building (matches MAI-UI's _build_messages)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def mem2response(step: TrajStep) -> str:
    """
    Rebuild the assistant message for a past trajectory step.

    The stored action is serialised back into the tagged format the LLM
    produces, so it can be replayed as conversation history.

    Args:
        step: Trajectory step

    Returns:
        Formatted response string with <thinking> and <tool_call> tags
    """
    thought_text = step.thought or ""
    stored = step.structured_action

    if not stored:
        # Nothing recorded for this step: emit an empty JSON object.
        call_text = "{}"
    else:
        arguments = copy.deepcopy(stored.get("action_json", {}))

        # Convert normalized coordinates back to SCALE_FACTOR range for history
        # NOTE: Original MAI-UI only converts "coordinate", NOT start_coordinate/end_coordinate
        # This matches the behavior in mai_naivigation_agent.py mem2response()
        if "coordinate" in arguments and len(arguments["coordinate"]) == 2:
            point = arguments["coordinate"]
            # Coordinates are stored normalized (0-1), convert to 0-999.
            # NOTE(review): int() truncates, so a value that was normalised by
            # dividing by SCALE_FACTOR may come back one lower after float
            # rounding — kept as-is to mirror upstream; confirm if it matters.
            arguments["coordinate"] = [
                int(point[0] * SCALE_FACTOR),
                int(point[1] * SCALE_FACTOR),
            ]

        call_text = json.dumps(
            {"name": "mobile_use", "arguments": arguments},
            separators=(",", ":"),
        )

    return f"<thinking>\n{thought_text}\n</thinking>\n<tool_call>\n{call_text}\n</tool_call>"
|
||||
|
||||
|
||||
def build_messages(
    instruction: str,
    system_prompt: str,
    traj_memory: TrajMemory,
    current_screenshot_bytes: bytes,
    history_n: int = 3,
) -> List[Dict[str, Any]]:
    """
    Build multi-turn messages matching MAI-UI's format.

    Message structure:
    1. System prompt
    2. User instruction
    3. For each history step:
       - Image (only for last history_n-1 steps)
       - Assistant response
       - The user's reply, when the step recorded an ask_user response
    4. Current screenshot

    Args:
        instruction: Task instruction
        system_prompt: System prompt text
        traj_memory: Trajectory memory with history
        current_screenshot_bytes: Current screenshot as bytes
        history_n: Number of images to include, counting the current
            screenshot (so history_n - 1 history steps carry an image)

    Returns:
        List of message dictionaries
    """

    def _text_message(role: str, text: str) -> Dict[str, Any]:
        # Single text part wrapped in the multi-part content format.
        return {"role": role, "content": [{"type": "text", "text": text}]}

    def _image_message(image_bytes: bytes) -> Dict[str, Any]:
        # Screenshots are always sent by the user role as a PNG data URL.
        encoded = bytes_to_base64(image_bytes)
        return {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{encoded}"},
                }
            ],
        }

    messages = [
        _text_message("system", system_prompt),
        _text_message("user", instruction),
    ]

    steps = traj_memory.steps
    # Only the last (history_n - 1) steps carry their screenshot.
    first_step_with_image = max(0, len(steps) - (history_n - 1))

    for history_idx, step in enumerate(steps):
        if history_idx >= first_step_with_image:
            # Add image before assistant response
            messages.append(_image_message(step.screenshot_bytes))

        # Add assistant response
        messages.append(_text_message("assistant", mem2response(step)))

        # Add ask_user_response if present (matches MAI-UI behavior)
        if step.ask_user_response:
            messages.append(_text_message("user", step.ask_user_response))

    # Add current screenshot
    messages.append(_image_message(current_screenshot_bytes))

    return messages
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Action Execution
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _to_pixel(fraction: float, extent: int) -> int:
    """Scale a normalised 0-1 value to a pixel index clamped to [0, extent - 1]."""
    return max(0, min(extent - 1, int(fraction * extent)))


async def execute_action(
    tools,
    action: Dict[str, Any],
    screen_width: int,
    screen_height: int,
    llm=None,
) -> Tuple[bool, str]:
    """
    Execute a MAI-UI action using DroidRun tools.

    Pixel positions are clamped to the screen, so a normalised coordinate of
    1.0 maps to the last pixel rather than one past it. Unknown swipe
    directions and unknown system buttons are reported as failures instead of
    being executed as a no-op swipe or a BACK press.

    Args:
        tools: DroidRun Tools instance
        action: Parsed action dictionary with normalized coordinates (0-1 range)
        screen_width: Screen width in pixels
        screen_height: Screen height in pixels
        llm: LLM instance for intelligent app opening (AppStarter workflow)

    Returns:
        Tuple of (success, result_message). Exceptions raised while executing
        are logged and reported as ``(False, "error: ...")``.
    """
    action_type = action.get("action", "")
    w, h = screen_width, screen_height

    try:
        if action_type == "click":
            # Coordinates are normalized (0-1), convert to pixels
            coord = action.get("coordinate", [0, 0])
            x = _to_pixel(coord[0], w)
            y = _to_pixel(coord[1], h)
            result = await tools.tap_by_coordinates(x, y)
            return True, f"click ({x},{y}): {result}"

        elif action_type == "long_press":
            coord = action.get("coordinate", [0, 0])
            x = _to_pixel(coord[0], w)
            y = _to_pixel(coord[1], h)
            # A swipe that does not move, held for 1000 (duration argument),
            # is used to produce the long press.
            await tools.swipe(x, y, x, y, 1000)
            return True, f"long_press ({x},{y})"

        elif action_type == "double_click":
            coord = action.get("coordinate", [0, 0])
            x = _to_pixel(coord[0], w)
            y = _to_pixel(coord[1], h)
            await tools.tap_by_coordinates(x, y)
            await asyncio.sleep(0.1)
            await tools.tap_by_coordinates(x, y)
            return True, f"double_click ({x},{y})"

        elif action_type == "type":
            text = action.get("text", "")
            result = await tools.input_text(text)
            # Long text is shortened in the result message only.
            return True, (
                f"type '{text[:30]}...': {result}"
                if len(text) > 30
                else f"type '{text}': {result}"
            )

        elif action_type == "swipe":
            direction = action.get("direction", "up")

            # Direction offsets (proportional to screen size)
            offsets = {
                "up": (0, -h // 3),
                "down": (0, h // 3),
                "left": (-w // 3, 0),
                "right": (w // 3, 0),
            }
            if direction not in offsets:
                # Previously this fell through to a zero-length swipe that
                # was reported as a success.
                return False, f"unknown swipe direction: {direction}"
            dx, dy = offsets[direction]

            # Default to center if no coordinate provided (0.5, 0.5 normalized)
            coord = action.get("coordinate", [0.5, 0.5])

            # Start position (normalized to pixels)
            sx = _to_pixel(coord[0], w)
            sy = _to_pixel(coord[1], h)

            # Clamp end coordinates to screen bounds
            ex = max(0, min(w - 1, sx + dx))
            ey = max(0, min(h - 1, sy + dy))

            await tools.swipe(sx, sy, ex, ey, 300)
            return True, f"swipe {direction} from ({sx},{sy}) to ({ex},{ey})"

        elif action_type == "drag":
            start_coord = action.get("start_coordinate", [0, 0])
            end_coord = action.get("end_coordinate", [0, 0])

            sx = _to_pixel(start_coord[0], w)
            sy = _to_pixel(start_coord[1], h)
            ex = _to_pixel(end_coord[0], w)
            ey = _to_pixel(end_coord[1], h)

            # Use longer duration for drag semantics
            await tools.swipe(sx, sy, ex, ey, 2000)
            return True, f"drag from ({sx},{sy}) to ({ex},{ey})"

        elif action_type == "open":
            app_name = action.get("text", "")
            if llm is not None:
                # Use intelligent LLM-based app matching via AppStarter
                workflow = AppStarter(
                    tools=tools,
                    llm=llm,
                    timeout=60,
                    verbose=False,
                )
                result = await workflow.run(app_description=app_name)
                await asyncio.sleep(1)
                return True, f"open '{app_name}': {result}"
            else:
                # Fallback to simple name matching
                package = await resolve_app_name(tools, app_name)
                result = await tools.start_app(package)
                return True, f"open '{app_name}' ({package}): {result}"

        elif action_type == "system_button":
            button = action.get("button", "back")
            keycodes = {
                "back": 4,
                "home": 3,
                "enter": 66,
                "menu": 82,
            }
            if button not in keycodes:
                # Do not guess: pressing BACK for an unrecognised button
                # would navigate away without the model having asked for it.
                return False, f"unknown system button: {button}"
            result = await tools.press_key(keycodes[button])
            return True, f"{button}: {result}"

        elif action_type == "wait":
            await asyncio.sleep(1.0)
            return True, "wait 1s"

        elif action_type == "terminate":
            # Handled in main loop
            return True, "terminate"

        elif action_type == "answer":
            # Handled in main loop
            return True, f"answer: {action.get('text', '')}"

        elif action_type == "ask_user":
            # Handled in main loop - returns special marker
            return True, "ask_user"

        else:
            return False, f"unknown action: {action_type}"

    except Exception as e:
        logger.error(f"Action execution failed: {e}")
        return False, f"error: {e}"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Run Function
|
||||
# =============================================================================
|
||||
|
||||
|
||||
async def run(
    tools,
    instruction: str,
    config: Dict[str, Any],
    max_steps: int = 15,
) -> Dict[str, Any]:
    """
    Run the MAI-UI agent loop.

    Each iteration captures a screenshot, asks the LLM for the next action,
    records the step in the trajectory memory and executes the action. The
    loop ends when the model emits a ``terminate`` or ``answer`` action, or
    when ``max_steps`` iterations have been used. Iterations that fail
    (screenshot, LLM call or response parsing) are skipped but still count
    toward ``max_steps``.

    Args:
        tools: DroidRun Tools instance (AdbTools)
        instruction: Task to complete
        config: Configuration dictionary:
            llm: Required dict passed to load_llm(). Must contain
                ``provider`` and ``model``; every other key (temperature,
                base_url, api_key, max_tokens, top_p, top_k, ...) is
                forwarded as a keyword argument.
            history_n: Number of history images
                (default: DEFAULT_CONFIG["history_n"])
            Note: a ``vision`` key is not consulted here; a screenshot is
            taken on every step.
        max_steps: Maximum iterations

    Returns:
        Dictionary with: success, reason, steps, answer (if answer action)

    Raises:
        ValueError: If ``config["llm"]`` is missing, empty, not a dict, or
            lacks ``provider`` / ``model``.
    """
    # Validate LLM config - must be provided by user
    llm_cfg = config.get("llm")
    if not llm_cfg or not isinstance(llm_cfg, dict):
        raise ValueError(
            "MAI-UI requires 'llm' configuration. "
            "Please configure external_agents.mai_ui.llm in your config.yaml"
        )

    if "provider" not in llm_cfg:
        raise ValueError(
            "MAI-UI requires 'llm.provider' to be specified. "
            "Example: provider: OpenAI"
        )

    if "model" not in llm_cfg:
        raise ValueError(
            "MAI-UI requires 'llm.model' to be specified. "
            "Example: model: mai-ui-8b"
        )

    # Load LLM - work on a copy so the caller's config is not mutated by pop()
    llm_cfg = dict(llm_cfg)
    provider = llm_cfg.pop("provider")
    llm = load_llm(provider, **llm_cfg)

    # Agent-specific configuration (defaults from DEFAULT_CONFIG)
    history_n = config.get("history_n", DEFAULT_CONFIG["history_n"])

    # Initialize trajectory memory
    traj_memory = TrajMemory(task_goal=instruction)

    # Get available apps for prompt and render system prompt
    # No MCP tools passed - the MCP section will not appear in the prompt
    apps_list = await get_available_apps(tools)
    system_prompt = MAI_MOBILE_SYS_PROMPT_TEMPLATE.render(
        apps_list=apps_list,
        tools=None,  # No MCP tools - section won't render
    )

    logger.info(f"MAI-UI agent starting: {instruction}")

    for step in range(max_steps):
        logger.info(f"Step {step + 1}/{max_steps}")

        # Get screen dimensions
        try:
            await tools.get_state()
            w, h = tools.screen_width, tools.screen_height
        except Exception as e:
            logger.error(f"Failed to get state: {e}")
            w, h = 1080, 2400  # Fallback dimensions

        # Take screenshot (MAI-UI is vision-based, always requires screenshots)
        try:
            _, screenshot_bytes = await tools.take_screenshot()
        except Exception as e:
            logger.error(f"Failed to take screenshot: {e}")
            continue

        if not screenshot_bytes:
            logger.error("No screenshot available")
            continue

        # Build messages
        messages = build_messages(
            instruction=instruction,
            system_prompt=system_prompt,
            traj_memory=traj_memory,
            current_screenshot_bytes=screenshot_bytes,
            history_n=history_n,
        )

        # Call LLM
        try:
            response = await acall_with_retries(llm, to_chat_messages(messages))
            response_text = str(response)
        except Exception as e:
            logger.error(f"LLM call failed: {e}")
            continue

        # Parse response
        try:
            parsed = parse_action(response_text)
            thinking = parsed["thinking"] or ""
            action_json = parsed["action_json"]
        except ValueError as e:
            logger.error(f"Failed to parse response: {e}")
            logger.debug(f"Raw response: {response_text[:500]}")
            continue

        logger.info(f"Thinking: {thinking[:150]}...")
        logger.info(f"Action: {action_json.get('action', 'unknown')}")

        action_type = action_json.get("action", "")

        # Fields shared by every recorded step, whatever the action type.
        # action_json already has normalized coordinates (0-1) from parse_action
        step_fields = {
            "screenshot_bytes": screenshot_bytes,
            "prediction": response_text,
            "action": action_json,
            "thought": thinking,
            "step_index": step,
            "structured_action": {"action_json": action_json},
        }

        if action_type == "ask_user":
            question = action_json.get("text", "Please provide input:")
            logger.info(f"🤖 Agent asks: {question}")
            # input() blocks; run it in the default executor so the event
            # loop (and any other tasks on it) keeps running while we wait.
            loop = asyncio.get_running_loop()
            raw_reply = await loop.run_in_executor(None, input, "Your response: ")
            user_response = raw_reply.strip()
            logger.info(f"User response: {user_response}")

            # Store step together with the user's reply
            traj_memory.steps.append(
                TrajStep(**step_fields, ask_user_response=user_response)
            )

            # Continue to next iteration (no device action needed)
            await asyncio.sleep(0.5)
            continue

        # Every other action is recorded before it is acted upon
        traj_memory.steps.append(TrajStep(**step_fields))

        if action_type == "terminate":
            success = action_json.get("status") == "success"
            reason = action_json.get("message", "Task terminated")
            logger.info(f"Terminated: success={success}, reason={reason}")
            return {"success": success, "reason": reason, "steps": step + 1}

        if action_type == "answer":
            answer_text = action_json.get("text", "")
            logger.info(f"Answer: {answer_text}")
            return {
                "success": True,
                "reason": "Task completed with answer",
                "steps": step + 1,
                "answer": answer_text,
            }

        # Execute action; a failed action is only logged, the loop goes on
        _ok, result_msg = await execute_action(tools, action_json, w, h, llm)
        logger.info(f"Execution: {result_msg}")

        # Brief pause between steps
        await asyncio.sleep(0.5)

    # Max steps reached
    return {"success": False, "reason": "Max steps reached", "steps": max_steps}
|
||||
+18
-1
@@ -34,8 +34,9 @@ from droidrun.portal import (
|
||||
ping_portal_tcp,
|
||||
setup_portal,
|
||||
)
|
||||
from droidrun.telemetry import print_telemetry_message
|
||||
from droidrun.agent.external import list_agents
|
||||
from droidrun.agent.utils.llm_picker import load_llm
|
||||
from droidrun.telemetry import print_telemetry_message
|
||||
|
||||
# Suppress all warnings
|
||||
warnings.filterwarnings("ignore")
|
||||
@@ -68,6 +69,7 @@ async def run_command(
|
||||
command: str,
|
||||
config_path: str | None = None,
|
||||
device: str | None = None,
|
||||
agent: str | None = None,
|
||||
provider: str | None = None,
|
||||
model: str | None = None,
|
||||
steps: int | None = None,
|
||||
@@ -140,6 +142,8 @@ async def run_command(
|
||||
config.agent.fast_agent.vision = fast_agent_vision
|
||||
|
||||
# Agent overrides
|
||||
if agent is not None:
|
||||
config.agent.name = agent
|
||||
if steps is not None:
|
||||
config.agent.max_steps = steps
|
||||
if reasoning is not None:
|
||||
@@ -314,10 +318,21 @@ def cli():
|
||||
pass
|
||||
|
||||
|
||||
_available_agents = list_agents()
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument("command", type=str)
|
||||
@click.option("--config", "-c", help="Path to custom config file", default=None)
|
||||
@click.option("--device", "-d", help="Device serial number or IP address", default=None)
|
||||
@click.option(
|
||||
"--agent",
|
||||
"-a",
|
||||
type=click.Choice(_available_agents) if _available_agents else None,
|
||||
help="External agent to use"
|
||||
+ (f" [{', '.join(_available_agents)}]" if _available_agents else " (none available)"),
|
||||
default=None,
|
||||
)
|
||||
@click.option(
|
||||
"--provider",
|
||||
"-p",
|
||||
@@ -377,6 +392,7 @@ async def run(
|
||||
command: str,
|
||||
config: str | None,
|
||||
device: str | None,
|
||||
agent: str | None,
|
||||
provider: str | None,
|
||||
model: str | None,
|
||||
steps: int | None,
|
||||
@@ -399,6 +415,7 @@ async def run(
|
||||
command=command,
|
||||
config_path=config,
|
||||
device=device,
|
||||
agent=agent,
|
||||
provider=provider,
|
||||
model=model,
|
||||
steps=steps,
|
||||
|
||||
@@ -1,12 +1,10 @@
|
||||
# DroidRun Configuration File
|
||||
# This file is auto-generated. Edit values as needed.
|
||||
|
||||
_version: 4
|
||||
_version: 5
|
||||
|
||||
# === Agent Settings ===
|
||||
agent:
|
||||
# Agent to use: "droidrun" (native) or external: "mai_ui", "autoglm"
|
||||
name: droidrun
|
||||
# Maximum number of steps per task
|
||||
max_steps: 15
|
||||
# Enable planning with reasoning mode
|
||||
@@ -214,48 +212,10 @@ mcp:
|
||||
# enabled: true
|
||||
|
||||
# === External Agent Settings ===
|
||||
# External agents are selected via agent.name above.
|
||||
# Set agent.name to "mai_ui" or "autoglm" to use external agents.
|
||||
# Settings below are merged with agent-specific defaults.
|
||||
# External agents coming soon. See docs for how to add custom agents.
|
||||
# Use: droidrun run "task" --agent <name>
|
||||
#
|
||||
# Example:
|
||||
# agent:
|
||||
# name: mai_ui
|
||||
# max_steps: 20
|
||||
#
|
||||
# Optional overrides via external_agent section:
|
||||
# external_agent:
|
||||
# llm:
|
||||
# base_url: http://custom:8000/v1
|
||||
|
||||
# External agent configurations (reference settings)
|
||||
external_agents:
|
||||
# MAI-UI - Alibaba's GUI agent foundation model
|
||||
# https://github.com/Tongyi-MAI/MAI-UI
|
||||
# Requires vLLM server: vllm serve Tongyi-MAI/MAI-UI-8B
|
||||
mai_ui:
|
||||
llm:
|
||||
provider: OpenAILike
|
||||
model: Tongyi-MAI/MAI-UI-8B # or mai-ui-2b, mai-ui-32b, mai-ui-235b-a22b
|
||||
api_base: https://enjoyed-placed-theaters-survival.trycloudflare.com/v1
|
||||
api_key: EMPTY
|
||||
temperature: 0.0
|
||||
max_tokens: 2048
|
||||
top_p: 1.0
|
||||
top_k: -1
|
||||
history_n: 3 # Number of history steps with images
|
||||
|
||||
# AutoGLM - Open-AutoGLM phone agent
|
||||
# https://github.com/zai-org/Open-AutoGLM/
|
||||
autoglm:
|
||||
llm:
|
||||
provider: OpenAILike
|
||||
model: autoglm-phone-9b
|
||||
api_base: http://localhost:8000/v1
|
||||
api_key: EMPTY
|
||||
temperature: 0.0
|
||||
top_p: 0.85
|
||||
frequency_penalty: 0.2
|
||||
max_tokens: 3000
|
||||
lang: en # cn or en
|
||||
stream: true
|
||||
# external_agents:
|
||||
# my_agent:
|
||||
# api_key: "your-api-key"
|
||||
# model: "model-name"
|
||||
|
||||
@@ -6,7 +6,7 @@ import pkgutil
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
CURRENT_VERSION = 4
|
||||
CURRENT_VERSION = 5
|
||||
|
||||
|
||||
def get_migrations() -> List:
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
"""Migration v5: Remove legacy external agent configs (mai_ui, autoglm)."""
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
VERSION = 5
|
||||
|
||||
|
||||
def migrate(config: Dict[str, Any]) -> Dict[str, Any]:
    """Remove legacy mai_ui and autoglm entries, reset name if it was one of those.

    The config is modified in place and the same object is returned.

    Args:
        config: Parsed configuration mapping. The ``external_agents`` and
            ``agent`` sections are only touched when they are dicts, so a
            missing, null or otherwise malformed section is left as-is
            instead of aborting the migration.

    Returns:
        The same ``config`` object, with the legacy entries removed.
    """
    # A tuple (not a set) so the membership test below never has to hash a
    # user-supplied value.
    legacy_agents = ("mai_ui", "autoglm")

    # Remove only the known legacy agent configs, leave user-added ones intact
    external_agents = config.get("external_agents")
    if isinstance(external_agents, dict):
        for legacy_name in legacy_agents:
            external_agents.pop(legacy_name, None)

    # Reset agent.name only if it was a removed agent. "agent:" may be null
    # or a non-mapping in a hand-edited file, so check the type first.
    agent = config.get("agent")
    if isinstance(agent, dict) and agent.get("name") in legacy_agents:
        agent.pop("name", None)

    return config
|
||||
Reference in New Issue
Block a user