mirror of
https://github.com/droidrun/droidrun.git
synced 2026-05-23 07:40:37 +00:00
feat: add smoke test CI for PR regression testing
This commit is contained in:
@@ -0,0 +1,93 @@
|
||||
# Smoke-test workflow: runs the droidrun smoke suite on pull requests that
# touch the package, uploads the produced artifacts, and keeps a single
# up-to-date results comment on the PR.
name: Smoke Tests

on:
  pull_request:
    paths:
      - "droidrun/**"

# One active run per pull request: a newer push cancels the in-flight run.
# Keyed on the PR number (falling back to the branch name) so that same-named
# branches coming from different forks cannot cancel each other.
concurrency:
  group: smoke-${{ github.event.pull_request.number || github.head_ref }}
  cancel-in-progress: true

jobs:
  smoke:
    runs-on: ubuntu-latest
    timeout-minutes: 20
    permissions:
      contents: read
      pull-requests: write

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - uses: astral-sh/setup-uv@v4

      - name: Install dependencies
        run: uv sync --all-groups

      - name: Run smoke tests
        env:
          MOBILERUN_API_KEY: ${{ secrets.MOBILERUN_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
          LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }}
          LANGFUSE_HOST: ${{ secrets.LANGFUSE_HOST }}
        run: uv run python -m tests.smoke.run --output-dir=artifacts

      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: smoke-test-results
          path: artifacts/

      - name: Comment on PR
        # Pull requests from forks receive a read-only token, so the comment
        # API calls would fail there; only comment on same-repository PRs.
        if: always() && github.event.pull_request.head.repo.full_name == github.repository
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = 'artifacts/summary.md';
            const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;

            let body;
            if (fs.existsSync(path)) {
              body = fs.readFileSync(path, 'utf8');
              // Add artifact download link
              body += `\n\n---\n[Download all artifacts](${runUrl}#artifacts)`;
            } else {
              // The runner never got as far as writing a summary.
              body = '## Smoke Tests\n\n**Failed to generate results.** Check the [workflow run](' +
                runUrl +
                ') for details.';
            }

            // Find existing smoke test comment to update. Paginate so the
            // marker comment is still found on PRs with many comments;
            // otherwise a duplicate comment would be created on every run.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              per_page: 100,
            });

            const marker = '<!-- smoke-test-results -->';
            body = marker + '\n' + body;
            const existing = comments.find(c => c.body?.includes(marker));

            if (existing) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existing.id,
                body,
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body,
              });
            }
|
||||
@@ -0,0 +1,126 @@
|
||||
"""Run DroidAgent from source against a cloud device."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from uuid import uuid4
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from droidrun import DroidAgent, DroidrunConfig, load_llm
|
||||
from droidrun.agent.common.events import ScreenshotEvent, ToolExecutionEvent
|
||||
from droidrun.agent.droid.events import ResultEvent
|
||||
from droidrun.tools.driver.cloud import CloudDriver
|
||||
|
||||
from tests.smoke.config import SmokeTestConfig
|
||||
|
||||
logger = logging.getLogger("smoke")
|
||||
|
||||
LLM_MODEL = "gemini-3.1-flash-lite-preview"
|
||||
|
||||
|
||||
class AndroidVersion(BaseModel):
    """Schema handed to the agent as ``output_model`` to extract the Android version."""

    # Kept as a string; the description below is what the schema exposes.
    android_version: str = Field(
        description="The Android version number (e.g. '14', '15')",
    )
|
||||
|
||||
|
||||
@dataclass
class RunResult:
    """Container for everything collected during one smoke-test agent run."""

    # Value obtained by awaiting the run handler; stays None if the run raised.
    result: ResultEvent | None = None
    # Payload of every ScreenshotEvent seen on the event stream, in arrival order.
    screenshots: list[bytes] = field(default_factory=list)
    # Every ToolExecutionEvent seen on the event stream, in arrival order.
    tool_events: list[ToolExecutionEvent] = field(default_factory=list)
    # Text of the exception that aborted the run, if any.
    error: str | None = None
    # Session id generated for Langfuse tracing; None when tracing is disabled.
    langfuse_session_id: str | None = None
|
||||
|
||||
|
||||
async def run_agent(
    test_config: SmokeTestConfig,
    device_id: str,
    api_key: str,
    base_url: str,
    trajectory_dir: str | None = None,
    langfuse_host: str | None = None,
) -> RunResult:
    """Run a single smoke test agent and collect results.

    Args:
        test_config: Matrix entry selecting reasoning/vision mode, step budget,
            task text, and whether credentials or a structured output are used.
        device_id: Device id handed to the ``CloudDriver``.
        api_key: API key handed to the ``CloudDriver``.
        base_url: API base URL handed to the ``CloudDriver``.
        trajectory_dir: Directory for saved trajectories (with GIFs).
            Trajectory saving is switched off when this is falsy.
        langfuse_host: Langfuse host override; falls back to the
            ``LANGFUSE_HOST`` env var and then to the US cloud endpoint.

    Returns:
        A ``RunResult`` with the collected screenshots and tool events. Any
        ``Exception`` raised while setting up or running the agent is logged
        with its traceback and stored in ``RunResult.error`` as a non-empty
        string instead of propagating.
    """
    run_result = RunResult()

    # Ensure screenshots are emitted even for non-vision runs.
    # NOTE: this mutates the process environment and is not restored.
    os.environ["DROIDRUN_STREAM_SCREENSHOTS"] = "1"

    try:
        driver = CloudDriver(
            device_id=device_id,
            api_key=api_key,
            base_url=base_url,
        )

        config = DroidrunConfig()
        config.agent.reasoning = test_config.reasoning
        config.agent.max_steps = test_config.max_steps
        config.agent.streaming = False
        # The same vision switch is applied to every agent role.
        config.agent.fast_agent.vision = test_config.vision
        config.agent.manager.vision = test_config.vision
        config.agent.executor.vision = test_config.vision
        config.telemetry.enabled = False

        # Trajectory writer
        if trajectory_dir:
            config.logging.save_trajectory = "all"
            config.logging.trajectory_path = trajectory_dir
            config.logging.trajectory_gifs = True
        else:
            config.logging.save_trajectory = "none"

        # Langfuse tracing: enabled only when both keys are present.
        langfuse_secret = os.environ.get("LANGFUSE_SECRET_KEY", "")
        langfuse_public = os.environ.get("LANGFUSE_PUBLIC_KEY", "")
        if langfuse_secret and langfuse_public:
            session_id = str(uuid4())
            run_result.langfuse_session_id = session_id
            config.tracing.enabled = True
            config.tracing.provider = "langfuse"
            config.tracing.langfuse_secret_key = langfuse_secret
            config.tracing.langfuse_public_key = langfuse_public
            config.tracing.langfuse_host = langfuse_host or os.environ.get(
                "LANGFUSE_HOST", "https://us.cloud.langfuse.com"
            )
            config.tracing.langfuse_session_id = session_id
            config.tracing.langfuse_user_id = "smoke-test"
        else:
            config.tracing.enabled = False

        llm = load_llm("GoogleGenAI", model=LLM_MODEL)

        # Dummy credential used by the type_secret scenario; not a real secret.
        credentials = None
        if test_config.credentials:
            credentials = {"test-account": "smoketest123"}

        output_model = None
        if test_config.output_schema:
            output_model = AndroidVersion

        agent = DroidAgent(
            goal=test_config.task,
            config=config,
            llms=llm,
            driver=driver,
            credentials=credentials,
            output_model=output_model,
            timeout=300,
        )

        # Drain the event stream first, then await the handler for the result.
        handler = agent.run()
        async for event in handler.stream_events():
            if isinstance(event, ScreenshotEvent):
                run_result.screenshots.append(event.screenshot)
            elif isinstance(event, ToolExecutionEvent):
                run_result.tool_events.append(event)

        run_result.result = await handler

    except Exception as e:
        # logger.exception keeps the traceback, which the plain message loses.
        logger.exception("Agent run failed: %s", e)
        # Some exceptions (e.g. a bare TimeoutError) stringify to "". Fall back
        # to the class name so callers testing `if run_result.error:` still
        # see the failure.
        run_result.error = str(e) or type(e).__name__

    return run_result
|
||||
@@ -0,0 +1,84 @@
|
||||
"""Smoke test assertions."""
|
||||
|
||||
import re
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger("smoke")
|
||||
|
||||
|
||||
class AssertionError(Exception):
    """Raised by the smoke-test checks in this module when an expectation fails.

    Note: inside this module the name shadows the builtin ``AssertionError``;
    it derives from ``Exception``, not from the builtin.
    """
|
||||
|
||||
|
||||
def assert_result_success(result) -> None:
    """Assert the agent reported success.

    Args:
        result: Object exposing ``success`` and ``reason`` attributes, or
            ``None`` when the run produced no result.

    Raises:
        AssertionError: If there is no result or ``result.success`` is falsy.
    """
    # A missing result is a failed run; report it explicitly rather than
    # letting it surface as an AttributeError on None.
    if result is None:
        raise AssertionError("Agent returned no result")

    if not result.success:
        raise AssertionError(f"Agent reported failure: {result.reason}")
|
||||
|
||||
|
||||
def assert_structured_output(result) -> None:
    """Assert structured output contains a valid Android version.

    Args:
        result: Object exposing ``structured_output``, or ``None`` when the
            run produced no result.

    Raises:
        AssertionError: If no structured output is available, it has no
            ``android_version``, or that value does not start with a digit.
    """
    # getattr with a default also covers result being None, which would
    # otherwise surface as an AttributeError.
    output = getattr(result, "structured_output", None)
    if output is None:
        raise AssertionError("No structured output returned")

    version = getattr(output, "android_version", None)
    if version is None:
        raise AssertionError(
            f"structured_output missing 'android_version' field: {output}"
        )

    # Only the leading characters are checked, so "14", "14.0" and "14 (U)" pass.
    if not re.match(r"^\d+", str(version)):
        raise AssertionError(
            f"android_version doesn't look like a version: '{version}'"
        )
|
||||
|
||||
|
||||
def assert_type_secret_called(tool_events: list) -> None:
    """Require at least one successful ``type_secret`` tool execution.

    Raises:
        AssertionError: If no event has ``tool_name == "type_secret"`` together
            with a truthy ``success``; the message lists every tool name seen.
    """
    succeeded = any(
        ev.tool_name == "type_secret" and ev.success for ev in tool_events
    )
    if succeeded:
        return

    called = [ev.tool_name for ev in tool_events]
    raise AssertionError(
        f"type_secret not found or failed in tool events. Tools called: {called}"
    )
|
||||
|
||||
|
||||
def assert_package_name(ui_state, expected_substring: str) -> None:
    """Assert the device's current package name contains the expected substring.

    The comparison is case-insensitive.

    Args:
        ui_state: Object exposing ``phone_state.package_name``, or ``None``
            when the UI state could not be fetched.
        expected_substring: Text that must occur in the package name.

    Raises:
        AssertionError: If the UI state is unavailable or the package name
            does not contain ``expected_substring``.
    """
    # The runner passes ui_state=None when fetching the UI state failed;
    # report that clearly rather than as an AttributeError on None.
    phone_state = getattr(ui_state, "phone_state", None)
    if phone_state is None:
        raise AssertionError(
            f"UI state unavailable; cannot check for package containing "
            f"'{expected_substring}'"
        )

    pkg = phone_state.package_name or ""
    if expected_substring.lower() not in pkg.lower():
        raise AssertionError(
            f"Expected package containing '{expected_substring}', got '{pkg}'"
        )
|
||||
|
||||
|
||||
# Registry of assertion name -> callable. Each callable takes the assertion
# context dict and pulls out only the entries its check needs, so a context
# key is required only by the assertions that actually read it.
ASSERTION_MAP = dict(
    result_success=lambda ctx: assert_result_success(ctx["result"]),
    structured_output=lambda ctx: assert_structured_output(ctx["result"]),
    type_secret_called=lambda ctx: assert_type_secret_called(ctx["tool_events"]),
    package_name=lambda ctx: assert_package_name(
        ctx["ui_state"], ctx["expected_package"]
    ),
)
|
||||
|
||||
|
||||
def run_assertions(assertion_names: list[str], context: dict) -> list[str]:
    """Run named assertions and return list of failure messages.

    Args:
        assertion_names: Keys of ``ASSERTION_MAP`` to execute, in order.
        context: Assertion context dict passed to every check.

    Returns:
        One message per failed or unknown assertion; empty when all passed.
        This function does not raise for failing checks: any ``Exception``
        from a check is recorded as a failure.
    """
    failures = []
    for name in assertion_names:
        fn = ASSERTION_MAP.get(name)
        if fn is None:
            failures.append(f"Unknown assertion: {name}")
            continue
        try:
            fn(context)
        except Exception as e:
            # The module's AssertionError derives from Exception, so one clause
            # covers both failed expectations and unexpected errors in a check.
            logger.error(" FAIL: %s — %s", name, e)
            failures.append(f"{name}: {e}")
        else:
            logger.info(" PASS: %s", name)
    return failures
|
||||
@@ -0,0 +1,58 @@
|
||||
"""Smoke test matrix configuration."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class SmokeTestConfig:
    """One entry of the smoke-test matrix: agent settings, task and checks."""

    # --- required ---
    name: str  # identifier; also used for artifact file/directory names
    reasoning: bool  # reasoning mode when True, fast mode otherwise
    vision: bool  # vision switch applied to the agents
    max_steps: int  # step budget for the agent
    task: str  # natural-language goal handed to the agent

    # --- optional ---
    output_schema: bool = False  # request a structured output from the agent
    credentials: bool = False  # supply the test credential to the agent
    expected_package: str = ""  # substring used by the "package_name" assertion
    assertions: list[str] = field(default_factory=list)  # assertion names to run
|
||||
|
||||
|
||||
# The smoke-test matrix: fast/reasoning mode crossed with vision off/on.
# Entries are run in list order.
SMOKE_TESTS: list[SmokeTestConfig] = [
    # Fast mode, no vision: plain Settings lookup.
    SmokeTestConfig(
        name="fast-no-vision",
        task="Go to Settings and find the Android version number",
        reasoning=False,
        vision=False,
        max_steps=15,
        assertions=["result_success", "package_name"],
        expected_package="settings",
    ),
    # Fast mode with vision: same task, additionally checks structured output.
    SmokeTestConfig(
        name="fast-vision",
        task="Go to Settings and find the Android version number",
        reasoning=False,
        vision=True,
        max_steps=15,
        output_schema=True,
        assertions=["result_success", "structured_output", "package_name"],
        expected_package="settings",
    ),
    # Reasoning mode, no vision: exercises the type_secret tool with credentials.
    SmokeTestConfig(
        name="reasoning-no-vision",
        task="Open Chrome, tap the search bar, and use the type_secret tool to type the saved credential into it",
        reasoning=True,
        vision=False,
        max_steps=30,
        credentials=True,
        assertions=["type_secret_called"],
        expected_package="chrome",
    ),
    # Reasoning mode with vision: Settings lookup again.
    SmokeTestConfig(
        name="reasoning-vision",
        task="Go to Settings and find the Android version number",
        reasoning=True,
        vision=True,
        max_steps=30,
        assertions=["result_success", "package_name"],
        expected_package="settings",
    ),
]
|
||||
@@ -0,0 +1,70 @@
|
||||
"""Device lifecycle management via mobilerun SDK."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
|
||||
from mobilerun import AsyncMobilerun
|
||||
from mobilerun.types.device import Device
|
||||
from mobilerun.types.devices.state_ui_response import StateUiResponse
|
||||
|
||||
logger = logging.getLogger("smoke")
|
||||
|
||||
|
||||
async def provision_device(client: AsyncMobilerun) -> Device:
    """Create a temporary dedicated emulated device and return it.

    The device is returned as delivered by the create call; this function
    does not wait for it to become ready.
    """
    logger.info("Provisioning device...")
    created = await client.devices.create(device_type="dedicated_emulated_device")
    logger.info(f"Device created: {created.id} (state={created.state})")
    return created
|
||||
|
||||
|
||||
async def wait_for_ready(
    client: AsyncMobilerun, device_id: str, timeout: float = 180
) -> Device:
    """Wait for the device to reach 'ready' state, with retries on timeout.

    Polls ``client.devices.wait_ready`` repeatedly until the device reports
    state ``"ready"`` or the overall deadline passes. Errors raised by a single
    poll are logged and retried.

    Args:
        client: Mobilerun client used for polling.
        device_id: Device to wait for.
        timeout: Overall budget in seconds for the whole wait.

    Returns:
        The device object returned by the poll that reported ``"ready"``.

    Raises:
        TimeoutError: If the device is not ready within ``timeout`` seconds;
            chained to the last poll error when one caused the final attempt
            to fail.
    """
    logger.info(f"Waiting for device {device_id} to be ready (timeout={timeout}s)...")
    deadline = time.monotonic() + timeout

    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(f"Device {device_id} not ready after {timeout}s")

        try:
            # Long-poll with remaining time (capped at 60s per request)
            poll_timeout = min(remaining, 60)
            device = await client.devices.wait_ready(device_id, timeout=poll_timeout)
            if device.state == "ready":
                logger.info(f"Device {device_id} is ready")
                return device
            logger.info(f"Device {device_id} state={device.state}, retrying...")
        except Exception as e:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(f"Device {device_id} not ready after {timeout}s") from e
            # Include the message, not just the class name, so persistent
            # failures (bad key, unknown device) are visible in the log.
            logger.warning(
                f"Wait poll returned ({e.__class__.__name__}: {e}), "
                f"retrying ({remaining:.0f}s left)..."
            )

        # Back off briefly between polls, but never sleep past the deadline.
        await asyncio.sleep(max(0.0, min(2.0, deadline - time.monotonic())))
|
||||
|
||||
|
||||
async def press_home(client: AsyncMobilerun, device_id: str) -> None:
    """Send the Home global action to the device, then pause for two seconds."""
    # NOTE(review): 2 is presumably the code of the HOME global action — confirm
    # against the mobilerun API.
    home_action = 2
    await client.devices.actions.global_(device_id, action=home_action)

    # Fixed pause after the action before the caller continues.
    await asyncio.sleep(2)
|
||||
|
||||
|
||||
async def get_ui_state(
    client: AsyncMobilerun, device_id: str
) -> StateUiResponse:
    """Fetch and return the device's current UI state from the client."""
    ui_state = await client.devices.state.ui(device_id)
    return ui_state
|
||||
|
||||
|
||||
async def terminate_device(client: AsyncMobilerun, device_id: str) -> None:
    """Best-effort device teardown.

    Any ``Exception`` from the terminate call is logged as a warning and
    swallowed, so cleanup cannot mask the outcome of the run itself.
    """
    try:
        await client.devices.terminate(device_id, extra_body={})
    except Exception as exc:
        logger.warning(f"Failed to terminate device {device_id}: {exc}")
        return

    logger.info(f"Device {device_id} terminated")
|
||||
@@ -0,0 +1,52 @@
|
||||
"""GIF generation from screenshot bytes."""
|
||||
|
||||
import io
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
|
||||
logger = logging.getLogger("smoke")
|
||||
|
||||
|
||||
def create_gif(
    screenshots: list[bytes], output_path: Path, duration: int = 1000
) -> Path | None:
    """Create an animated GIF from a list of PNG screenshot bytes.

    Screenshots that cannot be opened are skipped with a warning. The parent
    directory of ``output_path`` is created when missing.

    Args:
        screenshots: Encoded image bytes, one entry per frame, in play order.
        output_path: Destination file for the GIF.
        duration: Per-frame ``duration`` value passed to the image save call.

    Returns:
        The output path on success, None if there were no screenshots or none
        of them could be opened.
    """
    if not screenshots:
        logger.warning("No screenshots to create GIF")
        return None

    images = []
    # Everything after the first successful open runs under try/finally so the
    # opened images are closed even if directory creation or saving fails.
    try:
        for index, raw in enumerate(screenshots):
            try:
                images.append(Image.open(io.BytesIO(raw)))
            except Exception as e:
                # Best-effort: one bad frame must not cost the whole GIF, but
                # say so instead of dropping it silently.
                logger.warning("Skipping unreadable screenshot %d: %s", index, e)

        if not images:
            logger.warning("No readable screenshots to create GIF")
            return None

        output_path.parent.mkdir(parents=True, exist_ok=True)

        first, *rest = images
        first.save(
            output_path,
            format="GIF",
            save_all=True,
            append_images=rest,
            duration=duration,
            loop=0,
        )
    finally:
        for img in images:
            try:
                img.close()
            except Exception:
                # Closing is cleanup only; a failure here must not mask the
                # outcome of the save above.
                pass

    logger.info(f"GIF saved: {output_path} ({len(images)} frames)")
    return output_path
|
||||
@@ -0,0 +1,253 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Smoke test runner for droidrun.
|
||||
|
||||
Provisions a cloud device (or uses an existing one), runs 4 agent configurations
|
||||
sequentially, asserts expected outcomes, generates trajectory GIFs, and writes a summary.
|
||||
|
||||
Usage:
|
||||
# Auto-provision a temporary device:
|
||||
MOBILERUN_API_KEY=xxx python -m tests.smoke.run --output-dir=artifacts
|
||||
|
||||
# Use an existing device (skips provisioning/termination):
|
||||
MOBILERUN_API_KEY=xxx python -m tests.smoke.run --device-id=UUID --output-dir=artifacts
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from mobilerun import AsyncMobilerun
|
||||
|
||||
from tests.smoke.agent_runner import run_agent
|
||||
from tests.smoke.assertions import run_assertions
|
||||
from tests.smoke.config import SMOKE_TESTS
|
||||
from tests.smoke.device import (
|
||||
get_ui_state,
|
||||
press_home,
|
||||
provision_device,
|
||||
terminate_device,
|
||||
wait_for_ready,
|
||||
)
|
||||
from tests.smoke.gif import create_gif
|
||||
|
||||
# Configure root logging once at import: INFO and above, time-of-day stamps.
logging.basicConfig(
    datefmt="%H:%M:%S",
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    level=logging.INFO,
)

# Logger used by this module under the name "smoke".
logger = logging.getLogger("smoke")
|
||||
|
||||
|
||||
def _langfuse_session_url(host: str, session_id: str) -> str:
|
||||
"""Build a Langfuse session URL."""
|
||||
host = host.rstrip("/")
|
||||
return f"{host}/sessions/{session_id}"
|
||||
|
||||
|
||||
def write_summary(output_dir: Path, results: list[dict], langfuse_host: str) -> None:
    """Write summary.md with pass/fail status, GIF links, and trace links.

    Args:
        output_dir: Directory that receives ``summary.md``.
        results: One dict per test. Reads the keys ``name``, ``passed``,
            ``reasoning``, ``vision``, ``elapsed`` and ``has_gif`` directly and
            ``error``, ``failures``, ``reason`` and ``langfuse_session_id``
            optionally.
        langfuse_host: Base URL used to build Langfuse session links.
    """
    lines = ["# Smoke Test Results\n"]

    passed = sum(1 for r in results if r["passed"])
    total = len(results)
    lines.append(f"**{passed}/{total} passed**\n")
    lines.append("| Test | Mode | Vision | Status | Time | Details |")
    lines.append("|------|------|--------|--------|------|---------|")

    for r in results:
        status = "PASS" if r["passed"] else "FAIL"
        mode = "reasoning" if r["reasoning"] else "fast"
        vision = "on" if r["vision"] else "off"
        elapsed = f"{r['elapsed']:.0f}s"
        # Preference order: agent error, then assertion failures, then the
        # agent's own reason. Any of them may be missing or None.
        details = (
            r.get("error")
            or ", ".join(r.get("failures") or [])
            or r.get("reason")
            or ""
        )
        # Flatten to one line and truncate before escaping, so the cut can
        # never land in the middle of a "\|" escape sequence.
        details = str(details).replace("\n", " ")[:100].replace("|", "\\|")
        lines.append(f"| {r['name']} | {mode} | {vision} | {status} | {elapsed} | {details} |")

    # Langfuse traces (section is omitted when no test has a session id)
    has_traces = any(r.get("langfuse_session_id") for r in results)
    if has_traces:
        lines.append("")
        lines.append("## Langfuse Traces\n")
        for r in results:
            sid = r.get("langfuse_session_id")
            if sid:
                url = _langfuse_session_url(langfuse_host, sid)
                lines.append(f"- **{r['name']}**: [{sid[:8]}...]({url})")
            else:
                lines.append(f"- **{r['name']}**: _no trace_")

    # Trajectory files (links are relative to the output directory)
    lines.append("")
    lines.append("## Trajectories\n")
    for r in results:
        traj_dir = f"trajectories/{r['name']}"
        lines.append(f"- **{r['name']}**: [`{traj_dir}/`]({traj_dir}/)")

    # GIFs
    lines.append("")
    lines.append("## Trajectory GIFs\n")
    for r in results:
        gif_name = f"{r['name']}.gif"
        lines.append(f"### {r['name']}\n")
        if r["has_gif"]:
            lines.append(f"![{r['name']}]({gif_name})\n")
        else:
            lines.append("_No screenshots captured._\n")

    summary_path = output_dir / "summary.md"
    # Explicit encoding: details may contain non-ASCII text, and the file must
    # not depend on the platform's default encoding.
    summary_path.write_text("\n".join(lines), encoding="utf-8")
    logger.info(f"Summary written to {summary_path}")
|
||||
|
||||
|
||||
async def _run_single_test(
    client: AsyncMobilerun,
    test_config,
    device_id: str,
    api_key: str,
    base_url: str,
    output_dir: Path,
    langfuse_host: str,
) -> dict:
    """Run one smoke test configuration and return its row for the summary."""
    logger.info(f"\n{'=' * 60}")
    logger.info(f"Running: {test_config.name}")
    logger.info(f" mode={'reasoning' if test_config.reasoning else 'fast'} vision={test_config.vision}")
    logger.info(f" task: {test_config.task}")
    logger.info(f"{'=' * 60}")

    # Reset device to home screen
    await press_home(client, device_id)

    # Trajectory dir per test
    trajectory_dir = str(output_dir / "trajectories" / test_config.name)

    t0 = time.monotonic()
    run_result = await run_agent(
        test_config,
        device_id,
        api_key,
        base_url,
        trajectory_dir=trajectory_dir,
        langfuse_host=langfuse_host,
    )
    elapsed = time.monotonic() - t0

    # Build assertion context
    assertion_ctx = {
        "result": run_result.result,
        "tool_events": run_result.tool_events,
        "expected_package": test_config.expected_package,
    }

    # Get UI state for package name assertion
    if "package_name" in test_config.assertions:
        try:
            assertion_ctx["ui_state"] = await get_ui_state(client, device_id)
        except Exception as e:
            logger.warning(f"Failed to get UI state: {e}")
            assertion_ctx["ui_state"] = None

    # Run assertions (skipped when the agent run itself errored)
    if run_result.error:
        failures = [f"agent_error: {run_result.error}"]
    else:
        failures = run_assertions(test_config.assertions, assertion_ctx)

    passed = len(failures) == 0

    # Generate GIF. A GIF problem must not change the test verdict, so a
    # failure here only costs the GIF.
    try:
        gif_path = create_gif(
            run_result.screenshots,
            output_dir / f"{test_config.name}.gif",
        )
    except Exception as e:
        logger.warning(f"Failed to create GIF for {test_config.name}: {e}")
        gif_path = None

    # Log result
    status = "PASS" if passed else "FAIL"
    logger.info(f"\n Result: {status} ({elapsed:.0f}s, {len(run_result.screenshots)} screenshots)")
    if run_result.langfuse_session_id:
        logger.info(f" Langfuse session: {run_result.langfuse_session_id}")
    for failure in failures:
        logger.error(f" {failure}")

    return {
        "name": test_config.name,
        "passed": passed,
        "elapsed": elapsed,
        "reasoning": test_config.reasoning,
        "vision": test_config.vision,
        "failures": failures,
        "error": run_result.error,
        "reason": run_result.result.reason if run_result.result else "",
        "has_gif": gif_path is not None,
        "langfuse_session_id": run_result.langfuse_session_id,
    }


async def main(output_dir: Path, device_id_arg: str | None) -> int:
    """Run the whole smoke-test matrix and write the summary.

    Args:
        output_dir: Directory for GIFs, trajectories and ``summary.md``;
            created when missing.
        device_id_arg: Existing device to use. When falsy, a temporary device
            is provisioned, awaited, and terminated again at the end.

    Returns:
        Process exit code: 0 when every test passed, 1 otherwise (including
        missing required environment variables).
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    api_key = os.environ.get("MOBILERUN_API_KEY")
    if not api_key:
        logger.error("MOBILERUN_API_KEY env var is required")
        return 1

    # Only checked for presence here; the value itself is not passed on.
    google_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
    if not google_key:
        logger.error("GOOGLE_API_KEY or GEMINI_API_KEY env var is required")
        return 1

    base_url = os.environ.get("MOBILERUN_BASE_URL", "https://api.mobilerun.ai/v1")
    langfuse_host = os.environ.get("LANGFUSE_HOST", "https://us.cloud.langfuse.com")

    client = AsyncMobilerun(api_key=api_key, base_url=base_url)
    device_id = device_id_arg
    provisioned = False

    try:
        if device_id:
            logger.info(f"Using existing device: {device_id}")
        else:
            device = await provision_device(client)
            device_id = device.id
            # Set before waiting so the device is terminated even if it never
            # becomes ready.
            provisioned = True
            await wait_for_ready(client, device_id, timeout=120)

        results = []

        for test_config in SMOKE_TESTS:
            started = time.monotonic()
            try:
                row = await _run_single_test(
                    client,
                    test_config,
                    device_id,
                    api_key,
                    base_url,
                    output_dir,
                    langfuse_host,
                )
            except Exception as e:
                # A harness failure (e.g. the home-button reset) fails this one
                # test only; the remaining tests still run and the summary is
                # still written.
                logger.exception(f"Smoke test {test_config.name} aborted: {e}")
                error = str(e) or type(e).__name__
                row = {
                    "name": test_config.name,
                    "passed": False,
                    "elapsed": time.monotonic() - started,
                    "reasoning": test_config.reasoning,
                    "vision": test_config.vision,
                    "failures": [f"harness_error: {error}"],
                    "error": error,
                    "reason": "",
                    "has_gif": False,
                    "langfuse_session_id": None,
                }
            results.append(row)

        # Summary
        write_summary(output_dir, results, langfuse_host)

        logger.info(f"\n{'=' * 60}")
        logger.info("SUMMARY")
        logger.info(f"{'=' * 60}")
        all_passed = all(r["passed"] for r in results)
        for r in results:
            icon = "PASS" if r["passed"] else "FAIL"
            logger.info(f" [{icon}] {r['name']} ({r['elapsed']:.0f}s)")

        total_passed = sum(1 for r in results if r["passed"])
        logger.info(f"\n {total_passed}/{len(results)} passed")

        return 0 if all_passed else 1

    finally:
        # Only tear down devices this run created; never a caller-supplied one.
        if provisioned and device_id:
            await terminate_device(client, device_id)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse options, run the suite, exit with its status.
    cli = argparse.ArgumentParser(description="Run droidrun smoke tests")
    cli.add_argument(
        "--output-dir",
        default=Path("artifacts"),
        type=Path,
        help="Directory for GIFs and summary (default: artifacts)",
    )
    cli.add_argument(
        "--device-id",
        default=None,
        type=str,
        help="Use an existing device ID instead of provisioning a new one",
    )
    options = cli.parse_args()

    # main() returns the exit status (0 = all tests passed, 1 = otherwise).
    sys.exit(asyncio.run(main(options.output_dir, options.device_id)))
|
||||
Reference in New Issue
Block a user