From b25e4be9cccd62b85095670cbf41dae82b3029b3 Mon Sep 17 00:00:00 2001 From: johnmalek312 Date: Thu, 5 Mar 2026 18:33:31 +1100 Subject: [PATCH] feat: add smoke test CI for PR regression testing --- .github/workflows/smoke-test.yml | 93 ++++++++++++ tests/__init__.py | 0 tests/smoke/__init__.py | 0 tests/smoke/agent_runner.py | 126 +++++++++++++++ tests/smoke/assertions.py | 84 ++++++++++ tests/smoke/config.py | 58 +++++++ tests/smoke/device.py | 70 +++++++++ tests/smoke/gif.py | 52 +++++++ tests/smoke/run.py | 253 +++++++++++++++++++++++++++++++ uv.lock | 2 +- 10 files changed, 737 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/smoke-test.yml create mode 100644 tests/__init__.py create mode 100644 tests/smoke/__init__.py create mode 100644 tests/smoke/agent_runner.py create mode 100644 tests/smoke/assertions.py create mode 100644 tests/smoke/config.py create mode 100644 tests/smoke/device.py create mode 100644 tests/smoke/gif.py create mode 100644 tests/smoke/run.py diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml new file mode 100644 index 0000000..f45eb5f --- /dev/null +++ b/.github/workflows/smoke-test.yml @@ -0,0 +1,93 @@ +name: Smoke Tests + +on: + pull_request: + paths: + - "droidrun/**" + +concurrency: + group: smoke-${{ github.head_ref }} + cancel-in-progress: true + +jobs: + smoke: + runs-on: ubuntu-latest + timeout-minutes: 20 + permissions: + contents: read + pull-requests: write + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - uses: astral-sh/setup-uv@v4 + + - name: Install dependencies + run: uv sync --all-groups + + - name: Run smoke tests + env: + MOBILERUN_API_KEY: ${{ secrets.MOBILERUN_API_KEY }} + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }} + LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }} + LANGFUSE_HOST: ${{ secrets.LANGFUSE_HOST }} + run: uv run python -m 
tests.smoke.run --output-dir=artifacts + + - name: Upload artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: smoke-test-results + path: artifacts/ + + - name: Comment on PR + if: always() + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const path = 'artifacts/summary.md'; + + let body; + if (fs.existsSync(path)) { + body = fs.readFileSync(path, 'utf8'); + // Add artifact download link + const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; + body += `\n\n---\n[Download all artifacts](${runUrl}#artifacts)`; + } else { + body = '## Smoke Tests\n\n**Failed to generate results.** Check the [workflow run](' + + `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}` + + ') for details.'; + } + + // Find existing smoke test comment to update (identified by the hidden, non-empty marker below) + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + + const marker = '<!-- smoke-test-results -->'; + body = marker + '\n' + body; + const existing = comments.find(c => c.body?.includes(marker)); + + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body, + }); + } diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/smoke/__init__.py b/tests/smoke/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/smoke/agent_runner.py b/tests/smoke/agent_runner.py new file mode 100644 index 0000000..02eeb4a --- /dev/null +++ b/tests/smoke/agent_runner.py @@ -0,0 +1,126 @@ +"""Run DroidAgent from source against a cloud device.""" + +import logging +import os +from
dataclasses import dataclass, field +from uuid import uuid4 + +from pydantic import BaseModel, Field + +from droidrun import DroidAgent, DroidrunConfig, load_llm +from droidrun.agent.common.events import ScreenshotEvent, ToolExecutionEvent +from droidrun.agent.droid.events import ResultEvent +from droidrun.tools.driver.cloud import CloudDriver + +from tests.smoke.config import SmokeTestConfig + +logger = logging.getLogger("smoke") + +LLM_MODEL = "gemini-3.1-flash-lite-preview" + + +class AndroidVersion(BaseModel): + """Structured output model for extracting Android version.""" + + android_version: str = Field(description="The Android version number (e.g. '14', '15')") + + +@dataclass +class RunResult: + result: ResultEvent | None = None + screenshots: list[bytes] = field(default_factory=list) + tool_events: list[ToolExecutionEvent] = field(default_factory=list) + error: str | None = None + langfuse_session_id: str | None = None + + +async def run_agent( + test_config: SmokeTestConfig, + device_id: str, + api_key: str, + base_url: str, + trajectory_dir: str | None = None, + langfuse_host: str | None = None, +) -> RunResult: + """Run a single smoke test agent and collect results.""" + run_result = RunResult() + + # Ensure screenshots are emitted even for non-vision runs + os.environ["DROIDRUN_STREAM_SCREENSHOTS"] = "1" + + try: + driver = CloudDriver( + device_id=device_id, + api_key=api_key, + base_url=base_url, + ) + + config = DroidrunConfig() + config.agent.reasoning = test_config.reasoning + config.agent.max_steps = test_config.max_steps + config.agent.streaming = False + config.agent.fast_agent.vision = test_config.vision + config.agent.manager.vision = test_config.vision + config.agent.executor.vision = test_config.vision + config.telemetry.enabled = False + + # Trajectory writer + if trajectory_dir: + config.logging.save_trajectory = "all" + config.logging.trajectory_path = trajectory_dir + config.logging.trajectory_gifs = True + else: + 
config.logging.save_trajectory = "none" + + # Langfuse tracing + langfuse_secret = os.environ.get("LANGFUSE_SECRET_KEY", "") + langfuse_public = os.environ.get("LANGFUSE_PUBLIC_KEY", "") + if langfuse_secret and langfuse_public: + session_id = str(uuid4()) + run_result.langfuse_session_id = session_id + config.tracing.enabled = True + config.tracing.provider = "langfuse" + config.tracing.langfuse_secret_key = langfuse_secret + config.tracing.langfuse_public_key = langfuse_public + config.tracing.langfuse_host = langfuse_host or os.environ.get( + "LANGFUSE_HOST", "https://us.cloud.langfuse.com" + ) + config.tracing.langfuse_session_id = session_id + config.tracing.langfuse_user_id = "smoke-test" + else: + config.tracing.enabled = False + + llm = load_llm("GoogleGenAI", model=LLM_MODEL) + + credentials = None + if test_config.credentials: + credentials = {"test-account": "smoketest123"} + + output_model = None + if test_config.output_schema: + output_model = AndroidVersion + + agent = DroidAgent( + goal=test_config.task, + config=config, + llms=llm, + driver=driver, + credentials=credentials, + output_model=output_model, + timeout=300, + ) + + handler = agent.run() + async for event in handler.stream_events(): + if isinstance(event, ScreenshotEvent): + run_result.screenshots.append(event.screenshot) + elif isinstance(event, ToolExecutionEvent): + run_result.tool_events.append(event) + + run_result.result = await handler + + except Exception as e: + logger.error(f"Agent run failed: {e}") + run_result.error = str(e) + + return run_result diff --git a/tests/smoke/assertions.py b/tests/smoke/assertions.py new file mode 100644 index 0000000..ddf5ea6 --- /dev/null +++ b/tests/smoke/assertions.py @@ -0,0 +1,84 @@ +"""Smoke test assertions.""" + +import re +import logging + +logger = logging.getLogger("smoke") + + +class AssertionError(Exception): + pass + + +def assert_result_success(result) -> None: + """Assert the agent reported success.""" + if not result.success: + 
raise AssertionError( + f"Agent reported failure: {result.reason}" + ) + + +def assert_structured_output(result) -> None: + """Assert structured output contains a valid Android version.""" + output = result.structured_output + if output is None: + raise AssertionError("No structured output returned") + + version = getattr(output, "android_version", None) + if version is None: + raise AssertionError( + f"structured_output missing 'android_version' field: {output}" + ) + + if not re.match(r"^\d+", str(version)): + raise AssertionError( + f"android_version doesn't look like a version: '{version}'" + ) + + +def assert_type_secret_called(tool_events: list) -> None: + """Assert type_secret was called and succeeded.""" + for event in tool_events: + if event.tool_name == "type_secret" and event.success: + return + + names = [e.tool_name for e in tool_events] + raise AssertionError( + f"type_secret not found or failed in tool events. Tools called: {names}" + ) + + +def assert_package_name(ui_state, expected_substring: str) -> None: + """Assert the current package name contains the expected substring; fails clearly when ui_state is None.""" + if ui_state is None: + raise AssertionError("UI state unavailable; cannot check package name") + pkg = ui_state.phone_state.package_name or "" + if expected_substring.lower() not in pkg.lower(): + raise AssertionError(f"Expected package containing '{expected_substring}', got '{pkg}'") + + +ASSERTION_MAP = { + "result_success": lambda ctx: assert_result_success(ctx["result"]), + "structured_output": lambda ctx: assert_structured_output(ctx["result"]), + "type_secret_called": lambda ctx: assert_type_secret_called(ctx["tool_events"]), + "package_name": lambda ctx: assert_package_name( + ctx["ui_state"], ctx["expected_package"] + ), +} + + +def run_assertions(assertion_names: list[str], context: dict) -> list[str]: + """Run named assertions and return list of failure messages.""" + failures = [] + for name in assertion_names: + fn = ASSERTION_MAP.get(name) + if fn is None: + failures.append(f"Unknown assertion: {name}") + continue + try: + fn(context) +
logger.info(f" PASS: {name}") + except (AssertionError, Exception) as e: + logger.error(f" FAIL: {name} — {e}") + failures.append(f"{name}: {e}") + return failures diff --git a/tests/smoke/config.py b/tests/smoke/config.py new file mode 100644 index 0000000..95155c2 --- /dev/null +++ b/tests/smoke/config.py @@ -0,0 +1,58 @@ +"""Smoke test matrix configuration.""" + +from dataclasses import dataclass, field + + +@dataclass +class SmokeTestConfig: + name: str + reasoning: bool + vision: bool + max_steps: int + task: str + output_schema: bool = False + credentials: bool = False + expected_package: str = "" + assertions: list[str] = field(default_factory=list) + + +SMOKE_TESTS: list[SmokeTestConfig] = [ + SmokeTestConfig( + name="fast-no-vision", + reasoning=False, + vision=False, + max_steps=15, + task="Go to Settings and find the Android version number", + expected_package="settings", + assertions=["result_success", "package_name"], + ), + SmokeTestConfig( + name="fast-vision", + reasoning=False, + vision=True, + max_steps=15, + task="Go to Settings and find the Android version number", + output_schema=True, + expected_package="settings", + assertions=["result_success", "structured_output", "package_name"], + ), + SmokeTestConfig( + name="reasoning-no-vision", + reasoning=True, + vision=False, + max_steps=30, + task="Open Chrome, tap the search bar, and use the type_secret tool to type the saved credential into it", + credentials=True, + expected_package="chrome", + assertions=["type_secret_called"], + ), + SmokeTestConfig( + name="reasoning-vision", + reasoning=True, + vision=True, + max_steps=30, + task="Go to Settings and find the Android version number", + expected_package="settings", + assertions=["result_success", "package_name"], + ), +] diff --git a/tests/smoke/device.py b/tests/smoke/device.py new file mode 100644 index 0000000..31468cb --- /dev/null +++ b/tests/smoke/device.py @@ -0,0 +1,70 @@ +"""Device lifecycle management via mobilerun SDK.""" + +import 
asyncio +import logging +import time + +from mobilerun import AsyncMobilerun +from mobilerun.types.device import Device +from mobilerun.types.devices.state_ui_response import StateUiResponse + +logger = logging.getLogger("smoke") + + +async def provision_device(client: AsyncMobilerun) -> Device: + """Provision a temporary emulated device.""" + logger.info("Provisioning device...") + device = await client.devices.create(device_type="dedicated_emulated_device") + logger.info(f"Device created: {device.id} (state={device.state})") + return device + + +async def wait_for_ready( + client: AsyncMobilerun, device_id: str, timeout: float = 180 +) -> Device: + """Wait for the device to reach 'ready' state, with retries on timeout.""" + logger.info(f"Waiting for device {device_id} to be ready (timeout={timeout}s)...") + deadline = time.monotonic() + timeout + + while True: + remaining = deadline - time.monotonic() + if remaining <= 0: + raise TimeoutError(f"Device {device_id} not ready after {timeout}s") + + try: + # Long-poll with remaining time (capped at 60s per request) + poll_timeout = min(remaining, 60) + device = await client.devices.wait_ready(device_id, timeout=poll_timeout) + if device.state == "ready": + logger.info(f"Device {device_id} is ready") + return device + logger.info(f"Device {device_id} state={device.state}, retrying...") + except Exception as e: + remaining = deadline - time.monotonic() + if remaining <= 0: + raise TimeoutError(f"Device {device_id} not ready after {timeout}s") from e + logger.info(f"Wait poll returned ({e.__class__.__name__}), retrying ({remaining:.0f}s left)...") + + await asyncio.sleep(2) + + +async def press_home(client: AsyncMobilerun, device_id: str) -> None: + """Press the Home button to reset device state.""" + await client.devices.actions.global_(device_id, action=2) + await asyncio.sleep(2) + + +async def get_ui_state( + client: AsyncMobilerun, device_id: str +) -> StateUiResponse: + """Get the current UI state of the 
device.""" + return await client.devices.state.ui(device_id) + + +async def terminate_device(client: AsyncMobilerun, device_id: str) -> None: + """Terminate the device. Best-effort, never raises.""" + try: + await client.devices.terminate(device_id, extra_body={}) + logger.info(f"Device {device_id} terminated") + except Exception as e: + logger.warning(f"Failed to terminate device {device_id}: {e}") diff --git a/tests/smoke/gif.py b/tests/smoke/gif.py new file mode 100644 index 0000000..b6515c0 --- /dev/null +++ b/tests/smoke/gif.py @@ -0,0 +1,52 @@ +"""GIF generation from screenshot bytes.""" + +import io +import logging +from pathlib import Path + +from PIL import Image + +logger = logging.getLogger("smoke") + + +def create_gif( + screenshots: list[bytes], output_path: Path, duration: int = 1000 +) -> Path | None: + """Create an animated GIF from a list of PNG screenshot bytes. + + Returns the output path on success, None if no screenshots. + """ + if not screenshots: + logger.warning("No screenshots to create GIF") + return None + + images = [] + for raw in screenshots: + try: + images.append(Image.open(io.BytesIO(raw))) + except Exception: + continue + + if not images: + return None + + output_path.parent.mkdir(parents=True, exist_ok=True) + + try: + images[0].save( + output_path, + format="GIF", + save_all=True, + append_images=images[1:], + duration=duration, + loop=0, + ) + finally: + for img in images: + try: + img.close() + except Exception: + pass + + logger.info(f"GIF saved: {output_path} ({len(images)} frames)") + return output_path diff --git a/tests/smoke/run.py b/tests/smoke/run.py new file mode 100644 index 0000000..6ee7d9c --- /dev/null +++ b/tests/smoke/run.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python3 +"""Smoke test runner for droidrun. + +Provisions a cloud device (or uses an existing one), runs 4 agent configurations +sequentially, asserts expected outcomes, generates trajectory GIFs, and writes a summary. 
+ +Usage: + # Auto-provision a temporary device: + MOBILERUN_API_KEY=xxx python tests/smoke/run.py --output-dir=artifacts + + # Use an existing device (skips provisioning/termination): + MOBILERUN_API_KEY=xxx python tests/smoke/run.py --device-id=UUID --output-dir=artifacts +""" + +import argparse +import asyncio +import logging +import os +import sys +import time +from pathlib import Path + +from mobilerun import AsyncMobilerun + +from tests.smoke.agent_runner import run_agent +from tests.smoke.assertions import run_assertions +from tests.smoke.config import SMOKE_TESTS +from tests.smoke.device import ( + get_ui_state, + press_home, + provision_device, + terminate_device, + wait_for_ready, +) +from tests.smoke.gif import create_gif + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s [%(name)s] %(message)s", + datefmt="%H:%M:%S", +) +logger = logging.getLogger("smoke") + + +def _langfuse_session_url(host: str, session_id: str) -> str: + """Build a Langfuse session URL.""" + host = host.rstrip("/") + return f"{host}/sessions/{session_id}" + + +def write_summary(output_dir: Path, results: list[dict], langfuse_host: str) -> None: + """Write summary.md with pass/fail status, GIF links, and trace links.""" + lines = ["# Smoke Test Results\n"] + + passed = sum(1 for r in results if r["passed"]) + total = len(results) + lines.append(f"**{passed}/{total} passed**\n") + lines.append("| Test | Mode | Vision | Status | Time | Details |") + lines.append("|------|------|--------|--------|------|---------|") + + for r in results: + status = "PASS" if r["passed"] else "FAIL" + mode = "reasoning" if r["reasoning"] else "fast" + vision = "on" if r["vision"] else "off" + elapsed = f"{r['elapsed']:.0f}s" + details = r.get("error", "") or ", ".join(r.get("failures", [])) + if not details: + details = r.get("reason", "") + details = details.replace("\n", " ").replace("|", "\\|")[:100] + lines.append(f"| {r['name']} | {mode} | {vision} | {status} | 
{elapsed} | {details} |") + + # Langfuse traces + has_traces = any(r.get("langfuse_session_id") for r in results) + if has_traces: + lines.append("") + lines.append("## Langfuse Traces\n") + for r in results: + sid = r.get("langfuse_session_id") + if sid: + url = _langfuse_session_url(langfuse_host, sid) + lines.append(f"- **{r['name']}**: [{sid[:8]}...]({url})") + else: + lines.append(f"- **{r['name']}**: _no trace_") + + # Trajectory files + lines.append("") + lines.append("## Trajectories\n") + for r in results: + traj_dir = f"trajectories/{r['name']}" + lines.append(f"- **{r['name']}**: [`{traj_dir}/`]({traj_dir}/)") + + # GIFs + lines.append("") + lines.append("## Trajectory GIFs\n") + for r in results: + gif_name = f"{r['name']}.gif" + if r["has_gif"]: + lines.append(f"### {r['name']}\n") + lines.append(f"![{r['name']}]({gif_name})\n") + else: + lines.append(f"### {r['name']}\n") + lines.append("_No screenshots captured._\n") + + summary_path = output_dir / "summary.md" + summary_path.write_text("\n".join(lines)) + logger.info(f"Summary written to {summary_path}") + + +async def main(output_dir: Path, device_id_arg: str | None) -> int: + output_dir.mkdir(parents=True, exist_ok=True) + + api_key = os.environ.get("MOBILERUN_API_KEY") + if not api_key: + logger.error("MOBILERUN_API_KEY env var is required") + return 1 + + google_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY") + if not google_key: + logger.error("GOOGLE_API_KEY or GEMINI_API_KEY env var is required") + return 1 + + base_url = os.environ.get("MOBILERUN_BASE_URL", "https://api.mobilerun.ai/v1") + langfuse_host = os.environ.get("LANGFUSE_HOST", "https://us.cloud.langfuse.com") + + client = AsyncMobilerun(api_key=api_key, base_url=base_url) + device_id = device_id_arg + provisioned = False + + try: + if device_id: + logger.info(f"Using existing device: {device_id}") + else: + device = await provision_device(client) + device_id = device.id + provisioned = True + await 
wait_for_ready(client, device_id, timeout=120) + + results = [] + + for test_config in SMOKE_TESTS: + logger.info(f"\n{'=' * 60}") + logger.info(f"Running: {test_config.name}") + logger.info(f" mode={'reasoning' if test_config.reasoning else 'fast'} vision={test_config.vision}") + logger.info(f" task: {test_config.task}") + logger.info(f"{'=' * 60}") + + # Reset device to home screen + await press_home(client, device_id) + + # Trajectory dir per test + trajectory_dir = str(output_dir / "trajectories" / test_config.name) + + t0 = time.monotonic() + run_result = await run_agent( + test_config, + device_id, + api_key, + base_url, + trajectory_dir=trajectory_dir, + langfuse_host=langfuse_host, + ) + elapsed = time.monotonic() - t0 + + # Build assertion context + assertion_ctx = { + "result": run_result.result, + "tool_events": run_result.tool_events, + "expected_package": test_config.expected_package, + } + + # Get UI state for package name assertion + if "package_name" in test_config.assertions: + try: + assertion_ctx["ui_state"] = await get_ui_state(client, device_id) + except Exception as e: + logger.warning(f"Failed to get UI state: {e}") + assertion_ctx["ui_state"] = None + + # Run assertions + if run_result.error: + failures = [f"agent_error: {run_result.error}"] + else: + failures = run_assertions(test_config.assertions, assertion_ctx) + + passed = len(failures) == 0 + + # Generate GIF + gif_path = create_gif( + run_result.screenshots, + output_dir / f"{test_config.name}.gif", + ) + + # Log result + status = "PASS" if passed else "FAIL" + logger.info(f"\n Result: {status} ({elapsed:.0f}s, {len(run_result.screenshots)} screenshots)") + if run_result.langfuse_session_id: + logger.info(f" Langfuse session: {run_result.langfuse_session_id}") + if failures: + for f in failures: + logger.error(f" {f}") + + results.append({ + "name": test_config.name, + "passed": passed, + "elapsed": elapsed, + "reasoning": test_config.reasoning, + "vision": test_config.vision, + 
"failures": failures, + "error": run_result.error, + "reason": run_result.result.reason if run_result.result else "", + "has_gif": gif_path is not None, + "langfuse_session_id": run_result.langfuse_session_id, + }) + + # Summary + write_summary(output_dir, results, langfuse_host) + + logger.info(f"\n{'=' * 60}") + logger.info("SUMMARY") + logger.info(f"{'=' * 60}") + all_passed = all(r["passed"] for r in results) + for r in results: + icon = "PASS" if r["passed"] else "FAIL" + logger.info(f" [{icon}] {r['name']} ({r['elapsed']:.0f}s)") + + total_passed = sum(1 for r in results if r["passed"]) + logger.info(f"\n {total_passed}/{len(results)} passed") + + return 0 if all_passed else 1 + + finally: + if provisioned and device_id: + await terminate_device(client, device_id) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run droidrun smoke tests") + parser.add_argument( + "--output-dir", + type=Path, + default=Path("artifacts"), + help="Directory for GIFs and summary (default: artifacts)", + ) + parser.add_argument( + "--device-id", + type=str, + default=None, + help="Use an existing device ID instead of provisioning a new one", + ) + args = parser.parse_args() + + exit_code = asyncio.run(main(args.output_dir, args.device_id)) + sys.exit(exit_code) diff --git a/uv.lock b/uv.lock index d46a357..b7b5b22 100644 --- a/uv.lock +++ b/uv.lock @@ -760,7 +760,7 @@ wheels = [ [[package]] name = "droidrun" -version = "0.5.0.dev6" +version = "0.5.0" source = { editable = "." } dependencies = [ { name = "aiofiles" },