feat: add smoke test CI for PR regression testing

2026-05-23 07:40:37 +00:00 · 2026-03-05 18:33:31 +11:00
parent 3260801767
commit b25e4be9cc
10 changed files with 737 additions and 1 deletions
@@ -0,0 +1,93 @@
+# Runs the smoke-test runner (tests.smoke.run) on pull requests that touch
+# droidrun/ and uploads whatever it wrote to artifacts/.
+name: Smoke Tests
+
+on:
+  pull_request:
+    paths:
+      - "droidrun/**"
+
+# One run per PR head branch; a newer run cancels the one still in progress.
+concurrency:
+  group: smoke-${{ github.head_ref }}
+  cancel-in-progress: true
+
+jobs:
+  smoke:
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    permissions:
+      contents: read
+      # Write access lets the job create/update the results comment on the PR.
+      pull-requests: write
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+
+      - uses: astral-sh/setup-uv@v4
+
+      - name: Install dependencies
+        run: uv sync --all-groups
+
+      # Credentials come from repository secrets and are exposed to the runner
+      # as environment variables; results are written to artifacts/.
+      - name: Run smoke tests
+        env:
+          MOBILERUN_API_KEY: ${{ secrets.MOBILERUN_API_KEY }}
+          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+          LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
+          LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }}
+          LANGFUSE_HOST: ${{ secrets.LANGFUSE_HOST }}
+        run: uv run python -m tests.smoke.run --output-dir=artifacts
+
+      # always(): upload results even when the smoke-test step failed.
+      - name: Upload artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: smoke-test-results
+          path: artifacts/
+
+      # Post (or refresh) a single PR comment carrying the smoke-test summary.
+      # Runs even when earlier steps failed so the PR always shows a status.
+      - name: Comment on PR
+        if: always()
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const path = 'artifacts/summary.md';
+            const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
+
+            let body;
+            if (fs.existsSync(path)) {
+              body = fs.readFileSync(path, 'utf8');
+              // Add artifact download link
+              body += `\n\n---\n[Download all artifacts](${runUrl}#artifacts)`;
+            } else {
+              // The runner never got as far as writing a summary.
+              body = '## Smoke Tests\n\n**Failed to generate results.** Check the [workflow run](' +
+                runUrl +
+                ') for details.';
+            }
+
+            // Find existing smoke test comment to update. Paginate through
+            // every page of comments: a single listComments call only returns
+            // the first page, so on a busy PR the marker comment would be
+            // missed and a duplicate comment posted on every run.
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              per_page: 100,
+            });
+
+            // Hidden HTML marker identifies the comment owned by this workflow.
+            const marker = '<!-- smoke-test-results -->';
+            body = marker + '\n' + body;
+            const existing = comments.find(c => c.body?.includes(marker));
+
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: existing.id,
+                body,
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body,
+              });
+            }
@@ -0,0 +1,126 @@
+"""Run DroidAgent from source against a cloud device."""
+
+import logging
+import os
+from dataclasses import dataclass, field
+from uuid import uuid4
+
+from pydantic import BaseModel, Field
+
+from droidrun import DroidAgent, DroidrunConfig, load_llm
+from droidrun.agent.common.events import ScreenshotEvent, ToolExecutionEvent
+from droidrun.agent.droid.events import ResultEvent
+from droidrun.tools.driver.cloud import CloudDriver
+
+from tests.smoke.config import SmokeTestConfig
+
+logger = logging.getLogger("smoke")
+
+LLM_MODEL = "gemini-3.1-flash-lite-preview"
+
+
+# Passed to DroidAgent as `output_model` when a test sets `output_schema`.
+# NOTE(review): the docstring and field description below are presumably
+# surfaced to the LLM as part of the schema — treat them as runtime text.
+class AndroidVersion(BaseModel):
+    """Structured output model for extracting Android version."""
+
+    android_version: str = Field(description="The Android version number (e.g. '14', '15')")
+
+
+@dataclass
+class RunResult:
+    # Outcome of one run_agent() call. Every field has a default, so an empty
+    # instance is created up front and filled in as the run progresses.
+
+    # Value awaited from the agent's run handler; stays None if the run raised.
+    result: ResultEvent | None = None
+    # `screenshot` payload of each ScreenshotEvent seen on the event stream.
+    screenshots: list[bytes] = field(default_factory=list)
+    # Every ToolExecutionEvent seen on the event stream, in arrival order.
+    tool_events: list[ToolExecutionEvent] = field(default_factory=list)
+    # Message describing the exception that aborted the run; None otherwise.
+    error: str | None = None
+    # UUID string set when Langfuse tracing was enabled for the run, else None.
+    langfuse_session_id: str | None = None
+
+
+async def run_agent(
+    test_config: SmokeTestConfig,
+    device_id: str,
+    api_key: str,
+    base_url: str,
+    trajectory_dir: str | None = None,
+    langfuse_host: str | None = None,
+) -> RunResult:
+    """Run a single smoke test agent and collect results."""
+    run_result = RunResult()
+
+    # Ensure screenshots are emitted even for non-vision runs
+    os.environ["DROIDRUN_STREAM_SCREENSHOTS"] = "1"
+
+    try:
+        driver = CloudDriver(
+            device_id=device_id,
+            api_key=api_key,
+            base_url=base_url,
+        )
+
+        config = DroidrunConfig()
+        config.agent.reasoning = test_config.reasoning
+        config.agent.max_steps = test_config.max_steps
+        config.agent.streaming = False
+        config.agent.fast_agent.vision = test_config.vision
+        config.agent.manager.vision = test_config.vision
+        config.agent.executor.vision = test_config.vision
+        config.telemetry.enabled = False
+
+        # Trajectory writer
+        if trajectory_dir:
+            config.logging.save_trajectory = "all"
+            config.logging.trajectory_path = trajectory_dir
+            config.logging.trajectory_gifs = True
+        else:
+            config.logging.save_trajectory = "none"
+
+        # Langfuse tracing
+        langfuse_secret = os.environ.get("LANGFUSE_SECRET_KEY", "")
+        langfuse_public = os.environ.get("LANGFUSE_PUBLIC_KEY", "")
+        if langfuse_secret and langfuse_public:
+            session_id = str(uuid4())
+            run_result.langfuse_session_id = session_id
+            config.tracing.enabled = True
+            config.tracing.provider = "langfuse"
+            config.tracing.langfuse_secret_key = langfuse_secret
+            config.tracing.langfuse_public_key = langfuse_public
+            config.tracing.langfuse_host = langfuse_host or os.environ.get(
+                "LANGFUSE_HOST", "https://us.cloud.langfuse.com"
+            )
+            config.tracing.langfuse_session_id = session_id
+            config.tracing.langfuse_user_id = "smoke-test"
+        else:
+            config.tracing.enabled = False
+
+        llm = load_llm("GoogleGenAI", model=LLM_MODEL)
+
+        credentials = None
+        if test_config.credentials:
+            credentials = {"test-account": "smoketest123"}
+
+        output_model = None
+        if test_config.output_schema:
+            output_model = AndroidVersion
+
+        agent = DroidAgent(
+            goal=test_config.task,
+            config=config,
+            llms=llm,
+            driver=driver,
+            credentials=credentials,
+            output_model=output_model,
+            timeout=300,
+        )
+
+        handler = agent.run()
+        async for event in handler.stream_events():
+            if isinstance(event, ScreenshotEvent):
+                run_result.screenshots.append(event.screenshot)
+            elif isinstance(event, ToolExecutionEvent):
+                run_result.tool_events.append(event)
+
+        run_result.result = await handler
+
+    except Exception as e:
+        logger.error(f"Agent run failed: {e}")
+        run_result.error = str(e)
+
+    return run_result
@@ -0,0 +1,84 @@
+"""Smoke test assertions."""
+
+import re
+import logging
+
+logger = logging.getLogger("smoke")
+
+
+class AssertionError(Exception):
+    pass
+
+
+def assert_result_success(result) -> None:
+    """Assert the agent reported success."""
+    if not result.success:
+        raise AssertionError(
+            f"Agent reported failure: {result.reason}"
+        )
+
+
+def assert_structured_output(result) -> None:
+    """Assert structured output contains a valid Android version."""
+    output = result.structured_output
+    if output is None:
+        raise AssertionError("No structured output returned")
+
+    version = getattr(output, "android_version", None)
+    if version is None:
+        raise AssertionError(
+            f"structured_output missing 'android_version' field: {output}"
+        )
+
+    if not re.match(r"^\d+", str(version)):
+        raise AssertionError(
+            f"android_version doesn't look like a version: '{version}'"
+        )
+
+
+def assert_type_secret_called(tool_events: list) -> None:
+    """Assert type_secret was called and succeeded."""
+    for event in tool_events:
+        if event.tool_name == "type_secret" and event.success:
+            return
+
+    names = [e.tool_name for e in tool_events]
+    raise AssertionError(
+        f"type_secret not found or failed in tool events. Tools called: {names}"
+    )
+
+
+def assert_package_name(ui_state, expected_substring: str) -> None:
+    """Assert the device's current package name contains the expected substring."""
+    pkg = ui_state.phone_state.package_name or ""
+    if expected_substring.lower() not in pkg.lower():
+        raise AssertionError(
+            f"Expected package containing '{expected_substring}', got '{pkg}'"
+        )
+
+
+# Maps an assertion name (as listed in a test's `assertions`) to a callable
+# that takes the shared context dict. Context keys read here: "result",
+# "tool_events", "ui_state" and "expected_package". Lookups are lazy, so a key
+# only needs to be present when the assertion using it is actually run.
+ASSERTION_MAP = {
+    "result_success": lambda ctx: assert_result_success(ctx["result"]),
+    "structured_output": lambda ctx: assert_structured_output(ctx["result"]),
+    "type_secret_called": lambda ctx: assert_type_secret_called(ctx["tool_events"]),
+    "package_name": lambda ctx: assert_package_name(
+        ctx["ui_state"], ctx["expected_package"]
+    ),
+}
+
+
+def run_assertions(assertion_names: list[str], context: dict) -> list[str]:
+    """Run named assertions and return list of failure messages."""
+    failures = []
+    for name in assertion_names:
+        fn = ASSERTION_MAP.get(name)
+        if fn is None:
+            failures.append(f"Unknown assertion: {name}")
+            continue
+        try:
+            fn(context)
+            logger.info(f"  PASS: {name}")
+        except (AssertionError, Exception) as e:
+            logger.error(f"  FAIL: {name} — {e}")
+            failures.append(f"{name}: {e}")
+    return failures
@@ -0,0 +1,58 @@
+"""Smoke test matrix configuration."""
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class SmokeTestConfig:
+    # One entry of the smoke-test matrix.
+
+    # Identifier; also used for per-test artifact names (GIF, trajectory dir).
+    name: str
+    # True selects reasoning mode; False is reported as "fast" mode.
+    reasoning: bool
+    # Whether vision is enabled for the agent.
+    vision: bool
+    # Step budget given to the agent.
+    max_steps: int
+    # Natural-language goal handed to the agent.
+    task: str
+    # When True the run requests structured output from the agent.
+    output_schema: bool = False
+    # When True the run supplies a test credential to the agent.
+    credentials: bool = False
+    # Substring expected in the foreground package (used by "package_name").
+    expected_package: str = ""
+    # Names of the assertions to evaluate after the run.
+    assertions: list[str] = field(default_factory=list)
+
+
+# The test matrix: {fast, reasoning} x {vision off, vision on}.
+SMOKE_TESTS: list[SmokeTestConfig] = [
+    SmokeTestConfig(
+        name="fast-no-vision",
+        reasoning=False,
+        vision=False,
+        max_steps=15,
+        task="Go to Settings and find the Android version number",
+        expected_package="settings",
+        assertions=["result_success", "package_name"],
+    ),
+    # The only entry that requests structured output.
+    SmokeTestConfig(
+        name="fast-vision",
+        reasoning=False,
+        vision=True,
+        max_steps=15,
+        task="Go to Settings and find the Android version number",
+        output_schema=True,
+        expected_package="settings",
+        assertions=["result_success", "structured_output", "package_name"],
+    ),
+    # The only entry that exercises credentials / type_secret.
+    # NOTE(review): expected_package is set but "package_name" is not listed
+    # in assertions, so the package is never checked here — confirm intended.
+    SmokeTestConfig(
+        name="reasoning-no-vision",
+        reasoning=True,
+        vision=False,
+        max_steps=30,
+        task="Open Chrome, tap the search bar, and use the type_secret tool to type the saved credential into it",
+        credentials=True,
+        expected_package="chrome",
+        assertions=["type_secret_called"],
+    ),
+    SmokeTestConfig(
+        name="reasoning-vision",
+        reasoning=True,
+        vision=True,
+        max_steps=30,
+        task="Go to Settings and find the Android version number",
+        expected_package="settings",
+        assertions=["result_success", "package_name"],
+    ),
+]
@@ -0,0 +1,70 @@
+"""Device lifecycle management via mobilerun SDK."""
+
+import asyncio
+import logging
+import time
+
+from mobilerun import AsyncMobilerun
+from mobilerun.types.device import Device
+from mobilerun.types.devices.state_ui_response import StateUiResponse
+
+logger = logging.getLogger("smoke")
+
+
+async def provision_device(client: AsyncMobilerun) -> Device:
+    """Provision a temporary emulated device.
+
+    Requests a device of type ``"dedicated_emulated_device"`` and returns the
+    created ``Device`` immediately; it does not wait for the device to be ready.
+    """
+    logger.info("Provisioning device...")
+    device = await client.devices.create(device_type="dedicated_emulated_device")
+    logger.info(f"Device created: {device.id} (state={device.state})")
+    return device
+
+
+async def wait_for_ready(
+    client: AsyncMobilerun, device_id: str, timeout: float = 180
+) -> Device:
+    """Wait for the device to reach 'ready' state, with retries on timeout."""
+    logger.info(f"Waiting for device {device_id} to be ready (timeout={timeout}s)...")
+    deadline = time.monotonic() + timeout
+
+    while True:
+        remaining = deadline - time.monotonic()
+        if remaining <= 0:
+            raise TimeoutError(f"Device {device_id} not ready after {timeout}s")
+
+        try:
+            # Long-poll with remaining time (capped at 60s per request)
+            poll_timeout = min(remaining, 60)
+            device = await client.devices.wait_ready(device_id, timeout=poll_timeout)
+            if device.state == "ready":
+                logger.info(f"Device {device_id} is ready")
+                return device
+            logger.info(f"Device {device_id} state={device.state}, retrying...")
+        except Exception as e:
+            remaining = deadline - time.monotonic()
+            if remaining <= 0:
+                raise TimeoutError(f"Device {device_id} not ready after {timeout}s") from e
+            logger.info(f"Wait poll returned ({e.__class__.__name__}), retrying ({remaining:.0f}s left)...")
+
+        await asyncio.sleep(2)
+
+
+async def press_home(client: AsyncMobilerun, device_id: str) -> None:
+    """Press the Home button to reset device state."""
+    # NOTE(review): action=2 is presumably the SDK's code for the Home global
+    # action — confirm against the mobilerun documentation.
+    await client.devices.actions.global_(device_id, action=2)
+    # Fixed 2s pause after the key press before returning to the caller.
+    await asyncio.sleep(2)
+
+
+async def get_ui_state(
+    client: AsyncMobilerun, device_id: str
+) -> StateUiResponse:
+    """Get the current UI state of the device.
+
+    Thin wrapper around ``client.devices.state.ui``; exceptions from the SDK
+    call propagate to the caller.
+    """
+    return await client.devices.state.ui(device_id)
+
+
+async def terminate_device(client: AsyncMobilerun, device_id: str) -> None:
+    """Terminate the device. Best-effort, never raises.
+
+    Any ``Exception`` from the SDK call is logged as a warning and swallowed.
+    """
+    try:
+        # NOTE(review): an empty extra_body is passed explicitly; the reason is
+        # not visible here — confirm whether the SDK requires it.
+        await client.devices.terminate(device_id, extra_body={})
+        logger.info(f"Device {device_id} terminated")
+    except Exception as e:
+        # Only logged: this runs during cleanup and must not mask the outcome.
+        logger.warning(f"Failed to terminate device {device_id}: {e}")
@@ -0,0 +1,52 @@
+"""GIF generation from screenshot bytes."""
+
+import io
+import logging
+from pathlib import Path
+
+from PIL import Image
+
+logger = logging.getLogger("smoke")
+
+
+def create_gif(
+    screenshots: list[bytes], output_path: Path, duration: int = 1000
+) -> Path | None:
+    """Create an animated GIF from a list of PNG screenshot bytes.
+
+    Returns the output path on success, None if no screenshots.
+    """
+    if not screenshots:
+        logger.warning("No screenshots to create GIF")
+        return None
+
+    images = []
+    for raw in screenshots:
+        try:
+            images.append(Image.open(io.BytesIO(raw)))
+        except Exception:
+            continue
+
+    if not images:
+        return None
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    try:
+        images[0].save(
+            output_path,
+            format="GIF",
+            save_all=True,
+            append_images=images[1:],
+            duration=duration,
+            loop=0,
+        )
+    finally:
+        for img in images:
+            try:
+                img.close()
+            except Exception:
+                pass
+
+    logger.info(f"GIF saved: {output_path} ({len(images)} frames)")
+    return output_path
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+"""Smoke test runner for droidrun.
+
+Provisions a cloud device (or uses an existing one), runs 4 agent configurations
+sequentially, asserts expected outcomes, generates trajectory GIFs, and writes a summary.
+
+Usage:
+    # Auto-provision a temporary device (run from the repository root):
+    MOBILERUN_API_KEY=xxx python -m tests.smoke.run --output-dir=artifacts
+
+    # Use an existing device (skips provisioning/termination):
+    MOBILERUN_API_KEY=xxx python -m tests.smoke.run --device-id=UUID --output-dir=artifacts
+"""
+
+import argparse
+import asyncio
+import logging
+import os
+import sys
+import time
+from pathlib import Path
+
+from mobilerun import AsyncMobilerun
+
+from tests.smoke.agent_runner import run_agent
+from tests.smoke.assertions import run_assertions
+from tests.smoke.config import SMOKE_TESTS
+from tests.smoke.device import (
+    get_ui_state,
+    press_home,
+    provision_device,
+    terminate_device,
+    wait_for_ready,
+)
+from tests.smoke.gif import create_gif
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
+    datefmt="%H:%M:%S",
+)
+logger = logging.getLogger("smoke")
+
+
+def _langfuse_session_url(host: str, session_id: str) -> str:
+    """Build a Langfuse session URL."""
+    host = host.rstrip("/")
+    return f"{host}/sessions/{session_id}"
+
+
+def write_summary(output_dir: Path, results: list[dict], langfuse_host: str) -> None:
+    """Write summary.md with pass/fail status, GIF links, and trace links."""
+    lines = ["# Smoke Test Results\n"]
+
+    passed = sum(1 for r in results if r["passed"])
+    total = len(results)
+    lines.append(f"**{passed}/{total} passed**\n")
+    lines.append("| Test | Mode | Vision | Status | Time | Details |")
+    lines.append("|------|------|--------|--------|------|---------|")
+
+    for r in results:
+        status = "PASS" if r["passed"] else "FAIL"
+        mode = "reasoning" if r["reasoning"] else "fast"
+        vision = "on" if r["vision"] else "off"
+        elapsed = f"{r['elapsed']:.0f}s"
+        details = r.get("error", "") or ", ".join(r.get("failures", []))
+        if not details:
+            details = r.get("reason", "")
+        details = details.replace("\n", " ").replace("|", "\\|")[:100]
+        lines.append(f"| {r['name']} | {mode} | {vision} | {status} | {elapsed} | {details} |")
+
+    # Langfuse traces
+    has_traces = any(r.get("langfuse_session_id") for r in results)
+    if has_traces:
+        lines.append("")
+        lines.append("## Langfuse Traces\n")
+        for r in results:
+            sid = r.get("langfuse_session_id")
+            if sid:
+                url = _langfuse_session_url(langfuse_host, sid)
+                lines.append(f"- **{r['name']}**: [{sid[:8]}...]({url})")
+            else:
+                lines.append(f"- **{r['name']}**: _no trace_")
+
+    # Trajectory files
+    lines.append("")
+    lines.append("## Trajectories\n")
+    for r in results:
+        traj_dir = f"trajectories/{r['name']}"
+        lines.append(f"- **{r['name']}**: [`{traj_dir}/`]({traj_dir}/)")
+
+    # GIFs
+    lines.append("")
+    lines.append("## Trajectory GIFs\n")
+    for r in results:
+        gif_name = f"{r['name']}.gif"
+        if r["has_gif"]:
+            lines.append(f"### {r['name']}\n")
+            lines.append(f"![{r['name']}]({gif_name})\n")
+        else:
+            lines.append(f"### {r['name']}\n")
+            lines.append("_No screenshots captured._\n")
+
+    summary_path = output_dir / "summary.md"
+    summary_path.write_text("\n".join(lines))
+    logger.info(f"Summary written to {summary_path}")
+
+
+async def main(output_dir: Path, device_id_arg: str | None) -> int:
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    api_key = os.environ.get("MOBILERUN_API_KEY")
+    if not api_key:
+        logger.error("MOBILERUN_API_KEY env var is required")
+        return 1
+
+    google_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
+    if not google_key:
+        logger.error("GOOGLE_API_KEY or GEMINI_API_KEY env var is required")
+        return 1
+
+    base_url = os.environ.get("MOBILERUN_BASE_URL", "https://api.mobilerun.ai/v1")
+    langfuse_host = os.environ.get("LANGFUSE_HOST", "https://us.cloud.langfuse.com")
+
+    client = AsyncMobilerun(api_key=api_key, base_url=base_url)
+    device_id = device_id_arg
+    provisioned = False
+
+    try:
+        if device_id:
+            logger.info(f"Using existing device: {device_id}")
+        else:
+            device = await provision_device(client)
+            device_id = device.id
+            provisioned = True
+            await wait_for_ready(client, device_id, timeout=120)
+
+        results = []
+
+        for test_config in SMOKE_TESTS:
+            logger.info(f"\n{'=' * 60}")
+            logger.info(f"Running: {test_config.name}")
+            logger.info(f"  mode={'reasoning' if test_config.reasoning else 'fast'} vision={test_config.vision}")
+            logger.info(f"  task: {test_config.task}")
+            logger.info(f"{'=' * 60}")
+
+            # Reset device to home screen
+            await press_home(client, device_id)
+
+            # Trajectory dir per test
+            trajectory_dir = str(output_dir / "trajectories" / test_config.name)
+
+            t0 = time.monotonic()
+            run_result = await run_agent(
+                test_config,
+                device_id,
+                api_key,
+                base_url,
+                trajectory_dir=trajectory_dir,
+                langfuse_host=langfuse_host,
+            )
+            elapsed = time.monotonic() - t0
+
+            # Build assertion context
+            assertion_ctx = {
+                "result": run_result.result,
+                "tool_events": run_result.tool_events,
+                "expected_package": test_config.expected_package,
+            }
+
+            # Get UI state for package name assertion
+            if "package_name" in test_config.assertions:
+                try:
+                    assertion_ctx["ui_state"] = await get_ui_state(client, device_id)
+                except Exception as e:
+                    logger.warning(f"Failed to get UI state: {e}")
+                    assertion_ctx["ui_state"] = None
+
+            # Run assertions
+            if run_result.error:
+                failures = [f"agent_error: {run_result.error}"]
+            else:
+                failures = run_assertions(test_config.assertions, assertion_ctx)
+
+            passed = len(failures) == 0
+
+            # Generate GIF
+            gif_path = create_gif(
+                run_result.screenshots,
+                output_dir / f"{test_config.name}.gif",
+            )
+
+            # Log result
+            status = "PASS" if passed else "FAIL"
+            logger.info(f"\n  Result: {status} ({elapsed:.0f}s, {len(run_result.screenshots)} screenshots)")
+            if run_result.langfuse_session_id:
+                logger.info(f"  Langfuse session: {run_result.langfuse_session_id}")
+            if failures:
+                for f in failures:
+                    logger.error(f"    {f}")
+
+            results.append({
+                "name": test_config.name,
+                "passed": passed,
+                "elapsed": elapsed,
+                "reasoning": test_config.reasoning,
+                "vision": test_config.vision,
+                "failures": failures,
+                "error": run_result.error,
+                "reason": run_result.result.reason if run_result.result else "",
+                "has_gif": gif_path is not None,
+                "langfuse_session_id": run_result.langfuse_session_id,
+            })
+
+        # Summary
+        write_summary(output_dir, results, langfuse_host)
+
+        logger.info(f"\n{'=' * 60}")
+        logger.info("SUMMARY")
+        logger.info(f"{'=' * 60}")
+        all_passed = all(r["passed"] for r in results)
+        for r in results:
+            icon = "PASS" if r["passed"] else "FAIL"
+            logger.info(f"  [{icon}] {r['name']} ({r['elapsed']:.0f}s)")
+
+        total_passed = sum(1 for r in results if r["passed"])
+        logger.info(f"\n  {total_passed}/{len(results)} passed")
+
+        return 0 if all_passed else 1
+
+    finally:
+        if provisioned and device_id:
+            await terminate_device(client, device_id)
+
+
+# Script entry point: parse CLI options, run the async main() to completion,
+# and exit with the code it returns.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run droidrun smoke tests")
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("artifacts"),
+        help="Directory for GIFs and summary (default: artifacts)",
+    )
+    # Omitting --device-id makes main() provision (and later terminate) a device.
+    parser.add_argument(
+        "--device-id",
+        type=str,
+        default=None,
+        help="Use an existing device ID instead of provisioning a new one",
+    )
+    args = parser.parse_args()
+
+    exit_code = asyncio.run(main(args.output_dir, args.device_id))
+    sys.exit(exit_code)
@@ -760,7 +760,7 @@ wheels = [

 [[package]]
 name = "droidrun"
-version = "0.5.0.dev6"
+version = "0.5.0"
 source = { editable = "." }
 dependencies = [
    { name = "aiofiles" },