feat: add smoke test CI for PR regression testing

This commit is contained in:
johnmalek312
2026-03-05 18:33:31 +11:00
parent 3260801767
commit b25e4be9cc
10 changed files with 737 additions and 1 deletions
+93
View File
@@ -0,0 +1,93 @@
# Smoke-test workflow: runs the droidrun smoke suite on pull requests that
# touch the package sources, uploads the generated artifacts, and keeps a
# single PR comment up to date with the result summary.
name: Smoke Tests

on:
  pull_request:
    paths:
      - "droidrun/**"

# Only one smoke run per PR branch; a newer push cancels the older run.
concurrency:
  group: smoke-${{ github.head_ref }}
  cancel-in-progress: true

jobs:
  smoke:
    runs-on: ubuntu-latest
    timeout-minutes: 20
    permissions:
      contents: read
      # Needed by the "Comment on PR" step to create/update the comment.
      pull-requests: write
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.13"
      - uses: astral-sh/setup-uv@v4
      - name: Install dependencies
        run: uv sync --all-groups
      - name: Run smoke tests
        env:
          MOBILERUN_API_KEY: ${{ secrets.MOBILERUN_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
          LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }}
          LANGFUSE_HOST: ${{ secrets.LANGFUSE_HOST }}
        run: uv run python -m tests.smoke.run --output-dir=artifacts
      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: smoke-test-results
          path: artifacts/
      - name: Comment on PR
        # Run after success or failure, but not when this run was cancelled
        # (e.g. superseded via cancel-in-progress); otherwise a cancelled run
        # would overwrite the comment with "Failed to generate results".
        if: ${{ !cancelled() }}
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const summaryPath = 'artifacts/summary.md';
            const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;

            let body;
            if (fs.existsSync(summaryPath)) {
              body = fs.readFileSync(summaryPath, 'utf8');
              // Add artifact download link
              body += `\n\n---\n[Download all artifacts](${runUrl}#artifacts)`;
            } else {
              body = '## Smoke Tests\n\n**Failed to generate results.** Check the [workflow run](' +
                runUrl +
                ') for details.';
            }

            // Hidden marker used to recognise the comment created by this workflow.
            const marker = '<!-- smoke-test-results -->';
            body = marker + '\n' + body;

            // Find existing smoke test comment to update. Paginate so the marker
            // comment is still found on PRs with more comments than one page.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              per_page: 100,
            });
            const existing = comments.find(c => c.body?.includes(marker));

            if (existing) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existing.id,
                body,
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body,
              });
            }
View File
View File
+126
View File
@@ -0,0 +1,126 @@
"""Run DroidAgent from source against a cloud device."""
import logging
import os
from dataclasses import dataclass, field
from uuid import uuid4
from pydantic import BaseModel, Field
from droidrun import DroidAgent, DroidrunConfig, load_llm
from droidrun.agent.common.events import ScreenshotEvent, ToolExecutionEvent
from droidrun.agent.droid.events import ResultEvent
from droidrun.tools.driver.cloud import CloudDriver
from tests.smoke.config import SmokeTestConfig
# Logger named "smoke"; the name appears in log output via the logging format.
logger = logging.getLogger("smoke")

# Model identifier passed to load_llm("GoogleGenAI", ...) for every smoke run.
LLM_MODEL = "gemini-3.1-flash-lite-preview"
class AndroidVersion(BaseModel):
    """Structured output model for extracting Android version.

    Passed to the agent as ``output_model`` when a test enables
    ``output_schema``.
    """

    # Free-form string; the field description gives '14' and '15' as examples.
    android_version: str = Field(description="The Android version number (e.g. '14', '15')")
@dataclass
class RunResult:
    """Everything collected from one agent run.

    Every field has a default, so an empty ``RunResult()`` can be created
    up front and filled in as the run progresses.
    """

    # Value obtained by awaiting the run handler; stays None if the run raised.
    result: ResultEvent | None = None
    # Payloads taken from ScreenshotEvent.screenshot, in the order received.
    screenshots: list[bytes] = field(default_factory=list)
    # Every ToolExecutionEvent seen on the event stream, in order.
    tool_events: list[ToolExecutionEvent] = field(default_factory=list)
    # Stringified exception when the run failed, otherwise None.
    error: str | None = None
    # Generated session id when Langfuse tracing was enabled, otherwise None.
    langfuse_session_id: str | None = None
async def run_agent(
    test_config: SmokeTestConfig,
    device_id: str,
    api_key: str,
    base_url: str,
    trajectory_dir: str | None = None,
    langfuse_host: str | None = None,
) -> RunResult:
    """Run a single smoke test agent and collect results.

    Args:
        test_config: Test definition (task, mode, vision, step limit, and the
            flags that enable credentials / structured output).
        device_id: Cloud device to drive.
        api_key: API key handed to the cloud driver.
        base_url: Base URL handed to the cloud driver.
        trajectory_dir: When given, trajectories (including GIFs) are saved
            there; otherwise trajectory saving is turned off.
        langfuse_host: Langfuse host override. Falls back to the
            ``LANGFUSE_HOST`` env var, then to the US cloud endpoint.

    Returns:
        A ``RunResult``. Exceptions raised while building or running the agent
        are not propagated; they are logged and stored in ``RunResult.error``,
        which is guaranteed to be a non-empty string in that case.
    """
    run_result = RunResult()

    # Ensure screenshots are emitted even for non-vision runs.
    # NOTE: this mutates the process-wide environment.
    os.environ["DROIDRUN_STREAM_SCREENSHOTS"] = "1"

    try:
        driver = CloudDriver(
            device_id=device_id,
            api_key=api_key,
            base_url=base_url,
        )

        config = DroidrunConfig()
        config.agent.reasoning = test_config.reasoning
        config.agent.max_steps = test_config.max_steps
        config.agent.streaming = False
        config.agent.fast_agent.vision = test_config.vision
        config.agent.manager.vision = test_config.vision
        config.agent.executor.vision = test_config.vision
        config.telemetry.enabled = False

        # Trajectory writer
        if trajectory_dir:
            config.logging.save_trajectory = "all"
            config.logging.trajectory_path = trajectory_dir
            config.logging.trajectory_gifs = True
        else:
            config.logging.save_trajectory = "none"

        # Langfuse tracing: only enabled when both keys are present.
        langfuse_secret = os.environ.get("LANGFUSE_SECRET_KEY", "")
        langfuse_public = os.environ.get("LANGFUSE_PUBLIC_KEY", "")
        if langfuse_secret and langfuse_public:
            session_id = str(uuid4())
            run_result.langfuse_session_id = session_id
            config.tracing.enabled = True
            config.tracing.provider = "langfuse"
            config.tracing.langfuse_secret_key = langfuse_secret
            config.tracing.langfuse_public_key = langfuse_public
            # Use `or` rather than a .get() default: CI exports unset secrets
            # as empty strings, which would otherwise bypass the default.
            config.tracing.langfuse_host = (
                langfuse_host
                or os.environ.get("LANGFUSE_HOST")
                or "https://us.cloud.langfuse.com"
            )
            config.tracing.langfuse_session_id = session_id
            config.tracing.langfuse_user_id = "smoke-test"
        else:
            config.tracing.enabled = False

        llm = load_llm("GoogleGenAI", model=LLM_MODEL)

        # Dummy credential used by the type_secret smoke test.
        credentials = None
        if test_config.credentials:
            credentials = {"test-account": "smoketest123"}

        output_model = None
        if test_config.output_schema:
            output_model = AndroidVersion

        agent = DroidAgent(
            goal=test_config.task,
            config=config,
            llms=llm,
            driver=driver,
            credentials=credentials,
            output_model=output_model,
            timeout=300,
        )

        handler = agent.run()
        async for event in handler.stream_events():
            if isinstance(event, ScreenshotEvent):
                run_result.screenshots.append(event.screenshot)
            elif isinstance(event, ToolExecutionEvent):
                run_result.tool_events.append(event)

        run_result.result = await handler
    except Exception as e:
        # Keep the traceback in the log; the summary only gets the message.
        logger.exception("Agent run failed: %s", e)
        # Some exceptions stringify to "" (e.g. a bare timeout). Fall back to
        # repr() so callers that test `error` for truthiness still see a failure.
        run_result.error = str(e) or repr(e)

    return run_result
+84
View File
@@ -0,0 +1,84 @@
"""Smoke test assertions."""
import re
import logging
# Logger named "smoke"; PASS/FAIL lines for each assertion are written here.
logger = logging.getLogger("smoke")
class AssertionError(Exception):
    """Raised by the smoke-test checks in this module when a check fails.

    NOTE(review): this name shadows the builtin ``AssertionError`` inside this
    module, and this class derives from ``Exception``, not from the builtin.
    """
def assert_result_success(result) -> None:
    """Check that the agent's result is flagged as successful.

    Raises:
        AssertionError: ``result.success`` is falsy; the message carries
            ``result.reason``.
    """
    if result.success:
        return
    message = f"Agent reported failure: {result.reason}"
    raise AssertionError(message)
def assert_structured_output(result) -> None:
    """Check that the result carries an Android version that starts with digits.

    Raises:
        AssertionError: no structured output, no ``android_version`` attribute
            on it, or a version whose string form does not begin with a digit.
    """
    payload = result.structured_output
    if payload is None:
        raise AssertionError("No structured output returned")

    android_version = getattr(payload, "android_version", None)
    if android_version is None:
        raise AssertionError(
            f"structured_output missing 'android_version' field: {payload}"
        )

    starts_with_digits = re.match(r"^\d+", str(android_version)) is not None
    if starts_with_digits:
        return
    raise AssertionError(
        f"android_version doesn't look like a version: '{android_version}'"
    )
def assert_type_secret_called(tool_events: list) -> None:
    """Check that at least one successful ``type_secret`` tool call happened.

    Raises:
        AssertionError: no event has ``tool_name == "type_secret"`` with a
            truthy ``success``; the message lists every tool name seen.
    """
    secret_typed = any(
        ev.tool_name == "type_secret" and ev.success for ev in tool_events
    )
    if secret_typed:
        return

    called = [ev.tool_name for ev in tool_events]
    raise AssertionError(
        f"type_secret not found or failed in tool events. Tools called: {called}"
    )
def assert_package_name(ui_state, expected_substring: str) -> None:
    """Assert the device's current package name contains the expected substring.

    The comparison is case-insensitive. A missing UI state (``None``, which the
    runner passes when fetching the state failed) is reported as an assertion
    failure instead of surfacing as an ``AttributeError``.

    Args:
        ui_state: Object exposing ``phone_state.package_name``, or ``None``.
        expected_substring: Text expected somewhere in the package name.

    Raises:
        AssertionError: The UI state is unavailable or the package name does
            not contain ``expected_substring``.
    """
    if ui_state is None:
        raise AssertionError(
            f"UI state unavailable; cannot check for package containing "
            f"'{expected_substring}'"
        )

    phone_state = getattr(ui_state, "phone_state", None)
    # A missing phone state or package name is treated as an empty package.
    pkg = getattr(phone_state, "package_name", None) or ""
    if expected_substring.lower() not in pkg.lower():
        raise AssertionError(
            f"Expected package containing '{expected_substring}', got '{pkg}'"
        )
# Maps an assertion name (as listed in a test's `assertions`) to a callable
# that takes the assertion context dict and raises on failure.
# Context keys read here: "result", "tool_events", "ui_state", "expected_package".
ASSERTION_MAP = {
    "result_success": lambda ctx: assert_result_success(ctx["result"]),
    "structured_output": lambda ctx: assert_structured_output(ctx["result"]),
    "type_secret_called": lambda ctx: assert_type_secret_called(ctx["tool_events"]),
    # Raises KeyError if the context has no "ui_state" entry.
    "package_name": lambda ctx: assert_package_name(
        ctx["ui_state"], ctx["expected_package"]
    ),
}
def run_assertions(assertion_names: list[str], context: dict) -> list[str]:
    """Run named assertions and return list of failure messages.

    Args:
        assertion_names: Keys of ``ASSERTION_MAP`` to evaluate, in order.
        context: Values handed to each assertion (see ``ASSERTION_MAP`` for
            the keys each one reads).

    Returns:
        One message per failed or unknown assertion; empty when all passed.
        Any exception raised by a check is recorded as a failure rather than
        propagated, so one broken check cannot abort the rest.
    """
    failures: list[str] = []
    for name in assertion_names:
        check = ASSERTION_MAP.get(name)
        if check is None:
            failures.append(f"Unknown assertion: {name}")
            continue
        try:
            check(context)
        except Exception as e:
            # Separator added so the check name and the message do not run together.
            logger.error(f" FAIL: {name}: {e}")
            failures.append(f"{name}: {e}")
        else:
            logger.info(f" PASS: {name}")
    return failures
+58
View File
@@ -0,0 +1,58 @@
"""Smoke test matrix configuration."""
from dataclasses import dataclass, field
@dataclass
class SmokeTestConfig:
    """Definition of one smoke test: how to configure the agent and what to check."""

    # Test identifier; also used for the GIF file name and trajectory directory.
    name: str
    # Copied to config.agent.reasoning (reported as "reasoning" vs "fast" mode).
    reasoning: bool
    # Copied to the vision flag of the fast agent, manager and executor.
    vision: bool
    # Copied to config.agent.max_steps.
    max_steps: int
    # Natural-language goal given to the agent.
    task: str
    # When True, the agent is asked for structured output (AndroidVersion).
    output_schema: bool = False
    # When True, a test credential is supplied to the agent.
    credentials: bool = False
    # Substring expected in the foreground package; only checked when
    # "package_name" is listed in `assertions`.
    expected_package: str = ""
    # Names of checks to run; each must be a key of ASSERTION_MAP.
    assertions: list[str] = field(default_factory=list)
# Test matrix: every combination of reasoning on/off and vision on/off.
# The runner executes the entries sequentially in this order.
SMOKE_TESTS: list[SmokeTestConfig] = [
    # Fast mode, no vision: plain success + foreground-package check.
    SmokeTestConfig(
        name="fast-no-vision",
        reasoning=False,
        vision=False,
        max_steps=15,
        task="Go to Settings and find the Android version number",
        expected_package="settings",
        assertions=["result_success", "package_name"],
    ),
    # Fast mode with vision: additionally exercises structured output.
    SmokeTestConfig(
        name="fast-vision",
        reasoning=False,
        vision=True,
        max_steps=15,
        task="Go to Settings and find the Android version number",
        output_schema=True,
        expected_package="settings",
        assertions=["result_success", "structured_output", "package_name"],
    ),
    # Reasoning mode, no vision: exercises credential typing via type_secret.
    # NOTE(review): expected_package is set but "package_name" is not in
    # `assertions`, so the package is not actually checked — confirm intent.
    SmokeTestConfig(
        name="reasoning-no-vision",
        reasoning=True,
        vision=False,
        max_steps=30,
        task="Open Chrome, tap the search bar, and use the type_secret tool to type the saved credential into it",
        credentials=True,
        expected_package="chrome",
        assertions=["type_secret_called"],
    ),
    # Reasoning mode with vision: plain success + foreground-package check.
    SmokeTestConfig(
        name="reasoning-vision",
        reasoning=True,
        vision=True,
        max_steps=30,
        task="Go to Settings and find the Android version number",
        expected_package="settings",
        assertions=["result_success", "package_name"],
    ),
]
+70
View File
@@ -0,0 +1,70 @@
"""Device lifecycle management via mobilerun SDK."""
import asyncio
import logging
import time
from mobilerun import AsyncMobilerun
from mobilerun.types.device import Device
from mobilerun.types.devices.state_ui_response import StateUiResponse
# Logger named "smoke"; device lifecycle messages are written here.
logger = logging.getLogger("smoke")
async def provision_device(client: AsyncMobilerun) -> Device:
    """Create a new device of type ``dedicated_emulated_device`` and return it.

    The device is returned as created; this function does not wait for it to
    become ready.
    """
    logger.info("Provisioning device...")
    created = await client.devices.create(device_type="dedicated_emulated_device")
    creation_note = f"Device created: {created.id} (state={created.state})"
    logger.info(creation_note)
    return created
async def wait_for_ready(
    client: AsyncMobilerun, device_id: str, timeout: float = 180
) -> Device:
    """Wait for the device to reach 'ready' state, with retries on timeout.

    Args:
        client: SDK client used to poll the device.
        device_id: Device to wait for.
        timeout: Overall budget in seconds across all polls and pauses.

    Returns:
        The device object from the poll that reported state ``"ready"``.

    Raises:
        TimeoutError: The overall deadline passed before the device was ready.
            When the last poll raised, that exception is chained as the cause.
    """
    logger.info(f"Waiting for device {device_id} to be ready (timeout={timeout}s)...")
    # Monotonic clock so the deadline is unaffected by wall-clock changes.
    deadline = time.monotonic() + timeout

    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            raise TimeoutError(f"Device {device_id} not ready after {timeout}s")
        try:
            # Long-poll with remaining time (capped at 60s per request)
            poll_timeout = min(remaining, 60)
            device = await client.devices.wait_ready(device_id, timeout=poll_timeout)
            if device.state == "ready":
                logger.info(f"Device {device_id} is ready")
                return device
            logger.info(f"Device {device_id} state={device.state}, retrying...")
        except Exception as e:
            # Any poll error is treated as retryable until the deadline passes.
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(f"Device {device_id} not ready after {timeout}s") from e
            logger.info(f"Wait poll returned ({e.__class__.__name__}), retrying ({remaining:.0f}s left)...")
        # Short pause between polls; not clamped to the remaining budget.
        await asyncio.sleep(2)
async def press_home(client: AsyncMobilerun, device_id: str) -> None:
    """Send the Home global action to the device, then pause briefly.

    Used to bring the device back to a known screen between tests.
    """
    # Global action code sent for "Home" — value taken as-is; confirm against the SDK.
    home_action = 2
    # Fixed pause (seconds) after the action before returning.
    settle_seconds = 2

    await client.devices.actions.global_(device_id, action=home_action)
    await asyncio.sleep(settle_seconds)
async def get_ui_state(
    client: AsyncMobilerun, device_id: str
) -> StateUiResponse:
    """Get the current UI state of the device.

    Thin wrapper around ``client.devices.state.ui``; any exception raised by
    the SDK call propagates to the caller.
    """
    return await client.devices.state.ui(device_id)
async def terminate_device(client: AsyncMobilerun, device_id: str) -> None:
    """Ask the service to terminate the device, logging the outcome.

    Best-effort cleanup: failures are logged as warnings and swallowed so the
    caller's own error (if any) is not masked.
    """
    try:
        await client.devices.terminate(device_id, extra_body={})
    except Exception as e:
        logger.warning(f"Failed to terminate device {device_id}: {e}")
    else:
        logger.info(f"Device {device_id} terminated")
+52
View File
@@ -0,0 +1,52 @@
"""GIF generation from screenshot bytes."""
import io
import logging
from pathlib import Path
from PIL import Image
# Logger named "smoke"; GIF generation messages are written here.
logger = logging.getLogger("smoke")
def create_gif(
    screenshots: list[bytes], output_path: Path, duration: int = 1000
) -> Path | None:
    """Create an animated GIF from a list of PNG screenshot bytes.

    Frames that cannot be opened as images are skipped and reported in the
    log; the remaining frames are written in their original order.

    Args:
        screenshots: Encoded image bytes, one entry per frame.
        output_path: Destination file; parent directories are created.
        duration: Forwarded to Pillow's GIF writer as ``duration``.

    Returns:
        The output path on success, ``None`` if there were no screenshots or
        none of them could be decoded.

    Raises:
        Exception: Errors raised while creating directories or saving the GIF
            are propagated.
    """
    if not screenshots:
        logger.warning("No screenshots to create GIF")
        return None

    images = []
    skipped = 0
    for index, raw in enumerate(screenshots):
        try:
            images.append(Image.open(io.BytesIO(raw)))
        except Exception as e:
            # Best-effort: one bad frame should not prevent the GIF.
            skipped += 1
            logger.debug(f"Skipping undecodable screenshot #{index}: {e}")

    if skipped:
        logger.warning(
            f"Skipped {skipped} of {len(screenshots)} screenshots that could not be decoded"
        )
    if not images:
        return None

    output_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        images[0].save(
            output_path,
            format="GIF",
            save_all=True,
            append_images=images[1:],
            duration=duration,
            loop=0,
        )
    finally:
        # Always release the frames, even when saving failed.
        for img in images:
            try:
                img.close()
            except Exception:
                # Closing is best-effort; a failure here must not mask a save error.
                pass

    logger.info(f"GIF saved: {output_path} ({len(images)} frames)")
    return output_path
+253
View File
@@ -0,0 +1,253 @@
#!/usr/bin/env python3
"""Smoke test runner for droidrun.
Provisions a cloud device (or uses an existing one), runs 4 agent configurations
sequentially, asserts expected outcomes, generates trajectory GIFs, and writes a summary.
Usage:
# Auto-provision a temporary device:
MOBILERUN_API_KEY=xxx python -m tests.smoke.run --output-dir=artifacts
# Use an existing device (skips provisioning/termination):
MOBILERUN_API_KEY=xxx python -m tests.smoke.run --device-id=UUID --output-dir=artifacts
"""
import argparse
import asyncio
import logging
import os
import sys
import time
from pathlib import Path
from mobilerun import AsyncMobilerun
from tests.smoke.agent_runner import run_agent
from tests.smoke.assertions import run_assertions
from tests.smoke.config import SMOKE_TESTS
from tests.smoke.device import (
get_ui_state,
press_home,
provision_device,
terminate_device,
wait_for_ready,
)
from tests.smoke.gif import create_gif
# Configure logging at import time: INFO level, time-of-day timestamps, and
# the logger name in brackets.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    datefmt="%H:%M:%S",
)
# Logger named "smoke"; all runner output in this module goes through it.
logger = logging.getLogger("smoke")
def _langfuse_session_url(host: str, session_id: str) -> str:
    """Return ``<host>/sessions/<session_id>``, ignoring trailing slashes on host."""
    base = host.rstrip("/")
    return "/".join((base, "sessions", session_id))
def write_summary(output_dir: Path, results: list[dict], langfuse_host: str) -> None:
    """Write summary.md with pass/fail status, GIF links, and trace links.

    Args:
        output_dir: Directory that receives ``summary.md``.
        results: One dict per test. Keys read here: ``name``, ``passed``,
            ``reasoning``, ``vision``, ``elapsed``, ``has_gif`` (required) and
            ``error``, ``failures``, ``reason``, ``langfuse_session_id``
            (optional).
        langfuse_host: Base URL used to build Langfuse session links.
    """
    lines = ["# Smoke Test Results\n"]

    passed = sum(1 for r in results if r["passed"])
    total = len(results)
    lines.append(f"**{passed}/{total} passed**\n")

    lines.append("| Test | Mode | Vision | Status | Time | Details |")
    lines.append("|------|------|--------|--------|------|---------|")
    for r in results:
        status = "PASS" if r["passed"] else "FAIL"
        mode = "reasoning" if r["reasoning"] else "fast"
        vision = "on" if r["vision"] else "off"
        elapsed = f"{r['elapsed']:.0f}s"

        # Prefer the agent error, then assertion failures, then the agent's reason.
        details = r.get("error") or ", ".join(r.get("failures") or [])
        if not details:
            details = r.get("reason") or ""
        # Truncate BEFORE escaping: cutting after escaping could split a "\|"
        # pair and leave a trailing backslash that swallows the cell delimiter.
        details = details.replace("\n", " ")[:100].replace("|", "\\|")

        lines.append(f"| {r['name']} | {mode} | {vision} | {status} | {elapsed} | {details} |")

    # Langfuse traces (section omitted entirely when no test produced one)
    has_traces = any(r.get("langfuse_session_id") for r in results)
    if has_traces:
        lines.append("")
        lines.append("## Langfuse Traces\n")
        for r in results:
            sid = r.get("langfuse_session_id")
            if sid:
                url = _langfuse_session_url(langfuse_host, sid)
                lines.append(f"- **{r['name']}**: [{sid[:8]}...]({url})")
            else:
                lines.append(f"- **{r['name']}**: _no trace_")

    # Trajectory files (paths are relative to the output directory)
    lines.append("")
    lines.append("## Trajectories\n")
    for r in results:
        traj_dir = f"trajectories/{r['name']}"
        lines.append(f"- **{r['name']}**: [`{traj_dir}/`]({traj_dir}/)")

    # GIFs
    lines.append("")
    lines.append("## Trajectory GIFs\n")
    for r in results:
        gif_name = f"{r['name']}.gif"
        lines.append(f"### {r['name']}\n")
        if r["has_gif"]:
            lines.append(f"![{r['name']}]({gif_name})\n")
        else:
            lines.append("_No screenshots captured._\n")

    summary_path = output_dir / "summary.md"
    # Explicit encoding: details may contain non-ASCII text from the agent.
    summary_path.write_text("\n".join(lines), encoding="utf-8")
    logger.info(f"Summary written to {summary_path}")
async def main(output_dir: Path, device_id_arg: str | None) -> int:
    """Run every smoke test against one device and write the summary.

    Args:
        output_dir: Directory for GIFs, trajectories and ``summary.md``;
            created if missing.
        device_id_arg: Existing device to use. When ``None`` a device is
            provisioned, waited on, and terminated at the end.

    Returns:
        Process exit code: ``0`` when every test passed, ``1`` when any test
        failed or a required environment variable is missing.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    api_key = os.environ.get("MOBILERUN_API_KEY")
    if not api_key:
        logger.error("MOBILERUN_API_KEY env var is required")
        return 1

    # Only checked for presence here; the value itself is not used below.
    google_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
    if not google_key:
        logger.error("GOOGLE_API_KEY or GEMINI_API_KEY env var is required")
        return 1

    # `or` instead of a .get() default: CI exports unset secrets as empty
    # strings, which must fall back to the default too.
    base_url = os.environ.get("MOBILERUN_BASE_URL") or "https://api.mobilerun.ai/v1"
    langfuse_host = os.environ.get("LANGFUSE_HOST") or "https://us.cloud.langfuse.com"

    client = AsyncMobilerun(api_key=api_key, base_url=base_url)
    device_id = device_id_arg
    provisioned = False

    try:
        if device_id:
            logger.info(f"Using existing device: {device_id}")
        else:
            device = await provision_device(client)
            device_id = device.id
            provisioned = True
            await wait_for_ready(client, device_id, timeout=120)

        results = []
        for test_config in SMOKE_TESTS:
            logger.info(f"\n{'=' * 60}")
            logger.info(f"Running: {test_config.name}")
            logger.info(f" mode={'reasoning' if test_config.reasoning else 'fast'} vision={test_config.vision}")
            logger.info(f" task: {test_config.task}")
            logger.info(f"{'=' * 60}")

            # Reset device to home screen
            await press_home(client, device_id)

            # Trajectory dir per test
            trajectory_dir = str(output_dir / "trajectories" / test_config.name)

            t0 = time.monotonic()
            run_result = await run_agent(
                test_config,
                device_id,
                api_key,
                base_url,
                trajectory_dir=trajectory_dir,
                langfuse_host=langfuse_host,
            )
            elapsed = time.monotonic() - t0

            # Build assertion context
            assertion_ctx = {
                "result": run_result.result,
                "tool_events": run_result.tool_events,
                "expected_package": test_config.expected_package,
            }

            # Get UI state for package name assertion
            if "package_name" in test_config.assertions:
                try:
                    assertion_ctx["ui_state"] = await get_ui_state(client, device_id)
                except Exception as e:
                    logger.warning(f"Failed to get UI state: {e}")
                    assertion_ctx["ui_state"] = None

            # Run assertions. Compare against None rather than truthiness: an
            # exception with an empty message must still count as an agent error.
            if run_result.error is not None:
                failures = [f"agent_error: {run_result.error}"]
            else:
                failures = run_assertions(test_config.assertions, assertion_ctx)

            passed = len(failures) == 0

            # Generate GIF
            gif_path = create_gif(
                run_result.screenshots,
                output_dir / f"{test_config.name}.gif",
            )

            # Log result
            status = "PASS" if passed else "FAIL"
            logger.info(f"\n Result: {status} ({elapsed:.0f}s, {len(run_result.screenshots)} screenshots)")
            if run_result.langfuse_session_id:
                logger.info(f" Langfuse session: {run_result.langfuse_session_id}")
            for failure in failures:
                logger.error(f" {failure}")

            results.append({
                "name": test_config.name,
                "passed": passed,
                "elapsed": elapsed,
                "reasoning": test_config.reasoning,
                "vision": test_config.vision,
                "failures": failures,
                "error": run_result.error,
                "reason": run_result.result.reason if run_result.result else "",
                "has_gif": gif_path is not None,
                "langfuse_session_id": run_result.langfuse_session_id,
            })

        # Summary
        write_summary(output_dir, results, langfuse_host)

        logger.info(f"\n{'=' * 60}")
        logger.info("SUMMARY")
        logger.info(f"{'=' * 60}")
        all_passed = all(r["passed"] for r in results)
        for r in results:
            icon = "PASS" if r["passed"] else "FAIL"
            logger.info(f" [{icon}] {r['name']} ({r['elapsed']:.0f}s)")
        total_passed = sum(1 for r in results if r["passed"])
        logger.info(f"\n {total_passed}/{len(results)} passed")

        return 0 if all_passed else 1

    finally:
        # Only devices created by this run are terminated; a device passed in
        # by the caller is left running.
        if provisioned and device_id:
            await terminate_device(client, device_id)
if __name__ == "__main__":
    # Command-line entry point: parse options, run the suite, and exit with
    # the code returned by main().
    cli = argparse.ArgumentParser(description="Run droidrun smoke tests")
    cli.add_argument(
        "--output-dir",
        type=Path,
        default=Path("artifacts"),
        help="Directory for GIFs and summary (default: artifacts)",
    )
    cli.add_argument(
        "--device-id",
        type=str,
        default=None,
        help="Use an existing device ID instead of provisioning a new one",
    )
    options = cli.parse_args()
    sys.exit(asyncio.run(main(options.output_dir, options.device_id)))
Generated
+1 -1
View File
@@ -760,7 +760,7 @@ wheels = [
[[package]]
name = "droidrun"
version = "0.5.0.dev6"
version = "0.5.0"
source = { editable = "." }
dependencies = [
{ name = "aiofiles" },