Files
FluidAudio/Scripts/run_benchmarks.py
Alex 7d074e1ee6 chore: consolidate Python scripts into Scripts/ (#344)
## Summary
- Move `Benchmarks/nemo` to `Scripts/nemo_ami_benchmark`
- Move `Tools/voice_cloning` to `Scripts/voice_cloning`
- Remove now-empty `Benchmarks/` and `Tools/` top-level directories

Consolidates standalone Python utilities into a single `Scripts/`
directory to reduce top-level clutter.

## Test plan
- [x] Verify files moved correctly (no content changes)
<!-- devin-review-badge-begin -->

---

<a href="https://app.devin.ai/review/fluidinference/fluidaudio/pull/344"
target="_blank">
  <picture>
<source media="(prefers-color-scheme: dark)"
srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1">
<img
src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1"
alt="Open with Devin">
  </picture>
</a>
<!-- devin-review-badge-end -->
2026-03-04 12:46:03 -05:00

274 lines
8.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
FluidAudio Benchmark Suite
Runs ASR, VAD, and Diarization benchmarks and saves results to JSON.
Compare results against Documentation/Benchmarks.md baselines.
Usage:
python run_benchmarks.py # Run all benchmarks
python run_benchmarks.py --quick # Quick smoke test
python run_benchmarks.py --asr-only # ASR benchmark only
python run_benchmarks.py --vad-only # VAD benchmark only
python run_benchmarks.py --diar-only # Diarization only
"""
import argparse
import json
import subprocess
import sys
from datetime import datetime
from pathlib import Path
# Reference numbers taken from Documentation/Benchmarks.md. Each entry holds
# one quality metric (in percent), a minimum real-time factor, and a short
# description of the dataset/model the numbers refer to.
BASELINES = {
    "asr": dict(
        wer_percent=5.8,
        rtfx_min=200,  # M4 Pro: ~210x
        description="LibriSpeech test-clean, Parakeet TDT 0.6B",
    ),
    "vad": dict(
        f1_percent=85.0,
        rtfx_min=500,
        description="VOiCES dataset, Silero VAD",
    ),
    "diarization": dict(
        der_percent=17.7,
        rtfx_min=1.0,
        description="AMI SDM, pyannote-based",
    ),
}
def run_command(cmd: list[str], output_file: Path | None = None) -> tuple[int, str]:
"""Run a command and optionally save output."""
print(f"Running: {' '.join(cmd)}")
result = subprocess.run(
cmd,
capture_output=True,
text=True
)
output = result.stdout + result.stderr
if output_file:
output_file.write_text(output)
return result.returncode, output
def build_release() -> bool:
    """Build the project in release mode.

    Runs ``swift build -c release`` through ``run_command``.

    Returns:
        True when the build exits with status 0, False otherwise.
    """
    print("\n" + "=" * 60)
    print("Building release...")
    print("=" * 60)
    returncode, output = run_command(["swift", "build", "-c", "release"])
    if returncode != 0:
        print("ERROR: Build failed!")
        # run_command captures the build output instead of streaming it, and
        # no log file is requested here, so echo it; otherwise a failed build
        # leaves no hint about what went wrong.
        if output:
            print(output, file=sys.stderr)
        return False
    print("Build successful.")
    return True
def run_asr_benchmark(output_dir: Path, quick: bool = False) -> dict | None:
    """Run the ASR benchmark on LibriSpeech test-clean.

    Invokes ``swift run -c release fluidaudio asr-benchmark`` through
    ``run_command``.

    Args:
        output_dir: Directory receiving ``asr_results.json`` (the path handed
            to the CLI via ``--output``) and ``asr_log.txt`` (the captured
            command output).
        quick: When True pass ``--max-files 100``, otherwise ``--max-files all``.

    Returns:
        The parsed contents of ``asr_results.json``, or None when the command
        exits non-zero, the results file is missing, or it is not valid JSON.
    """
    print("\n" + "=" * 60)
    print("ASR Benchmark (LibriSpeech test-clean)")
    print("=" * 60)
    max_files = "100" if quick else "all"
    output_json = output_dir / "asr_results.json"
    log_file = output_dir / "asr_log.txt"
    cmd = [
        "swift", "run", "-c", "release", "fluidaudio", "asr-benchmark",
        "--subset", "test-clean",
        "--max-files", max_files,
        "--output", str(output_json),
    ]
    returncode, _ = run_command(cmd, log_file)
    if returncode != 0:
        print("ERROR: ASR benchmark failed!")
        print(f"See log: {log_file}")
        return None
    if not output_json.exists():
        print(f"ERROR: ASR benchmark produced no results file ({output_json})")
        return None
    try:
        return json.loads(output_json.read_text())
    except json.JSONDecodeError as err:
        # A truncated or malformed results file should fail this benchmark
        # only, not abort the remaining ones with a traceback.
        print(f"ERROR: Could not parse {output_json}: {err}")
        return None
def run_vad_benchmark(output_dir: Path, quick: bool = False) -> dict | None:
    """Run the VAD benchmark.

    Invokes ``swift run -c release fluidaudio vad-benchmark`` through
    ``run_command`` with ``--all-files`` and ``--threshold 0.5``.

    Args:
        output_dir: Directory receiving ``vad_results.json`` (the path handed
            to the CLI via ``--output``) and ``vad_log.txt`` (the captured
            command output).
        quick: When True use the ``mini50`` dataset, otherwise
            ``voices-subset``.

    Returns:
        The parsed contents of ``vad_results.json``, or None when the command
        exits non-zero, the results file is missing, or it is not valid JSON.
    """
    print("\n" + "=" * 60)
    print("VAD Benchmark")
    print("=" * 60)
    dataset = "mini50" if quick else "voices-subset"
    output_json = output_dir / "vad_results.json"
    log_file = output_dir / "vad_log.txt"
    cmd = [
        "swift", "run", "-c", "release", "fluidaudio", "vad-benchmark",
        "--dataset", dataset,
        "--all-files",
        "--threshold", "0.5",
        "--output", str(output_json),
    ]
    returncode, _ = run_command(cmd, log_file)
    if returncode != 0:
        print("ERROR: VAD benchmark failed!")
        print(f"See log: {log_file}")
        return None
    if not output_json.exists():
        print(f"ERROR: VAD benchmark produced no results file ({output_json})")
        return None
    try:
        return json.loads(output_json.read_text())
    except json.JSONDecodeError as err:
        # A truncated or malformed results file should fail this benchmark
        # only, not abort the remaining ones with a traceback.
        print(f"ERROR: Could not parse {output_json}: {err}")
        return None
def run_diarization_benchmark(output_dir: Path, quick: bool = False) -> dict | None:
    """Run the diarization benchmark on AMI SDM.

    Invokes ``swift run -c release fluidaudio diarization-benchmark`` through
    ``run_command`` with ``--auto-download``.

    Args:
        output_dir: Directory receiving ``diarization_results.json`` (the path
            handed to the CLI via ``--output``) and ``diarization_log.txt``
            (the captured command output).
        quick: When True, restrict the run with ``--single-file ES2004a``.

    Returns:
        The parsed contents of ``diarization_results.json``, or None when the
        command exits non-zero, the results file is missing, or it is not
        valid JSON.
    """
    print("\n" + "=" * 60)
    print("Diarization Benchmark (AMI SDM)")
    print("=" * 60)
    output_json = output_dir / "diarization_results.json"
    log_file = output_dir / "diarization_log.txt"
    cmd = [
        "swift", "run", "-c", "release", "fluidaudio", "diarization-benchmark",
        "--auto-download",
        "--output", str(output_json),
    ]
    if quick:
        cmd.extend(["--single-file", "ES2004a"])
    returncode, _ = run_command(cmd, log_file)
    if returncode != 0:
        print("ERROR: Diarization benchmark failed!")
        print(f"See log: {log_file}")
        return None
    if not output_json.exists():
        print(f"ERROR: Diarization benchmark produced no results file ({output_json})")
        return None
    try:
        return json.loads(output_json.read_text())
    except json.JSONDecodeError as err:
        # A truncated or malformed results file should fail this benchmark
        # only, not abort the remaining ones with a traceback.
        print(f"ERROR: Could not parse {output_json}: {err}")
        return None
def _status_label(passed: bool) -> str:
    """Return the marker printed next to a metric: PASS or FAIL."""
    return "PASS" if passed else "FAIL"


def compare_results(results: dict) -> None:
    """Print measured metrics next to their baselines with a PASS/FAIL marker.

    Args:
        results: Mapping with optional ``"asr"``, ``"vad"`` and
            ``"diarization"`` entries. An entry that is missing or falsy
            (e.g. None after a failed run) is skipped.

    Tolerances applied before a metric is marked FAIL:
        ASR: WER up to 110% of baseline, RTFx down to 80% of baseline.
        VAD: F1 down to 90% of baseline, RTFx down to 50% of baseline.
        Diarization: DER up to 120% of baseline, RTFx at least the baseline.
    """
    print("\n" + "=" * 60)
    print("Results vs Baselines (Documentation/Benchmarks.md)")
    print("=" * 60)

    asr = results.get("asr")
    if asr:
        baseline = BASELINES["asr"]
        # WER is scaled by 100 here, i.e. treated as a fraction in the results.
        wer = asr.get("wer", asr.get("average_wer", 0)) * 100
        rtfx = asr.get("rtfx", asr.get("median_rtfx", 0))
        wer_status = _status_label(wer <= baseline["wer_percent"] * 1.1)
        rtfx_status = _status_label(rtfx >= baseline["rtfx_min"] * 0.8)
        print(f"\nASR ({baseline['description']}):")
        print(f" WER: {wer:.1f}% (baseline: {baseline['wer_percent']}%) {wer_status}")
        print(f" RTFx: {rtfx:.1f}x (baseline: {baseline['rtfx_min']}x+) {rtfx_status}")

    vad = results.get("vad")
    if vad:
        baseline = BASELINES["vad"]
        # NOTE(review): unlike WER/DER, f1_score is not multiplied by 100, so
        # this assumes the VAD results already report a percentage - confirm.
        f1 = vad.get("f1_score", 0)
        rtfx = vad.get("rtfx", 0)
        f1_status = _status_label(f1 >= baseline["f1_percent"] * 0.9)
        rtfx_status = _status_label(rtfx >= baseline["rtfx_min"] * 0.5)
        print(f"\nVAD ({baseline['description']}):")
        print(f" F1: {f1:.1f}% (baseline: {baseline['f1_percent']}%+) {f1_status}")
        print(f" RTFx: {rtfx:.1f}x (baseline: {baseline['rtfx_min']}x+) {rtfx_status}")

    diar = results.get("diarization")
    if diar:
        baseline = BASELINES["diarization"]
        # DER is scaled by 100 here, i.e. treated as a fraction in the results.
        der = diar.get("der", diar.get("average_der", 0)) * 100
        rtfx = diar.get("rtfx", diar.get("average_rtfx", 0))
        der_status = _status_label(der <= baseline["der_percent"] * 1.2)
        rtfx_status = _status_label(rtfx >= baseline["rtfx_min"])
        print(f"\nDiarization ({baseline['description']}):")
        print(f" DER: {der:.1f}% (baseline: {baseline['der_percent']}%) {der_status}")
        print(f" RTFx: {rtfx:.1f}x (baseline: {baseline['rtfx_min']}x+) {rtfx_status}")
def main():
    """Command-line entry point for the benchmark suite.

    Parses the options, creates the output directory, builds the project
    (exiting with status 1 if the build fails), runs the selected benchmarks,
    writes ``benchmark_results.json`` into the output directory, and prints
    the comparison against the baselines.
    """
    cli = argparse.ArgumentParser(description="FluidAudio Benchmark Suite")
    cli.add_argument("--quick", action="store_true", help="Quick smoke test with smaller datasets")
    cli.add_argument("--asr-only", action="store_true", help="Run ASR benchmark only")
    cli.add_argument("--vad-only", action="store_true", help="Run VAD benchmark only")
    cli.add_argument("--diar-only", action="store_true", help="Run diarization benchmark only")
    cli.add_argument("--output-dir", type=str, help="Output directory for results")
    opts = cli.parse_args()

    # With no *-only flag every benchmark runs; otherwise just the flagged ones.
    run_everything = not any((opts.asr_only, opts.vad_only, opts.diar_only))

    # Results go to --output-dir when given, else to a timestamped folder.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    if opts.output_dir:
        output_dir = Path(opts.output_dir)
    else:
        output_dir = Path("benchmark-results") / timestamp
    output_dir.mkdir(parents=True, exist_ok=True)

    banner = "=" * 60
    mode_label = "Quick" if opts.quick else "Full"
    print(banner)
    print("FluidAudio Benchmark Suite")
    print(banner)
    print(f"Mode: {mode_label}")
    print(f"Output: {output_dir}")
    print(f"Time: {timestamp}")

    # Nothing can run without a successful release build.
    if not build_release():
        sys.exit(1)

    # (result key, explicitly selected?, runner), in execution order.
    suite = (
        ("asr", opts.asr_only, run_asr_benchmark),
        ("vad", opts.vad_only, run_vad_benchmark),
        ("diarization", opts.diar_only, run_diarization_benchmark),
    )
    results = {
        key: runner(output_dir, opts.quick)
        for key, selected, runner in suite
        if run_everything or selected
    }

    # Persist everything in one combined JSON document.
    combined_output = output_dir / "benchmark_results.json"
    payload = {
        "timestamp": timestamp,
        "mode": "quick" if opts.quick else "full",
        "baselines": BASELINES,
        "results": results,
    }
    combined_output.write_text(json.dumps(payload, indent=2))

    compare_results(results)

    print("\n" + banner)
    print("Benchmark complete!")
    print(banner)
    print(f"Results saved to: {combined_output}")


if __name__ == "__main__":
    main()