#!/bin/bash
# Run all Parakeet model benchmarks (100 files each) with sleep prevention.
#
# Benchmarks:
#   1. ASR v3            — parakeet-tdt-0.6b-v3 on LibriSpeech test-clean
#   2. ASR v2            — parakeet-tdt-0.6b-v2 on LibriSpeech test-clean
#   3. ASR tdt-ctc-110m  — parakeet-tdt-ctc-110m on LibriSpeech test-clean
#   4. CTC custom vocab  — ctc-earnings-benchmark (v2 TDT + CTC 110m keyword spotting)
#   5. EOU streaming     — parakeet-eou 320ms on LibriSpeech test-clean
#   6. Nemotron streaming — nemotron 1120ms on LibriSpeech test-clean
#   7. TDT Japanese      — parakeet-ja on JSUT dataset
#   8. CTC Chinese       — parakeet-ctc-zh-cn on THCHS-30 dataset
#
# Usage:
#   ./Scripts/parakeet_subset_benchmark.sh              # verify + run
#   ./Scripts/parakeet_subset_benchmark.sh --download   # download missing assets, then exit
#
# The script verifies all models and dataset files exist locally before running.
# If anything is missing it will tell you exactly what and exit (unless --download).
# Uses caffeinate to prevent sleep so you can close the lid.
# Results are saved to benchmark_results/ with timestamps.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
RESULTS_DIR="$PROJECT_DIR/benchmark_results"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
LOG_FILE="$RESULTS_DIR/benchmark_${TIMESTAMP}.log"
MAX_FILES=100
SUBSET="test-clean"
MODELS_DIR="$HOME/Library/Application Support/FluidAudio/Models"
DATASETS_DIR="$HOME/Library/Application Support/FluidAudio/Datasets"
EARNINGS_DIR="$HOME/Library/Application Support/FluidAudio/earnings22-kws/test-dataset"

mkdir -p "$RESULTS_DIR"

# Print a timestamped message to stdout and append it to $LOG_FILE.
log() {
    echo "[$(date '+%H:%M:%S')] $*" | tee -a "$LOG_FILE"
}

# ---------------------------------------------------------------------------
# Verify local assets
# ---------------------------------------------------------------------------

# Check that every named entry exists under a directory.
# Arguments: $1 - label used in the log line
#            $2 - directory to look in
#            $3... - file/directory names expected inside $2
# Outputs:   one "MISSING <label>: <path>" log line per absent entry
# Returns:   0 if everything exists, 1 otherwise
require_paths() {
    local label="$1" dir="$2"
    shift 2
    local name status=0
    for name in "$@"; do
        if [[ ! -e "$dir/$name" ]]; then
            log "MISSING $label: $dir/$name"
            status=1
        fi
    done
    return "$status"
}

# Verify that all models and datasets needed by the benchmark suite are present.
# Globals:  MODELS_DIR, DATASETS_DIR, EARNINGS_DIR, SUBSET (read)
# Outputs:  a "MISSING ..." log line for each absent asset
# Returns:  0 if everything is present, 1 if anything is missing
verify_assets() {
    local missing=0

    # --- Parakeet v3 ---
    require_paths "v3" "$MODELS_DIR/parakeet-tdt-0.6b-v3" \
        Preprocessor.mlmodelc Encoder.mlmodelc Decoder.mlmodelc \
        JointDecision.mlmodelc parakeet_vocab.json || missing=1

    # --- Parakeet v2 (folder may have -coreml suffix) ---
    local v2_dir=""
    if [[ -d "$MODELS_DIR/parakeet-tdt-0.6b-v2-coreml" ]]; then
        v2_dir="$MODELS_DIR/parakeet-tdt-0.6b-v2-coreml"
    elif [[ -d "$MODELS_DIR/parakeet-tdt-0.6b-v2" ]]; then
        v2_dir="$MODELS_DIR/parakeet-tdt-0.6b-v2"
    fi
    if [[ -z "$v2_dir" ]]; then
        log "MISSING v2: no parakeet-tdt-0.6b-v2* directory found"
        missing=1
    else
        require_paths "v2" "$v2_dir" \
            Preprocessor.mlmodelc Encoder.mlmodelc Decoder.mlmodelc \
            JointDecision.mlmodelc parakeet_vocab.json || missing=1
    fi

    # --- TDT-CTC-110M (fused: no separate Encoder) ---
    require_paths "tdt-ctc-110m" "$MODELS_DIR/parakeet-tdt-ctc-110m" \
        Preprocessor.mlmodelc Decoder.mlmodelc JointDecision.mlmodelc \
        parakeet_vocab.json || missing=1

    # --- CTC 110M model (for custom vocabulary / keyword spotting) ---
    require_paths "ctc-110m" "$MODELS_DIR/parakeet-ctc-110m-coreml" \
        MelSpectrogram.mlmodelc AudioEncoder.mlmodelc vocab.json || missing=1

    # --- EOU streaming models (320ms chunks) ---
    local eou_dir="$MODELS_DIR/parakeet-eou-streaming/320ms"
    if [[ ! -d "$eou_dir" ]]; then
        log "MISSING eou-320ms: $eou_dir"
        missing=1
    fi

    # --- Nemotron models (uses v3 encoder + nemotron-specific models) ---
    # Nemotron reuses the v3 models directory; no separate check needed beyond v3 above.

    # --- Japanese TDT (hybrid: CTC preprocessor/encoder + TDT decoder/joint) ---
    require_paths "tdt-ja" "$MODELS_DIR/parakeet-ja" \
        Preprocessor.mlmodelc Encoder.mlmodelc Decoderv2.mlmodelc \
        Jointerv2.mlmodelc vocab.json || missing=1

    # --- Chinese CTC ---
    local zh_dir="$MODELS_DIR/parakeet-ctc-zh-cn"
    require_paths "ctc-zh-cn" "$zh_dir" \
        Preprocessor.mlmodelc Decoder.mlmodelc vocab.json || missing=1
    # Check that at least one encoder variant exists (int8 or fp32)
    if [[ ! -e "$zh_dir/Encoder-v2-int8.mlmodelc" ]] && [[ ! -e "$zh_dir/Encoder-v1-fp32.mlmodelc" ]]; then
        log "MISSING ctc-zh-cn: $zh_dir/Encoder-v2-int8.mlmodelc or Encoder-v1-fp32.mlmodelc"
        missing=1
    fi

    # --- LibriSpeech test-clean ---
    # `|| true`: with pipefail, find failing on a missing directory would make
    # the assignment fail; wc still yields "0", which is the value we want.
    local ls_dir="$DATASETS_DIR/LibriSpeech/$SUBSET"
    local trans_count
    trans_count=$(find "$ls_dir" -name "*.trans.txt" 2>/dev/null | wc -l | tr -d ' ') || true
    if [[ "$trans_count" -lt 5 ]]; then
        log "MISSING LibriSpeech $SUBSET: found $trans_count transcript files (need >= 5)"
        missing=1
    fi

    # --- Earnings22 KWS dataset ---
    local earnings_wav_count
    earnings_wav_count=$(find "$EARNINGS_DIR" -maxdepth 1 -name "*.wav" 2>/dev/null | wc -l | tr -d ' ') || true
    if [[ "$earnings_wav_count" -lt 10 ]]; then
        log "MISSING Earnings22 KWS: found $earnings_wav_count wav files (need >= 10)"
        missing=1
    fi

    # --- JSUT Japanese dataset ---
    local jsut_dir="$DATASETS_DIR/JSUT-basic5000"
    if [[ ! -f "$jsut_dir/metadata.jsonl" ]]; then
        log "MISSING JSUT: $jsut_dir/metadata.jsonl"
        missing=1
    fi

    # --- THCHS-30 Chinese dataset ---
    local thchs_dir="$DATASETS_DIR/THCHS-30"
    if [[ ! -f "$thchs_dir/metadata.jsonl" ]]; then
        log "MISSING THCHS-30: $thchs_dir/metadata.jsonl"
        missing=1
    fi

    return "$missing"
}

# ---------------------------------------------------------------------------
# Phase 1: --download (verify first, download only what's missing)
# ---------------------------------------------------------------------------
if [[ "${1:-}" == "--download" ]]; then
    log "=== Checking local assets ==="
    if verify_assets; then
        log "All models and datasets already present locally. Nothing to download."
        exit 0
    fi

    log "Some assets are missing — downloading..."
    log "Building release binary..."
    cd "$PROJECT_DIR"
    swift build -c release 2>&1 | tail -1 | tee -a "$LOG_FILE"
    CLI="$PROJECT_DIR/.build/release/fluidaudiocli"

    log "Downloading LibriSpeech $SUBSET dataset..."
    "$CLI" download --dataset "librispeech-$SUBSET" 2>&1 | tee -a "$LOG_FILE"

    log "Downloading Earnings22 KWS dataset..."
    "$CLI" download --dataset earnings22-kws 2>&1 | tee -a "$LOG_FILE"

    log "Pre-loading Parakeet v3 models (triggers download if missing)..."
    "$CLI" asr-benchmark --model-version v3 --subset "$SUBSET" --max-files 1 \
        --output "$RESULTS_DIR/warmup_v3.json" 2>&1 | tee -a "$LOG_FILE"

    log "Pre-loading Parakeet v2 models..."
    "$CLI" asr-benchmark --model-version v2 --subset "$SUBSET" --max-files 1 \
        --output "$RESULTS_DIR/warmup_v2.json" 2>&1 | tee -a "$LOG_FILE"

    # The offline run benchmarks tdt-ctc-110m with --no-auto-download and
    # verify_assets requires it, so it has to be fetched here as well.
    log "Pre-loading Parakeet TDT-CTC 110M models..."
    "$CLI" asr-benchmark --model-version tdt-ctc-110m --subset "$SUBSET" --max-files 1 \
        --output "$RESULTS_DIR/warmup_tdt_ctc.json" 2>&1 | tee -a "$LOG_FILE"

    log "Pre-loading CTC earnings models..."
    "$CLI" ctc-earnings-benchmark --max-files 1 --auto-download \
        --output "$RESULTS_DIR/warmup_ctc.json" 2>&1 | tee -a "$LOG_FILE"

    log "Pre-loading EOU streaming models..."
    "$CLI" parakeet-eou --benchmark --chunk-size 320 --max-files 1 \
        --output "$RESULTS_DIR/warmup_eou.json" 2>&1 | tee -a "$LOG_FILE"

    log "Pre-loading Nemotron streaming models..."
    "$CLI" nemotron-benchmark --max-files 1 2>&1 | tee -a "$LOG_FILE"

    log "Pre-loading Japanese TDT models and JSUT dataset..."
    "$CLI" ja-benchmark --decoder tdt --dataset jsut --samples 1 --auto-download \
        --output "$RESULTS_DIR/warmup_ja.json" 2>&1 | tee -a "$LOG_FILE"

    log "Pre-loading Chinese CTC models and THCHS-30 dataset..."
    "$CLI" ctc-zh-cn-benchmark --samples 1 --auto-download \
        --output "$RESULTS_DIR/warmup_zh.json" 2>&1 | tee -a "$LOG_FILE"

    # Warm-up outputs are throwaway; only the downloaded assets matter.
    rm -f "$RESULTS_DIR"/warmup_*.json /tmp/nemotron_*_benchmark.json

    # Confirm the downloads actually satisfied the offline-run requirements
    # instead of reporting success unconditionally.
    if ! verify_assets; then
        log "ERROR: Some assets are still missing after download (see MISSING lines above)."
        exit 1
    fi

    log "=== Downloads complete ==="
    exit 0
fi

# ---------------------------------------------------------------------------
# Phase 2: Run benchmarks (offline-safe, sleep-prevented)
# ---------------------------------------------------------------------------
log "=== Verifying local assets before offline run ==="
if ! verify_assets; then
    log ""
    log "ERROR: Missing assets — cannot run offline."
    log "Run with --download first while connected to the internet:"
    log "  ./Scripts/parakeet_subset_benchmark.sh --download"
    exit 1
fi
log "All assets verified locally."
log "=== Parakeet benchmark suite: $MAX_FILES files x 8 benchmarks ==="
log "Results directory: $RESULTS_DIR"

cd "$PROJECT_DIR"

# Build release if not already built
if [[ ! -x ".build/release/fluidaudiocli" ]]; then
    log "Building release binary..."
    swift build -c release 2>&1 | tail -1 | tee -a "$LOG_FILE"
fi

CLI="$PROJECT_DIR/.build/release/fluidaudiocli"

# caffeinate -s: prevent system sleep (this assertion is only valid on AC power)
# caffeinate -i: prevent idle sleep
# caffeinate -w: exit once the given PID (this script) exits, so the
#                assertion never outlives the benchmark suite.
caffeinate -si -w "$$" &
CAFFEINATE_PID=$!
log "caffeinate started (PID $CAFFEINATE_PID) — safe to close the lid"

# Run a command bracketed by "starting"/"finished" log lines, teeing its
# combined stdout/stderr into the log file.
# Arguments: $1 - benchmark label
#            $2 - detail text shown in parentheses on the "starting" line
#            $3 - output file path reported on the "finished" line
#            $4... - command and arguments to execute
# Returns:   under `set -e -o pipefail`, a failing command aborts the script
run_timed() {
    local label="$1" detail="$2" output_file="$3"
    shift 3
    # Declared separately from assignment so a failing `date` is not masked.
    local start_time end_time

    log "--- $label: starting ($detail) ---"
    start_time=$(date +%s)
    "$@" 2>&1 | tee -a "$LOG_FILE"
    end_time=$(date +%s)
    log "--- $label: finished in $(( end_time - start_time ))s — $output_file ---"
}

# Run the LibriSpeech ASR benchmark for one model version.
# Arguments: $1 - value for --model-version
#            $2 - label used for the result file name and log lines
run_asr_benchmark() {
    local model_version="$1"
    local label="$2"
    local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json"

    run_timed "$label" "$MAX_FILES files, $SUBSET" "$output_file" \
        "$CLI" asr-benchmark \
        --model-version "$model_version" \
        --subset "$SUBSET" \
        --max-files "$MAX_FILES" \
        --no-auto-download \
        --output "$output_file"
}

# Run the Earnings22 custom-vocabulary (keyword spotting) benchmark.
run_ctc_earnings_benchmark() {
    local label="ctc_earnings_vocab"
    local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json"

    # TDT v2 is used for transcription to match benchmarks100.md baseline
    run_timed "$label" "$MAX_FILES files, v2 TDT + CTC keyword spotting" "$output_file" \
        "$CLI" ctc-earnings-benchmark \
        --ctc-variant 110m \
        --max-files "$MAX_FILES" \
        --output "$output_file"
}

# Run the EOU streaming benchmark with 320ms chunks.
run_eou_benchmark() {
    local label="eou_320ms"
    local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json"

    run_timed "$label" "$MAX_FILES files, $SUBSET, 320ms chunks" "$output_file" \
        "$CLI" parakeet-eou \
        --benchmark \
        --chunk-size 320 \
        --max-files "$MAX_FILES" \
        --use-cache \
        --output "$output_file"
}

# Run the Nemotron streaming benchmark and collect its result file.
# The command is invoked without --output; its result is picked up from a
# fixed path in /tmp and copied into the results directory.
run_nemotron_benchmark() {
    local label="nemotron_1120ms"
    local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json"
    local tmp_file="/tmp/nemotron_1120ms_benchmark.json"
    local start_time end_time

    log "--- $label: starting ($MAX_FILES files, $SUBSET, 1120ms chunks) ---"
    # Remove any leftover from a previous run so a stale file can never be
    # mistaken for this run's result.
    rm -f "$tmp_file"
    start_time=$(date +%s)

    "$CLI" nemotron-benchmark \
        --max-files "$MAX_FILES" \
        2>&1 | tee -a "$LOG_FILE"

    # Nemotron writes to /tmp; copy to our results dir
    if [[ -f "$tmp_file" ]]; then
        cp "$tmp_file" "$output_file"
    else
        log "WARNING: $label produced no result file at $tmp_file"
    fi

    end_time=$(date +%s)
    log "--- $label: finished in $(( end_time - start_time ))s — $output_file ---"
}

# Run the Japanese TDT benchmark on the JSUT dataset.
run_ja_benchmark() {
    local label="parakeet_tdt_ja"
    local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json"

    run_timed "$label" "$MAX_FILES files, JSUT dataset, TDT decoder" "$output_file" \
        "$CLI" ja-benchmark \
        --decoder tdt \
        --dataset jsut \
        --samples "$MAX_FILES" \
        --output "$output_file"
}

# Run the Chinese CTC benchmark on the THCHS-30 dataset.
run_zh_benchmark() {
    local label="parakeet_ctc_zh_cn"
    local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json"

    run_timed "$label" "$MAX_FILES files, THCHS-30 dataset" "$output_file" \
        "$CLI" ctc-zh-cn-benchmark \
        --samples "$MAX_FILES" \
        --output "$output_file"
}

SUITE_START=$(date +%s)

run_asr_benchmark "v3" "parakeet_v3"
run_asr_benchmark "v2" "parakeet_v2"
run_asr_benchmark "tdt-ctc-110m" "parakeet_tdt_ctc_110m"
run_ctc_earnings_benchmark
run_eou_benchmark
run_nemotron_benchmark
run_ja_benchmark
run_zh_benchmark

SUITE_END=$(date +%s)
SUITE_ELAPSED=$(( SUITE_END - SUITE_START ))

log "=== All benchmarks complete in ${SUITE_ELAPSED}s ==="
log "Results:"
# `|| true`: under pipefail a failing ls (no matching files) must not abort
# the script before the comparison tables are printed.
ls -lh "$RESULTS_DIR"/*_"${TIMESTAMP}".json 2>/dev/null | tee -a "$LOG_FILE" || true

# ---------------------------------------------------------------------------
# Compare WER against benchmarks100.md baselines
# ---------------------------------------------------------------------------

# Baselines from Documentation/ASR/benchmarks100.md (main column)
BASELINE_V3_WER="2.6"
BASELINE_V2_WER="3.8"
BASELINE_TDT_CTC_WER="3.6"
BASELINE_EARNINGS_WER="16.54"
BASELINE_EOU_WER="7.11"
BASELINE_NEMOTRON_WER="1.99"
BASELINE_JA_CER="7.77"
BASELINE_ZH_CER="8.37"

# Read a fractional error rate from summary.<field> of a result JSON and
# print it as a percentage rounded to two decimals.
# Arguments: $1 - path to the JSON result file
#            $2 - field name inside the top-level "summary" object
# Outputs:   the percentage, or "N/A" if the file is absent or unreadable
extract_wer() {
    local json_file="$1"
    local field="$2"

    if [[ ! -f "$json_file" ]]; then
        echo "N/A"
        return
    fi
    # Path and field travel via argv rather than being spliced into the
    # Python source, so quotes or other special characters cannot break it.
    python3 -c '
import json, sys
with open(sys.argv[1]) as fh:
    data = json.load(fh)
print(round(data["summary"][sys.argv[2]] * 100, 2))
' "$json_file" "$field" 2>/dev/null || echo "N/A"
}

# For JSON fields that already store WER as a percentage (not decimal).
# Arguments: $1 - path to the JSON result file
#            $2 - name of the enclosing object, or "" for a top-level field
#            $3 - field name
# Outputs:   the value rounded to two decimals, or "N/A" if the file is
#            absent or unreadable
extract_wer_pct() {
    local json_file="$1"
    local section="$2"
    local field="$3"

    if [[ ! -f "$json_file" ]]; then
        echo "N/A"
        return
    fi
    python3 -c '
import json, sys
path, section, field = sys.argv[1:4]
with open(path) as fh:
    data = json.load(fh)
if section:
    data = data[section]
print(round(data[field], 2))
' "$json_file" "$section" "$field" 2>/dev/null || echo "N/A"
}

V3_FILE="$RESULTS_DIR/parakeet_v3_${TIMESTAMP}.json"
V2_FILE="$RESULTS_DIR/parakeet_v2_${TIMESTAMP}.json"
TDT_CTC_FILE="$RESULTS_DIR/parakeet_tdt_ctc_110m_${TIMESTAMP}.json"
EARNINGS_FILE="$RESULTS_DIR/ctc_earnings_vocab_${TIMESTAMP}.json"
EOU_FILE="$RESULTS_DIR/eou_320ms_${TIMESTAMP}.json"
NEMOTRON_FILE="$RESULTS_DIR/nemotron_1120ms_${TIMESTAMP}.json"
JA_FILE="$RESULTS_DIR/parakeet_tdt_ja_${TIMESTAMP}.json"
ZH_FILE="$RESULTS_DIR/parakeet_ctc_zh_cn_${TIMESTAMP}.json"

V3_WER=$(extract_wer "$V3_FILE" "averageWER")
V2_WER=$(extract_wer "$V2_FILE" "averageWER")
TDT_CTC_WER=$(extract_wer "$TDT_CTC_FILE" "averageWER")
EARNINGS_WER=$(extract_wer_pct "$EARNINGS_FILE" "summary" "avgWer")
EOU_WER=$(extract_wer "$EOU_FILE" "averageWER")
NEMOTRON_WER=$(extract_wer_pct "$NEMOTRON_FILE" "" "wer")
JA_CER=$(extract_wer "$JA_FILE" "mean_cer")
ZH_CER=$(extract_wer "$ZH_FILE" "mean_cer")

log ""
log "=== WER Comparison vs benchmarks100.md baselines ==="
log ""
printf "%-25s %10s %10s %10s\n" "Model" "Baseline" "Current" "Delta" | tee -a "$LOG_FILE"
printf "%-25s %10s %10s %10s\n" "-------------------------" "----------" "----------" "----------" | tee -a "$LOG_FILE"

# Print one table row comparing a measured error rate with its baseline.
# Arguments: $1 - row label
#            $2 - baseline percentage
#            $3 - measured percentage, or "N/A"
# Outputs:   the row on stdout and in $LOG_FILE; rows more than 0.3 above
#            the baseline are tagged "← REGRESSION"
compare_wer() {
    local label="$1" baseline="$2" current="$3"

    if [[ "$current" == "N/A" ]]; then
        printf "%-25s %9s%% %10s %10s\n" "$label" "$baseline" "N/A" "—" | tee -a "$LOG_FILE"
        return
    fi

    # One python call yields "<delta> <YES|NO>"; if it fails, fall back to an
    # unknown delta and no regression tag.
    local verdict
    verdict=$(python3 -c '
import sys
baseline, current = float(sys.argv[1]), float(sys.argv[2])
print("%+.2f %s" % (current - baseline, "YES" if current > baseline + 0.3 else "NO"))
' "$baseline" "$current" 2>/dev/null || echo "? NO")
    local delta="${verdict% *}"
    local regression="${verdict#* }"

    local marker=""
    if [[ "$regression" == "YES" ]]; then
        marker=" ← REGRESSION"
    fi

    printf "%-25s %9s%% %9s%% %9s%%%s\n" "$label" "$baseline" "$current" "$delta" "$marker" | tee -a "$LOG_FILE"
}

compare_wer "Parakeet TDT v3 (0.6B)" "$BASELINE_V3_WER" "$V3_WER"
compare_wer "Parakeet TDT v2 (0.6B)" "$BASELINE_V2_WER" "$V2_WER"
compare_wer "CTC-TDT 110M" "$BASELINE_TDT_CTC_WER" "$TDT_CTC_WER"
compare_wer "CTC Earnings" "$BASELINE_EARNINGS_WER" "$EARNINGS_WER"
compare_wer "EOU 320ms (120M)" "$BASELINE_EOU_WER" "$EOU_WER"
compare_wer "Nemotron 1120ms (0.6B)" "$BASELINE_NEMOTRON_WER" "$NEMOTRON_WER"

log ""
log "=== CER Comparison (Character Error Rate for non-English) ==="
log ""
printf "%-25s %10s %10s %10s\n" "Model" "Baseline" "Current" "Delta" | tee -a "$LOG_FILE"
printf "%-25s %10s %10s %10s\n" "-------------------------" "----------" "----------" "----------" | tee -a "$LOG_FILE"

compare_wer "TDT Japanese (0.6B)" "$BASELINE_JA_CER" "$JA_CER"
if [[ "$BASELINE_ZH_CER" != "TBD" ]]; then
    compare_wer "CTC Chinese (0.6B)" "$BASELINE_ZH_CER" "$ZH_CER"
else
    printf "%-25s %10s %10s %10s\n" "CTC Chinese (0.6B)" "TBD" "$ZH_CER" "—" | tee -a "$LOG_FILE"
fi

log ""

# Check for any regressions (>0.3% WER/CER increase).
# Flat list of baseline/current pairs, handed to python via argv.
REGRESSION_PAIRS=(
    "$BASELINE_V3_WER" "$V3_WER"
    "$BASELINE_V2_WER" "$V2_WER"
    "$BASELINE_TDT_CTC_WER" "$TDT_CTC_WER"
    "$BASELINE_EARNINGS_WER" "$EARNINGS_WER"
    "$BASELINE_EOU_WER" "$EOU_WER"
    "$BASELINE_NEMOTRON_WER" "$NEMOTRON_WER"
    "$BASELINE_JA_CER" "$JA_CER"
)
if [[ "$BASELINE_ZH_CER" != "TBD" ]]; then
    REGRESSION_PAIRS+=("$BASELINE_ZH_CER" "$ZH_CER")
fi

# Prints YES if any result regressed, UNKNOWN if none regressed but at least
# one result is N/A, otherwise NO. A python failure also counts as UNKNOWN so
# an unevaluated run is never reported as clean.
ANY_REGRESSION=$(python3 -c '
import sys
values = sys.argv[1:]
missing = False
for baseline, current in zip(values[0::2], values[1::2]):
    if current == "N/A":
        missing = True
    elif float(current) > float(baseline) + 0.3:
        print("YES")
        sys.exit(0)
print("UNKNOWN" if missing else "NO")
' "${REGRESSION_PAIRS[@]}" 2>/dev/null || echo "UNKNOWN")

if [[ "$ANY_REGRESSION" == "YES" ]]; then
    log "⚠ WER/CER REGRESSION DETECTED — investigate before merging"
elif [[ "$ANY_REGRESSION" == "UNKNOWN" ]]; then
    log "⚠ Regression check incomplete — one or more results are N/A (see tables above)"
else
    log "✓ No WER/CER regressions (all within 0.3% of baseline)"
fi

# caffeinate will exit automatically since the parent process ($$) exits