FluidAudio/Scripts/diarizer_subset_benchmark.sh

#!/bin/bash
# Run all diarizer model benchmarks on AMI SDM with sleep prevention.
#
# Benchmarks:
#   1. Offline (VBx)       — OfflineDiarizerManager, step=0.2, min-seg=1.0
#   2. Streaming (5s)      — DiarizerManager, 5s chunks, 0s overlap, threshold=0.8
#   3. Sortformer          — SortformerDiarizer, NVIDIA high-latency config
#   4. LS-EEND             — LSEENDDiarizer, AMI variant
#
# Usage:
#   ./Scripts/diarizer_subset_benchmark.sh                    # quick run (4 meetings)
#   ./Scripts/diarizer_subset_benchmark.sh --all              # full run (all 16 meetings)
#   ./Scripts/diarizer_subset_benchmark.sh --max-files 8      # custom subset
#   ./Scripts/diarizer_subset_benchmark.sh --download         # download missing assets, then exit
#
# The script verifies all models and dataset files exist locally before running.
# If anything is missing it will tell you exactly what and exit (unless --download).
# Uses caffeinate to prevent sleep so you can close the lid.
# Results are saved to benchmark_results/ with timestamps.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
RESULTS_DIR="$PROJECT_DIR/benchmark_results"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
LOG_FILE="$RESULTS_DIR/diarizer_benchmark_${TIMESTAMP}.log"

MODELS_DIR="$HOME/Library/Application Support/FluidAudio/Models"
DATASETS_DIR="$HOME/FluidAudioDatasets"
AMI_SDM_DIR="$DATASETS_DIR/ami_official/sdm"
AMI_RTTM_DIR="$DATASETS_DIR/ami_official/rttm"
MAX_FILES=4  # default: quick 4-meeting subset

# AMI SDM has 16 meetings — this is the standard diarization test set.
# Ordered so the first N picks one from each speaker group for maximum diversity.
# Groups: EN2002 (4 speakers), ES2004 (4), IS1009 (4), TS3003 (4)
ALL_AMI_MEETINGS=(
    EN2002a ES2004a IS1009a TS3003a
    EN2002b ES2004b IS1009b TS3003b
    EN2002c ES2004c IS1009c TS3003c
    EN2002d ES2004d IS1009d TS3003d
)

# Parse --all / --max-files <N> from arguments
args=("$@")
for ((i=0; i<${#args[@]}; i++)); do
    case "${args[$i]}" in
        --all)        MAX_FILES=${#ALL_AMI_MEETINGS[@]} ;;
        --max-files)  MAX_FILES="${args[$((i+1))]}" ; i=$((i+1)) ;;
    esac
done

# Select the subset of meetings to run
AMI_MEETINGS=("${ALL_AMI_MEETINGS[@]:0:$MAX_FILES}")

mkdir -p "$RESULTS_DIR"

log() {
    echo "[$(date '+%H:%M:%S')] $*" | tee -a "$LOG_FILE"
}

# ---------------------------------------------------------------------------
# Verify local assets
# ---------------------------------------------------------------------------
verify_assets() {
    local missing=0

    # --- AMI SDM audio files ---
    local wav_count=0
    for meeting in "${AMI_MEETINGS[@]}"; do
        if [[ -f "$AMI_SDM_DIR/${meeting}.Mix-Headset.wav" ]]; then
            wav_count=$((wav_count + 1))
        else
            log "MISSING  AMI SDM: $AMI_SDM_DIR/${meeting}.Mix-Headset.wav"
            missing=1
        fi
    done
    if [[ "$wav_count" -eq 0 ]]; then
        log "MISSING  AMI SDM: no wav files found in $AMI_SDM_DIR"
        missing=1
    fi

    # --- AMI RTTM annotations (downloaded automatically by --auto-download) ---
    local rttm_count=0
    for meeting in "${ALL_AMI_MEETINGS[@]}"; do
        if [[ -f "$AMI_RTTM_DIR/${meeting}.rttm" ]]; then
            rttm_count=$((rttm_count + 1))
        fi
    done
    if [[ "$rttm_count" -eq 0 ]]; then
        log "NOTE     AMI RTTM annotations not found — will be auto-downloaded by CLI"
    fi

    # --- Offline diarizer models (pyannote segmentation + wespeaker embedding) ---
    local diar_dir="$MODELS_DIR/speaker-diarization-coreml"
    if [[ ! -d "$diar_dir" ]]; then
        log "MISSING  Diarizer models: $diar_dir"
        missing=1
    fi

    # --- Sortformer models (folder may or may not have -coreml suffix) ---
    if [[ ! -d "$MODELS_DIR/diar-streaming-sortformer-coreml" ]] && [[ ! -d "$MODELS_DIR/diar-streaming-sortformer" ]]; then
        log "MISSING  Sortformer models: $MODELS_DIR/diar-streaming-sortformer{,-coreml}"
        missing=1
    fi

    # --- LS-EEND models (folder may or may not have -coreml suffix) ---
    if [[ ! -d "$MODELS_DIR/ls-eend-coreml" ]] && [[ ! -d "$MODELS_DIR/ls-eend" ]]; then
        log "MISSING  LS-EEND models: $MODELS_DIR/ls-eend{,-coreml}"
        missing=1
    fi

    return $missing
}

# ---------------------------------------------------------------------------
# Phase 1: --download  (verify first, download only what's missing)
# ---------------------------------------------------------------------------
if [[ "${1:-}" == "--download" ]]; then
    log "=== Checking local assets ==="

    if verify_assets; then
        log "All models and datasets already present locally. Nothing to download."
        exit 0
    fi

    log "Some assets are missing — downloading..."

    log "Building release binary..."
    cd "$PROJECT_DIR" && swift build -c release 2>&1 | tail -1 | tee -a "$LOG_FILE"
    CLI="$PROJECT_DIR/.build/release/fluidaudiocli"

    log "Downloading AMI SDM dataset + annotations..."
    "$CLI" diarization-benchmark --mode offline --auto-download --max-files 1 \
        --output "$RESULTS_DIR/warmup_offline.json" 2>&1 | tee -a "$LOG_FILE"

    log "Pre-loading Sortformer models..."
    "$CLI" sortformer-benchmark --nvidia-high-latency --hf --auto-download --max-files 1 \
        --output "$RESULTS_DIR/warmup_sortformer.json" 2>&1 | tee -a "$LOG_FILE"

    log "Pre-loading LS-EEND models..."
    "$CLI" lseend-benchmark --variant ami --auto-download --max-files 1 \
        --output "$RESULTS_DIR/warmup_lseend.json" 2>&1 | tee -a "$LOG_FILE"

    rm -f "$RESULTS_DIR"/warmup_*.json
    log "=== Downloads complete ==="
    exit 0
fi

# ---------------------------------------------------------------------------
# Phase 2: Run benchmarks (offline-safe, sleep-prevented)
# ---------------------------------------------------------------------------
log "=== Verifying local assets before offline run ==="
if ! verify_assets; then
    log ""
    log "ERROR: Missing assets — cannot run offline."
    log "Run with --download first while connected to the internet:"
    log "  ./Scripts/diarizer_subset_benchmark.sh --download"
    exit 1
fi
log "All assets verified locally."

log "=== Diarizer benchmark suite: ${#AMI_MEETINGS[@]}/${#ALL_AMI_MEETINGS[@]} meetings x 4 systems ==="
log "Results directory: $RESULTS_DIR"

cd "$PROJECT_DIR"

# Build release if not already built
if [[ ! -x ".build/release/fluidaudiocli" ]]; then
    log "Building release binary..."
    swift build -c release 2>&1 | tail -1 | tee -a "$LOG_FILE"
fi
CLI="$PROJECT_DIR/.build/release/fluidaudiocli"

# caffeinate -s: prevent sleep even on AC power / lid closed
# caffeinate -i: prevent idle sleep
caffeinate -si -w $$ &
CAFFEINATE_PID=$!
log "caffeinate started (PID $CAFFEINATE_PID) — safe to close the lid"

# ---------------------------------------------------------------------------
# Benchmark runners
# ---------------------------------------------------------------------------

# Run a benchmark for each meeting via --single-file, then merge JSON results.
# This ensures we control exactly which meetings run (not the CLI's internal order).
merge_json_results() {
    local output_file="$1"
    shift
    local tmp_files=("$@")
    python3 -c "
import json, sys
results = []
for f in sys.argv[2:]:
    try:
        with open(f) as fh:
            data = json.load(fh)
            if isinstance(data, list):
                results.extend(data)
            else:
                results.append(data)
    except: pass
with open(sys.argv[1], 'w') as out:
    json.dump(results, out, indent=2)
" "$output_file" "${tmp_files[@]}" 2>/dev/null
    rm -f "${tmp_files[@]}"
}

run_offline_benchmark() {
    local label="offline_vbx"
    local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json"
    local tmp_files=()

    log "--- $label: starting (${#AMI_MEETINGS[@]} meetings, AMI SDM, offline VBx) ---"
    local start_time=$(date +%s)

    for meeting in "${AMI_MEETINGS[@]}"; do
        local tmp="$RESULTS_DIR/${label}_tmp_${meeting}.json"
        tmp_files+=("$tmp")
        log "  [$label] $meeting"
        "$CLI" diarization-benchmark \
            --mode offline \
            --dataset ami-sdm \
            --single-file "$meeting" \
            --auto-download \
            --output "$tmp" \
            2>&1 | tee -a "$LOG_FILE"
    done

    merge_json_results "$output_file" "${tmp_files[@]}"

    local end_time=$(date +%s)
    local elapsed=$(( end_time - start_time ))
    log "--- $label: finished in ${elapsed}s — $output_file ---"
}

run_streaming_benchmark() {
    local label="streaming_5s"
    local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json"
    local tmp_files=()

    log "--- $label: starting (${#AMI_MEETINGS[@]} meetings, AMI SDM, 5s chunks, threshold=0.8) ---"
    local start_time=$(date +%s)

    for meeting in "${AMI_MEETINGS[@]}"; do
        local tmp="$RESULTS_DIR/${label}_tmp_${meeting}.json"
        tmp_files+=("$tmp")
        log "  [$label] $meeting"
        "$CLI" diarization-benchmark \
            --mode streaming \
            --dataset ami-sdm \
            --single-file "$meeting" \
            --chunk-seconds 5.0 \
            --overlap-seconds 0.0 \
            --threshold 0.8 \
            --auto-download \
            --output "$tmp" \
            2>&1 | tee -a "$LOG_FILE"
    done

    merge_json_results "$output_file" "${tmp_files[@]}"

    local end_time=$(date +%s)
    local elapsed=$(( end_time - start_time ))
    log "--- $label: finished in ${elapsed}s — $output_file ---"
}

run_sortformer_benchmark() {
    local label="sortformer"
    local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json"
    local tmp_files=()

    log "--- $label: starting (${#AMI_MEETINGS[@]} meetings, AMI SDM, NVIDIA high-latency) ---"
    local start_time=$(date +%s)

    for meeting in "${AMI_MEETINGS[@]}"; do
        local tmp="$RESULTS_DIR/${label}_tmp_${meeting}.json"
        tmp_files+=("$tmp")
        log "  [$label] $meeting"
        "$CLI" sortformer-benchmark \
            --nvidia-high-latency \
            --hf \
            --dataset ami \
            --single-file "$meeting" \
            --auto-download \
            --output "$tmp" \
            2>&1 | tee -a "$LOG_FILE"
    done

    merge_json_results "$output_file" "${tmp_files[@]}"

    local end_time=$(date +%s)
    local elapsed=$(( end_time - start_time ))
    log "--- $label: finished in ${elapsed}s — $output_file ---"
}

run_lseend_benchmark() {
    local label="lseend_ami"
    local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json"
    local tmp_files=()

    log "--- $label: starting (${#AMI_MEETINGS[@]} meetings, AMI SDM, AMI variant) ---"
    local start_time=$(date +%s)

    for meeting in "${AMI_MEETINGS[@]}"; do
        local tmp="$RESULTS_DIR/${label}_tmp_${meeting}.json"
        tmp_files+=("$tmp")
        log "  [$label] $meeting"
        "$CLI" lseend-benchmark \
            --variant ami \
            --dataset ami \
            --single-file "$meeting" \
            --auto-download \
            --output "$tmp" \
            2>&1 | tee -a "$LOG_FILE"
    done

    merge_json_results "$output_file" "${tmp_files[@]}"

    local end_time=$(date +%s)
    local elapsed=$(( end_time - start_time ))
    log "--- $label: finished in ${elapsed}s — $output_file ---"
}

# ---------------------------------------------------------------------------
# Run all 4 benchmarks
# ---------------------------------------------------------------------------
SUITE_START=$(date +%s)

run_offline_benchmark
run_streaming_benchmark
run_sortformer_benchmark
run_lseend_benchmark

SUITE_END=$(date +%s)
SUITE_ELAPSED=$(( SUITE_END - SUITE_START ))

log "=== All benchmarks complete in ${SUITE_ELAPSED}s ==="
log "Results:"
ls -lh "$RESULTS_DIR"/*_${TIMESTAMP}.json 2>/dev/null | tee -a "$LOG_FILE"

# ---------------------------------------------------------------------------
# Extract DER and RTFx from JSON results
# ---------------------------------------------------------------------------

# Streaming diarization benchmark: JSON is array of per-meeting results with "der" and "rtfx"
extract_streaming_metrics() {
    local json_file="$1"
    if [[ -f "$json_file" ]]; then
        python3 -c "
import json, sys
with open('$json_file') as f:
    results = json.load(f)
if not results:
    print('N/A N/A')
    sys.exit()
avg_der = sum(r['der'] for r in results) / len(results)
avg_rtfx = sum(r['rtfx'] for r in results) / len(results)
print(f'{avg_der:.1f} {avg_rtfx:.1f}')
" 2>/dev/null || echo "N/A N/A"
    else
        echo "N/A N/A"
    fi
}

# Sortformer/LS-EEND: same JSON array format via DiarizationBenchmarkUtils
extract_shared_metrics() {
    local json_file="$1"
    if [[ -f "$json_file" ]]; then
        python3 -c "
import json, sys
with open('$json_file') as f:
    results = json.load(f)
if not results:
    print('N/A N/A')
    sys.exit()
avg_der = sum(r['der'] for r in results) / len(results)
avg_rtfx = sum(r['rtfx'] for r in results) / len(results)
print(f'{avg_der:.1f} {avg_rtfx:.1f}')
" 2>/dev/null || echo "N/A N/A"
    else
        echo "N/A N/A"
    fi
}

# ---------------------------------------------------------------------------
# Compare DER & RTFx against Benchmarks.md baselines
# ---------------------------------------------------------------------------

# Baselines from Documentation/Benchmarks.md (AMI SDM, all 16 meetings)
# Note: when running a subset (--max-files <16), DER will differ from these baselines
# due to per-meeting variance. Baselines are for full 16-meeting runs only.
# Offline: no AMI SDM baseline yet — first --all run establishes it.
# Streaming: 5s/0s/0.8 on AMI SDM (7 meetings) = 26.2% DER, 223.1x RTFx
# Sortformer: NVIDIA high-latency on AMI SDM (16 meetings) = 31.7% DER, 126.7x RTFx
# LS-EEND: AMI variant on AMI SDM (16 meetings) = 20.7% DER, 74.5x RTFx
BASELINE_STREAMING_DER="26.2"
BASELINE_STREAMING_RTFX="223.1"
BASELINE_SORTFORMER_DER="31.7"
BASELINE_SORTFORMER_RTFX="126.7"
BASELINE_LSEEND_DER="20.7"
BASELINE_LSEEND_RTFX="74.5"

OFFLINE_FILE="$RESULTS_DIR/offline_vbx_${TIMESTAMP}.json"
STREAMING_FILE="$RESULTS_DIR/streaming_5s_${TIMESTAMP}.json"
SORTFORMER_FILE="$RESULTS_DIR/sortformer_${TIMESTAMP}.json"
LSEEND_FILE="$RESULTS_DIR/lseend_ami_${TIMESTAMP}.json"

read OFFLINE_DER OFFLINE_RTFX <<< $(extract_streaming_metrics "$OFFLINE_FILE")
read STREAMING_DER STREAMING_RTFX <<< $(extract_streaming_metrics "$STREAMING_FILE")
read SORTFORMER_DER SORTFORMER_RTFX <<< $(extract_shared_metrics "$SORTFORMER_FILE")
read LSEEND_DER LSEEND_RTFX <<< $(extract_shared_metrics "$LSEEND_FILE")

log ""
log "=== DER & RTFx Comparison vs Benchmarks.md baselines (AMI SDM, ${#AMI_MEETINGS[@]} meetings) ==="
log ""
printf "%-25s %12s %12s %12s %12s %12s\n" \
    "System" "Base DER" "DER" "Delta" "Base RTFx" "RTFx" | tee -a "$LOG_FILE"
printf "%-25s %12s %12s %12s %12s %12s\n" \
    "-------------------------" "------------" "------------" "------------" "------------" "------------" | tee -a "$LOG_FILE"

compare_der_rtfx() {
    local label="$1" base_der="$2" current_der="$3" base_rtfx="$4" current_rtfx="$5"

    if [[ "$current_der" == "N/A" ]]; then
        printf "%-25s %11s%% %12s %12s %11sx %12s\n" \
            "$label" "$base_der" "N/A" "—" "$base_rtfx" "N/A" | tee -a "$LOG_FILE"
        return
    fi

    local delta marker=""
    delta=$(python3 -c "print(f'{$current_der - $base_der:+.1f}')" 2>/dev/null || echo "?")
    local regression
    regression=$(python3 -c "print('YES' if $current_der > $base_der + 2.0 else 'NO')" 2>/dev/null || echo "NO")
    if [[ "$regression" == "YES" ]]; then
        marker=" <- REGRESSION"
    fi

    printf "%-25s %11s%% %11s%% %11s%% %11sx %11sx%s\n" \
        "$label" "$base_der" "$current_der" "$delta" "$base_rtfx" "$current_rtfx" "$marker" | tee -a "$LOG_FILE"
}

# Offline has no AMI SDM baseline yet — show as "new"
if [[ "$OFFLINE_DER" != "N/A" ]]; then
    printf "%-25s %12s %11s%% %12s %12s %11sx\n" \
        "Offline (VBx)" "—" "$OFFLINE_DER" "(new)" "—" "$OFFLINE_RTFX" | tee -a "$LOG_FILE"
else
    printf "%-25s %12s %12s %12s %12s %12s\n" \
        "Offline (VBx)" "—" "N/A" "—" "—" "N/A" | tee -a "$LOG_FILE"
fi

compare_der_rtfx "Streaming (5s/0.8)" "$BASELINE_STREAMING_DER" "$STREAMING_DER" "$BASELINE_STREAMING_RTFX" "$STREAMING_RTFX"
compare_der_rtfx "Sortformer (high-lat)" "$BASELINE_SORTFORMER_DER" "$SORTFORMER_DER" "$BASELINE_SORTFORMER_RTFX" "$SORTFORMER_RTFX"
compare_der_rtfx "LS-EEND (AMI)" "$BASELINE_LSEEND_DER" "$LSEEND_DER" "$BASELINE_LSEEND_RTFX" "$LSEEND_RTFX"

log ""

# Check for any DER regressions (>2.0% increase — diarization is noisier than ASR)
ANY_REGRESSION=$(python3 -c "
baselines = [
    ($BASELINE_STREAMING_DER, '$STREAMING_DER'),
    ($BASELINE_SORTFORMER_DER, '$SORTFORMER_DER'),
    ($BASELINE_LSEEND_DER, '$LSEEND_DER'),
]
for b, c in baselines:
    if c != 'N/A' and float(c) > b + 2.0:
        print('YES'); exit()
print('NO')
" 2>/dev/null || echo "NO")

if [[ "$ANY_REGRESSION" == "YES" ]]; then
    log "WARNING: DER REGRESSION DETECTED (>2.0% above baseline) — investigate before merging"
else
    log "No DER regressions (all within 2.0% of baseline)"
fi

# caffeinate will exit automatically since the parent process ($$) exits