feat(cli): add speaker-similarity command

Compare two audio files via DiarizerManager's 256-d speaker embedding
extractor + cosine similarity. Useful for sanity-checking voice cloning
output (does the synthesized voice match the reference?) and for
diarization debugging.

Usage:
  fluidaudio speaker-similarity <a.wav> <b.wav> [--threshold 0.65] [--json]
This commit is contained in:
Alex-Wengg
2026-04-26 13:12:47 -04:00
parent d302273d49
commit 5dfd39798d
2 changed files with 135 additions and 0 deletions
@@ -0,0 +1,132 @@
#if os(macOS)
import FluidAudio
import Foundation
/// Compare the speaker identity of two audio files using DiarizerManager's
/// 256-dim speaker embedding extractor and cosine similarity.
///
/// Usage:
/// fluidaudio speaker-similarity <a.wav> <b.wav>
/// [--threshold 0.65] [--json]
///
/// Output (default text mode):
/// distance : 0.1234 (0 = identical, 2 = opposite)
/// similarity : 0.8766 (1 = identical, -1 = opposite)
/// same speaker: yes (similarity > threshold)
enum SpeakerSimilarityCommand {
private static let logger = AppLogger(category: "SpeakerSimilarity")
static func run(arguments: [String]) async {
var positional: [String] = []
var threshold: Float = 0.65
var json = false
var i = 0
while i < arguments.count {
switch arguments[i] {
case "--threshold":
if i + 1 < arguments.count, let v = Float(arguments[i + 1]) {
threshold = v
i += 1
} else {
fputs("ERROR: --threshold requires a Float\n", stderr)
exit(1)
}
case "--json":
json = true
case "-h", "--help":
printUsage()
return
default:
positional.append(arguments[i])
}
i += 1
}
guard positional.count == 2 else {
fputs("ERROR: speaker-similarity expects exactly 2 audio paths\n", stderr)
printUsage()
exit(1)
}
let pathA = positional[0]
let pathB = positional[1]
let manager: DiarizerManager
do {
let config = DiarizerConfig()
manager = DiarizerManager(config: config)
let models = try await DiarizerModels.downloadIfNeeded()
manager.initialize(models: models)
} catch {
logger.error("Failed to initialize diarizer models: \(error)")
exit(1)
}
let converter = AudioConverter()
let samplesA: [Float]
let samplesB: [Float]
do {
samplesA = try converter.resampleAudioFile(path: pathA)
samplesB = try converter.resampleAudioFile(path: pathB)
} catch {
logger.error("Failed to load audio: \(error)")
exit(1)
}
let embA: [Float]
let embB: [Float]
do {
embA = try manager.extractSpeakerEmbedding(from: samplesA)
embB = try manager.extractSpeakerEmbedding(from: samplesB)
} catch {
logger.error("Failed to extract speaker embedding: \(error)")
exit(1)
}
let distance = SpeakerUtilities.cosineDistance(embA, embB)
let similarity = 1.0 - distance
let sameSpeaker = similarity > threshold
if json {
// Hand-roll JSON to avoid pulling in JSONSerialization for a flat 5-key dict.
let payload =
"{\"file_a\":\"\(escape(pathA))\","
+ "\"file_b\":\"\(escape(pathB))\","
+ "\"distance\":\(distance),"
+ "\"similarity\":\(similarity),"
+ "\"threshold\":\(threshold),"
+ "\"same_speaker\":\(sameSpeaker)}"
print(payload)
} else {
print("file a : \(pathA)")
print("file b : \(pathB)")
print(String(format: "distance : %.4f (0 = identical, 2 = opposite)", distance))
print(String(format: "similarity : %.4f (1 = identical, -1 = opposite)", similarity))
print(String(format: "threshold : %.4f", threshold))
print("same speaker: \(sameSpeaker ? "yes" : "no")")
}
}
private static func escape(_ s: String) -> String {
s.replacingOccurrences(of: "\\", with: "\\\\")
.replacingOccurrences(of: "\"", with: "\\\"")
}
private static func printUsage() {
print(
"""
Usage: fluidaudio speaker-similarity <a.wav> <b.wav> [options]
Compares two audio files using a 256-dim speaker embedding and
reports the cosine similarity / distance.
Options:
--threshold <float> Decision threshold for "same speaker"
(default: 0.65)
--json Emit a single-line JSON object
-h, --help Show this message
"""
)
}
}
#endif
@@ -46,6 +46,8 @@ struct FluidAudioCLI {
await StreamDiarizationBenchmark.run(arguments: Array(arguments.dropFirst(2)))
case "process":
await ProcessCommand.run(arguments: Array(arguments.dropFirst(2)))
case "speaker-similarity":
await SpeakerSimilarityCommand.run(arguments: Array(arguments.dropFirst(2)))
case "download":
await DownloadCommand.run(arguments: Array(arguments.dropFirst(2)))
case "parakeet-eou":
@@ -98,6 +100,7 @@ struct FluidAudioCLI {
Commands:
process Process a single audio file for diarization
speaker-similarity Compare two audio files via 256-dim speaker embeddings
diarization-benchmark Run diarization benchmark
vad-benchmark Run VAD-specific benchmark
vad-analyze Inspect VAD segmentation and streaming events