Files
Alex 7d074e1ee6 chore: consolidate Python scripts into Scripts/ (#344)
## Summary
- Move `Benchmarks/nemo` to `Scripts/nemo_ami_benchmark`
- Move `Tools/voice_cloning` to `Scripts/voice_cloning`
- Remove now-empty `Benchmarks/` and `Tools/` top-level directories

Consolidates standalone Python utilities into a single `Scripts/`
directory to reduce top-level clutter.

## Test plan
- [x] Verify files moved correctly (no content changes)
<!-- devin-review-badge-begin -->

---

<a href="https://app.devin.ai/review/fluidinference/fluidaudio/pull/344"
target="_blank">
  <picture>
<source media="(prefers-color-scheme: dark)"
srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1">
<img
src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1"
alt="Open with Devin">
  </picture>
</a>
<!-- devin-review-badge-end -->
2026-03-04 12:46:03 -05:00

297 lines
10 KiB
Python
Executable File

#!/usr/bin/env python3
"""Evaluate voice cloning quality using spectral similarity.
Compares a reference voice sample with synthesized TTS output using
mel-spectrogram cosine similarity - no neural network required.
Requirements:
pip install librosa numpy scipy
Usage:
python evaluate_voice.py reference.wav synthesized.wav
python evaluate_voice.py reference.wav synthesized.wav --plot
"""
import argparse
import logging
import sys
from pathlib import Path
import numpy as np
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)
SAMPLE_RATE = 24000 # PocketTTS native sample rate
def load_audio(path: Path) -> np.ndarray:
"""Load audio and resample to target sample rate."""
try:
import librosa
audio, _ = librosa.load(str(path), sr=SAMPLE_RATE, mono=True)
return audio
except ImportError:
from scipy.io import wavfile
from scipy import signal
sr, audio = wavfile.read(str(path))
if audio.dtype == np.int16:
audio = audio.astype(np.float32) / 32768.0
elif audio.dtype == np.int32:
audio = audio.astype(np.float32) / 2147483648.0
if len(audio.shape) > 1:
audio = audio.mean(axis=1)
if sr != SAMPLE_RATE:
num_samples = int(len(audio) * SAMPLE_RATE / sr)
audio = signal.resample(audio, num_samples)
return audio.astype(np.float32)
def compute_mel_spectrogram(audio: np.ndarray, n_mels: int = 80, n_fft: int = 1024,
hop_length: int = 256) -> np.ndarray:
"""Compute mel spectrogram."""
try:
import librosa
mel = librosa.feature.melspectrogram(
y=audio, sr=SAMPLE_RATE, n_mels=n_mels,
n_fft=n_fft, hop_length=hop_length
)
return librosa.power_to_db(mel, ref=np.max)
except ImportError:
# Fallback using scipy
from scipy import signal
from scipy.fftpack import dct
# Simple STFT
_, _, Sxx = signal.spectrogram(audio, fs=SAMPLE_RATE, nperseg=n_fft,
noverlap=n_fft - hop_length)
# Approximate mel scaling (simplified)
mel_basis = np.zeros((n_mels, Sxx.shape[0]))
for i in range(n_mels):
center = int(Sxx.shape[0] * (i + 1) / (n_mels + 1))
width = max(1, Sxx.shape[0] // (n_mels * 2))
mel_basis[i, max(0, center-width):min(Sxx.shape[0], center+width)] = 1
mel_basis = mel_basis / (mel_basis.sum(axis=1, keepdims=True) + 1e-8)
mel = np.dot(mel_basis, Sxx)
return 10 * np.log10(mel + 1e-10)
def compute_mfcc(audio: np.ndarray, n_mfcc: int = 13) -> np.ndarray:
"""Compute MFCCs."""
try:
import librosa
return librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=n_mfcc)
except ImportError:
mel = compute_mel_spectrogram(audio)
from scipy.fftpack import dct
return dct(mel, type=2, axis=0, norm='ortho')[:n_mfcc]
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
"""Compute cosine similarity between two vectors."""
a_flat = a.flatten()
b_flat = b.flatten()
# Truncate to same length
min_len = min(len(a_flat), len(b_flat))
a_flat = a_flat[:min_len]
b_flat = b_flat[:min_len]
norm_a = np.linalg.norm(a_flat)
norm_b = np.linalg.norm(b_flat)
if norm_a == 0 or norm_b == 0:
return 0.0
return float(np.dot(a_flat, b_flat) / (norm_a * norm_b))
def compute_spectral_similarity(ref_audio: np.ndarray, syn_audio: np.ndarray) -> dict:
"""Compute spectral similarity metrics."""
# Compute mel spectrograms
ref_mel = compute_mel_spectrogram(ref_audio)
syn_mel = compute_mel_spectrogram(syn_audio)
# Compute mean mel vectors (voice timbre signature)
ref_mel_mean = ref_mel.mean(axis=1)
syn_mel_mean = syn_mel.mean(axis=1)
mel_similarity = cosine_similarity(ref_mel_mean, syn_mel_mean)
# Compute MFCCs
ref_mfcc = compute_mfcc(ref_audio)
syn_mfcc = compute_mfcc(syn_audio)
# MFCC mean (captures voice characteristics)
ref_mfcc_mean = ref_mfcc.mean(axis=1)
syn_mfcc_mean = syn_mfcc.mean(axis=1)
mfcc_similarity = cosine_similarity(ref_mfcc_mean, syn_mfcc_mean)
# MFCC std (captures dynamics)
ref_mfcc_std = ref_mfcc.std(axis=1)
syn_mfcc_std = syn_mfcc.std(axis=1)
mfcc_std_similarity = cosine_similarity(ref_mfcc_std, syn_mfcc_std)
return {
'mel_similarity': mel_similarity,
'mfcc_similarity': mfcc_similarity,
'mfcc_std_similarity': mfcc_std_similarity,
}
def evaluate_voice_cloning(
reference_path: Path,
synthesized_path: Path,
plot: bool = False
) -> dict:
"""Evaluate voice cloning quality using spectral similarity."""
logger.info(f"Reference: {reference_path}")
logger.info(f"Synthesized: {synthesized_path}")
logger.info("")
# Load audio
ref_audio = load_audio(reference_path)
syn_audio = load_audio(synthesized_path)
logger.info(f"Reference duration: {len(ref_audio) / SAMPLE_RATE:.2f}s")
logger.info(f"Synthesized duration: {len(syn_audio) / SAMPLE_RATE:.2f}s")
logger.info("")
# Compute spectral similarity
logger.info("Computing spectral similarity...")
metrics = compute_spectral_similarity(ref_audio, syn_audio)
# Combined score (weighted average)
combined = (
0.4 * metrics['mel_similarity'] +
0.4 * metrics['mfcc_similarity'] +
0.2 * metrics['mfcc_std_similarity']
)
metrics['combined_similarity'] = combined
logger.info("")
logger.info(f" Mel Similarity: {metrics['mel_similarity']:.4f}")
logger.info(f" MFCC Similarity: {metrics['mfcc_similarity']:.4f}")
logger.info(f" MFCC Std Similarity: {metrics['mfcc_std_similarity']:.4f}")
logger.info(f" Combined Score: {combined:.4f}")
# Quality interpretation
if combined >= 0.90:
quality = "Excellent"
elif combined >= 0.80:
quality = "Good"
elif combined >= 0.70:
quality = "Fair"
else:
quality = "Poor"
metrics['quality'] = quality
logger.info(f" Quality: {quality}")
# Plot if requested
if plot:
plot_spectrograms(ref_audio, syn_audio, reference_path.stem, synthesized_path.stem)
return metrics
def plot_spectrograms(ref_audio: np.ndarray, syn_audio: np.ndarray,
ref_name: str, syn_name: str):
"""Visualize mel spectrograms."""
try:
import matplotlib.pyplot as plt
except ImportError:
logger.warning("matplotlib not installed, skipping plot")
return
ref_mel = compute_mel_spectrogram(ref_audio)
syn_mel = compute_mel_spectrogram(syn_audio)
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
# Reference mel spectrogram
im0 = axes[0, 0].imshow(ref_mel, aspect='auto', origin='lower', cmap='magma')
axes[0, 0].set_title(f'Reference: {ref_name}')
axes[0, 0].set_ylabel('Mel bin')
plt.colorbar(im0, ax=axes[0, 0], format='%+2.0f dB')
# Synthesized mel spectrogram
im1 = axes[0, 1].imshow(syn_mel, aspect='auto', origin='lower', cmap='magma')
axes[0, 1].set_title(f'Synthesized: {syn_name}')
axes[0, 1].set_ylabel('Mel bin')
plt.colorbar(im1, ax=axes[0, 1], format='%+2.0f dB')
# Mean mel comparison
ref_mel_mean = ref_mel.mean(axis=1)
syn_mel_mean = syn_mel.mean(axis=1)
axes[1, 0].plot(ref_mel_mean, label='Reference', alpha=0.8)
axes[1, 0].plot(syn_mel_mean, label='Synthesized', alpha=0.8)
axes[1, 0].set_xlabel('Mel bin')
axes[1, 0].set_ylabel('Mean energy (dB)')
axes[1, 0].set_title('Mean Mel Spectrum (Voice Timbre)')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
# MFCC comparison
ref_mfcc = compute_mfcc(ref_audio).mean(axis=1)
syn_mfcc = compute_mfcc(syn_audio).mean(axis=1)
x = np.arange(len(ref_mfcc))
width = 0.35
axes[1, 1].bar(x - width/2, ref_mfcc, width, label='Reference', alpha=0.8)
axes[1, 1].bar(x + width/2, syn_mfcc, width, label='Synthesized', alpha=0.8)
axes[1, 1].set_xlabel('MFCC coefficient')
axes[1, 1].set_ylabel('Value')
axes[1, 1].set_title('Mean MFCCs')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('spectral_comparison.png', dpi=150)
logger.info("\nSaved comparison plot to: spectral_comparison.png")
plt.show()
def main():
parser = argparse.ArgumentParser(
description="Evaluate voice cloning using spectral similarity",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Spectral Similarity Thresholds:
0.90+ Excellent - Very close spectral match
0.80+ Good - Similar voice characteristics
0.70+ Fair - Some similarity
<0.70 Poor - Different spectral characteristics
Metrics:
- Mel Similarity: Cosine similarity of mean mel spectrum (timbre)
- MFCC Similarity: Cosine similarity of mean MFCCs (voice characteristics)
- MFCC Std Similarity: Similarity of MFCC dynamics
Requirements:
pip install librosa numpy
# Or minimal: pip install scipy numpy
Examples:
python evaluate_voice.py original_speaker.wav tts_output.wav
python evaluate_voice.py reference.wav synthesized.wav --plot
"""
)
parser.add_argument("reference", type=Path, help="Reference voice audio file")
parser.add_argument("synthesized", type=Path, help="Synthesized TTS audio file")
parser.add_argument("--plot", action="store_true", help="Show spectrogram comparison plots")
parser.add_argument("--json", action="store_true", help="Output metrics as JSON")
args = parser.parse_args()
if not args.reference.exists():
logger.error(f"Reference file not found: {args.reference}")
sys.exit(1)
if not args.synthesized.exists():
logger.error(f"Synthesized file not found: {args.synthesized}")
sys.exit(1)
metrics = evaluate_voice_cloning(args.reference, args.synthesized, plot=args.plot)
if args.json:
import json
print(json.dumps(metrics, indent=2))
if __name__ == "__main__":
main()