diff --git a/.claude/settings.json b/.claude/settings.json index 3c7658e0..6c9694cd 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -6,7 +6,7 @@ "hooks": [ { "type": "command", - "command": "swift format --in-place --recursive --configuration .swift-format Sources/ Tests/ Examples/" + "command": "swift format --in-place --recursive --configuration .swift-format Sources/ Tests/" } ] } diff --git a/CLAUDE.md b/CLAUDE.md index d07547e8..f0ff4887 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -41,7 +41,7 @@ FluidAudio is a comprehensive Swift framework for local, low-latency audio proce - **Swift Format**: This project uses swift-format for consistent code style - **Configuration**: See `.swift-format` for style rules - **Auto-formatting**: PRs are automatically checked for formatting compliance -- **Local formatting**: Run `swift format --in-place --recursive --configuration .swift-format Sources/ Tests/ Examples/` +- **Local formatting**: Run `swift format --in-place --recursive --configuration .swift-format Sources/ Tests/` ## Current Performance Status @@ -96,13 +96,13 @@ swift package clean # Clean build cache ### Code Quality ```bash # Format code (requires Swift 6+ for development) -swift format --in-place --recursive --configuration .swift-format Sources/ Tests/ Examples/ +swift format --in-place --recursive --configuration .swift-format Sources/ Tests/ # Check formatting without modifying -swift format lint --recursive --configuration .swift-format Sources/ Tests/ Examples/ +swift format lint --recursive --configuration .swift-format Sources/ Tests/ # Verify formatting compliance (CI-style check) -swift format --configuration .swift-format Sources/ Tests/ Examples/ +swift format --configuration .swift-format Sources/ Tests/ ``` ### CLI Commands diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..3ad7b94a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,25 @@ +# Contributing + +This project uses `swift-format` to 
maintain consistent code style. All pull requests are automatically checked for formatting compliance. + +## Local Development + +```bash +# Format all code (requires Swift 6+ for contributors only) +# Users of the library don't need Swift 6 +swift format --in-place --recursive --configuration .swift-format Sources/ Tests/ + +# Check formatting without modifying +swift format lint --recursive --configuration .swift-format Sources/ Tests/ + +# For Swift <6, install swift-format separately: +# git clone https://github.com/apple/swift-format +# cd swift-format && swift build -c release +# cp .build/release/swift-format /usr/local/bin/ +``` + +## Automatic Checks + +- PRs will fail if code is not properly formatted +- GitHub Actions runs formatting checks on all Swift file changes +- See `.swift-format` for style configuration diff --git a/Documentation/API.md b/Documentation/API.md new file mode 100644 index 00000000..8462b83c --- /dev/null +++ b/Documentation/API.md @@ -0,0 +1,82 @@ +# API Reference + +This page summarizes the primary public APIs across modules. See inline doc comments and module-specific documentation for complete details. + +## Common Patterns + +**Audio Format:** All modules expect 16kHz mono Float32 audio samples. Use `AudioProcessor.loadAudioFile()` for conversion. + +**Model Loading:** Models auto-download from HuggingFace on first use. Set `https_proxy` environment variable if behind corporate firewall. + +**Error Handling:** All async methods throw descriptive errors. Use proper error handling in production code. + +**Thread Safety:** All managers are thread-safe and can be used concurrently across different queues. + +## Diarization + +### DiarizerManager +Main class for speaker diarization and "who spoke when" analysis. 
+ +**Key Methods:** +- `performCompleteDiarization(_:sampleRate:) throws -> DiarizerResult` + - Process complete audio file and return speaker segments + - Parameters: `RandomAccessCollection` audio samples, sample rate (default: 16000) + - Returns: `DiarizerResult` with speaker segments and timing +- `compareSpeakers(audio1:audio2:) throws -> Float` + - Compare speaker similarity between two audio samples + - Returns: Similarity score (0.0-1.0, higher = more similar) +- `validateAudio(_:) throws -> AudioValidationResult` + - Validate audio quality, length, and format requirements + +**Configuration:** +- `DiarizerConfig`: Clustering threshold, minimum durations, activity thresholds +- Optimal threshold: 0.7 (17.7% DER on AMI dataset) + +## Voice Activity Detection + +### VadManager +Voice activity detection using Silero VAD models. + +**Key Methods:** +- `processChunk(_:) throws -> VadResult` + - Process single 512-sample chunk (32ms at 16kHz) + - Returns: Voice activity probability and boolean decision +- `processAudioFile(_:) throws -> [VadResult]` + - Process complete audio file in 512-sample chunks + - Returns: Array of VAD results for each frame + +**Configuration:** +- `VadConfig`: Threshold (0.0-1.0), window size, post-processing +- `VadAudioProcessor`: SNR filtering, noise reduction, adaptive thresholding +- Recommended threshold: 0.3-0.5 depending on noise conditions + +## Automatic Speech Recognition + +### AsrManager +Automatic speech recognition using Parakeet TDT v3 models. 
+ +**Key Methods:** +- `transcribe(_:source:) throws -> AsrTranscription` + - Process complete audio and return transcription + - Parameters: `RandomAccessCollection` samples, `AudioSource` (microphone/system) + - Returns: `AsrTranscription` with text, confidence, and timing +- `initialize(models:) async throws` + - Load and initialize ASR models (automatic download if needed) + +**Model Management:** +- `AsrModels.downloadAndLoad() async throws -> AsrModels` + - Download models from HuggingFace and compile for CoreML + - Models cached locally after first download +- `ASRConfig`: Beam size, temperature, language model weights + +**Audio Processing:** +- `AudioProcessor.loadAudioFile(path:) throws -> [Float]` + - Load and convert audio to 16kHz mono Float32 + - Supports: WAV, M4A, MP3, FLAC, and other common formats +- `AudioSource`: `.microphone` or `.system` for different processing paths + +**Performance:** +- Real-time factor: ~120x on M4 Pro (processes 1min audio in 0.5s) +- Languages: 25 European languages supported +- Streaming: Available via `StreamingAsrManager` (beta) + diff --git a/Documentation/ASR/GettingStarted.md b/Documentation/ASR/GettingStarted.md new file mode 100644 index 00000000..6f920983 --- /dev/null +++ b/Documentation/ASR/GettingStarted.md @@ -0,0 +1,50 @@ +# Automatic Speech Recognition (ASR) / Transcription + +- Model: `FluidInference/parakeet-tdt-0.6b-v3-coreml` +- Languages: 25 European languages (see model card) +- Processing Mode: Batch transcription for complete audio files +- Real-time Factor: ~120x on M4 Pro (1 minute ≈ 0.5 seconds) +- Streaming Support: Coming soon — batch processing recommended for production use + +## Quick Start (Code) + +```swift +import FluidAudio + +// Batch transcription from an audio file +Task { + // 1) Initialize ASR manager and load models + let models = try await AsrModels.downloadAndLoad() + let asrManager = AsrManager(config: .default) + try await asrManager.initialize(models: models) + + // 2) 
Prepare 16 kHz mono samples (see: Audio Conversion) + let samples = try await loadSamples16kMono(path: "path/to/audio.wav") + + // 3) Transcribe the audio + let result = try await asrManager.transcribe(samples, source: .system) + print("Transcription: \(result.text)") + print("Confidence: \(result.confidence)") +} +``` + +## CLI + +```bash +# Transcribe an audio file (batch) +swift run fluidaudio transcribe audio.wav + +# Transcribe multiple files in parallel +swift run fluidaudio multi-stream audio1.wav audio2.wav + +# Benchmark ASR on LibriSpeech +swift run fluidaudio asr-benchmark --subset test-clean --num-files 50 + +# Multilingual ASR (FLEURS) benchmark +swift run fluidaudio fleurs-benchmark --languages en_us,fr_fr --samples 10 + +# Download LibriSpeech test sets +swift run fluidaudio download --dataset librispeech-test-clean +swift run fluidaudio download --dataset librispeech-test-other +``` + diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md new file mode 100644 index 00000000..74eabf78 --- /dev/null +++ b/Documentation/Benchmarks.md @@ -0,0 +1,70 @@ +## Transcription + +https://huggingface.co/FluidInference/parakeet-tdt-0.6b-v3-coreml + +```bash +swift run fluidaudio fleurs-benchmark --languages en_us,it_it,es_419,fr_fr,de_de,ru_ru,uk_ua --samples all +``` + +```text +================================================================================ +FLEURS BENCHMARK SUMMARY +================================================================================ + +Language | WER% | CER% | RTFx | Duration | Processed | Skipped +----------------------------------------------------------------------------------------- +English (US) | 5.7 | 2.8 | 136.7 | 3442.9s | 350 | - +French (France) | 5.8 | 2.4 | 136.5 | 560.8s | 52 | 298 +German (Germany) | 3.1 | 1.2 | 152.2 | 62.1s | 5 | - +Italian (Italy) | 4.3 | 2.0 | 153.7 | 743.3s | 50 | - +Russian (Russia) | 7.7 | 2.8 | 134.1 | 621.2s | 50 | - +Spanish (Spain) | 6.5 | 3.0 | 152.3 | 586.9s | 50 | - 
+Ukrainian (Ukraine) | 6.5 | 1.9 | 132.5 | 528.2s | 50 | - +----------------------------------------------------------------------------------------- +AVERAGE | 5.6 | 2.3 | 142.6 | 6545.5s | 607 | 298 +``` + +```text +2620 files per dataset • Test runtime: 4m 1s • 09/04/2025, 1:55 AM EDT +--- Benchmark Results --- + Dataset: librispeech test-clean + Files processed: 2620 + Average WER: 2.7% + Median WER: 0.0% + Average CER: 1.1% + Median RTFx: 99.3x + Overall RTFx: 109.6x (19452.5s / 177.5s) +``` + +## Voice Activity Detection + +![assets/vad-graph.png](assets/vad-graph.png) + +Dataset: https://github.com/Lab41/VOiCES-subset + +```text +swift run fluidaudio vad-benchmark --dataset voices-subset --all-files --threshold 0.5 +... +⏱️ Timing Statistics: +[23:26:10.167] [INFO] [VAD] Total processing time: 2.76s +[23:26:10.167] [INFO] [VAD] Total audio duration: 350.46s +[23:26:10.167] [INFO] [VAD] RTFx: 126.9x faster than real-time +[23:26:10.167] [INFO] [VAD] Audio loading time: 0.01s (0.2%) +[23:26:10.167] [INFO] [VAD] VAD inference time: 2.75s (99.7%) +[23:26:10.167] [INFO] [VAD] Average per file: 0.115s +[23:26:10.167] [INFO] [VAD] Min per file: 0.022s +[23:26:10.167] [INFO] [VAD] Max per file: 0.135s +[23:26:10.167] [INFO] [VAD] +📊 VAD Benchmark Results: +[23:26:10.167] [INFO] [VAD] Precision: 100.0% +[23:26:10.167] [INFO] [VAD] Accuracy: 100.0% +[23:26:10.167] [INFO] [VAD] Recall: 100.0% +[23:26:10.167] [INFO] [VAD] F1-Score: 100.0% +[23:26:10.167] [INFO] [VAD] Total Time: 2.76s +[23:26:10.167] [INFO] [VAD] RTFx: 126.9x faster than real-time +[23:26:10.167] [INFO] [VAD] Files Processed: 24 +[23:26:10.167] [INFO] [VAD] Avg Time per File: 0.115s +``` + + +## Speaker Diarization \ No newline at end of file diff --git a/Documentation/CLI.md b/Documentation/CLI.md new file mode 100644 index 00000000..8472651d --- /dev/null +++ b/Documentation/CLI.md @@ -0,0 +1,61 @@ +# Command Line Interface (CLI) + +This guide collects commonly used `fluidaudio` CLI commands for ASR, 
diarization, VAD, and datasets. + +## ASR + +```bash +# Transcribe an audio file (batch) +swift run fluidaudio transcribe audio.wav + +# Transcribe multiple files in parallel +swift run fluidaudio multi-stream audio1.wav audio2.wav + +# Benchmark ASR on LibriSpeech +swift run fluidaudio asr-benchmark --subset test-clean --num-files 50 + +# Multilingual ASR (FLEURS) benchmark +swift run fluidaudio fleurs-benchmark --languages en_us,fr_fr --samples 10 +``` + +## Diarization + +```bash +# Run AMI benchmark (auto-download dataset) +swift run fluidaudio diarization-benchmark --auto-download + +# Tune threshold and save results +swift run fluidaudio diarization-benchmark --threshold 0.7 --output results.json + +# Quick test on a single AMI file +swift run fluidaudio diarization-benchmark --single-file ES2004a --threshold 0.8 + +# Real-time-ish streaming benchmark (~3s chunks with 2s overlap) +swift run fluidaudio diarization-benchmark --single-file ES2004a \ + --chunk-seconds 3 --overlap-seconds 2 + +# Balanced throughput/quality (~10s chunks with 5s overlap) +swift run fluidaudio diarization-benchmark --dataset ami-sdm \ + --chunk-seconds 10 --overlap-seconds 5 +``` + +## VAD + +```bash +# Run VAD benchmark (mini50 dataset by default) +swift run fluidaudio vad-benchmark --num-files 50 --threshold 0.3 + +# Save results and enable debug output +swift run fluidaudio vad-benchmark --all-files --output vad_results.json --debug +``` + +## Datasets + +```bash +# Download test sets +swift run fluidaudio download --dataset librispeech-test-clean +swift run fluidaudio download --dataset librispeech-test-other +swift run fluidaudio download --dataset ami-sdm +swift run fluidaudio download --dataset vad +``` + diff --git a/Documentation/Guides/AudioConversion.md b/Documentation/Guides/AudioConversion.md new file mode 100644 index 00000000..374a2460 --- /dev/null +++ b/Documentation/Guides/AudioConversion.md @@ -0,0 +1,26 @@ +# Audio Conversion (16 kHz mono) + +Most FluidAudio 
features expect 16 kHz mono Float32 samples. Use `AudioConverter` to load and convert from any `AVAudioFile` format. + +## Swift Example + +```swift +import AVFoundation +import FluidAudio + +public func loadSamples16kMono(path: String) async throws -> [Float] { + let url = URL(fileURLWithPath: path) + let file = try AVAudioFile(forReading: url) + let capacity = AVAudioFrameCount(file.length) + guard let buf = AVAudioPCMBuffer(pcmFormat: file.processingFormat, frameCapacity: capacity) else { + return [] + } + try file.read(into: buf) + let converter = AudioConverter() + return try await converter.convertToAsrFormat(buf) +} +``` + +Notes: +- Input can be any PCM format supported by `AVAudioFile`. +- Output is 16 kHz mono Float32 samples suitable for ASR/VAD/Diarization. diff --git a/Documentation/Guides/MCP.md b/Documentation/Guides/MCP.md new file mode 100644 index 00000000..998f8fed --- /dev/null +++ b/Documentation/Guides/MCP.md @@ -0,0 +1,24 @@ +# MCP + +The repo is indexed by the DeepWiki MCP server, so your coding tools can access documentation programmatically. 
+ +## VS Code / MCP Client Configuration + +Add this to your MCP client configuration file: + +```json +{ + "mcpServers": { + "deepwiki": { + "url": "https://mcp.deepwiki.com/mcp" + } + } +} +``` + +## Claude Code (CLI) + +```bash +claude mcp add -s user -t http deepwiki https://mcp.deepwiki.com/mcp +``` + diff --git a/Documentation/README.md b/Documentation/README.md new file mode 100644 index 00000000..a328ed50 --- /dev/null +++ b/Documentation/README.md @@ -0,0 +1,16 @@ +# Documentation Index + +- Guides + - [MCP](Guides/MCP.md) + - [Audio Conversion](Guides/AudioConversion.md) + +- Modules + - ASR: [Getting Started](ASR/GettingStarted.md) + - Diarization: [Speaker Diarization Guide](SpeakerDiarization.md) + - VAD: [Getting Started](VAD/GettingStarted.md) + +- API + - [API Reference](API.md) + +- CLI + - [Command Line Guide](CLI.md) diff --git a/Documentation/SpeakerDiarization.md b/Documentation/SpeakerDiarization.md index dbcdf27d..3676b682 100644 --- a/Documentation/SpeakerDiarization.md +++ b/Documentation/SpeakerDiarization.md @@ -80,6 +80,15 @@ for audioSamples in audioStream { } ``` +Notes: + +- Keep one `DiarizerManager` instance per stream so `SpeakerManager` maintains ID consistency. +- Always rebase per-chunk timestamps by `(chunkStartSample / sampleRate)`. +- Provide 16 kHz mono Float32 samples; pad final chunk to the model window. +- Tune `speakerThreshold` and `embeddingThreshold` to trade off ID stability vs. sensitivity. + +**Speaker Enrollment:** The `Speaker` class includes a `name` field for enrollment workflows. When users introduce themselves ("My name is Alice"), update the speaker's name from the default (e.g. "Speaker_1") to enable personalized identification. 
+ ### Chunk Size Considerations The `performCompleteDiarization` function accepts audio of any length, but accuracy varies: diff --git a/Documentation/VAD/GettingStarted.md b/Documentation/VAD/GettingStarted.md new file mode 100644 index 00000000..1377cdba --- /dev/null +++ b/Documentation/VAD/GettingStarted.md @@ -0,0 +1,51 @@ +# Voice Activity Detection (VAD) + +The current VAD APIs require careful tuning for your specific use case. If you need help integrating VAD, reach out in our Discord channel. + +Our goal is to provide a streamlined API similar to Apple's upcoming SpeechDetector in [OS26](https://developer.apple.com/documentation/speech/speechdetector). + +## Quick Start (Code) + +```swift +import FluidAudio + +// Programmatic VAD over an audio file +Task { + // 1) Initialize VAD (async load of Silero model) + let vad = try await VadManager(config: VadConfig(threshold: 0.3)) + + // 2) Prepare 16 kHz mono samples (see: Audio Conversion) + let samples = try await loadSamples16kMono(path: "path/to/audio.wav") + + // 3) Run VAD and print speech segments (512-sample frames) + let results = try await vad.processAudioFile(samples) + let sampleRate = 16000.0 + let frame = 512.0 + + var startIndex: Int? 
= nil + for (i, r) in results.enumerated() { + if r.isVoiceActive { + if startIndex == nil { startIndex = i } + } else if let s = startIndex { + let startSec = (Double(s) * frame) / sampleRate + let endSec = (Double(i + 1) * frame) / sampleRate + print(String(format: "Speech: %.2f–%.2fs", startSec, endSec)) + startIndex = nil + } + } +} +``` + +## CLI + +```bash +# Run VAD benchmark (mini50 dataset by default) +swift run fluidaudio vad-benchmark --num-files 50 --threshold 0.3 + +# Save results and enable debug output +swift run fluidaudio vad-benchmark --all-files --output vad_results.json --debug + +# Download VAD dataset if needed +swift run fluidaudio download --dataset vad +``` + diff --git a/Documentation/assets/vad-graph.png b/Documentation/assets/vad-graph.png new file mode 100644 index 00000000..6ed1a033 Binary files /dev/null and b/Documentation/assets/vad-graph.png differ diff --git a/README.md b/README.md index bec8ddd3..4c3aee17 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,18 @@ ![banner.png](banner.png) -# FluidAudio - Speaker Diarization, VAD and Transcription with CoreML +# FluidAudio - Speaker diarization, voice-activity-detection and transcription with CoreML [![Swift](https://img.shields.io/badge/Swift-5.9+-orange.svg)](https://swift.org) [![Platform](https://img.shields.io/badge/Platform-macOS%20%7C%20iOS-blue.svg)](https://developer.apple.com) [![Discord](https://img.shields.io/badge/Discord-Join%20Chat-7289da.svg)](https://discord.gg/WNsvaCtmDe) -[![Models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue)](https://huggingface.co/collections/FluidInference/coreml-models-6873d9e310e638c66d22fba9) +[![All Models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue)](https://huggingface.co/collections/FluidInference/coreml-models-6873d9e310e638c66d22fba9) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/FluidInference/FluidAudio) Fluid Audio is a Swift SDK for fully local, 
low-latency audio AI on Apple devices, with inference offloaded to the Apple Neural Engine (ANE), resulting in less memory and generally faster inference. -The SDK includes state-of-the-art speaker diarization, transcription, and voice activity detection via open-source models (MIT/Apache 2.0) that can be integrated with just a few lines of code. Models are optimized for background processing, ambient computing and always on workloads by running inference on the ANE, minimizing CPU usage and avoiding GPU/MPS entirely. +The SDK includes state-of-the-art speaker diarization, transcription, and voice activity detection via open-source models (MIT/Apache 2.0) that can be integrated with just a few lines of code. Models are optimized for background processing, ambient computing, and always-on workloads by running inference on the ANE, minimizing CPU usage and avoiding GPU/MPS entirely. -For custom use cases, feedback, additional model support, or platform requests, join our [Discord]. We’re also bringing visual, language, and TTS models to device and will share updates there. +For custom use cases, feedback, additional model support, or platform requests, join our [Discord](https://discord.gg/WNsvaCtmDe). We’re also bringing visual, language, and TTS models to device and will share updates there. 
Below are some featured local AI apps using Fluid Audio models on macOS and iOS: @@ -29,11 +29,9 @@ Below are some featured local AI apps using Fluid Audio models on macOS and iOS: - **Speaker Diarization**: Speaker separation with speaker clustering via Pyannote models - **Speaker Embedding Extraction**: Generate speaker embeddings for voice comparison and clustering, you can use this for speaker identification - **Voice Activity Detection (VAD)**: Voice activity detection with Silero models -- **CoreML Models**: Native Apple CoreML backend with custom-converted models optimized for Apple Silicon -- **Open-Source Models**: All models are publicly available on HuggingFace — converted and optimized by our team; permissive licenses - **Real-time Processing**: Designed for near real-time workloads but also works for offline processing -- **Cross-platform**: Support for macOS 14.0+ and iOS 17.0+ and Apple Silicon devices - **Apple Neural Engine**: Models run efficiently on Apple's ANE for maximum performance with minimal power consumption +- **Open-Source Models**: All models are publicly available on HuggingFace — converted and optimized by our team; permissive licenses ## Installation @@ -49,9 +47,24 @@ Important: When adding FluidAudio as a package dependency, only add the library ## Documentation -- **DeepWiki**: Auto-generated docs for this repo — https://deepwiki.com/FluidInference/FluidAudio +See **[DeepWiki](https://deepwiki.com/FluidInference/FluidAudio)** for auto-generated docs for this repo. 
-### MCP +### Documentation Index + +- Guides + - [MCP](Documentation/Guides/MCP.md) + - [Audio Conversion for Inference](Documentation/Guides/AudioConversion.md) +- Modules + - ASR: [Getting Started](Documentation/ASR/GettingStarted.md) + - ASR: [Last Chunk Handling](Documentation/ASR/LastChunkHandling.md) + - Diarization: [Speaker Diarization Guide](Documentation/SpeakerDiarization.md) + - VAD: [Getting Started](Documentation/VAD/GettingStarted.md) +- API + - [API Reference](Documentation/API.md) +- CLI + - [Command Line Guide](Documentation/CLI.md) + +### MCP Server The repo is indexed by DeepWiki MCP server, so your coding tool can access the docs: @@ -71,27 +84,6 @@ For claude code: claude mcp add -s user -t http deepwiki https://mcp.deepwiki.com/mcp ``` -### Audio Conversion (16 kHz mono) - -Most features expect 16 kHz mono Float32 samples. Use `AudioConverter` to load and convert from any `AVAudioFile` format: - -```swift -import AVFoundation -import FluidAudio - -func loadSamples16kMono(path: String) async throws -> [Float] { - let url = URL(fileURLWithPath: path) - let file = try AVAudioFile(forReading: url) - let capacity = AVAudioFrameCount(file.length) - guard let buf = AVAudioPCMBuffer(pcmFormat: file.processingFormat, frameCapacity: capacity) else { - return [] - } - try file.read(into: buf) - let converter = AudioConverter() - return try await converter.convertToAsrFormat(buf) -} -``` - ## Automatic Speech Recognition (ASR) / Transcription - **Model**: `FluidInference/parakeet-tdt-0.6b-v3-coreml` @@ -101,7 +93,7 @@ func loadSamples16kMono(path: String) async throws -> [Float] { - **Streaming Support**: Coming soon — batch processing is recommended for production use - **Backend**: Same Parakeet TDT v3 model powers our backend ASR -### Quick Start (Code) +### ASR Quick Start ```swift import FluidAudio @@ -123,24 +115,9 @@ Task { } ``` -### CLI - ```bash # Transcribe an audio file (batch) swift run fluidaudio transcribe audio.wav - -# Transcribe 
multiple files in parallel -swift run fluidaudio multi-stream audio1.wav audio2.wav - -# Benchmark ASR on LibriSpeech -swift run fluidaudio asr-benchmark --subset test-clean --num-files 50 - -# Multilingual ASR (FLEURS) benchmark -swift run fluidaudio fleurs-benchmark --languages en_us,fr_fr --samples 10 - -# Download LibriSpeech test sets -swift run fluidaudio download --dataset librispeech-test-clean -swift run fluidaudio download --dataset librispeech-test-other ``` ## Speaker Diarization @@ -151,20 +128,7 @@ swift run fluidaudio download --dataset librispeech-test-other - **JER: 28.0%** — Outperforms EEND 2019 (25.3%) and x-vector clustering (28.7%) - **RTF: 0.02x** — Real-time processing with 50x speedup -```text -RTF = Processing Time / Audio Duration - -With RTF = 0.02x: -- 1 minute of audio takes 0.02 × 60 = 1.2 seconds to process -- 10 minutes of audio takes 0.02 × 600 = 12 seconds to process - -For real-time speech-to-text: -- Latency: ~1.2 seconds per minute of audio -- Throughput: Can process 50x faster than real-time -- Pipeline impact: Minimal — diarization won't be the bottleneck -``` - -### Quick Start (Code) +### Speaker Diarization Quick Start ```swift import FluidAudio @@ -186,114 +150,18 @@ Task { } ``` -### Streaming Diarization +For streaming diarization, see [Documentation/SpeakerDiarization.md](Documentation/SpeakerDiarization.md) -Stream meeting audio in chunks while maintaining consistent speaker IDs across the session. Keep a single `DiarizerManager` alive, process fixed-size chunks, and rebase segment timestamps by the chunk’s start offset. Overlap helps reduce boundary errors and enables overlap speech handling. 
- -```swift -import FluidAudio - -Task { - // 1) Initialize diarizer once (models are reused across chunks) - let models = try await DiarizerModels.downloadIfNeeded() - let config = DiarizerConfig( - clusteringThreshold: 0.7, - minSpeechDuration: 1.0, - minSilenceGap: 0.5, - minActiveFramesCount: 10.0, - chunkDuration: 10.0, // model window; also used for chunk sizing below - chunkOverlap: 5.0, // optional overlap - debugMode: false - ) - let diarizer = DiarizerManager(config: config) - diarizer.initialize(models: models) - - // Optional: tune streaming behavior (assignment/update thresholds) - diarizer.speakerManager.speakerThreshold = 0.84 // assign to existing speakers - diarizer.speakerManager.embeddingThreshold = 0.56 // update embeddings over time - - // 2) Prepare 16 kHz mono samples (see: Audio Conversion) - let samples = try await loadSamples16kMono(path: "path/to/meeting.wav") - - // 3) Chunked streaming loop with timestamp rebasing - let sr = 16000.0 - let chunkSeconds = 10.0 - let overlapSeconds = 5.0 - let chunkSize = Int(chunkSeconds * sr) - let hop = Int(max(1.0, chunkSeconds - overlapSeconds) * sr) - - var position = 0 - var segments: [TimedSpeakerSegment] = [] - - while position < samples.count { - let end = min(position + chunkSize, samples.count) - var chunk = Array(samples[position..` (Array, ArraySlice, ContiguousArray, etc.) 
-- `compareSpeakers(audio1:audio2:)`: Compare similarity between two audio samples -- `validateAudio(_:)`: Validate audio quality and characteristics - -**Voice Activity Detection:** - -- `VadManager`: Voice activity detection with CoreML models -- `VadConfig`: Configuration for VAD processing with adaptive thresholding -- `processChunk(_:)`: Process a single audio chunk and detect voice activity -- `processAudioFile(_:)`: Process complete audio file in chunks -- `VadAudioProcessor`: Advanced audio processing with SNR filtering - -**Automatic Speech Recognition:** - -- `AsrManager`: Main ASR class with TDT decoding for batch processing -- `AsrModels`: Model loading and management with automatic downloads -- `ASRConfig`: Configuration for ASR processing -- `transcribe(_:source:)`: Process complete audio and return transcription results -- `AudioProcessor.loadAudioFile(path:)`: Load and convert audio files to required format -- `AudioSource`: Enum for microphone vs system audio separation - ## Everything Else -### Platform & Networking Notes +### FAQs - CLI is available on macOS only. For iOS, use the library programmatically. - Models auto-download on first use. If your network restricts Hugging Face access, set an HTTPS proxy: `export https_proxy=http://127.0.0.1:7890`. - Windows alternative in development: [fluid-server](https://github.com/FluidInference/fluid-server) - -If you're looking to get the system audio on a Mac, take a look at this repo for reference [AudioCap](https://github.com/insidegui/AudioCap/tree/main) +- If you're looking to get the system audio on a Mac, take a look at this repo for reference [AudioCap](https://github.com/insidegui/AudioCap/tree/main) ### License Apache 2.0 — see `LICENSE` for details. -### Contributing - -This project uses `swift-format` to maintain consistent code style. All pull requests are automatically checked for formatting compliance. 
- -**Local Development:** - -```bash -# Format all code (requires Swift 6+ for contributors only) -# Users of the library don't need Swift 6 -swift format --in-place --recursive --configuration .swift-format Sources/ Tests/ Examples/ - -# Check formatting without modifying -swift format lint --recursive --configuration .swift-format Sources/ Tests/ Examples/ - -# For Swift <6, install swift-format separately: -# git clone https://github.com/apple/swift-format -# cd swift-format && swift build -c release -# cp .build/release/swift-format /usr/local/bin/ -``` - -**Automatic Checks:** - -- PRs will fail if code is not properly formatted -- GitHub Actions runs formatting checks on all Swift file changes -- See `.swift-format` for style configuration - ### Acknowledgments This project builds upon the excellent work of the [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx) project for speaker diarization algorithms and techniques. diff --git a/Sources/FluidAudio/Diarizer/DiarizerManager.swift b/Sources/FluidAudio/Diarizer/DiarizerManager.swift index eb33b947..ba7143fe 100644 --- a/Sources/FluidAudio/Diarizer/DiarizerManager.swift +++ b/Sources/FluidAudio/Diarizer/DiarizerManager.swift @@ -245,8 +245,6 @@ public final class DiarizerManager { throw DiarizerError.notInitialized } - logger.debug("Using EmbeddingExtractor for embedding extraction") - var masks: [[Float]] = [] let numSpeakers = slidingFeature.data[0][0].count let numFrames = slidingFeature.data[0].count diff --git a/Sources/FluidAudio/Diarizer/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/SpeakerManager.swift index 9143052b..d5f0b802 100644 --- a/Sources/FluidAudio/Diarizer/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/SpeakerManager.swift @@ -151,10 +151,6 @@ public class SpeakerManager { segmentId: UUID(), alpha: 0.9 ) - - logger.debug( - "Updated embedding for \(speakerId), update count: \(speaker.updateCount), raw count: \(speaker.rawEmbeddings.count)" - ) } } else { // Just update duration 
if not updating embedding diff --git a/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift b/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift index 1fe5d70b..e8fa8ebd 100644 --- a/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift +++ b/Sources/FluidAudioCLI/DatasetParsers/AMIParser.swift @@ -89,21 +89,14 @@ struct AMIParser { ] // Add comprehensive debug logging for path resolution - logger.debug("🔍 DEBUG: Searching for AMI annotations in \(possiblePaths.count) locations:") - logger.debug(" Current working directory: \(FileManager.default.currentDirectoryPath)") - var amiDir: URL? - for (index, path) in possiblePaths.enumerated() { + for (_, path) in possiblePaths.enumerated() { let segmentsDir = path.appendingPathComponent("segments") let meetingsFile = path.appendingPathComponent("corpusResources/meetings.xml") let segmentsExists = FileManager.default.fileExists(atPath: segmentsDir.path) let meetingsExists = FileManager.default.fileExists(atPath: meetingsFile.path) - logger.debug(" \(index + 1). \(path.path)") - logger.debug(" - segments/: \(segmentsExists ? "✅" : "❌") (\(segmentsDir.path))") - logger.debug(" - meetings.xml: \(meetingsExists ? "✅" : "❌") (\(meetingsFile.path))") - if segmentsExists && meetingsExists { logger.info(" - 🎯 SELECTED: This path will be used") amiDir = path