Clean up README (#98)

### Why is this change needed?  The README has grown too large. This change splits it up, keeping only the essentials in the README and moving the verbose details to Documentation.
2026-05-12 20:20:36 +00:00 · 2025-09-11 10:11:25 -04:00
parent 09cd849998
commit 4e8b54ed78
17 changed files with 449 additions and 243 deletions
@@ -6,7 +6,7 @@
        "hooks": [
          {
            "type": "command",
-            "command": "swift format --in-place --recursive --configuration .swift-format Sources/ Tests/ Examples/"
+            "command": "swift format --in-place --recursive --configuration .swift-format Sources/ Tests/"
          }
        ]
      }
@@ -41,7 +41,7 @@ FluidAudio is a comprehensive Swift framework for local, low-latency audio proce
 - **Swift Format**: This project uses swift-format for consistent code style
 - **Configuration**: See `.swift-format` for style rules
 - **Auto-formatting**: PRs are automatically checked for formatting compliance
- **Local formatting**: Run `swift format --in-place --recursive --configuration .swift-format Sources/ Tests/ Examples/`
+- **Local formatting**: Run `swift format --in-place --recursive --configuration .swift-format Sources/ Tests/`

 ## Current Performance Status

@@ -96,13 +96,13 @@ swift package clean                   # Clean build cache
 ### Code Quality
 ```bash
 # Format code (requires Swift 6+ for development)
-swift format --in-place --recursive --configuration .swift-format Sources/ Tests/ Examples/
+swift format --in-place --recursive --configuration .swift-format Sources/ Tests/

 # Check formatting without modifying
-swift format lint --recursive --configuration .swift-format Sources/ Tests/ Examples/
+swift format lint --recursive --configuration .swift-format Sources/ Tests/

 # Verify formatting compliance (CI-style check)
-swift format --configuration .swift-format Sources/ Tests/ Examples/
+swift format --configuration .swift-format Sources/ Tests/
 ```

 ### CLI Commands
@@ -0,0 +1,25 @@
+# Contributing
+
+This project uses `swift-format` to maintain consistent code style. All pull requests are automatically checked for formatting compliance.
+
+## Local Development
+
+```bash
+# Format all code (requires Swift 6+ for contributors only)
+# Users of the library don't need Swift 6
+swift format --in-place --recursive --configuration .swift-format Sources/ Tests/
+
+# Check formatting without modifying
+swift format lint --recursive --configuration .swift-format Sources/ Tests/
+
+# For Swift <6, install swift-format separately:
+# git clone https://github.com/apple/swift-format
+# cd swift-format && swift build -c release
+# cp .build/release/swift-format /usr/local/bin/
+```
+
+## Automatic Checks
+
+- PRs will fail if code is not properly formatted
+- GitHub Actions runs formatting checks on all Swift file changes
+- See `.swift-format` for style configuration
@@ -0,0 +1,82 @@
+# API Reference
+
+This page summarizes the primary public APIs across modules. See inline doc comments and module-specific documentation for complete details.
+
+## Common Patterns
+
+**Audio Format:** All modules expect 16kHz mono Float32 audio samples. Use `AudioProcessor.loadAudioFile()` for conversion.
+
+**Model Loading:** Models auto-download from HuggingFace on first use. Set the `https_proxy` environment variable if you are behind a corporate firewall.
+
+**Error Handling:** All async methods throw descriptive errors. Use proper error handling in production code.
+
+**Thread Safety:** All managers are thread-safe and can be used concurrently across different queues.
+
+## Diarization
+
+### DiarizerManager
+Main class for speaker diarization and "who spoke when" analysis.
+
+**Key Methods:**
+- `performCompleteDiarization(_:sampleRate:) throws -> DiarizerResult`
+  - Process complete audio file and return speaker segments
+  - Parameters: `RandomAccessCollection<Float>` audio samples, sample rate (default: 16000)
+  - Returns: `DiarizerResult` with speaker segments and timing
+- `compareSpeakers(audio1:audio2:) throws -> Float`
+  - Compare speaker similarity between two audio samples
+  - Returns: Similarity score (0.0-1.0, higher = more similar)
+- `validateAudio(_:) throws -> AudioValidationResult`
+  - Validate audio quality, length, and format requirements
+
+**Configuration:**
+- `DiarizerConfig`: Clustering threshold, minimum durations, activity thresholds
+- Optimal threshold: 0.7 (17.7% DER on AMI dataset)
+
+## Voice Activity Detection
+
+### VadManager
+Voice activity detection using Silero VAD models.
+
+**Key Methods:**
+- `processChunk(_:) throws -> VadResult`
+  - Process single 512-sample chunk (32ms at 16kHz)
+  - Returns: Voice activity probability and boolean decision
+- `processAudioFile(_:) throws -> [VadResult]`
+  - Process complete audio file in 512-sample chunks
+  - Returns: Array of VAD results for each frame
+
+**Configuration:**
+- `VadConfig`: Threshold (0.0-1.0), window size, post-processing
+- `VadAudioProcessor`: SNR filtering, noise reduction, adaptive thresholding
+- Recommended threshold: 0.3-0.5 depending on noise conditions
+
+## Automatic Speech Recognition
+
+### AsrManager
+Automatic speech recognition using Parakeet TDT v3 models.
+
+**Key Methods:**
+- `transcribe(_:source:) throws -> AsrTranscription`
+  - Process complete audio and return transcription
+  - Parameters: `RandomAccessCollection<Float>` samples, `AudioSource` (microphone/system)
+  - Returns: `AsrTranscription` with text, confidence, and timing
+- `initialize(models:) async throws`
+  - Load and initialize ASR models (automatic download if needed)
+
+**Model Management:**
+- `AsrModels.downloadAndLoad() async throws -> AsrModels`
+  - Download models from HuggingFace and compile for CoreML
+  - Models cached locally after first download
+- `ASRConfig`: Beam size, temperature, language model weights
+
+**Audio Processing:**
+- `AudioProcessor.loadAudioFile(path:) throws -> [Float]`
+  - Load and convert audio to 16kHz mono Float32
+  - Supports: WAV, M4A, MP3, FLAC, and other common formats
+- `AudioSource`: `.microphone` or `.system` for different processing paths
+
+**Performance:**
+- Real-time factor: ~120x on M4 Pro (processes 1min audio in 0.5s)
+- Languages: 25 European languages supported
+- Streaming: Available via `StreamingAsrManager` (beta)
+
@@ -0,0 +1,50 @@
+# Automatic Speech Recognition (ASR) / Transcription
+
+- Model: `FluidInference/parakeet-tdt-0.6b-v3-coreml`
+- Languages: 25 European languages (see model card)
+- Processing Mode: Batch transcription for complete audio files
+- Real-time Factor: ~120x on M4 Pro (1 minute ≈ 0.5 seconds)
+- Streaming Support: Coming soon — batch processing recommended for production use
+
+## Quick Start (Code)
+
+```swift
+import FluidAudio
+
+// Batch transcription from an audio file
+Task {
+    // 1) Initialize ASR manager and load models
+    let models = try await AsrModels.downloadAndLoad()
+    let asrManager = AsrManager(config: .default)
+    try await asrManager.initialize(models: models)
+
+    // 2) Prepare 16 kHz mono samples (see: Audio Conversion)
+    let samples = try await loadSamples16kMono(path: "path/to/audio.wav")
+
+    // 3) Transcribe the audio
+    let result = try await asrManager.transcribe(samples, source: .system)
+    print("Transcription: \(result.text)")
+    print("Confidence: \(result.confidence)")
+}
+```
+
+## CLI
+
+```bash
+# Transcribe an audio file (batch)
+swift run fluidaudio transcribe audio.wav
+
+# Transcribe multiple files in parallel
+swift run fluidaudio multi-stream audio1.wav audio2.wav
+
+# Benchmark ASR on LibriSpeech
+swift run fluidaudio asr-benchmark --subset test-clean --num-files 50
+
+# Multilingual ASR (FLEURS) benchmark
+swift run fluidaudio fleurs-benchmark --languages en_us,fr_fr --samples 10
+
+# Download LibriSpeech test sets
+swift run fluidaudio download --dataset librispeech-test-clean
+swift run fluidaudio download --dataset librispeech-test-other
+```
+
@@ -0,0 +1,70 @@
+## Transcription
+
+Model: https://huggingface.co/FluidInference/parakeet-tdt-0.6b-v3-coreml
+
+```bash
+swift run fluidaudio fleurs-benchmark --languages en_us,it_it,es_419,fr_fr,de_de,ru_ru,uk_ua --samples all
+```
+
+```text
+================================================================================
+FLEURS BENCHMARK SUMMARY
+================================================================================
+
+Language                  | WER%   | CER%   | RTFx    | Duration | Processed | Skipped
+-----------------------------------------------------------------------------------------
+English (US)              | 5.7    | 2.8    | 136.7   | 3442.9s  | 350       | -
+French (France)           | 5.8    | 2.4    | 136.5   | 560.8s   | 52        | 298
+German (Germany)          | 3.1    | 1.2    | 152.2   | 62.1s    | 5         | -
+Italian (Italy)           | 4.3    | 2.0    | 153.7   | 743.3s   | 50        | -
+Russian (Russia)          | 7.7    | 2.8    | 134.1   | 621.2s   | 50        | -
+Spanish (Spain)           | 6.5    | 3.0    | 152.3   | 586.9s   | 50        | -
+Ukrainian (Ukraine)       | 6.5    | 1.9    | 132.5   | 528.2s   | 50        | -
+-----------------------------------------------------------------------------------------
+AVERAGE                   | 5.6    | 2.3    | 142.6   | 6545.5s  | 607       | 298
+```
+
+```text
+2620 files per dataset • Test runtime: 4m 1s • 09/04/2025, 1:55 AM EDT
+--- Benchmark Results ---
+   Dataset: librispeech test-clean
+   Files processed: 2620
+   Average WER: 2.7%
+   Median WER: 0.0%
+   Average CER: 1.1%
+   Median RTFx: 99.3x
+   Overall RTFx: 109.6x (19452.5s / 177.5s)
+```
+
+## Voice Activity Detection
+
+![assets/vad-graph.png](assets/vad-graph.png)
+
+Dataset: https://github.com/Lab41/VOiCES-subset
+
+```text
+swift run fluidaudio vad-benchmark --dataset voices-subset --all-files --threshold 0.5
+...
+⏱️ Timing Statistics:
+[23:26:10.167] [INFO] [VAD]    Total processing time: 2.76s
+[23:26:10.167] [INFO] [VAD]    Total audio duration: 350.46s
+[23:26:10.167] [INFO] [VAD]    RTFx: 126.9x faster than real-time
+[23:26:10.167] [INFO] [VAD]    Audio loading time: 0.01s (0.2%)
+[23:26:10.167] [INFO] [VAD]    VAD inference time: 2.75s (99.7%)
+[23:26:10.167] [INFO] [VAD]    Average per file: 0.115s
+[23:26:10.167] [INFO] [VAD]    Min per file: 0.022s
+[23:26:10.167] [INFO] [VAD]    Max per file: 0.135s
+[23:26:10.167] [INFO] [VAD]
+📊 VAD Benchmark Results:
+[23:26:10.167] [INFO] [VAD]    Precision: 100.0%
+[23:26:10.167] [INFO] [VAD]    Accuracy: 100.0%
+[23:26:10.167] [INFO] [VAD]    Recall: 100.0%
+[23:26:10.167] [INFO] [VAD]    F1-Score: 100.0%
+[23:26:10.167] [INFO] [VAD]    Total Time: 2.76s
+[23:26:10.167] [INFO] [VAD]    RTFx: 126.9x faster than real-time
+[23:26:10.167] [INFO] [VAD]    Files Processed: 24
+[23:26:10.167] [INFO] [VAD]    Avg Time per File: 0.115s
+```
+
+
+## Speaker Diarization
@@ -0,0 +1,61 @@
+# Command Line Interface (CLI)
+
+This guide collects commonly used `fluidaudio` CLI commands for ASR, diarization, VAD, and datasets.
+
+## ASR
+
+```bash
+# Transcribe an audio file (batch)
+swift run fluidaudio transcribe audio.wav
+
+# Transcribe multiple files in parallel
+swift run fluidaudio multi-stream audio1.wav audio2.wav
+
+# Benchmark ASR on LibriSpeech
+swift run fluidaudio asr-benchmark --subset test-clean --num-files 50
+
+# Multilingual ASR (FLEURS) benchmark
+swift run fluidaudio fleurs-benchmark --languages en_us,fr_fr --samples 10
+```
+
+## Diarization
+
+```bash
+# Run AMI benchmark (auto-download dataset)
+swift run fluidaudio diarization-benchmark --auto-download
+
+# Tune threshold and save results
+swift run fluidaudio diarization-benchmark --threshold 0.7 --output results.json
+
+# Quick test on a single AMI file
+swift run fluidaudio diarization-benchmark --single-file ES2004a --threshold 0.8
+
+# Real-time-ish streaming benchmark (~3s chunks with 2s overlap)
+swift run fluidaudio diarization-benchmark --single-file ES2004a \
+  --chunk-seconds 3 --overlap-seconds 2
+
+# Balanced throughput/quality (~10s chunks with 5s overlap)
+swift run fluidaudio diarization-benchmark --dataset ami-sdm \
+  --chunk-seconds 10 --overlap-seconds 5
+```
+
+## VAD
+
+```bash
+# Run VAD benchmark (mini50 dataset by default)
+swift run fluidaudio vad-benchmark --num-files 50 --threshold 0.3
+
+# Save results and enable debug output
+swift run fluidaudio vad-benchmark --all-files --output vad_results.json --debug
+```
+
+## Datasets
+
+```bash
+# Download test sets
+swift run fluidaudio download --dataset librispeech-test-clean
+swift run fluidaudio download --dataset librispeech-test-other
+swift run fluidaudio download --dataset ami-sdm
+swift run fluidaudio download --dataset vad
+```
+
@@ -0,0 +1,26 @@
+# Audio Conversion (16 kHz mono)
+
+Most FluidAudio features expect 16 kHz mono Float32 samples. Use `AudioConverter` to load and convert from any `AVAudioFile` format.
+
+## Swift Example
+
+```swift
+import AVFoundation
+import FluidAudio
+
+public func loadSamples16kMono(path: String) async throws -> [Float] {
+    let url = URL(fileURLWithPath: path)
+    let file = try AVAudioFile(forReading: url)
+    let capacity = AVAudioFrameCount(file.length)
+    guard let buf = AVAudioPCMBuffer(pcmFormat: file.processingFormat, frameCapacity: capacity) else {
+        return []
+    }
+    try file.read(into: buf)
+    let converter = AudioConverter()
+    return try await converter.convertToAsrFormat(buf)
+}
+```
+
+Notes:
+- Input can be any PCM format supported by `AVAudioFile`.
+- Output is 16 kHz mono Float32 samples suitable for ASR/VAD/Diarization.
@@ -0,0 +1,24 @@
+# MCP
+
+The repo is indexed by the DeepWiki MCP server, so your coding tools can access documentation programmatically.
+
+## VS Code / MCP Client Configuration
+
+Add this to your MCP client configuration file:
+
+```json
+{
+  "mcpServers": {
+    "deepwiki": {
+      "url": "https://mcp.deepwiki.com/mcp"
+    }
+  }
+}
+```
+
+## Claude Code (CLI)
+
+```bash
+claude mcp add -s user -t http deepwiki https://mcp.deepwiki.com/mcp
+```
+
@@ -0,0 +1,16 @@
+# Documentation Index
+
+- Guides
+  - [MCP](Guides/MCP.md)
+  - [Audio Conversion](Guides/AudioConversion.md)
+
+- Modules
+  - ASR: [Getting Started](ASR/GettingStarted.md)
+  - ASR: [Last Chunk Handling](ASR/LastChunkHandling.md)
+  - Diarization: [Speaker Diarization Guide](SpeakerDiarization.md)
+  - VAD: [Getting Started](VAD/GettingStarted.md)
+
+- API
+  - [API Reference](API.md)
+  
+- CLI
+  - [Command Line Guide](CLI.md)
@@ -80,6 +80,15 @@ for audioSamples in audioStream {
 }
 ```

+Notes:
+
+- Keep one `DiarizerManager` instance per stream so `SpeakerManager` maintains ID consistency.
+- Always rebase per-chunk timestamps by `(chunkStartSample / sampleRate)`.
+- Provide 16 kHz mono Float32 samples; pad final chunk to the model window.
+- Tune `speakerThreshold` and `embeddingThreshold` to trade off ID stability vs. sensitivity.
+
+**Speaker Enrollment:** The `Speaker` class includes a `name` field for enrollment workflows. When users introduce themselves ("My name is Alice"), update the speaker's name from the default (e.g. "Speaker_1") to enable personalized identification.
+
 ### Chunk Size Considerations

 The `performCompleteDiarization` function accepts audio of any length, but accuracy varies:
@@ -0,0 +1,51 @@
+# Voice Activity Detection (VAD)
+
+The current VAD APIs require careful tuning for your specific use case. If you need help integrating VAD, reach out in our Discord channel.
+
+Our goal is to provide a streamlined API similar to Apple's upcoming SpeechDetector in [OS26](https://developer.apple.com/documentation/speech/speechdetector).
+
+## Quick Start (Code)
+
+```swift
+import FluidAudio
+
+// Programmatic VAD over an audio file
+Task {
+    // 1) Initialize VAD (async load of Silero model)
+    let vad = try await VadManager(config: VadConfig(threshold: 0.3))
+
+    // 2) Prepare 16 kHz mono samples (see: Audio Conversion)
+    let samples = try await loadSamples16kMono(path: "path/to/audio.wav")
+
+    // 3) Run VAD and print speech segments (512-sample frames)
+    let results = try await vad.processAudioFile(samples)
+    let sampleRate = 16000.0
+    let frame = 512.0
+
+    var startIndex: Int? = nil
+    for (i, r) in results.enumerated() {
+        if r.isVoiceActive {
+            if startIndex == nil { startIndex = i }
+        } else if let s = startIndex {
+            let startSec = (Double(s) * frame) / sampleRate
+            let endSec = (Double(i + 1) * frame) / sampleRate
+            print(String(format: "Speech: %.2f–%.2fs", startSec, endSec))
+            startIndex = nil
+        }
+    }
+}
+```
+
+## CLI
+
+```bash
+# Run VAD benchmark (mini50 dataset by default)
+swift run fluidaudio vad-benchmark --num-files 50 --threshold 0.3
+
+# Save results and enable debug output
+swift run fluidaudio vad-benchmark --all-files --output vad_results.json --debug
+
+# Download VAD dataset if needed
+swift run fluidaudio download --dataset vad
+```
+
@@ -1,18 +1,18 @@
 ![banner.png](banner.png)

-# FluidAudio - Speaker Diarization, VAD and Transcription with CoreML
+# FluidAudio - Speaker diarization, voice activity detection and transcription with CoreML

 [![Swift](https://img.shields.io/badge/Swift-5.9+-orange.svg)](https://swift.org)
 [![Platform](https://img.shields.io/badge/Platform-macOS%20%7C%20iOS-blue.svg)](https://developer.apple.com)
 [![Discord](https://img.shields.io/badge/Discord-Join%20Chat-7289da.svg)](https://discord.gg/WNsvaCtmDe)
-[![Models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue)](https://huggingface.co/collections/FluidInference/coreml-models-6873d9e310e638c66d22fba9)
+[![All Models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue)](https://huggingface.co/collections/FluidInference/coreml-models-6873d9e310e638c66d22fba9)
 [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/FluidInference/FluidAudio)

 Fluid Audio is a Swift SDK for fully local, low-latency audio AI on Apple devices, with inference offloaded to the Apple Neural Engine (ANE), resulting in less memory and generally faster inference.

-The SDK includes state-of-the-art speaker diarization, transcription, and voice activity detection via open-source models (MIT/Apache 2.0) that can be integrated with just a few lines of code. Models are optimized for background processing, ambient computing and always on workloads by running inference on the ANE, minimizing CPU usage and avoiding GPU/MPS entirely. 
+The SDK includes state-of-the-art speaker diarization, transcription, and voice activity detection via open-source models (MIT/Apache 2.0) that can be integrated with just a few lines of code. Models are optimized for background processing, ambient computing, and always-on workloads by running inference on the ANE, minimizing CPU usage and avoiding GPU/MPS entirely.

-For custom use cases, feedback, additional model support, or platform requests, join our [Discord]. We’re also bringing visual, language, and TTS models to device and will share updates there.
+For custom use cases, feedback, additional model support, or platform requests, join our [Discord](https://discord.gg/WNsvaCtmDe). We’re also bringing visual, language, and TTS models to device and will share updates there.

 Below are some featured local AI apps using Fluid Audio models on macOS and iOS:

@@ -29,11 +29,9 @@ Below are some featured local AI apps using Fluid Audio models on macOS and iOS:
 - **Speaker Diarization**: Speaker separation with speaker clustering via Pyannote models
 - **Speaker Embedding Extraction**: Generate speaker embeddings for voice comparison and clustering, you can use this for speaker identification
 - **Voice Activity Detection (VAD)**: Voice activity detection with Silero models
- **CoreML Models**: Native Apple CoreML backend with custom-converted models optimized for Apple Silicon
- **Open-Source Models**: All models are publicly available on HuggingFace — converted and optimized by our team; permissive licenses
 - **Real-time Processing**: Designed for near real-time workloads but also works for offline processing
- **Cross-platform**: Support for macOS 14.0+ and iOS 17.0+ and Apple Silicon devices
 - **Apple Neural Engine**: Models run efficiently on Apple's ANE for maximum performance with minimal power consumption
+- **Open-Source Models**: All models are publicly available on HuggingFace — converted and optimized by our team; permissive licenses

 ## Installation

@@ -49,9 +47,24 @@ Important: When adding FluidAudio as a package dependency, only add the library

 ## Documentation

- **DeepWiki**: Auto-generated docs for this repo — https://deepwiki.com/FluidInference/FluidAudio
+See **[DeepWiki](https://deepwiki.com/FluidInference/FluidAudio)** for auto-generated docs for this repo.

-### MCP
+### Documentation Index
+
+- Guides
+  - [MCP](Documentation/Guides/MCP.md)
+  - [Audio Conversion for Inference](Documentation/Guides/AudioConversion.md)
+- Modules
+  - ASR: [Getting Started](Documentation/ASR/GettingStarted.md)
+  - ASR: [Last Chunk Handling](Documentation/ASR/LastChunkHandling.md)
+  - Diarization: [Speaker Diarization Guide](Documentation/SpeakerDiarization.md)
+  - VAD: [Getting Started](Documentation/VAD/GettingStarted.md)
+- API
+  - [API Reference](Documentation/API.md)
+- CLI
+  - [Command Line Guide](Documentation/CLI.md)
+
+### MCP Server

 The repo is indexed by DeepWiki MCP server, so your coding tool can access the docs:

@@ -71,27 +84,6 @@ For claude code:
 claude mcp add -s user -t http deepwiki https://mcp.deepwiki.com/mcp
 ```

-### Audio Conversion (16 kHz mono)
-
-Most features expect 16 kHz mono Float32 samples. Use `AudioConverter` to load and convert from any `AVAudioFile` format:
-
-```swift
-import AVFoundation
-import FluidAudio
-
-func loadSamples16kMono(path: String) async throws -> [Float] {
-    let url = URL(fileURLWithPath: path)
-    let file = try AVAudioFile(forReading: url)
-    let capacity = AVAudioFrameCount(file.length)
-    guard let buf = AVAudioPCMBuffer(pcmFormat: file.processingFormat, frameCapacity: capacity) else {
-        return []
-    }
-    try file.read(into: buf)
-    let converter = AudioConverter()
-    return try await converter.convertToAsrFormat(buf)
-}
-```
-
 ## Automatic Speech Recognition (ASR) / Transcription

 - **Model**: `FluidInference/parakeet-tdt-0.6b-v3-coreml`
@@ -101,7 +93,7 @@ func loadSamples16kMono(path: String) async throws -> [Float] {
 - **Streaming Support**: Coming soon — batch processing is recommended for production use
 - **Backend**: Same Parakeet TDT v3 model powers our backend ASR

-### Quick Start (Code)
+### ASR Quick Start

 ```swift
 import FluidAudio
@@ -123,24 +115,9 @@ Task {
 }
 ```

-### CLI
-
 ```bash
 # Transcribe an audio file (batch)
 swift run fluidaudio transcribe audio.wav
-
-# Transcribe multiple files in parallel
-swift run fluidaudio multi-stream audio1.wav audio2.wav
-
-# Benchmark ASR on LibriSpeech
-swift run fluidaudio asr-benchmark --subset test-clean --num-files 50
-
-# Multilingual ASR (FLEURS) benchmark
-swift run fluidaudio fleurs-benchmark --languages en_us,fr_fr --samples 10
-
-# Download LibriSpeech test sets
-swift run fluidaudio download --dataset librispeech-test-clean
-swift run fluidaudio download --dataset librispeech-test-other
 ```

 ## Speaker Diarization
@@ -151,20 +128,7 @@ swift run fluidaudio download --dataset librispeech-test-other
 - **JER: 28.0%** — Outperforms EEND 2019 (25.3%) and x-vector clustering (28.7%)
 - **RTF: 0.02x** — Real-time processing with 50x speedup

-```text
-RTF = Processing Time / Audio Duration
-
-With RTF = 0.02x:
- 1 minute of audio takes 0.02 × 60 = 1.2 seconds to process
- 10 minutes of audio takes 0.02 × 600 = 12 seconds to process
-
-For real-time speech-to-text:
- Latency: ~1.2 seconds per minute of audio
- Throughput: Can process 50x faster than real-time
- Pipeline impact: Minimal — diarization won't be the bottleneck
-```
-
-### Quick Start (Code)
+### Speaker Diarization Quick Start

 ```swift
 import FluidAudio
@@ -186,114 +150,18 @@ Task {
 }
 ```

-### Streaming Diarization
+For streaming diarization, see [Documentation/SpeakerDiarization.md](Documentation/SpeakerDiarization.md).

-Stream meeting audio in chunks while maintaining consistent speaker IDs across the session. Keep a single `DiarizerManager` alive, process fixed-size chunks, and rebase segment timestamps by the chunk’s start offset. Overlap helps reduce boundary errors and enables overlap speech handling.
-
-```swift
-import FluidAudio
-
-Task {
-    // 1) Initialize diarizer once (models are reused across chunks)
-    let models = try await DiarizerModels.downloadIfNeeded()
-    let config = DiarizerConfig(
-        clusteringThreshold: 0.7,
-        minSpeechDuration: 1.0,
-        minSilenceGap: 0.5,
-        minActiveFramesCount: 10.0,
-        chunkDuration: 10.0,   // model window; also used for chunk sizing below
-        chunkOverlap: 5.0,     // optional overlap
-        debugMode: false
-    )
-    let diarizer = DiarizerManager(config: config)
-    diarizer.initialize(models: models)
-
-    // Optional: tune streaming behavior (assignment/update thresholds)
-    diarizer.speakerManager.speakerThreshold = 0.84   // assign to existing speakers
-    diarizer.speakerManager.embeddingThreshold = 0.56  // update embeddings over time
-
-    // 2) Prepare 16 kHz mono samples (see: Audio Conversion)
-    let samples = try await loadSamples16kMono(path: "path/to/meeting.wav")
-
-    // 3) Chunked streaming loop with timestamp rebasing
-    let sr = 16000.0
-    let chunkSeconds = 10.0
-    let overlapSeconds = 5.0
-    let chunkSize = Int(chunkSeconds * sr)
-    let hop = Int(max(1.0, chunkSeconds - overlapSeconds) * sr)
-
-    var position = 0
-    var segments: [TimedSpeakerSegment] = []
-
-    while position < samples.count {
-        let end = min(position + chunkSize, samples.count)
-        var chunk = Array(samples[position..<end])
-        if chunk.count < chunkSize {
-            // Pad final chunk to model’s expected window
-            chunk += [Float](repeating: 0, count: chunkSize - chunk.count)
-        }
-
-        // Run diarization on this chunk
-        let result = try diarizer.performCompleteDiarization(chunk)
-
-        // Rebase chunk-relative times to stream time
-        let offsetSec = Float(position) / Float(sr)
-        for seg in result.segments {
-            segments.append(
-                TimedSpeakerSegment(
-                    speakerId: seg.speakerId,
-                    embedding: seg.embedding,
-                    startTimeSeconds: seg.startTimeSeconds + offsetSec,
-                    endTimeSeconds: seg.endTimeSeconds + offsetSec,
-                    qualityScore: seg.qualityScore
-                )
-            )
-        }
-
-        position += hop
-    }
-
-    // Optional: merge or deduplicate segments across overlaps/boundaries.
-    // diarizer.speakerManager preserves consistent IDs across all chunks.
-}
-```
-
-CLI equivalents:
 ```bash
-# Real-time-ish streaming benchmark (~3s chunks with 2s overlap)
 swift run fluidaudio diarization-benchmark --single-file ES2004a \
  --chunk-seconds 3 --overlap-seconds 2
-
-# Balanced throughput/quality (~10s chunks with 5s overlap)
-swift run fluidaudio diarization-benchmark --dataset ami-sdm \
-  --chunk-seconds 10 --overlap-seconds 5
 ```

-Notes:
- Keep one `DiarizerManager` instance per stream so `SpeakerManager` maintains ID consistency.
- Always rebase per-chunk timestamps by `(chunkStartSample / sampleRate)`.
- Provide 16 kHz mono Float32 samples; pad final chunk to the model window.
- Tune `speakerThreshold` and `embeddingThreshold` to trade off ID stability vs. sensitivity.
-
-**Speaker Enrollment:** The `Speaker` class includes a `name` field for enrollment workflows. When users introduce themselves ("My name is Alice"), update the speaker's name from the default (e.g. "Speaker_1") to enable personalized identification.
-
 ### CLI

 ```bash
-# Run AMI benchmark (auto-download dataset)
-swift run fluidaudio diarization-benchmark --auto-download
-
-# Tune threshold and save results
-swift run fluidaudio diarization-benchmark --threshold 0.7 --output results.json
-
-# Quick test on a single AMI file
-swift run fluidaudio diarization-benchmark --single-file ES2004a --threshold 0.8
-
 # Process an individual file and save JSON
 swift run fluidaudio process meeting.wav --output results.json --threshold 0.6
-
-# Download AMI dataset
-swift run fluidaudio download --dataset ami-sdm
 ```

 ## Voice Activity Detection (VAD)
@@ -302,7 +170,7 @@ The current VAD APIs require careful tuning for your specific use case. If you n

 Our goal is to provide a streamlined API similar to Apple's upcoming SpeechDetector in [OS26](https://developer.apple.com/documentation/speech/speechdetector).

-### Quick Start (Code)
+### VAD Quick Start

 ```swift
 import FluidAudio
@@ -334,20 +202,12 @@ Task {
 }
 ```

-### CLI
-
 ```bash
 # Run VAD benchmark (mini50 dataset by default)
 swift run fluidaudio vad-benchmark --num-files 50 --threshold 0.3
-
-# Save results and enable debug output
-swift run fluidaudio vad-benchmark --all-files --output vad_results.json --debug
-
-# Download VAD dataset if needed
-swift run fluidaudio download --dataset vad
 ```

-## Showcase 
+## Showcase

 Make a PR if you want to add your app!

@@ -358,74 +218,19 @@ Make a PR if you want to add your app!
 | **[Slipbox](https://slipbox.ai/)** | Privacy-first meeting assistant for real-time conversation intelligence. Uses Parakeet ASR (iOS) and speaker diarization across platforms. |
 | **[Whisper Mate](https://whisper.marksdo.com)** | Transcribes movies and audio locally; records and transcribes in real time from speakers or system apps. Uses speaker diarization. |

-
-## API Reference
-
-**Diarization:**
-
- `DiarizerManager`: Main diarization class
- `performCompleteDiarization(_:sampleRate:)`: Process audio and return speaker segments
-  - Accepts any `RandomAccessCollection<Float>` (Array, ArraySlice, ContiguousArray, etc.)
- `compareSpeakers(audio1:audio2:)`: Compare similarity between two audio samples
- `validateAudio(_:)`: Validate audio quality and characteristics
-
-**Voice Activity Detection:**
-
- `VadManager`: Voice activity detection with CoreML models
- `VadConfig`: Configuration for VAD processing with adaptive thresholding
- `processChunk(_:)`: Process a single audio chunk and detect voice activity
- `processAudioFile(_:)`: Process complete audio file in chunks
- `VadAudioProcessor`: Advanced audio processing with SNR filtering
-
-**Automatic Speech Recognition:**
-
- `AsrManager`: Main ASR class with TDT decoding for batch processing
- `AsrModels`: Model loading and management with automatic downloads
- `ASRConfig`: Configuration for ASR processing
- `transcribe(_:source:)`: Process complete audio and return transcription results
- `AudioProcessor.loadAudioFile(path:)`: Load and convert audio files to required format
- `AudioSource`: Enum for microphone vs system audio separation
-  
 ## Everything Else

-### Platform & Networking Notes
+### FAQs

 - CLI is available on macOS only. For iOS, use the library programmatically.
 - Models auto-download on first use. If your network restricts Hugging Face access, set an HTTPS proxy: `export https_proxy=http://127.0.0.1:7890`.
 - Windows alternative in development: [fluid-server](https://github.com/FluidInference/fluid-server)
-
-If you're looking to get the system audio on a Mac, take a look at this repo for reference [AudioCap](https://github.com/insidegui/AudioCap/tree/main)
+- If you're looking to capture system audio on a Mac, take a look at [AudioCap](https://github.com/insidegui/AudioCap/tree/main) for reference.

 ### License

 Apache 2.0 — see `LICENSE` for details.

-### Contributing
-
-This project uses `swift-format` to maintain consistent code style. All pull requests are automatically checked for formatting compliance.
-
-**Local Development:**
-
-```bash
-# Format all code (requires Swift 6+ for contributors only)
-# Users of the library don't need Swift 6
-swift format --in-place --recursive --configuration .swift-format Sources/ Tests/ Examples/
-
-# Check formatting without modifying
-swift format lint --recursive --configuration .swift-format Sources/ Tests/ Examples/
-
-# For Swift <6, install swift-format separately:
-# git clone https://github.com/apple/swift-format
-# cd swift-format && swift build -c release
-# cp .build/release/swift-format /usr/local/bin/
-```
-
-**Automatic Checks:**
-
- PRs will fail if code is not properly formatted
- GitHub Actions runs formatting checks on all Swift file changes
- See `.swift-format` for style configuration
-
 ### Acknowledgments

 This project builds upon the excellent work of the [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx) project for speaker diarization algorithms and techniques.
@@ -245,8 +245,6 @@ public final class DiarizerManager {
            throw DiarizerError.notInitialized
        }

-        logger.debug("Using EmbeddingExtractor for embedding extraction")
-
        var masks: [[Float]] = []
        let numSpeakers = slidingFeature.data[0][0].count
        let numFrames = slidingFeature.data[0].count
@@ -151,10 +151,6 @@ public class SpeakerManager {
                    segmentId: UUID(),
                    alpha: 0.9
                )
-
-                logger.debug(
-                    "Updated embedding for \(speakerId), update count: \(speaker.updateCount), raw count: \(speaker.rawEmbeddings.count)"
-                )
            }
        } else {
            // Just update duration if not updating embedding
@@ -89,21 +89,14 @@ struct AMIParser {
        ]

        // Add comprehensive debug logging for path resolution
-        logger.debug("🔍 DEBUG: Searching for AMI annotations in \(possiblePaths.count) locations:")
-        logger.debug("   Current working directory: \(FileManager.default.currentDirectoryPath)")
-
        var amiDir: URL?
-        for (index, path) in possiblePaths.enumerated() {
+        for (_, path) in possiblePaths.enumerated() {
            let segmentsDir = path.appendingPathComponent("segments")
            let meetingsFile = path.appendingPathComponent("corpusResources/meetings.xml")

            let segmentsExists = FileManager.default.fileExists(atPath: segmentsDir.path)
            let meetingsExists = FileManager.default.fileExists(atPath: meetingsFile.path)

-            logger.debug("   \(index + 1). \(path.path)")
-            logger.debug("      - segments/: \(segmentsExists ? "✅" : "❌") (\(segmentsDir.path))")
-            logger.debug("      - meetings.xml: \(meetingsExists ? "✅" : "❌") (\(meetingsFile.path))")
-
            if segmentsExists && meetingsExists {
                logger.info("      - 🎯 SELECTED: This path will be used")
                amiDir = path