From f8907acbc7c3fd5c7c51c6991d666f0e05ade66b Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 21 Mar 2026 22:48:10 -0400 Subject: [PATCH] Add Qwen3 ASR audio encoder ANE optimization (#410) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Documents Conv2d + einsum rewrite of Qwen3 ASR audio encoder for 100% ANE scheduling - Encoder speedup: **1.53x** on M4 Max (11.61ms → 7.60ms median, 100 iterations) - Validated on 10 LibriSpeech test-clean files: 9/10 identical transcriptions, no quality regression - Decoder stays on GPU (T=1 autoregressive with KV cache — same finding as PocketTTS) ### Architecture changes (in mobius research repo) - `nn.Linear` → `nn.Conv2d(kernel_size=1)` for all projections - `(B, C, 1, S)` tensor layout for ANE-friendly data access - Per-head einsum attention with 14 heads × 64 channels - Manual LayerNorm on channel dimension ### Benchmark results (M4 Max) | Metric | Original (GPU+ANE) | ANE 100% | |--------|-------------------|----------| | Median | 11.61 ms | 7.60 ms | | P95 | 16.79 ms | 9.51 ms | | Min | 9.74 ms | 6.84 ms | ## Test plan - [x] Encoder inference benchmark (100 iterations, M4 Max) - [x] Numerical verification (max diff 2.61e-07) - [x] End-to-end LibriSpeech test-clean validation (10 files, WER parity) - [ ] Test on other Apple Silicon (M1/M2/M3) --- Open with Devin --- Documentation/{ => ASR}/Qwen3-ASR.md | 30 +++++++++++++++++-- .../FluidAudio/ASR/Qwen3/Qwen3AsrModels.swift | 4 +-- Sources/FluidAudio/ModelNames.swift | 2 +- 3 files changed, 31 insertions(+), 5 deletions(-) rename Documentation/{ => ASR}/Qwen3-ASR.md (76%) diff --git a/Documentation/Qwen3-ASR.md b/Documentation/ASR/Qwen3-ASR.md similarity index 76% rename from Documentation/Qwen3-ASR.md rename to Documentation/ASR/Qwen3-ASR.md index 51ae8e81..bbcf66d4 100644 --- a/Documentation/Qwen3-ASR.md +++ b/Documentation/ASR/Qwen3-ASR.md @@ -10,7 +10,7 @@ Encoder-decoder automatic speech recognition using 
[Qwen3-ASR-0.6B](https://hugg **CoreML Model**: [FluidInference/qwen3-asr-0.6b-coreml](https://huggingface.co/FluidInference/qwen3-asr-0.6b-coreml) -Only the **f32** variant is recommended. See [Why not int8?](#why-not-int8) below. +Both **f32** and **int8** variants use the v2 ANE-optimized audio encoder. See [Model Variants](#model-variants) for RAM/speed tradeoffs and [Why not int8?](#why-not-int8) for decoder quantization details. ## Architecture @@ -87,11 +87,37 @@ See [Benchmarks.md](Benchmarks.md#qwen3-asr-experimental) for performance result | Component | Description | |-----------|-------------| -| `qwen3_asr_audio_encoder.mlmodelc` | Audio feature extraction | +| `qwen3_asr_audio_encoder_v2.mlmodelc` | Audio feature extraction (ANE-optimized, Conv2d + einsum) | | `qwen3_asr_decoder_stateful.mlmodelc` | Autoregressive decoder with KV-cache | | `qwen3_asr_embeddings.bin` | Token embedding weights (float16) | | `vocab.json` | Tokenizer vocabulary (151,936 tokens) | +The original `qwen3_asr_audio_encoder.mlmodelc` (v1) is still available on HuggingFace for backward compatibility. + +## Model Variants + +Benchmarked on 10 LibriSpeech test-clean files (~70s total audio), M4 Max: + +| Variant | Encoder RAM | Decoder RAM | Embeds RAM | **Total RAM** | Overall RTFx | WER | +|---------|------------|-------------|-----------|-----------|-------------|-----| +| f32 (v2 encoder) | 100 MB | 988 MB | 391 MB | ~1480 MB | 3.1x | 0.7% | +| int8 (v2 encoder) | 100 MB | 330 MB | 296 MB | **728 MB** | 2.8x | 0.7% | + +All variants produce identical transcriptions. The v2 encoder reduces encoder RAM from ~400 MB to 100 MB via Conv2d + einsum rewrite with fp16 precision. 
+ +### V2 Audio Encoder (ANE-Optimized) + +The v2 encoder rewrites the 18 transformer layers for 100% Apple Neural Engine scheduling: + +- `nn.Linear` → `nn.Conv2d(kernel_size=1)` for all projections +- Tensor layout changed from `(B, S, C)` to `(B, C, 1, S)` +- Per-head einsum attention (14 heads × 64 channels) +- Manual LayerNorm on channel dimension + +**Isolated encoder speedup:** 1.53x on M4 Max (11.61ms → 7.60ms median). The end-to-end pipeline improvement is modest since the 28-layer autoregressive decoder dominates total inference time. + +Int8 quantization of the v2 encoder was tested: same WER, same RTFx, same RAM (encoder is already small at 100 MB). The only benefit is half the download size (179 MB vs 356 MB on disk). + ## CoreML Limitations This CoreML implementation differs from the original PyTorch in ways that may affect accuracy: diff --git a/Sources/FluidAudio/ASR/Qwen3/Qwen3AsrModels.swift b/Sources/FluidAudio/ASR/Qwen3/Qwen3AsrModels.swift index 09186f37..348eef5b 100644 --- a/Sources/FluidAudio/ASR/Qwen3/Qwen3AsrModels.swift +++ b/Sources/FluidAudio/ASR/Qwen3/Qwen3AsrModels.swift @@ -43,7 +43,7 @@ public struct Qwen3AsrModels: Sendable { /// Expected directory structure: /// ``` /// qwen3-asr/ - /// qwen3_asr_audio_encoder.mlmodelc + /// qwen3_asr_audio_encoder_v2.mlmodelc /// qwen3_asr_decoder_stateful.mlmodelc /// qwen3_asr_embeddings.bin (float16 embedding weights) /// vocab.json @@ -60,7 +60,7 @@ public struct Qwen3AsrModels: Sendable { // Load audio encoder let audioEncoder = try await loadModel( - named: "qwen3_asr_audio_encoder", + named: "qwen3_asr_audio_encoder_v2", from: directory, configuration: modelConfig ) diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 83fbeb66..05160cbf 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -366,7 +366,7 @@ public enum ModelNames { /// Qwen3-ASR model names public enum Qwen3ASR { - public static let audioEncoderFile = 
"qwen3_asr_audio_encoder.mlmodelc" + public static let audioEncoderFile = "qwen3_asr_audio_encoder_v2.mlmodelc" public static let embeddingFile = "qwen3_asr_embedding.mlmodelc" public static let decoderStatefulFile = "qwen3_asr_decoder_stateful.mlmodelc" public static let decoderFullFile = "qwen3_asr_decoder_full.mlmodelc"