From f8907acbc7c3fd5c7c51c6991d666f0e05ade66b Mon Sep 17 00:00:00 2001
From: Alex <hanweng9@gmail.com>
Date: Sat, 21 Mar 2026 22:48:10 -0400
Subject: [PATCH] Add Qwen3 ASR audio encoder ANE optimization (#410)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

- Documents Conv2d + einsum rewrite of Qwen3 ASR audio encoder for 100%
ANE scheduling
- Encoder speedup: **1.53x** on M4 Max (11.61ms → 7.60ms median, 100
iterations)
- Validated on 10 LibriSpeech test-clean files: 9/10 identical
transcriptions, no quality regression
- Decoder stays on GPU (T=1 autoregressive with KV cache — same finding
as PocketTTS)

### Architecture changes (in mobius research repo)
- `nn.Linear` → `nn.Conv2d(kernel_size=1)` for all projections
- `(B, C, 1, S)` tensor layout for ANE-friendly data access
- Per-head einsum attention with 14 heads × 64 channels
- Manual LayerNorm on channel dimension

### Benchmark results (M4 Max)

| Metric | Original (GPU+ANE) | ANE 100% |
|--------|-------------------|----------|
| Median | 11.61 ms | 7.60 ms |
| P95 | 16.79 ms | 9.51 ms |
| Min | 9.74 ms | 6.84 ms |

## Test plan

- [x] Encoder inference benchmark (100 iterations, M4 Max)
- [x] Numerical verification (max diff 2.61e-07)
- [x] End-to-end LibriSpeech test-clean validation (10 files, WER
parity)
- [ ] Test on other Apple Silicon (M1/M2/M3)
<!-- devin-review-badge-begin -->

---

<a href="https://app.devin.ai/review/fluidinference/fluidaudio/pull/410"
target="_blank">
  <picture>
<source media="(prefers-color-scheme: dark)"
srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1">
<img
src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1"
alt="Open with Devin">
  </picture>
</a>
<!-- devin-review-badge-end -->
---
 Documentation/{ => ASR}/Qwen3-ASR.md          | 30 +++++++++++++++++--
 .../FluidAudio/ASR/Qwen3/Qwen3AsrModels.swift |  4 +--
 Sources/FluidAudio/ModelNames.swift           |  2 +-
 3 files changed, 31 insertions(+), 5 deletions(-)
 rename Documentation/{ => ASR}/Qwen3-ASR.md (76%)
diff --git a/Documentation/Qwen3-ASR.md b/Documentation/ASR/Qwen3-ASR.md
similarity index 76%
rename from Documentation/Qwen3-ASR.md
rename to Documentation/ASR/Qwen3-ASR.md
index 51ae8e81..bbcf66d4 100644
--- a/Documentation/Qwen3-ASR.md
+++ b/Documentation/ASR/Qwen3-ASR.md
@@ -10,7 +10,7 @@ Encoder-decoder automatic speech recognition using [Qwen3-ASR-0.6B](https://hugg
 
 **CoreML Model**: [FluidInference/qwen3-asr-0.6b-coreml](https://huggingface.co/FluidInference/qwen3-asr-0.6b-coreml)
 
-Only the **f32** variant is recommended. See [Why not int8?](#why-not-int8) below.
+Both **f32** and **int8** variants use the v2 ANE-optimized audio encoder. See [Model Variants](#model-variants) for RAM/speed tradeoffs and [Why not int8?](#why-not-int8) for decoder quantization details.
 
 ## Architecture
 
@@ -87,11 +87,37 @@ See [Benchmarks.md](Benchmarks.md#qwen3-asr-experimental) for performance result
 
 | Component | Description |
 |-----------|-------------|
-| `qwen3_asr_audio_encoder.mlmodelc` | Audio feature extraction |
+| `qwen3_asr_audio_encoder_v2.mlmodelc` | Audio feature extraction (ANE-optimized, Conv2d + einsum) |
 | `qwen3_asr_decoder_stateful.mlmodelc` | Autoregressive decoder with KV-cache |
 | `qwen3_asr_embeddings.bin` | Token embedding weights (float16) |
 | `vocab.json` | Tokenizer vocabulary (151,936 tokens) |
 
+The original `qwen3_asr_audio_encoder.mlmodelc` (v1) is still available on HuggingFace for backward compatibility.
+
+## Model Variants
+
+Benchmarked on 10 LibriSpeech test-clean files (~70s total audio), M4 Max:
+
+| Variant | Encoder RAM | Decoder RAM | Embeds RAM | **Total RAM** | Overall RTFx | WER |
+|---------|------------|-------------|-----------|-----------|-------------|-----|
+| f32 (v2 encoder) | 100 MB | 988 MB | 391 MB | ~1480 MB | 3.1x | 0.7% |
+| int8 (v2 encoder) | 100 MB | 330 MB | 296 MB | **728 MB** | 2.8x | 0.7% |
+
+Both variants produce identical transcriptions. The v2 encoder reduces encoder RAM from ~400 MB to 100 MB via a Conv2d + einsum rewrite with fp16 precision.
+
+### V2 Audio Encoder (ANE-Optimized)
+
+The v2 encoder rewrites the 18 transformer layers for 100% Apple Neural Engine scheduling:
+
+- `nn.Linear` → `nn.Conv2d(kernel_size=1)` for all projections
+- Tensor layout changed from `(B, S, C)` to `(B, C, 1, S)`
+- Per-head einsum attention (14 heads × 64 channels)
+- Manual LayerNorm on channel dimension
+
+**Isolated encoder speedup:** 1.53x on M4 Max (11.61ms → 7.60ms median). End-to-end pipeline improvement is modest since the 28-layer autoregressive decoder dominates total inference time.
+
+Int8 quantization of the v2 encoder was tested: same WER, same RTFx, same RAM (encoder is already small at 100 MB). The only benefit is a download roughly half the size (179 MB vs 356 MB on disk).
+
 ## CoreML Limitations
 
 This CoreML implementation differs from the original PyTorch in ways that may affect accuracy:
diff --git a/Sources/FluidAudio/ASR/Qwen3/Qwen3AsrModels.swift b/Sources/FluidAudio/ASR/Qwen3/Qwen3AsrModels.swift
index 09186f37..348eef5b 100644
--- a/Sources/FluidAudio/ASR/Qwen3/Qwen3AsrModels.swift
+++ b/Sources/FluidAudio/ASR/Qwen3/Qwen3AsrModels.swift
@@ -43,7 +43,7 @@ public struct Qwen3AsrModels: Sendable {
     /// Expected directory structure:
     /// ```
     /// qwen3-asr/
-    ///   qwen3_asr_audio_encoder.mlmodelc
+    ///   qwen3_asr_audio_encoder_v2.mlmodelc
     ///   qwen3_asr_decoder_stateful.mlmodelc
     ///   qwen3_asr_embeddings.bin  (float16 embedding weights)
     ///   vocab.json
@@ -60,7 +60,7 @@ public struct Qwen3AsrModels: Sendable {
 
         // Load audio encoder
         let audioEncoder = try await loadModel(
-            named: "qwen3_asr_audio_encoder",
+            named: "qwen3_asr_audio_encoder_v2",
             from: directory,
             configuration: modelConfig
         )
diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift
index 83fbeb66..05160cbf 100644
--- a/Sources/FluidAudio/ModelNames.swift
+++ b/Sources/FluidAudio/ModelNames.swift
@@ -366,7 +366,7 @@ public enum ModelNames {
 
     /// Qwen3-ASR model names
     public enum Qwen3ASR {
-        public static let audioEncoderFile = "qwen3_asr_audio_encoder.mlmodelc"
+        public static let audioEncoderFile = "qwen3_asr_audio_encoder_v2.mlmodelc"
         public static let embeddingFile = "qwen3_asr_embedding.mlmodelc"
         public static let decoderStatefulFile = "qwen3_asr_decoder_stateful.mlmodelc"
         public static let decoderFullFile = "qwen3_asr_decoder_full.mlmodelc"