fix(asr): add melChunkContext opt-out flag for Issue #594

PR #264 (commit 7459740a) added an 80ms (1 encoder frame, 1280 samples) mel-context prepend on non-first chunks to fix all-blank predictions at chunk boundaries on long English audio. On `parakeet-tdt-0.6b-v3-coreml` with non-English audio, that prepend shifts the FastConformer encoder's first-frame distribution just enough that the SOS-primed TDT decoder drifts back to its English-biased prior at every chunk seam.

Reproduction (4 fixtures, default vs --no-mel-context):
- notes_1408 (FR): drift -> clean
- wwii (FR): clean -> clean
- user_en (EN): clean -> clean
- user2 99.9s (FR): clean -> clean

Changes:
- ASRConfig gains `melChunkContext: Bool = true` (default preserves PR #264 behavior; set to false for non-English long-form batch).
- ChunkProcessor reads the flag and zeroes the prepend when disabled, expanding chunkSamples back so chunks aren't 80ms smaller than the encoder's max receptive window.
- `transcribe` and `asr-benchmark` CLIs accept `--no-mel-context`.

Closes #594
2026-05-12 20:20:36 +00:00 · 2026-05-12 01:34:33 -04:00
parent d9d06c731a
commit bfa14a1773
5 changed files with 73 additions and 12 deletions
@@ -23,6 +23,21 @@ public struct ASRConfig: Sendable {
    /// Default: 480,000 samples (~30 seconds at 16kHz)
    public let streamingThreshold: Int

+    /// Enable the 80ms (1 encoder frame) mel-context prepend on non-first
+    /// chunks in the long-form batch path. Added in PR #264 to fix
+    /// all-blank predictions at chunk boundaries on long English audio.
+    ///
+    /// Issue #594 root cause: on `parakeet-tdt-0.6b-v3-coreml` with
+    /// non-English audio, the 80ms prepend shifts the FastConformer
+    /// encoder's first-frame distribution just enough that the SOS-primed
+    /// TDT decoder drifts back to its English-biased prior. Disabling this
+    /// flag (`false`) restores clean French/multilingual transcription at
+    /// chunk boundaries while keeping parallel chunk processing.
+    ///
+    /// Default `true` preserves PR #264's blank-prediction fix on English.
+    /// Set to `false` for non-English long-form batch transcription.
+    public let melChunkContext: Bool
+
    public static let `default` = ASRConfig()

    public init(
@@ -31,7 +46,8 @@ public struct ASRConfig: Sendable {
        encoderHiddenSize: Int = ASRConstants.encoderHiddenSize,
        parallelChunkConcurrency: Int = 4,
        streamingEnabled: Bool = true,
-        streamingThreshold: Int = 480_000
+        streamingThreshold: Int = 480_000,
+        melChunkContext: Bool = true
    ) {
        self.sampleRate = sampleRate
        self.tdtConfig = tdtConfig
@@ -39,6 +55,7 @@ public struct ASRConfig: Sendable {
        self.parallelChunkConcurrency = max(1, parallelChunkConcurrency)
        self.streamingEnabled = streamingEnabled
        self.streamingThreshold = streamingThreshold
+        self.melChunkContext = melChunkContext
    }
 }

@@ -29,6 +29,13 @@ public actor AsrManager {
        config.parallelChunkConcurrency
    }

+    /// Issue #594: opt-out flag exposed to `ChunkProcessor`. When `false`,
+    /// disables PR #264's 80ms mel-context prepend so non-English audio
+    /// stops drifting at chunk boundaries.
+    internal var melChunkContext: Bool {
+        config.melChunkContext
+    }
+
    /// Cached vocabulary loaded once during initialization
    internal var vocabulary: [Int: String] = [:]
    #if DEBUG
@@ -26,24 +26,38 @@ struct ChunkProcessor {
    /// Context samples prepended from previous chunk for mel spectrogram stability (80ms = 1 encoder frame).
    /// The FastConformer encoder's depthwise convolutions need left context for stable output.
    /// Without this, the first frames of a chunk may produce features that cause all-blank predictions.
+    ///
+    /// Issue #594: on `parakeet-tdt-0.6b-v3-coreml` with non-English audio
+    /// this prepend shifts the encoder's first-frame distribution enough
+    /// to make the SOS-primed decoder drift to its English-biased prior.
+    /// Callers can opt out via `ASRConfig.melChunkContext = false` to
+    /// restore clean non-English transcription at chunk boundaries.
    private let melContextSamples: Int = ASRConstants.samplesPerEncoderFrame  // 1280 samples = 80ms

    private var maxModelSamples: Int { ASRConstants.maxModelSamples }

-    private var chunkSamples: Int {
-        // Reserve space for context samples that will be prepended to non-first chunks.
-        // This ensures chunkSamples + melContextSamples <= maxModelSamples.
-        let maxActualChunk = maxModelSamples - melContextSamples  // 240000 - 1280 = 238720
+    /// Effective per-chunk mel-context size based on the runtime flag.
+    private func effectiveMelContextSamples(melChunkContext: Bool) -> Int {
+        melChunkContext ? melContextSamples : 0
+    }
+
+    /// Frame-aligned chunk size: the encoder window minus the context prepend
+    /// (zero when context is disabled) and one mel hop, rounded down to a frame.
+    private func chunkSamples(melChunkContext: Bool) -> Int {
+        let reserved = effectiveMelContextSamples(melChunkContext: melChunkContext)
+        let maxActualChunk = maxModelSamples - reserved
        let raw = max(maxActualChunk - ASRConstants.melHopSize, ASRConstants.samplesPerEncoderFrame)
        return raw / ASRConstants.samplesPerEncoderFrame * ASRConstants.samplesPerEncoderFrame
    }
-    private var overlapSamples: Int {
+
+    private func overlapSamples(forChunkSamples chunkSamples: Int) -> Int {
        let requested = Int(overlapSeconds * Double(ASRConstants.sampleRate))
        let capped = min(requested, chunkSamples / 2)
        return capped / ASRConstants.samplesPerEncoderFrame * ASRConstants.samplesPerEncoderFrame
    }
-    private var strideSamples: Int {
-        let raw = max(chunkSamples - overlapSamples, ASRConstants.samplesPerEncoderFrame)
+
+    private func strideSamples(forChunkSamples chunkSamples: Int) -> Int {
+        let raw = max(chunkSamples - overlapSamples(forChunkSamples: chunkSamples), ASRConstants.samplesPerEncoderFrame)
        return raw / ASRConstants.samplesPerEncoderFrame * ASRConstants.samplesPerEncoderFrame
    }

@@ -68,6 +82,13 @@ struct ChunkProcessor {
        let workers = await makeWorkerPool(using: manager, count: requestedConcurrency) ?? [manager]
        let decoderLayers = await manager.decoderLayerCount
        let maxModelSamples = self.maxModelSamples
+        // Issue #594: opt-out of PR #264's 80ms mel-context prepend for
+        // non-English audio. When disabled, expand chunks to fill the
+        // encoder window (no prepended frames).
+        let melChunkContext = await manager.melChunkContext
+        let melContextSamples = effectiveMelContextSamples(melChunkContext: melChunkContext)
+        let chunkSamples = self.chunkSamples(melChunkContext: melChunkContext)
+        let strideSamples = self.strideSamples(forChunkSamples: chunkSamples)

        var chunkOutputs: [[TokenWindow]?] = []
        var availableWorkers = Array(workers.indices)
@@ -650,6 +650,7 @@ extension ASRBenchmark {
        var streamingChunkDuration = 10.0
        var useStreamingEou = false
        var modelVersion: AsrModelVersion = .v3  // Default to v3
+        var melChunkContext = true  // Issue #594: opt-out of PR #264's 80ms mel-context prepend

        // Check for help flag first
        if arguments.contains("--help") || arguments.contains("-h") {
@@ -717,6 +718,8 @@ extension ASRBenchmark {
                    }
                    i += 1
                }
+            case "--no-mel-context":
+                melChunkContext = false
            default:
                break
            }
@@ -743,6 +746,7 @@ extension ASRBenchmark {
        logger.info("   Auto-download: \(autoDownload ? "enabled" : "disabled")")
        logger.info("   Test streaming: \(testStreaming ? "enabled" : "disabled")")
        logger.info("   Streaming EOU: \(useStreamingEou ? "enabled" : "disabled")")
+        logger.info("   Mel chunk context (PR #264): \(melChunkContext ? "enabled" : "disabled")")
        if testStreaming {
            logger.info("   Chunk duration: \(streamingChunkDuration)s")
        }
@@ -764,7 +768,8 @@ extension ASRBenchmark {
        let tdtConfig = TdtConfig(blankId: modelVersion.blankId)
        let asrConfig = ASRConfig(
            tdtConfig: tdtConfig,
-            encoderHiddenSize: modelVersion.encoderHiddenSize
+            encoderHiddenSize: modelVersion.encoderHiddenSize,
+            melChunkContext: melChunkContext
        )

        let asrManager = AsrManager(config: asrConfig)
@@ -1035,6 +1040,7 @@ extension ASRBenchmark {
                --no-auto-download        Disable automatic dataset download
                --test-streaming          Enable streaming simulation mode
                --chunk-duration <secs>   Chunk duration for streaming mode (default: 0.1s, min: 1.0s)
+                --no-mel-context          Disable 80ms mel-context prepend (Issue #594; required for non-English long audio on v3)
                --help, -h               Show this help message

            Description:
@@ -216,6 +216,7 @@ enum TranscribeCommand {
        var parakeetVariant: StreamingModelVariant?
        var language: Language?
        var encoderPrecision: ParakeetEncoderPrecision = .int8
+        var melChunkContext = true

        // Parse options
        var i = 1
@@ -293,6 +294,11 @@ enum TranscribeCommand {
                    encoderPrecision = precision
                    i += 1
                }
+            case "--no-mel-context":
+                // Issue #594: opt-out of PR #264's 80ms mel-context prepend
+                // on non-first chunks. Restores clean transcription at chunk
+                // boundaries for non-English audio on parakeet-tdt-0.6b-v3.
+                melChunkContext = false
            default:
                logger.warning("Warning: Unknown option: \(arguments[i])")
            }
@@ -317,7 +323,8 @@ enum TranscribeCommand {
            await testBatchTranscription(
                audioFile: audioFile, showMetadata: showMetadata, wordTimestamps: wordTimestamps,
                outputJsonPath: outputJsonPath, modelVersion: modelVersion, customVocabPath: customVocabPath,
-                modelDir: modelDir, language: language, encoderPrecision: encoderPrecision)
+                modelDir: modelDir, language: language, encoderPrecision: encoderPrecision,
+                melChunkContext: melChunkContext)
        }
    }

@@ -325,7 +332,8 @@ enum TranscribeCommand {
    private static func testBatchTranscription(
        audioFile: String, showMetadata: Bool, wordTimestamps: Bool, outputJsonPath: String?,
        modelVersion: AsrModelVersion, customVocabPath: String?, modelDir: String? = nil,
-        language: Language? = nil, encoderPrecision: ParakeetEncoderPrecision = .int8
+        language: Language? = nil, encoderPrecision: ParakeetEncoderPrecision = .int8,
+        melChunkContext: Bool = true
    ) async {
        do {
            // Initialize ASR models
@@ -340,7 +348,8 @@ enum TranscribeCommand {
            let tdtConfig = TdtConfig(blankId: modelVersion.blankId)
            let asrConfig = ASRConfig(
                tdtConfig: tdtConfig,
-                encoderHiddenSize: modelVersion.encoderHiddenSize
+                encoderHiddenSize: modelVersion.encoderHiddenSize,
+                melChunkContext: melChunkContext
            )
            let asrManager = AsrManager(config: asrConfig)
            try await asrManager.loadModels(models)
@@ -895,6 +904,7 @@ enum TranscribeCommand {
                --model-dir <path>     Path to local model directory (skips download)
                --custom-vocab <file>  Apply vocabulary boosting using terms from file (batch mode only)
                --parakeet-variant <variant>  Use any Parakeet model via StreamingAsrManager protocol
+                --no-mel-context  Disable 80ms mel-context prepend (Issue #594; required for non-English long audio on v3)

            Streaming variants (for --parakeet-variant):
                parakeet-eou-160ms, parakeet-eou-320ms, parakeet-eou-1280ms,