fix(asr): add melChunkContext opt-out flag for Issue #594

PR #264 (commit 7459740a) added an 80ms (1 encoder frame, 1280 samples)
mel-context prepend on non-first chunks to fix all-blank predictions at
chunk boundaries on long English audio. On `parakeet-tdt-0.6b-v3-coreml`
with non-English audio, that prepend shifts the FastConformer encoder's
first-frame distribution just enough that the SOS-primed TDT decoder
drifts back to its English-biased prior at every chunk seam.

Reproduction (4 fixtures; each result is shown as default -> --no-mel-context):
  - notes_1408 (FR):  drift -> clean
  - wwii (FR):        clean -> clean
  - user_en (EN):     clean -> clean
  - user2 99.9s (FR): clean -> clean

Changes:
  - ASRConfig gains `melChunkContext: Bool = true` (default preserves
    PR #264 behavior; set to false for non-English long-form batch).
  - ChunkProcessor reads the flag and zeroes the prepend when disabled,
    giving the 80ms that was reserved for the prepend back to
    chunkSamples, so chunks are sized from the full maxModelSamples window.
  - `transcribe` and `asr-benchmark` CLIs accept `--no-mel-context`.

Closes #594
This commit is contained in:
Alex-Wengg
2026-05-12 01:34:33 -04:00
parent d9d06c731a
commit bfa14a1773
5 changed files with 73 additions and 12 deletions
+18 -1
View File
@@ -23,6 +23,21 @@ public struct ASRConfig: Sendable {
/// Default: 480,000 samples (~30 seconds at 16kHz)
public let streamingThreshold: Int
/// Enable the 80ms (1 encoder frame) mel-context prepend on non-first
/// chunks in the long-form batch path. Added in PR #264 to fix
/// all-blank predictions at chunk boundaries on long English audio.
///
/// Issue #594 root cause: on `parakeet-tdt-0.6b-v3-coreml` with
/// non-English audio, the 80ms prepend shifts the FastConformer
/// encoder's first-frame distribution just enough that the SOS-primed
/// TDT decoder drifts back to its English-biased prior. Disabling this
/// flag (`false`) restores clean French/multilingual transcription at
/// chunk boundaries while keeping parallel chunk processing.
///
/// Default `true` preserves PR #264's blank-prediction fix on English.
/// Set to `false` for non-English long-form batch transcription.
public let melChunkContext: Bool
public static let `default` = ASRConfig()
public init(
@@ -31,7 +46,8 @@ public struct ASRConfig: Sendable {
encoderHiddenSize: Int = ASRConstants.encoderHiddenSize,
parallelChunkConcurrency: Int = 4,
streamingEnabled: Bool = true,
streamingThreshold: Int = 480_000
streamingThreshold: Int = 480_000,
melChunkContext: Bool = true
) {
self.sampleRate = sampleRate
self.tdtConfig = tdtConfig
@@ -39,6 +55,7 @@ public struct ASRConfig: Sendable {
self.parallelChunkConcurrency = max(1, parallelChunkConcurrency)
self.streamingEnabled = streamingEnabled
self.streamingThreshold = streamingThreshold
self.melChunkContext = melChunkContext
}
}
@@ -29,6 +29,13 @@ public actor AsrManager {
config.parallelChunkConcurrency
}
/// Whether the 80ms mel-context prepend from PR #264 is enabled.
/// Forwards `config.melChunkContext` so `ChunkProcessor` can read it;
/// `false` is the Issue #594 opt-out.
internal var melChunkContext: Bool {
    get {
        return config.melChunkContext
    }
}
/// Cached vocabulary loaded once during initialization
internal var vocabulary: [Int: String] = [:]
#if DEBUG
@@ -26,24 +26,38 @@ struct ChunkProcessor {
/// Context samples prepended from previous chunk for mel spectrogram stability (80ms = 1 encoder frame).
/// The FastConformer encoder's depthwise convolutions need left context for stable output.
/// Without this, the first frames of a chunk may produce features that cause all-blank predictions.
///
/// Issue #594: on `parakeet-tdt-0.6b-v3-coreml` with non-English audio
/// this prepend shifts the encoder's first-frame distribution enough
/// to make the SOS-primed decoder drift to its English-biased prior.
/// Callers can opt out via `ASRConfig.melChunkContext = false` to
/// restore clean non-English transcription at chunk boundaries.
private let melContextSamples: Int = ASRConstants.samplesPerEncoderFrame // 1280 samples = 80ms
private var maxModelSamples: Int { ASRConstants.maxModelSamples }
private var chunkSamples: Int {
// Reserve space for context samples that will be prepended to non-first chunks.
// This ensures chunkSamples + melContextSamples <= maxModelSamples.
let maxActualChunk = maxModelSamples - melContextSamples // 240000 - 1280 = 238720
/// Number of context samples to reserve per chunk for the given flag value.
/// - Parameter melChunkContext: Runtime flag; `true` keeps the prepend.
/// - Returns: `melContextSamples` when enabled, otherwise `0`.
private func effectiveMelContextSamples(melChunkContext: Bool) -> Int {
    guard melChunkContext else {
        return 0
    }
    return melContextSamples
}
/// Chunk length in samples, rounded down to a whole number of encoder frames.
///
/// Starts from `maxModelSamples`, subtracts the reserved context (zero when
/// the flag is off) and one mel hop, and never drops below one encoder frame.
/// - Parameter melChunkContext: Runtime flag controlling the context reservation.
/// - Returns: A multiple of `ASRConstants.samplesPerEncoderFrame`.
private func chunkSamples(melChunkContext: Bool) -> Int {
    let frame = ASRConstants.samplesPerEncoderFrame
    let contextBudget = effectiveMelContextSamples(melChunkContext: melChunkContext)
    let usable = (maxModelSamples - contextBudget) - ASRConstants.melHopSize
    let clamped = max(usable, frame)
    return (clamped / frame) * frame
}
private var overlapSamples: Int {
/// Overlap length in samples for a given chunk size: `overlapSeconds`
/// converted at `ASRConstants.sampleRate`, capped at half the chunk, then
/// rounded down to a whole number of encoder frames.
private func overlapSamples(forChunkSamples chunkSamples: Int) -> Int {
    let frame = ASRConstants.samplesPerEncoderFrame
    let wanted = Int(overlapSeconds * Double(ASRConstants.sampleRate))
    let halfChunk = chunkSamples / 2
    let bounded = min(wanted, halfChunk)
    return (bounded / frame) * frame
}
private var strideSamples: Int {
let raw = max(chunkSamples - overlapSamples, ASRConstants.samplesPerEncoderFrame)
/// Stride in samples for a given chunk size: chunk length minus overlap,
/// at least one encoder frame, rounded down to a whole number of encoder frames.
private func strideSamples(forChunkSamples chunkSamples: Int) -> Int {
    let frame = ASRConstants.samplesPerEncoderFrame
    let overlap = overlapSamples(forChunkSamples: chunkSamples)
    let advance = max(chunkSamples - overlap, frame)
    return (advance / frame) * frame
}
@@ -68,6 +82,13 @@ struct ChunkProcessor {
let workers = await makeWorkerPool(using: manager, count: requestedConcurrency) ?? [manager]
let decoderLayers = await manager.decoderLayerCount
let maxModelSamples = self.maxModelSamples
// Issue #594: opt-out of PR #264's 80ms mel-context prepend for
// non-English audio. When disabled, expand chunks to fill the
// encoder window (no prepended frames).
let melChunkContext = await manager.melChunkContext
let melContextSamples = effectiveMelContextSamples(melChunkContext: melChunkContext)
let chunkSamples = self.chunkSamples(melChunkContext: melChunkContext)
let strideSamples = self.strideSamples(forChunkSamples: chunkSamples)
var chunkOutputs: [[TokenWindow]?] = []
var availableWorkers = Array(workers.indices)
@@ -650,6 +650,7 @@ extension ASRBenchmark {
var streamingChunkDuration = 10.0
var useStreamingEou = false
var modelVersion: AsrModelVersion = .v3 // Default to v3
var melChunkContext = true // Issue #594: opt-out of PR #264's 80ms mel-context prepend
// Check for help flag first
if arguments.contains("--help") || arguments.contains("-h") {
@@ -717,6 +718,8 @@ extension ASRBenchmark {
}
i += 1
}
case "--no-mel-context":
melChunkContext = false
default:
break
}
@@ -743,6 +746,7 @@ extension ASRBenchmark {
logger.info(" Auto-download: \(autoDownload ? "enabled" : "disabled")")
logger.info(" Test streaming: \(testStreaming ? "enabled" : "disabled")")
logger.info(" Streaming EOU: \(useStreamingEou ? "enabled" : "disabled")")
logger.info(" Mel chunk context (PR #264): \(melChunkContext ? "enabled" : "disabled")")
if testStreaming {
logger.info(" Chunk duration: \(streamingChunkDuration)s")
}
@@ -764,7 +768,8 @@ extension ASRBenchmark {
let tdtConfig = TdtConfig(blankId: modelVersion.blankId)
let asrConfig = ASRConfig(
tdtConfig: tdtConfig,
encoderHiddenSize: modelVersion.encoderHiddenSize
encoderHiddenSize: modelVersion.encoderHiddenSize,
melChunkContext: melChunkContext
)
let asrManager = AsrManager(config: asrConfig)
@@ -1035,6 +1040,7 @@ extension ASRBenchmark {
--no-auto-download Disable automatic dataset download
--test-streaming Enable streaming simulation mode
--chunk-duration <secs> Chunk duration for streaming mode (default: 0.1s, min: 1.0s)
--no-mel-context Disable 80ms mel-context prepend (Issue #594; required for non-English long audio on v3)
--help, -h Show this help message
Description:
@@ -216,6 +216,7 @@ enum TranscribeCommand {
var parakeetVariant: StreamingModelVariant?
var language: Language?
var encoderPrecision: ParakeetEncoderPrecision = .int8
var melChunkContext = true
// Parse options
var i = 1
@@ -293,6 +294,11 @@ enum TranscribeCommand {
encoderPrecision = precision
i += 1
}
case "--no-mel-context":
// Issue #594: opt-out of PR #264's 80ms mel-context prepend
// on non-first chunks. Restores clean transcription at chunk
// boundaries for non-English audio on parakeet-tdt-0.6b-v3.
melChunkContext = false
default:
logger.warning("Warning: Unknown option: \(arguments[i])")
}
@@ -317,7 +323,8 @@ enum TranscribeCommand {
await testBatchTranscription(
audioFile: audioFile, showMetadata: showMetadata, wordTimestamps: wordTimestamps,
outputJsonPath: outputJsonPath, modelVersion: modelVersion, customVocabPath: customVocabPath,
modelDir: modelDir, language: language, encoderPrecision: encoderPrecision)
modelDir: modelDir, language: language, encoderPrecision: encoderPrecision,
melChunkContext: melChunkContext)
}
}
@@ -325,7 +332,8 @@ enum TranscribeCommand {
private static func testBatchTranscription(
audioFile: String, showMetadata: Bool, wordTimestamps: Bool, outputJsonPath: String?,
modelVersion: AsrModelVersion, customVocabPath: String?, modelDir: String? = nil,
language: Language? = nil, encoderPrecision: ParakeetEncoderPrecision = .int8
language: Language? = nil, encoderPrecision: ParakeetEncoderPrecision = .int8,
melChunkContext: Bool = true
) async {
do {
// Initialize ASR models
@@ -340,7 +348,8 @@ enum TranscribeCommand {
let tdtConfig = TdtConfig(blankId: modelVersion.blankId)
let asrConfig = ASRConfig(
tdtConfig: tdtConfig,
encoderHiddenSize: modelVersion.encoderHiddenSize
encoderHiddenSize: modelVersion.encoderHiddenSize,
melChunkContext: melChunkContext
)
let asrManager = AsrManager(config: asrConfig)
try await asrManager.loadModels(models)
@@ -895,6 +904,7 @@ enum TranscribeCommand {
--model-dir <path> Path to local model directory (skips download)
--custom-vocab <file> Apply vocabulary boosting using terms from file (batch mode only)
--parakeet-variant <variant> Use any Parakeet model via StreamingAsrManager protocol
--no-mel-context Disable 80ms mel-context prepend (Issue #594; required for non-English long audio on v3)
Streaming variants (for --parakeet-variant):
parakeet-eou-160ms, parakeet-eou-320ms, parakeet-eou-1280ms,