fix: chunk boundary transcription loss due to missing mel context (#264)

## Summary
Fixes transcription truncation at chunk boundaries where valid speech
was being lost.

## Problem
Audio at certain chunk boundaries produced all-blank predictions. The
FastConformer encoder's depthwise convolutions require left context from
preceding audio to produce stable features for the first frames of each
chunk. Without this context, the encoder output for initial frames can
be unstable, causing the TDT decoder to predict silence.

## Solution
Prepend 80ms (1280 samples = 1 encoder frame) of context from the
overlap region to non-first chunks:
- Reserve space in the `chunkSamples` calculation so that a chunk plus its
prepended context stays within CoreML's 240,000-sample limit
- Use the existing `contextFrameAdjustment` parameter to tell the decoder to
skip the prepended context frames
- Context is drawn from the existing 2.0s overlap region (no additional
memory)
 
## Testing
- Verified on long-form audio (>2 minutes) that previously exhibited
truncation
- WER improved significantly on affected files
- Streaming mode unaffected (already handles left context correctly)
 
## Notes
This aligns batch-mode context handling with how streaming mode already
works.
This commit is contained in:
Dmitry Starkov
2026-01-23 20:42:23 +03:00
committed by GitHub
parent bb960032d0
commit 7459740a6b
+33 -9
View File
@@ -21,10 +21,17 @@ struct ChunkProcessor {
private let sampleRate: Int = 16000
private let overlapSeconds: Double = 2.0
/// Context samples prepended from previous chunk for mel spectrogram stability (80ms = 1 encoder frame).
/// The FastConformer encoder's depthwise convolutions need left context for stable output.
/// Without this, the first frames of a chunk may produce features that cause all-blank predictions.
private let melContextSamples: Int = ASRConstants.samplesPerEncoderFrame // 1280 samples = 80ms
private var maxModelSamples: Int { 240_000 } // CoreML encoder capacity (15 seconds)
/// Number of samples advanced from `chunkStart` to form one chunk (excluding any prepended context),
/// truncated to a whole multiple of `ASRConstants.samplesPerEncoderFrame`.
// NOTE(review): this view appears to interleave the removed and added lines of the diff, so `raw` is
// declared twice below; presumably the first declaration is the pre-change line — confirm against the repo.
private var chunkSamples: Int {
// Match CoreML reference chunk length (239,840 samples ≈ 14.99 s)
let raw = max(maxModelSamples - ASRConstants.melHopSize, ASRConstants.samplesPerEncoderFrame)
// Reserve space for context samples that will be prepended to non-first chunks.
// This ensures chunkSamples + melContextSamples <= maxModelSamples.
let maxActualChunk = maxModelSamples - melContextSamples // 240000 - 1280 = 238720
// Subtract one mel hop, but never go below a single encoder frame.
let raw = max(maxActualChunk - ASRConstants.melHopSize, ASRConstants.samplesPerEncoderFrame)
// Integer division followed by multiplication rounds down to a whole number of encoder frames.
return raw / ASRConstants.samplesPerEncoderFrame * ASRConstants.samplesPerEncoderFrame
}
private var overlapSamples: Int {
@@ -56,24 +63,30 @@ struct ChunkProcessor {
var chunkOutputs: [[TokenWindow]] = []
var chunkStart = 0
var chunkIndex = 0
var chunkDecoderState = TdtDecoderState.make()
while chunkStart < totalSamples {
let candidateEnd = chunkStart + chunkSamples
let isLastChunk = candidateEnd >= totalSamples
let chunkEnd = isLastChunk ? totalSamples : candidateEnd
let chunkLength = chunkEnd - chunkStart
if chunkLength <= 0 {
if chunkEnd <= chunkStart {
break
}
chunkDecoderState.reset()
let chunkSamplesArray = try readSamples(offset: chunkStart, count: chunkLength)
// For chunks after the first, prepend context samples from the overlap region.
// This provides left context for the mel spectrogram STFT window and encoder convolutions.
let contextSamples = chunkIndex > 0 ? melContextSamples : 0
let contextStart = chunkStart - contextSamples
let chunkLengthWithContext = chunkEnd - contextStart
let chunkSamplesArray = try readSamples(offset: contextStart, count: chunkLengthWithContext)
let (windowTokens, windowTimestamps, windowConfidences) = try await transcribeChunk(
samples: chunkSamplesArray,
contextSamples: contextSamples,
chunkStart: chunkStart,
isLastChunk: isLastChunk,
using: manager,
@@ -90,6 +103,8 @@ struct ChunkProcessor {
}
chunkOutputs.append(windowData)
chunkIndex += 1
if isLastChunk {
break
}
@@ -147,6 +162,7 @@ struct ChunkProcessor {
private func transcribeChunk(
samples: [Float],
contextSamples: Int,
chunkStart: Int,
isLastChunk: Bool,
using manager: AsrManager,
@@ -155,15 +171,23 @@ struct ChunkProcessor {
guard !samples.isEmpty else { return ([], [], []) }
let paddedChunk = manager.padAudioIfNeeded(samples, targetLength: maxModelSamples)
let actualFrameCount = ASRConstants.calculateEncoderFrames(from: samples.count)
// Calculate frame count for the ACTUAL audio (excluding prepended context)
let actualAudioSamples = samples.count - contextSamples
let actualFrameCount = ASRConstants.calculateEncoderFrames(from: actualAudioSamples)
// Global frame offset is based on original chunkStart (not context-adjusted start)
let globalFrameOffset = chunkStart / ASRConstants.samplesPerEncoderFrame
// Context frame adjustment tells decoder to skip the prepended context frames
let contextFrames = contextSamples / ASRConstants.samplesPerEncoderFrame
let (hypothesis, encoderSequenceLength) = try await manager.executeMLInferenceWithTimings(
paddedChunk,
originalLength: samples.count,
actualAudioFrames: actualFrameCount,
originalLength: samples.count, // Full length including context
actualAudioFrames: actualFrameCount, // Only actual audio frames (excluding context)
decoderState: &decoderState,
contextFrameAdjustment: 0,
contextFrameAdjustment: contextFrames, // Skip context frames in decoder
isLastChunk: isLastChunk,
globalFrameOffset: globalFrameOffset
)