mirror of
https://github.com/FluidInference/FluidAudio.git
synced 2026-05-12 20:20:36 +00:00
fix(asr): add melChunkContext opt-out flag for Issue #594
PR #264 (commit 7459740a) added an 80ms (1 encoder frame, 1280 samples)
mel-context prepend on non-first chunks to fix all-blank predictions at
chunk boundaries on long English audio. On `parakeet-tdt-0.6b-v3-coreml`
with non-English audio, that prepend shifts the FastConformer encoder's
first-frame distribution just enough that the SOS-primed TDT decoder
drifts back to its English-biased prior at every chunk seam.
Reproduction (4 fixtures; each result is shown as default -> --no-mel-context):
- notes_1408 (FR): drift -> clean
- wwii (FR): clean -> clean
- user_en (EN): clean -> clean
- user2 99.9s (FR): clean -> clean
Changes:
- ASRConfig gains `melChunkContext: Bool = true` (default preserves
PR #264 behavior; set to false for non-English long-form batch transcription).
- ChunkProcessor reads the flag and zeroes the prepend when disabled,
expanding chunkSamples back so chunks no longer give up 80ms (1280 samples)
of the encoder's maximum input window (maxModelSamples).
- `transcribe` and `asr-benchmark` CLIs accept `--no-mel-context`.
Closes #594
This commit is contained in:
@@ -23,6 +23,21 @@ public struct ASRConfig: Sendable {
|
||||
/// Default: 480,000 samples (~30 seconds at 16kHz)
|
||||
public let streamingThreshold: Int
|
||||
|
||||
/// Enable the 80ms (1 encoder frame) mel-context prepend on non-first
|
||||
/// chunks in the long-form batch path. Added in PR #264 to fix
|
||||
/// all-blank predictions at chunk boundaries on long English audio.
|
||||
///
|
||||
/// Issue #594 root cause: on `parakeet-tdt-0.6b-v3-coreml` with
|
||||
/// non-English audio, the 80ms prepend shifts the FastConformer
|
||||
/// encoder's first-frame distribution just enough that the SOS-primed
|
||||
/// TDT decoder drifts back to its English-biased prior. Disabling this
|
||||
/// flag (`false`) restores clean French/multilingual transcription at
|
||||
/// chunk boundaries while keeping parallel chunk processing.
|
||||
///
|
||||
/// Default `true` preserves PR #264's blank-prediction fix on English.
|
||||
/// Set to `false` for non-English long-form batch transcription.
|
||||
public let melChunkContext: Bool
|
||||
|
||||
public static let `default` = ASRConfig()
|
||||
|
||||
public init(
|
||||
@@ -31,7 +46,8 @@ public struct ASRConfig: Sendable {
|
||||
encoderHiddenSize: Int = ASRConstants.encoderHiddenSize,
|
||||
parallelChunkConcurrency: Int = 4,
|
||||
streamingEnabled: Bool = true,
|
||||
streamingThreshold: Int = 480_000
|
||||
streamingThreshold: Int = 480_000,
|
||||
melChunkContext: Bool = true
|
||||
) {
|
||||
self.sampleRate = sampleRate
|
||||
self.tdtConfig = tdtConfig
|
||||
@@ -39,6 +55,7 @@ public struct ASRConfig: Sendable {
|
||||
self.parallelChunkConcurrency = max(1, parallelChunkConcurrency)
|
||||
self.streamingEnabled = streamingEnabled
|
||||
self.streamingThreshold = streamingThreshold
|
||||
self.melChunkContext = melChunkContext
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -29,6 +29,13 @@ public actor AsrManager {
|
||||
config.parallelChunkConcurrency
|
||||
}
|
||||
|
||||
/// Issue #594: opt-out flag exposed to `ChunkProcessor`. When `false`,
|
||||
/// disables PR #264's 80ms mel-context prepend so non-English audio
|
||||
/// stops drifting at chunk boundaries.
|
||||
internal var melChunkContext: Bool {
|
||||
config.melChunkContext
|
||||
}
|
||||
|
||||
/// Cached vocabulary loaded once during initialization
|
||||
internal var vocabulary: [Int: String] = [:]
|
||||
#if DEBUG
|
||||
|
||||
@@ -26,24 +26,38 @@ struct ChunkProcessor {
|
||||
/// Context samples prepended from previous chunk for mel spectrogram stability (80ms = 1 encoder frame).
|
||||
/// The FastConformer encoder's depthwise convolutions need left context for stable output.
|
||||
/// Without this, the first frames of a chunk may produce features that cause all-blank predictions.
|
||||
///
|
||||
/// Issue #594: on `parakeet-tdt-0.6b-v3-coreml` with non-English audio
|
||||
/// this prepend shifts the encoder's first-frame distribution enough
|
||||
/// to make the SOS-primed decoder drift to its English-biased prior.
|
||||
/// Callers can opt out via `ASRConfig.melChunkContext = false` to
|
||||
/// restore clean non-English transcription at chunk boundaries.
|
||||
private let melContextSamples: Int = ASRConstants.samplesPerEncoderFrame // 1280 samples = 80ms
|
||||
|
||||
private var maxModelSamples: Int { ASRConstants.maxModelSamples }
|
||||
|
||||
private var chunkSamples: Int {
|
||||
// Reserve space for context samples that will be prepended to non-first chunks.
|
||||
// This ensures chunkSamples + melContextSamples <= maxModelSamples.
|
||||
let maxActualChunk = maxModelSamples - melContextSamples // 240000 - 1280 = 238720
|
||||
/// Effective per-chunk mel-context size based on the runtime flag.
|
||||
private func effectiveMelContextSamples(melChunkContext: Bool) -> Int {
|
||||
melChunkContext ? melContextSamples : 0
|
||||
}
|
||||
|
||||
/// Frame-aligned chunk size that reserves space for the context prepend
|
||||
/// (or fills the encoder window when context is disabled).
|
||||
private func chunkSamples(melChunkContext: Bool) -> Int {
|
||||
let reserved = effectiveMelContextSamples(melChunkContext: melChunkContext)
|
||||
let maxActualChunk = maxModelSamples - reserved
|
||||
let raw = max(maxActualChunk - ASRConstants.melHopSize, ASRConstants.samplesPerEncoderFrame)
|
||||
return raw / ASRConstants.samplesPerEncoderFrame * ASRConstants.samplesPerEncoderFrame
|
||||
}
|
||||
private var overlapSamples: Int {
|
||||
|
||||
private func overlapSamples(forChunkSamples chunkSamples: Int) -> Int {
|
||||
let requested = Int(overlapSeconds * Double(ASRConstants.sampleRate))
|
||||
let capped = min(requested, chunkSamples / 2)
|
||||
return capped / ASRConstants.samplesPerEncoderFrame * ASRConstants.samplesPerEncoderFrame
|
||||
}
|
||||
private var strideSamples: Int {
|
||||
let raw = max(chunkSamples - overlapSamples, ASRConstants.samplesPerEncoderFrame)
|
||||
|
||||
private func strideSamples(forChunkSamples chunkSamples: Int) -> Int {
|
||||
let raw = max(chunkSamples - overlapSamples(forChunkSamples: chunkSamples), ASRConstants.samplesPerEncoderFrame)
|
||||
return raw / ASRConstants.samplesPerEncoderFrame * ASRConstants.samplesPerEncoderFrame
|
||||
}
|
||||
|
||||
@@ -68,6 +82,13 @@ struct ChunkProcessor {
|
||||
let workers = await makeWorkerPool(using: manager, count: requestedConcurrency) ?? [manager]
|
||||
let decoderLayers = await manager.decoderLayerCount
|
||||
let maxModelSamples = self.maxModelSamples
|
||||
// Issue #594: opt-out of PR #264's 80ms mel-context prepend for
|
||||
// non-English audio. When disabled, expand chunks to fill the
|
||||
// encoder window (no prepended frames).
|
||||
let melChunkContext = await manager.melChunkContext
|
||||
let melContextSamples = effectiveMelContextSamples(melChunkContext: melChunkContext)
|
||||
let chunkSamples = self.chunkSamples(melChunkContext: melChunkContext)
|
||||
let strideSamples = self.strideSamples(forChunkSamples: chunkSamples)
|
||||
|
||||
var chunkOutputs: [[TokenWindow]?] = []
|
||||
var availableWorkers = Array(workers.indices)
|
||||
|
||||
@@ -650,6 +650,7 @@ extension ASRBenchmark {
|
||||
var streamingChunkDuration = 10.0
|
||||
var useStreamingEou = false
|
||||
var modelVersion: AsrModelVersion = .v3 // Default to v3
|
||||
var melChunkContext = true // Issue #594: opt-out of PR #264's 80ms mel-context prepend
|
||||
|
||||
// Check for help flag first
|
||||
if arguments.contains("--help") || arguments.contains("-h") {
|
||||
@@ -717,6 +718,8 @@ extension ASRBenchmark {
|
||||
}
|
||||
i += 1
|
||||
}
|
||||
case "--no-mel-context":
|
||||
melChunkContext = false
|
||||
default:
|
||||
break
|
||||
}
|
||||
@@ -743,6 +746,7 @@ extension ASRBenchmark {
|
||||
logger.info(" Auto-download: \(autoDownload ? "enabled" : "disabled")")
|
||||
logger.info(" Test streaming: \(testStreaming ? "enabled" : "disabled")")
|
||||
logger.info(" Streaming EOU: \(useStreamingEou ? "enabled" : "disabled")")
|
||||
logger.info(" Mel chunk context (PR #264): \(melChunkContext ? "enabled" : "disabled")")
|
||||
if testStreaming {
|
||||
logger.info(" Chunk duration: \(streamingChunkDuration)s")
|
||||
}
|
||||
@@ -764,7 +768,8 @@ extension ASRBenchmark {
|
||||
let tdtConfig = TdtConfig(blankId: modelVersion.blankId)
|
||||
let asrConfig = ASRConfig(
|
||||
tdtConfig: tdtConfig,
|
||||
encoderHiddenSize: modelVersion.encoderHiddenSize
|
||||
encoderHiddenSize: modelVersion.encoderHiddenSize,
|
||||
melChunkContext: melChunkContext
|
||||
)
|
||||
|
||||
let asrManager = AsrManager(config: asrConfig)
|
||||
@@ -1035,6 +1040,7 @@ extension ASRBenchmark {
|
||||
--no-auto-download Disable automatic dataset download
|
||||
--test-streaming Enable streaming simulation mode
|
||||
--chunk-duration <secs> Chunk duration for streaming mode (default: 0.1s, min: 1.0s)
|
||||
--no-mel-context Disable 80ms mel-context prepend (Issue #594; required for non-English long audio on v3)
|
||||
--help, -h Show this help message
|
||||
|
||||
Description:
|
||||
|
||||
@@ -216,6 +216,7 @@ enum TranscribeCommand {
|
||||
var parakeetVariant: StreamingModelVariant?
|
||||
var language: Language?
|
||||
var encoderPrecision: ParakeetEncoderPrecision = .int8
|
||||
var melChunkContext = true
|
||||
|
||||
// Parse options
|
||||
var i = 1
|
||||
@@ -293,6 +294,11 @@ enum TranscribeCommand {
|
||||
encoderPrecision = precision
|
||||
i += 1
|
||||
}
|
||||
case "--no-mel-context":
|
||||
// Issue #594: opt-out of PR #264's 80ms mel-context prepend
|
||||
// on non-first chunks. Restores clean transcription at chunk
|
||||
// boundaries for non-English audio on parakeet-tdt-0.6b-v3.
|
||||
melChunkContext = false
|
||||
default:
|
||||
logger.warning("Warning: Unknown option: \(arguments[i])")
|
||||
}
|
||||
@@ -317,7 +323,8 @@ enum TranscribeCommand {
|
||||
await testBatchTranscription(
|
||||
audioFile: audioFile, showMetadata: showMetadata, wordTimestamps: wordTimestamps,
|
||||
outputJsonPath: outputJsonPath, modelVersion: modelVersion, customVocabPath: customVocabPath,
|
||||
modelDir: modelDir, language: language, encoderPrecision: encoderPrecision)
|
||||
modelDir: modelDir, language: language, encoderPrecision: encoderPrecision,
|
||||
melChunkContext: melChunkContext)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -325,7 +332,8 @@ enum TranscribeCommand {
|
||||
private static func testBatchTranscription(
|
||||
audioFile: String, showMetadata: Bool, wordTimestamps: Bool, outputJsonPath: String?,
|
||||
modelVersion: AsrModelVersion, customVocabPath: String?, modelDir: String? = nil,
|
||||
language: Language? = nil, encoderPrecision: ParakeetEncoderPrecision = .int8
|
||||
language: Language? = nil, encoderPrecision: ParakeetEncoderPrecision = .int8,
|
||||
melChunkContext: Bool = true
|
||||
) async {
|
||||
do {
|
||||
// Initialize ASR models
|
||||
@@ -340,7 +348,8 @@ enum TranscribeCommand {
|
||||
let tdtConfig = TdtConfig(blankId: modelVersion.blankId)
|
||||
let asrConfig = ASRConfig(
|
||||
tdtConfig: tdtConfig,
|
||||
encoderHiddenSize: modelVersion.encoderHiddenSize
|
||||
encoderHiddenSize: modelVersion.encoderHiddenSize,
|
||||
melChunkContext: melChunkContext
|
||||
)
|
||||
let asrManager = AsrManager(config: asrConfig)
|
||||
try await asrManager.loadModels(models)
|
||||
@@ -895,6 +904,7 @@ enum TranscribeCommand {
|
||||
--model-dir <path> Path to local model directory (skips download)
|
||||
--custom-vocab <file> Apply vocabulary boosting using terms from file (batch mode only)
|
||||
--parakeet-variant <variant> Use any Parakeet model via StreamingAsrManager protocol
|
||||
--no-mel-context Disable 80ms mel-context prepend (Issue #594; required for non-English long audio on v3)
|
||||
|
||||
Streaming variants (for --parakeet-variant):
|
||||
parakeet-eou-160ms, parakeet-eou-320ms, parakeet-eou-1280ms,
|
||||
|
||||
Reference in New Issue
Block a user