mirror of
https://github.com/FluidInference/FluidAudio.git
synced 2026-05-12 20:20:36 +00:00
Add missing source_noise input to Kokoro TTS models
Newer Kokoro CoreML models require a `source_noise` input feature that was not being provided, causing CI failures with "Feature source_noise is required but not specified" errors.

Changes:
- Add a `source_noise` tensor of shape [1, sampleRate * maxDurationSeconds, 9], filled with random Float16 values in the range [-1, 1]
- Supply it in both the synthesis pipeline and the warm-up prediction
- Size adapts to the model variant: 5 s (120k samples) or 15 s (360k samples)
- Rent the synthesis-time array from the multi-array pool for memory efficiency (the warm-up path allocates a plain MLMultiArray)

Fixes #290 (CI test-tts workflow failure).
This commit is contained in:
@@ -304,7 +304,22 @@ public struct KokoroSynthesizer {
|
||||
zeroFill: true
|
||||
)
|
||||
|
||||
// Source noise for newer Kokoro models
|
||||
let maxSeconds = variant.maxDurationSeconds
|
||||
let noiseLength = TtsConstants.audioSampleRate * maxSeconds
|
||||
let sourceNoise = try await multiArrayPool.rent(
|
||||
shape: [1, noiseLength, 9],
|
||||
dataType: .float16,
|
||||
zeroFill: false
|
||||
)
|
||||
let noisePointer = sourceNoise.dataPointer.bindMemory(to: UInt16.self, capacity: noiseLength * 9)
|
||||
for i in 0..<(noiseLength * 9) {
|
||||
let randomValue = Float.random(in: -1...1)
|
||||
noisePointer[i] = Float16(randomValue).bitPattern
|
||||
}
|
||||
|
||||
func recycleModelArrays() async {
|
||||
await multiArrayPool.recycle(sourceNoise, zeroFill: false)
|
||||
await multiArrayPool.recycle(phasesArray, zeroFill: true)
|
||||
await multiArrayPool.recycle(attentionMask, zeroFill: false)
|
||||
await multiArrayPool.recycle(inputArray, zeroFill: false)
|
||||
@@ -338,6 +353,7 @@ public struct KokoroSynthesizer {
|
||||
"attention_mask": attentionMask,
|
||||
"ref_s": refStyle,
|
||||
"random_phases": phasesArray,
|
||||
"source_noise": sourceNoise,
|
||||
])
|
||||
|
||||
let predictionStart = Date()
|
||||
|
||||
@@ -152,11 +152,25 @@ public struct TtsModels: Sendable {
|
||||
randomPhases[index] = NSNumber(value: Float(0))
|
||||
}
|
||||
|
||||
// Source noise for newer Kokoro models
|
||||
let maxSeconds = variant.maxDurationSeconds
|
||||
let noiseLength = TtsConstants.audioSampleRate * maxSeconds
|
||||
let sourceNoise = try MLMultiArray(
|
||||
shape: [1, NSNumber(value: noiseLength), 9],
|
||||
dataType: .float16
|
||||
)
|
||||
let noisePointer = sourceNoise.dataPointer.bindMemory(to: UInt16.self, capacity: noiseLength * 9)
|
||||
for i in 0..<(noiseLength * 9) {
|
||||
let randomValue = Float.random(in: -1...1)
|
||||
noisePointer[i] = Float16(randomValue).bitPattern
|
||||
}
|
||||
|
||||
let features = try MLDictionaryFeatureProvider(dictionary: [
|
||||
"input_ids": inputIds,
|
||||
"attention_mask": attentionMask,
|
||||
"ref_s": refStyle,
|
||||
"random_phases": randomPhases,
|
||||
"source_noise": sourceNoise,
|
||||
])
|
||||
|
||||
let options: MLPredictionOptions = optimizedPredictionOptions()
|
||||
|
||||
Reference in New Issue
Block a user