Add missing source_noise input to Kokoro TTS models

Newer Kokoro CoreML models require a source_noise feature that wasn't
being provided, causing CI failures with "Feature source_noise is
required but not specified" errors.

Changes:
- Add source_noise tensor [1, sampleRate*duration, 9] filled with uniform random Float16 values in [-1, 1]
- Update both synthesis pipeline and warm-up prediction
- Size adapts to model variant: 5s (120k samples) or 15s (360k samples)
- Use multiarray pooling in the synthesis pipeline for memory efficiency (the other call site allocates a plain MLMultiArray)

Fixes #290 CI test-tts workflow failure.
This commit is contained in:
Alex-Wengg
2026-03-22 00:43:05 -04:00
parent cb946c91be
commit c8a5056aeb
2 changed files with 30 additions and 0 deletions
@@ -304,7 +304,22 @@ public struct KokoroSynthesizer {
zeroFill: true
)
// Newer Kokoro models take an extra `source_noise` input: a Float16 array
// shaped [1, noiseLength, 9] holding uniform random values in -1...1.
let maxSeconds = variant.maxDurationSeconds
let noiseLength = TtsConstants.audioSampleRate * maxSeconds
let sourceNoise = try await multiArrayPool.rent(
    shape: [1, noiseLength, 9],
    dataType: .float16,
    zeroFill: false
)
// Every element is overwritten below, so the rented buffer is not zero-filled first.
let noiseElementCount = noiseLength * 9
// Store each sample as its Float16 bit pattern through a UInt16 view of the storage.
let noiseBits = sourceNoise.dataPointer.bindMemory(to: UInt16.self, capacity: noiseElementCount)
for slot in 0..<noiseElementCount {
    noiseBits[slot] = Float16(Float.random(in: -1...1)).bitPattern
}
func recycleModelArrays() async {
await multiArrayPool.recycle(sourceNoise, zeroFill: false)
await multiArrayPool.recycle(phasesArray, zeroFill: true)
await multiArrayPool.recycle(attentionMask, zeroFill: false)
await multiArrayPool.recycle(inputArray, zeroFill: false)
@@ -338,6 +353,7 @@ public struct KokoroSynthesizer {
"attention_mask": attentionMask,
"ref_s": refStyle,
"random_phases": phasesArray,
"source_noise": sourceNoise,
])
let predictionStart = Date()
+14
View File
@@ -152,11 +152,25 @@ public struct TtsModels: Sendable {
randomPhases[index] = NSNumber(value: Float(0))
}
// Newer Kokoro models also expect a `source_noise` input: a Float16 array
// shaped [1, noiseLength, 9] holding uniform random values in -1...1.
let maxSeconds = variant.maxDurationSeconds
let noiseLength = TtsConstants.audioSampleRate * maxSeconds
let sourceNoise = try MLMultiArray(
    shape: [1, NSNumber(value: noiseLength), 9],
    dataType: .float16
)
let noiseElementCount = noiseLength * 9
// Store each sample as its Float16 bit pattern through a UInt16 view of the storage.
let noiseBits = sourceNoise.dataPointer.bindMemory(to: UInt16.self, capacity: noiseElementCount)
for slot in 0..<noiseElementCount {
    noiseBits[slot] = Float16(Float.random(in: -1...1)).bitPattern
}
let features = try MLDictionaryFeatureProvider(dictionary: [
"input_ids": inputIds,
"attention_mask": attentionMask,
"ref_s": refStyle,
"random_phases": randomPhases,
"source_noise": sourceNoise,
])
let options: MLPredictionOptions = optimizedPredictionOptions()