Add missing source_noise input to Kokoro TTS models

Newer Kokoro CoreML models require a `source_noise` input feature that was not being provided, causing CI failures with "Feature source_noise is required but not specified" errors.

Changes:
- Add a `source_noise` tensor of shape [1, sampleRate * duration, 9] filled with random Float16 values in [-1, 1]
- Provide it in both the synthesis pipeline and the warm-up prediction
- Size adapts to the model variant: 5s (120k samples) or 15s (360k samples)
- Use multiarray pooling in the synthesis pipeline for memory efficiency

Fixes #290: CI test-tts workflow failure.
2026-05-12 20:20:36 +00:00 · 2026-03-22 00:43:05 -04:00
parent cb946c91be
commit c8a5056aeb
2 changed files with 30 additions and 0 deletions
@@ -304,7 +304,22 @@ public struct KokoroSynthesizer {
            zeroFill: true
        )

+        // Source noise for newer Kokoro models
+        let maxSeconds = variant.maxDurationSeconds
+        let noiseLength = TtsConstants.audioSampleRate * maxSeconds
+        let sourceNoise = try await multiArrayPool.rent(
+            shape: [1, noiseLength, 9],
+            dataType: .float16,
+            zeroFill: false
+        )
+        let noisePointer = sourceNoise.dataPointer.bindMemory(to: UInt16.self, capacity: noiseLength * 9)
+        for i in 0..<(noiseLength * 9) {
+            let randomValue = Float.random(in: -1...1)
+            noisePointer[i] = Float16(randomValue).bitPattern
+        }
+
        func recycleModelArrays() async {
+            await multiArrayPool.recycle(sourceNoise, zeroFill: false)
            await multiArrayPool.recycle(phasesArray, zeroFill: true)
            await multiArrayPool.recycle(attentionMask, zeroFill: false)
            await multiArrayPool.recycle(inputArray, zeroFill: false)
@@ -338,6 +353,7 @@ public struct KokoroSynthesizer {
            "attention_mask": attentionMask,
            "ref_s": refStyle,
            "random_phases": phasesArray,
+            "source_noise": sourceNoise,
        ])

        let predictionStart = Date()
@@ -152,11 +152,25 @@ public struct TtsModels: Sendable {
                randomPhases[index] = NSNumber(value: Float(0))
            }

+            // Source noise for newer Kokoro models
+            let maxSeconds = variant.maxDurationSeconds
+            let noiseLength = TtsConstants.audioSampleRate * maxSeconds
+            let sourceNoise = try MLMultiArray(
+                shape: [1, NSNumber(value: noiseLength), 9],
+                dataType: .float16
+            )
+            let noisePointer = sourceNoise.dataPointer.bindMemory(to: UInt16.self, capacity: noiseLength * 9)
+            for i in 0..<(noiseLength * 9) {
+                let randomValue = Float.random(in: -1...1)
+                noisePointer[i] = Float16(randomValue).bitPattern
+            }
+
            let features = try MLDictionaryFeatureProvider(dictionary: [
                "input_ids": inputIds,
                "attention_mask": attentionMask,
                "ref_s": refStyle,
                "random_phases": randomPhases,
+                "source_noise": sourceNoise,
            ])

            let options: MLPredictionOptions = optimizedPredictionOptions()