mirror of
https://github.com/FluidInference/FluidAudio.git
synced 2026-05-12 20:20:36 +00:00
## Summary

Fixes #592 — PocketTTS voice cloning produced garbled audio on macOS after the `pocket-tts==2.0.0` upgrade. v2 (pre-baked KV snapshot) voices were unaffected — only the v1 path (user audio → `mimi_encoder` → `cond_step` prefill) was broken. Two compounding bugs:

### RCA 1 — stale `mimi_encoder`

The `mimi_encoder.mlpackage` originally published on HF was traced against pre-2.0.0 `pocket-tts` (torch 2.9.1, Float32, scalar output) and no longer matched the runtime cond_step contract. Re-traced as `mimi_encoderv2` from `pocket-tts==2.0.0` (torch 2.11.0, Float16, fixed `[1, 1, 240000]` → `[1, 125, 1024]`). Both files now live at the HF repo root (legacy file kept for backwards compat); `ModelNames.mimiEncoder` points at the new one.

### RCA 2 — missing `bos_before_voice` prepend

`pocket-tts` 2.0.0 added a learned 1024-d `flow_lm.bos_before_voice` buffer that has to be prepended to the audio_prompt during cond_step prefill. Without it the FlowLM sees a different token distribution than training. Extracted per-language as `constants_bin/bos_before_voice.bin` (4096 bytes each, 10 packs × distinct SHA-256s, all verified byte-for-byte against the HF upload).

### Swift-side changes

- `PocketTtsVoiceCloner` pads/truncates input to the encoder's fixed 240 000 samples (10 s @ 24 kHz, non-flexible shape) and trims output frames to real-audio duration so zero-padded frames don't bleed into the prompt.
- `PocketTtsSynthesizer+KVCache.prefillKVCache` prepends `bos_before_voice` ahead of the audio_prompt on the v1 path. v2 snapshots skip this — their pre-baked KV cache already encodes the prefix.
- `PocketTtsResourceDownloader.ensureModels` backfills `bos_before_voice.bin` for caches that predate this fix (per-file fetch) instead of forcing a full language-pack re-download.

Conversion artifacts and per-language SHA-256s documented in `mobius/models/tts/pocket_tts/coreml/TRIALS.md` (Phase 7).
## Test plan

- [x] `swift build` clean
- [x] `swift test --filter PocketTtsConstantsLoaderTests` — 3 new tests pass
- [x] `swift format` applied
- [x] E2E v1 cloning: `am_michael.wav` (7.5 s) → 3.92 s @ 24 kHz Int16, intelligible voice match. KV cache prefill lands at position 113 = 1 BOS + 95 voice + 17 text tokens (matches pocket-tts 2.0.0 layout).
- [x] v2 snapshot regression check: default `alba.safetensors` voice still synthesizes correctly (prefill position 140, no `bos_before_voice` involvement)
- [x] Backfill path: deleted `bos_before_voice.bin` from cache, re-ran cloning — file auto-fetched from HF (4096 bytes) before synthesis
- [x] All 10 language packs verified on HF: SHA-256 match between local extraction and uploaded `v2/<lang>/constants_bin/bos_before_voice.bin`
This commit is contained in:
@@ -680,7 +680,7 @@ public enum ModelNames {
|
||||
public static let flowlmStepV2 = "flowlm_stepv2"
|
||||
public static let flowDecoder = "flow_decoder"
|
||||
public static let mimiDecoder = "mimi_decoder"
|
||||
public static let mimiEncoder = "mimi_encoder"
|
||||
public static let mimiEncoder = "mimi_encoderv2"
|
||||
|
||||
public static let condStepFile = condStep + ".mlmodelc"
|
||||
public static let flowlmStepFile = flowlmStep + ".mlmodelc"
|
||||
|
||||
@@ -5,6 +5,11 @@ public struct PocketTtsConstantsBundle: Sendable {
|
||||
public let bosEmbedding: [Float]
|
||||
public let textEmbedTable: [Float]
|
||||
public let tokenizer: SentencePieceTokenizer
|
||||
/// `flow_lm.bos_before_voice` (1024 floats) — prepended to the v1
|
||||
/// audio_prompt during `cond_step` prefill. `nil` when the language
|
||||
/// pack predates the FluidAudio #592 fix and omits the file; v2
|
||||
/// (snapshot) voices don't need it, so loading stays best-effort.
|
||||
public let bosBeforeVoice: [Float]?
|
||||
}
|
||||
|
||||
/// Pre-loaded voice conditioning data.
|
||||
@@ -100,12 +105,15 @@ public enum PocketTtsConstantsLoader {
|
||||
throw LoadError.tokenizerLoadFailed(error.localizedDescription)
|
||||
}
|
||||
|
||||
let bosBeforeVoice = try loadBosBeforeVoiceIfPresent(in: constantsDir)
|
||||
|
||||
logger.info("Loaded PocketTTS constants from \(directory.lastPathComponent)")
|
||||
|
||||
return PocketTtsConstantsBundle(
|
||||
bosEmbedding: bosEmb,
|
||||
textEmbedTable: embedTable,
|
||||
tokenizer: tokenizer
|
||||
tokenizer: tokenizer,
|
||||
bosBeforeVoice: bosBeforeVoice
|
||||
)
|
||||
}
|
||||
|
||||
@@ -259,6 +267,37 @@ public enum PocketTtsConstantsLoader {
|
||||
return PocketTtsVoiceCacheSnapshot(layers: layers, cacheSeqLen: seqLen)
|
||||
}
|
||||
|
||||
// MARK: - Internal helpers
|
||||
|
||||
/// Read the optional `bos_before_voice.bin` constant out of `constantsDir`.
///
/// The file belongs to language packs refreshed for the FluidAudio #592
/// fix (pocket-tts 2.0.0 `flow_lm.bos_before_voice`). Packs cached before
/// that fix, and callers that only use snapshot voices, legitimately lack
/// it, so absence is reported as `nil` instead of an error; the v1 prefill
/// path is the one that insists on the value when it is actually needed.
///
/// Kept at internal visibility so unit tests can call it directly;
/// production code reaches it through `load(from:)`.
///
/// - Parameter constantsDir: Directory expected to contain
///   `bos_before_voice.bin`.
/// - Returns: The decoded floats, or `nil` when the file does not exist.
/// - Throws: Whatever `loadFloatArray` throws for a file that exists but
///   cannot be read or does not hold `PocketTtsConstants.embeddingDim`
///   values.
static func loadBosBeforeVoiceIfPresent(in constantsDir: URL) throws -> [Float]? {
    let fileURL = constantsDir.appendingPathComponent("bos_before_voice.bin")

    if FileManager.default.fileExists(atPath: fileURL.path) {
        return try loadFloatArray(
            from: fileURL,
            expectedCount: PocketTtsConstants.embeddingDim,
            name: "bos_before_voice"
        )
    }

    // A missing file is the normal state for pre-#592 caches and is
    // harmless for snapshot voices, hence debug level only. The cloned-voice
    // v1 path raises its own explicit error in `prefillKVCache` when the
    // constant is really required.
    logger.debug(
        "PocketTTS constants_bin/bos_before_voice.bin not present; cloned-voice v1 prefill will fail until the language pack is updated"
    )
    return nil
}
|
||||
|
||||
// MARK: - Private
|
||||
|
||||
/// Load a raw Float32 binary file into a [Float] array.
|
||||
|
||||
@@ -48,9 +48,27 @@ public enum PocketTtsResourceDownloader {
|
||||
atPath: languageRoot.appendingPathComponent(model).path)
|
||||
}
|
||||
|
||||
guard !allPresent else {
|
||||
if allPresent {
|
||||
logger.info(
|
||||
"PocketTTS \(language.rawValue) (\(precision)) models found in cache")
|
||||
// Pre-#592 caches lack `constants_bin/bos_before_voice.bin`. The
|
||||
// language-pack files are otherwise complete, so try to fetch just
|
||||
// the missing constant rather than re-downloading the whole subdir.
|
||||
//
|
||||
// Best-effort: shipped snapshot voices don't need this file at all,
|
||||
// and the v1 cloned-voice prefill path enforces presence at use
|
||||
// time (PocketTtsConstantsLoader returns nil gracefully). Failing
|
||||
// the fetch here — e.g. offline, or before the file lands on HF —
|
||||
// must not block users who only synthesize with shipped voices.
|
||||
do {
|
||||
try await ensureBosBeforeVoice(language: language, languageRoot: languageRoot)
|
||||
} catch {
|
||||
logger.warning(
|
||||
"Failed to backfill bos_before_voice.bin for \(language.rawValue): "
|
||||
+ "\(error.localizedDescription). Cloned-voice v1 prefill will fail "
|
||||
+ "until this file is available; shipped snapshot voices are unaffected."
|
||||
)
|
||||
}
|
||||
return languageRoot
|
||||
}
|
||||
|
||||
@@ -71,6 +89,34 @@ public enum PocketTtsResourceDownloader {
|
||||
return languageRoot
|
||||
}
|
||||
|
||||
/// Backfill `constants_bin/bos_before_voice.bin` for cached language packs
/// that were downloaded before the FluidAudio #592 fix. New downloads pick
/// it up via `downloadSubdirectory` — this helper exists only to upgrade
/// older caches without a full re-download.
///
/// The downloaded payload is size-checked before it is written. Once a file
/// exists at the destination this helper returns early on every later call,
/// so caching a truncated or error-page response would leave the pack
/// permanently holding a file that the constants loader rejects for having
/// the wrong element count.
///
/// - Parameters:
///   - language: Language pack whose constant should be fetched.
///   - languageRoot: Local root directory of that cached language pack.
/// - Throws: Directory-creation, URL-resolution, download or write errors,
///   or `PocketTTSError.processingFailed` when the downloaded payload is not
///   exactly `PocketTtsConstants.embeddingDim` Float32 values.
private static func ensureBosBeforeVoice(
    language: PocketTtsLanguage,
    languageRoot: URL
) async throws {
    let constantsDir = languageRoot.appendingPathComponent(ModelNames.PocketTTS.constantsBinDir)
    let bosURL = constantsDir.appendingPathComponent("bos_before_voice.bin")
    guard !FileManager.default.fileExists(atPath: bosURL.path) else {
        return
    }

    try FileManager.default.createDirectory(
        at: constantsDir, withIntermediateDirectories: true)
    let remotePath = "\(language.repoSubdirectory)/constants_bin/bos_before_voice.bin"
    let remoteURL = try ModelRegistry.resolveModel(Repo.pocketTts.remotePath, remotePath)
    logger.info(
        "Backfilling bos_before_voice.bin for cached \(language.rawValue) pack...")
    let data = try await AssetDownloader.fetchData(
        from: remoteURL,
        description: "bos_before_voice.bin (\(language.rawValue))",
        logger: logger
    )

    // Refuse to cache anything that is not exactly one embedding worth of
    // Float32 values; a bad file here would never be re-fetched.
    let expectedByteCount = PocketTtsConstants.embeddingDim * MemoryLayout<Float>.size
    guard data.count == expectedByteCount else {
        throw PocketTTSError.processingFailed(
            "Downloaded bos_before_voice.bin for \(language.rawValue) is \(data.count) bytes, "
                + "expected \(expectedByteCount); not caching it"
        )
    }

    // Atomic write so an interrupted write cannot leave a partial file that
    // the existence check above would later accept.
    try data.write(to: bosURL, options: [.atomic])
    logger.info("Wrote bos_before_voice.bin (\(data.count) bytes)")
}
|
||||
|
||||
/// Delete the FlowLM `.mlmodelc` and `.mlpackage` directories that don't
|
||||
/// match the requested precision. Idempotent — silently skips paths that
|
||||
/// don't exist.
|
||||
|
||||
@@ -121,11 +121,20 @@ extension PocketTtsSynthesizer {
|
||||
|
||||
/// Prefill a KV cache state with voice conditioning tokens.
|
||||
///
|
||||
/// Processes all voice tokens from the voice data, writing K/V projections
|
||||
/// into the cache starting at the current position.
|
||||
/// Prepends a single `bos_before_voice` token to match pocket-tts 2.0.0's
|
||||
/// `flow_lm.bos_before_voice` prefix (see FluidAudio #592 — without it
|
||||
/// `cond_step` diverges from the deployed flowlm/flow_decoder weights and
|
||||
/// the LM emits EOS within a few steps, producing garbled audio). Then
|
||||
/// processes all voice tokens from `voiceData.audioPrompt`, writing K/V
|
||||
/// projections into the cache starting at the current position.
|
||||
///
|
||||
/// `bosBeforeVoice` must be provided whenever `voiceData.audioPrompt`
|
||||
/// has content (i.e. cloned voices); shipped v2 voices skip this path
|
||||
/// entirely via `cacheSnapshot`.
|
||||
static func prefillKVCacheVoice(
|
||||
state: KVCacheState,
|
||||
voiceData: PocketTtsVoiceData,
|
||||
bosBeforeVoice: [Float]?,
|
||||
model: MLModel,
|
||||
layerKeys: PocketTtsLayerKeys
|
||||
) async throws -> KVCacheState {
|
||||
@@ -133,6 +142,33 @@ extension PocketTtsSynthesizer {
|
||||
let dim = PocketTtsConstants.embeddingDim
|
||||
|
||||
let voiceTokenCount = voiceData.promptLength
|
||||
guard voiceTokenCount > 0 else {
|
||||
// Nothing to prefill (e.g. session warmup with empty cloned
|
||||
// voice). Skip the BOS prepend too — runtime callers that go
|
||||
// through `prefillKVCache` only hit this branch when both
|
||||
// `cacheSnapshot == nil` and `promptLength == 0`, which is a
|
||||
// no-op.
|
||||
return state
|
||||
}
|
||||
|
||||
guard let bosBeforeVoice else {
|
||||
throw PocketTTSError.processingFailed(
|
||||
"PocketTTS v1 cloned-voice prefill requires bos_before_voice constant. "
|
||||
+ "Re-download the language pack to get constants_bin/bos_before_voice.bin "
|
||||
+ "(added in the FluidAudio #592 fix)."
|
||||
)
|
||||
}
|
||||
guard bosBeforeVoice.count == dim else {
|
||||
throw PocketTTSError.processingFailed(
|
||||
"bos_before_voice has \(bosBeforeVoice.count) floats, expected \(dim)"
|
||||
)
|
||||
}
|
||||
|
||||
let bosToken = try createConditioningToken(
|
||||
from: bosBeforeVoice, offset: 0, dim: dim)
|
||||
try await runCondStep(
|
||||
conditioning: bosToken, state: &state, model: model, layerKeys: layerKeys)
|
||||
|
||||
for tokenIdx in 0..<voiceTokenCount {
|
||||
let token = try createConditioningToken(
|
||||
from: voiceData.audioPrompt,
|
||||
@@ -252,13 +288,15 @@ extension PocketTtsSynthesizer {
|
||||
///
|
||||
/// Two voice paths:
|
||||
/// - **Snapshot** (shipped voices): drop pre-baked K/V into cache, skip
|
||||
/// `cond_step` voice prefill entirely.
|
||||
/// - **Flat audio prompt** (cloned voices): feed every voice token
|
||||
/// through `cond_step`.
|
||||
/// `cond_step` voice prefill entirely. `bos_before_voice` is already
|
||||
/// baked into the snapshot.
|
||||
/// - **Flat audio prompt** (cloned voices): feed `bos_before_voice`
|
||||
/// then every voice token through `cond_step`.
|
||||
/// Text prefill runs identically in both cases.
|
||||
static func prefillKVCache(
|
||||
voiceData: PocketTtsVoiceData,
|
||||
textEmbeddings: [[Float]],
|
||||
bosBeforeVoice: [Float]?,
|
||||
model: MLModel,
|
||||
layerKeys: PocketTtsLayerKeys
|
||||
) async throws -> KVCacheState {
|
||||
@@ -268,7 +306,9 @@ extension PocketTtsSynthesizer {
|
||||
} else {
|
||||
let emptyState = try emptyKVCacheState(layers: layerKeys.layerCount)
|
||||
state = try await prefillKVCacheVoice(
|
||||
state: emptyState, voiceData: voiceData, model: model, layerKeys: layerKeys
|
||||
state: emptyState, voiceData: voiceData,
|
||||
bosBeforeVoice: bosBeforeVoice,
|
||||
model: model, layerKeys: layerKeys
|
||||
)
|
||||
}
|
||||
state = try await prefillKVCacheText(
|
||||
|
||||
@@ -273,8 +273,8 @@ public struct PocketTtsSynthesizer {
|
||||
// - Shipped voices (cacheSnapshot != nil): drop pre-baked K/V into
|
||||
// cache, skip cond_step entirely (`promptLength == 0`, so the
|
||||
// loop in `prefillKVCacheVoice` would be a no-op anyway).
|
||||
// - Cloned voices (flat audio prompt): feed every voice token
|
||||
// through cond_step.
|
||||
// - Cloned voices (flat audio prompt): feed `bos_before_voice`
|
||||
// plus every voice token through cond_step.
|
||||
let voiceKVSnapshot: KVCacheState
|
||||
if let snapshot = voiceData.cacheSnapshot {
|
||||
voiceKVSnapshot = try kvCacheStateFromSnapshot(
|
||||
@@ -282,8 +282,9 @@ public struct PocketTtsSynthesizer {
|
||||
} else {
|
||||
let emptyState = try emptyKVCacheState(layers: condLayerKeys.layerCount)
|
||||
voiceKVSnapshot = try await prefillKVCacheVoice(
|
||||
state: emptyState, voiceData: voiceData, model: condModel,
|
||||
layerKeys: condLayerKeys
|
||||
state: emptyState, voiceData: voiceData,
|
||||
bosBeforeVoice: constants.bosBeforeVoice,
|
||||
model: condModel, layerKeys: condLayerKeys
|
||||
)
|
||||
}
|
||||
|
||||
@@ -438,6 +439,7 @@ public struct PocketTtsSynthesizer {
|
||||
var kvState = try await PocketTtsSynthesizer.prefillKVCache(
|
||||
voiceData: voiceData,
|
||||
textEmbeddings: textEmbeddings,
|
||||
bosBeforeVoice: constants.bosBeforeVoice,
|
||||
model: condModel,
|
||||
layerKeys: condLayerKeys
|
||||
)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import Accelerate
|
||||
@preconcurrency import AVFoundation
|
||||
@preconcurrency import CoreML
|
||||
import Foundation
|
||||
@@ -19,14 +20,21 @@ public enum PocketTtsVoiceCloner {
|
||||
/// Frame size for the encoder (1920 samples = 80ms).
|
||||
public static let frameSize: Int = PocketTtsConstants.samplesPerFrame
|
||||
|
||||
/// Maximum voice prompt frames (caps at ~20s to leave KV cache room for text tokens).
|
||||
public static let maxVoiceFrames: Int = 250
|
||||
/// Fixed encoder input length in samples (10s @ 24kHz). `mimi_encoderv2` has
|
||||
/// `hasShapeFlexibility: "0"` and accepts exactly this many samples.
|
||||
public static let encoderInputSamples: Int = 240_000
|
||||
|
||||
/// Maximum voice prompt frames produced by the encoder for one forward pass
|
||||
/// (`encoderInputSamples / frameSize`). The encoder output shape is fixed at
|
||||
/// `[1, 125, 1024]`, so 125 is the hard ceiling.
|
||||
public static let maxVoiceFrames: Int = 125
|
||||
|
||||
/// Minimum audio duration in seconds for voice cloning.
|
||||
public static let minDurationSeconds: Double = 1.0
|
||||
|
||||
/// Maximum audio duration in seconds for voice cloning.
|
||||
public static let maxDurationSeconds: Double = 30.0
|
||||
/// Maximum audio duration in seconds for voice cloning (matches
|
||||
/// `encoderInputSamples`). Audio longer than this is truncated.
|
||||
public static let maxDurationSeconds: Double = 10.0
|
||||
|
||||
// MARK: - Voice Cloning
|
||||
|
||||
@@ -49,22 +57,24 @@ public enum PocketTtsVoiceCloner {
|
||||
+ "(minimum \(minDurationSeconds)s required)"
|
||||
)
|
||||
}
|
||||
guard durationSeconds <= maxDurationSeconds else {
|
||||
throw PocketTTSError.processingFailed(
|
||||
"Audio too long for voice cloning: \(String(format: "%.1f", durationSeconds))s "
|
||||
+ "(maximum \(maxDurationSeconds)s allowed)"
|
||||
)
|
||||
}
|
||||
|
||||
// Pad audio to frame boundary
|
||||
let paddedSamples = padToFrameBoundary(samples)
|
||||
// mimi_encoderv2 has a fixed input shape [1, 1, 240000]. Pad shorter
|
||||
// audio with zeros; truncate longer audio. Track the real sample count
|
||||
// so we can drop encoded-zero-padding frames from the output.
|
||||
let realSampleCount = min(samples.count, encoderInputSamples)
|
||||
let encoderInput = makeEncoderInputBuffer(samples)
|
||||
|
||||
logger.info("Encoding \(paddedSamples.count) samples (\(String(format: "%.1f", durationSeconds))s)")
|
||||
logger.info(
|
||||
"Encoding \(realSampleCount) samples (\(String(format: "%.1f", durationSeconds))s) "
|
||||
+ "padded/truncated to \(encoderInputSamples)"
|
||||
)
|
||||
|
||||
// Create input tensor [1, 1, T]
|
||||
let audioArray = try MLMultiArray(shape: [1, 1, NSNumber(value: paddedSamples.count)], dataType: .float32)
|
||||
for (i, sample) in paddedSamples.enumerated() {
|
||||
audioArray[[0, 0, NSNumber(value: i)]] = NSNumber(value: sample)
|
||||
// Create input tensor [1, 1, 240000]
|
||||
let audioArray = try MLMultiArray(
|
||||
shape: [1, 1, NSNumber(value: encoderInputSamples)], dataType: .float32)
|
||||
let dst = audioArray.dataPointer.bindMemory(to: Float.self, capacity: encoderInputSamples)
|
||||
encoderInput.withUnsafeBufferPointer { src in
|
||||
dst.update(from: src.baseAddress!, count: encoderInputSamples)
|
||||
}
|
||||
|
||||
// Run encoder
|
||||
@@ -78,7 +88,8 @@ public enum PocketTtsVoiceCloner {
|
||||
|
||||
let numFrames = conditioning.shape[1].intValue
|
||||
let embDim = conditioning.shape[2].intValue
|
||||
let usableFrames = min(numFrames, maxVoiceFrames)
|
||||
let usableFrames = usableFrameCount(
|
||||
realSampleCount: realSampleCount, availableFrames: numFrames)
|
||||
logger.info("Encoded to \(numFrames) frames, using \(usableFrames)")
|
||||
|
||||
// Extract conditioning with bulk memory copy (no zero-padding)
|
||||
@@ -168,28 +179,67 @@ public enum PocketTtsVoiceCloner {
|
||||
|
||||
// MARK: - Private Helpers
|
||||
|
||||
private static func padToFrameBoundary(_ samples: [Float]) -> [Float] {
|
||||
let length = samples.count
|
||||
let padLength = (frameSize - (length % frameSize)) % frameSize
|
||||
if padLength > 0 {
|
||||
return samples + [Float](repeating: 0, count: padLength)
|
||||
/// Build a fixed-length `encoderInputSamples`-sized buffer: copy the first
|
||||
/// `encoderInputSamples` of `samples` (truncating overflow), zero-pad the
|
||||
/// remainder. `mimi_encoderv2`'s input shape is non-flexible at runtime.
|
||||
///
|
||||
/// Exposed at internal access for unit tests; production callers go
|
||||
/// through `cloneVoice(from:using:)`.
|
||||
static func makeEncoderInputBuffer(_ samples: [Float]) -> [Float] {
|
||||
var buffer = [Float](repeating: 0, count: encoderInputSamples)
|
||||
let copyCount = min(samples.count, encoderInputSamples)
|
||||
if copyCount > 0 {
|
||||
buffer.replaceSubrange(0..<copyCount, with: samples[0..<copyCount])
|
||||
}
|
||||
return samples
|
||||
return buffer
|
||||
}
|
||||
|
||||
/// Extract conditioning floats from MLMultiArray [1, frames, embDim] via bulk memory copy.
|
||||
/// How many leading encoder output frames carry real (non-padded) audio.
///
/// The real sample count is converted to frames with a ceiling division,
/// so a trailing partial frame of genuine audio is kept while frames that
/// cover only the zero-padded tail are dropped. The result never exceeds
/// what the encoder actually produced nor `maxVoiceFrames`.
///
/// Kept at internal visibility so unit tests can call it directly.
///
/// - Parameters:
///   - realSampleCount: Number of genuine audio samples fed to the encoder.
///   - availableFrames: Number of frames present in the encoder output.
/// - Returns: The smallest of the ceiling frame count, `availableFrames`
///   and `maxVoiceFrames`.
static func usableFrameCount(realSampleCount: Int, availableFrames: Int) -> Int {
    // Ceiling division: a partial final frame still counts as one frame.
    let framesCoveringRealAudio = (realSampleCount + frameSize - 1) / frameSize
    let hardCeiling = min(availableFrames, maxVoiceFrames)
    return min(hardCeiling, framesCoveringRealAudio)
}
|
||||
|
||||
/// Extract conditioning floats from MLMultiArray `[1, frames, embDim]`.
|
||||
///
|
||||
/// Both dtype paths assume contiguous storage starting at the array's
|
||||
/// base pointer: the encoder writes `[1, 125, 1024]` in row-major order
|
||||
/// and we read the leading `frames` rows. The Float32 path is a bulk
|
||||
/// `UnsafeBufferPointer` copy; the Float16 path uses
|
||||
/// `vDSP.convertElements` (vectorized fp16→fp32 conversion) so
|
||||
/// `mimi_encoderv2`'s Float16 output doesn't have to pay 128 k
|
||||
/// MLMultiArray subscript calls per clone. Falls back to NSNumber
|
||||
/// subscripting on x86 hosts where Swift `Float16` isn't available.
|
||||
private static func extractConditioning(
|
||||
_ conditioning: MLMultiArray, frames: Int, embDim: Int
|
||||
) -> [Float] {
|
||||
let count = frames * embDim
|
||||
if conditioning.dataType == .float16 {
|
||||
return (0..<count).map { i in
|
||||
var result = [Float](repeating: 0, count: count)
|
||||
#if arch(arm64)
|
||||
let srcPtr = conditioning.dataPointer.bindMemory(to: Float16.self, capacity: count)
|
||||
let srcBuffer = UnsafeBufferPointer(start: srcPtr, count: count)
|
||||
result.withUnsafeMutableBufferPointer { dst in
|
||||
vDSP.convertElements(of: srcBuffer, to: &dst)
|
||||
}
|
||||
#else
|
||||
// x86: Swift Float16 unavailable. Route through NSNumber.
|
||||
for i in 0..<count {
|
||||
let frame = i / embDim
|
||||
let dim = i % embDim
|
||||
return conditioning[[0, NSNumber(value: frame), NSNumber(value: dim)]].floatValue
|
||||
result[i] =
|
||||
conditioning[[0, NSNumber(value: frame), NSNumber(value: dim)]]
|
||||
.floatValue
|
||||
}
|
||||
#endif
|
||||
return result
|
||||
}
|
||||
// Fast path: float32 bulk copy
|
||||
// Float32: contiguous bulk copy
|
||||
let srcPtr = conditioning.dataPointer.bindMemory(to: Float.self, capacity: count)
|
||||
return Array(UnsafeBufferPointer(start: srcPtr, count: count))
|
||||
}
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
import Foundation
|
||||
import XCTest
|
||||
|
||||
@testable import FluidAudio
|
||||
|
||||
/// Pure-logic unit tests for `PocketTtsConstantsLoader`'s file-loading
/// behavior. The full `load(from:)` entry point needs a real SentencePiece
/// tokenizer file (and the rest of the language pack), so these tests
/// drive the smaller `loadBosBeforeVoiceIfPresent(in:)` helper instead.
final class PocketTtsConstantsLoaderTests: XCTestCase {

    /// Unique scratch directory created for each test and removed afterwards.
    private var tmpDir: URL!

    override func setUpWithError() throws {
        try super.setUpWithError()
        tmpDir = FileManager.default.temporaryDirectory
            .appendingPathComponent("FluidAudioPocketTtsLoaderTests-\(UUID().uuidString)")
        try FileManager.default.createDirectory(
            at: tmpDir, withIntermediateDirectories: true)
    }

    override func tearDownWithError() throws {
        if let tmpDir, FileManager.default.fileExists(atPath: tmpDir.path) {
            try FileManager.default.removeItem(at: tmpDir)
        }
        // Drop the stale URL so a later test can never reuse a removed path.
        tmpDir = nil
        try super.tearDownWithError()
    }

    // MARK: - Fixtures

    /// Write `floats` as raw Float32 bytes to `bos_before_voice.bin` inside
    /// the per-test scratch directory.
    private func writeBosBeforeVoiceFixture(_ floats: [Float]) throws {
        let data = floats.withUnsafeBufferPointer { Data(buffer: $0) }
        try data.write(to: tmpDir.appendingPathComponent("bos_before_voice.bin"))
    }

    // MARK: - loadBosBeforeVoiceIfPresent

    func testBosBeforeVoiceReturnsNilWhenMissing() throws {
        // Missing file → nil (cloned-voice v1 prefill will fail later at
        // use time; this code path is fine for snapshot-only voices).
        let result = try PocketTtsConstantsLoader.loadBosBeforeVoiceIfPresent(in: tmpDir)
        XCTAssertNil(result, "Absent bos_before_voice.bin must yield nil")
    }

    func testBosBeforeVoiceLoadsExpectedFloats() throws {
        // Write a synthetic file of `embeddingDim` floats, every value
        // distinct, so truncation, padding or reordering would be detected.
        let dim = PocketTtsConstants.embeddingDim
        let expected: [Float] = (0..<dim).map { Float($0) * 0.001 }
        try writeBosBeforeVoiceFixture(expected)

        // XCTUnwrap fails the test with a clear message on nil, so an
        // unexpected nil is not masked by comparing against an empty array.
        let loaded = try XCTUnwrap(
            PocketTtsConstantsLoader.loadBosBeforeVoiceIfPresent(in: tmpDir))
        XCTAssertEqual(loaded.count, dim)
        XCTAssertEqual(loaded, expected)
    }

    func testBosBeforeVoiceThrowsOnWrongSize() throws {
        // Truncated file (one float short) must be rejected, not silently
        // zero-padded.
        let shortCount = PocketTtsConstants.embeddingDim - 1
        try writeBosBeforeVoiceFixture([Float](repeating: 0, count: shortCount))

        XCTAssertThrowsError(
            try PocketTtsConstantsLoader.loadBosBeforeVoiceIfPresent(in: tmpDir)
        ) { error in
            guard
                let loadError = error as? PocketTtsConstantsLoader.LoadError,
                case .invalidSize(let name, let expected, let actual) = loadError
            else {
                XCTFail("Expected LoadError.invalidSize, got \(error)")
                return
            }
            XCTAssertEqual(name, "bos_before_voice")
            XCTAssertEqual(expected, PocketTtsConstants.embeddingDim)
            XCTAssertEqual(actual, shortCount)
        }
    }
}
|
||||
@@ -0,0 +1,107 @@
|
||||
import Foundation
|
||||
import XCTest
|
||||
|
||||
@testable import FluidAudio
|
||||
|
||||
/// Unit tests for the pure helpers behind `PocketTtsVoiceCloner`:
/// `makeEncoderInputBuffer` (pad/truncate to the fixed encoder length) and
/// `usableFrameCount` (trim encoder output to real-audio frames).
/// `cloneVoice(from:using:)` itself needs an `MLModel`, so only the helpers
/// it delegates to are exercised here.
final class PocketTtsVoiceClonerTests: XCTestCase {

    // MARK: - makeEncoderInputBuffer

    func testEncoderInputBufferPadsShorterAudio() {
        // 180_000 samples (7.5 s @ 24 kHz) against a 240_000-sample encoder
        // window: the tail must be zero padding.
        let sampleCount = 180_000
        let samples: [Float] = (0..<sampleCount).map { Float($0 % 17) - 8 }

        let encoderBuffer = PocketTtsVoiceCloner.makeEncoderInputBuffer(samples)
        let head = Array(encoderBuffer.prefix(sampleCount))
        let tail = encoderBuffer.dropFirst(sampleCount)

        XCTAssertEqual(
            encoderBuffer.count, PocketTtsVoiceCloner.encoderInputSamples,
            "Buffer must always be encoderInputSamples long")
        XCTAssertEqual(
            head, samples,
            "Real samples must be copied verbatim into the prefix")
        XCTAssertTrue(
            tail.allSatisfy { $0 == 0 },
            "Padding region must be zero-filled")
    }

    func testEncoderInputBufferTruncatesLongerAudio() {
        // 360_000 samples (15 s @ 24 kHz) must be cut down to the window.
        let windowLength = PocketTtsVoiceCloner.encoderInputSamples
        let samples: [Float] = (0..<(windowLength + 120_000)).map { Float($0 % 23) - 11 }

        let encoderBuffer = PocketTtsVoiceCloner.makeEncoderInputBuffer(samples)
        let leadingSamples = Array(samples.prefix(windowLength))

        XCTAssertEqual(
            encoderBuffer.count, windowLength,
            "Buffer must always be encoderInputSamples long, never longer")
        XCTAssertEqual(
            encoderBuffer, leadingSamples,
            "Truncation must keep the leading samples")
    }

    func testEncoderInputBufferHandlesExactLength() {
        // Input already matches the window: returned unchanged.
        let windowLength = PocketTtsVoiceCloner.encoderInputSamples
        let samples: [Float] = (0..<windowLength).map { Float($0) * 1e-6 }

        let encoderBuffer = PocketTtsVoiceCloner.makeEncoderInputBuffer(samples)

        XCTAssertEqual(encoderBuffer, samples)
    }

    func testEncoderInputBufferHandlesEmptyInput() {
        // Defensive case: no samples at all yields a full window of zeros.
        let encoderBuffer = PocketTtsVoiceCloner.makeEncoderInputBuffer([])

        XCTAssertEqual(encoderBuffer.count, PocketTtsVoiceCloner.encoderInputSamples)
        XCTAssertTrue(encoderBuffer.allSatisfy { $0 == 0 })
    }

    // MARK: - usableFrameCount

    func testUsableFrameCountRoundsPartialFrameUp() {
        // 180_000 / 1920 = 93.75, so the ceiling is 94 frames even though the
        // encoder reports 125 frames for the full window.
        let frames = PocketTtsVoiceCloner.usableFrameCount(
            realSampleCount: 180_000, availableFrames: 125)

        XCTAssertEqual(frames, 94)
    }

    func testUsableFrameCountCapsAtMaxVoiceFrames() {
        // A full window of real audio plus a hypothetically larger encoder
        // output must still be limited to `maxVoiceFrames`.
        let fullWindow = PocketTtsVoiceCloner.encoderInputSamples
        let frames = PocketTtsVoiceCloner.usableFrameCount(
            realSampleCount: fullWindow, availableFrames: 200)

        XCTAssertEqual(frames, PocketTtsVoiceCloner.maxVoiceFrames)
    }

    func testUsableFrameCountCapsAtAvailableFrames() {
        // Fewer encoder frames than the audio implies: the encoder's count
        // wins so the output buffer is never over-read.
        let fullWindow = PocketTtsVoiceCloner.encoderInputSamples
        let frames = PocketTtsVoiceCloner.usableFrameCount(
            realSampleCount: fullWindow, availableFrames: 80)

        XCTAssertEqual(frames, 80)
    }

    func testUsableFrameCountHandlesExactFrameBoundary() {
        // Exactly 95 whole frames of audio: no rounding involved.
        let wholeFrameSamples = 95 * PocketTtsVoiceCloner.frameSize
        let frames = PocketTtsVoiceCloner.usableFrameCount(
            realSampleCount: wholeFrameSamples, availableFrames: 125)

        XCTAssertEqual(frames, 95)
    }

    func testUsableFrameCountHandlesSubFrameAudio() {
        // Less than one frame of audio rounds up to a single frame. Inputs
        // this short are rejected upstream by the minimum-duration check, so
        // this is mostly a defensive case.
        let frames = PocketTtsVoiceCloner.usableFrameCount(
            realSampleCount: 100, availableFrames: 125)

        XCTAssertEqual(frames, 1)
    }
}
|
||||
Reference in New Issue
Block a user