diff --git a/.github/workflows/tts-test.yml b/.github/workflows/tts-test.yml new file mode 100644 index 00000000..ceb53015 --- /dev/null +++ b/.github/workflows/tts-test.yml @@ -0,0 +1,43 @@ +name: TTS Test + +on: + pull_request: + branches: [ main ] + workflow_dispatch: + +jobs: + test-tts: + runs-on: macos-14 + + steps: + - uses: actions/checkout@v4 + + - name: Build FluidAudio + run: | + swift build -c release + + - name: Generate TTS Audio + run: | + echo "🎤 Generating TTS audio (ground truth test)..." + TEXT="I can't believe we finally made it to the summit after climbing for twelve exhausting hours through wind and rain, but wow, this view of the endless mountain ranges stretching to the horizon makes every single difficult step completely worth the journey." + + # This will auto-download model and generate audio + swift run --configuration release fluidaudio tts "$TEXT" --output kokoro_output.wav --auto-download + + # Verify output + if [ -f kokoro_output.wav ]; then + SIZE=$(ls -l kokoro_output.wav | awk '{print $5}') + echo "✅ TTS successful: kokoro_output.wav ($SIZE bytes)" + else + echo "❌ Output file not created" + exit 1 + fi + + - name: Upload Audio Output + if: always() + uses: actions/upload-artifact@v4 + with: + name: kokoro-tts-output + path: kokoro_output.wav + retention-days: 7 + diff --git a/.gitignore b/.gitignore index 4cbe1d76..d3b6ef53 100644 --- a/.gitignore +++ b/.gitignore @@ -97,6 +97,10 @@ FluidAudioDatasets/ *.wav *.mp3 Resources/ +!Sources/FluidAudio/Resources/ +!Sources/FluidAudio/Resources/** scripts/ Documentation/parakeet-tdt/ docs/parakeet-tdt/ + +fluidaudio_cli/* diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md index 96b9ef4e..7c85b9d6 100644 --- a/Documentation/Benchmarks.md +++ b/Documentation/Benchmarks.md @@ -80,6 +80,91 @@ iPhone 16 Pro Max run, and only for models that were reloaded during the session | Decoder | 88.49 | 8.11 | 146.01 | MLComputeUnits(rawValue: 1) | | JointDecision | 48.46 | 7.97 | 
71.85 | MLComputeUnits(rawValue: 1) | +## Text-to-Speech + +We used the same set of strings to generate audio ranging from 1s to ~300s, in order to test speed across a range of input lengths on the PyTorch CPU, MPS, and MLX pipelines, and compared the results against the native Swift version with Core ML models. + +Each pipeline warmed up the models by running through them once with pseudo inputs, and we then compared the raw inference time with the model already loaded. You can see that for the Core ML model, we accepted a longer initial warm-up in exchange for lower memory use and very slightly faster inference. + +Note that the PyTorch Kokoro model has a memory leak issue: https://github.com/hexgrad/kokoro/issues/152 + +The following tests were run on a MacBook Pro (M4 Pro, 48GB RAM). If you have another device, please do try replicating them as well! + +### Kokoro-82M PyTorch (CPU) + +```bash +KPipeline benchmark for voice af_heart (warm-up took 0.175s) using hexgrad/kokoro +Test Chars Output (s) Inf(s) RTFx Peak GB +1 42 2.750 0.187 14.737x 1.44 +2 129 8.625 0.530 16.264x 1.85 +3 254 15.525 0.923 16.814x 2.65 +4 93 6.125 0.349 17.566x 2.66 +5 104 7.200 0.410 17.567x 2.70 +6 130 9.300 0.504 18.443x 2.72 +7 197 12.850 0.726 17.711x 2.83 +8 6 1.350 0.098 13.823x 2.83 +9 1228 76.200 4.342 17.551x 3.19 +10 567 35.200 2.069 17.014x 4.85 +11 4615 286.525 17.041 16.814x 4.78 +Total - 461.650 27.177 16.987x 4.85 +``` + +### Kokoro-82M PyTorch (MPS) + +I wasn't able to run the MPS model for longer durations; even with `PYTORCH_ENABLE_MPS_FALLBACK=1` enabled, it kept crashing on the longer strings. 
+ +```bash +KPipeline benchmark for voice af_heart (warm-up took 0.568s) using pip package +Test Chars Output (s) Inf(s) RTFx Peak GB +1 42 2.750 0.414 6.649x 1.41 +2 129 8.625 0.729 11.839x 1.54 +Total - 11.375 1.142 9.960x 1.54 +``` + +### Kokoro-82M MLX Pipeline + +```bash +TTS benchmark for voice af_heart (warm-up took an extra 2.155s) using model prince-canuma/Kokoro-82M +Test Chars Output (s) Inf(s) RTFx Peak GB +1 42 2.750 0.347 7.932x 1.12 +2 129 8.650 0.597 14.497x 2.47 +3 254 15.525 0.825 18.829x 2.65 +4 93 6.125 0.306 20.039x 2.65 +5 104 7.200 0.343 21.001x 2.65 +6 130 9.300 0.560 16.611x 2.65 +7 197 12.850 0.596 21.573x 2.65 +8 6 1.350 0.364 3.706x 2.65 +9 1228 76.200 2.979 25.583x 3.29 +10 567 35.200 1.374 25.615x 3.37 +11 4615 286.500 11.112 25.783x 3.37 +Total - 461.650 19.401 23.796x 3.37 +``` + +### Swift + FluidAudio Core ML models + +Note that it takes `~15s` to compile the model on the first run; subsequent runs are shorter, and we expect ~2s to load. + +```bash +> swift run fluidaudio tts --benchmark +... 
+FluidAudio TTS benchmark for voice af_heart (warm-up took an extra 2.348s) +Test Chars Ouput (s) Inf(s) RTFx +1 42 2.825 0.440 6.424x +2 129 7.725 0.594 13.014x +3 254 13.400 0.776 17.278x +4 93 5.875 0.587 10.005x +5 104 6.675 0.613 10.889x +6 130 8.075 0.621 13.008x +7 197 10.650 0.627 16.983x +8 6 0.825 0.360 2.290x +9 1228 67.625 2.362 28.625x +10 567 33.025 1.341 24.619x +11 4269 247.600 9.087 27.248x +Total - 404.300 17.408 23.225 + +Peak memory usage (process-wide): 1.503 GB +``` + ## Voice Activity Detection Model is nearly identical to the base model in terms of quality, perforamnce wise we see an up to ~3.5x improvement compared to the silero Pytorch VAD model with the 256ms batch model (8 chunks of 32ms) diff --git a/Documentation/EspeakFramework.md b/Documentation/EspeakFramework.md new file mode 100644 index 00000000..37758ef4 --- /dev/null +++ b/Documentation/EspeakFramework.md @@ -0,0 +1,30 @@ +# eSpeak-NG Framework Packaging + +FluidAudio bundles the eSpeak-NG phoneme resources so Kokoro can fall back to G2P lookups when the US lexicons don’t contain a word. The Core ML pipeline expects the resources under `Resources/espeak-ng/espeak-ng-data.bundle` with the canonical `voices/` directory inside. + +## All Platforms (Primary Flow) +- `TtsResourceDownloader.ensureEspeakDataBundle` first attempts to stage the packaged `espeak-ng-data.bundle` from SwiftPM resources (`Sources/FluidAudio/Resources/espeak-ng/`). +- The bundle is copied to `~/.cache/fluidaudio/Models/kokoro/Resources/espeak-ng/`. +- The `voices/` directory is validated after staging; if missing, `TTSError.downloadFailed` is raised. + +## Fallback Behavior (macOS Only) +- If the packaged bundle is unavailable, **macOS only** falls back to downloading `espeak-ng.zip` from HuggingFace and extracting it with `/usr/bin/unzip`. +- **iOS/tvOS/watchOS** do not support fallback downloads and will throw `TTSError.downloadFailed` if the packaged bundle is missing. 
+- For mobile platforms, ensure the packaged bundle is present in the Swift package resources before building. + +## Best practices +- Keep the `espeak-ng-data.bundle` (packaged copy) and the optional `espeak-ng.zip` fallback in sync with any updates to the Kokoro phoneme mapper. +- If you customize the cache location, be sure the `Resources/espeak-ng/espeak-ng-data.bundle/voices/` directory is present before running TTS. +- When testing on iOS, bundle the extracted resources with the app or seed the simulator cache in advance to avoid runtime failures. + +## CocoaPods integration notes +- The `ESpeakNG.xcframework` now includes support for iOS device (arm64), iOS Simulator (arm64 + x86_64), and macOS (arm64 + x86_64). +- iOS Simulator support is provided via a stub framework that allows building and linking but returns failure values for ESpeakNG function calls. +- Pod validation passes successfully with `pod lib lint FluidAudio.podspec --allow-warnings` for all platforms. +- On iOS Simulator, ESpeakNG initialization will fail gracefully and phonemization requests will return `nil` due to the stub implementation. +- Full ESpeakNG functionality is available on iOS device and macOS platforms. + +## Licensing notes +- eSpeak-NG is distributed under the GNU GPL v3 (or later). Both the core library and the `espeak-ng-data` voices inherit the same license. +- The full license text now lives at `Licenses/ESpeakNG_LICENSE.txt`; ship this file (or the upstream `COPYING`) anywhere the framework is redistributed and surface it in your third-party notices UI. +- If you republish the prebuilt `ESpeakNG.xcframework`, keep the license alongside the binary and ensure downstream consumers can obtain the corresponding source per GPL requirements. 
diff --git a/Documentation/TTS/README.md b/Documentation/TTS/README.md new file mode 100644 index 00000000..6f155591 --- /dev/null +++ b/Documentation/TTS/README.md @@ -0,0 +1,67 @@ +# Text-To-Speech (TTS) Code Examples + +> **⚠️ Beta:** The TTS system is currently in beta and only supports American English. Additional language support is planned for future releases. + +Quick recipes for running the Kokoro synthesis stack. + +## CLI quick start + +```bash +swift run fluidaudio tts "Welcome to FluidAudio text to speech" \ + --output ~/Desktop/demo.wav \ + --voice af_heart +``` + +The first invocation downloads Kokoro models, phoneme dictionaries, and voice embeddings; later runs reuse the +cached assets. + +## Swift async usage + +```swift +import FluidAudio +import Foundation + +@main +struct DemoTTS { + static func main() async { + guard #available(macOS 13.0, *) else { + print("FluidAudio TTS requires macOS 13 or newer.") + return + } + + let manager = TtSManager() + + do { + try await manager.initialize() + let audioData = try await manager.synthesize(text: "Hello from FluidAudio!") + + let outputURL = URL(fileURLWithPath: "/tmp/fluidaudio-demo.wav") + try audioData.write(to: outputURL) + print("Saved synthesized audio to: \(outputURL.path)") + } catch { + print("Synthesis failed: \(error)") + } + } +} +``` + +Swap in `manager.initialize(models:)` when you want to preload only the long-form `.fifteenSecond` variant. + +## Inspecting chunk metadata + +```swift +let manager = TtSManager() +try await manager.initialize() + +let detailed = try await manager.synthesizeDetailed( + text: "FluidAudio can report chunk splits for you.", + variantPreference: .fifteenSecond +) + +for chunk in detailed.chunks { + print("Chunk #\(chunk.index) -> variant: \(chunk.variant), tokens: \(chunk.tokenCount)") + print(" text: \(chunk.text)") +} +``` + +`KokoroSynthesizer.SynthesisResult` also exposes `diagnostics` for per-run variant and audio footprint totals. 
diff --git a/FluidAudio.podspec b/FluidAudio.podspec index 08dc2495..cef030a8 100644 --- a/FluidAudio.podspec +++ b/FluidAudio.podspec @@ -18,7 +18,28 @@ Pod::Spec.new do |spec| spec.source = { :git => "https://github.com/FluidInference/FluidAudio.git", :tag => "v#{spec.version}" } spec.source_files = "Sources/FluidAudio/**/*.swift" - + + # iOS Configuration + # Exclude TTS module from iOS builds to avoid ESpeakNG xcframework linking issues. + # CocoaPods has known limitations with vendored xcframeworks during pod lib lint on iOS: + # the framework symbols aren't properly linked in the temporary build environment, + # causing "Undefined symbols" linker errors even though the binary is valid. + # iOS builds include: ASR (speech recognition), Diarization, and VAD (voice activity detection). + spec.ios.exclude_files = "Sources/FluidAudio/TextToSpeech/**/*" + spec.ios.frameworks = "CoreML", "AVFoundation", "Accelerate", "UIKit" + + # macOS Configuration + # ESpeakNG framework is only vendored for macOS in the podspec (not a framework limitation). + # The xcframework supports iOS, but CocoaPods fails to link it during iOS validation. + # This enables TTS (text-to-speech) functionality with G2P (grapheme-to-phoneme) conversion. + # macOS builds include: ASR, Diarization, VAD, and TTS with ESpeakNG support. 
+ spec.osx.vendored_frameworks = "Sources/FluidAudio/Frameworks/ESpeakNG.xcframework" + spec.osx.frameworks = "CoreML", "AVFoundation", "Accelerate", "Cocoa" + spec.swift_versions = ["5.10"] - spec.frameworks = "CoreML", "AVFoundation", "Accelerate" + + # Enable module definition for proper framework imports + spec.pod_target_xcconfig = { + 'DEFINES_MODULE' => 'YES' + } end diff --git a/Package.swift b/Package.swift index a4ca1156..77c4d1cb 100644 --- a/Package.swift +++ b/Package.swift @@ -19,9 +19,15 @@ let package = Package( ], dependencies: [], targets: [ + .binaryTarget( + name: "ESpeakNG", + path: "Sources/FluidAudio/Frameworks/ESpeakNG.xcframework" + ), .target( name: "FluidAudio", - dependencies: [], + dependencies: [ + "ESpeakNG", + ], path: "Sources/FluidAudio", exclude: [] ), diff --git a/README.md b/README.md index 3a67a19e..31ac3743 100644 --- a/README.md +++ b/README.md @@ -281,11 +281,61 @@ swift run fluidaudio vad-analyze path/to/audio.wav --streaming # Benchmark accuracy/precision trade-offs swift run fluidaudio vad-benchmark --num-files 50 --threshold 0.3 ``` - `swift run fluidaudio vad-analyze --help` lists every tuning option, including negative-threshold overrides, max-speech splitting, padding, and chunk size. Offline mode also reports RTFx using the model's per-chunk processing time. +## Text‑To‑Speech (TTS) + +> **⚠️ Beta:** The TTS system is currently in beta and only supports American English. Additional language support is planned for future releases. + +- Model: Kokoro (CoreML unified model) +- Language: American English (beta) +- G2P: Dictionary first, then eSpeak NG (CEspeakNG) for OOV words +- Output: 24 kHz mono WAV + +Requirements (macOS) +Ensure eSpeak NG headers/libs are available via pkg-config (`espeak-ng`). +https://github.com/espeak-ng/espeak-ng/tree/master + +### Quick Start (CLI) + +```bash +# First run will download the Kokoro model and vocab +swift run fluidaudio tts "Hello from FluidAudio." 
--auto-download --output out.wav + +# Another example with punctuation and OOV handling +swift run fluidaudio tts "Edge-cases: URLs like https://example.com and e-mail test@example.com." --output out2.wav +``` + +Notes +- The TTS pipeline uses a word→phoneme dictionary first; unknown words are phonemized with eSpeak NG (C API) and mapped to the model’s token set. +- OOV words are printed with their IPA and mapped tokens for visibility during synthesis. +- We do not prepend any “language token” to avoid leading vowel artifacts. + +### Quick Start (Code) + +```swift +import FluidAudio + +Task { + do { + let data = try await KokoroModel.synthesize(text: "Hello from FluidAudio.") + try data.write(to: URL(fileURLWithPath: "out.wav")) + } catch { + print("TTS error: \(error)") + } +} +``` + +Troubleshooting +Build requires eSpeak NG headers/libs for the C API discoverable via pkg-config (`espeak-ng`). +- If SwiftPM cannot find headers, build with explicit paths: + - `swift build -Xcc -I/opt/homebrew/include -Xlinker -L/opt/homebrew/lib` +- Dictionary and model assets are cached under `~/.cache/fluidaudio/Models/kokoro`. + +## Showcase + ## Showcase Make a PR if you want to add your app! 
@@ -326,6 +376,8 @@ Parakeet-mlx: https://github.com/senstella/parakeet-mlx silero-vad: https://github.com/snakers4/silero-vad +Kokoro-82M: https://huggingface.co/hexgrad/Kokoro-82M + ### Citation If you use FluidAudio in your work, please cite: diff --git a/Sources/FluidAudio/DownloadUtils.swift b/Sources/FluidAudio/DownloadUtils.swift index 294618d3..c3d8f0ca 100644 --- a/Sources/FluidAudio/DownloadUtils.swift +++ b/Sources/FluidAudio/DownloadUtils.swift @@ -169,7 +169,7 @@ public class DownloadUtils { } do { - // Validate model directory structure before loading + // Validate model directory structure before loading (.mlmodelc bundle) var isDirectory: ObjCBool = false guard FileManager.default.fileExists( @@ -184,7 +184,6 @@ public class DownloadUtils { ]) } - // Check for essential model files let coremlDataPath = modelPath.appendingPathComponent("coremldata.bin") guard FileManager.default.fileExists(atPath: coremlDataPath.path) else { logger.error("Missing coremldata.bin in \(name)") @@ -209,11 +208,10 @@ public class DownloadUtils { } catch { logger.error("Failed to load model \(name): \(error)") - // List directory contents for debugging if let contents = try? 
FileManager.default.contentsOfDirectory( - at: modelPath, includingPropertiesForKeys: nil) + atPath: modelPath.deletingLastPathComponent().path) { - logger.error("Model directory contents: \(contents.map { $0.lastPathComponent })") + logger.error("Model directory contents: \(contents)") } throw error @@ -223,6 +221,22 @@ public class DownloadUtils { return models } + /// Get required model names for a given repository + /// Uses centralized ModelNames where available to avoid cross‑type coupling + @available(macOS 13.0, iOS 16.0, *) + private static func getRequiredModelNames(for repo: Repo) -> Set { + switch repo { + case .vad: + return ModelNames.VAD.requiredModels + case .parakeet, .parakeetV2: + return ModelNames.ASR.requiredModels + case .diarizer: + return ModelNames.Diarizer.requiredModels + case .kokoro: + return ModelNames.TTS.requiredModels + } + } + /// Download a HuggingFace repository private static func downloadRepo(_ repo: Repo, to directory: URL) async throws { logger.info("Downloading \(repo.folderName) from HuggingFace...") @@ -230,8 +244,8 @@ public class DownloadUtils { let repoPath = directory.appendingPathComponent(repo.folderName) try FileManager.default.createDirectory(at: repoPath, withIntermediateDirectories: true) - // Get the required model names for this repo from the appropriate manager - let requiredModels = ModelNames.getRequiredModelNames(for: repo) + // Get the required model names for this repo + let requiredModels = getRequiredModelNames(for: repo) // Download all repository contents let files = try await listRepoFiles(repo) @@ -273,7 +287,7 @@ public class DownloadUtils { /// List files in a HuggingFace repository private static func listRepoFiles(_ repo: Repo, path: String = "") async throws -> [RepoFile] { let apiPath = path.isEmpty ? "tree/main" : "tree/main/\(path)" - let apiURL = URL(string: "https://huggingface.co/api/models/\(repo.rawValue)/\(apiPath)")! 
+ let apiURL = URL(string: "https://huggingface.co/api/models/\(repo.remotePath)/\(apiPath)")! var request = URLRequest(url: apiURL) request.timeoutInterval = 30 @@ -358,7 +372,6 @@ public class DownloadUtils { let parentDir = destination.deletingLastPathComponent() try FileManager.default.createDirectory(at: parentDir, withIntermediateDirectories: true) - // Check if file already exists and is complete if let attrs = try? FileManager.default.attributesOfItem(atPath: destination.path), let fileSize = attrs[.size] as? Int64, fileSize == expectedSize @@ -382,7 +395,7 @@ public class DownloadUtils { // Download URL let downloadURL = URL( - string: "https://huggingface.co/\(repo.rawValue)/resolve/main/\(path)")! + string: "https://huggingface.co/\(repo.remotePath)/resolve/main/\(path)")! // Download the file (no retries) do { diff --git a/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/Info.plist b/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/Info.plist new file mode 100644 index 00000000..ffe61b23 --- /dev/null +++ b/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/Info.plist @@ -0,0 +1,59 @@ + + + + + AvailableLibraries + + + BinaryPath + ESpeakNG.framework/Versions/A/ESpeakNG + LibraryIdentifier + macos-arm64_x86_64 + LibraryPath + ESpeakNG.framework + SupportedArchitectures + + arm64 + x86_64 + + SupportedPlatform + macos + + + BinaryPath + ESpeakNG.framework/ESpeakNG + LibraryIdentifier + ios-arm64 + LibraryPath + ESpeakNG.framework + SupportedArchitectures + + arm64 + + SupportedPlatform + ios + + + BinaryPath + ESpeakNG.framework/ESpeakNG + LibraryIdentifier + ios-arm64_x86_64-simulator + LibraryPath + ESpeakNG.framework + SupportedArchitectures + + arm64 + x86_64 + + SupportedPlatform + ios + SupportedPlatformVariant + simulator + + + CFBundlePackageType + XFWK + XCFrameworkFormatVersion + 1.0 + + diff --git a/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/ESpeakNG 
b/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/ESpeakNG new file mode 100755 index 00000000..f4f3b538 Binary files /dev/null and b/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/ESpeakNG differ diff --git a/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/ESpeakNG.h b/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/ESpeakNG.h new file mode 100644 index 00000000..542cdffc --- /dev/null +++ b/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/ESpeakNG.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/Headers/ESpeakNG.h b/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/Headers/ESpeakNG.h new file mode 100644 index 00000000..542cdffc --- /dev/null +++ b/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/Headers/ESpeakNG.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/Headers/espeak_ng.h b/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/Headers/espeak_ng.h new file mode 100644 index 00000000..e6025e7c --- /dev/null +++ b/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/Headers/espeak_ng.h @@ -0,0 +1,223 @@ +/* eSpeak NG API. + * + * Copyright (C) 2015-2017 Reece H. Dunn + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef ESPEAK_NG_H +#define ESPEAK_NG_H + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +#if defined(_WIN32) || defined(_WIN64) +#ifdef LIBESPEAK_NG_EXPORT +#define ESPEAK_NG_API __declspec(dllexport) +#else +#define ESPEAK_NG_API __declspec(dllimport) +#endif +#else +#define ESPEAK_NG_API +#endif + +#define ESPEAKNG_DEFAULT_VOICE "en" + +typedef enum { + ENS_GROUP_MASK = 0x70000000, + ENS_GROUP_ERRNO = 0x00000000, /* Values 0-255 map to errno error codes. */ + ENS_GROUP_ESPEAK_NG = 0x10000000, /* eSpeak NG error codes. */ + + /* eSpeak NG 1.49.0 */ + ENS_OK = 0, + ENS_COMPILE_ERROR = 0x100001FF, + ENS_VERSION_MISMATCH = 0x100002FF, + ENS_FIFO_BUFFER_FULL = 0x100003FF, + ENS_NOT_INITIALIZED = 0x100004FF, + ENS_AUDIO_ERROR = 0x100005FF, + ENS_VOICE_NOT_FOUND = 0x100006FF, + ENS_MBROLA_NOT_FOUND = 0x100007FF, + ENS_MBROLA_VOICE_NOT_FOUND = 0x100008FF, + ENS_EVENT_BUFFER_FULL = 0x100009FF, + ENS_NOT_SUPPORTED = 0x10000AFF, + ENS_UNSUPPORTED_PHON_FORMAT = 0x10000BFF, + ENS_NO_SPECT_FRAMES = 0x10000CFF, + ENS_EMPTY_PHONEME_MANIFEST = 0x10000DFF, + ENS_SPEECH_STOPPED = 0x10000EFF, + + /* eSpeak NG 1.49.2 */ + ENS_UNKNOWN_PHONEME_FEATURE = 0x10000FFF, + ENS_UNKNOWN_TEXT_ENCODING = 0x100010FF, +} espeak_ng_STATUS; + +typedef enum { + ENOUTPUT_MODE_SYNCHRONOUS = 0x0001, + ENOUTPUT_MODE_SPEAK_AUDIO = 0x0002, +} espeak_ng_OUTPUT_MODE; + +typedef enum { + ENGENDER_UNKNOWN = 0, + ENGENDER_MALE = 1, + ENGENDER_FEMALE = 2, + ENGENDER_NEUTRAL = 3, +} espeak_ng_VOICE_GENDER; + +typedef struct +{ + void (*outputPhoSymbol)(char* pho_code,int pho_type); + void (*outputSilence)(short echo_tail); + void (*outputVoiced)(short sample); + void (*outputUnvoiced)(short sample); +} espeak_ng_OUTPUT_HOOKS; + +/* eSpeak NG 1.49.0 */ + +typedef struct espeak_ng_ERROR_CONTEXT_ *espeak_ng_ERROR_CONTEXT; + 
+ESPEAK_NG_API void +espeak_ng_ClearErrorContext(espeak_ng_ERROR_CONTEXT *context); + +ESPEAK_NG_API void +espeak_ng_GetStatusCodeMessage(espeak_ng_STATUS status, + char *buffer, + size_t length); + +ESPEAK_NG_API void +espeak_ng_PrintStatusCodeMessage(espeak_ng_STATUS status, + FILE *out, + espeak_ng_ERROR_CONTEXT context); + +ESPEAK_NG_API void +espeak_ng_InitializePath(const char *path); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_Initialize(espeak_ng_ERROR_CONTEXT *context); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_InitializeOutput(espeak_ng_OUTPUT_MODE output_mode, + int buffer_length, + const char *device); + +ESPEAK_NG_API int +espeak_ng_GetSampleRate(void); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_SetParameter(espeak_PARAMETER parameter, + int value, + int relative); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_SetPhonemeEvents(int enable, int ipa); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_SetPunctuationList(const wchar_t *punctlist); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_SetVoiceByName(const char *name); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_SetVoiceByFile(const char *filename); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_SetVoiceByProperties(espeak_VOICE *voice_selector); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_Synthesize(const void *text, + size_t size, + unsigned int position, + espeak_POSITION_TYPE position_type, + unsigned int end_position, + unsigned int flags, + unsigned int *unique_identifier, + void *user_data); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_SynthesizeMark(const void *text, + size_t size, + const char *index_mark, + unsigned int end_position, + unsigned int flags, + unsigned int *unique_identifier, + void *user_data); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_SpeakKeyName(const char *key_name); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_SpeakCharacter(wchar_t character); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_Cancel(void); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_Synchronize(void); + 
+ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_Terminate(void); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_CompileDictionary(const char *dsource, + const char *dict_name, + FILE *log, + int flags, + espeak_ng_ERROR_CONTEXT *context); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_CompileMbrolaVoice(const char *path, + FILE *log, + espeak_ng_ERROR_CONTEXT *context); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_CompilePhonemeData(long rate, + FILE *log, + espeak_ng_ERROR_CONTEXT *context); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_CompileIntonation(FILE *log, + espeak_ng_ERROR_CONTEXT *context); + + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_CompileIntonationPath(const char *source_path, + const char *destination_path, + FILE *log, + espeak_ng_ERROR_CONTEXT *context); + +/* eSpeak NG 1.49.1 */ + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_CompilePhonemeDataPath(long rate, + const char *source_path, + const char *destination_path, + FILE *log, + espeak_ng_ERROR_CONTEXT *context); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_SetOutputHooks(espeak_ng_OUTPUT_HOOKS* hooks); +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_SetConstF0(int f0); + +ESPEAK_NG_API espeak_ng_STATUS +espeak_ng_SetRandSeed(long seed); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/Headers/speak_lib.h b/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/Headers/speak_lib.h new file mode 100644 index 00000000..9c0e2739 --- /dev/null +++ b/Sources/FluidAudio/Frameworks/ESpeakNG.xcframework/ios-arm64/ESpeakNG.framework/Headers/speak_lib.h @@ -0,0 +1,709 @@ +#ifndef SPEAK_LIB_H +#define SPEAK_LIB_H +/*************************************************************************** + * Copyright (C) 2005 to 2012 by Jonathan Duddington * + * email: jonsd@users.sourceforge.net * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General 
Public License as published by * + * the Free Software Foundation; either version 3 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, see: * + * . * + ***************************************************************************/ + + +/*************************************************************/ +/* This is the header file for the library version of espeak */ +/* */ +/*************************************************************/ + +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +#ifdef LIBESPEAK_NG_EXPORT +#define ESPEAK_API __declspec(dllexport) +#else +#define ESPEAK_API __declspec(dllimport) +#endif +#else +#define ESPEAK_API +#endif + +#define ESPEAK_API_REVISION 12 +/* +Revision 2 + Added parameter "options" to eSpeakInitialize() + +Revision 3 + Added espeakWORDGAP to espeak_PARAMETER + +Revision 4 + Added flags parameter to espeak_CompileDictionary() + +Revision 5 + Added espeakCHARS_16BIT + +Revision 6 + Added macros: espeakRATE_MINIMUM, espeakRATE_MAXIMUM, espeakRATE_NORMAL + +Revision 7 24.Dec.2011 + Changed espeak_EVENT structure to add id.string[] for phoneme mnemonics. + Added espeakINITIALIZE_PHONEME_IPA option for espeak_Initialize() to report phonemes as IPA names. + +Revision 8 26.Apr.2013 + Added function espeak_TextToPhonemes(). + +Revision 9 30.May.2013 + Changed function espeak_TextToPhonemes(). + +Revision 10 29.Aug.2014 + Changed phonememode parameter to espeak_TextToPhonemes() and espeak_SetPhonemeTrace + +Revision 11 (espeak-ng) + Made ESPEAK_API import/export symbols correctly on Windows. 
+ +Revision 12 (espeak-ng) + Exposed espeak_SetPhonemeCallback. This is available in eSpeak, but was not exposed in this header. + +*/ + /********************/ + /* Initialization */ + /********************/ + +// values for 'value' in espeak_SetParameter(espeakRATE, value, 0), nominally in words-per-minute +#define espeakRATE_MINIMUM 80 +#define espeakRATE_MAXIMUM 450 +#define espeakRATE_NORMAL 175 + + +typedef enum { + espeakEVENT_LIST_TERMINATED = 0, // Retrieval mode: terminates the event list. + espeakEVENT_WORD = 1, // Start of word + espeakEVENT_SENTENCE = 2, // Start of sentence + espeakEVENT_MARK = 3, // Mark + espeakEVENT_PLAY = 4, // Audio element + espeakEVENT_END = 5, // End of sentence or clause + espeakEVENT_MSG_TERMINATED = 6, // End of message + espeakEVENT_PHONEME = 7, // Phoneme, if enabled in espeak_Initialize() + espeakEVENT_SAMPLERATE = 8 // Set sample rate +} espeak_EVENT_TYPE; + + + +typedef struct { + espeak_EVENT_TYPE type; + unsigned int unique_identifier; // message identifier (or 0 for key or character) + int text_position; // the number of characters from the start of the text + int length; // word length, in characters (for espeakEVENT_WORD) + int audio_position; // the time in mS within the generated speech output data + int sample; // sample id (internal use) + void* user_data; // pointer supplied by the calling program + union { + int number; // used for WORD and SENTENCE events. + const char *name; // used for MARK and PLAY events. UTF8 string + char string[8]; // used for phoneme names (UTF8). Terminated by a zero byte unless the name needs the full 8 bytes. + } id; +} espeak_EVENT; +/* + When a message is supplied to espeak_synth, the request is buffered and espeak_synth returns. When the message is really processed, the callback function will be repetedly called. + + + In RETRIEVAL mode, the callback function supplies to the calling program the audio data and an event list terminated by 0 (LIST_TERMINATED). 
+ + In PLAYBACK mode, the callback function is called as soon as an event happens. + + For example suppose that the following message is supplied to espeak_Synth: + "hello, hello." + + + * Once processed in RETRIEVAL mode, it could lead to 3 calls of the callback function : + + ** Block 1: +