mirror of
https://github.com/FluidInference/FluidAudio.git
synced 2026-05-12 20:20:36 +00:00
7fd5ac5446
### Why is this change needed? <!-- Explain the motivation for this change. What problem does it solve? --> Keeping the streaming one around, as the VBx and AHC clustering gets pretty expensive after 30 minutes of audio and running it constantly gets expensive. It's still possible to support clustering between files, but we will save that for another PR. Pyannote's benchmark is around 11%. I increased the step to 0.2s instead of 0.1s to double the speed; selective fp16 also results in more operations running on the ANE, but it means that we lose some precision. ``` Average DER: 14.95% | Median DER: 10.89% | Average JER: 39.27% | Median JER: 40.74% (collar=0.25s, ignoreOverlap=True) Average RTFx: 139.63 (from 232 clips) Metrics summary saved to: /Users/brandonweng/FluidAudioDatasets/voxconverse/metrics/test_metrics_release.json Completed. New results: 232, Skipped existing: 0, Total attempted: 232 ``` See benchmark.md for more info, but compared to the PyTorch model, we are 100x faster than the CPU version and ~6x faster than the MPS backend on mb pro 4 --------- Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> Co-authored-by: Brandon Weng <BrandonWeng@users.noreply.github.com> Co-authored-by: Alex <36247722+Alex-Wengg@users.noreply.github.com> Co-authored-by: Alex-Wengg <hanweng9@gmail.com>
74 lines
3.4 KiB
Ruby
74 lines
3.4 KiB
Ruby
# CocoaPods specification for the FluidAudio Swift SDK.
#
# Subspecs:
#   * FluidAudio/FastClusterWrapper - C++ sources, compiled without ARC as C++17.
#   * FluidAudio/Core (default)     - Swift sources; depends on FastClusterWrapper.
Pod::Spec.new do |spec|
  spec.name = "FluidAudio"
  spec.version = "0.7.4"
  spec.summary = "Speaker diarization, voice-activity-detection and transcription with CoreML"
  spec.description = <<-DESC
    Fluid Audio is a Swift SDK for fully local, low-latency audio AI on Apple devices,
    with inference offloaded to the Apple Neural Engine (ANE). The SDK includes
    state-of-the-art speaker diarization, transcription, and voice activity detection
    via open-source models that can be integrated with just a few lines of code.
  DESC

  spec.homepage = "https://github.com/FluidInference/FluidAudio"
  spec.license = { :type => "Apache 2.0", :file => "LICENSE" }
  spec.author = { "FluidInference" => "info@fluidinference.com" }

  spec.ios.deployment_target = "17.0"
  spec.osx.deployment_target = "14.0"

  # The git tag is derived from spec.version, so bumping the version above is
  # enough to point the pod at the matching "v<version>" tag.
  spec.source = { :git => "https://github.com/FluidInference/FluidAudio.git", :tag => "v#{spec.version}" }
  spec.swift_versions = ["5.10"]

  # Architecture settings shared by the pod target and the consuming target:
  # build arm64 only and exclude the Intel slices (x86_64, plus i386 on the
  # iOS simulator). Kept in one place so the two xcconfig hashes cannot drift.
  arm64_only_settings = {
    'ARCHS[sdk=macosx*]' => 'arm64',
    'EXCLUDED_ARCHS[sdk=macosx*]' => 'x86_64',
    'ARCHS[sdk=iphonesimulator*]' => 'arm64',
    'EXCLUDED_ARCHS[sdk=iphonesimulator*]' => 'i386 x86_64',
    'ARCHS[sdk=iphoneos*]' => 'arm64'
  }

  # Settings applied to the pod's own target.
  spec.pod_target_xcconfig = { 'DEFINES_MODULE' => 'YES' }.merge(arm64_only_settings)

  # Settings applied to the target that integrates the pod.
  # NOTE(review): the CocoaPods reference discourages user_target_xcconfig because
  # it alters the integrating project's build settings - confirm this is still required.
  spec.user_target_xcconfig = arm64_only_settings

  # C++ clustering wrapper: built without ARC, compiled as C++17.
  spec.subspec "FastClusterWrapper" do |wrapper|
    wrapper.requires_arc = false
    wrapper.source_files = "Sources/FastClusterWrapper/**/*.{cpp,h,hpp}"
    # Only FastClusterWrapper.h is public; the internal .hpp stays private.
    wrapper.public_header_files = "Sources/FastClusterWrapper/include/FastClusterWrapper.h"
    wrapper.private_header_files = "Sources/FastClusterWrapper/fastcluster_internal.hpp"
    wrapper.header_mappings_dir = "Sources/FastClusterWrapper"
    wrapper.pod_target_xcconfig = {
      'CLANG_CXX_LANGUAGE_STANDARD' => 'c++17'
    }
  end

  # Swift SDK sources; installed by default (see default_subspecs below).
  spec.subspec "Core" do |core|
    core.dependency "#{spec.name}/FastClusterWrapper"
    core.source_files = "Sources/FluidAudio/**/*.swift"

    # iOS Configuration
    # Exclude TTS module from iOS builds to avoid ESpeakNG xcframework linking issues.
    # CocoaPods has known limitations with vendored xcframeworks during pod lib lint on iOS:
    # the framework symbols aren't properly linked in the temporary build environment,
    # causing "Undefined symbols" linker errors even though the binary is valid.
    # iOS builds include: ASR (speech recognition), Diarization, and VAD (voice activity detection).
    core.ios.exclude_files = "Sources/FluidAudio/TextToSpeech/**/*"
    core.ios.frameworks = "CoreML", "AVFoundation", "Accelerate", "UIKit"

    # macOS Configuration
    # ESpeakNG framework is only vendored for macOS in the podspec (not a framework limitation).
    # The xcframework supports iOS, but CocoaPods fails to link it during iOS validation.
    # This enables TTS (text-to-speech) functionality with G2P (grapheme-to-phoneme) conversion.
    # macOS builds include: ASR, Diarization, VAD, and TTS with ESpeakNG support.
    core.osx.vendored_frameworks = "Sources/FluidAudio/Frameworks/ESpeakNG.xcframework"
    core.osx.frameworks = "CoreML", "AVFoundation", "Accelerate", "Cocoa"
  end

  spec.default_subspecs = ["Core"]
end