// joplin/packages/whisper-voice-typing/cpp/utils/findLongestSilence.cpp

#include "findLongestSilence.hpp"
#include "androidUtil.h"

#include <algorithm>
#include <cmath>
#include <vector>

// Applies a first-order high-pass filter (60 Hz cutoff) to `data` in place.
// See https://en.wikipedia.org/wiki/High-pass_filter and the example in
// whisper.cpp/streaming.
//
// data: samples to filter. Modified in place; data[0] is left unchanged and
//       an empty vector is a no-op.
// sampleRate: number of samples per second, used to derive the time step
//       between consecutive samples.
static void highpass(std::vector<float>& data, int sampleRate) {
	// Nothing to filter. This also avoids reading data[0] from an empty vector.
	if (data.empty()) {
		return;
	}

	const float highpassCutoffHz = 60.0f;
	const float RC = 1.0f / (2 * 3.1416f * highpassCutoffHz);
	const float timePerSample = 1.0f / sampleRate;
	const float alpha = RC / (RC + timePerSample);

	// data[i - 1] has already been overwritten with the filtered output, so
	// the previous *input* sample has to be tracked separately.
	float lastInput = data[0];
	for (std::vector<float>::size_type i = 1; i < data.size(); i++) {
		const float currentInput = data[i];
		data[i] = alpha * data[i - 1] + alpha * (currentInput - lastInput);
		lastInput = currentInput;
	}
}

// Counts the samples in the window [windowStart, windowStart + windowSize)
// (truncated to the end of `audio`) for which the sum of the absolute values
// of the most recent `rollingSize` samples is below `threshold`.
static int countQuietSamples(
	const std::vector<float>& audio,
	int windowStart,
	int windowSize,
	int rollingSize,
	float threshold
) {
	const int audioSize = static_cast<int>(audio.size());
	const int windowEnd = std::min(windowStart + windowSize, audioSize);

	float absSum = 0;
	int quietSamples = 0;
	for (int i = windowStart; i < windowEnd; i++) {
		// std::abs (from <cmath>) guarantees the floating-point overload; an
		// unqualified abs() may bind to the int overload depending on which
		// headers are in scope.
		absSum += std::abs(audio[i]);

		// The rolling sum only spans `rollingSize` samples once the loop is
		// that far into the window. Earlier samples are never counted as quiet.
		const bool isSumComplete = i - rollingSize >= windowStart;
		if (isSumComplete) {
			absSum -= std::abs(audio[i - rollingSize]);

			if (absSum < threshold) {
				quietSamples++;
			}
		}
	}

	return quietSamples;
}

// Searches `audioData` for the longest run of quiet audio.
//
// The audio is copied, high-pass filtered, and scanned in fixed-size windows.
// A window is "quiet" when at least 3/4 of `windowSize` samples have a low
// rolling sum of absolute values. A run of quiet windows becomes a candidate
// once it is at least `options.minSilenceLengthSeconds` long.
//
// options.sampleRate: samples per second of `audioData`.
// options.maximumSilenceStartSamples: the scan stops after this sample offset
//     and candidates that start later than it are rejected.
// options.minSilenceLengthSeconds: minimum length of a quiet run before it is
//     treated as a silence candidate.
// options.returnFirstMatch: if true, stop scanning as soon as one candidate
//     has been accepted rather than searching for the longest.
//
// Returns a range with isValid == true and [start, end) given in samples, or
// { isValid = false, start = 0, end = 0 } if no silence was found (including
// when `audioData` is empty). `end` never exceeds audioData.size().
SilenceRange findLongestSilence(
	const std::vector<float>& audioData,
	LongestSilenceOptions options
) {
	// An empty buffer contains no silence. Returning early also keeps the
	// filter below from being run on an empty vector.
	if (audioData.empty()) {
		return { .isValid = false, .start = 0, .end = 0 };
	}

	// Options variables
	const int sampleRate = options.sampleRate;
	const int maxSilencePosition = options.maximumSilenceStartSamples;
	const float minSilenceLengthSeconds = options.minSilenceLengthSeconds;
	const bool returnFirstMatch = options.returnFirstMatch;

	// State
	int bestCandidateLength = 0;
	int bestCandidateStart = -1;
	int bestCandidateEnd = -1;
	int currentCandidateStart = -1; // -1: no candidate in progress

	std::vector<float> processedAudio { audioData };
	highpass(processedAudio, sampleRate);
	const int audioSize = static_cast<int>(processedAudio.size());

	// Break into windows of size `windowSize`:
	const int windowSize = 256;
	const int windowsPerSecond = sampleRate / windowSize;
	int quietWindows = 0; // Number of consecutive relatively quiet windows

	// Per-window detection parameters (the same for every window).
	const int rollingAverageSize = 24;
	const float threshold = static_cast<float>(rollingAverageSize) / 80.0f;
	// NOTE(review): this is 0 when minSilenceLengthSeconds is small enough. In
	// that case a candidate is started even on a window that is not quiet —
	// confirm whether callers rely on that.
	const int minQuietWindows = static_cast<int>(windowsPerSecond * minSilenceLengthSeconds);

	// Finishes the current candidate for longest silence
	auto finalizeCandidate = [&] (int currentOffset) {
		const bool hasCandidate = currentCandidateStart >= 0;
		if (!hasCandidate) {
			return;
		}

		// After the loop, the offset may have stepped past the end of the
		// audio. Clamp so that the reported range stays inside the buffer.
		const int candidateEnd = std::min(currentOffset, audioSize);
		const int currentCandidateLength = candidateEnd - currentCandidateStart;
		if (currentCandidateLength > bestCandidateLength && currentCandidateStart <= maxSilencePosition) {
			bestCandidateLength = currentCandidateLength;
			bestCandidateStart = currentCandidateStart;
			bestCandidateEnd = candidateEnd;
			LOGD("New best candidate with length %d", currentCandidateLength);
		}

		currentCandidateStart = -1;
	};

	int windowOffset;
	for (windowOffset = 0; windowOffset < audioSize && windowOffset <= maxSilencePosition; windowOffset += windowSize) {
		// Count the number of samples that (when averaged with the nearby samples)
		// are below some threshold value.
		const int silentSamples = countQuietSamples(
			processedAudio, windowOffset, windowSize, rollingAverageSize, threshold
		);

		// The window should be considered "quiet" if enough samples were below the threshold.
		// Don't require all of them to be to allow clicks and pops.
		if (silentSamples >= windowSize * 3 / 4) {
			quietWindows ++;
		} else {
			quietWindows = 0;
		}

		if (quietWindows >= minQuietWindows && currentCandidateStart == -1) { // Found silence
			// Ignore the first window, which probably contains some of the start of the audio
			// and the most recent window, which came after windowOffset.
			const int windowsToIgnore = 2;
			const int estimatedQuietSamples = std::max(0, quietWindows - windowsToIgnore) * windowSize;
			currentCandidateStart = windowOffset - estimatedQuietSamples;
		} else if (quietWindows == 0) { // Silence ended
			// Ended a candidate. Is it better than the best?
			finalizeCandidate(windowOffset);

			// Search for more candidates or return now?
			if (returnFirstMatch && bestCandidateLength > 0) {
				break;
			}
		}
	}

	// A candidate may still be open if the audio (or the search range) ended
	// while it was quiet.
	finalizeCandidate(windowOffset);

	// Return the best candidate.
	if (bestCandidateLength == 0) {
		return { .isValid = false, .start = 0, .end = 0 };
	} else {
		return {
			.isValid=true,
			.start=bestCandidateStart,
			.end=bestCandidateEnd
		};
	}
}