This commit is contained in:
Michael Hansen
2020-03-04 11:48:53 -05:00
6 changed files with 189 additions and 55 deletions
+24
View File
@@ -8,6 +8,7 @@ The following table summarizes language support for the various speech to text s
| ------ | ------- | ------- | ------- | ------- | ------- | ------- | ------- | ------- | ------- | ------- | ------- | ------- | ------- |
| [pocketsphinx](speech-to-text.md#pocketsphinx) | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ |
| [kaldi](speech-to-text.md#kaldi) | ✓ | ✓ | | ✓ | | ✓ | | | | | ✓ | | |
| [google](speech-to-text.md#google-cloud) | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
## Pocketsphinx
@@ -77,6 +78,29 @@ Rhasspy expects a Kaldi-compatible profile to contain a `model` directory with a
If you just want to use Rhasspy for general speech to text, you can set `speech_to_text.kaldi.open_transcription` to `true` in your profile. This will use the included general language model (much slower) and ignore any custom voice commands you've specified.
## Google Cloud
Performs speech recognition using the [Google Cloud Speech-to-Text](https://cloud.google.com/speech-to-text) service.
You will need an active Google Cloud subscription and the JSON private key of a service account that is enabled to use
the Speech-to-Text API. The locale configured in your profile is used as the recognition language.
```json
{
"locale": "en_US",
"speech_to_text": {
"system": "google",
"google": {
"credentials": "api-project-xxxxxxxx-abcdef.json",
"min_confidence": 0.7
}
}
}
```
Please note that this module sends the recorded audio only after recording has finished; streaming recognition is not supported.
See `rhasspy.stt.GoogleCloudDecoder` for details.
## Remote HTTP Server
Uses a remote HTTP server to transform speech (WAV) to text.
+61 -48
View File
@@ -1,17 +1,12 @@
#
# Copyright 2018 Picovoice Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE"
# file accompanying this source.
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
import os
@@ -20,7 +15,7 @@ from enum import Enum
class Porcupine(object):
"""Python binding for Picovoice's wake word detection (aka Porcupine) library."""
"""Python binding for Picovoice's wake word detection (Porcupine) engine."""
class PicovoiceStatuses(Enum):
"""Status codes corresponding to 'pv_status_t' defined in 'include/picovoice.h'"""
@@ -29,11 +24,17 @@ class Porcupine(object):
OUT_OF_MEMORY = 1
IO_ERROR = 2
INVALID_ARGUMENT = 3
STOP_ITERATION = 4
KEY_ERROR = 5
INVALID_STATE = 6
_PICOVOICE_STATUS_TO_EXCEPTION = {
PicovoiceStatuses.OUT_OF_MEMORY: MemoryError,
PicovoiceStatuses.IO_ERROR: IOError,
PicovoiceStatuses.INVALID_ARGUMENT: ValueError
PicovoiceStatuses.INVALID_ARGUMENT: ValueError,
PicovoiceStatuses.STOP_ITERATION: StopIteration,
PicovoiceStatuses.KEY_ERROR: KeyError,
PicovoiceStatuses.INVALID_STATE: ValueError,
}
class CPorcupine(Structure):
@@ -48,9 +49,9 @@ class Porcupine(object):
keyword_file_paths=None,
sensitivities=None):
"""
Loads Porcupine's shared library and creates an instance of wake word detection object.
Constructor.
:param library_path: Absolute path to Porcupine's shared library.
:param library_path: Absolute path to Porcupine's dynamic library.
:param model_file_path: Absolute path to file containing model parameters.
:param keyword_file_path: Absolute path to keyword file containing hyper-parameters. If not present then
'keyword_file_paths' will be used.
@@ -64,38 +65,38 @@ class Porcupine(object):
"""
if not os.path.exists(library_path):
raise IOError(f"Could not find Porcupine's library at '{library_path}'")
raise IOError("could'nt find Porcupine's library at '%s'" % library_path)
library = cdll.LoadLibrary(library_path)
if not os.path.exists(model_file_path):
raise IOError(f"Could not find model file at '{model_file_path}'")
raise IOError("could'nt find model file at '%s'" % model_file_path)
if sensitivity is not None and keyword_file_path is not None:
if not os.path.exists(keyword_file_path):
raise IOError(f"Could not find keyword file at '{keyword_file_path}'")
raise IOError("could'nt' find keyword file at '%s'" % keyword_file_path)
keyword_file_paths = [keyword_file_path]
if not (0 <= sensitivity <= 1):
raise ValueError('Sensitivity should be within [0, 1]')
raise ValueError('sensitivity should be within [0, 1]')
sensitivities = [sensitivity]
elif sensitivities is not None and keyword_file_paths is not None:
if len(keyword_file_paths) != len(sensitivities):
raise ValueError("Different number of sensitivity and keyword file path parameters are provided.")
raise ValueError("different number of sensitivity and keyword file path parameters are provided.")
for x in keyword_file_paths:
if not os.path.exists(os.path.expanduser(x)):
raise IOError(f"Could not find keyword file at '{x}'")
raise IOError("could not find keyword file at '%s'" % x)
for x in sensitivities:
if not (0 <= x <= 1):
raise ValueError('Sensitivity should be within [0, 1]')
raise ValueError('sensitivity should be within [0, 1]')
else:
raise ValueError("Sensitivity and/or keyword file path is missing")
raise ValueError("sensitivity and/or keyword file path is missing")
self._num_keywords = len(keyword_file_paths)
init_func = library.pv_porcupine_multiple_keywords_init
init_func = library.pv_porcupine_init
init_func.argtypes = [
c_char_p,
c_int,
@@ -107,44 +108,43 @@ class Porcupine(object):
self._handle = POINTER(self.CPorcupine)()
status = init_func(
model_file_path.encode(),
model_file_path.encode('utf-8'),
self._num_keywords,
(c_char_p * self._num_keywords)(*[os.path.expanduser(x).encode() for x in keyword_file_paths]),
(c_char_p * self._num_keywords)(*[os.path.expanduser(x).encode('utf-8') for x in keyword_file_paths]),
(c_float * self._num_keywords)(*sensitivities),
byref(self._handle))
if status is not self.PicovoiceStatuses.SUCCESS:
raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]('Initialization failed')
self.process_func = library.pv_porcupine_multiple_keywords_process
self.process_func.argtypes = [POINTER(self.CPorcupine), POINTER(c_short), POINTER(c_int)]
self.process_func.restype = self.PicovoiceStatuses
raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]('initialization failed')
self._delete_func = library.pv_porcupine_delete
self._delete_func.argtypes = [POINTER(self.CPorcupine)]
self._delete_func.restype = None
self._sample_rate = library.pv_sample_rate()
self.process_func = library.pv_porcupine_process
self.process_func.argtypes = [POINTER(self.CPorcupine), POINTER(c_short), POINTER(c_int)]
self.process_func.restype = self.PicovoiceStatuses
version_func = library.pv_porcupine_version
version_func.argtypes = []
version_func.restype = c_char_p
self._version = version_func().decode('utf-8')
self._frame_length = library.pv_porcupine_frame_length()
@property
def sample_rate(self):
"""Audio sample rate accepted by Porcupine library."""
self._sample_rate = library.pv_sample_rate()
return self._sample_rate
def delete(self):
"""Releases resources acquired by Porcupine's library."""
@property
def frame_length(self):
"""Number of audio samples per frame expected by C library."""
return self._frame_length
self._delete_func(self._handle)
def process(self, pcm):
"""
Monitors incoming audio stream for given wake word(s).
Processes a frame of the incoming audio stream and emits the detection result.
:param pcm: An array (or array-like) of consecutive audio samples. For more information regarding required audio
properties (i.e. sample rate, number of channels encoding, and number of samples per frame) please refer to
'include/pv_porcupine.h'.
:param pcm: A frame of audio samples. The number of samples per frame can be attained by calling
'.frame_length'. The incoming audio needs to have a sample rate equal to '.sample_rate' and be 16-bit
linearly-encoded. Porcupine operates on single-channel audio.
:return: For a single wake-word use cse True if wake word is detected. For multiple wake-word use case it
returns the index of detected wake-word. Indexing is 0-based and according to ordering of input keyword file
paths. It returns -1 when no keyword is detected.
@@ -153,7 +153,7 @@ class Porcupine(object):
result = c_int()
status = self.process_func(self._handle, (c_short * len(pcm))(*pcm), byref(result))
if status is not self.PicovoiceStatuses.SUCCESS:
raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]('Processing failed')
raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]()
keyword_index = result.value
@@ -162,7 +162,20 @@ class Porcupine(object):
else:
return keyword_index
def delete(self):
"""Releases resources acquired by Porcupine's library."""
@property
def version(self):
"""Getter for version"""
self._delete_func(self._handle)
return self._version
@property
def frame_length(self):
"""Getter for number of audio samples per frame."""
return self._frame_length
@property
def sample_rate(self):
"""Audio sample rate accepted by Picovoice."""
return self._sample_rate
+7 -7
View File
@@ -324,31 +324,31 @@
"cache": false
},
"porcupine_params.pv": {
"url": "https://github.com/Picovoice/Porcupine/raw/master/lib/common/porcupine_params.pv",
"url": "https://github.com/Picovoice/porcupine/raw/v1.7/lib/common/porcupine_params.pv",
"cache": false
},
"porcupine.ppn": {
"cache": false,
"x86_64": {
"url": "https://github.com/Picovoice/Porcupine/raw/master/resources/keyword_files/linux/porcupine_linux.ppn"
"url": "https://github.com/Picovoice/Porcupine/raw/v1.7/resources/keyword_files/linux/porcupine_linux.ppn"
},
"armv7l": {
"url": "https://github.com/Picovoice/Porcupine/raw/master/resources/keyword_files/raspberrypi/porcupine_raspberrypi.ppn"
"url": "https://github.com/Picovoice/porcupine/raw/v1.7/resources/keyword_files/raspberry-pi/porcupine_raspberry-pi.ppn"
},
"aarch64": {
"url": "https://github.com/Picovoice/Porcupine/raw/master/resources/keyword_files/raspberrypi/porcupine_raspberrypi.ppn"
"url": "https://github.com/Picovoice/porcupine/raw/v1.7/resources/keyword_files/raspberry-pi/porcupine_raspberry-pi.ppn"
}
},
"libpv_porcupine.so": {
"cache": false,
"x86_64": {
"url": "https://github.com/Picovoice/Porcupine/raw/master/lib/linux/x86_64/libpv_porcupine.so"
"url": "https://github.com/Picovoice/porcupine/raw/v1.7/lib/linux/x86_64/libpv_porcupine.so"
},
"armv7l": {
"url": "https://github.com/Picovoice/Porcupine/raw/master/lib/raspberry-pi/cortex-a53/libpv_porcupine.so"
"url": "https://github.com/Picovoice/porcupine/raw/v1.7/lib/raspberry-pi/cortex-a53/libpv_porcupine.so"
},
"aarch64": {
"url": "https://github.com/Picovoice/Porcupine/raw/master/lib/raspberry-pi/cortex-a53/libpv_porcupine.so"
"url": "https://github.com/Picovoice/porcupine/raw/v1.7/lib/raspberry-pi/cortex-a53/libpv_porcupine.so"
}
}
}
+1
View File
@@ -3,6 +3,7 @@ aiohttp==3.6.2
doit==0.31.1
fuzzywuzzy[speedup]==0.17.0
google-cloud-texttospeech==0.5.0
google-cloud-speech==1.3.1
html5lib==1.0.1
json5==0.7.0
multidict==4.6.1
+11
View File
@@ -389,6 +389,7 @@ class RasaIntentRecognizer(RhasspyActor):
RhasspyActor.__init__(self)
self.project_name = ""
self.parse_url = ""
self.min_confidence: float = 0
def to_started(self, from_state: str) -> None:
"""Transition to started state."""
@@ -397,6 +398,7 @@ class RasaIntentRecognizer(RhasspyActor):
self.project_name = rasa_config.get(
"project_name", f"rhasspy_{self.profile.name}"
)
self.min_confidence = rasa_config.get("min_confidence", 0)
self.parse_url = urljoin(url, "model/parse")
def in_started(self, message: Any, sender: RhasspyActor) -> None:
@@ -406,6 +408,15 @@ class RasaIntentRecognizer(RhasspyActor):
intent = self.recognize(message.text)
intent["intent"]["name"] = intent["intent"]["name"] or ""
logging.debug(repr(intent))
confidence = intent["intent"]["confidence"]
if confidence < self.min_confidence:
intent["intent"]["name"] = ""
self._logger.warning(
"Intent did not meet confidence threshold: %s < %s",
confidence,
self.min_confidence,
)
except Exception:
self._logger.exception("in_started")
intent = empty_intent()
+85
View File
@@ -10,6 +10,9 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Type
from urllib.parse import urljoin
import requests
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from rhasspy.actor import RhasspyActor
from rhasspy.events import TranscribeWav, WavTranscription
@@ -25,6 +28,7 @@ def get_decoder_class(system: str) -> Type[RhasspyActor]:
"pocketsphinx",
"kaldi",
"remote",
"google",
"hass_stt",
"command",
], f"Invalid speech to text system: {system}"
@@ -38,6 +42,9 @@ def get_decoder_class(system: str) -> Type[RhasspyActor]:
if system == "remote":
# Use remote Rhasspy server
return RemoteDecoder
if system == "google":
# Use remote Google Cloud
return GoogleCloudDecoder
if system == "hass_stt":
# Use Home Assistant STT platform
return HomeAssistantSTTIntegration
@@ -320,6 +327,84 @@ class RemoteDecoder(RhasspyActor):
return response.text
# -----------------------------------------------------------------------------
# Google Cloud Speech-to-text decoder
# -----------------------------------------------------------------------------
class GoogleCloudDecoder(RhasspyActor):
    """Forwards speech to text requests to the Google Cloud STT service.

    The complete WAV recording is sent in a single ``recognize`` call (no
    streaming). Transcriptions whose confidence is below ``min_confidence``
    are replaced with an empty transcription.
    """

    def __init__(self) -> None:
        RhasspyActor.__init__(self)
        # Speech client; created in to_started
        self.client = None
        # Profile locale with "_" replaced by "-" (e.g. en_US -> en-US)
        self.language_code = None
        # Transcriptions below this confidence are discarded
        self.min_confidence: float = 0

    def to_started(self, from_state: str) -> None:
        """Transition to started state.

        Reads the Google settings from the profile and creates the client.
        """
        credentials_file = self.profile.get("speech_to_text.google.credentials")

        # A profile without min_confidence must not leave None here, or the
        # threshold comparison in transcribe_wav raises TypeError.
        min_confidence = self.profile.get("speech_to_text.google.min_confidence")
        self.min_confidence = 0 if min_confidence is None else min_confidence

        self.language_code = self.profile.get("locale").replace("_", "-")

        if credentials_file:
            from google.auth import environment_vars

            # The client picks up the key file path from the environment
            os.environ[environment_vars.CREDENTIALS] = credentials_file
        else:
            # Assigning None to os.environ would raise TypeError; leave the
            # environment untouched and let the client resolve credentials.
            self._logger.warning(
                "No speech_to_text.google.credentials set in profile; "
                "using credentials from the environment"
            )

        self.client = speech.SpeechClient()

    def in_started(self, message: Any, sender: RhasspyActor) -> None:
        """Handle messages in started state.

        A TranscribeWav message is answered with a WavTranscription. Any
        failure is logged and answered with an empty transcription so the
        requester always receives a reply.
        """
        if isinstance(message, TranscribeWav):
            try:
                text, confidence = self.transcribe_wav(message.wav_data)
                self._logger.debug(text)
                self.send(
                    message.receiver or sender,
                    WavTranscription(
                        text, confidence=confidence, handle=message.handle
                    ),
                )
            except Exception:
                self._logger.exception("transcribing wav")

                # Send empty transcription back
                self.send(
                    message.receiver or sender,
                    WavTranscription("", confidence=0, handle=message.handle),
                )

    def transcribe_wav(self, wav_data: bytes) -> Tuple[str, float]:
        """Send WAV data to Google Cloud STT and return (text, confidence).

        Returns ("", 0) when the service returns no usable result or when the
        best alternative is below ``min_confidence``.
        """
        self._logger.debug(
            "POSTing %d byte(s) of WAV data to Google Cloud STT", len(wav_data)
        )

        audio = types.RecognitionAudio(content=wav_data)

        # NOTE(review): audio is declared as 16 kHz LINEAR16; this assumes
        # callers always supply WAV data in that format -- confirm.
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            model="command_and_search",
            language_code=self.language_code,
        )

        response = self.client.recognize(config, audio)

        # Guard against both an empty result list and a result that carries
        # no alternatives before indexing into them.
        if not response.results or not response.results[0].alternatives:
            self._logger.debug("No results returned.")
            return "", 0

        result = response.results[0].alternatives[0]
        self._logger.debug("Transcription confidence: %s", result.confidence)

        if result.confidence >= self.min_confidence:
            return result.transcript, result.confidence

        self._logger.warning(
            "Transcription did not meet confidence threshold: %s < %s",
            result.confidence,
            self.min_confidence,
        )

        return "", 0
# -----------------------------------------------------------------------------
# Kaldi Decoder
# http://kaldi-asr.org