Merge branch 'master' of https://github.com/synesthesiam/rhasspy
This commit is contained in:
@@ -8,6 +8,7 @@ The following table summarizes language support for the various speech to text s
|
||||
| ------ | ------- | ------- | ------- | ------- | ------- | ------- | ------- | ------- | ------- | ------- | ------- | ------- | ------- |
|
||||
| [pocketsphinx](speech-to-text.md#pocketsphinx) | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ |
|
||||
| [kaldi](speech-to-text.md#kaldi) | ✓ | ✓ | | ✓ | | ✓ | | | | | ✓ | | |
|
||||
| [google](speech-to-text.md#google-cloud) | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
|
||||
|
||||
## Pocketsphinx
|
||||
|
||||
@@ -77,6 +78,29 @@ Rhasspy expects a Kaldi-compatible profile to contain a `model` directory with a
|
||||
|
||||
If you just want to use Rhasspy for general speech to text, you can set `speech_to_text.kaldi.open_transcription` to `true` in your profile. This will use the included general language model (much slower) and ignore any custom voice commands you've specified.
|
||||
|
||||
## Google Cloud
|
||||
|
||||
Performs speech recognition using the [Google Cloud Speech-to-Text](https://cloud.google.com/speech-to-text) service.
|
||||
You will need an active Google Cloud subscription and a JSON private key for a service account that is enabled to use
|
||||
the speech-to-text API. The locale configured in your profile will be used for speech recognition.
|
||||
|
||||
```json
|
||||
{
|
||||
"locale": "en_US",
|
||||
"speech_to_text": {
|
||||
"system": "google",
|
||||
"google": {
|
||||
"credentials": "api-project-xxxxxxxx-abcdef.json",
|
||||
"min_confidence": 0.7
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Please note that this module sends the recorded audio only after recording has completed, so streaming recognition is not supported.
|
||||
|
||||
See `rhasspy.stt.GoogleCloudDecoder` for details.
|
||||
|
||||
## Remote HTTP Server
|
||||
|
||||
Uses a remote HTTP server to transform speech (WAV) to text.
|
||||
|
||||
+61
-48
@@ -1,17 +1,12 @@
|
||||
#
|
||||
# Copyright 2018 Picovoice Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE"
|
||||
# file accompanying this source.
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations under the License.
|
||||
#
|
||||
|
||||
import os
|
||||
@@ -20,7 +15,7 @@ from enum import Enum
|
||||
|
||||
|
||||
class Porcupine(object):
|
||||
"""Python binding for Picovoice's wake word detection (aka Porcupine) library."""
|
||||
"""Python binding for Picovoice's wake word detection (Porcupine) engine."""
|
||||
|
||||
class PicovoiceStatuses(Enum):
|
||||
"""Status codes corresponding to 'pv_status_t' defined in 'include/picovoice.h'"""
|
||||
@@ -29,11 +24,17 @@ class Porcupine(object):
|
||||
OUT_OF_MEMORY = 1
|
||||
IO_ERROR = 2
|
||||
INVALID_ARGUMENT = 3
|
||||
STOP_ITERATION = 4
|
||||
KEY_ERROR = 5
|
||||
INVALID_STATE = 6
|
||||
|
||||
_PICOVOICE_STATUS_TO_EXCEPTION = {
|
||||
PicovoiceStatuses.OUT_OF_MEMORY: MemoryError,
|
||||
PicovoiceStatuses.IO_ERROR: IOError,
|
||||
PicovoiceStatuses.INVALID_ARGUMENT: ValueError
|
||||
PicovoiceStatuses.INVALID_ARGUMENT: ValueError,
|
||||
PicovoiceStatuses.STOP_ITERATION: StopIteration,
|
||||
PicovoiceStatuses.KEY_ERROR: KeyError,
|
||||
PicovoiceStatuses.INVALID_STATE: ValueError,
|
||||
}
|
||||
|
||||
class CPorcupine(Structure):
|
||||
@@ -48,9 +49,9 @@ class Porcupine(object):
|
||||
keyword_file_paths=None,
|
||||
sensitivities=None):
|
||||
"""
|
||||
Loads Porcupine's shared library and creates an instance of wake word detection object.
|
||||
Constructor.
|
||||
|
||||
:param library_path: Absolute path to Porcupine's shared library.
|
||||
:param library_path: Absolute path to Porcupine's dynamic library.
|
||||
:param model_file_path: Absolute path to file containing model parameters.
|
||||
:param keyword_file_path: Absolute path to keyword file containing hyper-parameters. If not present then
|
||||
'keyword_file_paths' will be used.
|
||||
@@ -64,38 +65,38 @@ class Porcupine(object):
|
||||
"""
|
||||
|
||||
if not os.path.exists(library_path):
|
||||
raise IOError(f"Could not find Porcupine's library at '{library_path}'")
|
||||
raise IOError("could'nt find Porcupine's library at '%s'" % library_path)
|
||||
|
||||
library = cdll.LoadLibrary(library_path)
|
||||
|
||||
if not os.path.exists(model_file_path):
|
||||
raise IOError(f"Could not find model file at '{model_file_path}'")
|
||||
raise IOError("could'nt find model file at '%s'" % model_file_path)
|
||||
|
||||
if sensitivity is not None and keyword_file_path is not None:
|
||||
if not os.path.exists(keyword_file_path):
|
||||
raise IOError(f"Could not find keyword file at '{keyword_file_path}'")
|
||||
raise IOError("could'nt' find keyword file at '%s'" % keyword_file_path)
|
||||
keyword_file_paths = [keyword_file_path]
|
||||
|
||||
if not (0 <= sensitivity <= 1):
|
||||
raise ValueError('Sensitivity should be within [0, 1]')
|
||||
raise ValueError('sensitivity should be within [0, 1]')
|
||||
sensitivities = [sensitivity]
|
||||
elif sensitivities is not None and keyword_file_paths is not None:
|
||||
if len(keyword_file_paths) != len(sensitivities):
|
||||
raise ValueError("Different number of sensitivity and keyword file path parameters are provided.")
|
||||
raise ValueError("different number of sensitivity and keyword file path parameters are provided.")
|
||||
|
||||
for x in keyword_file_paths:
|
||||
if not os.path.exists(os.path.expanduser(x)):
|
||||
raise IOError(f"Could not find keyword file at '{x}'")
|
||||
raise IOError("could not find keyword file at '%s'" % x)
|
||||
|
||||
for x in sensitivities:
|
||||
if not (0 <= x <= 1):
|
||||
raise ValueError('Sensitivity should be within [0, 1]')
|
||||
raise ValueError('sensitivity should be within [0, 1]')
|
||||
else:
|
||||
raise ValueError("Sensitivity and/or keyword file path is missing")
|
||||
raise ValueError("sensitivity and/or keyword file path is missing")
|
||||
|
||||
self._num_keywords = len(keyword_file_paths)
|
||||
|
||||
init_func = library.pv_porcupine_multiple_keywords_init
|
||||
init_func = library.pv_porcupine_init
|
||||
init_func.argtypes = [
|
||||
c_char_p,
|
||||
c_int,
|
||||
@@ -107,44 +108,43 @@ class Porcupine(object):
|
||||
self._handle = POINTER(self.CPorcupine)()
|
||||
|
||||
status = init_func(
|
||||
model_file_path.encode(),
|
||||
model_file_path.encode('utf-8'),
|
||||
self._num_keywords,
|
||||
(c_char_p * self._num_keywords)(*[os.path.expanduser(x).encode() for x in keyword_file_paths]),
|
||||
(c_char_p * self._num_keywords)(*[os.path.expanduser(x).encode('utf-8') for x in keyword_file_paths]),
|
||||
(c_float * self._num_keywords)(*sensitivities),
|
||||
byref(self._handle))
|
||||
if status is not self.PicovoiceStatuses.SUCCESS:
|
||||
raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]('Initialization failed')
|
||||
|
||||
self.process_func = library.pv_porcupine_multiple_keywords_process
|
||||
self.process_func.argtypes = [POINTER(self.CPorcupine), POINTER(c_short), POINTER(c_int)]
|
||||
self.process_func.restype = self.PicovoiceStatuses
|
||||
raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]('initialization failed')
|
||||
|
||||
self._delete_func = library.pv_porcupine_delete
|
||||
self._delete_func.argtypes = [POINTER(self.CPorcupine)]
|
||||
self._delete_func.restype = None
|
||||
|
||||
self._sample_rate = library.pv_sample_rate()
|
||||
self.process_func = library.pv_porcupine_process
|
||||
self.process_func.argtypes = [POINTER(self.CPorcupine), POINTER(c_short), POINTER(c_int)]
|
||||
self.process_func.restype = self.PicovoiceStatuses
|
||||
|
||||
version_func = library.pv_porcupine_version
|
||||
version_func.argtypes = []
|
||||
version_func.restype = c_char_p
|
||||
self._version = version_func().decode('utf-8')
|
||||
|
||||
self._frame_length = library.pv_porcupine_frame_length()
|
||||
|
||||
@property
|
||||
def sample_rate(self):
|
||||
"""Audio sample rate accepted by Porcupine library."""
|
||||
self._sample_rate = library.pv_sample_rate()
|
||||
|
||||
return self._sample_rate
|
||||
def delete(self):
|
||||
"""Releases resources acquired by Porcupine's library."""
|
||||
|
||||
@property
|
||||
def frame_length(self):
|
||||
"""Number of audio samples per frame expected by C library."""
|
||||
|
||||
return self._frame_length
|
||||
self._delete_func(self._handle)
|
||||
|
||||
def process(self, pcm):
|
||||
"""
|
||||
Monitors incoming audio stream for given wake word(s).
|
||||
Processes a frame of the incoming audio stream and emits the detection result.
|
||||
|
||||
:param pcm: An array (or array-like) of consecutive audio samples. For more information regarding required audio
|
||||
properties (i.e. sample rate, number of channels encoding, and number of samples per frame) please refer to
|
||||
'include/pv_porcupine.h'.
|
||||
:param pcm: A frame of audio samples. The number of samples per frame can be attained by calling
|
||||
'.frame_length'. The incoming audio needs to have a sample rate equal to '.sample_rate' and be 16-bit
|
||||
linearly-encoded. Porcupine operates on single-channel audio.
|
||||
:return: For a single wake-word use case, True if the wake word is detected. For multiple wake-word use case it
|
||||
returns the index of detected wake-word. Indexing is 0-based and according to ordering of input keyword file
|
||||
paths. It returns -1 when no keyword is detected.
|
||||
@@ -153,7 +153,7 @@ class Porcupine(object):
|
||||
result = c_int()
|
||||
status = self.process_func(self._handle, (c_short * len(pcm))(*pcm), byref(result))
|
||||
if status is not self.PicovoiceStatuses.SUCCESS:
|
||||
raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]('Processing failed')
|
||||
raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]()
|
||||
|
||||
keyword_index = result.value
|
||||
|
||||
@@ -162,7 +162,20 @@ class Porcupine(object):
|
||||
else:
|
||||
return keyword_index
|
||||
|
||||
def delete(self):
|
||||
"""Releases resources acquired by Porcupine's library."""
|
||||
@property
|
||||
def version(self):
|
||||
"""Getter for version"""
|
||||
|
||||
self._delete_func(self._handle)
|
||||
return self._version
|
||||
|
||||
@property
|
||||
def frame_length(self):
|
||||
"""Getter for number of audio samples per frame."""
|
||||
|
||||
return self._frame_length
|
||||
|
||||
@property
|
||||
def sample_rate(self):
|
||||
"""Audio sample rate accepted by Picovoice."""
|
||||
|
||||
return self._sample_rate
|
||||
|
||||
@@ -324,31 +324,31 @@
|
||||
"cache": false
|
||||
},
|
||||
"porcupine_params.pv": {
|
||||
"url": "https://github.com/Picovoice/Porcupine/raw/master/lib/common/porcupine_params.pv",
|
||||
"url": "https://github.com/Picovoice/porcupine/raw/v1.7/lib/common/porcupine_params.pv",
|
||||
"cache": false
|
||||
},
|
||||
"porcupine.ppn": {
|
||||
"cache": false,
|
||||
"x86_64": {
|
||||
"url": "https://github.com/Picovoice/Porcupine/raw/master/resources/keyword_files/linux/porcupine_linux.ppn"
|
||||
"url": "https://github.com/Picovoice/Porcupine/raw/v1.7/resources/keyword_files/linux/porcupine_linux.ppn"
|
||||
},
|
||||
"armv7l": {
|
||||
"url": "https://github.com/Picovoice/Porcupine/raw/master/resources/keyword_files/raspberrypi/porcupine_raspberrypi.ppn"
|
||||
"url": "https://github.com/Picovoice/porcupine/raw/v1.7/resources/keyword_files/raspberry-pi/porcupine_raspberry-pi.ppn"
|
||||
},
|
||||
"aarch64": {
|
||||
"url": "https://github.com/Picovoice/Porcupine/raw/master/resources/keyword_files/raspberrypi/porcupine_raspberrypi.ppn"
|
||||
"url": "https://github.com/Picovoice/porcupine/raw/v1.7/resources/keyword_files/raspberry-pi/porcupine_raspberry-pi.ppn"
|
||||
}
|
||||
},
|
||||
"libpv_porcupine.so": {
|
||||
"cache": false,
|
||||
"x86_64": {
|
||||
"url": "https://github.com/Picovoice/Porcupine/raw/master/lib/linux/x86_64/libpv_porcupine.so"
|
||||
"url": "https://github.com/Picovoice/porcupine/raw/v1.7/lib/linux/x86_64/libpv_porcupine.so"
|
||||
},
|
||||
"armv7l": {
|
||||
"url": "https://github.com/Picovoice/Porcupine/raw/master/lib/raspberry-pi/cortex-a53/libpv_porcupine.so"
|
||||
"url": "https://github.com/Picovoice/porcupine/raw/v1.7/lib/raspberry-pi/cortex-a53/libpv_porcupine.so"
|
||||
},
|
||||
"aarch64": {
|
||||
"url": "https://github.com/Picovoice/Porcupine/raw/master/lib/raspberry-pi/cortex-a53/libpv_porcupine.so"
|
||||
"url": "https://github.com/Picovoice/porcupine/raw/v1.7/lib/raspberry-pi/cortex-a53/libpv_porcupine.so"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ aiohttp==3.6.2
|
||||
doit==0.31.1
|
||||
fuzzywuzzy[speedup]==0.17.0
|
||||
google-cloud-texttospeech==0.5.0
|
||||
google-cloud-speech==1.3.1
|
||||
html5lib==1.0.1
|
||||
json5==0.7.0
|
||||
multidict==4.6.1
|
||||
|
||||
@@ -389,6 +389,7 @@ class RasaIntentRecognizer(RhasspyActor):
|
||||
RhasspyActor.__init__(self)
|
||||
self.project_name = ""
|
||||
self.parse_url = ""
|
||||
self.min_confidence: float = 0
|
||||
|
||||
def to_started(self, from_state: str) -> None:
|
||||
"""Transition to started state."""
|
||||
@@ -397,6 +398,7 @@ class RasaIntentRecognizer(RhasspyActor):
|
||||
self.project_name = rasa_config.get(
|
||||
"project_name", f"rhasspy_{self.profile.name}"
|
||||
)
|
||||
self.min_confidence = rasa_config.get("min_confidence", 0)
|
||||
self.parse_url = urljoin(url, "model/parse")
|
||||
|
||||
def in_started(self, message: Any, sender: RhasspyActor) -> None:
|
||||
@@ -406,6 +408,15 @@ class RasaIntentRecognizer(RhasspyActor):
|
||||
intent = self.recognize(message.text)
|
||||
intent["intent"]["name"] = intent["intent"]["name"] or ""
|
||||
logging.debug(repr(intent))
|
||||
confidence = intent["intent"]["confidence"]
|
||||
if confidence < self.min_confidence:
|
||||
intent["intent"]["name"] = ""
|
||||
|
||||
self._logger.warning(
|
||||
"Intent did not meet confidence threshold: %s < %s",
|
||||
confidence,
|
||||
self.min_confidence,
|
||||
)
|
||||
except Exception:
|
||||
self._logger.exception("in_started")
|
||||
intent = empty_intent()
|
||||
|
||||
@@ -10,6 +10,9 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Type
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
from google.cloud import speech
|
||||
from google.cloud.speech import enums
|
||||
from google.cloud.speech import types
|
||||
|
||||
from rhasspy.actor import RhasspyActor
|
||||
from rhasspy.events import TranscribeWav, WavTranscription
|
||||
@@ -25,6 +28,7 @@ def get_decoder_class(system: str) -> Type[RhasspyActor]:
|
||||
"pocketsphinx",
|
||||
"kaldi",
|
||||
"remote",
|
||||
"google",
|
||||
"hass_stt",
|
||||
"command",
|
||||
], f"Invalid speech to text system: {system}"
|
||||
@@ -38,6 +42,9 @@ def get_decoder_class(system: str) -> Type[RhasspyActor]:
|
||||
if system == "remote":
|
||||
# Use remote Rhasspy server
|
||||
return RemoteDecoder
|
||||
if system == "google":
|
||||
# Use remote Google Cloud
|
||||
return GoogleCloudDecoder
|
||||
if system == "hass_stt":
|
||||
# Use Home Assistant STT platform
|
||||
return HomeAssistantSTTIntegration
|
||||
@@ -320,6 +327,84 @@ class RemoteDecoder(RhasspyActor):
|
||||
return response.text
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Google Cloud Speech-to-text decoder
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class GoogleCloudDecoder(RhasspyActor):
    """Forwards speech to text requests to the Google Cloud STT service."""

    def __init__(self) -> None:
        RhasspyActor.__init__(self)
        # Speech client; created in to_started
        self.client = None
        # Profile locale with "_" replaced by "-"; set in to_started
        self.language_code = None
        # Transcriptions whose confidence is below this value are discarded
        self.min_confidence: float = 0

    def to_started(self, from_state: str) -> None:
        """Transition to started state.

        Reads the Google settings from the profile, publishes the credentials
        file through the google.auth credentials environment variable and
        creates the speech client.
        """
        credentials_file = self.profile.get("speech_to_text.google.credentials")

        # Guard against a missing setting: a None threshold would make the
        # confidence comparison in transcribe_wav raise TypeError.
        min_confidence = self.profile.get("speech_to_text.google.min_confidence")
        self.min_confidence = 0 if min_confidence is None else min_confidence

        self.language_code = self.profile.get("locale").replace("_", "-")

        from google.auth import environment_vars

        os.environ[environment_vars.CREDENTIALS] = credentials_file
        self.client = speech.SpeechClient()

    def in_started(self, message: Any, sender: RhasspyActor) -> None:
        """Handle messages in started state.

        A TranscribeWav message is answered with a WavTranscription sent to
        message.receiver (or the sender when no receiver is set). Any failure
        is logged and answered with an empty transcription.
        """
        if isinstance(message, TranscribeWav):
            try:
                text, confidence = self.transcribe_wav(message.wav_data)
                self._logger.debug(text)
                self.send(
                    message.receiver or sender,
                    WavTranscription(
                        text, confidence=confidence, handle=message.handle
                    ),
                )
            except Exception:
                self._logger.exception("transcribing wav")

                # Send empty transcription back
                self.send(
                    message.receiver or sender,
                    WavTranscription("", confidence=0, handle=message.handle),
                )

    def transcribe_wav(self, wav_data: bytes) -> Tuple[str, float]:
        """Send WAV data to Google Cloud STT and return (text, confidence).

        Returns ("", 0) when the service returns no usable result or when the
        first alternative's confidence is below min_confidence.
        """
        self._logger.debug(
            "POSTing %d byte(s) of WAV data to Google Cloud STT", len(wav_data)
        )

        audio = types.RecognitionAudio(content=wav_data)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            model="command_and_search",
            language_code=self.language_code,
        )

        response = self.client.recognize(config, audio)

        # Treat a result without alternatives like no result at all instead
        # of failing with IndexError below.
        if not response.results or not response.results[0].alternatives:
            self._logger.debug("No results returned.")
            return "", 0

        result = response.results[0].alternatives[0]

        self._logger.debug("Transcription confidence: %s", result.confidence)
        if result.confidence >= self.min_confidence:
            return result.transcript, result.confidence

        self._logger.warning(
            "Transcription did not meet confidence threshold: %s < %s",
            result.confidence,
            self.min_confidence,
        )

        return "", 0
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Kaldi Decoder
|
||||
# http://kaldi-asr.org
|
||||
|
||||
Reference in New Issue
Block a user