Merge branch 'master' of https://github.com/synesthesiam/rhasspy

2020-03-04 11:48:53 -05:00
parent d770679373 86e695a7a4
commit 33b847b828
6 changed files with 189 additions and 55 deletions
@@ -8,6 +8,7 @@ The following table summarizes language support for the various speech to text s
 | ------                                         | -------  | -------  | -------  | -------  | -------  | -------  | -------  | -------  | -------  | -------  | -------  | -------  | -------  |
 | [pocketsphinx](speech-to-text.md#pocketsphinx) | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; |          | &#x2713; | &#x2713; |
 | [kaldi](speech-to-text.md#kaldi)               | &#x2713; | &#x2713; |          | &#x2713; |          | &#x2713; |          |          |          |          | &#x2713; |          |          |
+| [google](speech-to-text.md#google-cloud)       | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; | &#x2713; |

 ## Pocketsphinx

@@ -77,6 +78,29 @@ Rhasspy expects a Kaldi-compatible profile to contain a `model` directory with a

 If you just want to use Rhasspy for general speech to text, you can set `speech_to_text.kaldi.open_transcription` to `true` in your profile. This will use the included general language model (much slower) and ignore any custom voice commands you've specified.

+## Google Cloud
+
+Does speech recognition using the [Google Cloud Speech-to-Text](https://cloud.google.com/speech-to-text) service.
+You will need an active Google Cloud subscription and a JSON private key for a service account that is enabled to use
+the Speech-to-Text API. The locale configured in your profile will be used for speech recognition.
+
+```json
+{
+  "locale": "en_US",
+  "speech_to_text": {
+    "system": "google",
+    "google": {
+      "credentials": "api-project-xxxxxxxx-abcdef.json",
+      "min_confidence": 0.7
+    }
+  }
+}
+```
+
+Please note that this module sends the recorded audio only after recording has completed, so streaming is not supported.
+
+See `rhasspy.stt.GoogleCloudDecoder` for details.
+
 ## Remote HTTP Server

 Uses a remote HTTP server to transform speech (WAV) to text.
@@ -1,17 +1,12 @@
 #
 # Copyright 2018 Picovoice Inc.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE"
+# file accompanying this source.
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
 #

 import os
@@ -20,7 +15,7 @@ from enum import Enum


 class Porcupine(object):
-    """Python binding for Picovoice's wake word detection (aka Porcupine) library."""
+    """Python binding for Picovoice's wake word detection (Porcupine) engine."""

    class PicovoiceStatuses(Enum):
        """Status codes corresponding to 'pv_status_t' defined in 'include/picovoice.h'"""
@@ -29,11 +24,17 @@ class Porcupine(object):
        OUT_OF_MEMORY = 1
        IO_ERROR = 2
        INVALID_ARGUMENT = 3
+        STOP_ITERATION = 4
+        KEY_ERROR = 5
+        INVALID_STATE = 6

    _PICOVOICE_STATUS_TO_EXCEPTION = {
        PicovoiceStatuses.OUT_OF_MEMORY: MemoryError,
        PicovoiceStatuses.IO_ERROR: IOError,
-        PicovoiceStatuses.INVALID_ARGUMENT: ValueError
+        PicovoiceStatuses.INVALID_ARGUMENT: ValueError,
+        PicovoiceStatuses.STOP_ITERATION: StopIteration,
+        PicovoiceStatuses.KEY_ERROR: KeyError,
+        PicovoiceStatuses.INVALID_STATE: ValueError,
    }

    class CPorcupine(Structure):
@@ -48,9 +49,9 @@ class Porcupine(object):
            keyword_file_paths=None,
            sensitivities=None):
        """
-        Loads Porcupine's shared library and creates an instance of wake word detection object.
+        Constructor.

-        :param library_path: Absolute path to Porcupine's shared library.
+        :param library_path: Absolute path to Porcupine's dynamic library.
        :param model_file_path: Absolute path to file containing model parameters.
        :param keyword_file_path: Absolute path to keyword file containing hyper-parameters. If not present then
        'keyword_file_paths' will be used.
@@ -64,38 +65,38 @@ class Porcupine(object):
        """

        if not os.path.exists(library_path):
-            raise IOError(f"Could not find Porcupine's library at '{library_path}'")
+            raise IOError("could'nt find Porcupine's library at '%s'" % library_path)

        library = cdll.LoadLibrary(library_path)

        if not os.path.exists(model_file_path):
-            raise IOError(f"Could not find model file at '{model_file_path}'")
+            raise IOError("could'nt find model file at '%s'" % model_file_path)

        if sensitivity is not None and keyword_file_path is not None:
            if not os.path.exists(keyword_file_path):
-                raise IOError(f"Could not find keyword file at '{keyword_file_path}'")
+                raise IOError("could'nt' find keyword file at '%s'" % keyword_file_path)
            keyword_file_paths = [keyword_file_path]

            if not (0 <= sensitivity <= 1):
-                raise ValueError('Sensitivity should be within [0, 1]')
+                raise ValueError('sensitivity should be within [0, 1]')
            sensitivities = [sensitivity]
        elif sensitivities is not None and keyword_file_paths is not None:
            if len(keyword_file_paths) != len(sensitivities):
-                raise ValueError("Different number of sensitivity and keyword file path parameters are provided.")
+                raise ValueError("different number of sensitivity and keyword file path parameters are provided.")

            for x in keyword_file_paths:
                if not os.path.exists(os.path.expanduser(x)):
-                    raise IOError(f"Could not find keyword file at '{x}'")
+                    raise IOError("could not find keyword file at '%s'" % x)

            for x in sensitivities:
                if not (0 <= x <= 1):
-                    raise ValueError('Sensitivity should be within [0, 1]')
+                    raise ValueError('sensitivity should be within [0, 1]')
        else:
-            raise ValueError("Sensitivity and/or keyword file path is missing")
+            raise ValueError("sensitivity and/or keyword file path is missing")

        self._num_keywords = len(keyword_file_paths)

-        init_func = library.pv_porcupine_multiple_keywords_init
+        init_func = library.pv_porcupine_init
        init_func.argtypes = [
            c_char_p,
            c_int,
@@ -107,44 +108,43 @@ class Porcupine(object):
        self._handle = POINTER(self.CPorcupine)()

        status = init_func(
-            model_file_path.encode(),
+            model_file_path.encode('utf-8'),
            self._num_keywords,
-            (c_char_p * self._num_keywords)(*[os.path.expanduser(x).encode() for x in keyword_file_paths]),
+            (c_char_p * self._num_keywords)(*[os.path.expanduser(x).encode('utf-8') for x in keyword_file_paths]),
            (c_float * self._num_keywords)(*sensitivities),
            byref(self._handle))
        if status is not self.PicovoiceStatuses.SUCCESS:
-            raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]('Initialization failed')
-
-        self.process_func = library.pv_porcupine_multiple_keywords_process
-        self.process_func.argtypes = [POINTER(self.CPorcupine), POINTER(c_short), POINTER(c_int)]
-        self.process_func.restype = self.PicovoiceStatuses
+            raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]('initialization failed')

        self._delete_func = library.pv_porcupine_delete
        self._delete_func.argtypes = [POINTER(self.CPorcupine)]
        self._delete_func.restype = None

-        self._sample_rate = library.pv_sample_rate()
+        self.process_func = library.pv_porcupine_process
+        self.process_func.argtypes = [POINTER(self.CPorcupine), POINTER(c_short), POINTER(c_int)]
+        self.process_func.restype = self.PicovoiceStatuses
+
+        version_func = library.pv_porcupine_version
+        version_func.argtypes = []
+        version_func.restype = c_char_p
+        self._version = version_func().decode('utf-8')
+
        self._frame_length = library.pv_porcupine_frame_length()

-    @property
-    def sample_rate(self):
-        """Audio sample rate accepted by Porcupine library."""
+        self._sample_rate = library.pv_sample_rate()

-        return self._sample_rate
+    def delete(self):
+        """Releases resources acquired by Porcupine's library."""

-    @property
-    def frame_length(self):
-        """Number of audio samples per frame expected by C library."""
-
-        return self._frame_length
+        self._delete_func(self._handle)

    def process(self, pcm):
        """
-        Monitors incoming audio stream for given wake word(s).
+        Processes a frame of the incoming audio stream and emits the detection result.

-        :param pcm: An array (or array-like) of consecutive audio samples. For more information regarding required audio
-        properties (i.e. sample rate, number of channels encoding, and number of samples per frame) please refer to
-        'include/pv_porcupine.h'.
+        :param pcm: A frame of audio samples. The number of samples per frame can be attained by calling
+        '.frame_length'. The incoming audio needs to have a sample rate equal to '.sample_rate' and be 16-bit
+        linearly-encoded. Porcupine operates on single-channel audio.
        :return: For a single wake-word use cse True if wake word is detected. For multiple wake-word use case it
        returns the index of detected wake-word. Indexing is 0-based and according to ordering of input keyword file
        paths. It returns -1 when no keyword is detected.
@@ -153,7 +153,7 @@ class Porcupine(object):
        result = c_int()
        status = self.process_func(self._handle, (c_short * len(pcm))(*pcm), byref(result))
        if status is not self.PicovoiceStatuses.SUCCESS:
-            raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]('Processing failed')
+            raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]()

        keyword_index = result.value

@@ -162,7 +162,20 @@ class Porcupine(object):
        else:
            return keyword_index

-    def delete(self):
-        """Releases resources acquired by Porcupine's library."""
+    @property
+    def version(self):
+        """Getter for version"""

-        self._delete_func(self._handle)
+        return self._version
+
+    @property
+    def frame_length(self):
+        """Getter for number of audio samples per frame."""
+
+        return self._frame_length
+
+    @property
+    def sample_rate(self):
+        """Audio sample rate accepted by Picovoice."""
+
+        return self._sample_rate
@@ -324,31 +324,31 @@
        "cache": false
      },
      "porcupine_params.pv": {
-        "url": "https://github.com/Picovoice/Porcupine/raw/master/lib/common/porcupine_params.pv",
+        "url": "https://github.com/Picovoice/porcupine/raw/v1.7/lib/common/porcupine_params.pv",
        "cache": false
      },
      "porcupine.ppn": {
        "cache": false,
        "x86_64": {
-          "url": "https://github.com/Picovoice/Porcupine/raw/master/resources/keyword_files/linux/porcupine_linux.ppn"
+          "url": "https://github.com/Picovoice/Porcupine/raw/v1.7/resources/keyword_files/linux/porcupine_linux.ppn"
        },
        "armv7l": {
-          "url": "https://github.com/Picovoice/Porcupine/raw/master/resources/keyword_files/raspberrypi/porcupine_raspberrypi.ppn"
+          "url": "https://github.com/Picovoice/porcupine/raw/v1.7/resources/keyword_files/raspberry-pi/porcupine_raspberry-pi.ppn"
        },
        "aarch64": {
-          "url": "https://github.com/Picovoice/Porcupine/raw/master/resources/keyword_files/raspberrypi/porcupine_raspberrypi.ppn"
+          "url": "https://github.com/Picovoice/porcupine/raw/v1.7/resources/keyword_files/raspberry-pi/porcupine_raspberry-pi.ppn"
        }
      },
      "libpv_porcupine.so": {
        "cache": false,
        "x86_64": {
-          "url": "https://github.com/Picovoice/Porcupine/raw/master/lib/linux/x86_64/libpv_porcupine.so"
+          "url": "https://github.com/Picovoice/porcupine/raw/v1.7/lib/linux/x86_64/libpv_porcupine.so"
        },
        "armv7l": {
-          "url": "https://github.com/Picovoice/Porcupine/raw/master/lib/raspberry-pi/cortex-a53/libpv_porcupine.so"
+          "url": "https://github.com/Picovoice/porcupine/raw/v1.7/lib/raspberry-pi/cortex-a53/libpv_porcupine.so"
        },
        "aarch64": {
-          "url": "https://github.com/Picovoice/Porcupine/raw/master/lib/raspberry-pi/cortex-a53/libpv_porcupine.so"
+          "url": "https://github.com/Picovoice/porcupine/raw/v1.7/lib/raspberry-pi/cortex-a53/libpv_porcupine.so"
        }
      }
    }
@@ -3,6 +3,7 @@ aiohttp==3.6.2
 doit==0.31.1
 fuzzywuzzy[speedup]==0.17.0
 google-cloud-texttospeech==0.5.0
+google-cloud-speech==1.3.1
 html5lib==1.0.1
 json5==0.7.0
 multidict==4.6.1
@@ -389,6 +389,7 @@ class RasaIntentRecognizer(RhasspyActor):
        RhasspyActor.__init__(self)
        self.project_name = ""
        self.parse_url = ""
+        self.min_confidence: float = 0

    def to_started(self, from_state: str) -> None:
        """Transition to started state."""
@@ -397,6 +398,7 @@ class RasaIntentRecognizer(RhasspyActor):
        self.project_name = rasa_config.get(
            "project_name", f"rhasspy_{self.profile.name}"
        )
+        self.min_confidence = rasa_config.get("min_confidence", 0)
        self.parse_url = urljoin(url, "model/parse")

    def in_started(self, message: Any, sender: RhasspyActor) -> None:
@@ -406,6 +408,15 @@ class RasaIntentRecognizer(RhasspyActor):
                intent = self.recognize(message.text)
                intent["intent"]["name"] = intent["intent"]["name"] or ""
                logging.debug(repr(intent))
+                confidence = intent["intent"]["confidence"]
+                if confidence < self.min_confidence:
+                    intent["intent"]["name"] = ""
+
+                    self._logger.warning(
+                        "Intent did not meet confidence threshold: %s < %s",
+                        confidence,
+                        self.min_confidence,
+                    )
            except Exception:
                self._logger.exception("in_started")
                intent = empty_intent()
@@ -10,6 +10,9 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Type
 from urllib.parse import urljoin

 import requests
+from google.cloud import speech
+from google.cloud.speech import enums
+from google.cloud.speech import types

 from rhasspy.actor import RhasspyActor
 from rhasspy.events import TranscribeWav, WavTranscription
@@ -25,6 +28,7 @@ def get_decoder_class(system: str) -> Type[RhasspyActor]:
        "pocketsphinx",
        "kaldi",
        "remote",
+        "google",
        "hass_stt",
        "command",
    ], f"Invalid speech to text system: {system}"
@@ -38,6 +42,9 @@ def get_decoder_class(system: str) -> Type[RhasspyActor]:
    if system == "remote":
        # Use remote Rhasspy server
        return RemoteDecoder
+    if system == "google":
+        # Use remote Google Cloud
+        return GoogleCloudDecoder
    if system == "hass_stt":
        # Use Home Assistant STT platform
        return HomeAssistantSTTIntegration
@@ -320,6 +327,84 @@ class RemoteDecoder(RhasspyActor):
        return response.text


+# -----------------------------------------------------------------------------
+# Google Cloud Speech-to-text decoder
+# -----------------------------------------------------------------------------
+
+
+class GoogleCloudDecoder(RhasspyActor):
+    """Forwards speech to text requests to the Google Cloud STT service.
+
+    Reads ``speech_to_text.google.credentials`` (JSON key file),
+    ``speech_to_text.google.min_confidence`` and ``locale`` from the profile.
+    Each WAV is sent whole in a single recognize call (no streaming).
+    """
+
+    def __init__(self) -> None:
+        RhasspyActor.__init__(self)
+        self.client: Optional[speech.SpeechClient] = None
+        self.language_code: Optional[str] = None
+        self.min_confidence: float = 0
+
+    def to_started(self, from_state: str) -> None:
+        """Read the Google settings from the profile and create the client."""
+        credentials_file = self.profile.get("speech_to_text.google.credentials")
+        # A missing threshold comes back as None, which would make every
+        # confidence comparison raise TypeError; treat it as 0 instead.
+        self.min_confidence = float(
+            self.profile.get("speech_to_text.google.min_confidence") or 0
+        )
+        # Turn a locale such as "en_US" into the "en-US" form.
+        self.language_code = self.profile.get("locale").replace("_", "-")
+        from google.auth import environment_vars
+
+        # SpeechClient() gets no explicit credentials; the key file path is
+        # handed over through google.auth's credentials environment variable.
+        os.environ[environment_vars.CREDENTIALS] = credentials_file
+        self.client = speech.SpeechClient()
+
+    def in_started(self, message: Any, sender: RhasspyActor) -> None:
+        """Transcribe ``TranscribeWav`` messages; reply with ``WavTranscription``.
+
+        Any failure is logged and answered with an empty transcription so the
+        requester always receives a reply.
+        """
+        if isinstance(message, TranscribeWav):
+            try:
+                text, confidence = self.transcribe_wav(message.wav_data)
+                self._logger.debug(text)
+                self.send(
+                    message.receiver or sender,
+                    WavTranscription(
+                        text, confidence=confidence, handle=message.handle
+                    ),
+                )
+            except Exception:
+                self._logger.exception("transcribing wav")
+
+                # Send empty transcription back
+                self.send(
+                    message.receiver or sender,
+                    WavTranscription("", confidence=0, handle=message.handle),
+                )
+
+    def transcribe_wav(self, wav_data: bytes) -> Tuple[str, float]:
+        """Send WAV data to Google Cloud STT and return (text, confidence).
+
+        Returns ("", 0) when nothing is recognized or the confidence of the
+        first alternative is below ``self.min_confidence``.
+        """
+        self._logger.debug(
+            "POSTing %d byte(s) of WAV data to Google Cloud STT", len(wav_data)
+        )
+
+        audio = types.RecognitionAudio(content=wav_data)
+        config = types.RecognitionConfig(
+            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
+            sample_rate_hertz=16000,
+            model="command_and_search",
+            language_code=self.language_code,
+        )
+
+        response = self.client.recognize(config, audio)
+        # Guard both levels so an empty alternatives list cannot raise IndexError.
+        if not response.results or not response.results[0].alternatives:
+            self._logger.debug("No results returned.")
+            return "", 0
+
+        # Only the first alternative of the first result is considered.
+        result = response.results[0].alternatives[0]
+
+        self._logger.debug("Transcription confidence: %s", result.confidence)
+        if result.confidence >= self.min_confidence:
+            return result.transcript, result.confidence
+
+        self._logger.warning(
+            "Transcription did not meet confidence threshold: %s < %s",
+            result.confidence,
+            self.min_confidence,
+        )
+
+        return "", 0
+
+
 # -----------------------------------------------------------------------------
 # Kaldi Decoder
 # http://kaldi-asr.org