Replacing numbers in sentences/slots. Using rhasspy-nlu 0.1.1
This commit is contained in:
@@ -41,7 +41,7 @@ from rhasspy.utils import (
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.root.setLevel(logging.DEBUG)
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
|
||||
+1
-38
@@ -22,41 +22,6 @@ from rhasspy.wake import WakeWordDetected
|
||||
|
||||
logger = logging.getLogger("rhasspy")
|
||||
|
||||
|
||||
try:
|
||||
# Need to import here because they screw with logging
|
||||
import flair # noqa: F401
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
logging.config.dictConfig(
|
||||
{
|
||||
"version": 1,
|
||||
"disable_existing_loggers": True,
|
||||
"formatters": {
|
||||
"rhasspy.format": {"format": "%(levelname)s:%(name)s:%(message)s"}
|
||||
},
|
||||
"handlers": {
|
||||
"rhasspy.handler": {
|
||||
"class": "logging.StreamHandler",
|
||||
"formatter": "rhasspy.format",
|
||||
"stream": "ext://sys.stderr",
|
||||
}
|
||||
},
|
||||
"loggers": {
|
||||
"rhasspy": {"handlers": ["rhasspy.handler"], "propagate": False},
|
||||
"flair": {
|
||||
"handlers": ["rhasspy.handler"],
|
||||
"level": "INFO",
|
||||
"propagate": False,
|
||||
},
|
||||
},
|
||||
"root": {"handlers": ["rhasspy.handler"]},
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Globals
|
||||
# -----------------------------------------------------------------------------
|
||||
@@ -247,9 +212,7 @@ async def main() -> None:
|
||||
)
|
||||
|
||||
# check
|
||||
sub_parsers.add_parser(
|
||||
"check", help="Check downloaded profile files"
|
||||
)
|
||||
sub_parsers.add_parser("check", help="Check downloaded profile files")
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
|
||||
+1
-1
@@ -223,7 +223,7 @@ class RhasspyCore:
|
||||
# Replace numbers
|
||||
if self.profile.get("intent.replace_numbers", True):
|
||||
language = self.profile.get("language", "")
|
||||
if len(language) == 0:
|
||||
if not language:
|
||||
language = None
|
||||
|
||||
# 75 -> seventy five
|
||||
|
||||
@@ -224,75 +224,6 @@ class FsticuffsRecognizer(RhasspyActor):
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
# def recognize(self, text: str) -> Dict[str, Any]:
|
||||
# """Use FST as acceptor."""
|
||||
# from rhasspy.train.jsgf2fst import fstaccept
|
||||
|
||||
# # Assume lower case, white-space separated tokens
|
||||
# tokens = re.split(r"\s+", text.lower())
|
||||
|
||||
# if self.profile.get("intent.fsticuffs.ignore_unknown_words", True):
|
||||
# tokens = [w for w in tokens if w in self.words]
|
||||
|
||||
# intents = fstaccept(self.fst, tokens)
|
||||
# self._logger.debug("Got %s intent(s)", len(intents))
|
||||
|
||||
# if len(intents) > 0:
|
||||
# self._logger.debug(intents)
|
||||
|
||||
# return intents[0]
|
||||
|
||||
# def recognize_fuzzy(self, text: str, eps: str = "<eps>") -> Dict[str, Any]:
|
||||
# """Do fuzzy breadth-first search on FST as graph."""
|
||||
# from rhasspy.train.jsgf2fst import symbols2intent
|
||||
|
||||
# # Assume lower case, white-space separated tokens
|
||||
# tokens = re.split(r"\s+", text)
|
||||
|
||||
# if self.profile.get("intent.fsticuffs.ignore_unknown_words", True):
|
||||
# # Filter tokens
|
||||
# tokens = [w for w in tokens if w in self.words]
|
||||
|
||||
# # Only run search if there are any tokens
|
||||
# intents = []
|
||||
# if len(tokens) > 0:
|
||||
# intent_symbols_and_costs = FsticuffsRecognizer._get_symbols_and_costs(
|
||||
# self.graph, tokens, stop_words=self.stop_words, eps=eps
|
||||
# )
|
||||
# for symbols, cost in intent_symbols_and_costs.values():
|
||||
# intent = symbols2intent(symbols, eps=eps)
|
||||
# intent["intent"]["confidence"] = (len(tokens) - cost) / len(tokens)
|
||||
# intents.append(intent)
|
||||
|
||||
# intents = sorted(
|
||||
# intents, key=lambda i: i["intent"]["confidence"], reverse=True
|
||||
# )
|
||||
|
||||
# self._logger.debug("Recognized %s intent(s)", len(intents))
|
||||
|
||||
# # Use first intent
|
||||
# if len(intents) > 0:
|
||||
# intent = intents[0]
|
||||
|
||||
# # Add slots
|
||||
# intent["slots"] = {}
|
||||
# for ev in intent["entities"]:
|
||||
# intent["slots"][ev["entity"]] = ev["value"]
|
||||
|
||||
# # Add alternative intents
|
||||
# intent["intents"] = []
|
||||
# for other_intent in intents[1:]:
|
||||
# intent["intents"].append(other_intent)
|
||||
|
||||
# self._logger.debug(intents)
|
||||
# else:
|
||||
# intent = empty_intent()
|
||||
# intent["text"] = text
|
||||
|
||||
# return intent
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def load_graph(self):
|
||||
"""Load intent graph from JSON file."""
|
||||
if self.graph is None:
|
||||
|
||||
+46
-184
@@ -3,17 +3,13 @@ import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import logging
|
||||
import tempfile
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Dict, Set, Iterable, Any, List, Tuple
|
||||
from collections import deque
|
||||
from typing import Iterable, List, Tuple
|
||||
|
||||
from num2words import num2words
|
||||
import pywrapfst as fst
|
||||
import networkx as nx
|
||||
import doit
|
||||
|
||||
from doit import create_after
|
||||
from doit.cmd_base import ModuleTaskLoader
|
||||
@@ -27,21 +23,14 @@ from rhasspynlu import (
|
||||
graph_to_json,
|
||||
json_to_graph,
|
||||
jsgf,
|
||||
ini_jsgf,
|
||||
)
|
||||
|
||||
# from rhasspy.train.jsgf2fst import (
|
||||
# get_grammar_dependencies,
|
||||
# grammar_to_fsts,
|
||||
# slots_to_fsts,
|
||||
# make_intent_fst,
|
||||
# )
|
||||
|
||||
# from rhasspy.train.ini_jsgf import make_grammars
|
||||
from rhasspy.train.vocab_dict import make_dict, FORMAT_CMU, FORMAT_JULIUS
|
||||
from rhasspy.profiles import Profile
|
||||
from rhasspy.utils import ppath as utils_ppath, read_dict
|
||||
|
||||
logger = logging.getLogger("train")
|
||||
_LOGGER = logging.getLogger("train")
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
@@ -52,6 +41,8 @@ def train_profile(profile_dir: Path, profile: Profile) -> Tuple[int, List[str]]:
|
||||
def ppath(query, default=None, write=False):
|
||||
return utils_ppath(profile, profile_dir, query, default, write=write)
|
||||
|
||||
language = profile.get("language", "")
|
||||
|
||||
# Inputs
|
||||
stt_system = profile.get("speech_to_text.system")
|
||||
stt_prefix = f"speech_to_text.{stt_system}"
|
||||
@@ -116,177 +107,24 @@ def train_profile(profile_dir: Path, profile: Profile) -> Tuple[int, List[str]]:
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# Create sentence transform for ini parse
|
||||
if word_casing == "upper":
|
||||
# Upper case
|
||||
sentence_transform = str.upper
|
||||
elif word_casing == "lower":
|
||||
# Lower case
|
||||
sentence_transform = str.lower
|
||||
else:
|
||||
# Identity
|
||||
sentence_transform = lambda s: s
|
||||
|
||||
# Parse sentences.ini
|
||||
intents = parse_ini(sentences_ini)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# def task_grammars():
|
||||
# """Transforms sentences.ini into JSGF grammars, one per intent."""
|
||||
# maybe_deps = []
|
||||
|
||||
# # Add profile itself as a dependency
|
||||
# profile_json_path = profile_dir / "profile.json"
|
||||
# if profile_json_path.is_file():
|
||||
# maybe_deps.append(profile_json_path)
|
||||
|
||||
# def ini_to_grammars(targets):
|
||||
# with open(sentences_ini, "r") as sentences_file:
|
||||
# make_grammars(sentences_file, grammar_dir, whitelist=whitelist)
|
||||
|
||||
# return {
|
||||
# "file_dep": [sentences_ini] + maybe_deps,
|
||||
# "targets": [grammar_dir / f"{intent}.gram" for intent in intents],
|
||||
# "actions": [ini_to_grammars],
|
||||
# }
|
||||
|
||||
# # -----------------------------------------------------------------------------
|
||||
|
||||
# def do_slots_to_fst(slot_names, targets):
|
||||
# # Extra arguments for word casing
|
||||
# kwargs = {}
|
||||
# if word_casing == "upper":
|
||||
# kwargs["upper"] = True
|
||||
# elif word_casing == "lower":
|
||||
# kwargs["lower"] = True
|
||||
|
||||
# slot_fsts = slots_to_fsts(slots_dir, slot_names=slot_names, **kwargs)
|
||||
# for slot_name, slot_fst in slot_fsts.items():
|
||||
# # Slot name will already have "$"
|
||||
# slot_fst.write(str(fsts_dir / f"{slot_name}.fst"))
|
||||
|
||||
# def do_grammar_to_fsts(
|
||||
# grammar_path: Path, replace_fst_paths: Dict[str, Path], targets
|
||||
# ):
|
||||
# # Load dependent FSTs
|
||||
# replace_fsts = {
|
||||
# replace_name: fst.Fst.read(str(replace_path))
|
||||
# for replace_name, replace_path in replace_fst_paths.items()
|
||||
# }
|
||||
|
||||
# # Extra arguments for word casing
|
||||
# kwargs = {}
|
||||
# if word_casing == "upper":
|
||||
# kwargs["upper"] = True
|
||||
# elif word_casing == "lower":
|
||||
# kwargs["lower"] = True
|
||||
|
||||
# grammar = grammar_path.read_text()
|
||||
# listener = grammar_to_fsts(grammar, replace_fsts=replace_fsts, **kwargs)
|
||||
# grammar_name = listener.grammar_name
|
||||
|
||||
# # Write FST for each JSGF rule
|
||||
# for rule_name, rule_fst in listener.fsts.items():
|
||||
# fst_path = fsts_dir / f"{rule_name}.fst"
|
||||
# rule_fst.write(str(fst_path))
|
||||
|
||||
# # Write FST for main grammar rule
|
||||
# grammar_fst_path = fsts_dir / f"{grammar_name}.fst"
|
||||
# assert listener.grammar_fst is not None
|
||||
# listener.grammar_fst.write(str(grammar_fst_path))
|
||||
|
||||
# # -----------------------------------------------------------------------------
|
||||
|
||||
# def do_grammar_dependencies(grammar_path: Path, targets):
|
||||
# grammar = grammar_path.read_text()
|
||||
# grammar_deps = get_grammar_dependencies(grammar).graph
|
||||
# graph_json = nx.readwrite.json_graph.node_link_data(grammar_deps)
|
||||
# with open(targets[0], "w") as graph_file:
|
||||
# json.dump(graph_json, graph_file)
|
||||
|
||||
# @create_after(executed="grammars")
|
||||
# def task_grammar_dependencies():
|
||||
# """Creates grammar dependency graphs from JSGF grammars and relevant slots."""
|
||||
|
||||
# for intent in intents:
|
||||
# grammar_path = grammar_dir / f"{intent}.gram"
|
||||
# yield {
|
||||
# "name": intent + "_dependencies",
|
||||
# "file_dep": [grammar_path],
|
||||
# "targets": [str(grammar_path) + ".json"],
|
||||
# "actions": [(do_grammar_dependencies, [grammar_path])],
|
||||
# }
|
||||
|
||||
# # -----------------------------------------------------------------------------
|
||||
|
||||
# @create_after(executed="grammar_dependencies")
|
||||
# def task_grammar_fsts():
|
||||
# """Creates grammar FSTs from JSGF grammars and relevant slots."""
|
||||
# used_slots: Set[str] = set()
|
||||
|
||||
# for intent in intents:
|
||||
# grammar_path = grammar_dir / f"{intent}.gram"
|
||||
# grammar_dep_path = str(grammar_path) + ".json"
|
||||
|
||||
# # Load dependency graph
|
||||
# with open(grammar_dep_path, "r") as graph_file:
|
||||
# graph_data = json.load(graph_file)
|
||||
# grammar_deps = nx.readwrite.json_graph.node_link_graph(graph_data)
|
||||
|
||||
# rule_names: Set[str] = set()
|
||||
# replace_fst_paths: Dict[str, Path] = {}
|
||||
|
||||
# # Process dependencies
|
||||
# for node, data in grammar_deps.nodes(data=True):
|
||||
# node_type = data["type"]
|
||||
|
||||
# if node_type == "slot":
|
||||
# # Strip "$"
|
||||
# slot_name = node[1:]
|
||||
# used_slots.add(slot_name)
|
||||
|
||||
# # Path to slot FST
|
||||
# replace_fst_paths[node] = fsts_dir / f"{node}.fst"
|
||||
# elif node_type == "remote rule":
|
||||
# # Path to rule FST
|
||||
# replace_fst_paths[node] = fsts_dir / f"{node}.fst"
|
||||
# elif node_type == "local rule":
|
||||
# rule_names.add(node)
|
||||
|
||||
# # All rule/grammar FSTs that will be generated
|
||||
# grammar_fst_paths = [
|
||||
# fsts_dir / f"{rule_name}.fst" for rule_name in rule_names
|
||||
# ]
|
||||
# grammar_fst_paths.append(fsts_dir / f"{intent}.fst")
|
||||
|
||||
# yield {
|
||||
# "name": intent + "_fst",
|
||||
# "file_dep": [grammar_path, grammar_dep_path]
|
||||
# + list(replace_fst_paths.values()),
|
||||
# "targets": grammar_fst_paths,
|
||||
# "actions": [(do_grammar_to_fsts, [grammar_path, replace_fst_paths])],
|
||||
# }
|
||||
|
||||
# # slots -> FST
|
||||
# if len(used_slots) > 0:
|
||||
# yield {
|
||||
# "name": "slot_fsts",
|
||||
# "file_dep": [slots_dir / slot_name for slot_name in used_slots],
|
||||
# "targets": [fsts_dir / f"${slot_name}.fst" for slot_name in used_slots],
|
||||
# "actions": [(do_slots_to_fst, [used_slots])],
|
||||
# }
|
||||
|
||||
# # -----------------------------------------------------------------------------
|
||||
|
||||
# def do_intent_fst(intents: Iterable[str], targets):
|
||||
# intent_fsts = {
|
||||
# intent: fst.Fst.read(str(fsts_dir / f"{intent}.fst")) for intent in intents
|
||||
# }
|
||||
# intent_fst = make_intent_fst(intent_fsts)
|
||||
# intent_fst.write(targets[0])
|
||||
|
||||
# @create_after(executed="grammar_fsts")
|
||||
# def task_intent_fst():
|
||||
# """Merges grammar FSTs into single intent.fst."""
|
||||
# return {
|
||||
# "file_dep": [fsts_dir / f"{intent}.fst" for intent in intents],
|
||||
# "targets": [intent_fst],
|
||||
# "actions": [(do_intent_fst, [intents])],
|
||||
# }
|
||||
intents = parse_ini(sentences_ini, sentence_transform=sentence_transform)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def get_slot_names(item):
|
||||
"""Yield referenced slot names."""
|
||||
if isinstance(item, jsgf.SlotReference):
|
||||
yield item.slot_name
|
||||
elif isinstance(item, jsgf.Sequence):
|
||||
@@ -297,9 +135,26 @@ def train_profile(profile_dir: Path, profile: Profile) -> Tuple[int, List[str]]:
|
||||
for slot_name in get_slot_names(item.rule_body):
|
||||
yield slot_name
|
||||
|
||||
def number_transform(word):
|
||||
"""Automatically transform numbers"""
|
||||
if not isinstance(word, jsgf.Word):
|
||||
# Skip anything besides words
|
||||
return
|
||||
|
||||
try:
|
||||
n = int(word.text)
|
||||
|
||||
# 75 -> (seventy five):75
|
||||
word.text = num2words(n, lang=language)
|
||||
word.substitution = str(n)
|
||||
except ValueError:
|
||||
# Not a number
|
||||
pass
|
||||
|
||||
def do_intents_to_graph(intents, slot_names, targets):
|
||||
sentences, replacements = ini_jsgf.split_rules(intents)
|
||||
|
||||
# Load slot values
|
||||
replacements = {}
|
||||
for slot_name in slot_names:
|
||||
slot_path = slots_dir / slot_name
|
||||
assert slot_path.is_file(), f"Missing slot file at {slot_path}"
|
||||
@@ -310,11 +165,18 @@ def train_profile(profile_dir: Path, profile: Profile) -> Tuple[int, List[str]]:
|
||||
for line in slot_file:
|
||||
line = line.strip()
|
||||
if line:
|
||||
slot_values.append(jsgf.Sentence.parse(line))
|
||||
sentence = jsgf.Sentence.parse(line)
|
||||
slot_values.append(sentence)
|
||||
|
||||
# Replace $slot with sentences
|
||||
replacements[f"${slot_name}"] = slot_values
|
||||
|
||||
if profile.get("intent.replace_numbers", True):
|
||||
# Replace numbers in parsed sentences
|
||||
for intent_sentences in sentences.values():
|
||||
for sentence in intent_sentences:
|
||||
jsgf.walk_expression(sentence, number_transform, replacements)
|
||||
|
||||
# Convert to directed graph
|
||||
graph = intents_to_graph(intents, replacements)
|
||||
|
||||
@@ -491,7 +353,7 @@ def train_profile(profile_dir: Path, profile: Profile) -> Tuple[int, List[str]]:
|
||||
|
||||
if unknown_words.exists() and g2p_model.exists():
|
||||
# Generate single pronunciation guesses
|
||||
logger.debug("Guessing pronunciations for unknown word(s)")
|
||||
_LOGGER.debug("Guessing pronunciations for unknown word(s)")
|
||||
|
||||
g2p_output = subprocess.check_output(
|
||||
[
|
||||
|
||||
@@ -198,9 +198,12 @@ def fstprintall(
|
||||
out_file: Optional[TextIO] = None,
|
||||
exclude_meta: bool = True,
|
||||
eps: str = "<eps>",
|
||||
substitute: bool = False,
|
||||
) -> List[List[str]]:
|
||||
sentences = []
|
||||
input_symbols = in_fst.input_symbols()
|
||||
output_symbols = in_fst.output_symbols()
|
||||
in_eps = input_symbols.find(eps)
|
||||
out_eps = output_symbols.find(eps)
|
||||
zero_weight = fst.Weight.Zero(in_fst.weight_type())
|
||||
|
||||
@@ -218,12 +221,25 @@ def fstprintall(
|
||||
|
||||
for arc in in_fst.arcs(state):
|
||||
arc_sentence = list(sentence)
|
||||
if arc.olabel != out_eps:
|
||||
out_symbol = output_symbols.find(arc.olabel).decode()
|
||||
if exclude_meta and out_symbol.startswith("__"):
|
||||
pass # skip __label__, etc.
|
||||
else:
|
||||
arc_sentence.append(out_symbol)
|
||||
if substitute:
|
||||
# Use output label
|
||||
if arc.olabel != out_eps:
|
||||
out_symbol = output_symbols.find(arc.olabel).decode()
|
||||
if exclude_meta and out_symbol.startswith("__"):
|
||||
pass # skip __label__, etc.
|
||||
else:
|
||||
arc_sentence.append(out_symbol)
|
||||
else:
|
||||
# Use input label
|
||||
if arc.ilabel != in_eps:
|
||||
in_symbol = input_symbols.find(arc.ilabel).decode()
|
||||
arc_sentence.append(in_symbol)
|
||||
|
||||
# Use meta output labels
|
||||
if not exclude_meta and (arc.olabel != out_eps):
|
||||
out_symbol = output_symbols.find(arc.olabel).decode()
|
||||
if out_symbol.startswith("__"):
|
||||
arc_sentence.append(out_symbol)
|
||||
|
||||
state_queue.append((arc.nextstate, arc_sentence))
|
||||
|
||||
|
||||
@@ -403,6 +403,10 @@ def numbers_to_words(
|
||||
sentence: str, language: Optional[str] = None, add_substitution: bool = False
|
||||
) -> str:
|
||||
"""Replaces numbers with words in a sentence. Optionally substitues number back in."""
|
||||
if not language:
|
||||
# Default language
|
||||
language = None
|
||||
|
||||
words = split_whitespace(sentence)
|
||||
changed = False
|
||||
for i, word in enumerate(words):
|
||||
|
||||
Reference in New Issue
Block a user