Second attempt at git cleanup
This commit is contained in:
Regular → Executable
Regular → Executable
Regular → Executable
+19
@@ -12,14 +12,33 @@ etc/qemu-aarch64-static
|
||||
# npm
|
||||
node_modules/
|
||||
|
||||
# Web interface
|
||||
dist/
|
||||
|
||||
# Profiles
|
||||
acoustic_model/
|
||||
base_dictionary.txt
|
||||
base_language_model.txt
|
||||
g2p.fst
|
||||
|
||||
adapt_config.json
|
||||
tagged_sentences.md
|
||||
unknown_words.txt
|
||||
frequent_words.html
|
||||
intent_examples.json
|
||||
sentences.txt
|
||||
|
||||
en-kaldi/
|
||||
en-zamia/
|
||||
|
||||
grammars/
|
||||
sentences/
|
||||
record/
|
||||
|
||||
# Third party
|
||||
etc/*.tar.gz
|
||||
etc/*.tar.xz
|
||||
etc/*.deb
|
||||
|
||||
# Examples
|
||||
.storage/
|
||||
|
||||
Regular → Executable
Regular → Executable
@@ -10,8 +10,6 @@ import requests
|
||||
|
||||
def main():
|
||||
profiles_dir = sys.argv[1]
|
||||
|
||||
# Languages: eng, deu, fra, spa, ita, nld, rus
|
||||
languages = {
|
||||
'eng': 'en',
|
||||
'deu': 'de',
|
||||
@@ -19,7 +17,11 @@ def main():
|
||||
'spa': 'es',
|
||||
'ita': 'it',
|
||||
'nld': 'nl',
|
||||
'rus': 'ru'
|
||||
'rus': 'ru',
|
||||
'vie': 'vi',
|
||||
'cmn': 'zh',
|
||||
'hin': 'hi',
|
||||
'ell': 'el'
|
||||
}
|
||||
|
||||
for language in languages:
|
||||
|
||||
+17
-14
@@ -1,38 +1,39 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import re
|
||||
import argparse
|
||||
from collections import defaultdict
|
||||
|
||||
# This script downloads frequently used words in a language, looks up their
|
||||
# This script loads frequently used words in a language, looks up their
|
||||
# pronunciations in a CMU dictionary, then prints an example word +
|
||||
# pronunciation for each phoneme.
|
||||
|
||||
def main():
|
||||
# frequent words file
|
||||
freq_words_path = sys.argv[1]
|
||||
|
||||
# path to CMU dictionary
|
||||
dict_path = sys.argv[2]
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("frequent_words", help="Path to text file with frequent words")
|
||||
parser.add_argument("dictionary", help="Path to CMU dictionary")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load frequently used words from the given file
|
||||
with open(freq_words_path, 'r') as word_file:
|
||||
with open(args.frequent_words, "r") as word_file:
|
||||
words = set([w.strip().upper() for w in word_file.read().splitlines()])
|
||||
|
||||
# phoneme -> [(word, pronunciation), ...]
|
||||
examples = defaultdict(list)
|
||||
|
||||
# Find pronunciations for each frequently used word
|
||||
with open(dict_path, 'r') as dict_file:
|
||||
with open(args.dictionary, "r") as dict_file:
|
||||
for line in dict_file:
|
||||
line = line.strip()
|
||||
if len(line) == 0:
|
||||
continue
|
||||
|
||||
parts = re.split(r'\s+', line)
|
||||
parts = re.split(r"\s+", line)
|
||||
word = parts[0]
|
||||
|
||||
if '(' in word:
|
||||
word = word[:word.index('(')]
|
||||
if "(" in word:
|
||||
word = word[: word.index("(")]
|
||||
|
||||
# Record example words for each phoneme
|
||||
upper_word = word.upper()
|
||||
@@ -44,16 +45,18 @@ def main():
|
||||
# Pick unique example words for every phoneme
|
||||
used_words = set()
|
||||
for phoneme in sorted(examples.keys()):
|
||||
# Choose the shortest, unused example word for this phoneme
|
||||
# Choose the shortest, unused example word for this phoneme.
|
||||
# Exclude words with 3 or fewer letters.
|
||||
for word, pron in sorted(examples[phoneme], key=lambda kv: len(kv[0])):
|
||||
if len(word) > 3 and (not word in used_words):
|
||||
# Output format is:
|
||||
# phoneme word pronunciation
|
||||
print(phoneme, word, ' '.join(pron))
|
||||
print(phoneme, word, " ".join(pron))
|
||||
used_words.add(word)
|
||||
break
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Executable
+262
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import itertools
|
||||
import argparse
|
||||
import tempfile
|
||||
import concurrent.futures
|
||||
from collections import Counter, defaultdict
|
||||
import subprocess
|
||||
|
||||
import pyparsing as pp
|
||||
|
||||
|
||||
def main():
    """Guess a one-to-one mapping from dictionary phonemes to eSpeak phones.

    Command-line arguments:
        frequent_words:  text file with one frequent word per line.
        dictionary:      CMU-style dictionary (word followed by its phonemes).
        frequent_phones: cache of eSpeak output for the frequent words; it is
                         generated with ``espeak`` when it does not exist yet.

    For every phoneme seen in the dictionary entries of the frequent words,
    prints ``<phoneme> <eSpeak phone>`` to stdout, sorted by phoneme.  Exits
    with an error message if the solver assigns no phone to some phoneme.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "frequent_words", help="Path to text file with frequent words"
    )
    arg_parser.add_argument("dictionary", help="Path to CMU dictionary")
    arg_parser.add_argument(
        "frequent_phones", help="Path to eSpeak pronunciations for frequent words"
    )
    args = arg_parser.parse_args()

    words = _load_frequent_words(args.frequent_words)
    freq_phonemes, all_phonemes = _load_pronunciations(args.dictionary, words)
    freq_espeak = _load_or_generate_espeak(args.frequent_phones, words)
    mappings, phoneme_counts = _generate_mappings(freq_phonemes, freq_espeak)

    sorted_phonemes = sorted(all_phonemes)
    candidates = _pick_candidates(sorted_phonemes, phoneme_counts)

    # The solver program requires every phoneme to be assigned, so a phoneme
    # without any candidate makes the whole problem unsatisfiable.
    without_candidates = [p for p in sorted_phonemes if not candidates[p]]
    if without_candidates:
        print(
            "Warning: no eSpeak candidates for:",
            " ".join(without_candidates),
            file=sys.stderr,
        )

    assignments = _solve_assignment(
        sorted_phonemes, candidates, phoneme_counts, mappings
    )

    unassigned = [p for p in sorted_phonemes if p not in assignments]
    if unassigned:
        # Previously this surfaced as a bare KeyError while printing.
        sys.exit(f"No eSpeak phone was assigned to: {' '.join(unassigned)}")

    # Print best assignment
    for p in sorted_phonemes:
        print(p, assignments[p])


# Solver rules appended after the generated facts.
# NOTE(review): nothing in this program defines score/1, so the final #show
# directive is inert; it is kept unchanged from the original program text.
_CLINGO_RULES = """
0 { maybe_assign(P, E) } 1 :-
    candidate(P, E), phoneme(P).

assign(P, E) :- maybe_assign(P, E).

% All must be assigned
:- not assign(P, _), phoneme(P).

% No duplicate assignments
:- assign(P, E1), assign(P, E2),
    E1 != E2.

:- assign(P1, E), assign(P2, E),
    P1 != P2.

#maximize { S : candidate_count(P1, E1, C), context(P1, E1, P2, E2, N), assign(P1, E1), assign(P2, E2), S = C + N }.

#show assign/2.
#show score/1.
"""


def _load_frequent_words(path):
    """Return the set of lower-cased, non-empty lines of the file at *path*."""
    with open(path, "r") as freq_file:
        stripped = (line.strip().lower() for line in freq_file)
        # Blank lines would otherwise become an empty "word" handed to espeak.
        return {word for word in stripped if word}


def _load_pronunciations(path, words):
    """Read dictionary pronunciations for *words*.

    Returns ``(freq_phonemes, all_phonemes)``: a dict mapping each word to its
    space-joined phonemes (first entry wins; alternates such as ``word(2)``
    are ignored) and the set of all phonemes used by those pronunciations.
    """
    freq_phonemes = {}
    all_phonemes = set()
    with open(path, "r") as dict_file:
        for line in dict_file:
            line = line.strip()
            if not line:
                continue

            parts = re.split(r"\s+", line)
            word = parts[0].lower()

            # Skip alternate pronunciations and words that were already seen
            if ("(" in word) or (word in freq_phonemes):
                continue

            if word in words:
                pronunciation = parts[1:]
                freq_phonemes[word] = " ".join(pronunciation)
                all_phonemes.update(pronunciation)

    return freq_phonemes, all_phonemes


def _load_or_generate_espeak(path, words):
    """Return a dict of word -> eSpeak phone string.

    When *path* exists it is read as a cache of ``word phones`` lines.
    Otherwise ``espeak -q -x`` is run for every word (in a thread pool) and
    the results are written to *path* for the next run.
    """
    if os.path.exists(path):
        # Load from previous run
        freq_espeak = {}
        with open(path, "r") as freq_phones_file:
            for line in freq_phones_file:
                line = line.strip()
                if not line:
                    continue

                parts = re.split(r"\s+", line, maxsplit=1)
                # A word whose eSpeak output was empty is cached as the word
                # alone; treat it as having no phones instead of crashing.
                freq_espeak[parts[0].lower()] = parts[1] if len(parts) > 1 else ""

        return freq_espeak

    def get_espeak(word):
        phones = subprocess.check_output(["espeak", "-q", "-x", word]).decode().strip()
        return (word, phones)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        freq_espeak = dict(executor.map(get_espeak, words))

    with open(path, "w") as freq_phones_file:
        for word, phones in freq_espeak.items():
            print(word, phones, file=freq_phones_file)

    return freq_espeak


def _generate_mappings(freq_phonemes, freq_espeak):
    """Align dictionary phonemes with eSpeak phones for every shared word.

    Returns ``(mappings, phoneme_counts)`` where *mappings* is a list of
    ``[phoneme, espeak, context]`` (context = the phoneme -> espeak pairs
    chosen earlier in the same word) and *phoneme_counts* counts how often
    each ``(phoneme, espeak)`` pair was produced.
    """
    phoneme_counts = Counter()
    mappings = []

    # A segment starting with one of these is never used as a mapping target
    bad_espeak = (":", ";", "-", "#")

    for word, espeak in freq_espeak.items():
        if word not in freq_phonemes:
            # No pronunciation
            continue

        phonemes = freq_phonemes[word].split()

        # Exclude emphasis, etc.
        espeak = [c for c in espeak if c not in ("'", ",")]

        if len(phonemes) == len(espeak):
            # Direct mapping: one eSpeak character per phoneme
            context = {}
            for p, e in zip(phonemes, espeak):
                if e[0] in bad_espeak:
                    continue
                mappings.append([p, e, dict(context)])
                context[p] = e
                phoneme_counts[(p, e)] += 1
            continue

        # Multiple possibilities: each phoneme covers 1 or 2 eSpeak characters
        possibilities = itertools.product(*[[(p, 1), (p, 2)] for p in phonemes])
        for possibility in possibilities:
            # Only segmentations that consume the eSpeak string exactly can be
            # accepted, so anything else is rejected before doing any work.
            if sum(length for _, length in possibility) != len(espeak):
                continue

            i = 0
            context = {}
            maybe_mappings = []
            maybe_counts = Counter()
            for p, length in possibility:
                e = "".join(espeak[i : i + length])
                if e[0] in bad_espeak:
                    # A rejected segment means the alignment can never cover
                    # the whole string, so the possibility is discarded.
                    break
                maybe_mappings.append([p, e, dict(context)])
                context[p] = e
                maybe_counts[(p, e)] += 1
                i += length
            else:
                mappings.extend(maybe_mappings)
                phoneme_counts += maybe_counts

    return mappings, phoneme_counts


def _pick_candidates(sorted_phonemes, phoneme_counts, min_count=1, max_candidates=4):
    """Return phoneme -> list of its most frequent eSpeak candidates.

    Only pairs seen more than *min_count* times are kept, ordered by
    descending count, and at most *max_candidates* are returned per phoneme.
    """
    candidates = {}
    for p in sorted_phonemes:
        counted = [
            (e, count)
            for (cp, e), count in phoneme_counts.items()
            if cp == p and count > min_count
        ]
        counted.sort(key=lambda ec: ec[1], reverse=True)
        candidates[p] = [e for e, _ in counted[:max_candidates]]

    return candidates


def _write_clingo_program(clingo_file, sorted_phonemes, candidates, phoneme_counts, mappings):
    """Write the facts and solver rules for the assignment problem to *clingo_file*."""
    # Count, for every (phoneme, espeak) mapping, which earlier pairs appeared
    # in its context.  Done in one pass instead of rescanning all mappings for
    # each candidate.
    context_counts = defaultdict(Counter)
    for mp, me, pe_ctx in mappings:
        context_counts[(mp, me)].update(pe_ctx.items())

    for p in sorted_phonemes:
        print(f'phoneme("{p}").', file=clingo_file)

    for p, es in candidates.items():
        if not es:
            continue

        for e in es:
            print(f'candidate("{p}", "{e}").', file=clingo_file)

        # Counts are emitted for every eSpeak phone seen with this phoneme
        for (cp, ce), count in phoneme_counts.items():
            if cp == p:
                print(
                    f'candidate_count("{cp}", "{ce}", {count}).',
                    file=clingo_file,
                )

        for e in es:
            for (cp, ce), count in context_counts[(p, e)].items():
                print(
                    f'context("{p}", "{e}", "{cp}", "{ce}", {count}).',
                    file=clingo_file,
                )

    # -----

    print(_CLINGO_RULES, file=clingo_file)


def _solve_assignment(sorted_phonemes, candidates, phoneme_counts, mappings):
    """Run clingo on the generated program and return phoneme -> eSpeak phone.

    The result is taken from the last answer printed before the
    ``OPTIMUM FOUND`` line; it is empty when no answer was printed.
    """
    clingo_path = None
    try:
        with tempfile.NamedTemporaryFile(
            suffix=".lp", mode="w", delete=False
        ) as clingo_file:
            clingo_path = clingo_file.name
            _write_clingo_program(
                clingo_file, sorted_phonemes, candidates, phoneme_counts, mappings
            )

        # The file is closed (and therefore flushed) before clingo reads it.
        # The exit status is deliberately not checked: clingo reports solve
        # outcomes through non-zero exit codes.
        proc = subprocess.run(
            ["clingo", "-n0", "--verbose=0", "--warn=none", clingo_path],
            stdout=subprocess.PIPE,
        )
    finally:
        # delete=False keeps the file around for clingo; remove it afterwards
        if clingo_path is not None:
            os.unlink(clingo_path)

    answer_parser = get_parser()
    predicates = []
    for raw_line in proc.stdout.splitlines():
        line = raw_line.decode().strip()
        if not line:
            continue
        if line.startswith("OPTIMUM FOUND"):
            break

        try:
            # Later answers replace earlier ones
            predicates = answer_parser.parseString(line, parseAll=True).asList()
        except pp.ParseBaseException:
            # Not an answer line (e.g. the optimization value)
            continue

    # Collect best assignment
    assignments = {}
    for predicate in predicates:
        if predicate[0] != "assign":
            continue

        # Phonemes are surrounded by double quotes
        assignments[predicate[1][1:-1]] = predicate[2][1:-1]

    return assignments
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def get_parser():
    """Build the pyparsing grammar used to read a line of solver output.

    The grammar accepts one or more predicates.  A predicate is an identifier
    optionally followed by a parenthesised, comma-separated argument list;
    each argument is a nested predicate, a quoted string, or an (optionally
    negative) integer.  Every predicate is returned as its own group.
    """
    name_chars = pp.alphanums + "_"
    first_char = pp.Word(pp.alphas + "_", exact=1)
    identifier = pp.Combine(first_char + pp.Optional(pp.Word(name_chars)))

    quoted = pp.quotedString
    integer = pp.Combine(pp.Optional("-") + pp.Word(pp.nums))

    # Forward declaration: arguments may themselves be predicates
    predicate = pp.Forward()
    argument = pp.Or([predicate, quoted, integer])

    open_paren = pp.Suppress(pp.Literal("("))
    close_paren = pp.Suppress(pp.Literal(")"))
    argument_list = open_paren + pp.delimitedList(argument)("args") + close_paren

    predicate <<= pp.Group(identifier("head") + pp.Optional(argument_list))

    return pp.OneOrMore(predicate)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Regular → Executable
+1
-1
@@ -84,7 +84,7 @@ python3 -m pip install "${pocketsphinx_file}"
|
||||
case $CPU_ARCH in
|
||||
x86_64|armv7l)
|
||||
snowboy_file="${download_dir}/snowboy-1.3.0.tar.gz"
|
||||
if [[ ! -f "${pocketsphinx_file}" ]]; then
|
||||
if [[ ! -f "${snowboy_file}" ]]; then
|
||||
snowboy_url='https://github.com/Kitt-AI/snowboy/archive/v1.3.0.tar.gz'
|
||||
echo "Downloading snowboy (${snowboy_url})"
|
||||
wget -q -O "${snowboy_file}" "${snowboy_url}"
|
||||
|
||||
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Regular → Executable
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user