Second attempt at git cleanup

This commit is contained in:
Michael Hansen
2019-03-31 09:50:02 -04:00
parent 830b1b0017
commit 0da6df4ba1
247 changed files with 6867 additions and 105 deletions
Regular → Executable
View File
Regular → Executable
View File
Regular → Executable
+19
View File
@@ -12,14 +12,33 @@ etc/qemu-aarch64-static
# npm
node_modules/
# Web interface
dist/
# Profiles
acoustic_model/
base_dictionary.txt
base_language_model.txt
g2p.fst
adapt_config.json
tagged_sentences.md
unknown_words.txt
frequent_words.html
intent_examples.json
sentences.txt
en-kaldi/
en-zamia/
grammars/
sentences/
record/
# Third party
etc/*.tar.gz
etc/*.tar.xz
etc/*.deb
# Examples
.storage/
Regular → Executable
View File
Regular → Executable
View File
Regular → Executable
View File
+5 -3
View File
@@ -10,8 +10,6 @@ import requests
def main():
profiles_dir = sys.argv[1]
# Languages: eng, deu, fra, spa, ita, nld, rus
languages = {
'eng': 'en',
'deu': 'de',
@@ -19,7 +17,11 @@ def main():
'spa': 'es',
'ita': 'it',
'nld': 'nl',
'rus': 'ru'
'rus': 'ru',
'vie': 'vi',
'cmn': 'zh',
'hin': 'hi',
'ell': 'el'
}
for language in languages:
+17 -14
View File
@@ -1,38 +1,39 @@
#!/usr/bin/env python3
import sys
import re
import argparse
from collections import defaultdict
# This script downloads frequently used words in a language, looks up their
# This script loads frequently used words in a language, looks up their
# pronunciations in a CMU dictionary, then prints an example word +
# pronunciation for each phoneme.
def main():
# frequent words file
freq_words_path = sys.argv[1]
# path to CMU dictionary
dict_path = sys.argv[2]
def main():
parser = argparse.ArgumentParser()
parser.add_argument("frequent_words", help="Path to text file with frequent words")
parser.add_argument("dictionary", help="Path to CMU dictionary")
args = parser.parse_args()
# Download frequently used words in the given language
with open(freq_words_path, 'r') as word_file:
with open(args.frequent_words, "r") as word_file:
words = set([w.strip().upper() for w in word_file.read().splitlines()])
# phoneme -> [(word, pronunciation), ...]
examples = defaultdict(list)
# Find pronunciations for each frequently used word
with open(dict_path, 'r') as dict_file:
with open(args.dictionary, "r") as dict_file:
for line in dict_file:
line = line.strip()
if len(line) == 0:
continue
parts = re.split(r'\s+', line)
parts = re.split(r"\s+", line)
word = parts[0]
if '(' in word:
word = word[:word.index('(')]
if "(" in word:
word = word[: word.index("(")]
# Record example words for each phoneme
upper_word = word.upper()
@@ -44,16 +45,18 @@ def main():
# Pick unique example words for every phoneme
used_words = set()
for phoneme in sorted(examples.keys()):
# Choose the shortest, unused example word for this phoneme
# Choose the shortest, unused example word for this phoneme.
# Exclude words with 3 or fewer letters.
for word, pron in sorted(examples[phoneme], key=lambda kv: len(kv[0])):
if len(word) > 3 and (not word in used_words):
# Output format is:
# phoneme word pronunciation
print(phoneme, word, ' '.join(pron))
print(phoneme, word, " ".join(pron))
used_words.add(word)
break
# -----------------------------------------------------------------------------
if __name__ == '__main__':
if __name__ == "__main__":
main()
+262
View File
@@ -0,0 +1,262 @@
#!/usr/bin/env python3
import os
import sys
import re
import itertools
import argparse
import tempfile
import concurrent.futures
from collections import Counter, defaultdict
import subprocess
import pyparsing as pp
def main():
parser = argparse.ArgumentParser()
parser.add_argument("frequent_words", help="Path to text file with frequent words")
parser.add_argument("dictionary", help="Path to CMU dictionary")
parser.add_argument(
"frequent_phones", help="Path to eSpeak pronunciations for frequent words"
)
args = parser.parse_args()
# Load frequent words
with open(args.frequent_words, "r") as freq_file:
words = set([line.strip().lower() for line in freq_file])
# Find pronunciations for each frequently used word
freq_phonemes = {}
all_phonemes = set()
with open(args.dictionary, "r") as dict_file:
for line in dict_file:
line = line.strip()
if len(line) == 0:
continue
parts = re.split(r"\s+", line)
word = parts[0].lower()
if ("(" in word) or (word in freq_phonemes):
continue
# Record example words for each phoneme
if word in words:
pronunciation = parts[1:]
freq_phonemes[word] = " ".join(pronunciation)
all_phonemes.update(pronunciation)
# Get eSpeak phones
freq_espeak = {}
if not os.path.exists(args.frequent_phones):
# Generate
def get_espeak(word):
phones = (
subprocess.check_output(["espeak", "-q", "-x", word]).decode().strip()
)
return (word, phones)
with concurrent.futures.ThreadPoolExecutor() as executor:
freq_espeak = dict(executor.map(get_espeak, words))
with open(args.frequent_phones, "w") as freq_phones_file:
for word, phones in freq_espeak.items():
print(word, phones, file=freq_phones_file)
else:
# Load from previous run
with open(args.frequent_phones, "r") as freq_phones_file:
for line in freq_phones_file:
line = line.strip()
if len(line) == 0:
continue
parts = re.split(r"\s+", line, maxsplit=1)
word = parts[0].lower()
freq_espeak[word] = parts[1]
# Generate possible mappings
phoneme_counts = Counter()
mappings = []
bad_espeak = (":", ";", "-", "#")
for word, espeak in freq_espeak.items():
if not word in freq_phonemes:
# No pronunciation
continue
phonemes = freq_phonemes[word].split()
# Exclude emphasis, etc.
espeak = [c for c in espeak if c not in ["'", ","]]
if len(phonemes) == len(espeak):
# Direct mapping
context = {}
for p, e in zip(phonemes, espeak):
if e[0] in bad_espeak:
continue
pe_ctx = dict(context)
mappings.append([p, e, pe_ctx])
context[p] = e
phoneme_counts[(p, e)] += 1
else:
# Multiple possibilities
possibilities = itertools.product(*[[(p, 1), (p, 2)] for p in phonemes])
for possibility in possibilities:
poss_len = sum(pl[1] for pl in possibility)
if poss_len > len(espeak):
continue
i = 0
context = {}
maybe_mappings = []
maybe_counts = Counter()
for p, l in possibility:
e = "".join(espeak[i : i + l])
if e[0] in bad_espeak:
continue
pe_ctx = dict(context)
maybe_mappings.append([p, e, pe_ctx])
context[p] = e
maybe_counts[(p, e)] += 1
i += l
if i == len(espeak):
mappings.extend(maybe_mappings)
phoneme_counts += maybe_counts
# Generate candidates
sorted_phonemes = sorted(all_phonemes)
candidates = defaultdict(list)
n = 1
m = 4
for p in all_phonemes:
candidate_counts = [
(e, phoneme_counts[(cp, e)]) for (cp, e) in phoneme_counts.keys() if cp == p
]
candidate_counts = [ec for ec in candidate_counts if ec[1] > n]
candidate_counts = sorted(candidate_counts, key=lambda x: x[1], reverse=True)
if len(candidate_counts) < m:
candidates[p] = [ec[0] for ec in candidate_counts]
else:
candidates[p] = [ec[0] for ec in candidate_counts[:m]]
for p in all_phonemes:
assert p in candidates, p
# for p in sorted_phonemes:
# print(p, ", ".join(candidates[p]))
# Write clingo file
with tempfile.NamedTemporaryFile(suffix=".lp", mode="w+", delete=False) as clingo_file:
for p in sorted_phonemes:
print(f'phoneme("{p}").', file=clingo_file)
for p, es in candidates.items():
for e in es:
print(f'candidate("{p}", "{e}").', file=clingo_file)
for (cp, ce), count in phoneme_counts.items():
if cp == p:
print(
f'candidate_count("{cp}", "{ce}", {count}).',
file=clingo_file,
)
context_counts = Counter()
for mp, me, pe_ctx in mappings:
if (mp == p) and (me == e):
for cp, ce in pe_ctx.items():
context_counts[(cp, ce)] += 1
for (cp, ce), count in context_counts.items():
print(
f'context("{p}", "{e}", "{cp}", "{ce}", {count}).',
file=clingo_file,
)
# -----
print(
"""
0 { maybe_assign(P, E) } 1 :-
candidate(P, E), phoneme(P).
assign(P, E) :- maybe_assign(P, E).
% All must be assigned
:- not assign(P, _), phoneme(P).
% No duplicate assignments
:- assign(P, E1), assign(P, E2),
E1 != E2.
:- assign(P1, E), assign(P2, E),
P1 != P2.
#maximize { S : candidate_count(P1, E1, C), context(P1, E1, P2, E2, N), assign(P1, E1), assign(P2, E2), S = C + N }.
#show assign/2.
#show score/1.
""",
file=clingo_file,
)
# Find optimal assignment
parser = get_parser()
clingo_file.seek(0)
proc = subprocess.run(
["clingo", "-n0", "--verbose=0", "--warn=none", clingo_file.name],
stdout=subprocess.PIPE
)
predicates = []
for line in proc.stdout.splitlines():
line = line.decode().strip()
if len(line) == 0:
continue
elif line.startswith("OPTIMUM FOUND"):
break
else:
try:
predicates = parser.parseString(line, parseAll=True).asList()
except:
pass
# Collect best assignment
assignments = {}
for assignment in predicates:
if assignment[0] != "assign":
continue
# Phonemes are surrounded by double quotes
assignments[assignment[1][1:-1]] = assignment[2][1:-1]
# Print best assignment
for p in sorted_phonemes:
print(p, assignments[p])
# -----------------------------------------------------------------------------
def get_parser():
identifier = pp.Combine(
pp.Word(pp.alphas + "_", exact=1) + pp.Optional(pp.Word(pp.alphanums + "_"))
)
string = pp.quotedString
number = pp.Combine(pp.Optional("-") + pp.Word(pp.nums))
predicate = pp.Forward()
atom = pp.Or([predicate, string, number])
lpar = pp.Literal("(").suppress()
rpar = pp.Literal(")").suppress()
predicate <<= pp.Group(
identifier.setResultsName("head")
+ pp.Optional(lpar + pp.delimitedList(atom).setResultsName("args") + rpar)
)
predicates = pp.OneOrMore(predicate)
return predicates
# -----------------------------------------------------------------------------
if __name__ == "__main__":
main()
Regular → Executable
View File
+1 -1
View File
@@ -84,7 +84,7 @@ python3 -m pip install "${pocketsphinx_file}"
case $CPU_ARCH in
x86_64|armv7l)
snowboy_file="${download_dir}/snowboy-1.3.0.tar.gz"
if [[ ! -f "${pocketsphinx_file}" ]]; then
if [[ ! -f "${snowboy_file}" ]]; then
snowboy_url='https://github.com/Kitt-AI/snowboy/archive/v1.3.0.tar.gz'
echo "Downloading snowboy (${snowboy_url})"
wget -q -O "${snowboy_file}" "${snowboy_url}"
View File
View File
View File
View File
View File
View File
Regular → Executable
View File
Regular → Executable
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File
View File

Some files were not shown because too many files have changed in this diff Show More