mirror of
https://github.com/cmusphinx/pocketsphinx.git
synced 2026-05-17 12:20:35 +00:00
77bcb6418e
Add output format section to general help describing JSON fields (b, d, p, t, w). Add documentation to align help explaining when each level (words, phones, states) is produced based on options.
870 lines
26 KiB
C
870 lines
26 KiB
C
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
|
/* ====================================================================
|
|
* Copyright (c) 1999-2022 Carnegie Mellon University. All rights
|
|
* reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in
|
|
* the documentation and/or other materials provided with the
|
|
* distribution.
|
|
*
|
|
* This work was supported in part by funding from the Defense Advanced
|
|
* Research Projects Agency and the National Science Foundation of the
|
|
* United States of America, and the CMU Sphinx Speech Consortium.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
|
|
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
|
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
|
|
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* ====================================================================
|
|
*
|
|
*/
|
|
/**
|
|
* @file pocketsphinx.c
|
|
* @brief Simple command-line speech recognition.
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <assert.h>
|
|
#include <signal.h>
|
|
|
|
#include <pocketsphinx.h>
|
|
|
|
#include "util/ckd_alloc.h"
|
|
#include "config_macro.h"
|
|
#include "pocketsphinx_internal.h"
|
|
#include "ps_alignment_internal.h"
|
|
|
|
/* Le sigh. Didn't want to have to do this. */
|
|
static const ps_arg_t ps_main_args_def[] = {
|
|
POCKETSPHINX_OPTIONS,
|
|
{ "config",
|
|
ARG_STRING,
|
|
NULL,
|
|
"JSON file with configuration." },
|
|
{ "phone_align",
|
|
ARG_BOOLEAN,
|
|
"no",
|
|
"Run a second pass to align phones and print their durations "
|
|
"(DOES NOT WORK IN LIVE MODE)." },
|
|
{ "state_align",
|
|
ARG_BOOLEAN,
|
|
"no",
|
|
"Run a second pass to align phones and states and print their durations "
|
|
"(Implies -phone_align) "
|
|
"(DOES NOT WORK IN LIVE MODE)." },
|
|
CMDLN_EMPTY_OPTION
|
|
};
|
|
|
|
/*
 * Set to 1 by catch_sig() to ask the reading loops to stop.  It is
 * declared volatile sig_atomic_t because that is the only kind of
 * object the C standard allows an asynchronous signal handler to
 * write to with defined behaviour.
 */
static volatile sig_atomic_t global_done = 0;

/**
 * Signal handler: request an orderly stop of the current read loop.
 *
 * @param signum Signal number (ignored).
 */
static void
catch_sig(int signum)
{
    (void)signum;
    global_done = 1;
}
|
|
|
|
#define HYP_FORMAT "{\"b\":%.3f,\"d\":%.3f,\"p\":%.3f,\"t\":\"%s\""
|
|
static int
|
|
format_hyp(char *outptr, int len, ps_endpointer_t *ep, ps_decoder_t *decoder)
|
|
{
|
|
logmath_t *lmath;
|
|
double prob, st, et;
|
|
const char *hyp;
|
|
|
|
lmath = ps_get_logmath(decoder);
|
|
prob = logmath_exp(lmath, ps_get_prob(decoder));
|
|
if (ep == NULL) {
|
|
st = 0.0;
|
|
et = (double)ps_get_n_frames(decoder)
|
|
/ ps_config_int(ps_get_config(decoder), "frate");
|
|
}
|
|
else {
|
|
st = ps_endpointer_speech_start(ep);
|
|
et = ps_endpointer_speech_end(ep);
|
|
}
|
|
hyp = ps_get_hyp(decoder, NULL);
|
|
if (hyp == NULL)
|
|
hyp = "";
|
|
return snprintf(outptr, len, HYP_FORMAT, st, et - st, prob, hyp);
|
|
}
|
|
|
|
static int
|
|
format_seg(char *outptr, int len, ps_seg_t *seg,
|
|
double utt_start, int frate,
|
|
logmath_t *lmath)
|
|
{
|
|
double prob, st, dur;
|
|
int sf, ef;
|
|
const char *word;
|
|
|
|
ps_seg_frames(seg, &sf, &ef);
|
|
st = utt_start + (double)sf / frate;
|
|
dur = (double)(ef + 1 - sf) / frate;
|
|
word = ps_seg_word(seg);
|
|
if (word == NULL)
|
|
word = "";
|
|
prob = logmath_exp(lmath, ps_seg_prob(seg, NULL, NULL, NULL));
|
|
len = snprintf(outptr, len, HYP_FORMAT, st, dur, prob, word);
|
|
if (outptr) {
|
|
outptr += len;
|
|
*outptr++ = '}';
|
|
*outptr = '\0';
|
|
}
|
|
len++;
|
|
return len;
|
|
}
|
|
|
|
static int
|
|
format_align_iter(char *outptr, int maxlen,
|
|
ps_alignment_iter_t *itor, double utt_start, int frate, logmath_t *lmath)
|
|
{
|
|
int start, duration, score;
|
|
double prob, st, dur;
|
|
const char *word;
|
|
|
|
score = ps_alignment_iter_seg(itor, &start, &duration);
|
|
st = utt_start + (double)start / frate;
|
|
dur = (double)duration / frate;
|
|
prob = logmath_exp(lmath, score);
|
|
word = ps_alignment_iter_name(itor);
|
|
if (word == NULL)
|
|
word = "";
|
|
|
|
return snprintf(outptr, maxlen, HYP_FORMAT, st, dur, prob, word);
|
|
}
|
|
|
|
static int
|
|
format_seg_align(char *outptr, int maxlen,
|
|
ps_alignment_iter_t *itor,
|
|
double utt_start, int frate,
|
|
logmath_t *lmath, int state_align)
|
|
{
|
|
ps_alignment_iter_t *pitor;
|
|
int len = 0, hyplen;
|
|
|
|
hyplen = format_align_iter(outptr, maxlen,
|
|
itor, utt_start, frate, lmath);
|
|
len += hyplen;
|
|
if (outptr)
|
|
outptr += hyplen;
|
|
if (maxlen)
|
|
maxlen -= hyplen;
|
|
|
|
len += 6; /* "w":,[ */
|
|
if (outptr) {
|
|
memcpy(outptr, ",\"w\":[", 6);
|
|
outptr += 6;
|
|
}
|
|
if (maxlen)
|
|
maxlen -= 6;
|
|
|
|
pitor = ps_alignment_iter_children(itor);
|
|
while (pitor != NULL) {
|
|
hyplen = format_align_iter(outptr, maxlen,
|
|
pitor, utt_start, frate, lmath);
|
|
len += hyplen;
|
|
if (outptr)
|
|
outptr += hyplen;
|
|
if (maxlen)
|
|
maxlen -= hyplen;
|
|
|
|
/* FIXME: refactor with recursion, someday */
|
|
if (state_align) {
|
|
ps_alignment_iter_t *sitor = ps_alignment_iter_children(pitor);
|
|
len += 6; /* "w":,[ */
|
|
if (outptr) {
|
|
memcpy(outptr, ",\"w\":[", 6);
|
|
outptr += 6;
|
|
}
|
|
if (maxlen)
|
|
maxlen -= 6;
|
|
while (sitor != NULL) {
|
|
hyplen = format_align_iter(outptr, maxlen,
|
|
sitor, utt_start, frate, lmath);
|
|
len += hyplen;
|
|
if (outptr)
|
|
outptr += hyplen;
|
|
if (maxlen)
|
|
maxlen -= hyplen;
|
|
|
|
len++; /* } */
|
|
if (outptr)
|
|
*outptr++ = '}';
|
|
if (maxlen)
|
|
maxlen--;
|
|
sitor = ps_alignment_iter_next(sitor);
|
|
if (sitor != NULL) {
|
|
len++;
|
|
if (outptr)
|
|
*outptr++ = ',';
|
|
if (maxlen)
|
|
maxlen--;
|
|
}
|
|
}
|
|
len++;
|
|
if (outptr)
|
|
*outptr++ = ']';
|
|
if (maxlen)
|
|
maxlen--;
|
|
}
|
|
|
|
len++; /* } */
|
|
if (outptr)
|
|
*outptr++ = '}';
|
|
if (maxlen)
|
|
maxlen--;
|
|
pitor = ps_alignment_iter_next(pitor);
|
|
if (pitor != NULL) {
|
|
len++;
|
|
if (outptr)
|
|
*outptr++ = ',';
|
|
if (maxlen)
|
|
maxlen--;
|
|
}
|
|
}
|
|
|
|
len += 2;
|
|
if (outptr) {
|
|
*outptr++ = ']';
|
|
*outptr++ = '}';
|
|
*outptr = '\0';
|
|
}
|
|
if (maxlen)
|
|
maxlen--;
|
|
|
|
return len;
|
|
}
|
|
|
|
static void
|
|
output_hyp(ps_endpointer_t *ep, ps_decoder_t *decoder, ps_alignment_t *alignment)
|
|
{
|
|
logmath_t *lmath;
|
|
char *hyp_json, *ptr;
|
|
int frate;
|
|
int maxlen, len;
|
|
double st;
|
|
int state_align = ps_config_bool(decoder->config, "state_align");
|
|
|
|
maxlen = format_hyp(NULL, 0, ep, decoder);
|
|
maxlen += 6; /* "w":,[ */
|
|
lmath = ps_get_logmath(decoder);
|
|
frate = ps_config_int(ps_get_config(decoder), "frate");
|
|
if (ep == NULL)
|
|
st = 0.0;
|
|
else
|
|
st = ps_endpointer_speech_start(ep);
|
|
if (alignment) {
|
|
ps_alignment_iter_t *itor = ps_alignment_words(alignment);
|
|
if (itor == NULL)
|
|
maxlen++; /* ] at end */
|
|
for (; itor; itor = ps_alignment_iter_next(itor)) {
|
|
maxlen += format_seg_align(NULL, 0, itor, st, frate,
|
|
lmath, state_align);
|
|
maxlen++; /* , or ] at end */
|
|
}
|
|
}
|
|
else {
|
|
ps_seg_t *itor = ps_seg_iter(decoder);
|
|
if (itor == NULL)
|
|
maxlen++; /* ] at end */
|
|
for (; itor; itor = ps_seg_next(itor)) {
|
|
maxlen += format_seg(NULL, 0, itor, st, frate, lmath);
|
|
maxlen++; /* , or ] at end */
|
|
}
|
|
}
|
|
maxlen++; /* final } */
|
|
maxlen++; /* trailing \0 */
|
|
|
|
ptr = hyp_json = ckd_calloc(maxlen, 1);
|
|
len = maxlen;
|
|
len = format_hyp(hyp_json, len, ep, decoder);
|
|
ptr += len;
|
|
maxlen -= len;
|
|
|
|
assert(maxlen > 6);
|
|
memcpy(ptr, ",\"w\":[", 6);
|
|
ptr += 6;
|
|
maxlen -= 6;
|
|
|
|
if (alignment) {
|
|
ps_alignment_iter_t *itor;
|
|
for (itor = ps_alignment_words(alignment); itor;
|
|
itor = ps_alignment_iter_next(itor)) {
|
|
assert(maxlen > 0);
|
|
len = format_seg_align(ptr, maxlen, itor, st, frate, lmath,
|
|
state_align);
|
|
ptr += len;
|
|
maxlen -= len;
|
|
*ptr++ = ',';
|
|
maxlen--;
|
|
}
|
|
}
|
|
else {
|
|
ps_seg_t *itor = ps_seg_iter(decoder);
|
|
if (itor == NULL) {
|
|
*ptr++ = ']'; /* Gets overwritten below... */
|
|
maxlen--;
|
|
}
|
|
for (; itor; itor = ps_seg_next(itor)) {
|
|
assert(maxlen > 0);
|
|
len = format_seg(ptr, maxlen, itor, st, frate, lmath);
|
|
ptr += len;
|
|
maxlen -= len;
|
|
*ptr++ = ',';
|
|
maxlen--;
|
|
}
|
|
}
|
|
--ptr;
|
|
*ptr++ = ']';
|
|
assert(maxlen == 2);
|
|
*ptr++ = '}';
|
|
--maxlen;
|
|
*ptr = '\0';
|
|
puts(hyp_json);
|
|
fflush(stdout);
|
|
ckd_free(hyp_json);
|
|
}
|
|
|
|
static int
|
|
live(ps_config_t *config, FILE *infile)
|
|
{
|
|
ps_decoder_t *decoder = NULL;
|
|
ps_endpointer_t *ep = NULL;
|
|
short *frame = NULL;
|
|
size_t frame_size;
|
|
|
|
if ((decoder = ps_init(config)) == NULL) {
|
|
E_FATAL("PocketSphinx decoder init failed\n");
|
|
goto error_out;
|
|
}
|
|
if ((ep = ps_endpointer_init(0, 0.0,
|
|
0, ps_config_int(config, "samprate"),
|
|
0)) == NULL) {
|
|
E_ERROR("PocketSphinx endpointer init failed\n");
|
|
goto error_out;
|
|
}
|
|
frame_size = ps_endpointer_frame_size(ep);
|
|
if ((frame = ckd_calloc(frame_size, sizeof(frame[0]))) == NULL) {
|
|
E_ERROR("Failed to allocate frame");
|
|
goto error_out;
|
|
}
|
|
if (signal(SIGINT, catch_sig) == SIG_ERR)
|
|
E_FATAL_SYSTEM("Failed to set SIGINT handler");
|
|
while (!global_done) {
|
|
const int16 *speech;
|
|
int prev_in_speech = ps_endpointer_in_speech(ep);
|
|
size_t len, end_samples;
|
|
if ((len = fread(frame, sizeof(frame[0]),
|
|
frame_size, infile)) != frame_size) {
|
|
if (len > 0) {
|
|
speech = ps_endpointer_end_stream(ep, frame,
|
|
frame_size,
|
|
&end_samples);
|
|
}
|
|
else
|
|
break;
|
|
} else
|
|
speech = ps_endpointer_process(ep, frame);
|
|
if (speech != NULL) {
|
|
if (!prev_in_speech) {
|
|
E_INFO("Speech start at %.2f\n",
|
|
ps_endpointer_speech_start(ep));
|
|
ps_start_utt(decoder);
|
|
}
|
|
if (ps_process_raw(decoder, speech, frame_size, FALSE, FALSE) < 0) {
|
|
E_ERROR("ps_process_raw() failed\n");
|
|
goto error_out;
|
|
}
|
|
if (!ps_endpointer_in_speech(ep)) {
|
|
E_INFO("Speech end at %.2f\n",
|
|
ps_endpointer_speech_end(ep));
|
|
ps_end_utt(decoder);
|
|
if (ps_config_bool(decoder->config, "phone_align"))
|
|
E_WARN("Subword alignment not yet supported in live mode\n");
|
|
output_hyp(ep, decoder, NULL);
|
|
}
|
|
}
|
|
}
|
|
ckd_free(frame);
|
|
ps_endpointer_free(ep);
|
|
ps_free(decoder);
|
|
return 0;
|
|
|
|
error_out:
|
|
if (frame)
|
|
ckd_free(frame);
|
|
if (ep)
|
|
ps_endpointer_free(ep);
|
|
if (decoder)
|
|
ps_free(decoder);
|
|
return -1;
|
|
}
|
|
|
|
static int
|
|
decode_single(ps_decoder_t *decoder, FILE *infile)
|
|
{
|
|
ps_alignment_t *alignment = NULL;
|
|
size_t data_size, block_size;
|
|
short *data, *ptr;
|
|
int rv = 0;
|
|
|
|
data_size = 65536;
|
|
block_size = 2048;
|
|
ptr = data = ckd_calloc(data_size, sizeof(*data));
|
|
if (signal(SIGINT, catch_sig) == SIG_ERR)
|
|
E_FATAL_SYSTEM("Failed to set SIGINT handler");
|
|
while (!global_done) {
|
|
size_t len;
|
|
if ((size_t)(ptr + block_size - data) > data_size) {
|
|
len = ptr - data;
|
|
data_size *= 2;
|
|
data = ckd_realloc(data, data_size * sizeof(*data));
|
|
ptr = data + len;
|
|
}
|
|
len = fread(ptr, sizeof(*ptr), block_size, infile);
|
|
if (len == 0) {
|
|
if (feof(infile))
|
|
break;
|
|
else {
|
|
E_ERROR_SYSTEM("Failed to read %d bytes\n",
|
|
sizeof(*ptr) * block_size);
|
|
rv = -1;
|
|
goto error_out;
|
|
}
|
|
}
|
|
ptr += len;
|
|
}
|
|
if ((rv = ps_start_utt(decoder)) < 0)
|
|
goto error_out;
|
|
if ((rv = ps_process_raw(decoder, data, ptr - data, FALSE, TRUE)) < 0) {
|
|
E_ERROR("ps_process_raw() failed\n");
|
|
goto error_out;
|
|
}
|
|
if ((rv = ps_end_utt(decoder)) < 0)
|
|
goto error_out;
|
|
if (ps_config_bool(decoder->config, "phone_align")) {
|
|
const char *prev_search = ps_current_search(decoder);
|
|
if (ps_set_alignment(decoder, NULL) < 0)
|
|
goto error_out;
|
|
if ((rv = ps_start_utt(decoder)) < 0)
|
|
goto error_out;
|
|
if ((rv = ps_process_raw(decoder, data, ptr - data, FALSE, TRUE)) < 0) {
|
|
E_ERROR("ps_process_raw() failed\n");
|
|
goto error_out;
|
|
}
|
|
if ((rv = ps_end_utt(decoder)) < 0)
|
|
goto error_out;
|
|
if ((alignment = ps_get_alignment(decoder)) == NULL)
|
|
goto error_out;
|
|
ps_activate_search(decoder, prev_search);
|
|
}
|
|
output_hyp(NULL, decoder, alignment);
|
|
/* Fall through intentionally */
|
|
error_out:
|
|
ckd_free(data);
|
|
return rv;
|
|
}
|
|
|
|
static int
|
|
single(ps_config_t *config, FILE *infile)
|
|
{
|
|
ps_decoder_t *decoder;
|
|
int rv = 0;
|
|
|
|
if ((decoder = ps_init(config)) == NULL) {
|
|
E_FATAL("PocketSphinx decoder init failed\n");
|
|
return -1;
|
|
}
|
|
rv = decode_single(decoder, infile);
|
|
ps_free(decoder);
|
|
return rv;
|
|
}
|
|
|
|
/**
 * Join strings with single spaces into one newly allocated string.
 *
 * @param strings Array of NUL-terminated strings.
 * @param nstrings Number of entries in strings; zero or less yields
 *                 an empty string.
 * @return Joined string, allocated with ckd_malloc(); the caller
 *         releases it with ckd_free().
 */
static char *
string_array_join(char **strings, int nstrings)
{
    char *joined, *ptr;
    size_t total = 0;
    int i;

    /* With nothing to join there is no trailing separator to turn
     * into the terminator, so build the empty string directly. */
    if (nstrings <= 0) {
        joined = ckd_malloc(1);
        joined[0] = '\0';
        return joined;
    }

    /* Each string needs its own length plus one byte for the
     * separator (the terminator, for the last one). */
    for (i = 0; i < nstrings; ++i)
        total += strlen(strings[i]) + 1;

    ptr = joined = ckd_malloc(total);
    for (i = 0; i < nstrings; ++i) {
        size_t n = strlen(strings[i]);
        memcpy(ptr, strings[i], n);
        ptr += n;
        *ptr++ = ' ';
    }
    /* Overwrite the final separator with the terminator. */
    *--ptr = '\0';
    return joined;
}
|
|
|
|
static int
|
|
align(ps_config_t *config, char **inputs, int ninputs)
|
|
{
|
|
int rv = 0, is_stdin = FALSE;
|
|
ps_decoder_t *decoder = NULL;
|
|
char *text = NULL;
|
|
FILE *fh = NULL;
|
|
|
|
if (ninputs < 2) {
|
|
E_ERROR("Usage: pocketsphinx align INFILE TEXT...\n");
|
|
return -1;
|
|
}
|
|
/* Please do not use bestpath for alignment. */
|
|
ps_config_set_bool(config, "bestpath", FALSE);
|
|
ps_config_set_str(config, "lm", NULL);
|
|
if (0 == strcmp(inputs[0], "-")) {
|
|
is_stdin = TRUE;
|
|
fh = stdin;
|
|
}
|
|
else if ((fh = fopen(inputs[0], "rb")) == NULL) {
|
|
E_ERROR_SYSTEM("Failed to open %s for input", inputs[0]);
|
|
goto error_out;
|
|
}
|
|
if ((rv = ps_config_soundfile(config, fh, inputs[0])) < 0)
|
|
goto error_out;
|
|
if ((decoder = ps_init(config)) == NULL) {
|
|
E_FATAL("PocketSphinx decoder init failed\n");
|
|
rv = -1;
|
|
goto error_out;
|
|
}
|
|
text = string_array_join(inputs + 1, ninputs - 1);
|
|
if ((rv = ps_set_align_text(decoder, text)) < 0)
|
|
goto error_out;
|
|
rv = decode_single(decoder, fh);
|
|
/* Fall through intentionally. */
|
|
error_out:
|
|
if (fh && !is_stdin)
|
|
fclose(fh);
|
|
if (text)
|
|
ckd_free(text);
|
|
if (decoder)
|
|
ps_free(decoder);
|
|
return rv;
|
|
}
|
|
|
|
#if 0
/* Candidate sampling rates, in ascending order. */
static int sample_rates[] = {
    8000, 11025, 16000, 22050, 32000, 44100, 48000
};
static const int n_sample_rates = sizeof(sample_rates)/sizeof(sample_rates[0]);

/**
 * Pick the first entry of sample_rates that is at least twice the
 * configured "upperf" value.
 *
 * @param config Configuration from which "upperf" is read.
 * @return The selected sampling rate.
 */
static int
minimum_samprate(ps_config_t *config)
{
    double upperf = ps_config_float(config, "upperf");
    int nyquist = (int)(upperf * 2);
    int i = 0;

    /* Skip every rate that is below twice the upper frequency. */
    while (i < n_sample_rates && sample_rates[i] < nyquist)
        ++i;
    if (i == n_sample_rates)
        E_FATAL("Unable to find sampling rate for -upperf %f\n", upperf);
    return sample_rates[i];
}
#endif
|
|
|
|
#define SOX_FORMAT "-r %d -c 1 -b 16 -e signed-integer -t raw -"
|
|
static int
|
|
soxflags(ps_config_t *config)
|
|
{
|
|
int maxlen, len;
|
|
int samprate;
|
|
char *args;
|
|
|
|
/* Get feature extraction parameters. */
|
|
ps_expand_model_config(config);
|
|
samprate = ps_config_int(config, "samprate");
|
|
|
|
maxlen = snprintf(NULL, 0, SOX_FORMAT, samprate);
|
|
if (maxlen < 0) {
|
|
E_ERROR_SYSTEM("Failed to snprintf()");
|
|
return -1;
|
|
}
|
|
maxlen++;
|
|
args = ckd_calloc(maxlen, 1);
|
|
len = snprintf(args, maxlen, SOX_FORMAT, samprate);
|
|
if (len != maxlen - 1) {
|
|
E_ERROR_SYSTEM("Failed to snprintf()");
|
|
return -1;
|
|
}
|
|
puts(args);
|
|
fflush(stdout);
|
|
ckd_free(args);
|
|
return 0;
|
|
}
|
|
|
|
/**
 * Locate the sub-command on the command line and remove it from argv.
 *
 * Only every second slot starting at index 1 is examined, i.e. the
 * scan treats whatever precedes the command as two-slot pairs.  The
 * first non-empty entry found there that does not start with '-' is
 * taken as the command.
 *
 * @param argc In/out argument count; decremented when a command is
 *             removed.
 * @param argv Argument vector; later entries are shifted down over
 *             the removed command.
 * @return The command, or "live" when none is found.
 */
static char *
find_command(int *argc, char **argv)
{
    int pos = 1;

    while (pos < *argc) {
        char *candidate = argv[pos];

        if (candidate != NULL && candidate[0] != '\0'
            && candidate[0] != '-') {
            int n_after = *argc - pos - 1;

            /* Close the gap left by the command. */
            memmove(argv + pos, argv + pos + 1,
                    n_after * sizeof(argv[pos]));
            *argc -= 1;
            return candidate;
        }
        pos += 2;
    }
    return "live";
}
|
|
|
|
/**
 * Move every input argument to the end of argv, preserving their
 * relative order, and exclude them from argc.
 *
 * An input is a non-empty argument that either does not start with
 * '-' or is exactly "-".  Anything else is skipped together with the
 * slot that follows it.
 *
 * @param argc In/out argument count; reduced by the number of inputs.
 * @param argv Argument vector, reordered in place.
 * @param ninputs Output, number of inputs found.
 * @return Pointer to the first input inside argv, or NULL if there
 *         are none.
 */
static char **
find_inputs(int *argc, char **argv, int *ninputs)
{
    char **inputs = NULL;
    int pos = 1;

    *ninputs = 0;
    while (pos < *argc) {
        char *arg = argv[pos];
        int is_option = (arg == NULL || arg[0] == '\0'
                         || (arg[0] == '-' && arg[1] != '\0'));

        if (is_option) {
            pos += 2;
            continue;
        }
        /* Pull the input out and park it just past the shrunken
         * argument list; successive inputs therefore pile up in
         * reverse order. */
        memmove(argv + pos, argv + pos + 1,
                (*argc - pos - 1) * sizeof(argv[pos]));
        *argc -= 1;
        argv[*argc] = arg;
        inputs = argv + *argc;
        *ninputs += 1;
    }

    /* Restore the original order by reversing in place. */
    if (*ninputs > 1) {
        char **lo = inputs;
        char **hi = inputs + *ninputs - 1;

        for (; lo < hi; ++lo, --hi) {
            char *swap = *lo;
            *lo = *hi;
            *hi = swap;
        }
    }
    return inputs;
}
|
|
|
|
int
|
|
process_inputs(int (*func)(ps_config_t *, FILE *),
|
|
ps_config_t *config,
|
|
char **inputs, int ninputs)
|
|
{
|
|
int rv = 0;
|
|
|
|
if (ninputs == 0)
|
|
return func(config, stdin);
|
|
else {
|
|
int i, rv_one;
|
|
for (i = 0; i < ninputs; ++i) {
|
|
char *file = inputs[i];
|
|
int is_stdin = FALSE;
|
|
FILE *fh;
|
|
|
|
if (0 == strcmp(file, "-")) {
|
|
is_stdin = TRUE;
|
|
fh = stdin;
|
|
}
|
|
else if ((fh = fopen(file, "rb")) == NULL) {
|
|
E_ERROR_SYSTEM("Failed to open %s for input", file);
|
|
rv = -1;
|
|
continue;
|
|
}
|
|
if ((rv_one = ps_config_soundfile(config, fh, file)) < 0) {
|
|
fclose(fh);
|
|
rv = rv_one;
|
|
continue;
|
|
}
|
|
if ((rv_one = func(config, fh)) < 0) {
|
|
rv = rv_one;
|
|
E_ERROR("Recognition failed on %s\n", file);
|
|
}
|
|
if (!is_stdin)
|
|
fclose(fh);
|
|
}
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
static int
|
|
print_config(ps_config_t *config)
|
|
{
|
|
if (puts(ps_config_serialize_json(config)) < 0)
|
|
return -1;
|
|
fflush(stdout);
|
|
return 0;
|
|
}
|
|
|
|
void
|
|
usage(char *name, int help_config)
|
|
{
|
|
fprintf(stderr, "Usage: %s [PARAMS] [soxflags | config | help | help-config | live | single | align] INPUTS...\n", name);
|
|
fprintf(stderr, "Examples:\n");
|
|
fprintf(stderr, "\tsox input.mp3 $(%s soxflags) | %s single -\n", name, name);
|
|
fprintf(stderr, "\tsox -qd $(%s soxflags) | %s live -\n", name, name);
|
|
fprintf(stderr, "\t%s single INPUT\n", name);
|
|
fprintf(stderr, "\t%s align INPUT WORDS...\n", name);
|
|
fprintf(stderr, "\nOutput format:\n");
|
|
fprintf(stderr, " JSON with the following fields:\n");
|
|
fprintf(stderr, " b Begin time in seconds\n");
|
|
fprintf(stderr, " d Duration in seconds\n");
|
|
fprintf(stderr, " p Probability (acoustic model score)\n");
|
|
fprintf(stderr, " t Text of utterance or segment\n");
|
|
fprintf(stderr, " w Array of word segments\n");
|
|
fprintf(stderr, "\nFor detailed PARAMS values, run %s help-config\n", name);
|
|
if (help_config) {
|
|
err_set_loglevel(ERR_INFO);
|
|
cmd_ln_log_help_r(NULL, ps_args());
|
|
}
|
|
}
|
|
|
|
/**
 * Print the help text for the "align" command on stderr.
 *
 * @param name Program name substituted into the usage line and the
 *             examples.
 */
void
usage_align(char *name)
{
    FILE *out = stderr;

    fprintf(out, "Usage: %s [PARAMS] align INPUT WORDS...\n", name);

    /* Fixed text: arguments, options and the shape of the output. */
    fputs("\nForce-align audio to a word sequence.\n"
          "\nArguments:\n"
          " INPUT Audio file to align (or '-' for stdin)\n"
          " WORDS... Words to align to (will be concatenated)\n"
          "\nAlignment-specific options:\n"
          " -phone_align yes/no Run a second pass to align phones and print their durations\n"
          " (default: no)\n"
          " -state_align yes/no Run a second pass to align phones and states and print their\n"
          " durations. This implies -phone_align yes (default: no)\n"
          "\nBy default, output contains words only. With -phone_align, each\n"
          "word in \"w\" contains a nested \"w\" array of phones. With\n"
          "-state_align, each phone also contains a nested \"w\" of HMM states.\n"
          "\nExamples:\n", out);

    /* Examples: a heading followed by a command line using name. */
    fprintf(out, " # Basic word alignment:\n"
                 " %s align audio.wav \"hello world\"\n", name);
    fprintf(out, "\n # Phone-level alignment:\n"
                 " %s -phone_align yes align audio.wav \"hello world\"\n", name);
    fprintf(out, "\n # State-level alignment:\n"
                 " %s -state_align yes align audio.wav \"hello world\"\n", name);
    fprintf(out, "\n # Extract word timings with jq:\n"
                 " %s align audio.wav \"hello world\" | jq '.w[]|[.t,.b]'\n", name);
    fprintf(out, "\n # Extract phone timings with jq:\n"
                 " %s -phone_align yes align audio.wav \"hello world\" | jq '.w[]|.w[]|[.t,.d]'\n", name);

    fprintf(out, "\nFor all available parameters, run %s help-config\n", name);
}
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
ps_config_t *config;
|
|
const char *conffile;
|
|
char *command;
|
|
char **inputs;
|
|
int rv, ninputs;
|
|
|
|
command = find_command(&argc, argv);
|
|
inputs = find_inputs(&argc, argv, &ninputs);
|
|
/* Only soxflags, config, help-config, and help take no or optional arguments */
|
|
if (ninputs == 0) {
|
|
if ((0 != strcmp(command, "soxflags"))
|
|
&& 0 != strcmp(command, "config")
|
|
&& 0 != strcmp(command, "help-config")
|
|
&& 0 != strcmp(command, "help")) {
|
|
usage(argv[0], FALSE);
|
|
return 1;
|
|
}
|
|
}
|
|
/* If arg parsing fails */
|
|
if ((config = ps_config_parse_args(ps_main_args_def, argc, argv)) == NULL) {
|
|
usage(argv[0], FALSE);
|
|
return 1;
|
|
}
|
|
ps_default_search_args(config);
|
|
if (ps_config_bool(config, "state_align"))
|
|
ps_config_set_bool(config, "phone_align", TRUE);
|
|
if ((conffile = ps_config_str(config, "config")) != NULL) {
|
|
char *json;
|
|
FILE *infh;
|
|
size_t len;
|
|
if ((infh = fopen(conffile, "rt")) == NULL) {
|
|
E_ERROR_SYSTEM("Failed to open config file %s", conffile);
|
|
return 1;
|
|
}
|
|
fseek(infh, 0, SEEK_END);
|
|
len = (size_t)ftell(infh);
|
|
fseek(infh, 0, SEEK_SET);
|
|
json = ckd_malloc(len + 1);
|
|
if (fread(json, 1, len, infh) != len) {
|
|
E_ERROR_SYSTEM("Failed to read config file %s", conffile);
|
|
ckd_free(json);
|
|
fclose(infh);
|
|
return 1;
|
|
}
|
|
json[len] = '\0';
|
|
fclose(infh);
|
|
config = ps_config_parse_json(config, json);
|
|
ckd_free(json);
|
|
if (config == NULL)
|
|
return 1;
|
|
ps_config_set_str(config, "config", NULL);
|
|
}
|
|
if (0 == strcmp(command, "soxflags"))
|
|
rv = soxflags(config);
|
|
else if (0 == strcmp(command, "config"))
|
|
rv = print_config(config);
|
|
else if (0 == strcmp(command, "live"))
|
|
rv = process_inputs(live, config, inputs, ninputs);
|
|
else if (0 == strcmp(command, "single"))
|
|
rv = process_inputs(single, config, inputs, ninputs);
|
|
else if (0 == strcmp(command, "align"))
|
|
rv = align(config, inputs, ninputs);
|
|
else if (0 == strcmp(command, "help")) {
|
|
rv = 0;
|
|
/* Check if a specific command help was requested */
|
|
if (ninputs > 0) {
|
|
if (0 == strcmp(inputs[0], "align")) {
|
|
usage_align(argv[0]);
|
|
}
|
|
else {
|
|
fprintf(stderr, "No specific help available for command '%s'\n", inputs[0]);
|
|
usage(argv[0], FALSE);
|
|
}
|
|
}
|
|
else {
|
|
usage(argv[0], FALSE);
|
|
}
|
|
}
|
|
else if (0 == strcmp(command, "help-config")) {
|
|
rv = 0;
|
|
usage(argv[0], TRUE);
|
|
}
|
|
else {
|
|
E_ERROR("Unknown command \"%s\"\n", command);
|
|
return 1;
|
|
}
|
|
|
|
ps_config_free(config);
|
|
return rv;
|
|
}
|