pocketsphinx/programs/pocketsphinx_lm_eval.c

/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 2008 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/**
 * \file sphinx_lm_eval.c
 * Language model evaluation tool.
 */
#include <pocketsphinx.h>

#include "lm/ngram_model.h"
#include "util/ckd_alloc.h"
#include "util/cmd_ln.h"
#include "util/ckd_alloc.h"
#include "util/pio.h"
#include "util/strfuncs.h"
#include "pocketsphinx_internal.h"

#include <stdio.h>
#include <string.h>
#include <math.h>

static const ps_arg_t defn[] = {
  { "help",
    ARG_BOOLEAN,
    "no",
    "Shows the usage of the tool"},

  { "logbase",
    ARG_FLOATING,
    "1.0001",
    "Base in which all log-likelihoods calculated" },

  { "lm",
    ARG_STRING,
    NULL,
    "Language model file"},

  { "probdef",
    ARG_STRING,
    NULL,
    "Probability definition file for classes in LM"},

  { "lmctlfn",
    ARG_STRING,
    NULL,
    "Control file listing a set of language models"},

  { "lmname",
    ARG_STRING,
    NULL,
    "Name of language model in -lmctlfn to use for all utterances" },

  { "lsn",
    ARG_STRING,
    NULL,
    "Transcription file to evaluate"},

  { "text",
    ARG_STRING,
    NULL,
    "Text string to evaluate"},

  { "mmap",
    ARG_BOOLEAN,
    "no",
    "Use memory-mapped I/O for reading binary LM files"},

  { "lw",
    ARG_FLOATING,
    "1.0",
    "Language model weight" },

  { "wip",
    ARG_FLOATING,
    "1.0",
    "Word insertion probability" },

  { "verbose",
    ARG_BOOLEAN,
    "no",
    "Print details of perplexity calculation" },

  /* FIXME: Support -lmstartsym, -lmendsym, -lmctlfn, -ctl_lm */
  { NULL, 0, NULL, NULL }
};

static int verbose;

static int
calc_entropy(ngram_model_t *lm, char **words, int32 n,
	     int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
{
	int32 *wids;
	int32 startwid;
	int32 i, ch, nccs, noovs, unk;

        if (n == 0)
            return 0;

        unk = ngram_unknown_wid(lm);

	/* Reverse this array into an array of word IDs. */
	wids = ckd_calloc(n, sizeof(*wids));
	for (i = 0; i < n; ++i)
		wids[n-i-1] = ngram_wid(lm, words[i]);
	/* Skip <s> as it's a context cue (HACK, this should be configurable). */
	startwid = ngram_wid(lm, "<s>");

	/* Now evaluate the list of words in reverse using the
	 * remainder of the array as the history. */
	ch = noovs = nccs = 0;
	for (i = 0; i < n; ++i) {
		int32 n_used;
		int32 prob;

		/* Skip <s> as it's a context cue (HACK, this should be configurable). */
		if (wids[i] == startwid) {
			++nccs;
			continue;
		}
		/* Skip and count OOVs. */
		if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
			++noovs;
			continue;
		}
		/* Sum up information for each N-gram */
		prob = ngram_ng_score(lm,
				      wids[i], wids + i + 1,
				      n - i - 1, &n_used);
                if (verbose) {
                    int m;
                    printf("log P(%s|", ngram_word(lm, wids[i]));
                    m = i + ngram_model_get_size(lm) - 1;
                    if (m >= n)
                        m = n - 1;
                    while (m > i) {
                        printf("%s ", ngram_word(lm, wids[m--]));
                    }
                    printf(") = %d\n", prob);
                }
		ch -= prob;
	}

	if (out_n_ccs) *out_n_ccs = nccs;
	if (out_n_oovs) *out_n_oovs = noovs;

	/* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
        n -= (nccs + noovs);
        if (n <= 0)
            return 0;
        if (out_lm_score)
            *out_lm_score = -ch;
	return ch / n;
}

static void
evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
{
	FILE *fh;
        lineiter_t *litor;
	int32 nccs, noovs, nwords, lscr;
	float64 ch, log_to_log2;;

	if ((fh = fopen(lsnfn, "r")) == NULL)
		E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);

	/* We have to keep ch in floating-point to avoid overflows, so
	 * we might as well use log2. */
	log_to_log2 = log(logmath_get_base(lmath)) / log(2);
	lscr = nccs = noovs = nwords = 0;
	ch = 0.0;
        for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
		char **words;
		int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr;

		n = str2words(litor->buf, NULL, 0);
		if (n < 0)
			E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
		if (n == 0) /* Do nothing! */
			continue;
		words = ckd_calloc(n, sizeof(*words));
		str2words(litor->buf, words, n);

		/* Remove any utterance ID (FIXME: has to be a single "word") */
		if (words[n-1][0] == '('
		    && words[n-1][strlen(words[n-1])-1] == ')')
			n = n - 1;

		tmp_ch = calc_entropy(lm, words, n, &tmp_nccs,
                                      &tmp_noovs, &tmp_lscr);

		ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
		nccs += tmp_nccs;
		noovs += tmp_noovs;
                lscr += tmp_lscr;
		nwords += n;

		ckd_free(words);
	}

	ch /= (nwords - nccs - noovs);
	printf("cross-entropy: %f bits\n", ch);

	/* Calculate perplexity pplx = exp CH */
	printf("perplexity: %f\n", pow(2.0, ch));
        printf("lm score: %d\n", lscr);

	/* Report OOVs and CCs */
	printf("%d words evaluated\n", nwords);
	printf("%d OOVs (%.2f%%), %d context cues removed\n",
	       noovs, (double)noovs / nwords * 100, nccs);
}

static void
evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
{
	char *textfoo;
	char **words;
	int32 n, ch, noovs, nccs, lscr;

	/* Split it into an array of strings. */
	textfoo = ckd_salloc(text);
	n = str2words(textfoo, NULL, 0);
	if (n < 0)
		E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
	if (n == 0) /* Do nothing! */
		return;
	words = ckd_calloc(n, sizeof(*words));
	str2words(textfoo, words, n);

	ch = calc_entropy(lm, words, n, &nccs, &noovs, &lscr);

	printf("input: %s\n", text);
	printf("cross-entropy: %f bits\n",
	       ch * log(logmath_get_base(lmath)) / log(2));

	/* Calculate perplexity pplx = exp CH */
	printf("perplexity: %f\n", logmath_exp(lmath, ch));
        printf("lm score: %d\n", lscr);

	/* Report OOVs and CCs */
	printf("%d words evaluated\n", n);
	printf("%d OOVs, %d context cues removed\n",
	      noovs, nccs);

	ckd_free(textfoo);
	ckd_free(words);
}

int
main(int argc, char *argv[])
{
	cmd_ln_t *config;
	ngram_model_t *lm = NULL;
	logmath_t *lmath;
	const char *lmfn, *probdefn, *lsnfn, *text;

	if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL) {
            /* This probably just means that we got no arguments. */
            err_set_loglevel(ERR_INFO);
            cmd_ln_log_help_r(NULL, defn);
            return 1;
        }

        verbose = ps_config_bool(config, "verbose");

	/* Create log math object. */
	if ((lmath = logmath_init
	     (ps_config_float(config, "logbase"), 0, 0)) == NULL) {
		E_FATAL("Failed to initialize log math\n");
	}

	/* Load the language model. */
	lmfn = ps_config_str(config, "lm");
	if (lmfn == NULL
	    || (lm = ngram_model_read(config, lmfn,
				      NGRAM_AUTO, lmath)) == NULL) {
		E_FATAL("Failed to load language model from %s\n",
			ps_config_str(config, "lm"));
	}
        if ((probdefn = ps_config_str(config, "probdef")) != NULL)
            ngram_model_read_classdef(lm, probdefn);
        ngram_model_apply_weights(lm,
                                  ps_config_float(config, "lw"),
                                  ps_config_float(config, "wip"));

	/* Now evaluate some text. */
	lsnfn = ps_config_str(config, "lsn");
	text = ps_config_str(config, "text");
	if (lsnfn) {
		evaluate_file(lm, lmath, lsnfn);
	}
	else if (text) {
		evaluate_string(lm, lmath, text);
	}

	return 0;
}