# Mirror of https://github.com/cmusphinx/sphinxtrain.git
# (synced 2026-05-17 13:10:52 +00:00; 161 lines, 4.5 KiB, Python)
# Copyright (c) 2007 Carnegie Mellon University
#
# You may copy and modify this freely under the same terms as
# Sphinx-III

"""Corpus classes for acoustic model training.

This module provides classes for representing a corpus of utterances
for acoustic modeling.  The Corpus class implements the iterator
protocol, acting as a list of Utterance objects.
"""
|
|
# Module metadata.  The version string looks like an unexpanded
# version-control keyword placeholder -- TODO confirm it is still filled in
# by any tooling.
__author__ = "David Huggins-Daines <dhdaines@gmail.com>"
__version__ = "$Revision$"
|
|
|
|
import os
|
|
|
|
|
|
class Resource(object):
    """Base class for anything attached to an utterance in a speech corpus.

    An utterance may carry an arbitrary collection of resources, for
    example waveforms, acoustic feature files, transcriptions, or other
    kinds of supervision.  This class adds no attributes or methods of
    its own.
    """
|
|
|
|
|
|
class FileResourceIterator(object):
    """
    Iterator over items in a FileResource.

    Each step takes one entry from the resource's control file, builds
    the path os.path.join(base_dir, fileid + file_ext), and returns
    either that path or, when the resource has a data_type, the result
    of calling data_type(path).
    """
    def __init__(self, resource):
        """
        Start iterating over a file-based resource.

        @param resource: Object providing the ctl_file, base_dir,
                         file_ext and data_type attributes
        """
        self.res = resource
        self.ctl = iter(resource.ctl_file)

    def __iter__(self):
        """
        Return this object, as the iterator protocol requires.

        Without this method the iterator cannot be passed to iter(),
        list() or used directly in a for loop.
        """
        return self

    def __next__(self):
        """
        Return the item for the next control file entry.

        @return: data_type(path) if the resource has a data_type,
                 otherwise the path string
        @raise StopIteration: when the control file is exhausted
        """
        # This will raise StopIteration for us at EOF
        entry = next(self.ctl)
        # Entries are either CtlEntry objects or plain file ID strings.
        fileid = entry.fileid if isinstance(entry, CtlEntry) else entry
        path = os.path.join(self.res.base_dir, fileid + self.res.file_ext)
        if self.res.data_type:
            return self.res.data_type(path)
        return path
|
|
|
|
|
|
class FileResource(Resource):
    """
    Corpus resource whose items are derived from control file entries
    by adding a base directory and a filename extension.
    """
    def __init__(self, ctl_file, base_dir, file_ext, data_type=None):
        """
        Set up a resource backed by individual files.

        @param ctl_file: Control file resource supplying the entries
        @ptype ctl_file: iterator(CtlEntry)
        @param base_dir: Directory placed in front of each control entry
        @param file_ext: Extension placed after each control entry
        @param data_type: Class to build from each resulting path; if
                          omitted, iteration returns the path itself.
        @ptype data_type: type
        """
        self.ctl_file, self.base_dir = ctl_file, base_dir
        self.file_ext, self.data_type = file_ext, data_type

    def __iter__(self):
        """Return a new FileResourceIterator over this resource."""
        return FileResourceIterator(self)
|
|
|
|
|
|
class CtlEntry(object):
    """Entry in a control file

    A line with exactly four whitespace-separated fields is read as
    fileid, sf, ef and uttid, with sf and ef converted to integers
    (presumably start and end frames).  Any other line is used whole
    as both fileid and uttid, with sf set to 0 and ef set to -1.
    """
    def __init__(self, str):
        """
        Parse one control file line.

        @param str: Text of the control file line
        @ptype str: string
        @raise ValueError: if a four-field line has a non-integer sf or ef
        """
        parts = str.split()
        if len(parts) != 4:
            # Not the four-field form: the whole line names the utterance.
            self.fileid = self.uttid = str
            self.sf, self.ef = 0, -1
            return
        fileid, first, last, uttid = parts
        self.fileid = fileid
        self.sf = int(first)
        self.ef = int(last)
        self.uttid = uttid
|
|
|
|
|
|
class ListResourceIterator(object):
    """
    Iterator over items in a ListResource.

    Reads the resource's file one line at a time.  Each line has its
    trailing whitespace removed and is returned either as a string or,
    when the resource has a data_type, as data_type(line).  The file is
    closed as soon as the end is reached.
    """
    def __init__(self, resource):
        """
        Open the file behind a listing-based resource.

        @param resource: Object providing the file_name and data_type
                         attributes
        """
        # Define fh before anything can fail, so that __del__ is safe on
        # an instance whose open() call raised.
        self.fh = None
        self.data_type = resource.data_type
        self.fh = open(resource.file_name)

    def __iter__(self):
        """Return this object, as the iterator protocol requires."""
        return self

    def close(self):
        """Close the underlying file if it is still open."""
        fh, self.fh = self.fh, None
        if fh is not None:
            fh.close()

    def __del__(self):
        # getattr guards against instances that never finished __init__.
        if getattr(self, "fh", None) is not None:
            self.fh.close()
            self.fh = None

    def __next__(self):
        """
        Return the next item in the file.

        @return: data_type(line) if a data_type was given, otherwise the
                 line as a string, with trailing whitespace removed
        @raise StopIteration: at end of file, or once the file is closed
        """
        if self.fh is None:
            raise StopIteration
        line = self.fh.readline()
        # readline() returns "" only at EOF; a blank line is "\n".
        if line == "":
            self.close()
            raise StopIteration
        item = line.rstrip()
        if self.data_type:
            return self.data_type(item)
        return item
|
|
|
|
|
|
class ListResource(Resource):
    """
    Corpus resource stored as the lines of a text file, each line being
    one item of some data type.  Control files and transcript files are
    examples.
    """
    def __init__(self, file_name, data_type=None):
        """
        Set up a resource backed by a line-oriented text file.

        When data_type is left out, iteration returns every item as a
        plain string.

        @param file_name: Path of the file holding the items
        @ptype file_name: string
        @param data_type: Class used to build each item from its line
        @ptype data_type: type
        """
        self.data_type, self.file_name = data_type, file_name

    def __iter__(self):
        """Return a new ListResourceIterator over this resource."""
        return ListResourceIterator(self)
|
|
|
|
|
|
class CorpusIterator(object):
    """
    Iterator over elements in a Corpus.

    Every step advances all of the corpus resources together and
    returns a dictionary mapping each resource name to that resource's
    next item.  Iteration ends as soon as any resource is exhausted.
    """
    def __init__(self, corpus, part=1, npart=1):
        """
        Start iterating over a corpus.

        @param corpus: Object whose resources attribute maps resource
                       names to iterable resources
        @param part: Which part of the corpus to iterate over (not yet
                     supported; only used when npart > 1)
        @param npart: Number of parts the corpus is split into
        @raise NotImplementedError: if npart > 1, since partitioned
               iteration is not implemented
        """
        if npart > 1:
            # Previously this case left the iterator with no resources,
            # so iteration produced empty dictionaries forever.
            raise NotImplementedError(
                "partitioned corpus iteration is not implemented "
                "(part=%r, npart=%r)" % (part, npart))
        self.corpus = corpus
        self.iters = {}
        for name, res in corpus.resources.items():
            self.iters[name] = iter(res)

    def __iter__(self):
        """Return this object, as the iterator protocol requires."""
        return self

    def __next__(self):
        """
        Return the next utterance.

        @return: Dictionary of resource name to that resource's next item
        @raise StopIteration: when any resource runs out, or when the
               corpus has no resources at all
        """
        # With nothing to draw from there is no utterance to return;
        # stop rather than yield empty dictionaries without end.
        if not self.iters:
            raise StopIteration
        utt = {}
        for name, it in self.iters.items():
            # StopIteration from an exhausted resource ends the iteration.
            utt[name] = next(it)
        return utt
|
|
|
|
|
|
class Corpus(object):
    """Corpus of speech data.

    Holds a dictionary of named resources.  It always contains the
    control file under the name 'ctl'; further resources can be
    registered with add_resource().  Iterating yields one dictionary of
    resource items per step.
    """
    def __init__(self, ctl_file):
        """
        Create a corpus from a control file.

        @param ctl_file: Name of the control file, read as a list of
                         CtlEntry items
        @ptype ctl_file: string
        """
        control = ListResource(ctl_file, CtlEntry)
        self.ctl = control
        self.resources = {'ctl': control}

    def __iter__(self):
        """Return a new CorpusIterator over this corpus."""
        return CorpusIterator(self)

    def add_resource(self, name, res):
        """
        Register a resource, replacing any existing one with that name.

        @param name: Key under which the resource's items appear
        @param res: Iterable resource to attach
        """
        self.resources[name] = res
|