Major refactor to clean things up. The MFCC appears to be working.

This commit is contained in:
jjung
2015-07-10 13:59:16 -05:00
parent e92209c5d6
commit fedf4f97bc
13 changed files with 228 additions and 379 deletions
+3 -3
View File
@@ -1,5 +1,5 @@
# node-mfcc
Node.JS implementation of the MFCC algorithm.
Node.JS implementation of the MFCC (Mel Frequency Cepstral Coefficients) algorithm.
# Introduction
@@ -7,7 +7,7 @@ Code in this project was made by following the tutorial here:
[http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/](http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/)
The proper process by which to compute the MFCC is:
To compute the MFCC:
1. Frame samples into `2^N` sized buffers where `N` is an integer.
2. Pass frames into the Fast Fourier Transform to produce `F` frequency bins.
@@ -24,7 +24,7 @@ The reason the term 'Cepstral' is used is that it is a play on spectral. In ordi
time-domain data. However, in step (6) above we are performing a discrete cosine transform on information that is already in the
frequency domain. As a result, the pseudo-spectral term cepstral was invented.
The reason for the discrete cosine transformation step is to remove any codependency from the Mel-Scale values after applying the triangular filters.
The reason for the discrete cosine transformation step is to both compress the mel-bands and to autocorrelate them.
# Example
View File
View File
View File
-1
View File
@@ -1 +0,0 @@
5.904757629127576e-7,0.0000013999298934710302,0.0000013999298934710302,0.000004265992436375758,0.000004265992436375758,0.000016099656669393937,0.000016099656669393937,0.00012208873831350304,1.0729054448683637,1.0729054448683637,5.671305168245454,1.0728470246934547
1 5.904757629127576e-7 0.0000013999298934710302 0.0000013999298934710302 0.000004265992436375758 0.000004265992436375758 0.000016099656669393937 0.000016099656669393937 0.00012208873831350304 1.0729054448683637 1.0729054448683637 5.671305168245454 1.0728470246934547
-4
View File
@@ -1,4 +0,0 @@
[ 1,
0,
1,
0 ]
+1 -5
View File
@@ -1,5 +1 @@
module.exports = {
Framer: require('./src/sampleFramer'),
mfcc: require('./src/mfcc'),
windows: require('./src/sampleWindows')
};
module.exports = require('./src/mfcc');
+87 -177
View File
@@ -7,207 +7,117 @@
* human speech analysis.
\*===========================================================================*/
var program = require('commander'),
Stats = require('fast-stats').Stats,
fs = require('fs'),
path = require('path'),
fjs = require('frequencyjs'),
wav = require('wav'),
Framer = require('./').Framer,
windows = require('./').windows,
mfcc = require('./').mfcc,
dct = require('dct');
var u2pcm = require('./u2pcm');
for (var k in u2pcm)
{
u2pcm[k] = u2pcm[k] / 32767;
}
fs = require('fs'),
path = require('path'),
wav = require('wav'),
fft = require('fft-js'),
Framer = require('signal-windows').framer,
ham = undefined,
mfcc = require('./');
program.version('0.1')
.usage('[options]')
.option('-v, --verbose', 'Supply to turn on verbose output. Default 0.', 0)
.option('-w, --wav [wav]', '[INPUT]: Supply a wave file to process.', undefined)
.option('-f, --fft [json]', '[INPUT]: Supply a JSON file of FFT bins to process (primarily for testing).', undefined)
.option('-d, --dct [json]', '[INPUT]: Supply a JSON file of inputs for the DCT stage to process (primarily for testing).', undefined)
.option('-o, --output [csv]', 'Output CSV (will append).', undefined);
.option('-v, --verbose', 'True for verbose output.', 0)
.option('-d, --debug [type]', '0: none, 1: Output power spectrum, post-filterbank values, and Mel coefficients. 2: output filter banks used. 3: output frequency magnitudes. Default 0.', 0)
.option('-w, --wav [wav]', 'Wave file path to process.', undefined)
.option('-m, --minFrequency [number]', 'Low frequency cutoff for MFCC. Default 300', 300)
.option('-x, --maxFrequency [number]', 'High frequency cutoff for MFCC. Default 3500', 3500)
.option('-f, --numMelSpecFilters [number]', 'Number of mel spec filter banks to use. Default is 26.', 26)
.option('-n, --samplesPerFrame [number]', 'Number of samples per frame to pass into the FFT. Default 128.', 128)
.option('-s, --samplesPerStep [number]', 'Number of samples to step between each frame. Default is samplesPerFrame.', 128);
program.parse(process.argv);
if (program.wav === undefined && program.fft === undefined && program.dct === undefined)
if (program.wav === undefined)
{
console.log('Must choose an input type from the program options.');
program.outputHelp();
process.exit(1);
console.log('Please provide a wave file to process.');
program.outputHelp();
process.exit(1);
}
if ((program.wav && program.fft) || (program.wav && program.dct) || (program.fft && program.dct))
{
console.log('Please provide a .wav file, a .json FFT, or a .json DCT file but not more than one!');
process.exit(1);
}
program.minFrequency = parseInt(program.minFrequency);
program.maxFrequency = parseInt(program.maxFrequency);
program.numMelSpecFilters = parseInt(program.numMelSpecFilters);
program.samplesPerFrame = parseInt(program.samplesPerFrame);
program.samplesPerStep = parseInt(program.samplesPerStep);
var freqAssigned = false;
sampleRate = 8000,
minFreq = 300,
maxFreq = 3500,
nMelSpecFilters = 26,
framerSamples = 64,
framerStep = 64,
fftBins = framerSamples / 2,
framer = undefined,
bins = [],
binsStats = [],
binsToFreq = [],
win = windows.construct('ham', framerSamples, 0.71);
for (var i = 0; i < fftBins; i++) bins[i] = [];
/*-----------------------------------------------------------------------------------*\
* FFT JSON file
\*-----------------------------------------------------------------------------------*/
if (program.fft)
{
// Assumes the file contains an array of FFT bins
var amplitudes = parseFFTAmplitudes(program.fft);
// This filterbank, periodogram and melspec generation are an important part of the MFCC as a whole,
// but not necessary in the testing of the DCT on its own, using the -d option.
var filterBank = mfcc.constructFilterBank(amplitudes.length, nMelSpecFilters, minFreq, maxFreq, sampleRate);
var freqPowers = mfcc.periodogram(amplitudes),
melSpec = filterBank(freqPowers);
var melCoefficients = dct(melSpec);
var columns = melCoefficients.map(function (mc, ix) {
return 'mfcc' + (ix+1);
});
if (program.output)
output(columns, [melCoefficients]);
else
console.log(melSpec);
process.exit(1);
}
/*-----------------------------------------------------------------------------------*\
* DCT JSON file
\*-----------------------------------------------------------------------------------*/
if (program.dct)
{
var spectrum = parseFFTAmplitudes(program.dct);
var dctCoefficients = dct(spectrum);
var columns = dctCoefficients.map(function(mc, ix) {
return 'dct' + (ix + 1);
});
if (program.output) {
output(columns, [dctCoefficients]);
} else {
console.log(dctCoefficients);
}
}
if (program.samplesPerFrame & (program.samplesPerFrame-1) !== 0)
throw Error('Please provide a samplesPerFrame that is a power of 2 (e.g. 32, 64, 128, 256, etc.). Was: ' + program.samplesPerFrame);
var mfcc, // We construct after loading the wav file and reading the header.
framer, // Framer is also constructed after loading the wav file
sampleRate;
/*-----------------------------------------------------------------------------------*\
* .wav file
\*-----------------------------------------------------------------------------------*/
if (program.wav)
{
var wr = new wav.Reader(),
filterBank = mfcc.constructFilterBank(fftBins, nMelSpecFilters, minFreq, maxFreq, sampleRate);
wr.on('data', function (buffer, offset, length) {
framer.frame(buffer, function (frame, fIx) {
var spectrum = fjs.toSpectrum(frame, {
sampling: sampleRate,
method: 'fft'
}),
amplitudes = [];
var wr = new wav.Reader();
wr.on('data', function (buffer, offset, length) {
framer.frame(buffer, function (frame, fIx) {
if (frame.length != program.samplesPerFrame) return;
spectrum.forEach(function (spectra, ix) {
if (!freqAssigned) binsToFreq[ix] = spectra.frequency;
bins[ix].push(spectra.amplitude);
amplitudes.push(spectra.amplitude);
});
var phasors = fft.fft(frame),
phasorMagnitudes = fft.util.fftMag(phasors),
result = mfcc(phasorMagnitudes, program.debug && true);
console.log(spectrum);
//var freqPowers = mfcc.periodogram(amplitudes),
// melSpec = filterBank(freqPowers);
if (program.debug == 1)
{
console.log('Frame ' + fIx);
console.log('Frame ' + frame.join(','));
console.log('FFT ' + phasorMagnitudes.join(','));
console.log('Powers: ' + result.powers.join(','));
console.log('Post-filters: ' + result.melSpec.join(','));
console.log('Post-DCT: ' + result.melCoef.join(','));
}
else if (program.debug == 2)
{
console.log('Filters: ', result.filters);
}
else if (program.debug == 3)
{
console.log(phasorMagnitudes.join(','));
}
else if (!program.debug) console.log(fIx + ',' + result.join(','));
});
});
//var melCoefficients= dct.run(melSpec);
//console.log([fIx].concat(melCoefficients).join(','));
});
});
wr.on('format', function (format) {
wr.on('format', function (format) {
//TODO customize framer based on actual wave headers rather
// than assuming ulaw!
framer = new Framer({
map: u2pcm,
sizeS: framerSamples,
stepS: framerStep,
scale: win
});
});
var sampleRate = format.sampleRate;
wr.on('end', function () {
statistics();
if (program.args.length == 2) writeToCSV();
var end = new Date().getTime();
console.log('Elapsed: ', end - start);
console.log(binsStats);
process.exit(1);
});
ham = require('signal-windows').windows.construct('ham', program.samplesPerFrame);
var start = new Date().getTime();
fs.createReadStream(program.args[0]).pipe(wr);
}
var ulawMap = format.ulaw ? JSON.parse(fs.readFileSync('data/ulaw2pcm.json').toString()) : undefined;
function parseFFTAmplitudes(fftBinFile) {
var amplitudes = JSON.parse(fs.readFileSync(path.resolve(fftBinFile)).toString());
return amplitudes.map(parseFloat);
}
if (ulawMap) for (var k in ulawMap) ulawMap[k] = ulawMap[k]/32767;
function statistics() {
bins.forEach(function (bin, ix) {
var s = (new Stats()).push(bin);
binsStats[ix] = {
freq: binsToFreq[ix],
amean: s.amean()
};
});
}
if (format.channels != 1)
throw Error('Right now this MFCC code only works on single channel 8-bit wave files.');
if (format.bitDepth != 8)
throw Error('Right now this MFCC code only works on single channel 8-bit wave files.');
function writeCSVHeader(columns) {
var headers = columns.join(',');
// Breaks samples up into frames and runs them through a transform (map) if
// provided. In our case we want to transform from u-law if the wave file is
// formatted as such.
// By default we force a 'hamming' window.
framer = new Framer({
map: ulawMap,
frameSize: program.samplesPerFrame,
frameStep: program.samplesPerStep,
scale: ham,
sampleType: 'UInt8'
});
fs.writeFileSync(program.output, headers);
}
mfcc = mfcc.construct(program.samplesPerFrame / 2,
program.numMelSpecFilters,
program.minFrequency,
program.maxFrequency,
format.sampleRate);
});
function output(columns, rows) {
// First see if CSV already exists
if (fs.existsSync(program.output))
{
var contents = fs.readFileSync(program.output).toString();
wr.on('end', function () {
process.exit(1);
});
if (contents.indexOf(columns[0]) == -1)
writeCSVHeader(columns);
// Otherwise header already exists
}
else
writeCSVHeader(columns);
rows = rows.map(function (r) {
return r.join(',');
});
rows = rows.join('\n');
// Write out a single line of data
fs.appendFileSync(program.output, '\n' + rows);
}
fs.createReadStream(program.wav).pipe(wr);
-26
View File
@@ -1,26 +0,0 @@
1.423903e+02
8.032330e+01
2.215118e+02
1.672679e+02
5.327997e+02
2.418544e+02
2.060463e+03
8.842805e+04
1.140045e+05
4.408057e+05
1.329174e+05
6.295898e+04
1.384400e+03
6.334704e+02
3.695203e+02
2.444446e+02
1.701005e+02
1.278180e+02
9.702592e+01
7.697556e+01
7.005164e+01
5.945079e+01
1.318371e+02
4.292482e+03
5.036520e+03
5.994083e+02
1 1.423903e+02
2 8.032330e+01
3 2.215118e+02
4 1.672679e+02
5 5.327997e+02
6 2.418544e+02
7 2.060463e+03
8 8.842805e+04
9 1.140045e+05
10 4.408057e+05
11 1.329174e+05
12 6.295898e+04
13 1.384400e+03
14 6.334704e+02
15 3.695203e+02
16 2.444446e+02
17 1.701005e+02
18 1.278180e+02
19 9.702592e+01
20 7.697556e+01
21 7.005164e+01
22 5.945079e+01
23 1.318371e+02
24 4.292482e+03
25 5.036520e+03
26 5.994083e+02
+2
View File
@@ -28,7 +28,9 @@
"dct": "^0.0.2",
"fast-stats": "0.0.2",
"fft": "^0.2.1",
"fft-js": "0.0.6",
"frequencyjs": "0.0.3",
"signal-windows": "0.0.1",
"wav": "^1.0.0"
}
}
+135 -92
View File
@@ -9,113 +9,156 @@
var dct = require('dct');
module.exports = {
/*
* Given a set of amplitudes, estimates the power for those amplitudes.
*/
periodogram: periodogram,
/*
* Converts from hertz to the Mel-scale. Used by constructFilterBank.
*
* Based on the concept that human perception of an equidistant pitch decreases as
* pitch increases.
*/
hzToMels: hzToMels,
/*
* Inverse of hzToMels.
*/
melsToHz: melsToHz,
/*
* Returns a filter bank with nBanks triangular filters distributed according to the mel scale.
*
* Focused specifically on human speech (300 hz - 8000 hz)
*
* Recommended values for u-law 8000 hz:
*
* - nFrequencies == 32 (64 bit FFT)
* - nBanks == 31
* - Low Frequency == 200
* - High Frequency == 3500
*/
constructFilterBank: constructFilterBank
/*
* Given a set of amplitudes, estimates the power for those amplitudes.
*/
powerSpectrum: powerSpectrum,
/*
* Converts from hertz to the Mel-scale. Used by constructFilterBank.
*
* Based on the concept that human perception of an equidistant pitch decreases as
* pitch increases.
*/
hzToMels: hzToMels,
/*
* Inverse of hzToMels.
*/
melsToHz: melsToHz,
/*
* Returns a filter bank with bankCount triangular filters distributed according to the mel scale.
*
* Focused specifically on human speech (300 hz - 8000 hz)
*
* Recommended values for u-law 8000 hz:
*
* - fftSize == 64 (128 bin FFT)
* - bankCount == 31
* - Low Frequency == 200
* - High Frequency == 3500
*/
constructMelFilterBank: constructMelFilterBank,
construct: construct
};
function constructFilterBank(nFrequencies, nFilters, lowF, highF, sampleRate) {
var bins = [],
fq = [],
filters = [];
function construct(fftSize, bankCount, lowFrequency, highFrequency, sampleRate) {
if (!fftSize) throw Error('Please provide an fftSize');
if (!bankCount) throw Error('Please provide a bankCount');
if (!lowFrequency) throw Error('Please provide a low frequency cutoff.');
if (!highFrequency) throw Error('Please provide a high frequency cutoff.');
if (!sampleRate) throw Error('Please provide a valid sampleRate.');
var lowM = hzToMels(lowF),
highM = hzToMels(highF),
deltaM = (highM - lowM) / (nFilters+1);
var filterBank = constructMelFilterBank(fftSize, bankCount, lowFrequency, highFrequency, sampleRate);
// Construct equidistant Mel values between lowM and highM.
for (var i = 0; i < nFilters; i++) {
// Get the Mel value and convert back to frequency.
// e.g. 200 hz <=> 401.25 Mel
fq[i] = melsToHz(lowM + (i * deltaM));
/**
* Perform a full MFCC on a FFT spectrum.
*
* FFT Array passed in should contain frequency amplitudes only.
*
* Pass in truthy for debug if you wish to return outputs of each step (freq. powers, melSpec, and MelCoef)
*/
return function (fft, debug) {
if (fft.length != fftSize)
throw Error('Passed in FFT bins were incorrect size. Expected ' + fftSize + ' but was ' + fft.length);
// Round the frequency we derived from the Mel-scale to the nearest actual FFT bin that we have.
// For example, in a 64 sample FFT for 8khz audio we have 32 bins from 0-8khz evenly spaced.
bins[i] = Math.floor((nFrequencies+1) * fq[i] / (sampleRate/2));
var powers = powerSpectrum(fft),
melSpec = filterBank.filter(powers),
melCoef = dct(melSpec);
return debug ? {
powers: powers,
melSpec: melSpec,
melCoef: melCoef,
filters: filterBank
} : melCoef;
}
}
function constructMelFilterBank(fftSize, nFilters, lowF, highF, sampleRate) {
var bins = [],
fq = [],
filters = [];
var lowM = hzToMels(lowF),
highM = hzToMels(highF),
deltaM = (highM - lowM) / (nFilters+1);
// Construct equidistant Mel values between lowM and highM.
for (var i = 0; i < nFilters; i++) {
// Get the Mel value and convert back to frequency.
// e.g. 200 hz <=> 401.25 Mel
fq[i] = melsToHz(lowM + (i * deltaM));
// Round the frequency we derived from the Mel-scale to the nearest actual FFT bin that we have.
// For example, in a 64 sample FFT for 8khz audio we have 32 bins from 0-8khz evenly spaced.
bins[i] = Math.floor((fftSize+1) * fq[i] / (sampleRate/2));
}
// Construct one cone filter per bin.
// Filters end up looking similar to [... 0, 0, 0.33, 0.66, 1.0, 0.66, 0.33, 0, 0...]
for (var i = 0; i < bins.length; i++)
{
filters[i] = [];
var filterRange = (i != bins.length-1) ? bins[i+1] - bins[i] : bins[i] - bins[i-1];
filters[i].filterRange = filterRange;
for (var f = 0; f < fftSize; f++) {
// Right, outside of cone
if (f > bins[i] + filterRange) filters[i][f] = 0.0;
// Right edge of cone
else if (f > bins[i]) filters[i][f] = 1.0 - ((f - bins[i]) / filterRange);
// Peak of cone
else if (f == bins[i]) filters[i][f] = 1.0;
// Left edge of cone
else if (f >= bins[i] - filterRange) filters[i][f] = 1.0 - (bins[i] - f) / filterRange;
// Left, outside of cone
else filters[i][f] = 0.0;
}
}
// Construct one cone filter per bin.
// Filters end up looking similar to [... 0, 0, 0.33, 0.66, 1.0, 0.66, 0.33, 0, 0...]
for (var i = 0; i < bins.length; i++)
{
filters[i] = [];
var filterRange = (i != bins.length-1) ? bins[i+1] - bins[i] : bins[i] - bins[i-1];
filters[i].filterRange = filterRange;
for (var f = 0; f < nFrequencies; f++) {
// Right, outside of cone
if (f > bins[i] + filterRange) filters[i][f] = 0.0;
// Right edge of cone
else if (f > bins[i]) filters[i][f] = 1.0 - ((f - bins[i]) / filterRange);
// Peak of cone
else if (f == bins[i]) filters[i][f] = 1.0;
// Left edge of cone
else if (f >= bins[i] - filterRange) filters[i][f] = 1.0 - (bins[i] - f) / filterRange;
// Left, outside of cone
else filters[i][f] = 0.0;
}
// Store for debugging.
filters.bins = bins;
// Here we actually apply the filters one by one. Then we add up the results of each applied filter
// to get the estimated power contained within that Mel-scale bin.
//
// First argument is expected to be the result of the frequencies passed to the powerSpectrum
// method.
return {
filters: filters,
lowMel: lowM,
highMel: highM,
deltaMel: deltaM,
lowFreq: lowF,
highFreq: highF,
filter: function (freqPowers) {
var ret = [];
filters.forEach(function (filter, fIx) {
var tot = 0;
freqPowers.forEach(function (fp, pIx) {
tot += fp * filter[pIx];
});
ret[fIx] = tot;
});
return ret;
}
// Store for debugging.
filters.bins = bins;
// Here we actually apply the filters one by one. Then we add up the results of each applied filter
// to get the estimated power contained within that Mel-scale bin.
//
// First argument is expected to be the result of the frequencies passed to the periodogram
// method.
return function (freqPowers) {
var ret = [];
filters.forEach(function (filter, fIx) {
var tot = 0;
freqPowers.forEach(function (fp, pIx) {
tot += fp * filter[pIx];
});
ret[fIx] = tot;
});
return ret;
};
};
}
function melsToHz(mels) {
return 700 * (Math.exp(mels / 1127) - 1);
return 700 * (Math.exp(mels / 1127) - 1);
}
function hzToMels(hertz) {
return 1127 * Math.log(1 + hertz/700);
return 1127 * Math.log(1 + hertz/700);
}
function periodogram(amplitudes) {
var power = [],
N = amplitudes.length;
/**
* Estimate the power spectrum density from FFT amplitudes.
*/
function powerSpectrum(amplitudes) {
var N = amplitudes.length;
for (var i = 0; i < N; i++) {
power[i] = (amplitudes[i] * amplitudes[i]) / N;
}
return power;
return amplitudes.map(function (a) {
return (a * a) / N;
});
}
-39
View File
@@ -1,39 +0,0 @@
var Framer = function (options) {
options = options || {};
this.fIx = 0;
this.sizeS = options.sizeS || 64;
this.stepS = options.stepS || this.sizeS;
this.map = options.map || undefined;
this.scale = options.scale || undefined;
this.offset = options.offset || 0;
};
Framer.prototype = {
frame: function(buffer, callback) {
var self = this,
cb = this.offset,
frame = [];
while (cb < buffer.length) {
if (this.map) frame.push(this.map[buffer.readUInt8(cb)]);
else frame.push(buffer.readUInt8(cb));
if (frame.length == this.sizeS)
{
if (this.scale)
frame = frame.map(function (s, ix) {
return s * self.scale[ix];
});
callback(frame, this.fIx);
frame = [];
cb -= (this.sizeS - this.stepS);
this.fIx++;
}
cb++;
}
}
};
module.exports = Framer;
-32
View File
@@ -1,32 +0,0 @@
module.exports = {
construct: function (type, samples, alpha) {
var coef = [];
switch (type)
{
case 'ham':
for (var i = 0; i < samples; i++) {
coef[i] = 0.54 - 0.46 * Math.cos((2 * Math.PI * i)/(samples - 1));
}
break;
case 'tukey':
var sm1 = samples - 1,
asm1 = alpha * sm1;
for (var i = 0; i < samples; i++) {
// Left tail
if (i < asm1 / 2)
coef[i] = 0.5 * (1 + Math.cos(Math.PI * ((2 * samples)/(asm1) - 1)));
// Right tail
else if (i > (sm1 * (1 - alpha / 2)))
coef[i] = 0.5 * (1 + Math.cos(Math.PI * ((2 * samples)/(asm1) - (2 / alpha) + 1)));
// Middle (rectangular)
else coef[i] = 1;
}
break;
case 'flat':
for (var i = 0; i < samples; i++)
coef[i] = 1.0;
break;
}
return coef;
}
};