51 #include <sphinxbase/fe.h>
58 #include <sphinxbase/byteorder.h>
61 #include "sphinx_wave2feat.h"
62 #include "cmd_ln_defn.h"
102 int32 RemainingLength;
/* Fragment of a RIFF probe; the statements between the numbered lines are
 * not visible in this excerpt. */
/* Open the input in binary mode. */
124 if ((fh = fopen(infile,
"rb")) == NULL) {
/* Read the whole header record with a single fread. */
128 if (fread(&hdr,
sizeof(hdr), 1, fh) != 1) {
/* Branch taken when the first four header bytes are not "RIFF". */
134 if (0 != memcmp(hdr.rifftag,
"RIFF", 4)) {
/* Copy the header's channel count and sampling frequency into the
 * "-nchans" and "-samprate" configuration entries. */
140 cmd_ln_set_int32_r(wtf->
config,
"-nchans", hdr.numchannels);
141 cmd_ln_set_float32_r(wtf->
config,
"-samprate", hdr.SamplingFreq);
/* Fragment of a NIST header reader; the statements between the numbered
 * lines are not visible in this excerpt. */
157 if ((fh = fopen(infile,
"rb")) == NULL) {
/* Read the first 7 bytes and compare them with the "NIST_1A" magic. */
161 if (fread(&nist, 1, 7, fh) != 7) {
167 if (0 != strncmp(nist,
"NIST_1A", 7)) {
/* Rewind to the start of the file.
 * NOTE(review): the fseek return value is not checked. */
172 fseek(fh, 0, SEEK_SET);
/* Branch taken on an empty line buffer. */
178 if (strlen(li->buf) == 0) {
/* Header fields are matched on words[0]; the value is taken from words[2]. */
187 if (0 == strcmp(words[0],
"sample_rate")) {
188 cmd_ln_set_float32_r(wtf->
config,
"-samprate",
atof_c(words[2]));
190 if (0 == strcmp(words[0],
"channel_count")) {
/* NOTE(review): atoi cannot report a malformed value. */
191 cmd_ln_set_int32_r(wtf->
config,
"-nchans", atoi(words[2]));
193 if (0 == strcmp(words[0],
"sample_byte_format")) {
/* A value of "10" selects "big"; any other value selects "little". */
195 (0 == strcmp(words[2],
"10")) ?
"big" :
"little");
/* Position the stream at byte offset 1024 — presumably the end of the
 * header; confirm against the NIST file format.
 * NOTE(review): the fseek return value is not checked. */
200 fseek(fh, 1024, SEEK_SET);
/* Fragment of the sph2pipe-based probe; the statements between the numbered
 * lines are not visible in this excerpt. */
/* Parse the NIST header first; NULL is passed for the third argument. */
217 if ((rv = open_nist_file(wtf, infile, NULL)) != TRUE)
/* Build the shell command that is handed to popen().
 * NOTE(review): infile is placed between single quotes without escaping, so
 * a file name containing a single quote changes the command the shell runs. */
221 cmdline =
string_join(
"sph2pipe -f raw '", infile,
"'", NULL);
/* Open a read pipe from the command. */
222 if ((fh = popen(cmdline,
"r")) == NULL) {
223 E_ERROR_SYSTEM(
"Failed to popen(\"sph2pipe -f raw '%s'\")", infile);
/* Message for the case where popen() is unavailable; the condition that
 * selects this path is not visible here. */
238 E_ERROR(
"popen() not available, cannot run sph2pipe\n");
/* Fragment: parse the NIST header through open_nist_file, passing &fh as the
 * third argument; the branch is taken when the result is not TRUE. */
254 if ((rv = open_nist_file(wtf, infile, &fh)) != TRUE)
/* Fragment: open the input in binary mode; the branch handles fopen
 * returning NULL. */
275 if ((fh = fopen(infile,
"rb")) == NULL) {
/* Fragment of a Sphinx MFCC file probe; the statements between the numbered
 * lines are not visible in this excerpt. */
299 if ((fh = fopen(infile,
"rb")) == NULL) {
/* Read the leading 4-byte length word. */
303 if (fread(&len, 4, 1, fh) != 1) {
/* Seek to the end of the file.
 * NOTE(review): the fseek return value is not checked. */
307 fseek(fh, 0, SEEK_END);
/* Convert flen to a count of 4-byte units, minus one unit — presumably the
 * length word itself; how flen is obtained is not visible here. */
311 flen = (flen / 4) - 1;
317 E_ERROR(
"Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
/* Selects between the strings "little" and "big"; the condition is not
 * visible here. */
324 ?
"little" :
"big"));
/* Position the stream just past the 4-byte length word. */
327 fseek(fh, 4, SEEK_SET);
341 E_ERROR(
"Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
/* Fragment of the channel reduction helper: rewrites buf in place so that
 * output slot i/nchans is derived from the nchans-sample group starting at i.
 * The return statement and the test choosing between the two loops are not
 * visible in this excerpt. */
349 mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
/* Pick one channel: start at index whichchan - 1 (so whichchan counts from 1)
 * and keep every nchans-th sample. */
354 for (i = whichchan - 1; i < nsamp; i += nchans)
355 buf[i/nchans] = buf[i];
/* Mix channels: walk the buffer one nchans-sample group at a time. */
358 for (i = 0; i < nsamp; i += nchans) {
/* The inner loop stops early when the last group is incomplete. */
360 for (j = 0; j < nchans && i + j < nsamp; ++j) {
/* tmp is divided by nchans even for an incomplete final group; how tmp is
 * accumulated is not visible here. */
363 buf[i/nchans] = (int16)(tmp / nchans);
/* Fragment of the libsndfile probe, compiled only when HAVE_SNDFILE_H is
 * defined; the statements between the numbered lines are not visible. */
369 #ifdef HAVE_SNDFILE_H
/* Zero the info structure before passing it to sf_open. */
381 memset(&sfinfo, 0,
sizeof(sfinfo));
/* Open the file for reading; the branch handles a NULL handle. */
384 if ((sf = sf_open(infile, SFM_READ, &sfinfo)) == NULL) {
/* Copy the channel count and sample rate from sfinfo into the
 * configuration. */
388 cmd_ln_set_int32_r(wtf->
config,
"-nchans", sfinfo.channels);
389 cmd_ln_set_float32_r(wtf->
config,
"-samprate", sfinfo.samplerate);
/* Fragment of the libsndfile decoder; the statements between the numbered
 * lines are not visible in this excerpt. */
407 int32 nfr, nchans, whichchan;
/* Channel settings are read from the configuration. */
410 nchans = cmd_ln_int32_r(wtf->
config,
"-nchans");
411 whichchan = cmd_ln_int32_r(wtf->
config,
"-whichchan");
412 fe_start_utt(wtf->
fe);
/* Read loop over 16-bit samples from the libsndfile handle. */
414 while ((nsamp = sf_read_short(wtf->insfh,
417 int16
const *inspeech;
/* Reduce the buffer with mixnpick_channels; nsamp becomes its return value. */
422 nsamp = mixnpick_channels(wtf->
audio, nsamp, nchans, whichchan);
424 inspeech = wtf->
audio;
/* inspeech and nsamp are passed by address — presumably the front end
 * updates them as it consumes input; confirm against the fe API. */
429 fe_process_frames(wtf->
fe, &inspeech, &nsamp, wtf->
feat, &nfr);
/* Pass nfr frames to the selected output writer; a negative result takes
 * the branch. */
431 if ((n = (*wtf->
ot->output_frames)(wtf, wtf->
feat, nfr)) < 0)
436 inspeech = wtf->
audio;
/* End the utterance; feat[0] and &nfr are handed to fe_end_utt. */
439 fe_end_utt(wtf->
fe, wtf->
feat[0], &nfr);
441 if ((n = (*wtf->
ot->output_frames)(wtf, wtf->
feat, nfr)) < 0)
446 sf_close(wtf->insfh);
/* Fragment of the PCM decoder; the statements between the numbered lines
 * (including the read loop header) are not visible in this excerpt. */
460 int32 nfr, nchans, whichchan;
/* Channel settings are read from the configuration. */
463 nchans = cmd_ln_int32_r(wtf->
config,
"-nchans");
464 whichchan = cmd_ln_int32_r(wtf->
config,
"-whichchan");
465 fe_start_utt(wtf->
fe);
469 int16
const *inspeech;
/* Byte-swap each of the nsamp 16-bit samples in place; the condition
 * guarding this loop is not visible here. */
473 for (n = 0; n < nsamp; ++n)
474 SWAP_INT16(wtf->
audio + n);
/* Reduce the buffer with mixnpick_channels; nsamp becomes its return value. */
479 nsamp = mixnpick_channels(wtf->
audio, nsamp, nchans, whichchan);
481 inspeech = wtf->
audio;
/* inspeech and nsamp are passed by address — presumably the front end
 * updates them as it consumes input; confirm against the fe API. */
486 fe_process_frames(wtf->
fe, &inspeech, &nsamp, wtf->
feat, &nfr);
/* Pass nfr frames to the selected output writer; a negative result takes
 * the branch. */
488 if ((n = (*wtf->
ot->output_frames)(wtf, wtf->
feat, nfr)) < 0)
493 inspeech = wtf->
audio;
/* End the utterance; feat[0] and &nfr are handed to fe_end_utt. */
496 fe_end_utt(wtf->
fe, wtf->
feat[0], &nfr);
498 if ((n = (*wtf->
ot->output_frames)(wtf, wtf->
feat, nfr)) < 0)
/* Close the input stream and test for a close failure. */
503 if (fclose(wtf->
infh) == EOF)
/* Fragment of the Sphinx MFCC file decoder; the statements between the
 * numbered lines are not visible in this excerpt. */
/* Read loop: fread fills the buffer starting at feat[0] with elements of
 * size sizeof(**wtf->feat). */
524 while ((n = fread(wtf->
feat[0],
sizeof(**wtf->
feat),
528 E_ERROR(
"Size of file %d not a multiple of veclen %d\n",
/* Byte-swap each of the n values just read; the condition guarding this
 * loop is not visible here. */
534 for (i = 0; i < n; ++i)
535 SWAP_FLOAT32(wtf->
feat[0] + i);
/* Convert in place: the same feat buffer is passed as input (cast to
 * float32 **) and as output. */
537 fe_float_to_mfcc(wtf->
fe, (float32 **)wtf->
feat, wtf->
feat, nfr);
/* Per-frame transform; each call below works in place on feat[i]. The
 * conditions that select between the three calls are not visible here. */
538 for (i = 0; i < nfr; ++i) {
541 fe_logspec_to_mfcc(wtf->
fe, wtf->
feat[i], wtf->
feat[i]);
543 fe_logspec_dct2(wtf->
fe, wtf->
feat[i], wtf->
feat[i]);
546 fe_mfcc_dct3(wtf->
fe, wtf->
feat[i], wtf->
feat[i]);
/* Pass nfr frames to the selected output writer; a negative result takes
 * the branch. */
549 if ((n = (*wtf->
ot->output_frames)(wtf, wtf->
feat, nfr)) < 0)
/* Close the input stream and test for a close failure. */
554 if (fclose(wtf->
infh) == EOF)
/* Fragment of the input type table. Each entry holds a name string, a
 * detect function and a decode function. The "-sndfile" entry follows an
 * #ifdef on HAVE_SNDFILE_H; the matching #endif is not visible here. */
561 #ifdef HAVE_SNDFILE_H
562 {
"-sndfile", &detect_sndfile, &decode_sndfile },
/* The remaining visible entries all use decode_pcm as their decoder. */
564 {
"-mswav", &detect_riff, &decode_pcm },
565 {
"-nist", &detect_nist, &decode_pcm },
566 {
"-raw", &detect_raw, &decode_pcm },
567 {
"-sph2pipe", &detect_sph2pipe, &decode_pcm }
/* Number of entries in types, computed from the array size. */
569 static const int ntypes =
sizeof(types)/
sizeof(types[0]);
/* Initializer values of a separate entry for Sphinx MFCC input; its
 * declaration line is not visible here. */
571 "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
/* Fragment: write nfloat to the output stream as one 4-byte item; the branch
 * handles a short write. */
582 if (fwrite(&nfloat, 4, 1, wtf->
outfh) != 1) {
/* Fragment of the Sphinx-format frame writer; the statements between the
 * numbered lines are not visible in this excerpt. */
/* Convert in place: frames is passed as input and, cast to float32 **, as
 * output. */
599 fe_mfcc_to_float(wtf->
fe, frames, (float32 **)frames, nfr);
/* Write each frame as veclen float32 values; the branch handles a short
 * write. */
600 for (i = 0; i < nfr; ++i) {
601 if (fwrite(frames[i],
sizeof(float32), wtf->
veclen, wtf->
outfh) != wtf->
veclen) {
/* Opening and closing lines of two enum typedefs, htk_feature_kind_t and
 * htk_feature_flag_t; their enumerators are not visible in this excerpt. */
611 typedef enum htk_feature_kind_e {
624 } htk_feature_kind_t;
626 typedef enum htk_feature_flag_e {
637 } htk_feature_flag_t;
/* Fragment of the HTK header writer; the statements between the numbered
 * lines are not visible in this excerpt. Four fields are written in order
 * (4, 4, 2 and 2 bytes), each byte-swapped first when swap is set. */
/* First field: nfloat as a 4-byte value. */
654 if (swap) SWAP_INT32(&nfloat);
655 if (fwrite(&nfloat, 4, 1, wtf->
outfh) != 1)
/* Second field: 1e+7 divided by the "-frate" setting — presumably the frame
 * period in 100 ns units; confirm against the HTK file format. */
658 samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->
config,
"-frate"));
659 if (swap) SWAP_INT32(&samp_period);
660 if (fwrite(&samp_period, 4, 1, wtf->
outfh) != 1)
/* Third field: veclen * 4, i.e. the frame size when each value takes
 * 4 bytes. */
663 samp_size = wtf->
veclen * 4;
664 if (swap) SWAP_INT16(&samp_size);
665 if (fwrite(&samp_size, 2, 1, wtf->
outfh) != 1)
/* Fourth field: the MFCC kind combined with the _O flag. The address-of
 * operator on param_kind had been corrupted into a pilcrow character in the
 * two statements below; it is restored here. */
672 param_kind = MFCC | _O;
673 if (swap) SWAP_INT16(&param_kind);
674 if (fwrite(&param_kind, 2, 1, wtf->
outfh) != 1)
/* Fragment of the HTK-format frame writer; the statements between the
 * numbered lines are not visible in this excerpt. */
686 int i, j, swap, htk_reorder, nfloat = 0;
/* Convert in place: frames is passed as input and, cast to float32 **, as
 * output. */
688 fe_mfcc_to_float(wtf->
fe, frames, (float32 **)frames, nfr);
/* Reordering depends on the output type being named "htk"; the rest of this
 * expression is not visible here. */
691 htk_reorder = (0 == strcmp(
"htk", wtf->
ot->name)
694 for (i = 0; i < nfr; ++i) {
/* Rotate the first coefficient to the end of the frame: save it, shift
 * coefficients 1..veclen-1 down by one slot, then store the saved value in
 * the last slot. The shift must go towards index 0; copying towards
 * frames[i] + 1 would duplicate the first coefficient and drop the last
 * ones. */
696 mfcc_t c0 = frames[i][0];
697 memmove(frames[i], frames[i] + 1, (wtf->
veclen - 1) * 4);
698 frames[i][wtf->
veclen - 1] = c0;
/* Byte-swap every value of the frame; the condition guarding this loop is
 * not visible here. */
701 for (j = 0; j < wtf->
veclen; ++j)
702 SWAP_FLOAT32(frames[i] + j);
/* Write the frame as veclen float32 values; the branch handles a short
 * write. */
703 if (fwrite(frames[i],
sizeof(float32), wtf->
veclen, wtf->
outfh) != wtf->
veclen) {
/* Fragment of the text-format frame writer; the statements between the
 * numbered lines are not visible in this excerpt. */
719 int i, j, nfloat = 0;
/* Convert in place: frames is passed as input and, cast to float32 **, as
 * output. */
721 fe_mfcc_to_float(wtf->
fe, frames, (float32 **)frames, nfr);
722 for (i = 0; i < nfr; ++i) {
723 for (j = 0; j < wtf->
veclen; ++j) {
/* Each value is printed with the "%.5g" format. */
724 fprintf(wtf->
outfh,
"%.5g", frames[i][j]);
/* A newline or a space is written after a value; the condition choosing
 * between them is not visible here. */
726 fprintf(wtf->
outfh,
"\n");
728 fprintf(wtf->
outfh,
" ");
/* Fragment of the output type table. Each entry holds a name string, a
 * header writer and a frame writer. */
736 {
"sphinx", &output_header_sphinx, &output_frames_sphinx },
737 {
"htk", &output_header_htk, &output_frames_htk },
/* The "text" entry has NULL in the header writer slot. */
738 {
"text", NULL, &output_frames_text }
/* Number of entries in outtypes, computed from the array size. */
740 static const int nouttypes =
sizeof(outtypes)/
sizeof(outtypes[0]);
/* Fragment of the object initializer, which takes the parsed configuration;
 * the statements between the numbered lines are not visible in this
 * excerpt. */
743 sphinx_wave2feat_init(
cmd_ln_t *config)
/* Create the front end from the object's configuration. */
751 wtf->
fe = fe_init_auto_r(wtf->
config);
/* Search the output type table for an entry whose name equals the "-ofmt"
 * setting. */
753 for (i = 0; i < nouttypes; ++i) {
755 if (0 == strcmp(
cmd_ln_str_r(config,
"-ofmt"), otype->name)) {
/* i reaching nouttypes means the loop ran to the end of the table; an error
 * is logged and the object is freed. */
760 if (i == nouttypes) {
761 E_ERROR(
"Unknown output type: '%s'\n",
763 sphinx_wave2feat_free(wtf);
/* Fragment: close the input and output streams, testing each fclose for a
 * failure; any guards around these calls are not visible here. */
783 if (fclose(wtf->
infh) == EOF)
787 if (fclose(wtf->
outfh) == EOF)
/* Fragment of the input type detection; the statements between the numbered
 * lines are not visible in this excerpt. */
/* Run the detector of the Sphinx MFCC entry. */
813 int rv = mfcc_type.detect(wtf, infile);
/* Two separate passes over the type table each call an entry's detector;
 * the conditions that filter entries in each pass are not visible here. */
820 for (i = 0; i < ntypes; ++i) {
824 rv = (*atype->detect)(wtf, infile);
833 for (i = 0; i < ntypes; ++i) {
836 rv = (*atype->detect)(wtf, infile);
/* Fragment of the single-file conversion routine; the statements between
 * the numbered lines are not visible in this excerpt. */
855 char const *infile,
char const *outfile)
857 int nchans, minfft, nfft, nfloat, veclen;
862 E_INFO(
"Converting %s to %s\n", infile, outfile);
/* Detect the input type; a NULL result takes the branch. */
865 if ((atype = detect_audio_type(wtf, infile)) == NULL)
/* Product of "-samprate" and "-wlen", rounded to the nearest integer by
 * adding 0.5 before truncation. */
873 minfft = (int)(cmd_ln_float32_r(wtf->
config,
"-samprate")
874 * cmd_ln_float32_r(wtf->
config,
"-wlen") + 0.5);
/* Find the smallest power of two that is not below minfft. */
875 for (nfft = 1; nfft < minfft; nfft <<= 1)
/* Raise "-nfft" when the configured value is smaller than that, and warn. */
877 if (nfft > cmd_ln_int32_r(wtf->
config,
"-nfft")) {
878 E_WARN(
"Value of -nfft = %d is too small, increasing to %d\n",
879 cmd_ln_int32_r(wtf->
config,
"-nfft"), nfft);
880 cmd_ln_set_int32_r(wtf->
config,
"-nfft", nfft);
/* Create the front end from the configuration. */
882 wtf->
fe = fe_init_auto_r(wtf->
config);
/* Cache the front end's output size as the feature vector length. */
887 wtf->
veclen = fe_get_output_size(wtf->
fe);
/* Query the frame shift and frame size from the front end. */
890 fe_get_input_size(wtf->
fe, &fshift, &fsize);
894 nchans = cmd_ln_int32_r(wtf->
config,
"-nchans");
/* Grow the block size to at least (fsize + fshift) * nchans. */
896 if (wtf->
blocksize < (fsize + fshift) * nchans) {
897 E_INFO(
"Block size of %d too small, increasing to %d\n",
899 (fsize + fshift) * nchans);
900 wtf->
blocksize = (fsize + fshift) * nchans;
/* Open the output file in binary write mode. */
911 if ((wtf->
outfh = fopen(outfile,
"wb")) == NULL) {
/* When the output type has a header writer, call it first with a count
 * of 0. */
916 if (wtf->
ot->output_header &&
917 (*wtf->
ot->output_header)(wtf, 0) < 0) {
/* Decode the input; nfloat receives the decoder's result and a negative
 * value takes the branch. */
923 if ((nfloat = (*atype->decode)(wtf)) < 0)
/* Seek back to the start of the output and write the header again, this
 * time with nfloat. */
926 if (wtf->
ot->output_header) {
927 if (fseek(wtf->
outfh, 0, SEEK_SET) < 0) {
931 if ((*wtf->
ot->output_header)(wtf, nfloat) < 0) {
/* Close the output stream and test for a close failure. */
936 if (fclose(wtf->
outfh) == EOF)
/* Fragment of the file name builder: takes the configuration and a base
 * name, and has two char ** output parameters, out_infile and out_outfile.
 * The body is not visible in this excerpt apart from one declaration. */
950 build_filenames(
cmd_ln_t *config,
char const *basename,
951 char **out_infile,
char **out_outfile)
953 char const *di, *do_, *ei, *eo;
/* Fragment of the control-file driver; the statements between the numbered
 * lines are not visible in this excerpt. */
989 int nskip, runlen, npart, rv = 0;
/* Open the control file in text read mode. */
991 if ((ctlfh = fopen(ctlfile,
"r")) == NULL) {
995 nskip = cmd_ln_int32_r(wtf->
config,
"-nskip");
996 runlen = cmd_ln_int32_r(wtf->
config,
"-runlen");
/* A non-zero "-npart" selects partitioned processing. */
997 if ((npart = cmd_ln_int32_r(wtf->
config,
"-npart"))) {
999 int partlen, part, nlines = 0;
1000 part = cmd_ln_int32_r(wtf->
config,
"-part");
/* Rewind the control file; how nlines is counted before this point is not
 * visible here. */
1003 fseek(ctlfh, 0, SEEK_SET);
/* Integer division: each part gets nlines / npart entries, and part is
 * counted from 1 when computing the skip offset. */
1004 partlen = nlines / npart;
1005 nskip = partlen * (part - 1);
1012 E_INFO(
"Processing %d utterances at position %d\n", runlen, nskip);
1016 E_INFO(
"Processing all remaining utterances at position %d\n", nskip);
1020 char *c, *infile, *outfile;
/* Find the first space in the line; what is done at that position is not
 * visible here. */
1032 if ((c = strchr(li->buf,
' ')) != NULL)
/* Derive the input and output file names from the line, then convert. */
1034 build_filenames(wtf->
config, li->buf, &infile, &outfile);
1037 rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
/* Close the control file and test for a close failure. */
1041 if (fclose(ctlfh) == EOF)
/* Fragment of the program entry point; the statements between the numbered
 * lines are not visible in this excerpt. */
1056 main(
int argc,
char *argv[])
/* Parse the command line against defn. */
1063 if ((config =
cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
1070 if (config == NULL) {
1071 E_ERROR(
"Command line parsing failed\n");
/* Create the conversion object from the parsed configuration. */
1074 if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
1075 E_ERROR(
"Failed to initialize wave2feat object\n");
/* Either run a control file named by "-c" or convert the single file named
 * by "-i"; the condition choosing between them is not visible here. */
1083 rv = run_control_file(wtf,
cmd_ln_str_r(config,
"-c"));
1085 rv = sphinx_wave2feat_convert_file(wtf,
cmd_ln_str_r(config,
"-i"),
1088 sphinx_wave2feat_free(wtf);