SphinxBase  0.6
sphinx_fe.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <time.h>
41 #include <assert.h>
42 
43 #ifdef HAVE_CONFIG_H
44 #include <config.h>
45 #endif
46 
47 #ifdef HAVE_SNDFILE_H
48 #include <sndfile.h>
49 #endif
50 
51 #include <sphinxbase/fe.h>
52 #include <sphinxbase/strfuncs.h>
53 #include <sphinxbase/pio.h>
54 #include <sphinxbase/filename.h>
55 #include <sphinxbase/cmd_ln.h>
56 #include <sphinxbase/err.h>
57 #include <sphinxbase/ckd_alloc.h>
58 #include <sphinxbase/byteorder.h>
59 #include <sphinxbase/hash_table.h>
60 
61 #include "sphinx_wave2feat.h"
62 #include "cmd_ln_defn.h"
63 
64 typedef struct audio_type_s {
65  char const *name;
66  int (*detect)(sphinx_wave2feat_t *wtf, char const *infile);
67  int (*decode)(sphinx_wave2feat_t *wtf);
68 } audio_type_t;
69 
70 typedef struct output_type_s {
71  char const *name;
72  int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat);
73  int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr);
75 
77  int refcount;
79  fe_t *fe;
80  char *infile;
81  char *outfile;
82  FILE *infh;
83  FILE *outfh;
84  short *audio;
85  mfcc_t **feat;
86  int blocksize;
87  int featsize;
88  int veclen;
89  int in_veclen;
90  int byteswap;
91 #ifdef HAVE_SNDFILE_H
92  SNDFILE *insfh;
93 #endif
94  output_type_t const *ot;
95 };
96 
98 typedef struct RIFFHeader{
99  char rifftag[4]; /* "RIFF" string */
100  int32 TotalLength; /* Total length */
101  char wavefmttag[8]; /* "WAVEfmt " string (note space after 't') */
102  int32 RemainingLength; /* Remaining length */
103  int16 data_format; /* data format tag, 1 = PCM */
104  int16 numchannels; /* Number of channels in file */
105  int32 SamplingFreq; /* Sampling frequency */
106  int32 BytesPerSec; /* Average bytes/sec */
107  int16 BlockAlign; /* Block align */
108  int16 BitsPerSample; /* 8 or 16 bit */
109  char datatag[4]; /* "data" string */
110  int32 datalength; /* Raw data length */
111 } MSWAV_hdr;
112 
118 static int
119 detect_riff(sphinx_wave2feat_t *wtf, char const *infile)
120 {
121  FILE *fh;
122  MSWAV_hdr hdr;
123 
124  if ((fh = fopen(infile, "rb")) == NULL) {
125  E_ERROR_SYSTEM("Failed to open %s", infile);
126  return -1;
127  }
128  if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
129  E_ERROR_SYSTEM("Failed to read RIFF header");
130  fclose(fh);
131  return -1;
132  }
133  /* Make sure it is actually a RIFF file. */
134  if (0 != memcmp(hdr.rifftag, "RIFF", 4)) {
135  fclose(fh);
136  return FALSE;
137  }
138 
139  /* Get relevant information. */
140  cmd_ln_set_int32_r(wtf->config, "-nchans", hdr.numchannels);
141  cmd_ln_set_float32_r(wtf->config, "-samprate", hdr.SamplingFreq);
142  if (wtf->infile)
143  ckd_free(wtf->infile);
144  wtf->infile = ckd_salloc(infile);
145  wtf->infh = fh;
146 
147  return TRUE;
148 }
149 
150 static int
151 open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh)
152 {
153  char nist[7];
154  lineiter_t *li;
155  FILE *fh;
156 
157  if ((fh = fopen(infile, "rb")) == NULL) {
158  E_ERROR_SYSTEM("Failed to open %s", infile);
159  return -1;
160  }
161  if (fread(&nist, 1, 7, fh) != 7) {
162  E_ERROR_SYSTEM("Failed to read NIST header");
163  fclose(fh);
164  return -1;
165  }
166  /* Is this actually a NIST file? */
167  if (0 != strncmp(nist, "NIST_1A", 7)) {
168  fclose(fh);
169  return FALSE;
170  }
171  /* Rewind, parse lines. */
172  fseek(fh, 0, SEEK_SET);
173  for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
174  char **words;
175  int nword;
176 
177  string_trim(li->buf, STRING_BOTH);
178  if (strlen(li->buf) == 0) {
179  lineiter_free(li);
180  break;
181  }
182  nword = str2words(li->buf, NULL, 0);
183  if (nword != 3)
184  continue;
185  words = ckd_calloc(nword, sizeof(*words));
186  str2words(li->buf, words, nword);
187  if (0 == strcmp(words[0], "sample_rate")) {
188  cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2]));
189  }
190  if (0 == strcmp(words[0], "channel_count")) {
191  cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2]));
192  }
193  if (0 == strcmp(words[0], "sample_byte_format")) {
194  cmd_ln_set_str_r(wtf->config, "-input_endian",
195  (0 == strcmp(words[2], "10")) ? "big" : "little");
196  }
197  ckd_free(words);
198  }
199 
200  fseek(fh, 1024, SEEK_SET);
201  if (out_fh)
202  *out_fh = fh;
203  else
204  fclose(fh);
205  return TRUE;
206 }
207 
208 #ifdef HAVE_POPEN
209 static int
210 detect_sph2pipe(sphinx_wave2feat_t *wtf, char const *infile)
211 {
212  FILE *fh;
213  char *cmdline;
214  int rv;
215 
216  /* Determine if it's NIST file and get parameters. */
217  if ((rv = open_nist_file(wtf, infile, NULL)) != TRUE)
218  return rv;
219 
220  /* Now popen it with sph2pipe. */
221  cmdline = string_join("sph2pipe -f raw '", infile, "'", NULL);
222  if ((fh = popen(cmdline, "r")) == NULL) {
223  E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", infile);
224  ckd_free(cmdline);
225  return -1;
226  }
227 
228  if (wtf->infile)
229  ckd_free(wtf->infile);
230  wtf->infile = ckd_salloc(infile);
231  wtf->infh = fh;
232  return TRUE;
233 }
234 #else /* !HAVE_POPEN */
235 static int
236 detect_sph2pipe(sphinx_wave2feat_t *wtf, char const *infile)
237 {
238  E_ERROR("popen() not available, cannot run sph2pipe\n");
239  return -1;
240 }
241 #endif /* !HAVE_POPEN */
242 
248 static int
249 detect_nist(sphinx_wave2feat_t *wtf, char const *infile)
250 {
251  FILE *fh;
252  int rv;
253 
254  if ((rv = open_nist_file(wtf, infile, &fh)) != TRUE)
255  return rv;
256  if (wtf->infile)
257  ckd_free(wtf->infile);
258  wtf->infile = ckd_salloc(infile);
259  wtf->infh = fh;
260  return TRUE;
261 }
262 
263 
270 static int
271 detect_raw(sphinx_wave2feat_t *wtf, char const *infile)
272 {
273  FILE *fh;
274 
275  if ((fh = fopen(infile, "rb")) == NULL) {
276  E_ERROR_SYSTEM("Failed to open %s", infile);
277  return -1;
278  }
279  if (wtf->infile)
280  ckd_free(wtf->infile);
281  wtf->infile = ckd_salloc(infile);
282  wtf->infh = fh;
283  return TRUE;
284 }
285 
292 static int
293 detect_sphinx_mfc(sphinx_wave2feat_t *wtf, char const *infile)
294 {
295  FILE *fh;
296  int32 len;
297  long flen;
298 
299  if ((fh = fopen(infile, "rb")) == NULL) {
300  E_ERROR_SYSTEM("Failed to open %s", infile);
301  return -1;
302  }
303  if (fread(&len, 4, 1, fh) != 1) {
304  E_ERROR_SYSTEM("Failed to read header from %s\n", infile);
305  return -1;
306  }
307  fseek(fh, 0, SEEK_END);
308  flen = ftell(fh);
309 
310  /* figure out whether to byteswap */
311  flen = (flen / 4) - 1;
312  if (flen != len) {
313  /* First make sure this is an endianness problem, otherwise fail. */
314  SWAP_INT32(&len);
315  if (flen != len) {
316  SWAP_INT32(&len);
317  E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
318  len, flen);
319  return -1;
320  }
321  /* Set the input endianness to the opposite of the machine endianness... */
322  cmd_ln_set_str_r(wtf->config, "-input_endian",
323  (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
324  ? "little" : "big"));
325  }
326 
327  fseek(fh, 4, SEEK_SET);
328  if (wtf->infile)
329  ckd_free(wtf->infile);
330  wtf->infile = ckd_salloc(infile);
331  wtf->infh = fh;
332  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
333  wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
334  }
335  else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
336  wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
337  wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
338  }
339  else {
340  /* Should not happen. */
341  E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
342  assert(FALSE);
343  }
344 
345  return TRUE;
346 }
347 
/**
 * Reduce interleaved multi-channel audio to a single channel, in place.
 *
 * @param buf       Interleaved samples; the result is written to the
 *                  start of the same buffer.
 * @param nsamp     Total number of samples in buf (all channels).
 * @param nchans    Number of interleaved channels.
 * @param whichchan Channel to keep, counted from 1; zero or a negative
 *                  value averages all channels of each frame instead.
 * @return Number of single-channel samples left in buf.  Returns 0
 *         without touching buf when nchans or nsamp is not positive, or
 *         when whichchan names a channel that does not exist.
 */
int
mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
{
    int i, j;

    /* Reject arguments that would divide by zero, never advance the
     * loops, or index a channel beyond the frame width. */
    if (nchans <= 0 || nsamp <= 0 || whichchan > nchans)
        return 0;

    if (whichchan > 0) {
        /* Copy every nchans-th sample, starting at the chosen channel. */
        for (i = whichchan - 1; i < nsamp; i += nchans)
            buf[i/nchans] = buf[i];
    }
    else {
        for (i = 0; i < nsamp; i += nchans) {
            float64 tmp = 0.0;
            /* The inner bound keeps a trailing partial frame in range. */
            for (j = 0; j < nchans && i + j < nsamp; ++j) {
                tmp += buf[i + j];
            }
            buf[i/nchans] = (int16)(tmp / nchans);
        }
    }
    /* i has advanced one frame past the last one processed. */
    return i/nchans;
}
368 
369 #ifdef HAVE_SNDFILE_H
370 
375 static int
376 detect_sndfile(sphinx_wave2feat_t *wtf, char const *infile)
377 {
378  SNDFILE *sf;
379  SF_INFO sfinfo;
380 
381  memset(&sfinfo, 0, sizeof(sfinfo));
382  /* We let other detectors catch I/O errors, since there is
383  no way to tell them from format errors when opening :( */
384  if ((sf = sf_open(infile, SFM_READ, &sfinfo)) == NULL) {
385  return FALSE;
386  }
387  /* Get relevant information. */
388  cmd_ln_set_int32_r(wtf->config, "-nchans", sfinfo.channels);
389  cmd_ln_set_float32_r(wtf->config, "-samprate", sfinfo.samplerate);
390  if (wtf->infile)
391  ckd_free(wtf->infile);
392  wtf->infile = ckd_salloc(infile);
393  wtf->insfh = sf;
394  wtf->infh = NULL;
395 
396  return TRUE;
397 }
398 
403 static int
404 decode_sndfile(sphinx_wave2feat_t *wtf)
405 {
406  size_t nsamp;
407  int32 nfr, nchans, whichchan;
408  int nfloat, n;
409 
410  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
411  whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
412  fe_start_utt(wtf->fe);
413  nfloat = 0;
414  while ((nsamp = sf_read_short(wtf->insfh,
415  wtf->audio,
416  wtf->blocksize)) != 0) {
417  int16 const *inspeech;
418  size_t nvec;
419 
420  /* Mix or pick channels. */
421  if (nchans > 1)
422  nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
423 
424  inspeech = wtf->audio;
425  nvec = wtf->featsize;
426  /* Consume all samples. */
427  while (nsamp) {
428  nfr = nvec;
429  fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
430  if (nfr) {
431  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
432  return -1;
433  nfloat += n;
434  }
435  }
436  inspeech = wtf->audio;
437  }
438  /* Now process any leftover audio frames. */
439  fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
440  if (nfr) {
441  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
442  return -1;
443  nfloat += n;
444  }
445 
446  sf_close(wtf->insfh);
447  wtf->insfh = NULL;
448  return nfloat;
449 }
450 #endif /* HAVE_SNDFILE_H */
451 
456 static int
457 decode_pcm(sphinx_wave2feat_t *wtf)
458 {
459  size_t nsamp;
460  int32 nfr, nchans, whichchan;
461  int nfloat, n;
462 
463  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
464  whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
465  fe_start_utt(wtf->fe);
466  nfloat = 0;
467  while ((nsamp = fread(wtf->audio, 2, wtf->blocksize, wtf->infh)) != 0) {
468  size_t nvec;
469  int16 const *inspeech;
470 
471  /* Byteswap stuff here if necessary. */
472  if (wtf->byteswap) {
473  for (n = 0; n < nsamp; ++n)
474  SWAP_INT16(wtf->audio + n);
475  }
476 
477  /* Mix or pick channels. */
478  if (nchans > 1)
479  nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
480 
481  inspeech = wtf->audio;
482  nvec = wtf->featsize;
483  /* Consume all samples. */
484  while (nsamp) {
485  nfr = nvec;
486  fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
487  if (nfr) {
488  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
489  return -1;
490  nfloat += n;
491  }
492  }
493  inspeech = wtf->audio;
494  }
495  /* Now process any leftover audio frames. */
496  fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
497  if (nfr) {
498  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
499  return -1;
500  nfloat += n;
501  }
502 
503  if (fclose(wtf->infh) == EOF)
504  E_ERROR_SYSTEM("Failed to close input file");
505  wtf->infh = NULL;
506  return nfloat;
507 }
508 
513 static int
514 decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
515 {
516  int nfloat = 0, n;
517  int featsize = wtf->featsize;
518 
519  /* If the input vector length is less than the output length, we
520  * need to do this one frame at a time, because there's empty
521  * space at the end of each vector in wtf->feat. */
522  if (wtf->in_veclen < wtf->veclen)
523  featsize = 1;
524  while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
525  featsize * wtf->in_veclen, wtf->infh)) != 0) {
526  int i, nfr = n / wtf->in_veclen;
527  if (n % wtf->in_veclen) {
528  E_ERROR("Size of file %d not a multiple of veclen %d\n",
529  n, wtf->in_veclen);
530  return -1;
531  }
532  /* Byteswap stuff here if necessary. */
533  if (wtf->byteswap) {
534  for (i = 0; i < n; ++i)
535  SWAP_FLOAT32(wtf->feat[0] + i);
536  }
537  fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
538  for (i = 0; i < nfr; ++i) {
539  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
540  if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
541  fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
542  else
543  fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
544  }
545  else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
546  fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
547  }
548  }
549  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
550  return -1;
551  nfloat += n;
552  }
553 
554  if (fclose(wtf->infh) == EOF)
555  E_ERROR_SYSTEM("Failed to close input file");
556  wtf->infh = NULL;
557  return nfloat;
558 }
559 
560 static const audio_type_t types[] = {
561 #ifdef HAVE_SNDFILE_H
562  { "-sndfile", &detect_sndfile, &decode_sndfile },
563 #endif
564  { "-mswav", &detect_riff, &decode_pcm },
565  { "-nist", &detect_nist, &decode_pcm },
566  { "-raw", &detect_raw, &decode_pcm },
567  { "-sph2pipe", &detect_sph2pipe, &decode_pcm }
568 };
569 static const int ntypes = sizeof(types)/sizeof(types[0]);
570 static const audio_type_t mfcc_type = {
571  "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
572 };
573 
579 static int
580 output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
581 {
582  if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
583  E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
584  return -1;
585  }
586  return 0;
587 }
588 
594 static int
595 output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
596 {
597  int i, nfloat = 0;
598 
599  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
600  for (i = 0; i < nfr; ++i) {
601  if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
602  E_ERROR_SYSTEM("Writing %d values to %s failed",
603  wtf->veclen, wtf->outfile);
604  return -1;
605  }
606  nfloat += wtf->veclen;
607  }
608  return nfloat;
609 }
610 
/**
 * HTK parameter kinds (the base code of the parameter-kind field).
 */
typedef enum htk_feature_kind_e {
    /* PCM audio (rarely used) */
    WAVEFORM = 0,
    /* LPC filter coefficients */
    LPC = 1,
    /* LPC reflection coefficients */
    LPCREFC = 2,
    /* LPC-based cepstral coefficients */
    LPCEPSTRA = 3,
    /* LPCC plus deltas */
    LPCDELCEP = 4,
    /* 16-bit integer LPC reflection coefficients */
    IREFC = 5,
    /* MFCCs */
    MFCC = 6,
    /* Log mel spectrum */
    FBANK = 7,
    /* Linear mel spectrum */
    MELSPEC = 8,
    /* User defined */
    USER = 9,
    /* Vector quantized data */
    DISCRETE = 10,
    /* PLP coefficients */
    PLP = 11
} htk_feature_kind_t;
625 
/**
 * HTK parameter-kind qualifier flags, OR-ed onto a htk_feature_kind_t.
 * The octal value of each bit is given alongside.
 */
typedef enum htk_feature_flag_e {
    _E = 1 << 6,  /* 0000100: has energy */
    _N = 1 << 7,  /* 0000200: absolute energy suppressed */
    _D = 1 << 8,  /* 0000400: has delta coefficients */
    _A = 1 << 9,  /* 0001000: has acceleration (delta-delta) coefficients */
    _C = 1 << 10, /* 0002000: is compressed */
    _Z = 1 << 11, /* 0004000: has zero mean static coefficients (i.e. CMN) */
    _K = 1 << 12, /* 0010000: has CRC checksum */
    _O = 1 << 13, /* 0020000: has 0th cepstral coefficient */
    _V = 1 << 14, /* 0040000: has VQ data */
    _T = 1 << 15  /* 0100000: has third differential coefficients */
} htk_feature_flag_t;
638 
642 static int
643 output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
644 {
645  int32 samp_period;
646  int16 samp_size;
647  int16 param_kind;
648  int swap = FALSE;
649 
650  /* HTK files are big-endian. */
651  if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
652  swap = TRUE;
653  /* Same file size thing as in Sphinx files (I think) */
654  if (swap) SWAP_INT32(&nfloat);
655  if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
656  return -1;
657  /* Sample period in 100ns units. */
658  samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
659  if (swap) SWAP_INT32(&samp_period);
660  if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
661  return -1;
662  /* Sample size - veclen * sizeof each sample. */
663  samp_size = wtf->veclen * 4;
664  if (swap) SWAP_INT16(&samp_size);
665  if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
666  return -1;
667  /* Format and flags. */
668  if (cmd_ln_boolean_r(wtf->config, "-logspec")
669  || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
670  param_kind = FBANK; /* log mel-filter bank outputs */
671  else
672  param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
673  if (swap) SWAP_INT16(&param_kind);
674  if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
675  return -1;
676 
677  return 0;
678 }
679 
683 static int
684 output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
685 {
686  int i, j, swap, htk_reorder, nfloat = 0;
687 
688  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
689  /* This is possibly inefficient, but probably not a big deal. */
690  swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
691  htk_reorder = (0 == strcmp("htk", wtf->ot->name)
692  && !(cmd_ln_boolean_r(wtf->config, "-logspec")
693  || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
694  for (i = 0; i < nfr; ++i) {
695  if (htk_reorder) {
696  mfcc_t c0 = frames[i][0];
697  memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4);
698  frames[i][wtf->veclen - 1] = c0;
699  }
700  if (swap)
701  for (j = 0; j < wtf->veclen; ++j)
702  SWAP_FLOAT32(frames[i] + j);
703  if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
704  E_ERROR_SYSTEM("Writing %d values to %s failed",
705  wtf->veclen, wtf->outfile);
706  return -1;
707  }
708  nfloat += wtf->veclen;
709  }
710  return nfloat;
711 }
712 
716 static int
717 output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
718 {
719  int i, j, nfloat = 0;
720 
721  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
722  for (i = 0; i < nfr; ++i) {
723  for (j = 0; j < wtf->veclen; ++j) {
724  fprintf(wtf->outfh, "%.5g", frames[i][j]);
725  if (j == wtf->veclen - 1)
726  fprintf(wtf->outfh, "\n");
727  else
728  fprintf(wtf->outfh, " ");
729  }
730  nfloat += wtf->veclen;
731  }
732  return nfloat;
733 }
734 
735 static const output_type_t outtypes[] = {
736  { "sphinx", &output_header_sphinx, &output_frames_sphinx },
737  { "htk", &output_header_htk, &output_frames_htk },
738  { "text", NULL, &output_frames_text }
739 };
740 static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
741 
743 sphinx_wave2feat_init(cmd_ln_t *config)
744 {
745  sphinx_wave2feat_t *wtf;
746  int i;
747 
748  wtf = ckd_calloc(1, sizeof(*wtf));
749  wtf->refcount = 1;
750  wtf->config = cmd_ln_retain(config);
751  wtf->fe = fe_init_auto_r(wtf->config);
752  wtf->ot = outtypes; /* Default (sphinx) type. */
753  for (i = 0; i < nouttypes; ++i) {
754  output_type_t const *otype = &outtypes[i];
755  if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
756  wtf->ot = otype;
757  break;
758  }
759  }
760  if (i == nouttypes) {
761  E_ERROR("Unknown output type: '%s'\n",
762  cmd_ln_str_r(config, "-ofmt"));
763  sphinx_wave2feat_free(wtf);
764  return NULL;
765  }
766 
767  return wtf;
768 }
769 
770 int
771 sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
772 {
773  if (wtf == NULL)
774  return 0;
775  if (--wtf->refcount > 0)
776  return wtf->refcount;
777 
778  ckd_free(wtf->audio);
779  ckd_free_2d(wtf->feat);
780  ckd_free(wtf->infile);
781  ckd_free(wtf->outfile);
782  if (wtf->infh) {
783  if (fclose(wtf->infh) == EOF)
784  E_ERROR_SYSTEM("Failed to close input file");
785  }
786  if (wtf->outfh) {
787  if (fclose(wtf->outfh) == EOF)
788  E_ERROR_SYSTEM("Failed to close output file");
789  }
790  cmd_ln_free_r(wtf->config);
791  fe_free(wtf->fe);
792  ckd_free(wtf);
793 
794  return 0;
795 }
796 
798 sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
799 {
800  ++wtf->refcount;
801  return wtf;
802 }
803 
804 static audio_type_t const *
805 detect_audio_type(sphinx_wave2feat_t *wtf, char const *infile)
806 {
807  audio_type_t const *atype;
808  int i;
809 
810  /* Special case audio type for Sphinx MFCC inputs. */
811  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
812  || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
813  int rv = mfcc_type.detect(wtf, infile);
814  if (rv == -1)
815  goto error_out;
816  return &mfcc_type;
817  }
818 
819  /* Try to use the type of infile given on the command line. */
820  for (i = 0; i < ntypes; ++i) {
821  int rv;
822  atype = &types[i];
823  if (cmd_ln_boolean_r(wtf->config, atype->name)) {
824  rv = (*atype->detect)(wtf, infile);
825  if (rv == -1)
826  goto error_out;
827  else if (rv == TRUE)
828  break;
829  }
830  }
831  if (i == ntypes) {
832  /* Detect file type of infile and get parameters. */
833  for (i = 0; i < ntypes; ++i) {
834  int rv;
835  atype = &types[i];
836  rv = (*atype->detect)(wtf, infile);
837  if (rv == -1)
838  goto error_out;
839  else if (rv == TRUE)
840  break;
841  }
842  if (i == ntypes)
843  goto error_out;
844  }
845  return atype;
846  error_out:
847  if (wtf->infh)
848  fclose(wtf->infh);
849  wtf->infh = NULL;
850  return NULL;
851 }
852 
853 int
854 sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
855  char const *infile, char const *outfile)
856 {
857  int nchans, minfft, nfft, nfloat, veclen;
858  audio_type_t const *atype;
859  int fshift, fsize;
860 
861  if (cmd_ln_boolean_r(wtf->config, "-verbose"))
862  E_INFO("Converting %s to %s\n", infile, outfile);
863 
864  /* Detect input file type. */
865  if ((atype = detect_audio_type(wtf, infile)) == NULL)
866  return -1;
867 
868  /* Determine whether to byteswap input. */
869  wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
870  cmd_ln_str_r(wtf->config, "-input_endian"));
871 
872  /* Make sure the FFT size is sufficiently large. */
873  minfft = (int)(cmd_ln_float32_r(wtf->config, "-samprate")
874  * cmd_ln_float32_r(wtf->config, "-wlen") + 0.5);
875  for (nfft = 1; nfft < minfft; nfft <<= 1)
876  ;
877  if (nfft > cmd_ln_int32_r(wtf->config, "-nfft")) {
878  E_WARN("Value of -nfft = %d is too small, increasing to %d\n",
879  cmd_ln_int32_r(wtf->config, "-nfft"), nfft);
880  cmd_ln_set_int32_r(wtf->config, "-nfft", nfft);
881  fe_free(wtf->fe);
882  wtf->fe = fe_init_auto_r(wtf->config);
883  }
884 
885  /* Get the output frame size (if not already set). */
886  if (wtf->veclen == 0)
887  wtf->veclen = fe_get_output_size(wtf->fe);
888 
889  /* Set up the input and output buffers. */
890  fe_get_input_size(wtf->fe, &fshift, &fsize);
891  /* Want to get at least a whole frame plus shift in here. Also we
892  will either pick or mix multiple channels so we need to read
893  them all at once. */
894  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
895  wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans;
896  if (wtf->blocksize < (fsize + fshift) * nchans) {
897  E_INFO("Block size of %d too small, increasing to %d\n",
898  wtf->blocksize,
899  (fsize + fshift) * nchans);
900  wtf->blocksize = (fsize + fshift) * nchans;
901  }
902  wtf->audio = ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
903  wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift;
904 
905  /* Use the maximum of the input and output frame sizes to allocate this. */
906  veclen = wtf->veclen;
907  if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
908  wtf->feat = ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
909 
910  /* Let's go! */
911  if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
912  E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
913  return -1;
914  }
915  /* Write an empty header, which we'll fill in later. */
916  if (wtf->ot->output_header &&
917  (*wtf->ot->output_header)(wtf, 0) < 0) {
918  E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
919  goto error_out;
920  }
921  wtf->outfile = ckd_salloc(outfile);
922 
923  if ((nfloat = (*atype->decode)(wtf)) < 0)
924  return -1;
925 
926  if (wtf->ot->output_header) {
927  if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
928  E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
929  goto error_out;
930  }
931  if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
932  E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
933  goto error_out;
934  }
935  }
936  if (fclose(wtf->outfh) == EOF)
937  E_ERROR_SYSTEM("Failed to close output file");
938  wtf->outfh = NULL;
939 
940  return 0;
941 error_out:
942  if (wtf->outfh) {
943  fclose(wtf->outfh);
944  wtf->outfh = NULL;
945  }
946  return -1;
947 }
948 
949 void
950 build_filenames(cmd_ln_t *config, char const *basename,
951  char **out_infile, char **out_outfile)
952 {
953  char const *di, *do_, *ei, *eo;
954 
955  di = cmd_ln_str_r(config, "-di");
956  do_ = cmd_ln_str_r(config, "-do");
957  ei = cmd_ln_str_r(config, "-ei");
958  eo = cmd_ln_str_r(config, "-eo");
959 
960  *out_infile = string_join(di ? di : "",
961  di ? "/" : "",
962  basename,
963  ei ? "." : "",
964  ei ? ei : "",
965  NULL);
966  *out_outfile = string_join(do_ ? do_ : "",
967  do_ ? "/" : "",
968  basename,
969  eo ? "." : "",
970  eo ? eo : "",
971  NULL);
972  /* Build output directory structure if possible/requested (it is
973  * by default). */
974  if (cmd_ln_boolean_r(config, "-build_outdirs")) {
975  char *dirname = ckd_salloc(*out_outfile);
976  path2dirname(*out_outfile, dirname);
977  build_directory(dirname);
978  ckd_free(dirname);
979  }
980 }
981 
982 static int
983 run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
984 {
985  hash_table_t *files;
986  hash_iter_t *itor;
987  lineiter_t *li;
988  FILE *ctlfh;
989  int nskip, runlen, npart, rv = 0;
990 
991  if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
992  E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
993  return -1;
994  }
995  nskip = cmd_ln_int32_r(wtf->config, "-nskip");
996  runlen = cmd_ln_int32_r(wtf->config, "-runlen");
997  if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
998  /* Count lines in the file. */
999  int partlen, part, nlines = 0;
1000  part = cmd_ln_int32_r(wtf->config, "-part");
1001  for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
1002  ++nlines;
1003  fseek(ctlfh, 0, SEEK_SET);
1004  partlen = nlines / npart;
1005  nskip = partlen * (part - 1);
1006  if (part == npart)
1007  runlen = -1;
1008  else
1009  runlen = partlen;
1010  }
1011  if (runlen != -1){
1012  E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
1013  files = hash_table_new(runlen, HASH_CASE_YES);
1014  }
1015  else {
1016  E_INFO("Processing all remaining utterances at position %d\n", nskip);
1017  files = hash_table_new(1000, HASH_CASE_YES);
1018  }
1019  for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
1020  char *c, *infile, *outfile;
1021 
1022  if (nskip-- > 0)
1023  continue;
1024  if (runlen == 0) {
1025  lineiter_free(li);
1026  break;
1027  }
1028  --runlen;
1029 
1030  string_trim(li->buf, STRING_BOTH);
1031  /* Extract the file ID from the control line. */
1032  if ((c = strchr(li->buf, ' ')) != NULL)
1033  *c = '\0';
1034  build_filenames(wtf->config, li->buf, &infile, &outfile);
1035  if (hash_table_lookup(files, infile, NULL) == 0)
1036  continue;
1037  rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
1038  hash_table_enter(files, infile, outfile);
1039  if (rv != 0) {
1040  lineiter_free(li);
1041  if (fclose(ctlfh) == EOF)
1042  E_ERROR_SYSTEM("Failed to close control file");
1043  break;
1044  }
1045  }
1046  for (itor = hash_table_iter(files); itor;
1047  itor = hash_table_iter_next(itor)) {
1048  ckd_free((void *)hash_entry_key(itor->ent));
1049  ckd_free(hash_entry_val(itor->ent));
1050  }
1051  hash_table_free(files);
1052  return rv;
1053 }
1054 
1055 int
1056 main(int argc, char *argv[])
1057 {
1058  sphinx_wave2feat_t *wtf;
1059  cmd_ln_t *config;
1060  int rv;
1061 
1062  /* Initialize config. */
1063  if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
1064  return 2;
1065 
1066  /* Parse an argument file if there's one in there. */
1067  if (cmd_ln_str_r(config, "-argfile"))
1068  config = cmd_ln_parse_file_r(config, defn,
1069  cmd_ln_str_r(config, "-argfile"), FALSE);
1070  if (config == NULL) {
1071  E_ERROR("Command line parsing failed\n");
1072  return 1;
1073  }
1074  if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
1075  E_ERROR("Failed to initialize wave2feat object\n");
1076  return 1;
1077  }
1078 
1079  /* If there's a control file run through it, otherwise we will do
1080  * a single file (which is what run_control_file will do
1081  * internally too) */
1082  if (cmd_ln_str_r(config, "-c"))
1083  rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
1084  else
1085  rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
1086  cmd_ln_str_r(config, "-o"));
1087 
1088  sphinx_wave2feat_free(wtf);
1089  return rv;
1090 }