SphinxBase  0.6
ngram_model.c
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/*
 * \file ngram_model.c N-Gram language models.
 *
 * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
 */

#include <config.h>

#include <string.h>
#include <assert.h>

#ifdef HAVE_ICONV
#include <iconv.h>
#include <errno.h>  /* errno is consulted by ngram_model_recode() */
#endif

#include "sphinxbase/ngram_model.h"
#include "sphinxbase/ckd_alloc.h"
#include "sphinxbase/filename.h"
#include "sphinxbase/pio.h"
#include "sphinxbase/err.h"
#include "sphinxbase/logmath.h"
#include "sphinxbase/strfuncs.h"
#include "sphinxbase/case.h"

#include "ngram_model_internal.h"

ngram_file_type_t
ngram_file_name_to_type(const char *file_name)
{
    const char *ext;

    ext = strrchr(file_name, '.');
    if (ext == NULL) {
        return NGRAM_INVALID;
    }
    if (0 == strcmp_nocase(ext, ".gz")) {
        while (--ext >= file_name) {
            if (*ext == '.') break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    else if (0 == strcmp_nocase(ext, ".bz2")) {
        while (--ext >= file_name) {
            if (*ext == '.') break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    /* We use strncmp because there might be a .gz on the end. */
    if (0 == strncmp_nocase(ext, ".ARPA", 5))
        return NGRAM_ARPA;
    if (0 == strncmp_nocase(ext, ".DMP", 4))
        return NGRAM_DMP;
    return NGRAM_INVALID;
}

ngram_file_type_t
ngram_str_to_type(const char *str_name)
{
    if (0 == strcmp_nocase(str_name, "arpa"))
        return NGRAM_ARPA;
    if (0 == strcmp_nocase(str_name, "dmp"))
        return NGRAM_DMP;
    return NGRAM_INVALID;
}

char const *
ngram_type_to_str(int type)
{
    switch (type) {
    case NGRAM_ARPA:
        return "arpa";
    case NGRAM_DMP:
        return "dmp";
    default:
        return NULL;
    }
}

ngram_model_t *
ngram_model_read(cmd_ln_t *config,
                 const char *file_name,
                 ngram_file_type_t file_type,
                 logmath_t *lmath)
{
    ngram_model_t *model = NULL;

    switch (file_type) {
    case NGRAM_AUTO: {
        if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
            break;
        if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
            break;
        return NULL;
    }
    case NGRAM_ARPA:
        model = ngram_model_arpa_read(config, file_name, lmath);
        break;
    case NGRAM_DMP:
        model = ngram_model_dmp_read(config, file_name, lmath);
        break;
    default:
        E_ERROR("language model file type not supported\n");
        return NULL;
    }

    /* Now set weights based on config if present. */
    if (config) {
        float32 lw = 1.0;
        float32 wip = 1.0;
        float32 uw = 1.0;

        if (cmd_ln_exists_r(config, "-lw"))
            lw = cmd_ln_float32_r(config, "-lw");
        if (cmd_ln_exists_r(config, "-wip"))
            wip = cmd_ln_float32_r(config, "-wip");
        if (cmd_ln_exists_r(config, "-uw"))
            uw = cmd_ln_float32_r(config, "-uw");

        ngram_model_apply_weights(model, lw, wip, uw);
    }

    return model;
}
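
/*
 * Illustrative usage sketch (not from the original file; "model.lm" and
 * the logmath parameters are hypothetical): load a model with automatic
 * type detection and no configuration object, then release it.
 *
 *     logmath_t *lmath = logmath_init(1.0001, 0, 0);
 *     ngram_model_t *lm =
 *         ngram_model_read(NULL, "model.lm", NGRAM_AUTO, lmath);
 *     if (lm == NULL)
 *         E_FATAL("Failed to read language model\n");
 *     ngram_model_free(lm);
 *     logmath_free(lmath);
 */
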
int
ngram_model_write(ngram_model_t *model, const char *file_name,
                  ngram_file_type_t file_type)
{
    switch (file_type) {
    case NGRAM_AUTO: {
        file_type = ngram_file_name_to_type(file_name);
        /* Default to ARPA (catches .lm and other things) */
        if (file_type == NGRAM_INVALID)
            file_type = NGRAM_ARPA;
        return ngram_model_write(model, file_name, file_type);
    }
    case NGRAM_ARPA:
        return ngram_model_arpa_write(model, file_name);
    case NGRAM_DMP:
        return ngram_model_dmp_write(model, file_name);
    default:
        E_ERROR("language model file type not supported\n");
        return -1;
    }
    /* Not reached: every case above returns. */
    E_ERROR("language model file type not supported\n");
    return -1;
}

int32
ngram_model_init(ngram_model_t *base,
                 ngram_funcs_t *funcs,
                 logmath_t *lmath,
                 int32 n, int32 n_unigram)
{
    base->refcount = 1;
    base->funcs = funcs;
    base->n = n;
    /* If this was previously initialized... */
    if (base->n_counts == NULL)
        base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
    /* Don't reset weights if logmath object hasn't changed. */
    if (base->lmath != lmath) {
        /* Set default values for weights. */
        base->lw = 1.0;
        base->log_wip = 0;      /* i.e. 1.0 */
        base->log_uw = 0;       /* i.e. 1.0 */
        base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
        base->log_uniform_weight = logmath_get_zero(lmath);
        base->log_zero = logmath_get_zero(lmath);
        base->lmath = lmath;
    }
    /* Allocate or reallocate space for word strings. */
    if (base->word_str) {
        /* Free all previous word strings if they were allocated. */
        if (base->writable) {
            int32 i;
            for (i = 0; i < base->n_words; ++i) {
                ckd_free(base->word_str[i]);
                base->word_str[i] = NULL;
            }
        }
        base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
    }
    else
        base->word_str = ckd_calloc(n_unigram, sizeof(char *));
    /* NOTE: Word lookups are no longer case-insensitive, since we are
     * allowing other encodings for word strings.  Beware. */
    if (base->wid)
        hash_table_empty(base->wid);
    else
        base->wid = hash_table_new(n_unigram, FALSE);
    base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;

    return 0;
}

ngram_model_t *
ngram_model_retain(ngram_model_t *model)
{
    ++model->refcount;
    return model;
}


void
ngram_model_flush(ngram_model_t *model)
{
    if (model->funcs && model->funcs->flush)
        (*model->funcs->flush)(model);
}

int
ngram_model_free(ngram_model_t *model)
{
    int i;

    if (model == NULL)
        return 0;
    if (--model->refcount > 0)
        return model->refcount;
    if (model->funcs && model->funcs->free)
        (*model->funcs->free)(model);
    if (model->writable) {
        /* Free all words. */
        for (i = 0; i < model->n_words; ++i) {
            ckd_free(model->word_str[i]);
        }
    }
    else {
        /* Free all class words. */
        for (i = 0; i < model->n_classes; ++i) {
            ngram_class_t *lmclass;
            int32 j;

            lmclass = model->classes[i];
            for (j = 0; j < lmclass->n_words; ++j) {
                ckd_free(model->word_str[lmclass->start_wid + j]);
            }
            for (j = 0; j < lmclass->n_hash; ++j) {
                if (lmclass->nword_hash[j].wid != -1) {
                    ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
                }
            }
        }
    }
    for (i = 0; i < model->n_classes; ++i) {
        ngram_class_free(model->classes[i]);
    }
    ckd_free(model->classes);
    hash_table_free(model->wid);
    ckd_free(model->word_str);
    ckd_free(model->n_counts);
    ckd_free(model);
    return 0;
}
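
/*
 * Sketch of the reference-counting contract implemented above: every
 * ngram_model_retain() must be balanced by an ngram_model_free(), and
 * the model is destroyed only when the count drops to zero.
 *
 *     ngram_model_t *ref = ngram_model_retain(lm);  -- refcount is now 2
 *     ngram_model_free(ref);                        -- back to 1, still alive
 *     ngram_model_free(lm);                         -- 0, model is destroyed
 */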

int
ngram_model_casefold(ngram_model_t *model, int kase)
{
    int writable, i;
    hash_table_t *new_wid;

    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;

    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        char *outstr;
        if (writable) {
            outstr = model->word_str[i];
        }
        else {
            outstr = ckd_salloc(model->word_str[i]);
        }
        /* Don't case-fold <tags> or [classes] */
        if (outstr[0] == '<' || outstr[0] == '[') {
            /* Leave it alone. */
        }
        else {
            switch (kase) {
            case NGRAM_UPPER:
                ucase(outstr);
                break;
            case NGRAM_LOWER:
                lcase(outstr);
                break;
            default:
                ;
            }
        }
        model->word_str[i] = outstr;

        /* Now update the hash table.  We might have terrible
         * collisions here, so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;
    return 0;
}
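
/*
 * Usage sketch: fold all word strings to upper case, e.g. to match a
 * dictionary with upper-case entries.  As implemented above, words
 * beginning with '<' or '[' (tags and classes) are left untouched.
 *
 *     ngram_model_casefold(lm, NGRAM_UPPER);
 */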

#ifdef HAVE_ICONV
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    iconv_t ic;
    char *outbuf;
    size_t maxlen;
    int i, writable;
    hash_table_t *new_wid;

    /* FIXME: Need to do a special case thing for the GB-HEX encoding
     * used in Sphinx3 Mandarin models. */
    if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
        E_ERROR_SYSTEM("iconv_open() failed");
        return -1;
    }
    /* iconv(3) is a piece of crap and won't accept a NULL out buffer,
     * unlike wcstombs(3).  So we have to either call it over and over
     * again until our buffer is big enough, or call it with a huge
     * buffer and then copy things back to the output.  We will use a
     * mix of these two approaches here.  We'll keep a single big
     * buffer around, and expand it as necessary.
     */
    maxlen = 0;
    for (i = 0; i < model->n_words; ++i) {
        if (strlen(model->word_str[i]) > maxlen)
            maxlen = strlen(model->word_str[i]);
    }
    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;
    /* Really should be big enough except for pathological cases. */
    maxlen = maxlen * sizeof(int) + 15;
    outbuf = ckd_calloc(maxlen, 1);
    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        ICONV_CONST char *in;
        char *out;
        size_t inleft, outleft, result;

    start_conversion:
        in = (ICONV_CONST char *)model->word_str[i];
        /* Yes, this assumes that we don't have any NUL bytes. */
        inleft = strlen(in);
        out = outbuf;
        outleft = maxlen;

        while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed");
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            out = outbuf = ckd_realloc(outbuf, maxlen);
            /* Reset the input and output pointers. */
            in = (ICONV_CONST char *)model->word_str[i];
            inleft = strlen(in);
            outleft = maxlen;
        }

        /* Now flush a shift-out sequence, if any. */
        if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            outbuf = ckd_realloc(outbuf, maxlen);
            /* Be very evil. */
            goto start_conversion;
        }

        result = maxlen - outleft;
        /* Okay, that was hard, now let's go shopping. */
        if (writable) {
            /* Grow or shrink the output string as necessary. */
            model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
            model->word_str[i][result] = '\0';
        }
        else {
            /* It actually was not allocated previously, so do that now. */
            model->word_str[i] = ckd_calloc(result + 1, 1);
        }
        /* Copy the new thing in. */
        memcpy(model->word_str[i], outbuf, result);

        /* Now update the hash table.  We might have terrible
         * collisions if a non-reversible conversion was requested,
         * so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    ckd_free(outbuf);
    iconv_close(ic);
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;

    return 0;
}
#else /* !HAVE_ICONV */
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    return -1;
}
#endif /* !HAVE_ICONV */
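
/*
 * Usage sketch (only effective when SphinxBase was built with iconv;
 * the encoding names are examples): convert all word strings from
 * GB2312 to UTF-8, returning -1 on failure or when iconv is absent.
 *
 *     if (ngram_model_recode(lm, "gb2312", "utf-8") < 0)
 *         E_ERROR("Recoding failed\n");
 */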

int
ngram_model_apply_weights(ngram_model_t *model,
                          float32 lw, float32 wip, float32 uw)
{
    return (*model->funcs->apply_weights)(model, lw, wip, uw);
}

float32
ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
                        int32 *out_log_uw)
{
    if (out_log_wip) *out_log_wip = model->log_wip;
    if (out_log_uw) *out_log_uw = model->log_uw;
    return model->lw;
}
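
/*
 * Usage sketch (the weight values are illustrative only): apply a
 * language weight, word insertion penalty, and unigram weight, then
 * read them back in their stored form (lw as a float, wip and uw as
 * log values).
 *
 *     int32 log_wip, log_uw;
 *     float32 lw;
 *     ngram_model_apply_weights(lm, 9.5, 0.5, 1.0);
 *     lw = ngram_model_get_weights(lm, &log_wip, &log_uw);
 */
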
int32
ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
               int32 n_hist, int32 *n_used)
{
    int32 score, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1) /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    score = (*model->funcs->score)(model, wid, history, n_hist, n_used);

    /* Multiply by unigram in-class weight. */
    return score + class_weight;
}

int32
ngram_score(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_score(model, ngram_wid(model, word),
                          histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}
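
/*
 * Usage sketch: the varargs interface takes the predicted word first,
 * then its history in reverse order, terminated by NULL.  The two
 * calls below score the same trigram "the quick brown" (predicting
 * "brown"), once by string and once by word ID.
 *
 *     int32 n_used;
 *     int32 s1 = ngram_score(lm, "brown", "quick", "the", NULL);
 *     int32 s2 = ngram_tg_score(lm, ngram_wid(lm, "brown"),
 *                               ngram_wid(lm, "quick"),
 *                               ngram_wid(lm, "the"), &n_used);
 */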

int32
ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
{
    int32 hist[2];
    hist[0] = w2;
    hist[1] = w1;
    return ngram_ng_score(model, w3, hist, 2, n_used);
}

int32
ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
{
    return ngram_ng_score(model, w2, &w1, 1, n_used);
}

int32
ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
              int32 n_hist, int32 *n_used)
{
    int32 prob, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1) /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    prob = (*model->funcs->raw_score)(model, wid, history,
                                      n_hist, n_used);
    /* Multiply by unigram in-class weight. */
    return prob + class_weight;
}

int32
ngram_prob(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_prob(model, ngram_wid(model, word),
                         histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

int32
ngram_score_to_prob(ngram_model_t *base, int32 score)
{
    int32 prob;

    /* Undo insertion penalty. */
    prob = score - base->log_wip;
    /* Undo language weight. */
    prob = (int32)(prob / base->lw);

    return prob;
}

int32
ngram_unknown_wid(ngram_model_t *model)
{
    int32 val;

    /* FIXME: This could be memoized for speed if necessary. */
    /* Look up <UNK>, and if not found return NGRAM_INVALID_WID. */
    if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
        return NGRAM_INVALID_WID;
    else
        return val;
}

int32
ngram_zero(ngram_model_t *model)
{
    return model->log_zero;
}

int32
ngram_model_get_size(ngram_model_t *model)
{
    if (model != NULL)
        return model->n;
    return 0;
}

int32 const *
ngram_model_get_counts(ngram_model_t *model)
{
    if (model != NULL)
        return model->n_counts;
    return NULL;
}

void
ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
                int m, int successor)
{
    itor->model = model;
    itor->wids = ckd_calloc(model->n, sizeof(*itor->wids));
    itor->m = m;
    itor->successor = successor;
}

ngram_iter_t *
ngram_model_mgrams(ngram_model_t *model, int m)
{
    ngram_iter_t *itor;
    /* The fact that m=n-1 is not exactly obvious.  Prevent accidents. */
    if (m >= model->n)
        return NULL;
    if (model->funcs->mgrams == NULL)
        return NULL;
    itor = (*model->funcs->mgrams)(model, m);
    return itor;
}

ngram_iter_t *
ngram_iter(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    ngram_iter_t *itor;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist);
    ckd_free(histid);
    return itor;
}

ngram_iter_t *
ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
{
    if (n_hist >= model->n)
        return NULL;
    if (model->funcs->iter == NULL)
        return NULL;
    return (*model->funcs->iter)(model, wid, history, n_hist);
}

ngram_iter_t *
ngram_iter_successors(ngram_iter_t *itor)
{
    /* Stop when we are at the highest order N-Gram. */
    if (itor->m == itor->model->n - 1)
        return NULL;
    return (*itor->model->funcs->successors)(itor);
}

int32 const *
ngram_iter_get(ngram_iter_t *itor,
               int32 *out_score,
               int32 *out_bowt)
{
    return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt);
}

ngram_iter_t *
ngram_iter_next(ngram_iter_t *itor)
{
    return (*itor->model->funcs->iter_next)(itor);
}

void
ngram_iter_free(ngram_iter_t *itor)
{
    ckd_free(itor->wids);
    (*itor->model->funcs->iter_free)(itor);
}
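
/*
 * Usage sketch: enumerate all bigrams (m = 1) in a model.  If the loop
 * is abandoned before ngram_iter_next() returns NULL, call
 * ngram_iter_free() on the live iterator.
 *
 *     ngram_iter_t *itor;
 *     for (itor = ngram_model_mgrams(lm, 1); itor;
 *          itor = ngram_iter_next(itor)) {
 *         int32 score, bowt;
 *         int32 const *wids = ngram_iter_get(itor, &score, &bowt);
 *         printf("%s %s %d\n", ngram_word(lm, wids[0]),
 *                ngram_word(lm, wids[1]), score);
 *     }
 */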

int32
ngram_wid(ngram_model_t *model, const char *word)
{
    int32 val;

    if (hash_table_lookup_int32(model->wid, word, &val) == -1)
        return ngram_unknown_wid(model);
    else
        return val;
}

const char *
ngram_word(ngram_model_t *model, int32 wid)
{
    /* Remove any class tag */
    wid = NGRAM_BASEWID(wid);
    if (wid >= model->n_words)
        return NULL;
    return model->word_str[wid];
}

int32
ngram_add_word_internal(ngram_model_t *model,
                        const char *word,
                        int32 classid)
{
    void *dummy;
    int32 wid;

    /* Take the next available word ID */
    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }
    /* Check for hash collisions. */
    if (hash_table_lookup(model->wid, word, &dummy) == 0) {
        E_ERROR("Duplicate definition of word %s\n", word);
        return NGRAM_INVALID_WID;
    }
    /* Reallocate word_str if necessary. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) * model->n_1g_alloc);
    }
    /* Add the word string in the appropriate manner. */
    /* Class words are always dynamically allocated. */
    model->word_str[model->n_words] = ckd_salloc(word);
    /* Now enter it into the hash table. */
    if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
                model->word_str[model->n_words], (void *)(long)(wid));
    }
    /* Increment number of words. */
    ++model->n_words;
    return wid;
}

int32
ngram_model_add_word(ngram_model_t *model,
                     const char *word, float32 weight)
{
    int32 wid, prob = model->log_zero;

    wid = ngram_add_word_internal(model, word, -1);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* Do what needs to be done to add the word to the unigram. */
    if (model->funcs && model->funcs->add_ug)
        prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
    if (prob == 0) {
        if (model->writable)
            ckd_free(model->word_str[wid]);
        return -1;
    }
    return wid;
}
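
/*
 * Usage sketch: add a previously unseen unigram at run time.  The
 * weight is passed through logmath_log() to the model's add_ug
 * implementation, so a weight of 1.0 contributes zero in the log
 * domain.
 *
 *     int32 wid = ngram_model_add_word(lm, "sphinxbase", 1.0);
 *     if (wid == NGRAM_INVALID_WID)
 *         E_ERROR("Could not add word\n");
 */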

ngram_class_t *
ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
{
    ngram_class_t *lmclass;
    gnode_t *gn;
    float32 tprob;
    int i;

    lmclass = ckd_calloc(1, sizeof(*lmclass));
    lmclass->tag_wid = tag_wid;
    /* wid_base is the wid (minus class tag) of the first word in the list. */
    lmclass->start_wid = start_wid;
    lmclass->n_words = glist_count(classwords);
    lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
    lmclass->nword_hash = NULL;
    lmclass->n_hash = 0;
    tprob = 0.0;
    for (gn = classwords; gn; gn = gnode_next(gn)) {
        tprob += gnode_float32(gn);
    }
    if (tprob > 1.1 || tprob < 0.9) {
        E_WARN("Total class probability is %f, will normalize\n", tprob);
        for (gn = classwords; gn; gn = gnode_next(gn)) {
            gn->data.fl /= tprob;
        }
    }
    for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
        lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
    }

    return lmclass;
}

int32
ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
{
    int32 hash;

    if (lmclass->nword_hash == NULL) {
        /* Initialize everything in it to -1 */
        lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        lmclass->n_hash = NGRAM_HASH_SIZE;
        lmclass->n_hash_inuse = 0;
    }
    /* Stupidest possible hash function.  This will work pretty well
     * when this function is called repeatedly with contiguous word
     * IDs, though... */
    hash = wid & (lmclass->n_hash - 1);
    if (lmclass->nword_hash[hash].wid == -1) {
        /* Good, no collision. */
        lmclass->nword_hash[hash].wid = wid;
        lmclass->nword_hash[hash].prob1 = lweight;
        ++lmclass->n_hash_inuse;
        return hash;
    }
    else {
        int32 next;
        /* Collision... Find the end of the hash chain. */
        while (lmclass->nword_hash[hash].next != -1)
            hash = lmclass->nword_hash[hash].next;
        assert(hash != -1);
        /* Do we have any more buckets? */
        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* No, so allocate more. */
            lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
                                              lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
            memset(lmclass->nword_hash + lmclass->n_hash,
                   0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
            /* Just use the next allocated one (easy) */
            next = lmclass->n_hash;
            lmclass->n_hash *= 2;
        }
        else {
            /* Look for any available bucket.  We hope this doesn't happen. */
            for (next = 0; next < lmclass->n_hash; ++next)
                if (lmclass->nword_hash[next].wid == -1)
                    break;
            /* This should absolutely not happen. */
            assert(next != lmclass->n_hash);
        }
        lmclass->nword_hash[next].wid = wid;
        lmclass->nword_hash[next].prob1 = lweight;
        lmclass->nword_hash[hash].next = next;
        ++lmclass->n_hash_inuse;
        return next;
    }
}

void
ngram_class_free(ngram_class_t *lmclass)
{
    ckd_free(lmclass->nword_hash);
    ckd_free(lmclass->prob1);
    ckd_free(lmclass);
}

int32
ngram_model_add_class_word(ngram_model_t *model,
                           const char *classname,
                           const char *word,
                           float32 weight)
{
    ngram_class_t *lmclass;
    int32 classid, tag_wid, wid, i, scale;
    float32 fprob;

    /* Find the class corresponding to classname.  Linear search is
     * probably okay here since there won't be very many classes, and
     * this doesn't have to be fast. */
    tag_wid = ngram_wid(model, classname);
    if (tag_wid == NGRAM_INVALID_WID) {
        E_ERROR("No such word or class tag: %s\n", classname);
        return tag_wid;
    }
    for (classid = 0; classid < model->n_classes; ++classid) {
        if (model->classes[classid]->tag_wid == tag_wid)
            break;
    }
    /* Hmm, no such class.  It's probably not a good idea to create one. */
    if (classid == model->n_classes) {
        E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
        return NGRAM_INVALID_WID;
    }
    lmclass = model->classes[classid];

    /* Add this word to the model's set of words. */
    wid = ngram_add_word_internal(model, word, classid);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* This is the fixed probability of the new word. */
    fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
    /* Now normalize everything else to fit it in.  This is
     * accomplished by simply scaling all the other probabilities
     * by (1-fprob). */
    scale = logmath_log(model->lmath, 1.0 - fprob);
    for (i = 0; i < lmclass->n_words; ++i)
        lmclass->prob1[i] += scale;
    for (i = 0; i < lmclass->n_hash; ++i)
        if (lmclass->nword_hash[i].wid != -1)
            lmclass->nword_hash[i].prob1 += scale;

    /* Now add it to the class hash table. */
    return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
}

int32
ngram_model_add_class(ngram_model_t *model,
                      const char *classname,
                      float32 classweight,
                      char **words,
                      const float32 *weights,
                      int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Check if classname already exists in the model.  If not, add it. */
    if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID)
            return -1;
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes * sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}
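
/*
 * Usage sketch (class and word names are hypothetical): create a class
 * [city] with two equally weighted members.  The class tag is added as
 * a unigram if it does not already exist.
 *
 *     char *words[] = { "boston", "seattle" };
 *     float32 weights[] = { 0.5, 0.5 };
 *     int32 classid =
 *         ngram_model_add_class(lm, "[city]", 1.0, words, weights, 2);
 *     if (classid < 0)
 *         E_ERROR("Could not add class\n");
 */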

int32
ngram_class_prob(ngram_class_t *lmclass, int32 wid)
{
    int32 base_wid = NGRAM_BASEWID(wid);

    if (base_wid < lmclass->start_wid
        || base_wid >= lmclass->start_wid + lmclass->n_words) {
        int32 hash;

        /* Look it up in the hash table. */
        hash = wid & (lmclass->n_hash - 1);
        while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
            hash = lmclass->nword_hash[hash].next;
        if (hash == -1)
            return 1;
        return lmclass->nword_hash[hash].prob1;
    }
    else {
        return lmclass->prob1[base_wid - lmclass->start_wid];
    }
}

int32
read_classdef_file(hash_table_t *classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass;
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (!feof(fp)) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (inclass) {
            /* Look for an end of class marker. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
                classdef_t *classdef;
                gnode_t *word, *weight;
                int32 i;

                if (classname == NULL || 0 != strcmp(wptr[1], classname))
                    goto error_out;
                inclass = FALSE;

                /* Construct a class from the list of words collected. */
                classdef = ckd_calloc(1, sizeof(*classdef));
                classwords = glist_reverse(classwords);
                classprobs = glist_reverse(classprobs);
                classdef->n_words = glist_count(classwords);
                classdef->words = ckd_calloc(classdef->n_words,
                                             sizeof(*classdef->words));
                classdef->weights = ckd_calloc(classdef->n_words,
                                               sizeof(*classdef->weights));
                word = classwords;
                weight = classprobs;
                for (i = 0; i < classdef->n_words; ++i) {
                    classdef->words[i] = gnode_ptr(word);
                    classdef->weights[i] = gnode_float32(weight);
                    word = gnode_next(word);
                    weight = gnode_next(weight);
                }

                /* Add this class to the hash table. */
                if (hash_table_enter(classes, classname, classdef) != classdef) {
                    classdef_free(classdef);
                    goto error_out;
                }

                /* Reset everything. */
                glist_free(classwords);
                glist_free(classprobs);
                classwords = NULL;
                classprobs = NULL;
                classname = NULL;
            }
            else {
                float32 fprob;

                if (n_words == 2)
                    fprob = (float32)atof_c(wptr[1]);
                else
                    fprob = 1.0f;
                /* Add it to the list of words for this class. */
                classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
                classprobs = glist_add_float32(classprobs, fprob);
            }
        }
        else {
            /* Start a new LM class if the LMCLASS marker is seen */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                if (inclass)
                    goto error_out;
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }
            /* Otherwise, just ignore whatever junk we got */
        }
    }
    rv = 0;                     /* Success. */

error_out:
    /* Free all the stuff we might have allocated. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}
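
/*
 * Sketch of the class definition format parsed above: each class is
 * bracketed by LMCLASS/END markers naming the class tag, with one word
 * per line followed by an optional in-class probability (1.0 if
 * omitted).  Names below are illustrative.
 *
 *     LMCLASS [city]
 *     boston   0.5
 *     seattle  0.5
 *     END [city]
 */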

void
classdef_free(classdef_t *classdef)
{
    int32 i;
    for (i = 0; i < classdef->n_words; ++i)
        ckd_free(classdef->words[i]);
    ckd_free(classdef->words);
    ckd_free(classdef->weights);
    ckd_free(classdef);
}

int32
ngram_model_read_classdef(ngram_model_t *model,
                          const char *file_name)
{
    hash_table_t *classes;
    glist_t hl = NULL;
    gnode_t *gn;
    int32 rv = -1;

    classes = hash_table_new(0, FALSE);
    if (read_classdef_file(classes, file_name) < 0) {
        hash_table_free(classes);
        return -1;
    }

    /* Create a new class in the language model for each classdef. */
    hl = hash_table_tolist(classes, NULL);
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        classdef_t *classdef = he->val;

        if (ngram_model_add_class(model, he->key, 1.0,
                                  classdef->words,
                                  classdef->weights,
                                  classdef->n_words) < 0)
            goto error_out;
    }
    rv = 0;

error_out:
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        ckd_free((char *)he->key);
        classdef_free(he->val);
    }
    glist_free(hl);
    hash_table_free(classes);
    return rv;
}