Julius 4.1.5
libsent/src/ngram/init_ngram.c
説明を見る。
00001 
00018 /*
00019  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00020  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00021  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00022  * All rights reserved
00023  */
00024 
00025 #include <sent/stddefs.h>
00026 #include <sent/ngram2.h>
00027 #include <sent/vocabulary.h>
00028 
00035 boolean
00036 init_ngram_bin(NGRAM_INFO *ndata, char *bin_ngram_file)
00037 {
00038   FILE *fp;
00039   
00040   jlog("Stat: init_ngram: reading in binary n-gram from %s\n", bin_ngram_file);
00041   if ((fp = fopen_readfile(bin_ngram_file)) == NULL) {
00042     jlog("Error: init_ngram: failed to open \"%s\"\n", bin_ngram_file);
00043     return FALSE;
00044   }
00045   if (ngram_read_bin(fp, ndata) == FALSE) {
00046     jlog("Error: init_ngram: failed to read \"%s\"\n", bin_ngram_file);
00047     return FALSE;
00048   }
00049   if (fclose_readfile(fp) == -1) {
00050     jlog("Error: init_ngram: failed to close \"%s\"\n", bin_ngram_file);
00051     return FALSE;
00052   }
00053   jlog("Stat: init_ngram: finished reading n-gram\n");
00054   return TRUE;
00055 }
00056 
00064 boolean
00065 init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir)
00066 {
00067   FILE *fp;
00068 
00069   ndata->root = NULL;
00070   ndata->dir = dir;
00071 
00072   jlog("Stat: init_ngram: reading in ARPA %s n-gram from %s\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ngram_file);
00073   /* read RL n-gram */
00074   if ((fp = fopen_readfile(ngram_file)) == NULL) {
00075     jlog("Error: init_ngram: failed to open \"%s\"\n", ngram_file);
00076     return FALSE;
00077   }
00078   if (ngram_read_arpa(fp, ndata, FALSE) == FALSE) {
00079     jlog("Error: init_ngram: failed to read \"%s\"\n", ngram_file);
00080     return FALSE;
00081   }
00082   if (fclose_readfile(fp) == -1) {
00083     jlog("Error: init_ngram: failed to close \"%s\"\n", ngram_file);
00084     return FALSE;
00085   }
00086   jlog("Stat: init_ngram: finished reading n-gram\n");
00087 
00088   return TRUE;
00089 }
00090 
00097 boolean
00098 init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file)
00099 {
00100   FILE *fp;
00101 
00102   jlog("Stat: init_ngram: reading in additional LR 2-gram for the 1st pass from %s\n", bigram_file);
00103   if ((fp = fopen_readfile(bigram_file)) == NULL) {
00104     jlog("Error: init_ngram: failed to open \"%s\"\n", bigram_file);
00105     return FALSE;
00106   }
00107   if (ngram_read_arpa(fp, ndata, TRUE) == FALSE) {
00108     jlog("Error: init_ngram: failed to read \"%s\"\n", bigram_file);
00109     return FALSE;
00110   }
00111   if (fclose_readfile(fp) == -1) {
00112     jlog("Error: init_ngram: failed to close \"%s\"\n", bigram_file);
00113     return FALSE;
00114   }
00115   jlog("Stat: init_ngram: finished reading LR 2-gram\n");
00116 
00117   return TRUE;
00118 }
00119 
00126 boolean
00127 make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo)
00128 {
00129   int i;
00130   boolean ok_flag = TRUE;
00131   int count = 0;
00132 
00133   jlog("Stat: init_ngram: mapping dictonary words to n-gram entries\n");
00134   ndata->unk_num = 0;
00135   for (i = 0; i < winfo->num; i++) {
00136     winfo->wton[i] = make_ngram_ref(ndata, winfo->wname[i]);
00137     if (winfo->wton[i] == WORD_INVALID) {
00138       ok_flag = FALSE;
00139       count++;
00140       continue;
00141     }
00142     if (winfo->wton[i] == ndata->unk_id) {
00143       (ndata->unk_num)++;
00144     }
00145   }
00146   if (ok_flag == FALSE) {
00147     jlog("Error: --- Failed to map %d words in dictionary to N-gram\n", count);
00148     jlog("Error: --- Specify the word to which those words are mapped with \"-mapunk\" (default: \"<unk>\" or \"<UNK>\"\n");
00149     return FALSE;
00150   }
00151       
00152   if (ndata->unk_num == 0) {
00153     ndata->unk_num_log = 0.0;   /* for safe */
00154   } else {
00155     ndata->unk_num_log = (float)log10(ndata->unk_num);
00156   }
00157   jlog("Stat: init_ngram: finished word-to-ngram mapping\n");
00158   return TRUE;
00159 }
00160 
00168 void
00169 set_unknown_id(NGRAM_INFO *ndata, char *str)
00170 {
00171   ndata->unk_id = ngram_lookup_word(ndata, str);
00172   if (ndata->unk_id == WORD_INVALID) {
00173     if (strmatch(str, UNK_WORD_DEFAULT)) {
00174       /* if default "<unk>" is not found, also try "<UNK>" */
00175       ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2);
00176       if (ndata->unk_id == WORD_INVALID) {
00177         jlog("Stat: init_ngram: either \"%s\" and \"%s\" not found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2);
00178         ndata->isopen = FALSE;
00179         return;
00180       }
00181     }
00182   }
00183   if (ndata->unk_id == WORD_INVALID) {
00184     jlog("Stat: init_ngram: \"%s\" not found, assuming close vocabulary LM\n", str);
00185     ndata->isopen = FALSE;
00186   } else {
00187     jlog("Stat: init_ngram: unknown words will be mapped to \"%s\"\n", str);
00188     ndata->isopen = TRUE;
00189   }
00190 }
00191 
00205 void
00206 fix_uniprob_srilm(NGRAM_INFO *ndata, WORD_INFO *winfo)
00207 {
00208   WORD_ID wb, we;
00209 
00210   wb = winfo->wton[winfo->head_silwid];
00211   we = winfo->wton[winfo->tail_silwid];
00212   if (ndata->d[0].prob[wb] == -99.0) {
00213     jlog("Warning: BOS word \"%s\" has unigram prob of \"-99\"\n", ndata->wname[wb]);
00214     jlog("Warning: assigining value of EOS word \"%s\": %f\n", ndata->wname[we], ndata->d[0].prob[we]);
00215     ndata->d[0].prob[wb] = ndata->d[0].prob[we];
00216   } else if (ndata->d[0].prob[we] == -99.0) {
00217     jlog("Warning: EOS word \"%s\" has unigram prob of \"-99\"\n", ndata->wname[we]);
00218     jlog("Warning: assigining value of BOS word \"%s\": %f\n", ndata->wname[wb], ndata->d[0].prob[wb]);
00219     ndata->d[0].prob[we] = ndata->d[0].prob[wb];
00220   }
00221 }
00222