Julius 4.1.5
|
00001 00018 /* 00019 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University 00020 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology 00021 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology 00022 * All rights reserved 00023 */ 00024 00025 #include <sent/stddefs.h> 00026 #include <sent/ngram2.h> 00027 #include <sent/vocabulary.h> 00028 00035 boolean 00036 init_ngram_bin(NGRAM_INFO *ndata, char *bin_ngram_file) 00037 { 00038 FILE *fp; 00039 00040 jlog("Stat: init_ngram: reading in binary n-gram from %s\n", bin_ngram_file); 00041 if ((fp = fopen_readfile(bin_ngram_file)) == NULL) { 00042 jlog("Error: init_ngram: failed to open \"%s\"\n", bin_ngram_file); 00043 return FALSE; 00044 } 00045 if (ngram_read_bin(fp, ndata) == FALSE) { 00046 jlog("Error: init_ngram: failed to read \"%s\"\n", bin_ngram_file); 00047 return FALSE; 00048 } 00049 if (fclose_readfile(fp) == -1) { 00050 jlog("Error: init_ngram: failed to close \"%s\"\n", bin_ngram_file); 00051 return FALSE; 00052 } 00053 jlog("Stat: init_ngram: finished reading n-gram\n"); 00054 return TRUE; 00055 } 00056 00064 boolean 00065 init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir) 00066 { 00067 FILE *fp; 00068 00069 ndata->root = NULL; 00070 ndata->dir = dir; 00071 00072 jlog("Stat: init_ngram: reading in ARPA %s n-gram from %s\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ngram_file); 00073 /* read RL n-gram */ 00074 if ((fp = fopen_readfile(ngram_file)) == NULL) { 00075 jlog("Error: init_ngram: failed to open \"%s\"\n", ngram_file); 00076 return FALSE; 00077 } 00078 if (ngram_read_arpa(fp, ndata, FALSE) == FALSE) { 00079 jlog("Error: init_ngram: failed to read \"%s\"\n", ngram_file); 00080 return FALSE; 00081 } 00082 if (fclose_readfile(fp) == -1) { 00083 jlog("Error: init_ngram: failed to close \"%s\"\n", ngram_file); 00084 return FALSE; 00085 } 00086 jlog("Stat: init_ngram: finished reading n-gram\n"); 00087 00088 return TRUE; 00089 } 00090 00097 boolean 00098 init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file) 00099 { 00100 FILE *fp; 00101 00102 jlog("Stat: init_ngram: reading in additional LR 2-gram for the 1st pass from %s\n", bigram_file); 00103 if ((fp = fopen_readfile(bigram_file)) == NULL) { 00104 jlog("Error: init_ngram: failed to open \"%s\"\n", bigram_file); 00105 return FALSE; 00106 } 00107 if (ngram_read_arpa(fp, ndata, TRUE) == FALSE) { 00108 jlog("Error: init_ngram: failed to read \"%s\"\n", bigram_file); 00109 return FALSE; 00110 } 00111 if (fclose_readfile(fp) == -1) { 00112 jlog("Error: init_ngram: failed to close \"%s\"\n", bigram_file); 00113 return FALSE; 00114 } 00115 jlog("Stat: init_ngram: finished reading LR 2-gram\n"); 00116 00117 return TRUE; 00118 } 00119 00126 boolean 00127 make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo) 00128 { 00129 int i; 00130 boolean ok_flag = TRUE; 00131 int count = 0; 00132 00133 jlog("Stat: init_ngram: mapping dictonary words to n-gram entries\n"); 00134 ndata->unk_num = 0; 00135 for (i = 0; i < winfo->num; i++) { 00136 winfo->wton[i] = make_ngram_ref(ndata, winfo->wname[i]); 00137 if (winfo->wton[i] == WORD_INVALID) { 00138 ok_flag = FALSE; 00139 count++; 00140 continue; 00141 } 00142 if (winfo->wton[i] == ndata->unk_id) { 00143 (ndata->unk_num)++; 00144 } 00145 } 00146 if (ok_flag == FALSE) { 00147 jlog("Error: --- Failed to map %d words in dictionary to N-gram\n", count); 00148 jlog("Error: --- Specify the word to which those words are mapped with \"-mapunk\" (default: \"<unk>\" or \"<UNK>\"\n"); 00149 return FALSE; 00150 } 00151 00152 if (ndata->unk_num == 0) { 00153 ndata->unk_num_log = 0.0; /* for safe */ 00154 } else { 00155 ndata->unk_num_log = (float)log10(ndata->unk_num); 00156 } 00157 jlog("Stat: init_ngram: finished word-to-ngram mapping\n"); 00158 return TRUE; 00159 } 00160 00168 void 00169 set_unknown_id(NGRAM_INFO *ndata, char *str) 00170 { 00171 ndata->unk_id = ngram_lookup_word(ndata, str); 00172 if (ndata->unk_id == WORD_INVALID) { 00173 if (strmatch(str, UNK_WORD_DEFAULT)) { 00174 /* if default "<unk>" is not found, also try "<UNK>" */ 00175 ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2); 00176 if (ndata->unk_id == WORD_INVALID) { 00177 jlog("Stat: init_ngram: either \"%s\" and \"%s\" not found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2); 00178 ndata->isopen = FALSE; 00179 return; 00180 } 00181 } 00182 } 00183 if (ndata->unk_id == WORD_INVALID) { 00184 jlog("Stat: init_ngram: \"%s\" not found, assuming close vocabulary LM\n", str); 00185 ndata->isopen = FALSE; 00186 } else { 00187 jlog("Stat: init_ngram: unknown words will be mapped to \"%s\"\n", str); 00188 ndata->isopen = TRUE; 00189 } 00190 } 00191 00205 void 00206 fix_uniprob_srilm(NGRAM_INFO *ndata, WORD_INFO *winfo) 00207 { 00208 WORD_ID wb, we; 00209 00210 wb = winfo->wton[winfo->head_silwid]; 00211 we = winfo->wton[winfo->tail_silwid]; 00212 if (ndata->d[0].prob[wb] == -99.0) { 00213 jlog("Warning: BOS word \"%s\" has unigram prob of \"-99\"\n", ndata->wname[wb]); 00214 jlog("Warning: assigining value of EOS word \"%s\": %f\n", ndata->wname[we], ndata->d[0].prob[we]); 00215 ndata->d[0].prob[wb] = ndata->d[0].prob[we]; 00216 } else if (ndata->d[0].prob[we] == -99.0) { 00217 jlog("Warning: EOS word \"%s\" has unigram prob of \"-99\"\n", ndata->wname[we]); 00218 jlog("Warning: assigining value of BOS word \"%s\": %f\n", ndata->wname[wb], ndata->d[0].prob[wb]); 00219 ndata->d[0].prob[we] = ndata->d[0].prob[wb]; 00220 } 00221 } 00222