00001
00018
00019
00020
00021
00022
00023
00024
00025 #include <sent/stddefs.h>
00026 #include <sent/ngram2.h>
00027
00038 NNID
00039 search_bigram(NGRAM_INFO *ndata, WORD_ID w_l, WORD_ID w_r)
00040 {
00041
00042
00043 NNID left,right,mid;
00044
00045 if ((left = ndata->n2_bgn[w_l]) == NNID_INVALID)
00046 return (NNID_INVALID);
00047 right = left + ndata->n2_num[w_l] - 1;
00048 while(left < right) {
00049 mid = (left + right) / 2;
00050 if (ndata->n2tonid[mid] < w_r) {
00051 left = mid + 1;
00052 } else {
00053 right = mid;
00054 }
00055 }
00056 if (ndata->n2tonid[left] == w_r) {
00057 return (left);
00058 } else {
00059 return (NNID_INVALID);
00060 }
00061 }
00062
00076 static NNID
00077 search_trigram_v3(NGRAM_INFO *ndata, NNID n2, WORD_ID wkey)
00078 {
00079
00080
00081 int left,right,mid;
00082
00083 if ((left = ndata->n3_bgn[n2]) == NNID_INVALID)
00084 return (NNID_INVALID);
00085 right = left + ndata->n3_num[n2] - 1;
00086 while(left < right) {
00087 mid = (left + right) / 2;
00088 if (ndata->n3tonid[mid] < wkey) {
00089 left = mid + 1;
00090 } else {
00091 right = mid;
00092 }
00093 }
00094 if (ndata->n3tonid[left] == wkey) {
00095 return (left);
00096 } else {
00097 return (NNID_INVALID);
00098 }
00099 }
00100
00114 static NNID
00115 search_trigram_v4(NGRAM_INFO *ndata, NNID n2, WORD_ID wkey)
00116 {
00117
00118
00119 NNID left,right,mid,boid;
00120
00121 if ((boid = ndata->n2bo_upper[n2]) == NNID_INVALID_UPPER)
00122 return (NNID_INVALID);
00123 boid = (boid << 16) + (NNID)(ndata->n2bo_lower[n2]);
00124 left = ((NNID)(ndata->n3_bgn_upper[boid]) << 16) + (NNID)(ndata->n3_bgn_lower[boid]);
00125 right = left + ndata->n3_num[boid] - 1;
00126 while(left < right) {
00127 mid = (left + right) / 2;
00128 if (ndata->n3tonid[mid] < wkey) {
00129 left = mid + 1;
00130 } else {
00131 right = mid;
00132 }
00133 }
00134 if (ndata->n3tonid[left] == wkey) {
00135 return (left);
00136 } else {
00137 return (NNID_INVALID);
00138 }
00139 }
00140
00141
00142
00143
00144
00153 LOGPROB
00154 uni_prob(NGRAM_INFO *ndata, WORD_ID w)
00155 {
00156 if (w != ndata->unk_id) {
00157 return(ndata->p[w]);
00158 } else {
00159 return(ndata->p[w] - ndata->unk_num_log);
00160 }
00161 }
00162
00163
00174 LOGPROB
00175 bi_prob_lr(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)
00176 {
00177 NNID n2;
00178 LOGPROB prob;
00179
00180 if ((n2 = search_bigram(ndata, w1, w2)) != NNID_INVALID) {
00181
00182 prob = ndata->p_lr[n2];
00183 } else {
00184
00185
00186 prob = ndata->bo_wt_lr[w1] + ndata->p[w2];
00187 }
00188 if (w2 != ndata->unk_id) {
00189 return(prob);
00190 } else {
00191 return(prob - ndata->unk_num_log);
00192 }
00193 }
00194
00205 LOGPROB
00206 bi_prob_rl(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)
00207 {
00208 NNID n2;
00209 LOGPROB prob;
00210
00211 if ((n2 = search_bigram(ndata, w1, w2)) != NNID_INVALID) {
00212
00213 prob = ndata->p_rl[n2];
00214 } else {
00215
00216
00217 prob = ndata->bo_wt_rl[w2] + ndata->p[w1];
00218 }
00219 if (w1 != ndata->unk_id) {
00220 return(prob);
00221 } else {
00222 return(prob - ndata->unk_num_log);
00223 }
00224 }
00225
00226
00238 LOGPROB
00239 tri_prob_rl(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2, WORD_ID w3)
00240 {
00241 NNID n2, n3;
00242 int boid;
00243
00244 if ((n2 = search_bigram(ndata, w2, w3)) != NNID_INVALID) {
00245 switch(ndata->version) {
00246 case 4:
00247 n3 = search_trigram_v4(ndata, n2, w1);
00248 break;
00249 case 3:
00250 n3 = search_trigram_v3(ndata, n2, w1);
00251 break;
00252 }
00253 if (n3 != NNID_INVALID) {
00254
00255 if (w1 != ndata->unk_id) {
00256 return(ndata->p_rrl[n3]);
00257 } else {
00258 return(ndata->p_rrl[n3] - ndata->unk_num_log);
00259 }
00260 } else {
00261
00262
00263
00264 switch(ndata->version) {
00265 case 4:
00266 if ((boid = ndata->n2bo_upper[n2]) == NNID_INVALID_UPPER) {
00267 return(bi_prob_rl(ndata, w1, w2));
00268 } else {
00269 boid = (boid << 16) + (NNID)(ndata->n2bo_lower[n2]);
00270 return(ndata->bo_wt_rrl[boid] + bi_prob_rl(ndata, w1, w2));
00271 }
00272 break;
00273 case 3:
00274 return(ndata->bo_wt_rrl[n2] + bi_prob_rl(ndata, w1, w2));
00275 break;
00276 }
00277 }
00278 }
00279
00280 return(bi_prob_rl(ndata, w1, w2));
00281 }