/* Julius 4.2 */
00001 00026 /* 00027 * Copyright (c) 1991-2011 Kawahara Lab., Kyoto University 00028 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology 00029 * Copyright (c) 2005-2011 Julius project team, Nagoya Institute of Technology 00030 * All rights reserved 00031 */ 00032 00033 /* $Id: ngram_read_arpa.c,v 1.18 2011/04/29 05:09:17 sumomo Exp $ */ 00034 00035 /* words should be alphabetically sorted */ 00036 00037 #include <sent/stddefs.h> 00038 #include <sent/ngram2.h> 00039 00040 static char buf[800]; 00041 static char pbuf[800]; 00042 00043 00052 static int 00053 get_total_info(FILE *fp, NNID **numlist) 00054 { 00055 char *p; 00056 int n; 00057 int maxn; 00058 unsigned long entry_num; 00059 int numnum; 00060 00061 maxn = 0; 00062 00063 numnum = 10; 00064 *numlist = (NNID *)mymalloc(sizeof(NNID) * numnum); 00065 00066 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { 00067 if (strnmatch(buf, "ngram", 5)) { /* n-gram num */ 00068 //p = strtok(buf, " ="); 00069 //n = atoi(p); 00070 //p = strtok(NULL, " ="); 00071 //entry_num = atol(p); 00072 //sscanf(p, "%lu", &entry_num); 00073 sscanf(buf, "ngram %d = %lu", &n, &entry_num); 00074 /* check maximum number */ 00075 if (entry_num > NNID_MAX) { 00076 jlog("Error: too big %d-gram (exceeds %d bit)\n", n, sizeof(NNID) * 8); 00077 return -1; 00078 } 00079 /* ignore empty entry */ 00080 if (entry_num == 0) { 00081 jlog("Warning: empty %d-gram, skipped\n", n); 00082 } else { 00083 if (maxn < n) maxn = n; 00084 if (n >= numnum) { 00085 numnum *= 2; 00086 *numlist = (NNID *)myrealloc(*numlist, sizeof(NNID) * numnum); 00087 } 00088 (*numlist)[n-1] = entry_num; 00089 } 00090 } 00091 } 00092 00093 return(maxn); 00094 } 00095 00102 static boolean 00103 set_unigram(FILE *fp, NGRAM_INFO *ndata) 00104 { 00105 WORD_ID nid; 00106 int resid; 00107 LOGPROB prob, bo_wt; 00108 char *name, *p; 00109 boolean ok_p = TRUE; 00110 NGRAM_TUPLE_INFO *t; 00111 00112 t = &(ndata->d[0]); 00113 00114 /* malloc name area 
*/ 00115 ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num); 00116 00117 /* malloc data area */ 00118 //t->bgn_upper = t->bgn_lower = t->bgn = t->num = NULL; 00119 t->bgn_upper = NULL; 00120 t->bgn_lower = NULL; 00121 t->bgn = NULL; 00122 t->num = NULL; 00123 t->bgnlistlen = 0; 00124 t->nnid2wid = NULL; 00125 t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum); 00126 t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum); 00127 t->context_num = t->totalnum; 00128 t->nnid2ctid_upper = NULL; 00129 t->nnid2ctid_lower = NULL; 00130 00131 nid = 0; 00132 00133 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { 00134 if ((p = strtok(buf, DELM)) == NULL) { 00135 jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n"); 00136 return FALSE; 00137 } 00138 prob = (LOGPROB)atof(p); 00139 if ((p = strtok(NULL, DELM)) == NULL) { 00140 jlog("Error: ngram_read_arpa: 1-gram: failed to parse, corrupted or invalid data?\n"); 00141 return FALSE; 00142 } 00143 name = strcpy((char *)mymalloc(strlen(p)+1), p); 00144 if ((p = strtok(NULL, DELM)) == NULL) { 00145 bo_wt = 0.0; 00146 } else { 00147 bo_wt = (LOGPROB)atof(p); 00148 } 00149 00150 /* register word entry name */ 00151 ndata->wname[nid] = name; 00152 00153 /* add entry name to index tree */ 00154 if (ndata->root == NULL) { 00155 ndata->root = ptree_make_root_node(nid, &(ndata->mroot)); 00156 } else { 00157 resid = ptree_search_data(name, ndata->root); 00158 if (resid != -1 && strmatch(name, ndata->wname[resid])) { /* already exist */ 00159 jlog("Error: ngram_read_arpa: duplicate word entry \"%s\" at #%d and #%d in 1-gram\n", name, resid, nid); 00160 ok_p = FALSE; 00161 continue; 00162 } else { 00163 ptree_add_entry(name, nid, ndata->wname[resid], &(ndata->root), &(ndata->mroot)); 00164 } 00165 } 00166 00167 if (nid >= ndata->max_word_num) { 00168 jlog("Error: ngram_read_arpa: num of 1-gram is bigger than header value (%d)\n", ndata->max_word_num); 
00169 return FALSE; 00170 } 00171 00172 /* register entry info */ 00173 t->prob[nid] = prob; 00174 t->bo_wt[nid] = bo_wt; 00175 00176 nid++; 00177 } 00178 00179 if (nid != t->totalnum) { 00180 jlog("Error: ngram_read_arpa: num of 1-gram (%d) not equal to header value (%d)\n", nid, t->totalnum); 00181 return FALSE; 00182 } 00183 00184 if (ok_p == TRUE) { 00185 jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", nid); 00186 } 00187 00188 return ok_p; 00189 } 00190 00191 /* read-in 1-gram (RL) --- only add back-off weight */ 00199 static boolean 00200 add_unigram(FILE *fp, NGRAM_INFO *ndata) 00201 { 00202 WORD_ID read_word_num; 00203 WORD_ID nid; 00204 LOGPROB prob, bo_wt; 00205 char *name, *p; 00206 boolean ok_p = TRUE; 00207 boolean mismatched = FALSE; 00208 00209 ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->max_word_num); 00210 00211 read_word_num = 0; 00212 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { 00213 if ((p = strtok(buf, DELM)) == NULL) { 00214 jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n"); 00215 return FALSE; 00216 } 00217 prob = atof(p); 00218 if ((p = strtok(NULL, DELM)) == NULL) { 00219 jlog("Error: ngram_read_arpa: RL 1-gram: failed to parse, corrupted or invalid data?\n"); 00220 return FALSE; 00221 } 00222 name = strcpy((char *)mymalloc(strlen(p)+1), p); 00223 if ((p = strtok(NULL, DELM)) == NULL) { 00224 bo_wt = 0.0; 00225 } else { 00226 bo_wt = (LOGPROB)atof(p); 00227 } 00228 00229 /* add bo_wt_rl to existing 1-gram entry */ 00230 nid = ngram_lookup_word(ndata, name); 00231 if (nid == WORD_INVALID) { 00232 if (mismatched == FALSE) { 00233 jlog("Error: ngram_read_arpa: vocabulary mismatch between LR n-gram and RL n-gram\n"); 00234 mismatched = TRUE; 00235 } 00236 jlog("Error: ngram_read_arpa: \"%s\" does not appears in LR n-gram\n", name); 00237 ok_p = FALSE; 00238 } else { 00239 ndata->bo_wt_1[nid] = bo_wt; 00240 } 00241 00242 read_word_num++; 00243 if 
(read_word_num > ndata->max_word_num) { 00244 jlog("Error: ngram_read_arpa: vocabulary size of RL n-gram is bigger than header value (%d)\n", ndata->max_word_num); 00245 return FALSE; 00246 } 00247 free(name); 00248 } 00249 if (ok_p == TRUE) { 00250 jlog("Stat: ngram_read_arpa: read %d 1-gram entries\n", read_word_num); 00251 } 00252 00253 return ok_p; 00254 } 00255 00263 static boolean 00264 add_bigram(FILE *fp, NGRAM_INFO *ndata) 00265 { 00266 WORD_ID w[2], wtmp; 00267 LOGPROB prob; 00268 NNID bi_count = 0; 00269 NNID n2; 00270 boolean ok_p = TRUE; 00271 char *s; 00272 00273 ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->d[1].totalnum); 00274 00275 while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') { 00276 strcpy(pbuf, buf); 00277 if ( ++bi_count % 100000 == 0) { 00278 jlog("Stat: ngram_read_arpa: 2-gram read %lu (%d%%)\n", bi_count, bi_count * 100 / ndata->d[1].totalnum); 00279 } 00280 if ((s = strtok(buf, DELM)) == NULL) { 00281 jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n"); 00282 return FALSE; 00283 } 00284 prob = (LOGPROB)atof(s); 00285 if ((s = strtok(NULL, DELM)) == NULL) { 00286 jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n"); 00287 return FALSE; 00288 } 00289 w[0] = ngram_lookup_word(ndata, s); 00290 if (w[0] == WORD_INVALID) { 00291 jlog("Error: ngram_read_arpa: 2-gram #%lu: \"%s\": \"%s\" not exist in 1-gram\n", bi_count, pbuf, s); 00292 ok_p = FALSE; 00293 continue; 00294 } 00295 if ((s = strtok(NULL, DELM)) == NULL) { 00296 jlog("Error: ngram_read_arpa: 2-gram: failed to parse, corrupted or invalid data?\n"); 00297 return FALSE; 00298 } 00299 w[1] = ngram_lookup_word(ndata, s); 00300 if (w[1] == WORD_INVALID) { 00301 jlog("Error: ngram_read_arpa: 2-gram #%lu: \"%s\": \"%s\" not exist in 1-gram\n", bi_count, pbuf, s); 00302 ok_p = FALSE; 00303 continue; 00304 } 00305 if (ndata->dir == DIR_RL) { 00306 /* word order should be reversed */ 00307 wtmp = 
w[0]; 00308 w[0] = w[1]; 00309 w[1] = wtmp; 00310 } 00311 n2 = search_ngram(ndata, 2, w); 00312 if (n2 == NNID_INVALID) { 00313 jlog("Warning: ngram_read_arpa: 2-gram #%d: \"%s\": (%s,%s) not exist in LR 2-gram (ignored)\n", n2+1, pbuf, ndata->wname[w[0]], ndata->wname[w[1]]); 00314 } else { 00315 ndata->p_2[n2] = prob; 00316 } 00317 } 00318 00319 if (ok_p == TRUE) { 00320 jlog("Stat: ngram_read_arpa: 2-gram read %lu end\n", bi_count); 00321 } 00322 00323 return ok_p; 00324 } 00325 00332 static boolean 00333 set_ngram(FILE *fp, NGRAM_INFO *ndata, int n) 00334 { 00335 NNID i; 00336 WORD_ID *w; 00337 WORD_ID *w_last; 00338 LOGPROB p, bowt; 00339 NNID nnid; 00340 NNID cid, cid_last; 00341 boolean ok_p = TRUE; 00342 char *s; 00343 NGRAM_TUPLE_INFO *t; 00344 NGRAM_TUPLE_INFO *tprev; 00345 NNID ntmp; 00346 00347 if (n < 2) { 00348 jlog("Error: ngram_read_arpa: unable to process 1-gram\n"); 00349 return FALSE; 00350 } 00351 00352 w = (WORD_ID *)mymalloc(sizeof(WORD_ID) * n); 00353 w_last = (WORD_ID *)mymalloc(sizeof(WORD_ID) * n); 00354 00355 t = &(ndata->d[n-1]); 00356 tprev = &(ndata->d[n-2]); 00357 00358 /* initialize pointer storage to access from (N-1)-gram */ 00359 t->bgnlistlen = tprev->context_num; 00360 if (t->is24bit) { 00361 t->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), t->bgnlistlen); 00362 t->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), t->bgnlistlen); 00363 for(i = 0; i < t->bgnlistlen; i++) { 00364 t->bgn_upper[i] = NNID_INVALID_UPPER; 00365 t->bgn_lower[i] = 0; 00366 } 00367 } else { 00368 t->bgn = (NNID *)mymalloc_big(sizeof(NNID), t->bgnlistlen); 00369 for(i = 0;i < t->bgnlistlen; i++) { 00370 t->bgn[i] = NNID_INVALID; 00371 } 00372 } 00373 t->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->bgnlistlen); 00374 for(i = 0; i < t->bgnlistlen; i++) { 00375 t->num[i] = 0; 00376 } 00377 00378 /* allocate data area */ 00379 t->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->totalnum); 00380 t->prob = (LOGPROB 
*)mymalloc_big(sizeof(LOGPROB), t->totalnum); 00381 t->bo_wt = NULL; 00382 t->nnid2ctid_upper = NULL; 00383 t->nnid2ctid_lower = NULL; 00384 00385 nnid = 0; 00386 cid = cid_last = NNID_INVALID; 00387 for(i=0;i<n;i++) w_last[i] = WORD_INVALID; 00388 00389 /* read in N-gram */ 00390 for (;;) { 00391 if (getl(buf, sizeof(buf), fp) == NULL || buf[0] == '\\') break; 00392 strcpy(pbuf, buf); 00393 if ( nnid % 100000 == 0) { 00394 jlog("Stat: ngram_read_arpa: %d-gram read %d (%d%%)\n", n, nnid, nnid * 100 / t->totalnum); 00395 } 00396 00397 /* N-gram probability */ 00398 if ((s = strtok(buf, DELM)) == NULL) { 00399 jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n); 00400 free(w_last); free(w); 00401 return FALSE; 00402 } 00403 p = (LOGPROB)atof(s); 00404 /* read in context word and lookup the ID */ 00405 for(i=0;i<n;i++) { 00406 if ((s = strtok(NULL, DELM)) == NULL) { 00407 jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n); 00408 free(w_last); free(w); 00409 return FALSE; 00410 } 00411 if ((w[i] = ngram_lookup_word(ndata, s)) == WORD_INVALID) { 00412 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": \"%s\" not exist in %d-gram\n", n, nnid+1, pbuf, s, n); 00413 ok_p = FALSE; 00414 break; 00415 } 00416 /* increment nnid_bgn and nnid_num if context word changed */ 00417 } 00418 if (i < n) continue; /* error out */ 00419 00420 /* detect context entry change at this line */ 00421 for(i=0;i<n-1;i++) { 00422 if (w[i] != w_last[i]) break; 00423 } 00424 if (i < n-1) { /* context changed here */ 00425 /* find new entry point */ 00426 cid = search_ngram(ndata, n-1, w); 00427 if (cid == NNID_INVALID) { /* no context */ 00428 //jlog("Warning: ngram_read_arpa: %d-gram #%d: \"%s\": context (%s,%s) not exist in %d-gram (ignored)\n", n, nnid+1, pbuf, ndata->wname[w_m], ndata->wname[w_r], n-1); 00429 jlog("Warning: ngram_read_arpa: %d-gram #%d: \"%s\": context (", 00430 n, nnid+1, pbuf); 00431 
for(i=0;i<n-1;i++) { 00432 jlog(" %s", ndata->wname[w[i]]); 00433 } 00434 jlog(") not exist in %d-gram (ignored)\n", n-1); 00435 ok_p = FALSE; 00436 continue; 00437 } 00438 if (cid_last != NNID_INVALID) { 00439 /* close last entry */ 00440 if (t->is24bit) { 00441 ntmp = ((NNID)(t->bgn_upper[cid_last]) << 16) + (NNID)(t->bgn_lower[cid_last]); 00442 } else { 00443 ntmp = t->bgn[cid_last]; 00444 } 00445 t->num[cid_last] = nnid - ntmp; 00446 } 00447 /* the next context word should be an new entry */ 00448 if (t->is24bit) { 00449 if (t->bgn_upper[cid] != NNID_INVALID_UPPER) { 00450 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf); 00451 free(w_last); free(w); 00452 return FALSE; 00453 } 00454 ntmp = nnid & 0xffff; 00455 t->bgn_lower[cid] = ntmp; 00456 ntmp = nnid >> 16; 00457 t->bgn_upper[cid] = ntmp; 00458 } else { 00459 if (t->bgn[cid] != NNID_INVALID) { 00460 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf); 00461 free(w_last); free(w); 00462 return FALSE; 00463 } 00464 t->bgn[cid] = nnid; 00465 } 00466 00467 cid_last = cid; 00468 w_last[n-1] = WORD_INVALID; 00469 } 00470 00471 /* store the probabilities of the target word */ 00472 if (w[n-1] == w_last[n-1]) { 00473 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": duplicated entry\n", n, nnid+1, pbuf); 00474 ok_p = FALSE; 00475 continue; 00476 } else if (w_last[n-1] != WORD_INVALID && w[n-1] < w_last[n-1]) { 00477 jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf); 00478 free(w_last); free(w); 00479 return FALSE; 00480 } 00481 00482 /* if the 2-gram has back-off entries, store them here */ 00483 if ((s = strtok(NULL, DELM)) != NULL) { 00484 bowt = (LOGPROB) atof(s); 00485 if (t->bo_wt == NULL) { 00486 t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum); 00487 for(i=0;i<nnid;i++) t->bo_wt[i] = 0.0; 00488 } 00489 t->bo_wt[nnid] = 
bowt; 00490 } else { 00491 if (t->bo_wt != NULL) t->bo_wt[nnid] = 0.0; 00492 } 00493 00494 /* store the entry info */ 00495 t->nnid2wid[nnid] = w[n-1]; 00496 t->prob[nnid] = p; 00497 00498 nnid++; 00499 for(i=0;i<n;i++) w_last[i] = w[i]; 00500 00501 /* check total num */ 00502 if (nnid > t->totalnum) { 00503 jlog("Error: ngram_read_arpa: %d-gram: read num (%d) not match the header value (%d)\n", n, nnid, t->totalnum); 00504 free(w_last); free(w); 00505 return FALSE; 00506 } 00507 } 00508 00509 /* set the last entry */ 00510 if (t->is24bit) { 00511 ntmp = ((NNID)(t->bgn_upper[cid_last]) << 16) + (NNID)(t->bgn_lower[cid_last]); 00512 } else { 00513 ntmp = t->bgn[cid_last]; 00514 } 00515 t->num[cid_last] = nnid - ntmp; 00516 00517 if (t->bo_wt != NULL) t->context_num = t->totalnum; 00518 00519 if (ok_p == TRUE) { 00520 jlog("Stat: ngram_read_arpa: %d-gram read %d end\n", n, nnid); 00521 } 00522 00523 free(w_last); free(w); 00524 return ok_p; 00525 } 00526 00537 boolean 00538 ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition) 00539 { 00540 int i, n; 00541 NNID *num; 00542 00543 /* source file is not a binary N-gram */ 00544 ndata->from_bin = FALSE; 00545 ndata->bigram_index_reversed = FALSE; 00546 00547 /* read until `\data\' found */ 00548 while (getl(buf, sizeof(buf), fp) != NULL && strncmp(buf,"\\data\\",6) != 0); 00549 00550 00551 if (addition) { 00552 /* reading additional forward 2-gram for the 1st pass */ 00553 00554 if (ndata->n < 2) { 00555 jlog("Error: base N-gram should be longer than 2-gram\n"); 00556 return FALSE; 00557 } 00558 00559 /* read n-gram total info */ 00560 n = get_total_info(fp, &num); 00561 if (n == -1) { /* error */ 00562 free(num); 00563 return FALSE; 00564 } 00565 00566 /* check N limit */ 00567 if (n < 2) { 00568 jlog("Error: forward N-gram for pass1 is does not contain 2-gram\n"); 00569 free(num); 00570 return FALSE; 00571 } 00572 if (n > 2) { 00573 jlog("Warning: forward N-gram for pass1 contains %d-gram, only 2-gram will be 
used\n", n); 00574 } 00575 00576 /* check if the numbers are the same with already read n-gram */ 00577 for(i=0;i<2;i++) { 00578 if (ndata->d[i].totalnum != num[i]) { 00579 jlog("Warning: ngram_read_arpa: %d-gram total num differ between forward N-gram and backward N-gram, may cause some error\n", i+1); 00580 } 00581 } 00582 00583 free(num); 00584 00585 /* read additional 1-gram data */ 00586 if (!strnmatch(buf,"\\1-grams",8)) { 00587 jlog("Error: ngram_read_arpa: 1-gram not found for additional LR 2-gram\n"); 00588 return FALSE; 00589 } 00590 jlog("Stat: ngram_read_arpa: reading 1-gram part...\n"); 00591 if (add_unigram(fp, ndata) == FALSE) return FALSE; 00592 /* read 2-gram data */ 00593 if (!strnmatch(buf,"\\2-grams", 8)) { 00594 jlog("Error: ngram_read_arpa: 2-gram not found for additional LR 2-gram\n"); 00595 return FALSE; 00596 } 00597 jlog("Stat: ngram_read_arpa: reading 2-gram part...\n"); 00598 if (add_bigram(fp, ndata) == FALSE) return FALSE; 00599 00600 00601 /* ignore the rest */ 00602 if (strnmatch(buf,"\\3-grams", 8)) { 00603 jlog("Warning: forward n-gram contains more than 3-gram, ignored\n"); 00604 } 00605 00606 } else { 00607 /* read n-gram total info */ 00608 n = get_total_info(fp, &num); 00609 if (n == -1) { /* error */ 00610 free(num); 00611 return FALSE; 00612 } 00613 jlog("Stat: ngram_read_arpa: this is %d-gram file\n", n); 00614 ndata->d = (NGRAM_TUPLE_INFO *)mymalloc(sizeof(NGRAM_TUPLE_INFO) * n); 00615 memset(ndata->d, 0, sizeof(NGRAM_TUPLE_INFO) * n); 00616 for(i=0;i<n;i++) { 00617 ndata->d[i].totalnum = num[i]; 00618 } 00619 free(num); 00620 00621 /* set word num */ 00622 if (ndata->d[0].totalnum > MAX_WORD_NUM) { 00623 jlog("Error: ngram_read_arpa: N-gram vocabulary size exceeds the limit (%d)\n", MAX_WORD_NUM); 00624 return FALSE; 00625 } 00626 ndata->max_word_num = ndata->d[0].totalnum; 00627 00628 /* check if each N-gram allows 24bit and back-off compaction mode */ 00629 /* for fast access, 1-gram and 2-gram always use non-compaction 
mode */ 00630 for(i=0;i<n;i++) { 00631 if (i < 2) { /* not use for 1-gram and 2-gram */ 00632 ndata->d[i].is24bit = FALSE; 00633 } else { 00634 /* for 3-gram and later 24 bit mode is preferred, 00635 but should be disabled if number of entries is over 2^24 */ 00636 if (ndata->d[i].totalnum > NNID_MAX_24) { 00637 jlog("Warning: ngram_read_arpa: num of %d-gram exceeds 24bit, now switch to %dbit index\n", i+1, sizeof(NNID) * 8); 00638 ndata->d[i].is24bit = FALSE; 00639 } else { 00640 ndata->d[i].is24bit = TRUE; 00641 } 00642 } 00643 } 00644 /* disable ct_compaction flag while reading ARPA data */ 00645 for(i=0;i<n;i++) { 00646 ndata->d[i].ct_compaction = FALSE; 00647 } 00648 00649 /* read 1-gram data */ 00650 if (!strnmatch(buf,"\\1-grams",8)) { 00651 jlog("Error: ngram_read_arpa: data format error: 1-gram not found\n"); 00652 return FALSE; 00653 } 00654 jlog("Stat: ngram_read_arpa: reading 1-gram part...\n"); 00655 if (set_unigram(fp, ndata) == FALSE) return FALSE; 00656 00657 i = 2; 00658 while(i <= n) { 00659 /* read n-gram data in turn */ 00660 sprintf(pbuf, "\\%d-grams", i); 00661 if (!strnmatch(buf, pbuf, 8)) { 00662 jlog("Error: ngram_read_arpa: data format error: %d-gram not found\n", i); 00663 return FALSE; 00664 } 00665 jlog("Stat: ngram_read_arpa: reading %d-gram part...\n", i); 00666 if (set_ngram(fp, ndata, i) == FALSE) return FALSE; 00667 i++; 00668 } 00669 /* finished reading file */ 00670 if (!strnmatch(buf, "\\end", 4)) { 00671 jlog("Error: ngram_read_arpa: data format error: end marker \"\\end\" not found\n"); 00672 return FALSE; 00673 } 00674 00675 ndata->n = n; 00676 00677 for(i=2;i<n;i++) { 00678 if (ndata->d[i-1].bo_wt != NULL) { 00679 /* perform back-off compaction */ 00680 if (ngram_compact_context(ndata, i) == FALSE) return FALSE; 00681 } 00682 } 00683 00684 /* swap <s> and </s> for backward SRILM N-gram */ 00685 if (ndata->dir == DIR_RL) { 00686 WORD_ID bos, eos; 00687 char *p; 00688 bos = ngram_lookup_word(ndata, BEGIN_WORD_DEFAULT); 00689 
eos = ngram_lookup_word(ndata, END_WORD_DEFAULT); 00690 if (!ndata->bos_eos_swap) { 00691 /* check */ 00692 if (bos != WORD_INVALID && eos != WORD_INVALID && ndata->d[0].prob[bos] == -99) { 00693 jlog("Stat: \"P(%s) = -99\" in reverse N-gram, may be trained by SRILM\n", BEGIN_WORD_DEFAULT); 00694 jlog("Stat: going to swap \"%s\" and \"%s\"\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT); 00695 ndata->bos_eos_swap = TRUE; 00696 } 00697 } 00698 if (ndata->bos_eos_swap) { 00699 if (bos == WORD_INVALID) { 00700 jlog("Error: ngram_read_arpa: try to swap bos/eos but \"%s\" not found in N-gram\n", BEGIN_WORD_DEFAULT); 00701 } 00702 if (eos == WORD_INVALID) { 00703 jlog("Error: ngram_read_arpa: try to swap bos/eos but \"%s\" not found in N-gram\n", END_WORD_DEFAULT); 00704 } 00705 if (bos == WORD_INVALID || eos == WORD_INVALID) { 00706 return FALSE; 00707 } 00708 /* do swap */ 00709 jlog("Stat: ngram_read_arpa: swap \"%s\" and \"%s\" at backward N-gram\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT); 00710 /* swap name buffer */ 00711 p = ndata->wname[bos]; 00712 ndata->wname[bos] = ndata->wname[eos]; 00713 ndata->wname[eos] = p; 00714 /* replace index */ 00715 ptree_replace_data(BEGIN_WORD_DEFAULT, eos, ndata->root); 00716 ptree_replace_data(END_WORD_DEFAULT, bos, ndata->root); 00717 } 00718 } 00719 00720 } 00721 00722 #ifdef CLASS_NGRAM 00723 /* skip in-class word entries (they should be in word dictionary) */ 00724 if (getl(buf, sizeof(buf), fp) != NULL) { 00725 if (strnmatch(buf, "\\class", 6)) { 00726 jlog("Stat: ngram_read_arpa: skipping in-class word entries...\n"); 00727 } 00728 } 00729 #endif 00730 00731 bi_prob_func_set(ndata); 00732 00733 return TRUE; 00734 }