Julius 4.1.5
|
00001 00054 /* 00055 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University 00056 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology 00057 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology 00058 * All rights reserved 00059 */ 00060 00061 #include <sent/stddefs.h> 00062 #include <sent/ngram2.h> 00063 00064 static int file_version; 00065 static boolean need_swap; 00066 #ifdef WORDS_INT 00067 static boolean need_conv; 00068 static boolean words_int_retry = FALSE; 00069 #endif 00070 00075 #define rdn(A,B,C,D) if (rdnfunc(A,B,C,D) == FALSE) return FALSE 00076 #define rdn_wordid(A,B,C,D) if (rdn_wordid_func(A,B,C,D) == FALSE) return FALSE 00077 00085 static boolean 00086 rdnfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum) 00087 { 00088 size_t tmp; 00089 if ((tmp = myfread(buf, unitbyte, unitnum, fp)) < unitnum) { 00090 jlog("Error: ngram_read_bin: failed to read %d bytes\n", unitbyte*unitnum); 00091 return FALSE; 00092 } 00093 if (need_swap) { 00094 if (unitbyte != 1) { 00095 swap_bytes(buf, unitbyte, unitnum); 00096 } 00097 } 00098 return TRUE; 00099 } 00100 00101 #ifdef WORDS_INT 00102 00110 static boolean 00111 rdn_wordid_func(FILE *fp, void *buf, int unitnum, boolean need_conv) 00112 { 00113 int i; 00114 unsigned short *s; 00115 WORD_ID *t; 00116 WORD_ID d; 00117 00118 if (need_conv) { 00119 /* read unsigned short units */ 00120 rdn(fp, buf, sizeof(unsigned short), unitnum); 00121 /* convert them to WORD_ID (integer) */ 00122 for(i=unitnum-1;i>=0;i--) { 00123 s = (unsigned short *)buf + i; 00124 t = (WORD_ID *)buf + i; 00125 d = *s; 00126 *t = d; 00127 } 00128 } else { 00129 /* read as usual */ 00130 rdn(fp, buf, sizeof(WORD_ID), unitnum); 00131 } 00132 return TRUE; 00133 } 00134 #endif 00135 00141 static boolean 00142 check_header(FILE *fp) 00143 { 00144 char buf[BINGRAM_HDSIZE], *p; 00145 00146 rdn(fp, buf, 1, BINGRAM_HDSIZE); 00147 p = buf; 00148 #ifdef WORDS_INT 00149 need_conv = FALSE; 00150 #endif 00151 00152 /* version check */ 00153 if (strnmatch(p, BINGRAM_IDSTR, strlen(BINGRAM_IDSTR))) { 00154 /* bingram file made by mkbingram before 3.4.2 */ 00155 file_version = 3; 00156 p += strlen(BINGRAM_IDSTR) + 1; 00157 } else if (strnmatch(p, BINGRAM_IDSTR_V4, strlen(BINGRAM_IDSTR_V4))) { 00158 /* bingram file made by mkbingram later than 3.5 */ 00159 file_version = 4; 00160 p += strlen(BINGRAM_IDSTR_V4) + 1; 00161 } else if (strnmatch(p, BINGRAM_IDSTR_V5, strlen(BINGRAM_IDSTR_V5))) { 00162 /* bingram file made by JuliusLib-4 and later */ 00163 file_version = 5; 00164 p += strlen(BINGRAM_IDSTR_V5) + 1; 00165 } else { 00166 /* not a bingram file */ 00167 jlog("Error: ngram_read_bin: invalid header\n"); 00168 return FALSE; 00169 } 00170 /* word size check (for bingram build by mkbingram 3.3p5 and later */ 00171 if (strnmatch(p, BINGRAM_SIZESTR_HEAD, strlen(BINGRAM_SIZESTR_HEAD))) { 00172 p += strlen(BINGRAM_SIZESTR_HEAD); 00173 if (! strnmatch(p, BINGRAM_SIZESTR_BODY, strlen(BINGRAM_SIZESTR_BODY))) { 00174 /* word size does not match (int / short) */ 00175 #ifdef WORDS_INT 00176 if (strnmatch(p, BINGRAM_SIZESTR_BODY_2BYTE, strlen(BINGRAM_SIZESTR_BODY_2BYTE))) { 00177 /* this is 2-byte word ID, will convert while reading */ 00178 jlog("Warning: ngram_read_bin: 2-bytes bingram, converting to 4 bytes\n"); 00179 need_conv = TRUE; 00180 p += strlen(BINGRAM_SIZESTR_BODY_2BYTE) + 1; 00181 } else { 00182 jlog("Error: ngram_read_bin: unknown word byte size!\n"); 00183 return FALSE; 00184 } 00185 #else 00186 if (strnmatch(p, BINGRAM_SIZESTR_BODY_4BYTE, strlen(BINGRAM_SIZESTR_BODY_4BYTE))) { 00187 /*** 4bytes to 2bytes not implemented, just terminate here... ***/ 00188 jlog("Error: ngram_read_bin: cannot handle 4-bytes bingram\n"); 00189 jlog("Error: ngram_read_bin: please use Julius compiled with --enable-words-int\n"); 00190 return FALSE; 00191 //p += strlen(BINGRAM_SIZESTR_BODY_4BYTE) + 1; 00192 } else { 00193 jlog("Error: ngram_read_bin: unknown word byte size!\n"); 00194 return FALSE; 00195 } 00196 #endif 00197 } else { 00198 p += strlen(BINGRAM_SIZESTR_BODY) + 1; 00199 } 00200 00201 /* byte order check (v4 (rev.3.5) and later) */ 00202 if (file_version >= 4) { 00203 if (!strnmatch(p, BINGRAM_BYTEORDER_HEAD, strlen(BINGRAM_BYTEORDER_HEAD))) { 00204 jlog("Error: ngram_read_bin: no information for byte order??\n"); 00205 return FALSE; 00206 } 00207 p += strlen(BINGRAM_BYTEORDER_HEAD); 00208 if (! strnmatch(p, BINGRAM_NATURAL_BYTEORDER, strlen(BINGRAM_NATURAL_BYTEORDER))) { 00209 /* file endian and running endian is different, need swapping */ 00210 need_swap = TRUE; 00211 } else { 00212 need_swap = FALSE; 00213 } 00214 p += strlen(BINGRAM_NATURAL_BYTEORDER) + 1; 00215 } 00216 } /* if no BINGRAM_SIZESTR_HEAD found, just pass it */ 00217 00218 /* in case of V3 bingram file, the unit size of word_id and its byte order 00219 cannot be determined from the header. In that case, we assume 00220 byteorder to be a BIG ENDIAN. The word_id unit size (2byte in normal, 00221 or 4byte if bingram generated with mkbingram with --enable-words-int) 00222 will be automagically detected. 00223 */ 00224 00225 if (file_version < 4) { 00226 /* assume input as big endian */ 00227 #ifdef WORDS_BIGENDIAN 00228 need_swap = FALSE; 00229 #else 00230 need_swap = TRUE; 00231 #endif 00232 } 00233 00234 /*jlog("%s",buf);*/ 00235 00236 return TRUE; 00237 } 00238 00239 static boolean 00240 ngram_read_bin_v5(FILE *fp, NGRAM_INFO *ndata) 00241 { 00242 int i,n,len; 00243 char *w, *p; 00244 #ifdef WORDS_INT 00245 unsigned short *buf; 00246 #endif 00247 NGRAM_TUPLE_INFO *t; 00248 00249 /* read some info extended from version 5 */ 00250 rdn(fp, &(ndata->n), sizeof(int), 1); 00251 rdn(fp, &(ndata->dir), sizeof(int), 1); 00252 rdn(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1); 00253 00254 jlog("Stat: ngram_read_bin_v5: this is %s %d-gram file\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ndata->n); 00255 00256 /* read total info and set max_word_num */ 00257 ndata->d = (NGRAM_TUPLE_INFO *)mymalloc(sizeof(NGRAM_TUPLE_INFO) * ndata->n); 00258 memset(ndata->d, 0, sizeof(NGRAM_TUPLE_INFO) * ndata->n); 00259 for(n=0;n<ndata->n;n++) { 00260 rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1); 00261 } 00262 ndata->max_word_num = ndata->d[0].totalnum; 00263 00264 /* read wname */ 00265 rdn(fp, &len, sizeof(int), 1); 00266 w = mymalloc(len); 00267 rdn(fp, w, 1, len); 00268 /* assign... */ 00269 ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num); 00270 p = w; i = 0; 00271 while (p < w + len) { 00272 ndata->wname[i++] = p; 00273 while(*p != '\0') p++; 00274 p++; 00275 } 00276 if (i != ndata->max_word_num) { 00277 jlog("Error: ngram_read_bin_v5: wname error??\n"); 00278 return FALSE; 00279 } 00280 00281 /* read N-gram */ 00282 for(n=0;n<ndata->n;n++) { 00283 jlog("stat: ngram_read_bin_v5: reading %d-gram\n", n+1); 00284 00285 t = &(ndata->d[n]); 00286 00287 rdn(fp, &(t->is24bit), sizeof(boolean), 1); 00288 rdn(fp, &(t->ct_compaction), sizeof(boolean), 1); 00289 rdn(fp, &(t->bgnlistlen), sizeof(NNID), 1); 00290 rdn(fp, &(t->context_num), sizeof(NNID), 1); 00291 00292 if (n > 0) { 00293 if (t->is24bit) { 00294 t->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), t->bgnlistlen); 00295 rdn(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen); 00296 t->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), t->bgnlistlen); 00297 rdn(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen); 00298 } else { 00299 t->bgn = (NNID *)mymalloc_big(sizeof(NNID), t->bgnlistlen); 00300 rdn(fp, t->bgn, sizeof(NNID), t->bgnlistlen); 00301 } 00302 t->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->bgnlistlen); 00303 rdn(fp, t->num, sizeof(WORD_ID), t->bgnlistlen); 00304 t->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->totalnum); 00305 rdn(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum); 00306 } else { 00307 t->bgn_upper = NULL; 00308 t->bgn_lower = NULL; 00309 t->bgn = NULL; 00310 t->num = NULL; 00311 t->bgnlistlen = 0; 00312 t->nnid2wid = NULL; 00313 } 00314 00315 t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum); 00316 rdn(fp, t->prob, sizeof(LOGPROB), t->totalnum); 00317 00318 rdn(fp, &i, sizeof(int), 1); 00319 if (i == 1) { 00320 t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num); 00321 rdn(fp, t->bo_wt, sizeof(LOGPROB), t->context_num); 00322 } else { 00323 t->bo_wt = NULL; 00324 } 00325 rdn(fp, &i, sizeof(int), 1); 00326 if (i == 1) { 00327 t->nnid2ctid_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), t->totalnum); 00328 t->nnid2ctid_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), t->totalnum); 00329 rdn(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum); 00330 rdn(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum); 00331 } else { 00332 t->nnid2ctid_upper = NULL; 00333 t->nnid2ctid_lower = NULL; 00334 } 00335 } 00336 rdn(fp, &i, sizeof(int), 1); 00337 if (i == 1) { 00338 ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->d[0].context_num); 00339 rdn(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num); 00340 } else { 00341 ndata->bo_wt_1 = NULL; 00342 } 00343 rdn(fp, &i, sizeof(int), 1); 00344 if (i == 1) { 00345 ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->d[1].totalnum); 00346 rdn(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum); 00347 } else { 00348 ndata->p_2 = NULL; 00349 } 00350 00351 return TRUE; 00352 } 00353 00354 static boolean 00355 ngram_read_bin_compat(FILE *fp, NGRAM_INFO *ndata, int *retry_ret) 00356 { 00357 int i,n,len; 00358 char *w, *p; 00359 NNID *n3_bgn; 00360 NNID d, ntmp; 00361 #ifdef WORDS_INT 00362 unsigned short *buf; 00363 #endif 00364 NGRAM_TUPLE_INFO *t, *tt, *ttt; 00365 00366 /* old binary N-gram assumes these types */ 00367 ndata->bigram_index_reversed = TRUE; 00368 ndata->n = 3; 00369 ndata->dir = DIR_RL; 00370 00371 /* read total info and set max_word_num */ 00372 ndata->d = (NGRAM_TUPLE_INFO *)mymalloc(sizeof(NGRAM_TUPLE_INFO) * ndata->n); 00373 memset(ndata->d, 0, sizeof(NGRAM_TUPLE_INFO) * ndata->n); 00374 for(n=0;n<ndata->n;n++) { 00375 rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1); 00376 } 00377 ndata->max_word_num = ndata->d[0].totalnum; 00378 00379 if (file_version == 4) { 00380 rdn(fp, &(ndata->d[1].context_num), sizeof(NNID), 1); 00381 } 00382 00383 for(n=0;n<ndata->n;n++) { 00384 if (n < 2) { 00385 ndata->d[n].is24bit = FALSE; 00386 } else { 00387 if (ndata->d[n].totalnum >= NNID_MAX_24) { 00388 jlog("Warning: ngram_read_bin_compat: num of %d-gram exceeds 24bit, now switch to %dbit index\n", n+1, sizeof(NNID) * 8); 00389 ndata->d[n].is24bit = FALSE; 00390 } else { 00391 ndata->d[n].is24bit = TRUE; 00392 } 00393 } 00394 ndata->d[n].nnid2ctid_upper = NULL; 00395 ndata->d[n].nnid2ctid_lower = NULL; 00396 } 00397 /* always do back-off compaction for 3-gram and up */ 00398 /* mark 2-gram and up */ 00399 ndata->d[0].ct_compaction = FALSE; 00400 for(n=1;n<ndata->n;n++) { 00401 ndata->d[n].ct_compaction = TRUE; 00402 } 00403 00404 /* read wname */ 00405 rdn(fp, &len, sizeof(int), 1); 00406 w = mymalloc(len); 00407 rdn(fp, w, 1, len); 00408 /* assign... */ 00409 ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num); 00410 p = w; i = 0; 00411 while (p < w + len) { 00412 ndata->wname[i++] = p; 00413 while(*p != '\0') p++; 00414 p++; 00415 } 00416 if (i != ndata->max_word_num) { 00417 jlog("Error: ngram_read_bin_compat: wname error??\n"); 00418 return FALSE; 00419 } 00420 00421 /* malloc 1-gram */ 00422 t = &(ndata->d[0]); 00423 tt = &(ndata->d[1]); 00424 ttt = &(ndata->d[2]); 00425 00426 t->bgn_upper = NULL; 00427 t->bgn_lower = NULL; 00428 t->bgn = NULL; 00429 t->num = NULL; 00430 t->bgnlistlen = 0; 00431 t->nnid2wid = NULL; 00432 t->nnid2ctid_upper = NULL; 00433 t->nnid2ctid_lower = NULL; 00434 00435 t->context_num = t->totalnum; 00436 t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum); 00437 ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num); 00438 t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num); 00439 tt->bgnlistlen = t->context_num; 00440 tt->bgn = (NNID *)mymalloc_big(sizeof(NNID), tt->bgnlistlen); 00441 tt->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), tt->bgnlistlen); 00442 00443 /* read 1-gram */ 00444 jlog("stat: ngram_read_bin_compat: reading 1-gram\n"); 00445 rdn(fp, t->prob, sizeof(LOGPROB), t->totalnum); 00446 rdn(fp, ndata->bo_wt_1, sizeof(LOGPROB), t->context_num); 00447 rdn(fp, t->bo_wt, sizeof(LOGPROB), t->context_num); 00448 rdn(fp, tt->bgn, sizeof(NNID), tt->bgnlistlen); 00449 #ifdef WORDS_INT 00450 rdn_wordid(fp, tt->num, tt->bgnlistlen, need_conv); 00451 #else 00452 rdn(fp, tt->num, sizeof(WORD_ID), tt->bgnlistlen); 00453 #endif 00454 00455 #ifdef WORDS_INT 00456 { 00457 /* check if we are wrongly reading word_id=2byte bingram 00458 (if bingram version >= 4, this should not be happen because 00459 header correctly tells the word_id byte size. This will 00460 occur only if matches all the conditions below: 00461 - you run Julius with --enable-words-int, 00462 - you use old bingram of version <= 3, and 00463 - you use bingram file converted without --enable-words-int 00464 */ 00465 WORD_ID w; 00466 for(w=0;w<ndata->max_word_num;w++) { 00467 if (ndata->d[1].num[w] > ndata->max_word_num) { 00468 if (words_int_retry) { 00469 jlog("Error: ngram_read_bin_compat: retry failed, wrong bingram format\n"); 00470 return FALSE; 00471 } 00472 jlog("Warning: ngram_read_bin_compat: incorrect data, may be a 2-byte v3 bingram, retry with conversion\n"); 00473 free(ndata->wname[0]); 00474 free(ndata->wname); 00475 free(t->prob); 00476 free(ndata->bo_wt_1); 00477 free(t->bo_wt); 00478 free(tt->bgn); 00479 free(tt->num); 00480 myfrewind(fp); 00481 words_int_retry = TRUE; 00482 *retry_ret = 1; 00483 return FALSE; 00484 } 00485 } 00486 } 00487 #endif 00488 00489 /* malloc the rest */ 00490 tt->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), tt->totalnum); 00491 tt->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->totalnum); 00492 ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->totalnum); 00493 if (file_version == 4) { /* context compaction and 24bit */ 00494 tt->nnid2ctid_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), tt->totalnum); 00495 tt->nnid2ctid_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), tt->totalnum); 00496 tt->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->context_num); 00497 ttt->bgnlistlen = tt->context_num; 00498 ttt->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), ttt->bgnlistlen); 00499 ttt->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), ttt->bgnlistlen); 00500 ttt->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), ttt->bgnlistlen); 00501 } else { 00502 tt->context_num = tt->totalnum; 00503 tt->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->context_num); 00504 ttt->bgnlistlen = tt->context_num; 00505 ttt->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), ttt->bgnlistlen); 00506 if (ttt->is24bit) { 00507 ttt->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), ttt->bgnlistlen); 00508 ttt->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), ttt->bgnlistlen); 00509 n3_bgn = (NNID *)mymalloc_big(sizeof(NNID), ttt->bgnlistlen); 00510 } else { 00511 ttt->bgn = (NNID *)mymalloc_big(sizeof(NNID), ttt->bgnlistlen); 00512 } 00513 } 00514 00515 ttt->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), ttt->totalnum); 00516 ttt->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ttt->totalnum); 00517 ttt->bo_wt = NULL; 00518 00519 /* read 2-gram*/ 00520 jlog("Stat: ngram_read_bin_compat: reading 2-gram\n"); 00521 #ifdef WORDS_INT 00522 rdn_wordid(fp, tt->nnid2wid, tt->totalnum, need_conv); 00523 #else 00524 rdn(fp, tt->nnid2wid, sizeof(WORD_ID), tt->totalnum); 00525 #endif 00526 rdn(fp, ndata->p_2, sizeof(LOGPROB), tt->totalnum); 00527 rdn(fp, tt->prob, sizeof(LOGPROB), tt->totalnum); 00528 if (file_version == 4) { 00529 rdn(fp, tt->nnid2ctid_upper, sizeof(NNID_UPPER), tt->totalnum); 00530 rdn(fp, tt->nnid2ctid_lower, sizeof(NNID_LOWER), tt->totalnum); 00531 rdn(fp, tt->bo_wt, sizeof(LOGPROB), tt->context_num); 00532 rdn(fp, ttt->bgn_upper, sizeof(NNID_UPPER), ttt->bgnlistlen); 00533 rdn(fp, ttt->bgn_lower, sizeof(NNID_LOWER), ttt->bgnlistlen); 00534 #ifdef WORDS_INT 00535 rdn_wordid(fp, ttt->num, ttt->bgnlistlen, need_conv); 00536 #else 00537 rdn(fp, ttt->num, sizeof(WORD_ID), ttt->bgnlistlen); 00538 #endif 00539 } else { 00540 rdn(fp, tt->bo_wt, sizeof(LOGPROB), tt->context_num); 00541 if (ttt->is24bit) { 00542 rdn(fp, n3_bgn, sizeof(NNID), ttt->bgnlistlen); 00543 for(d=0;d<ttt->bgnlistlen;d++) { 00544 if (n3_bgn[d] == NNID_INVALID) { 00545 ttt->bgn_lower[d] = 0; 00546 ttt->bgn_upper[d] = NNID_INVALID_UPPER; 00547 } else { 00548 ntmp = n3_bgn[d] & 0xffff; 00549 ttt->bgn_lower[d] = ntmp; 00550 ntmp = n3_bgn[d] >> 16; 00551 ttt->bgn_upper[d] = ntmp; 00552 } 00553 } 00554 } else { 00555 rdn(fp, ttt->bgn, sizeof(NNID), ttt->bgnlistlen); 00556 } 00557 #ifdef WORDS_INT 00558 rdn_wordid(fp, ttt->num, ttt->bgnlistlen, need_conv); 00559 #else 00560 rdn(fp, ttt->num, sizeof(WORD_ID), ttt->bgnlistlen); 00561 #endif 00562 } 00563 00564 /* read 3-gram*/ 00565 jlog("Stat: ngram_read_bin_compat: reading 3-gram\n"); 00566 #ifdef WORDS_INT 00567 rdn_wordid(fp, ttt->nnid2wid, ttt->totalnum, need_conv); 00568 #else 00569 rdn(fp, ttt->nnid2wid, sizeof(WORD_ID), ttt->totalnum); 00570 #endif 00571 rdn(fp, ttt->prob, sizeof(LOGPROB), ttt->totalnum); 00572 00573 /* compact the 2-gram back-off and 3-gram links */ 00574 if (file_version != 4) { 00575 if (ttt->is24bit) { 00576 free(n3_bgn); 00577 if (ngram_compact_context(ndata, 2) == FALSE) return FALSE; 00578 } 00579 } 00580 00581 return TRUE; 00582 } 00583 00584 00593 boolean 00594 ngram_read_bin(FILE *fp, NGRAM_INFO *ndata) 00595 { 00596 int retry; 00597 00598 #ifdef WORDS_INT 00599 /* reset retry flag */ 00600 words_int_retry = FALSE; 00601 /* when retrying, it restarts from here with words_int_retry = TRUE */ 00602 ngram_read_bin_start: 00603 #endif 00604 00605 ndata->from_bin = TRUE; 00606 00607 /* check initial header */ 00608 if (check_header(fp) == FALSE) return FALSE; 00609 00610 #ifdef WORDS_INT 00611 /* in retry mode, force word_id conversion */ 00612 if (words_int_retry) need_conv = TRUE; 00613 #endif 00614 00615 #ifdef WORDS_INT 00616 if (need_conv) jlog("Stat: ngram_read_bin: word-id size conversion enabled\n"); 00617 #endif 00618 00619 if (file_version <= 4) { 00620 retry = 0; 00621 if (ngram_read_bin_compat(fp, ndata, &retry) == FALSE) { 00622 #ifdef WORDS_INT 00623 if (retry == 1) { 00624 goto ngram_read_bin_start; 00625 } else { 00626 return FALSE; 00627 } 00628 #else 00629 return FALSE; 00630 #endif 00631 } 00632 } else { 00633 if (ngram_read_bin_v5(fp, ndata) == FALSE) return FALSE; 00634 } 00635 00636 00637 /* make word search tree for later lookup */ 00638 jlog("Stat: ngram_read_bin: making entry name index\n"); 00639 ngram_make_lookup_tree(ndata); 00640 00641 bi_prob_func_set(ndata); 00642 00643 return TRUE; 00644 } 00645