00001
00053
00054
00055
00056
00057
00058
00059
00060 #include <sent/stddefs.h>
00061 #include <sent/ngram2.h>
00062
00063 static int file_version;  /* bingram format version of the file being read; check_header() sets it to 3 or 4 */
00064 static boolean need_swap; /* when TRUE, rdn() byte-swaps every unit wider than one byte after reading it */
00065 #ifdef WORDS_INT
00066 static boolean need_conv; /* when TRUE, word IDs are read as unsigned short and widened to WORD_ID */
00067 static boolean words_int_retry = FALSE; /* set by ngram_read_bin() when it re-reads the file with need_conv forced on */
00068 #endif
00069
00078 static void
00079 rdn(FILE *fp, void *buf, size_t unitbyte, int unitnum)
00080 {
00081 size_t tmp;
00082 if ((tmp = myfread(buf, unitbyte, unitnum, fp)) < (size_t)unitnum) {
00083 perror("ngram_read_bin");
00084 j_error("read failed\n");
00085 }
00086 if (need_swap) {
00087 if (unitbyte != 1) {
00088 swap_bytes(buf, unitbyte, unitnum);
00089 }
00090 }
00091
00092 }
00093
00094 #ifdef WORDS_INT
00095
00103 static void
00104 rdn_wordid(FILE *fp, void *buf, int unitnum, boolean need_conv)
00105 {
00106 int i;
00107 unsigned short *s;
00108 WORD_ID *t;
00109 WORD_ID d;
00110
00111 if (need_conv) {
00112
00113 rdn(fp, buf, sizeof(unsigned short), unitnum);
00114
00115 for(i=unitnum-1;i>=0;i--) {
00116 s = (unsigned short *)buf + i;
00117 t = (WORD_ID *)buf + i;
00118 d = *s;
00119 *t = d;
00120 }
00121 } else {
00122
00123 rdn(fp, buf, sizeof(WORD_ID), unitnum);
00124 }
00125 }
00126 #endif
00127
00133 static void
00134 check_header(FILE *fp)
00135 {
00136 char buf[BINGRAM_HDSIZE], *p;
00137 rdn(fp, buf, 1, BINGRAM_HDSIZE);
00138
00139 p = buf;
00140 #ifdef WORDS_INT
00141 need_conv = FALSE;
00142 #endif
00143
00144
00145 if (strnmatch(p, BINGRAM_IDSTR, strlen(BINGRAM_IDSTR))) {
00146
00147 file_version = 3;
00148 p += strlen(BINGRAM_IDSTR) + 1;
00149 } else if (strnmatch(p, BINGRAM_IDSTR_V4, strlen(BINGRAM_IDSTR_V4))) {
00150
00151 file_version = 4;
00152 p += strlen(BINGRAM_IDSTR_V4) + 1;
00153 } else {
00154
00155 j_printerr("Error: invalid header, you probably use an old bingram\n");
00156 j_error("Error: if so, please re-make with newer mkbingram that comes with Julius-2.0 or later\n");
00157 }
00158
00159 if (strnmatch(p, BINGRAM_SIZESTR_HEAD, strlen(BINGRAM_SIZESTR_HEAD))) {
00160 p += strlen(BINGRAM_SIZESTR_HEAD);
00161 if (! strnmatch(p, BINGRAM_SIZESTR_BODY, strlen(BINGRAM_SIZESTR_BODY))) {
00162
00163 #ifdef WORDS_INT
00164 if (strnmatch(p, BINGRAM_SIZESTR_BODY_2BYTE, strlen(BINGRAM_SIZESTR_BODY_2BYTE))) {
00165
00166 j_printerr("\nWarning: 2-bytes bingram, converting to 4 bytes\n");
00167 need_conv = TRUE;
00168 p += strlen(BINGRAM_SIZESTR_BODY_2BYTE) + 1;
00169 } else {
00170 j_error("\nError: unknown word byte size!\n");
00171 }
00172 #else
00173 if (strnmatch(p, BINGRAM_SIZESTR_BODY_4BYTE, strlen(BINGRAM_SIZESTR_BODY_4BYTE))) {
00174
00175 j_printerr("\nError: cannot handle 4-bytes bingram\n");
00176 j_error("Error: please use Julius compiled with --enable-words-int\n");
00177
00178 } else {
00179 j_error("\nError: unknown word byte size!\n");
00180 }
00181 #endif
00182 } else {
00183 p += strlen(BINGRAM_SIZESTR_BODY) + 1;
00184 }
00185
00186
00187 if (file_version == 4) {
00188 if (!strnmatch(p, BINGRAM_BYTEORDER_HEAD, strlen(BINGRAM_BYTEORDER_HEAD))) {
00189 j_error("\nError: no information for byte order??\n");
00190 }
00191 p += strlen(BINGRAM_BYTEORDER_HEAD);
00192 if (! strnmatch(p, BINGRAM_NATURAL_BYTEORDER, strlen(BINGRAM_NATURAL_BYTEORDER))) {
00193
00194 need_swap = TRUE;
00195 } else {
00196 need_swap = FALSE;
00197 }
00198 p += strlen(BINGRAM_NATURAL_BYTEORDER) + 1;
00199 }
00200 }
00201
00202
00203
00204
00205
00206
00207
00208
00209 if (file_version != 4) {
00210
00211 #ifdef WORDS_BIGENDIAN
00212 need_swap = FALSE;
00213 #else
00214 need_swap = TRUE;
00215 #endif
00216 }
00217
00218
00219 }
00220
00229 boolean
00230 ngram_read_bin(FILE *fp, NGRAM_INFO *ndata)
00231 {
00232 int i,n,len;
00233 char *w, *p;
00234 NNID *n3_bgn;
00235 NNID d, ntmp;
00236 #ifdef WORDS_INT
00237 unsigned short *buf;
00238 #endif
00239
00240 #ifdef WORDS_INT
00241
00242 words_int_retry = FALSE;
00243
00244
00245 ngram_read_bin_start:
00246
00247 #endif
00248
00249 ndata->from_bin = TRUE;
00250
00251
00252 check_header(fp);
00253
00254 #ifdef WORDS_INT
00255
00256 if (words_int_retry) need_conv = TRUE;
00257 #endif
00258
00259 #ifdef WORDS_INT
00260 if (need_conv) j_printerr("(wordid conv)..");
00261 #endif
00262
00263
00264 for(n=0;n<MAX_N;n++) {
00265 rdn(fp, &(ndata->ngram_num[n]), sizeof(NNID), 1);
00266 if (file_version == 4 && ndata->ngram_num[n] >= NNIDMAX) {
00267 j_error("Error: too big %d-gram (%d, should be less than %d)\n", n+1, ndata->ngram_num[n], NNIDMAX);
00268 }
00269 }
00270 ndata->max_word_num = ndata->ngram_num[0];
00271 if (file_version == 4) rdn(fp, &(ndata->bigram_bo_num), sizeof(NNID), 1);
00272
00273
00274 switch(file_version) {
00275 case 4:
00276 ndata->version = 4;
00277 break;
00278 case 3:
00279 if (ndata->ngram_num[2] >= NNIDMAX) {
00280 j_printerr("Warning: more than %d 3-gram tuples, use old structure\n", NNIDMAX);
00281 ndata->version = 3;
00282 } else {
00283 ndata->version = 4;
00284 }
00285 break;
00286 }
00287
00288
00289 rdn(fp, &len, sizeof(int), 1);
00290 w = mymalloc(len);
00291 rdn(fp, w, 1, len);
00292
00293 ndata->wname = (char **)mymalloc(sizeof(char *)*ndata->ngram_num[0]);
00294 p = w; i = 0;
00295 while (p < w + len) {
00296 ndata->wname[i++] = p;
00297 while(*p != '\0') p++;
00298 p++;
00299 }
00300 if (i != ndata->ngram_num[0]) {
00301 j_error("wname error??\n");
00302 }
00303
00304 ndata->p = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]);
00305 ndata->bo_wt_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]);
00306 ndata->bo_wt_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]);
00307 ndata->n2_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[0]);
00308 ndata->n2_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[0]);
00309
00310
00311 j_printerr("1-gram.");
00312 rdn(fp, ndata->p, sizeof(LOGPROB), ndata->ngram_num[0]);
00313 j_printerr(".");
00314 rdn(fp, ndata->bo_wt_lr, sizeof(LOGPROB), ndata->ngram_num[0]);
00315 j_printerr(".");
00316 rdn(fp, ndata->bo_wt_rl, sizeof(LOGPROB), ndata->ngram_num[0]);
00317 j_printerr(".");
00318 rdn(fp, ndata->n2_bgn, sizeof(NNID), ndata->ngram_num[0]);
00319 j_printerr(".");
00320 #ifdef WORDS_INT
00321 rdn_wordid(fp, ndata->n2_num, ndata->ngram_num[0], need_conv);
00322 #else
00323 rdn(fp, ndata->n2_num, sizeof(WORD_ID), ndata->ngram_num[0]);
00324 #endif
00325
00326 #ifdef WORDS_INT
00327 {
00328
00329
00330
00331
00332
00333
00334
00335
00336 WORD_ID w;
00337 for(w=0;w<ndata->ngram_num[0];w++) {
00338 if (ndata->n2_num[w] > ndata->ngram_num[0]) {
00339 if (words_int_retry) {
00340 j_error("\nError: retry failed, wrong bingram format\n");
00341 }
00342 j_printerr("\nWarning: incorrect data, may be a 2-byte v3 bingram, retry with converion\n");
00343 free(ndata->wname[0]);
00344 free(ndata->wname);
00345 free(ndata->p);
00346 free(ndata->bo_wt_lr);
00347 free(ndata->bo_wt_rl);
00348 free(ndata->n2_bgn);
00349 free(ndata->n2_num);
00350 myfrewind(fp);
00351 words_int_retry = TRUE;
00352 goto ngram_read_bin_start;
00353 }
00354 }
00355 }
00356 #endif
00357
00358
00359 ndata->n2tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[1]);
00360 ndata->p_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]);
00361 ndata->p_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]);
00362 if (file_version == 4) {
00363 ndata->n2bo_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]);
00364 ndata->n2bo_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]);
00365 ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->bigram_bo_num);
00366 ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->bigram_bo_num);
00367 ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->bigram_bo_num);
00368 ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->bigram_bo_num);
00369 } else {
00370 ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]);
00371 ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[1]);
00372 if (ndata->version == 4) {
00373 ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]);
00374 ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]);
00375 n3_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[1]);
00376 } else {
00377 ndata->n3_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[1]);
00378 }
00379 }
00380
00381 ndata->n3tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[2]);
00382 ndata->p_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[2]);
00383
00384
00385 j_printerr("2-gram.");
00386 #ifdef WORDS_INT
00387 rdn_wordid(fp, ndata->n2tonid, ndata->ngram_num[1], need_conv);
00388 #else
00389 rdn(fp, ndata->n2tonid, sizeof(WORD_ID), ndata->ngram_num[1]);
00390 #endif
00391 j_printerr(".");
00392 rdn(fp, ndata->p_lr, sizeof(LOGPROB), ndata->ngram_num[1]);
00393 j_printerr(".");
00394 rdn(fp, ndata->p_rl, sizeof(LOGPROB), ndata->ngram_num[1]);
00395 j_printerr(".");
00396 if (file_version == 4) {
00397 rdn(fp, ndata->n2bo_upper, sizeof(NNID_UPPER), ndata->ngram_num[1]);
00398 rdn(fp, ndata->n2bo_lower, sizeof(NNID_LOWER), ndata->ngram_num[1]);
00399 j_printerr(".");
00400 rdn(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->bigram_bo_num);
00401 j_printerr(".");
00402 rdn(fp, ndata->n3_bgn_upper, sizeof(NNID_UPPER), ndata->bigram_bo_num);
00403 rdn(fp, ndata->n3_bgn_lower, sizeof(NNID_LOWER), ndata->bigram_bo_num);
00404 j_printerr(".");
00405 #ifdef WORDS_INT
00406 rdn_wordid(fp, ndata->n3_num, ndata->bigram_bo_num, need_conv);
00407 #else
00408 rdn(fp, ndata->n3_num, sizeof(WORD_ID), ndata->bigram_bo_num);
00409 #endif
00410 } else {
00411 rdn(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->ngram_num[1]);
00412 j_printerr(".");
00413 if (ndata->version == 4) {
00414 rdn(fp, n3_bgn, sizeof(NNID), ndata->ngram_num[1]);
00415 for(d=0;d<ndata->ngram_num[1];d++) {
00416 if (n3_bgn[d] == NNID_INVALID) {
00417 ndata->n3_bgn_lower[d] = 0;
00418 ndata->n3_bgn_upper[d] = NNID_INVALID_UPPER;
00419 } else {
00420 ntmp = n3_bgn[d] & 0xffff;
00421 ndata->n3_bgn_lower[d] = ntmp;
00422 ntmp = n3_bgn[d] >> 16;
00423 ndata->n3_bgn_upper[d] = ntmp;
00424 }
00425 }
00426 } else {
00427 rdn(fp, ndata->n3_bgn, sizeof(NNID), ndata->ngram_num[1]);
00428 }
00429 j_printerr(".");
00430 #ifdef WORDS_INT
00431 rdn_wordid(fp, ndata->n3_num, ndata->ngram_num[1], need_conv);
00432 #else
00433 rdn(fp, ndata->n3_num, sizeof(WORD_ID), ndata->ngram_num[1]);
00434 #endif
00435 }
00436
00437
00438 j_printerr("3-gram.");
00439 #ifdef WORDS_INT
00440 rdn_wordid(fp, ndata->n3tonid, ndata->ngram_num[2], need_conv);
00441 #else
00442 rdn(fp, ndata->n3tonid, sizeof(WORD_ID), ndata->ngram_num[2]);
00443 #endif
00444 j_printerr(".");
00445 rdn(fp, ndata->p_rrl, sizeof(LOGPROB), ndata->ngram_num[2]);
00446
00447
00448 j_printerr("indexing...");
00449 ngram_make_lookup_tree(ndata);
00450
00451
00452 if (file_version != 4 && ndata->version == 4) {
00453 free(n3_bgn);
00454 ngram_compact_bigram_context(ndata);
00455 }
00456
00457
00458 set_unknown_id(ndata);
00459
00460 return TRUE;
00461 }