Julius 4.1.5
|
00001 00159 /* 00160 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University 00161 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology 00162 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology 00163 * All rights reserved 00164 */ 00165 00166 #include <julius/julius.h> 00167 00168 /*----------------------------------------------------------------------*/ 00169 00190 static void 00191 add_successor(WCHMM_INFO *wchmm, int node, WORD_ID w) 00192 { 00193 S_CELL *sctmp, *sc; 00194 00195 /* malloc a new successor list element */ 00196 sctmp=(S_CELL *) mymalloc(sizeof(S_CELL)); 00197 /* assign word ID to the new element */ 00198 sctmp->word = w; 00199 /* add the new element to existing list (keeping order) */ 00200 if (wchmm->state[node].scid == 0) { 00201 j_internal_error("add_successor: sclist id not assigned to branch node?\n"); 00202 } 00203 sc = wchmm->sclist[wchmm->state[node].scid]; 00204 if (sc == NULL || sctmp->word < sc->word) { 00205 sctmp->next = sc; 00206 wchmm->sclist[wchmm->state[node].scid] = sctmp; 00207 } else { 00208 for(;sc;sc=sc->next) { 00209 if (sc->next == NULL || sctmp->word < (sc->next)->word) { 00210 if (sctmp->word == sc->word) break; /* avoid duplication */ 00211 sctmp->next = sc->next; 00212 sc->next = sctmp; 00213 break; 00214 } 00215 } 00216 } 00217 } 00218 00239 static boolean 00240 match_successor(WCHMM_INFO *wchmm, int node1, int node2) 00241 { 00242 S_CELL *sc1,*sc2; 00243 00244 /* assume successor is sorted by ID */ 00245 if (wchmm->state[node1].scid == 0 || wchmm->state[node2].scid == 0) { 00246 j_internal_error("match_successor: sclist id not assigned to branch node?\n"); 00247 } 00248 sc1 = wchmm->sclist[wchmm->state[node1].scid]; 00249 sc2 = wchmm->sclist[wchmm->state[node2].scid]; 00250 for (;;) { 00251 if (sc1 == NULL || sc2 == NULL) { 00252 if (sc1 == NULL && sc2 == NULL) { 00253 return TRUE; 00254 } else { 00255 return FALSE; 00256 } 00257 } else if (sc1->word != sc2->word) { 00258 return FALSE; 00259 } 00260 sc1 = sc1->next; 00261 sc2 = sc2->next; 00262 } 00263 } 00264 00279 static void 00280 free_successor(WCHMM_INFO *wchmm, int scid) 00281 { 00282 S_CELL *sc; 00283 S_CELL *sctmp; 00284 00285 /* free sclist */ 00286 sc = wchmm->sclist[scid]; 00287 while (sc != NULL) { 00288 sctmp = sc; 00289 sc = sc->next; 00290 free(sctmp); 00291 } 00292 } 00293 00308 static void 00309 compaction_successor(WCHMM_INFO *wchmm) 00310 { 00311 int src, dst; 00312 00313 dst = 1; 00314 for(src=1;src<wchmm->scnum;src++) { 00315 if (wchmm->state[wchmm->sclist2node[src]].scid <= 0) { 00316 /* already freed, skip */ 00317 continue; 00318 } 00319 if (dst != src) { 00320 wchmm->sclist[dst] = wchmm->sclist[src]; 00321 wchmm->sclist2node[dst] = wchmm->sclist2node[src]; 00322 wchmm->state[wchmm->sclist2node[dst]].scid = dst; 00323 } 00324 dst++; 00325 } 00326 if (debug2_flag) { 00327 jlog("DEBUG: successor list shrinked from %d to %d\n", wchmm->scnum, dst); 00328 } 00329 wchmm->scnum = dst; 00330 } 00331 00346 static void 00347 shrink_successor(WCHMM_INFO *wchmm) 00348 { 00349 if (wchmm->sclist) { 00350 wchmm->sclist = (S_CELL **)myrealloc(wchmm->sclist, sizeof(S_CELL *) * wchmm->scnum); 00351 } 00352 if (wchmm->sclist2node) { 00353 wchmm->sclist2node = (int *)myrealloc(wchmm->sclist2node, sizeof(int) * wchmm->scnum); 00354 } 00355 } 00356 00373 void 00374 make_successor_list(WCHMM_INFO *wchmm) 00375 { 00376 int node; 00377 WORD_ID w; 00378 int i; 00379 boolean *freemark; 00380 int s; 00381 00382 jlog("STAT: make successor lists for factoring\n"); 00383 00384 /* 1. initialize */ 00385 /* initialize node->sclist index on wchmm tree */ 00386 for (node=0;node<wchmm->n;node++) wchmm->state[node].scid = 0; 00387 00388 /* parse the tree to get the maximum size of successor list */ 00389 s = 1; 00390 for (w=0;w<wchmm->winfo->num;w++) { 00391 for (i=0;i<wchmm->winfo->wlen[w];i++) { 00392 if (wchmm->state[wchmm->offset[w][i]].scid == 0) { 00393 wchmm->state[wchmm->offset[w][i]].scid = s; 00394 s++; 00395 } 00396 } 00397 if (wchmm->state[wchmm->wordend[w]].scid == 0) { 00398 wchmm->state[wchmm->wordend[w]].scid = s; 00399 s++; 00400 } 00401 } 00402 wchmm->scnum = s; 00403 if (debug2_flag) { 00404 jlog("DEBUG: initial successor list size = %d\n", wchmm->scnum); 00405 } 00406 00407 /* allocate successor list for the maximum size */ 00408 wchmm->sclist = (S_CELL **)mymalloc(sizeof(S_CELL *) * wchmm->scnum); 00409 for (i=1;i<wchmm->scnum;i++) wchmm->sclist[i] = NULL; 00410 wchmm->sclist2node = (int *)mymalloc(sizeof(int) * wchmm->scnum); 00411 00412 /* allocate misc. work area */ 00413 freemark = (boolean *)mymalloc(sizeof(boolean) * wchmm->scnum); 00414 for (i=1;i<wchmm->scnum;i++) freemark[i] = FALSE; 00415 00416 /* 2. make initial successor list: assign at all possible nodes */ 00417 for (w=0;w<wchmm->winfo->num;w++) { 00418 /* at each start node of phonemes */ 00419 for (i=0;i<wchmm->winfo->wlen[w];i++) { 00420 wchmm->sclist2node[wchmm->state[wchmm->offset[w][i]].scid] = wchmm->offset[w][i]; 00421 add_successor(wchmm, wchmm->offset[w][i], w); 00422 } 00423 /* at word end */ 00424 wchmm->sclist2node[wchmm->state[wchmm->wordend[w]].scid] = wchmm->wordend[w]; 00425 add_successor(wchmm, wchmm->wordend[w], w); 00426 } 00427 00428 /* 3. erase unnecessary successor list */ 00429 /* sucessor list same as the previous node is not needed, so */ 00430 /* parse lexicon tree from every leaf to find the same succesor list */ 00431 for (w=0;w<wchmm->winfo->num;w++) { 00432 node = wchmm->wordend[w]; /* begin from the word end node */ 00433 i = wchmm->winfo->wlen[w]-1; 00434 while (i >= 0) { /* for each phoneme start node */ 00435 if (node == wchmm->offset[w][i]) { 00436 /* word with only 1 state: skip */ 00437 i--; 00438 continue; 00439 } 00440 if (match_successor(wchmm, node, wchmm->offset[w][i])) { 00441 freemark[wchmm->state[node].scid] = TRUE; /* mark the node */ 00442 } 00443 /* 00444 * if (freemark[wchmm->offset[w][i]] != FALSE) { 00445 * break; 00446 * } 00447 */ 00448 node = wchmm->offset[w][i]; 00449 i--; 00450 } 00451 } 00452 /* really free */ 00453 for (i=1;i<wchmm->scnum;i++) { 00454 if (freemark[i] == TRUE) { 00455 free_successor(wchmm, i); 00456 /* reset node -> sclist link */ 00457 wchmm->state[wchmm->sclist2node[i]].scid = 0; 00458 } 00459 } 00460 /* garbage collection of deleted sclist */ 00461 compaction_successor(wchmm); 00462 00463 free(freemark); 00464 00465 jlog("STAT: done\n"); 00466 } 00467 00468 #ifdef UNIGRAM_FACTORING 00469 00486 void 00487 make_successor_list_unigram_factoring(WCHMM_INFO *wchmm) 00488 { 00489 00490 #ifndef FAST_FACTOR1_SUCCESSOR_LIST 00491 00492 /* old way */ 00493 make_successor_list(wchmm); 00494 calc_all_unigram_factoring_values(wchmm); 00495 00496 #else /* ~FAST_FACTOR1_SUCCESSOR_LIST */ 00497 00498 /* new way */ 00499 00500 int node, node2; 00501 WORD_ID w, w2; 00502 int i, j, n, f; 00503 int s; 00504 LOGPROB tmpprob; 00505 00506 jlog("STAT: make successor lists for unigram factoring\n"); 00507 00508 /* 1. initialize */ 00509 /* initialize node->sclist index on wchmm tree */ 00510 for (node=0;node<wchmm->n;node++) wchmm->state[node].scid = 0; 00511 00512 /* in unigram factoring, number of successor = vocabulary size */ 00513 wchmm->scnum = wchmm->winfo->num + 1; 00514 if (debug2_flag) { 00515 jlog("DEBUG: successor list size = %d\n", wchmm->scnum); 00516 } 00517 00518 /* allocate successor list */ 00519 wchmm->sclist = (S_CELL **)mymalloc(sizeof(S_CELL *) * wchmm->scnum); 00520 for (i=1;i<wchmm->scnum;i++) wchmm->sclist[i] = NULL; 00521 /* sclist2node is not used */ 00522 00523 /* 2. make successor list, and count needed fscore num */ 00524 f = 1; 00525 s = 1; 00526 for (w=0;w<wchmm->winfo->num;w++) { 00527 for (i=0;i<wchmm->winfo->wlen[w] + 1;i++) { 00528 if (i < wchmm->winfo->wlen[w]) { 00529 node = wchmm->offset[w][i]; 00530 } else { 00531 node = wchmm->wordend[w]; 00532 } 00533 if (wchmm->state[node].scid == 0) { /* not assigned */ 00534 /* new node found, assign new and exit here */ 00535 wchmm->state[node].scid = s++; 00536 if (s > wchmm->scnum) { 00537 jlog("InternalError: make_successor_list_unigram_factoring: scid num exceeded?\n"); 00538 return; 00539 } 00540 add_successor(wchmm, node, w); 00541 break; 00542 } else if (wchmm->state[node].scid > 0) { 00543 /* that node has sclist */ 00544 /* move it to the current first isolated node in that word */ 00545 w2 = wchmm->sclist[wchmm->state[node].scid]->word; 00546 for(j=i+1;j<wchmm->winfo->wlen[w2] + 1;j++) { 00547 if (j < wchmm->winfo->wlen[w2]) { 00548 node2 = wchmm->offset[w2][j]; 00549 } else { 00550 node2 = wchmm->wordend[w2]; 00551 } 00552 if (wchmm->state[node2].scid == 0) { /* not assigned */ 00553 /* move sclist to there */ 00554 wchmm->state[node2].scid = wchmm->state[node].scid; 00555 break; 00556 } 00557 } 00558 if (j >= wchmm->winfo->wlen[w2] + 1) { 00559 /* not found? */ 00560 jlog("InternalError: make_successor_list_unigram_factoring: no isolated word for %d\n", w2); 00561 return; 00562 } 00563 /* make current node as fscore node */ 00564 n = f++; 00565 wchmm->state[node].scid = -n; 00566 /* not compute unigram factoring value yet */ 00567 } 00568 00569 } 00570 } 00571 00572 /* 2. allocate fscore buffer */ 00573 wchmm->fsnum = f; 00574 wchmm->fscore = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->fsnum); 00575 for(n=0;n<wchmm->fsnum;n++) wchmm->fscore[n] = LOG_ZERO; 00576 00577 /* 3. parse again to assign fscore values */ 00578 for (w=0;w<wchmm->winfo->num;w++) { 00579 for (i=0;i<wchmm->winfo->wlen[w] + 1;i++) { 00580 if (i < wchmm->winfo->wlen[w]) { 00581 node = wchmm->offset[w][i]; 00582 } else { 00583 node = wchmm->wordend[w]; 00584 } 00585 if (wchmm->state[node].scid < 0) { 00586 /* update max */ 00587 if (wchmm->ngram) { 00588 tmpprob = uni_prob(wchmm->ngram, wchmm->winfo->wton[w]) 00589 #ifdef CLASS_NGRAM 00590 + wchmm->winfo->cprob[w] 00591 #endif 00592 ; 00593 } else { 00594 tmpprob = LOG_ZERO; 00595 } 00596 if (wchmm->lmvar == LM_NGRAM_USER) { 00597 tmpprob = (*(wchmm->uni_prob_user))(wchmm->winfo, w, tmpprob); 00598 } 00599 n = - wchmm->state[node].scid; 00600 if (wchmm->fscore[n] < tmpprob) { 00601 wchmm->fscore[n] = tmpprob; 00602 } 00603 } 00604 00605 } 00606 } 00607 00608 #endif /* ~FAST_FACTOR1_SUCCESSOR_LIST */ 00609 00610 jlog("STAT: done\n"); 00611 } 00612 00613 #endif /* UNIGRAM_FACTORING */ 00614 00615 00634 void 00635 adjust_sc_index(WCHMM_INFO *wchmm) 00636 { 00637 WORD_ID w; 00638 int i,j,k; 00639 HMM_Logical *ltmp; 00640 int ltmp_state_num; 00641 int ato; 00642 LOGPROB prob; 00643 int node, scid; 00644 A_CELL2 *ac; 00645 00646 /* duplicate scid for HMMs with more than one arc from initial state */ 00647 for(w=0;w<wchmm->winfo->num;w++) { 00648 for(k=0;k<wchmm->winfo->wlen[w];k++) { 00649 node = wchmm->offset[w][k]; 00650 scid = wchmm->state[node].scid; 00651 if (scid == 0) continue; 00652 ltmp = wchmm->winfo->wseq[w][k]; 00653 ltmp_state_num = hmm_logical_state_num(ltmp); 00654 if ((hmm_logical_trans(ltmp))->a[0][ltmp_state_num-1] != LOG_ZERO) { 00655 j = k + 1; 00656 if (j == wchmm->winfo->wlen[w]) { 00657 if (wchmm->state[wchmm->wordend[w]].scid == 0) { 00658 jlog("STAT: word %d: factoring node copied for skip phone\n", w); 00659 wchmm->state[wchmm->wordend[w]].scid = scid; 00660 } 00661 } else { 00662 if (wchmm->state[wchmm->offset[w][j]].scid == 0) { 00663 jlog("STAT: word %d: factoring node copied for skip phone\n", w); 00664 wchmm->state[wchmm->offset[w][j]].scid = scid; 00665 } 00666 } 00667 } 00668 for(ato=1;ato<ltmp_state_num;ato++) { 00669 prob = (hmm_logical_trans(ltmp))->a[0][ato]; 00670 if (prob != LOG_ZERO) { 00671 wchmm->state[node+ato-1].scid = scid; 00672 } 00673 } 00674 } 00675 } 00676 00677 /* move scid and fscore on the head state to the head grammar state */ 00678 for(i=0;i<wchmm->startnum;i++) { 00679 node = wchmm->startnode[i]; 00680 if (wchmm->state[node].out.state != NULL) { 00681 j_internal_error("adjust_sc_index: outprob exist in word-head node??\n"); 00682 } 00683 if (wchmm->next_a[node] != LOG_ZERO) { 00684 if (wchmm->state[node+1].scid != 0) { 00685 if (wchmm->state[node].scid != 0 && wchmm->state[node].scid != wchmm->state[node+1].scid) { 00686 j_internal_error("adjust_sc_index: different successor list within word-head phone?\n"); 00687 } 00688 wchmm->state[node].scid = wchmm->state[node+1].scid; 00689 wchmm->state[node+1].scid = 0; 00690 } 00691 } 00692 for(ac=wchmm->ac[node];ac;ac=ac->next) { 00693 for(k=0;k<ac->n;k++) { 00694 if (wchmm->state[ac->arc[k]].scid != 0) { 00695 if (wchmm->state[node].scid != 0 && wchmm->state[node].scid != wchmm->state[ac->arc[k]].scid) { 00696 j_internal_error("adjust_sc_index: different successor list within word-head phone?\n"); 00697 } 00698 wchmm->state[node].scid = wchmm->state[ac->arc[k]].scid; 00699 wchmm->state[ac->arc[k]].scid = 0; 00700 } 00701 } 00702 } 00703 } 00704 } 00705 00706 00707 /* -------------------------------------------------------------------- */ 00708 /* factoring computation */ 00709 00728 void 00729 max_successor_cache_init(WCHMM_INFO *wchmm) 00730 { 00731 int i; 00732 LM_PROB_CACHE *l; 00733 WORD_ID wnum; 00734 00735 /* finally shrink the memory area of successor list here */ 00736 shrink_successor(wchmm); 00737 00738 /* for word-internal */ 00739 l = &(wchmm->lmcache); 00740 00741 l->probcache = (LOGPROB *) mymalloc(sizeof(LOGPROB) * wchmm->scnum); 00742 l->lastwcache = (WORD_ID *) mymalloc(sizeof(WORD_ID) * wchmm->scnum); 00743 for (i=0;i<wchmm->scnum;i++) { 00744 l->lastwcache[i] = WORD_INVALID; 00745 } 00746 /* for cross-word */ 00747 if (wchmm->ngram) { 00748 wnum = wchmm->ngram->max_word_num; 00749 } else { 00750 wnum = wchmm->winfo->num; 00751 } 00752 #ifdef HASH_CACHE_IW 00753 l->iw_cache_num = wnum * jconf.search.pass1.iw_cache_rate / 100; 00754 if (l->iw_cache_num < 10) l->iw_cache_num = 10; 00755 #else 00756 l->iw_cache_num = wnum; 00757 #endif /* HASH_CACHE_IW */ 00758 l->iw_sc_cache = (LOGPROB **)mymalloc(sizeof(LOGPROB *) * l->iw_cache_num); 00759 for (i=0;i<l->iw_cache_num;i++) { 00760 l->iw_sc_cache[i] = NULL; 00761 } 00762 #ifdef HASH_CACHE_IW 00763 l->iw_lw_cache = (WORD_ID *)mymalloc(sizeof(WORD_ID) * l->iw_cache_num); 00764 for (i=0;i<l->iw_cache_num;i++) { 00765 l->iw_lw_cache[i] = WORD_INVALID; 00766 } 00767 #endif 00768 } 00769 00782 static void 00783 max_successor_prob_iw_free(WCHMM_INFO *wchmm) 00784 { 00785 int i; 00786 LM_PROB_CACHE *l; 00787 l = &(wchmm->lmcache); 00788 for (i=0;i<l->iw_cache_num;i++) { 00789 if (l->iw_sc_cache[i] != NULL) free(l->iw_sc_cache[i]); 00790 l->iw_sc_cache[i] = NULL; 00791 } 00792 } 00793 00810 void 00811 max_successor_cache_free(WCHMM_INFO *wchmm) 00812 { 00813 free(wchmm->lmcache.probcache); 00814 free(wchmm->lmcache.lastwcache); 00815 max_successor_prob_iw_free(wchmm); 00816 free(wchmm->lmcache.iw_sc_cache); 00817 #ifdef HASH_CACHE_IW 00818 free(wchmm->lmcache.iw_lw_cache); 00819 #endif 00820 } 00821 00822 #ifdef UNIGRAM_FACTORING 00823 00864 void 00865 make_iwcache_index(WCHMM_INFO *wchmm) 00866 { 00867 int i, node, num; 00868 00869 wchmm->start2isolate = (int *)mymalloc(sizeof(int) * wchmm->startnum); 00870 num = 0; 00871 for(i=0;i<wchmm->startnum;i++) { 00872 node = wchmm->startnode[i]; 00873 if (wchmm->state[node].scid >= 0) { /* not a factoring node (isolated node, has no 1-gram factoring value) */ 00874 wchmm->start2isolate[i] = num; 00875 num++; 00876 } else { /* factoring node (shared) */ 00877 wchmm->start2isolate[i] = -1; 00878 } 00879 } 00880 wchmm->isolatenum = num; 00881 } 00882 00927 void 00928 calc_all_unigram_factoring_values(WCHMM_INFO *wchmm) 00929 { 00930 S_CELL *sc, *sctmp; 00931 LOGPROB tmpprob, maxprob; 00932 int i, n; 00933 00934 /* count needed number of 1-gram factoring nodes */ 00935 n = 0; 00936 for (i=1;i<wchmm->scnum;i++) { 00937 sc = wchmm->sclist[i]; 00938 if (sc == NULL) { 00939 j_internal_error("call_all_unigram_factoring_values: sclist has no sc?\n"); 00940 } 00941 if (sc->next != NULL) { 00942 /* more than two words, so compute maximum 1-gram probability */ 00943 n++; 00944 } 00945 } 00946 wchmm->fsnum = n + 1; 00947 /* allocate area */ 00948 wchmm->fscore = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->fsnum); 00949 /* assign values */ 00950 n = 1; 00951 for (i=1;i<wchmm->scnum;i++) { 00952 sc = wchmm->sclist[i]; 00953 if (sc->next != NULL) { 00954 maxprob = LOG_ZERO; 00955 for (sctmp = sc; sctmp; sctmp = sctmp->next) { 00956 if (wchmm->ngram) { 00957 tmpprob = uni_prob(wchmm->ngram, wchmm->winfo->wton[sctmp->word]) 00958 #ifdef CLASS_NGRAM 00959 + wchmm->winfo->cprob[sctmp->word] 00960 #endif 00961 ; 00962 } else { 00963 tmpprob = LOG_ZERO; 00964 } 00965 if (wchmm->lmvar == LM_NGRAM_USER) { 00966 tmpprob = (*(wchmm->uni_prob_user))(wchmm->winfo, sctmp->word, tmpprob); 00967 } 00968 if (maxprob < tmpprob) maxprob = tmpprob; 00969 } 00970 wchmm->fscore[n] = maxprob; 00971 free_successor(wchmm, i); 00972 wchmm->state[wchmm->sclist2node[i]].scid = - n; 00973 n++; 00974 } 00975 } 00976 /* garbage collection of factored sclist */ 00977 compaction_successor(wchmm); 00978 } 00979 00980 #else /* ~UNIGRAM_FACTORING */ 00981 01004 static LOGPROB 01005 calc_successor_prob(WCHMM_INFO *wchmm, WORD_ID lastword, int node) 01006 { 01007 S_CELL *sc; 01008 LOGPROB tmpprob, maxprob; 01009 WORD_ID lw; 01010 01011 maxprob = LOG_ZERO; 01012 if (wchmm->ngram) { 01013 lw = wchmm->winfo->wton[lastword]; 01014 } 01015 01016 for (sc = wchmm->sclist[wchmm->state[node].scid]; sc; sc = sc->next) { 01017 if (wchmm->ngram) { 01018 tmpprob = (*(wchmm->ngram->bigram_prob))(wchmm->ngram, lw , wchmm->winfo->wton[sc->word]) 01019 #ifdef CLASS_NGRAM 01020 + wchmm->winfo->cprob[sc->word] 01021 #endif 01022 ; 01023 } else { 01024 tmpprob = LOG_ZERO; 01025 } 01026 if (wchmm->lmvar == LM_NGRAM_USER) { 01027 tmpprob = (*(wchmm->bi_prob_user))(wchmm->winfo, lastword, sc->word, tmpprob); 01028 } 01029 if (maxprob < tmpprob) maxprob = tmpprob; 01030 } 01031 01032 return(maxprob); 01033 } 01034 01035 #endif /* ~UNIGRAM_FACTORING */ 01036 01079 LOGPROB 01080 max_successor_prob(WCHMM_INFO *wchmm, WORD_ID lastword, int node) 01081 { 01082 LOGPROB maxprob; 01083 WORD_ID last_nword, w; 01084 int scid; 01085 LM_PROB_CACHE *l; 01086 01087 l = &(wchmm->lmcache); 01088 01089 if (lastword != WORD_INVALID) { /* return nothing if no previous word */ 01090 if (wchmm->ngram) { 01091 last_nword = wchmm->winfo->wton[lastword]; 01092 } else { 01093 last_nword = lastword; 01094 } 01095 scid = wchmm->state[node].scid; 01096 #ifdef UNIGRAM_FACTORING 01097 if (scid < 0) { 01098 /* return 1-gram factoring value already calced */ 01099 return(wchmm->fscore[(- scid)]); 01100 } else { 01101 /* this node has only one successor */ 01102 /* return precise 2-gram score */ 01103 if (last_nword != l->lastwcache[scid]) { 01104 /* calc and cache */ 01105 w = (wchmm->sclist[scid])->word; 01106 if (wchmm->ngram) { 01107 maxprob = (*(wchmm->ngram->bigram_prob))(wchmm->ngram, last_nword, wchmm->winfo->wton[w]) 01108 #ifdef CLASS_NGRAM 01109 + wchmm->winfo->cprob[w] 01110 #endif 01111 ; 01112 } else { 01113 maxprob = LOG_ZERO; 01114 } 01115 if (wchmm->lmvar == LM_NGRAM_USER) { 01116 maxprob = (*(wchmm->bi_prob_user))(wchmm->winfo, lastword, w, maxprob); 01117 } 01118 l->lastwcache[scid] = last_nword; 01119 l->probcache[scid] = maxprob; 01120 return(maxprob); 01121 } else { 01122 /* return cached */ 01123 return (l->probcache[scid]); 01124 } 01125 } 01126 #else /* UNIGRAM_FACTORING */ 01127 /* 2-gram */ 01128 if (last_nword != l->lastwcache[scid]) { 01129 maxprob = calc_successor_prob(wchmm, lastword, node); 01130 /* store to cache */ 01131 l->lastwcache[scid] = last_nword; 01132 l->probcache[scid] = maxprob; 01133 return(maxprob); 01134 } else { 01135 return (l->probcache[scid]); 01136 } 01137 #endif /* UNIGRAM_FACTORING */ 01138 } else { 01139 return(0.0); 01140 #if 0 01141 maxprob = LOG_ZERO; 01142 for (sc=wchmm->state[node].sc;sc;sc=sc->next) { 01143 tmpprob = uni_prob(wchmm->ngram, sc->word); 01144 if (maxprob < tmpprob) maxprob = tmpprob; 01145 } 01146 return(maxprob); 01147 #endif 01148 } 01149 01150 } 01151 01186 LOGPROB * 01187 max_successor_prob_iw(WCHMM_INFO *wchmm, WORD_ID lastword) 01188 { 01189 int i, j, x, node; 01190 int last_nword; 01191 WORD_ID w; 01192 LM_PROB_CACHE *l; 01193 LOGPROB p; 01194 01195 l = &(wchmm->lmcache); 01196 01197 if (wchmm->ngram) { 01198 last_nword = wchmm->winfo->wton[lastword]; 01199 } else { 01200 last_nword = lastword; 01201 } 01202 01203 #ifdef HASH_CACHE_IW 01204 x = last_nword % l->iw_cache_num; 01205 if (l->iw_lw_cache[x] == last_nword) { /* cache hit */ 01206 return(l->iw_sc_cache[x]); 01207 } 01208 #else /* full cache */ 01209 if (l->iw_sc_cache[last_nword] != NULL) { /* cache hit */ 01210 return(l->iw_sc_cache[last_nword]); 01211 } 01212 x = last_nword; 01213 /* cache mis-hit, calc probs and cache them as new */ 01214 #endif 01215 /* allocate cache memory */ 01216 if (l->iw_sc_cache[x] == NULL) { 01217 #ifdef UNIGRAM_FACTORING 01218 l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->isolatenum); 01219 #else 01220 l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->startnum); 01221 #endif 01222 if (l->iw_sc_cache[x] == NULL) { /* malloc failed */ 01223 /* clear existing cache, and retry */ 01224 max_successor_prob_iw_free(wchmm); 01225 jlog("STAT: inter-word LM cache (%dMB) rehashed\n", 01226 (l->iw_cache_num * 01227 #ifdef UNIGRAM_FACTORING 01228 wchmm->isolatenum 01229 #else 01230 wchmm->startnum 01231 #endif 01232 ) / 1000 * sizeof(LOGPROB) / 1000); 01233 #ifdef UNIGRAM_FACTORING 01234 l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->isolatenum); 01235 #else 01236 l->iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->startnum); 01237 #endif 01238 if (l->iw_sc_cache[x] == NULL) { /* malloc failed again? */ 01239 j_internal_error("max_successor_prob_iw: cannot malloc\n"); 01240 } 01241 } 01242 } 01243 01244 /* calc prob for all startid */ 01245 #ifdef UNIGRAM_FACTORING 01246 for (j=0;j<wchmm->startnum;j++) { 01247 i = wchmm->start2isolate[j]; 01248 if (i == -1) continue; 01249 node = wchmm->startnode[j]; 01250 if (wchmm->state[node].scid <= 0) { 01251 /* should not happen!!! below is just for debugging */ 01252 j_internal_error("max_successor_prob_iw: isolated (not shared) tree root node has unigram factoring value??\n"); 01253 } else { 01254 w = (wchmm->sclist[wchmm->state[node].scid])->word; 01255 if (wchmm->ngram) { 01256 p = (*(wchmm->ngram->bigram_prob))(wchmm->ngram, last_nword, wchmm->winfo->wton[w]) 01257 #ifdef CLASS_NGRAM 01258 + wchmm->winfo->cprob[w] 01259 #endif 01260 ; 01261 } else { 01262 p = LOG_ZERO; 01263 } 01264 if (wchmm->lmvar == LM_NGRAM_USER) { 01265 p = (*(wchmm->bi_prob_user))(wchmm->winfo, lastword, w, p); 01266 } 01267 l->iw_sc_cache[x][i] = p; 01268 } 01269 } 01270 #else /* ~UNIGRAM_FACTORING */ 01271 for (i=0;i<wchmm->startnum;i++) { 01272 node = wchmm->startnode[i]; 01273 l->iw_sc_cache[x][i] = calc_successor_prob(wchmm, lastword, node); 01274 } 01275 #endif 01276 #ifdef HASH_CACHE_IW 01277 l->iw_lw_cache[x] = last_nword; 01278 #endif 01279 01280 return(l->iw_sc_cache[x]); 01281 } 01282 01332 boolean 01333 can_succeed(WCHMM_INFO *wchmm, WORD_ID lastword, int node) 01334 { 01335 int lc; 01336 S_CELL *sc; 01337 01338 /* return TRUE if at least one subtree word can connect */ 01339 01340 if (lastword == WORD_INVALID) { /* case at beginning-of-word */ 01341 for (sc=wchmm->sclist[wchmm->state[node].scid];sc;sc=sc->next) { 01342 if (dfa_cp_begin(wchmm->dfa, sc->word) == TRUE) return(TRUE); 01343 } 01344 return(FALSE); 01345 } else { 01346 lc = wchmm->winfo->wton[lastword]; 01347 for (sc=wchmm->sclist[wchmm->state[node].scid];sc;sc=sc->next) { 01348 if (dfa_cp(wchmm->dfa, lc, sc->word) == TRUE) return(TRUE); 01349 } 01350 return(FALSE); 01351 } 01352 } 01353 01354 /* end of file */