00001
00150
00151
00152
00153
00154
00155
00156
00157 #include <julius.h>
00158
00159 #ifndef CATEGORY_TREE
00160
00161
00162
00183 static void
00184 add_successor(WCHMM_INFO *wchmm, int node, WORD_ID w)
00185 {
00186 S_CELL *sctmp, *sc;
00187
00188
00189 sctmp=(S_CELL *) mymalloc(sizeof(S_CELL));
00190 if (sctmp == NULL) {
00191 j_error("malloc fault at add_succesor(%d,%d)\n",node,w);
00192 }
00193
00194 sctmp->word = w;
00195
00196 if (wchmm->state[node].scid == 0) {
00197 j_error("InternalError: sclist id not assigned to branch node?\n");
00198 }
00199 sc = wchmm->sclist[wchmm->state[node].scid];
00200 if (sc == NULL || sctmp->word < sc->word) {
00201 sctmp->next = sc;
00202 wchmm->sclist[wchmm->state[node].scid] = sctmp;
00203 } else {
00204 for(;sc;sc=sc->next) {
00205 if (sc->next == NULL || sctmp->word < (sc->next)->word) {
00206 if (sctmp->word == sc->word) break;
00207 sctmp->next = sc->next;
00208 sc->next = sctmp;
00209 break;
00210 }
00211 }
00212 }
00213 }
00214
00235 static boolean
00236 match_successor(WCHMM_INFO *wchmm, int node1, int node2)
00237 {
00238 S_CELL *sc1,*sc2;
00239
00240
00241 if (wchmm->state[node1].scid == 0 || wchmm->state[node2].scid == 0) {
00242 j_error("InternalError: sclist id not assigned to branch node?\n");
00243 }
00244 sc1 = wchmm->sclist[wchmm->state[node1].scid];
00245 sc2 = wchmm->sclist[wchmm->state[node2].scid];
00246 for (;;) {
00247 if (sc1 == NULL || sc2 == NULL) {
00248 if (sc1 == NULL && sc2 == NULL) {
00249 return TRUE;
00250 } else {
00251 return FALSE;
00252 }
00253 } else if (sc1->word != sc2->word) {
00254 return FALSE;
00255 }
00256 sc1 = sc1->next;
00257 sc2 = sc2->next;
00258 }
00259 }
00260
00275 static void
00276 free_successor(WCHMM_INFO *wchmm, int scid)
00277 {
00278 S_CELL *sc;
00279 S_CELL *sctmp;
00280
00281
00282 sc = wchmm->sclist[scid];
00283 while (sc != NULL) {
00284 sctmp = sc;
00285 sc = sc->next;
00286 free(sctmp);
00287 }
00288 }
00289
00304 static void
00305 compaction_successor(WCHMM_INFO *wchmm)
00306 {
00307 int src, dst;
00308
00309 dst = 1;
00310 for(src=1;src<wchmm->scnum;src++) {
00311 if (wchmm->state[wchmm->sclist2node[src]].scid <= 0) {
00312
00313 continue;
00314 }
00315 if (dst != src) {
00316 wchmm->sclist[dst] = wchmm->sclist[src];
00317 wchmm->sclist2node[dst] = wchmm->sclist2node[src];
00318 wchmm->state[wchmm->sclist2node[dst]].scid = dst;
00319 }
00320 dst++;
00321 }
00322 if (debug2_flag) j_printerr(" successor list shrinked from %d to %d...\n", wchmm->scnum, dst);
00323 wchmm->scnum = dst;
00324 }
00325
00340 static void
00341 shrink_successor(WCHMM_INFO *wchmm)
00342 {
00343 wchmm->sclist = (S_CELL **)myrealloc(wchmm->sclist, sizeof(S_CELL *) * wchmm->scnum);
00344 wchmm->sclist2node = (int *)myrealloc(wchmm->sclist2node, sizeof(int) * wchmm->scnum);
00345 }
00346
00359 void
00360 make_successor_list(WCHMM_INFO *wchmm)
00361 {
00362 int node;
00363 WORD_ID w;
00364 int i;
00365 boolean *freemark;
00366 int s;
00367
00368 VERMES(" make successor lists for factoring...\n");
00369
00370
00371
00372 for (node=0;node<wchmm->n;node++) wchmm->state[node].scid = 0;
00373
00374 s = 1;
00375 for (w=0;w<wchmm->winfo->num;w++) {
00376 for (i=0;i<wchmm->winfo->wlen[w];i++) {
00377 if (wchmm->state[wchmm->offset[w][i]].scid == 0) {
00378 wchmm->state[wchmm->offset[w][i]].scid = s;
00379 s++;
00380 }
00381 }
00382 if (wchmm->state[wchmm->wordend[w]].scid == 0) {
00383 wchmm->state[wchmm->wordend[w]].scid = s;
00384 s++;
00385 }
00386 }
00387 wchmm->scnum = s;
00388 if (debug2_flag) j_printerr(" initial successor list size = %d...\n", wchmm->scnum);
00389
00390 wchmm->sclist = (S_CELL **)mymalloc(sizeof(S_CELL *) * wchmm->scnum);
00391 for (i=1;i<wchmm->scnum;i++) wchmm->sclist[i] = NULL;
00392 wchmm->sclist2node = (int *)mymalloc(sizeof(int) * wchmm->scnum);
00393
00394 freemark = (boolean *)mymalloc(sizeof(boolean) * wchmm->scnum);
00395 for (i=1;i<wchmm->scnum;i++) freemark[i] = FALSE;
00396
00397
00398 for (w=0;w<wchmm->winfo->num;w++) {
00399
00400 for (i=0;i<wchmm->winfo->wlen[w];i++) {
00401 wchmm->sclist2node[wchmm->state[wchmm->offset[w][i]].scid] = wchmm->offset[w][i];
00402 add_successor(wchmm, wchmm->offset[w][i], w);
00403 }
00404
00405 wchmm->sclist2node[wchmm->state[wchmm->wordend[w]].scid] = wchmm->wordend[w];
00406 add_successor(wchmm, wchmm->wordend[w], w);
00407 }
00408
00409
00410
00411
00412 for (w=0;w<wchmm->winfo->num;w++) {
00413 node = wchmm->wordend[w];
00414 i = wchmm->winfo->wlen[w]-1;
00415 while (i >= 0) {
00416 if (node == wchmm->offset[w][i]) {
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426 i--;
00427 continue;
00428 }
00429 if (match_successor(wchmm, node, wchmm->offset[w][i])) {
00430 freemark[wchmm->state[node].scid] = TRUE;
00431 }
00432
00433
00434
00435
00436
00437 node = wchmm->offset[w][i];
00438 i--;
00439 }
00440 }
00441
00442 for (i=1;i<wchmm->scnum;i++) {
00443 if (freemark[i] == TRUE) {
00444 free_successor(wchmm, i);
00445
00446 wchmm->state[wchmm->sclist2node[i]].scid = 0;
00447 }
00448 }
00449
00450 compaction_successor(wchmm);
00451
00452 free(freemark);
00453
00454 VERMES("done\n");
00455 }
00456
00457 #ifndef CATEGORY_TREE
00458 # ifdef MULTIPATH_VERSION
00459
00473 void
00474 adjust_sc_index(WCHMM_INFO *wchmm)
00475 {
00476 WORD_ID w;
00477 int i,j,k;
00478 HMM_Logical *ltmp;
00479 int ltmp_state_num;
00480 int ato;
00481 LOGPROB prob;
00482 int node, scid;
00483 A_CELL *ac;
00484
00485
00486 for(w=0;w<wchmm->winfo->num;w++) {
00487 for(k=0;k<wchmm->winfo->wlen[w];k++) {
00488 node = wchmm->offset[w][k];
00489 scid = wchmm->state[node].scid;
00490 if (scid == 0) continue;
00491 ltmp = wchmm->winfo->wseq[w][k];
00492 ltmp_state_num = hmm_logical_state_num(ltmp);
00493 if ((hmm_logical_trans(ltmp))->a[0][ltmp_state_num-1] != LOG_ZERO) {
00494 j = k + 1;
00495 if (j == wchmm->winfo->wlen[w]) {
00496 if (wchmm->state[wchmm->wordend[w]].scid == 0) {
00497 printf("word %d: factoring node copied for skip phone\n", w);
00498 wchmm->state[wchmm->wordend[w]].scid = scid;
00499 }
00500 } else {
00501 if (wchmm->state[wchmm->offset[w][j]].scid == 0) {
00502 printf("word %d: factoring node copied for skip phone\n", w);
00503 wchmm->state[wchmm->offset[w][j]].scid = scid;
00504 }
00505 }
00506 }
00507 for(ato=1;ato<ltmp_state_num;ato++) {
00508 prob = (hmm_logical_trans(ltmp))->a[0][ato];
00509 if (prob != LOG_ZERO) {
00510 wchmm->state[node+ato-1].scid = scid;
00511 }
00512 }
00513 }
00514 }
00515
00516
00517 for(i=0;i<wchmm->startnum;i++) {
00518 node = wchmm->startnode[i];
00519 if (wchmm->state[node].out.state != NULL) {
00520 j_error("Error: outprob exist in word-head node??\n");
00521 }
00522 for(ac=wchmm->state[node].ac;ac;ac=ac->next) {
00523 if (wchmm->state[ac->arc].scid != 0) {
00524 if (wchmm->state[node].scid != 0 && wchmm->state[node].scid != wchmm->state[ac->arc].scid) {
00525 j_error("Error: different successor list within word-head phone?\n");
00526 }
00527 wchmm->state[node].scid = wchmm->state[ac->arc].scid;
00528 wchmm->state[ac->arc].scid = 0;
00529 }
00530 }
00531 }
00532 }
00533
00534 # endif
00535 #endif
00536
00537
00538
00539
00540
00541 #ifdef USE_NGRAM
00542
00543
00544
00545
00546
00547
00548
00549
00550 static LOGPROB *probcache;
00551 static WORD_ID *lastwcache;
00552
00553
00554
00555
00556
00557
00575 static LOGPROB **iw_sc_cache;
00580 static int iw_cache_num;
00581 #ifdef HASH_CACHE_IW
00582 static WORD_ID *iw_lw_cache;
00583 #endif
00584
00585
00586
00601 void
00602 max_successor_cache_init(WCHMM_INFO *wchmm)
00603 {
00604 int i;
00605
00606
00607 shrink_successor(wchmm);
00608
00609
00610 probcache = (LOGPROB *) mymalloc(sizeof(LOGPROB) * wchmm->scnum);
00611 lastwcache = (WORD_ID *) mymalloc(sizeof(WORD_ID) * wchmm->scnum);
00612 for (i=0;i<wchmm->scnum;i++) {
00613 lastwcache[i] = WORD_INVALID;
00614 }
00615
00616 #ifdef HASH_CACHE_IW
00617 iw_cache_num = wchmm->ngram->max_word_num * iw_cache_rate / 100;
00618 if (iw_cache_num < 10) iw_cache_num = 10;
00619 #else
00620 iw_cache_num = wchmm->ngram->max_word_num;
00621 #endif
00622 iw_sc_cache = (LOGPROB **)mymalloc(sizeof(LOGPROB *) * iw_cache_num);
00623 for (i=0;i<iw_cache_num;i++) {
00624 iw_sc_cache[i] = NULL;
00625 }
00626 #ifdef HASH_CACHE_IW
00627 iw_lw_cache = (WORD_ID *)mymalloc(sizeof(WORD_ID) * iw_cache_num);
00628 for (i=0;i<iw_cache_num;i++) {
00629 iw_lw_cache[i] = WORD_INVALID;
00630 }
00631 #endif
00632 }
00633
00644 static void
00645 max_successor_prob_iw_free()
00646 {
00647 int i;
00648 for (i=0;i<iw_cache_num;i++) {
00649 if (iw_sc_cache[i] != NULL) free(iw_sc_cache[i]);
00650 iw_sc_cache[i] = NULL;
00651 }
00652 }
00653
00664 void
00665 max_successor_cache_free()
00666 {
00667 free(probcache);
00668 free(lastwcache);
00669 max_successor_prob_iw_free();
00670 free(iw_sc_cache);
00671 #ifdef HASH_CACHE_IW
00672 free(iw_lw_cache);
00673 #endif
00674 }
00675
00676 #ifdef UNIGRAM_FACTORING
00677
00714 void
00715 make_iwcache_index(WCHMM_INFO *wchmm)
00716 {
00717 int i, node, num;
00718
00719 wchmm->start2isolate = (int *)mymalloc(sizeof(int) * wchmm->startnum);
00720 num = 0;
00721 for(i=0;i<wchmm->startnum;i++) {
00722 node = wchmm->startnode[i];
00723 if (wchmm->state[node].scid >= 0) {
00724 wchmm->start2isolate[i] = num;
00725 num++;
00726 } else {
00727 wchmm->start2isolate[i] = -1;
00728 }
00729 }
00730 wchmm->isolatenum = num;
00731 }
00732
00772 void
00773 calc_all_unigram_factoring_values(WCHMM_INFO *wchmm)
00774 {
00775 S_CELL *sc, *sctmp;
00776 LOGPROB tmpprob, maxprob;
00777 int i, n;
00778
00779
00780 n = 0;
00781 for (i=1;i<wchmm->scnum;i++) {
00782 sc = wchmm->sclist[i];
00783 if (sc == NULL) {
00784 j_error("InternalError: sclist has no sc?\n");
00785 }
00786 if (sc->next != NULL) {
00787
00788 n++;
00789 }
00790 }
00791 wchmm->fsnum = n + 1;
00792
00793 wchmm->fscore = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->fsnum);
00794
00795 n = 1;
00796 for (i=1;i<wchmm->scnum;i++) {
00797 sc = wchmm->sclist[i];
00798 if (sc->next != NULL) {
00799 maxprob = LOG_ZERO;
00800 for (sctmp = sc; sctmp; sctmp = sctmp->next) {
00801 tmpprob = uni_prob(wchmm->ngram, wchmm->winfo->wton[sctmp->word])
00802 #ifdef CLASS_NGRAM
00803 + wchmm->winfo->cprob[sctmp->word]
00804 #endif
00805 ;
00806 if (maxprob < tmpprob) maxprob = tmpprob;
00807 }
00808 wchmm->fscore[n] = maxprob;
00809 free_successor(wchmm, i);
00810 wchmm->state[wchmm->sclist2node[i]].scid = - n;
00811 n++;
00812 }
00813 }
00814
00815 compaction_successor(wchmm);
00816 }
00817
00818 #else
00819
00841 static LOGPROB
00842 calc_successor_prob(WCHMM_INFO *wchmm, WORD_ID last_nword, int node)
00843 {
00844 S_CELL *sc;
00845 LOGPROB tmpprob, maxprob;
00846
00847 maxprob = LOG_ZERO;
00848 for (sc = wchmm->sclist[wchmm->state[node].scid]; sc; sc = sc->next) {
00849 tmpprob = bi_prob_lr(wchmm->ngram, last_nword, wchmm->winfo->wton[sc->word])
00850 #ifdef CLASS_NGRAM
00851 + wchmm->winfo->cprob[sc->word]
00852 #endif
00853 ;
00854 if (maxprob < tmpprob) maxprob = tmpprob;
00855 }
00856 return(maxprob);
00857 }
00858
00859 #endif
00860
00899 LOGPROB
00900 max_successor_prob(WCHMM_INFO *wchmm, WORD_ID lastword, int node)
00901 {
00902 LOGPROB maxprob;
00903 WORD_ID last_nword, w;
00904 int scid;
00905
00906 if (lastword != WORD_INVALID) {
00907 last_nword = wchmm->winfo->wton[lastword];
00908 scid = wchmm->state[node].scid;
00909 #ifdef UNIGRAM_FACTORING
00910 if (scid < 0) {
00911
00912 return(wchmm->fscore[(- scid)]);
00913 } else {
00914
00915
00916 if (last_nword != lastwcache[scid]) {
00917
00918 w = (wchmm->sclist[scid])->word;
00919 maxprob = bi_prob_lr(wchmm->ngram, last_nword, wchmm->winfo->wton[w])
00920 #ifdef CLASS_NGRAM
00921 + wchmm->winfo->cprob[w]
00922 #endif
00923 ;
00924 lastwcache[scid] = last_nword;
00925 probcache[scid] = maxprob;
00926 return(maxprob);
00927 } else {
00928
00929 return (probcache[scid]);
00930 }
00931 }
00932 #else
00933
00934 if (last_nword != lastwcache[scid]) {
00935 maxprob = calc_successor_prob(wchmm, last_nword, node);
00936
00937 lastwcache[scid] = last_nword;
00938 probcache[scid] = maxprob;
00939 return(maxprob);
00940 } else {
00941 return (probcache[scid]);
00942 }
00943 #endif
00944 } else {
00945 return(0.0);
00946 #if 0
00947 maxprob = LOG_ZERO;
00948 for (sc=wchmm->state[node].sc;sc;sc=sc->next) {
00949 tmpprob = uni_prob(wchmm->ngram, sc->word);
00950 if (maxprob < tmpprob) maxprob = tmpprob;
00951 }
00952 return(maxprob);
00953 #endif
00954 }
00955
00956 }
00957
00958 #ifdef HASH_CACHE_IW
00959 #define hashid(A) A % iw_cache_limit
00960 #endif
00961
00991 LOGPROB *
00992 max_successor_prob_iw(WCHMM_INFO *wchmm, WORD_ID lastword)
00993 {
00994 int i, j, x, node;
00995 int last_nword;
00996 WORD_ID w;
00997
00998 last_nword = wchmm->winfo->wton[lastword];
00999 #ifdef HASH_CACHE_IW
01000 x = hashid(last_nword);
01001 if (iw_lw_cache[x] == last_nword) {
01002 return(iw_sc_cache[x]);
01003 }
01004 #else
01005 if (iw_sc_cache[last_nword] != NULL) {
01006 return(iw_sc_cache[last_nword]);
01007 }
01008 x = last_nword;
01009
01010 #endif
01011
01012 if (iw_sc_cache[x] == NULL) {
01013 #ifdef UNIGRAM_FACTORING
01014 iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->isolatenum);
01015 #else
01016 iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->startnum);
01017 #endif
01018 if (iw_sc_cache[x] == NULL) {
01019
01020 max_successor_prob_iw_free();
01021 j_printf("inter-word LM cache (%dMB) rehashed\n",
01022 (iw_cache_num *
01023 #ifdef UNIGRAM_FACTORING
01024 wchmm->isolatenum
01025 #else
01026 wchmm->startnum
01027 #endif
01028 ) / 1000 * sizeof(LOGPROB) / 1000);
01029 #ifdef UNIGRAM_FACTORING
01030 iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->isolatenum);
01031 #else
01032 iw_sc_cache[x] = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->startnum);
01033 #endif
01034 if (iw_sc_cache[x] == NULL) {
01035 j_error("max_successor_prob_iw: cannot malloc\n");
01036 }
01037 }
01038 }
01039
01040
01041 #ifdef UNIGRAM_FACTORING
01042 for (j=0;j<wchmm->startnum;j++) {
01043 i = wchmm->start2isolate[j];
01044 if (i == -1) continue;
01045 node = wchmm->startnode[j];
01046 if (wchmm->state[node].scid <= 0) {
01047
01048 j_error("No!!\n");
01049 } else {
01050 w = (wchmm->sclist[wchmm->state[node].scid])->word;
01051 iw_sc_cache[x][i] = bi_prob_lr(ngram, last_nword, wchmm->winfo->wton[w])
01052 #ifdef CLASS_NGRAM
01053 + wchmm->winfo->cprob[w]
01054 #endif
01055 ;
01056 }
01057 }
01058 #else
01059 for (i=0;i<wchmm->startnum;i++) {
01060 node = wchmm->startnode[i];
01061 iw_sc_cache[x][i] = calc_successor_prob(wchmm, last_nword, node);
01062 }
01063 #endif
01064 #ifdef HASH_CACHE_IW
01065 iw_lw_cache[x] = last_nword;
01066 #endif
01067
01068 return(iw_sc_cache[x]);
01069 }
01070
01071
01072 #else
01073
01119 boolean
01120 can_succeed(WCHMM_INFO *wchmm, WORD_ID lastword, int node)
01121 {
01122 int lc;
01123 S_CELL *sc;
01124
01125
01126
01127 if (lastword == WORD_INVALID) {
01128 for (sc=wchmm->sclist[wchmm->state[node].scid];sc;sc=sc->next) {
01129 if (dfa_cp_begin(wchmm->dfa, sc->word) == TRUE) return(TRUE);
01130 }
01131 return(FALSE);
01132 } else {
01133 lc = winfo->wton[lastword];
01134 for (sc=wchmm->sclist[wchmm->state[node].scid];sc;sc=sc->next) {
01135 if (dfa_cp(wchmm->dfa, lc, sc->word) == TRUE) return(TRUE);
01136 }
01137 return(FALSE);
01138 }
01139 }
01140
01141 #endif
01142
01143
01144 #endif