00001
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045 #include <julius.h>
00046
00047
00048 static int dupcount = 0;
00049
00050 #undef WCHMM_SIZE_CHECK
00051
00052
00053
00054
00055
00056
00069 WCHMM_INFO *
00070 wchmm_new()
00071 {
00072 WCHMM_INFO *w;
00073 w = (WCHMM_INFO *)mymalloc(sizeof(WCHMM_INFO));
00074 #ifdef USE_NGRAM
00075 w->ngram = NULL;
00076 #endif
00077 #ifdef USE_DFA
00078 w->dfa = NULL;
00079 #endif
00080 w->winfo = NULL;
00081 w->malloc_root = NULL;
00082 #ifdef PASS1_IWCD
00083 #ifdef CATEGORY_TREE
00084 w->lcdset_category_root = NULL;
00085 #endif
00086 #endif
00087 return w;
00088 }
00089
00102 static void
00103 wchmm_init(WCHMM_INFO *wchmm)
00104 {
00105 wchmm->maxwcn = MAXWCNSTEP;
00106 wchmm->state = (WCHMM_STATE *)mymalloc(sizeof(WCHMM_STATE)*wchmm->maxwcn);
00107 #ifdef MULTIPATH_VERSION
00108 wchmm->wordbegin = (int *)mymalloc(sizeof(int)*wchmm->winfo->num);
00109 #else
00110 wchmm->ststart = (WORD_ID *)mymalloc(sizeof(WORD_ID)*wchmm->maxwcn);
00111 #endif
00112 wchmm->stend = (WORD_ID *)mymalloc(sizeof(WORD_ID)*wchmm->maxwcn);
00113 wchmm->offset = (int **)mymalloc(sizeof(int *)*wchmm->winfo->num);
00114 wchmm->wordend = (int *)mymalloc(sizeof(int)*wchmm->winfo->num);
00115 #ifdef MULTIPATH_VERSION
00116 wchmm->maxstartnum = STARTNODE_STEP;
00117 wchmm->startnode = (int *)mymalloc(sizeof(int)*STARTNODE_STEP);
00118 wchmm->startnum = 0;
00119 # ifdef CATEGORY_TREE
00120 wchmm->start2wid = (WORD_ID *)mymalloc(sizeof(WORD_ID)*STARTNODE_STEP);
00121 # endif
00122 #else
00123 wchmm->startnode = (int *)mymalloc(sizeof(int)*wchmm->winfo->num);
00124 wchmm->wordend_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->winfo->num);
00125 #endif
00126 #ifdef PASS1_IWCD
00127 wchmm->outstyle = (unsigned char *)mymalloc(sizeof(unsigned char)*wchmm->maxwcn);
00128 #endif
00129 #ifdef UNIGRAM_FACTORING
00130 wchmm->start2isolate = NULL;
00131 wchmm->isolatenum = 0;
00132 #endif
00133 #ifndef CATEGORY_TREE
00134 wchmm->sclist = NULL;
00135 wchmm->sclist2node = NULL;
00136 #ifdef UNIGRAM_FACTORING
00137 wchmm->fscore = NULL;
00138 #endif
00139 #endif
00140 wchmm->n = 0;
00141 }
00142
00155 static void
00156 wchmm_extend(WCHMM_INFO *wchmm)
00157 {
00158 wchmm->maxwcn += MAXWCNSTEP;
00159 wchmm->state = (WCHMM_STATE *)myrealloc(wchmm->state, sizeof(WCHMM_STATE)*wchmm->maxwcn);
00160 #ifndef MULTIPATH_VERSION
00161 wchmm->ststart = (WORD_ID *)myrealloc(wchmm->ststart, sizeof(WORD_ID)*wchmm->maxwcn);
00162 #endif
00163 wchmm->stend = (WORD_ID *)myrealloc(wchmm->stend, sizeof(WORD_ID)*wchmm->maxwcn);
00164 #ifdef PASS1_IWCD
00165 wchmm->outstyle = (unsigned char *)myrealloc(wchmm->outstyle, sizeof(unsigned char)*wchmm->maxwcn);
00166 #endif
00167 }
00168
00169 #ifdef MULTIPATH_VERSION
00170
00182 static void
00183 wchmm_extend_startnode(WCHMM_INFO *wchmm)
00184 {
00185 wchmm->maxstartnum += STARTNODE_STEP;
00186 wchmm->startnode = (int *)myrealloc(wchmm->startnode, sizeof(int) * wchmm->maxstartnum);
00187 #ifdef CATEGORY_TREE
00188 wchmm->start2wid = (WORD_ID *)myrealloc(wchmm->startnode, sizeof(WORD_ID) * wchmm->maxstartnum);
00189 #endif
00190 }
00191 #endif
00192
00205 void
00206 wchmm_free(WCHMM_INFO *w)
00207 {
00208 S_CELL *sc, *sctmp;
00209 int i;
00210
00211
00212 #ifdef PASS1_IWCD
00213
00214 #endif
00215
00216 mybfree2(&(w->malloc_root));
00217 #ifndef CATEGORY_TREE
00218 if (w->sclist != NULL) {
00219 for(i=1;i<w->scnum;i++) {
00220 sc = w->sclist[i];
00221 while(sc) {
00222 sctmp = sc->next;
00223 free(sc);
00224 sc = sctmp;
00225 }
00226 }
00227 free(w->sclist);
00228 }
00229 if (w->sclist2node != NULL) free(w->sclist2node);
00230 #ifdef UNIGRAM_FACTORING
00231 if (w->fscore != NULL) free(w->fscore);
00232 #endif
00233 #endif
00234 #ifdef UNIGRAM_FACTORING
00235 if (w->start2isolate != NULL) free(w->start2isolate);
00236 #endif
00237 #ifdef PASS1_IWCD
00238 free(w->outstyle);
00239 #endif
00240 #ifndef MULTIPATH_VERSION
00241 free(w->wordend_a);
00242 #endif
00243 free(w->startnode);
00244 #ifdef MULTIPATH_VERSION
00245 # ifdef CATEGORY_TREE
00246 free(w->start2wid);
00247 # endif
00248 #endif
00249 free(w->wordend);
00250 free(w->offset);
00251 free(w->stend);
00252 #ifdef MULTIPATH_VERSION
00253 free(w->wordbegin);
00254 #else
00255 free(w->ststart);
00256 #endif
00257 free(w->state);
00258 #ifdef PASS1_IWCD
00259 #ifdef CATEGORY_TREE
00260 lcdset_remove_with_category_all(w);
00261 #endif
00262 #endif
00263 free(w);
00264 }
00265
00266
00267
00268
00269
00270
00271 static WORD_INFO *local_winfo;
00272
00291 static int
00292 compare_wseq(WORD_ID *widx1, WORD_ID *widx2)
00293 {
00294 int len1, len2, n;
00295 int p=0;
00296
00297 len1 = local_winfo->wlen[*widx1];
00298 len2 = local_winfo->wlen[*widx2];
00299
00300 n=0;
00301
00302 while (n < len1 && n < len2 && (p = strcmp((local_winfo->wseq[*widx1][n])->name, (local_winfo->wseq[*widx2][n])->name)) == 0 ) n++;
00303 if (n < len1) {
00304 if (n < len2) {
00305
00306 return(p);
00307 } else {
00308
00309 return(1);
00310 }
00311 } else {
00312 if (n < len2) {
00313
00314 return(-1);
00315 } else {
00316
00317 return(0);
00318 }
00319 }
00320 }
00321
00340 static void
00341 wchmm_sort_idx_by_wseq(WORD_INFO *winfo, WORD_ID *windex, WORD_ID bgn, WORD_ID len)
00342 {
00343 local_winfo = winfo;
00344 qsort(&(windex[bgn]), len, sizeof(WORD_ID), (int (*)(const void *, const void *))compare_wseq);
00345 }
00346
00347 #ifdef CATEGORY_TREE
00348
00366 static int
00367 compare_category(WORD_ID *widx1, WORD_ID *widx2)
00368 {
00369 int c1,c2;
00370 c1 = local_winfo->wton[*widx1];
00371 c2 = local_winfo->wton[*widx2];
00372 return(c1 - c2);
00373 }
00374
00391 static void
00392 wchmm_sort_idx_by_category(WORD_INFO *winfo, WORD_ID *windex, WORD_ID len)
00393 {
00394 local_winfo = winfo;
00395 qsort(windex, len, sizeof(WORD_ID), (int (*)(const void *, const void *))compare_category);
00396 }
00397 #endif
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00429 static int
00430 wchmm_check_match(WORD_INFO *winfo, int i, int j)
00431 {
00432 int k,tmplen;
00433
00434 for (tmplen=0,k=0;k<winfo->wlen[i];k++) {
00435 if (k > winfo->wlen[j]-1)
00436 break;
00437 if (! (strmatch(winfo->wseq[i][k]->name, winfo->wseq[j][k]->name)))
00438 break;
00439 tmplen++;
00440 }
00441 return(tmplen);
00442 }
00443
00462 static void
00463 add_wacc(WCHMM_INFO *wchmm, int node, LOGPROB a, int arc)
00464 {
00465 A_CELL *ac;
00466 ac = (A_CELL *) mybmalloc2(sizeof(A_CELL), &(wchmm->malloc_root));
00467 ac->a = a;
00468 ac->arc = arc;
00469 ac->next = wchmm->state[node].ac;
00470 wchmm->state[node].ac = ac;
00471 }
00472
00473 #ifdef MULTIPATH_VERSION
00474
00475
00476
00503 static void
00504 get_outtrans_list(WCHMM_INFO *wchmm, WORD_ID w, int pos, int *node, LOGPROB *a, int *num, int maxnum, boolean insert_sp)
00505 {
00506 HMM_Logical *ltmp;
00507 int states;
00508 int k, ato;
00509 LOGPROB prob;
00510 int oldnum;
00511
00512 if (pos < 0) {
00513
00514
00515 node[*num] = wchmm->wordbegin[w];
00516 a[*num] = 0.0;
00517 (*num)++;
00518
00519 } else {
00520
00521 ltmp = wchmm->winfo->wseq[w][pos];
00522 states = hmm_logical_state_num(ltmp);
00523
00524
00525 if ((hmm_logical_trans(ltmp))->a[0][states-1] != LOG_ZERO) {
00526
00527 oldnum = *num;
00528 get_outtrans_list(wchmm, w, pos-1, node, a, num, maxnum, FALSE);
00529
00530 for(k=oldnum;k<*num;k++) {
00531 a[k] += (hmm_logical_trans(ltmp))->a[0][states-1];
00532 }
00533 }
00534
00535 for (k = 1; k < states - 1; k++) {
00536 prob = (hmm_logical_trans(ltmp))->a[k][states-1];
00537 if (prob != LOG_ZERO) {
00538 if (*num >= maxnum) {
00539 j_error("Maximum outtrans list num exceeded! (%d)\n", maxnum);
00540 }
00541 node[*num] = wchmm->offset[w][pos] + k - 1;
00542 a[*num] = prob;
00543 (*num)++;
00544 }
00545 }
00546
00547
00548
00549
00550
00551 if (enable_iwsp && insert_sp) {
00552
00553 for (k = 1; k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
00554 prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[k][hmm_logical_state_num(wchmm->hmminfo->sp)-1];
00555 if (prob != LOG_ZERO) {
00556 if (*num >= maxnum) {
00557 j_error("Maximum outtrans list num exceeded! (%d)\n", maxnum);
00558 }
00559 node[*num] = wchmm->offset[w][pos] + (states - 2) + k - 1;
00560 a[*num] = prob;
00561 (*num)++;
00562 }
00563 }
00564 }
00565 }
00566
00567
00568 return;
00569 }
00570
00571 #else
00572
00591 static void
00592 wchmm_link_hmm(WCHMM_INFO *wchmm, int from_node, int to_node, HTK_HMM_Trans *tinfo)
00593 {
00594 A_CELL *actmp;
00595 LOGPROB a, atmp;
00596 int i;
00597 boolean tflag;
00598
00599
00600 for(i = tinfo->statenum - 2; i >= 0; i--) {
00601 if ((a = tinfo->a[i][tinfo->statenum-1]) != LOG_ZERO) {
00602 atmp = a;
00603
00604 tflag = FALSE;
00605 for (actmp = wchmm->state[from_node].ac; actmp; actmp = actmp->next) {
00606 if (actmp->arc == to_node && actmp->a == atmp) {
00607 tflag = TRUE;
00608 break;
00609 }
00610 }
00611 if (tflag) break;
00612
00613 add_wacc(wchmm, from_node, atmp, to_node);
00614 return;
00615 }
00616 }
00617 j_error("Error: No arc to endstate?\n");
00618 }
00619
00640 static void
00641 wchmm_link_subword(WCHMM_INFO *wchmm, int from_word, int from_seq, int to_word, int to_seq)
00642 {
00643 HMM_Logical *last;
00644 int lastp;
00645
00646 last = wchmm->winfo->wseq[from_word][from_seq];
00647 lastp = wchmm->offset[from_word][from_seq] + hmm_logical_state_num(last)-2 -1;
00648 wchmm_link_hmm(wchmm, lastp, wchmm->offset[to_word][to_seq],
00649 hmm_logical_trans(last));
00650 }
00651
00652
00653
00654
00655
00695 static void
00696 wchmm_duplicate_state(WCHMM_INFO *wchmm, int node, int word)
00697 {
00698 int n;
00699 int n_src, n_prev;
00700 A_CELL *ac;
00701 HMM_Logical *lastphone;
00702
00703
00704 if (wchmm->n + 1 >= wchmm->maxwcn) {
00705 wchmm_extend(wchmm);
00706 }
00707
00708 n = wchmm->n;
00709
00710 n_src = node;
00711
00712
00713 #ifdef PASS1_IWCD
00714 {
00715 RC_INFO *rcnew;
00716 LRC_INFO *lrcnew;
00717 wchmm->outstyle[n] = wchmm->outstyle[n_src];
00718 if (wchmm->outstyle[n] == AS_RSET) {
00719
00720 rcnew = (RC_INFO *)mybmalloc2(sizeof(RC_INFO), &(wchmm->malloc_root));
00721 memcpy(rcnew, wchmm->state[n_src].out.rset, sizeof(RC_INFO));
00722 wchmm->state[n].out.rset = rcnew;
00723 } else if (wchmm->outstyle[n] == AS_LRSET) {
00724
00725 lrcnew = (LRC_INFO *)mybmalloc2(sizeof(LRC_INFO), &(wchmm->malloc_root));
00726 memcpy(lrcnew, wchmm->state[n_src].out.lrset, sizeof(LRC_INFO));
00727 wchmm->state[n].out.lrset = lrcnew;
00728 } else {
00729
00730 memcpy(&(wchmm->state[n].out), &(wchmm->state[n_src].out), sizeof(ACOUSTIC_SPEC));
00731 }
00732 }
00733 #else
00734 memcpy(&(wchmm->state[n].out), &(wchmm->state[n_src].out), sizeof(HTK_HMM_State *));
00735 #endif
00736
00737 lastphone = wchmm->winfo->wseq[word][wchmm->winfo->wlen[word]-1];
00738 wchmm->state[n].ac = NULL;
00739
00740
00741 for(ac=wchmm->state[n_src].ac; ac; ac=ac->next) {
00742 if (ac->arc == n_src) {
00743 add_wacc(wchmm, n, ac->a, n);
00744 }
00745 }
00746
00747
00748 if (hmm_logical_state_num(lastphone) == 3) {
00749
00750 if (wchmm->winfo->wlen[word] == 1) {
00751
00752 wchmm->ststart[n] = word;
00753 wchmm->offset[word][0] = n;
00754 } else {
00755
00756 n_prev = wchmm->offset[word][wchmm->winfo->wlen[word]-2]
00757 + hmm_logical_state_num(wchmm->winfo->wseq[word][wchmm->winfo->wlen[word]-2]) - 3;
00758 for (ac=wchmm->state[n_prev].ac; ac; ac=ac->next) {
00759 if (ac->arc == n_src) {
00760 add_wacc(wchmm, n_prev, ac->a, n);
00761 }
00762 }
00763
00764 wchmm->offset[word][wchmm->winfo->wlen[word]-1] = n;
00765
00766 wchmm->ststart[n] = WORD_INVALID;
00767 }
00768 } else {
00769
00770 for (n_prev = wchmm->offset[word][wchmm->winfo->wlen[word]-1]; n_prev < n_src; n_prev++) {
00771 for (ac=wchmm->state[n_prev].ac; ac; ac=ac->next) {
00772 if (ac->arc == n_src) {
00773 add_wacc(wchmm, n_prev, ac->a, n);
00774 }
00775 }
00776 for (ac=wchmm->state[n_src].ac; ac; ac=ac->next) {
00777 if (ac->arc == n_prev) {
00778 add_wacc(wchmm, n, ac->a, n_prev);
00779 }
00780 }
00781 }
00782
00783 wchmm->ststart[n] = WORD_INVALID;
00784 }
00785
00786
00787 wchmm->stend[n] = word;
00788 wchmm->wordend[word] = n;
00789
00790
00791 wchmm->n++;
00792
00793 }
00794
00809 static void
00810 wchmm_duplicate_leafnode(WCHMM_INFO *wchmm)
00811 {
00812 int w, nlast, n, narc, narc_model;
00813 boolean *dupw;
00814 A_CELL *actmp;
00815
00816 nlast = wchmm->n;
00817 dupw = (boolean *)mymalloc(sizeof(boolean) * nlast);
00818 for(n=0;n<nlast;n++) dupw[n] = FALSE;
00819
00820 for (w=0;w<wchmm->winfo->num;w++) {
00821 n = wchmm->wordend[w];
00822 if (dupw[n]) {
00823 wchmm_duplicate_state(wchmm, n, w); dupcount++;
00824 } else {
00825
00826 {
00827
00828 HMM_Logical *lastphone;
00829 HTK_HMM_Trans *tinfo;
00830 int laststate, i;
00831 lastphone = wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1];
00832 laststate = hmm_logical_state_num(lastphone) - 2;
00833 tinfo = hmm_logical_trans(lastphone);
00834 narc_model=0;
00835 for(i=1;i<hmm_logical_state_num(lastphone)-1;i++) {
00836 if (tinfo->a[laststate][i] != LOG_ZERO) narc_model++;
00837 }
00838
00839 narc = 0;
00840 for(actmp=wchmm->state[n].ac;actmp;actmp=actmp->next) narc++;
00841 }
00842
00843 if (narc_model != narc) {
00844
00845
00846 wchmm_duplicate_state(wchmm, n, w); dupcount++;
00847
00848
00849 wchmm->stend[n] = WORD_INVALID;
00850 } else {
00851
00852
00853 wchmm->stend[n] = w;
00854 }
00855
00856 dupw[n] = TRUE;
00857 }
00858 }
00859 free(dupw);
00860 }
00861
00862 #endif
00863
00864
00865
00866
00867
00890 static void
00891 wchmm_add_word(WCHMM_INFO *wchmm, int word, int matchlen, int matchword)
00892 {
00893 int j,k,n;
00894 int add_head, add_tail, add_to;
00895 int word_len, matchword_len;
00896 HMM_Logical *ltmp;
00897 int ato;
00898 LOGPROB prob;
00899 int ntmp;
00900 int ltmp_state_num;
00901 #ifdef PASS1_IWCD
00902 CD_Set *lcd = NULL;
00903 #endif
00904
00905 #ifdef MULTIPATH_VERSION
00906 int *out_from, *out_from_next;
00907 LOGPROB *out_a, *out_a_next;
00908 int out_num_prev, out_num_next;
00909 int kkk;
00910 #endif
00911
00912
00913
00914
00915
00916
00917
00918
00919
00920
00921
00922 n = wchmm->n;
00923 word_len = wchmm->winfo->wlen[word];
00924 matchword_len = wchmm->winfo->wlen[matchword];
00925
00926
00927 if((wchmm->offset[word]=(int *)mybmalloc2(sizeof(int)*word_len, &(wchmm->malloc_root)))==NULL){
00928 j_error("malloc failed at wchmm_add_word()\n");
00929 }
00930
00931
00932 add_head = matchlen;
00933 add_tail = word_len - 1;
00934 add_to = matchlen - 1;
00935
00936 #ifdef MULTIPATH_VERSION
00937
00938 if (matchlen == 0) {
00939
00940 wchmm->wordbegin[word] = n;
00941 wchmm->stend[n] = WORD_INVALID;
00942 wchmm->state[n].ac = NULL;
00943 wchmm->state[n].out.state = NULL;
00944
00945 wchmm->startnode[wchmm->startnum] = n;
00946 #ifdef CATEGORY_TREE
00947 wchmm->start2wid[wchmm->startnum] = word;
00948 #endif
00949
00950 if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm);
00951 if (++n >= wchmm->maxwcn) wchmm_extend(wchmm);
00952 } else {
00953 wchmm->wordbegin[word] = wchmm->wordbegin[matchword];
00954 }
00955
00956
00957
00958
00959 out_from = (int *)mymalloc(sizeof(int) * wchmm->winfo->maxwn);
00960 out_from_next = (int *)mymalloc(sizeof(int) * wchmm->winfo->maxwn);
00961 out_a = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->winfo->maxwn);
00962 out_a_next = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->winfo->maxwn);
00963
00964
00965 out_num_prev = 0;
00966 if (matchlen == 0) {
00967
00968 out_from[0] = wchmm->wordbegin[word];
00969 out_a[0] = 0.0;
00970 out_num_prev = 1;
00971 } else {
00972
00973
00974 get_outtrans_list(wchmm, matchword, add_to, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, (add_tail - add_head + 1 > 0) ? FALSE : TRUE);
00975
00976 }
00977
00978 #endif
00979
00980 if (add_tail - add_head + 1 > 0) {
00981 ntmp = n;
00982 for (j=add_head; j <= add_tail; j++) {
00983 ltmp = wchmm->winfo->wseq[word][j];
00984 ltmp_state_num = hmm_logical_state_num(ltmp);
00985 #ifdef PASS1_IWCD
00986 if (ccd_flag) {
00987
00988
00989 if (wchmm->winfo->wlen[word] > 1 && j == wchmm->winfo->wlen[word] - 1) {
00990 #ifdef CATEGORY_TREE
00991 if (! old_iwcd_flag) {
00992 lcd = lcdset_lookup_with_category(wchmm, ltmp, wchmm->winfo->wton[word]);
00993 if (lcd == NULL) {
00994
00995
00996
00997 j_printerr("Warning: no lcdset found for [%s::%d], fallback to [%s]\n", ltmp->name, wchmm->winfo->wton[word], ltmp->name);
00998 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name);
00999 }
01000 } else {
01001 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name);
01002 }
01003 #else
01004 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name);
01005 #endif
01006 if (lcd == NULL) {
01007 j_error("Error: no lcdset found for [%s]\n",ltmp->name);
01008 }
01009 }
01010 }
01011 #endif
01012 for (k = 1; k < ltmp_state_num - 1; k++) {
01013
01014 #ifdef PASS1_IWCD
01015 if (ccd_flag) {
01016
01017 if (wchmm->winfo->wlen[word] == 1) {
01018 wchmm->outstyle[ntmp] = AS_LRSET;
01019 wchmm->state[ntmp].out.lrset = (LRC_INFO *)mybmalloc2(sizeof(LRC_INFO), &(wchmm->malloc_root));
01020 (wchmm->state[ntmp].out.lrset)->hmm = ltmp;
01021 (wchmm->state[ntmp].out.lrset)->state_loc = k;
01022 #ifdef CATEGORY_TREE
01023 (wchmm->state[ntmp].out.lrset)->category = wchmm->winfo->wton[word];
01024 #endif
01025 } else if (j == 0) {
01026 wchmm->outstyle[ntmp] = AS_RSET;
01027 wchmm->state[ntmp].out.rset = (RC_INFO *)mybmalloc2(sizeof(RC_INFO), &(wchmm->malloc_root));
01028 (wchmm->state[ntmp].out.rset)->hmm = ltmp;
01029 (wchmm->state[ntmp].out.rset)->state_loc = k;
01030 } else if (j == wchmm->winfo->wlen[word] - 1) {
01031 wchmm->outstyle[ntmp] = AS_LSET;
01032 wchmm->state[ntmp].out.lset = &(lcd->stateset[k]);
01033 } else {
01034 wchmm->outstyle[ntmp] = AS_STATE;
01035 if (ltmp->is_pseudo) {
01036 j_printerr("Warning: word-internal phone should not be pseudo\n");
01037 put_voca(wchmm->winfo, word);
01038 }
01039 wchmm->state[ntmp].out.state = ltmp->body.defined->s[k];
01040 }
01041 } else {
01042
01043 if (ltmp->is_pseudo) {
01044 j_printerr("InternalError: CDSET phoneme exist in monophone?\n");
01045 put_voca(wchmm->winfo, word);
01046 }
01047 wchmm->outstyle[ntmp] = AS_STATE;
01048 wchmm->state[ntmp].out.state = ltmp->body.defined->s[k];
01049 }
01050 #else
01051 if (ltmp->is_pseudo) {
01052 j_printerr("InternalError: CDSET phone exist in monophone?\n");
01053 put_voca(wchmm->winfo, word);
01054 }
01055 wchmm->state[ntmp].out = ltmp->body.defined->s[k];
01056 #endif
01057
01058
01059 wchmm->state[ntmp].ac = NULL;
01060 wchmm->stend[ntmp] = WORD_INVALID;
01061 #ifndef MULTIPATH_VERSION
01062 wchmm->ststart[ntmp] = WORD_INVALID;
01063
01064 for (ato = 1; ato < ltmp_state_num; ato++) {
01065 prob = (hmm_logical_trans(ltmp))->a[k][ato];
01066 if
01067 (prob != LOG_ZERO)
01068 {
01069 if (j == add_tail && k == ltmp_state_num - 2 && ato == ltmp_state_num - 1) {
01070
01071 } else {
01072 add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01073 }
01074 }
01075 }
01076 #endif
01077
01078 ntmp++;
01079
01080 if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01081 }
01082 }
01083
01084 #ifdef MULTIPATH_VERSION
01085
01086
01087
01088
01089 ntmp = n;
01090 for (j = add_head; j <= add_tail; j++) {
01091 ltmp = wchmm->winfo->wseq[word][j];
01092 ltmp_state_num = hmm_logical_state_num(ltmp);
01093 out_num_next = 0;
01094
01095 for (ato = 1; ato < ltmp_state_num; ato++) {
01096 prob = (hmm_logical_trans(ltmp))->a[0][ato];
01097 if (prob != LOG_ZERO) {
01098
01099 if (ato == ltmp_state_num - 1) {
01100
01101 for(kkk=0; kkk<out_num_prev; kkk++) {
01102 out_from_next[out_num_next] = out_from[kkk];
01103 out_a_next[out_num_next] = out_a[kkk] + prob;
01104 out_num_next++;
01105 }
01106 } else {
01107 for(kkk=0; kkk<out_num_prev; kkk++) {
01108 add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1);
01109 }
01110 }
01111 }
01112 }
01113
01114 for(k = 1; k < ltmp_state_num - 1; k++) {
01115 for (ato = 1; ato < ltmp_state_num; ato++) {
01116 prob = (hmm_logical_trans(ltmp))->a[k][ato];
01117 if (prob != LOG_ZERO) {
01118 if (ato == ltmp_state_num - 1) {
01119
01120 out_from_next[out_num_next] = ntmp;
01121 out_a_next[out_num_next] = prob;
01122 out_num_next++;
01123 } else {
01124 add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01125 }
01126 }
01127 }
01128 ntmp++;
01129 }
01130
01131 for(kkk=0;kkk<out_num_next;kkk++) {
01132 out_from[kkk] = out_from_next[kkk];
01133 out_a[kkk] = out_a_next[kkk];
01134 }
01135 out_num_prev = out_num_next;
01136 }
01137
01138 #endif
01139
01140 }
01141
01142
01143 #ifdef MULTIPATH_VERSION
01144
01145
01146
01147
01148
01149 if (enable_iwsp && add_tail - add_head + 1 > 0) {
01150 int ntmp_bak;
01151
01152
01153 ntmp_bak = ntmp;
01154 if (wchmm->hmminfo->sp->is_pseudo) {
01155 for(k = 1;k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
01156 wchmm->outstyle[ntmp] = AS_LSET;
01157 wchmm->state[ntmp].out.lset = &(wchmm->hmminfo->sp->body.pseudo->stateset[k]);
01158 wchmm->state[ntmp].ac = NULL;
01159 wchmm->stend[ntmp] = WORD_INVALID;
01160 ntmp++;
01161 if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01162 }
01163 } else {
01164 for(k = 1;k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
01165 wchmm->outstyle[ntmp] = AS_STATE;
01166 wchmm->state[ntmp].out.state = wchmm->hmminfo->sp->body.defined->s[k];
01167 wchmm->state[ntmp].ac = NULL;
01168 wchmm->stend[ntmp] = WORD_INVALID;
01169 ntmp++;
01170 if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm);
01171 }
01172 }
01173 ntmp = ntmp_bak;
01174
01175 out_num_next = 0;
01176 for (ato = 1; ato < hmm_logical_state_num(wchmm->hmminfo->sp); ato++) {
01177 prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[0][ato];
01178 if (prob != LOG_ZERO) {
01179
01180
01181 prob += wchmm->hmminfo->iwsp_penalty;
01182 if (ato == hmm_logical_state_num(wchmm->hmminfo->sp) - 1) {
01183
01184 for(kkk=0; kkk<out_num_prev; kkk++) {
01185 out_from_next[out_num_next] = out_from[kkk];
01186 out_a_next[out_num_next] = out_a[kkk] + prob;
01187 out_num_next++;
01188 }
01189 } else {
01190
01191 for(kkk=0; kkk<out_num_prev; kkk++) {
01192 add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1);
01193 }
01194 }
01195 }
01196 }
01197
01198 if (hmm_logical_trans(wchmm->hmminfo->sp)->a[0][hmm_logical_state_num(wchmm->hmminfo->sp)-1] == LOG_ZERO) {
01199
01200
01201 prob = 0.0;
01202 for(kkk=0; kkk<out_num_prev; kkk++) {
01203 out_from_next[out_num_next] = out_from[kkk];
01204 out_a_next[out_num_next] = out_a[kkk] + prob;
01205 out_num_next++;
01206 }
01207 }
01208
01209 for (k = 1; k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) {
01210 for (ato = 1; ato < hmm_logical_state_num(wchmm->hmminfo->sp); ato++) {
01211 prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[k][ato];
01212 if (prob != LOG_ZERO) {
01213 if (ato == hmm_logical_state_num(wchmm->hmminfo->sp) - 1) {
01214 out_from_next[out_num_next] = ntmp;
01215 out_a_next[out_num_next] = prob;
01216 out_num_next++;
01217 } else {
01218 add_wacc(wchmm, ntmp, prob, ntmp + ato - k);
01219 }
01220 }
01221 }
01222 ntmp++;
01223 }
01224
01225 for(kkk=0;kkk<out_num_next;kkk++) {
01226 out_from[kkk] = out_from_next[kkk];
01227 out_a[kkk] = out_a_next[kkk];
01228 }
01229 out_num_prev = out_num_next;
01230 }
01231
01232 #endif
01233
01234
01235 for (j=0;j<word_len;j++) {
01236 if (j < add_head) {
01237 wchmm->offset[word][j] = wchmm->offset[matchword][j];
01238 } else if (add_tail < j) {
01239 wchmm->offset[word][j] = wchmm->offset[matchword][j+(matchword_len-word_len)];
01240 } else {
01241 wchmm->offset[word][j] = n;
01242 n += hmm_logical_state_num(wchmm->winfo->wseq[word][j]) - 2;
01243 }
01244 }
01245 #ifdef MULTIPATH_VERSION
01246
01247
01248 if (enable_iwsp && add_tail - add_head + 1 > 0) {
01249 n += hmm_logical_state_num(wchmm->hmminfo->sp) - 2;
01250 if (n != ntmp) j_error("Algorithm Error! cannot match\n");
01251 }
01252
01253
01254 wchmm->wordend[word] = n;
01255 wchmm->stend[n] = word;
01256 wchmm->state[n].ac = NULL;
01257 wchmm->state[n].out.state = NULL;
01258
01259
01260 for(k = 0; k < out_num_prev; k++) {
01261 add_wacc(wchmm, out_from[k], out_a[k], n);
01262 }
01263 n++;
01264 if (n >= wchmm->maxwcn) wchmm_extend(wchmm);
01265
01266 if (matchlen == 0) {
01267
01268
01269 out_num_prev = 0;
01270 get_outtrans_list(wchmm, word, word_len-1, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, TRUE);
01271 for(k=0;k<out_num_prev;k++) {
01272 if (out_from[k] == wchmm->wordbegin[word]) {
01273 j_printerr("\n*** ERROR: WORD SKIPPING TRANSITION NOT ALLOWED ***\n");
01274 j_printerr(" Word id=%d (%s[%s]) has \"word skipping transition\".\n", word, wchmm->winfo->wname[word], wchmm->winfo->woutput[word]);
01275 j_printerr(" All HMMs in the word:\n ");
01276 for(kkk=0;kkk<wchmm->winfo->wlen[word];kkk++) {
01277 j_printerr("%s ", wchmm->winfo->wseq[word][kkk]->name);
01278 }
01279 j_printerr("\n has transitions from initial state to final state.\n");
01280 j_error(" This type of word skipping is not supported.\n");
01281 }
01282 }
01283 }
01284
01285 wchmm->n = n;
01286 free(out_from);
01287 free(out_from_next);
01288 free(out_a);
01289 free(out_a_next);
01290
01291 #else
01292
01293 wchmm->n = n;
01294 wchmm->ststart[wchmm->offset[word][0]] = word;
01295 k = wchmm->offset[word][word_len-1] + hmm_logical_state_num(wchmm->winfo->wseq[word][word_len-1])-2 -1;
01296 wchmm->wordend[word] = k;
01297 wchmm->stend[k] = word;
01298
01299 if (matchlen != 0 && add_tail - add_head + 1 > 0) {
01300
01301
01302 wchmm_link_subword(wchmm, matchword,add_to,word,add_head);
01303 }
01304
01305 #endif
01306
01307 }
01308
01309
01310
01311
01312
01313 #ifndef MULTIPATH_VERSION
01314
01328 static void
01329 wchmm_index_ststart(WCHMM_INFO *wchmm)
01330 {
01331 int n;
01332 int id;
01333
01334 id = 0;
01335 for (n=0;n<wchmm->n;n++) {
01336 if (wchmm->ststart[n] != WORD_INVALID) {
01337 #ifdef USE_NGRAM
01338
01339
01340
01341 if (wchmm->ststart[n] == wchmm->winfo->head_silwid) continue;
01342 #endif
01343 wchmm->startnode[id] = n;
01344 id++;
01345 if (id > wchmm->winfo->num) {
01346 j_printerr("Error: start node num exceeded %d\n", wchmm->winfo->num);
01347 }
01348 }
01349 }
01350 wchmm->startnum = id;
01351 }
01352
01366 static void
01367 wchmm_calc_wordend_arc(WCHMM_INFO *wchmm)
01368 {
01369 WORD_ID w;
01370 HTK_HMM_Trans *tr;
01371 LOGPROB a;
01372
01373 for (w=0;w<wchmm->winfo->num;w++) {
01374 tr = hmm_logical_trans(wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1]);
01375 a = tr->a[tr->statenum-2][tr->statenum-1];
01376 wchmm->wordend_a[w] = a;
01377 }
01378 }
01379
01380 #endif
01381
01382
01383
01384
01385 #ifdef USE_NGRAM
01386 #ifdef SEPARATE_BY_UNIGRAM
01387
01406 static int
01407 compare_prob(LOGPROB *a, LOGPROB *b)
01408 {
01409 if (*a < *b) return (1);
01410 if (*a > *b) return (-1);
01411 return(0);
01412 }
01413
01432 static LOGPROB
01433 get_nbest_uniprob(WORD_INFO *winfo, int n)
01434 {
01435 LOGPROB *u_p;
01436 WORD_ID w;
01437 LOGPROB x;
01438
01439 if (n < 1) n = 1;
01440 if (n > winfo->num) n = winfo->num;
01441
01442
01443 u_p = (LOGPROB *)mymalloc(sizeof(LOGPROB) * winfo->num);
01444 for(w=0;w<winfo->num;w++) u_p[w] =
01445 uni_prob(ngram, winfo->wton[w])
01446 #ifdef CLASS_NGRAM
01447 + winfo->cprob[w]
01448 #endif
01449 ;
01450
01451 qsort(u_p, winfo->num, sizeof(LOGPROB),
01452 (int (*)(const void *,const void *))compare_prob);
01453
01454
01455 x = u_p[n-1];
01456 free(u_p);
01457 return(x);
01458 }
01459
01460 #endif
01461 #endif
01462
01463
01464
01465
01466
01467 static int separated_word_count;
01468
01469 #ifdef CATEGORY_TREE
01470
01471 #define COUNT_STEP 500
01472
01473
01490 void
01491 build_wchmm(WCHMM_INFO *wchmm)
01492 {
01493 int i,j;
01494 int matchword=0, sharelen=0, maxsharelen=0;
01495 int counter = COUNT_STEP;
01496 #ifdef SEPARATE_BY_UNIGRAM
01497 LOGPROB separate_thres;
01498 #endif
01499
01500
01501
01502 if (wchmm->winfo == NULL
01503 #ifdef USE_NGRAM
01504 || wchmm->ngram == NULL
01505 #endif
01506 #ifdef USE_DFA
01507 || wchmm->dfa == NULL
01508 #endif
01509 ) {
01510 j_error("InternalError: build_wchmm: lingustic info not available!!\n");
01511 }
01512
01513 #ifdef SEPARATE_BY_UNIGRAM
01514
01515
01516 separate_thres = get_nbest_uniprob(wchmm->winfo, separate_wnum);
01517 #endif
01518
01519 #ifdef PASS1_IWCD
01520 #ifdef CATEGORY_TREE
01521 if (ccd_flag && !old_iwcd_flag) {
01522
01523 lcdset_register_with_category_all(wchmm, wchmm->hmminfo, winfo, dfa);
01524 }
01525 #endif
01526 #endif
01527
01528
01529
01530 wchmm_init(wchmm);
01531
01532
01533 separated_word_count=0;
01534
01535 j_printerr("Building HMM lexicon tree (left-to-right)...\n");
01536 for (i=0;i<wchmm->winfo->num;i++) {
01537 if (verbose_flag) {
01538 if (i >= counter) {
01539 j_printerr("\r %5d words proceeded (%6d nodes)",i,wchmm->n);
01540 counter += COUNT_STEP;
01541 }
01542 }
01543 #ifdef USE_NGRAM
01544 if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) {
01545
01546
01547 wchmm_add_word(wchmm, i,0,0);
01548 continue;
01549 }
01550 #ifndef NO_SEPARATE_SHORT_WORD
01551 if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) {
01552
01553 wchmm_add_word(wchmm, i,0,0);
01554 separated_word_count++;
01555 continue;
01556 }
01557 #endif
01558 #ifdef SEPARATE_BY_UNIGRAM
01559 if (
01560 uni_prob(wchmm->ngram, wchmm->winfo->wton[i])
01561 #ifdef CLASS_NGRAM
01562 + wchmm->winfo->cprob[i]
01563 #endif
01564 >= separate_thres && separated_word_count < separate_wnum) {
01565
01566
01567 wchmm_add_word(wchmm, i,0,0);
01568 separated_word_count++;
01569 continue;
01570 }
01571 #endif
01572 #endif
01573
01574 maxsharelen=0;
01575 for (j=0;j<i;j++) {
01576 #ifdef CATEGORY_TREE
01577 if (wchmm->winfo->wton[i] != wchmm->winfo->wton[j]) continue;
01578 #endif
01579 sharelen = wchmm_check_match(wchmm->winfo, i, j);
01580 if (sharelen == wchmm->winfo->wlen[i] && sharelen == wchmm->winfo->wlen[j]) {
01581
01582
01583 maxsharelen = sharelen;
01584 matchword = j;
01585 break;
01586 }
01587 if (sharelen > maxsharelen) {
01588 matchword = j;
01589 maxsharelen = sharelen;
01590 }
01591 }
01592 wchmm_add_word(wchmm, i,maxsharelen,matchword);
01593 }
01594 j_printerr("\r %5d words ended (%6d nodes)\n",i,wchmm->n);
01595
01596 #if 0
01597
01598 for (i=0;i<wchmm->winfo->num;i++) {
01599 if (verbose_flag) {
01600 if (i >= counter) {
01601 j_printerr(" %5d words proceeded (%6d nodes)\n",i,wchmm->n);
01602 counter += COUNT_STEP;
01603 }
01604 }
01605 wchmm_add_word(wchmm, i,0,0);
01606 }
01607 j_printerr(" %5d words ended (%6d nodes)\n",i,wchmm->n);
01608 #endif
01609
01610 #ifndef MULTIPATH_VERSION
01611
01612 wchmm_duplicate_leafnode(wchmm);
01613 VERMES(" %d leaf nodes are made unshared\n",dupcount);
01614
01615
01616 wchmm_calc_wordend_arc(wchmm);
01617 #endif
01618
01619
01620 check_wchmm(wchmm);
01621
01622 #ifndef MULTIPATH_VERSION
01623
01624 wchmm_index_ststart(wchmm);
01625 #endif
01626
01627
01628 #ifndef CATEGORY_TREE
01629 make_successor_list(wchmm);
01630 #ifdef USE_NGRAM
01631 #ifdef UNIGRAM_FACTORING
01632
01633
01634 calc_all_unigram_factoring_values(wchmm);
01635 j_printerr(" 1-gram factoring values has been pre-computed\n");
01636 #endif
01637 #endif
01638 #ifdef MULTIPATH_VERSION
01639
01640 adjust_sc_index(wchmm);
01641 #endif
01642 #ifdef USE_NGRAM
01643 #ifdef UNIGRAM_FACTORING
01644
01645 make_iwcache_index(wchmm);
01646 #endif
01647 #endif
01648 #endif
01649
01650 j_printerr("done\n");
01651
01652
01653 if (wchmm_check_flag) {
01654 wchmm_check_interactive(wchmm);
01655 }
01656 }
01657
01658 #endif
01659
01681 void
01682 build_wchmm2(WCHMM_INFO *wchmm)
01683 {
01684 int i,j, last_i;
01685 int count_step, counter;
01686 WORD_ID *windex;
01687 #ifdef USE_NGRAM
01688 #ifdef SEPARATE_BY_UNIGRAM
01689 LOGPROB separate_thres;
01690 #endif
01691 #endif
01692
01693
01694
01695 if (wchmm->winfo == NULL
01696 #ifdef USE_NGRAM
01697 || wchmm->ngram == NULL
01698 #endif
01699 #ifdef USE_DFA
01700 || wchmm->dfa == NULL
01701 #endif
01702 ) {
01703 j_error("InternalError: build_wchmm: lingustic info not available!!\n");
01704 }
01705
01706 separated_word_count = 0;
01707 count_step = wchmm->winfo->num / 10;
01708 counter = count_step;
01709
01710 j_printerr("Building HMM lexicon tree");
01711
01712 #ifdef USE_NGRAM
01713 #ifdef SEPARATE_BY_UNIGRAM
01714
01715
01716 separate_thres = get_nbest_uniprob(wchmm->winfo, separate_wnum);
01717 #endif
01718 #endif
01719
01720 #ifdef PASS1_IWCD
01721 #ifdef CATEGORY_TREE
01722 if (ccd_flag && !old_iwcd_flag) {
01723
01724
01725
01726 lcdset_register_with_category_all(wchmm, wchmm->hmminfo, winfo, dfa);
01727 }
01728 #endif
01729 #endif
01730
01731
01732 wchmm_init(wchmm);
01733
01734
01735 windex = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wchmm->winfo->num);
01736 for(i=0;i<wchmm->winfo->num;i++) windex[i] = i;
01737 #ifdef CATEGORY_TREE
01738
01739 wchmm_sort_idx_by_category(wchmm->winfo, windex, wchmm->winfo->num);
01740 {
01741 int last_cate;
01742 last_i = 0;
01743 last_cate = wchmm->winfo->wton[windex[0]];
01744 for(i = 1;i<wchmm->winfo->num;i++) {
01745 if (wchmm->winfo->wton[windex[i]] != last_cate) {
01746 wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, i - last_i);
01747 last_cate = wchmm->winfo->wton[windex[i]];
01748 last_i = i;
01749 }
01750 }
01751 wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, wchmm->winfo->num - last_i);
01752 }
01753 #else
01754
01755 wchmm_sort_idx_by_wseq(wchmm->winfo, windex, 0, wchmm->winfo->num);
01756 #endif
01757
01758
01759
01760
01761
01762
01763
01764
01765
01766
01767
01768
01769
01770 last_i = WORD_INVALID;
01771 for (j=0;j<wchmm->winfo->num;j++) {
01772 i = windex[j];
01773 if (j >= counter) {
01774
01775 j_printerr(".");
01776 counter += count_step;
01777 }
01778 #ifdef USE_NGRAM
01779
01780 if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) {
01781 wchmm_add_word(wchmm, i,0,0);
01782 continue;
01783 }
01784 #ifndef NO_SEPARATE_SHORT_WORD
01785
01786 if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) {
01787 wchmm_add_word(wchmm, i,0,0);
01788 separated_word_count++;
01789 continue;
01790 }
01791 #endif
01792 #ifdef SEPARATE_BY_UNIGRAM
01793
01794 if (
01795 uni_prob(wchmm->ngram, wchmm->winfo->wton[i])
01796 #ifdef CLASS_NGRAM
01797 + wchmm->winfo->cprob[i]
01798 #endif
01799 >= separate_thres && separated_word_count < separate_wnum) {
01800 wchmm_add_word(wchmm, i,0,0);
01801 separated_word_count++;
01802 continue;
01803 }
01804 #endif
01805 #endif
01806 if (last_i == WORD_INVALID) {
01807 wchmm_add_word(wchmm, i,0,0);
01808 } else {
01809
01810 #ifdef CATEGORY_TREE
01811 if (wchmm->winfo->wton[i] != wchmm->winfo->wton[last_i]) {
01812 wchmm_add_word(wchmm, i,0,0);
01813 } else {
01814 wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i);
01815 }
01816 #else
01817 wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i);
01818 #endif
01819 }
01820 last_i = i;
01821
01822 }
01823
01824
01825
01826
01827 free(windex);
01828
01829 #ifdef MULTIPATH_VERSION
01830 j_printerr("%d nodes\n", wchmm->n);
01831 #else
01832
01833 j_printerr("%d", wchmm->n);
01834 wchmm_duplicate_leafnode(wchmm);
01835 j_printerr("+%d=%d nodes\n",dupcount, wchmm->n);
01836 #endif
01837
01838 #ifndef MULTIPATH_VERSION
01839
01840 wchmm_calc_wordend_arc(wchmm);
01841 #endif
01842
01843
01844 check_wchmm(wchmm);
01845
01846 #ifndef MULTIPATH_VERSION
01847
01848 wchmm_index_ststart(wchmm);
01849 #endif
01850
01851
01852 #ifndef CATEGORY_TREE
01853 make_successor_list(wchmm);
01854 #ifdef UNIGRAM_FACTORING
01855
01856 calc_all_unigram_factoring_values(wchmm);
01857 j_printerr(" 1-gram factoring values has been pre-computed\n");
01858 #endif
01859 #ifdef MULTIPATH_VERSION
01860
01861 adjust_sc_index(wchmm);
01862 #endif
01863 #ifdef UNIGRAM_FACTORING
01864
01865 make_iwcache_index(wchmm);
01866 #endif
01867 #endif
01868
01869 j_printerr("done\n");
01870
01871
01872 if (wchmm_check_flag) {
01873 wchmm_check_interactive(wchmm);
01874 }
01875
01876 #ifdef WCHMM_SIZE_CHECK
01877
01878 j_printf("--- estimated size of word lexicon ---\n");
01879 j_printf("wchmm: %d words, %d nodes\n", wchmm->winfo->num, wchmm->n);
01880 j_printf("%9d bytes: wchmm->state[node] (exclude ac, sc)\n", sizeof(WCHMM_STATE) * wchmm->n);
01881 #ifndef MULTIPATH_VERSION
01882 j_printf("%9d bytes: wchmm->ststart[node]\n", sizeof(WORD_ID) * wchmm->n);
01883 #endif
01884 printf("%9d bytes: wchmm->stend[node]\n", sizeof(WORD_ID) * wchmm->n);
01885 {
01886 int w,count;
01887 count = 0;
01888 for(w=0;w<wchmm->winfo->num;w++) {
01889 count += wchmm->winfo->wlen[w] * sizeof(int) + sizeof(int *);
01890 }
01891 printf("%9d bytes: wchmm->offset[w][]\n", count);
01892 }
01893 #ifdef MULTIPATH_VERSION
01894 j_printf("%9d bytes: wchmm->wordbegin[w]\n", wchmm->winfo->num * sizeof(int));
01895 #endif
01896 j_printf("%9d bytes: wchmm->wordend[w]\n", wchmm->winfo->num * sizeof(int));
01897 j_printf("%9d bytes: wchmm->startnode[]\n", wchmm->startnum * sizeof(int));
01898 #ifdef MULTIPATH_VERSION
01899 #ifdef CATEGORY_TREE
01900 j_printf("%9d bytes: wchmm->start2wid[]\n", wchmm->startnum * sizeof(WORD_ID));
01901 #endif
01902 #endif
01903 #ifdef UNIGRAM_FACTORING
01904 printf("%9d bytes: wchmm->start2isolate[]\n", wchmm->isolatenum * sizeof(int));
01905 #endif
01906 printf("%9d bytes: wchmm->wordend_a[]\n", wchmm->winfo->num * sizeof(LOGPROB));
01907 printf("%9d bytes: wchmm->outstyle[]\n", wchmm->n * sizeof(unsigned char));
01908 #ifndef CATEGORY_TREE
01909 printf("%9d bytes: wchmm->sclist[]\n", wchmm->scnum * sizeof(S_CELL *));
01910 printf("%9d bytes: wchmm->sclist2node[]\n", wchmm->scnum * sizeof(int));
01911 #ifdef UNIGRAM_FACTORING
01912 printf("%9d bytes: wchmm->fscore[]\n", wchmm->fsnum * sizeof(LOGPROB));
01913 #endif
01914 #endif
01915
01916 printf("under state[]:\n");
01917 {
01918 A_CELL *ac;
01919 int count,n;
01920 count = 0;
01921 for(n=0;n<wchmm->n;n++) {
01922 for(ac=wchmm->state[n].ac;ac;ac=ac->next) {
01923 count += sizeof(A_CELL);
01924 }
01925 }
01926 printf("\t%9d: ac\n", count);
01927 }
01928 #ifndef CATEGORY_TREE
01929 {
01930 S_CELL *sc;
01931 int count,n;
01932 count = 0;
01933 for(n=0;n<wchmm->n;n++) {
01934 for(sc=wchmm->state[n].sc;sc;sc=sc->next) {
01935 count += sizeof(S_CELL);
01936 }
01937 }
01938 printf("\t%9d: sc\n", count);
01939 }
01940 #endif
01941 #endif
01942 }
01943
01944
01957 void
01958 print_wchmm_info(WCHMM_INFO *wchmm)
01959 {
01960 int n,i, rootnum;
01961
01962 #ifdef MULTIPATH_VERSION
01963 rootnum = wchmm->startnum;
01964 #else
01965 #ifdef USE_NGRAM
01966 rootnum = wchmm->startnum + 1;
01967 #else
01968 rootnum = wchmm->startnum;
01969 #endif
01970 #endif
01971
01972 j_printf("Lexicon tree info:\n");
01973 j_printf("\t total node num = %6d\n", wchmm->n);
01974 #ifdef USE_NGRAM
01975 j_printf("\t root node num = %6d\n", rootnum);
01976 #ifdef NO_SEPARATE_SHORT_WORD
01977 #ifdef SEPARATE_BY_UNIGRAM
01978 j_printf("\t(%d hi-freq. words are separated from tree lexicon)\n", separated_word_count);
01979 #else
01980 j_printf(" (no words are separated from tree)\n");
01981 #endif
01982 #else
01983 j_printf(" (%d short words (<= %d phonemes) are separated from tree)\n", separated_word_count, SHORT_WORD_LEN);
01984 #endif
01985 #else
01986 j_printf("\t root node num = %6d\n", rootnum);
01987 #endif
01988 for(n=0,i=0;i<wchmm->n;i++) {
01989 if (wchmm->stend[i] != WORD_INVALID) n++;
01990 }
01991 j_printf("\t leaf node num = %6d\n", n);
01992 #ifndef CATEGORY_TREE
01993 j_printf("\t fact. node num = %6d\n", wchmm->scnum - 1);
01994 #endif
01995 }