Julius 4.1.5
|
00001 00037 /* 00038 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University 00039 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology 00040 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology 00041 * All rights reserved 00042 */ 00043 00044 /* wchmm = word conjunction HMM = lexicon tree */ 00045 00046 #include <julius/julius.h> 00047 00048 00049 #define WCHMM_SIZE_CHECK ///< If defined, do wchmm size estimation (for debug only) 00050 00051 /**************************************************************/ 00052 /*********** Initialization of tree lexicon *******************/ 00053 /**************************************************************/ 00054 00069 WCHMM_INFO * 00070 wchmm_new() 00071 { 00072 WCHMM_INFO *w; 00073 w = (WCHMM_INFO *)mymalloc(sizeof(WCHMM_INFO)); 00074 w->lmtype = LM_UNDEF; 00075 w->lmvar = LM_UNDEF; 00076 w->ngram = NULL; 00077 w->dfa = NULL; 00078 w->winfo = NULL; 00079 w->malloc_root = NULL; 00080 #ifdef PASS1_IWCD 00081 w->lcdset_category_root = NULL; 00082 w->lcdset_mroot = NULL; 00083 #endif /* PASS1_IWCD */ 00084 w->wrk.out_from_len = 0; 00085 /* reset user function entry point */ 00086 w->uni_prob_user = NULL; 00087 w->bi_prob_user = NULL; 00088 return w; 00089 } 00090 00103 static void 00104 wchmm_init(WCHMM_INFO *wchmm) 00105 { 00106 /* the resulting tree size is typically half of total state num */ 00107 wchmm->maxwcn = wchmm->winfo->totalstatenum / 2; 00108 wchmm->state = (WCHMM_STATE *)mymalloc(sizeof(WCHMM_STATE)*wchmm->maxwcn); 00109 wchmm->self_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->maxwcn); 00110 wchmm->next_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->maxwcn); 00111 wchmm->ac = (A_CELL2 **)mymalloc(sizeof(A_CELL2 *)*wchmm->maxwcn); 00112 wchmm->stend = (WORD_ID *)mymalloc(sizeof(WORD_ID)*wchmm->maxwcn); 00113 wchmm->offset = (int **)mymalloc(sizeof(int *)*wchmm->winfo->num); 00114 wchmm->wordend = (int *)mymalloc(sizeof(int)*wchmm->winfo->num); 00115 wchmm->maxstartnum = STARTNODE_STEP; 00116 wchmm->startnode = (int *)mymalloc(sizeof(int)*STARTNODE_STEP); 00117 wchmm->startnum = 0; 00118 if (wchmm->category_tree) { 00119 wchmm->start2wid = (WORD_ID *)mymalloc(sizeof(WORD_ID)*STARTNODE_STEP); 00120 } 00121 if (wchmm->hmminfo->multipath) { 00122 wchmm->wordbegin = (int *)mymalloc(sizeof(int)*wchmm->winfo->num); 00123 wchmm->wrk.out_from = (int *)mymalloc(sizeof(int) * wchmm->winfo->maxwn); 00124 wchmm->wrk.out_from_next = (int *)mymalloc(sizeof(int) * wchmm->winfo->maxwn); 00125 wchmm->wrk.out_a = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->winfo->maxwn); 00126 wchmm->wrk.out_a_next = (LOGPROB *)mymalloc(sizeof(LOGPROB) * wchmm->winfo->maxwn); 00127 wchmm->wrk.out_from_len = wchmm->winfo->maxwn; 00128 } else { 00129 wchmm->wordend_a = (LOGPROB *)mymalloc(sizeof(LOGPROB)*wchmm->winfo->num); 00130 } 00131 #ifdef PASS1_IWCD 00132 wchmm->outstyle = (unsigned char *)mymalloc(sizeof(unsigned char)*wchmm->maxwcn); 00133 #endif 00134 #ifdef UNIGRAM_FACTORING 00135 wchmm->start2isolate = NULL; 00136 wchmm->isolatenum = 0; 00137 #endif 00138 if (!wchmm->category_tree) { 00139 wchmm->sclist = NULL; 00140 wchmm->sclist2node = NULL; 00141 #ifdef UNIGRAM_FACTORING 00142 wchmm->fscore = NULL; 00143 #endif 00144 } 00145 00146 wchmm->n = 0; 00147 } 00148 00161 static void 00162 wchmm_extend(WCHMM_INFO *wchmm) 00163 { 00164 /* practical value! */ 00165 wchmm->maxwcn += wchmm->winfo->totalstatenum / 6; 00166 wchmm->state = (WCHMM_STATE *)myrealloc(wchmm->state, sizeof(WCHMM_STATE)*wchmm->maxwcn); 00167 wchmm->self_a = (LOGPROB *)myrealloc(wchmm->self_a, sizeof(LOGPROB)*wchmm->maxwcn); 00168 wchmm->next_a = (LOGPROB *)myrealloc(wchmm->next_a, sizeof(LOGPROB)*wchmm->maxwcn); 00169 wchmm->ac = (A_CELL2 **)myrealloc(wchmm->ac, sizeof(A_CELL2 *)*wchmm->maxwcn); 00170 wchmm->stend = (WORD_ID *)myrealloc(wchmm->stend, sizeof(WORD_ID)*wchmm->maxwcn); 00171 #ifdef PASS1_IWCD 00172 wchmm->outstyle = (unsigned char *)myrealloc(wchmm->outstyle, sizeof(unsigned char)*wchmm->maxwcn); 00173 #endif 00174 } 00175 00188 static void 00189 wchmm_extend_startnode(WCHMM_INFO *wchmm) 00190 { 00191 wchmm->maxstartnum += STARTNODE_STEP; 00192 wchmm->startnode = (int *)myrealloc(wchmm->startnode, sizeof(int) * wchmm->maxstartnum); 00193 if (wchmm->category_tree) { 00194 wchmm->start2wid = (WORD_ID *)myrealloc(wchmm->start2wid, sizeof(WORD_ID) * wchmm->maxstartnum); 00195 } 00196 } 00197 00212 void 00213 wchmm_free(WCHMM_INFO *w) 00214 { 00215 S_CELL *sc, *sctmp; 00216 int i; 00217 /* wchmm->state[i].ac malloced by mybmalloc2() */ 00218 /* wchmm->offset[][] malloced by mybmalloc2() */ 00219 #ifdef PASS1_IWCD 00220 /* LRC_INFO, RC_INFO in wchmm->state[i].outsty malloced by mybmalloc2() */ 00221 #endif 00222 /* they all will be freed by a single mybfree2() call */ 00223 mybfree2(&(w->malloc_root)); 00224 if (!w->category_tree) { 00225 if (w->sclist != NULL) { 00226 for(i=1;i<w->scnum;i++) { 00227 sc = w->sclist[i]; 00228 while(sc) { 00229 sctmp = sc->next; 00230 free(sc); 00231 sc = sctmp; 00232 } 00233 } 00234 free(w->sclist); 00235 } 00236 if (w->sclist2node != NULL) free(w->sclist2node); 00237 #ifdef UNIGRAM_FACTORING 00238 if (w->fscore != NULL) free(w->fscore); 00239 #endif 00240 } 00241 #ifdef UNIGRAM_FACTORING 00242 if (w->start2isolate != NULL) free(w->start2isolate); 00243 #endif 00244 #ifdef PASS1_IWCD 00245 free(w->outstyle); 00246 #endif 00247 if (w->hmminfo->multipath) { 00248 free(w->wordbegin); 00249 } else { 00250 free(w->wordend_a); 00251 } 00252 if (w->category_tree) free(w->start2wid); 00253 free(w->startnode); 00254 free(w->wordend); 00255 free(w->offset); 00256 free(w->stend); 00257 free(w->ac); 00258 free(w->next_a); 00259 free(w->self_a); 00260 free(w->state); 00261 #ifdef PASS1_IWCD 00262 if (w->category_tree) lcdset_remove_with_category_all(w); 00263 #endif /* PASS1_IWCD */ 00264 if (w->wrk.out_from_len != 0) { 00265 free(w->wrk.out_from); 00266 free(w->wrk.out_from_next); 00267 free(w->wrk.out_a); 00268 free(w->wrk.out_a_next); 00269 w->wrk.out_from_len = 0; 00270 } 00271 free(w); 00272 } 00273 00274 00275 /**************************************************************/ 00276 /*********** Word sort functions for tree construction ********/ 00277 /**************************************************************/ 00278 00297 static int 00298 compare_wseq(WORD_ID *widx1, WORD_ID *widx2, WORD_INFO *winfo) 00299 { 00300 int len1, len2, n; 00301 int p=0; 00302 00303 len1 = winfo->wlen[*widx1]; 00304 len2 = winfo->wlen[*widx2]; 00305 00306 n=0; 00307 /* while (n < len1 && n < len2 && (p = (int)winfo->wseq[*widx1][n] - (int)winfo->wseq[*widx2][n]) == 0 ) n++;*/ 00308 while (n < len1 && n < len2 && (p = strcmp((winfo->wseq[*widx1][n])->name, (winfo->wseq[*widx2][n])->name)) == 0 ) n++; 00309 if (n < len1) { 00310 if (n < len2) { 00311 /* differ */ 00312 return(p); 00313 } else { 00314 /* 2 is part of 1 */ 00315 return(1); 00316 } 00317 } else { 00318 if (n < len2) { 00319 /* 1 is part of 2 */ 00320 return(-1); 00321 } else { 00322 /* same */ 00323 return(0); 00324 } 00325 } 00326 } 00327 00346 static void 00347 wchmm_sort_idx_by_wseq(WORD_INFO *winfo, WORD_ID *windex, WORD_ID bgn, WORD_ID len) 00348 { 00349 qsort_reentrant(&(windex[bgn]), len, sizeof(WORD_ID), (int (*)(const void *, const void *, void *))compare_wseq, winfo); 00350 } 00351 00370 static int 00371 compare_category(WORD_ID *widx1, WORD_ID *widx2, WORD_INFO *winfo) 00372 { 00373 int c1,c2; 00374 c1 = winfo->wton[*widx1]; 00375 c2 = winfo->wton[*widx2]; 00376 return(c1 - c2); 00377 } 00378 00395 static void 00396 wchmm_sort_idx_by_category(WORD_INFO *winfo, WORD_ID *windex, WORD_ID len) 00397 { 00398 qsort_reentrant(windex, len, sizeof(WORD_ID), (int (*)(const void *, const void *, void *))compare_category, winfo); 00399 } 00400 00401 00402 /**********************************************************************/ 00403 /************** Subroutines to link part of words ********************/ 00404 /**********************************************************************/ 00405 00427 static int 00428 wchmm_check_match(WORD_INFO *winfo, int i, int j) 00429 { 00430 int k,tmplen; 00431 00432 for (tmplen=0,k=0;k<winfo->wlen[i];k++) { 00433 if (k > winfo->wlen[j]-1) 00434 break; 00435 if (! (strmatch(winfo->wseq[i][k]->name, winfo->wseq[j][k]->name))) 00436 break; 00437 tmplen++; 00438 } 00439 return(tmplen); 00440 } 00441 00454 static void 00455 acc_init(WCHMM_INFO *wchmm, int node) 00456 { 00457 wchmm->self_a[node] = LOG_ZERO; 00458 wchmm->next_a[node] = LOG_ZERO; 00459 wchmm->ac[node] = NULL; 00460 } 00461 00478 static void 00479 add_ac(WCHMM_INFO *wchmm, int node, LOGPROB a, int arc) 00480 { 00481 A_CELL2 *ac2; 00482 00483 for(ac2=wchmm->ac[node];ac2;ac2=ac2->next) { 00484 if (ac2->n < A_CELL2_ALLOC_STEP) break; 00485 } 00486 if (ac2 == NULL) { 00487 ac2 = (A_CELL2 *)mybmalloc2(sizeof(A_CELL2), &(wchmm->malloc_root)); 00488 ac2->n = 0; 00489 ac2->next = wchmm->ac[node]; 00490 wchmm->ac[node] = ac2; 00491 } 00492 ac2->arc[ac2->n] = arc; 00493 ac2->a[ac2->n] = a; 00494 ac2->n++; 00495 } 00496 00515 static void 00516 add_wacc(WCHMM_INFO *wchmm, int node, LOGPROB a, int arc) 00517 { 00518 if (arc == node) { 00519 wchmm->self_a[node] = a; 00520 } else if (arc == node + 1) { 00521 wchmm->next_a[node] = a; 00522 } else { 00523 add_ac(wchmm, node, a, arc); 00524 } 00525 } 00526 00553 static void 00554 get_outtrans_list(WCHMM_INFO *wchmm, WORD_ID w, int pos, int *node, LOGPROB *a, int *num, int maxnum, boolean insert_sp) 00555 { 00556 HMM_Logical *ltmp; 00557 int states; 00558 int k; 00559 LOGPROB prob; 00560 int oldnum; 00561 00562 if (pos < 0) { 00563 00564 /* set the word-beginning node, and return */ 00565 node[*num] = wchmm->wordbegin[w]; 00566 a[*num] = 0.0; 00567 (*num)++; 00568 00569 } else { 00570 00571 ltmp = wchmm->winfo->wseq[w][pos]; 00572 states = hmm_logical_state_num(ltmp); 00573 00574 /* check initial->final state */ 00575 if ((hmm_logical_trans(ltmp))->a[0][states-1] != LOG_ZERO) { 00576 /* recursive call for previous phone */ 00577 oldnum = *num; 00578 get_outtrans_list(wchmm, w, pos-1, node, a, num, maxnum, FALSE); /* previous phone should not be an sp-inserted phone */ 00579 /* add probability of the skip transition to all the previous ones */ 00580 for(k=oldnum;k<*num;k++) { 00581 a[k] += (hmm_logical_trans(ltmp))->a[0][states-1]; 00582 } 00583 } 00584 /* add to list the arcs from output state to final state */ 00585 for (k = 1; k < states - 1; k++) { 00586 prob = (hmm_logical_trans(ltmp))->a[k][states-1]; 00587 if (prob != LOG_ZERO) { 00588 if (*num >= maxnum) { 00589 j_internal_error("get_outtrans_list: maximum outtrans list num exceeded %d\n", maxnum); 00590 } 00591 node[*num] = wchmm->offset[w][pos] + k - 1; 00592 a[*num] = prob; 00593 (*num)++; 00594 } 00595 } 00596 /* for -iwsp, add outgoing arc from the tail sp model 00597 only if need_sp == TRUE. 00598 need_sp should be TRUE only when the connecting [pos] phone is also an end phone of the to-be-added word (i.e. homophone word) 00599 */ 00600 /* */ 00601 if (insert_sp) { 00602 /* consider sp */ 00603 for (k = 1; k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) { 00604 prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[k][hmm_logical_state_num(wchmm->hmminfo->sp)-1]; 00605 if (prob != LOG_ZERO) { 00606 if (*num >= maxnum) { 00607 j_internal_error("get_outtrans_list: maximum outtrans list num exceeded %d\n", maxnum); 00608 } 00609 node[*num] = wchmm->offset[w][pos] + (states - 2) + k - 1; 00610 a[*num] = prob; 00611 (*num)++; 00612 } 00613 } 00614 } 00615 } 00616 /*printf(" %d(%s)-%d:\"%s\", num=%d\n", w, wchmm->winfo->woutput[w], pos, 00617 (pos < 0) ? "BGN" : wchmm->winfo->wseq[w][pos]->name, *num);*/ 00618 return; 00619 } 00620 00639 static void 00640 wchmm_link_hmm(WCHMM_INFO *wchmm, int from_node, int to_node, HTK_HMM_Trans *tinfo) 00641 { 00642 A_CELL2 *actmp; 00643 LOGPROB a; 00644 int i, j; 00645 boolean tflag; 00646 00647 /* get transition probability to outer state in tinfo */ 00648 for(i = tinfo->statenum - 2; i >= 0; i--) { 00649 if ((a = tinfo->a[i][tinfo->statenum-1]) != LOG_ZERO) { /* found */ 00650 /* check if the arc already exist */ 00651 tflag = FALSE; 00652 if (to_node == from_node && wchmm->self_a[from_node] == a) { 00653 tflag = TRUE; 00654 } else if (to_node == from_node + 1 && wchmm->next_a[from_node] == a) { 00655 tflag = TRUE; 00656 } else { 00657 for (actmp = wchmm->ac[from_node]; actmp; actmp = actmp->next) { 00658 for(j=0;j<actmp->n;j++) { 00659 if (actmp->arc[j] == to_node && actmp->a[j] == a) { 00660 tflag = TRUE; 00661 break; 00662 } 00663 } 00664 if (tflag == TRUE) break; 00665 } 00666 } 00667 if (tflag) break; 00668 /* add the arc to wchmm */ 00669 add_wacc(wchmm, from_node, a, to_node); 00670 return; /* exit function here */ 00671 } 00672 } 00673 j_internal_error("wchmm_link_hmm: No arc to endstate?\n"); 00674 } 00675 00696 static void 00697 wchmm_link_subword(WCHMM_INFO *wchmm, int from_word, int from_seq, int to_word, int to_seq) 00698 { 00699 HMM_Logical *last; 00700 int lastp; 00701 00702 last = wchmm->winfo->wseq[from_word][from_seq]; 00703 lastp = wchmm->offset[from_word][from_seq] + hmm_logical_state_num(last)-2 -1; 00704 wchmm_link_hmm(wchmm, lastp, wchmm->offset[to_word][to_seq], 00705 hmm_logical_trans(last)); 00706 } 00707 00708 /**************************************************************/ 00709 /******** homophone processing: duplicating leaf nodes ********/ 00710 /**************************************************************/ 00711 00751 static void 00752 wchmm_duplicate_state(WCHMM_INFO *wchmm, int node, int word) /* source node, new word */ 00753 { 00754 int j, n; 00755 int n_src, n_prev; 00756 A_CELL2 *ac; 00757 HMM_Logical *lastphone; 00758 00759 /* 1 state will newly created: expand tree if needed */ 00760 if (wchmm->n + 1 >= wchmm->maxwcn) { 00761 wchmm_extend(wchmm); 00762 } 00763 /* n: the target new node to which 'node' is copied */ 00764 n = wchmm->n; 00765 00766 n_src = node; 00767 00768 /* copy output probability info */ 00769 #ifdef PASS1_IWCD 00770 { 00771 RC_INFO *rcnew; 00772 LRC_INFO *lrcnew; 00773 wchmm->outstyle[n] = wchmm->outstyle[n_src]; 00774 if (wchmm->outstyle[n] == AS_RSET) { 00775 /* duplicate RC_INFO because it has its own cache */ 00776 rcnew = (RC_INFO *)mybmalloc2(sizeof(RC_INFO), &(wchmm->malloc_root)); 00777 memcpy(rcnew, wchmm->state[n_src].out.rset, sizeof(RC_INFO)); 00778 wchmm->state[n].out.rset = rcnew; 00779 } else if (wchmm->outstyle[n] == AS_LRSET) { 00780 /* duplicate LRC_INFO because it has its own cache */ 00781 lrcnew = (LRC_INFO *)mybmalloc2(sizeof(LRC_INFO), &(wchmm->malloc_root)); 00782 memcpy(lrcnew, wchmm->state[n_src].out.lrset, sizeof(LRC_INFO)); 00783 wchmm->state[n].out.lrset = lrcnew; 00784 } else { 00785 /* share same info, simply copy the pointer */ 00786 memcpy(&(wchmm->state[n].out), &(wchmm->state[n_src].out), sizeof(ACOUSTIC_SPEC)); 00787 } 00788 } 00789 #else /* ~PASS1_IWCD */ 00790 memcpy(&(wchmm->state[n].out), &(wchmm->state[n_src].out), sizeof(HTK_HMM_State *)); 00791 #endif 00792 00793 lastphone = wchmm->winfo->wseq[word][wchmm->winfo->wlen[word]-1]; 00794 acc_init(wchmm, n); 00795 00796 /* add self transition arc */ 00797 wchmm->self_a[n] = wchmm->self_a[n_src]; 00798 00799 /* copy transition arcs whose destination is the source node to new node */ 00800 if (hmm_logical_state_num(lastphone) == 3) { /* = 1 state */ 00801 /* phone with only 1 state should be treated carefully */ 00802 if (wchmm->winfo->wlen[word] == 1) { /* word consists of only this phone */ 00803 /* no arcs need to be copied: this is also a start node of a word */ 00804 wchmm->offset[word][0] = n; 00805 /* index the new word-beginning node as startnode (old ststart) */ 00806 if (wchmm->lmtype != LM_PROB || word != wchmm->winfo->head_silwid) { 00807 wchmm->startnode[wchmm->startnum] = n; 00808 if (wchmm->category_tree) wchmm->start2wid[wchmm->startnum] = word; 00809 /* expand data area if necessary */ 00810 if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm); 00811 } 00812 } else { 00813 /* copy arcs from the last state of the previous phone */ 00814 n_prev = wchmm->offset[word][wchmm->winfo->wlen[word]-2] 00815 + hmm_logical_state_num(wchmm->winfo->wseq[word][wchmm->winfo->wlen[word]-2]) - 3; 00816 if(n_src == n_prev + 1) { 00817 add_wacc(wchmm, n_prev, wchmm->next_a[n_prev], n); 00818 } else { 00819 for(ac=wchmm->ac[n_prev];ac;ac=ac->next) { 00820 for(j=0;j<ac->n;j++) { 00821 if (ac->arc[j] == n_src) { 00822 add_wacc(wchmm, n_prev, ac->a[j], n); 00823 } 00824 } 00825 } 00826 } 00827 /* also update the last offset (== wordend in this case) */ 00828 wchmm->offset[word][wchmm->winfo->wlen[word]-1] = n; 00829 } 00830 } else { /* phone with more than 2 states */ 00831 /* copy arcs from/to the source node to new node */ 00832 for (n_prev = wchmm->offset[word][wchmm->winfo->wlen[word]-1]; n_prev < n_src; n_prev++) { 00833 if (n_src == n_prev + 1) { 00834 add_wacc(wchmm, n_prev, wchmm->next_a[n_prev], n); 00835 } else { 00836 for(ac=wchmm->ac[n_prev];ac;ac=ac->next) { 00837 for(j=0;j<ac->n;j++) { 00838 if (ac->arc[j] == n_src) { 00839 add_wacc(wchmm, n_prev, ac->a[j], n); 00840 } 00841 } 00842 } 00843 } 00844 if (n_prev == n_src + 1) { 00845 add_wacc(wchmm, n, wchmm->next_a[n_src], n_prev); 00846 } else { 00847 for(ac=wchmm->ac[n_src];ac;ac=ac->next) { 00848 for(j=0;j<ac->n;j++) { 00849 if (ac->arc[j] == n_prev) { 00850 add_wacc(wchmm, n, ac->a[j], n_prev); 00851 } 00852 } 00853 } 00854 } 00855 } 00856 } 00857 00858 /* map word <-> node */ 00859 wchmm->stend[n] = word; /* 'n' is an end node of word 'word' */ 00860 wchmm->wordend[word] = n; /* the word end node of 'word' is 'n' */ 00861 00862 /* new state has been created: increment the size */ 00863 wchmm->n++; 00864 00865 } 00866 00881 static int 00882 wchmm_duplicate_leafnode(WCHMM_INFO *wchmm) 00883 { 00884 int w, nlast, n, narc, narc_model; 00885 boolean *dupw; /* node marker */ 00886 A_CELL2 *actmp; 00887 int dupcount; 00888 00889 dupcount = 0; 00890 00891 nlast = wchmm->n; 00892 dupw = (boolean *)mymalloc(sizeof(boolean) * nlast); 00893 for(n=0;n<nlast;n++) dupw[n] = FALSE; /* initialize all marker */ 00894 00895 for (w=0;w<wchmm->winfo->num;w++) { 00896 n = wchmm->wordend[w]; 00897 if (dupw[n]) { /* if already marked (2nd time or later */ 00898 wchmm_duplicate_state(wchmm, n, w); dupcount++; /* duplicate */ 00899 } else { /* if not marked yet (1st time) */ 00900 /* try to find an arc outside the word */ 00901 { 00902 /* count number of model-internal arc from the last state */ 00903 HMM_Logical *lastphone; 00904 HTK_HMM_Trans *tinfo; 00905 int laststate, i; 00906 lastphone = wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1]; 00907 laststate = hmm_logical_state_num(lastphone) - 2; 00908 tinfo = hmm_logical_trans(lastphone); 00909 narc_model=0; 00910 for(i=1;i<hmm_logical_state_num(lastphone)-1;i++) { 00911 if (tinfo->a[laststate][i] != LOG_ZERO) narc_model++; 00912 } 00913 /* count number of actual arc from the last state in the tree */ 00914 narc = 0; 00915 if (wchmm->self_a[n] != LOG_ZERO) narc++; 00916 if (wchmm->next_a[n] != LOG_ZERO) narc++; 00917 for(actmp=wchmm->ac[n];actmp;actmp=actmp->next) narc += actmp->n; 00918 } 00919 /* if both number does not match, it means it is not a single word tail */ 00920 if (narc_model != narc) { 00921 /* word 'w' is embedded as part of other words at this node 'n' */ 00922 /* duplicate this node now */ 00923 wchmm_duplicate_state(wchmm, n, w); dupcount++; 00924 /* as new node has been assigned as word end node of word 'w', 00925 reset this source node as it is not the word end node */ 00926 wchmm->stend[n] = WORD_INVALID; 00927 } else { 00928 /* no arc to other node found, it means it is a single word tail */ 00929 /* as this is first time, only make sure that this node is word end of [w] */ 00930 wchmm->stend[n] = w; 00931 } 00932 /* mark node 'n' */ 00933 dupw[n] = TRUE; 00934 } 00935 } 00936 free(dupw); 00937 00938 return(dupcount); 00939 } 00940 00941 /**************************************************************/ 00942 /*************** add a word to wchmm lexicon tree *************/ 00943 /**************************************************************/ 00944 00969 static boolean 00970 wchmm_add_word(WCHMM_INFO *wchmm, int word, int matchlen, int matchword, boolean enable_iwsp) 00971 { 00972 boolean ok_p; 00973 int j,k,n; 00974 int add_head, add_tail, add_to; 00975 int word_len, matchword_len; 00976 HMM_Logical *ltmp; 00977 int ato; 00978 LOGPROB prob; 00979 int ntmp; 00980 int ltmp_state_num; 00981 #ifdef PASS1_IWCD 00982 CD_Set *lcd = NULL; 00983 #endif 00984 int *out_from; 00985 int *out_from_next; 00986 LOGPROB *out_a; 00987 LOGPROB *out_a_next; 00988 00989 00990 /* for multipath handling */ 00991 int out_num_prev, out_num_next; 00992 int kkk; 00993 00994 ok_p = TRUE; 00995 if (wchmm->hmminfo->multipath) { 00996 out_from = wchmm->wrk.out_from; 00997 out_from_next = wchmm->wrk.out_from_next; 00998 out_a = wchmm->wrk.out_a; 00999 out_a_next = wchmm->wrk.out_a_next; 01000 } 01001 01002 /* 01003 * if (matchlen > 0) { 01004 * printf("--\n"); 01005 * put_voca(stdout, wchmm->winfo, word); 01006 * put_voca(stdout, wchmm->winfo, matchword); 01007 * printf("matchlen=%d\n", matchlen); 01008 * } 01009 */ 01010 01011 /* variable abbreviations */ 01012 n = wchmm->n; 01013 word_len = wchmm->winfo->wlen[word]; 01014 matchword_len = wchmm->winfo->wlen[matchword]; 01015 01016 /* malloc phone offset area */ 01017 wchmm->offset[word] = (int *)mybmalloc2(sizeof(int)*word_len, &(wchmm->malloc_root)); 01018 01019 /* allocate unshared (new) part */ 01020 add_head = matchlen; 01021 add_tail = word_len - 1; 01022 add_to = matchlen - 1; 01023 01024 if (wchmm->hmminfo->multipath) { 01025 /* make word-beginning node if needed */ 01026 if (matchlen == 0) { 01027 /* create word-beginning node */ 01028 wchmm->wordbegin[word] = n; 01029 wchmm->stend[n] = WORD_INVALID; 01030 acc_init(wchmm, n); 01031 wchmm->state[n].out.state = NULL; 01032 /* index the new word-beginning node as startnode (old ststart) */ 01033 wchmm->startnode[wchmm->startnum] = n; 01034 if (wchmm->category_tree) wchmm->start2wid[wchmm->startnum] = word; 01035 /* expand data area if necessary */ 01036 if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm); 01037 if (++n >= wchmm->maxwcn) wchmm_extend(wchmm); 01038 } else { 01039 wchmm->wordbegin[word] = wchmm->wordbegin[matchword]; 01040 } 01041 01042 /* now n is at beginning of output state */ 01043 01044 /* store the initial outgoing arcs to out_from[] and out_a[] */ 01045 out_num_prev = 0; 01046 if (matchlen == 0) { 01047 /* set the word-beginning node */ 01048 out_from[0] = wchmm->wordbegin[word]; 01049 out_a[0] = 0.0; 01050 out_num_prev = 1; 01051 } else { 01052 /*printf("%d(%s)\n", word, wchmm->winfo->woutput[word]);*/ 01053 /* on -iwsp, trailing sp is needed only when no phone will be created */ 01054 get_outtrans_list(wchmm, matchword, add_to, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, (enable_iwsp && add_tail - add_head + 1 <= 0) ? TRUE : FALSE); 01055 /*printf("NUM=%d\n", out_num_prev);*/ 01056 } 01057 } else { /* end of multipath block */ 01058 if (matchlen == 0) { 01059 if (wchmm->lmtype != LM_PROB || word != wchmm->winfo->head_silwid) { 01060 /* index the new word-beginning node as startnode (old ststart) */ 01061 wchmm->startnode[wchmm->startnum] = n; 01062 if (wchmm->category_tree) wchmm->start2wid[wchmm->startnum] = word; 01063 /* expand data area if necessary */ 01064 if (++wchmm->startnum >= wchmm->maxstartnum) wchmm_extend_startnode(wchmm); 01065 } 01066 } 01067 } 01068 01069 if (add_tail - add_head + 1 > 0) { /* there are new phones to be created */ 01070 ntmp = n; 01071 for (j=add_head; j <= add_tail; j++) { /* for each new phones */ 01072 ltmp = wchmm->winfo->wseq[word][j]; 01073 ltmp_state_num = hmm_logical_state_num(ltmp); 01074 #ifdef PASS1_IWCD 01075 if (wchmm->ccd_flag) { 01076 /* in the triphone lexicon tree, the last phone of a word has 01077 left-context cdset */ 01078 if (wchmm->winfo->wlen[word] > 1 && j == wchmm->winfo->wlen[word] - 1) { 01079 if (wchmm->category_tree) { 01080 #ifdef USE_OLD_IWCD 01081 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name); 01082 #else 01083 lcd = lcdset_lookup_with_category(wchmm, ltmp, wchmm->winfo->wton[word]); 01084 if (lcd == NULL) { 01085 /* no category-aware cdset found. This is case when no word 01086 can follow this word grammatically. 01087 so fallback to normal state */ 01088 jlog("WARNING: wchmm: no lcdset found for [%s::%04d], fallback to [%s]\n", ltmp->name, wchmm->winfo->wton[word], ltmp->name); 01089 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name); 01090 } 01091 #endif 01092 } else { 01093 lcd = lcdset_lookup_by_hmmname(wchmm->hmminfo, ltmp->name); 01094 } 01095 if (lcd == NULL) { 01096 jlog("ERROR: wchmm: at word #%d: no lcdset found for [%s]\n", word, ltmp->name); 01097 ok_p = FALSE; 01098 } 01099 } 01100 } 01101 #endif /* PASS1_IWCD */ 01102 for (k = 1; k < ltmp_state_num - 1; k++) { /* for each state in the phone */ 01103 /* set state output prob info */ 01104 #ifdef PASS1_IWCD 01105 if (wchmm->ccd_flag) { 01106 /* output info of triphones needs special handling */ 01107 if (wchmm->winfo->wlen[word] == 1) { /* word with only 1 phone */ 01108 wchmm->outstyle[ntmp] = AS_LRSET; 01109 wchmm->state[ntmp].out.lrset = (LRC_INFO *)mybmalloc2(sizeof(LRC_INFO), &(wchmm->malloc_root)); 01110 (wchmm->state[ntmp].out.lrset)->hmm = ltmp; 01111 (wchmm->state[ntmp].out.lrset)->state_loc = k; 01112 if (wchmm->category_tree) { 01113 (wchmm->state[ntmp].out.lrset)->category = wchmm->winfo->wton[word]; 01114 } 01115 } else if (j == 0) { /* head phone of a word */ 01116 wchmm->outstyle[ntmp] = AS_RSET; 01117 wchmm->state[ntmp].out.rset = (RC_INFO *)mybmalloc2(sizeof(RC_INFO), &(wchmm->malloc_root)); 01118 (wchmm->state[ntmp].out.rset)->hmm = ltmp; 01119 (wchmm->state[ntmp].out.rset)->state_loc = k; 01120 } else if (j == wchmm->winfo->wlen[word] - 1) { /* last phone of a word */ 01121 wchmm->outstyle[ntmp] = AS_LSET; 01122 wchmm->state[ntmp].out.lset = &(lcd->stateset[k]); 01123 } else { 01124 wchmm->outstyle[ntmp] = AS_STATE; 01125 if (ltmp->is_pseudo) { 01126 jlog("WARNING: wchmm: word-internal phone should not be pseudo\n"); 01127 put_voca(stdout, wchmm->winfo, word); 01128 ok_p = FALSE; 01129 } 01130 wchmm->state[ntmp].out.state = ltmp->body.defined->s[k]; 01131 } 01132 } else { 01133 /* monophone */ 01134 if (ltmp->is_pseudo) { 01135 j_internal_error("wchmm_add_word: CDSET phoneme exist in monophone?\n"); 01136 put_voca(stdout, wchmm->winfo, word); 01137 ok_p = FALSE; 01138 } 01139 wchmm->outstyle[ntmp] = AS_STATE; 01140 wchmm->state[ntmp].out.state = ltmp->body.defined->s[k]; 01141 } 01142 #else /* ~PASS1_IWCD */ 01143 if (ltmp->is_pseudo) { 01144 j_internal_error("wchmm_add_word: CDSET phoneme exist in monophone?\n"); 01145 put_voca(stdout, wchmm->winfo, word); 01146 ok_p = FALSE; 01147 } 01148 wchmm->state[ntmp].out = ltmp->body.defined->s[k]; 01149 #endif /* PASS1_IWCD */ 01150 01151 /* initialize other info */ 01152 acc_init(wchmm, ntmp); 01153 wchmm->stend[ntmp] = WORD_INVALID; 01154 if (! wchmm->hmminfo->multipath) { 01155 /* make transition arc from HMM transition info */ 01156 for (ato = 1; ato < ltmp_state_num; ato++) { 01157 prob = (hmm_logical_trans(ltmp))->a[k][ato]; 01158 if (prob != LOG_ZERO) { 01159 if (j == add_tail && k == ltmp_state_num - 2 && ato == ltmp_state_num - 1) { 01160 /* arc outside new part will be handled later */ 01161 } else { 01162 add_wacc(wchmm, ntmp, prob, ntmp + ato - k); 01163 } 01164 } 01165 } 01166 } 01167 01168 ntmp++; 01169 /* expand wchmm if neccesary */ 01170 if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm); 01171 } /* end of state loop */ 01172 } /* end of phone loop */ 01173 01174 if (wchmm->hmminfo->multipath) { 01175 01176 /* On multipath version, the skip transition should be handled! */ 01177 01178 /* make transition arc from HMM transition info */ 01179 ntmp = n; 01180 for (j = add_head; j <= add_tail; j++) { 01181 ltmp = wchmm->winfo->wseq[word][j]; 01182 ltmp_state_num = hmm_logical_state_num(ltmp); 01183 out_num_next = 0; 01184 /* arc from initial state ... need arc expansion from precious phone */ 01185 for (ato = 1; ato < ltmp_state_num; ato++) { 01186 prob = (hmm_logical_trans(ltmp))->a[0][ato]; 01187 if (prob != LOG_ZERO) { 01188 /* expand arc from previous HMM */ 01189 if (ato == ltmp_state_num - 1) { 01190 /* to final state ... just register states for next expansion */ 01191 for(kkk=0; kkk<out_num_prev; kkk++) { 01192 out_from_next[out_num_next] = out_from[kkk]; 01193 out_a_next[out_num_next] = out_a[kkk] + prob; 01194 out_num_next++; 01195 } 01196 } else { 01197 for(kkk=0; kkk<out_num_prev; kkk++) { 01198 add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1); 01199 } 01200 } 01201 } 01202 } /* end of state loop */ 01203 /* from outprob state */ 01204 for(k = 1; k < ltmp_state_num - 1; k++) { 01205 for (ato = 1; ato < ltmp_state_num; ato++) { 01206 prob = (hmm_logical_trans(ltmp))->a[k][ato]; 01207 if (prob != LOG_ZERO) { 01208 if (ato == ltmp_state_num - 1) { 01209 /* to final state ... register states for next expansion */ 01210 out_from_next[out_num_next] = ntmp; 01211 out_a_next[out_num_next] = prob; 01212 out_num_next++; 01213 } else { 01214 add_wacc(wchmm, ntmp, prob, ntmp + ato - k); 01215 } 01216 } 01217 } 01218 ntmp++; 01219 } /* end of state loop */ 01220 /* swap out list for next phone */ 01221 for(kkk=0;kkk<out_num_next;kkk++) { 01222 out_from[kkk] = out_from_next[kkk]; 01223 out_a[kkk] = out_a_next[kkk]; 01224 } 01225 out_num_prev = out_num_next; 01226 } /* end of phone loop */ 01227 } /* end of multipath block */ 01228 01229 } /* new phone node creation loop for this word */ 01230 01231 01232 /*************************************/ 01233 /* Short Pause appending (multipath) */ 01234 /*************************************/ 01235 01236 /* if -iwsp, add noise model to the end of word at ntmp */ 01237 if (wchmm->hmminfo->multipath && enable_iwsp && add_tail - add_head + 1 > 0) { /* there are new phones to be created */ 01238 int ntmp_bak; 01239 01240 /* set short pause state info */ 01241 ntmp_bak = ntmp; 01242 if (wchmm->hmminfo->sp->is_pseudo) { 01243 for(k = 1;k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) { 01244 wchmm->outstyle[ntmp] = AS_LSET; 01245 wchmm->state[ntmp].out.lset = &(wchmm->hmminfo->sp->body.pseudo->stateset[k]); 01246 acc_init(wchmm, ntmp); 01247 wchmm->stend[ntmp] = WORD_INVALID; 01248 ntmp++; 01249 if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm); 01250 } 01251 } else { 01252 for(k = 1;k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) { 01253 wchmm->outstyle[ntmp] = AS_STATE; 01254 wchmm->state[ntmp].out.state = wchmm->hmminfo->sp->body.defined->s[k]; 01255 acc_init(wchmm, ntmp); 01256 wchmm->stend[ntmp] = WORD_INVALID; 01257 ntmp++; 01258 if (ntmp >= wchmm->maxwcn) wchmm_extend(wchmm); 01259 } 01260 } 01261 ntmp = ntmp_bak; 01262 /* connect incoming arcs from previous phone */ 01263 out_num_next = 0; 01264 for (ato = 1; ato < hmm_logical_state_num(wchmm->hmminfo->sp); ato++) { 01265 prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[0][ato]; 01266 if (prob != LOG_ZERO) { 01267 /* to control short pause insertion, transition probability toward 01268 the word-end short pause will be given a penalty */ 01269 prob += wchmm->hmminfo->iwsp_penalty; 01270 if (ato == hmm_logical_state_num(wchmm->hmminfo->sp) - 1) { 01271 /* model has a model skip transition, just inherit them to next */ 01272 for(kkk=0; kkk<out_num_prev; kkk++) { 01273 out_from_next[out_num_next] = out_from[kkk]; 01274 out_a_next[out_num_next] = out_a[kkk] + prob; 01275 out_num_next++; 01276 } 01277 } else { 01278 /* connect incoming arcs from previous phone to this phone */ 01279 for(kkk=0; kkk<out_num_prev; kkk++) { 01280 add_wacc(wchmm, out_from[kkk], out_a[kkk] + prob, ntmp + ato - 1); 01281 } 01282 } 01283 } 01284 } 01285 /* if short pause model doesn't have a model skip transition, also add it */ 01286 if (hmm_logical_trans(wchmm->hmminfo->sp)->a[0][hmm_logical_state_num(wchmm->hmminfo->sp)-1] == LOG_ZERO) { 01287 /* to make insertion sp model to have no effect on the original path, 01288 the skip transition probability should be 0.0 (=100%) */ 01289 prob = 0.0; 01290 for(kkk=0; kkk<out_num_prev; kkk++) { 01291 out_from_next[out_num_next] = out_from[kkk]; 01292 out_a_next[out_num_next] = out_a[kkk] + prob; 01293 out_num_next++; 01294 } 01295 } 01296 /* connect arcs within model, and store new outgoing arcs for wordend node */ 01297 for (k = 1; k < hmm_logical_state_num(wchmm->hmminfo->sp) - 1; k++) { 01298 for (ato = 1; ato < hmm_logical_state_num(wchmm->hmminfo->sp); ato++) { 01299 prob = hmm_logical_trans(wchmm->hmminfo->sp)->a[k][ato]; 01300 if (prob != LOG_ZERO) { 01301 if (ato == hmm_logical_state_num(wchmm->hmminfo->sp) - 1) { 01302 out_from_next[out_num_next] = ntmp; 01303 out_a_next[out_num_next] = prob; 01304 out_num_next++; 01305 } else { 01306 add_wacc(wchmm, ntmp, prob, ntmp + ato - k); 01307 } 01308 } 01309 } 01310 ntmp++; 01311 } 01312 /* swap work area for next */ 01313 for(kkk=0;kkk<out_num_next;kkk++) { 01314 out_from[kkk] = out_from_next[kkk]; 01315 out_a[kkk] = out_a_next[kkk]; 01316 } 01317 out_num_prev = out_num_next; 01318 01319 } /* end of inter-word short pause appending block */ 01320 01321 /* make mapping: word <-> node on wchmm */ 01322 for (j=0;j<word_len;j++) { 01323 if (j < add_head) { /* shared part */ 01324 wchmm->offset[word][j] = wchmm->offset[matchword][j]; 01325 } else if (add_tail < j) { /* shared tail part (should not happen..) */ 01326 wchmm->offset[word][j] = wchmm->offset[matchword][j+(matchword_len-word_len)]; 01327 } else { /* newly created part */ 01328 wchmm->offset[word][j] = n; 01329 n += hmm_logical_state_num(wchmm->winfo->wseq[word][j]) - 2; 01330 } 01331 } 01332 01333 01334 if (wchmm->hmminfo->multipath) { 01335 /* create word-end node */ 01336 01337 /* paranoia check if the short-pause addition has been done well */ 01338 if (enable_iwsp && add_tail - add_head + 1 > 0) { 01339 n += hmm_logical_state_num(wchmm->hmminfo->sp) - 2; 01340 if (n != ntmp) j_internal_error("wchmm_add_word: cannot match\n"); 01341 } 01342 01343 /* create word-end node */ 01344 wchmm->wordend[word] = n; /* tail node of 'word' is 'n' */ 01345 wchmm->stend[n] = word; /* node 'k' is a tail node of 'word' */ 01346 acc_init(wchmm, n); 01347 wchmm->state[n].out.state = NULL; 01348 01349 /* connect the final outgoing arcs in out_from[] to the word end node */ 01350 for(k = 0; k < out_num_prev; k++) { 01351 add_wacc(wchmm, out_from[k], out_a[k], n); 01352 } 01353 n++; 01354 if (n >= wchmm->maxwcn) wchmm_extend(wchmm); 01355 01356 if (matchlen == 0) { 01357 /* check if the new word has whole word-skipping transition */ 01358 /* (use out_from and out_num_prev temporary) */ 01359 out_num_prev = 0; 01360 get_outtrans_list(wchmm, word, word_len-1, out_from, out_a, &out_num_prev, wchmm->winfo->maxwn, enable_iwsp); 01361 for(k=0;k<out_num_prev;k++) { 01362 if (out_from[k] == wchmm->wordbegin[word]) { 01363 jlog("ERROR: *** ERROR: WORD SKIPPING TRANSITION NOT ALLOWED ***\n"); 01364 jlog("ERROR: Word id=%d (%s[%s]) has \"word skipping transition\".\n", word, wchmm->winfo->wname[word], wchmm->winfo->woutput[word]); 01365 jlog("ERROR: All HMMs in the word:\n "); 01366 for(kkk=0;kkk<wchmm->winfo->wlen[word];kkk++) { 01367 jlog("%s ", wchmm->winfo->wseq[word][kkk]->name); 01368 } 01369 jlog("\n"); 01370 jlog("ERROR: has transitions from initial state to final state.\n"); 01371 jlog("ERROR: This type of word skipping is not supported.\n"); 01372 ok_p = FALSE; 01373 } 01374 } 01375 } 01376 01377 wchmm->n = n; 01378 01379 } else { 01380 01381 wchmm->n = n; 01382 k = wchmm->offset[word][word_len-1] + hmm_logical_state_num(wchmm->winfo->wseq[word][word_len-1])-2 -1; 01383 wchmm->wordend[word] = k; /* tail node of 'word' is 'k' */ 01384 wchmm->stend[k] = word; /* node 'k' is a tail node of 'word' */ 01385 01386 if (matchlen != 0 && add_tail - add_head + 1 > 0) { 01387 /* new part has been created in the above procedure: */ 01388 /* now make link from shared part to the new part */ 01389 wchmm_link_subword(wchmm, matchword,add_to,word,add_head); 01390 } 01391 01392 } 01393 01394 return(ok_p); 01395 01396 } 01397 01398 /*************************************************************/ 01399 /**** parse whole structure (after wchmm has been built) *****/ 01400 /*************************************************************/ 01401 01416 static void 01417 wchmm_calc_wordend_arc(WCHMM_INFO *wchmm) 01418 { 01419 WORD_ID w; 01420 HTK_HMM_Trans *tr; 01421 LOGPROB a; 01422 01423 for (w=0;w<wchmm->winfo->num;w++) { 01424 tr = hmm_logical_trans(wchmm->winfo->wseq[w][wchmm->winfo->wlen[w]-1]); 01425 a = tr->a[tr->statenum-2][tr->statenum-1]; 01426 wchmm->wordend_a[w] = a; 01427 } 01428 } 01429 01430 #ifdef SEPARATE_BY_UNIGRAM 01431 01432 /********************************************************************/ 01433 /****** for separation (linearization) of high-frequent words *******/ 01434 /********************************************************************/ 01435 01454 static int 01455 compare_prob(LOGPROB *a, LOGPROB *b) 01456 { 01457 if (*a < *b) return (1); 01458 if (*a > *b) return (-1); 01459 return(0); 01460 } 01461 01480 static LOGPROB 01481 get_nbest_uniprob(WCHMM_INFO *wchmm, int n) 01482 { 01483 LOGPROB *u_p; 01484 WORD_ID w; 01485 LOGPROB x; 01486 WORD_INFO *winfo; 01487 NGRAM_INFO *ngram; 01488 01489 winfo = wchmm->winfo; 01490 ngram = wchmm->ngram; 01491 01492 if (n < 1) n = 1; 01493 if (n > winfo->num) n = winfo->num; 01494 01495 /* store all unigram probability to u_p[] */ 01496 u_p = (LOGPROB *)mymalloc(sizeof(LOGPROB) * winfo->num); 01497 for(w=0;w<winfo->num;w++) { 01498 if (ngram) { 01499 x = uni_prob(ngram, winfo->wton[w]) 01500 #ifdef CLASS_NGRAM 01501 + winfo->cprob[w] 01502 #endif 01503 ; 01504 } else { 01505 x = LOG_ZERO; 01506 } 01507 if (wchmm->lmvar == LM_NGRAM_USER) { 01508 x = (*(wchmm->uni_prob_user))(wchmm->winfo, w, x); 01509 } 01510 u_p[w] = x; 01511 } 01512 01513 /* sort them downward */ 01514 qsort(u_p, winfo->num, sizeof(LOGPROB), 01515 (int (*)(const void *,const void *))compare_prob); 01516 01517 /* return the Nth value */ 01518 x = u_p[n-1]; 01519 free(u_p); 01520 return(x); 01521 } 01522 01523 #endif 01524 01525 /**********************************************************/ 01526 /****** MAKE WCHMM (LEXICON TREE) --- main function *******/ 01527 /**********************************************************/ 01528 01529 #define COUNT_STEP 500 ///< Word count step for debug progress output 01530 01552 boolean 01553 build_wchmm(WCHMM_INFO *wchmm, JCONF_LM *lmconf) 01554 { 01555 int i,j; 01556 int matchword=0, sharelen=0, maxsharelen=0; 01557 int num_duplicated; 01558 #ifdef SEPARATE_BY_UNIGRAM 01559 LOGPROB separate_thres; 01560 LOGPROB p; 01561 #endif 01562 boolean ok_p; 01563 01564 /* lingustic infos must be set before build_wchmm() is called */ 01565 /* check if necessary lingustic info is already assigned (for debug) */ 01566 if (wchmm->winfo == NULL 01567 || (wchmm->lmvar == LM_NGRAM && wchmm->ngram == NULL) 01568 || (wchmm->lmvar == LM_DFA_GRAMMAR && wchmm->dfa == NULL) 01569 ) { 01570 jlog("ERROR: wchmm: linguistic info not available!!\n"); 01571 return FALSE; 01572 } 01573 01574 ok_p = TRUE; 01575 01576 #ifdef SEPARATE_BY_UNIGRAM 01577 /* 上位[separate_wnum]番目の1-gramスコアを求める */ 01578 /* 1-gramスコアがこの値以上のものは木から分ける */ 01579 separate_thres = get_nbest_uniprob(wchmm, lmconf->separate_wnum); 01580 #endif 01581 01582 #ifdef PASS1_IWCD 01583 #ifndef USE_OLD_IWCD 01584 if (wchmm->category_tree) { 01585 if (wchmm->ccd_flag) { 01586 /* 全てのカテゴリID付き lcd_set を作成 */ 01587 lcdset_register_with_category_all(wchmm); 01588 } 01589 } 01590 #endif 01591 #endif /* PASS1_IWCD */ 01592 01593 01594 /* wchmmを初期化 */ 01595 wchmm_init(wchmm); 01596 01597 /* カウンタリセット */ 01598 wchmm->separated_word_count=0; 01599 01600 jlog("STAT: wchmm: Building HMM lexicon tree (left-to-right)\n"); 01601 for (i=0;i<wchmm->winfo->num;i++) { 01602 01603 if (wchmm->lmtype == LM_PROB) { 01604 if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) { 01605 /* 先頭/末尾の無音モデルは木構造化せず, 01606 * 先頭の無音単語の先頭への遷移,末尾単語の末尾からの遷移は作らない*/ 01607 /* sharelen=0でそのまま */ 01608 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { 01609 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); 01610 ok_p = FALSE; 01611 } 01612 continue; 01613 } 01614 #ifndef NO_SEPARATE_SHORT_WORD 01615 if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) { 01616 /* 長さの短い単語を木構造化しない(ここでは1音節) */ 01617 /* sharelen=0でそのまま */ 01618 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { 01619 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); 01620 ok_p = FALSE; 01621 } 01622 wchmm->separated_word_count++; 01623 continue; 01624 } 01625 #endif 01626 #ifdef SEPARATE_BY_UNIGRAM 01627 if (wchmm->ngram) { 01628 p = uni_prob(wchmm->ngram, wchmm->winfo->wton[i]) 01629 #ifdef CLASS_NGRAM 01630 + wchmm->winfo->cprob[i] 01631 #endif 01632 ; 01633 } else { 01634 p = LOG_ZERO; 01635 } 01636 if (wchmm->lmvar == LM_NGRAM_USER) { 01637 p = (*(wchmm->uni_prob_user))(wchmm->winfo, i, p); 01638 } 01639 if (p >= separate_thres && wchmm->separated_word_count < lmconf->separate_wnum) { 01640 /* 頻度の高い単語を木構造化しない */ 01641 /* separate_thres は上位separate_wnum番目のスコア */ 01642 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { 01643 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); 01644 ok_p = FALSE; 01645 } 01646 wchmm->separated_word_count++; 01647 continue; 01648 } 01649 #endif 01650 } 01651 01652 /* 最も長く音素を共有出来る単語を探す */ 01653 maxsharelen=0; 01654 for (j=0;j<i;j++) { 01655 if (wchmm->category_tree && wchmm->lmtype == LM_DFA) { 01656 if (wchmm->winfo->wton[i] != wchmm->winfo->wton[j]) continue; 01657 } 01658 sharelen = wchmm_check_match(wchmm->winfo, i, j); 01659 if (sharelen == wchmm->winfo->wlen[i] && sharelen == wchmm->winfo->wlen[j]) { 01660 /* word に同音語が存在する */ 01661 /* 必ず最大の長さであり,重複カウントを避けるためここで抜ける */ 01662 maxsharelen = sharelen; 01663 matchword = j; 01664 break; 01665 } 01666 if (sharelen > maxsharelen) { 01667 matchword = j; 01668 maxsharelen = sharelen; 01669 } 01670 } 01671 if (wchmm_add_word(wchmm, i, maxsharelen, matchword, lmconf->enable_iwsp) == FALSE) { 01672 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); 01673 ok_p = FALSE; 01674 } 01675 } 01676 01677 #if 0 01678 /* 木構造を作らない */ 01679 for (i=0;i<wchmm->winfo->num;i++) { 01680 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { 01681 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); 01682 ok_p = FALSE; 01683 } 01684 } 01685 #endif 01686 jlog("STAT: %5d words ended (%6d nodes)\n",i,wchmm->n); 01687 01688 if (! wchmm->hmminfo->multipath) { 01689 /* 同一音素系列を持つ単語同士の leaf node を2重化して区別する */ 01690 num_duplicated = wchmm_duplicate_leafnode(wchmm); 01691 jlog("STAT: %d leaf nodes are made unshared\n", num_duplicated); 01692 01693 /* 単語の終端から外への遷移確率を求めておく */ 01694 wchmm_calc_wordend_arc(wchmm); 01695 } 01696 01697 /* wchmmの整合性をチェックする */ 01698 check_wchmm(wchmm); 01699 01700 /* factoring用に各状態に後続単語のリストを付加する */ 01701 if (!wchmm->category_tree) { 01702 01703 #ifdef UNIGRAM_FACTORING 01704 if (wchmm->lmtype == LM_PROB) { 01705 /* 同時に前もってfactoring値を計算 */ 01706 make_successor_list_unigram_factoring(wchmm); 01707 jlog("STAT: 1-gram factoring values has been pre-computed\n"); 01708 } else { 01709 make_successor_list(wchmm); 01710 } 01711 #else 01712 make_successor_list(wchmm); 01713 #endif /* UNIGRAM_FACTORING */ 01714 01715 if (wchmm->hmminfo->multipath) { 01716 /* 構築された factoring 情報をスキップ遷移および文頭文法ノードにコピー */ 01717 adjust_sc_index(wchmm); 01718 } 01719 01720 #ifdef UNIGRAM_FACTORING 01721 if (wchmm->lmtype == LM_PROB) { 01722 /* 単語間LMキャッシュが必要なノードのリストを作る */ 01723 make_iwcache_index(wchmm); 01724 } 01725 #endif /* UNIGRAM_FACTORING */ 01726 01727 /* sclist2node is no longer used */ 01728 if (wchmm->sclist2node != NULL) { 01729 free(wchmm->sclist2node); 01730 wchmm->sclist2node = NULL; 01731 } 01732 01733 } 01734 01735 jlog("STAT: done\n"); 01736 01737 return ok_p; 01738 } 01739 01765 boolean 01766 build_wchmm2(WCHMM_INFO *wchmm, JCONF_LM *lmconf) 01767 { 01768 int i,j, last_i; 01769 int num_duplicated; 01770 WORD_ID *windex; 01771 #ifdef SEPARATE_BY_UNIGRAM 01772 LOGPROB separate_thres; 01773 LOGPROB p; 01774 #endif 01775 boolean ok_p; 01776 boolean ret; 01777 01778 /* lingustic infos must be set before build_wchmm() is called */ 01779 /* check if necessary lingustic info is already assigned (for debug) */ 01780 if (wchmm->winfo == NULL 01781 || (wchmm->lmvar == LM_NGRAM && wchmm->ngram == NULL) 01782 || (wchmm->lmvar == LM_DFA_GRAMMAR && wchmm->dfa == NULL) 01783 ) { 01784 jlog("ERROR: wchmm: linguistic info not available!!\n"); 01785 return FALSE; 01786 } 01787 01788 ok_p = TRUE; 01789 01790 wchmm->separated_word_count = 0; 01791 01792 jlog("STAT: Building HMM lexicon tree\n"); 01793 01794 if (wchmm->lmtype == LM_PROB) { 01795 #ifdef SEPARATE_BY_UNIGRAM 01796 /* compute score threshold beforehand to separate words from tree */ 01797 /* here we will separate best [separate_wnum] words from tree */ 01798 separate_thres = get_nbest_uniprob(wchmm, lmconf->separate_wnum); 01799 #endif 01800 } 01801 01802 #ifdef PASS1_IWCD 01803 #ifndef USE_OLD_IWCD 01804 if (wchmm->category_tree) { 01805 if (wchmm->ccd_flag) { 01806 /* when Julian mode (category-tree) and triphone is used, 01807 make all category-indexed context-dependent phone set (cdset) here */ 01808 /* these will be assigned on the last phone of each word on tree */ 01809 lcdset_register_with_category_all(wchmm); 01810 } 01811 } 01812 #endif 01813 #endif /* PASS1_IWCD */ 01814 01815 /* initialize wchmm */ 01816 wchmm_init(wchmm); 01817 01818 /* make sorted word index ordered by phone sequence */ 01819 windex = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wchmm->winfo->num); 01820 for(i=0;i<wchmm->winfo->num;i++) windex[i] = i; 01821 01822 if (wchmm->category_tree && wchmm->lmtype == LM_DFA) { 01823 01824 /* sort by category -> sort by word ID in each category */ 01825 wchmm_sort_idx_by_category(wchmm->winfo, windex, wchmm->winfo->num); 01826 { 01827 int last_cate; 01828 last_i = 0; 01829 last_cate = wchmm->winfo->wton[windex[0]]; 01830 for(i = 1;i<wchmm->winfo->num;i++) { 01831 if (wchmm->winfo->wton[windex[i]] != last_cate) { 01832 wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, i - last_i); 01833 last_cate = wchmm->winfo->wton[windex[i]]; 01834 last_i = i; 01835 } 01836 } 01837 wchmm_sort_idx_by_wseq(wchmm->winfo, windex, last_i, wchmm->winfo->num - last_i); 01838 } 01839 01840 } else { 01841 01842 /* sort by word ID for whole vocabulary */ 01843 wchmm_sort_idx_by_wseq(wchmm->winfo, windex, 0, wchmm->winfo->num); 01844 01845 } 01846 01847 /* 01848 * { 01849 * int i,w; 01850 * for(i=0;i<wchmm->winfo->num;i++) { 01851 * w = windex[i]; 01852 * printf("%d: cate=%4d wid=%4d %s\n",i, wchmm->winfo->wton[w], w, wchmm->winfo->woutput[w]); 01853 * } 01854 * } 01855 */ 01856 01857 /* incrementaly add words to lexicon tree */ 01858 /* now for each word, the previous word (last_i) is always the most matched one */ 01859 last_i = WORD_INVALID; 01860 for (j=0;j<wchmm->winfo->num;j++) { 01861 i = windex[j]; 01862 01863 if (wchmm->lmtype == LM_PROB) { 01864 01865 /* start/end silence word should not be shared */ 01866 if (i == wchmm->winfo->head_silwid || i == wchmm->winfo->tail_silwid) { 01867 /* add whole word as new (sharelen=0) */ 01868 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { 01869 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); 01870 ok_p = FALSE; 01871 } 01872 continue; 01873 } 01874 #ifndef NO_SEPARATE_SHORT_WORD 01875 /* separate short words from tree */ 01876 if (wchmm->winfo->wlen[i] <= SHORT_WORD_LEN) { 01877 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { 01878 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); 01879 ok_p = FALSE; 01880 } 01881 wchmm->separated_word_count++; 01882 continue; 01883 } 01884 #endif 01885 #ifdef SEPARATE_BY_UNIGRAM 01886 if (wchmm->ngram) { 01887 p = uni_prob(wchmm->ngram, wchmm->winfo->wton[i]) 01888 #ifdef CLASS_NGRAM 01889 + wchmm->winfo->cprob[i] 01890 #endif 01891 ; 01892 } else { 01893 p = LOG_ZERO; 01894 } 01895 if (wchmm->lmvar == LM_NGRAM_USER) { 01896 p = (*(wchmm->uni_prob_user))(wchmm->winfo, i, p); 01897 } 01898 /* separate high-frequent words from tree (threshold = separate_thres) */ 01899 if (p >= separate_thres && wchmm->separated_word_count < lmconf->separate_wnum) { 01900 if (wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp) == FALSE) { 01901 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); 01902 ok_p = FALSE; 01903 } 01904 wchmm->separated_word_count++; 01905 continue; 01906 } 01907 #endif 01908 } 01909 01910 if (last_i == WORD_INVALID) { /* first word */ 01911 ret = wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp); 01912 } else { 01913 /* the previous word (last_i) is always the most matched one */ 01914 if (wchmm->category_tree && wchmm->lmtype == LM_DFA) { 01915 if (wchmm->winfo->wton[i] != wchmm->winfo->wton[last_i]) { 01916 ret = wchmm_add_word(wchmm, i, 0, 0, lmconf->enable_iwsp); 01917 } else { 01918 ret = wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i, lmconf->enable_iwsp); 01919 } 01920 } else { 01921 ret = wchmm_add_word(wchmm, i, wchmm_check_match(wchmm->winfo, i, last_i), last_i, lmconf->enable_iwsp); 01922 } 01923 } 01924 if (ret == FALSE) { 01925 jlog("ERROR: wchmm: failed to add word #%d to lexicon tree\n"); 01926 ok_p = FALSE; 01927 } 01928 last_i = i; 01929 01930 } /* end of add word loop */ 01931 01932 /*j_printerr("\r %5d words ended (%6d nodes)\n",j,wchmm->n);*/ 01933 01934 /* free work area */ 01935 free(windex); 01936 01937 if (wchmm->hmminfo->multipath) { 01938 jlog("STAT: lexicon size: %d nodes\n", wchmm->n); 01939 } else { 01940 /* duplicate leaf nodes of homophone/embedded words */ 01941 jlog("STAT: lexicon size: %d", wchmm->n); 01942 num_duplicated = wchmm_duplicate_leafnode(wchmm); 01943 jlog("+%d=%d\n", num_duplicated, wchmm->n); 01944 } 01945 01946 if (! wchmm->hmminfo->multipath) { 01947 /* calculate transition probability of word end node to outside */ 01948 wchmm_calc_wordend_arc(wchmm); 01949 } 01950 01951 /* check wchmm coherence (internal debug) */ 01952 check_wchmm(wchmm); 01953 01954 /* make successor list for all branch nodes for N-gram factoring */ 01955 if (!wchmm->category_tree) { 01956 01957 #ifdef UNIGRAM_FACTORING 01958 if (wchmm->lmtype == LM_PROB) { 01959 /* for 1-gram factoring, we can compute the values before search */ 01960 make_successor_list_unigram_factoring(wchmm); 01961 jlog("STAT: 1-gram factoring values has been pre-computed\n"); 01962 } else { 01963 make_successor_list(wchmm); 01964 } 01965 #else 01966 make_successor_list(wchmm); 01967 #endif /* UNIGRAM_FACTORING */ 01968 if (wchmm->hmminfo->multipath) { 01969 /* Copy the factoring data according to the skip transitions and startword nodes */ 01970 adjust_sc_index(wchmm); 01971 } 01972 #ifdef UNIGRAM_FACTORING 01973 if (wchmm->lmtype == LM_PROB) { 01974 /* make list of start nodes that needs inter-word LM cache */ 01975 make_iwcache_index(wchmm); 01976 } 01977 #endif /* UNIGRAM_FACTORING */ 01978 01979 /* sclist2node is no longer used */ 01980 if (wchmm->sclist2node != NULL) { 01981 free(wchmm->sclist2node); 01982 wchmm->sclist2node = NULL; 01983 } 01984 01985 } 01986 01987 //jlog("STAT: done\n"); 01988 01989 #ifdef WCHMM_SIZE_CHECK 01990 if (debug2_flag) { 01991 /* detailed check of lexicon tree size (inaccurate!) */ 01992 jlog("STAT: --- memory size of word lexicon ---\n"); 01993 jlog("STAT: wchmm: %d words, %d nodes\n", wchmm->winfo->num, wchmm->n); 01994 jlog("STAT: %9d bytes: wchmm->state[node] (exclude ac, sc)\n", sizeof(WCHMM_STATE) * wchmm->n); 01995 { 01996 int count1 = 0; 01997 int count2 = 0; 01998 int count3 = 0; 01999 for(i=0;i<wchmm->n;i++) { 02000 if (wchmm->self_a[i] != LOG_ZERO) count1++; 02001 if (wchmm->next_a[i] != LOG_ZERO) count2++; 02002 if (wchmm->ac[i] != NULL) count3++; 02003 } 02004 jlog("STAT: %9d bytes: wchmm->self_a[node] (%4.1f%% filled)\n", sizeof(LOGPROB) * wchmm->n, 100.0 * count1 / (float)wchmm->n); 02005 jlog("STAT: %9d bytes: wchmm->next_a[node] (%4.1f%% filled)\n", sizeof(LOGPROB) * wchmm->n, 100.0 * count2 / (float)wchmm->n); 02006 jlog("STAT: %9d bytes: wchmm->ac[node] (%4.1f%% used)\n", sizeof(A_CELL2 *) * wchmm->n, 100.0 * count3 / (float)wchmm->n); 02007 } 02008 jlog("STAT: %9d bytes: wchmm->stend[node]\n", sizeof(WORD_ID) * wchmm->n); 02009 { 02010 int w,count; 02011 count = 0; 02012 for(w=0;w<wchmm->winfo->num;w++) { 02013 count += wchmm->winfo->wlen[w] * sizeof(int) + sizeof(int *); 02014 } 02015 jlog("STAT: %9d bytes: wchmm->offset[w][]\n", count); 02016 } 02017 if (wchmm->hmminfo->multipath) { 02018 jlog("STAT: %9d bytes: wchmm->wordbegin[w]\n", wchmm->winfo->num * sizeof(int)); 02019 } 02020 jlog("STAT: %9d bytes: wchmm->wordend[w]\n", wchmm->winfo->num * sizeof(int)); 02021 jlog("STAT: %9d bytes: wchmm->startnode[]\n", wchmm->startnum * sizeof(int)); 02022 if (wchmm->category_tree) { 02023 jlog("STAT: %9d bytes: wchmm->start2wid[]\n", wchmm->startnum * sizeof(WORD_ID)); 02024 } 02025 #ifdef UNIGRAM_FACTORING 02026 if (wchmm->lmtype == LM_PROB) { 02027 jlog("STAT: %9d bytes: wchmm->start2isolate[]\n", wchmm->isolatenum * sizeof(int)); 02028 } 02029 #endif 02030 if (!wchmm->hmminfo->multipath) { 02031 jlog("STAT: %9d bytes: wchmm->wordend_a[]\n", wchmm->winfo->num * sizeof(LOGPROB)); 02032 } 02033 #ifdef PASS1_IWCD 02034 jlog("STAT: %9d bytes: wchmm->outstyle[]\n", wchmm->n * sizeof(unsigned char)); 02035 { 02036 int c; 02037 c = 0; 02038 for(i=0;i<wchmm->n;i++) { 02039 switch(wchmm->outstyle[i]) { 02040 case AS_RSET: 02041 c += sizeof(RC_INFO); 02042 break; 02043 case AS_LRSET: 02044 c += sizeof(LRC_INFO); 02045 break; 02046 } 02047 } 02048 if (c > 0) jlog("STAT: %9d bytes: wchmm->out (RC_INFO / LRC_INFO)\n", c); 02049 } 02050 #endif 02051 if (!wchmm->category_tree) { 02052 jlog("STAT: %9d bytes: wchmm->sclist[]\n", wchmm->scnum * sizeof(S_CELL *)); 02053 jlog("STAT: %9d bytes: wchmm->sclist2node[]\n", wchmm->scnum * sizeof(int)); 02054 #ifdef UNIGRAM_FACTORING 02055 if (wchmm->lmtype == LM_PROB) { 02056 jlog("STAT: %9d bytes: wchmm->fscore[]\n", wchmm->fsnum * sizeof(LOGPROB)); 02057 } 02058 #endif 02059 } 02060 02061 { 02062 int count, n; 02063 A_CELL2 *ac; 02064 count = 0; 02065 for(n=0;n<wchmm->n;n++) { 02066 for(ac=wchmm->ac[n];ac;ac=ac->next) { 02067 count += sizeof(A_CELL2); 02068 } 02069 } 02070 jlog("STAT: %9d bytes: A_CELL2\n", count); 02071 } 02072 if (!wchmm->category_tree) { 02073 jlog("STAT: %9d bytes: sclist\n", wchmm->scnum * sizeof(S_CELL *)); 02074 jlog("STAT: %9d bytes: sclist2node\n", wchmm->scnum * sizeof(int)); 02075 } 02076 02077 } 02078 02079 #endif /* WCHMM_SIZE_CHECK */ 02080 02081 02082 return ok_p; 02083 02084 } 02085 02086 02101 void 02102 print_wchmm_info(WCHMM_INFO *wchmm) 02103 { 02104 int n,i, rootnum; 02105 02106 if (wchmm->hmminfo->multipath) { 02107 rootnum = wchmm->startnum; 02108 } else { 02109 if (wchmm->lmtype == LM_PROB) { 02110 rootnum = wchmm->startnum + 1; /* including winfo->head_silwid */ 02111 } else if (wchmm->lmtype == LM_DFA) { 02112 rootnum = wchmm->startnum; 02113 } 02114 } 02115 02116 jlog(" Lexicon tree:\n"); 02117 jlog("\t total node num = %6d\n", wchmm->n); 02118 if (wchmm->lmtype == LM_PROB) { 02119 jlog("\t root node num = %6d\n", rootnum); 02120 #ifdef NO_SEPARATE_SHORT_WORD 02121 #ifdef SEPARATE_BY_UNIGRAM 02122 jlog("\t(%d hi-freq. words are separated from tree lexicon)\n", wchmm->separated_word_count); 02123 #else 02124 jlog(" (no words are separated from tree)\n"); 02125 #endif /* SEPARATE_BY_UNIGRAM */ 02126 #else 02127 jlog(" (%d short words (<= %d phonemes) are separated from tree)\n", wchmm->separated_word_count, SHORT_WORD_LEN); 02128 #endif /* NO_SEPARATE_SHORT_WORD */ 02129 } 02130 if (wchmm->lmtype == LM_DFA) { 02131 jlog("\t root node num = %6d\n", rootnum); 02132 } 02133 for(n=0,i=0;i<wchmm->n;i++) { 02134 if (wchmm->stend[i] != WORD_INVALID) n++; 02135 } 02136 jlog("\t leaf node num = %6d\n", n); 02137 if (!wchmm->category_tree) { 02138 jlog("\t fact. node num = %6d\n", wchmm->scnum - 1); 02139 } 02140 } 02141 02142 /* end of file */