Julius 4.2
libjulius/src/word_align.c
説明を見る。
00001 
00039 /*
00040  * Copyright (c) 1991-2011 Kawahara Lab., Kyoto University
00041  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00042  * Copyright (c) 2005-2011 Julius project team, Nagoya Institute of Technology
00043  * All rights reserved
00044  */
00045 
00046 #include <julius/julius.h>
00047 
00077 static HMM_Logical **
00078 make_phseq(WORD_ID *wseq, short num, boolean **has_sp_ret, int *num_ret, int **end_ret, int per_what, 
00079            RecogProcess *r)
00080 {
00081   HMM_Logical **ph;             /* phoneme sequence */
00082   boolean *has_sp;
00083   int k;
00084   int phnum;                    /* num of above */
00085   WORD_ID tmpw, w;
00086   int i, j, pn, st, endn;
00087   HMM_Logical *tmpp, *ret;
00088   WORD_INFO *winfo;
00089   HTK_HMM_INFO *hmminfo;
00090   boolean enable_iwsp;          /* for multipath */
00091 
00092   winfo = r->lm->winfo;
00093   hmminfo = r->am->hmminfo;
00094   if (hmminfo->multipath) enable_iwsp = r->lm->config->enable_iwsp;
00095 
00096   /* make ph[] from wseq[] */
00097   /* 1. calc total phone num and malloc */
00098   phnum = 0;
00099   for (w=0;w<num;w++) phnum += winfo->wlen[wseq[w]];
00100   ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * phnum);
00101   
00102   if (hmminfo->multipath && enable_iwsp) {
00103     has_sp = (boolean *)mymalloc(sizeof(boolean) * phnum);
00104   } else {
00105     has_sp = NULL;
00106   }
00107   /* 2. make phoneme sequence */
00108   st = 0;
00109   if (hmminfo->multipath) st++;
00110   pn = 0;
00111   endn = 0;
00112   for (w=0;w<num;w++) {
00113     tmpw = wseq[w];
00114     for (i=0;i<winfo->wlen[tmpw];i++) {
00115       tmpp = winfo->wseq[tmpw][i];
00116       /* handle cross-word context dependency */
00117       if (r->ccd_flag) {
00118         if (w > 0 && i == 0) {  /* word head */
00119           
00120           if ((ret = get_left_context_HMM(tmpp, ph[pn-1]->name, hmminfo)) != NULL) {
00121             tmpp = ret;
00122           }
00123           /* if triphone not found, fallback to bi/mono-phone  */
00124           /* use pseudo phone when no bi-phone found in alignment... */
00125         }
00126         if (w < num-1 && i == winfo->wlen[tmpw] - 1) { /* word tail */
00127           if ((ret = get_right_context_HMM(tmpp, winfo->wseq[wseq[w+1]][0]->name, hmminfo)) != NULL) {
00128             tmpp = ret;
00129           }
00130         }
00131       }
00132       ph[pn] = tmpp;
00133       if (hmminfo->multipath && enable_iwsp) {
00134         if (i == winfo->wlen[tmpw] - 1) {
00135           has_sp[pn] = TRUE;
00136         } else {
00137           has_sp[pn] = FALSE;
00138         }
00139       }
00140       if (per_what == PER_STATE) {
00141         for (j=0;j<hmm_logical_state_num(tmpp)-2;j++) {
00142           (*end_ret)[endn++] = st + j;
00143         }
00144         if (hmminfo->multipath && enable_iwsp && has_sp[pn]) {
00145           for (k=0;k<hmm_logical_state_num(hmminfo->sp)-2;k++) {
00146             (*end_ret)[endn++] = st + j + k;
00147           }
00148         }
00149       }
00150       st += hmm_logical_state_num(tmpp) - 2;
00151       if (hmminfo->multipath && enable_iwsp && has_sp[pn]) {
00152         st += hmm_logical_state_num(hmminfo->sp) - 2;
00153       }
00154       if (per_what == PER_PHONEME) (*end_ret)[endn++] = st - 1;
00155       pn++;
00156     }
00157     if (per_what == PER_WORD) (*end_ret)[endn++] = st - 1;
00158   }
00159   *num_ret = phnum;
00160   *has_sp_ret = has_sp;
00161   return ph;
00162 }
00163 
00164 
00187 static void
00188 do_align(WORD_ID *words, short wnum, HTK_Param *param, int per_what, SentenceAlign *align, RecogProcess *r)
00189 {
00190   HMM_Logical **phones;         /* phoneme sequence */
00191   boolean *has_sp;              /* whether phone can follow short pause */
00192   int k;
00193   int phonenum;                 /* num of above */
00194   HMM *shmm;                    /* sentence HMM */
00195   int *end_state;               /* state number of word ends */
00196   int *end_frame;               /* segmented last frame of words */
00197   LOGPROB *end_score;           /* normalized score of each words */
00198   LOGPROB allscore;             /* total score of this word sequence */
00199   WORD_ID w;
00200   int i, rlen;
00201   int end_num = 0;
00202   int *id_seq, *phloc = NULL, *stloc = NULL;
00203   int j,n,p;
00204   WORD_INFO *winfo;
00205   HTK_HMM_INFO *hmminfo;
00206   boolean enable_iwsp;          /* for multipath */
00207 
00208   winfo = r->lm->winfo;
00209   hmminfo = r->am->hmminfo;
00210   if (hmminfo->multipath) enable_iwsp = r->lm->config->enable_iwsp;
00211 
00212   /* initialize result storage buffer */
00213   switch(per_what) {
00214   case PER_WORD:
00215     jlog("ALIGN: === word alignment begin ===\n");
00216     end_num = wnum;
00217     phloc = (int *)mymalloc(sizeof(int)*wnum);
00218     i = 0;
00219     for(w=0;w<wnum;w++) {
00220       phloc[w] = i;
00221       i += winfo->wlen[words[w]];
00222     }
00223     break;
00224   case PER_PHONEME:
00225     jlog("ALIGN: === phoneme alignment begin ===\n");
00226     end_num = 0;
00227     for(w=0;w<wnum;w++) end_num += winfo->wlen[words[w]];
00228     break;
00229   case PER_STATE:
00230     jlog("ALIGN: === state alignment begin ===\n");
00231     end_num = 0;
00232     for(w=0;w<wnum;w++) {
00233       for (i=0;i<winfo->wlen[words[w]]; i++) {
00234         end_num += hmm_logical_state_num(winfo->wseq[words[w]][i]) - 2;
00235       }
00236       if (hmminfo->multipath && enable_iwsp) {
00237         end_num += hmm_logical_state_num(hmminfo->sp) - 2;
00238       }
00239     }
00240     phloc = (int *)mymalloc(sizeof(int)*end_num);
00241     stloc = (int *)mymalloc(sizeof(int)*end_num);
00242     {
00243       n = 0;
00244       p = 0;
00245       for(w=0;w<wnum;w++) {
00246         for(i=0;i<winfo->wlen[words[w]]; i++) {
00247           for(j=0; j<hmm_logical_state_num(winfo->wseq[words[w]][i]) - 2; j++) {
00248             phloc[n] = p;
00249             stloc[n] = j + 1;
00250             n++;
00251           }
00252           if (hmminfo->multipath && enable_iwsp && i == winfo->wlen[words[w]] - 1) {
00253             for(k=0;k<hmm_logical_state_num(hmminfo->sp)-2;k++) {
00254               phloc[n] = p;
00255               stloc[n] = j + 1 + k + end_num;
00256               n++;
00257             }
00258           }
00259           p++;
00260         }
00261       }
00262     }
00263     
00264     break;
00265   }
00266   end_state = (int *)mymalloc(sizeof(int) * end_num);
00267 
00268   /* make phoneme sequence word sequence */
00269   phones = make_phseq(words, wnum, &has_sp, &phonenum, &end_state, per_what, r);
00270   /* build the sentence HMMs */
00271   shmm = new_make_word_hmm(hmminfo, phones, phonenum, has_sp);
00272   if (shmm == NULL) {
00273     j_internal_error("Error: failed to make word hmm for alignment\n");
00274   }
00275 
00276   /* call viterbi segmentation function */
00277   allscore = viterbi_segment(shmm, param, r->wchmm->hmmwrk, hmminfo->multipath, end_state, end_num, &id_seq, &end_frame, &end_score, &rlen);
00278 
00279   /* store result to s */
00280   align->num = rlen;
00281   align->unittype = per_what;
00282   align->begin_frame = (int *)mymalloc(sizeof(int) * rlen);
00283   align->end_frame   = (int *)mymalloc(sizeof(int) * rlen);
00284   align->avgscore    = (LOGPROB *)mymalloc(sizeof(LOGPROB) * rlen);
00285   for(i=0;i<rlen;i++) {
00286     align->begin_frame[i] = (i == 0) ? 0 : end_frame[i-1] + 1;
00287     align->end_frame[i]   = end_frame[i];
00288     align->avgscore[i]    = end_score[i];
00289   }
00290   switch(per_what) {
00291   case PER_WORD:
00292     align->w = (WORD_ID *)mymalloc(sizeof(WORD_ID) * rlen);
00293     for(i=0;i<rlen;i++) {
00294       align->w[i] = words[id_seq[i]];
00295     }
00296     break;
00297   case PER_PHONEME:
00298     align->ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * rlen);
00299     for(i=0;i<rlen;i++) {
00300       align->ph[i] = phones[id_seq[i]];
00301     }
00302     break;
00303   case PER_STATE:
00304     align->ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * rlen);
00305     align->loc = (short *)mymalloc(sizeof(short) * rlen);
00306     if (hmminfo->multipath) align->is_iwsp = (boolean *)mymalloc(sizeof(boolean) * rlen);
00307     for(i=0;i<rlen;i++) {
00308       align->ph[i]  = phones[phloc[id_seq[i]]];
00309       if (hmminfo->multipath) {
00310         if (enable_iwsp && stloc[id_seq[i]] > end_num) {
00311           align->loc[i] = stloc[id_seq[i]] - end_num;
00312           align->is_iwsp[i] = TRUE;
00313         } else {
00314           align->loc[i] = stloc[id_seq[i]];
00315           align->is_iwsp[i] = FALSE;
00316         }
00317       } else {
00318         align->loc[i] = stloc[id_seq[i]];
00319       }
00320     }
00321     break;
00322   }
00323 
00324   align->allscore = allscore;
00325 
00326   free_hmm(shmm);
00327   free(id_seq);
00328   free(phones);
00329   if (has_sp) free(has_sp);
00330   free(end_score);
00331   free(end_frame);
00332   free(end_state);
00333 
00334   switch(per_what) {
00335   case PER_WORD:
00336     free(phloc);
00337     break;
00338   case PER_PHONEME:
00339     break;
00340   case PER_STATE:
00341     free(phloc);
00342     free(stloc);
00343   }
00344   
00345 }
00346 
00369 void
00370 word_align(WORD_ID *words, short wnum, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
00371 {
00372   do_align(words, wnum, param, PER_WORD, align, r);
00373 }
00374 
00397 void
00398 word_rev_align(WORD_ID *revwords, short wnum, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
00399 {
00400   WORD_ID *words;               /* word sequence (true order) */
00401   int w;
00402   words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wnum);
00403   for (w=0;w<wnum;w++) words[w] = revwords[wnum-w-1];
00404   do_align(words, wnum, param, PER_WORD, align, r);
00405   free(words);
00406 }
00407 
00430 void
00431 phoneme_align(WORD_ID *words, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
00432 {
00433   do_align(words, num, param, PER_PHONEME, align, r);
00434 }
00435 
00458 void
00459 phoneme_rev_align(WORD_ID *revwords, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
00460 {
00461   WORD_ID *words;               /* word sequence (true order) */
00462   int p;
00463   words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * num);
00464   for (p=0;p<num;p++) words[p] = revwords[num-p-1];
00465   do_align(words, num, param, PER_PHONEME, align, r);
00466   free(words);
00467 }
00468 
00491 void
00492 state_align(WORD_ID *words, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
00493 {
00494   do_align(words, num, param, PER_STATE, align, r);
00495 }
00496 
00519 void
00520 state_rev_align(WORD_ID *revwords, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
00521 {
00522   WORD_ID *words;               /* word sequence (true order) */
00523   int p;
00524   words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * num);
00525   for (p=0;p<num;p++) words[p] = revwords[num-p-1];
00526   do_align(words, num, param, PER_STATE, align, r);
00527   free(words);
00528 }
00529 
00546 void
00547 do_alignment_all(RecogProcess *r, HTK_Param *param)
00548 {
00549   int n;
00550   Sentence *s;
00551   SentenceAlign *now, *prev;
00552 
00553   for(n = 0; n < r->result.sentnum; n++) {
00554     s = &(r->result.sent[n]);
00555     /* do forced alignment if needed */
00556     if (r->config->annotate.align_result_word_flag) {
00557       now = result_align_new();
00558       word_align(s->word, s->word_num, param, now, r);
00559       if (s->align == NULL) s->align = now;
00560       else prev->next = now;
00561       prev = now;
00562     }
00563     if (r->config->annotate.align_result_phoneme_flag) {
00564       now = result_align_new();
00565       phoneme_align(s->word, s->word_num, param, now, r);
00566       if (s->align == NULL) s->align = now;
00567       else prev->next = now;
00568       prev = now;
00569     }
00570     if (r->config->annotate.align_result_state_flag) {
00571       now = result_align_new();
00572       state_align(s->word, s->word_num, param, now, r);
00573       if (s->align == NULL) s->align = now;
00574       else prev->next = now;
00575       prev = now;
00576     }
00577   }
00578 } 
00579 
00580 /* end of file */