/* Julius 4.2 */
00001 00039 /* 00040 * Copyright (c) 1991-2011 Kawahara Lab., Kyoto University 00041 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology 00042 * Copyright (c) 2005-2011 Julius project team, Nagoya Institute of Technology 00043 * All rights reserved 00044 */ 00045 00046 #include <julius/julius.h> 00047 00077 static HMM_Logical ** 00078 make_phseq(WORD_ID *wseq, short num, boolean **has_sp_ret, int *num_ret, int **end_ret, int per_what, 00079 RecogProcess *r) 00080 { 00081 HMM_Logical **ph; /* phoneme sequence */ 00082 boolean *has_sp; 00083 int k; 00084 int phnum; /* num of above */ 00085 WORD_ID tmpw, w; 00086 int i, j, pn, st, endn; 00087 HMM_Logical *tmpp, *ret; 00088 WORD_INFO *winfo; 00089 HTK_HMM_INFO *hmminfo; 00090 boolean enable_iwsp; /* for multipath */ 00091 00092 winfo = r->lm->winfo; 00093 hmminfo = r->am->hmminfo; 00094 if (hmminfo->multipath) enable_iwsp = r->lm->config->enable_iwsp; 00095 00096 /* make ph[] from wseq[] */ 00097 /* 1. calc total phone num and malloc */ 00098 phnum = 0; 00099 for (w=0;w<num;w++) phnum += winfo->wlen[wseq[w]]; 00100 ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * phnum); 00101 00102 if (hmminfo->multipath && enable_iwsp) { 00103 has_sp = (boolean *)mymalloc(sizeof(boolean) * phnum); 00104 } else { 00105 has_sp = NULL; 00106 } 00107 /* 2. make phoneme sequence */ 00108 st = 0; 00109 if (hmminfo->multipath) st++; 00110 pn = 0; 00111 endn = 0; 00112 for (w=0;w<num;w++) { 00113 tmpw = wseq[w]; 00114 for (i=0;i<winfo->wlen[tmpw];i++) { 00115 tmpp = winfo->wseq[tmpw][i]; 00116 /* handle cross-word context dependency */ 00117 if (r->ccd_flag) { 00118 if (w > 0 && i == 0) { /* word head */ 00119 00120 if ((ret = get_left_context_HMM(tmpp, ph[pn-1]->name, hmminfo)) != NULL) { 00121 tmpp = ret; 00122 } 00123 /* if triphone not found, fallback to bi/mono-phone */ 00124 /* use pseudo phone when no bi-phone found in alignment... 
*/ 00125 } 00126 if (w < num-1 && i == winfo->wlen[tmpw] - 1) { /* word tail */ 00127 if ((ret = get_right_context_HMM(tmpp, winfo->wseq[wseq[w+1]][0]->name, hmminfo)) != NULL) { 00128 tmpp = ret; 00129 } 00130 } 00131 } 00132 ph[pn] = tmpp; 00133 if (hmminfo->multipath && enable_iwsp) { 00134 if (i == winfo->wlen[tmpw] - 1) { 00135 has_sp[pn] = TRUE; 00136 } else { 00137 has_sp[pn] = FALSE; 00138 } 00139 } 00140 if (per_what == PER_STATE) { 00141 for (j=0;j<hmm_logical_state_num(tmpp)-2;j++) { 00142 (*end_ret)[endn++] = st + j; 00143 } 00144 if (hmminfo->multipath && enable_iwsp && has_sp[pn]) { 00145 for (k=0;k<hmm_logical_state_num(hmminfo->sp)-2;k++) { 00146 (*end_ret)[endn++] = st + j + k; 00147 } 00148 } 00149 } 00150 st += hmm_logical_state_num(tmpp) - 2; 00151 if (hmminfo->multipath && enable_iwsp && has_sp[pn]) { 00152 st += hmm_logical_state_num(hmminfo->sp) - 2; 00153 } 00154 if (per_what == PER_PHONEME) (*end_ret)[endn++] = st - 1; 00155 pn++; 00156 } 00157 if (per_what == PER_WORD) (*end_ret)[endn++] = st - 1; 00158 } 00159 *num_ret = phnum; 00160 *has_sp_ret = has_sp; 00161 return ph; 00162 } 00163 00164 00187 static void 00188 do_align(WORD_ID *words, short wnum, HTK_Param *param, int per_what, SentenceAlign *align, RecogProcess *r) 00189 { 00190 HMM_Logical **phones; /* phoneme sequence */ 00191 boolean *has_sp; /* whether phone can follow short pause */ 00192 int k; 00193 int phonenum; /* num of above */ 00194 HMM *shmm; /* sentence HMM */ 00195 int *end_state; /* state number of word ends */ 00196 int *end_frame; /* segmented last frame of words */ 00197 LOGPROB *end_score; /* normalized score of each words */ 00198 LOGPROB allscore; /* total score of this word sequence */ 00199 WORD_ID w; 00200 int i, rlen; 00201 int end_num = 0; 00202 int *id_seq, *phloc = NULL, *stloc = NULL; 00203 int j,n,p; 00204 WORD_INFO *winfo; 00205 HTK_HMM_INFO *hmminfo; 00206 boolean enable_iwsp; /* for multipath */ 00207 00208 winfo = r->lm->winfo; 00209 hmminfo = 
r->am->hmminfo; 00210 if (hmminfo->multipath) enable_iwsp = r->lm->config->enable_iwsp; 00211 00212 /* initialize result storage buffer */ 00213 switch(per_what) { 00214 case PER_WORD: 00215 jlog("ALIGN: === word alignment begin ===\n"); 00216 end_num = wnum; 00217 phloc = (int *)mymalloc(sizeof(int)*wnum); 00218 i = 0; 00219 for(w=0;w<wnum;w++) { 00220 phloc[w] = i; 00221 i += winfo->wlen[words[w]]; 00222 } 00223 break; 00224 case PER_PHONEME: 00225 jlog("ALIGN: === phoneme alignment begin ===\n"); 00226 end_num = 0; 00227 for(w=0;w<wnum;w++) end_num += winfo->wlen[words[w]]; 00228 break; 00229 case PER_STATE: 00230 jlog("ALIGN: === state alignment begin ===\n"); 00231 end_num = 0; 00232 for(w=0;w<wnum;w++) { 00233 for (i=0;i<winfo->wlen[words[w]]; i++) { 00234 end_num += hmm_logical_state_num(winfo->wseq[words[w]][i]) - 2; 00235 } 00236 if (hmminfo->multipath && enable_iwsp) { 00237 end_num += hmm_logical_state_num(hmminfo->sp) - 2; 00238 } 00239 } 00240 phloc = (int *)mymalloc(sizeof(int)*end_num); 00241 stloc = (int *)mymalloc(sizeof(int)*end_num); 00242 { 00243 n = 0; 00244 p = 0; 00245 for(w=0;w<wnum;w++) { 00246 for(i=0;i<winfo->wlen[words[w]]; i++) { 00247 for(j=0; j<hmm_logical_state_num(winfo->wseq[words[w]][i]) - 2; j++) { 00248 phloc[n] = p; 00249 stloc[n] = j + 1; 00250 n++; 00251 } 00252 if (hmminfo->multipath && enable_iwsp && i == winfo->wlen[words[w]] - 1) { 00253 for(k=0;k<hmm_logical_state_num(hmminfo->sp)-2;k++) { 00254 phloc[n] = p; 00255 stloc[n] = j + 1 + k + end_num; 00256 n++; 00257 } 00258 } 00259 p++; 00260 } 00261 } 00262 } 00263 00264 break; 00265 } 00266 end_state = (int *)mymalloc(sizeof(int) * end_num); 00267 00268 /* make phoneme sequence word sequence */ 00269 phones = make_phseq(words, wnum, &has_sp, &phonenum, &end_state, per_what, r); 00270 /* build the sentence HMMs */ 00271 shmm = new_make_word_hmm(hmminfo, phones, phonenum, has_sp); 00272 if (shmm == NULL) { 00273 j_internal_error("Error: failed to make word hmm for 
alignment\n"); 00274 } 00275 00276 /* call viterbi segmentation function */ 00277 allscore = viterbi_segment(shmm, param, r->wchmm->hmmwrk, hmminfo->multipath, end_state, end_num, &id_seq, &end_frame, &end_score, &rlen); 00278 00279 /* store result to s */ 00280 align->num = rlen; 00281 align->unittype = per_what; 00282 align->begin_frame = (int *)mymalloc(sizeof(int) * rlen); 00283 align->end_frame = (int *)mymalloc(sizeof(int) * rlen); 00284 align->avgscore = (LOGPROB *)mymalloc(sizeof(LOGPROB) * rlen); 00285 for(i=0;i<rlen;i++) { 00286 align->begin_frame[i] = (i == 0) ? 0 : end_frame[i-1] + 1; 00287 align->end_frame[i] = end_frame[i]; 00288 align->avgscore[i] = end_score[i]; 00289 } 00290 switch(per_what) { 00291 case PER_WORD: 00292 align->w = (WORD_ID *)mymalloc(sizeof(WORD_ID) * rlen); 00293 for(i=0;i<rlen;i++) { 00294 align->w[i] = words[id_seq[i]]; 00295 } 00296 break; 00297 case PER_PHONEME: 00298 align->ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * rlen); 00299 for(i=0;i<rlen;i++) { 00300 align->ph[i] = phones[id_seq[i]]; 00301 } 00302 break; 00303 case PER_STATE: 00304 align->ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * rlen); 00305 align->loc = (short *)mymalloc(sizeof(short) * rlen); 00306 if (hmminfo->multipath) align->is_iwsp = (boolean *)mymalloc(sizeof(boolean) * rlen); 00307 for(i=0;i<rlen;i++) { 00308 align->ph[i] = phones[phloc[id_seq[i]]]; 00309 if (hmminfo->multipath) { 00310 if (enable_iwsp && stloc[id_seq[i]] > end_num) { 00311 align->loc[i] = stloc[id_seq[i]] - end_num; 00312 align->is_iwsp[i] = TRUE; 00313 } else { 00314 align->loc[i] = stloc[id_seq[i]]; 00315 align->is_iwsp[i] = FALSE; 00316 } 00317 } else { 00318 align->loc[i] = stloc[id_seq[i]]; 00319 } 00320 } 00321 break; 00322 } 00323 00324 align->allscore = allscore; 00325 00326 free_hmm(shmm); 00327 free(id_seq); 00328 free(phones); 00329 if (has_sp) free(has_sp); 00330 free(end_score); 00331 free(end_frame); 00332 free(end_state); 00333 00334 switch(per_what) { 
00335 case PER_WORD: 00336 free(phloc); 00337 break; 00338 case PER_PHONEME: 00339 break; 00340 case PER_STATE: 00341 free(phloc); 00342 free(stloc); 00343 } 00344 00345 } 00346 00369 void 00370 word_align(WORD_ID *words, short wnum, HTK_Param *param, SentenceAlign *align, RecogProcess *r) 00371 { 00372 do_align(words, wnum, param, PER_WORD, align, r); 00373 } 00374 00397 void 00398 word_rev_align(WORD_ID *revwords, short wnum, HTK_Param *param, SentenceAlign *align, RecogProcess *r) 00399 { 00400 WORD_ID *words; /* word sequence (true order) */ 00401 int w; 00402 words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wnum); 00403 for (w=0;w<wnum;w++) words[w] = revwords[wnum-w-1]; 00404 do_align(words, wnum, param, PER_WORD, align, r); 00405 free(words); 00406 } 00407 00430 void 00431 phoneme_align(WORD_ID *words, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r) 00432 { 00433 do_align(words, num, param, PER_PHONEME, align, r); 00434 } 00435 00458 void 00459 phoneme_rev_align(WORD_ID *revwords, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r) 00460 { 00461 WORD_ID *words; /* word sequence (true order) */ 00462 int p; 00463 words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * num); 00464 for (p=0;p<num;p++) words[p] = revwords[num-p-1]; 00465 do_align(words, num, param, PER_PHONEME, align, r); 00466 free(words); 00467 } 00468 00491 void 00492 state_align(WORD_ID *words, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r) 00493 { 00494 do_align(words, num, param, PER_STATE, align, r); 00495 } 00496 00519 void 00520 state_rev_align(WORD_ID *revwords, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r) 00521 { 00522 WORD_ID *words; /* word sequence (true order) */ 00523 int p; 00524 words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * num); 00525 for (p=0;p<num;p++) words[p] = revwords[num-p-1]; 00526 do_align(words, num, param, PER_STATE, align, r); 00527 free(words); 00528 } 00529 00546 void 00547 
do_alignment_all(RecogProcess *r, HTK_Param *param) 00548 { 00549 int n; 00550 Sentence *s; 00551 SentenceAlign *now, *prev; 00552 00553 for(n = 0; n < r->result.sentnum; n++) { 00554 s = &(r->result.sent[n]); 00555 /* do forced alignment if needed */ 00556 if (r->config->annotate.align_result_word_flag) { 00557 now = result_align_new(); 00558 word_align(s->word, s->word_num, param, now, r); 00559 if (s->align == NULL) s->align = now; 00560 else prev->next = now; 00561 prev = now; 00562 } 00563 if (r->config->annotate.align_result_phoneme_flag) { 00564 now = result_align_new(); 00565 phoneme_align(s->word, s->word_num, param, now, r); 00566 if (s->align == NULL) s->align = now; 00567 else prev->next = now; 00568 prev = now; 00569 } 00570 if (r->config->annotate.align_result_state_flag) { 00571 now = result_align_new(); 00572 state_align(s->word, s->word_num, param, now, r); 00573 if (s->align == NULL) s->align = now; 00574 else prev->next = now; 00575 prev = now; 00576 } 00577 } 00578 } 00579 00580 /* end of file */