Julius 4.1.5
|
00001 00076 /* 00077 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University 00078 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology 00079 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology 00080 * All rights reserved 00081 */ 00082 00083 /* 00084 */ 00085 00086 #ifndef __J_RECOG_H__ 00087 #define __J_RECOG_H__ 00088 00089 #include <sent/stddefs.h> 00090 #include <sent/hmm.h> 00091 #include <sent/vocabulary.h> 00092 #include <sent/ngram2.h> 00093 #include <sent/dfa.h> 00094 #include <julius/wchmm.h> 00095 #include <julius/search.h> 00096 #include <julius/callback.h> 00097 #include <julius/jconf.h> 00098 00099 /* 00100 How tokens are managed: 00101 o tlist[][] is a token stocker. It holds all tokens in sequencial 00102 buffer. They are malloced first on startup, and refered by ID while 00103 Viterbi procedure. In word-pair mode, each token also has a link to 00104 another token to allow a node to have more than 1 token. 00105 00106 o token[n] holds the current ID number of a token associated to a 00107 lexicon tree node 'n'. 00108 00109 */ 00114 typedef struct __FSBeam__ { 00115 /* token stocker */ 00116 TOKEN2 *tlist[2]; 00117 TOKENID *tindex[2]; 00118 int maxtnum; 00119 int expand_step; 00120 boolean expanded; 00121 int tnum[2]; 00122 int n_start; 00123 int n_end; 00124 int tl; 00125 int tn; 00126 00127 /* Active token list */ 00128 TOKENID *token; 00129 #ifdef UNIGRAM_FACTORING 00130 /* for wordend processing with 1-gram factoring */ 00131 LOGPROB wordend_best_score; 00132 int wordend_best_node; 00133 TRELLIS_ATOM *wordend_best_tre; 00134 WORD_ID wordend_best_last_cword; 00135 #endif 00136 00137 int totalnodenum; 00138 TRELLIS_ATOM bos; 00139 boolean nodes_malloced; 00140 LOGPROB lm_weight; 00141 LOGPROB lm_penalty; 00142 LOGPROB lm_penalty_trans; 00143 LOGPROB penalty1; 00144 #if defined(WPAIR) && defined(WPAIR_KEEP_NLIMIT) 00145 boolean wpair_keep_nlimit; 00146 #endif 00147 /* for short-pause segmentation */ 00148 boolean in_sparea; 00149 int tmp_sparea_start; 00150 #ifdef SP_BREAK_RESUME_WORD_BEGIN 00151 WORD_ID tmp_sp_break_last_word; 00152 #else 00153 WORD_ID last_tre_word; 00154 #endif 00155 boolean first_sparea; 00156 int sp_duration; 00157 #ifdef SPSEGMENT_NAIST 00158 boolean after_trigger; 00159 int trigger_duration; 00160 boolean want_rewind; 00161 int rewind_frame; 00162 boolean want_rewind_reprocess; 00163 #endif 00164 char *pausemodelnames; 00165 char **pausemodel; 00166 int pausemodelnum; 00167 } FSBeam; 00168 00169 00174 typedef struct __RealBeam__ { 00175 /* input parameter */ 00176 int maxframelen; 00177 00178 SP16 *window; 00179 int windowlen; 00180 int windownum; 00181 00182 /* for short-pause segmentation */ 00183 boolean last_is_segmented; 00184 SP16 *rest_Speech; 00185 int rest_alloc_len; 00186 int rest_len; 00187 00188 } RealBeam; 00189 00194 typedef struct __StackDecode__ { 00195 int hypo_len_count[MAXSEQNUM+1]; 00196 int maximum_filled_length; 00197 #ifdef SCAN_BEAM 00198 LOGPROB *framemaxscore; 00199 #endif 00200 NODE *stocker_root; 00201 int popctr; 00202 int genectr; 00203 int pushctr; 00204 int finishnum; 00205 NODE *current; 00206 00207 #ifdef CONFIDENCE_MEASURE 00208 LOGPROB cm_alpha; 00209 # ifdef CM_MULTIPLE_ALPHA 00210 LOGPROB *cmsumlist; 00211 int cmsumlistlen; 00212 # endif 00213 # ifdef CM_SEARCH 00214 LOGPROB cm_tmpbestscore; 00215 # ifndef CM_MULTIPLE_ALPHA 00216 LOGPROB cm_tmpsum; 00217 # endif 00218 int l_stacksize; 00219 int l_stacknum; 00220 NODE *l_start; 00221 NODE *l_bottom; 00222 # endif 00223 # ifdef CM_NBEST 00224 LOGPROB *sentcm = NULL; 00225 LOGPROB *wordcm = NULL; 00226 int sentnum; 00227 # endif 00228 #endif /* CONFIDENCE_MEASURE */ 00229 00230 LOGPROB *wordtrellis[2]; 00231 LOGPROB *g; 00232 HMM_Logical **phmmseq; 00233 int phmmlen_max; 00234 boolean *has_sp; 00235 #ifdef GRAPHOUT_PRECISE_BOUNDARY 00236 short *wend_token_frame[2]; 00237 LOGPROB *wend_token_gscore[2]; 00238 short *wef; 00239 LOGPROB *wes; 00240 #endif 00241 WORD_ID *cnword; 00242 WORD_ID *cnwordrev; 00243 00244 } StackDecode; 00245 00250 typedef struct { 00251 LOGPROB (*uniprob)(WORD_INFO *, WORD_ID, LOGPROB); 00252 LOGPROB (*biprob)(WORD_INFO *, WORD_ID, WORD_ID, LOGPROB); 00253 LOGPROB (*lmprob)(WORD_INFO *, WORD_ID *, int, WORD_ID, LOGPROB); 00254 } LMFunc; 00255 00260 typedef struct __gmm_calc__{ 00261 LOGPROB *gmm_score; 00262 boolean *is_voice; 00263 int framecount; 00264 00265 short OP_nstream; 00266 VECT *OP_vec_stream[MAXSTREAMNUM]; 00267 short OP_veclen_stream[MAXSTREAMNUM]; 00268 00269 LOGPROB *OP_calced_score; 00270 int *OP_calced_id; 00271 int OP_calced_num; 00272 int OP_calced_maxnum; 00273 int OP_gprune_num; 00274 VECT *OP_vec; 00275 short OP_veclen; 00276 HTK_HMM_Data *max_d; 00277 int max_i; 00278 #ifdef CONFIDENCE_MEASURE 00279 LOGPROB gmm_max_cm; 00280 #endif 00281 #ifdef GMM_VAD 00282 LOGPROB *rates; 00283 int nframe; 00284 boolean filled; 00285 int framep; 00286 00287 boolean in_voice; 00288 boolean up_trigger; 00289 boolean down_trigger; 00290 boolean after_trigger; 00291 boolean want_rewind; 00292 boolean want_rewind_reprocess; 00293 int rewind_frame; 00294 int duration; 00295 #endif 00296 } GMMCalc; 00297 00302 typedef struct __sentence_align__ { 00303 int num; 00304 short unittype; 00305 WORD_ID *w; 00306 HMM_Logical **ph; 00307 short *loc; 00308 boolean *is_iwsp; 00309 int *begin_frame; 00310 int *end_frame; 00311 LOGPROB *avgscore; 00312 LOGPROB allscore; 00313 struct __sentence_align__ *next; 00314 } SentenceAlign; 00315 00320 typedef struct __sentence__ { 00321 WORD_ID word[MAXSEQNUM]; 00322 int word_num; 00323 LOGPROB score; 00324 LOGPROB confidence[MAXSEQNUM]; 00325 LOGPROB score_lm; 00326 LOGPROB score_am; 00327 int gram_id; 00328 SentenceAlign *align; 00329 00330 } Sentence; 00331 00336 typedef struct __adin__ { 00337 /* functions */ 00339 boolean (*ad_standby)(int, void *); 00341 boolean (*ad_begin)(char *); 00343 boolean (*ad_end)(); 00345 boolean (*ad_resume)(); 00347 boolean (*ad_pause)(); 00349 boolean (*ad_terminate)(); 00351 int (*ad_read)(SP16 *, int); 00353 char * (*ad_input_name)(); 00354 00355 /* configuration parameters */ 00356 int thres; 00357 int noise_zerocross; 00358 int nc_max; 00359 boolean adin_cut_on; 00360 boolean silence_cut_default; 00361 boolean strip_flag; 00362 boolean enable_thread; 00363 boolean need_zmean; 00364 00365 /* work area */ 00366 int c_length; 00367 int c_offset; 00368 SP16 *swapbuf; 00369 int sbsize; 00370 int sblen; 00371 int rest_tail; 00372 00373 ZEROCROSS zc; 00374 00375 #ifdef HAVE_PTHREAD 00376 /* Variables related to POSIX threading */ 00377 pthread_t adin_thread; 00378 pthread_mutex_t mutex; 00379 SP16 *speech; 00380 int speechlen; 00381 /* 00382 * Semaphore to start/stop recognition. 00383 * 00384 * If TRUE, A/D-in thread will store incoming samples to @a speech and 00385 * main thread will detect and process them. 00386 * If FALSE, A/D-in thread will still get input and check trigger as the same 00387 * as TRUE case, but does not store them to @a speech. 00388 * 00389 */ 00390 boolean transfer_online; 00395 boolean adinthread_buffer_overflowed; 00400 boolean adinthread_ended; 00401 00402 boolean ignore_speech_while_recog; 00403 00404 #endif 00405 00406 /* Input data buffer */ 00407 SP16 *buffer; 00408 int bpmax; 00409 int bp; 00410 int current_len; 00411 SP16 *cbuf; 00412 boolean down_sample; 00413 SP16 *buffer48; 00414 int io_rate; 00415 00416 boolean is_valid_data; 00417 int nc; 00418 boolean end_of_stream; 00419 boolean need_init; 00420 00421 DS_BUFFER *ds; 00422 00423 boolean rehash; 00424 00425 boolean input_side_segment; 00426 00427 unsigned int total_captured_len; 00428 unsigned int last_trigger_sample; 00429 00430 char current_input_name[MAXPATHLEN]; 00431 00432 } ADIn; 00433 00439 typedef struct __Output__ { 00448 int status; 00449 00450 int num_frame; 00451 int length_msec; 00452 00453 Sentence *sent; 00454 int sentnum; 00455 00456 WordGraph *wg1; 00457 int wg1_num; 00458 00459 WordGraph *wg; 00460 00461 CN_CLUSTER *confnet; 00462 00463 Sentence pass1; 00464 00465 } Output; 00466 00467 00468 /**********************************************************************/ 00469 /**********************************************************************/ 00470 /**********************************************************************/ 00471 00476 typedef struct __mfcc_calc__ { 00477 00482 short id; 00483 00488 Value *para; 00489 00494 boolean htk_loaded; 00499 boolean hmm_loaded; 00500 00505 boolean paramtype_check_flag; 00506 00511 MFCCWork *wrk; 00512 00517 HTK_Param *param; 00518 00522 HTK_Param *rest_param; 00523 00528 struct { 00532 char *load_filename; 00537 boolean update; 00541 char *save_filename; 00545 float map_weight; 00546 00550 boolean loaded; 00551 00556 CMNWork *wrk; 00557 00558 } cmn; 00559 00564 struct { 00568 float *ssbuf; 00569 00573 int sslen; 00574 00579 float ss_alpha; 00580 00585 float ss_floor; 00586 00590 boolean sscalc; 00591 00595 int sscalc_len; 00596 00600 char *ssload_filename; 00601 00606 MFCCWork *mfccwrk_ss; 00607 00608 } frontend; 00609 00614 ENERGYWork ewrk; 00615 00620 DeltaBuf *db; 00625 DeltaBuf *ab; 00630 VECT *tmpmfcc; 00631 00637 boolean valid; 00638 00643 int f; 00644 00649 int last_time; 00650 00655 int sparea_start; 00656 00661 boolean segmented; 00662 00667 boolean segmented_by_input; 00668 00673 int plugin_source; 00674 00679 struct { 00681 boolean (*fv_standby)(); 00683 boolean (*fv_begin)(); 00685 int (*fv_read)(VECT *, int); 00687 boolean (*fv_end)(); 00689 boolean (*fv_resume)(); 00691 boolean (*fv_pause)(); 00693 boolean (*fv_terminate)(); 00695 char * (*fv_input_name)(); 00696 } func; 00697 00698 #ifdef POWER_REJECT 00699 float avg_power; 00700 #endif 00701 00706 struct __mfcc_calc__ *next; 00707 00708 } MFCCCalc; 00709 00714 typedef struct __process_am__ { 00715 00720 JCONF_AM *config; 00721 00726 MFCCCalc *mfcc; 00727 00731 HTK_HMM_INFO *hmminfo; 00732 00736 HTK_HMM_INFO *hmm_gs; 00737 00741 HMMWork hmmwrk; 00742 00747 struct __process_am__ *next; 00748 00749 } PROCESS_AM; 00750 00755 typedef struct __process_lm__ { 00756 00761 JCONF_LM *config; 00762 00767 PROCESS_AM *am; 00768 00769 00774 int lmtype; 00775 00781 int lmvar; 00782 00786 WORD_INFO *winfo; 00787 00791 NGRAM_INFO *ngram; 00792 00796 MULTIGRAM *grammars; 00797 00803 int gram_maxid; 00804 00809 DFA_INFO *dfa; 00810 00815 boolean global_modified; 00816 00821 LMFunc lmfunc; 00822 00827 struct __process_lm__ *next; 00828 00829 } PROCESS_LM; 00830 00835 typedef struct __recogprocess__ { 00836 00841 boolean live; 00842 00849 short active; 00850 00855 JCONF_SEARCH *config; 00856 00861 PROCESS_AM *am; 00862 00867 PROCESS_LM *lm; 00868 00873 int lmtype; 00874 00880 int lmvar; 00881 00885 boolean ccd_flag; 00886 00890 WCHMM_INFO *wchmm; 00891 00895 int trellis_beam_width; 00896 00900 BACKTRELLIS *backtrellis; 00901 00905 FSBeam pass1; 00906 00911 StackDecode pass2; 00912 00916 WORD_ID pass1_wseq[MAXSEQNUM]; 00917 00921 int pass1_wnum; 00922 00926 LOGPROB pass1_score; 00927 00931 WORD_ID sp_break_last_word; 00935 WORD_ID sp_break_last_nword; 00939 boolean sp_break_last_nword_allow_override; 00943 WORD_ID sp_break_2_begin_word; 00947 WORD_ID sp_break_2_end_word; 00948 00952 int peseqlen; 00953 00957 int graph_totalwordnum; 00958 00963 Output result; 00964 00969 boolean graphout; 00970 00976 char *order_matrix; 00977 00983 int order_matrix_count; 00984 00985 #ifdef DETERMINE 00986 int determine_count; 00987 LOGPROB determine_maxnodescore; 00988 boolean determined; 00989 LOGPROB determine_last_wid; 00990 boolean have_determine; 00991 #endif 00992 00997 boolean have_interim; 00998 01003 void *hook; 01004 01009 struct __recogprocess__ *next; 01010 01011 } RecogProcess; 01012 01017 typedef struct __Recog__ { 01018 01019 /*******************************************/ 01024 Jconf *jconf; 01025 01026 /*******************************************/ 01031 ADIn *adin; 01032 01036 RealBeam real; 01037 01042 MFCCCalc *mfcclist; 01043 01048 PROCESS_AM *amlist; 01049 01054 PROCESS_LM *lmlist; 01055 01060 RecogProcess *process_list; 01061 01062 01067 boolean process_segment; 01068 01069 /*******************************************/ 01070 /* inputs */ 01071 01075 SP16 *speech; 01076 01081 int speechalloclen; 01082 01086 int speechlen; 01087 01091 int peseqlen; 01092 01093 /*******************************************/ 01094 01099 HTK_HMM_INFO *gmm; 01100 01105 MFCCCalc *gmmmfcc; 01106 01111 GMMCalc *gc; 01112 01113 /*******************************************/ 01114 /* misc. */ 01115 01127 boolean process_active; 01128 01134 boolean process_want_terminate; 01135 01143 boolean process_want_reload; 01144 01150 short gram_switch_input_method; 01151 01158 boolean process_online; 01159 01165 boolean (*calc_vector)(MFCCCalc *, SP16 *, int); 01166 01172 boolean triggered; 01173 01178 void (*callback_function[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK])(); 01183 void *callback_user_data[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK]; 01188 int callback_function_num[SIZEOF_CALLBACK_ID]; 01193 int callback_list_code[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID]; 01198 int callback_list_loc[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID]; 01203 int callback_num; 01204 01205 /*******************************************/ 01206 01211 void *hook; 01212 01213 } Recog; 01214 01215 #endif /* __J_RECOG_H__ */