Julius 4.2
|
00001 00076 /* 00077 * Copyright (c) 1991-2011 Kawahara Lab., Kyoto University 00078 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology 00079 * Copyright (c) 2005-2011 Julius project team, Nagoya Institute of Technology 00080 * All rights reserved 00081 */ 00082 00083 /* 00084 */ 00085 00086 #ifndef __J_RECOG_H__ 00087 #define __J_RECOG_H__ 00088 00089 #include <sent/stddefs.h> 00090 #include <sent/hmm.h> 00091 #include <sent/vocabulary.h> 00092 #include <sent/ngram2.h> 00093 #include <sent/dfa.h> 00094 #include <julius/wchmm.h> 00095 #include <julius/search.h> 00096 #include <julius/callback.h> 00097 #include <julius/jconf.h> 00098 00099 /* 00100 How tokens are managed: 00101 o tlist[][] is a token stocker. It holds all tokens in sequencial 00102 buffer. They are malloced first on startup, and refered by ID while 00103 Viterbi procedure. In word-pair mode, each token also has a link to 00104 another token to allow a node to have more than 1 token. 00105 00106 o token[n] holds the current ID number of a token associated to a 00107 lexicon tree node 'n'. 00108 00109 */ 00114 typedef struct __FSBeam__ { 00115 /* token stocker */ 00116 TOKEN2 *tlist[2]; 00117 TOKENID *tindex[2]; 00118 int maxtnum; 00119 int expand_step; 00120 boolean expanded; 00121 int tnum[2]; 00122 int n_start; 00123 int n_end; 00124 int tl; 00125 int tn; 00126 #ifdef SCORE_PRUNING 00127 LOGPROB score_pruning_max; 00128 LOGPROB score_pruning_threshold; 00129 int score_pruning_count; 00130 #endif 00131 00132 /* Active token list */ 00133 TOKENID *token; 00134 #ifdef UNIGRAM_FACTORING 00135 /* for wordend processing with 1-gram factoring */ 00136 LOGPROB wordend_best_score; 00137 int wordend_best_node; 00138 TRELLIS_ATOM *wordend_best_tre; 00139 WORD_ID wordend_best_last_cword; 00140 #endif 00141 00142 int totalnodenum; 00143 TRELLIS_ATOM bos; 00144 boolean nodes_malloced; 00145 LOGPROB lm_weight; 00146 LOGPROB lm_penalty; 00147 LOGPROB lm_penalty_trans; 00148 LOGPROB penalty1; 00149 #if defined(WPAIR) && defined(WPAIR_KEEP_NLIMIT) 00150 boolean wpair_keep_nlimit; 00151 #endif 00152 /* for short-pause segmentation */ 00153 boolean in_sparea; 00154 int tmp_sparea_start; 00155 #ifdef SP_BREAK_RESUME_WORD_BEGIN 00156 WORD_ID tmp_sp_break_last_word; 00157 #else 00158 WORD_ID last_tre_word; 00159 #endif 00160 boolean first_sparea; 00161 int sp_duration; 00162 #ifdef SPSEGMENT_NAIST 00163 boolean after_trigger; 00164 int trigger_duration; 00165 boolean want_rewind; 00166 int rewind_frame; 00167 boolean want_rewind_reprocess; 00168 #endif 00169 char *pausemodelnames; 00170 char **pausemodel; 00171 int pausemodelnum; 00172 } FSBeam; 00173 00174 00179 typedef struct __RealBeam__ { 00180 /* input parameter */ 00181 int maxframelen; 00182 00183 SP16 *window; 00184 int windowlen; 00185 int windownum; 00186 00187 /* for short-pause segmentation */ 00188 boolean last_is_segmented; 00189 SP16 *rest_Speech; 00190 int rest_alloc_len; 00191 int rest_len; 00192 00193 } RealBeam; 00194 00199 typedef struct __StackDecode__ { 00200 int hypo_len_count[MAXSEQNUM+1]; 00201 int maximum_filled_length; 00202 #ifdef SCAN_BEAM 00203 LOGPROB *framemaxscore; 00204 #endif 00205 NODE *stocker_root; 00206 int popctr; 00207 int genectr; 00208 int pushctr; 00209 int finishnum; 00210 NODE *current; 00211 00212 #ifdef CONFIDENCE_MEASURE 00213 LOGPROB cm_alpha; 00214 # ifdef CM_MULTIPLE_ALPHA 00215 LOGPROB *cmsumlist; 00216 int cmsumlistlen; 00217 # endif 00218 # ifdef CM_SEARCH 00219 LOGPROB cm_tmpbestscore; 00220 # ifndef CM_MULTIPLE_ALPHA 00221 LOGPROB cm_tmpsum; 00222 # endif 00223 int l_stacksize; 00224 int l_stacknum; 00225 NODE *l_start; 00226 NODE *l_bottom; 00227 # endif 00228 # ifdef CM_NBEST 00229 LOGPROB *sentcm = NULL; 00230 LOGPROB *wordcm = NULL; 00231 int sentnum; 00232 int wordnum; 00233 # endif 00234 #endif /* CONFIDENCE_MEASURE */ 00235 00236 LOGPROB *wordtrellis[2]; 00237 LOGPROB *g; 00238 HMM_Logical **phmmseq; 00239 int phmmlen_max; 00240 boolean *has_sp; 00241 #ifdef GRAPHOUT_PRECISE_BOUNDARY 00242 short *wend_token_frame[2]; 00243 LOGPROB *wend_token_gscore[2]; 00244 short *wef; 00245 LOGPROB *wes; 00246 #endif 00247 WORD_ID *cnword; 00248 WORD_ID *cnwordrev; 00249 00250 } StackDecode; 00251 00256 typedef struct { 00257 LOGPROB (*uniprob)(WORD_INFO *, WORD_ID, LOGPROB); 00258 LOGPROB (*biprob)(WORD_INFO *, WORD_ID, WORD_ID, LOGPROB); 00259 LOGPROB (*lmprob)(WORD_INFO *, WORD_ID *, int, WORD_ID, LOGPROB); 00260 } LMFunc; 00261 00266 typedef struct __gmm_calc__{ 00267 LOGPROB *gmm_score; 00268 boolean *is_voice; 00269 int framecount; 00270 00271 short OP_nstream; 00272 VECT *OP_vec_stream[MAXSTREAMNUM]; 00273 short OP_veclen_stream[MAXSTREAMNUM]; 00274 00275 LOGPROB *OP_calced_score; 00276 int *OP_calced_id; 00277 int OP_calced_num; 00278 int OP_calced_maxnum; 00279 int OP_gprune_num; 00280 VECT *OP_vec; 00281 short OP_veclen; 00282 HTK_HMM_Data *max_d; 00283 int max_i; 00284 #ifdef CONFIDENCE_MEASURE 00285 LOGPROB gmm_max_cm; 00286 #endif 00287 #ifdef GMM_VAD 00288 LOGPROB *rates; 00289 int nframe; 00290 boolean filled; 00291 int framep; 00292 00293 boolean in_voice; 00294 boolean up_trigger; 00295 boolean down_trigger; 00296 boolean after_trigger; 00297 boolean want_rewind; 00298 boolean want_rewind_reprocess; 00299 int rewind_frame; 00300 int duration; 00301 #endif 00302 } GMMCalc; 00303 00308 typedef struct __sentence_align__ { 00309 int num; 00310 short unittype; 00311 WORD_ID *w; 00312 HMM_Logical **ph; 00313 short *loc; 00314 boolean *is_iwsp; 00315 int *begin_frame; 00316 int *end_frame; 00317 LOGPROB *avgscore; 00318 LOGPROB allscore; 00319 struct __sentence_align__ *next; 00320 } SentenceAlign; 00321 00326 typedef struct __sentence__ { 00327 WORD_ID word[MAXSEQNUM]; 00328 int word_num; 00329 LOGPROB score; 00330 LOGPROB confidence[MAXSEQNUM]; 00331 LOGPROB score_lm; 00332 LOGPROB score_am; 00333 int gram_id; 00334 SentenceAlign *align; 00335 00336 } Sentence; 00337 00342 typedef struct __adin__ { 00343 /* functions */ 00345 boolean (*ad_standby)(int, void *); 00347 boolean (*ad_begin)(char *); 00349 boolean (*ad_end)(); 00351 boolean (*ad_resume)(); 00353 boolean (*ad_pause)(); 00355 boolean (*ad_terminate)(); 00357 int (*ad_read)(SP16 *, int); 00359 char * (*ad_input_name)(); 00360 00361 /* configuration parameters */ 00362 int thres; 00363 int noise_zerocross; 00364 int nc_max; 00365 boolean adin_cut_on; 00366 boolean silence_cut_default; 00367 boolean strip_flag; 00368 boolean enable_thread; 00369 boolean need_zmean; 00370 00371 /* work area */ 00372 int c_length; 00373 int c_offset; 00374 SP16 *swapbuf; 00375 int sbsize; 00376 int sblen; 00377 int rest_tail; 00378 00379 ZEROCROSS zc; 00380 00381 #ifdef HAVE_PTHREAD 00382 /* Variables related to POSIX threading */ 00383 pthread_t adin_thread; 00384 pthread_mutex_t mutex; 00385 SP16 *speech; 00386 int speechlen; 00387 /* 00388 * Semaphore to start/stop recognition. 00389 * 00390 * If TRUE, A/D-in thread will store incoming samples to @a speech and 00391 * main thread will detect and process them. 00392 * If FALSE, A/D-in thread will still get input and check trigger as the same 00393 * as TRUE case, but does not store them to @a speech. 00394 * 00395 */ 00396 boolean transfer_online; 00401 boolean adinthread_buffer_overflowed; 00406 boolean adinthread_ended; 00407 00408 boolean ignore_speech_while_recog; 00409 00410 #endif 00411 00412 /* Input data buffer */ 00413 SP16 *buffer; 00414 int bpmax; 00415 int bp; 00416 int current_len; 00417 SP16 *cbuf; 00418 boolean down_sample; 00419 SP16 *buffer48; 00420 int io_rate; 00421 00422 boolean is_valid_data; 00423 int nc; 00424 boolean end_of_stream; 00425 boolean need_init; 00426 00427 DS_BUFFER *ds; 00428 00429 boolean rehash; 00430 00431 boolean input_side_segment; 00432 00433 unsigned int total_captured_len; 00434 unsigned int last_trigger_sample; 00435 00436 char current_input_name[MAXPATHLEN]; 00437 00438 } ADIn; 00439 00445 typedef struct __Output__ { 00454 int status; 00455 00456 int num_frame; 00457 int length_msec; 00458 00459 Sentence *sent; 00460 int sentnum; 00461 00462 WordGraph *wg1; 00463 int wg1_num; 00464 00465 WordGraph *wg; 00466 00467 CN_CLUSTER *confnet; 00468 00469 Sentence pass1; 00470 00471 } Output; 00472 00473 00474 /**********************************************************************/ 00475 /**********************************************************************/ 00476 /**********************************************************************/ 00477 00482 typedef struct __mfcc_calc__ { 00483 00488 short id; 00489 00494 Value *para; 00495 00500 boolean htk_loaded; 00505 boolean hmm_loaded; 00506 00511 boolean paramtype_check_flag; 00512 00517 MFCCWork *wrk; 00518 00523 HTK_Param *param; 00524 00528 HTK_Param *rest_param; 00529 00534 struct { 00538 char *load_filename; 00543 boolean update; 00547 char *save_filename; 00551 float map_weight; 00552 00556 boolean loaded; 00557 00562 CMNWork *wrk; 00563 00564 } cmn; 00565 00570 struct { 00574 float *ssbuf; 00575 00579 int sslen; 00580 00585 float ss_alpha; 00586 00591 float ss_floor; 00592 00596 boolean sscalc; 00597 00601 int sscalc_len; 00602 00606 char *ssload_filename; 00607 00612 MFCCWork *mfccwrk_ss; 00613 00614 } frontend; 00615 00620 ENERGYWork ewrk; 00621 00626 DeltaBuf *db; 00631 DeltaBuf *ab; 00636 VECT *tmpmfcc; 00637 00643 boolean valid; 00644 00649 int f; 00650 00655 int last_time; 00656 00661 int sparea_start; 00662 00667 boolean segmented; 00668 00673 boolean segmented_by_input; 00674 00679 int plugin_source; 00680 00685 struct { 00687 boolean (*fv_standby)(); 00689 boolean (*fv_begin)(); 00691 int (*fv_read)(VECT *, int); 00693 boolean (*fv_end)(); 00695 boolean (*fv_resume)(); 00697 boolean (*fv_pause)(); 00699 boolean (*fv_terminate)(); 00701 char * (*fv_input_name)(); 00702 } func; 00703 00704 #ifdef POWER_REJECT 00705 float avg_power; 00706 #endif 00707 00712 struct __mfcc_calc__ *next; 00713 00714 } MFCCCalc; 00715 00720 typedef struct __process_am__ { 00721 00726 JCONF_AM *config; 00727 00732 MFCCCalc *mfcc; 00733 00737 HTK_HMM_INFO *hmminfo; 00738 00742 HTK_HMM_INFO *hmm_gs; 00743 00747 HMMWork hmmwrk; 00748 00753 struct __process_am__ *next; 00754 00755 } PROCESS_AM; 00756 00761 typedef struct __process_lm__ { 00762 00767 JCONF_LM *config; 00768 00773 PROCESS_AM *am; 00774 00775 00780 int lmtype; 00781 00787 int lmvar; 00788 00792 WORD_INFO *winfo; 00793 00797 NGRAM_INFO *ngram; 00798 00802 MULTIGRAM *grammars; 00803 00809 int gram_maxid; 00810 00815 DFA_INFO *dfa; 00816 00821 boolean global_modified; 00822 00827 LMFunc lmfunc; 00828 00833 struct __process_lm__ *next; 00834 00835 } PROCESS_LM; 00836 00841 typedef struct __recogprocess__ { 00842 00847 boolean live; 00848 00855 short active; 00856 00861 JCONF_SEARCH *config; 00862 00867 PROCESS_AM *am; 00868 00873 PROCESS_LM *lm; 00874 00879 int lmtype; 00880 00886 int lmvar; 00887 00891 boolean ccd_flag; 00892 00896 WCHMM_INFO *wchmm; 00897 00901 int trellis_beam_width; 00902 00906 BACKTRELLIS *backtrellis; 00907 00911 FSBeam pass1; 00912 00917 StackDecode pass2; 00918 00922 WORD_ID pass1_wseq[MAXSEQNUM]; 00923 00927 int pass1_wnum; 00928 00932 LOGPROB pass1_score; 00933 00937 WORD_ID sp_break_last_word; 00941 WORD_ID sp_break_last_nword; 00945 boolean sp_break_last_nword_allow_override; 00949 WORD_ID sp_break_2_begin_word; 00953 WORD_ID sp_break_2_end_word; 00954 00958 int peseqlen; 00959 00963 int graph_totalwordnum; 00964 00969 Output result; 00970 00975 boolean graphout; 00976 00982 char *order_matrix; 00983 00989 int order_matrix_count; 00990 00991 #ifdef DETERMINE 00992 int determine_count; 00993 LOGPROB determine_maxnodescore; 00994 boolean determined; 00995 LOGPROB determine_last_wid; 00996 boolean have_determine; 00997 #endif 00998 01003 boolean have_interim; 01004 01009 void *hook; 01010 01015 struct __recogprocess__ *next; 01016 01017 } RecogProcess; 01018 01023 typedef struct __Recog__ { 01024 01025 /*******************************************/ 01030 Jconf *jconf; 01031 01032 /*******************************************/ 01037 ADIn *adin; 01038 01042 RealBeam real; 01043 01048 MFCCCalc *mfcclist; 01049 01054 PROCESS_AM *amlist; 01055 01060 PROCESS_LM *lmlist; 01061 01066 RecogProcess *process_list; 01067 01068 01073 boolean process_segment; 01074 01075 /*******************************************/ 01076 /* inputs */ 01077 01081 SP16 *speech; 01082 01087 int speechalloclen; 01088 01092 int speechlen; 01093 01097 int peseqlen; 01098 01099 /*******************************************/ 01100 01105 HTK_HMM_INFO *gmm; 01106 01111 MFCCCalc *gmmmfcc; 01112 01117 GMMCalc *gc; 01118 01119 /*******************************************/ 01120 /* misc. */ 01121 01133 boolean process_active; 01134 01140 boolean process_want_terminate; 01141 01149 boolean process_want_reload; 01150 01156 short gram_switch_input_method; 01157 01164 boolean process_online; 01165 01171 boolean (*calc_vector)(MFCCCalc *, SP16 *, int); 01172 01178 boolean triggered; 01179 01184 void (*callback_function[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK])(); 01189 void *callback_user_data[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK]; 01194 int callback_function_num[SIZEOF_CALLBACK_ID]; 01199 int callback_list_code[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID]; 01204 int callback_list_loc[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID]; 01209 int callback_num; 01210 01211 /*******************************************/ 01212 01217 void *hook; 01218 01219 } Recog; 01220 01221 #endif /* __J_RECOG_H__ */