Julius 4.2
libjulius/include/julius/recog.h
説明を見る。
00001 
00076 /*
00077  * Copyright (c) 1991-2011 Kawahara Lab., Kyoto University
00078  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00079  * Copyright (c) 2005-2011 Julius project team, Nagoya Institute of Technology
00080  * All rights reserved
00081  */
00082 
00083 /*
00084 */
00085 
00086 #ifndef __J_RECOG_H__
00087 #define __J_RECOG_H__
00088 
00089 #include <sent/stddefs.h>
00090 #include <sent/hmm.h>
00091 #include <sent/vocabulary.h>
00092 #include <sent/ngram2.h>
00093 #include <sent/dfa.h>
00094 #include <julius/wchmm.h>
00095 #include <julius/search.h>
00096 #include <julius/callback.h>
00097 #include <julius/jconf.h>
00098 
00099 /*
00100   How tokens are managed:
00101    o  tlist[][] is a token stocker.  It holds all tokens in a sequential
00102       buffer.  They are malloced first on startup, and referred to by ID during
00103       the Viterbi procedure.  In word-pair mode, each token also has a link to
00104       another token to allow a node to have more than 1 token.
00105       
00106    o  token[n] holds the current ID number of a token associated to a
00107       lexicon tree node 'n'.
00108 
00109   */
00114 typedef struct __FSBeam__ { /* Token-management and short-pause-segmentation work area; RecogProcess embeds one as its `pass1` member. */
00115   /* token stocker */
00116   TOKEN2 *tlist[2];     /* token stocker: holds all tokens in sequential buffers (see "How tokens are managed" above) */
00117   TOKENID *tindex[2];   /* presumably token-ID index arrays, one per tlist[] buffer -- confirm */
00118   int maxtnum;          /* presumably the allocated capacity of the token buffers -- confirm */
00119   int expand_step;      /* presumably the growth increment used when the buffers are enlarged -- confirm */
00120   boolean expanded;     /* presumably set once the buffers have been enlarged -- confirm */
00121   int tnum[2];          /* presumably the number of tokens in use in each tlist[] buffer -- confirm */
00122   int n_start;          /* NOTE(review): looks like the start of a range into tindex[] -- confirm */
00123   int n_end;            /* NOTE(review): looks like the end of that range -- confirm */
00124   int tl;               /* NOTE(review): presumably selects which of the two buffers is the "last" one -- confirm */
00125   int tn;               /* NOTE(review): presumably selects which of the two buffers is the "next" one -- confirm */
00126 #ifdef SCORE_PRUNING
00127   LOGPROB score_pruning_max;      /* these three members exist only when SCORE_PRUNING is defined */
00128   LOGPROB score_pruning_threshold;
00129   int score_pruning_count;        
00130 #endif
00131     
00132   /* Active token list */
00133   TOKENID *token;       /* token[n] holds the current ID of the token associated with lexicon tree node n */
00134 #ifdef UNIGRAM_FACTORING
00135   /* for wordend processing with 1-gram factoring */
00136   LOGPROB wordend_best_score; /* the wordend_best_* members exist only when UNIGRAM_FACTORING is defined */
00137   int wordend_best_node;        
00138   TRELLIS_ATOM *wordend_best_tre; 
00139   WORD_ID wordend_best_last_cword;      
00140 #endif
00141 
00142   int totalnodenum;     /* presumably the number of lexicon tree nodes, i.e. the length of token[] -- confirm */
00143   TRELLIS_ATOM bos;     /* stored by value; presumably a beginning-of-sentence trellis atom -- confirm */
00144   boolean nodes_malloced; /* presumably records whether the node-related buffers are currently allocated -- confirm */
00145   LOGPROB lm_weight;           /* presumably a language-model weight used by this pass -- confirm */
00146   LOGPROB lm_penalty;          /* presumably a language-model insertion penalty -- confirm */
00147   LOGPROB lm_penalty_trans; /* NOTE(review): meaning of the "_trans" variant is not visible here -- confirm */
00148   LOGPROB penalty1; /* NOTE(review): which penalty this is cannot be told from this header -- confirm */
00149 #if defined(WPAIR) && defined(WPAIR_KEEP_NLIMIT)
00150   boolean wpair_keep_nlimit; /* exists only when both WPAIR and WPAIR_KEEP_NLIMIT are defined */
00151 #endif
00152   /* for short-pause segmentation */
00153   boolean in_sparea;         /* presumably TRUE while inside a short-pause area -- confirm */
00154   int tmp_sparea_start;         /* presumably a tentative start position of the short-pause area -- confirm */
00155 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00156   WORD_ID tmp_sp_break_last_word; /* present when SP_BREAK_RESUME_WORD_BEGIN is defined ... */
00157 #else
00158   WORD_ID last_tre_word;        /* ... otherwise this member is present instead */
00159 #endif
00160   boolean first_sparea;  /* NOTE(review): presumably marks the first short-pause area of an input -- confirm */
00161   int sp_duration;   /* presumably the length of the current short-pause area; unit not visible here -- confirm */
00162 #ifdef SPSEGMENT_NAIST
00163   boolean after_trigger;        /* the members down to want_rewind_reprocess exist only when SPSEGMENT_NAIST is defined */
00164   int trigger_duration;         
00165   boolean want_rewind;          
00166   int rewind_frame;             
00167   boolean want_rewind_reprocess; 
00168 #endif
00169   char *pausemodelnames;        /* presumably the pause model names as one string -- confirm format against the code that fills it */
00170   char **pausemodel;            /* presumably an array of individual pause model names -- confirm */
00171   int pausemodelnum;            /* presumably the number of entries in pausemodel[] -- confirm */
00172 } FSBeam;
00173 
00174 
00179 typedef struct __RealBeam__ { /* Work area embedded by value in Recog as member `real`. */
00180   /* input parameter */
00181   int maxframelen;              /* presumably the maximum input length in frames -- confirm */
00182 
00183   SP16 *window;         /* buffer of SP16 samples; presumably the analysis window -- confirm */
00184   int windowlen;                /* presumably the length of window[] in samples -- confirm */
00185   int windownum;                /* presumably the number of samples currently held in window[] -- confirm */
00186 
00187   /* for short-pause segmentation */
00188   boolean last_is_segmented; /* presumably TRUE when the previous input ended by segmentation -- confirm */
00189   SP16 *rest_Speech; /* buffer of SP16 samples; presumably samples carried over to the next segment -- confirm */
00190   int rest_alloc_len;   /* presumably the allocated size of rest_Speech -- confirm */
00191   int rest_len;         /* presumably the number of valid samples in rest_Speech -- confirm */
00192 
00193 } RealBeam;
00194 
00199 typedef struct __StackDecode__ { /* Work area embedded by value in RecogProcess as member `pass2`. */
00200   int hypo_len_count[MAXSEQNUM+1];      /* one counter per hypothesis length 0..MAXSEQNUM; presumably counts popped hypotheses -- confirm */
00201   int maximum_filled_length; /* NOTE(review): presumably the longest length whose counter has reached its limit -- confirm */
00202 #ifdef SCAN_BEAM
00203   LOGPROB *framemaxscore; /* exists only when SCAN_BEAM is defined; presumably a per-frame maximum score array -- confirm */
00204 #endif
00205   NODE *stocker_root; /* presumably the head of a pool of reusable NODE objects -- confirm */
00206   int popctr;           /* presumably the number of hypotheses popped -- confirm */
00207   int genectr;          /* presumably the number of hypotheses generated -- confirm */
00208   int pushctr;          /* presumably the number of hypotheses pushed -- confirm */
00209   int finishnum;        /* presumably the number of finished sentence hypotheses -- confirm */
00210   NODE *current;                /* presumably the hypothesis currently being processed -- confirm */
00211 
00212 #ifdef CONFIDENCE_MEASURE
00213   LOGPROB cm_alpha;             /* everything down to the matching #endif exists only when CONFIDENCE_MEASURE is defined */
00214 # ifdef CM_MULTIPLE_ALPHA
00215   LOGPROB *cmsumlist;        /* only with CM_MULTIPLE_ALPHA */
00216   int cmsumlistlen;             /* presumably the length of cmsumlist -- confirm */
00217 # endif
00218 # ifdef CM_SEARCH
00219   LOGPROB cm_tmpbestscore; /* only with CM_SEARCH */
00220 #  ifndef CM_MULTIPLE_ALPHA
00221   LOGPROB cm_tmpsum;            /* only with CM_SEARCH and without CM_MULTIPLE_ALPHA */
00222 #  endif
00223   int l_stacksize;              /* NOTE(review): the l_* members look like a local hypothesis stack (size, count, both ends) -- confirm */
00224   int l_stacknum;               
00225   NODE *l_start;        
00226   NODE *l_bottom;       
00227 # endif
00228 # ifdef CM_NBEST
00229   LOGPROB *sentcm;      /* only with CM_NBEST. No initializer here: C does not allow one on a struct member, so whoever creates a StackDecode must set this to NULL itself */
00230   LOGPROB *wordcm;      /* only with CM_NBEST. Same as sentcm: must be set to NULL by the code that creates the struct */
00231   int sentnum;          /* presumably the allocated length of sentcm -- confirm */
00232   int wordnum;          /* presumably the allocated length of wordcm -- confirm */
00233 # endif
00234 #endif /* CONFIDENCE_MEASURE */
00235 
00236   LOGPROB *wordtrellis[2]; /* two LOGPROB buffers; NOTE(review): presumably swapped while scanning a word -- confirm */
00237   LOGPROB *g;           /* NOTE(review): presumably a per-frame score buffer -- confirm */
00238   HMM_Logical **phmmseq;        /* array of HMM_Logical pointers; presumably a phone HMM sequence work buffer -- confirm */
00239   int phmmlen_max;              /* presumably the allocated length of phmmseq (and has_sp) -- confirm */
00240   boolean *has_sp;              /* NOTE(review): presumably one short-pause flag per phmmseq entry -- confirm */
00241 #ifdef GRAPHOUT_PRECISE_BOUNDARY
00242   short *wend_token_frame[2]; /* the four members below exist only when GRAPHOUT_PRECISE_BOUNDARY is defined */
00243   LOGPROB *wend_token_gscore[2]; 
00244   short *wef;           
00245   LOGPROB *wes;         
00246 #endif
00247   WORD_ID *cnword;              /* array of word IDs; NOTE(review): purpose not visible in this header -- confirm */
00248   WORD_ID *cnwordrev;           /* array of word IDs; NOTE(review): presumably the reverse-direction counterpart of cnword -- confirm */
00249 
00250 } StackDecode;
00251 
00256 typedef struct { /* Table of three language-model probability function pointers; PROCESS_LM embeds one as member `lmfunc`. */
00257   LOGPROB (*uniprob)(WORD_INFO *, WORD_ID, LOGPROB); /* takes a word dictionary, one word ID and a LOGPROB; returns a LOGPROB (presumably a unigram probability -- confirm) */
00258   LOGPROB (*biprob)(WORD_INFO *, WORD_ID, WORD_ID, LOGPROB); /* as uniprob but with two word IDs (presumably a bigram probability -- confirm) */
00259   LOGPROB (*lmprob)(WORD_INFO *, WORD_ID *, int, WORD_ID, LOGPROB); /* takes a word-ID array plus an int (presumably its length) and one more word ID -- confirm */
00260 } LMFunc;
00261 
00266 typedef struct __gmm_calc__{ /* GMM computation work area; Recog holds a pointer to one in member `gc`. */
00267   LOGPROB *gmm_score;   /* array of scores; presumably one accumulated score per GMM -- confirm */
00268   boolean *is_voice;            /* array of flags; presumably marks which GMMs count as voice -- confirm */
00269   int framecount;               /* presumably the number of frames processed so far -- confirm */
00270 
00271   short OP_nstream;             /* presumably the number of feature streams in use (at most MAXSTREAMNUM) -- confirm */
00272   VECT *OP_vec_stream[MAXSTREAMNUM]; /* one vector pointer per stream */
00273   short OP_veclen_stream[MAXSTREAMNUM]; /* one length per stream; presumably the length of the matching OP_vec_stream[] vector -- confirm */
00274 
00275   LOGPROB *OP_calced_score; /* NOTE(review): the OP_calced_* members look like a list of computed (score, id) pairs -- confirm */
00276   int *OP_calced_id; 
00277   int OP_calced_num; /* presumably the number of valid entries in the two arrays above -- confirm */
00278   int OP_calced_maxnum; /* presumably the allocated capacity of those arrays -- confirm */
00279   int OP_gprune_num; /* NOTE(review): presumably a Gaussian pruning count -- confirm */
00280   VECT *OP_vec;         /* presumably the vector currently being evaluated -- confirm */
00281   short OP_veclen;              /* presumably the length of OP_vec -- confirm */
00282   HTK_HMM_Data *max_d;  /* NOTE(review): presumably the best-scoring model -- confirm */
00283   int max_i;                    /* NOTE(review): presumably the index of that model -- confirm */
00284 #ifdef CONFIDENCE_MEASURE
00285   LOGPROB gmm_max_cm;   /* exists only when CONFIDENCE_MEASURE is defined */
00286 #endif
00287 #ifdef GMM_VAD
00288   LOGPROB *rates;   /* everything from here to the #endif exists only when GMM_VAD is defined */
00289   int nframe;                   /* NOTE(review): presumably the length of rates[] -- confirm */
00290   boolean filled;
00291   int framep;                   /* NOTE(review): presumably the current position within rates[] -- confirm */
00292 
00293   boolean in_voice;             /* the flags below presumably track voice-activity trigger state -- confirm against the VAD code */
00294   boolean up_trigger;           
00295   boolean down_trigger;         
00296   boolean after_trigger;        
00297   boolean want_rewind;          
00298   boolean want_rewind_reprocess; 
00299   int rewind_frame;             
00300   int duration;                 
00301 #endif
00302 } GMMCalc;
00303 
00308 typedef struct __sentence_align__ { /* Alignment data attached to a Sentence through its `align` member. */
00309   int num;                    /* presumably the number of aligned units, i.e. the length of the arrays below -- confirm */
00310   short unittype;             /* NOTE(review): presumably selects the unit kind and so which of w / ph / loc is meaningful -- confirm */
00311   WORD_ID *w;                 /* array of word IDs */
00312   HMM_Logical **ph;     /* array of logical HMM pointers */
00313   short *loc; /* NOTE(review): meaning not visible in this header -- confirm */
00314   boolean *is_iwsp;           /* array of flags; NOTE(review): "iwsp" presumably means inter-word short pause -- confirm */
00315   int *begin_frame;           /* presumably the begin frame of each unit -- confirm */
00316   int *end_frame;             /* presumably the end frame of each unit -- confirm */
00317   LOGPROB *avgscore;          /* presumably a per-unit average score -- confirm */
00318   LOGPROB allscore;           /* single score for the whole alignment -- confirm how it is computed */
00319   struct __sentence_align__ *next; /* pointer to another SentenceAlign; presumably the next one in a list */
00320 } SentenceAlign;
00321 
00326 typedef struct __sentence__ { /* One sentence hypothesis; used by Output for member `pass1` (by value) and member `sent` (by pointer). */
00327   WORD_ID word[MAXSEQNUM];      /* word ID sequence, fixed capacity MAXSEQNUM */
00328   int word_num;                 /* presumably the number of valid entries in word[] -- confirm */
00329   LOGPROB score;                /* presumably the total score of the sentence -- confirm */
00330   LOGPROB confidence[MAXSEQNUM]; /* same capacity as word[]; presumably one confidence value per word -- confirm */
00331   LOGPROB score_lm;             /* presumably the language-model part of the score -- confirm */
00332   LOGPROB score_am;             /* presumably the acoustic-model part of the score -- confirm */
00333   int gram_id;                  /* NOTE(review): presumably the ID of the grammar the sentence belongs to -- confirm */
00334   SentenceAlign *align; /* pointer to alignment data (SentenceAlign carries its own `next` pointer) */
00335 
00336 } Sentence;
00337 
00342 typedef struct __adin__ { /* Audio input state: device callbacks, configuration, work buffers and optional threading state. Recog holds a pointer to one in member `adin`. */
00343   /* functions */
00345   boolean (*ad_standby)(int, void *); /* NOTE(review): the meaning of the int and void* arguments is not visible in this header -- confirm */
00347   boolean (*ad_begin)(char *); /* NOTE(review): the char* argument is presumably an input/device name -- confirm */
00349   boolean (*ad_end)(); /* this and the next three callbacks are declared without a parameter list */
00351   boolean (*ad_resume)();
00353   boolean (*ad_pause)();
00355   boolean (*ad_terminate)();
00357   int (*ad_read)(SP16 *, int); /* takes a sample buffer and an int (presumably the request size); return value meaning to be confirmed */
00359   char * (*ad_input_name)(); /* returns a string; presumably the current input name -- confirm */
00360 
00361   /* configuration parameters */
00362   int thres;            /* NOTE(review): presumably an input level threshold -- confirm */
00363   int noise_zerocross;  /* NOTE(review): presumably a zero-cross count threshold -- confirm */
00364   int nc_max;           /* NOTE(review): presumably the upper limit for `nc` below -- confirm */
00365   boolean adin_cut_on;  /* presumably enables silence cutting -- confirm */
00366   boolean silence_cut_default; /* NOTE(review): presumably a default for the silence-cut setting -- confirm */
00367   boolean strip_flag;   /* NOTE(review): what is stripped is not visible here -- confirm */
00368   boolean enable_thread;        /* presumably selects threaded input (see the HAVE_PTHREAD section below) -- confirm */
00369   boolean need_zmean;   /* NOTE(review): presumably requests zero-mean processing of the input -- confirm */
00370 
00371   /* work area */
00372   int c_length; /* NOTE(review): meaning of c_length / c_offset is not visible here -- confirm */
00373   int c_offset; 
00374   SP16 *swapbuf;                /* sample buffer; NOTE(review): sbsize / sblen presumably give its capacity and used length -- confirm */
00375   int sbsize;    
00376   int sblen;    
00377   int rest_tail;                
00378 
00379   ZEROCROSS zc;                 /* stored by value; ZEROCROSS is declared elsewhere */
00380 
00381 #ifdef HAVE_PTHREAD
00382   /* Variables related to POSIX threading */
00383   pthread_t adin_thread;        /* everything down to the matching #endif exists only when HAVE_PTHREAD is defined */
00384   pthread_mutex_t mutex;        /* NOTE(review): which members this mutex protects is not stated here -- confirm */
00385   SP16 *speech;         /* buffer the comment below calls "speech": samples stored by the A/D-in thread */
00386   int speechlen;                /* presumably the number of samples currently in speech -- confirm */
00387 /*
00388  * Semaphore to start/stop recognition.
00389  * 
00390  * If TRUE, A/D-in thread will store incoming samples to @a speech and
00391  * main thread will detect and process them.
00392  * If FALSE, A/D-in thread will still get input and check trigger as the same
00393  * as TRUE case, but does not store them to @a speech.
00394  * 
00395  */
00396   boolean transfer_online;
00401   boolean adinthread_buffer_overflowed; /* presumably set when speech overflowed -- confirm */
00406   boolean adinthread_ended; /* presumably set when the A/D-in thread has finished -- confirm */
00407 
00408   boolean ignore_speech_while_recog; 
00409 
00410 #endif
00411 
00412   /* Input data buffer */
00413   SP16 *buffer; /* sample buffer; NOTE(review): bpmax / bp presumably give its capacity and current position -- confirm */
00414   int bpmax;            
00415   int bp;                       
00416   int current_len;              
00417   SP16 *cbuf;           
00418   boolean down_sample; /* presumably enables down-sampling of the input -- confirm */
00419   SP16 *buffer48; /* NOTE(review): presumably holds samples before down-sampling -- confirm */
00420   int io_rate; /* NOTE(review): presumably a ratio between input and output sampling rates -- confirm */
00421 
00422   boolean is_valid_data;        
00423   int nc;               /* NOTE(review): a counter presumably compared with nc_max -- confirm */
00424   boolean end_of_stream;        
00425   boolean need_init;    
00426 
00427   DS_BUFFER *ds;           /* DS_BUFFER is declared elsewhere; presumably down-sampling work data -- confirm */
00428 
00429   boolean rehash; 
00430 
00431   boolean input_side_segment;   
00432 
00433   unsigned int total_captured_len; /* presumably the total number of samples captured so far -- confirm */
00434   unsigned int last_trigger_sample; /* NOTE(review): presumably a sample position of the last trigger -- confirm */
00435 
00436   char current_input_name[MAXPATHLEN]; /* fixed-size string buffer of MAXPATHLEN bytes */
00437 
00438 } ADIn;
00439 
00445 typedef struct __Output__ { /* Recognition result container; RecogProcess embeds one as its `result` member. */
00454   int status; /* NOTE(review): the set of status codes is not visible in this header -- confirm */
00455 
00456   int num_frame;                /* presumably the input length in frames -- confirm */
00457   int length_msec;              /* presumably the input length in milliseconds -- confirm */
00458 
00459   Sentence *sent;               /* pointer to Sentence data; presumably an array of sentnum entries -- confirm */
00460   int sentnum;                  
00461 
00462   WordGraph *wg1;               /* NOTE(review): presumably the word graph of the 1st pass -- confirm */
00463   int wg1_num;                  
00464 
00465   WordGraph *wg;                /* NOTE(review): presumably the final word graph -- confirm */
00466 
00467   CN_CLUSTER *confnet;          /* NOTE(review): presumably a confusion network -- confirm */
00468 
00469   Sentence pass1;               /* stored by value; presumably the best sentence of the 1st pass -- confirm */
00470 
00471 } Output;  
00472 
00473 
00474 /**********************************************************************/
00475 /**********************************************************************/
00476 /**********************************************************************/
00477 
00482 typedef struct __mfcc_calc__ { /* Feature-extraction instance. Referenced by PROCESS_AM (`mfcc`) and by Recog (`mfcclist`, `gmmmfcc`, and the `calc_vector` callback argument). */
00483 
00488   short id; /* presumably a unique ID of this instance -- confirm */
00489 
00494   Value *para; /* Value is declared elsewhere; presumably the analysis parameters -- confirm */
00495 
00500   boolean htk_loaded; /* NOTE(review): presumably records that parameters came from an HTK config -- confirm */
00505   boolean hmm_loaded; /* NOTE(review): presumably records that parameters came from an HMM header -- confirm */
00506 
00511   boolean paramtype_check_flag;
00512 
00517   MFCCWork *wrk; /* work area pointer; MFCCWork is declared elsewhere */
00518 
00523   HTK_Param *param; /* presumably the feature vectors computed for the current input -- confirm */
00524 
00528   HTK_Param *rest_param; /* NOTE(review): presumably vectors carried over to the next segment -- confirm */
00529 
00534   struct { /* settings and work area grouped as member `cmn` */
00538     char *load_filename;
00543     boolean update;
00547     char *save_filename;     
00551     float map_weight;
00552 
00556     boolean loaded;
00557 
00562     CMNWork *wrk;
00563 
00564   } cmn;
00565 
00570   struct { /* front-end processing settings grouped as member `frontend`; NOTE(review): the "ss" prefix presumably means spectral subtraction -- confirm */
00574     float *ssbuf;
00575     
00579     int sslen; /* presumably the length of ssbuf -- confirm */
00580     
00585     float ss_alpha;
00586 
00591     float ss_floor;
00592 
00596     boolean sscalc;
00597 
00601     int sscalc_len;
00602 
00606     char *ssload_filename;
00607 
00612     MFCCWork *mfccwrk_ss;
00613     
00614   } frontend;
00615 
00620   ENERGYWork ewrk; /* stored by value; ENERGYWork is declared elsewhere */
00621 
00626   DeltaBuf *db; /* NOTE(review): db / ab are presumably delta and acceleration buffers -- confirm */
00631   DeltaBuf *ab;
00636   VECT *tmpmfcc; /* presumably a temporary vector buffer -- confirm */
00637 
00643   boolean valid;
00644 
00649   int f; /* NOTE(review): presumably the current frame index -- confirm */
00650 
00655   int last_time;
00656 
00661   int sparea_start;
00662 
00667   boolean segmented;
00668 
00673   boolean segmented_by_input;
00674 
00679   int plugin_source; /* NOTE(review): presumably identifies a feature-input plugin; the "none" value is not visible here -- confirm */
00680 
00685   struct { /* feature-vector input callbacks grouped as member `func`; the fv_* names parallel the ad_* callbacks in ADIn */
00687     boolean (*fv_standby)();
00689     boolean (*fv_begin)();
00691     int (*fv_read)(VECT *, int); /* takes a vector buffer and an int (presumably its length); return value meaning to be confirmed */
00693     boolean (*fv_end)();
00695     boolean (*fv_resume)();
00697     boolean (*fv_pause)();
00699     boolean (*fv_terminate)();
00701     char * (*fv_input_name)();
00702   } func;
00703 
00704 #ifdef POWER_REJECT
00705   float avg_power; /* exists only when POWER_REJECT is defined */
00706 #endif
00707 
00712   struct __mfcc_calc__ *next; /* pointer to another MFCCCalc; presumably the next one in a list */
00713 
00714 } MFCCCalc;
00715 
00720 typedef struct __process_am__ { /* Acoustic-model instance. Referenced by PROCESS_LM and RecogProcess (`am`) and by Recog (`amlist`). */
00721 
00726   JCONF_AM *config; /* pointer to this instance's configuration; JCONF_AM is declared elsewhere */
00727 
00732   MFCCCalc *mfcc; /* feature-extraction instance associated with this model */
00733 
00737   HTK_HMM_INFO *hmminfo; /* presumably the main HMM definition data -- confirm */
00738 
00742   HTK_HMM_INFO *hmm_gs; /* NOTE(review): presumably HMM data used for Gaussian selection -- confirm */
00743 
00747   HMMWork hmmwrk; /* stored by value; HMMWork is declared elsewhere */
00748 
00753   struct __process_am__ *next; /* pointer to another PROCESS_AM; presumably the next one in a list */
00754   
00755 } PROCESS_AM;
00756 
00761 typedef struct __process_lm__ { /* Language-model instance. Referenced by RecogProcess (`lm`) and by Recog (`lmlist`). */
00762 
00767   JCONF_LM *config; /* pointer to this instance's configuration; JCONF_LM is declared elsewhere */
00768 
00773   PROCESS_AM *am; /* acoustic-model instance this language model is associated with */
00774 
00775 
00780   int lmtype; /* NOTE(review): language-model type code; the values are not visible in this header -- confirm */
00781 
00787   int lmvar; /* NOTE(review): language-model variant code; the values are not visible in this header -- confirm */
00788 
00792   WORD_INFO *winfo; /* word dictionary data (the same type is the first argument of the LMFunc callbacks) */
00793 
00797   NGRAM_INFO *ngram; /* presumably N-gram data, when that model type is used -- confirm */
00798 
00802   MULTIGRAM *grammars; /* presumably the list of loaded grammars -- confirm */
00803 
00809   int gram_maxid; /* NOTE(review): presumably the largest grammar ID assigned so far -- confirm */
00810 
00815   DFA_INFO *dfa; /* presumably grammar automaton data, when that model type is used -- confirm */
00816 
00821   boolean global_modified; /* NOTE(review): presumably set when the grammar set changed -- confirm */
00822 
00827   LMFunc lmfunc; /* stored by value: the three probability callbacks */
00828 
00833   struct __process_lm__ *next; /* pointer to another PROCESS_LM; presumably the next one in a list */
00834 
00835 } PROCESS_LM;
00836 
00841 typedef struct __recogprocess__ { /* One recognition process: ties together configuration, an acoustic model, a language model, both pass work areas and the result. Recog references these through `process_list`. */
00842 
00847   boolean live; /* NOTE(review): presumably TRUE while this process takes part in recognition -- confirm */
00848 
00855   short active; /* NOTE(review): a short, not a boolean, so it presumably has more than two states -- confirm */
00856 
00861   JCONF_SEARCH *config; /* pointer to this process's search configuration; JCONF_SEARCH is declared elsewhere */
00862 
00867   PROCESS_AM *am; /* acoustic-model instance used by this process */
00868 
00873   PROCESS_LM *lm; /* language-model instance used by this process */
00874 
00879   int lmtype; /* same name and type as PROCESS_LM.lmtype; presumably a copy -- confirm */
00880 
00886   int lmvar; /* same name and type as PROCESS_LM.lmvar; presumably a copy -- confirm */
00887 
00891   boolean ccd_flag; /* NOTE(review): presumably enables cross-word context dependency handling -- confirm */
00892 
00896   WCHMM_INFO *wchmm; /* presumably the word-conjunction HMM (tree lexicon) -- confirm */
00897 
00901   int trellis_beam_width; /* presumably the beam width actually used -- confirm */
00902 
00906   BACKTRELLIS *backtrellis; /* presumably the word trellis produced by the 1st pass -- confirm */
00907 
00911   FSBeam pass1; /* stored by value: FSBeam work area */
00912 
00917   StackDecode pass2; /* stored by value: StackDecode work area */
00918 
00922   WORD_ID pass1_wseq[MAXSEQNUM]; /* word sequence with capacity MAXSEQNUM; presumably the 1st-pass best result -- confirm */
00923 
00927   int pass1_wnum; /* presumably the number of valid entries in pass1_wseq -- confirm */
00928 
00932   LOGPROB pass1_score; /* presumably the score of that sequence -- confirm */
00933 
00937   WORD_ID sp_break_last_word; /* NOTE(review): the sp_break_* members presumably carry word context across short-pause segments -- confirm */
00941   WORD_ID sp_break_last_nword;
00945   boolean sp_break_last_nword_allow_override;
00949   WORD_ID sp_break_2_begin_word;
00953   WORD_ID sp_break_2_end_word;
00954 
00958   int peseqlen;         /* Recog has a member of the same name; presumably an input length in frames -- confirm */
00959 
00963   int graph_totalwordnum; /* presumably the number of words in the generated graph -- confirm */
00964 
00969   Output result; /* stored by value: result container */
00970 
00975   boolean graphout; /* presumably enables word-graph output -- confirm */
00976 
00982   char *order_matrix; /* NOTE(review): layout and meaning are not visible in this header -- confirm */
00983 
00989   int order_matrix_count;
00990 
00991 #ifdef DETERMINE
00992   int determine_count; /* the five members below exist only when DETERMINE is defined */
00993   LOGPROB determine_maxnodescore;
00994   boolean determined;
00995   LOGPROB determine_last_wid; /* NOTE(review): typed LOGPROB although the name suggests a word ID (WORD_ID) -- check usage before changing */
00996   boolean have_determine;
00997 #endif
00998 
01003   boolean have_interim; /* NOTE(review): presumably indicates that an interim result is available -- confirm */
01004 
01009   void *hook; /* untyped pointer; presumably free for application use -- confirm */
01010 
01015   struct __recogprocess__ *next; /* pointer to another RecogProcess; presumably the next one in a list */
01016 
01017 } RecogProcess;
01018 
01023 typedef struct __Recog__ { /* Top-level engine instance: configuration, audio input, model and process lists, input buffer, GMM data and callback tables. */
01024 
01025   /*******************************************/
01030   Jconf *jconf; /* pointer to the configuration; Jconf is declared elsewhere */
01031 
01032   /*******************************************/
01037   ADIn *adin; /* audio input state */
01038 
01042   RealBeam real; /* stored by value: RealBeam work area */
01043 
01048   MFCCCalc *mfcclist; /* presumably the head of the MFCCCalc list (linked through MFCCCalc.next) -- confirm */
01049 
01054   PROCESS_AM *amlist; /* presumably the head of the PROCESS_AM list (linked through PROCESS_AM.next) -- confirm */
01055 
01060   PROCESS_LM *lmlist; /* presumably the head of the PROCESS_LM list (linked through PROCESS_LM.next) -- confirm */
01061 
01066   RecogProcess *process_list; /* presumably the head of the RecogProcess list (linked through RecogProcess.next) -- confirm */
01067 
01068 
01073   boolean process_segment; /* NOTE(review): presumably set while handling a segmented input -- confirm */
01074 
01075   /*******************************************/
01076   /* inputs */
01077 
01081   SP16 *speech; /* input sample buffer */
01082 
01087   int speechalloclen; /* presumably the allocated size of speech -- confirm */
01088 
01092   int speechlen;                /* presumably the number of valid samples in speech -- confirm */
01093 
01097   int peseqlen;         /* NOTE(review): presumably the input length in frames -- confirm */
01098 
01099   /*******************************************/
01100 
01105   HTK_HMM_INFO *gmm; /* presumably the GMM definition data -- confirm */
01106 
01111   MFCCCalc *gmmmfcc; /* feature-extraction instance; presumably the one used for the GMM -- confirm */
01112 
01117   GMMCalc *gc; /* GMM computation work area */
01118 
01119   /*******************************************/
01120   /* misc. */
01121 
01133   boolean process_active; /* NOTE(review): the process_* flags presumably let the application control and observe the recognition loop -- confirm */
01134 
01140   boolean process_want_terminate;
01141 
01149   boolean process_want_reload;
01150 
01156   short gram_switch_input_method; /* NOTE(review): the set of values is not visible in this header -- confirm */
01157 
01164   boolean process_online;
01165 
01171   boolean (*calc_vector)(MFCCCalc *, SP16 *, int); /* callback taking an MFCCCalc, a sample buffer and an int (presumably the sample count) -- confirm */
01172 
01178   boolean triggered;
01179 
01184   void (*callback_function[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK])(); /* 2-D table of function pointers declared without a parameter list, sized [SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK] */
01189   void *callback_user_data[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK]; /* same dimensions as callback_function; presumably the user data for the matching entry -- confirm */
01194   int callback_function_num[SIZEOF_CALLBACK_ID]; /* one int per first-dimension index; presumably the number of registered functions -- confirm */
01199   int callback_list_code[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID]; /* flat arrays with one slot per table entry; NOTE(review): presumably map a registration to its table position -- confirm */
01204   int callback_list_loc[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID];
01209   int callback_num; /* presumably the number of used entries in the two flat arrays above -- confirm */
01210 
01211   /*******************************************/
01212 
01217   void *hook; /* untyped pointer; presumably free for application use -- confirm */
01218 
01219 } Recog;
01220 
01221 #endif /* __J_RECOG_H__ */