Julius 4.1.5
libjulius/include/julius/recog.h
説明を見る。
00001 
00076 /*
00077  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00078  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00079  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00080  * All rights reserved
00081  */
00082 
00083 /*
00084 */
00085 
00086 #ifndef __J_RECOG_H__
00087 #define __J_RECOG_H__
00088 
00089 #include <sent/stddefs.h>
00090 #include <sent/hmm.h>
00091 #include <sent/vocabulary.h>
00092 #include <sent/ngram2.h>
00093 #include <sent/dfa.h>
00094 #include <julius/wchmm.h>
00095 #include <julius/search.h>
00096 #include <julius/callback.h>
00097 #include <julius/jconf.h>
00098 
00099 /*
00100   How tokens are managed:
00101    o  tlist[][] is a token stocker.  It holds all tokens in a sequential
00102       buffer.  They are malloced first on startup, and referred to by ID during the
00103       Viterbi procedure.  In word-pair mode, each token also has a link to
00104       another token to allow a node to have more than 1 token.
00105       
00106    o  token[n] holds the current ID number of a token associated to a
00107       lexicon tree node 'n'.
00108 
00109   */
00114 typedef struct __FSBeam__ { /* Token-based beam search work area; embedded as RecogProcess.pass1 */
00115   /* token stocker */
00116   TOKEN2 *tlist[2];     /* token stocker: two sequential token buffers */
00117   TOKENID *tindex[2];   /* NOTE(review): presumably per-buffer token ID index lists -- confirm */
00118   int maxtnum;          /* NOTE(review): presumably allocated capacity of tlist[] -- confirm */
00119   int expand_step;      /* NOTE(review): presumably growth increment when the stocker is enlarged -- confirm */
00120   boolean expanded;     /* NOTE(review): presumably set once the stocker has been enlarged -- confirm */
00121   int tnum[2];          /* one counter per tlist[] buffer; presumably tokens in use -- confirm */
00122   int n_start;          /* NOTE(review): presumably start of the token range being processed -- confirm */
00123   int n_end;            /* NOTE(review): presumably end of the token range being processed -- confirm */
00124   int tl;               /* NOTE(review): presumably index (0/1) of the "last" buffer -- confirm */
00125   int tn;               /* NOTE(review): presumably index (0/1) of the "next" buffer -- confirm */
00126     
00127   /* Active token list */
00128   TOKENID *token;       /* token[n]: current token ID associated with lexicon tree node n */
00129 #ifdef UNIGRAM_FACTORING
00130   /* for wordend processing with 1-gram factoring */
00131   LOGPROB wordend_best_score; /* best-scoring word-end candidate; fields below describe the same candidate (presumably) */
00132   int wordend_best_node;        
00133   TRELLIS_ATOM *wordend_best_tre; 
00134   WORD_ID wordend_best_last_cword;      
00135 #endif
00136 
00137   int totalnodenum;     /* NOTE(review): presumably number of lexicon tree nodes, i.e. length of token[] -- confirm */
00138   TRELLIS_ATOM bos;     /* NOTE(review): presumably the beginning-of-sentence trellis atom -- confirm */
00139   boolean nodes_malloced; /* NOTE(review): presumably TRUE once the node work area is allocated -- confirm */
00140   LOGPROB lm_weight;           /* language model weight (source of the value not visible here) */
00141   LOGPROB lm_penalty;          /* language model penalty (source of the value not visible here) */
00142   LOGPROB lm_penalty_trans; 
00143   LOGPROB penalty1; 
00144 #if defined(WPAIR) && defined(WPAIR_KEEP_NLIMIT)
00145   boolean wpair_keep_nlimit; /* only present when both WPAIR and WPAIR_KEEP_NLIMIT are defined */
00146 #endif
00147   /* for short-pause segmentation */
00148   boolean in_sparea;         /* NOTE(review): presumably TRUE while inside a short-pause area -- confirm */
00149   int tmp_sparea_start;         
00150 #ifdef SP_BREAK_RESUME_WORD_BEGIN
00151   WORD_ID tmp_sp_break_last_word; 
00152 #else
00153   WORD_ID last_tre_word;        /* exists only when SP_BREAK_RESUME_WORD_BEGIN is NOT defined */
00154 #endif
00155   boolean first_sparea;  
00156   int sp_duration;   
00157 #ifdef SPSEGMENT_NAIST
00158   boolean after_trigger;        
00159   int trigger_duration;         
00160   boolean want_rewind;          
00161   int rewind_frame;             
00162   boolean want_rewind_reprocess; 
00163 #endif
00164   char *pausemodelnames;        /* NOTE(review): presumably the unparsed list of pause model names -- confirm */
00165   char **pausemodel;            /* NOTE(review): presumably array of individual pause model names -- confirm */
00166   int pausemodelnum;            /* NOTE(review): presumably number of entries in pausemodel[] -- confirm */
00167 } FSBeam;
00168 
00169 
00174 typedef struct __RealBeam__ { /* Input-side work area; embedded as Recog.real */
00175   /* input parameter */
00176   int maxframelen;              /* NOTE(review): presumably maximum input length in frames -- confirm */
00177 
00178   SP16 *window;         /* sample window buffer */
00179   int windowlen;                /* NOTE(review): presumably length of window[] in samples -- confirm */
00180   int windownum;                /* NOTE(review): presumably number of samples currently held in window[] -- confirm */
00181 
00182   /* for short-pause segmentation */
00183   boolean last_is_segmented; 
00184   SP16 *rest_Speech; /* NOTE(review): presumably samples carried over to the next segment -- confirm */
00185   int rest_alloc_len;   /* NOTE(review): presumably allocated size of rest_Speech -- confirm */
00186   int rest_len;         /* NOTE(review): presumably valid sample count in rest_Speech -- confirm */
00187 
00188 } RealBeam;
00189 
00194 typedef struct __StackDecode__ { /* Stack decoding work area; embedded as RecogProcess.pass2 */
00195   int hypo_len_count[MAXSEQNUM+1];      /* one counter per hypothesis length 0..MAXSEQNUM */
00196   int maximum_filled_length; 
00197 #ifdef SCAN_BEAM
00198   LOGPROB *framemaxscore; /* NOTE(review): presumably per-frame maximum score -- confirm */
00199 #endif
00200   NODE *stocker_root; /* NOTE(review): presumably root of a list of reusable NODEs -- confirm */
00201   int popctr;           /* NOTE(review): presumably count of popped hypotheses -- confirm */
00202   int genectr;          /* NOTE(review): presumably count of generated hypotheses -- confirm */
00203   int pushctr;          /* NOTE(review): presumably count of pushed hypotheses -- confirm */
00204   int finishnum;        /* NOTE(review): presumably count of finished sentences -- confirm */
00205   NODE *current;                
00206 
00207 #ifdef CONFIDENCE_MEASURE
00208   LOGPROB cm_alpha;             
00209 # ifdef CM_MULTIPLE_ALPHA
00210   LOGPROB *cmsumlist;        
00211   int cmsumlistlen;             /* NOTE(review): presumably allocated length of cmsumlist -- confirm */
00212 # endif
00213 # ifdef CM_SEARCH
00214   LOGPROB cm_tmpbestscore; 
00215 #  ifndef CM_MULTIPLE_ALPHA
00216   LOGPROB cm_tmpsum;            /* only present when CM_MULTIPLE_ALPHA is NOT defined */
00217 #  endif
00218   int l_stacksize;              
00219   int l_stacknum;               
00220   NODE *l_start;        
00221   NODE *l_bottom;       
00222 # endif
00223 # ifdef CM_NBEST
00224   LOGPROB *sentcm;       /* no initializer: C forbids initializers on struct members; the allocator must set this to NULL */
00225   LOGPROB *wordcm;       /* no initializer: C forbids initializers on struct members; the allocator must set this to NULL */
00226   int sentnum;          
00227 # endif
00228 #endif /* CONFIDENCE_MEASURE */
00229 
00230   LOGPROB *wordtrellis[2]; /* pair of score buffers */
00231   LOGPROB *g;           
00232   HMM_Logical **phmmseq;        /* array of logical HMM pointers */
00233   int phmmlen_max;              /* NOTE(review): presumably allocated length of phmmseq -- confirm */
00234   boolean *has_sp;              
00235 #ifdef GRAPHOUT_PRECISE_BOUNDARY
00236   short *wend_token_frame[2]; 
00237   LOGPROB *wend_token_gscore[2]; 
00238   short *wef;           
00239   LOGPROB *wes;         
00240 #endif
00241   WORD_ID *cnword;              
00242   WORD_ID *cnwordrev;           
00243 
00244 } StackDecode;
00245 
00250 typedef struct { /* Table of language-model probability callbacks; embedded as PROCESS_LM.lmfunc */
00251   LOGPROB (*uniprob)(WORD_INFO *, WORD_ID, LOGPROB); /* one-word callback; NOTE(review): presumably unigram probability -- confirm */
00252   LOGPROB (*biprob)(WORD_INFO *, WORD_ID, WORD_ID, LOGPROB); /* two-word callback; NOTE(review): presumably bigram probability -- confirm */
00253   LOGPROB (*lmprob)(WORD_INFO *, WORD_ID *, int, WORD_ID, LOGPROB); /* word-array + count + word callback; presumably general N-gram -- confirm */
00254 } LMFunc;
00255 
00260 typedef struct __gmm_calc__{ /* GMM computation work area; referenced by Recog.gc */
00261   LOGPROB *gmm_score;   /* NOTE(review): presumably one accumulated score per GMM -- confirm */
00262   boolean *is_voice;            /* NOTE(review): presumably one voice/non-voice flag per GMM -- confirm */
00263   int framecount;               /* NOTE(review): presumably number of frames processed so far -- confirm */
00264 
00265   short OP_nstream;             /* NOTE(review): presumably number of input streams (<= MAXSTREAMNUM) -- confirm */
00266   VECT *OP_vec_stream[MAXSTREAMNUM]; /* one vector pointer per stream */
00267   short OP_veclen_stream[MAXSTREAMNUM]; /* one vector length per stream */
00268 
00269   LOGPROB *OP_calced_score; 
00270   int *OP_calced_id; 
00271   int OP_calced_num; 
00272   int OP_calced_maxnum; 
00273   int OP_gprune_num; 
00274   VECT *OP_vec;         
00275   short OP_veclen;              
00276   HTK_HMM_Data *max_d;  /* NOTE(review): presumably the best-scoring GMM -- confirm */
00277   int max_i;                    /* NOTE(review): presumably index of max_d -- confirm */
00278 #ifdef CONFIDENCE_MEASURE
00279   LOGPROB gmm_max_cm;   
00280 #endif
00281 #ifdef GMM_VAD
00282   LOGPROB *rates;   /* fields below exist only in GMM_VAD builds */
00283   int nframe;                   
00284   boolean filled;
00285   int framep;                   
00286 
00287   boolean in_voice;             
00288   boolean up_trigger;           
00289   boolean down_trigger;         
00290   boolean after_trigger;        
00291   boolean want_rewind;          
00292   boolean want_rewind_reprocess; 
00293   int rewind_frame;             
00294   int duration;                 
00295 #endif
00296 } GMMCalc;
00297 
00302 typedef struct __sentence_align__ { /* One alignment result; singly linked via next, referenced by Sentence.align */
00303   int num;                    /* NOTE(review): presumably number of aligned units, i.e. length of the arrays below -- confirm */
00304   short unittype;             /* NOTE(review): presumably selects which of w / ph / loc is meaningful -- confirm */
00305   WORD_ID *w;                 /* word IDs */
00306   HMM_Logical **ph;     /* logical HMM pointers */
00307   short *loc; 
00308   boolean *is_iwsp;           
00309   int *begin_frame;           /* NOTE(review): presumably begin frame per unit -- confirm */
00310   int *end_frame;             /* NOTE(review): presumably end frame per unit -- confirm */
00311   LOGPROB *avgscore;          /* NOTE(review): presumably average score per unit -- confirm */
00312   LOGPROB allscore;           /* single overall score */
00313   struct __sentence_align__ *next; /* next alignment in the list */
00314 } SentenceAlign;
00315 
00320 typedef struct __sentence__ { /* One sentence hypothesis; used by Output.sent and Output.pass1 */
00321   WORD_ID word[MAXSEQNUM];      /* word ID sequence, capacity MAXSEQNUM */
00322   int word_num;                 /* NOTE(review): presumably number of valid entries in word[] -- confirm */
00323   LOGPROB score;                
00324   LOGPROB confidence[MAXSEQNUM]; /* same capacity as word[]; presumably one value per word -- confirm */
00325   LOGPROB score_lm;             /* NOTE(review): presumably language-model part of the score -- confirm */
00326   LOGPROB score_am;             /* NOTE(review): presumably acoustic-model part of the score -- confirm */
00327   int gram_id;                  
00328   SentenceAlign *align; /* head of the SentenceAlign list for this sentence */
00329 
00330 } Sentence;
00331 
00336 typedef struct __adin__ { /* Audio input work area; referenced by Recog.adin */
00337   /* functions */
00339   boolean (*ad_standby)(int, void *); /* input callbacks; exact contracts are not visible in this header */
00341   boolean (*ad_begin)(char *);
00343   boolean (*ad_end)();
00345   boolean (*ad_resume)();
00347   boolean (*ad_pause)();
00349   boolean (*ad_terminate)();
00351   int (*ad_read)(SP16 *, int); /* takes a sample buffer and an int; presumably returns samples read -- confirm */
00353   char * (*ad_input_name)();
00354 
00355   /* configuration parameters */
00356   int thres;            
00357   int noise_zerocross;  
00358   int nc_max;           
00359   boolean adin_cut_on;  
00360   boolean silence_cut_default; 
00361   boolean strip_flag;   
00362   boolean enable_thread;        
00363   boolean need_zmean;   
00364 
00365   /* work area */
00366   int c_length; 
00367   int c_offset; 
00368   SP16 *swapbuf;                
00369   int sbsize;    
00370   int sblen;    
00371   int rest_tail;                
00372 
00373   ZEROCROSS zc;                 /* zero-cross work area, stored by value */
00374 
00375 #ifdef HAVE_PTHREAD
00376   /* Variables related to POSIX threading */
00377   pthread_t adin_thread;        /* the A/D-in thread */
00378   pthread_mutex_t mutex;        /* NOTE(review): presumably guards speech / speechlen -- confirm */
00379   SP16 *speech;         /* buffer the A/D-in thread stores samples into (see transfer_online) */
00380   int speechlen;                
00381 /*
00382  * Semaphore to start/stop recognition.
00383  * 
00384  * If TRUE, A/D-in thread will store incoming samples to @a speech and
00385  * main thread will detect and process them.
00386  * If FALSE, A/D-in thread will still get input and check trigger as the same
00387  * as TRUE case, but does not store them to @a speech.
00388  * 
00389  */
00390   boolean transfer_online;
00395   boolean adinthread_buffer_overflowed;
00400   boolean adinthread_ended;
00401 
00402   boolean ignore_speech_while_recog; 
00403 
00404 #endif
00405 
00406   /* Input data buffer */
00407   SP16 *buffer; 
00408   int bpmax;            
00409   int bp;                       
00410   int current_len;              
00411   SP16 *cbuf;           
00412   boolean down_sample; 
00413   SP16 *buffer48; 
00414   int io_rate; 
00415 
00416   boolean is_valid_data;        
00417   int nc;               
00418   boolean end_of_stream;        
00419   boolean need_init;    
00420 
00421   DS_BUFFER *ds;           
00422 
00423   boolean rehash; 
00424 
00425   boolean input_side_segment;   
00426 
00427   unsigned int total_captured_len; 
00428   unsigned int last_trigger_sample; 
00429 
00430   char current_input_name[MAXPATHLEN]; /* fixed-size name buffer, MAXPATHLEN bytes */
00431 
00432 } ADIn;
00433 
00439 typedef struct __Output__ { /* Recognition result container; embedded as RecogProcess.result */
00448   int status; /* result status code; the value set is not defined in this header */
00449 
00450   int num_frame;                
00451   int length_msec;              
00452 
00453   Sentence *sent;               /* array of result sentences */
00454   int sentnum;                  /* NOTE(review): presumably number of entries in sent[] -- confirm */
00455 
00456   WordGraph *wg1;               
00457   int wg1_num;                  
00458 
00459   WordGraph *wg;                
00460 
00461   CN_CLUSTER *confnet;          
00462 
00463   Sentence pass1;               /* a single Sentence stored by value */
00464 
00465 } Output;  
00466 
00467 
00468 /**********************************************************************/
00469 /**********************************************************************/
00470 /**********************************************************************/
00471 
00476 typedef struct __mfcc_calc__ { /* Per-instance feature computation state; linked via next, listed from Recog.mfcclist */
00477 
00482   short id;
00483 
00488   Value *para;
00489 
00494   boolean htk_loaded;
00499   boolean hmm_loaded;
00500 
00505   boolean paramtype_check_flag;
00506 
00511   MFCCWork *wrk;
00512 
00517   HTK_Param *param;
00518 
00522   HTK_Param *rest_param;
00523 
00528   struct { /* CMN-related settings and work area */
00532     char *load_filename;
00537     boolean update;
00541     char *save_filename;     
00545     float map_weight;
00546 
00550     boolean loaded;
00551 
00556     CMNWork *wrk;
00557 
00558   } cmn;
00559 
00564   struct { /* front-end settings; NOTE(review): the ss* names presumably refer to spectral subtraction -- confirm */
00568     float *ssbuf;
00569     
00573     int sslen;
00574     
00579     float ss_alpha;
00580 
00585     float ss_floor;
00586 
00590     boolean sscalc;
00591 
00595     int sscalc_len;
00596 
00600     char *ssload_filename;
00601 
00606     MFCCWork *mfccwrk_ss;
00607     
00608   } frontend;
00609 
00614   ENERGYWork ewrk;
00615 
00620   DeltaBuf *db;
00625   DeltaBuf *ab;
00630   VECT *tmpmfcc;
00631 
00637   boolean valid;
00638 
00643   int f;
00644 
00649   int last_time;
00650 
00655   int sparea_start;
00656 
00661   boolean segmented;
00662 
00667   boolean segmented_by_input;
00668 
00673   int plugin_source;
00674 
00679   struct { /* feature-vector input callbacks; NOTE(review): presumably used with plugin_source -- confirm */
00681     boolean (*fv_standby)();
00683     boolean (*fv_begin)();
00685     int (*fv_read)(VECT *, int);
00687     boolean (*fv_end)();
00689     boolean (*fv_resume)();
00691     boolean (*fv_pause)();
00693     boolean (*fv_terminate)();
00695     char * (*fv_input_name)();
00696   } func;
00697 
00698 #ifdef POWER_REJECT
00699   float avg_power;
00700 #endif
00701 
00706   struct __mfcc_calc__ *next; /* next instance in the list */
00707 
00708 } MFCCCalc;
00709 
00714 typedef struct __process_am__ { /* Acoustic-model instance; linked via next, listed from Recog.amlist */
00715 
00720   JCONF_AM *config; /* configuration for this instance */
00721 
00726   MFCCCalc *mfcc; /* associated feature computation instance */
00727 
00731   HTK_HMM_INFO *hmminfo;
00732 
00736   HTK_HMM_INFO *hmm_gs;
00737 
00741   HMMWork hmmwrk; /* work area, stored by value */
00742 
00747   struct __process_am__ *next; /* next instance in the list */
00748   
00749 } PROCESS_AM;
00750 
00755 typedef struct __process_lm__ { /* Language-model instance; linked via next, listed from Recog.lmlist */
00756 
00761   JCONF_LM *config; /* configuration for this instance */
00762 
00767   PROCESS_AM *am; /* associated acoustic-model instance */
00768 
00769 
00774   int lmtype; /* LM type code; the value set is not defined in this header */
00775 
00781   int lmvar; /* LM variant code; the value set is not defined in this header */
00782 
00786   WORD_INFO *winfo;
00787 
00791   NGRAM_INFO *ngram;
00792 
00796   MULTIGRAM *grammars;
00797 
00803   int gram_maxid;
00804 
00809   DFA_INFO *dfa;
00810 
00815   boolean global_modified;
00816 
00821   LMFunc lmfunc; /* probability callbacks, stored by value */
00822 
00827   struct __process_lm__ *next; /* next instance in the list */
00828 
00829 } PROCESS_LM;
00830 
00835 typedef struct __recogprocess__ { /* One recognition process instance; linked via next, listed from Recog.process_list */
00836 
00841   boolean live;
00842 
00849   short active; /* a short, not a boolean; NOTE(review): presumably takes more than two values -- confirm */
00850 
00855   JCONF_SEARCH *config; /* search configuration for this instance */
00856 
00861   PROCESS_AM *am; /* acoustic-model instance used by this process */
00862 
00867   PROCESS_LM *lm; /* language-model instance used by this process */
00868 
00873   int lmtype; /* same field name as PROCESS_LM.lmtype; NOTE(review): presumably a copy -- confirm */
00874 
00880   int lmvar; /* same field name as PROCESS_LM.lmvar; NOTE(review): presumably a copy -- confirm */
00881 
00885   boolean ccd_flag;
00886 
00890   WCHMM_INFO *wchmm;
00891 
00895   int trellis_beam_width;
00896 
00900   BACKTRELLIS *backtrellis;
00901 
00905   FSBeam pass1; /* beam search work area, stored by value */
00906 
00911   StackDecode pass2; /* stack decoding work area, stored by value */
00912 
00916   WORD_ID pass1_wseq[MAXSEQNUM]; /* word sequence, capacity MAXSEQNUM */
00917 
00921   int pass1_wnum; /* NOTE(review): presumably number of valid entries in pass1_wseq -- confirm */
00922 
00926   LOGPROB pass1_score;
00927 
00931   WORD_ID sp_break_last_word;
00935   WORD_ID sp_break_last_nword;
00939   boolean sp_break_last_nword_allow_override;
00943   WORD_ID sp_break_2_begin_word;
00947   WORD_ID sp_break_2_end_word;
00948 
00952   int peseqlen;         
00953 
00957   int graph_totalwordnum;
00958 
00963   Output result; /* recognition result, stored by value */
00964 
00969   boolean graphout;
00970 
00976   char *order_matrix;
00977 
00983   int order_matrix_count;
00984 
00985 #ifdef DETERMINE
00986   int determine_count;
00987   LOGPROB determine_maxnodescore;
00988   boolean determined;
00989   LOGPROB determine_last_wid; /* NOTE(review): name suggests a word ID but the type is LOGPROB -- confirm WORD_ID was not intended */
00990   boolean have_determine;
00991 #endif
00992 
00997   boolean have_interim;
00998 
01003   void *hook; /* untyped pointer; not interpreted anywhere in this header */
01004 
01009   struct __recogprocess__ *next; /* next process instance in the list */
01010 
01011 } RecogProcess;
01012 
01017 typedef struct __Recog__ { /* Top-level engine instance: ties together configuration, input, models, processes and callbacks */
01018 
01019   /*******************************************/
01024   Jconf *jconf; /* global configuration */
01025 
01026   /*******************************************/
01031   ADIn *adin; /* audio input work area */
01032 
01036   RealBeam real; /* input-side work area, stored by value */
01037 
01042   MFCCCalc *mfcclist; /* head of the MFCCCalc list (linked via MFCCCalc.next) */
01043 
01048   PROCESS_AM *amlist; /* head of the PROCESS_AM list (linked via PROCESS_AM.next) */
01049 
01054   PROCESS_LM *lmlist; /* head of the PROCESS_LM list (linked via PROCESS_LM.next) */
01055 
01060   RecogProcess *process_list; /* head of the RecogProcess list (linked via RecogProcess.next) */
01061 
01062 
01067   boolean process_segment;
01068 
01069   /*******************************************/
01070   /* inputs */
01071 
01075   SP16 *speech; /* input sample buffer */
01076 
01081   int speechalloclen; /* NOTE(review): presumably allocated size of speech[] -- confirm */
01082 
01086   int speechlen;                /* NOTE(review): presumably number of valid samples in speech[] -- confirm */
01087 
01091   int peseqlen;         
01092 
01093   /*******************************************/
01094 
01099   HTK_HMM_INFO *gmm;
01100 
01105   MFCCCalc *gmmmfcc; /* NOTE(review): presumably the feature instance used with gmm -- confirm */
01106 
01111   GMMCalc *gc; /* GMM computation work area */
01112 
01113   /*******************************************/
01114   /* misc. */
01115 
01127   boolean process_active;
01128 
01134   boolean process_want_terminate;
01135 
01143   boolean process_want_reload;
01144 
01150   short gram_switch_input_method;
01151 
01158   boolean process_online;
01159 
01165   boolean (*calc_vector)(MFCCCalc *, SP16 *, int); /* callback taking an MFCCCalc, a sample buffer and an int */
01166 
01172   boolean triggered;
01173 
01178   void (*callback_function[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK])(); /* callback table: [callback id][hook slot] */
01183   void *callback_user_data[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK]; /* same shape as callback_function; presumably per-hook user data -- confirm */
01188   int callback_function_num[SIZEOF_CALLBACK_ID]; /* one count per callback id; presumably hooks registered -- confirm */
01193   int callback_list_code[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID];
01198   int callback_list_loc[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID];
01203   int callback_num;
01204 
01205   /*******************************************/
01206 
01211   void *hook; /* untyped pointer; not interpreted anywhere in this header */
01212 
01213 } Recog;
01214 
01215 #endif /* __J_RECOG_H__ */