Julius 4.2
libsent/include/sent/ngram2.h
説明を見る。
00001 
00101 /*
00102  * Copyright (c) 1991-2011 Kawahara Lab., Kyoto University
00103  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00104  * Copyright (c) 2005-2011 Julius project team, Nagoya Institute of Technology
00105  * All rights reserved
00106  */
00107 
00108 #ifndef __SENT_NGRAM2_H__
00109 #define __SENT_NGRAM2_H__
00110 
00111 #include <sent/stddefs.h>
00112 #include <sent/ptree.h>
00113 
00114 typedef unsigned int NNID;            ///< Id type for N-gram entries (full-width form)
00115 #define NNID_INVALID 0xffffffff  ///< Value to indicate no id (full)
00116 #define NNID_MAX 0xfffffffe     ///< Largest usable id value (full); one less than NNID_INVALID
00117 
00118 typedef unsigned char NNID_UPPER; ///< Upper part of an id in the split (24bit) form
00119 typedef unsigned short NNID_LOWER; ///< Lower part of an id in the split (24bit) form; presumably id = upper * 65536 + lower (cf. NNID_MAX_24) -- TODO confirm
00120 #define NNID_INVALID_UPPER 255  ///< Value to indicate no id at NNID_UPPER (24bit)
00121 #define NNID_MAX_24 16711679        ///< Allowed maximum number of id (255*65536-1) (24bit)
00122 
00124 #define BEGIN_WORD_DEFAULT "<s>"   ///< Default string of the begin word (presumably the sentence-start symbol -- confirm against users of this macro)
00125 
00126 #define END_WORD_DEFAULT "</s>"   ///< Default string of the end word (presumably the sentence-end symbol -- confirm against users of this macro)
00127 
00128 #define UNK_WORD_DEFAULT "<unk>"   ///< Default string of the unknown word (lower-case spelling)
00129 #define UNK_WORD_DEFAULT2 "<UNK>"   ///< Alternative default string of the unknown word (upper-case spelling)
00130 
00131 #define UNK_WORD_MAXLEN 30   ///< Length limit related to the unknown-word string; NOTE(review): whether it counts the terminating NUL is not visible here
00132 
00137 typedef struct {   /* Data arrays for N-gram tuples; NGRAM_INFO refers to this type through its member d. Field meanings below are inferred from names and types unless stated otherwise -- confirm against the implementation. */
00138   NNID totalnum;                ///< Presumably the total number of tuples stored here
00139   boolean is24bit;              ///< Presumably TRUE when the split NNID_UPPER/NNID_LOWER (24bit) arrays are used instead of the full NNID array
00140   NNID bgnlistlen;              ///< Presumably the length of the bgn / bgn_upper / bgn_lower and num lists
00141   NNID_UPPER *bgn_upper;        ///< Upper parts of the bgn ids (split 24bit form)
00142   NNID_LOWER *bgn_lower;        ///< Lower parts of the bgn ids (split 24bit form)
00143   NNID *bgn;                    ///< bgn ids in full-width NNID form; presumably the first id of each group of tuples
00144   WORD_ID *num;         ///< Presumably the number of tuples in each group starting at bgn
00145 
00146   WORD_ID *nnid2wid;            ///< By name, maps a tuple id (NNID) to a word id (WORD_ID)
00147   LOGPROB *prob;                ///< Presumably the log probability of each tuple
00148 
00149   NNID context_num;             ///< Presumably the number of context entries (length of bo_wt)
00150   LOGPROB *bo_wt;               ///< Presumably the back-off weights
00151   boolean ct_compaction;        ///< Presumably TRUE when context ids are compacted (cf. ngram_compact_context())
00152   NNID_UPPER *nnid2ctid_upper;  ///< By name, maps a tuple id to a context id: upper part (split 24bit form)
00153   NNID_LOWER *nnid2ctid_lower;  ///< By name, maps a tuple id to a context id: lower part (split 24bit form)
00154 
00155 } NGRAM_TUPLE_INFO;
00156 
00165 typedef struct __ngram_info__ {   /* Top-level N-gram data handle passed to every function declared in this header. Field meanings below are inferred from names and types unless stated otherwise -- confirm against the implementation. */
00166   int n;                        ///< Presumably the N-gram order N
00167   int dir;                      ///< Direction of the N-gram (init_ngram_arpa() also takes a dir argument); value encoding is not visible here
00168   boolean from_bin;             ///< Presumably TRUE when the data was read from a binary N-gram file
00169   boolean bigram_index_reversed;                ///< Presumably TRUE when the bigram index is stored in reversed order
00170   boolean bos_eos_swap;         ///< Presumably TRUE when the begin and end words are swapped
00171   WORD_ID max_word_num;         ///< Presumably the number of words in the N-gram vocabulary
00172   char **wname;                 ///< Array of strings; presumably the word names indexed by WORD_ID
00173   PATNODE *root;                ///< Root node of a PATNODE tree; presumably the word lookup index built by ngram_make_lookup_tree()
00174   WORD_ID unk_id;               ///< Presumably the word id of the unknown word (cf. set_unknown_id())
00175   int unk_num;                  ///< Presumably a count of unknown words
00176   LOGPROB unk_num_log;          ///< Presumably the logarithm of unk_num
00177   boolean isopen;               ///< Presumably TRUE for an open-vocabulary N-gram
00178 
00179   NGRAM_TUPLE_INFO *d;  ///< Pointer to NGRAM_TUPLE_INFO data; presumably an array with one element per N-gram order
00180 
00181   /* for pass1 */
00182   LOGPROB *bo_wt_1;             ///< LOGPROB array used for pass1; presumably back-off weights
00183   LOGPROB *p_2;                 ///< LOGPROB array used for pass1; presumably 2-gram probabilities
00184   LOGPROB (*bigram_prob)(struct __ngram_info__ *, WORD_ID, WORD_ID); ///< Function pointer taking this structure and two word ids and returning a LOGPROB; presumably installed by bi_prob_func_set()
00185 
00186   BMALLOC_BASE *mroot;          ///< BMALLOC_BASE handle; presumably the allocator root for memory owned by this structure
00187 
00188 } NGRAM_INFO;
00189 
00190 
00191 /* Definitions for binary N-gram */
00192 
00194 #define BINGRAM_IDSTR "julius_bingram_v3"   ///< Identifier string for binary N-gram format version 3
00195 
00196 #define BINGRAM_IDSTR_V4 "julius_bingram_v4"   ///< Identifier string for binary N-gram format version 4
00197 
00198 #define BINGRAM_IDSTR_V5 "julius_bingram_v5"   ///< Identifier string for binary N-gram format version 5
00199 
00200 #define BINGRAM_HDSIZE 512   ///< Presumably the size in bytes of the binary N-gram header -- TODO confirm
00201 
00202 #define BINGRAM_SIZESTR_HEAD "word="   ///< Key prefix of the word-size descriptor string
00203 
00204 #define BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)"   ///< Word-size descriptor value for 4-byte word ids
00205 
00206 #define BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)"   ///< Word-size descriptor value for 2-byte word ids
00207 #ifdef WORDS_INT   /* BINGRAM_SIZESTR_BODY is the 4-byte descriptor when WORDS_INT is defined, else the 2-byte one */
00208 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_4BYTE
00209 #else
00210 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE
00211 #endif
00212 
00213 #define BINGRAM_BYTEORDER_HEAD "byteorder="   ///< Key prefix of the byte-order descriptor string
00214 
00215 #ifdef WORDS_BIGENDIAN   /* BINGRAM_NATURAL_BYTEORDER is "BE" when WORDS_BIGENDIAN is defined, else "LE" */
00216 #define BINGRAM_NATURAL_BYTEORDER "BE"
00217 #else
00218 #define BINGRAM_NATURAL_BYTEORDER "LE"
00219 #endif
00220 
00221 
00222 #ifdef __cplusplus
00223 extern "C" {
00224 #endif
00225 
00226 /* function declaration */
00227 NNID search_ngram(NGRAM_INFO *ndata, int n, WORD_ID *w); /* Returns an NNID for the word-id array w; presumably the id of the n-gram entry, or NNID_INVALID when absent -- confirm in the implementation */
00228 LOGPROB ngram_prob(NGRAM_INFO *ndata, int n, WORD_ID *w); /* Returns a LOGPROB for the word-id array w; presumably the n-gram log probability of the n words in w */
00229 LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w); /* Returns a LOGPROB for the single word id w; presumably its 1-gram log probability */
00230 LOGPROB bi_prob(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2); /* Returns a LOGPROB for the word-id pair (w1, w2); presumably its 2-gram log probability */
00231 void bi_prob_func_set(NGRAM_INFO *ndata); /* Presumably sets ndata->bigram_prob to the appropriate bigram function -- confirm in the implementation */
00232 
00233 boolean ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition); /* Presumably reads an ARPA-format N-gram from fp into ndata; the meaning of addition is not visible here (cf. init_ngram_arpa_additional()); boolean result presumably signals success */
00234 boolean ngram_read_bin(FILE *fp, NGRAM_INFO *ndata); /* Presumably reads a binary N-gram from fp into ndata; boolean result presumably signals success */
00235 boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *header_str); /* Presumably writes ndata to fp in binary N-gram format with header_str placed in the header; boolean result presumably signals success */
00236 
00237 boolean ngram_compact_context(NGRAM_INFO *ndata, int n); /* Presumably compacts the context index of the n-gram data in ndata (cf. NGRAM_TUPLE_INFO::ct_compaction) -- confirm in the implementation */
00238 
00239 void ngram_make_lookup_tree(NGRAM_INFO *ndata); /* Presumably builds the word lookup tree stored in ndata->root */
00240 WORD_ID ngram_lookup_word(NGRAM_INFO *ndata, char *wordstr); /* Returns a WORD_ID for the string wordstr; the value returned for an unknown string is not visible here */
00241 WORD_ID make_ngram_ref(NGRAM_INFO *, char *); /* Returns a WORD_ID for the given string; parameters are unnamed here. NOTE(review): how this differs from ngram_lookup_word() is not visible here */
00242 
00243 NGRAM_INFO *ngram_info_new(void); /* Takes no arguments and returns a pointer to an NGRAM_INFO; presumably a newly allocated one to be released with ngram_info_free() -- confirm in the implementation. Declared with (void) so that this is a real prototype and calls passing arguments are diagnosed. */
00244 void ngram_info_free(NGRAM_INFO *ngram); /* Presumably releases ngram and the memory it owns (counterpart of ngram_info_new()) */
00245 boolean init_ngram_bin(NGRAM_INFO *ndata, char *ngram_file); /* Presumably loads the binary N-gram file named ngram_file into ndata; boolean result presumably signals success */
00246 boolean init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir); /* Presumably loads the ARPA N-gram file named ngram_file into ndata; dir presumably corresponds to NGRAM_INFO::dir; boolean result presumably signals success */
00247 boolean init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file); /* Presumably loads an additional ARPA file named bigram_file into an already initialized ndata; boolean result presumably signals success */
00248 void set_unknown_id(NGRAM_INFO *ndata, char *str); /* Presumably sets the unknown-word id of ndata (NGRAM_INFO::unk_id) from the word string str */
00249 
00250 void print_ngram_info(FILE *fp, NGRAM_INFO *ndata); /* Presumably prints a description of ndata to fp */
00251 
00252 #include <sent/vocabulary.h>
00253 boolean make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo); /* Takes the N-gram data and a WORD_INFO (from <sent/vocabulary.h>); presumably builds the mapping between the vocabulary in winfo and the N-gram words; boolean result presumably signals success */
00254 void fix_uniprob_srilm(NGRAM_INFO *ndata, WORD_INFO *winfo); /* Takes the N-gram data and a WORD_INFO; by name, adjusts unigram probabilities for SRILM-made models -- NOTE(review): exact adjustment is not visible here */
00255 
00256 #ifdef __cplusplus
00257 }
00258 #endif
00259 
00260 #endif /* __SENT_NGRAM2_H__ */