Julius 4.1.5
libsent/include/sent/ngram2.h
説明を見る。
00001 
00101 /*
00102  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
00103  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00104  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
00105  * All rights reserved
00106  */
00107 
00108 #ifndef __SENT_NGRAM2_H__
00109 #define __SENT_NGRAM2_H__
00110 
00111 #include <sent/stddefs.h>
00112 #include <sent/ptree.h>
00113 
00114 typedef unsigned int NNID;            ///< ID type in its full (unsplit) form; NNID_INVALID is the reserved no-id value
00115 #define NNID_INVALID 0xffffffff  ///< Value to indicate no id (full)
00116 #define NNID_MAX 0xfffffffe     ///< Largest usable id value (full); one below NNID_INVALID
00117 
00118 typedef unsigned char NNID_UPPER; ///< Upper part of an id in the split 24bit form; NNID_INVALID_UPPER is the reserved no-id value
00119 typedef unsigned short NNID_LOWER; ///< Lower part of an id in the split 24bit form (presumably id = upper * 65536 + lower; confirm)
00120 #define NNID_INVALID_UPPER 255  ///< Value to indicate no id at NNID_UPPER (24bit)
00121 #define NNID_MAX_24 16711679        ///< Allowed maximum number of id (255*65536-1) (24bit)
00122 
00124 #define BEGIN_WORD_DEFAULT "<s>"  ///< Default string for the begin word (presumably the sentence-start symbol; confirm at use sites)
00125 
00126 #define END_WORD_DEFAULT "</s>"  ///< Default string for the end word (presumably the sentence-end symbol; confirm at use sites)
00127 
00128 #define UNK_WORD_DEFAULT "<unk>"  ///< Default string for the unknown word, lower-case spelling
00129 #define UNK_WORD_DEFAULT2 "<UNK>"  ///< Second default string for the unknown word, upper-case spelling
00130 
00131 #define UNK_WORD_MAXLEN 30  ///< Length limit for an unknown-word string; NOTE(review): whether the terminator is counted is not visible here
00132 
00137 typedef struct {  /* Tuple tables referenced from NGRAM_INFO through its d member. Field notes are inferred from names and types; confirm against the implementation. */
00138   NNID totalnum;                ///< Count typed as NNID; presumably the number of tuples stored here
00139   boolean is24bit;              ///< Flag; presumably selects the split bgn_upper/bgn_lower arrays instead of bgn
00140   NNID bgnlistlen;              ///< Presumably the element count of the bgn* and num arrays
00141   NNID_UPPER *bgn_upper;        ///< Upper parts of the begin ids (split 24bit form)
00142   NNID_LOWER *bgn_lower;        ///< Lower parts of the begin ids (split 24bit form)
00143   NNID *bgn;                    ///< Begin ids in the full NNID form
00144   WORD_ID *num;         ///< WORD_ID-typed counts; presumably one per bgn entry
00145 
00146   WORD_ID *nnid2wid;            ///< Presumably maps an NNID to a WORD_ID
00147   LOGPROB *prob;                ///< Array of LOGPROB values; presumably one log probability per tuple
00148 
00149   NNID context_num;             ///< Presumably the number of context entries, i.e. the length of bo_wt
00150   LOGPROB *bo_wt;               ///< Presumably back-off weights (compare bo_wt_1 in NGRAM_INFO)
00151   boolean ct_compaction;        ///< Flag; presumably set when context compaction is in effect (see ngram_compact_context())
00152   NNID_UPPER *nnid2ctid_upper;  ///< Presumably maps an NNID to a context id, upper part
00153   NNID_LOWER *nnid2ctid_lower;  ///< Presumably maps an NNID to a context id, lower part
00154 
00155 } NGRAM_TUPLE_INFO;
00156 
00165 typedef struct __ngram_info__ {  /* Top-level N-gram model handle taken by every function declared in this header. Field notes are inferred from names and types; confirm against the implementation. */
00166   int n;                        ///< Presumably the N-gram order
00167   int dir;                      ///< Direction value; init_ngram_arpa() takes a dir argument as well (valid values not visible here)
00168   boolean from_bin;             ///< Flag; presumably set when the model came from a binary N-gram file
00169   boolean bigram_index_reversed;                ///< Flag; presumably marks a reversed 2-gram index layout (confirm in the reader)
00170   boolean bos_eos_swap;         ///< Flag; presumably requests swapping of the begin and end words
00171   WORD_ID max_word_num;         ///< Presumably the number of words in the N-gram vocabulary
00172   char **wname;                 ///< Array of strings; presumably word names indexed by WORD_ID
00173   PATNODE *root;                ///< Root node of a PATNODE tree; presumably the word lookup tree (see ngram_make_lookup_tree())
00174   WORD_ID unk_id;               ///< Presumably the WORD_ID of the unknown word (see set_unknown_id())
00175   int unk_num;                  ///< Presumably a count of unknown words
00176   LOGPROB unk_num_log;          ///< Presumably the logarithm of unk_num (base not visible here)
00177   boolean isopen;               ///< Flag; presumably marks an open-vocabulary setup
00178 
00179   NGRAM_TUPLE_INFO *d;  ///< Tuple tables; NOTE(review): likely an array, but length and indexing are not visible here
00180 
00181   /* for pass1 */
00182   LOGPROB *bo_wt_1;             ///< Presumably back-off weights used on pass 1
00183   LOGPROB *p_2;                 ///< Presumably 2-gram probabilities used on pass 1
00184   LOGPROB (*bigram_prob)(struct __ngram_info__ *, WORD_ID, WORD_ID); ///< Function pointer with the same signature as bi_prob(); presumably assigned by bi_prob_func_set()
00185 } NGRAM_INFO;
00186 
00187 
00188 /* Definitions for binary N-gram */
00189 
00191 #define BINGRAM_IDSTR "julius_bingram_v3"  ///< Identifier string naming version 3 of the binary N-gram format
00192 
00193 #define BINGRAM_IDSTR_V4 "julius_bingram_v4"  ///< Identifier string naming version 4 of the binary N-gram format
00194 
00195 #define BINGRAM_IDSTR_V5 "julius_bingram_v5"  ///< Identifier string naming version 5 of the binary N-gram format
00196 
00197 #define BINGRAM_HDSIZE 512  ///< Header size (presumably in bytes; confirm in the reader and writer)
00198 
00199 #define BINGRAM_SIZESTR_HEAD "word="  ///< Key prefix that precedes one of the BINGRAM_SIZESTR_BODY_* strings
00200 
00201 #define BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)"  ///< Size string for 4-byte word ids
00202 
00203 #define BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)"  ///< Size string for 2-byte word ids
00204 #ifdef WORDS_INT  /* BINGRAM_SIZESTR_BODY is the 4-byte string when WORDS_INT is defined, otherwise the 2-byte string */
00205 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_4BYTE
00206 #else
00207 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE
00208 #endif
00209 
00210 #define BINGRAM_BYTEORDER_HEAD "byteorder="  ///< Key prefix that precedes a byte order string such as BINGRAM_NATURAL_BYTEORDER
00211 
00212 #ifdef WORDS_BIGENDIAN  /* BINGRAM_NATURAL_BYTEORDER is BE when WORDS_BIGENDIAN is defined, otherwise LE */
00213 #define BINGRAM_NATURAL_BYTEORDER "BE"
00214 #else
00215 #define BINGRAM_NATURAL_BYTEORDER "LE"
00216 #endif
00217 
00218 
00219 #ifdef __cplusplus
00220 extern "C" {
00221 #endif
00222 
00223 /* function declaration (prototypes only; behaviour notes are inferred from names and must be confirmed against the implementations) */
00224 NNID search_ngram(NGRAM_INFO *ndata, int n, WORD_ID *w);  ///< Returns an NNID for the n word ids in w; presumably NNID_INVALID when nothing matches
00225 LOGPROB ngram_prob(NGRAM_INFO *ndata, int n, WORD_ID *w);  ///< Returns a LOGPROB for the n word ids in w; back-off handling is not visible here
00226 LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w);  ///< Returns a LOGPROB for the single word id w (presumably the 1-gram value)
00227 LOGPROB bi_prob(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2);  ///< Returns a LOGPROB for the word pair w1, w2; same signature as the bigram_prob member of NGRAM_INFO
00228 void bi_prob_func_set(NGRAM_INFO *ndata);  ///< Presumably assigns the bigram_prob member of ndata; confirm in the implementation
00229 
00230 boolean ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition);  ///< Presumably reads ARPA-format data from fp into ndata; result presumably reports success; meaning of addition to be confirmed
00231 boolean ngram_read_bin(FILE *fp, NGRAM_INFO *ndata);  ///< Presumably reads a binary N-gram (see the BINGRAM_* macros) from fp into ndata; result presumably reports success
00232 boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *header_str);  ///< Presumably writes ndata to fp in binary form with header_str placed in the header; result presumably reports success
00233 
00234 boolean ngram_compact_context(NGRAM_INFO *ndata, int n);  ///< Presumably compacts the context index of the n-gram tables (compare ct_compaction in NGRAM_TUPLE_INFO); confirm
00235 
00236 void ngram_make_lookup_tree(NGRAM_INFO *ndata);  ///< Presumably builds the word lookup tree stored in the root member of ndata; confirm
00237 WORD_ID ngram_lookup_word(NGRAM_INFO *ndata, char *wordstr);  ///< Returns a WORD_ID for wordstr; the value returned for a missing word is not visible here
00238 WORD_ID make_ngram_ref(NGRAM_INFO *, char *);  ///< Returns a WORD_ID for the given string; NOTE(review): parameters are unnamed and the difference from ngram_lookup_word() is not visible here
00239 
00240 NGRAM_INFO *ngram_info_new(void);  ///< Takes no arguments and returns an NGRAM_INFO pointer (presumably newly allocated; pair with ngram_info_free()). Declared with (void) so that C treats it as a full prototype and rejects stray arguments.
00241 void ngram_info_free(NGRAM_INFO *ngram);  ///< Presumably releases ngram and the data it owns; confirm ownership rules in the implementation
00242 boolean init_ngram_bin(NGRAM_INFO *ndata, char *ngram_file);  ///< Presumably loads the binary N-gram file named ngram_file into ndata; result presumably reports success
00243 boolean init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir);  ///< Presumably loads the ARPA file named ngram_file into ndata; dir presumably matches the dir member of NGRAM_INFO
00244 boolean init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file);  ///< Presumably loads a further ARPA file (parameter name suggests 2-gram data) into an already loaded ndata; confirm
00245 void set_unknown_id(NGRAM_INFO *ndata, char *str);  ///< Presumably sets the unk_id member of ndata from the word string str; confirm
00246 
00247 void print_ngram_info(FILE *fp, NGRAM_INFO *ndata);  ///< Presumably prints a summary of ndata to fp
00248 
00249 #include <sent/vocabulary.h>  /* presumably provides WORD_INFO, used by the two declarations that follow */
00250 boolean make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo);  ///< Presumably links the words in winfo to entries of ndata; result presumably reports success; confirm
00251 void fix_uniprob_srilm(NGRAM_INFO *ndata, WORD_INFO *winfo);  ///< Presumably adjusts 1-gram probabilities for models produced by SRILM; confirm what is changed
00252 
00253 #ifdef __cplusplus
00254 }
00255 #endif
00256 
00257 #endif /* __SENT_NGRAM2_H__ */