Julius 4.2
|
00001 00101 /* 00102 * Copyright (c) 1991-2011 Kawahara Lab., Kyoto University 00103 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology 00104 * Copyright (c) 2005-2011 Julius project team, Nagoya Institute of Technology 00105 * All rights reserved 00106 */ 00107 00108 #ifndef __SENT_NGRAM2_H__ 00109 #define __SENT_NGRAM2_H__ 00110 00111 #include <sent/stddefs.h> 00112 #include <sent/ptree.h> 00113 00114 typedef unsigned int NNID; 00115 #define NNID_INVALID 0xffffffff ///< Value to indicate no id (full) 00116 #define NNID_MAX 0xfffffffe ///< Value of maximum value (full) 00117 00118 typedef unsigned char NNID_UPPER; 00119 typedef unsigned short NNID_LOWER; 00120 #define NNID_INVALID_UPPER 255 ///< Value to indicate no id at NNID_UPPER (24bit) 00121 #define NNID_MAX_24 16711679 ///< Allowed maximum number of id (255*65536-1) (24bit) 00122 00124 #define BEGIN_WORD_DEFAULT "<s>" 00125 00126 #define END_WORD_DEFAULT "</s>" 00127 00128 #define UNK_WORD_DEFAULT "<unk>" 00129 #define UNK_WORD_DEFAULT2 "<UNK>" 00130 00131 #define UNK_WORD_MAXLEN 30 00132 00137 typedef struct { 00138 NNID totalnum; 00139 boolean is24bit; 00140 NNID bgnlistlen; 00141 NNID_UPPER *bgn_upper; 00142 NNID_LOWER *bgn_lower; 00143 NNID *bgn; 00144 WORD_ID *num; 00145 00146 WORD_ID *nnid2wid; 00147 LOGPROB *prob; 00148 00149 NNID context_num; 00150 LOGPROB *bo_wt; 00151 boolean ct_compaction; 00152 NNID_UPPER *nnid2ctid_upper; 00153 NNID_LOWER *nnid2ctid_lower; 00154 00155 } NGRAM_TUPLE_INFO; 00156 00165 typedef struct __ngram_info__ { 00166 int n; 00167 int dir; 00168 boolean from_bin; 00169 boolean bigram_index_reversed; 00170 boolean bos_eos_swap; 00171 WORD_ID max_word_num; 00172 char **wname; 00173 PATNODE *root; 00174 WORD_ID unk_id; 00175 int unk_num; 00176 LOGPROB unk_num_log; 00177 boolean isopen; 00178 00179 NGRAM_TUPLE_INFO *d; 00180 00181 /* for pass1 */ 00182 LOGPROB *bo_wt_1; 00183 LOGPROB *p_2; 00184 LOGPROB (*bigram_prob)(struct __ngram_info__ *, WORD_ID, WORD_ID); 00185 00186 BMALLOC_BASE *mroot; 00187 00188 } NGRAM_INFO; 00189 00190 00191 /* Definitions for binary N-gram */ 00192 00194 #define BINGRAM_IDSTR "julius_bingram_v3" 00195 00196 #define BINGRAM_IDSTR_V4 "julius_bingram_v4" 00197 00198 #define BINGRAM_IDSTR_V5 "julius_bingram_v5" 00199 00200 #define BINGRAM_HDSIZE 512 00201 00202 #define BINGRAM_SIZESTR_HEAD "word=" 00203 00204 #define BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)" 00205 00206 #define BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)" 00207 #ifdef WORDS_INT 00208 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_4BYTE 00209 #else 00210 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE 00211 #endif 00212 00213 #define BINGRAM_BYTEORDER_HEAD "byteorder=" 00214 00215 #ifdef WORDS_BIGENDIAN 00216 #define BINGRAM_NATURAL_BYTEORDER "BE" 00217 #else 00218 #define BINGRAM_NATURAL_BYTEORDER "LE" 00219 #endif 00220 00221 00222 #ifdef __cplusplus 00223 extern "C" { 00224 #endif 00225 00226 /* function declaration */ 00227 NNID search_ngram(NGRAM_INFO *ndata, int n, WORD_ID *w); 00228 LOGPROB ngram_prob(NGRAM_INFO *ndata, int n, WORD_ID *w); 00229 LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w); 00230 LOGPROB bi_prob(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2); 00231 void bi_prob_func_set(NGRAM_INFO *ndata); 00232 00233 boolean ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition); 00234 boolean ngram_read_bin(FILE *fp, NGRAM_INFO *ndata); 00235 boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *header_str); 00236 00237 boolean ngram_compact_context(NGRAM_INFO *ndata, int n); 00238 00239 void ngram_make_lookup_tree(NGRAM_INFO *ndata); 00240 WORD_ID ngram_lookup_word(NGRAM_INFO *ndata, char *wordstr); 00241 WORD_ID make_ngram_ref(NGRAM_INFO *, char *); 00242 00243 NGRAM_INFO *ngram_info_new(); 00244 void ngram_info_free(NGRAM_INFO *ngram); 00245 boolean init_ngram_bin(NGRAM_INFO *ndata, char *ngram_file); 00246 boolean init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir); 00247 boolean init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file); 00248 void set_unknown_id(NGRAM_INFO *ndata, char *str); 00249 00250 void print_ngram_info(FILE *fp, NGRAM_INFO *ndata); 00251 00252 #include <sent/vocabulary.h> 00253 boolean make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo); 00254 void fix_uniprob_srilm(NGRAM_INFO *ndata, WORD_INFO *winfo); 00255 00256 #ifdef __cplusplus 00257 } 00258 #endif 00259 00260 #endif /* __SENT_NGRAM2_H__ */