Julius 4.1.5
|
00001 00101 /* 00102 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University 00103 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology 00104 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology 00105 * All rights reserved 00106 */ 00107 00108 #ifndef __SENT_NGRAM2_H__ 00109 #define __SENT_NGRAM2_H__ 00110 00111 #include <sent/stddefs.h> 00112 #include <sent/ptree.h> 00113 00114 typedef unsigned int NNID; 00115 #define NNID_INVALID 0xffffffff ///< Value to indicate no id (full) 00116 #define NNID_MAX 0xfffffffe ///< Value of maximum value (full) 00117 00118 typedef unsigned char NNID_UPPER; 00119 typedef unsigned short NNID_LOWER; 00120 #define NNID_INVALID_UPPER 255 ///< Value to indicate no id at NNID_UPPER (24bit) 00121 #define NNID_MAX_24 16711679 ///< Allowed maximum number of id (255*65536-1) (24bit) 00122 00124 #define BEGIN_WORD_DEFAULT "<s>" 00125 00126 #define END_WORD_DEFAULT "</s>" 00127 00128 #define UNK_WORD_DEFAULT "<unk>" 00129 #define UNK_WORD_DEFAULT2 "<UNK>" 00130 00131 #define UNK_WORD_MAXLEN 30 00132 00137 typedef struct { 00138 NNID totalnum; 00139 boolean is24bit; 00140 NNID bgnlistlen; 00141 NNID_UPPER *bgn_upper; 00142 NNID_LOWER *bgn_lower; 00143 NNID *bgn; 00144 WORD_ID *num; 00145 00146 WORD_ID *nnid2wid; 00147 LOGPROB *prob; 00148 00149 NNID context_num; 00150 LOGPROB *bo_wt; 00151 boolean ct_compaction; 00152 NNID_UPPER *nnid2ctid_upper; 00153 NNID_LOWER *nnid2ctid_lower; 00154 00155 } NGRAM_TUPLE_INFO; 00156 00165 typedef struct __ngram_info__ { 00166 int n; 00167 int dir; 00168 boolean from_bin; 00169 boolean bigram_index_reversed; 00170 boolean bos_eos_swap; 00171 WORD_ID max_word_num; 00172 char **wname; 00173 PATNODE *root; 00174 WORD_ID unk_id; 00175 int unk_num; 00176 LOGPROB unk_num_log; 00177 boolean isopen; 00178 00179 NGRAM_TUPLE_INFO *d; 00180 00181 /* for pass1 */ 00182 LOGPROB *bo_wt_1; 00183 LOGPROB *p_2; 00184 LOGPROB (*bigram_prob)(struct __ngram_info__ *, WORD_ID, WORD_ID); 00185 } NGRAM_INFO; 00186 00187 00188 /* Definitions for binary N-gram */ 00189 00191 #define BINGRAM_IDSTR "julius_bingram_v3" 00192 00193 #define BINGRAM_IDSTR_V4 "julius_bingram_v4" 00194 00195 #define BINGRAM_IDSTR_V5 "julius_bingram_v5" 00196 00197 #define BINGRAM_HDSIZE 512 00198 00199 #define BINGRAM_SIZESTR_HEAD "word=" 00200 00201 #define BINGRAM_SIZESTR_BODY_4BYTE "4byte(int)" 00202 00203 #define BINGRAM_SIZESTR_BODY_2BYTE "2byte(unsigned short)" 00204 #ifdef WORDS_INT 00205 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_4BYTE 00206 #else 00207 #define BINGRAM_SIZESTR_BODY BINGRAM_SIZESTR_BODY_2BYTE 00208 #endif 00209 00210 #define BINGRAM_BYTEORDER_HEAD "byteorder=" 00211 00212 #ifdef WORDS_BIGENDIAN 00213 #define BINGRAM_NATURAL_BYTEORDER "BE" 00214 #else 00215 #define BINGRAM_NATURAL_BYTEORDER "LE" 00216 #endif 00217 00218 00219 #ifdef __cplusplus 00220 extern "C" { 00221 #endif 00222 00223 /* function declaration */ 00224 NNID search_ngram(NGRAM_INFO *ndata, int n, WORD_ID *w); 00225 LOGPROB ngram_prob(NGRAM_INFO *ndata, int n, WORD_ID *w); 00226 LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w); 00227 LOGPROB bi_prob(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2); 00228 void bi_prob_func_set(NGRAM_INFO *ndata); 00229 00230 boolean ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition); 00231 boolean ngram_read_bin(FILE *fp, NGRAM_INFO *ndata); 00232 boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *header_str); 00233 00234 boolean ngram_compact_context(NGRAM_INFO *ndata, int n); 00235 00236 void ngram_make_lookup_tree(NGRAM_INFO *ndata); 00237 WORD_ID ngram_lookup_word(NGRAM_INFO *ndata, char *wordstr); 00238 WORD_ID make_ngram_ref(NGRAM_INFO *, char *); 00239 00240 NGRAM_INFO *ngram_info_new(); 00241 void ngram_info_free(NGRAM_INFO *ngram); 00242 boolean init_ngram_bin(NGRAM_INFO *ndata, char *ngram_file); 00243 boolean init_ngram_arpa(NGRAM_INFO *ndata, char *ngram_file, int dir); 00244 boolean init_ngram_arpa_additional(NGRAM_INFO *ndata, char *bigram_file); 00245 void set_unknown_id(NGRAM_INFO *ndata, char *str); 00246 00247 void print_ngram_info(FILE *fp, NGRAM_INFO *ndata); 00248 00249 #include <sent/vocabulary.h> 00250 boolean make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo); 00251 void fix_uniprob_srilm(NGRAM_INFO *ndata, WORD_INFO *winfo); 00252 00253 #ifdef __cplusplus 00254 } 00255 #endif 00256 00257 #endif /* __SENT_NGRAM2_H__ */