Julius 4.2
|
00001 00046 /* 00047 * Copyright (c) 1991-2011 Kawahara Lab., Kyoto University 00048 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology 00049 * Copyright (c) 2005-2011 Julius project team, Nagoya Institute of Technology 00050 * All rights reserved 00051 */ 00052 00053 #include <sent/stddefs.h> 00054 #include <sent/ngram2.h> 00055 00056 static boolean need_swap; 00057 00058 #define wrt(A,B,C,D) if (wrtfunc(A,B,C,D) == FALSE) return FALSE 00059 00060 static unsigned int count; 00061 void 00062 reset_wrt_counter() 00063 { 00064 count = 0; 00065 } 00066 static unsigned int 00067 get_wrt_counter() 00068 { 00069 return count; 00070 } 00071 00072 00081 static boolean 00082 wrtfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum) 00083 { 00084 if (need_swap == TRUE && unitbyte != 1) { 00085 swap_bytes((char *)buf, unitbyte, unitnum); 00086 } 00087 if (myfwrite(buf, unitbyte, unitnum, fp) < unitnum) { 00088 jlog("Error: write_ngram_bin: failed to write %d bytes", unitbyte*unitnum); 00089 return FALSE; 00090 } 00091 if (need_swap == TRUE && unitbyte != 1) { 00092 swap_bytes((char *)buf, unitbyte, unitnum); 00093 } 00094 count += unitbyte * unitnum; 00095 return TRUE; 00096 } 00097 00106 static boolean 00107 write_header(FILE *fp, char *str) 00108 { 00109 char buf[BINGRAM_HDSIZE]; 00110 int i, totallen; 00111 00112 for(i=0;i<BINGRAM_HDSIZE;i++) buf[i] = EOF; 00113 totallen = strlen(BINGRAM_IDSTR_V5) + 1 + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1 + strlen(BINGRAM_BYTEORDER_HEAD) + strlen(BINGRAM_NATURAL_BYTEORDER) + 1 + strlen(str); 00114 if (totallen >= BINGRAM_HDSIZE) { 00115 jlog("Warning: write_bingram: header too long, last will be truncated\n"); 00116 i = strlen(str) - (totallen - BINGRAM_HDSIZE); 00117 str[i] = '\0'; 00118 } 00119 sprintf(buf, "%s\n%s%s %s%s\n%s", BINGRAM_IDSTR_V5, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER, str); 00120 wrt(fp, buf, 1, BINGRAM_HDSIZE); 00121 00122 return TRUE; 00123 } 00124 00134 boolean 00135 ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr) 00136 { 00137 int i,n; 00138 unsigned int len; 00139 int wlen; 00140 NGRAM_TUPLE_INFO *t; 00141 00142 reset_wrt_counter(); 00143 00144 /* write initial header */ 00145 if (write_header(fp, headerstr) == FALSE) return FALSE; 00146 00147 /* swap not needed any more */ 00148 need_swap = FALSE; 00149 00150 /* write some header info */ 00151 wrt(fp, &(ndata->n), sizeof(int), 1); 00152 wrt(fp, &(ndata->dir), sizeof(int), 1); 00153 wrt(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1); 00154 00155 /* write total info */ 00156 for(n=0;n<ndata->n;n++) { 00157 wrt(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1); 00158 /*jlog("ngram %d=%d\n",n+1,ndata->ngram_num[n]);*/ 00159 } 00160 00161 /* unk_*, isopen, max_word_num are set after read, so need not save */ 00162 00163 /* write wname */ 00164 wlen = 0; 00165 for(i=0;i<ndata->max_word_num;i++) { 00166 wlen += strlen(ndata->wname[i]) + 1; 00167 } 00168 wrt(fp, &wlen, sizeof(int), 1); 00169 for(i=0;i<ndata->max_word_num;i++) { 00170 wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1); /* include \0 */ 00171 } 00172 00173 /* write N-gram */ 00174 for(n=0;n<ndata->n;n++) { 00175 t = &(ndata->d[n]); 00176 00177 wrt(fp, &(t->is24bit), sizeof(boolean), 1); 00178 wrt(fp, &(t->ct_compaction), sizeof(boolean), 1); 00179 wrt(fp, &(t->bgnlistlen), sizeof(NNID), 1); 00180 wrt(fp, &(t->context_num), sizeof(NNID), 1); 00181 if (n > 0) { 00182 if (t->is24bit) { 00183 wrt(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen); 00184 wrt(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen); 00185 } else { 00186 wrt(fp, t->bgn, sizeof(NNID), t->bgnlistlen); 00187 } 00188 wrt(fp, t->num, sizeof(WORD_ID), t->bgnlistlen); 00189 wrt(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum); 00190 } 00191 wrt(fp, t->prob, sizeof(LOGPROB), t->totalnum); 00192 if (t->bo_wt) { 00193 i = 1; 00194 wrt(fp, &i, sizeof(int), 1); 00195 wrt(fp, t->bo_wt, sizeof(LOGPROB), t->context_num); 00196 } else { 00197 i = 0; 00198 wrt(fp, &i, sizeof(int), 1); 00199 } 00200 if (t->nnid2ctid_upper) { 00201 i = 1; 00202 wrt(fp, &i, sizeof(int), 1); 00203 wrt(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum); 00204 wrt(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum); 00205 } else { 00206 i = 0; 00207 wrt(fp, &i, sizeof(int), 1); 00208 } 00209 00210 } 00211 00212 /* write additional LR 2-gram */ 00213 if (ndata->bo_wt_1) { 00214 i = 1; 00215 wrt(fp, &i, sizeof(int), 1); 00216 wrt(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num); 00217 } else { 00218 i = 0; 00219 wrt(fp, &i, sizeof(int), 1); 00220 } 00221 if (ndata->p_2) { 00222 i = 1; 00223 wrt(fp, &i, sizeof(int), 1); 00224 wrt(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum); 00225 } else { 00226 i = 0; 00227 wrt(fp, &i, sizeof(int), 1); 00228 } 00229 00230 len = get_wrt_counter(); 00231 jlog("Stat: ngram_write_bin: wrote %lu bytes (%.1f MB)\n", len, len / 1048576.0); 00232 return TRUE; 00233 }