Julius 4.2
libsent/src/ngram/ngram_write_bin.c
説明を見る。
00001 
00046 /*
00047  * Copyright (c) 1991-2011 Kawahara Lab., Kyoto University
00048  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00049  * Copyright (c) 2005-2011 Julius project team, Nagoya Institute of Technology
00050  * All rights reserved
00051  */
00052 
00053 #include <sent/stddefs.h>
00054 #include <sent/ngram2.h>
00055 
00056 static boolean need_swap; 
00057 
00058 #define wrt(A,B,C,D) if (wrtfunc(A,B,C,D) == FALSE) return FALSE
00059 
/* Running total of bytes accounted for by successful writes. */
static unsigned int count;

/**
 * Reset the written-byte counter to zero.
 */
void
reset_wrt_counter(void)
{
  count = 0;
}

/**
 * Get the current value of the written-byte counter.
 *
 * @return the number of bytes counted since the last reset.
 */
static unsigned int
get_wrt_counter(void)
{
  return count;
}
00071      
00072 
00081 static boolean
00082 wrtfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum)
00083 {
00084   if (need_swap == TRUE && unitbyte != 1) {
00085     swap_bytes((char *)buf, unitbyte, unitnum);
00086   }
00087   if (myfwrite(buf, unitbyte, unitnum, fp) < unitnum) {
00088     jlog("Error: write_ngram_bin: failed to write %d bytes", unitbyte*unitnum);
00089     return FALSE;
00090   }
00091   if (need_swap == TRUE && unitbyte != 1) {
00092     swap_bytes((char *)buf, unitbyte, unitnum);
00093   }
00094   count += unitbyte * unitnum;
00095   return TRUE;
00096 }
00097 
00106 static boolean
00107 write_header(FILE *fp, char *str)
00108 {
00109   char buf[BINGRAM_HDSIZE];
00110   int i, totallen;
00111 
00112   for(i=0;i<BINGRAM_HDSIZE;i++) buf[i] = EOF;
00113   totallen = strlen(BINGRAM_IDSTR_V5) + 1 + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1 + strlen(BINGRAM_BYTEORDER_HEAD) + strlen(BINGRAM_NATURAL_BYTEORDER) + 1 + strlen(str);
00114   if (totallen >= BINGRAM_HDSIZE) {
00115     jlog("Warning: write_bingram: header too long, last will be truncated\n");
00116     i = strlen(str) - (totallen - BINGRAM_HDSIZE);
00117     str[i] = '\0';
00118   }
00119   sprintf(buf, "%s\n%s%s %s%s\n%s", BINGRAM_IDSTR_V5, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER, str);
00120   wrt(fp, buf, 1, BINGRAM_HDSIZE);
00121 
00122   return TRUE;
00123 }
00124 
00134 boolean
00135 ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr)
00136 {
00137   int i,n;
00138   unsigned int len;
00139   int wlen;
00140   NGRAM_TUPLE_INFO *t;
00141 
00142   reset_wrt_counter();
00143 
00144   /* write initial header */
00145   if (write_header(fp, headerstr) == FALSE) return FALSE;
00146 
00147   /* swap not needed any more */
00148   need_swap = FALSE;
00149 
00150   /* write some header info */
00151   wrt(fp, &(ndata->n), sizeof(int), 1);
00152   wrt(fp, &(ndata->dir), sizeof(int), 1);
00153   wrt(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);
00154 
00155   /* write total info */
00156   for(n=0;n<ndata->n;n++) {
00157     wrt(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
00158     /*jlog("ngram %d=%d\n",n+1,ndata->ngram_num[n]);*/
00159   }
00160 
00161   /* unk_*, isopen, max_word_num are set after read, so need not save */
00162 
00163   /* write wname */
00164   wlen = 0;
00165   for(i=0;i<ndata->max_word_num;i++) {
00166     wlen += strlen(ndata->wname[i]) + 1;
00167   }
00168   wrt(fp, &wlen, sizeof(int), 1);
00169   for(i=0;i<ndata->max_word_num;i++) {
00170     wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1); /* include \0 */
00171   }
00172 
00173   /* write N-gram */
00174   for(n=0;n<ndata->n;n++) {
00175     t = &(ndata->d[n]);
00176 
00177     wrt(fp, &(t->is24bit), sizeof(boolean), 1);
00178     wrt(fp, &(t->ct_compaction), sizeof(boolean), 1);
00179     wrt(fp, &(t->bgnlistlen), sizeof(NNID), 1);
00180     wrt(fp, &(t->context_num), sizeof(NNID), 1);
00181     if (n > 0) {
00182       if (t->is24bit) {
00183         wrt(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);
00184         wrt(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);
00185       } else {
00186         wrt(fp, t->bgn, sizeof(NNID), t->bgnlistlen);
00187       }
00188       wrt(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);
00189       wrt(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);
00190     }
00191     wrt(fp, t->prob, sizeof(LOGPROB), t->totalnum);
00192     if (t->bo_wt) {
00193       i = 1;
00194       wrt(fp, &i, sizeof(int), 1);
00195       wrt(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
00196     } else {
00197       i = 0;
00198       wrt(fp, &i, sizeof(int), 1);
00199     }
00200     if (t->nnid2ctid_upper) {
00201       i = 1;
00202       wrt(fp, &i, sizeof(int), 1);
00203       wrt(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum);
00204       wrt(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum);
00205     } else {
00206       i = 0;
00207       wrt(fp, &i, sizeof(int), 1);
00208     }
00209 
00210   }
00211 
00212   /* write additional LR 2-gram */
00213   if (ndata->bo_wt_1) {
00214     i = 1;
00215     wrt(fp, &i, sizeof(int), 1);
00216     wrt(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num);
00217   } else {
00218     i = 0;
00219     wrt(fp, &i, sizeof(int), 1);
00220   }
00221   if (ndata->p_2) {
00222     i = 1;
00223     wrt(fp, &i, sizeof(int), 1);
00224     wrt(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum);
00225   } else {
00226     i = 0;
00227     wrt(fp, &i, sizeof(int), 1);
00228   }
00229 
00230   len = get_wrt_counter();
00231   jlog("Stat: ngram_write_bin: wrote %lu bytes (%.1f MB)\n", len, len / 1048576.0);
00232   return TRUE;
00233 }