Julius 4.2
libsent/src/dfa/cpair.c
説明を見る。
00001 
00024 /*
00025  * Copyright (c) 1991-2011 Kawahara Lab., Kyoto University
00026  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
00027  * Copyright (c) 2005-2011 Julius project team, Nagoya Institute of Technology
00028  * All rights reserved
00029  */
00030 
00031 #include <sent/stddefs.h>
00032 #include <sent/dfa.h>
00033 
00046 static int
00047 cp_find(int *list, int len, int key, int *loc)
00048 {
00049   int left, right, mid;
00050   int ret;
00051   
00052   if (len == 0) {
00053     *loc = 0;
00054     return -1;
00055   }
00056 
00057   left = 0;
00058   right = len - 1;
00059   while (left < right) {
00060     mid = (left + right) / 2;
00061     if (list[mid] < key) {
00062       left = mid + 1;
00063     } else {
00064       right = mid;
00065     }
00066   }
00067   if (list[left] == key) {
00068     *loc = left;
00069     ret = left;
00070   } else {
00071     ret = -1;
00072     if (list[left] > key) {
00073       *loc = left;
00074     } else {
00075       *loc = left + 1;
00076     }
00077   }
00078   return ret;
00079 }
00080 
00090 boolean
00091 dfa_cp(DFA_INFO *dfa, int i, int j)
00092 {
00093   int loc;
00094 
00095   /*return(dfa->cp[i][j]);*/
00096   //return((dfa->cp[i][j>>3] & cp_table[j&7]) ? TRUE : FALSE);
00097   return(cp_find(dfa->cp[i], dfa->cplen[i], j, &loc) != -1 ? TRUE : FALSE);
00098 }
00099 
00108 boolean
00109 dfa_cp_begin(DFA_INFO *dfa, int i)
00110 {
00111   int loc;
00112   /*return(dfa->cp_begin[i]);*/
00113   //return((dfa->cp_begin[i>>3] & cp_table[i&7]) ? TRUE : FALSE);
00114   return(cp_find(dfa->cp_begin, dfa->cp_begin_len, i, &loc) != -1 ? TRUE : FALSE);
00115 }
00116 
00125 boolean
00126 dfa_cp_end(DFA_INFO *dfa, int i)
00127 {
00128   int loc;
00129   /*return(dfa->cp_end[i]);*/
00130   //return((dfa->cp_end[i>>3] & cp_table[i&7]) ? TRUE : FALSE);
00131   return(cp_find(dfa->cp_end, dfa->cp_end_len, i, &loc) != -1 ? TRUE : FALSE);
00132 }
00133 
00146 static boolean
00147 cp_add(int **list, int *len, int *alloclen, int val, int loc)
00148 {
00149   if (loc > *len) {
00150     jlog("InternalError: skip?\n");
00151     return FALSE;
00152   }
00153   if (*len + 1 > *alloclen) {
00154     /* expand cplist if necessary */
00155     *alloclen *= 2;
00156     *list = (int *)myrealloc(*list, sizeof(int) * (*alloclen));
00157   }
00158   if (loc < *len) {
00159     memmove(&((*list)[loc+1]), &((*list)[loc]), sizeof(int) * (*len - loc));
00160   }
00161   (*list)[loc] = val;
00162   (*len)++;
00163   return TRUE;
00164 }
00165 static boolean
00176 cp_remove(int **list, int *len, int loc)
00177 {
00178   if (*len == 0) return TRUE;
00179   if (loc >= *len) {
00180     jlog("InternalError: skip?\n");
00181     return FALSE;
00182   }
00183   if (loc < *len - 1) {
00184     memmove(&((*list)[loc]), &((*list)[loc+1]), sizeof(int) * (*len - loc - 1));
00185   }
00186   (*len) --;
00187   return TRUE;
00188 }
00189 
00190 
00199 void
00200 set_dfa_cp(DFA_INFO *dfa, int i, int j, boolean value)
00201 {
00202   int loc;
00203   if (value) {
00204     /* add j to cp list of i */
00205     if (cp_find(dfa->cp[i], dfa->cplen[i], j, &loc) == -1) { /* not exist */
00206       cp_add(&(dfa->cp[i]), &(dfa->cplen[i]), &(dfa->cpalloclen[i]), j, loc);
00207     }
00208   } else {
00209     /* remove j from cp list of i */
00210     if (cp_find(dfa->cp[i], dfa->cplen[i], j, &loc) != -1) { /* already exist */
00211       cp_remove(&(dfa->cp[i]), &(dfa->cplen[i]), loc);
00212     }
00213   }
00214 }
00215 
00224 void
00225 set_dfa_cp_begin(DFA_INFO *dfa, int i, boolean value)
00226 {
00227   int loc;
00228 
00229   if (value) {
00230     /* add j to cp list of i */
00231     if (cp_find(dfa->cp_begin, dfa->cp_begin_len, i, &loc) == -1) { /* not exist */
00232       cp_add(&(dfa->cp_begin), &(dfa->cp_begin_len), &(dfa->cp_begin_alloclen), i, loc);
00233     }
00234   } else {
00235     /* remove j from cp list of i */
00236     if (cp_find(dfa->cp_begin, dfa->cp_begin_len, i, &loc) != -1) { /* already exist */
00237       cp_remove(&(dfa->cp_begin), &(dfa->cp_begin_len), loc);
00238     }
00239   }
00240 }
00241 
00250 void
00251 set_dfa_cp_end(DFA_INFO *dfa, int i, boolean value)
00252 {
00253   int loc;
00254 
00255   if (value) {
00256     /* add j to cp list of i */
00257     if (cp_find(dfa->cp_end, dfa->cp_end_len, i, &loc) == -1) { /* not exist */
00258       cp_add(&(dfa->cp_end), &(dfa->cp_end_len), &(dfa->cp_end_alloclen), i, loc);
00259     }
00260   } else {
00261     /* remove j from cp list of i */
00262     if (cp_find(dfa->cp_end, dfa->cp_end_len, i, &loc) != -1) { /* already exist */
00263       cp_remove(&(dfa->cp_end), &(dfa->cp_end_len), loc);
00264     }
00265   }
00266 }
00267 
00273 void
00274 init_dfa_cp(DFA_INFO *dfa)
00275 {
00276   dfa->cp = NULL;
00277 }
00278 
00286 void
00287 malloc_dfa_cp(DFA_INFO *dfa, int term_num, int size)
00288 {
00289   int i;
00290 
00291   dfa->cp = (int **)mymalloc(sizeof(int *) * term_num);
00292   dfa->cplen = (int *)mymalloc(sizeof(int) * term_num);
00293   dfa->cpalloclen = (int *)mymalloc(sizeof(int) * term_num);
00294   for(i=0;i<term_num;i++) {
00295     dfa->cp[i] = (int *)mymalloc(sizeof(int) * size);
00296     dfa->cpalloclen[i] = size;
00297     dfa->cplen[i] = 0;
00298   }
00299   dfa->cp_begin = (int *)mymalloc(sizeof(int) * size);
00300   dfa->cp_begin_alloclen = size;
00301   dfa->cp_begin_len = 0;
00302   dfa->cp_end = (int *)mymalloc(sizeof(int) * size);
00303   dfa->cp_end_alloclen = size;
00304   dfa->cp_end_len = 0;
00305 }
00306 
00318 boolean
00319 dfa_cp_append(DFA_INFO *dfa, DFA_INFO *src, int offset)
00320 {
00321   int i, j, size;
00322 
00323   if (dfa->cp == NULL) {
00324     /* no category pair information exist on target */
00325     if (dfa->term_num != src->term_num) {
00326       jlog("InternalError: dfa_cp_append\n");
00327       return FALSE;
00328     }
00329     /* just malloc and copy */
00330     dfa->cp = (int **)mymalloc(sizeof(int *) * dfa->term_num);
00331     dfa->cplen = (int *)mymalloc(sizeof(int) * dfa->term_num);
00332     dfa->cpalloclen = (int *)mymalloc(sizeof(int) * dfa->term_num);
00333     for(i=0;i<dfa->term_num;i++) {
00334       size = src->cplen[i];
00335       if (size < DFA_CP_MINSTEP) size = DFA_CP_MINSTEP;
00336       dfa->cp[i] = (int *)mymalloc(sizeof(int) * size);
00337       dfa->cpalloclen[i] = size;
00338       memcpy(dfa->cp[i], src->cp[i], sizeof(int) * src->cplen[i]);
00339       dfa->cplen[i] = src->cplen[i];
00340     }
00341     size = src->cp_begin_len;
00342     if (size < DFA_CP_MINSTEP) size = DFA_CP_MINSTEP;
00343     dfa->cp_begin = (int *)mymalloc(sizeof(int) * size);
00344     dfa->cp_begin_alloclen = size;
00345     memcpy(dfa->cp_begin, src->cp_begin, sizeof(int) * src->cp_begin_len);
00346     dfa->cp_begin_len = src->cp_begin_len;
00347     size = src->cp_end_len;
00348     if (size < DFA_CP_MINSTEP) size = DFA_CP_MINSTEP;
00349     dfa->cp_end = (int *)mymalloc(sizeof(int) * size);
00350     dfa->cp_end_alloclen = size;
00351     memcpy(dfa->cp_end, src->cp_end, sizeof(int) * src->cp_end_len);
00352     dfa->cp_end_len = src->cp_end_len;
00353     return TRUE;
00354   }
00355   /* expand index */
00356   dfa->cp = (int **)myrealloc(dfa->cp, sizeof(int *) * dfa->term_num);
00357   dfa->cplen = (int *)myrealloc(dfa->cplen, sizeof(int) * dfa->term_num);
00358   dfa->cpalloclen = (int *)myrealloc(dfa->cpalloclen, sizeof(int) * dfa->term_num);
00359   /* copy src->cp[i][j] to target->cp[i+offset][j+offset] */
00360   for(i=offset;i<dfa->term_num;i++) {
00361     size = src->cplen[i-offset];
00362     if (size < DFA_CP_MINSTEP) size = DFA_CP_MINSTEP;
00363     dfa->cp[i] = (int *)mymalloc(sizeof(int) * size);
00364     dfa->cpalloclen[i] = size;
00365     for(j=0;j<src->cplen[i-offset];j++) {
00366       dfa->cp[i][j] = src->cp[i-offset][j] + offset;
00367     }
00368     dfa->cplen[i] = src->cplen[i-offset];
00369   }
00370   if (dfa->cp_begin_alloclen < dfa->cp_begin_len + src->cp_begin_len) {
00371     dfa->cp_begin_alloclen = dfa->cp_begin_len + src->cp_begin_len;
00372     dfa->cp_begin = (int *)myrealloc(dfa->cp_begin, sizeof(int) * dfa->cp_begin_alloclen);
00373   }
00374   for(j=0;j<src->cp_begin_len;j++) {
00375     dfa->cp_begin[dfa->cp_begin_len + j] = src->cp_begin[j] + offset;
00376   }
00377   dfa->cp_begin_len += src->cp_begin_len;
00378   if (dfa->cp_end_alloclen < dfa->cp_end_len + src->cp_end_len) {
00379     dfa->cp_end_alloclen = dfa->cp_end_len + src->cp_end_len;
00380     dfa->cp_end = (int *)myrealloc(dfa->cp_end, sizeof(int) * dfa->cp_end_alloclen);
00381   }
00382   for(j=0;j<src->cp_end_len;j++) {
00383     dfa->cp_end[dfa->cp_end_len + j] = src->cp_end[j] + offset;
00384   }
00385   dfa->cp_end_len += src->cp_end_len;
00386 
00387   return TRUE;
00388 }
00389 
00395 void
00396 free_dfa_cp(DFA_INFO *dfa)
00397 {
00398   int i;
00399 
00400   if (dfa->cp != NULL) {
00401     free(dfa->cp_end);
00402     free(dfa->cp_begin);
00403     for(i=0;i<dfa->term_num;i++) free(dfa->cp[i]);
00404     free(dfa->cpalloclen);
00405     free(dfa->cplen);
00406     free(dfa->cp);
00407     dfa->cp = NULL;
00408   }
00409 }
00410 
00411 void
00412 dfa_cp_output_rawdata(FILE *fp, DFA_INFO *dfa)
00413 {
00414   int i, j;
00415 
00416   for(i=0;i<dfa->term_num;i++) {
00417     fprintf(fp, "%d:", i);
00418     for(j=0;j<dfa->cplen[i];j++) {
00419       fprintf(fp, " %d", dfa->cp[i][j]);
00420     }
00421     fprintf(fp, "\n");
00422   }
00423   fprintf(fp, "bgn:");
00424   for(j=0;j<dfa->cp_begin_len;j++) {
00425     fprintf(fp, " %d", dfa->cp_begin[j]);
00426   }
00427   fprintf(fp, "\n");
00428   fprintf(fp, "end:");
00429   for(j=0;j<dfa->cp_end_len;j++) {
00430     fprintf(fp, " %d", dfa->cp_end[j]);
00431   }
00432   fprintf(fp, "\n");
00433 }
00434 
00435 void
00436 dfa_cp_count_size(DFA_INFO *dfa, unsigned long *size_ret, unsigned long *allocsize_ret)
00437 {
00438   int i;
00439   unsigned long size, allocsize;
00440 
00441   size = 0;
00442   allocsize = 0;
00443   for(i=0;i<dfa->term_num;i++) {
00444     size += sizeof(int) * dfa->cplen[i];
00445     allocsize += sizeof(int) * dfa->cpalloclen[i];
00446   }
00447   size += sizeof(int) * dfa->cp_begin_len;
00448   allocsize += sizeof(int) * dfa->cp_begin_alloclen;
00449   size += sizeof(int) * dfa->cp_end_len;
00450   allocsize += sizeof(int) * dfa->cp_end_alloclen;
00451 
00452   allocsize += (sizeof(int *) + sizeof(int) + sizeof(int)) * dfa->term_num;
00453 
00454   *size_ret = size;
00455   *allocsize_ret = allocsize;
00456 }