| OLD | NEW | 
 | (Empty) | 
|    1 /* |  | 
|    2 ** 2007 June 22 |  | 
|    3 ** |  | 
|    4 ** The author disclaims copyright to this source code.  In place of |  | 
|    5 ** a legal notice, here is a blessing: |  | 
|    6 ** |  | 
|    7 **    May you do good and not evil. |  | 
|    8 **    May you find forgiveness for yourself and forgive others. |  | 
|    9 **    May you share freely, never taking more than you give. |  | 
|   10 ** |  | 
|   11 ************************************************************************* |  | 
|   12 ** This file implements a tokenizer for fts2 based on the ICU library. |  | 
|   13 **  |  | 
|   14 ** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $ |  | 
|   15 */ |  | 
|   16  |  | 
|   17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) |  | 
|   18 #ifdef SQLITE_ENABLE_ICU |  | 
|   19  |  | 
|   20 #include <assert.h> |  | 
|   21 #include <string.h> |  | 
|   22 #include "fts2_tokenizer.h" |  | 
|   23  |  | 
|   24 #include <unicode/ubrk.h> |  | 
|   25 #include <unicode/ucol.h> |  | 
|   26 #include <unicode/ustring.h> |  | 
|   27 #include <unicode/utf16.h> |  | 
|   28  |  | 
|   29 typedef struct IcuTokenizer IcuTokenizer; |  | 
|   30 typedef struct IcuCursor IcuCursor; |  | 
|   31  |  | 
|   32 struct IcuTokenizer { |  | 
|   33   sqlite3_tokenizer base; |  | 
|   34   char *zLocale; |  | 
|   35 }; |  | 
|   36  |  | 
|   37 struct IcuCursor { |  | 
|   38   sqlite3_tokenizer_cursor base; |  | 
|   39  |  | 
|   40   UBreakIterator *pIter;      /* ICU break-iterator object */ |  | 
|   41   int nChar;                  /* Number of UChar elements in pInput */ |  | 
|   42   UChar *aChar;               /* Copy of input using utf-16 encoding */ |  | 
|   43   int *aOffset;               /* Offsets of each character in utf-8 input */ |  | 
|   44  |  | 
|   45   int nBuffer; |  | 
|   46   char *zBuffer; |  | 
|   47  |  | 
|   48   int iToken; |  | 
|   49 }; |  | 
|   50  |  | 
|   51 /* |  | 
|   52 ** Create a new tokenizer instance. |  | 
|   53 */ |  | 
|   54 static int icuCreate( |  | 
|   55   int argc,                            /* Number of entries in argv[] */ |  | 
|   56   const char * const *argv,            /* Tokenizer creation arguments */ |  | 
|   57   sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */ |  | 
|   58 ){ |  | 
|   59   IcuTokenizer *p; |  | 
|   60   int n = 0; |  | 
|   61  |  | 
|   62   if( argc>0 ){ |  | 
|   63     n = strlen(argv[0])+1; |  | 
|   64   } |  | 
|   65   p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n); |  | 
|   66   if( !p ){ |  | 
|   67     return SQLITE_NOMEM; |  | 
|   68   } |  | 
|   69   memset(p, 0, sizeof(IcuTokenizer)); |  | 
|   70  |  | 
|   71   if( n ){ |  | 
|   72     p->zLocale = (char *)&p[1]; |  | 
|   73     memcpy(p->zLocale, argv[0], n); |  | 
|   74   } |  | 
|   75  |  | 
|   76   *ppTokenizer = (sqlite3_tokenizer *)p; |  | 
|   77  |  | 
|   78   return SQLITE_OK; |  | 
|   79 } |  | 
|   80  |  | 
|   81 /* |  | 
|   82 ** Destroy a tokenizer |  | 
|   83 */ |  | 
|   84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){ |  | 
|   85   IcuTokenizer *p = (IcuTokenizer *)pTokenizer; |  | 
|   86   sqlite3_free(p); |  | 
|   87   return SQLITE_OK; |  | 
|   88 } |  | 
|   89  |  | 
|   90 /* |  | 
|   91 ** Prepare to begin tokenizing a particular string.  The input |  | 
|   92 ** string to be tokenized is pInput[0..nBytes-1].  A cursor |  | 
|   93 ** used to incrementally tokenize this string is returned in  |  | 
|   94 ** *ppCursor. |  | 
|   95 */ |  | 
|   96 static int icuOpen( |  | 
|   97   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */ |  | 
|   98   const char *zInput,                    /* Input string */ |  | 
|   99   int nInput,                            /* Length of zInput in bytes */ |  | 
|  100   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */ |  | 
|  101 ){ |  | 
|  102   IcuTokenizer *p = (IcuTokenizer *)pTokenizer; |  | 
|  103   IcuCursor *pCsr; |  | 
|  104  |  | 
|  105   const int32_t opt = U_FOLD_CASE_DEFAULT; |  | 
|  106   UErrorCode status = U_ZERO_ERROR; |  | 
|  107   int nChar; |  | 
|  108  |  | 
|  109   UChar32 c; |  | 
|  110   int iInput = 0; |  | 
|  111   int iOut = 0; |  | 
|  112  |  | 
|  113   *ppCursor = 0; |  | 
|  114  |  | 
|  115   if( nInput<0 ){ |  | 
|  116     nInput = strlen(zInput); |  | 
|  117   } |  | 
|  118   nChar = nInput+1; |  | 
|  119   pCsr = (IcuCursor *)sqlite3_malloc( |  | 
|  120       sizeof(IcuCursor) +                /* IcuCursor */ |  | 
|  121       nChar * sizeof(UChar) +            /* IcuCursor.aChar[] */ |  | 
|  122       (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */ |  | 
|  123   ); |  | 
|  124   if( !pCsr ){ |  | 
|  125     return SQLITE_NOMEM; |  | 
|  126   } |  | 
|  127   memset(pCsr, 0, sizeof(IcuCursor)); |  | 
|  128   pCsr->aChar = (UChar *)&pCsr[1]; |  | 
|  129   pCsr->aOffset = (int *)&pCsr->aChar[nChar]; |  | 
|  130  |  | 
|  131   pCsr->aOffset[iOut] = iInput; |  | 
|  132   U8_NEXT(zInput, iInput, nInput, c);  |  | 
|  133   while( c>0 ){ |  | 
|  134     int isError = 0; |  | 
|  135     c = u_foldCase(c, opt); |  | 
|  136     U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); |  | 
|  137     if( isError ){ |  | 
|  138       sqlite3_free(pCsr); |  | 
|  139       return SQLITE_ERROR; |  | 
|  140     } |  | 
|  141     pCsr->aOffset[iOut] = iInput; |  | 
|  142  |  | 
|  143     if( iInput<nInput ){ |  | 
|  144       U8_NEXT(zInput, iInput, nInput, c); |  | 
|  145     }else{ |  | 
|  146       c = 0; |  | 
|  147     } |  | 
|  148   } |  | 
|  149  |  | 
|  150   pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); |  | 
|  151   if( !U_SUCCESS(status) ){ |  | 
|  152     sqlite3_free(pCsr); |  | 
|  153     return SQLITE_ERROR; |  | 
|  154   } |  | 
|  155   pCsr->nChar = iOut; |  | 
|  156  |  | 
|  157   ubrk_first(pCsr->pIter); |  | 
|  158   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; |  | 
|  159   return SQLITE_OK; |  | 
|  160 } |  | 
|  161  |  | 
|  162 /* |  | 
|  163 ** Close a tokenization cursor previously opened by a call to icuOpen(). |  | 
|  164 */ |  | 
|  165 static int icuClose(sqlite3_tokenizer_cursor *pCursor){ |  | 
|  166   IcuCursor *pCsr = (IcuCursor *)pCursor; |  | 
|  167   ubrk_close(pCsr->pIter); |  | 
|  168   sqlite3_free(pCsr->zBuffer); |  | 
|  169   sqlite3_free(pCsr); |  | 
|  170   return SQLITE_OK; |  | 
|  171 } |  | 
|  172  |  | 
|  173 /* |  | 
|  174 ** Extract the next token from a tokenization cursor. |  | 
|  175 */ |  | 
|  176 static int icuNext( |  | 
|  177   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */ |  | 
|  178   const char **ppToken,               /* OUT: *ppToken is the token text */ |  | 
|  179   int *pnBytes,                       /* OUT: Number of bytes in token */ |  | 
|  180   int *piStartOffset,                 /* OUT: Starting offset of token */ |  | 
|  181   int *piEndOffset,                   /* OUT: Ending offset of token */ |  | 
|  182   int *piPosition                     /* OUT: Position integer of token */ |  | 
|  183 ){ |  | 
|  184   IcuCursor *pCsr = (IcuCursor *)pCursor; |  | 
|  185  |  | 
|  186   int iStart = 0; |  | 
|  187   int iEnd = 0; |  | 
|  188   int nByte = 0; |  | 
|  189  |  | 
|  190   while( iStart==iEnd ){ |  | 
|  191     UChar32 c; |  | 
|  192  |  | 
|  193     iStart = ubrk_current(pCsr->pIter); |  | 
|  194     iEnd = ubrk_next(pCsr->pIter); |  | 
|  195     if( iEnd==UBRK_DONE ){ |  | 
|  196       return SQLITE_DONE; |  | 
|  197     } |  | 
|  198  |  | 
|  199     while( iStart<iEnd ){ |  | 
|  200       int iWhite = iStart; |  | 
|  201       U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); |  | 
|  202       if( u_isspace(c) ){ |  | 
|  203         iStart = iWhite; |  | 
|  204       }else{ |  | 
|  205         break; |  | 
|  206       } |  | 
|  207     } |  | 
|  208     assert(iStart<=iEnd); |  | 
|  209   } |  | 
|  210  |  | 
|  211   do { |  | 
|  212     UErrorCode status = U_ZERO_ERROR; |  | 
|  213     if( nByte ){ |  | 
|  214       char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); |  | 
|  215       if( !zNew ){ |  | 
|  216         return SQLITE_NOMEM; |  | 
|  217       } |  | 
|  218       pCsr->zBuffer = zNew; |  | 
|  219       pCsr->nBuffer = nByte; |  | 
|  220     } |  | 
|  221  |  | 
|  222     u_strToUTF8( |  | 
|  223         pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */ |  | 
|  224         &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */ |  | 
|  225         &status                                  /* Output success/failure */ |  | 
|  226     ); |  | 
|  227   } while( nByte>pCsr->nBuffer ); |  | 
|  228  |  | 
|  229   *ppToken = pCsr->zBuffer; |  | 
|  230   *pnBytes = nByte; |  | 
|  231   *piStartOffset = pCsr->aOffset[iStart]; |  | 
|  232   *piEndOffset = pCsr->aOffset[iEnd]; |  | 
|  233   *piPosition = pCsr->iToken++; |  | 
|  234  |  | 
|  235   return SQLITE_OK; |  | 
|  236 } |  | 
|  237  |  | 
|  238 /* |  | 
|  239 ** The set of routines that implement the simple tokenizer |  | 
|  240 */ |  | 
|  241 static const sqlite3_tokenizer_module icuTokenizerModule = { |  | 
|  242   0,                           /* iVersion */ |  | 
|  243   icuCreate,                   /* xCreate  */ |  | 
|  244   icuDestroy,                  /* xCreate  */ |  | 
|  245   icuOpen,                     /* xOpen    */ |  | 
|  246   icuClose,                    /* xClose   */ |  | 
|  247   icuNext,                     /* xNext    */ |  | 
|  248 }; |  | 
|  249  |  | 
|  250 /* |  | 
|  251 ** Set *ppModule to point at the implementation of the ICU tokenizer. |  | 
|  252 */ |  | 
|  253 void sqlite3Fts2IcuTokenizerModule( |  | 
|  254   sqlite3_tokenizer_module const**ppModule |  | 
|  255 ){ |  | 
|  256   *ppModule = &icuTokenizerModule; |  | 
|  257 } |  | 
|  258  |  | 
|  259 #endif /* defined(SQLITE_ENABLE_ICU) */ |  | 
|  260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */ |  | 
| OLD | NEW |