Index: third_party/sqlite/sqlite-src-3070603/ext/fts3/fts3_icu.c |
diff --git a/third_party/sqlite/sqlite-src-3070603/ext/fts3/fts3_icu.c b/third_party/sqlite/sqlite-src-3070603/ext/fts3/fts3_icu.c |
new file mode 100644 |
index 0000000000000000000000000000000000000000..85390d3b065b5c422fb07d2bdd69614893f56ccf |
--- /dev/null |
+++ b/third_party/sqlite/sqlite-src-3070603/ext/fts3/fts3_icu.c |
@@ -0,0 +1,260 @@ |
+/* |
+** 2007 June 22 |
+** |
+** The author disclaims copyright to this source code. In place of |
+** a legal notice, here is a blessing: |
+** |
+** May you do good and not evil. |
+** May you find forgiveness for yourself and forgive others. |
+** May you share freely, never taking more than you give. |
+** |
+************************************************************************* |
+** This file implements a tokenizer for fts3 based on the ICU library. |
+** |
+** $Id: fts3_icu.c,v 1.3 2008/09/01 18:34:20 danielk1977 Exp $ |
+*/ |
+ |
+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) |
+#ifdef SQLITE_ENABLE_ICU |
+ |
+#include <assert.h> |
+#include <string.h> |
+#include "fts3_tokenizer.h" |
+ |
+#include <unicode/ubrk.h> |
+#include <unicode/ucol.h> |
+#include <unicode/ustring.h> |
+#include <unicode/utf16.h> |
+ |
+typedef struct IcuTokenizer IcuTokenizer; |
+typedef struct IcuCursor IcuCursor; |
+ |
+struct IcuTokenizer { |
+ sqlite3_tokenizer base; /* Base class. Must be first: objects are cast to and from sqlite3_tokenizer */ |
+ char *zLocale; /* Locale name handed to ubrk_open(), or NULL if no argument was given */ |
+}; |
+ |
+struct IcuCursor { |
+ sqlite3_tokenizer_cursor base; /* Base class. Must be first: objects are cast to and from sqlite3_tokenizer_cursor */ |
+ |
+ UBreakIterator *pIter; /* ICU break-iterator object */ |
+ int nChar; /* Number of UChar elements in aChar */ |
+ UChar *aChar; /* Case-folded copy of input using utf-16 encoding */ |
+ int *aOffset; /* Byte offset in utf-8 input for each index of aChar */ |
+ |
+ int nBuffer; /* Allocated size of zBuffer in bytes */ |
+ char *zBuffer; /* Buffer holding the utf-8 text of the current token */ |
+ |
+ int iToken; /* Position number given to the next token returned */ |
+}; |
+ |
+/* |
+** Allocate and initialise a new ICU tokenizer object. |
+** |
+** If at least one creation argument is supplied, argv[0] is taken as the |
+** locale name. A copy of it (including the nul terminator) is stored in |
+** the same allocation, immediately after the IcuTokenizer structure, so |
+** that the whole object can later be released with a single free. |
+** |
+** Returns SQLITE_OK and writes the new object to *ppTokenizer, or returns |
+** SQLITE_NOMEM if the allocation fails. |
+*/ |
+static int icuCreate( |
+  int argc,                            /* Number of entries in argv[] */ |
+  const char * const *argv,            /* Tokenizer creation arguments */ |
+  sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */ |
+){ |
+  IcuTokenizer *pNew; |
+  /* Bytes needed for the locale copy, or 0 when no locale was given */ |
+  int nLocale = (argc>0) ? (int)(strlen(argv[0])+1) : 0; |
+ |
+  pNew = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+nLocale); |
+  if( pNew==0 ) return SQLITE_NOMEM; |
+  memset(pNew, 0, sizeof(IcuTokenizer)); |
+ |
+  if( nLocale ){ |
+    /* The locale string lives directly behind the structure */ |
+    pNew->zLocale = (char *)(pNew+1); |
+    memcpy(pNew->zLocale, argv[0], nLocale); |
+  } |
+ |
+  *ppTokenizer = (sqlite3_tokenizer *)pNew; |
+  return SQLITE_OK; |
+} |
+ |
+/* |
+** Release a tokenizer object. The object is a single allocation, so one |
+** call to sqlite3_free() is all that is required. Always returns SQLITE_OK. |
+*/ |
+static int icuDestroy(sqlite3_tokenizer *pTokenizer){ |
+  sqlite3_free((IcuTokenizer *)pTokenizer); |
+  return SQLITE_OK; |
+} |
+ |
+/* |
+** Prepare to begin tokenizing a particular string. The input |
+** string to be tokenized is zInput[0..nInput-1]. If nInput is |
+** negative, zInput is assumed to be nul-terminated and its length |
+** is measured with strlen(). A NULL zInput is treated as an empty |
+** string. A cursor used to incrementally tokenize this string is |
+** returned in *ppCursor. |
+** |
+** The whole input is case-folded and converted to utf-16 here, and |
+** aOffset[] records, for each utf-16 index, the matching byte offset |
+** in zInput so that token offsets can be reported against the |
+** original text. Conversion stops at the first embedded nul or |
+** malformed utf-8 sequence. |
+** |
+** Returns SQLITE_OK on success, SQLITE_NOMEM if the cursor cannot be |
+** allocated, and SQLITE_ERROR if the utf-16 buffer overflows or the |
+** ICU break iterator cannot be opened. *ppCursor is left set to 0 on |
+** any failure. |
+*/ |
+static int icuOpen( |
+  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */ |
+  const char *zInput,                    /* Input string */ |
+  int nInput,                            /* Length of zInput in bytes */ |
+  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */ |
+){ |
+  IcuTokenizer *p = (IcuTokenizer *)pTokenizer; |
+  IcuCursor *pCsr; |
+ |
+  const int32_t opt = U_FOLD_CASE_DEFAULT; |
+  UErrorCode status = U_ZERO_ERROR; |
+  int nChar;                      /* Capacity of aChar[] in UChar units */ |
+  int nCharPad;                   /* nChar rounded up to a multiple of 4 */ |
+ |
+  UChar32 c; |
+  int iInput = 0; |
+  int iOut = 0; |
+ |
+  *ppCursor = 0; |
+ |
+  if( zInput==0 ){ |
+    /* Never hand a NULL pointer to strlen() or U8_NEXT() */ |
+    zInput = ""; |
+    nInput = 0; |
+  }else if( nInput<0 ){ |
+    nInput = strlen(zInput); |
+  } |
+  nChar = nInput+1; |
+ |
+  /* aOffset[] is carved out of the same allocation directly after |
+  ** aChar[]. Pad aChar[] to a multiple of 4 UChars so that the int |
+  ** array which follows is correctly aligned even when nChar is odd. |
+  */ |
+  nCharPad = (nChar+3)&~3; |
+  pCsr = (IcuCursor *)sqlite3_malloc( |
+      sizeof(IcuCursor) +                /* IcuCursor */ |
+      nCharPad * sizeof(UChar) +         /* IcuCursor.aChar[] (padded) */ |
+      (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */ |
+  ); |
+  if( !pCsr ){ |
+    return SQLITE_NOMEM; |
+  } |
+  memset(pCsr, 0, sizeof(IcuCursor)); |
+  pCsr->aChar = (UChar *)&pCsr[1]; |
+  pCsr->aOffset = (int *)&pCsr->aChar[nCharPad]; |
+ |
+  /* Decode one code point at a time. The loop condition guarantees that |
+  ** no byte at or beyond zInput[nInput] is ever read, including when the |
+  ** input is empty. |
+  */ |
+  pCsr->aOffset[iOut] = iInput; |
+  while( iInput<nInput ){ |
+    int isError = 0; |
+    U8_NEXT(zInput, iInput, nInput, c); |
+    if( c<=0 ){ |
+      break;                      /* Embedded nul or malformed utf-8 */ |
+    } |
+    c = u_foldCase(c, opt); |
+    U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); |
+    if( isError ){ |
+      sqlite3_free(pCsr); |
+      return SQLITE_ERROR; |
+    } |
+    /* iOut now indexes the slot after this character; record where the |
+    ** next character starts in the utf-8 input. |
+    */ |
+    pCsr->aOffset[iOut] = iInput; |
+  } |
+ |
+  pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); |
+  if( !U_SUCCESS(status) ){ |
+    sqlite3_free(pCsr); |
+    return SQLITE_ERROR; |
+  } |
+  pCsr->nChar = iOut; |
+ |
+  ubrk_first(pCsr->pIter); |
+  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; |
+  return SQLITE_OK; |
+} |
+ |
+/* |
+** Close a tokenization cursor previously opened by a call to icuOpen(). |
+** Frees the token buffer, closes the ICU break iterator and then releases |
+** the cursor itself. Always returns SQLITE_OK. |
+*/ |
+static int icuClose(sqlite3_tokenizer_cursor *pCursor){ |
+  IcuCursor *pCur = (IcuCursor *)pCursor; |
+ |
+  sqlite3_free(pCur->zBuffer); |
+  ubrk_close(pCur->pIter); |
+  sqlite3_free(pCur); |
+ |
+  return SQLITE_OK; |
+} |
+ |
+/* |
+** Extract the next token from a tokenization cursor. |
+** |
+** Break-iterator segments are visited in order. Leading whitespace is |
+** stripped from each segment, and a segment that consists entirely of |
+** whitespace is skipped. The remaining utf-16 text is converted to utf-8 |
+** in pCsr->zBuffer, which is grown as needed and stays owned by the |
+** cursor; *ppToken points into that buffer. |
+** |
+** Returns SQLITE_OK with all OUT parameters set, SQLITE_DONE once the |
+** break iterator is exhausted, or SQLITE_NOMEM if the token buffer cannot |
+** be enlarged. |
+*/ |
+static int icuNext( |
+  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by icuOpen */ |
+  const char **ppToken,               /* OUT: *ppToken is the token text */ |
+  int *pnBytes,                       /* OUT: Number of bytes in token */ |
+  int *piStartOffset,                 /* OUT: Starting offset of token */ |
+  int *piEndOffset,                   /* OUT: Ending offset of token */ |
+  int *piPosition                     /* OUT: Position integer of token */ |
+){ |
+  IcuCursor *pCsr = (IcuCursor *)pCursor; |
+ |
+  int iStart = 0; |
+  int iEnd = 0; |
+  int nByte = 0; |
+ |
+  /* Loop until a segment with some non-whitespace content is found */ |
+  while( iStart==iEnd ){ |
+    UChar32 c; |
+ |
+    iStart = ubrk_current(pCsr->pIter); |
+    iEnd = ubrk_next(pCsr->pIter); |
+    if( iEnd==UBRK_DONE ){ |
+      return SQLITE_DONE; |
+    } |
+ |
+    /* Skip leading whitespace. aChar[] holds utf-16 code units, so it |
+    ** must be decoded with U16_NEXT; a utf-8 decoder would truncate each |
+    ** unit to a byte and misclassify non-ASCII characters. |
+    */ |
+    while( iStart<iEnd ){ |
+      int iWhite = iStart; |
+      U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); |
+      if( u_isspace(c) ){ |
+        iStart = iWhite; |
+      }else{ |
+        break; |
+      } |
+    } |
+    assert(iStart<=iEnd); |
+  } |
+ |
+  /* Convert the token to utf-8. The first pass may find the buffer too |
+  ** small, in which case nByte receives the required size and the loop |
+  ** runs again after enlarging the buffer. |
+  */ |
+  do { |
+    UErrorCode status = U_ZERO_ERROR; |
+    if( nByte ){ |
+      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); |
+      if( !zNew ){ |
+        return SQLITE_NOMEM; |
+      } |
+      pCsr->zBuffer = zNew; |
+      pCsr->nBuffer = nByte; |
+    } |
+ |
+    u_strToUTF8( |
+        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */ |
+        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */ |
+        &status                                  /* Output success/failure */ |
+    ); |
+  } while( nByte>pCsr->nBuffer ); |
+ |
+  *ppToken = pCsr->zBuffer; |
+  *pnBytes = nByte; |
+  /* Map utf-16 indexes back to byte offsets in the original utf-8 input */ |
+  *piStartOffset = pCsr->aOffset[iStart]; |
+  *piEndOffset = pCsr->aOffset[iEnd]; |
+  *piPosition = pCsr->iToken++; |
+ |
+  return SQLITE_OK; |
+} |
+ |
+/* |
+** The set of routines that implement the ICU tokenizer |
+*/ |
+static const sqlite3_tokenizer_module icuTokenizerModule = { |
+ 0, /* iVersion */ |
+ icuCreate, /* xCreate */ |
+ icuDestroy, /* xDestroy */ |
+ icuOpen, /* xOpen */ |
+ icuClose, /* xClose */ |
+ icuNext, /* xNext */ |
+}; |
+ |
+/* |
+** Hand back, through *ppModule, a pointer to the method table that |
+** implements the ICU tokenizer. |
+*/ |
+void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const **ppModule){ |
+  *ppModule = &icuTokenizerModule; |
+} |
+ |
+#endif /* defined(SQLITE_ENABLE_ICU) */ |
+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ |