Index: third_party/sqlite/sqlite-src-3070603/ext/fts3/fts3_tokenizer1.c |
diff --git a/third_party/sqlite/sqlite-src-3070603/ext/fts3/fts3_tokenizer1.c b/third_party/sqlite/sqlite-src-3070603/ext/fts3/fts3_tokenizer1.c |
new file mode 100644 |
index 0000000000000000000000000000000000000000..432c35d1a2518065965ec918afac208c4cd776dd |
--- /dev/null |
+++ b/third_party/sqlite/sqlite-src-3070603/ext/fts3/fts3_tokenizer1.c |
@@ -0,0 +1,234 @@ |
+/* |
+** 2006 Oct 10 |
+** |
+** The author disclaims copyright to this source code. In place of |
+** a legal notice, here is a blessing: |
+** |
+** May you do good and not evil. |
+** May you find forgiveness for yourself and forgive others. |
+** May you share freely, never taking more than you give. |
+** |
+****************************************************************************** |
+** |
+** Implementation of the "simple" full-text-search tokenizer. |
+*/ |
+ |
+/* |
+** The code in this file is only compiled if: |
+** |
+** * The FTS3 module is being built as an extension |
+** (in which case SQLITE_CORE is not defined), or |
+** |
+** * The FTS3 module is being built into the core of |
+** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). |
+*/ |
+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) |
+ |
+#include "fts3Int.h" |
+ |
+#include <assert.h> |
+#include <stdlib.h> |
+#include <stdio.h> |
+#include <string.h> |
+ |
+#include "fts3_tokenizer.h" |
+ |
+typedef struct simple_tokenizer { /* One instance of the "simple" tokenizer */ |
+ sqlite3_tokenizer base; /* Must stay first: simpleCreate hands out &base and simpleDestroy frees that pointer */ |
+ char delim[128]; /* delim[c] is nonzero when ASCII byte c is a token delimiter */ |
+} simple_tokenizer; |
+ |
+typedef struct simple_tokenizer_cursor { /* Tokenization state for one input string */ |
+ sqlite3_tokenizer_cursor base; /* Must stay first: cursors are cast to and from this type */ |
+ const char *pInput; /* input we are tokenizing (borrowed, not copied) */ |
+ int nBytes; /* size of the input in bytes */ |
+ int iOffset; /* byte offset in pInput of the next unread byte */ |
+ int iToken; /* index of next token to be returned, starting at 0 */ |
+ char *pToken; /* buffer holding the lower-cased copy of the current token */ |
+ int nTokenAllocated; /* bytes allocated to the pToken buffer */ |
+} simple_tokenizer_cursor; |
+ |
+ |
+static int simpleDelim(simple_tokenizer *t, unsigned char c){ /* Return 1 if byte c is a delimiter for tokenizer t, else 0 */ |
+ return c<0x80 && t->delim[c]; /* bytes >=0x80 are never delimiters; the range test also keeps the index inside delim[128] */ |
+} |
+static int fts3_isalnum(int x){ /* Return 1 if x is in '0'-'9', 'A'-'Z' or 'a'-'z', else 0 */ |
+ return (x>='0' && x<='9') || (x>='A' && x<='Z') || (x>='a' && x<='z'); /* plain range tests; no <ctype.h> or locale lookup */ |
+} |
+ |
+/* |
+** Create a new tokenizer instance and return it in *ppTokenizer. |
+*/ |
+static int simpleCreate( |
+ int argc, const char * const *argv, |
+ sqlite3_tokenizer **ppTokenizer |
+){ |
+ simple_tokenizer *t; |
+ |
+ t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t)); |
+ if( t==NULL ) return SQLITE_NOMEM; |
+ memset(t, 0, sizeof(*t)); /* start with no delimiters flagged */ |
+ |
+ /* TODO(shess) Delimiters need to remain the same from run to run, |
+ ** else we need to reindex. One solution would be a meta-table to |
+ ** track such information in the database, then we'd only want this |
+ ** information on the initial create. |
+ */ |
+ if( argc>1 ){ /* explicit delimiter set: each byte of argv[1] becomes a delimiter; argv[0] is not examined */ |
+ int i, n = (int)strlen(argv[1]); |
+ for(i=0; i<n; i++){ |
+ unsigned char ch = argv[1][i]; |
+ /* We explicitly don't support UTF-8 delimiters for now: any byte >=0x80 fails with SQLITE_ERROR. */ |
+ if( ch>=0x80 ){ |
+ sqlite3_free(t); |
+ return SQLITE_ERROR; |
+ } |
+ t->delim[ch] = 1; |
+ } |
+ } else { |
+ /* Default: mark non-alphanumeric ASCII bytes 1..0x7f as delimiters (index 0 stays 0 from the memset) */ |
+ int i; |
+ for(i=1; i<0x80; i++){ |
+ t->delim[i] = !fts3_isalnum(i) ? -1 : 0; /* simpleDelim only tests for nonzero, so -1 acts like the 1 used above */ |
+ } |
+ } |
+ |
+ *ppTokenizer = &t->base; |
+ return SQLITE_OK; |
+} |
+ |
+/* |
+** Destroy a tokenizer created by simpleCreate(). Always returns SQLITE_OK. |
+*/ |
+static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ |
+ sqlite3_free(pTokenizer); /* simpleCreate returned &t->base, the first member, so this frees the whole simple_tokenizer */ |
+ return SQLITE_OK; |
+} |
+ |
+/* |
+** Prepare to begin tokenizing a particular string. The input string to be |
+** tokenized is pInput[0..nBytes-1]; a NULL pInput is treated as empty and a |
+** negative nBytes means the length is taken with strlen(). The input is not |
+** copied. The new cursor is returned in *ppCursor; returns SQLITE_OK or SQLITE_NOMEM. |
+*/ |
+static int simpleOpen( |
+ sqlite3_tokenizer *pTokenizer, /* The tokenizer */ |
+ const char *pInput, int nBytes, /* String to be tokenized */ |
+ sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ |
+){ |
+ simple_tokenizer_cursor *c; |
+ |
+ UNUSED_PARAMETER(pTokenizer); |
+ |
+ c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); |
+ if( c==NULL ) return SQLITE_NOMEM; |
+ |
+ c->pInput = pInput; /* borrowed pointer: simpleNext reads through it later */ |
+ if( pInput==0 ){ |
+ c->nBytes = 0; |
+ }else if( nBytes<0 ){ |
+ c->nBytes = (int)strlen(pInput); |
+ }else{ |
+ c->nBytes = nBytes; |
+ } |
+ c->iOffset = 0; /* start tokenizing at the beginning */ |
+ c->iToken = 0; /* first token returned gets position 0 */ |
+ c->pToken = NULL; /* no space allocated, yet. */ |
+ c->nTokenAllocated = 0; |
+ |
+ *ppCursor = &c->base; /* NOTE(review): c->base is not initialized here, yet simpleNext reads base.pTokenizer; presumably the caller sets it - confirm */ |
+ return SQLITE_OK; |
+} |
+ |
+/* |
+** Close a tokenization cursor previously opened by a call to simpleOpen() |
+** above: release the token buffer and the cursor itself. Always returns SQLITE_OK. |
+*/ |
+static int simpleClose(sqlite3_tokenizer_cursor *pCursor){ |
+ simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; |
+ sqlite3_free(c->pToken); /* pToken is NULL if no token was ever produced */ |
+ sqlite3_free(c); |
+ return SQLITE_OK; |
+} |
+ |
+/* |
+** Extract the next token from a cursor opened by simpleOpen(). Returns SQLITE_OK |
+** with the OUT parameters filled in, SQLITE_DONE at end of input, or SQLITE_NOMEM. |
+** *ppToken is a cursor-owned, ASCII-lower-cased copy with no NUL terminator. */ |
+static int simpleNext( |
+  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */ |
+  const char **ppToken,               /* OUT: *ppToken is the token text */ |
+  int *pnBytes,                       /* OUT: Number of bytes in token */ |
+  int *piStartOffset,                 /* OUT: Offset of the token's first byte */ |
+  int *piEndOffset,                   /* OUT: Offset one past the token's last byte */ |
+  int *piPosition                     /* OUT: 0-based ordinal of this token */ |
+){ |
+  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; |
+  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; |
+  const unsigned char *p = (const unsigned char *)c->pInput; |
+ |
+  while( c->iOffset<c->nBytes ){ |
+    int iStartOffset; |
+ |
+    /* Scan past delimiter characters */ |
+    while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){ |
+      c->iOffset++; |
+    } |
+ |
+    /* Count non-delimiter characters. */ |
+    iStartOffset = c->iOffset; |
+    while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){ |
+      c->iOffset++; |
+    } |
+ |
+    if( c->iOffset>iStartOffset ){ |
+      int i, n = c->iOffset-iStartOffset; |
+      if( n>c->nTokenAllocated ){ |
+        int nNew = n+20;  /* grow with some slack to limit reallocations */ |
+        char *pNew = sqlite3_realloc(c->pToken, nNew); |
+        if( !pNew ) return SQLITE_NOMEM;  /* pToken and nTokenAllocated still describe the old buffer */ |
+        c->pToken = pNew; |
+        c->nTokenAllocated = nNew;  /* commit the size only after the allocation succeeded */ |
+      } |
+      for(i=0; i<n; i++){ |
+        /* TODO(shess) This needs expansion to handle UTF-8 |
+        ** case-insensitivity. |
+        */ |
+        unsigned char ch = p[iStartOffset+i]; |
+        c->pToken[i] = (char)((ch>='A' && ch<='Z') ? ch-'A'+'a' : ch); |
+      } |
+      *ppToken = c->pToken; |
+      *pnBytes = n; |
+      *piStartOffset = iStartOffset; |
+      *piEndOffset = c->iOffset; |
+      *piPosition = c->iToken++; |
+ |
+      return SQLITE_OK; |
+    } |
+  } |
+  return SQLITE_DONE; |
+} |
+ |
+/* |
+** The set of routines that implement the simple tokenizer, in the order declared by sqlite3_tokenizer_module |
+*/ |
+static const sqlite3_tokenizer_module simpleTokenizerModule = { |
+ 0, /* NOTE(review): presumably the interface version field - confirm against fts3_tokenizer.h */ |
+ simpleCreate, |
+ simpleDestroy, |
+ simpleOpen, |
+ simpleClose, |
+ simpleNext, |
+}; |
+ |
+/* |
+** Return a pointer to the simple tokenizer's method table in *ppModule. |
+** Nothing is allocated: the table is the static simpleTokenizerModule object. |
+*/ |
+void sqlite3Fts3SimpleTokenizerModule( |
+ sqlite3_tokenizer_module const**ppModule |
+){ |
+ *ppModule = &simpleTokenizerModule; |
+} |
+ |
+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ |