OLD | NEW |
1 /* | 1 /* |
2 ** 2006 Oct 10 | 2 ** 2006 Oct 10 |
3 ** | 3 ** |
4 ** The author disclaims copyright to this source code. In place of | 4 ** The author disclaims copyright to this source code. In place of |
5 ** a legal notice, here is a blessing: | 5 ** a legal notice, here is a blessing: |
6 ** | 6 ** |
7 ** May you do good and not evil. | 7 ** May you do good and not evil. |
8 ** May you find forgiveness for yourself and forgive others. | 8 ** May you find forgiveness for yourself and forgive others. |
9 ** May you share freely, never taking more than you give. | 9 ** May you share freely, never taking more than you give. |
10 ** | 10 ** |
11 ****************************************************************************** | 11 ****************************************************************************** |
12 ** | 12 ** |
13 ** Implementation of the "simple" full-text-search tokenizer. | 13 ** Implementation of the "simple" full-text-search tokenizer. |
14 */ | 14 */ |
15 | 15 |
16 /* | 16 /* |
17 ** The code in this file is only compiled if: | 17 ** The code in this file is only compiled if: |
18 ** | 18 ** |
19 ** * The FTS3 module is being built as an extension | 19 ** * The FTS3 module is being built as an extension |
20 ** (in which case SQLITE_CORE is not defined), or | 20 ** (in which case SQLITE_CORE is not defined), or |
21 ** | 21 ** |
22 ** * The FTS3 module is being built into the core of | 22 ** * The FTS3 module is being built into the core of |
23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). | 23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). |
24 */ | 24 */ |
25 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) | 25 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) |
26 | 26 |
| 27 #include "fts3Int.h" |
27 | 28 |
28 #include <assert.h> | 29 #include <assert.h> |
29 #include <stdlib.h> | 30 #include <stdlib.h> |
30 #include <stdio.h> | 31 #include <stdio.h> |
31 #include <string.h> | 32 #include <string.h> |
32 | 33 |
33 #include "fts3_tokenizer.h" | 34 #include "fts3_tokenizer.h" |
34 | 35 |
/*
** Tokenizer instance for the "simple" tokenizer: the generic
** sqlite3_tokenizer object followed by a table of delimiter flags, one
** entry per ASCII code point (0x00..0x7f).
*/
typedef struct simple_tokenizer {
  sqlite3_tokenizer base;      /* Must stay first: &base is handed to callers
                               ** and later passed straight to sqlite3_free() */
  char delim[128];             /* flag ASCII delimiters */
} simple_tokenizer;
39 | 40 |
/*
** Cursor used to step through the tokens of a single input buffer.
*/
typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  int iOffset;                 /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *pToken;                /* storage for current token */
  int nTokenAllocated;         /* space allocated to pToken buffer */
} simple_tokenizer_cursor;
49 | 50 |
50 | 51 |
51 /* Forward declaration */ | |
52 static const sqlite3_tokenizer_module simpleTokenizerModule; | |
53 | |
54 static int simpleDelim(simple_tokenizer *t, unsigned char c){ | 52 static int simpleDelim(simple_tokenizer *t, unsigned char c){ |
55 return c<0x80 && t->delim[c]; | 53 return c<0x80 && t->delim[c]; |
56 } | 54 } |
/*
** Return 1 if x is an ASCII decimal digit or an ASCII letter of either
** case, and 0 for every other value. Only these fixed ranges are tested.
*/
static int fts3_isalnum(int x){
  if( x>='0' && x<='9' ) return 1;
  if( x>='A' && x<='Z' ) return 1;
  if( x>='a' && x<='z' ) return 1;
  return 0;
}
60 | 58 |
61 /* | 59 /* |
62 ** Create a new tokenizer instance. | 60 ** Create a new tokenizer instance. |
63 */ | 61 */ |
64 static int simpleCreate( | 62 static int simpleCreate( |
65 int argc, const char * const *argv, | 63 int argc, const char * const *argv, |
66 sqlite3_tokenizer **ppTokenizer | 64 sqlite3_tokenizer **ppTokenizer |
67 ){ | 65 ){ |
68 simple_tokenizer *t; | 66 simple_tokenizer *t; |
69 | 67 |
70 t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t)); | 68 t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t)); |
71 if( t==NULL ) return SQLITE_NOMEM; | 69 if( t==NULL ) return SQLITE_NOMEM; |
72 memset(t, 0, sizeof(*t)); | 70 memset(t, 0, sizeof(*t)); |
73 | 71 |
74 /* TODO(shess) Delimiters need to remain the same from run to run, | 72 /* TODO(shess) Delimiters need to remain the same from run to run, |
75 ** else we need to reindex. One solution would be a meta-table to | 73 ** else we need to reindex. One solution would be a meta-table to |
76 ** track such information in the database, then we'd only want this | 74 ** track such information in the database, then we'd only want this |
77 ** information on the initial create. | 75 ** information on the initial create. |
78 */ | 76 */ |
79 if( argc>1 ){ | 77 if( argc>1 ){ |
80 int i, n = strlen(argv[1]); | 78 int i, n = (int)strlen(argv[1]); |
81 for(i=0; i<n; i++){ | 79 for(i=0; i<n; i++){ |
82 unsigned char ch = argv[1][i]; | 80 unsigned char ch = argv[1][i]; |
83 /* We explicitly don't support UTF-8 delimiters for now. */ | 81 /* We explicitly don't support UTF-8 delimiters for now. */ |
84 if( ch>=0x80 ){ | 82 if( ch>=0x80 ){ |
85 sqlite3_free(t); | 83 sqlite3_free(t); |
86 return SQLITE_ERROR; | 84 return SQLITE_ERROR; |
87 } | 85 } |
88 t->delim[ch] = 1; | 86 t->delim[ch] = 1; |
89 } | 87 } |
90 } else { | 88 } else { |
91 /* Mark non-alphanumeric ASCII characters as delimiters */ | 89 /* Mark non-alphanumeric ASCII characters as delimiters */ |
92 int i; | 90 int i; |
93 for(i=1; i<0x80; i++){ | 91 for(i=1; i<0x80; i++){ |
94 t->delim[i] = !fts3_isalnum(i); | 92 t->delim[i] = !fts3_isalnum(i) ? -1 : 0; |
95 } | 93 } |
96 } | 94 } |
97 | 95 |
98 *ppTokenizer = &t->base; | 96 *ppTokenizer = &t->base; |
99 return SQLITE_OK; | 97 return SQLITE_OK; |
100 } | 98 } |
101 | 99 |
102 /* | 100 /* |
103 ** Destroy a tokenizer | 101 ** Destroy a tokenizer |
104 */ | 102 */ |
105 static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ | 103 static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ |
106 sqlite3_free(pTokenizer); | 104 sqlite3_free(pTokenizer); |
107 return SQLITE_OK; | 105 return SQLITE_OK; |
108 } | 106 } |
109 | 107 |
110 /* | 108 /* |
111 ** Prepare to begin tokenizing a particular string. The input | 109 ** Prepare to begin tokenizing a particular string. The input |
112 ** string to be tokenized is pInput[0..nBytes-1]. A cursor | 110 ** string to be tokenized is pInput[0..nBytes-1]. A cursor |
113 ** used to incrementally tokenize this string is returned in | 111 ** used to incrementally tokenize this string is returned in |
114 ** *ppCursor. | 112 ** *ppCursor. |
115 */ | 113 */ |
116 static int simpleOpen( | 114 static int simpleOpen( |
117 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | 115 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ |
118 const char *pInput, int nBytes, /* String to be tokenized */ | 116 const char *pInput, int nBytes, /* String to be tokenized */ |
119 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | 117 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ |
120 ){ | 118 ){ |
121 simple_tokenizer_cursor *c; | 119 simple_tokenizer_cursor *c; |
122 | 120 |
| 121 UNUSED_PARAMETER(pTokenizer); |
| 122 |
123 c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); | 123 c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); |
124 if( c==NULL ) return SQLITE_NOMEM; | 124 if( c==NULL ) return SQLITE_NOMEM; |
125 | 125 |
126 c->pInput = pInput; | 126 c->pInput = pInput; |
127 if( pInput==0 ){ | 127 if( pInput==0 ){ |
128 c->nBytes = 0; | 128 c->nBytes = 0; |
129 }else if( nBytes<0 ){ | 129 }else if( nBytes<0 ){ |
130 c->nBytes = (int)strlen(pInput); | 130 c->nBytes = (int)strlen(pInput); |
131 }else{ | 131 }else{ |
132 c->nBytes = nBytes; | 132 c->nBytes = nBytes; |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
177 | 177 |
178 /* Count non-delimiter characters. */ | 178 /* Count non-delimiter characters. */ |
179 iStartOffset = c->iOffset; | 179 iStartOffset = c->iOffset; |
180 while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){ | 180 while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){ |
181 c->iOffset++; | 181 c->iOffset++; |
182 } | 182 } |
183 | 183 |
184 if( c->iOffset>iStartOffset ){ | 184 if( c->iOffset>iStartOffset ){ |
185 int i, n = c->iOffset-iStartOffset; | 185 int i, n = c->iOffset-iStartOffset; |
186 if( n>c->nTokenAllocated ){ | 186 if( n>c->nTokenAllocated ){ |
| 187 char *pNew; |
187 c->nTokenAllocated = n+20; | 188 c->nTokenAllocated = n+20; |
188 c->pToken = sqlite3_realloc(c->pToken, c->nTokenAllocated); | 189 pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated); |
189 if( c->pToken==NULL ) return SQLITE_NOMEM; | 190 if( !pNew ) return SQLITE_NOMEM; |
| 191 c->pToken = pNew; |
190 } | 192 } |
191 for(i=0; i<n; i++){ | 193 for(i=0; i<n; i++){ |
192 /* TODO(shess) This needs expansion to handle UTF-8 | 194 /* TODO(shess) This needs expansion to handle UTF-8 |
193 ** case-insensitivity. | 195 ** case-insensitivity. |
194 */ | 196 */ |
195 unsigned char ch = p[iStartOffset+i]; | 197 unsigned char ch = p[iStartOffset+i]; |
196 c->pToken[i] = (ch>='A' && ch<='Z') ? ch-'A'+'a' : ch; | 198 c->pToken[i] = (char)((ch>='A' && ch<='Z') ? ch-'A'+'a' : ch); |
197 } | 199 } |
198 *ppToken = c->pToken; | 200 *ppToken = c->pToken; |
199 *pnBytes = n; | 201 *pnBytes = n; |
200 *piStartOffset = iStartOffset; | 202 *piStartOffset = iStartOffset; |
201 *piEndOffset = c->iOffset; | 203 *piEndOffset = c->iOffset; |
202 *piPosition = c->iToken++; | 204 *piPosition = c->iToken++; |
203 | 205 |
204 return SQLITE_OK; | 206 return SQLITE_OK; |
205 } | 207 } |
206 } | 208 } |
(...skipping 16 matching lines...) Expand all Loading... |
223 ** Allocate a new simple tokenizer. Return a pointer to the new | 225 ** Allocate a new simple tokenizer. Return a pointer to the new |
224 ** tokenizer in *ppModule | 226 ** tokenizer in *ppModule |
225 */ | 227 */ |
226 void sqlite3Fts3SimpleTokenizerModule( | 228 void sqlite3Fts3SimpleTokenizerModule( |
227 sqlite3_tokenizer_module const**ppModule | 229 sqlite3_tokenizer_module const**ppModule |
228 ){ | 230 ){ |
229 *ppModule = &simpleTokenizerModule; | 231 *ppModule = &simpleTokenizerModule; |
230 } | 232 } |
231 | 233 |
232 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | 234 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ |
OLD | NEW |