OLD | NEW |
| (Empty) |
1 /* The author disclaims copyright to this source code. | |
2 * | |
3 * This is an SQLite module implementing full-text search. | |
4 */ | |
5 | |
6 #include <assert.h> | |
7 #if !defined(__APPLE__) | |
8 #include <malloc.h> | |
9 #else | |
10 #include <stdlib.h> | |
11 #endif | |
12 #include <stdio.h> | |
13 #include <string.h> | |
14 #include <ctype.h> | |
15 | |
16 #include "fulltext.h" | |
17 #include "ft_hash.h" | |
18 #include "tokenizer.h" | |
19 #include "sqlite3.h" | |
20 #include "sqlite3ext.h" | |
21 SQLITE_EXTENSION_INIT1 | |
22 | |
23 /* utility functions */ | |
24 | |
25 /* We encode variable-length integers in little-endian order using seven bits | |
26 * per byte as follows: | |
27 ** | |
28 ** KEY: | |
29 ** A = 0xxxxxxx 7 bits of data and one flag bit | |
30 ** B = 1xxxxxxx 7 bits of data and one flag bit | |
31 ** | |
32 ** 7 bits - A | |
33 ** 14 bits - BA | |
34 ** 21 bits - BBA | |
35 ** and so on. | |
36 */ | |
37 | |
38 /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ | |
39 #define VARINT_MAX 10 | |
40 | |
41 /* Write a 64-bit variable-length integer to memory starting at p[0]. | |
42 * The length of data written will be between 1 and VARINT_MAX bytes. | |
43 * The number of bytes written is returned. */ | |
44 static int putVarint(char *p, sqlite_int64 v){ | |
45 unsigned char *q = (unsigned char *) p; | |
46 sqlite_uint64 vu = v; | |
47 do{ | |
48 *q++ = (unsigned char) ((vu & 0x7f) | 0x80); | |
49 vu >>= 7; | |
50 }while( vu!=0 ); | |
51 q[-1] &= 0x7f; /* turn off high bit in final byte */ | |
52 assert( q - (unsigned char *)p <= VARINT_MAX ); | |
53 return (int) (q - (unsigned char *)p); | |
54 } | |
55 | |
56 /* Read a 64-bit variable-length integer from memory starting at p[0]. | |
57 * Return the number of bytes read, or 0 on error. | |
58 * The value is stored in *v. */ | |
59 static int getVarint(const char *p, sqlite_int64 *v){ | |
60 const unsigned char *q = (const unsigned char *) p; | |
61 sqlite_uint64 x = 0, y = 1; | |
62 while( (*q & 0x80) == 0x80 ){ | |
63 x += y * (*q++ & 0x7f); | |
64 y <<= 7; | |
65 if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ | |
66 assert( 0 ); | |
67 return 0; | |
68 } | |
69 } | |
70 x += y * (*q++); | |
71 *v = (sqlite_int64) x; | |
72 return (int) (q - (unsigned char *)p); | |
73 } | |
74 | |
75 static int getVarint32(const char *p, int *pi){ | |
76 sqlite_int64 i; | |
77 int ret = getVarint(p, &i); | |
78 *pi = (int) i; | |
79 assert( *pi==i ); | |
80 return ret; | |
81 } | |
82 | |
83 /*** Document lists *** | |
84 * | |
85 * A document list holds a sorted list of varint-encoded document IDs. | |
86 * | |
87 * A doclist with type DL_POSITIONS_OFFSETS is stored like this: | |
88 * | |
89 * array { | |
90 * varint docid; | |
91 * array { | |
92 * varint position; (delta from previous position plus 1, or 0 for end) | |
93 * varint startOffset; (delta from previous startOffset) | |
94 * varint endOffset; (delta from startOffset) | |
95 * } | |
96 * } | |
97 * | |
98 * Here, array { X } means zero or more occurrences of X, adjacent in memory. | |
99 * | |
100 * A doclist with type DL_POSITIONS is like the above, but holds only docids | |
101 * and positions without offset information. | |
102 * | |
103 * A doclist with type DL_DOCIDS is like the above, but holds only docids | |
104 * without positions or offset information. | |
105 * | |
106 * On disk, every document list has positions and offsets, so we don't bother | |
107 * to serialize a doclist's type. | |
108 * | |
109 * We don't yet delta-encode document IDs; doing so will probably be a | |
110 * modest win. | |
111 * | |
112 * NOTE(shess) I've thought of a slightly (1%) better offset encoding. | |
113 * After the first offset, estimate the next offset by using the | |
114 * current token position and the previous token position and offset, | |
115 * offset to handle some variance. So the estimate would be | |
116 * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded | |
117 * as normal. Offsets more than 64 chars from the estimate are | |
118 * encoded as the delta to the previous start offset + 128. An | |
119 * additional tiny increment can be gained by using the end offset of | |
120 * the previous token to make the estimate a tiny bit more precise. | |
121 */ | |
122 | |
123 typedef enum DocListType { | |
124 DL_DOCIDS, /* docids only */ | |
125 DL_POSITIONS, /* docids + positions */ | |
126 DL_POSITIONS_OFFSETS /* docids + positions + offsets */ | |
127 } DocListType; | |
128 | |
129 typedef struct DocList { | |
130 char *pData; | |
131 int nData; | |
132 DocListType iType; | |
133 int iLastPos; /* the last position written */ | |
134 int iLastOffset; /* the last start offset written */ | |
135 } DocList; | |
136 | |
137 /* Initialize a new DocList to hold the given data. */ | |
138 static void docListInit(DocList *d, DocListType iType, | |
139 const char *pData, int nData){ | |
140 d->nData = nData; | |
141 if( nData>0 ){ | |
142 d->pData = malloc(nData); | |
143 memcpy(d->pData, pData, nData); | |
144 } else { | |
145 d->pData = NULL; | |
146 } | |
147 d->iType = iType; | |
148 d->iLastPos = 0; | |
149 d->iLastOffset = 0; | |
150 } | |
151 | |
152 /* Create a new dynamically-allocated DocList. */ | |
153 static DocList *docListNew(DocListType iType){ | |
154 DocList *d = (DocList *) malloc(sizeof(DocList)); | |
155 docListInit(d, iType, 0, 0); | |
156 return d; | |
157 } | |
158 | |
159 static void docListDestroy(DocList *d){ | |
160 free(d->pData); | |
161 #ifndef NDEBUG | |
162 memset(d, 0x55, sizeof(*d)); | |
163 #endif | |
164 } | |
165 | |
166 static void docListDelete(DocList *d){ | |
167 docListDestroy(d); | |
168 free(d); | |
169 } | |
170 | |
171 static char *docListEnd(DocList *d){ | |
172 return d->pData + d->nData; | |
173 } | |
174 | |
175 /* Append a varint to a DocList's data. */ | |
176 static void appendVarint(DocList *d, sqlite_int64 i){ | |
177 char c[VARINT_MAX]; | |
178 int n = putVarint(c, i); | |
179 d->pData = realloc(d->pData, d->nData + n); | |
180 memcpy(d->pData + d->nData, c, n); | |
181 d->nData += n; | |
182 } | |
183 | |
184 static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ | |
185 appendVarint(d, iDocid); | |
186 d->iLastPos = 0; | |
187 } | |
188 | |
189 /* Add a position to the last position list in a doclist. */ | |
190 static void docListAddPos(DocList *d, int iPos){ | |
191 assert( d->iType>=DL_POSITIONS ); | |
192 appendVarint(d, iPos-d->iLastPos+1); | |
193 d->iLastPos = iPos; | |
194 } | |
195 | |
196 static void docListAddPosOffset(DocList *d, int iPos, | |
197 int iStartOffset, int iEndOffset){ | |
198 assert( d->iType==DL_POSITIONS_OFFSETS ); | |
199 docListAddPos(d, iPos); | |
200 appendVarint(d, iStartOffset-d->iLastOffset); | |
201 d->iLastOffset = iStartOffset; | |
202 appendVarint(d, iEndOffset-iStartOffset); | |
203 } | |
204 | |
205 /* Terminate the last position list in the given doclist. */ | |
206 static void docListAddEndPos(DocList *d){ | |
207 appendVarint(d, 0); | |
208 } | |
209 | |
210 typedef struct DocListReader { | |
211 DocList *pDoclist; | |
212 char *p; | |
213 int iLastPos; /* the last position read */ | |
214 } DocListReader; | |
215 | |
216 static void readerInit(DocListReader *r, DocList *pDoclist){ | |
217 r->pDoclist = pDoclist; | |
218 if( pDoclist!=NULL ){ | |
219 r->p = pDoclist->pData; | |
220 } | |
221 r->iLastPos = 0; | |
222 } | |
223 | |
224 static int readerAtEnd(DocListReader *pReader){ | |
225 return pReader->p >= docListEnd(pReader->pDoclist); | |
226 } | |
227 | |
228 /* Peek at the next docid without advancing the read pointer. */ | |
229 static sqlite_int64 peekDocid(DocListReader *pReader){ | |
230 sqlite_int64 ret; | |
231 assert( !readerAtEnd(pReader) ); | |
232 getVarint(pReader->p, &ret); | |
233 return ret; | |
234 } | |
235 | |
236 /* Read the next docid. */ | |
237 static sqlite_int64 readDocid(DocListReader *pReader){ | |
238 sqlite_int64 ret; | |
239 assert( !readerAtEnd(pReader) ); | |
240 pReader->p += getVarint(pReader->p, &ret); | |
241 pReader->iLastPos = 0; | |
242 return ret; | |
243 } | |
244 | |
245 /* Read the next position from a position list. | |
246 * Returns the position, or -1 at the end of the list. */ | |
247 static int readPosition(DocListReader *pReader){ | |
248 int i; | |
249 int iType = pReader->pDoclist->iType; | |
250 assert( iType>=DL_POSITIONS ); | |
251 assert( !readerAtEnd(pReader) ); | |
252 | |
253 pReader->p += getVarint32(pReader->p, &i); | |
254 if( i==0 ){ | |
255 pReader->iLastPos = -1; | |
256 return -1; | |
257 } | |
258 pReader->iLastPos += ((int) i)-1; | |
259 if( iType>=DL_POSITIONS_OFFSETS ){ | |
260 /* Skip over offsets, ignoring them for now. */ | |
261 int iStart, iEnd; | |
262 pReader->p += getVarint32(pReader->p, &iStart); | |
263 pReader->p += getVarint32(pReader->p, &iEnd); | |
264 } | |
265 return pReader->iLastPos; | |
266 } | |
267 | |
268 /* Skip past the end of a position list. */ | |
269 static void skipPositionList(DocListReader *pReader){ | |
270 while( readPosition(pReader)!=-1 ) | |
271 ; | |
272 } | |
273 | |
274 /* Skip over a docid, including its position list if the doclist has | |
275 * positions. */ | |
276 static void skipDocument(DocListReader *pReader){ | |
277 readDocid(pReader); | |
278 if( pReader->pDoclist->iType >= DL_POSITIONS ){ | |
279 skipPositionList(pReader); | |
280 } | |
281 } | |
282 | |
283 static sqlite_int64 firstDocid(DocList *d){ | |
284 DocListReader r; | |
285 readerInit(&r, d); | |
286 return readDocid(&r); | |
287 } | |
288 | |
289 /* Doclist multi-tool. Pass pUpdate==NULL to delete the indicated docid; | |
290 * otherwise pUpdate, which must contain only the single docid [iDocid], is | |
291 * inserted (if not present) or updated (if already present). */ | |
292 static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){ | |
293 int modified = 0; | |
294 DocListReader reader; | |
295 char *p; | |
296 | |
297 if( pUpdate!=NULL ){ | |
298 assert( d->iType==pUpdate->iType); | |
299 assert( iDocid==firstDocid(pUpdate) ); | |
300 } | |
301 | |
302 readerInit(&reader, d); | |
303 while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){ | |
304 skipDocument(&reader); | |
305 } | |
306 | |
307 p = reader.p; | |
308 /* Delete if there is a matching element. */ | |
309 if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){ | |
310 skipDocument(&reader); | |
311 memmove(p, reader.p, docListEnd(d) - reader.p); | |
312 d->nData -= (reader.p - p); | |
313 modified = 1; | |
314 } | |
315 | |
316 /* Insert if indicated. */ | |
317 if( pUpdate!=NULL ){ | |
318 int iDoclist = p-d->pData; | |
319 docListAddEndPos(pUpdate); | |
320 | |
321 d->pData = realloc(d->pData, d->nData+pUpdate->nData); | |
322 p = d->pData + iDoclist; | |
323 | |
324 memmove(p+pUpdate->nData, p, docListEnd(d) - p); | |
325 memcpy(p, pUpdate->pData, pUpdate->nData); | |
326 d->nData += pUpdate->nData; | |
327 modified = 1; | |
328 } | |
329 | |
330 return modified; | |
331 } | |
332 | |
333 /* Split the second half of doclist d into a separate doclist d2. Returns 1 | |
334 * if successful, or 0 if d contains a single document and hence can't be | |
335 * split. */ | |
336 static int docListSplit(DocList *d, DocList *d2){ | |
337 const char *pSplitPoint = d->pData + d->nData / 2; | |
338 DocListReader reader; | |
339 | |
340 readerInit(&reader, d); | |
341 while( reader.p<pSplitPoint ){ | |
342 skipDocument(&reader); | |
343 } | |
344 if( readerAtEnd(&reader) ) return 0; | |
345 docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p); | |
346 d->nData = reader.p - d->pData; | |
347 d->pData = realloc(d->pData, d->nData); | |
348 return 1; | |
349 } | |
350 | |
351 /* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked | |
352 * on-disk doclist, resulting in another in-memory DocList [out]. [in] | |
353 * and [out] may or may not store position information according to the | |
354 * caller's wishes. The on-disk doclist always comes with positions. | |
355 * | |
356 * The caller must read each chunk of the on-disk doclist in succession and | |
357 * pass it to mergeBlock(). | |
358 * | |
359 * If [in] has positions, then the merge output contains only documents with | |
360 * matching positions in the two input doclists. If [in] does not have | |
361 * positions, then the merge output contains all documents common to the two | |
362 * input doclists. | |
363 * | |
364 * If [in] is NULL, then the on-disk doclist is copied to [out] directly. | |
365 * | |
366 * A merge is performed using an integer [iOffset] provided by the caller. | |
367 * [iOffset] is subtracted from each position in the on-disk doclist for the | |
368 * purpose of position comparison; this is helpful in implementing phrase | |
369 * searches. | |
370 * | |
371 * A DocListMerge is not yet able to propagate offsets through query | |
372 * processing; we should add that capability soon. | |
373 */ | |
374 typedef struct DocListMerge { | |
375 DocListReader in; | |
376 DocList *pOut; | |
377 int iOffset; | |
378 } DocListMerge; | |
379 | |
380 static void mergeInit(DocListMerge *m, | |
381 DocList *pIn, int iOffset, DocList *pOut){ | |
382 readerInit(&m->in, pIn); | |
383 m->pOut = pOut; | |
384 m->iOffset = iOffset; | |
385 | |
386 /* can't handle offsets yet */ | |
387 assert( pIn==NULL || pIn->iType <= DL_POSITIONS ); | |
388 assert( pOut->iType <= DL_POSITIONS ); | |
389 } | |
390 | |
391 /* A helper function for mergeBlock(), below. Merge the position lists | |
392 * pointed to by m->in and pBlockReader. | |
393 * If the merge matches, write [iDocid] to m->pOut; if m->pOut | |
394 * has positions then write all matching positions as well. */ | |
395 static void mergePosList(DocListMerge *m, sqlite_int64 iDocid, | |
396 DocListReader *pBlockReader){ | |
397 int block_pos = readPosition(pBlockReader); | |
398 int in_pos = readPosition(&m->in); | |
399 int match = 0; | |
400 while( block_pos!=-1 || in_pos!=-1 ){ | |
401 if( block_pos-m->iOffset==in_pos ){ | |
402 if( !match ){ | |
403 docListAddDocid(m->pOut, iDocid); | |
404 match = 1; | |
405 } | |
406 if( m->pOut->iType >= DL_POSITIONS ){ | |
407 docListAddPos(m->pOut, in_pos); | |
408 } | |
409 block_pos = readPosition(pBlockReader); | |
410 in_pos = readPosition(&m->in); | |
411 } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){ | |
412 block_pos = readPosition(pBlockReader); | |
413 } else { | |
414 in_pos = readPosition(&m->in); | |
415 } | |
416 } | |
417 if( m->pOut->iType >= DL_POSITIONS && match ){ | |
418 docListAddEndPos(m->pOut); | |
419 } | |
420 } | |
421 | |
422 /* Merge one block of an on-disk doclist into a DocListMerge. */ | |
423 static void mergeBlock(DocListMerge *m, DocList *pBlock){ | |
424 DocListReader blockReader; | |
425 assert( pBlock->iType >= DL_POSITIONS ); | |
426 readerInit(&blockReader, pBlock); | |
427 while( !readerAtEnd(&blockReader) ){ | |
428 sqlite_int64 iDocid = readDocid(&blockReader); | |
429 if( m->in.pDoclist!=NULL ){ | |
430 while( 1 ){ | |
431 if( readerAtEnd(&m->in) ) return; /* nothing more to merge */ | |
432 if( peekDocid(&m->in)>=iDocid ) break; | |
433 skipDocument(&m->in); | |
434 } | |
435 if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */ | |
436 skipPositionList(&blockReader); /* skip this docid in the block */ | |
437 continue; | |
438 } | |
439 readDocid(&m->in); | |
440 } | |
441 /* We have a document match. */ | |
442 if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){ | |
443 /* We don't need to do a poslist merge. */ | |
444 docListAddDocid(m->pOut, iDocid); | |
445 if( m->pOut->iType >= DL_POSITIONS ){ | |
446 /* Copy all positions to the output doclist. */ | |
447 while( 1 ){ | |
448 int pos = readPosition(&blockReader); | |
449 if( pos==-1 ) break; | |
450 docListAddPos(m->pOut, pos); | |
451 } | |
452 docListAddEndPos(m->pOut); | |
453 } else skipPositionList(&blockReader); | |
454 continue; | |
455 } | |
456 mergePosList(m, iDocid, &blockReader); | |
457 } | |
458 } | |
459 | |
460 static char *string_dup_n(const char *s, int n){ | |
461 char *str = malloc(n + 1); | |
462 memcpy(str, s, n); | |
463 str[n] = '\0'; | |
464 return str; | |
465 } | |
466 | |
467 /* Duplicate a string; the caller must free() the returned string. | |
468 * (We don't use strdup() since it's not part of the standard C library and | |
469 * may not be available everywhere.) */ | |
470 static char *string_dup(const char *s){ | |
471 return string_dup_n(s, strlen(s)); | |
472 } | |
473 | |
474 /* Format a string, replacing each occurrence of the % character with | |
475 * zName. This may be more convenient than sqlite_mprintf() | |
476 * when one string is used repeatedly in a format string. | |
477 * The caller must free() the returned string. */ | |
478 static char *string_format(const char *zFormat, const char *zName){ | |
479 const char *p; | |
480 size_t len = 0; | |
481 size_t nName = strlen(zName); | |
482 char *result; | |
483 char *r; | |
484 | |
485 /* first compute length needed */ | |
486 for(p = zFormat ; *p ; ++p){ | |
487 len += (*p=='%' ? nName : 1); | |
488 } | |
489 len += 1; /* for null terminator */ | |
490 | |
491 r = result = malloc(len); | |
492 for(p = zFormat; *p; ++p){ | |
493 if( *p=='%' ){ | |
494 memcpy(r, zName, nName); | |
495 r += nName; | |
496 } else { | |
497 *r++ = *p; | |
498 } | |
499 } | |
500 *r++ = '\0'; | |
501 assert( r == result + len ); | |
502 return result; | |
503 } | |
504 | |
505 static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){ | |
506 char *zCommand = string_format(zFormat, zName); | |
507 int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); | |
508 free(zCommand); | |
509 return rc; | |
510 } | |
511 | |
512 static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt, | |
513 const char *zFormat){ | |
514 char *zCommand = string_format(zFormat, zName); | |
515 int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); | |
516 free(zCommand); | |
517 return rc; | |
518 } | |
519 | |
520 /* end utility functions */ | |
521 | |
522 #define QUERY_GENERIC 0 | |
523 #define QUERY_FULLTEXT 1 | |
524 | |
525 #define CHUNK_MAX 1024 | |
526 | |
527 typedef enum fulltext_statement { | |
528 CONTENT_INSERT_STMT, | |
529 CONTENT_SELECT_STMT, | |
530 CONTENT_DELETE_STMT, | |
531 | |
532 TERM_SELECT_STMT, | |
533 TERM_CHUNK_SELECT_STMT, | |
534 TERM_INSERT_STMT, | |
535 TERM_UPDATE_STMT, | |
536 TERM_DELETE_STMT, | |
537 | |
538 MAX_STMT /* Always at end! */ | |
539 } fulltext_statement; | |
540 | |
541 /* These must exactly match the enum above. */ | |
542 /* TODO(adam): Is there some risk that a statement (in particular, | |
543 ** pTermSelectStmt) will be used in two cursors at once, e.g. if a | |
544 ** query joins a virtual table to itself? If so perhaps we should | |
545 ** move some of these to the cursor object. | |
546 */ | |
547 static const char *fulltext_zStatement[MAX_STMT] = { | |
548 /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)", | |
549 /* CONTENT_SELECT */ "select content from %_content where rowid = ?", | |
550 /* CONTENT_DELETE */ "delete from %_content where rowid = ?", | |
551 | |
552 /* TERM_SELECT */ | |
553 "select rowid, doclist from %_term where term = ? and first = ?", | |
554 /* TERM_CHUNK_SELECT */ | |
555 "select max(first) from %_term where term = ? and first <= ?", | |
556 /* TERM_INSERT */ | |
557 "insert into %_term (term, first, doclist) values (?, ?, ?)", | |
558 /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?", | |
559 /* TERM_DELETE */ "delete from %_term where rowid = ?", | |
560 }; | |
561 | |
562 typedef struct fulltext_vtab { | |
563 sqlite3_vtab base; | |
564 sqlite3 *db; | |
565 const char *zName; /* virtual table name */ | |
566 sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ | |
567 | |
568 /* Precompiled statements which we keep as long as the table is | |
569 ** open. | |
570 */ | |
571 sqlite3_stmt *pFulltextStatements[MAX_STMT]; | |
572 } fulltext_vtab; | |
573 | |
574 typedef struct fulltext_cursor { | |
575 sqlite3_vtab_cursor base; | |
576 int iCursorType; /* QUERY_GENERIC or QUERY_FULLTEXT */ | |
577 | |
578 sqlite3_stmt *pStmt; | |
579 | |
580 int eof; | |
581 | |
582 /* The following is used only when iCursorType == QUERY_FULLTEXT. */ | |
583 DocListReader result; | |
584 } fulltext_cursor; | |
585 | |
586 static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ | |
587 return (fulltext_vtab *) c->base.pVtab; | |
588 } | |
589 | |
590 static sqlite3_module fulltextModule; /* forward declaration */ | |
591 | |
592 /* Puts a freshly-prepared statement determined by iStmt in *ppStmt. | |
593 ** If the indicated statement has never been prepared, it is prepared | |
594 ** and cached, otherwise the cached version is reset. | |
595 */ | |
596 static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, | |
597 sqlite3_stmt **ppStmt){ | |
598 assert( iStmt<MAX_STMT ); | |
599 if( v->pFulltextStatements[iStmt]==NULL ){ | |
600 int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt], | |
601 fulltext_zStatement[iStmt]); | |
602 if( rc!=SQLITE_OK ) return rc; | |
603 } else { | |
604 int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); | |
605 if( rc!=SQLITE_OK ) return rc; | |
606 } | |
607 | |
608 *ppStmt = v->pFulltextStatements[iStmt]; | |
609 return SQLITE_OK; | |
610 } | |
611 | |
612 /* Step the indicated statement, handling errors SQLITE_BUSY (by | |
613 ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring | |
614 ** bindings to the new statement). | |
615 ** TODO(adam): We should extend this function so that it can work with | |
616 ** statements declared locally, not only globally cached statements. | |
617 */ | |
618 static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, | |
619 sqlite3_stmt **ppStmt){ | |
620 int rc; | |
621 sqlite3_stmt *s = *ppStmt; | |
622 assert( iStmt<MAX_STMT ); | |
623 assert( s==v->pFulltextStatements[iStmt] ); | |
624 | |
625 while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ | |
626 sqlite3_stmt *pNewStmt; | |
627 | |
628 if( rc==SQLITE_BUSY ) continue; | |
629 if( rc!=SQLITE_ERROR ) return rc; | |
630 | |
631 rc = sqlite3_reset(s); | |
632 if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR; | |
633 | |
634 v->pFulltextStatements[iStmt] = NULL; /* Still in s */ | |
635 rc = sql_get_statement(v, iStmt, &pNewStmt); | |
636 if( rc!=SQLITE_OK ) goto err; | |
637 *ppStmt = pNewStmt; | |
638 | |
639 rc = sqlite3_transfer_bindings(s, pNewStmt); | |
640 if( rc!=SQLITE_OK ) goto err; | |
641 | |
642 rc = sqlite3_finalize(s); | |
643 if( rc!=SQLITE_OK ) return rc; | |
644 s = pNewStmt; | |
645 } | |
646 return rc; | |
647 | |
648 err: | |
649 sqlite3_finalize(s); | |
650 return rc; | |
651 } | |
652 | |
653 /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. | |
654 ** Useful for statements like UPDATE, where we expect no results. | |
655 */ | |
656 static int sql_single_step_statement(fulltext_vtab *v, | |
657 fulltext_statement iStmt, | |
658 sqlite3_stmt **ppStmt){ | |
659 int rc = sql_step_statement(v, iStmt, ppStmt); | |
660 return (rc==SQLITE_DONE) ? SQLITE_OK : rc; | |
661 } | |
662 | |
663 /* insert into %_content (rowid, content) values ([rowid], [zContent]) */ | |
664 static int content_insert(fulltext_vtab *v, sqlite3_value *rowid, | |
665 const char *zContent, int nContent){ | |
666 sqlite3_stmt *s; | |
667 int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); | |
668 if( rc!=SQLITE_OK ) return rc; | |
669 | |
670 rc = sqlite3_bind_value(s, 1, rowid); | |
671 if( rc!=SQLITE_OK ) return rc; | |
672 | |
673 rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC); | |
674 if( rc!=SQLITE_OK ) return rc; | |
675 | |
676 return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s); | |
677 } | |
678 | |
679 /* select content from %_content where rowid = [iRow] | |
680 * The caller must delete the returned string. */ | |
681 static int content_select(fulltext_vtab *v, sqlite_int64 iRow, | |
682 char **pzContent){ | |
683 sqlite3_stmt *s; | |
684 int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); | |
685 if( rc!=SQLITE_OK ) return rc; | |
686 | |
687 rc = sqlite3_bind_int64(s, 1, iRow); | |
688 if( rc!=SQLITE_OK ) return rc; | |
689 | |
690 rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s); | |
691 if( rc!=SQLITE_ROW ) return rc; | |
692 | |
693 *pzContent = string_dup((const char *)sqlite3_column_text(s, 0)); | |
694 | |
695 /* We expect only one row. We must execute another sqlite3_step() | |
696 * to complete the iteration; otherwise the table will remain locked. */ | |
697 rc = sqlite3_step(s); | |
698 if( rc==SQLITE_DONE ) return SQLITE_OK; | |
699 | |
700 free(*pzContent); | |
701 return rc; | |
702 } | |
703 | |
704 /* delete from %_content where rowid = [iRow ] */ | |
705 static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ | |
706 sqlite3_stmt *s; | |
707 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); | |
708 if( rc!=SQLITE_OK ) return rc; | |
709 | |
710 rc = sqlite3_bind_int64(s, 1, iRow); | |
711 if( rc!=SQLITE_OK ) return rc; | |
712 | |
713 return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); | |
714 } | |
715 | |
716 /* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst] | |
717 * If found, returns SQLITE_OK; the caller must free the returned doclist. | |
718 * If no rows found, returns SQLITE_ERROR. */ | |
719 static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm, | |
720 sqlite_int64 iFirst, | |
721 sqlite_int64 *rowid, | |
722 DocList *out){ | |
723 sqlite3_stmt *s; | |
724 int rc = sql_get_statement(v, TERM_SELECT_STMT, &s); | |
725 if( rc!=SQLITE_OK ) return rc; | |
726 | |
727 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT); | |
728 if( rc!=SQLITE_OK ) return rc; | |
729 | |
730 rc = sqlite3_bind_int64(s, 2, iFirst); | |
731 if( rc!=SQLITE_OK ) return rc; | |
732 | |
733 rc = sql_step_statement(v, TERM_SELECT_STMT, &s); | |
734 if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; | |
735 | |
736 *rowid = sqlite3_column_int64(s, 0); | |
737 docListInit(out, DL_POSITIONS_OFFSETS, | |
738 sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1)); | |
739 | |
740 /* We expect only one row. We must execute another sqlite3_step() | |
741 * to complete the iteration; otherwise the table will remain locked. */ | |
742 rc = sqlite3_step(s); | |
743 return rc==SQLITE_DONE ? SQLITE_OK : rc; | |
744 } | |
745 | |
746 /* select max(first) from %_term where term = [zTerm] and first <= [iFirst] | |
747 * If found, returns SQLITE_ROW and result in *piResult; if the query returns | |
748 * NULL (meaning no row found) returns SQLITE_DONE. | |
749 */ | |
750 static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm, | |
751 sqlite_int64 iFirst, sqlite_int64 *piResult){ | |
752 sqlite3_stmt *s; | |
753 int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s); | |
754 if( rc!=SQLITE_OK ) return rc; | |
755 | |
756 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); | |
757 if( rc!=SQLITE_OK ) return rc; | |
758 | |
759 rc = sqlite3_bind_int64(s, 2, iFirst); | |
760 if( rc!=SQLITE_OK ) return rc; | |
761 | |
762 rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s); | |
763 if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; | |
764 | |
765 switch( sqlite3_column_type(s, 0) ){ | |
766 case SQLITE_NULL: | |
767 rc = SQLITE_DONE; | |
768 break; | |
769 case SQLITE_INTEGER: | |
770 *piResult = sqlite3_column_int64(s, 0); | |
771 break; | |
772 default: | |
773 return SQLITE_ERROR; | |
774 } | |
775 /* We expect only one row. We must execute another sqlite3_step() | |
776 * to complete the iteration; otherwise the table will remain locked. */ | |
777 if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR; | |
778 return rc; | |
779 } | |
780 | |
781 /* insert into %_term (term, first, doclist) | |
782 values ([zTerm], [iFirst], [doclist]) */ | |
783 static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm, | |
784 sqlite_int64 iFirst, DocList *doclist){ | |
785 sqlite3_stmt *s; | |
786 int rc = sql_get_statement(v, TERM_INSERT_STMT, &s); | |
787 if( rc!=SQLITE_OK ) return rc; | |
788 | |
789 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); | |
790 if( rc!=SQLITE_OK ) return rc; | |
791 | |
792 rc = sqlite3_bind_int64(s, 2, iFirst); | |
793 if( rc!=SQLITE_OK ) return rc; | |
794 | |
795 rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC); | |
796 if( rc!=SQLITE_OK ) return rc; | |
797 | |
798 return sql_single_step_statement(v, TERM_INSERT_STMT, &s); | |
799 } | |
800 | |
801 /* update %_term set doclist = [doclist] where rowid = [rowid] */ | |
802 static int term_update(fulltext_vtab *v, sqlite_int64 rowid, | |
803 DocList *doclist){ | |
804 sqlite3_stmt *s; | |
805 int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); | |
806 if( rc!=SQLITE_OK ) return rc; | |
807 | |
808 rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, | |
809 SQLITE_STATIC); | |
810 if( rc!=SQLITE_OK ) return rc; | |
811 | |
812 rc = sqlite3_bind_int64(s, 2, rowid); | |
813 if( rc!=SQLITE_OK ) return rc; | |
814 | |
815 return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); | |
816 } | |
817 | |
818 static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ | |
819 sqlite3_stmt *s; | |
820 int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); | |
821 if( rc!=SQLITE_OK ) return rc; | |
822 | |
823 rc = sqlite3_bind_int64(s, 1, rowid); | |
824 if( rc!=SQLITE_OK ) return rc; | |
825 | |
826 return sql_single_step_statement(v, TERM_DELETE_STMT, &s); | |
827 } | |
828 | |
829 static void fulltext_vtab_destroy(fulltext_vtab *v){ | |
830 int iStmt; | |
831 | |
832 for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){ | |
833 if( v->pFulltextStatements[iStmt]!=NULL ){ | |
834 sqlite3_finalize(v->pFulltextStatements[iStmt]); | |
835 v->pFulltextStatements[iStmt] = NULL; | |
836 } | |
837 } | |
838 | |
839 if( v->pTokenizer!=NULL ){ | |
840 v->pTokenizer->pModule->xDestroy(v->pTokenizer); | |
841 v->pTokenizer = NULL; | |
842 } | |
843 | |
844 free((void *) v->zName); | |
845 free(v); | |
846 } | |
847 | |
848 /* Current interface: | |
849 ** argv[0] - module name | |
850 ** argv[1] - database name | |
851 ** argv[2] - table name | |
852 ** argv[3] - tokenizer name (optional, a sensible default is provided) | |
853 ** argv[4..] - passed to tokenizer (optional based on tokenizer) | |
854 **/ | |
855 static int fulltextConnect( | |
856 sqlite3 *db, | |
857 void *pAux, | |
858 int argc, | |
859 const char * const *argv, | |
860 sqlite3_vtab **ppVTab, | |
861 char **pzErr | |
862 ){ | |
863 int rc; | |
864 fulltext_vtab *v; | |
865 sqlite3_tokenizer_module *m = NULL; | |
866 | |
867 assert( argc>=3 ); | |
868 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); | |
869 /* sqlite will initialize v->base */ | |
870 v->db = db; | |
871 v->zName = string_dup(argv[2]); | |
872 v->pTokenizer = NULL; | |
873 | |
874 if( argc==3 ){ | |
875 get_simple_tokenizer_module(&m); | |
876 } else { | |
877 /* TODO(shess) For now, add new tokenizers as else if clauses. */ | |
878 if( !strcmp(argv[3], "simple") ){ | |
879 get_simple_tokenizer_module(&m); | |
880 } else { | |
881 assert( "unrecognized tokenizer"==NULL ); | |
882 } | |
883 } | |
884 | |
885 /* TODO(shess) Since tokenization impacts the index, the parameters | |
886 ** to the tokenizer need to be identical when a persistent virtual | |
887 ** table is re-created. One solution would be a meta-table to track | |
888 ** such information in the database. Then we could verify that the | |
889 ** information is identical on subsequent creates. | |
890 */ | |
891 /* TODO(shess) Why isn't argv already (const char **)? */ | |
892 rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer); | |
893 if( rc!=SQLITE_OK ) return rc; | |
894 v->pTokenizer->pModule = m; | |
895 | |
896 /* TODO: verify the existence of backing tables foo_content, foo_term */ | |
897 | |
898 rc = sqlite3_declare_vtab(db, "create table x(content text)"); | |
899 if( rc!=SQLITE_OK ) return rc; | |
900 | |
901 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); | |
902 | |
903 *ppVTab = &v->base; | |
904 return SQLITE_OK; | |
905 } | |
906 | |
907 static int fulltextCreate( | |
908 sqlite3 *db, | |
909 void *pAux, | |
910 int argc, | |
911 const char * const *argv, | |
912 sqlite3_vtab **ppVTab, | |
913 char **pzErr | |
914 ){ | |
915 int rc; | |
916 assert( argc>=3 ); | |
917 | |
918 /* The %_content table holds the text of each full-text item, with | |
919 ** the rowid used as the docid. | |
920 ** | |
921 ** The %_term table maps each term to a document list blob | |
922 ** containing elements sorted by ascending docid, each element | |
923 ** encoded as: | |
924 ** | |
925 ** docid varint-encoded | |
926 ** token count varint-encoded | |
927 ** "count" token elements (poslist): | |
928 ** position varint-encoded as delta from previous position | |
929 ** start offset varint-encoded as delta from previous start offset | |
930 ** end offset varint-encoded as delta from start offset | |
931 ** | |
932 ** Additionally, doclist blobs can be chunked into multiple rows, | |
933 ** using "first" to order the blobs. "first" is simply the first | |
934 ** docid in the blob. | |
935 */ | |
936 /* | |
937 ** NOTE(shess) That last sentence is incorrect in the face of | |
938 ** deletion, which can leave a doclist that doesn't contain the | |
939 ** first from that row. I _believe_ this does not matter to the | |
940 ** operation of the system, but it might be reasonable to update | |
941 ** appropriately in case this assumption becomes more important. | |
942 */ | |
943 rc = sql_exec(db, argv[2], | |
944 "create table %_content(content text);" | |
945 "create table %_term(term text, first integer, doclist blob);" | |
946 "create index %_index on %_term(term, first)"); | |
947 if( rc!=SQLITE_OK ) return rc; | |
948 | |
949 return fulltextConnect(db, pAux, argc, argv, ppVTab, pzErr); | |
950 } | |
951 | |
952 /* Decide how to handle an SQL query. | |
953 * At the moment, MATCH queries can include implicit boolean ANDs; we | |
954 * haven't implemented phrase searches or OR yet. */ | |
955 static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ | |
956 int i; | |
957 | |
958 for(i=0; i<pInfo->nConstraint; ++i){ | |
959 const struct sqlite3_index_constraint *pConstraint; | |
960 pConstraint = &pInfo->aConstraint[i]; | |
961 if( pConstraint->iColumn==0 && | |
962 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH && | |
963 pConstraint->usable ){ /* a full-text search */ | |
964 pInfo->aConstraintUsage[i].argvIndex = 1; | |
965 pInfo->aConstraintUsage[i].omit = 1; | |
966 pInfo->idxNum = QUERY_FULLTEXT; | |
967 pInfo->estimatedCost = 1.0; /* an arbitrary value for now */ | |
968 return SQLITE_OK; | |
969 } | |
970 } | |
971 pInfo->idxNum = QUERY_GENERIC; | |
972 return SQLITE_OK; | |
973 } | |
974 | |
975 static int fulltextDisconnect(sqlite3_vtab *pVTab){ | |
976 fulltext_vtab_destroy((fulltext_vtab *)pVTab); | |
977 return SQLITE_OK; | |
978 } | |
979 | |
980 static int fulltextDestroy(sqlite3_vtab *pVTab){ | |
981 fulltext_vtab *v = (fulltext_vtab *)pVTab; | |
982 | |
983 int rc = sql_exec(v->db, v->zName, | |
984 "drop table %_content; drop table %_term"); | |
985 if( rc!=SQLITE_OK ) return rc; | |
986 | |
987 fulltext_vtab_destroy((fulltext_vtab *)pVTab); | |
988 return SQLITE_OK; | |
989 } | |
990 | |
991 static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ | |
992 fulltext_cursor *c; | |
993 | |
994 c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); | |
995 /* sqlite will initialize c->base */ | |
996 *ppCursor = &c->base; | |
997 | |
998 return SQLITE_OK; | |
999 } | |
1000 | |
1001 static int fulltextClose(sqlite3_vtab_cursor *pCursor){ | |
1002 fulltext_cursor *c = (fulltext_cursor *) pCursor; | |
1003 sqlite3_finalize(c->pStmt); | |
1004 if( c->result.pDoclist!=NULL ){ | |
1005 docListDelete(c->result.pDoclist); | |
1006 } | |
1007 free(c); | |
1008 return SQLITE_OK; | |
1009 } | |
1010 | |
1011 static int fulltextNext(sqlite3_vtab_cursor *pCursor){ | |
1012 fulltext_cursor *c = (fulltext_cursor *) pCursor; | |
1013 sqlite_int64 iDocid; | |
1014 int rc; | |
1015 | |
1016 switch( c->iCursorType ){ | |
1017 case QUERY_GENERIC: | |
1018 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ | |
1019 rc = sqlite3_step(c->pStmt); | |
1020 switch( rc ){ | |
1021 case SQLITE_ROW: | |
1022 c->eof = 0; | |
1023 return SQLITE_OK; | |
1024 case SQLITE_DONE: | |
1025 c->eof = 1; | |
1026 return SQLITE_OK; | |
1027 default: | |
1028 c->eof = 1; | |
1029 return rc; | |
1030 } | |
1031 case QUERY_FULLTEXT: | |
1032 rc = sqlite3_reset(c->pStmt); | |
1033 if( rc!=SQLITE_OK ) return rc; | |
1034 | |
1035 if( readerAtEnd(&c->result)){ | |
1036 c->eof = 1; | |
1037 return SQLITE_OK; | |
1038 } | |
1039 iDocid = readDocid(&c->result); | |
1040 rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); | |
1041 if( rc!=SQLITE_OK ) return rc; | |
1042 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ | |
1043 rc = sqlite3_step(c->pStmt); | |
1044 if( rc==SQLITE_ROW ){ /* the case we expect */ | |
1045 c->eof = 0; | |
1046 return SQLITE_OK; | |
1047 } | |
1048 /* an error occurred; abort */ | |
1049 return rc==SQLITE_DONE ? SQLITE_ERROR : rc; | |
1050 default: | |
1051 assert( 0 ); | |
1052 return SQLITE_ERROR; /* not reached */ | |
1053 } | |
1054 } | |
1055 | |
1056 static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm, | |
1057 sqlite3_stmt **ppStmt){ | |
1058 int rc; | |
1059 if( *ppStmt ){ | |
1060 rc = sqlite3_reset(*ppStmt); | |
1061 } else { | |
1062 rc = sql_prepare(v->db, v->zName, ppStmt, | |
1063 "select doclist from %_term where term = ? order by first"); | |
1064 } | |
1065 if( rc!=SQLITE_OK ) return rc; | |
1066 | |
1067 rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT); | |
1068 if( rc!=SQLITE_OK ) return rc; | |
1069 | |
1070 return sqlite3_step(*ppStmt); /* TODO(adamd): handle schema error */ | |
1071 } | |
1072 | |
1073 /* Read the posting list for [zTerm]; AND it with the doclist [in] to | |
1074 * produce the doclist [out], using the given offset [iOffset] for phrase | |
1075 * matching. | |
1076 * (*pSelect) is used to hold an SQLite statement used inside this function; | |
1077 * the caller should initialize *pSelect to NULL before the first call. | |
1078 */ | |
1079 static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect, | |
1080 const char *zTerm, | |
1081 DocList *pIn, int iOffset, DocList *out){ | |
1082 int rc; | |
1083 DocListMerge merge; | |
1084 | |
1085 if( pIn!=NULL && !pIn->nData ){ | |
1086 /* If [pIn] is already empty, there's no point in reading the | |
1087 * posting list to AND it in; return immediately. */ | |
1088 return SQLITE_OK; | |
1089 } | |
1090 | |
1091 rc = term_select_doclist(v, zTerm, -1, pSelect); | |
1092 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc; | |
1093 | |
1094 mergeInit(&merge, pIn, iOffset, out); | |
1095 while( rc==SQLITE_ROW ){ | |
1096 DocList block; | |
1097 docListInit(&block, DL_POSITIONS_OFFSETS, | |
1098 sqlite3_column_blob(*pSelect, 0), | |
1099 sqlite3_column_bytes(*pSelect, 0)); | |
1100 mergeBlock(&merge, &block); | |
1101 docListDestroy(&block); | |
1102 | |
1103 rc = sqlite3_step(*pSelect); | |
1104 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){ | |
1105 return rc; | |
1106 } | |
1107 } | |
1108 | |
1109 return SQLITE_OK; | |
1110 } | |
1111 | |
1112 typedef struct QueryTerm { | |
1113 int is_phrase; /* true if this term begins a new phrase */ | |
1114 const char *zTerm; | |
1115 } QueryTerm; | |
1116 | |
1117 /* A parsed query. | |
1118 * | |
1119 * As an example, parsing the query ["four score" years "new nation"] will | |
1120 * yield a Query with 5 terms: | |
1121 * "four", is_phrase = 1 | |
1122 * "score", is_phrase = 0 | |
1123 * "years", is_phrase = 1 | |
1124 * "new", is_phrase = 1 | |
1125 * "nation", is_phrase = 0 | |
1126 */ | |
1127 typedef struct Query { | |
1128 int nTerms; | |
1129 QueryTerm *pTerm; | |
1130 } Query; | |
1131 | |
1132 static void query_add(Query *q, int is_phrase, const char *zTerm){ | |
1133 QueryTerm *t; | |
1134 ++q->nTerms; | |
1135 q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0])); | |
1136 t = &q->pTerm[q->nTerms - 1]; | |
1137 t->is_phrase = is_phrase; | |
1138 t->zTerm = zTerm; | |
1139 } | |
1140 | |
1141 static void query_free(Query *q){ | |
1142 int i; | |
1143 for(i = 0; i < q->nTerms; ++i){ | |
1144 free((void *) q->pTerm[i].zTerm); | |
1145 } | |
1146 free(q->pTerm); | |
1147 } | |
1148 | |
1149 static int tokenize_segment(sqlite3_tokenizer *pTokenizer, | |
1150 const char *zQuery, int in_phrase, | |
1151 Query *pQuery){ | |
1152 sqlite3_tokenizer_module *pModule = pTokenizer->pModule; | |
1153 sqlite3_tokenizer_cursor *pCursor; | |
1154 int is_first = 1; | |
1155 | |
1156 int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor); | |
1157 if( rc!=SQLITE_OK ) return rc; | |
1158 pCursor->pTokenizer = pTokenizer; | |
1159 | |
1160 while( 1 ){ | |
1161 const char *zToken; | |
1162 int nToken, iStartOffset, iEndOffset, dummy_pos; | |
1163 | |
1164 rc = pModule->xNext(pCursor, | |
1165 &zToken, &nToken, | |
1166 &iStartOffset, &iEndOffset, | |
1167 &dummy_pos); | |
1168 if( rc!=SQLITE_OK ) break; | |
1169 query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken)); | |
1170 is_first = 0; | |
1171 } | |
1172 | |
1173 return pModule->xClose(pCursor); | |
1174 } | |
1175 | |
1176 /* Parse a query string, yielding a Query object. */ | |
1177 static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){ | |
1178 char *zQuery1 = string_dup(zQuery); | |
1179 int in_phrase = 0; | |
1180 char *s = zQuery1; | |
1181 pQuery->nTerms = 0; | |
1182 pQuery->pTerm = NULL; | |
1183 | |
1184 while( *s ){ | |
1185 char *t = s; | |
1186 while( *t ){ | |
1187 if( *t=='"' ){ | |
1188 *t++ = '\0'; | |
1189 break; | |
1190 } | |
1191 ++t; | |
1192 } | |
1193 if( *s ){ | |
1194 tokenize_segment(v->pTokenizer, s, in_phrase, pQuery); | |
1195 } | |
1196 s = t; | |
1197 in_phrase = !in_phrase; | |
1198 } | |
1199 | |
1200 free(zQuery1); | |
1201 return SQLITE_OK; | |
1202 } | |
1203 | |
1204 /* Perform a full-text query; return a list of documents in [pResult]. */ | |
1205 static int fulltext_query(fulltext_vtab *v, const char *zQuery, | |
1206 DocList **pResult){ | |
1207 Query q; | |
1208 int phrase_start = -1; | |
1209 int i; | |
1210 sqlite3_stmt *pSelect = NULL; | |
1211 DocList *d = NULL; | |
1212 | |
1213 int rc = parse_query(v, zQuery, &q); | |
1214 if( rc!=SQLITE_OK ) return rc; | |
1215 | |
1216 /* Merge terms. */ | |
1217 for(i = 0 ; i < q.nTerms ; ++i){ | |
1218 /* In each merge step, we need to generate positions whenever we're | |
1219 * processing a phrase which hasn't ended yet. */ | |
1220 int need_positions = i<q.nTerms-1 && !q.pTerm[i+1].is_phrase; | |
1221 DocList *next = docListNew(need_positions ? DL_POSITIONS : DL_DOCIDS); | |
1222 if( q.pTerm[i].is_phrase ){ | |
1223 phrase_start = i; | |
1224 } | |
1225 rc = query_merge(v, &pSelect, q.pTerm[i].zTerm, d, i - phrase_start, next); | |
1226 if( rc!=SQLITE_OK ) break; | |
1227 if( d!=NULL ){ | |
1228 docListDelete(d); | |
1229 } | |
1230 d = next; | |
1231 } | |
1232 | |
1233 sqlite3_finalize(pSelect); | |
1234 query_free(&q); | |
1235 *pResult = d; | |
1236 return rc; | |
1237 } | |
1238 | |
1239 static int fulltextFilter(sqlite3_vtab_cursor *pCursor, | |
1240 int idxNum, const char *idxStr, | |
1241 int argc, sqlite3_value **argv){ | |
1242 fulltext_cursor *c = (fulltext_cursor *) pCursor; | |
1243 fulltext_vtab *v = cursor_vtab(c); | |
1244 int rc; | |
1245 const char *zStatement; | |
1246 | |
1247 c->iCursorType = idxNum; | |
1248 switch( idxNum ){ | |
1249 case QUERY_GENERIC: | |
1250 zStatement = "select rowid, content from %_content"; | |
1251 break; | |
1252 | |
1253 case QUERY_FULLTEXT: /* full-text search */ | |
1254 { | |
1255 const char *zQuery = (const char *)sqlite3_value_text(argv[0]); | |
1256 DocList *pResult; | |
1257 assert( argc==1 ); | |
1258 rc = fulltext_query(v, zQuery, &pResult); | |
1259 if( rc!=SQLITE_OK ) return rc; | |
1260 readerInit(&c->result, pResult); | |
1261 zStatement = "select rowid, content from %_content where rowid = ?"; | |
1262 break; | |
1263 } | |
1264 | |
1265 default: | |
1266 assert( 0 ); | |
1267 } | |
1268 | |
1269 rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement); | |
1270 if( rc!=SQLITE_OK ) return rc; | |
1271 | |
1272 return fulltextNext(pCursor); | |
1273 } | |
1274 | |
1275 static int fulltextEof(sqlite3_vtab_cursor *pCursor){ | |
1276 fulltext_cursor *c = (fulltext_cursor *) pCursor; | |
1277 return c->eof; | |
1278 } | |
1279 | |
1280 static int fulltextColumn(sqlite3_vtab_cursor *pCursor, | |
1281 sqlite3_context *pContext, int idxCol){ | |
1282 fulltext_cursor *c = (fulltext_cursor *) pCursor; | |
1283 const char *s; | |
1284 | |
1285 assert( idxCol==0 ); | |
1286 s = (const char *) sqlite3_column_text(c->pStmt, 1); | |
1287 sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT); | |
1288 | |
1289 return SQLITE_OK; | |
1290 } | |
1291 | |
1292 static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ | |
1293 fulltext_cursor *c = (fulltext_cursor *) pCursor; | |
1294 | |
1295 *pRowid = sqlite3_column_int64(c->pStmt, 0); | |
1296 return SQLITE_OK; | |
1297 } | |
1298 | |
1299 /* Build a hash table containing all terms in zText. */ | |
1300 static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer, | |
1301 const char *zText, sqlite_int64 iDocid){ | |
1302 sqlite3_tokenizer_cursor *pCursor; | |
1303 const char *pToken; | |
1304 int nTokenBytes; | |
1305 int iStartOffset, iEndOffset, iPosition; | |
1306 | |
1307 int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); | |
1308 if( rc!=SQLITE_OK ) return rc; | |
1309 | |
1310 pCursor->pTokenizer = pTokenizer; | |
1311 HashInit(terms, HASH_STRING, 1); | |
1312 while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, | |
1313 &pToken, &nTokenBytes, | |
1314 &iStartOffset, &iEndOffset, | |
1315 &iPosition) ){ | |
1316 DocList *p; | |
1317 | |
1318 /* Positions can't be negative; we use -1 as a terminator internally. */ | |
1319 if( iPosition<0 ) { | |
1320 rc = SQLITE_ERROR; | |
1321 goto err; | |
1322 } | |
1323 | |
1324 p = HashFind(terms, pToken, nTokenBytes); | |
1325 if( p==NULL ){ | |
1326 p = docListNew(DL_POSITIONS_OFFSETS); | |
1327 docListAddDocid(p, iDocid); | |
1328 HashInsert(terms, pToken, nTokenBytes, p); | |
1329 } | |
1330 docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset); | |
1331 } | |
1332 | |
1333 err: | |
1334 /* TODO(shess) Check return? Should this be able to cause errors at | |
1335 ** this point? Actually, same question about sqlite3_finalize(), | |
1336 ** though one could argue that failure there means that the data is | |
1337 ** not durable. *ponder* | |
1338 */ | |
1339 pTokenizer->pModule->xClose(pCursor); | |
1340 return rc; | |
1341 } | |
1342 /* Update the %_terms table to map the term [zTerm] to the given rowid. */ | |
1343 static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm, | |
1344 sqlite_int64 iDocid, DocList *p){ | |
1345 sqlite_int64 iFirst; | |
1346 sqlite_int64 iIndexRow; | |
1347 DocList doclist; | |
1348 | |
1349 int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); | |
1350 if( rc==SQLITE_DONE ){ | |
1351 docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0); | |
1352 if( docListUpdate(&doclist, iDocid, p) ){ | |
1353 rc = term_insert(v, zTerm, nTerm, iDocid, &doclist); | |
1354 docListDestroy(&doclist); | |
1355 return rc; | |
1356 } | |
1357 return SQLITE_OK; | |
1358 } | |
1359 if( rc!=SQLITE_ROW ) return SQLITE_ERROR; | |
1360 | |
1361 /* This word is in the index; add this document ID to its blob. */ | |
1362 | |
1363 rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); | |
1364 if( rc!=SQLITE_OK ) return rc; | |
1365 | |
1366 if( docListUpdate(&doclist, iDocid, p) ){ | |
1367 /* If the blob is too big, split it in half. */ | |
1368 if( doclist.nData>CHUNK_MAX ){ | |
1369 DocList half; | |
1370 if( docListSplit(&doclist, &half) ){ | |
1371 rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half); | |
1372 docListDestroy(&half); | |
1373 if( rc!=SQLITE_OK ) goto err; | |
1374 } | |
1375 } | |
1376 rc = term_update(v, iIndexRow, &doclist); | |
1377 } | |
1378 | |
1379 err: | |
1380 docListDestroy(&doclist); | |
1381 return rc; | |
1382 } | |
1383 | |
1384 /* Insert a row into the full-text index; set *piRowid to be the ID of the | |
1385 * new row. */ | |
1386 static int index_insert(fulltext_vtab *v, | |
1387 sqlite3_value *pRequestRowid, const char *zText, | |
1388 sqlite_int64 *piRowid){ | |
1389 Hash terms; /* maps term string -> PosList */ | |
1390 HashElem *e; | |
1391 | |
1392 int rc = content_insert(v, pRequestRowid, zText, -1); | |
1393 if( rc!=SQLITE_OK ) return rc; | |
1394 *piRowid = sqlite3_last_insert_rowid(v->db); | |
1395 | |
1396 if( !zText ) return SQLITE_OK; /* nothing to index */ | |
1397 | |
1398 rc = build_terms(&terms, v->pTokenizer, zText, *piRowid); | |
1399 if( rc!=SQLITE_OK ) return rc; | |
1400 | |
1401 for(e=HashFirst(&terms); e; e=HashNext(e)){ | |
1402 DocList *p = HashData(e); | |
1403 rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p); | |
1404 if( rc!=SQLITE_OK ) break; | |
1405 } | |
1406 | |
1407 for(e=HashFirst(&terms); e; e=HashNext(e)){ | |
1408 DocList *p = HashData(e); | |
1409 docListDelete(p); | |
1410 } | |
1411 HashClear(&terms); | |
1412 return rc; | |
1413 } | |
1414 | |
1415 static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm, | |
1416 sqlite_int64 iDocid){ | |
1417 sqlite_int64 iFirst; | |
1418 sqlite_int64 iIndexRow; | |
1419 DocList doclist; | |
1420 | |
1421 int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); | |
1422 if( rc!=SQLITE_ROW ) return SQLITE_ERROR; | |
1423 | |
1424 rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); | |
1425 if( rc!=SQLITE_OK ) return rc; | |
1426 | |
1427 if( docListUpdate(&doclist, iDocid, NULL) ){ | |
1428 if( doclist.nData>0 ){ | |
1429 rc = term_update(v, iIndexRow, &doclist); | |
1430 } else { /* empty posting list */ | |
1431 rc = term_delete(v, iIndexRow); | |
1432 } | |
1433 } | |
1434 docListDestroy(&doclist); | |
1435 return rc; | |
1436 } | |
1437 | |
1438 /* Delete a row from the full-text index. */ | |
1439 static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ | |
1440 char *zText; | |
1441 Hash terms; | |
1442 HashElem *e; | |
1443 | |
1444 int rc = content_select(v, iRow, &zText); | |
1445 if( rc!=SQLITE_OK ) return rc; | |
1446 | |
1447 rc = build_terms(&terms, v->pTokenizer, zText, iRow); | |
1448 free(zText); | |
1449 if( rc!=SQLITE_OK ) return rc; | |
1450 | |
1451 for(e=HashFirst(&terms); e; e=HashNext(e)){ | |
1452 rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow); | |
1453 if( rc!=SQLITE_OK ) break; | |
1454 } | |
1455 for(e=HashFirst(&terms); e; e=HashNext(e)){ | |
1456 DocList *p = HashData(e); | |
1457 docListDelete(p); | |
1458 } | |
1459 HashClear(&terms); | |
1460 | |
1461 return content_delete(v, iRow); | |
1462 } | |
1463 | |
1464 static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, | |
1465 sqlite_int64 *pRowid){ | |
1466 fulltext_vtab *v = (fulltext_vtab *) pVtab; | |
1467 | |
1468 if( nArg<2 ){ | |
1469 return index_delete(v, sqlite3_value_int64(ppArg[0])); | |
1470 } | |
1471 | |
1472 if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ | |
1473 return SQLITE_ERROR; /* an update; not yet supported */ | |
1474 } | |
1475 | |
1476 assert( nArg==3 ); /* ppArg[1] = rowid, ppArg[2] = content */ | |
1477 return index_insert(v, ppArg[1], | |
1478 (const char *)sqlite3_value_text(ppArg[2]), pRowid); | |
1479 } | |
1480 | |
1481 static sqlite3_module fulltextModule = { | |
1482 0, | |
1483 fulltextCreate, | |
1484 fulltextConnect, | |
1485 fulltextBestIndex, | |
1486 fulltextDisconnect, | |
1487 fulltextDestroy, | |
1488 fulltextOpen, | |
1489 fulltextClose, | |
1490 fulltextFilter, | |
1491 fulltextNext, | |
1492 fulltextEof, | |
1493 fulltextColumn, | |
1494 fulltextRowid, | |
1495 fulltextUpdate | |
1496 }; | |
1497 | |
1498 int fulltext_init(sqlite3 *db){ | |
1499 return sqlite3_create_module(db, "fulltext", &fulltextModule, 0); | |
1500 } | |
1501 | |
1502 #if !SQLITE_CORE | |
1503 #ifdef _WIN32 | |
1504 __declspec(dllexport) | |
1505 #endif | |
1506 int sqlite3_fulltext_init(sqlite3 *db, char **pzErrMsg, | |
1507 const sqlite3_api_routines *pApi){ | |
1508 SQLITE_EXTENSION_INIT2(pApi) | |
1509 return fulltext_init(db); | |
1510 } | |
1511 #endif | |
OLD | NEW |