| OLD | NEW |
| (Empty) |
| 1 /* The author disclaims copyright to this source code. | |
| 2 * | |
| 3 * This is an SQLite module implementing full-text search. | |
| 4 */ | |
| 5 | |
| 6 #include <assert.h> | |
| 7 #if !defined(__APPLE__) | |
| 8 #include <malloc.h> | |
| 9 #else | |
| 10 #include <stdlib.h> | |
| 11 #endif | |
| 12 #include <stdio.h> | |
| 13 #include <string.h> | |
| 14 #include <ctype.h> | |
| 15 | |
| 16 #include "fulltext.h" | |
| 17 #include "ft_hash.h" | |
| 18 #include "tokenizer.h" | |
| 19 #include "sqlite3.h" | |
| 20 #include "sqlite3ext.h" | |
| 21 SQLITE_EXTENSION_INIT1 | |
| 22 | |
| 23 /* utility functions */ | |
| 24 | |
| 25 /* We encode variable-length integers in little-endian order using seven bits | |
| 26 * per byte as follows: | |
| 27 ** | |
| 28 ** KEY: | |
| 29 ** A = 0xxxxxxx 7 bits of data and one flag bit | |
| 30 ** B = 1xxxxxxx 7 bits of data and one flag bit | |
| 31 ** | |
| 32 ** 7 bits - A | |
| 33 ** 14 bits - BA | |
| 34 ** 21 bits - BBA | |
| 35 ** and so on. | |
| 36 */ | |
| 37 | |
| 38 /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ | |
| 39 #define VARINT_MAX 10 | |
| 40 | |
| 41 /* Write a 64-bit variable-length integer to memory starting at p[0]. | |
| 42 * The length of data written will be between 1 and VARINT_MAX bytes. | |
| 43 * The number of bytes written is returned. */ | |
| 44 static int putVarint(char *p, sqlite_int64 v){ | |
| 45 unsigned char *q = (unsigned char *) p; | |
| 46 sqlite_uint64 vu = v; | |
| 47 do{ | |
| 48 *q++ = (unsigned char) ((vu & 0x7f) | 0x80); | |
| 49 vu >>= 7; | |
| 50 }while( vu!=0 ); | |
| 51 q[-1] &= 0x7f; /* turn off high bit in final byte */ | |
| 52 assert( q - (unsigned char *)p <= VARINT_MAX ); | |
| 53 return (int) (q - (unsigned char *)p); | |
| 54 } | |
| 55 | |
| 56 /* Read a 64-bit variable-length integer from memory starting at p[0]. | |
| 57 * Return the number of bytes read, or 0 on error. | |
| 58 * The value is stored in *v. */ | |
| 59 static int getVarint(const char *p, sqlite_int64 *v){ | |
| 60 const unsigned char *q = (const unsigned char *) p; | |
| 61 sqlite_uint64 x = 0, y = 1; | |
| 62 while( (*q & 0x80) == 0x80 ){ | |
| 63 x += y * (*q++ & 0x7f); | |
| 64 y <<= 7; | |
| 65 if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ | |
| 66 assert( 0 ); | |
| 67 return 0; | |
| 68 } | |
| 69 } | |
| 70 x += y * (*q++); | |
| 71 *v = (sqlite_int64) x; | |
| 72 return (int) (q - (unsigned char *)p); | |
| 73 } | |
| 74 | |
| 75 static int getVarint32(const char *p, int *pi){ | |
| 76 sqlite_int64 i; | |
| 77 int ret = getVarint(p, &i); | |
| 78 *pi = (int) i; | |
| 79 assert( *pi==i ); | |
| 80 return ret; | |
| 81 } | |
| 82 | |
| 83 /*** Document lists *** | |
| 84 * | |
| 85 * A document list holds a sorted list of varint-encoded document IDs. | |
| 86 * | |
| 87 * A doclist with type DL_POSITIONS_OFFSETS is stored like this: | |
| 88 * | |
| 89 * array { | |
| 90 * varint docid; | |
| 91 * array { | |
| 92 * varint position; (delta from previous position plus 1, or 0 for end) | |
| 93 * varint startOffset; (delta from previous startOffset) | |
| 94 * varint endOffset; (delta from startOffset) | |
| 95 * } | |
| 96 * } | |
| 97 * | |
| 98 * Here, array { X } means zero or more occurrences of X, adjacent in memory. | |
| 99 * | |
| 100 * A doclist with type DL_POSITIONS is like the above, but holds only docids | |
| 101 * and positions without offset information. | |
| 102 * | |
| 103 * A doclist with type DL_DOCIDS is like the above, but holds only docids | |
| 104 * without positions or offset information. | |
| 105 * | |
| 106 * On disk, every document list has positions and offsets, so we don't bother | |
| 107 * to serialize a doclist's type. | |
| 108 * | |
| 109 * We don't yet delta-encode document IDs; doing so will probably be a | |
| 110 * modest win. | |
| 111 * | |
| 112 * NOTE(shess) I've thought of a slightly (1%) better offset encoding. | |
| 113 * After the first offset, estimate the next offset by using the | |
| 114 * current token position and the previous token position and offset, | |
| 115 * offset to handle some variance. So the estimate would be | |
| 116 * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded | |
| 117 * as normal. Offsets more than 64 chars from the estimate are | |
| 118 * encoded as the delta to the previous start offset + 128. An | |
| 119 * additional tiny increment can be gained by using the end offset of | |
| 120 * the previous token to make the estimate a tiny bit more precise. | |
| 121 */ | |
| 122 | |
| 123 typedef enum DocListType { | |
| 124 DL_DOCIDS, /* docids only */ | |
| 125 DL_POSITIONS, /* docids + positions */ | |
| 126 DL_POSITIONS_OFFSETS /* docids + positions + offsets */ | |
| 127 } DocListType; | |
| 128 | |
| 129 typedef struct DocList { | |
| 130 char *pData; | |
| 131 int nData; | |
| 132 DocListType iType; | |
| 133 int iLastPos; /* the last position written */ | |
| 134 int iLastOffset; /* the last start offset written */ | |
| 135 } DocList; | |
| 136 | |
| 137 /* Initialize a new DocList to hold the given data. */ | |
| 138 static void docListInit(DocList *d, DocListType iType, | |
| 139 const char *pData, int nData){ | |
| 140 d->nData = nData; | |
| 141 if( nData>0 ){ | |
| 142 d->pData = malloc(nData); | |
| 143 memcpy(d->pData, pData, nData); | |
| 144 } else { | |
| 145 d->pData = NULL; | |
| 146 } | |
| 147 d->iType = iType; | |
| 148 d->iLastPos = 0; | |
| 149 d->iLastOffset = 0; | |
| 150 } | |
| 151 | |
| 152 /* Create a new dynamically-allocated DocList. */ | |
| 153 static DocList *docListNew(DocListType iType){ | |
| 154 DocList *d = (DocList *) malloc(sizeof(DocList)); | |
| 155 docListInit(d, iType, 0, 0); | |
| 156 return d; | |
| 157 } | |
| 158 | |
| 159 static void docListDestroy(DocList *d){ | |
| 160 free(d->pData); | |
| 161 #ifndef NDEBUG | |
| 162 memset(d, 0x55, sizeof(*d)); | |
| 163 #endif | |
| 164 } | |
| 165 | |
| 166 static void docListDelete(DocList *d){ | |
| 167 docListDestroy(d); | |
| 168 free(d); | |
| 169 } | |
| 170 | |
| 171 static char *docListEnd(DocList *d){ | |
| 172 return d->pData + d->nData; | |
| 173 } | |
| 174 | |
| 175 /* Append a varint to a DocList's data. */ | |
| 176 static void appendVarint(DocList *d, sqlite_int64 i){ | |
| 177 char c[VARINT_MAX]; | |
| 178 int n = putVarint(c, i); | |
| 179 d->pData = realloc(d->pData, d->nData + n); | |
| 180 memcpy(d->pData + d->nData, c, n); | |
| 181 d->nData += n; | |
| 182 } | |
| 183 | |
| 184 static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ | |
| 185 appendVarint(d, iDocid); | |
| 186 d->iLastPos = 0; | |
| 187 } | |
| 188 | |
| 189 /* Add a position to the last position list in a doclist. */ | |
| 190 static void docListAddPos(DocList *d, int iPos){ | |
| 191 assert( d->iType>=DL_POSITIONS ); | |
| 192 appendVarint(d, iPos-d->iLastPos+1); | |
| 193 d->iLastPos = iPos; | |
| 194 } | |
| 195 | |
| 196 static void docListAddPosOffset(DocList *d, int iPos, | |
| 197 int iStartOffset, int iEndOffset){ | |
| 198 assert( d->iType==DL_POSITIONS_OFFSETS ); | |
| 199 docListAddPos(d, iPos); | |
| 200 appendVarint(d, iStartOffset-d->iLastOffset); | |
| 201 d->iLastOffset = iStartOffset; | |
| 202 appendVarint(d, iEndOffset-iStartOffset); | |
| 203 } | |
| 204 | |
| 205 /* Terminate the last position list in the given doclist. */ | |
| 206 static void docListAddEndPos(DocList *d){ | |
| 207 appendVarint(d, 0); | |
| 208 } | |
| 209 | |
| 210 typedef struct DocListReader { | |
| 211 DocList *pDoclist; | |
| 212 char *p; | |
| 213 int iLastPos; /* the last position read */ | |
| 214 } DocListReader; | |
| 215 | |
| 216 static void readerInit(DocListReader *r, DocList *pDoclist){ | |
| 217 r->pDoclist = pDoclist; | |
| 218 if( pDoclist!=NULL ){ | |
| 219 r->p = pDoclist->pData; | |
| 220 } | |
| 221 r->iLastPos = 0; | |
| 222 } | |
| 223 | |
| 224 static int readerAtEnd(DocListReader *pReader){ | |
| 225 return pReader->p >= docListEnd(pReader->pDoclist); | |
| 226 } | |
| 227 | |
| 228 /* Peek at the next docid without advancing the read pointer. */ | |
| 229 static sqlite_int64 peekDocid(DocListReader *pReader){ | |
| 230 sqlite_int64 ret; | |
| 231 assert( !readerAtEnd(pReader) ); | |
| 232 getVarint(pReader->p, &ret); | |
| 233 return ret; | |
| 234 } | |
| 235 | |
| 236 /* Read the next docid. */ | |
| 237 static sqlite_int64 readDocid(DocListReader *pReader){ | |
| 238 sqlite_int64 ret; | |
| 239 assert( !readerAtEnd(pReader) ); | |
| 240 pReader->p += getVarint(pReader->p, &ret); | |
| 241 pReader->iLastPos = 0; | |
| 242 return ret; | |
| 243 } | |
| 244 | |
| 245 /* Read the next position from a position list. | |
| 246 * Returns the position, or -1 at the end of the list. */ | |
| 247 static int readPosition(DocListReader *pReader){ | |
| 248 int i; | |
| 249 int iType = pReader->pDoclist->iType; | |
| 250 assert( iType>=DL_POSITIONS ); | |
| 251 assert( !readerAtEnd(pReader) ); | |
| 252 | |
| 253 pReader->p += getVarint32(pReader->p, &i); | |
| 254 if( i==0 ){ | |
| 255 pReader->iLastPos = -1; | |
| 256 return -1; | |
| 257 } | |
| 258 pReader->iLastPos += ((int) i)-1; | |
| 259 if( iType>=DL_POSITIONS_OFFSETS ){ | |
| 260 /* Skip over offsets, ignoring them for now. */ | |
| 261 int iStart, iEnd; | |
| 262 pReader->p += getVarint32(pReader->p, &iStart); | |
| 263 pReader->p += getVarint32(pReader->p, &iEnd); | |
| 264 } | |
| 265 return pReader->iLastPos; | |
| 266 } | |
| 267 | |
| 268 /* Skip past the end of a position list. */ | |
| 269 static void skipPositionList(DocListReader *pReader){ | |
| 270 while( readPosition(pReader)!=-1 ) | |
| 271 ; | |
| 272 } | |
| 273 | |
| 274 /* Skip over a docid, including its position list if the doclist has | |
| 275 * positions. */ | |
| 276 static void skipDocument(DocListReader *pReader){ | |
| 277 readDocid(pReader); | |
| 278 if( pReader->pDoclist->iType >= DL_POSITIONS ){ | |
| 279 skipPositionList(pReader); | |
| 280 } | |
| 281 } | |
| 282 | |
| 283 static sqlite_int64 firstDocid(DocList *d){ | |
| 284 DocListReader r; | |
| 285 readerInit(&r, d); | |
| 286 return readDocid(&r); | |
| 287 } | |
| 288 | |
| 289 /* Doclist multi-tool. Pass pUpdate==NULL to delete the indicated docid; | |
| 290 * otherwise pUpdate, which must contain only the single docid [iDocid], is | |
| 291 * inserted (if not present) or updated (if already present). */ | |
| 292 static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){ | |
| 293 int modified = 0; | |
| 294 DocListReader reader; | |
| 295 char *p; | |
| 296 | |
| 297 if( pUpdate!=NULL ){ | |
| 298 assert( d->iType==pUpdate->iType); | |
| 299 assert( iDocid==firstDocid(pUpdate) ); | |
| 300 } | |
| 301 | |
| 302 readerInit(&reader, d); | |
| 303 while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){ | |
| 304 skipDocument(&reader); | |
| 305 } | |
| 306 | |
| 307 p = reader.p; | |
| 308 /* Delete if there is a matching element. */ | |
| 309 if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){ | |
| 310 skipDocument(&reader); | |
| 311 memmove(p, reader.p, docListEnd(d) - reader.p); | |
| 312 d->nData -= (reader.p - p); | |
| 313 modified = 1; | |
| 314 } | |
| 315 | |
| 316 /* Insert if indicated. */ | |
| 317 if( pUpdate!=NULL ){ | |
| 318 int iDoclist = p-d->pData; | |
| 319 docListAddEndPos(pUpdate); | |
| 320 | |
| 321 d->pData = realloc(d->pData, d->nData+pUpdate->nData); | |
| 322 p = d->pData + iDoclist; | |
| 323 | |
| 324 memmove(p+pUpdate->nData, p, docListEnd(d) - p); | |
| 325 memcpy(p, pUpdate->pData, pUpdate->nData); | |
| 326 d->nData += pUpdate->nData; | |
| 327 modified = 1; | |
| 328 } | |
| 329 | |
| 330 return modified; | |
| 331 } | |
| 332 | |
| 333 /* Split the second half of doclist d into a separate doclist d2. Returns 1 | |
| 334 * if successful, or 0 if d contains a single document and hence can't be | |
| 335 * split. */ | |
| 336 static int docListSplit(DocList *d, DocList *d2){ | |
| 337 const char *pSplitPoint = d->pData + d->nData / 2; | |
| 338 DocListReader reader; | |
| 339 | |
| 340 readerInit(&reader, d); | |
| 341 while( reader.p<pSplitPoint ){ | |
| 342 skipDocument(&reader); | |
| 343 } | |
| 344 if( readerAtEnd(&reader) ) return 0; | |
| 345 docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p); | |
| 346 d->nData = reader.p - d->pData; | |
| 347 d->pData = realloc(d->pData, d->nData); | |
| 348 return 1; | |
| 349 } | |
| 350 | |
| 351 /* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked | |
| 352 * on-disk doclist, resulting in another in-memory DocList [out]. [in] | |
| 353 * and [out] may or may not store position information according to the | |
| 354 * caller's wishes. The on-disk doclist always comes with positions. | |
| 355 * | |
| 356 * The caller must read each chunk of the on-disk doclist in succession and | |
| 357 * pass it to mergeBlock(). | |
| 358 * | |
| 359 * If [in] has positions, then the merge output contains only documents with | |
| 360 * matching positions in the two input doclists. If [in] does not have | |
| 361 * positions, then the merge output contains all documents common to the two | |
| 362 * input doclists. | |
| 363 * | |
| 364 * If [in] is NULL, then the on-disk doclist is copied to [out] directly. | |
| 365 * | |
| 366 * A merge is performed using an integer [iOffset] provided by the caller. | |
| 367 * [iOffset] is subtracted from each position in the on-disk doclist for the | |
| 368 * purpose of position comparison; this is helpful in implementing phrase | |
| 369 * searches. | |
| 370 * | |
| 371 * A DocListMerge is not yet able to propagate offsets through query | |
| 372 * processing; we should add that capability soon. | |
| 373 */ | |
| 374 typedef struct DocListMerge { | |
| 375 DocListReader in; | |
| 376 DocList *pOut; | |
| 377 int iOffset; | |
| 378 } DocListMerge; | |
| 379 | |
| 380 static void mergeInit(DocListMerge *m, | |
| 381 DocList *pIn, int iOffset, DocList *pOut){ | |
| 382 readerInit(&m->in, pIn); | |
| 383 m->pOut = pOut; | |
| 384 m->iOffset = iOffset; | |
| 385 | |
| 386 /* can't handle offsets yet */ | |
| 387 assert( pIn==NULL || pIn->iType <= DL_POSITIONS ); | |
| 388 assert( pOut->iType <= DL_POSITIONS ); | |
| 389 } | |
| 390 | |
| 391 /* A helper function for mergeBlock(), below. Merge the position lists | |
| 392 * pointed to by m->in and pBlockReader. | |
| 393 * If the merge matches, write [iDocid] to m->pOut; if m->pOut | |
| 394 * has positions then write all matching positions as well. */ | |
| 395 static void mergePosList(DocListMerge *m, sqlite_int64 iDocid, | |
| 396 DocListReader *pBlockReader){ | |
| 397 int block_pos = readPosition(pBlockReader); | |
| 398 int in_pos = readPosition(&m->in); | |
| 399 int match = 0; | |
| 400 while( block_pos!=-1 || in_pos!=-1 ){ | |
| 401 if( block_pos-m->iOffset==in_pos ){ | |
| 402 if( !match ){ | |
| 403 docListAddDocid(m->pOut, iDocid); | |
| 404 match = 1; | |
| 405 } | |
| 406 if( m->pOut->iType >= DL_POSITIONS ){ | |
| 407 docListAddPos(m->pOut, in_pos); | |
| 408 } | |
| 409 block_pos = readPosition(pBlockReader); | |
| 410 in_pos = readPosition(&m->in); | |
| 411 } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){ | |
| 412 block_pos = readPosition(pBlockReader); | |
| 413 } else { | |
| 414 in_pos = readPosition(&m->in); | |
| 415 } | |
| 416 } | |
| 417 if( m->pOut->iType >= DL_POSITIONS && match ){ | |
| 418 docListAddEndPos(m->pOut); | |
| 419 } | |
| 420 } | |
| 421 | |
| 422 /* Merge one block of an on-disk doclist into a DocListMerge. */ | |
| 423 static void mergeBlock(DocListMerge *m, DocList *pBlock){ | |
| 424 DocListReader blockReader; | |
| 425 assert( pBlock->iType >= DL_POSITIONS ); | |
| 426 readerInit(&blockReader, pBlock); | |
| 427 while( !readerAtEnd(&blockReader) ){ | |
| 428 sqlite_int64 iDocid = readDocid(&blockReader); | |
| 429 if( m->in.pDoclist!=NULL ){ | |
| 430 while( 1 ){ | |
| 431 if( readerAtEnd(&m->in) ) return; /* nothing more to merge */ | |
| 432 if( peekDocid(&m->in)>=iDocid ) break; | |
| 433 skipDocument(&m->in); | |
| 434 } | |
| 435 if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */ | |
| 436 skipPositionList(&blockReader); /* skip this docid in the block */ | |
| 437 continue; | |
| 438 } | |
| 439 readDocid(&m->in); | |
| 440 } | |
| 441 /* We have a document match. */ | |
| 442 if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){ | |
| 443 /* We don't need to do a poslist merge. */ | |
| 444 docListAddDocid(m->pOut, iDocid); | |
| 445 if( m->pOut->iType >= DL_POSITIONS ){ | |
| 446 /* Copy all positions to the output doclist. */ | |
| 447 while( 1 ){ | |
| 448 int pos = readPosition(&blockReader); | |
| 449 if( pos==-1 ) break; | |
| 450 docListAddPos(m->pOut, pos); | |
| 451 } | |
| 452 docListAddEndPos(m->pOut); | |
| 453 } else skipPositionList(&blockReader); | |
| 454 continue; | |
| 455 } | |
| 456 mergePosList(m, iDocid, &blockReader); | |
| 457 } | |
| 458 } | |
| 459 | |
| 460 static char *string_dup_n(const char *s, int n){ | |
| 461 char *str = malloc(n + 1); | |
| 462 memcpy(str, s, n); | |
| 463 str[n] = '\0'; | |
| 464 return str; | |
| 465 } | |
| 466 | |
| 467 /* Duplicate a string; the caller must free() the returned string. | |
| 468 * (We don't use strdup() since it's not part of the standard C library and | |
| 469 * may not be available everywhere.) */ | |
| 470 static char *string_dup(const char *s){ | |
| 471 return string_dup_n(s, strlen(s)); | |
| 472 } | |
| 473 | |
| 474 /* Format a string, replacing each occurrence of the % character with | |
| 475 * zName. This may be more convenient than sqlite_mprintf() | |
| 476 * when one string is used repeatedly in a format string. | |
| 477 * The caller must free() the returned string. */ | |
| 478 static char *string_format(const char *zFormat, const char *zName){ | |
| 479 const char *p; | |
| 480 size_t len = 0; | |
| 481 size_t nName = strlen(zName); | |
| 482 char *result; | |
| 483 char *r; | |
| 484 | |
| 485 /* first compute length needed */ | |
| 486 for(p = zFormat ; *p ; ++p){ | |
| 487 len += (*p=='%' ? nName : 1); | |
| 488 } | |
| 489 len += 1; /* for null terminator */ | |
| 490 | |
| 491 r = result = malloc(len); | |
| 492 for(p = zFormat; *p; ++p){ | |
| 493 if( *p=='%' ){ | |
| 494 memcpy(r, zName, nName); | |
| 495 r += nName; | |
| 496 } else { | |
| 497 *r++ = *p; | |
| 498 } | |
| 499 } | |
| 500 *r++ = '\0'; | |
| 501 assert( r == result + len ); | |
| 502 return result; | |
| 503 } | |
| 504 | |
| 505 static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){ | |
| 506 char *zCommand = string_format(zFormat, zName); | |
| 507 int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); | |
| 508 free(zCommand); | |
| 509 return rc; | |
| 510 } | |
| 511 | |
| 512 static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt, | |
| 513 const char *zFormat){ | |
| 514 char *zCommand = string_format(zFormat, zName); | |
| 515 int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); | |
| 516 free(zCommand); | |
| 517 return rc; | |
| 518 } | |
| 519 | |
| 520 /* end utility functions */ | |
| 521 | |
| 522 #define QUERY_GENERIC 0 | |
| 523 #define QUERY_FULLTEXT 1 | |
| 524 | |
| 525 #define CHUNK_MAX 1024 | |
| 526 | |
| 527 typedef enum fulltext_statement { | |
| 528 CONTENT_INSERT_STMT, | |
| 529 CONTENT_SELECT_STMT, | |
| 530 CONTENT_DELETE_STMT, | |
| 531 | |
| 532 TERM_SELECT_STMT, | |
| 533 TERM_CHUNK_SELECT_STMT, | |
| 534 TERM_INSERT_STMT, | |
| 535 TERM_UPDATE_STMT, | |
| 536 TERM_DELETE_STMT, | |
| 537 | |
| 538 MAX_STMT /* Always at end! */ | |
| 539 } fulltext_statement; | |
| 540 | |
| 541 /* These must exactly match the enum above. */ | |
| 542 /* TODO(adam): Is there some risk that a statement (in particular, | |
| 543 ** pTermSelectStmt) will be used in two cursors at once, e.g. if a | |
| 544 ** query joins a virtual table to itself? If so perhaps we should | |
| 545 ** move some of these to the cursor object. | |
| 546 */ | |
| 547 static const char *fulltext_zStatement[MAX_STMT] = { | |
| 548 /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)", | |
| 549 /* CONTENT_SELECT */ "select content from %_content where rowid = ?", | |
| 550 /* CONTENT_DELETE */ "delete from %_content where rowid = ?", | |
| 551 | |
| 552 /* TERM_SELECT */ | |
| 553 "select rowid, doclist from %_term where term = ? and first = ?", | |
| 554 /* TERM_CHUNK_SELECT */ | |
| 555 "select max(first) from %_term where term = ? and first <= ?", | |
| 556 /* TERM_INSERT */ | |
| 557 "insert into %_term (term, first, doclist) values (?, ?, ?)", | |
| 558 /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?", | |
| 559 /* TERM_DELETE */ "delete from %_term where rowid = ?", | |
| 560 }; | |
| 561 | |
| 562 typedef struct fulltext_vtab { | |
| 563 sqlite3_vtab base; | |
| 564 sqlite3 *db; | |
| 565 const char *zName; /* virtual table name */ | |
| 566 sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ | |
| 567 | |
| 568 /* Precompiled statements which we keep as long as the table is | |
| 569 ** open. | |
| 570 */ | |
| 571 sqlite3_stmt *pFulltextStatements[MAX_STMT]; | |
| 572 } fulltext_vtab; | |
| 573 | |
| 574 typedef struct fulltext_cursor { | |
| 575 sqlite3_vtab_cursor base; | |
| 576 int iCursorType; /* QUERY_GENERIC or QUERY_FULLTEXT */ | |
| 577 | |
| 578 sqlite3_stmt *pStmt; | |
| 579 | |
| 580 int eof; | |
| 581 | |
| 582 /* The following is used only when iCursorType == QUERY_FULLTEXT. */ | |
| 583 DocListReader result; | |
| 584 } fulltext_cursor; | |
| 585 | |
| 586 static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ | |
| 587 return (fulltext_vtab *) c->base.pVtab; | |
| 588 } | |
| 589 | |
| 590 static sqlite3_module fulltextModule; /* forward declaration */ | |
| 591 | |
| 592 /* Puts a freshly-prepared statement determined by iStmt in *ppStmt. | |
| 593 ** If the indicated statement has never been prepared, it is prepared | |
| 594 ** and cached, otherwise the cached version is reset. | |
| 595 */ | |
| 596 static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, | |
| 597 sqlite3_stmt **ppStmt){ | |
| 598 assert( iStmt<MAX_STMT ); | |
| 599 if( v->pFulltextStatements[iStmt]==NULL ){ | |
| 600 int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt], | |
| 601 fulltext_zStatement[iStmt]); | |
| 602 if( rc!=SQLITE_OK ) return rc; | |
| 603 } else { | |
| 604 int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); | |
| 605 if( rc!=SQLITE_OK ) return rc; | |
| 606 } | |
| 607 | |
| 608 *ppStmt = v->pFulltextStatements[iStmt]; | |
| 609 return SQLITE_OK; | |
| 610 } | |
| 611 | |
| 612 /* Step the indicated statement, handling errors SQLITE_BUSY (by | |
| 613 ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring | |
| 614 ** bindings to the new statement). | |
| 615 ** TODO(adam): We should extend this function so that it can work with | |
| 616 ** statements declared locally, not only globally cached statements. | |
| 617 */ | |
| 618 static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, | |
| 619 sqlite3_stmt **ppStmt){ | |
| 620 int rc; | |
| 621 sqlite3_stmt *s = *ppStmt; | |
| 622 assert( iStmt<MAX_STMT ); | |
| 623 assert( s==v->pFulltextStatements[iStmt] ); | |
| 624 | |
| 625 while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ | |
| 626 sqlite3_stmt *pNewStmt; | |
| 627 | |
| 628 if( rc==SQLITE_BUSY ) continue; | |
| 629 if( rc!=SQLITE_ERROR ) return rc; | |
| 630 | |
| 631 rc = sqlite3_reset(s); | |
| 632 if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR; | |
| 633 | |
| 634 v->pFulltextStatements[iStmt] = NULL; /* Still in s */ | |
| 635 rc = sql_get_statement(v, iStmt, &pNewStmt); | |
| 636 if( rc!=SQLITE_OK ) goto err; | |
| 637 *ppStmt = pNewStmt; | |
| 638 | |
| 639 rc = sqlite3_transfer_bindings(s, pNewStmt); | |
| 640 if( rc!=SQLITE_OK ) goto err; | |
| 641 | |
| 642 rc = sqlite3_finalize(s); | |
| 643 if( rc!=SQLITE_OK ) return rc; | |
| 644 s = pNewStmt; | |
| 645 } | |
| 646 return rc; | |
| 647 | |
| 648 err: | |
| 649 sqlite3_finalize(s); | |
| 650 return rc; | |
| 651 } | |
| 652 | |
| 653 /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. | |
| 654 ** Useful for statements like UPDATE, where we expect no results. | |
| 655 */ | |
| 656 static int sql_single_step_statement(fulltext_vtab *v, | |
| 657 fulltext_statement iStmt, | |
| 658 sqlite3_stmt **ppStmt){ | |
| 659 int rc = sql_step_statement(v, iStmt, ppStmt); | |
| 660 return (rc==SQLITE_DONE) ? SQLITE_OK : rc; | |
| 661 } | |
| 662 | |
| 663 /* insert into %_content (rowid, content) values ([rowid], [zContent]) */ | |
| 664 static int content_insert(fulltext_vtab *v, sqlite3_value *rowid, | |
| 665 const char *zContent, int nContent){ | |
| 666 sqlite3_stmt *s; | |
| 667 int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); | |
| 668 if( rc!=SQLITE_OK ) return rc; | |
| 669 | |
| 670 rc = sqlite3_bind_value(s, 1, rowid); | |
| 671 if( rc!=SQLITE_OK ) return rc; | |
| 672 | |
| 673 rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC); | |
| 674 if( rc!=SQLITE_OK ) return rc; | |
| 675 | |
| 676 return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s); | |
| 677 } | |
| 678 | |
| 679 /* select content from %_content where rowid = [iRow] | |
| 680 * The caller must delete the returned string. */ | |
| 681 static int content_select(fulltext_vtab *v, sqlite_int64 iRow, | |
| 682 char **pzContent){ | |
| 683 sqlite3_stmt *s; | |
| 684 int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); | |
| 685 if( rc!=SQLITE_OK ) return rc; | |
| 686 | |
| 687 rc = sqlite3_bind_int64(s, 1, iRow); | |
| 688 if( rc!=SQLITE_OK ) return rc; | |
| 689 | |
| 690 rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s); | |
| 691 if( rc!=SQLITE_ROW ) return rc; | |
| 692 | |
| 693 *pzContent = string_dup((const char *)sqlite3_column_text(s, 0)); | |
| 694 | |
| 695 /* We expect only one row. We must execute another sqlite3_step() | |
| 696 * to complete the iteration; otherwise the table will remain locked. */ | |
| 697 rc = sqlite3_step(s); | |
| 698 if( rc==SQLITE_DONE ) return SQLITE_OK; | |
| 699 | |
| 700 free(*pzContent); | |
| 701 return rc; | |
| 702 } | |
| 703 | |
| 704 /* delete from %_content where rowid = [iRow ] */ | |
| 705 static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ | |
| 706 sqlite3_stmt *s; | |
| 707 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); | |
| 708 if( rc!=SQLITE_OK ) return rc; | |
| 709 | |
| 710 rc = sqlite3_bind_int64(s, 1, iRow); | |
| 711 if( rc!=SQLITE_OK ) return rc; | |
| 712 | |
| 713 return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); | |
| 714 } | |
| 715 | |
| 716 /* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst] | |
| 717 * If found, returns SQLITE_OK; the caller must free the returned doclist. | |
| 718 * If no rows found, returns SQLITE_ERROR. */ | |
| 719 static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm, | |
| 720 sqlite_int64 iFirst, | |
| 721 sqlite_int64 *rowid, | |
| 722 DocList *out){ | |
| 723 sqlite3_stmt *s; | |
| 724 int rc = sql_get_statement(v, TERM_SELECT_STMT, &s); | |
| 725 if( rc!=SQLITE_OK ) return rc; | |
| 726 | |
| 727 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT); | |
| 728 if( rc!=SQLITE_OK ) return rc; | |
| 729 | |
| 730 rc = sqlite3_bind_int64(s, 2, iFirst); | |
| 731 if( rc!=SQLITE_OK ) return rc; | |
| 732 | |
| 733 rc = sql_step_statement(v, TERM_SELECT_STMT, &s); | |
| 734 if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; | |
| 735 | |
| 736 *rowid = sqlite3_column_int64(s, 0); | |
| 737 docListInit(out, DL_POSITIONS_OFFSETS, | |
| 738 sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1)); | |
| 739 | |
| 740 /* We expect only one row. We must execute another sqlite3_step() | |
| 741 * to complete the iteration; otherwise the table will remain locked. */ | |
| 742 rc = sqlite3_step(s); | |
| 743 return rc==SQLITE_DONE ? SQLITE_OK : rc; | |
| 744 } | |
| 745 | |
| 746 /* select max(first) from %_term where term = [zTerm] and first <= [iFirst] | |
| 747 * If found, returns SQLITE_ROW and result in *piResult; if the query returns | |
| 748 * NULL (meaning no row found) returns SQLITE_DONE. | |
| 749 */ | |
| 750 static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm, | |
| 751 sqlite_int64 iFirst, sqlite_int64 *piResult){ | |
| 752 sqlite3_stmt *s; | |
| 753 int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s); | |
| 754 if( rc!=SQLITE_OK ) return rc; | |
| 755 | |
| 756 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); | |
| 757 if( rc!=SQLITE_OK ) return rc; | |
| 758 | |
| 759 rc = sqlite3_bind_int64(s, 2, iFirst); | |
| 760 if( rc!=SQLITE_OK ) return rc; | |
| 761 | |
| 762 rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s); | |
| 763 if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; | |
| 764 | |
| 765 switch( sqlite3_column_type(s, 0) ){ | |
| 766 case SQLITE_NULL: | |
| 767 rc = SQLITE_DONE; | |
| 768 break; | |
| 769 case SQLITE_INTEGER: | |
| 770 *piResult = sqlite3_column_int64(s, 0); | |
| 771 break; | |
| 772 default: | |
| 773 return SQLITE_ERROR; | |
| 774 } | |
| 775 /* We expect only one row. We must execute another sqlite3_step() | |
| 776 * to complete the iteration; otherwise the table will remain locked. */ | |
| 777 if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR; | |
| 778 return rc; | |
| 779 } | |
| 780 | |
| 781 /* insert into %_term (term, first, doclist) | |
| 782 values ([zTerm], [iFirst], [doclist]) */ | |
| 783 static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm, | |
| 784 sqlite_int64 iFirst, DocList *doclist){ | |
| 785 sqlite3_stmt *s; | |
| 786 int rc = sql_get_statement(v, TERM_INSERT_STMT, &s); | |
| 787 if( rc!=SQLITE_OK ) return rc; | |
| 788 | |
| 789 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); | |
| 790 if( rc!=SQLITE_OK ) return rc; | |
| 791 | |
| 792 rc = sqlite3_bind_int64(s, 2, iFirst); | |
| 793 if( rc!=SQLITE_OK ) return rc; | |
| 794 | |
| 795 rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC); | |
| 796 if( rc!=SQLITE_OK ) return rc; | |
| 797 | |
| 798 return sql_single_step_statement(v, TERM_INSERT_STMT, &s); | |
| 799 } | |
| 800 | |
| 801 /* update %_term set doclist = [doclist] where rowid = [rowid] */ | |
| 802 static int term_update(fulltext_vtab *v, sqlite_int64 rowid, | |
| 803 DocList *doclist){ | |
| 804 sqlite3_stmt *s; | |
| 805 int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); | |
| 806 if( rc!=SQLITE_OK ) return rc; | |
| 807 | |
| 808 rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, | |
| 809 SQLITE_STATIC); | |
| 810 if( rc!=SQLITE_OK ) return rc; | |
| 811 | |
| 812 rc = sqlite3_bind_int64(s, 2, rowid); | |
| 813 if( rc!=SQLITE_OK ) return rc; | |
| 814 | |
| 815 return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); | |
| 816 } | |
| 817 | |
| 818 static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ | |
| 819 sqlite3_stmt *s; | |
| 820 int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); | |
| 821 if( rc!=SQLITE_OK ) return rc; | |
| 822 | |
| 823 rc = sqlite3_bind_int64(s, 1, rowid); | |
| 824 if( rc!=SQLITE_OK ) return rc; | |
| 825 | |
| 826 return sql_single_step_statement(v, TERM_DELETE_STMT, &s); | |
| 827 } | |
| 828 | |
| 829 static void fulltext_vtab_destroy(fulltext_vtab *v){ | |
| 830 int iStmt; | |
| 831 | |
| 832 for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){ | |
| 833 if( v->pFulltextStatements[iStmt]!=NULL ){ | |
| 834 sqlite3_finalize(v->pFulltextStatements[iStmt]); | |
| 835 v->pFulltextStatements[iStmt] = NULL; | |
| 836 } | |
| 837 } | |
| 838 | |
| 839 if( v->pTokenizer!=NULL ){ | |
| 840 v->pTokenizer->pModule->xDestroy(v->pTokenizer); | |
| 841 v->pTokenizer = NULL; | |
| 842 } | |
| 843 | |
| 844 free((void *) v->zName); | |
| 845 free(v); | |
| 846 } | |
| 847 | |
| 848 /* Current interface: | |
| 849 ** argv[0] - module name | |
| 850 ** argv[1] - database name | |
| 851 ** argv[2] - table name | |
| 852 ** argv[3] - tokenizer name (optional, a sensible default is provided) | |
| 853 ** argv[4..] - passed to tokenizer (optional based on tokenizer) | |
| 854 **/ | |
| 855 static int fulltextConnect( | |
| 856 sqlite3 *db, | |
| 857 void *pAux, | |
| 858 int argc, | |
| 859 const char * const *argv, | |
| 860 sqlite3_vtab **ppVTab, | |
| 861 char **pzErr | |
| 862 ){ | |
| 863 int rc; | |
| 864 fulltext_vtab *v; | |
| 865 sqlite3_tokenizer_module *m = NULL; | |
| 866 | |
| 867 assert( argc>=3 ); | |
| 868 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); | |
| 869 /* sqlite will initialize v->base */ | |
| 870 v->db = db; | |
| 871 v->zName = string_dup(argv[2]); | |
| 872 v->pTokenizer = NULL; | |
| 873 | |
| 874 if( argc==3 ){ | |
| 875 get_simple_tokenizer_module(&m); | |
| 876 } else { | |
| 877 /* TODO(shess) For now, add new tokenizers as else if clauses. */ | |
| 878 if( !strcmp(argv[3], "simple") ){ | |
| 879 get_simple_tokenizer_module(&m); | |
| 880 } else { | |
| 881 assert( "unrecognized tokenizer"==NULL ); | |
| 882 } | |
| 883 } | |
| 884 | |
| 885 /* TODO(shess) Since tokenization impacts the index, the parameters | |
| 886 ** to the tokenizer need to be identical when a persistent virtual | |
| 887 ** table is re-created. One solution would be a meta-table to track | |
| 888 ** such information in the database. Then we could verify that the | |
| 889 ** information is identical on subsequent creates. | |
| 890 */ | |
| 891 /* TODO(shess) Why isn't argv already (const char **)? */ | |
| 892 rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer); | |
| 893 if( rc!=SQLITE_OK ) return rc; | |
| 894 v->pTokenizer->pModule = m; | |
| 895 | |
| 896 /* TODO: verify the existence of backing tables foo_content, foo_term */ | |
| 897 | |
| 898 rc = sqlite3_declare_vtab(db, "create table x(content text)"); | |
| 899 if( rc!=SQLITE_OK ) return rc; | |
| 900 | |
| 901 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); | |
| 902 | |
| 903 *ppVTab = &v->base; | |
| 904 return SQLITE_OK; | |
| 905 } | |
| 906 | |
| 907 static int fulltextCreate( | |
| 908 sqlite3 *db, | |
| 909 void *pAux, | |
| 910 int argc, | |
| 911 const char * const *argv, | |
| 912 sqlite3_vtab **ppVTab, | |
| 913 char **pzErr | |
| 914 ){ | |
| 915 int rc; | |
| 916 assert( argc>=3 ); | |
| 917 | |
| 918 /* The %_content table holds the text of each full-text item, with | |
| 919 ** the rowid used as the docid. | |
| 920 ** | |
| 921 ** The %_term table maps each term to a document list blob | |
| 922 ** containing elements sorted by ascending docid, each element | |
| 923 ** encoded as: | |
| 924 ** | |
| 925 ** docid varint-encoded | |
| 926 ** token count varint-encoded | |
| 927 ** "count" token elements (poslist): | |
| 928 ** position varint-encoded as delta from previous position | |
| 929 ** start offset varint-encoded as delta from previous start offset | |
| 930 ** end offset varint-encoded as delta from start offset | |
| 931 ** | |
| 932 ** Additionally, doclist blobs can be chunked into multiple rows, | |
| 933 ** using "first" to order the blobs. "first" is simply the first | |
| 934 ** docid in the blob. | |
| 935 */ | |
| 936 /* | |
| 937 ** NOTE(shess) That last sentence is incorrect in the face of | |
| 938 ** deletion, which can leave a doclist that doesn't contain the | |
| 939 ** first from that row. I _believe_ this does not matter to the | |
| 940 ** operation of the system, but it might be reasonable to update | |
| 941 ** appropriately in case this assumption becomes more important. | |
| 942 */ | |
| 943 rc = sql_exec(db, argv[2], | |
| 944 "create table %_content(content text);" | |
| 945 "create table %_term(term text, first integer, doclist blob);" | |
| 946 "create index %_index on %_term(term, first)"); | |
| 947 if( rc!=SQLITE_OK ) return rc; | |
| 948 | |
| 949 return fulltextConnect(db, pAux, argc, argv, ppVTab, pzErr); | |
| 950 } | |
| 951 | |
| 952 /* Decide how to handle an SQL query. | |
| 953 * At the moment, MATCH queries can include implicit boolean ANDs; we | |
| 954 * haven't implemented phrase searches or OR yet. */ | |
| 955 static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ | |
| 956 int i; | |
| 957 | |
| 958 for(i=0; i<pInfo->nConstraint; ++i){ | |
| 959 const struct sqlite3_index_constraint *pConstraint; | |
| 960 pConstraint = &pInfo->aConstraint[i]; | |
| 961 if( pConstraint->iColumn==0 && | |
| 962 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH && | |
| 963 pConstraint->usable ){ /* a full-text search */ | |
| 964 pInfo->aConstraintUsage[i].argvIndex = 1; | |
| 965 pInfo->aConstraintUsage[i].omit = 1; | |
| 966 pInfo->idxNum = QUERY_FULLTEXT; | |
| 967 pInfo->estimatedCost = 1.0; /* an arbitrary value for now */ | |
| 968 return SQLITE_OK; | |
| 969 } | |
| 970 } | |
| 971 pInfo->idxNum = QUERY_GENERIC; | |
| 972 return SQLITE_OK; | |
| 973 } | |
| 974 | |
| 975 static int fulltextDisconnect(sqlite3_vtab *pVTab){ | |
| 976 fulltext_vtab_destroy((fulltext_vtab *)pVTab); | |
| 977 return SQLITE_OK; | |
| 978 } | |
| 979 | |
| 980 static int fulltextDestroy(sqlite3_vtab *pVTab){ | |
| 981 fulltext_vtab *v = (fulltext_vtab *)pVTab; | |
| 982 | |
| 983 int rc = sql_exec(v->db, v->zName, | |
| 984 "drop table %_content; drop table %_term"); | |
| 985 if( rc!=SQLITE_OK ) return rc; | |
| 986 | |
| 987 fulltext_vtab_destroy((fulltext_vtab *)pVTab); | |
| 988 return SQLITE_OK; | |
| 989 } | |
| 990 | |
| 991 static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ | |
| 992 fulltext_cursor *c; | |
| 993 | |
| 994 c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); | |
| 995 /* sqlite will initialize c->base */ | |
| 996 *ppCursor = &c->base; | |
| 997 | |
| 998 return SQLITE_OK; | |
| 999 } | |
| 1000 | |
| 1001 static int fulltextClose(sqlite3_vtab_cursor *pCursor){ | |
| 1002 fulltext_cursor *c = (fulltext_cursor *) pCursor; | |
| 1003 sqlite3_finalize(c->pStmt); | |
| 1004 if( c->result.pDoclist!=NULL ){ | |
| 1005 docListDelete(c->result.pDoclist); | |
| 1006 } | |
| 1007 free(c); | |
| 1008 return SQLITE_OK; | |
| 1009 } | |
| 1010 | |
| 1011 static int fulltextNext(sqlite3_vtab_cursor *pCursor){ | |
| 1012 fulltext_cursor *c = (fulltext_cursor *) pCursor; | |
| 1013 sqlite_int64 iDocid; | |
| 1014 int rc; | |
| 1015 | |
| 1016 switch( c->iCursorType ){ | |
| 1017 case QUERY_GENERIC: | |
| 1018 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ | |
| 1019 rc = sqlite3_step(c->pStmt); | |
| 1020 switch( rc ){ | |
| 1021 case SQLITE_ROW: | |
| 1022 c->eof = 0; | |
| 1023 return SQLITE_OK; | |
| 1024 case SQLITE_DONE: | |
| 1025 c->eof = 1; | |
| 1026 return SQLITE_OK; | |
| 1027 default: | |
| 1028 c->eof = 1; | |
| 1029 return rc; | |
| 1030 } | |
| 1031 case QUERY_FULLTEXT: | |
| 1032 rc = sqlite3_reset(c->pStmt); | |
| 1033 if( rc!=SQLITE_OK ) return rc; | |
| 1034 | |
| 1035 if( readerAtEnd(&c->result)){ | |
| 1036 c->eof = 1; | |
| 1037 return SQLITE_OK; | |
| 1038 } | |
| 1039 iDocid = readDocid(&c->result); | |
| 1040 rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); | |
| 1041 if( rc!=SQLITE_OK ) return rc; | |
| 1042 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ | |
| 1043 rc = sqlite3_step(c->pStmt); | |
| 1044 if( rc==SQLITE_ROW ){ /* the case we expect */ | |
| 1045 c->eof = 0; | |
| 1046 return SQLITE_OK; | |
| 1047 } | |
| 1048 /* an error occurred; abort */ | |
| 1049 return rc==SQLITE_DONE ? SQLITE_ERROR : rc; | |
| 1050 default: | |
| 1051 assert( 0 ); | |
| 1052 return SQLITE_ERROR; /* not reached */ | |
| 1053 } | |
| 1054 } | |
| 1055 | |
| 1056 static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm, | |
| 1057 sqlite3_stmt **ppStmt){ | |
| 1058 int rc; | |
| 1059 if( *ppStmt ){ | |
| 1060 rc = sqlite3_reset(*ppStmt); | |
| 1061 } else { | |
| 1062 rc = sql_prepare(v->db, v->zName, ppStmt, | |
| 1063 "select doclist from %_term where term = ? order by first"); | |
| 1064 } | |
| 1065 if( rc!=SQLITE_OK ) return rc; | |
| 1066 | |
| 1067 rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT); | |
| 1068 if( rc!=SQLITE_OK ) return rc; | |
| 1069 | |
| 1070 return sqlite3_step(*ppStmt); /* TODO(adamd): handle schema error */ | |
| 1071 } | |
| 1072 | |
| 1073 /* Read the posting list for [zTerm]; AND it with the doclist [in] to | |
| 1074 * produce the doclist [out], using the given offset [iOffset] for phrase | |
| 1075 * matching. | |
| 1076 * (*pSelect) is used to hold an SQLite statement used inside this function; | |
| 1077 * the caller should initialize *pSelect to NULL before the first call. | |
| 1078 */ | |
| 1079 static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect, | |
| 1080 const char *zTerm, | |
| 1081 DocList *pIn, int iOffset, DocList *out){ | |
| 1082 int rc; | |
| 1083 DocListMerge merge; | |
| 1084 | |
| 1085 if( pIn!=NULL && !pIn->nData ){ | |
| 1086 /* If [pIn] is already empty, there's no point in reading the | |
| 1087 * posting list to AND it in; return immediately. */ | |
| 1088 return SQLITE_OK; | |
| 1089 } | |
| 1090 | |
| 1091 rc = term_select_doclist(v, zTerm, -1, pSelect); | |
| 1092 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc; | |
| 1093 | |
| 1094 mergeInit(&merge, pIn, iOffset, out); | |
| 1095 while( rc==SQLITE_ROW ){ | |
| 1096 DocList block; | |
| 1097 docListInit(&block, DL_POSITIONS_OFFSETS, | |
| 1098 sqlite3_column_blob(*pSelect, 0), | |
| 1099 sqlite3_column_bytes(*pSelect, 0)); | |
| 1100 mergeBlock(&merge, &block); | |
| 1101 docListDestroy(&block); | |
| 1102 | |
| 1103 rc = sqlite3_step(*pSelect); | |
| 1104 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){ | |
| 1105 return rc; | |
| 1106 } | |
| 1107 } | |
| 1108 | |
| 1109 return SQLITE_OK; | |
| 1110 } | |
| 1111 | |
| 1112 typedef struct QueryTerm { | |
| 1113 int is_phrase; /* true if this term begins a new phrase */ | |
| 1114 const char *zTerm; | |
| 1115 } QueryTerm; | |
| 1116 | |
| 1117 /* A parsed query. | |
| 1118 * | |
| 1119 * As an example, parsing the query ["four score" years "new nation"] will | |
| 1120 * yield a Query with 5 terms: | |
| 1121 * "four", is_phrase = 1 | |
| 1122 * "score", is_phrase = 0 | |
| 1123 * "years", is_phrase = 1 | |
| 1124 * "new", is_phrase = 1 | |
| 1125 * "nation", is_phrase = 0 | |
| 1126 */ | |
| 1127 typedef struct Query { | |
| 1128 int nTerms; | |
| 1129 QueryTerm *pTerm; | |
| 1130 } Query; | |
| 1131 | |
| 1132 static void query_add(Query *q, int is_phrase, const char *zTerm){ | |
| 1133 QueryTerm *t; | |
| 1134 ++q->nTerms; | |
| 1135 q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0])); | |
| 1136 t = &q->pTerm[q->nTerms - 1]; | |
| 1137 t->is_phrase = is_phrase; | |
| 1138 t->zTerm = zTerm; | |
| 1139 } | |
| 1140 | |
| 1141 static void query_free(Query *q){ | |
| 1142 int i; | |
| 1143 for(i = 0; i < q->nTerms; ++i){ | |
| 1144 free((void *) q->pTerm[i].zTerm); | |
| 1145 } | |
| 1146 free(q->pTerm); | |
| 1147 } | |
| 1148 | |
| 1149 static int tokenize_segment(sqlite3_tokenizer *pTokenizer, | |
| 1150 const char *zQuery, int in_phrase, | |
| 1151 Query *pQuery){ | |
| 1152 sqlite3_tokenizer_module *pModule = pTokenizer->pModule; | |
| 1153 sqlite3_tokenizer_cursor *pCursor; | |
| 1154 int is_first = 1; | |
| 1155 | |
| 1156 int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor); | |
| 1157 if( rc!=SQLITE_OK ) return rc; | |
| 1158 pCursor->pTokenizer = pTokenizer; | |
| 1159 | |
| 1160 while( 1 ){ | |
| 1161 const char *zToken; | |
| 1162 int nToken, iStartOffset, iEndOffset, dummy_pos; | |
| 1163 | |
| 1164 rc = pModule->xNext(pCursor, | |
| 1165 &zToken, &nToken, | |
| 1166 &iStartOffset, &iEndOffset, | |
| 1167 &dummy_pos); | |
| 1168 if( rc!=SQLITE_OK ) break; | |
| 1169 query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken)); | |
| 1170 is_first = 0; | |
| 1171 } | |
| 1172 | |
| 1173 return pModule->xClose(pCursor); | |
| 1174 } | |
| 1175 | |
| 1176 /* Parse a query string, yielding a Query object. */ | |
| 1177 static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){ | |
| 1178 char *zQuery1 = string_dup(zQuery); | |
| 1179 int in_phrase = 0; | |
| 1180 char *s = zQuery1; | |
| 1181 pQuery->nTerms = 0; | |
| 1182 pQuery->pTerm = NULL; | |
| 1183 | |
| 1184 while( *s ){ | |
| 1185 char *t = s; | |
| 1186 while( *t ){ | |
| 1187 if( *t=='"' ){ | |
| 1188 *t++ = '\0'; | |
| 1189 break; | |
| 1190 } | |
| 1191 ++t; | |
| 1192 } | |
| 1193 if( *s ){ | |
| 1194 tokenize_segment(v->pTokenizer, s, in_phrase, pQuery); | |
| 1195 } | |
| 1196 s = t; | |
| 1197 in_phrase = !in_phrase; | |
| 1198 } | |
| 1199 | |
| 1200 free(zQuery1); | |
| 1201 return SQLITE_OK; | |
| 1202 } | |
| 1203 | |
| 1204 /* Perform a full-text query; return a list of documents in [pResult]. */ | |
| 1205 static int fulltext_query(fulltext_vtab *v, const char *zQuery, | |
| 1206 DocList **pResult){ | |
| 1207 Query q; | |
| 1208 int phrase_start = -1; | |
| 1209 int i; | |
| 1210 sqlite3_stmt *pSelect = NULL; | |
| 1211 DocList *d = NULL; | |
| 1212 | |
| 1213 int rc = parse_query(v, zQuery, &q); | |
| 1214 if( rc!=SQLITE_OK ) return rc; | |
| 1215 | |
| 1216 /* Merge terms. */ | |
| 1217 for(i = 0 ; i < q.nTerms ; ++i){ | |
| 1218 /* In each merge step, we need to generate positions whenever we're | |
| 1219 * processing a phrase which hasn't ended yet. */ | |
| 1220 int need_positions = i<q.nTerms-1 && !q.pTerm[i+1].is_phrase; | |
| 1221 DocList *next = docListNew(need_positions ? DL_POSITIONS : DL_DOCIDS); | |
| 1222 if( q.pTerm[i].is_phrase ){ | |
| 1223 phrase_start = i; | |
| 1224 } | |
| 1225 rc = query_merge(v, &pSelect, q.pTerm[i].zTerm, d, i - phrase_start, next); | |
| 1226 if( rc!=SQLITE_OK ) break; | |
| 1227 if( d!=NULL ){ | |
| 1228 docListDelete(d); | |
| 1229 } | |
| 1230 d = next; | |
| 1231 } | |
| 1232 | |
| 1233 sqlite3_finalize(pSelect); | |
| 1234 query_free(&q); | |
| 1235 *pResult = d; | |
| 1236 return rc; | |
| 1237 } | |
| 1238 | |
| 1239 static int fulltextFilter(sqlite3_vtab_cursor *pCursor, | |
| 1240 int idxNum, const char *idxStr, | |
| 1241 int argc, sqlite3_value **argv){ | |
| 1242 fulltext_cursor *c = (fulltext_cursor *) pCursor; | |
| 1243 fulltext_vtab *v = cursor_vtab(c); | |
| 1244 int rc; | |
| 1245 const char *zStatement; | |
| 1246 | |
| 1247 c->iCursorType = idxNum; | |
| 1248 switch( idxNum ){ | |
| 1249 case QUERY_GENERIC: | |
| 1250 zStatement = "select rowid, content from %_content"; | |
| 1251 break; | |
| 1252 | |
| 1253 case QUERY_FULLTEXT: /* full-text search */ | |
| 1254 { | |
| 1255 const char *zQuery = (const char *)sqlite3_value_text(argv[0]); | |
| 1256 DocList *pResult; | |
| 1257 assert( argc==1 ); | |
| 1258 rc = fulltext_query(v, zQuery, &pResult); | |
| 1259 if( rc!=SQLITE_OK ) return rc; | |
| 1260 readerInit(&c->result, pResult); | |
| 1261 zStatement = "select rowid, content from %_content where rowid = ?"; | |
| 1262 break; | |
| 1263 } | |
| 1264 | |
| 1265 default: | |
| 1266 assert( 0 ); | |
| 1267 } | |
| 1268 | |
| 1269 rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement); | |
| 1270 if( rc!=SQLITE_OK ) return rc; | |
| 1271 | |
| 1272 return fulltextNext(pCursor); | |
| 1273 } | |
| 1274 | |
| 1275 static int fulltextEof(sqlite3_vtab_cursor *pCursor){ | |
| 1276 fulltext_cursor *c = (fulltext_cursor *) pCursor; | |
| 1277 return c->eof; | |
| 1278 } | |
| 1279 | |
| 1280 static int fulltextColumn(sqlite3_vtab_cursor *pCursor, | |
| 1281 sqlite3_context *pContext, int idxCol){ | |
| 1282 fulltext_cursor *c = (fulltext_cursor *) pCursor; | |
| 1283 const char *s; | |
| 1284 | |
| 1285 assert( idxCol==0 ); | |
| 1286 s = (const char *) sqlite3_column_text(c->pStmt, 1); | |
| 1287 sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT); | |
| 1288 | |
| 1289 return SQLITE_OK; | |
| 1290 } | |
| 1291 | |
| 1292 static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ | |
| 1293 fulltext_cursor *c = (fulltext_cursor *) pCursor; | |
| 1294 | |
| 1295 *pRowid = sqlite3_column_int64(c->pStmt, 0); | |
| 1296 return SQLITE_OK; | |
| 1297 } | |
| 1298 | |
| 1299 /* Build a hash table containing all terms in zText. */ | |
| 1300 static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer, | |
| 1301 const char *zText, sqlite_int64 iDocid){ | |
| 1302 sqlite3_tokenizer_cursor *pCursor; | |
| 1303 const char *pToken; | |
| 1304 int nTokenBytes; | |
| 1305 int iStartOffset, iEndOffset, iPosition; | |
| 1306 | |
| 1307 int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); | |
| 1308 if( rc!=SQLITE_OK ) return rc; | |
| 1309 | |
| 1310 pCursor->pTokenizer = pTokenizer; | |
| 1311 HashInit(terms, HASH_STRING, 1); | |
| 1312 while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, | |
| 1313 &pToken, &nTokenBytes, | |
| 1314 &iStartOffset, &iEndOffset, | |
| 1315 &iPosition) ){ | |
| 1316 DocList *p; | |
| 1317 | |
| 1318 /* Positions can't be negative; we use -1 as a terminator internally. */ | |
| 1319 if( iPosition<0 ) { | |
| 1320 rc = SQLITE_ERROR; | |
| 1321 goto err; | |
| 1322 } | |
| 1323 | |
| 1324 p = HashFind(terms, pToken, nTokenBytes); | |
| 1325 if( p==NULL ){ | |
| 1326 p = docListNew(DL_POSITIONS_OFFSETS); | |
| 1327 docListAddDocid(p, iDocid); | |
| 1328 HashInsert(terms, pToken, nTokenBytes, p); | |
| 1329 } | |
| 1330 docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset); | |
| 1331 } | |
| 1332 | |
| 1333 err: | |
| 1334 /* TODO(shess) Check return? Should this be able to cause errors at | |
| 1335 ** this point? Actually, same question about sqlite3_finalize(), | |
| 1336 ** though one could argue that failure there means that the data is | |
| 1337 ** not durable. *ponder* | |
| 1338 */ | |
| 1339 pTokenizer->pModule->xClose(pCursor); | |
| 1340 return rc; | |
| 1341 } | |
| 1342 /* Update the %_terms table to map the term [zTerm] to the given rowid. */ | |
| 1343 static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm, | |
| 1344 sqlite_int64 iDocid, DocList *p){ | |
| 1345 sqlite_int64 iFirst; | |
| 1346 sqlite_int64 iIndexRow; | |
| 1347 DocList doclist; | |
| 1348 | |
| 1349 int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); | |
| 1350 if( rc==SQLITE_DONE ){ | |
| 1351 docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0); | |
| 1352 if( docListUpdate(&doclist, iDocid, p) ){ | |
| 1353 rc = term_insert(v, zTerm, nTerm, iDocid, &doclist); | |
| 1354 docListDestroy(&doclist); | |
| 1355 return rc; | |
| 1356 } | |
| 1357 return SQLITE_OK; | |
| 1358 } | |
| 1359 if( rc!=SQLITE_ROW ) return SQLITE_ERROR; | |
| 1360 | |
| 1361 /* This word is in the index; add this document ID to its blob. */ | |
| 1362 | |
| 1363 rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); | |
| 1364 if( rc!=SQLITE_OK ) return rc; | |
| 1365 | |
| 1366 if( docListUpdate(&doclist, iDocid, p) ){ | |
| 1367 /* If the blob is too big, split it in half. */ | |
| 1368 if( doclist.nData>CHUNK_MAX ){ | |
| 1369 DocList half; | |
| 1370 if( docListSplit(&doclist, &half) ){ | |
| 1371 rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half); | |
| 1372 docListDestroy(&half); | |
| 1373 if( rc!=SQLITE_OK ) goto err; | |
| 1374 } | |
| 1375 } | |
| 1376 rc = term_update(v, iIndexRow, &doclist); | |
| 1377 } | |
| 1378 | |
| 1379 err: | |
| 1380 docListDestroy(&doclist); | |
| 1381 return rc; | |
| 1382 } | |
| 1383 | |
| 1384 /* Insert a row into the full-text index; set *piRowid to be the ID of the | |
| 1385 * new row. */ | |
| 1386 static int index_insert(fulltext_vtab *v, | |
| 1387 sqlite3_value *pRequestRowid, const char *zText, | |
| 1388 sqlite_int64 *piRowid){ | |
| 1389 Hash terms; /* maps term string -> PosList */ | |
| 1390 HashElem *e; | |
| 1391 | |
| 1392 int rc = content_insert(v, pRequestRowid, zText, -1); | |
| 1393 if( rc!=SQLITE_OK ) return rc; | |
| 1394 *piRowid = sqlite3_last_insert_rowid(v->db); | |
| 1395 | |
| 1396 if( !zText ) return SQLITE_OK; /* nothing to index */ | |
| 1397 | |
| 1398 rc = build_terms(&terms, v->pTokenizer, zText, *piRowid); | |
| 1399 if( rc!=SQLITE_OK ) return rc; | |
| 1400 | |
| 1401 for(e=HashFirst(&terms); e; e=HashNext(e)){ | |
| 1402 DocList *p = HashData(e); | |
| 1403 rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p); | |
| 1404 if( rc!=SQLITE_OK ) break; | |
| 1405 } | |
| 1406 | |
| 1407 for(e=HashFirst(&terms); e; e=HashNext(e)){ | |
| 1408 DocList *p = HashData(e); | |
| 1409 docListDelete(p); | |
| 1410 } | |
| 1411 HashClear(&terms); | |
| 1412 return rc; | |
| 1413 } | |
| 1414 | |
| 1415 static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm, | |
| 1416 sqlite_int64 iDocid){ | |
| 1417 sqlite_int64 iFirst; | |
| 1418 sqlite_int64 iIndexRow; | |
| 1419 DocList doclist; | |
| 1420 | |
| 1421 int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); | |
| 1422 if( rc!=SQLITE_ROW ) return SQLITE_ERROR; | |
| 1423 | |
| 1424 rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); | |
| 1425 if( rc!=SQLITE_OK ) return rc; | |
| 1426 | |
| 1427 if( docListUpdate(&doclist, iDocid, NULL) ){ | |
| 1428 if( doclist.nData>0 ){ | |
| 1429 rc = term_update(v, iIndexRow, &doclist); | |
| 1430 } else { /* empty posting list */ | |
| 1431 rc = term_delete(v, iIndexRow); | |
| 1432 } | |
| 1433 } | |
| 1434 docListDestroy(&doclist); | |
| 1435 return rc; | |
| 1436 } | |
| 1437 | |
| 1438 /* Delete a row from the full-text index. */ | |
| 1439 static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ | |
| 1440 char *zText; | |
| 1441 Hash terms; | |
| 1442 HashElem *e; | |
| 1443 | |
| 1444 int rc = content_select(v, iRow, &zText); | |
| 1445 if( rc!=SQLITE_OK ) return rc; | |
| 1446 | |
| 1447 rc = build_terms(&terms, v->pTokenizer, zText, iRow); | |
| 1448 free(zText); | |
| 1449 if( rc!=SQLITE_OK ) return rc; | |
| 1450 | |
| 1451 for(e=HashFirst(&terms); e; e=HashNext(e)){ | |
| 1452 rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow); | |
| 1453 if( rc!=SQLITE_OK ) break; | |
| 1454 } | |
| 1455 for(e=HashFirst(&terms); e; e=HashNext(e)){ | |
| 1456 DocList *p = HashData(e); | |
| 1457 docListDelete(p); | |
| 1458 } | |
| 1459 HashClear(&terms); | |
| 1460 | |
| 1461 return content_delete(v, iRow); | |
| 1462 } | |
| 1463 | |
| 1464 static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, | |
| 1465 sqlite_int64 *pRowid){ | |
| 1466 fulltext_vtab *v = (fulltext_vtab *) pVtab; | |
| 1467 | |
| 1468 if( nArg<2 ){ | |
| 1469 return index_delete(v, sqlite3_value_int64(ppArg[0])); | |
| 1470 } | |
| 1471 | |
| 1472 if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ | |
| 1473 return SQLITE_ERROR; /* an update; not yet supported */ | |
| 1474 } | |
| 1475 | |
| 1476 assert( nArg==3 ); /* ppArg[1] = rowid, ppArg[2] = content */ | |
| 1477 return index_insert(v, ppArg[1], | |
| 1478 (const char *)sqlite3_value_text(ppArg[2]), pRowid); | |
| 1479 } | |
| 1480 | |
| 1481 static sqlite3_module fulltextModule = { | |
| 1482 0, | |
| 1483 fulltextCreate, | |
| 1484 fulltextConnect, | |
| 1485 fulltextBestIndex, | |
| 1486 fulltextDisconnect, | |
| 1487 fulltextDestroy, | |
| 1488 fulltextOpen, | |
| 1489 fulltextClose, | |
| 1490 fulltextFilter, | |
| 1491 fulltextNext, | |
| 1492 fulltextEof, | |
| 1493 fulltextColumn, | |
| 1494 fulltextRowid, | |
| 1495 fulltextUpdate | |
| 1496 }; | |
| 1497 | |
| 1498 int fulltext_init(sqlite3 *db){ | |
| 1499 return sqlite3_create_module(db, "fulltext", &fulltextModule, 0); | |
| 1500 } | |
| 1501 | |
| 1502 #if !SQLITE_CORE | |
| 1503 #ifdef _WIN32 | |
| 1504 __declspec(dllexport) | |
| 1505 #endif | |
| 1506 int sqlite3_fulltext_init(sqlite3 *db, char **pzErrMsg, | |
| 1507 const sqlite3_api_routines *pApi){ | |
| 1508 SQLITE_EXTENSION_INIT2(pApi) | |
| 1509 return fulltext_init(db); | |
| 1510 } | |
| 1511 #endif | |
| OLD | NEW |