OLD | NEW |
(Empty) | |
| 1 /* The author disclaims copyright to this source code. |
| 2 * |
| 3 * This is an SQLite module implementing full-text search. |
| 4 */ |
| 5 |
| 6 #include <assert.h> |
| 7 #if !defined(__APPLE__) |
| 8 #include <malloc.h> |
| 9 #else |
| 10 #include <stdlib.h> |
| 11 #endif |
| 12 #include <stdio.h> |
| 13 #include <string.h> |
| 14 #include <ctype.h> |
| 15 |
| 16 #include "fulltext.h" |
| 17 #include "ft_hash.h" |
| 18 #include "tokenizer.h" |
| 19 #include "sqlite3.h" |
| 20 #include "sqlite3ext.h" |
| 21 SQLITE_EXTENSION_INIT1 |
| 22 |
| 23 /* utility functions */ |
| 24 |
| 25 /* We encode variable-length integers in little-endian order using seven bits |
| 26 * per byte as follows: |
| 27 ** |
| 28 ** KEY: |
| 29 ** A = 0xxxxxxx 7 bits of data and one flag bit |
| 30 ** B = 1xxxxxxx 7 bits of data and one flag bit |
| 31 ** |
| 32 ** 7 bits - A |
| 33 ** 14 bits - BA |
| 34 ** 21 bits - BBA |
| 35 ** and so on. |
| 36 */ |
| 37 |
| 38 /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ |
| 39 #define VARINT_MAX 10 |
| 40 |
| 41 /* Write a 64-bit variable-length integer to memory starting at p[0]. |
| 42 * The length of data written will be between 1 and VARINT_MAX bytes. |
| 43 * The number of bytes written is returned. */ |
| 44 static int putVarint(char *p, sqlite_int64 v){ |
| 45 unsigned char *q = (unsigned char *) p; |
| 46 sqlite_uint64 vu = v; |
| 47 do{ |
| 48 *q++ = (unsigned char) ((vu & 0x7f) | 0x80); |
| 49 vu >>= 7; |
| 50 }while( vu!=0 ); |
| 51 q[-1] &= 0x7f; /* turn off high bit in final byte */ |
| 52 assert( q - (unsigned char *)p <= VARINT_MAX ); |
| 53 return (int) (q - (unsigned char *)p); |
| 54 } |
| 55 |
| 56 /* Read a 64-bit variable-length integer from memory starting at p[0]. |
| 57 * Return the number of bytes read, or 0 on error. |
| 58 * The value is stored in *v. */ |
| 59 static int getVarint(const char *p, sqlite_int64 *v){ |
| 60 const unsigned char *q = (const unsigned char *) p; |
| 61 sqlite_uint64 x = 0, y = 1; |
| 62 while( (*q & 0x80) == 0x80 ){ |
| 63 x += y * (*q++ & 0x7f); |
| 64 y <<= 7; |
| 65 if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ |
| 66 assert( 0 ); |
| 67 return 0; |
| 68 } |
| 69 } |
| 70 x += y * (*q++); |
| 71 *v = (sqlite_int64) x; |
| 72 return (int) (q - (unsigned char *)p); |
| 73 } |
| 74 |
| 75 static int getVarint32(const char *p, int *pi){ |
| 76 sqlite_int64 i; |
| 77 int ret = getVarint(p, &i); |
| 78 *pi = (int) i; |
| 79 assert( *pi==i ); |
| 80 return ret; |
| 81 } |
| 82 |
| 83 /*** Document lists *** |
| 84 * |
| 85 * A document list holds a sorted list of varint-encoded document IDs. |
| 86 * |
| 87 * A doclist with type DL_POSITIONS_OFFSETS is stored like this: |
| 88 * |
| 89 * array { |
| 90 * varint docid; |
| 91 * array { |
| 92 * varint position; (delta from previous position plus 1, or 0 for end) |
| 93 * varint startOffset; (delta from previous startOffset) |
| 94 * varint endOffset; (delta from startOffset) |
| 95 * } |
| 96 * } |
| 97 * |
| 98 * Here, array { X } means zero or more occurrences of X, adjacent in memory. |
| 99 * |
| 100 * A doclist with type DL_POSITIONS is like the above, but holds only docids |
| 101 * and positions without offset information. |
| 102 * |
| 103 * A doclist with type DL_DOCIDS is like the above, but holds only docids |
| 104 * without positions or offset information. |
| 105 * |
| 106 * On disk, every document list has positions and offsets, so we don't bother |
| 107 * to serialize a doclist's type. |
| 108 * |
| 109 * We don't yet delta-encode document IDs; doing so will probably be a |
| 110 * modest win. |
| 111 * |
| 112 * NOTE(shess) I've thought of a slightly (1%) better offset encoding. |
| 113 * After the first offset, estimate the next offset by using the |
| 114 * current token position and the previous token position and offset, |
| 115 * offset to handle some variance. So the estimate would be |
| 116 * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded |
| 117 * as normal. Offsets more than 64 chars from the estimate are |
| 118 * encoded as the delta to the previous start offset + 128. An |
| 119 * additional tiny increment can be gained by using the end offset of |
| 120 * the previous token to make the estimate a tiny bit more precise. |
| 121 */ |
| 122 |
| 123 typedef enum DocListType { |
| 124 DL_DOCIDS, /* docids only */ |
| 125 DL_POSITIONS, /* docids + positions */ |
| 126 DL_POSITIONS_OFFSETS /* docids + positions + offsets */ |
| 127 } DocListType; |
| 128 |
| 129 typedef struct DocList { |
| 130 char *pData; |
| 131 int nData; |
| 132 DocListType iType; |
| 133 int iLastPos; /* the last position written */ |
| 134 int iLastOffset; /* the last start offset written */ |
| 135 } DocList; |
| 136 |
| 137 /* Initialize a new DocList to hold the given data. */ |
| 138 static void docListInit(DocList *d, DocListType iType, |
| 139 const char *pData, int nData){ |
| 140 d->nData = nData; |
| 141 if( nData>0 ){ |
| 142 d->pData = malloc(nData); |
| 143 memcpy(d->pData, pData, nData); |
| 144 } else { |
| 145 d->pData = NULL; |
| 146 } |
| 147 d->iType = iType; |
| 148 d->iLastPos = 0; |
| 149 d->iLastOffset = 0; |
| 150 } |
| 151 |
| 152 /* Create a new dynamically-allocated DocList. */ |
| 153 static DocList *docListNew(DocListType iType){ |
| 154 DocList *d = (DocList *) malloc(sizeof(DocList)); |
| 155 docListInit(d, iType, 0, 0); |
| 156 return d; |
| 157 } |
| 158 |
| 159 static void docListDestroy(DocList *d){ |
| 160 free(d->pData); |
| 161 #ifndef NDEBUG |
| 162 memset(d, 0x55, sizeof(*d)); |
| 163 #endif |
| 164 } |
| 165 |
| 166 static void docListDelete(DocList *d){ |
| 167 docListDestroy(d); |
| 168 free(d); |
| 169 } |
| 170 |
| 171 static char *docListEnd(DocList *d){ |
| 172 return d->pData + d->nData; |
| 173 } |
| 174 |
| 175 /* Append a varint to a DocList's data. */ |
| 176 static void appendVarint(DocList *d, sqlite_int64 i){ |
| 177 char c[VARINT_MAX]; |
| 178 int n = putVarint(c, i); |
| 179 d->pData = realloc(d->pData, d->nData + n); |
| 180 memcpy(d->pData + d->nData, c, n); |
| 181 d->nData += n; |
| 182 } |
| 183 |
| 184 static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ |
| 185 appendVarint(d, iDocid); |
| 186 d->iLastPos = 0; |
| 187 } |
| 188 |
| 189 /* Add a position to the last position list in a doclist. */ |
| 190 static void docListAddPos(DocList *d, int iPos){ |
| 191 assert( d->iType>=DL_POSITIONS ); |
| 192 appendVarint(d, iPos-d->iLastPos+1); |
| 193 d->iLastPos = iPos; |
| 194 } |
| 195 |
| 196 static void docListAddPosOffset(DocList *d, int iPos, |
| 197 int iStartOffset, int iEndOffset){ |
| 198 assert( d->iType==DL_POSITIONS_OFFSETS ); |
| 199 docListAddPos(d, iPos); |
| 200 appendVarint(d, iStartOffset-d->iLastOffset); |
| 201 d->iLastOffset = iStartOffset; |
| 202 appendVarint(d, iEndOffset-iStartOffset); |
| 203 } |
| 204 |
| 205 /* Terminate the last position list in the given doclist. */ |
| 206 static void docListAddEndPos(DocList *d){ |
| 207 appendVarint(d, 0); |
| 208 } |
| 209 |
| 210 typedef struct DocListReader { |
| 211 DocList *pDoclist; |
| 212 char *p; |
| 213 int iLastPos; /* the last position read */ |
| 214 } DocListReader; |
| 215 |
| 216 static void readerInit(DocListReader *r, DocList *pDoclist){ |
| 217 r->pDoclist = pDoclist; |
| 218 if( pDoclist!=NULL ){ |
| 219 r->p = pDoclist->pData; |
| 220 } |
| 221 r->iLastPos = 0; |
| 222 } |
| 223 |
| 224 static int readerAtEnd(DocListReader *pReader){ |
| 225 return pReader->p >= docListEnd(pReader->pDoclist); |
| 226 } |
| 227 |
| 228 /* Peek at the next docid without advancing the read pointer. */ |
| 229 static sqlite_int64 peekDocid(DocListReader *pReader){ |
| 230 sqlite_int64 ret; |
| 231 assert( !readerAtEnd(pReader) ); |
| 232 getVarint(pReader->p, &ret); |
| 233 return ret; |
| 234 } |
| 235 |
| 236 /* Read the next docid. */ |
| 237 static sqlite_int64 readDocid(DocListReader *pReader){ |
| 238 sqlite_int64 ret; |
| 239 assert( !readerAtEnd(pReader) ); |
| 240 pReader->p += getVarint(pReader->p, &ret); |
| 241 pReader->iLastPos = 0; |
| 242 return ret; |
| 243 } |
| 244 |
| 245 /* Read the next position from a position list. |
| 246 * Returns the position, or -1 at the end of the list. */ |
| 247 static int readPosition(DocListReader *pReader){ |
| 248 int i; |
| 249 int iType = pReader->pDoclist->iType; |
| 250 assert( iType>=DL_POSITIONS ); |
| 251 assert( !readerAtEnd(pReader) ); |
| 252 |
| 253 pReader->p += getVarint32(pReader->p, &i); |
| 254 if( i==0 ){ |
| 255 pReader->iLastPos = -1; |
| 256 return -1; |
| 257 } |
| 258 pReader->iLastPos += ((int) i)-1; |
| 259 if( iType>=DL_POSITIONS_OFFSETS ){ |
| 260 /* Skip over offsets, ignoring them for now. */ |
| 261 int iStart, iEnd; |
| 262 pReader->p += getVarint32(pReader->p, &iStart); |
| 263 pReader->p += getVarint32(pReader->p, &iEnd); |
| 264 } |
| 265 return pReader->iLastPos; |
| 266 } |
| 267 |
| 268 /* Skip past the end of a position list. */ |
| 269 static void skipPositionList(DocListReader *pReader){ |
| 270 while( readPosition(pReader)!=-1 ) |
| 271 ; |
| 272 } |
| 273 |
| 274 /* Skip over a docid, including its position list if the doclist has |
| 275 * positions. */ |
| 276 static void skipDocument(DocListReader *pReader){ |
| 277 readDocid(pReader); |
| 278 if( pReader->pDoclist->iType >= DL_POSITIONS ){ |
| 279 skipPositionList(pReader); |
| 280 } |
| 281 } |
| 282 |
| 283 static sqlite_int64 firstDocid(DocList *d){ |
| 284 DocListReader r; |
| 285 readerInit(&r, d); |
| 286 return readDocid(&r); |
| 287 } |
| 288 |
| 289 /* Doclist multi-tool. Pass pUpdate==NULL to delete the indicated docid; |
| 290 * otherwise pUpdate, which must contain only the single docid [iDocid], is |
| 291 * inserted (if not present) or updated (if already present). */ |
| 292 static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){ |
| 293 int modified = 0; |
| 294 DocListReader reader; |
| 295 char *p; |
| 296 |
| 297 if( pUpdate!=NULL ){ |
| 298 assert( d->iType==pUpdate->iType); |
| 299 assert( iDocid==firstDocid(pUpdate) ); |
| 300 } |
| 301 |
| 302 readerInit(&reader, d); |
| 303 while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){ |
| 304 skipDocument(&reader); |
| 305 } |
| 306 |
| 307 p = reader.p; |
| 308 /* Delete if there is a matching element. */ |
| 309 if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){ |
| 310 skipDocument(&reader); |
| 311 memmove(p, reader.p, docListEnd(d) - reader.p); |
| 312 d->nData -= (reader.p - p); |
| 313 modified = 1; |
| 314 } |
| 315 |
| 316 /* Insert if indicated. */ |
| 317 if( pUpdate!=NULL ){ |
| 318 int iDoclist = p-d->pData; |
| 319 docListAddEndPos(pUpdate); |
| 320 |
| 321 d->pData = realloc(d->pData, d->nData+pUpdate->nData); |
| 322 p = d->pData + iDoclist; |
| 323 |
| 324 memmove(p+pUpdate->nData, p, docListEnd(d) - p); |
| 325 memcpy(p, pUpdate->pData, pUpdate->nData); |
| 326 d->nData += pUpdate->nData; |
| 327 modified = 1; |
| 328 } |
| 329 |
| 330 return modified; |
| 331 } |
| 332 |
| 333 /* Split the second half of doclist d into a separate doclist d2. Returns 1 |
| 334 * if successful, or 0 if d contains a single document and hence can't be |
| 335 * split. */ |
| 336 static int docListSplit(DocList *d, DocList *d2){ |
| 337 const char *pSplitPoint = d->pData + d->nData / 2; |
| 338 DocListReader reader; |
| 339 |
| 340 readerInit(&reader, d); |
| 341 while( reader.p<pSplitPoint ){ |
| 342 skipDocument(&reader); |
| 343 } |
| 344 if( readerAtEnd(&reader) ) return 0; |
| 345 docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p); |
| 346 d->nData = reader.p - d->pData; |
| 347 d->pData = realloc(d->pData, d->nData); |
| 348 return 1; |
| 349 } |
| 350 |
| 351 /* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked |
| 352 * on-disk doclist, resulting in another in-memory DocList [out]. [in] |
| 353 * and [out] may or may not store position information according to the |
| 354 * caller's wishes. The on-disk doclist always comes with positions. |
| 355 * |
| 356 * The caller must read each chunk of the on-disk doclist in succession and |
| 357 * pass it to mergeBlock(). |
| 358 * |
| 359 * If [in] has positions, then the merge output contains only documents with |
| 360 * matching positions in the two input doclists. If [in] does not have |
| 361 * positions, then the merge output contains all documents common to the two |
| 362 * input doclists. |
| 363 * |
| 364 * If [in] is NULL, then the on-disk doclist is copied to [out] directly. |
| 365 * |
| 366 * A merge is performed using an integer [iOffset] provided by the caller. |
| 367 * [iOffset] is subtracted from each position in the on-disk doclist for the |
| 368 * purpose of position comparison; this is helpful in implementing phrase |
| 369 * searches. |
| 370 * |
| 371 * A DocListMerge is not yet able to propagate offsets through query |
| 372 * processing; we should add that capability soon. |
| 373 */ |
| 374 typedef struct DocListMerge { |
| 375 DocListReader in; |
| 376 DocList *pOut; |
| 377 int iOffset; |
| 378 } DocListMerge; |
| 379 |
| 380 static void mergeInit(DocListMerge *m, |
| 381 DocList *pIn, int iOffset, DocList *pOut){ |
| 382 readerInit(&m->in, pIn); |
| 383 m->pOut = pOut; |
| 384 m->iOffset = iOffset; |
| 385 |
| 386 /* can't handle offsets yet */ |
| 387 assert( pIn==NULL || pIn->iType <= DL_POSITIONS ); |
| 388 assert( pOut->iType <= DL_POSITIONS ); |
| 389 } |
| 390 |
| 391 /* A helper function for mergeBlock(), below. Merge the position lists |
| 392 * pointed to by m->in and pBlockReader. |
| 393 * If the merge matches, write [iDocid] to m->pOut; if m->pOut |
| 394 * has positions then write all matching positions as well. */ |
| 395 static void mergePosList(DocListMerge *m, sqlite_int64 iDocid, |
| 396 DocListReader *pBlockReader){ |
| 397 int block_pos = readPosition(pBlockReader); |
| 398 int in_pos = readPosition(&m->in); |
| 399 int match = 0; |
| 400 while( block_pos!=-1 || in_pos!=-1 ){ |
| 401 if( block_pos-m->iOffset==in_pos ){ |
| 402 if( !match ){ |
| 403 docListAddDocid(m->pOut, iDocid); |
| 404 match = 1; |
| 405 } |
| 406 if( m->pOut->iType >= DL_POSITIONS ){ |
| 407 docListAddPos(m->pOut, in_pos); |
| 408 } |
| 409 block_pos = readPosition(pBlockReader); |
| 410 in_pos = readPosition(&m->in); |
| 411 } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){ |
| 412 block_pos = readPosition(pBlockReader); |
| 413 } else { |
| 414 in_pos = readPosition(&m->in); |
| 415 } |
| 416 } |
| 417 if( m->pOut->iType >= DL_POSITIONS && match ){ |
| 418 docListAddEndPos(m->pOut); |
| 419 } |
| 420 } |
| 421 |
| 422 /* Merge one block of an on-disk doclist into a DocListMerge. */ |
| 423 static void mergeBlock(DocListMerge *m, DocList *pBlock){ |
| 424 DocListReader blockReader; |
| 425 assert( pBlock->iType >= DL_POSITIONS ); |
| 426 readerInit(&blockReader, pBlock); |
| 427 while( !readerAtEnd(&blockReader) ){ |
| 428 sqlite_int64 iDocid = readDocid(&blockReader); |
| 429 if( m->in.pDoclist!=NULL ){ |
| 430 while( 1 ){ |
| 431 if( readerAtEnd(&m->in) ) return; /* nothing more to merge */ |
| 432 if( peekDocid(&m->in)>=iDocid ) break; |
| 433 skipDocument(&m->in); |
| 434 } |
| 435 if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */ |
| 436 skipPositionList(&blockReader); /* skip this docid in the block */ |
| 437 continue; |
| 438 } |
| 439 readDocid(&m->in); |
| 440 } |
| 441 /* We have a document match. */ |
| 442 if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){ |
| 443 /* We don't need to do a poslist merge. */ |
| 444 docListAddDocid(m->pOut, iDocid); |
| 445 if( m->pOut->iType >= DL_POSITIONS ){ |
| 446 /* Copy all positions to the output doclist. */ |
| 447 while( 1 ){ |
| 448 int pos = readPosition(&blockReader); |
| 449 if( pos==-1 ) break; |
| 450 docListAddPos(m->pOut, pos); |
| 451 } |
| 452 docListAddEndPos(m->pOut); |
| 453 } else skipPositionList(&blockReader); |
| 454 continue; |
| 455 } |
| 456 mergePosList(m, iDocid, &blockReader); |
| 457 } |
| 458 } |
| 459 |
| 460 static char *string_dup_n(const char *s, int n){ |
| 461 char *str = malloc(n + 1); |
| 462 memcpy(str, s, n); |
| 463 str[n] = '\0'; |
| 464 return str; |
| 465 } |
| 466 |
| 467 /* Duplicate a string; the caller must free() the returned string. |
| 468 * (We don't use strdup() since it's not part of the standard C library and |
| 469 * may not be available everywhere.) */ |
| 470 static char *string_dup(const char *s){ |
| 471 return string_dup_n(s, strlen(s)); |
| 472 } |
| 473 |
| 474 /* Format a string, replacing each occurrence of the % character with |
| 475 * zName. This may be more convenient than sqlite_mprintf() |
| 476 * when one string is used repeatedly in a format string. |
| 477 * The caller must free() the returned string. */ |
| 478 static char *string_format(const char *zFormat, const char *zName){ |
| 479 const char *p; |
| 480 size_t len = 0; |
| 481 size_t nName = strlen(zName); |
| 482 char *result; |
| 483 char *r; |
| 484 |
| 485 /* first compute length needed */ |
| 486 for(p = zFormat ; *p ; ++p){ |
| 487 len += (*p=='%' ? nName : 1); |
| 488 } |
| 489 len += 1; /* for null terminator */ |
| 490 |
| 491 r = result = malloc(len); |
| 492 for(p = zFormat; *p; ++p){ |
| 493 if( *p=='%' ){ |
| 494 memcpy(r, zName, nName); |
| 495 r += nName; |
| 496 } else { |
| 497 *r++ = *p; |
| 498 } |
| 499 } |
| 500 *r++ = '\0'; |
| 501 assert( r == result + len ); |
| 502 return result; |
| 503 } |
| 504 |
| 505 static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){ |
| 506 char *zCommand = string_format(zFormat, zName); |
| 507 int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); |
| 508 free(zCommand); |
| 509 return rc; |
| 510 } |
| 511 |
| 512 static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt, |
| 513 const char *zFormat){ |
| 514 char *zCommand = string_format(zFormat, zName); |
| 515 int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); |
| 516 free(zCommand); |
| 517 return rc; |
| 518 } |
| 519 |
| 520 /* end utility functions */ |
| 521 |
| 522 #define QUERY_GENERIC 0 |
| 523 #define QUERY_FULLTEXT 1 |
| 524 |
| 525 #define CHUNK_MAX 1024 |
| 526 |
| 527 typedef enum fulltext_statement { |
| 528 CONTENT_INSERT_STMT, |
| 529 CONTENT_SELECT_STMT, |
| 530 CONTENT_DELETE_STMT, |
| 531 |
| 532 TERM_SELECT_STMT, |
| 533 TERM_CHUNK_SELECT_STMT, |
| 534 TERM_INSERT_STMT, |
| 535 TERM_UPDATE_STMT, |
| 536 TERM_DELETE_STMT, |
| 537 |
| 538 MAX_STMT /* Always at end! */ |
| 539 } fulltext_statement; |
| 540 |
| 541 /* These must exactly match the enum above. */ |
| 542 /* TODO(adam): Is there some risk that a statement (in particular, |
| 543 ** pTermSelectStmt) will be used in two cursors at once, e.g. if a |
| 544 ** query joins a virtual table to itself? If so perhaps we should |
| 545 ** move some of these to the cursor object. |
| 546 */ |
| 547 static const char *fulltext_zStatement[MAX_STMT] = { |
| 548 /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)", |
| 549 /* CONTENT_SELECT */ "select content from %_content where rowid = ?", |
| 550 /* CONTENT_DELETE */ "delete from %_content where rowid = ?", |
| 551 |
| 552 /* TERM_SELECT */ |
| 553 "select rowid, doclist from %_term where term = ? and first = ?", |
| 554 /* TERM_CHUNK_SELECT */ |
| 555 "select max(first) from %_term where term = ? and first <= ?", |
| 556 /* TERM_INSERT */ |
| 557 "insert into %_term (term, first, doclist) values (?, ?, ?)", |
| 558 /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?", |
| 559 /* TERM_DELETE */ "delete from %_term where rowid = ?", |
| 560 }; |
| 561 |
| 562 typedef struct fulltext_vtab { |
| 563 sqlite3_vtab base; |
| 564 sqlite3 *db; |
| 565 const char *zName; /* virtual table name */ |
| 566 sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ |
| 567 |
| 568 /* Precompiled statements which we keep as long as the table is |
| 569 ** open. |
| 570 */ |
| 571 sqlite3_stmt *pFulltextStatements[MAX_STMT]; |
| 572 } fulltext_vtab; |
| 573 |
| 574 typedef struct fulltext_cursor { |
| 575 sqlite3_vtab_cursor base; |
| 576 int iCursorType; /* QUERY_GENERIC or QUERY_FULLTEXT */ |
| 577 |
| 578 sqlite3_stmt *pStmt; |
| 579 |
| 580 int eof; |
| 581 |
| 582 /* The following is used only when iCursorType == QUERY_FULLTEXT. */ |
| 583 DocListReader result; |
| 584 } fulltext_cursor; |
| 585 |
| 586 static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ |
| 587 return (fulltext_vtab *) c->base.pVtab; |
| 588 } |
| 589 |
| 590 static sqlite3_module fulltextModule; /* forward declaration */ |
| 591 |
| 592 /* Puts a freshly-prepared statement determined by iStmt in *ppStmt. |
| 593 ** If the indicated statement has never been prepared, it is prepared |
| 594 ** and cached, otherwise the cached version is reset. |
| 595 */ |
| 596 static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, |
| 597 sqlite3_stmt **ppStmt){ |
| 598 assert( iStmt<MAX_STMT ); |
| 599 if( v->pFulltextStatements[iStmt]==NULL ){ |
| 600 int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt], |
| 601 fulltext_zStatement[iStmt]); |
| 602 if( rc!=SQLITE_OK ) return rc; |
| 603 } else { |
| 604 int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); |
| 605 if( rc!=SQLITE_OK ) return rc; |
| 606 } |
| 607 |
| 608 *ppStmt = v->pFulltextStatements[iStmt]; |
| 609 return SQLITE_OK; |
| 610 } |
| 611 |
| 612 /* Step the indicated statement, handling errors SQLITE_BUSY (by |
| 613 ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring |
| 614 ** bindings to the new statement). |
| 615 ** TODO(adam): We should extend this function so that it can work with |
| 616 ** statements declared locally, not only globally cached statements. |
| 617 */ |
| 618 static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, |
| 619 sqlite3_stmt **ppStmt){ |
| 620 int rc; |
| 621 sqlite3_stmt *s = *ppStmt; |
| 622 assert( iStmt<MAX_STMT ); |
| 623 assert( s==v->pFulltextStatements[iStmt] ); |
| 624 |
| 625 while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ |
| 626 sqlite3_stmt *pNewStmt; |
| 627 |
| 628 if( rc==SQLITE_BUSY ) continue; |
| 629 if( rc!=SQLITE_ERROR ) return rc; |
| 630 |
| 631 rc = sqlite3_reset(s); |
| 632 if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR; |
| 633 |
| 634 v->pFulltextStatements[iStmt] = NULL; /* Still in s */ |
| 635 rc = sql_get_statement(v, iStmt, &pNewStmt); |
| 636 if( rc!=SQLITE_OK ) goto err; |
| 637 *ppStmt = pNewStmt; |
| 638 |
| 639 rc = sqlite3_transfer_bindings(s, pNewStmt); |
| 640 if( rc!=SQLITE_OK ) goto err; |
| 641 |
| 642 rc = sqlite3_finalize(s); |
| 643 if( rc!=SQLITE_OK ) return rc; |
| 644 s = pNewStmt; |
| 645 } |
| 646 return rc; |
| 647 |
| 648 err: |
| 649 sqlite3_finalize(s); |
| 650 return rc; |
| 651 } |
| 652 |
| 653 /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. |
| 654 ** Useful for statements like UPDATE, where we expect no results. |
| 655 */ |
| 656 static int sql_single_step_statement(fulltext_vtab *v, |
| 657 fulltext_statement iStmt, |
| 658 sqlite3_stmt **ppStmt){ |
| 659 int rc = sql_step_statement(v, iStmt, ppStmt); |
| 660 return (rc==SQLITE_DONE) ? SQLITE_OK : rc; |
| 661 } |
| 662 |
| 663 /* insert into %_content (rowid, content) values ([rowid], [zContent]) */ |
| 664 static int content_insert(fulltext_vtab *v, sqlite3_value *rowid, |
| 665 const char *zContent, int nContent){ |
| 666 sqlite3_stmt *s; |
| 667 int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); |
| 668 if( rc!=SQLITE_OK ) return rc; |
| 669 |
| 670 rc = sqlite3_bind_value(s, 1, rowid); |
| 671 if( rc!=SQLITE_OK ) return rc; |
| 672 |
| 673 rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC); |
| 674 if( rc!=SQLITE_OK ) return rc; |
| 675 |
| 676 return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s); |
| 677 } |
| 678 |
| 679 /* select content from %_content where rowid = [iRow] |
| 680 * The caller must delete the returned string. */ |
| 681 static int content_select(fulltext_vtab *v, sqlite_int64 iRow, |
| 682 char **pzContent){ |
| 683 sqlite3_stmt *s; |
| 684 int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); |
| 685 if( rc!=SQLITE_OK ) return rc; |
| 686 |
| 687 rc = sqlite3_bind_int64(s, 1, iRow); |
| 688 if( rc!=SQLITE_OK ) return rc; |
| 689 |
| 690 rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s); |
| 691 if( rc!=SQLITE_ROW ) return rc; |
| 692 |
| 693 *pzContent = string_dup((const char *)sqlite3_column_text(s, 0)); |
| 694 |
| 695 /* We expect only one row. We must execute another sqlite3_step() |
| 696 * to complete the iteration; otherwise the table will remain locked. */ |
| 697 rc = sqlite3_step(s); |
| 698 if( rc==SQLITE_DONE ) return SQLITE_OK; |
| 699 |
| 700 free(*pzContent); |
| 701 return rc; |
| 702 } |
| 703 |
| 704 /* delete from %_content where rowid = [iRow ] */ |
| 705 static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ |
| 706 sqlite3_stmt *s; |
| 707 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); |
| 708 if( rc!=SQLITE_OK ) return rc; |
| 709 |
| 710 rc = sqlite3_bind_int64(s, 1, iRow); |
| 711 if( rc!=SQLITE_OK ) return rc; |
| 712 |
| 713 return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); |
| 714 } |
| 715 |
| 716 /* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst] |
| 717 * If found, returns SQLITE_OK; the caller must free the returned doclist. |
| 718 * If no rows found, returns SQLITE_ERROR. */ |
| 719 static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm, |
| 720 sqlite_int64 iFirst, |
| 721 sqlite_int64 *rowid, |
| 722 DocList *out){ |
| 723 sqlite3_stmt *s; |
| 724 int rc = sql_get_statement(v, TERM_SELECT_STMT, &s); |
| 725 if( rc!=SQLITE_OK ) return rc; |
| 726 |
| 727 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT); |
| 728 if( rc!=SQLITE_OK ) return rc; |
| 729 |
| 730 rc = sqlite3_bind_int64(s, 2, iFirst); |
| 731 if( rc!=SQLITE_OK ) return rc; |
| 732 |
| 733 rc = sql_step_statement(v, TERM_SELECT_STMT, &s); |
| 734 if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; |
| 735 |
| 736 *rowid = sqlite3_column_int64(s, 0); |
| 737 docListInit(out, DL_POSITIONS_OFFSETS, |
| 738 sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1)); |
| 739 |
| 740 /* We expect only one row. We must execute another sqlite3_step() |
| 741 * to complete the iteration; otherwise the table will remain locked. */ |
| 742 rc = sqlite3_step(s); |
| 743 return rc==SQLITE_DONE ? SQLITE_OK : rc; |
| 744 } |
| 745 |
| 746 /* select max(first) from %_term where term = [zTerm] and first <= [iFirst] |
| 747 * If found, returns SQLITE_ROW and result in *piResult; if the query returns |
| 748 * NULL (meaning no row found) returns SQLITE_DONE. |
| 749 */ |
| 750 static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm, |
| 751 sqlite_int64 iFirst, sqlite_int64 *piResult){ |
| 752 sqlite3_stmt *s; |
| 753 int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s); |
| 754 if( rc!=SQLITE_OK ) return rc; |
| 755 |
| 756 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); |
| 757 if( rc!=SQLITE_OK ) return rc; |
| 758 |
| 759 rc = sqlite3_bind_int64(s, 2, iFirst); |
| 760 if( rc!=SQLITE_OK ) return rc; |
| 761 |
| 762 rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s); |
| 763 if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; |
| 764 |
| 765 switch( sqlite3_column_type(s, 0) ){ |
| 766 case SQLITE_NULL: |
| 767 rc = SQLITE_DONE; |
| 768 break; |
| 769 case SQLITE_INTEGER: |
| 770 *piResult = sqlite3_column_int64(s, 0); |
| 771 break; |
| 772 default: |
| 773 return SQLITE_ERROR; |
| 774 } |
| 775 /* We expect only one row. We must execute another sqlite3_step() |
| 776 * to complete the iteration; otherwise the table will remain locked. */ |
| 777 if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR; |
| 778 return rc; |
| 779 } |
| 780 |
| 781 /* insert into %_term (term, first, doclist) |
| 782 values ([zTerm], [iFirst], [doclist]) */ |
| 783 static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm, |
| 784 sqlite_int64 iFirst, DocList *doclist){ |
| 785 sqlite3_stmt *s; |
| 786 int rc = sql_get_statement(v, TERM_INSERT_STMT, &s); |
| 787 if( rc!=SQLITE_OK ) return rc; |
| 788 |
| 789 rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); |
| 790 if( rc!=SQLITE_OK ) return rc; |
| 791 |
| 792 rc = sqlite3_bind_int64(s, 2, iFirst); |
| 793 if( rc!=SQLITE_OK ) return rc; |
| 794 |
| 795 rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC); |
| 796 if( rc!=SQLITE_OK ) return rc; |
| 797 |
| 798 return sql_single_step_statement(v, TERM_INSERT_STMT, &s); |
| 799 } |
| 800 |
| 801 /* update %_term set doclist = [doclist] where rowid = [rowid] */ |
| 802 static int term_update(fulltext_vtab *v, sqlite_int64 rowid, |
| 803 DocList *doclist){ |
| 804 sqlite3_stmt *s; |
| 805 int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); |
| 806 if( rc!=SQLITE_OK ) return rc; |
| 807 |
| 808 rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, |
| 809 SQLITE_STATIC); |
| 810 if( rc!=SQLITE_OK ) return rc; |
| 811 |
| 812 rc = sqlite3_bind_int64(s, 2, rowid); |
| 813 if( rc!=SQLITE_OK ) return rc; |
| 814 |
| 815 return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); |
| 816 } |
| 817 |
| 818 static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ |
| 819 sqlite3_stmt *s; |
| 820 int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); |
| 821 if( rc!=SQLITE_OK ) return rc; |
| 822 |
| 823 rc = sqlite3_bind_int64(s, 1, rowid); |
| 824 if( rc!=SQLITE_OK ) return rc; |
| 825 |
| 826 return sql_single_step_statement(v, TERM_DELETE_STMT, &s); |
| 827 } |
| 828 |
| 829 static void fulltext_vtab_destroy(fulltext_vtab *v){ |
| 830 int iStmt; |
| 831 |
| 832 for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){ |
| 833 if( v->pFulltextStatements[iStmt]!=NULL ){ |
| 834 sqlite3_finalize(v->pFulltextStatements[iStmt]); |
| 835 v->pFulltextStatements[iStmt] = NULL; |
| 836 } |
| 837 } |
| 838 |
| 839 if( v->pTokenizer!=NULL ){ |
| 840 v->pTokenizer->pModule->xDestroy(v->pTokenizer); |
| 841 v->pTokenizer = NULL; |
| 842 } |
| 843 |
| 844 free((void *) v->zName); |
| 845 free(v); |
| 846 } |
| 847 |
| 848 /* Current interface: |
| 849 ** argv[0] - module name |
| 850 ** argv[1] - database name |
| 851 ** argv[2] - table name |
| 852 ** argv[3] - tokenizer name (optional, a sensible default is provided) |
| 853 ** argv[4..] - passed to tokenizer (optional based on tokenizer) |
| 854 **/ |
| 855 static int fulltextConnect(sqlite3 *db, void *pAux, int argc, char **argv, |
| 856 sqlite3_vtab **ppVTab){ |
| 857 int rc; |
| 858 fulltext_vtab *v; |
| 859 sqlite3_tokenizer_module *m = NULL; |
| 860 |
| 861 assert( argc>=3 ); |
| 862 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); |
| 863 /* sqlite will initialize v->base */ |
| 864 v->db = db; |
| 865 v->zName = string_dup(argv[2]); |
| 866 v->pTokenizer = NULL; |
| 867 |
| 868 if( argc==3 ){ |
| 869 get_simple_tokenizer_module(&m); |
| 870 } else { |
| 871 /* TODO(shess) For now, add new tokenizers as else if clauses. */ |
| 872 if( !strcmp(argv[3], "simple") ){ |
| 873 get_simple_tokenizer_module(&m); |
| 874 } else { |
| 875 assert( "unrecognized tokenizer"==NULL ); |
| 876 } |
| 877 } |
| 878 |
| 879 /* TODO(shess) Since tokenization impacts the index, the parameters |
| 880 ** to the tokenizer need to be identical when a persistent virtual |
| 881 ** table is re-created. One solution would be a meta-table to track |
| 882 ** such information in the database. Then we could verify that the |
| 883 ** information is identical on subsequent creates. |
| 884 */ |
| 885 /* TODO(shess) Why isn't argv already (const char **)? */ |
| 886 rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer); |
| 887 if( rc!=SQLITE_OK ) return rc; |
| 888 v->pTokenizer->pModule = m; |
| 889 |
| 890 /* TODO: verify the existence of backing tables foo_content, foo_term */ |
| 891 |
| 892 rc = sqlite3_declare_vtab(db, "create table x(content text)"); |
| 893 if( rc!=SQLITE_OK ) return rc; |
| 894 |
| 895 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); |
| 896 |
| 897 *ppVTab = &v->base; |
| 898 return SQLITE_OK; |
| 899 } |
| 900 |
| 901 static int fulltextCreate(sqlite3 *db, void *pAux, int argc, char **argv, |
| 902 sqlite3_vtab **ppVTab){ |
| 903 int rc; |
| 904 assert( argc>=3 ); |
| 905 |
| 906 /* The %_content table holds the text of each full-text item, with |
| 907 ** the rowid used as the docid. |
| 908 ** |
| 909 ** The %_term table maps each term to a document list blob |
| 910 ** containing elements sorted by ascending docid, each element |
| 911 ** encoded as: |
| 912 ** |
| 913 ** docid varint-encoded |
| 914 ** token count varint-encoded |
| 915 ** "count" token elements (poslist): |
| 916 ** position varint-encoded as delta from previous position |
| 917 ** start offset varint-encoded as delta from previous start offset |
| 918 ** end offset varint-encoded as delta from start offset |
| 919 ** |
| 920 ** Additionally, doclist blobs can be chunked into multiple rows, |
| 921 ** using "first" to order the blobs. "first" is simply the first |
| 922 ** docid in the blob. |
| 923 */ |
| 924 /* |
| 925 ** NOTE(shess) That last sentence is incorrect in the face of |
| 926 ** deletion, which can leave a doclist that doesn't contain the |
| 927 ** first from that row. I _believe_ this does not matter to the |
| 928 ** operation of the system, but it might be reasonable to update |
| 929 ** appropriately in case this assumption becomes more important. |
| 930 */ |
| 931 rc = sql_exec(db, argv[2], |
| 932 "create table %_content(content text);" |
| 933 "create table %_term(term text, first integer, doclist blob);" |
| 934 "create index %_index on %_term(term, first)"); |
| 935 if( rc!=SQLITE_OK ) return rc; |
| 936 |
| 937 return fulltextConnect(db, pAux, argc, argv, ppVTab); |
| 938 } |
| 939 |
| 940 /* Decide how to handle an SQL query. |
| 941 * At the moment, MATCH queries can include implicit boolean ANDs; we |
| 942 * haven't implemented phrase searches or OR yet. */ |
| 943 static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ |
| 944 int i; |
| 945 |
| 946 for(i=0; i<pInfo->nConstraint; ++i){ |
| 947 const struct sqlite3_index_constraint *pConstraint; |
| 948 pConstraint = &pInfo->aConstraint[i]; |
| 949 if( pConstraint->iColumn==0 && |
| 950 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH && |
| 951 pConstraint->usable ){ /* a full-text search */ |
| 952 pInfo->aConstraintUsage[i].argvIndex = 1; |
| 953 pInfo->aConstraintUsage[i].omit = 1; |
| 954 pInfo->idxNum = QUERY_FULLTEXT; |
| 955 pInfo->estimatedCost = 1.0; /* an arbitrary value for now */ |
| 956 return SQLITE_OK; |
| 957 } |
| 958 } |
| 959 pInfo->idxNum = QUERY_GENERIC; |
| 960 return SQLITE_OK; |
| 961 } |
| 962 |
| 963 static int fulltextDisconnect(sqlite3_vtab *pVTab){ |
| 964 fulltext_vtab_destroy((fulltext_vtab *)pVTab); |
| 965 return SQLITE_OK; |
| 966 } |
| 967 |
| 968 static int fulltextDestroy(sqlite3_vtab *pVTab){ |
| 969 fulltext_vtab *v = (fulltext_vtab *)pVTab; |
| 970 |
| 971 int rc = sql_exec(v->db, v->zName, |
| 972 "drop table %_content; drop table %_term"); |
| 973 if( rc!=SQLITE_OK ) return rc; |
| 974 |
| 975 fulltext_vtab_destroy((fulltext_vtab *)pVTab); |
| 976 return SQLITE_OK; |
| 977 } |
| 978 |
| 979 static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ |
| 980 fulltext_cursor *c; |
| 981 |
| 982 c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); |
| 983 /* sqlite will initialize c->base */ |
| 984 *ppCursor = &c->base; |
| 985 |
| 986 return SQLITE_OK; |
| 987 } |
| 988 |
| 989 static int fulltextClose(sqlite3_vtab_cursor *pCursor){ |
| 990 fulltext_cursor *c = (fulltext_cursor *) pCursor; |
| 991 sqlite3_finalize(c->pStmt); |
| 992 if( c->result.pDoclist!=NULL ){ |
| 993 docListDelete(c->result.pDoclist); |
| 994 } |
| 995 free(c); |
| 996 return SQLITE_OK; |
| 997 } |
| 998 |
| 999 static int fulltextNext(sqlite3_vtab_cursor *pCursor){ |
| 1000 fulltext_cursor *c = (fulltext_cursor *) pCursor; |
| 1001 sqlite_int64 iDocid; |
| 1002 int rc; |
| 1003 |
| 1004 switch( c->iCursorType ){ |
| 1005 case QUERY_GENERIC: |
| 1006 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ |
| 1007 rc = sqlite3_step(c->pStmt); |
| 1008 switch( rc ){ |
| 1009 case SQLITE_ROW: |
| 1010 c->eof = 0; |
| 1011 return SQLITE_OK; |
| 1012 case SQLITE_DONE: |
| 1013 c->eof = 1; |
| 1014 return SQLITE_OK; |
| 1015 default: |
| 1016 c->eof = 1; |
| 1017 return rc; |
| 1018 } |
| 1019 case QUERY_FULLTEXT: |
| 1020 rc = sqlite3_reset(c->pStmt); |
| 1021 if( rc!=SQLITE_OK ) return rc; |
| 1022 |
| 1023 if( readerAtEnd(&c->result)){ |
| 1024 c->eof = 1; |
| 1025 return SQLITE_OK; |
| 1026 } |
| 1027 iDocid = readDocid(&c->result); |
| 1028 rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); |
| 1029 if( rc!=SQLITE_OK ) return rc; |
| 1030 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ |
| 1031 rc = sqlite3_step(c->pStmt); |
| 1032 if( rc==SQLITE_ROW ){ /* the case we expect */ |
| 1033 c->eof = 0; |
| 1034 return SQLITE_OK; |
| 1035 } |
| 1036 /* an error occurred; abort */ |
| 1037 return rc==SQLITE_DONE ? SQLITE_ERROR : rc; |
| 1038 default: |
| 1039 assert( 0 ); |
| 1040 return SQLITE_ERROR; /* not reached */ |
| 1041 } |
| 1042 } |
| 1043 |
| 1044 static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm, |
| 1045 sqlite3_stmt **ppStmt){ |
| 1046 int rc; |
| 1047 if( *ppStmt ){ |
| 1048 rc = sqlite3_reset(*ppStmt); |
| 1049 } else { |
| 1050 rc = sql_prepare(v->db, v->zName, ppStmt, |
| 1051 "select doclist from %_term where term = ? order by first"); |
| 1052 } |
| 1053 if( rc!=SQLITE_OK ) return rc; |
| 1054 |
| 1055 rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT); |
| 1056 if( rc!=SQLITE_OK ) return rc; |
| 1057 |
| 1058 return sqlite3_step(*ppStmt); /* TODO(adamd): handle schema error */ |
| 1059 } |
| 1060 |
| 1061 /* Read the posting list for [zTerm]; AND it with the doclist [in] to |
| 1062 * produce the doclist [out], using the given offset [iOffset] for phrase |
| 1063 * matching. |
| 1064 * (*pSelect) is used to hold an SQLite statement used inside this function; |
| 1065 * the caller should initialize *pSelect to NULL before the first call. |
| 1066 */ |
| 1067 static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect, |
| 1068 const char *zTerm, |
| 1069 DocList *pIn, int iOffset, DocList *out){ |
| 1070 int rc; |
| 1071 DocListMerge merge; |
| 1072 |
| 1073 if( pIn!=NULL && !pIn->nData ){ |
| 1074 /* If [pIn] is already empty, there's no point in reading the |
| 1075 * posting list to AND it in; return immediately. */ |
| 1076 return SQLITE_OK; |
| 1077 } |
| 1078 |
| 1079 rc = term_select_doclist(v, zTerm, -1, pSelect); |
| 1080 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc; |
| 1081 |
| 1082 mergeInit(&merge, pIn, iOffset, out); |
| 1083 while( rc==SQLITE_ROW ){ |
| 1084 DocList block; |
| 1085 docListInit(&block, DL_POSITIONS_OFFSETS, |
| 1086 sqlite3_column_blob(*pSelect, 0), |
| 1087 sqlite3_column_bytes(*pSelect, 0)); |
| 1088 mergeBlock(&merge, &block); |
| 1089 docListDestroy(&block); |
| 1090 |
| 1091 rc = sqlite3_step(*pSelect); |
| 1092 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){ |
| 1093 return rc; |
| 1094 } |
| 1095 } |
| 1096 |
| 1097 return SQLITE_OK; |
| 1098 } |
| 1099 |
| 1100 typedef struct QueryTerm { |
| 1101 int is_phrase; /* true if this term begins a new phrase */ |
| 1102 const char *zTerm; |
| 1103 } QueryTerm; |
| 1104 |
| 1105 /* A parsed query. |
| 1106 * |
| 1107 * As an example, parsing the query ["four score" years "new nation"] will |
| 1108 * yield a Query with 5 terms: |
| 1109 * "four", is_phrase = 1 |
| 1110 * "score", is_phrase = 0 |
| 1111 * "years", is_phrase = 1 |
| 1112 * "new", is_phrase = 1 |
| 1113 * "nation", is_phrase = 0 |
| 1114 */ |
| 1115 typedef struct Query { |
| 1116 int nTerms; |
| 1117 QueryTerm *pTerm; |
| 1118 } Query; |
| 1119 |
| 1120 static void query_add(Query *q, int is_phrase, const char *zTerm){ |
| 1121 QueryTerm *t; |
| 1122 ++q->nTerms; |
| 1123 q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0])); |
| 1124 t = &q->pTerm[q->nTerms - 1]; |
| 1125 t->is_phrase = is_phrase; |
| 1126 t->zTerm = zTerm; |
| 1127 } |
| 1128 |
| 1129 static void query_free(Query *q){ |
| 1130 int i; |
| 1131 for(i = 0; i < q->nTerms; ++i){ |
| 1132 free((void *) q->pTerm[i].zTerm); |
| 1133 } |
| 1134 free(q->pTerm); |
| 1135 } |
| 1136 |
| 1137 static int tokenize_segment(sqlite3_tokenizer *pTokenizer, |
| 1138 const char *zQuery, int in_phrase, |
| 1139 Query *pQuery){ |
| 1140 sqlite3_tokenizer_module *pModule = pTokenizer->pModule; |
| 1141 sqlite3_tokenizer_cursor *pCursor; |
| 1142 int is_first = 1; |
| 1143 |
| 1144 int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor); |
| 1145 if( rc!=SQLITE_OK ) return rc; |
| 1146 pCursor->pTokenizer = pTokenizer; |
| 1147 |
| 1148 while( 1 ){ |
| 1149 const char *zToken; |
| 1150 int nToken, iStartOffset, iEndOffset, dummy_pos; |
| 1151 |
| 1152 rc = pModule->xNext(pCursor, |
| 1153 &zToken, &nToken, |
| 1154 &iStartOffset, &iEndOffset, |
| 1155 &dummy_pos); |
| 1156 if( rc!=SQLITE_OK ) break; |
| 1157 query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken)); |
| 1158 is_first = 0; |
| 1159 } |
| 1160 |
| 1161 return pModule->xClose(pCursor); |
| 1162 } |
| 1163 |
| 1164 /* Parse a query string, yielding a Query object. */ |
| 1165 static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){ |
| 1166 char *zQuery1 = string_dup(zQuery); |
| 1167 int in_phrase = 0; |
| 1168 char *s = zQuery1; |
| 1169 pQuery->nTerms = 0; |
| 1170 pQuery->pTerm = NULL; |
| 1171 |
| 1172 while( *s ){ |
| 1173 char *t = s; |
| 1174 while( *t ){ |
| 1175 if( *t=='"' ){ |
| 1176 *t++ = '\0'; |
| 1177 break; |
| 1178 } |
| 1179 ++t; |
| 1180 } |
| 1181 if( *s ){ |
| 1182 tokenize_segment(v->pTokenizer, s, in_phrase, pQuery); |
| 1183 } |
| 1184 s = t; |
| 1185 in_phrase = !in_phrase; |
| 1186 } |
| 1187 |
| 1188 free(zQuery1); |
| 1189 return SQLITE_OK; |
| 1190 } |
| 1191 |
| 1192 /* Perform a full-text query; return a list of documents in [pResult]. */ |
| 1193 static int fulltext_query(fulltext_vtab *v, const char *zQuery, |
| 1194 DocList **pResult){ |
| 1195 Query q; |
| 1196 int phrase_start = -1; |
| 1197 int i; |
| 1198 sqlite3_stmt *pSelect = NULL; |
| 1199 DocList *d = NULL; |
| 1200 |
| 1201 int rc = parse_query(v, zQuery, &q); |
| 1202 if( rc!=SQLITE_OK ) return rc; |
| 1203 |
| 1204 /* Merge terms. */ |
| 1205 for(i = 0 ; i < q.nTerms ; ++i){ |
| 1206 /* In each merge step, we need to generate positions whenever we're |
| 1207 * processing a phrase which hasn't ended yet. */ |
| 1208 int need_positions = i<q.nTerms-1 && !q.pTerm[i+1].is_phrase; |
| 1209 DocList *next = docListNew(need_positions ? DL_POSITIONS : DL_DOCIDS); |
| 1210 if( q.pTerm[i].is_phrase ){ |
| 1211 phrase_start = i; |
| 1212 } |
| 1213 rc = query_merge(v, &pSelect, q.pTerm[i].zTerm, d, i - phrase_start, next); |
| 1214 if( rc!=SQLITE_OK ) break; |
| 1215 if( d!=NULL ){ |
| 1216 docListDelete(d); |
| 1217 } |
| 1218 d = next; |
| 1219 } |
| 1220 |
| 1221 sqlite3_finalize(pSelect); |
| 1222 query_free(&q); |
| 1223 *pResult = d; |
| 1224 return rc; |
| 1225 } |
| 1226 |
| 1227 static int fulltextFilter(sqlite3_vtab_cursor *pCursor, |
| 1228 int idxNum, const char *idxStr, |
| 1229 int argc, sqlite3_value **argv){ |
| 1230 fulltext_cursor *c = (fulltext_cursor *) pCursor; |
| 1231 fulltext_vtab *v = cursor_vtab(c); |
| 1232 int rc; |
| 1233 const char *zStatement; |
| 1234 |
| 1235 c->iCursorType = idxNum; |
| 1236 switch( idxNum ){ |
| 1237 case QUERY_GENERIC: |
| 1238 zStatement = "select rowid, content from %_content"; |
| 1239 break; |
| 1240 |
| 1241 case QUERY_FULLTEXT: /* full-text search */ |
| 1242 { |
| 1243 const char *zQuery = (const char *)sqlite3_value_text(argv[0]); |
| 1244 DocList *pResult; |
| 1245 assert( argc==1 ); |
| 1246 rc = fulltext_query(v, zQuery, &pResult); |
| 1247 if( rc!=SQLITE_OK ) return rc; |
| 1248 readerInit(&c->result, pResult); |
| 1249 zStatement = "select rowid, content from %_content where rowid = ?"; |
| 1250 break; |
| 1251 } |
| 1252 |
| 1253 default: |
| 1254 assert( 0 ); |
| 1255 } |
| 1256 |
| 1257 rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement); |
| 1258 if( rc!=SQLITE_OK ) return rc; |
| 1259 |
| 1260 return fulltextNext(pCursor); |
| 1261 } |
| 1262 |
| 1263 static int fulltextEof(sqlite3_vtab_cursor *pCursor){ |
| 1264 fulltext_cursor *c = (fulltext_cursor *) pCursor; |
| 1265 return c->eof; |
| 1266 } |
| 1267 |
| 1268 static int fulltextColumn(sqlite3_vtab_cursor *pCursor, |
| 1269 sqlite3_context *pContext, int idxCol){ |
| 1270 fulltext_cursor *c = (fulltext_cursor *) pCursor; |
| 1271 const char *s; |
| 1272 |
| 1273 assert( idxCol==0 ); |
| 1274 s = (const char *) sqlite3_column_text(c->pStmt, 1); |
| 1275 sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT); |
| 1276 |
| 1277 return SQLITE_OK; |
| 1278 } |
| 1279 |
| 1280 static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ |
| 1281 fulltext_cursor *c = (fulltext_cursor *) pCursor; |
| 1282 |
| 1283 *pRowid = sqlite3_column_int64(c->pStmt, 0); |
| 1284 return SQLITE_OK; |
| 1285 } |
| 1286 |
| 1287 /* Build a hash table containing all terms in zText. */ |
| 1288 static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer, |
| 1289 const char *zText, sqlite_int64 iDocid){ |
| 1290 sqlite3_tokenizer_cursor *pCursor; |
| 1291 const char *pToken; |
| 1292 int nTokenBytes; |
| 1293 int iStartOffset, iEndOffset, iPosition; |
| 1294 |
| 1295 int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); |
| 1296 if( rc!=SQLITE_OK ) return rc; |
| 1297 |
| 1298 pCursor->pTokenizer = pTokenizer; |
| 1299 HashInit(terms, HASH_STRING, 1); |
| 1300 while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, |
| 1301 &pToken, &nTokenBytes, |
| 1302 &iStartOffset, &iEndOffset, |
| 1303 &iPosition) ){ |
| 1304 DocList *p; |
| 1305 |
| 1306 /* Positions can't be negative; we use -1 as a terminator internally. */ |
| 1307 if( iPosition<0 ) { |
| 1308 rc = SQLITE_ERROR; |
| 1309 goto err; |
| 1310 } |
| 1311 |
| 1312 p = HashFind(terms, pToken, nTokenBytes); |
| 1313 if( p==NULL ){ |
| 1314 p = docListNew(DL_POSITIONS_OFFSETS); |
| 1315 docListAddDocid(p, iDocid); |
| 1316 HashInsert(terms, pToken, nTokenBytes, p); |
| 1317 } |
| 1318 docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset); |
| 1319 } |
| 1320 |
| 1321 err: |
| 1322 /* TODO(shess) Check return? Should this be able to cause errors at |
| 1323 ** this point? Actually, same question about sqlite3_finalize(), |
| 1324 ** though one could argue that failure there means that the data is |
| 1325 ** not durable. *ponder* |
| 1326 */ |
| 1327 pTokenizer->pModule->xClose(pCursor); |
| 1328 return rc; |
| 1329 } |
| 1330 /* Update the %_terms table to map the term [zTerm] to the given rowid. */ |
| 1331 static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm, |
| 1332 sqlite_int64 iDocid, DocList *p){ |
| 1333 sqlite_int64 iFirst; |
| 1334 sqlite_int64 iIndexRow; |
| 1335 DocList doclist; |
| 1336 |
| 1337 int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); |
| 1338 if( rc==SQLITE_DONE ){ |
| 1339 docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0); |
| 1340 if( docListUpdate(&doclist, iDocid, p) ){ |
| 1341 rc = term_insert(v, zTerm, nTerm, iDocid, &doclist); |
| 1342 docListDestroy(&doclist); |
| 1343 return rc; |
| 1344 } |
| 1345 return SQLITE_OK; |
| 1346 } |
| 1347 if( rc!=SQLITE_ROW ) return SQLITE_ERROR; |
| 1348 |
| 1349 /* This word is in the index; add this document ID to its blob. */ |
| 1350 |
| 1351 rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); |
| 1352 if( rc!=SQLITE_OK ) return rc; |
| 1353 |
| 1354 if( docListUpdate(&doclist, iDocid, p) ){ |
| 1355 /* If the blob is too big, split it in half. */ |
| 1356 if( doclist.nData>CHUNK_MAX ){ |
| 1357 DocList half; |
| 1358 if( docListSplit(&doclist, &half) ){ |
| 1359 rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half); |
| 1360 docListDestroy(&half); |
| 1361 if( rc!=SQLITE_OK ) goto err; |
| 1362 } |
| 1363 } |
| 1364 rc = term_update(v, iIndexRow, &doclist); |
| 1365 } |
| 1366 |
| 1367 err: |
| 1368 docListDestroy(&doclist); |
| 1369 return rc; |
| 1370 } |
| 1371 |
| 1372 /* Insert a row into the full-text index; set *piRowid to be the ID of the |
| 1373 * new row. */ |
| 1374 static int index_insert(fulltext_vtab *v, |
| 1375 sqlite3_value *pRequestRowid, const char *zText, |
| 1376 sqlite_int64 *piRowid){ |
| 1377 Hash terms; /* maps term string -> PosList */ |
| 1378 HashElem *e; |
| 1379 |
| 1380 int rc = content_insert(v, pRequestRowid, zText, -1); |
| 1381 if( rc!=SQLITE_OK ) return rc; |
| 1382 *piRowid = sqlite3_last_insert_rowid(v->db); |
| 1383 |
| 1384 if( !zText ) return SQLITE_OK; /* nothing to index */ |
| 1385 |
| 1386 rc = build_terms(&terms, v->pTokenizer, zText, *piRowid); |
| 1387 if( rc!=SQLITE_OK ) return rc; |
| 1388 |
| 1389 for(e=HashFirst(&terms); e; e=HashNext(e)){ |
| 1390 DocList *p = HashData(e); |
| 1391 rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p); |
| 1392 if( rc!=SQLITE_OK ) break; |
| 1393 } |
| 1394 |
| 1395 for(e=HashFirst(&terms); e; e=HashNext(e)){ |
| 1396 DocList *p = HashData(e); |
| 1397 docListDelete(p); |
| 1398 } |
| 1399 HashClear(&terms); |
| 1400 return rc; |
| 1401 } |
| 1402 |
| 1403 static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm, |
| 1404 sqlite_int64 iDocid){ |
| 1405 sqlite_int64 iFirst; |
| 1406 sqlite_int64 iIndexRow; |
| 1407 DocList doclist; |
| 1408 |
| 1409 int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); |
| 1410 if( rc!=SQLITE_ROW ) return SQLITE_ERROR; |
| 1411 |
| 1412 rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); |
| 1413 if( rc!=SQLITE_OK ) return rc; |
| 1414 |
| 1415 if( docListUpdate(&doclist, iDocid, NULL) ){ |
| 1416 if( doclist.nData>0 ){ |
| 1417 rc = term_update(v, iIndexRow, &doclist); |
| 1418 } else { /* empty posting list */ |
| 1419 rc = term_delete(v, iIndexRow); |
| 1420 } |
| 1421 } |
| 1422 docListDestroy(&doclist); |
| 1423 return rc; |
| 1424 } |
| 1425 |
| 1426 /* Delete a row from the full-text index. */ |
| 1427 static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ |
| 1428 char *zText; |
| 1429 Hash terms; |
| 1430 HashElem *e; |
| 1431 |
| 1432 int rc = content_select(v, iRow, &zText); |
| 1433 if( rc!=SQLITE_OK ) return rc; |
| 1434 |
| 1435 rc = build_terms(&terms, v->pTokenizer, zText, iRow); |
| 1436 free(zText); |
| 1437 if( rc!=SQLITE_OK ) return rc; |
| 1438 |
| 1439 for(e=HashFirst(&terms); e; e=HashNext(e)){ |
| 1440 rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow); |
| 1441 if( rc!=SQLITE_OK ) break; |
| 1442 } |
| 1443 for(e=HashFirst(&terms); e; e=HashNext(e)){ |
| 1444 DocList *p = HashData(e); |
| 1445 docListDelete(p); |
| 1446 } |
| 1447 HashClear(&terms); |
| 1448 |
| 1449 return content_delete(v, iRow); |
| 1450 } |
| 1451 |
| 1452 static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, |
| 1453 sqlite_int64 *pRowid){ |
| 1454 fulltext_vtab *v = (fulltext_vtab *) pVtab; |
| 1455 |
| 1456 if( nArg<2 ){ |
| 1457 return index_delete(v, sqlite3_value_int64(ppArg[0])); |
| 1458 } |
| 1459 |
| 1460 if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ |
| 1461 return SQLITE_ERROR; /* an update; not yet supported */ |
| 1462 } |
| 1463 |
| 1464 assert( nArg==3 ); /* ppArg[1] = rowid, ppArg[2] = content */ |
| 1465 return index_insert(v, ppArg[1], |
| 1466 (const char *)sqlite3_value_text(ppArg[2]), pRowid); |
| 1467 } |
| 1468 |
| 1469 static sqlite3_module fulltextModule = { |
| 1470 0, |
| 1471 fulltextCreate, |
| 1472 fulltextConnect, |
| 1473 fulltextBestIndex, |
| 1474 fulltextDisconnect, |
| 1475 fulltextDestroy, |
| 1476 fulltextOpen, |
| 1477 fulltextClose, |
| 1478 fulltextFilter, |
| 1479 fulltextNext, |
| 1480 fulltextEof, |
| 1481 fulltextColumn, |
| 1482 fulltextRowid, |
| 1483 fulltextUpdate |
| 1484 }; |
| 1485 |
| 1486 int fulltext_init(sqlite3 *db){ |
| 1487 return sqlite3_create_module(db, "fulltext", &fulltextModule, 0); |
| 1488 } |
| 1489 |
| 1490 #if !SQLITE_CORE |
| 1491 int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg, |
| 1492 const sqlite3_api_routines *pApi){ |
| 1493 SQLITE_EXTENSION_INIT2(pApi) |
| 1494 return fulltext_init(db); |
| 1495 } |
| 1496 #endif |
OLD | NEW |