Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(167)

Side by Side Diff: third_party/sqlite/src/ext/fts3/fts3_snippet.c

Issue 6990047: Import SQLite 3.7.6.3. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 9 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 /*
2 ** 2009 Oct 23
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 */
13
14 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
15
16 #include "fts3Int.h"
17 #include <string.h>
18 #include <assert.h>
19
/*
** Request characters understood in the format string that is passed as
** the second argument to matchinfo(). The trailing comment on each line
** states how many output values the request contributes.
*/
#define FTS3_MATCHINFO_HITS      'x'        /* 3*nCol*nPhrase values */
#define FTS3_MATCHINFO_LCS       's'        /* nCol values */
#define FTS3_MATCHINFO_LENGTH    'l'        /* nCol values */
#define FTS3_MATCHINFO_AVGLENGTH 'a'        /* nCol values */
#define FTS3_MATCHINFO_NDOC      'n'        /* 1 value */
#define FTS3_MATCHINFO_NCOL      'c'        /* 1 value */
#define FTS3_MATCHINFO_NPHRASE   'p'        /* 1 value */

/*
** Format string used when matchinfo() is given no second argument.
*/
#define FTS3_MATCHINFO_DEFAULT   "pcx"
35
36
37 /*
38 ** Used as an fts3ExprIterate() context when loading phrase doclists to
39 ** Fts3Expr.aDoclist[]/nDoclist.
40 */
41 typedef struct LoadDoclistCtx LoadDoclistCtx;
42 struct LoadDoclistCtx {
43 Fts3Cursor *pCsr; /* FTS3 Cursor */
44 int nPhrase; /* Number of phrases seen so far */
45 int nToken; /* Number of tokens seen so far */
46 };
47
48 /*
49 ** The following types are used as part of the implementation of the
50 ** fts3BestSnippet() routine.
51 */
52 typedef struct SnippetIter SnippetIter;
53 typedef struct SnippetPhrase SnippetPhrase;
54 typedef struct SnippetFragment SnippetFragment;
55
56 struct SnippetIter {
57 Fts3Cursor *pCsr; /* Cursor snippet is being generated from */
58 int iCol; /* Extract snippet from this column */
59 int nSnippet; /* Requested snippet length (in tokens) */
60 int nPhrase; /* Number of phrases in query */
61 SnippetPhrase *aPhrase; /* Array of size nPhrase */
62 int iCurrent; /* First token of current snippet */
63 };
64
65 struct SnippetPhrase {
66 int nToken; /* Number of tokens in phrase */
67 char *pList; /* Pointer to start of phrase position list */
68 int iHead; /* Next value in position list */
69 char *pHead; /* Position list data following iHead */
70 int iTail; /* Next value in trailing position list */
71 char *pTail; /* Position list data following iTail */
72 };
73
74 struct SnippetFragment {
75 int iCol; /* Column snippet is extracted from */
76 int iPos; /* Index of first token in snippet */
77 u64 covered; /* Mask of query phrases covered */
78 u64 hlmask; /* Mask of snippet terms to highlight */
79 };
80
81 /*
82 ** This type is used as an fts3ExprIterate() context object while
83 ** accumulating the data returned by the matchinfo() function.
84 */
85 typedef struct MatchInfo MatchInfo;
86 struct MatchInfo {
87 Fts3Cursor *pCursor; /* FTS3 Cursor */
88 int nCol; /* Number of columns in table */
89 int nPhrase; /* Number of matchable phrases in query */
90 sqlite3_int64 nDoc; /* Number of docs in database */
91 u32 *aMatchinfo; /* Pre-allocated buffer */
92 };
93
94
95
/*
** snippet() and offsets() both produce text. While either function runs,
** the text is accumulated in an instance of this structure; see
** fts3StringAppend() for how the buffer grows.
*/
typedef struct StrBuffer {
  char *z;                        /* Buffer holding the string */
  int n;                          /* Bytes used in z (excl. nul-term) */
  int nAlloc;                     /* Bytes allocated for z */
} StrBuffer;
107
108
/*
** Step one element forward through a position-list.
**
** A position-list holds unique integers in ascending order. Each element
** is stored as an FTS3 varint equal to (element - previous element + 2).
** The list
**
**     4 9 113
**
** is therefore stored as the varints
**
**     6 7 106
**
** On entry *pp points at the varint of the element to decode and *piPos
** holds the value of the element before it. On return *piPos holds the
** decoded element and *pp points at the varint that follows.
*/
static void fts3GetDeltaPosition(char **pp, int *piPos){
  int iDelta;                     /* Stored varint: difference plus two */
  int nByte;                      /* Bytes consumed by the varint */

  nByte = sqlite3Fts3GetVarint32(*pp, &iDelta);
  *pp = *pp + nByte;
  *piPos = *piPos + (iDelta - 2);
}
134
135 /*
136 ** Helper function for fts3ExprIterate() (see below).
137 */
138 static int fts3ExprIterate2(
139 Fts3Expr *pExpr, /* Expression to iterate phrases of */
140 int *piPhrase, /* Pointer to phrase counter */
141 int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */
142 void *pCtx /* Second argument to pass to callback */
143 ){
144 int rc; /* Return code */
145 int eType = pExpr->eType; /* Type of expression node pExpr */
146
147 if( eType!=FTSQUERY_PHRASE ){
148 assert( pExpr->pLeft && pExpr->pRight );
149 rc = fts3ExprIterate2(pExpr->pLeft, piPhrase, x, pCtx);
150 if( rc==SQLITE_OK && eType!=FTSQUERY_NOT ){
151 rc = fts3ExprIterate2(pExpr->pRight, piPhrase, x, pCtx);
152 }
153 }else{
154 rc = x(pExpr, *piPhrase, pCtx);
155 (*piPhrase)++;
156 }
157 return rc;
158 }
159
160 /*
161 ** Iterate through all phrase nodes in an FTS3 query, except those that
162 ** are part of a sub-tree that is the right-hand-side of a NOT operator.
163 ** For each phrase node found, the supplied callback function is invoked.
164 **
165 ** If the callback function returns anything other than SQLITE_OK,
166 ** the iteration is abandoned and the error code returned immediately.
167 ** Otherwise, SQLITE_OK is returned after a callback has been made for
168 ** all eligible phrase nodes.
169 */
170 static int fts3ExprIterate(
171 Fts3Expr *pExpr, /* Expression to iterate phrases of */
172 int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */
173 void *pCtx /* Second argument to pass to callback */
174 ){
175 int iPhrase = 0; /* Variable used as the phrase counter */
176 return fts3ExprIterate2(pExpr, &iPhrase, x, pCtx);
177 }
178
179 /*
180 ** The argument to this function is always a phrase node. Its doclist
181 ** (Fts3Expr.aDoclist[]) and the doclists associated with all phrase nodes
182 ** to the left of this one in the query tree have already been loaded.
183 **
184 ** If this phrase node is part of a series of phrase nodes joined by
185 ** NEAR operators (and is not the left-most of said series), then elements are
186 ** removed from the phrases doclist consistent with the NEAR restriction. If
187 ** required, elements may be removed from the doclists of phrases to the
188 ** left of this one that are part of the same series of NEAR operator
189 ** connected phrases.
190 **
191 ** If an OOM error occurs, SQLITE_NOMEM is returned. Otherwise, SQLITE_OK.
192 */
193 static int fts3ExprNearTrim(Fts3Expr *pExpr){
194 int rc = SQLITE_OK;
195 Fts3Expr *pParent = pExpr->pParent;
196
197 assert( pExpr->eType==FTSQUERY_PHRASE );
198 while( rc==SQLITE_OK
199 && pParent
200 && pParent->eType==FTSQUERY_NEAR
201 && pParent->pRight==pExpr
202 ){
203 /* This expression (pExpr) is the right-hand-side of a NEAR operator.
204 ** Find the expression to the left of the same operator.
205 */
206 int nNear = pParent->nNear;
207 Fts3Expr *pLeft = pParent->pLeft;
208
209 if( pLeft->eType!=FTSQUERY_PHRASE ){
210 assert( pLeft->eType==FTSQUERY_NEAR );
211 assert( pLeft->pRight->eType==FTSQUERY_PHRASE );
212 pLeft = pLeft->pRight;
213 }
214
215 rc = sqlite3Fts3ExprNearTrim(pLeft, pExpr, nNear);
216
217 pExpr = pLeft;
218 pParent = pExpr->pParent;
219 }
220
221 return rc;
222 }
223
224 /*
225 ** This is an fts3ExprIterate() callback used while loading the doclists
226 ** for each phrase into Fts3Expr.aDoclist[]/nDoclist. See also
227 ** fts3ExprLoadDoclists().
228 */
229 static int fts3ExprLoadDoclistsCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
230 int rc = SQLITE_OK;
231 LoadDoclistCtx *p = (LoadDoclistCtx *)ctx;
232
233 UNUSED_PARAMETER(iPhrase);
234
235 p->nPhrase++;
236 p->nToken += pExpr->pPhrase->nToken;
237
238 if( pExpr->isLoaded==0 ){
239 rc = sqlite3Fts3ExprLoadDoclist(p->pCsr, pExpr);
240 pExpr->isLoaded = 1;
241 if( rc==SQLITE_OK ){
242 rc = fts3ExprNearTrim(pExpr);
243 }
244 }
245
246 return rc;
247 }
248
249 /*
250 ** Load the doclists for each phrase in the query associated with FTS3 cursor
251 ** pCsr.
252 **
253 ** If pnPhrase is not NULL, then *pnPhrase is set to the number of matchable
254 ** phrases in the expression (all phrases except those directly or
255 ** indirectly descended from the right-hand-side of a NOT operator). If
256 ** pnToken is not NULL, then it is set to the number of tokens in all
257 ** matchable phrases of the expression.
258 */
259 static int fts3ExprLoadDoclists(
260 Fts3Cursor *pCsr, /* Fts3 cursor for current query */
261 int *pnPhrase, /* OUT: Number of phrases in query */
262 int *pnToken /* OUT: Number of tokens in query */
263 ){
264 int rc; /* Return Code */
265 LoadDoclistCtx sCtx = {0,0,0}; /* Context for fts3ExprIterate() */
266 sCtx.pCsr = pCsr;
267 rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb, (void *)&sCtx);
268 if( pnPhrase ) *pnPhrase = sCtx.nPhrase;
269 if( pnToken ) *pnToken = sCtx.nToken;
270 return rc;
271 }
272
273 static int fts3ExprPhraseCountCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
274 (*(int *)ctx)++;
275 UNUSED_PARAMETER(pExpr);
276 UNUSED_PARAMETER(iPhrase);
277 return SQLITE_OK;
278 }
279 static int fts3ExprPhraseCount(Fts3Expr *pExpr){
280 int nPhrase = 0;
281 (void)fts3ExprIterate(pExpr, fts3ExprPhraseCountCb, (void *)&nPhrase);
282 return nPhrase;
283 }
284
/*
** Move the position-list iterator (*ppIter, *piIter) forward until its
** current value is greater than or equal to iNext.
**
** If the end of the list is reached first, the iterator is marked as
** exhausted: *ppIter is set to NULL and *piIter to -1. An iterator that
** is already NULL on entry is left untouched.
*/
static void fts3SnippetAdvance(char **ppIter, int *piIter, int iNext){
  char *pCur = *ppIter;
  int iCur;

  if( pCur==0 ){
    return;
  }

  for(iCur=*piIter; iCur<iNext; fts3GetDeltaPosition(&pCur, &iCur)){
    /* A byte of 0x00 or 0x01 ends the list for this column. */
    if( (*pCur & 0xFE)==0 ){
      iCur = -1;
      pCur = 0;
      break;
    }
  }

  *piIter = iCur;
  *ppIter = pCur;
}
308
309 /*
310 ** Advance the snippet iterator to the next candidate snippet.
311 */
312 static int fts3SnippetNextCandidate(SnippetIter *pIter){
313 int i; /* Loop counter */
314
315 if( pIter->iCurrent<0 ){
316 /* The SnippetIter object has just been initialized. The first snippet
317 ** candidate always starts at offset 0 (even if this candidate has a
318 ** score of 0.0).
319 */
320 pIter->iCurrent = 0;
321
322 /* Advance the 'head' iterator of each phrase to the first offset that
323 ** is greater than or equal to (iNext+nSnippet).
324 */
325 for(i=0; i<pIter->nPhrase; i++){
326 SnippetPhrase *pPhrase = &pIter->aPhrase[i];
327 fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, pIter->nSnippet);
328 }
329 }else{
330 int iStart;
331 int iEnd = 0x7FFFFFFF;
332
333 for(i=0; i<pIter->nPhrase; i++){
334 SnippetPhrase *pPhrase = &pIter->aPhrase[i];
335 if( pPhrase->pHead && pPhrase->iHead<iEnd ){
336 iEnd = pPhrase->iHead;
337 }
338 }
339 if( iEnd==0x7FFFFFFF ){
340 return 1;
341 }
342
343 pIter->iCurrent = iStart = iEnd - pIter->nSnippet + 1;
344 for(i=0; i<pIter->nPhrase; i++){
345 SnippetPhrase *pPhrase = &pIter->aPhrase[i];
346 fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, iEnd+1);
347 fts3SnippetAdvance(&pPhrase->pTail, &pPhrase->iTail, iStart);
348 }
349 }
350
351 return 0;
352 }
353
354 /*
355 ** Retrieve information about the current candidate snippet of snippet
356 ** iterator pIter.
357 */
358 static void fts3SnippetDetails(
359 SnippetIter *pIter, /* Snippet iterator */
360 u64 mCovered, /* Bitmask of phrases already covered */
361 int *piToken, /* OUT: First token of proposed snippet */
362 int *piScore, /* OUT: "Score" for this snippet */
363 u64 *pmCover, /* OUT: Bitmask of phrases covered */
364 u64 *pmHighlight /* OUT: Bitmask of terms to highlight */
365 ){
366 int iStart = pIter->iCurrent; /* First token of snippet */
367 int iScore = 0; /* Score of this snippet */
368 int i; /* Loop counter */
369 u64 mCover = 0; /* Mask of phrases covered by this snippet */
370 u64 mHighlight = 0; /* Mask of tokens to highlight in snippet */
371
372 for(i=0; i<pIter->nPhrase; i++){
373 SnippetPhrase *pPhrase = &pIter->aPhrase[i];
374 if( pPhrase->pTail ){
375 char *pCsr = pPhrase->pTail;
376 int iCsr = pPhrase->iTail;
377
378 while( iCsr<(iStart+pIter->nSnippet) ){
379 int j;
380 u64 mPhrase = (u64)1 << i;
381 u64 mPos = (u64)1 << (iCsr - iStart);
382 assert( iCsr>=iStart );
383 if( (mCover|mCovered)&mPhrase ){
384 iScore++;
385 }else{
386 iScore += 1000;
387 }
388 mCover |= mPhrase;
389
390 for(j=0; j<pPhrase->nToken; j++){
391 mHighlight |= (mPos>>j);
392 }
393
394 if( 0==(*pCsr & 0x0FE) ) break;
395 fts3GetDeltaPosition(&pCsr, &iCsr);
396 }
397 }
398 }
399
400 /* Set the output variables before returning. */
401 *piToken = iStart;
402 *piScore = iScore;
403 *pmCover = mCover;
404 *pmHighlight = mHighlight;
405 }
406
407 /*
408 ** This function is an fts3ExprIterate() callback used by fts3BestSnippet().
409 ** Each invocation populates an element of the SnippetIter.aPhrase[] array.
410 */
411 static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){
412 SnippetIter *p = (SnippetIter *)ctx;
413 SnippetPhrase *pPhrase = &p->aPhrase[iPhrase];
414 char *pCsr;
415
416 pPhrase->nToken = pExpr->pPhrase->nToken;
417
418 pCsr = sqlite3Fts3FindPositions(pExpr, p->pCsr->iPrevId, p->iCol);
419 if( pCsr ){
420 int iFirst = 0;
421 pPhrase->pList = pCsr;
422 fts3GetDeltaPosition(&pCsr, &iFirst);
423 pPhrase->pHead = pCsr;
424 pPhrase->pTail = pCsr;
425 pPhrase->iHead = iFirst;
426 pPhrase->iTail = iFirst;
427 }else{
428 assert( pPhrase->pList==0 && pPhrase->pHead==0 && pPhrase->pTail==0 );
429 }
430
431 return SQLITE_OK;
432 }
433
434 /*
435 ** Select the fragment of text consisting of nFragment contiguous tokens
436 ** from column iCol that represent the "best" snippet. The best snippet
437 ** is the snippet with the highest score, where scores are calculated
438 ** by adding:
439 **
440 ** (a) +1 point for each occurence of a matchable phrase in the snippet.
441 **
442 ** (b) +1000 points for the first occurence of each matchable phrase in
443 ** the snippet for which the corresponding mCovered bit is not set.
444 **
445 ** The selected snippet parameters are stored in structure *pFragment before
446 ** returning. The score of the selected snippet is stored in *piScore
447 ** before returning.
448 */
449 static int fts3BestSnippet(
450 int nSnippet, /* Desired snippet length */
451 Fts3Cursor *pCsr, /* Cursor to create snippet for */
452 int iCol, /* Index of column to create snippet from */
453 u64 mCovered, /* Mask of phrases already covered */
454 u64 *pmSeen, /* IN/OUT: Mask of phrases seen */
455 SnippetFragment *pFragment, /* OUT: Best snippet found */
456 int *piScore /* OUT: Score of snippet pFragment */
457 ){
458 int rc; /* Return Code */
459 int nList; /* Number of phrases in expression */
460 SnippetIter sIter; /* Iterates through snippet candidates */
461 int nByte; /* Number of bytes of space to allocate */
462 int iBestScore = -1; /* Best snippet score found so far */
463 int i; /* Loop counter */
464
465 memset(&sIter, 0, sizeof(sIter));
466
467 /* Iterate through the phrases in the expression to count them. The same
468 ** callback makes sure the doclists are loaded for each phrase.
469 */
470 rc = fts3ExprLoadDoclists(pCsr, &nList, 0);
471 if( rc!=SQLITE_OK ){
472 return rc;
473 }
474
475 /* Now that it is known how many phrases there are, allocate and zero
476 ** the required space using malloc().
477 */
478 nByte = sizeof(SnippetPhrase) * nList;
479 sIter.aPhrase = (SnippetPhrase *)sqlite3_malloc(nByte);
480 if( !sIter.aPhrase ){
481 return SQLITE_NOMEM;
482 }
483 memset(sIter.aPhrase, 0, nByte);
484
485 /* Initialize the contents of the SnippetIter object. Then iterate through
486 ** the set of phrases in the expression to populate the aPhrase[] array.
487 */
488 sIter.pCsr = pCsr;
489 sIter.iCol = iCol;
490 sIter.nSnippet = nSnippet;
491 sIter.nPhrase = nList;
492 sIter.iCurrent = -1;
493 (void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sIter);
494
495 /* Set the *pmSeen output variable. */
496 for(i=0; i<nList; i++){
497 if( sIter.aPhrase[i].pHead ){
498 *pmSeen |= (u64)1 << i;
499 }
500 }
501
502 /* Loop through all candidate snippets. Store the best snippet in
503 ** *pFragment. Store its associated 'score' in iBestScore.
504 */
505 pFragment->iCol = iCol;
506 while( !fts3SnippetNextCandidate(&sIter) ){
507 int iPos;
508 int iScore;
509 u64 mCover;
510 u64 mHighlight;
511 fts3SnippetDetails(&sIter, mCovered, &iPos, &iScore, &mCover, &mHighlight);
512 assert( iScore>=0 );
513 if( iScore>iBestScore ){
514 pFragment->iPos = iPos;
515 pFragment->hlmask = mHighlight;
516 pFragment->covered = mCover;
517 iBestScore = iScore;
518 }
519 }
520
521 sqlite3_free(sIter.aPhrase);
522 *piScore = iBestScore;
523 return SQLITE_OK;
524 }
525
526
527 /*
528 ** Append a string to the string-buffer passed as the first argument.
529 **
530 ** If nAppend is negative, then the length of the string zAppend is
531 ** determined using strlen().
532 */
533 static int fts3StringAppend(
534 StrBuffer *pStr, /* Buffer to append to */
535 const char *zAppend, /* Pointer to data to append to buffer */
536 int nAppend /* Size of zAppend in bytes (or -1) */
537 ){
538 if( nAppend<0 ){
539 nAppend = (int)strlen(zAppend);
540 }
541
542 /* If there is insufficient space allocated at StrBuffer.z, use realloc()
543 ** to grow the buffer until so that it is big enough to accomadate the
544 ** appended data.
545 */
546 if( pStr->n+nAppend+1>=pStr->nAlloc ){
547 int nAlloc = pStr->nAlloc+nAppend+100;
548 char *zNew = sqlite3_realloc(pStr->z, nAlloc);
549 if( !zNew ){
550 return SQLITE_NOMEM;
551 }
552 pStr->z = zNew;
553 pStr->nAlloc = nAlloc;
554 }
555
556 /* Append the data to the string buffer. */
557 memcpy(&pStr->z[pStr->n], zAppend, nAppend);
558 pStr->n += nAppend;
559 pStr->z[pStr->n] = '\0';
560
561 return SQLITE_OK;
562 }
563
564 /*
565 ** The fts3BestSnippet() function often selects snippets that end with a
566 ** query term. That is, the final term of the snippet is always a term
567 ** that requires highlighting. For example, if 'X' is a highlighted term
568 ** and '.' is a non-highlighted term, BestSnippet() may select:
569 **
570 ** ........X.....X
571 **
572 ** This function "shifts" the beginning of the snippet forward in the
573 ** document so that there are approximately the same number of
574 ** non-highlighted terms to the right of the final highlighted term as there
575 ** are to the left of the first highlighted term. For example, to this:
576 **
577 ** ....X.....X....
578 **
579 ** This is done as part of extracting the snippet text, not when selecting
580 ** the snippet. Snippet selection is done based on doclists only, so there
581 ** is no way for fts3BestSnippet() to know whether or not the document
582 ** actually contains terms that follow the final highlighted term.
583 */
584 static int fts3SnippetShift(
585 Fts3Table *pTab, /* FTS3 table snippet comes from */
586 int nSnippet, /* Number of tokens desired for snippet */
587 const char *zDoc, /* Document text to extract snippet from */
588 int nDoc, /* Size of buffer zDoc in bytes */
589 int *piPos, /* IN/OUT: First token of snippet */
590 u64 *pHlmask /* IN/OUT: Mask of tokens to highlight */
591 ){
592 u64 hlmask = *pHlmask; /* Local copy of initial highlight-mask */
593
594 if( hlmask ){
595 int nLeft; /* Tokens to the left of first highlight */
596 int nRight; /* Tokens to the right of last highlight */
597 int nDesired; /* Ideal number of tokens to shift forward */
598
599 for(nLeft=0; !(hlmask & ((u64)1 << nLeft)); nLeft++);
600 for(nRight=0; !(hlmask & ((u64)1 << (nSnippet-1-nRight))); nRight++);
601 nDesired = (nLeft-nRight)/2;
602
603 /* Ideally, the start of the snippet should be pushed forward in the
604 ** document nDesired tokens. This block checks if there are actually
605 ** nDesired tokens to the right of the snippet. If so, *piPos and
606 ** *pHlMask are updated to shift the snippet nDesired tokens to the
607 ** right. Otherwise, the snippet is shifted by the number of tokens
608 ** available.
609 */
610 if( nDesired>0 ){
611 int nShift; /* Number of tokens to shift snippet by */
612 int iCurrent = 0; /* Token counter */
613 int rc; /* Return Code */
614 sqlite3_tokenizer_module *pMod;
615 sqlite3_tokenizer_cursor *pC;
616 pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
617
618 /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired)
619 ** or more tokens in zDoc/nDoc.
620 */
621 rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
622 if( rc!=SQLITE_OK ){
623 return rc;
624 }
625 pC->pTokenizer = pTab->pTokenizer;
626 while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){
627 const char *ZDUMMY; int DUMMY1, DUMMY2, DUMMY3;
628 rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent);
629 }
630 pMod->xClose(pC);
631 if( rc!=SQLITE_OK && rc!=SQLITE_DONE ){ return rc; }
632
633 nShift = (rc==SQLITE_DONE)+iCurrent-nSnippet;
634 assert( nShift<=nDesired );
635 if( nShift>0 ){
636 *piPos += nShift;
637 *pHlmask = hlmask >> nShift;
638 }
639 }
640 }
641 return SQLITE_OK;
642 }
643
644 /*
645 ** Extract the snippet text for fragment pFragment from cursor pCsr and
646 ** append it to string buffer pOut.
647 */
648 static int fts3SnippetText(
649 Fts3Cursor *pCsr, /* FTS3 Cursor */
650 SnippetFragment *pFragment, /* Snippet to extract */
651 int iFragment, /* Fragment number */
652 int isLast, /* True for final fragment in snippet */
653 int nSnippet, /* Number of tokens in extracted snippet */
654 const char *zOpen, /* String inserted before highlighted term */
655 const char *zClose, /* String inserted after highlighted term */
656 const char *zEllipsis, /* String inserted between snippets */
657 StrBuffer *pOut /* Write output here */
658 ){
659 Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
660 int rc; /* Return code */
661 const char *zDoc; /* Document text to extract snippet from */
662 int nDoc; /* Size of zDoc in bytes */
663 int iCurrent = 0; /* Current token number of document */
664 int iEnd = 0; /* Byte offset of end of current token */
665 int isShiftDone = 0; /* True after snippet is shifted */
666 int iPos = pFragment->iPos; /* First token of snippet */
667 u64 hlmask = pFragment->hlmask; /* Highlight-mask for snippet */
668 int iCol = pFragment->iCol+1; /* Query column to extract text from */
669 sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */
670 sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor open on zDoc/nDoc */
671 const char *ZDUMMY; /* Dummy argument used with tokenizer */
672 int DUMMY1; /* Dummy argument used with tokenizer */
673
674 zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol);
675 if( zDoc==0 ){
676 if( sqlite3_column_type(pCsr->pStmt, iCol)!=SQLITE_NULL ){
677 return SQLITE_NOMEM;
678 }
679 return SQLITE_OK;
680 }
681 nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol);
682
683 /* Open a token cursor on the document. */
684 pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
685 rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
686 if( rc!=SQLITE_OK ){
687 return rc;
688 }
689 pC->pTokenizer = pTab->pTokenizer;
690
691 while( rc==SQLITE_OK ){
692 int iBegin; /* Offset in zDoc of start of token */
693 int iFin; /* Offset in zDoc of end of token */
694 int isHighlight; /* True for highlighted terms */
695
696 rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent);
697 if( rc!=SQLITE_OK ){
698 if( rc==SQLITE_DONE ){
699 /* Special case - the last token of the snippet is also the last token
700 ** of the column. Append any punctuation that occurred between the end
701 ** of the previous token and the end of the document to the output.
702 ** Then break out of the loop. */
703 rc = fts3StringAppend(pOut, &zDoc[iEnd], -1);
704 }
705 break;
706 }
707 if( iCurrent<iPos ){ continue; }
708
709 if( !isShiftDone ){
710 int n = nDoc - iBegin;
711 rc = fts3SnippetShift(pTab, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask);
712 isShiftDone = 1;
713
714 /* Now that the shift has been done, check if the initial "..." are
715 ** required. They are required if (a) this is not the first fragment,
716 ** or (b) this fragment does not begin at position 0 of its column.
717 */
718 if( rc==SQLITE_OK && (iPos>0 || iFragment>0) ){
719 rc = fts3StringAppend(pOut, zEllipsis, -1);
720 }
721 if( rc!=SQLITE_OK || iCurrent<iPos ) continue;
722 }
723
724 if( iCurrent>=(iPos+nSnippet) ){
725 if( isLast ){
726 rc = fts3StringAppend(pOut, zEllipsis, -1);
727 }
728 break;
729 }
730
731 /* Set isHighlight to true if this term should be highlighted. */
732 isHighlight = (hlmask & ((u64)1 << (iCurrent-iPos)))!=0;
733
734 if( iCurrent>iPos ) rc = fts3StringAppend(pOut, &zDoc[iEnd], iBegin-iEnd);
735 if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zOpen, -1);
736 if( rc==SQLITE_OK ) rc = fts3StringAppend(pOut, &zDoc[iBegin], iFin-iBegin);
737 if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zClose, -1);
738
739 iEnd = iFin;
740 }
741
742 pMod->xClose(pC);
743 return rc;
744 }
745
746
/*
** Count the entries of a column-list, which is a delta-encoded list of
** term offsets inside one column of one row.
**
** On entry *ppCollist must point at the first varint of the column-list
** (the varint holding the position of the first matching term in the
** column). On return *ppCollist points at the first byte after the last
** varint of the list: either the 0x00 that ends the position-list or the
** 0x01 that introduces the column number of the next column.
**
** The return value is the number of varints (entries) in the list.
*/
static int fts3ColumnlistCount(char **ppCollist){
  char *p = *ppCollist;           /* Read cursor */
  char cont = 0;                  /* 0x80 bit of the previous byte */
  int n = 0;                      /* Completed varints so far */

  for(;;){
    /* A 0x00 or 0x01 byte that does not continue a varint ends the list. */
    if( (0xFE & (*p | cont))==0 ) break;
    cont = *p & 0x80;
    p++;
    if( cont==0 ){
      n++;
    }
  }

  *ppCollist = p;
  return n;
}
774
775 static void fts3LoadColumnlistCounts(char **pp, u32 *aOut, int isGlobal){
776 char *pCsr = *pp;
777 while( *pCsr ){
778 int nHit;
779 sqlite3_int64 iCol = 0;
780 if( *pCsr==0x01 ){
781 pCsr++;
782 pCsr += sqlite3Fts3GetVarint(pCsr, &iCol);
783 }
784 nHit = fts3ColumnlistCount(&pCsr);
785 assert( nHit>0 );
786 if( isGlobal ){
787 aOut[iCol*3+1]++;
788 }
789 aOut[iCol*3] += nHit;
790 }
791 pCsr++;
792 *pp = pCsr;
793 }
794
795 /*
796 ** fts3ExprIterate() callback used to collect the "global" matchinfo stats
797 ** for a single query.
798 **
799 ** fts3ExprIterate() callback to load the 'global' elements of a
800 ** FTS3_MATCHINFO_HITS matchinfo array. The global stats are those elements
801 ** of the matchinfo array that are constant for all rows returned by the
802 ** current query.
803 **
804 ** Argument pCtx is actually a pointer to a struct of type MatchInfo. This
805 ** function populates Matchinfo.aMatchinfo[] as follows:
806 **
807 ** for(iCol=0; iCol<nCol; iCol++){
808 ** aMatchinfo[3*iPhrase*nCol + 3*iCol + 1] = X;
809 ** aMatchinfo[3*iPhrase*nCol + 3*iCol + 2] = Y;
810 ** }
811 **
812 ** where X is the number of matches for phrase iPhrase is column iCol of all
813 ** rows of the table. Y is the number of rows for which column iCol contains
814 ** at least one instance of phrase iPhrase.
815 **
816 ** If the phrase pExpr consists entirely of deferred tokens, then all X and
817 ** Y values are set to nDoc, where nDoc is the number of documents in the
818 ** file system. This is done because the full-text index doclist is required
819 ** to calculate these values properly, and the full-text index doclist is
820 ** not available for deferred tokens.
821 */
822 static int fts3ExprGlobalHitsCb(
823 Fts3Expr *pExpr, /* Phrase expression node */
824 int iPhrase, /* Phrase number (numbered from zero) */
825 void *pCtx /* Pointer to MatchInfo structure */
826 ){
827 MatchInfo *p = (MatchInfo *)pCtx;
828 Fts3Cursor *pCsr = p->pCursor;
829 char *pIter;
830 char *pEnd;
831 char *pFree = 0;
832 u32 *aOut = &p->aMatchinfo[3*iPhrase*p->nCol];
833
834 assert( pExpr->isLoaded );
835 assert( pExpr->eType==FTSQUERY_PHRASE );
836
837 if( pCsr->pDeferred ){
838 Fts3Phrase *pPhrase = pExpr->pPhrase;
839 int ii;
840 for(ii=0; ii<pPhrase->nToken; ii++){
841 if( pPhrase->aToken[ii].bFulltext ) break;
842 }
843 if( ii<pPhrase->nToken ){
844 int nFree = 0;
845 int rc = sqlite3Fts3ExprLoadFtDoclist(pCsr, pExpr, &pFree, &nFree);
846 if( rc!=SQLITE_OK ) return rc;
847 pIter = pFree;
848 pEnd = &pFree[nFree];
849 }else{
850 int iCol; /* Column index */
851 for(iCol=0; iCol<p->nCol; iCol++){
852 aOut[iCol*3 + 1] = (u32)p->nDoc;
853 aOut[iCol*3 + 2] = (u32)p->nDoc;
854 }
855 return SQLITE_OK;
856 }
857 }else{
858 pIter = pExpr->aDoclist;
859 pEnd = &pExpr->aDoclist[pExpr->nDoclist];
860 }
861
862 /* Fill in the global hit count matrix row for this phrase. */
863 while( pIter<pEnd ){
864 while( *pIter++ & 0x80 ); /* Skip past docid. */
865 fts3LoadColumnlistCounts(&pIter, &aOut[1], 1);
866 }
867
868 sqlite3_free(pFree);
869 return SQLITE_OK;
870 }
871
872 /*
873 ** fts3ExprIterate() callback used to collect the "local" part of the
874 ** FTS3_MATCHINFO_HITS array. The local stats are those elements of the
875 ** array that are different for each row returned by the query.
876 */
877 static int fts3ExprLocalHitsCb(
878 Fts3Expr *pExpr, /* Phrase expression node */
879 int iPhrase, /* Phrase number */
880 void *pCtx /* Pointer to MatchInfo structure */
881 ){
882 MatchInfo *p = (MatchInfo *)pCtx;
883 int iStart = iPhrase * p->nCol * 3;
884 int i;
885
886 for(i=0; i<p->nCol; i++) p->aMatchinfo[iStart+i*3] = 0;
887
888 if( pExpr->aDoclist ){
889 char *pCsr;
890
891 pCsr = sqlite3Fts3FindPositions(pExpr, p->pCursor->iPrevId, -1);
892 if( pCsr ){
893 fts3LoadColumnlistCounts(&pCsr, &p->aMatchinfo[iStart], 0);
894 }
895 }
896
897 return SQLITE_OK;
898 }
899
900 static int fts3MatchinfoCheck(
901 Fts3Table *pTab,
902 char cArg,
903 char **pzErr
904 ){
905 if( (cArg==FTS3_MATCHINFO_NPHRASE)
906 || (cArg==FTS3_MATCHINFO_NCOL)
907 || (cArg==FTS3_MATCHINFO_NDOC && pTab->bHasStat)
908 || (cArg==FTS3_MATCHINFO_AVGLENGTH && pTab->bHasStat)
909 || (cArg==FTS3_MATCHINFO_LENGTH && pTab->bHasDocsize)
910 || (cArg==FTS3_MATCHINFO_LCS)
911 || (cArg==FTS3_MATCHINFO_HITS)
912 ){
913 return SQLITE_OK;
914 }
915 *pzErr = sqlite3_mprintf("unrecognized matchinfo request: %c", cArg);
916 return SQLITE_ERROR;
917 }
918
919 static int fts3MatchinfoSize(MatchInfo *pInfo, char cArg){
920 int nVal; /* Number of integers output by cArg */
921
922 switch( cArg ){
923 case FTS3_MATCHINFO_NDOC:
924 case FTS3_MATCHINFO_NPHRASE:
925 case FTS3_MATCHINFO_NCOL:
926 nVal = 1;
927 break;
928
929 case FTS3_MATCHINFO_AVGLENGTH:
930 case FTS3_MATCHINFO_LENGTH:
931 case FTS3_MATCHINFO_LCS:
932 nVal = pInfo->nCol;
933 break;
934
935 default:
936 assert( cArg==FTS3_MATCHINFO_HITS );
937 nVal = pInfo->nCol * pInfo->nPhrase * 3;
938 break;
939 }
940
941 return nVal;
942 }
943
944 static int fts3MatchinfoSelectDoctotal(
945 Fts3Table *pTab,
946 sqlite3_stmt **ppStmt,
947 sqlite3_int64 *pnDoc,
948 const char **paLen
949 ){
950 sqlite3_stmt *pStmt;
951 const char *a;
952 sqlite3_int64 nDoc;
953
954 if( !*ppStmt ){
955 int rc = sqlite3Fts3SelectDoctotal(pTab, ppStmt);
956 if( rc!=SQLITE_OK ) return rc;
957 }
958 pStmt = *ppStmt;
959 assert( sqlite3_data_count(pStmt)==1 );
960
961 a = sqlite3_column_blob(pStmt, 0);
962 a += sqlite3Fts3GetVarint(a, &nDoc);
963 if( nDoc==0 ) return SQLITE_CORRUPT;
964 *pnDoc = (u32)nDoc;
965
966 if( paLen ) *paLen = a;
967 return SQLITE_OK;
968 }
969
970 /*
971 ** An instance of the following structure is used to store state while
972 ** iterating through a multi-column position-list corresponding to the
973 ** hits for a single phrase on a single row in order to calculate the
974 ** values for a matchinfo() FTS3_MATCHINFO_LCS request.
975 */
976 typedef struct LcsIterator LcsIterator;
977 struct LcsIterator {
978 Fts3Expr *pExpr; /* Pointer to phrase expression */
979 char *pRead; /* Cursor used to iterate through aDoclist */
980 int iPosOffset; /* Tokens count up to end of this phrase */
981 int iCol; /* Current column number */
982 int iPos; /* Current position */
983 };
984
/*
** If LcsIterator.iCol is set to the following value, the iterator has
** finished iterating through all offsets for all columns.
**
** The definition deliberately has no trailing semicolon so that the macro
** can be used inside expressions (comparisons, initializers) as well as
** in plain assignment statements.
*/
#define LCS_ITERATOR_FINISHED 0x7FFFFFFF
990
991 static int fts3MatchinfoLcsCb(
992 Fts3Expr *pExpr, /* Phrase expression node */
993 int iPhrase, /* Phrase number (numbered from zero) */
994 void *pCtx /* Pointer to MatchInfo structure */
995 ){
996 LcsIterator *aIter = (LcsIterator *)pCtx;
997 aIter[iPhrase].pExpr = pExpr;
998 return SQLITE_OK;
999 }
1000
1001 /*
1002 ** Advance the iterator passed as an argument to the next position. Return
1003 ** 1 if the iterator is at EOF or if it now points to the start of the
1004 ** position list for the next column.
1005 */
1006 static int fts3LcsIteratorAdvance(LcsIterator *pIter){
1007 char *pRead = pIter->pRead;
1008 sqlite3_int64 iRead;
1009 int rc = 0;
1010
1011 pRead += sqlite3Fts3GetVarint(pRead, &iRead);
1012 if( iRead==0 ){
1013 pIter->iCol = LCS_ITERATOR_FINISHED;
1014 rc = 1;
1015 }else{
1016 if( iRead==1 ){
1017 pRead += sqlite3Fts3GetVarint(pRead, &iRead);
1018 pIter->iCol = (int)iRead;
1019 pIter->iPos = pIter->iPosOffset;
1020 pRead += sqlite3Fts3GetVarint(pRead, &iRead);
1021 rc = 1;
1022 }
1023 pIter->iPos += (int)(iRead-2);
1024 }
1025
1026 pIter->pRead = pRead;
1027 return rc;
1028 }
1029
1030 /*
1031 ** This function implements the FTS3_MATCHINFO_LCS matchinfo() flag.
1032 **
1033 ** If the call is successful, the longest-common-substring lengths for each
1034 ** column are written into the first nCol elements of the pInfo->aMatchinfo[]
1035 ** array before returning. SQLITE_OK is returned in this case.
1036 **
1037 ** Otherwise, if an error occurs, an SQLite error code is returned and the
1038 ** data written to the first nCol elements of pInfo->aMatchinfo[] is
1039 ** undefined.
1040 */
1041 static int fts3MatchinfoLcs(Fts3Cursor *pCsr, MatchInfo *pInfo){
1042 LcsIterator *aIter;
1043 int i;
1044 int iCol;
1045 int nToken = 0;
1046
1047 /* Allocate and populate the array of LcsIterator objects. The array
1048 ** contains one element for each matchable phrase in the query.
1049 **/
1050 aIter = sqlite3_malloc(sizeof(LcsIterator) * pCsr->nPhrase);
1051 if( !aIter ) return SQLITE_NOMEM;
1052 memset(aIter, 0, sizeof(LcsIterator) * pCsr->nPhrase);
1053 (void)fts3ExprIterate(pCsr->pExpr, fts3MatchinfoLcsCb, (void*)aIter);
1054 for(i=0; i<pInfo->nPhrase; i++){
1055 LcsIterator *pIter = &aIter[i];
1056 nToken -= pIter->pExpr->pPhrase->nToken;
1057 pIter->iPosOffset = nToken;
1058 pIter->pRead = sqlite3Fts3FindPositions(pIter->pExpr, pCsr->iPrevId, -1);
1059 if( pIter->pRead ){
1060 pIter->iPos = pIter->iPosOffset;
1061 fts3LcsIteratorAdvance(&aIter[i]);
1062 }else{
1063 pIter->iCol = LCS_ITERATOR_FINISHED;
1064 }
1065 }
1066
1067 for(iCol=0; iCol<pInfo->nCol; iCol++){
1068 int nLcs = 0; /* LCS value for this column */
1069 int nLive = 0; /* Number of iterators in aIter not at EOF */
1070
1071 /* Loop through the iterators in aIter[]. Set nLive to the number of
1072 ** iterators that point to a position-list corresponding to column iCol.
1073 */
1074 for(i=0; i<pInfo->nPhrase; i++){
1075 assert( aIter[i].iCol>=iCol );
1076 if( aIter[i].iCol==iCol ) nLive++;
1077 }
1078
1079 /* The following loop runs until all iterators in aIter[] have finished
1080 ** iterating through positions in column iCol. Exactly one of the
1081 ** iterators is advanced each time the body of the loop is run.
1082 */
1083 while( nLive>0 ){
1084 LcsIterator *pAdv = 0; /* The iterator to advance by one position */
1085 int nThisLcs = 0; /* LCS for the current iterator positions */
1086
1087 for(i=0; i<pInfo->nPhrase; i++){
1088 LcsIterator *pIter = &aIter[i];
1089 if( iCol!=pIter->iCol ){
1090 /* This iterator is already at EOF for this column. */
1091 nThisLcs = 0;
1092 }else{
1093 if( pAdv==0 || pIter->iPos<pAdv->iPos ){
1094 pAdv = pIter;
1095 }
1096 if( nThisLcs==0 || pIter->iPos==pIter[-1].iPos ){
1097 nThisLcs++;
1098 }else{
1099 nThisLcs = 1;
1100 }
1101 if( nThisLcs>nLcs ) nLcs = nThisLcs;
1102 }
1103 }
1104 if( fts3LcsIteratorAdvance(pAdv) ) nLive--;
1105 }
1106
1107 pInfo->aMatchinfo[iCol] = nLcs;
1108 }
1109
1110 sqlite3_free(aIter);
1111 return SQLITE_OK;
1112 }
1113
1114 /*
1115 ** Populate the buffer pInfo->aMatchinfo[] with an array of integers to
1116 ** be returned by the matchinfo() function. Argument zArg contains the
1117 ** format string passed as the second argument to matchinfo (or the
1118 ** default value "pcx" if no second argument was specified). The format
1119 ** string has already been validated and the pInfo->aMatchinfo[] array
1120 ** is guaranteed to be large enough for the output.
1121 **
1122 ** If bGlobal is true, then populate all fields of the matchinfo() output.
1123 ** If it is false, then assume that those fields that do not change between
1124 ** rows (i.e. FTS3_MATCHINFO_NPHRASE, NCOL, NDOC, AVGLENGTH and part of HITS)
1125 ** have already been populated.
1126 **
1127 ** Return SQLITE_OK if successful, or an SQLite error code if an error
1128 ** occurs. If a value other than SQLITE_OK is returned, the state the
1129 ** pInfo->aMatchinfo[] buffer is left in is undefined.
1130 */
1131 static int fts3MatchinfoValues(
1132 Fts3Cursor *pCsr, /* FTS3 cursor object */
1133 int bGlobal, /* True to grab the global stats */
1134 MatchInfo *pInfo, /* Matchinfo context object */
1135 const char *zArg /* Matchinfo format string */
1136 ){
1137 int rc = SQLITE_OK;
1138 int i;
1139 Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
1140 sqlite3_stmt *pSelect = 0;
1141
1142 for(i=0; rc==SQLITE_OK && zArg[i]; i++){
1143
1144 switch( zArg[i] ){
1145 case FTS3_MATCHINFO_NPHRASE:
1146 if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nPhrase;
1147 break;
1148
1149 case FTS3_MATCHINFO_NCOL:
1150 if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nCol;
1151 break;
1152
1153 case FTS3_MATCHINFO_NDOC:
1154 if( bGlobal ){
1155 sqlite3_int64 nDoc;
1156 rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, 0);
1157 pInfo->aMatchinfo[0] = (u32)nDoc;
1158 }
1159 break;
1160
1161 case FTS3_MATCHINFO_AVGLENGTH:
1162 if( bGlobal ){
1163 sqlite3_int64 nDoc; /* Number of rows in table */
1164 const char *a; /* Aggregate column length array */
1165
1166 rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, &a);
1167 if( rc==SQLITE_OK ){
1168 int iCol;
1169 for(iCol=0; iCol<pInfo->nCol; iCol++){
1170 u32 iVal;
1171 sqlite3_int64 nToken;
1172 a += sqlite3Fts3GetVarint(a, &nToken);
1173 iVal = (u32)(((u32)(nToken&0xffffffff)+nDoc/2)/nDoc);
1174 pInfo->aMatchinfo[iCol] = iVal;
1175 }
1176 }
1177 }
1178 break;
1179
1180 case FTS3_MATCHINFO_LENGTH: {
1181 sqlite3_stmt *pSelectDocsize = 0;
1182 rc = sqlite3Fts3SelectDocsize(pTab, pCsr->iPrevId, &pSelectDocsize);
1183 if( rc==SQLITE_OK ){
1184 int iCol;
1185 const char *a = sqlite3_column_blob(pSelectDocsize, 0);
1186 for(iCol=0; iCol<pInfo->nCol; iCol++){
1187 sqlite3_int64 nToken;
1188 a += sqlite3Fts3GetVarint(a, &nToken);
1189 pInfo->aMatchinfo[iCol] = (u32)nToken;
1190 }
1191 }
1192 sqlite3_reset(pSelectDocsize);
1193 break;
1194 }
1195
1196 case FTS3_MATCHINFO_LCS:
1197 rc = fts3ExprLoadDoclists(pCsr, 0, 0);
1198 if( rc==SQLITE_OK ){
1199 rc = fts3MatchinfoLcs(pCsr, pInfo);
1200 }
1201 break;
1202
1203 default: {
1204 Fts3Expr *pExpr;
1205 assert( zArg[i]==FTS3_MATCHINFO_HITS );
1206 pExpr = pCsr->pExpr;
1207 rc = fts3ExprLoadDoclists(pCsr, 0, 0);
1208 if( rc!=SQLITE_OK ) break;
1209 if( bGlobal ){
1210 if( pCsr->pDeferred ){
1211 rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &pInfo->nDoc, 0);
1212 if( rc!=SQLITE_OK ) break;
1213 }
1214 rc = fts3ExprIterate(pExpr, fts3ExprGlobalHitsCb,(void*)pInfo);
1215 if( rc!=SQLITE_OK ) break;
1216 }
1217 (void)fts3ExprIterate(pExpr, fts3ExprLocalHitsCb,(void*)pInfo);
1218 break;
1219 }
1220 }
1221
1222 pInfo->aMatchinfo += fts3MatchinfoSize(pInfo, zArg[i]);
1223 }
1224
1225 sqlite3_reset(pSelect);
1226 return rc;
1227 }
1228
1229
1230 /*
1231 ** Populate pCsr->aMatchinfo[] with data for the current row. The
1232 ** 'matchinfo' data is an array of 32-bit unsigned integers (C type u32).
1233 */
1234 static int fts3GetMatchinfo(
1235 Fts3Cursor *pCsr, /* FTS3 Cursor object */
1236 const char *zArg /* Second argument to matchinfo() function */
1237 ){
1238 MatchInfo sInfo;
1239 Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
1240 int rc = SQLITE_OK;
1241 int bGlobal = 0; /* Collect 'global' stats as well as local */
1242
1243 memset(&sInfo, 0, sizeof(MatchInfo));
1244 sInfo.pCursor = pCsr;
1245 sInfo.nCol = pTab->nColumn;
1246
1247 /* If there is cached matchinfo() data, but the format string for the
1248 ** cache does not match the format string for this request, discard
1249 ** the cached data. */
1250 if( pCsr->zMatchinfo && strcmp(pCsr->zMatchinfo, zArg) ){
1251 assert( pCsr->aMatchinfo );
1252 sqlite3_free(pCsr->aMatchinfo);
1253 pCsr->zMatchinfo = 0;
1254 pCsr->aMatchinfo = 0;
1255 }
1256
1257 /* If Fts3Cursor.aMatchinfo[] is NULL, then this is the first time the
1258 ** matchinfo function has been called for this query. In this case
1259 ** allocate the array used to accumulate the matchinfo data and
1260 ** initialize those elements that are constant for every row.
1261 */
1262 if( pCsr->aMatchinfo==0 ){
1263 int nMatchinfo = 0; /* Number of u32 elements in match-info */
1264 int nArg; /* Bytes in zArg */
1265 int i; /* Used to iterate through zArg */
1266
1267 /* Determine the number of phrases in the query */
1268 pCsr->nPhrase = fts3ExprPhraseCount(pCsr->pExpr);
1269 sInfo.nPhrase = pCsr->nPhrase;
1270
1271 /* Determine the number of integers in the buffer returned by this call. */
1272 for(i=0; zArg[i]; i++){
1273 nMatchinfo += fts3MatchinfoSize(&sInfo, zArg[i]);
1274 }
1275
1276 /* Allocate space for Fts3Cursor.aMatchinfo[] and Fts3Cursor.zMatchinfo. */
1277 nArg = (int)strlen(zArg);
1278 pCsr->aMatchinfo = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo + nArg + 1);
1279 if( !pCsr->aMatchinfo ) return SQLITE_NOMEM;
1280
1281 pCsr->zMatchinfo = (char *)&pCsr->aMatchinfo[nMatchinfo];
1282 pCsr->nMatchinfo = nMatchinfo;
1283 memcpy(pCsr->zMatchinfo, zArg, nArg+1);
1284 memset(pCsr->aMatchinfo, 0, sizeof(u32)*nMatchinfo);
1285 pCsr->isMatchinfoNeeded = 1;
1286 bGlobal = 1;
1287 }
1288
1289 sInfo.aMatchinfo = pCsr->aMatchinfo;
1290 sInfo.nPhrase = pCsr->nPhrase;
1291 if( pCsr->isMatchinfoNeeded ){
1292 rc = fts3MatchinfoValues(pCsr, bGlobal, &sInfo, zArg);
1293 pCsr->isMatchinfoNeeded = 0;
1294 }
1295
1296 return rc;
1297 }
1298
1299 /*
1300 ** Implementation of snippet() function.
1301 */
1302 void sqlite3Fts3Snippet(
1303 sqlite3_context *pCtx, /* SQLite function call context */
1304 Fts3Cursor *pCsr, /* Cursor object */
1305 const char *zStart, /* Snippet start text - "<b>" */
1306 const char *zEnd, /* Snippet end text - "</b>" */
1307 const char *zEllipsis, /* Snippet ellipsis text - "<b>...</b>" */
1308 int iCol, /* Extract snippet from this column */
1309 int nToken /* Approximate number of tokens in snippet */
1310 ){
1311 Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
1312 int rc = SQLITE_OK;
1313 int i;
1314 StrBuffer res = {0, 0, 0};
1315
1316 /* The returned text includes up to four fragments of text extracted from
1317 ** the data in the current row. The first iteration of the for(...) loop
1318 ** below attempts to locate a single fragment of text nToken tokens in
1319 ** size that contains at least one instance of all phrases in the query
1320 ** expression that appear in the current row. If such a fragment of text
1321 ** cannot be found, the second iteration of the loop attempts to locate
1322 ** a pair of fragments, and so on.
1323 */
1324 int nSnippet = 0; /* Number of fragments in this snippet */
1325 SnippetFragment aSnippet[4]; /* Maximum of 4 fragments per snippet */
1326 int nFToken = -1; /* Number of tokens in each fragment */
1327
1328 if( !pCsr->pExpr ){
1329 sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
1330 return;
1331 }
1332
1333 for(nSnippet=1; 1; nSnippet++){
1334
1335 int iSnip; /* Loop counter 0..nSnippet-1 */
1336 u64 mCovered = 0; /* Bitmask of phrases covered by snippet */
1337 u64 mSeen = 0; /* Bitmask of phrases seen by BestSnippet() */
1338
1339 if( nToken>=0 ){
1340 nFToken = (nToken+nSnippet-1) / nSnippet;
1341 }else{
1342 nFToken = -1 * nToken;
1343 }
1344
1345 for(iSnip=0; iSnip<nSnippet; iSnip++){
1346 int iBestScore = -1; /* Best score of columns checked so far */
1347 int iRead; /* Used to iterate through columns */
1348 SnippetFragment *pFragment = &aSnippet[iSnip];
1349
1350 memset(pFragment, 0, sizeof(*pFragment));
1351
1352 /* Loop through all columns of the table being considered for snippets.
1353 ** If the iCol argument to this function was negative, this means all
1354 ** columns of the FTS3 table. Otherwise, only column iCol is considered.
1355 */
1356 for(iRead=0; iRead<pTab->nColumn; iRead++){
1357 SnippetFragment sF = {0, 0, 0, 0};
1358 int iS;
1359 if( iCol>=0 && iRead!=iCol ) continue;
1360
1361 /* Find the best snippet of nFToken tokens in column iRead. */
1362 rc = fts3BestSnippet(nFToken, pCsr, iRead, mCovered, &mSeen, &sF, &iS);
1363 if( rc!=SQLITE_OK ){
1364 goto snippet_out;
1365 }
1366 if( iS>iBestScore ){
1367 *pFragment = sF;
1368 iBestScore = iS;
1369 }
1370 }
1371
1372 mCovered |= pFragment->covered;
1373 }
1374
1375 /* If all query phrases seen by fts3BestSnippet() are present in at least
1376 ** one of the nSnippet snippet fragments, break out of the loop.
1377 */
1378 assert( (mCovered&mSeen)==mCovered );
1379 if( mSeen==mCovered || nSnippet==SizeofArray(aSnippet) ) break;
1380 }
1381
1382 assert( nFToken>0 );
1383
1384 for(i=0; i<nSnippet && rc==SQLITE_OK; i++){
1385 rc = fts3SnippetText(pCsr, &aSnippet[i],
1386 i, (i==nSnippet-1), nFToken, zStart, zEnd, zEllipsis, &res
1387 );
1388 }
1389
1390 snippet_out:
1391 sqlite3Fts3SegmentsClose(pTab);
1392 if( rc!=SQLITE_OK ){
1393 sqlite3_result_error_code(pCtx, rc);
1394 sqlite3_free(res.z);
1395 }else{
1396 sqlite3_result_text(pCtx, res.z, -1, sqlite3_free);
1397 }
1398 }
1399
1400
1401 typedef struct TermOffset TermOffset;
1402 typedef struct TermOffsetCtx TermOffsetCtx;
1403
1404 struct TermOffset {
1405 char *pList; /* Position-list */
1406 int iPos; /* Position just read from pList */
1407 int iOff; /* Offset of this term from read positions */
1408 };
1409
1410 struct TermOffsetCtx {
1411 int iCol; /* Column of table to populate aTerm for */
1412 int iTerm;
1413 sqlite3_int64 iDocid;
1414 TermOffset *aTerm;
1415 };
1416
1417 /*
1418 ** This function is an fts3ExprIterate() callback used by sqlite3Fts3Offsets().
1419 */
1420 static int fts3ExprTermOffsetInit(Fts3Expr *pExpr, int iPhrase, void *ctx){
1421 TermOffsetCtx *p = (TermOffsetCtx *)ctx;
1422 int nTerm; /* Number of tokens in phrase */
1423 int iTerm; /* For looping through nTerm phrase terms */
1424 char *pList; /* Pointer to position list for phrase */
1425 int iPos = 0; /* First position in position-list */
1426
1427 UNUSED_PARAMETER(iPhrase);
1428 pList = sqlite3Fts3FindPositions(pExpr, p->iDocid, p->iCol);
1429 nTerm = pExpr->pPhrase->nToken;
1430 if( pList ){
1431 fts3GetDeltaPosition(&pList, &iPos);
1432 assert( iPos>=0 );
1433 }
1434
1435 for(iTerm=0; iTerm<nTerm; iTerm++){
1436 TermOffset *pT = &p->aTerm[p->iTerm++];
1437 pT->iOff = nTerm-iTerm-1;
1438 pT->pList = pList;
1439 pT->iPos = iPos;
1440 }
1441
1442 return SQLITE_OK;
1443 }
1444
1445 /*
1446 ** Implementation of offsets() function.
1447 */
1448 void sqlite3Fts3Offsets(
1449 sqlite3_context *pCtx, /* SQLite function call context */
1450 Fts3Cursor *pCsr /* Cursor object */
1451 ){
1452 Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
1453 sqlite3_tokenizer_module const *pMod = pTab->pTokenizer->pModule;
1454 const char *ZDUMMY; /* Dummy argument used with xNext() */
1455 int NDUMMY; /* Dummy argument used with xNext() */
1456 int rc; /* Return Code */
1457 int nToken; /* Number of tokens in query */
1458 int iCol; /* Column currently being processed */
1459 StrBuffer res = {0, 0, 0}; /* Result string */
1460 TermOffsetCtx sCtx; /* Context for fts3ExprTermOffsetInit() */
1461
1462 if( !pCsr->pExpr ){
1463 sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
1464 return;
1465 }
1466
1467 memset(&sCtx, 0, sizeof(sCtx));
1468 assert( pCsr->isRequireSeek==0 );
1469
1470 /* Count the number of terms in the query */
1471 rc = fts3ExprLoadDoclists(pCsr, 0, &nToken);
1472 if( rc!=SQLITE_OK ) goto offsets_out;
1473
1474 /* Allocate the array of TermOffset iterators. */
1475 sCtx.aTerm = (TermOffset *)sqlite3_malloc(sizeof(TermOffset)*nToken);
1476 if( 0==sCtx.aTerm ){
1477 rc = SQLITE_NOMEM;
1478 goto offsets_out;
1479 }
1480 sCtx.iDocid = pCsr->iPrevId;
1481
1482 /* Loop through the table columns, appending offset information to
1483 ** string-buffer res for each column.
1484 */
1485 for(iCol=0; iCol<pTab->nColumn; iCol++){
1486 sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor */
1487 int iStart;
1488 int iEnd;
1489 int iCurrent;
1490 const char *zDoc;
1491 int nDoc;
1492
1493 /* Initialize the contents of sCtx.aTerm[] for column iCol. There is
1494 ** no way that this operation can fail, so the return code from
1495 ** fts3ExprIterate() can be discarded.
1496 */
1497 sCtx.iCol = iCol;
1498 sCtx.iTerm = 0;
1499 (void)fts3ExprIterate(pCsr->pExpr, fts3ExprTermOffsetInit, (void *)&sCtx);
1500
1501 /* Retreive the text stored in column iCol. If an SQL NULL is stored
1502 ** in column iCol, jump immediately to the next iteration of the loop.
1503 ** If an OOM occurs while retrieving the data (this can happen if SQLite
1504 ** needs to transform the data from utf-16 to utf-8), return SQLITE_NOMEM
1505 ** to the caller.
1506 */
1507 zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol+1);
1508 nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol+1);
1509 if( zDoc==0 ){
1510 if( sqlite3_column_type(pCsr->pStmt, iCol+1)==SQLITE_NULL ){
1511 continue;
1512 }
1513 rc = SQLITE_NOMEM;
1514 goto offsets_out;
1515 }
1516
1517 /* Initialize a tokenizer iterator to iterate through column iCol. */
1518 rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
1519 if( rc!=SQLITE_OK ) goto offsets_out;
1520 pC->pTokenizer = pTab->pTokenizer;
1521
1522 rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
1523 while( rc==SQLITE_OK ){
1524 int i; /* Used to loop through terms */
1525 int iMinPos = 0x7FFFFFFF; /* Position of next token */
1526 TermOffset *pTerm = 0; /* TermOffset associated with next token */
1527
1528 for(i=0; i<nToken; i++){
1529 TermOffset *pT = &sCtx.aTerm[i];
1530 if( pT->pList && (pT->iPos-pT->iOff)<iMinPos ){
1531 iMinPos = pT->iPos-pT->iOff;
1532 pTerm = pT;
1533 }
1534 }
1535
1536 if( !pTerm ){
1537 /* All offsets for this column have been gathered. */
1538 break;
1539 }else{
1540 assert( iCurrent<=iMinPos );
1541 if( 0==(0xFE&*pTerm->pList) ){
1542 pTerm->pList = 0;
1543 }else{
1544 fts3GetDeltaPosition(&pTerm->pList, &pTerm->iPos);
1545 }
1546 while( rc==SQLITE_OK && iCurrent<iMinPos ){
1547 rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
1548 }
1549 if( rc==SQLITE_OK ){
1550 char aBuffer[64];
1551 sqlite3_snprintf(sizeof(aBuffer), aBuffer,
1552 "%d %d %d %d ", iCol, pTerm-sCtx.aTerm, iStart, iEnd-iStart
1553 );
1554 rc = fts3StringAppend(&res, aBuffer, -1);
1555 }else if( rc==SQLITE_DONE ){
1556 rc = SQLITE_CORRUPT;
1557 }
1558 }
1559 }
1560 if( rc==SQLITE_DONE ){
1561 rc = SQLITE_OK;
1562 }
1563
1564 pMod->xClose(pC);
1565 if( rc!=SQLITE_OK ) goto offsets_out;
1566 }
1567
1568 offsets_out:
1569 sqlite3_free(sCtx.aTerm);
1570 assert( rc!=SQLITE_DONE );
1571 sqlite3Fts3SegmentsClose(pTab);
1572 if( rc!=SQLITE_OK ){
1573 sqlite3_result_error_code(pCtx, rc);
1574 sqlite3_free(res.z);
1575 }else{
1576 sqlite3_result_text(pCtx, res.z, res.n-1, sqlite3_free);
1577 }
1578 return;
1579 }
1580
1581 /*
1582 ** Implementation of matchinfo() function.
1583 */
1584 void sqlite3Fts3Matchinfo(
1585 sqlite3_context *pContext, /* Function call context */
1586 Fts3Cursor *pCsr, /* FTS3 table cursor */
1587 const char *zArg /* Second arg to matchinfo() function */
1588 ){
1589 Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
1590 int rc;
1591 int i;
1592 const char *zFormat;
1593
1594 if( zArg ){
1595 for(i=0; zArg[i]; i++){
1596 char *zErr = 0;
1597 if( fts3MatchinfoCheck(pTab, zArg[i], &zErr) ){
1598 sqlite3_result_error(pContext, zErr, -1);
1599 sqlite3_free(zErr);
1600 return;
1601 }
1602 }
1603 zFormat = zArg;
1604 }else{
1605 zFormat = FTS3_MATCHINFO_DEFAULT;
1606 }
1607
1608 if( !pCsr->pExpr ){
1609 sqlite3_result_blob(pContext, "", 0, SQLITE_STATIC);
1610 return;
1611 }
1612
1613 /* Retrieve matchinfo() data. */
1614 rc = fts3GetMatchinfo(pCsr, zFormat);
1615 sqlite3Fts3SegmentsClose(pTab);
1616
1617 if( rc!=SQLITE_OK ){
1618 sqlite3_result_error_code(pContext, rc);
1619 }else{
1620 int n = pCsr->nMatchinfo * sizeof(u32);
1621 sqlite3_result_blob(pContext, pCsr->aMatchinfo, n, SQLITE_TRANSIENT);
1622 }
1623 }
1624
1625 #endif
OLDNEW
« no previous file with comments | « third_party/sqlite/src/ext/fts3/fts3_porter.c ('k') | third_party/sqlite/src/ext/fts3/fts3_tokenizer.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698