Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(92)

Side by Side Diff: third_party/sqlite/sqlite-src-3100200/ext/fts5/fts5_aux.c

Issue 1610543003: [sql] Import reference version of SQLite 3.10.2. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 ** 2014 May 31
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 */
13
14
15 #include "fts5Int.h"
16 #include <math.h> /* amalgamator: keep */
17
18 /*
19 ** Object used to iterate through all "coalesced phrase instances" in
20 ** a single column of the current row. If the phrase instances in the
21 ** column being considered do not overlap, this object simply iterates
22 ** through them. Or, if they do overlap (share one or more tokens in
23 ** common), each set of overlapping instances is treated as a single
24 ** match. See documentation for the highlight() auxiliary function for
25 ** details.
26 **
27 ** Usage is:
28 **
29 ** for(rc = fts5CInstIterNext(pApi, pFts, iCol, &iter);
30 ** (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter);
31 ** rc = fts5CInstIterNext(&iter)
32 ** ){
33 ** printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd);
34 ** }
35 **
36 */
37 typedef struct CInstIter CInstIter;
38 struct CInstIter {
39 const Fts5ExtensionApi *pApi; /* API offered by current FTS version */
40 Fts5Context *pFts; /* First arg to pass to pApi functions */
41 int iCol; /* Column to search */
42 int iInst; /* Next phrase instance index */
43 int nInst; /* Total number of phrase instances */
44
45 /* Output variables */
46 int iStart; /* First token in coalesced phrase instance */
47 int iEnd; /* Last token in coalesced phrase instance */
48 };
49
50 /*
51 ** Advance the iterator to the next coalesced phrase instance. Return
52 ** an SQLite error code if an error occurs, or SQLITE_OK otherwise.
53 */
54 static int fts5CInstIterNext(CInstIter *pIter){
55 int rc = SQLITE_OK;
56 pIter->iStart = -1;
57 pIter->iEnd = -1;
58
59 while( rc==SQLITE_OK && pIter->iInst<pIter->nInst ){
60 int ip; int ic; int io;
61 rc = pIter->pApi->xInst(pIter->pFts, pIter->iInst, &ip, &ic, &io);
62 if( rc==SQLITE_OK ){
63 if( ic==pIter->iCol ){
64 int iEnd = io - 1 + pIter->pApi->xPhraseSize(pIter->pFts, ip);
65 if( pIter->iStart<0 ){
66 pIter->iStart = io;
67 pIter->iEnd = iEnd;
68 }else if( io<=pIter->iEnd ){
69 if( iEnd>pIter->iEnd ) pIter->iEnd = iEnd;
70 }else{
71 break;
72 }
73 }
74 pIter->iInst++;
75 }
76 }
77
78 return rc;
79 }
80
81 /*
82 ** Initialize the iterator object indicated by the final parameter to
83 ** iterate through coalesced phrase instances in column iCol.
84 */
85 static int fts5CInstIterInit(
86 const Fts5ExtensionApi *pApi,
87 Fts5Context *pFts,
88 int iCol,
89 CInstIter *pIter
90 ){
91 int rc;
92
93 memset(pIter, 0, sizeof(CInstIter));
94 pIter->pApi = pApi;
95 pIter->pFts = pFts;
96 pIter->iCol = iCol;
97 rc = pApi->xInstCount(pFts, &pIter->nInst);
98
99 if( rc==SQLITE_OK ){
100 rc = fts5CInstIterNext(pIter);
101 }
102
103 return rc;
104 }
105
106
107
108 /*************************************************************************
109 ** Start of highlight() implementation.
110 */
111 typedef struct HighlightContext HighlightContext;
112 struct HighlightContext {
113 CInstIter iter; /* Coalesced Instance Iterator */
114 int iPos; /* Current token offset in zIn[] */
115 int iRangeStart; /* First token to include */
116 int iRangeEnd; /* If non-zero, last token to include */
117 const char *zOpen; /* Opening highlight */
118 const char *zClose; /* Closing highlight */
119 const char *zIn; /* Input text */
120 int nIn; /* Size of input text in bytes */
121 int iOff; /* Current offset within zIn[] */
122 char *zOut; /* Output value */
123 };
124
125 /*
126 ** Append text to the HighlightContext output string - p->zOut. Argument
127 ** z points to a buffer containing n bytes of text to append. If n is
128 ** negative, everything up until the first '\0' is appended to the output.
129 **
130 ** If *pRc is set to any value other than SQLITE_OK when this function is
131 ** called, it is a no-op. If an error (i.e. an OOM condition) is encountered,
132 ** *pRc is set to an error code before returning.
133 */
134 static void fts5HighlightAppend(
135 int *pRc,
136 HighlightContext *p,
137 const char *z, int n
138 ){
139 if( *pRc==SQLITE_OK ){
140 if( n<0 ) n = (int)strlen(z);
141 p->zOut = sqlite3_mprintf("%z%.*s", p->zOut, n, z);
142 if( p->zOut==0 ) *pRc = SQLITE_NOMEM;
143 }
144 }
145
146 /*
147 ** Tokenizer callback used by implementation of highlight() function.
148 */
149 static int fts5HighlightCb(
150 void *pContext, /* Pointer to HighlightContext object */
151 int tflags, /* Mask of FTS5_TOKEN_* flags */
152 const char *pToken, /* Buffer containing token */
153 int nToken, /* Size of token in bytes */
154 int iStartOff, /* Start offset of token */
155 int iEndOff /* End offset of token */
156 ){
157 HighlightContext *p = (HighlightContext*)pContext;
158 int rc = SQLITE_OK;
159 int iPos;
160
161 if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK;
162 iPos = p->iPos++;
163
164 if( p->iRangeEnd>0 ){
165 if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
166 if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
167 }
168
169 if( iPos==p->iter.iStart ){
170 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff);
171 fts5HighlightAppend(&rc, p, p->zOpen, -1);
172 p->iOff = iStartOff;
173 }
174
175 if( iPos==p->iter.iEnd ){
176 if( p->iRangeEnd && p->iter.iStart<p->iRangeStart ){
177 fts5HighlightAppend(&rc, p, p->zOpen, -1);
178 }
179 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
180 fts5HighlightAppend(&rc, p, p->zClose, -1);
181 p->iOff = iEndOff;
182 if( rc==SQLITE_OK ){
183 rc = fts5CInstIterNext(&p->iter);
184 }
185 }
186
187 if( p->iRangeEnd>0 && iPos==p->iRangeEnd ){
188 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
189 p->iOff = iEndOff;
190 if( iPos<p->iter.iEnd ){
191 fts5HighlightAppend(&rc, p, p->zClose, -1);
192 }
193 }
194
195 return rc;
196 }
197
198 /*
199 ** Implementation of highlight() function.
200 */
201 static void fts5HighlightFunction(
202 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
203 Fts5Context *pFts, /* First arg to pass to pApi functions */
204 sqlite3_context *pCtx, /* Context for returning result/error */
205 int nVal, /* Number of values in apVal[] array */
206 sqlite3_value **apVal /* Array of trailing arguments */
207 ){
208 HighlightContext ctx;
209 int rc;
210 int iCol;
211
212 if( nVal!=3 ){
213 const char *zErr = "wrong number of arguments to function highlight()";
214 sqlite3_result_error(pCtx, zErr, -1);
215 return;
216 }
217
218 iCol = sqlite3_value_int(apVal[0]);
219 memset(&ctx, 0, sizeof(HighlightContext));
220 ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]);
221 ctx.zClose = (const char*)sqlite3_value_text(apVal[2]);
222 rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn);
223
224 if( ctx.zIn ){
225 if( rc==SQLITE_OK ){
226 rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter);
227 }
228
229 if( rc==SQLITE_OK ){
230 rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb);
231 }
232 fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
233
234 if( rc==SQLITE_OK ){
235 sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
236 }
237 sqlite3_free(ctx.zOut);
238 }
239 if( rc!=SQLITE_OK ){
240 sqlite3_result_error_code(pCtx, rc);
241 }
242 }
243 /*
244 ** End of highlight() implementation.
245 **************************************************************************/
246
247 /*
248 ** Implementation of snippet() function.
249 */
250 static void fts5SnippetFunction(
251 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
252 Fts5Context *pFts, /* First arg to pass to pApi functions */
253 sqlite3_context *pCtx, /* Context for returning result/error */
254 int nVal, /* Number of values in apVal[] array */
255 sqlite3_value **apVal /* Array of trailing arguments */
256 ){
257 HighlightContext ctx;
258 int rc = SQLITE_OK; /* Return code */
259 int iCol; /* 1st argument to snippet() */
260 const char *zEllips; /* 4th argument to snippet() */
261 int nToken; /* 5th argument to snippet() */
262 int nInst = 0; /* Number of instance matches this row */
263 int i; /* Used to iterate through instances */
264 int nPhrase; /* Number of phrases in query */
265 unsigned char *aSeen; /* Array of "seen instance" flags */
266 int iBestCol; /* Column containing best snippet */
267 int iBestStart = 0; /* First token of best snippet */
268 int iBestLast; /* Last token of best snippet */
269 int nBestScore = 0; /* Score of best snippet */
270 int nColSize = 0; /* Total size of iBestCol in tokens */
271
272 if( nVal!=5 ){
273 const char *zErr = "wrong number of arguments to function snippet()";
274 sqlite3_result_error(pCtx, zErr, -1);
275 return;
276 }
277
278 memset(&ctx, 0, sizeof(HighlightContext));
279 iCol = sqlite3_value_int(apVal[0]);
280 ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]);
281 ctx.zClose = (const char*)sqlite3_value_text(apVal[2]);
282 zEllips = (const char*)sqlite3_value_text(apVal[3]);
283 nToken = sqlite3_value_int(apVal[4]);
284 iBestLast = nToken-1;
285
286 iBestCol = (iCol>=0 ? iCol : 0);
287 nPhrase = pApi->xPhraseCount(pFts);
288 aSeen = sqlite3_malloc(nPhrase);
289 if( aSeen==0 ){
290 rc = SQLITE_NOMEM;
291 }
292
293 if( rc==SQLITE_OK ){
294 rc = pApi->xInstCount(pFts, &nInst);
295 }
296 for(i=0; rc==SQLITE_OK && i<nInst; i++){
297 int ip, iSnippetCol, iStart;
298 memset(aSeen, 0, nPhrase);
299 rc = pApi->xInst(pFts, i, &ip, &iSnippetCol, &iStart);
300 if( rc==SQLITE_OK && (iCol<0 || iSnippetCol==iCol) ){
301 int nScore = 1000;
302 int iLast = iStart - 1 + pApi->xPhraseSize(pFts, ip);
303 int j;
304 aSeen[ip] = 1;
305
306 for(j=i+1; rc==SQLITE_OK && j<nInst; j++){
307 int ic; int io; int iFinal;
308 rc = pApi->xInst(pFts, j, &ip, &ic, &io);
309 iFinal = io + pApi->xPhraseSize(pFts, ip) - 1;
310 if( rc==SQLITE_OK && ic==iSnippetCol && iLast<iStart+nToken ){
311 nScore += aSeen[ip] ? 1000 : 1;
312 aSeen[ip] = 1;
313 if( iFinal>iLast ) iLast = iFinal;
314 }
315 }
316
317 if( rc==SQLITE_OK && nScore>nBestScore ){
318 iBestCol = iSnippetCol;
319 iBestStart = iStart;
320 iBestLast = iLast;
321 nBestScore = nScore;
322 }
323 }
324 }
325
326 if( rc==SQLITE_OK ){
327 rc = pApi->xColumnSize(pFts, iBestCol, &nColSize);
328 }
329 if( rc==SQLITE_OK ){
330 rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn);
331 }
332 if( ctx.zIn ){
333 if( rc==SQLITE_OK ){
334 rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter);
335 }
336
337 if( (iBestStart+nToken-1)>iBestLast ){
338 iBestStart -= (iBestStart+nToken-1-iBestLast) / 2;
339 }
340 if( iBestStart+nToken>nColSize ){
341 iBestStart = nColSize - nToken;
342 }
343 if( iBestStart<0 ) iBestStart = 0;
344
345 ctx.iRangeStart = iBestStart;
346 ctx.iRangeEnd = iBestStart + nToken - 1;
347
348 if( iBestStart>0 ){
349 fts5HighlightAppend(&rc, &ctx, zEllips, -1);
350 }
351 if( rc==SQLITE_OK ){
352 rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb);
353 }
354 if( ctx.iRangeEnd>=(nColSize-1) ){
355 fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
356 }else{
357 fts5HighlightAppend(&rc, &ctx, zEllips, -1);
358 }
359
360 if( rc==SQLITE_OK ){
361 sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
362 }else{
363 sqlite3_result_error_code(pCtx, rc);
364 }
365 sqlite3_free(ctx.zOut);
366 }
367 sqlite3_free(aSeen);
368 }
369
370 /************************************************************************/
371
372 /*
373 ** The first time the bm25() function is called for a query, an instance
374 ** of the following structure is allocated and populated.
375 */
376 typedef struct Fts5Bm25Data Fts5Bm25Data;
377 struct Fts5Bm25Data {
378 int nPhrase; /* Number of phrases in query */
379 double avgdl; /* Average number of tokens in each row */
380 double *aIDF; /* IDF for each phrase */
381 double *aFreq; /* Array used to calculate phrase freq. */
382 };
383
384 /*
385 ** Callback used by fts5Bm25GetData() to count the number of rows in the
386 ** table matched by each individual phrase within the query.
387 */
388 static int fts5CountCb(
389 const Fts5ExtensionApi *pApi,
390 Fts5Context *pFts,
391 void *pUserData /* Pointer to sqlite3_int64 variable */
392 ){
393 sqlite3_int64 *pn = (sqlite3_int64*)pUserData;
394 (*pn)++;
395 return SQLITE_OK;
396 }
397
398 /*
399 ** Set *ppData to point to the Fts5Bm25Data object for the current query.
400 ** If the object has not already been allocated, allocate and populate it
401 ** now.
402 */
403 static int fts5Bm25GetData(
404 const Fts5ExtensionApi *pApi,
405 Fts5Context *pFts,
406 Fts5Bm25Data **ppData /* OUT: bm25-data object for this query */
407 ){
408 int rc = SQLITE_OK; /* Return code */
409 Fts5Bm25Data *p; /* Object to return */
410
411 p = pApi->xGetAuxdata(pFts, 0);
412 if( p==0 ){
413 int nPhrase; /* Number of phrases in query */
414 sqlite3_int64 nRow = 0; /* Number of rows in table */
415 sqlite3_int64 nToken = 0; /* Number of tokens in table */
416 int nByte; /* Bytes of space to allocate */
417 int i;
418
419 /* Allocate the Fts5Bm25Data object */
420 nPhrase = pApi->xPhraseCount(pFts);
421 nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double);
422 p = (Fts5Bm25Data*)sqlite3_malloc(nByte);
423 if( p==0 ){
424 rc = SQLITE_NOMEM;
425 }else{
426 memset(p, 0, nByte);
427 p->nPhrase = nPhrase;
428 p->aIDF = (double*)&p[1];
429 p->aFreq = &p->aIDF[nPhrase];
430 }
431
432 /* Calculate the average document length for this FTS5 table */
433 if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow);
434 if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken);
435 if( rc==SQLITE_OK ) p->avgdl = (double)nToken / (double)nRow;
436
437 /* Calculate an IDF for each phrase in the query */
438 for(i=0; rc==SQLITE_OK && i<nPhrase; i++){
439 sqlite3_int64 nHit = 0;
440 rc = pApi->xQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb);
441 if( rc==SQLITE_OK ){
442 /* Calculate the IDF (Inverse Document Frequency) for phrase i.
443 ** This is done using the standard BM25 formula as found on wikipedia:
444 **
445 ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) )
446 **
447 ** where "N" is the total number of documents in the set and nHit
448 ** is the number that contain at least one instance of the phrase
449 ** under consideration.
450 **
451 ** The problem with this is that if (N < 2*nHit), the IDF is
452 ** negative. Which is undesirable. So the mimimum allowable IDF is
453 ** (1e-6) - roughly the same as a term that appears in just over
454 ** half of set of 5,000,000 documents. */
455 double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) );
456 if( idf<=0.0 ) idf = 1e-6;
457 p->aIDF[i] = idf;
458 }
459 }
460
461 if( rc!=SQLITE_OK ){
462 sqlite3_free(p);
463 }else{
464 rc = pApi->xSetAuxdata(pFts, p, sqlite3_free);
465 }
466 if( rc!=SQLITE_OK ) p = 0;
467 }
468 *ppData = p;
469 return rc;
470 }
471
472 /*
473 ** Implementation of bm25() function.
474 */
475 static void fts5Bm25Function(
476 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
477 Fts5Context *pFts, /* First arg to pass to pApi functions */
478 sqlite3_context *pCtx, /* Context for returning result/error */
479 int nVal, /* Number of values in apVal[] array */
480 sqlite3_value **apVal /* Array of trailing arguments */
481 ){
482 const double k1 = 1.2; /* Constant "k1" from BM25 formula */
483 const double b = 0.75; /* Constant "b" from BM25 formula */
484 int rc = SQLITE_OK; /* Error code */
485 double score = 0.0; /* SQL function return value */
486 Fts5Bm25Data *pData; /* Values allocated/calculated once only */
487 int i; /* Iterator variable */
488 int nInst = 0; /* Value returned by xInstCount() */
489 double D = 0.0; /* Total number of tokens in row */
490 double *aFreq = 0; /* Array of phrase freq. for current row */
491
492 /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation)
493 ** for each phrase in the query for the current row. */
494 rc = fts5Bm25GetData(pApi, pFts, &pData);
495 if( rc==SQLITE_OK ){
496 aFreq = pData->aFreq;
497 memset(aFreq, 0, sizeof(double) * pData->nPhrase);
498 rc = pApi->xInstCount(pFts, &nInst);
499 }
500 for(i=0; rc==SQLITE_OK && i<nInst; i++){
501 int ip; int ic; int io;
502 rc = pApi->xInst(pFts, i, &ip, &ic, &io);
503 if( rc==SQLITE_OK ){
504 double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0;
505 aFreq[ip] += w;
506 }
507 }
508
509 /* Figure out the total size of the current row in tokens. */
510 if( rc==SQLITE_OK ){
511 int nTok;
512 rc = pApi->xColumnSize(pFts, -1, &nTok);
513 D = (double)nTok;
514 }
515
516 /* Determine the BM25 score for the current row. */
517 for(i=0; rc==SQLITE_OK && i<pData->nPhrase; i++){
518 score += pData->aIDF[i] * (
519 ( aFreq[i] * (k1 + 1.0) ) /
520 ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) )
521 );
522 }
523
524 /* If no error has occurred, return the calculated score. Otherwise,
525 ** throw an SQL exception. */
526 if( rc==SQLITE_OK ){
527 sqlite3_result_double(pCtx, -1.0 * score);
528 }else{
529 sqlite3_result_error_code(pCtx, rc);
530 }
531 }
532
533 int sqlite3Fts5AuxInit(fts5_api *pApi){
534 struct Builtin {
535 const char *zFunc; /* Function name (nul-terminated) */
536 void *pUserData; /* User-data pointer */
537 fts5_extension_function xFunc;/* Callback function */
538 void (*xDestroy)(void*); /* Destructor function */
539 } aBuiltin [] = {
540 { "snippet", 0, fts5SnippetFunction, 0 },
541 { "highlight", 0, fts5HighlightFunction, 0 },
542 { "bm25", 0, fts5Bm25Function, 0 },
543 };
544 int rc = SQLITE_OK; /* Return code */
545 int i; /* To iterate through builtin functions */
546
547 for(i=0; rc==SQLITE_OK && i<(int)ArraySize(aBuiltin); i++){
548 rc = pApi->xCreateFunction(pApi,
549 aBuiltin[i].zFunc,
550 aBuiltin[i].pUserData,
551 aBuiltin[i].xFunc,
552 aBuiltin[i].xDestroy
553 );
554 }
555
556 return rc;
557 }
558
559
OLDNEW
« no previous file with comments | « third_party/sqlite/sqlite-src-3100200/ext/fts5/fts5Int.h ('k') | third_party/sqlite/sqlite-src-3100200/ext/fts5/fts5_buffer.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698