OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ** 2014 May 31 |
| 3 ** |
| 4 ** The author disclaims copyright to this source code. In place of |
| 5 ** a legal notice, here is a blessing: |
| 6 ** |
| 7 ** May you do good and not evil. |
| 8 ** May you find forgiveness for yourself and forgive others. |
| 9 ** May you share freely, never taking more than you give. |
| 10 ** |
| 11 ****************************************************************************** |
| 12 */ |
| 13 |
| 14 |
| 15 #include "fts5Int.h" |
| 16 #include <math.h> /* amalgamator: keep */ |
| 17 |
| 18 /* |
| 19 ** Object used to iterate through all "coalesced phrase instances" in |
| 20 ** a single column of the current row. If the phrase instances in the |
| 21 ** column being considered do not overlap, this object simply iterates |
| 22 ** through them. Or, if they do overlap (share one or more tokens in |
| 23 ** common), each set of overlapping instances is treated as a single |
| 24 ** match. See documentation for the highlight() auxiliary function for |
| 25 ** details. |
| 26 ** |
| 27 ** Usage is: |
| 28 ** |
| 29 ** for(rc = fts5CInstIterNext(pApi, pFts, iCol, &iter); |
| 30 ** (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter); |
| 31 ** rc = fts5CInstIterNext(&iter) |
| 32 ** ){ |
| 33 ** printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd); |
| 34 ** } |
| 35 ** |
| 36 */ |
| 37 typedef struct CInstIter CInstIter; |
| 38 struct CInstIter { |
| 39 const Fts5ExtensionApi *pApi; /* API offered by current FTS version */ |
| 40 Fts5Context *pFts; /* First arg to pass to pApi functions */ |
| 41 int iCol; /* Column to search */ |
| 42 int iInst; /* Next phrase instance index */ |
| 43 int nInst; /* Total number of phrase instances */ |
| 44 |
| 45 /* Output variables */ |
| 46 int iStart; /* First token in coalesced phrase instance */ |
| 47 int iEnd; /* Last token in coalesced phrase instance */ |
| 48 }; |
| 49 |
| 50 /* |
| 51 ** Advance the iterator to the next coalesced phrase instance. Return |
| 52 ** an SQLite error code if an error occurs, or SQLITE_OK otherwise. |
| 53 */ |
| 54 static int fts5CInstIterNext(CInstIter *pIter){ |
| 55 int rc = SQLITE_OK; |
| 56 pIter->iStart = -1; |
| 57 pIter->iEnd = -1; |
| 58 |
| 59 while( rc==SQLITE_OK && pIter->iInst<pIter->nInst ){ |
| 60 int ip; int ic; int io; |
| 61 rc = pIter->pApi->xInst(pIter->pFts, pIter->iInst, &ip, &ic, &io); |
| 62 if( rc==SQLITE_OK ){ |
| 63 if( ic==pIter->iCol ){ |
| 64 int iEnd = io - 1 + pIter->pApi->xPhraseSize(pIter->pFts, ip); |
| 65 if( pIter->iStart<0 ){ |
| 66 pIter->iStart = io; |
| 67 pIter->iEnd = iEnd; |
| 68 }else if( io<=pIter->iEnd ){ |
| 69 if( iEnd>pIter->iEnd ) pIter->iEnd = iEnd; |
| 70 }else{ |
| 71 break; |
| 72 } |
| 73 } |
| 74 pIter->iInst++; |
| 75 } |
| 76 } |
| 77 |
| 78 return rc; |
| 79 } |
| 80 |
| 81 /* |
| 82 ** Initialize the iterator object indicated by the final parameter to |
| 83 ** iterate through coalesced phrase instances in column iCol. |
| 84 */ |
| 85 static int fts5CInstIterInit( |
| 86 const Fts5ExtensionApi *pApi, |
| 87 Fts5Context *pFts, |
| 88 int iCol, |
| 89 CInstIter *pIter |
| 90 ){ |
| 91 int rc; |
| 92 |
| 93 memset(pIter, 0, sizeof(CInstIter)); |
| 94 pIter->pApi = pApi; |
| 95 pIter->pFts = pFts; |
| 96 pIter->iCol = iCol; |
| 97 rc = pApi->xInstCount(pFts, &pIter->nInst); |
| 98 |
| 99 if( rc==SQLITE_OK ){ |
| 100 rc = fts5CInstIterNext(pIter); |
| 101 } |
| 102 |
| 103 return rc; |
| 104 } |
| 105 |
| 106 |
| 107 |
| 108 /************************************************************************* |
| 109 ** Start of highlight() implementation. |
| 110 */ |
| 111 typedef struct HighlightContext HighlightContext; |
| 112 struct HighlightContext { |
| 113 CInstIter iter; /* Coalesced Instance Iterator */ |
| 114 int iPos; /* Current token offset in zIn[] */ |
| 115 int iRangeStart; /* First token to include */ |
| 116 int iRangeEnd; /* If non-zero, last token to include */ |
| 117 const char *zOpen; /* Opening highlight */ |
| 118 const char *zClose; /* Closing highlight */ |
| 119 const char *zIn; /* Input text */ |
| 120 int nIn; /* Size of input text in bytes */ |
| 121 int iOff; /* Current offset within zIn[] */ |
| 122 char *zOut; /* Output value */ |
| 123 }; |
| 124 |
| 125 /* |
| 126 ** Append text to the HighlightContext output string - p->zOut. Argument |
| 127 ** z points to a buffer containing n bytes of text to append. If n is |
| 128 ** negative, everything up until the first '\0' is appended to the output. |
| 129 ** |
| 130 ** If *pRc is set to any value other than SQLITE_OK when this function is |
| 131 ** called, it is a no-op. If an error (i.e. an OOM condition) is encountered, |
| 132 ** *pRc is set to an error code before returning. |
| 133 */ |
| 134 static void fts5HighlightAppend( |
| 135 int *pRc, |
| 136 HighlightContext *p, |
| 137 const char *z, int n |
| 138 ){ |
| 139 if( *pRc==SQLITE_OK ){ |
| 140 if( n<0 ) n = (int)strlen(z); |
| 141 p->zOut = sqlite3_mprintf("%z%.*s", p->zOut, n, z); |
| 142 if( p->zOut==0 ) *pRc = SQLITE_NOMEM; |
| 143 } |
| 144 } |
| 145 |
| 146 /* |
| 147 ** Tokenizer callback used by implementation of highlight() function. |
| 148 */ |
| 149 static int fts5HighlightCb( |
| 150 void *pContext, /* Pointer to HighlightContext object */ |
| 151 int tflags, /* Mask of FTS5_TOKEN_* flags */ |
| 152 const char *pToken, /* Buffer containing token */ |
| 153 int nToken, /* Size of token in bytes */ |
| 154 int iStartOff, /* Start offset of token */ |
| 155 int iEndOff /* End offset of token */ |
| 156 ){ |
| 157 HighlightContext *p = (HighlightContext*)pContext; |
| 158 int rc = SQLITE_OK; |
| 159 int iPos; |
| 160 |
| 161 UNUSED_PARAM2(pToken, nToken); |
| 162 |
| 163 if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK; |
| 164 iPos = p->iPos++; |
| 165 |
| 166 if( p->iRangeEnd>0 ){ |
| 167 if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK; |
| 168 if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff; |
| 169 } |
| 170 |
| 171 if( iPos==p->iter.iStart ){ |
| 172 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff); |
| 173 fts5HighlightAppend(&rc, p, p->zOpen, -1); |
| 174 p->iOff = iStartOff; |
| 175 } |
| 176 |
| 177 if( iPos==p->iter.iEnd ){ |
| 178 if( p->iRangeEnd && p->iter.iStart<p->iRangeStart ){ |
| 179 fts5HighlightAppend(&rc, p, p->zOpen, -1); |
| 180 } |
| 181 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); |
| 182 fts5HighlightAppend(&rc, p, p->zClose, -1); |
| 183 p->iOff = iEndOff; |
| 184 if( rc==SQLITE_OK ){ |
| 185 rc = fts5CInstIterNext(&p->iter); |
| 186 } |
| 187 } |
| 188 |
| 189 if( p->iRangeEnd>0 && iPos==p->iRangeEnd ){ |
| 190 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); |
| 191 p->iOff = iEndOff; |
| 192 if( iPos>=p->iter.iStart && iPos<p->iter.iEnd ){ |
| 193 fts5HighlightAppend(&rc, p, p->zClose, -1); |
| 194 } |
| 195 } |
| 196 |
| 197 return rc; |
| 198 } |
| 199 |
| 200 /* |
| 201 ** Implementation of highlight() function. |
| 202 */ |
| 203 static void fts5HighlightFunction( |
| 204 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ |
| 205 Fts5Context *pFts, /* First arg to pass to pApi functions */ |
| 206 sqlite3_context *pCtx, /* Context for returning result/error */ |
| 207 int nVal, /* Number of values in apVal[] array */ |
| 208 sqlite3_value **apVal /* Array of trailing arguments */ |
| 209 ){ |
| 210 HighlightContext ctx; |
| 211 int rc; |
| 212 int iCol; |
| 213 |
| 214 if( nVal!=3 ){ |
| 215 const char *zErr = "wrong number of arguments to function highlight()"; |
| 216 sqlite3_result_error(pCtx, zErr, -1); |
| 217 return; |
| 218 } |
| 219 |
| 220 iCol = sqlite3_value_int(apVal[0]); |
| 221 memset(&ctx, 0, sizeof(HighlightContext)); |
| 222 ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]); |
| 223 ctx.zClose = (const char*)sqlite3_value_text(apVal[2]); |
| 224 rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn); |
| 225 |
| 226 if( ctx.zIn ){ |
| 227 if( rc==SQLITE_OK ){ |
| 228 rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter); |
| 229 } |
| 230 |
| 231 if( rc==SQLITE_OK ){ |
| 232 rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); |
| 233 } |
| 234 fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); |
| 235 |
| 236 if( rc==SQLITE_OK ){ |
| 237 sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); |
| 238 } |
| 239 sqlite3_free(ctx.zOut); |
| 240 } |
| 241 if( rc!=SQLITE_OK ){ |
| 242 sqlite3_result_error_code(pCtx, rc); |
| 243 } |
| 244 } |
| 245 /* |
| 246 ** End of highlight() implementation. |
| 247 **************************************************************************/ |
| 248 |
| 249 /* |
| 250 ** Context object passed to the fts5SentenceFinderCb() function. |
| 251 */ |
| 252 typedef struct Fts5SFinder Fts5SFinder; |
| 253 struct Fts5SFinder { |
| 254 int iPos; /* Current token position */ |
| 255 int nFirstAlloc; /* Allocated size of aFirst[] */ |
| 256 int nFirst; /* Number of entries in aFirst[] */ |
| 257 int *aFirst; /* Array of first token in each sentence */ |
| 258 const char *zDoc; /* Document being tokenized */ |
| 259 }; |
| 260 |
| 261 /* |
| 262 ** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if |
| 263 ** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an |
| 264 ** error occurs. |
| 265 */ |
| 266 static int fts5SentenceFinderAdd(Fts5SFinder *p, int iAdd){ |
| 267 if( p->nFirstAlloc==p->nFirst ){ |
| 268 int nNew = p->nFirstAlloc ? p->nFirstAlloc*2 : 64; |
| 269 int *aNew; |
| 270 |
| 271 aNew = (int*)sqlite3_realloc(p->aFirst, nNew*sizeof(int)); |
| 272 if( aNew==0 ) return SQLITE_NOMEM; |
| 273 p->aFirst = aNew; |
| 274 p->nFirstAlloc = nNew; |
| 275 } |
| 276 p->aFirst[p->nFirst++] = iAdd; |
| 277 return SQLITE_OK; |
| 278 } |
| 279 |
| 280 /* |
| 281 ** This function is an xTokenize() callback used by the auxiliary snippet() |
| 282 ** function. Its job is to identify tokens that are the first in a sentence. |
| 283 ** For each such token, an entry is added to the SFinder.aFirst[] array. |
| 284 */ |
| 285 static int fts5SentenceFinderCb( |
| 286 void *pContext, /* Pointer to HighlightContext object */ |
| 287 int tflags, /* Mask of FTS5_TOKEN_* flags */ |
| 288 const char *pToken, /* Buffer containing token */ |
| 289 int nToken, /* Size of token in bytes */ |
| 290 int iStartOff, /* Start offset of token */ |
| 291 int iEndOff /* End offset of token */ |
| 292 ){ |
| 293 int rc = SQLITE_OK; |
| 294 |
| 295 UNUSED_PARAM2(pToken, nToken); |
| 296 UNUSED_PARAM(iEndOff); |
| 297 |
| 298 if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){ |
| 299 Fts5SFinder *p = (Fts5SFinder*)pContext; |
| 300 if( p->iPos>0 ){ |
| 301 int i; |
| 302 char c = 0; |
| 303 for(i=iStartOff-1; i>=0; i--){ |
| 304 c = p->zDoc[i]; |
| 305 if( c!=' ' && c!='\t' && c!='\n' && c!='\r' ) break; |
| 306 } |
| 307 if( i!=iStartOff-1 && (c=='.' || c==':') ){ |
| 308 rc = fts5SentenceFinderAdd(p, p->iPos); |
| 309 } |
| 310 }else{ |
| 311 rc = fts5SentenceFinderAdd(p, 0); |
| 312 } |
| 313 p->iPos++; |
| 314 } |
| 315 return rc; |
| 316 } |
| 317 |
| 318 static int fts5SnippetScore( |
| 319 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ |
| 320 Fts5Context *pFts, /* First arg to pass to pApi functions */ |
| 321 int nDocsize, /* Size of column in tokens */ |
| 322 unsigned char *aSeen, /* Array with one element per query phrase */ |
| 323 int iCol, /* Column to score */ |
| 324 int iPos, /* Starting offset to score */ |
| 325 int nToken, /* Max tokens per snippet */ |
| 326 int *pnScore, /* OUT: Score */ |
| 327 int *piPos /* OUT: Adjusted offset */ |
| 328 ){ |
| 329 int rc; |
| 330 int i; |
| 331 int ip = 0; |
| 332 int ic = 0; |
| 333 int iOff = 0; |
| 334 int iFirst = -1; |
| 335 int nInst; |
| 336 int nScore = 0; |
| 337 int iLast = 0; |
| 338 |
| 339 rc = pApi->xInstCount(pFts, &nInst); |
| 340 for(i=0; i<nInst && rc==SQLITE_OK; i++){ |
| 341 rc = pApi->xInst(pFts, i, &ip, &ic, &iOff); |
| 342 if( rc==SQLITE_OK && ic==iCol && iOff>=iPos && iOff<(iPos+nToken) ){ |
| 343 nScore += (aSeen[ip] ? 1 : 1000); |
| 344 aSeen[ip] = 1; |
| 345 if( iFirst<0 ) iFirst = iOff; |
| 346 iLast = iOff + pApi->xPhraseSize(pFts, ip); |
| 347 } |
| 348 } |
| 349 |
| 350 *pnScore = nScore; |
| 351 if( piPos ){ |
| 352 int iAdj = iFirst - (nToken - (iLast-iFirst)) / 2; |
| 353 if( (iAdj+nToken)>nDocsize ) iAdj = nDocsize - nToken; |
| 354 if( iAdj<0 ) iAdj = 0; |
| 355 *piPos = iAdj; |
| 356 } |
| 357 |
| 358 return rc; |
| 359 } |
| 360 |
| 361 /* |
| 362 ** Implementation of snippet() function. |
| 363 */ |
| 364 static void fts5SnippetFunction( |
| 365 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ |
| 366 Fts5Context *pFts, /* First arg to pass to pApi functions */ |
| 367 sqlite3_context *pCtx, /* Context for returning result/error */ |
| 368 int nVal, /* Number of values in apVal[] array */ |
| 369 sqlite3_value **apVal /* Array of trailing arguments */ |
| 370 ){ |
| 371 HighlightContext ctx; |
| 372 int rc = SQLITE_OK; /* Return code */ |
| 373 int iCol; /* 1st argument to snippet() */ |
| 374 const char *zEllips; /* 4th argument to snippet() */ |
| 375 int nToken; /* 5th argument to snippet() */ |
| 376 int nInst = 0; /* Number of instance matches this row */ |
| 377 int i; /* Used to iterate through instances */ |
| 378 int nPhrase; /* Number of phrases in query */ |
| 379 unsigned char *aSeen; /* Array of "seen instance" flags */ |
| 380 int iBestCol; /* Column containing best snippet */ |
| 381 int iBestStart = 0; /* First token of best snippet */ |
| 382 int nBestScore = 0; /* Score of best snippet */ |
| 383 int nColSize = 0; /* Total size of iBestCol in tokens */ |
| 384 Fts5SFinder sFinder; /* Used to find the beginnings of sentences */ |
| 385 int nCol; |
| 386 |
| 387 if( nVal!=5 ){ |
| 388 const char *zErr = "wrong number of arguments to function snippet()"; |
| 389 sqlite3_result_error(pCtx, zErr, -1); |
| 390 return; |
| 391 } |
| 392 |
| 393 nCol = pApi->xColumnCount(pFts); |
| 394 memset(&ctx, 0, sizeof(HighlightContext)); |
| 395 iCol = sqlite3_value_int(apVal[0]); |
| 396 ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]); |
| 397 ctx.zClose = (const char*)sqlite3_value_text(apVal[2]); |
| 398 zEllips = (const char*)sqlite3_value_text(apVal[3]); |
| 399 nToken = sqlite3_value_int(apVal[4]); |
| 400 |
| 401 iBestCol = (iCol>=0 ? iCol : 0); |
| 402 nPhrase = pApi->xPhraseCount(pFts); |
| 403 aSeen = sqlite3_malloc(nPhrase); |
| 404 if( aSeen==0 ){ |
| 405 rc = SQLITE_NOMEM; |
| 406 } |
| 407 if( rc==SQLITE_OK ){ |
| 408 rc = pApi->xInstCount(pFts, &nInst); |
| 409 } |
| 410 |
| 411 memset(&sFinder, 0, sizeof(Fts5SFinder)); |
| 412 for(i=0; i<nCol; i++){ |
| 413 if( iCol<0 || iCol==i ){ |
| 414 int nDoc; |
| 415 int nDocsize; |
| 416 int ii; |
| 417 sFinder.iPos = 0; |
| 418 sFinder.nFirst = 0; |
| 419 rc = pApi->xColumnText(pFts, i, &sFinder.zDoc, &nDoc); |
| 420 if( rc!=SQLITE_OK ) break; |
| 421 rc = pApi->xTokenize(pFts, |
| 422 sFinder.zDoc, nDoc, (void*)&sFinder,fts5SentenceFinderCb |
| 423 ); |
| 424 if( rc!=SQLITE_OK ) break; |
| 425 rc = pApi->xColumnSize(pFts, i, &nDocsize); |
| 426 if( rc!=SQLITE_OK ) break; |
| 427 |
| 428 for(ii=0; rc==SQLITE_OK && ii<nInst; ii++){ |
| 429 int ip, ic, io; |
| 430 int iAdj; |
| 431 int nScore; |
| 432 int jj; |
| 433 |
| 434 rc = pApi->xInst(pFts, ii, &ip, &ic, &io); |
| 435 if( ic!=i || rc!=SQLITE_OK ) continue; |
| 436 memset(aSeen, 0, nPhrase); |
| 437 rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i, |
| 438 io, nToken, &nScore, &iAdj |
| 439 ); |
| 440 if( rc==SQLITE_OK && nScore>nBestScore ){ |
| 441 nBestScore = nScore; |
| 442 iBestCol = i; |
| 443 iBestStart = iAdj; |
| 444 nColSize = nDocsize; |
| 445 } |
| 446 |
| 447 if( rc==SQLITE_OK && sFinder.nFirst && nDocsize>nToken ){ |
| 448 for(jj=0; jj<(sFinder.nFirst-1); jj++){ |
| 449 if( sFinder.aFirst[jj+1]>io ) break; |
| 450 } |
| 451 |
| 452 if( sFinder.aFirst[jj]<io ){ |
| 453 memset(aSeen, 0, nPhrase); |
| 454 rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i, |
| 455 sFinder.aFirst[jj], nToken, &nScore, 0 |
| 456 ); |
| 457 |
| 458 nScore += (sFinder.aFirst[jj]==0 ? 120 : 100); |
| 459 if( rc==SQLITE_OK && nScore>nBestScore ){ |
| 460 nBestScore = nScore; |
| 461 iBestCol = i; |
| 462 iBestStart = sFinder.aFirst[jj]; |
| 463 nColSize = nDocsize; |
| 464 } |
| 465 } |
| 466 } |
| 467 } |
| 468 } |
| 469 } |
| 470 |
| 471 if( rc==SQLITE_OK ){ |
| 472 rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn); |
| 473 } |
| 474 if( rc==SQLITE_OK && nColSize==0 ){ |
| 475 rc = pApi->xColumnSize(pFts, iBestCol, &nColSize); |
| 476 } |
| 477 if( ctx.zIn ){ |
| 478 if( rc==SQLITE_OK ){ |
| 479 rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter); |
| 480 } |
| 481 |
| 482 ctx.iRangeStart = iBestStart; |
| 483 ctx.iRangeEnd = iBestStart + nToken - 1; |
| 484 |
| 485 if( iBestStart>0 ){ |
| 486 fts5HighlightAppend(&rc, &ctx, zEllips, -1); |
| 487 } |
| 488 |
| 489 /* Advance iterator ctx.iter so that it points to the first coalesced |
| 490 ** phrase instance at or following position iBestStart. */ |
| 491 while( ctx.iter.iStart>=0 && ctx.iter.iStart<iBestStart && rc==SQLITE_OK ){ |
| 492 rc = fts5CInstIterNext(&ctx.iter); |
| 493 } |
| 494 |
| 495 if( rc==SQLITE_OK ){ |
| 496 rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); |
| 497 } |
| 498 if( ctx.iRangeEnd>=(nColSize-1) ){ |
| 499 fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); |
| 500 }else{ |
| 501 fts5HighlightAppend(&rc, &ctx, zEllips, -1); |
| 502 } |
| 503 } |
| 504 if( rc==SQLITE_OK ){ |
| 505 sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); |
| 506 }else{ |
| 507 sqlite3_result_error_code(pCtx, rc); |
| 508 } |
| 509 sqlite3_free(ctx.zOut); |
| 510 sqlite3_free(aSeen); |
| 511 sqlite3_free(sFinder.aFirst); |
| 512 } |
| 513 |
| 514 /************************************************************************/ |
| 515 |
| 516 /* |
| 517 ** The first time the bm25() function is called for a query, an instance |
| 518 ** of the following structure is allocated and populated. |
| 519 */ |
| 520 typedef struct Fts5Bm25Data Fts5Bm25Data; |
| 521 struct Fts5Bm25Data { |
| 522 int nPhrase; /* Number of phrases in query */ |
| 523 double avgdl; /* Average number of tokens in each row */ |
| 524 double *aIDF; /* IDF for each phrase */ |
| 525 double *aFreq; /* Array used to calculate phrase freq. */ |
| 526 }; |
| 527 |
| 528 /* |
| 529 ** Callback used by fts5Bm25GetData() to count the number of rows in the |
| 530 ** table matched by each individual phrase within the query. |
| 531 */ |
| 532 static int fts5CountCb( |
| 533 const Fts5ExtensionApi *pApi, |
| 534 Fts5Context *pFts, |
| 535 void *pUserData /* Pointer to sqlite3_int64 variable */ |
| 536 ){ |
| 537 sqlite3_int64 *pn = (sqlite3_int64*)pUserData; |
| 538 UNUSED_PARAM2(pApi, pFts); |
| 539 (*pn)++; |
| 540 return SQLITE_OK; |
| 541 } |
| 542 |
| 543 /* |
| 544 ** Set *ppData to point to the Fts5Bm25Data object for the current query. |
| 545 ** If the object has not already been allocated, allocate and populate it |
| 546 ** now. |
| 547 */ |
| 548 static int fts5Bm25GetData( |
| 549 const Fts5ExtensionApi *pApi, |
| 550 Fts5Context *pFts, |
| 551 Fts5Bm25Data **ppData /* OUT: bm25-data object for this query */ |
| 552 ){ |
| 553 int rc = SQLITE_OK; /* Return code */ |
| 554 Fts5Bm25Data *p; /* Object to return */ |
| 555 |
| 556 p = pApi->xGetAuxdata(pFts, 0); |
| 557 if( p==0 ){ |
| 558 int nPhrase; /* Number of phrases in query */ |
| 559 sqlite3_int64 nRow = 0; /* Number of rows in table */ |
| 560 sqlite3_int64 nToken = 0; /* Number of tokens in table */ |
| 561 int nByte; /* Bytes of space to allocate */ |
| 562 int i; |
| 563 |
| 564 /* Allocate the Fts5Bm25Data object */ |
| 565 nPhrase = pApi->xPhraseCount(pFts); |
| 566 nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double); |
| 567 p = (Fts5Bm25Data*)sqlite3_malloc(nByte); |
| 568 if( p==0 ){ |
| 569 rc = SQLITE_NOMEM; |
| 570 }else{ |
| 571 memset(p, 0, nByte); |
| 572 p->nPhrase = nPhrase; |
| 573 p->aIDF = (double*)&p[1]; |
| 574 p->aFreq = &p->aIDF[nPhrase]; |
| 575 } |
| 576 |
| 577 /* Calculate the average document length for this FTS5 table */ |
| 578 if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow); |
| 579 if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken); |
| 580 if( rc==SQLITE_OK ) p->avgdl = (double)nToken / (double)nRow; |
| 581 |
| 582 /* Calculate an IDF for each phrase in the query */ |
| 583 for(i=0; rc==SQLITE_OK && i<nPhrase; i++){ |
| 584 sqlite3_int64 nHit = 0; |
| 585 rc = pApi->xQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb); |
| 586 if( rc==SQLITE_OK ){ |
| 587 /* Calculate the IDF (Inverse Document Frequency) for phrase i. |
| 588 ** This is done using the standard BM25 formula as found on wikipedia: |
| 589 ** |
| 590 ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) ) |
| 591 ** |
| 592 ** where "N" is the total number of documents in the set and nHit |
| 593 ** is the number that contain at least one instance of the phrase |
| 594 ** under consideration. |
| 595 ** |
| 596 ** The problem with this is that if (N < 2*nHit), the IDF is |
| 597 ** negative. Which is undesirable. So the mimimum allowable IDF is |
| 598 ** (1e-6) - roughly the same as a term that appears in just over |
| 599 ** half of set of 5,000,000 documents. */ |
| 600 double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) ); |
| 601 if( idf<=0.0 ) idf = 1e-6; |
| 602 p->aIDF[i] = idf; |
| 603 } |
| 604 } |
| 605 |
| 606 if( rc!=SQLITE_OK ){ |
| 607 sqlite3_free(p); |
| 608 }else{ |
| 609 rc = pApi->xSetAuxdata(pFts, p, sqlite3_free); |
| 610 } |
| 611 if( rc!=SQLITE_OK ) p = 0; |
| 612 } |
| 613 *ppData = p; |
| 614 return rc; |
| 615 } |
| 616 |
| 617 /* |
| 618 ** Implementation of bm25() function. |
| 619 */ |
| 620 static void fts5Bm25Function( |
| 621 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ |
| 622 Fts5Context *pFts, /* First arg to pass to pApi functions */ |
| 623 sqlite3_context *pCtx, /* Context for returning result/error */ |
| 624 int nVal, /* Number of values in apVal[] array */ |
| 625 sqlite3_value **apVal /* Array of trailing arguments */ |
| 626 ){ |
| 627 const double k1 = 1.2; /* Constant "k1" from BM25 formula */ |
| 628 const double b = 0.75; /* Constant "b" from BM25 formula */ |
| 629 int rc = SQLITE_OK; /* Error code */ |
| 630 double score = 0.0; /* SQL function return value */ |
| 631 Fts5Bm25Data *pData; /* Values allocated/calculated once only */ |
| 632 int i; /* Iterator variable */ |
| 633 int nInst = 0; /* Value returned by xInstCount() */ |
| 634 double D = 0.0; /* Total number of tokens in row */ |
| 635 double *aFreq = 0; /* Array of phrase freq. for current row */ |
| 636 |
| 637 /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation) |
| 638 ** for each phrase in the query for the current row. */ |
| 639 rc = fts5Bm25GetData(pApi, pFts, &pData); |
| 640 if( rc==SQLITE_OK ){ |
| 641 aFreq = pData->aFreq; |
| 642 memset(aFreq, 0, sizeof(double) * pData->nPhrase); |
| 643 rc = pApi->xInstCount(pFts, &nInst); |
| 644 } |
| 645 for(i=0; rc==SQLITE_OK && i<nInst; i++){ |
| 646 int ip; int ic; int io; |
| 647 rc = pApi->xInst(pFts, i, &ip, &ic, &io); |
| 648 if( rc==SQLITE_OK ){ |
| 649 double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0; |
| 650 aFreq[ip] += w; |
| 651 } |
| 652 } |
| 653 |
| 654 /* Figure out the total size of the current row in tokens. */ |
| 655 if( rc==SQLITE_OK ){ |
| 656 int nTok; |
| 657 rc = pApi->xColumnSize(pFts, -1, &nTok); |
| 658 D = (double)nTok; |
| 659 } |
| 660 |
| 661 /* Determine the BM25 score for the current row. */ |
| 662 for(i=0; rc==SQLITE_OK && i<pData->nPhrase; i++){ |
| 663 score += pData->aIDF[i] * ( |
| 664 ( aFreq[i] * (k1 + 1.0) ) / |
| 665 ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) ) |
| 666 ); |
| 667 } |
| 668 |
| 669 /* If no error has occurred, return the calculated score. Otherwise, |
| 670 ** throw an SQL exception. */ |
| 671 if( rc==SQLITE_OK ){ |
| 672 sqlite3_result_double(pCtx, -1.0 * score); |
| 673 }else{ |
| 674 sqlite3_result_error_code(pCtx, rc); |
| 675 } |
| 676 } |
| 677 |
| 678 int sqlite3Fts5AuxInit(fts5_api *pApi){ |
| 679 struct Builtin { |
| 680 const char *zFunc; /* Function name (nul-terminated) */ |
| 681 void *pUserData; /* User-data pointer */ |
| 682 fts5_extension_function xFunc;/* Callback function */ |
| 683 void (*xDestroy)(void*); /* Destructor function */ |
| 684 } aBuiltin [] = { |
| 685 { "snippet", 0, fts5SnippetFunction, 0 }, |
| 686 { "highlight", 0, fts5HighlightFunction, 0 }, |
| 687 { "bm25", 0, fts5Bm25Function, 0 }, |
| 688 }; |
| 689 int rc = SQLITE_OK; /* Return code */ |
| 690 int i; /* To iterate through builtin functions */ |
| 691 |
| 692 for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){ |
| 693 rc = pApi->xCreateFunction(pApi, |
| 694 aBuiltin[i].zFunc, |
| 695 aBuiltin[i].pUserData, |
| 696 aBuiltin[i].xFunc, |
| 697 aBuiltin[i].xDestroy |
| 698 ); |
| 699 } |
| 700 |
| 701 return rc; |
| 702 } |
| 703 |
| 704 |
OLD | NEW |