Index: third_party/sqlite/sqlite-src-3170000/ext/fts5/fts5_aux.c |
diff --git a/third_party/sqlite/sqlite-src-3170000/ext/fts5/fts5_aux.c b/third_party/sqlite/sqlite-src-3170000/ext/fts5/fts5_aux.c |
new file mode 100644 |
index 0000000000000000000000000000000000000000..219ea6fff8f504798f684900967a584f2e1983e7 |
--- /dev/null |
+++ b/third_party/sqlite/sqlite-src-3170000/ext/fts5/fts5_aux.c |
@@ -0,0 +1,704 @@ |
+/* |
+** 2014 May 31 |
+** |
+** The author disclaims copyright to this source code. In place of |
+** a legal notice, here is a blessing: |
+** |
+** May you do good and not evil. |
+** May you find forgiveness for yourself and forgive others. |
+** May you share freely, never taking more than you give. |
+** |
+****************************************************************************** |
+*/ |
+ |
+ |
+#include "fts5Int.h" |
+#include <math.h> /* amalgamator: keep */ |
+ |
+/* |
+** Object used to iterate through all "coalesced phrase instances" in |
+** a single column of the current row. If the phrase instances in the |
+** column being considered do not overlap, this object simply iterates |
+** through them. Or, if they do overlap (share one or more tokens in |
+** common), each set of overlapping instances is treated as a single |
+** match. See documentation for the highlight() auxiliary function for |
+** details. |
+** |
+** Usage is: |
+** |
+** for(rc = fts5CInstIterNext(pApi, pFts, iCol, &iter); |
+** (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter); |
+** rc = fts5CInstIterNext(&iter) |
+** ){ |
+** printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd); |
+** } |
+** |
+*/ |
+typedef struct CInstIter CInstIter; |
+struct CInstIter { |
+ const Fts5ExtensionApi *pApi; /* API offered by current FTS version */ |
+ Fts5Context *pFts; /* First arg to pass to pApi functions */ |
+ int iCol; /* Column to search */ |
+ int iInst; /* Next phrase instance index */ |
+ int nInst; /* Total number of phrase instances */ |
+ |
+ /* Output variables */ |
+ int iStart; /* First token in coalesced phrase instance */ |
+ int iEnd; /* Last token in coalesced phrase instance */ |
+}; |
+ |
+/* |
+** Advance the iterator to the next coalesced phrase instance. Return |
+** an SQLite error code if an error occurs, or SQLITE_OK otherwise. |
+*/ |
+static int fts5CInstIterNext(CInstIter *pIter){ |
+ int rc = SQLITE_OK; |
+ pIter->iStart = -1; |
+ pIter->iEnd = -1; |
+ |
+ while( rc==SQLITE_OK && pIter->iInst<pIter->nInst ){ |
+ int ip; int ic; int io; |
+ rc = pIter->pApi->xInst(pIter->pFts, pIter->iInst, &ip, &ic, &io); |
+ if( rc==SQLITE_OK ){ |
+ if( ic==pIter->iCol ){ |
+ int iEnd = io - 1 + pIter->pApi->xPhraseSize(pIter->pFts, ip); |
+ if( pIter->iStart<0 ){ |
+ pIter->iStart = io; |
+ pIter->iEnd = iEnd; |
+ }else if( io<=pIter->iEnd ){ |
+ if( iEnd>pIter->iEnd ) pIter->iEnd = iEnd; |
+ }else{ |
+ break; |
+ } |
+ } |
+ pIter->iInst++; |
+ } |
+ } |
+ |
+ return rc; |
+} |
+ |
+/* |
+** Initialize the iterator object indicated by the final parameter to |
+** iterate through coalesced phrase instances in column iCol. |
+*/ |
+static int fts5CInstIterInit( |
+ const Fts5ExtensionApi *pApi, |
+ Fts5Context *pFts, |
+ int iCol, |
+ CInstIter *pIter |
+){ |
+ int rc; |
+ |
+ memset(pIter, 0, sizeof(CInstIter)); |
+ pIter->pApi = pApi; |
+ pIter->pFts = pFts; |
+ pIter->iCol = iCol; |
+ rc = pApi->xInstCount(pFts, &pIter->nInst); |
+ |
+ if( rc==SQLITE_OK ){ |
+ rc = fts5CInstIterNext(pIter); |
+ } |
+ |
+ return rc; |
+} |
+ |
+ |
+ |
+/************************************************************************* |
+** Start of highlight() implementation. |
+*/ |
+typedef struct HighlightContext HighlightContext; |
+struct HighlightContext { |
+ CInstIter iter; /* Coalesced Instance Iterator */ |
+ int iPos; /* Current token offset in zIn[] */ |
+ int iRangeStart; /* First token to include */ |
+ int iRangeEnd; /* If non-zero, last token to include */ |
+ const char *zOpen; /* Opening highlight */ |
+ const char *zClose; /* Closing highlight */ |
+ const char *zIn; /* Input text */ |
+ int nIn; /* Size of input text in bytes */ |
+ int iOff; /* Current offset within zIn[] */ |
+ char *zOut; /* Output value */ |
+}; |
+ |
+/* |
+** Append text to the HighlightContext output string - p->zOut. Argument |
+** z points to a buffer containing n bytes of text to append. If n is |
+** negative, everything up until the first '\0' is appended to the output. |
+** |
+** If *pRc is set to any value other than SQLITE_OK when this function is |
+** called, it is a no-op. If an error (i.e. an OOM condition) is encountered, |
+** *pRc is set to an error code before returning. |
+*/ |
+static void fts5HighlightAppend( |
+ int *pRc, |
+ HighlightContext *p, |
+ const char *z, int n |
+){ |
+ if( *pRc==SQLITE_OK ){ |
+ if( n<0 ) n = (int)strlen(z); |
+ p->zOut = sqlite3_mprintf("%z%.*s", p->zOut, n, z); |
+ if( p->zOut==0 ) *pRc = SQLITE_NOMEM; |
+ } |
+} |
+ |
+/* |
+** Tokenizer callback used by implementation of highlight() function. |
+*/ |
+static int fts5HighlightCb( |
+ void *pContext, /* Pointer to HighlightContext object */ |
+ int tflags, /* Mask of FTS5_TOKEN_* flags */ |
+ const char *pToken, /* Buffer containing token */ |
+ int nToken, /* Size of token in bytes */ |
+ int iStartOff, /* Start offset of token */ |
+ int iEndOff /* End offset of token */ |
+){ |
+ HighlightContext *p = (HighlightContext*)pContext; |
+ int rc = SQLITE_OK; |
+ int iPos; |
+ |
+ UNUSED_PARAM2(pToken, nToken); |
+ |
+ if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK; |
+ iPos = p->iPos++; |
+ |
+ if( p->iRangeEnd>0 ){ |
+ if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK; |
+ if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff; |
+ } |
+ |
+ if( iPos==p->iter.iStart ){ |
+ fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff); |
+ fts5HighlightAppend(&rc, p, p->zOpen, -1); |
+ p->iOff = iStartOff; |
+ } |
+ |
+ if( iPos==p->iter.iEnd ){ |
+ if( p->iRangeEnd && p->iter.iStart<p->iRangeStart ){ |
+ fts5HighlightAppend(&rc, p, p->zOpen, -1); |
+ } |
+ fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); |
+ fts5HighlightAppend(&rc, p, p->zClose, -1); |
+ p->iOff = iEndOff; |
+ if( rc==SQLITE_OK ){ |
+ rc = fts5CInstIterNext(&p->iter); |
+ } |
+ } |
+ |
+ if( p->iRangeEnd>0 && iPos==p->iRangeEnd ){ |
+ fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff); |
+ p->iOff = iEndOff; |
+ if( iPos>=p->iter.iStart && iPos<p->iter.iEnd ){ |
+ fts5HighlightAppend(&rc, p, p->zClose, -1); |
+ } |
+ } |
+ |
+ return rc; |
+} |
+ |
+/* |
+** Implementation of highlight() function. |
+*/ |
+static void fts5HighlightFunction( |
+ const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ |
+ Fts5Context *pFts, /* First arg to pass to pApi functions */ |
+ sqlite3_context *pCtx, /* Context for returning result/error */ |
+ int nVal, /* Number of values in apVal[] array */ |
+ sqlite3_value **apVal /* Array of trailing arguments */ |
+){ |
+ HighlightContext ctx; |
+ int rc; |
+ int iCol; |
+ |
+ if( nVal!=3 ){ |
+ const char *zErr = "wrong number of arguments to function highlight()"; |
+ sqlite3_result_error(pCtx, zErr, -1); |
+ return; |
+ } |
+ |
+ iCol = sqlite3_value_int(apVal[0]); |
+ memset(&ctx, 0, sizeof(HighlightContext)); |
+ ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]); |
+ ctx.zClose = (const char*)sqlite3_value_text(apVal[2]); |
+ rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn); |
+ |
+ if( ctx.zIn ){ |
+ if( rc==SQLITE_OK ){ |
+ rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter); |
+ } |
+ |
+ if( rc==SQLITE_OK ){ |
+ rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); |
+ } |
+ fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); |
+ |
+ if( rc==SQLITE_OK ){ |
+ sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); |
+ } |
+ sqlite3_free(ctx.zOut); |
+ } |
+ if( rc!=SQLITE_OK ){ |
+ sqlite3_result_error_code(pCtx, rc); |
+ } |
+} |
+/* |
+** End of highlight() implementation. |
+**************************************************************************/ |
+ |
+/* |
+** Context object passed to the fts5SentenceFinderCb() function. |
+*/ |
+typedef struct Fts5SFinder Fts5SFinder; |
+struct Fts5SFinder { |
+ int iPos; /* Current token position */ |
+ int nFirstAlloc; /* Allocated size of aFirst[] */ |
+ int nFirst; /* Number of entries in aFirst[] */ |
+ int *aFirst; /* Array of first token in each sentence */ |
+ const char *zDoc; /* Document being tokenized */ |
+}; |
+ |
+/* |
+** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if |
+** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an |
+** error occurs. |
+*/ |
+static int fts5SentenceFinderAdd(Fts5SFinder *p, int iAdd){ |
+ if( p->nFirstAlloc==p->nFirst ){ |
+ int nNew = p->nFirstAlloc ? p->nFirstAlloc*2 : 64; |
+ int *aNew; |
+ |
+ aNew = (int*)sqlite3_realloc(p->aFirst, nNew*sizeof(int)); |
+ if( aNew==0 ) return SQLITE_NOMEM; |
+ p->aFirst = aNew; |
+ p->nFirstAlloc = nNew; |
+ } |
+ p->aFirst[p->nFirst++] = iAdd; |
+ return SQLITE_OK; |
+} |
+ |
+/* |
+** This function is an xTokenize() callback used by the auxiliary snippet() |
+** function. Its job is to identify tokens that are the first in a sentence. |
+** For each such token, an entry is added to the SFinder.aFirst[] array. |
+*/ |
+static int fts5SentenceFinderCb( |
+ void *pContext, /* Pointer to HighlightContext object */ |
+ int tflags, /* Mask of FTS5_TOKEN_* flags */ |
+ const char *pToken, /* Buffer containing token */ |
+ int nToken, /* Size of token in bytes */ |
+ int iStartOff, /* Start offset of token */ |
+ int iEndOff /* End offset of token */ |
+){ |
+ int rc = SQLITE_OK; |
+ |
+ UNUSED_PARAM2(pToken, nToken); |
+ UNUSED_PARAM(iEndOff); |
+ |
+ if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){ |
+ Fts5SFinder *p = (Fts5SFinder*)pContext; |
+ if( p->iPos>0 ){ |
+ int i; |
+ char c = 0; |
+ for(i=iStartOff-1; i>=0; i--){ |
+ c = p->zDoc[i]; |
+ if( c!=' ' && c!='\t' && c!='\n' && c!='\r' ) break; |
+ } |
+ if( i!=iStartOff-1 && (c=='.' || c==':') ){ |
+ rc = fts5SentenceFinderAdd(p, p->iPos); |
+ } |
+ }else{ |
+ rc = fts5SentenceFinderAdd(p, 0); |
+ } |
+ p->iPos++; |
+ } |
+ return rc; |
+} |
+ |
+static int fts5SnippetScore( |
+ const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ |
+ Fts5Context *pFts, /* First arg to pass to pApi functions */ |
+ int nDocsize, /* Size of column in tokens */ |
+ unsigned char *aSeen, /* Array with one element per query phrase */ |
+ int iCol, /* Column to score */ |
+ int iPos, /* Starting offset to score */ |
+ int nToken, /* Max tokens per snippet */ |
+ int *pnScore, /* OUT: Score */ |
+ int *piPos /* OUT: Adjusted offset */ |
+){ |
+ int rc; |
+ int i; |
+ int ip = 0; |
+ int ic = 0; |
+ int iOff = 0; |
+ int iFirst = -1; |
+ int nInst; |
+ int nScore = 0; |
+ int iLast = 0; |
+ |
+ rc = pApi->xInstCount(pFts, &nInst); |
+ for(i=0; i<nInst && rc==SQLITE_OK; i++){ |
+ rc = pApi->xInst(pFts, i, &ip, &ic, &iOff); |
+ if( rc==SQLITE_OK && ic==iCol && iOff>=iPos && iOff<(iPos+nToken) ){ |
+ nScore += (aSeen[ip] ? 1 : 1000); |
+ aSeen[ip] = 1; |
+ if( iFirst<0 ) iFirst = iOff; |
+ iLast = iOff + pApi->xPhraseSize(pFts, ip); |
+ } |
+ } |
+ |
+ *pnScore = nScore; |
+ if( piPos ){ |
+ int iAdj = iFirst - (nToken - (iLast-iFirst)) / 2; |
+ if( (iAdj+nToken)>nDocsize ) iAdj = nDocsize - nToken; |
+ if( iAdj<0 ) iAdj = 0; |
+ *piPos = iAdj; |
+ } |
+ |
+ return rc; |
+} |
+ |
+/* |
+** Implementation of snippet() function. |
+*/ |
+static void fts5SnippetFunction( |
+ const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ |
+ Fts5Context *pFts, /* First arg to pass to pApi functions */ |
+ sqlite3_context *pCtx, /* Context for returning result/error */ |
+ int nVal, /* Number of values in apVal[] array */ |
+ sqlite3_value **apVal /* Array of trailing arguments */ |
+){ |
+ HighlightContext ctx; |
+ int rc = SQLITE_OK; /* Return code */ |
+ int iCol; /* 1st argument to snippet() */ |
+ const char *zEllips; /* 4th argument to snippet() */ |
+ int nToken; /* 5th argument to snippet() */ |
+ int nInst = 0; /* Number of instance matches this row */ |
+ int i; /* Used to iterate through instances */ |
+ int nPhrase; /* Number of phrases in query */ |
+ unsigned char *aSeen; /* Array of "seen instance" flags */ |
+ int iBestCol; /* Column containing best snippet */ |
+ int iBestStart = 0; /* First token of best snippet */ |
+ int nBestScore = 0; /* Score of best snippet */ |
+ int nColSize = 0; /* Total size of iBestCol in tokens */ |
+ Fts5SFinder sFinder; /* Used to find the beginnings of sentences */ |
+ int nCol; |
+ |
+ if( nVal!=5 ){ |
+ const char *zErr = "wrong number of arguments to function snippet()"; |
+ sqlite3_result_error(pCtx, zErr, -1); |
+ return; |
+ } |
+ |
+ nCol = pApi->xColumnCount(pFts); |
+ memset(&ctx, 0, sizeof(HighlightContext)); |
+ iCol = sqlite3_value_int(apVal[0]); |
+ ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]); |
+ ctx.zClose = (const char*)sqlite3_value_text(apVal[2]); |
+ zEllips = (const char*)sqlite3_value_text(apVal[3]); |
+ nToken = sqlite3_value_int(apVal[4]); |
+ |
+ iBestCol = (iCol>=0 ? iCol : 0); |
+ nPhrase = pApi->xPhraseCount(pFts); |
+ aSeen = sqlite3_malloc(nPhrase); |
+ if( aSeen==0 ){ |
+ rc = SQLITE_NOMEM; |
+ } |
+ if( rc==SQLITE_OK ){ |
+ rc = pApi->xInstCount(pFts, &nInst); |
+ } |
+ |
+ memset(&sFinder, 0, sizeof(Fts5SFinder)); |
+ for(i=0; i<nCol; i++){ |
+ if( iCol<0 || iCol==i ){ |
+ int nDoc; |
+ int nDocsize; |
+ int ii; |
+ sFinder.iPos = 0; |
+ sFinder.nFirst = 0; |
+ rc = pApi->xColumnText(pFts, i, &sFinder.zDoc, &nDoc); |
+ if( rc!=SQLITE_OK ) break; |
+ rc = pApi->xTokenize(pFts, |
+ sFinder.zDoc, nDoc, (void*)&sFinder,fts5SentenceFinderCb |
+ ); |
+ if( rc!=SQLITE_OK ) break; |
+ rc = pApi->xColumnSize(pFts, i, &nDocsize); |
+ if( rc!=SQLITE_OK ) break; |
+ |
+ for(ii=0; rc==SQLITE_OK && ii<nInst; ii++){ |
+ int ip, ic, io; |
+ int iAdj; |
+ int nScore; |
+ int jj; |
+ |
+ rc = pApi->xInst(pFts, ii, &ip, &ic, &io); |
+ if( ic!=i || rc!=SQLITE_OK ) continue; |
+ memset(aSeen, 0, nPhrase); |
+ rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i, |
+ io, nToken, &nScore, &iAdj |
+ ); |
+ if( rc==SQLITE_OK && nScore>nBestScore ){ |
+ nBestScore = nScore; |
+ iBestCol = i; |
+ iBestStart = iAdj; |
+ nColSize = nDocsize; |
+ } |
+ |
+ if( rc==SQLITE_OK && sFinder.nFirst && nDocsize>nToken ){ |
+ for(jj=0; jj<(sFinder.nFirst-1); jj++){ |
+ if( sFinder.aFirst[jj+1]>io ) break; |
+ } |
+ |
+ if( sFinder.aFirst[jj]<io ){ |
+ memset(aSeen, 0, nPhrase); |
+ rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i, |
+ sFinder.aFirst[jj], nToken, &nScore, 0 |
+ ); |
+ |
+ nScore += (sFinder.aFirst[jj]==0 ? 120 : 100); |
+ if( rc==SQLITE_OK && nScore>nBestScore ){ |
+ nBestScore = nScore; |
+ iBestCol = i; |
+ iBestStart = sFinder.aFirst[jj]; |
+ nColSize = nDocsize; |
+ } |
+ } |
+ } |
+ } |
+ } |
+ } |
+ |
+ if( rc==SQLITE_OK ){ |
+ rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn); |
+ } |
+ if( rc==SQLITE_OK && nColSize==0 ){ |
+ rc = pApi->xColumnSize(pFts, iBestCol, &nColSize); |
+ } |
+ if( ctx.zIn ){ |
+ if( rc==SQLITE_OK ){ |
+ rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter); |
+ } |
+ |
+ ctx.iRangeStart = iBestStart; |
+ ctx.iRangeEnd = iBestStart + nToken - 1; |
+ |
+ if( iBestStart>0 ){ |
+ fts5HighlightAppend(&rc, &ctx, zEllips, -1); |
+ } |
+ |
+ /* Advance iterator ctx.iter so that it points to the first coalesced |
+ ** phrase instance at or following position iBestStart. */ |
+ while( ctx.iter.iStart>=0 && ctx.iter.iStart<iBestStart && rc==SQLITE_OK ){ |
+ rc = fts5CInstIterNext(&ctx.iter); |
+ } |
+ |
+ if( rc==SQLITE_OK ){ |
+ rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb); |
+ } |
+ if( ctx.iRangeEnd>=(nColSize-1) ){ |
+ fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff); |
+ }else{ |
+ fts5HighlightAppend(&rc, &ctx, zEllips, -1); |
+ } |
+ } |
+ if( rc==SQLITE_OK ){ |
+ sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT); |
+ }else{ |
+ sqlite3_result_error_code(pCtx, rc); |
+ } |
+ sqlite3_free(ctx.zOut); |
+ sqlite3_free(aSeen); |
+ sqlite3_free(sFinder.aFirst); |
+} |
+ |
+/************************************************************************/ |
+ |
+/* |
+** The first time the bm25() function is called for a query, an instance |
+** of the following structure is allocated and populated. |
+*/ |
+typedef struct Fts5Bm25Data Fts5Bm25Data; |
+struct Fts5Bm25Data { |
+ int nPhrase; /* Number of phrases in query */ |
+ double avgdl; /* Average number of tokens in each row */ |
+ double *aIDF; /* IDF for each phrase */ |
+ double *aFreq; /* Array used to calculate phrase freq. */ |
+}; |
+ |
+/* |
+** Callback used by fts5Bm25GetData() to count the number of rows in the |
+** table matched by each individual phrase within the query. |
+*/ |
+static int fts5CountCb( |
+ const Fts5ExtensionApi *pApi, |
+ Fts5Context *pFts, |
+ void *pUserData /* Pointer to sqlite3_int64 variable */ |
+){ |
+ sqlite3_int64 *pn = (sqlite3_int64*)pUserData; |
+ UNUSED_PARAM2(pApi, pFts); |
+ (*pn)++; |
+ return SQLITE_OK; |
+} |
+ |
+/* |
+** Set *ppData to point to the Fts5Bm25Data object for the current query. |
+** If the object has not already been allocated, allocate and populate it |
+** now. |
+*/ |
+static int fts5Bm25GetData( |
+ const Fts5ExtensionApi *pApi, |
+ Fts5Context *pFts, |
+ Fts5Bm25Data **ppData /* OUT: bm25-data object for this query */ |
+){ |
+ int rc = SQLITE_OK; /* Return code */ |
+ Fts5Bm25Data *p; /* Object to return */ |
+ |
+ p = pApi->xGetAuxdata(pFts, 0); |
+ if( p==0 ){ |
+ int nPhrase; /* Number of phrases in query */ |
+ sqlite3_int64 nRow = 0; /* Number of rows in table */ |
+ sqlite3_int64 nToken = 0; /* Number of tokens in table */ |
+ int nByte; /* Bytes of space to allocate */ |
+ int i; |
+ |
+ /* Allocate the Fts5Bm25Data object */ |
+ nPhrase = pApi->xPhraseCount(pFts); |
+ nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double); |
+ p = (Fts5Bm25Data*)sqlite3_malloc(nByte); |
+ if( p==0 ){ |
+ rc = SQLITE_NOMEM; |
+ }else{ |
+ memset(p, 0, nByte); |
+ p->nPhrase = nPhrase; |
+ p->aIDF = (double*)&p[1]; |
+ p->aFreq = &p->aIDF[nPhrase]; |
+ } |
+ |
+ /* Calculate the average document length for this FTS5 table */ |
+ if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow); |
+ if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken); |
+ if( rc==SQLITE_OK ) p->avgdl = (double)nToken / (double)nRow; |
+ |
+ /* Calculate an IDF for each phrase in the query */ |
+ for(i=0; rc==SQLITE_OK && i<nPhrase; i++){ |
+ sqlite3_int64 nHit = 0; |
+ rc = pApi->xQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb); |
+ if( rc==SQLITE_OK ){ |
+ /* Calculate the IDF (Inverse Document Frequency) for phrase i. |
+ ** This is done using the standard BM25 formula as found on wikipedia: |
+ ** |
+ ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) ) |
+ ** |
+ ** where "N" is the total number of documents in the set and nHit |
+ ** is the number that contain at least one instance of the phrase |
+ ** under consideration. |
+ ** |
+ ** The problem with this is that if (N < 2*nHit), the IDF is |
+ ** negative. Which is undesirable. So the mimimum allowable IDF is |
+ ** (1e-6) - roughly the same as a term that appears in just over |
+ ** half of set of 5,000,000 documents. */ |
+ double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) ); |
+ if( idf<=0.0 ) idf = 1e-6; |
+ p->aIDF[i] = idf; |
+ } |
+ } |
+ |
+ if( rc!=SQLITE_OK ){ |
+ sqlite3_free(p); |
+ }else{ |
+ rc = pApi->xSetAuxdata(pFts, p, sqlite3_free); |
+ } |
+ if( rc!=SQLITE_OK ) p = 0; |
+ } |
+ *ppData = p; |
+ return rc; |
+} |
+ |
+/* |
+** Implementation of bm25() function. |
+*/ |
+static void fts5Bm25Function( |
+ const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ |
+ Fts5Context *pFts, /* First arg to pass to pApi functions */ |
+ sqlite3_context *pCtx, /* Context for returning result/error */ |
+ int nVal, /* Number of values in apVal[] array */ |
+ sqlite3_value **apVal /* Array of trailing arguments */ |
+){ |
+ const double k1 = 1.2; /* Constant "k1" from BM25 formula */ |
+ const double b = 0.75; /* Constant "b" from BM25 formula */ |
+ int rc = SQLITE_OK; /* Error code */ |
+ double score = 0.0; /* SQL function return value */ |
+ Fts5Bm25Data *pData; /* Values allocated/calculated once only */ |
+ int i; /* Iterator variable */ |
+ int nInst = 0; /* Value returned by xInstCount() */ |
+ double D = 0.0; /* Total number of tokens in row */ |
+ double *aFreq = 0; /* Array of phrase freq. for current row */ |
+ |
+ /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation) |
+ ** for each phrase in the query for the current row. */ |
+ rc = fts5Bm25GetData(pApi, pFts, &pData); |
+ if( rc==SQLITE_OK ){ |
+ aFreq = pData->aFreq; |
+ memset(aFreq, 0, sizeof(double) * pData->nPhrase); |
+ rc = pApi->xInstCount(pFts, &nInst); |
+ } |
+ for(i=0; rc==SQLITE_OK && i<nInst; i++){ |
+ int ip; int ic; int io; |
+ rc = pApi->xInst(pFts, i, &ip, &ic, &io); |
+ if( rc==SQLITE_OK ){ |
+ double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0; |
+ aFreq[ip] += w; |
+ } |
+ } |
+ |
+ /* Figure out the total size of the current row in tokens. */ |
+ if( rc==SQLITE_OK ){ |
+ int nTok; |
+ rc = pApi->xColumnSize(pFts, -1, &nTok); |
+ D = (double)nTok; |
+ } |
+ |
+ /* Determine the BM25 score for the current row. */ |
+ for(i=0; rc==SQLITE_OK && i<pData->nPhrase; i++){ |
+ score += pData->aIDF[i] * ( |
+ ( aFreq[i] * (k1 + 1.0) ) / |
+ ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) ) |
+ ); |
+ } |
+ |
+ /* If no error has occurred, return the calculated score. Otherwise, |
+ ** throw an SQL exception. */ |
+ if( rc==SQLITE_OK ){ |
+ sqlite3_result_double(pCtx, -1.0 * score); |
+ }else{ |
+ sqlite3_result_error_code(pCtx, rc); |
+ } |
+} |
+ |
+int sqlite3Fts5AuxInit(fts5_api *pApi){ |
+ struct Builtin { |
+ const char *zFunc; /* Function name (nul-terminated) */ |
+ void *pUserData; /* User-data pointer */ |
+ fts5_extension_function xFunc;/* Callback function */ |
+ void (*xDestroy)(void*); /* Destructor function */ |
+ } aBuiltin [] = { |
+ { "snippet", 0, fts5SnippetFunction, 0 }, |
+ { "highlight", 0, fts5HighlightFunction, 0 }, |
+ { "bm25", 0, fts5Bm25Function, 0 }, |
+ }; |
+ int rc = SQLITE_OK; /* Return code */ |
+ int i; /* To iterate through builtin functions */ |
+ |
+ for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){ |
+ rc = pApi->xCreateFunction(pApi, |
+ aBuiltin[i].zFunc, |
+ aBuiltin[i].pUserData, |
+ aBuiltin[i].xFunc, |
+ aBuiltin[i].xDestroy |
+ ); |
+ } |
+ |
+ return rc; |
+} |
+ |
+ |