OLD | NEW |
| (Empty) |
1 /* | |
2 ** 2014 August 11 | |
3 ** | |
4 ** The author disclaims copyright to this source code. In place of | |
5 ** a legal notice, here is a blessing: | |
6 ** | |
7 ** May you do good and not evil. | |
8 ** May you find forgiveness for yourself and forgive others. | |
9 ** May you share freely, never taking more than you give. | |
10 ** | |
11 ****************************************************************************** | |
12 ** | |
13 */ | |
14 | |
15 | |
16 | |
17 #include "fts5Int.h" | |
18 | |
19 typedef struct Fts5HashEntry Fts5HashEntry; | |
20 | |
21 /* | |
22 ** This file contains the implementation of an in-memory hash table used | |
23 ** to accumuluate "term -> doclist" content before it is flused to a level-0 | |
24 ** segment. | |
25 */ | |
26 | |
27 | |
28 struct Fts5Hash { | |
29 int *pnByte; /* Pointer to bytes counter */ | |
30 int nEntry; /* Number of entries currently in hash */ | |
31 int nSlot; /* Size of aSlot[] array */ | |
32 Fts5HashEntry *pScan; /* Current ordered scan item */ | |
33 Fts5HashEntry **aSlot; /* Array of hash slots */ | |
34 }; | |
35 | |
36 /* | |
37 ** Each entry in the hash table is represented by an object of the | |
38 ** following type. Each object, its key (zKey[]) and its current data | |
39 ** are stored in a single memory allocation. The position list data | |
40 ** immediately follows the key data in memory. | |
41 ** | |
42 ** The data that follows the key is in a similar, but not identical format | |
43 ** to the doclist data stored in the database. It is: | |
44 ** | |
45 ** * Rowid, as a varint | |
46 ** * Position list, without 0x00 terminator. | |
47 ** * Size of previous position list and rowid, as a 4 byte | |
48 ** big-endian integer. | |
49 ** | |
50 ** iRowidOff: | |
51 ** Offset of last rowid written to data area. Relative to first byte of | |
52 ** structure. | |
53 ** | |
54 ** nData: | |
55 ** Bytes of data written since iRowidOff. | |
56 */ | |
57 struct Fts5HashEntry { | |
58 Fts5HashEntry *pHashNext; /* Next hash entry with same hash-key */ | |
59 Fts5HashEntry *pScanNext; /* Next entry in sorted order */ | |
60 | |
61 int nAlloc; /* Total size of allocation */ | |
62 int iSzPoslist; /* Offset of space for 4-byte poslist size */ | |
63 int nData; /* Total bytes of data (incl. structure) */ | |
64 u8 bDel; /* Set delete-flag @ iSzPoslist */ | |
65 | |
66 int iCol; /* Column of last value written */ | |
67 int iPos; /* Position of last value written */ | |
68 i64 iRowid; /* Rowid of last value written */ | |
69 char zKey[8]; /* Nul-terminated entry key */ | |
70 }; | |
71 | |
72 /* | |
73 ** Size of Fts5HashEntry without the zKey[] array. | |
74 */ | |
75 #define FTS5_HASHENTRYSIZE (sizeof(Fts5HashEntry)-8) | |
76 | |
77 | |
78 | |
79 /* | |
80 ** Allocate a new hash table. | |
81 */ | |
82 int sqlite3Fts5HashNew(Fts5Hash **ppNew, int *pnByte){ | |
83 int rc = SQLITE_OK; | |
84 Fts5Hash *pNew; | |
85 | |
86 *ppNew = pNew = (Fts5Hash*)sqlite3_malloc(sizeof(Fts5Hash)); | |
87 if( pNew==0 ){ | |
88 rc = SQLITE_NOMEM; | |
89 }else{ | |
90 int nByte; | |
91 memset(pNew, 0, sizeof(Fts5Hash)); | |
92 pNew->pnByte = pnByte; | |
93 | |
94 pNew->nSlot = 1024; | |
95 nByte = sizeof(Fts5HashEntry*) * pNew->nSlot; | |
96 pNew->aSlot = (Fts5HashEntry**)sqlite3_malloc(nByte); | |
97 if( pNew->aSlot==0 ){ | |
98 sqlite3_free(pNew); | |
99 *ppNew = 0; | |
100 rc = SQLITE_NOMEM; | |
101 }else{ | |
102 memset(pNew->aSlot, 0, nByte); | |
103 } | |
104 } | |
105 return rc; | |
106 } | |
107 | |
108 /* | |
109 ** Free a hash table object. | |
110 */ | |
111 void sqlite3Fts5HashFree(Fts5Hash *pHash){ | |
112 if( pHash ){ | |
113 sqlite3Fts5HashClear(pHash); | |
114 sqlite3_free(pHash->aSlot); | |
115 sqlite3_free(pHash); | |
116 } | |
117 } | |
118 | |
119 /* | |
120 ** Empty (but do not delete) a hash table. | |
121 */ | |
122 void sqlite3Fts5HashClear(Fts5Hash *pHash){ | |
123 int i; | |
124 for(i=0; i<pHash->nSlot; i++){ | |
125 Fts5HashEntry *pNext; | |
126 Fts5HashEntry *pSlot; | |
127 for(pSlot=pHash->aSlot[i]; pSlot; pSlot=pNext){ | |
128 pNext = pSlot->pHashNext; | |
129 sqlite3_free(pSlot); | |
130 } | |
131 } | |
132 memset(pHash->aSlot, 0, pHash->nSlot * sizeof(Fts5HashEntry*)); | |
133 pHash->nEntry = 0; | |
134 } | |
135 | |
136 static unsigned int fts5HashKey(int nSlot, const u8 *p, int n){ | |
137 int i; | |
138 unsigned int h = 13; | |
139 for(i=n-1; i>=0; i--){ | |
140 h = (h << 3) ^ h ^ p[i]; | |
141 } | |
142 return (h % nSlot); | |
143 } | |
144 | |
145 static unsigned int fts5HashKey2(int nSlot, u8 b, const u8 *p, int n){ | |
146 int i; | |
147 unsigned int h = 13; | |
148 for(i=n-1; i>=0; i--){ | |
149 h = (h << 3) ^ h ^ p[i]; | |
150 } | |
151 h = (h << 3) ^ h ^ b; | |
152 return (h % nSlot); | |
153 } | |
154 | |
155 /* | |
156 ** Resize the hash table by doubling the number of slots. | |
157 */ | |
158 static int fts5HashResize(Fts5Hash *pHash){ | |
159 int nNew = pHash->nSlot*2; | |
160 int i; | |
161 Fts5HashEntry **apNew; | |
162 Fts5HashEntry **apOld = pHash->aSlot; | |
163 | |
164 apNew = (Fts5HashEntry**)sqlite3_malloc(nNew*sizeof(Fts5HashEntry*)); | |
165 if( !apNew ) return SQLITE_NOMEM; | |
166 memset(apNew, 0, nNew*sizeof(Fts5HashEntry*)); | |
167 | |
168 for(i=0; i<pHash->nSlot; i++){ | |
169 while( apOld[i] ){ | |
170 int iHash; | |
171 Fts5HashEntry *p = apOld[i]; | |
172 apOld[i] = p->pHashNext; | |
173 iHash = fts5HashKey(nNew, (u8*)p->zKey, (int)strlen(p->zKey)); | |
174 p->pHashNext = apNew[iHash]; | |
175 apNew[iHash] = p; | |
176 } | |
177 } | |
178 | |
179 sqlite3_free(apOld); | |
180 pHash->nSlot = nNew; | |
181 pHash->aSlot = apNew; | |
182 return SQLITE_OK; | |
183 } | |
184 | |
185 static void fts5HashAddPoslistSize(Fts5HashEntry *p){ | |
186 if( p->iSzPoslist ){ | |
187 u8 *pPtr = (u8*)p; | |
188 int nSz = (p->nData - p->iSzPoslist - 1); /* Size in bytes */ | |
189 int nPos = nSz*2 + p->bDel; /* Value of nPos field */ | |
190 | |
191 assert( p->bDel==0 || p->bDel==1 ); | |
192 if( nPos<=127 ){ | |
193 pPtr[p->iSzPoslist] = (u8)nPos; | |
194 }else{ | |
195 int nByte = sqlite3Fts5GetVarintLen((u32)nPos); | |
196 memmove(&pPtr[p->iSzPoslist + nByte], &pPtr[p->iSzPoslist + 1], nSz); | |
197 sqlite3Fts5PutVarint(&pPtr[p->iSzPoslist], nPos); | |
198 p->nData += (nByte-1); | |
199 } | |
200 p->bDel = 0; | |
201 p->iSzPoslist = 0; | |
202 } | |
203 } | |
204 | |
205 int sqlite3Fts5HashWrite( | |
206 Fts5Hash *pHash, | |
207 i64 iRowid, /* Rowid for this entry */ | |
208 int iCol, /* Column token appears in (-ve -> delete) */ | |
209 int iPos, /* Position of token within column */ | |
210 char bByte, /* First byte of token */ | |
211 const char *pToken, int nToken /* Token to add or remove to or from index */ | |
212 ){ | |
213 unsigned int iHash; | |
214 Fts5HashEntry *p; | |
215 u8 *pPtr; | |
216 int nIncr = 0; /* Amount to increment (*pHash->pnByte) by */ | |
217 | |
218 /* Attempt to locate an existing hash entry */ | |
219 iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken); | |
220 for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){ | |
221 if( p->zKey[0]==bByte | |
222 && memcmp(&p->zKey[1], pToken, nToken)==0 | |
223 && p->zKey[nToken+1]==0 | |
224 ){ | |
225 break; | |
226 } | |
227 } | |
228 | |
229 /* If an existing hash entry cannot be found, create a new one. */ | |
230 if( p==0 ){ | |
231 int nByte = FTS5_HASHENTRYSIZE + (nToken+1) + 1 + 64; | |
232 if( nByte<128 ) nByte = 128; | |
233 | |
234 if( (pHash->nEntry*2)>=pHash->nSlot ){ | |
235 int rc = fts5HashResize(pHash); | |
236 if( rc!=SQLITE_OK ) return rc; | |
237 iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken); | |
238 } | |
239 | |
240 p = (Fts5HashEntry*)sqlite3_malloc(nByte); | |
241 if( !p ) return SQLITE_NOMEM; | |
242 memset(p, 0, FTS5_HASHENTRYSIZE); | |
243 p->nAlloc = nByte; | |
244 p->zKey[0] = bByte; | |
245 memcpy(&p->zKey[1], pToken, nToken); | |
246 assert( iHash==fts5HashKey(pHash->nSlot, (u8*)p->zKey, nToken+1) ); | |
247 p->zKey[nToken+1] = '\0'; | |
248 p->nData = nToken+1 + 1 + FTS5_HASHENTRYSIZE; | |
249 p->nData += sqlite3Fts5PutVarint(&((u8*)p)[p->nData], iRowid); | |
250 p->iSzPoslist = p->nData; | |
251 p->nData += 1; | |
252 p->iRowid = iRowid; | |
253 p->pHashNext = pHash->aSlot[iHash]; | |
254 pHash->aSlot[iHash] = p; | |
255 pHash->nEntry++; | |
256 nIncr += p->nData; | |
257 } | |
258 | |
259 /* Check there is enough space to append a new entry. Worst case scenario | |
260 ** is: | |
261 ** | |
262 ** + 9 bytes for a new rowid, | |
263 ** + 4 byte reserved for the "poslist size" varint. | |
264 ** + 1 byte for a "new column" byte, | |
265 ** + 3 bytes for a new column number (16-bit max) as a varint, | |
266 ** + 5 bytes for the new position offset (32-bit max). | |
267 */ | |
268 if( (p->nAlloc - p->nData) < (9 + 4 + 1 + 3 + 5) ){ | |
269 int nNew = p->nAlloc * 2; | |
270 Fts5HashEntry *pNew; | |
271 Fts5HashEntry **pp; | |
272 pNew = (Fts5HashEntry*)sqlite3_realloc(p, nNew); | |
273 if( pNew==0 ) return SQLITE_NOMEM; | |
274 pNew->nAlloc = nNew; | |
275 for(pp=&pHash->aSlot[iHash]; *pp!=p; pp=&(*pp)->pHashNext); | |
276 *pp = pNew; | |
277 p = pNew; | |
278 } | |
279 pPtr = (u8*)p; | |
280 nIncr -= p->nData; | |
281 | |
282 /* If this is a new rowid, append the 4-byte size field for the previous | |
283 ** entry, and the new rowid for this entry. */ | |
284 if( iRowid!=p->iRowid ){ | |
285 fts5HashAddPoslistSize(p); | |
286 p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iRowid - p->iRowid); | |
287 p->iSzPoslist = p->nData; | |
288 p->nData += 1; | |
289 p->iCol = 0; | |
290 p->iPos = 0; | |
291 p->iRowid = iRowid; | |
292 } | |
293 | |
294 if( iCol>=0 ){ | |
295 /* Append a new column value, if necessary */ | |
296 assert( iCol>=p->iCol ); | |
297 if( iCol!=p->iCol ){ | |
298 pPtr[p->nData++] = 0x01; | |
299 p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iCol); | |
300 p->iCol = iCol; | |
301 p->iPos = 0; | |
302 } | |
303 | |
304 /* Append the new position offset */ | |
305 p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iPos - p->iPos + 2); | |
306 p->iPos = iPos; | |
307 }else{ | |
308 /* This is a delete. Set the delete flag. */ | |
309 p->bDel = 1; | |
310 } | |
311 nIncr += p->nData; | |
312 | |
313 *pHash->pnByte += nIncr; | |
314 return SQLITE_OK; | |
315 } | |
316 | |
317 | |
318 /* | |
319 ** Arguments pLeft and pRight point to linked-lists of hash-entry objects, | |
320 ** each sorted in key order. This function merges the two lists into a | |
321 ** single list and returns a pointer to its first element. | |
322 */ | |
323 static Fts5HashEntry *fts5HashEntryMerge( | |
324 Fts5HashEntry *pLeft, | |
325 Fts5HashEntry *pRight | |
326 ){ | |
327 Fts5HashEntry *p1 = pLeft; | |
328 Fts5HashEntry *p2 = pRight; | |
329 Fts5HashEntry *pRet = 0; | |
330 Fts5HashEntry **ppOut = &pRet; | |
331 | |
332 while( p1 || p2 ){ | |
333 if( p1==0 ){ | |
334 *ppOut = p2; | |
335 p2 = 0; | |
336 }else if( p2==0 ){ | |
337 *ppOut = p1; | |
338 p1 = 0; | |
339 }else{ | |
340 int i = 0; | |
341 while( p1->zKey[i]==p2->zKey[i] ) i++; | |
342 | |
343 if( ((u8)p1->zKey[i])>((u8)p2->zKey[i]) ){ | |
344 /* p2 is smaller */ | |
345 *ppOut = p2; | |
346 ppOut = &p2->pScanNext; | |
347 p2 = p2->pScanNext; | |
348 }else{ | |
349 /* p1 is smaller */ | |
350 *ppOut = p1; | |
351 ppOut = &p1->pScanNext; | |
352 p1 = p1->pScanNext; | |
353 } | |
354 *ppOut = 0; | |
355 } | |
356 } | |
357 | |
358 return pRet; | |
359 } | |
360 | |
361 /* | |
362 ** Extract all tokens from hash table iHash and link them into a list | |
363 ** in sorted order. The hash table is cleared before returning. It is | |
364 ** the responsibility of the caller to free the elements of the returned | |
365 ** list. | |
366 */ | |
367 static int fts5HashEntrySort( | |
368 Fts5Hash *pHash, | |
369 const char *pTerm, int nTerm, /* Query prefix, if any */ | |
370 Fts5HashEntry **ppSorted | |
371 ){ | |
372 const int nMergeSlot = 32; | |
373 Fts5HashEntry **ap; | |
374 Fts5HashEntry *pList; | |
375 int iSlot; | |
376 int i; | |
377 | |
378 *ppSorted = 0; | |
379 ap = sqlite3_malloc(sizeof(Fts5HashEntry*) * nMergeSlot); | |
380 if( !ap ) return SQLITE_NOMEM; | |
381 memset(ap, 0, sizeof(Fts5HashEntry*) * nMergeSlot); | |
382 | |
383 for(iSlot=0; iSlot<pHash->nSlot; iSlot++){ | |
384 Fts5HashEntry *pIter; | |
385 for(pIter=pHash->aSlot[iSlot]; pIter; pIter=pIter->pHashNext){ | |
386 if( pTerm==0 || 0==memcmp(pIter->zKey, pTerm, nTerm) ){ | |
387 Fts5HashEntry *pEntry = pIter; | |
388 pEntry->pScanNext = 0; | |
389 for(i=0; ap[i]; i++){ | |
390 pEntry = fts5HashEntryMerge(pEntry, ap[i]); | |
391 ap[i] = 0; | |
392 } | |
393 ap[i] = pEntry; | |
394 } | |
395 } | |
396 } | |
397 | |
398 pList = 0; | |
399 for(i=0; i<nMergeSlot; i++){ | |
400 pList = fts5HashEntryMerge(pList, ap[i]); | |
401 } | |
402 | |
403 pHash->nEntry = 0; | |
404 sqlite3_free(ap); | |
405 *ppSorted = pList; | |
406 return SQLITE_OK; | |
407 } | |
408 | |
409 /* | |
410 ** Query the hash table for a doclist associated with term pTerm/nTerm. | |
411 */ | |
412 int sqlite3Fts5HashQuery( | |
413 Fts5Hash *pHash, /* Hash table to query */ | |
414 const char *pTerm, int nTerm, /* Query term */ | |
415 const u8 **ppDoclist, /* OUT: Pointer to doclist for pTerm */ | |
416 int *pnDoclist /* OUT: Size of doclist in bytes */ | |
417 ){ | |
418 unsigned int iHash = fts5HashKey(pHash->nSlot, (const u8*)pTerm, nTerm); | |
419 Fts5HashEntry *p; | |
420 | |
421 for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){ | |
422 if( memcmp(p->zKey, pTerm, nTerm)==0 && p->zKey[nTerm]==0 ) break; | |
423 } | |
424 | |
425 if( p ){ | |
426 fts5HashAddPoslistSize(p); | |
427 *ppDoclist = (const u8*)&p->zKey[nTerm+1]; | |
428 *pnDoclist = p->nData - (FTS5_HASHENTRYSIZE + nTerm + 1); | |
429 }else{ | |
430 *ppDoclist = 0; | |
431 *pnDoclist = 0; | |
432 } | |
433 | |
434 return SQLITE_OK; | |
435 } | |
436 | |
437 int sqlite3Fts5HashScanInit( | |
438 Fts5Hash *p, /* Hash table to query */ | |
439 const char *pTerm, int nTerm /* Query prefix */ | |
440 ){ | |
441 return fts5HashEntrySort(p, pTerm, nTerm, &p->pScan); | |
442 } | |
443 | |
444 void sqlite3Fts5HashScanNext(Fts5Hash *p){ | |
445 assert( !sqlite3Fts5HashScanEof(p) ); | |
446 p->pScan = p->pScan->pScanNext; | |
447 } | |
448 | |
449 int sqlite3Fts5HashScanEof(Fts5Hash *p){ | |
450 return (p->pScan==0); | |
451 } | |
452 | |
453 void sqlite3Fts5HashScanEntry( | |
454 Fts5Hash *pHash, | |
455 const char **pzTerm, /* OUT: term (nul-terminated) */ | |
456 const u8 **ppDoclist, /* OUT: pointer to doclist */ | |
457 int *pnDoclist /* OUT: size of doclist in bytes */ | |
458 ){ | |
459 Fts5HashEntry *p; | |
460 if( (p = pHash->pScan) ){ | |
461 int nTerm = (int)strlen(p->zKey); | |
462 fts5HashAddPoslistSize(p); | |
463 *pzTerm = p->zKey; | |
464 *ppDoclist = (const u8*)&p->zKey[nTerm+1]; | |
465 *pnDoclist = p->nData - (FTS5_HASHENTRYSIZE + nTerm + 1); | |
466 }else{ | |
467 *pzTerm = 0; | |
468 *ppDoclist = 0; | |
469 *pnDoclist = 0; | |
470 } | |
471 } | |
472 | |
OLD | NEW |