Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: third_party/sqlite/src/btree.c

Issue 3108030: Move bundled copy of sqlite one level deeper to better separate it... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: Created 10 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « third_party/sqlite/src/btree.h ('k') | third_party/sqlite/src/btreeInt.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** $Id: btree.c,v 1.705 2009/08/10 03:57:58 shane Exp $
13 **
14 ** This file implements an external (disk-based) database using BTrees.
15 ** See the header comment on "btreeInt.h" for additional information.
16 ** Including a description of file format and an overview of operation.
17 */
18 #include "btreeInt.h"
19
20 /*
21 ** The header string that appears at the beginning of every
22 ** SQLite database.
23 */
24 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
25
26 /*
27 ** The header string that appears at the beginning of a SQLite
28 ** database which has been poisoned.
29 */
30 static const char zPoisonHeader[] = "SQLite poison 3";
31
32 /*
33 ** Set this global variable to 1 to enable tracing using the TRACE
34 ** macro.
35 */
36 #if 0
37 int sqlite3BtreeTrace=1; /* True to enable tracing */
38 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);}
39 #else
40 # define TRACE(X)
41 #endif
42
43
44
45 #ifndef SQLITE_OMIT_SHARED_CACHE
46 /*
47 ** A list of BtShared objects that are eligible for participation
48 ** in shared cache. This variable has file scope during normal builds,
49 ** but the test harness needs to access it so we make it global for
50 ** test builds.
51 **
52 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
53 */
54 #ifdef SQLITE_TEST
55 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
56 #else
57 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
58 #endif
59 #endif /* SQLITE_OMIT_SHARED_CACHE */
60
61 #ifndef SQLITE_OMIT_SHARED_CACHE
62 /*
63 ** Enable or disable the shared pager and schema features.
64 **
65 ** This routine has no effect on existing database connections.
66 ** The shared cache setting affects only future calls to
67 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
68 */
69 int sqlite3_enable_shared_cache(int enable){
70 sqlite3GlobalConfig.sharedCacheEnabled = enable;
71 return SQLITE_OK;
72 }
73 #endif
74
75
76
77 #ifdef SQLITE_OMIT_SHARED_CACHE
78 /*
79 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
80 ** and clearAllSharedCacheTableLocks()
81 ** manipulate entries in the BtShared.pLock linked list used to store
82 ** shared-cache table level locks. If the library is compiled with the
83 ** shared-cache feature disabled, then there is only ever one user
84 ** of each BtShared structure and so this locking is not necessary.
85 ** So define the lock related functions as no-ops.
86 */
87 #define querySharedCacheTableLock(a,b,c) SQLITE_OK
88 #define setSharedCacheTableLock(a,b,c) SQLITE_OK
89 #define clearAllSharedCacheTableLocks(a)
90 #define downgradeAllSharedCacheTableLocks(a)
91 #define hasSharedCacheTableLock(a,b,c,d) 1
92 #define hasReadConflicts(a, b) 0
93 #endif
94
95 #ifndef SQLITE_OMIT_SHARED_CACHE
96
97 #ifdef SQLITE_DEBUG
98 /*
99 ** This function is only used as part of an assert() statement. It checks
100 ** that connection p holds the required locks to read or write to the
101 ** b-tree with root page iRoot. If so, true is returned. Otherwise, false.
102 ** For example, when writing to a table b-tree with root-page iRoot via
103 ** Btree connection pBtree:
104 **
105 ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
106 **
107 ** When writing to an index b-tree that resides in a sharable database, the
108 ** caller should have first obtained a lock specifying the root page of
109 ** the corresponding table b-tree. This makes things a bit more complicated,
110 ** as this module treats each b-tree as a separate structure. To determine
111 ** the table b-tree corresponding to the index b-tree being written, this
112 ** function has to search through the database schema.
113 **
114 ** Instead of a lock on the b-tree rooted at page iRoot, the caller may
115 ** hold a write-lock on the schema table (root page 1). This is also
116 ** acceptable.
117 */
118 static int hasSharedCacheTableLock(
119 Btree *pBtree, /* Handle that must hold lock */
120 Pgno iRoot, /* Root page of b-tree */
121 int isIndex, /* True if iRoot is the root of an index b-tree */
122 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */
123 ){
124 Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
125 Pgno iTab = 0;
126 BtLock *pLock;
127
128 /* If this b-tree database is not shareable, or if the client is reading
129 ** and has the read-uncommitted flag set, then no lock is required.
130 ** In these cases return true immediately. If the client is reading
131 ** or writing an index b-tree, but the schema is not loaded, then return
132 ** true also. In this case the lock is required, but it is too difficult
133 ** to check if the client actually holds it. This doesn't happen very
134 ** often. */
135 if( (pBtree->sharable==0)
136 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
137 || (isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0 ))
138 ){
139 return 1;
140 }
141
142 /* Figure out the root-page that the lock should be held on. For table
143 ** b-trees, this is just the root page of the b-tree being read or
144 ** written. For index b-trees, it is the root page of the associated
145 ** table. */
146 if( isIndex ){
147 HashElem *p;
148 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
149 Index *pIdx = (Index *)sqliteHashData(p);
150 if( pIdx->tnum==(int)iRoot ){
151 iTab = pIdx->pTable->tnum;
152 }
153 }
154 }else{
155 iTab = iRoot;
156 }
157
158 /* Search for the required lock. Either a write-lock on root-page iTab, a
159 ** write-lock on the schema table, or (if the client is reading) a
160 ** read-lock on iTab will suffice. Return 1 if any of these are found. */
161 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
162 if( pLock->pBtree==pBtree
163 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
164 && pLock->eLock>=eLockType
165 ){
166 return 1;
167 }
168 }
169
170 /* Failed to find the required lock. */
171 return 0;
172 }
173
174 /*
175 ** This function is also used as part of assert() statements only. It
176 ** returns true if there exist one or more cursors open on the table
177 ** with root page iRoot that do not belong to either connection pBtree
178 ** or some other connection that has the read-uncommitted flag set.
179 **
180 ** For example, before writing to page iRoot:
181 **
182 ** assert( !hasReadConflicts(pBtree, iRoot) );
183 */
184 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
185 BtCursor *p;
186 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
187 if( p->pgnoRoot==iRoot
188 && p->pBtree!=pBtree
189 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
190 ){
191 return 1;
192 }
193 }
194 return 0;
195 }
196 #endif /* #ifdef SQLITE_DEBUG */
197
198 /*
199 ** Query to see if btree handle p may obtain a lock of type eLock
200 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
201 ** SQLITE_OK if the lock may be obtained (by calling
202 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
203 */
204 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
205 BtShared *pBt = p->pBt;
206 BtLock *pIter;
207
208 assert( sqlite3BtreeHoldsMutex(p) );
209 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
210 assert( p->db!=0 );
211 assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
212
213 /* If requesting a write-lock, then the Btree must have an open write
214 ** transaction on this file. And, obviously, for this to be so there
215 ** must be an open write transaction on the file itself.
216 */
217 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
218 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
219
220 /* This is a no-op if the shared-cache is not enabled */
221 if( !p->sharable ){
222 return SQLITE_OK;
223 }
224
225 /* If some other connection is holding an exclusive lock, the
226 ** requested lock may not be obtained.
227 */
228 if( pBt->pWriter!=p && pBt->isExclusive ){
229 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
230 return SQLITE_LOCKED_SHAREDCACHE;
231 }
232
233 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
234 /* The condition (pIter->eLock!=eLock) in the following if(...)
235 ** statement is a simplification of:
236 **
237 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
238 **
239 ** since we know that if eLock==WRITE_LOCK, then no other connection
240 ** may hold a WRITE_LOCK on any table in this file (since there can
241 ** only be a single writer).
242 */
243 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
244 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
245 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
246 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
247 if( eLock==WRITE_LOCK ){
248 assert( p==pBt->pWriter );
249 pBt->isPending = 1;
250 }
251 return SQLITE_LOCKED_SHAREDCACHE;
252 }
253 }
254 return SQLITE_OK;
255 }
256 #endif /* !SQLITE_OMIT_SHARED_CACHE */
257
258 #ifndef SQLITE_OMIT_SHARED_CACHE
259 /*
260 ** Add a lock on the table with root-page iTable to the shared-btree used
261 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
262 ** WRITE_LOCK.
263 **
264 ** This function assumes the following:
265 **
266 ** (a) The specified b-tree connection handle is connected to a sharable
267 ** b-tree database (one with the BtShared.sharable flag set), and
268 **
269 ** (b) No other b-tree connection handle holds a lock that conflicts
270 ** with the requested lock (i.e. querySharedCacheTableLock() has
271 ** already been called and returned SQLITE_OK).
272 **
273 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
274 ** is returned if a malloc attempt fails.
275 */
276 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
277 BtShared *pBt = p->pBt;
278 BtLock *pLock = 0;
279 BtLock *pIter;
280
281 assert( sqlite3BtreeHoldsMutex(p) );
282 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
283 assert( p->db!=0 );
284
285 /* A connection with the read-uncommitted flag set will never try to
286 ** obtain a read-lock using this function. The only read-lock obtained
287 ** by a connection in read-uncommitted mode is on the sqlite_master
288 ** table, and that lock is obtained in BtreeBeginTrans(). */
289 assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
290
291 /* This function should only be called on a sharable b-tree after it
292 ** has been determined that no other b-tree holds a conflicting lock. */
293 assert( p->sharable );
294 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
295
296 /* First search the list for an existing lock on this table. */
297 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
298 if( pIter->iTable==iTable && pIter->pBtree==p ){
299 pLock = pIter;
300 break;
301 }
302 }
303
304 /* If the above search did not find a BtLock struct associating Btree p
305 ** with table iTable, allocate one and link it into the list.
306 */
307 if( !pLock ){
308 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
309 if( !pLock ){
310 return SQLITE_NOMEM;
311 }
312 pLock->iTable = iTable;
313 pLock->pBtree = p;
314 pLock->pNext = pBt->pLock;
315 pBt->pLock = pLock;
316 }
317
318 /* Set the BtLock.eLock variable to the maximum of the current lock
319 ** and the requested lock. This means if a write-lock was already held
320 ** and a read-lock requested, we don't incorrectly downgrade the lock.
321 */
322 assert( WRITE_LOCK>READ_LOCK );
323 if( eLock>pLock->eLock ){
324 pLock->eLock = eLock;
325 }
326
327 return SQLITE_OK;
328 }
329 #endif /* !SQLITE_OMIT_SHARED_CACHE */
330
331 #ifndef SQLITE_OMIT_SHARED_CACHE
332 /*
333 ** Release all the table locks (locks obtained via calls to
334 ** the setSharedCacheTableLock() procedure) held by Btree handle p.
335 **
336 ** This function assumes that handle p has an open read or write
337 ** transaction. If it does not, then the BtShared.isPending variable
338 ** may be incorrectly cleared.
339 */
340 static void clearAllSharedCacheTableLocks(Btree *p){
341 BtShared *pBt = p->pBt;
342 BtLock **ppIter = &pBt->pLock;
343
344 assert( sqlite3BtreeHoldsMutex(p) );
345 assert( p->sharable || 0==*ppIter );
346 assert( p->inTrans>0 );
347
348 while( *ppIter ){
349 BtLock *pLock = *ppIter;
350 assert( pBt->isExclusive==0 || pBt->pWriter==pLock->pBtree );
351 assert( pLock->pBtree->inTrans>=pLock->eLock );
352 if( pLock->pBtree==p ){
353 *ppIter = pLock->pNext;
354 assert( pLock->iTable!=1 || pLock==&p->lock );
355 if( pLock->iTable!=1 ){
356 sqlite3_free(pLock);
357 }
358 }else{
359 ppIter = &pLock->pNext;
360 }
361 }
362
363 assert( pBt->isPending==0 || pBt->pWriter );
364 if( pBt->pWriter==p ){
365 pBt->pWriter = 0;
366 pBt->isExclusive = 0;
367 pBt->isPending = 0;
368 }else if( pBt->nTransaction==2 ){
369 /* This function is called when connection p is concluding its
370 ** transaction. If there currently exists a writer, and p is not
371 ** that writer, then the number of locks held by connections other
372 ** than the writer must be about to drop to zero. In this case
373 ** set the isPending flag to 0.
374 **
375 ** If there is not currently a writer, then BtShared.isPending must
376 ** be zero already. So this next line is harmless in that case.
377 */
378 pBt->isPending = 0;
379 }
380 }
381
382 /*
383 ** This function changes all write-locks held by connection p to read-locks.
384 */
385 static void downgradeAllSharedCacheTableLocks(Btree *p){
386 BtShared *pBt = p->pBt;
387 if( pBt->pWriter==p ){
388 BtLock *pLock;
389 pBt->pWriter = 0;
390 pBt->isExclusive = 0;
391 pBt->isPending = 0;
392 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
393 assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
394 pLock->eLock = READ_LOCK;
395 }
396 }
397 }
398
399 #endif /* SQLITE_OMIT_SHARED_CACHE */
400
401 static void releasePage(MemPage *pPage); /* Forward reference */
402
403 /*
404 ** Verify that the cursor holds a mutex on the BtShared
405 */
406 #ifndef NDEBUG
407 static int cursorHoldsMutex(BtCursor *p){
408 return sqlite3_mutex_held(p->pBt->mutex);
409 }
410 #endif
411
412
413 #ifndef SQLITE_OMIT_INCRBLOB
414 /*
415 ** Invalidate the overflow page-list cache for cursor pCur, if any.
416 */
417 static void invalidateOverflowCache(BtCursor *pCur){
418 assert( cursorHoldsMutex(pCur) );
419 sqlite3_free(pCur->aOverflow);
420 pCur->aOverflow = 0;
421 }
422
423 /*
424 ** Invalidate the overflow page-list cache for all cursors opened
425 ** on the shared btree structure pBt.
426 */
427 static void invalidateAllOverflowCache(BtShared *pBt){
428 BtCursor *p;
429 assert( sqlite3_mutex_held(pBt->mutex) );
430 for(p=pBt->pCursor; p; p=p->pNext){
431 invalidateOverflowCache(p);
432 }
433 }
434
435 /*
436 ** This function is called before modifying the contents of a table
437 ** b-tree to invalidate any incrblob cursors that are open on the
438 ** row or one of the rows being modified.
439 **
440 ** If argument isClearTable is true, then the entire contents of the
441 ** table is about to be deleted. In this case invalidate all incrblob
442 ** cursors open on any row within the table with root-page pgnoRoot.
443 **
444 ** Otherwise, if argument isClearTable is false, then the row with
445 ** rowid iRow is being replaced or deleted. In this case invalidate
446 ** only those incrblob cursors open on this specific row.
447 */
448 static void invalidateIncrblobCursors(
449 Btree *pBtree, /* The database file to check */
450 i64 iRow, /* The rowid that might be changing */
451 int isClearTable /* True if all rows are being deleted */
452 ){
453 BtCursor *p;
454 BtShared *pBt = pBtree->pBt;
455 assert( sqlite3BtreeHoldsMutex(pBtree) );
456 for(p=pBt->pCursor; p; p=p->pNext){
457 if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){
458 p->eState = CURSOR_INVALID;
459 }
460 }
461 }
462
463 #else
464 #define invalidateOverflowCache(x)
465 #define invalidateAllOverflowCache(x)
466 #define invalidateIncrblobCursors(x,y,z)
467 #endif
468
469 /*
470 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
471 ** when a page that previously contained data becomes a free-list leaf
472 ** page.
473 **
474 ** The BtShared.pHasContent bitvec exists to work around an obscure
475 ** bug caused by the interaction of two useful IO optimizations surrounding
476 ** free-list leaf pages:
477 **
478 ** 1) When all data is deleted from a page and the page becomes
479 ** a free-list leaf page, the page is not written to the database
480 ** (as free-list leaf pages contain no meaningful data). Sometimes
481 ** such a page is not even journalled (as it will not be modified,
482 ** why bother journalling it?).
483 **
484 ** 2) When a free-list leaf page is reused, its content is not read
485 ** from the database or written to the journal file (why should it
486 ** be, if it is not at all meaningful?).
487 **
488 ** By themselves, these optimizations work fine and provide a handy
489 ** performance boost to bulk delete or insert operations. However, if
490 ** a page is moved to the free-list and then reused within the same
491 ** transaction, a problem comes up. If the page is not journalled when
492 ** it is moved to the free-list and it is also not journalled when it
493 ** is extracted from the free-list and reused, then the original data
494 ** may be lost. In the event of a rollback, it may not be possible
495 ** to restore the database to its original configuration.
496 **
497 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
498 ** moved to become a free-list leaf page, the corresponding bit is
499 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
500 ** optimization 2 above is omitted if the corresponding bit is already
501 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
502 ** at the end of every transaction.
503 */
504 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
505 int rc = SQLITE_OK;
506 if( !pBt->pHasContent ){
507 int nPage = 100;
508 sqlite3PagerPagecount(pBt->pPager, &nPage);
509 /* If sqlite3PagerPagecount() fails there is no harm because the
510 ** nPage variable is unchanged from its default value of 100 */
511 pBt->pHasContent = sqlite3BitvecCreate((u32)nPage);
512 if( !pBt->pHasContent ){
513 rc = SQLITE_NOMEM;
514 }
515 }
516 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
517 rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
518 }
519 return rc;
520 }
521
522 /*
523 ** Query the BtShared.pHasContent vector.
524 **
525 ** This function is called when a free-list leaf page is removed from the
526 ** free-list for reuse. It returns false if it is safe to retrieve the
527 ** page from the pager layer with the 'no-content' flag set. True otherwise.
528 */
529 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
530 Bitvec *p = pBt->pHasContent;
531 return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
532 }
533
534 /*
535 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
536 ** invoked at the conclusion of each write-transaction.
537 */
538 static void btreeClearHasContent(BtShared *pBt){
539 sqlite3BitvecDestroy(pBt->pHasContent);
540 pBt->pHasContent = 0;
541 }
542
543 /*
544 ** Save the current cursor position in the variables BtCursor.nKey
545 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
546 **
547 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
548 ** prior to calling this routine.
549 */
550 static int saveCursorPosition(BtCursor *pCur){
551 int rc;
552
553 assert( CURSOR_VALID==pCur->eState );
554 assert( 0==pCur->pKey );
555 assert( cursorHoldsMutex(pCur) );
556
557 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
558 assert( rc==SQLITE_OK ); /* KeySize() cannot fail */
559
560 /* If this is an intKey table, then the above call to BtreeKeySize()
561 ** stores the integer key in pCur->nKey. In this case this value is
562 ** all that is required. Otherwise, if pCur is not open on an intKey
563 ** table, then malloc space for and store the pCur->nKey bytes of key
564 ** data.
565 */
566 if( 0==pCur->apPage[0]->intKey ){
567 void *pKey = sqlite3Malloc( (int)pCur->nKey );
568 if( pKey ){
569 rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
570 if( rc==SQLITE_OK ){
571 pCur->pKey = pKey;
572 }else{
573 sqlite3_free(pKey);
574 }
575 }else{
576 rc = SQLITE_NOMEM;
577 }
578 }
579 assert( !pCur->apPage[0]->intKey || !pCur->pKey );
580
581 if( rc==SQLITE_OK ){
582 int i;
583 for(i=0; i<=pCur->iPage; i++){
584 releasePage(pCur->apPage[i]);
585 pCur->apPage[i] = 0;
586 }
587 pCur->iPage = -1;
588 pCur->eState = CURSOR_REQUIRESEEK;
589 }
590
591 invalidateOverflowCache(pCur);
592 return rc;
593 }
594
595 /*
596 ** Save the positions of all cursors except pExcept open on the table
597 ** with root-page iRoot. Usually, this is called just before cursor
598 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
599 */
600 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
601 BtCursor *p;
602 assert( sqlite3_mutex_held(pBt->mutex) );
603 assert( pExcept==0 || pExcept->pBt==pBt );
604 for(p=pBt->pCursor; p; p=p->pNext){
605 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
606 p->eState==CURSOR_VALID ){
607 int rc = saveCursorPosition(p);
608 if( SQLITE_OK!=rc ){
609 return rc;
610 }
611 }
612 }
613 return SQLITE_OK;
614 }
615
616 /*
617 ** Clear the current cursor position.
618 */
619 void sqlite3BtreeClearCursor(BtCursor *pCur){
620 assert( cursorHoldsMutex(pCur) );
621 sqlite3_free(pCur->pKey);
622 pCur->pKey = 0;
623 pCur->eState = CURSOR_INVALID;
624 }
625
626 /*
627 ** In this version of BtreeMoveto, pKey is a packed index record
628 ** such as is generated by the OP_MakeRecord opcode. Unpack the
629 ** record and then call BtreeMovetoUnpacked() to do the work.
630 */
631 static int btreeMoveto(
632 BtCursor *pCur, /* Cursor open on the btree to be searched */
633 const void *pKey, /* Packed key if the btree is an index */
634 i64 nKey, /* Integer key for tables. Size of pKey for indices */
635 int bias, /* Bias search to the high end */
636 int *pRes /* Write search results here */
637 ){
638 int rc; /* Status code */
639 UnpackedRecord *pIdxKey; /* Unpacked index key */
640 char aSpace[150]; /* Temp space for pIdxKey - to avoid a malloc */
641
642 if( pKey ){
643 assert( nKey==(i64)(int)nKey );
644 pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey,
645 aSpace, sizeof(aSpace));
646 if( pIdxKey==0 ) return SQLITE_NOMEM;
647 }else{
648 pIdxKey = 0;
649 }
650 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
651 if( pKey ){
652 sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
653 }
654 return rc;
655 }
656
657 /*
658 ** Restore the cursor to the position it was in (or as close to as possible)
659 ** when saveCursorPosition() was called. Note that this call deletes the
660 ** saved position info stored by saveCursorPosition(), so there can be
661 ** at most one effective restoreCursorPosition() call after each
662 ** saveCursorPosition().
663 */
664 static int btreeRestoreCursorPosition(BtCursor *pCur){
665 int rc;
666 assert( cursorHoldsMutex(pCur) );
667 assert( pCur->eState>=CURSOR_REQUIRESEEK );
668 if( pCur->eState==CURSOR_FAULT ){
669 return pCur->skipNext;
670 }
671 pCur->eState = CURSOR_INVALID;
672 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext);
673 if( rc==SQLITE_OK ){
674 sqlite3_free(pCur->pKey);
675 pCur->pKey = 0;
676 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
677 }
678 return rc;
679 }
680
/*
** Restore cursor p only when its state is CURSOR_REQUIRESEEK or greater;
** otherwise evaluate to SQLITE_OK without calling anything.
**
** The parameter is parenthesized so that any pointer-valued expression
** may be passed.  Note that the argument is evaluated twice, so it must
** not have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
685
686 /*
687 ** Determine whether or not a cursor has moved from the position it
688 ** was last placed at. Cursors can move when the row they are pointing
689 ** at is deleted out from under them.
690 **
691 ** This routine returns an error code if something goes wrong. The
692 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
693 */
694 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
695 int rc;
696
697 rc = restoreCursorPosition(pCur);
698 if( rc ){
699 *pHasMoved = 1;
700 return rc;
701 }
702 if( pCur->eState!=CURSOR_VALID || pCur->skipNext!=0 ){
703 *pHasMoved = 1;
704 }else{
705 *pHasMoved = 0;
706 }
707 return SQLITE_OK;
708 }
709
710 #ifndef SQLITE_OMIT_AUTOVACUUM
711 /*
712 ** Given a page number of a regular database page, return the page
713 ** number for the pointer-map page that contains the entry for the
714 ** input page number.
715 */
716 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
717 int nPagesPerMapPage;
718 Pgno iPtrMap, ret;
719 assert( sqlite3_mutex_held(pBt->mutex) );
720 nPagesPerMapPage = (pBt->usableSize/5)+1;
721 iPtrMap = (pgno-2)/nPagesPerMapPage;
722 ret = (iPtrMap*nPagesPerMapPage) + 2;
723 if( ret==PENDING_BYTE_PAGE(pBt) ){
724 ret++;
725 }
726 return ret;
727 }
728
729 /*
730 ** Write an entry into the pointer map.
731 **
732 ** This routine updates the pointer map entry for page number 'key'
733 ** so that it maps to type 'eType' and parent page number 'pgno'.
734 **
735 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
736 ** a no-op. If an error occurs, the appropriate error code is written
737 ** into *pRC.
738 */
739 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
740 DbPage *pDbPage; /* The pointer map page */
741 u8 *pPtrmap; /* The pointer map data */
742 Pgno iPtrmap; /* The pointer map page number */
743 int offset; /* Offset in pointer map page */
744 int rc; /* Return code from subfunctions */
745
746 if( *pRC ) return;
747
748 assert( sqlite3_mutex_held(pBt->mutex) );
749 /* The master-journal page number must never be used as a pointer map page */
750 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
751
752 assert( pBt->autoVacuum );
753 if( key==0 ){
754 *pRC = SQLITE_CORRUPT_BKPT;
755 return;
756 }
757 iPtrmap = PTRMAP_PAGENO(pBt, key);
758 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
759 if( rc!=SQLITE_OK ){
760 *pRC = rc;
761 return;
762 }
763 offset = PTRMAP_PTROFFSET(iPtrmap, key);
764 if( offset<0 ){
765 *pRC = SQLITE_CORRUPT_BKPT;
766 goto ptrmap_exit;
767 }
768 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
769
770 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
771 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
772 *pRC= rc = sqlite3PagerWrite(pDbPage);
773 if( rc==SQLITE_OK ){
774 pPtrmap[offset] = eType;
775 put4byte(&pPtrmap[offset+1], parent);
776 }
777 }
778
779 ptrmap_exit:
780 sqlite3PagerUnref(pDbPage);
781 }
782
783 /*
784 ** Read an entry from the pointer map.
785 **
786 ** This routine retrieves the pointer map entry for page 'key', writing
787 ** the type and parent page number to *pEType and *pPgno respectively.
788 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
789 */
790 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
791 DbPage *pDbPage; /* The pointer map page */
792 int iPtrmap; /* Pointer map page index */
793 u8 *pPtrmap; /* Pointer map page data */
794 int offset; /* Offset of entry in pointer map */
795 int rc;
796
797 assert( sqlite3_mutex_held(pBt->mutex) );
798
799 iPtrmap = PTRMAP_PAGENO(pBt, key);
800 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
801 if( rc!=0 ){
802 return rc;
803 }
804 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
805
806 offset = PTRMAP_PTROFFSET(iPtrmap, key);
807 assert( pEType!=0 );
808 *pEType = pPtrmap[offset];
809 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
810
811 sqlite3PagerUnref(pDbPage);
812 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
813 return SQLITE_OK;
814 }
815
816 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
817 #define ptrmapPut(w,x,y,z,rc)
818 #define ptrmapGet(w,x,y,z) SQLITE_OK
819 #define ptrmapPutOvflPtr(x, y, rc)
820 #endif
821
822 /*
823 ** Given a btree page and a cell index (0 means the first cell on
824 ** the page, 1 means the second cell, and so forth) return a pointer
825 ** to the cell content.
826 **
827 ** This routine works only for pages that do not contain overflow cells.
828 */
/* The 2-byte entry I of the cell pointer array (which starts at offset
** cellOffset) is read, masked with maskPage, and added to the page's
** data pointer.  Both P and I are evaluated more than once. */
#define findCell(P,I) \
  ((P)->aData + \
   ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])))
831
832 /*
833 ** This is a more complex version of findCell() that works for
834 ** pages that do contain overflow cells.
835 */
836 static u8 *findOverflowCell(MemPage *pPage, int iCell){
837 int i;
838 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
839 for(i=pPage->nOverflow-1; i>=0; i--){
840 int k;
841 struct _OvflCell *pOvfl;
842 pOvfl = &pPage->aOvfl[i];
843 k = pOvfl->idx;
844 if( k<=iCell ){
845 if( k==iCell ){
846 return pOvfl->pCell;
847 }
848 iCell--;
849 }
850 }
851 return findCell(pPage, iCell);
852 }
853
854 /*
855 ** Parse a cell content block and fill in the CellInfo structure. There
856 ** are two versions of this function. btreeParseCell() takes a
857 ** cell index as the second argument and btreeParseCellPtr()
858 ** takes a pointer to the body of the cell as its second argument.
859 **
860 ** Within this file, the parseCell() macro can be called instead of
861 ** btreeParseCellPtr(). Using some compilers, this will be faster.
862 */
863 static void btreeParseCellPtr(
864 MemPage *pPage, /* Page containing the cell */
865 u8 *pCell, /* Pointer to the cell text. */
866 CellInfo *pInfo /* Fill in this structure */
867 ){
868 u16 n; /* Number bytes in cell content header */
869 u32 nPayload; /* Number of bytes of cell payload */
870
871 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
872
873 pInfo->pCell = pCell;
874 assert( pPage->leaf==0 || pPage->leaf==1 );
875 n = pPage->childPtrSize;
876 assert( n==4-4*pPage->leaf );
877 if( pPage->intKey ){
878 if( pPage->hasData ){
879 n += getVarint32(&pCell[n], nPayload);
880 }else{
881 nPayload = 0;
882 }
883 n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
884 pInfo->nData = nPayload;
885 }else{
886 pInfo->nData = 0;
887 n += getVarint32(&pCell[n], nPayload);
888 pInfo->nKey = nPayload;
889 }
890 pInfo->nPayload = nPayload;
891 pInfo->nHeader = n;
892 testcase( nPayload==pPage->maxLocal );
893 testcase( nPayload==pPage->maxLocal+1 );
894 if( likely(nPayload<=pPage->maxLocal) ){
895 /* This is the (easy) common case where the entire payload fits
896 ** on the local page. No overflow is required.
897 */
898 int nSize; /* Total size of cell content in bytes */
899 nSize = nPayload + n;
900 pInfo->nLocal = (u16)nPayload;
901 pInfo->iOverflow = 0;
902 if( (nSize & ~3)==0 ){
903 nSize = 4; /* Minimum cell size is 4 */
904 }
905 pInfo->nSize = (u16)nSize;
906 }else{
907 /* If the payload will not fit completely on the local page, we have
908 ** to decide how much to store locally and how much to spill onto
909 ** overflow pages. The strategy is to minimize the amount of unused
910 ** space on overflow pages while keeping the amount of local storage
911 ** in between minLocal and maxLocal.
912 **
913 ** Warning: changing the way overflow payload is distributed in any
914 ** way will result in an incompatible file format.
915 */
916 int minLocal; /* Minimum amount of payload held locally */
917 int maxLocal; /* Maximum amount of payload held locally */
918 int surplus; /* Overflow payload available for local storage */
919
920 minLocal = pPage->minLocal;
921 maxLocal = pPage->maxLocal;
922 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
923 testcase( surplus==maxLocal );
924 testcase( surplus==maxLocal+1 );
925 if( surplus <= maxLocal ){
926 pInfo->nLocal = (u16)surplus;
927 }else{
928 pInfo->nLocal = (u16)minLocal;
929 }
930 pInfo->iOverflow = (u16)(pInfo->nLocal + n);
931 pInfo->nSize = pInfo->iOverflow + 4;
932 }
933 }
934 #define parseCell(pPage, iCell, pInfo) \
935 btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
936 static void btreeParseCell(
937 MemPage *pPage, /* Page containing the cell */
938 int iCell, /* The cell index. First cell is 0 */
939 CellInfo *pInfo /* Fill in this structure */
940 ){
941 parseCell(pPage, iCell, pInfo);
942 }
943
944 /*
945 ** Compute the total number of bytes that a Cell needs in the cell
946 ** data area of the btree-page. The return number includes the cell
947 ** data header and the local payload, but not any overflow page or
948 ** the space used by the cell pointer.
949 */
950 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
951 u8 *pIter = &pCell[pPage->childPtrSize];
952 u32 nSize;
953
954 #ifdef SQLITE_DEBUG
955 /* The value returned by this function should always be the same as
956 ** the (CellInfo.nSize) value found by doing a full parse of the
957 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
958 ** this function verifies that this invariant is not violated. */
959 CellInfo debuginfo;
960 btreeParseCellPtr(pPage, pCell, &debuginfo);
961 #endif
962
963 if( pPage->intKey ){
964 u8 *pEnd;
965 if( pPage->hasData ){
966 pIter += getVarint32(pIter, nSize);
967 }else{
968 nSize = 0;
969 }
970
971 /* pIter now points at the 64-bit integer key value, a variable length
972 ** integer. The following block moves pIter to point at the first byte
973 ** past the end of the key value. */
974 pEnd = &pIter[9];
975 while( (*pIter++)&0x80 && pIter<pEnd );
976 }else{
977 pIter += getVarint32(pIter, nSize);
978 }
979
980 testcase( nSize==pPage->maxLocal );
981 testcase( nSize==pPage->maxLocal+1 );
982 if( nSize>pPage->maxLocal ){
983 int minLocal = pPage->minLocal;
984 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
985 testcase( nSize==pPage->maxLocal );
986 testcase( nSize==pPage->maxLocal+1 );
987 if( nSize>pPage->maxLocal ){
988 nSize = minLocal;
989 }
990 nSize += 4;
991 }
992 nSize += (u32)(pIter - pCell);
993
994 /* The minimum size of any cell is 4 bytes. */
995 if( nSize<4 ){
996 nSize = 4;
997 }
998
999 assert( nSize==debuginfo.nSize );
1000 return (u16)nSize;
1001 }
1002 #ifndef NDEBUG
1003 static u16 cellSize(MemPage *pPage, int iCell){
1004 return cellSizePtr(pPage, findCell(pPage, iCell));
1005 }
1006 #endif
1007
1008 #ifndef SQLITE_OMIT_AUTOVACUUM
1009 /*
1010 ** If the cell pCell, part of page pPage contains a pointer
1011 ** to an overflow page, insert an entry into the pointer-map
1012 ** for the overflow page.
1013 */
1014 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
1015 CellInfo info;
1016 if( *pRC ) return;
1017 assert( pCell!=0 );
1018 btreeParseCellPtr(pPage, pCell, &info);
1019 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
1020 if( info.iOverflow ){
1021 Pgno ovfl = get4byte(&pCell[info.iOverflow]);
1022 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1023 }
1024 }
1025 #endif
1026
1027
1028 /*
1029 ** Defragment the page given. All Cells are moved to the
1030 ** end of the page and all free space is collected into one
1031 ** big FreeBlk that occurs in between the header and cell
1032 ** pointer array and the cell content area.
1033 */
1034 static int defragmentPage(MemPage *pPage){
1035 int i; /* Loop counter */
1036 int pc; /* Address of a i-th cell */
1037 int hdr; /* Offset to the page header */
1038 int size; /* Size of a cell */
1039 int usableSize; /* Number of usable bytes on a page */
1040 int cellOffset; /* Offset to the cell pointer array */
1041 int cbrk; /* Offset to the cell content area */
1042 int nCell; /* Number of cells on the page */
1043 unsigned char *data; /* The page data */
1044 unsigned char *temp; /* Temp area for cell content */
1045 int iCellFirst; /* First allowable cell index */
1046 int iCellLast; /* Last possible cell index */
1047
1048
1049 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1050 assert( pPage->pBt!=0 );
1051 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1052 assert( pPage->nOverflow==0 );
1053 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1054 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
1055 data = pPage->aData;
1056 hdr = pPage->hdrOffset;
1057 cellOffset = pPage->cellOffset;
1058 nCell = pPage->nCell;
1059 assert( nCell==get2byte(&data[hdr+3]) );
1060 usableSize = pPage->pBt->usableSize;
1061 cbrk = get2byte(&data[hdr+5]);
1062 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
1063 cbrk = usableSize;
1064 iCellFirst = cellOffset + 2*nCell;
1065 iCellLast = usableSize - 4;
1066 for(i=0; i<nCell; i++){
1067 u8 *pAddr; /* The i-th cell pointer */
1068 pAddr = &data[cellOffset + i*2];
1069 pc = get2byte(pAddr);
1070 testcase( pc==iCellFirst );
1071 testcase( pc==iCellLast );
1072 #if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1073 /* These conditions have already been verified in btreeInitPage()
1074 ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined
1075 */
1076 if( pc<iCellFirst || pc>iCellLast ){
1077 return SQLITE_CORRUPT_BKPT;
1078 }
1079 #endif
1080 assert( pc>=iCellFirst && pc<=iCellLast );
1081 size = cellSizePtr(pPage, &temp[pc]);
1082 cbrk -= size;
1083 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1084 if( cbrk<iCellFirst ){
1085 return SQLITE_CORRUPT_BKPT;
1086 }
1087 #else
1088 if( cbrk<iCellFirst || pc+size>usableSize ){
1089 return SQLITE_CORRUPT_BKPT;
1090 }
1091 #endif
1092 assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
1093 testcase( cbrk+size==usableSize );
1094 testcase( pc+size==usableSize );
1095 memcpy(&data[cbrk], &temp[pc], size);
1096 put2byte(pAddr, cbrk);
1097 }
1098 assert( cbrk>=iCellFirst );
1099 put2byte(&data[hdr+5], cbrk);
1100 data[hdr+1] = 0;
1101 data[hdr+2] = 0;
1102 data[hdr+7] = 0;
1103 memset(&data[iCellFirst], 0, cbrk-iCellFirst);
1104 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1105 if( cbrk-iCellFirst!=pPage->nFree ){
1106 return SQLITE_CORRUPT_BKPT;
1107 }
1108 return SQLITE_OK;
1109 }
1110
1111 /*
1112 ** Allocate nByte bytes of space from within the B-Tree page passed
1113 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1114 ** of the first byte of allocated space. Return either SQLITE_OK or
1115 ** an error code (usually SQLITE_CORRUPT).
1116 **
1117 ** The caller guarantees that there is sufficient space to make the
1118 ** allocation. This routine might need to defragment in order to bring
1119 ** all the space together, however. This routine will avoid using
1120 ** the first two bytes past the cell pointer area since presumably this
1121 ** allocation is being made in order to insert a new cell, so we will
1122 ** also end up needing a new cell pointer.
1123 */
1124 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1125 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */
1126 u8 * const data = pPage->aData; /* Local cache of pPage->aData */
1127 int nFrag; /* Number of fragmented bytes on pPage */
1128 int top; /* First byte of cell content area */
1129 int gap; /* First byte of gap between cell pointers and cell content */
1130 int rc; /* Integer return code */
1131
1132 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1133 assert( pPage->pBt );
1134 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1135 assert( nByte>=0 ); /* Minimum cell size is 4 */
1136 assert( pPage->nFree>=nByte );
1137 assert( pPage->nOverflow==0 );
1138 assert( nByte<pPage->pBt->usableSize-8 );
1139
1140 nFrag = data[hdr+7];
1141 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1142 gap = pPage->cellOffset + 2*pPage->nCell;
1143 top = get2byte(&data[hdr+5]);
1144 if( gap>top ) return SQLITE_CORRUPT_BKPT;
1145 testcase( gap+2==top );
1146 testcase( gap+1==top );
1147 testcase( gap==top );
1148
1149 if( nFrag>=60 ){
1150 /* Always defragment highly fragmented pages */
1151 rc = defragmentPage(pPage);
1152 if( rc ) return rc;
1153 top = get2byte(&data[hdr+5]);
1154 }else if( gap+2<=top ){
1155 /* Search the freelist looking for a free slot big enough to satisfy
1156 ** the request. The allocation is made from the first free slot in
1157 ** the list that is large enough to accomadate it.
1158 */
1159 int pc, addr;
1160 for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
1161 int size = get2byte(&data[pc+2]); /* Size of free slot */
1162 if( size>=nByte ){
1163 int x = size - nByte;
1164 testcase( x==4 );
1165 testcase( x==3 );
1166 if( x<4 ){
1167 /* Remove the slot from the free-list. Update the number of
1168 ** fragmented bytes within the page. */
1169 memcpy(&data[addr], &data[pc], 2);
1170 data[hdr+7] = (u8)(nFrag + x);
1171 }else{
1172 /* The slot remains on the free-list. Reduce its size to account
1173 ** for the portion used by the new allocation. */
1174 put2byte(&data[pc+2], x);
1175 }
1176 *pIdx = pc + x;
1177 return SQLITE_OK;
1178 }
1179 }
1180 }
1181
1182 /* Check to make sure there is enough space in the gap to satisfy
1183 ** the allocation. If not, defragment.
1184 */
1185 testcase( gap+2+nByte==top );
1186 if( gap+2+nByte>top ){
1187 rc = defragmentPage(pPage);
1188 if( rc ) return rc;
1189 top = get2byte(&data[hdr+5]);
1190 assert( gap+nByte<=top );
1191 }
1192
1193
1194 /* Allocate memory from the gap in between the cell pointer array
1195 ** and the cell content area. The btreeInitPage() call has already
1196 ** validated the freelist. Given that the freelist is valid, there
1197 ** is no way that the allocation can extend off the end of the page.
1198 ** The assert() below verifies the previous sentence.
1199 */
1200 top -= nByte;
1201 put2byte(&data[hdr+5], top);
1202 assert( top+nByte <= pPage->pBt->usableSize );
1203 *pIdx = top;
1204 return SQLITE_OK;
1205 }
1206
1207 /*
1208 ** Return a section of the pPage->aData to the freelist.
1209 ** The first byte of the new free block is pPage->aDisk[start]
1210 ** and the size of the block is "size" bytes.
1211 **
1212 ** Most of the effort here is involved in coalesing adjacent
1213 ** free blocks into a single big free block.
1214 */
1215 static int freeSpace(MemPage *pPage, int start, int size){
1216 int addr, pbegin, hdr;
1217 int iLast; /* Largest possible freeblock offset */
1218 unsigned char *data = pPage->aData;
1219
1220 assert( pPage->pBt!=0 );
1221 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1222 assert( start>=pPage->hdrOffset+6+pPage->childPtrSize );
1223 assert( (start + size)<=pPage->pBt->usableSize );
1224 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1225 assert( size>=0 ); /* Minimum cell size is 4 */
1226
1227 #ifdef SQLITE_SECURE_DELETE
1228 /* Overwrite deleted information with zeros when the SECURE_DELETE
1229 ** option is enabled at compile-time */
1230 memset(&data[start], 0, size);
1231 #endif
1232
1233 /* Add the space back into the linked list of freeblocks. Note that
1234 ** even though the freeblock list was checked by btreeInitPage(),
1235 ** btreeInitPage() did not detect overlapping cells or
1236 ** freeblocks that overlapped cells. Nor does it detect when the
1237 ** cell content area exceeds the value in the page header. If these
1238 ** situations arise, then subsequent insert operations might corrupt
1239 ** the freelist. So we do need to check for corruption while scanning
1240 ** the freelist.
1241 */
1242 hdr = pPage->hdrOffset;
1243 addr = hdr + 1;
1244 iLast = pPage->pBt->usableSize - 4;
1245 assert( start<=iLast );
1246 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
1247 if( pbegin<addr+4 ){
1248 return SQLITE_CORRUPT_BKPT;
1249 }
1250 addr = pbegin;
1251 }
1252 if( pbegin>iLast ){
1253 return SQLITE_CORRUPT_BKPT;
1254 }
1255 assert( pbegin>addr || pbegin==0 );
1256 put2byte(&data[addr], start);
1257 put2byte(&data[start], pbegin);
1258 put2byte(&data[start+2], size);
1259 pPage->nFree = pPage->nFree + (u16)size;
1260
1261 /* Coalesce adjacent free blocks */
1262 addr = hdr + 1;
1263 while( (pbegin = get2byte(&data[addr]))>0 ){
1264 int pnext, psize, x;
1265 assert( pbegin>addr );
1266 assert( pbegin<=pPage->pBt->usableSize-4 );
1267 pnext = get2byte(&data[pbegin]);
1268 psize = get2byte(&data[pbegin+2]);
1269 if( pbegin + psize + 3 >= pnext && pnext>0 ){
1270 int frag = pnext - (pbegin+psize);
1271 if( (frag<0) || (frag>(int)data[hdr+7]) ){
1272 return SQLITE_CORRUPT_BKPT;
1273 }
1274 data[hdr+7] -= (u8)frag;
1275 x = get2byte(&data[pnext]);
1276 put2byte(&data[pbegin], x);
1277 x = pnext + get2byte(&data[pnext+2]) - pbegin;
1278 put2byte(&data[pbegin+2], x);
1279 }else{
1280 addr = pbegin;
1281 }
1282 }
1283
1284 /* If the cell content area begins with a freeblock, remove it. */
1285 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
1286 int top;
1287 pbegin = get2byte(&data[hdr+1]);
1288 memcpy(&data[hdr+1], &data[pbegin], 2);
1289 top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
1290 put2byte(&data[hdr+5], top);
1291 }
1292 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1293 return SQLITE_OK;
1294 }
1295
1296 /*
1297 ** Decode the flags byte (the first byte of the header) for a page
1298 ** and initialize fields of the MemPage structure accordingly.
1299 **
1300 ** Only the following combinations are supported. Anything different
1301 ** indicates a corrupt database files:
1302 **
1303 ** PTF_ZERODATA
1304 ** PTF_ZERODATA | PTF_LEAF
1305 ** PTF_LEAFDATA | PTF_INTKEY
1306 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1307 */
1308 static int decodeFlags(MemPage *pPage, int flagByte){
1309 BtShared *pBt; /* A copy of pPage->pBt */
1310
1311 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1312 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1313 pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 );
1314 flagByte &= ~PTF_LEAF;
1315 pPage->childPtrSize = 4-4*pPage->leaf;
1316 pBt = pPage->pBt;
1317 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1318 pPage->intKey = 1;
1319 pPage->hasData = pPage->leaf;
1320 pPage->maxLocal = pBt->maxLeaf;
1321 pPage->minLocal = pBt->minLeaf;
1322 }else if( flagByte==PTF_ZERODATA ){
1323 pPage->intKey = 0;
1324 pPage->hasData = 0;
1325 pPage->maxLocal = pBt->maxLocal;
1326 pPage->minLocal = pBt->minLocal;
1327 }else{
1328 return SQLITE_CORRUPT_BKPT;
1329 }
1330 return SQLITE_OK;
1331 }
1332
1333 /*
1334 ** Initialize the auxiliary information for a disk block.
1335 **
1336 ** Return SQLITE_OK on success. If we see that the page does
1337 ** not contain a well-formed database page, then return
1338 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
1339 ** guarantee that the page is well-formed. It only shows that
1340 ** we failed to detect any corruption.
1341 */
1342 static int btreeInitPage(MemPage *pPage){
1343
1344 assert( pPage->pBt!=0 );
1345 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1346 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1347 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1348 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1349
1350 if( !pPage->isInit ){
1351 u16 pc; /* Address of a freeblock within pPage->aData[] */
1352 u8 hdr; /* Offset to beginning of page header */
1353 u8 *data; /* Equal to pPage->aData */
1354 BtShared *pBt; /* The main btree structure */
1355 u16 usableSize; /* Amount of usable space on each page */
1356 u16 cellOffset; /* Offset from start of page to first cell pointer */
1357 u16 nFree; /* Number of unused bytes on the page */
1358 u16 top; /* First byte of the cell content area */
1359 int iCellFirst; /* First allowable cell or freeblock offset */
1360 int iCellLast; /* Last possible cell or freeblock offset */
1361
1362 pBt = pPage->pBt;
1363
1364 hdr = pPage->hdrOffset;
1365 data = pPage->aData;
1366 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1367 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1368 pPage->maskPage = pBt->pageSize - 1;
1369 pPage->nOverflow = 0;
1370 usableSize = pBt->usableSize;
1371 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1372 top = get2byte(&data[hdr+5]);
1373 pPage->nCell = get2byte(&data[hdr+3]);
1374 if( pPage->nCell>MX_CELL(pBt) ){
1375 /* To many cells for a single page. The page must be corrupt */
1376 return SQLITE_CORRUPT_BKPT;
1377 }
1378 testcase( pPage->nCell==MX_CELL(pBt) );
1379
1380 /* A malformed database page might cause us to read past the end
1381 ** of page when parsing a cell.
1382 **
1383 ** The following block of code checks early to see if a cell extends
1384 ** past the end of a page boundary and causes SQLITE_CORRUPT to be
1385 ** returned if it does.
1386 */
1387 iCellFirst = cellOffset + 2*pPage->nCell;
1388 iCellLast = usableSize - 4;
1389 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
1390 {
1391 int i; /* Index into the cell pointer array */
1392 int sz; /* Size of a cell */
1393
1394 if( !pPage->leaf ) iCellLast--;
1395 for(i=0; i<pPage->nCell; i++){
1396 pc = get2byte(&data[cellOffset+i*2]);
1397 testcase( pc==iCellFirst );
1398 testcase( pc==iCellLast );
1399 if( pc<iCellFirst || pc>iCellLast ){
1400 return SQLITE_CORRUPT_BKPT;
1401 }
1402 sz = cellSizePtr(pPage, &data[pc]);
1403 testcase( pc+sz==usableSize );
1404 if( pc+sz>usableSize ){
1405 return SQLITE_CORRUPT_BKPT;
1406 }
1407 }
1408 if( !pPage->leaf ) iCellLast++;
1409 }
1410 #endif
1411
1412 /* Compute the total free space on the page */
1413 pc = get2byte(&data[hdr+1]);
1414 nFree = data[hdr+7] + top;
1415 while( pc>0 ){
1416 u16 next, size;
1417 if( pc<iCellFirst || pc>iCellLast ){
1418 /* Start of free block is off the page */
1419 return SQLITE_CORRUPT_BKPT;
1420 }
1421 next = get2byte(&data[pc]);
1422 size = get2byte(&data[pc+2]);
1423 if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){
1424 /* Free blocks must be in ascending order. And the last byte of
1425 ** the free-block must lie on the database page. */
1426 return SQLITE_CORRUPT_BKPT;
1427 }
1428 nFree = nFree + size;
1429 pc = next;
1430 }
1431
1432 /* At this point, nFree contains the sum of the offset to the start
1433 ** of the cell-content area plus the number of free bytes within
1434 ** the cell-content area. If this is greater than the usable-size
1435 ** of the page, then the page must be corrupted. This check also
1436 ** serves to verify that the offset to the start of the cell-content
1437 ** area, according to the page header, lies within the page.
1438 */
1439 if( nFree>usableSize ){
1440 return SQLITE_CORRUPT_BKPT;
1441 }
1442 pPage->nFree = (u16)(nFree - iCellFirst);
1443 pPage->isInit = 1;
1444 }
1445 return SQLITE_OK;
1446 }
1447
1448 /*
1449 ** Set up a raw page so that it looks like a database page holding
1450 ** no entries.
1451 */
1452 static void zeroPage(MemPage *pPage, int flags){
1453 unsigned char *data = pPage->aData;
1454 BtShared *pBt = pPage->pBt;
1455 u8 hdr = pPage->hdrOffset;
1456 u16 first;
1457
1458 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1459 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1460 assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1461 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1462 assert( sqlite3_mutex_held(pBt->mutex) );
1463 /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
1464 data[hdr] = (char)flags;
1465 first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0);
1466 memset(&data[hdr+1], 0, 4);
1467 data[hdr+7] = 0;
1468 put2byte(&data[hdr+5], pBt->usableSize);
1469 pPage->nFree = pBt->usableSize - first;
1470 decodeFlags(pPage, flags);
1471 pPage->hdrOffset = hdr;
1472 pPage->cellOffset = first;
1473 pPage->nOverflow = 0;
1474 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1475 pPage->maskPage = pBt->pageSize - 1;
1476 pPage->nCell = 0;
1477 pPage->isInit = 1;
1478 }
1479
1480
1481 /*
1482 ** Convert a DbPage obtained from the pager into a MemPage used by
1483 ** the btree layer.
1484 */
1485 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1486 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1487 pPage->aData = sqlite3PagerGetData(pDbPage);
1488 pPage->pDbPage = pDbPage;
1489 pPage->pBt = pBt;
1490 pPage->pgno = pgno;
1491 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1492 return pPage;
1493 }
1494
1495 /*
1496 ** Get a page from the pager. Initialize the MemPage.pBt and
1497 ** MemPage.aData elements if needed.
1498 **
1499 ** If the noContent flag is set, it means that we do not care about
1500 ** the content of the page at this time. So do not go to the disk
1501 ** to fetch the content. Just fill in the content with zeros for now.
1502 ** If in the future we call sqlite3PagerWrite() on this page, that
1503 ** means we have started to be concerned about content and the disk
1504 ** read should occur at that point.
1505 */
1506 static int btreeGetPage(
1507 BtShared *pBt, /* The btree */
1508 Pgno pgno, /* Number of the page to fetch */
1509 MemPage **ppPage, /* Return the page in this parameter */
1510 int noContent /* Do not load page content if true */
1511 ){
1512 int rc;
1513 DbPage *pDbPage;
1514
1515 assert( sqlite3_mutex_held(pBt->mutex) );
1516 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1517 if( rc ) return rc;
1518 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1519 return SQLITE_OK;
1520 }
1521
1522 /*
1523 ** Retrieve a page from the pager cache. If the requested page is not
1524 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
1525 ** MemPage.aData elements if needed.
1526 */
1527 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
1528 DbPage *pDbPage;
1529 assert( sqlite3_mutex_held(pBt->mutex) );
1530 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1531 if( pDbPage ){
1532 return btreePageFromDbPage(pDbPage, pgno, pBt);
1533 }
1534 return 0;
1535 }
1536
1537 /*
1538 ** Return the size of the database file in pages. If there is any kind of
1539 ** error, return ((unsigned int)-1).
1540 */
1541 static Pgno pagerPagecount(BtShared *pBt){
1542 int nPage = -1;
1543 int rc;
1544 assert( pBt->pPage1 );
1545 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1546 assert( rc==SQLITE_OK || nPage==-1 );
1547 return (Pgno)nPage;
1548 }
1549
1550 /*
1551 ** Get a page from the pager and initialize it. This routine is just a
1552 ** convenience wrapper around separate calls to btreeGetPage() and
1553 ** btreeInitPage().
1554 **
1555 ** If an error occurs, then the value *ppPage is set to is undefined. It
1556 ** may remain unchanged, or it may be set to an invalid value.
1557 */
1558 static int getAndInitPage(
1559 BtShared *pBt, /* The database file */
1560 Pgno pgno, /* Number of the page to get */
1561 MemPage **ppPage /* Write the page pointer here */
1562 ){
1563 int rc;
1564 TESTONLY( Pgno iLastPg = pagerPagecount(pBt); )
1565 assert( sqlite3_mutex_held(pBt->mutex) );
1566
1567 rc = btreeGetPage(pBt, pgno, ppPage, 0);
1568 if( rc==SQLITE_OK ){
1569 rc = btreeInitPage(*ppPage);
1570 if( rc!=SQLITE_OK ){
1571 releasePage(*ppPage);
1572 }
1573 }
1574
1575 /* If the requested page number was either 0 or greater than the page
1576 ** number of the last page in the database, this function should return
1577 ** SQLITE_CORRUPT or some other error (i.e. SQLITE_FULL). Check that this
1578 ** is the case. */
1579 assert( (pgno>0 && pgno<=iLastPg) || rc!=SQLITE_OK );
1580 testcase( pgno==0 );
1581 testcase( pgno==iLastPg );
1582
1583 return rc;
1584 }
1585
1586 /*
1587 ** Release a MemPage. This should be called once for each prior
1588 ** call to btreeGetPage.
1589 */
1590 static void releasePage(MemPage *pPage){
1591 if( pPage ){
1592 assert( pPage->nOverflow==0 || sqlite3PagerPageRefcount(pPage->pDbPage)>1 );
1593 assert( pPage->aData );
1594 assert( pPage->pBt );
1595 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1596 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1597 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1598 sqlite3PagerUnref(pPage->pDbPage);
1599 }
1600 }
1601
1602 /*
1603 ** During a rollback, when the pager reloads information into the cache
1604 ** so that the cache is restored to its original state at the start of
1605 ** the transaction, for each page restored this routine is called.
1606 **
1607 ** This routine needs to reset the extra data section at the end of the
1608 ** page to agree with the restored data.
1609 */
1610 static void pageReinit(DbPage *pData){
1611 MemPage *pPage;
1612 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1613 assert( sqlite3PagerPageRefcount(pData)>0 );
1614 if( pPage->isInit ){
1615 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1616 pPage->isInit = 0;
1617 if( sqlite3PagerPageRefcount(pData)>1 ){
1618 /* pPage might not be a btree page; it might be an overflow page
1619 ** or ptrmap page or a free page. In those cases, the following
1620 ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
1621 ** But no harm is done by this. And it is very important that
1622 ** btreeInitPage() be called on every btree page so we make
1623 ** the call for every page that comes in for re-initing. */
1624 btreeInitPage(pPage);
1625 }
1626 }
1627 }
1628
1629 /*
1630 ** Invoke the busy handler for a btree.
1631 */
1632 static int btreeInvokeBusyHandler(void *pArg){
1633 BtShared *pBt = (BtShared*)pArg;
1634 assert( pBt->db );
1635 assert( sqlite3_mutex_held(pBt->db->mutex) );
1636 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1637 }
1638
1639 /*
1640 ** Open a database file.
1641 **
1642 ** zFilename is the name of the database file. If zFilename is NULL
1643 ** a new database with a random name is created. This randomly named
1644 ** database file will be deleted when sqlite3BtreeClose() is called.
1645 ** If zFilename is ":memory:" then an in-memory database is created
1646 ** that is automatically destroyed when it is closed.
1647 **
1648 ** If the database is already opened in the same database connection
1649 ** and we are in shared cache mode, then the open will fail with an
1650 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared
1651 ** objects in the same database connection since doing so will lead
1652 ** to problems with locking.
1653 */
1654 int sqlite3BtreeOpen(
1655 const char *zFilename, /* Name of the file containing the BTree database */
1656 sqlite3 *db, /* Associated database handle */
1657 Btree **ppBtree, /* Pointer to new Btree object written here */
1658 int flags, /* Options */
1659 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
1660 ){
1661 sqlite3_vfs *pVfs; /* The VFS to use for this btree */
1662 BtShared *pBt = 0; /* Shared part of btree structure */
1663 Btree *p; /* Handle to return */
1664 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. Ticket #3537 */
1665 int rc = SQLITE_OK; /* Result code from this function */
1666 u8 nReserve; /* Byte of unused space on each page */
1667 unsigned char zDbHeader[100]; /* Database header content */
1668
1669 /* Set the variable isMemdb to true for an in-memory database, or
1670 ** false for a file-based database. This symbol is only required if
1671 ** either of the shared-data or autovacuum features are compiled
1672 ** into the library.
1673 */
1674 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1675 #ifdef SQLITE_OMIT_MEMORYDB
1676 const int isMemdb = 0;
1677 #else
1678 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
1679 #endif
1680 #endif
1681
1682 assert( db!=0 );
1683 assert( sqlite3_mutex_held(db->mutex) );
1684
1685 pVfs = db->pVfs;
1686 p = sqlite3MallocZero(sizeof(Btree));
1687 if( !p ){
1688 return SQLITE_NOMEM;
1689 }
1690 p->inTrans = TRANS_NONE;
1691 p->db = db;
1692 #ifndef SQLITE_OMIT_SHARED_CACHE
1693 p->lock.pBtree = p;
1694 p->lock.iTable = 1;
1695 #endif
1696
1697 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1698 /*
1699 ** If this Btree is a candidate for shared cache, try to find an
1700 ** existing BtShared object that we can share with
1701 */
1702 if( isMemdb==0 && zFilename && zFilename[0] ){
1703 if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
1704 int nFullPathname = pVfs->mxPathname+1;
1705 char *zFullPathname = sqlite3Malloc(nFullPathname);
1706 sqlite3_mutex *mutexShared;
1707 p->sharable = 1;
1708 if( !zFullPathname ){
1709 sqlite3_free(p);
1710 return SQLITE_NOMEM;
1711 }
1712 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
1713 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1714 sqlite3_mutex_enter(mutexOpen);
1715 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1716 sqlite3_mutex_enter(mutexShared);
1717 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
1718 assert( pBt->nRef>0 );
1719 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1720 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1721 int iDb;
1722 for(iDb=db->nDb-1; iDb>=0; iDb--){
1723 Btree *pExisting = db->aDb[iDb].pBt;
1724 if( pExisting && pExisting->pBt==pBt ){
1725 sqlite3_mutex_leave(mutexShared);
1726 sqlite3_mutex_leave(mutexOpen);
1727 sqlite3_free(zFullPathname);
1728 sqlite3_free(p);
1729 return SQLITE_CONSTRAINT;
1730 }
1731 }
1732 p->pBt = pBt;
1733 pBt->nRef++;
1734 break;
1735 }
1736 }
1737 sqlite3_mutex_leave(mutexShared);
1738 sqlite3_free(zFullPathname);
1739 }
1740 #ifdef SQLITE_DEBUG
1741 else{
1742 /* In debug mode, we mark all persistent databases as sharable
1743 ** even when they are not. This exercises the locking code and
1744 ** gives more opportunity for asserts(sqlite3_mutex_held())
1745 ** statements to find locking problems.
1746 */
1747 p->sharable = 1;
1748 }
1749 #endif
1750 }
1751 #endif
1752 if( pBt==0 ){
1753 /*
1754 ** The following asserts make sure that structures used by the btree are
1755 ** the right size. This is to guard against size changes that result
1756 ** when compiling on a different architecture.
1757 */
1758 assert( sizeof(i64)==8 || sizeof(i64)==4 );
1759 assert( sizeof(u64)==8 || sizeof(u64)==4 );
1760 assert( sizeof(u32)==4 );
1761 assert( sizeof(u16)==2 );
1762 assert( sizeof(Pgno)==4 );
1763
1764 pBt = sqlite3MallocZero( sizeof(*pBt) );
1765 if( pBt==0 ){
1766 rc = SQLITE_NOMEM;
1767 goto btree_open_out;
1768 }
1769 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1770 EXTRA_SIZE, flags, vfsFlags, pageReinit);
1771 if( rc==SQLITE_OK ){
1772 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1773 }
1774 if( rc!=SQLITE_OK ){
1775 goto btree_open_out;
1776 }
1777 pBt->db = db;
1778 sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
1779 p->pBt = pBt;
1780
1781 pBt->pCursor = 0;
1782 pBt->pPage1 = 0;
1783 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1784 pBt->pageSize = get2byte(&zDbHeader[16]);
1785 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1786 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1787 pBt->pageSize = 0;
1788 #ifndef SQLITE_OMIT_AUTOVACUUM
1789 /* If the magic name ":memory:" will create an in-memory database, then
1790 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1791 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1792 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1793 ** regular file-name. In this case the auto-vacuum applies as per normal.
1794 */
1795 if( zFilename && !isMemdb ){
1796 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1797 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1798 }
1799 #endif
1800 nReserve = 0;
1801 }else{
1802 nReserve = zDbHeader[20];
1803 pBt->pageSizeFixed = 1;
1804 #ifndef SQLITE_OMIT_AUTOVACUUM
1805 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1806 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1807 #endif
1808 }
1809 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
1810 if( rc ) goto btree_open_out;
1811 pBt->usableSize = pBt->pageSize - nReserve;
1812 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
1813
1814 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1815 /* Add the new BtShared object to the linked list sharable BtShareds.
1816 */
1817 if( p->sharable ){
1818 sqlite3_mutex *mutexShared;
1819 pBt->nRef = 1;
1820 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1821 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
1822 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1823 if( pBt->mutex==0 ){
1824 rc = SQLITE_NOMEM;
1825 db->mallocFailed = 0;
1826 goto btree_open_out;
1827 }
1828 }
1829 sqlite3_mutex_enter(mutexShared);
1830 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1831 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
1832 sqlite3_mutex_leave(mutexShared);
1833 }
1834 #endif
1835 }
1836
1837 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1838 /* If the new Btree uses a sharable pBtShared, then link the new
1839 ** Btree into the list of all sharable Btrees for the same connection.
1840 ** The list is kept in ascending order by pBt address.
1841 */
1842 if( p->sharable ){
1843 int i;
1844 Btree *pSib;
1845 for(i=0; i<db->nDb; i++){
1846 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1847 while( pSib->pPrev ){ pSib = pSib->pPrev; }
1848 if( p->pBt<pSib->pBt ){
1849 p->pNext = pSib;
1850 p->pPrev = 0;
1851 pSib->pPrev = p;
1852 }else{
1853 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1854 pSib = pSib->pNext;
1855 }
1856 p->pNext = pSib->pNext;
1857 p->pPrev = pSib;
1858 if( p->pNext ){
1859 p->pNext->pPrev = p;
1860 }
1861 pSib->pNext = p;
1862 }
1863 break;
1864 }
1865 }
1866 }
1867 #endif
1868 *ppBtree = p;
1869
1870 btree_open_out:
1871 if( rc!=SQLITE_OK ){
1872 if( pBt && pBt->pPager ){
1873 sqlite3PagerClose(pBt->pPager);
1874 }
1875 sqlite3_free(pBt);
1876 sqlite3_free(p);
1877 *ppBtree = 0;
1878 }
1879 if( mutexOpen ){
1880 assert( sqlite3_mutex_held(mutexOpen) );
1881 sqlite3_mutex_leave(mutexOpen);
1882 }
1883 return rc;
1884 }
1885
1886 /*
1887 ** Decrement the BtShared.nRef counter. When it reaches zero,
1888 ** remove the BtShared structure from the sharing list. Return
1889 ** true if the BtShared.nRef counter reaches zero and return
1890 ** false if it is still positive.
1891 */
1892 static int removeFromSharingList(BtShared *pBt){
1893 #ifndef SQLITE_OMIT_SHARED_CACHE
1894 sqlite3_mutex *pMaster;
1895 BtShared *pList;
1896 int removed = 0;
1897
1898 assert( sqlite3_mutex_notheld(pBt->mutex) );
1899 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1900 sqlite3_mutex_enter(pMaster);
1901 pBt->nRef--;
1902 if( pBt->nRef<=0 ){
1903 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1904 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
1905 }else{
1906 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
1907 while( ALWAYS(pList) && pList->pNext!=pBt ){
1908 pList=pList->pNext;
1909 }
1910 if( ALWAYS(pList) ){
1911 pList->pNext = pBt->pNext;
1912 }
1913 }
1914 if( SQLITE_THREADSAFE ){
1915 sqlite3_mutex_free(pBt->mutex);
1916 }
1917 removed = 1;
1918 }
1919 sqlite3_mutex_leave(pMaster);
1920 return removed;
1921 #else
1922 return 1;
1923 #endif
1924 }
1925
1926 /*
1927 ** Make sure pBt->pTmpSpace points to an allocation of
1928 ** MX_CELL_SIZE(pBt) bytes.
1929 */
1930 static void allocateTempSpace(BtShared *pBt){
1931 if( !pBt->pTmpSpace ){
1932 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
1933 }
1934 }
1935
1936 /*
1937 ** Free the pBt->pTmpSpace allocation
1938 */
1939 static void freeTempSpace(BtShared *pBt){
1940 sqlite3PageFree( pBt->pTmpSpace);
1941 pBt->pTmpSpace = 0;
1942 }
1943
1944 /*
1945 ** Close an open database and invalidate all cursors.
1946 */
1947 int sqlite3BtreeClose(Btree *p){
1948 BtShared *pBt = p->pBt;
1949 BtCursor *pCur;
1950
1951 /* Close all cursors opened via this handle. */
1952 assert( sqlite3_mutex_held(p->db->mutex) );
1953 sqlite3BtreeEnter(p);
1954 pCur = pBt->pCursor;
1955 while( pCur ){
1956 BtCursor *pTmp = pCur;
1957 pCur = pCur->pNext;
1958 if( pTmp->pBtree==p ){
1959 sqlite3BtreeCloseCursor(pTmp);
1960 }
1961 }
1962
1963 /* Rollback any active transaction and free the handle structure.
1964 ** The call to sqlite3BtreeRollback() drops any table-locks held by
1965 ** this handle.
1966 */
1967 sqlite3BtreeRollback(p);
1968 sqlite3BtreeLeave(p);
1969
1970 /* If there are still other outstanding references to the shared-btree
1971 ** structure, return now. The remainder of this procedure cleans
1972 ** up the shared-btree.
1973 */
1974 assert( p->wantToLock==0 && p->locked==0 );
1975 if( !p->sharable || removeFromSharingList(pBt) ){
1976 /* The pBt is no longer on the sharing list, so we can access
1977 ** it without having to hold the mutex.
1978 **
1979 ** Clean out and delete the BtShared object.
1980 */
1981 assert( !pBt->pCursor );
1982 sqlite3PagerClose(pBt->pPager);
1983 if( pBt->xFreeSchema && pBt->pSchema ){
1984 pBt->xFreeSchema(pBt->pSchema);
1985 }
1986 sqlite3_free(pBt->pSchema);
1987 freeTempSpace(pBt);
1988 sqlite3_free(pBt);
1989 }
1990
1991 #ifndef SQLITE_OMIT_SHARED_CACHE
1992 assert( p->wantToLock==0 );
1993 assert( p->locked==0 );
1994 if( p->pPrev ) p->pPrev->pNext = p->pNext;
1995 if( p->pNext ) p->pNext->pPrev = p->pPrev;
1996 #endif
1997
1998 sqlite3_free(p);
1999 return SQLITE_OK;
2000 }
2001
2002 /*
2003 ** Change the limit on the number of pages allowed in the cache.
2004 **
2005 ** The maximum number of cache pages is set to the absolute
2006 ** value of mxPage. If mxPage is negative, the pager will
2007 ** operate asynchronously - it will not stop to do fsync()s
2008 ** to insure data is written to the disk surface before
2009 ** continuing. Transactions still work if synchronous is off,
2010 ** and the database cannot be corrupted if this program
2011 ** crashes. But if the operating system crashes or there is
2012 ** an abrupt power failure when synchronous is off, the database
2013 ** could be left in an inconsistent and unrecoverable state.
2014 ** Synchronous is on by default so database corruption is not
2015 ** normally a worry.
2016 */
2017 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2018 BtShared *pBt = p->pBt;
2019 assert( sqlite3_mutex_held(p->db->mutex) );
2020 sqlite3BtreeEnter(p);
2021 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2022 sqlite3BtreeLeave(p);
2023 return SQLITE_OK;
2024 }
2025
2026 /*
2027 ** Change the way data is synced to disk in order to increase or decrease
2028 ** how well the database resists damage due to OS crashes and power
2029 ** failures. Level 1 is the same as asynchronous (no syncs() occur and
2030 ** there is a high probability of damage) Level 2 is the default. There
2031 ** is a very low but non-zero probability of damage. Level 3 reduces the
2032 ** probability of damage to near zero but with a write performance reduction.
2033 */
2034 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
2035 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
2036 BtShared *pBt = p->pBt;
2037 assert( sqlite3_mutex_held(p->db->mutex) );
2038 sqlite3BtreeEnter(p);
2039 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
2040 sqlite3BtreeLeave(p);
2041 return SQLITE_OK;
2042 }
2043 #endif
2044
2045 /*
2046 ** Return TRUE if the given btree is set to safety level 1. In other
2047 ** words, return TRUE if no sync() occurs on the disk files.
2048 */
2049 int sqlite3BtreeSyncDisabled(Btree *p){
2050 BtShared *pBt = p->pBt;
2051 int rc;
2052 assert( sqlite3_mutex_held(p->db->mutex) );
2053 sqlite3BtreeEnter(p);
2054 assert( pBt && pBt->pPager );
2055 rc = sqlite3PagerNosync(pBt->pPager);
2056 sqlite3BtreeLeave(p);
2057 return rc;
2058 }
2059
2060 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
2061 /*
2062 ** Change the default pages size and the number of reserved bytes per page.
2063 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2064 ** without changing anything.
2065 **
2066 ** The page size must be a power of 2 between 512 and 65536. If the page
2067 ** size supplied does not meet this constraint then the page size is not
2068 ** changed.
2069 **
2070 ** Page sizes are constrained to be a power of two so that the region
2071 ** of the database file used for locking (beginning at PENDING_BYTE,
2072 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2073 ** at the beginning of a page.
2074 **
2075 ** If parameter nReserve is less than zero, then the number of reserved
2076 ** bytes per page is left unchanged.
2077 **
2078 ** If the iFix!=0 then the pageSizeFixed flag is set so that the page size
2079 ** and autovacuum mode can no longer be changed.
2080 */
2081 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2082 int rc = SQLITE_OK;
2083 BtShared *pBt = p->pBt;
2084 assert( nReserve>=-1 && nReserve<=255 );
2085 sqlite3BtreeEnter(p);
2086 if( pBt->pageSizeFixed ){
2087 sqlite3BtreeLeave(p);
2088 return SQLITE_READONLY;
2089 }
2090 if( nReserve<0 ){
2091 nReserve = pBt->pageSize - pBt->usableSize;
2092 }
2093 assert( nReserve>=0 && nReserve<=255 );
2094 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2095 ((pageSize-1)&pageSize)==0 ){
2096 assert( (pageSize & 7)==0 );
2097 assert( !pBt->pPage1 && !pBt->pCursor );
2098 pBt->pageSize = (u16)pageSize;
2099 freeTempSpace(pBt);
2100 }
2101 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2102 pBt->usableSize = pBt->pageSize - (u16)nReserve;
2103 if( iFix ) pBt->pageSizeFixed = 1;
2104 sqlite3BtreeLeave(p);
2105 return rc;
2106 }
2107
2108 /*
2109 ** Return the currently defined page size
2110 */
2111 int sqlite3BtreeGetPageSize(Btree *p){
2112 return p->pBt->pageSize;
2113 }
2114
2115 /*
2116 ** Return the number of bytes of space at the end of every page that
2117 ** are intentually left unused. This is the "reserved" space that is
2118 ** sometimes used by extensions.
2119 */
2120 int sqlite3BtreeGetReserve(Btree *p){
2121 int n;
2122 sqlite3BtreeEnter(p);
2123 n = p->pBt->pageSize - p->pBt->usableSize;
2124 sqlite3BtreeLeave(p);
2125 return n;
2126 }
2127
2128 /*
2129 ** Set the maximum page count for a database if mxPage is positive.
2130 ** No changes are made if mxPage is 0 or negative.
2131 ** Regardless of the value of mxPage, return the maximum page count.
2132 */
2133 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
2134 int n;
2135 sqlite3BtreeEnter(p);
2136 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2137 sqlite3BtreeLeave(p);
2138 return n;
2139 }
2140 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
2141
2142 /*
2143 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2144 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2145 ** is disabled. The default value for the auto-vacuum property is
2146 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2147 */
2148 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2149 #ifdef SQLITE_OMIT_AUTOVACUUM
2150 return SQLITE_READONLY;
2151 #else
2152 BtShared *pBt = p->pBt;
2153 int rc = SQLITE_OK;
2154 u8 av = (u8)autoVacuum;
2155
2156 sqlite3BtreeEnter(p);
2157 if( pBt->pageSizeFixed && (av ?1:0)!=pBt->autoVacuum ){
2158 rc = SQLITE_READONLY;
2159 }else{
2160 pBt->autoVacuum = av ?1:0;
2161 pBt->incrVacuum = av==2 ?1:0;
2162 }
2163 sqlite3BtreeLeave(p);
2164 return rc;
2165 #endif
2166 }
2167
2168 /*
2169 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
2170 ** enabled 1 is returned. Otherwise 0.
2171 */
2172 int sqlite3BtreeGetAutoVacuum(Btree *p){
2173 #ifdef SQLITE_OMIT_AUTOVACUUM
2174 return BTREE_AUTOVACUUM_NONE;
2175 #else
2176 int rc;
2177 sqlite3BtreeEnter(p);
2178 rc = (
2179 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
2180 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
2181 BTREE_AUTOVACUUM_INCR
2182 );
2183 sqlite3BtreeLeave(p);
2184 return rc;
2185 #endif
2186 }
2187
2188
2189 /*
2190 ** Get a reference to pPage1 of the database file. This will
2191 ** also acquire a readlock on that file.
2192 **
2193 ** SQLITE_OK is returned on success. If the file is not a
2194 ** well-formed database file, then SQLITE_CORRUPT is returned.
2195 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
2196 ** is returned if we run out of memory.
2197 */
2198 static int lockBtree(BtShared *pBt){
2199 int rc;
2200 MemPage *pPage1;
2201 int nPage;
2202
2203 assert( sqlite3_mutex_held(pBt->mutex) );
2204 assert( pBt->pPage1==0 );
2205 rc = sqlite3PagerSharedLock(pBt->pPager);
2206 if( rc!=SQLITE_OK ) return rc;
2207 rc = btreeGetPage(pBt, 1, &pPage1, 0);
2208 if( rc!=SQLITE_OK ) return rc;
2209
2210 /* Do some checking to help insure the file we opened really is
2211 ** a valid database file.
2212 */
2213 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
2214 if( rc!=SQLITE_OK ){
2215 goto page1_init_failed;
2216 }else if( nPage>0 ){
2217 int pageSize;
2218 int usableSize;
2219 u8 *page1 = pPage1->aData;
2220 rc = SQLITE_NOTADB;
2221 if( memcmp(page1, zMagicHeader, 16)!=0 ){
2222 goto page1_init_failed;
2223 }
2224 if( page1[18]>1 ){
2225 pBt->readOnly = 1;
2226 }
2227 if( page1[19]>1 ){
2228 goto page1_init_failed;
2229 }
2230
2231 /* The maximum embedded fraction must be exactly 25%. And the minimum
2232 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
2233 ** The original design allowed these amounts to vary, but as of
2234 ** version 3.6.0, we require them to be fixed.
2235 */
2236 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
2237 goto page1_init_failed;
2238 }
2239 pageSize = get2byte(&page1[16]);
2240 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
2241 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
2242 ){
2243 goto page1_init_failed;
2244 }
2245 assert( (pageSize & 7)==0 );
2246 usableSize = pageSize - page1[20];
2247 if( pageSize!=pBt->pageSize ){
2248 /* After reading the first page of the database assuming a page size
2249 ** of BtShared.pageSize, we have discovered that the page-size is
2250 ** actually pageSize. Unlock the database, leave pBt->pPage1 at
2251 ** zero and return SQLITE_OK. The caller will call this function
2252 ** again with the correct page-size.
2253 */
2254 releasePage(pPage1);
2255 pBt->usableSize = (u16)usableSize;
2256 pBt->pageSize = (u16)pageSize;
2257 freeTempSpace(pBt);
2258 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
2259 pageSize-usableSize);
2260 return rc;
2261 }
2262 if( usableSize<480 ){
2263 goto page1_init_failed;
2264 }
2265 pBt->pageSize = (u16)pageSize;
2266 pBt->usableSize = (u16)usableSize;
2267 #ifndef SQLITE_OMIT_AUTOVACUUM
2268 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
2269 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
2270 #endif
2271 }
2272
2273 /* maxLocal is the maximum amount of payload to store locally for
2274 ** a cell. Make sure it is small enough so that at least minFanout
2275 ** cells can will fit on one page. We assume a 10-byte page header.
2276 ** Besides the payload, the cell must store:
2277 ** 2-byte pointer to the cell
2278 ** 4-byte child pointer
2279 ** 9-byte nKey value
2280 ** 4-byte nData value
2281 ** 4-byte overflow page pointer
2282 ** So a cell consists of a 2-byte poiner, a header which is as much as
2283 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
2284 ** page pointer.
2285 */
2286 pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
2287 pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
2288 pBt->maxLeaf = pBt->usableSize - 35;
2289 pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
2290 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
2291 pBt->pPage1 = pPage1;
2292 return SQLITE_OK;
2293
2294 page1_init_failed:
2295 releasePage(pPage1);
2296 pBt->pPage1 = 0;
2297 return rc;
2298 }
2299
2300 /*
2301 ** If there are no outstanding cursors and we are not in the middle
2302 ** of a transaction but there is a read lock on the database, then
2303 ** this routine unrefs the first page of the database file which
2304 ** has the effect of releasing the read lock.
2305 **
2306 ** If there is a transaction in progress, this routine is a no-op.
2307 */
2308 static void unlockBtreeIfUnused(BtShared *pBt){
2309 assert( sqlite3_mutex_held(pBt->mutex) );
2310 assert( pBt->pCursor==0 || pBt->inTransaction>TRANS_NONE );
2311 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
2312 assert( pBt->pPage1->aData );
2313 assert( sqlite3PagerRefcount(pBt->pPager)==1 );
2314 assert( pBt->pPage1->aData );
2315 releasePage(pBt->pPage1);
2316 pBt->pPage1 = 0;
2317 }
2318 }
2319
2320 /*
2321 ** If pBt points to an empty file then convert that empty file
2322 ** into a new empty database by initializing the first page of
2323 ** the database.
2324 */
2325 static int newDatabase(BtShared *pBt){
2326 MemPage *pP1;
2327 unsigned char *data;
2328 int rc;
2329 int nPage;
2330
2331 assert( sqlite3_mutex_held(pBt->mutex) );
2332 /* The database size has already been measured and cached, so failure
2333 ** is impossible here. If the original size measurement failed, then
2334 ** processing aborts before entering this routine. */
2335 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
2336 if( NEVER(rc!=SQLITE_OK) || nPage>0 ){
2337 return rc;
2338 }
2339 pP1 = pBt->pPage1;
2340 assert( pP1!=0 );
2341 data = pP1->aData;
2342 rc = sqlite3PagerWrite(pP1->pDbPage);
2343 if( rc ) return rc;
2344 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
2345 assert( sizeof(zMagicHeader)==16 );
2346 assert( sizeof(zMagicHeader)==sizeof(zPoisonHeader) );
2347 put2byte(&data[16], pBt->pageSize);
2348 data[18] = 1;
2349 data[19] = 1;
2350 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
2351 data[20] = (u8)(pBt->pageSize - pBt->usableSize);
2352 data[21] = 64;
2353 data[22] = 32;
2354 data[23] = 32;
2355 memset(&data[24], 0, 100-24);
2356 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
2357 pBt->pageSizeFixed = 1;
2358 #ifndef SQLITE_OMIT_AUTOVACUUM
2359 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
2360 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
2361 put4byte(&data[36 + 4*4], pBt->autoVacuum);
2362 put4byte(&data[36 + 7*4], pBt->incrVacuum);
2363 #endif
2364 return SQLITE_OK;
2365 }
2366
2367 /*
2368 ** Attempt to start a new transaction. A write-transaction
2369 ** is started if the second argument is nonzero, otherwise a read-
2370 ** transaction. If the second argument is 2 or more and exclusive
2371 ** transaction is started, meaning that no other process is allowed
2372 ** to access the database. A preexisting transaction may not be
2373 ** upgraded to exclusive by calling this routine a second time - the
2374 ** exclusivity flag only works for a new transaction.
2375 **
2376 ** A write-transaction must be started before attempting any
2377 ** changes to the database. None of the following routines
2378 ** will work unless a transaction is started first:
2379 **
2380 ** sqlite3BtreeCreateTable()
2381 ** sqlite3BtreeCreateIndex()
2382 ** sqlite3BtreeClearTable()
2383 ** sqlite3BtreeDropTable()
2384 ** sqlite3BtreeInsert()
2385 ** sqlite3BtreeDelete()
2386 ** sqlite3BtreeUpdateMeta()
2387 **
2388 ** If an initial attempt to acquire the lock fails because of lock contention
2389 ** and the database was previously unlocked, then invoke the busy handler
2390 ** if there is one. But if there was previously a read-lock, do not
2391 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
2392 ** returned when there is already a read-lock in order to avoid a deadlock.
2393 **
2394 ** Suppose there are two processes A and B. A has a read lock and B has
2395 ** a reserved lock. B tries to promote to exclusive but is blocked because
2396 ** of A's read lock. A tries to promote to reserved but is blocked by B.
2397 ** One or the other of the two processes must give way or there can be
2398 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback
2399 ** when A already has a read lock, we encourage A to give up and let B
2400 ** proceed.
2401 */
2402 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
2403 sqlite3 *pBlock = 0;
2404 BtShared *pBt = p->pBt;
2405 int rc = SQLITE_OK;
2406
2407 sqlite3BtreeEnter(p);
2408 btreeIntegrity(p);
2409
2410 /* If the btree is already in a write-transaction, or it
2411 ** is already in a read-transaction and a read-transaction
2412 ** is requested, this is a no-op.
2413 */
2414 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
2415 goto trans_begun;
2416 }
2417
2418 /* Write transactions are not possible on a read-only database */
2419 if( pBt->readOnly && wrflag ){
2420 rc = SQLITE_READONLY;
2421 goto trans_begun;
2422 }
2423
2424 #ifndef SQLITE_OMIT_SHARED_CACHE
2425 /* If another database handle has already opened a write transaction
2426 ** on this shared-btree structure and a second write transaction is
2427 ** requested, return SQLITE_LOCKED.
2428 */
2429 if( (wrflag && pBt->inTransaction==TRANS_WRITE) || pBt->isPending ){
2430 pBlock = pBt->pWriter->db;
2431 }else if( wrflag>1 ){
2432 BtLock *pIter;
2433 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
2434 if( pIter->pBtree!=p ){
2435 pBlock = pIter->pBtree->db;
2436 break;
2437 }
2438 }
2439 }
2440 if( pBlock ){
2441 sqlite3ConnectionBlocked(p->db, pBlock);
2442 rc = SQLITE_LOCKED_SHAREDCACHE;
2443 goto trans_begun;
2444 }
2445 #endif
2446
2447 /* Any read-only or read-write transaction implies a read-lock on
2448 ** page 1. So if some other shared-cache client already has a write-lock
2449 ** on page 1, the transaction cannot be opened. */
2450 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
2451 if( SQLITE_OK!=rc ) goto trans_begun;
2452
2453 do {
2454 /* Call lockBtree() until either pBt->pPage1 is populated or
2455 ** lockBtree() returns something other than SQLITE_OK. lockBtree()
2456 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
2457 ** reading page 1 it discovers that the page-size of the database
2458 ** file is not pBt->pageSize. In this case lockBtree() will update
2459 ** pBt->pageSize to the page-size of the file on disk.
2460 */
2461 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
2462
2463 if( rc==SQLITE_OK && wrflag ){
2464 if( pBt->readOnly ){
2465 rc = SQLITE_READONLY;
2466 }else{
2467 rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
2468 if( rc==SQLITE_OK ){
2469 rc = newDatabase(pBt);
2470 }
2471 }
2472 }
2473
2474 if( rc!=SQLITE_OK ){
2475 unlockBtreeIfUnused(pBt);
2476 }
2477 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
2478 btreeInvokeBusyHandler(pBt) );
2479
2480 if( rc==SQLITE_OK ){
2481 if( p->inTrans==TRANS_NONE ){
2482 pBt->nTransaction++;
2483 #ifndef SQLITE_OMIT_SHARED_CACHE
2484 if( p->sharable ){
2485 assert( p->lock.pBtree==p && p->lock.iTable==1 );
2486 p->lock.eLock = READ_LOCK;
2487 p->lock.pNext = pBt->pLock;
2488 pBt->pLock = &p->lock;
2489 }
2490 #endif
2491 }
2492 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2493 if( p->inTrans>pBt->inTransaction ){
2494 pBt->inTransaction = p->inTrans;
2495 }
2496 #ifndef SQLITE_OMIT_SHARED_CACHE
2497 if( wrflag ){
2498 assert( !pBt->pWriter );
2499 pBt->pWriter = p;
2500 pBt->isExclusive = (u8)(wrflag>1);
2501 }
2502 #endif
2503 }
2504
2505
2506 trans_begun:
2507 if( rc==SQLITE_OK && wrflag ){
2508 /* This call makes sure that the pager has the correct number of
2509 ** open savepoints. If the second parameter is greater than 0 and
2510 ** the sub-journal is not already open, then it will be opened here.
2511 */
2512 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
2513 }
2514
2515 btreeIntegrity(p);
2516 sqlite3BtreeLeave(p);
2517 return rc;
2518 }
2519
2520 #ifndef SQLITE_OMIT_AUTOVACUUM
2521
2522 /*
2523 ** Set the pointer-map entries for all children of page pPage. Also, if
2524 ** pPage contains cells that point to overflow pages, set the pointer
2525 ** map entries for the overflow pages as well.
2526 */
2527 static int setChildPtrmaps(MemPage *pPage){
2528 int i; /* Counter variable */
2529 int nCell; /* Number of cells in page pPage */
2530 int rc; /* Return code */
2531 BtShared *pBt = pPage->pBt;
2532 u8 isInitOrig = pPage->isInit;
2533 Pgno pgno = pPage->pgno;
2534
2535 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2536 rc = btreeInitPage(pPage);
2537 if( rc!=SQLITE_OK ){
2538 goto set_child_ptrmaps_out;
2539 }
2540 nCell = pPage->nCell;
2541
2542 for(i=0; i<nCell; i++){
2543 u8 *pCell = findCell(pPage, i);
2544
2545 ptrmapPutOvflPtr(pPage, pCell, &rc);
2546
2547 if( !pPage->leaf ){
2548 Pgno childPgno = get4byte(pCell);
2549 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2550 }
2551 }
2552
2553 if( !pPage->leaf ){
2554 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2555 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
2556 }
2557
2558 set_child_ptrmaps_out:
2559 pPage->isInit = isInitOrig;
2560 return rc;
2561 }
2562
2563 /*
2564 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so
2565 ** that it points to iTo. Parameter eType describes the type of pointer to
2566 ** be modified, as follows:
2567 **
2568 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
2569 ** page of pPage.
2570 **
2571 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2572 ** page pointed to by one of the cells on pPage.
2573 **
2574 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2575 ** overflow page in the list.
2576 */
2577 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2578 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2579 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2580 if( eType==PTRMAP_OVERFLOW2 ){
2581 /* The pointer is always the first 4 bytes of the page in this case. */
2582 if( get4byte(pPage->aData)!=iFrom ){
2583 return SQLITE_CORRUPT_BKPT;
2584 }
2585 put4byte(pPage->aData, iTo);
2586 }else{
2587 u8 isInitOrig = pPage->isInit;
2588 int i;
2589 int nCell;
2590
2591 btreeInitPage(pPage);
2592 nCell = pPage->nCell;
2593
2594 for(i=0; i<nCell; i++){
2595 u8 *pCell = findCell(pPage, i);
2596 if( eType==PTRMAP_OVERFLOW1 ){
2597 CellInfo info;
2598 btreeParseCellPtr(pPage, pCell, &info);
2599 if( info.iOverflow ){
2600 if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2601 put4byte(&pCell[info.iOverflow], iTo);
2602 break;
2603 }
2604 }
2605 }else{
2606 if( get4byte(pCell)==iFrom ){
2607 put4byte(pCell, iTo);
2608 break;
2609 }
2610 }
2611 }
2612
2613 if( i==nCell ){
2614 if( eType!=PTRMAP_BTREE ||
2615 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2616 return SQLITE_CORRUPT_BKPT;
2617 }
2618 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2619 }
2620
2621 pPage->isInit = isInitOrig;
2622 }
2623 return SQLITE_OK;
2624 }
2625
2626
2627 /*
2628 ** Move the open database page pDbPage to location iFreePage in the
2629 ** database. The pDbPage reference remains valid.
2630 **
2631 ** The isCommit flag indicates that there is no need to remember that
2632 ** the journal needs to be sync()ed before database page pDbPage->pgno
2633 ** can be written to. The caller has already promised not to write to that
2634 ** page.
2635 */
2636 static int relocatePage(
2637 BtShared *pBt, /* Btree */
2638 MemPage *pDbPage, /* Open page to move */
2639 u8 eType, /* Pointer map 'type' entry for pDbPage */
2640 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
2641 Pgno iFreePage, /* The location to move pDbPage to */
2642 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */
2643 ){
2644 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
2645 Pgno iDbPage = pDbPage->pgno;
2646 Pager *pPager = pBt->pPager;
2647 int rc;
2648
2649 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2650 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2651 assert( sqlite3_mutex_held(pBt->mutex) );
2652 assert( pDbPage->pBt==pBt );
2653
2654 /* Move page iDbPage from its current location to page number iFreePage */
2655 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2656 iDbPage, iFreePage, iPtrPage, eType));
2657 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
2658 if( rc!=SQLITE_OK ){
2659 return rc;
2660 }
2661 pDbPage->pgno = iFreePage;
2662
2663 /* If pDbPage was a btree-page, then it may have child pages and/or cells
2664 ** that point to overflow pages. The pointer map entries for all these
2665 ** pages need to be changed.
2666 **
2667 ** If pDbPage is an overflow page, then the first 4 bytes may store a
2668 ** pointer to a subsequent overflow page. If this is the case, then
2669 ** the pointer map needs to be updated for the subsequent overflow page.
2670 */
2671 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2672 rc = setChildPtrmaps(pDbPage);
2673 if( rc!=SQLITE_OK ){
2674 return rc;
2675 }
2676 }else{
2677 Pgno nextOvfl = get4byte(pDbPage->aData);
2678 if( nextOvfl!=0 ){
2679 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
2680 if( rc!=SQLITE_OK ){
2681 return rc;
2682 }
2683 }
2684 }
2685
2686 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2687 ** that it points at iFreePage. Also fix the pointer map entry for
2688 ** iPtrPage.
2689 */
2690 if( eType!=PTRMAP_ROOTPAGE ){
2691 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2692 if( rc!=SQLITE_OK ){
2693 return rc;
2694 }
2695 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2696 if( rc!=SQLITE_OK ){
2697 releasePage(pPtrPage);
2698 return rc;
2699 }
2700 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2701 releasePage(pPtrPage);
2702 if( rc==SQLITE_OK ){
2703 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
2704 }
2705 }
2706 return rc;
2707 }
2708
2709 /* Forward declaration required by incrVacuumStep(). */
2710 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2711
2712 /*
2713 ** Perform a single step of an incremental-vacuum. If successful,
2714 ** return SQLITE_OK. If there is no work to do (and therefore no
2715 ** point in calling this function again), return SQLITE_DONE.
2716 **
2717 ** More specificly, this function attempts to re-organize the
2718 ** database so that the last page of the file currently in use
2719 ** is no longer in use.
2720 **
2721 ** If the nFin parameter is non-zero, this function assumes
2722 ** that the caller will keep calling incrVacuumStep() until
2723 ** it returns SQLITE_DONE or an error, and that nFin is the
2724 ** number of pages the database file will contain after this
2725 ** process is complete. If nFin is zero, it is assumed that
2726 ** incrVacuumStep() will be called a finite amount of times
2727 ** which may or may not empty the freelist. A full autovacuum
2728 ** has nFin>0. A "PRAGMA incremental_vacuum" has nFin==0.
2729 */
2730 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){
2731 Pgno nFreeList; /* Number of pages still on the free-list */
2732
2733 assert( sqlite3_mutex_held(pBt->mutex) );
2734 assert( iLastPg>nFin );
2735
2736 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2737 int rc;
2738 u8 eType;
2739 Pgno iPtrPage;
2740
2741 nFreeList = get4byte(&pBt->pPage1->aData[36]);
2742 if( nFreeList==0 ){
2743 return SQLITE_DONE;
2744 }
2745
2746 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2747 if( rc!=SQLITE_OK ){
2748 return rc;
2749 }
2750 if( eType==PTRMAP_ROOTPAGE ){
2751 return SQLITE_CORRUPT_BKPT;
2752 }
2753
2754 if( eType==PTRMAP_FREEPAGE ){
2755 if( nFin==0 ){
2756 /* Remove the page from the files free-list. This is not required
2757 ** if nFin is non-zero. In that case, the free-list will be
2758 ** truncated to zero after this function returns, so it doesn't
2759 ** matter if it still contains some garbage entries.
2760 */
2761 Pgno iFreePg;
2762 MemPage *pFreePg;
2763 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2764 if( rc!=SQLITE_OK ){
2765 return rc;
2766 }
2767 assert( iFreePg==iLastPg );
2768 releasePage(pFreePg);
2769 }
2770 } else {
2771 Pgno iFreePg; /* Index of free page to move pLastPg to */
2772 MemPage *pLastPg;
2773
2774 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
2775 if( rc!=SQLITE_OK ){
2776 return rc;
2777 }
2778
2779 /* If nFin is zero, this loop runs exactly once and page pLastPg
2780 ** is swapped with the first free page pulled off the free list.
2781 **
2782 ** On the other hand, if nFin is greater than zero, then keep
2783 ** looping until a free-page located within the first nFin pages
2784 ** of the file is found.
2785 */
2786 do {
2787 MemPage *pFreePg;
2788 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2789 if( rc!=SQLITE_OK ){
2790 releasePage(pLastPg);
2791 return rc;
2792 }
2793 releasePage(pFreePg);
2794 }while( nFin!=0 && iFreePg>nFin );
2795 assert( iFreePg<iLastPg );
2796
2797 rc = sqlite3PagerWrite(pLastPg->pDbPage);
2798 if( rc==SQLITE_OK ){
2799 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
2800 }
2801 releasePage(pLastPg);
2802 if( rc!=SQLITE_OK ){
2803 return rc;
2804 }
2805 }
2806 }
2807
2808 if( nFin==0 ){
2809 iLastPg--;
2810 while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){
2811 if( PTRMAP_ISPAGE(pBt, iLastPg) ){
2812 MemPage *pPg;
2813 int rc = btreeGetPage(pBt, iLastPg, &pPg, 0);
2814 if( rc!=SQLITE_OK ){
2815 return rc;
2816 }
2817 rc = sqlite3PagerWrite(pPg->pDbPage);
2818 releasePage(pPg);
2819 if( rc!=SQLITE_OK ){
2820 return rc;
2821 }
2822 }
2823 iLastPg--;
2824 }
2825 sqlite3PagerTruncateImage(pBt->pPager, iLastPg);
2826 }
2827 return SQLITE_OK;
2828 }
2829
2830 /*
2831 ** A write-transaction must be opened before calling this function.
2832 ** It performs a single unit of work towards an incremental vacuum.
2833 **
2834 ** If the incremental vacuum is finished after this function has run,
2835 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
2836 ** SQLITE_OK is returned. Otherwise an SQLite error code.
2837 */
2838 int sqlite3BtreeIncrVacuum(Btree *p){
2839 int rc;
2840 BtShared *pBt = p->pBt;
2841
2842 sqlite3BtreeEnter(p);
2843 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2844 if( !pBt->autoVacuum ){
2845 rc = SQLITE_DONE;
2846 }else{
2847 invalidateAllOverflowCache(pBt);
2848 rc = incrVacuumStep(pBt, 0, pagerPagecount(pBt));
2849 }
2850 sqlite3BtreeLeave(p);
2851 return rc;
2852 }
2853
2854 /*
2855 ** This routine is called prior to sqlite3PagerCommit when a transaction
2856 ** is commited for an auto-vacuum database.
2857 **
2858 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2859 ** the database file should be truncated to during the commit process.
2860 ** i.e. the database has been reorganized so that only the first *pnTrunc
2861 ** pages are in use.
2862 */
2863 static int autoVacuumCommit(BtShared *pBt){
2864 int rc = SQLITE_OK;
2865 Pager *pPager = pBt->pPager;
2866 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
2867
2868 assert( sqlite3_mutex_held(pBt->mutex) );
2869 invalidateAllOverflowCache(pBt);
2870 assert(pBt->autoVacuum);
2871 if( !pBt->incrVacuum ){
2872 Pgno nFin; /* Number of pages in database after autovacuuming */
2873 Pgno nFree; /* Number of pages on the freelist initially */
2874 Pgno nPtrmap; /* Number of PtrMap pages to be freed */
2875 Pgno iFree; /* The next page to be freed */
2876 int nEntry; /* Number of entries on one ptrmap page */
2877 Pgno nOrig; /* Database size before freeing */
2878
2879 nOrig = pagerPagecount(pBt);
2880 if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
2881 /* It is not possible to create a database for which the final page
2882 ** is either a pointer-map page or the pending-byte page. If one
2883 ** is encountered, this indicates corruption.
2884 */
2885 return SQLITE_CORRUPT_BKPT;
2886 }
2887
2888 nFree = get4byte(&pBt->pPage1->aData[36]);
2889 nEntry = pBt->usableSize/5;
2890 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
2891 nFin = nOrig - nFree - nPtrmap;
2892 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
2893 nFin--;
2894 }
2895 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2896 nFin--;
2897 }
2898 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
2899
2900 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
2901 rc = incrVacuumStep(pBt, nFin, iFree);
2902 }
2903 if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
2904 rc = SQLITE_OK;
2905 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2906 put4byte(&pBt->pPage1->aData[32], 0);
2907 put4byte(&pBt->pPage1->aData[36], 0);
2908 sqlite3PagerTruncateImage(pBt->pPager, nFin);
2909 }
2910 if( rc!=SQLITE_OK ){
2911 sqlite3PagerRollback(pPager);
2912 }
2913 }
2914
2915 assert( nRef==sqlite3PagerRefcount(pPager) );
2916 return rc;
2917 }
2918
2919 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
2920 # define setChildPtrmaps(x) SQLITE_OK
2921 #endif
2922
2923 /*
2924 ** This routine does the first phase of a two-phase commit. This routine
2925 ** causes a rollback journal to be created (if it does not already exist)
2926 ** and populated with enough information so that if a power loss occurs
2927 ** the database can be restored to its original state by playing back
2928 ** the journal. Then the contents of the journal are flushed out to
2929 ** the disk. After the journal is safely on oxide, the changes to the
2930 ** database are written into the database file and flushed to oxide.
2931 ** At the end of this call, the rollback journal still exists on the
2932 ** disk and we are still holding all locks, so the transaction has not
2933 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the
2934 ** commit process.
2935 **
2936 ** This call is a no-op if no write-transaction is currently active on pBt.
2937 **
2938 ** Otherwise, sync the database file for the btree pBt. zMaster points to
2939 ** the name of a master journal file that should be written into the
2940 ** individual journal file, or is NULL, indicating no master journal file
2941 ** (single database transaction).
2942 **
2943 ** When this is called, the master journal should already have been
2944 ** created, populated with this journal pointer and synced to disk.
2945 **
2946 ** Once this is routine has returned, the only thing required to commit
2947 ** the write-transaction for this database file is to delete the journal.
2948 */
2949 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2950 int rc = SQLITE_OK;
2951 if( p->inTrans==TRANS_WRITE ){
2952 BtShared *pBt = p->pBt;
2953 sqlite3BtreeEnter(p);
2954 #ifndef SQLITE_OMIT_AUTOVACUUM
2955 if( pBt->autoVacuum ){
2956 rc = autoVacuumCommit(pBt);
2957 if( rc!=SQLITE_OK ){
2958 sqlite3BtreeLeave(p);
2959 return rc;
2960 }
2961 }
2962 #endif
2963 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
2964 sqlite3BtreeLeave(p);
2965 }
2966 return rc;
2967 }
2968
2969 /*
2970 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
2971 ** at the conclusion of a transaction.
2972 */
2973 static void btreeEndTransaction(Btree *p){
2974 BtShared *pBt = p->pBt;
2975 BtCursor *pCsr;
2976 assert( sqlite3BtreeHoldsMutex(p) );
2977
2978 /* Search for a cursor held open by this b-tree connection. If one exists,
2979 ** then the transaction will be downgraded to a read-only transaction
2980 ** instead of actually concluded. A subsequent call to CommitPhaseTwo()
2981 ** or Rollback() will finish the transaction and unlock the database. */
2982 for(pCsr=pBt->pCursor; pCsr && pCsr->pBtree!=p; pCsr=pCsr->pNext);
2983 assert( pCsr==0 || p->inTrans>TRANS_NONE );
2984
2985 btreeClearHasContent(pBt);
2986 if( pCsr ){
2987 downgradeAllSharedCacheTableLocks(p);
2988 p->inTrans = TRANS_READ;
2989 }else{
2990 /* If the handle had any kind of transaction open, decrement the
2991 ** transaction count of the shared btree. If the transaction count
2992 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
2993 ** call below will unlock the pager. */
2994 if( p->inTrans!=TRANS_NONE ){
2995 clearAllSharedCacheTableLocks(p);
2996 pBt->nTransaction--;
2997 if( 0==pBt->nTransaction ){
2998 pBt->inTransaction = TRANS_NONE;
2999 }
3000 }
3001
3002 /* Set the current transaction state to TRANS_NONE and unlock the
3003 ** pager if this call closed the only read or write transaction. */
3004 p->inTrans = TRANS_NONE;
3005 unlockBtreeIfUnused(pBt);
3006 }
3007
3008 btreeIntegrity(p);
3009 }
3010
3011 /*
3012 ** Commit the transaction currently in progress.
3013 **
3014 ** This routine implements the second phase of a 2-phase commit. The
3015 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
3016 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne()
3017 ** routine did all the work of writing information out to disk and flushing the
3018 ** contents so that they are written onto the disk platter. All this
3019 ** routine has to do is delete or truncate or zero the header in the
3020 ** the rollback journal (which causes the transaction to commit) and
3021 ** drop locks.
3022 **
3023 ** This will release the write lock on the database file. If there
3024 ** are no active cursors, it also releases the read lock.
3025 */
3026 int sqlite3BtreeCommitPhaseTwo(Btree *p){
3027 BtShared *pBt = p->pBt;
3028
3029 sqlite3BtreeEnter(p);
3030 btreeIntegrity(p);
3031
3032 /* If the handle has a write-transaction open, commit the shared-btrees
3033 ** transaction and set the shared state to TRANS_READ.
3034 */
3035 if( p->inTrans==TRANS_WRITE ){
3036 int rc;
3037 assert( pBt->inTransaction==TRANS_WRITE );
3038 assert( pBt->nTransaction>0 );
3039 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
3040 if( rc!=SQLITE_OK ){
3041 sqlite3BtreeLeave(p);
3042 return rc;
3043 }
3044 pBt->inTransaction = TRANS_READ;
3045 }
3046
3047 btreeEndTransaction(p);
3048 sqlite3BtreeLeave(p);
3049 return SQLITE_OK;
3050 }
3051
3052 /*
3053 ** Do both phases of a commit.
3054 */
3055 int sqlite3BtreeCommit(Btree *p){
3056 int rc;
3057 sqlite3BtreeEnter(p);
3058 rc = sqlite3BtreeCommitPhaseOne(p, 0);
3059 if( rc==SQLITE_OK ){
3060 rc = sqlite3BtreeCommitPhaseTwo(p);
3061 }
3062 sqlite3BtreeLeave(p);
3063 return rc;
3064 }
3065
3066 #ifndef NDEBUG
3067 /*
3068 ** Return the number of write-cursors open on this handle. This is for use
3069 ** in assert() expressions, so it is only compiled if NDEBUG is not
3070 ** defined.
3071 **
3072 ** For the purposes of this routine, a write-cursor is any cursor that
3073 ** is capable of writing to the databse. That means the cursor was
3074 ** originally opened for writing and the cursor has not be disabled
3075 ** by having its state changed to CURSOR_FAULT.
3076 */
3077 static int countWriteCursors(BtShared *pBt){
3078 BtCursor *pCur;
3079 int r = 0;
3080 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3081 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
3082 }
3083 return r;
3084 }
3085 #endif
3086
3087 /*
3088 ** This routine sets the state to CURSOR_FAULT and the error
3089 ** code to errCode for every cursor on BtShared that pBtree
3090 ** references.
3091 **
3092 ** Every cursor is tripped, including cursors that belong
3093 ** to other database connections that happen to be sharing
3094 ** the cache with pBtree.
3095 **
3096 ** This routine gets called when a rollback occurs.
3097 ** All cursors using the same cache must be tripped
3098 ** to prevent them from trying to use the btree after
3099 ** the rollback. The rollback may have deleted tables
3100 ** or moved root pages, so it is not sufficient to
3101 ** save the state of the cursor. The cursor must be
3102 ** invalidated.
3103 */
3104 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
3105 BtCursor *p;
3106 sqlite3BtreeEnter(pBtree);
3107 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
3108 int i;
3109 sqlite3BtreeClearCursor(p);
3110 p->eState = CURSOR_FAULT;
3111 p->skipNext = errCode;
3112 for(i=0; i<=p->iPage; i++){
3113 releasePage(p->apPage[i]);
3114 p->apPage[i] = 0;
3115 }
3116 }
3117 sqlite3BtreeLeave(pBtree);
3118 }
3119
3120 /*
3121 ** Rollback the transaction in progress. All cursors will be
3122 ** invalided by this operation. Any attempt to use a cursor
3123 ** that was open at the beginning of this operation will result
3124 ** in an error.
3125 **
3126 ** This will release the write lock on the database file. If there
3127 ** are no active cursors, it also releases the read lock.
3128 */
3129 int sqlite3BtreeRollback(Btree *p){
3130 int rc;
3131 BtShared *pBt = p->pBt;
3132 MemPage *pPage1;
3133
3134 sqlite3BtreeEnter(p);
3135 rc = saveAllCursors(pBt, 0, 0);
3136 #ifndef SQLITE_OMIT_SHARED_CACHE
3137 if( rc!=SQLITE_OK ){
3138 /* This is a horrible situation. An IO or malloc() error occurred whilst
3139 ** trying to save cursor positions. If this is an automatic rollback (as
3140 ** the result of a constraint, malloc() failure or IO error) then
3141 ** the cache may be internally inconsistent (not contain valid trees) so
3142 ** we cannot simply return the error to the caller. Instead, abort
3143 ** all queries that may be using any of the cursors that failed to save.
3144 */
3145 sqlite3BtreeTripAllCursors(p, rc);
3146 }
3147 #endif
3148 btreeIntegrity(p);
3149
3150 if( p->inTrans==TRANS_WRITE ){
3151 int rc2;
3152
3153 assert( TRANS_WRITE==pBt->inTransaction );
3154 rc2 = sqlite3PagerRollback(pBt->pPager);
3155 if( rc2!=SQLITE_OK ){
3156 rc = rc2;
3157 }
3158
3159 /* The rollback may have destroyed the pPage1->aData value. So
3160 ** call btreeGetPage() on page 1 again to make
3161 ** sure pPage1->aData is set correctly. */
3162 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
3163 releasePage(pPage1);
3164 }
3165 assert( countWriteCursors(pBt)==0 );
3166 pBt->inTransaction = TRANS_READ;
3167 }
3168
3169 btreeEndTransaction(p);
3170 sqlite3BtreeLeave(p);
3171 return rc;
3172 }
3173
3174 /*
3175 ** Start a statement subtransaction. The subtransaction can can be rolled
3176 ** back independently of the main transaction. You must start a transaction
3177 ** before starting a subtransaction. The subtransaction is ended automatically
3178 ** if the main transaction commits or rolls back.
3179 **
3180 ** Statement subtransactions are used around individual SQL statements
3181 ** that are contained within a BEGIN...COMMIT block. If a constraint
3182 ** error occurs within the statement, the effect of that one statement
3183 ** can be rolled back without having to rollback the entire transaction.
3184 **
3185 ** A statement sub-transaction is implemented as an anonymous savepoint. The
3186 ** value passed as the second parameter is the total number of savepoints,
3187 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
3188 ** are no active savepoints and no other statement-transactions open,
3189 ** iStatement is 1. This anonymous savepoint can be released or rolled back
3190 ** using the sqlite3BtreeSavepoint() function.
3191 */
3192 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
3193 int rc;
3194 BtShared *pBt = p->pBt;
3195 sqlite3BtreeEnter(p);
3196 assert( p->inTrans==TRANS_WRITE );
3197 assert( pBt->readOnly==0 );
3198 assert( iStatement>0 );
3199 assert( iStatement>p->db->nSavepoint );
3200 if( NEVER(p->inTrans!=TRANS_WRITE || pBt->readOnly) ){
3201 rc = SQLITE_INTERNAL;
3202 }else{
3203 assert( pBt->inTransaction==TRANS_WRITE );
3204 /* At the pager level, a statement transaction is a savepoint with
3205 ** an index greater than all savepoints created explicitly using
3206 ** SQL statements. It is illegal to open, release or rollback any
3207 ** such savepoints while the statement transaction savepoint is active.
3208 */
3209 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
3210 }
3211 sqlite3BtreeLeave(p);
3212 return rc;
3213 }
3214
3215 /*
3216 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
3217 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
3218 ** savepoint identified by parameter iSavepoint, depending on the value
3219 ** of op.
3220 **
3221 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
3222 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
3223 ** contents of the entire transaction are rolled back. This is different
3224 ** from a normal transaction rollback, as no locks are released and the
3225 ** transaction remains open.
3226 */
3227 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
3228 int rc = SQLITE_OK;
3229 if( p && p->inTrans==TRANS_WRITE ){
3230 BtShared *pBt = p->pBt;
3231 assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
3232 assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
3233 sqlite3BtreeEnter(p);
3234 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
3235 if( rc==SQLITE_OK ){
3236 rc = newDatabase(pBt);
3237 }
3238 sqlite3BtreeLeave(p);
3239 }
3240 return rc;
3241 }
3242
3243 /*
3244 ** Create a new cursor for the BTree whose root is on the page
3245 ** iTable. If a read-only cursor is requested, it is assumed that
3246 ** the caller already has at least a read-only transaction open
3247 ** on the database already. If a write-cursor is requested, then
3248 ** the caller is assumed to have an open write transaction.
3249 **
3250 ** If wrFlag==0, then the cursor can only be used for reading.
3251 ** If wrFlag==1, then the cursor can be used for reading or for
3252 ** writing if other conditions for writing are also met. These
3253 ** are the conditions that must be met in order for writing to
3254 ** be allowed:
3255 **
3256 ** 1: The cursor must have been opened with wrFlag==1
3257 **
3258 ** 2: Other database connections that share the same pager cache
3259 ** but which are not in the READ_UNCOMMITTED state may not have
3260 ** cursors open with wrFlag==0 on the same table. Otherwise
3261 ** the changes made by this write cursor would be visible to
3262 ** the read cursors in the other database connection.
3263 **
3264 ** 3: The database must be writable (not on read-only media)
3265 **
3266 ** 4: There must be an active transaction.
3267 **
3268 ** No checking is done to make sure that page iTable really is the
3269 ** root page of a b-tree. If it is not, then the cursor acquired
3270 ** will not work correctly.
3271 **
3272 ** It is assumed that the sqlite3BtreeCursorSize() bytes of memory
3273 ** pointed to by pCur have been zeroed by the caller.
3274 */
3275 static int btreeCursor(
3276 Btree *p, /* The btree */
3277 int iTable, /* Root page of table to open */
3278 int wrFlag, /* 1 to write. 0 read-only */
3279 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
3280 BtCursor *pCur /* Space for new cursor */
3281 ){
3282 BtShared *pBt = p->pBt; /* Shared b-tree handle */
3283
3284 assert( sqlite3BtreeHoldsMutex(p) );
3285 assert( wrFlag==0 || wrFlag==1 );
3286
3287 /* The following assert statements verify that if this is a sharable
3288 ** b-tree database, the connection is holding the required table locks,
3289 ** and that no other connection has any open cursor that conflicts with
3290 ** this lock. */
3291 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) );
3292 assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
3293
3294 /* Assert that the caller has opened the required transaction. */
3295 assert( p->inTrans>TRANS_NONE );
3296 assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
3297 assert( pBt->pPage1 && pBt->pPage1->aData );
3298
3299 if( NEVER(wrFlag && pBt->readOnly) ){
3300 return SQLITE_READONLY;
3301 }
3302 if( iTable==1 && pagerPagecount(pBt)==0 ){
3303 return SQLITE_EMPTY;
3304 }
3305
3306 /* Now that no other errors can occur, finish filling in the BtCursor
3307 ** variables and link the cursor into the BtShared list. */
3308 pCur->pgnoRoot = (Pgno)iTable;
3309 pCur->iPage = -1;
3310 pCur->pKeyInfo = pKeyInfo;
3311 pCur->pBtree = p;
3312 pCur->pBt = pBt;
3313 pCur->wrFlag = (u8)wrFlag;
3314 pCur->pNext = pBt->pCursor;
3315 if( pCur->pNext ){
3316 pCur->pNext->pPrev = pCur;
3317 }
3318 pBt->pCursor = pCur;
3319 pCur->eState = CURSOR_INVALID;
3320 pCur->cachedRowid = 0;
3321 return SQLITE_OK;
3322 }
3323 int sqlite3BtreeCursor(
3324 Btree *p, /* The btree */
3325 int iTable, /* Root page of table to open */
3326 int wrFlag, /* 1 to write. 0 read-only */
3327 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
3328 BtCursor *pCur /* Write new cursor here */
3329 ){
3330 int rc;
3331 sqlite3BtreeEnter(p);
3332 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
3333 sqlite3BtreeLeave(p);
3334 return rc;
3335 }
3336
3337 /*
3338 ** Return the size of a BtCursor object in bytes.
3339 **
3340 ** This interfaces is needed so that users of cursors can preallocate
3341 ** sufficient storage to hold a cursor. The BtCursor object is opaque
3342 ** to users so they cannot do the sizeof() themselves - they must call
3343 ** this routine.
3344 */
3345 int sqlite3BtreeCursorSize(void){
3346 return sizeof(BtCursor);
3347 }
3348
3349 /*
3350 ** Set the cached rowid value of every cursor in the same database file
3351 ** as pCur and having the same root page number as pCur. The value is
3352 ** set to iRowid.
3353 **
3354 ** Only positive rowid values are considered valid for this cache.
3355 ** The cache is initialized to zero, indicating an invalid cache.
3356 ** A btree will work fine with zero or negative rowids. We just cannot
3357 ** cache zero or negative rowids, which means tables that use zero or
3358 ** negative rowids might run a little slower. But in practice, zero
3359 ** or negative rowids are very uncommon so this should not be a problem.
3360 */
3361 void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){
3362 BtCursor *p;
3363 for(p=pCur->pBt->pCursor; p; p=p->pNext){
3364 if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid;
3365 }
3366 assert( pCur->cachedRowid==iRowid );
3367 }
3368
3369 /*
3370 ** Return the cached rowid for the given cursor. A negative or zero
3371 ** return value indicates that the rowid cache is invalid and should be
3372 ** ignored. If the rowid cache has never before been set, then a
3373 ** zero is returned.
3374 */
3375 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){
3376 return pCur->cachedRowid;
3377 }
3378
3379 /*
3380 ** Close a cursor. The read lock on the database file is released
3381 ** when the last cursor is closed.
3382 */
3383 int sqlite3BtreeCloseCursor(BtCursor *pCur){
3384 Btree *pBtree = pCur->pBtree;
3385 if( pBtree ){
3386 int i;
3387 BtShared *pBt = pCur->pBt;
3388 sqlite3BtreeEnter(pBtree);
3389 sqlite3BtreeClearCursor(pCur);
3390 if( pCur->pPrev ){
3391 pCur->pPrev->pNext = pCur->pNext;
3392 }else{
3393 pBt->pCursor = pCur->pNext;
3394 }
3395 if( pCur->pNext ){
3396 pCur->pNext->pPrev = pCur->pPrev;
3397 }
3398 for(i=0; i<=pCur->iPage; i++){
3399 releasePage(pCur->apPage[i]);
3400 }
3401 unlockBtreeIfUnused(pBt);
3402 invalidateOverflowCache(pCur);
3403 /* sqlite3_free(pCur); */
3404 sqlite3BtreeLeave(pBtree);
3405 }
3406 return SQLITE_OK;
3407 }
3408
3409 /*
3410 ** Make sure the BtCursor* given in the argument has a valid
3411 ** BtCursor.info structure. If it is not already valid, call
3412 ** btreeParseCell() to fill it in.
3413 **
3414 ** BtCursor.info is a cache of the information in the current cell.
3415 ** Using this cache reduces the number of calls to btreeParseCell().
3416 **
3417 ** 2007-06-25: There is a bug in some versions of MSVC that cause the
3418 ** compiler to crash when getCellInfo() is implemented as a macro.
3419 ** But there is a measureable speed advantage to using the macro on gcc
3420 ** (when less compiler optimizations like -Os or -O0 are used and the
3421 ** compiler is not doing agressive inlining.) So we use a real function
3422 ** for MSVC and a macro for everything else. Ticket #2457.
3423 */
3424 #ifndef NDEBUG
3425 static void assertCellInfo(BtCursor *pCur){
3426 CellInfo info;
3427 int iPage = pCur->iPage;
3428 memset(&info, 0, sizeof(info));
3429 btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
3430 assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
3431 }
3432 #else
3433 #define assertCellInfo(x)
3434 #endif
3435 #ifdef _MSC_VER
3436 /* Use a real function in MSVC to work around bugs in that compiler. */
3437 static void getCellInfo(BtCursor *pCur){
3438 if( pCur->info.nSize==0 ){
3439 int iPage = pCur->iPage;
3440 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
3441 pCur->validNKey = 1;
3442 }else{
3443 assertCellInfo(pCur);
3444 }
3445 }
3446 #else /* if not _MSC_VER */
3447 /* Use a macro in all other compilers so that the function is inlined */
3448 #define getCellInfo(pCur) \
3449 if( pCur->info.nSize==0 ){ \
3450 int iPage = pCur->iPage; \
3451 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
3452 pCur->validNKey = 1; \
3453 }else{ \
3454 assertCellInfo(pCur); \
3455 }
3456 #endif /* _MSC_VER */
3457
3458 #ifndef NDEBUG /* The next routine used only within assert() statements */
3459 /*
3460 ** Return true if the given BtCursor is valid. A valid cursor is one
3461 ** that is currently pointing to a row in a (non-empty) table.
3462 ** This is a verification routine is used only within assert() statements.
3463 */
3464 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
3465 return pCur && pCur->eState==CURSOR_VALID;
3466 }
3467 #endif /* NDEBUG */
3468
3469 /*
3470 ** Set *pSize to the size of the buffer needed to hold the value of
3471 ** the key for the current entry. If the cursor is not pointing
3472 ** to a valid entry, *pSize is set to 0.
3473 **
3474 ** For a table with the INTKEY flag set, this routine returns the key
3475 ** itself, not the number of bytes in the key.
3476 **
3477 ** The caller must position the cursor prior to invoking this routine.
3478 **
3479 ** This routine cannot fail. It always returns SQLITE_OK.
3480 */
3481 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
3482 assert( cursorHoldsMutex(pCur) );
3483 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3484 if( pCur->eState!=CURSOR_VALID ){
3485 *pSize = 0;
3486 }else{
3487 getCellInfo(pCur);
3488 *pSize = pCur->info.nKey;
3489 }
3490 return SQLITE_OK;
3491 }
3492
3493 /*
3494 ** Set *pSize to the number of bytes of data in the entry the
3495 ** cursor currently points to.
3496 **
3497 ** The caller must guarantee that the cursor is pointing to a non-NULL
3498 ** valid entry. In other words, the calling procedure must guarantee
3499 ** that the cursor has Cursor.eState==CURSOR_VALID.
3500 **
3501 ** Failure is not possible. This function always returns SQLITE_OK.
3502 ** It might just as well be a procedure (returning void) but we continue
3503 ** to return an integer result code for historical reasons.
3504 */
3505 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
3506 assert( cursorHoldsMutex(pCur) );
3507 assert( pCur->eState==CURSOR_VALID );
3508 getCellInfo(pCur);
3509 *pSize = pCur->info.nData;
3510 return SQLITE_OK;
3511 }
3512
3513 /*
3514 ** Given the page number of an overflow page in the database (parameter
3515 ** ovfl), this function finds the page number of the next page in the
3516 ** linked list of overflow pages. If possible, it uses the auto-vacuum
3517 ** pointer-map data instead of reading the content of page ovfl to do so.
3518 **
3519 ** If an error occurs an SQLite error code is returned. Otherwise:
3520 **
3521 ** The page number of the next overflow page in the linked list is
3522 ** written to *pPgnoNext. If page ovfl is the last page in its linked
3523 ** list, *pPgnoNext is set to zero.
3524 **
3525 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
3526 ** to page number pOvfl was obtained, then *ppPage is set to point to that
3527 ** reference. It is the responsibility of the caller to call releasePage()
3528 ** on *ppPage to free the reference. In no reference was obtained (because
3529 ** the pointer-map was used to obtain the value for *pPgnoNext), then
3530 ** *ppPage is set to zero.
3531 */
3532 static int getOverflowPage(
3533 BtShared *pBt, /* The database file */
3534 Pgno ovfl, /* Current overflow page number */
3535 MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */
3536 Pgno *pPgnoNext /* OUT: Next overflow page number */
3537 ){
3538 Pgno next = 0;
3539 MemPage *pPage = 0;
3540 int rc = SQLITE_OK;
3541
3542 assert( sqlite3_mutex_held(pBt->mutex) );
3543 assert(pPgnoNext);
3544
3545 #ifndef SQLITE_OMIT_AUTOVACUUM
3546 /* Try to find the next page in the overflow list using the
3547 ** autovacuum pointer-map pages. Guess that the next page in
3548 ** the overflow list is page number (ovfl+1). If that guess turns
3549 ** out to be wrong, fall back to loading the data of page
3550 ** number ovfl to determine the next page number.
3551 */
3552 if( pBt->autoVacuum ){
3553 Pgno pgno;
3554 Pgno iGuess = ovfl+1;
3555 u8 eType;
3556
3557 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3558 iGuess++;
3559 }
3560
3561 if( iGuess<=pagerPagecount(pBt) ){
3562 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3563 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3564 next = iGuess;
3565 rc = SQLITE_DONE;
3566 }
3567 }
3568 }
3569 #endif
3570
3571 assert( next==0 || rc==SQLITE_DONE );
3572 if( rc==SQLITE_OK ){
3573 rc = btreeGetPage(pBt, ovfl, &pPage, 0);
3574 assert( rc==SQLITE_OK || pPage==0 );
3575 if( rc==SQLITE_OK ){
3576 next = get4byte(pPage->aData);
3577 }
3578 }
3579
3580 *pPgnoNext = next;
3581 if( ppPage ){
3582 *ppPage = pPage;
3583 }else{
3584 releasePage(pPage);
3585 }
3586 return (rc==SQLITE_DONE ? SQLITE_OK : rc);
3587 }
3588
3589 /*
3590 ** Copy data from a buffer to a page, or from a page to a buffer.
3591 **
3592 ** pPayload is a pointer to data stored on database page pDbPage.
3593 ** If argument eOp is false, then nByte bytes of data are copied
3594 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3595 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3596 ** of data are copied from the buffer pBuf to pPayload.
3597 **
3598 ** SQLITE_OK is returned on success, otherwise an error code.
3599 */
3600 static int copyPayload(
3601 void *pPayload, /* Pointer to page data */
3602 void *pBuf, /* Pointer to buffer */
3603 int nByte, /* Number of bytes to copy */
3604 int eOp, /* 0 -> copy from page, 1 -> copy to page */
3605 DbPage *pDbPage /* Page containing pPayload */
3606 ){
3607 if( eOp ){
3608 /* Copy data from buffer to page (a write operation) */
3609 int rc = sqlite3PagerWrite(pDbPage);
3610 if( rc!=SQLITE_OK ){
3611 return rc;
3612 }
3613 memcpy(pPayload, pBuf, nByte);
3614 }else{
3615 /* Copy data from page to buffer (a read operation) */
3616 memcpy(pBuf, pPayload, nByte);
3617 }
3618 return SQLITE_OK;
3619 }
3620
3621 /*
3622 ** This function is used to read or overwrite payload information
3623 ** for the entry that the pCur cursor is pointing to. If the eOp
3624 ** parameter is 0, this is a read operation (data copied into
3625 ** buffer pBuf). If it is non-zero, a write (data copied from
3626 ** buffer pBuf).
3627 **
3628 ** A total of "amt" bytes are read or written beginning at "offset".
3629 ** Data is read to or from the buffer pBuf.
3630 **
3631 ** The content being read or written might appear on the main page
3632 ** or be scattered out on multiple overflow pages.
3633 **
3634 ** If the BtCursor.isIncrblobHandle flag is set, and the current
3635 ** cursor entry uses one or more overflow pages, this function
3636 ** allocates space for and lazily populates the overflow page-list
3637 ** cache array (BtCursor.aOverflow). Subsequent calls use this
3638 ** cache to make seeking to the supplied offset more efficient.
3639 **
3640 ** Once an overflow page-list cache has been allocated, it may be
3641 ** invalidated if some other cursor writes to the same table, or if
3642 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3643 ** mode, the following events may invalidate an overflow page-list cache.
3644 **
3645 ** * An incremental vacuum,
3646 ** * A commit in auto_vacuum="full" mode,
3647 ** * Creating a table (may require moving an overflow page).
**
** Return value:
**
** * SQLITE_OK if all "amt" bytes were transferred.
** * SQLITE_CORRUPT (via SQLITE_CORRUPT_BKPT) if the requested range
**   extends past the end of the payload, if the local payload would
**   extend past the usable part of the page, or if the overflow chain
**   ends before "amt" bytes have been transferred.
** * SQLITE_NOMEM if the aOverflow[] cache cannot be allocated.
** * Otherwise the error code returned by copyPayload(),
**   getOverflowPage() or sqlite3PagerGet().
3648 */
3649 static int accessPayload(
3650 BtCursor *pCur, /* Cursor pointing to entry to read from */
3651 u32 offset, /* Begin reading this far into payload */
3652 u32 amt, /* Read this many bytes */
3653 unsigned char *pBuf, /* Write the bytes into this buffer */
3654 int eOp /* zero to read. non-zero to write. */
3655 ){
3656 unsigned char *aPayload;
3657 int rc = SQLITE_OK;
3658 u32 nKey;
3659 int iIdx = 0;
3660 MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
3661 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */
3662
3663 assert( pPage );
3664 assert( pCur->eState==CURSOR_VALID );
3665 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3666 assert( cursorHoldsMutex(pCur) );
3667
3668 getCellInfo(pCur);
3669 aPayload = pCur->info.pCell + pCur->info.nHeader; /* payload begins right after the cell header */
3670 nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey); /* nKey is treated as 0 on intKey pages */
3671
3672 if( NEVER(offset+amt > nKey+pCur->info.nData)
3673 || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
3674 ){
3675 /* Trying to read or write past the end of the data is an error */
3676 return SQLITE_CORRUPT_BKPT;
3677 }
3678
3679 /* Check if data must be read/written to/from the btree page itself. */
3680 if( offset<pCur->info.nLocal ){
3681 int a = amt; /* number of bytes to transfer from/to the local part of the cell */
3682 if( a+offset>pCur->info.nLocal ){
3683 a = pCur->info.nLocal - offset;
3684 }
3685 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3686 offset = 0; /* whatever remains starts at the beginning of the overflow data */
3687 pBuf += a;
3688 amt -= a;
3689 }else{
3690 offset -= pCur->info.nLocal; /* make offset relative to the start of the overflow data */
3691 }
3692
3693 if( rc==SQLITE_OK && amt>0 ){
3694 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
3695 Pgno nextPage;
3696
3697 nextPage = get4byte(&aPayload[pCur->info.nLocal]); /* 4 bytes after the local payload: first overflow page number */
3698
3699 #ifndef SQLITE_OMIT_INCRBLOB
3700 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3701 ** has not been allocated, allocate it now. The array is sized at
3702 ** one entry for each overflow page in the overflow chain. The
3703 ** page number of the first overflow page is stored in aOverflow[0],
3704 ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3705 ** (the cache is lazily populated).
3706 */
3707 if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3708 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3709 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3710 /* nOvfl is always positive. If it were zero, fetchPayload would have
3711 ** been used instead of this routine. */
3712 if( ALWAYS(nOvfl) && !pCur->aOverflow ){
3713 rc = SQLITE_NOMEM;
3714 }
3715 }
3716
3717 /* If the overflow page-list cache has been allocated and the
3718 ** entry for the first required overflow page is valid, skip
3719 ** directly to it.
3720 */
3721 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3722 iIdx = (offset/ovflSize);
3723 nextPage = pCur->aOverflow[iIdx];
3724 offset = (offset%ovflSize); /* offset is now relative to overflow page iIdx */
3725 }
3726 #endif
3727
3728 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3729
3730 #ifndef SQLITE_OMIT_INCRBLOB
3731 /* If required, populate the overflow page-list cache. */
3732 if( pCur->aOverflow ){
3733 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3734 pCur->aOverflow[iIdx] = nextPage;
3735 }
3736 #endif
3737
3738 if( offset>=ovflSize ){
3739 /* The only reason to read this page is to obtain the page
3740 ** number for the next page in the overflow chain. The page
3741 ** data is not required. So first try to lookup the overflow
3742 ** page-list cache, if any, then fall back to the getOverflowPage()
3743 ** function.
3744 */
3745 #ifndef SQLITE_OMIT_INCRBLOB
3746 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3747 nextPage = pCur->aOverflow[iIdx+1];
3748 } else
3749 #endif
3750 rc = getOverflowPage(pBt, nextPage, 0, &nextPage); /* body of the "else" above when INCRBLOB support is compiled in; unconditional otherwise */
3751 offset -= ovflSize; /* the whole of this overflow page has been skipped */
3752 }else{
3753 /* Need to read this page properly. It contains some of the
3754 ** range of data that is being read (eOp==0) or written (eOp!=0).
3755 */
3756 DbPage *pDbPage;
3757 int a = amt; /* number of bytes to transfer from/to this overflow page */
3758 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3759 if( rc==SQLITE_OK ){
3760 aPayload = sqlite3PagerGetData(pDbPage);
3761 nextPage = get4byte(aPayload); /* first 4 bytes of the page: next page in the chain */
3762 if( a + offset > ovflSize ){
3763 a = ovflSize - offset;
3764 }
3765 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); /* content starts after the 4-byte next-page number */
3766 sqlite3PagerUnref(pDbPage);
3767 offset = 0;
3768 amt -= a;
3769 pBuf += a;
3770 }
3771 }
3772 }
3773 }
3774
3775 if( rc==SQLITE_OK && amt>0 ){ /* no error so far, yet bytes remain untransferred (e.g. the loop stopped at nextPage==0) */
3776 return SQLITE_CORRUPT_BKPT;
3777 }
3778 return rc;
3779 }
3780
3781 /*
3782 ** Read part of the key associated with cursor pCur. Exactly
3783 ** "amt" bytes will be transfered into pBuf[]. The transfer
3784 ** begins at "offset".
3785 **
3786 ** The caller must ensure that pCur is pointing to a valid row
3787 ** in the table.
3788 **
3789 ** Return SQLITE_OK on success or an error code if anything goes
3790 ** wrong. An error is returned if "offset+amt" is larger than
3791 ** the available payload.
3792 */
3793 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3794 assert( cursorHoldsMutex(pCur) );
3795 assert( pCur->eState==CURSOR_VALID );
3796 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3797 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3798 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
3799 }
3800
3801 /*
3802 ** Read part of the data associated with cursor pCur. Exactly
3803 ** "amt" bytes will be transfered into pBuf[]. The transfer
3804 ** begins at "offset".
3805 **
3806 ** Return SQLITE_OK on success or an error code if anything goes
3807 ** wrong. An error is returned if "offset+amt" is larger than
3808 ** the available payload.
3809 */
3810 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3811 int rc;
3812
3813 #ifndef SQLITE_OMIT_INCRBLOB
3814 if ( pCur->eState==CURSOR_INVALID ){
3815 return SQLITE_ABORT;
3816 }
3817 #endif
3818
3819 assert( cursorHoldsMutex(pCur) );
3820 rc = restoreCursorPosition(pCur);
3821 if( rc==SQLITE_OK ){
3822 assert( pCur->eState==CURSOR_VALID );
3823 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3824 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3825 rc = accessPayload(pCur, offset, amt, pBuf, 0);
3826 }
3827 return rc;
3828 }
3829
3830 /*
3831 ** Return a pointer to payload information from the entry that the
3832 ** pCur cursor is pointing to. The pointer is to the beginning of
3833 ** the key if skipKey==0 and it points to the beginning of data if
3834 ** skipKey==1. The number of bytes of available key/data is written
3835 ** into *pAmt. If *pAmt==0, then the value returned will not be
3836 ** a valid pointer.
3837 **
3838 ** This routine is an optimization. It is common for the entire key
3839 ** and data to fit on the local page and for there to be no overflow
3840 ** pages. When that is so, this routine can be used to access the
3841 ** key and data without making a copy. If the key and/or data spills
3842 ** onto overflow pages, then accessPayload() must be used to reassemble
3843 ** the key/data and copy it into a preallocated buffer.
3844 **
3845 ** The pointer returned by this routine looks directly into the cached
3846 ** page of the database. The data might change or move the next time
3847 ** any btree routine is called.
3848 */
3849 static const unsigned char *fetchPayload(
3850 BtCursor *pCur, /* Cursor pointing to entry to read from */
3851 int *pAmt, /* Write the number of available bytes here */
3852 int skipKey /* read beginning at data if this is true */
3853 ){
3854 unsigned char *aPayload;
3855 MemPage *pPage;
3856 u32 nKey;
3857 u32 nLocal;
3858
3859 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
3860 assert( pCur->eState==CURSOR_VALID );
3861 assert( cursorHoldsMutex(pCur) );
3862 pPage = pCur->apPage[pCur->iPage];
3863 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3864 if( NEVER(pCur->info.nSize==0) ){
3865 btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage],
3866 &pCur->info);
3867 }
3868 aPayload = pCur->info.pCell;
3869 aPayload += pCur->info.nHeader;
3870 if( pPage->intKey ){
3871 nKey = 0;
3872 }else{
3873 nKey = (int)pCur->info.nKey;
3874 }
3875 if( skipKey ){
3876 aPayload += nKey;
3877 nLocal = pCur->info.nLocal - nKey;
3878 }else{
3879 nLocal = pCur->info.nLocal;
3880 assert( nLocal<=nKey );
3881 }
3882 *pAmt = nLocal;
3883 return aPayload;
3884 }
3885
3886
3887 /*
3888 ** For the entry that cursor pCur is point to, return as
3889 ** many bytes of the key or data as are available on the local
3890 ** b-tree page. Write the number of available bytes into *pAmt.
3891 **
3892 ** The pointer returned is ephemeral. The key/data may move
3893 ** or be destroyed on the next call to any Btree routine,
3894 ** including calls from other threads against the same cache.
3895 ** Hence, a mutex on the BtShared should be held prior to calling
3896 ** this routine.
3897 **
3898 ** These routines is used to get quick access to key and data
3899 ** in the common case where no overflow pages are used.
3900 */
3901 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
3902 const void *p = 0;
3903 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3904 assert( cursorHoldsMutex(pCur) );
3905 if( ALWAYS(pCur->eState==CURSOR_VALID) ){
3906 p = (const void*)fetchPayload(pCur, pAmt, 0);
3907 }
3908 return p;
3909 }
3910 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
3911 const void *p = 0;
3912 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3913 assert( cursorHoldsMutex(pCur) );
3914 if( ALWAYS(pCur->eState==CURSOR_VALID) ){
3915 p = (const void*)fetchPayload(pCur, pAmt, 1);
3916 }
3917 return p;
3918 }
3919
3920
3921 /*
3922 ** Move the cursor down to a new child page. The newPgno argument is the
3923 ** page number of the child page to move to.
3924 **
3925 ** This function returns SQLITE_CORRUPT if the page-header flags field of
3926 ** the new child page does not match the flags field of the parent (i.e.
3927 ** if an intkey page appears to be the parent of a non-intkey page, or
3928 ** vice-versa).
3929 */
3930 static int moveToChild(BtCursor *pCur, u32 newPgno){
3931 int rc;
3932 int i = pCur->iPage;
3933 MemPage *pNewPage;
3934 BtShared *pBt = pCur->pBt;
3935
3936 assert( cursorHoldsMutex(pCur) );
3937 assert( pCur->eState==CURSOR_VALID );
3938 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
3939 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
3940 return SQLITE_CORRUPT_BKPT;
3941 }
3942 rc = getAndInitPage(pBt, newPgno, &pNewPage);
3943 if( rc ) return rc;
3944 pCur->apPage[i+1] = pNewPage;
3945 pCur->aiIdx[i+1] = 0;
3946 pCur->iPage++;
3947
3948 pCur->info.nSize = 0;
3949 pCur->validNKey = 0;
3950 if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){
3951 return SQLITE_CORRUPT_BKPT;
3952 }
3953 return SQLITE_OK;
3954 }
3955
3956 #ifndef NDEBUG
3957 /*
3958 ** Page pParent is an internal (non-leaf) tree page. This function
3959 ** asserts that page number iChild is the left-child if the iIdx'th
3960 ** cell in page pParent. Or, if iIdx is equal to the total number of
3961 ** cells in pParent, that page number iChild is the right-child of
3962 ** the page.
3963 */
3964 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
3965 assert( iIdx<=pParent->nCell );
3966 if( iIdx==pParent->nCell ){
3967 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
3968 }else{
3969 assert( get4byte(findCell(pParent, iIdx))==iChild );
3970 }
3971 }
3972 #else
3973 # define assertParentIndex(x,y,z)
3974 #endif
3975
3976 /*
3977 ** Move the cursor up to the parent page.
3978 **
3979 ** pCur->idx is set to the cell index that contains the pointer
3980 ** to the page we are coming from. If we are coming from the
3981 ** right-most child page then pCur->idx is set to one more than
3982 ** the largest cell index.
3983 */
3984 static void moveToParent(BtCursor *pCur){
3985 assert( cursorHoldsMutex(pCur) );
3986 assert( pCur->eState==CURSOR_VALID );
3987 assert( pCur->iPage>0 );
3988 assert( pCur->apPage[pCur->iPage] );
3989 assertParentIndex(
3990 pCur->apPage[pCur->iPage-1],
3991 pCur->aiIdx[pCur->iPage-1],
3992 pCur->apPage[pCur->iPage]->pgno
3993 );
3994 releasePage(pCur->apPage[pCur->iPage]);
3995 pCur->iPage--;
3996 pCur->info.nSize = 0;
3997 pCur->validNKey = 0;
3998 }
3999
4000 /*
4001 ** Move the cursor to point to the root page of its b-tree structure.
4002 **
4003 ** If the table has a virtual root page, then the cursor is moved to point
4004 ** to the virtual root page instead of the actual root page. A table has a
4005 ** virtual root page when the actual root page contains no cells and a
4006 ** single child page. This can only happen with the table rooted at page 1.
4007 **
4008 ** If the b-tree structure is empty, the cursor state is set to
4009 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
4010 ** cell located on the root (or virtual root) page and the cursor state
4011 ** is set to CURSOR_VALID.
4012 **
4013 ** If this function returns successfully, it may be assumed that the
4014 ** page-header flags indicate that the [virtual] root-page is the expected
4015 ** kind of b-tree page (i.e. if when opening the cursor the caller did not
4016 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
4017 ** indicating a table b-tree, or if the caller did specify a KeyInfo
4018 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
4019 ** b-tree).
4020 */
4021 static int moveToRoot(BtCursor *pCur){
4022 MemPage *pRoot;
4023 int rc = SQLITE_OK;
4024 Btree *p = pCur->pBtree;
4025 BtShared *pBt = p->pBt;
4026
4027 assert( cursorHoldsMutex(pCur) );
4028 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
4029 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
4030 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
4031 if( pCur->eState>=CURSOR_REQUIRESEEK ){
4032 if( pCur->eState==CURSOR_FAULT ){
4033 assert( pCur->skipNext!=SQLITE_OK );
4034 return pCur->skipNext;
4035 }
4036 sqlite3BtreeClearCursor(pCur);
4037 }
4038
4039 if( pCur->iPage>=0 ){
4040 int i;
4041 for(i=1; i<=pCur->iPage; i++){
4042 releasePage(pCur->apPage[i]);
4043 }
4044 pCur->iPage = 0;
4045 }else{
4046 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
4047 if( rc!=SQLITE_OK ){
4048 pCur->eState = CURSOR_INVALID;
4049 return rc;
4050 }
4051 pCur->iPage = 0;
4052
4053 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
4054 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
4055 ** NULL, the caller expects a table b-tree. If this is not the case,
4056 ** return an SQLITE_CORRUPT error. */
4057 assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 );
4058 if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){
4059 return SQLITE_CORRUPT_BKPT;
4060 }
4061 }
4062
4063 /* Assert that the root page is of the correct type. This must be the
4064 ** case as the call to this function that loaded the root-page (either
4065 ** this call or a previous invocation) would have detected corruption
4066 ** if the assumption were not true, and it is not possible for the flags
4067 ** byte to have been modified while this cursor is holding a reference
4068 ** to the page. */
4069 pRoot = pCur->apPage[0];
4070 assert( pRoot->pgno==pCur->pgnoRoot );
4071 assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey );
4072
4073 pCur->aiIdx[0] = 0;
4074 pCur->info.nSize = 0;
4075 pCur->atLast = 0;
4076 pCur->validNKey = 0;
4077
4078 if( pRoot->nCell==0 && !pRoot->leaf ){
4079 Pgno subpage;
4080 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
4081 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
4082 pCur->eState = CURSOR_VALID;
4083 rc = moveToChild(pCur, subpage);
4084 }else{
4085 pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
4086 }
4087 return rc;
4088 }
4089
4090 /*
4091 ** Move the cursor down to the left-most leaf entry beneath the
4092 ** entry to which it is currently pointing.
4093 **
4094 ** The left-most leaf is the one with the smallest key - the first
4095 ** in ascending order.
4096 */
4097 static int moveToLeftmost(BtCursor *pCur){
4098 Pgno pgno;
4099 int rc = SQLITE_OK;
4100 MemPage *pPage;
4101
4102 assert( cursorHoldsMutex(pCur) );
4103 assert( pCur->eState==CURSOR_VALID );
4104 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4105 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
4106 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
4107 rc = moveToChild(pCur, pgno);
4108 }
4109 return rc;
4110 }
4111
4112 /*
4113 ** Move the cursor down to the right-most leaf entry beneath the
4114 ** page to which it is currently pointing. Notice the difference
4115 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
4116 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
4117 ** finds the right-most entry beneath the *page*.
4118 **
4119 ** The right-most entry is the one with the largest key - the last
4120 ** key in ascending order.
4121 */
4122 static int moveToRightmost(BtCursor *pCur){
4123 Pgno pgno;
4124 int rc = SQLITE_OK;
4125 MemPage *pPage = 0;
4126
4127 assert( cursorHoldsMutex(pCur) );
4128 assert( pCur->eState==CURSOR_VALID );
4129 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
4130 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4131 pCur->aiIdx[pCur->iPage] = pPage->nCell;
4132 rc = moveToChild(pCur, pgno);
4133 }
4134 if( rc==SQLITE_OK ){
4135 pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
4136 pCur->info.nSize = 0;
4137 pCur->validNKey = 0;
4138 }
4139 return rc;
4140 }
4141
4142 /* Move the cursor to the first entry in the table. Return SQLITE_OK
4143 ** on success. Set *pRes to 0 if the cursor actually points to something
4144 ** or set *pRes to 1 if the table is empty.
4145 */
4146 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
4147 int rc;
4148
4149 assert( cursorHoldsMutex(pCur) );
4150 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4151 rc = moveToRoot(pCur);
4152 if( rc==SQLITE_OK ){
4153 if( pCur->eState==CURSOR_INVALID ){
4154 assert( pCur->apPage[pCur->iPage]->nCell==0 );
4155 *pRes = 1;
4156 rc = SQLITE_OK;
4157 }else{
4158 assert( pCur->apPage[pCur->iPage]->nCell>0 );
4159 *pRes = 0;
4160 rc = moveToLeftmost(pCur);
4161 }
4162 }
4163 return rc;
4164 }
4165
4166 /* Move the cursor to the last entry in the table. Return SQLITE_OK
4167 ** on success. Set *pRes to 0 if the cursor actually points to something
4168 ** or set *pRes to 1 if the table is empty.
4169 */
4170 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
4171 int rc;
4172
4173 assert( cursorHoldsMutex(pCur) );
4174 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4175
4176 /* If the cursor already points to the last entry, this is a no-op. */
4177 if( CURSOR_VALID==pCur->eState && pCur->atLast ){
4178 #ifdef SQLITE_DEBUG
4179 /* This block serves to assert() that the cursor really does point
4180 ** to the last entry in the b-tree. */
4181 int ii;
4182 for(ii=0; ii<pCur->iPage; ii++){
4183 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
4184 }
4185 assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
4186 assert( pCur->apPage[pCur->iPage]->leaf );
4187 #endif
4188 return SQLITE_OK;
4189 }
4190
4191 rc = moveToRoot(pCur);
4192 if( rc==SQLITE_OK ){
4193 if( CURSOR_INVALID==pCur->eState ){
4194 assert( pCur->apPage[pCur->iPage]->nCell==0 );
4195 *pRes = 1;
4196 }else{
4197 assert( pCur->eState==CURSOR_VALID );
4198 *pRes = 0;
4199 rc = moveToRightmost(pCur);
4200 pCur->atLast = rc==SQLITE_OK ?1:0;
4201 }
4202 }
4203 return rc;
4204 }
4205
4206 /* Move the cursor so that it points to an entry near the key
4207 ** specified by pIdxKey or intKey. Return a success code.
4208 **
4209 ** For INTKEY tables, the intKey parameter is used. pIdxKey
4210 ** must be NULL. For index tables, pIdxKey is used and intKey
4211 ** is ignored.
4212 **
4213 ** If an exact match is not found, then the cursor is always
4214 ** left pointing at a leaf page which would hold the entry if it
4215 ** were present. The cursor might point to an entry that comes
4216 ** before or after the key.
4217 **
4218 ** An integer is written into *pRes which is the result of
4219 ** comparing the key with the entry to which the cursor is
4220 ** pointing. The meaning of the integer written into
4221 ** *pRes is as follows:
4222 **
4223 ** *pRes<0 The cursor is left pointing at an entry that
4224 ** is smaller than intKey/pIdxKey or if the table is empty
4225 ** and the cursor is therefore left pointing at nothing.
4226 **
4227 ** *pRes==0 The cursor is left pointing at an entry that
4228 ** exactly matches intKey/pIdxKey.
4229 **
4230 ** *pRes>0 The cursor is left pointing at an entry that
4231 ** is larger than intKey/pIdxKey.
4232 **
** If biasRight is true, the binary search on each page starts at the
** last cell of the page instead of the middle cell.
**
** SQLITE_OK is returned on success. Otherwise the return value is the
** error code from moveToRoot(), moveToChild() or accessPayload(), or
** SQLITE_NOMEM if the buffer needed to compare against an index key
** that spills onto overflow pages cannot be allocated. *pRes is not
** written when an error is returned.
4233 */
4234 int sqlite3BtreeMovetoUnpacked(
4235 BtCursor *pCur, /* The cursor to be moved */
4236 UnpackedRecord *pIdxKey, /* Unpacked index key */
4237 i64 intKey, /* The table key */
4238 int biasRight, /* If true, bias the search to the high end */
4239 int *pRes /* Write search results here */
4240 ){
4241 int rc;
4242
4243 assert( cursorHoldsMutex(pCur) );
4244 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
4245 assert( pRes );
4246 assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
4247
4248 /* If the cursor is already positioned at the point we are trying
4249 ** to move to, then just return without doing any work */
4250 if( pCur->eState==CURSOR_VALID && pCur->validNKey
4251 && pCur->apPage[0]->intKey
4252 ){
4253 if( pCur->info.nKey==intKey ){
4254 *pRes = 0;
4255 return SQLITE_OK;
4256 }
4257 if( pCur->atLast && pCur->info.nKey<intKey ){ /* already on the last entry and its key is smaller than intKey */
4258 *pRes = -1;
4259 return SQLITE_OK;
4260 }
4261 }
4262
4263 rc = moveToRoot(pCur);
4264 if( rc ){
4265 return rc;
4266 }
4267 assert( pCur->apPage[pCur->iPage] );
4268 assert( pCur->apPage[pCur->iPage]->isInit );
4269 assert( pCur->apPage[pCur->iPage]->nCell>0 || pCur->eState==CURSOR_INVALID );
4270 if( pCur->eState==CURSOR_INVALID ){ /* moveToRoot() found an empty b-tree */
4271 *pRes = -1;
4272 assert( pCur->apPage[pCur->iPage]->nCell==0 );
4273 return SQLITE_OK;
4274 }
4275 assert( pCur->apPage[0]->intKey || pIdxKey );
4276 for(;;){ /* one iteration per b-tree level, starting at the root */
4277 int lwr, upr;
4278 Pgno chldPg;
4279 MemPage *pPage = pCur->apPage[pCur->iPage];
4280 int c;
4281
4282 /* pPage->nCell must be greater than zero. If this is the root-page
4283 ** the cursor would have been INVALID above and this for(;;) loop
4284 ** not run. If this is not the root-page, then the moveToChild() routine
4285 ** would have already detected db corruption. Similarly, pPage must
4286 ** be the right kind (index or table) of b-tree page. Otherwise
4287 ** a moveToChild() or moveToRoot() call would have detected corruption. */
4288 assert( pPage->nCell>0 );
4289 assert( pPage->intKey==(pIdxKey==0) );
4290 lwr = 0; /* binary search over cells lwr..upr (inclusive) of this page */
4291 upr = pPage->nCell-1;
4292 if( biasRight ){
4293 pCur->aiIdx[pCur->iPage] = (u16)upr;
4294 }else{
4295 pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2);
4296 }
4297 for(;;){ /* one iteration per cell probed on this page */
4298 int idx = pCur->aiIdx[pCur->iPage]; /* Index of current cell in pPage */
4299 u8 *pCell; /* Pointer to current cell in pPage */
4300
4301 pCur->info.nSize = 0;
4302 pCell = findCell(pPage, idx) + pPage->childPtrSize; /* step over the child pointer, if the page has them */
4303 if( pPage->intKey ){
4304 i64 nCellKey;
4305 if( pPage->hasData ){
4306 u32 dummy;
4307 pCell += getVarint32(pCell, dummy); /* skip the varint stored ahead of the key when hasData is set */
4308 }
4309 getVarint(pCell, (u64*)&nCellKey);
4310 if( nCellKey==intKey ){
4311 c = 0;
4312 }else if( nCellKey<intKey ){
4313 c = -1;
4314 }else{
4315 assert( nCellKey>intKey );
4316 c = +1;
4317 }
4318 pCur->validNKey = 1;
4319 pCur->info.nKey = nCellKey;
4320 }else{
4321 /* The maximum supported page-size is 32768 bytes. This means that
4322 ** the maximum number of record bytes stored on an index B-Tree
4323 ** page is at most 8198 bytes, which may be stored as a 2-byte
4324 ** varint. This information is used to attempt to avoid parsing
4325 ** the entire cell by checking for the cases where the record is
4326 ** stored entirely within the b-tree page by inspecting the first
4327 ** 2 bytes of the cell.
4328 */
4329 int nCell = pCell[0];
4330 if( !(nCell & 0x80) && nCell<=pPage->maxLocal ){
4331 /* This branch runs if the record-size field of the cell is a
4332 ** single byte varint and the record fits entirely on the main
4333 ** b-tree page. */
4334 c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
4335 }else if( !(pCell[1] & 0x80)
4336 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
4337 ){
4338 /* The record-size field is a 2 byte varint and the record
4339 ** fits entirely on the main b-tree page. */
4340 c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
4341 }else{
4342 /* The record flows over onto one or more overflow pages. In
4343 ** this case the whole cell needs to be parsed, a buffer allocated
4344 ** and accessPayload() used to retrieve the record into the
4345 ** buffer before VdbeRecordCompare() can be called. */
4346 void *pCellKey;
4347 u8 * const pCellBody = pCell - pPage->childPtrSize; /* back up to the true start of the cell */
4348 btreeParseCellPtr(pPage, pCellBody, &pCur->info);
4349 nCell = (int)pCur->info.nKey;
4350 pCellKey = sqlite3Malloc( nCell );
4351 if( pCellKey==0 ){
4352 rc = SQLITE_NOMEM;
4353 goto moveto_finish;
4354 }
4355 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
4356 if( rc ){
4357 sqlite3_free(pCellKey);
4358 goto moveto_finish;
4359 }
4360 c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
4361 sqlite3_free(pCellKey);
4362 }
4363 }
4364 if( c==0 ){
4365 if( pPage->intKey && !pPage->leaf ){
4366 lwr = idx; /* exact match on an interior intkey page: keep descending, via the left child of cell idx */
4367 upr = lwr - 1;
4368 break;
4369 }else{
4370 *pRes = 0;
4371 rc = SQLITE_OK;
4372 goto moveto_finish;
4373 }
4374 }
4375 if( c<0 ){
4376 lwr = idx+1; /* cell idx is smaller than the search key */
4377 }else{
4378 upr = idx-1; /* cell idx is larger than the search key */
4379 }
4380 if( lwr>upr ){
4381 break;
4382 }
4383 pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2);
4384 }
4385 assert( lwr==upr+1 );
4386 assert( pPage->isInit );
4387 if( pPage->leaf ){
4388 chldPg = 0;
4389 }else if( lwr>=pPage->nCell ){
4390 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); /* past the last cell: use the right-child pointer in the page header */
4391 }else{
4392 chldPg = get4byte(findCell(pPage, lwr)); /* left-child pointer of cell lwr */
4393 }
4394 if( chldPg==0 ){ /* leaf page, or a child pointer that reads as 0: the search ends here */
4395 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
4396 *pRes = c; /* result of the last comparison made on this page */
4397 rc = SQLITE_OK;
4398 goto moveto_finish;
4399 }
4400 pCur->aiIdx[pCur->iPage] = (u16)lwr;
4401 pCur->info.nSize = 0;
4402 pCur->validNKey = 0;
4403 rc = moveToChild(pCur, chldPg);
4404 if( rc ) goto moveto_finish;
4405 }
4406 moveto_finish:
4407 return rc;
4408 }
4409
4410
4411 /*
4412 ** Return TRUE if the cursor is not pointing at an entry of the table.
4413 **
4414 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
4415 ** past the last entry in the table or sqlite3BtreePrev() moves past
4416 ** the first entry. TRUE is also returned if the table is empty.
4417 */
4418 int sqlite3BtreeEof(BtCursor *pCur){
4419 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
4420 ** have been deleted? This API will need to change to return an error code
4421 ** as well as the boolean result value.
4422 */
4423 return (CURSOR_VALID!=pCur->eState);
4424 }
4425
4426 /*
4427 ** Advance the cursor to the next entry in the database. If
4428 ** successful then set *pRes=0. If the cursor
4429 ** was already pointing to the last entry in the database before
4430 ** this routine was called, then set *pRes=1.
4431 */
4432 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
4433 int rc;
4434 int idx;
4435 MemPage *pPage;
4436
4437 assert( cursorHoldsMutex(pCur) );
4438 rc = restoreCursorPosition(pCur);
4439 if( rc!=SQLITE_OK ){
4440 return rc;
4441 }
4442 assert( pRes!=0 );
4443 if( CURSOR_INVALID==pCur->eState ){
4444 *pRes = 1;
4445 return SQLITE_OK;
4446 }
4447 if( pCur->skipNext>0 ){
4448 pCur->skipNext = 0;
4449 *pRes = 0;
4450 return SQLITE_OK;
4451 }
4452 pCur->skipNext = 0;
4453
4454 pPage = pCur->apPage[pCur->iPage];
4455 idx = ++pCur->aiIdx[pCur->iPage];
4456 assert( pPage->isInit );
4457 assert( idx<=pPage->nCell );
4458
4459 pCur->info.nSize = 0;
4460 pCur->validNKey = 0;
4461 if( idx>=pPage->nCell ){
4462 if( !pPage->leaf ){
4463 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
4464 if( rc ) return rc;
4465 rc = moveToLeftmost(pCur);
4466 *pRes = 0;
4467 return rc;
4468 }
4469 do{
4470 if( pCur->iPage==0 ){
4471 *pRes = 1;
4472 pCur->eState = CURSOR_INVALID;
4473 return SQLITE_OK;
4474 }
4475 moveToParent(pCur);
4476 pPage = pCur->apPage[pCur->iPage];
4477 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
4478 *pRes = 0;
4479 if( pPage->intKey ){
4480 rc = sqlite3BtreeNext(pCur, pRes);
4481 }else{
4482 rc = SQLITE_OK;
4483 }
4484 return rc;
4485 }
4486 *pRes = 0;
4487 if( pPage->leaf ){
4488 return SQLITE_OK;
4489 }
4490 rc = moveToLeftmost(pCur);
4491 return rc;
4492 }
4493
4494
4495 /*
4496 ** Step the cursor to the back to the previous entry in the database. If
4497 ** successful then set *pRes=0. If the cursor
4498 ** was already pointing to the first entry in the database before
4499 ** this routine was called, then set *pRes=1.
4500 */
4501 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
4502 int rc;
4503 MemPage *pPage;
4504
4505 assert( cursorHoldsMutex(pCur) );
4506 rc = restoreCursorPosition(pCur);
4507 if( rc!=SQLITE_OK ){
4508 return rc;
4509 }
4510 pCur->atLast = 0;
4511 if( CURSOR_INVALID==pCur->eState ){
4512 *pRes = 1;
4513 return SQLITE_OK;
4514 }
4515 if( pCur->skipNext<0 ){
4516 pCur->skipNext = 0;
4517 *pRes = 0;
4518 return SQLITE_OK;
4519 }
4520 pCur->skipNext = 0;
4521
4522 pPage = pCur->apPage[pCur->iPage];
4523 assert( pPage->isInit );
4524 if( !pPage->leaf ){
4525 int idx = pCur->aiIdx[pCur->iPage];
4526 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
4527 if( rc ){
4528 return rc;
4529 }
4530 rc = moveToRightmost(pCur);
4531 }else{
4532 while( pCur->aiIdx[pCur->iPage]==0 ){
4533 if( pCur->iPage==0 ){
4534 pCur->eState = CURSOR_INVALID;
4535 *pRes = 1;
4536 return SQLITE_OK;
4537 }
4538 moveToParent(pCur);
4539 }
4540 pCur->info.nSize = 0;
4541 pCur->validNKey = 0;
4542
4543 pCur->aiIdx[pCur->iPage]--;
4544 pPage = pCur->apPage[pCur->iPage];
4545 if( pPage->intKey && !pPage->leaf ){
4546 rc = sqlite3BtreePrevious(pCur, pRes);
4547 }else{
4548 rc = SQLITE_OK;
4549 }
4550 }
4551 *pRes = 0;
4552 return rc;
4553 }
4554
4555 /*
4556 ** Allocate a new page from the database file.
4557 **
4558 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
4559 ** has already been called on the new page.) The new page has also
4560 ** been referenced and the calling routine is responsible for calling
4561 ** sqlite3PagerUnref() on the new page when it is done.
4562 **
4563 ** SQLITE_OK is returned on success. Any other return value indicates
4564 ** an error. *ppPage and *pPgno are undefined in the event of an error.
4565 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
4566 **
4567 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4568 ** locate a page close to the page number "nearby". This can be used in an
4569 ** attempt to keep related pages close to each other in the database file,
4570 ** which in turn can make database access faster.
4571 **
4572 ** If the "exact" parameter is not 0, and the page-number nearby exists
4573 ** anywhere on the free-list, then it is guarenteed to be returned. This
4574 ** is only used by auto-vacuum databases when allocating a new table.
4575 */
4576 static int allocateBtreePage(
4577 BtShared *pBt,
4578 MemPage **ppPage,
4579 Pgno *pPgno,
4580 Pgno nearby,
4581 u8 exact
4582 ){
4583 MemPage *pPage1;
4584 int rc;
4585 u32 n; /* Number of pages on the freelist */
4586 u32 k; /* Number of leaves on the trunk of the freelist */
4587 MemPage *pTrunk = 0;
4588 MemPage *pPrevTrunk = 0;
4589 Pgno mxPage; /* Total size of the database file */
4590
4591 assert( sqlite3_mutex_held(pBt->mutex) );
4592 pPage1 = pBt->pPage1;
4593 mxPage = pagerPagecount(pBt);
4594 n = get4byte(&pPage1->aData[36]);
4595 testcase( n==mxPage-1 );
4596 if( n>=mxPage ){
4597 return SQLITE_CORRUPT_BKPT;
4598 }
4599 if( n>0 ){
4600 /* There are pages on the freelist. Reuse one of those pages. */
4601 Pgno iTrunk;
4602 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4603
4604 /* If the 'exact' parameter was true and a query of the pointer-map
4605 ** shows that the page 'nearby' is somewhere on the free-list, then
4606 ** the entire-list will be searched for that page.
4607 */
4608 #ifndef SQLITE_OMIT_AUTOVACUUM
4609 if( exact && nearby<=mxPage ){
4610 u8 eType;
4611 assert( nearby>0 );
4612 assert( pBt->autoVacuum );
4613 rc = ptrmapGet(pBt, nearby, &eType, 0);
4614 if( rc ) return rc;
4615 if( eType==PTRMAP_FREEPAGE ){
4616 searchList = 1;
4617 }
4618 *pPgno = nearby;
4619 }
4620 #endif
4621
4622 /* Decrement the free-list count by 1. Set iTrunk to the index of the
4623 ** first free-list trunk page. iPrevTrunk is initially 1.
4624 */
4625 rc = sqlite3PagerWrite(pPage1->pDbPage);
4626 if( rc ) return rc;
4627 put4byte(&pPage1->aData[36], n-1);
4628
4629 /* The code within this loop is run only once if the 'searchList' variable
4630 ** is not true. Otherwise, it runs once for each trunk-page on the
4631 ** free-list until the page 'nearby' is located.
4632 */
4633 do {
4634 pPrevTrunk = pTrunk;
4635 if( pPrevTrunk ){
4636 iTrunk = get4byte(&pPrevTrunk->aData[0]);
4637 }else{
4638 iTrunk = get4byte(&pPage1->aData[32]);
4639 }
4640 testcase( iTrunk==mxPage );
4641 if( iTrunk>mxPage ){
4642 rc = SQLITE_CORRUPT_BKPT;
4643 }else{
4644 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
4645 }
4646 if( rc ){
4647 pTrunk = 0;
4648 goto end_allocate_page;
4649 }
4650
4651 k = get4byte(&pTrunk->aData[4]);
4652 if( k==0 && !searchList ){
4653 /* The trunk has no leaves and the list is not being searched.
4654 ** So extract the trunk page itself and use it as the newly
4655 ** allocated page */
4656 assert( pPrevTrunk==0 );
4657 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4658 if( rc ){
4659 goto end_allocate_page;
4660 }
4661 *pPgno = iTrunk;
4662 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4663 *ppPage = pTrunk;
4664 pTrunk = 0;
4665 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4666 }else if( k>(u32)(pBt->usableSize/4 - 2) ){
4667 /* Value of k is out of range. Database corruption */
4668 rc = SQLITE_CORRUPT_BKPT;
4669 goto end_allocate_page;
4670 #ifndef SQLITE_OMIT_AUTOVACUUM
4671 }else if( searchList && nearby==iTrunk ){
4672 /* The list is being searched and this trunk page is the page
4673 ** to allocate, regardless of whether it has leaves.
4674 */
4675 assert( *pPgno==iTrunk );
4676 *ppPage = pTrunk;
4677 searchList = 0;
4678 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4679 if( rc ){
4680 goto end_allocate_page;
4681 }
4682 if( k==0 ){
4683 if( !pPrevTrunk ){
4684 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4685 }else{
4686 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4687 }
4688 }else{
4689 /* The trunk page is required by the caller but it contains
4690 ** pointers to free-list leaves. The first leaf becomes a trunk
4691 ** page in this case.
4692 */
4693 MemPage *pNewTrunk;
4694 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
4695 if( iNewTrunk>mxPage ){
4696 rc = SQLITE_CORRUPT_BKPT;
4697 goto end_allocate_page;
4698 }
4699 testcase( iNewTrunk==mxPage );
4700 rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
4701 if( rc!=SQLITE_OK ){
4702 goto end_allocate_page;
4703 }
4704 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
4705 if( rc!=SQLITE_OK ){
4706 releasePage(pNewTrunk);
4707 goto end_allocate_page;
4708 }
4709 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4710 put4byte(&pNewTrunk->aData[4], k-1);
4711 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
4712 releasePage(pNewTrunk);
4713 if( !pPrevTrunk ){
4714 assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
4715 put4byte(&pPage1->aData[32], iNewTrunk);
4716 }else{
4717 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4718 if( rc ){
4719 goto end_allocate_page;
4720 }
4721 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4722 }
4723 }
4724 pTrunk = 0;
4725 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4726 #endif
4727 }else if( k>0 ){
4728 /* Extract a leaf from the trunk */
4729 u32 closest;
4730 Pgno iPage;
4731 unsigned char *aData = pTrunk->aData;
4732 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4733 if( rc ){
4734 goto end_allocate_page;
4735 }
4736 if( nearby>0 ){
4737 u32 i;
4738 int dist;
4739 closest = 0;
4740 dist = get4byte(&aData[8]) - nearby;
4741 if( dist<0 ) dist = -dist;
4742 for(i=1; i<k; i++){
4743 int d2 = get4byte(&aData[8+i*4]) - nearby;
4744 if( d2<0 ) d2 = -d2;
4745 if( d2<dist ){
4746 closest = i;
4747 dist = d2;
4748 }
4749 }
4750 }else{
4751 closest = 0;
4752 }
4753
4754 iPage = get4byte(&aData[8+closest*4]);
4755 testcase( iPage==mxPage );
4756 if( iPage>mxPage ){
4757 rc = SQLITE_CORRUPT_BKPT;
4758 goto end_allocate_page;
4759 }
4760 testcase( iPage==mxPage );
4761 if( !searchList || iPage==nearby ){
4762 int noContent;
4763 *pPgno = iPage;
4764 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4765 ": %d more free pages\n",
4766 *pPgno, closest+1, k, pTrunk->pgno, n-1));
4767 if( closest<k-1 ){
4768 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4769 }
4770 put4byte(&aData[4], k-1);
4771 assert( sqlite3PagerIswriteable(pTrunk->pDbPage) );
4772 noContent = !btreeGetHasContent(pBt, *pPgno);
4773 rc = btreeGetPage(pBt, *pPgno, ppPage, noContent);
4774 if( rc==SQLITE_OK ){
4775 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4776 if( rc!=SQLITE_OK ){
4777 releasePage(*ppPage);
4778 }
4779 }
4780 searchList = 0;
4781 }
4782 }
4783 releasePage(pPrevTrunk);
4784 pPrevTrunk = 0;
4785 }while( searchList );
4786 }else{
4787 /* There are no pages on the freelist, so create a new page at the
4788 ** end of the file */
4789 int nPage = pagerPagecount(pBt);
4790 *pPgno = nPage + 1;
4791
4792 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
4793 (*pPgno)++;
4794 }
4795
4796 #ifndef SQLITE_OMIT_AUTOVACUUM
4797 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
4798 /* If *pPgno refers to a pointer-map page, allocate two new pages
4799 ** at the end of the file instead of one. The first allocated page
4800 ** becomes a new pointer-map page, the second is used by the caller.
4801 */
4802 MemPage *pPg = 0;
4803 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
4804 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4805 rc = btreeGetPage(pBt, *pPgno, &pPg, 0);
4806 if( rc==SQLITE_OK ){
4807 rc = sqlite3PagerWrite(pPg->pDbPage);
4808 releasePage(pPg);
4809 }
4810 if( rc ) return rc;
4811 (*pPgno)++;
4812 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
4813 }
4814 #endif
4815
4816 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4817 rc = btreeGetPage(pBt, *pPgno, ppPage, 0);
4818 if( rc ) return rc;
4819 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4820 if( rc!=SQLITE_OK ){
4821 releasePage(*ppPage);
4822 }
4823 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
4824 }
4825
4826 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4827
4828 end_allocate_page:
4829 releasePage(pTrunk);
4830 releasePage(pPrevTrunk);
4831 if( rc==SQLITE_OK ){
4832 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
4833 releasePage(*ppPage);
4834 return SQLITE_CORRUPT_BKPT;
4835 }
4836 (*ppPage)->isInit = 0;
4837 }else{
4838 *ppPage = 0;
4839 }
4840 return rc;
4841 }
4842
4843 /*
4844 ** This function is used to add page iPage to the database file free-list.
4845 ** It is assumed that the page is not already a part of the free-list.
4846 **
4847 ** The value passed as the second argument to this function is optional.
4848 ** If the caller happens to have a pointer to the MemPage object
4849 ** corresponding to page iPage handy, it may pass it as the second value.
4850 ** Otherwise, it may pass NULL.
4851 **
4852 ** If a pointer to a MemPage object is passed as the second argument,
4853 ** its reference count is not altered by this function.
4854 */
4855 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
4856 MemPage *pTrunk = 0; /* Free-list trunk page */
4857 Pgno iTrunk = 0; /* Page number of free-list trunk page */
4858 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */
4859 MemPage *pPage; /* Page being freed. May be NULL. */
4860 int rc; /* Return Code */
4861 int nFree; /* Initial number of pages on free-list */
4862
4863 assert( sqlite3_mutex_held(pBt->mutex) );
4864 assert( iPage>1 );
4865 assert( !pMemPage || pMemPage->pgno==iPage );
4866
4867 if( pMemPage ){
4868 pPage = pMemPage;
4869 sqlite3PagerRef(pPage->pDbPage);
4870 }else{
4871 pPage = btreePageLookup(pBt, iPage);
4872 }
4873
4874 /* Increment the free page count on pPage1 */
4875 rc = sqlite3PagerWrite(pPage1->pDbPage);
4876 if( rc ) goto freepage_out;
4877 nFree = get4byte(&pPage1->aData[36]);
4878 put4byte(&pPage1->aData[36], nFree+1);
4879
4880 #ifdef SQLITE_SECURE_DELETE
4881 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
4882 ** always fully overwrite deleted information with zeros.
4883 */
4884 if( (!pPage && (rc = btreeGetPage(pBt, iPage, &pPage, 0)))
4885 || (rc = sqlite3PagerWrite(pPage->pDbPage))
4886 ){
4887 goto freepage_out;
4888 }
4889 memset(pPage->aData, 0, pPage->pBt->pageSize);
4890 #endif
4891
4892 /* If the database supports auto-vacuum, write an entry in the pointer-map
4893 ** to indicate that the page is free.
4894 */
4895 if( ISAUTOVACUUM ){
4896 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
4897 if( rc ) goto freepage_out;
4898 }
4899
4900 /* Now manipulate the actual database free-list structure. There are two
4901 ** possibilities. If the free-list is currently empty, or if the first
4902 ** trunk page in the free-list is full, then this page will become a
4903 ** new free-list trunk page. Otherwise, it will become a leaf of the
4904 ** first trunk page in the current free-list. This block tests if it
4905 ** is possible to add the page as a new free-list leaf.
4906 */
4907 if( nFree!=0 ){
4908 u32 nLeaf; /* Initial number of leaf cells on trunk page */
4909
4910 iTrunk = get4byte(&pPage1->aData[32]);
4911 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
4912 if( rc!=SQLITE_OK ){
4913 goto freepage_out;
4914 }
4915
4916 nLeaf = get4byte(&pTrunk->aData[4]);
4917 assert( pBt->usableSize>32 );
4918 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
4919 rc = SQLITE_CORRUPT_BKPT;
4920 goto freepage_out;
4921 }
4922 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
4923 /* In this case there is room on the trunk page to insert the page
4924 ** being freed as a new leaf.
4925 **
4926 ** Note that the trunk page is not really full until it contains
4927 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
4928 ** coded. But due to a coding error in versions of SQLite prior to
4929 ** 3.6.0, databases with freelist trunk pages holding more than
4930 ** usableSize/4 - 8 entries will be reported as corrupt. In order
4931 ** to maintain backwards compatibility with older versions of SQLite,
4932 ** we will continue to restrict the number of entries to usableSize/4 - 8
4933 ** for now. At some point in the future (once everyone has upgraded
4934 ** to 3.6.0 or later) we should consider fixing the conditional above
4935 ** to read "usableSize/4-2" instead of "usableSize/4-8".
4936 */
4937 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4938 if( rc==SQLITE_OK ){
4939 put4byte(&pTrunk->aData[4], nLeaf+1);
4940 put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
4941 #ifndef SQLITE_SECURE_DELETE
4942 if( pPage ){
4943 sqlite3PagerDontWrite(pPage->pDbPage);
4944 }
4945 #endif
4946 rc = btreeSetHasContent(pBt, iPage);
4947 }
4948 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
4949 goto freepage_out;
4950 }
4951 }
4952
4953 /* If control flows to this point, then it was not possible to add the
4954 ** the page being freed as a leaf page of the first trunk in the free-list.
4955 ** Possibly because the free-list is empty, or possibly because the
4956 ** first trunk in the free-list is full. Either way, the page being freed
4957 ** will become the new first trunk page in the free-list.
4958 */
4959 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
4960 goto freepage_out;
4961 }
4962 rc = sqlite3PagerWrite(pPage->pDbPage);
4963 if( rc!=SQLITE_OK ){
4964 goto freepage_out;
4965 }
4966 put4byte(pPage->aData, iTrunk);
4967 put4byte(&pPage->aData[4], 0);
4968 put4byte(&pPage1->aData[32], iPage);
4969 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
4970
4971 freepage_out:
4972 if( pPage ){
4973 pPage->isInit = 0;
4974 }
4975 releasePage(pPage);
4976 releasePage(pTrunk);
4977 return rc;
4978 }
4979 static void freePage(MemPage *pPage, int *pRC){
4980 if( (*pRC)==SQLITE_OK ){
4981 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
4982 }
4983 }
4984
4985 /*
4986 ** Free any overflow pages associated with the given Cell.
4987 */
4988 static int clearCell(MemPage *pPage, unsigned char *pCell){
4989 BtShared *pBt = pPage->pBt;
4990 CellInfo info;
4991 Pgno ovflPgno;
4992 int rc;
4993 int nOvfl;
4994 u16 ovflPageSize;
4995
4996 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4997 btreeParseCellPtr(pPage, pCell, &info);
4998 if( info.iOverflow==0 ){
4999 return SQLITE_OK; /* No overflow pages. Return without doing anything */
5000 }
5001 ovflPgno = get4byte(&pCell[info.iOverflow]);
5002 assert( pBt->usableSize > 4 );
5003 ovflPageSize = pBt->usableSize - 4;
5004 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
5005 assert( ovflPgno==0 || nOvfl>0 );
5006 while( nOvfl-- ){
5007 Pgno iNext = 0;
5008 MemPage *pOvfl = 0;
5009 if( ovflPgno<2 || ovflPgno>pagerPagecount(pBt) ){
5010 /* 0 is not a legal page number and page 1 cannot be an
5011 ** overflow page. Therefore if ovflPgno<2 or past the end of the
5012 ** file the database must be corrupt. */
5013 return SQLITE_CORRUPT_BKPT;
5014 }
5015 if( nOvfl ){
5016 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
5017 if( rc ) return rc;
5018 }
5019 rc = freePage2(pBt, pOvfl, ovflPgno);
5020 if( pOvfl ){
5021 sqlite3PagerUnref(pOvfl->pDbPage);
5022 }
5023 if( rc ) return rc;
5024 ovflPgno = iNext;
5025 }
5026 return SQLITE_OK;
5027 }
5028
5029 /*
5030 ** Create the byte sequence used to represent a cell on page pPage
5031 ** and write that byte sequence into pCell[]. Overflow pages are
5032 ** allocated and filled in as necessary. The calling procedure
5033 ** is responsible for making sure sufficient space has been allocated
5034 ** for pCell[].
5035 **
5036 ** Note that pCell does not necessary need to point to the pPage->aData
5037 ** area. pCell might point to some temporary storage. The cell will
5038 ** be constructed in this temporary area then copied into pPage->aData
5039 ** later.
5040 */
5041 static int fillInCell(
5042 MemPage *pPage, /* The page that contains the cell */
5043 unsigned char *pCell, /* Complete text of the cell */
5044 const void *pKey, i64 nKey, /* The key */
5045 const void *pData,int nData, /* The data */
5046 int nZero, /* Extra zero bytes to append to pData */
5047 int *pnSize /* Write cell size here */
5048 ){
5049 int nPayload;
5050 const u8 *pSrc;
5051 int nSrc, n, rc;
5052 int spaceLeft;
5053 MemPage *pOvfl = 0;
5054 MemPage *pToRelease = 0;
5055 unsigned char *pPrior;
5056 unsigned char *pPayload;
5057 BtShared *pBt = pPage->pBt;
5058 Pgno pgnoOvfl = 0;
5059 int nHeader;
5060 CellInfo info;
5061
5062 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5063
5064 /* pPage is not necessarily writeable since pCell might be auxiliary
5065 ** buffer space that is separate from the pPage buffer area */
5066 assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
5067 || sqlite3PagerIswriteable(pPage->pDbPage) );
5068
5069 /* Fill in the header. */
5070 nHeader = 0;
5071 if( !pPage->leaf ){
5072 nHeader += 4;
5073 }
5074 if( pPage->hasData ){
5075 nHeader += putVarint(&pCell[nHeader], nData+nZero);
5076 }else{
5077 nData = nZero = 0;
5078 }
5079 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
5080 btreeParseCellPtr(pPage, pCell, &info);
5081 assert( info.nHeader==nHeader );
5082 assert( info.nKey==nKey );
5083 assert( info.nData==(u32)(nData+nZero) );
5084
5085 /* Fill in the payload */
5086 nPayload = nData + nZero;
5087 if( pPage->intKey ){
5088 pSrc = pData;
5089 nSrc = nData;
5090 nData = 0;
5091 }else{
5092 if( NEVER(nKey>0x7fffffff || pKey==0) ){
5093 return SQLITE_CORRUPT_BKPT;
5094 }
5095 nPayload += (int)nKey;
5096 pSrc = pKey;
5097 nSrc = (int)nKey;
5098 }
5099 *pnSize = info.nSize;
5100 spaceLeft = info.nLocal;
5101 pPayload = &pCell[nHeader];
5102 pPrior = &pCell[info.iOverflow];
5103
5104 while( nPayload>0 ){
5105 if( spaceLeft==0 ){
5106 #ifndef SQLITE_OMIT_AUTOVACUUM
5107 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
5108 if( pBt->autoVacuum ){
5109 do{
5110 pgnoOvfl++;
5111 } while(
5112 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
5113 );
5114 }
5115 #endif
5116 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
5117 #ifndef SQLITE_OMIT_AUTOVACUUM
5118 /* If the database supports auto-vacuum, and the second or subsequent
5119 ** overflow page is being allocated, add an entry to the pointer-map
5120 ** for that page now.
5121 **
5122 ** If this is the first overflow page, then write a partial entry
5123 ** to the pointer-map. If we write nothing to this pointer-map slot,
5124 ** then the optimistic overflow chain processing in clearCell()
5125 ** may misinterpret the uninitialised values and delete the
5126 ** wrong pages from the database.
5127 */
5128 if( pBt->autoVacuum && rc==SQLITE_OK ){
5129 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
5130 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
5131 if( rc ){
5132 releasePage(pOvfl);
5133 }
5134 }
5135 #endif
5136 if( rc ){
5137 releasePage(pToRelease);
5138 return rc;
5139 }
5140
5141 /* If pToRelease is not zero than pPrior points into the data area
5142 ** of pToRelease. Make sure pToRelease is still writeable. */
5143 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5144
5145 /* If pPrior is part of the data area of pPage, then make sure pPage
5146 ** is still writeable */
5147 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
5148 || sqlite3PagerIswriteable(pPage->pDbPage) );
5149
5150 put4byte(pPrior, pgnoOvfl);
5151 releasePage(pToRelease);
5152 pToRelease = pOvfl;
5153 pPrior = pOvfl->aData;
5154 put4byte(pPrior, 0);
5155 pPayload = &pOvfl->aData[4];
5156 spaceLeft = pBt->usableSize - 4;
5157 }
5158 n = nPayload;
5159 if( n>spaceLeft ) n = spaceLeft;
5160
5161 /* If pToRelease is not zero than pPayload points into the data area
5162 ** of pToRelease. Make sure pToRelease is still writeable. */
5163 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
5164
5165 /* If pPayload is part of the data area of pPage, then make sure pPage
5166 ** is still writeable */
5167 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
5168 || sqlite3PagerIswriteable(pPage->pDbPage) );
5169
5170 if( nSrc>0 ){
5171 if( n>nSrc ) n = nSrc;
5172 assert( pSrc );
5173 memcpy(pPayload, pSrc, n);
5174 }else{
5175 memset(pPayload, 0, n);
5176 }
5177 nPayload -= n;
5178 pPayload += n;
5179 pSrc += n;
5180 nSrc -= n;
5181 spaceLeft -= n;
5182 if( nSrc==0 ){
5183 nSrc = nData;
5184 pSrc = pData;
5185 }
5186 }
5187 releasePage(pToRelease);
5188 return SQLITE_OK;
5189 }
5190
5191 /*
5192 ** Remove the i-th cell from pPage. This routine effects pPage only.
5193 ** The cell content is not freed or deallocated. It is assumed that
5194 ** the cell content has been copied someplace else. This routine just
5195 ** removes the reference to the cell from pPage.
5196 **
5197 ** "sz" must be the number of bytes in the cell.
5198 */
5199 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
5200 int i; /* Loop counter */
5201 int pc; /* Offset to cell content of cell being deleted */
5202 u8 *data; /* pPage->aData */
5203 u8 *ptr; /* Used to move bytes around within data[] */
5204 int rc; /* The return code */
5205 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */
5206
5207 if( *pRC ) return;
5208
5209 assert( idx>=0 && idx<pPage->nCell );
5210 assert( sz==cellSize(pPage, idx) );
5211 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5212 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5213 data = pPage->aData;
5214 ptr = &data[pPage->cellOffset + 2*idx];
5215 pc = get2byte(ptr);
5216 hdr = pPage->hdrOffset;
5217 testcase( pc==get2byte(&data[hdr+5]) );
5218 testcase( pc+sz==pPage->pBt->usableSize );
5219 if( pc < get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
5220 *pRC = SQLITE_CORRUPT_BKPT;
5221 return;
5222 }
5223 rc = freeSpace(pPage, pc, sz);
5224 if( rc ){
5225 *pRC = rc;
5226 return;
5227 }
5228 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
5229 ptr[0] = ptr[2];
5230 ptr[1] = ptr[3];
5231 }
5232 pPage->nCell--;
5233 put2byte(&data[hdr+3], pPage->nCell);
5234 pPage->nFree += 2;
5235 }
5236
5237 /*
5238 ** Insert a new cell on pPage at cell index "i". pCell points to the
5239 ** content of the cell.
5240 **
5241 ** If the cell content will fit on the page, then put it there. If it
5242 ** will not fit, then make a copy of the cell content into pTemp if
5243 ** pTemp is not null. Regardless of pTemp, allocate a new entry
5244 ** in pPage->aOvfl[] and make it point to the cell content (either
5245 ** in pTemp or the original pCell) and also record its index.
5246 ** Allocating a new entry in pPage->aCell[] implies that
5247 ** pPage->nOverflow is incremented.
5248 **
5249 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
5250 ** cell. The caller will overwrite them after this function returns. If
5251 ** nSkip is non-zero, then pCell may not point to an invalid memory location
5252 ** (but pCell+nSkip is always valid).
5253 */
5254 static void insertCell(
5255 MemPage *pPage, /* Page into which we are copying */
5256 int i, /* New cell becomes the i-th cell of the page */
5257 u8 *pCell, /* Content of the new cell */
5258 int sz, /* Bytes of content in pCell */
5259 u8 *pTemp, /* Temp storage space for pCell, if needed */
5260 Pgno iChild, /* If non-zero, replace first 4 bytes with this value */
5261 int *pRC /* Read and write return code from here */
5262 ){
5263 int idx; /* Where to write new cell content in data[] */
5264 int j; /* Loop counter */
5265 int end; /* First byte past the last cell pointer in data[] */
5266 int ins; /* Index in data[] where new cell pointer is inserted */
5267 int cellOffset; /* Address of first cell pointer in data[] */
5268 u8 *data; /* The content of the whole page */
5269 u8 *ptr; /* Used for moving information around in data[] */
5270
5271 int nSkip = (iChild ? 4 : 0);
5272
5273 if( *pRC ) return;
5274
5275 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
5276 assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 );
5277 assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) );
5278 assert( sz==cellSizePtr(pPage, pCell) );
5279 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5280 if( pPage->nOverflow || sz+2>pPage->nFree ){
5281 if( pTemp ){
5282 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
5283 pCell = pTemp;
5284 }
5285 if( iChild ){
5286 put4byte(pCell, iChild);
5287 }
5288 j = pPage->nOverflow++;
5289 assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) );
5290 pPage->aOvfl[j].pCell = pCell;
5291 pPage->aOvfl[j].idx = (u16)i;
5292 }else{
5293 int rc = sqlite3PagerWrite(pPage->pDbPage);
5294 if( rc!=SQLITE_OK ){
5295 *pRC = rc;
5296 return;
5297 }
5298 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5299 data = pPage->aData;
5300 cellOffset = pPage->cellOffset;
5301 end = cellOffset + 2*pPage->nCell;
5302 ins = cellOffset + 2*i;
5303 rc = allocateSpace(pPage, sz, &idx);
5304 if( rc ){ *pRC = rc; return; }
5305 /* The allocateSpace() routine guarantees the following two properties
5306 ** if it returns success */
5307 assert( idx >= end+2 );
5308 assert( idx+sz <= pPage->pBt->usableSize );
5309 pPage->nCell++;
5310 pPage->nFree -= (u16)(2 + sz);
5311 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
5312 if( iChild ){
5313 put4byte(&data[idx], iChild);
5314 }
5315 for(j=end, ptr=&data[j]; j>ins; j-=2, ptr-=2){
5316 ptr[0] = ptr[-2];
5317 ptr[1] = ptr[-1];
5318 }
5319 put2byte(&data[ins], idx);
5320 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
5321 #ifndef SQLITE_OMIT_AUTOVACUUM
5322 if( pPage->pBt->autoVacuum ){
5323 /* The cell may contain a pointer to an overflow page. If so, write
5324 ** the entry for the overflow page into the pointer map.
5325 */
5326 ptrmapPutOvflPtr(pPage, pCell, pRC);
5327 }
5328 #endif
5329 }
5330 }
5331
5332 /*
5333 ** Add a list of cells to a page. The page should be initially empty.
5334 ** The cells are guaranteed to fit on the page.
5335 */
5336 static void assemblePage(
5337 MemPage *pPage, /* The page to be assemblied */
5338 int nCell, /* The number of cells to add to this page */
5339 u8 **apCell, /* Pointers to cell bodies */
5340 u16 *aSize /* Sizes of the cells */
5341 ){
5342 int i; /* Loop counter */
5343 u8 *pCellptr; /* Address of next cell pointer */
5344 int cellbody; /* Address of next cell body */
5345 u8 * const data = pPage->aData; /* Pointer to data for pPage */
5346 const int hdr = pPage->hdrOffset; /* Offset of header on pPage */
5347 const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
5348
5349 assert( pPage->nOverflow==0 );
5350 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5351 assert( nCell>=0 && nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 );
5352 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
5353
5354 /* Check that the page has just been zeroed by zeroPage() */
5355 assert( pPage->nCell==0 );
5356 assert( get2byte(&data[hdr+5])==nUsable );
5357
5358 pCellptr = &data[pPage->cellOffset + nCell*2];
5359 cellbody = nUsable;
5360 for(i=nCell-1; i>=0; i--){
5361 pCellptr -= 2;
5362 cellbody -= aSize[i];
5363 put2byte(pCellptr, cellbody);
5364 memcpy(&data[cellbody], apCell[i], aSize[i]);
5365 }
5366 put2byte(&data[hdr+3], nCell);
5367 put2byte(&data[hdr+5], cellbody);
5368 pPage->nFree -= (nCell*2 + nUsable - cellbody);
5369 pPage->nCell = (u16)nCell;
5370 }
5371
5372 /*
5373 ** The following parameters determine how many adjacent pages get involved
5374 ** in a balancing operation. NN is the number of neighbors on either side
5375 ** of the page that participate in the balancing operation. NB is the
5376 ** total number of pages that participate, including the target page and
5377 ** NN neighbors on either side.
5378 **
5379 ** The minimum value of NN is 1 (of course). Increasing NN above 1
5380 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
5381 ** in exchange for a larger degradation in INSERT and UPDATE performance.
5382 ** A value of 1 for NN appears to give the best results overall.
5383 */
5384 #define NN 1 /* Number of neighbors on either side of pPage */
5385 #define NB (NN*2+1) /* Total pages involved in the balance */
5386
5387
5388 #ifndef SQLITE_OMIT_QUICKBALANCE
5389 /*
5390 ** This version of balance() handles the common special case where
5391 ** a new entry is being inserted on the extreme right-end of the
5392 ** tree, in other words, when the new entry will become the largest
5393 ** entry in the tree.
5394 **
5395 ** Instead of trying to balance the 3 right-most leaf pages, just add
5396 ** a new page to the right-hand side and put the one new entry in
5397 ** that page. This leaves the right side of the tree somewhat
5398 ** unbalanced. But odds are that we will be inserting new entries
5399 ** at the end soon afterwards so the nearly empty page will quickly
5400 ** fill up. On average.
5401 **
5402 ** pPage is the leaf page which is the right-most page in the tree.
5403 ** pParent is its parent. pPage must have a single overflow entry
5404 ** which is also the right-most entry on the page.
5405 **
5406 ** The pSpace buffer is used to store a temporary copy of the divider
5407 ** cell that will be inserted into pParent. Such a cell consists of a 4
5408 ** byte page number followed by a variable length integer. In other
5409 ** words, at most 13 bytes. Hence the pSpace buffer must be at
5410 ** least 13 bytes in size.
5411 */
5412 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
5413 BtShared *const pBt = pPage->pBt; /* B-Tree Database */
5414 MemPage *pNew; /* Newly allocated page */
5415 int rc; /* Return Code */
5416 Pgno pgnoNew; /* Page number of pNew */
5417
5418 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5419 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5420 assert( pPage->nOverflow==1 );
5421
5422 if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT;
5423
5424 /* Allocate a new page. This page will become the right-sibling of
5425 ** pPage. Make the parent page writable, so that the new divider cell
5426 ** may be inserted. If both these operations are successful, proceed.
5427 */
5428 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
5429
5430 if( rc==SQLITE_OK ){
5431
5432 u8 *pOut = &pSpace[4];
5433 u8 *pCell = pPage->aOvfl[0].pCell;
5434 u16 szCell = cellSizePtr(pPage, pCell);
5435 u8 *pStop;
5436
5437 assert( sqlite3PagerIswriteable(pNew->pDbPage) );
5438 assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
5439 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
5440 assemblePage(pNew, 1, &pCell, &szCell);
5441
5442 /* If this is an auto-vacuum database, update the pointer map
5443 ** with entries for the new page, and any pointer from the
5444 ** cell on the page to an overflow page. If either of these
5445 ** operations fails, the return code is set, but the contents
5446 ** of the parent page are still manipulated by thh code below.
5447 ** That is Ok, at this point the parent page is guaranteed to
5448 ** be marked as dirty. Returning an error code will cause a
5449 ** rollback, undoing any changes made to the parent page.
5450 */
5451 if( ISAUTOVACUUM ){
5452 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
5453 if( szCell>pNew->minLocal ){
5454 ptrmapPutOvflPtr(pNew, pCell, &rc);
5455 }
5456 }
5457
5458 /* Create a divider cell to insert into pParent. The divider cell
5459 ** consists of a 4-byte page number (the page number of pPage) and
5460 ** a variable length key value (which must be the same value as the
5461 ** largest key on pPage).
5462 **
5463 ** To find the largest key value on pPage, first find the right-most
5464 ** cell on pPage. The first two fields of this cell are the
5465 ** record-length (a variable length integer at most 32-bits in size)
5466 ** and the key value (a variable length integer, may have any value).
5467 ** The first of the while(...) loops below skips over the record-length
5468 ** field. The second while(...) loop copies the key value from the
5469 ** cell on pPage into the pSpace buffer.
5470 */
5471 pCell = findCell(pPage, pPage->nCell-1);
5472 pStop = &pCell[9];
5473 while( (*(pCell++)&0x80) && pCell<pStop );
5474 pStop = &pCell[9];
5475 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
5476
5477 /* Insert the new divider cell into pParent. */
5478 insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
5479 0, pPage->pgno, &rc);
5480
5481 /* Set the right-child pointer of pParent to point to the new page. */
5482 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
5483
5484 /* Release the reference to the new page. */
5485 releasePage(pNew);
5486 }
5487
5488 return rc;
5489 }
5490 #endif /* SQLITE_OMIT_QUICKBALANCE */
5491
#if 0
/*
** Debugging aid only; this function is compiled out and plays no part
** in the normal operation of SQLite. It is occasionally enabled by hand
** while working on the code that maintains pointer-map entries.
**
** For each of the nPage pages in apPage[], look up the pointer-map entry
** of every page that the page refers to (the first overflow page of each
** cell, the left-child of each cell on a non-leaf page, and the
** right-child of a non-leaf page) and assert() that the entry names the
** referring page as parent with the expected type. Always returns 1.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int iPg;
  for(iPg=0; iPg<nPage; iPg++){
    MemPage *pPg = apPage[iPg];
    BtShared *pShared = pPg->pBt;
    Pgno pgnoParent;                /* Parent recorded in the pointer-map */
    u8 eType;                       /* Entry type recorded in the pointer-map */
    int iCell;

    assert( pPg->isInit );

    for(iCell=0; iCell<pPg->nCell; iCell++){
      CellInfo sInfo;
      u8 *pCell = findCell(pPg, iCell);

      btreeParseCellPtr(pPg, pCell, &sInfo);

      /* First page of the overflow chain hanging off this cell, if any */
      if( sInfo.iOverflow ){
        Pgno pgnoOvfl = get4byte(&pCell[sInfo.iOverflow]);
        ptrmapGet(pShared, pgnoOvfl, &eType, &pgnoParent);
        assert( pgnoParent==pPg->pgno && eType==PTRMAP_OVERFLOW1 );
      }

      /* Left-child pointer stored at the start of an interior cell */
      if( !pPg->leaf ){
        Pgno pgnoChild = get4byte(pCell);
        ptrmapGet(pShared, pgnoChild, &eType, &pgnoParent);
        assert( pgnoParent==pPg->pgno && eType==PTRMAP_BTREE );
      }
    }

    /* Right-child pointer stored in the header of an interior page */
    if( !pPg->leaf ){
      Pgno pgnoRight = get4byte(&pPg->aData[pPg->hdrOffset+8]);
      ptrmapGet(pShared, pgnoRight, &eType, &pgnoParent);
      assert( pgnoParent==pPg->pgno && eType==PTRMAP_BTREE );
    }
  }
  return 1;
}
#endif
5533
5534 /*
5535 ** This function is used to copy the contents of the b-tree node stored
5536 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
5537 ** the pointer-map entries for each child page are updated so that the
5538 ** parent page stored in the pointer map is page pTo. If pFrom contained
5539 ** any cells with overflow page pointers, then the corresponding pointer
5540 ** map entries are also updated so that the parent page is page pTo.
5541 **
5542 ** If pFrom is currently carrying any overflow cells (entries in the
5543 ** MemPage.aOvfl[] array), they are not copied to pTo.
5544 **
5545 ** Before returning, page pTo is reinitialized using btreeInitPage().
5546 **
5547 ** The performance of this function is not critical. It is only used by
5548 ** the balance_shallower() and balance_deeper() procedures, neither of
5549 ** which are called often under normal circumstances.
5550 */
5551 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
5552 if( (*pRC)==SQLITE_OK ){
5553 BtShared * const pBt = pFrom->pBt;
5554 u8 * const aFrom = pFrom->aData;
5555 u8 * const aTo = pTo->aData;
5556 int const iFromHdr = pFrom->hdrOffset;
5557 int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
5558 TESTONLY(int rc;)
5559 int iData;
5560
5561
5562 assert( pFrom->isInit );
5563 assert( pFrom->nFree>=iToHdr );
5564 assert( get2byte(&aFrom[iFromHdr+5])<=pBt->usableSize );
5565
5566 /* Copy the b-tree node content from page pFrom to page pTo. */
5567 iData = get2byte(&aFrom[iFromHdr+5]);
5568 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
5569 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
5570
5571 /* Reinitialize page pTo so that the contents of the MemPage structure
5572 ** match the new data. The initialization of pTo "cannot" fail, as the
5573 ** data copied from pFrom is known to be valid. */
5574 pTo->isInit = 0;
5575 TESTONLY(rc = ) btreeInitPage(pTo);
5576 assert( rc==SQLITE_OK );
5577
5578 /* If this is an auto-vacuum database, update the pointer-map entries
5579 ** for any b-tree or overflow pages that pTo now contains the pointers to.
5580 */
5581 if( ISAUTOVACUUM ){
5582 *pRC = setChildPtrmaps(pTo);
5583 }
5584 }
5585 }
5586
5587 /*
5588 ** This routine redistributes cells on the iParentIdx'th child of pParent
5589 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
5590 ** same amount of free space. Usually a single sibling on either side of the
5591 ** page are used in the balancing, though both siblings might come from one
5592 ** side if the page is the first or last child of its parent. If the page
5593 ** has fewer than 2 siblings (something which can only happen if the page
5594 ** is a root page or a child of a root page) then all available siblings
5595 ** participate in the balancing.
5596 **
5597 ** The number of siblings of the page might be increased or decreased by
5598 ** one or two in an effort to keep pages nearly full but not over full.
5599 **
5600 ** Note that when this routine is called, some of the cells on the page
5601 ** might not actually be stored in MemPage.aData[]. This can happen
5602 ** if the page is overfull. This routine ensures that all cells allocated
5603 ** to the page and its siblings fit into MemPage.aData[] before returning.
5604 **
5605 ** In the course of balancing the page and its siblings, cells may be
5606 ** inserted into or removed from the parent page (pParent). Doing so
5607 ** may cause the parent page to become overfull or underfull. If this
5608 ** happens, it is the responsibility of the caller to invoke the correct
5609 ** balancing routine to fix this problem (see the balance() routine).
5610 **
5611 ** If this routine fails for any reason, it might leave the database
5612 ** in a corrupted state. So if this routine fails, the database should
5613 ** be rolled back.
5614 **
5615 ** The third argument to this function, aOvflSpace, is a pointer to a
5616 ** buffer big enough to hold one page. If while inserting cells into the parent
5617 ** page (pParent) the parent page becomes overfull, this buffer is
5618 ** used to store the parent's overflow cells. Because this function inserts
5619 ** a maximum of four divider cells into the parent page, and the maximum
5620 ** size of a cell stored within an internal node is always less than 1/4
5621 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
5622 ** enough for all overflow cells.
5623 **
5624 ** If aOvflSpace is set to a null pointer, this function returns
5625 ** SQLITE_NOMEM.
5626 */
5627 static int balance_nonroot(
5628 MemPage *pParent, /* Parent page of siblings being balanced */
5629 int iParentIdx, /* Index of "the page" in pParent */
5630 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */
5631 int isRoot /* True if pParent is a root-page */
5632 ){
5633 BtShared *pBt; /* The whole database */
5634 int nCell = 0; /* Number of cells in apCell[] */
5635 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
5636 int nNew = 0; /* Number of pages in apNew[] */
5637 int nOld; /* Number of pages in apOld[] */
5638 int i, j, k; /* Loop counters */
5639 int nxDiv; /* Next divider slot in pParent->aCell[] */
5640 int rc = SQLITE_OK; /* The return code */
5641 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */
5642 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
5643 int usableSpace; /* Bytes in pPage beyond the header */
5644 int pageFlags; /* Value of pPage->aData[0] */
5645 int subtotal; /* Subtotal of bytes in cells on one page */
5646 int iSpace1 = 0; /* First unused byte of aSpace1[] */
5647 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */
5648 int szScratch; /* Size of scratch memory requested */
5649 MemPage *apOld[NB]; /* pPage and up to two siblings */
5650 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
5651 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
5652 u8 *pRight; /* Location in parent of right-sibling pointer */
5653 u8 *apDiv[NB-1]; /* Divider cells in pParent */
5654 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
5655 int szNew[NB+2]; /* Combined size of cells place on i-th page */
5656 u8 **apCell = 0; /* All cells begin balanced */
5657 u16 *szCell; /* Local size of all cells in apCell[] */
5658 u8 *aSpace1; /* Space for copies of dividers cells */
5659 Pgno pgno; /* Temp var to store a page number in */
5660
5661 pBt = pParent->pBt;
5662 assert( sqlite3_mutex_held(pBt->mutex) );
5663 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
5664
5665 #if 0
5666 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
5667 #endif
5668
5669 /* At this point pParent may have at most one overflow cell. And if
5670 ** this overflow cell is present, it must be the cell with
5671 ** index iParentIdx. This scenario comes about when this function
5672 ** is called (indirectly) from sqlite3BtreeDelete().
5673 */
5674 assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
5675 assert( pParent->nOverflow==0 || pParent->aOvfl[0].idx==iParentIdx );
5676
5677 if( !aOvflSpace ){
5678 return SQLITE_NOMEM;
5679 }
5680
5681 /* Find the sibling pages to balance. Also locate the cells in pParent
5682 ** that divide the siblings. An attempt is made to find NN siblings on
5683 ** either side of pPage. More siblings are taken from one side, however,
5684 ** if there are fewer than NN siblings on the other side. If pParent
5685 ** has NB or fewer children then all children of pParent are taken.
5686 **
5687 ** This loop also drops the divider cells from the parent page. This
5688 ** way, the remainder of the function does not have to deal with any
5689 ** overflow cells in the parent page, since if any existed they will
5690 ** have already been removed.
5691 */
5692 i = pParent->nOverflow + pParent->nCell;
5693 if( i<2 ){
5694 nxDiv = 0;
5695 nOld = i+1;
5696 }else{
5697 nOld = 3;
5698 if( iParentIdx==0 ){
5699 nxDiv = 0;
5700 }else if( iParentIdx==i ){
5701 nxDiv = i-2;
5702 }else{
5703 nxDiv = iParentIdx-1;
5704 }
5705 i = 2;
5706 }
5707 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
5708 pRight = &pParent->aData[pParent->hdrOffset+8];
5709 }else{
5710 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
5711 }
5712 pgno = get4byte(pRight);
5713 while( 1 ){
5714 rc = getAndInitPage(pBt, pgno, &apOld[i]);
5715 if( rc ){
5716 memset(apOld, 0, (i+1)*sizeof(MemPage*));
5717 goto balance_cleanup;
5718 }
5719 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
5720 if( (i--)==0 ) break;
5721
5722 if( i+nxDiv==pParent->aOvfl[0].idx && pParent->nOverflow ){
5723 apDiv[i] = pParent->aOvfl[0].pCell;
5724 pgno = get4byte(apDiv[i]);
5725 szNew[i] = cellSizePtr(pParent, apDiv[i]);
5726 pParent->nOverflow = 0;
5727 }else{
5728 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
5729 pgno = get4byte(apDiv[i]);
5730 szNew[i] = cellSizePtr(pParent, apDiv[i]);
5731
5732 /* Drop the cell from the parent page. apDiv[i] still points to
5733 ** the cell within the parent, even though it has been dropped.
5734 ** This is safe because dropping a cell only overwrites the first
5735 ** four bytes of it, and this function does not need the first
5736 ** four bytes of the divider cell. So the pointer is safe to use
5737 ** later on.
5738 **
5739 ** Unless SQLite is compiled in secure-delete mode. In this case,
5740 ** the dropCell() routine will overwrite the entire cell with zeroes.
5741 ** In this case, temporarily copy the cell into the aOvflSpace[]
5742 ** buffer. It will be copied out again as soon as the aSpace[] buffer
5743 ** is allocated. */
5744 #ifdef SQLITE_SECURE_DELETE
5745 memcpy(&aOvflSpace[apDiv[i]-pParent->aData], apDiv[i], szNew[i]);
5746 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
5747 #endif
5748 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
5749 }
5750 }
5751
5752 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
5753 ** alignment */
5754 nMaxCells = (nMaxCells + 3)&~3;
5755
5756 /*
5757 ** Allocate space for memory structures
5758 */
5759 k = pBt->pageSize + ROUND8(sizeof(MemPage));
5760 szScratch =
5761 nMaxCells*sizeof(u8*) /* apCell */
5762 + nMaxCells*sizeof(u16) /* szCell */
5763 + pBt->pageSize /* aSpace1 */
5764 + k*nOld; /* Page copies (apCopy) */
5765 apCell = sqlite3ScratchMalloc( szScratch );
5766 if( apCell==0 ){
5767 rc = SQLITE_NOMEM;
5768 goto balance_cleanup;
5769 }
5770 szCell = (u16*)&apCell[nMaxCells];
5771 aSpace1 = (u8*)&szCell[nMaxCells];
5772 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
5773
5774 /*
5775 ** Load pointers to all cells on sibling pages and the divider cells
5776 ** into the local apCell[] array. Make copies of the divider cells
5777 ** into space obtained from aSpace1[] and remove the the divider Cells
5778 ** from pParent.
5779 **
5780 ** If the siblings are on leaf pages, then the child pointers of the
5781 ** divider cells are stripped from the cells before they are copied
5782 ** into aSpace1[]. In this way, all cells in apCell[] are without
5783 ** child pointers. If siblings are not leaves, then all cell in
5784 ** apCell[] include child pointers. Either way, all cells in apCell[]
5785 ** are alike.
5786 **
5787 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
5788 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
5789 */
5790 leafCorrection = apOld[0]->leaf*4;
5791 leafData = apOld[0]->hasData;
5792 for(i=0; i<nOld; i++){
5793 int limit;
5794
5795 /* Before doing anything else, take a copy of the i'th original sibling
5796 ** The rest of this function will use data from the copies rather
5797 ** that the original pages since the original pages will be in the
5798 ** process of being overwritten. */
5799 MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
5800 memcpy(pOld, apOld[i], sizeof(MemPage));
5801 pOld->aData = (void*)&pOld[1];
5802 memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);
5803
5804 limit = pOld->nCell+pOld->nOverflow;
5805 for(j=0; j<limit; j++){
5806 assert( nCell<nMaxCells );
5807 apCell[nCell] = findOverflowCell(pOld, j);
5808 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
5809 nCell++;
5810 }
5811 if( i<nOld-1 && !leafData){
5812 u16 sz = (u16)szNew[i];
5813 u8 *pTemp;
5814 assert( nCell<nMaxCells );
5815 szCell[nCell] = sz;
5816 pTemp = &aSpace1[iSpace1];
5817 iSpace1 += sz;
5818 assert( sz<=pBt->pageSize/4 );
5819 assert( iSpace1<=pBt->pageSize );
5820 memcpy(pTemp, apDiv[i], sz);
5821 apCell[nCell] = pTemp+leafCorrection;
5822 assert( leafCorrection==0 || leafCorrection==4 );
5823 szCell[nCell] = szCell[nCell] - leafCorrection;
5824 if( !pOld->leaf ){
5825 assert( leafCorrection==0 );
5826 assert( pOld->hdrOffset==0 );
5827 /* The right pointer of the child page pOld becomes the left
5828 ** pointer of the divider cell */
5829 memcpy(apCell[nCell], &pOld->aData[8], 4);
5830 }else{
5831 assert( leafCorrection==4 );
5832 if( szCell[nCell]<4 ){
5833 /* Do not allow any cells smaller than 4 bytes. */
5834 szCell[nCell] = 4;
5835 }
5836 }
5837 nCell++;
5838 }
5839 }
5840
5841 /*
5842 ** Figure out the number of pages needed to hold all nCell cells.
5843 ** Store this number in "k". Also compute szNew[] which is the total
5844 ** size of all cells on the i-th page and cntNew[] which is the index
5845 ** in apCell[] of the cell that divides page i from page i+1.
5846 ** cntNew[k] should equal nCell.
5847 **
5848 ** Values computed by this block:
5849 **
5850 ** k: The total number of sibling pages
5851 ** szNew[i]: Spaced used on the i-th sibling page.
5852 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
5853 ** the right of the i-th sibling page.
5854 ** usableSpace: Number of bytes of space available on each sibling.
5855 **
5856 */
5857 usableSpace = pBt->usableSize - 12 + leafCorrection;
5858 for(subtotal=k=i=0; i<nCell; i++){
5859 assert( i<nMaxCells );
5860 subtotal += szCell[i] + 2;
5861 if( subtotal > usableSpace ){
5862 szNew[k] = subtotal - szCell[i];
5863 cntNew[k] = i;
5864 if( leafData ){ i--; }
5865 subtotal = 0;
5866 k++;
5867 if( k>NB+1 ){ rc = SQLITE_CORRUPT; goto balance_cleanup; }
5868 }
5869 }
5870 szNew[k] = subtotal;
5871 cntNew[k] = nCell;
5872 k++;
5873
5874 /*
5875 ** The packing computed by the previous block is biased toward the siblings
5876 ** on the left side. The left siblings are always nearly full, while the
5877 ** right-most sibling might be nearly empty. This block of code attempts
5878 ** to adjust the packing of siblings to get a better balance.
5879 **
5880 ** This adjustment is more than an optimization. The packing above might
5881 ** be so out of balance as to be illegal. For example, the right-most
5882 ** sibling might be completely empty. This adjustment is not optional.
5883 */
5884 for(i=k-1; i>0; i--){
5885 int szRight = szNew[i]; /* Size of sibling on the right */
5886 int szLeft = szNew[i-1]; /* Size of sibling on the left */
5887 int r; /* Index of right-most cell in left sibling */
5888 int d; /* Index of first cell to the left of right sibling */
5889
5890 r = cntNew[i-1] - 1;
5891 d = r + 1 - leafData;
5892 assert( d<nMaxCells );
5893 assert( r<nMaxCells );
5894 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
5895 szRight += szCell[d] + 2;
5896 szLeft -= szCell[r] + 2;
5897 cntNew[i-1]--;
5898 r = cntNew[i-1] - 1;
5899 d = r + 1 - leafData;
5900 }
5901 szNew[i] = szRight;
5902 szNew[i-1] = szLeft;
5903 }
5904
5905 /* Either we found one or more cells (cntnew[0])>0) or pPage is
5906 ** a virtual root page. A virtual root page is when the real root
5907 ** page is page 1 and we are the only child of that page.
5908 */
5909 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
5910
5911 TRACE(("BALANCE: old: %d %d %d ",
5912 apOld[0]->pgno,
5913 nOld>=2 ? apOld[1]->pgno : 0,
5914 nOld>=3 ? apOld[2]->pgno : 0
5915 ));
5916
5917 /*
5918 ** Allocate k new pages. Reuse old pages where possible.
5919 */
5920 if( apOld[0]->pgno<=1 ){
5921 rc = SQLITE_CORRUPT;
5922 goto balance_cleanup;
5923 }
5924 pageFlags = apOld[0]->aData[0];
5925 for(i=0; i<k; i++){
5926 MemPage *pNew;
5927 if( i<nOld ){
5928 pNew = apNew[i] = apOld[i];
5929 apOld[i] = 0;
5930 rc = sqlite3PagerWrite(pNew->pDbPage);
5931 nNew++;
5932 if( rc ) goto balance_cleanup;
5933 }else{
5934 assert( i>0 );
5935 rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0);
5936 if( rc ) goto balance_cleanup;
5937 apNew[i] = pNew;
5938 nNew++;
5939
5940 /* Set the pointer-map entry for the new sibling page. */
5941 if( ISAUTOVACUUM ){
5942 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
5943 if( rc!=SQLITE_OK ){
5944 goto balance_cleanup;
5945 }
5946 }
5947 }
5948 }
5949
5950 /* Free any old pages that were not reused as new pages.
5951 */
5952 while( i<nOld ){
5953 freePage(apOld[i], &rc);
5954 if( rc ) goto balance_cleanup;
5955 releasePage(apOld[i]);
5956 apOld[i] = 0;
5957 i++;
5958 }
5959
5960 /*
5961 ** Put the new pages in accending order. This helps to
5962 ** keep entries in the disk file in order so that a scan
5963 ** of the table is a linear scan through the file. That
5964 ** in turn helps the operating system to deliver pages
5965 ** from the disk more rapidly.
5966 **
5967 ** An O(n^2) insertion sort algorithm is used, but since
5968 ** n is never more than NB (a small constant), that should
5969 ** not be a problem.
5970 **
5971 ** When NB==3, this one optimization makes the database
5972 ** about 25% faster for large insertions and deletions.
5973 */
5974 for(i=0; i<k-1; i++){
5975 int minV = apNew[i]->pgno;
5976 int minI = i;
5977 for(j=i+1; j<k; j++){
5978 if( apNew[j]->pgno<(unsigned)minV ){
5979 minI = j;
5980 minV = apNew[j]->pgno;
5981 }
5982 }
5983 if( minI>i ){
5984 int t;
5985 MemPage *pT;
5986 t = apNew[i]->pgno;
5987 pT = apNew[i];
5988 apNew[i] = apNew[minI];
5989 apNew[minI] = pT;
5990 }
5991 }
5992 TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
5993 apNew[0]->pgno, szNew[0],
5994 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
5995 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
5996 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
5997 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));
5998
5999 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6000 put4byte(pRight, apNew[nNew-1]->pgno);
6001
6002 /*
6003 ** Evenly distribute the data in apCell[] across the new pages.
6004 ** Insert divider cells into pParent as necessary.
6005 */
6006 j = 0;
6007 for(i=0; i<nNew; i++){
6008 /* Assemble the new sibling page. */
6009 MemPage *pNew = apNew[i];
6010 assert( j<nMaxCells );
6011 zeroPage(pNew, pageFlags);
6012 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
6013 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
6014 assert( pNew->nOverflow==0 );
6015
6016 j = cntNew[i];
6017
6018 /* If the sibling page assembled above was not the right-most sibling,
6019 ** insert a divider cell into the parent page.
6020 */
6021 assert( i<nNew-1 || j==nCell );
6022 if( j<nCell ){
6023 u8 *pCell;
6024 u8 *pTemp;
6025 int sz;
6026
6027 assert( j<nMaxCells );
6028 pCell = apCell[j];
6029 sz = szCell[j] + leafCorrection;
6030 pTemp = &aOvflSpace[iOvflSpace];
6031 if( !pNew->leaf ){
6032 memcpy(&pNew->aData[8], pCell, 4);
6033 }else if( leafData ){
6034 /* If the tree is a leaf-data tree, and the siblings are leaves,
6035 ** then there is no divider cell in apCell[]. Instead, the divider
6036 ** cell consists of the integer key for the right-most cell of
6037 ** the sibling-page assembled above only.
6038 */
6039 CellInfo info;
6040 j--;
6041 btreeParseCellPtr(pNew, apCell[j], &info);
6042 pCell = pTemp;
6043 sz = 4 + putVarint(&pCell[4], info.nKey);
6044 pTemp = 0;
6045 }else{
6046 pCell -= 4;
6047 /* Obscure case for non-leaf-data trees: If the cell at pCell was
6048 ** previously stored on a leaf node, and its reported size was 4
6049 ** bytes, then it may actually be smaller than this
6050 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
6051 ** any cell). But it is important to pass the correct size to
6052 ** insertCell(), so reparse the cell now.
6053 **
6054 ** Note that this can never happen in an SQLite data file, as all
6055 ** cells are at least 4 bytes. It only happens in b-trees used
6056 ** to evaluate "IN (SELECT ...)" and similar clauses.
6057 */
6058 if( szCell[j]==4 ){
6059 assert(leafCorrection==4);
6060 sz = cellSizePtr(pParent, pCell);
6061 }
6062 }
6063 iOvflSpace += sz;
6064 assert( sz<=pBt->pageSize/4 );
6065 assert( iOvflSpace<=pBt->pageSize );
6066 insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
6067 if( rc!=SQLITE_OK ) goto balance_cleanup;
6068 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
6069
6070 j++;
6071 nxDiv++;
6072 }
6073 }
6074 assert( j==nCell );
6075 assert( nOld>0 );
6076 assert( nNew>0 );
6077 if( (pageFlags & PTF_LEAF)==0 ){
6078 u8 *zChild = &apCopy[nOld-1]->aData[8];
6079 memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
6080 }
6081
6082 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
6083 /* The root page of the b-tree now contains no cells. The only sibling
6084 ** page is the right-child of the parent. Copy the contents of the
6085 ** child page into the parent, decreasing the overall height of the
6086 ** b-tree structure by one. This is described as the "balance-shallower"
6087 ** sub-algorithm in some documentation.
6088 **
6089 ** If this is an auto-vacuum database, the call to copyNodeContent()
6090 ** sets all pointer-map entries corresponding to database image pages
6091 ** for which the pointer is stored within the content being copied.
6092 **
6093 ** The second assert below verifies that the child page is defragmented
6094 ** (it must be, as it was just reconstructed using assemblePage()). This
6095 ** is important if the parent page happens to be page 1 of the database
6096 ** image. */
6097 assert( nNew==1 );
6098 assert( apNew[0]->nFree ==
6099 (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
6100 );
6101 copyNodeContent(apNew[0], pParent, &rc);
6102 freePage(apNew[0], &rc);
6103 }else if( ISAUTOVACUUM ){
6104 /* Fix the pointer-map entries for all the cells that were shifted around.
6105 ** There are several different types of pointer-map entries that need to
6106 ** be dealt with by this routine. Some of these have been set already, but
6107 ** many have not. The following is a summary:
6108 **
6109 ** 1) The entries associated with new sibling pages that were not
6110 ** siblings when this function was called. These have already
6111 ** been set. We don't need to worry about old siblings that were
6112 ** moved to the free-list - the freePage() code has taken care
6113 ** of those.
6114 **
6115 ** 2) The pointer-map entries associated with the first overflow
6116 ** page in any overflow chains used by new divider cells. These
6117 ** have also already been taken care of by the insertCell() code.
6118 **
6119 ** 3) If the sibling pages are not leaves, then the child pages of
6120 ** cells stored on the sibling pages may need to be updated.
6121 **
6122 ** 4) If the sibling pages are not internal intkey nodes, then any
6123 ** overflow pages used by these cells may need to be updated
6124 ** (internal intkey nodes never contain pointers to overflow pages).
6125 **
6126 ** 5) If the sibling pages are not leaves, then the pointer-map
6127 ** entries for the right-child pages of each sibling may need
6128 ** to be updated.
6129 **
6130 ** Cases 1 and 2 are dealt with above by other code. The next
6131 ** block deals with cases 3 and 4 and the one after that, case 5. Since
6132 ** setting a pointer map entry is a relatively expensive operation, this
6133 ** code only sets pointer map entries for child or overflow pages that have
6134 ** actually moved between pages. */
6135 MemPage *pNew = apNew[0];
6136 MemPage *pOld = apCopy[0];
6137 int nOverflow = pOld->nOverflow;
6138 int iNextOld = pOld->nCell + nOverflow;
6139 int iOverflow = (nOverflow ? pOld->aOvfl[0].idx : -1);
6140 j = 0; /* Current 'old' sibling page */
6141 k = 0; /* Current 'new' sibling page */
6142 for(i=0; i<nCell; i++){
6143 int isDivider = 0;
6144 while( i==iNextOld ){
6145 /* Cell i is the cell immediately following the last cell on old
6146 ** sibling page j. If the siblings are not leaf pages of an
6147 ** intkey b-tree, then cell i was a divider cell. */
6148 pOld = apCopy[++j];
6149 iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
6150 if( pOld->nOverflow ){
6151 nOverflow = pOld->nOverflow;
6152 iOverflow = i + !leafData + pOld->aOvfl[0].idx;
6153 }
6154 isDivider = !leafData;
6155 }
6156
6157 assert(nOverflow>0 || iOverflow<i );
6158 assert(nOverflow<2 || pOld->aOvfl[0].idx==pOld->aOvfl[1].idx-1);
6159 assert(nOverflow<3 || pOld->aOvfl[1].idx==pOld->aOvfl[2].idx-1);
6160 if( i==iOverflow ){
6161 isDivider = 1;
6162 if( (--nOverflow)>0 ){
6163 iOverflow++;
6164 }
6165 }
6166
6167 if( i==cntNew[k] ){
6168 /* Cell i is the cell immediately following the last cell on new
6169 ** sibling page k. If the siblings are not leaf pages of an
6170 ** intkey b-tree, then cell i is a divider cell. */
6171 pNew = apNew[++k];
6172 if( !leafData ) continue;
6173 }
6174 assert( j<nOld );
6175 assert( k<nNew );
6176
6177 /* If the cell was originally divider cell (and is not now) or
6178 ** an overflow cell, or if the cell was located on a different sibling
6179 ** page before the balancing, then the pointer map entries associated
6180 ** with any child or overflow pages need to be updated. */
6181 if( isDivider || pOld->pgno!=pNew->pgno ){
6182 if( !leafCorrection ){
6183 ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
6184 }
6185 if( szCell[i]>pNew->minLocal ){
6186 ptrmapPutOvflPtr(pNew, apCell[i], &rc);
6187 }
6188 }
6189 }
6190
6191 if( !leafCorrection ){
6192 for(i=0; i<nNew; i++){
6193 u32 key = get4byte(&apNew[i]->aData[8]);
6194 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
6195 }
6196 }
6197
6198 #if 0
6199 /* The ptrmapCheckPages() contains assert() statements that verify that
6200 ** all pointer map pages are set correctly. This is helpful while
6201 ** debugging. This is usually disabled because a corrupt database may
6202 ** cause an assert() statement to fail. */
6203 ptrmapCheckPages(apNew, nNew);
6204 ptrmapCheckPages(&pParent, 1);
6205 #endif
6206 }
6207
6208 assert( pParent->isInit );
6209 TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
6210 nOld, nNew, nCell));
6211
6212 /*
6213 ** Cleanup before returning.
6214 */
6215 balance_cleanup:
6216 sqlite3ScratchFree(apCell);
6217 for(i=0; i<nOld; i++){
6218 releasePage(apOld[i]);
6219 }
6220 for(i=0; i<nNew; i++){
6221 releasePage(apNew[i]);
6222 }
6223
6224 return rc;
6225 }
6226
6227
6228 /*
6229 ** This function is called when the root page of a b-tree structure is
6230 ** overfull (has one or more overflow pages).
6231 **
6232 ** A new child page is allocated and the contents of the current root
6233 ** page, including overflow cells, are copied into the child. The root
6234 ** page is then overwritten to make it an empty page with the right-child
6235 ** pointer pointing to the new page.
6236 **
6237 ** Before returning, all pointer-map entries corresponding to pages
6238 ** that the new child-page now contains pointers to are updated. The
6239 ** entry corresponding to the new right-child pointer of the root
6240 ** page is also updated.
6241 **
6242 ** If successful, *ppChild is set to contain a reference to the child
6243 ** page and SQLITE_OK is returned. In this case the caller is required
6244 ** to call releasePage() on *ppChild exactly once. If an error occurs,
6245 ** an error code is returned and *ppChild is set to 0.
6246 */
6247 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
6248 int rc; /* Return value from subprocedures */
6249 MemPage *pChild = 0; /* Pointer to a new child page */
6250 Pgno pgnoChild = 0; /* Page number of the new child page */
6251 BtShared *pBt = pRoot->pBt; /* The BTree */
6252
6253 assert( pRoot->nOverflow>0 );
6254 assert( sqlite3_mutex_held(pBt->mutex) );
6255
6256 /* Make pRoot, the root page of the b-tree, writable. Allocate a new
6257 ** page that will become the new right-child of pPage. Copy the contents
6258 ** of the node stored on pRoot into the new child page.
6259 */
6260 rc = sqlite3PagerWrite(pRoot->pDbPage);
6261 if( rc==SQLITE_OK ){
6262 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
6263 copyNodeContent(pRoot, pChild, &rc);
6264 if( ISAUTOVACUUM ){
6265 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
6266 }
6267 }
6268 if( rc ){
6269 *ppChild = 0;
6270 releasePage(pChild);
6271 return rc;
6272 }
6273 assert( sqlite3PagerIswriteable(pChild->pDbPage) );
6274 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
6275 assert( pChild->nCell==pRoot->nCell );
6276
6277 TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
6278
6279 /* Copy the overflow cells from pRoot to pChild */
6280 memcpy(pChild->aOvfl, pRoot->aOvfl, pRoot->nOverflow*sizeof(pRoot->aOvfl[0]));
6281 pChild->nOverflow = pRoot->nOverflow;
6282
6283 /* Zero the contents of pRoot. Then install pChild as the right-child. */
6284 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
6285 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
6286
6287 *ppChild = pChild;
6288 return SQLITE_OK;
6289 }
6290
6291 /*
6292 ** The page that pCur currently points to has just been modified in
6293 ** some way. This function figures out if this modification means the
6294 ** tree needs to be balanced, and if so calls the appropriate balancing
6295 ** routine. Balancing routines are:
6296 **
6297 ** balance_quick()
6298 ** balance_deeper()
6299 ** balance_nonroot()
6300 */
6301 static int balance(BtCursor *pCur){
6302 int rc = SQLITE_OK;
6303 const int nMin = pCur->pBt->usableSize * 2 / 3;
6304 u8 aBalanceQuickSpace[13];
6305 u8 *pFree = 0;
6306
6307 TESTONLY( int balance_quick_called = 0 );
6308 TESTONLY( int balance_deeper_called = 0 );
6309
6310 do {
6311 int iPage = pCur->iPage;
6312 MemPage *pPage = pCur->apPage[iPage];
6313
6314 if( iPage==0 ){
6315 if( pPage->nOverflow ){
6316 /* The root page of the b-tree is overfull. In this case call the
6317 ** balance_deeper() function to create a new child for the root-page
6318 ** and copy the current contents of the root-page to it. The
6319 ** next iteration of the do-loop will balance the child page.
6320 */
6321 assert( (balance_deeper_called++)==0 );
6322 rc = balance_deeper(pPage, &pCur->apPage[1]);
6323 if( rc==SQLITE_OK ){
6324 pCur->iPage = 1;
6325 pCur->aiIdx[0] = 0;
6326 pCur->aiIdx[1] = 0;
6327 assert( pCur->apPage[1]->nOverflow );
6328 }
6329 }else{
6330 break;
6331 }
6332 }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
6333 break;
6334 }else{
6335 MemPage * const pParent = pCur->apPage[iPage-1];
6336 int const iIdx = pCur->aiIdx[iPage-1];
6337
6338 rc = sqlite3PagerWrite(pParent->pDbPage);
6339 if( rc==SQLITE_OK ){
6340 #ifndef SQLITE_OMIT_QUICKBALANCE
6341 if( pPage->hasData
6342 && pPage->nOverflow==1
6343 && pPage->aOvfl[0].idx==pPage->nCell
6344 && pParent->pgno!=1
6345 && pParent->nCell==iIdx
6346 ){
6347 /* Call balance_quick() to create a new sibling of pPage on which
6348 ** to store the overflow cell. balance_quick() inserts a new cell
6349 ** into pParent, which may cause pParent overflow. If this
6350 ** happens, the next interation of the do-loop will balance pParent
6351 ** use either balance_nonroot() or balance_deeper(). Until this
6352 ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
6353 ** buffer.
6354 **
6355 ** The purpose of the following assert() is to check that only a
6356 ** single call to balance_quick() is made for each call to this
6357 ** function. If this were not verified, a subtle bug involving reuse
6358 ** of the aBalanceQuickSpace[] might sneak in.
6359 */
6360 assert( (balance_quick_called++)==0 );
6361 rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
6362 }else
6363 #endif
6364 {
6365 /* In this case, call balance_nonroot() to redistribute cells
6366 ** between pPage and up to 2 of its sibling pages. This involves
6367 ** modifying the contents of pParent, which may cause pParent to
6368 ** become overfull or underfull. The next iteration of the do-loop
6369 ** will balance the parent page to correct this.
6370 **
6371 ** If the parent page becomes overfull, the overflow cell or cells
6372 ** are stored in the pSpace buffer allocated immediately below.
6373 ** A subsequent iteration of the do-loop will deal with this by
6374 ** calling balance_nonroot() (balance_deeper() may be called first,
6375 ** but it doesn't deal with overflow cells - just moves them to a
6376 ** different page). Once this subsequent call to balance_nonroot()
6377 ** has completed, it is safe to release the pSpace buffer used by
6378 ** the previous call, as the overflow cell data will have been
6379 ** copied either into the body of a database page or into the new
6380 ** pSpace buffer passed to the latter call to balance_nonroot().
6381 */
6382 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
6383 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1);
6384 if( pFree ){
6385 /* If pFree is not NULL, it points to the pSpace buffer used
6386 ** by a previous call to balance_nonroot(). Its contents are
6387 ** now stored either on real database pages or within the
6388 ** new pSpace buffer, so it may be safely freed here. */
6389 sqlite3PageFree(pFree);
6390 }
6391
6392 /* The pSpace buffer will be freed after the next call to
6393 ** balance_nonroot(), or just before this function returns, whichever
6394 ** comes first. */
6395 pFree = pSpace;
6396 }
6397 }
6398
6399 pPage->nOverflow = 0;
6400
6401 /* The next iteration of the do-loop balances the parent page. */
6402 releasePage(pPage);
6403 pCur->iPage--;
6404 }
6405 }while( rc==SQLITE_OK );
6406
6407 if( pFree ){
6408 sqlite3PageFree(pFree);
6409 }
6410 return rc;
6411 }
6412
6413
6414 /*
6415 ** Insert a new record into the BTree. The key is given by (pKey,nKey)
6416 ** and the data is given by (pData,nData). The cursor is used only to
6417 ** define what table the record should be inserted into. The cursor
6418 ** is left pointing at a random location.
6419 **
6420 ** For an INTKEY table, only the nKey value of the key is used. pKey is
6421 ** ignored. For a ZERODATA table, the pData and nData are both ignored.
6422 **
6423 ** If the seekResult parameter is non-zero, then a successful call to
6424 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already
6425 ** been performed. seekResult is the search result returned (a negative
6426 ** number if pCur points at an entry that is smaller than (pKey, nKey), or
6427 ** a positive value if pCur points at an etry that is larger than
6428 ** (pKey, nKey)).
6429 **
6430 ** If the seekResult parameter is non-zero, then the caller guarantees that
6431 ** cursor pCur is pointing at the existing copy of a row that is to be
6432 ** overwritten. If the seekResult parameter is 0, then cursor pCur may
6433 ** point to any entry or to no entry at all and so this function has to seek
6434 ** the cursor before the new key can be inserted.
6435 */
6436 int sqlite3BtreeInsert(
6437 BtCursor *pCur, /* Insert data into the table of this cursor */
6438 const void *pKey, i64 nKey, /* The key of the new record */
6439 const void *pData, int nData, /* The data of the new record */
6440 int nZero, /* Number of extra 0 bytes to append to data */
6441 int appendBias, /* True if this is likely an append */
6442 int seekResult /* Result of prior MovetoUnpacked() call */
6443 ){
6444 int rc;
6445 int loc = seekResult; /* -1: before desired location +1: after */
6446 int szNew;
6447 int idx;
6448 MemPage *pPage;
6449 Btree *p = pCur->pBtree;
6450 BtShared *pBt = p->pBt;
6451 unsigned char *oldCell;
6452 unsigned char *newCell = 0;
6453
6454 if( pCur->eState==CURSOR_FAULT ){
6455 assert( pCur->skipNext!=SQLITE_OK );
6456 return pCur->skipNext;
6457 }
6458
6459 assert( cursorHoldsMutex(pCur) );
6460 assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE && !pBt->readOnly );
6461 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6462
6463 /* Assert that the caller has been consistent. If this cursor was opened
6464 ** expecting an index b-tree, then the caller should be inserting blob
6465 ** keys with no associated data. If the cursor was opened expecting an
6466 ** intkey table, the caller should be inserting integer keys with a
6467 ** blob of associated data. */
6468 assert( (pKey==0)==(pCur->pKeyInfo==0) );
6469
6470 /* If this is an insert into a table b-tree, invalidate any incrblob
6471 ** cursors open on the row being replaced (assuming this is a replace
6472 ** operation - if it is not, the following is a no-op). */
6473 if( pCur->pKeyInfo==0 ){
6474 invalidateIncrblobCursors(p, nKey, 0);
6475 }
6476
6477 /* Save the positions of any other cursors open on this table.
6478 **
6479 ** In some cases, the call to btreeMoveto() below is a no-op. For
6480 ** example, when inserting data into a table with auto-generated integer
6481 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
6482 ** integer key to use. It then calls this function to actually insert the
6483 ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
6484 ** that the cursor is already where it needs to be and returns without
6485 ** doing any work. To avoid thwarting these optimizations, it is important
6486 ** not to clear the cursor here.
6487 */
6488 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
6489 if( rc ) return rc;
6490 if( !loc ){
6491 rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);
6492 if( rc ) return rc;
6493 }
6494 assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
6495
6496 pPage = pCur->apPage[pCur->iPage];
6497 assert( pPage->intKey || nKey>=0 );
6498 assert( pPage->leaf || !pPage->intKey );
6499
6500 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
6501 pCur->pgnoRoot, nKey, nData, pPage->pgno,
6502 loc==0 ? "overwrite" : "new entry"));
6503 assert( pPage->isInit );
6504 allocateTempSpace(pBt);
6505 newCell = pBt->pTmpSpace;
6506 if( newCell==0 ) return SQLITE_NOMEM;
6507 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
6508 if( rc ) goto end_insert;
6509 assert( szNew==cellSizePtr(pPage, newCell) );
6510 assert( szNew<=MX_CELL_SIZE(pBt) );
6511 idx = pCur->aiIdx[pCur->iPage];
6512 if( loc==0 ){
6513 u16 szOld;
6514 assert( idx<pPage->nCell );
6515 rc = sqlite3PagerWrite(pPage->pDbPage);
6516 if( rc ){
6517 goto end_insert;
6518 }
6519 oldCell = findCell(pPage, idx);
6520 if( !pPage->leaf ){
6521 memcpy(newCell, oldCell, 4);
6522 }
6523 szOld = cellSizePtr(pPage, oldCell);
6524 rc = clearCell(pPage, oldCell);
6525 dropCell(pPage, idx, szOld, &rc);
6526 if( rc ) goto end_insert;
6527 }else if( loc<0 && pPage->nCell>0 ){
6528 assert( pPage->leaf );
6529 idx = ++pCur->aiIdx[pCur->iPage];
6530 }else{
6531 assert( pPage->leaf );
6532 }
6533 insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
6534 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
6535
6536 /* If no error has occured and pPage has an overflow cell, call balance()
6537 ** to redistribute the cells within the tree. Since balance() may move
6538 ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey
6539 ** variables.
6540 **
6541 ** Previous versions of SQLite called moveToRoot() to move the cursor
6542 ** back to the root page as balance() used to invalidate the contents
6543 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
6544 ** set the cursor state to "invalid". This makes common insert operations
6545 ** slightly faster.
6546 **
6547 ** There is a subtle but important optimization here too. When inserting
6548 ** multiple records into an intkey b-tree using a single cursor (as can
6549 ** happen while processing an "INSERT INTO ... SELECT" statement), it
6550 ** is advantageous to leave the cursor pointing to the last entry in
6551 ** the b-tree if possible. If the cursor is left pointing to the last
6552 ** entry in the table, and the next row inserted has an integer key
6553 ** larger than the largest existing key, it is possible to insert the
6554 ** row without seeking the cursor. This can be a big performance boost.
6555 */
6556 pCur->info.nSize = 0;
6557 pCur->validNKey = 0;
6558 if( rc==SQLITE_OK && pPage->nOverflow ){
6559 rc = balance(pCur);
6560
6561 /* Must make sure nOverflow is reset to zero even if the balance()
6562 ** fails. Internal data structure corruption will result otherwise.
6563 ** Also, set the cursor state to invalid. This stops saveCursorPosition()
6564 ** from trying to save the current position of the cursor. */
6565 pCur->apPage[pCur->iPage]->nOverflow = 0;
6566 pCur->eState = CURSOR_INVALID;
6567 }
6568 assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
6569
6570 end_insert:
6571 return rc;
6572 }
6573
6574 /*
6575 ** Delete the entry that the cursor is pointing to. The cursor
6576 ** is left pointing at a arbitrary location.
6577 */
6578 int sqlite3BtreeDelete(BtCursor *pCur){
6579 Btree *p = pCur->pBtree;
6580 BtShared *pBt = p->pBt;
6581 int rc; /* Return code */
6582 MemPage *pPage; /* Page to delete cell from */
6583 unsigned char *pCell; /* Pointer to cell to delete */
6584 int iCellIdx; /* Index of cell to delete */
6585 int iCellDepth; /* Depth of node containing pCell */
6586
6587 assert( cursorHoldsMutex(pCur) );
6588 assert( pBt->inTransaction==TRANS_WRITE );
6589 assert( !pBt->readOnly );
6590 assert( pCur->wrFlag );
6591 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
6592 assert( !hasReadConflicts(p, pCur->pgnoRoot) );
6593
6594 if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell)
6595 || NEVER(pCur->eState!=CURSOR_VALID)
6596 ){
6597 return SQLITE_ERROR; /* Something has gone awry. */
6598 }
6599
6600 /* If this is a delete operation to remove a row from a table b-tree,
6601 ** invalidate any incrblob cursors open on the row being deleted. */
6602 if( pCur->pKeyInfo==0 ){
6603 invalidateIncrblobCursors(p, pCur->info.nKey, 0);
6604 }
6605
6606 iCellDepth = pCur->iPage;
6607 iCellIdx = pCur->aiIdx[iCellDepth];
6608 pPage = pCur->apPage[iCellDepth];
6609 pCell = findCell(pPage, iCellIdx);
6610
6611 /* If the page containing the entry to delete is not a leaf page, move
6612 ** the cursor to the largest entry in the tree that is smaller than
6613 ** the entry being deleted. This cell will replace the cell being deleted
6614 ** from the internal node. The 'previous' entry is used for this instead
6615 ** of the 'next' entry, as the previous entry is always a part of the
6616 ** sub-tree headed by the child page of the cell being deleted. This makes
6617 ** balancing the tree following the delete operation easier. */
6618 if( !pPage->leaf ){
6619 int notUsed;
6620 rc = sqlite3BtreePrevious(pCur, &notUsed);
6621 if( rc ) return rc;
6622 }
6623
6624 /* Save the positions of any other cursors open on this table before
6625 ** making any modifications. Make the page containing the entry to be
6626 ** deleted writable. Then free any overflow pages associated with the
6627 ** entry and finally remove the cell itself from within the page.
6628 */
6629 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
6630 if( rc ) return rc;
6631 rc = sqlite3PagerWrite(pPage->pDbPage);
6632 if( rc ) return rc;
6633 rc = clearCell(pPage, pCell);
6634 dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc);
6635 if( rc ) return rc;
6636
6637 /* If the cell deleted was not located on a leaf page, then the cursor
6638 ** is currently pointing to the largest entry in the sub-tree headed
6639 ** by the child-page of the cell that was just deleted from an internal
6640 ** node. The cell from the leaf node needs to be moved to the internal
6641 ** node to replace the deleted cell. */
6642 if( !pPage->leaf ){
6643 MemPage *pLeaf = pCur->apPage[pCur->iPage];
6644 int nCell;
6645 Pgno n = pCur->apPage[iCellDepth+1]->pgno;
6646 unsigned char *pTmp;
6647
6648 pCell = findCell(pLeaf, pLeaf->nCell-1);
6649 nCell = cellSizePtr(pLeaf, pCell);
6650 assert( MX_CELL_SIZE(pBt)>=nCell );
6651
6652 allocateTempSpace(pBt);
6653 pTmp = pBt->pTmpSpace;
6654
6655 rc = sqlite3PagerWrite(pLeaf->pDbPage);
6656 insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
6657 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
6658 if( rc ) return rc;
6659 }
6660
6661 /* Balance the tree. If the entry deleted was located on a leaf page,
6662 ** then the cursor still points to that page. In this case the first
6663 ** call to balance() repairs the tree, and the if(...) condition is
6664 ** never true.
6665 **
6666 ** Otherwise, if the entry deleted was on an internal node page, then
6667 ** pCur is pointing to the leaf page from which a cell was removed to
6668 ** replace the cell deleted from the internal node. This is slightly
6669 ** tricky as the leaf node may be underfull, and the internal node may
6670 ** be either under or overfull. In this case run the balancing algorithm
6671 ** on the leaf node first. If the balance proceeds far enough up the
6672 ** tree that we can be sure that any problem in the internal node has
6673 ** been corrected, so be it. Otherwise, after balancing the leaf node,
6674 ** walk the cursor up the tree to the internal node and balance it as
6675 ** well. */
6676 rc = balance(pCur);
6677 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
6678 while( pCur->iPage>iCellDepth ){
6679 releasePage(pCur->apPage[pCur->iPage--]);
6680 }
6681 rc = balance(pCur);
6682 }
6683
6684 if( rc==SQLITE_OK ){
6685 moveToRoot(pCur);
6686 }
6687 return rc;
6688 }
6689
6690 /*
6691 ** Create a new BTree table. Write into *piTable the page
6692 ** number for the root page of the new table.
6693 **
6694 ** The type of type is determined by the flags parameter. Only the
6695 ** following values of flags are currently in use. Other values for
6696 ** flags might not work:
6697 **
6698 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
6699 ** BTREE_ZERODATA Used for SQL indices
6700 */
6701 static int btreeCreateTable(Btree *p, int *piTable, int flags){
6702 BtShared *pBt = p->pBt;
6703 MemPage *pRoot;
6704 Pgno pgnoRoot;
6705 int rc;
6706
6707 assert( sqlite3BtreeHoldsMutex(p) );
6708 assert( pBt->inTransaction==TRANS_WRITE );
6709 assert( !pBt->readOnly );
6710
6711 #ifdef SQLITE_OMIT_AUTOVACUUM
6712 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6713 if( rc ){
6714 return rc;
6715 }
6716 #else
6717 if( pBt->autoVacuum ){
6718 Pgno pgnoMove; /* Move a page here to make room for the root-page */
6719 MemPage *pPageMove; /* The page to move to. */
6720
6721 /* Creating a new table may probably require moving an existing database
6722 ** to make room for the new tables root page. In case this page turns
6723 ** out to be an overflow page, delete all overflow page-map caches
6724 ** held by open cursors.
6725 */
6726 invalidateAllOverflowCache(pBt);
6727
6728 /* Read the value of meta[3] from the database to determine where the
6729 ** root page of the new table should go. meta[3] is the largest root-page
6730 ** created so far, so the new root-page is (meta[3]+1).
6731 */
6732 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
6733 pgnoRoot++;
6734
6735 /* The new root-page may not be allocated on a pointer-map page, or the
6736 ** PENDING_BYTE page.
6737 */
6738 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
6739 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
6740 pgnoRoot++;
6741 }
6742 assert( pgnoRoot>=3 );
6743
6744 /* Allocate a page. The page that currently resides at pgnoRoot will
6745 ** be moved to the allocated page (unless the allocated page happens
6746 ** to reside at pgnoRoot).
6747 */
6748 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
6749 if( rc!=SQLITE_OK ){
6750 return rc;
6751 }
6752
6753 if( pgnoMove!=pgnoRoot ){
6754 /* pgnoRoot is the page that will be used for the root-page of
6755 ** the new table (assuming an error did not occur). But we were
6756 ** allocated pgnoMove. If required (i.e. if it was not allocated
6757 ** by extending the file), the current page at position pgnoMove
6758 ** is already journaled.
6759 */
6760 u8 eType = 0;
6761 Pgno iPtrPage = 0;
6762
6763 releasePage(pPageMove);
6764
6765 /* Move the page currently at pgnoRoot to pgnoMove. */
6766 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6767 if( rc!=SQLITE_OK ){
6768 return rc;
6769 }
6770 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
6771 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
6772 rc = SQLITE_CORRUPT_BKPT;
6773 }
6774 if( rc!=SQLITE_OK ){
6775 releasePage(pRoot);
6776 return rc;
6777 }
6778 assert( eType!=PTRMAP_ROOTPAGE );
6779 assert( eType!=PTRMAP_FREEPAGE );
6780 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
6781 releasePage(pRoot);
6782
6783 /* Obtain the page at pgnoRoot */
6784 if( rc!=SQLITE_OK ){
6785 return rc;
6786 }
6787 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6788 if( rc!=SQLITE_OK ){
6789 return rc;
6790 }
6791 rc = sqlite3PagerWrite(pRoot->pDbPage);
6792 if( rc!=SQLITE_OK ){
6793 releasePage(pRoot);
6794 return rc;
6795 }
6796 }else{
6797 pRoot = pPageMove;
6798 }
6799
6800 /* Update the pointer-map and meta-data with the new root-page number. */
6801 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
6802 if( rc ){
6803 releasePage(pRoot);
6804 return rc;
6805 }
6806 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
6807 if( rc ){
6808 releasePage(pRoot);
6809 return rc;
6810 }
6811
6812 }else{
6813 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6814 if( rc ) return rc;
6815 }
6816 #endif
6817 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
6818 zeroPage(pRoot, flags | PTF_LEAF);
6819 sqlite3PagerUnref(pRoot->pDbPage);
6820 *piTable = (int)pgnoRoot;
6821 return SQLITE_OK;
6822 }
6823 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
6824 int rc;
6825 sqlite3BtreeEnter(p);
6826 rc = btreeCreateTable(p, piTable, flags);
6827 sqlite3BtreeLeave(p);
6828 return rc;
6829 }
6830
6831 /*
6832 ** Erase the given database page and all its children. Return
6833 ** the page to the freelist.
6834 */
6835 static int clearDatabasePage(
6836 BtShared *pBt, /* The BTree that contains the table */
6837 Pgno pgno, /* Page number to clear */
6838 int freePageFlag, /* Deallocate page if true */
6839 int *pnChange
6840 ){
6841 MemPage *pPage;
6842 int rc;
6843 unsigned char *pCell;
6844 int i;
6845
6846 assert( sqlite3_mutex_held(pBt->mutex) );
6847 if( pgno>pagerPagecount(pBt) ){
6848 return SQLITE_CORRUPT_BKPT;
6849 }
6850
6851 rc = getAndInitPage(pBt, pgno, &pPage);
6852 if( rc ) return rc;
6853 for(i=0; i<pPage->nCell; i++){
6854 pCell = findCell(pPage, i);
6855 if( !pPage->leaf ){
6856 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
6857 if( rc ) goto cleardatabasepage_out;
6858 }
6859 rc = clearCell(pPage, pCell);
6860 if( rc ) goto cleardatabasepage_out;
6861 }
6862 if( !pPage->leaf ){
6863 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange);
6864 if( rc ) goto cleardatabasepage_out;
6865 }else if( pnChange ){
6866 assert( pPage->intKey );
6867 *pnChange += pPage->nCell;
6868 }
6869 if( freePageFlag ){
6870 freePage(pPage, &rc);
6871 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
6872 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
6873 }
6874
6875 cleardatabasepage_out:
6876 releasePage(pPage);
6877 return rc;
6878 }
6879
6880 /*
6881 ** Delete all information from a single table in the database. iTable is
6882 ** the page number of the root of the table. After this routine returns,
6883 ** the root page is empty, but still exists.
6884 **
6885 ** This routine will fail with SQLITE_LOCKED if there are any open
6886 ** read cursors on the table. Open write cursors are moved to the
6887 ** root of the table.
6888 **
6889 ** If pnChange is not NULL, then table iTable must be an intkey table. The
6890 ** integer value pointed to by pnChange is incremented by the number of
6891 ** entries in the table.
6892 */
6893 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
6894 int rc;
6895 BtShared *pBt = p->pBt;
6896 sqlite3BtreeEnter(p);
6897 assert( p->inTrans==TRANS_WRITE );
6898
6899 /* Invalidate all incrblob cursors open on table iTable (assuming iTable
6900 ** is the root of a table b-tree - if it is not, the following call is
6901 ** a no-op). */
6902 invalidateIncrblobCursors(p, 0, 1);
6903
6904 rc = saveAllCursors(pBt, (Pgno)iTable, 0);
6905 if( SQLITE_OK==rc ){
6906 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
6907 }
6908 sqlite3BtreeLeave(p);
6909 return rc;
6910 }
6911
6912 /*
6913 ** Erase all information in a table and add the root of the table to
6914 ** the freelist. Except, the root of the principle table (the one on
6915 ** page 1) is never added to the freelist.
6916 **
6917 ** This routine will fail with SQLITE_LOCKED if there are any open
6918 ** cursors on the table.
6919 **
6920 ** If AUTOVACUUM is enabled and the page at iTable is not the last
6921 ** root page in the database file, then the last root page
6922 ** in the database file is moved into the slot formerly occupied by
6923 ** iTable and that last slot formerly occupied by the last root page
6924 ** is added to the freelist instead of iTable. In this say, all
6925 ** root pages are kept at the beginning of the database file, which
6926 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the
6927 ** page number that used to be the last root page in the file before
6928 ** the move. If no page gets moved, *piMoved is set to 0.
6929 ** The last root page is recorded in meta[3] and the value of
6930 ** meta[3] is updated by this procedure.
6931 */
6932 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
6933 int rc;
6934 MemPage *pPage = 0;
6935 BtShared *pBt = p->pBt;
6936
6937 assert( sqlite3BtreeHoldsMutex(p) );
6938 assert( p->inTrans==TRANS_WRITE );
6939
6940 /* It is illegal to drop a table if any cursors are open on the
6941 ** database. This is because in auto-vacuum mode the backend may
6942 ** need to move another root-page to fill a gap left by the deleted
6943 ** root page. If an open cursor was using this page a problem would
6944 ** occur.
6945 **
6946 ** This error is caught long before control reaches this point.
6947 */
6948 if( NEVER(pBt->pCursor) ){
6949 sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);
6950 return SQLITE_LOCKED_SHAREDCACHE;
6951 }
6952
6953 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
6954 if( rc ) return rc;
6955 rc = sqlite3BtreeClearTable(p, iTable, 0);
6956 if( rc ){
6957 releasePage(pPage);
6958 return rc;
6959 }
6960
6961 *piMoved = 0;
6962
6963 if( iTable>1 ){
6964 #ifdef SQLITE_OMIT_AUTOVACUUM
6965 freePage(pPage, &rc);
6966 releasePage(pPage);
6967 #else
6968 if( pBt->autoVacuum ){
6969 Pgno maxRootPgno;
6970 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
6971
6972 if( iTable==maxRootPgno ){
6973 /* If the table being dropped is the table with the largest root-page
6974 ** number in the database, put the root page on the free list.
6975 */
6976 freePage(pPage, &rc);
6977 releasePage(pPage);
6978 if( rc!=SQLITE_OK ){
6979 return rc;
6980 }
6981 }else{
6982 /* The table being dropped does not have the largest root-page
6983 ** number in the database. So move the page that does into the
6984 ** gap left by the deleted root-page.
6985 */
6986 MemPage *pMove;
6987 releasePage(pPage);
6988 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
6989 if( rc!=SQLITE_OK ){
6990 return rc;
6991 }
6992 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
6993 releasePage(pMove);
6994 if( rc!=SQLITE_OK ){
6995 return rc;
6996 }
6997 pMove = 0;
6998 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
6999 freePage(pMove, &rc);
7000 releasePage(pMove);
7001 if( rc!=SQLITE_OK ){
7002 return rc;
7003 }
7004 *piMoved = maxRootPgno;
7005 }
7006
7007 /* Set the new 'max-root-page' value in the database header. This
7008 ** is the old value less one, less one more if that happens to
7009 ** be a root-page number, less one again if that is the
7010 ** PENDING_BYTE_PAGE.
7011 */
7012 maxRootPgno--;
7013 while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
7014 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
7015 maxRootPgno--;
7016 }
7017 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
7018
7019 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
7020 }else{
7021 freePage(pPage, &rc);
7022 releasePage(pPage);
7023 }
7024 #endif
7025 }else{
7026 /* If sqlite3BtreeDropTable was called on page 1.
7027 ** This really never should happen except in a corrupt
7028 ** database.
7029 */
7030 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
7031 releasePage(pPage);
7032 }
7033 return rc;
7034 }
7035 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
7036 int rc;
7037 sqlite3BtreeEnter(p);
7038 rc = btreeDropTable(p, iTable, piMoved);
7039 sqlite3BtreeLeave(p);
7040 return rc;
7041 }
7042
7043
7044 /*
7045 ** This function may only be called if the b-tree connection already
7046 ** has a read or write transaction open on the database.
7047 **
7048 ** Read the meta-information out of a database file. Meta[0]
7049 ** is the number of free pages currently in the database. Meta[1]
7050 ** through meta[15] are available for use by higher layers. Meta[0]
7051 ** is read-only, the others are read/write.
7052 **
7053 ** The schema layer numbers meta values differently. At the schema
7054 ** layer (and the SetCookie and ReadCookie opcodes) the number of
7055 ** free pages is not visible. So Cookie[0] is the same as Meta[1].
7056 */
7057 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
7058 BtShared *pBt = p->pBt;
7059
7060 sqlite3BtreeEnter(p);
7061 assert( p->inTrans>TRANS_NONE );
7062 assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
7063 assert( pBt->pPage1 );
7064 assert( idx>=0 && idx<=15 );
7065
7066 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
7067
7068 /* If auto-vacuum is disabled in this build and this is an auto-vacuum
7069 ** database, mark the database as read-only. */
7070 #ifdef SQLITE_OMIT_AUTOVACUUM
7071 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ) pBt->readOnly = 1;
7072 #endif
7073
7074 sqlite3BtreeLeave(p);
7075 }
7076
7077 /*
7078 ** Write meta-information back into the database. Meta[0] is
7079 ** read-only and may not be written.
7080 */
7081 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
7082 BtShared *pBt = p->pBt;
7083 unsigned char *pP1;
7084 int rc;
7085 assert( idx>=1 && idx<=15 );
7086 sqlite3BtreeEnter(p);
7087 assert( p->inTrans==TRANS_WRITE );
7088 assert( pBt->pPage1!=0 );
7089 pP1 = pBt->pPage1->aData;
7090 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
7091 if( rc==SQLITE_OK ){
7092 put4byte(&pP1[36 + idx*4], iMeta);
7093 #ifndef SQLITE_OMIT_AUTOVACUUM
7094 if( idx==BTREE_INCR_VACUUM ){
7095 assert( pBt->autoVacuum || iMeta==0 );
7096 assert( iMeta==0 || iMeta==1 );
7097 pBt->incrVacuum = (u8)iMeta;
7098 }
7099 #endif
7100 }
7101 sqlite3BtreeLeave(p);
7102 return rc;
7103 }
7104
7105 #ifndef SQLITE_OMIT_BTREECOUNT
7106 /*
7107 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
7108 ** number of entries in the b-tree and write the result to *pnEntry.
7109 **
7110 ** SQLITE_OK is returned if the operation is successfully executed.
7111 ** Otherwise, if an error is encountered (i.e. an IO error or database
7112 ** corruption) an SQLite error code is returned.
7113 */
7114 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
7115 i64 nEntry = 0; /* Value to return in *pnEntry */
7116 int rc; /* Return code */
7117 rc = moveToRoot(pCur);
7118
7119 /* Unless an error occurs, the following loop runs one iteration for each
7120 ** page in the B-Tree structure (not including overflow pages).
7121 */
7122 while( rc==SQLITE_OK ){
7123 int iIdx; /* Index of child node in parent */
7124 MemPage *pPage; /* Current page of the b-tree */
7125
7126 /* If this is a leaf page or the tree is not an int-key tree, then
7127 ** this page contains countable entries. Increment the entry counter
7128 ** accordingly.
7129 */
7130 pPage = pCur->apPage[pCur->iPage];
7131 if( pPage->leaf || !pPage->intKey ){
7132 nEntry += pPage->nCell;
7133 }
7134
7135 /* pPage is a leaf node. This loop navigates the cursor so that it
7136 ** points to the first interior cell that it points to the parent of
7137 ** the next page in the tree that has not yet been visited. The
7138 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
7139 ** of the page, or to the number of cells in the page if the next page
7140 ** to visit is the right-child of its parent.
7141 **
7142 ** If all pages in the tree have been visited, return SQLITE_OK to the
7143 ** caller.
7144 */
7145 if( pPage->leaf ){
7146 do {
7147 if( pCur->iPage==0 ){
7148 /* All pages of the b-tree have been visited. Return successfully. */
7149 *pnEntry = nEntry;
7150 return SQLITE_OK;
7151 }
7152 moveToParent(pCur);
7153 }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
7154
7155 pCur->aiIdx[pCur->iPage]++;
7156 pPage = pCur->apPage[pCur->iPage];
7157 }
7158
7159 /* Descend to the child node of the cell that the cursor currently
7160 ** points at. This is the right-child if (iIdx==pPage->nCell).
7161 */
7162 iIdx = pCur->aiIdx[pCur->iPage];
7163 if( iIdx==pPage->nCell ){
7164 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
7165 }else{
7166 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
7167 }
7168 }
7169
7170 /* An error has occurred. Return an error code. */
7171 return rc;
7172 }
7173 #endif
7174
7175 /*
7176 ** Return the pager associated with a BTree. This routine is used for
7177 ** testing and debugging only.
7178 */
7179 Pager *sqlite3BtreePager(Btree *p){
7180 return p->pBt->pPager;
7181 }
7182
7183 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7184 /*
7185 ** Append a message to the error message string.
7186 */
7187 static void checkAppendMsg(
7188 IntegrityCk *pCheck,
7189 char *zMsg1,
7190 const char *zFormat,
7191 ...
7192 ){
7193 va_list ap;
7194 if( !pCheck->mxErr ) return;
7195 pCheck->mxErr--;
7196 pCheck->nErr++;
7197 va_start(ap, zFormat);
7198 if( pCheck->errMsg.nChar ){
7199 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
7200 }
7201 if( zMsg1 ){
7202 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
7203 }
7204 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
7205 va_end(ap);
7206 if( pCheck->errMsg.mallocFailed ){
7207 pCheck->mallocFailed = 1;
7208 }
7209 }
7210 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7211
7212 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7213 /*
7214 ** Add 1 to the reference count for page iPage. If this is the second
7215 ** reference to the page, add an error message to pCheck->zErrMsg.
7216 ** Return 1 if there are 2 ore more references to the page and 0 if
7217 ** if this is the first reference to the page.
7218 **
7219 ** Also check that the page number is in bounds.
7220 */
7221 static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){
7222 if( iPage==0 ) return 1;
7223 if( iPage>pCheck->nPage ){
7224 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
7225 return 1;
7226 }
7227 if( pCheck->anRef[iPage]==1 ){
7228 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
7229 return 1;
7230 }
7231 return (pCheck->anRef[iPage]++)>1;
7232 }
7233
7234 #ifndef SQLITE_OMIT_AUTOVACUUM
7235 /*
7236 ** Check that the entry in the pointer-map for page iChild maps to
7237 ** page iParent, pointer type ptrType. If not, append an error message
7238 ** to pCheck.
7239 */
7240 static void checkPtrmap(
7241 IntegrityCk *pCheck, /* Integrity check context */
7242 Pgno iChild, /* Child page number */
7243 u8 eType, /* Expected pointer map type */
7244 Pgno iParent, /* Expected pointer map parent page number */
7245 char *zContext /* Context description (used for error msg) */
7246 ){
7247 int rc;
7248 u8 ePtrmapType;
7249 Pgno iPtrmapParent;
7250
7251 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
7252 if( rc!=SQLITE_OK ){
7253 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
7254 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
7255 return;
7256 }
7257
7258 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
7259 checkAppendMsg(pCheck, zContext,
7260 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
7261 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
7262 }
7263 }
7264 #endif
7265
7266 /*
7267 ** Check the integrity of the freelist or of an overflow page list.
7268 ** Verify that the number of pages on the list is N.
7269 */
7270 static void checkList(
7271 IntegrityCk *pCheck, /* Integrity checking context */
7272 int isFreeList, /* True for a freelist. False for overflow page list */
7273 int iPage, /* Page number for first page in the list */
7274 int N, /* Expected number of pages in the list */
7275 char *zContext /* Context for error messages */
7276 ){
7277 int i;
7278 int expected = N;
7279 int iFirst = iPage;
7280 while( N-- > 0 && pCheck->mxErr ){
7281 DbPage *pOvflPage;
7282 unsigned char *pOvflData;
7283 if( iPage<1 ){
7284 checkAppendMsg(pCheck, zContext,
7285 "%d of %d pages missing from overflow list starting at %d",
7286 N+1, expected, iFirst);
7287 break;
7288 }
7289 if( checkRef(pCheck, iPage, zContext) ) break;
7290 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
7291 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
7292 break;
7293 }
7294 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
7295 if( isFreeList ){
7296 int n = get4byte(&pOvflData[4]);
7297 #ifndef SQLITE_OMIT_AUTOVACUUM
7298 if( pCheck->pBt->autoVacuum ){
7299 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
7300 }
7301 #endif
7302 if( n>pCheck->pBt->usableSize/4-2 ){
7303 checkAppendMsg(pCheck, zContext,
7304 "freelist leaf count too big on page %d", iPage);
7305 N--;
7306 }else{
7307 for(i=0; i<n; i++){
7308 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
7309 #ifndef SQLITE_OMIT_AUTOVACUUM
7310 if( pCheck->pBt->autoVacuum ){
7311 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
7312 }
7313 #endif
7314 checkRef(pCheck, iFreePage, zContext);
7315 }
7316 N -= n;
7317 }
7318 }
7319 #ifndef SQLITE_OMIT_AUTOVACUUM
7320 else{
7321 /* If this database supports auto-vacuum and iPage is not the last
7322 ** page in this overflow list, check that the pointer-map entry for
7323 ** the following page matches iPage.
7324 */
7325 if( pCheck->pBt->autoVacuum && N>0 ){
7326 i = get4byte(pOvflData);
7327 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
7328 }
7329 }
7330 #endif
7331 iPage = get4byte(pOvflData);
7332 sqlite3PagerUnref(pOvflPage);
7333 }
7334 }
7335 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7336
7337 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7338 /*
7339 ** Do various sanity checks on a single page of a tree. Return
7340 ** the tree depth. Root pages return 0. Parents of root pages
7341 ** return 1, and so forth.
7342 **
7343 ** These checks are done:
7344 **
7345 ** 1. Make sure that cells and freeblocks do not overlap
7346 ** but combine to completely cover the page.
7347 ** NO 2. Make sure cell keys are in order.
7348 ** NO 3. Make sure no key is less than or equal to zLowerBound.
7349 ** NO 4. Make sure no key is greater than or equal to zUpperBound.
7350 ** 5. Check the integrity of overflow pages.
7351 ** 6. Recursively call checkTreePage on all children.
7352 ** 7. Verify that the depth of all children is the same.
7353 ** 8. Make sure this page is at least 33% full or else it is
7354 ** the root of the tree.
7355 */
7356 static int checkTreePage(
7357 IntegrityCk *pCheck, /* Context for the sanity check */
7358 int iPage, /* Page number of the page to check */
7359 char *zParentContext /* Parent context */
7360 ){
7361 MemPage *pPage;
7362 int i, rc, depth, d2, pgno, cnt;
7363 int hdr, cellStart;
7364 int nCell;
7365 u8 *data;
7366 BtShared *pBt;
7367 int usableSize;
7368 char zContext[100];
7369 char *hit = 0;
7370
7371 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
7372
7373 /* Check that the page exists
7374 */
7375 pBt = pCheck->pBt;
7376 usableSize = pBt->usableSize;
7377 if( iPage==0 ) return 0;
7378 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
7379 if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
7380 checkAppendMsg(pCheck, zContext,
7381 "unable to get the page. error code=%d", rc);
7382 return 0;
7383 }
7384
7385 /* Clear MemPage.isInit to make sure the corruption detection code in
7386 ** btreeInitPage() is executed. */
7387 pPage->isInit = 0;
7388 if( (rc = btreeInitPage(pPage))!=0 ){
7389 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */
7390 checkAppendMsg(pCheck, zContext,
7391 "btreeInitPage() returns error code %d", rc);
7392 releasePage(pPage);
7393 return 0;
7394 }
7395
7396 /* Check out all the cells.
7397 */
7398 depth = 0;
7399 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
7400 u8 *pCell;
7401 u32 sz;
7402 CellInfo info;
7403
7404 /* Check payload overflow pages
7405 */
7406 sqlite3_snprintf(sizeof(zContext), zContext,
7407 "On tree page %d cell %d: ", iPage, i);
7408 pCell = findCell(pPage,i);
7409 btreeParseCellPtr(pPage, pCell, &info);
7410 sz = info.nData;
7411 if( !pPage->intKey ) sz += (int)info.nKey;
7412 assert( sz==info.nPayload );
7413 if( (sz>info.nLocal)
7414 && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])
7415 ){
7416 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
7417 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
7418 #ifndef SQLITE_OMIT_AUTOVACUUM
7419 if( pBt->autoVacuum ){
7420 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
7421 }
7422 #endif
7423 checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
7424 }
7425
7426 /* Check sanity of left child page.
7427 */
7428 if( !pPage->leaf ){
7429 pgno = get4byte(pCell);
7430 #ifndef SQLITE_OMIT_AUTOVACUUM
7431 if( pBt->autoVacuum ){
7432 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
7433 }
7434 #endif
7435 d2 = checkTreePage(pCheck, pgno, zContext);
7436 if( i>0 && d2!=depth ){
7437 checkAppendMsg(pCheck, zContext, "Child page depth differs");
7438 }
7439 depth = d2;
7440 }
7441 }
7442 if( !pPage->leaf ){
7443 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
7444 sqlite3_snprintf(sizeof(zContext), zContext,
7445 "On page %d at right child: ", iPage);
7446 #ifndef SQLITE_OMIT_AUTOVACUUM
7447 if( pBt->autoVacuum ){
7448 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
7449 }
7450 #endif
7451 checkTreePage(pCheck, pgno, zContext);
7452 }
7453
7454 /* Check for complete coverage of the page
7455 */
7456 data = pPage->aData;
7457 hdr = pPage->hdrOffset;
7458 hit = sqlite3PageMalloc( pBt->pageSize );
7459 if( hit==0 ){
7460 pCheck->mallocFailed = 1;
7461 }else{
7462 u16 contentOffset = get2byte(&data[hdr+5]);
7463 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */
7464 memset(hit+contentOffset, 0, usableSize-contentOffset);
7465 memset(hit, 1, contentOffset);
7466 nCell = get2byte(&data[hdr+3]);
7467 cellStart = hdr + 12 - 4*pPage->leaf;
7468 for(i=0; i<nCell; i++){
7469 int pc = get2byte(&data[cellStart+i*2]);
7470 u16 size = 1024;
7471 int j;
7472 if( pc<=usableSize-4 ){
7473 size = cellSizePtr(pPage, &data[pc]);
7474 }
7475 if( (pc+size-1)>=usableSize ){
7476 checkAppendMsg(pCheck, 0,
7477 "Corruption detected in cell %d on page %d",i,iPage,0);
7478 }else{
7479 for(j=pc+size-1; j>=pc; j--) hit[j]++;
7480 }
7481 }
7482 i = get2byte(&data[hdr+1]);
7483 while( i>0 ){
7484 int size, j;
7485 assert( i<=usableSize-4 ); /* Enforced by btreeInitPage() */
7486 size = get2byte(&data[i+2]);
7487 assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */
7488 for(j=i+size-1; j>=i; j--) hit[j]++;
7489 j = get2byte(&data[i]);
7490 assert( j==0 || j>i+size ); /* Enforced by btreeInitPage() */
7491 assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */
7492 i = j;
7493 }
7494 for(i=cnt=0; i<usableSize; i++){
7495 if( hit[i]==0 ){
7496 cnt++;
7497 }else if( hit[i]>1 ){
7498 checkAppendMsg(pCheck, 0,
7499 "Multiple uses for byte %d of page %d", i, iPage);
7500 break;
7501 }
7502 }
7503 if( cnt!=data[hdr+7] ){
7504 checkAppendMsg(pCheck, 0,
7505 "Fragmentation of %d bytes reported as %d on page %d",
7506 cnt, data[hdr+7], iPage);
7507 }
7508 }
7509 sqlite3PageFree(hit);
7510 releasePage(pPage);
7511 return depth+1;
7512 }
7513 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7514
7515 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
7516 /*
7517 ** This routine does a complete check of the given BTree file. aRoot[] is
7518 ** an array of pages numbers were each page number is the root page of
7519 ** a table. nRoot is the number of entries in aRoot.
7520 **
7521 ** A read-only or read-write transaction must be opened before calling
7522 ** this function.
7523 **
7524 ** Write the number of error seen in *pnErr. Except for some memory
7525 ** allocation errors, an error message held in memory obtained from
7526 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
7527 ** returned. If a memory allocation error occurs, NULL is returned.
7528 */
7529 char *sqlite3BtreeIntegrityCheck(
7530 Btree *p, /* The btree to be checked */
7531 int *aRoot, /* An array of root pages numbers for individual trees */
7532 int nRoot, /* Number of entries in aRoot[] */
7533 int mxErr, /* Stop reporting errors after this many */
7534 int *pnErr /* Write number of errors seen to this variable */
7535 ){
7536 Pgno i;
7537 int nRef;
7538 IntegrityCk sCheck;
7539 BtShared *pBt = p->pBt;
7540 char zErr[100];
7541
7542 sqlite3BtreeEnter(p);
7543 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
7544 nRef = sqlite3PagerRefcount(pBt->pPager);
7545 sCheck.pBt = pBt;
7546 sCheck.pPager = pBt->pPager;
7547 sCheck.nPage = pagerPagecount(sCheck.pBt);
7548 sCheck.mxErr = mxErr;
7549 sCheck.nErr = 0;
7550 sCheck.mallocFailed = 0;
7551 *pnErr = 0;
7552 if( sCheck.nPage==0 ){
7553 sqlite3BtreeLeave(p);
7554 return 0;
7555 }
7556 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
7557 if( !sCheck.anRef ){
7558 *pnErr = 1;
7559 sqlite3BtreeLeave(p);
7560 return 0;
7561 }
7562 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
7563 i = PENDING_BYTE_PAGE(pBt);
7564 if( i<=sCheck.nPage ){
7565 sCheck.anRef[i] = 1;
7566 }
7567 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
7568
7569 /* Check the integrity of the freelist
7570 */
7571 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
7572 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
7573
7574 /* Check all the tables.
7575 */
7576 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
7577 if( aRoot[i]==0 ) continue;
7578 #ifndef SQLITE_OMIT_AUTOVACUUM
7579 if( pBt->autoVacuum && aRoot[i]>1 ){
7580 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
7581 }
7582 #endif
7583 checkTreePage(&sCheck, aRoot[i], "List of tree roots: ");
7584 }
7585
7586 /* Make sure every page in the file is referenced
7587 */
7588 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
7589 #ifdef SQLITE_OMIT_AUTOVACUUM
7590 if( sCheck.anRef[i]==0 ){
7591 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
7592 }
7593 #else
7594 /* If the database supports auto-vacuum, make sure no tables contain
7595 ** references to pointer-map pages.
7596 */
7597 if( sCheck.anRef[i]==0 &&
7598 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
7599 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
7600 }
7601 if( sCheck.anRef[i]!=0 &&
7602 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
7603 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
7604 }
7605 #endif
7606 }
7607
7608 /* Make sure this analysis did not leave any unref() pages.
7609 ** This is an internal consistency check; an integrity check
7610 ** of the integrity check.
7611 */
7612 if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){
7613 checkAppendMsg(&sCheck, 0,
7614 "Outstanding page count goes from %d to %d during this analysis",
7615 nRef, sqlite3PagerRefcount(pBt->pPager)
7616 );
7617 }
7618
7619 /* Clean up and report errors.
7620 */
7621 sqlite3BtreeLeave(p);
7622 sqlite3_free(sCheck.anRef);
7623 if( sCheck.mallocFailed ){
7624 sqlite3StrAccumReset(&sCheck.errMsg);
7625 *pnErr = sCheck.nErr+1;
7626 return 0;
7627 }
7628 *pnErr = sCheck.nErr;
7629 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
7630 return sqlite3StrAccumFinish(&sCheck.errMsg);
7631 }
7632 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7633
7634 /*
7635 ** Return the full pathname of the underlying database file.
7636 **
7637 ** The pager filename is invariant as long as the pager is
7638 ** open so it is safe to access without the BtShared mutex.
7639 */
7640 const char *sqlite3BtreeGetFilename(Btree *p){
7641 assert( p->pBt->pPager!=0 );
7642 return sqlite3PagerFilename(p->pBt->pPager);
7643 }
7644
7645 /*
7646 ** Return the pathname of the journal file for this database. The return
7647 ** value of this routine is the same regardless of whether the journal file
7648 ** has been created or not.
7649 **
7650 ** The pager journal filename is invariant as long as the pager is
7651 ** open so it is safe to access without the BtShared mutex.
7652 */
7653 const char *sqlite3BtreeGetJournalname(Btree *p){
7654 assert( p->pBt->pPager!=0 );
7655 return sqlite3PagerJournalname(p->pBt->pPager);
7656 }
7657
7658 /*
7659 ** Return non-zero if a transaction is active.
7660 */
7661 int sqlite3BtreeIsInTrans(Btree *p){
7662 assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
7663 return (p && (p->inTrans==TRANS_WRITE));
7664 }
7665
7666 /*
7667 ** Return non-zero if a read (or write) transaction is active.
7668 */
7669 int sqlite3BtreeIsInReadTrans(Btree *p){
7670 assert( p );
7671 assert( sqlite3_mutex_held(p->db->mutex) );
7672 return p->inTrans!=TRANS_NONE;
7673 }
7674
7675 int sqlite3BtreeIsInBackup(Btree *p){
7676 assert( p );
7677 assert( sqlite3_mutex_held(p->db->mutex) );
7678 return p->nBackup!=0;
7679 }
7680
7681 /*
7682 ** This function returns a pointer to a blob of memory associated with
7683 ** a single shared-btree. The memory is used by client code for its own
7684 ** purposes (for example, to store a high-level schema associated with
7685 ** the shared-btree). The btree layer manages reference counting issues.
7686 **
7687 ** The first time this is called on a shared-btree, nBytes bytes of memory
7688 ** are allocated, zeroed, and returned to the caller. For each subsequent
7689 ** call the nBytes parameter is ignored and a pointer to the same blob
7690 ** of memory returned.
7691 **
7692 ** If the nBytes parameter is 0 and the blob of memory has not yet been
7693 ** allocated, a null pointer is returned. If the blob has already been
7694 ** allocated, it is returned as normal.
7695 **
7696 ** Just before the shared-btree is closed, the function passed as the
7697 ** xFree argument when the memory allocation was made is invoked on the
7698 ** blob of allocated memory. This function should not call sqlite3_free()
7699 ** on the memory, the btree layer does that.
7700 */
7701 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7702 BtShared *pBt = p->pBt;
7703 sqlite3BtreeEnter(p);
7704 if( !pBt->pSchema && nBytes ){
7705 pBt->pSchema = sqlite3MallocZero(nBytes);
7706 pBt->xFreeSchema = xFree;
7707 }
7708 sqlite3BtreeLeave(p);
7709 return pBt->pSchema;
7710 }
7711
7712 /*
7713 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
7714 ** btree as the argument handle holds an exclusive lock on the
7715 ** sqlite_master table. Otherwise SQLITE_OK.
7716 */
7717 int sqlite3BtreeSchemaLocked(Btree *p){
7718 int rc;
7719 assert( sqlite3_mutex_held(p->db->mutex) );
7720 sqlite3BtreeEnter(p);
7721 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
7722 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
7723 sqlite3BtreeLeave(p);
7724 return rc;
7725 }
7726
7727
7728 #ifndef SQLITE_OMIT_SHARED_CACHE
7729 /*
7730 ** Obtain a lock on the table whose root page is iTab. The
7731 ** lock is a write lock if isWritelock is true or a read lock
7732 ** if it is false.
7733 */
7734 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
7735 int rc = SQLITE_OK;
7736 assert( p->inTrans!=TRANS_NONE );
7737 if( p->sharable ){
7738 u8 lockType = READ_LOCK + isWriteLock;
7739 assert( READ_LOCK+1==WRITE_LOCK );
7740 assert( isWriteLock==0 || isWriteLock==1 );
7741
7742 sqlite3BtreeEnter(p);
7743 rc = querySharedCacheTableLock(p, iTab, lockType);
7744 if( rc==SQLITE_OK ){
7745 rc = setSharedCacheTableLock(p, iTab, lockType);
7746 }
7747 sqlite3BtreeLeave(p);
7748 }
7749 return rc;
7750 }
7751 #endif
7752
7753 #ifndef SQLITE_OMIT_INCRBLOB
7754 /*
7755 ** Argument pCsr must be a cursor opened for writing on an
7756 ** INTKEY table currently pointing at a valid table entry.
7757 ** This function modifies the data stored as part of that entry.
7758 **
7759 ** Only the data content may only be modified, it is not possible to
7760 ** change the length of the data stored. If this function is called with
7761 ** parameters that attempt to write past the end of the existing data,
7762 ** no modifications are made and SQLITE_CORRUPT is returned.
7763 */
7764 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
7765 int rc;
7766 assert( cursorHoldsMutex(pCsr) );
7767 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
7768 assert( pCsr->isIncrblobHandle );
7769
7770 rc = restoreCursorPosition(pCsr);
7771 if( rc!=SQLITE_OK ){
7772 return rc;
7773 }
7774 assert( pCsr->eState!=CURSOR_REQUIRESEEK );
7775 if( pCsr->eState!=CURSOR_VALID ){
7776 return SQLITE_ABORT;
7777 }
7778
7779 /* Check some assumptions:
7780 ** (a) the cursor is open for writing,
7781 ** (b) there is a read/write transaction open,
7782 ** (c) the connection holds a write-lock on the table (if required),
7783 ** (d) there are no conflicting read-locks, and
7784 ** (e) the cursor points at a valid row of an intKey table.
7785 */
7786 if( !pCsr->wrFlag ){
7787 return SQLITE_READONLY;
7788 }
7789 assert( !pCsr->pBt->readOnly && pCsr->pBt->inTransaction==TRANS_WRITE );
7790 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
7791 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
7792 assert( pCsr->apPage[pCsr->iPage]->intKey );
7793
7794 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
7795 }
7796
7797 /*
7798 ** Set a flag on this cursor to cache the locations of pages from the
7799 ** overflow list for the current row. This is used by cursors opened
7800 ** for incremental blob IO only.
7801 **
7802 ** This function sets a flag only. The actual page location cache
7803 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
7804 ** accessPayload() (the worker function for sqlite3BtreeData() and
7805 ** sqlite3BtreePutData()).
7806 */
7807 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
7808 assert( cursorHoldsMutex(pCur) );
7809 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
7810 assert(!pCur->isIncrblobHandle);
7811 assert(!pCur->aOverflow);
7812 pCur->isIncrblobHandle = 1;
7813 }
7814
7815 /* Poison the db so that other clients error out as quickly as
7816 ** possible.
7817 */
7818 int sqlite3Poison(sqlite3 *db){
7819 int rc;
7820 Btree *p;
7821 BtShared *pBt;
7822 unsigned char *pP1;
7823
7824 if( db == NULL) return SQLITE_OK;
7825
7826 /* Database 0 corrosponds to the main database. */
7827 if( db->nDb<1 ) return SQLITE_OK;
7828 p = db->aDb[0].pBt;
7829 pBt = p->pBt;
7830
7831 /* If in a transaction, roll it back. Committing any changes to a
7832 ** corrupt database may mess up evidence, we definitely don't want
7833 ** to allow poisoning to be rolled back, and the database is anyhow
7834 ** going bye-bye RSN.
7835 */
7836 /* TODO(shess): Figure out if this might release the lock and let
7837 ** someone else get in there, which might deny us the lock a couple
7838 ** lines down.
7839 */
7840 if( sqlite3BtreeIsInTrans(p) ) sqlite3BtreeRollback(p);
7841
7842 /* Start an exclusive transaction. This will check the headers, so
7843 ** if someone else poisoned the database we should get an error.
7844 */
7845 rc = sqlite3BtreeBeginTrans(p, 2);
7846 /* TODO(shess): Handle SQLITE_BUSY? */
7847 if( rc!=SQLITE_OK ) return rc;
7848
7849 /* Copied from sqlite3BtreeUpdateMeta(). Writing the old version of
7850 ** the page to the journal may be overkill, but it probably won't
7851 ** hurt.
7852 */
7853 assert( pBt->inTrans==TRANS_WRITE );
7854 assert( pBt->pPage1!=0 );
7855 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
7856 if( rc ) goto err;
7857
7858 /* "SQLite format 3" changes to
7859 ** "SQLite poison 3". Be extra paranoid about making this change.
7860 */
7861 if( sizeof(zMagicHeader)!=16 ||
7862 sizeof(zPoisonHeader)!=sizeof(zMagicHeader) ){
7863 rc = SQLITE_ERROR;
7864 goto err;
7865 }
7866 pP1 = pBt->pPage1->aData;
7867 if( memcmp(pP1, zMagicHeader, 16)!=0 ){
7868 rc = SQLITE_CORRUPT;
7869 goto err;
7870 }
7871 memcpy(pP1, zPoisonHeader, 16);
7872
7873 /* Push it to the database file. */
7874 return sqlite3BtreeCommit(p);
7875
7876 err:
7877 /* TODO(shess): What about errors, here? */
7878 sqlite3BtreeRollback(p);
7879 return rc;
7880 }
7881
7882 #endif
OLDNEW
« no previous file with comments | « third_party/sqlite/src/btree.h ('k') | third_party/sqlite/src/btreeInt.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698