third_party/sqlite/sqlite-src-3170000/src/btree.c - Issue 2747283002: [sql] Import reference version of SQLite 3.17..

Side by Side Diff: third_party/sqlite/sqlite-src-3170000/src/btree.c

Issue 2747283002: [sql] Import reference version of SQLite 3.17.. (Closed)

Patch Set: Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 /*

	2 ** 2004 April 6

	3 **

	4 ** The author disclaims copyright to this source code. In place of

	5 ** a legal notice, here is a blessing:

	6 **

	7 ** May you do good and not evil.

	8 ** May you find forgiveness for yourself and forgive others.

	9 ** May you share freely, never taking more than you give.

	10 **

	11 *************************************************************************

	12 ** This file implements an external (disk-based) database using BTrees.

	13 ** See the header comment on "btreeInt.h" for additional information.

	14 ** Including a description of file format and an overview of operation.

	15 */

	16 #include "btreeInt.h"

	17

	18 /*

	19 ** The header string that appears at the beginning of every

	20 ** SQLite database.

	21 */

	22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;

	23

	/*
	** Change the "#if 0" below to "#if 1" to compile in tracing.  When
	** tracing is compiled in, set the global variable sqlite3BtreeTrace
	** to 1 to enable output from the TRACE macro.  X is a parenthesized
	** printf() argument list, e.g. TRACE(("pgno=%d\n", pgno)).
	**
	** When tracing is compiled out (the default) TRACE(X) expands to nothing.
	*/
	#if 0
	int sqlite3BtreeTrace=1;  /* True to enable tracing */
	# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
	#else
	# define TRACE(X)
	#endif

	34

	/*
	** Extract a 2-byte big-endian integer from an array of unsigned bytes.
	** But if the value is zero, make it 65536.
	**
	** This routine is used to extract the "offset to cell content area" value
	** from the header of a btree page.  If the page size is 65536 and the page
	** is empty, the offset should be 65536, but the 2-byte value stores zero.
	** This routine makes the necessary adjustment to 65536.
	**
	** The arithmetic maps 0 -> 65536 and leaves 1..65535 unchanged:
	** subtracting 1 turns 0 into -1, masking with 0xffff yields 65535, and
	** adding 1 back gives 65536.
	*/
	#define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)

	45

	/*
	** Values passed as the 5th argument to allocateBtreePage().  They select
	** how strictly the "nearby" page-number parameter must be honored.
	*/
	#define BTALLOC_ANY   0           /* Allocate any page */
	#define BTALLOC_EXACT 1           /* Allocate exact page if possible */
	#define BTALLOC_LE    2           /* Allocate any page <= the parameter */

	52

	/*
	** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not
	** defined, or 0 if it is.  For example:
	**
	**   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
	**
	** When auto-vacuum is compiled out the argument is not evaluated at all.
	*/
	#ifndef SQLITE_OMIT_AUTOVACUUM
	#define IfNotOmitAV(expr) (expr)
	#else
	#define IfNotOmitAV(expr) 0
	#endif

	64

	65 #ifndef SQLITE_OMIT_SHARED_CACHE

	66 /*

	67 ** A list of BtShared objects that are eligible for participation

	68 ** in shared cache. This variable has file scope during normal builds,

	69 ** but the test harness needs to access it so we make it global for

	70 ** test builds.

	71 **

	72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.

	73 */

	74 #ifdef SQLITE_TEST

	75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;

	76 #else

	77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;

	78 #endif

	79 #endif /* SQLITE_OMIT_SHARED_CACHE */

	80

	81 #ifndef SQLITE_OMIT_SHARED_CACHE

	82 /*

	83 ** Enable or disable the shared pager and schema features.

	84 **

	85 ** This routine has no effect on existing database connections.

	86 ** The shared cache setting effects only future calls to

	87 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().

	88 */

	89 int sqlite3_enable_shared_cache(int enable){

	90 sqlite3GlobalConfig.sharedCacheEnabled = enable;

	91 return SQLITE_OK;

	92 }

	93 #endif

	94

	95

	96

	#ifdef SQLITE_OMIT_SHARED_CACHE
	  /*
	  ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
	  ** and clearAllSharedCacheTableLocks()
	  ** manipulate entries in the BtShared.pLock linked list used to store
	  ** shared-cache table level locks.  If the library is compiled with the
	  ** shared-cache feature disabled, then there is only ever one user
	  ** of each BtShared structure and so this locking is not necessary.
	  ** So define the lock related functions as no-ops.
	  **
	  ** The query/set macros evaluate to SQLITE_OK, the assert()-only
	  ** predicates evaluate to "lock held" (1) and "no conflict" (0), and
	  ** the clear/downgrade macros expand to nothing.
	  */
	  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
	  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
	  #define clearAllSharedCacheTableLocks(a)
	  #define downgradeAllSharedCacheTableLocks(a)
	  #define hasSharedCacheTableLock(a,b,c,d) 1
	  #define hasReadConflicts(a, b) 0
	#endif

	114

	115 #ifndef SQLITE_OMIT_SHARED_CACHE

	116

	#ifdef SQLITE_DEBUG
	/*
	**** This function is only used as part of an assert() statement. ***
	**
	** Check to see if pBtree holds the required locks to read or write to the
	** table with root page iRoot.   Return 1 if it does and 0 if not.
	**
	** For example, when writing to a table with root-page iRoot via
	** Btree connection pBtree:
	**
	**    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
	**
	** When writing to an index that resides in a sharable database, the
	** caller should have first obtained a lock specifying the root page of
	** the corresponding table.  This makes things a bit more complicated,
	** as this module treats each table as a separate structure.  To determine
	** the table corresponding to the index being written, this
	** function has to search through the database schema.
	**
	** Instead of a lock on the table/index rooted at page iRoot, the caller may
	** hold a write-lock on the schema table (root page 1).  This is also
	** acceptable.
	*/
	static int hasSharedCacheTableLock(
	  Btree *pBtree,         /* Handle that must hold lock */
	  Pgno iRoot,            /* Root page of b-tree */
	  int isIndex,           /* True if iRoot is the root of an index b-tree */
	  int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
	){
	  Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
	  Pgno iTab = 0;
	  BtLock *pLock;

	  /* If this database is not shareable, or if the client is reading
	  ** and has the read-uncommitted flag set, then no lock is required.
	  ** Return true immediately.
	  */
	  if( (pBtree->sharable==0)
	   || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
	  ){
	    return 1;
	  }

	  /* If the client is reading or writing an index and the schema is
	  ** not loaded, then it is too difficult to actually check to see if
	  ** the correct locks are held.  So do not bother - just return true.
	  ** This case does not come up very often anyhow.
	  */
	  if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
	    return 1;
	  }

	  /* Figure out the root-page that the lock should be held on.  For table
	  ** b-trees, this is just the root page of the b-tree being read or
	  ** written.  For index b-trees, it is the root page of the associated
	  ** table.  */
	  if( isIndex ){
	    HashElem *p;
	    for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
	      Index *pIdx = (Index *)sqliteHashData(p);
	      if( pIdx->tnum==(int)iRoot ){
	        if( iTab ){
	          /* Two or more indexes share the same root page.  There must
	          ** be imposter tables.  So just return true.  The assert is not
	          ** useful in that case. */
	          return 1;
	        }
	        iTab = pIdx->pTable->tnum;
	      }
	    }
	  }else{
	    iTab = iRoot;
	  }

	  /* Search for the required lock.  Either a write-lock on root-page iTab, a
	  ** write-lock on the schema table, or (if the client is reading) a
	  ** read-lock on iTab will suffice.  Return 1 if any of these are found.  */
	  for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
	    if( pLock->pBtree==pBtree
	     && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
	     && pLock->eLock>=eLockType
	    ){
	      return 1;
	    }
	  }

	  /* Failed to find the required lock. */
	  return 0;
	}
	#endif /* SQLITE_DEBUG */

	207

	#ifdef SQLITE_DEBUG
	/*
	**** This function may be used as part of assert() statements only. ****
	**
	** Return true if it would be illegal for pBtree to write into the
	** table or index rooted at iRoot because other shared connections are
	** simultaneously reading that same table or index.
	**
	** It is illegal for pBtree to write if some other Btree object that
	** shares the same BtShared object is currently reading or writing
	** the iRoot table.  Except, if the other Btree object has the
	** read-uncommitted flag set, then it is OK for the other object to
	** have a read cursor.
	**
	** For example, before writing to any part of the table or index
	** rooted at page iRoot, one should call:
	**
	**    assert( !hasReadConflicts(pBtree, iRoot) );
	*/
	static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
	  BtCursor *p;
	  /* Walk every cursor open on the shared b-tree looking for a cursor on
	  ** iRoot that belongs to a different, non-read-uncommitted connection. */
	  for(p=pBtree->pBt->pCursor; p; p=p->pNext){
	    if( p->pgnoRoot==iRoot
	     && p->pBtree!=pBtree
	     && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
	    ){
	      return 1;
	    }
	  }
	  return 0;
	}
	#endif    /* #ifdef SQLITE_DEBUG */

	240

	241 /*

	242 ** Query to see if Btree handle p may obtain a lock of type eLock

	243 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return

	244 ** SQLITE_OK if the lock may be obtained (by calling

	245 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.

	246 */

	247 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){

	248 BtShared *pBt = p->pBt;

	249 BtLock *pIter;

	250

	251 assert( sqlite3BtreeHoldsMutex(p) );

	252 assert( eLock==READ_LOCK \|\| eLock==WRITE_LOCK );

	253 assert( p->db!=0 );

	254 assert( !(p->db->flags&SQLITE_ReadUncommitted)\|\|eLock==WRITE_LOCK\|\|iTab==1 );

	255

	256 /* If requesting a write-lock, then the Btree must have an open write

	257 ** transaction on this file. And, obviously, for this to be so there

	258 ** must be an open write transaction on the file itself.

	259 */

	260 assert( eLock==READ_LOCK \|\| (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );

	261 assert( eLock==READ_LOCK \|\| pBt->inTransaction==TRANS_WRITE );

	262

	263 /* This routine is a no-op if the shared-cache is not enabled */

	264 if( !p->sharable ){

	265 return SQLITE_OK;

	266 }

	267

	268 /* If some other connection is holding an exclusive lock, the

	269 ** requested lock may not be obtained.

	270 */

	271 if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){

	272 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);

	273 return SQLITE_LOCKED_SHAREDCACHE;

	274 }

	275

	276 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){

	277 /* The condition (pIter->eLock!=eLock) in the following if(...)

	278 ** statement is a simplification of:

	279 **

	280 ** (eLock==WRITE_LOCK \|\| pIter->eLock==WRITE_LOCK)

	281 **

	282 ** since we know that if eLock==WRITE_LOCK, then no other connection

	283 ** may hold a WRITE_LOCK on any table in this file (since there can

	284 ** only be a single writer).

	285 */

	286 assert( pIter->eLock==READ_LOCK \|\| pIter->eLock==WRITE_LOCK );

	287 assert( eLock==READ_LOCK \|\| pIter->pBtree==p \|\| pIter->eLock==READ_LOCK);

	288 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){

	289 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);

	290 if( eLock==WRITE_LOCK ){

	291 assert( p==pBt->pWriter );

	292 pBt->btsFlags \|= BTS_PENDING;

	293 }

	294 return SQLITE_LOCKED_SHAREDCACHE;

	295 }

	296 }

	297 return SQLITE_OK;

	298 }

	299 #endif /* !SQLITE_OMIT_SHARED_CACHE */

	300

	301 #ifndef SQLITE_OMIT_SHARED_CACHE

	302 /*

	303 ** Add a lock on the table with root-page iTable to the shared-btree used

	304 ** by Btree handle p. Parameter eLock must be either READ_LOCK or

	305 ** WRITE_LOCK.

	306 **

	307 ** This function assumes the following:

	308 **

	309 ** (a) The specified Btree object p is connected to a sharable

	310 ** database (one with the BtShared.sharable flag set), and

	311 **

	312 ** (b) No other Btree objects hold a lock that conflicts

	313 ** with the requested lock (i.e. querySharedCacheTableLock() has

	314 ** already been called and returned SQLITE_OK).

	315 **

	316 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM

	317 ** is returned if a malloc attempt fails.

	318 */

	319 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){

	320 BtShared *pBt = p->pBt;

	321 BtLock *pLock = 0;

	322 BtLock *pIter;

	323

	324 assert( sqlite3BtreeHoldsMutex(p) );

	325 assert( eLock==READ_LOCK \|\| eLock==WRITE_LOCK );

	326 assert( p->db!=0 );

	327

	328 /* A connection with the read-uncommitted flag set will never try to

	329 ** obtain a read-lock using this function. The only read-lock obtained

	330 ** by a connection in read-uncommitted mode is on the sqlite_master

	331 ** table, and that lock is obtained in BtreeBeginTrans(). */

	332 assert( 0==(p->db->flags&SQLITE_ReadUncommitted) \|\| eLock==WRITE_LOCK );

	333

	334 /* This function should only be called on a sharable b-tree after it

	335 ** has been determined that no other b-tree holds a conflicting lock. */

	336 assert( p->sharable );

	337 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );

	338

	339 /* First search the list for an existing lock on this table. */

	340 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){

	341 if( pIter->iTable==iTable && pIter->pBtree==p ){

	342 pLock = pIter;

	343 break;

	344 }

	345 }

	346

	347 /* If the above search did not find a BtLock struct associating Btree p

	348 ** with table iTable, allocate one and link it into the list.

	349 */

	350 if( !pLock ){

	351 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));

	352 if( !pLock ){

	353 return SQLITE_NOMEM_BKPT;

	354 }

	355 pLock->iTable = iTable;

	356 pLock->pBtree = p;

	357 pLock->pNext = pBt->pLock;

	358 pBt->pLock = pLock;

	359 }

	360

	361 /* Set the BtLock.eLock variable to the maximum of the current lock

	362 ** and the requested lock. This means if a write-lock was already held

	363 ** and a read-lock requested, we don't incorrectly downgrade the lock.

	364 */

	365 assert( WRITE_LOCK>READ_LOCK );

	366 if( eLock>pLock->eLock ){

	367 pLock->eLock = eLock;

	368 }

	369

	370 return SQLITE_OK;

	371 }

	372 #endif /* !SQLITE_OMIT_SHARED_CACHE */

	373

	374 #ifndef SQLITE_OMIT_SHARED_CACHE

	375 /*

	376 ** Release all the table locks (locks obtained via calls to

	377 ** the setSharedCacheTableLock() procedure) held by Btree object p.

	378 **

	379 ** This function assumes that Btree p has an open read or write

	380 ** transaction. If it does not, then the BTS_PENDING flag

	381 ** may be incorrectly cleared.

	382 */

	383 static void clearAllSharedCacheTableLocks(Btree *p){

	384 BtShared *pBt = p->pBt;

	385 BtLock **ppIter = &pBt->pLock;

	386

	387 assert( sqlite3BtreeHoldsMutex(p) );

	388 assert( p->sharable \|\| 0==*ppIter );

	389 assert( p->inTrans>0 );

	390

	391 while( *ppIter ){

	392 BtLock pLock = ppIter;

	393 assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 \|\| pBt->pWriter==pLock->pBtree );

	394 assert( pLock->pBtree->inTrans>=pLock->eLock );

	395 if( pLock->pBtree==p ){

	396 *ppIter = pLock->pNext;

	397 assert( pLock->iTable!=1 \|\| pLock==&p->lock );

	398 if( pLock->iTable!=1 ){

	399 sqlite3_free(pLock);

	400 }

	401 }else{

	402 ppIter = &pLock->pNext;

	403 }

	404 }

	405

	406 assert( (pBt->btsFlags & BTS_PENDING)==0 \|\| pBt->pWriter );

	407 if( pBt->pWriter==p ){

	408 pBt->pWriter = 0;

	409 pBt->btsFlags &= ~(BTS_EXCLUSIVE\|BTS_PENDING);

	410 }else if( pBt->nTransaction==2 ){

	411 /* This function is called when Btree p is concluding its

	412 ** transaction. If there currently exists a writer, and p is not

	413 ** that writer, then the number of locks held by connections other

	414 ** than the writer must be about to drop to zero. In this case

	415 ** set the BTS_PENDING flag to 0.

	416 **

	417 ** If there is not currently a writer, then BTS_PENDING must

	418 ** be zero already. So this next line is harmless in that case.

	419 */

	420 pBt->btsFlags &= ~BTS_PENDING;

	421 }

	422 }

	423

	424 /*

	425 ** This function changes all write-locks held by Btree p into read-locks.

	426 */

	427 static void downgradeAllSharedCacheTableLocks(Btree *p){

	428 BtShared *pBt = p->pBt;

	429 if( pBt->pWriter==p ){

	430 BtLock *pLock;

	431 pBt->pWriter = 0;

	432 pBt->btsFlags &= ~(BTS_EXCLUSIVE\|BTS_PENDING);

	433 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){

	434 assert( pLock->eLock==READ_LOCK \|\| pLock->pBtree==p );

	435 pLock->eLock = READ_LOCK;

	436 }

	437 }

	438 }

	439

	440 #endif /* SQLITE_OMIT_SHARED_CACHE */

	441

	442 static void releasePage(MemPage pPage); / Forward reference */

	443

	444 /*

	445 *** This routine is used inside of assert() only **

	446 **

	447 ** Verify that the cursor holds the mutex on its BtShared

	448 */

	449 #ifdef SQLITE_DEBUG

	450 static int cursorHoldsMutex(BtCursor *p){

	451 return sqlite3_mutex_held(p->pBt->mutex);

	452 }

	453

	454 /* Verify that the cursor and the BtShared agree about what is the current

	455 ** database connetion. This is important in shared-cache mode. If the database

	456 ** connection pointers get out-of-sync, it is possible for routines like

	457 ** btreeInitPage() to reference an stale connection pointer that references a

	458 ** a connection that has already closed. This routine is used inside assert()

	459 ** statements only and for the purpose of double-checking that the btree code

	460 ** does keep the database connection pointers up-to-date.

	461 */

	462 static int cursorOwnsBtShared(BtCursor *p){

	463 assert( cursorHoldsMutex(p) );

	464 return (p->pBtree->db==p->pBt->db);

	465 }

	466 #endif

	467

	/*
	** Invalidate the overflow cache of the cursor passed as the first argument
	** by clearing its BTCF_ValidOvfl flag.
	**
	** The argument is parenthesized in the expansion so that any pointer
	** expression (for example &cursor) may be passed safely.
	*/
	#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)

	473

	474 /*

	475 ** Invalidate the overflow page-list cache for all cursors opened

	476 ** on the shared btree structure pBt.

	477 */

	478 static void invalidateAllOverflowCache(BtShared *pBt){

	479 BtCursor *p;

	480 assert( sqlite3_mutex_held(pBt->mutex) );

	481 for(p=pBt->pCursor; p; p=p->pNext){

	482 invalidateOverflowCache(p);

	483 }

	484 }

	485

	486 #ifndef SQLITE_OMIT_INCRBLOB

	487 /*

	488 ** This function is called before modifying the contents of a table

	489 ** to invalidate any incrblob cursors that are open on the

	490 ** row or one of the rows being modified.

	491 **

	492 ** If argument isClearTable is true, then the entire contents of the

	493 ** table is about to be deleted. In this case invalidate all incrblob

	494 ** cursors open on any row within the table with root-page pgnoRoot.

	495 **

	496 ** Otherwise, if argument isClearTable is false, then the row with

	497 ** rowid iRow is being replaced or deleted. In this case invalidate

	498 ** only those incrblob cursors open on that specific row.

	499 */

	500 static void invalidateIncrblobCursors(

	501 Btree pBtree, / The database file to check */

	502 i64 iRow, /* The rowid that might be changing */

	503 int isClearTable /* True if all rows are being deleted */

	504 ){

	505 BtCursor *p;

	506 if( pBtree->hasIncrblobCur==0 ) return;

	507 assert( sqlite3BtreeHoldsMutex(pBtree) );

	508 pBtree->hasIncrblobCur = 0;

	509 for(p=pBtree->pBt->pCursor; p; p=p->pNext){

	510 if( (p->curFlags & BTCF_Incrblob)!=0 ){

	511 pBtree->hasIncrblobCur = 1;

	512 if( isClearTable \|\| p->info.nKey==iRow ){

	513 p->eState = CURSOR_INVALID;

	514 }

	515 }

	516 }

	517 }

	518

	519 #else

	520 /* Stub function when INCRBLOB is omitted */

	521 #define invalidateIncrblobCursors(x,y,z)

	522 #endif /* SQLITE_OMIT_INCRBLOB */

	523

	524 /*

	525 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called

	526 ** when a page that previously contained data becomes a free-list leaf

	527 ** page.

	528 **

	529 ** The BtShared.pHasContent bitvec exists to work around an obscure

	530 ** bug caused by the interaction of two useful IO optimizations surrounding

	531 ** free-list leaf pages:

	532 **

	533 ** 1) When all data is deleted from a page and the page becomes

	534 ** a free-list leaf page, the page is not written to the database

	535 ** (as free-list leaf pages contain no meaningful data). Sometimes

	536 ** such a page is not even journalled (as it will not be modified,

	537 ** why bother journalling it?).

	538 **

	539 ** 2) When a free-list leaf page is reused, its content is not read

	540 ** from the database or written to the journal file (why should it

	541 ** be, if it is not at all meaningful?).

	542 **

	543 ** By themselves, these optimizations work fine and provide a handy

	544 ** performance boost to bulk delete or insert operations. However, if

	545 ** a page is moved to the free-list and then reused within the same

	546 ** transaction, a problem comes up. If the page is not journalled when

	547 ** it is moved to the free-list and it is also not journalled when it

	548 ** is extracted from the free-list and reused, then the original data

	549 ** may be lost. In the event of a rollback, it may not be possible

	550 ** to restore the database to its original configuration.

	551 **

	552 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is

	553 ** moved to become a free-list leaf page, the corresponding bit is

	554 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,

	555 ** optimization 2 above is omitted if the corresponding bit is already

	556 ** set in BtShared.pHasContent. The contents of the bitvec are cleared

	557 ** at the end of every transaction.

	558 */

	559 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){

	560 int rc = SQLITE_OK;

	561 if( !pBt->pHasContent ){

	562 assert( pgno<=pBt->nPage );

	563 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);

	564 if( !pBt->pHasContent ){

	565 rc = SQLITE_NOMEM_BKPT;

	566 }

	567 }

	568 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){

	569 rc = sqlite3BitvecSet(pBt->pHasContent, pgno);

	570 }

	571 return rc;

	572 }

	573

	574 /*

	575 ** Query the BtShared.pHasContent vector.

	576 **

	577 ** This function is called when a free-list leaf page is removed from the

	578 ** free-list for reuse. It returns false if it is safe to retrieve the

	579 ** page from the pager layer with the 'no-content' flag set. True otherwise.

	580 */

	581 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){

	582 Bitvec *p = pBt->pHasContent;

	583 return (p && (pgno>sqlite3BitvecSize(p) \|\| sqlite3BitvecTest(p, pgno)));

	584 }

	585

	586 /*

	587 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be

	588 ** invoked at the conclusion of each write-transaction.

	589 */

	590 static void btreeClearHasContent(BtShared *pBt){

	591 sqlite3BitvecDestroy(pBt->pHasContent);

	592 pBt->pHasContent = 0;

	593 }

	594

	595 /*

	596 ** Release all of the apPage[] pages for a cursor.

	597 */

	598 static void btreeReleaseAllCursorPages(BtCursor *pCur){

	599 int i;

	600 for(i=0; i<=pCur->iPage; i++){

	601 releasePage(pCur->apPage[i]);

	602 pCur->apPage[i] = 0;

	603 }

	604 pCur->iPage = -1;

	605 }

	606

	607 /*

	608 ** The cursor passed as the only argument must point to a valid entry

	609 ** when this function is called (i.e. have eState==CURSOR_VALID). This

	610 ** function saves the current cursor key in variables pCur->nKey and

	611 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error

	612 ** code otherwise.

	613 **

	614 ** If the cursor is open on an intkey table, then the integer key

	615 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to

	616 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is

	617 ** set to point to a malloced buffer pCur->nKey bytes in size containing

	618 ** the key.

	619 */

	620 static int saveCursorKey(BtCursor *pCur){

	621 int rc = SQLITE_OK;

	622 assert( CURSOR_VALID==pCur->eState );

	623 assert( 0==pCur->pKey );

	624 assert( cursorHoldsMutex(pCur) );

	625

	626 if( pCur->curIntKey ){

	627 /* Only the rowid is required for a table btree */

	628 pCur->nKey = sqlite3BtreeIntegerKey(pCur);

	629 }else{

	630 /* For an index btree, save the complete key content */

	631 void *pKey;

	632 pCur->nKey = sqlite3BtreePayloadSize(pCur);

	633 pKey = sqlite3Malloc( pCur->nKey );

	634 if( pKey ){

	635 rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);

	636 if( rc==SQLITE_OK ){

	637 pCur->pKey = pKey;

	638 }else{

	639 sqlite3_free(pKey);

	640 }

	641 }else{

	642 rc = SQLITE_NOMEM_BKPT;

	643 }

	644 }

	645 assert( !pCur->curIntKey \|\| !pCur->pKey );

	646 return rc;

	647 }

	648

	649 /*

	650 ** Save the current cursor position in the variables BtCursor.nKey

	651 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.

	652 **

	653 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)

	654 ** prior to calling this routine.

	655 */

	656 static int saveCursorPosition(BtCursor *pCur){

	657 int rc;

	658

	659 assert( CURSOR_VALID==pCur->eState \|\| CURSOR_SKIPNEXT==pCur->eState );

	660 assert( 0==pCur->pKey );

	661 assert( cursorHoldsMutex(pCur) );

	662

	663 if( pCur->eState==CURSOR_SKIPNEXT ){

	664 pCur->eState = CURSOR_VALID;

	665 }else{

	666 pCur->skipNext = 0;

	667 }

	668

	669 rc = saveCursorKey(pCur);

	670 if( rc==SQLITE_OK ){

	671 btreeReleaseAllCursorPages(pCur);

	672 pCur->eState = CURSOR_REQUIRESEEK;

	673 }

	674

	675 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl\|BTCF_AtLast);

	676 return rc;

	677 }

	678

	679 /* Forward reference */

	680 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor,Pgno,BtCursor);

	681

	682 /*

	683 ** Save the positions of all cursors (except pExcept) that are open on

	684 ** the table with root-page iRoot. "Saving the cursor position" means that

	685 ** the location in the btree is remembered in such a way that it can be

	686 ** moved back to the same spot after the btree has been modified. This

	687 ** routine is called just before cursor pExcept is used to modify the

	688 ** table, for example in BtreeDelete() or BtreeInsert().

	689 **

	690 ** If there are two or more cursors on the same btree, then all such

	691 ** cursors should have their BTCF_Multiple flag set. The btreeCursor()

	692 ** routine enforces that rule. This routine only needs to be called in

	693 ** the uncommon case when pExpect has the BTCF_Multiple flag set.

	694 **

	695 ** If pExpect!=NULL and if no other cursors are found on the same root-page,

	696 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another

	697 ** pointless call to this routine.

	698 **

	699 ** Implementation note: This routine merely checks to see if any cursors

	700 ** need to be saved. It calls out to saveCursorsOnList() in the (unusual)

	701 ** event that cursors are in need to being saved.

	702 */

	703 static int saveAllCursors(BtShared pBt, Pgno iRoot, BtCursor pExcept){

	704 BtCursor *p;

	705 assert( sqlite3_mutex_held(pBt->mutex) );

	706 assert( pExcept==0 \|\| pExcept->pBt==pBt );

	707 for(p=pBt->pCursor; p; p=p->pNext){

	708 if( p!=pExcept && (0==iRoot \|\| p->pgnoRoot==iRoot) ) break;

	709 }

	710 if( p ) return saveCursorsOnList(p, iRoot, pExcept);

	711 if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;

	712 return SQLITE_OK;

	713 }

	714

	715 /* This helper routine to saveAllCursors does the actual work of saving

	716 ** the cursors if and when a cursor is found that actually requires saving.

	717 ** The common case is that no cursors need to be saved, so this routine is

	718 ** broken out from its caller to avoid unnecessary stack pointer movement.

	719 */

	720 static int SQLITE_NOINLINE saveCursorsOnList(

	721 BtCursor p, / The first cursor that needs saving */

	722 Pgno iRoot, /* Only save cursor with this iRoot. Save all if zero */

	723 BtCursor pExcept / Do not save this cursor */

	724 ){

	725 do{

	726 if( p!=pExcept && (0==iRoot \|\| p->pgnoRoot==iRoot) ){

	727 if( p->eState==CURSOR_VALID \|\| p->eState==CURSOR_SKIPNEXT ){

	728 int rc = saveCursorPosition(p);

	729 if( SQLITE_OK!=rc ){

	730 return rc;

	731 }

	732 }else{

	733 testcase( p->iPage>0 );

	734 btreeReleaseAllCursorPages(p);

	735 }

	736 }

	737 p = p->pNext;

	738 }while( p );

	739 return SQLITE_OK;

	740 }

	741

	742 /*

	743 ** Clear the current cursor position.

	744 */

	745 void sqlite3BtreeClearCursor(BtCursor *pCur){

	746 assert( cursorHoldsMutex(pCur) );

	747 sqlite3_free(pCur->pKey);

	748 pCur->pKey = 0;

	749 pCur->eState = CURSOR_INVALID;

	750 }

	751

	752 /*

	753 ** In this version of BtreeMoveto, pKey is a packed index record

	754 ** such as is generated by the OP_MakeRecord opcode. Unpack the

	755 ** record and then call BtreeMovetoUnpacked() to do the work.

	756 */

	757 static int btreeMoveto(

	758 BtCursor pCur, / Cursor open on the btree to be searched */

	759 const void pKey, / Packed key if the btree is an index */

	760 i64 nKey, /* Integer key for tables. Size of pKey for indices */

	761 int bias, /* Bias search to the high end */

	762 int pRes / Write search results here */

	763 ){

	764 int rc; /* Status code */

	765 UnpackedRecord pIdxKey; / Unpacked index key */

	766

	767 if( pKey ){

	768 assert( nKey==(i64)(int)nKey );

	769 pIdxKey = sqlite3VdbeAllocUnpackedRecord(pCur->pKeyInfo);

	770 if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;

	771 sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);

	772 if( pIdxKey->nField==0 ){

	773 rc = SQLITE_CORRUPT_BKPT;

	774 goto moveto_done;

	775 }

	776 }else{

	777 pIdxKey = 0;

	778 }

	779 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);

	780 moveto_done:

	781 if( pIdxKey ){

	782 sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);

	783 }

	784 return rc;

	785 }

	786

	787 /*

	788 ** Restore the cursor to the position it was in (or as close to as possible)

	789 ** when saveCursorPosition() was called. Note that this call deletes the

	790 ** saved position info stored by saveCursorPosition(), so there can be

	791 ** at most one effective restoreCursorPosition() call after each

	792 ** saveCursorPosition().

	793 */

	794 static int btreeRestoreCursorPosition(BtCursor *pCur){

	795 int rc;

	796 int skipNext;

	797 assert( cursorOwnsBtShared(pCur) );

	798 assert( pCur->eState>=CURSOR_REQUIRESEEK );

	799 if( pCur->eState==CURSOR_FAULT ){

	800 return pCur->skipNext;

	801 }

	802 pCur->eState = CURSOR_INVALID;

	803 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);

	804 if( rc==SQLITE_OK ){

	805 sqlite3_free(pCur->pKey);

	806 pCur->pKey = 0;

	807 assert( pCur->eState==CURSOR_VALID \|\| pCur->eState==CURSOR_INVALID );

	808 pCur->skipNext \|= skipNext;

	809 if( pCur->skipNext && pCur->eState==CURSOR_VALID ){

	810 pCur->eState = CURSOR_SKIPNEXT;

	811 }

	812 }

	813 return rc;

	814 }

	815

	/*
	** Restore cursor p only if its state is CURSOR_REQUIRESEEK or higher;
	** otherwise evaluate to SQLITE_OK without calling anything.  The argument
	** is parenthesized so that any pointer expression may be passed.  Note
	** that p is evaluated more than once.
	*/
	#define restoreCursorPosition(p) \
	  ((p)->eState>=CURSOR_REQUIRESEEK ? \
	         btreeRestoreCursorPosition(p) : \
	         SQLITE_OK)

	820

	821 /*

	822 ** Determine whether or not a cursor has moved from the position where

	823 ** it was last placed, or has been invalidated for any other reason.

	824 ** Cursors can move when the row they are pointing at is deleted out

	825 ** from under them, for example. Cursor might also move if a btree

	826 ** is rebalanced.

	827 **

	828 ** Calling this routine with a NULL cursor pointer returns false.

	829 **

	830 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor

	831 ** back to where it ought to be if this routine returns true.

	832 */

	833 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){

	834 return pCur->eState!=CURSOR_VALID;

	835 }

	836

	837 /*

	838 ** This routine restores a cursor back to its original position after it

	839 ** has been moved by some outside activity (such as a btree rebalance or

	840 ** a row having been deleted out from under the cursor).

	841 **

	842 ** On success, the *pDifferentRow parameter is false if the cursor is left

	843 ** pointing at exactly the same row. *pDifferntRow is the row the cursor

	844 ** was pointing to has been deleted, forcing the cursor to point to some

	845 ** nearby row.

	846 **

	847 ** This routine should only be called for a cursor that just returned

	848 ** TRUE from sqlite3BtreeCursorHasMoved().

	849 */

	850 int sqlite3BtreeCursorRestore(BtCursor pCur, int pDifferentRow){

	851 int rc;

	852

	853 assert( pCur!=0 );

	854 assert( pCur->eState!=CURSOR_VALID );

	855 rc = restoreCursorPosition(pCur);

	856 if( rc ){

	857 *pDifferentRow = 1;

	858 return rc;

	859 }

	860 if( pCur->eState!=CURSOR_VALID ){

	861 *pDifferentRow = 1;

	862 }else{

	863 assert( pCur->skipNext==0 );

	864 *pDifferentRow = 0;

	865 }

	866 return SQLITE_OK;

	867 }

	868

	#ifdef SQLITE_ENABLE_CURSOR_HINTS
	/*
	** Pass a hint to the cursor.  The eHintType argument selects the hint and
	** thereby the number and types of the variadic arguments that follow; see
	** the definitions of the BTREE_HINT_* macros for details.
	**
	** This implementation ignores every hint.  The entry point is used only
	** by systems that substitute their own storage engine.
	*/
	void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
	  (void)pCur;
	  (void)eHintType;
	  return;
	}
	#endif

	879

	880 /*

	881 ** Provide flag hints to the cursor.

	882 */

	883 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){

	884 assert( x==BTREE_SEEK_EQ \|\| x==BTREE_BULKLOAD \|\| x==0 );

	885 pCur->hints = x;

	886 }

	887

	888

	889 #ifndef SQLITE_OMIT_AUTOVACUUM

	890 /*

	891 ** Given a page number of a regular database page, return the page

	892 ** number for the pointer-map page that contains the entry for the

	893 ** input page number.

	894 **

	895 ** Return 0 (not a valid page) for pgno==1 since there is

	896 ** no pointer map associated with page 1. The integrity_check logic

	897 ** requires that ptrmapPageno(*,1)!=1.

	898 */

	899 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){

	900 int nPagesPerMapPage;

	901 Pgno iPtrMap, ret;

	902 assert( sqlite3_mutex_held(pBt->mutex) );

	903 if( pgno<2 ) return 0;

	904 nPagesPerMapPage = (pBt->usableSize/5)+1;

	905 iPtrMap = (pgno-2)/nPagesPerMapPage;

	906 ret = (iPtrMap*nPagesPerMapPage) + 2;

	907 if( ret==PENDING_BYTE_PAGE(pBt) ){

	908 ret++;

	909 }

	910 return ret;

	911 }

	912

	913 /*

	914 ** Write an entry into the pointer map.

	915 **

	916 ** This routine updates the pointer map entry for page number 'key'

	917 ** so that it maps to type 'eType' and parent page number 'pgno'.

	918 **

	919 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is

	920 ** a no-op. If an error occurs, the appropriate error code is written

	921 ** into *pRC.

	922 */

	923 static void ptrmapPut(BtShared pBt, Pgno key, u8 eType, Pgno parent, int pRC){

	924 DbPage pDbPage; / The pointer map page */

	925 u8 pPtrmap; / The pointer map data */

	926 Pgno iPtrmap; /* The pointer map page number */

	927 int offset; /* Offset in pointer map page */

	928 int rc; /* Return code from subfunctions */

	929

	930 if( *pRC ) return;

	931

	932 assert( sqlite3_mutex_held(pBt->mutex) );

	933 /* The master-journal page number must never be used as a pointer map page */

	934 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );

	935

	936 assert( pBt->autoVacuum );

	937 if( key==0 ){

	938 *pRC = SQLITE_CORRUPT_BKPT;

	939 return;

	940 }

	941 iPtrmap = PTRMAP_PAGENO(pBt, key);

	942 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);

	943 if( rc!=SQLITE_OK ){

	944 *pRC = rc;

	945 return;

	946 }

	947 offset = PTRMAP_PTROFFSET(iPtrmap, key);

	948 if( offset<0 ){

	949 *pRC = SQLITE_CORRUPT_BKPT;

	950 goto ptrmap_exit;

	951 }

	952 assert( offset <= (int)pBt->usableSize-5 );

	953 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);

	954

	955 if( eType!=pPtrmap[offset] \|\| get4byte(&pPtrmap[offset+1])!=parent ){

	956 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));

	957 *pRC= rc = sqlite3PagerWrite(pDbPage);

	958 if( rc==SQLITE_OK ){

	959 pPtrmap[offset] = eType;

	960 put4byte(&pPtrmap[offset+1], parent);

	961 }

	962 }

	963

	964 ptrmap_exit:

	965 sqlite3PagerUnref(pDbPage);

	966 }

	967

	968 /*

	969 ** Read an entry from the pointer map.

	970 **

	971 ** This routine retrieves the pointer map entry for page 'key', writing

	972 ** the type and parent page number to pEType and pPgno respectively.

	973 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.

	974 */

	975 static int ptrmapGet(BtShared pBt, Pgno key, u8 pEType, Pgno *pPgno){

	976 DbPage pDbPage; / The pointer map page */

	977 int iPtrmap; /* Pointer map page index */

	978 u8 pPtrmap; / Pointer map page data */

	979 int offset; /* Offset of entry in pointer map */

	980 int rc;

	981

	982 assert( sqlite3_mutex_held(pBt->mutex) );

	983

	984 iPtrmap = PTRMAP_PAGENO(pBt, key);

	985 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);

	986 if( rc!=0 ){

	987 return rc;

	988 }

	989 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);

	990

	991 offset = PTRMAP_PTROFFSET(iPtrmap, key);

	992 if( offset<0 ){

	993 sqlite3PagerUnref(pDbPage);

	994 return SQLITE_CORRUPT_BKPT;

	995 }

	996 assert( offset <= (int)pBt->usableSize-5 );

	997 assert( pEType!=0 );

	998 *pEType = pPtrmap[offset];

	999 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);

	1000

	1001 sqlite3PagerUnref(pDbPage);

	1002 if( pEType<1 \|\| pEType>5 ) return SQLITE_CORRUPT_BKPT;

	1003 return SQLITE_OK;

	1004 }

	1005

	1006 #else /* if defined SQLITE_OMIT_AUTOVACUUM */

	/* Stubs used when SQLITE_OMIT_AUTOVACUUM is defined: ptrmapPut() and
	** ptrmapPutOvflPtr() expand to nothing, and ptrmapGet() expands to
	** SQLITE_OK without touching its arguments. */
	1007 #define ptrmapPut(w,x,y,z,rc)

	1008 #define ptrmapGet(w,x,y,z) SQLITE_OK

	1009 #define ptrmapPutOvflPtr(x, y, rc)

	1010 #endif

	1011

	/*
	** findCell(P,I) yields a pointer to the content of cell number I on
	** btree page P, where I==0 is the first cell on the page, I==1 is the
	** second, and so forth.  The cell offset is read from the I-th 2-byte
	** entry of P->aCellIdx[], masked with P->maskPage, and used as an index
	** into P->aData[].
	**
	** findCellPastPtr(P,I) does the same except that it indexes
	** P->aDataOfst[], which skips past the initial 4-byte child pointer
	** found on interior pages, if there is one.
	**
	** These macros work only for pages that do not contain overflow cells.
	** Both arguments are evaluated more than once.
	*/
	#define findCell(P,I) \
	  (&(P)->aData[(P)->maskPage & get2byteAligned(&(P)->aCellIdx[(I)*2])])
	#define findCellPastPtr(P,I) \
	  (&(P)->aDataOfst[(P)->maskPage & get2byteAligned(&(P)->aCellIdx[(I)*2])])

	1026

	1027

	1028 /*

	1029 ** This is common tail processing for btreeParseCellPtr() and

	1030 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely

	1031 ** on a single B-tree page. Make necessary adjustments to the CellInfo

	1032 ** structure.

	1033 */

	1034 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(

	1035 MemPage pPage, / Page containing the cell */

	1036 u8 pCell, / Pointer to the cell text. */

	1037 CellInfo pInfo / Fill in this structure */

	1038 ){

	1039 /* If the payload will not fit completely on the local page, we have

	1040 ** to decide how much to store locally and how much to spill onto

	1041 ** overflow pages. The strategy is to minimize the amount of unused

	1042 ** space on overflow pages while keeping the amount of local storage

	1043 ** in between minLocal and maxLocal.

	1044 **

	1045 ** Warning: changing the way overflow payload is distributed in any

	1046 ** way will result in an incompatible file format.

	1047 */

	1048 int minLocal; /* Minimum amount of payload held locally */

	1049 int maxLocal; /* Maximum amount of payload held locally */

	1050 int surplus; /* Overflow payload available for local storage */

	1051

	1052 minLocal = pPage->minLocal;

	1053 maxLocal = pPage->maxLocal;

	1054 surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);

	1055 testcase( surplus==maxLocal );

	1056 testcase( surplus==maxLocal+1 );

	1057 if( surplus <= maxLocal ){

	1058 pInfo->nLocal = (u16)surplus;

	1059 }else{

	1060 pInfo->nLocal = (u16)minLocal;

	1061 }

	1062 pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;

	1063 }

	1064

	1065 /*

	1066 ** The following routines are implementations of the MemPage.xParseCell()

	1067 ** method.

	1068 **

	1069 ** Parse a cell content block and fill in the CellInfo structure.

	1070 **

	1071 ** btreeParseCellPtr() => table btree leaf nodes

	1072 ** btreeParseCellNoPayload() => table btree internal nodes

	1073 ** btreeParseCellPtrIndex() => index btree nodes

	1074 **

	1075 ** There is also a wrapper function btreeParseCell() that works for

	1076 ** all MemPage types and that references the cell by index rather than

	1077 ** by pointer.

	1078 */

	1079 static void btreeParseCellPtrNoPayload(

	1080 MemPage pPage, / Page containing the cell */

	1081 u8 pCell, / Pointer to the cell text. */

	1082 CellInfo pInfo / Fill in this structure */

	1083 ){

	1084 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1085 assert( pPage->leaf==0 );

	1086 assert( pPage->childPtrSize==4 );

	1087 #ifndef SQLITE_DEBUG

	1088 UNUSED_PARAMETER(pPage);

	1089 #endif

	1090 pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);

	1091 pInfo->nPayload = 0;

	1092 pInfo->nLocal = 0;

	1093 pInfo->pPayload = 0;

	1094 return;

	1095 }

	1096 static void btreeParseCellPtr(

	1097 MemPage pPage, / Page containing the cell */

	1098 u8 pCell, / Pointer to the cell text. */

	1099 CellInfo pInfo / Fill in this structure */

	1100 ){

	1101 u8 pIter; / For scanning through pCell */

	1102 u32 nPayload; /* Number of bytes of cell payload */

	1103 u64 iKey; /* Extracted Key value */

	1104

	1105 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1106 assert( pPage->leaf==0 \|\| pPage->leaf==1 );

	1107 assert( pPage->intKeyLeaf );

	1108 assert( pPage->childPtrSize==0 );

	1109 pIter = pCell;

	1110

	1111 /* The next block of code is equivalent to:

	1112 **

	1113 ** pIter += getVarint32(pIter, nPayload);

	1114 **

	1115 ** The code is inlined to avoid a function call.

	1116 */

	1117 nPayload = *pIter;

	1118 if( nPayload>=0x80 ){

	1119 u8 *pEnd = &pIter[8];

	1120 nPayload &= 0x7f;

	1121 do{

	1122 nPayload = (nPayload<<7) \| (*++pIter & 0x7f);

	1123 }while( (*pIter)>=0x80 && pIter<pEnd );

	1124 }

	1125 pIter++;

	1126

	1127 /* The next block of code is equivalent to:

	1128 **

	1129 ** pIter += getVarint(pIter, (u64*)&pInfo->nKey);

	1130 **

	1131 ** The code is inlined to avoid a function call.

	1132 */

	1133 iKey = *pIter;

	1134 if( iKey>=0x80 ){

	1135 u8 *pEnd = &pIter[7];

	1136 iKey &= 0x7f;

	1137 while(1){

	1138 iKey = (iKey<<7) \| (*++pIter & 0x7f);

	1139 if( (*pIter)<0x80 ) break;

	1140 if( pIter>=pEnd ){

	1141 iKey = (iKey<<8) \| *++pIter;

	1142 break;

	1143 }

	1144 }

	1145 }

	1146 pIter++;

	1147

	1148 pInfo->nKey = (i64)&iKey;

	1149 pInfo->nPayload = nPayload;

	1150 pInfo->pPayload = pIter;

	1151 testcase( nPayload==pPage->maxLocal );

	1152 testcase( nPayload==pPage->maxLocal+1 );

	1153 if( nPayload<=pPage->maxLocal ){

	1154 /* This is the (easy) common case where the entire payload fits

	1155 ** on the local page. No overflow is required.

	1156 */

	1157 pInfo->nSize = nPayload + (u16)(pIter - pCell);

	1158 if( pInfo->nSize<4 ) pInfo->nSize = 4;

	1159 pInfo->nLocal = (u16)nPayload;

	1160 }else{

	1161 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);

	1162 }

	1163 }

	1164 static void btreeParseCellPtrIndex(

	1165 MemPage pPage, / Page containing the cell */

	1166 u8 pCell, / Pointer to the cell text. */

	1167 CellInfo pInfo / Fill in this structure */

	1168 ){

	1169 u8 pIter; / For scanning through pCell */

	1170 u32 nPayload; /* Number of bytes of cell payload */

	1171

	1172 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1173 assert( pPage->leaf==0 \|\| pPage->leaf==1 );

	1174 assert( pPage->intKeyLeaf==0 );

	1175 pIter = pCell + pPage->childPtrSize;

	1176 nPayload = *pIter;

	1177 if( nPayload>=0x80 ){

	1178 u8 *pEnd = &pIter[8];

	1179 nPayload &= 0x7f;

	1180 do{

	1181 nPayload = (nPayload<<7) \| (*++pIter & 0x7f);

	1182 }while( *(pIter)>=0x80 && pIter<pEnd );

	1183 }

	1184 pIter++;

	1185 pInfo->nKey = nPayload;

	1186 pInfo->nPayload = nPayload;

	1187 pInfo->pPayload = pIter;

	1188 testcase( nPayload==pPage->maxLocal );

	1189 testcase( nPayload==pPage->maxLocal+1 );

	1190 if( nPayload<=pPage->maxLocal ){

	1191 /* This is the (easy) common case where the entire payload fits

	1192 ** on the local page. No overflow is required.

	1193 */

	1194 pInfo->nSize = nPayload + (u16)(pIter - pCell);

	1195 if( pInfo->nSize<4 ) pInfo->nSize = 4;

	1196 pInfo->nLocal = (u16)nPayload;

	1197 }else{

	1198 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);

	1199 }

	1200 }

	1201 static void btreeParseCell(

	1202 MemPage pPage, / Page containing the cell */

	1203 int iCell, /* The cell index. First cell is 0 */

	1204 CellInfo pInfo / Fill in this structure */

	1205 ){

	1206 pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);

	1207 }

	1208

	1209 /*

	1210 ** The following routines are implementations of the MemPage.xCellSize

	1211 ** method.

	1212 **

	1213 ** Compute the total number of bytes that a Cell needs in the cell

	1214 ** data area of the btree-page. The return number includes the cell

	1215 ** data header and the local payload, but not any overflow page or

	1216 ** the space used by the cell pointer.

	1217 **

	1218 ** cellSizePtrNoPayload() => table internal nodes

	1219 ** cellSizePtr() => all index nodes & table leaf nodes

	1220 */

	1221 static u16 cellSizePtr(MemPage pPage, u8 pCell){

	1222 u8 pIter = pCell + pPage->childPtrSize; / For looping over bytes of pCell */

	1223 u8 pEnd; / End mark for a varint */

	1224 u32 nSize; /* Size value to return */

	1225

	1226 #ifdef SQLITE_DEBUG

	1227 /* The value returned by this function should always be the same as

	1228 ** the (CellInfo.nSize) value found by doing a full parse of the

	1229 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of

	1230 ** this function verifies that this invariant is not violated. */

	1231 CellInfo debuginfo;

	1232 pPage->xParseCell(pPage, pCell, &debuginfo);

	1233 #endif

	1234

	1235 nSize = *pIter;

	1236 if( nSize>=0x80 ){

	1237 pEnd = &pIter[8];

	1238 nSize &= 0x7f;

	1239 do{

	1240 nSize = (nSize<<7) \| (*++pIter & 0x7f);

	1241 }while( *(pIter)>=0x80 && pIter<pEnd );

	1242 }

	1243 pIter++;

	1244 if( pPage->intKey ){

	1245 /* pIter now points at the 64-bit integer key value, a variable length

	1246 ** integer. The following block moves pIter to point at the first byte

	1247 ** past the end of the key value. */

	1248 pEnd = &pIter[9];

	1249 while( (*pIter++)&0x80 && pIter<pEnd );

	1250 }

	1251 testcase( nSize==pPage->maxLocal );

	1252 testcase( nSize==pPage->maxLocal+1 );

	1253 if( nSize<=pPage->maxLocal ){

	1254 nSize += (u32)(pIter - pCell);

	1255 if( nSize<4 ) nSize = 4;

	1256 }else{

	1257 int minLocal = pPage->minLocal;

	1258 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);

	1259 testcase( nSize==pPage->maxLocal );

	1260 testcase( nSize==pPage->maxLocal+1 );

	1261 if( nSize>pPage->maxLocal ){

	1262 nSize = minLocal;

	1263 }

	1264 nSize += 4 + (u16)(pIter - pCell);

	1265 }

	1266 assert( nSize==debuginfo.nSize \|\| CORRUPT_DB );

	1267 return (u16)nSize;

	1268 }

	1269 static u16 cellSizePtrNoPayload(MemPage pPage, u8 pCell){

	1270 u8 pIter = pCell + 4; / For looping over bytes of pCell */

	1271 u8 pEnd; / End mark for a varint */

	1272

	1273 #ifdef SQLITE_DEBUG

	1274 /* The value returned by this function should always be the same as

	1275 ** the (CellInfo.nSize) value found by doing a full parse of the

	1276 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of

	1277 ** this function verifies that this invariant is not violated. */

	1278 CellInfo debuginfo;

	1279 pPage->xParseCell(pPage, pCell, &debuginfo);

	1280 #else

	1281 UNUSED_PARAMETER(pPage);

	1282 #endif

	1283

	1284 assert( pPage->childPtrSize==4 );

	1285 pEnd = pIter + 9;

	1286 while( (*pIter++)&0x80 && pIter<pEnd );

	1287 assert( debuginfo.nSize==(u16)(pIter - pCell) \|\| CORRUPT_DB );

	1288 return (u16)(pIter - pCell);

	1289 }

	1290

	1291

	#ifdef SQLITE_DEBUG
	/* Index-based variant of cellSizePtr(): look up cell iCell on pPage and
	** return the size reported by the page's xCellSize method.  Used inside
	** of assert() statements only. */
	static u16 cellSize(MemPage *pPage, int iCell){
	  u8 *pCell = findCell(pPage, iCell);
	  return pPage->xCellSize(pPage, pCell);
	}
	#endif

	1299

	1300 #ifndef SQLITE_OMIT_AUTOVACUUM

	1301 /*

	1302 ** If the cell pCell, part of page pPage contains a pointer

	1303 ** to an overflow page, insert an entry into the pointer-map

	1304 ** for the overflow page.

	1305 */

	1306 static void ptrmapPutOvflPtr(MemPage pPage, u8 pCell, int *pRC){

	1307 CellInfo info;

	1308 if( *pRC ) return;

	1309 assert( pCell!=0 );

	1310 pPage->xParseCell(pPage, pCell, &info);

	1311 if( info.nLocal<info.nPayload ){

	1312 Pgno ovfl = get4byte(&pCell[info.nSize-4]);

	1313 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);

	1314 }

	1315 }

	1316 #endif

	1317

	1318

	1319 /*

	1320 ** Defragment the page given. All Cells are moved to the

	1321 ** end of the page and all free space is collected into one

	1322 ** big FreeBlk that occurs in between the header and cell

	1323 ** pointer array and the cell content area.

	1324 **

	1325 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a

	1326 ** b-tree page so that there are no freeblocks or fragment bytes, all

	1327 ** unused bytes are contained in the unallocated space region, and all

	1328 ** cells are packed tightly at the end of the page.

	1329 */

	1330 static int defragmentPage(MemPage *pPage){

	1331 int i; /* Loop counter */

	1332 int pc; /* Address of the i-th cell */

	1333 int hdr; /* Offset to the page header */

	1334 int size; /* Size of a cell */

	1335 int usableSize; /* Number of usable bytes on a page */

	1336 int cellOffset; /* Offset to the cell pointer array */

	1337 int cbrk; /* Offset to the cell content area */

	1338 int nCell; /* Number of cells on the page */

	1339 unsigned char data; / The page data */

	1340 unsigned char temp; / Temp area for cell content */

	1341 unsigned char src; / Source of content */

	1342 int iCellFirst; /* First allowable cell index */

	1343 int iCellLast; /* Last possible cell index */

	1344

	1345

	1346 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	1347 assert( pPage->pBt!=0 );

	1348 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );

	1349 assert( pPage->nOverflow==0 );

	1350 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1351 temp = 0;

	1352 src = data = pPage->aData;

	1353 hdr = pPage->hdrOffset;

	1354 cellOffset = pPage->cellOffset;

	1355 nCell = pPage->nCell;

	1356 assert( nCell==get2byte(&data[hdr+3]) );

	1357 usableSize = pPage->pBt->usableSize;

	1358 cbrk = usableSize;

	1359 iCellFirst = cellOffset + 2*nCell;

	1360 iCellLast = usableSize - 4;

	1361 for(i=0; i<nCell; i++){

	1362 u8 pAddr; / The i-th cell pointer */

	1363 pAddr = &data[cellOffset + i*2];

	1364 pc = get2byte(pAddr);

	1365 testcase( pc==iCellFirst );

	1366 testcase( pc==iCellLast );

	1367 /* These conditions have already been verified in btreeInitPage()

	1368 ** if PRAGMA cell_size_check=ON.

	1369 */

	1370 if( pc<iCellFirst \|\| pc>iCellLast ){

	1371 return SQLITE_CORRUPT_BKPT;

	1372 }

	1373 assert( pc>=iCellFirst && pc<=iCellLast );

	1374 size = pPage->xCellSize(pPage, &src[pc]);

	1375 cbrk -= size;

	1376 if( cbrk<iCellFirst \|\| pc+size>usableSize ){

	1377 return SQLITE_CORRUPT_BKPT;

	1378 }

	1379 assert( cbrk+size<=usableSize && cbrk>=iCellFirst );

	1380 testcase( cbrk+size==usableSize );

	1381 testcase( pc+size==usableSize );

	1382 put2byte(pAddr, cbrk);

	1383 if( temp==0 ){

	1384 int x;

	1385 if( cbrk==pc ) continue;

	1386 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);

	1387 x = get2byte(&data[hdr+5]);

	1388 memcpy(&temp[x], &data[x], (cbrk+size) - x);

	1389 src = temp;

	1390 }

	1391 memcpy(&data[cbrk], &src[pc], size);

	1392 }

	1393 assert( cbrk>=iCellFirst );

	1394 put2byte(&data[hdr+5], cbrk);

	1395 data[hdr+1] = 0;

	1396 data[hdr+2] = 0;

	1397 data[hdr+7] = 0;

	1398 memset(&data[iCellFirst], 0, cbrk-iCellFirst);

	1399 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	1400 if( cbrk-iCellFirst!=pPage->nFree ){

	1401 return SQLITE_CORRUPT_BKPT;

	1402 }

	1403 return SQLITE_OK;

	1404 }

	1405

	1406 /*

	1407 ** Search the free-list on page pPg for space to store a cell nByte bytes in

	1408 ** size. If one can be found, return a pointer to the space and remove it

	1409 ** from the free-list.

	1410 **

	1411 ** If no suitable space can be found on the free-list, return NULL.

	1412 **

	1413 ** This function may detect corruption within pPg. If corruption is

	1414 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.

	1415 **

	1416 ** Slots on the free list that are between 1 and 3 bytes larger than nByte

	1417 ** will be ignored if adding the extra space to the fragmentation count

	1418 ** causes the fragmentation count to exceed 60.

	1419 */

	1420 static u8 pageFindSlot(MemPage pPg, int nByte, int *pRc){

	1421 const int hdr = pPg->hdrOffset;

	1422 u8 * const aData = pPg->aData;

	1423 int iAddr = hdr + 1;

	1424 int pc = get2byte(&aData[iAddr]);

	1425 int x;

	1426 int usableSize = pPg->pBt->usableSize;

	1427

	1428 assert( pc>0 );

	1429 do{

	1430 int size; /* Size of the free slot */

	1431 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of

	1432 ** increasing offset. */

	1433 if( pc>usableSize-4 \|\| pc<iAddr+4 ){

	1434 *pRc = SQLITE_CORRUPT_BKPT;

	1435 return 0;

	1436 }

	1437 /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each

	1438 ** freeblock form a big-endian integer which is the size of the freeblock

	1439 ** in bytes, including the 4-byte header. */

	1440 size = get2byte(&aData[pc+2]);

	1441 if( (x = size - nByte)>=0 ){

	1442 testcase( x==4 );

	1443 testcase( x==3 );

	1444 if( pc < pPg->cellOffset+2*pPg->nCell \|\| size+pc > usableSize ){

	1445 *pRc = SQLITE_CORRUPT_BKPT;

	1446 return 0;

	1447 }else if( x<4 ){

	1448 /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total

	1449 ** number of bytes in fragments may not exceed 60. */

	1450 if( aData[hdr+7]>57 ) return 0;

	1451

	1452 /* Remove the slot from the free-list. Update the number of

	1453 ** fragmented bytes within the page. */

	1454 memcpy(&aData[iAddr], &aData[pc], 2);

	1455 aData[hdr+7] += (u8)x;

	1456 }else{

	1457 /* The slot remains on the free-list. Reduce its size to account

	1458 ** for the portion used by the new allocation. */

	1459 put2byte(&aData[pc+2], x);

	1460 }

	1461 return &aData[pc + x];

	1462 }

	1463 iAddr = pc;

	1464 pc = get2byte(&aData[pc]);

	1465 }while( pc );

	1466

	1467 return 0;

	1468 }

	1469

	1470 /*

	1471 ** Allocate nByte bytes of space from within the B-Tree page passed

	1472 ** as the first argument. Write into *pIdx the index into pPage->aData[]

	1473 ** of the first byte of allocated space. Return either SQLITE_OK or

	1474 ** an error code (usually SQLITE_CORRUPT).

	1475 **

	1476 ** The caller guarantees that there is sufficient space to make the

	1477 ** allocation. This routine might need to defragment in order to bring

	1478 ** all the space together, however. This routine will avoid using

	1479 ** the first two bytes past the cell pointer area since presumably this

	1480 ** allocation is being made in order to insert a new cell, so we will

	1481 ** also end up needing a new cell pointer.

	1482 */

	1483 static int allocateSpace(MemPage pPage, int nByte, int pIdx){

	1484 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */

	1485 u8 * const data = pPage->aData; /* Local cache of pPage->aData */

	1486 int top; /* First byte of cell content area */

	1487 int rc = SQLITE_OK; /* Integer return code */

	1488 int gap; /* First byte of gap between cell pointers and cell content */

	1489

	1490 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	1491 assert( pPage->pBt );

	1492 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1493 assert( nByte>=0 ); /* Minimum cell size is 4 */

	1494 assert( pPage->nFree>=nByte );

	1495 assert( pPage->nOverflow==0 );

	1496 assert( nByte < (int)(pPage->pBt->usableSize-8) );

	1497

	1498 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );

	1499 gap = pPage->cellOffset + 2*pPage->nCell;

	1500 assert( gap<=65536 );

	1501 /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size

	1502 ** and the reserved space is zero (the usual value for reserved space)

	1503 ** then the cell content offset of an empty page wants to be 65536.

	1504 ** However, that integer is too large to be stored in a 2-byte unsigned

	1505 ** integer, so a value of 0 is used in its place. */

	1506 top = get2byte(&data[hdr+5]);

	1507 assert( top<=(int)pPage->pBt->usableSize ); /* Prevent by getAndInitPage() */

	1508 if( gap>top ){

	1509 if( top==0 && pPage->pBt->usableSize==65536 ){

	1510 top = 65536;

	1511 }else{

	1512 return SQLITE_CORRUPT_BKPT;

	1513 }

	1514 }

	1515

	1516 /* If there is enough space between gap and top for one more cell pointer

	1517 ** array entry offset, and if the freelist is not empty, then search the

	1518 ** freelist looking for a free slot big enough to satisfy the request.

	1519 */

	1520 testcase( gap+2==top );

	1521 testcase( gap+1==top );

	1522 testcase( gap==top );

	1523 if( (data[hdr+2] \|\| data[hdr+1]) && gap+2<=top ){

	1524 u8 *pSpace = pageFindSlot(pPage, nByte, &rc);

	1525 if( pSpace ){

	1526 assert( pSpace>=data && (pSpace - data)<65536 );

	1527 *pIdx = (int)(pSpace - data);

	1528 return SQLITE_OK;

	1529 }else if( rc ){

	1530 return rc;

	1531 }

	1532 }

	1533

	1534 /* The request could not be fulfilled using a freelist slot. Check

	1535 ** to see if defragmentation is necessary.

	1536 */

	1537 testcase( gap+2+nByte==top );

	1538 if( gap+2+nByte>top ){

	1539 assert( pPage->nCell>0 \|\| CORRUPT_DB );

	1540 rc = defragmentPage(pPage);

	1541 if( rc ) return rc;

	1542 top = get2byteNotZero(&data[hdr+5]);

	1543 assert( gap+nByte<=top );

	1544 }

	1545

	1546

	1547 /* Allocate memory from the gap in between the cell pointer array

	1548 ** and the cell content area. The btreeInitPage() call has already

	1549 ** validated the freelist. Given that the freelist is valid, there

	1550 ** is no way that the allocation can extend off the end of the page.

	1551 ** The assert() below verifies the previous sentence.

	1552 */

	1553 top -= nByte;

	1554 put2byte(&data[hdr+5], top);

	1555 assert( top+nByte <= (int)pPage->pBt->usableSize );

	1556 *pIdx = top;

	1557 return SQLITE_OK;

	1558 }

	1559

	1560 /*

	1561 ** Return a section of the pPage->aData to the freelist.

	1562 ** The first byte of the new free block is pPage->aData[iStart]

	1563 ** and the size of the block is iSize bytes.

	1564 **

	1565 ** Adjacent freeblocks are coalesced.

	1566 **

	1567 ** Note that even though the freeblock list was checked by btreeInitPage(),

	1568 ** that routine will not detect overlap between cells or freeblocks. Nor

	1569 ** does it detect cells or freeblocks that encrouch into the reserved bytes

	1570 ** at the end of the page. So do additional corruption checks inside this

	1571 ** routine and return SQLITE_CORRUPT if any problems are found.

	1572 */

	1573 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){

	1574 u16 iPtr; /* Address of ptr to next freeblock */

	1575 u16 iFreeBlk; /* Address of the next freeblock */

	1576 u8 hdr; /* Page header size. 0 or 100 */

	1577 u8 nFrag = 0; /* Reduction in fragmentation */

	1578 u16 iOrigSize = iSize; /* Original value of iSize */

	1579 u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */

	1580 u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */

	1581 unsigned char data = pPage->aData; / Page content */

	1582

	1583 assert( pPage->pBt!=0 );

	1584 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	1585 assert( CORRUPT_DB \|\| iStart>=pPage->hdrOffset+6+pPage->childPtrSize );

	1586 assert( CORRUPT_DB \|\| iEnd <= pPage->pBt->usableSize );

	1587 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1588 assert( iSize>=4 ); /* Minimum cell size is 4 */

	1589 assert( iStart<=iLast );

	1590

	1591 /* Overwrite deleted information with zeros when the secure_delete

	1592 ** option is enabled */

	1593 if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){

	1594 memset(&data[iStart], 0, iSize);

	1595 }

	1596

	1597 /* The list of freeblocks must be in ascending order. Find the

	1598 ** spot on the list where iStart should be inserted.

	1599 */

	1600 hdr = pPage->hdrOffset;

	1601 iPtr = hdr + 1;

	1602 if( data[iPtr+1]==0 && data[iPtr]==0 ){

	1603 iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */

	1604 }else{

	1605 while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){

	1606 if( iFreeBlk<iPtr+4 ){

	1607 if( iFreeBlk==0 ) break;

	1608 return SQLITE_CORRUPT_BKPT;

	1609 }

	1610 iPtr = iFreeBlk;

	1611 }

	1612 if( iFreeBlk>iLast ) return SQLITE_CORRUPT_BKPT;

	1613 assert( iFreeBlk>iPtr \|\| iFreeBlk==0 );

	1614

	1615 /* At this point:

	1616 ** iFreeBlk: First freeblock after iStart, or zero if none

	1617 ** iPtr: The address of a pointer to iFreeBlk

	1618 **

	1619 ** Check to see if iFreeBlk should be coalesced onto the end of iStart.

	1620 */

	1621 if( iFreeBlk && iEnd+3>=iFreeBlk ){

	1622 nFrag = iFreeBlk - iEnd;

	1623 if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT;

	1624 iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);

	1625 if( iEnd > pPage->pBt->usableSize ) return SQLITE_CORRUPT_BKPT;

	1626 iSize = iEnd - iStart;

	1627 iFreeBlk = get2byte(&data[iFreeBlk]);

	1628 }

	1629

	1630 /* If iPtr is another freeblock (that is, if iPtr is not the freelist

	1631 ** pointer in the page header) then check to see if iStart should be

	1632 ** coalesced onto the end of iPtr.

	1633 */

	1634 if( iPtr>hdr+1 ){

	1635 int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);

	1636 if( iPtrEnd+3>=iStart ){

	1637 if( iPtrEnd>iStart ) return SQLITE_CORRUPT_BKPT;

	1638 nFrag += iStart - iPtrEnd;

	1639 iSize = iEnd - iPtr;

	1640 iStart = iPtr;

	1641 }

	1642 }

	1643 if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_BKPT;

	1644 data[hdr+7] -= nFrag;

	1645 }

	1646 if( iStart==get2byte(&data[hdr+5]) ){

	1647 /* The new freeblock is at the beginning of the cell content area,

	1648 ** so just extend the cell content area rather than create another

	1649 ** freelist entry */

	1650 if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_BKPT;

	1651 put2byte(&data[hdr+1], iFreeBlk);

	1652 put2byte(&data[hdr+5], iEnd);

	1653 }else{

	1654 /* Insert the new freeblock into the freelist */

	1655 put2byte(&data[iPtr], iStart);

	1656 put2byte(&data[iStart], iFreeBlk);

	1657 put2byte(&data[iStart+2], iSize);

	1658 }

	1659 pPage->nFree += iOrigSize;

	1660 return SQLITE_OK;

	1661 }

	1662

	1663 /*

	1664 ** Decode the flags byte (the first byte of the header) for a page

	1665 ** and initialize fields of the MemPage structure accordingly.

	1666 **

	1667 ** Only the following combinations are supported. Anything different

	1668 ** indicates a corrupt database files:

	1669 **

	1670 ** PTF_ZERODATA

	1671 ** PTF_ZERODATA \| PTF_LEAF

	1672 ** PTF_LEAFDATA \| PTF_INTKEY

	1673 ** PTF_LEAFDATA \| PTF_INTKEY \| PTF_LEAF

	1674 */

	1675 static int decodeFlags(MemPage *pPage, int flagByte){

	1676 BtShared pBt; / A copy of pPage->pBt */

	1677

	1678 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );

	1679 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1680 pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 );

	1681 flagByte &= ~PTF_LEAF;

	1682 pPage->childPtrSize = 4-4*pPage->leaf;

	1683 pPage->xCellSize = cellSizePtr;

	1684 pBt = pPage->pBt;

	1685 if( flagByte==(PTF_LEAFDATA \| PTF_INTKEY) ){

	1686 /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an

	1687 ** interior table b-tree page. */

	1688 assert( (PTF_LEAFDATA\|PTF_INTKEY)==5 );

	1689 /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a

	1690 ** leaf table b-tree page. */

	1691 assert( (PTF_LEAFDATA\|PTF_INTKEY\|PTF_LEAF)==13 );

	1692 pPage->intKey = 1;

	1693 if( pPage->leaf ){

	1694 pPage->intKeyLeaf = 1;

	1695 pPage->xParseCell = btreeParseCellPtr;

	1696 }else{

	1697 pPage->intKeyLeaf = 0;

	1698 pPage->xCellSize = cellSizePtrNoPayload;

	1699 pPage->xParseCell = btreeParseCellPtrNoPayload;

	1700 }

	1701 pPage->maxLocal = pBt->maxLeaf;

	1702 pPage->minLocal = pBt->minLeaf;

	1703 }else if( flagByte==PTF_ZERODATA ){

	1704 /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an

	1705 ** interior index b-tree page. */

	1706 assert( (PTF_ZERODATA)==2 );

	1707 /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a

	1708 ** leaf index b-tree page. */

	1709 assert( (PTF_ZERODATA\|PTF_LEAF)==10 );

	1710 pPage->intKey = 0;

	1711 pPage->intKeyLeaf = 0;

	1712 pPage->xParseCell = btreeParseCellPtrIndex;

	1713 pPage->maxLocal = pBt->maxLocal;

	1714 pPage->minLocal = pBt->minLocal;

	1715 }else{

	1716 /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is

	1717 ** an error. */

	1718 return SQLITE_CORRUPT_BKPT;

	1719 }

	1720 pPage->max1bytePayload = pBt->max1bytePayload;

	1721 return SQLITE_OK;

	1722 }

	1723

	1724 /*

	1725 ** Initialize the auxiliary information for a disk block.

	1726 **

	1727 ** Return SQLITE_OK on success. If we see that the page does

	1728 ** not contain a well-formed database page, then return

	1729 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not

	1730 ** guarantee that the page is well-formed. It only shows that

	1731 ** we failed to detect any corruption.

	1732 */

	1733 static int btreeInitPage(MemPage *pPage){

	1734

	1735 assert( pPage->pBt!=0 );

	1736 assert( pPage->pBt->db!=0 );

	1737 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1738 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );

	1739 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );

	1740 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );

	1741

	1742 if( !pPage->isInit ){

	1743 int pc; /* Address of a freeblock within pPage->aData[] */

	1744 u8 hdr; /* Offset to beginning of page header */

	1745 u8 data; / Equal to pPage->aData */

	1746 BtShared pBt; / The main btree structure */

	1747 int usableSize; /* Amount of usable space on each page */

	1748 u16 cellOffset; /* Offset from start of page to first cell pointer */

	1749 int nFree; /* Number of unused bytes on the page */

	1750 int top; /* First byte of the cell content area */

	1751 int iCellFirst; /* First allowable cell or freeblock offset */

	1752 int iCellLast; /* Last possible cell or freeblock offset */

	1753

	1754 pBt = pPage->pBt;

	1755

	1756 hdr = pPage->hdrOffset;

	1757 data = pPage->aData;

	1758 /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating

	1759 ** the b-tree page type. */

	1760 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;

	1761 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );

	1762 pPage->maskPage = (u16)(pBt->pageSize - 1);

	1763 pPage->nOverflow = 0;

	1764 usableSize = pBt->usableSize;

	1765 pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;

	1766 pPage->aDataEnd = &data[usableSize];

	1767 pPage->aCellIdx = &data[cellOffset];

	1768 pPage->aDataOfst = &data[pPage->childPtrSize];

	1769 /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates

	1770 ** the start of the cell content area. A zero value for this integer is

	1771 ** interpreted as 65536. */

	1772 top = get2byteNotZero(&data[hdr+5]);

	1773 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the

	1774 ** number of cells on the page. */

	1775 pPage->nCell = get2byte(&data[hdr+3]);

	1776 if( pPage->nCell>MX_CELL(pBt) ){

	1777 /* To many cells for a single page. The page must be corrupt */

	1778 return SQLITE_CORRUPT_BKPT;

	1779 }

	1780 testcase( pPage->nCell==MX_CELL(pBt) );

	1781 /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only

	1782 ** possible for a root page of a table that contains no rows) then the

	1783 ** offset to the cell content area will equal the page size minus the

	1784 ** bytes of reserved space. */

	1785 assert( pPage->nCell>0 \|\| top==usableSize \|\| CORRUPT_DB );

	1786

	1787 /* A malformed database page might cause us to read past the end

	1788 ** of page when parsing a cell.

	1789 **

	1790 ** The following block of code checks early to see if a cell extends

	1791 ** past the end of a page boundary and causes SQLITE_CORRUPT to be

	1792 ** returned if it does.

	1793 */

	1794 iCellFirst = cellOffset + 2*pPage->nCell;

	1795 iCellLast = usableSize - 4;

	1796 if( pBt->db->flags & SQLITE_CellSizeCk ){

	1797 int i; /* Index into the cell pointer array */

	1798 int sz; /* Size of a cell */

	1799

	1800 if( !pPage->leaf ) iCellLast--;

	1801 for(i=0; i<pPage->nCell; i++){

	1802 pc = get2byteAligned(&data[cellOffset+i*2]);

	1803 testcase( pc==iCellFirst );

	1804 testcase( pc==iCellLast );

	1805 if( pc<iCellFirst \|\| pc>iCellLast ){

	1806 return SQLITE_CORRUPT_BKPT;

	1807 }

	1808 sz = pPage->xCellSize(pPage, &data[pc]);

	1809 testcase( pc+sz==usableSize );

	1810 if( pc+sz>usableSize ){

	1811 return SQLITE_CORRUPT_BKPT;

	1812 }

	1813 }

	1814 if( !pPage->leaf ) iCellLast++;

	1815 }

	1816

	1817 /* Compute the total free space on the page

	1818 ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the

	1819 ** start of the first freeblock on the page, or is zero if there are no

	1820 ** freeblocks. */

	1821 pc = get2byte(&data[hdr+1]);

	1822 nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */

	1823 if( pc>0 ){

	1824 u32 next, size;

	1825 if( pc<iCellFirst ){

	1826 /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will

	1827 ** always be at least one cell before the first freeblock.

	1828 */

	1829 return SQLITE_CORRUPT_BKPT;

	1830 }

	1831 while( 1 ){

	1832 if( pc>iCellLast ){

	1833 return SQLITE_CORRUPT_BKPT; /* Freeblock off the end of the page */

	1834 }

	1835 next = get2byte(&data[pc]);

	1836 size = get2byte(&data[pc+2]);

	1837 nFree = nFree + size;

	1838 if( next<=pc+size+3 ) break;

	1839 pc = next;

	1840 }

	1841 if( next>0 ){

	1842 return SQLITE_CORRUPT_BKPT; /* Freeblock not in ascending order */

	1843 }

	1844 if( pc+size>(unsigned int)usableSize ){

	1845 return SQLITE_CORRUPT_BKPT; /* Last freeblock extends past page end */

	1846 }

	1847 }

	1848

	1849 /* At this point, nFree contains the sum of the offset to the start

	1850 ** of the cell-content area plus the number of free bytes within

	1851 ** the cell-content area. If this is greater than the usable-size

	1852 ** of the page, then the page must be corrupted. This check also

	1853 ** serves to verify that the offset to the start of the cell-content

	1854 ** area, according to the page header, lies within the page.

	1855 */

	1856 if( nFree>usableSize ){

	1857 return SQLITE_CORRUPT_BKPT;

	1858 }

	1859 pPage->nFree = (u16)(nFree - iCellFirst);

	1860 pPage->isInit = 1;

	1861 }

	1862 return SQLITE_OK;

	1863 }

	1864

	1865 /*

	1866 ** Set up a raw page so that it looks like a database page holding

	1867 ** no entries.

	1868 */

	1869 static void zeroPage(MemPage *pPage, int flags){

	1870 unsigned char *data = pPage->aData;

	1871 BtShared *pBt = pPage->pBt;

	1872 u8 hdr = pPage->hdrOffset;

	1873 u16 first;

	1874

	1875 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );

	1876 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );

	1877 assert( sqlite3PagerGetData(pPage->pDbPage) == data );

	1878 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	1879 assert( sqlite3_mutex_held(pBt->mutex) );

	1880 if( pBt->btsFlags & BTS_SECURE_DELETE ){

	1881 memset(&data[hdr], 0, pBt->usableSize - hdr);

	1882 }

	1883 data[hdr] = (char)flags;

	1884 first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);

	1885 memset(&data[hdr+1], 0, 4);

	1886 data[hdr+7] = 0;

	1887 put2byte(&data[hdr+5], pBt->usableSize);

	1888 pPage->nFree = (u16)(pBt->usableSize - first);

	1889 decodeFlags(pPage, flags);

	1890 pPage->cellOffset = first;

	1891 pPage->aDataEnd = &data[pBt->usableSize];

	1892 pPage->aCellIdx = &data[first];

	1893 pPage->aDataOfst = &data[pPage->childPtrSize];

	1894 pPage->nOverflow = 0;

	1895 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );

	1896 pPage->maskPage = (u16)(pBt->pageSize - 1);

	1897 pPage->nCell = 0;

	1898 pPage->isInit = 1;

	1899 }

	1900

	1901

	1902 /*

	1903 ** Convert a DbPage obtained from the pager into a MemPage used by

	1904 ** the btree layer.

	1905 */

	1906 static MemPage btreePageFromDbPage(DbPage pDbPage, Pgno pgno, BtShared *pBt){

	1907 MemPage pPage = (MemPage)sqlite3PagerGetExtra(pDbPage);

	1908 if( pgno!=pPage->pgno ){

	1909 pPage->aData = sqlite3PagerGetData(pDbPage);

	1910 pPage->pDbPage = pDbPage;

	1911 pPage->pBt = pBt;

	1912 pPage->pgno = pgno;

	1913 pPage->hdrOffset = pgno==1 ? 100 : 0;

	1914 }

	1915 assert( pPage->aData==sqlite3PagerGetData(pDbPage) );

	1916 return pPage;

	1917 }

	1918

	1919 /*

	1920 ** Get a page from the pager. Initialize the MemPage.pBt and

	1921 ** MemPage.aData elements if needed. See also: btreeGetUnusedPage().

	1922 **

	1923 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care

	1924 ** about the content of the page at this time. So do not go to the disk

	1925 ** to fetch the content. Just fill in the content with zeros for now.

	1926 ** If in the future we call sqlite3PagerWrite() on this page, that

	1927 ** means we have started to be concerned about content and the disk

	1928 ** read should occur at that point.

	1929 */

	1930 static int btreeGetPage(

	1931 BtShared pBt, / The btree */

	1932 Pgno pgno, /* Number of the page to fetch */

	1933 MemPage *ppPage, / Return the page in this parameter */

	1934 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */

	1935 ){

	1936 int rc;

	1937 DbPage *pDbPage;

	1938

	1939 assert( flags==0 \|\| flags==PAGER_GET_NOCONTENT \|\| flags==PAGER_GET_READONLY );

	1940 assert( sqlite3_mutex_held(pBt->mutex) );

	1941 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);

	1942 if( rc ) return rc;

	1943 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);

	1944 return SQLITE_OK;

	1945 }

	1946

	1947 /*

	1948 ** Retrieve a page from the pager cache. If the requested page is not

	1949 ** already in the pager cache return NULL. Initialize the MemPage.pBt and

	1950 ** MemPage.aData elements if needed.

	1951 */

	1952 static MemPage btreePageLookup(BtShared pBt, Pgno pgno){

	1953 DbPage *pDbPage;

	1954 assert( sqlite3_mutex_held(pBt->mutex) );

	1955 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);

	1956 if( pDbPage ){

	1957 return btreePageFromDbPage(pDbPage, pgno, pBt);

	1958 }

	1959 return 0;

	1960 }

	1961

	1962 /*

	1963 ** Return the size of the database file in pages. If there is any kind of

	1964 ** error, return ((unsigned int)-1).

	1965 */

	1966 static Pgno btreePagecount(BtShared *pBt){

	1967 return pBt->nPage;

	1968 }

	1969 u32 sqlite3BtreeLastPage(Btree *p){

	1970 assert( sqlite3BtreeHoldsMutex(p) );

	1971 assert( ((p->pBt->nPage)&0x8000000)==0 );

	1972 return btreePagecount(p->pBt);

	1973 }

	1974

	1975 /*

	1976 ** Get a page from the pager and initialize it.

	1977 **

	1978 ** If pCur!=0 then the page is being fetched as part of a moveToChild()

	1979 ** call. Do additional sanity checking on the page in this case.

	1980 ** And if the fetch fails, this routine must decrement pCur->iPage.

	1981 **

	1982 ** The page is fetched as read-write unless pCur is not NULL and is

	1983 ** a read-only cursor.

	1984 **

	1985 ** If an error occurs, then *ppPage is undefined. It

	1986 ** may remain unchanged, or it may be set to an invalid value.

	1987 */

	1988 static int getAndInitPage(

	1989 BtShared pBt, / The database file */

	1990 Pgno pgno, /* Number of the page to get */

	1991 MemPage *ppPage, / Write the page pointer here */

	1992 BtCursor pCur, / Cursor to receive the page, or NULL */

	1993 int bReadOnly /* True for a read-only page */

	1994 ){

	1995 int rc;

	1996 DbPage *pDbPage;

	1997 assert( sqlite3_mutex_held(pBt->mutex) );

	1998 assert( pCur==0 \|\| ppPage==&pCur->apPage[pCur->iPage] );

	1999 assert( pCur==0 \|\| bReadOnly==pCur->curPagerFlags );

	2000 assert( pCur==0 \|\| pCur->iPage>0 );

	2001

	2002 if( pgno>btreePagecount(pBt) ){

	2003 rc = SQLITE_CORRUPT_BKPT;

	2004 goto getAndInitPage_error;

	2005 }

	2006 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);

	2007 if( rc ){

	2008 goto getAndInitPage_error;

	2009 }

	2010 ppPage = (MemPage)sqlite3PagerGetExtra(pDbPage);

	2011 if( (*ppPage)->isInit==0 ){

	2012 btreePageFromDbPage(pDbPage, pgno, pBt);

	2013 rc = btreeInitPage(*ppPage);

	2014 if( rc!=SQLITE_OK ){

	2015 releasePage(*ppPage);

	2016 goto getAndInitPage_error;

	2017 }

	2018 }

	2019 assert( (*ppPage)->pgno==pgno );

	2020 assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );

	2021

	2022 /* If obtaining a child page for a cursor, we must verify that the page is

	2023 ** compatible with the root page. */

	2024 if( pCur && ((ppPage)->nCell<1 \|\| (ppPage)->intKey!=pCur->curIntKey) ){

	2025 rc = SQLITE_CORRUPT_BKPT;

	2026 releasePage(*ppPage);

	2027 goto getAndInitPage_error;

	2028 }

	2029 return SQLITE_OK;

	2030

	2031 getAndInitPage_error:

	2032 if( pCur ) pCur->iPage--;

	2033 testcase( pgno==0 );

	2034 assert( pgno!=0 \|\| rc==SQLITE_CORRUPT );

	2035 return rc;

	2036 }

	2037

	2038 /*

	2039 ** Release a MemPage. This should be called once for each prior

	2040 ** call to btreeGetPage.

	2041 */

	2042 static void releasePageNotNull(MemPage *pPage){

	2043 assert( pPage->aData );

	2044 assert( pPage->pBt );

	2045 assert( pPage->pDbPage!=0 );

	2046 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );

	2047 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );

	2048 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	2049 sqlite3PagerUnrefNotNull(pPage->pDbPage);

	2050 }

	2051 static void releasePage(MemPage *pPage){

	2052 if( pPage ) releasePageNotNull(pPage);

	2053 }

	2054

	2055 /*

	2056 ** Get an unused page.

	2057 **

	2058 ** This works just like btreeGetPage() with the addition:

	2059 **

	2060 ** * If the page is already in use for some other purpose, immediately

	2061 ** release it and return an SQLITE_CURRUPT error.

	2062 ** * Make sure the isInit flag is clear

	2063 */

	2064 static int btreeGetUnusedPage(

	2065 BtShared pBt, / The btree */

	2066 Pgno pgno, /* Number of the page to fetch */

	2067 MemPage *ppPage, / Return the page in this parameter */

	2068 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */

	2069 ){

	2070 int rc = btreeGetPage(pBt, pgno, ppPage, flags);

	2071 if( rc==SQLITE_OK ){

	2072 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){

	2073 releasePage(*ppPage);

	2074 *ppPage = 0;

	2075 return SQLITE_CORRUPT_BKPT;

	2076 }

	2077 (*ppPage)->isInit = 0;

	2078 }else{

	2079 *ppPage = 0;

	2080 }

	2081 return rc;

	2082 }

	2083

	2084

	2085 /*

	2086 ** During a rollback, when the pager reloads information into the cache

	2087 ** so that the cache is restored to its original state at the start of

	2088 ** the transaction, for each page restored this routine is called.

	2089 **

	2090 ** This routine needs to reset the extra data section at the end of the

	2091 ** page to agree with the restored data.

	2092 */

	2093 static void pageReinit(DbPage *pData){

	2094 MemPage *pPage;

	2095 pPage = (MemPage *)sqlite3PagerGetExtra(pData);

	2096 assert( sqlite3PagerPageRefcount(pData)>0 );

	2097 if( pPage->isInit ){

	2098 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	2099 pPage->isInit = 0;

	2100 if( sqlite3PagerPageRefcount(pData)>1 ){

	2101 /* pPage might not be a btree page; it might be an overflow page

	2102 ** or ptrmap page or a free page. In those cases, the following

	2103 ** call to btreeInitPage() will likely return SQLITE_CORRUPT.

	2104 ** But no harm is done by this. And it is very important that

	2105 ** btreeInitPage() be called on every btree page so we make

	2106 ** the call for every page that comes in for re-initing. */

	2107 btreeInitPage(pPage);

	2108 }

	2109 }

	2110 }

	2111

	2112 /*

	2113 ** Invoke the busy handler for a btree.

	2114 */

	2115 static int btreeInvokeBusyHandler(void *pArg){

	2116 BtShared pBt = (BtShared)pArg;

	2117 assert( pBt->db );

	2118 assert( sqlite3_mutex_held(pBt->db->mutex) );

	2119 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);

	2120 }

	2121

	2122 /*

	2123 ** Open a database file.

	2124 **

	2125 ** zFilename is the name of the database file. If zFilename is NULL

	2126 ** then an ephemeral database is created. The ephemeral database might

	2127 ** be exclusively in memory, or it might use a disk-based memory cache.

	2128 ** Either way, the ephemeral database will be automatically deleted

	2129 ** when sqlite3BtreeClose() is called.

	2130 **

	2131 ** If zFilename is ":memory:" then an in-memory database is created

	2132 ** that is automatically destroyed when it is closed.

	2133 **

	2134 ** The "flags" parameter is a bitmask that might contain bits like

	2135 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.

	2136 **

	2137 ** If the database is already opened in the same database connection

	2138 ** and we are in shared cache mode, then the open will fail with an

	2139 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared

	2140 ** objects in the same database connection since doing so will lead

	2141 ** to problems with locking.

	2142 */

	2143 int sqlite3BtreeOpen(

	2144 sqlite3_vfs pVfs, / VFS to use for this b-tree */

	2145 const char zFilename, / Name of the file containing the BTree database */

	2146 sqlite3 db, / Associated database handle */

	2147 Btree *ppBtree, / Pointer to new Btree object written here */

	2148 int flags, /* Options */

	2149 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */

	2150 ){

	2151 BtShared pBt = 0; / Shared part of btree structure */

	2152 Btree p; / Handle to return */

	2153 sqlite3_mutex mutexOpen = 0; / Prevents a race condition. Ticket #3537 */

	2154 int rc = SQLITE_OK; /* Result code from this function */

	2155 u8 nReserve; /* Byte of unused space on each page */

	2156 unsigned char zDbHeader[100]; /* Database header content */

	2157

	2158 /* True if opening an ephemeral, temporary database */

	2159 const int isTempDb = zFilename==0 \|\| zFilename[0]==0;

	2160

	2161 /* Set the variable isMemdb to true for an in-memory database, or

	2162 ** false for a file-based database.

	2163 */

	2164 #ifdef SQLITE_OMIT_MEMORYDB

	2165 const int isMemdb = 0;

	2166 #else

	2167 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)

	2168 \|\| (isTempDb && sqlite3TempInMemory(db))

	2169 \|\| (vfsFlags & SQLITE_OPEN_MEMORY)!=0;

	2170 #endif

	2171

	2172 assert( db!=0 );

	2173 assert( pVfs!=0 );

	2174 assert( sqlite3_mutex_held(db->mutex) );

	2175 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */

	2176

	2177 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */

	2178 assert( (flags & BTREE_UNORDERED)==0 \|\| (flags & BTREE_SINGLE)!=0 );

	2179

	2180 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */

	2181 assert( (flags & BTREE_SINGLE)==0 \|\| isTempDb );

	2182

	2183 if( isMemdb ){

	2184 flags \|= BTREE_MEMORY;

	2185 }

	2186 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb \|\| isTempDb) ){

	2187 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) \| SQLITE_OPEN_TEMP_DB;

	2188 }

	2189 p = sqlite3MallocZero(sizeof(Btree));

	2190 if( !p ){

	2191 return SQLITE_NOMEM_BKPT;

	2192 }

	2193 p->inTrans = TRANS_NONE;

	2194 p->db = db;

	2195 #ifndef SQLITE_OMIT_SHARED_CACHE

	2196 p->lock.pBtree = p;

	2197 p->lock.iTable = 1;

	2198 #endif

	2199

	2200 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)

	2201 /*

	2202 ** If this Btree is a candidate for shared cache, try to find an

	2203 ** existing BtShared object that we can share with

	2204 */

	2205 if( isTempDb==0 && (isMemdb==0 \|\| (vfsFlags&SQLITE_OPEN_URI)!=0) ){

	2206 if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){

	2207 int nFilename = sqlite3Strlen30(zFilename)+1;

	2208 int nFullPathname = pVfs->mxPathname+1;

	2209 char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));

	2210 MUTEX_LOGIC( sqlite3_mutex *mutexShared; )

	2211

	2212 p->sharable = 1;

	2213 if( !zFullPathname ){

	2214 sqlite3_free(p);

	2215 return SQLITE_NOMEM_BKPT;

	2216 }

	2217 if( isMemdb ){

	2218 memcpy(zFullPathname, zFilename, nFilename);

	2219 }else{

	2220 rc = sqlite3OsFullPathname(pVfs, zFilename,

	2221 nFullPathname, zFullPathname);

	2222 if( rc ){

	2223 sqlite3_free(zFullPathname);

	2224 sqlite3_free(p);

	2225 return rc;

	2226 }

	2227 }

	2228 #if SQLITE_THREADSAFE

	2229 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);

	2230 sqlite3_mutex_enter(mutexOpen);

	2231 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);

	2232 sqlite3_mutex_enter(mutexShared);

	2233 #endif

	2234 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){

	2235 assert( pBt->nRef>0 );

	2236 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))

	2237 && sqlite3PagerVfs(pBt->pPager)==pVfs ){

	2238 int iDb;

	2239 for(iDb=db->nDb-1; iDb>=0; iDb--){

	2240 Btree *pExisting = db->aDb[iDb].pBt;

	2241 if( pExisting && pExisting->pBt==pBt ){

	2242 sqlite3_mutex_leave(mutexShared);

	2243 sqlite3_mutex_leave(mutexOpen);

	2244 sqlite3_free(zFullPathname);

	2245 sqlite3_free(p);

	2246 return SQLITE_CONSTRAINT;

	2247 }

	2248 }

	2249 p->pBt = pBt;

	2250 pBt->nRef++;

	2251 break;

	2252 }

	2253 }

	2254 sqlite3_mutex_leave(mutexShared);

	2255 sqlite3_free(zFullPathname);

	2256 }

	2257 #ifdef SQLITE_DEBUG

	2258 else{

	2259 /* In debug mode, we mark all persistent databases as sharable

	2260 ** even when they are not. This exercises the locking code and

	2261 ** gives more opportunity for asserts(sqlite3_mutex_held())

	2262 ** statements to find locking problems.

	2263 */

	2264 p->sharable = 1;

	2265 }

	2266 #endif

	2267 }

	2268 #endif

	2269 if( pBt==0 ){

	2270 /*

	2271 ** The following asserts make sure that structures used by the btree are

	2272 ** the right size. This is to guard against size changes that result

	2273 ** when compiling on a different architecture.

	2274 */

	2275 assert( sizeof(i64)==8 );

	2276 assert( sizeof(u64)==8 );

	2277 assert( sizeof(u32)==4 );

	2278 assert( sizeof(u16)==2 );

	2279 assert( sizeof(Pgno)==4 );

	2280

	2281 pBt = sqlite3MallocZero( sizeof(*pBt) );

	2282 if( pBt==0 ){

	2283 rc = SQLITE_NOMEM_BKPT;

	2284 goto btree_open_out;

	2285 }

	2286 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,

	2287 sizeof(MemPage), flags, vfsFlags, pageReinit);

	2288 if( rc==SQLITE_OK ){

	2289 sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);

	2290 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);

	2291 }

	2292 if( rc!=SQLITE_OK ){

	2293 goto btree_open_out;

	2294 }

	2295 pBt->openFlags = (u8)flags;

	2296 pBt->db = db;

	2297 sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);

	2298 p->pBt = pBt;

	2299

	2300 pBt->pCursor = 0;

	2301 pBt->pPage1 = 0;

	2302 if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags \|= BTS_READ_ONLY;

	2303 #ifdef SQLITE_SECURE_DELETE

	2304 pBt->btsFlags \|= BTS_SECURE_DELETE;

	2305 #endif

	2306 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is

	2307 ** determined by the 2-byte integer located at an offset of 16 bytes from

	2308 ** the beginning of the database file. */

	2309 pBt->pageSize = (zDbHeader[16]<<8) \| (zDbHeader[17]<<16);

	2310 if( pBt->pageSize<512 \|\| pBt->pageSize>SQLITE_MAX_PAGE_SIZE

	2311 \|\| ((pBt->pageSize-1)&pBt->pageSize)!=0 ){

	2312 pBt->pageSize = 0;

	2313 #ifndef SQLITE_OMIT_AUTOVACUUM

	2314 /* If the magic name ":memory:" will create an in-memory database, then

	2315 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if

	2316 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if

	2317 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a

	2318 ** regular file-name. In this case the auto-vacuum applies as per normal.

	2319 */

	2320 if( zFilename && !isMemdb ){

	2321 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);

	2322 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);

	2323 }

	2324 #endif

	2325 nReserve = 0;

	2326 }else{

	2327 /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is

	2328 ** determined by the one-byte unsigned integer found at an offset of 20

	2329 ** into the database file header. */

	2330 nReserve = zDbHeader[20];

	2331 pBt->btsFlags \|= BTS_PAGESIZE_FIXED;

	2332 #ifndef SQLITE_OMIT_AUTOVACUUM

	2333 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);

	2334 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);

	2335 #endif

	2336 }

	2337 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);

	2338 if( rc ) goto btree_open_out;

	2339 pBt->usableSize = pBt->pageSize - nReserve;

	2340 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */

	2341

	2342 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)

	2343 /* Add the new BtShared object to the linked list sharable BtShareds.

	2344 */

	2345 pBt->nRef = 1;

	2346 if( p->sharable ){

	2347 MUTEX_LOGIC( sqlite3_mutex *mutexShared; )

	2348 MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)

	2349 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){

	2350 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);

	2351 if( pBt->mutex==0 ){

	2352 rc = SQLITE_NOMEM_BKPT;

	2353 goto btree_open_out;

	2354 }

	2355 }

	2356 sqlite3_mutex_enter(mutexShared);

	2357 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);

	2358 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;

	2359 sqlite3_mutex_leave(mutexShared);

	2360 }

	2361 #endif

	2362 }

	2363

	2364 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)

	2365 /* If the new Btree uses a sharable pBtShared, then link the new

	2366 ** Btree into the list of all sharable Btrees for the same connection.

	2367 ** The list is kept in ascending order by pBt address.

	2368 */

	2369 if( p->sharable ){

	2370 int i;

	2371 Btree *pSib;

	2372 for(i=0; i<db->nDb; i++){

	2373 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){

	2374 while( pSib->pPrev ){ pSib = pSib->pPrev; }

	2375 if( (uptr)p->pBt<(uptr)pSib->pBt ){

	2376 p->pNext = pSib;

	2377 p->pPrev = 0;

	2378 pSib->pPrev = p;

	2379 }else{

	2380 while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){

	2381 pSib = pSib->pNext;

	2382 }

	2383 p->pNext = pSib->pNext;

	2384 p->pPrev = pSib;

	2385 if( p->pNext ){

	2386 p->pNext->pPrev = p;

	2387 }

	2388 pSib->pNext = p;

	2389 }

	2390 break;

	2391 }

	2392 }

	2393 }

	2394 #endif

	2395 *ppBtree = p;

	2396

	2397 btree_open_out:

	2398 if( rc!=SQLITE_OK ){

	2399 if( pBt && pBt->pPager ){

	2400 sqlite3PagerClose(pBt->pPager, 0);

	2401 }

	2402 sqlite3_free(pBt);

	2403 sqlite3_free(p);

	2404 *ppBtree = 0;

	2405 }else{

	2406 sqlite3_file *pFile;

	2407

	2408 /* If the B-Tree was successfully opened, set the pager-cache size to the

	2409 ** default value. Except, when opening on an existing shared pager-cache,

	2410 ** do not change the pager-cache size.

	2411 */

	2412 if( sqlite3BtreeSchema(p, 0, 0)==0 ){

	2413 sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);

	2414 }

	2415

	2416 pFile = sqlite3PagerFile(pBt->pPager);

	2417 if( pFile->pMethods ){

	2418 sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);

	2419 }

	2420 }

	2421 if( mutexOpen ){

	2422 assert( sqlite3_mutex_held(mutexOpen) );

	2423 sqlite3_mutex_leave(mutexOpen);

	2424 }

	2425 assert( rc!=SQLITE_OK \|\| sqlite3BtreeConnectionCount(*ppBtree)>0 );

	2426 return rc;

	2427 }

	2428

	2429 /*

	2430 ** Decrement the BtShared.nRef counter. When it reaches zero,

	2431 ** remove the BtShared structure from the sharing list. Return

	2432 ** true if the BtShared.nRef counter reaches zero and return

	2433 ** false if it is still positive.

	2434 */

	2435 static int removeFromSharingList(BtShared *pBt){

	2436 #ifndef SQLITE_OMIT_SHARED_CACHE

	2437 MUTEX_LOGIC( sqlite3_mutex *pMaster; )

	2438 BtShared *pList;

	2439 int removed = 0;

	2440

	2441 assert( sqlite3_mutex_notheld(pBt->mutex) );

	2442 MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )

	2443 sqlite3_mutex_enter(pMaster);

	2444 pBt->nRef--;

	2445 if( pBt->nRef<=0 ){

	2446 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){

	2447 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;

	2448 }else{

	2449 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);

	2450 while( ALWAYS(pList) && pList->pNext!=pBt ){

	2451 pList=pList->pNext;

	2452 }

	2453 if( ALWAYS(pList) ){

	2454 pList->pNext = pBt->pNext;

	2455 }

	2456 }

	2457 if( SQLITE_THREADSAFE ){

	2458 sqlite3_mutex_free(pBt->mutex);

	2459 }

	2460 removed = 1;

	2461 }

	2462 sqlite3_mutex_leave(pMaster);

	2463 return removed;

	2464 #else

	2465 return 1;

	2466 #endif

	2467 }

	2468

	2469 /*

	2470 ** Make sure pBt->pTmpSpace points to an allocation of

	2471 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child

	2472 ** pointer.

	2473 */

	2474 static void allocateTempSpace(BtShared *pBt){

	2475 if( !pBt->pTmpSpace ){

	2476 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );

	2477

	2478 /* One of the uses of pBt->pTmpSpace is to format cells before

	2479 ** inserting them into a leaf page (function fillInCell()). If

	2480 ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes

	2481 ** by the various routines that manipulate binary cells. Which

	2482 ** can mean that fillInCell() only initializes the first 2 or 3

	2483 ** bytes of pTmpSpace, but that the first 4 bytes are copied from

	2484 ** it into a database page. This is not actually a problem, but it

	2485 ** does cause a valgrind error when the 1 or 2 bytes of unitialized

	2486 ** data is passed to system call write(). So to avoid this error,

	2487 ** zero the first 4 bytes of temp space here.

	2488 **

	2489 ** Also: Provide four bytes of initialized space before the

	2490 ** beginning of pTmpSpace as an area available to prepend the

	2491 ** left-child pointer to the beginning of a cell.

	2492 */

	2493 if( pBt->pTmpSpace ){

	2494 memset(pBt->pTmpSpace, 0, 8);

	2495 pBt->pTmpSpace += 4;

	2496 }

	2497 }

	2498 }

	2499

	2500 /*

	2501 ** Free the pBt->pTmpSpace allocation

	2502 */

	2503 static void freeTempSpace(BtShared *pBt){

	2504 if( pBt->pTmpSpace ){

	2505 pBt->pTmpSpace -= 4;

	2506 sqlite3PageFree(pBt->pTmpSpace);

	2507 pBt->pTmpSpace = 0;

	2508 }

	2509 }

	2510

	2511 /*

	2512 ** Close an open database and invalidate all cursors.

	2513 */

	2514 int sqlite3BtreeClose(Btree *p){

	2515 BtShared *pBt = p->pBt;

	2516 BtCursor *pCur;

	2517

	2518 /* Close all cursors opened via this handle. */

	2519 assert( sqlite3_mutex_held(p->db->mutex) );

	2520 sqlite3BtreeEnter(p);

	2521 pCur = pBt->pCursor;

	2522 while( pCur ){

	2523 BtCursor *pTmp = pCur;

	2524 pCur = pCur->pNext;

	2525 if( pTmp->pBtree==p ){

	2526 sqlite3BtreeCloseCursor(pTmp);

	2527 }

	2528 }

	2529

	2530 /* Rollback any active transaction and free the handle structure.

	2531 ** The call to sqlite3BtreeRollback() drops any table-locks held by

	2532 ** this handle.

	2533 */

	2534 sqlite3BtreeRollback(p, SQLITE_OK, 0);

	2535 sqlite3BtreeLeave(p);

	2536

	2537 /* If there are still other outstanding references to the shared-btree

	2538 ** structure, return now. The remainder of this procedure cleans

	2539 ** up the shared-btree.

	2540 */

	2541 assert( p->wantToLock==0 && p->locked==0 );

	2542 if( !p->sharable \|\| removeFromSharingList(pBt) ){

	2543 /* The pBt is no longer on the sharing list, so we can access

	2544 ** it without having to hold the mutex.

	2545 **

	2546 ** Clean out and delete the BtShared object.

	2547 */

	2548 assert( !pBt->pCursor );

	2549 sqlite3PagerClose(pBt->pPager, p->db);

	2550 if( pBt->xFreeSchema && pBt->pSchema ){

	2551 pBt->xFreeSchema(pBt->pSchema);

	2552 }

	2553 sqlite3DbFree(0, pBt->pSchema);

	2554 freeTempSpace(pBt);

	2555 sqlite3_free(pBt);

	2556 }

	2557

	2558 #ifndef SQLITE_OMIT_SHARED_CACHE

	2559 assert( p->wantToLock==0 );

	2560 assert( p->locked==0 );

	2561 if( p->pPrev ) p->pPrev->pNext = p->pNext;

	2562 if( p->pNext ) p->pNext->pPrev = p->pPrev;

	2563 #endif

	2564

	2565 sqlite3_free(p);

	2566 return SQLITE_OK;

	2567 }

	2568

	2569 /*

	2570 ** Change the "soft" limit on the number of pages in the cache.

	2571 ** Unused and unmodified pages will be recycled when the number of

	2572 ** pages in the cache exceeds this soft limit. But the size of the

	2573 ** cache is allowed to grow larger than this limit if it contains

	2574 ** dirty pages or pages still in active use.

	2575 */

	2576 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){

	2577 BtShared *pBt = p->pBt;

	2578 assert( sqlite3_mutex_held(p->db->mutex) );

	2579 sqlite3BtreeEnter(p);

	2580 sqlite3PagerSetCachesize(pBt->pPager, mxPage);

	2581 sqlite3BtreeLeave(p);

	2582 return SQLITE_OK;

	2583 }

	2584

	2585 /*

	2586 ** Change the "spill" limit on the number of pages in the cache.

	2587 ** If the number of pages exceeds this limit during a write transaction,

	2588 ** the pager might attempt to "spill" pages to the journal early in

	2589 ** order to free up memory.

	2590 **

	2591 ** The value returned is the current spill size. If zero is passed

	2592 ** as an argument, no changes are made to the spill size setting, so

	2593 ** using mxPage of 0 is a way to query the current spill size.

	2594 */

	2595 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){

	2596 BtShared *pBt = p->pBt;

	2597 int res;

	2598 assert( sqlite3_mutex_held(p->db->mutex) );

	2599 sqlite3BtreeEnter(p);

	2600 res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);

	2601 sqlite3BtreeLeave(p);

	2602 return res;

	2603 }

	2604

	2605 #if SQLITE_MAX_MMAP_SIZE>0

	2606 /*

	2607 ** Change the limit on the amount of the database file that may be

	2608 ** memory mapped.

	2609 */

	2610 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){

	2611 BtShared *pBt = p->pBt;

	2612 assert( sqlite3_mutex_held(p->db->mutex) );

	2613 sqlite3BtreeEnter(p);

	2614 sqlite3PagerSetMmapLimit(pBt->pPager, szMmap);

	2615 sqlite3BtreeLeave(p);

	2616 return SQLITE_OK;

	2617 }

	2618 #endif /* SQLITE_MAX_MMAP_SIZE>0 */

	2619

	2620 /*

	2621 ** Change the way data is synced to disk in order to increase or decrease

	2622 ** how well the database resists damage due to OS crashes and power

	2623 ** failures. Level 1 is the same as asynchronous (no syncs() occur and

	2624 ** there is a high probability of damage) Level 2 is the default. There

	2625 ** is a very low but non-zero probability of damage. Level 3 reduces the

	2626 ** probability of damage to near zero but with a write performance reduction.

	2627 */

	2628 #ifndef SQLITE_OMIT_PAGER_PRAGMAS

	2629 int sqlite3BtreeSetPagerFlags(

	2630 Btree p, / The btree to set the safety level on */

	2631 unsigned pgFlags /* Various PAGER_* flags */

	2632 ){

	2633 BtShared *pBt = p->pBt;

	2634 assert( sqlite3_mutex_held(p->db->mutex) );

	2635 sqlite3BtreeEnter(p);

	2636 sqlite3PagerSetFlags(pBt->pPager, pgFlags);

	2637 sqlite3BtreeLeave(p);

	2638 return SQLITE_OK;

	2639 }

	2640 #endif

	2641

	2642 /*

	2643 ** Change the default pages size and the number of reserved bytes per page.

	2644 ** Or, if the page size has already been fixed, return SQLITE_READONLY

	2645 ** without changing anything.

	2646 **

	2647 ** The page size must be a power of 2 between 512 and 65536. If the page

	2648 ** size supplied does not meet this constraint then the page size is not

	2649 ** changed.

	2650 **

	2651 ** Page sizes are constrained to be a power of two so that the region

	2652 ** of the database file used for locking (beginning at PENDING_BYTE,

	2653 ** the first byte past the 1GB boundary, 0x40000000) needs to occur

	2654 ** at the beginning of a page.

	2655 **

	2656 ** If parameter nReserve is less than zero, then the number of reserved

	2657 ** bytes per page is left unchanged.

	2658 **

	2659 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size

	2660 ** and autovacuum mode can no longer be changed.

	2661 */

	2662 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){

	2663 int rc = SQLITE_OK;

	2664 BtShared *pBt = p->pBt;

	2665 assert( nReserve>=-1 && nReserve<=255 );

	2666 sqlite3BtreeEnter(p);

	2667 #if SQLITE_HAS_CODEC

	2668 if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;

	2669 #endif

	2670 if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){

	2671 sqlite3BtreeLeave(p);

	2672 return SQLITE_READONLY;

	2673 }

	2674 if( nReserve<0 ){

	2675 nReserve = pBt->pageSize - pBt->usableSize;

	2676 }

	2677 assert( nReserve>=0 && nReserve<=255 );

	2678 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&

	2679 ((pageSize-1)&pageSize)==0 ){

	2680 assert( (pageSize & 7)==0 );

	2681 assert( !pBt->pCursor );

	2682 pBt->pageSize = (u32)pageSize;

	2683 freeTempSpace(pBt);

	2684 }

	2685 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);

	2686 pBt->usableSize = pBt->pageSize - (u16)nReserve;

	2687 if( iFix ) pBt->btsFlags \|= BTS_PAGESIZE_FIXED;

	2688 sqlite3BtreeLeave(p);

	2689 return rc;

	2690 }

	2691

	2692 /*

	2693 ** Return the currently defined page size

	2694 */

	2695 int sqlite3BtreeGetPageSize(Btree *p){

	2696 return p->pBt->pageSize;

	2697 }

	2698

	2699 /*

	2700 ** This function is similar to sqlite3BtreeGetReserve(), except that it

	2701 ** may only be called if it is guaranteed that the b-tree mutex is already

	2702 ** held.

	2703 **

	2704 ** This is useful in one special case in the backup API code where it is

	2705 ** known that the shared b-tree mutex is held, but the mutex on the

	2706 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()

	2707 ** were to be called, it might collide with some other operation on the

	2708 ** database handle that owns *p, causing undefined behavior.

	2709 */

	2710 int sqlite3BtreeGetReserveNoMutex(Btree *p){

	2711 int n;

	2712 assert( sqlite3_mutex_held(p->pBt->mutex) );

	2713 n = p->pBt->pageSize - p->pBt->usableSize;

	2714 return n;

	2715 }

	2716

	2717 /*

	2718 ** Return the number of bytes of space at the end of every page that

	2719 ** are intentually left unused. This is the "reserved" space that is

	2720 ** sometimes used by extensions.

	2721 **

	2722 ** If SQLITE_HAS_MUTEX is defined then the number returned is the

	2723 ** greater of the current reserved space and the maximum requested

	2724 ** reserve space.

	2725 */

	2726 int sqlite3BtreeGetOptimalReserve(Btree *p){

	2727 int n;

	2728 sqlite3BtreeEnter(p);

	2729 n = sqlite3BtreeGetReserveNoMutex(p);

	2730 #ifdef SQLITE_HAS_CODEC

	2731 if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;

	2732 #endif

	2733 sqlite3BtreeLeave(p);

	2734 return n;

	2735 }

	2736

	2737

	2738 /*

	2739 ** Set the maximum page count for a database if mxPage is positive.

	2740 ** No changes are made if mxPage is 0 or negative.

	2741 ** Regardless of the value of mxPage, return the maximum page count.

	2742 */

	2743 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){

	2744 int n;

	2745 sqlite3BtreeEnter(p);

	2746 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);

	2747 sqlite3BtreeLeave(p);

	2748 return n;

	2749 }

	2750

	2751 /*

	2752 ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1. If newFlag is -1,

	2753 ** then make no changes. Always return the value of the BTS_SECURE_DELETE

	2754 ** setting after the change.

	2755 */

	2756 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){

	2757 int b;

	2758 if( p==0 ) return 0;

	2759 sqlite3BtreeEnter(p);

	2760 if( newFlag>=0 ){

	2761 p->pBt->btsFlags &= ~BTS_SECURE_DELETE;

	2762 if( newFlag ) p->pBt->btsFlags \|= BTS_SECURE_DELETE;

	2763 }

	2764 b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;

	2765 sqlite3BtreeLeave(p);

	2766 return b;

	2767 }

	2768

	2769 /*

	2770 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'

	2771 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it

	2772 ** is disabled. The default value for the auto-vacuum property is

	2773 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.

	2774 */

	2775 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){

	2776 #ifdef SQLITE_OMIT_AUTOVACUUM

	2777 return SQLITE_READONLY;

	2778 #else

	2779 BtShared *pBt = p->pBt;

	2780 int rc = SQLITE_OK;

	2781 u8 av = (u8)autoVacuum;

	2782

	2783 sqlite3BtreeEnter(p);

	2784 if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){

	2785 rc = SQLITE_READONLY;

	2786 }else{

	2787 pBt->autoVacuum = av ?1:0;

	2788 pBt->incrVacuum = av==2 ?1:0;

	2789 }

	2790 sqlite3BtreeLeave(p);

	2791 return rc;

	2792 #endif

	2793 }

	2794

	2795 /*

	2796 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is

	2797 ** enabled 1 is returned. Otherwise 0.

	2798 */

	2799 int sqlite3BtreeGetAutoVacuum(Btree *p){

	2800 #ifdef SQLITE_OMIT_AUTOVACUUM

	2801 return BTREE_AUTOVACUUM_NONE;

	2802 #else

	2803 int rc;

	2804 sqlite3BtreeEnter(p);

	2805 rc = (

	2806 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:

	2807 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:

	2808 BTREE_AUTOVACUUM_INCR

	2809 );

	2810 sqlite3BtreeLeave(p);

	2811 return rc;

	2812 #endif

	2813 }

	2814

	2815

	2816 /*

	2817 ** Get a reference to pPage1 of the database file. This will

	2818 ** also acquire a readlock on that file.

	2819 **

	2820 ** SQLITE_OK is returned on success. If the file is not a

	2821 ** well-formed database file, then SQLITE_CORRUPT is returned.

	2822 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM

	2823 ** is returned if we run out of memory.

	2824 */

	2825 static int lockBtree(BtShared *pBt){

	2826 int rc; /* Result code from subfunctions */

	2827 MemPage pPage1; / Page 1 of the database file */

	2828 int nPage; /* Number of pages in the database */

	2829 int nPageFile = 0; /* Number of pages in the database file */

	2830 int nPageHeader; /* Number of pages in the database according to hdr */

	2831

	2832 assert( sqlite3_mutex_held(pBt->mutex) );

	2833 assert( pBt->pPage1==0 );

	2834 rc = sqlite3PagerSharedLock(pBt->pPager);

	2835 if( rc!=SQLITE_OK ) return rc;

	2836 rc = btreeGetPage(pBt, 1, &pPage1, 0);

	2837 if( rc!=SQLITE_OK ) return rc;

	2838

	2839 /* Do some checking to help insure the file we opened really is

	2840 ** a valid database file.

	2841 */

	2842 nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);

	2843 sqlite3PagerPagecount(pBt->pPager, &nPageFile);

	2844 if( nPage==0 \|\| memcmp(24+(u8)pPage1->aData, 92+(u8)pPage1->aData,4)!=0 ){

	2845 nPage = nPageFile;

	2846 }

	2847 if( nPage>0 ){

	2848 u32 pageSize;

	2849 u32 usableSize;

	2850 u8 *page1 = pPage1->aData;

	2851 rc = SQLITE_NOTADB;

	2852 /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins

	2853 ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d

	2854 ** 61 74 20 33 00. */

	2855 if( memcmp(page1, zMagicHeader, 16)!=0 ){

	2856 goto page1_init_failed;

	2857 }

	2858

	2859 #ifdef SQLITE_OMIT_WAL

	2860 if( page1[18]>1 ){

	2861 pBt->btsFlags \|= BTS_READ_ONLY;

	2862 }

	2863 if( page1[19]>1 ){

	2864 goto page1_init_failed;

	2865 }

	2866 #else

	2867 if( page1[18]>2 ){

	2868 pBt->btsFlags \|= BTS_READ_ONLY;

	2869 }

	2870 if( page1[19]>2 ){

	2871 goto page1_init_failed;

	2872 }

	2873

	2874 /* If the write version is set to 2, this database should be accessed

	2875 ** in WAL mode. If the log is not already open, open it now. Then

	2876 ** return SQLITE_OK and return without populating BtShared.pPage1.

	2877 ** The caller detects this and calls this function again. This is

	2878 ** required as the version of page 1 currently in the page1 buffer

	2879 ** may not be the latest version - there may be a newer one in the log

	2880 ** file.

	2881 */

	2882 if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){

	2883 int isOpen = 0;

	2884 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);

	2885 if( rc!=SQLITE_OK ){

	2886 goto page1_init_failed;

	2887 }else{

	2888 #if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS

	2889 sqlite3 *db;

	2890 Db *pDb;

	2891 if( (db=pBt->db)!=0 && (pDb=db->aDb)!=0 ){

	2892 while( pDb->pBt==0 \|\| pDb->pBt->pBt!=pBt ){ pDb++; }

	2893 if( pDb->bSyncSet==0

	2894 && pDb->safety_level==SQLITE_DEFAULT_SYNCHRONOUS+1

	2895 ){

	2896 pDb->safety_level = SQLITE_DEFAULT_WAL_SYNCHRONOUS+1;

	2897 sqlite3PagerSetFlags(pBt->pPager,

	2898 pDb->safety_level \| (db->flags & PAGER_FLAGS_MASK));

	2899 }

	2900 }

	2901 #endif

	2902 if( isOpen==0 ){

	2903 releasePage(pPage1);

	2904 return SQLITE_OK;

	2905 }

	2906 }

	2907 rc = SQLITE_NOTADB;

	2908 }

	2909 #endif

	2910

	2911 /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload

	2912 ** fractions and the leaf payload fraction values must be 64, 32, and 32.

	2913 **

	2914 ** The original design allowed these amounts to vary, but as of

	2915 ** version 3.6.0, we require them to be fixed.

	2916 */

	2917 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){

	2918 goto page1_init_failed;

	2919 }

	2920 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is

	2921 ** determined by the 2-byte integer located at an offset of 16 bytes from

	2922 ** the beginning of the database file. */

	2923 pageSize = (page1[16]<<8) \| (page1[17]<<16);

	2924 /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two

	2925 ** between 512 and 65536 inclusive. */

	2926 if( ((pageSize-1)&pageSize)!=0

	2927 \|\| pageSize>SQLITE_MAX_PAGE_SIZE

	2928 \|\| pageSize<=256

	2929 ){

	2930 goto page1_init_failed;

	2931 }

	2932 assert( (pageSize & 7)==0 );

	2933 /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte

	2934 ** integer at offset 20 is the number of bytes of space at the end of

	2935 ** each page to reserve for extensions.

	2936 **

	2937 ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is

	2938 ** determined by the one-byte unsigned integer found at an offset of 20

	2939 ** into the database file header. */

	2940 usableSize = pageSize - page1[20];

	2941 if( (u32)pageSize!=pBt->pageSize ){

	2942 /* After reading the first page of the database assuming a page size

	2943 ** of BtShared.pageSize, we have discovered that the page-size is

	2944 ** actually pageSize. Unlock the database, leave pBt->pPage1 at

	2945 ** zero and return SQLITE_OK. The caller will call this function

	2946 ** again with the correct page-size.

	2947 */

	2948 releasePage(pPage1);

	2949 pBt->usableSize = usableSize;

	2950 pBt->pageSize = pageSize;

	2951 freeTempSpace(pBt);

	2952 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,

	2953 pageSize-usableSize);

	2954 return rc;

	2955 }

	2956 if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){

	2957 rc = SQLITE_CORRUPT_BKPT;

	2958 goto page1_init_failed;

	2959 }

	2960 /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to

	2961 ** be less than 480. In other words, if the page size is 512, then the

	2962 ** reserved space size cannot exceed 32. */

	2963 if( usableSize<480 ){

	2964 goto page1_init_failed;

	2965 }

	2966 pBt->pageSize = pageSize;

	2967 pBt->usableSize = usableSize;

	2968 #ifndef SQLITE_OMIT_AUTOVACUUM

	2969 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);

	2970 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);

	2971 #endif

	2972 }

	2973

	2974 /* maxLocal is the maximum amount of payload to store locally for

	2975 ** a cell. Make sure it is small enough so that at least minFanout

	2976 ** cells can will fit on one page. We assume a 10-byte page header.

	2977 ** Besides the payload, the cell must store:

	2978 ** 2-byte pointer to the cell

	2979 ** 4-byte child pointer

	2980 ** 9-byte nKey value

	2981 ** 4-byte nData value

	2982 ** 4-byte overflow page pointer

	2983 ** So a cell consists of a 2-byte pointer, a header which is as much as

	2984 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow

	2985 ** page pointer.

	2986 */

	2987 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);

	2988 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);

	2989 pBt->maxLeaf = (u16)(pBt->usableSize - 35);

	2990 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);

	2991 if( pBt->maxLocal>127 ){

	2992 pBt->max1bytePayload = 127;

	2993 }else{

	2994 pBt->max1bytePayload = (u8)pBt->maxLocal;

	2995 }

	2996 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );

	2997 pBt->pPage1 = pPage1;

	2998 pBt->nPage = nPage;

	2999 return SQLITE_OK;

	3000

	3001 page1_init_failed:

	3002 releasePage(pPage1);

	3003 pBt->pPage1 = 0;

	3004 return rc;

	3005 }

	3006

	3007 #ifndef NDEBUG

	3008 /*

	3009 ** Return the number of cursors open on pBt. This is for use

	3010 ** in assert() expressions, so it is only compiled if NDEBUG is not

	3011 ** defined.

	3012 **

	3013 ** Only write cursors are counted if wrOnly is true. If wrOnly is

	3014 ** false then all cursors are counted.

	3015 **

	3016 ** For the purposes of this routine, a cursor is any cursor that

	3017 ** is capable of reading or writing to the database. Cursors that

	3018 ** have been tripped into the CURSOR_FAULT state are not counted.

	3019 */

	3020 static int countValidCursors(BtShared *pBt, int wrOnly){

	3021 BtCursor *pCur;

	3022 int r = 0;

	3023 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){

	3024 if( (wrOnly==0 \|\| (pCur->curFlags & BTCF_WriteFlag)!=0)

	3025 && pCur->eState!=CURSOR_FAULT ) r++;

	3026 }

	3027 return r;

	3028 }

	3029 #endif

	3030

	3031 /*

	3032 ** If there are no outstanding cursors and we are not in the middle

	3033 ** of a transaction but there is a read lock on the database, then

	3034 ** this routine unrefs the first page of the database file which

	3035 ** has the effect of releasing the read lock.

	3036 **

	3037 ** If there is a transaction in progress, this routine is a no-op.

	3038 */

	3039 static void unlockBtreeIfUnused(BtShared *pBt){

	3040 assert( sqlite3_mutex_held(pBt->mutex) );

	3041 assert( countValidCursors(pBt,0)==0 \|\| pBt->inTransaction>TRANS_NONE );

	3042 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){

	3043 MemPage *pPage1 = pBt->pPage1;

	3044 assert( pPage1->aData );

	3045 assert( sqlite3PagerRefcount(pBt->pPager)==1 );

	3046 pBt->pPage1 = 0;

	3047 releasePageNotNull(pPage1);

	3048 }

	3049 }

	3050

	3051 /*

	3052 ** If pBt points to an empty file then convert that empty file

	3053 ** into a new empty database by initializing the first page of

	3054 ** the database.

	3055 */

	3056 static int newDatabase(BtShared *pBt){

	3057 MemPage *pP1;

	3058 unsigned char *data;

	3059 int rc;

	3060

	3061 assert( sqlite3_mutex_held(pBt->mutex) );

	3062 if( pBt->nPage>0 ){

	3063 return SQLITE_OK;

	3064 }

	3065 pP1 = pBt->pPage1;

	3066 assert( pP1!=0 );

	3067 data = pP1->aData;

	3068 rc = sqlite3PagerWrite(pP1->pDbPage);

	3069 if( rc ) return rc;

	3070 memcpy(data, zMagicHeader, sizeof(zMagicHeader));

	3071 assert( sizeof(zMagicHeader)==16 );

	3072 data[16] = (u8)((pBt->pageSize>>8)&0xff);

	3073 data[17] = (u8)((pBt->pageSize>>16)&0xff);

	3074 data[18] = 1;

	3075 data[19] = 1;

	3076 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);

	3077 data[20] = (u8)(pBt->pageSize - pBt->usableSize);

	3078 data[21] = 64;

	3079 data[22] = 32;

	3080 data[23] = 32;

	3081 memset(&data[24], 0, 100-24);

	3082 zeroPage(pP1, PTF_INTKEY\|PTF_LEAF\|PTF_LEAFDATA );

	3083 pBt->btsFlags \|= BTS_PAGESIZE_FIXED;

	3084 #ifndef SQLITE_OMIT_AUTOVACUUM

	3085 assert( pBt->autoVacuum==1 \|\| pBt->autoVacuum==0 );

	3086 assert( pBt->incrVacuum==1 \|\| pBt->incrVacuum==0 );

	3087 put4byte(&data[36 + 4*4], pBt->autoVacuum);

	3088 put4byte(&data[36 + 7*4], pBt->incrVacuum);

	3089 #endif

	3090 pBt->nPage = 1;

	3091 data[31] = 1;

	3092 return SQLITE_OK;

	3093 }

	3094

	3095 /*

	3096 ** Initialize the first page of the database file (creating a database

	3097 ** consisting of a single page and no schema objects). Return SQLITE_OK

	3098 ** if successful, or an SQLite error code otherwise.

	3099 */

	3100 int sqlite3BtreeNewDb(Btree *p){

	3101 int rc;

	3102 sqlite3BtreeEnter(p);

	3103 p->pBt->nPage = 0;

	3104 rc = newDatabase(p->pBt);

	3105 sqlite3BtreeLeave(p);

	3106 return rc;

	3107 }

	3108

	3109 /*

	3110 ** Attempt to start a new transaction. A write-transaction

	3111 ** is started if the second argument is nonzero, otherwise a read-

	3112 ** transaction. If the second argument is 2 or more and exclusive

	3113 ** transaction is started, meaning that no other process is allowed

	3114 ** to access the database. A preexisting transaction may not be

	3115 ** upgraded to exclusive by calling this routine a second time - the

	3116 ** exclusivity flag only works for a new transaction.

	3117 **

	3118 ** A write-transaction must be started before attempting any

	3119 ** changes to the database. None of the following routines

	3120 ** will work unless a transaction is started first:

	3121 **

	3122 ** sqlite3BtreeCreateTable()

	3123 ** sqlite3BtreeCreateIndex()

	3124 ** sqlite3BtreeClearTable()

	3125 ** sqlite3BtreeDropTable()

	3126 ** sqlite3BtreeInsert()

	3127 ** sqlite3BtreeDelete()

	3128 ** sqlite3BtreeUpdateMeta()

	3129 **

	3130 ** If an initial attempt to acquire the lock fails because of lock contention

	3131 ** and the database was previously unlocked, then invoke the busy handler

	3132 ** if there is one. But if there was previously a read-lock, do not

	3133 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is

	3134 ** returned when there is already a read-lock in order to avoid a deadlock.

	3135 **

	3136 ** Suppose there are two processes A and B. A has a read lock and B has

	3137 ** a reserved lock. B tries to promote to exclusive but is blocked because

	3138 ** of A's read lock. A tries to promote to reserved but is blocked by B.

	3139 ** One or the other of the two processes must give way or there can be

	3140 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback

	3141 ** when A already has a read lock, we encourage A to give up and let B

	3142 ** proceed.

	3143 */

	3144 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){

	3145 BtShared *pBt = p->pBt;

	3146 int rc = SQLITE_OK;

	3147

	3148 sqlite3BtreeEnter(p);

	3149 btreeIntegrity(p);

	3150

	3151 /* If the btree is already in a write-transaction, or it

	3152 ** is already in a read-transaction and a read-transaction

	3153 ** is requested, this is a no-op.

	3154 */

	3155 if( p->inTrans==TRANS_WRITE \|\| (p->inTrans==TRANS_READ && !wrflag) ){

	3156 goto trans_begun;

	3157 }

	3158 assert( pBt->inTransaction==TRANS_WRITE \|\| IfNotOmitAV(pBt->bDoTruncate)==0 );

	3159

	3160 /* Write transactions are not possible on a read-only database */

	3161 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){

	3162 rc = SQLITE_READONLY;

	3163 goto trans_begun;

	3164 }

	3165

	3166 #ifndef SQLITE_OMIT_SHARED_CACHE

	3167 {

	3168 sqlite3 *pBlock = 0;

	3169 /* If another database handle has already opened a write transaction

	3170 ** on this shared-btree structure and a second write transaction is

	3171 ** requested, return SQLITE_LOCKED.

	3172 */

	3173 if( (wrflag && pBt->inTransaction==TRANS_WRITE)

	3174 \|\| (pBt->btsFlags & BTS_PENDING)!=0

	3175 ){

	3176 pBlock = pBt->pWriter->db;

	3177 }else if( wrflag>1 ){

	3178 BtLock *pIter;

	3179 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){

	3180 if( pIter->pBtree!=p ){

	3181 pBlock = pIter->pBtree->db;

	3182 break;

	3183 }

	3184 }

	3185 }

	3186 if( pBlock ){

	3187 sqlite3ConnectionBlocked(p->db, pBlock);

	3188 rc = SQLITE_LOCKED_SHAREDCACHE;

	3189 goto trans_begun;

	3190 }

	3191 }

	3192 #endif

	3193

	3194 /* Any read-only or read-write transaction implies a read-lock on

	3195 ** page 1. So if some other shared-cache client already has a write-lock

	3196 ** on page 1, the transaction cannot be opened. */

	3197 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);

	3198 if( SQLITE_OK!=rc ) goto trans_begun;

	3199

	3200 pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;

	3201 if( pBt->nPage==0 ) pBt->btsFlags \|= BTS_INITIALLY_EMPTY;

	3202 do {

	3203 /* Call lockBtree() until either pBt->pPage1 is populated or

	3204 ** lockBtree() returns something other than SQLITE_OK. lockBtree()

	3205 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after

	3206 ** reading page 1 it discovers that the page-size of the database

	3207 ** file is not pBt->pageSize. In this case lockBtree() will update

	3208 ** pBt->pageSize to the page-size of the file on disk.

	3209 */

	3210 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

	3211

	3212 if( rc==SQLITE_OK && wrflag ){

	3213 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){

	3214 rc = SQLITE_READONLY;

	3215 }else{

	3216 rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));

	3217 if( rc==SQLITE_OK ){

	3218 rc = newDatabase(pBt);

	3219 }

	3220 }

	3221 }

	3222

	3223 if( rc!=SQLITE_OK ){

	3224 unlockBtreeIfUnused(pBt);

	3225 }

	3226 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&

	3227 btreeInvokeBusyHandler(pBt) );

	3228

	3229 if( rc==SQLITE_OK ){

	3230 if( p->inTrans==TRANS_NONE ){

	3231 pBt->nTransaction++;

	3232 #ifndef SQLITE_OMIT_SHARED_CACHE

	3233 if( p->sharable ){

	3234 assert( p->lock.pBtree==p && p->lock.iTable==1 );

	3235 p->lock.eLock = READ_LOCK;

	3236 p->lock.pNext = pBt->pLock;

	3237 pBt->pLock = &p->lock;

	3238 }

	3239 #endif

	3240 }

	3241 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);

	3242 if( p->inTrans>pBt->inTransaction ){

	3243 pBt->inTransaction = p->inTrans;

	3244 }

	3245 if( wrflag ){

	3246 MemPage *pPage1 = pBt->pPage1;

	3247 #ifndef SQLITE_OMIT_SHARED_CACHE

	3248 assert( !pBt->pWriter );

	3249 pBt->pWriter = p;

	3250 pBt->btsFlags &= ~BTS_EXCLUSIVE;

	3251 if( wrflag>1 ) pBt->btsFlags \|= BTS_EXCLUSIVE;

	3252 #endif

	3253

	3254 /* If the db-size header field is incorrect (as it may be if an old

	3255 ** client has been writing the database file), update it now. Doing

	3256 ** this sooner rather than later means the database size can safely

	3257 ** re-read the database size from page 1 if a savepoint or transaction

	3258 ** rollback occurs within the transaction.

	3259 */

	3260 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){

	3261 rc = sqlite3PagerWrite(pPage1->pDbPage);

	3262 if( rc==SQLITE_OK ){

	3263 put4byte(&pPage1->aData[28], pBt->nPage);

	3264 }

	3265 }

	3266 }

	3267 }

	3268

	3269

	3270 trans_begun:

	3271 if( rc==SQLITE_OK && wrflag ){

	3272 /* This call makes sure that the pager has the correct number of

	3273 ** open savepoints. If the second parameter is greater than 0 and

	3274 ** the sub-journal is not already open, then it will be opened here.

	3275 */

	3276 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);

	3277 }

	3278

	3279 btreeIntegrity(p);

	3280 sqlite3BtreeLeave(p);

	3281 return rc;

	3282 }

	3283

	3284 #ifndef SQLITE_OMIT_AUTOVACUUM

	3285

	3286 /*

	3287 ** Set the pointer-map entries for all children of page pPage. Also, if

	3288 ** pPage contains cells that point to overflow pages, set the pointer

	3289 ** map entries for the overflow pages as well.

	3290 */

	3291 static int setChildPtrmaps(MemPage *pPage){

	3292 int i; /* Counter variable */

	3293 int nCell; /* Number of cells in page pPage */

	3294 int rc; /* Return code */

	3295 BtShared *pBt = pPage->pBt;

	3296 Pgno pgno = pPage->pgno;

	3297

	3298 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	3299 rc = btreeInitPage(pPage);

	3300 if( rc!=SQLITE_OK ) return rc;

	3301 nCell = pPage->nCell;

	3302

	3303 for(i=0; i<nCell; i++){

	3304 u8 *pCell = findCell(pPage, i);

	3305

	3306 ptrmapPutOvflPtr(pPage, pCell, &rc);

	3307

	3308 if( !pPage->leaf ){

	3309 Pgno childPgno = get4byte(pCell);

	3310 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);

	3311 }

	3312 }

	3313

	3314 if( !pPage->leaf ){

	3315 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	3316 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);

	3317 }

	3318

	3319 return rc;

	3320 }

	3321

	3322 /*

	3323 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so

	3324 ** that it points to iTo. Parameter eType describes the type of pointer to

	3325 ** be modified, as follows:

	3326 **

	3327 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child

	3328 ** page of pPage.

	3329 **

	3330 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow

	3331 ** page pointed to by one of the cells on pPage.

	3332 **

	3333 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next

	3334 ** overflow page in the list.

	3335 */

	3336 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){

	3337 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	3338 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	3339 if( eType==PTRMAP_OVERFLOW2 ){

	3340 /* The pointer is always the first 4 bytes of the page in this case. */

	3341 if( get4byte(pPage->aData)!=iFrom ){

	3342 return SQLITE_CORRUPT_BKPT;

	3343 }

	3344 put4byte(pPage->aData, iTo);

	3345 }else{

	3346 int i;

	3347 int nCell;

	3348 int rc;

	3349

	3350 rc = btreeInitPage(pPage);

	3351 if( rc ) return rc;

	3352 nCell = pPage->nCell;

	3353

	3354 for(i=0; i<nCell; i++){

	3355 u8 *pCell = findCell(pPage, i);

	3356 if( eType==PTRMAP_OVERFLOW1 ){

	3357 CellInfo info;

	3358 pPage->xParseCell(pPage, pCell, &info);

	3359 if( info.nLocal<info.nPayload ){

	3360 if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){

	3361 return SQLITE_CORRUPT_BKPT;

	3362 }

	3363 if( iFrom==get4byte(pCell+info.nSize-4) ){

	3364 put4byte(pCell+info.nSize-4, iTo);

	3365 break;

	3366 }

	3367 }

	3368 }else{

	3369 if( get4byte(pCell)==iFrom ){

	3370 put4byte(pCell, iTo);

	3371 break;

	3372 }

	3373 }

	3374 }

	3375

	3376 if( i==nCell ){

	3377 if( eType!=PTRMAP_BTREE \|\|

	3378 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){

	3379 return SQLITE_CORRUPT_BKPT;

	3380 }

	3381 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);

	3382 }

	3383 }

	3384 return SQLITE_OK;

	3385 }

	3386

	3387

	3388 /*

	3389 ** Move the open database page pDbPage to location iFreePage in the

	3390 ** database. The pDbPage reference remains valid.

	3391 **

	3392 ** The isCommit flag indicates that there is no need to remember that

	3393 ** the journal needs to be sync()ed before database page pDbPage->pgno

	3394 ** can be written to. The caller has already promised not to write to that

	3395 ** page.

	3396 */

	3397 static int relocatePage(

	3398 BtShared pBt, / Btree */

	3399 MemPage pDbPage, / Open page to move */

	3400 u8 eType, /* Pointer map 'type' entry for pDbPage */

	3401 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */

	3402 Pgno iFreePage, /* The location to move pDbPage to */

	3403 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */

	3404 ){

	3405 MemPage pPtrPage; / The page that contains a pointer to pDbPage */

	3406 Pgno iDbPage = pDbPage->pgno;

	3407 Pager *pPager = pBt->pPager;

	3408 int rc;

	3409

	3410 assert( eType==PTRMAP_OVERFLOW2 \|\| eType==PTRMAP_OVERFLOW1 \|\|

	3411 eType==PTRMAP_BTREE \|\| eType==PTRMAP_ROOTPAGE );

	3412 assert( sqlite3_mutex_held(pBt->mutex) );

	3413 assert( pDbPage->pBt==pBt );

	3414

	3415 /* Move page iDbPage from its current location to page number iFreePage */

	3416 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",

	3417 iDbPage, iFreePage, iPtrPage, eType));

	3418 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);

	3419 if( rc!=SQLITE_OK ){

	3420 return rc;

	3421 }

	3422 pDbPage->pgno = iFreePage;

	3423

	3424 /* If pDbPage was a btree-page, then it may have child pages and/or cells

	3425 ** that point to overflow pages. The pointer map entries for all these

	3426 ** pages need to be changed.

	3427 **

	3428 ** If pDbPage is an overflow page, then the first 4 bytes may store a

	3429 ** pointer to a subsequent overflow page. If this is the case, then

	3430 ** the pointer map needs to be updated for the subsequent overflow page.

	3431 */

	3432 if( eType==PTRMAP_BTREE \|\| eType==PTRMAP_ROOTPAGE ){

	3433 rc = setChildPtrmaps(pDbPage);

	3434 if( rc!=SQLITE_OK ){

	3435 return rc;

	3436 }

	3437 }else{

	3438 Pgno nextOvfl = get4byte(pDbPage->aData);

	3439 if( nextOvfl!=0 ){

	3440 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);

	3441 if( rc!=SQLITE_OK ){

	3442 return rc;

	3443 }

	3444 }

	3445 }

	3446

	3447 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so

	3448 ** that it points at iFreePage. Also fix the pointer map entry for

	3449 ** iPtrPage.

	3450 */

	3451 if( eType!=PTRMAP_ROOTPAGE ){

	3452 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);

	3453 if( rc!=SQLITE_OK ){

	3454 return rc;

	3455 }

	3456 rc = sqlite3PagerWrite(pPtrPage->pDbPage);

	3457 if( rc!=SQLITE_OK ){

	3458 releasePage(pPtrPage);

	3459 return rc;

	3460 }

	3461 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);

	3462 releasePage(pPtrPage);

	3463 if( rc==SQLITE_OK ){

	3464 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);

	3465 }

	3466 }

	3467 return rc;

	3468 }

	3469

	3470 /* Forward declaration required by incrVacuumStep(). */

	3471 static int allocateBtreePage(BtShared , MemPage , Pgno , Pgno, u8);

	3472

	3473 /*

	3474 ** Perform a single step of an incremental-vacuum. If successful, return

	3475 ** SQLITE_OK. If there is no work to do (and therefore no point in

	3476 ** calling this function again), return SQLITE_DONE. Or, if an error

	3477 ** occurs, return some other error code.

	3478 **

	3479 ** More specifically, this function attempts to re-organize the database so

	3480 ** that the last page of the file currently in use is no longer in use.

	3481 **

	3482 ** Parameter nFin is the number of pages that this database would contain

	3483 ** were this function called until it returns SQLITE_DONE.

	3484 **

	3485 ** If the bCommit parameter is non-zero, this function assumes that the

	3486 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE

	3487 ** or an error. bCommit is passed true for an auto-vacuum-on-commit

	3488 ** operation, or false for an incremental vacuum.

	3489 */

	3490 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){

	3491 Pgno nFreeList; /* Number of pages still on the free-list */

	3492 int rc;

	3493

	3494 assert( sqlite3_mutex_held(pBt->mutex) );

	3495 assert( iLastPg>nFin );

	3496

	3497 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){

	3498 u8 eType;

	3499 Pgno iPtrPage;

	3500

	3501 nFreeList = get4byte(&pBt->pPage1->aData[36]);

	3502 if( nFreeList==0 ){

	3503 return SQLITE_DONE;

	3504 }

	3505

	3506 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);

	3507 if( rc!=SQLITE_OK ){

	3508 return rc;

	3509 }

	3510 if( eType==PTRMAP_ROOTPAGE ){

	3511 return SQLITE_CORRUPT_BKPT;

	3512 }

	3513

	3514 if( eType==PTRMAP_FREEPAGE ){

	3515 if( bCommit==0 ){

	3516 /* Remove the page from the files free-list. This is not required

	3517 ** if bCommit is non-zero. In that case, the free-list will be

	3518 ** truncated to zero after this function returns, so it doesn't

	3519 ** matter if it still contains some garbage entries.

	3520 */

	3521 Pgno iFreePg;

	3522 MemPage *pFreePg;

	3523 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);

	3524 if( rc!=SQLITE_OK ){

	3525 return rc;

	3526 }

	3527 assert( iFreePg==iLastPg );

	3528 releasePage(pFreePg);

	3529 }

	3530 } else {

	3531 Pgno iFreePg; /* Index of free page to move pLastPg to */

	3532 MemPage *pLastPg;

	3533 u8 eMode = BTALLOC_ANY; /* Mode parameter for allocateBtreePage() */

	3534 Pgno iNear = 0; /* nearby parameter for allocateBtreePage() */

	3535

	3536 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);

	3537 if( rc!=SQLITE_OK ){

	3538 return rc;

	3539 }

	3540

	3541 /* If bCommit is zero, this loop runs exactly once and page pLastPg

	3542 ** is swapped with the first free page pulled off the free list.

	3543 **

	3544 ** On the other hand, if bCommit is greater than zero, then keep

	3545 ** looping until a free-page located within the first nFin pages

	3546 ** of the file is found.

	3547 */

	3548 if( bCommit==0 ){

	3549 eMode = BTALLOC_LE;

	3550 iNear = nFin;

	3551 }

	3552 do {

	3553 MemPage *pFreePg;

	3554 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);

	3555 if( rc!=SQLITE_OK ){

	3556 releasePage(pLastPg);

	3557 return rc;

	3558 }

	3559 releasePage(pFreePg);

	3560 }while( bCommit && iFreePg>nFin );

	3561 assert( iFreePg<iLastPg );

	3562

	3563 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);

	3564 releasePage(pLastPg);

	3565 if( rc!=SQLITE_OK ){

	3566 return rc;

	3567 }

	3568 }

	3569 }

	3570

	3571 if( bCommit==0 ){

	3572 do {

	3573 iLastPg--;

	3574 }while( iLastPg==PENDING_BYTE_PAGE(pBt) \|\| PTRMAP_ISPAGE(pBt, iLastPg) );

	3575 pBt->bDoTruncate = 1;

	3576 pBt->nPage = iLastPg;

	3577 }

	3578 return SQLITE_OK;

	3579 }

	3580

	3581 /*

	3582 ** The database opened by the first argument is an auto-vacuum database

	3583 ** nOrig pages in size containing nFree free pages. Return the expected

	3584 ** size of the database in pages following an auto-vacuum operation.

	3585 */

	3586 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){

	3587 int nEntry; /* Number of entries on one ptrmap page */

	3588 Pgno nPtrmap; /* Number of PtrMap pages to be freed */

	3589 Pgno nFin; /* Return value */

	3590

	3591 nEntry = pBt->usableSize/5;

	3592 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;

	3593 nFin = nOrig - nFree - nPtrmap;

	3594 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){

	3595 nFin--;

	3596 }

	3597 while( PTRMAP_ISPAGE(pBt, nFin) \|\| nFin==PENDING_BYTE_PAGE(pBt) ){

	3598 nFin--;

	3599 }

	3600

	3601 return nFin;

	3602 }

	3603

	3604 /*

	3605 ** A write-transaction must be opened before calling this function.

	3606 ** It performs a single unit of work towards an incremental vacuum.

	3607 **

	3608 ** If the incremental vacuum is finished after this function has run,

	3609 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,

	3610 ** SQLITE_OK is returned. Otherwise an SQLite error code.

	3611 */

	3612 int sqlite3BtreeIncrVacuum(Btree *p){

	3613 int rc;

	3614 BtShared *pBt = p->pBt;

	3615

	3616 sqlite3BtreeEnter(p);

	3617 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );

	3618 if( !pBt->autoVacuum ){

	3619 rc = SQLITE_DONE;

	3620 }else{

	3621 Pgno nOrig = btreePagecount(pBt);

	3622 Pgno nFree = get4byte(&pBt->pPage1->aData[36]);

	3623 Pgno nFin = finalDbSize(pBt, nOrig, nFree);

	3624

	3625 if( nOrig<nFin ){

	3626 rc = SQLITE_CORRUPT_BKPT;

	3627 }else if( nFree>0 ){

	3628 rc = saveAllCursors(pBt, 0, 0);

	3629 if( rc==SQLITE_OK ){

	3630 invalidateAllOverflowCache(pBt);

	3631 rc = incrVacuumStep(pBt, nFin, nOrig, 0);

	3632 }

	3633 if( rc==SQLITE_OK ){

	3634 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	3635 put4byte(&pBt->pPage1->aData[28], pBt->nPage);

	3636 }

	3637 }else{

	3638 rc = SQLITE_DONE;

	3639 }

	3640 }

	3641 sqlite3BtreeLeave(p);

	3642 return rc;

	3643 }

	3644

	3645 /*

	3646 ** This routine is called prior to sqlite3PagerCommit when a transaction

	3647 ** is committed for an auto-vacuum database.

	3648 **

	3649 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages

	3650 ** the database file should be truncated to during the commit process.

	3651 ** i.e. the database has been reorganized so that only the first *pnTrunc

	3652 ** pages are in use.

	3653 */

	3654 static int autoVacuumCommit(BtShared *pBt){

	3655 int rc = SQLITE_OK;

	3656 Pager *pPager = pBt->pPager;

	3657 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )

	3658

	3659 assert( sqlite3_mutex_held(pBt->mutex) );

	3660 invalidateAllOverflowCache(pBt);

	3661 assert(pBt->autoVacuum);

	3662 if( !pBt->incrVacuum ){

	3663 Pgno nFin; /* Number of pages in database after autovacuuming */

	3664 Pgno nFree; /* Number of pages on the freelist initially */

	3665 Pgno iFree; /* The next page to be freed */

	3666 Pgno nOrig; /* Database size before freeing */

	3667

	3668 nOrig = btreePagecount(pBt);

	3669 if( PTRMAP_ISPAGE(pBt, nOrig) \|\| nOrig==PENDING_BYTE_PAGE(pBt) ){

	3670 /* It is not possible to create a database for which the final page

	3671 ** is either a pointer-map page or the pending-byte page. If one

	3672 ** is encountered, this indicates corruption.

	3673 */

	3674 return SQLITE_CORRUPT_BKPT;

	3675 }

	3676

	3677 nFree = get4byte(&pBt->pPage1->aData[36]);

	3678 nFin = finalDbSize(pBt, nOrig, nFree);

	3679 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;

	3680 if( nFin<nOrig ){

	3681 rc = saveAllCursors(pBt, 0, 0);

	3682 }

	3683 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){

	3684 rc = incrVacuumStep(pBt, nFin, iFree, 1);

	3685 }

	3686 if( (rc==SQLITE_DONE \|\| rc==SQLITE_OK) && nFree>0 ){

	3687 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	3688 put4byte(&pBt->pPage1->aData[32], 0);

	3689 put4byte(&pBt->pPage1->aData[36], 0);

	3690 put4byte(&pBt->pPage1->aData[28], nFin);

	3691 pBt->bDoTruncate = 1;

	3692 pBt->nPage = nFin;

	3693 }

	3694 if( rc!=SQLITE_OK ){

	3695 sqlite3PagerRollback(pPager);

	3696 }

	3697 }

	3698

	3699 assert( nRef>=sqlite3PagerRefcount(pPager) );

	3700 return rc;

	3701 }

	3702

	3703 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */

	3704 # define setChildPtrmaps(x) SQLITE_OK

	3705 #endif

	3706

	3707 /*

	3708 ** This routine does the first phase of a two-phase commit. This routine

	3709 ** causes a rollback journal to be created (if it does not already exist)

	3710 ** and populated with enough information so that if a power loss occurs

	3711 ** the database can be restored to its original state by playing back

	3712 ** the journal. Then the contents of the journal are flushed out to

	3713 ** the disk. After the journal is safely on oxide, the changes to the

	3714 ** database are written into the database file and flushed to oxide.

	3715 ** At the end of this call, the rollback journal still exists on the

	3716 ** disk and we are still holding all locks, so the transaction has not

	3717 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the

	3718 ** commit process.

	3719 **

	3720 ** This call is a no-op if no write-transaction is currently active on pBt.

	3721 **

	3722 ** Otherwise, sync the database file for the btree pBt. zMaster points to

	3723 ** the name of a master journal file that should be written into the

	3724 ** individual journal file, or is NULL, indicating no master journal file

	3725 ** (single database transaction).

	3726 **

	3727 ** When this is called, the master journal should already have been

	3728 ** created, populated with this journal pointer and synced to disk.

	3729 **

	3730 ** Once this is routine has returned, the only thing required to commit

	3731 ** the write-transaction for this database file is to delete the journal.

	3732 */

	3733 int sqlite3BtreeCommitPhaseOne(Btree p, const char zMaster){

	3734 int rc = SQLITE_OK;

	3735 if( p->inTrans==TRANS_WRITE ){

	3736 BtShared *pBt = p->pBt;

	3737 sqlite3BtreeEnter(p);

	3738 #ifndef SQLITE_OMIT_AUTOVACUUM

	3739 if( pBt->autoVacuum ){

	3740 rc = autoVacuumCommit(pBt);

	3741 if( rc!=SQLITE_OK ){

	3742 sqlite3BtreeLeave(p);

	3743 return rc;

	3744 }

	3745 }

	3746 if( pBt->bDoTruncate ){

	3747 sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);

	3748 }

	3749 #endif

	3750 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);

	3751 sqlite3BtreeLeave(p);

	3752 }

	3753 return rc;

	3754 }

	3755

	3756 /*

	3757 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()

	3758 ** at the conclusion of a transaction.

	3759 */

	3760 static void btreeEndTransaction(Btree *p){

	3761 BtShared *pBt = p->pBt;

	3762 sqlite3 *db = p->db;

	3763 assert( sqlite3BtreeHoldsMutex(p) );

	3764

	3765 #ifndef SQLITE_OMIT_AUTOVACUUM

	3766 pBt->bDoTruncate = 0;

	3767 #endif

	3768 if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){

	3769 /* If there are other active statements that belong to this database

	3770 ** handle, downgrade to a read-only transaction. The other statements

	3771 ** may still be reading from the database. */

	3772 downgradeAllSharedCacheTableLocks(p);

	3773 p->inTrans = TRANS_READ;

	3774 }else{

	3775 /* If the handle had any kind of transaction open, decrement the

	3776 ** transaction count of the shared btree. If the transaction count

	3777 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()

	3778 ** call below will unlock the pager. */

	3779 if( p->inTrans!=TRANS_NONE ){

	3780 clearAllSharedCacheTableLocks(p);

	3781 pBt->nTransaction--;

	3782 if( 0==pBt->nTransaction ){

	3783 pBt->inTransaction = TRANS_NONE;

	3784 }

	3785 }

	3786

	3787 /* Set the current transaction state to TRANS_NONE and unlock the

	3788 ** pager if this call closed the only read or write transaction. */

	3789 p->inTrans = TRANS_NONE;

	3790 unlockBtreeIfUnused(pBt);

	3791 }

	3792

	3793 btreeIntegrity(p);

	3794 }

	3795

	3796 /*

	3797 ** Commit the transaction currently in progress.

	3798 **

	3799 ** This routine implements the second phase of a 2-phase commit. The

	3800 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should

	3801 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne()

	3802 ** routine did all the work of writing information out to disk and flushing the

	3803 ** contents so that they are written onto the disk platter. All this

	3804 ** routine has to do is delete or truncate or zero the header in the

	3805 ** the rollback journal (which causes the transaction to commit) and

	3806 ** drop locks.

	3807 **

	3808 ** Normally, if an error occurs while the pager layer is attempting to

	3809 ** finalize the underlying journal file, this function returns an error and

	3810 ** the upper layer will attempt a rollback. However, if the second argument

	3811 ** is non-zero then this b-tree transaction is part of a multi-file

	3812 ** transaction. In this case, the transaction has already been committed

	3813 ** (by deleting a master journal file) and the caller will ignore this

	3814 ** functions return code. So, even if an error occurs in the pager layer,

	3815 ** reset the b-tree objects internal state to indicate that the write

	3816 ** transaction has been closed. This is quite safe, as the pager will have

	3817 ** transitioned to the error state.

	3818 **

	3819 ** This will release the write lock on the database file. If there

	3820 ** are no active cursors, it also releases the read lock.

	3821 */

	3822 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){

	3823

	3824 if( p->inTrans==TRANS_NONE ) return SQLITE_OK;

	3825 sqlite3BtreeEnter(p);

	3826 btreeIntegrity(p);

	3827

	3828 /* If the handle has a write-transaction open, commit the shared-btrees

	3829 ** transaction and set the shared state to TRANS_READ.

	3830 */

	3831 if( p->inTrans==TRANS_WRITE ){

	3832 int rc;

	3833 BtShared *pBt = p->pBt;

	3834 assert( pBt->inTransaction==TRANS_WRITE );

	3835 assert( pBt->nTransaction>0 );

	3836 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);

	3837 if( rc!=SQLITE_OK && bCleanup==0 ){

	3838 sqlite3BtreeLeave(p);

	3839 return rc;

	3840 }

	3841 p->iDataVersion--; /* Compensate for pPager->iDataVersion++; */

	3842 pBt->inTransaction = TRANS_READ;

	3843 btreeClearHasContent(pBt);

	3844 }

	3845

	3846 btreeEndTransaction(p);

	3847 sqlite3BtreeLeave(p);

	3848 return SQLITE_OK;

	3849 }

	3850

	3851 /*

	3852 ** Do both phases of a commit.

	3853 */

	3854 int sqlite3BtreeCommit(Btree *p){

	3855 int rc;

	3856 sqlite3BtreeEnter(p);

	3857 rc = sqlite3BtreeCommitPhaseOne(p, 0);

	3858 if( rc==SQLITE_OK ){

	3859 rc = sqlite3BtreeCommitPhaseTwo(p, 0);

	3860 }

	3861 sqlite3BtreeLeave(p);

	3862 return rc;

	3863 }

	3864

	3865 /*

	3866 ** This routine sets the state to CURSOR_FAULT and the error

	3867 ** code to errCode for every cursor on any BtShared that pBtree

	3868 ** references. Or if the writeOnly flag is set to 1, then only

	3869 ** trip write cursors and leave read cursors unchanged.

	3870 **

	3871 ** Every cursor is a candidate to be tripped, including cursors

	3872 ** that belong to other database connections that happen to be

	3873 ** sharing the cache with pBtree.

	3874 **

	3875 ** This routine gets called when a rollback occurs. If the writeOnly

	3876 ** flag is true, then only write-cursors need be tripped - read-only

	3877 ** cursors save their current positions so that they may continue

	3878 ** following the rollback. Or, if writeOnly is false, all cursors are

	3879 ** tripped. In general, writeOnly is false if the transaction being

	3880 ** rolled back modified the database schema. In this case b-tree root

	3881 ** pages may be moved or deleted from the database altogether, making

	3882 ** it unsafe for read cursors to continue.

	3883 **

	3884 ** If the writeOnly flag is true and an error is encountered while

	3885 ** saving the current position of a read-only cursor, all cursors,

	3886 ** including all read-cursors are tripped.

	3887 **

	3888 ** SQLITE_OK is returned if successful, or if an error occurs while

	3889 ** saving a cursor position, an SQLite error code.

	3890 */

	3891 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){

	3892 BtCursor *p;

	3893 int rc = SQLITE_OK;

	3894

	3895 assert( (writeOnly==0 \|\| writeOnly==1) && BTCF_WriteFlag==1 );

	3896 if( pBtree ){

	3897 sqlite3BtreeEnter(pBtree);

	3898 for(p=pBtree->pBt->pCursor; p; p=p->pNext){

	3899 int i;

	3900 if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){

	3901 if( p->eState==CURSOR_VALID \|\| p->eState==CURSOR_SKIPNEXT ){

	3902 rc = saveCursorPosition(p);

	3903 if( rc!=SQLITE_OK ){

	3904 (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);

	3905 break;

	3906 }

	3907 }

	3908 }else{

	3909 sqlite3BtreeClearCursor(p);

	3910 p->eState = CURSOR_FAULT;

	3911 p->skipNext = errCode;

	3912 }

	3913 for(i=0; i<=p->iPage; i++){

	3914 releasePage(p->apPage[i]);

	3915 p->apPage[i] = 0;

	3916 }

	3917 }

	3918 sqlite3BtreeLeave(pBtree);

	3919 }

	3920 return rc;

	3921 }

	3922

	3923 /*

	3924 ** Rollback the transaction in progress.

	3925 **

	3926 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).

	3927 ** Only write cursors are tripped if writeOnly is true but all cursors are

	3928 ** tripped if writeOnly is false. Any attempt to use

	3929 ** a tripped cursor will result in an error.

	3930 **

	3931 ** This will release the write lock on the database file. If there

	3932 ** are no active cursors, it also releases the read lock.

	3933 */

	3934 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){

	3935 int rc;

	3936 BtShared *pBt = p->pBt;

	3937 MemPage *pPage1;

	3938

	3939 assert( writeOnly==1 \|\| writeOnly==0 );

	3940 assert( tripCode==SQLITE_ABORT_ROLLBACK \|\| tripCode==SQLITE_OK );

	3941 sqlite3BtreeEnter(p);

	3942 if( tripCode==SQLITE_OK ){

	3943 rc = tripCode = saveAllCursors(pBt, 0, 0);

	3944 if( rc ) writeOnly = 0;

	3945 }else{

	3946 rc = SQLITE_OK;

	3947 }

	3948 if( tripCode ){

	3949 int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);

	3950 assert( rc==SQLITE_OK \|\| (writeOnly==0 && rc2==SQLITE_OK) );

	3951 if( rc2!=SQLITE_OK ) rc = rc2;

	3952 }

	3953 btreeIntegrity(p);

	3954

	3955 if( p->inTrans==TRANS_WRITE ){

	3956 int rc2;

	3957

	3958 assert( TRANS_WRITE==pBt->inTransaction );

	3959 rc2 = sqlite3PagerRollback(pBt->pPager);

	3960 if( rc2!=SQLITE_OK ){

	3961 rc = rc2;

	3962 }

	3963

	3964 /* The rollback may have destroyed the pPage1->aData value. So

	3965 ** call btreeGetPage() on page 1 again to make

	3966 ** sure pPage1->aData is set correctly. */

	3967 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){

	3968 int nPage = get4byte(28+(u8*)pPage1->aData);

	3969 testcase( nPage==0 );

	3970 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);

	3971 testcase( pBt->nPage!=nPage );

	3972 pBt->nPage = nPage;

	3973 releasePage(pPage1);

	3974 }

	3975 assert( countValidCursors(pBt, 1)==0 );

	3976 pBt->inTransaction = TRANS_READ;

	3977 btreeClearHasContent(pBt);

	3978 }

	3979

	3980 btreeEndTransaction(p);

	3981 sqlite3BtreeLeave(p);

	3982 return rc;

	3983 }

	3984

	3985 /*

	3986 ** Start a statement subtransaction. The subtransaction can be rolled

	3987 ** back independently of the main transaction. You must start a transaction

	3988 ** before starting a subtransaction. The subtransaction is ended automatically

	3989 ** if the main transaction commits or rolls back.

	3990 **

	3991 ** Statement subtransactions are used around individual SQL statements

	3992 ** that are contained within a BEGIN...COMMIT block. If a constraint

	3993 ** error occurs within the statement, the effect of that one statement

	3994 ** can be rolled back without having to rollback the entire transaction.

	3995 **

	3996 ** A statement sub-transaction is implemented as an anonymous savepoint. The

	3997 ** value passed as the second parameter is the total number of savepoints,

	3998 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there

	3999 ** are no active savepoints and no other statement-transactions open,

	4000 ** iStatement is 1. This anonymous savepoint can be released or rolled back

	4001 ** using the sqlite3BtreeSavepoint() function.

	4002 */

	4003 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){

	4004 int rc;

	4005 BtShared *pBt = p->pBt;

	4006 sqlite3BtreeEnter(p);

	4007 assert( p->inTrans==TRANS_WRITE );

	4008 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

	4009 assert( iStatement>0 );

	4010 assert( iStatement>p->db->nSavepoint );

	4011 assert( pBt->inTransaction==TRANS_WRITE );

	4012 /* At the pager level, a statement transaction is a savepoint with

	4013 ** an index greater than all savepoints created explicitly using

	4014 ** SQL statements. It is illegal to open, release or rollback any

	4015 ** such savepoints while the statement transaction savepoint is active.

	4016 */

	4017 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);

	4018 sqlite3BtreeLeave(p);

	4019 return rc;

	4020 }

	4021

	4022 /*

	4023 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK

	4024 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the

	4025 ** savepoint identified by parameter iSavepoint, depending on the value

	4026 ** of op.

	4027 **

	4028 ** Normally, iSavepoint is greater than or equal to zero. However, if op is

	4029 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the

	4030 ** contents of the entire transaction are rolled back. This is different

	4031 ** from a normal transaction rollback, as no locks are released and the

	4032 ** transaction remains open.

	4033 */

	4034 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){

	4035 int rc = SQLITE_OK;

	4036 if( p && p->inTrans==TRANS_WRITE ){

	4037 BtShared *pBt = p->pBt;

	4038 assert( op==SAVEPOINT_RELEASE \|\| op==SAVEPOINT_ROLLBACK );

	4039 assert( iSavepoint>=0 \|\| (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );

	4040 sqlite3BtreeEnter(p);

	4041 if( op==SAVEPOINT_ROLLBACK ){

	4042 rc = saveAllCursors(pBt, 0, 0);

	4043 }

	4044 if( rc==SQLITE_OK ){

	4045 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);

	4046 }

	4047 if( rc==SQLITE_OK ){

	4048 if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){

	4049 pBt->nPage = 0;

	4050 }

	4051 rc = newDatabase(pBt);

	4052 pBt->nPage = get4byte(28 + pBt->pPage1->aData);

	4053

	4054 /* The database size was written into the offset 28 of the header

	4055 ** when the transaction started, so we know that the value at offset

	4056 ** 28 is nonzero. */

	4057 assert( pBt->nPage>0 );

	4058 }

	4059 sqlite3BtreeLeave(p);

	4060 }

	4061 return rc;

	4062 }

	4063

	4064 /*

	4065 ** Create a new cursor for the BTree whose root is on the page

	4066 ** iTable. If a read-only cursor is requested, it is assumed that

	4067 ** the caller already has at least a read-only transaction open

	4068 ** on the database already. If a write-cursor is requested, then

	4069 ** the caller is assumed to have an open write transaction.

	4070 **

	4071 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only

	4072 ** be used for reading. If the BTREE_WRCSR bit is set, then the cursor

	4073 ** can be used for reading or for writing if other conditions for writing

	4074 ** are also met. These are the conditions that must be met in order

	4075 ** for writing to be allowed:

	4076 **

	4077 ** 1: The cursor must have been opened with wrFlag containing BTREE_WRCSR

	4078 **

	4079 ** 2: Other database connections that share the same pager cache

	4080 ** but which are not in the READ_UNCOMMITTED state may not have

	4081 ** cursors open with wrFlag==0 on the same table. Otherwise

	4082 ** the changes made by this write cursor would be visible to

	4083 ** the read cursors in the other database connection.

	4084 **

	4085 ** 3: The database must be writable (not on read-only media)

	4086 **

	4087 ** 4: There must be an active transaction.

	4088 **

	4089 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR

	4090 ** is set. If FORDELETE is set, that is a hint to the implementation that

	4091 ** this cursor will only be used to seek to and delete entries of an index

	4092 ** as part of a larger DELETE statement. The FORDELETE hint is not used by

	4093 ** this implementation. But in a hypothetical alternative storage engine

	4094 ** in which index entries are automatically deleted when corresponding table

	4095 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE

	4096 ** operations on this cursor can be no-ops and all READ operations can

	4097 ** return a null row (2-bytes: 0x01 0x00).

	4098 **

	4099 ** No checking is done to make sure that page iTable really is the

	4100 ** root page of a b-tree. If it is not, then the cursor acquired

	4101 ** will not work correctly.

	4102 **

	4103 ** It is assumed that the sqlite3BtreeCursorZero() has been called

	4104 ** on pCur to initialize the memory space prior to invoking this routine.

	4105 */

	4106 static int btreeCursor(

	4107 Btree p, / The btree */

	4108 int iTable, /* Root page of table to open */

	4109 int wrFlag, /* 1 to write. 0 read-only */

	4110 struct KeyInfo pKeyInfo, / First arg to comparison function */

	4111 BtCursor pCur / Space for new cursor */

	4112 ){

	4113 BtShared pBt = p->pBt; / Shared b-tree handle */

	4114 BtCursor pX; / Looping over other all cursors */

	4115

	4116 assert( sqlite3BtreeHoldsMutex(p) );

	4117 assert( wrFlag==0

	4118 \|\| wrFlag==BTREE_WRCSR

	4119 \|\| wrFlag==(BTREE_WRCSR\|BTREE_FORDELETE)

	4120 );

	4121

	4122 /* The following assert statements verify that if this is a sharable

	4123 ** b-tree database, the connection is holding the required table locks,

	4124 ** and that no other connection has any open cursor that conflicts with

	4125 ** this lock. */

	4126 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) );

	4127 assert( wrFlag==0 \|\| !hasReadConflicts(p, iTable) );

	4128

	4129 /* Assert that the caller has opened the required transaction. */

	4130 assert( p->inTrans>TRANS_NONE );

	4131 assert( wrFlag==0 \|\| p->inTrans==TRANS_WRITE );

	4132 assert( pBt->pPage1 && pBt->pPage1->aData );

	4133 assert( wrFlag==0 \|\| (pBt->btsFlags & BTS_READ_ONLY)==0 );

	4134

	4135 if( wrFlag ){

	4136 allocateTempSpace(pBt);

	4137 if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;

	4138 }

	4139 if( iTable==1 && btreePagecount(pBt)==0 ){

	4140 assert( wrFlag==0 );

	4141 iTable = 0;

	4142 }

	4143

	4144 /* Now that no other errors can occur, finish filling in the BtCursor

	4145 ** variables and link the cursor into the BtShared list. */

	4146 pCur->pgnoRoot = (Pgno)iTable;

	4147 pCur->iPage = -1;

	4148 pCur->pKeyInfo = pKeyInfo;

	4149 pCur->pBtree = p;

	4150 pCur->pBt = pBt;

	4151 pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;

	4152 pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;

	4153 /* If there are two or more cursors on the same btree, then all such

	4154 ** cursors must have the BTCF_Multiple flag set. */

	4155 for(pX=pBt->pCursor; pX; pX=pX->pNext){

	4156 if( pX->pgnoRoot==(Pgno)iTable ){

	4157 pX->curFlags \|= BTCF_Multiple;

	4158 pCur->curFlags \|= BTCF_Multiple;

	4159 }

	4160 }

	4161 pCur->pNext = pBt->pCursor;

	4162 pBt->pCursor = pCur;

	4163 pCur->eState = CURSOR_INVALID;

	4164 return SQLITE_OK;

	4165 }

	4166 int sqlite3BtreeCursor(

	4167 Btree p, / The btree */

	4168 int iTable, /* Root page of table to open */

	4169 int wrFlag, /* 1 to write. 0 read-only */

	4170 struct KeyInfo pKeyInfo, / First arg to xCompare() */

	4171 BtCursor pCur / Write new cursor here */

	4172 ){

	4173 int rc;

	4174 if( iTable<1 ){

	4175 rc = SQLITE_CORRUPT_BKPT;

	4176 }else{

	4177 sqlite3BtreeEnter(p);

	4178 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);

	4179 sqlite3BtreeLeave(p);

	4180 }

	4181 return rc;

	4182 }

	4183

	4184 /*

	4185 ** Return the size of a BtCursor object in bytes.

	4186 **

	4187 ** This interfaces is needed so that users of cursors can preallocate

	4188 ** sufficient storage to hold a cursor. The BtCursor object is opaque

	4189 ** to users so they cannot do the sizeof() themselves - they must call

	4190 ** this routine.

	4191 */

	4192 int sqlite3BtreeCursorSize(void){

	4193 return ROUND8(sizeof(BtCursor));

	4194 }

	4195

	4196 /*

	4197 ** Initialize memory that will be converted into a BtCursor object.

	4198 **

	4199 ** The simple approach here would be to memset() the entire object

	4200 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays

	4201 ** do not need to be zeroed and they are large, so we can save a lot

	4202 ** of run-time by skipping the initialization of those elements.

	4203 */

	4204 void sqlite3BtreeCursorZero(BtCursor *p){

	4205 memset(p, 0, offsetof(BtCursor, iPage));

	4206 }

	4207

	4208 /*

	4209 ** Close a cursor. The read lock on the database file is released

	4210 ** when the last cursor is closed.

	4211 */

	4212 int sqlite3BtreeCloseCursor(BtCursor *pCur){

	4213 Btree *pBtree = pCur->pBtree;

	4214 if( pBtree ){

	4215 int i;

	4216 BtShared *pBt = pCur->pBt;

	4217 sqlite3BtreeEnter(pBtree);

	4218 sqlite3BtreeClearCursor(pCur);

	4219 assert( pBt->pCursor!=0 );

	4220 if( pBt->pCursor==pCur ){

	4221 pBt->pCursor = pCur->pNext;

	4222 }else{

	4223 BtCursor *pPrev = pBt->pCursor;

	4224 do{

	4225 if( pPrev->pNext==pCur ){

	4226 pPrev->pNext = pCur->pNext;

	4227 break;

	4228 }

	4229 pPrev = pPrev->pNext;

	4230 }while( ALWAYS(pPrev) );

	4231 }

	4232 for(i=0; i<=pCur->iPage; i++){

	4233 releasePage(pCur->apPage[i]);

	4234 }

	4235 unlockBtreeIfUnused(pBt);

	4236 sqlite3_free(pCur->aOverflow);

	4237 /* sqlite3_free(pCur); */

	4238 sqlite3BtreeLeave(pBtree);

	4239 }

	4240 return SQLITE_OK;

	4241 }

	4242

	4243 /*

	4244 ** Make sure the BtCursor* given in the argument has a valid

	4245 ** BtCursor.info structure. If it is not already valid, call

	4246 ** btreeParseCell() to fill it in.

	4247 **

	4248 ** BtCursor.info is a cache of the information in the current cell.

	4249 ** Using this cache reduces the number of calls to btreeParseCell().

	4250 */

	4251 #ifndef NDEBUG

	4252 static void assertCellInfo(BtCursor *pCur){

	4253 CellInfo info;

	4254 int iPage = pCur->iPage;

	4255 memset(&info, 0, sizeof(info));

	4256 btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);

	4257 assert( CORRUPT_DB \|\| memcmp(&info, &pCur->info, sizeof(info))==0 );

	4258 }

	4259 #else

	4260 #define assertCellInfo(x)

	4261 #endif

	4262 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){

	4263 if( pCur->info.nSize==0 ){

	4264 int iPage = pCur->iPage;

	4265 pCur->curFlags \|= BTCF_ValidNKey;

	4266 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);

	4267 }else{

	4268 assertCellInfo(pCur);

	4269 }

	4270 }

	4271

	4272 #ifndef NDEBUG /* The next routine used only within assert() statements */

	4273 /*

	4274 ** Return true if the given BtCursor is valid. A valid cursor is one

	4275 ** that is currently pointing to a row in a (non-empty) table.

	4276 ** This is a verification routine is used only within assert() statements.

	4277 */

	4278 int sqlite3BtreeCursorIsValid(BtCursor *pCur){

	4279 return pCur && pCur->eState==CURSOR_VALID;

	4280 }

	4281 #endif /* NDEBUG */

	4282 int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){

	4283 assert( pCur!=0 );

	4284 return pCur->eState==CURSOR_VALID;

	4285 }

	4286

	4287 /*

	4288 ** Return the value of the integer key or "rowid" for a table btree.

	4289 ** This routine is only valid for a cursor that is pointing into a

	4290 ** ordinary table btree. If the cursor points to an index btree or

	4291 ** is invalid, the result of this routine is undefined.

	4292 */

	4293 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){

	4294 assert( cursorHoldsMutex(pCur) );

	4295 assert( pCur->eState==CURSOR_VALID );

	4296 assert( pCur->curIntKey );

	4297 getCellInfo(pCur);

	4298 return pCur->info.nKey;

	4299 }

	4300

	4301 /*

	4302 ** Return the number of bytes of payload for the entry that pCur is

	4303 ** currently pointing to. For table btrees, this will be the amount

	4304 ** of data. For index btrees, this will be the size of the key.

	4305 **

	4306 ** The caller must guarantee that the cursor is pointing to a non-NULL

	4307 ** valid entry. In other words, the calling procedure must guarantee

	4308 ** that the cursor has Cursor.eState==CURSOR_VALID.

	4309 */

	4310 u32 sqlite3BtreePayloadSize(BtCursor *pCur){

	4311 assert( cursorHoldsMutex(pCur) );

	4312 assert( pCur->eState==CURSOR_VALID );

	4313 getCellInfo(pCur);

	4314 return pCur->info.nPayload;

	4315 }

	4316

	4317 /*

	4318 ** Given the page number of an overflow page in the database (parameter

	4319 ** ovfl), this function finds the page number of the next page in the

	4320 ** linked list of overflow pages. If possible, it uses the auto-vacuum

	4321 ** pointer-map data instead of reading the content of page ovfl to do so.

	4322 **

	4323 ** If an error occurs an SQLite error code is returned. Otherwise:

	4324 **

	4325 ** The page number of the next overflow page in the linked list is

	4326 ** written to *pPgnoNext. If page ovfl is the last page in its linked

	4327 ** list, *pPgnoNext is set to zero.

	4328 **

	4329 ** If ppPage is not NULL, and a reference to the MemPage object corresponding

	4330 ** to page number pOvfl was obtained, then *ppPage is set to point to that

	4331 ** reference. It is the responsibility of the caller to call releasePage()

	4332 ** on *ppPage to free the reference. In no reference was obtained (because

	4333 ** the pointer-map was used to obtain the value for *pPgnoNext), then

	4334 ** *ppPage is set to zero.

	4335 */

	4336 static int getOverflowPage(

	4337 BtShared pBt, / The database file */

	4338 Pgno ovfl, /* Current overflow page number */

	4339 MemPage *ppPage, / OUT: MemPage handle (may be NULL) */

	4340 Pgno pPgnoNext / OUT: Next overflow page number */

	4341 ){

	4342 Pgno next = 0;

	4343 MemPage *pPage = 0;

	4344 int rc = SQLITE_OK;

	4345

	4346 assert( sqlite3_mutex_held(pBt->mutex) );

	4347 assert(pPgnoNext);

	4348

	4349 #ifndef SQLITE_OMIT_AUTOVACUUM

	4350 /* Try to find the next page in the overflow list using the

	4351 ** autovacuum pointer-map pages. Guess that the next page in

	4352 ** the overflow list is page number (ovfl+1). If that guess turns

	4353 ** out to be wrong, fall back to loading the data of page

	4354 ** number ovfl to determine the next page number.

	4355 */

	4356 if( pBt->autoVacuum ){

	4357 Pgno pgno;

	4358 Pgno iGuess = ovfl+1;

	4359 u8 eType;

	4360

	4361 while( PTRMAP_ISPAGE(pBt, iGuess) \|\| iGuess==PENDING_BYTE_PAGE(pBt) ){

	4362 iGuess++;

	4363 }

	4364

	4365 if( iGuess<=btreePagecount(pBt) ){

	4366 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);

	4367 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){

	4368 next = iGuess;

	4369 rc = SQLITE_DONE;

	4370 }

	4371 }

	4372 }

	4373 #endif

	4374

	4375 assert( next==0 \|\| rc==SQLITE_DONE );

	4376 if( rc==SQLITE_OK ){

	4377 rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);

	4378 assert( rc==SQLITE_OK \|\| pPage==0 );

	4379 if( rc==SQLITE_OK ){

	4380 next = get4byte(pPage->aData);

	4381 }

	4382 }

	4383

	4384 *pPgnoNext = next;

	4385 if( ppPage ){

	4386 *ppPage = pPage;

	4387 }else{

	4388 releasePage(pPage);

	4389 }

	4390 return (rc==SQLITE_DONE ? SQLITE_OK : rc);

	4391 }

	4392

	4393 /*

	4394 ** Copy data from a buffer to a page, or from a page to a buffer.

	4395 **

	4396 ** pPayload is a pointer to data stored on database page pDbPage.

	4397 ** If argument eOp is false, then nByte bytes of data are copied

	4398 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,

	4399 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes

	4400 ** of data are copied from the buffer pBuf to pPayload.

	4401 **

	4402 ** SQLITE_OK is returned on success, otherwise an error code.

	4403 */

	4404 static int copyPayload(

	4405 void pPayload, / Pointer to page data */

	4406 void pBuf, / Pointer to buffer */

	4407 int nByte, /* Number of bytes to copy */

	4408 int eOp, /* 0 -> copy from page, 1 -> copy to page */

	4409 DbPage pDbPage / Page containing pPayload */

	4410 ){

	4411 if( eOp ){

	4412 /* Copy data from buffer to page (a write operation) */

	4413 int rc = sqlite3PagerWrite(pDbPage);

	4414 if( rc!=SQLITE_OK ){

	4415 return rc;

	4416 }

	4417 memcpy(pPayload, pBuf, nByte);

	4418 }else{

	4419 /* Copy data from page to buffer (a read operation) */

	4420 memcpy(pBuf, pPayload, nByte);

	4421 }

	4422 return SQLITE_OK;

	4423 }

	4424

	4425 /*

	4426 ** This function is used to read or overwrite payload information

	4427 ** for the entry that the pCur cursor is pointing to. The eOp

	4428 ** argument is interpreted as follows:

	4429 **

	4430 ** 0: The operation is a read. Populate the overflow cache.

	4431 ** 1: The operation is a write. Populate the overflow cache.

	4432 **

	4433 ** A total of "amt" bytes are read or written beginning at "offset".

	4434 ** Data is read to or from the buffer pBuf.

	4435 **

	4436 ** The content being read or written might appear on the main page

	4437 ** or be scattered out on multiple overflow pages.

	4438 **

	4439 ** If the current cursor entry uses one or more overflow pages

	4440 ** this function may allocate space for and lazily populate

	4441 ** the overflow page-list cache array (BtCursor.aOverflow).

	4442 ** Subsequent calls use this cache to make seeking to the supplied offset

	4443 ** more efficient.

	4444 **

	4445 ** Once an overflow page-list cache has been allocated, it must be

	4446 ** invalidated if some other cursor writes to the same table, or if

	4447 ** the cursor is moved to a different row. Additionally, in auto-vacuum

	4448 ** mode, the following events may invalidate an overflow page-list cache.

	4449 **

	4450 ** * An incremental vacuum,

	4451 ** * A commit in auto_vacuum="full" mode,

	4452 ** * Creating a table (may require moving an overflow page).

	4453 */

	4454 static int accessPayload(

	4455 BtCursor pCur, / Cursor pointing to entry to read from */

	4456 u32 offset, /* Begin reading this far into payload */

	4457 u32 amt, /* Read this many bytes */

	4458 unsigned char pBuf, / Write the bytes into this buffer */

	4459 int eOp /* zero to read. non-zero to write. */

	4460 ){

	4461 unsigned char *aPayload;

	4462 int rc = SQLITE_OK;

	4463 int iIdx = 0;

	4464 MemPage pPage = pCur->apPage[pCur->iPage]; / Btree page of current entry */

	4465 BtShared pBt = pCur->pBt; / Btree this cursor belongs to */

	4466 #ifdef SQLITE_DIRECT_OVERFLOW_READ

	4467 unsigned char * const pBufStart = pBuf; /* Start of original out buffer */

	4468 #endif

	4469

	4470 assert( pPage );

	4471 assert( eOp==0 \|\| eOp==1 );

	4472 assert( pCur->eState==CURSOR_VALID );

	4473 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );

	4474 assert( cursorHoldsMutex(pCur) );

	4475

	4476 getCellInfo(pCur);

	4477 aPayload = pCur->info.pPayload;

	4478 assert( offset+amt <= pCur->info.nPayload );

	4479

	4480 assert( aPayload > pPage->aData );

	4481 if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){

	4482 /* Trying to read or write past the end of the data is an error. The

	4483 ** conditional above is really:

	4484 ** &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]

	4485 ** but is recast into its current form to avoid integer overflow problems

	4486 */

	4487 return SQLITE_CORRUPT_BKPT;

	4488 }

	4489

	4490 /* Check if data must be read/written to/from the btree page itself. */

	4491 if( offset<pCur->info.nLocal ){

	4492 int a = amt;

	4493 if( a+offset>pCur->info.nLocal ){

	4494 a = pCur->info.nLocal - offset;

	4495 }

	4496 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);

	4497 offset = 0;

	4498 pBuf += a;

	4499 amt -= a;

	4500 }else{

	4501 offset -= pCur->info.nLocal;

	4502 }

	4503

	4504

	4505 if( rc==SQLITE_OK && amt>0 ){

	4506 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */

	4507 Pgno nextPage;

	4508

	4509 nextPage = get4byte(&aPayload[pCur->info.nLocal]);

	4510

	4511 /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.

	4512 **

	4513 ** The aOverflow[] array is sized at one entry for each overflow page

	4514 ** in the overflow chain. The page number of the first overflow page is

	4515 ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array

	4516 ** means "not yet known" (the cache is lazily populated).

	4517 */

	4518 if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){

	4519 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;

	4520 if( nOvfl>pCur->nOvflAlloc ){

	4521 Pgno aNew = (Pgno)sqlite3Realloc(

	4522 pCur->aOverflow, nOvfl2sizeof(Pgno)

	4523 );

	4524 if( aNew==0 ){

	4525 return SQLITE_NOMEM_BKPT;

	4526 }else{

	4527 pCur->nOvflAlloc = nOvfl*2;

	4528 pCur->aOverflow = aNew;

	4529 }

	4530 }

	4531 memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));

	4532 pCur->curFlags \|= BTCF_ValidOvfl;

	4533 }else{

	4534 /* If the overflow page-list cache has been allocated and the

	4535 ** entry for the first required overflow page is valid, skip

	4536 ** directly to it.

	4537 */

	4538 if( pCur->aOverflow[offset/ovflSize] ){

	4539 iIdx = (offset/ovflSize);

	4540 nextPage = pCur->aOverflow[iIdx];

	4541 offset = (offset%ovflSize);

	4542 }

	4543 }

	4544

	4545 assert( rc==SQLITE_OK && amt>0 );

	4546 while( nextPage ){

	4547 /* If required, populate the overflow page-list cache. */

	4548 assert( pCur->aOverflow[iIdx]==0

	4549 \|\| pCur->aOverflow[iIdx]==nextPage

	4550 \|\| CORRUPT_DB );

	4551 pCur->aOverflow[iIdx] = nextPage;

	4552

	4553 if( offset>=ovflSize ){

	4554 /* The only reason to read this page is to obtain the page

	4555 ** number for the next page in the overflow chain. The page

	4556 ** data is not required. So first try to lookup the overflow

	4557 ** page-list cache, if any, then fall back to the getOverflowPage()

	4558 ** function.

	4559 */

	4560 assert( pCur->curFlags & BTCF_ValidOvfl );

	4561 assert( pCur->pBtree->db==pBt->db );

	4562 if( pCur->aOverflow[iIdx+1] ){

	4563 nextPage = pCur->aOverflow[iIdx+1];

	4564 }else{

	4565 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);

	4566 }

	4567 offset -= ovflSize;

	4568 }else{

	4569 /* Need to read this page properly. It contains some of the

	4570 ** range of data that is being read (eOp==0) or written (eOp!=0).

	4571 */

	4572 #ifdef SQLITE_DIRECT_OVERFLOW_READ

	4573 sqlite3_file fd; / File from which to do direct overflow read */

	4574 #endif

	4575 int a = amt;

	4576 if( a + offset > ovflSize ){

	4577 a = ovflSize - offset;

	4578 }

	4579

	4580 #ifdef SQLITE_DIRECT_OVERFLOW_READ

	4581 /* If all the following are true:

	4582 **

	4583 ** 1) this is a read operation, and

	4584 ** 2) data is required from the start of this overflow page, and

	4585 ** 3) there is no open write-transaction, and

	4586 ** 4) the database is file-backed, and

	4587 ** 5) the page is not in the WAL file

	4588 ** 6) at least 4 bytes have already been read into the output buffer

	4589 **

	4590 ** then data can be read directly from the database file into the

	4591 ** output buffer, bypassing the page-cache altogether. This speeds

	4592 ** up loading large records that span many overflow pages.

	4593 */

	4594 if( eOp==0 /* (1) */

	4595 && offset==0 /* (2) */

	4596 && pBt->inTransaction==TRANS_READ /* (3) */

	4597 && (fd = sqlite3PagerFile(pBt->pPager))->pMethods /* (4) */

	4598 && 0==sqlite3PagerUseWal(pBt->pPager, nextPage) /* (5) */

	4599 && &pBuf[-4]>=pBufStart /* (6) */

	4600 ){

	4601 u8 aSave[4];

	4602 u8 *aWrite = &pBuf[-4];

	4603 assert( aWrite>=pBufStart ); /* due to (6) */

	4604 memcpy(aSave, aWrite, 4);

	4605 rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));

	4606 nextPage = get4byte(aWrite);

	4607 memcpy(aWrite, aSave, 4);

	4608 }else

	4609 #endif

	4610

	4611 {

	4612 DbPage *pDbPage;

	4613 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,

	4614 (eOp==0 ? PAGER_GET_READONLY : 0)

	4615 );

	4616 if( rc==SQLITE_OK ){

	4617 aPayload = sqlite3PagerGetData(pDbPage);

	4618 nextPage = get4byte(aPayload);

	4619 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);

	4620 sqlite3PagerUnref(pDbPage);

	4621 offset = 0;

	4622 }

	4623 }

	4624 amt -= a;

	4625 if( amt==0 ) return rc;

	4626 pBuf += a;

	4627 }

	4628 if( rc ) break;

	4629 iIdx++;

	4630 }

	4631 }

	4632

	4633 if( rc==SQLITE_OK && amt>0 ){

	4634 return SQLITE_CORRUPT_BKPT; /* Overflow chain ends prematurely */

	4635 }

	4636 return rc;

	4637 }

	4638

	4639 /*

	4640 ** Read part of the payload for the row at which that cursor pCur is currently

	4641 ** pointing. "amt" bytes will be transferred into pBuf[]. The transfer

	4642 ** begins at "offset".

	4643 **

	4644 ** pCur can be pointing to either a table or an index b-tree.

	4645 ** If pointing to a table btree, then the content section is read. If

	4646 ** pCur is pointing to an index b-tree then the key section is read.

	4647 **

	4648 ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing

	4649 ** to a valid row in the table. For sqlite3BtreePayloadChecked(), the

	4650 ** cursor might be invalid or might need to be restored before being read.

	4651 **

	4652 ** Return SQLITE_OK on success or an error code if anything goes

	4653 ** wrong. An error is returned if "offset+amt" is larger than

	4654 ** the available payload.

	4655 */

	4656 int sqlite3BtreePayload(BtCursor pCur, u32 offset, u32 amt, void pBuf){

	4657 assert( cursorHoldsMutex(pCur) );

	4658 assert( pCur->eState==CURSOR_VALID );

	4659 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );

	4660 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	4661 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);

	4662 }

	4663

	4664 /*

	4665 ** This variant of sqlite3BtreePayload() works even if the cursor has not

	4666 ** in the CURSOR_VALID state. It is only used by the sqlite3_blob_read()

	4667 ** interface.

	4668 */

	4669 #ifndef SQLITE_OMIT_INCRBLOB

	4670 static SQLITE_NOINLINE int accessPayloadChecked(

	4671 BtCursor *pCur,

	4672 u32 offset,

	4673 u32 amt,

	4674 void *pBuf

	4675 ){

	4676 int rc;

	4677 if ( pCur->eState==CURSOR_INVALID ){

	4678 return SQLITE_ABORT;

	4679 }

	4680 assert( cursorOwnsBtShared(pCur) );

	4681 rc = btreeRestoreCursorPosition(pCur);

	4682 return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0);

	4683 }

	4684 int sqlite3BtreePayloadChecked(BtCursor pCur, u32 offset, u32 amt, void pBuf){

	4685 if( pCur->eState==CURSOR_VALID ){

	4686 assert( cursorOwnsBtShared(pCur) );

	4687 return accessPayload(pCur, offset, amt, pBuf, 0);

	4688 }else{

	4689 return accessPayloadChecked(pCur, offset, amt, pBuf);

	4690 }

	4691 }

	4692 #endif /* SQLITE_OMIT_INCRBLOB */

	4693

	4694 /*

	4695 ** Return a pointer to payload information from the entry that the

	4696 ** pCur cursor is pointing to. The pointer is to the beginning of

	4697 ** the key if index btrees (pPage->intKey==0) and is the data for

	4698 ** table btrees (pPage->intKey==1). The number of bytes of available

	4699 ** key/data is written into pAmt. If pAmt==0, then the value

	4700 ** returned will not be a valid pointer.

	4701 **

	4702 ** This routine is an optimization. It is common for the entire key

	4703 ** and data to fit on the local page and for there to be no overflow

	4704 ** pages. When that is so, this routine can be used to access the

	4705 ** key and data without making a copy. If the key and/or data spills

	4706 ** onto overflow pages, then accessPayload() must be used to reassemble

	4707 ** the key/data and copy it into a preallocated buffer.

	4708 **

	4709 ** The pointer returned by this routine looks directly into the cached

	4710 ** page of the database. The data might change or move the next time

	4711 ** any btree routine is called.

	4712 */

	4713 static const void *fetchPayload(

	4714 BtCursor pCur, / Cursor pointing to entry to read from */

	4715 u32 pAmt / Write the number of available bytes here */

	4716 ){

	4717 u32 amt;

	4718 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);

	4719 assert( pCur->eState==CURSOR_VALID );

	4720 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

	4721 assert( cursorOwnsBtShared(pCur) );

	4722 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	4723 assert( pCur->info.nSize>0 );

	4724 assert( pCur->info.pPayload>pCur->apPage[pCur->iPage]->aData \|\| CORRUPT_DB );

	4725 assert( pCur->info.pPayload<pCur->apPage[pCur->iPage]->aDataEnd \|\|CORRUPT_DB);

	4726 amt = (int)(pCur->apPage[pCur->iPage]->aDataEnd - pCur->info.pPayload);

	4727 if( pCur->info.nLocal<amt ) amt = pCur->info.nLocal;

	4728 *pAmt = amt;

	4729 return (void*)pCur->info.pPayload;

	4730 }

	4731

	4732

	4733 /*

	4734 ** For the entry that cursor pCur is point to, return as

	4735 ** many bytes of the key or data as are available on the local

	4736 ** b-tree page. Write the number of available bytes into *pAmt.

	4737 **

	4738 ** The pointer returned is ephemeral. The key/data may move

	4739 ** or be destroyed on the next call to any Btree routine,

	4740 ** including calls from other threads against the same cache.

	4741 ** Hence, a mutex on the BtShared should be held prior to calling

	4742 ** this routine.

	4743 **

	4744 ** These routines is used to get quick access to key and data

	4745 ** in the common case where no overflow pages are used.

	4746 */

	4747 const void sqlite3BtreePayloadFetch(BtCursor pCur, u32 *pAmt){

	4748 return fetchPayload(pCur, pAmt);

	4749 }

	4750

	4751

	4752 /*

	4753 ** Move the cursor down to a new child page. The newPgno argument is the

	4754 ** page number of the child page to move to.

	4755 **

	4756 ** This function returns SQLITE_CORRUPT if the page-header flags field of

	4757 ** the new child page does not match the flags field of the parent (i.e.

	4758 ** if an intkey page appears to be the parent of a non-intkey page, or

	4759 ** vice-versa).

	4760 */

	4761 static int moveToChild(BtCursor *pCur, u32 newPgno){

	4762 BtShared *pBt = pCur->pBt;

	4763

	4764 assert( cursorOwnsBtShared(pCur) );

	4765 assert( pCur->eState==CURSOR_VALID );

	4766 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );

	4767 assert( pCur->iPage>=0 );

	4768 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){

	4769 return SQLITE_CORRUPT_BKPT;

	4770 }

	4771 pCur->info.nSize = 0;

	4772 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

	4773 pCur->iPage++;

	4774 pCur->aiIdx[pCur->iPage] = 0;

	4775 return getAndInitPage(pBt, newPgno, &pCur->apPage[pCur->iPage],

	4776 pCur, pCur->curPagerFlags);

	4777 }

	4778

	#if SQLITE_DEBUG
	/*
	** Debug-only consistency check on a parent/child link.
	**
	** pParent is an internal (non-leaf) tree page.  Assert that iChild is the
	** left-child page number stored in cell iIdx of pParent or, when iIdx
	** equals the number of cells on pParent, that iChild is the right-child
	** of the page.  Nothing is checked when CORRUPT_DB is true, because the
	** conditions might not hold in a corrupt database.
	*/
	static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
	  Pgno iExpected;
	  if( CORRUPT_DB ) return;
	  assert( iIdx<=pParent->nCell );
	  iExpected = (iIdx==pParent->nCell)
	                ? get4byte(&pParent->aData[pParent->hdrOffset+8])
	                : get4byte(findCell(pParent, iIdx));
	  assert( iExpected==iChild );
	}
	#else
	#  define assertParentIndex(x,y,z)
	#endif

	4800

	4801 /*

	4802 ** Move the cursor up to the parent page.

	4803 **

	4804 ** pCur->idx is set to the cell index that contains the pointer

	4805 ** to the page we are coming from. If we are coming from the

	4806 ** right-most child page then pCur->idx is set to one more than

	4807 ** the largest cell index.

	4808 */

	4809 static void moveToParent(BtCursor *pCur){

	4810 assert( cursorOwnsBtShared(pCur) );

	4811 assert( pCur->eState==CURSOR_VALID );

	4812 assert( pCur->iPage>0 );

	4813 assert( pCur->apPage[pCur->iPage] );

	4814 assertParentIndex(

	4815 pCur->apPage[pCur->iPage-1],

	4816 pCur->aiIdx[pCur->iPage-1],

	4817 pCur->apPage[pCur->iPage]->pgno

	4818 );

	4819 testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );

	4820 pCur->info.nSize = 0;

	4821 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

	4822 releasePageNotNull(pCur->apPage[pCur->iPage--]);

	4823 }

	4824

	4825 /*

	4826 ** Move the cursor to point to the root page of its b-tree structure.

	4827 **

	4828 ** If the table has a virtual root page, then the cursor is moved to point

	4829 ** to the virtual root page instead of the actual root page. A table has a

	4830 ** virtual root page when the actual root page contains no cells and a

	4831 ** single child page. This can only happen with the table rooted at page 1.

	4832 **

	4833 ** If the b-tree structure is empty, the cursor state is set to

	4834 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first

	4835 ** cell located on the root (or virtual root) page and the cursor state

	4836 ** is set to CURSOR_VALID.

	4837 **

	4838 ** If this function returns successfully, it may be assumed that the

	4839 ** page-header flags indicate that the [virtual] root-page is the expected

	4840 ** kind of b-tree page (i.e. if when opening the cursor the caller did not

	4841 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,

	4842 ** indicating a table b-tree, or if the caller did specify a KeyInfo

	4843 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index

	4844 ** b-tree).

	4845 */

	4846 static int moveToRoot(BtCursor *pCur){

	4847 MemPage *pRoot;

	4848 int rc = SQLITE_OK;

	4849

	4850 assert( cursorOwnsBtShared(pCur) );

	4851 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );

	4852 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );

	4853 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );

	4854 if( pCur->eState>=CURSOR_REQUIRESEEK ){

	4855 if( pCur->eState==CURSOR_FAULT ){

	4856 assert( pCur->skipNext!=SQLITE_OK );

	4857 return pCur->skipNext;

	4858 }

	4859 sqlite3BtreeClearCursor(pCur);

	4860 }

	4861

	4862 if( pCur->iPage>=0 ){

	4863 if( pCur->iPage ){

	4864 do{

	4865 assert( pCur->apPage[pCur->iPage]!=0 );

	4866 releasePageNotNull(pCur->apPage[pCur->iPage--]);

	4867 }while( pCur->iPage);

	4868 goto skip_init;

	4869 }

	4870 }else if( pCur->pgnoRoot==0 ){

	4871 pCur->eState = CURSOR_INVALID;

	4872 return SQLITE_OK;

	4873 }else{

	4874 assert( pCur->iPage==(-1) );

	4875 rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->apPage[0],

	4876 0, pCur->curPagerFlags);

	4877 if( rc!=SQLITE_OK ){

	4878 pCur->eState = CURSOR_INVALID;

	4879 return rc;

	4880 }

	4881 pCur->iPage = 0;

	4882 pCur->curIntKey = pCur->apPage[0]->intKey;

	4883 }

	4884 pRoot = pCur->apPage[0];

	4885 assert( pRoot->pgno==pCur->pgnoRoot );

	4886

	4887 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor

	4888 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is

	4889 ** NULL, the caller expects a table b-tree. If this is not the case,

	4890 ** return an SQLITE_CORRUPT error.

	4891 **

	4892 ** Earlier versions of SQLite assumed that this test could not fail

	4893 ** if the root page was already loaded when this function was called (i.e.

	4894 ** if pCur->iPage>=0). But this is not so if the database is corrupted

	4895 ** in such a way that page pRoot is linked into a second b-tree table

	4896 ** (or the freelist). */

	4897 assert( pRoot->intKey==1 \|\| pRoot->intKey==0 );

	4898 if( pRoot->isInit==0 \|\| (pCur->pKeyInfo==0)!=pRoot->intKey ){

	4899 return SQLITE_CORRUPT_BKPT;

	4900 }

	4901

	4902 skip_init:

	4903 pCur->aiIdx[0] = 0;

	4904 pCur->info.nSize = 0;

	4905 pCur->curFlags &= ~(BTCF_AtLast\|BTCF_ValidNKey\|BTCF_ValidOvfl);

	4906

	4907 pRoot = pCur->apPage[0];

	4908 if( pRoot->nCell>0 ){

	4909 pCur->eState = CURSOR_VALID;

	4910 }else if( !pRoot->leaf ){

	4911 Pgno subpage;

	4912 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;

	4913 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);

	4914 pCur->eState = CURSOR_VALID;

	4915 rc = moveToChild(pCur, subpage);

	4916 }else{

	4917 pCur->eState = CURSOR_INVALID;

	4918 }

	4919 return rc;

	4920 }

	4921

	4922 /*

	4923 ** Move the cursor down to the left-most leaf entry beneath the

	4924 ** entry to which it is currently pointing.

	4925 **

	4926 ** The left-most leaf is the one with the smallest key - the first

	4927 ** in ascending order.

	4928 */

	4929 static int moveToLeftmost(BtCursor *pCur){

	4930 Pgno pgno;

	4931 int rc = SQLITE_OK;

	4932 MemPage *pPage;

	4933

	4934 assert( cursorOwnsBtShared(pCur) );

	4935 assert( pCur->eState==CURSOR_VALID );

	4936 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){

	4937 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );

	4938 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));

	4939 rc = moveToChild(pCur, pgno);

	4940 }

	4941 return rc;

	4942 }

	4943

	4944 /*

	4945 ** Move the cursor down to the right-most leaf entry beneath the

	4946 ** page to which it is currently pointing. Notice the difference

	4947 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()

	4948 ** finds the left-most entry beneath the entry whereas moveToRightmost()

	4949 ** finds the right-most entry beneath the page.

	4950 **

	4951 ** The right-most entry is the one with the largest key - the last

	4952 ** key in ascending order.

	4953 */

	4954 static int moveToRightmost(BtCursor *pCur){

	4955 Pgno pgno;

	4956 int rc = SQLITE_OK;

	4957 MemPage *pPage = 0;

	4958

	4959 assert( cursorOwnsBtShared(pCur) );

	4960 assert( pCur->eState==CURSOR_VALID );

	4961 while( !(pPage = pCur->apPage[pCur->iPage])->leaf ){

	4962 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	4963 pCur->aiIdx[pCur->iPage] = pPage->nCell;

	4964 rc = moveToChild(pCur, pgno);

	4965 if( rc ) return rc;

	4966 }

	4967 pCur->aiIdx[pCur->iPage] = pPage->nCell-1;

	4968 assert( pCur->info.nSize==0 );

	4969 assert( (pCur->curFlags & BTCF_ValidNKey)==0 );

	4970 return SQLITE_OK;

	4971 }

	4972

	4973 /* Move the cursor to the first entry in the table. Return SQLITE_OK

	4974 ** on success. Set *pRes to 0 if the cursor actually points to something

	4975 ** or set *pRes to 1 if the table is empty.

	4976 */

	4977 int sqlite3BtreeFirst(BtCursor pCur, int pRes){

	4978 int rc;

	4979

	4980 assert( cursorOwnsBtShared(pCur) );

	4981 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

	4982 rc = moveToRoot(pCur);

	4983 if( rc==SQLITE_OK ){

	4984 if( pCur->eState==CURSOR_INVALID ){

	4985 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->nCell==0 );

	4986 *pRes = 1;

	4987 }else{

	4988 assert( pCur->apPage[pCur->iPage]->nCell>0 );

	4989 *pRes = 0;

	4990 rc = moveToLeftmost(pCur);

	4991 }

	4992 }

	4993 return rc;

	4994 }

	4995

	4996 /* Move the cursor to the last entry in the table. Return SQLITE_OK

	4997 ** on success. Set *pRes to 0 if the cursor actually points to something

	4998 ** or set *pRes to 1 if the table is empty.

	4999 */

	5000 int sqlite3BtreeLast(BtCursor pCur, int pRes){

	5001 int rc;

	5002

	5003 assert( cursorOwnsBtShared(pCur) );

	5004 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

	5005

	5006 /* If the cursor already points to the last entry, this is a no-op. */

	5007 if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){

	5008 #ifdef SQLITE_DEBUG

	5009 /* This block serves to assert() that the cursor really does point

	5010 ** to the last entry in the b-tree. */

	5011 int ii;

	5012 for(ii=0; ii<pCur->iPage; ii++){

	5013 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );

	5014 }

	5015 assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );

	5016 assert( pCur->apPage[pCur->iPage]->leaf );

	5017 #endif

	5018 return SQLITE_OK;

	5019 }

	5020

	5021 rc = moveToRoot(pCur);

	5022 if( rc==SQLITE_OK ){

	5023 if( CURSOR_INVALID==pCur->eState ){

	5024 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->nCell==0 );

	5025 *pRes = 1;

	5026 }else{

	5027 assert( pCur->eState==CURSOR_VALID );

	5028 *pRes = 0;

	5029 rc = moveToRightmost(pCur);

	5030 if( rc==SQLITE_OK ){

	5031 pCur->curFlags \|= BTCF_AtLast;

	5032 }else{

	5033 pCur->curFlags &= ~BTCF_AtLast;

	5034 }

	5035

	5036 }

	5037 }

	5038 return rc;

	5039 }

	5040

	5041 /* Move the cursor so that it points to an entry near the key

	5042 ** specified by pIdxKey or intKey. Return a success code.

	5043 **

	5044 ** For INTKEY tables, the intKey parameter is used. pIdxKey

	5045 ** must be NULL. For index tables, pIdxKey is used and intKey

	5046 ** is ignored.

	5047 **

	5048 ** If an exact match is not found, then the cursor is always

	5049 ** left pointing at a leaf page which would hold the entry if it

	5050 ** were present. The cursor might point to an entry that comes

	5051 ** before or after the key.

	5052 **

	5053 ** An integer is written into *pRes which is the result of

	5054 ** comparing the key with the entry to which the cursor is

	5055 ** pointing. The meaning of the integer written into

	5056 ** *pRes is as follows:

	5057 **

	5058 ** *pRes<0 The cursor is left pointing at an entry that

	5059 ** is smaller than intKey/pIdxKey or if the table is empty

	5060 ** and the cursor is therefore left point to nothing.

	5061 **

	5062 ** *pRes==0 The cursor is left pointing at an entry that

	5063 ** exactly matches intKey/pIdxKey.

	5064 **

	5065 ** *pRes>0 The cursor is left pointing at an entry that

	5066 ** is larger than intKey/pIdxKey.

	5067 **

	5068 ** For index tables, the pIdxKey->eqSeen field is set to 1 if there

	5069 ** exists an entry in the table that exactly matches pIdxKey.

	5070 */

	5071 int sqlite3BtreeMovetoUnpacked(

	5072 BtCursor pCur, / The cursor to be moved */

	5073 UnpackedRecord pIdxKey, / Unpacked index key */

	5074 i64 intKey, /* The table key */

	5075 int biasRight, /* If true, bias the search to the high end */

	5076 int pRes / Write search results here */

	5077 ){

	5078 int rc;

	5079 RecordCompare xRecordCompare;

	5080

	5081 assert( cursorOwnsBtShared(pCur) );

	5082 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

	5083 assert( pRes );

	5084 assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );

	5085 assert( pCur->eState!=CURSOR_VALID \|\| (pIdxKey==0)==(pCur->curIntKey!=0) );

	5086

	5087 /* If the cursor is already positioned at the point we are trying

	5088 ** to move to, then just return without doing any work */

	5089 if( pIdxKey==0

	5090 && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0

	5091 ){

	5092 if( pCur->info.nKey==intKey ){

	5093 *pRes = 0;

	5094 return SQLITE_OK;

	5095 }

	5096 if( pCur->info.nKey<intKey ){

	5097 if( (pCur->curFlags & BTCF_AtLast)!=0 ){

	5098 *pRes = -1;

	5099 return SQLITE_OK;

	5100 }

	5101 /* If the requested key is one more than the previous key, then

	5102 ** try to get there using sqlite3BtreeNext() rather than a full

	5103 ** binary search. This is an optimization only. The correct answer

	5104 ** is still obtained without this ase, only a little more slowely */

	5105 if( pCur->info.nKey+1==intKey && !pCur->skipNext ){

	5106 *pRes = 0;

	5107 rc = sqlite3BtreeNext(pCur, pRes);

	5108 if( rc ) return rc;

	5109 if( *pRes==0 ){

	5110 getCellInfo(pCur);

	5111 if( pCur->info.nKey==intKey ){

	5112 return SQLITE_OK;

	5113 }

	5114 }

	5115 }

	5116 }

	5117 }

	5118

	5119 if( pIdxKey ){

	5120 xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);

	5121 pIdxKey->errCode = 0;

	5122 assert( pIdxKey->default_rc==1

	5123 \|\| pIdxKey->default_rc==0

	5124 \|\| pIdxKey->default_rc==-1

	5125 );

	5126 }else{

	5127 xRecordCompare = 0; /* All keys are integers */

	5128 }

	5129

	5130 rc = moveToRoot(pCur);

	5131 if( rc ){

	5132 return rc;

	5133 }

	5134 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage] );

	5135 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->isInit );

	5136 assert( pCur->eState==CURSOR_INVALID \|\| pCur->apPage[pCur->iPage]->nCell>0 );

	5137 if( pCur->eState==CURSOR_INVALID ){

	5138 *pRes = -1;

	5139 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->nCell==0 );

	5140 return SQLITE_OK;

	5141 }

	5142 assert( pCur->apPage[0]->intKey==pCur->curIntKey );

	5143 assert( pCur->curIntKey \|\| pIdxKey );

	5144 for(;;){

	5145 int lwr, upr, idx, c;

	5146 Pgno chldPg;

	5147 MemPage *pPage = pCur->apPage[pCur->iPage];

	5148 u8 pCell; / Pointer to current cell in pPage */

	5149

	5150 /* pPage->nCell must be greater than zero. If this is the root-page

	5151 ** the cursor would have been INVALID above and this for(;;) loop

	5152 ** not run. If this is not the root-page, then the moveToChild() routine

	5153 ** would have already detected db corruption. Similarly, pPage must

	5154 ** be the right kind (index or table) of b-tree page. Otherwise

	5155 ** a moveToChild() or moveToRoot() call would have detected corruption. */

	5156 assert( pPage->nCell>0 );

	5157 assert( pPage->intKey==(pIdxKey==0) );

	5158 lwr = 0;

	5159 upr = pPage->nCell-1;

	5160 assert( biasRight==0 \|\| biasRight==1 );

	5161 idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */

	5162 pCur->aiIdx[pCur->iPage] = (u16)idx;

	5163 if( xRecordCompare==0 ){

	5164 for(;;){

	5165 i64 nCellKey;

	5166 pCell = findCellPastPtr(pPage, idx);

	5167 if( pPage->intKeyLeaf ){

	5168 while( 0x80 <= *(pCell++) ){

	5169 if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;

	5170 }

	5171 }

	5172 getVarint(pCell, (u64*)&nCellKey);

	5173 if( nCellKey<intKey ){

	5174 lwr = idx+1;

	5175 if( lwr>upr ){ c = -1; break; }

	5176 }else if( nCellKey>intKey ){

	5177 upr = idx-1;

	5178 if( lwr>upr ){ c = +1; break; }

	5179 }else{

	5180 assert( nCellKey==intKey );

	5181 pCur->aiIdx[pCur->iPage] = (u16)idx;

	5182 if( !pPage->leaf ){

	5183 lwr = idx;

	5184 goto moveto_next_layer;

	5185 }else{

	5186 pCur->curFlags \|= BTCF_ValidNKey;

	5187 pCur->info.nKey = nCellKey;

	5188 pCur->info.nSize = 0;

	5189 *pRes = 0;

	5190 return SQLITE_OK;

	5191 }

	5192 }

	5193 assert( lwr+upr>=0 );

	5194 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */

	5195 }

	5196 }else{

	5197 for(;;){

	5198 int nCell; /* Size of the pCell cell in bytes */

	5199 pCell = findCellPastPtr(pPage, idx);

	5200

	5201 /* The maximum supported page-size is 65536 bytes. This means that

	5202 ** the maximum number of record bytes stored on an index B-Tree

	5203 ** page is less than 16384 bytes and may be stored as a 2-byte

	5204 ** varint. This information is used to attempt to avoid parsing

	5205 ** the entire cell by checking for the cases where the record is

	5206 ** stored entirely within the b-tree page by inspecting the first

	5207 ** 2 bytes of the cell.

	5208 */

	5209 nCell = pCell[0];

	5210 if( nCell<=pPage->max1bytePayload ){

	5211 /* This branch runs if the record-size field of the cell is a

	5212 ** single byte varint and the record fits entirely on the main

	5213 ** b-tree page. */

	5214 testcase( pCell+nCell+1==pPage->aDataEnd );

	5215 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);

	5216 }else if( !(pCell[1] & 0x80)

	5217 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal

	5218 ){

	5219 /* The record-size field is a 2 byte varint and the record

	5220 ** fits entirely on the main b-tree page. */

	5221 testcase( pCell+nCell+2==pPage->aDataEnd );

	5222 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);

	5223 }else{

	5224 /* The record flows over onto one or more overflow pages. In

	5225 ** this case the whole cell needs to be parsed, a buffer allocated

	5226 ** and accessPayload() used to retrieve the record into the

	5227 ** buffer before VdbeRecordCompare() can be called.

	5228 **

	5229 ** If the record is corrupt, the xRecordCompare routine may read

	5230 ** up to two varints past the end of the buffer. An extra 18

	5231 ** bytes of padding is allocated at the end of the buffer in

	5232 ** case this happens. */

	5233 void *pCellKey;

	5234 u8 * const pCellBody = pCell - pPage->childPtrSize;

	5235 pPage->xParseCell(pPage, pCellBody, &pCur->info);

	5236 nCell = (int)pCur->info.nKey;

	5237 testcase( nCell<0 ); /* True if key size is 2^32 or more */

	5238 testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */

	5239 testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */

	5240 testcase( nCell==2 ); /* Minimum legal index key size */

	5241 if( nCell<2 ){

	5242 rc = SQLITE_CORRUPT_BKPT;

	5243 goto moveto_finish;

	5244 }

	5245 pCellKey = sqlite3Malloc( nCell+18 );

	5246 if( pCellKey==0 ){

	5247 rc = SQLITE_NOMEM_BKPT;

	5248 goto moveto_finish;

	5249 }

	5250 pCur->aiIdx[pCur->iPage] = (u16)idx;

	5251 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);

	5252 pCur->curFlags &= ~BTCF_ValidOvfl;

	5253 if( rc ){

	5254 sqlite3_free(pCellKey);

	5255 goto moveto_finish;

	5256 }

	5257 c = xRecordCompare(nCell, pCellKey, pIdxKey);

	5258 sqlite3_free(pCellKey);

	5259 }

	5260 assert(

	5261 (pIdxKey->errCode!=SQLITE_CORRUPT \|\| c==0)

	5262 && (pIdxKey->errCode!=SQLITE_NOMEM \|\| pCur->pBtree->db->mallocFailed)

	5263 );

	5264 if( c<0 ){

	5265 lwr = idx+1;

	5266 }else if( c>0 ){

	5267 upr = idx-1;

	5268 }else{

	5269 assert( c==0 );

	5270 *pRes = 0;

	5271 rc = SQLITE_OK;

	5272 pCur->aiIdx[pCur->iPage] = (u16)idx;

	5273 if( pIdxKey->errCode ) rc = SQLITE_CORRUPT;

	5274 goto moveto_finish;

	5275 }

	5276 if( lwr>upr ) break;

	5277 assert( lwr+upr>=0 );

	5278 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */

	5279 }

	5280 }

	5281 assert( lwr==upr+1 \|\| (pPage->intKey && !pPage->leaf) );

	5282 assert( pPage->isInit );

	5283 if( pPage->leaf ){

	5284 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	5285 pCur->aiIdx[pCur->iPage] = (u16)idx;

	5286 *pRes = c;

	5287 rc = SQLITE_OK;

	5288 goto moveto_finish;

	5289 }

	5290 moveto_next_layer:

	5291 if( lwr>=pPage->nCell ){

	5292 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	5293 }else{

	5294 chldPg = get4byte(findCell(pPage, lwr));

	5295 }

	5296 pCur->aiIdx[pCur->iPage] = (u16)lwr;

	5297 rc = moveToChild(pCur, chldPg);

	5298 if( rc ) break;

	5299 }

	5300 moveto_finish:

	5301 pCur->info.nSize = 0;

	5302 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );

	5303 return rc;

	5304 }

	5305

	5306

	5307 /*

	5308 ** Return TRUE if the cursor is not pointing at an entry of the table.

	5309 **

	5310 ** TRUE will be returned after a call to sqlite3BtreeNext() moves

	5311 ** past the last entry in the table or sqlite3BtreePrev() moves past

	5312 ** the first entry. TRUE is also returned if the table is empty.

	5313 */

	5314 int sqlite3BtreeEof(BtCursor *pCur){

	5315 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries

	5316 ** have been deleted? This API will need to change to return an error code

	5317 ** as well as the boolean result value.

	5318 */

	5319 return (CURSOR_VALID!=pCur->eState);

	5320 }

	5321

	5322 /*

	5323 ** Advance the cursor to the next entry in the database. If

	5324 ** successful then set *pRes=0. If the cursor

	5325 ** was already pointing to the last entry in the database before

	5326 ** this routine was called, then set *pRes=1.

	5327 **

	5328 ** The main entry point is sqlite3BtreeNext(). That routine is optimized

	5329 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx

	5330 ** to the next cell on the current page. The (slower) btreeNext() helper

	5331 ** routine is called when it is necessary to move to a different page or

	5332 ** to restore the cursor.

	5333 **

	5334 ** The calling function will set pRes to 0 or 1. The initial pRes value

	5335 ** will be 1 if the cursor being stepped corresponds to an SQL index and

	5336 ** if this routine could have been skipped if that SQL index had been

	5337 ** a unique index. Otherwise the caller will have set *pRes to zero.

	5338 ** Zero is the common case. The btree implementation is free to use the

	5339 ** initial *pRes value as a hint to improve performance, but the current

	5340 ** SQLite btree implementation does not. (Note that the comdb2 btree

	5341 ** implementation does use this hint, however.)

	5342 */

	5343 static SQLITE_NOINLINE int btreeNext(BtCursor pCur, int pRes){

	5344 int rc;

	5345 int idx;

	5346 MemPage *pPage;

	5347

	5348 assert( cursorOwnsBtShared(pCur) );

	5349 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

	5350 assert( *pRes==0 );

	5351 if( pCur->eState!=CURSOR_VALID ){

	5352 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );

	5353 rc = restoreCursorPosition(pCur);

	5354 if( rc!=SQLITE_OK ){

	5355 return rc;

	5356 }

	5357 if( CURSOR_INVALID==pCur->eState ){

	5358 *pRes = 1;

	5359 return SQLITE_OK;

	5360 }

	5361 if( pCur->skipNext ){

	5362 assert( pCur->eState==CURSOR_VALID \|\| pCur->eState==CURSOR_SKIPNEXT );

	5363 pCur->eState = CURSOR_VALID;

	5364 if( pCur->skipNext>0 ){

	5365 pCur->skipNext = 0;

	5366 return SQLITE_OK;

	5367 }

	5368 pCur->skipNext = 0;

	5369 }

	5370 }

	5371

	5372 pPage = pCur->apPage[pCur->iPage];

	5373 idx = ++pCur->aiIdx[pCur->iPage];

	5374 assert( pPage->isInit );

	5375

	5376 /* If the database file is corrupt, it is possible for the value of idx

	5377 ** to be invalid here. This can only occur if a second cursor modifies

	5378 ** the page while cursor pCur is holding a reference to it. Which can

	5379 ** only happen if the database is corrupt in such a way as to link the

	5380 ** page into more than one b-tree structure. */

	5381 testcase( idx>pPage->nCell );

	5382

	5383 if( idx>=pPage->nCell ){

	5384 if( !pPage->leaf ){

	5385 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));

	5386 if( rc ) return rc;

	5387 return moveToLeftmost(pCur);

	5388 }

	5389 do{

	5390 if( pCur->iPage==0 ){

	5391 *pRes = 1;

	5392 pCur->eState = CURSOR_INVALID;

	5393 return SQLITE_OK;

	5394 }

	5395 moveToParent(pCur);

	5396 pPage = pCur->apPage[pCur->iPage];

	5397 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );

	5398 if( pPage->intKey ){

	5399 return sqlite3BtreeNext(pCur, pRes);

	5400 }else{

	5401 return SQLITE_OK;

	5402 }

	5403 }

	5404 if( pPage->leaf ){

	5405 return SQLITE_OK;

	5406 }else{

	5407 return moveToLeftmost(pCur);

	5408 }

	5409 }

	5410 int sqlite3BtreeNext(BtCursor pCur, int pRes){

	5411 MemPage *pPage;

	5412 assert( cursorOwnsBtShared(pCur) );

	5413 assert( pRes!=0 );

	5414 assert( pRes==0 \|\| pRes==1 );

	5415 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

	5416 pCur->info.nSize = 0;

	5417 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

	5418 *pRes = 0;

	5419 if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);

	5420 pPage = pCur->apPage[pCur->iPage];

	5421 if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){

	5422 pCur->aiIdx[pCur->iPage]--;

	5423 return btreeNext(pCur, pRes);

	5424 }

	5425 if( pPage->leaf ){

	5426 return SQLITE_OK;

	5427 }else{

	5428 return moveToLeftmost(pCur);

	5429 }

	5430 }

	5431

	5432 /*

	5433 ** Step the cursor to the back to the previous entry in the database. If

	5434 ** successful then set *pRes=0. If the cursor

	5435 ** was already pointing to the first entry in the database before

	5436 ** this routine was called, then set *pRes=1.

	5437 **

	5438 ** The main entry point is sqlite3BtreePrevious(). That routine is optimized

	5439 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx

	5440 ** to the previous cell on the current page. The (slower) btreePrevious()

	5441 ** helper routine is called when it is necessary to move to a different page

	5442 ** or to restore the cursor.

	5443 **

	5444 ** The calling function will set pRes to 0 or 1. The initial pRes value

	5445 ** will be 1 if the cursor being stepped corresponds to an SQL index and

	5446 ** if this routine could have been skipped if that SQL index had been

	5447 ** a unique index. Otherwise the caller will have set *pRes to zero.

	5448 ** Zero is the common case. The btree implementation is free to use the

	5449 ** initial *pRes value as a hint to improve performance, but the current

	5450 ** SQLite btree implementation does not. (Note that the comdb2 btree

	5451 ** implementation does use this hint, however.)

	5452 */

	5453 static SQLITE_NOINLINE int btreePrevious(BtCursor pCur, int pRes){

	5454 int rc;

	5455 MemPage *pPage;

	5456

	5457 assert( cursorOwnsBtShared(pCur) );

	5458 assert( pRes!=0 );

	5459 assert( *pRes==0 );

	5460 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

	5461 assert( (pCur->curFlags & (BTCF_AtLast\|BTCF_ValidOvfl\|BTCF_ValidNKey))==0 );

	5462 assert( pCur->info.nSize==0 );

	5463 if( pCur->eState!=CURSOR_VALID ){

	5464 rc = restoreCursorPosition(pCur);

	5465 if( rc!=SQLITE_OK ){

	5466 return rc;

	5467 }

	5468 if( CURSOR_INVALID==pCur->eState ){

	5469 *pRes = 1;

	5470 return SQLITE_OK;

	5471 }

	5472 if( pCur->skipNext ){

	5473 assert( pCur->eState==CURSOR_VALID \|\| pCur->eState==CURSOR_SKIPNEXT );

	5474 pCur->eState = CURSOR_VALID;

	5475 if( pCur->skipNext<0 ){

	5476 pCur->skipNext = 0;

	5477 return SQLITE_OK;

	5478 }

	5479 pCur->skipNext = 0;

	5480 }

	5481 }

	5482

	5483 pPage = pCur->apPage[pCur->iPage];

	5484 assert( pPage->isInit );

	5485 if( !pPage->leaf ){

	5486 int idx = pCur->aiIdx[pCur->iPage];

	5487 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));

	5488 if( rc ) return rc;

	5489 rc = moveToRightmost(pCur);

	5490 }else{

	5491 while( pCur->aiIdx[pCur->iPage]==0 ){

	5492 if( pCur->iPage==0 ){

	5493 pCur->eState = CURSOR_INVALID;

	5494 *pRes = 1;

	5495 return SQLITE_OK;

	5496 }

	5497 moveToParent(pCur);

	5498 }

	5499 assert( pCur->info.nSize==0 );

	5500 assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );

	5501

	5502 pCur->aiIdx[pCur->iPage]--;

	5503 pPage = pCur->apPage[pCur->iPage];

	5504 if( pPage->intKey && !pPage->leaf ){

	5505 rc = sqlite3BtreePrevious(pCur, pRes);

	5506 }else{

	5507 rc = SQLITE_OK;

	5508 }

	5509 }

	5510 return rc;

	5511 }

	5512 int sqlite3BtreePrevious(BtCursor pCur, int pRes){

	5513 assert( cursorOwnsBtShared(pCur) );

	5514 assert( pRes!=0 );

	5515 assert( pRes==0 \|\| pRes==1 );

	5516 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

	5517 *pRes = 0;

	5518 pCur->curFlags &= ~(BTCF_AtLast\|BTCF_ValidOvfl\|BTCF_ValidNKey);

	5519 pCur->info.nSize = 0;

	5520 if( pCur->eState!=CURSOR_VALID

	5521 \|\| pCur->aiIdx[pCur->iPage]==0

	5522 \|\| pCur->apPage[pCur->iPage]->leaf==0

	5523 ){

	5524 return btreePrevious(pCur, pRes);

	5525 }

	5526 pCur->aiIdx[pCur->iPage]--;

	5527 return SQLITE_OK;

	5528 }

	5529

	5530 /*

	5531 ** Allocate a new page from the database file.

	5532 **

	5533 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()

	5534 ** has already been called on the new page.) The new page has also

	5535 ** been referenced and the calling routine is responsible for calling

	5536 ** sqlite3PagerUnref() on the new page when it is done.

	5537 **

	5538 ** SQLITE_OK is returned on success. Any other return value indicates

	5539 ** an error. *ppPage is set to NULL in the event of an error.

	5540 **

	5541 ** If the "nearby" parameter is not 0, then an effort is made to

	5542 ** locate a page close to the page number "nearby". This can be used in an

	5543 ** attempt to keep related pages close to each other in the database file,

	5544 ** which in turn can make database access faster.

	5545 **

	5546 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists

	5547 ** anywhere on the free-list, then it is guaranteed to be returned. If

	5548 ** eMode is BTALLOC_LT then the page returned will be less than or equal

	5549 ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there

	5550 ** are no restrictions on which page is returned.

	5551 */

	5552 static int allocateBtreePage(

	5553 BtShared pBt, / The btree */

	5554 MemPage *ppPage, / Store pointer to the allocated page here */

	5555 Pgno pPgno, / Store the page number here */

	5556 Pgno nearby, /* Search for a page near this one */

	5557 u8 eMode /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */

	5558 ){

	5559 MemPage *pPage1;

	5560 int rc;

	5561 u32 n; /* Number of pages on the freelist */

	5562 u32 k; /* Number of leaves on the trunk of the freelist */

	5563 MemPage *pTrunk = 0;

	5564 MemPage *pPrevTrunk = 0;

	5565 Pgno mxPage; /* Total size of the database file */

	5566

	5567 assert( sqlite3_mutex_held(pBt->mutex) );

	5568 assert( eMode==BTALLOC_ANY \|\| (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );

	5569 pPage1 = pBt->pPage1;

	5570 mxPage = btreePagecount(pBt);

	5571 /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36

	5572 ** stores stores the total number of pages on the freelist. */

	5573 n = get4byte(&pPage1->aData[36]);

	5574 testcase( n==mxPage-1 );

	5575 if( n>=mxPage ){

	5576 return SQLITE_CORRUPT_BKPT;

	5577 }

	5578 if( n>0 ){

	5579 /* There are pages on the freelist. Reuse one of those pages. */

	5580 Pgno iTrunk;

	5581 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */

	5582 u32 nSearch = 0; /* Count of the number of search attempts */

	5583

	5584 /* If eMode==BTALLOC_EXACT and a query of the pointer-map

	5585 ** shows that the page 'nearby' is somewhere on the free-list, then

	5586 ** the entire-list will be searched for that page.

	5587 */

	5588 #ifndef SQLITE_OMIT_AUTOVACUUM

	5589 if( eMode==BTALLOC_EXACT ){

	5590 if( nearby<=mxPage ){

	5591 u8 eType;

	5592 assert( nearby>0 );

	5593 assert( pBt->autoVacuum );

	5594 rc = ptrmapGet(pBt, nearby, &eType, 0);

	5595 if( rc ) return rc;

	5596 if( eType==PTRMAP_FREEPAGE ){

	5597 searchList = 1;

	5598 }

	5599 }

	5600 }else if( eMode==BTALLOC_LE ){

	5601 searchList = 1;

	5602 }

	5603 #endif

	5604

	5605 /* Decrement the free-list count by 1. Set iTrunk to the index of the

	5606 ** first free-list trunk page. iPrevTrunk is initially 1.

	5607 */

	5608 rc = sqlite3PagerWrite(pPage1->pDbPage);

	5609 if( rc ) return rc;

	5610 put4byte(&pPage1->aData[36], n-1);

	5611

	5612 /* The code within this loop is run only once if the 'searchList' variable

	5613 ** is not true. Otherwise, it runs once for each trunk-page on the

	5614 ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)

	5615 ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)

	5616 */

	5617 do {

	5618 pPrevTrunk = pTrunk;

	5619 if( pPrevTrunk ){

	5620 /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page

	5621 ** is the page number of the next freelist trunk page in the list or

	5622 ** zero if this is the last freelist trunk page. */

	5623 iTrunk = get4byte(&pPrevTrunk->aData[0]);

	5624 }else{

	5625 /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32

	5626 ** stores the page number of the first page of the freelist, or zero if

	5627 ** the freelist is empty. */

	5628 iTrunk = get4byte(&pPage1->aData[32]);

	5629 }

	5630 testcase( iTrunk==mxPage );

	5631 if( iTrunk>mxPage \|\| nSearch++ > n ){

	5632 rc = SQLITE_CORRUPT_BKPT;

	5633 }else{

	5634 rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);

	5635 }

	5636 if( rc ){

	5637 pTrunk = 0;

	5638 goto end_allocate_page;

	5639 }

	5640 assert( pTrunk!=0 );

	5641 assert( pTrunk->aData!=0 );

	5642 /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page

	5643 ** is the number of leaf page pointers to follow. */

	5644 k = get4byte(&pTrunk->aData[4]);

	5645 if( k==0 && !searchList ){

	5646 /* The trunk has no leaves and the list is not being searched.

	5647 ** So extract the trunk page itself and use it as the newly

	5648 ** allocated page */

	5649 assert( pPrevTrunk==0 );

	5650 rc = sqlite3PagerWrite(pTrunk->pDbPage);

	5651 if( rc ){

	5652 goto end_allocate_page;

	5653 }

	5654 *pPgno = iTrunk;

	5655 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);

	5656 *ppPage = pTrunk;

	5657 pTrunk = 0;

	5658 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));

	5659 }else if( k>(u32)(pBt->usableSize/4 - 2) ){

	5660 /* Value of k is out of range. Database corruption */

	5661 rc = SQLITE_CORRUPT_BKPT;

	5662 goto end_allocate_page;

	5663 #ifndef SQLITE_OMIT_AUTOVACUUM

	5664 }else if( searchList

	5665 && (nearby==iTrunk \|\| (iTrunk<nearby && eMode==BTALLOC_LE))

	5666 ){

	5667 /* The list is being searched and this trunk page is the page

	5668 ** to allocate, regardless of whether it has leaves.

	5669 */

	5670 *pPgno = iTrunk;

	5671 *ppPage = pTrunk;

	5672 searchList = 0;

	5673 rc = sqlite3PagerWrite(pTrunk->pDbPage);

	5674 if( rc ){

	5675 goto end_allocate_page;

	5676 }

	5677 if( k==0 ){

	5678 if( !pPrevTrunk ){

	5679 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);

	5680 }else{

	5681 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);

	5682 if( rc!=SQLITE_OK ){

	5683 goto end_allocate_page;

	5684 }

	5685 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);

	5686 }

	5687 }else{

	5688 /* The trunk page is required by the caller but it contains

	5689 ** pointers to free-list leaves. The first leaf becomes a trunk

	5690 ** page in this case.

	5691 */

	5692 MemPage *pNewTrunk;

	5693 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);

	5694 if( iNewTrunk>mxPage ){

	5695 rc = SQLITE_CORRUPT_BKPT;

	5696 goto end_allocate_page;

	5697 }

	5698 testcase( iNewTrunk==mxPage );

	5699 rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);

	5700 if( rc!=SQLITE_OK ){

	5701 goto end_allocate_page;

	5702 }

	5703 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);

	5704 if( rc!=SQLITE_OK ){

	5705 releasePage(pNewTrunk);

	5706 goto end_allocate_page;

	5707 }

	5708 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);

	5709 put4byte(&pNewTrunk->aData[4], k-1);

	5710 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);

	5711 releasePage(pNewTrunk);

	5712 if( !pPrevTrunk ){

	5713 assert( sqlite3PagerIswriteable(pPage1->pDbPage) );

	5714 put4byte(&pPage1->aData[32], iNewTrunk);

	5715 }else{

	5716 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);

	5717 if( rc ){

	5718 goto end_allocate_page;

	5719 }

	5720 put4byte(&pPrevTrunk->aData[0], iNewTrunk);

	5721 }

	5722 }

	5723 pTrunk = 0;

	5724 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));

	5725 #endif

	5726 }else if( k>0 ){

	5727 /* Extract a leaf from the trunk */

	5728 u32 closest;

	5729 Pgno iPage;

	5730 unsigned char *aData = pTrunk->aData;

	5731 if( nearby>0 ){

	5732 u32 i;

	5733 closest = 0;

	5734 if( eMode==BTALLOC_LE ){

	5735 for(i=0; i<k; i++){

	5736 iPage = get4byte(&aData[8+i*4]);

	5737 if( iPage<=nearby ){

	5738 closest = i;

	5739 break;

	5740 }

	5741 }

	5742 }else{

	5743 int dist;

	5744 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);

	5745 for(i=1; i<k; i++){

	5746 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);

	5747 if( d2<dist ){

	5748 closest = i;

	5749 dist = d2;

	5750 }

	5751 }

	5752 }

	5753 }else{

	5754 closest = 0;

	5755 }

	5756

	5757 iPage = get4byte(&aData[8+closest*4]);

	5758 testcase( iPage==mxPage );

	5759 if( iPage>mxPage ){

	5760 rc = SQLITE_CORRUPT_BKPT;

	5761 goto end_allocate_page;

	5762 }

	5763 testcase( iPage==mxPage );

	5764 if( !searchList

	5765 \|\| (iPage==nearby \|\| (iPage<nearby && eMode==BTALLOC_LE))

	5766 ){

	5767 int noContent;

	5768 *pPgno = iPage;

	5769 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"

	5770 ": %d more free pages\n",

	5771 *pPgno, closest+1, k, pTrunk->pgno, n-1));

	5772 rc = sqlite3PagerWrite(pTrunk->pDbPage);

	5773 if( rc ) goto end_allocate_page;

	5774 if( closest<k-1 ){

	5775 memcpy(&aData[8+closest4], &aData[4+k4], 4);

	5776 }

	5777 put4byte(&aData[4], k-1);

	5778 noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;

	5779 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);

	5780 if( rc==SQLITE_OK ){

	5781 rc = sqlite3PagerWrite((*ppPage)->pDbPage);

	5782 if( rc!=SQLITE_OK ){

	5783 releasePage(*ppPage);

	5784 *ppPage = 0;

	5785 }

	5786 }

	5787 searchList = 0;

	5788 }

	5789 }

	5790 releasePage(pPrevTrunk);

	5791 pPrevTrunk = 0;

	5792 }while( searchList );

	5793 }else{

	5794 /* There are no pages on the freelist, so append a new page to the

	5795 ** database image.

	5796 **

	5797 ** Normally, new pages allocated by this block can be requested from the

	5798 ** pager layer with the 'no-content' flag set. This prevents the pager

	5799 ** from trying to read the pages content from disk. However, if the

	5800 ** current transaction has already run one or more incremental-vacuum

	5801 ** steps, then the page we are about to allocate may contain content

	5802 ** that is required in the event of a rollback. In this case, do

	5803 ** not set the no-content flag. This causes the pager to load and journal

	5804 ** the current page content before overwriting it.

	5805 **

	5806 ** Note that the pager will not actually attempt to load or journal

	5807 ** content for any page that really does lie past the end of the database

	5808 ** file on disk. So the effects of disabling the no-content optimization

	5809 ** here are confined to those pages that lie between the end of the

	5810 ** database image and the end of the database file.

	5811 */

	5812 int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;

	5813

	5814 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	5815 if( rc ) return rc;

	5816 pBt->nPage++;

	5817 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;

	5818

	5819 #ifndef SQLITE_OMIT_AUTOVACUUM

	5820 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){

	5821 /* If *pPgno refers to a pointer-map page, allocate two new pages

	5822 ** at the end of the file instead of one. The first allocated page

	5823 ** becomes a new pointer-map page, the second is used by the caller.

	5824 */

	5825 MemPage *pPg = 0;

	5826 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));

	5827 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );

	5828 rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);

	5829 if( rc==SQLITE_OK ){

	5830 rc = sqlite3PagerWrite(pPg->pDbPage);

	5831 releasePage(pPg);

	5832 }

	5833 if( rc ) return rc;

	5834 pBt->nPage++;

	5835 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }

	5836 }

	5837 #endif

	5838 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);

	5839 *pPgno = pBt->nPage;

	5840

	5841 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );

	5842 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);

	5843 if( rc ) return rc;

	5844 rc = sqlite3PagerWrite((*ppPage)->pDbPage);

	5845 if( rc!=SQLITE_OK ){

	5846 releasePage(*ppPage);

	5847 *ppPage = 0;

	5848 }

	5849 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));

	5850 }

	5851

	5852 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );

	5853

	5854 end_allocate_page:

	5855 releasePage(pTrunk);

	5856 releasePage(pPrevTrunk);

	5857 assert( rc!=SQLITE_OK \|\| sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );

	5858 assert( rc!=SQLITE_OK \|\| (*ppPage)->isInit==0 );

	5859 return rc;

	5860 }

	5861

	5862 /*

	5863 ** This function is used to add page iPage to the database file free-list.

	5864 ** It is assumed that the page is not already a part of the free-list.

	5865 **

	5866 ** The value passed as the second argument to this function is optional.

	5867 ** If the caller happens to have a pointer to the MemPage object

	5868 ** corresponding to page iPage handy, it may pass it as the second value.

	5869 ** Otherwise, it may pass NULL.

	5870 **

	5871 ** If a pointer to a MemPage object is passed as the second argument,

	5872 ** its reference count is not altered by this function.

	5873 */

	5874 static int freePage2(BtShared pBt, MemPage pMemPage, Pgno iPage){

	5875 MemPage pTrunk = 0; / Free-list trunk page */

	5876 Pgno iTrunk = 0; /* Page number of free-list trunk page */

	5877 MemPage pPage1 = pBt->pPage1; / Local reference to page 1 */

	5878 MemPage pPage; / Page being freed. May be NULL. */

	5879 int rc; /* Return Code */

	5880 int nFree; /* Initial number of pages on free-list */

	5881

	5882 assert( sqlite3_mutex_held(pBt->mutex) );

	5883 assert( CORRUPT_DB \|\| iPage>1 );

	5884 assert( !pMemPage \|\| pMemPage->pgno==iPage );

	5885

	5886 if( iPage<2 ) return SQLITE_CORRUPT_BKPT;

	5887 if( pMemPage ){

	5888 pPage = pMemPage;

	5889 sqlite3PagerRef(pPage->pDbPage);

	5890 }else{

	5891 pPage = btreePageLookup(pBt, iPage);

	5892 }

	5893

	5894 /* Increment the free page count on pPage1 */

	5895 rc = sqlite3PagerWrite(pPage1->pDbPage);

	5896 if( rc ) goto freepage_out;

	5897 nFree = get4byte(&pPage1->aData[36]);

	5898 put4byte(&pPage1->aData[36], nFree+1);

	5899

	5900 if( pBt->btsFlags & BTS_SECURE_DELETE ){

	5901 /* If the secure_delete option is enabled, then

	5902 ** always fully overwrite deleted information with zeros.

	5903 */

	5904 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )

	5905 \|\| ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)

	5906 ){

	5907 goto freepage_out;

	5908 }

	5909 memset(pPage->aData, 0, pPage->pBt->pageSize);

	5910 }

	5911

	5912 /* If the database supports auto-vacuum, write an entry in the pointer-map

	5913 ** to indicate that the page is free.

	5914 */

	5915 if( ISAUTOVACUUM ){

	5916 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);

	5917 if( rc ) goto freepage_out;

	5918 }

	5919

	5920 /* Now manipulate the actual database free-list structure. There are two

	5921 ** possibilities. If the free-list is currently empty, or if the first

	5922 ** trunk page in the free-list is full, then this page will become a

	5923 ** new free-list trunk page. Otherwise, it will become a leaf of the

	5924 ** first trunk page in the current free-list. This block tests if it

	5925 ** is possible to add the page as a new free-list leaf.

	5926 */

	5927 if( nFree!=0 ){

	5928 u32 nLeaf; /* Initial number of leaf cells on trunk page */

	5929

	5930 iTrunk = get4byte(&pPage1->aData[32]);

	5931 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);

	5932 if( rc!=SQLITE_OK ){

	5933 goto freepage_out;

	5934 }

	5935

	5936 nLeaf = get4byte(&pTrunk->aData[4]);

	5937 assert( pBt->usableSize>32 );

	5938 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){

	5939 rc = SQLITE_CORRUPT_BKPT;

	5940 goto freepage_out;

	5941 }

	5942 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){

	5943 /* In this case there is room on the trunk page to insert the page

	5944 ** being freed as a new leaf.

	5945 **

	5946 ** Note that the trunk page is not really full until it contains

	5947 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have

	5948 ** coded. But due to a coding error in versions of SQLite prior to

	5949 ** 3.6.0, databases with freelist trunk pages holding more than

	5950 ** usableSize/4 - 8 entries will be reported as corrupt. In order

	5951 ** to maintain backwards compatibility with older versions of SQLite,

	5952 ** we will continue to restrict the number of entries to usableSize/4 - 8

	5953 ** for now. At some point in the future (once everyone has upgraded

	5954 ** to 3.6.0 or later) we should consider fixing the conditional above

	5955 ** to read "usableSize/4-2" instead of "usableSize/4-8".

	5956 **

	5957 ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still

	5958 ** avoid using the last six entries in the freelist trunk page array in

	5959 ** order that database files created by newer versions of SQLite can be

	5960 ** read by older versions of SQLite.

	5961 */

	5962 rc = sqlite3PagerWrite(pTrunk->pDbPage);

	5963 if( rc==SQLITE_OK ){

	5964 put4byte(&pTrunk->aData[4], nLeaf+1);

	5965 put4byte(&pTrunk->aData[8+nLeaf*4], iPage);

	5966 if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){

	5967 sqlite3PagerDontWrite(pPage->pDbPage);

	5968 }

	5969 rc = btreeSetHasContent(pBt, iPage);

	5970 }

	5971 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));

	5972 goto freepage_out;

	5973 }

	5974 }

	5975

	5976 /* If control flows to this point, then it was not possible to add the

	5977 ** the page being freed as a leaf page of the first trunk in the free-list.

	5978 ** Possibly because the free-list is empty, or possibly because the

	5979 ** first trunk in the free-list is full. Either way, the page being freed

	5980 ** will become the new first trunk page in the free-list.

	5981 */

	5982 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){

	5983 goto freepage_out;

	5984 }

	5985 rc = sqlite3PagerWrite(pPage->pDbPage);

	5986 if( rc!=SQLITE_OK ){

	5987 goto freepage_out;

	5988 }

	5989 put4byte(pPage->aData, iTrunk);

	5990 put4byte(&pPage->aData[4], 0);

	5991 put4byte(&pPage1->aData[32], iPage);

	5992 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));

	5993

	5994 freepage_out:

	5995 if( pPage ){

	5996 pPage->isInit = 0;

	5997 }

	5998 releasePage(pPage);

	5999 releasePage(pTrunk);

	6000 return rc;

	6001 }

	6002 static void freePage(MemPage pPage, int pRC){

	6003 if( (*pRC)==SQLITE_OK ){

	6004 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);

	6005 }

	6006 }

	6007

	6008 /*

	6009 ** Free any overflow pages associated with the given Cell. Write the

	6010 ** local Cell size (the number of bytes on the original page, omitting

	6011 ** overflow) into *pnSize.

	6012 */

	6013 static int clearCell(

	6014 MemPage pPage, / The page that contains the Cell */

	6015 unsigned char pCell, / First byte of the Cell */

	6016 CellInfo pInfo / Size information about the cell */

	6017 ){

	6018 BtShared *pBt = pPage->pBt;

	6019 Pgno ovflPgno;

	6020 int rc;

	6021 int nOvfl;

	6022 u32 ovflPageSize;

	6023

	6024 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	6025 pPage->xParseCell(pPage, pCell, pInfo);

	6026 if( pInfo->nLocal==pInfo->nPayload ){

	6027 return SQLITE_OK; /* No overflow pages. Return without doing anything */

	6028 }

	6029 if( pCell+pInfo->nSize-1 > pPage->aData+pPage->maskPage ){

	6030 return SQLITE_CORRUPT_BKPT; /* Cell extends past end of page */

	6031 }

	6032 ovflPgno = get4byte(pCell + pInfo->nSize - 4);

	6033 assert( pBt->usableSize > 4 );

	6034 ovflPageSize = pBt->usableSize - 4;

	6035 nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;

	6036 assert( nOvfl>0 \|\|

	6037 (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)

	6038 );

	6039 while( nOvfl-- ){

	6040 Pgno iNext = 0;

	6041 MemPage *pOvfl = 0;

	6042 if( ovflPgno<2 \|\| ovflPgno>btreePagecount(pBt) ){

	6043 /* 0 is not a legal page number and page 1 cannot be an

	6044 ** overflow page. Therefore if ovflPgno<2 or past the end of the

	6045 ** file the database must be corrupt. */

	6046 return SQLITE_CORRUPT_BKPT;

	6047 }

	6048 if( nOvfl ){

	6049 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);

	6050 if( rc ) return rc;

	6051 }

	6052

	6053 if( ( pOvfl \|\| ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )

	6054 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1

	6055 ){

	6056 /* There is no reason any cursor should have an outstanding reference

	6057 ** to an overflow page belonging to a cell that is being deleted/updated.

	6058 ** So if there exists more than one reference to this page, then it

	6059 ** must not really be an overflow page and the database must be corrupt.

	6060 ** It is helpful to detect this before calling freePage2(), as

	6061 ** freePage2() may zero the page contents if secure-delete mode is

	6062 ** enabled. If this 'overflow' page happens to be a page that the

	6063 ** caller is iterating through or using in some other way, this

	6064 ** can be problematic.

	6065 */

	6066 rc = SQLITE_CORRUPT_BKPT;

	6067 }else{

	6068 rc = freePage2(pBt, pOvfl, ovflPgno);

	6069 }

	6070

	6071 if( pOvfl ){

	6072 sqlite3PagerUnref(pOvfl->pDbPage);

	6073 }

	6074 if( rc ) return rc;

	6075 ovflPgno = iNext;

	6076 }

	6077 return SQLITE_OK;

	6078 }

	6079

	6080 /*

	6081 ** Create the byte sequence used to represent a cell on page pPage

	6082 ** and write that byte sequence into pCell[]. Overflow pages are

	6083 ** allocated and filled in as necessary. The calling procedure

	6084 ** is responsible for making sure sufficient space has been allocated

	6085 ** for pCell[].

	6086 **

	6087 ** Note that pCell does not necessary need to point to the pPage->aData

	6088 ** area. pCell might point to some temporary storage. The cell will

	6089 ** be constructed in this temporary area then copied into pPage->aData

	6090 ** later.

	6091 */

	6092 static int fillInCell(

	6093 MemPage pPage, / The page that contains the cell */

	6094 unsigned char pCell, / Complete text of the cell */

	6095 const BtreePayload pX, / Payload with which to construct the cell */

	6096 int pnSize / Write cell size here */

	6097 ){

	6098 int nPayload;

	6099 const u8 *pSrc;

	6100 int nSrc, n, rc;

	6101 int spaceLeft;

	6102 MemPage *pOvfl = 0;

	6103 MemPage *pToRelease = 0;

	6104 unsigned char *pPrior;

	6105 unsigned char *pPayload;

	6106 BtShared *pBt = pPage->pBt;

	6107 Pgno pgnoOvfl = 0;

	6108 int nHeader;

	6109

	6110 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	6111

	6112 /* pPage is not necessarily writeable since pCell might be auxiliary

	6113 ** buffer space that is separate from the pPage buffer area */

	6114 assert( pCell<pPage->aData \|\| pCell>=&pPage->aData[pBt->pageSize]

	6115 \|\| sqlite3PagerIswriteable(pPage->pDbPage) );

	6116

	6117 /* Fill in the header. */

	6118 nHeader = pPage->childPtrSize;

	6119 if( pPage->intKey ){

	6120 nPayload = pX->nData + pX->nZero;

	6121 pSrc = pX->pData;

	6122 nSrc = pX->nData;

	6123 assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */

	6124 nHeader += putVarint32(&pCell[nHeader], nPayload);

	6125 nHeader += putVarint(&pCell[nHeader], (u64)&pX->nKey);

	6126 }else{

	6127 assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );

	6128 nSrc = nPayload = (int)pX->nKey;

	6129 pSrc = pX->pKey;

	6130 nHeader += putVarint32(&pCell[nHeader], nPayload);

	6131 }

	6132

	6133 /* Fill in the payload */

	6134 if( nPayload<=pPage->maxLocal ){

	6135 n = nHeader + nPayload;

	6136 testcase( n==3 );

	6137 testcase( n==4 );

	6138 if( n<4 ) n = 4;

	6139 *pnSize = n;

	6140 spaceLeft = nPayload;

	6141 pPrior = pCell;

	6142 }else{

	6143 int mn = pPage->minLocal;

	6144 n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);

	6145 testcase( n==pPage->maxLocal );

	6146 testcase( n==pPage->maxLocal+1 );

	6147 if( n > pPage->maxLocal ) n = mn;

	6148 spaceLeft = n;

	6149 *pnSize = n + nHeader + 4;

	6150 pPrior = &pCell[nHeader+n];

	6151 }

	6152 pPayload = &pCell[nHeader];

	6153

	6154 /* At this point variables should be set as follows:

	6155 **

	6156 ** nPayload Total payload size in bytes

	6157 ** pPayload Begin writing payload here

	6158 ** spaceLeft Space available at pPayload. If nPayload>spaceLeft,

	6159 ** that means content must spill into overflow pages.

	6160 ** *pnSize Size of the local cell (not counting overflow pages)

	6161 ** pPrior Where to write the pgno of the first overflow page

	6162 **

	6163 ** Use a call to btreeParseCellPtr() to verify that the values above

	6164 ** were computed correctly.

	6165 */

	6166 #if SQLITE_DEBUG

	6167 {

	6168 CellInfo info;

	6169 pPage->xParseCell(pPage, pCell, &info);

	6170 assert( nHeader==(int)(info.pPayload - pCell) );

	6171 assert( info.nKey==pX->nKey );

	6172 assert( *pnSize == info.nSize );

	6173 assert( spaceLeft == info.nLocal );

	6174 }

	6175 #endif

	6176

	6177 /* Write the payload into the local Cell and any extra into overflow pages */

	6178 while( nPayload>0 ){

	6179 if( spaceLeft==0 ){

	6180 #ifndef SQLITE_OMIT_AUTOVACUUM

	6181 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */

	6182 if( pBt->autoVacuum ){

	6183 do{

	6184 pgnoOvfl++;

	6185 } while(

	6186 PTRMAP_ISPAGE(pBt, pgnoOvfl) \|\| pgnoOvfl==PENDING_BYTE_PAGE(pBt)

	6187 );

	6188 }

	6189 #endif

	6190 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);

	6191 #ifndef SQLITE_OMIT_AUTOVACUUM

	6192 /* If the database supports auto-vacuum, and the second or subsequent

	6193 ** overflow page is being allocated, add an entry to the pointer-map

	6194 ** for that page now.

	6195 **

	6196 ** If this is the first overflow page, then write a partial entry

	6197 ** to the pointer-map. If we write nothing to this pointer-map slot,

	6198 ** then the optimistic overflow chain processing in clearCell()

	6199 ** may misinterpret the uninitialized values and delete the

	6200 ** wrong pages from the database.

	6201 */

	6202 if( pBt->autoVacuum && rc==SQLITE_OK ){

	6203 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);

	6204 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);

	6205 if( rc ){

	6206 releasePage(pOvfl);

	6207 }

	6208 }

	6209 #endif

	6210 if( rc ){

	6211 releasePage(pToRelease);

	6212 return rc;

	6213 }

	6214

	6215 /* If pToRelease is not zero than pPrior points into the data area

	6216 ** of pToRelease. Make sure pToRelease is still writeable. */

	6217 assert( pToRelease==0 \|\| sqlite3PagerIswriteable(pToRelease->pDbPage) );

	6218

	6219 /* If pPrior is part of the data area of pPage, then make sure pPage

	6220 ** is still writeable */

	6221 assert( pPrior<pPage->aData \|\| pPrior>=&pPage->aData[pBt->pageSize]

	6222 \|\| sqlite3PagerIswriteable(pPage->pDbPage) );

	6223

	6224 put4byte(pPrior, pgnoOvfl);

	6225 releasePage(pToRelease);

	6226 pToRelease = pOvfl;

	6227 pPrior = pOvfl->aData;

	6228 put4byte(pPrior, 0);

	6229 pPayload = &pOvfl->aData[4];

	6230 spaceLeft = pBt->usableSize - 4;

	6231 }

	6232 n = nPayload;

	6233 if( n>spaceLeft ) n = spaceLeft;

	6234

	6235 /* If pToRelease is not zero than pPayload points into the data area

	6236 ** of pToRelease. Make sure pToRelease is still writeable. */

	6237 assert( pToRelease==0 \|\| sqlite3PagerIswriteable(pToRelease->pDbPage) );

	6238

	6239 /* If pPayload is part of the data area of pPage, then make sure pPage

	6240 ** is still writeable */

	6241 assert( pPayload<pPage->aData \|\| pPayload>=&pPage->aData[pBt->pageSize]

	6242 \|\| sqlite3PagerIswriteable(pPage->pDbPage) );

	6243

	6244 if( nSrc>0 ){

	6245 if( n>nSrc ) n = nSrc;

	6246 assert( pSrc );

	6247 memcpy(pPayload, pSrc, n);

	6248 }else{

	6249 memset(pPayload, 0, n);

	6250 }

	6251 nPayload -= n;

	6252 pPayload += n;

	6253 pSrc += n;

	6254 nSrc -= n;

	6255 spaceLeft -= n;

	6256 }

	6257 releasePage(pToRelease);

	6258 return SQLITE_OK;

	6259 }

	6260

	6261 /*

	6262 ** Remove the i-th cell from pPage. This routine effects pPage only.

	6263 ** The cell content is not freed or deallocated. It is assumed that

	6264 ** the cell content has been copied someplace else. This routine just

	6265 ** removes the reference to the cell from pPage.

	6266 **

	6267 ** "sz" must be the number of bytes in the cell.

	6268 */

	6269 static void dropCell(MemPage pPage, int idx, int sz, int pRC){

	6270 u32 pc; /* Offset to cell content of cell being deleted */

	6271 u8 data; / pPage->aData */

	6272 u8 ptr; / Used to move bytes around within data[] */

	6273 int rc; /* The return code */

	6274 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */

	6275

	6276 if( *pRC ) return;

	6277 assert( idx>=0 && idx<pPage->nCell );

	6278 assert( CORRUPT_DB \|\| sz==cellSize(pPage, idx) );

	6279 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	6280 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	6281 data = pPage->aData;

	6282 ptr = &pPage->aCellIdx[2*idx];

	6283 pc = get2byte(ptr);

	6284 hdr = pPage->hdrOffset;

	6285 testcase( pc==get2byte(&data[hdr+5]) );

	6286 testcase( pc+sz==pPage->pBt->usableSize );

	6287 if( pc < (u32)get2byte(&data[hdr+5]) \|\| pc+sz > pPage->pBt->usableSize ){

	6288 *pRC = SQLITE_CORRUPT_BKPT;

	6289 return;

	6290 }

	6291 rc = freeSpace(pPage, pc, sz);

	6292 if( rc ){

	6293 *pRC = rc;

	6294 return;

	6295 }

	6296 pPage->nCell--;

	6297 if( pPage->nCell==0 ){

	6298 memset(&data[hdr+1], 0, 4);

	6299 data[hdr+7] = 0;

	6300 put2byte(&data[hdr+5], pPage->pBt->usableSize);

	6301 pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset

	6302 - pPage->childPtrSize - 8;

	6303 }else{

	6304 memmove(ptr, ptr+2, 2*(pPage->nCell - idx));

	6305 put2byte(&data[hdr+3], pPage->nCell);

	6306 pPage->nFree += 2;

	6307 }

	6308 }

	6309

	6310 /*

	6311 ** Insert a new cell on pPage at cell index "i". pCell points to the

	6312 ** content of the cell.

	6313 **

	6314 ** If the cell content will fit on the page, then put it there. If it

	6315 ** will not fit, then make a copy of the cell content into pTemp if

	6316 ** pTemp is not null. Regardless of pTemp, allocate a new entry

	6317 ** in pPage->apOvfl[] and make it point to the cell content (either

	6318 ** in pTemp or the original pCell) and also record its index.

	6319 ** Allocating a new entry in pPage->aCell[] implies that

	6320 ** pPage->nOverflow is incremented.

	6321 **

	6322 ** *pRC must be SQLITE_OK when this routine is called.

	6323 */

	6324 static void insertCell(

	6325 MemPage pPage, / Page into which we are copying */

	6326 int i, /* New cell becomes the i-th cell of the page */

	6327 u8 pCell, / Content of the new cell */

	6328 int sz, /* Bytes of content in pCell */

	6329 u8 pTemp, / Temp storage space for pCell, if needed */

	6330 Pgno iChild, /* If non-zero, replace first 4 bytes with this value */

	6331 int pRC / Read and write return code from here */

	6332 ){

	6333 int idx = 0; /* Where to write new cell content in data[] */

	6334 int j; /* Loop counter */

	6335 u8 data; / The content of the whole page */

	6336 u8 pIns; / The point in pPage->aCellIdx[] where no cell inserted */

	6337

	6338 assert( *pRC==SQLITE_OK );

	6339 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );

	6340 assert( MX_CELL(pPage->pBt)<=10921 );

	6341 assert( pPage->nCell<=MX_CELL(pPage->pBt) \|\| CORRUPT_DB );

	6342 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );

	6343 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );

	6344 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	6345 /* The cell should normally be sized correctly. However, when moving a

	6346 ** malformed cell from a leaf page to an interior page, if the cell size

	6347 ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size

	6348 ** might be less than 8 (leaf-size + pointer) on the interior node. Hence

	6349 ** the term after the \|\| in the following assert(). */

	6350 assert( sz==pPage->xCellSize(pPage, pCell) \|\| (sz==8 && iChild>0) );

	6351 if( pPage->nOverflow \|\| sz+2>pPage->nFree ){

	6352 if( pTemp ){

	6353 memcpy(pTemp, pCell, sz);

	6354 pCell = pTemp;

	6355 }

	6356 if( iChild ){

	6357 put4byte(pCell, iChild);

	6358 }

	6359 j = pPage->nOverflow++;

	6360 /* Comparison against ArraySize-1 since we hold back one extra slot

	6361 ** as a contingency. In other words, never need more than 3 overflow

	6362 ** slots but 4 are allocated, just to be safe. */

	6363 assert( j < ArraySize(pPage->apOvfl)-1 );

	6364 pPage->apOvfl[j] = pCell;

	6365 pPage->aiOvfl[j] = (u16)i;

	6366

	6367 /* When multiple overflows occur, they are always sequential and in

	6368 ** sorted order. This invariants arise because multiple overflows can

	6369 ** only occur when inserting divider cells into the parent page during

	6370 ** balancing, and the dividers are adjacent and sorted.

	6371 */

	6372 assert( j==0 \|\| pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */

	6373 assert( j==0 \|\| i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */

	6374 }else{

	6375 int rc = sqlite3PagerWrite(pPage->pDbPage);

	6376 if( rc!=SQLITE_OK ){

	6377 *pRC = rc;

	6378 return;

	6379 }

	6380 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	6381 data = pPage->aData;

	6382 assert( &data[pPage->cellOffset]==pPage->aCellIdx );

	6383 rc = allocateSpace(pPage, sz, &idx);

	6384 if( rc ){ *pRC = rc; return; }

	6385 /* The allocateSpace() routine guarantees the following properties

	6386 ** if it returns successfully */

	6387 assert( idx >= 0 );

	6388 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 \|\| CORRUPT_DB );

	6389 assert( idx+sz <= (int)pPage->pBt->usableSize );

	6390 pPage->nFree -= (u16)(2 + sz);

	6391 memcpy(&data[idx], pCell, sz);

	6392 if( iChild ){

	6393 put4byte(&data[idx], iChild);

	6394 }

	6395 pIns = pPage->aCellIdx + i*2;

	6396 memmove(pIns+2, pIns, 2*(pPage->nCell - i));

	6397 put2byte(pIns, idx);

	6398 pPage->nCell++;

	6399 /* increment the cell count */

	6400 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;

	6401 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell );

	6402 #ifndef SQLITE_OMIT_AUTOVACUUM

	6403 if( pPage->pBt->autoVacuum ){

	6404 /* The cell may contain a pointer to an overflow page. If so, write

	6405 ** the entry for the overflow page into the pointer map.

	6406 */

	6407 ptrmapPutOvflPtr(pPage, pCell, pRC);

	6408 }

	6409 #endif

	6410 }

	6411 }

	6412

	6413 /*

	6414 ** A CellArray object contains a cache of pointers and sizes for a

	6415 ** consecutive sequence of cells that might be held on multiple pages.

	6416 */

	6417 typedef struct CellArray CellArray;

	6418 struct CellArray {

	6419 int nCell; /* Number of cells in apCell[] */

	6420 MemPage pRef; / Reference page */

	6421 u8 *apCell; / All cells begin balanced */

	6422 u16 szCell; / Local size of all cells in apCell[] */

	6423 };

	6424

	6425 /*

	6426 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been

	6427 ** computed.

	6428 */

	6429 static void populateCellCache(CellArray *p, int idx, int N){

	6430 assert( idx>=0 && idx+N<=p->nCell );

	6431 while( N>0 ){

	6432 assert( p->apCell[idx]!=0 );

	6433 if( p->szCell[idx]==0 ){

	6434 p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);

	6435 }else{

	6436 assert( CORRUPT_DB \|\|

	6437 p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );

	6438 }

	6439 idx++;

	6440 N--;

	6441 }

	6442 }

	6443

	6444 /*

	6445 ** Return the size of the Nth element of the cell array

	6446 */

	6447 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){

	6448 assert( N>=0 && N<p->nCell );

	6449 assert( p->szCell[N]==0 );

	6450 p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);

	6451 return p->szCell[N];

	6452 }

	6453 static u16 cachedCellSize(CellArray *p, int N){

	6454 assert( N>=0 && N<p->nCell );

	6455 if( p->szCell[N] ) return p->szCell[N];

	6456 return computeCellSize(p, N);

	6457 }

	6458

	6459 /*

	6460 ** Array apCell[] contains pointers to nCell b-tree page cells. The

	6461 ** szCell[] array contains the size in bytes of each cell. This function

	6462 ** replaces the current contents of page pPg with the contents of the cell

	6463 ** array.

	6464 **

	6465 ** Some of the cells in apCell[] may currently be stored in pPg. This

	6466 ** function works around problems caused by this by making a copy of any

	6467 ** such cells before overwriting the page data.

	6468 **

	6469 ** The MemPage.nFree field is invalidated by this function. It is the

	6470 ** responsibility of the caller to set it correctly.

	6471 */

	6472 static int rebuildPage(

	6473 MemPage pPg, / Edit this page */

	6474 int nCell, /* Final number of cells on page */

	6475 u8 *apCell, / Array of cells */

	6476 u16 szCell / Array of cell sizes */

	6477 ){

	6478 const int hdr = pPg->hdrOffset; /* Offset of header on pPg */

	6479 u8 * const aData = pPg->aData; /* Pointer to data for pPg */

	6480 const int usableSize = pPg->pBt->usableSize;

	6481 u8 * const pEnd = &aData[usableSize];

	6482 int i;

	6483 u8 *pCellptr = pPg->aCellIdx;

	6484 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);

	6485 u8 *pData;

	6486

	6487 i = get2byte(&aData[hdr+5]);

	6488 memcpy(&pTmp[i], &aData[i], usableSize - i);

	6489

	6490 pData = pEnd;

	6491 for(i=0; i<nCell; i++){

	6492 u8 *pCell = apCell[i];

	6493 if( SQLITE_WITHIN(pCell,aData,pEnd) ){

	6494 pCell = &pTmp[pCell - aData];

	6495 }

	6496 pData -= szCell[i];

	6497 put2byte(pCellptr, (pData - aData));

	6498 pCellptr += 2;

	6499 if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;

	6500 memcpy(pData, pCell, szCell[i]);

	6501 assert( szCell[i]==pPg->xCellSize(pPg, pCell) \|\| CORRUPT_DB );

	6502 testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) );

	6503 }

	6504

	6505 /* The pPg->nFree field is now set incorrectly. The caller will fix it. */

	6506 pPg->nCell = nCell;

	6507 pPg->nOverflow = 0;

	6508

	6509 put2byte(&aData[hdr+1], 0);

	6510 put2byte(&aData[hdr+3], pPg->nCell);

	6511 put2byte(&aData[hdr+5], pData - aData);

	6512 aData[hdr+7] = 0x00;

	6513 return SQLITE_OK;

	6514 }

	6515

	6516 /*

	6517 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell

	6518 ** contains the size in bytes of each such cell. This function attempts to

	6519 ** add the cells stored in the array to page pPg. If it cannot (because

	6520 ** the page needs to be defragmented before the cells will fit), non-zero

	6521 ** is returned. Otherwise, if the cells are added successfully, zero is

	6522 ** returned.

	6523 **

	6524 ** Argument pCellptr points to the first entry in the cell-pointer array

	6525 ** (part of page pPg) to populate. After cell apCell[0] is written to the

	6526 ** page body, a 16-bit offset is written to pCellptr. And so on, for each

	6527 ** cell in the array. It is the responsibility of the caller to ensure

	6528 ** that it is safe to overwrite this part of the cell-pointer array.

	6529 **

	6530 ** When this function is called, *ppData points to the start of the

	6531 ** content area on page pPg. If the size of the content area is extended,

	6532 ** *ppData is updated to point to the new start of the content area

	6533 ** before returning.

	6534 **

	6535 ** Finally, argument pBegin points to the byte immediately following the

	6536 ** end of the space required by this page for the cell-pointer area (for

	6537 ** all cells - not just those inserted by the current call). If the content

	6538 ** area must be extended to before this point in order to accomodate all

	6539 ** cells in apCell[], then the cells do not fit and non-zero is returned.

	6540 */

	6541 static int pageInsertArray(

	6542 MemPage pPg, / Page to add cells to */

	6543 u8 pBegin, / End of cell-pointer array */

	6544 u8 *ppData, / IN/OUT: Page content -area pointer */

	6545 u8 pCellptr, / Pointer to cell-pointer area */

	6546 int iFirst, /* Index of first cell to add */

	6547 int nCell, /* Number of cells to add to pPg */

	6548 CellArray pCArray / Array of cells */

	6549 ){

	6550 int i;

	6551 u8 *aData = pPg->aData;

	6552 u8 pData = ppData;

	6553 int iEnd = iFirst + nCell;

	6554 assert( CORRUPT_DB \|\| pPg->hdrOffset==0 ); /* Never called on page 1 */

	6555 for(i=iFirst; i<iEnd; i++){

	6556 int sz, rc;

	6557 u8 *pSlot;

	6558 sz = cachedCellSize(pCArray, i);

	6559 if( (aData[1]==0 && aData[2]==0) \|\| (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){

	6560 if( (pData - pBegin)<sz ) return 1;

	6561 pData -= sz;

	6562 pSlot = pData;

	6563 }

	6564 /* pSlot and pCArray->apCell[i] will never overlap on a well-formed

	6565 ** database. But they might for a corrupt database. Hence use memmove()

	6566 ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */

	6567 assert( (pSlot+sz)<=pCArray->apCell[i]

	6568 \|\| pSlot>=(pCArray->apCell[i]+sz)

	6569 \|\| CORRUPT_DB );

	6570 memmove(pSlot, pCArray->apCell[i], sz);

	6571 put2byte(pCellptr, (pSlot - aData));

	6572 pCellptr += 2;

	6573 }

	6574 *ppData = pData;

	6575 return 0;

	6576 }

	6577

	6578 /*

	6579 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell

	6580 ** contains the size in bytes of each such cell. This function adds the

	6581 ** space associated with each cell in the array that is currently stored

	6582 ** within the body of pPg to the pPg free-list. The cell-pointers and other

	6583 ** fields of the page are not updated.

	6584 **

	6585 ** This function returns the total number of cells added to the free-list.

	6586 */

	6587 static int pageFreeArray(

	6588 MemPage pPg, / Page to edit */

	6589 int iFirst, /* First cell to delete */

	6590 int nCell, /* Cells to delete */

	6591 CellArray pCArray / Array of cells */

	6592 ){

	6593 u8 * const aData = pPg->aData;

	6594 u8 * const pEnd = &aData[pPg->pBt->usableSize];

	6595 u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];

	6596 int nRet = 0;

	6597 int i;

	6598 int iEnd = iFirst + nCell;

	6599 u8 *pFree = 0;

	6600 int szFree = 0;

	6601

	6602 for(i=iFirst; i<iEnd; i++){

	6603 u8 *pCell = pCArray->apCell[i];

	6604 if( SQLITE_WITHIN(pCell, pStart, pEnd) ){

	6605 int sz;

	6606 /* No need to use cachedCellSize() here. The sizes of all cells that

	6607 ** are to be freed have already been computing while deciding which

	6608 ** cells need freeing */

	6609 sz = pCArray->szCell[i]; assert( sz>0 );

	6610 if( pFree!=(pCell + sz) ){

	6611 if( pFree ){

	6612 assert( pFree>aData && (pFree - aData)<65536 );

	6613 freeSpace(pPg, (u16)(pFree - aData), szFree);

	6614 }

	6615 pFree = pCell;

	6616 szFree = sz;

	6617 if( pFree+sz>pEnd ) return 0;

	6618 }else{

	6619 pFree = pCell;

	6620 szFree += sz;

	6621 }

	6622 nRet++;

	6623 }

	6624 }

	6625 if( pFree ){

	6626 assert( pFree>aData && (pFree - aData)<65536 );

	6627 freeSpace(pPg, (u16)(pFree - aData), szFree);

	6628 }

	6629 return nRet;

	6630 }

	6631

	6632 /*

	6633 ** apCell[] and szCell[] contains pointers to and sizes of all cells in the

	6634 ** pages being balanced. The current page, pPg, has pPg->nCell cells starting

	6635 ** with apCell[iOld]. After balancing, this page should hold nNew cells

	6636 ** starting at apCell[iNew].

	6637 **

	6638 ** This routine makes the necessary adjustments to pPg so that it contains

	6639 ** the correct cells after being balanced.

	6640 **

	6641 ** The pPg->nFree field is invalid when this function returns. It is the

	6642 ** responsibility of the caller to set it correctly.

	6643 */

	6644 static int editPage(

	6645 MemPage pPg, / Edit this page */

	6646 int iOld, /* Index of first cell currently on page */

	6647 int iNew, /* Index of new first cell on page */

	6648 int nNew, /* Final number of cells on page */

	6649 CellArray pCArray / Array of cells and sizes */

	6650 ){

	6651 u8 * const aData = pPg->aData;

	6652 const int hdr = pPg->hdrOffset;

	6653 u8 pBegin = &pPg->aCellIdx[nNew 2];

	6654 int nCell = pPg->nCell; /* Cells stored on pPg */

	6655 u8 *pData;

	6656 u8 *pCellptr;

	6657 int i;

	6658 int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;

	6659 int iNewEnd = iNew + nNew;

	6660

	6661 #ifdef SQLITE_DEBUG

	6662 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);

	6663 memcpy(pTmp, aData, pPg->pBt->usableSize);

	6664 #endif

	6665

	6666 /* Remove cells from the start and end of the page */

	6667 if( iOld<iNew ){

	6668 int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);

	6669 memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift2], nCell2);

	6670 nCell -= nShift;

	6671 }

	6672 if( iNewEnd < iOldEnd ){

	6673 nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);

	6674 }

	6675

	6676 pData = &aData[get2byteNotZero(&aData[hdr+5])];

	6677 if( pData<pBegin ) goto editpage_fail;

	6678

	6679 /* Add cells to the start of the page */

	6680 if( iNew<iOld ){

	6681 int nAdd = MIN(nNew,iOld-iNew);

	6682 assert( (iOld-iNew)<nNew \|\| nCell==0 \|\| CORRUPT_DB );

	6683 pCellptr = pPg->aCellIdx;

	6684 memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);

	6685 if( pageInsertArray(

	6686 pPg, pBegin, &pData, pCellptr,

	6687 iNew, nAdd, pCArray

	6688 ) ) goto editpage_fail;

	6689 nCell += nAdd;

	6690 }

	6691

	6692 /* Add any overflow cells */

	6693 for(i=0; i<pPg->nOverflow; i++){

	6694 int iCell = (iOld + pPg->aiOvfl[i]) - iNew;

	6695 if( iCell>=0 && iCell<nNew ){

	6696 pCellptr = &pPg->aCellIdx[iCell * 2];

	6697 memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);

	6698 nCell++;

	6699 if( pageInsertArray(

	6700 pPg, pBegin, &pData, pCellptr,

	6701 iCell+iNew, 1, pCArray

	6702 ) ) goto editpage_fail;

	6703 }

	6704 }

	6705

	6706 /* Append cells to the end of the page */

	6707 pCellptr = &pPg->aCellIdx[nCell*2];

	6708 if( pageInsertArray(

	6709 pPg, pBegin, &pData, pCellptr,

	6710 iNew+nCell, nNew-nCell, pCArray

	6711 ) ) goto editpage_fail;

	6712

	6713 pPg->nCell = nNew;

	6714 pPg->nOverflow = 0;

	6715

	6716 put2byte(&aData[hdr+3], pPg->nCell);

	6717 put2byte(&aData[hdr+5], pData - aData);

	6718

	6719 #ifdef SQLITE_DEBUG

	6720 for(i=0; i<nNew && !CORRUPT_DB; i++){

	6721 u8 *pCell = pCArray->apCell[i+iNew];

	6722 int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);

	6723 if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){

	6724 pCell = &pTmp[pCell - aData];

	6725 }

	6726 assert( 0==memcmp(pCell, &aData[iOff],

	6727 pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );

	6728 }

	6729 #endif

	6730

	6731 return SQLITE_OK;

	6732 editpage_fail:

	6733 /* Unable to edit this page. Rebuild it from scratch instead. */

	6734 populateCellCache(pCArray, iNew, nNew);

	6735 return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]);

	6736 }

	6737

	6738 /*

	6739 ** The following parameters determine how many adjacent pages get involved

	6740 ** in a balancing operation. NN is the number of neighbors on either side

	6741 ** of the page that participate in the balancing operation. NB is the

	6742 ** total number of pages that participate, including the target page and

	6743 ** NN neighbors on either side.

	6744 **

	6745 ** The minimum value of NN is 1 (of course). Increasing NN above 1

	6746 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance

	6747 ** in exchange for a larger degradation in INSERT and UPDATE performance.

	6748 ** A value of NN==1 appears to give the best results overall.

	6749 */

	6750 #define NN 1 /* Number of neighbors on either side of pPage */

	6751 #define NB (NN*2+1) /* Total pages involved in the balance */

	6752

	6753

	6754 #ifndef SQLITE_OMIT_QUICKBALANCE

	6755 /*

	6756 ** This version of balance() handles the common special case where

	6757 ** a new entry is being inserted on the extreme right-end of the

	6758 ** tree, in other words, when the new entry will become the largest

	6759 ** entry in the tree.

	6760 **

	6761 ** Instead of trying to balance the 3 right-most leaf pages, just add

	6762 ** a new page to the right-hand side and put the one new entry in

	6763 ** that page. This leaves the right side of the tree somewhat

	6764 ** unbalanced. But odds are that we will be inserting new entries

	6765 ** at the end soon afterwards so the nearly empty page will quickly

	6766 ** fill up. On average.

	6767 **

	6768 ** pPage is the leaf page which is the right-most page in the tree.

	6769 ** pParent is its parent. pPage must have a single overflow entry

	6770 ** which is also the right-most entry on the page.

	6771 **

	6772 ** The pSpace buffer is used to store a temporary copy of the divider

	6773 ** cell that will be inserted into pParent. Such a cell consists of a 4

	6774 ** byte page number followed by a variable length integer. In other

	6775 ** words, at most 13 bytes. Hence the pSpace buffer must be at

	6776 ** least 13 bytes in size.

	6777 */

	6778 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){

	6779 BtShared *const pBt = pPage->pBt; /* B-Tree Database */

	6780 MemPage *pNew; /* Newly allocated page */

	6781 int rc; /* Return Code */

	6782 Pgno pgnoNew; /* Page number of pNew */

	6783

	6784 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	6785 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

	6786 assert( pPage->nOverflow==1 );

	6787

	6788 /* This error condition is now caught prior to reaching this function */

	6789 if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;

	6790

	6791 /* Allocate a new page. This page will become the right-sibling of

	6792 ** pPage. Make the parent page writable, so that the new divider cell

	6793 ** may be inserted. If both these operations are successful, proceed.

	6794 */

	6795 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);

	6796

	6797 if( rc==SQLITE_OK ){

	6798

	6799 u8 *pOut = &pSpace[4];

	6800 u8 *pCell = pPage->apOvfl[0];

	6801 u16 szCell = pPage->xCellSize(pPage, pCell);

	6802 u8 *pStop;

	6803

	6804 assert( sqlite3PagerIswriteable(pNew->pDbPage) );

	6805 assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );

	6806 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);

	6807 rc = rebuildPage(pNew, 1, &pCell, &szCell);

	6808 if( NEVER(rc) ) return rc;

	6809 pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;

	6810

	6811 /* If this is an auto-vacuum database, update the pointer map

	6812 ** with entries for the new page, and any pointer from the

	6813 ** cell on the page to an overflow page. If either of these

	6814 ** operations fails, the return code is set, but the contents

	6815 ** of the parent page are still manipulated by the code below.

	6816 ** That is Ok, at this point the parent page is guaranteed to

	6817 ** be marked as dirty. Returning an error code will cause a

	6818 ** rollback, undoing any changes made to the parent page.

	6819 */

	6820 if( ISAUTOVACUUM ){

	6821 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);

	6822 if( szCell>pNew->minLocal ){

	6823 ptrmapPutOvflPtr(pNew, pCell, &rc);

	6824 }

	6825 }

	6826

	6827 /* Create a divider cell to insert into pParent. The divider cell

	6828 ** consists of a 4-byte page number (the page number of pPage) and

	6829 ** a variable length key value (which must be the same value as the

	6830 ** largest key on pPage).

	6831 **

	6832 ** To find the largest key value on pPage, first find the right-most

	6833 ** cell on pPage. The first two fields of this cell are the

	6834 ** record-length (a variable length integer at most 32-bits in size)

	6835 ** and the key value (a variable length integer, may have any value).

	6836 ** The first of the while(...) loops below skips over the record-length

	6837 ** field. The second while(...) loop copies the key value from the

	6838 ** cell on pPage into the pSpace buffer.

	6839 */

	6840 pCell = findCell(pPage, pPage->nCell-1);

	6841 pStop = &pCell[9];

	6842 while( (*(pCell++)&0x80) && pCell<pStop );

	6843 pStop = &pCell[9];

	6844 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );

	6845

	6846 /* Insert the new divider cell into pParent. */

	6847 if( rc==SQLITE_OK ){

	6848 insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),

	6849 0, pPage->pgno, &rc);

	6850 }

	6851

	6852 /* Set the right-child pointer of pParent to point to the new page. */

	6853 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);

	6854

	6855 /* Release the reference to the new page. */

	6856 releasePage(pNew);

	6857 }

	6858

	6859 return rc;

	6860 }

	6861 #endif /* SQLITE_OMIT_QUICKBALANCE */

	6862

	6863 #if 0

	6864 /*

	6865 ** This function does not contribute anything to the operation of SQLite.

	6866 ** it is sometimes activated temporarily while debugging code responsible

	6867 ** for setting pointer-map entries.

	6868 */

	6869 static int ptrmapCheckPages(MemPage **apPage, int nPage){

	6870 int i, j;

	6871 for(i=0; i<nPage; i++){

	6872 Pgno n;

	6873 u8 e;

	6874 MemPage *pPage = apPage[i];

	6875 BtShared *pBt = pPage->pBt;

	6876 assert( pPage->isInit );

	6877

	6878 for(j=0; j<pPage->nCell; j++){

	6879 CellInfo info;

	6880 u8 *z;

	6881

	6882 z = findCell(pPage, j);

	6883 pPage->xParseCell(pPage, z, &info);

	6884 if( info.nLocal<info.nPayload ){

	6885 Pgno ovfl = get4byte(&z[info.nSize-4]);

	6886 ptrmapGet(pBt, ovfl, &e, &n);

	6887 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );

	6888 }

	6889 if( !pPage->leaf ){

	6890 Pgno child = get4byte(z);

	6891 ptrmapGet(pBt, child, &e, &n);

	6892 assert( n==pPage->pgno && e==PTRMAP_BTREE );

	6893 }

	6894 }

	6895 if( !pPage->leaf ){

	6896 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	6897 ptrmapGet(pBt, child, &e, &n);

	6898 assert( n==pPage->pgno && e==PTRMAP_BTREE );

	6899 }

	6900 }

	6901 return 1;

	6902 }

	6903 #endif

	6904

	6905 /*

	6906 ** This function is used to copy the contents of the b-tree node stored

	6907 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then

	6908 ** the pointer-map entries for each child page are updated so that the

	6909 ** parent page stored in the pointer map is page pTo. If pFrom contained

	6910 ** any cells with overflow page pointers, then the corresponding pointer

	6911 ** map entries are also updated so that the parent page is page pTo.

	6912 **

	6913 ** If pFrom is currently carrying any overflow cells (entries in the

	6914 ** MemPage.apOvfl[] array), they are not copied to pTo.

	6915 **

	6916 ** Before returning, page pTo is reinitialized using btreeInitPage().

	6917 **

	6918 ** The performance of this function is not critical. It is only used by

	6919 ** the balance_shallower() and balance_deeper() procedures, neither of

	6920 ** which are called often under normal circumstances.

	6921 */

	6922 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){

	6923 if( (*pRC)==SQLITE_OK ){

	6924 BtShared * const pBt = pFrom->pBt;

	6925 u8 * const aFrom = pFrom->aData;

	6926 u8 * const aTo = pTo->aData;

	6927 int const iFromHdr = pFrom->hdrOffset;

	6928 int const iToHdr = ((pTo->pgno==1) ? 100 : 0);

	6929 int rc;

	6930 int iData;

	6931

	6932

	6933 assert( pFrom->isInit );

	6934 assert( pFrom->nFree>=iToHdr );

	6935 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );

	6936

	6937 /* Copy the b-tree node content from page pFrom to page pTo. */

	6938 iData = get2byte(&aFrom[iFromHdr+5]);

	6939 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);

	6940 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);

	6941

	6942 /* Reinitialize page pTo so that the contents of the MemPage structure

	6943 ** match the new data. The initialization of pTo can actually fail under

	6944 ** fairly obscure circumstances, even though it is a copy of initialized

	6945 ** page pFrom.

	6946 */

	6947 pTo->isInit = 0;

	6948 rc = btreeInitPage(pTo);

	6949 if( rc!=SQLITE_OK ){

	6950 *pRC = rc;

	6951 return;

	6952 }

	6953

	6954 /* If this is an auto-vacuum database, update the pointer-map entries

	6955 ** for any b-tree or overflow pages that pTo now contains the pointers to.

	6956 */

	6957 if( ISAUTOVACUUM ){

	6958 *pRC = setChildPtrmaps(pTo);

	6959 }

	6960 }

	6961 }

	6962

	6963 /*

	6964 ** This routine redistributes cells on the iParentIdx'th child of pParent

	6965 ** (hereafter "the page") and up to 2 siblings so that all pages have about the

	6966 ** same amount of free space. Usually a single sibling on either side of the

	6967 ** page is used in the balancing, though both siblings might come from one

	6968 ** side if the page is the first or last child of its parent. If the page

	6969 ** has fewer than 2 siblings (something which can only happen if the page

	6970 ** is a root page or a child of a root page) then all available siblings

	6971 ** participate in the balancing.

	6972 **

	6973 ** The number of siblings of the page might be increased or decreased by

	6974 ** one or two in an effort to keep pages nearly full but not over full.

	6975 **

	6976 ** Note that when this routine is called, some of the cells on the page

	6977 ** might not actually be stored in MemPage.aData[]. This can happen

	6978 ** if the page is overfull. This routine ensures that all cells allocated

	6979 ** to the page and its siblings fit into MemPage.aData[] before returning.

	6980 **

	6981 ** In the course of balancing the page and its siblings, cells may be

	6982 ** inserted into or removed from the parent page (pParent). Doing so

	6983 ** may cause the parent page to become overfull or underfull. If this

	6984 ** happens, it is the responsibility of the caller to invoke the correct

	6985 ** balancing routine to fix this problem (see the balance() routine).

	6986 **

	6987 ** If this routine fails for any reason, it might leave the database

	6988 ** in a corrupted state. So if this routine fails, the database should

	6989 ** be rolled back.

	6990 **

	6991 ** The third argument to this function, aOvflSpace, is a pointer to a

	6992 ** buffer big enough to hold one page. If while inserting cells into the parent

	6993 ** page (pParent) the parent page becomes overfull, this buffer is

	6994 ** used to store the parent's overflow cells. Because this function inserts

	6995 ** a maximum of four divider cells into the parent page, and the maximum

	6996 ** size of a cell stored within an internal node is always less than 1/4

	6997 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large

	6998 ** enough for all overflow cells.

	6999 **

	7000 ** If aOvflSpace is set to a null pointer, this function returns

	7001 ** SQLITE_NOMEM.

	7002 */

	7003 static int balance_nonroot(

	7004 MemPage *pParent, /* Parent page of siblings being balanced */

	7005 int iParentIdx, /* Index of "the page" in pParent */

	7006 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */

	7007 int isRoot, /* True if pParent is a root-page */

	7008 int bBulk /* True if this call is part of a bulk load */

	7009 ){

	7010 BtShared *pBt; /* The whole database */

	7011 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */

	7012 int nNew = 0; /* Number of pages in apNew[] */

	7013 int nOld; /* Number of pages in apOld[] */

	7014 int i, j, k; /* Loop counters */

	7015 int nxDiv; /* Next divider slot in pParent->aCell[] */

	7016 int rc = SQLITE_OK; /* The return code */

	7017 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */

	7018 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */

	7019 int usableSpace; /* Bytes in pPage beyond the header */

	7020 int pageFlags; /* Value of pPage->aData[0] */

	7021 int iSpace1 = 0; /* First unused byte of aSpace1[] */

	7022 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */

	7023 int szScratch; /* Size of scratch memory requested */

	7024 MemPage *apOld[NB]; /* pPage and up to two siblings */

	7025 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */

	7026 u8 *pRight; /* Location in parent of right-sibling pointer */

	7027 u8 *apDiv[NB-1]; /* Divider cells in pParent */

	7028 int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */

	7029 int cntOld[NB+2]; /* Old index in b.apCell[] */

	7030 int szNew[NB+2]; /* Combined size of cells placed on i-th page */

	7031 u8 *aSpace1; /* Space for copies of dividers cells */

	7032 Pgno pgno; /* Temp var to store a page number in */

	7033 u8 abDone[NB+2]; /* True after i'th new page is populated */

	7034 Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */

	7035 Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */

	7036 u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */

	7037 CellArray b; /* Parsed information on cells being balanced */

	7038

	7039 memset(abDone, 0, sizeof(abDone));

	7040 b.nCell = 0;

	7041 b.apCell = 0;

	7042 pBt = pParent->pBt;

	7043 assert( sqlite3_mutex_held(pBt->mutex) );

	7044 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

	7045

	7046 #if 0

	7047 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));

	7048 #endif

	7049

	7050 /* At this point pParent may have at most one overflow cell. And if

	7051 ** this overflow cell is present, it must be the cell with

	7052 ** index iParentIdx. This scenario comes about when this function

	7053 ** is called (indirectly) from sqlite3BtreeDelete().

	7054 */

	7055 assert( pParent->nOverflow==0 || pParent->nOverflow==1 );

	7056 assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );

	7057

	7058 if( !aOvflSpace ){

	7059 return SQLITE_NOMEM_BKPT;

	7060 }

	7061

	7062 /* Find the sibling pages to balance. Also locate the cells in pParent

	7063 ** that divide the siblings. An attempt is made to find NN siblings on

	7064 ** either side of pPage. More siblings are taken from one side, however,

	7065 ** if there are fewer than NN siblings on the other side. If pParent

	7066 ** has NB or fewer children then all children of pParent are taken.

	7067 **

	7068 ** This loop also drops the divider cells from the parent page. This

	7069 ** way, the remainder of the function does not have to deal with any

	7070 ** overflow cells in the parent page, since if any existed they will

	7071 ** have already been removed.

	7072 */

	7073 i = pParent->nOverflow + pParent->nCell;

	7074 if( i<2 ){

	7075 nxDiv = 0;

	7076 }else{

	7077 assert( bBulk==0 || bBulk==1 );

	7078 if( iParentIdx==0 ){

	7079 nxDiv = 0;

	7080 }else if( iParentIdx==i ){

	7081 nxDiv = i-2+bBulk;

	7082 }else{

	7083 nxDiv = iParentIdx-1;

	7084 }

	7085 i = 2-bBulk;

	7086 }

	7087 nOld = i+1;

	7088 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){

	7089 pRight = &pParent->aData[pParent->hdrOffset+8];

	7090 }else{

	7091 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);

	7092 }

	7093 pgno = get4byte(pRight);

	7094 while( 1 ){

	7095 rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);

	7096 if( rc ){

	7097 memset(apOld, 0, (i+1)*sizeof(MemPage*));

	7098 goto balance_cleanup;

	7099 }

	7100 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;

	7101 if( (i--)==0 ) break;

	7102

	7103 if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){

	7104 apDiv[i] = pParent->apOvfl[0];

	7105 pgno = get4byte(apDiv[i]);

	7106 szNew[i] = pParent->xCellSize(pParent, apDiv[i]);

	7107 pParent->nOverflow = 0;

	7108 }else{

	7109 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);

	7110 pgno = get4byte(apDiv[i]);

	7111 szNew[i] = pParent->xCellSize(pParent, apDiv[i]);

	7112

	7113 /* Drop the cell from the parent page. apDiv[i] still points to

	7114 ** the cell within the parent, even though it has been dropped.

	7115 ** This is safe because dropping a cell only overwrites the first

	7116 ** four bytes of it, and this function does not need the first

	7117 ** four bytes of the divider cell. So the pointer is safe to use

	7118 ** later on.

	7119 **

	7120 ** But not if we are in secure-delete mode. In secure-delete mode,

	7121 ** the dropCell() routine will overwrite the entire cell with zeroes.

	7122 ** In this case, temporarily copy the cell into the aOvflSpace[]

	7123 ** buffer. It will be copied out again as soon as the aSpace[] buffer

	7124 ** is allocated. */

	7125 if( pBt->btsFlags & BTS_SECURE_DELETE ){

	7126 int iOff;

	7127

	7128 iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);

	7129 if( (iOff+szNew[i])>(int)pBt->usableSize ){

	7130 rc = SQLITE_CORRUPT_BKPT;

	7131 memset(apOld, 0, (i+1)*sizeof(MemPage*));

	7132 goto balance_cleanup;

	7133 }else{

	7134 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);

	7135 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];

	7136 }

	7137 }

	7138 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);

	7139 }

	7140 }

	7141

	7142 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte

	7143 ** alignment */

	7144 nMaxCells = (nMaxCells + 3)&~3;

	7145

	7146 /*

	7147 ** Allocate space for memory structures

	7148 */

	7149 szScratch =

	7150 nMaxCells*sizeof(u8*) /* b.apCell */

	7151 + nMaxCells*sizeof(u16) /* b.szCell */

	7152 + pBt->pageSize; /* aSpace1 */

	7153

	7154 /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer

	7155 ** that is more than 6 times the database page size. */

	7156 assert( szScratch<=6*(int)pBt->pageSize );

	7157 b.apCell = sqlite3ScratchMalloc( szScratch );

	7158 if( b.apCell==0 ){

	7159 rc = SQLITE_NOMEM_BKPT;

	7160 goto balance_cleanup;

	7161 }

	7162 b.szCell = (u16*)&b.apCell[nMaxCells];

	7163 aSpace1 = (u8*)&b.szCell[nMaxCells];

	7164 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );

	7165

	7166 /*

	7167 ** Load pointers to all cells on sibling pages and the divider cells

	7168 ** into the local b.apCell[] array. Make copies of the divider cells

	7169 ** into space obtained from aSpace1[]. The divider cells have already

	7170 ** been removed from pParent.

	7171 **

	7172 ** If the siblings are on leaf pages, then the child pointers of the

	7173 ** divider cells are stripped from the cells before they are copied

	7174 ** into aSpace1[]. In this way, all cells in b.apCell[] are without

	7175 ** child pointers. If siblings are not leaves, then all cells in

	7176 ** b.apCell[] include child pointers. Either way, all cells in b.apCell[]

	7177 ** are alike.

	7178 **

	7179 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.

	7180 ** leafData: 1 if pPage holds key+data and pParent holds only keys.

	7181 */

	7182 b.pRef = apOld[0];

	7183 leafCorrection = b.pRef->leaf*4;

	7184 leafData = b.pRef->intKeyLeaf;

	7185 for(i=0; i<nOld; i++){

	7186 MemPage *pOld = apOld[i];

	7187 int limit = pOld->nCell;

	7188 u8 *aData = pOld->aData;

	7189 u16 maskPage = pOld->maskPage;

	7190 u8 *piCell = aData + pOld->cellOffset;

	7191 u8 *piEnd;

	7192

	7193 /* Verify that all sibling pages are of the same "type" (table-leaf,

	7194 ** table-interior, index-leaf, or index-interior).

	7195 */

	7196 if( pOld->aData[0]!=apOld[0]->aData[0] ){

	7197 rc = SQLITE_CORRUPT_BKPT;

	7198 goto balance_cleanup;

	7199 }

	7200

	7201 /* Load b.apCell[] with pointers to all cells in pOld. If pOld

	7202 ** contains overflow cells, include them in the b.apCell[] array

	7203 ** in the correct spot.

	7204 **

	7205 ** Note that when there are multiple overflow cells, it is always the

	7206 ** case that they are sequential and adjacent. This invariant arises

	7207 ** because multiple overflows can only occur when inserting divider

	7208 ** cells into a parent on a prior balance, and divider cells are always

	7209 ** adjacent and are inserted in order. There is an assert() tagged

	7210 ** with "NOTE 1" in the overflow cell insertion loop to prove this

	7211 ** invariant.

	7212 **

	7213 ** This must be done in advance. Once the balance starts, the cell

	7214 ** offset section of the btree page will be overwritten and we will no

	7215 ** longer be able to find the cells if a pointer to each cell is not saved

	7216 ** first.

	7217 */

	7218 memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));

	7219 if( pOld->nOverflow>0 ){

	7220 limit = pOld->aiOvfl[0];

	7221 for(j=0; j<limit; j++){

	7222 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));

	7223 piCell += 2;

	7224 b.nCell++;

	7225 }

	7226 for(k=0; k<pOld->nOverflow; k++){

	7227 assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */

	7228 b.apCell[b.nCell] = pOld->apOvfl[k];

	7229 b.nCell++;

	7230 }

	7231 }

	7232 piEnd = aData + pOld->cellOffset + 2*pOld->nCell;

	7233 while( piCell<piEnd ){

	7234 assert( b.nCell<nMaxCells );

	7235 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));

	7236 piCell += 2;

	7237 b.nCell++;

	7238 }

	7239

	7240 cntOld[i] = b.nCell;

	7241 if( i<nOld-1 && !leafData){

	7242 u16 sz = (u16)szNew[i];

	7243 u8 *pTemp;

	7244 assert( b.nCell<nMaxCells );

	7245 b.szCell[b.nCell] = sz;

	7246 pTemp = &aSpace1[iSpace1];

	7247 iSpace1 += sz;

	7248 assert( sz<=pBt->maxLocal+23 );

	7249 assert( iSpace1 <= (int)pBt->pageSize );

	7250 memcpy(pTemp, apDiv[i], sz);

	7251 b.apCell[b.nCell] = pTemp+leafCorrection;

	7252 assert( leafCorrection==0 || leafCorrection==4 );

	7253 b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;

	7254 if( !pOld->leaf ){

	7255 assert( leafCorrection==0 );

	7256 assert( pOld->hdrOffset==0 );

	7257 /* The right pointer of the child page pOld becomes the left

	7258 ** pointer of the divider cell */

	7259 memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);

	7260 }else{

	7261 assert( leafCorrection==4 );

	7262 while( b.szCell[b.nCell]<4 ){

	7263 /* Do not allow any cells smaller than 4 bytes. If a smaller cell

	7264 ** does exist, pad it with 0x00 bytes. */

	7265 assert( b.szCell[b.nCell]==3 || CORRUPT_DB );

	7266 assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );

	7267 aSpace1[iSpace1++] = 0x00;

	7268 b.szCell[b.nCell]++;

	7269 }

	7270 }

	7271 b.nCell++;

	7272 }

	7273 }

	7274

	7275 /*

	7276 ** Figure out the number of pages needed to hold all b.nCell cells.

	7277 ** Store this number in "k". Also compute szNew[] which is the total

	7278 ** size of all cells on the i-th page and cntNew[] which is the index

	7279 ** in b.apCell[] of the cell that divides page i from page i+1.

	7280 ** cntNew[k] should equal b.nCell.

	7281 **

	7282 ** Values computed by this block:

	7283 **

	7284 ** k: The total number of sibling pages

	7285 ** szNew[i]: Space used on the i-th sibling page.

	7286 ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to

	7287 ** the right of the i-th sibling page.

	7288 ** usableSpace: Number of bytes of space available on each sibling.

	7289 **

	7290 */

	7291 usableSpace = pBt->usableSize - 12 + leafCorrection;

	7292 for(i=0; i<nOld; i++){

	7293 MemPage *p = apOld[i];

	7294 szNew[i] = usableSpace - p->nFree;

	7295 for(j=0; j<p->nOverflow; j++){

	7296 szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);

	7297 }

	7298 cntNew[i] = cntOld[i];

	7299 }

	7300 k = nOld;

	7301 for(i=0; i<k; i++){

	7302 int sz;

	7303 while( szNew[i]>usableSpace ){

	7304 if( i+1>=k ){

	7305 k = i+2;

	7306 if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }

	7307 szNew[k-1] = 0;

	7308 cntNew[k-1] = b.nCell;

	7309 }

	7310 sz = 2 + cachedCellSize(&b, cntNew[i]-1);

	7311 szNew[i] -= sz;

	7312 if( !leafData ){

	7313 if( cntNew[i]<b.nCell ){

	7314 sz = 2 + cachedCellSize(&b, cntNew[i]);

	7315 }else{

	7316 sz = 0;

	7317 }

	7318 }

	7319 szNew[i+1] += sz;

	7320 cntNew[i]--;

	7321 }

	7322 while( cntNew[i]<b.nCell ){

	7323 sz = 2 + cachedCellSize(&b, cntNew[i]);

	7324 if( szNew[i]+sz>usableSpace ) break;

	7325 szNew[i] += sz;

	7326 cntNew[i]++;

	7327 if( !leafData ){

	7328 if( cntNew[i]<b.nCell ){

	7329 sz = 2 + cachedCellSize(&b, cntNew[i]);

	7330 }else{

	7331 sz = 0;

	7332 }

	7333 }

	7334 szNew[i+1] -= sz;

	7335 }

	7336 if( cntNew[i]>=b.nCell ){

	7337 k = i+1;

	7338 }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){

	7339 rc = SQLITE_CORRUPT_BKPT;

	7340 goto balance_cleanup;

	7341 }

	7342 }

	7343

	7344 /*

	7345 ** The packing computed by the previous block is biased toward the siblings

	7346 ** on the left side (siblings with smaller keys). The left siblings are

	7347 ** always nearly full, while the right-most sibling might be nearly empty.

	7348 ** The next block of code attempts to adjust the packing of siblings to

	7349 ** get a better balance.

	7350 **

	7351 ** This adjustment is more than an optimization. The packing above might

	7352 ** be so out of balance as to be illegal. For example, the right-most

	7353 ** sibling might be completely empty. This adjustment is not optional.

	7354 */

	7355 for(i=k-1; i>0; i--){

	7356 int szRight = szNew[i]; /* Size of sibling on the right */

	7357 int szLeft = szNew[i-1]; /* Size of sibling on the left */

	7358 int r; /* Index of right-most cell in left sibling */

	7359 int d; /* Index of first cell to the left of right sibling */

	7360

	7361 r = cntNew[i-1] - 1;

	7362 d = r + 1 - leafData;

	7363 (void)cachedCellSize(&b, d);

	7364 do{

	7365 assert( d<nMaxCells );

	7366 assert( r<nMaxCells );

	7367 (void)cachedCellSize(&b, r);

	7368 if( szRight!=0

	7369 && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){

	7370 break;

	7371 }

	7372 szRight += b.szCell[d] + 2;

	7373 szLeft -= b.szCell[r] + 2;

	7374 cntNew[i-1] = r;

	7375 r--;

	7376 d--;

	7377 }while( r>=0 );

	7378 szNew[i] = szRight;

	7379 szNew[i-1] = szLeft;

	7380 if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){

	7381 rc = SQLITE_CORRUPT_BKPT;

	7382 goto balance_cleanup;

	7383 }

	7384 }

	7385

	7386 /* Sanity check: For a non-corrupt database file one of the following

	7387 ** must be true:

	7388 ** (1) We found one or more cells (cntNew[0]>0), or

	7389 ** (2) pPage is a virtual root page. A virtual root page is when

	7390 ** the real root page is page 1 and we are the only child of

	7391 ** that page.

	7392 */

	7393 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);

	7394 TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",

	7395 apOld[0]->pgno, apOld[0]->nCell,

	7396 nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,

	7397 nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0

	7398 ));

	7399

	7400 /*

	7401 ** Allocate k new pages. Reuse old pages where possible.

	7402 */

	7403 pageFlags = apOld[0]->aData[0];

	7404 for(i=0; i<k; i++){

	7405 MemPage *pNew;

	7406 if( i<nOld ){

	7407 pNew = apNew[i] = apOld[i];

	7408 apOld[i] = 0;

	7409 rc = sqlite3PagerWrite(pNew->pDbPage);

	7410 nNew++;

	7411 if( rc ) goto balance_cleanup;

	7412 }else{

	7413 assert( i>0 );

	7414 rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);

	7415 if( rc ) goto balance_cleanup;

	7416 zeroPage(pNew, pageFlags);

	7417 apNew[i] = pNew;

	7418 nNew++;

	7419 cntOld[i] = b.nCell;

	7420

	7421 /* Set the pointer-map entry for the new sibling page. */

	7422 if( ISAUTOVACUUM ){

	7423 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);

	7424 if( rc!=SQLITE_OK ){

	7425 goto balance_cleanup;

	7426 }

	7427 }

	7428 }

	7429 }

	7430

	7431 /*

	7432 ** Reassign page numbers so that the new pages are in ascending order.

	7433 ** This helps to keep entries in the disk file in order so that a scan

	7434 ** of the table is closer to a linear scan through the file. That in turn

	7435 ** helps the operating system to deliver pages from the disk more rapidly.

	7436 **

	7437 ** An O(n^2) insertion sort algorithm is used, but since n is never more

	7438 ** than (NB+2) (a small constant), that should not be a problem.

	7439 **

	7440 ** When NB==3, this one optimization makes the database about 25% faster

	7441 ** for large insertions and deletions.

	7442 */

	7443 for(i=0; i<nNew; i++){

	7444 aPgOrder[i] = aPgno[i] = apNew[i]->pgno;

	7445 aPgFlags[i] = apNew[i]->pDbPage->flags;

	7446 for(j=0; j<i; j++){

	7447 if( aPgno[j]==aPgno[i] ){

	7448 /* This branch is taken if the set of sibling pages somehow contains

	7449 ** duplicate entries. This can happen if the database is corrupt.

	7450 ** It would be simpler to detect this as part of the loop below, but

	7451 ** we do the detection here in order to avoid populating the pager

	7452 ** cache with two separate objects associated with the same

	7453 ** page number. */

	7454 assert( CORRUPT_DB );

	7455 rc = SQLITE_CORRUPT_BKPT;

	7456 goto balance_cleanup;

	7457 }

	7458 }

	7459 }

	7460 for(i=0; i<nNew; i++){

	7461 int iBest = 0; /* aPgno[] index of page number to use */

	7462 for(j=1; j<nNew; j++){

	7463 if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;

	7464 }

	7465 pgno = aPgOrder[iBest];

	7466 aPgOrder[iBest] = 0xffffffff;

	7467 if( iBest!=i ){

	7468 if( iBest>i ){

	7469 sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);

	7470 }

	7471 sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);

	7472 apNew[i]->pgno = pgno;

	7473 }

	7474 }

	7475

	7476 TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "

	7477 "%d(%d nc=%d) %d(%d nc=%d)\n",

	7478 apNew[0]->pgno, szNew[0], cntNew[0],

	7479 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,

	7480 nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,

	7481 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,

	7482 nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,

	7483 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,

	7484 nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,

	7485 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,

	7486 nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0

	7487 ));

	7488

	7489 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

	7490 put4byte(pRight, apNew[nNew-1]->pgno);

	7491

	7492 /* If the sibling pages are not leaves, ensure that the right-child pointer

	7493 ** of the right-most new sibling page is set to the value that was

	7494 ** originally in the same field of the right-most old sibling page. */

	7495 if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){

	7496 MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];

	7497 memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);

	7498 }

	7499

	7500 /* Make any required updates to pointer map entries associated with

	7501 ** cells stored on sibling pages following the balance operation. Pointer

	7502 ** map entries associated with divider cells are set by the insertCell()

	7503 ** routine. The associated pointer map entries are:

	7504 **

	7505 ** a) if the cell contains a reference to an overflow chain, the

	7506 ** entry associated with the first page in the overflow chain, and

	7507 **

	7508 ** b) if the sibling pages are not leaves, the child page associated

	7509 ** with the cell.

	7510 **

	7511 ** If the sibling pages are not leaves, then the pointer map entry

	7512 ** associated with the right-child of each sibling may also need to be

	7513 ** updated. This happens below, after the sibling pages have been

	7514 ** populated, not here.

	7515 */

	7516 if( ISAUTOVACUUM ){

	7517 MemPage *pNew = apNew[0];

	7518 u8 *aOld = pNew->aData;

	7519 int cntOldNext = pNew->nCell + pNew->nOverflow;

	7520 int usableSize = pBt->usableSize;

	7521 int iNew = 0;

	7522 int iOld = 0;

	7523

	7524 for(i=0; i<b.nCell; i++){

	7525 u8 *pCell = b.apCell[i];

	7526 if( i==cntOldNext ){

	7527 MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];

	7528 cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;

	7529 aOld = pOld->aData;

	7530 }

	7531 if( i==cntNew[iNew] ){

	7532 pNew = apNew[++iNew];

	7533 if( !leafData ) continue;

	7534 }

	7535

	7536 /* Cell pCell is destined for new sibling page pNew. Originally, it

	7537 ** was either part of sibling page iOld (possibly an overflow cell),

	7538 ** or else the divider cell to the left of sibling page iOld. So,

	7539 ** if sibling page iOld had the same page number as pNew, and if

	7540 ** pCell really was a part of sibling page iOld (not a divider or

	7541 ** overflow cell), we can skip updating the pointer map entries. */

	7542 if( iOld>=nNew

	7543 \|\| pNew->pgno!=aPgno[iOld]

	7544 \|\| !SQLITE_WITHIN(pCell,aOld,&aOld[usableSize])

	7545 ){

	7546 if( !leafCorrection ){

	7547 ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);

	7548 }

	7549 if( cachedCellSize(&b,i)>pNew->minLocal ){

	7550 ptrmapPutOvflPtr(pNew, pCell, &rc);

	7551 }

	7552 if( rc ) goto balance_cleanup;

	7553 }

	7554 }

	7555 }

	7556

	7557 /* Insert new divider cells into pParent. */

	7558 for(i=0; i<nNew-1; i++){

	7559 u8 *pCell;

	7560 u8 *pTemp;

	7561 int sz;

	7562 MemPage *pNew = apNew[i];

	7563 j = cntNew[i];

	7564

	7565 assert( j<nMaxCells );

	7566 assert( b.apCell[j]!=0 );

	7567 pCell = b.apCell[j];

	7568 sz = b.szCell[j] + leafCorrection;

	7569 pTemp = &aOvflSpace[iOvflSpace];

	7570 if( !pNew->leaf ){

	7571 memcpy(&pNew->aData[8], pCell, 4);

	7572 }else if( leafData ){

	7573 /* If the tree is a leaf-data tree, and the siblings are leaves,

	7574 ** then there is no divider cell in b.apCell[]. Instead, the divider

	7575 ** cell consists of the integer key for the right-most cell of

	7576 ** the sibling-page assembled above only.

	7577 */

	7578 CellInfo info;

	7579 j--;

	7580 pNew->xParseCell(pNew, b.apCell[j], &info);

	7581 pCell = pTemp;

	7582 sz = 4 + putVarint(&pCell[4], info.nKey);

	7583 pTemp = 0;

	7584 }else{

	7585 pCell -= 4;

	7586 /* Obscure case for non-leaf-data trees: If the cell at pCell was

	7587 ** previously stored on a leaf node, and its reported size was 4

	7588 ** bytes, then it may actually be smaller than this

	7589 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of

	7590 ** any cell). But it is important to pass the correct size to

	7591 ** insertCell(), so reparse the cell now.

	7592 **

	7593 ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"

	7594 ** and WITHOUT ROWID tables with exactly one column which is the

	7595 ** primary key.

	7596 */

	7597 if( b.szCell[j]==4 ){

	7598 assert(leafCorrection==4);

	7599 sz = pParent->xCellSize(pParent, pCell);

	7600 }

	7601 }

	7602 iOvflSpace += sz;

	7603 assert( sz<=pBt->maxLocal+23 );

	7604 assert( iOvflSpace <= (int)pBt->pageSize );

	7605 insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);

	7606 if( rc!=SQLITE_OK ) goto balance_cleanup;

	7607 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

	7608 }

	7609

	7610 /* Now update the actual sibling pages. The order in which they are updated

	7611 ** is important, as this code needs to avoid disrupting any page from which

	7612 ** cells may still to be read. In practice, this means:

	7613 **

	7614 ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])

	7615 ** then it is not safe to update page apNew[iPg] until after

	7616 ** the left-hand sibling apNew[iPg-1] has been updated.

	7617 **

	7618 ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])

	7619 ** then it is not safe to update page apNew[iPg] until after

	7620 ** the right-hand sibling apNew[iPg+1] has been updated.

	7621 **

	7622 ** If neither of the above apply, the page is safe to update.

	7623 **

	7624 ** The iPg value in the following loop starts at nNew-1 goes down

	7625 ** to 0, then back up to nNew-1 again, thus making two passes over

	7626 ** the pages. On the initial downward pass, only condition (1) above

	7627 ** needs to be tested because (2) will always be true from the previous

	7628 ** step. On the upward pass, both conditions are always true, so the

	7629 ** upwards pass simply processes pages that were missed on the downward

	7630 ** pass.

	7631 */

	7632 for(i=1-nNew; i<nNew; i++){

	7633 int iPg = i<0 ? -i : i;

	7634 assert( iPg>=0 && iPg<nNew );

	7635 if( abDone[iPg] ) continue; /* Skip pages already processed */

	7636 if( i>=0 /* On the upwards pass, or... */

	7637 \|\| cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */

	7638 ){

	7639 int iNew;

	7640 int iOld;

	7641 int nNewCell;

	7642

	7643 /* Verify condition (1): If cells are moving left, update iPg

	7644 ** only after iPg-1 has already been updated. */

	7645 assert( iPg==0 \|\| cntOld[iPg-1]>=cntNew[iPg-1] \|\| abDone[iPg-1] );

	7646

	7647 /* Verify condition (2): If cells are moving right, update iPg

	7648 ** only after iPg+1 has already been updated. */

	7649 assert( cntNew[iPg]>=cntOld[iPg] \|\| abDone[iPg+1] );

	7650

	7651 if( iPg==0 ){

	7652 iNew = iOld = 0;

	7653 nNewCell = cntNew[0];

	7654 }else{

	7655 iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;

	7656 iNew = cntNew[iPg-1] + !leafData;

	7657 nNewCell = cntNew[iPg] - iNew;

	7658 }

	7659

	7660 rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);

	7661 if( rc ) goto balance_cleanup;

	7662 abDone[iPg]++;

	7663 apNew[iPg]->nFree = usableSpace-szNew[iPg];

	7664 assert( apNew[iPg]->nOverflow==0 );

	7665 assert( apNew[iPg]->nCell==nNewCell );

	7666 }

	7667 }

	7668

	7669 /* All pages have been processed exactly once */

	7670 assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );

	7671

	7672 assert( nOld>0 );

	7673 assert( nNew>0 );

	7674

	7675 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){

	7676 /* The root page of the b-tree now contains no cells. The only sibling

	7677 ** page is the right-child of the parent. Copy the contents of the

	7678 ** child page into the parent, decreasing the overall height of the

	7679 ** b-tree structure by one. This is described as the "balance-shallower"

	7680 ** sub-algorithm in some documentation.

	7681 **

	7682 ** If this is an auto-vacuum database, the call to copyNodeContent()

	7683 ** sets all pointer-map entries corresponding to database image pages

	7684 ** for which the pointer is stored within the content being copied.

	7685 **

	7686 ** It is critical that the child page be defragmented before being

	7687 ** copied into the parent, because if the parent is page 1 then it will

	7688 ** by smaller than the child due to the database header, and so all the

	7689 ** free space needs to be up front.

	7690 */

	7691 assert( nNew==1 \|\| CORRUPT_DB );

	7692 rc = defragmentPage(apNew[0]);

	7693 testcase( rc!=SQLITE_OK );

	7694 assert( apNew[0]->nFree ==

	7695 (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)

	7696 \|\| rc!=SQLITE_OK

	7697 );

	7698 copyNodeContent(apNew[0], pParent, &rc);

	7699 freePage(apNew[0], &rc);

	7700 }else if( ISAUTOVACUUM && !leafCorrection ){

	7701 /* Fix the pointer map entries associated with the right-child of each

	7702 ** sibling page. All other pointer map entries have already been taken

	7703 ** care of. */

	7704 for(i=0; i<nNew; i++){

	7705 u32 key = get4byte(&apNew[i]->aData[8]);

	7706 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);

	7707 }

	7708 }

	7709

	7710 assert( pParent->isInit );

	7711 TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",

	7712 nOld, nNew, b.nCell));

	7713

	7714 /* Free any old pages that were not reused as new pages.

	7715 */

	7716 for(i=nNew; i<nOld; i++){

	7717 freePage(apOld[i], &rc);

	7718 }

	7719

	7720 #if 0

	7721 if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){

	7722 /* The ptrmapCheckPages() contains assert() statements that verify that

	7723 ** all pointer map pages are set correctly. This is helpful while

	7724 ** debugging. This is usually disabled because a corrupt database may

	7725 ** cause an assert() statement to fail. */

	7726 ptrmapCheckPages(apNew, nNew);

	7727 ptrmapCheckPages(&pParent, 1);

	7728 }

	7729 #endif

	7730

	7731 /*

	7732 ** Cleanup before returning.

	7733 */

	7734 balance_cleanup:

	7735 sqlite3ScratchFree(b.apCell);

	7736 for(i=0; i<nOld; i++){

	7737 releasePage(apOld[i]);

	7738 }

	7739 for(i=0; i<nNew; i++){

	7740 releasePage(apNew[i]);

	7741 }

	7742

	7743 return rc;

	7744 }

	7745

	7746

	7747 /*

	7748 ** This function is called when the root page of a b-tree structure is

	7749 ** overfull (has one or more overflow pages).

	7750 **

	7751 ** A new child page is allocated and the contents of the current root

	7752 ** page, including overflow cells, are copied into the child. The root

	7753 ** page is then overwritten to make it an empty page with the right-child

	7754 ** pointer pointing to the new page.

	7755 **

	7756 ** Before returning, all pointer-map entries corresponding to pages

	7757 ** that the new child-page now contains pointers to are updated. The

	7758 ** entry corresponding to the new right-child pointer of the root

	7759 ** page is also updated.

	7760 **

	7761 ** If successful, *ppChild is set to contain a reference to the child

	7762 ** page and SQLITE_OK is returned. In this case the caller is required

	7763 ** to call releasePage() on *ppChild exactly once. If an error occurs,

	7764 ** an error code is returned and *ppChild is set to 0.

	7765 */

	7766 static int balance_deeper(MemPage pRoot, MemPage *ppChild){

	7767 int rc; /* Return value from subprocedures */

	7768 MemPage pChild = 0; / Pointer to a new child page */

	7769 Pgno pgnoChild = 0; /* Page number of the new child page */

	7770 BtShared pBt = pRoot->pBt; / The BTree */

	7771

	7772 assert( pRoot->nOverflow>0 );

	7773 assert( sqlite3_mutex_held(pBt->mutex) );

	7774

	7775 /* Make pRoot, the root page of the b-tree, writable. Allocate a new

	7776 ** page that will become the new right-child of pPage. Copy the contents

	7777 ** of the node stored on pRoot into the new child page.

	7778 */

	7779 rc = sqlite3PagerWrite(pRoot->pDbPage);

	7780 if( rc==SQLITE_OK ){

	7781 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);

	7782 copyNodeContent(pRoot, pChild, &rc);

	7783 if( ISAUTOVACUUM ){

	7784 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);

	7785 }

	7786 }

	7787 if( rc ){

	7788 *ppChild = 0;

	7789 releasePage(pChild);

	7790 return rc;

	7791 }

	7792 assert( sqlite3PagerIswriteable(pChild->pDbPage) );

	7793 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );

	7794 assert( pChild->nCell==pRoot->nCell );

	7795

	7796 TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));

	7797

	7798 /* Copy the overflow cells from pRoot to pChild */

	7799 memcpy(pChild->aiOvfl, pRoot->aiOvfl,

	7800 pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));

	7801 memcpy(pChild->apOvfl, pRoot->apOvfl,

	7802 pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));

	7803 pChild->nOverflow = pRoot->nOverflow;

	7804

	7805 /* Zero the contents of pRoot. Then install pChild as the right-child. */

	7806 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);

	7807 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);

	7808

	7809 *ppChild = pChild;

	7810 return SQLITE_OK;

	7811 }

	7812

	7813 /*

	7814 ** The page that pCur currently points to has just been modified in

	7815 ** some way. This function figures out if this modification means the

	7816 ** tree needs to be balanced, and if so calls the appropriate balancing

	7817 ** routine. Balancing routines are:

	7818 **

	7819 ** balance_quick()

	7820 ** balance_deeper()

	7821 ** balance_nonroot()

	7822 */

	7823 static int balance(BtCursor *pCur){

	7824 int rc = SQLITE_OK;

	7825 const int nMin = pCur->pBt->usableSize * 2 / 3;

	7826 u8 aBalanceQuickSpace[13];

	7827 u8 *pFree = 0;

	7828

	7829 VVA_ONLY( int balance_quick_called = 0 );

	7830 VVA_ONLY( int balance_deeper_called = 0 );

	7831

	7832 do {

	7833 int iPage = pCur->iPage;

	7834 MemPage *pPage = pCur->apPage[iPage];

	7835

	7836 if( iPage==0 ){

	7837 if( pPage->nOverflow ){

	7838 /* The root page of the b-tree is overfull. In this case call the

	7839 ** balance_deeper() function to create a new child for the root-page

	7840 ** and copy the current contents of the root-page to it. The

	7841 ** next iteration of the do-loop will balance the child page.

	7842 */

	7843 assert( balance_deeper_called==0 );

	7844 VVA_ONLY( balance_deeper_called++ );

	7845 rc = balance_deeper(pPage, &pCur->apPage[1]);

	7846 if( rc==SQLITE_OK ){

	7847 pCur->iPage = 1;

	7848 pCur->aiIdx[0] = 0;

	7849 pCur->aiIdx[1] = 0;

	7850 assert( pCur->apPage[1]->nOverflow );

	7851 }

	7852 }else{

	7853 break;

	7854 }

	7855 }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){

	7856 break;

	7857 }else{

	7858 MemPage * const pParent = pCur->apPage[iPage-1];

	7859 int const iIdx = pCur->aiIdx[iPage-1];

	7860

	7861 rc = sqlite3PagerWrite(pParent->pDbPage);

	7862 if( rc==SQLITE_OK ){

	7863 #ifndef SQLITE_OMIT_QUICKBALANCE

	7864 if( pPage->intKeyLeaf

	7865 && pPage->nOverflow==1

	7866 && pPage->aiOvfl[0]==pPage->nCell

	7867 && pParent->pgno!=1

	7868 && pParent->nCell==iIdx

	7869 ){

	7870 /* Call balance_quick() to create a new sibling of pPage on which

	7871 ** to store the overflow cell. balance_quick() inserts a new cell

	7872 ** into pParent, which may cause pParent overflow. If this

	7873 ** happens, the next iteration of the do-loop will balance pParent

	7874 ** use either balance_nonroot() or balance_deeper(). Until this

	7875 ** happens, the overflow cell is stored in the aBalanceQuickSpace[]

	7876 ** buffer.

	7877 **

	7878 ** The purpose of the following assert() is to check that only a

	7879 ** single call to balance_quick() is made for each call to this

	7880 ** function. If this were not verified, a subtle bug involving reuse

	7881 ** of the aBalanceQuickSpace[] might sneak in.

	7882 */

	7883 assert( balance_quick_called==0 );

	7884 VVA_ONLY( balance_quick_called++ );

	7885 rc = balance_quick(pParent, pPage, aBalanceQuickSpace);

	7886 }else

	7887 #endif

	7888 {

	7889 /* In this case, call balance_nonroot() to redistribute cells

	7890 ** between pPage and up to 2 of its sibling pages. This involves

	7891 ** modifying the contents of pParent, which may cause pParent to

	7892 ** become overfull or underfull. The next iteration of the do-loop

	7893 ** will balance the parent page to correct this.

	7894 **

	7895 ** If the parent page becomes overfull, the overflow cell or cells

	7896 ** are stored in the pSpace buffer allocated immediately below.

	7897 ** A subsequent iteration of the do-loop will deal with this by

	7898 ** calling balance_nonroot() (balance_deeper() may be called first,

	7899 ** but it doesn't deal with overflow cells - just moves them to a

	7900 ** different page). Once this subsequent call to balance_nonroot()

	7901 ** has completed, it is safe to release the pSpace buffer used by

	7902 ** the previous call, as the overflow cell data will have been

	7903 ** copied either into the body of a database page or into the new

	7904 ** pSpace buffer passed to the latter call to balance_nonroot().

	7905 */

	7906 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);

	7907 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,

	7908 pCur->hints&BTREE_BULKLOAD);

	7909 if( pFree ){

	7910 /* If pFree is not NULL, it points to the pSpace buffer used

	7911 ** by a previous call to balance_nonroot(). Its contents are

	7912 ** now stored either on real database pages or within the

	7913 ** new pSpace buffer, so it may be safely freed here. */

	7914 sqlite3PageFree(pFree);

	7915 }

	7916

	7917 /* The pSpace buffer will be freed after the next call to

	7918 ** balance_nonroot(), or just before this function returns, whichever

	7919 ** comes first. */

	7920 pFree = pSpace;

	7921 }

	7922 }

	7923

	7924 pPage->nOverflow = 0;

	7925

	7926 /* The next iteration of the do-loop balances the parent page. */

	7927 releasePage(pPage);

	7928 pCur->iPage--;

	7929 assert( pCur->iPage>=0 );

	7930 }

	7931 }while( rc==SQLITE_OK );

	7932

	7933 if( pFree ){

	7934 sqlite3PageFree(pFree);

	7935 }

	7936 return rc;

	7937 }

	7938

	7939

	7940 /*

	7941 ** Insert a new record into the BTree. The content of the new record

	7942 ** is described by the pX object. The pCur cursor is used only to

	7943 ** define what table the record should be inserted into, and is left

	7944 ** pointing at a random location.

	7945 **

	7946 ** For a table btree (used for rowid tables), only the pX.nKey value of

	7947 ** the key is used. The pX.pKey value must be NULL. The pX.nKey is the

	7948 ** rowid or INTEGER PRIMARY KEY of the row. The pX.nData,pData,nZero fields

	7949 ** hold the content of the row.

	7950 **

	7951 ** For an index btree (used for indexes and WITHOUT ROWID tables), the

	7952 ** key is an arbitrary byte sequence stored in pX.pKey,nKey. The

	7953 ** pX.pData,nData,nZero fields must be zero.

	7954 **

	7955 ** If the seekResult parameter is non-zero, then a successful call to

	7956 ** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already

	7957 ** been performed. In other words, if seekResult!=0 then the cursor

	7958 ** is currently pointing to a cell that will be adjacent to the cell

	7959 ** to be inserted. If seekResult<0 then pCur points to a cell that is

	7960 ** smaller then (pKey,nKey). If seekResult>0 then pCur points to a cell

	7961 ** that is larger than (pKey,nKey).

	7962 **

	7963 ** If seekResult==0, that means pCur is pointing at some unknown location.

	7964 ** In that case, this routine must seek the cursor to the correct insertion

	7965 ** point for (pKey,nKey) before doing the insertion. For index btrees,

	7966 ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked

	7967 ** key values and pX->aMem can be used instead of pX->pKey to avoid having

	7968 ** to decode the key.

	7969 */

	7970 int sqlite3BtreeInsert(

	7971 BtCursor pCur, / Insert data into the table of this cursor */

	7972 const BtreePayload pX, / Content of the row to be inserted */

	7973 int flags, /* True if this is likely an append */

	7974 int seekResult /* Result of prior MovetoUnpacked() call */

	7975 ){

	7976 int rc;

	7977 int loc = seekResult; /* -1: before desired location +1: after */

	7978 int szNew = 0;

	7979 int idx;

	7980 MemPage *pPage;

	7981 Btree *p = pCur->pBtree;

	7982 BtShared *pBt = p->pBt;

	7983 unsigned char *oldCell;

	7984 unsigned char *newCell = 0;

	7985

	7986 assert( (flags & (BTREE_SAVEPOSITION\|BTREE_APPEND))==flags );

	7987

	7988 if( pCur->eState==CURSOR_FAULT ){

	7989 assert( pCur->skipNext!=SQLITE_OK );

	7990 return pCur->skipNext;

	7991 }

	7992

	7993 assert( cursorOwnsBtShared(pCur) );

	7994 assert( (pCur->curFlags & BTCF_WriteFlag)!=0

	7995 && pBt->inTransaction==TRANS_WRITE

	7996 && (pBt->btsFlags & BTS_READ_ONLY)==0 );

	7997 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );

	7998

	7999 /* Assert that the caller has been consistent. If this cursor was opened

	8000 ** expecting an index b-tree, then the caller should be inserting blob

	8001 ** keys with no associated data. If the cursor was opened expecting an

	8002 ** intkey table, the caller should be inserting integer keys with a

	8003 ** blob of associated data. */

	8004 assert( (pX->pKey==0)==(pCur->pKeyInfo==0) );

	8005

	8006 /* Save the positions of any other cursors open on this table.

	8007 **

	8008 ** In some cases, the call to btreeMoveto() below is a no-op. For

	8009 ** example, when inserting data into a table with auto-generated integer

	8010 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the

	8011 ** integer key to use. It then calls this function to actually insert the

	8012 ** data into the intkey B-Tree. In this case btreeMoveto() recognizes

	8013 ** that the cursor is already where it needs to be and returns without

	8014 ** doing any work. To avoid thwarting these optimizations, it is important

	8015 ** not to clear the cursor here.

	8016 */

	8017 if( pCur->curFlags & BTCF_Multiple ){

	8018 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);

	8019 if( rc ) return rc;

	8020 }

	8021

	8022 if( pCur->pKeyInfo==0 ){

	8023 assert( pX->pKey==0 );

	8024 /* If this is an insert into a table b-tree, invalidate any incrblob

	8025 ** cursors open on the row being replaced */

	8026 invalidateIncrblobCursors(p, pX->nKey, 0);

	8027

	8028 /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing

	8029 ** to a row with the same key as the new entry being inserted. */

	8030 assert( (flags & BTREE_SAVEPOSITION)==0 \|\|

	8031 ((pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey) );

	8032

	8033 /* If the cursor is currently on the last row and we are appending a

	8034 ** new row onto the end, set the "loc" to avoid an unnecessary

	8035 ** btreeMoveto() call */

	8036 if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){

	8037 loc = 0;

	8038 }else if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey>0

	8039 && pCur->info.nKey==pX->nKey-1 ){

	8040 loc = -1;

	8041 }else if( loc==0 ){

	8042 rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, flags!=0, &loc);

	8043 if( rc ) return rc;

	8044 }

	8045 }else if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){

	8046 if( pX->nMem ){

	8047 UnpackedRecord r;

	8048 r.pKeyInfo = pCur->pKeyInfo;

	8049 r.aMem = pX->aMem;

	8050 r.nField = pX->nMem;

	8051 r.default_rc = 0;

	8052 r.errCode = 0;

	8053 r.r1 = 0;

	8054 r.r2 = 0;

	8055 r.eqSeen = 0;

	8056 rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, flags!=0, &loc);

	8057 }else{

	8058 rc = btreeMoveto(pCur, pX->pKey, pX->nKey, flags!=0, &loc);

	8059 }

	8060 if( rc ) return rc;

	8061 }

	8062 assert( pCur->eState==CURSOR_VALID \|\| (pCur->eState==CURSOR_INVALID && loc) );

	8063

	8064 pPage = pCur->apPage[pCur->iPage];

	8065 assert( pPage->intKey \|\| pX->nKey>=0 );

	8066 assert( pPage->leaf \|\| !pPage->intKey );

	8067

	8068 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",

	8069 pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,

	8070 loc==0 ? "overwrite" : "new entry"));

	8071 assert( pPage->isInit );

	8072 newCell = pBt->pTmpSpace;

	8073 assert( newCell!=0 );

	8074 rc = fillInCell(pPage, newCell, pX, &szNew);

	8075 if( rc ) goto end_insert;

	8076 assert( szNew==pPage->xCellSize(pPage, newCell) );

	8077 assert( szNew <= MX_CELL_SIZE(pBt) );

	8078 idx = pCur->aiIdx[pCur->iPage];

	8079 if( loc==0 ){

	8080 CellInfo info;

	8081 assert( idx<pPage->nCell );

	8082 rc = sqlite3PagerWrite(pPage->pDbPage);

	8083 if( rc ){

	8084 goto end_insert;

	8085 }

	8086 oldCell = findCell(pPage, idx);

	8087 if( !pPage->leaf ){

	8088 memcpy(newCell, oldCell, 4);

	8089 }

	8090 rc = clearCell(pPage, oldCell, &info);

	8091 if( info.nSize==szNew && info.nLocal==info.nPayload ){

	8092 /* Overwrite the old cell with the new if they are the same size.

	8093 ** We could also try to do this if the old cell is smaller, then add

	8094 ** the leftover space to the free list. But experiments show that

	8095 ** doing that is no faster then skipping this optimization and just

	8096 ** calling dropCell() and insertCell(). */

	8097 assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */

	8098 if( oldCell+szNew > pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;

	8099 memcpy(oldCell, newCell, szNew);

	8100 return SQLITE_OK;

	8101 }

	8102 dropCell(pPage, idx, info.nSize, &rc);

	8103 if( rc ) goto end_insert;

	8104 }else if( loc<0 && pPage->nCell>0 ){

	8105 assert( pPage->leaf );

	8106 idx = ++pCur->aiIdx[pCur->iPage];

	8107 }else{

	8108 assert( pPage->leaf );

	8109 }

	8110 insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);

	8111 assert( pPage->nOverflow==0 \|\| rc==SQLITE_OK );

	8112 assert( rc!=SQLITE_OK \|\| pPage->nCell>0 \|\| pPage->nOverflow>0 );

	8113

	8114 /* If no error has occurred and pPage has an overflow cell, call balance()

	8115 ** to redistribute the cells within the tree. Since balance() may move

	8116 ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey

	8117 ** variables.

	8118 **

	8119 ** Previous versions of SQLite called moveToRoot() to move the cursor

	8120 ** back to the root page as balance() used to invalidate the contents

	8121 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,

	8122 ** set the cursor state to "invalid". This makes common insert operations

	8123 ** slightly faster.

	8124 **

	8125 ** There is a subtle but important optimization here too. When inserting

	8126 ** multiple records into an intkey b-tree using a single cursor (as can

	8127 ** happen while processing an "INSERT INTO ... SELECT" statement), it

	8128 ** is advantageous to leave the cursor pointing to the last entry in

	8129 ** the b-tree if possible. If the cursor is left pointing to the last

	8130 ** entry in the table, and the next row inserted has an integer key

	8131 ** larger than the largest existing key, it is possible to insert the

	8132 ** row without seeking the cursor. This can be a big performance boost.

	8133 */

	8134 pCur->info.nSize = 0;

	8135 if( pPage->nOverflow ){

	8136 assert( rc==SQLITE_OK );

	8137 pCur->curFlags &= ~(BTCF_ValidNKey);

	8138 rc = balance(pCur);

	8139

	8140 /* Must make sure nOverflow is reset to zero even if the balance()

	8141 ** fails. Internal data structure corruption will result otherwise.

	8142 ** Also, set the cursor state to invalid. This stops saveCursorPosition()

	8143 ** from trying to save the current position of the cursor. */

	8144 pCur->apPage[pCur->iPage]->nOverflow = 0;

	8145 pCur->eState = CURSOR_INVALID;

	8146 if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){

	8147 rc = moveToRoot(pCur);

	8148 if( pCur->pKeyInfo ){

	8149 assert( pCur->pKey==0 );

	8150 pCur->pKey = sqlite3Malloc( pX->nKey );

	8151 if( pCur->pKey==0 ){

	8152 rc = SQLITE_NOMEM;

	8153 }else{

	8154 memcpy(pCur->pKey, pX->pKey, pX->nKey);

	8155 }

	8156 }

	8157 pCur->eState = CURSOR_REQUIRESEEK;

	8158 pCur->nKey = pX->nKey;

	8159 }

	8160 }

	8161 assert( pCur->apPage[pCur->iPage]->nOverflow==0 );

	8162

	8163 end_insert:

	8164 return rc;

	8165 }

	8166

	8167 /*

	8168 ** Delete the entry that the cursor is pointing to.

	8169 **

	8170 ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then

	8171 ** the cursor is left pointing at an arbitrary location after the delete.

	8172 ** But if that bit is set, then the cursor is left in a state such that

	8173 ** the next call to BtreeNext() or BtreePrev() moves it to the same row

	8174 ** as it would have been on if the call to BtreeDelete() had been omitted.

	8175 **

	8176 ** The BTREE_AUXDELETE bit of flags indicates that this is one of several deletes

	8177 ** associated with a single table entry and its indexes. Only one of those

	8178 ** deletes is considered the "primary" delete. The primary delete occurs

	8179 ** on a cursor that is not a BTREE_FORDELETE cursor. All but one delete

	8180 ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag.

	8181 ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation,

	8182 ** but which might be used by alternative storage engines.

	8183 */

	8184 int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){

	8185 Btree *p = pCur->pBtree;

	8186 BtShared *pBt = p->pBt;

	8187 int rc; /* Return code */

	8188 MemPage pPage; / Page to delete cell from */

	8189 unsigned char pCell; / Pointer to cell to delete */

	8190 int iCellIdx; /* Index of cell to delete */

	8191 int iCellDepth; /* Depth of node containing pCell */

	8192 CellInfo info; /* Size of the cell being deleted */

	8193 int bSkipnext = 0; /* Leaf cursor in SKIPNEXT state */

	8194 u8 bPreserve = flags & BTREE_SAVEPOSITION; /* Keep cursor valid */

	8195

	8196 assert( cursorOwnsBtShared(pCur) );

	8197 assert( pBt->inTransaction==TRANS_WRITE );

	8198 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

	8199 assert( pCur->curFlags & BTCF_WriteFlag );

	8200 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );

	8201 assert( !hasReadConflicts(p, pCur->pgnoRoot) );

	8202 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	8203 assert( pCur->eState==CURSOR_VALID );

	8204 assert( (flags & ~(BTREE_SAVEPOSITION \| BTREE_AUXDELETE))==0 );

	8205

	8206 iCellDepth = pCur->iPage;

	8207 iCellIdx = pCur->aiIdx[iCellDepth];

	8208 pPage = pCur->apPage[iCellDepth];

	8209 pCell = findCell(pPage, iCellIdx);

	8210

	8211 /* If the bPreserve flag is set to true, then the cursor position must

	8212 ** be preserved following this delete operation. If the current delete

	8213 ** will cause a b-tree rebalance, then this is done by saving the cursor

	8214 ** key and leaving the cursor in CURSOR_REQUIRESEEK state before

	8215 ** returning.

	8216 **

	8217 ** Or, if the current delete will not cause a rebalance, then the cursor

	8218 ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately

	8219 ** before or after the deleted entry. In this case set bSkipnext to true. */

	8220 if( bPreserve ){

	8221 if( !pPage->leaf

	8222 \|\| (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)

	8223 ){

	8224 /* A b-tree rebalance will be required after deleting this entry.

	8225 ** Save the cursor key. */

	8226 rc = saveCursorKey(pCur);

	8227 if( rc ) return rc;

	8228 }else{

	8229 bSkipnext = 1;

	8230 }

	8231 }

	8232

	8233 /* If the page containing the entry to delete is not a leaf page, move

	8234 ** the cursor to the largest entry in the tree that is smaller than

	8235 ** the entry being deleted. This cell will replace the cell being deleted

	8236 ** from the internal node. The 'previous' entry is used for this instead

	8237 ** of the 'next' entry, as the previous entry is always a part of the

	8238 ** sub-tree headed by the child page of the cell being deleted. This makes

	8239 ** balancing the tree following the delete operation easier. */

	8240 if( !pPage->leaf ){

	8241 int notUsed = 0;

	8242 rc = sqlite3BtreePrevious(pCur, &notUsed);

	8243 if( rc ) return rc;

	8244 }

	8245

	8246 /* Save the positions of any other cursors open on this table before

	8247 ** making any modifications. */

	8248 if( pCur->curFlags & BTCF_Multiple ){

	8249 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);

	8250 if( rc ) return rc;

	8251 }

	8252

	8253 /* If this is a delete operation to remove a row from a table b-tree,

	8254 ** invalidate any incrblob cursors open on the row being deleted. */

	8255 if( pCur->pKeyInfo==0 ){

	8256 invalidateIncrblobCursors(p, pCur->info.nKey, 0);

	8257 }

	8258

	8259 /* Make the page containing the entry to be deleted writable. Then free any

	8260 ** overflow pages associated with the entry and finally remove the cell

	8261 ** itself from within the page. */

	8262 rc = sqlite3PagerWrite(pPage->pDbPage);

	8263 if( rc ) return rc;

	8264 rc = clearCell(pPage, pCell, &info);

	8265 dropCell(pPage, iCellIdx, info.nSize, &rc);

	8266 if( rc ) return rc;

	8267

	8268 /* If the cell deleted was not located on a leaf page, then the cursor

	8269 ** is currently pointing to the largest entry in the sub-tree headed

	8270 ** by the child-page of the cell that was just deleted from an internal

	8271 ** node. The cell from the leaf node needs to be moved to the internal

	8272 ** node to replace the deleted cell. */

	8273 if( !pPage->leaf ){

	8274 MemPage *pLeaf = pCur->apPage[pCur->iPage];

	8275 int nCell;

	8276 Pgno n = pCur->apPage[iCellDepth+1]->pgno;

	8277 unsigned char *pTmp;

	8278

	8279 pCell = findCell(pLeaf, pLeaf->nCell-1);

	8280 if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;

	8281 nCell = pLeaf->xCellSize(pLeaf, pCell);

	8282 assert( MX_CELL_SIZE(pBt) >= nCell );

	8283 pTmp = pBt->pTmpSpace;

	8284 assert( pTmp!=0 );

	8285 rc = sqlite3PagerWrite(pLeaf->pDbPage);

	8286 if( rc==SQLITE_OK ){

	8287 insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);

	8288 }

	8289 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);

	8290 if( rc ) return rc;

	8291 }

	8292

	8293 /* Balance the tree. If the entry deleted was located on a leaf page,

	8294 ** then the cursor still points to that page. In this case the first

	8295 ** call to balance() repairs the tree, and the if(...) condition is

	8296 ** never true.

	8297 **

	8298 ** Otherwise, if the entry deleted was on an internal node page, then

	8299 ** pCur is pointing to the leaf page from which a cell was removed to

	8300 ** replace the cell deleted from the internal node. This is slightly

	8301 ** tricky as the leaf node may be underfull, and the internal node may

	8302 ** be either under or overfull. In this case run the balancing algorithm

	8303 ** on the leaf node first. If the balance proceeds far enough up the

	8304 ** tree that we can be sure that any problem in the internal node has

	8305 ** been corrected, so be it. Otherwise, after balancing the leaf node,

	8306 ** walk the cursor up the tree to the internal node and balance it as

	8307 ** well. */

	8308 rc = balance(pCur);

	8309 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){

	8310 while( pCur->iPage>iCellDepth ){

	8311 releasePage(pCur->apPage[pCur->iPage--]);

	8312 }

	8313 rc = balance(pCur);

	8314 }

	8315

	8316 if( rc==SQLITE_OK ){

	8317 if( bSkipnext ){

	8318 assert( bPreserve && (pCur->iPage==iCellDepth \|\| CORRUPT_DB) );

	8319 assert( pPage==pCur->apPage[pCur->iPage] \|\| CORRUPT_DB );

	8320 assert( (pPage->nCell>0 \|\| CORRUPT_DB) && iCellIdx<=pPage->nCell );

	8321 pCur->eState = CURSOR_SKIPNEXT;

	8322 if( iCellIdx>=pPage->nCell ){

	8323 pCur->skipNext = -1;

	8324 pCur->aiIdx[iCellDepth] = pPage->nCell-1;

	8325 }else{

	8326 pCur->skipNext = 1;

	8327 }

	8328 }else{

	8329 rc = moveToRoot(pCur);

	8330 if( bPreserve ){

	8331 pCur->eState = CURSOR_REQUIRESEEK;

	8332 }

	8333 }

	8334 }

	8335 return rc;

	8336 }

	8337

	8338 /*

	8339 ** Create a new BTree table. Write into *piTable the page

	8340 ** number for the root page of the new table.

	8341 **

	8342 ** The type of type is determined by the flags parameter. Only the

	8343 ** following values of flags are currently in use. Other values for

	8344 ** flags might not work:

	8345 **

	8346 ** BTREE_INTKEY\|BTREE_LEAFDATA Used for SQL tables with rowid keys

	8347 ** BTREE_ZERODATA Used for SQL indices

	8348 */

	8349 static int btreeCreateTable(Btree p, int piTable, int createTabFlags){

	8350 BtShared *pBt = p->pBt;

	8351 MemPage *pRoot;

	8352 Pgno pgnoRoot;

	8353 int rc;

	8354 int ptfFlags; /* Page-type flage for the root page of new table */

	8355

	8356 assert( sqlite3BtreeHoldsMutex(p) );

	8357 assert( pBt->inTransaction==TRANS_WRITE );

	8358 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

	8359

	8360 #ifdef SQLITE_OMIT_AUTOVACUUM

	8361 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);

	8362 if( rc ){

	8363 return rc;

	8364 }

	8365 #else

	8366 if( pBt->autoVacuum ){

	8367 Pgno pgnoMove; /* Move a page here to make room for the root-page */

	8368 MemPage pPageMove; / The page to move to. */

	8369

	8370 /* Creating a new table may probably require moving an existing database

	8371 ** to make room for the new tables root page. In case this page turns

	8372 ** out to be an overflow page, delete all overflow page-map caches

	8373 ** held by open cursors.

	8374 */

	8375 invalidateAllOverflowCache(pBt);

	8376

	8377 /* Read the value of meta[3] from the database to determine where the

	8378 ** root page of the new table should go. meta[3] is the largest root-page

	8379 ** created so far, so the new root-page is (meta[3]+1).

	8380 */

	8381 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);

	8382 pgnoRoot++;

	8383

	8384 /* The new root-page may not be allocated on a pointer-map page, or the

	8385 ** PENDING_BYTE page.

	8386 */

	8387 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) \|\|

	8388 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){

	8389 pgnoRoot++;

	8390 }

	8391 assert( pgnoRoot>=3 \|\| CORRUPT_DB );

	8392 testcase( pgnoRoot<3 );

	8393

	8394 /* Allocate a page. The page that currently resides at pgnoRoot will

	8395 ** be moved to the allocated page (unless the allocated page happens

	8396 ** to reside at pgnoRoot).

	8397 */

	8398 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);

	8399 if( rc!=SQLITE_OK ){

	8400 return rc;

	8401 }

	8402

	8403 if( pgnoMove!=pgnoRoot ){

	8404 /* pgnoRoot is the page that will be used for the root-page of

	8405 ** the new table (assuming an error did not occur). But we were

	8406 ** allocated pgnoMove. If required (i.e. if it was not allocated

	8407 ** by extending the file), the current page at position pgnoMove

	8408 ** is already journaled.

	8409 */

	8410 u8 eType = 0;

	8411 Pgno iPtrPage = 0;

	8412

	8413 /* Save the positions of any open cursors. This is required in

	8414 ** case they are holding a reference to an xFetch reference

	8415 ** corresponding to page pgnoRoot. */

	8416 rc = saveAllCursors(pBt, 0, 0);

	8417 releasePage(pPageMove);

	8418 if( rc!=SQLITE_OK ){

	8419 return rc;

	8420 }

	8421

	8422 /* Move the page currently at pgnoRoot to pgnoMove. */

	8423 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);

	8424 if( rc!=SQLITE_OK ){

	8425 return rc;

	8426 }

	8427 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);

	8428 if( eType==PTRMAP_ROOTPAGE \|\| eType==PTRMAP_FREEPAGE ){

	8429 rc = SQLITE_CORRUPT_BKPT;

	8430 }

	8431 if( rc!=SQLITE_OK ){

	8432 releasePage(pRoot);

	8433 return rc;

	8434 }

	8435 assert( eType!=PTRMAP_ROOTPAGE );

	8436 assert( eType!=PTRMAP_FREEPAGE );

	8437 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);

	8438 releasePage(pRoot);

	8439

	8440 /* Obtain the page at pgnoRoot */

	8441 if( rc!=SQLITE_OK ){

	8442 return rc;

	8443 }

	8444 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);

	8445 if( rc!=SQLITE_OK ){

	8446 return rc;

	8447 }

	8448 rc = sqlite3PagerWrite(pRoot->pDbPage);

	8449 if( rc!=SQLITE_OK ){

	8450 releasePage(pRoot);

	8451 return rc;

	8452 }

	8453 }else{

	8454 pRoot = pPageMove;

	8455 }

	8456

	8457 /* Update the pointer-map and meta-data with the new root-page number. */

	8458 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);

	8459 if( rc ){

	8460 releasePage(pRoot);

	8461 return rc;

	8462 }

	8463

	8464 /* When the new root page was allocated, page 1 was made writable in

	8465 ** order either to increase the database filesize, or to decrement the

	8466 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail.

	8467 */

	8468 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );

	8469 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);

	8470 if( NEVER(rc) ){

	8471 releasePage(pRoot);

	8472 return rc;

	8473 }

	8474

	8475 }else{

	8476 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);

	8477 if( rc ) return rc;

	8478 }

	8479 #endif

	8480 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );

	8481 if( createTabFlags & BTREE_INTKEY ){

	8482 ptfFlags = PTF_INTKEY \| PTF_LEAFDATA \| PTF_LEAF;

	8483 }else{

	8484 ptfFlags = PTF_ZERODATA \| PTF_LEAF;

	8485 }

	8486 zeroPage(pRoot, ptfFlags);

	8487 sqlite3PagerUnref(pRoot->pDbPage);

	8488 assert( (pBt->openFlags & BTREE_SINGLE)==0 \|\| pgnoRoot==2 );

	8489 *piTable = (int)pgnoRoot;

	8490 return SQLITE_OK;

	8491 }

	8492 int sqlite3BtreeCreateTable(Btree p, int piTable, int flags){

	8493 int rc;

	8494 sqlite3BtreeEnter(p);

	8495 rc = btreeCreateTable(p, piTable, flags);

	8496 sqlite3BtreeLeave(p);

	8497 return rc;

	8498 }

	8499

	8500 /*

	8501 ** Erase the given database page and all its children. Return

	8502 ** the page to the freelist.

	8503 */

	8504 static int clearDatabasePage(

	8505 BtShared pBt, / The BTree that contains the table */

	8506 Pgno pgno, /* Page number to clear */

	8507 int freePageFlag, /* Deallocate page if true */

	8508 int pnChange / Add number of Cells freed to this counter */

	8509 ){

	8510 MemPage *pPage;

	8511 int rc;

	8512 unsigned char *pCell;

	8513 int i;

	8514 int hdr;

	8515 CellInfo info;

	8516

	8517 assert( sqlite3_mutex_held(pBt->mutex) );

	8518 if( pgno>btreePagecount(pBt) ){

	8519 return SQLITE_CORRUPT_BKPT;

	8520 }

	8521 rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);

	8522 if( rc ) return rc;

	8523 if( pPage->bBusy ){

	8524 rc = SQLITE_CORRUPT_BKPT;

	8525 goto cleardatabasepage_out;

	8526 }

	8527 pPage->bBusy = 1;

	8528 hdr = pPage->hdrOffset;

	8529 for(i=0; i<pPage->nCell; i++){

	8530 pCell = findCell(pPage, i);

	8531 if( !pPage->leaf ){

	8532 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);

	8533 if( rc ) goto cleardatabasepage_out;

	8534 }

	8535 rc = clearCell(pPage, pCell, &info);

	8536 if( rc ) goto cleardatabasepage_out;

	8537 }

	8538 if( !pPage->leaf ){

	8539 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);

	8540 if( rc ) goto cleardatabasepage_out;

	8541 }else if( pnChange ){

	8542 assert( pPage->intKey \|\| CORRUPT_DB );

	8543 testcase( !pPage->intKey );

	8544 *pnChange += pPage->nCell;

	8545 }

	8546 if( freePageFlag ){

	8547 freePage(pPage, &rc);

	8548 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){

	8549 zeroPage(pPage, pPage->aData[hdr] \| PTF_LEAF);

	8550 }

	8551

	8552 cleardatabasepage_out:

	8553 pPage->bBusy = 0;

	8554 releasePage(pPage);

	8555 return rc;

	8556 }

	8557

	8558 /*

	8559 ** Delete all information from a single table in the database. iTable is

	8560 ** the page number of the root of the table. After this routine returns,

	8561 ** the root page is empty, but still exists.

	8562 **

	8563 ** This routine will fail with SQLITE_LOCKED if there are any open

	8564 ** read cursors on the table. Open write cursors are moved to the

	8565 ** root of the table.

	8566 **

	8567 ** If pnChange is not NULL, then table iTable must be an intkey table. The

	8568 ** integer value pointed to by pnChange is incremented by the number of

	8569 ** entries in the table.

	8570 */

	8571 int sqlite3BtreeClearTable(Btree p, int iTable, int pnChange){

	8572 int rc;

	8573 BtShared *pBt = p->pBt;

	8574 sqlite3BtreeEnter(p);

	8575 assert( p->inTrans==TRANS_WRITE );

	8576

	8577 rc = saveAllCursors(pBt, (Pgno)iTable, 0);

	8578

	8579 if( SQLITE_OK==rc ){

	8580 /* Invalidate all incrblob cursors open on table iTable (assuming iTable

	8581 ** is the root of a table b-tree - if it is not, the following call is

	8582 ** a no-op). */

	8583 invalidateIncrblobCursors(p, 0, 1);

	8584 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);

	8585 }

	8586 sqlite3BtreeLeave(p);

	8587 return rc;

	8588 }

	8589

	8590 /*

	8591 ** Delete all information from the single table that pCur is open on.

	8592 **

	8593 ** This routine only work for pCur on an ephemeral table.

	8594 */

	8595 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){

	8596 return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);

	8597 }

	8598

	8599 /*

	8600 ** Erase all information in a table and add the root of the table to

	8601 ** the freelist. Except, the root of the principle table (the one on

	8602 ** page 1) is never added to the freelist.

	8603 **

	8604 ** This routine will fail with SQLITE_LOCKED if there are any open

	8605 ** cursors on the table.

	8606 **

	8607 ** If AUTOVACUUM is enabled and the page at iTable is not the last

	8608 ** root page in the database file, then the last root page

	8609 ** in the database file is moved into the slot formerly occupied by

	8610 ** iTable and that last slot formerly occupied by the last root page

	8611 ** is added to the freelist instead of iTable. In this say, all

	8612 ** root pages are kept at the beginning of the database file, which

	8613 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the

	8614 ** page number that used to be the last root page in the file before

	8615 ** the move. If no page gets moved, *piMoved is set to 0.

	8616 ** The last root page is recorded in meta[3] and the value of

	8617 ** meta[3] is updated by this procedure.

	8618 */

	8619 static int btreeDropTable(Btree p, Pgno iTable, int piMoved){

	8620 int rc;

	8621 MemPage *pPage = 0;

	8622 BtShared *pBt = p->pBt;

	8623

	8624 assert( sqlite3BtreeHoldsMutex(p) );

	8625 assert( p->inTrans==TRANS_WRITE );

	8626 assert( iTable>=2 );

	8627

	8628 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);

	8629 if( rc ) return rc;

	8630 rc = sqlite3BtreeClearTable(p, iTable, 0);

	8631 if( rc ){

	8632 releasePage(pPage);

	8633 return rc;

	8634 }

	8635

	8636 *piMoved = 0;

	8637

	8638 #ifdef SQLITE_OMIT_AUTOVACUUM

	8639 freePage(pPage, &rc);

	8640 releasePage(pPage);

	8641 #else

	8642 if( pBt->autoVacuum ){

	8643 Pgno maxRootPgno;

	8644 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);

	8645

	8646 if( iTable==maxRootPgno ){

	8647 /* If the table being dropped is the table with the largest root-page

	8648 ** number in the database, put the root page on the free list.

	8649 */

	8650 freePage(pPage, &rc);

	8651 releasePage(pPage);

	8652 if( rc!=SQLITE_OK ){

	8653 return rc;

	8654 }

	8655 }else{

	8656 /* The table being dropped does not have the largest root-page

	8657 ** number in the database. So move the page that does into the

	8658 ** gap left by the deleted root-page.

	8659 */

	8660 MemPage *pMove;

	8661 releasePage(pPage);

	8662 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);

	8663 if( rc!=SQLITE_OK ){

	8664 return rc;

	8665 }

	8666 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);

	8667 releasePage(pMove);

	8668 if( rc!=SQLITE_OK ){

	8669 return rc;

	8670 }

	8671 pMove = 0;

	8672 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);

	8673 freePage(pMove, &rc);

	8674 releasePage(pMove);

	8675 if( rc!=SQLITE_OK ){

	8676 return rc;

	8677 }

	8678 *piMoved = maxRootPgno;

	8679 }

	8680

	8681 /* Set the new 'max-root-page' value in the database header. This

	8682 ** is the old value less one, less one more if that happens to

	8683 ** be a root-page number, less one again if that is the

	8684 ** PENDING_BYTE_PAGE.

	8685 */

	8686 maxRootPgno--;

	8687 while( maxRootPgno==PENDING_BYTE_PAGE(pBt)

	8688 \|\| PTRMAP_ISPAGE(pBt, maxRootPgno) ){

	8689 maxRootPgno--;

	8690 }

	8691 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );

	8692

	8693 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);

	8694 }else{

	8695 freePage(pPage, &rc);

	8696 releasePage(pPage);

	8697 }

	8698 #endif

	8699 return rc;

	8700 }

	8701 int sqlite3BtreeDropTable(Btree p, int iTable, int piMoved){

	8702 int rc;

	8703 sqlite3BtreeEnter(p);

	8704 rc = btreeDropTable(p, iTable, piMoved);

	8705 sqlite3BtreeLeave(p);

	8706 return rc;

	8707 }

	8708

	8709

	8710 /*

	8711 ** This function may only be called if the b-tree connection already

	8712 ** has a read or write transaction open on the database.

	8713 **

	8714 ** Read the meta-information out of a database file. Meta[0]

	8715 ** is the number of free pages currently in the database. Meta[1]

	8716 ** through meta[15] are available for use by higher layers. Meta[0]

	8717 ** is read-only, the others are read/write.

	8718 **

	8719 ** The schema layer numbers meta values differently. At the schema

	8720 ** layer (and the SetCookie and ReadCookie opcodes) the number of

	8721 ** free pages is not visible. So Cookie[0] is the same as Meta[1].

	8722 **

	8723 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead

	8724 ** of reading the value out of the header, it instead loads the "DataVersion"

	8725 ** from the pager. The BTREE_DATA_VERSION value is not actually stored in the

	8726 ** database file. It is a number computed by the pager. But its access

	8727 ** pattern is the same as header meta values, and so it is convenient to

	8728 ** read it from this routine.

	8729 */

	8730 void sqlite3BtreeGetMeta(Btree p, int idx, u32 pMeta){

	8731 BtShared *pBt = p->pBt;

	8732

	8733 sqlite3BtreeEnter(p);

	8734 assert( p->inTrans>TRANS_NONE );

	8735 assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );

	8736 assert( pBt->pPage1 );

	8737 assert( idx>=0 && idx<=15 );

	8738

	8739 if( idx==BTREE_DATA_VERSION ){

	8740 *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;

	8741 }else{

	8742 pMeta = get4byte(&pBt->pPage1->aData[36 + idx4]);

	8743 }

	8744

	8745 /* If auto-vacuum is disabled in this build and this is an auto-vacuum

	8746 ** database, mark the database as read-only. */

	8747 #ifdef SQLITE_OMIT_AUTOVACUUM

	8748 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){

	8749 pBt->btsFlags \|= BTS_READ_ONLY;

	8750 }

	8751 #endif

	8752

	8753 sqlite3BtreeLeave(p);

	8754 }

	8755

	8756 /*

	8757 ** Write meta-information back into the database. Meta[0] is

	8758 ** read-only and may not be written.

	8759 */

	8760 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){

	8761 BtShared *pBt = p->pBt;

	8762 unsigned char *pP1;

	8763 int rc;

	8764 assert( idx>=1 && idx<=15 );

	8765 sqlite3BtreeEnter(p);

	8766 assert( p->inTrans==TRANS_WRITE );

	8767 assert( pBt->pPage1!=0 );

	8768 pP1 = pBt->pPage1->aData;

	8769 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	8770 if( rc==SQLITE_OK ){

	8771 put4byte(&pP1[36 + idx*4], iMeta);

	8772 #ifndef SQLITE_OMIT_AUTOVACUUM

	8773 if( idx==BTREE_INCR_VACUUM ){

	8774 assert( pBt->autoVacuum \|\| iMeta==0 );

	8775 assert( iMeta==0 \|\| iMeta==1 );

	8776 pBt->incrVacuum = (u8)iMeta;

	8777 }

	8778 #endif

	8779 }

	8780 sqlite3BtreeLeave(p);

	8781 return rc;

	8782 }

	8783

	8784 #ifndef SQLITE_OMIT_BTREECOUNT

	8785 /*

	8786 ** The first argument, pCur, is a cursor opened on some b-tree. Count the

	8787 ** number of entries in the b-tree and write the result to *pnEntry.

	8788 **

	8789 ** SQLITE_OK is returned if the operation is successfully executed.

	8790 ** Otherwise, if an error is encountered (i.e. an IO error or database

	8791 ** corruption) an SQLite error code is returned.

	8792 */

	8793 int sqlite3BtreeCount(BtCursor pCur, i64 pnEntry){

	8794 i64 nEntry = 0; /* Value to return in pnEntry /

	8795 int rc; /* Return code */

	8796

	8797 if( pCur->pgnoRoot==0 ){

	8798 *pnEntry = 0;

	8799 return SQLITE_OK;

	8800 }

	8801 rc = moveToRoot(pCur);

	8802

	8803 /* Unless an error occurs, the following loop runs one iteration for each

	8804 ** page in the B-Tree structure (not including overflow pages).

	8805 */

	8806 while( rc==SQLITE_OK ){

	8807 int iIdx; /* Index of child node in parent */

	8808 MemPage pPage; / Current page of the b-tree */

	8809

	8810 /* If this is a leaf page or the tree is not an int-key tree, then

	8811 ** this page contains countable entries. Increment the entry counter

	8812 ** accordingly.

	8813 */

	8814 pPage = pCur->apPage[pCur->iPage];

	8815 if( pPage->leaf \|\| !pPage->intKey ){

	8816 nEntry += pPage->nCell;

	8817 }

	8818

	8819 /* pPage is a leaf node. This loop navigates the cursor so that it

	8820 ** points to the first interior cell that it points to the parent of

	8821 ** the next page in the tree that has not yet been visited. The

	8822 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell

	8823 ** of the page, or to the number of cells in the page if the next page

	8824 ** to visit is the right-child of its parent.

	8825 **

	8826 ** If all pages in the tree have been visited, return SQLITE_OK to the

	8827 ** caller.

	8828 */

	8829 if( pPage->leaf ){

	8830 do {

	8831 if( pCur->iPage==0 ){

	8832 /* All pages of the b-tree have been visited. Return successfully. */

	8833 *pnEntry = nEntry;

	8834 return moveToRoot(pCur);

	8835 }

	8836 moveToParent(pCur);

	8837 }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );

	8838

	8839 pCur->aiIdx[pCur->iPage]++;

	8840 pPage = pCur->apPage[pCur->iPage];

	8841 }

	8842

	8843 /* Descend to the child node of the cell that the cursor currently

	8844 ** points at. This is the right-child if (iIdx==pPage->nCell).

	8845 */

	8846 iIdx = pCur->aiIdx[pCur->iPage];

	8847 if( iIdx==pPage->nCell ){

	8848 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));

	8849 }else{

	8850 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));

	8851 }

	8852 }

	8853

	8854 /* An error has occurred. Return an error code. */

	8855 return rc;

	8856 }

	8857 #endif

	8858

	8859 /*

	8860 ** Return the pager associated with a BTree. This routine is used for

	8861 ** testing and debugging only.

	8862 */

	8863 Pager sqlite3BtreePager(Btree p){

	8864 return p->pBt->pPager;

	8865 }

	8866

	8867 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

	8868 /*

	8869 ** Append a message to the error message string.

	8870 */

	8871 static void checkAppendMsg(

	8872 IntegrityCk *pCheck,

	8873 const char *zFormat,

	8874 ...

	8875 ){

	8876 va_list ap;

	8877 if( !pCheck->mxErr ) return;

	8878 pCheck->mxErr--;

	8879 pCheck->nErr++;

	8880 va_start(ap, zFormat);

	8881 if( pCheck->errMsg.nChar ){

	8882 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);

	8883 }

	8884 if( pCheck->zPfx ){

	8885 sqlite3XPrintf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);

	8886 }

	8887 sqlite3VXPrintf(&pCheck->errMsg, zFormat, ap);

	8888 va_end(ap);

	8889 if( pCheck->errMsg.accError==STRACCUM_NOMEM ){

	8890 pCheck->mallocFailed = 1;

	8891 }

	8892 }

	8893 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

	8894

	8895 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

	8896

	8897 /*

	8898 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that

	8899 ** corresponds to page iPg is already set.

	8900 */

	8901 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){

	8902 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );

	8903 return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));

	8904 }

	8905

	8906 /*

	8907 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.

	8908 */

	8909 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){

	8910 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );

	8911 pCheck->aPgRef[iPg/8] \|= (1 << (iPg & 0x07));

	8912 }

	8913

	8914

	8915 /*

	8916 ** Add 1 to the reference count for page iPage. If this is the second

	8917 ** reference to the page, add an error message to pCheck->zErrMsg.

	8918 ** Return 1 if there are 2 or more references to the page and 0 if

	8919 ** if this is the first reference to the page.

	8920 **

	8921 ** Also check that the page number is in bounds.

	8922 */

	8923 static int checkRef(IntegrityCk *pCheck, Pgno iPage){

	8924 if( iPage==0 ) return 1;

	8925 if( iPage>pCheck->nPage ){

	8926 checkAppendMsg(pCheck, "invalid page number %d", iPage);

	8927 return 1;

	8928 }

	8929 if( getPageReferenced(pCheck, iPage) ){

	8930 checkAppendMsg(pCheck, "2nd reference to page %d", iPage);

	8931 return 1;

	8932 }

	8933 setPageReferenced(pCheck, iPage);

	8934 return 0;

	8935 }

	8936

	8937 #ifndef SQLITE_OMIT_AUTOVACUUM

	8938 /*

	8939 ** Check that the entry in the pointer-map for page iChild maps to

	8940 ** page iParent, pointer type ptrType. If not, append an error message

	8941 ** to pCheck.

	8942 */

	8943 static void checkPtrmap(

	8944 IntegrityCk pCheck, / Integrity check context */

	8945 Pgno iChild, /* Child page number */

	8946 u8 eType, /* Expected pointer map type */

	8947 Pgno iParent /* Expected pointer map parent page number */

	8948 ){

	8949 int rc;

	8950 u8 ePtrmapType;

	8951 Pgno iPtrmapParent;

	8952

	8953 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);

	8954 if( rc!=SQLITE_OK ){

	8955 if( rc==SQLITE_NOMEM \|\| rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;

	8956 checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);

	8957 return;

	8958 }

	8959

	8960 if( ePtrmapType!=eType \|\| iPtrmapParent!=iParent ){

	8961 checkAppendMsg(pCheck,

	8962 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",

	8963 iChild, eType, iParent, ePtrmapType, iPtrmapParent);

	8964 }

	8965 }

	8966 #endif

	8967

	8968 /*

	8969 ** Check the integrity of the freelist or of an overflow page list.

	8970 ** Verify that the number of pages on the list is N.

	8971 */

	8972 static void checkList(

	8973 IntegrityCk pCheck, / Integrity checking context */

	8974 int isFreeList, /* True for a freelist. False for overflow page list */

	8975 int iPage, /* Page number for first page in the list */

	8976 int N /* Expected number of pages in the list */

	8977 ){

	8978 int i;

	8979 int expected = N;

	8980 int iFirst = iPage;

	8981 while( N-- > 0 && pCheck->mxErr ){

	8982 DbPage *pOvflPage;

	8983 unsigned char *pOvflData;

	8984 if( iPage<1 ){

	8985 checkAppendMsg(pCheck,

	8986 "%d of %d pages missing from overflow list starting at %d",

	8987 N+1, expected, iFirst);

	8988 break;

	8989 }

	8990 if( checkRef(pCheck, iPage) ) break;

	8991 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){

	8992 checkAppendMsg(pCheck, "failed to get page %d", iPage);

	8993 break;

	8994 }

	8995 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);

	8996 if( isFreeList ){

	8997 int n = get4byte(&pOvflData[4]);

	8998 #ifndef SQLITE_OMIT_AUTOVACUUM

	8999 if( pCheck->pBt->autoVacuum ){

	9000 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);

	9001 }

	9002 #endif

	9003 if( n>(int)pCheck->pBt->usableSize/4-2 ){

	9004 checkAppendMsg(pCheck,

	9005 "freelist leaf count too big on page %d", iPage);

	9006 N--;

	9007 }else{

	9008 for(i=0; i<n; i++){

	9009 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);

	9010 #ifndef SQLITE_OMIT_AUTOVACUUM

	9011 if( pCheck->pBt->autoVacuum ){

	9012 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);

	9013 }

	9014 #endif

	9015 checkRef(pCheck, iFreePage);

	9016 }

	9017 N -= n;

	9018 }

	9019 }

	9020 #ifndef SQLITE_OMIT_AUTOVACUUM

	9021 else{

	9022 /* If this database supports auto-vacuum and iPage is not the last

	9023 ** page in this overflow list, check that the pointer-map entry for

	9024 ** the following page matches iPage.

	9025 */

	9026 if( pCheck->pBt->autoVacuum && N>0 ){

	9027 i = get4byte(pOvflData);

	9028 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);

	9029 }

	9030 }

	9031 #endif

	9032 iPage = get4byte(pOvflData);

	9033 sqlite3PagerUnref(pOvflPage);

	9034

	9035 if( isFreeList && N<(iPage!=0) ){

	9036 checkAppendMsg(pCheck, "free-page count in header is too small");

	9037 }

	9038 }

	9039 }

	9040 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

	9041

	9042 /*

	9043 ** An implementation of a min-heap.

	9044 **

	9045 ** aHeap[0] is the number of elements on the heap. aHeap[1] is the

	9046 ** root element. The daughter nodes of aHeap[N] are aHeap[N*2]

	9047 ** and aHeap[N*2+1].

	9048 **

	9049 ** The heap property is this: Every node is less than or equal to both

	9050 ** of its daughter nodes. A consequence of the heap property is that the

	9051 ** root node aHeap[1] is always the minimum value currently in the heap.

	9052 **

	9053 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto

	9054 ** the heap, preserving the heap property. The btreeHeapPull() routine

	9055 ** removes the root element from the heap (the minimum value in the heap)

	9056 ** and then moves other nodes around as necessary to preserve the heap

	9057 ** property.

	9058 **

	9059 ** This heap is used for cell overlap and coverage testing. Each u32

	9060 ** entry represents the span of a cell or freeblock on a btree page.

	9061 ** The upper 16 bits are the index of the first byte of a range and the

	9062 ** lower 16 bits are the index of the last byte of that range.

	9063 */

	9064 static void btreeHeapInsert(u32 *aHeap, u32 x){

	9065 u32 j, i = ++aHeap[0];

	9066 aHeap[i] = x;

	9067 while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){

	9068 x = aHeap[j];

	9069 aHeap[j] = aHeap[i];

	9070 aHeap[i] = x;

	9071 i = j;

	9072 }

	9073 }

	9074 static int btreeHeapPull(u32 aHeap, u32 pOut){

	9075 u32 j, i, x;

	9076 if( (x = aHeap[0])==0 ) return 0;

	9077 *pOut = aHeap[1];

	9078 aHeap[1] = aHeap[x];

	9079 aHeap[x] = 0xffffffff;

	9080 aHeap[0]--;

	9081 i = 1;

	9082 while( (j = i*2)<=aHeap[0] ){

	9083 if( aHeap[j]>aHeap[j+1] ) j++;

	9084 if( aHeap[i]<aHeap[j] ) break;

	9085 x = aHeap[i];

	9086 aHeap[i] = aHeap[j];

	9087 aHeap[j] = x;

	9088 i = j;

	9089 }

	9090 return 1;

	9091 }

	9092

	9093 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

	9094 /*

	9095 ** Do various sanity checks on a single page of a tree. Return

	9096 ** the tree depth. Root pages return 0. Parents of root pages

	9097 ** return 1, and so forth.

	9098 **

	9099 ** These checks are done:

	9100 **

	9101 ** 1. Make sure that cells and freeblocks do not overlap

	9102 ** but combine to completely cover the page.

	9103 ** 2. Make sure integer cell keys are in order.

	9104 ** 3. Check the integrity of overflow pages.

	9105 ** 4. Recursively call checkTreePage on all children.

	9106 ** 5. Verify that the depth of all children is the same.

	9107 */

	9108 static int checkTreePage(

	9109 IntegrityCk pCheck, / Context for the sanity check */

	9110 int iPage, /* Page number of the page to check */

	9111 i64 piMinKey, / Write minimum integer primary key here */

	9112 i64 maxKey /* Error if integer primary key greater than this */

	9113 ){

	9114 MemPage pPage = 0; / The page being analyzed */

	9115 int i; /* Loop counter */

	9116 int rc; /* Result code from subroutine call */

	9117 int depth = -1, d2; /* Depth of a subtree */

	9118 int pgno; /* Page number */

	9119 int nFrag; /* Number of fragmented bytes on the page */

	9120 int hdr; /* Offset to the page header */

	9121 int cellStart; /* Offset to the start of the cell pointer array */

	9122 int nCell; /* Number of cells */

	9123 int doCoverageCheck = 1; /* True if cell coverage checking should be done */

	9124 int keyCanBeEqual = 1; /* True if IPK can be equal to maxKey

	9125 ** False if IPK must be strictly less than maxKey */

	9126 u8 data; / Page content */

	9127 u8 pCell; / Cell content */

	9128 u8 pCellIdx; / Next element of the cell pointer array */

	9129 BtShared pBt; / The BtShared object that owns pPage */

	9130 u32 pc; /* Address of a cell */

	9131 u32 usableSize; /* Usable size of the page */

	9132 u32 contentOffset; /* Offset to the start of the cell content area */

	9133 u32 heap = 0; / Min-heap used for checking cell coverage */

	9134 u32 x, prev = 0; /* Next and previous entry on the min-heap */

	9135 const char *saved_zPfx = pCheck->zPfx;

	9136 int saved_v1 = pCheck->v1;

	9137 int saved_v2 = pCheck->v2;

	9138 u8 savedIsInit = 0;

	9139

	9140 /* Check that the page exists

	9141 */

	9142 pBt = pCheck->pBt;

	9143 usableSize = pBt->usableSize;

	9144 if( iPage==0 ) return 0;

	9145 if( checkRef(pCheck, iPage) ) return 0;

	9146 pCheck->zPfx = "Page %d: ";

	9147 pCheck->v1 = iPage;

	9148 if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){

	9149 checkAppendMsg(pCheck,

	9150 "unable to get the page. error code=%d", rc);

	9151 goto end_of_check;

	9152 }

	9153

	9154 /* Clear MemPage.isInit to make sure the corruption detection code in

	9155 ** btreeInitPage() is executed. */

	9156 savedIsInit = pPage->isInit;

	9157 pPage->isInit = 0;

	9158 if( (rc = btreeInitPage(pPage))!=0 ){

	9159 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */

	9160 checkAppendMsg(pCheck,

	9161 "btreeInitPage() returns error code %d", rc);

	9162 goto end_of_check;

	9163 }

	9164 data = pPage->aData;

	9165 hdr = pPage->hdrOffset;

	9166

	9167 /* Set up for cell analysis */

	9168 pCheck->zPfx = "On tree page %d cell %d: ";

	9169 contentOffset = get2byteNotZero(&data[hdr+5]);

	9170 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */

	9171

	9172 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the

	9173 ** number of cells on the page. */

	9174 nCell = get2byte(&data[hdr+3]);

	9175 assert( pPage->nCell==nCell );

	9176

	9177 /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page

	9178 ** immediately follows the b-tree page header. */

	9179 cellStart = hdr + 12 - 4*pPage->leaf;

	9180 assert( pPage->aCellIdx==&data[cellStart] );

	9181 pCellIdx = &data[cellStart + 2*(nCell-1)];

	9182

	9183 if( !pPage->leaf ){

	9184 /* Analyze the right-child page of internal pages */

	9185 pgno = get4byte(&data[hdr+8]);

	9186 #ifndef SQLITE_OMIT_AUTOVACUUM

	9187 if( pBt->autoVacuum ){

	9188 pCheck->zPfx = "On page %d at right child: ";

	9189 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);

	9190 }

	9191 #endif

	9192 depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);

	9193 keyCanBeEqual = 0;

	9194 }else{

	9195 /* For leaf pages, the coverage check will occur in the same loop

	9196 ** as the other cell checks, so initialize the heap. */

	9197 heap = pCheck->heap;

	9198 heap[0] = 0;

	9199 }

	9200

	9201 /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte

	9202 ** integer offsets to the cell contents. */

	9203 for(i=nCell-1; i>=0 && pCheck->mxErr; i--){

	9204 CellInfo info;

	9205

	9206 /* Check cell size */

	9207 pCheck->v2 = i;

	9208 assert( pCellIdx==&data[cellStart + i*2] );

	9209 pc = get2byteAligned(pCellIdx);

	9210 pCellIdx -= 2;

	9211 if( pc<contentOffset \|\| pc>usableSize-4 ){

	9212 checkAppendMsg(pCheck, "Offset %d out of range %d..%d",

	9213 pc, contentOffset, usableSize-4);

	9214 doCoverageCheck = 0;

	9215 continue;

	9216 }

	9217 pCell = &data[pc];

	9218 pPage->xParseCell(pPage, pCell, &info);

	9219 if( pc+info.nSize>usableSize ){

	9220 checkAppendMsg(pCheck, "Extends off end of page");

	9221 doCoverageCheck = 0;

	9222 continue;

	9223 }

	9224

	9225 /* Check for integer primary key out of range */

	9226 if( pPage->intKey ){

	9227 if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){

	9228 checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);

	9229 }

	9230 maxKey = info.nKey;

	9231 }

	9232

	9233 /* Check the content overflow list */

	9234 if( info.nPayload>info.nLocal ){

	9235 int nPage; /* Number of pages on the overflow chain */

	9236 Pgno pgnoOvfl; /* First page of the overflow chain */

	9237 assert( pc + info.nSize - 4 <= usableSize );

	9238 nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);

	9239 pgnoOvfl = get4byte(&pCell[info.nSize - 4]);

	9240 #ifndef SQLITE_OMIT_AUTOVACUUM

	9241 if( pBt->autoVacuum ){

	9242 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);

	9243 }

	9244 #endif

	9245 checkList(pCheck, 0, pgnoOvfl, nPage);

	9246 }

	9247

	9248 if( !pPage->leaf ){

	9249 /* Check sanity of left child page for internal pages */

	9250 pgno = get4byte(pCell);

	9251 #ifndef SQLITE_OMIT_AUTOVACUUM

	9252 if( pBt->autoVacuum ){

	9253 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);

	9254 }

	9255 #endif

	9256 d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);

	9257 keyCanBeEqual = 0;

	9258 if( d2!=depth ){

	9259 checkAppendMsg(pCheck, "Child page depth differs");

	9260 depth = d2;

	9261 }

	9262 }else{

	9263 /* Populate the coverage-checking heap for leaf pages */

	9264 btreeHeapInsert(heap, (pc<<16)\|(pc+info.nSize-1));

	9265 }

	9266 }

	9267 *piMinKey = maxKey;

	9268

	9269 /* Check for complete coverage of the page

	9270 */

	9271 pCheck->zPfx = 0;

	9272 if( doCoverageCheck && pCheck->mxErr>0 ){

	9273 /* For leaf pages, the min-heap has already been initialized and the

	9274 ** cells have already been inserted. But for internal pages, that has

	9275 ** not yet been done, so do it now */

	9276 if( !pPage->leaf ){

	9277 heap = pCheck->heap;

	9278 heap[0] = 0;

	9279 for(i=nCell-1; i>=0; i--){

	9280 u32 size;

	9281 pc = get2byteAligned(&data[cellStart+i*2]);

	9282 size = pPage->xCellSize(pPage, &data[pc]);

	9283 btreeHeapInsert(heap, (pc<<16)\|(pc+size-1));

	9284 }

	9285 }

	9286 /* Add the freeblocks to the min-heap

	9287 **

	9288 ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header

	9289 ** is the offset of the first freeblock, or zero if there are no

	9290 ** freeblocks on the page.

	9291 */

	9292 i = get2byte(&data[hdr+1]);

	9293 while( i>0 ){

	9294 int size, j;

	9295 assert( (u32)i<=usableSize-4 ); /* Enforced by btreeInitPage() */

	9296 size = get2byte(&data[i+2]);

	9297 assert( (u32)(i+size)<=usableSize ); /* Enforced by btreeInitPage() */

	9298 btreeHeapInsert(heap, (((u32)i)<<16)\|(i+size-1));

	9299 /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a

	9300 ** big-endian integer which is the offset in the b-tree page of the next

	9301 ** freeblock in the chain, or zero if the freeblock is the last on the

	9302 ** chain. */

	9303 j = get2byte(&data[i]);

	9304 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of

	9305 ** increasing offset. */

	9306 assert( j==0 \|\| j>i+size ); /* Enforced by btreeInitPage() */

	9307 assert( (u32)j<=usableSize-4 ); /* Enforced by btreeInitPage() */

	9308 i = j;

	9309 }

	9310 /* Analyze the min-heap looking for overlap between cells and/or

	9311 ** freeblocks, and counting the number of untracked bytes in nFrag.

	9312 **

	9313 ** Each min-heap entry is of the form: (start_address<<16)\|end_address.

	9314 ** There is an implied first entry the covers the page header, the cell

	9315 ** pointer index, and the gap between the cell pointer index and the start

	9316 ** of cell content.

	9317 **

	9318 ** The loop below pulls entries from the min-heap in order and compares

	9319 ** the start_address against the previous end_address. If there is an

	9320 ** overlap, that means bytes are used multiple times. If there is a gap,

	9321 ** that gap is added to the fragmentation count.

	9322 */

	9323 nFrag = 0;

	9324 prev = contentOffset - 1; /* Implied first min-heap entry */

	9325 while( btreeHeapPull(heap,&x) ){

	9326 if( (prev&0xffff)>=(x>>16) ){

	9327 checkAppendMsg(pCheck,

	9328 "Multiple uses for byte %u of page %d", x>>16, iPage);

	9329 break;

	9330 }else{

	9331 nFrag += (x>>16) - (prev&0xffff) - 1;

	9332 prev = x;

	9333 }

	9334 }

	9335 nFrag += usableSize - (prev&0xffff) - 1;

	9336 /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments

	9337 ** is stored in the fifth field of the b-tree page header.

	9338 ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the

	9339 ** number of fragmented free bytes within the cell content area.

	9340 */

	9341 if( heap[0]==0 && nFrag!=data[hdr+7] ){

	9342 checkAppendMsg(pCheck,

	9343 "Fragmentation of %d bytes reported as %d on page %d",

	9344 nFrag, data[hdr+7], iPage);

	9345 }

	9346 }

	9347

	9348 end_of_check:

	9349 if( !doCoverageCheck ) pPage->isInit = savedIsInit;

	9350 releasePage(pPage);

	9351 pCheck->zPfx = saved_zPfx;

	9352 pCheck->v1 = saved_v1;

	9353 pCheck->v2 = saved_v2;

	9354 return depth+1;

	9355 }

	9356 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

	9357

	9358 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

	9359 /*

	9360 ** This routine does a complete check of the given BTree file. aRoot[] is

	9361 ** an array of pages numbers were each page number is the root page of

	9362 ** a table. nRoot is the number of entries in aRoot.

	9363 **

	9364 ** A read-only or read-write transaction must be opened before calling

	9365 ** this function.

	9366 **

	9367 ** Write the number of error seen in *pnErr. Except for some memory

	9368 ** allocation errors, an error message held in memory obtained from

	9369 ** malloc is returned if pnErr is non-zero. If pnErr==0 then NULL is

	9370 ** returned. If a memory allocation error occurs, NULL is returned.

	9371 */

	9372 char *sqlite3BtreeIntegrityCheck(

	9373 Btree p, / The btree to be checked */

	9374 int aRoot, / An array of root pages numbers for individual trees */

	9375 int nRoot, /* Number of entries in aRoot[] */

	9376 int mxErr, /* Stop reporting errors after this many */

	9377 int pnErr / Write number of errors seen to this variable */

	9378 ){

	9379 Pgno i;

	9380 IntegrityCk sCheck;

	9381 BtShared *pBt = p->pBt;

	9382 int savedDbFlags = pBt->db->flags;

	9383 char zErr[100];

	9384 VVA_ONLY( int nRef );

	9385

	9386 sqlite3BtreeEnter(p);

	9387 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );

	9388 VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );

	9389 assert( nRef>=0 );

	9390 sCheck.pBt = pBt;

	9391 sCheck.pPager = pBt->pPager;

	9392 sCheck.nPage = btreePagecount(sCheck.pBt);

	9393 sCheck.mxErr = mxErr;

	9394 sCheck.nErr = 0;

	9395 sCheck.mallocFailed = 0;

	9396 sCheck.zPfx = 0;

	9397 sCheck.v1 = 0;

	9398 sCheck.v2 = 0;

	9399 sCheck.aPgRef = 0;

	9400 sCheck.heap = 0;

	9401 sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);

	9402 sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;

	9403 if( sCheck.nPage==0 ){

	9404 goto integrity_ck_cleanup;

	9405 }

	9406

	9407 sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);

	9408 if( !sCheck.aPgRef ){

	9409 sCheck.mallocFailed = 1;

	9410 goto integrity_ck_cleanup;

	9411 }

	9412 sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );

	9413 if( sCheck.heap==0 ){

	9414 sCheck.mallocFailed = 1;

	9415 goto integrity_ck_cleanup;

	9416 }

	9417

	9418 i = PENDING_BYTE_PAGE(pBt);

	9419 if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);

	9420

	9421 /* Check the integrity of the freelist

	9422 */

	9423 sCheck.zPfx = "Main freelist: ";

	9424 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),

	9425 get4byte(&pBt->pPage1->aData[36]));

	9426 sCheck.zPfx = 0;

	9427

	9428 /* Check all the tables.

	9429 */

	9430 testcase( pBt->db->flags & SQLITE_CellSizeCk );

	9431 pBt->db->flags &= ~SQLITE_CellSizeCk;

	9432 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){

	9433 i64 notUsed;

	9434 if( aRoot[i]==0 ) continue;

	9435 #ifndef SQLITE_OMIT_AUTOVACUUM

	9436 if( pBt->autoVacuum && aRoot[i]>1 ){

	9437 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);

	9438 }

	9439 #endif

	9440 checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);

	9441 }

	9442 pBt->db->flags = savedDbFlags;

	9443

	9444 /* Make sure every page in the file is referenced

	9445 */

	9446 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){

	9447 #ifdef SQLITE_OMIT_AUTOVACUUM

	9448 if( getPageReferenced(&sCheck, i)==0 ){

	9449 checkAppendMsg(&sCheck, "Page %d is never used", i);

	9450 }

	9451 #else

	9452 /* If the database supports auto-vacuum, make sure no tables contain

	9453 ** references to pointer-map pages.

	9454 */

	9455 if( getPageReferenced(&sCheck, i)==0 &&

	9456 (PTRMAP_PAGENO(pBt, i)!=i \|\| !pBt->autoVacuum) ){

	9457 checkAppendMsg(&sCheck, "Page %d is never used", i);

	9458 }

	9459 if( getPageReferenced(&sCheck, i)!=0 &&

	9460 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){

	9461 checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);

	9462 }

	9463 #endif

	9464 }

	9465

	9466 /* Clean up and report errors.

	9467 */

	9468 integrity_ck_cleanup:

	9469 sqlite3PageFree(sCheck.heap);

	9470 sqlite3_free(sCheck.aPgRef);

	9471 if( sCheck.mallocFailed ){

	9472 sqlite3StrAccumReset(&sCheck.errMsg);

	9473 sCheck.nErr++;

	9474 }

	9475 *pnErr = sCheck.nErr;

	9476 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);

	9477 /* Make sure this analysis did not leave any unref() pages. */

	9478 assert( nRef==sqlite3PagerRefcount(pBt->pPager) );

	9479 sqlite3BtreeLeave(p);

	9480 return sqlite3StrAccumFinish(&sCheck.errMsg);

	9481 }

	9482 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

	9483

	9484 /*

	9485 ** Return the full pathname of the underlying database file. Return

	9486 ** an empty string if the database is in-memory or a TEMP database.

	9487 **

	9488 ** The pager filename is invariant as long as the pager is

	9489 ** open so it is safe to access without the BtShared mutex.

	9490 */

	9491 const char sqlite3BtreeGetFilename(Btree p){

	9492 assert( p->pBt->pPager!=0 );

	9493 return sqlite3PagerFilename(p->pBt->pPager, 1);

	9494 }

	9495

	9496 /*

	9497 ** Return the pathname of the journal file for this database. The return

	9498 ** value of this routine is the same regardless of whether the journal file

	9499 ** has been created or not.

	9500 **

	9501 ** The pager journal filename is invariant as long as the pager is

	9502 ** open so it is safe to access without the BtShared mutex.

	9503 */

	9504 const char sqlite3BtreeGetJournalname(Btree p){

	9505 assert( p->pBt->pPager!=0 );

	9506 return sqlite3PagerJournalname(p->pBt->pPager);

	9507 }

	9508

	9509 /*

	9510 ** Return non-zero if a transaction is active.

	9511 */

	9512 int sqlite3BtreeIsInTrans(Btree *p){

	9513 assert( p==0 \|\| sqlite3_mutex_held(p->db->mutex) );

	9514 return (p && (p->inTrans==TRANS_WRITE));

	9515 }

	9516

	9517 #ifndef SQLITE_OMIT_WAL

	9518 /*

	9519 ** Run a checkpoint on the Btree passed as the first argument.

	9520 **

	9521 ** Return SQLITE_LOCKED if this or any other connection has an open

	9522 ** transaction on the shared-cache the argument Btree is connected to.

	9523 **

	9524 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.

	9525 */

	9526 int sqlite3BtreeCheckpoint(Btree p, int eMode, int pnLog, int *pnCkpt){

	9527 int rc = SQLITE_OK;

	9528 if( p ){

	9529 BtShared *pBt = p->pBt;

	9530 sqlite3BtreeEnter(p);

	9531 if( pBt->inTransaction!=TRANS_NONE ){

	9532 rc = SQLITE_LOCKED;

	9533 }else{

	9534 rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);

	9535 }

	9536 sqlite3BtreeLeave(p);

	9537 }

	9538 return rc;

	9539 }

	9540 #endif

	9541

	9542 /*

	9543 ** Return non-zero if a read (or write) transaction is active.

	9544 */

	9545 int sqlite3BtreeIsInReadTrans(Btree *p){

	9546 assert( p );

	9547 assert( sqlite3_mutex_held(p->db->mutex) );

	9548 return p->inTrans!=TRANS_NONE;

	9549 }

	9550

	9551 int sqlite3BtreeIsInBackup(Btree *p){

	9552 assert( p );

	9553 assert( sqlite3_mutex_held(p->db->mutex) );

	9554 return p->nBackup!=0;

	9555 }

	9556

	9557 /*

	9558 ** This function returns a pointer to a blob of memory associated with

	9559 ** a single shared-btree. The memory is used by client code for its own

	9560 ** purposes (for example, to store a high-level schema associated with

	9561 ** the shared-btree). The btree layer manages reference counting issues.

	9562 **

	9563 ** The first time this is called on a shared-btree, nBytes bytes of memory

	9564 ** are allocated, zeroed, and returned to the caller. For each subsequent

	9565 ** call the nBytes parameter is ignored and a pointer to the same blob

	9566 ** of memory returned.

	9567 **

	9568 ** If the nBytes parameter is 0 and the blob of memory has not yet been

	9569 ** allocated, a null pointer is returned. If the blob has already been

	9570 ** allocated, it is returned as normal.

	9571 **

	9572 ** Just before the shared-btree is closed, the function passed as the

	9573 ** xFree argument when the memory allocation was made is invoked on the

	9574 ** blob of allocated memory. The xFree function should not call sqlite3_free()

	9575 ** on the memory, the btree layer does that.

	9576 */

	9577 void sqlite3BtreeSchema(Btree p, int nBytes, void(xFree)(void )){

	9578 BtShared *pBt = p->pBt;

	9579 sqlite3BtreeEnter(p);

	9580 if( !pBt->pSchema && nBytes ){

	9581 pBt->pSchema = sqlite3DbMallocZero(0, nBytes);

	9582 pBt->xFreeSchema = xFree;

	9583 }

	9584 sqlite3BtreeLeave(p);

	9585 return pBt->pSchema;

	9586 }

	9587

	9588 /*

	9589 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared

	9590 ** btree as the argument handle holds an exclusive lock on the

	9591 ** sqlite_master table. Otherwise SQLITE_OK.

	9592 */

	9593 int sqlite3BtreeSchemaLocked(Btree *p){

	9594 int rc;

	9595 assert( sqlite3_mutex_held(p->db->mutex) );

	9596 sqlite3BtreeEnter(p);

	9597 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);

	9598 assert( rc==SQLITE_OK \|\| rc==SQLITE_LOCKED_SHAREDCACHE );

	9599 sqlite3BtreeLeave(p);

	9600 return rc;

	9601 }

	9602

	9603

	9604 #ifndef SQLITE_OMIT_SHARED_CACHE

	9605 /*

	9606 ** Obtain a lock on the table whose root page is iTab. The

	9607 ** lock is a write lock if isWritelock is true or a read lock

	9608 ** if it is false.

	9609 */

	9610 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){

	9611 int rc = SQLITE_OK;

	9612 assert( p->inTrans!=TRANS_NONE );

	9613 if( p->sharable ){

	9614 u8 lockType = READ_LOCK + isWriteLock;

	9615 assert( READ_LOCK+1==WRITE_LOCK );

	9616 assert( isWriteLock==0 \|\| isWriteLock==1 );

	9617

	9618 sqlite3BtreeEnter(p);

	9619 rc = querySharedCacheTableLock(p, iTab, lockType);

	9620 if( rc==SQLITE_OK ){

	9621 rc = setSharedCacheTableLock(p, iTab, lockType);

	9622 }

	9623 sqlite3BtreeLeave(p);

	9624 }

	9625 return rc;

	9626 }

	9627 #endif

	9628

	9629 #ifndef SQLITE_OMIT_INCRBLOB

	9630 /*

	9631 ** Argument pCsr must be a cursor opened for writing on an

	9632 ** INTKEY table currently pointing at a valid table entry.

	9633 ** This function modifies the data stored as part of that entry.

	9634 **

	9635 ** Only the data content may only be modified, it is not possible to

	9636 ** change the length of the data stored. If this function is called with

	9637 ** parameters that attempt to write past the end of the existing data,

	9638 ** no modifications are made and SQLITE_CORRUPT is returned.

	9639 */

	9640 int sqlite3BtreePutData(BtCursor pCsr, u32 offset, u32 amt, void z){

	9641 int rc;

	9642 assert( cursorOwnsBtShared(pCsr) );

	9643 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );

	9644 assert( pCsr->curFlags & BTCF_Incrblob );

	9645

	9646 rc = restoreCursorPosition(pCsr);

	9647 if( rc!=SQLITE_OK ){

	9648 return rc;

	9649 }

	9650 assert( pCsr->eState!=CURSOR_REQUIRESEEK );

	9651 if( pCsr->eState!=CURSOR_VALID ){

	9652 return SQLITE_ABORT;

	9653 }

	9654

	9655 /* Save the positions of all other cursors open on this table. This is

	9656 ** required in case any of them are holding references to an xFetch

	9657 ** version of the b-tree page modified by the accessPayload call below.

	9658 **

	9659 ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()

	9660 ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence

	9661 ** saveAllCursors can only return SQLITE_OK.

	9662 */

	9663 VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);

	9664 assert( rc==SQLITE_OK );

	9665

	9666 /* Check some assumptions:

	9667 ** (a) the cursor is open for writing,

	9668 ** (b) there is a read/write transaction open,

	9669 ** (c) the connection holds a write-lock on the table (if required),

	9670 ** (d) there are no conflicting read-locks, and

	9671 ** (e) the cursor points at a valid row of an intKey table.

	9672 */

	9673 if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){

	9674 return SQLITE_READONLY;

	9675 }

	9676 assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0

	9677 && pCsr->pBt->inTransaction==TRANS_WRITE );

	9678 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );

	9679 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );

	9680 assert( pCsr->apPage[pCsr->iPage]->intKey );

	9681

	9682 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);

	9683 }

	9684

	9685 /*

	9686 ** Mark this cursor as an incremental blob cursor.

	9687 */

	9688 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){

	9689 pCur->curFlags \|= BTCF_Incrblob;

	9690 pCur->pBtree->hasIncrblobCur = 1;

	9691 }

	9692 #endif

	9693

	9694 /*

	9695 ** Set both the "read version" (single byte at byte offset 18) and

	9696 ** "write version" (single byte at byte offset 19) fields in the database

	9697 ** header to iVersion.

	9698 */

	9699 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){

	9700 BtShared *pBt = pBtree->pBt;

	9701 int rc; /* Return code */

	9702

	9703 assert( iVersion==1 \|\| iVersion==2 );

	9704

	9705 /* If setting the version fields to 1, do not automatically open the

	9706 ** WAL connection, even if the version fields are currently set to 2.

	9707 */

	9708 pBt->btsFlags &= ~BTS_NO_WAL;

	9709 if( iVersion==1 ) pBt->btsFlags \|= BTS_NO_WAL;

	9710

	9711 rc = sqlite3BtreeBeginTrans(pBtree, 0);

	9712 if( rc==SQLITE_OK ){

	9713 u8 *aData = pBt->pPage1->aData;

	9714 if( aData[18]!=(u8)iVersion \|\| aData[19]!=(u8)iVersion ){

	9715 rc = sqlite3BtreeBeginTrans(pBtree, 2);

	9716 if( rc==SQLITE_OK ){

	9717 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	9718 if( rc==SQLITE_OK ){

	9719 aData[18] = (u8)iVersion;

	9720 aData[19] = (u8)iVersion;

	9721 }

	9722 }

	9723 }

	9724 }

	9725

	9726 pBt->btsFlags &= ~BTS_NO_WAL;

	9727 return rc;

	9728 }

	9729

	9730 /*

	9731 ** Return true if the cursor has a hint specified. This routine is

	9732 ** only used from within assert() statements

	9733 */

	9734 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){

	9735 return (pCsr->hints & mask)!=0;

	9736 }

	9737

	9738 /*

	9739 ** Return true if the given Btree is read-only.

	9740 */

	9741 int sqlite3BtreeIsReadonly(Btree *p){

	9742 return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;

	9743 }

	9744

	9745 /*

	9746 ** Return the size of the header added to each page by this module.

	9747 */

	9748 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }

	9749

	9750 #if !defined(SQLITE_OMIT_SHARED_CACHE)

	9751 /*

	9752 ** Return true if the Btree passed as the only argument is sharable.

	9753 */

	9754 int sqlite3BtreeSharable(Btree *p){

	9755 return p->sharable;

	9756 }

	9757

	9758 /*

	9759 ** Return the number of connections to the BtShared object accessed by

	9760 ** the Btree handle passed as the only argument. For private caches

	9761 ** this is always 1. For shared caches it may be 1 or greater.

	9762 */

	9763 int sqlite3BtreeConnectionCount(Btree *p){

	9764 testcase( p->sharable );

	9765 return p->pBt->nRef;

	9766 }

	9767 #endif

OLD	NEW

« no previous file with comments | « third_party/sqlite/sqlite-src-3170000/src/btree.h ('k') | third_party/sqlite/sqlite-src-3170000/src/btreeInt.h » ('j') | no next file with comments »