third_party/sqlite/sqlite-src-3080704/src/btree.c - Issue 949043002: Add //third_party/sqlite to dirs_to_snapshot, remove net_sql.patch

Side by Side Diff: third_party/sqlite/sqlite-src-3080704/src/btree.c

Issue 949043002: Add //third_party/sqlite to dirs_to_snapshot, remove net_sql.patch (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 5 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 ** 2004 April 6

	3 **

	4 ** The author disclaims copyright to this source code. In place of

	5 ** a legal notice, here is a blessing:

	6 **

	7 ** May you do good and not evil.

	8 ** May you find forgiveness for yourself and forgive others.

	9 ** May you share freely, never taking more than you give.

	10 **

	11 *************************************************************************

	12 ** This file implements an external (disk-based) database using BTrees.

	13 ** See the header comment on "btreeInt.h" for additional information.

	14 ** Including a description of file format and an overview of operation.

	15 */

	16 #include "btreeInt.h"

	17

	18 /*

	19 ** The header string that appears at the beginning of every

	20 ** SQLite database.

	21 */

	22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;

	23

	24 /*

	25 ** Set this global variable to 1 to enable tracing using the TRACE

	26 ** macro.

	27 */

	28 #if 0

	29 int sqlite3BtreeTrace=1; /* True to enable tracing */

	30 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);}

	31 #else

	32 # define TRACE(X)

	33 #endif

	34

	35 /*

	36 ** Extract a 2-byte big-endian integer from an array of unsigned bytes.

	37 ** But if the value is zero, make it 65536.

	38 **

	39 ** This routine is used to extract the "offset to cell content area" value

	40 ** from the header of a btree page. If the page size is 65536 and the page

	41 ** is empty, the offset should be 65536, but the 2-byte value stores zero.

	42 ** This routine makes the necessary adjustment to 65536.

	43 */

	44 #define get2byteNotZero(X) (((((int)get2byte(X))-1)&0xffff)+1)

	45

	46 /*

	47 ** Values passed as the 5th argument to allocateBtreePage()

	48 */

	49 #define BTALLOC_ANY 0 /* Allocate any page */

	50 #define BTALLOC_EXACT 1 /* Allocate exact page if possible */

	51 #define BTALLOC_LE 2 /* Allocate any page <= the parameter */

	52

	53 /*

	54 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not

	55 ** defined, or 0 if it is. For example:

	56 **

	57 ** bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);

	58 */

	59 #ifndef SQLITE_OMIT_AUTOVACUUM

	60 #define IfNotOmitAV(expr) (expr)

	61 #else

	62 #define IfNotOmitAV(expr) 0

	63 #endif

	64

	65 #ifndef SQLITE_OMIT_SHARED_CACHE

	66 /*

	67 ** A list of BtShared objects that are eligible for participation

	68 ** in shared cache. This variable has file scope during normal builds,

	69 ** but the test harness needs to access it so we make it global for

	70 ** test builds.

	71 **

	72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.

	73 */

	74 #ifdef SQLITE_TEST

	75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;

	76 #else

	77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;

	78 #endif

	79 #endif /* SQLITE_OMIT_SHARED_CACHE */

	80

	81 #ifndef SQLITE_OMIT_SHARED_CACHE

	82 /*

	83 ** Enable or disable the shared pager and schema features.

	84 **

	85 ** This routine has no effect on existing database connections.

	86 ** The shared cache setting affects only future calls to

	87 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().

	88 */

	89 int sqlite3_enable_shared_cache(int enable){

	90 sqlite3GlobalConfig.sharedCacheEnabled = enable;

	91 return SQLITE_OK;

	92 }

	93 #endif

	94

	95

	96

	97 #ifdef SQLITE_OMIT_SHARED_CACHE

	98 /*

	99 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),

	100 ** and clearAllSharedCacheTableLocks()

	101 ** manipulate entries in the BtShared.pLock linked list used to store

	102 ** shared-cache table level locks. If the library is compiled with the

	103 ** shared-cache feature disabled, then there is only ever one user

	104 ** of each BtShared structure and so this locking is not necessary.

	105 ** So define the lock related functions as no-ops.

	106 */

	107 #define querySharedCacheTableLock(a,b,c) SQLITE_OK

	108 #define setSharedCacheTableLock(a,b,c) SQLITE_OK

	109 #define clearAllSharedCacheTableLocks(a)

	110 #define downgradeAllSharedCacheTableLocks(a)

	111 #define hasSharedCacheTableLock(a,b,c,d) 1

	112 #define hasReadConflicts(a, b) 0

	113 #endif

	114

	115 #ifndef SQLITE_OMIT_SHARED_CACHE

	116

	117 #ifdef SQLITE_DEBUG

	118 /*

	119 **** This function is only used as part of an assert() statement. ***

	120 **

	121 ** Check to see if pBtree holds the required locks to read or write to the

	122 ** table with root page iRoot. Return 1 if it does and 0 if not.

	123 **

	124 ** For example, when writing to a table with root-page iRoot via

	125 ** Btree connection pBtree:

	126 **

	127 ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );

	128 **

	129 ** When writing to an index that resides in a sharable database, the

	130 ** caller should have first obtained a lock specifying the root page of

	131 ** the corresponding table. This makes things a bit more complicated,

	132 ** as this module treats each table as a separate structure. To determine

	133 ** the table corresponding to the index being written, this

	134 ** function has to search through the database schema.

	135 **

	136 ** Instead of a lock on the table/index rooted at page iRoot, the caller may

	137 ** hold a write-lock on the schema table (root page 1). This is also

	138 ** acceptable.

	139 */

	140 static int hasSharedCacheTableLock(

	141 Btree *pBtree, /* Handle that must hold lock */

	142 Pgno iRoot, /* Root page of b-tree */

	143 int isIndex, /* True if iRoot is the root of an index b-tree */

	144 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */

	145 ){

	146 Schema *pSchema = (Schema *)pBtree->pBt->pSchema;

	147 Pgno iTab = 0;

	148 BtLock *pLock;

	149

	150 /* If this database is not shareable, or if the client is reading

	151 ** and has the read-uncommitted flag set, then no lock is required.

	152 ** Return true immediately.

	153 */

	154 if( (pBtree->sharable==0)

	155 || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))

	156 ){

	157 return 1;

	158 }

	159

	160 /* If the client is reading or writing an index and the schema is

	161 ** not loaded, then it is too difficult to actually check to see if

	162 ** the correct locks are held. So do not bother - just return true.

	163 ** This case does not come up very often anyhow.

	164 */

	165 if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){

	166 return 1;

	167 }

	168

	169 /* Figure out the root-page that the lock should be held on. For table

	170 ** b-trees, this is just the root page of the b-tree being read or

	171 ** written. For index b-trees, it is the root page of the associated

	172 ** table. */

	173 if( isIndex ){

	174 HashElem *p;

	175 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){

	176 Index *pIdx = (Index *)sqliteHashData(p);

	177 if( pIdx->tnum==(int)iRoot ){

	178 iTab = pIdx->pTable->tnum;

	179 }

	180 }

	181 }else{

	182 iTab = iRoot;

	183 }

	184

	185 /* Search for the required lock. Either a write-lock on root-page iTab, a

	186 ** write-lock on the schema table, or (if the client is reading) a

	187 ** read-lock on iTab will suffice. Return 1 if any of these are found. */

	188 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){

	189 if( pLock->pBtree==pBtree

	190 && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))

	191 && pLock->eLock>=eLockType

	192 ){

	193 return 1;

	194 }

	195 }

	196

	197 /* Failed to find the required lock. */

	198 return 0;

	199 }

	200 #endif /* SQLITE_DEBUG */

	201

	202 #ifdef SQLITE_DEBUG

	203 /*

	204 **** This function may be used as part of assert() statements only. ****

	205 **

	206 ** Return true if it would be illegal for pBtree to write into the

	207 ** table or index rooted at iRoot because other shared connections are

	208 ** simultaneously reading that same table or index.

	209 **

	210 ** It is illegal for pBtree to write if some other Btree object that

	211 ** shares the same BtShared object is currently reading or writing

	212 ** the iRoot table. Except, if the other Btree object has the

	213 ** read-uncommitted flag set, then it is OK for the other object to

	214 ** have a read cursor.

	215 **

	216 ** For example, before writing to any part of the table or index

	217 ** rooted at page iRoot, one should call:

	218 **

	219 ** assert( !hasReadConflicts(pBtree, iRoot) );

	220 */

	221 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){

	222 BtCursor *p;

	223 for(p=pBtree->pBt->pCursor; p; p=p->pNext){

	224 if( p->pgnoRoot==iRoot

	225 && p->pBtree!=pBtree

	226 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)

	227 ){

	228 return 1;

	229 }

	230 }

	231 return 0;

	232 }

	233 #endif /* #ifdef SQLITE_DEBUG */

	234

	235 /*

	236 ** Query to see if Btree handle p may obtain a lock of type eLock

	237 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return

	238 ** SQLITE_OK if the lock may be obtained (by calling

	239 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.

	240 */

	241 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){

	242 BtShared *pBt = p->pBt;

	243 BtLock *pIter;

	244

	245 assert( sqlite3BtreeHoldsMutex(p) );

	246 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );

	247 assert( p->db!=0 );

	248 assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );

	249

	250 /* If requesting a write-lock, then the Btree must have an open write

	251 ** transaction on this file. And, obviously, for this to be so there

	252 ** must be an open write transaction on the file itself.

	253 */

	254 assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );

	255 assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );

	256

	257 /* This routine is a no-op if the shared-cache is not enabled */

	258 if( !p->sharable ){

	259 return SQLITE_OK;

	260 }

	261

	262 /* If some other connection is holding an exclusive lock, the

	263 ** requested lock may not be obtained.

	264 */

	265 if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){

	266 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);

	267 return SQLITE_LOCKED_SHAREDCACHE;

	268 }

	269

	270 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){

	271 /* The condition (pIter->eLock!=eLock) in the following if(...)

	272 ** statement is a simplification of:

	273 **

	274 ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)

	275 **

	276 ** since we know that if eLock==WRITE_LOCK, then no other connection

	277 ** may hold a WRITE_LOCK on any table in this file (since there can

	278 ** only be a single writer).

	279 */

	280 assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );

	281 assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);

	282 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){

	283 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);

	284 if( eLock==WRITE_LOCK ){

	285 assert( p==pBt->pWriter );

	286 pBt->btsFlags |= BTS_PENDING;

	287 }

	288 return SQLITE_LOCKED_SHAREDCACHE;

	289 }

	290 }

	291 return SQLITE_OK;

	292 }

	293 #endif /* !SQLITE_OMIT_SHARED_CACHE */

	294

	295 #ifndef SQLITE_OMIT_SHARED_CACHE

	296 /*

	297 ** Add a lock on the table with root-page iTable to the shared-btree used

	298 ** by Btree handle p. Parameter eLock must be either READ_LOCK or

	299 ** WRITE_LOCK.

	300 **

	301 ** This function assumes the following:

	302 **

	303 ** (a) The specified Btree object p is connected to a sharable

	304 ** database (one with the BtShared.sharable flag set), and

	305 **

	306 ** (b) No other Btree objects hold a lock that conflicts

	307 ** with the requested lock (i.e. querySharedCacheTableLock() has

	308 ** already been called and returned SQLITE_OK).

	309 **

	310 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM

	311 ** is returned if a malloc attempt fails.

	312 */

	313 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){

	314 BtShared *pBt = p->pBt;

	315 BtLock *pLock = 0;

	316 BtLock *pIter;

	317

	318 assert( sqlite3BtreeHoldsMutex(p) );

	319 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );

	320 assert( p->db!=0 );

	321

	322 /* A connection with the read-uncommitted flag set will never try to

	323 ** obtain a read-lock using this function. The only read-lock obtained

	324 ** by a connection in read-uncommitted mode is on the sqlite_master

	325 ** table, and that lock is obtained in BtreeBeginTrans(). */

	326 assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );

	327

	328 /* This function should only be called on a sharable b-tree after it

	329 ** has been determined that no other b-tree holds a conflicting lock. */

	330 assert( p->sharable );

	331 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );

	332

	333 /* First search the list for an existing lock on this table. */

	334 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){

	335 if( pIter->iTable==iTable && pIter->pBtree==p ){

	336 pLock = pIter;

	337 break;

	338 }

	339 }

	340

	341 /* If the above search did not find a BtLock struct associating Btree p

	342 ** with table iTable, allocate one and link it into the list.

	343 */

	344 if( !pLock ){

	345 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));

	346 if( !pLock ){

	347 return SQLITE_NOMEM;

	348 }

	349 pLock->iTable = iTable;

	350 pLock->pBtree = p;

	351 pLock->pNext = pBt->pLock;

	352 pBt->pLock = pLock;

	353 }

	354

	355 /* Set the BtLock.eLock variable to the maximum of the current lock

	356 ** and the requested lock. This means if a write-lock was already held

	357 ** and a read-lock requested, we don't incorrectly downgrade the lock.

	358 */

	359 assert( WRITE_LOCK>READ_LOCK );

	360 if( eLock>pLock->eLock ){

	361 pLock->eLock = eLock;

	362 }

	363

	364 return SQLITE_OK;

	365 }

	366 #endif /* !SQLITE_OMIT_SHARED_CACHE */

	367

	368 #ifndef SQLITE_OMIT_SHARED_CACHE

	369 /*

	370 ** Release all the table locks (locks obtained via calls to

	371 ** the setSharedCacheTableLock() procedure) held by Btree object p.

	372 **

	373 ** This function assumes that Btree p has an open read or write

	374 ** transaction. If it does not, then the BTS_PENDING flag

	375 ** may be incorrectly cleared.

	376 */

	377 static void clearAllSharedCacheTableLocks(Btree *p){

	378 BtShared *pBt = p->pBt;

	379 BtLock **ppIter = &pBt->pLock;

	380

	381 assert( sqlite3BtreeHoldsMutex(p) );

	382 assert( p->sharable || 0==*ppIter );

	383 assert( p->inTrans>0 );

	384

	385 while( *ppIter ){

	386 BtLock *pLock = *ppIter;

	387 assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );

	388 assert( pLock->pBtree->inTrans>=pLock->eLock );

	389 if( pLock->pBtree==p ){

	390 *ppIter = pLock->pNext;

	391 assert( pLock->iTable!=1 || pLock==&p->lock );

	392 if( pLock->iTable!=1 ){

	393 sqlite3_free(pLock);

	394 }

	395 }else{

	396 ppIter = &pLock->pNext;

	397 }

	398 }

	399

	400 assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );

	401 if( pBt->pWriter==p ){

	402 pBt->pWriter = 0;

	403 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);

	404 }else if( pBt->nTransaction==2 ){

	405 /* This function is called when Btree p is concluding its

	406 ** transaction. If there currently exists a writer, and p is not

	407 ** that writer, then the number of locks held by connections other

	408 ** than the writer must be about to drop to zero. In this case

	409 ** set the BTS_PENDING flag to 0.

	410 **

	411 ** If there is not currently a writer, then BTS_PENDING must

	412 ** be zero already. So this next line is harmless in that case.

	413 */

	414 pBt->btsFlags &= ~BTS_PENDING;

	415 }

	416 }

	417

	418 /*

	419 ** This function changes all write-locks held by Btree p into read-locks.

	420 */

	421 static void downgradeAllSharedCacheTableLocks(Btree *p){

	422 BtShared *pBt = p->pBt;

	423 if( pBt->pWriter==p ){

	424 BtLock *pLock;

	425 pBt->pWriter = 0;

	426 pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);

	427 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){

	428 assert( pLock->eLock==READ_LOCK \|\| pLock->pBtree==p );

	429 pLock->eLock = READ_LOCK;

	430 }

	431 }

	432 }

	433

	434 #endif /* SQLITE_OMIT_SHARED_CACHE */

	435

	436 static void releasePage(MemPage *pPage); /* Forward reference */

	437

	438 /*

	439 ***** This routine is used inside of assert() only ****

	440 **

	441 ** Verify that the cursor holds the mutex on its BtShared

	442 */

	443 #ifdef SQLITE_DEBUG

	444 static int cursorHoldsMutex(BtCursor *p){

	445 return sqlite3_mutex_held(p->pBt->mutex);

	446 }

	447 #endif

	448

	449 /*

	450 ** Invalidate the overflow cache of the cursor passed as the first argument.

	451 ** Only that one cursor's BTCF_ValidOvfl flag is cleared.

	452 */

	453 #define invalidateOverflowCache(pCur) (pCur->curFlags &= ~BTCF_ValidOvfl)

	454

	455 /*

	456 ** Invalidate the overflow page-list cache for all cursors opened

	457 ** on the shared btree structure pBt.

	458 */

	459 static void invalidateAllOverflowCache(BtShared *pBt){

	460 BtCursor *p;

	461 assert( sqlite3_mutex_held(pBt->mutex) );

	462 for(p=pBt->pCursor; p; p=p->pNext){

	463 invalidateOverflowCache(p);

	464 }

	465 }

	466

	467 #ifndef SQLITE_OMIT_INCRBLOB

	468 /*

	469 ** This function is called before modifying the contents of a table

	470 ** to invalidate any incrblob cursors that are open on the

	471 ** row or one of the rows being modified.

	472 **

	473 ** If argument isClearTable is true, then the entire contents of the

	474 ** table is about to be deleted. In this case invalidate all incrblob

	475 ** cursors open on any row within the table with root-page pgnoRoot.

	476 **

	477 ** Otherwise, if argument isClearTable is false, then the row with

	478 ** rowid iRow is being replaced or deleted. In this case invalidate

	479 ** only those incrblob cursors open on that specific row.

	480 */

	481 static void invalidateIncrblobCursors(

	482 Btree *pBtree, /* The database file to check */

	483 i64 iRow, /* The rowid that might be changing */

	484 int isClearTable /* True if all rows are being deleted */

	485 ){

	486 BtCursor *p;

	487 BtShared *pBt = pBtree->pBt;

	488 assert( sqlite3BtreeHoldsMutex(pBtree) );

	489 for(p=pBt->pCursor; p; p=p->pNext){

	490 if( (p->curFlags & BTCF_Incrblob)!=0

	491 && (isClearTable || p->info.nKey==iRow)

	492 ){

	493 p->eState = CURSOR_INVALID;

	494 }

	495 }

	496 }

	497

	498 #else

	499 /* Stub function when INCRBLOB is omitted */

	500 #define invalidateIncrblobCursors(x,y,z)

	501 #endif /* SQLITE_OMIT_INCRBLOB */

	502

	503 /*

	504 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called

	505 ** when a page that previously contained data becomes a free-list leaf

	506 ** page.

	507 **

	508 ** The BtShared.pHasContent bitvec exists to work around an obscure

	509 ** bug caused by the interaction of two useful IO optimizations surrounding

	510 ** free-list leaf pages:

	511 **

	512 ** 1) When all data is deleted from a page and the page becomes

	513 ** a free-list leaf page, the page is not written to the database

	514 ** (as free-list leaf pages contain no meaningful data). Sometimes

	515 ** such a page is not even journalled (as it will not be modified,

	516 ** why bother journalling it?).

	517 **

	518 ** 2) When a free-list leaf page is reused, its content is not read

	519 ** from the database or written to the journal file (why should it

	520 ** be, if it is not at all meaningful?).

	521 **

	522 ** By themselves, these optimizations work fine and provide a handy

	523 ** performance boost to bulk delete or insert operations. However, if

	524 ** a page is moved to the free-list and then reused within the same

	525 ** transaction, a problem comes up. If the page is not journalled when

	526 ** it is moved to the free-list and it is also not journalled when it

	527 ** is extracted from the free-list and reused, then the original data

	528 ** may be lost. In the event of a rollback, it may not be possible

	529 ** to restore the database to its original configuration.

	530 **

	531 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is

	532 ** moved to become a free-list leaf page, the corresponding bit is

	533 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,

	534 ** optimization 2 above is omitted if the corresponding bit is already

	535 ** set in BtShared.pHasContent. The contents of the bitvec are cleared

	536 ** at the end of every transaction.

	537 */

	538 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){

	539 int rc = SQLITE_OK;

	540 if( !pBt->pHasContent ){

	541 assert( pgno<=pBt->nPage );

	542 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);

	543 if( !pBt->pHasContent ){

	544 rc = SQLITE_NOMEM;

	545 }

	546 }

	547 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){

	548 rc = sqlite3BitvecSet(pBt->pHasContent, pgno);

	549 }

	550 return rc;

	551 }

	552

	553 /*

	554 ** Query the BtShared.pHasContent vector.

	555 **

	556 ** This function is called when a free-list leaf page is removed from the

	557 ** free-list for reuse. It returns false if it is safe to retrieve the

	558 ** page from the pager layer with the 'no-content' flag set. True otherwise.

	559 */

	560 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){

	561 Bitvec *p = pBt->pHasContent;

	562 return (p && (pgno>sqlite3BitvecSize(p) \|\| sqlite3BitvecTest(p, pgno)));

	563 }

	564

	565 /*

	566 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be

	567 ** invoked at the conclusion of each write-transaction.

	568 */

	569 static void btreeClearHasContent(BtShared *pBt){

	570 sqlite3BitvecDestroy(pBt->pHasContent);

	571 pBt->pHasContent = 0;

	572 }

	573

	574 /*

	575 ** Release all of the apPage[] pages for a cursor.

	576 */

	577 static void btreeReleaseAllCursorPages(BtCursor *pCur){

	578 int i;

	579 for(i=0; i<=pCur->iPage; i++){

	580 releasePage(pCur->apPage[i]);

	581 pCur->apPage[i] = 0;

	582 }

	583 pCur->iPage = -1;

	584 }

	585

	586

	587 /*

	588 ** Save the current cursor position in the variables BtCursor.nKey

	589 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.

	590 **

	591 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)

	592 ** prior to calling this routine.

	593 */

	594 static int saveCursorPosition(BtCursor *pCur){

	595 int rc;

	596

	597 assert( CURSOR_VALID==pCur->eState );

	598 assert( 0==pCur->pKey );

	599 assert( cursorHoldsMutex(pCur) );

	600

	601 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);

	602 assert( rc==SQLITE_OK ); /* KeySize() cannot fail */

	603

	604 /* If this is an intKey table, then the above call to BtreeKeySize()

	605 ** stores the integer key in pCur->nKey. In this case this value is

	606 ** all that is required. Otherwise, if pCur is not open on an intKey

	607 ** table, then malloc space for and store the pCur->nKey bytes of key

	608 ** data.

	609 */

	610 if( 0==pCur->apPage[0]->intKey ){

	611 void *pKey = sqlite3Malloc( pCur->nKey );

	612 if( pKey ){

	613 rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);

	614 if( rc==SQLITE_OK ){

	615 pCur->pKey = pKey;

	616 }else{

	617 sqlite3_free(pKey);

	618 }

	619 }else{

	620 rc = SQLITE_NOMEM;

	621 }

	622 }

	623 assert( !pCur->apPage[0]->intKey || !pCur->pKey );

	624

	625 if( rc==SQLITE_OK ){

	626 btreeReleaseAllCursorPages(pCur);

	627 pCur->eState = CURSOR_REQUIRESEEK;

	628 }

	629

	630 invalidateOverflowCache(pCur);

	631 return rc;

	632 }

	633

	634 /* Forward reference */

	635 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);

	636

	637 /*

	638 ** Save the positions of all cursors (except pExcept) that are open on

	639 ** the table with root-page iRoot. "Saving the cursor position" means that

	640 ** the location in the btree is remembered in such a way that it can be

	641 ** moved back to the same spot after the btree has been modified. This

	642 ** routine is called just before cursor pExcept is used to modify the

	643 ** table, for example in BtreeDelete() or BtreeInsert().

	644 **

	645 ** Implementation note: This routine merely checks to see if any cursors

	646 ** need to be saved. It calls out to saveCursorsOnList() in the (unusual)

	647 ** event that cursors are in need of being saved.

	648 */

	649 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){

	650 BtCursor *p;

	651 assert( sqlite3_mutex_held(pBt->mutex) );

	652 assert( pExcept==0 || pExcept->pBt==pBt );

	653 for(p=pBt->pCursor; p; p=p->pNext){

	654 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;

	655 }

	656 return p ? saveCursorsOnList(p, iRoot, pExcept) : SQLITE_OK;

	657 }

	658

	659 /* This helper routine to saveAllCursors does the actual work of saving

	660 ** the cursors if and when a cursor is found that actually requires saving.

	661 ** The common case is that no cursors need to be saved, so this routine is

	662 ** broken out from its caller to avoid unnecessary stack pointer movement.

	663 */

	664 static int SQLITE_NOINLINE saveCursorsOnList(

	665 BtCursor *p, /* The first cursor that needs saving */

	666 Pgno iRoot, /* Only save cursor with this iRoot. Save all if zero */

	667 BtCursor *pExcept /* Do not save this cursor */

	668 ){

	669 do{

	670 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){

	671 if( p->eState==CURSOR_VALID ){

	672 int rc = saveCursorPosition(p);

	673 if( SQLITE_OK!=rc ){

	674 return rc;

	675 }

	676 }else{

	677 testcase( p->iPage>0 );

	678 btreeReleaseAllCursorPages(p);

	679 }

	680 }

	681 p = p->pNext;

	682 }while( p );

	683 return SQLITE_OK;

	684 }

	685

	686 /*

	687 ** Clear the current cursor position.

	688 */

	689 void sqlite3BtreeClearCursor(BtCursor *pCur){

	690 assert( cursorHoldsMutex(pCur) );

	691 sqlite3_free(pCur->pKey);

	692 pCur->pKey = 0;

	693 pCur->eState = CURSOR_INVALID;

	694 }

	695

	696 /*

	697 ** In this version of BtreeMoveto, pKey is a packed index record

	698 ** such as is generated by the OP_MakeRecord opcode. Unpack the

	699 ** record and then call BtreeMovetoUnpacked() to do the work.

	700 */

	701 static int btreeMoveto(

	702 BtCursor *pCur, /* Cursor open on the btree to be searched */

	703 const void *pKey, /* Packed key if the btree is an index */

	704 i64 nKey, /* Integer key for tables. Size of pKey for indices */

	705 int bias, /* Bias search to the high end */

	706 int *pRes /* Write search results here */

	707 ){

	708 int rc; /* Status code */

	709 UnpackedRecord *pIdxKey; /* Unpacked index key */

	710 char aSpace[200]; /* Temp space for pIdxKey - to avoid a malloc */

	711 char *pFree = 0;

	712

	713 if( pKey ){

	714 assert( nKey==(i64)(int)nKey );

	715 pIdxKey = sqlite3VdbeAllocUnpackedRecord(

	716 pCur->pKeyInfo, aSpace, sizeof(aSpace), &pFree

	717 );

	718 if( pIdxKey==0 ) return SQLITE_NOMEM;

	719 sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);

	720 if( pIdxKey->nField==0 ){

	721 sqlite3DbFree(pCur->pKeyInfo->db, pFree);

	722 return SQLITE_CORRUPT_BKPT;

	723 }

	724 }else{

	725 pIdxKey = 0;

	726 }

	727 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);

	728 if( pFree ){

	729 sqlite3DbFree(pCur->pKeyInfo->db, pFree);

	730 }

	731 return rc;

	732 }

	733

	734 /*

	735 ** Restore the cursor to the position it was in (or as close to as possible)

	736 ** when saveCursorPosition() was called. Note that this call deletes the

	737 ** saved position info stored by saveCursorPosition(), so there can be

	738 ** at most one effective restoreCursorPosition() call after each

	739 ** saveCursorPosition().

	740 */

	741 static int btreeRestoreCursorPosition(BtCursor *pCur){

	742 int rc;

	743 assert( cursorHoldsMutex(pCur) );

	744 assert( pCur->eState>=CURSOR_REQUIRESEEK );

	745 if( pCur->eState==CURSOR_FAULT ){

	746 return pCur->skipNext;

	747 }

	748 pCur->eState = CURSOR_INVALID;

	749 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext);

	750 if( rc==SQLITE_OK ){

	751 sqlite3_free(pCur->pKey);

	752 pCur->pKey = 0;

	753 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );

	754 if( pCur->skipNext && pCur->eState==CURSOR_VALID ){

	755 pCur->eState = CURSOR_SKIPNEXT;

	756 }

	757 }

	758 return rc;

	759 }

	760

	761 #define restoreCursorPosition(p) \

	762 (p->eState>=CURSOR_REQUIRESEEK ? \

	763 btreeRestoreCursorPosition(p) : \

	764 SQLITE_OK)

	765

	766 /*

	767 ** Determine whether or not a cursor has moved from the position where

	768 ** it was last placed, or has been invalidated for any other reason.

	769 ** Cursors can move when the row they are pointing at is deleted out

	770 ** from under them, for example. Cursor might also move if a btree

	771 ** is rebalanced.

	772 **

	773 ** Calling this routine with a NULL cursor pointer returns false.

	774 **

	775 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor

	776 ** back to where it ought to be if this routine returns true.

	777 */

	778 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){

	779 return pCur->eState!=CURSOR_VALID;

	780 }

	781

	782 /*

	783 ** This routine restores a cursor back to its original position after it

	784 ** has been moved by some outside activity (such as a btree rebalance or

	785 ** a row having been deleted out from under the cursor).

	786 **

	787 ** On success, the *pDifferentRow parameter is false if the cursor is left

	788 ** pointing at exactly the same row. *pDifferentRow is true if the row the cursor

	789 ** was pointing to has been deleted, forcing the cursor to point to some

	790 ** nearby row.

	791 **

	792 ** This routine should only be called for a cursor that just returned

	793 ** TRUE from sqlite3BtreeCursorHasMoved().

	794 */

	795 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){

	796 int rc;

	797

	798 assert( pCur!=0 );

	799 assert( pCur->eState!=CURSOR_VALID );

	800 rc = restoreCursorPosition(pCur);

	801 if( rc ){

	802 *pDifferentRow = 1;

	803 return rc;

	804 }

	805 if( pCur->eState!=CURSOR_VALID || NEVER(pCur->skipNext!=0) ){

	806 *pDifferentRow = 1;

	807 }else{

	808 *pDifferentRow = 0;

	809 }

	810 return SQLITE_OK;

	811 }

	812

	813 #ifndef SQLITE_OMIT_AUTOVACUUM

	814 /*

	815 ** Given a page number of a regular database page, return the page

	816 ** number for the pointer-map page that contains the entry for the

	817 ** input page number.

	818 **

	819 ** Return 0 (not a valid page) for pgno==1 since there is

	820 ** no pointer map associated with page 1. The integrity_check logic

	821 ** requires that ptrmapPageno(*,1)!=1.

	822 */

	823 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){

	824 int nPagesPerMapPage;

	825 Pgno iPtrMap, ret;

	826 assert( sqlite3_mutex_held(pBt->mutex) );

	827 if( pgno<2 ) return 0;

	828 nPagesPerMapPage = (pBt->usableSize/5)+1;

	829 iPtrMap = (pgno-2)/nPagesPerMapPage;

	830 ret = (iPtrMap*nPagesPerMapPage) + 2;

	831 if( ret==PENDING_BYTE_PAGE(pBt) ){

	832 ret++;

	833 }

	834 return ret;

	835 }

	836

	837 /*

	838 ** Write an entry into the pointer map.

	839 **

	840 ** This routine updates the pointer map entry for page number 'key'

	841 ** so that it maps to type 'eType' and parent page number 'pgno'.

	842 **

	843 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is

	844 ** a no-op. If an error occurs, the appropriate error code is written

	845 ** into *pRC.

	846 */

	847 static void ptrmapPut(BtShared pBt, Pgno key, u8 eType, Pgno parent, int pRC){

	848 DbPage pDbPage; / The pointer map page */

	849 u8 pPtrmap; / The pointer map data */

	850 Pgno iPtrmap; /* The pointer map page number */

	851 int offset; /* Offset in pointer map page */

	852 int rc; /* Return code from subfunctions */

	853

	854 if( *pRC ) return;

	855

	856 assert( sqlite3_mutex_held(pBt->mutex) );

	857 /* The master-journal page number must never be used as a pointer map page */

	858 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );

	859

	860 assert( pBt->autoVacuum );

	861 if( key==0 ){

	862 *pRC = SQLITE_CORRUPT_BKPT;

	863 return;

	864 }

	865 iPtrmap = PTRMAP_PAGENO(pBt, key);

	866 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);

	867 if( rc!=SQLITE_OK ){

	868 *pRC = rc;

	869 return;

	870 }

	871 offset = PTRMAP_PTROFFSET(iPtrmap, key);

	872 if( offset<0 ){

	873 *pRC = SQLITE_CORRUPT_BKPT;

	874 goto ptrmap_exit;

	875 }

	876 assert( offset <= (int)pBt->usableSize-5 );

	877 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);

	878

	879 if( eType!=pPtrmap[offset] \|\| get4byte(&pPtrmap[offset+1])!=parent ){

	880 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));

	881 *pRC= rc = sqlite3PagerWrite(pDbPage);

	882 if( rc==SQLITE_OK ){

	883 pPtrmap[offset] = eType;

	884 put4byte(&pPtrmap[offset+1], parent);

	885 }

	886 }

	887

	888 ptrmap_exit:

	889 sqlite3PagerUnref(pDbPage);

	890 }

	891

	892 /*

	893 ** Read an entry from the pointer map.

	894 **

	895 ** This routine retrieves the pointer map entry for page 'key', writing

	896 ** the type and parent page number to pEType and pPgno respectively.

	897 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.

	898 */

	899 static int ptrmapGet(BtShared pBt, Pgno key, u8 pEType, Pgno *pPgno){

	900 DbPage pDbPage; / The pointer map page */

	901 int iPtrmap; /* Pointer map page index */

	902 u8 pPtrmap; / Pointer map page data */

	903 int offset; /* Offset of entry in pointer map */

	904 int rc;

	905

	906 assert( sqlite3_mutex_held(pBt->mutex) );

	907

	908 iPtrmap = PTRMAP_PAGENO(pBt, key);

	909 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);

	910 if( rc!=0 ){

	911 return rc;

	912 }

	913 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);

	914

	915 offset = PTRMAP_PTROFFSET(iPtrmap, key);

	916 if( offset<0 ){

	917 sqlite3PagerUnref(pDbPage);

	918 return SQLITE_CORRUPT_BKPT;

	919 }

	920 assert( offset <= (int)pBt->usableSize-5 );

	921 assert( pEType!=0 );

	922 *pEType = pPtrmap[offset];

	923 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);

	924

	925 sqlite3PagerUnref(pDbPage);

	926 if( pEType<1 \|\| pEType>5 ) return SQLITE_CORRUPT_BKPT;

	927 return SQLITE_OK;

	928 }

	929

	930 #else /* if defined SQLITE_OMIT_AUTOVACUUM */

	/* Stand-ins used when SQLITE_OMIT_AUTOVACUUM is defined: the two "put"
	** routines expand to nothing and ptrmapGet() expands to SQLITE_OK. */
	#define ptrmapPut(BT,KEY,TYPE,PARENT,PRC)
	#define ptrmapGet(BT,KEY,PTYPE,PPGNO) SQLITE_OK
	#define ptrmapPutOvflPtr(PG,CELL,PRC)

	934 #endif

	935

	/*
	** findCell(PG,IDX): given a btree page PG and a cell index IDX (0 is
	** the first cell on the page, 1 the second, and so on) evaluate to a
	** pointer to the content of that cell.  The 2-byte entry IDX of the
	** page's cell pointer array is read, masked with the page's maskPage,
	** and used as an offset into the page's aData.
	**
	** This works only for pages that do not contain overflow cells.
	**
	** findCellv2(DATA,MASK,OFST,IDX) performs the same lookup from raw
	** pieces: DATA is the page image, MASK the offset mask, and OFST the
	** offset within DATA of the cell pointer array.
	*/
	#define findCell(PG,IDX) \
	  ((PG)->aData + ((PG)->maskPage & get2byte(&(PG)->aCellIdx[2*(IDX)])))
	#define findCellv2(DATA,MASK,OFST,IDX) \
	  (DATA+(MASK&get2byte(DATA+(OFST+2*(IDX)))))

	946

	947

	948 /*

	949 ** This a more complex version of findCell() that works for

	950 ** pages that do contain overflow cells.

	951 */

	952 static u8 findOverflowCell(MemPage pPage, int iCell){

	953 int i;

	954 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	955 for(i=pPage->nOverflow-1; i>=0; i--){

	956 int k;

	957 k = pPage->aiOvfl[i];

	958 if( k<=iCell ){

	959 if( k==iCell ){

	960 return pPage->apOvfl[i];

	961 }

	962 iCell--;

	963 }

	964 }

	965 return findCell(pPage, iCell);

	966 }

	967

	968 /*

	969 ** Parse a cell content block and fill in the CellInfo structure. There

	970 ** are two versions of this function. btreeParseCell() takes a

	971 ** cell index as the second argument and btreeParseCellPtr()

	972 ** takes a pointer to the body of the cell as its second argument.

	973 */

	974 static void btreeParseCellPtr(

	975 MemPage pPage, / Page containing the cell */

	976 u8 pCell, / Pointer to the cell text. */

	977 CellInfo pInfo / Fill in this structure */

	978 ){

	979 u8 pIter; / For scanning through pCell */

	980 u32 nPayload; /* Number of bytes of cell payload */

	981

	982 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	983 assert( pPage->leaf==0 \|\| pPage->leaf==1 );

	984 if( pPage->intKeyLeaf ){

	985 assert( pPage->childPtrSize==0 );

	986 pIter = pCell + getVarint32(pCell, nPayload);

	987 pIter += getVarint(pIter, (u64*)&pInfo->nKey);

	988 }else if( pPage->noPayload ){

	989 assert( pPage->childPtrSize==4 );

	990 pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);

	991 pInfo->nPayload = 0;

	992 pInfo->nLocal = 0;

	993 pInfo->iOverflow = 0;

	994 pInfo->pPayload = 0;

	995 return;

	996 }else{

	997 pIter = pCell + pPage->childPtrSize;

	998 pIter += getVarint32(pIter, nPayload);

	999 pInfo->nKey = nPayload;

	1000 }

	1001 pInfo->nPayload = nPayload;

	1002 pInfo->pPayload = pIter;

	1003 testcase( nPayload==pPage->maxLocal );

	1004 testcase( nPayload==pPage->maxLocal+1 );

	1005 if( nPayload<=pPage->maxLocal ){

	1006 /* This is the (easy) common case where the entire payload fits

	1007 ** on the local page. No overflow is required.

	1008 */

	1009 pInfo->nSize = nPayload + (u16)(pIter - pCell);

	1010 if( pInfo->nSize<4 ) pInfo->nSize = 4;

	1011 pInfo->nLocal = (u16)nPayload;

	1012 pInfo->iOverflow = 0;

	1013 }else{

	1014 /* If the payload will not fit completely on the local page, we have

	1015 ** to decide how much to store locally and how much to spill onto

	1016 ** overflow pages. The strategy is to minimize the amount of unused

	1017 ** space on overflow pages while keeping the amount of local storage

	1018 ** in between minLocal and maxLocal.

	1019 **

	1020 ** Warning: changing the way overflow payload is distributed in any

	1021 ** way will result in an incompatible file format.

	1022 */

	1023 int minLocal; /* Minimum amount of payload held locally */

	1024 int maxLocal; /* Maximum amount of payload held locally */

	1025 int surplus; /* Overflow payload available for local storage */

	1026

	1027 minLocal = pPage->minLocal;

	1028 maxLocal = pPage->maxLocal;

	1029 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);

	1030 testcase( surplus==maxLocal );

	1031 testcase( surplus==maxLocal+1 );

	1032 if( surplus <= maxLocal ){

	1033 pInfo->nLocal = (u16)surplus;

	1034 }else{

	1035 pInfo->nLocal = (u16)minLocal;

	1036 }

	1037 pInfo->iOverflow = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell);

	1038 pInfo->nSize = pInfo->iOverflow + 4;

	1039 }

	1040 }

	1041 static void btreeParseCell(

	1042 MemPage pPage, / Page containing the cell */

	1043 int iCell, /* The cell index. First cell is 0 */

	1044 CellInfo pInfo / Fill in this structure */

	1045 ){

	1046 btreeParseCellPtr(pPage, findCell(pPage, iCell), pInfo);

	1047 }

	1048

	1049 /*

	1050 ** Compute the total number of bytes that a Cell needs in the cell

	1051 ** data area of the btree-page. The return number includes the cell

	1052 ** data header and the local payload, but not any overflow page or

	1053 ** the space used by the cell pointer.

	1054 */

	1055 static u16 cellSizePtr(MemPage pPage, u8 pCell){

	1056 u8 pIter = pCell + pPage->childPtrSize; / For looping over bytes of pCell */

	1057 u8 pEnd; / End mark for a varint */

	1058 u32 nSize; /* Size value to return */

	1059

	1060 #ifdef SQLITE_DEBUG

	1061 /* The value returned by this function should always be the same as

	1062 ** the (CellInfo.nSize) value found by doing a full parse of the

	1063 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of

	1064 ** this function verifies that this invariant is not violated. */

	1065 CellInfo debuginfo;

	1066 btreeParseCellPtr(pPage, pCell, &debuginfo);

	1067 #endif

	1068

	1069 if( pPage->noPayload ){

	1070 pEnd = &pIter[9];

	1071 while( (*pIter++)&0x80 && pIter<pEnd );

	1072 assert( pPage->childPtrSize==4 );

	1073 return (u16)(pIter - pCell);

	1074 }

	1075 nSize = *pIter;

	1076 if( nSize>=0x80 ){

	1077 pEnd = &pIter[9];

	1078 nSize &= 0x7f;

	1079 do{

	1080 nSize = (nSize<<7) \| (*++pIter & 0x7f);

	1081 }while( *(pIter)>=0x80 && pIter<pEnd );

	1082 }

	1083 pIter++;

	1084 if( pPage->intKey ){

	1085 /* pIter now points at the 64-bit integer key value, a variable length

	1086 ** integer. The following block moves pIter to point at the first byte

	1087 ** past the end of the key value. */

	1088 pEnd = &pIter[9];

	1089 while( (*pIter++)&0x80 && pIter<pEnd );

	1090 }

	1091 testcase( nSize==pPage->maxLocal );

	1092 testcase( nSize==pPage->maxLocal+1 );

	1093 if( nSize<=pPage->maxLocal ){

	1094 nSize += (u32)(pIter - pCell);

	1095 if( nSize<4 ) nSize = 4;

	1096 }else{

	1097 int minLocal = pPage->minLocal;

	1098 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);

	1099 testcase( nSize==pPage->maxLocal );

	1100 testcase( nSize==pPage->maxLocal+1 );

	1101 if( nSize>pPage->maxLocal ){

	1102 nSize = minLocal;

	1103 }

	1104 nSize += 4 + (u16)(pIter - pCell);

	1105 }

	1106 assert( nSize==debuginfo.nSize \|\| CORRUPT_DB );

	1107 return (u16)nSize;

	1108 }

	1109

	#ifdef SQLITE_DEBUG
	/* Index-based form of cellSizePtr(): look up cell number iCell on pPage
	** and return its size.  This variant exists only for use inside
	** assert() statements. */
	static u16 cellSize(MemPage *pPage, int iCell){
	  u8 *pTarget = findCell(pPage, iCell);
	  return cellSizePtr(pPage, pTarget);
	}
	#endif

	1117

	1118 #ifndef SQLITE_OMIT_AUTOVACUUM

	1119 /*

	1120 ** If the cell pCell, part of page pPage contains a pointer

	1121 ** to an overflow page, insert an entry into the pointer-map

	1122 ** for the overflow page.

	1123 */

	1124 static void ptrmapPutOvflPtr(MemPage pPage, u8 pCell, int *pRC){

	1125 CellInfo info;

	1126 if( *pRC ) return;

	1127 assert( pCell!=0 );

	1128 btreeParseCellPtr(pPage, pCell, &info);

	1129 if( info.iOverflow ){

	1130 Pgno ovfl = get4byte(&pCell[info.iOverflow]);

	1131 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);

	1132 }

	1133 }

	1134 #endif

	1135

	1136

	1137 /*

	1138 ** Defragment the page given. All Cells are moved to the

	1139 ** end of the page and all free space is collected into one

	1140 ** big FreeBlk that occurs in between the header and cell

	1141 ** pointer array and the cell content area.

	1142 */

	1143 static int defragmentPage(MemPage *pPage){

	1144 int i; /* Loop counter */

	1145 int pc; /* Address of the i-th cell */

	1146 int hdr; /* Offset to the page header */

	1147 int size; /* Size of a cell */

	1148 int usableSize; /* Number of usable bytes on a page */

	1149 int cellOffset; /* Offset to the cell pointer array */

	1150 int cbrk; /* Offset to the cell content area */

	1151 int nCell; /* Number of cells on the page */

	1152 unsigned char data; / The page data */

	1153 unsigned char temp; / Temp area for cell content */

	1154 int iCellFirst; /* First allowable cell index */

	1155 int iCellLast; /* Last possible cell index */

	1156

	1157

	1158 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	1159 assert( pPage->pBt!=0 );

	1160 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );

	1161 assert( pPage->nOverflow==0 );

	1162 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1163 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);

	1164 data = pPage->aData;

	1165 hdr = pPage->hdrOffset;

	1166 cellOffset = pPage->cellOffset;

	1167 nCell = pPage->nCell;

	1168 assert( nCell==get2byte(&data[hdr+3]) );

	1169 usableSize = pPage->pBt->usableSize;

	1170 cbrk = get2byte(&data[hdr+5]);

	1171 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);

	1172 cbrk = usableSize;

	1173 iCellFirst = cellOffset + 2*nCell;

	1174 iCellLast = usableSize - 4;

	1175 for(i=0; i<nCell; i++){

	1176 u8 pAddr; / The i-th cell pointer */

	1177 pAddr = &data[cellOffset + i*2];

	1178 pc = get2byte(pAddr);

	1179 testcase( pc==iCellFirst );

	1180 testcase( pc==iCellLast );

	1181 #if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)

	1182 /* These conditions have already been verified in btreeInitPage()

	1183 ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined

	1184 */

	1185 if( pc<iCellFirst \|\| pc>iCellLast ){

	1186 return SQLITE_CORRUPT_BKPT;

	1187 }

	1188 #endif

	1189 assert( pc>=iCellFirst && pc<=iCellLast );

	1190 size = cellSizePtr(pPage, &temp[pc]);

	1191 cbrk -= size;

	1192 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)

	1193 if( cbrk<iCellFirst ){

	1194 return SQLITE_CORRUPT_BKPT;

	1195 }

	1196 #else

	1197 if( cbrk<iCellFirst \|\| pc+size>usableSize ){

	1198 return SQLITE_CORRUPT_BKPT;

	1199 }

	1200 #endif

	1201 assert( cbrk+size<=usableSize && cbrk>=iCellFirst );

	1202 testcase( cbrk+size==usableSize );

	1203 testcase( pc+size==usableSize );

	1204 memcpy(&data[cbrk], &temp[pc], size);

	1205 put2byte(pAddr, cbrk);

	1206 }

	1207 assert( cbrk>=iCellFirst );

	1208 put2byte(&data[hdr+5], cbrk);

	1209 data[hdr+1] = 0;

	1210 data[hdr+2] = 0;

	1211 data[hdr+7] = 0;

	1212 memset(&data[iCellFirst], 0, cbrk-iCellFirst);

	1213 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	1214 if( cbrk-iCellFirst!=pPage->nFree ){

	1215 return SQLITE_CORRUPT_BKPT;

	1216 }

	1217 return SQLITE_OK;

	1218 }

	1219

	1220 /*

	1221 ** Allocate nByte bytes of space from within the B-Tree page passed

	1222 ** as the first argument. Write into *pIdx the index into pPage->aData[]

	1223 ** of the first byte of allocated space. Return either SQLITE_OK or

	1224 ** an error code (usually SQLITE_CORRUPT).

	1225 **

	1226 ** The caller guarantees that there is sufficient space to make the

	1227 ** allocation. This routine might need to defragment in order to bring

	1228 ** all the space together, however. This routine will avoid using

	1229 ** the first two bytes past the cell pointer area since presumably this

	1230 ** allocation is being made in order to insert a new cell, so we will

	1231 ** also end up needing a new cell pointer.

	1232 */

	1233 static int allocateSpace(MemPage pPage, int nByte, int pIdx){

	1234 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */

	1235 u8 * const data = pPage->aData; /* Local cache of pPage->aData */

	1236 int top; /* First byte of cell content area */

	1237 int gap; /* First byte of gap between cell pointers and cell content */

	1238 int rc; /* Integer return code */

	1239 int usableSize; /* Usable size of the page */

	1240

	1241 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	1242 assert( pPage->pBt );

	1243 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1244 assert( nByte>=0 ); /* Minimum cell size is 4 */

	1245 assert( pPage->nFree>=nByte );

	1246 assert( pPage->nOverflow==0 );

	1247 usableSize = pPage->pBt->usableSize;

	1248 assert( nByte < usableSize-8 );

	1249

	1250 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );

	1251 gap = pPage->cellOffset + 2*pPage->nCell;

	1252 assert( gap<=65536 );

	1253 top = get2byte(&data[hdr+5]);

	1254 if( gap>top ){

	1255 if( top==0 ){

	1256 top = 65536;

	1257 }else{

	1258 return SQLITE_CORRUPT_BKPT;

	1259 }

	1260 }

	1261

	1262 /* If there is enough space between gap and top for one more cell pointer

	1263 ** array entry offset, and if the freelist is not empty, then search the

	1264 ** freelist looking for a free slot big enough to satisfy the request.

	1265 */

	1266 testcase( gap+2==top );

	1267 testcase( gap+1==top );

	1268 testcase( gap==top );

	1269 if( gap+2<=top && (data[hdr+1] \|\| data[hdr+2]) ){

	1270 int pc, addr;

	1271 for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){

	1272 int size; /* Size of the free slot */

	1273 if( pc>usableSize-4 \|\| pc<addr+4 ){

	1274 return SQLITE_CORRUPT_BKPT;

	1275 }

	1276 size = get2byte(&data[pc+2]);

	1277 if( size>=nByte ){

	1278 int x = size - nByte;

	1279 testcase( x==4 );

	1280 testcase( x==3 );

	1281 if( x<4 ){

	1282 if( data[hdr+7]>=60 ) goto defragment_page;

	1283 /* Remove the slot from the free-list. Update the number of

	1284 ** fragmented bytes within the page. */

	1285 memcpy(&data[addr], &data[pc], 2);

	1286 data[hdr+7] += (u8)x;

	1287 }else if( size+pc > usableSize ){

	1288 return SQLITE_CORRUPT_BKPT;

	1289 }else{

	1290 /* The slot remains on the free-list. Reduce its size to account

	1291 ** for the portion used by the new allocation. */

	1292 put2byte(&data[pc+2], x);

	1293 }

	1294 *pIdx = pc + x;

	1295 return SQLITE_OK;

	1296 }

	1297 }

	1298 }

	1299

	1300 /* The request could not be fulfilled using a freelist slot. Check

	1301 ** to see if defragmentation is necessary.

	1302 */

	1303 testcase( gap+2+nByte==top );

	1304 if( gap+2+nByte>top ){

	1305 defragment_page:

	1306 testcase( pPage->nCell==0 );

	1307 rc = defragmentPage(pPage);

	1308 if( rc ) return rc;

	1309 top = get2byteNotZero(&data[hdr+5]);

	1310 assert( gap+nByte<=top );

	1311 }

	1312

	1313

	1314 /* Allocate memory from the gap in between the cell pointer array

	1315 ** and the cell content area. The btreeInitPage() call has already

	1316 ** validated the freelist. Given that the freelist is valid, there

	1317 ** is no way that the allocation can extend off the end of the page.

	1318 ** The assert() below verifies the previous sentence.

	1319 */

	1320 top -= nByte;

	1321 put2byte(&data[hdr+5], top);

	1322 assert( top+nByte <= (int)pPage->pBt->usableSize );

	1323 *pIdx = top;

	1324 return SQLITE_OK;

	1325 }

	1326

	1327 /*

	1328 ** Return a section of the pPage->aData to the freelist.

	1329 ** The first byte of the new free block is pPage->aData[iStart]

	1330 ** and the size of the block is iSize bytes.

	1331 **

	1332 ** Adjacent freeblocks are coalesced.

	1333 **

	1334 ** Note that even though the freeblock list was checked by btreeInitPage(),

	1335 ** that routine will not detect overlap between cells or freeblocks. Nor

	1336 ** does it detect cells or freeblocks that encrouch into the reserved bytes

	1337 ** at the end of the page. So do additional corruption checks inside this

	1338 ** routine and return SQLITE_CORRUPT if any problems are found.

	1339 */

	1340 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){

	1341 u16 iPtr; /* Address of ptr to next freeblock */

	1342 u16 iFreeBlk; /* Address of the next freeblock */

	1343 u8 hdr; /* Page header size. 0 or 100 */

	1344 u8 nFrag = 0; /* Reduction in fragmentation */

	1345 u16 iOrigSize = iSize; /* Original value of iSize */

	1346 u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */

	1347 u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */

	1348 unsigned char data = pPage->aData; / Page content */

	1349

	1350 assert( pPage->pBt!=0 );

	1351 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	1352 assert( iStart>=pPage->hdrOffset+6+pPage->childPtrSize );

	1353 assert( iEnd <= pPage->pBt->usableSize );

	1354 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1355 assert( iSize>=4 ); /* Minimum cell size is 4 */

	1356 assert( iStart<=iLast );

	1357

	1358 /* Overwrite deleted information with zeros when the secure_delete

	1359 ** option is enabled */

	1360 if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){

	1361 memset(&data[iStart], 0, iSize);

	1362 }

	1363

	1364 /* The list of freeblocks must be in ascending order. Find the

	1365 ** spot on the list where iStart should be inserted.

	1366 */

	1367 hdr = pPage->hdrOffset;

	1368 iPtr = hdr + 1;

	1369 if( data[iPtr+1]==0 && data[iPtr]==0 ){

	1370 iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */

	1371 }else{

	1372 while( (iFreeBlk = get2byte(&data[iPtr]))>0 && iFreeBlk<iStart ){

	1373 if( iFreeBlk<iPtr+4 ) return SQLITE_CORRUPT_BKPT;

	1374 iPtr = iFreeBlk;

	1375 }

	1376 if( iFreeBlk>iLast ) return SQLITE_CORRUPT_BKPT;

	1377 assert( iFreeBlk>iPtr \|\| iFreeBlk==0 );

	1378

	1379 /* At this point:

	1380 ** iFreeBlk: First freeblock after iStart, or zero if none

	1381 ** iPtr: The address of a pointer iFreeBlk

	1382 **

	1383 ** Check to see if iFreeBlk should be coalesced onto the end of iStart.

	1384 */

	1385 if( iFreeBlk && iEnd+3>=iFreeBlk ){

	1386 nFrag = iFreeBlk - iEnd;

	1387 if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT;

	1388 iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);

	1389 iSize = iEnd - iStart;

	1390 iFreeBlk = get2byte(&data[iFreeBlk]);

	1391 }

	1392

	1393 /* If iPtr is another freeblock (that is, if iPtr is not the freelist

	1394 ** pointer in the page header) then check to see if iStart should be

	1395 ** coalesced onto the end of iPtr.

	1396 */

	1397 if( iPtr>hdr+1 ){

	1398 int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);

	1399 if( iPtrEnd+3>=iStart ){

	1400 if( iPtrEnd>iStart ) return SQLITE_CORRUPT_BKPT;

	1401 nFrag += iStart - iPtrEnd;

	1402 iSize = iEnd - iPtr;

	1403 iStart = iPtr;

	1404 }

	1405 }

	1406 if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_BKPT;

	1407 data[hdr+7] -= nFrag;

	1408 }

	1409 if( iStart==get2byte(&data[hdr+5]) ){

	1410 /* The new freeblock is at the beginning of the cell content area,

	1411 ** so just extend the cell content area rather than create another

	1412 ** freelist entry */

	1413 if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_BKPT;

	1414 put2byte(&data[hdr+1], iFreeBlk);

	1415 put2byte(&data[hdr+5], iEnd);

	1416 }else{

	1417 /* Insert the new freeblock into the freelist */

	1418 put2byte(&data[iPtr], iStart);

	1419 put2byte(&data[iStart], iFreeBlk);

	1420 put2byte(&data[iStart+2], iSize);

	1421 }

	1422 pPage->nFree += iOrigSize;

	1423 return SQLITE_OK;

	1424 }

	1425

	1426 /*

	1427 ** Decode the flags byte (the first byte of the header) for a page

	1428 ** and initialize fields of the MemPage structure accordingly.

	1429 **

	1430 ** Only the following combinations are supported. Anything different

	1431 ** indicates a corrupt database files:

	1432 **

	1433 ** PTF_ZERODATA

	1434 ** PTF_ZERODATA \| PTF_LEAF

	1435 ** PTF_LEAFDATA \| PTF_INTKEY

	1436 ** PTF_LEAFDATA \| PTF_INTKEY \| PTF_LEAF

	1437 */

	1438 static int decodeFlags(MemPage *pPage, int flagByte){

	1439 BtShared pBt; / A copy of pPage->pBt */

	1440

	1441 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );

	1442 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1443 pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 );

	1444 flagByte &= ~PTF_LEAF;

	1445 pPage->childPtrSize = 4-4*pPage->leaf;

	1446 pBt = pPage->pBt;

	1447 if( flagByte==(PTF_LEAFDATA \| PTF_INTKEY) ){

	1448 pPage->intKey = 1;

	1449 pPage->intKeyLeaf = pPage->leaf;

	1450 pPage->noPayload = !pPage->leaf;

	1451 pPage->maxLocal = pBt->maxLeaf;

	1452 pPage->minLocal = pBt->minLeaf;

	1453 }else if( flagByte==PTF_ZERODATA ){

	1454 pPage->intKey = 0;

	1455 pPage->intKeyLeaf = 0;

	1456 pPage->noPayload = 0;

	1457 pPage->maxLocal = pBt->maxLocal;

	1458 pPage->minLocal = pBt->minLocal;

	1459 }else{

	1460 return SQLITE_CORRUPT_BKPT;

	1461 }

	1462 pPage->max1bytePayload = pBt->max1bytePayload;

	1463 return SQLITE_OK;

	1464 }

	1465

	1466 /*

	1467 ** Initialize the auxiliary information for a disk block.

	1468 **

	1469 ** Return SQLITE_OK on success. If we see that the page does

	1470 ** not contain a well-formed database page, then return

	1471 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not

	1472 ** guarantee that the page is well-formed. It only shows that

	1473 ** we failed to detect any corruption.

	1474 */

	1475 static int btreeInitPage(MemPage *pPage){

	1476

	1477 assert( pPage->pBt!=0 );

	1478 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1479 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );

	1480 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );

	1481 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );

	1482

	1483 if( !pPage->isInit ){

	1484 u16 pc; /* Address of a freeblock within pPage->aData[] */

	1485 u8 hdr; /* Offset to beginning of page header */

	1486 u8 data; / Equal to pPage->aData */

	1487 BtShared pBt; / The main btree structure */

	1488 int usableSize; /* Amount of usable space on each page */

	1489 u16 cellOffset; /* Offset from start of page to first cell pointer */

	1490 int nFree; /* Number of unused bytes on the page */

	1491 int top; /* First byte of the cell content area */

	1492 int iCellFirst; /* First allowable cell or freeblock offset */

	1493 int iCellLast; /* Last possible cell or freeblock offset */

	1494

	1495 pBt = pPage->pBt;

	1496

	1497 hdr = pPage->hdrOffset;

	1498 data = pPage->aData;

	1499 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;

	1500 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );

	1501 pPage->maskPage = (u16)(pBt->pageSize - 1);

	1502 pPage->nOverflow = 0;

	1503 usableSize = pBt->usableSize;

	1504 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;

	1505 pPage->aDataEnd = &data[usableSize];

	1506 pPage->aCellIdx = &data[cellOffset];

	1507 top = get2byteNotZero(&data[hdr+5]);

	1508 pPage->nCell = get2byte(&data[hdr+3]);

	1509 if( pPage->nCell>MX_CELL(pBt) ){

	1510 /* To many cells for a single page. The page must be corrupt */

	1511 return SQLITE_CORRUPT_BKPT;

	1512 }

	1513 testcase( pPage->nCell==MX_CELL(pBt) );

	1514

	1515 /* A malformed database page might cause us to read past the end

	1516 ** of page when parsing a cell.

	1517 **

	1518 ** The following block of code checks early to see if a cell extends

	1519 ** past the end of a page boundary and causes SQLITE_CORRUPT to be

	1520 ** returned if it does.

	1521 */

	1522 iCellFirst = cellOffset + 2*pPage->nCell;

	1523 iCellLast = usableSize - 4;

	1524 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)

	1525 {

	1526 int i; /* Index into the cell pointer array */

	1527 int sz; /* Size of a cell */

	1528

	1529 if( !pPage->leaf ) iCellLast--;

	1530 for(i=0; i<pPage->nCell; i++){

	1531 pc = get2byte(&data[cellOffset+i*2]);

	1532 testcase( pc==iCellFirst );

	1533 testcase( pc==iCellLast );

	1534 if( pc<iCellFirst \|\| pc>iCellLast ){

	1535 return SQLITE_CORRUPT_BKPT;

	1536 }

	1537 sz = cellSizePtr(pPage, &data[pc]);

	1538 testcase( pc+sz==usableSize );

	1539 if( pc+sz>usableSize ){

	1540 return SQLITE_CORRUPT_BKPT;

	1541 }

	1542 }

	1543 if( !pPage->leaf ) iCellLast++;

	1544 }

	1545 #endif

	1546

	1547 /* Compute the total free space on the page */

	1548 pc = get2byte(&data[hdr+1]);

	1549 nFree = data[hdr+7] + top;

	1550 while( pc>0 ){

	1551 u16 next, size;

	1552 if( pc<iCellFirst \|\| pc>iCellLast ){

	1553 /* Start of free block is off the page */

	1554 return SQLITE_CORRUPT_BKPT;

	1555 }

	1556 next = get2byte(&data[pc]);

	1557 size = get2byte(&data[pc+2]);

	1558 if( (next>0 && next<=pc+size+3) \|\| pc+size>usableSize ){

	1559 /* Free blocks must be in ascending order. And the last byte of

	1560 ** the free-block must lie on the database page. */

	1561 return SQLITE_CORRUPT_BKPT;

	1562 }

	1563 nFree = nFree + size;

	1564 pc = next;

	1565 }

	1566

	1567 /* At this point, nFree contains the sum of the offset to the start

	1568 ** of the cell-content area plus the number of free bytes within

	1569 ** the cell-content area. If this is greater than the usable-size

	1570 ** of the page, then the page must be corrupted. This check also

	1571 ** serves to verify that the offset to the start of the cell-content

	1572 ** area, according to the page header, lies within the page.

	1573 */

	1574 if( nFree>usableSize ){

	1575 return SQLITE_CORRUPT_BKPT;

	1576 }

	1577 pPage->nFree = (u16)(nFree - iCellFirst);

	1578 pPage->isInit = 1;

	1579 }

	1580 return SQLITE_OK;

	1581 }

	1582

	1583 /*

	1584 ** Set up a raw page so that it looks like a database page holding

	1585 ** no entries.

	1586 */

	1587 static void zeroPage(MemPage *pPage, int flags){

	1588 unsigned char *data = pPage->aData;

	1589 BtShared *pBt = pPage->pBt;

	1590 u8 hdr = pPage->hdrOffset;

	1591 u16 first;

	1592

	1593 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );

	1594 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );

	1595 assert( sqlite3PagerGetData(pPage->pDbPage) == data );

	1596 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	1597 assert( sqlite3_mutex_held(pBt->mutex) );

	1598 if( pBt->btsFlags & BTS_SECURE_DELETE ){

	1599 memset(&data[hdr], 0, pBt->usableSize - hdr);

	1600 }

	1601 data[hdr] = (char)flags;

	1602 first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);

	1603 memset(&data[hdr+1], 0, 4);

	1604 data[hdr+7] = 0;

	1605 put2byte(&data[hdr+5], pBt->usableSize);

	1606 pPage->nFree = (u16)(pBt->usableSize - first);

	1607 decodeFlags(pPage, flags);

	1608 pPage->cellOffset = first;

	1609 pPage->aDataEnd = &data[pBt->usableSize];

	1610 pPage->aCellIdx = &data[first];

	1611 pPage->nOverflow = 0;

	1612 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );

	1613 pPage->maskPage = (u16)(pBt->pageSize - 1);

	1614 pPage->nCell = 0;

	1615 pPage->isInit = 1;

	1616 }

	1617

	1618

	1619 /*

	1620 ** Convert a DbPage obtained from the pager into a MemPage used by

	1621 ** the btree layer.

	1622 */

	1623 static MemPage btreePageFromDbPage(DbPage pDbPage, Pgno pgno, BtShared *pBt){

	1624 MemPage pPage = (MemPage)sqlite3PagerGetExtra(pDbPage);

	1625 pPage->aData = sqlite3PagerGetData(pDbPage);

	1626 pPage->pDbPage = pDbPage;

	1627 pPage->pBt = pBt;

	1628 pPage->pgno = pgno;

	1629 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;

	1630 return pPage;

	1631 }

	1632

	1633 /*

	1634 ** Get a page from the pager. Initialize the MemPage.pBt and

	1635 ** MemPage.aData elements if needed.

	1636 **

	1637 ** If the noContent flag is set, it means that we do not care about

	1638 ** the content of the page at this time. So do not go to the disk

	1639 ** to fetch the content. Just fill in the content with zeros for now.

	1640 ** If in the future we call sqlite3PagerWrite() on this page, that

	1641 ** means we have started to be concerned about content and the disk

	1642 ** read should occur at that point.

	1643 */

	1644 static int btreeGetPage(

	1645 BtShared pBt, / The btree */

	1646 Pgno pgno, /* Number of the page to fetch */

	1647 MemPage *ppPage, / Return the page in this parameter */

	1648 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */

	1649 ){

	1650 int rc;

	1651 DbPage *pDbPage;

	1652

	1653 assert( flags==0 \|\| flags==PAGER_GET_NOCONTENT \|\| flags==PAGER_GET_READONLY );

	1654 assert( sqlite3_mutex_held(pBt->mutex) );

	1655 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);

	1656 if( rc ) return rc;

	1657 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);

	1658 return SQLITE_OK;

	1659 }

	1660

	1661 /*

	1662 ** Retrieve a page from the pager cache. If the requested page is not

	1663 ** already in the pager cache return NULL. Initialize the MemPage.pBt and

	1664 ** MemPage.aData elements if needed.

	1665 */

	1666 static MemPage btreePageLookup(BtShared pBt, Pgno pgno){

	1667 DbPage *pDbPage;

	1668 assert( sqlite3_mutex_held(pBt->mutex) );

	1669 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);

	1670 if( pDbPage ){

	1671 return btreePageFromDbPage(pDbPage, pgno, pBt);

	1672 }

	1673 return 0;

	1674 }

	1675

	1676 /*

	1677 ** Return the size of the database file in pages. If there is any kind of

	1678 ** error, return ((unsigned int)-1).

	1679 */

	1680 static Pgno btreePagecount(BtShared *pBt){

	1681 return pBt->nPage;

	1682 }

	1683 u32 sqlite3BtreeLastPage(Btree *p){

	1684 assert( sqlite3BtreeHoldsMutex(p) );

	1685 assert( ((p->pBt->nPage)&0x8000000)==0 );

	1686 return btreePagecount(p->pBt);

	1687 }

	1688

	1689 /*

	1690 ** Get a page from the pager and initialize it. This routine is just a

	1691 ** convenience wrapper around separate calls to btreeGetPage() and

	1692 ** btreeInitPage().

	1693 **

	1694 ** If an error occurs, then the value *ppPage is set to is undefined. It

	1695 ** may remain unchanged, or it may be set to an invalid value.

	1696 */

	1697 static int getAndInitPage(

	1698 BtShared *pBt, /* The database file */

	1699 Pgno pgno, /* Number of the page to get */

	1700 MemPage **ppPage, /* Write the page pointer here */

	1701 int bReadonly /* PAGER_GET_READONLY or 0 */

	1702 ){

	1703 int rc;

	1704 assert( sqlite3_mutex_held(pBt->mutex) );

	1705 assert( bReadonly==PAGER_GET_READONLY || bReadonly==0 );

	1706

	1707 if( pgno>btreePagecount(pBt) ){

	1708 rc = SQLITE_CORRUPT_BKPT;

	1709 }else{

	1710 rc = btreeGetPage(pBt, pgno, ppPage, bReadonly);

	1711 if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){

	1712 rc = btreeInitPage(*ppPage);

	1713 if( rc!=SQLITE_OK ){

	1714 releasePage(*ppPage);

	1715 }

	1716 }

	1717 }

	1718

	1719 testcase( pgno==0 );

	1720 assert( pgno!=0 || rc==SQLITE_CORRUPT );

	1721 return rc;

	1722 }

	1723

	1724 /*

	1725 ** Release a MemPage. This should be called once for each prior

	1726 ** call to btreeGetPage.

	1727 */

	1728 static void releasePage(MemPage *pPage){

	1729 if( pPage ){

	1730 assert( pPage->aData );

	1731 assert( pPage->pBt );

	1732 assert( pPage->pDbPage!=0 );

	1733 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );

	1734 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );

	1735 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1736 sqlite3PagerUnrefNotNull(pPage->pDbPage);

	1737 }

	1738 }

	1739

	1740 /*

	1741 ** During a rollback, when the pager reloads information into the cache

	1742 ** so that the cache is restored to its original state at the start of

	1743 ** the transaction, for each page restored this routine is called.

	1744 **

	1745 ** This routine needs to reset the extra data section at the end of the

	1746 ** page to agree with the restored data.

	1747 */

	1748 static void pageReinit(DbPage *pData){

	1749 MemPage *pPage;

	1750 pPage = (MemPage *)sqlite3PagerGetExtra(pData);

	1751 assert( sqlite3PagerPageRefcount(pData)>0 );

	1752 if( pPage->isInit ){

	1753 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	1754 pPage->isInit = 0;

	1755 if( sqlite3PagerPageRefcount(pData)>1 ){

	1756 /* pPage might not be a btree page; it might be an overflow page

	1757 ** or ptrmap page or a free page. In those cases, the following

	1758 ** call to btreeInitPage() will likely return SQLITE_CORRUPT.

	1759 ** But no harm is done by this. And it is very important that

	1760 ** btreeInitPage() be called on every btree page so we make

	1761 ** the call for every page that comes in for re-initing. */

	1762 btreeInitPage(pPage);

	1763 }

	1764 }

	1765 }

	1766

	1767 /*

	1768 ** Invoke the busy handler for a btree.

	1769 */

	1770 static int btreeInvokeBusyHandler(void *pArg){

	1771 BtShared *pBt = (BtShared*)pArg;

	1772 assert( pBt->db );

	1773 assert( sqlite3_mutex_held(pBt->db->mutex) );

	1774 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);

	1775 }

	1776

	1777 /*

	1778 ** Open a database file.

	1779 **

	1780 ** zFilename is the name of the database file. If zFilename is NULL

	1781 ** then an ephemeral database is created. The ephemeral database might

	1782 ** be exclusively in memory, or it might use a disk-based memory cache.

	1783 ** Either way, the ephemeral database will be automatically deleted

	1784 ** when sqlite3BtreeClose() is called.

	1785 **

	1786 ** If zFilename is ":memory:" then an in-memory database is created

	1787 ** that is automatically destroyed when it is closed.

	1788 **

	1789 ** The "flags" parameter is a bitmask that might contain bits like

	1790 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.

	1791 **

	1792 ** If the database is already opened in the same database connection

	1793 ** and we are in shared cache mode, then the open will fail with an

	1794 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared

	1795 ** objects in the same database connection since doing so will lead

	1796 ** to problems with locking.

	1797 */

	1798 int sqlite3BtreeOpen(

	1799 sqlite3_vfs *pVfs, /* VFS to use for this b-tree */

	1800 const char *zFilename, /* Name of the file containing the BTree database */

	1801 sqlite3 *db, /* Associated database handle */

	1802 Btree **ppBtree, /* Pointer to new Btree object written here */

	1803 int flags, /* Options */

	1804 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */

	1805 ){

	1806 BtShared *pBt = 0; /* Shared part of btree structure */

	1807 Btree *p; /* Handle to return */

	1808 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. Ticket #3537 */

	1809 int rc = SQLITE_OK; /* Result code from this function */

	1810 u8 nReserve; /* Byte of unused space on each page */

	1811 unsigned char zDbHeader[100]; /* Database header content */

	1812

	1813 /* True if opening an ephemeral, temporary database */

	1814 const int isTempDb = zFilename==0 || zFilename[0]==0;

	1815

	1816 /* Set the variable isMemdb to true for an in-memory database, or

	1817 ** false for a file-based database.

	1818 */

	1819 #ifdef SQLITE_OMIT_MEMORYDB

	1820 const int isMemdb = 0;

	1821 #else

	1822 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)

	1823 || (isTempDb && sqlite3TempInMemory(db))

	1824 || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;

	1825 #endif

	1826

	1827 assert( db!=0 );

	1828 assert( pVfs!=0 );

	1829 assert( sqlite3_mutex_held(db->mutex) );

	1830 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */

	1831

	1832 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */

	1833 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );

	1834

	1835 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */

	1836 assert( (flags & BTREE_SINGLE)==0 || isTempDb );

	1837

	1838 if( isMemdb ){

	1839 flags |= BTREE_MEMORY;

	1840 }

	1841 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){

	1842 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;

	1843 }

	1844 p = sqlite3MallocZero(sizeof(Btree));

	1845 if( !p ){

	1846 return SQLITE_NOMEM;

	1847 }

	1848 p->inTrans = TRANS_NONE;

	1849 p->db = db;

	1850 #ifndef SQLITE_OMIT_SHARED_CACHE

	1851 p->lock.pBtree = p;

	1852 p->lock.iTable = 1;

	1853 #endif

	1854

	1855 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)

	1856 /*

	1857 ** If this Btree is a candidate for shared cache, try to find an

	1858 ** existing BtShared object that we can share with

	1859 */

	1860 if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){

	1861 if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){

	1862 int nFullPathname = pVfs->mxPathname+1;

	1863 char *zFullPathname = sqlite3Malloc(nFullPathname);

	1864 MUTEX_LOGIC( sqlite3_mutex *mutexShared; )

	1865 p->sharable = 1;

	1866 if( !zFullPathname ){

	1867 sqlite3_free(p);

	1868 return SQLITE_NOMEM;

	1869 }

	1870 if( isMemdb ){

	1871 memcpy(zFullPathname, zFilename, sqlite3Strlen30(zFilename)+1);

	1872 }else{

	1873 rc = sqlite3OsFullPathname(pVfs, zFilename,

	1874 nFullPathname, zFullPathname);

	1875 if( rc ){

	1876 sqlite3_free(zFullPathname);

	1877 sqlite3_free(p);

	1878 return rc;

	1879 }

	1880 }

	1881 #if SQLITE_THREADSAFE

	1882 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);

	1883 sqlite3_mutex_enter(mutexOpen);

	1884 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);

	1885 sqlite3_mutex_enter(mutexShared);

	1886 #endif

	1887 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){

	1888 assert( pBt->nRef>0 );

	1889 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))

	1890 && sqlite3PagerVfs(pBt->pPager)==pVfs ){

	1891 int iDb;

	1892 for(iDb=db->nDb-1; iDb>=0; iDb--){

	1893 Btree *pExisting = db->aDb[iDb].pBt;

	1894 if( pExisting && pExisting->pBt==pBt ){

	1895 sqlite3_mutex_leave(mutexShared);

	1896 sqlite3_mutex_leave(mutexOpen);

	1897 sqlite3_free(zFullPathname);

	1898 sqlite3_free(p);

	1899 return SQLITE_CONSTRAINT;

	1900 }

	1901 }

	1902 p->pBt = pBt;

	1903 pBt->nRef++;

	1904 break;

	1905 }

	1906 }

	1907 sqlite3_mutex_leave(mutexShared);

	1908 sqlite3_free(zFullPathname);

	1909 }

	1910 #ifdef SQLITE_DEBUG

	1911 else{

	1912 /* In debug mode, we mark all persistent databases as sharable

	1913 ** even when they are not. This exercises the locking code and

	1914 ** gives more opportunity for asserts(sqlite3_mutex_held())

	1915 ** statements to find locking problems.

	1916 */

	1917 p->sharable = 1;

	1918 }

	1919 #endif

	1920 }

	1921 #endif

	1922 if( pBt==0 ){

	1923 /*

	1924 ** The following asserts make sure that structures used by the btree are

	1925 ** the right size. This is to guard against size changes that result

	1926 ** when compiling on a different architecture.

	1927 */

	1928 assert( sizeof(i64)==8 || sizeof(i64)==4 );

	1929 assert( sizeof(u64)==8 || sizeof(u64)==4 );

	1930 assert( sizeof(u32)==4 );

	1931 assert( sizeof(u16)==2 );

	1932 assert( sizeof(Pgno)==4 );

	1933

	1934 pBt = sqlite3MallocZero( sizeof(*pBt) );

	1935 if( pBt==0 ){

	1936 rc = SQLITE_NOMEM;

	1937 goto btree_open_out;

	1938 }

	1939 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,

	1940 EXTRA_SIZE, flags, vfsFlags, pageReinit);

	1941 if( rc==SQLITE_OK ){

	1942 sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);

	1943 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);

	1944 }

	1945 if( rc!=SQLITE_OK ){

	1946 goto btree_open_out;

	1947 }

	1948 pBt->openFlags = (u8)flags;

	1949 pBt->db = db;

	1950 sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);

	1951 p->pBt = pBt;

	1952

	1953 pBt->pCursor = 0;

	1954 pBt->pPage1 = 0;

	1955 if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;

	1956 #ifdef SQLITE_SECURE_DELETE

	1957 pBt->btsFlags |= BTS_SECURE_DELETE;

	1958 #endif

	1959 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);

	1960 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE

	1961 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){

	1962 pBt->pageSize = 0;

	1963 #ifndef SQLITE_OMIT_AUTOVACUUM

	1964 /* If the magic name ":memory:" will create an in-memory database, then

	1965 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if

	1966 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if

	1967 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a

	1968 ** regular file-name. In this case the auto-vacuum applies as per normal.

	1969 */

	1970 if( zFilename && !isMemdb ){

	1971 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);

	1972 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);

	1973 }

	1974 #endif

	1975 nReserve = 0;

	1976 }else{

	1977 nReserve = zDbHeader[20];

	1978 pBt->btsFlags |= BTS_PAGESIZE_FIXED;

	1979 #ifndef SQLITE_OMIT_AUTOVACUUM

	1980 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);

	1981 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);

	1982 #endif

	1983 }

	1984 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);

	1985 if( rc ) goto btree_open_out;

	1986 pBt->usableSize = pBt->pageSize - nReserve;

	1987 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */

	1988

	1989 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)

	1990 /* Add the new BtShared object to the linked list sharable BtShareds.

	1991 */

	1992 if( p->sharable ){

	1993 MUTEX_LOGIC( sqlite3_mutex *mutexShared; )

	1994 pBt->nRef = 1;

	1995 MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)

	1996 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){

	1997 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);

	1998 if( pBt->mutex==0 ){

	1999 rc = SQLITE_NOMEM;

	2000 db->mallocFailed = 0;

	2001 goto btree_open_out;

	2002 }

	2003 }

	2004 sqlite3_mutex_enter(mutexShared);

	2005 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);

	2006 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;

	2007 sqlite3_mutex_leave(mutexShared);

	2008 }

	2009 #endif

	2010 }

	2011

	2012 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)

	2013 /* If the new Btree uses a sharable pBtShared, then link the new

	2014 ** Btree into the list of all sharable Btrees for the same connection.

	2015 ** The list is kept in ascending order by pBt address.

	2016 */

	2017 if( p->sharable ){

	2018 int i;

	2019 Btree *pSib;

	2020 for(i=0; i<db->nDb; i++){

	2021 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){

	2022 while( pSib->pPrev ){ pSib = pSib->pPrev; }

	2023 if( p->pBt<pSib->pBt ){

	2024 p->pNext = pSib;

	2025 p->pPrev = 0;

	2026 pSib->pPrev = p;

	2027 }else{

	2028 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){

	2029 pSib = pSib->pNext;

	2030 }

	2031 p->pNext = pSib->pNext;

	2032 p->pPrev = pSib;

	2033 if( p->pNext ){

	2034 p->pNext->pPrev = p;

	2035 }

	2036 pSib->pNext = p;

	2037 }

	2038 break;

	2039 }

	2040 }

	2041 }

	2042 #endif

	2043 *ppBtree = p;

	2044

	2045 btree_open_out:

	2046 if( rc!=SQLITE_OK ){

	2047 if( pBt && pBt->pPager ){

	2048 sqlite3PagerClose(pBt->pPager);

	2049 }

	2050 sqlite3_free(pBt);

	2051 sqlite3_free(p);

	2052 *ppBtree = 0;

	2053 }else{

	2054 /* If the B-Tree was successfully opened, set the pager-cache size to the

	2055 ** default value. Except, when opening on an existing shared pager-cache,

	2056 ** do not change the pager-cache size.

	2057 */

	2058 if( sqlite3BtreeSchema(p, 0, 0)==0 ){

	2059 sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);

	2060 }

	2061 }

	2062 if( mutexOpen ){

	2063 assert( sqlite3_mutex_held(mutexOpen) );

	2064 sqlite3_mutex_leave(mutexOpen);

	2065 }

	2066 return rc;

	2067 }

	2068

	2069 /*

	2070 ** Decrement the BtShared.nRef counter. When it reaches zero,

	2071 ** remove the BtShared structure from the sharing list. Return

	2072 ** true if the BtShared.nRef counter reaches zero and return

	2073 ** false if it is still positive.

	2074 */

	2075 static int removeFromSharingList(BtShared *pBt){

	2076 #ifndef SQLITE_OMIT_SHARED_CACHE

	2077 MUTEX_LOGIC( sqlite3_mutex *pMaster; )

	2078 BtShared *pList;

	2079 int removed = 0;

	2080

	2081 assert( sqlite3_mutex_notheld(pBt->mutex) );

	2082 MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )

	2083 sqlite3_mutex_enter(pMaster);

	2084 pBt->nRef--;

	2085 if( pBt->nRef<=0 ){

	2086 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){

	2087 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;

	2088 }else{

	2089 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);

	2090 while( ALWAYS(pList) && pList->pNext!=pBt ){

	2091 pList=pList->pNext;

	2092 }

	2093 if( ALWAYS(pList) ){

	2094 pList->pNext = pBt->pNext;

	2095 }

	2096 }

	2097 if( SQLITE_THREADSAFE ){

	2098 sqlite3_mutex_free(pBt->mutex);

	2099 }

	2100 removed = 1;

	2101 }

	2102 sqlite3_mutex_leave(pMaster);

	2103 return removed;

	2104 #else

	2105 return 1;

	2106 #endif

	2107 }

	2108

	2109 /*

	2110 ** Make sure pBt->pTmpSpace points to an allocation of

	2111 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child

	2112 ** pointer.

	2113 */

	2114 static void allocateTempSpace(BtShared *pBt){

	2115 if( !pBt->pTmpSpace ){

	2116 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );

	2117

	2118 /* One of the uses of pBt->pTmpSpace is to format cells before

	2119 ** inserting them into a leaf page (function fillInCell()). If

	2120 ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes

	2121 ** by the various routines that manipulate binary cells. Which

	2122 ** can mean that fillInCell() only initializes the first 2 or 3

	2123 ** bytes of pTmpSpace, but that the first 4 bytes are copied from

	2124 ** it into a database page. This is not actually a problem, but it

	2125 ** does cause a valgrind error when the 1 or 2 bytes of unitialized

	2126 ** data is passed to system call write(). So to avoid this error,

	2127 ** zero the first 4 bytes of temp space here.

	2128 **

	2129 ** Also: Provide four bytes of initialized space before the

	2130 ** beginning of pTmpSpace as an area available to prepend the

	2131 ** left-child pointer to the beginning of a cell.

	2132 */

	2133 if( pBt->pTmpSpace ){

	2134 memset(pBt->pTmpSpace, 0, 8);

	2135 pBt->pTmpSpace += 4;

	2136 }

	2137 }

	2138 }

	2139

	2140 /*

	2141 ** Free the pBt->pTmpSpace allocation

	2142 */

	2143 static void freeTempSpace(BtShared *pBt){

	2144 if( pBt->pTmpSpace ){

	2145 pBt->pTmpSpace -= 4;

	2146 sqlite3PageFree(pBt->pTmpSpace);

	2147 pBt->pTmpSpace = 0;

	2148 }

	2149 }

	2150

	2151 /*

	2152 ** Close an open database and invalidate all cursors.

	2153 */

	2154 int sqlite3BtreeClose(Btree *p){

	2155 BtShared *pBt = p->pBt;

	2156 BtCursor *pCur;

	2157

	2158 /* Close all cursors opened via this handle. */

	2159 assert( sqlite3_mutex_held(p->db->mutex) );

	2160 sqlite3BtreeEnter(p);

	2161 pCur = pBt->pCursor;

	2162 while( pCur ){

	2163 BtCursor *pTmp = pCur;

	2164 pCur = pCur->pNext;

	2165 if( pTmp->pBtree==p ){

	2166 sqlite3BtreeCloseCursor(pTmp);

	2167 }

	2168 }

	2169

	2170 /* Rollback any active transaction and free the handle structure.

	2171 ** The call to sqlite3BtreeRollback() drops any table-locks held by

	2172 ** this handle.

	2173 */

	2174 sqlite3BtreeRollback(p, SQLITE_OK, 0);

	2175 sqlite3BtreeLeave(p);

	2176

	2177 /* If there are still other outstanding references to the shared-btree

	2178 ** structure, return now. The remainder of this procedure cleans

	2179 ** up the shared-btree.

	2180 */

	2181 assert( p->wantToLock==0 && p->locked==0 );

	2182 if( !p->sharable || removeFromSharingList(pBt) ){

	2183 /* The pBt is no longer on the sharing list, so we can access

	2184 ** it without having to hold the mutex.

	2185 **

	2186 ** Clean out and delete the BtShared object.

	2187 */

	2188 assert( !pBt->pCursor );

	2189 sqlite3PagerClose(pBt->pPager);

	2190 if( pBt->xFreeSchema && pBt->pSchema ){

	2191 pBt->xFreeSchema(pBt->pSchema);

	2192 }

	2193 sqlite3DbFree(0, pBt->pSchema);

	2194 freeTempSpace(pBt);

	2195 sqlite3_free(pBt);

	2196 }

	2197

	2198 #ifndef SQLITE_OMIT_SHARED_CACHE

	2199 assert( p->wantToLock==0 );

	2200 assert( p->locked==0 );

	2201 if( p->pPrev ) p->pPrev->pNext = p->pNext;

	2202 if( p->pNext ) p->pNext->pPrev = p->pPrev;

	2203 #endif

	2204

	2205 sqlite3_free(p);

	2206 return SQLITE_OK;

	2207 }

	2208

	2209 /*

	2210 ** Change the limit on the number of pages allowed in the cache.

	2211 **

	2212 ** The maximum number of cache pages is set to the absolute

	2213 ** value of mxPage. If mxPage is negative, the pager will

	2214 ** operate asynchronously - it will not stop to do fsync()s

	2215 ** to insure data is written to the disk surface before

	2216 ** continuing. Transactions still work if synchronous is off,

	2217 ** and the database cannot be corrupted if this program

	2218 ** crashes. But if the operating system crashes or there is

	2219 ** an abrupt power failure when synchronous is off, the database

	2220 ** could be left in an inconsistent and unrecoverable state.

	2221 ** Synchronous is on by default so database corruption is not

	2222 ** normally a worry.

	2223 */

	2224 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){

	2225 BtShared *pBt = p->pBt;

	2226 assert( sqlite3_mutex_held(p->db->mutex) );

	2227 sqlite3BtreeEnter(p);

	2228 sqlite3PagerSetCachesize(pBt->pPager, mxPage);

	2229 sqlite3BtreeLeave(p);

	2230 return SQLITE_OK;

	2231 }

	2232

	2233 #if SQLITE_MAX_MMAP_SIZE>0

	2234 /*

	2235 ** Change the limit on the amount of the database file that may be

	2236 ** memory mapped.

	2237 */

	2238 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){

	2239 BtShared *pBt = p->pBt;

	2240 assert( sqlite3_mutex_held(p->db->mutex) );

	2241 sqlite3BtreeEnter(p);

	2242 sqlite3PagerSetMmapLimit(pBt->pPager, szMmap);

	2243 sqlite3BtreeLeave(p);

	2244 return SQLITE_OK;

	2245 }

	2246 #endif /* SQLITE_MAX_MMAP_SIZE>0 */

	2247

	2248 /*

	2249 ** Change the way data is synced to disk in order to increase or decrease

	2250 ** how well the database resists damage due to OS crashes and power

	2251 ** failures. Level 1 is the same as asynchronous (no syncs() occur and

	2252 ** there is a high probability of damage) Level 2 is the default. There

	2253 ** is a very low but non-zero probability of damage. Level 3 reduces the

	2254 ** probability of damage to near zero but with a write performance reduction.

	2255 */

	2256 #ifndef SQLITE_OMIT_PAGER_PRAGMAS

	2257 int sqlite3BtreeSetPagerFlags(

	2258 Btree *p, /* The btree to set the safety level on */

	2259 unsigned pgFlags /* Various PAGER_* flags */

	2260 ){

	2261 BtShared *pBt = p->pBt;

	2262 assert( sqlite3_mutex_held(p->db->mutex) );

	2263 sqlite3BtreeEnter(p);

	2264 sqlite3PagerSetFlags(pBt->pPager, pgFlags);

	2265 sqlite3BtreeLeave(p);

	2266 return SQLITE_OK;

	2267 }

	2268 #endif

	2269

	2270 /*

	2271 ** Return TRUE if the given btree is set to safety level 1. In other

	2272 ** words, return TRUE if no sync() occurs on the disk files.

	2273 */

	2274 int sqlite3BtreeSyncDisabled(Btree *p){

	2275 BtShared *pBt = p->pBt;

	2276 int rc;

	2277 assert( sqlite3_mutex_held(p->db->mutex) );

	2278 sqlite3BtreeEnter(p);

	2279 assert( pBt && pBt->pPager );

	2280 rc = sqlite3PagerNosync(pBt->pPager);

	2281 sqlite3BtreeLeave(p);

	2282 return rc;

	2283 }

	2284

	2285 /*

	2286 ** Change the default pages size and the number of reserved bytes per page.

	2287 ** Or, if the page size has already been fixed, return SQLITE_READONLY

	2288 ** without changing anything.

	2289 **

	2290 ** The page size must be a power of 2 between 512 and 65536. If the page

	2291 ** size supplied does not meet this constraint then the page size is not

	2292 ** changed.

	2293 **

	2294 ** Page sizes are constrained to be a power of two so that the region

	2295 ** of the database file used for locking (beginning at PENDING_BYTE,

	2296 ** the first byte past the 1GB boundary, 0x40000000) needs to occur

	2297 ** at the beginning of a page.

	2298 **

	2299 ** If parameter nReserve is less than zero, then the number of reserved

	2300 ** bytes per page is left unchanged.

	2301 **

	2302 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size

	2303 ** and autovacuum mode can no longer be changed.

	2304 */

	2305 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){

	2306 int rc = SQLITE_OK;

	2307 BtShared *pBt = p->pBt;

	2308 assert( nReserve>=-1 && nReserve<=255 );

	2309 sqlite3BtreeEnter(p);

	2310 if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){

	2311 sqlite3BtreeLeave(p);

	2312 return SQLITE_READONLY;

	2313 }

	2314 if( nReserve<0 ){

	2315 nReserve = pBt->pageSize - pBt->usableSize;

	2316 }

	2317 assert( nReserve>=0 && nReserve<=255 );

	2318 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&

	2319 ((pageSize-1)&pageSize)==0 ){

	2320 assert( (pageSize & 7)==0 );

	2321 assert( !pBt->pPage1 && !pBt->pCursor );

	2322 pBt->pageSize = (u32)pageSize;

	2323 freeTempSpace(pBt);

	2324 }

	2325 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);

	2326 pBt->usableSize = pBt->pageSize - (u16)nReserve;

	2327 if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;

	2328 sqlite3BtreeLeave(p);

	2329 return rc;

	2330 }

	2331

	2332 /*

	2333 ** Return the currently defined page size

	2334 */

	2335 int sqlite3BtreeGetPageSize(Btree *p){

	2336 return p->pBt->pageSize;

	2337 }

	2338

	2339 #if defined(SQLITE_HAS_CODEC) || defined(SQLITE_DEBUG)

	2340 /*

	2341 ** This function is similar to sqlite3BtreeGetReserve(), except that it

	2342 ** may only be called if it is guaranteed that the b-tree mutex is already

	2343 ** held.

	2344 **

	2345 ** This is useful in one special case in the backup API code where it is

	2346 ** known that the shared b-tree mutex is held, but the mutex on the

	2347 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()

	2348 ** were to be called, it might collide with some other operation on the

	2349 ** database handle that owns *p, causing undefined behavior.

	2350 */

	2351 int sqlite3BtreeGetReserveNoMutex(Btree *p){

	2352 assert( sqlite3_mutex_held(p->pBt->mutex) );

	2353 return p->pBt->pageSize - p->pBt->usableSize;

	2354 }

	2355 #endif /* SQLITE_HAS_CODEC || SQLITE_DEBUG */

	2356

	2357 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)

	2358 /*

	2359 ** Return the number of bytes of space at the end of every page that

	2360 ** are intentually left unused. This is the "reserved" space that is

	2361 ** sometimes used by extensions.

	2362 */

	2363 int sqlite3BtreeGetReserve(Btree *p){

	2364 int n;

	2365 sqlite3BtreeEnter(p);

	2366 n = p->pBt->pageSize - p->pBt->usableSize;

	2367 sqlite3BtreeLeave(p);

	2368 return n;

	2369 }

	2370

	2371 /*

	2372 ** Set the maximum page count for a database if mxPage is positive.

	2373 ** No changes are made if mxPage is 0 or negative.

	2374 ** Regardless of the value of mxPage, return the maximum page count.

	2375 */

	2376 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){

	2377 int n;

	2378 sqlite3BtreeEnter(p);

	2379 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);

	2380 sqlite3BtreeLeave(p);

	2381 return n;

	2382 }

	2383

	2384 /*

	2385 ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1. If newFlag is -1,

	2386 ** then make no changes. Always return the value of the BTS_SECURE_DELETE

	2387 ** setting after the change.

	2388 */

	2389 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){

	2390 int b;

	2391 if( p==0 ) return 0;

	2392 sqlite3BtreeEnter(p);

	2393 if( newFlag>=0 ){

	2394 p->pBt->btsFlags &= ~BTS_SECURE_DELETE;

	2395 if( newFlag ) p->pBt->btsFlags |= BTS_SECURE_DELETE;

	2396 }

	2397 b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;

	2398 sqlite3BtreeLeave(p);

	2399 return b;

	2400 }

	2401 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */

	2402

	2403 /*

	2404 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'

	2405 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it

	2406 ** is disabled. The default value for the auto-vacuum property is

	2407 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.

	2408 */

	2409 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){

	2410 #ifdef SQLITE_OMIT_AUTOVACUUM

	2411 return SQLITE_READONLY;

	2412 #else

	2413 BtShared *pBt = p->pBt;

	2414 int rc = SQLITE_OK;

	2415 u8 av = (u8)autoVacuum;

	2416

	2417 sqlite3BtreeEnter(p);

	2418 if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){

	2419 rc = SQLITE_READONLY;

	2420 }else{

	2421 pBt->autoVacuum = av ?1:0;

	2422 pBt->incrVacuum = av==2 ?1:0;

	2423 }

	2424 sqlite3BtreeLeave(p);

	2425 return rc;

	2426 #endif

	2427 }

	2428

	2429 /*

	2430 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is

	2431 ** enabled 1 is returned. Otherwise 0.

	2432 */

	2433 int sqlite3BtreeGetAutoVacuum(Btree *p){

	2434 #ifdef SQLITE_OMIT_AUTOVACUUM

	2435 return BTREE_AUTOVACUUM_NONE;

	2436 #else

	2437 int rc;

	2438 sqlite3BtreeEnter(p);

	2439 rc = (

	2440 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:

	2441 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:

	2442 BTREE_AUTOVACUUM_INCR

	2443 );

	2444 sqlite3BtreeLeave(p);

	2445 return rc;

	2446 #endif

	2447 }

	2448

	2449

	2450 /*

	2451 ** Get a reference to pPage1 of the database file. This will

	2452 ** also acquire a readlock on that file.

	2453 **

	2454 ** SQLITE_OK is returned on success. If the file is not a

	2455 ** well-formed database file, then SQLITE_CORRUPT is returned.

	2456 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM

	2457 ** is returned if we run out of memory.

	2458 */

	2459 static int lockBtree(BtShared *pBt){

	2460 int rc; /* Result code from subfunctions */

	2461 MemPage *pPage1; /* Page 1 of the database file */

	2462 int nPage; /* Number of pages in the database */

	2463 int nPageFile = 0; /* Number of pages in the database file */

	2464 int nPageHeader; /* Number of pages in the database according to hdr */

	2465

	2466 assert( sqlite3_mutex_held(pBt->mutex) );

	2467 assert( pBt->pPage1==0 );

	2468 rc = sqlite3PagerSharedLock(pBt->pPager);

	2469 if( rc!=SQLITE_OK ) return rc;

	2470 rc = btreeGetPage(pBt, 1, &pPage1, 0);

	2471 if( rc!=SQLITE_OK ) return rc;

	2472

	2473 /* Do some checking to help insure the file we opened really is

	2474 ** a valid database file.

	2475 */

	2476 nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);

	2477 sqlite3PagerPagecount(pBt->pPager, &nPageFile);

	2478 if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){

	2479 nPage = nPageFile;

	2480 }

	2481 if( nPage>0 ){

	2482 u32 pageSize;

	2483 u32 usableSize;

	2484 u8 *page1 = pPage1->aData;

	2485 rc = SQLITE_NOTADB;

	2486 if( memcmp(page1, zMagicHeader, 16)!=0 ){

	2487 goto page1_init_failed;

	2488 }

	2489

	2490 #ifdef SQLITE_OMIT_WAL

	2491 if( page1[18]>1 ){

	2492 pBt->btsFlags |= BTS_READ_ONLY;

	2493 }

	2494 if( page1[19]>1 ){

	2495 goto page1_init_failed;

	2496 }

	2497 #else

	2498 if( page1[18]>2 ){

	2499 pBt->btsFlags |= BTS_READ_ONLY;

	2500 }

	2501 if( page1[19]>2 ){

	2502 goto page1_init_failed;

	2503 }

	2504

	2505 /* If the write version is set to 2, this database should be accessed

	2506 ** in WAL mode. If the log is not already open, open it now. Then

	2507 ** return SQLITE_OK and return without populating BtShared.pPage1.

	2508 ** The caller detects this and calls this function again. This is

	2509 ** required as the version of page 1 currently in the page1 buffer

	2510 ** may not be the latest version - there may be a newer one in the log

	2511 ** file.

	2512 */

	2513 if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){

	2514 int isOpen = 0;

	2515 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);

	2516 if( rc!=SQLITE_OK ){

	2517 goto page1_init_failed;

	2518 }else if( isOpen==0 ){

	2519 releasePage(pPage1);

	2520 return SQLITE_OK;

	2521 }

	2522 rc = SQLITE_NOTADB;

	2523 }

	2524 #endif

	2525

	2526 /* The maximum embedded fraction must be exactly 25%. And the minimum

	2527 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.

	2528 ** The original design allowed these amounts to vary, but as of

	2529 ** version 3.6.0, we require them to be fixed.

	2530 */

	2531 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){

	2532 goto page1_init_failed;

	2533 }

	2534 pageSize = (page1[16]<<8) | (page1[17]<<16);

	2535 if( ((pageSize-1)&pageSize)!=0

	2536 || pageSize>SQLITE_MAX_PAGE_SIZE

	2537 || pageSize<=256

	2538 ){

	2539 goto page1_init_failed;

	2540 }

	2541 assert( (pageSize & 7)==0 );

	2542 usableSize = pageSize - page1[20];

	2543 if( (u32)pageSize!=pBt->pageSize ){

	2544 /* After reading the first page of the database assuming a page size

	2545 ** of BtShared.pageSize, we have discovered that the page-size is

	2546 ** actually pageSize. Unlock the database, leave pBt->pPage1 at

	2547 ** zero and return SQLITE_OK. The caller will call this function

	2548 ** again with the correct page-size.

	2549 */

	2550 releasePage(pPage1);

	2551 pBt->usableSize = usableSize;

	2552 pBt->pageSize = pageSize;

	2553 freeTempSpace(pBt);

	2554 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,

	2555 pageSize-usableSize);

	2556 return rc;

	2557 }

	2558 if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){

	2559 rc = SQLITE_CORRUPT_BKPT;

	2560 goto page1_init_failed;

	2561 }

	2562 if( usableSize<480 ){

	2563 goto page1_init_failed;

	2564 }

	2565 pBt->pageSize = pageSize;

	2566 pBt->usableSize = usableSize;

	2567 #ifndef SQLITE_OMIT_AUTOVACUUM

	2568 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);

	2569 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);

	2570 #endif

	2571 }

	2572

	2573 /* maxLocal is the maximum amount of payload to store locally for

	2574 ** a cell. Make sure it is small enough so that at least minFanout

	2575 ** cells will fit on one page. We assume a 10-byte page header.

	2576 ** Besides the payload, the cell must store:

	2577 ** 2-byte pointer to the cell

	2578 ** 4-byte child pointer

	2579 ** 9-byte nKey value

	2580 ** 4-byte nData value

	2581 ** 4-byte overflow page pointer

	2582 ** So a cell consists of a 2-byte pointer, a header which is as much as

	2583 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow

	2584 ** page pointer.

	2585 */

	2586 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);

	2587 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);

	2588 pBt->maxLeaf = (u16)(pBt->usableSize - 35);

	2589 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);

	2590 if( pBt->maxLocal>127 ){

	2591 pBt->max1bytePayload = 127;

	2592 }else{

	2593 pBt->max1bytePayload = (u8)pBt->maxLocal;

	2594 }

	2595 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );

	2596 pBt->pPage1 = pPage1;

	2597 pBt->nPage = nPage;

	2598 return SQLITE_OK;

	2599

	2600 page1_init_failed:

	2601 releasePage(pPage1);

	2602 pBt->pPage1 = 0;

	2603 return rc;

	2604 }

	2605

	2606 #ifndef NDEBUG

	2607 /*

	2608 ** Return the number of cursors open on pBt. This is for use

	2609 ** in assert() expressions, so it is only compiled if NDEBUG is not

	2610 ** defined.

	2611 **

	2612 ** Only write cursors are counted if wrOnly is true. If wrOnly is

	2613 ** false then all cursors are counted.

	2614 **

	2615 ** For the purposes of this routine, a cursor is any cursor that

	2616 ** is capable of reading or writing to the database. Cursors that

	2617 ** have been tripped into the CURSOR_FAULT state are not counted.

	2618 */

	2619 static int countValidCursors(BtShared *pBt, int wrOnly){

	2620 BtCursor *pCur;

	2621 int r = 0;

	2622 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){

	2623 if( (wrOnly==0 \|\| (pCur->curFlags & BTCF_WriteFlag)!=0)

	2624 && pCur->eState!=CURSOR_FAULT ) r++;

	2625 }

	2626 return r;

	2627 }

	2628 #endif

	2629

	2630 /*

	2631 ** If there are no outstanding cursors and we are not in the middle

	2632 ** of a transaction but there is a read lock on the database, then

	2633 ** this routine unrefs the first page of the database file which

	2634 ** has the effect of releasing the read lock.

	2635 **

	2636 ** If there is a transaction in progress, this routine is a no-op.

	2637 */

	2638 static void unlockBtreeIfUnused(BtShared *pBt){

	2639 assert( sqlite3_mutex_held(pBt->mutex) );

	2640 assert( countValidCursors(pBt,0)==0 \|\| pBt->inTransaction>TRANS_NONE );

	2641 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){

	2642 MemPage *pPage1 = pBt->pPage1;

	2643 assert( pPage1->aData );

	2644 assert( sqlite3PagerRefcount(pBt->pPager)==1 );

	2645 pBt->pPage1 = 0;

	2646 releasePage(pPage1);

	2647 }

	2648 }

	2649

	2650 /*

	2651 ** If pBt points to an empty file then convert that empty file

	2652 ** into a new empty database by initializing the first page of

	2653 ** the database.

	2654 */

	2655 static int newDatabase(BtShared *pBt){

	2656 MemPage *pP1;

	2657 unsigned char *data;

	2658 int rc;

	2659

	2660 assert( sqlite3_mutex_held(pBt->mutex) );

	2661 if( pBt->nPage>0 ){

	2662 return SQLITE_OK;

	2663 }

	2664 pP1 = pBt->pPage1;

	2665 assert( pP1!=0 );

	2666 data = pP1->aData;

	2667 rc = sqlite3PagerWrite(pP1->pDbPage);

	2668 if( rc ) return rc;

	2669 memcpy(data, zMagicHeader, sizeof(zMagicHeader));

	2670 assert( sizeof(zMagicHeader)==16 );

	2671 data[16] = (u8)((pBt->pageSize>>8)&0xff);

	2672 data[17] = (u8)((pBt->pageSize>>16)&0xff);

	2673 data[18] = 1;

	2674 data[19] = 1;

	2675 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);

	2676 data[20] = (u8)(pBt->pageSize - pBt->usableSize);

	2677 data[21] = 64;

	2678 data[22] = 32;

	2679 data[23] = 32;

	2680 memset(&data[24], 0, 100-24);

	2681 zeroPage(pP1, PTF_INTKEY\|PTF_LEAF\|PTF_LEAFDATA );

	2682 pBt->btsFlags \|= BTS_PAGESIZE_FIXED;

	2683 #ifndef SQLITE_OMIT_AUTOVACUUM

	2684 assert( pBt->autoVacuum==1 \|\| pBt->autoVacuum==0 );

	2685 assert( pBt->incrVacuum==1 \|\| pBt->incrVacuum==0 );

	2686 put4byte(&data[36 + 4*4], pBt->autoVacuum);

	2687 put4byte(&data[36 + 7*4], pBt->incrVacuum);

	2688 #endif

	2689 pBt->nPage = 1;

	2690 data[31] = 1;

	2691 return SQLITE_OK;

	2692 }

	2693

	2694 /*

	2695 ** Initialize the first page of the database file (creating a database

	2696 ** consisting of a single page and no schema objects). Return SQLITE_OK

	2697 ** if successful, or an SQLite error code otherwise.

	2698 */

	2699 int sqlite3BtreeNewDb(Btree *p){

	2700 int rc;

	2701 sqlite3BtreeEnter(p);

	2702 p->pBt->nPage = 0;

	2703 rc = newDatabase(p->pBt);

	2704 sqlite3BtreeLeave(p);

	2705 return rc;

	2706 }

	2707

	2708 /*

	2709 ** Attempt to start a new transaction. A write-transaction

	2710 ** is started if the second argument is nonzero, otherwise a read-

	2711 ** transaction. If the second argument is 2 or more and exclusive

	2712 ** transaction is started, meaning that no other process is allowed

	2713 ** to access the database. A preexisting transaction may not be

	2714 ** upgraded to exclusive by calling this routine a second time - the

	2715 ** exclusivity flag only works for a new transaction.

	2716 **

	2717 ** A write-transaction must be started before attempting any

	2718 ** changes to the database. None of the following routines

	2719 ** will work unless a transaction is started first:

	2720 **

	2721 ** sqlite3BtreeCreateTable()

	2722 ** sqlite3BtreeCreateIndex()

	2723 ** sqlite3BtreeClearTable()

	2724 ** sqlite3BtreeDropTable()

	2725 ** sqlite3BtreeInsert()

	2726 ** sqlite3BtreeDelete()

	2727 ** sqlite3BtreeUpdateMeta()

	2728 **

	2729 ** If an initial attempt to acquire the lock fails because of lock contention

	2730 ** and the database was previously unlocked, then invoke the busy handler

	2731 ** if there is one. But if there was previously a read-lock, do not

	2732 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is

	2733 ** returned when there is already a read-lock in order to avoid a deadlock.

	2734 **

	2735 ** Suppose there are two processes A and B. A has a read lock and B has

	2736 ** a reserved lock. B tries to promote to exclusive but is blocked because

	2737 ** of A's read lock. A tries to promote to reserved but is blocked by B.

	2738 ** One or the other of the two processes must give way or there can be

	2739 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback

	2740 ** when A already has a read lock, we encourage A to give up and let B

	2741 ** proceed.

	2742 */

	2743 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){

	2744 sqlite3 *pBlock = 0;

	2745 BtShared *pBt = p->pBt;

	2746 int rc = SQLITE_OK;

	2747

	2748 sqlite3BtreeEnter(p);

	2749 btreeIntegrity(p);

	2750

	2751 /* If the btree is already in a write-transaction, or it

	2752 ** is already in a read-transaction and a read-transaction

	2753 ** is requested, this is a no-op.

	2754 */

	2755 if( p->inTrans==TRANS_WRITE \|\| (p->inTrans==TRANS_READ && !wrflag) ){

	2756 goto trans_begun;

	2757 }

	2758 assert( pBt->inTransaction==TRANS_WRITE \|\| IfNotOmitAV(pBt->bDoTruncate)==0 );

	2759

	2760 /* Write transactions are not possible on a read-only database */

	2761 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){

	2762 rc = SQLITE_READONLY;

	2763 goto trans_begun;

	2764 }

	2765

	2766 #ifndef SQLITE_OMIT_SHARED_CACHE

	2767 /* If another database handle has already opened a write transaction

	2768 ** on this shared-btree structure and a second write transaction is

	2769 ** requested, return SQLITE_LOCKED.

	2770 */

	2771 if( (wrflag && pBt->inTransaction==TRANS_WRITE)

	2772 \|\| (pBt->btsFlags & BTS_PENDING)!=0

	2773 ){

	2774 pBlock = pBt->pWriter->db;

	2775 }else if( wrflag>1 ){

	2776 BtLock *pIter;

	2777 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){

	2778 if( pIter->pBtree!=p ){

	2779 pBlock = pIter->pBtree->db;

	2780 break;

	2781 }

	2782 }

	2783 }

	2784 if( pBlock ){

	2785 sqlite3ConnectionBlocked(p->db, pBlock);

	2786 rc = SQLITE_LOCKED_SHAREDCACHE;

	2787 goto trans_begun;

	2788 }

	2789 #endif

	2790

	2791 /* Any read-only or read-write transaction implies a read-lock on

	2792 ** page 1. So if some other shared-cache client already has a write-lock

	2793 ** on page 1, the transaction cannot be opened. */

	2794 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);

	2795 if( SQLITE_OK!=rc ) goto trans_begun;

	2796

	2797 pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;

	2798 if( pBt->nPage==0 ) pBt->btsFlags \|= BTS_INITIALLY_EMPTY;

	2799 do {

	2800 /* Call lockBtree() until either pBt->pPage1 is populated or

	2801 ** lockBtree() returns something other than SQLITE_OK. lockBtree()

	2802 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after

	2803 ** reading page 1 it discovers that the page-size of the database

	2804 ** file is not pBt->pageSize. In this case lockBtree() will update

	2805 ** pBt->pageSize to the page-size of the file on disk.

	2806 */

	2807 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

	2808

	2809 if( rc==SQLITE_OK && wrflag ){

	2810 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){

	2811 rc = SQLITE_READONLY;

	2812 }else{

	2813 rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));

	2814 if( rc==SQLITE_OK ){

	2815 rc = newDatabase(pBt);

	2816 }

	2817 }

	2818 }

	2819

	2820 if( rc!=SQLITE_OK ){

	2821 unlockBtreeIfUnused(pBt);

	2822 }

	2823 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&

	2824 btreeInvokeBusyHandler(pBt) );

	2825

	2826 if( rc==SQLITE_OK ){

	2827 if( p->inTrans==TRANS_NONE ){

	2828 pBt->nTransaction++;

	2829 #ifndef SQLITE_OMIT_SHARED_CACHE

	2830 if( p->sharable ){

	2831 assert( p->lock.pBtree==p && p->lock.iTable==1 );

	2832 p->lock.eLock = READ_LOCK;

	2833 p->lock.pNext = pBt->pLock;

	2834 pBt->pLock = &p->lock;

	2835 }

	2836 #endif

	2837 }

	2838 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);

	2839 if( p->inTrans>pBt->inTransaction ){

	2840 pBt->inTransaction = p->inTrans;

	2841 }

	2842 if( wrflag ){

	2843 MemPage *pPage1 = pBt->pPage1;

	2844 #ifndef SQLITE_OMIT_SHARED_CACHE

	2845 assert( !pBt->pWriter );

	2846 pBt->pWriter = p;

	2847 pBt->btsFlags &= ~BTS_EXCLUSIVE;

	2848 if( wrflag>1 ) pBt->btsFlags \|= BTS_EXCLUSIVE;

	2849 #endif

	2850

	2851 /* If the db-size header field is incorrect (as it may be if an old

	2852 ** client has been writing the database file), update it now. Doing

	2853 ** this sooner rather than later means the database size can safely

	2854 ** re-read the database size from page 1 if a savepoint or transaction

	2855 ** rollback occurs within the transaction.

	2856 */

	2857 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){

	2858 rc = sqlite3PagerWrite(pPage1->pDbPage);

	2859 if( rc==SQLITE_OK ){

	2860 put4byte(&pPage1->aData[28], pBt->nPage);

	2861 }

	2862 }

	2863 }

	2864 }

	2865

	2866

	2867 trans_begun:

	2868 if( rc==SQLITE_OK && wrflag ){

	2869 /* This call makes sure that the pager has the correct number of

	2870 ** open savepoints. If the second parameter is greater than 0 and

	2871 ** the sub-journal is not already open, then it will be opened here.

	2872 */

	2873 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);

	2874 }

	2875

	2876 btreeIntegrity(p);

	2877 sqlite3BtreeLeave(p);

	2878 return rc;

	2879 }

	2880

	2881 #ifndef SQLITE_OMIT_AUTOVACUUM

	2882

	2883 /*

	2884 ** Set the pointer-map entries for all children of page pPage. Also, if

	2885 ** pPage contains cells that point to overflow pages, set the pointer

	2886 ** map entries for the overflow pages as well.

	2887 */

	2888 static int setChildPtrmaps(MemPage *pPage){

	2889 int i; /* Counter variable */

	2890 int nCell; /* Number of cells in page pPage */

	2891 int rc; /* Return code */

	2892 BtShared *pBt = pPage->pBt;

	2893 u8 isInitOrig = pPage->isInit;

	2894 Pgno pgno = pPage->pgno;

	2895

	2896 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	2897 rc = btreeInitPage(pPage);

	2898 if( rc!=SQLITE_OK ){

	2899 goto set_child_ptrmaps_out;

	2900 }

	2901 nCell = pPage->nCell;

	2902

	2903 for(i=0; i<nCell; i++){

	2904 u8 *pCell = findCell(pPage, i);

	2905

	2906 ptrmapPutOvflPtr(pPage, pCell, &rc);

	2907

	2908 if( !pPage->leaf ){

	2909 Pgno childPgno = get4byte(pCell);

	2910 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);

	2911 }

	2912 }

	2913

	2914 if( !pPage->leaf ){

	2915 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	2916 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);

	2917 }

	2918

	2919 set_child_ptrmaps_out:

	2920 pPage->isInit = isInitOrig;

	2921 return rc;

	2922 }

	2923

	2924 /*

	2925 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so

	2926 ** that it points to iTo. Parameter eType describes the type of pointer to

	2927 ** be modified, as follows:

	2928 **

	2929 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child

	2930 ** page of pPage.

	2931 **

	2932 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow

	2933 ** page pointed to by one of the cells on pPage.

	2934 **

	2935 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next

	2936 ** overflow page in the list.

	2937 */

	2938 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){

	2939 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	2940 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	2941 if( eType==PTRMAP_OVERFLOW2 ){

	2942 /* The pointer is always the first 4 bytes of the page in this case. */

	2943 if( get4byte(pPage->aData)!=iFrom ){

	2944 return SQLITE_CORRUPT_BKPT;

	2945 }

	2946 put4byte(pPage->aData, iTo);

	2947 }else{

	2948 u8 isInitOrig = pPage->isInit;

	2949 int i;

	2950 int nCell;

	2951

	2952 btreeInitPage(pPage);

	2953 nCell = pPage->nCell;

	2954

	2955 for(i=0; i<nCell; i++){

	2956 u8 *pCell = findCell(pPage, i);

	2957 if( eType==PTRMAP_OVERFLOW1 ){

	2958 CellInfo info;

	2959 btreeParseCellPtr(pPage, pCell, &info);

	2960 if( info.iOverflow

	2961 && pCell+info.iOverflow+3<=pPage->aData+pPage->maskPage

	2962 && iFrom==get4byte(&pCell[info.iOverflow])

	2963 ){

	2964 put4byte(&pCell[info.iOverflow], iTo);

	2965 break;

	2966 }

	2967 }else{

	2968 if( get4byte(pCell)==iFrom ){

	2969 put4byte(pCell, iTo);

	2970 break;

	2971 }

	2972 }

	2973 }

	2974

	2975 if( i==nCell ){

	2976 if( eType!=PTRMAP_BTREE \|\|

	2977 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){

	2978 return SQLITE_CORRUPT_BKPT;

	2979 }

	2980 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);

	2981 }

	2982

	2983 pPage->isInit = isInitOrig;

	2984 }

	2985 return SQLITE_OK;

	2986 }

	2987

	2988

	2989 /*

	2990 ** Move the open database page pDbPage to location iFreePage in the

	2991 ** database. The pDbPage reference remains valid.

	2992 **

	2993 ** The isCommit flag indicates that there is no need to remember that

	2994 ** the journal needs to be sync()ed before database page pDbPage->pgno

	2995 ** can be written to. The caller has already promised not to write to that

	2996 ** page.

	2997 */

	2998 static int relocatePage(

	2999 BtShared pBt, / Btree */

	3000 MemPage pDbPage, / Open page to move */

	3001 u8 eType, /* Pointer map 'type' entry for pDbPage */

	3002 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */

	3003 Pgno iFreePage, /* The location to move pDbPage to */

	3004 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */

	3005 ){

	3006 MemPage pPtrPage; / The page that contains a pointer to pDbPage */

	3007 Pgno iDbPage = pDbPage->pgno;

	3008 Pager *pPager = pBt->pPager;

	3009 int rc;

	3010

	3011 assert( eType==PTRMAP_OVERFLOW2 \|\| eType==PTRMAP_OVERFLOW1 \|\|

	3012 eType==PTRMAP_BTREE \|\| eType==PTRMAP_ROOTPAGE );

	3013 assert( sqlite3_mutex_held(pBt->mutex) );

	3014 assert( pDbPage->pBt==pBt );

	3015

	3016 /* Move page iDbPage from its current location to page number iFreePage */

	3017 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",

	3018 iDbPage, iFreePage, iPtrPage, eType));

	3019 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);

	3020 if( rc!=SQLITE_OK ){

	3021 return rc;

	3022 }

	3023 pDbPage->pgno = iFreePage;

	3024

	3025 /* If pDbPage was a btree-page, then it may have child pages and/or cells

	3026 ** that point to overflow pages. The pointer map entries for all these

	3027 ** pages need to be changed.

	3028 **

	3029 ** If pDbPage is an overflow page, then the first 4 bytes may store a

	3030 ** pointer to a subsequent overflow page. If this is the case, then

	3031 ** the pointer map needs to be updated for the subsequent overflow page.

	3032 */

	3033 if( eType==PTRMAP_BTREE \|\| eType==PTRMAP_ROOTPAGE ){

	3034 rc = setChildPtrmaps(pDbPage);

	3035 if( rc!=SQLITE_OK ){

	3036 return rc;

	3037 }

	3038 }else{

	3039 Pgno nextOvfl = get4byte(pDbPage->aData);

	3040 if( nextOvfl!=0 ){

	3041 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);

	3042 if( rc!=SQLITE_OK ){

	3043 return rc;

	3044 }

	3045 }

	3046 }

	3047

	3048 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so

	3049 ** that it points at iFreePage. Also fix the pointer map entry for

	3050 ** iPtrPage.

	3051 */

	3052 if( eType!=PTRMAP_ROOTPAGE ){

	3053 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);

	3054 if( rc!=SQLITE_OK ){

	3055 return rc;

	3056 }

	3057 rc = sqlite3PagerWrite(pPtrPage->pDbPage);

	3058 if( rc!=SQLITE_OK ){

	3059 releasePage(pPtrPage);

	3060 return rc;

	3061 }

	3062 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);

	3063 releasePage(pPtrPage);

	3064 if( rc==SQLITE_OK ){

	3065 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);

	3066 }

	3067 }

	3068 return rc;

	3069 }

	3070

	3071 /* Forward declaration required by incrVacuumStep(). */

	3072 static int allocateBtreePage(BtShared , MemPage , Pgno , Pgno, u8);

	3073

	3074 /*

	3075 ** Perform a single step of an incremental-vacuum. If successful, return

	3076 ** SQLITE_OK. If there is no work to do (and therefore no point in

	3077 ** calling this function again), return SQLITE_DONE. Or, if an error

	3078 ** occurs, return some other error code.

	3079 **

	3080 ** More specifically, this function attempts to re-organize the database so

	3081 ** that the last page of the file currently in use is no longer in use.

	3082 **

	3083 ** Parameter nFin is the number of pages that this database would contain

	3084 ** were this function called until it returns SQLITE_DONE.

	3085 **

	3086 ** If the bCommit parameter is non-zero, this function assumes that the

	3087 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE

	3088 ** or an error. bCommit is passed true for an auto-vacuum-on-commit

	3089 ** operation, or false for an incremental vacuum.

	3090 */

	3091 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){

	3092 Pgno nFreeList; /* Number of pages still on the free-list */

	3093 int rc;

	3094

	3095 assert( sqlite3_mutex_held(pBt->mutex) );

	3096 assert( iLastPg>nFin );

	3097

	3098 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){

	3099 u8 eType;

	3100 Pgno iPtrPage;

	3101

	3102 nFreeList = get4byte(&pBt->pPage1->aData[36]);

	3103 if( nFreeList==0 ){

	3104 return SQLITE_DONE;

	3105 }

	3106

	3107 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);

	3108 if( rc!=SQLITE_OK ){

	3109 return rc;

	3110 }

	3111 if( eType==PTRMAP_ROOTPAGE ){

	3112 return SQLITE_CORRUPT_BKPT;

	3113 }

	3114

	3115 if( eType==PTRMAP_FREEPAGE ){

	3116 if( bCommit==0 ){

	3117 /* Remove the page from the files free-list. This is not required

	3118 ** if bCommit is non-zero. In that case, the free-list will be

	3119 ** truncated to zero after this function returns, so it doesn't

	3120 ** matter if it still contains some garbage entries.

	3121 */

	3122 Pgno iFreePg;

	3123 MemPage *pFreePg;

	3124 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);

	3125 if( rc!=SQLITE_OK ){

	3126 return rc;

	3127 }

	3128 assert( iFreePg==iLastPg );

	3129 releasePage(pFreePg);

	3130 }

	3131 } else {

	3132 Pgno iFreePg; /* Index of free page to move pLastPg to */

	3133 MemPage *pLastPg;

	3134 u8 eMode = BTALLOC_ANY; /* Mode parameter for allocateBtreePage() */

	3135 Pgno iNear = 0; /* nearby parameter for allocateBtreePage() */

	3136

	3137 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);

	3138 if( rc!=SQLITE_OK ){

	3139 return rc;

	3140 }

	3141

	3142 /* If bCommit is zero, this loop runs exactly once and page pLastPg

	3143 ** is swapped with the first free page pulled off the free list.

	3144 **

	3145 ** On the other hand, if bCommit is greater than zero, then keep

	3146 ** looping until a free-page located within the first nFin pages

	3147 ** of the file is found.

	3148 */

	3149 if( bCommit==0 ){

	3150 eMode = BTALLOC_LE;

	3151 iNear = nFin;

	3152 }

	3153 do {

	3154 MemPage *pFreePg;

	3155 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);

	3156 if( rc!=SQLITE_OK ){

	3157 releasePage(pLastPg);

	3158 return rc;

	3159 }

	3160 releasePage(pFreePg);

	3161 }while( bCommit && iFreePg>nFin );

	3162 assert( iFreePg<iLastPg );

	3163

	3164 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);

	3165 releasePage(pLastPg);

	3166 if( rc!=SQLITE_OK ){

	3167 return rc;

	3168 }

	3169 }

	3170 }

	3171

	3172 if( bCommit==0 ){

	3173 do {

	3174 iLastPg--;

	3175 }while( iLastPg==PENDING_BYTE_PAGE(pBt) \|\| PTRMAP_ISPAGE(pBt, iLastPg) );

	3176 pBt->bDoTruncate = 1;

	3177 pBt->nPage = iLastPg;

	3178 }

	3179 return SQLITE_OK;

	3180 }

	3181

	3182 /*

	3183 ** The database opened by the first argument is an auto-vacuum database

	3184 ** nOrig pages in size containing nFree free pages. Return the expected

	3185 ** size of the database in pages following an auto-vacuum operation.

	3186 */

	3187 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){

	3188 int nEntry; /* Number of entries on one ptrmap page */

	3189 Pgno nPtrmap; /* Number of PtrMap pages to be freed */

	3190 Pgno nFin; /* Return value */

	3191

	3192 nEntry = pBt->usableSize/5;

	3193 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;

	3194 nFin = nOrig - nFree - nPtrmap;

	3195 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){

	3196 nFin--;

	3197 }

	3198 while( PTRMAP_ISPAGE(pBt, nFin) \|\| nFin==PENDING_BYTE_PAGE(pBt) ){

	3199 nFin--;

	3200 }

	3201

	3202 return nFin;

	3203 }

	3204

	3205 /*

	3206 ** A write-transaction must be opened before calling this function.

	3207 ** It performs a single unit of work towards an incremental vacuum.

	3208 **

	3209 ** If the incremental vacuum is finished after this function has run,

	3210 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,

	3211 ** SQLITE_OK is returned. Otherwise an SQLite error code.

	3212 */

	3213 int sqlite3BtreeIncrVacuum(Btree *p){

	3214 int rc;

	3215 BtShared *pBt = p->pBt;

	3216

	3217 sqlite3BtreeEnter(p);

	3218 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );

	3219 if( !pBt->autoVacuum ){

	3220 rc = SQLITE_DONE;

	3221 }else{

	3222 Pgno nOrig = btreePagecount(pBt);

	3223 Pgno nFree = get4byte(&pBt->pPage1->aData[36]);

	3224 Pgno nFin = finalDbSize(pBt, nOrig, nFree);

	3225

	3226 if( nOrig<nFin ){

	3227 rc = SQLITE_CORRUPT_BKPT;

	3228 }else if( nFree>0 ){

	3229 rc = saveAllCursors(pBt, 0, 0);

	3230 if( rc==SQLITE_OK ){

	3231 invalidateAllOverflowCache(pBt);

	3232 rc = incrVacuumStep(pBt, nFin, nOrig, 0);

	3233 }

	3234 if( rc==SQLITE_OK ){

	3235 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	3236 put4byte(&pBt->pPage1->aData[28], pBt->nPage);

	3237 }

	3238 }else{

	3239 rc = SQLITE_DONE;

	3240 }

	3241 }

	3242 sqlite3BtreeLeave(p);

	3243 return rc;

	3244 }

	3245

	3246 /*

	3247 ** This routine is called prior to sqlite3PagerCommit when a transaction

	3248 ** is committed for an auto-vacuum database.

	3249 **

	3250 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages

	3251 ** the database file should be truncated to during the commit process.

	3252 ** i.e. the database has been reorganized so that only the first *pnTrunc

	3253 ** pages are in use.

	3254 */

	3255 static int autoVacuumCommit(BtShared *pBt){

	3256 int rc = SQLITE_OK;

	3257 Pager *pPager = pBt->pPager;

	3258 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );

	3259

	3260 assert( sqlite3_mutex_held(pBt->mutex) );

	3261 invalidateAllOverflowCache(pBt);

	3262 assert(pBt->autoVacuum);

	3263 if( !pBt->incrVacuum ){

	3264 Pgno nFin; /* Number of pages in database after autovacuuming */

	3265 Pgno nFree; /* Number of pages on the freelist initially */

	3266 Pgno iFree; /* The next page to be freed */

	3267 Pgno nOrig; /* Database size before freeing */

	3268

	3269 nOrig = btreePagecount(pBt);

	3270 if( PTRMAP_ISPAGE(pBt, nOrig) \|\| nOrig==PENDING_BYTE_PAGE(pBt) ){

	3271 /* It is not possible to create a database for which the final page

	3272 ** is either a pointer-map page or the pending-byte page. If one

	3273 ** is encountered, this indicates corruption.

	3274 */

	3275 return SQLITE_CORRUPT_BKPT;

	3276 }

	3277

	3278 nFree = get4byte(&pBt->pPage1->aData[36]);

	3279 nFin = finalDbSize(pBt, nOrig, nFree);

	3280 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;

	3281 if( nFin<nOrig ){

	3282 rc = saveAllCursors(pBt, 0, 0);

	3283 }

	3284 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){

	3285 rc = incrVacuumStep(pBt, nFin, iFree, 1);

	3286 }

	3287 if( (rc==SQLITE_DONE \|\| rc==SQLITE_OK) && nFree>0 ){

	3288 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	3289 put4byte(&pBt->pPage1->aData[32], 0);

	3290 put4byte(&pBt->pPage1->aData[36], 0);

	3291 put4byte(&pBt->pPage1->aData[28], nFin);

	3292 pBt->bDoTruncate = 1;

	3293 pBt->nPage = nFin;

	3294 }

	3295 if( rc!=SQLITE_OK ){

	3296 sqlite3PagerRollback(pPager);

	3297 }

	3298 }

	3299

	3300 assert( nRef>=sqlite3PagerRefcount(pPager) );

	3301 return rc;

	3302 }

	3303

	3304 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */

	3305 # define setChildPtrmaps(x) SQLITE_OK

	3306 #endif

	3307

	3308 /*

	3309 ** This routine does the first phase of a two-phase commit. This routine

	3310 ** causes a rollback journal to be created (if it does not already exist)

	3311 ** and populated with enough information so that if a power loss occurs

	3312 ** the database can be restored to its original state by playing back

	3313 ** the journal. Then the contents of the journal are flushed out to

	3314 ** the disk. After the journal is safely on oxide, the changes to the

	3315 ** database are written into the database file and flushed to oxide.

	3316 ** At the end of this call, the rollback journal still exists on the

	3317 ** disk and we are still holding all locks, so the transaction has not

	3318 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the

	3319 ** commit process.

	3320 **

	3321 ** This call is a no-op if no write-transaction is currently active on pBt.

	3322 **

	3323 ** Otherwise, sync the database file for the btree pBt. zMaster points to

	3324 ** the name of a master journal file that should be written into the

	3325 ** individual journal file, or is NULL, indicating no master journal file

	3326 ** (single database transaction).

	3327 **

	3328 ** When this is called, the master journal should already have been

	3329 ** created, populated with this journal pointer and synced to disk.

	3330 **

	3331 ** Once this is routine has returned, the only thing required to commit

	3332 ** the write-transaction for this database file is to delete the journal.

	3333 */

	3334 int sqlite3BtreeCommitPhaseOne(Btree p, const char zMaster){

	3335 int rc = SQLITE_OK;

	3336 if( p->inTrans==TRANS_WRITE ){

	3337 BtShared *pBt = p->pBt;

	3338 sqlite3BtreeEnter(p);

	3339 #ifndef SQLITE_OMIT_AUTOVACUUM

	3340 if( pBt->autoVacuum ){

	3341 rc = autoVacuumCommit(pBt);

	3342 if( rc!=SQLITE_OK ){

	3343 sqlite3BtreeLeave(p);

	3344 return rc;

	3345 }

	3346 }

	3347 if( pBt->bDoTruncate ){

	3348 sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);

	3349 }

	3350 #endif

	3351 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);

	3352 sqlite3BtreeLeave(p);

	3353 }

	3354 return rc;

	3355 }

	3356

	3357 /*

	3358 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()

	3359 ** at the conclusion of a transaction.

	3360 */

	3361 static void btreeEndTransaction(Btree *p){

	3362 BtShared *pBt = p->pBt;

	3363 sqlite3 *db = p->db;

	3364 assert( sqlite3BtreeHoldsMutex(p) );

	3365

	3366 #ifndef SQLITE_OMIT_AUTOVACUUM

	3367 pBt->bDoTruncate = 0;

	3368 #endif

	3369 if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){

	3370 /* If there are other active statements that belong to this database

	3371 ** handle, downgrade to a read-only transaction. The other statements

	3372 ** may still be reading from the database. */

	3373 downgradeAllSharedCacheTableLocks(p);

	3374 p->inTrans = TRANS_READ;

	3375 }else{

	3376 /* If the handle had any kind of transaction open, decrement the

	3377 ** transaction count of the shared btree. If the transaction count

	3378 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()

	3379 ** call below will unlock the pager. */

	3380 if( p->inTrans!=TRANS_NONE ){

	3381 clearAllSharedCacheTableLocks(p);

	3382 pBt->nTransaction--;

	3383 if( 0==pBt->nTransaction ){

	3384 pBt->inTransaction = TRANS_NONE;

	3385 }

	3386 }

	3387

	3388 /* Set the current transaction state to TRANS_NONE and unlock the

	3389 ** pager if this call closed the only read or write transaction. */

	3390 p->inTrans = TRANS_NONE;

	3391 unlockBtreeIfUnused(pBt);

	3392 }

	3393

	3394 btreeIntegrity(p);

	3395 }

	3396

	3397 /*

	3398 ** Commit the transaction currently in progress.

	3399 **

	3400 ** This routine implements the second phase of a 2-phase commit. The

	3401 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should

	3402 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne()

	3403 ** routine did all the work of writing information out to disk and flushing the

	3404 ** contents so that they are written onto the disk platter. All this

	3405 ** routine has to do is delete or truncate or zero the header in the

	3406 ** the rollback journal (which causes the transaction to commit) and

	3407 ** drop locks.

	3408 **

	3409 ** Normally, if an error occurs while the pager layer is attempting to

	3410 ** finalize the underlying journal file, this function returns an error and

	3411 ** the upper layer will attempt a rollback. However, if the second argument

	3412 ** is non-zero then this b-tree transaction is part of a multi-file

	3413 ** transaction. In this case, the transaction has already been committed

	3414 ** (by deleting a master journal file) and the caller will ignore this

	3415 ** functions return code. So, even if an error occurs in the pager layer,

	3416 ** reset the b-tree objects internal state to indicate that the write

	3417 ** transaction has been closed. This is quite safe, as the pager will have

	3418 ** transitioned to the error state.

	3419 **

	3420 ** This will release the write lock on the database file. If there

	3421 ** are no active cursors, it also releases the read lock.

	3422 */

	3423 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){

	3424

	3425 if( p->inTrans==TRANS_NONE ) return SQLITE_OK;

	3426 sqlite3BtreeEnter(p);

	3427 btreeIntegrity(p);

	3428

	3429 /* If the handle has a write-transaction open, commit the shared-btrees

	3430 ** transaction and set the shared state to TRANS_READ.

	3431 */

	3432 if( p->inTrans==TRANS_WRITE ){

	3433 int rc;

	3434 BtShared *pBt = p->pBt;

	3435 assert( pBt->inTransaction==TRANS_WRITE );

	3436 assert( pBt->nTransaction>0 );

	3437 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);

	3438 if( rc!=SQLITE_OK && bCleanup==0 ){

	3439 sqlite3BtreeLeave(p);

	3440 return rc;

	3441 }

	3442 pBt->inTransaction = TRANS_READ;

	3443 btreeClearHasContent(pBt);

	3444 }

	3445

	3446 btreeEndTransaction(p);

	3447 sqlite3BtreeLeave(p);

	3448 return SQLITE_OK;

	3449 }

	3450

	3451 /*

	3452 ** Do both phases of a commit.

	3453 */

	3454 int sqlite3BtreeCommit(Btree *p){

	3455 int rc;

	3456 sqlite3BtreeEnter(p);

	3457 rc = sqlite3BtreeCommitPhaseOne(p, 0);

	3458 if( rc==SQLITE_OK ){

	3459 rc = sqlite3BtreeCommitPhaseTwo(p, 0);

	3460 }

	3461 sqlite3BtreeLeave(p);

	3462 return rc;

	3463 }

	3464

	3465 /*

	3466 ** This routine sets the state to CURSOR_FAULT and the error

	3467 ** code to errCode for every cursor on any BtShared that pBtree

	3468 ** references. Or if the writeOnly flag is set to 1, then only

	3469 ** trip write cursors and leave read cursors unchanged.

	3470 **

	3471 ** Every cursor is a candidate to be tripped, including cursors

	3472 ** that belong to other database connections that happen to be

	3473 ** sharing the cache with pBtree.

	3474 **

	3475 ** This routine gets called when a rollback occurs. If the writeOnly

	3476 ** flag is true, then only write-cursors need be tripped - read-only

	3477 ** cursors save their current positions so that they may continue

	3478 ** following the rollback. Or, if writeOnly is false, all cursors are

	3479 ** tripped. In general, writeOnly is false if the transaction being

	3480 ** rolled back modified the database schema. In this case b-tree root

	3481 ** pages may be moved or deleted from the database altogether, making

	3482 ** it unsafe for read cursors to continue.

	3483 **

	3484 ** If the writeOnly flag is true and an error is encountered while

	3485 ** saving the current position of a read-only cursor, all cursors,

	3486 ** including all read-cursors are tripped.

	3487 **

	3488 ** SQLITE_OK is returned if successful, or if an error occurs while

	3489 ** saving a cursor position, an SQLite error code.

	3490 */

	3491 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){

	3492 BtCursor *p;

	3493 int rc = SQLITE_OK;

	3494

	3495 assert( (writeOnly==0 \|\| writeOnly==1) && BTCF_WriteFlag==1 );

	3496 if( pBtree ){

	3497 sqlite3BtreeEnter(pBtree);

	3498 for(p=pBtree->pBt->pCursor; p; p=p->pNext){

	3499 int i;

	3500 if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){

	3501 if( p->eState==CURSOR_VALID ){

	3502 rc = saveCursorPosition(p);

	3503 if( rc!=SQLITE_OK ){

	3504 (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);

	3505 break;

	3506 }

	3507 }

	3508 }else{

	3509 sqlite3BtreeClearCursor(p);

	3510 p->eState = CURSOR_FAULT;

	3511 p->skipNext = errCode;

	3512 }

	3513 for(i=0; i<=p->iPage; i++){

	3514 releasePage(p->apPage[i]);

	3515 p->apPage[i] = 0;

	3516 }

	3517 }

	3518 sqlite3BtreeLeave(pBtree);

	3519 }

	3520 return rc;

	3521 }

	3522

	3523 /*

	3524 ** Rollback the transaction in progress.

	3525 **

	3526 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).

	3527 ** Only write cursors are tripped if writeOnly is true but all cursors are

	3528 ** tripped if writeOnly is false. Any attempt to use

	3529 ** a tripped cursor will result in an error.

	3530 **

	3531 ** This will release the write lock on the database file. If there

	3532 ** are no active cursors, it also releases the read lock.

	3533 */

	3534 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){

	3535 int rc;

	3536 BtShared *pBt = p->pBt;

	3537 MemPage *pPage1;

	3538

	3539 assert( writeOnly==1 \|\| writeOnly==0 );

	3540 assert( tripCode==SQLITE_ABORT_ROLLBACK \|\| tripCode==SQLITE_OK );

	3541 sqlite3BtreeEnter(p);

	3542 if( tripCode==SQLITE_OK ){

	3543 rc = tripCode = saveAllCursors(pBt, 0, 0);

	3544 if( rc ) writeOnly = 0;

	3545 }else{

	3546 rc = SQLITE_OK;

	3547 }

	3548 if( tripCode ){

	3549 int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);

	3550 assert( rc==SQLITE_OK \|\| (writeOnly==0 && rc2==SQLITE_OK) );

	3551 if( rc2!=SQLITE_OK ) rc = rc2;

	3552 }

	3553 btreeIntegrity(p);

	3554

	3555 if( p->inTrans==TRANS_WRITE ){

	3556 int rc2;

	3557

	3558 assert( TRANS_WRITE==pBt->inTransaction );

	3559 rc2 = sqlite3PagerRollback(pBt->pPager);

	3560 if( rc2!=SQLITE_OK ){

	3561 rc = rc2;

	3562 }

	3563

	3564 /* The rollback may have destroyed the pPage1->aData value. So

	3565 ** call btreeGetPage() on page 1 again to make

	3566 ** sure pPage1->aData is set correctly. */

	3567 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){

	3568 int nPage = get4byte(28+(u8*)pPage1->aData);

	3569 testcase( nPage==0 );

	3570 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);

	3571 testcase( pBt->nPage!=nPage );

	3572 pBt->nPage = nPage;

	3573 releasePage(pPage1);

	3574 }

	3575 assert( countValidCursors(pBt, 1)==0 );

	3576 pBt->inTransaction = TRANS_READ;

	3577 btreeClearHasContent(pBt);

	3578 }

	3579

	3580 btreeEndTransaction(p);

	3581 sqlite3BtreeLeave(p);

	3582 return rc;

	3583 }

	3584

	3585 /*

	3586 ** Start a statement subtransaction. The subtransaction can be rolled

	3587 ** back independently of the main transaction. You must start a transaction

	3588 ** before starting a subtransaction. The subtransaction is ended automatically

	3589 ** if the main transaction commits or rolls back.

	3590 **

	3591 ** Statement subtransactions are used around individual SQL statements

	3592 ** that are contained within a BEGIN...COMMIT block. If a constraint

	3593 ** error occurs within the statement, the effect of that one statement

	3594 ** can be rolled back without having to rollback the entire transaction.

	3595 **

	3596 ** A statement sub-transaction is implemented as an anonymous savepoint. The

	3597 ** value passed as the second parameter is the total number of savepoints,

	3598 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there

	3599 ** are no active savepoints and no other statement-transactions open,

	3600 ** iStatement is 1. This anonymous savepoint can be released or rolled back

	3601 ** using the sqlite3BtreeSavepoint() function.

	3602 */

	3603 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){

	3604 int rc;

	3605 BtShared *pBt = p->pBt;

	3606 sqlite3BtreeEnter(p);

	3607 assert( p->inTrans==TRANS_WRITE );

	3608 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

	3609 assert( iStatement>0 );

	3610 assert( iStatement>p->db->nSavepoint );

	3611 assert( pBt->inTransaction==TRANS_WRITE );

	3612 /* At the pager level, a statement transaction is a savepoint with

	3613 ** an index greater than all savepoints created explicitly using

	3614 ** SQL statements. It is illegal to open, release or rollback any

	3615 ** such savepoints while the statement transaction savepoint is active.

	3616 */

	3617 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);

	3618 sqlite3BtreeLeave(p);

	3619 return rc;

	3620 }

	3621

	3622 /*

	3623 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK

	3624 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the

	3625 ** savepoint identified by parameter iSavepoint, depending on the value

	3626 ** of op.

	3627 **

	3628 ** Normally, iSavepoint is greater than or equal to zero. However, if op is

	3629 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the

	3630 ** contents of the entire transaction are rolled back. This is different

	3631 ** from a normal transaction rollback, as no locks are released and the

	3632 ** transaction remains open.

	3633 */

	3634 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){

	3635 int rc = SQLITE_OK;

	3636 if( p && p->inTrans==TRANS_WRITE ){

	3637 BtShared *pBt = p->pBt;

	3638 assert( op==SAVEPOINT_RELEASE \|\| op==SAVEPOINT_ROLLBACK );

	3639 assert( iSavepoint>=0 \|\| (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );

	3640 sqlite3BtreeEnter(p);

	3641 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);

	3642 if( rc==SQLITE_OK ){

	3643 if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){

	3644 pBt->nPage = 0;

	3645 }

	3646 rc = newDatabase(pBt);

	3647 pBt->nPage = get4byte(28 + pBt->pPage1->aData);

	3648

	3649 /* The database size was written into the offset 28 of the header

	3650 ** when the transaction started, so we know that the value at offset

	3651 ** 28 is nonzero. */

	3652 assert( pBt->nPage>0 );

	3653 }

	3654 sqlite3BtreeLeave(p);

	3655 }

	3656 return rc;

	3657 }

	3658

	3659 /*

	3660 ** Create a new cursor for the BTree whose root is on the page

	3661 ** iTable. If a read-only cursor is requested, it is assumed that

	3662 ** the caller already has at least a read-only transaction open

	3663 ** on the database already. If a write-cursor is requested, then

	3664 ** the caller is assumed to have an open write transaction.

	3665 **

	3666 ** If wrFlag==0, then the cursor can only be used for reading.

	3667 ** If wrFlag==1, then the cursor can be used for reading or for

	3668 ** writing if other conditions for writing are also met. These

	3669 ** are the conditions that must be met in order for writing to

	3670 ** be allowed:

	3671 **

	3672 ** 1: The cursor must have been opened with wrFlag==1

	3673 **

	3674 ** 2: Other database connections that share the same pager cache

	3675 ** but which are not in the READ_UNCOMMITTED state may not have

	3676 ** cursors open with wrFlag==0 on the same table. Otherwise

	3677 ** the changes made by this write cursor would be visible to

	3678 ** the read cursors in the other database connection.

	3679 **

	3680 ** 3: The database must be writable (not on read-only media)

	3681 **

	3682 ** 4: There must be an active transaction.

	3683 **

	3684 ** No checking is done to make sure that page iTable really is the

	3685 ** root page of a b-tree. If it is not, then the cursor acquired

	3686 ** will not work correctly.

	3687 **

	3688 ** It is assumed that the sqlite3BtreeCursorZero() has been called

	3689 ** on pCur to initialize the memory space prior to invoking this routine.

	3690 */

	3691 static int btreeCursor(

	3692 Btree p, / The btree */

	3693 int iTable, /* Root page of table to open */

	3694 int wrFlag, /* 1 to write. 0 read-only */

	3695 struct KeyInfo pKeyInfo, / First arg to comparison function */

	3696 BtCursor pCur / Space for new cursor */

	3697 ){

	3698 BtShared pBt = p->pBt; / Shared b-tree handle */

	3699

	3700 assert( sqlite3BtreeHoldsMutex(p) );

	3701 assert( wrFlag==0 \|\| wrFlag==1 );

	3702

	3703 /* The following assert statements verify that if this is a sharable

	3704 ** b-tree database, the connection is holding the required table locks,

	3705 ** and that no other connection has any open cursor that conflicts with

	3706 ** this lock. */

	3707 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) );

	3708 assert( wrFlag==0 \|\| !hasReadConflicts(p, iTable) );

	3709

	3710 /* Assert that the caller has opened the required transaction. */

	3711 assert( p->inTrans>TRANS_NONE );

	3712 assert( wrFlag==0 \|\| p->inTrans==TRANS_WRITE );

	3713 assert( pBt->pPage1 && pBt->pPage1->aData );

	3714

	3715 if( NEVER(wrFlag && (pBt->btsFlags & BTS_READ_ONLY)!=0) ){

	3716 return SQLITE_READONLY;

	3717 }

	3718 if( wrFlag ){

	3719 allocateTempSpace(pBt);

	3720 if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM;

	3721 }

	3722 if( iTable==1 && btreePagecount(pBt)==0 ){

	3723 assert( wrFlag==0 );

	3724 iTable = 0;

	3725 }

	3726

	3727 /* Now that no other errors can occur, finish filling in the BtCursor

	3728 ** variables and link the cursor into the BtShared list. */

	3729 pCur->pgnoRoot = (Pgno)iTable;

	3730 pCur->iPage = -1;

	3731 pCur->pKeyInfo = pKeyInfo;

	3732 pCur->pBtree = p;

	3733 pCur->pBt = pBt;

	3734 assert( wrFlag==0 \|\| wrFlag==BTCF_WriteFlag );

	3735 pCur->curFlags = wrFlag;

	3736 pCur->pNext = pBt->pCursor;

	3737 if( pCur->pNext ){

	3738 pCur->pNext->pPrev = pCur;

	3739 }

	3740 pBt->pCursor = pCur;

	3741 pCur->eState = CURSOR_INVALID;

	3742 return SQLITE_OK;

	3743 }

	3744 int sqlite3BtreeCursor(

	3745 Btree p, / The btree */

	3746 int iTable, /* Root page of table to open */

	3747 int wrFlag, /* 1 to write. 0 read-only */

	3748 struct KeyInfo pKeyInfo, / First arg to xCompare() */

	3749 BtCursor pCur / Write new cursor here */

	3750 ){

	3751 int rc;

	3752 sqlite3BtreeEnter(p);

	3753 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);

	3754 sqlite3BtreeLeave(p);

	3755 return rc;

	3756 }

	3757

	3758 /*

	3759 ** Return the size of a BtCursor object in bytes.

	3760 **

	3761 ** This interfaces is needed so that users of cursors can preallocate

	3762 ** sufficient storage to hold a cursor. The BtCursor object is opaque

	3763 ** to users so they cannot do the sizeof() themselves - they must call

	3764 ** this routine.

	3765 */

	3766 int sqlite3BtreeCursorSize(void){

	3767 return ROUND8(sizeof(BtCursor));

	3768 }

	3769

	3770 /*

	3771 ** Initialize memory that will be converted into a BtCursor object.

	3772 **

	3773 ** The simple approach here would be to memset() the entire object

	3774 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays

	3775 ** do not need to be zeroed and they are large, so we can save a lot

	3776 ** of run-time by skipping the initialization of those elements.

	3777 */

	3778 void sqlite3BtreeCursorZero(BtCursor *p){

	3779 memset(p, 0, offsetof(BtCursor, iPage));

	3780 }

	3781

	3782 /*

	3783 ** Close a cursor. The read lock on the database file is released

	3784 ** when the last cursor is closed.

	3785 */

	3786 int sqlite3BtreeCloseCursor(BtCursor *pCur){

	3787 Btree *pBtree = pCur->pBtree;

	3788 if( pBtree ){

	3789 int i;

	3790 BtShared *pBt = pCur->pBt;

	3791 sqlite3BtreeEnter(pBtree);

	3792 sqlite3BtreeClearCursor(pCur);

	3793 if( pCur->pPrev ){

	3794 pCur->pPrev->pNext = pCur->pNext;

	3795 }else{

	3796 pBt->pCursor = pCur->pNext;

	3797 }

	3798 if( pCur->pNext ){

	3799 pCur->pNext->pPrev = pCur->pPrev;

	3800 }

	3801 for(i=0; i<=pCur->iPage; i++){

	3802 releasePage(pCur->apPage[i]);

	3803 }

	3804 unlockBtreeIfUnused(pBt);

	3805 sqlite3DbFree(pBtree->db, pCur->aOverflow);

	3806 /* sqlite3_free(pCur); */

	3807 sqlite3BtreeLeave(pBtree);

	3808 }

	3809 return SQLITE_OK;

	3810 }

	3811

	3812 /*

	3813 ** Make sure the BtCursor* given in the argument has a valid

	3814 ** BtCursor.info structure. If it is not already valid, call

	3815 ** btreeParseCell() to fill it in.

	3816 **

	3817 ** BtCursor.info is a cache of the information in the current cell.

	3818 ** Using this cache reduces the number of calls to btreeParseCell().

	3819 **

	3820 ** 2007-06-25: There is a bug in some versions of MSVC that cause the

	3821 ** compiler to crash when getCellInfo() is implemented as a macro.

	3822 ** But there is a measureable speed advantage to using the macro on gcc

	3823 ** (when less compiler optimizations like -Os or -O0 are used and the

	3824 ** compiler is not doing aggressive inlining.) So we use a real function

	3825 ** for MSVC and a macro for everything else. Ticket #2457.

	3826 */

	3827 #ifndef NDEBUG

	3828 static void assertCellInfo(BtCursor *pCur){

	3829 CellInfo info;

	3830 int iPage = pCur->iPage;

	3831 memset(&info, 0, sizeof(info));

	3832 btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);

	3833 assert( CORRUPT_DB \|\| memcmp(&info, &pCur->info, sizeof(info))==0 );

	3834 }

	3835 #else

	3836 #define assertCellInfo(x)

	3837 #endif

	3838 #ifdef _MSC_VER

	3839 /* Use a real function in MSVC to work around bugs in that compiler. */

	3840 static void getCellInfo(BtCursor *pCur){

	3841 if( pCur->info.nSize==0 ){

	3842 int iPage = pCur->iPage;

	3843 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);

	3844 pCur->curFlags \|= BTCF_ValidNKey;

	3845 }else{

	3846 assertCellInfo(pCur);

	3847 }

	3848 }

	3849 #else /* if not _MSC_VER */

	3850 /* Use a macro in all other compilers so that the function is inlined */

	3851 #define getCellInfo(pCur) \

	3852 if( pCur->info.nSize==0 ){ \

	3853 int iPage = pCur->iPage; \

	3854 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \

	3855 pCur->curFlags \|= BTCF_ValidNKey; \

	3856 }else{ \

	3857 assertCellInfo(pCur); \

	3858 }

	3859 #endif /* _MSC_VER */

	3860

	3861 #ifndef NDEBUG /* The next routine used only within assert() statements */

	3862 /*

	3863 ** Return true if the given BtCursor is valid. A valid cursor is one

	3864 ** that is currently pointing to a row in a (non-empty) table.

	3865 ** This is a verification routine is used only within assert() statements.

	3866 */

	3867 int sqlite3BtreeCursorIsValid(BtCursor *pCur){

	3868 return pCur && pCur->eState==CURSOR_VALID;

	3869 }

	3870 #endif /* NDEBUG */

	3871

	3872 /*

	3873 ** Set *pSize to the size of the buffer needed to hold the value of

	3874 ** the key for the current entry. If the cursor is not pointing

	3875 ** to a valid entry, *pSize is set to 0.

	3876 **

	3877 ** For a table with the INTKEY flag set, this routine returns the key

	3878 ** itself, not the number of bytes in the key.

	3879 **

	3880 ** The caller must position the cursor prior to invoking this routine.

	3881 **

	3882 ** This routine cannot fail. It always returns SQLITE_OK.

	3883 */

	3884 int sqlite3BtreeKeySize(BtCursor pCur, i64 pSize){

	3885 assert( cursorHoldsMutex(pCur) );

	3886 assert( pCur->eState==CURSOR_VALID );

	3887 getCellInfo(pCur);

	3888 *pSize = pCur->info.nKey;

	3889 return SQLITE_OK;

	3890 }

	3891

	3892 /*

	3893 ** Set *pSize to the number of bytes of data in the entry the

	3894 ** cursor currently points to.

	3895 **

	3896 ** The caller must guarantee that the cursor is pointing to a non-NULL

	3897 ** valid entry. In other words, the calling procedure must guarantee

	3898 ** that the cursor has Cursor.eState==CURSOR_VALID.

	3899 **

	3900 ** Failure is not possible. This function always returns SQLITE_OK.

	3901 ** It might just as well be a procedure (returning void) but we continue

	3902 ** to return an integer result code for historical reasons.

	3903 */

	3904 int sqlite3BtreeDataSize(BtCursor pCur, u32 pSize){

	3905 assert( cursorHoldsMutex(pCur) );

	3906 assert( pCur->eState==CURSOR_VALID );

	3907 assert( pCur->apPage[pCur->iPage]->intKeyLeaf==1 );

	3908 getCellInfo(pCur);

	3909 *pSize = pCur->info.nPayload;

	3910 return SQLITE_OK;

	3911 }

	3912

	3913 /*

	3914 ** Given the page number of an overflow page in the database (parameter

	3915 ** ovfl), this function finds the page number of the next page in the

	3916 ** linked list of overflow pages. If possible, it uses the auto-vacuum

	3917 ** pointer-map data instead of reading the content of page ovfl to do so.

	3918 **

	3919 ** If an error occurs an SQLite error code is returned. Otherwise:

	3920 **

	3921 ** The page number of the next overflow page in the linked list is

	3922 ** written to *pPgnoNext. If page ovfl is the last page in its linked

	3923 ** list, *pPgnoNext is set to zero.

	3924 **

	3925 ** If ppPage is not NULL, and a reference to the MemPage object corresponding

	3926 ** to page number pOvfl was obtained, then *ppPage is set to point to that

	3927 ** reference. It is the responsibility of the caller to call releasePage()

	3928 ** on *ppPage to free the reference. In no reference was obtained (because

	3929 ** the pointer-map was used to obtain the value for *pPgnoNext), then

	3930 ** *ppPage is set to zero.

	3931 */

	3932 static int getOverflowPage(

	3933 BtShared pBt, / The database file */

	3934 Pgno ovfl, /* Current overflow page number */

	3935 MemPage *ppPage, / OUT: MemPage handle (may be NULL) */

	3936 Pgno pPgnoNext / OUT: Next overflow page number */

	3937 ){

	3938 Pgno next = 0;

	3939 MemPage *pPage = 0;

	3940 int rc = SQLITE_OK;

	3941

	3942 assert( sqlite3_mutex_held(pBt->mutex) );

	3943 assert(pPgnoNext);

	3944

	3945 #ifndef SQLITE_OMIT_AUTOVACUUM

	3946 /* Try to find the next page in the overflow list using the

	3947 ** autovacuum pointer-map pages. Guess that the next page in

	3948 ** the overflow list is page number (ovfl+1). If that guess turns

	3949 ** out to be wrong, fall back to loading the data of page

	3950 ** number ovfl to determine the next page number.

	3951 */

	3952 if( pBt->autoVacuum ){

	3953 Pgno pgno;

	3954 Pgno iGuess = ovfl+1;

	3955 u8 eType;

	3956

	3957 while( PTRMAP_ISPAGE(pBt, iGuess) \|\| iGuess==PENDING_BYTE_PAGE(pBt) ){

	3958 iGuess++;

	3959 }

	3960

	3961 if( iGuess<=btreePagecount(pBt) ){

	3962 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);

	3963 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){

	3964 next = iGuess;

	3965 rc = SQLITE_DONE;

	3966 }

	3967 }

	3968 }

	3969 #endif

	3970

	3971 assert( next==0 \|\| rc==SQLITE_DONE );

	3972 if( rc==SQLITE_OK ){

	3973 rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);

	3974 assert( rc==SQLITE_OK \|\| pPage==0 );

	3975 if( rc==SQLITE_OK ){

	3976 next = get4byte(pPage->aData);

	3977 }

	3978 }

	3979

	3980 *pPgnoNext = next;

	3981 if( ppPage ){

	3982 *ppPage = pPage;

	3983 }else{

	3984 releasePage(pPage);

	3985 }

	3986 return (rc==SQLITE_DONE ? SQLITE_OK : rc);

	3987 }

	3988

	3989 /*

	3990 ** Copy data from a buffer to a page, or from a page to a buffer.

	3991 **

	3992 ** pPayload is a pointer to data stored on database page pDbPage.

	3993 ** If argument eOp is false, then nByte bytes of data are copied

	3994 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,

	3995 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes

	3996 ** of data are copied from the buffer pBuf to pPayload.

	3997 **

	3998 ** SQLITE_OK is returned on success, otherwise an error code.

	3999 */

	4000 static int copyPayload(

	4001 void pPayload, / Pointer to page data */

	4002 void pBuf, / Pointer to buffer */

	4003 int nByte, /* Number of bytes to copy */

	4004 int eOp, /* 0 -> copy from page, 1 -> copy to page */

	4005 DbPage pDbPage / Page containing pPayload */

	4006 ){

	4007 if( eOp ){

	4008 /* Copy data from buffer to page (a write operation) */

	4009 int rc = sqlite3PagerWrite(pDbPage);

	4010 if( rc!=SQLITE_OK ){

	4011 return rc;

	4012 }

	4013 memcpy(pPayload, pBuf, nByte);

	4014 }else{

	4015 /* Copy data from page to buffer (a read operation) */

	4016 memcpy(pBuf, pPayload, nByte);

	4017 }

	4018 return SQLITE_OK;

	4019 }

	4020

	4021 /*

	4022 ** This function is used to read or overwrite payload information

	4023 ** for the entry that the pCur cursor is pointing to. The eOp

	4024 ** argument is interpreted as follows:

	4025 **

	4026 ** 0: The operation is a read. Populate the overflow cache.

	4027 ** 1: The operation is a write. Populate the overflow cache.

	4028 ** 2: The operation is a read. Do not populate the overflow cache.

	4029 **

	4030 ** A total of "amt" bytes are read or written beginning at "offset".

	4031 ** Data is read to or from the buffer pBuf.

	4032 **

	4033 ** The content being read or written might appear on the main page

	4034 ** or be scattered out on multiple overflow pages.

	4035 **

	4036 ** If the current cursor entry uses one or more overflow pages and the

	4037 ** eOp argument is not 2, this function may allocate space for and lazily

	4038 ** populates the overflow page-list cache array (BtCursor.aOverflow).

	4039 ** Subsequent calls use this cache to make seeking to the supplied offset

	4040 ** more efficient.

	4041 **

	4042 ** Once an overflow page-list cache has been allocated, it may be

	4043 ** invalidated if some other cursor writes to the same table, or if

	4044 ** the cursor is moved to a different row. Additionally, in auto-vacuum

	4045 ** mode, the following events may invalidate an overflow page-list cache.

	4046 **

	4047 ** * An incremental vacuum,

	4048 ** * A commit in auto_vacuum="full" mode,

	4049 ** * Creating a table (may require moving an overflow page).

	4050 */

	4051 static int accessPayload(

	4052 BtCursor pCur, / Cursor pointing to entry to read from */

	4053 u32 offset, /* Begin reading this far into payload */

	4054 u32 amt, /* Read this many bytes */

	4055 unsigned char pBuf, / Write the bytes into this buffer */

	4056 int eOp /* zero to read. non-zero to write. */

	4057 ){

	4058 unsigned char *aPayload;

	4059 int rc = SQLITE_OK;

	4060 int iIdx = 0;

	4061 MemPage pPage = pCur->apPage[pCur->iPage]; / Btree page of current entry */

	4062 BtShared pBt = pCur->pBt; / Btree this cursor belongs to */

	4063 #ifdef SQLITE_DIRECT_OVERFLOW_READ

	4064 unsigned char * const pBufStart = pBuf;

	4065 int bEnd; /* True if reading to end of data */

	4066 #endif

	4067

	4068 assert( pPage );

	4069 assert( pCur->eState==CURSOR_VALID );

	4070 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );

	4071 assert( cursorHoldsMutex(pCur) );

	4072 assert( eOp!=2 \|\| offset==0 ); /* Always start from beginning for eOp==2 */

	4073

	4074 getCellInfo(pCur);

	4075 aPayload = pCur->info.pPayload;

	4076 #ifdef SQLITE_DIRECT_OVERFLOW_READ

	4077 bEnd = offset+amt==pCur->info.nPayload;

	4078 #endif

	4079 assert( offset+amt <= pCur->info.nPayload );

	4080

	4081 if( &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] ){

	4082 /* Trying to read or write past the end of the data is an error */

	4083 return SQLITE_CORRUPT_BKPT;

	4084 }

	4085

	4086 /* Check if data must be read/written to/from the btree page itself. */

	4087 if( offset<pCur->info.nLocal ){

	4088 int a = amt;

	4089 if( a+offset>pCur->info.nLocal ){

	4090 a = pCur->info.nLocal - offset;

	4091 }

	4092 rc = copyPayload(&aPayload[offset], pBuf, a, (eOp & 0x01), pPage->pDbPage);

	4093 offset = 0;

	4094 pBuf += a;

	4095 amt -= a;

	4096 }else{

	4097 offset -= pCur->info.nLocal;

	4098 }

	4099

	4100 if( rc==SQLITE_OK && amt>0 ){

	4101 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */

	4102 Pgno nextPage;

	4103

	4104 nextPage = get4byte(&aPayload[pCur->info.nLocal]);

	4105

	4106 /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.

	4107 ** Except, do not allocate aOverflow[] for eOp==2.

	4108 **

	4109 ** The aOverflow[] array is sized at one entry for each overflow page

	4110 ** in the overflow chain. The page number of the first overflow page is

	4111 ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array

	4112 ** means "not yet known" (the cache is lazily populated).

	4113 */

	4114 if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){

	4115 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;

	4116 if( nOvfl>pCur->nOvflAlloc ){

	4117 Pgno aNew = (Pgno)sqlite3DbRealloc(

	4118 pCur->pBtree->db, pCur->aOverflow, nOvfl2sizeof(Pgno)

	4119 );

	4120 if( aNew==0 ){

	4121 rc = SQLITE_NOMEM;

	4122 }else{

	4123 pCur->nOvflAlloc = nOvfl*2;

	4124 pCur->aOverflow = aNew;

	4125 }

	4126 }

	4127 if( rc==SQLITE_OK ){

	4128 memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));

	4129 pCur->curFlags \|= BTCF_ValidOvfl;

	4130 }

	4131 }

	4132

	4133 /* If the overflow page-list cache has been allocated and the

	4134 ** entry for the first required overflow page is valid, skip

	4135 ** directly to it.

	4136 */

	4137 if( (pCur->curFlags & BTCF_ValidOvfl)!=0

	4138 && pCur->aOverflow[offset/ovflSize]

	4139 ){

	4140 iIdx = (offset/ovflSize);

	4141 nextPage = pCur->aOverflow[iIdx];

	4142 offset = (offset%ovflSize);

	4143 }

	4144

	4145 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){

	4146

	4147 /* If required, populate the overflow page-list cache. */

	4148 if( (pCur->curFlags & BTCF_ValidOvfl)!=0 ){

	4149 assert(!pCur->aOverflow[iIdx] \|\| pCur->aOverflow[iIdx]==nextPage);

	4150 pCur->aOverflow[iIdx] = nextPage;

	4151 }

	4152

	4153 if( offset>=ovflSize ){

	4154 /* The only reason to read this page is to obtain the page

	4155 ** number for the next page in the overflow chain. The page

	4156 ** data is not required. So first try to lookup the overflow

	4157 ** page-list cache, if any, then fall back to the getOverflowPage()

	4158 ** function.

	4159 **

	4160 ** Note that the aOverflow[] array must be allocated because eOp!=2

	4161 ** here. If eOp==2, then offset==0 and this branch is never taken.

	4162 */

	4163 assert( eOp!=2 );

	4164 assert( pCur->curFlags & BTCF_ValidOvfl );

	4165 if( pCur->aOverflow[iIdx+1] ){

	4166 nextPage = pCur->aOverflow[iIdx+1];

	4167 }else{

	4168 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);

	4169 }

	4170 offset -= ovflSize;

	4171 }else{

	4172 /* Need to read this page properly. It contains some of the

	4173 ** range of data that is being read (eOp==0) or written (eOp!=0).

	4174 */

	4175 #ifdef SQLITE_DIRECT_OVERFLOW_READ

	4176 sqlite3_file *fd;

	4177 #endif

	4178 int a = amt;

	4179 if( a + offset > ovflSize ){

	4180 a = ovflSize - offset;

	4181 }

	4182

	4183 #ifdef SQLITE_DIRECT_OVERFLOW_READ

	4184 /* If all the following are true:

	4185 **

	4186 ** 1) this is a read operation, and

	4187 ** 2) data is required from the start of this overflow page, and

	4188 ** 3) the database is file-backed, and

	4189 ** 4) there is no open write-transaction, and

	4190 ** 5) the database is not a WAL database,

	4191 ** 6) all data from the page is being read.

	4192 ** 7) at least 4 bytes have already been read into the output buffer

	4193 **

	4194 ** then data can be read directly from the database file into the

	4195 ** output buffer, bypassing the page-cache altogether. This speeds

	4196 ** up loading large records that span many overflow pages.

	4197 */

	4198 if( (eOp&0x01)==0 /* (1) */

	4199 && offset==0 /* (2) */

	4200 && (bEnd \|\| a==ovflSize) /* (6) */

	4201 && pBt->inTransaction==TRANS_READ /* (4) */

	4202 && (fd = sqlite3PagerFile(pBt->pPager))->pMethods /* (3) */

	4203 && pBt->pPage1->aData[19]==0x01 /* (5) */

	4204 && &pBuf[-4]>=pBufStart /* (7) */

	4205 ){

	4206 u8 aSave[4];

	4207 u8 *aWrite = &pBuf[-4];

	4208 assert( aWrite>=pBufStart ); /* hence (7) */

	4209 memcpy(aSave, aWrite, 4);

	4210 rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));

	4211 nextPage = get4byte(aWrite);

	4212 memcpy(aWrite, aSave, 4);

	4213 }else

	4214 #endif

	4215

	4216 {

	4217 DbPage *pDbPage;

	4218 rc = sqlite3PagerAcquire(pBt->pPager, nextPage, &pDbPage,

	4219 ((eOp&0x01)==0 ? PAGER_GET_READONLY : 0)

	4220 );

	4221 if( rc==SQLITE_OK ){

	4222 aPayload = sqlite3PagerGetData(pDbPage);

	4223 nextPage = get4byte(aPayload);

	4224 rc = copyPayload(&aPayload[offset+4], pBuf, a, (eOp&0x01), pDbPage);

	4225 sqlite3PagerUnref(pDbPage);

	4226 offset = 0;

	4227 }

	4228 }

	4229 amt -= a;

	4230 pBuf += a;

	4231 }

	4232 }

	4233 }

	4234

	4235 if( rc==SQLITE_OK && amt>0 ){

	4236 return SQLITE_CORRUPT_BKPT;

	4237 }

	4238 return rc;

	4239 }

	4240

	4241 /*

	4242 ** Read part of the key associated with cursor pCur. Exactly

	4243 ** "amt" bytes will be transferred into pBuf[]. The transfer

	4244 ** begins at "offset".

	4245 **

	4246 ** The caller must ensure that pCur is pointing to a valid row

	4247 ** in the table.

	4248 **

	4249 ** Return SQLITE_OK on success or an error code if anything goes

	4250 ** wrong. An error is returned if "offset+amt" is larger than

	4251 ** the available payload.

	4252 */

	4253 int sqlite3BtreeKey(BtCursor pCur, u32 offset, u32 amt, void pBuf){

	4254 assert( cursorHoldsMutex(pCur) );

	4255 assert( pCur->eState==CURSOR_VALID );

	4256 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );

	4257 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	4258 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);

	4259 }

	4260

	4261 /*

	4262 ** Read part of the data associated with cursor pCur. Exactly

	4263 ** "amt" bytes will be transfered into pBuf[]. The transfer

	4264 ** begins at "offset".

	4265 **

	4266 ** Return SQLITE_OK on success or an error code if anything goes

	4267 ** wrong. An error is returned if "offset+amt" is larger than

	4268 ** the available payload.

	4269 */

	4270 int sqlite3BtreeData(BtCursor pCur, u32 offset, u32 amt, void pBuf){

	4271 int rc;

	4272

	4273 #ifndef SQLITE_OMIT_INCRBLOB

	4274 if ( pCur->eState==CURSOR_INVALID ){

	4275 return SQLITE_ABORT;

	4276 }

	4277 #endif

	4278

	4279 assert( cursorHoldsMutex(pCur) );

	4280 rc = restoreCursorPosition(pCur);

	4281 if( rc==SQLITE_OK ){

	4282 assert( pCur->eState==CURSOR_VALID );

	4283 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );

	4284 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	4285 rc = accessPayload(pCur, offset, amt, pBuf, 0);

	4286 }

	4287 return rc;

	4288 }

	4289

	4290 /*

	4291 ** Return a pointer to payload information from the entry that the

	4292 ** pCur cursor is pointing to. The pointer is to the beginning of

	4293 ** the key if index btrees (pPage->intKey==0) and is the data for

	4294 ** table btrees (pPage->intKey==1). The number of bytes of available

	4295 ** key/data is written into pAmt. If pAmt==0, then the value

	4296 ** returned will not be a valid pointer.

	4297 **

	4298 ** This routine is an optimization. It is common for the entire key

	4299 ** and data to fit on the local page and for there to be no overflow

	4300 ** pages. When that is so, this routine can be used to access the

	4301 ** key and data without making a copy. If the key and/or data spills

	4302 ** onto overflow pages, then accessPayload() must be used to reassemble

	4303 ** the key/data and copy it into a preallocated buffer.

	4304 **

	4305 ** The pointer returned by this routine looks directly into the cached

	4306 ** page of the database. The data might change or move the next time

	4307 ** any btree routine is called.

	4308 */

	4309 static const void *fetchPayload(

	4310 BtCursor pCur, / Cursor pointing to entry to read from */

	4311 u32 pAmt / Write the number of available bytes here */

	4312 ){

	4313 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);

	4314 assert( pCur->eState==CURSOR_VALID );

	4315 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

	4316 assert( cursorHoldsMutex(pCur) );

	4317 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	4318 assert( pCur->info.nSize>0 );

	4319 *pAmt = pCur->info.nLocal;

	4320 return (void*)pCur->info.pPayload;

	4321 }

	4322

	4323

	4324 /*

	4325 ** For the entry that cursor pCur is point to, return as

	4326 ** many bytes of the key or data as are available on the local

	4327 ** b-tree page. Write the number of available bytes into *pAmt.

	4328 **

	4329 ** The pointer returned is ephemeral. The key/data may move

	4330 ** or be destroyed on the next call to any Btree routine,

	4331 ** including calls from other threads against the same cache.

	4332 ** Hence, a mutex on the BtShared should be held prior to calling

	4333 ** this routine.

	4334 **

	4335 ** These routines is used to get quick access to key and data

	4336 ** in the common case where no overflow pages are used.

	4337 */

	4338 const void sqlite3BtreeKeyFetch(BtCursor pCur, u32 *pAmt){

	4339 return fetchPayload(pCur, pAmt);

	4340 }

	4341 const void sqlite3BtreeDataFetch(BtCursor pCur, u32 *pAmt){

	4342 return fetchPayload(pCur, pAmt);

	4343 }

	4344

	4345

	4346 /*

	4347 ** Move the cursor down to a new child page. The newPgno argument is the

	4348 ** page number of the child page to move to.

	4349 **

	4350 ** This function returns SQLITE_CORRUPT if the page-header flags field of

	4351 ** the new child page does not match the flags field of the parent (i.e.

	4352 ** if an intkey page appears to be the parent of a non-intkey page, or

	4353 ** vice-versa).

	4354 */

	4355 static int moveToChild(BtCursor *pCur, u32 newPgno){

	4356 int rc;

	4357 int i = pCur->iPage;

	4358 MemPage *pNewPage;

	4359 BtShared *pBt = pCur->pBt;

	4360

	4361 assert( cursorHoldsMutex(pCur) );

	4362 assert( pCur->eState==CURSOR_VALID );

	4363 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );

	4364 assert( pCur->iPage>=0 );

	4365 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){

	4366 return SQLITE_CORRUPT_BKPT;

	4367 }

	4368 rc = getAndInitPage(pBt, newPgno, &pNewPage,

	4369 (pCur->curFlags & BTCF_WriteFlag)==0 ? PAGER_GET_READONLY : 0);

	4370 if( rc ) return rc;

	4371 pCur->apPage[i+1] = pNewPage;

	4372 pCur->aiIdx[i+1] = 0;

	4373 pCur->iPage++;

	4374

	4375 pCur->info.nSize = 0;

	4376 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

	4377 if( pNewPage->nCell<1 \|\| pNewPage->intKey!=pCur->apPage[i]->intKey ){

	4378 return SQLITE_CORRUPT_BKPT;

	4379 }

	4380 return SQLITE_OK;

	4381 }

	4382

	4383 #if 0

	4384 /*

	4385 ** Page pParent is an internal (non-leaf) tree page. This function

	4386 ** asserts that page number iChild is the left-child if the iIdx'th

	4387 ** cell in page pParent. Or, if iIdx is equal to the total number of

	4388 ** cells in pParent, that page number iChild is the right-child of

	4389 ** the page.

	4390 */

	4391 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){

	4392 assert( iIdx<=pParent->nCell );

	4393 if( iIdx==pParent->nCell ){

	4394 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );

	4395 }else{

	4396 assert( get4byte(findCell(pParent, iIdx))==iChild );

	4397 }

	4398 }

	4399 #else

	4400 # define assertParentIndex(x,y,z)

	4401 #endif

	4402

	4403 /*

	4404 ** Move the cursor up to the parent page.

	4405 **

	4406 ** pCur->idx is set to the cell index that contains the pointer

	4407 ** to the page we are coming from. If we are coming from the

	4408 ** right-most child page then pCur->idx is set to one more than

	4409 ** the largest cell index.

	4410 */

	4411 static void moveToParent(BtCursor *pCur){

	4412 assert( cursorHoldsMutex(pCur) );

	4413 assert( pCur->eState==CURSOR_VALID );

	4414 assert( pCur->iPage>0 );

	4415 assert( pCur->apPage[pCur->iPage] );

	4416

	4417 /* UPDATE: It is actually possible for the condition tested by the assert

	4418 ** below to be untrue if the database file is corrupt. This can occur if

	4419 ** one cursor has modified page pParent while a reference to it is held

	4420 ** by a second cursor. Which can only happen if a single page is linked

	4421 ** into more than one b-tree structure in a corrupt database. */

	4422 #if 0

	4423 assertParentIndex(

	4424 pCur->apPage[pCur->iPage-1],

	4425 pCur->aiIdx[pCur->iPage-1],

	4426 pCur->apPage[pCur->iPage]->pgno

	4427 );

	4428 #endif

	4429 testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );

	4430

	4431 releasePage(pCur->apPage[pCur->iPage]);

	4432 pCur->iPage--;

	4433 pCur->info.nSize = 0;

	4434 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

	4435 }

	4436

	4437 /*

	4438 ** Move the cursor to point to the root page of its b-tree structure.

	4439 **

	4440 ** If the table has a virtual root page, then the cursor is moved to point

	4441 ** to the virtual root page instead of the actual root page. A table has a

	4442 ** virtual root page when the actual root page contains no cells and a

	4443 ** single child page. This can only happen with the table rooted at page 1.

	4444 **

	4445 ** If the b-tree structure is empty, the cursor state is set to

	4446 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first

	4447 ** cell located on the root (or virtual root) page and the cursor state

	4448 ** is set to CURSOR_VALID.

	4449 **

	4450 ** If this function returns successfully, it may be assumed that the

	4451 ** page-header flags indicate that the [virtual] root-page is the expected

	4452 ** kind of b-tree page (i.e. if when opening the cursor the caller did not

	4453 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,

	4454 ** indicating a table b-tree, or if the caller did specify a KeyInfo

	4455 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index

	4456 ** b-tree).

	4457 */

	4458 static int moveToRoot(BtCursor *pCur){

	4459 MemPage *pRoot;

	4460 int rc = SQLITE_OK;

	4461

	4462 assert( cursorHoldsMutex(pCur) );

	4463 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );

	4464 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );

	4465 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );

	4466 if( pCur->eState>=CURSOR_REQUIRESEEK ){

	4467 if( pCur->eState==CURSOR_FAULT ){

	4468 assert( pCur->skipNext!=SQLITE_OK );

	4469 return pCur->skipNext;

	4470 }

	4471 sqlite3BtreeClearCursor(pCur);

	4472 }

	4473

	4474 if( pCur->iPage>=0 ){

	4475 while( pCur->iPage ) releasePage(pCur->apPage[pCur->iPage--]);

	4476 }else if( pCur->pgnoRoot==0 ){

	4477 pCur->eState = CURSOR_INVALID;

	4478 return SQLITE_OK;

	4479 }else{

	4480 rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->apPage[0],

	4481 (pCur->curFlags & BTCF_WriteFlag)==0 ? PAGER_GET_READONLY : 0);

	4482 if( rc!=SQLITE_OK ){

	4483 pCur->eState = CURSOR_INVALID;

	4484 return rc;

	4485 }

	4486 pCur->iPage = 0;

	4487 }

	4488 pRoot = pCur->apPage[0];

	4489 assert( pRoot->pgno==pCur->pgnoRoot );

	4490

	4491 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor

	4492 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is

	4493 ** NULL, the caller expects a table b-tree. If this is not the case,

	4494 ** return an SQLITE_CORRUPT error.

	4495 **

	4496 ** Earlier versions of SQLite assumed that this test could not fail

	4497 ** if the root page was already loaded when this function was called (i.e.

	4498 ** if pCur->iPage>=0). But this is not so if the database is corrupted

	4499 ** in such a way that page pRoot is linked into a second b-tree table

	4500 ** (or the freelist). */

	4501 assert( pRoot->intKey==1 \|\| pRoot->intKey==0 );

	4502 if( pRoot->isInit==0 \|\| (pCur->pKeyInfo==0)!=pRoot->intKey ){

	4503 return SQLITE_CORRUPT_BKPT;

	4504 }

	4505

	4506 pCur->aiIdx[0] = 0;

	4507 pCur->info.nSize = 0;

	4508 pCur->curFlags &= ~(BTCF_AtLast\|BTCF_ValidNKey\|BTCF_ValidOvfl);

	4509

	4510 if( pRoot->nCell>0 ){

	4511 pCur->eState = CURSOR_VALID;

	4512 }else if( !pRoot->leaf ){

	4513 Pgno subpage;

	4514 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;

	4515 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);

	4516 pCur->eState = CURSOR_VALID;

	4517 rc = moveToChild(pCur, subpage);

	4518 }else{

	4519 pCur->eState = CURSOR_INVALID;

	4520 }

	4521 return rc;

	4522 }

	4523

	4524 /*

	4525 ** Move the cursor down to the left-most leaf entry beneath the

	4526 ** entry to which it is currently pointing.

	4527 **

	4528 ** The left-most leaf is the one with the smallest key - the first

	4529 ** in ascending order.

	4530 */

	4531 static int moveToLeftmost(BtCursor *pCur){

	4532 Pgno pgno;

	4533 int rc = SQLITE_OK;

	4534 MemPage *pPage;

	4535

	4536 assert( cursorHoldsMutex(pCur) );

	4537 assert( pCur->eState==CURSOR_VALID );

	4538 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){

	4539 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );

	4540 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));

	4541 rc = moveToChild(pCur, pgno);

	4542 }

	4543 return rc;

	4544 }

	4545

	4546 /*

	4547 ** Move the cursor down to the right-most leaf entry beneath the

	4548 ** page to which it is currently pointing. Notice the difference

	4549 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()

	4550 ** finds the left-most entry beneath the entry whereas moveToRightmost()

	4551 ** finds the right-most entry beneath the page.

	4552 **

	4553 ** The right-most entry is the one with the largest key - the last

	4554 ** key in ascending order.

	4555 */

	4556 static int moveToRightmost(BtCursor *pCur){

	4557 Pgno pgno;

	4558 int rc = SQLITE_OK;

	4559 MemPage *pPage = 0;

	4560

	4561 assert( cursorHoldsMutex(pCur) );

	4562 assert( pCur->eState==CURSOR_VALID );

	4563 while( !(pPage = pCur->apPage[pCur->iPage])->leaf ){

	4564 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	4565 pCur->aiIdx[pCur->iPage] = pPage->nCell;

	4566 rc = moveToChild(pCur, pgno);

	4567 if( rc ) return rc;

	4568 }

	4569 pCur->aiIdx[pCur->iPage] = pPage->nCell-1;

	4570 assert( pCur->info.nSize==0 );

	4571 assert( (pCur->curFlags & BTCF_ValidNKey)==0 );

	4572 return SQLITE_OK;

	4573 }

	4574

	4575 /* Move the cursor to the first entry in the table. Return SQLITE_OK

	4576 ** on success. Set *pRes to 0 if the cursor actually points to something

	4577 ** or set *pRes to 1 if the table is empty.

	4578 */

	4579 int sqlite3BtreeFirst(BtCursor pCur, int pRes){

	4580 int rc;

	4581

	4582 assert( cursorHoldsMutex(pCur) );

	4583 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

	4584 rc = moveToRoot(pCur);

	4585 if( rc==SQLITE_OK ){

	4586 if( pCur->eState==CURSOR_INVALID ){

	4587 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->nCell==0 );

	4588 *pRes = 1;

	4589 }else{

	4590 assert( pCur->apPage[pCur->iPage]->nCell>0 );

	4591 *pRes = 0;

	4592 rc = moveToLeftmost(pCur);

	4593 }

	4594 }

	4595 return rc;

	4596 }

	4597

	4598 /* Move the cursor to the last entry in the table. Return SQLITE_OK

	4599 ** on success. Set *pRes to 0 if the cursor actually points to something

	4600 ** or set *pRes to 1 if the table is empty.

	4601 */

	4602 int sqlite3BtreeLast(BtCursor pCur, int pRes){

	4603 int rc;

	4604

	4605 assert( cursorHoldsMutex(pCur) );

	4606 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

	4607

	4608 /* If the cursor already points to the last entry, this is a no-op. */

	4609 if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){

	4610 #ifdef SQLITE_DEBUG

	4611 /* This block serves to assert() that the cursor really does point

	4612 ** to the last entry in the b-tree. */

	4613 int ii;

	4614 for(ii=0; ii<pCur->iPage; ii++){

	4615 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );

	4616 }

	4617 assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );

	4618 assert( pCur->apPage[pCur->iPage]->leaf );

	4619 #endif

	4620 return SQLITE_OK;

	4621 }

	4622

	4623 rc = moveToRoot(pCur);

	4624 if( rc==SQLITE_OK ){

	4625 if( CURSOR_INVALID==pCur->eState ){

	4626 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->nCell==0 );

	4627 *pRes = 1;

	4628 }else{

	4629 assert( pCur->eState==CURSOR_VALID );

	4630 *pRes = 0;

	4631 rc = moveToRightmost(pCur);

	4632 if( rc==SQLITE_OK ){

	4633 pCur->curFlags \|= BTCF_AtLast;

	4634 }else{

	4635 pCur->curFlags &= ~BTCF_AtLast;

	4636 }

	4637

	4638 }

	4639 }

	4640 return rc;

	4641 }

	4642

	4643 /* Move the cursor so that it points to an entry near the key

	4644 ** specified by pIdxKey or intKey. Return a success code.

	4645 **

	4646 ** For INTKEY tables, the intKey parameter is used. pIdxKey

	4647 ** must be NULL. For index tables, pIdxKey is used and intKey

	4648 ** is ignored.

	4649 **

	4650 ** If an exact match is not found, then the cursor is always

	4651 ** left pointing at a leaf page which would hold the entry if it

	4652 ** were present. The cursor might point to an entry that comes

	4653 ** before or after the key.

	4654 **

	4655 ** An integer is written into *pRes which is the result of

	4656 ** comparing the key with the entry to which the cursor is

	4657 ** pointing. The meaning of the integer written into

	4658 ** *pRes is as follows:

	4659 **

	4660 ** *pRes<0 The cursor is left pointing at an entry that

	4661 ** is smaller than intKey/pIdxKey or if the table is empty

	4662 ** and the cursor is therefore left point to nothing.

	4663 **

	4664 ** *pRes==0 The cursor is left pointing at an entry that

	4665 ** exactly matches intKey/pIdxKey.

	4666 **

	4667 ** *pRes>0 The cursor is left pointing at an entry that

	4668 ** is larger than intKey/pIdxKey.

	4669 **

	4670 */

	4671 int sqlite3BtreeMovetoUnpacked(

	4672 BtCursor pCur, / The cursor to be moved */

	4673 UnpackedRecord pIdxKey, / Unpacked index key */

	4674 i64 intKey, /* The table key */

	4675 int biasRight, /* If true, bias the search to the high end */

	4676 int pRes / Write search results here */

	4677 ){

	4678 int rc;

	4679 RecordCompare xRecordCompare;

	4680

	4681 assert( cursorHoldsMutex(pCur) );

	4682 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

	4683 assert( pRes );

	4684 assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );

	4685

	4686 /* If the cursor is already positioned at the point we are trying

	4687 ** to move to, then just return without doing any work */

	4688 if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0

	4689 && pCur->apPage[0]->intKey

	4690 ){

	4691 if( pCur->info.nKey==intKey ){

	4692 *pRes = 0;

	4693 return SQLITE_OK;

	4694 }

	4695 if( (pCur->curFlags & BTCF_AtLast)!=0 && pCur->info.nKey<intKey ){

	4696 *pRes = -1;

	4697 return SQLITE_OK;

	4698 }

	4699 }

	4700

	4701 if( pIdxKey ){

	4702 xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);

	4703 pIdxKey->errCode = 0;

	4704 assert( pIdxKey->default_rc==1

	4705 \|\| pIdxKey->default_rc==0

	4706 \|\| pIdxKey->default_rc==-1

	4707 );

	4708 }else{

	4709 xRecordCompare = 0; /* All keys are integers */

	4710 }

	4711

	4712 rc = moveToRoot(pCur);

	4713 if( rc ){

	4714 return rc;

	4715 }

	4716 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage] );

	4717 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->isInit );

	4718 assert( pCur->eState==CURSOR_INVALID \|\| pCur->apPage[pCur->iPage]->nCell>0 );

	4719 if( pCur->eState==CURSOR_INVALID ){

	4720 *pRes = -1;

	4721 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->nCell==0 );

	4722 return SQLITE_OK;

	4723 }

	4724 assert( pCur->apPage[0]->intKey \|\| pIdxKey );

	4725 for(;;){

	4726 int lwr, upr, idx, c;

	4727 Pgno chldPg;

	4728 MemPage *pPage = pCur->apPage[pCur->iPage];

	4729 u8 pCell; / Pointer to current cell in pPage */

	4730

	4731 /* pPage->nCell must be greater than zero. If this is the root-page

	4732 ** the cursor would have been INVALID above and this for(;;) loop

	4733 ** not run. If this is not the root-page, then the moveToChild() routine

	4734 ** would have already detected db corruption. Similarly, pPage must

	4735 ** be the right kind (index or table) of b-tree page. Otherwise

	4736 ** a moveToChild() or moveToRoot() call would have detected corruption. */

	4737 assert( pPage->nCell>0 );

	4738 assert( pPage->intKey==(pIdxKey==0) );

	4739 lwr = 0;

	4740 upr = pPage->nCell-1;

	4741 assert( biasRight==0 \|\| biasRight==1 );

	4742 idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */

	4743 pCur->aiIdx[pCur->iPage] = (u16)idx;

	4744 if( xRecordCompare==0 ){

	4745 for(;;){

	4746 i64 nCellKey;

	4747 pCell = findCell(pPage, idx) + pPage->childPtrSize;

	4748 if( pPage->intKeyLeaf ){

	4749 while( 0x80 <= *(pCell++) ){

	4750 if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;

	4751 }

	4752 }

	4753 getVarint(pCell, (u64*)&nCellKey);

	4754 if( nCellKey<intKey ){

	4755 lwr = idx+1;

	4756 if( lwr>upr ){ c = -1; break; }

	4757 }else if( nCellKey>intKey ){

	4758 upr = idx-1;

	4759 if( lwr>upr ){ c = +1; break; }

	4760 }else{

	4761 assert( nCellKey==intKey );

	4762 pCur->curFlags \|= BTCF_ValidNKey;

	4763 pCur->info.nKey = nCellKey;

	4764 pCur->aiIdx[pCur->iPage] = (u16)idx;

	4765 if( !pPage->leaf ){

	4766 lwr = idx;

	4767 goto moveto_next_layer;

	4768 }else{

	4769 *pRes = 0;

	4770 rc = SQLITE_OK;

	4771 goto moveto_finish;

	4772 }

	4773 }

	4774 assert( lwr+upr>=0 );

	4775 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */

	4776 }

	4777 }else{

	4778 for(;;){

	4779 int nCell;

	4780 pCell = findCell(pPage, idx) + pPage->childPtrSize;

	4781

	4782 /* The maximum supported page-size is 65536 bytes. This means that

	4783 ** the maximum number of record bytes stored on an index B-Tree

	4784 ** page is less than 16384 bytes and may be stored as a 2-byte

	4785 ** varint. This information is used to attempt to avoid parsing

	4786 ** the entire cell by checking for the cases where the record is

	4787 ** stored entirely within the b-tree page by inspecting the first

	4788 ** 2 bytes of the cell.

	4789 */

	4790 nCell = pCell[0];

	4791 if( nCell<=pPage->max1bytePayload ){

	4792 /* This branch runs if the record-size field of the cell is a

	4793 ** single byte varint and the record fits entirely on the main

	4794 ** b-tree page. */

	4795 testcase( pCell+nCell+1==pPage->aDataEnd );

	4796 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);

	4797 }else if( !(pCell[1] & 0x80)

	4798 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal

	4799 ){

	4800 /* The record-size field is a 2 byte varint and the record

	4801 ** fits entirely on the main b-tree page. */

	4802 testcase( pCell+nCell+2==pPage->aDataEnd );

	4803 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);

	4804 }else{

	4805 /* The record flows over onto one or more overflow pages. In

	4806 ** this case the whole cell needs to be parsed, a buffer allocated

	4807 ** and accessPayload() used to retrieve the record into the

	4808 ** buffer before VdbeRecordCompare() can be called. */

	4809 void *pCellKey;

	4810 u8 * const pCellBody = pCell - pPage->childPtrSize;

	4811 btreeParseCellPtr(pPage, pCellBody, &pCur->info);

	4812 nCell = (int)pCur->info.nKey;

	4813 pCellKey = sqlite3Malloc( nCell );

	4814 if( pCellKey==0 ){

	4815 rc = SQLITE_NOMEM;

	4816 goto moveto_finish;

	4817 }

	4818 pCur->aiIdx[pCur->iPage] = (u16)idx;

	4819 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 2);

	4820 if( rc ){

	4821 sqlite3_free(pCellKey);

	4822 goto moveto_finish;

	4823 }

	4824 c = xRecordCompare(nCell, pCellKey, pIdxKey);

	4825 sqlite3_free(pCellKey);

	4826 }

	4827 assert(

	4828 (pIdxKey->errCode!=SQLITE_CORRUPT \|\| c==0)

	4829 && (pIdxKey->errCode!=SQLITE_NOMEM \|\| pCur->pBtree->db->mallocFailed)

	4830 );

	4831 if( c<0 ){

	4832 lwr = idx+1;

	4833 }else if( c>0 ){

	4834 upr = idx-1;

	4835 }else{

	4836 assert( c==0 );

	4837 *pRes = 0;

	4838 rc = SQLITE_OK;

	4839 pCur->aiIdx[pCur->iPage] = (u16)idx;

	4840 if( pIdxKey->errCode ) rc = SQLITE_CORRUPT;

	4841 goto moveto_finish;

	4842 }

	4843 if( lwr>upr ) break;

	4844 assert( lwr+upr>=0 );

	4845 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */

	4846 }

	4847 }

	4848 assert( lwr==upr+1 \|\| (pPage->intKey && !pPage->leaf) );

	4849 assert( pPage->isInit );

	4850 if( pPage->leaf ){

	4851 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	4852 pCur->aiIdx[pCur->iPage] = (u16)idx;

	4853 *pRes = c;

	4854 rc = SQLITE_OK;

	4855 goto moveto_finish;

	4856 }

	4857 moveto_next_layer:

	4858 if( lwr>=pPage->nCell ){

	4859 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	4860 }else{

	4861 chldPg = get4byte(findCell(pPage, lwr));

	4862 }

	4863 pCur->aiIdx[pCur->iPage] = (u16)lwr;

	4864 rc = moveToChild(pCur, chldPg);

	4865 if( rc ) break;

	4866 }

	4867 moveto_finish:

	4868 pCur->info.nSize = 0;

	4869 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

	4870 return rc;

	4871 }

	4872

	4873

	4874 /*

	4875 ** Return TRUE if the cursor is not pointing at an entry of the table.

	4876 **

	4877 ** TRUE will be returned after a call to sqlite3BtreeNext() moves

	4878 ** past the last entry in the table or sqlite3BtreePrev() moves past

	4879 ** the first entry. TRUE is also returned if the table is empty.

	4880 */

	4881 int sqlite3BtreeEof(BtCursor *pCur){

	4882 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries

	4883 ** have been deleted? This API will need to change to return an error code

	4884 ** as well as the boolean result value.

	4885 */

	4886 return (CURSOR_VALID!=pCur->eState);

	4887 }

	4888

	4889 /*

	4890 ** Advance the cursor to the next entry in the database. If

	4891 ** successful then set *pRes=0. If the cursor

	4892 ** was already pointing to the last entry in the database before

	4893 ** this routine was called, then set *pRes=1.

	4894 **

	4895 ** The main entry point is sqlite3BtreeNext(). That routine is optimized

	4896 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx

	4897 ** to the next cell on the current page. The (slower) btreeNext() helper

	4898 ** routine is called when it is necessary to move to a different page or

	4899 ** to restore the cursor.

	4900 **

	4901 ** The calling function will set pRes to 0 or 1. The initial pRes value

	4902 ** will be 1 if the cursor being stepped corresponds to an SQL index and

	4903 ** if this routine could have been skipped if that SQL index had been

	4904 ** a unique index. Otherwise the caller will have set *pRes to zero.

	4905 ** Zero is the common case. The btree implementation is free to use the

	4906 ** initial *pRes value as a hint to improve performance, but the current

	4907 ** SQLite btree implementation does not. (Note that the comdb2 btree

	4908 ** implementation does use this hint, however.)

	4909 */

	4910 static SQLITE_NOINLINE int btreeNext(BtCursor pCur, int pRes){

	4911 int rc;

	4912 int idx;

	4913 MemPage *pPage;

	4914

	4915 assert( cursorHoldsMutex(pCur) );

	4916 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

	4917 assert( *pRes==0 );

	4918 if( pCur->eState!=CURSOR_VALID ){

	4919 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );

	4920 rc = restoreCursorPosition(pCur);

	4921 if( rc!=SQLITE_OK ){

	4922 return rc;

	4923 }

	4924 if( CURSOR_INVALID==pCur->eState ){

	4925 *pRes = 1;

	4926 return SQLITE_OK;

	4927 }

	4928 if( pCur->skipNext ){

	4929 assert( pCur->eState==CURSOR_VALID \|\| pCur->eState==CURSOR_SKIPNEXT );

	4930 pCur->eState = CURSOR_VALID;

	4931 if( pCur->skipNext>0 ){

	4932 pCur->skipNext = 0;

	4933 return SQLITE_OK;

	4934 }

	4935 pCur->skipNext = 0;

	4936 }

	4937 }

	4938

	4939 pPage = pCur->apPage[pCur->iPage];

	4940 idx = ++pCur->aiIdx[pCur->iPage];

	4941 assert( pPage->isInit );

	4942

	4943 /* If the database file is corrupt, it is possible for the value of idx

	4944 ** to be invalid here. This can only occur if a second cursor modifies

	4945 ** the page while cursor pCur is holding a reference to it. Which can

	4946 ** only happen if the database is corrupt in such a way as to link the

	4947 ** page into more than one b-tree structure. */

	4948 testcase( idx>pPage->nCell );

	4949

	4950 if( idx>=pPage->nCell ){

	4951 if( !pPage->leaf ){

	4952 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));

	4953 if( rc ) return rc;

	4954 return moveToLeftmost(pCur);

	4955 }

	4956 do{

	4957 if( pCur->iPage==0 ){

	4958 *pRes = 1;

	4959 pCur->eState = CURSOR_INVALID;

	4960 return SQLITE_OK;

	4961 }

	4962 moveToParent(pCur);

	4963 pPage = pCur->apPage[pCur->iPage];

	4964 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );

	4965 if( pPage->intKey ){

	4966 return sqlite3BtreeNext(pCur, pRes);

	4967 }else{

	4968 return SQLITE_OK;

	4969 }

	4970 }

	4971 if( pPage->leaf ){

	4972 return SQLITE_OK;

	4973 }else{

	4974 return moveToLeftmost(pCur);

	4975 }

	4976 }

	4977 int sqlite3BtreeNext(BtCursor pCur, int pRes){

	4978 MemPage *pPage;

	4979 assert( cursorHoldsMutex(pCur) );

	4980 assert( pRes!=0 );

	4981 assert( pRes==0 \|\| pRes==1 );

	4982 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

	4983 pCur->info.nSize = 0;

	4984 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

	4985 *pRes = 0;

	4986 if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);

	4987 pPage = pCur->apPage[pCur->iPage];

	4988 if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){

	4989 pCur->aiIdx[pCur->iPage]--;

	4990 return btreeNext(pCur, pRes);

	4991 }

	4992 if( pPage->leaf ){

	4993 return SQLITE_OK;

	4994 }else{

	4995 return moveToLeftmost(pCur);

	4996 }

	4997 }

	4998

	4999 /*

	5000 ** Step the cursor to the back to the previous entry in the database. If

	5001 ** successful then set *pRes=0. If the cursor

	5002 ** was already pointing to the first entry in the database before

	5003 ** this routine was called, then set *pRes=1.

	5004 **

	5005 ** The main entry point is sqlite3BtreePrevious(). That routine is optimized

	5006 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx

	5007 ** to the previous cell on the current page. The (slower) btreePrevious()

	5008 ** helper routine is called when it is necessary to move to a different page

	5009 ** or to restore the cursor.

	5010 **

	5011 ** The calling function will set pRes to 0 or 1. The initial pRes value

	5012 ** will be 1 if the cursor being stepped corresponds to an SQL index and

	5013 ** if this routine could have been skipped if that SQL index had been

	5014 ** a unique index. Otherwise the caller will have set *pRes to zero.

	5015 ** Zero is the common case. The btree implementation is free to use the

	5016 ** initial *pRes value as a hint to improve performance, but the current

	5017 ** SQLite btree implementation does not. (Note that the comdb2 btree

	5018 ** implementation does use this hint, however.)

	5019 */

	5020 static SQLITE_NOINLINE int btreePrevious(BtCursor pCur, int pRes){

	5021 int rc;

	5022 MemPage *pPage;

	5023

	5024 assert( cursorHoldsMutex(pCur) );

	5025 assert( pRes!=0 );

	5026 assert( *pRes==0 );

	5027 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

	5028 assert( (pCur->curFlags & (BTCF_AtLast\|BTCF_ValidOvfl\|BTCF_ValidNKey))==0 );

	5029 assert( pCur->info.nSize==0 );

	5030 if( pCur->eState!=CURSOR_VALID ){

	5031 rc = restoreCursorPosition(pCur);

	5032 if( rc!=SQLITE_OK ){

	5033 return rc;

	5034 }

	5035 if( CURSOR_INVALID==pCur->eState ){

	5036 *pRes = 1;

	5037 return SQLITE_OK;

	5038 }

	5039 if( pCur->skipNext ){

	5040 assert( pCur->eState==CURSOR_VALID \|\| pCur->eState==CURSOR_SKIPNEXT );

	5041 pCur->eState = CURSOR_VALID;

	5042 if( pCur->skipNext<0 ){

	5043 pCur->skipNext = 0;

	5044 return SQLITE_OK;

	5045 }

	5046 pCur->skipNext = 0;

	5047 }

	5048 }

	5049

	5050 pPage = pCur->apPage[pCur->iPage];

	5051 assert( pPage->isInit );

	5052 if( !pPage->leaf ){

	5053 int idx = pCur->aiIdx[pCur->iPage];

	5054 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));

	5055 if( rc ) return rc;

	5056 rc = moveToRightmost(pCur);

	5057 }else{

	5058 while( pCur->aiIdx[pCur->iPage]==0 ){

	5059 if( pCur->iPage==0 ){

	5060 pCur->eState = CURSOR_INVALID;

	5061 *pRes = 1;

	5062 return SQLITE_OK;

	5063 }

	5064 moveToParent(pCur);

	5065 }

	5066 assert( pCur->info.nSize==0 );

	5067 assert( (pCur->curFlags & (BTCF_ValidNKey\|BTCF_ValidOvfl))==0 );

	5068

	5069 pCur->aiIdx[pCur->iPage]--;

	5070 pPage = pCur->apPage[pCur->iPage];

	5071 if( pPage->intKey && !pPage->leaf ){

	5072 rc = sqlite3BtreePrevious(pCur, pRes);

	5073 }else{

	5074 rc = SQLITE_OK;

	5075 }

	5076 }

	5077 return rc;

	5078 }

	5079 int sqlite3BtreePrevious(BtCursor pCur, int pRes){

	5080 assert( cursorHoldsMutex(pCur) );

	5081 assert( pRes!=0 );

	5082 assert( pRes==0 \|\| pRes==1 );

	5083 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

	5084 *pRes = 0;

	5085 pCur->curFlags &= ~(BTCF_AtLast\|BTCF_ValidOvfl\|BTCF_ValidNKey);

	5086 pCur->info.nSize = 0;

	5087 if( pCur->eState!=CURSOR_VALID

	5088 \|\| pCur->aiIdx[pCur->iPage]==0

	5089 \|\| pCur->apPage[pCur->iPage]->leaf==0

	5090 ){

	5091 return btreePrevious(pCur, pRes);

	5092 }

	5093 pCur->aiIdx[pCur->iPage]--;

	5094 return SQLITE_OK;

	5095 }

	5096

	5097 /*

	5098 ** Allocate a new page from the database file.

	5099 **

	5100 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()

	5101 ** has already been called on the new page.) The new page has also

	5102 ** been referenced and the calling routine is responsible for calling

	5103 ** sqlite3PagerUnref() on the new page when it is done.

	5104 **

	5105 ** SQLITE_OK is returned on success. Any other return value indicates

	5106 ** an error. ppPage and pPgno are undefined in the event of an error.

	5107 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.

	5108 **

	5109 ** If the "nearby" parameter is not 0, then an effort is made to

	5110 ** locate a page close to the page number "nearby". This can be used in an

	5111 ** attempt to keep related pages close to each other in the database file,

	5112 ** which in turn can make database access faster.

	5113 **

	5114 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists

	5115 ** anywhere on the free-list, then it is guaranteed to be returned. If

	5116 ** eMode is BTALLOC_LE then the page returned will be less than or equal

	5117 ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there

	5118 ** are no restrictions on which page is returned.

	5119 */

	5120 static int allocateBtreePage(

	5121 BtShared pBt, / The btree */

	5122 MemPage *ppPage, / Store pointer to the allocated page here */

	5123 Pgno pPgno, / Store the page number here */

	5124 Pgno nearby, /* Search for a page near this one */

	5125 u8 eMode /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */

	5126 ){

	5127 MemPage *pPage1;

	5128 int rc;

	5129 u32 n; /* Number of pages on the freelist */

	5130 u32 k; /* Number of leaves on the trunk of the freelist */

	5131 MemPage *pTrunk = 0;

	5132 MemPage *pPrevTrunk = 0;

	5133 Pgno mxPage; /* Total size of the database file */

	5134

	5135 assert( sqlite3_mutex_held(pBt->mutex) );

	5136 assert( eMode==BTALLOC_ANY \|\| (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );

	5137 pPage1 = pBt->pPage1;

	5138 mxPage = btreePagecount(pBt);

	5139 n = get4byte(&pPage1->aData[36]);

	5140 testcase( n==mxPage-1 );

	5141 if( n>=mxPage ){

	5142 return SQLITE_CORRUPT_BKPT;

	5143 }

	5144 if( n>0 ){

	5145 /* There are pages on the freelist. Reuse one of those pages. */

	5146 Pgno iTrunk;

	5147 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */

	5148

	5149 /* If eMode==BTALLOC_EXACT and a query of the pointer-map

	5150 ** shows that the page 'nearby' is somewhere on the free-list, then

	5151 ** the entire-list will be searched for that page.

	5152 */

	5153 #ifndef SQLITE_OMIT_AUTOVACUUM

	5154 if( eMode==BTALLOC_EXACT ){

	5155 if( nearby<=mxPage ){

	5156 u8 eType;

	5157 assert( nearby>0 );

	5158 assert( pBt->autoVacuum );

	5159 rc = ptrmapGet(pBt, nearby, &eType, 0);

	5160 if( rc ) return rc;

	5161 if( eType==PTRMAP_FREEPAGE ){

	5162 searchList = 1;

	5163 }

	5164 }

	5165 }else if( eMode==BTALLOC_LE ){

	5166 searchList = 1;

	5167 }

	5168 #endif

	5169

	5170 /* Decrement the free-list count by 1. Set iTrunk to the index of the

	5171 ** first free-list trunk page. iPrevTrunk is initially 1.

	5172 */

	5173 rc = sqlite3PagerWrite(pPage1->pDbPage);

	5174 if( rc ) return rc;

	5175 put4byte(&pPage1->aData[36], n-1);

	5176

	5177 /* The code within this loop is run only once if the 'searchList' variable

	5178 ** is not true. Otherwise, it runs once for each trunk-page on the

	5179 ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)

	5180 ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)

	5181 */

	5182 do {

	5183 pPrevTrunk = pTrunk;

	5184 if( pPrevTrunk ){

	5185 iTrunk = get4byte(&pPrevTrunk->aData[0]);

	5186 }else{

	5187 iTrunk = get4byte(&pPage1->aData[32]);

	5188 }

	5189 testcase( iTrunk==mxPage );

	5190 if( iTrunk>mxPage ){

	5191 rc = SQLITE_CORRUPT_BKPT;

	5192 }else{

	5193 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);

	5194 }

	5195 if( rc ){

	5196 pTrunk = 0;

	5197 goto end_allocate_page;

	5198 }

	5199 assert( pTrunk!=0 );

	5200 assert( pTrunk->aData!=0 );

	5201

	5202 k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */

	5203 if( k==0 && !searchList ){

	5204 /* The trunk has no leaves and the list is not being searched.

	5205 ** So extract the trunk page itself and use it as the newly

	5206 ** allocated page */

	5207 assert( pPrevTrunk==0 );

	5208 rc = sqlite3PagerWrite(pTrunk->pDbPage);

	5209 if( rc ){

	5210 goto end_allocate_page;

	5211 }

	5212 *pPgno = iTrunk;

	5213 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);

	5214 *ppPage = pTrunk;

	5215 pTrunk = 0;

	5216 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));

	5217 }else if( k>(u32)(pBt->usableSize/4 - 2) ){

	5218 /* Value of k is out of range. Database corruption */

	5219 rc = SQLITE_CORRUPT_BKPT;

	5220 goto end_allocate_page;

	5221 #ifndef SQLITE_OMIT_AUTOVACUUM

	5222 }else if( searchList

	5223 && (nearby==iTrunk \|\| (iTrunk<nearby && eMode==BTALLOC_LE))

	5224 ){

	5225 /* The list is being searched and this trunk page is the page

	5226 ** to allocate, regardless of whether it has leaves.

	5227 */

	5228 *pPgno = iTrunk;

	5229 *ppPage = pTrunk;

	5230 searchList = 0;

	5231 rc = sqlite3PagerWrite(pTrunk->pDbPage);

	5232 if( rc ){

	5233 goto end_allocate_page;

	5234 }

	5235 if( k==0 ){

	5236 if( !pPrevTrunk ){

	5237 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);

	5238 }else{

	5239 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);

	5240 if( rc!=SQLITE_OK ){

	5241 goto end_allocate_page;

	5242 }

	5243 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);

	5244 }

	5245 }else{

	5246 /* The trunk page is required by the caller but it contains

	5247 ** pointers to free-list leaves. The first leaf becomes a trunk

	5248 ** page in this case.

	5249 */

	5250 MemPage *pNewTrunk;

	5251 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);

	5252 if( iNewTrunk>mxPage ){

	5253 rc = SQLITE_CORRUPT_BKPT;

	5254 goto end_allocate_page;

	5255 }

	5256 testcase( iNewTrunk==mxPage );

	5257 rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);

	5258 if( rc!=SQLITE_OK ){

	5259 goto end_allocate_page;

	5260 }

	5261 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);

	5262 if( rc!=SQLITE_OK ){

	5263 releasePage(pNewTrunk);

	5264 goto end_allocate_page;

	5265 }

	5266 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);

	5267 put4byte(&pNewTrunk->aData[4], k-1);

	5268 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);

	5269 releasePage(pNewTrunk);

	5270 if( !pPrevTrunk ){

	5271 assert( sqlite3PagerIswriteable(pPage1->pDbPage) );

	5272 put4byte(&pPage1->aData[32], iNewTrunk);

	5273 }else{

	5274 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);

	5275 if( rc ){

	5276 goto end_allocate_page;

	5277 }

	5278 put4byte(&pPrevTrunk->aData[0], iNewTrunk);

	5279 }

	5280 }

	5281 pTrunk = 0;

	5282 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));

	5283 #endif

	5284 }else if( k>0 ){

	5285 /* Extract a leaf from the trunk */

	5286 u32 closest;

	5287 Pgno iPage;

	5288 unsigned char *aData = pTrunk->aData;

	5289 if( nearby>0 ){

	5290 u32 i;

	5291 closest = 0;

	5292 if( eMode==BTALLOC_LE ){

	5293 for(i=0; i<k; i++){

	5294 iPage = get4byte(&aData[8+i*4]);

	5295 if( iPage<=nearby ){

	5296 closest = i;

	5297 break;

	5298 }

	5299 }

	5300 }else{

	5301 int dist;

	5302 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);

	5303 for(i=1; i<k; i++){

	5304 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);

	5305 if( d2<dist ){

	5306 closest = i;

	5307 dist = d2;

	5308 }

	5309 }

	5310 }

	5311 }else{

	5312 closest = 0;

	5313 }

	5314

	5315 iPage = get4byte(&aData[8+closest*4]);

	5316 testcase( iPage==mxPage );

	5317 if( iPage>mxPage ){

	5318 rc = SQLITE_CORRUPT_BKPT;

	5319 goto end_allocate_page;

	5320 }

	5321 testcase( iPage==mxPage );

	5322 if( !searchList

	5323 \|\| (iPage==nearby \|\| (iPage<nearby && eMode==BTALLOC_LE))

	5324 ){

	5325 int noContent;

	5326 *pPgno = iPage;

	5327 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"

	5328 ": %d more free pages\n",

	5329 *pPgno, closest+1, k, pTrunk->pgno, n-1));

	5330 rc = sqlite3PagerWrite(pTrunk->pDbPage);

	5331 if( rc ) goto end_allocate_page;

	5332 if( closest<k-1 ){

	5333 memcpy(&aData[8+closest4], &aData[4+k4], 4);

	5334 }

	5335 put4byte(&aData[4], k-1);

	5336 noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;

	5337 rc = btreeGetPage(pBt, *pPgno, ppPage, noContent);

	5338 if( rc==SQLITE_OK ){

	5339 rc = sqlite3PagerWrite((*ppPage)->pDbPage);

	5340 if( rc!=SQLITE_OK ){

	5341 releasePage(*ppPage);

	5342 }

	5343 }

	5344 searchList = 0;

	5345 }

	5346 }

	5347 releasePage(pPrevTrunk);

	5348 pPrevTrunk = 0;

	5349 }while( searchList );

	5350 }else{

	5351 /* There are no pages on the freelist, so append a new page to the

	5352 ** database image.

	5353 **

	5354 ** Normally, new pages allocated by this block can be requested from the

	5355 ** pager layer with the 'no-content' flag set. This prevents the pager

	5356 ** from trying to read the pages content from disk. However, if the

	5357 ** current transaction has already run one or more incremental-vacuum

	5358 ** steps, then the page we are about to allocate may contain content

	5359 ** that is required in the event of a rollback. In this case, do

	5360 ** not set the no-content flag. This causes the pager to load and journal

	5361 ** the current page content before overwriting it.

	5362 **

	5363 ** Note that the pager will not actually attempt to load or journal

	5364 ** content for any page that really does lie past the end of the database

	5365 ** file on disk. So the effects of disabling the no-content optimization

	5366 ** here are confined to those pages that lie between the end of the

	5367 ** database image and the end of the database file.

	5368 */

	5369 int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;

	5370

	5371 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	5372 if( rc ) return rc;

	5373 pBt->nPage++;

	5374 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;

	5375

	5376 #ifndef SQLITE_OMIT_AUTOVACUUM

	5377 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){

	5378 /* If *pPgno refers to a pointer-map page, allocate two new pages

	5379 ** at the end of the file instead of one. The first allocated page

	5380 ** becomes a new pointer-map page, the second is used by the caller.

	5381 */

	5382 MemPage *pPg = 0;

	5383 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));

	5384 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );

	5385 rc = btreeGetPage(pBt, pBt->nPage, &pPg, bNoContent);

	5386 if( rc==SQLITE_OK ){

	5387 rc = sqlite3PagerWrite(pPg->pDbPage);

	5388 releasePage(pPg);

	5389 }

	5390 if( rc ) return rc;

	5391 pBt->nPage++;

	5392 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }

	5393 }

	5394 #endif

	5395 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);

	5396 *pPgno = pBt->nPage;

	5397

	5398 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );

	5399 rc = btreeGetPage(pBt, *pPgno, ppPage, bNoContent);

	5400 if( rc ) return rc;

	5401 rc = sqlite3PagerWrite((*ppPage)->pDbPage);

	5402 if( rc!=SQLITE_OK ){

	5403 releasePage(*ppPage);

	5404 }

	5405 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));

	5406 }

	5407

	5408 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );

	5409

	5410 end_allocate_page:

	5411 releasePage(pTrunk);

	5412 releasePage(pPrevTrunk);

	5413 if( rc==SQLITE_OK ){

	5414 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){

	5415 releasePage(*ppPage);

	5416 *ppPage = 0;

	5417 return SQLITE_CORRUPT_BKPT;

	5418 }

	5419 (*ppPage)->isInit = 0;

	5420 }else{

	5421 *ppPage = 0;

	5422 }

	5423 assert( rc!=SQLITE_OK \|\| sqlite3PagerIswriteable((*ppPage)->pDbPage) );

	5424 return rc;

	5425 }

	5426

	5427 /*

	5428 ** This function is used to add page iPage to the database file free-list.

	5429 ** It is assumed that the page is not already a part of the free-list.

	5430 **

	5431 ** The value passed as the second argument to this function is optional.

	5432 ** If the caller happens to have a pointer to the MemPage object

	5433 ** corresponding to page iPage handy, it may pass it as the second value.

	5434 ** Otherwise, it may pass NULL.

	5435 **

	5436 ** If a pointer to a MemPage object is passed as the second argument,

	5437 ** its reference count is not altered by this function.

	5438 */

	5439 static int freePage2(BtShared pBt, MemPage pMemPage, Pgno iPage){

	5440 MemPage pTrunk = 0; / Free-list trunk page */

	5441 Pgno iTrunk = 0; /* Page number of free-list trunk page */

	5442 MemPage pPage1 = pBt->pPage1; / Local reference to page 1 */

	5443 MemPage pPage; / Page being freed. May be NULL. */

	5444 int rc; /* Return Code */

	5445 int nFree; /* Initial number of pages on free-list */

	5446

	5447 assert( sqlite3_mutex_held(pBt->mutex) );

	5448 assert( iPage>1 );

	5449 assert( !pMemPage \|\| pMemPage->pgno==iPage );

	5450

	5451 if( pMemPage ){

	5452 pPage = pMemPage;

	5453 sqlite3PagerRef(pPage->pDbPage);

	5454 }else{

	5455 pPage = btreePageLookup(pBt, iPage);

	5456 }

	5457

	5458 /* Increment the free page count on pPage1 */

	5459 rc = sqlite3PagerWrite(pPage1->pDbPage);

	5460 if( rc ) goto freepage_out;

	5461 nFree = get4byte(&pPage1->aData[36]);

	5462 put4byte(&pPage1->aData[36], nFree+1);

	5463

	5464 if( pBt->btsFlags & BTS_SECURE_DELETE ){

	5465 /* If the secure_delete option is enabled, then

	5466 ** always fully overwrite deleted information with zeros.

	5467 */

	5468 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )

	5469 \|\| ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)

	5470 ){

	5471 goto freepage_out;

	5472 }

	5473 memset(pPage->aData, 0, pPage->pBt->pageSize);

	5474 }

	5475

	5476 /* If the database supports auto-vacuum, write an entry in the pointer-map

	5477 ** to indicate that the page is free.

	5478 */

	5479 if( ISAUTOVACUUM ){

	5480 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);

	5481 if( rc ) goto freepage_out;

	5482 }

	5483

	5484 /* Now manipulate the actual database free-list structure. There are two

	5485 ** possibilities. If the free-list is currently empty, or if the first

	5486 ** trunk page in the free-list is full, then this page will become a

	5487 ** new free-list trunk page. Otherwise, it will become a leaf of the

	5488 ** first trunk page in the current free-list. This block tests if it

	5489 ** is possible to add the page as a new free-list leaf.

	5490 */

	5491 if( nFree!=0 ){

	5492 u32 nLeaf; /* Initial number of leaf cells on trunk page */

	5493

	5494 iTrunk = get4byte(&pPage1->aData[32]);

	5495 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);

	5496 if( rc!=SQLITE_OK ){

	5497 goto freepage_out;

	5498 }

	5499

	5500 nLeaf = get4byte(&pTrunk->aData[4]);

	5501 assert( pBt->usableSize>32 );

	5502 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){

	5503 rc = SQLITE_CORRUPT_BKPT;

	5504 goto freepage_out;

	5505 }

	5506 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){

	5507 /* In this case there is room on the trunk page to insert the page

	5508 ** being freed as a new leaf.

	5509 **

	5510 ** Note that the trunk page is not really full until it contains

	5511 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have

	5512 ** coded. But due to a coding error in versions of SQLite prior to

	5513 ** 3.6.0, databases with freelist trunk pages holding more than

	5514 ** usableSize/4 - 8 entries will be reported as corrupt. In order

	5515 ** to maintain backwards compatibility with older versions of SQLite,

	5516 ** we will continue to restrict the number of entries to usableSize/4 - 8

	5517 ** for now. At some point in the future (once everyone has upgraded

	5518 ** to 3.6.0 or later) we should consider fixing the conditional above

	5519 ** to read "usableSize/4-2" instead of "usableSize/4-8".

	5520 */

	5521 rc = sqlite3PagerWrite(pTrunk->pDbPage);

	5522 if( rc==SQLITE_OK ){

	5523 put4byte(&pTrunk->aData[4], nLeaf+1);

	5524 put4byte(&pTrunk->aData[8+nLeaf*4], iPage);

	5525 if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){

	5526 sqlite3PagerDontWrite(pPage->pDbPage);

	5527 }

	5528 rc = btreeSetHasContent(pBt, iPage);

	5529 }

	5530 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));

	5531 goto freepage_out;

	5532 }

	5533 }

	5534

	5535 /* If control flows to this point, then it was not possible to add the

	5536 ** the page being freed as a leaf page of the first trunk in the free-list.

	5537 ** Possibly because the free-list is empty, or possibly because the

	5538 ** first trunk in the free-list is full. Either way, the page being freed

	5539 ** will become the new first trunk page in the free-list.

	5540 */

	5541 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){

	5542 goto freepage_out;

	5543 }

	5544 rc = sqlite3PagerWrite(pPage->pDbPage);

	5545 if( rc!=SQLITE_OK ){

	5546 goto freepage_out;

	5547 }

	5548 put4byte(pPage->aData, iTrunk);

	5549 put4byte(&pPage->aData[4], 0);

	5550 put4byte(&pPage1->aData[32], iPage);

	5551 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));

	5552

	5553 freepage_out:

	5554 if( pPage ){

	5555 pPage->isInit = 0;

	5556 }

	5557 releasePage(pPage);

	5558 releasePage(pTrunk);

	5559 return rc;

	5560 }

	5561 static void freePage(MemPage pPage, int pRC){

	5562 if( (*pRC)==SQLITE_OK ){

	5563 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);

	5564 }

	5565 }

	5566

	5567 /*

	5568 ** Free any overflow pages associated with the given Cell. Write the

	5569 ** local Cell size (the number of bytes on the original page, omitting

	5570 ** overflow) into *pnSize.

	5571 */

	5572 static int clearCell(

	5573 MemPage pPage, / The page that contains the Cell */

	5574 unsigned char pCell, / First byte of the Cell */

	5575 u16 pnSize / Write the size of the Cell here */

	5576 ){

	5577 BtShared *pBt = pPage->pBt;

	5578 CellInfo info;

	5579 Pgno ovflPgno;

	5580 int rc;

	5581 int nOvfl;

	5582 u32 ovflPageSize;

	5583

	5584 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	5585 btreeParseCellPtr(pPage, pCell, &info);

	5586 *pnSize = info.nSize;

	5587 if( info.iOverflow==0 ){

	5588 return SQLITE_OK; /* No overflow pages. Return without doing anything */

	5589 }

	5590 if( pCell+info.iOverflow+3 > pPage->aData+pPage->maskPage ){

	5591 return SQLITE_CORRUPT_BKPT; /* Cell extends past end of page */

	5592 }

	5593 ovflPgno = get4byte(&pCell[info.iOverflow]);

	5594 assert( pBt->usableSize > 4 );

	5595 ovflPageSize = pBt->usableSize - 4;

	5596 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;

	5597 assert( ovflPgno==0 \|\| nOvfl>0 );

	5598 while( nOvfl-- ){

	5599 Pgno iNext = 0;

	5600 MemPage *pOvfl = 0;

	5601 if( ovflPgno<2 \|\| ovflPgno>btreePagecount(pBt) ){

	5602 /* 0 is not a legal page number and page 1 cannot be an

	5603 ** overflow page. Therefore if ovflPgno<2 or past the end of the

	5604 ** file the database must be corrupt. */

	5605 return SQLITE_CORRUPT_BKPT;

	5606 }

	5607 if( nOvfl ){

	5608 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);

	5609 if( rc ) return rc;

	5610 }

	5611

	5612 if( ( pOvfl \|\| ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )

	5613 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1

	5614 ){

	5615 /* There is no reason any cursor should have an outstanding reference

	5616 ** to an overflow page belonging to a cell that is being deleted/updated.

	5617 ** So if there exists more than one reference to this page, then it

	5618 ** must not really be an overflow page and the database must be corrupt.

	5619 ** It is helpful to detect this before calling freePage2(), as

	5620 ** freePage2() may zero the page contents if secure-delete mode is

	5621 ** enabled. If this 'overflow' page happens to be a page that the

	5622 ** caller is iterating through or using in some other way, this

	5623 ** can be problematic.

	5624 */

	5625 rc = SQLITE_CORRUPT_BKPT;

	5626 }else{

	5627 rc = freePage2(pBt, pOvfl, ovflPgno);

	5628 }

	5629

	5630 if( pOvfl ){

	5631 sqlite3PagerUnref(pOvfl->pDbPage);

	5632 }

	5633 if( rc ) return rc;

	5634 ovflPgno = iNext;

	5635 }

	5636 return SQLITE_OK;

	5637 }

	5638

	5639 /*

	5640 ** Create the byte sequence used to represent a cell on page pPage

	5641 ** and write that byte sequence into pCell[]. Overflow pages are

	5642 ** allocated and filled in as necessary. The calling procedure

	5643 ** is responsible for making sure sufficient space has been allocated

	5644 ** for pCell[].

	5645 **

	5646 ** Note that pCell does not necessarily need to point to the pPage->aData

	5647 ** area. pCell might point to some temporary storage. The cell will

	5648 ** be constructed in this temporary area then copied into pPage->aData

	5649 ** later.

	5650 */

	5651 static int fillInCell(

	5652 MemPage pPage, / The page that contains the cell */

	5653 unsigned char pCell, / Complete text of the cell */

	5654 const void pKey, i64 nKey, / The key */

	5655 const void pData,int nData, / The data */

	5656 int nZero, /* Extra zero bytes to append to pData */

	5657 int pnSize / Write cell size here */

	5658 ){

	5659 int nPayload;

	5660 const u8 *pSrc;

	5661 int nSrc, n, rc;

	5662 int spaceLeft;

	5663 MemPage *pOvfl = 0;

	5664 MemPage *pToRelease = 0;

	5665 unsigned char *pPrior;

	5666 unsigned char *pPayload;

	5667 BtShared *pBt = pPage->pBt;

	5668 Pgno pgnoOvfl = 0;

	5669 int nHeader;

	5670

	5671 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	5672

	5673 /* pPage is not necessarily writeable since pCell might be auxiliary

	5674 ** buffer space that is separate from the pPage buffer area */

	5675 assert( pCell<pPage->aData \|\| pCell>=&pPage->aData[pBt->pageSize]

	5676 \|\| sqlite3PagerIswriteable(pPage->pDbPage) );

	5677

	5678 /* Fill in the header. */

	5679 nHeader = pPage->childPtrSize;

	5680 nPayload = nData + nZero;

	5681 if( pPage->intKeyLeaf ){

	5682 nHeader += putVarint32(&pCell[nHeader], nPayload);

	5683 }else{

	5684 assert( nData==0 );

	5685 assert( nZero==0 );

	5686 }

	5687 nHeader += putVarint(&pCell[nHeader], (u64)&nKey);

	5688

	5689 /* Fill in the payload size */

	5690 if( pPage->intKey ){

	5691 pSrc = pData;

	5692 nSrc = nData;

	5693 nData = 0;

	5694 }else{

	5695 if( NEVER(nKey>0x7fffffff \|\| pKey==0) ){

	5696 return SQLITE_CORRUPT_BKPT;

	5697 }

	5698 nPayload = (int)nKey;

	5699 pSrc = pKey;

	5700 nSrc = (int)nKey;

	5701 }

	5702 if( nPayload<=pPage->maxLocal ){

	5703 n = nHeader + nPayload;

	5704 testcase( n==3 );

	5705 testcase( n==4 );

	5706 if( n<4 ) n = 4;

	5707 *pnSize = n;

	5708 spaceLeft = nPayload;

	5709 pPrior = pCell;

	5710 }else{

	5711 int mn = pPage->minLocal;

	5712 n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);

	5713 testcase( n==pPage->maxLocal );

	5714 testcase( n==pPage->maxLocal+1 );

	5715 if( n > pPage->maxLocal ) n = mn;

	5716 spaceLeft = n;

	5717 *pnSize = n + nHeader + 4;

	5718 pPrior = &pCell[nHeader+n];

	5719 }

	5720 pPayload = &pCell[nHeader];

	5721

	5722 /* At this point variables should be set as follows:

	5723 **

	5724 ** nPayload Total payload size in bytes

	5725 ** pPayload Begin writing payload here

	5726 ** spaceLeft Space available at pPayload. If nPayload>spaceLeft,

	5727 ** that means content must spill into overflow pages.

	5728 ** *pnSize Size of the local cell (not counting overflow pages)

	5729 ** pPrior Where to write the pgno of the first overflow page

	5730 **

	5731 ** Use a call to btreeParseCellPtr() to verify that the values above

	5732 ** were computed correctly.

	5733 */

	5734 #if SQLITE_DEBUG

	5735 {

	5736 CellInfo info;

	5737 btreeParseCellPtr(pPage, pCell, &info);

	5738 assert( nHeader=(int)(info.pPayload - pCell) );

	5739 assert( info.nKey==nKey );

	5740 assert( *pnSize == info.nSize );

	5741 assert( spaceLeft == info.nLocal );

	5742 assert( pPrior == &pCell[info.iOverflow] );

	5743 }

	5744 #endif

	5745

	5746 /* Write the payload into the local Cell and any extra into overflow pages */

	5747 while( nPayload>0 ){

	5748 if( spaceLeft==0 ){

	5749 #ifndef SQLITE_OMIT_AUTOVACUUM

	5750 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */

	5751 if( pBt->autoVacuum ){

	5752 do{

	5753 pgnoOvfl++;

	5754 } while(

	5755 PTRMAP_ISPAGE(pBt, pgnoOvfl) \|\| pgnoOvfl==PENDING_BYTE_PAGE(pBt)

	5756 );

	5757 }

	5758 #endif

	5759 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);

	5760 #ifndef SQLITE_OMIT_AUTOVACUUM

	5761 /* If the database supports auto-vacuum, and the second or subsequent

	5762 ** overflow page is being allocated, add an entry to the pointer-map

	5763 ** for that page now.

	5764 **

	5765 ** If this is the first overflow page, then write a partial entry

	5766 ** to the pointer-map. If we write nothing to this pointer-map slot,

	5767 ** then the optimistic overflow chain processing in clearCell()

	5768 ** may misinterpret the uninitialized values and delete the

	5769 ** wrong pages from the database.

	5770 */

	5771 if( pBt->autoVacuum && rc==SQLITE_OK ){

	5772 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);

	5773 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);

	5774 if( rc ){

	5775 releasePage(pOvfl);

	5776 }

	5777 }

	5778 #endif

	5779 if( rc ){

	5780 releasePage(pToRelease);

	5781 return rc;

	5782 }

	5783

	5784 /* If pToRelease is not zero than pPrior points into the data area

	5785 ** of pToRelease. Make sure pToRelease is still writeable. */

	5786 assert( pToRelease==0 \|\| sqlite3PagerIswriteable(pToRelease->pDbPage) );

	5787

	5788 /* If pPrior is part of the data area of pPage, then make sure pPage

	5789 ** is still writeable */

	5790 assert( pPrior<pPage->aData \|\| pPrior>=&pPage->aData[pBt->pageSize]

	5791 \|\| sqlite3PagerIswriteable(pPage->pDbPage) );

	5792

	5793 put4byte(pPrior, pgnoOvfl);

	5794 releasePage(pToRelease);

	5795 pToRelease = pOvfl;

	5796 pPrior = pOvfl->aData;

	5797 put4byte(pPrior, 0);

	5798 pPayload = &pOvfl->aData[4];

	5799 spaceLeft = pBt->usableSize - 4;

	5800 }

	5801 n = nPayload;

	5802 if( n>spaceLeft ) n = spaceLeft;

	5803

	5804 /* If pToRelease is not zero than pPayload points into the data area

	5805 ** of pToRelease. Make sure pToRelease is still writeable. */

	5806 assert( pToRelease==0 \|\| sqlite3PagerIswriteable(pToRelease->pDbPage) );

	5807

	5808 /* If pPayload is part of the data area of pPage, then make sure pPage

	5809 ** is still writeable */

	5810 assert( pPayload<pPage->aData \|\| pPayload>=&pPage->aData[pBt->pageSize]

	5811 \|\| sqlite3PagerIswriteable(pPage->pDbPage) );

	5812

	5813 if( nSrc>0 ){

	5814 if( n>nSrc ) n = nSrc;

	5815 assert( pSrc );

	5816 memcpy(pPayload, pSrc, n);

	5817 }else{

	5818 memset(pPayload, 0, n);

	5819 }

	5820 nPayload -= n;

	5821 pPayload += n;

	5822 pSrc += n;

	5823 nSrc -= n;

	5824 spaceLeft -= n;

	5825 if( nSrc==0 ){

	5826 nSrc = nData;

	5827 pSrc = pData;

	5828 }

	5829 }

	5830 releasePage(pToRelease);

	5831 return SQLITE_OK;

	5832 }

	5833

	5834 /*

	5835 ** Remove the i-th cell from pPage. This routine effects pPage only.

	5836 ** The cell content is not freed or deallocated. It is assumed that

	5837 ** the cell content has been copied someplace else. This routine just

	5838 ** removes the reference to the cell from pPage.

	5839 **

	5840 ** "sz" must be the number of bytes in the cell.

	5841 */

	5842 static void dropCell(MemPage pPage, int idx, int sz, int pRC){

	5843 u32 pc; /* Offset to cell content of cell being deleted */

	5844 u8 data; / pPage->aData */

	5845 u8 ptr; / Used to move bytes around within data[] */

	5846 int rc; /* The return code */

	5847 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */

	5848

	5849 if( *pRC ) return;

	5850

	5851 assert( idx>=0 && idx<pPage->nCell );

	5852 assert( sz==cellSize(pPage, idx) );

	5853 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	5854 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	5855 data = pPage->aData;

	5856 ptr = &pPage->aCellIdx[2*idx];

	5857 pc = get2byte(ptr);

	5858 hdr = pPage->hdrOffset;

	5859 testcase( pc==get2byte(&data[hdr+5]) );

	5860 testcase( pc+sz==pPage->pBt->usableSize );

	5861 if( pc < (u32)get2byte(&data[hdr+5]) \|\| pc+sz > pPage->pBt->usableSize ){

	5862 *pRC = SQLITE_CORRUPT_BKPT;

	5863 return;

	5864 }

	5865 rc = freeSpace(pPage, pc, sz);

	5866 if( rc ){

	5867 *pRC = rc;

	5868 return;

	5869 }

	5870 pPage->nCell--;

	5871 memmove(ptr, ptr+2, 2*(pPage->nCell - idx));

	5872 put2byte(&data[hdr+3], pPage->nCell);

	5873 pPage->nFree += 2;

	5874 }

	5875

	5876 /*

	5877 ** Insert a new cell on pPage at cell index "i". pCell points to the

	5878 ** content of the cell.

	5879 **

	5880 ** If the cell content will fit on the page, then put it there. If it

	5881 ** will not fit, then make a copy of the cell content into pTemp if

	5882 ** pTemp is not null. Regardless of pTemp, allocate a new entry

	5883 ** in pPage->apOvfl[] and make it point to the cell content (either

	5884 ** in pTemp or the original pCell) and also record its index.

	5885 ** Allocating a new entry in pPage->aCell[] implies that

	5886 ** pPage->nOverflow is incremented.

	5887 */

	5888 static void insertCell(

	5889 MemPage pPage, / Page into which we are copying */

	5890 int i, /* New cell becomes the i-th cell of the page */

	5891 u8 pCell, / Content of the new cell */

	5892 int sz, /* Bytes of content in pCell */

	5893 u8 pTemp, / Temp storage space for pCell, if needed */

	5894 Pgno iChild, /* If non-zero, replace first 4 bytes with this value */

	5895 int pRC / Read and write return code from here */

	5896 ){

	5897 int idx = 0; /* Where to write new cell content in data[] */

	5898 int j; /* Loop counter */

	5899 int end; /* First byte past the last cell pointer in data[] */

	5900 int ins; /* Index in data[] where new cell pointer is inserted */

	5901 int cellOffset; /* Address of first cell pointer in data[] */

	5902 u8 data; / The content of the whole page */

	5903

	5904 if( *pRC ) return;

	5905

	5906 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );

	5907 assert( MX_CELL(pPage->pBt)<=10921 );

	5908 assert( pPage->nCell<=MX_CELL(pPage->pBt) \|\| CORRUPT_DB );

	5909 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );

	5910 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );

	5911 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	5912 /* The cell should normally be sized correctly. However, when moving a

	5913 ** malformed cell from a leaf page to an interior page, if the cell size

	5914 ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size

	5915 ** might be less than 8 (leaf-size + pointer) on the interior node. Hence

	5916 ** the term after the \|\| in the following assert(). */

	5917 assert( sz==cellSizePtr(pPage, pCell) \|\| (sz==8 && iChild>0) );

	5918 if( pPage->nOverflow \|\| sz+2>pPage->nFree ){

	5919 if( pTemp ){

	5920 memcpy(pTemp, pCell, sz);

	5921 pCell = pTemp;

	5922 }

	5923 if( iChild ){

	5924 put4byte(pCell, iChild);

	5925 }

	5926 j = pPage->nOverflow++;

	5927 assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) );

	5928 pPage->apOvfl[j] = pCell;

	5929 pPage->aiOvfl[j] = (u16)i;

	5930 }else{

	5931 int rc = sqlite3PagerWrite(pPage->pDbPage);

	5932 if( rc!=SQLITE_OK ){

	5933 *pRC = rc;

	5934 return;

	5935 }

	5936 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	5937 data = pPage->aData;

	5938 cellOffset = pPage->cellOffset;

	5939 end = cellOffset + 2*pPage->nCell;

	5940 ins = cellOffset + 2*i;

	5941 rc = allocateSpace(pPage, sz, &idx);

	5942 if( rc ){ *pRC = rc; return; }

	5943 /* The allocateSpace() routine guarantees the following two properties

	5944 ** if it returns success */

	5945 assert( idx >= end+2 );

	5946 assert( idx+sz <= (int)pPage->pBt->usableSize );

	5947 pPage->nCell++;

	5948 pPage->nFree -= (u16)(2 + sz);

	5949 memcpy(&data[idx], pCell, sz);

	5950 if( iChild ){

	5951 put4byte(&data[idx], iChild);

	5952 }

	5953 memmove(&data[ins+2], &data[ins], end-ins);

	5954 put2byte(&data[ins], idx);

	5955 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);

	5956 #ifndef SQLITE_OMIT_AUTOVACUUM

	5957 if( pPage->pBt->autoVacuum ){

	5958 /* The cell may contain a pointer to an overflow page. If so, write

	5959 ** the entry for the overflow page into the pointer map.

	5960 */

	5961 ptrmapPutOvflPtr(pPage, pCell, pRC);

	5962 }

	5963 #endif

	5964 }

	5965 }

	5966

	5967 /*

	5968 ** Add a list of cells to a page. The page should be initially empty.

	5969 ** The cells are guaranteed to fit on the page.

	5970 */

	5971 static void assemblePage(

	5972 MemPage pPage, / The page to be assembled */

	5973 int nCell, /* The number of cells to add to this page */

	5974 u8 *apCell, / Pointers to cell bodies */

	5975 u16 aSize / Sizes of the cells */

	5976 ){

	5977 int i; /* Loop counter */

	5978 u8 pCellptr; / Address of next cell pointer */

	5979 int cellbody; /* Address of next cell body */

	5980 u8 * const data = pPage->aData; /* Pointer to data for pPage */

	5981 const int hdr = pPage->hdrOffset; /* Offset of header on pPage */

	5982 const int nUsable = pPage->pBt->usableSize; /* Usable size of page */

	5983

	5984 assert( pPage->nOverflow==0 );

	5985 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	5986 assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt)

	5987 && (int)MX_CELL(pPage->pBt)<=10921);

	5988 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	5989

	5990 /* Check that the page has just been zeroed by zeroPage() */

	5991 assert( pPage->nCell==0 );

	5992 assert( get2byteNotZero(&data[hdr+5])==nUsable );

	5993

	5994 pCellptr = &pPage->aCellIdx[nCell*2];

	5995 cellbody = nUsable;

	5996 for(i=nCell-1; i>=0; i--){

	5997 u16 sz = aSize[i];

	5998 pCellptr -= 2;

	5999 cellbody -= sz;

	6000 put2byte(pCellptr, cellbody);

	6001 memcpy(&data[cellbody], apCell[i], sz);

	6002 }

	6003 put2byte(&data[hdr+3], nCell);

	6004 put2byte(&data[hdr+5], cellbody);

	6005 pPage->nFree -= (nCell*2 + nUsable - cellbody);

	6006 pPage->nCell = (u16)nCell;

	6007 }

	6008

	6009 /*

	6010 ** The following parameters determine how many adjacent pages get involved

	6011 ** in a balancing operation. NN is the number of neighbors on either side

	6012 ** of the page that participate in the balancing operation. NB is the

	6013 ** total number of pages that participate, including the target page and

	6014 ** NN neighbors on either side.

	6015 **

	6016 ** The minimum value of NN is 1 (of course). Increasing NN above 1

	6017 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance

	6018 ** in exchange for a larger degradation in INSERT and UPDATE performance.

	6019 ** The value of NN appears to give the best results overall.

	6020 */

	6021 #define NN 1 /* Number of neighbors on either side of pPage */

	6022 #define NB (NN2+1) / Total pages involved in the balance */

	6023

	6024

	6025 #ifndef SQLITE_OMIT_QUICKBALANCE

	6026 /*

	6027 ** This version of balance() handles the common special case where

	6028 ** a new entry is being inserted on the extreme right-end of the

	6029 ** tree, in other words, when the new entry will become the largest

	6030 ** entry in the tree.

	6031 **

	6032 ** Instead of trying to balance the 3 right-most leaf pages, just add

	6033 ** a new page to the right-hand side and put the one new entry in

	6034 ** that page. This leaves the right side of the tree somewhat

	6035 ** unbalanced. But odds are that we will be inserting new entries

	6036 ** at the end soon afterwards so the nearly empty page will quickly

	6037 ** fill up. On average.

	6038 **

	6039 ** pPage is the leaf page which is the right-most page in the tree.

	6040 ** pParent is its parent. pPage must have a single overflow entry

	6041 ** which is also the right-most entry on the page.

	6042 **

	6043 ** The pSpace buffer is used to store a temporary copy of the divider

	6044 ** cell that will be inserted into pParent. Such a cell consists of a 4

	6045 ** byte page number followed by a variable length integer. In other

	6046 ** words, at most 13 bytes. Hence the pSpace buffer must be at

	6047 ** least 13 bytes in size.

	6048 */

	6049 static int balance_quick(MemPage pParent, MemPage pPage, u8 *pSpace){

	6050 BtShared const pBt = pPage->pBt; / B-Tree Database */

	6051 MemPage pNew; / Newly allocated page */

	6052 int rc; /* Return Code */

	6053 Pgno pgnoNew; /* Page number of pNew */

	6054

	6055 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	6056 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

	6057 assert( pPage->nOverflow==1 );

	6058

	6059 /* This error condition is now caught prior to reaching this function */

	6060 if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT;

	6061

	6062 /* Allocate a new page. This page will become the right-sibling of

	6063 ** pPage. Make the parent page writable, so that the new divider cell

	6064 ** may be inserted. If both these operations are successful, proceed.

	6065 */

	6066 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);

	6067

	6068 if( rc==SQLITE_OK ){

	6069

	6070 u8 *pOut = &pSpace[4];

	6071 u8 *pCell = pPage->apOvfl[0];

	6072 u16 szCell = cellSizePtr(pPage, pCell);

	6073 u8 *pStop;

	6074

	6075 assert( sqlite3PagerIswriteable(pNew->pDbPage) );

	6076 assert( pPage->aData[0]==(PTF_INTKEY\|PTF_LEAFDATA\|PTF_LEAF) );

	6077 zeroPage(pNew, PTF_INTKEY\|PTF_LEAFDATA\|PTF_LEAF);

	6078 assemblePage(pNew, 1, &pCell, &szCell);

	6079

	6080 /* If this is an auto-vacuum database, update the pointer map

	6081 ** with entries for the new page, and any pointer from the

	6082 ** cell on the page to an overflow page. If either of these

	6083 ** operations fails, the return code is set, but the contents

	6084 ** of the parent page are still manipulated by thh code below.

	6085 ** That is Ok, at this point the parent page is guaranteed to

	6086 ** be marked as dirty. Returning an error code will cause a

	6087 ** rollback, undoing any changes made to the parent page.

	6088 */

	6089 if( ISAUTOVACUUM ){

	6090 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);

	6091 if( szCell>pNew->minLocal ){

	6092 ptrmapPutOvflPtr(pNew, pCell, &rc);

	6093 }

	6094 }

	6095

	6096 /* Create a divider cell to insert into pParent. The divider cell

	6097 ** consists of a 4-byte page number (the page number of pPage) and

	6098 ** a variable length key value (which must be the same value as the

	6099 ** largest key on pPage).

	6100 **

	6101 ** To find the largest key value on pPage, first find the right-most

	6102 ** cell on pPage. The first two fields of this cell are the

	6103 ** record-length (a variable length integer at most 32-bits in size)

	6104 ** and the key value (a variable length integer, may have any value).

	6105 ** The first of the while(...) loops below skips over the record-length

	6106 ** field. The second while(...) loop copies the key value from the

	6107 ** cell on pPage into the pSpace buffer.

	6108 */

	6109 pCell = findCell(pPage, pPage->nCell-1);

	6110 pStop = &pCell[9];

	6111 while( (*(pCell++)&0x80) && pCell<pStop );

	6112 pStop = &pCell[9];

	6113 while( (((pOut++) = (pCell++))&0x80) && pCell<pStop );

	6114

	6115 /* Insert the new divider cell into pParent. */

	6116 insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),

	6117 0, pPage->pgno, &rc);

	6118

	6119 /* Set the right-child pointer of pParent to point to the new page. */

	6120 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);

	6121

	6122 /* Release the reference to the new page. */

	6123 releasePage(pNew);

	6124 }

	6125

	6126 return rc;

	6127 }

	6128 #endif /* SQLITE_OMIT_QUICKBALANCE */

	6129

	6130 #if 0

	6131 /*

	6132 ** This function does not contribute anything to the operation of SQLite.

	6133 ** it is sometimes activated temporarily while debugging code responsible

	6134 ** for setting pointer-map entries.

	6135 */

	6136 static int ptrmapCheckPages(MemPage **apPage, int nPage){

	6137 int i, j;

	6138 for(i=0; i<nPage; i++){

	6139 Pgno n;

	6140 u8 e;

	6141 MemPage *pPage = apPage[i];

	6142 BtShared *pBt = pPage->pBt;

	6143 assert( pPage->isInit );

	6144

	6145 for(j=0; j<pPage->nCell; j++){

	6146 CellInfo info;

	6147 u8 *z;

	6148

	6149 z = findCell(pPage, j);

	6150 btreeParseCellPtr(pPage, z, &info);

	6151 if( info.iOverflow ){

	6152 Pgno ovfl = get4byte(&z[info.iOverflow]);

	6153 ptrmapGet(pBt, ovfl, &e, &n);

	6154 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );

	6155 }

	6156 if( !pPage->leaf ){

	6157 Pgno child = get4byte(z);

	6158 ptrmapGet(pBt, child, &e, &n);

	6159 assert( n==pPage->pgno && e==PTRMAP_BTREE );

	6160 }

	6161 }

	6162 if( !pPage->leaf ){

	6163 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	6164 ptrmapGet(pBt, child, &e, &n);

	6165 assert( n==pPage->pgno && e==PTRMAP_BTREE );

	6166 }

	6167 }

	6168 return 1;

	6169 }

	6170 #endif

	6171

	6172 /*

	6173 ** This function is used to copy the contents of the b-tree node stored

	6174 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then

	6175 ** the pointer-map entries for each child page are updated so that the

	6176 ** parent page stored in the pointer map is page pTo. If pFrom contained

	6177 ** any cells with overflow page pointers, then the corresponding pointer

	6178 ** map entries are also updated so that the parent page is page pTo.

	6179 **

	6180 ** If pFrom is currently carrying any overflow cells (entries in the

	6181 ** MemPage.apOvfl[] array), they are not copied to pTo.

	6182 **

	6183 ** Before returning, page pTo is reinitialized using btreeInitPage().

	6184 **

	6185 ** The performance of this function is not critical. It is only used by

	6186 ** the balance_shallower() and balance_deeper() procedures, neither of

	6187 ** which are called often under normal circumstances.

	6188 */

	6189 static void copyNodeContent(MemPage pFrom, MemPage pTo, int *pRC){

	6190 if( (*pRC)==SQLITE_OK ){

	6191 BtShared * const pBt = pFrom->pBt;

	6192 u8 * const aFrom = pFrom->aData;

	6193 u8 * const aTo = pTo->aData;

	6194 int const iFromHdr = pFrom->hdrOffset;

	6195 int const iToHdr = ((pTo->pgno==1) ? 100 : 0);

	6196 int rc;

	6197 int iData;

	6198

	6199

	6200 assert( pFrom->isInit );

	6201 assert( pFrom->nFree>=iToHdr );

	6202 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );

	6203

	6204 /* Copy the b-tree node content from page pFrom to page pTo. */

	6205 iData = get2byte(&aFrom[iFromHdr+5]);

	6206 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);

	6207 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);

	6208

	6209 /* Reinitialize page pTo so that the contents of the MemPage structure

	6210 ** match the new data. The initialization of pTo can actually fail under

	6211 ** fairly obscure circumstances, even though it is a copy of initialized

	6212 ** page pFrom.

	6213 */

	6214 pTo->isInit = 0;

	6215 rc = btreeInitPage(pTo);

	6216 if( rc!=SQLITE_OK ){

	6217 *pRC = rc;

	6218 return;

	6219 }

	6220

	6221 /* If this is an auto-vacuum database, update the pointer-map entries

	6222 ** for any b-tree or overflow pages that pTo now contains the pointers to.

	6223 */

	6224 if( ISAUTOVACUUM ){

	6225 *pRC = setChildPtrmaps(pTo);

	6226 }

	6227 }

	6228 }

	6229

	6230 /*

	6231 ** This routine redistributes cells on the iParentIdx'th child of pParent

	6232 ** (hereafter "the page") and up to 2 siblings so that all pages have about the

	6233 ** same amount of free space. Usually a single sibling on either side of the

	6234 ** page is used in the balancing, though both siblings might come from one

	6235 ** side if the page is the first or last child of its parent. If the page

	6236 ** has fewer than 2 siblings (something which can only happen if the page

	6237 ** is a root page or a child of a root page) then all available siblings

	6238 ** participate in the balancing.

	6239 **

	6240 ** The number of siblings of the page might be increased or decreased by

	6241 ** one or two in an effort to keep pages nearly full but not over full.

	6242 **

	6243 ** Note that when this routine is called, some of the cells on the page

	6244 ** might not actually be stored in MemPage.aData[]. This can happen

	6245 ** if the page is overfull. This routine ensures that all cells allocated

	6246 ** to the page and its siblings fit into MemPage.aData[] before returning.

	6247 **

	6248 ** In the course of balancing the page and its siblings, cells may be

	6249 ** inserted into or removed from the parent page (pParent). Doing so

	6250 ** may cause the parent page to become overfull or underfull. If this

	6251 ** happens, it is the responsibility of the caller to invoke the correct

	6252 ** balancing routine to fix this problem (see the balance() routine).

	6253 **

	6254 ** If this routine fails for any reason, it might leave the database

	6255 ** in a corrupted state. So if this routine fails, the database should

	6256 ** be rolled back.

	6257 **

	6258 ** The third argument to this function, aOvflSpace, is a pointer to a

	6259 ** buffer big enough to hold one page. If while inserting cells into the parent

	6260 ** page (pParent) the parent page becomes overfull, this buffer is

	6261 ** used to store the parent's overflow cells. Because this function inserts

	6262 ** a maximum of four divider cells into the parent page, and the maximum

	6263 ** size of a cell stored within an internal node is always less than 1/4

	6264 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large

	6265 ** enough for all overflow cells.

	6266 **

	6267 ** If aOvflSpace is set to a null pointer, this function returns

	6268 ** SQLITE_NOMEM.

	6269 */

	6270 #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)

	6271 #pragma optimize("", off)

	6272 #endif

	6273 static int balance_nonroot(

	6274 MemPage pParent, / Parent page of siblings being balanced */

	6275 int iParentIdx, /* Index of "the page" in pParent */

	6276 u8 aOvflSpace, / page-size bytes of space for parent ovfl */

	6277 int isRoot, /* True if pParent is a root-page */

	6278 int bBulk /* True if this call is part of a bulk load */

	6279 ){

	6280 BtShared pBt; / The whole database */

	6281 int nCell = 0; /* Number of cells in apCell[] */

	6282 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */

	6283 int nNew = 0; /* Number of pages in apNew[] */

	6284 int nOld; /* Number of pages in apOld[] */

	6285 int i, j, k; /* Loop counters */

	6286 int nxDiv; /* Next divider slot in pParent->aCell[] */

	6287 int rc = SQLITE_OK; /* The return code */

	6288 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */

	6289 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */

	6290 int usableSpace; /* Bytes in pPage beyond the header */

	6291 int pageFlags; /* Value of pPage->aData[0] */

	6292 int subtotal; /* Subtotal of bytes in cells on one page */

	6293 int iSpace1 = 0; /* First unused byte of aSpace1[] */

	6294 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */

	6295 int szScratch; /* Size of scratch memory requested */

	6296 MemPage apOld[NB]; / pPage and up to two siblings */

	6297 MemPage apCopy[NB]; / Private copies of apOld[] pages */

	6298 MemPage apNew[NB+2]; / pPage and up to NB siblings after balancing */

	6299 u8 pRight; / Location in parent of right-sibling pointer */

	6300 u8 apDiv[NB-1]; / Divider cells in pParent */

	6301 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */

	6302 int szNew[NB+2]; /* Combined size of cells place on i-th page */

	6303 u8 *apCell = 0; / All cells begin balanced */

	6304 u16 szCell; / Local size of all cells in apCell[] */

	6305 u8 aSpace1; / Space for copies of dividers cells */

	6306 Pgno pgno; /* Temp var to store a page number in */

	6307

	6308 pBt = pParent->pBt;

	6309 assert( sqlite3_mutex_held(pBt->mutex) );

	6310 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

	6311

	6312 #if 0

	6313 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));

	6314 #endif

	6315

	6316 /* At this point pParent may have at most one overflow cell. And if

	6317 ** this overflow cell is present, it must be the cell with

	6318 ** index iParentIdx. This scenario comes about when this function

	6319 ** is called (indirectly) from sqlite3BtreeDelete().

	6320 */

	6321 assert( pParent->nOverflow==0 \|\| pParent->nOverflow==1 );

	6322 assert( pParent->nOverflow==0 \|\| pParent->aiOvfl[0]==iParentIdx );

	6323

	6324 if( !aOvflSpace ){

	6325 return SQLITE_NOMEM;

	6326 }

	6327

	6328 /* Find the sibling pages to balance. Also locate the cells in pParent

	6329 ** that divide the siblings. An attempt is made to find NN siblings on

	6330 ** either side of pPage. More siblings are taken from one side, however,

	6331 ** if there are fewer than NN siblings on the other side. If pParent

	6332 ** has NB or fewer children then all children of pParent are taken.

	6333 **

	6334 ** This loop also drops the divider cells from the parent page. This

	6335 ** way, the remainder of the function does not have to deal with any

	6336 ** overflow cells in the parent page, since if any existed they will

	6337 ** have already been removed.

	6338 */

	6339 i = pParent->nOverflow + pParent->nCell;

	6340 if( i<2 ){

	6341 nxDiv = 0;

	6342 }else{

	6343 assert( bBulk==0 \|\| bBulk==1 );

	6344 if( iParentIdx==0 ){

	6345 nxDiv = 0;

	6346 }else if( iParentIdx==i ){

	6347 nxDiv = i-2+bBulk;

	6348 }else{

	6349 assert( bBulk==0 );

	6350 nxDiv = iParentIdx-1;

	6351 }

	6352 i = 2-bBulk;

	6353 }

	6354 nOld = i+1;

	6355 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){

	6356 pRight = &pParent->aData[pParent->hdrOffset+8];

	6357 }else{

	6358 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);

	6359 }

	6360 pgno = get4byte(pRight);

	6361 while( 1 ){

	6362 rc = getAndInitPage(pBt, pgno, &apOld[i], 0);

	6363 if( rc ){

	6364 memset(apOld, 0, (i+1)sizeof(MemPage));

	6365 goto balance_cleanup;

	6366 }

	6367 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;

	6368 if( (i--)==0 ) break;

	6369

	6370 if( i+nxDiv==pParent->aiOvfl[0] && pParent->nOverflow ){

	6371 apDiv[i] = pParent->apOvfl[0];

	6372 pgno = get4byte(apDiv[i]);

	6373 szNew[i] = cellSizePtr(pParent, apDiv[i]);

	6374 pParent->nOverflow = 0;

	6375 }else{

	6376 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);

	6377 pgno = get4byte(apDiv[i]);

	6378 szNew[i] = cellSizePtr(pParent, apDiv[i]);

	6379

	6380 /* Drop the cell from the parent page. apDiv[i] still points to

	6381 ** the cell within the parent, even though it has been dropped.

	6382 ** This is safe because dropping a cell only overwrites the first

	6383 ** four bytes of it, and this function does not need the first

	6384 ** four bytes of the divider cell. So the pointer is safe to use

	6385 ** later on.

	6386 **

	6387 ** But not if we are in secure-delete mode. In secure-delete mode,

	6388 ** the dropCell() routine will overwrite the entire cell with zeroes.

	6389 ** In this case, temporarily copy the cell into the aOvflSpace[]

	6390 ** buffer. It will be copied out again as soon as the aSpace[] buffer

	6391 ** is allocated. */

	6392 if( pBt->btsFlags & BTS_SECURE_DELETE ){

	6393 int iOff;

	6394

	6395 iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);

	6396 if( (iOff+szNew[i])>(int)pBt->usableSize ){

	6397 rc = SQLITE_CORRUPT_BKPT;

	6398 memset(apOld, 0, (i+1)sizeof(MemPage));

	6399 goto balance_cleanup;

	6400 }else{

	6401 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);

	6402 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];

	6403 }

	6404 }

	6405 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);

	6406 }

	6407 }

	6408

	6409 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte

	6410 ** alignment */

	6411 nMaxCells = (nMaxCells + 3)&~3;

	6412

	6413 /*

	6414 ** Allocate space for memory structures

	6415 */

	6416 k = pBt->pageSize + ROUND8(sizeof(MemPage));

	6417 szScratch =

	6418 nMaxCellssizeof(u8) /* apCell */

	6419 + nMaxCellssizeof(u16) / szCell */

	6420 + pBt->pageSize /* aSpace1 */

	6421 + knOld; / Page copies (apCopy) */

	6422 apCell = sqlite3ScratchMalloc( szScratch );

	6423 if( apCell==0 ){

	6424 rc = SQLITE_NOMEM;

	6425 goto balance_cleanup;

	6426 }

	6427 szCell = (u16*)&apCell[nMaxCells];

	6428 aSpace1 = (u8*)&szCell[nMaxCells];

	6429 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );

	6430

	6431 /*

	6432 ** Load pointers to all cells on sibling pages and the divider cells

	6433 ** into the local apCell[] array. Make copies of the divider cells

	6434 ** into space obtained from aSpace1[] and remove the divider cells

	6435 ** from pParent.

	6436 **

	6437 ** If the siblings are on leaf pages, then the child pointers of the

	6438 ** divider cells are stripped from the cells before they are copied

	6439 ** into aSpace1[]. In this way, all cells in apCell[] are without

	6440 ** child pointers. If siblings are not leaves, then all cell in

	6441 ** apCell[] include child pointers. Either way, all cells in apCell[]

	6442 ** are alike.

	6443 **

	6444 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.

	6445 ** leafData: 1 if pPage holds key+data and pParent holds only keys.

	6446 */

	6447 leafCorrection = apOld[0]->leaf*4;

	6448 leafData = apOld[0]->intKeyLeaf;

	6449 for(i=0; i<nOld; i++){

	6450 int limit;

	6451

	6452 /* Before doing anything else, take a copy of the i'th original sibling

	6453 ** The rest of this function will use data from the copies rather

	6454 ** that the original pages since the original pages will be in the

	6455 ** process of being overwritten. */

	6456 MemPage pOld = apCopy[i] = (MemPage)&aSpace1[pBt->pageSize + k*i];

	6457 memcpy(pOld, apOld[i], sizeof(MemPage));

	6458 pOld->aData = (void*)&pOld[1];

	6459 memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);

	6460

	6461 limit = pOld->nCell+pOld->nOverflow;

	6462 if( pOld->nOverflow>0 ){

	6463 for(j=0; j<limit; j++){

	6464 assert( nCell<nMaxCells );

	6465 apCell[nCell] = findOverflowCell(pOld, j);

	6466 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);

	6467 nCell++;

	6468 }

	6469 }else{

	6470 u8 *aData = pOld->aData;

	6471 u16 maskPage = pOld->maskPage;

	6472 u16 cellOffset = pOld->cellOffset;

	6473 for(j=0; j<limit; j++){

	6474 assert( nCell<nMaxCells );

	6475 apCell[nCell] = findCellv2(aData, maskPage, cellOffset, j);

	6476 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);

	6477 nCell++;

	6478 }

	6479 }

	6480 if( i<nOld-1 && !leafData){

	6481 u16 sz = (u16)szNew[i];

	6482 u8 *pTemp;

	6483 assert( nCell<nMaxCells );

	6484 szCell[nCell] = sz;

	6485 pTemp = &aSpace1[iSpace1];

	6486 iSpace1 += sz;

	6487 assert( sz<=pBt->maxLocal+23 );

	6488 assert( iSpace1 <= (int)pBt->pageSize );

	6489 memcpy(pTemp, apDiv[i], sz);

	6490 apCell[nCell] = pTemp+leafCorrection;

	6491 assert( leafCorrection==0 \|\| leafCorrection==4 );

	6492 szCell[nCell] = szCell[nCell] - leafCorrection;

	6493 if( !pOld->leaf ){

	6494 assert( leafCorrection==0 );

	6495 assert( pOld->hdrOffset==0 );

	6496 /* The right pointer of the child page pOld becomes the left

	6497 ** pointer of the divider cell */

	6498 memcpy(apCell[nCell], &pOld->aData[8], 4);

	6499 }else{

	6500 assert( leafCorrection==4 );

	6501 if( szCell[nCell]<4 ){

	6502 /* Do not allow any cells smaller than 4 bytes. */

	6503 szCell[nCell] = 4;

	6504 }

	6505 }

	6506 nCell++;

	6507 }

	6508 }

	6509

	6510 /*

	6511 ** Figure out the number of pages needed to hold all nCell cells.

	6512 ** Store this number in "k". Also compute szNew[] which is the total

	6513 ** size of all cells on the i-th page and cntNew[] which is the index

	6514 ** in apCell[] of the cell that divides page i from page i+1.

	6515 ** cntNew[k] should equal nCell.

	6516 **

	6517 ** Values computed by this block:

	6518 **

	6519 ** k: The total number of sibling pages

	6520 ** szNew[i]: Spaced used on the i-th sibling page.

	6521 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to

	6522 ** the right of the i-th sibling page.

	6523 ** usableSpace: Number of bytes of space available on each sibling.

	6524 **

	6525 */

	6526 usableSpace = pBt->usableSize - 12 + leafCorrection;

	6527 for(subtotal=k=i=0; i<nCell; i++){

	6528 assert( i<nMaxCells );

	6529 subtotal += szCell[i] + 2;

	6530 if( subtotal > usableSpace ){

	6531 szNew[k] = subtotal - szCell[i];

	6532 cntNew[k] = i;

	6533 if( leafData ){ i--; }

	6534 subtotal = 0;

	6535 k++;

	6536 if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }

	6537 }

	6538 }

	6539 szNew[k] = subtotal;

	6540 cntNew[k] = nCell;

	6541 k++;

	6542

	6543 /*

	6544 ** The packing computed by the previous block is biased toward the siblings

	6545 ** on the left side. The left siblings are always nearly full, while the

	6546 ** right-most sibling might be nearly empty. This block of code attempts

	6547 ** to adjust the packing of siblings to get a better balance.

	6548 **

	6549 ** This adjustment is more than an optimization. The packing above might

	6550 ** be so out of balance as to be illegal. For example, the right-most

	6551 ** sibling might be completely empty. This adjustment is not optional.

	6552 */

	6553 for(i=k-1; i>0; i--){

	6554 int szRight = szNew[i]; /* Size of sibling on the right */

	6555 int szLeft = szNew[i-1]; /* Size of sibling on the left */

	6556 int r; /* Index of right-most cell in left sibling */

	6557 int d; /* Index of first cell to the left of right sibling */

	6558

	6559 r = cntNew[i-1] - 1;

	6560 d = r + 1 - leafData;

	6561 assert( d<nMaxCells );

	6562 assert( r<nMaxCells );

	6563 while( szRight==0

	6564 \|\| (!bBulk && szRight+szCell[d]+2<=szLeft-(szCell[r]+2))

	6565 ){

	6566 szRight += szCell[d] + 2;

	6567 szLeft -= szCell[r] + 2;

	6568 cntNew[i-1]--;

	6569 r = cntNew[i-1] - 1;

	6570 d = r + 1 - leafData;

	6571 }

	6572 szNew[i] = szRight;

	6573 szNew[i-1] = szLeft;

	6574 }

	6575

	6576 /* Either we found one or more cells (cntnew[0])>0) or pPage is

	6577 ** a virtual root page. A virtual root page is when the real root

	6578 ** page is page 1 and we are the only child of that page.

	6579 **

	6580 ** UPDATE: The assert() below is not necessarily true if the database

	6581 ** file is corrupt. The corruption will be detected and reported later

	6582 ** in this procedure so there is no need to act upon it now.

	6583 */

	6584 #if 0

	6585 assert( cntNew[0]>0 \|\| (pParent->pgno==1 && pParent->nCell==0) );

	6586 #endif

	6587

	6588 TRACE(("BALANCE: old: %d %d %d ",

	6589 apOld[0]->pgno,

	6590 nOld>=2 ? apOld[1]->pgno : 0,

	6591 nOld>=3 ? apOld[2]->pgno : 0

	6592 ));

	6593

	6594 /*

	6595 ** Allocate k new pages. Reuse old pages where possible.

	6596 */

	6597 if( apOld[0]->pgno<=1 ){

	6598 rc = SQLITE_CORRUPT_BKPT;

	6599 goto balance_cleanup;

	6600 }

	6601 pageFlags = apOld[0]->aData[0];

	6602 for(i=0; i<k; i++){

	6603 MemPage *pNew;

	6604 if( i<nOld ){

	6605 pNew = apNew[i] = apOld[i];

	6606 apOld[i] = 0;

	6607 rc = sqlite3PagerWrite(pNew->pDbPage);

	6608 nNew++;

	6609 if( rc ) goto balance_cleanup;

	6610 }else{

	6611 assert( i>0 );

	6612 rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);

	6613 if( rc ) goto balance_cleanup;

	6614 apNew[i] = pNew;

	6615 nNew++;

	6616

	6617 /* Set the pointer-map entry for the new sibling page. */

	6618 if( ISAUTOVACUUM ){

	6619 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);

	6620 if( rc!=SQLITE_OK ){

	6621 goto balance_cleanup;

	6622 }

	6623 }

	6624 }

	6625 }

	6626

	6627 /* Free any old pages that were not reused as new pages.

	6628 */

	6629 while( i<nOld ){

	6630 freePage(apOld[i], &rc);

	6631 if( rc ) goto balance_cleanup;

	6632 releasePage(apOld[i]);

	6633 apOld[i] = 0;

	6634 i++;

	6635 }

	6636

	6637 /*

	6638 ** Put the new pages in ascending order. This helps to

	6639 ** keep entries in the disk file in order so that a scan

	6640 ** of the table is a linear scan through the file. That

	6641 ** in turn helps the operating system to deliver pages

	6642 ** from the disk more rapidly.

	6643 **

	6644 ** An O(n^2) insertion sort algorithm is used, but since

	6645 ** n is never more than NB (a small constant), that should

	6646 ** not be a problem.

	6647 **

	6648 ** When NB==3, this one optimization makes the database

	6649 ** about 25% faster for large insertions and deletions.

	6650 */

	6651 for(i=0; i<k-1; i++){

	6652 int minV = apNew[i]->pgno;

	6653 int minI = i;

	6654 for(j=i+1; j<k; j++){

	6655 if( apNew[j]->pgno<(unsigned)minV ){

	6656 minI = j;

	6657 minV = apNew[j]->pgno;

	6658 }

	6659 }

	6660 if( minI>i ){

	6661 MemPage *pT;

	6662 pT = apNew[i];

	6663 apNew[i] = apNew[minI];

	6664 apNew[minI] = pT;

	6665 }

	6666 }

	6667 TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",

	6668 apNew[0]->pgno, szNew[0],

	6669 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,

	6670 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,

	6671 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,

	6672 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));

	6673

	6674 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

	6675 put4byte(pRight, apNew[nNew-1]->pgno);

	6676

	6677 /*

	6678 ** Evenly distribute the data in apCell[] across the new pages.

	6679 ** Insert divider cells into pParent as necessary.

	6680 */

	6681 j = 0;

	6682 for(i=0; i<nNew; i++){

	6683 /* Assemble the new sibling page. */

	6684 MemPage *pNew = apNew[i];

	6685 assert( j<nMaxCells );

	6686 zeroPage(pNew, pageFlags);

	6687 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);

	6688 assert( pNew->nCell>0 \|\| (nNew==1 && cntNew[0]==0) );

	6689 assert( pNew->nOverflow==0 );

	6690

	6691 j = cntNew[i];

	6692

	6693 /* If the sibling page assembled above was not the right-most sibling,

	6694 ** insert a divider cell into the parent page.

	6695 */

	6696 assert( i<nNew-1 \|\| j==nCell );

	6697 if( j<nCell ){

	6698 u8 *pCell;

	6699 u8 *pTemp;

	6700 int sz;

	6701

	6702 assert( j<nMaxCells );

	6703 pCell = apCell[j];

	6704 sz = szCell[j] + leafCorrection;

	6705 pTemp = &aOvflSpace[iOvflSpace];

	6706 if( !pNew->leaf ){

	6707 memcpy(&pNew->aData[8], pCell, 4);

	6708 }else if( leafData ){

	6709 /* If the tree is a leaf-data tree, and the siblings are leaves,

	6710 ** then there is no divider cell in apCell[]. Instead, the divider

	6711 ** cell consists of the integer key for the right-most cell of

	6712 ** the sibling-page assembled above only.

	6713 */

	6714 CellInfo info;

	6715 j--;

	6716 btreeParseCellPtr(pNew, apCell[j], &info);

	6717 pCell = pTemp;

	6718 sz = 4 + putVarint(&pCell[4], info.nKey);

	6719 pTemp = 0;

	6720 }else{

	6721 pCell -= 4;

	6722 /* Obscure case for non-leaf-data trees: If the cell at pCell was

	6723 ** previously stored on a leaf node, and its reported size was 4

	6724 ** bytes, then it may actually be smaller than this

	6725 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of

	6726 ** any cell). But it is important to pass the correct size to

	6727 ** insertCell(), so reparse the cell now.

	6728 **

	6729 ** Note that this can never happen in an SQLite data file, as all

	6730 ** cells are at least 4 bytes. It only happens in b-trees used

	6731 ** to evaluate "IN (SELECT ...)" and similar clauses.

	6732 */

	6733 if( szCell[j]==4 ){

	6734 assert(leafCorrection==4);

	6735 sz = cellSizePtr(pParent, pCell);

	6736 }

	6737 }

	6738 iOvflSpace += sz;

	6739 assert( sz<=pBt->maxLocal+23 );

	6740 assert( iOvflSpace <= (int)pBt->pageSize );

	6741 insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);

	6742 if( rc!=SQLITE_OK ) goto balance_cleanup;

	6743 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

	6744

	6745 j++;

	6746 nxDiv++;

	6747 }

	6748 }

	6749 assert( j==nCell );

	6750 assert( nOld>0 );

	6751 assert( nNew>0 );

	6752 if( (pageFlags & PTF_LEAF)==0 ){

	6753 u8 *zChild = &apCopy[nOld-1]->aData[8];

	6754 memcpy(&apNew[nNew-1]->aData[8], zChild, 4);

	6755 }

	6756

	6757 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){

	6758 /* The root page of the b-tree now contains no cells. The only sibling

	6759 ** page is the right-child of the parent. Copy the contents of the

	6760 ** child page into the parent, decreasing the overall height of the

	6761 ** b-tree structure by one. This is described as the "balance-shallower"

	6762 ** sub-algorithm in some documentation.

	6763 **

	6764 ** If this is an auto-vacuum database, the call to copyNodeContent()

	6765 ** sets all pointer-map entries corresponding to database image pages

	6766 ** for which the pointer is stored within the content being copied.

	6767 **

	6768 ** The second assert below verifies that the child page is defragmented

	6769 ** (it must be, as it was just reconstructed using assemblePage()). This

	6770 ** is important if the parent page happens to be page 1 of the database

	6771 ** image. */

	6772 assert( nNew==1 );

	6773 assert( apNew[0]->nFree ==

	6774 (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)

	6775 );

	6776 copyNodeContent(apNew[0], pParent, &rc);

	6777 freePage(apNew[0], &rc);

	6778 }else if( ISAUTOVACUUM ){

	6779 /* Fix the pointer-map entries for all the cells that were shifted around.

	6780 ** There are several different types of pointer-map entries that need to

	6781 ** be dealt with by this routine. Some of these have been set already, but

	6782 ** many have not. The following is a summary:

	6783 **

	6784 ** 1) The entries associated with new sibling pages that were not

	6785 ** siblings when this function was called. These have already

	6786 ** been set. We don't need to worry about old siblings that were

	6787 ** moved to the free-list - the freePage() code has taken care

	6788 ** of those.

	6789 **

	6790 ** 2) The pointer-map entries associated with the first overflow

	6791 ** page in any overflow chains used by new divider cells. These

	6792 ** have also already been taken care of by the insertCell() code.

	6793 **

	6794 ** 3) If the sibling pages are not leaves, then the child pages of

	6795 ** cells stored on the sibling pages may need to be updated.

	6796 **

	6797 ** 4) If the sibling pages are not internal intkey nodes, then any

	6798 ** overflow pages used by these cells may need to be updated

	6799 ** (internal intkey nodes never contain pointers to overflow pages).

	6800 **

	6801 ** 5) If the sibling pages are not leaves, then the pointer-map

	6802 ** entries for the right-child pages of each sibling may need

	6803 ** to be updated.

	6804 **

	6805 ** Cases 1 and 2 are dealt with above by other code. The next

	6806 ** block deals with cases 3 and 4 and the one after that, case 5. Since

	6807 ** setting a pointer map entry is a relatively expensive operation, this

	6808 ** code only sets pointer map entries for child or overflow pages that have

	6809 ** actually moved between pages. */

	6810 MemPage *pNew = apNew[0];

	6811 MemPage *pOld = apCopy[0];

	6812 int nOverflow = pOld->nOverflow;

	6813 int iNextOld = pOld->nCell + nOverflow;

	6814 int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1);

	6815 j = 0; /* Current 'old' sibling page */

	6816 k = 0; /* Current 'new' sibling page */

	6817 for(i=0; i<nCell; i++){

	6818 int isDivider = 0;

	6819 while( i==iNextOld ){

	6820 /* Cell i is the cell immediately following the last cell on old

	6821 ** sibling page j. If the siblings are not leaf pages of an

	6822 ** intkey b-tree, then cell i was a divider cell. */

	6823 assert( j+1 < ArraySize(apCopy) );

	6824 assert( j+1 < nOld );

	6825 pOld = apCopy[++j];

	6826 iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;

	6827 if( pOld->nOverflow ){

	6828 nOverflow = pOld->nOverflow;

	6829 iOverflow = i + !leafData + pOld->aiOvfl[0];

	6830 }

	6831 isDivider = !leafData;

	6832 }

	6833

	6834 assert(nOverflow>0 \|\| iOverflow<i );

	6835 assert(nOverflow<2 \|\| pOld->aiOvfl[0]==pOld->aiOvfl[1]-1);

	6836 assert(nOverflow<3 \|\| pOld->aiOvfl[1]==pOld->aiOvfl[2]-1);

	6837 if( i==iOverflow ){

	6838 isDivider = 1;

	6839 if( (--nOverflow)>0 ){

	6840 iOverflow++;

	6841 }

	6842 }

	6843

	6844 if( i==cntNew[k] ){

	6845 /* Cell i is the cell immediately following the last cell on new

	6846 ** sibling page k. If the siblings are not leaf pages of an

	6847 ** intkey b-tree, then cell i is a divider cell. */

	6848 pNew = apNew[++k];

	6849 if( !leafData ) continue;

	6850 }

	6851 assert( j<nOld );

	6852 assert( k<nNew );

	6853

	6854 /* If the cell was originally divider cell (and is not now) or

	6855 ** an overflow cell, or if the cell was located on a different sibling

	6856 ** page before the balancing, then the pointer map entries associated

	6857 ** with any child or overflow pages need to be updated. */

	6858 if( isDivider \|\| pOld->pgno!=pNew->pgno ){

	6859 if( !leafCorrection ){

	6860 ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);

	6861 }

	6862 if( szCell[i]>pNew->minLocal ){

	6863 ptrmapPutOvflPtr(pNew, apCell[i], &rc);

	6864 }

	6865 }

	6866 }

	6867

	6868 if( !leafCorrection ){

	6869 for(i=0; i<nNew; i++){

	6870 u32 key = get4byte(&apNew[i]->aData[8]);

	6871 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);

	6872 }

	6873 }

	6874

	6875 #if 0

	6876 /* The ptrmapCheckPages() contains assert() statements that verify that

	6877 ** all pointer map pages are set correctly. This is helpful while

	6878 ** debugging. This is usually disabled because a corrupt database may

	6879 ** cause an assert() statement to fail. */

	6880 ptrmapCheckPages(apNew, nNew);

	6881 ptrmapCheckPages(&pParent, 1);

	6882 #endif

	6883 }

	6884

	6885 assert( pParent->isInit );

	6886 TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",

	6887 nOld, nNew, nCell));

	6888

	6889 /*

	6890 ** Cleanup before returning.

	6891 */

	6892 balance_cleanup:

	6893 sqlite3ScratchFree(apCell);

	6894 for(i=0; i<nOld; i++){

	6895 releasePage(apOld[i]);

	6896 }

	6897 for(i=0; i<nNew; i++){

	6898 releasePage(apNew[i]);

	6899 }

	6900

	6901 return rc;

	6902 }

	6903 #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)

	6904 #pragma optimize("", on)

	6905 #endif

	6906

	6907

	6908 /*

	6909 ** This function is called when the root page of a b-tree structure is

	6910 ** overfull (has one or more overflow pages).

	6911 **

	6912 ** A new child page is allocated and the contents of the current root

	6913 ** page, including overflow cells, are copied into the child. The root

	6914 ** page is then overwritten to make it an empty page with the right-child

	6915 ** pointer pointing to the new page.

	6916 **

	6917 ** Before returning, all pointer-map entries corresponding to pages

	6918 ** that the new child-page now contains pointers to are updated. The

	6919 ** entry corresponding to the new right-child pointer of the root

	6920 ** page is also updated.

	6921 **

	6922 ** If successful, *ppChild is set to contain a reference to the child

	6923 ** page and SQLITE_OK is returned. In this case the caller is required

	6924 ** to call releasePage() on *ppChild exactly once. If an error occurs,

	6925 ** an error code is returned and *ppChild is set to 0.

	6926 */

	6927 static int balance_deeper(MemPage pRoot, MemPage *ppChild){

	6928 int rc; /* Return value from subprocedures */

	6929 MemPage pChild = 0; / Pointer to a new child page */

	6930 Pgno pgnoChild = 0; /* Page number of the new child page */

	6931 BtShared pBt = pRoot->pBt; / The BTree */

	6932

	6933 assert( pRoot->nOverflow>0 );

	6934 assert( sqlite3_mutex_held(pBt->mutex) );

	6935

	6936 /* Make pRoot, the root page of the b-tree, writable. Allocate a new

	6937 ** page that will become the new right-child of pPage. Copy the contents

	6938 ** of the node stored on pRoot into the new child page.

	6939 */

	6940 rc = sqlite3PagerWrite(pRoot->pDbPage);

	6941 if( rc==SQLITE_OK ){

	6942 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);

	6943 copyNodeContent(pRoot, pChild, &rc);

	6944 if( ISAUTOVACUUM ){

	6945 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);

	6946 }

	6947 }

	6948 if( rc ){

	6949 *ppChild = 0;

	6950 releasePage(pChild);

	6951 return rc;

	6952 }

	6953 assert( sqlite3PagerIswriteable(pChild->pDbPage) );

	6954 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );

	6955 assert( pChild->nCell==pRoot->nCell );

	6956

	6957 TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));

	6958

	6959 /* Copy the overflow cells from pRoot to pChild */

	6960 memcpy(pChild->aiOvfl, pRoot->aiOvfl,

	6961 pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));

	6962 memcpy(pChild->apOvfl, pRoot->apOvfl,

	6963 pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));

	6964 pChild->nOverflow = pRoot->nOverflow;

	6965

	6966 /* Zero the contents of pRoot. Then install pChild as the right-child. */

	6967 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);

	6968 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);

	6969

	6970 *ppChild = pChild;

	6971 return SQLITE_OK;

	6972 }

	6973

	6974 /*

	6975 ** The page that pCur currently points to has just been modified in

	6976 ** some way. This function figures out if this modification means the

	6977 ** tree needs to be balanced, and if so calls the appropriate balancing

	6978 ** routine. Balancing routines are:

	6979 **

	6980 ** balance_quick()

	6981 ** balance_deeper()

	6982 ** balance_nonroot()

	6983 */

	6984 static int balance(BtCursor *pCur){

	6985 int rc = SQLITE_OK;

	6986 const int nMin = pCur->pBt->usableSize * 2 / 3;

	6987 u8 aBalanceQuickSpace[13];

	6988 u8 *pFree = 0;

	6989

	6990 TESTONLY( int balance_quick_called = 0 );

	6991 TESTONLY( int balance_deeper_called = 0 );

	6992

	6993 do {

	6994 int iPage = pCur->iPage;

	6995 MemPage *pPage = pCur->apPage[iPage];

	6996

	6997 if( iPage==0 ){

	6998 if( pPage->nOverflow ){

	6999 /* The root page of the b-tree is overfull. In this case call the

	7000 ** balance_deeper() function to create a new child for the root-page

	7001 ** and copy the current contents of the root-page to it. The

	7002 ** next iteration of the do-loop will balance the child page.

	7003 */

	7004 assert( (balance_deeper_called++)==0 );

	7005 rc = balance_deeper(pPage, &pCur->apPage[1]);

	7006 if( rc==SQLITE_OK ){

	7007 pCur->iPage = 1;

	7008 pCur->aiIdx[0] = 0;

	7009 pCur->aiIdx[1] = 0;

	7010 assert( pCur->apPage[1]->nOverflow );

	7011 }

	7012 }else{

	7013 break;

	7014 }

	7015 }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){

	7016 break;

	7017 }else{

	7018 MemPage * const pParent = pCur->apPage[iPage-1];

	7019 int const iIdx = pCur->aiIdx[iPage-1];

	7020

	7021 rc = sqlite3PagerWrite(pParent->pDbPage);

	7022 if( rc==SQLITE_OK ){

	7023 #ifndef SQLITE_OMIT_QUICKBALANCE

	7024 if( pPage->intKeyLeaf

	7025 && pPage->nOverflow==1

	7026 && pPage->aiOvfl[0]==pPage->nCell

	7027 && pParent->pgno!=1

	7028 && pParent->nCell==iIdx

	7029 ){

	7030 /* Call balance_quick() to create a new sibling of pPage on which

	7031 ** to store the overflow cell. balance_quick() inserts a new cell

	7032 ** into pParent, which may cause pParent overflow. If this

	7033 ** happens, the next iteration of the do-loop will balance pParent

	7034 ** use either balance_nonroot() or balance_deeper(). Until this

	7035 ** happens, the overflow cell is stored in the aBalanceQuickSpace[]

	7036 ** buffer.

	7037 **

	7038 ** The purpose of the following assert() is to check that only a

	7039 ** single call to balance_quick() is made for each call to this

	7040 ** function. If this were not verified, a subtle bug involving reuse

	7041 ** of the aBalanceQuickSpace[] might sneak in.

	7042 */

	7043 assert( (balance_quick_called++)==0 );

	7044 rc = balance_quick(pParent, pPage, aBalanceQuickSpace);

	7045 }else

	7046 #endif

	7047 {

	7048 /* In this case, call balance_nonroot() to redistribute cells

	7049 ** between pPage and up to 2 of its sibling pages. This involves

	7050 ** modifying the contents of pParent, which may cause pParent to

	7051 ** become overfull or underfull. The next iteration of the do-loop

	7052 ** will balance the parent page to correct this.

	7053 **

	7054 ** If the parent page becomes overfull, the overflow cell or cells

	7055 ** are stored in the pSpace buffer allocated immediately below.

	7056 ** A subsequent iteration of the do-loop will deal with this by

	7057 ** calling balance_nonroot() (balance_deeper() may be called first,

	7058 ** but it doesn't deal with overflow cells - just moves them to a

	7059 ** different page). Once this subsequent call to balance_nonroot()

	7060 ** has completed, it is safe to release the pSpace buffer used by

	7061 ** the previous call, as the overflow cell data will have been

	7062 ** copied either into the body of a database page or into the new

	7063 ** pSpace buffer passed to the latter call to balance_nonroot().

	7064 */

	7065 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);

	7066 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, pCur->hints);

	7067 if( pFree ){

	7068 /* If pFree is not NULL, it points to the pSpace buffer used

	7069 ** by a previous call to balance_nonroot(). Its contents are

	7070 ** now stored either on real database pages or within the

	7071 ** new pSpace buffer, so it may be safely freed here. */

	7072 sqlite3PageFree(pFree);

	7073 }

	7074

	7075 /* The pSpace buffer will be freed after the next call to

	7076 ** balance_nonroot(), or just before this function returns, whichever

	7077 ** comes first. */

	7078 pFree = pSpace;

	7079 }

	7080 }

	7081

	7082 pPage->nOverflow = 0;

	7083

	7084 /* The next iteration of the do-loop balances the parent page. */

	7085 releasePage(pPage);

	7086 pCur->iPage--;

	7087 }

	7088 }while( rc==SQLITE_OK );

	7089

	7090 if( pFree ){

	7091 sqlite3PageFree(pFree);

	7092 }

	7093 return rc;

	7094 }

	7095

	7096

	7097 /*

	7098 ** Insert a new record into the BTree. The key is given by (pKey,nKey)

	7099 ** and the data is given by (pData,nData). The cursor is used only to

	7100 ** define what table the record should be inserted into. The cursor

	7101 ** is left pointing at a random location.

	7102 **

	7103 ** For an INTKEY table, only the nKey value of the key is used. pKey is

	7104 ** ignored. For a ZERODATA table, the pData and nData are both ignored.

	7105 **

	7106 ** If the seekResult parameter is non-zero, then a successful call to

	7107 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already

	7108 ** been performed. seekResult is the search result returned (a negative

	7109 ** number if pCur points at an entry that is smaller than (pKey, nKey), or

	7110 ** a positive value if pCur points at an entry that is larger than

	7111 ** (pKey, nKey)).

	7112 **

	7113 ** If the seekResult parameter is non-zero, then the caller guarantees that

	7114 ** cursor pCur is pointing at the existing copy of a row that is to be

	7115 ** overwritten. If the seekResult parameter is 0, then cursor pCur may

	7116 ** point to any entry or to no entry at all and so this function has to seek

	7117 ** the cursor before the new key can be inserted.

	7118 */

	7119 int sqlite3BtreeInsert(

	7120 BtCursor pCur, / Insert data into the table of this cursor */

	7121 const void pKey, i64 nKey, / The key of the new record */

	7122 const void pData, int nData, / The data of the new record */

	7123 int nZero, /* Number of extra 0 bytes to append to data */

	7124 int appendBias, /* True if this is likely an append */

	7125 int seekResult /* Result of prior MovetoUnpacked() call */

	7126 ){

	7127 int rc;

	7128 int loc = seekResult; /* -1: before desired location +1: after */

	7129 int szNew = 0;

	7130 int idx;

	7131 MemPage *pPage;

	7132 Btree *p = pCur->pBtree;

	7133 BtShared *pBt = p->pBt;

	7134 unsigned char *oldCell;

	7135 unsigned char *newCell = 0;

	7136

	7137 if( pCur->eState==CURSOR_FAULT ){

	7138 assert( pCur->skipNext!=SQLITE_OK );

	7139 return pCur->skipNext;

	7140 }

	7141

	7142 assert( cursorHoldsMutex(pCur) );

	7143 assert( (pCur->curFlags & BTCF_WriteFlag)!=0

	7144 && pBt->inTransaction==TRANS_WRITE

	7145 && (pBt->btsFlags & BTS_READ_ONLY)==0 );

	7146 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );

	7147

	7148 /* Assert that the caller has been consistent. If this cursor was opened

	7149 ** expecting an index b-tree, then the caller should be inserting blob

	7150 ** keys with no associated data. If the cursor was opened expecting an

	7151 ** intkey table, the caller should be inserting integer keys with a

	7152 ** blob of associated data. */

	7153 assert( (pKey==0)==(pCur->pKeyInfo==0) );

	7154

	7155 /* Save the positions of any other cursors open on this table.

	7156 **

	7157 ** In some cases, the call to btreeMoveto() below is a no-op. For

	7158 ** example, when inserting data into a table with auto-generated integer

	7159 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the

	7160 ** integer key to use. It then calls this function to actually insert the

	7161 ** data into the intkey B-Tree. In this case btreeMoveto() recognizes

	7162 ** that the cursor is already where it needs to be and returns without

	7163 ** doing any work. To avoid thwarting these optimizations, it is important

	7164 ** not to clear the cursor here.

	7165 */

	7166 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);

	7167 if( rc ) return rc;

	7168

	7169 if( pCur->pKeyInfo==0 ){

	7170 /* If this is an insert into a table b-tree, invalidate any incrblob

	7171 ** cursors open on the row being replaced */

	7172 invalidateIncrblobCursors(p, nKey, 0);

	7173

	7174 /* If the cursor is currently on the last row and we are appending a

	7175 ** new row onto the end, set the "loc" to avoid an unnecessary btreeMoveto()

	7176 ** call */

	7177 if( (pCur->curFlags&BTCF_ValidNKey)!=0 && nKey>0

	7178 && pCur->info.nKey==nKey-1 ){

	7179 loc = -1;

	7180 }

	7181 }

	7182

	7183 if( !loc ){

	7184 rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);

	7185 if( rc ) return rc;

	7186 }

	7187 assert( pCur->eState==CURSOR_VALID \|\| (pCur->eState==CURSOR_INVALID && loc) );

	7188

	7189 pPage = pCur->apPage[pCur->iPage];

	7190 assert( pPage->intKey \|\| nKey>=0 );

	7191 assert( pPage->leaf \|\| !pPage->intKey );

	7192

	7193 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",

	7194 pCur->pgnoRoot, nKey, nData, pPage->pgno,

	7195 loc==0 ? "overwrite" : "new entry"));

	7196 assert( pPage->isInit );

	7197 newCell = pBt->pTmpSpace;

	7198 assert( newCell!=0 );

	7199 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);

	7200 if( rc ) goto end_insert;

	7201 assert( szNew==cellSizePtr(pPage, newCell) );

	7202 assert( szNew <= MX_CELL_SIZE(pBt) );

	7203 idx = pCur->aiIdx[pCur->iPage];

	7204 if( loc==0 ){

	7205 u16 szOld;

	7206 assert( idx<pPage->nCell );

	7207 rc = sqlite3PagerWrite(pPage->pDbPage);

	7208 if( rc ){

	7209 goto end_insert;

	7210 }

	7211 oldCell = findCell(pPage, idx);

	7212 if( !pPage->leaf ){

	7213 memcpy(newCell, oldCell, 4);

	7214 }

	7215 rc = clearCell(pPage, oldCell, &szOld);

	7216 dropCell(pPage, idx, szOld, &rc);

	7217 if( rc ) goto end_insert;

	7218 }else if( loc<0 && pPage->nCell>0 ){

	7219 assert( pPage->leaf );

	7220 idx = ++pCur->aiIdx[pCur->iPage];

	7221 }else{

	7222 assert( pPage->leaf );

	7223 }

	7224 insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);

	7225 assert( rc!=SQLITE_OK \|\| pPage->nCell>0 \|\| pPage->nOverflow>0 );

	7226

	7227 /* If no error has occurred and pPage has an overflow cell, call balance()

	7228 ** to redistribute the cells within the tree. Since balance() may move

	7229 ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey

	7230 ** variables.

	7231 **

	7232 ** Previous versions of SQLite called moveToRoot() to move the cursor

	7233 ** back to the root page as balance() used to invalidate the contents

	7234 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,

	7235 ** set the cursor state to "invalid". This makes common insert operations

	7236 ** slightly faster.

	7237 **

	7238 ** There is a subtle but important optimization here too. When inserting

	7239 ** multiple records into an intkey b-tree using a single cursor (as can

	7240 ** happen while processing an "INSERT INTO ... SELECT" statement), it

	7241 ** is advantageous to leave the cursor pointing to the last entry in

	7242 ** the b-tree if possible. If the cursor is left pointing to the last

	7243 ** entry in the table, and the next row inserted has an integer key

	7244 ** larger than the largest existing key, it is possible to insert the

	7245 ** row without seeking the cursor. This can be a big performance boost.

	7246 */

	7247 pCur->info.nSize = 0;

	7248 if( rc==SQLITE_OK && pPage->nOverflow ){

	7249 pCur->curFlags &= ~(BTCF_ValidNKey);

	7250 rc = balance(pCur);

	7251

	7252 /* Must make sure nOverflow is reset to zero even if the balance()

	7253 ** fails. Internal data structure corruption will result otherwise.

	7254 ** Also, set the cursor state to invalid. This stops saveCursorPosition()

	7255 ** from trying to save the current position of the cursor. */

	7256 pCur->apPage[pCur->iPage]->nOverflow = 0;

	7257 pCur->eState = CURSOR_INVALID;

	7258 }

	7259 assert( pCur->apPage[pCur->iPage]->nOverflow==0 );

	7260

	7261 end_insert:

	7262 return rc;

	7263 }

	7264

	7265 /*

	7266 ** Delete the entry that the cursor is pointing to. The cursor

	7267 ** is left pointing at an arbitrary location.

	7268 */

	7269 int sqlite3BtreeDelete(BtCursor *pCur){

	7270 Btree *p = pCur->pBtree;

	7271 BtShared *pBt = p->pBt;

	7272 int rc; /* Return code */

	7273 MemPage pPage; / Page to delete cell from */

	7274 unsigned char pCell; / Pointer to cell to delete */

	7275 int iCellIdx; /* Index of cell to delete */

	7276 int iCellDepth; /* Depth of node containing pCell */

	7277 u16 szCell; /* Size of the cell being deleted */

	7278

	7279 assert( cursorHoldsMutex(pCur) );

	7280 assert( pBt->inTransaction==TRANS_WRITE );

	7281 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

	7282 assert( pCur->curFlags & BTCF_WriteFlag );

	7283 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );

	7284 assert( !hasReadConflicts(p, pCur->pgnoRoot) );

	7285

	7286 if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell)

	7287 \|\| NEVER(pCur->eState!=CURSOR_VALID)

	7288 ){

	7289 return SQLITE_ERROR; /* Something has gone awry. */

	7290 }

	7291

	7292 iCellDepth = pCur->iPage;

	7293 iCellIdx = pCur->aiIdx[iCellDepth];

	7294 pPage = pCur->apPage[iCellDepth];

	7295 pCell = findCell(pPage, iCellIdx);

	7296

	7297 /* If the page containing the entry to delete is not a leaf page, move

	7298 ** the cursor to the largest entry in the tree that is smaller than

	7299 ** the entry being deleted. This cell will replace the cell being deleted

	7300 ** from the internal node. The 'previous' entry is used for this instead

	7301 ** of the 'next' entry, as the previous entry is always a part of the

	7302 ** sub-tree headed by the child page of the cell being deleted. This makes

	7303 ** balancing the tree following the delete operation easier. */

	7304 if( !pPage->leaf ){

	7305 int notUsed = 0;

	7306 rc = sqlite3BtreePrevious(pCur, &notUsed);

	7307 if( rc ) return rc;

	7308 }

	7309

	7310 /* Save the positions of any other cursors open on this table before

	7311 ** making any modifications. Make the page containing the entry to be

	7312 ** deleted writable. Then free any overflow pages associated with the

	7313 ** entry and finally remove the cell itself from within the page.

	7314 */

	7315 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);

	7316 if( rc ) return rc;

	7317

	7318 /* If this is a delete operation to remove a row from a table b-tree,

	7319 ** invalidate any incrblob cursors open on the row being deleted. */

	7320 if( pCur->pKeyInfo==0 ){

	7321 invalidateIncrblobCursors(p, pCur->info.nKey, 0);

	7322 }

	7323

	7324 rc = sqlite3PagerWrite(pPage->pDbPage);

	7325 if( rc ) return rc;

	7326 rc = clearCell(pPage, pCell, &szCell);

	7327 dropCell(pPage, iCellIdx, szCell, &rc);

	7328 if( rc ) return rc;

	7329

	7330 /* If the cell deleted was not located on a leaf page, then the cursor

	7331 ** is currently pointing to the largest entry in the sub-tree headed

	7332 ** by the child-page of the cell that was just deleted from an internal

	7333 ** node. The cell from the leaf node needs to be moved to the internal

	7334 ** node to replace the deleted cell. */

	7335 if( !pPage->leaf ){

	7336 MemPage *pLeaf = pCur->apPage[pCur->iPage];

	7337 int nCell;

	7338 Pgno n = pCur->apPage[iCellDepth+1]->pgno;

	7339 unsigned char *pTmp;

	7340

	7341 pCell = findCell(pLeaf, pLeaf->nCell-1);

	7342 nCell = cellSizePtr(pLeaf, pCell);

	7343 assert( MX_CELL_SIZE(pBt) >= nCell );

	7344 pTmp = pBt->pTmpSpace;

	7345 assert( pTmp!=0 );

	7346 rc = sqlite3PagerWrite(pLeaf->pDbPage);

	7347 insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);

	7348 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);

	7349 if( rc ) return rc;

	7350 }

	7351

	7352 /* Balance the tree. If the entry deleted was located on a leaf page,

	7353 ** then the cursor still points to that page. In this case the first

	7354 ** call to balance() repairs the tree, and the if(...) condition is

	7355 ** never true.

	7356 **

	7357 ** Otherwise, if the entry deleted was on an internal node page, then

	7358 ** pCur is pointing to the leaf page from which a cell was removed to

	7359 ** replace the cell deleted from the internal node. This is slightly

	7360 ** tricky as the leaf node may be underfull, and the internal node may

	7361 ** be either under or overfull. In this case run the balancing algorithm

	7362 ** on the leaf node first. If the balance proceeds far enough up the

	7363 ** tree that we can be sure that any problem in the internal node has

	7364 ** been corrected, so be it. Otherwise, after balancing the leaf node,

	7365 ** walk the cursor up the tree to the internal node and balance it as

	7366 ** well. */

	7367 rc = balance(pCur);

	7368 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){

	7369 while( pCur->iPage>iCellDepth ){

	7370 releasePage(pCur->apPage[pCur->iPage--]);

	7371 }

	7372 rc = balance(pCur);

	7373 }

	7374

	7375 if( rc==SQLITE_OK ){

	7376 moveToRoot(pCur);

	7377 }

	7378 return rc;

	7379 }

	7380

	7381 /*

	7382 ** Create a new BTree table. Write into *piTable the page

	7383 ** number for the root page of the new table.

	7384 **

	7385 ** The type of type is determined by the flags parameter. Only the

	7386 ** following values of flags are currently in use. Other values for

	7387 ** flags might not work:

	7388 **

	7389 ** BTREE_INTKEY\|BTREE_LEAFDATA Used for SQL tables with rowid keys

	7390 ** BTREE_ZERODATA Used for SQL indices

	7391 */

	7392 static int btreeCreateTable(Btree p, int piTable, int createTabFlags){

	7393 BtShared *pBt = p->pBt;

	7394 MemPage *pRoot;

	7395 Pgno pgnoRoot;

	7396 int rc;

	7397 int ptfFlags; /* Page-type flage for the root page of new table */

	7398

	7399 assert( sqlite3BtreeHoldsMutex(p) );

	7400 assert( pBt->inTransaction==TRANS_WRITE );

	7401 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

	7402

	7403 #ifdef SQLITE_OMIT_AUTOVACUUM

	7404 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);

	7405 if( rc ){

	7406 return rc;

	7407 }

	7408 #else

	7409 if( pBt->autoVacuum ){

	7410 Pgno pgnoMove; /* Move a page here to make room for the root-page */

	7411 MemPage pPageMove; / The page to move to. */

	7412

	7413 /* Creating a new table may probably require moving an existing database

	7414 ** to make room for the new tables root page. In case this page turns

	7415 ** out to be an overflow page, delete all overflow page-map caches

	7416 ** held by open cursors.

	7417 */

	7418 invalidateAllOverflowCache(pBt);

	7419

	7420 /* Read the value of meta[3] from the database to determine where the

	7421 ** root page of the new table should go. meta[3] is the largest root-page

	7422 ** created so far, so the new root-page is (meta[3]+1).

	7423 */

	7424 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);

	7425 pgnoRoot++;

	7426

	7427 /* The new root-page may not be allocated on a pointer-map page, or the

	7428 ** PENDING_BYTE page.

	7429 */

	7430 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) \|\|

	7431 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){

	7432 pgnoRoot++;

	7433 }

	7434 assert( pgnoRoot>=3 );

	7435

	7436 /* Allocate a page. The page that currently resides at pgnoRoot will

	7437 ** be moved to the allocated page (unless the allocated page happens

	7438 ** to reside at pgnoRoot).

	7439 */

	7440 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);

	7441 if( rc!=SQLITE_OK ){

	7442 return rc;

	7443 }

	7444

	7445 if( pgnoMove!=pgnoRoot ){

	7446 /* pgnoRoot is the page that will be used for the root-page of

	7447 ** the new table (assuming an error did not occur). But we were

	7448 ** allocated pgnoMove. If required (i.e. if it was not allocated

	7449 ** by extending the file), the current page at position pgnoMove

	7450 ** is already journaled.

	7451 */

	7452 u8 eType = 0;

	7453 Pgno iPtrPage = 0;

	7454

	7455 /* Save the positions of any open cursors. This is required in

	7456 ** case they are holding a reference to an xFetch reference

	7457 ** corresponding to page pgnoRoot. */

	7458 rc = saveAllCursors(pBt, 0, 0);

	7459 releasePage(pPageMove);

	7460 if( rc!=SQLITE_OK ){

	7461 return rc;

	7462 }

	7463

	7464 /* Move the page currently at pgnoRoot to pgnoMove. */

	7465 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);

	7466 if( rc!=SQLITE_OK ){

	7467 return rc;

	7468 }

	7469 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);

	7470 if( eType==PTRMAP_ROOTPAGE \|\| eType==PTRMAP_FREEPAGE ){

	7471 rc = SQLITE_CORRUPT_BKPT;

	7472 }

	7473 if( rc!=SQLITE_OK ){

	7474 releasePage(pRoot);

	7475 return rc;

	7476 }

	7477 assert( eType!=PTRMAP_ROOTPAGE );

	7478 assert( eType!=PTRMAP_FREEPAGE );

	7479 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);

	7480 releasePage(pRoot);

	7481

	7482 /* Obtain the page at pgnoRoot */

	7483 if( rc!=SQLITE_OK ){

	7484 return rc;

	7485 }

	7486 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);

	7487 if( rc!=SQLITE_OK ){

	7488 return rc;

	7489 }

	7490 rc = sqlite3PagerWrite(pRoot->pDbPage);

	7491 if( rc!=SQLITE_OK ){

	7492 releasePage(pRoot);

	7493 return rc;

	7494 }

	7495 }else{

	7496 pRoot = pPageMove;

	7497 }

	7498

	7499 /* Update the pointer-map and meta-data with the new root-page number. */

	7500 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);

	7501 if( rc ){

	7502 releasePage(pRoot);

	7503 return rc;

	7504 }

	7505

	7506 /* When the new root page was allocated, page 1 was made writable in

	7507 ** order either to increase the database filesize, or to decrement the

	7508 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail.

	7509 */

	7510 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );

	7511 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);

	7512 if( NEVER(rc) ){

	7513 releasePage(pRoot);

	7514 return rc;

	7515 }

	7516

	7517 }else{

	7518 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);

	7519 if( rc ) return rc;

	7520 }

	7521 #endif

	7522 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );

	7523 if( createTabFlags & BTREE_INTKEY ){

	7524 ptfFlags = PTF_INTKEY \| PTF_LEAFDATA \| PTF_LEAF;

	7525 }else{

	7526 ptfFlags = PTF_ZERODATA \| PTF_LEAF;

	7527 }

	7528 zeroPage(pRoot, ptfFlags);

	7529 sqlite3PagerUnref(pRoot->pDbPage);

	7530 assert( (pBt->openFlags & BTREE_SINGLE)==0 \|\| pgnoRoot==2 );

	7531 *piTable = (int)pgnoRoot;

	7532 return SQLITE_OK;

	7533 }

	7534 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){

	7535 int rc;

	7536 sqlite3BtreeEnter(p);

	7537 rc = btreeCreateTable(p, piTable, flags);

	7538 sqlite3BtreeLeave(p);

	7539 return rc;

	7540 }

	7541

	7542 /*

	7543 ** Erase the given database page and all its children. Return

	7544 ** the page to the freelist.

	7545 */

	7546 static int clearDatabasePage(

	7547 BtShared *pBt, /* The BTree that contains the table */

	7548 Pgno pgno, /* Page number to clear */

	7549 int freePageFlag, /* Deallocate page if true */

	7550 int *pnChange /* Add number of Cells freed to this counter */

	7551 ){

	7552 MemPage *pPage;

	7553 int rc;

	7554 unsigned char *pCell;

	7555 int i;

	7556 int hdr;

	7557 u16 szCell;

	7558

	7559 assert( sqlite3_mutex_held(pBt->mutex) );

	7560 if( pgno>btreePagecount(pBt) ){

	7561 return SQLITE_CORRUPT_BKPT;

	7562 }

	7563

	7564 rc = getAndInitPage(pBt, pgno, &pPage, 0);

	7565 if( rc ) return rc;

	7566 hdr = pPage->hdrOffset;

	7567 for(i=0; i<pPage->nCell; i++){

	7568 pCell = findCell(pPage, i);

	7569 if( !pPage->leaf ){

	7570 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);

	7571 if( rc ) goto cleardatabasepage_out;

	7572 }

	7573 rc = clearCell(pPage, pCell, &szCell);

	7574 if( rc ) goto cleardatabasepage_out;

	7575 }

	7576 if( !pPage->leaf ){

	7577 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);

	7578 if( rc ) goto cleardatabasepage_out;

	7579 }else if( pnChange ){

	7580 assert( pPage->intKey );

	7581 *pnChange += pPage->nCell;

	7582 }

	7583 if( freePageFlag ){

	7584 freePage(pPage, &rc);

	7585 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){

	7586 zeroPage(pPage, pPage->aData[hdr] \| PTF_LEAF);

	7587 }

	7588

	7589 cleardatabasepage_out:

	7590 releasePage(pPage);

	7591 return rc;

	7592 }

	7593

	7594 /*

	7595 ** Delete all information from a single table in the database. iTable is

	7596 ** the page number of the root of the table. After this routine returns,

	7597 ** the root page is empty, but still exists.

	7598 **

	7599 ** This routine will fail with SQLITE_LOCKED if there are any open

	7600 ** read cursors on the table. Open write cursors are moved to the

	7601 ** root of the table.

	7602 **

	7603 ** If pnChange is not NULL, then table iTable must be an intkey table. The

	7604 ** integer value pointed to by pnChange is incremented by the number of

	7605 ** entries in the table.

	7606 */

	7607 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){

	7608 int rc;

	7609 BtShared *pBt = p->pBt;

	7610 sqlite3BtreeEnter(p);

	7611 assert( p->inTrans==TRANS_WRITE );

	7612

	7613 rc = saveAllCursors(pBt, (Pgno)iTable, 0);

	7614

	7615 if( SQLITE_OK==rc ){

	7616 /* Invalidate all incrblob cursors open on table iTable (assuming iTable

	7617 ** is the root of a table b-tree - if it is not, the following call is

	7618 ** a no-op). */

	7619 invalidateIncrblobCursors(p, 0, 1);

	7620 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);

	7621 }

	7622 sqlite3BtreeLeave(p);

	7623 return rc;

	7624 }

	7625

	7626 /*

	7627 ** Delete all information from the single table that pCur is open on.

	7628 **

	7629 ** This routine only works for pCur on an ephemeral table.

	7630 */

	7631 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){

	7632 return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);

	7633 }

	7634

	7635 /*

	7636 ** Erase all information in a table and add the root of the table to

	7637 ** the freelist. Except, the root of the principal table (the one on

	7638 ** page 1) is never added to the freelist.

	7639 **

	7640 ** This routine will fail with SQLITE_LOCKED if there are any open

	7641 ** cursors on the table.

	7642 **

	7643 ** If AUTOVACUUM is enabled and the page at iTable is not the last

	7644 ** root page in the database file, then the last root page

	7645 ** in the database file is moved into the slot formerly occupied by

	7646 ** iTable and that last slot formerly occupied by the last root page

	7647 ** is added to the freelist instead of iTable. In this way, all

	7648 ** root pages are kept at the beginning of the database file, which

	7649 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the

	7650 ** page number that used to be the last root page in the file before

	7651 ** the move. If no page gets moved, *piMoved is set to 0.

	7652 ** The last root page is recorded in meta[3] and the value of

	7653 ** meta[3] is updated by this procedure.

	7654 */

	7655 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){

	7656 int rc;

	7657 MemPage *pPage = 0;

	7658 BtShared *pBt = p->pBt;

	7659

	7660 assert( sqlite3BtreeHoldsMutex(p) );

	7661 assert( p->inTrans==TRANS_WRITE );

	7662

	7663 /* It is illegal to drop a table if any cursors are open on the

	7664 ** database. This is because in auto-vacuum mode the backend may

	7665 ** need to move another root-page to fill a gap left by the deleted

	7666 ** root page. If an open cursor was using this page a problem would

	7667 ** occur.

	7668 **

	7669 ** This error is caught long before control reaches this point.

	7670 */

	7671 if( NEVER(pBt->pCursor) ){

	7672 sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);

	7673 return SQLITE_LOCKED_SHAREDCACHE;

	7674 }

	7675

	7676 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);

	7677 if( rc ) return rc;

	7678 rc = sqlite3BtreeClearTable(p, iTable, 0);

	7679 if( rc ){

	7680 releasePage(pPage);

	7681 return rc;

	7682 }

	7683

	7684 *piMoved = 0;

	7685

	7686 if( iTable>1 ){

	7687 #ifdef SQLITE_OMIT_AUTOVACUUM

	7688 freePage(pPage, &rc);

	7689 releasePage(pPage);

	7690 #else

	7691 if( pBt->autoVacuum ){

	7692 Pgno maxRootPgno;

	7693 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);

	7694

	7695 if( iTable==maxRootPgno ){

	7696 /* If the table being dropped is the table with the largest root-page

	7697 ** number in the database, put the root page on the free list.

	7698 */

	7699 freePage(pPage, &rc);

	7700 releasePage(pPage);

	7701 if( rc!=SQLITE_OK ){

	7702 return rc;

	7703 }

	7704 }else{

	7705 /* The table being dropped does not have the largest root-page

	7706 ** number in the database. So move the page that does into the

	7707 ** gap left by the deleted root-page.

	7708 */

	7709 MemPage *pMove;

	7710 releasePage(pPage);

	7711 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);

	7712 if( rc!=SQLITE_OK ){

	7713 return rc;

	7714 }

	7715 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);

	7716 releasePage(pMove);

	7717 if( rc!=SQLITE_OK ){

	7718 return rc;

	7719 }

	7720 pMove = 0;

	7721 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);

	7722 freePage(pMove, &rc);

	7723 releasePage(pMove);

	7724 if( rc!=SQLITE_OK ){

	7725 return rc;

	7726 }

	7727 *piMoved = maxRootPgno;

	7728 }

	7729

	7730 /* Set the new 'max-root-page' value in the database header. This

	7731 ** is the old value less one, less one more if that happens to

	7732 ** be a root-page number, less one again if that is the

	7733 ** PENDING_BYTE_PAGE.

	7734 */

	7735 maxRootPgno--;

	7736 while( maxRootPgno==PENDING_BYTE_PAGE(pBt)

	7737 \|\| PTRMAP_ISPAGE(pBt, maxRootPgno) ){

	7738 maxRootPgno--;

	7739 }

	7740 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );

	7741

	7742 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);

	7743 }else{

	7744 freePage(pPage, &rc);

	7745 releasePage(pPage);

	7746 }

	7747 #endif

	7748 }else{

	7749 /* If sqlite3BtreeDropTable was called on page 1.

	7750 ** This really never should happen except in a corrupt

	7751 ** database.

	7752 */

	7753 zeroPage(pPage, PTF_INTKEY\|PTF_LEAF );

	7754 releasePage(pPage);

	7755 }

	7756 return rc;

	7757 }

	7758 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){

	7759 int rc;

	7760 sqlite3BtreeEnter(p);

	7761 rc = btreeDropTable(p, iTable, piMoved);

	7762 sqlite3BtreeLeave(p);

	7763 return rc;

	7764 }

	7765

	7766

	7767 /*

	7768 ** This function may only be called if the b-tree connection already

	7769 ** has a read or write transaction open on the database.

	7770 **

	7771 ** Read the meta-information out of a database file. Meta[0]

	7772 ** is the number of free pages currently in the database. Meta[1]

	7773 ** through meta[15] are available for use by higher layers. Meta[0]

	7774 ** is read-only, the others are read/write.

	7775 **

	7776 ** The schema layer numbers meta values differently. At the schema

	7777 ** layer (and the SetCookie and ReadCookie opcodes) the number of

	7778 ** free pages is not visible. So Cookie[0] is the same as Meta[1].

	7779 */

	7780 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){

	7781 BtShared *pBt = p->pBt;

	7782

	7783 sqlite3BtreeEnter(p);

	7784 assert( p->inTrans>TRANS_NONE );

	7785 assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );

	7786 assert( pBt->pPage1 );

	7787 assert( idx>=0 && idx<=15 );

	7788

	7789 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);

	7790

	7791 /* If auto-vacuum is disabled in this build and this is an auto-vacuum

	7792 ** database, mark the database as read-only. */

	7793 #ifdef SQLITE_OMIT_AUTOVACUUM

	7794 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){

	7795 pBt->btsFlags \|= BTS_READ_ONLY;

	7796 }

	7797 #endif

	7798

	7799 sqlite3BtreeLeave(p);

	7800 }

	7801

	7802 /*

	7803 ** Write meta-information back into the database. Meta[0] is

	7804 ** read-only and may not be written.

	7805 */

	7806 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){

	7807 BtShared *pBt = p->pBt;

	7808 unsigned char *pP1;

	7809 int rc;

	7810 assert( idx>=1 && idx<=15 );

	7811 sqlite3BtreeEnter(p);

	7812 assert( p->inTrans==TRANS_WRITE );

	7813 assert( pBt->pPage1!=0 );

	7814 pP1 = pBt->pPage1->aData;

	7815 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	7816 if( rc==SQLITE_OK ){

	7817 put4byte(&pP1[36 + idx*4], iMeta);

	7818 #ifndef SQLITE_OMIT_AUTOVACUUM

	7819 if( idx==BTREE_INCR_VACUUM ){

	7820 assert( pBt->autoVacuum \|\| iMeta==0 );

	7821 assert( iMeta==0 \|\| iMeta==1 );

	7822 pBt->incrVacuum = (u8)iMeta;

	7823 }

	7824 #endif

	7825 }

	7826 sqlite3BtreeLeave(p);

	7827 return rc;

	7828 }

	7829

	7830 #ifndef SQLITE_OMIT_BTREECOUNT

	7831 /*

	7832 ** The first argument, pCur, is a cursor opened on some b-tree. Count the

	7833 ** number of entries in the b-tree and write the result to *pnEntry.

	7834 **

	7835 ** SQLITE_OK is returned if the operation is successfully executed.

	7836 ** Otherwise, if an error is encountered (i.e. an IO error or database

	7837 ** corruption) an SQLite error code is returned.

	7838 */

	7839 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){

	7840 i64 nEntry = 0; /* Value to return in *pnEntry */

	7841 int rc; /* Return code */

	7842

	7843 if( pCur->pgnoRoot==0 ){

	7844 *pnEntry = 0;

	7845 return SQLITE_OK;

	7846 }

	7847 rc = moveToRoot(pCur);

	7848

	7849 /* Unless an error occurs, the following loop runs one iteration for each

	7850 ** page in the B-Tree structure (not including overflow pages).

	7851 */

	7852 while( rc==SQLITE_OK ){

	7853 int iIdx; /* Index of child node in parent */

	7854 MemPage *pPage; /* Current page of the b-tree */

	7855

	7856 /* If this is a leaf page or the tree is not an int-key tree, then

	7857 ** this page contains countable entries. Increment the entry counter

	7858 ** accordingly.

	7859 */

	7860 pPage = pCur->apPage[pCur->iPage];

	7861 if( pPage->leaf \|\| !pPage->intKey ){

	7862 nEntry += pPage->nCell;

	7863 }

	7864

	7865 /* pPage is a leaf node. This loop navigates the cursor so that it

	7866 ** points to the first interior cell that it points to the parent of

	7867 ** the next page in the tree that has not yet been visited. The

	7868 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell

	7869 ** of the page, or to the number of cells in the page if the next page

	7870 ** to visit is the right-child of its parent.

	7871 **

	7872 ** If all pages in the tree have been visited, return SQLITE_OK to the

	7873 ** caller.

	7874 */

	7875 if( pPage->leaf ){

	7876 do {

	7877 if( pCur->iPage==0 ){

	7878 /* All pages of the b-tree have been visited. Return successfully. */

	7879 *pnEntry = nEntry;

	7880 return SQLITE_OK;

	7881 }

	7882 moveToParent(pCur);

	7883 }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );

	7884

	7885 pCur->aiIdx[pCur->iPage]++;

	7886 pPage = pCur->apPage[pCur->iPage];

	7887 }

	7888

	7889 /* Descend to the child node of the cell that the cursor currently

	7890 ** points at. This is the right-child if (iIdx==pPage->nCell).

	7891 */

	7892 iIdx = pCur->aiIdx[pCur->iPage];

	7893 if( iIdx==pPage->nCell ){

	7894 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));

	7895 }else{

	7896 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));

	7897 }

	7898 }

	7899

	7900 /* An error has occurred. Return an error code. */

	7901 return rc;

	7902 }

	7903 #endif

	7904

	7905 /*

	7906 ** Return the pager associated with a BTree. This routine is used for

	7907 ** testing and debugging only.

	7908 */

	7909 Pager *sqlite3BtreePager(Btree *p){

	7910 return p->pBt->pPager;

	7911 }

	7912

	7913 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

	7914 /*

	7915 ** Append a message to the error message string.

	7916 */

	7917 static void checkAppendMsg(

	7918 IntegrityCk *pCheck,

	7919 const char *zFormat,

	7920 ...

	7921 ){

	7922 va_list ap;

	7923 char zBuf[200];

	7924 if( !pCheck->mxErr ) return;

	7925 pCheck->mxErr--;

	7926 pCheck->nErr++;

	7927 va_start(ap, zFormat);

	7928 if( pCheck->errMsg.nChar ){

	7929 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);

	7930 }

	7931 if( pCheck->zPfx ){

	7932 sqlite3_snprintf(sizeof(zBuf), zBuf, pCheck->zPfx, pCheck->v1, pCheck->v2);

	7933 sqlite3StrAccumAppendAll(&pCheck->errMsg, zBuf);

	7934 }

	7935 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);

	7936 va_end(ap);

	7937 if( pCheck->errMsg.accError==STRACCUM_NOMEM ){

	7938 pCheck->mallocFailed = 1;

	7939 }

	7940 }

	7941 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

	7942

	7943 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

	7944

	7945 /*

	7946 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that

	7947 ** corresponds to page iPg is already set.

	7948 */

	7949 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){

	7950 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );

	7951 return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));

	7952 }

	7953

	7954 /*

	7955 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.

	7956 */

	7957 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){

	7958 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );

	7959 pCheck->aPgRef[iPg/8] \|= (1 << (iPg & 0x07));

	7960 }

	7961

	7962

	7963 /*

	7964 ** Add 1 to the reference count for page iPage. If this is the second

	7965 ** reference to the page, add an error message to pCheck->zErrMsg.

	7966 ** Return 1 if there are 2 or more references to the page and 0 if

	7967 ** this is the first reference to the page.

	7968 **

	7969 ** Also check that the page number is in bounds.

	7970 */

	7971 static int checkRef(IntegrityCk *pCheck, Pgno iPage){

	7972 if( iPage==0 ) return 1;

	7973 if( iPage>pCheck->nPage ){

	7974 checkAppendMsg(pCheck, "invalid page number %d", iPage);

	7975 return 1;

	7976 }

	7977 if( getPageReferenced(pCheck, iPage) ){

	7978 checkAppendMsg(pCheck, "2nd reference to page %d", iPage);

	7979 return 1;

	7980 }

	7981 setPageReferenced(pCheck, iPage);

	7982 return 0;

	7983 }

	7984

	7985 #ifndef SQLITE_OMIT_AUTOVACUUM

	7986 /*

	7987 ** Check that the entry in the pointer-map for page iChild maps to

	7988 ** page iParent, pointer type ptrType. If not, append an error message

	7989 ** to pCheck.

	7990 */

	7991 static void checkPtrmap(

	7992 IntegrityCk *pCheck, /* Integrity check context */

	7993 Pgno iChild, /* Child page number */

	7994 u8 eType, /* Expected pointer map type */

	7995 Pgno iParent /* Expected pointer map parent page number */

	7996 ){

	7997 int rc;

	7998 u8 ePtrmapType;

	7999 Pgno iPtrmapParent;

	8000

	8001 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);

	8002 if( rc!=SQLITE_OK ){

	8003 if( rc==SQLITE_NOMEM \|\| rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;

	8004 checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);

	8005 return;

	8006 }

	8007

	8008 if( ePtrmapType!=eType \|\| iPtrmapParent!=iParent ){

	8009 checkAppendMsg(pCheck,

	8010 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",

	8011 iChild, eType, iParent, ePtrmapType, iPtrmapParent);

	8012 }

	8013 }

	8014 #endif

	8015

	8016 /*

	8017 ** Check the integrity of the freelist or of an overflow page list.

	8018 ** Verify that the number of pages on the list is N.

	8019 */

	8020 static void checkList(

	8021 IntegrityCk *pCheck, /* Integrity checking context */

	8022 int isFreeList, /* True for a freelist. False for overflow page list */

	8023 int iPage, /* Page number for first page in the list */

	8024 int N /* Expected number of pages in the list */

	8025 ){

	8026 int i;

	8027 int expected = N;

	8028 int iFirst = iPage;

	8029 while( N-- > 0 && pCheck->mxErr ){

	8030 DbPage *pOvflPage;

	8031 unsigned char *pOvflData;

	8032 if( iPage<1 ){

	8033 checkAppendMsg(pCheck,

	8034 "%d of %d pages missing from overflow list starting at %d",

	8035 N+1, expected, iFirst);

	8036 break;

	8037 }

	8038 if( checkRef(pCheck, iPage) ) break;

	8039 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){

	8040 checkAppendMsg(pCheck, "failed to get page %d", iPage);

	8041 break;

	8042 }

	8043 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);

	8044 if( isFreeList ){

	8045 int n = get4byte(&pOvflData[4]);

	8046 #ifndef SQLITE_OMIT_AUTOVACUUM

	8047 if( pCheck->pBt->autoVacuum ){

	8048 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);

	8049 }

	8050 #endif

	8051 if( n>(int)pCheck->pBt->usableSize/4-2 ){

	8052 checkAppendMsg(pCheck,

	8053 "freelist leaf count too big on page %d", iPage);

	8054 N--;

	8055 }else{

	8056 for(i=0; i<n; i++){

	8057 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);

	8058 #ifndef SQLITE_OMIT_AUTOVACUUM

	8059 if( pCheck->pBt->autoVacuum ){

	8060 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);

	8061 }

	8062 #endif

	8063 checkRef(pCheck, iFreePage);

	8064 }

	8065 N -= n;

	8066 }

	8067 }

	8068 #ifndef SQLITE_OMIT_AUTOVACUUM

	8069 else{

	8070 /* If this database supports auto-vacuum and iPage is not the last

	8071 ** page in this overflow list, check that the pointer-map entry for

	8072 ** the following page matches iPage.

	8073 */

	8074 if( pCheck->pBt->autoVacuum && N>0 ){

	8075 i = get4byte(pOvflData);

	8076 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);

	8077 }

	8078 }

	8079 #endif

	8080 iPage = get4byte(pOvflData);

	8081 sqlite3PagerUnref(pOvflPage);

	8082 }

	8083 }

	8084 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

	8085

	8086 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

	8087 /*

	8088 ** Do various sanity checks on a single page of a tree. Return

	8089 ** the tree depth. Root pages return 0. Parents of root pages

	8090 ** return 1, and so forth.

	8091 **

	8092 ** These checks are done:

	8093 **

	8094 ** 1. Make sure that cells and freeblocks do not overlap

	8095 ** but combine to completely cover the page.

	8096 ** NO 2. Make sure cell keys are in order.

	8097 ** NO 3. Make sure no key is less than or equal to zLowerBound.

	8098 ** NO 4. Make sure no key is greater than or equal to zUpperBound.

	8099 ** 5. Check the integrity of overflow pages.

	8100 ** 6. Recursively call checkTreePage on all children.

	8101 ** 7. Verify that the depth of all children is the same.

	8102 ** 8. Make sure this page is at least 33% full or else it is

	8103 ** the root of the tree.

	8104 */

	8105 static int checkTreePage(

	8106 IntegrityCk *pCheck, /* Context for the sanity check */

	8107 int iPage, /* Page number of the page to check */

	8108 i64 *pnParentMinKey,

	8109 i64 *pnParentMaxKey

	8110 ){

	8111 MemPage *pPage;

	8112 int i, rc, depth, d2, pgno, cnt;

	8113 int hdr, cellStart;

	8114 int nCell;

	8115 u8 *data;

	8116 BtShared *pBt;

	8117 int usableSize;

	8118 char *hit = 0;

	8119 i64 nMinKey = 0;

	8120 i64 nMaxKey = 0;

	8121 const char *saved_zPfx = pCheck->zPfx;

	8122 int saved_v1 = pCheck->v1;

	8123 int saved_v2 = pCheck->v2;

	8124

	8125 /* Check that the page exists

	8126 */

	8127 pBt = pCheck->pBt;

	8128 usableSize = pBt->usableSize;

	8129 if( iPage==0 ) return 0;

	8130 if( checkRef(pCheck, iPage) ) return 0;

	8131 pCheck->zPfx = "Page %d: ";

	8132 pCheck->v1 = iPage;

	8133 if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){

	8134 checkAppendMsg(pCheck,

	8135 "unable to get the page. error code=%d", rc);

	8136 depth = -1;

	8137 goto end_of_check;

	8138 }

	8139

	8140 /* Clear MemPage.isInit to make sure the corruption detection code in

	8141 ** btreeInitPage() is executed. */

	8142 pPage->isInit = 0;

	8143 if( (rc = btreeInitPage(pPage))!=0 ){

	8144 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */

	8145 checkAppendMsg(pCheck,

	8146 "btreeInitPage() returns error code %d", rc);

	8147 releasePage(pPage);

	8148 depth = -1;

	8149 goto end_of_check;

	8150 }

	8151

	8152 /* Check out all the cells.

	8153 */

	8154 depth = 0;

	8155 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){

	8156 u8 *pCell;

	8157 u32 sz;

	8158 CellInfo info;

	8159

	8160 /* Check payload overflow pages

	8161 */

	8162 pCheck->zPfx = "On tree page %d cell %d: ";

	8163 pCheck->v1 = iPage;

	8164 pCheck->v2 = i;

	8165 pCell = findCell(pPage,i);

	8166 btreeParseCellPtr(pPage, pCell, &info);

	8167 sz = info.nPayload;

	8168 /* For intKey pages, check that the keys are in order.

	8169 */

	8170 if( pPage->intKey ){

	8171 if( i==0 ){

	8172 nMinKey = nMaxKey = info.nKey;

	8173 }else if( info.nKey <= nMaxKey ){

	8174 checkAppendMsg(pCheck,

	8175 "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey);

	8176 }

	8177 nMaxKey = info.nKey;

	8178 }

	8179 if( (sz>info.nLocal)

	8180 && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])

	8181 ){

	8182 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);

	8183 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);

	8184 #ifndef SQLITE_OMIT_AUTOVACUUM

	8185 if( pBt->autoVacuum ){

	8186 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);

	8187 }

	8188 #endif

	8189 checkList(pCheck, 0, pgnoOvfl, nPage);

	8190 }

	8191

	8192 /* Check sanity of left child page.

	8193 */

	8194 if( !pPage->leaf ){

	8195 pgno = get4byte(pCell);

	8196 #ifndef SQLITE_OMIT_AUTOVACUUM

	8197 if( pBt->autoVacuum ){

	8198 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);

	8199 }

	8200 #endif

	8201 d2 = checkTreePage(pCheck, pgno, &nMinKey, i==0?NULL:&nMaxKey);

	8202 if( i>0 && d2!=depth ){

	8203 checkAppendMsg(pCheck, "Child page depth differs");

	8204 }

	8205 depth = d2;

	8206 }

	8207 }

	8208

	8209 if( !pPage->leaf ){

	8210 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	8211 pCheck->zPfx = "On page %d at right child: ";

	8212 pCheck->v1 = iPage;

	8213 #ifndef SQLITE_OMIT_AUTOVACUUM

	8214 if( pBt->autoVacuum ){

	8215 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);

	8216 }

	8217 #endif

	8218 checkTreePage(pCheck, pgno, NULL, !pPage->nCell?NULL:&nMaxKey);

	8219 }

	8220

	8221 /* For intKey leaf pages, check that the min/max keys are in order

	8222 ** with any left/parent/right pages.

	8223 */

	8224 pCheck->zPfx = "Page %d: ";

	8225 pCheck->v1 = iPage;

	8226 if( pPage->leaf && pPage->intKey ){

	8227 /* if we are a left child page */

	8228 if( pnParentMinKey ){

	8229 /* if we are the left most child page */

	8230 if( !pnParentMaxKey ){

	8231 if( nMaxKey > *pnParentMinKey ){

	8232 checkAppendMsg(pCheck,

	8233 "Rowid %lld out of order (max larger than parent min of %lld)",

	8234 nMaxKey, *pnParentMinKey);

	8235 }

	8236 }else{

	8237 if( nMinKey <= *pnParentMinKey ){

	8238 checkAppendMsg(pCheck,

	8239 "Rowid %lld out of order (min less than parent min of %lld)",

	8240 nMinKey, *pnParentMinKey);

	8241 }

	8242 if( nMaxKey > *pnParentMaxKey ){

	8243 checkAppendMsg(pCheck,

	8244 "Rowid %lld out of order (max larger than parent max of %lld)",

	8245 nMaxKey, *pnParentMaxKey);

	8246 }

	8247 *pnParentMinKey = nMaxKey;

	8248 }

	8249 /* else if we're a right child page */

	8250 } else if( pnParentMaxKey ){

	8251 if( nMinKey <= *pnParentMaxKey ){

	8252 checkAppendMsg(pCheck,

	8253 "Rowid %lld out of order (min less than parent max of %lld)",

	8254 nMinKey, *pnParentMaxKey);

	8255 }

	8256 }

	8257 }

	8258

	8259 /* Check for complete coverage of the page

	8260 */

	8261 data = pPage->aData;

	8262 hdr = pPage->hdrOffset;

	8263 hit = sqlite3PageMalloc( pBt->pageSize );

	8264 pCheck->zPfx = 0;

	8265 if( hit==0 ){

	8266 pCheck->mallocFailed = 1;

	8267 }else{

	8268 int contentOffset = get2byteNotZero(&data[hdr+5]);

	8269 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */

	8270 memset(hit+contentOffset, 0, usableSize-contentOffset);

	8271 memset(hit, 1, contentOffset);

	8272 nCell = get2byte(&data[hdr+3]);

	8273 cellStart = hdr + 12 - 4*pPage->leaf;

	8274 for(i=0; i<nCell; i++){

	8275 int pc = get2byte(&data[cellStart+i*2]);

	8276 u32 size = 65536;

	8277 int j;

	8278 if( pc<=usableSize-4 ){

	8279 size = cellSizePtr(pPage, &data[pc]);

	8280 }

	8281 if( (int)(pc+size-1)>=usableSize ){

	8282 pCheck->zPfx = 0;

	8283 checkAppendMsg(pCheck,

	8284 "Corruption detected in cell %d on page %d",i,iPage);

	8285 }else{

	8286 for(j=pc+size-1; j>=pc; j--) hit[j]++;

	8287 }

	8288 }

	8289 i = get2byte(&data[hdr+1]);

	8290 while( i>0 ){

	8291 int size, j;

	8292 assert( i<=usableSize-4 ); /* Enforced by btreeInitPage() */

	8293 size = get2byte(&data[i+2]);

	8294 assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */

	8295 for(j=i+size-1; j>=i; j--) hit[j]++;

	8296 j = get2byte(&data[i]);

	8297 assert( j==0 \|\| j>i+size ); /* Enforced by btreeInitPage() */

	8298 assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */

	8299 i = j;

	8300 }

	8301 for(i=cnt=0; i<usableSize; i++){

	8302 if( hit[i]==0 ){

	8303 cnt++;

	8304 }else if( hit[i]>1 ){

	8305 checkAppendMsg(pCheck,

	8306 "Multiple uses for byte %d of page %d", i, iPage);

	8307 break;

	8308 }

	8309 }

	8310 if( cnt!=data[hdr+7] ){

	8311 checkAppendMsg(pCheck,

	8312 "Fragmentation of %d bytes reported as %d on page %d",

	8313 cnt, data[hdr+7], iPage);

	8314 }

	8315 }

	8316 sqlite3PageFree(hit);

	8317 releasePage(pPage);

	8318

	8319 end_of_check:

	8320 pCheck->zPfx = saved_zPfx;

	8321 pCheck->v1 = saved_v1;

	8322 pCheck->v2 = saved_v2;

	8323 return depth+1;

	8324 }

	8325 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

	8326

	8327 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

	8328 /*

	8329 ** This routine does a complete check of the given BTree file. aRoot[] is

	8330 ** an array of page numbers where each page number is the root page of

	8331 ** a table. nRoot is the number of entries in aRoot.

	8332 **

	8333 ** A read-only or read-write transaction must be opened before calling

	8334 ** this function.

	8335 **

	8336 ** Write the number of errors seen in *pnErr. Except for some memory

	8337 ** allocation errors, an error message held in memory obtained from

	8338 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is

	8339 ** returned. If a memory allocation error occurs, NULL is returned.

	8340 */

	8341 char *sqlite3BtreeIntegrityCheck(

	8342 Btree *p, /* The btree to be checked */

	8343 int *aRoot, /* An array of root pages numbers for individual trees */

	8344 int nRoot, /* Number of entries in aRoot[] */

	8345 int mxErr, /* Stop reporting errors after this many */

	8346 int *pnErr /* Write number of errors seen to this variable */

	8347 ){

	8348 Pgno i;

	8349 int nRef;

	8350 IntegrityCk sCheck;

	8351 BtShared *pBt = p->pBt;

	8352 char zErr[100];

	8353

	8354 sqlite3BtreeEnter(p);

	8355 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );

	8356 nRef = sqlite3PagerRefcount(pBt->pPager);

	8357 sCheck.pBt = pBt;

	8358 sCheck.pPager = pBt->pPager;

	8359 sCheck.nPage = btreePagecount(sCheck.pBt);

	8360 sCheck.mxErr = mxErr;

	8361 sCheck.nErr = 0;

	8362 sCheck.mallocFailed = 0;

	8363 sCheck.zPfx = 0;

	8364 sCheck.v1 = 0;

	8365 sCheck.v2 = 0;

	8366 *pnErr = 0;

	8367 if( sCheck.nPage==0 ){

	8368 sqlite3BtreeLeave(p);

	8369 return 0;

	8370 }

	8371

	8372 sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);

	8373 if( !sCheck.aPgRef ){

	8374 *pnErr = 1;

	8375 sqlite3BtreeLeave(p);

	8376 return 0;

	8377 }

	8378 i = PENDING_BYTE_PAGE(pBt);

	8379 if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);

	8380 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);

	8381 sCheck.errMsg.useMalloc = 2;

	8382

	8383 /* Check the integrity of the freelist

	8384 */

	8385 sCheck.zPfx = "Main freelist: ";

	8386 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),

	8387 get4byte(&pBt->pPage1->aData[36]));

	8388 sCheck.zPfx = 0;

	8389

	8390 /* Check all the tables.

	8391 */

	8392 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){

	8393 if( aRoot[i]==0 ) continue;

	8394 #ifndef SQLITE_OMIT_AUTOVACUUM

	8395 if( pBt->autoVacuum && aRoot[i]>1 ){

	8396 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);

	8397 }

	8398 #endif

	8399 sCheck.zPfx = "List of tree roots: ";

	8400 checkTreePage(&sCheck, aRoot[i], NULL, NULL);

	8401 sCheck.zPfx = 0;

	8402 }

	8403

	8404 /* Make sure every page in the file is referenced

	8405 */

	8406 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){

	8407 #ifdef SQLITE_OMIT_AUTOVACUUM

	8408 if( getPageReferenced(&sCheck, i)==0 ){

	8409 checkAppendMsg(&sCheck, "Page %d is never used", i);

	8410 }

	8411 #else

	8412 /* If the database supports auto-vacuum, make sure no tables contain

	8413 ** references to pointer-map pages.

	8414 */

	8415 if( getPageReferenced(&sCheck, i)==0 &&

	8416 (PTRMAP_PAGENO(pBt, i)!=i \|\| !pBt->autoVacuum) ){

	8417 checkAppendMsg(&sCheck, "Page %d is never used", i);

	8418 }

	8419 if( getPageReferenced(&sCheck, i)!=0 &&

	8420 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){

	8421 checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);

	8422 }

	8423 #endif

	8424 }

	8425

	8426 /* Make sure this analysis did not leave any unref() pages.

	8427 ** This is an internal consistency check; an integrity check

	8428 ** of the integrity check.

	8429 */

	8430 if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){

	8431 checkAppendMsg(&sCheck,

	8432 "Outstanding page count goes from %d to %d during this analysis",

	8433 nRef, sqlite3PagerRefcount(pBt->pPager)

	8434 );

	8435 }

	8436

	8437 /* Clean up and report errors.

	8438 */

	8439 sqlite3BtreeLeave(p);

	8440 sqlite3_free(sCheck.aPgRef);

	8441 if( sCheck.mallocFailed ){

	8442 sqlite3StrAccumReset(&sCheck.errMsg);

	8443 *pnErr = sCheck.nErr+1;

	8444 return 0;

	8445 }

	8446 *pnErr = sCheck.nErr;

	8447 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);

	8448 return sqlite3StrAccumFinish(&sCheck.errMsg);

	8449 }

	8450 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

	8451

	8452 /*

	8453 ** Return the full pathname of the underlying database file. Return

	8454 ** an empty string if the database is in-memory or a TEMP database.

	8455 **

	8456 ** The pager filename is invariant as long as the pager is

	8457 ** open so it is safe to access without the BtShared mutex.

	8458 */

	8459 const char sqlite3BtreeGetFilename(Btree p){

	8460 assert( p->pBt->pPager!=0 );

	8461 return sqlite3PagerFilename(p->pBt->pPager, 1);

	8462 }

	8463

	8464 /*

	8465 ** Return the pathname of the journal file for this database. The return

	8466 ** value of this routine is the same regardless of whether the journal file

	8467 ** has been created or not.

	8468 **

	8469 ** The pager journal filename is invariant as long as the pager is

	8470 ** open so it is safe to access without the BtShared mutex.

	8471 */

	8472 const char sqlite3BtreeGetJournalname(Btree p){

	8473 assert( p->pBt->pPager!=0 );

	8474 return sqlite3PagerJournalname(p->pBt->pPager);

	8475 }

	8476

	8477 /*

	8478 ** Return non-zero if a transaction is active.

	8479 */

	8480 int sqlite3BtreeIsInTrans(Btree *p){

	8481 assert( p==0 \|\| sqlite3_mutex_held(p->db->mutex) );

	8482 return (p && (p->inTrans==TRANS_WRITE));

	8483 }

	8484

	8485 #ifndef SQLITE_OMIT_WAL

	8486 /*

	8487 ** Run a checkpoint on the Btree passed as the first argument.

	8488 **

	8489 ** Return SQLITE_LOCKED if this or any other connection has an open

	8490 ** transaction on the shared-cache the argument Btree is connected to.

	8491 **

	8492 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.

	8493 */

	8494 int sqlite3BtreeCheckpoint(Btree p, int eMode, int pnLog, int *pnCkpt){

	8495 int rc = SQLITE_OK;

	8496 if( p ){

	8497 BtShared *pBt = p->pBt;

	8498 sqlite3BtreeEnter(p);

	8499 if( pBt->inTransaction!=TRANS_NONE ){

	8500 rc = SQLITE_LOCKED;

	8501 }else{

	8502 rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt);

	8503 }

	8504 sqlite3BtreeLeave(p);

	8505 }

	8506 return rc;

	8507 }

	8508 #endif

	8509

	8510 /*

	8511 ** Return non-zero if a read (or write) transaction is active.

	8512 */

	8513 int sqlite3BtreeIsInReadTrans(Btree *p){

	8514 assert( p );

	8515 assert( sqlite3_mutex_held(p->db->mutex) );

	8516 return p->inTrans!=TRANS_NONE;

	8517 }

	8518

	8519 int sqlite3BtreeIsInBackup(Btree *p){

	8520 assert( p );

	8521 assert( sqlite3_mutex_held(p->db->mutex) );

	8522 return p->nBackup!=0;

	8523 }

	8524

	8525 /*

	8526 ** This function returns a pointer to a blob of memory associated with

	8527 ** a single shared-btree. The memory is used by client code for its own

	8528 ** purposes (for example, to store a high-level schema associated with

	8529 ** the shared-btree). The btree layer manages reference counting issues.

	8530 **

	8531 ** The first time this is called on a shared-btree, nBytes bytes of memory

	8532 ** are allocated, zeroed, and returned to the caller. For each subsequent

	8533 ** call the nBytes parameter is ignored and a pointer to the same blob

	8534 ** of memory returned.

	8535 **

	8536 ** If the nBytes parameter is 0 and the blob of memory has not yet been

	8537 ** allocated, a null pointer is returned. If the blob has already been

	8538 ** allocated, it is returned as normal.

	8539 **

	8540 ** Just before the shared-btree is closed, the function passed as the

	8541 ** xFree argument when the memory allocation was made is invoked on the

	8542 ** blob of allocated memory. The xFree function should not call sqlite3_free()

	8543 ** on the memory, the btree layer does that.

	8544 */

	8545 void sqlite3BtreeSchema(Btree p, int nBytes, void(xFree)(void )){

	8546 BtShared *pBt = p->pBt;

	8547 sqlite3BtreeEnter(p);

	8548 if( !pBt->pSchema && nBytes ){

	8549 pBt->pSchema = sqlite3DbMallocZero(0, nBytes);

	8550 pBt->xFreeSchema = xFree;

	8551 }

	8552 sqlite3BtreeLeave(p);

	8553 return pBt->pSchema;

	8554 }

	8555

	8556 /*

	8557 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared

	8558 ** btree as the argument handle holds an exclusive lock on the

	8559 ** sqlite_master table. Otherwise SQLITE_OK.

	8560 */

	8561 int sqlite3BtreeSchemaLocked(Btree *p){

	8562 int rc;

	8563 assert( sqlite3_mutex_held(p->db->mutex) );

	8564 sqlite3BtreeEnter(p);

	8565 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);

	8566 assert( rc==SQLITE_OK \|\| rc==SQLITE_LOCKED_SHAREDCACHE );

	8567 sqlite3BtreeLeave(p);

	8568 return rc;

	8569 }

	8570

	8571

	8572 #ifndef SQLITE_OMIT_SHARED_CACHE

	8573 /*

	8574 ** Obtain a lock on the table whose root page is iTab. The

	8575 ** lock is a write lock if isWritelock is true or a read lock

	8576 ** if it is false.

	8577 */

	8578 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){

	8579 int rc = SQLITE_OK;

	8580 assert( p->inTrans!=TRANS_NONE );

	8581 if( p->sharable ){

	8582 u8 lockType = READ_LOCK + isWriteLock;

	8583 assert( READ_LOCK+1==WRITE_LOCK );

	8584 assert( isWriteLock==0 \|\| isWriteLock==1 );

	8585

	8586 sqlite3BtreeEnter(p);

	8587 rc = querySharedCacheTableLock(p, iTab, lockType);

	8588 if( rc==SQLITE_OK ){

	8589 rc = setSharedCacheTableLock(p, iTab, lockType);

	8590 }

	8591 sqlite3BtreeLeave(p);

	8592 }

	8593 return rc;

	8594 }

	8595 #endif

	8596

	8597 #ifndef SQLITE_OMIT_INCRBLOB

	8598 /*

	8599 ** Argument pCsr must be a cursor opened for writing on an

	8600 ** INTKEY table currently pointing at a valid table entry.

	8601 ** This function modifies the data stored as part of that entry.

	8602 **

	8603 ** Only the data content may only be modified, it is not possible to

	8604 ** change the length of the data stored. If this function is called with

	8605 ** parameters that attempt to write past the end of the existing data,

	8606 ** no modifications are made and SQLITE_CORRUPT is returned.

	8607 */

	8608 int sqlite3BtreePutData(BtCursor pCsr, u32 offset, u32 amt, void z){

	8609 int rc;

	8610 assert( cursorHoldsMutex(pCsr) );

	8611 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );

	8612 assert( pCsr->curFlags & BTCF_Incrblob );

	8613

	8614 rc = restoreCursorPosition(pCsr);

	8615 if( rc!=SQLITE_OK ){

	8616 return rc;

	8617 }

	8618 assert( pCsr->eState!=CURSOR_REQUIRESEEK );

	8619 if( pCsr->eState!=CURSOR_VALID ){

	8620 return SQLITE_ABORT;

	8621 }

	8622

	8623 /* Save the positions of all other cursors open on this table. This is

	8624 ** required in case any of them are holding references to an xFetch

	8625 ** version of the b-tree page modified by the accessPayload call below.

	8626 **

	8627 ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()

	8628 ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence

	8629 ** saveAllCursors can only return SQLITE_OK.

	8630 */

	8631 VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);

	8632 assert( rc==SQLITE_OK );

	8633

	8634 /* Check some assumptions:

	8635 ** (a) the cursor is open for writing,

	8636 ** (b) there is a read/write transaction open,

	8637 ** (c) the connection holds a write-lock on the table (if required),

	8638 ** (d) there are no conflicting read-locks, and

	8639 ** (e) the cursor points at a valid row of an intKey table.

	8640 */

	8641 if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){

	8642 return SQLITE_READONLY;

	8643 }

	8644 assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0

	8645 && pCsr->pBt->inTransaction==TRANS_WRITE );

	8646 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );

	8647 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );

	8648 assert( pCsr->apPage[pCsr->iPage]->intKey );

	8649

	8650 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);

	8651 }

	8652

	8653 /*

	8654 ** Mark this cursor as an incremental blob cursor.

	8655 */

	8656 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){

	8657 pCur->curFlags \|= BTCF_Incrblob;

	8658 }

	8659 #endif

	8660

	8661 /*

	8662 ** Set both the "read version" (single byte at byte offset 18) and

	8663 ** "write version" (single byte at byte offset 19) fields in the database

	8664 ** header to iVersion.

	8665 */

	8666 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){

	8667 BtShared *pBt = pBtree->pBt;

	8668 int rc; /* Return code */

	8669

	8670 assert( iVersion==1 \|\| iVersion==2 );

	8671

	8672 /* If setting the version fields to 1, do not automatically open the

	8673 ** WAL connection, even if the version fields are currently set to 2.

	8674 */

	8675 pBt->btsFlags &= ~BTS_NO_WAL;

	8676 if( iVersion==1 ) pBt->btsFlags \|= BTS_NO_WAL;

	8677

	8678 rc = sqlite3BtreeBeginTrans(pBtree, 0);

	8679 if( rc==SQLITE_OK ){

	8680 u8 *aData = pBt->pPage1->aData;

	8681 if( aData[18]!=(u8)iVersion \|\| aData[19]!=(u8)iVersion ){

	8682 rc = sqlite3BtreeBeginTrans(pBtree, 2);

	8683 if( rc==SQLITE_OK ){

	8684 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	8685 if( rc==SQLITE_OK ){

	8686 aData[18] = (u8)iVersion;

	8687 aData[19] = (u8)iVersion;

	8688 }

	8689 }

	8690 }

	8691 }

	8692

	8693 pBt->btsFlags &= ~BTS_NO_WAL;

	8694 return rc;

	8695 }

	8696

	8697 /*

	8698 ** set the mask of hint flags for cursor pCsr. Currently the only valid

	8699 ** values are 0 and BTREE_BULKLOAD.

	8700 */

	8701 void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){

	8702 assert( mask==BTREE_BULKLOAD \|\| mask==0 );

	8703 pCsr->hints = mask;

	8704 }

	8705

	8706 /*

	8707 ** Return true if the given Btree is read-only.

	8708 */

	8709 int sqlite3BtreeIsReadonly(Btree *p){

	8710 return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;

	8711 }

OLD	NEW

« no previous file with comments | « third_party/sqlite/sqlite-src-3080704/src/btree.h ('k') | third_party/sqlite/sqlite-src-3080704/src/btreeInt.h » ('j') | no next file with comments »