third_party/sqlite/amalgamation/sqlite3.02.c - Issue 1636873003: Try for backport

Side by Side Diff: third_party/sqlite/amalgamation/sqlite3.02.c

Issue 1636873003: Try for backport (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@zzsql_import3_10_2_websql_backport

Patch Set: Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /************ Begin file pager.c *****************************************/

	2 /*

	3 ** 2001 September 15

	4 **

	5 ** The author disclaims copyright to this source code. In place of

	6 ** a legal notice, here is a blessing:

	7 **

	8 ** May you do good and not evil.

	9 ** May you find forgiveness for yourself and forgive others.

	10 ** May you share freely, never taking more than you give.

	11 **

	12 *************************************************************************

	13 ** This is the implementation of the page cache subsystem or "pager".

	14 **

	15 ** The pager is used to access a database disk file. It implements

	16 ** atomic commit and rollback through the use of a journal file that

	17 ** is separate from the database file. The pager also implements file

	18 ** locking to prevent two processes from writing the same database

	19 ** file simultaneously, or one process from reading the database while

	20 ** another is writing.

	21 */

	22 #ifndef SQLITE_OMIT_DISKIO

	23 /* #include "sqliteInt.h" */

	24 /************ Include wal.h in the middle of pager.c *********************/

	25 /************ Begin file wal.h *******************************************/

	26 /*

	27 ** 2010 February 1

	28 **

	29 ** The author disclaims copyright to this source code. In place of

	30 ** a legal notice, here is a blessing:

	31 **

	32 ** May you do good and not evil.

	33 ** May you find forgiveness for yourself and forgive others.

	34 ** May you share freely, never taking more than you give.

	35 **

	36 *************************************************************************

	37 ** This header file defines the interface to the write-ahead logging

	38 ** system. Refer to the comments below and the header comment attached to

	39 ** the implementation of each function in log.c for further details.

	40 */

	41

	42 #ifndef _WAL_H_

	43 #define _WAL_H_

	44

	45 /* #include "sqliteInt.h" */

	46

	47 /* Additional values that can be added to the sync_flags argument of

	48 ** sqlite3WalFrames():

	49 */

	50 #define WAL_SYNC_TRANSACTIONS 0x20 /* Sync at the end of each transaction */

	51 #define SQLITE_SYNC_MASK 0x13 /* Mask off the SQLITE_SYNC_* values */

	52

	53 #ifdef SQLITE_OMIT_WAL

	54 # define sqlite3WalOpen(x,y,z) 0

	55 # define sqlite3WalLimit(x,y)

	56 # define sqlite3WalClose(w,x,y,z) 0

	57 # define sqlite3WalBeginReadTransaction(y,z) 0

	58 # define sqlite3WalEndReadTransaction(z)

	59 # define sqlite3WalDbsize(y) 0

	60 # define sqlite3WalBeginWriteTransaction(y) 0

	61 # define sqlite3WalEndWriteTransaction(x) 0

	62 # define sqlite3WalUndo(x,y,z) 0

	63 # define sqlite3WalSavepoint(y,z)

	64 # define sqlite3WalSavepointUndo(y,z) 0

	65 # define sqlite3WalFrames(u,v,w,x,y,z) 0

	66 # define sqlite3WalCheckpoint(r,s,t,u,v,w,x,y,z) 0

	67 # define sqlite3WalCallback(z) 0

	68 # define sqlite3WalExclusiveMode(y,z) 0

	69 # define sqlite3WalHeapMemory(z) 0

	70 # define sqlite3WalFramesize(z) 0

	71 # define sqlite3WalFindFrame(x,y,z) 0

	72 # define sqlite3WalFile(x) 0

	73 #else

	74

	75 #define WAL_SAVEPOINT_NDATA 4

	76

	77 /* Connection to a write-ahead log (WAL) file.

	78 ** There is one object of this type for each pager.

	79 */

	80 typedef struct Wal Wal;

	81

	82 /* Open and close a connection to a write-ahead log. */

	83 SQLITE_PRIVATE int sqlite3WalOpen(sqlite3_vfs*, sqlite3_file*, const char *, int, i64, Wal**);

	84 SQLITE_PRIVATE int sqlite3WalClose(Wal *pWal, int sync_flags, int, u8 *);

	85

	86 /* Set the limiting size of a WAL file. */

	87 SQLITE_PRIVATE void sqlite3WalLimit(Wal*, i64);

	88

	89 /* Used by readers to open (lock) and close (unlock) a snapshot. A

	90 ** snapshot is like a read-transaction. It is the state of the database

	91 ** at an instant in time. sqlite3WalOpenSnapshot gets a read lock and

	92 ** preserves the current state even if the other threads or processes

	93 ** write to or checkpoint the WAL. sqlite3WalCloseSnapshot() closes the

	94 ** transaction and releases the lock.

	95 */

	96 SQLITE_PRIVATE int sqlite3WalBeginReadTransaction(Wal *pWal, int *);

	97 SQLITE_PRIVATE void sqlite3WalEndReadTransaction(Wal *pWal);

	98

	99 /* Read a page from the write-ahead log, if it is present. */

	100 SQLITE_PRIVATE int sqlite3WalFindFrame(Wal *, Pgno, u32 *);

	101 SQLITE_PRIVATE int sqlite3WalReadFrame(Wal *, u32, int, u8 *);

	102

	103 /* If the WAL is not empty, return the size of the database. */

	104 SQLITE_PRIVATE Pgno sqlite3WalDbsize(Wal *pWal);

	105

	106 /* Obtain or release the WRITER lock. */

	107 SQLITE_PRIVATE int sqlite3WalBeginWriteTransaction(Wal *pWal);

	108 SQLITE_PRIVATE int sqlite3WalEndWriteTransaction(Wal *pWal);

	109

	110 /* Undo any frames written (but not committed) to the log */

	111 SQLITE_PRIVATE int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx);

	112

	113 /* Return an integer that records the current (uncommitted) write

	114 ** position in the WAL */

	115 SQLITE_PRIVATE void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData);

	116

	117 /* Move the write position of the WAL back to iFrame. Called in

	118 ** response to a ROLLBACK TO command. */

	119 SQLITE_PRIVATE int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData);

	120

	121 /* Write a frame or frames to the log. */

	122 SQLITE_PRIVATE int sqlite3WalFrames(Wal *pWal, int, PgHdr *, Pgno, int, int);

	123

	124 /* Copy pages from the log to the database file */

	125 SQLITE_PRIVATE int sqlite3WalCheckpoint(

	126 Wal *pWal, /* Write-ahead log connection */

	127 int eMode, /* One of PASSIVE, FULL and RESTART */

	128 int (*xBusy)(void*), /* Function to call when busy */

	129 void *pBusyArg, /* Context argument for xBusyHandler */

	130 int sync_flags, /* Flags to sync db file with (or 0) */

	131 int nBuf, /* Size of buffer nBuf */

	132 u8 *zBuf, /* Temporary buffer to use */

	133 int *pnLog, /* OUT: Number of frames in WAL */

	134 int *pnCkpt /* OUT: Number of backfilled frames in WAL */

	135 );

	136

	137 /* Return the value to pass to a sqlite3_wal_hook callback, the

	138 ** number of frames in the WAL at the point of the last commit since

	139 ** sqlite3WalCallback() was called. If no commits have occurred since

	140 ** the last call, then return 0.

	141 */

	142 SQLITE_PRIVATE int sqlite3WalCallback(Wal *pWal);

	143

	144 /* Tell the wal layer that an EXCLUSIVE lock has been obtained (or released)

	145 ** by the pager layer on the database file.

	146 */

	147 SQLITE_PRIVATE int sqlite3WalExclusiveMode(Wal *pWal, int op);

	148

	149 /* Return true if the argument is non-NULL and the WAL module is using

	150 ** heap-memory for the wal-index. Otherwise, if the argument is NULL or the

	151 ** WAL module is using shared-memory, return false.

	152 */

	153 SQLITE_PRIVATE int sqlite3WalHeapMemory(Wal *pWal);

	154

	155 #ifdef SQLITE_ENABLE_SNAPSHOT

	156 SQLITE_PRIVATE int sqlite3WalSnapshotGet(Wal *pWal, sqlite3_snapshot **ppSnapshot);

	157 SQLITE_PRIVATE void sqlite3WalSnapshotOpen(Wal *pWal, sqlite3_snapshot *pSnapshot);

	158 #endif

	159

	160 #ifdef SQLITE_ENABLE_ZIPVFS

	161 /* If the WAL file is not empty, return the number of bytes of content

	162 ** stored in each frame (i.e. the db page-size when the WAL was created).

	163 */

	164 SQLITE_PRIVATE int sqlite3WalFramesize(Wal *pWal);

	165 #endif

	166

	167 /* Return the sqlite3_file object for the WAL file */

	168 SQLITE_PRIVATE sqlite3_file *sqlite3WalFile(Wal *pWal);

	169

	170 #endif /* ifndef SQLITE_OMIT_WAL */

	171 #endif /* _WAL_H_ */

	172

	173 /************ End of wal.h ***********************************************/

	174 /************ Continuing where we left off in pager.c ********************/

	175

	176

	177 /***************** NOTES ON THE DESIGN OF THE PAGER **********************

	178 **

	179 ** This comment block describes invariants that hold when using a rollback

	180 ** journal. These invariants do not apply for journal_mode=WAL,

	181 ** journal_mode=MEMORY, or journal_mode=OFF.

	182 **

	183 ** Within this comment block, a page is deemed to have been synced

	184 ** automatically as soon as it is written when PRAGMA synchronous=OFF.

	185 ** Otherwise, the page is not synced until the xSync method of the VFS

	186 ** is called successfully on the file containing the page.

	187 **

	188 ** Definition: A page of the database file is said to be "overwriteable" if

	189 ** one or more of the following are true about the page:

	190 **

	191 ** (a) The original content of the page as it was at the beginning of

	192 ** the transaction has been written into the rollback journal and

	193 ** synced.

	194 **

	195 ** (b) The page was a freelist leaf page at the start of the transaction.

	196 **

	197 ** (c) The page number is greater than the largest page that existed in

	198 ** the database file at the start of the transaction.

	199 **

	200 ** (1) A page of the database file is never overwritten unless one of the

	201 ** following are true:

	202 **

	203 ** (a) The page and all other pages on the same sector are overwriteable.

	204 **

	205 ** (b) The atomic page write optimization is enabled, and the entire

	206 ** transaction other than the update of the transaction sequence

	207 ** number consists of a single page change.

	208 **

	209 ** (2) The content of a page written into the rollback journal exactly matches

	210 ** both the content in the database when the rollback journal was written

	211 ** and the content in the database at the beginning of the current

	212 ** transaction.

	213 **

	214 ** (3) Writes to the database file are an integer multiple of the page size

	215 ** in length and are aligned on a page boundary.

	216 **

	217 ** (4) Reads from the database file are either aligned on a page boundary and

	218 ** an integer multiple of the page size in length or are taken from the

	219 ** first 100 bytes of the database file.

	220 **

	221 ** (5) All writes to the database file are synced prior to the rollback journal

	222 ** being deleted, truncated, or zeroed.

	223 **

	224 ** (6) If a master journal file is used, then all writes to the database file

	225 ** are synced prior to the master journal being deleted.

	226 **

	227 ** Definition: Two databases (or the same database at two points in time)

	228 ** are said to be "logically equivalent" if they give the same answer to

	229 ** all queries. Note in particular the content of freelist leaf

	230 ** pages can be changed arbitrarily without affecting the logical equivalence

	231 ** of the database.

	232 **

	233 ** (7) At any time, if any subset, including the empty set and the total set,

	234 ** of the unsynced changes to a rollback journal are removed and the

	235 ** journal is rolled back, the resulting database file will be logically

	236 ** equivalent to the database file at the beginning of the transaction.

	237 **

	238 ** (8) When a transaction is rolled back, the xTruncate method of the VFS

	239 ** is called to restore the database file to the same size it was at

	240 ** the beginning of the transaction. (In some VFSes, the xTruncate

	241 ** method is a no-op, but that does not change the fact that SQLite will

	242 ** invoke it.)

	243 **

	244 ** (9) Whenever the database file is modified, at least one bit in the range

	245 ** of bytes from 24 through 39 inclusive will be changed prior to releasing

	246 ** the EXCLUSIVE lock, thus signaling other connections on the same

	247 ** database to flush their caches.

	248 **

	249 ** (10) The pattern of bits in bytes 24 through 39 shall not repeat in less

	250 ** than one billion transactions.

	251 **

	252 ** (11) A database file is well-formed at the beginning and at the conclusion

	253 ** of every transaction.

	254 **

	255 ** (12) An EXCLUSIVE lock is held on the database file when writing to

	256 ** the database file.

	257 **

	258 ** (13) A SHARED lock is held on the database file while reading any

	259 ** content out of the database file.

	260 **

	261 ******************************************************************************/

	262

	263 /*

	264 ** Macros for troubleshooting. Normally turned off

	265 */

	266 #if 0

	267 int sqlite3PagerTrace=1; /* True to enable tracing */

	268 #define sqlite3DebugPrintf printf

	269 #define PAGERTRACE(X) if( sqlite3PagerTrace ){ sqlite3DebugPrintf X; }

	270 #else

	271 #define PAGERTRACE(X)

	272 #endif

	273

	274 /*

	275 ** The following two macros are used within the PAGERTRACE() macros above

	276 ** to print out file-descriptors.

	277 **

	278 ** PAGERID() takes a pointer to a Pager struct as its argument. The

	279 ** associated file-descriptor is returned. FILEHANDLEID() takes an sqlite3_file

	280 ** struct as its argument.

	281 */

	282 #define PAGERID(p) ((int)(p->fd))

	283 #define FILEHANDLEID(fd) ((int)fd)

	284

	285 /*

	286 ** The Pager.eState variable stores the current 'state' of a pager. A

	287 ** pager may be in any one of the seven states shown in the following

	288 ** state diagram.

	289 **

	290 **                            OPEN <------+------+

	291 **                              |         |      |

	292 **                              V         |      |

	293 **               +---------> READER-------+      |

	294 **               |              |                |

	295 **               |              V                |

	296 **               |<-------WRITER_LOCKED------> ERROR

	297 **               |              |                ^

	298 **               |              V                |

	299 **               |<------WRITER_CACHEMOD-------->|

	300 **               |              |                |

	301 **               |              V                |

	302 **               |<-------WRITER_DBMOD---------->|

	303 **               |              |                |

	304 **               |              V                |

	305 **               +<------WRITER_FINISHED-------->+

	306 **

	307 **

	308 ** List of state transitions and the C [function] that performs each:

	309 **

	310 ** OPEN -> READER [sqlite3PagerSharedLock]

	311 ** READER -> OPEN [pager_unlock]

	312 **

	313 ** READER -> WRITER_LOCKED [sqlite3PagerBegin]

	314 ** WRITER_LOCKED -> WRITER_CACHEMOD [pager_open_journal]

	315 ** WRITER_CACHEMOD -> WRITER_DBMOD [syncJournal]

	316 ** WRITER_DBMOD -> WRITER_FINISHED [sqlite3PagerCommitPhaseOne]

	317 ** WRITER_*** -> READER [pager_end_transaction]

	318 **

	319 ** WRITER_*** -> ERROR [pager_error]

	320 ** ERROR -> OPEN [pager_unlock]

	321 **

	322 **

	323 ** OPEN:

	324 **

	325 ** The pager starts up in this state. Nothing is guaranteed in this

	326 ** state - the file may or may not be locked and the database size is

	327 ** unknown. The database may not be read or written.

	328 **

	329 ** * No read or write transaction is active.

	330 ** * Any lock, or no lock at all, may be held on the database file.

	331 ** * The dbSize, dbOrigSize and dbFileSize variables may not be trusted.

	332 **

	333 ** READER:

	334 **

	335 ** In this state all the requirements for reading the database in

	336 ** rollback (non-WAL) mode are met. Unless the pager is (or recently

	337 ** was) in exclusive-locking mode, a user-level read transaction is

	338 ** open. The database size is known in this state.

	339 **

	340 ** A connection running with locking_mode=normal enters this state when

	341 ** it opens a read-transaction on the database and returns to state

	342 ** OPEN after the read-transaction is completed. However a connection

	343 ** running in locking_mode=exclusive (including temp databases) remains in

	344 ** this state even after the read-transaction is closed. The only way

	345 ** a locking_mode=exclusive connection can transition from READER to OPEN

	346 ** is via the ERROR state (see below).

	347 **

	348 ** * A read transaction may be active (but a write-transaction cannot).

	349 ** * A SHARED or greater lock is held on the database file.

	350 ** * The dbSize variable may be trusted (even if a user-level read

	351 ** transaction is not active). The dbOrigSize and dbFileSize variables

	352 ** may not be trusted at this point.

	353 ** * If the database is a WAL database, then the WAL connection is open.

	354 ** * Even if a read-transaction is not open, it is guaranteed that

	355 ** there is no hot-journal in the file-system.

	356 **

	357 ** WRITER_LOCKED:

	358 **

	359 ** The pager moves to this state from READER when a write-transaction

	360 ** is first opened on the database. In WRITER_LOCKED state, all locks

	361 ** required to start a write-transaction are held, but no actual

	362 ** modifications to the cache or database have taken place.

	363 **

	364 ** In rollback mode, a RESERVED or (if the transaction was opened with

	365 ** BEGIN EXCLUSIVE) EXCLUSIVE lock is obtained on the database file when

	366 ** moving to this state, but the journal file is not written to or opened

	367 ** in this state. If the transaction is committed or rolled back while

	368 ** in WRITER_LOCKED state, all that is required is to unlock the database

	369 ** file.

	370 **

	371 ** In WAL mode, WalBeginWriteTransaction() is called to lock the log file.

	372 ** If the connection is running with locking_mode=exclusive, an attempt

	373 ** is made to obtain an EXCLUSIVE lock on the database file.

	374 **

	375 ** * A write transaction is active.

	376 ** * If the connection is open in rollback-mode, a RESERVED or greater

	377 ** lock is held on the database file.

	378 ** * If the connection is open in WAL-mode, a WAL write transaction

	379 ** is open (i.e. sqlite3WalBeginWriteTransaction() has been successfully

	380 ** called).

	381 ** * The dbSize, dbOrigSize and dbFileSize variables are all valid.

	382 ** * The contents of the pager cache have not been modified.

	383 ** * The journal file may or may not be open.

	384 ** * Nothing (not even the first header) has been written to the journal.

	385 **

	386 ** WRITER_CACHEMOD:

	387 **

	388 ** A pager moves from WRITER_LOCKED state to this state when a page is

	389 ** first modified by the upper layer. In rollback mode the journal file

	390 ** is opened (if it is not already open) and a header written to the

	391 ** start of it. The database file on disk has not been modified.

	392 **

	393 ** * A write transaction is active.

	394 ** * A RESERVED or greater lock is held on the database file.

	395 ** * The journal file is open and the first header has been written

	396 ** to it, but the header has not been synced to disk.

	397 ** * The contents of the page cache have been modified.

	398 **

	399 ** WRITER_DBMOD:

	400 **

	401 ** The pager transitions from WRITER_CACHEMOD into WRITER_DBMOD state

	402 ** when it modifies the contents of the database file. WAL connections

	403 ** never enter this state (since they do not modify the database file,

	404 ** just the log file).

	405 **

	406 ** * A write transaction is active.

	407 ** * An EXCLUSIVE or greater lock is held on the database file.

	408 ** * The journal file is open and the first header has been written

	409 ** and synced to disk.

	410 ** * The contents of the page cache have been modified (and possibly

	411 ** written to disk).

	412 **

	413 ** WRITER_FINISHED:

	414 **

	415 ** It is not possible for a WAL connection to enter this state.

	416 **

	417 ** A rollback-mode pager changes to WRITER_FINISHED state from WRITER_DBMOD

	418 ** state after the entire transaction has been successfully written into the

	419 ** database file. In this state the transaction may be committed simply

	420 ** by finalizing the journal file. Once in WRITER_FINISHED state, it is

	421 ** not possible to modify the database further. At this point, the upper

	422 ** layer must either commit or rollback the transaction.

	423 **

	424 ** * A write transaction is active.

	425 ** * An EXCLUSIVE or greater lock is held on the database file.

	426 ** * All writing and syncing of journal and database data has finished.

	427 ** If no error occurred, all that remains is to finalize the journal to

	428 ** commit the transaction. If an error did occur, the caller will need

	429 ** to rollback the transaction.

	430 **

	431 ** ERROR:

	432 **

	433 ** The ERROR state is entered when an IO or disk-full error (including

	434 ** SQLITE_IOERR_NOMEM) occurs at a point in the code that makes it

	435 ** difficult to be sure that the in-memory pager state (cache contents,

	436 ** db size etc.) are consistent with the contents of the file-system.

	437 **

	438 ** Temporary pager files may enter the ERROR state, but in-memory pagers

	439 ** cannot.

	440 **

	441 ** For example, if an IO error occurs while performing a rollback,

	442 ** the contents of the page-cache may be left in an inconsistent state.

	443 ** At this point it would be dangerous to change back to READER state

	444 ** (as usually happens after a rollback). Any subsequent readers might

	445 ** report database corruption (due to the inconsistent cache), and if

	446 ** they upgrade to writers, they may inadvertently corrupt the database

	447 ** file. To avoid this hazard, the pager switches into the ERROR state

	448 ** instead of READER following such an error.

	449 **

	450 ** Once it has entered the ERROR state, any attempt to use the pager

	451 ** to read or write data returns an error. Eventually, once all

	452 ** outstanding transactions have been abandoned, the pager is able to

	453 ** transition back to OPEN state, discarding the contents of the

	454 ** page-cache and any other in-memory state at the same time. Everything

	455 ** is reloaded from disk (and, if necessary, hot-journal rollback performed)

	456 ** when a read-transaction is next opened on the pager (transitioning

	457 ** the pager into READER state). At that point the system has recovered

	458 ** from the error.

	459 **

	460 ** Specifically, the pager jumps into the ERROR state if:

	461 **

	462 ** 1. An error occurs while attempting a rollback. This happens in

	463 ** function sqlite3PagerRollback().

	464 **

	465 ** 2. An error occurs while attempting to finalize a journal file

	466 ** following a commit in function sqlite3PagerCommitPhaseTwo().

	467 **

	468 ** 3. An error occurs while attempting to write to the journal or

	469 ** database file in function pagerStress() in order to free up

	470 ** memory.

	471 **

	472 ** In other cases, the error is returned to the b-tree layer. The b-tree

	473 ** layer then attempts a rollback operation. If the error condition

	474 ** persists, the pager enters the ERROR state via condition (1) above.

	475 **

	476 ** Condition (3) is necessary because it can be triggered by a read-only

	477 ** statement executed within a transaction. In this case, if the error

	478 ** code were simply returned to the user, the b-tree layer would not

	479 ** automatically attempt a rollback, as it assumes that an error in a

	480 ** read-only statement cannot leave the pager in an internally inconsistent

	481 ** state.

	482 **

	483 ** * The Pager.errCode variable is set to something other than SQLITE_OK.

	484 ** * There are one or more outstanding references to pages (after the

	485 ** last reference is dropped the pager should move back to OPEN state).

	486 ** * The pager is not an in-memory pager.

	487 **

	488 **

	489 ** Notes:

	490 **

	491 ** * A pager is never in WRITER_DBMOD or WRITER_FINISHED state if the

	492 ** connection is open in WAL mode. A WAL connection is always in one

	493 ** of the first four states.

	494 **

	495 ** * Normally, a connection open in exclusive mode is never in PAGER_OPEN

	496 ** state. There are two exceptions: immediately after exclusive-mode has

	497 ** been turned on (and before any read or write transactions are

	498 ** executed), and when the pager is leaving the "error state".

	499 **

	500 ** * See also: assert_pager_state().

	501 */

	502 #define PAGER_OPEN 0

	503 #define PAGER_READER 1

	504 #define PAGER_WRITER_LOCKED 2

	505 #define PAGER_WRITER_CACHEMOD 3

	506 #define PAGER_WRITER_DBMOD 4

	507 #define PAGER_WRITER_FINISHED 5

	508 #define PAGER_ERROR 6

	509

	510 /*

	511 ** The Pager.eLock variable is almost always set to one of the

	512 ** following locking-states, according to the lock currently held on

	513 ** the database file: NO_LOCK, SHARED_LOCK, RESERVED_LOCK or EXCLUSIVE_LOCK.

	514 ** This variable is kept up to date as locks are taken and released by

	515 ** the pagerLockDb() and pagerUnlockDb() wrappers.

	516 **

	517 ** If the VFS xLock() or xUnlock() returns an error other than SQLITE_BUSY

	518 ** (i.e. one of the SQLITE_IOERR subtypes), it is not clear whether or not

	519 ** the operation was successful. In these circumstances pagerLockDb() and

	520 ** pagerUnlockDb() take a conservative approach - eLock is always updated

	521 ** when unlocking the file, and only updated when locking the file if the

	522 ** VFS call is successful. This way, the Pager.eLock variable may be set

	523 ** to a less exclusive (lower) value than the lock that is actually held

	524 ** at the system level, but it is never set to a more exclusive value.

	525 **

	526 ** This is usually safe. If an xUnlock fails or appears to fail, there may

	527 ** be a few redundant xLock() calls or a lock may be held for longer than

	528 ** required, but nothing really goes wrong.

	529 **

	530 ** The exception is when the database file is unlocked as the pager moves

	531 ** from ERROR to OPEN state. At this point there may be a hot-journal file

	532 ** in the file-system that needs to be rolled back (as part of an OPEN->SHARED

	533 ** transition, by the same pager or any other). If the call to xUnlock()

	534 ** fails at this point and the pager is left holding an EXCLUSIVE lock, this

	535 ** can confuse the xCheckReservedLock() call made later as part

	536 ** of hot-journal detection.

	537 **

	538 ** xCheckReservedLock() is defined as returning true "if there is a RESERVED

	539 ** lock held by this process or any others". So xCheckReservedLock may

	540 ** return true because the caller itself is holding an EXCLUSIVE lock (but

	541 ** doesn't know it because of a previous error in xUnlock). If this happens

	542 ** a hot-journal may be mistaken for a journal being created by an active

	543 ** transaction in another process, causing SQLite to read from the database

	544 ** without rolling it back.

	545 **

	546 ** To work around this, if a call to xUnlock() fails when unlocking the

	547 ** database in the ERROR state, Pager.eLock is set to UNKNOWN_LOCK. It

	548 ** is only changed back to a real locking state after a successful call

	549 ** to xLock(EXCLUSIVE). Also, the code to do the OPEN->SHARED state transition

	550 ** omits the check for a hot-journal if Pager.eLock is set to UNKNOWN_LOCK

	551 ** lock. Instead, it assumes a hot-journal exists and obtains an EXCLUSIVE

	552 ** lock on the database file before attempting to roll it back. See function

	553 ** PagerSharedLock() for more detail.

	554 **

	555 ** Pager.eLock may only be set to UNKNOWN_LOCK when the pager is in

	556 ** PAGER_OPEN state.

	557 */

	558 #define UNKNOWN_LOCK (EXCLUSIVE_LOCK+1)

	559

	560 /*

	561 ** A macro used for invoking the codec if there is one

	562 */

	563 #ifdef SQLITE_HAS_CODEC

	564 # define CODEC1(P,D,N,X,E) \

	565 if( P->xCodec && P->xCodec(P->pCodec,D,N,X)==0 ){ E; }

	566 # define CODEC2(P,D,N,X,E,O) \

	567 if( P->xCodec==0 ){ O=(char*)D; }else \

	568 if( (O=(char*)(P->xCodec(P->pCodec,D,N,X)))==0 ){ E; }

	569 #else

	570 # define CODEC1(P,D,N,X,E) /* NO-OP */

	571 # define CODEC2(P,D,N,X,E,O) O=(char*)D

	572 #endif

	573

	574 /*

	575 ** The maximum allowed sector size. 64KiB. If the xSectorsize() method

	576 ** returns a value larger than this, then MAX_SECTOR_SIZE is used instead.

	577 ** This could conceivably cause corruption following a power failure on

	578 ** such a system. This is currently an undocumented limit.

	579 */

	580 #define MAX_SECTOR_SIZE 0x10000

	581

	582 /*

	583 ** An instance of the following structure is allocated for each active

	584 ** savepoint and statement transaction in the system. All such structures

	585 ** are stored in the Pager.aSavepoint[] array, which is allocated and

	586 ** resized using sqlite3Realloc().

	587 **

	588 ** When a savepoint is created, the PagerSavepoint.iHdrOffset field is

	589 ** set to 0. If a journal-header is written into the main journal while

	590 ** the savepoint is active, then iHdrOffset is set to the byte offset

	591 ** immediately following the last journal record written into the main

	592 ** journal before the journal-header. This is required during savepoint

	593 ** rollback (see pagerPlaybackSavepoint()).

	594 */

	595 typedef struct PagerSavepoint PagerSavepoint;

	596 struct PagerSavepoint {

	597 i64 iOffset; /* Starting offset in main journal */

	598 i64 iHdrOffset; /* See above */

	599 Bitvec *pInSavepoint; /* Set of pages in this savepoint */

	600 Pgno nOrig; /* Original number of pages in file */

	601 Pgno iSubRec; /* Index of first record in sub-journal */

	602 #ifndef SQLITE_OMIT_WAL

	603 u32 aWalData[WAL_SAVEPOINT_NDATA]; /* WAL savepoint context */

	604 #endif

	605 };

	606

	607 /*

	608 ** Bits of the Pager.doNotSpill flag. See further description below.

	609 */

	610 #define SPILLFLAG_OFF 0x01 /* Never spill cache. Set via pragma */

	611 #define SPILLFLAG_ROLLBACK 0x02 /* Currently rolling back, so do not spill */

	612 #define SPILLFLAG_NOSYNC 0x04 /* Spill is ok, but do not sync */

	613

	614 /*

	615 ** An open page cache is an instance of struct Pager. A description of

	616 ** some of the more important member variables follows:

	617 **

	618 ** eState

	619 **

	620 ** The current 'state' of the pager object. See the comment and state

	621 ** diagram above for a description of the pager state.

	622 **

	623 ** eLock

	624 **

	625 ** For a real on-disk database, the current lock held on the database file -

	626 ** NO_LOCK, SHARED_LOCK, RESERVED_LOCK or EXCLUSIVE_LOCK.

	627 **

	628 ** For a temporary or in-memory database (neither of which require any

	629 ** locks), this variable is always set to EXCLUSIVE_LOCK. Since such

	630 ** databases always have Pager.exclusiveMode==1, this tricks the pager

	631 ** logic into thinking that it already has all the locks it will ever

	632 ** need (and no reason to release them).

	633 **

	634 ** In some (obscure) circumstances, this variable may also be set to

	635 ** UNKNOWN_LOCK. See the comment above the #define of UNKNOWN_LOCK for

	636 ** details.

	637 **

	638 ** changeCountDone

	639 **

	640 ** This boolean variable is used to make sure that the change-counter

	641 ** (the 4-byte header field at byte offset 24 of the database file) is

	642 ** not updated more often than necessary.

	643 **

	644 ** It is set to true when the change-counter field is updated, which

	645 ** can only happen if an exclusive lock is held on the database file.

	646 ** It is cleared (set to false) whenever an exclusive lock is

	647 ** relinquished on the database file. Each time a transaction is committed,

	648 ** the changeCountDone flag is inspected. If it is true, the work of

	649 ** updating the change-counter is omitted for the current transaction.

	650 **

	651 ** This mechanism means that when running in exclusive mode, a connection

	652 ** need only update the change-counter once, for the first transaction

	653 ** committed.

	654 **

	655 ** setMaster

	656 **

	657 ** When PagerCommitPhaseOne() is called to commit a transaction, it may

	658 ** (or may not) specify a master-journal name to be written into the

	659 ** journal file before it is synced to disk.

	660 **

	661 ** Whether or not a journal file contains a master-journal pointer affects

	662 ** the way in which the journal file is finalized after the transaction is

	663 ** committed or rolled back when running in "journal_mode=PERSIST" mode.

	664 ** If a journal file does not contain a master-journal pointer, it is

	665 ** finalized by overwriting the first journal header with zeroes. If

	666 ** it does contain a master-journal pointer the journal file is finalized

	667 ** by truncating it to zero bytes, just as if the connection were

	668 ** running in "journal_mode=truncate" mode.

	669 **

	670 ** Journal files that contain master journal pointers cannot be finalized

	671 ** simply by overwriting the first journal-header with zeroes, as the

	672 ** master journal pointer could interfere with hot-journal rollback of any

	673 ** subsequently interrupted transaction that reuses the journal file.

	674 **

	675 ** The flag is cleared as soon as the journal file is finalized (either

	676 ** by PagerCommitPhaseTwo or PagerRollback). If an IO error prevents the

	677 ** journal file from being successfully finalized, the setMaster flag

	678 ** is cleared anyway (and the pager will move to ERROR state).

	679 **

	680 ** doNotSpill

	681 **

	682 ** This variable controls the behavior of cache-spills (calls made by

	683 ** the pcache module to the pagerStress() routine to write cached data

	684 ** to the file-system in order to free up memory).

	685 **

	686 ** When bits SPILLFLAG_OFF or SPILLFLAG_ROLLBACK of doNotSpill are set,

	687 ** writing to the database from pagerStress() is disabled altogether.

	688 ** The SPILLFLAG_ROLLBACK case is done in a very obscure case that

	689 ** comes up during savepoint rollback that requires the pcache module

	690 ** to allocate a new page to prevent the journal file from being written

	691 ** while it is being traversed by code in pager_playback(). The SPILLFLAG_OFF

	692 ** case is a user preference.

	693 **

	694 ** If the SPILLFLAG_NOSYNC bit is set, writing to the database from

	695 ** pagerStress() is permitted, but syncing the journal file is not.

	696 ** This flag is set by sqlite3PagerWrite() when the file-system sector-size

	697 ** is larger than the database page-size in order to prevent a journal sync

	698 ** from happening in between the journalling of two pages on the same sector.

	699 **

	700 ** subjInMemory

	701 **

	702 ** This is a boolean variable. If true, then any required sub-journal

	703 ** is opened as an in-memory journal file. If false, then in-memory

	704 ** sub-journals are only used for in-memory pager files.

	705 **

	706 ** This variable is updated by the upper layer each time a new

	707 ** write-transaction is opened.

	708 **

	709 ** dbSize, dbOrigSize, dbFileSize

	710 **

	711 ** Variable dbSize is set to the number of pages in the database file.

	712 ** It is valid in PAGER_READER and higher states (all states except for

	713 ** OPEN and ERROR).

	714 **

	715 ** dbSize is set based on the size of the database file, which may be

	716 ** larger than the size of the database (the value stored at offset

	717 ** 28 of the database header by the btree). If the size of the file

	718 ** is not an integer multiple of the page-size, the value stored in

	719 ** dbSize is rounded down (i.e. a 5KB file with 2K page-size has dbSize==2).

	720 ** Except, any file that is greater than 0 bytes in size is considered

	721 ** to have at least one page. (i.e. a 1KB file with 2K page-size leads

	722 ** to dbSize==1).

	723 **

	724 ** During a write-transaction, if pages with page-numbers greater than

	725 ** dbSize are modified in the cache, dbSize is updated accordingly.

	726 ** Similarly, if the database is truncated using PagerTruncateImage(),

	727 ** dbSize is updated.

	728 **

	729 ** Variables dbOrigSize and dbFileSize are valid in states

	730 ** PAGER_WRITER_LOCKED and higher. dbOrigSize is a copy of the dbSize

	731 ** variable at the start of the transaction. It is used during rollback,

	732 ** and to determine whether or not pages need to be journalled before

	733 ** being modified.

	734 **

	735 ** Throughout a write-transaction, dbFileSize contains the size of

	736 ** the file on disk in pages. It is set to a copy of dbSize when the

	737 ** write-transaction is first opened, and updated when VFS calls are made

	738 ** to write or truncate the database file on disk.

	739 **

	740 ** The only reason the dbFileSize variable is required is to suppress

	741 ** unnecessary calls to xTruncate() after committing a transaction. If,

	742 ** when a transaction is committed, the dbFileSize variable indicates

	743 ** that the database file is larger than the database image (Pager.dbSize),

	744 ** pager_truncate() is called. The pager_truncate() call uses xFilesize()

	745 ** to measure the database file on disk, and then truncates it if required.

	746 ** dbFileSize is not used when rolling back a transaction. In this case

	747 ** pager_truncate() is called unconditionally (which means there may be

	748 ** a call to xFilesize() that is not strictly required). In either case,

	749 ** pager_truncate() may cause the file to become smaller or larger.

	750 **

	751 ** dbHintSize

	752 **

	753 ** The dbHintSize variable is used to limit the number of calls made to

	754 ** the VFS xFileControl(FCNTL_SIZE_HINT) method.

	755 **

	756 ** dbHintSize is set to a copy of the dbSize variable when a

	757 ** write-transaction is opened (at the same time as dbFileSize and

	758 ** dbOrigSize). If the xFileControl(FCNTL_SIZE_HINT) method is called,

	759 ** dbHintSize is increased to the number of pages that correspond to the

	760 ** size-hint passed to the method call. See pager_write_pagelist() for

	761 ** details.

	762 **

	763 ** errCode

	764 **

	765 ** The Pager.errCode variable is only ever used in PAGER_ERROR state. It

	766 ** is set to zero in all other states. In PAGER_ERROR state, Pager.errCode

	767 ** is always set to SQLITE_FULL, SQLITE_IOERR or one of the SQLITE_IOERR_XXX

	768 ** sub-codes.

	769 */

	770 struct Pager {

	771 sqlite3_vfs pVfs; / OS functions to use for IO */

	772 u8 exclusiveMode; /* Boolean. True if locking_mode==EXCLUSIVE */

	773 u8 journalMode; /* One of the PAGER_JOURNALMODE_* values */

	774 u8 useJournal; /* Use a rollback journal on this file */

	775 u8 noSync; /* Do not sync the journal if true */

	776 u8 fullSync; /* Do extra syncs of the journal for robustness */

	777 u8 ckptSyncFlags; /* SYNC_NORMAL or SYNC_FULL for checkpoint */

	778 u8 walSyncFlags; /* SYNC_NORMAL or SYNC_FULL for wal writes */

	779 u8 syncFlags; /* SYNC_NORMAL or SYNC_FULL otherwise */

	780 u8 tempFile; /* zFilename is a temporary or immutable file */

	781 u8 noLock; /* Do not lock (except in WAL mode) */

	782 u8 readOnly; /* True for a read-only database */

	783 u8 memDb; /* True to inhibit all file I/O */

	784

	785 /**************************************************************************

	786 ** The following block contains those class members that change during

	787 ** routine operation. Class members not in this block are either fixed

	788 ** when the pager is first created or else only change when there is a

	789 ** significant mode change (such as changing the page_size, locking_mode,

	790 ** or the journal_mode). From another view, these class members describe

	791 ** the "state" of the pager, while other class members describe the

	792 ** "configuration" of the pager.

	793 */

	794 u8 eState; /* Pager state (OPEN, READER, WRITER_LOCKED..) */

	795 u8 eLock; /* Current lock held on database file */

	796 u8 changeCountDone; /* Set after incrementing the change-counter */

	797 u8 setMaster; /* True if a m-j name has been written to jrnl */

	798 u8 doNotSpill; /* Do not spill the cache when non-zero */

	799 u8 subjInMemory; /* True to use in-memory sub-journals */

	800 u8 bUseFetch; /* True to use xFetch() */

	801 u8 hasHeldSharedLock; /* True if a shared lock has ever been held */

	802 Pgno dbSize; /* Number of pages in the database */

	803 Pgno dbOrigSize; /* dbSize before the current transaction */

	804 Pgno dbFileSize; /* Number of pages in the database file */

	805 Pgno dbHintSize; /* Value passed to FCNTL_SIZE_HINT call */

	806 int errCode; /* One of several kinds of errors */

	807 int nRec; /* Pages journalled since last j-header written */

	808 u32 cksumInit; /* Quasi-random value added to every checksum */

	809 u32 nSubRec; /* Number of records written to sub-journal */

	810 Bitvec pInJournal; / One bit for each page in the database file */

	811 sqlite3_file fd; / File descriptor for database */

	812 sqlite3_file jfd; / File descriptor for main journal */

	813 sqlite3_file sjfd; / File descriptor for sub-journal */

	814 i64 journalOff; /* Current write offset in the journal file */

	815 i64 journalHdr; /* Byte offset to previous journal header */

	816 sqlite3_backup pBackup; / Pointer to list of ongoing backup processes */

	817 PagerSavepoint aSavepoint; / Array of active savepoints */

	818 int nSavepoint; /* Number of elements in aSavepoint[] */

	819 u32 iDataVersion; /* Changes whenever database content changes */

	820 char dbFileVers[16]; /* Changes whenever database file changes */

	821

	822 int nMmapOut; /* Number of mmap pages currently outstanding */

	823 sqlite3_int64 szMmap; /* Desired maximum mmap size */

	824 PgHdr pMmapFreelist; / List of free mmap page headers (pDirty) */

	825 /*

	826 ** End of the routinely-changing class members

	827 ***************************************************************************/

	828

	829 u16 nExtra; /* Add this many bytes to each in-memory page */

	830 i16 nReserve; /* Number of unused bytes at end of each page */

	831 u32 vfsFlags; /* Flags for sqlite3_vfs.xOpen() */

	832 u32 sectorSize; /* Assumed sector size during rollback */

	833 int pageSize; /* Number of bytes in a page */

	834 Pgno mxPgno; /* Maximum allowed size of the database */

	835 i64 journalSizeLimit; /* Size limit for persistent journal files */

	836 char zFilename; / Name of the database file */

	837 char zJournal; / Name of the journal file */

	838 int (xBusyHandler)(void); /* Function to call when busy */

	839 void pBusyHandlerArg; / Context argument for xBusyHandler */

	840 int aStat[3]; /* Total cache hits, misses and writes */

	841 #ifdef SQLITE_TEST

	842 int nRead; /* Database pages read */

	843 #endif

	844 void (xReiniter)(DbPage); /* Call this routine when reloading pages */

	845 #ifdef SQLITE_HAS_CODEC

	846 void (xCodec)(void,void,Pgno,int); /* Routine for en/decoding data */

	847 void (xCodecSizeChng)(void,int,int); /* Notify of page size changes */

	848 void (xCodecFree)(void); /* Destructor for the codec */

	849 void pCodec; / First argument to xCodec... methods */

	850 #endif

	851 char pTmpSpace; / Pager.pageSize bytes of space for tmp use */

	852 PCache pPCache; / Pointer to page cache object */

	853 #ifndef SQLITE_OMIT_WAL

	854 Wal pWal; / Write-ahead log used by "journal_mode=wal" */

	855 char zWal; / File name for write-ahead log */

	856 #endif

	857 };

	858

	859 /*

	860 ** Indexes for use with Pager.aStat[]. The Pager.aStat[] array contains

	861 ** the values accessed by passing SQLITE_DBSTATUS_CACHE_HIT, CACHE_MISS

	862 ** or CACHE_WRITE to sqlite3_db_status().

	863 */

	864 #define PAGER_STAT_HIT 0 /* aStat[] slot counting cache hits */

	865 #define PAGER_STAT_MISS 1 /* aStat[] slot counting cache misses */

	866 #define PAGER_STAT_WRITE 2 /* aStat[] slot counting cache writes */

	867

	868 /*

	869 ** The following global variables hold counters used for

	870 ** testing purposes only. These variables do not exist in

	871 ** a non-testing build. These variables are not thread-safe.

	872 */

	873 #ifdef SQLITE_TEST

	874 SQLITE_API int sqlite3_pager_readdb_count = 0; /* Number of full pages read from DB */

	875 SQLITE_API int sqlite3_pager_writedb_count = 0; /* Number of full pages written to DB */

	876 SQLITE_API int sqlite3_pager_writej_count = 0; /* Number of pages written to journal */

	877 # define PAGER_INCR(v) v++ /* Bump a test counter (SQLITE_TEST builds only) */

	878 #else

	879 # define PAGER_INCR(v) /* Expands to nothing when SQLITE_TEST is not defined */

	880 #endif

	881

	882

	883

	884 /*

	885 ** Journal files begin with the following magic string. The data

	886 ** was obtained from /dev/random. It is used only as a sanity check.

	887 **

	888 ** Since version 2.8.0, the journal format contains additional sanity

	889 ** checking information. If the power fails while the journal is being

	890 ** written, semi-random garbage data might appear in the journal

	891 ** file after power is restored. If an attempt is then made

	892 ** to roll the journal back, the database could be corrupted. The additional

	893 ** sanity checking data is an attempt to discover the garbage in the

	894 ** journal and ignore it.

	895 **

	896 ** The sanity checking information for the new journal format consists

	897 ** of a 32-bit checksum on each page of data. The checksum covers both

	898 ** the page number and the pPager->pageSize bytes of data for the page.

	899 ** This cksum is initialized to a 32-bit random value that appears in the

	900 ** journal file right after the header. The random initializer is important,

	901 ** because garbage data that appears at the end of a journal is likely

	902 ** data that was once in other files that have now been deleted. If the

	903 ** garbage data came from an obsolete journal file, the checksums might

	904 ** be correct. But by initializing the checksum to random value which

	905 ** is different for every journal, we minimize that risk.

	906 */

	907 static const unsigned char aJournalMagic[] = {

	908 0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd7, /* 8 bytes; readMasterJournal() compares all 8 with memcmp() */

	909 };

	910

	/*
	** The size of each page record in the journal is given by the
	** following macro: the page content (pageSize bytes) plus 8 bytes.
	** The argument is fully parenthesized so that any pointer-valued
	** expression (for example &pager) may be passed.
	*/
	#define JOURNAL_PG_SZ(pPager) (((pPager)->pageSize) + 8)

	916

	/*
	** The journal header size for this pager.  This is usually the same
	** size as a single disk sector.  See also setSectorSize().
	** The argument is fully parenthesized so that any pointer-valued
	** expression (for example &pager) may be passed.
	*/
	#define JOURNAL_HDR_SZ(pPager) ((pPager)->sectorSize)

	922

	923 /*

	924 ** The macro MEMDB is true if we are dealing with an in-memory database.

	925 ** We do this as a macro so that if the SQLITE_OMIT_MEMORYDB macro is set,

	926 ** the value of MEMDB will be a constant and the compiler will optimize

	927 ** out code that would never execute.

	928 */

	929 #ifdef SQLITE_OMIT_MEMORYDB

	930 # define MEMDB 0

	931 #else

	932 # define MEMDB pPager->memDb /* relies on a variable named pPager being in scope at the use site */

	933 #endif

	934

	935 /*

	936 ** The macro USEFETCH is true if we are allowed to use the xFetch and xUnfetch

	937 ** interfaces to access the database using memory-mapped I/O.

	938 */

	939 #if SQLITE_MAX_MMAP_SIZE>0

	940 # define USEFETCH(x) ((x)->bUseFetch) /* reads the bUseFetch flag of the object x points to */

	941 #else

	942 # define USEFETCH(x) 0 /* constant 0; the argument is not evaluated */

	943 #endif

	944

	945 /*

	946 ** The maximum legal page number is (2^31 - 1).

	947 */

	948 #define PAGER_MAX_PGNO 2147483647 /* 0x7fffffff */

	949

	950 /*

	951 ** The argument to this macro is a file descriptor (type sqlite3_file*).

	952 ** Return 0 if it is not open, or 1 if it is (pMethods!=0 yields 0 or 1).

	953 **

	954 ** This is so that expressions can be written as:

	955 **

	956 ** if( isOpen(pPager->jfd) ){ ...

	957 **

	958 ** instead of

	959 **

	960 ** if( pPager->jfd->pMethods ){ ...

	961 */

	962 #define isOpen(pFd) ((pFd)->pMethods!=0)

	963

	964 /*

	965 ** Return true if this pager uses a write-ahead log instead of the usual

	966 ** rollback journal. Otherwise false.

	967 */

	968 #ifndef SQLITE_OMIT_WAL

	969 static int pagerUseWal(Pager *pPager){

	970 return (pPager->pWal!=0);

	971 }

	972 #else

	973 # define pagerUseWal(x) 0

	974 # define pagerRollbackWal(x) 0

	975 # define pagerWalFrames(v,w,x,y) 0

	976 # define pagerOpenWalIfPresent(z) SQLITE_OK

	977 # define pagerBeginReadTransaction(z) SQLITE_OK

	978 #endif

	979

	980 #ifndef NDEBUG

	981 /*

	982 ** Usage:

	983 **

	984 ** assert( assert_pager_state(pPager) );

	985 **

	986 ** This function runs many asserts to try to find inconsistencies in

	987 ** the internal state of the Pager object.

	988 */

	989 static int assert_pager_state(Pager *p){

	990 Pager *pPager = p;

	991

	992 /* State must be valid. */

	993 assert( p->eState==PAGER_OPEN

	994 \|\| p->eState==PAGER_READER

	995 \|\| p->eState==PAGER_WRITER_LOCKED

	996 \|\| p->eState==PAGER_WRITER_CACHEMOD

	997 \|\| p->eState==PAGER_WRITER_DBMOD

	998 \|\| p->eState==PAGER_WRITER_FINISHED

	999 \|\| p->eState==PAGER_ERROR

	1000 );

	1001

	1002 /* Regardless of the current state, a temp-file connection always behaves

	1003 ** as if it has an exclusive lock on the database file. It never updates

	1004 ** the change-counter field, so the changeCountDone flag is always set.

	1005 */

	1006 assert( p->tempFile==0 \|\| p->eLock==EXCLUSIVE_LOCK );

	1007 assert( p->tempFile==0 \|\| pPager->changeCountDone );

	1008

	1009 /* If the useJournal flag is clear, the journal-mode must be "OFF".

	1010 ** And if the journal-mode is "OFF", the journal file must not be open.

	1011 */

	1012 assert( p->journalMode==PAGER_JOURNALMODE_OFF \|\| p->useJournal );

	1013 assert( p->journalMode!=PAGER_JOURNALMODE_OFF \|\| !isOpen(p->jfd) );

	1014

	1015 /* Check that MEMDB implies noSync. And an in-memory journal. Since

	1016 ** this means an in-memory pager performs no IO at all, it cannot encounter

	1017 ** either SQLITE_IOERR or SQLITE_FULL during rollback or while finalizing

	1018 ** a journal file. (although the in-memory journal implementation may

	1019 ** return SQLITE_IOERR_NOMEM while the journal file is being written). It

	1020 ** is therefore not possible for an in-memory pager to enter the ERROR

	1021 ** state.

	1022 */

	1023 if( MEMDB ){

	1024 assert( p->noSync );

	1025 assert( p->journalMode==PAGER_JOURNALMODE_OFF

	1026 \|\| p->journalMode==PAGER_JOURNALMODE_MEMORY

	1027 );

	1028 assert( p->eState!=PAGER_ERROR && p->eState!=PAGER_OPEN );

	1029 assert( pagerUseWal(p)==0 );

	1030 }

	1031

	1032 /* If changeCountDone is set, a RESERVED lock or greater must be held

	1033 ** on the file.

	1034 */

	1035 assert( pPager->changeCountDone==0 \|\| pPager->eLock>=RESERVED_LOCK );

	1036 assert( p->eLock!=PENDING_LOCK );

	1037

	1038 switch( p->eState ){

	1039 case PAGER_OPEN:

	1040 assert( !MEMDB );

	1041 assert( pPager->errCode==SQLITE_OK );

	1042 assert( sqlite3PcacheRefCount(pPager->pPCache)==0 \|\| pPager->tempFile );

	1043 break;

	1044

	1045 case PAGER_READER:

	1046 assert( pPager->errCode==SQLITE_OK );

	1047 assert( p->eLock!=UNKNOWN_LOCK );

	1048 assert( p->eLock>=SHARED_LOCK );

	1049 break;

	1050

	1051 case PAGER_WRITER_LOCKED:

	1052 assert( p->eLock!=UNKNOWN_LOCK );

	1053 assert( pPager->errCode==SQLITE_OK );

	1054 if( !pagerUseWal(pPager) ){

	1055 assert( p->eLock>=RESERVED_LOCK );

	1056 }

	1057 assert( pPager->dbSize==pPager->dbOrigSize );

	1058 assert( pPager->dbOrigSize==pPager->dbFileSize );

	1059 assert( pPager->dbOrigSize==pPager->dbHintSize );

	1060 assert( pPager->setMaster==0 );

	1061 break;

	1062

	1063 case PAGER_WRITER_CACHEMOD:

	1064 assert( p->eLock!=UNKNOWN_LOCK );

	1065 assert( pPager->errCode==SQLITE_OK );

	1066 if( !pagerUseWal(pPager) ){

	1067 /* It is possible that if journal_mode=wal here that neither the

	1068 ** journal file nor the WAL file are open. This happens during

	1069 ** a rollback transaction that switches from journal_mode=off

	1070 ** to journal_mode=wal.

	1071 */

	1072 assert( p->eLock>=RESERVED_LOCK );

	1073 assert( isOpen(p->jfd)

	1074 \|\| p->journalMode==PAGER_JOURNALMODE_OFF

	1075 \|\| p->journalMode==PAGER_JOURNALMODE_WAL

	1076 );

	1077 }

	1078 assert( pPager->dbOrigSize==pPager->dbFileSize );

	1079 assert( pPager->dbOrigSize==pPager->dbHintSize );

	1080 break;

	1081

	1082 case PAGER_WRITER_DBMOD:

	1083 assert( p->eLock==EXCLUSIVE_LOCK );

	1084 assert( pPager->errCode==SQLITE_OK );

	1085 assert( !pagerUseWal(pPager) );

	1086 assert( p->eLock>=EXCLUSIVE_LOCK );

	1087 assert( isOpen(p->jfd)

	1088 \|\| p->journalMode==PAGER_JOURNALMODE_OFF

	1089 \|\| p->journalMode==PAGER_JOURNALMODE_WAL

	1090 );

	1091 assert( pPager->dbOrigSize<=pPager->dbHintSize );

	1092 break;

	1093

	1094 case PAGER_WRITER_FINISHED:

	1095 assert( p->eLock==EXCLUSIVE_LOCK );

	1096 assert( pPager->errCode==SQLITE_OK );

	1097 assert( !pagerUseWal(pPager) );

	1098 assert( isOpen(p->jfd)

	1099 \|\| p->journalMode==PAGER_JOURNALMODE_OFF

	1100 \|\| p->journalMode==PAGER_JOURNALMODE_WAL

	1101 );

	1102 break;

	1103

	1104 case PAGER_ERROR:

	1105 /* There must be at least one outstanding reference to the pager if

	1106 ** in ERROR state. Otherwise the pager should have already dropped

	1107 ** back to OPEN state.

	1108 */

	1109 assert( pPager->errCode!=SQLITE_OK );

	1110 assert( sqlite3PcacheRefCount(pPager->pPCache)>0 );

	1111 break;

	1112 }

	1113

	1114 return 1;

	1115 }

	1116 #endif /* ifndef NDEBUG */

	1117

	#ifdef SQLITE_DEBUG
	/*
	** Return a pointer to a human readable string in a static buffer
	** containing the state of the Pager object passed as an argument. This
	** is intended to be used within debuggers. For example, as an alternative
	** to "print *pPager" in gdb:
	**
	** (gdb) printf "%s", print_pager_state(pPager)
	**
	** The returned buffer is static, so each call overwrites the text
	** produced by the previous call.
	*/
	static char *print_pager_state(Pager *p){
	  static char zRet[1024];

	  /* Size the output by the buffer itself so the two cannot drift apart. */
	  sqlite3_snprintf((int)sizeof(zRet), zRet,
	      "Filename: %s\n"
	      "State: %s errCode=%d\n"
	      "Lock: %s\n"
	      "Locking mode: locking_mode=%s\n"
	      "Journal mode: journal_mode=%s\n"
	      "Backing store: tempFile=%d memDb=%d useJournal=%d\n"
	      "Journal: journalOff=%lld journalHdr=%lld\n"
	      "Size: dbsize=%d dbOrigSize=%d dbFileSize=%d\n"
	      , p->zFilename
	      , p->eState==PAGER_OPEN ? "OPEN" :
	        p->eState==PAGER_READER ? "READER" :
	        p->eState==PAGER_WRITER_LOCKED ? "WRITER_LOCKED" :
	        p->eState==PAGER_WRITER_CACHEMOD ? "WRITER_CACHEMOD" :
	        p->eState==PAGER_WRITER_DBMOD ? "WRITER_DBMOD" :
	        p->eState==PAGER_WRITER_FINISHED ? "WRITER_FINISHED" :
	        p->eState==PAGER_ERROR ? "ERROR" : "?error?"
	      , (int)p->errCode
	      , p->eLock==NO_LOCK ? "NO_LOCK" :
	        p->eLock==RESERVED_LOCK ? "RESERVED" :
	        p->eLock==EXCLUSIVE_LOCK ? "EXCLUSIVE" :
	        p->eLock==SHARED_LOCK ? "SHARED" :
	        p->eLock==UNKNOWN_LOCK ? "UNKNOWN" : "?error?"
	      , p->exclusiveMode ? "exclusive" : "normal"
	      , p->journalMode==PAGER_JOURNALMODE_MEMORY ? "memory" :
	        p->journalMode==PAGER_JOURNALMODE_OFF ? "off" :
	        p->journalMode==PAGER_JOURNALMODE_DELETE ? "delete" :
	        p->journalMode==PAGER_JOURNALMODE_PERSIST ? "persist" :
	        p->journalMode==PAGER_JOURNALMODE_TRUNCATE ? "truncate" :
	        p->journalMode==PAGER_JOURNALMODE_WAL ? "wal" : "?error?"
	      , (int)p->tempFile, (int)p->memDb, (int)p->useJournal
	      , p->journalOff, p->journalHdr
	      , (int)p->dbSize, (int)p->dbOrigSize, (int)p->dbFileSize
	  );

	  return zRet;
	}
	#endif

	1168

	1169 /*

	1170 ** Return true if it is necessary to write page *pPg into the sub-journal.

	1171 ** A page needs to be written into the sub-journal if there exists one

	1172 ** or more open savepoints for which:

	1173 **

	1174 ** * The page-number is less than or equal to PagerSavepoint.nOrig, and

	1175 ** * The bit corresponding to the page-number is not set in

	1176 ** PagerSavepoint.pInSavepoint.

	1177 */

	1178 static int subjRequiresPage(PgHdr *pPg){

	1179 Pager *pPager = pPg->pPager;

	1180 PagerSavepoint *p;

	1181 Pgno pgno = pPg->pgno;

	1182 int i;

	1183 for(i=0; i<pPager->nSavepoint; i++){

	1184 p = &pPager->aSavepoint[i];

	1185 if( p->nOrig>=pgno && 0==sqlite3BitvecTestNotNull(p->pInSavepoint, pgno) ){

	1186 return 1;

	1187 }

	1188 }

	1189 return 0;

	1190 }

	1191

	#ifdef SQLITE_DEBUG
	/*
	** Return true if the page is already in the journal file, i.e. if the
	** bit for pPg->pgno is set in the pPager->pInJournal bitvec.
	*/
	static int pageInJournal(Pager *pPager, PgHdr *pPg){
	  return sqlite3BitvecTest(pPager->pInJournal, pPg->pgno);
	}
	#endif

	1200

	1201 /*

	1202 ** Read a 32-bit integer from the given file descriptor. Store the integer

	1203 ** that is read in *pRes. Return SQLITE_OK if everything worked, or an

	1204 ** error code is something goes wrong.

	1205 **

	1206 ** All values are stored on disk as big-endian.

	1207 */

	1208 static int read32bits(sqlite3_file fd, i64 offset, u32 pRes){

	1209 unsigned char ac[4];

	1210 int rc = sqlite3OsRead(fd, ac, sizeof(ac), offset);

	1211 if( rc==SQLITE_OK ){

	1212 *pRes = sqlite3Get4byte(ac);

	1213 }

	1214 return rc;

	1215 }

	1216

	/*
	** Write a 32-bit integer into a string buffer in big-endian byte order.
	** A is the destination buffer and B the value.  Both arguments are
	** parenthesized so that arbitrary expressions (for example "buf+4")
	** are cast and passed as a whole.
	*/
	#define put32bits(A,B) sqlite3Put4byte((u8*)(A),(B))

	1221

	1222

	1223 /*

	1224 ** Write a 32-bit integer into the given file descriptor. Return SQLITE_OK

	1225 ** on success or an error code is something goes wrong.

	1226 */

	1227 static int write32bits(sqlite3_file *fd, i64 offset, u32 val){

	1228 char ac[4];

	1229 put32bits(ac, val);

	1230 return sqlite3OsWrite(fd, ac, 4, offset);

	1231 }

	1232

	1233 /*

	1234 ** Unlock the database file to level eLock, which must be either NO_LOCK

	1235 ** or SHARED_LOCK. Regardless of whether or not the call to xUnlock()

	1236 ** succeeds, set the Pager.eLock variable to match the (attempted) new lock.

	1237 **

	1238 ** Except, if Pager.eLock is set to UNKNOWN_LOCK when this function is

	1239 ** called, do not modify it. See the comment above the #define of

	1240 ** UNKNOWN_LOCK for an explanation of this.

	1241 */

	1242 static int pagerUnlockDb(Pager *pPager, int eLock){

	1243 int rc = SQLITE_OK;

	1244

	1245 assert( !pPager->exclusiveMode \|\| pPager->eLock==eLock );

	1246 assert( eLock==NO_LOCK \|\| eLock==SHARED_LOCK );

	1247 assert( eLock!=NO_LOCK \|\| pagerUseWal(pPager)==0 );

	1248 if( isOpen(pPager->fd) ){

	1249 assert( pPager->eLock>=eLock );

	1250 rc = pPager->noLock ? SQLITE_OK : sqlite3OsUnlock(pPager->fd, eLock);

	1251 if( pPager->eLock!=UNKNOWN_LOCK ){

	1252 pPager->eLock = (u8)eLock;

	1253 }

	1254 IOTRACE(("UNLOCK %p %d\n", pPager, eLock))

	1255 }

	1256 return rc;

	1257 }

	1258

	1259 /*

	1260 ** Lock the database file to level eLock, which must be either SHARED_LOCK,

	1261 ** RESERVED_LOCK or EXCLUSIVE_LOCK. If the caller is successful, set the

	1262 ** Pager.eLock variable to the new locking state.

	1263 **

	1264 ** Except, if Pager.eLock is set to UNKNOWN_LOCK when this function is

	1265 ** called, do not modify it unless the new locking state is EXCLUSIVE_LOCK.

	1266 ** See the comment above the #define of UNKNOWN_LOCK for an explanation

	1267 ** of this.

	1268 */

	1269 static int pagerLockDb(Pager *pPager, int eLock){

	1270 int rc = SQLITE_OK;

	1271

	1272 assert( eLock==SHARED_LOCK \|\| eLock==RESERVED_LOCK \|\| eLock==EXCLUSIVE_LOCK );

	1273 if( pPager->eLock<eLock \|\| pPager->eLock==UNKNOWN_LOCK ){

	1274 rc = pPager->noLock ? SQLITE_OK : sqlite3OsLock(pPager->fd, eLock);

	1275 if( rc==SQLITE_OK && (pPager->eLock!=UNKNOWN_LOCK\|\|eLock==EXCLUSIVE_LOCK) ){

	1276 pPager->eLock = (u8)eLock;

	1277 IOTRACE(("LOCK %p %d\n", pPager, eLock))

	1278 }

	1279 }

	1280 return rc;

	1281 }

	1282

	/*
	** This function determines whether or not the atomic-write optimization
	** can be used with this pager. The optimization can be used if:
	**
	**  (a) the value returned by OsDeviceCharacteristics() indicates that
	**      a database page may be written atomically, and
	**  (b) the value returned by OsSectorSize() is less than or equal
	**      to the page size.
	**
	** The optimization is also always enabled for temporary files. It is
	** an error to call this function if pPager is opened on an in-memory
	** database.
	**
	** If the optimization cannot be used, 0 is returned. If it can be used,
	** then the value returned is the size of the journal file when it
	** contains rollback data for exactly one page.
	*/
	#ifdef SQLITE_ENABLE_ATOMIC_WRITE
	static int jrnlBufferSize(Pager *pPager){
	  assert( !MEMDB );
	  if( !pPager->tempFile ){
	    int dc;       /* Device characteristics */
	    int nSector;  /* Sector size */
	    int szPage;   /* Page size */

	    assert( isOpen(pPager->fd) );
	    dc = sqlite3OsDeviceCharacteristics(pPager->fd);
	    nSector = pPager->sectorSize;
	    szPage = pPager->pageSize;

	    /* The SQLITE_IOCAP_ATOMICnnn bit for a page of nnn bytes is nnn>>8. */
	    assert(SQLITE_IOCAP_ATOMIC512==(512>>8));
	    assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8));

	    /* Reject the optimization if EITHER condition fails: the device cannot
	    ** write a page of this size atomically (a), or a sector is larger than
	    ** a page (b).  The closing parenthesis of the mask test must come
	    ** before the "||"; otherwise the test reads 0==(A || B) and a sector
	    ** larger than the page would wrongly enable the optimization. */
	    if( 0==(dc&(SQLITE_IOCAP_ATOMIC|(szPage>>8))) || nSector>szPage ){
	      return 0;
	    }
	  }

	  return JOURNAL_HDR_SZ(pPager) + JOURNAL_PG_SZ(pPager);
	}
	#endif

	1323

	1324 /*

	1325 ** If SQLITE_CHECK_PAGES is defined then we do some sanity checking

	1326 ** on the cache using a hash function. This is used for testing

	1327 ** and debugging only.

	1328 */

	1329 #ifdef SQLITE_CHECK_PAGES

	1330 /*

	1331 ** Return a 32-bit hash of the page data for pPage.

	1332 */

	1333 static u32 pager_datahash(int nByte, unsigned char *pData){

	1334 u32 hash = 0;

	1335 int i;

	1336 for(i=0; i<nByte; i++){

	1337 hash = (hash*1039) + pData[i];

	1338 }

	1339 return hash;

	1340 }

	1341 static u32 pager_pagehash(PgHdr *pPage){

	1342 return pager_datahash(pPage->pPager->pageSize, (unsigned char *)pPage->pData);

	1343 }

	1344 static void pager_set_pagehash(PgHdr *pPage){

	1345 pPage->pageHash = pager_pagehash(pPage);

	1346 }

	1347

	1348 /*

	1349 ** The CHECK_PAGE macro takes a PgHdr* as an argument. If SQLITE_CHECK_PAGES

	1350 ** is defined, and NDEBUG is not defined, an assert() statement checks

	1351 ** that the page is either dirty or still matches the calculated page-hash.

	1352 */

	1353 #define CHECK_PAGE(x) checkPage(x)

	1354 static void checkPage(PgHdr *pPg){

	1355 Pager *pPager = pPg->pPager;

	1356 assert( pPager->eState!=PAGER_ERROR );

	1357 assert( (pPg->flags&PGHDR_DIRTY) \|\| pPg->pageHash==pager_pagehash(pPg) );

	1358 }

	1359

	1360 #else

	1361 #define pager_datahash(X,Y) 0

	1362 #define pager_pagehash(X) 0

	1363 #define pager_set_pagehash(X)

	1364 #define CHECK_PAGE(x)

	1365 #endif /* SQLITE_CHECK_PAGES */

	1366

	1367 /*

	1368 ** When this is called the journal file for pager pPager must be open.

	1369 ** This function attempts to read a master journal file name from the

	1370 ** end of the file and, if successful, copies it into memory supplied

	1371 ** by the caller. See comments above writeMasterJournal() for the format

	1372 ** used to store a master journal file name at the end of a journal file.

	1373 **

	1374 ** zMaster must point to a buffer of at least nMaster bytes allocated by

	1375 ** the caller. This should be sqlite3_vfs.mxPathname+1 (to ensure there is

	1376 ** enough space to write the master journal name). If the master journal

	1377 ** name in the journal is longer than nMaster bytes (including a

	1378 ** nul-terminator), then this is handled as if no master journal name

	1379 ** were present in the journal.

	1380 **

	1381 ** If a master journal file name is present at the end of the journal

	1382 ** file, then it is copied into the buffer pointed to by zMaster. A

	1383 ** nul-terminator byte is appended to the buffer following the master

	1384 ** journal file name.

	1385 **

	1386 ** If it is determined that no master journal file name is present

	1387 ** zMaster[0] is set to 0 and SQLITE_OK returned.

	1388 **

	1389 ** If an error occurs while reading from the journal file, an SQLite

	1390 ** error code is returned.

	1391 */

	1392 static int readMasterJournal(sqlite3_file pJrnl, char zMaster, u32 nMaster){

	1393 int rc; /* Return code */

	1394 u32 len; /* Length in bytes of master journal name */

	1395 i64 szJ; /* Total size in bytes of journal file pJrnl */

	1396 u32 cksum; /* MJ checksum value read from journal */

	1397 u32 u; /* Unsigned loop counter */

	1398 unsigned char aMagic[8]; /* A buffer to hold the magic header */

	1399 zMaster[0] = '\0';

	1400

	1401 if( SQLITE_OK!=(rc = sqlite3OsFileSize(pJrnl, &szJ))

	1402 \|\| szJ<16

	1403 \|\| SQLITE_OK!=(rc = read32bits(pJrnl, szJ-16, &len))

	1404 \|\| len>=nMaster

	1405 \|\| len==0

	1406 \|\| SQLITE_OK!=(rc = read32bits(pJrnl, szJ-12, &cksum))

	1407 \|\| SQLITE_OK!=(rc = sqlite3OsRead(pJrnl, aMagic, 8, szJ-8))

	1408 \|\| memcmp(aMagic, aJournalMagic, 8)

	1409 \|\| SQLITE_OK!=(rc = sqlite3OsRead(pJrnl, zMaster, len, szJ-16-len))

	1410 ){

	1411 return rc;

	1412 }

	1413

	1414 /* See if the checksum matches the master journal name */

	1415 for(u=0; u<len; u++){

	1416 cksum -= zMaster[u];

	1417 }

	1418 if( cksum ){

	1419 /* If the checksum doesn't add up, then one or more of the disk sectors

	1420 ** containing the master journal filename is corrupted. This means

	1421 ** definitely roll back, so just return SQLITE_OK and report a (nul)

	1422 ** master-journal filename.

	1423 */

	1424 len = 0;

	1425 }

	1426 zMaster[len] = '\0';

	1427

	1428 return SQLITE_OK;

	1429 }

	1430

	1431 /*

	1432 ** Return the offset of the sector boundary at or immediately

	1433 ** following the value in pPager->journalOff, assuming a sector

	1434 ** size of pPager->sectorSize bytes.

	1435 **

	1436 ** i.e for a sector size of 512:

	1437 **

	1438 ** Pager.journalOff Return value

	1439 ** ---------------------------------------

	1440 ** 0 0

	1441 ** 512 512

	1442 ** 100 512

	1443 ** 2000 2048

	1444 **

	1445 */

	1446 static i64 journalHdrOffset(Pager *pPager){

	1447 i64 offset = 0;

	1448 i64 c = pPager->journalOff;

	1449 if( c ){

	1450 offset = ((c-1)/JOURNAL_HDR_SZ(pPager) + 1) * JOURNAL_HDR_SZ(pPager);

	1451 }

	1452 assert( offset%JOURNAL_HDR_SZ(pPager)==0 );

	1453 assert( offset>=c );

	1454 assert( (offset-c)<JOURNAL_HDR_SZ(pPager) );

	1455 return offset;

	1456 }

	1457

	1458 /*

	1459 ** The journal file must be open when this function is called.

	1460 **

	1461 ** This function is a no-op if the journal file has not been written to

	1462 ** within the current transaction (i.e. if Pager.journalOff==0).

	1463 **

	1464 ** If doTruncate is non-zero or the Pager.journalSizeLimit variable is

	1465 ** set to 0, then truncate the journal file to zero bytes in size. Otherwise,

	1466 ** zero the 28-byte header at the start of the journal file. In either case,

	1467 ** if the pager is not in no-sync mode, sync the journal file immediately

	1468 ** after writing or truncating it.

	1469 **

	1470 ** If Pager.journalSizeLimit is set to a positive, non-zero value, and

	1471 ** following the truncation or zeroing described above the size of the

	1472 ** journal file in bytes is larger than this value, then truncate the

	1473 ** journal file to Pager.journalSizeLimit bytes. The journal file does

	1474 ** not need to be synced following this operation.

	1475 **

	1476 ** If an IO error occurs, abandon processing and return the IO error code.

	1477 ** Otherwise, return SQLITE_OK.

	1478 */

	1479 static int zeroJournalHdr(Pager *pPager, int doTruncate){

	1480 int rc = SQLITE_OK; /* Return code */

	1481 assert( isOpen(pPager->jfd) );

	1482 if( pPager->journalOff ){

	1483 const i64 iLimit = pPager->journalSizeLimit; /* Local cache of jsl */

	1484

	1485 IOTRACE(("JZEROHDR %p\n", pPager))

	1486 if( doTruncate \|\| iLimit==0 ){

	1487 rc = sqlite3OsTruncate(pPager->jfd, 0);

	1488 }else{

	1489 static const char zeroHdr[28] = {0};

	1490 rc = sqlite3OsWrite(pPager->jfd, zeroHdr, sizeof(zeroHdr), 0);

	1491 }

	1492 if( rc==SQLITE_OK && !pPager->noSync ){

	1493 rc = sqlite3OsSync(pPager->jfd, SQLITE_SYNC_DATAONLY\|pPager->syncFlags);

	1494 }

	1495

	1496 /* At this point the transaction is committed but the write lock

	1497 ** is still held on the file. If there is a size limit configured for

	1498 ** the persistent journal and the journal file currently consumes more

	1499 ** space than that limit allows for, truncate it now. There is no need

	1500 ** to sync the file following this operation.

	1501 */

	1502 if( rc==SQLITE_OK && iLimit>0 ){

	1503 i64 sz;

	1504 rc = sqlite3OsFileSize(pPager->jfd, &sz);

	1505 if( rc==SQLITE_OK && sz>iLimit ){

	1506 rc = sqlite3OsTruncate(pPager->jfd, iLimit);

	1507 }

	1508 }

	1509 }

	1510 return rc;

	1511 }

	1512

	1513 /*

	1514 ** The journal file must be open when this routine is called. A journal

	1515 ** header (JOURNAL_HDR_SZ bytes) is written into the journal file at the

	1516 ** current location.

	1517 **

	1518 ** The format for the journal header is as follows:

	1519 ** - 8 bytes: Magic identifying journal format.

	1520 ** - 4 bytes: Number of records in journal, or -1 no-sync mode is on.

	1521 ** - 4 bytes: Random number used for page hash.

	1522 ** - 4 bytes: Initial database page count.

	1523 ** - 4 bytes: Sector size used by the process that wrote this journal.

	1524 ** - 4 bytes: Database page size.

	1525 **

	1526 ** Followed by (JOURNAL_HDR_SZ - 28) bytes of unused space.

	1527 */

	1528 static int writeJournalHdr(Pager *pPager){

	1529 int rc = SQLITE_OK; /* Return code */

	1530 char zHeader = pPager->pTmpSpace; / Temporary space used to build header */

	1531 u32 nHeader = (u32)pPager->pageSize;/* Size of buffer pointed to by zHeader */

	1532 u32 nWrite; /* Bytes of header sector written */

	1533 int ii; /* Loop counter */

	1534

	1535 assert( isOpen(pPager->jfd) ); /* Journal file must be open. */

	1536

	1537 if( nHeader>JOURNAL_HDR_SZ(pPager) ){

	1538 nHeader = JOURNAL_HDR_SZ(pPager);

	1539 }

	1540

	1541 /* If there are active savepoints and any of them were created

	1542 ** since the most recent journal header was written, update the

	1543 ** PagerSavepoint.iHdrOffset fields now.

	1544 */

	1545 for(ii=0; ii<pPager->nSavepoint; ii++){

	1546 if( pPager->aSavepoint[ii].iHdrOffset==0 ){

	1547 pPager->aSavepoint[ii].iHdrOffset = pPager->journalOff;

	1548 }

	1549 }

	1550

	1551 pPager->journalHdr = pPager->journalOff = journalHdrOffset(pPager);

	1552

	1553 /*

	1554 ** Write the nRec Field - the number of page records that follow this

	1555 ** journal header. Normally, zero is written to this value at this time.

	1556 ** After the records are added to the journal (and the journal synced,

	1557 ** if in full-sync mode), the zero is overwritten with the true number

	1558 ** of records (see syncJournal()).

	1559 **

	1560 ** A faster alternative is to write 0xFFFFFFFF to the nRec field. When

	1561 ** reading the journal this value tells SQLite to assume that the

	1562 ** rest of the journal file contains valid page records. This assumption

	1563 ** is dangerous, as if a failure occurred whilst writing to the journal

	1564 ** file it may contain some garbage data. There are two scenarios

	1565 ** where this risk can be ignored:

	1566 **

	1567 ** * When the pager is in no-sync mode. Corruption can follow a

	1568 ** power failure in this case anyway.

	1569 **

	1570 ** * When the SQLITE_IOCAP_SAFE_APPEND flag is set. This guarantees

	1571 ** that garbage data is never appended to the journal file.

	1572 */

	1573 assert( isOpen(pPager->fd) \|\| pPager->noSync );

	1574 if( pPager->noSync \|\| (pPager->journalMode==PAGER_JOURNALMODE_MEMORY)

	1575 \|\| (sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND)

	1576 ){

	1577 memcpy(zHeader, aJournalMagic, sizeof(aJournalMagic));

	1578 put32bits(&zHeader[sizeof(aJournalMagic)], 0xffffffff);

	1579 }else{

	1580 memset(zHeader, 0, sizeof(aJournalMagic)+4);

	1581 }

	1582

	1583 /* The random check-hash initializer */

	1584 sqlite3_randomness(sizeof(pPager->cksumInit), &pPager->cksumInit);

	1585 put32bits(&zHeader[sizeof(aJournalMagic)+4], pPager->cksumInit);

	1586 /* The initial database size */

	1587 put32bits(&zHeader[sizeof(aJournalMagic)+8], pPager->dbOrigSize);

	1588 /* The assumed sector size for this process */

	1589 put32bits(&zHeader[sizeof(aJournalMagic)+12], pPager->sectorSize);

	1590

	1591 /* The page size */

	1592 put32bits(&zHeader[sizeof(aJournalMagic)+16], pPager->pageSize);

	1593

	1594 /* Initializing the tail of the buffer is not necessary. Everything

	1595 ** works find if the following memset() is omitted. But initializing

	1596 ** the memory prevents valgrind from complaining, so we are willing to

	1597 ** take the performance hit.

	1598 */

	1599 memset(&zHeader[sizeof(aJournalMagic)+20], 0,

	1600 nHeader-(sizeof(aJournalMagic)+20));

	1601

	1602 /* In theory, it is only necessary to write the 28 bytes that the

	1603 ** journal header consumes to the journal file here. Then increment the

	1604 ** Pager.journalOff variable by JOURNAL_HDR_SZ so that the next

	1605 ** record is written to the following sector (leaving a gap in the file

	1606 ** that will be implicitly filled in by the OS).

	1607 **

	1608 ** However it has been discovered that on some systems this pattern can

	1609 ** be significantly slower than contiguously writing data to the file,

	1610 ** even if that means explicitly writing data to the block of

	1611 ** (JOURNAL_HDR_SZ - 28) bytes that will not be used. So that is what

	1612 ** is done.

	1613 **

	1614 ** The loop is required here in case the sector-size is larger than the

	1615 ** database page size. Since the zHeader buffer is only Pager.pageSize

	1616 ** bytes in size, more than one call to sqlite3OsWrite() may be required

	1617 ** to populate the entire journal header sector.

	1618 */

	1619 for(nWrite=0; rc==SQLITE_OK&&nWrite<JOURNAL_HDR_SZ(pPager); nWrite+=nHeader){

	1620 IOTRACE(("JHDR %p %lld %d\n", pPager, pPager->journalHdr, nHeader))

	1621 rc = sqlite3OsWrite(pPager->jfd, zHeader, nHeader, pPager->journalOff);

	1622 assert( pPager->journalHdr <= pPager->journalOff );

	1623 pPager->journalOff += nHeader;

	1624 }

	1625

	1626 return rc;

	1627 }

	1628

	1629 /*

	1630 ** The journal file must be open when this is called. A journal header file

	1631 ** (JOURNAL_HDR_SZ bytes) is read from the current location in the journal

	1632 ** file. The current location in the journal file is given by

	1633 ** pPager->journalOff. See comments above function writeJournalHdr() for

	1634 ** a description of the journal header format.

	1635 **

	1636 ** If the header is read successfully, *pNRec is set to the number of

	1637 ** page records following this header and *pDbSize is set to the size of the

	1638 ** database before the transaction began, in pages. Also, pPager->cksumInit

	1639 ** is set to the value read from the journal header. SQLITE_OK is returned

	1640 ** in this case.

	1641 **

	1642 ** If the journal header file appears to be corrupted, SQLITE_DONE is

	1643 ** returned and pNRec and PDbSize are undefined. If JOURNAL_HDR_SZ bytes

	1644 ** cannot be read from the journal file an error code is returned.

	1645 */

	1646 static int readJournalHdr(

	1647 Pager pPager, / Pager object */

	1648 int isHot,

	1649 i64 journalSize, /* Size of the open journal file in bytes */

	1650 u32 pNRec, / OUT: Value read from the nRec field */

	1651 u32 pDbSize / OUT: Value of original database size field */

	1652 ){

	1653 int rc; /* Return code */

	1654 unsigned char aMagic[8]; /* A buffer to hold the magic header */

	1655 i64 iHdrOff; /* Offset of journal header being read */

	1656

	1657 assert( isOpen(pPager->jfd) ); /* Journal file must be open. */

	1658

	1659 /* Advance Pager.journalOff to the start of the next sector. If the

	1660 ** journal file is too small for there to be a header stored at this

	1661 ** point, return SQLITE_DONE.

	1662 */

	1663 pPager->journalOff = journalHdrOffset(pPager);

	1664 if( pPager->journalOff+JOURNAL_HDR_SZ(pPager) > journalSize ){

	1665 return SQLITE_DONE;

	1666 }

	1667 iHdrOff = pPager->journalOff;

	1668

	1669 /* Read in the first 8 bytes of the journal header. If they do not match

	1670 ** the magic string found at the start of each journal header, return

	1671 ** SQLITE_DONE. If an IO error occurs, return an error code. Otherwise,

	1672 ** proceed.

	1673 */

	1674 if( isHot \|\| iHdrOff!=pPager->journalHdr ){

	1675 rc = sqlite3OsRead(pPager->jfd, aMagic, sizeof(aMagic), iHdrOff);

	1676 if( rc ){

	1677 return rc;

	1678 }

	1679 if( memcmp(aMagic, aJournalMagic, sizeof(aMagic))!=0 ){

	1680 return SQLITE_DONE;

	1681 }

	1682 }

	1683

	1684 /* Read the first three 32-bit fields of the journal header: The nRec

	1685 ** field, the checksum-initializer and the database size at the start

	1686 ** of the transaction. Return an error code if anything goes wrong.

	1687 */

	1688 if( SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+8, pNRec))

	1689 \|\| SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+12, &pPager->cksumInit))

	1690 \|\| SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+16, pDbSize))

	1691 ){

	1692 return rc;

	1693 }

	1694

	1695 if( pPager->journalOff==0 ){

	1696 u32 iPageSize; /* Page-size field of journal header */

	1697 u32 iSectorSize; /* Sector-size field of journal header */

	1698

	1699 /* Read the page-size and sector-size journal header fields. */

	1700 if( SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+20, &iSectorSize))

	1701 \|\| SQLITE_OK!=(rc = read32bits(pPager->jfd, iHdrOff+24, &iPageSize))

	1702 ){

	1703 return rc;

	1704 }

	1705

	1706 /* Versions of SQLite prior to 3.5.8 set the page-size field of the

	1707 ** journal header to zero. In this case, assume that the Pager.pageSize

	1708 ** variable is already set to the correct page size.

	1709 */

	1710 if( iPageSize==0 ){

	1711 iPageSize = pPager->pageSize;

	1712 }

	1713

	1714 /* Check that the values read from the page-size and sector-size fields

	1715 ** are within range. To be 'in range', both values need to be a power

	1716 ** of two greater than or equal to 512 or 32, and not greater than their

	1717 ** respective compile time maximum limits.

	1718 */

	1719 if( iPageSize<512 \|\| iSectorSize<32

	1720 \|\| iPageSize>SQLITE_MAX_PAGE_SIZE \|\| iSectorSize>MAX_SECTOR_SIZE

	1721 \|\| ((iPageSize-1)&iPageSize)!=0 \|\| ((iSectorSize-1)&iSectorSize)!=0

	1722 ){

	1723 /* If the either the page-size or sector-size in the journal-header is

	1724 ** invalid, then the process that wrote the journal-header must have

	1725 ** crashed before the header was synced. In this case stop reading

	1726 ** the journal file here.

	1727 */

	1728 return SQLITE_DONE;

	1729 }

	1730

	1731 /* Update the page-size to match the value read from the journal.

	1732 ** Use a testcase() macro to make sure that malloc failure within

	1733 ** PagerSetPagesize() is tested.

	1734 */

	1735 rc = sqlite3PagerSetPagesize(pPager, &iPageSize, -1);

	1736 testcase( rc!=SQLITE_OK );

	1737

	1738 /* Update the assumed sector-size to match the value used by

	1739 ** the process that created this journal. If this journal was

	1740 ** created by a process other than this one, then this routine

	1741 ** is being called from within pager_playback(). The local value

	1742 ** of Pager.sectorSize is restored at the end of that routine.

	1743 */

	1744 pPager->sectorSize = iSectorSize;

	1745 }

	1746

	1747 pPager->journalOff += JOURNAL_HDR_SZ(pPager);

	1748 return rc;

	1749 }

	1750

	1751

	1752 /*

	1753 ** Write the supplied master journal name into the journal file for pager

	1754 ** pPager at the current location. The master journal name must be the last

	1755 ** thing written to a journal file. If the pager is in full-sync mode, the

	1756 ** journal file descriptor is advanced to the next sector boundary before

	1757 ** anything is written. The format is:

	1758 **

	1759 ** + 4 bytes: PAGER_MJ_PGNO.

	1760 ** + N bytes: Master journal filename in utf-8.

	1761 ** + 4 bytes: N (length of master journal name in bytes, no nul-terminator).

	1762 ** + 4 bytes: Master journal name checksum.

	1763 ** + 8 bytes: aJournalMagic[].

	1764 **

	1765 ** The master journal page checksum is the sum of the bytes in the master

	1766 ** journal name, where each byte is interpreted as a signed 8-bit integer.

	1767 **

	1768 ** If zMaster is a NULL pointer (occurs for a single database transaction),

	1769 ** this call is a no-op.

	1770 */

	1771 static int writeMasterJournal(Pager pPager, const char zMaster){

	1772 int rc; /* Return code */

	1773 int nMaster; /* Length of string zMaster */

	1774 i64 iHdrOff; /* Offset of header in journal file */

	1775 i64 jrnlSize; /* Size of journal file on disk */

	1776 u32 cksum = 0; /* Checksum of string zMaster */

	1777

	1778 assert( pPager->setMaster==0 );

	1779 assert( !pagerUseWal(pPager) );

	1780

	1781 if( !zMaster

	1782 \|\| pPager->journalMode==PAGER_JOURNALMODE_MEMORY

	1783 \|\| !isOpen(pPager->jfd)

	1784 ){

	1785 return SQLITE_OK;

	1786 }

	1787 pPager->setMaster = 1;

	1788 assert( pPager->journalHdr <= pPager->journalOff );

	1789

	1790 /* Calculate the length in bytes and the checksum of zMaster */

	1791 for(nMaster=0; zMaster[nMaster]; nMaster++){

	1792 cksum += zMaster[nMaster];

	1793 }

	1794

	1795 /* If in full-sync mode, advance to the next disk sector before writing

	1796 ** the master journal name. This is in case the previous page written to

	1797 ** the journal has already been synced.

	1798 */

	1799 if( pPager->fullSync ){

	1800 pPager->journalOff = journalHdrOffset(pPager);

	1801 }

	1802 iHdrOff = pPager->journalOff;

	1803

	1804 /* Write the master journal data to the end of the journal file. If

	1805 ** an error occurs, return the error code to the caller.

	1806 */

	1807 if( (0 != (rc = write32bits(pPager->jfd, iHdrOff, PAGER_MJ_PGNO(pPager))))

	1808 \|\| (0 != (rc = sqlite3OsWrite(pPager->jfd, zMaster, nMaster, iHdrOff+4)))

	1809 \|\| (0 != (rc = write32bits(pPager->jfd, iHdrOff+4+nMaster, nMaster)))

	1810 \|\| (0 != (rc = write32bits(pPager->jfd, iHdrOff+4+nMaster+4, cksum)))

	1811 \|\| (0 != (rc = sqlite3OsWrite(pPager->jfd, aJournalMagic, 8,

	1812 iHdrOff+4+nMaster+8)))

	1813 ){

	1814 return rc;

	1815 }

	1816 pPager->journalOff += (nMaster+20);

	1817

	1818 /* If the pager is in peristent-journal mode, then the physical

	1819 ** journal-file may extend past the end of the master-journal name

	1820 ** and 8 bytes of magic data just written to the file. This is

	1821 ** dangerous because the code to rollback a hot-journal file

	1822 ** will not be able to find the master-journal name to determine

	1823 ** whether or not the journal is hot.

	1824 **

	1825 ** Easiest thing to do in this scenario is to truncate the journal

	1826 ** file to the required size.

	1827 */

	1828 if( SQLITE_OK==(rc = sqlite3OsFileSize(pPager->jfd, &jrnlSize))

	1829 && jrnlSize>pPager->journalOff

	1830 ){

	1831 rc = sqlite3OsTruncate(pPager->jfd, pPager->journalOff);

	1832 }

	1833 return rc;

	1834 }

	1835

	1836 /*

	1837 ** Discard the entire contents of the in-memory page-cache.

	1838 */

	1839 static void pager_reset(Pager *pPager){

	1840 pPager->iDataVersion++;

	1841 sqlite3BackupRestart(pPager->pBackup);

	1842 sqlite3PcacheClear(pPager->pPCache);

	1843 }

	1844

	1845 /*

	1846 ** Return the pPager->iDataVersion value

	1847 */

	1848 SQLITE_PRIVATE u32 sqlite3PagerDataVersion(Pager *pPager){

	1849 assert( pPager->eState>PAGER_OPEN );

	1850 return pPager->iDataVersion;

	1851 }

	1852

	1853 /*

	1854 ** Free all structures in the Pager.aSavepoint[] array and set both

	1855 ** Pager.aSavepoint and Pager.nSavepoint to zero. Close the sub-journal

	1856 ** if it is open and the pager is not in exclusive mode.

	1857 */

	1858 static void releaseAllSavepoints(Pager *pPager){

	1859 int ii; /* Iterator for looping through Pager.aSavepoint */

	1860 for(ii=0; ii<pPager->nSavepoint; ii++){

	1861 sqlite3BitvecDestroy(pPager->aSavepoint[ii].pInSavepoint);

	1862 }

	1863 if( !pPager->exclusiveMode \|\| sqlite3IsMemJournal(pPager->sjfd) ){

	1864 sqlite3OsClose(pPager->sjfd);

	1865 }

	1866 sqlite3_free(pPager->aSavepoint);

	1867 pPager->aSavepoint = 0;

	1868 pPager->nSavepoint = 0;

	1869 pPager->nSubRec = 0;

	1870 }

	1871

	1872 /*

	1873 ** Set the bit number pgno in the PagerSavepoint.pInSavepoint

	1874 ** bitvecs of all open savepoints. Return SQLITE_OK if successful

	1875 ** or SQLITE_NOMEM if a malloc failure occurs.

	1876 */

	1877 static int addToSavepointBitvecs(Pager *pPager, Pgno pgno){

	1878 int ii; /* Loop counter */

	1879 int rc = SQLITE_OK; /* Result code */

	1880

	1881 for(ii=0; ii<pPager->nSavepoint; ii++){

	1882 PagerSavepoint *p = &pPager->aSavepoint[ii];

	1883 if( pgno<=p->nOrig ){

	1884 rc \|= sqlite3BitvecSet(p->pInSavepoint, pgno);

	1885 testcase( rc==SQLITE_NOMEM );

	1886 assert( rc==SQLITE_OK \|\| rc==SQLITE_NOMEM );

	1887 }

	1888 }

	1889 return rc;

	1890 }

	1891

	1892 /*

	1893 ** This function is a no-op if the pager is in exclusive mode and not

	1894 ** in the ERROR state. Otherwise, it switches the pager to PAGER_OPEN

	1895 ** state.

	1896 **

	1897 ** If the pager is not in exclusive-access mode, the database file is

	1898 ** completely unlocked. If the file is unlocked and the file-system does

	1899 ** not exhibit the UNDELETABLE_WHEN_OPEN property, the journal file is

	1900 ** closed (if it is open).

	1901 **

	1902 ** If the pager is in ERROR state when this function is called, the

	1903 ** contents of the pager cache are discarded before switching back to

	1904 ** the OPEN state. Regardless of whether the pager is in exclusive-mode

	1905 ** or not, any journal file left in the file-system will be treated

	1906 ** as a hot-journal and rolled back the next time a read-transaction

	1907 ** is opened (by this or by any other connection).

	1908 */

	1909 static void pager_unlock(Pager *pPager){

	1910

	1911 assert( pPager->eState==PAGER_READER

	1912 \|\| pPager->eState==PAGER_OPEN

	1913 \|\| pPager->eState==PAGER_ERROR

	1914 );

	1915

	1916 sqlite3BitvecDestroy(pPager->pInJournal);

	1917 pPager->pInJournal = 0;

	1918 releaseAllSavepoints(pPager);

	1919

	1920 if( pagerUseWal(pPager) ){

	1921 assert( !isOpen(pPager->jfd) );

	1922 sqlite3WalEndReadTransaction(pPager->pWal);

	1923 pPager->eState = PAGER_OPEN;

	1924 }else if( !pPager->exclusiveMode ){

	1925 int rc; /* Error code returned by pagerUnlockDb() */

	1926 int iDc = isOpen(pPager->fd)?sqlite3OsDeviceCharacteristics(pPager->fd):0;

	1927

	1928 /* If the operating system support deletion of open files, then

	1929 ** close the journal file when dropping the database lock. Otherwise

	1930 ** another connection with journal_mode=delete might delete the file

	1931 ** out from under us.

	1932 */

	1933 assert( (PAGER_JOURNALMODE_MEMORY & 5)!=1 );

	1934 assert( (PAGER_JOURNALMODE_OFF & 5)!=1 );

	1935 assert( (PAGER_JOURNALMODE_WAL & 5)!=1 );

	1936 assert( (PAGER_JOURNALMODE_DELETE & 5)!=1 );

	1937 assert( (PAGER_JOURNALMODE_TRUNCATE & 5)==1 );

	1938 assert( (PAGER_JOURNALMODE_PERSIST & 5)==1 );

	1939 if( 0==(iDc & SQLITE_IOCAP_UNDELETABLE_WHEN_OPEN)

	1940 \|\| 1!=(pPager->journalMode & 5)

	1941 ){

	1942 sqlite3OsClose(pPager->jfd);

	1943 }

	1944

	1945 /* If the pager is in the ERROR state and the call to unlock the database

	1946 ** file fails, set the current lock to UNKNOWN_LOCK. See the comment

	1947 ** above the #define for UNKNOWN_LOCK for an explanation of why this

	1948 ** is necessary.

	1949 */

	1950 rc = pagerUnlockDb(pPager, NO_LOCK);

	1951 if( rc!=SQLITE_OK && pPager->eState==PAGER_ERROR ){

	1952 pPager->eLock = UNKNOWN_LOCK;

	1953 }

	1954

	1955 /* The pager state may be changed from PAGER_ERROR to PAGER_OPEN here

	1956 ** without clearing the error code. This is intentional - the error

	1957 ** code is cleared and the cache reset in the block below.

	1958 */

	1959 assert( pPager->errCode \|\| pPager->eState!=PAGER_ERROR );

	1960 pPager->changeCountDone = 0;

	1961 pPager->eState = PAGER_OPEN;

	1962 }

	1963

	1964 /* If Pager.errCode is set, the contents of the pager cache cannot be

	1965 ** trusted. Now that there are no outstanding references to the pager,

	1966 ** it can safely move back to PAGER_OPEN state. This happens in both

	1967 ** normal and exclusive-locking mode.

	1968 */

	1969 if( pPager->errCode ){

	1970 assert( !MEMDB );

	1971 pager_reset(pPager);

	1972 pPager->changeCountDone = pPager->tempFile;

	1973 pPager->eState = PAGER_OPEN;

	1974 pPager->errCode = SQLITE_OK;

	1975 if( USEFETCH(pPager) ) sqlite3OsUnfetch(pPager->fd, 0, 0);

	1976 }

	1977

	1978 pPager->journalOff = 0;

	1979 pPager->journalHdr = 0;

	1980 pPager->setMaster = 0;

	1981 }

	1982

	1983 /*

	1984 ** This function is called whenever an IOERR or FULL error that requires

	1985 ** the pager to transition into the ERROR state may ahve occurred.

	1986 ** The first argument is a pointer to the pager structure, the second

	1987 ** the error-code about to be returned by a pager API function. The

	1988 ** value returned is a copy of the second argument to this function.

	1989 **

	1990 ** If the second argument is SQLITE_FULL, SQLITE_IOERR or one of the

	1991 ** IOERR sub-codes, the pager enters the ERROR state and the error code

	1992 ** is stored in Pager.errCode. While the pager remains in the ERROR state,

	1993 ** all major API calls on the Pager will immediately return Pager.errCode.

	1994 **

	1995 ** The ERROR state indicates that the contents of the pager-cache

	1996 ** cannot be trusted. This state can be cleared by completely discarding

	1997 ** the contents of the pager-cache. If a transaction was active when

	1998 ** the persistent error occurred, then the rollback journal may need

	1999 ** to be replayed to restore the contents of the database file (as if

	2000 ** it were a hot-journal).

	2001 */

	2002 static int pager_error(Pager *pPager, int rc){

	2003 int rc2 = rc & 0xff;

	2004 assert( rc==SQLITE_OK \|\| !MEMDB );

	2005 assert(

	2006 pPager->errCode==SQLITE_FULL \|\|

	2007 pPager->errCode==SQLITE_OK \|\|

	2008 (pPager->errCode & 0xff)==SQLITE_IOERR

	2009 );

	2010 if( rc2==SQLITE_FULL \|\| rc2==SQLITE_IOERR ){

	2011 pPager->errCode = rc;

	2012 pPager->eState = PAGER_ERROR;

	2013 }

	2014 return rc;

	2015 }

	2016

	2017 static int pager_truncate(Pager *pPager, Pgno nPage);

	2018

	2019 /*

	2020 ** This routine ends a transaction. A transaction is usually ended by

	2021 ** either a COMMIT or a ROLLBACK operation. This routine may be called

	2022 ** after rollback of a hot-journal, or if an error occurs while opening

	2023 ** the journal file or writing the very first journal-header of a

	2024 ** database transaction.

	2025 **

	2026 ** This routine is never called in PAGER_ERROR state. If it is called

	2027 ** in PAGER_NONE or PAGER_SHARED state and the lock held is less

	2028 ** exclusive than a RESERVED lock, it is a no-op.

	2029 **

	2030 ** Otherwise, any active savepoints are released.

	2031 **

	2032 ** If the journal file is open, then it is "finalized". Once a journal

	2033 ** file has been finalized it is not possible to use it to roll back a

	2034 ** transaction. Nor will it be considered to be a hot-journal by this

	2035 ** or any other database connection. Exactly how a journal is finalized

	2036 ** depends on whether or not the pager is running in exclusive mode and

	2037 ** the current journal-mode (Pager.journalMode value), as follows:

	2038 **

	2039 ** journalMode==MEMORY

	2040 ** Journal file descriptor is simply closed. This destroys an

	2041 ** in-memory journal.

	2042 **

	2043 ** journalMode==TRUNCATE

	2044 ** Journal file is truncated to zero bytes in size.

	2045 **

	2046 ** journalMode==PERSIST

	2047 ** The first 28 bytes of the journal file are zeroed. This invalidates

	2048 ** the first journal header in the file, and hence the entire journal

	2049 ** file. An invalid journal file cannot be rolled back.

	2050 **

	2051 ** journalMode==DELETE

	2052 ** The journal file is closed and deleted using sqlite3OsDelete().

	2053 **

	2054 ** If the pager is running in exclusive mode, this method of finalizing

	2055 ** the journal file is never used. Instead, if the journalMode is

	2056 ** DELETE and the pager is in exclusive mode, the method described under

	2057 ** journalMode==PERSIST is used instead.

	2058 **

	2059 ** After the journal is finalized, the pager moves to PAGER_READER state.

	2060 ** If running in non-exclusive rollback mode, the lock on the file is

	2061 ** downgraded to a SHARED_LOCK.

	2062 **

	2063 ** SQLITE_OK is returned if no error occurs. If an error occurs during

	2064 ** any of the IO operations to finalize the journal file or unlock the

	2065 ** database then the IO error code is returned to the user. If the

	2066 ** operation to finalize the journal file fails, then the code still

	2067 ** tries to unlock the database file if not in exclusive mode. If the

	2068 ** unlock operation fails as well, then the first error code related

	2069 ** to the first error encountered (the journal finalization one) is

	2070 ** returned.

	2071 */

	2072 static int pager_end_transaction(Pager *pPager, int hasMaster, int bCommit){

	2073 int rc = SQLITE_OK; /* Error code from journal finalization operation */

	2074 int rc2 = SQLITE_OK; /* Error code from db file unlock operation */

	2075

	2076 /* Do nothing if the pager does not have an open write transaction

	2077 ** or at least a RESERVED lock. This function may be called when there

	2078 ** is no write-transaction active but a RESERVED or greater lock is

	2079 ** held under two circumstances:

	2080 **

	2081 ** 1. After a successful hot-journal rollback, it is called with

	2082 ** eState==PAGER_NONE and eLock==EXCLUSIVE_LOCK.

	2083 **

	2084 ** 2. If a connection with locking_mode=exclusive holding an EXCLUSIVE

	2085 ** lock switches back to locking_mode=normal and then executes a

	2086 ** read-transaction, this function is called with eState==PAGER_READER

	2087 ** and eLock==EXCLUSIVE_LOCK when the read-transaction is closed.

	2088 */

	2089 assert( assert_pager_state(pPager) );

	2090 assert( pPager->eState!=PAGER_ERROR );

	2091 if( pPager->eState<PAGER_WRITER_LOCKED && pPager->eLock<RESERVED_LOCK ){

	2092 return SQLITE_OK;

	2093 }

	2094

	2095 releaseAllSavepoints(pPager);

	2096 assert( isOpen(pPager->jfd) \|\| pPager->pInJournal==0 );

	2097 if( isOpen(pPager->jfd) ){

	2098 assert( !pagerUseWal(pPager) );

	2099

	2100 /* Finalize the journal file. */

	2101 if( sqlite3IsMemJournal(pPager->jfd) ){

	2102 assert( pPager->journalMode==PAGER_JOURNALMODE_MEMORY );

	2103 sqlite3OsClose(pPager->jfd);

	2104 }else if( pPager->journalMode==PAGER_JOURNALMODE_TRUNCATE ){

	2105 if( pPager->journalOff==0 ){

	2106 rc = SQLITE_OK;

	2107 }else{

	2108 rc = sqlite3OsTruncate(pPager->jfd, 0);

	2109 if( rc==SQLITE_OK && pPager->fullSync ){

	2110 /* Make sure the new file size is written into the inode right away.

	2111 ** Otherwise the journal might resurrect following a power loss and

	2112 ** cause the last transaction to roll back. See

	2113 ** https://bugzilla.mozilla.org/show_bug.cgi?id=1072773

	2114 */

	2115 rc = sqlite3OsSync(pPager->jfd, pPager->syncFlags);

	2116 }

	2117 }

	2118 pPager->journalOff = 0;

	2119 }else if( pPager->journalMode==PAGER_JOURNALMODE_PERSIST

	2120 \|\| (pPager->exclusiveMode && pPager->journalMode!=PAGER_JOURNALMODE_WAL)

	2121 ){

	2122 rc = zeroJournalHdr(pPager, hasMaster);

	2123 pPager->journalOff = 0;

	2124 }else{

	2125 /* This branch may be executed with Pager.journalMode==MEMORY if

	2126 ** a hot-journal was just rolled back. In this case the journal

	2127 ** file should be closed and deleted. If this connection writes to

	2128 ** the database file, it will do so using an in-memory journal.

	2129 */

	2130 int bDelete = (!pPager->tempFile && sqlite3JournalExists(pPager->jfd));

	2131 assert( pPager->journalMode==PAGER_JOURNALMODE_DELETE

	2132 \|\| pPager->journalMode==PAGER_JOURNALMODE_MEMORY

	2133 \|\| pPager->journalMode==PAGER_JOURNALMODE_WAL

	2134 );

	2135 sqlite3OsClose(pPager->jfd);

	2136 if( bDelete ){

	2137 rc = sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0);

	2138 }

	2139 }

	2140 }

	2141

	2142 #ifdef SQLITE_CHECK_PAGES

	2143 sqlite3PcacheIterateDirty(pPager->pPCache, pager_set_pagehash);

	2144 if( pPager->dbSize==0 && sqlite3PcacheRefCount(pPager->pPCache)>0 ){

	2145 PgHdr *p = sqlite3PagerLookup(pPager, 1);

	2146 if( p ){

	2147 p->pageHash = 0;

	2148 sqlite3PagerUnrefNotNull(p);

	2149 }

	2150 }

	2151 #endif

	2152

	2153 sqlite3BitvecDestroy(pPager->pInJournal);

	2154 pPager->pInJournal = 0;

	2155 pPager->nRec = 0;

	2156 sqlite3PcacheCleanAll(pPager->pPCache);

	2157 sqlite3PcacheTruncate(pPager->pPCache, pPager->dbSize);

	2158

	2159 if( pagerUseWal(pPager) ){

	2160 /* Drop the WAL write-lock, if any. Also, if the connection was in

	2161 ** locking_mode=exclusive mode but is no longer, drop the EXCLUSIVE

	2162 ** lock held on the database file.

	2163 */

	2164 rc2 = sqlite3WalEndWriteTransaction(pPager->pWal);

	2165 assert( rc2==SQLITE_OK );

	2166 }else if( rc==SQLITE_OK && bCommit && pPager->dbFileSize>pPager->dbSize ){

	2167 /* This branch is taken when committing a transaction in rollback-journal

	2168 ** mode if the database file on disk is larger than the database image.

	2169 ** At this point the journal has been finalized and the transaction

	2170 ** successfully committed, but the EXCLUSIVE lock is still held on the

	2171 ** file. So it is safe to truncate the database file to its minimum

	2172 ** required size. */

	2173 assert( pPager->eLock==EXCLUSIVE_LOCK );

	2174 rc = pager_truncate(pPager, pPager->dbSize);

	2175 }

	2176

	2177 if( rc==SQLITE_OK && bCommit && isOpen(pPager->fd) ){

	2178 rc = sqlite3OsFileControl(pPager->fd, SQLITE_FCNTL_COMMIT_PHASETWO, 0);

	2179 if( rc==SQLITE_NOTFOUND ) rc = SQLITE_OK;

	2180 }

	2181

	2182 if( !pPager->exclusiveMode

	2183 && (!pagerUseWal(pPager) \|\| sqlite3WalExclusiveMode(pPager->pWal, 0))

	2184 ){

	2185 rc2 = pagerUnlockDb(pPager, SHARED_LOCK);

	2186 pPager->changeCountDone = 0;

	2187 }

	2188 pPager->eState = PAGER_READER;

	2189 pPager->setMaster = 0;

	2190

	2191 return (rc==SQLITE_OK?rc2:rc);

	2192 }

	2193

	2194 /*

	2195 ** Execute a rollback if a transaction is active and unlock the

	2196 ** database file.

	2197 **

	2198 ** If the pager has already entered the ERROR state, do not attempt

	2199 ** the rollback at this time. Instead, pager_unlock() is called. The

	2200 ** call to pager_unlock() will discard all in-memory pages, unlock

	2201 ** the database file and move the pager back to OPEN state. If this

	2202 ** means that there is a hot-journal left in the file-system, the next

	2203 ** connection to obtain a shared lock on the pager (which may be this one)

	2204 ** will roll it back.

	2205 **

	2206 ** If the pager has not already entered the ERROR state, but an IO or

	2207 ** malloc error occurs during a rollback, then this will itself cause

	2208 ** the pager to enter the ERROR state. Which will be cleared by the

	2209 ** call to pager_unlock(), as described above.

	2210 */

	2211 static void pagerUnlockAndRollback(Pager *pPager){

	2212 if( pPager->eState!=PAGER_ERROR && pPager->eState!=PAGER_OPEN ){

	2213 assert( assert_pager_state(pPager) );

	2214 if( pPager->eState>=PAGER_WRITER_LOCKED ){

	2215 sqlite3BeginBenignMalloc();

	2216 sqlite3PagerRollback(pPager);

	2217 sqlite3EndBenignMalloc();

	2218 }else if( !pPager->exclusiveMode ){

	2219 assert( pPager->eState==PAGER_READER );

	2220 pager_end_transaction(pPager, 0, 0);

	2221 }

	2222 }

	2223 pager_unlock(pPager);

	2224 }

	2225

	2226 /*

	2227 ** Parameter aData must point to a buffer of pPager->pageSize bytes

	2228 ** of data. Compute and return a checksum based on the contents of the

	2229 ** page of data and the current value of pPager->cksumInit.

	2230 **

	2231 ** This is not a real checksum. It is really just the sum of the

	2232 ** random initial value (pPager->cksumInit) and every 200th byte

	2233 ** of the page data, starting with byte offset (pPager->pageSize%200).

	2234 ** Each byte is interpreted as an 8-bit unsigned integer.

	2235 **

	2236 ** Changing the formula used to compute this checksum results in an

	2237 ** incompatible journal file format.

	2238 **

	2239 ** If journal corruption occurs due to a power failure, the most likely

	2240 ** scenario is that one end or the other of the record will be changed.

	2241 ** It is much less likely that the two ends of the journal record will be

	2242 ** correct and the middle be corrupt. Thus, this "checksum" scheme,

	2243 ** though fast and simple, catches the most likely kind of corruption.

	2244 */

	2245 static u32 pager_cksum(Pager pPager, const u8 aData){

	2246 u32 cksum = pPager->cksumInit; /* Checksum value to return */

	2247 int i = pPager->pageSize-200; /* Loop counter */

	2248 while( i>0 ){

	2249 cksum += aData[i];

	2250 i -= 200;

	2251 }

	2252 return cksum;

	2253 }

	2254

	2255 /*

	2256 ** Report the current page size and number of reserved bytes back

	2257 ** to the codec.

	2258 */

	#ifdef SQLITE_HAS_CODEC
	/* Invoke the codec's size-change callback, if one is registered, passing
	** it the codec context, the current page size and the reserved-byte count.
	*/
	static void pagerReportSize(Pager *pPager){
	  if( pPager->xCodecSizeChng==0 ) return;   /* No callback registered */
	  pPager->xCodecSizeChng(
	      pPager->pCodec,
	      pPager->pageSize,
	      (int)pPager->nReserve
	  );
	}
	#else
	# define pagerReportSize(X)     /* No-op if we do not support a codec */
	#endif

	2269

	2270 #ifdef SQLITE_HAS_CODEC

	2271 /*

	2272 ** Make sure the number of reserved bits is the same in the destination

	2273 ** pager as it is in the source. This comes up when a VACUUM changes the

	2274 ** number of reserved bits to the "optimal" amount.

	2275 */

	2276 SQLITE_PRIVATE void sqlite3PagerAlignReserve(Pager pDest, Pager pSrc){

	2277 if( pDest->nReserve!=pSrc->nReserve ){

	2278 pDest->nReserve = pSrc->nReserve;

	2279 pagerReportSize(pDest);

	2280 }

	2281 }

	2282 #endif

	2283

	2284 /*

	2285 ** Read a single page from either the journal file (if isMainJrnl==1) or

	2286 ** from the sub-journal (if isMainJrnl==0) and playback that page.

	2287 ** The page begins at offset pOffset into the file. The pOffset

	2288 ** value is increased to the start of the next page in the journal.

	2289 **

	2290 ** The main rollback journal uses checksums - the statement journal does

	2291 ** not.

	2292 **

	2293 ** If the page number of the page record read from the (sub-)journal file

	2294 ** is greater than the current value of Pager.dbSize, then playback is

	2295 ** skipped and SQLITE_OK is returned.

	2296 **

	2297 ** If pDone is not NULL, then it is a record of pages that have already

	2298 ** been played back. If the page at *pOffset has already been played back

	2299 ** (if the corresponding pDone bit is set) then skip the playback.

	2300 ** Make sure the pDone bit corresponding to the *pOffset page is set

	2301 ** prior to returning.

	2302 **

	2303 ** If the page record is successfully read from the (sub-)journal file

	2304 ** and played back, then SQLITE_OK is returned. If an IO error occurs

	2305 ** while reading the record from the (sub-)journal file or while writing

	2306 ** to the database file, then the IO error code is returned. If data

	2307 ** is successfully read from the (sub-)journal file but appears to be

	2308 ** corrupted, SQLITE_DONE is returned. Data is considered corrupted in

	2309 ** two circumstances:

	2310 **

	2311 ** * If the record page-number is illegal (0 or PAGER_MJ_PGNO), or

	2312 ** * If the record is being rolled back from the main journal file

	2313 ** and the checksum field does not match the record content.

	2314 **

	2315 ** Neither of these two scenarios are possible during a savepoint rollback.

	2316 **

	2317 ** If this is a savepoint rollback, then memory may have to be dynamically

	2318 ** allocated by this function. If this is the case and an allocation fails,

	2319 ** SQLITE_NOMEM is returned.

	2320 */

	2321 static int pager_playback_one_page(

	2322 Pager pPager, / The pager being played back */

	2323 i64 pOffset, / Offset of record to playback */

	2324 Bitvec pDone, / Bitvec of pages already played back */

	2325 int isMainJrnl, /* 1 -> main journal. 0 -> sub-journal. */

	2326 int isSavepnt /* True for a savepoint rollback */

	2327 ){

	2328 int rc;

	2329 PgHdr pPg; / An existing page in the cache */

	2330 Pgno pgno; /* The page number of a page in journal */

	2331 u32 cksum; /* Checksum used for sanity checking */

	2332 char aData; / Temporary storage for the page */

	2333 sqlite3_file jfd; / The file descriptor for the journal file */

	2334 int isSynced; /* True if journal page is synced */

	2335

	2336 assert( (isMainJrnl&~1)==0 ); /* isMainJrnl is 0 or 1 */

	2337 assert( (isSavepnt&~1)==0 ); /* isSavepnt is 0 or 1 */

	2338 assert( isMainJrnl \|\| pDone ); /* pDone always used on sub-journals */

	2339 assert( isSavepnt \|\| pDone==0 ); /* pDone never used on non-savepoint */

	2340

	2341 aData = pPager->pTmpSpace;

	2342 assert( aData ); /* Temp storage must have already been allocated */

	2343 assert( pagerUseWal(pPager)==0 \|\| (!isMainJrnl && isSavepnt) );

	2344

	2345 /* Either the state is greater than PAGER_WRITER_CACHEMOD (a transaction

	2346 ** or savepoint rollback done at the request of the caller) or this is

	2347 ** a hot-journal rollback. If it is a hot-journal rollback, the pager

	2348 ** is in state OPEN and holds an EXCLUSIVE lock. Hot-journal rollback

	2349 ** only reads from the main journal, not the sub-journal.

	2350 */

	2351 assert( pPager->eState>=PAGER_WRITER_CACHEMOD

	2352 \|\| (pPager->eState==PAGER_OPEN && pPager->eLock==EXCLUSIVE_LOCK)

	2353 );

	2354 assert( pPager->eState>=PAGER_WRITER_CACHEMOD \|\| isMainJrnl );

	2355

	2356 /* Read the page number and page data from the journal or sub-journal

	2357 ** file. Return an error code to the caller if an IO error occurs.

	2358 */

	2359 jfd = isMainJrnl ? pPager->jfd : pPager->sjfd;

	2360 rc = read32bits(jfd, *pOffset, &pgno);

	2361 if( rc!=SQLITE_OK ) return rc;

	2362 rc = sqlite3OsRead(jfd, (u8)aData, pPager->pageSize, (pOffset)+4);

	2363 if( rc!=SQLITE_OK ) return rc;

	2364 pOffset += pPager->pageSize + 4 + isMainJrnl4;

	2365

	2366 /* Sanity checking on the page. This is more important that I originally

	2367 ** thought. If a power failure occurs while the journal is being written,

	2368 ** it could cause invalid data to be written into the journal. We need to

	2369 ** detect this invalid data (with high probability) and ignore it.

	2370 */

	2371 if( pgno==0 \|\| pgno==PAGER_MJ_PGNO(pPager) ){

	2372 assert( !isSavepnt );

	2373 return SQLITE_DONE;

	2374 }

	2375 if( pgno>(Pgno)pPager->dbSize \|\| sqlite3BitvecTest(pDone, pgno) ){

	2376 return SQLITE_OK;

	2377 }

	2378 if( isMainJrnl ){

	2379 rc = read32bits(jfd, (*pOffset)-4, &cksum);

	2380 if( rc ) return rc;

	2381 if( !isSavepnt && pager_cksum(pPager, (u8*)aData)!=cksum ){

	2382 return SQLITE_DONE;

	2383 }

	2384 }

	2385

	2386 /* If this page has already been played back before during the current

	2387 ** rollback, then don't bother to play it back again.

	2388 */

	2389 if( pDone && (rc = sqlite3BitvecSet(pDone, pgno))!=SQLITE_OK ){

	2390 return rc;

	2391 }

	2392

	2393 /* When playing back page 1, restore the nReserve setting

	2394 */

	2395 if( pgno==1 && pPager->nReserve!=((u8*)aData)[20] ){

	2396 pPager->nReserve = ((u8*)aData)[20];

	2397 pagerReportSize(pPager);

	2398 }

	2399

	2400 /* If the pager is in CACHEMOD state, then there must be a copy of this

	2401 ** page in the pager cache. In this case just update the pager cache,

	2402 ** not the database file. The page is left marked dirty in this case.

	2403 **

	2404 ** An exception to the above rule: If the database is in no-sync mode

	2405 ** and a page is moved during an incremental vacuum then the page may

	2406 ** not be in the pager cache. Later: if a malloc() or IO error occurs

	2407 ** during a Movepage() call, then the page may not be in the cache

	2408 ** either. So the condition described in the above paragraph is not

	2409 ** assert()able.

	2410 **

	2411 ** If in WRITER_DBMOD, WRITER_FINISHED or OPEN state, then we update the

	2412 ** pager cache if it exists and the main file. The page is then marked

	2413 ** not dirty. Since this code is only executed in PAGER_OPEN state for

	2414 ** a hot-journal rollback, it is guaranteed that the page-cache is empty

	2415 ** if the pager is in OPEN state.

	2416 **

	2417 ** Ticket #1171: The statement journal might contain page content that is

	2418 ** different from the page content at the start of the transaction.

	2419 ** This occurs when a page is changed prior to the start of a statement

	2420 ** then changed again within the statement. When rolling back such a

	2421 ** statement we must not write to the original database unless we know

	2422 ** for certain that original page contents are synced into the main rollback

	2423 ** journal. Otherwise, a power loss might leave modified data in the

	2424 ** database file without an entry in the rollback journal that can

	2425 ** restore the database to its original form. Two conditions must be

	2426 ** met before writing to the database files. (1) the database must be

	2427 ** locked. (2) we know that the original page content is fully synced

	2428 ** in the main journal either because the page is not in cache or else

	2429 ** the page is marked as needSync==0.

	2430 **

	2431 ** 2008-04-14: When attempting to vacuum a corrupt database file, it

	2432 ** is possible to fail a statement on a database that does not yet exist.

	2433 ** Do not attempt to write if database file has never been opened.

	2434 */

	2435 if( pagerUseWal(pPager) ){

	2436 pPg = 0;

	2437 }else{

	2438 pPg = sqlite3PagerLookup(pPager, pgno);

	2439 }

	2440 assert( pPg \|\| !MEMDB );

	2441 assert( pPager->eState!=PAGER_OPEN \|\| pPg==0 );

	2442 PAGERTRACE(("PLAYBACK %d page %d hash(%08x) %s\n",

	2443 PAGERID(pPager), pgno, pager_datahash(pPager->pageSize, (u8*)aData),

	2444 (isMainJrnl?"main-journal":"sub-journal")

	2445 ));

	2446 if( isMainJrnl ){

	2447 isSynced = pPager->noSync \|\| (*pOffset <= pPager->journalHdr);

	2448 }else{

	2449 isSynced = (pPg==0 \|\| 0==(pPg->flags & PGHDR_NEED_SYNC));

	2450 }

	2451 if( isOpen(pPager->fd)

	2452 && (pPager->eState>=PAGER_WRITER_DBMOD \|\| pPager->eState==PAGER_OPEN)

	2453 && isSynced

	2454 ){

	2455 i64 ofst = (pgno-1)*(i64)pPager->pageSize;

	2456 testcase( !isSavepnt && pPg!=0 && (pPg->flags&PGHDR_NEED_SYNC)!=0 );

	2457 assert( !pagerUseWal(pPager) );

	2458 rc = sqlite3OsWrite(pPager->fd, (u8 *)aData, pPager->pageSize, ofst);

	2459 if( pgno>pPager->dbFileSize ){

	2460 pPager->dbFileSize = pgno;

	2461 }

	2462 if( pPager->pBackup ){

	2463 CODEC1(pPager, aData, pgno, 3, rc=SQLITE_NOMEM);

	2464 sqlite3BackupUpdate(pPager->pBackup, pgno, (u8*)aData);

	2465 CODEC2(pPager, aData, pgno, 7, rc=SQLITE_NOMEM, aData);

	2466 }

	2467 }else if( !isMainJrnl && pPg==0 ){

	2468 /* If this is a rollback of a savepoint and data was not written to

	2469 ** the database and the page is not in-memory, there is a potential

	2470 ** problem. When the page is next fetched by the b-tree layer, it

	2471 ** will be read from the database file, which may or may not be

	2472 ** current.

	2473 **

	2474 ** There are a couple of different ways this can happen. All are quite

	2475 ** obscure. When running in synchronous mode, this can only happen

	2476 ** if the page is on the free-list at the start of the transaction, then

	2477 ** populated, then moved using sqlite3PagerMovepage().

	2478 **

	2479 ** The solution is to add an in-memory page to the cache containing

	2480 ** the data just read from the sub-journal. Mark the page as dirty

	2481 ** and if the pager requires a journal-sync, then mark the page as

	2482 ** requiring a journal-sync before it is written.

	2483 */

	2484 assert( isSavepnt );

	2485 assert( (pPager->doNotSpill & SPILLFLAG_ROLLBACK)==0 );

	2486 pPager->doNotSpill \|= SPILLFLAG_ROLLBACK;

	2487 rc = sqlite3PagerGet(pPager, pgno, &pPg, 1);

	2488 assert( (pPager->doNotSpill & SPILLFLAG_ROLLBACK)!=0 );

	2489 pPager->doNotSpill &= ~SPILLFLAG_ROLLBACK;

	2490 if( rc!=SQLITE_OK ) return rc;

	2491 pPg->flags &= ~PGHDR_NEED_READ;

	2492 sqlite3PcacheMakeDirty(pPg);

	2493 }

	2494 if( pPg ){

	2495 /* No page should ever be explicitly rolled back that is in use, except

	2496 ** for page 1 which is held in use in order to keep the lock on the

	2497 ** database active. However such a page may be rolled back as a result

	2498 ** of an internal error resulting in an automatic call to

	2499 ** sqlite3PagerRollback().

	2500 */

	2501 void *pData;

	2502 pData = pPg->pData;

	2503 memcpy(pData, (u8*)aData, pPager->pageSize);

	2504 pPager->xReiniter(pPg);

	2505 if( isMainJrnl && (!isSavepnt \|\| *pOffset<=pPager->journalHdr) ){

	2506 /* If the contents of this page were just restored from the main

	2507 ** journal file, then its content must be as they were when the

	2508 ** transaction was first opened. In this case we can mark the page

	2509 ** as clean, since there will be no need to write it out to the

	2510 ** database.

	2511 **

	2512 ** There is one exception to this rule. If the page is being rolled

	2513 ** back as part of a savepoint (or statement) rollback from an

	2514 ** unsynced portion of the main journal file, then it is not safe

	2515 ** to mark the page as clean. This is because marking the page as

	2516 ** clean will clear the PGHDR_NEED_SYNC flag. Since the page is

	2517 ** already in the journal file (recorded in Pager.pInJournal) and

	2518 ** the PGHDR_NEED_SYNC flag is cleared, if the page is written to

	2519 ** again within this transaction, it will be marked as dirty but

	2520 ** the PGHDR_NEED_SYNC flag will not be set. It could then potentially

	2521 ** be written out into the database file before its journal file

	2522 ** segment is synced. If a crash occurs during or following this,

	2523 ** database corruption may ensue.

	2524 */

	2525 assert( !pagerUseWal(pPager) );

	2526 sqlite3PcacheMakeClean(pPg);

	2527 }

	2528 pager_set_pagehash(pPg);

	2529

	2530 /* If this was page 1, then restore the value of Pager.dbFileVers.

	2531 ** Do this before any decoding. */

	2532 if( pgno==1 ){

	2533 memcpy(&pPager->dbFileVers, &((u8*)pData)[24],sizeof(pPager->dbFileVers));

	2534 }

	2535

	2536 /* Decode the page just read from disk */

	2537 CODEC1(pPager, pData, pPg->pgno, 3, rc=SQLITE_NOMEM);

	2538 sqlite3PcacheRelease(pPg);

	2539 }

	2540 return rc;

	2541 }

	2542

	2543 /*

	2544 ** Parameter zMaster is the name of a master journal file. A single journal

	2545 ** file that referred to the master journal file has just been rolled back.

	2546 ** This routine checks if it is possible to delete the master journal file,

	2547 ** and does so if it is.

	2548 **

	2549 ** Argument zMaster may point to Pager.pTmpSpace. So that buffer is not

	2550 ** available for use within this function.

	2551 **

	2552 ** When a master journal file is created, it is populated with the names

	2553 ** of all of its child journals, one after another, formatted as utf-8

	2554 ** encoded text. The end of each child journal file is marked with a

	2555 ** nul-terminator byte (0x00). i.e. the entire contents of a master journal

	2556 ** file for a transaction involving two databases might be:

	2557 **

	2558 ** "/home/bill/a.db-journal\x00/home/bill/b.db-journal\x00"

	2559 **

	2560 ** A master journal file may only be deleted once all of its child

	2561 ** journals have been rolled back.

	2562 **

	2563 ** This function reads the contents of the master-journal file into

	2564 ** memory and loops through each of the child journal names. For

	2565 ** each child journal, it checks if:

	2566 **

	2567 ** * if the child journal exists, and if so

	2568 ** * if the child journal contains a reference to master journal

	2569 ** file zMaster

	2570 **

	2571 ** If a child journal can be found that matches both of the criteria

	2572 ** above, this function returns without doing anything. Otherwise, if

	2573 ** no such child journal can be found, file zMaster is deleted from

	2574 ** the file-system using sqlite3OsDelete().

	2575 **

	2576 ** If an IO error occurs within this function, an error code is returned. This

	2577 ** function allocates memory by calling sqlite3Malloc(). If an allocation

	2578 ** fails, SQLITE_NOMEM is returned. Otherwise, if no IO or malloc errors

	2579 ** occur, SQLITE_OK is returned.

	2580 **

	2581 ** TODO: This function allocates a single block of memory to load

	2582 ** the entire contents of the master journal file. This could be

	2583 ** a couple of kilobytes or so - potentially larger than the page

	2584 ** size.

	2585 */

	2586 static int pager_delmaster(Pager pPager, const char zMaster){

	2587 sqlite3_vfs *pVfs = pPager->pVfs;

	2588 int rc; /* Return code */

	2589 sqlite3_file pMaster; / Malloc'd master-journal file descriptor */

	2590 sqlite3_file pJournal; / Malloc'd child-journal file descriptor */

	2591 char zMasterJournal = 0; / Contents of master journal file */

	2592 i64 nMasterJournal; /* Size of master journal file */

	2593 char zJournal; / Pointer to one journal within MJ file */

	2594 char zMasterPtr; / Space to hold MJ filename from a journal file */

	2595 int nMasterPtr; /* Amount of space allocated to zMasterPtr[] */

	2596

	2597 /* Allocate space for both the pJournal and pMaster file descriptors.

	2598 ** If successful, open the master journal file for reading.

	2599 */

	2600 pMaster = (sqlite3_file )sqlite3MallocZero(pVfs->szOsFile 2);

	2601 pJournal = (sqlite3_file )(((u8 )pMaster) + pVfs->szOsFile);

	2602 if( !pMaster ){

	2603 rc = SQLITE_NOMEM;

	2604 }else{

	2605 const int flags = (SQLITE_OPEN_READONLY\|SQLITE_OPEN_MASTER_JOURNAL);

	2606 rc = sqlite3OsOpen(pVfs, zMaster, pMaster, flags, 0);

	2607 }

	2608 if( rc!=SQLITE_OK ) goto delmaster_out;

	2609

	2610 /* Load the entire master journal file into space obtained from

	2611 ** sqlite3_malloc() and pointed to by zMasterJournal. Also obtain

	2612 ** sufficient space (in zMasterPtr) to hold the names of master

	2613 ** journal files extracted from regular rollback-journals.

	2614 */

	2615 rc = sqlite3OsFileSize(pMaster, &nMasterJournal);

	2616 if( rc!=SQLITE_OK ) goto delmaster_out;

	2617 nMasterPtr = pVfs->mxPathname+1;

	2618 zMasterJournal = sqlite3Malloc(nMasterJournal + nMasterPtr + 1);

	2619 if( !zMasterJournal ){

	2620 rc = SQLITE_NOMEM;

	2621 goto delmaster_out;

	2622 }

	2623 zMasterPtr = &zMasterJournal[nMasterJournal+1];

	2624 rc = sqlite3OsRead(pMaster, zMasterJournal, (int)nMasterJournal, 0);

	2625 if( rc!=SQLITE_OK ) goto delmaster_out;

	2626 zMasterJournal[nMasterJournal] = 0;

	2627

	2628 zJournal = zMasterJournal;

	2629 while( (zJournal-zMasterJournal)<nMasterJournal ){

	2630 int exists;

	2631 rc = sqlite3OsAccess(pVfs, zJournal, SQLITE_ACCESS_EXISTS, &exists);

	2632 if( rc!=SQLITE_OK ){

	2633 goto delmaster_out;

	2634 }

	2635 if( exists ){

	2636 /* One of the journals pointed to by the master journal exists.

	2637 ** Open it and check if it points at the master journal. If

	2638 ** so, return without deleting the master journal file.

	2639 */

	2640 int c;

	2641 int flags = (SQLITE_OPEN_READONLY\|SQLITE_OPEN_MAIN_JOURNAL);

	2642 rc = sqlite3OsOpen(pVfs, zJournal, pJournal, flags, 0);

	2643 if( rc!=SQLITE_OK ){

	2644 goto delmaster_out;

	2645 }

	2646

	2647 rc = readMasterJournal(pJournal, zMasterPtr, nMasterPtr);

	2648 sqlite3OsClose(pJournal);

	2649 if( rc!=SQLITE_OK ){

	2650 goto delmaster_out;

	2651 }

	2652

	2653 c = zMasterPtr[0]!=0 && strcmp(zMasterPtr, zMaster)==0;

	2654 if( c ){

	2655 /* We have a match. Do not delete the master journal file. */

	2656 goto delmaster_out;

	2657 }

	2658 }

	2659 zJournal += (sqlite3Strlen30(zJournal)+1);

	2660 }

	2661

	2662 sqlite3OsClose(pMaster);

	2663 rc = sqlite3OsDelete(pVfs, zMaster, 0);

	2664

	2665 delmaster_out:

	2666 sqlite3_free(zMasterJournal);

	2667 if( pMaster ){

	2668 sqlite3OsClose(pMaster);

	2669 assert( !isOpen(pJournal) );

	2670 sqlite3_free(pMaster);

	2671 }

	2672 return rc;

	2673 }

	2674

	2675

	2676 /*

	2677 ** This function is used to change the actual size of the database

	2678 ** file in the file-system. This only happens when committing a transaction,

	2679 ** or rolling back a transaction (including rolling back a hot-journal).

	2680 **

	2681 ** If the main database file is not open, or the pager is not in either

	2682 ** DBMOD or OPEN state, this function is a no-op. Otherwise, the size

	2683 ** of the file is changed to nPage pages (nPage*pPager->pageSize bytes).

	2684 ** If the file on disk is currently larger than nPage pages, then use the VFS

	2685 ** xTruncate() method to truncate it.

	2686 **

	2687 ** Or, it might be the case that the file on disk is smaller than

	2688 ** nPage pages. Some operating system implementations can get confused if

	2689 ** you try to truncate a file to some size that is larger than it

	2690 ** currently is, so detect this case and write a single zero byte to

	2691 ** the end of the new file instead.

	2692 **

	2693 ** If successful, return SQLITE_OK. If an IO error occurs while modifying

	2694 ** the database file, return the error code to the caller.

	2695 */

	2696 static int pager_truncate(Pager *pPager, Pgno nPage){

	2697 int rc = SQLITE_OK;

	2698 assert( pPager->eState!=PAGER_ERROR );

	2699 assert( pPager->eState!=PAGER_READER );

	2700

	2701 if( isOpen(pPager->fd)

	2702 && (pPager->eState>=PAGER_WRITER_DBMOD \|\| pPager->eState==PAGER_OPEN)

	2703 ){

	2704 i64 currentSize, newSize;

	2705 int szPage = pPager->pageSize;

	2706 assert( pPager->eLock==EXCLUSIVE_LOCK );

	2707 /* TODO: Is it safe to use Pager.dbFileSize here? */

	2708 rc = sqlite3OsFileSize(pPager->fd, &currentSize);

	2709 newSize = szPage*(i64)nPage;

	2710 if( rc==SQLITE_OK && currentSize!=newSize ){

	2711 if( currentSize>newSize ){

	2712 rc = sqlite3OsTruncate(pPager->fd, newSize);

	2713 }else if( (currentSize+szPage)<=newSize ){

	2714 char *pTmp = pPager->pTmpSpace;

	2715 memset(pTmp, 0, szPage);

	2716 testcase( (newSize-szPage) == currentSize );

	2717 testcase( (newSize-szPage) > currentSize );

	2718 rc = sqlite3OsWrite(pPager->fd, pTmp, szPage, newSize-szPage);

	2719 }

	2720 if( rc==SQLITE_OK ){

	2721 pPager->dbFileSize = nPage;

	2722 }

	2723 }

	2724 }

	2725 return rc;

	2726 }

	2727

	2728 /*

	2729 ** Return a sanitized version of the sector-size of OS file pFile. The

	2730 ** return value is guaranteed to lie between 32 and MAX_SECTOR_SIZE.

	2731 */

	2732 SQLITE_PRIVATE int sqlite3SectorSize(sqlite3_file *pFile){

	2733 int iRet = sqlite3OsSectorSize(pFile);

	2734 if( iRet<32 ){

	2735 iRet = 512;

	2736 }else if( iRet>MAX_SECTOR_SIZE ){

	2737 assert( MAX_SECTOR_SIZE>=512 );

	2738 iRet = MAX_SECTOR_SIZE;

	2739 }

	2740 return iRet;

	2741 }

	2742

	2743 /*

	2744 ** Set the value of the Pager.sectorSize variable for the given

	2745 ** pager based on the value returned by the xSectorSize method

	2746 ** of the open database file. The sector size will be used

	2747 ** to determine the size and alignment of journal header and

	2748 ** master journal pointers within created journal files.

	2749 **

	2750 ** For temporary files the effective sector size is always 512 bytes.

	2751 **

	2752 ** Otherwise, for non-temporary files, the effective sector size is

	2753 ** the value returned by the xSectorSize() method rounded up to 32 if

	2754 ** it is less than 32, or rounded down to MAX_SECTOR_SIZE if it

	2755 ** is greater than MAX_SECTOR_SIZE.

	2756 **

	2757 ** If the file has the SQLITE_IOCAP_POWERSAFE_OVERWRITE property, then set

	2758 ** the effective sector size to its minimum value (512). The purpose of

	2759 ** pPager->sectorSize is to define the "blast radius" of bytes that

	2760 ** might change if a crash occurs while writing to a single byte in

	2761 ** that range. But with POWERSAFE_OVERWRITE, the blast radius is zero

	2762 ** (that is what POWERSAFE_OVERWRITE means), so we minimize the sector

	2763 ** size. For backwards compatibility of the rollback journal file format,

	2764 ** we cannot reduce the effective sector size below 512.

	2765 */

	2766 static void setSectorSize(Pager *pPager){

	2767 assert( isOpen(pPager->fd) \|\| pPager->tempFile );

	2768

	2769 if( pPager->tempFile

	2770 \|\| (sqlite3OsDeviceCharacteristics(pPager->fd) &

	2771 SQLITE_IOCAP_POWERSAFE_OVERWRITE)!=0

	2772 ){

	2773 /* Sector size doesn't matter for temporary files. Also, the file

	2774 ** may not have been opened yet, in which case the OsSectorSize()

	2775 ** call will segfault. */

	2776 pPager->sectorSize = 512;

	2777 }else{

	2778 pPager->sectorSize = sqlite3SectorSize(pPager->fd);

	2779 }

	2780 }

	2781

	2782 /*

	2783 ** Playback the journal and thus restore the database file to

	2784 ** the state it was in before we started making changes.

	2785 **

	2786 ** The journal file format is as follows:

	2787 **

	2788 ** (1) 8 byte prefix. A copy of aJournalMagic[].

	2789 ** (2) 4 byte big-endian integer which is the number of valid page records

	2790 ** in the journal. If this value is 0xffffffff, then compute the

	2791 ** number of page records from the journal size.

	2792 ** (3) 4 byte big-endian integer which is the initial value for the

	2793 ** sanity checksum.

	2794 ** (4) 4 byte integer which is the number of pages to truncate the

	2795 ** database to during a rollback.

	2796 ** (5) 4 byte big-endian integer which is the sector size. The header

	2797 ** is this many bytes in size.

	2798 ** (6) 4 byte big-endian integer which is the page size.

	2799 ** (7) zero padding out to the next sector size.

	2800 ** (8) Zero or more pages instances, each as follows:

	2801 ** + 4 byte page number.

	2802 ** + pPager->pageSize bytes of data.

	2803 ** + 4 byte checksum

	2804 **

	2805 ** When we speak of the journal header, we mean the first 7 items above.

	2806 ** Each entry in the journal is an instance of the 8th item.

	2807 **

	2808 ** Call the value from the second bullet "nRec". nRec is the number of

	2809 ** valid page entries in the journal. In most cases, you can compute the

	2810 ** value of nRec from the size of the journal file. But if a power

	2811 ** failure occurred while the journal was being written, it could be the

	2812 ** case that the size of the journal file had already been increased but

	2813 ** the extra entries had not yet made it safely to disk. In such a case,

	2814 ** the value of nRec computed from the file size would be too large. For

	2815 ** that reason, we always use the nRec value in the header.

	2816 **

	2817 ** If the nRec value is 0xffffffff it means that nRec should be computed

	2818 ** from the file size. This value is used when the user selects the

	2819 ** no-sync option for the journal. A power failure could lead to corruption

	2820 ** in this case. But for things like temporary table (which will be

	2821 ** deleted when the power is restored) we don't care.

	2822 **

	2823 ** If the file opened as the journal file is not a well-formed

	2824 ** journal file then all pages up to the first corrupted page are rolled

	2825 ** back (or no pages if the journal header is corrupted). The journal file

	2826 ** is then deleted and SQLITE_OK returned, just as if no corruption had

	2827 ** been encountered.

	2828 **

	2829 ** If an I/O or malloc() error occurs, the journal-file is not deleted

	2830 ** and an error code is returned.

	2831 **

	2832 ** The isHot parameter indicates that we are trying to rollback a journal

	2833 ** that might be a hot journal. Or, it could be that the journal is

	2834 ** preserved because of JOURNALMODE_PERSIST or JOURNALMODE_TRUNCATE.

	2835 ** If the journal really is hot, reset the pager cache prior to rolling

	2836 ** back any content. If the journal is merely persistent, no reset is

	2837 ** needed.

	2838 */

	2839 static int pager_playback(Pager *pPager, int isHot){

	2840 sqlite3_vfs *pVfs = pPager->pVfs;

	2841 i64 szJ; /* Size of the journal file in bytes */

	2842 u32 nRec; /* Number of Records in the journal */

	2843 u32 u; /* Unsigned loop counter */

	2844 Pgno mxPg = 0; /* Size of the original file in pages */

	2845 int rc; /* Result code of a subroutine */

	2846 int res = 1; /* Value returned by sqlite3OsAccess() */

	2847 char zMaster = 0; / Name of master journal file if any */

	2848 int needPagerReset; /* True to reset page prior to first page rollback */

	2849 int nPlayback = 0; /* Total number of pages restored from journal */

	2850

	2851 /* Figure out how many records are in the journal. Abort early if

	2852 ** the journal is empty.

	2853 */

	2854 assert( isOpen(pPager->jfd) );

	2855 rc = sqlite3OsFileSize(pPager->jfd, &szJ);

	2856 if( rc!=SQLITE_OK ){

	2857 goto end_playback;

	2858 }

	2859

	2860 /* Read the master journal name from the journal, if it is present.

	2861 ** If a master journal file name is specified, but the file is not

	2862 ** present on disk, then the journal is not hot and does not need to be

	2863 ** played back.

	2864 **

	2865 ** TODO: Technically the following is an error because it assumes that

	2866 ** buffer Pager.pTmpSpace is (mxPathname+1) bytes or larger. i.e. that

	2867 ** (pPager->pageSize >= pPager->pVfs->mxPathname+1). Using os_unix.c,

	2868 ** mxPathname is 512, which is the same as the minimum allowable value

	2869 ** for pageSize.

	2870 */

	2871 zMaster = pPager->pTmpSpace;

	2872 rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1);

	2873 if( rc==SQLITE_OK && zMaster[0] ){

	2874 rc = sqlite3OsAccess(pVfs, zMaster, SQLITE_ACCESS_EXISTS, &res);

	2875 }

	2876 zMaster = 0;

	2877 if( rc!=SQLITE_OK \|\| !res ){

	2878 goto end_playback;

	2879 }

	2880 pPager->journalOff = 0;

	2881 needPagerReset = isHot;

	2882

	2883 /* This loop terminates either when a readJournalHdr() or

	2884 ** pager_playback_one_page() call returns SQLITE_DONE or an IO error

	2885 ** occurs.

	2886 */

	2887 while( 1 ){

	2888 /* Read the next journal header from the journal file. If there are

	2889 ** not enough bytes left in the journal file for a complete header, or

	2890 ** it is corrupted, then a process must have failed while writing it.

	2891 ** This indicates nothing more needs to be rolled back.

	2892 */

	2893 rc = readJournalHdr(pPager, isHot, szJ, &nRec, &mxPg);

	2894 if( rc!=SQLITE_OK ){

	2895 if( rc==SQLITE_DONE ){

	2896 rc = SQLITE_OK;

	2897 }

	2898 goto end_playback;

	2899 }

	2900

	2901 /* If nRec is 0xffffffff, then this journal was created by a process

	2902 ** working in no-sync mode. This means that the rest of the journal

	2903 ** file consists of pages, there are no more journal headers. Compute

	2904 ** the value of nRec based on this assumption.

	2905 */

	2906 if( nRec==0xffffffff ){

	2907 assert( pPager->journalOff==JOURNAL_HDR_SZ(pPager) );

	2908 nRec = (int)((szJ - JOURNAL_HDR_SZ(pPager))/JOURNAL_PG_SZ(pPager));

	2909 }

	2910

	2911 /* If nRec is 0 and this rollback is of a transaction created by this

	2912 ** process and if this is the final header in the journal, then it means

	2913 ** that this part of the journal was being filled but has not yet been

	2914 ** synced to disk. Compute the number of pages based on the remaining

	2915 ** size of the file.

	2916 **

	2917 ** The third term of the test was added to fix ticket #2565.

	2918 ** When rolling back a hot journal, nRec==0 always means that the next

	2919 ** chunk of the journal contains zero pages to be rolled back. But

	2920 ** when doing a ROLLBACK and the nRec==0 chunk is the last chunk in

	2921 ** the journal, it means that the journal might contain additional

	2922 ** pages that need to be rolled back and that the number of pages

	2923 ** should be computed based on the journal file size.

	2924 */

	2925 if( nRec==0 && !isHot &&

	2926 pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff ){

	2927 nRec = (int)((szJ - pPager->journalOff) / JOURNAL_PG_SZ(pPager));

	2928 }

	2929

	2930 /* If this is the first header read from the journal, truncate the

	2931 ** database file back to its original size.

	2932 */

	2933 if( pPager->journalOff==JOURNAL_HDR_SZ(pPager) ){

	2934 rc = pager_truncate(pPager, mxPg);

	2935 if( rc!=SQLITE_OK ){

	2936 goto end_playback;

	2937 }

	2938 pPager->dbSize = mxPg;

	2939 }

	2940

	2941 /* Copy original pages out of the journal and back into the

	2942 ** database file and/or page cache.

	2943 */

	2944 for(u=0; u<nRec; u++){

	2945 if( needPagerReset ){

	2946 pager_reset(pPager);

	2947 needPagerReset = 0;

	2948 }

	2949 rc = pager_playback_one_page(pPager,&pPager->journalOff,0,1,0);

	2950 if( rc==SQLITE_OK ){

	2951 nPlayback++;

	2952 }else{

	2953 if( rc==SQLITE_DONE ){

	2954 pPager->journalOff = szJ;

	2955 break;

	2956 }else if( rc==SQLITE_IOERR_SHORT_READ ){

	2957 /* If the journal has been truncated, simply stop reading and

	2958 ** processing the journal. This might happen if the journal was

	2959 ** not completely written and synced prior to a crash. In that

	2960 ** case, the database should have never been written in the

	2961 ** first place so it is OK to simply abandon the rollback. */

	2962 rc = SQLITE_OK;

	2963 goto end_playback;

	2964 }else{

	2965 /* If we are unable to rollback, quit and return the error

	2966 ** code. This will cause the pager to enter the error state

	2967 ** so that no further harm will be done. Perhaps the next

	2968 ** process to come along will be able to rollback the database.

	2969 */

	2970 goto end_playback;

	2971 }

	2972 }

	2973 }

	2974 }

	2975 /NOTREACHED/

	2976 assert( 0 );

	2977

	2978 end_playback:

	2979 /* Following a rollback, the database file should be back in its original

	2980 ** state prior to the start of the transaction, so invoke the

	2981 ** SQLITE_FCNTL_DB_UNCHANGED file-control method to disable the

	2982 ** assertion that the transaction counter was modified.

	2983 */

	2984 #ifdef SQLITE_DEBUG

	2985 if( pPager->fd->pMethods ){

	2986 sqlite3OsFileControlHint(pPager->fd,SQLITE_FCNTL_DB_UNCHANGED,0);

	2987 }

	2988 #endif

	2989

	2990 /* If this playback is happening automatically as a result of an IO or

	2991 ** malloc error that occurred after the change-counter was updated but

	2992 ** before the transaction was committed, then the change-counter

	2993 ** modification may just have been reverted. If this happens in exclusive

	2994 ** mode, then subsequent transactions performed by the connection will not

	2995 ** update the change-counter at all. This may lead to cache inconsistency

	2996 ** problems for other processes at some point in the future. So, just

	2997 ** in case this has happened, clear the changeCountDone flag now.

	2998 */

	2999 pPager->changeCountDone = pPager->tempFile;

	3000

	3001 if( rc==SQLITE_OK ){

	3002 zMaster = pPager->pTmpSpace;

	3003 rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1);

	3004 testcase( rc!=SQLITE_OK );

	3005 }

	3006 if( rc==SQLITE_OK

	3007 && (pPager->eState>=PAGER_WRITER_DBMOD \|\| pPager->eState==PAGER_OPEN)

	3008 ){

	3009 rc = sqlite3PagerSync(pPager, 0);

	3010 }

	3011 if( rc==SQLITE_OK ){

	3012 rc = pager_end_transaction(pPager, zMaster[0]!='\0', 0);

	3013 testcase( rc!=SQLITE_OK );

	3014 }

	3015 if( rc==SQLITE_OK && zMaster[0] && res ){

	3016 /* If there was a master journal and this routine will return success,

	3017 ** see if it is possible to delete the master journal.

	3018 */

	3019 rc = pager_delmaster(pPager, zMaster);

	3020 testcase( rc!=SQLITE_OK );

	3021 }

	3022 if( isHot && nPlayback ){

	3023 sqlite3_log(SQLITE_NOTICE_RECOVER_ROLLBACK, "recovered %d pages from %s",

	3024 nPlayback, pPager->zJournal);

	3025 }

	3026

	3027 /* The Pager.sectorSize variable may have been updated while rolling

	3028 ** back a journal created by a process with a different sector size

	3029 ** value. Reset it to the correct value for this process.

	3030 */

	3031 setSectorSize(pPager);

	3032 return rc;

	3033 }

	3034

	3035

	3036 /*

	3037 ** Read the content for page pPg out of the database file and into

	3038 ** pPg->pData. A shared lock or greater must be held on the database

	3039 ** file before this function is called.

	3040 **

	3041 ** If page 1 is read, then the value of Pager.dbFileVers[] is set to

	3042 ** the value read from the database file.

	3043 **

	3044 ** If an IO error occurs, then the IO error is returned to the caller.

	3045 ** Otherwise, SQLITE_OK is returned.

	3046 */

	3047 static int readDbPage(PgHdr *pPg, u32 iFrame){

	3048 Pager pPager = pPg->pPager; / Pager object associated with page pPg */

	3049 Pgno pgno = pPg->pgno; /* Page number to read */

	3050 int rc = SQLITE_OK; /* Return code */

	3051 int pgsz = pPager->pageSize; /* Number of bytes to read */

	3052

	3053 assert( pPager->eState>=PAGER_READER && !MEMDB );

	3054 assert( isOpen(pPager->fd) );

	3055

	3056 #ifndef SQLITE_OMIT_WAL

	3057 if( iFrame ){

	3058 /* Try to pull the page from the write-ahead log. */

	3059 rc = sqlite3WalReadFrame(pPager->pWal, iFrame, pgsz, pPg->pData);

	3060 }else

	3061 #endif

	3062 {

	3063 i64 iOffset = (pgno-1)*(i64)pPager->pageSize;

	3064 rc = sqlite3OsRead(pPager->fd, pPg->pData, pgsz, iOffset);

	3065 if( rc==SQLITE_IOERR_SHORT_READ ){

	3066 rc = SQLITE_OK;

	3067 }

	3068 }

	3069

	3070 if( pgno==1 ){

	3071 if( rc ){

	3072 /* If the read is unsuccessful, set the dbFileVers[] to something

	3073 ** that will never be a valid file version. dbFileVers[] is a copy

	3074 ** of bytes 24..39 of the database. Bytes 28..31 should always be

	3075 ** zero or the size of the database in page. Bytes 32..35 and 35..39

	3076 ** should be page numbers which are never 0xffffffff. So filling

	3077 ** pPager->dbFileVers[] with all 0xff bytes should suffice.

	3078 **

	3079 ** For an encrypted database, the situation is more complex: bytes

	3080 ** 24..39 of the database are white noise. But the probability of

	3081 ** white noise equaling 16 bytes of 0xff is vanishingly small so

	3082 ** we should still be ok.

	3083 */

	3084 memset(pPager->dbFileVers, 0xff, sizeof(pPager->dbFileVers));

	3085 }else{

	3086 u8 dbFileVers = &((u8)pPg->pData)[24];

	3087 memcpy(&pPager->dbFileVers, dbFileVers, sizeof(pPager->dbFileVers));

	3088 }

	3089 }

	3090 CODEC1(pPager, pPg->pData, pgno, 3, rc = SQLITE_NOMEM);

	3091

	3092 PAGER_INCR(sqlite3_pager_readdb_count);

	3093 PAGER_INCR(pPager->nRead);

	3094 IOTRACE(("PGIN %p %d\n", pPager, pgno));

	3095 PAGERTRACE(("FETCH %d page %d hash(%08x)\n",

	3096 PAGERID(pPager), pgno, pager_pagehash(pPg)));

	3097

	3098 return rc;

	3099 }

	3100

	3101 /*

	3102 ** Update the value of the change-counter at offsets 24 and 92 in

	3103 ** the header and the sqlite version number at offset 96.

	3104 **

	3105 ** This is an unconditional update. See also the pager_incr_changecounter()

	3106 ** routine which only updates the change-counter if the update is actually

	3107 ** needed, as determined by the pPager->changeCountDone state variable.

	3108 */

	3109 static void pager_write_changecounter(PgHdr *pPg){

	3110 u32 change_counter;

	3111

	3112 /* Increment the value just read and write it back to byte 24. */

	3113 change_counter = sqlite3Get4byte((u8*)pPg->pPager->dbFileVers)+1;

	3114 put32bits(((char*)pPg->pData)+24, change_counter);

	3115

	3116 /* Also store the SQLite version number in bytes 96..99 and in

	3117 ** bytes 92..95 store the change counter for which the version number

	3118 ** is valid. */

	3119 put32bits(((char*)pPg->pData)+92, change_counter);

	3120 put32bits(((char*)pPg->pData)+96, SQLITE_VERSION_NUMBER);

	3121 }

	3122

	3123 #ifndef SQLITE_OMIT_WAL

	3124 /*

	3125 ** This function is invoked once for each page that has already been

	3126 ** written into the log file when a WAL transaction is rolled back.

	3127 ** Parameter iPg is the page number of said page. The pCtx argument

	3128 ** is actually a pointer to the Pager structure.

	3129 **

	3130 ** If page iPg is present in the cache, and has no outstanding references,

	3131 ** it is discarded. Otherwise, if there are one or more outstanding

	3132 ** references, the page content is reloaded from the database. If the

	3133 ** attempt to reload content from the database is required and fails,

	3134 ** return an SQLite error code. Otherwise, SQLITE_OK.

	3135 */

	3136 static int pagerUndoCallback(void *pCtx, Pgno iPg){

	3137 int rc = SQLITE_OK;

	3138 Pager pPager = (Pager )pCtx;

	3139 PgHdr *pPg;

	3140

	3141 assert( pagerUseWal(pPager) );

	3142 pPg = sqlite3PagerLookup(pPager, iPg);

	3143 if( pPg ){

	3144 if( sqlite3PcachePageRefcount(pPg)==1 ){

	3145 sqlite3PcacheDrop(pPg);

	3146 }else{

	3147 u32 iFrame = 0;

	3148 rc = sqlite3WalFindFrame(pPager->pWal, pPg->pgno, &iFrame);

	3149 if( rc==SQLITE_OK ){

	3150 rc = readDbPage(pPg, iFrame);

	3151 }

	3152 if( rc==SQLITE_OK ){

	3153 pPager->xReiniter(pPg);

	3154 }

	3155 sqlite3PagerUnrefNotNull(pPg);

	3156 }

	3157 }

	3158

	3159 /* Normally, if a transaction is rolled back, any backup processes are

	3160 ** updated as data is copied out of the rollback journal and into the

	3161 ** database. This is not generally possible with a WAL database, as

	3162 ** rollback involves simply truncating the log file. Therefore, if one

	3163 ** or more frames have already been written to the log (and therefore

	3164 ** also copied into the backup databases) as part of this transaction,

	3165 ** the backups must be restarted.

	3166 */

	3167 sqlite3BackupRestart(pPager->pBackup);

	3168

	3169 return rc;

	3170 }

	3171

	3172 /*

	3173 ** This function is called to rollback a transaction on a WAL database.

	3174 */

	3175 static int pagerRollbackWal(Pager *pPager){

	3176 int rc; /* Return Code */

	3177 PgHdr pList; / List of dirty pages to revert */

	3178

	3179 /* For all pages in the cache that are currently dirty or have already

	3180 ** been written (but not committed) to the log file, do one of the

	3181 ** following:

	3182 **

	3183 ** + Discard the cached page (if refcount==0), or

	3184 ** + Reload page content from the database (if refcount>0).

	3185 */

	3186 pPager->dbSize = pPager->dbOrigSize;

	3187 rc = sqlite3WalUndo(pPager->pWal, pagerUndoCallback, (void *)pPager);

	3188 pList = sqlite3PcacheDirtyList(pPager->pPCache);

	3189 while( pList && rc==SQLITE_OK ){

	3190 PgHdr *pNext = pList->pDirty;

	3191 rc = pagerUndoCallback((void *)pPager, pList->pgno);

	3192 pList = pNext;

	3193 }

	3194

	3195 return rc;

	3196 }

	3197

	3198 /*

	3199 ** This function is a wrapper around sqlite3WalFrames(). As well as logging

	3200 ** the contents of the list of pages headed by pList (connected by pDirty),

	3201 ** this function notifies any active backup processes that the pages have

	3202 ** changed.

	3203 **

	3204 ** The list of pages passed into this routine is always sorted by page number.

	3205 ** Hence, if page 1 appears anywhere on the list, it will be the first page.

	3206 */

	3207 static int pagerWalFrames(

	3208 Pager pPager, / Pager object */

	3209 PgHdr pList, / List of frames to log */

	3210 Pgno nTruncate, /* Database size after this commit */

	3211 int isCommit /* True if this is a commit */

	3212 ){

	3213 int rc; /* Return code */

	3214 int nList; /* Number of pages in pList */

	3215 PgHdr p; / For looping over pages */

	3216

	3217 assert( pPager->pWal );

	3218 assert( pList );

	3219 #ifdef SQLITE_DEBUG

	3220 /* Verify that the page list is in accending order */

	3221 for(p=pList; p && p->pDirty; p=p->pDirty){

	3222 assert( p->pgno < p->pDirty->pgno );

	3223 }

	3224 #endif

	3225

	3226 assert( pList->pDirty==0 \|\| isCommit );

	3227 if( isCommit ){

	3228 /* If a WAL transaction is being committed, there is no point in writing

	3229 ** any pages with page numbers greater than nTruncate into the WAL file.

	3230 ** They will never be read by any client. So remove them from the pDirty

	3231 ** list here. */

	3232 PgHdr **ppNext = &pList;

	3233 nList = 0;

	3234 for(p=pList; (*ppNext = p)!=0; p=p->pDirty){

	3235 if( p->pgno<=nTruncate ){

	3236 ppNext = &p->pDirty;

	3237 nList++;

	3238 }

	3239 }

	3240 assert( pList );

	3241 }else{

	3242 nList = 1;

	3243 }

	3244 pPager->aStat[PAGER_STAT_WRITE] += nList;

	3245

	3246 if( pList->pgno==1 ) pager_write_changecounter(pList);

	3247 rc = sqlite3WalFrames(pPager->pWal,

	3248 pPager->pageSize, pList, nTruncate, isCommit, pPager->walSyncFlags

	3249 );

	3250 if( rc==SQLITE_OK && pPager->pBackup ){

	3251 for(p=pList; p; p=p->pDirty){

	3252 sqlite3BackupUpdate(pPager->pBackup, p->pgno, (u8 *)p->pData);

	3253 }

	3254 }

	3255

	3256 #ifdef SQLITE_CHECK_PAGES

	3257 pList = sqlite3PcacheDirtyList(pPager->pPCache);

	3258 for(p=pList; p; p=p->pDirty){

	3259 pager_set_pagehash(p);

	3260 }

	3261 #endif

	3262

	3263 return rc;

	3264 }

	3265

	3266 /*

	3267 ** Begin a read transaction on the WAL.

	3268 **

	3269 ** This routine used to be called "pagerOpenSnapshot()" because it essentially

	3270 ** makes a snapshot of the database at the current point in time and preserves

	3271 ** that snapshot for use by the reader in spite of concurrently changes by

	3272 ** other writers or checkpointers.

	3273 */

	3274 static int pagerBeginReadTransaction(Pager *pPager){

	3275 int rc; /* Return code */

	3276 int changed = 0; /* True if cache must be reset */

	3277

	3278 assert( pagerUseWal(pPager) );

	3279 assert( pPager->eState==PAGER_OPEN \|\| pPager->eState==PAGER_READER );

	3280

	3281 /* sqlite3WalEndReadTransaction() was not called for the previous

	3282 ** transaction in locking_mode=EXCLUSIVE. So call it now. If we

	3283 ** are in locking_mode=NORMAL and EndRead() was previously called,

	3284 ** the duplicate call is harmless.

	3285 */

	3286 sqlite3WalEndReadTransaction(pPager->pWal);

	3287

	3288 rc = sqlite3WalBeginReadTransaction(pPager->pWal, &changed);

	3289 if( rc!=SQLITE_OK \|\| changed ){

	3290 pager_reset(pPager);

	3291 if( USEFETCH(pPager) ) sqlite3OsUnfetch(pPager->fd, 0, 0);

	3292 }

	3293

	3294 return rc;

	3295 }

	3296 #endif

	3297

	3298 /*

	3299 ** This function is called as part of the transition from PAGER_OPEN

	3300 ** to PAGER_READER state to determine the size of the database file

	3301 ** in pages (assuming the page size currently stored in Pager.pageSize).

	3302 **

	3303 ** If no error occurs, SQLITE_OK is returned and the size of the database

	3304 ** in pages is stored in *pnPage. Otherwise, an error code (perhaps

	3305 ** SQLITE_IOERR_FSTAT) is returned and *pnPage is left unmodified.

	3306 */

	3307 static int pagerPagecount(Pager pPager, Pgno pnPage){

	3308 Pgno nPage; /* Value to return via pnPage /

	3309

	3310 /* Query the WAL sub-system for the database size. The WalDbsize()

	3311 ** function returns zero if the WAL is not open (i.e. Pager.pWal==0), or

	3312 ** if the database size is not available. The database size is not

	3313 ** available from the WAL sub-system if the log file is empty or

	3314 ** contains no valid committed transactions.

	3315 */

	3316 assert( pPager->eState==PAGER_OPEN );

	3317 assert( pPager->eLock>=SHARED_LOCK );

	3318 nPage = sqlite3WalDbsize(pPager->pWal);

	3319

	3320 /* If the number of pages in the database is not available from the

	3321 ** WAL sub-system, determine the page counte based on the size of

	3322 ** the database file. If the size of the database file is not an

	3323 ** integer multiple of the page-size, round up the result.

	3324 */

	3325 if( nPage==0 ){

	3326 i64 n = 0; /* Size of db file in bytes */

	3327 assert( isOpen(pPager->fd) \|\| pPager->tempFile );

	3328 if( isOpen(pPager->fd) ){

	3329 int rc = sqlite3OsFileSize(pPager->fd, &n);

	3330 if( rc!=SQLITE_OK ){

	3331 return rc;

	3332 }

	3333 }

	3334 nPage = (Pgno)((n+pPager->pageSize-1) / pPager->pageSize);

	3335 }

	3336

	3337 /* If the current number of pages in the file is greater than the

	3338 ** configured maximum pager number, increase the allowed limit so

	3339 ** that the file can be read.

	3340 */

	3341 if( nPage>pPager->mxPgno ){

	3342 pPager->mxPgno = (Pgno)nPage;

	3343 }

	3344

	3345 *pnPage = nPage;

	3346 return SQLITE_OK;

	3347 }

	3348

	3349 #ifndef SQLITE_OMIT_WAL

	3350 /*

	3351 ** Check if the *-wal file that corresponds to the database opened by pPager

	3352 ** exists if the database is not empy, or verify that the *-wal file does

	3353 ** not exist (by deleting it) if the database file is empty.

	3354 **

	3355 ** If the database is not empty and the *-wal file exists, open the pager

	3356 ** in WAL mode. If the database is empty or if no *-wal file exists and

	3357 ** if no error occurs, make sure Pager.journalMode is not set to

	3358 ** PAGER_JOURNALMODE_WAL.

	3359 **

	3360 ** Return SQLITE_OK or an error code.

	3361 **

	3362 ** The caller must hold a SHARED lock on the database file to call this

	3363 ** function. Because an EXCLUSIVE lock on the db file is required to delete

	3364 ** a WAL on a none-empty database, this ensures there is no race condition

	3365 ** between the xAccess() below and an xDelete() being executed by some

	3366 ** other connection.

	3367 */

	3368 static int pagerOpenWalIfPresent(Pager *pPager){

	3369 int rc = SQLITE_OK;

	3370 assert( pPager->eState==PAGER_OPEN );

	3371 assert( pPager->eLock>=SHARED_LOCK );

	3372

	3373 if( !pPager->tempFile ){

	3374 int isWal; /* True if WAL file exists */

	3375 Pgno nPage; /* Size of the database file */

	3376

	3377 rc = pagerPagecount(pPager, &nPage);

	3378 if( rc ) return rc;

	3379 if( nPage==0 ){

	3380 rc = sqlite3OsDelete(pPager->pVfs, pPager->zWal, 0);

	3381 if( rc==SQLITE_IOERR_DELETE_NOENT ) rc = SQLITE_OK;

	3382 isWal = 0;

	3383 }else{

	3384 rc = sqlite3OsAccess(

	3385 pPager->pVfs, pPager->zWal, SQLITE_ACCESS_EXISTS, &isWal

	3386 );

	3387 }

	3388 if( rc==SQLITE_OK ){

	3389 if( isWal ){

	3390 testcase( sqlite3PcachePagecount(pPager->pPCache)==0 );

	3391 rc = sqlite3PagerOpenWal(pPager, 0);

	3392 }else if( pPager->journalMode==PAGER_JOURNALMODE_WAL ){

	3393 pPager->journalMode = PAGER_JOURNALMODE_DELETE;

	3394 }

	3395 }

	3396 }

	3397 return rc;

	3398 }

	3399 #endif

	3400

	3401 /*

	3402 ** Playback savepoint pSavepoint. Or, if pSavepoint==NULL, then playback

	3403 ** the entire master journal file. The case pSavepoint==NULL occurs when

	3404 ** a ROLLBACK TO command is invoked on a SAVEPOINT that is a transaction

	3405 ** savepoint.

	3406 **

	3407 ** When pSavepoint is not NULL (meaning a non-transaction savepoint is

	3408 ** being rolled back), then the rollback consists of up to three stages,

	3409 ** performed in the order specified:

	3410 **

	3411 ** * Pages are played back from the main journal starting at byte

	3412 ** offset PagerSavepoint.iOffset and continuing to

	3413 ** PagerSavepoint.iHdrOffset, or to the end of the main journal

	3414 ** file if PagerSavepoint.iHdrOffset is zero.

	3415 **

	3416 ** * If PagerSavepoint.iHdrOffset is not zero, then pages are played

	3417 ** back starting from the journal header immediately following

	3418 ** PagerSavepoint.iHdrOffset to the end of the main journal file.

	3419 **

	3420 ** * Pages are then played back from the sub-journal file, starting

	3421 ** with the PagerSavepoint.iSubRec and continuing to the end of

	3422 ** the journal file.

	3423 **

	3424 ** Throughout the rollback process, each time a page is rolled back, the

	3425 ** corresponding bit is set in a bitvec structure (variable pDone in the

	3426 ** implementation below). This is used to ensure that a page is only

	3427 ** rolled back the first time it is encountered in either journal.

	3428 **

	3429 ** If pSavepoint is NULL, then pages are only played back from the main

	3430 ** journal file. There is no need for a bitvec in this case.

	3431 **

	3432 ** In either case, before playback commences the Pager.dbSize variable

	3433 ** is reset to the value that it held at the start of the savepoint

	3434 ** (or transaction). No page with a page-number greater than this value

	3435 ** is played back. If one is encountered it is simply skipped.

	3436 */

	3437 static int pagerPlaybackSavepoint(Pager pPager, PagerSavepoint pSavepoint){

	3438 i64 szJ; /* Effective size of the main journal */

	3439 i64 iHdrOff; /* End of first segment of main-journal records */

	3440 int rc = SQLITE_OK; /* Return code */

	3441 Bitvec pDone = 0; / Bitvec to ensure pages played back only once */

	3442

	3443 assert( pPager->eState!=PAGER_ERROR );

	3444 assert( pPager->eState>=PAGER_WRITER_LOCKED );

	3445

	3446 /* Allocate a bitvec to use to store the set of pages rolled back */

	3447 if( pSavepoint ){

	3448 pDone = sqlite3BitvecCreate(pSavepoint->nOrig);

	3449 if( !pDone ){

	3450 return SQLITE_NOMEM;

	3451 }

	3452 }

	3453

	3454 /* Set the database size back to the value it was before the savepoint

	3455 ** being reverted was opened.

	3456 */

	3457 pPager->dbSize = pSavepoint ? pSavepoint->nOrig : pPager->dbOrigSize;

	3458 pPager->changeCountDone = pPager->tempFile;

	3459

	3460 if( !pSavepoint && pagerUseWal(pPager) ){

	3461 return pagerRollbackWal(pPager);

	3462 }

	3463

	3464 /* Use pPager->journalOff as the effective size of the main rollback

	3465 ** journal. The actual file might be larger than this in

	3466 ** PAGER_JOURNALMODE_TRUNCATE or PAGER_JOURNALMODE_PERSIST. But anything

	3467 ** past pPager->journalOff is off-limits to us.

	3468 */

	3469 szJ = pPager->journalOff;

	3470 assert( pagerUseWal(pPager)==0 \|\| szJ==0 );

	3471

	3472 /* Begin by rolling back records from the main journal starting at

	3473 ** PagerSavepoint.iOffset and continuing to the next journal header.

	3474 ** There might be records in the main journal that have a page number

	3475 ** greater than the current database size (pPager->dbSize) but those

	3476 ** will be skipped automatically. Pages are added to pDone as they

	3477 ** are played back.

	3478 */

	3479 if( pSavepoint && !pagerUseWal(pPager) ){

	3480 iHdrOff = pSavepoint->iHdrOffset ? pSavepoint->iHdrOffset : szJ;

	3481 pPager->journalOff = pSavepoint->iOffset;

	3482 while( rc==SQLITE_OK && pPager->journalOff<iHdrOff ){

	3483 rc = pager_playback_one_page(pPager, &pPager->journalOff, pDone, 1, 1);

	3484 }

	3485 assert( rc!=SQLITE_DONE );

	3486 }else{

	3487 pPager->journalOff = 0;

	3488 }

	3489

	3490 /* Continue rolling back records out of the main journal starting at

	3491 ** the first journal header seen and continuing until the effective end

	3492 ** of the main journal file. Continue to skip out-of-range pages and

	3493 ** continue adding pages rolled back to pDone.

	3494 */

	3495 while( rc==SQLITE_OK && pPager->journalOff<szJ ){

	3496 u32 ii; /* Loop counter */

	3497 u32 nJRec = 0; /* Number of Journal Records */

	3498 u32 dummy;

	3499 rc = readJournalHdr(pPager, 0, szJ, &nJRec, &dummy);

	3500 assert( rc!=SQLITE_DONE );

	3501

	3502 /*

	3503 ** The "pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff"

	3504 ** test is related to ticket #2565. See the discussion in the

	3505 ** pager_playback() function for additional information.

	3506 */

	3507 if( nJRec==0

	3508 && pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff

	3509 ){

	3510 nJRec = (u32)((szJ - pPager->journalOff)/JOURNAL_PG_SZ(pPager));

	3511 }

	3512 for(ii=0; rc==SQLITE_OK && ii<nJRec && pPager->journalOff<szJ; ii++){

	3513 rc = pager_playback_one_page(pPager, &pPager->journalOff, pDone, 1, 1);

	3514 }

	3515 assert( rc!=SQLITE_DONE );

	3516 }

	3517 assert( rc!=SQLITE_OK \|\| pPager->journalOff>=szJ );

	3518

	3519 /* Finally, rollback pages from the sub-journal. Page that were

	3520 ** previously rolled back out of the main journal (and are hence in pDone)

	3521 ** will be skipped. Out-of-range pages are also skipped.

	3522 */

	3523 if( pSavepoint ){

	3524 u32 ii; /* Loop counter */

	3525 i64 offset = (i64)pSavepoint->iSubRec*(4+pPager->pageSize);

	3526

	3527 if( pagerUseWal(pPager) ){

	3528 rc = sqlite3WalSavepointUndo(pPager->pWal, pSavepoint->aWalData);

	3529 }

	3530 for(ii=pSavepoint->iSubRec; rc==SQLITE_OK && ii<pPager->nSubRec; ii++){

	3531 assert( offset==(i64)ii*(4+pPager->pageSize) );

	3532 rc = pager_playback_one_page(pPager, &offset, pDone, 0, 1);

	3533 }

	3534 assert( rc!=SQLITE_DONE );

	3535 }

	3536

	3537 sqlite3BitvecDestroy(pDone);

	3538 if( rc==SQLITE_OK ){

	3539 pPager->journalOff = szJ;

	3540 }

	3541

	3542 return rc;

	3543 }

	3544

	3545 /*

	3546 ** Change the maximum number of in-memory pages that are allowed

	3547 ** before attempting to recycle clean and unused pages.

	3548 */

	3549 SQLITE_PRIVATE void sqlite3PagerSetCachesize(Pager *pPager, int mxPage){

	3550 sqlite3PcacheSetCachesize(pPager->pPCache, mxPage);

	3551 }

	3552

	3553 /*

	3554 ** Change the maximum number of in-memory pages that are allowed

	3555 ** before attempting to spill pages to journal.

	3556 */

	3557 SQLITE_PRIVATE int sqlite3PagerSetSpillsize(Pager *pPager, int mxPage){

	3558 return sqlite3PcacheSetSpillsize(pPager->pPCache, mxPage);

	3559 }

	3560

	3561 /*

	3562 ** Invoke SQLITE_FCNTL_MMAP_SIZE based on the current value of szMmap.

	3563 */

	3564 static void pagerFixMaplimit(Pager *pPager){

	3565 #if SQLITE_MAX_MMAP_SIZE>0

	3566 sqlite3_file *fd = pPager->fd;

	3567 if( isOpen(fd) && fd->pMethods->iVersion>=3 ){

	3568 sqlite3_int64 sz;

	3569 sz = pPager->szMmap;

	3570 pPager->bUseFetch = (sz>0);

	3571 sqlite3OsFileControlHint(pPager->fd, SQLITE_FCNTL_MMAP_SIZE, &sz);

	3572 }

	3573 #endif

	3574 }

	3575

	3576 /*

	3577 ** Change the maximum size of any memory mapping made of the database file.

	3578 */

	3579 SQLITE_PRIVATE void sqlite3PagerSetMmapLimit(Pager *pPager, sqlite3_int64 szMmap ){

	3580 pPager->szMmap = szMmap;

	3581 pagerFixMaplimit(pPager);

	3582 }

	3583

	3584 /*

	3585 ** Free as much memory as possible from the pager.

	3586 */

	3587 SQLITE_PRIVATE void sqlite3PagerShrink(Pager *pPager){

	3588 sqlite3PcacheShrink(pPager->pPCache);

	3589 }

	3590

	3591 /*

	3592 ** Adjust settings of the pager to those specified in the pgFlags parameter.

	3593 **

	3594 ** The "level" in pgFlags & PAGER_SYNCHRONOUS_MASK sets the robustness

	3595 ** of the database to damage due to OS crashes or power failures by

	3596 ** changing the number of sync()s when writing the journals.

	3597 ** There are three levels:

	3598 **

	3599 ** OFF sqlite3OsSync() is never called. This is the default

	3600 ** for temporary and transient files.

	3601 **

	3602 ** NORMAL The journal is synced once before writes begin on the

	3603 ** database. This is normally adequate protection, but

	3604 ** it is theoretically possible, though very unlikely,

	3605 ** that an inopportune power failure could leave the journal

	3606 ** in a state which would cause damage to the database

	3607 ** when it is rolled back.

	3608 **

	3609 ** FULL The journal is synced twice before writes begin on the

	3610 ** database (with some additional information - the nRec field

	3611 ** of the journal header - being written in between the two

	3612 ** syncs). If we assume that writing a

	3613 ** single disk sector is atomic, then this mode provides

	3614 ** assurance that the journal will not be corrupted to the

	3615 ** point of causing damage to the database during rollback.

	3616 **

	3617 ** The above is for a rollback-journal mode. For WAL mode, OFF continues

	3618 ** to mean that no syncs ever occur. NORMAL means that the WAL is synced

	3619 ** prior to the start of checkpoint and that the database file is synced

	3620 ** at the conclusion of the checkpoint if the entire content of the WAL

	3621 ** was written back into the database. But no sync operations occur for

	3622 ** an ordinary commit in NORMAL mode with WAL. FULL means that the WAL

	3623 ** file is synced following each commit operation, in addition to the

	3624 ** syncs associated with NORMAL.

	3625 **

	3626 ** Do not confuse synchronous=FULL with SQLITE_SYNC_FULL. The

	3627 ** SQLITE_SYNC_FULL macro means to use the MacOSX-style full-fsync

	3628 ** using fcntl(F_FULLFSYNC). SQLITE_SYNC_NORMAL means to do an

	3629 ** ordinary fsync() call. There is no difference between SQLITE_SYNC_FULL

	3630 ** and SQLITE_SYNC_NORMAL on platforms other than MacOSX. But the

	3631 ** synchronous=FULL versus synchronous=NORMAL setting determines when

	3632 ** the xSync primitive is called and is relevant to all platforms.

	3633 **

	3634 ** Numeric values associated with these states are OFF==1, NORMAL=2,

	3635 ** and FULL=3.

	3636 */

	3637 #ifndef SQLITE_OMIT_PAGER_PRAGMAS

	3638 SQLITE_PRIVATE void sqlite3PagerSetFlags(

	3639 Pager pPager, / The pager to set safety level for */

	3640 unsigned pgFlags /* Various flags */

	3641 ){

	3642 unsigned level = pgFlags & PAGER_SYNCHRONOUS_MASK;

	3643 assert( level>=1 && level<=3 );

	3644 pPager->noSync = (level==1 \|\| pPager->tempFile) ?1:0;

	3645 pPager->fullSync = (level==3 && !pPager->tempFile) ?1:0;

	3646 if( pPager->noSync ){

	3647 pPager->syncFlags = 0;

	3648 pPager->ckptSyncFlags = 0;

	3649 }else if( pgFlags & PAGER_FULLFSYNC ){

	3650 pPager->syncFlags = SQLITE_SYNC_FULL;

	3651 pPager->ckptSyncFlags = SQLITE_SYNC_FULL;

	3652 }else if( pgFlags & PAGER_CKPT_FULLFSYNC ){

	3653 pPager->syncFlags = SQLITE_SYNC_NORMAL;

	3654 pPager->ckptSyncFlags = SQLITE_SYNC_FULL;

	3655 }else{

	3656 pPager->syncFlags = SQLITE_SYNC_NORMAL;

	3657 pPager->ckptSyncFlags = SQLITE_SYNC_NORMAL;

	3658 }

	3659 pPager->walSyncFlags = pPager->syncFlags;

	3660 if( pPager->fullSync ){

	3661 pPager->walSyncFlags \|= WAL_SYNC_TRANSACTIONS;

	3662 }

	3663 if( pgFlags & PAGER_CACHESPILL ){

	3664 pPager->doNotSpill &= ~SPILLFLAG_OFF;

	3665 }else{

	3666 pPager->doNotSpill \|= SPILLFLAG_OFF;

	3667 }

	3668 }

	3669 #endif

	3670

	/*
	** The following global variable is incremented whenever the library
	** attempts to open a temporary file.  This information is used for
	** testing and analysis only, and the variable exists only in
	** SQLITE_TEST builds.
	*/
	#ifdef SQLITE_TEST
	SQLITE_API int sqlite3_opentemp_count = 0;
	#endif

	3679

	3680 /*

	3681 ** Open a temporary file.

	3682 **

	3683 ** Write the file descriptor into *pFile. Return SQLITE_OK on success

	3684 ** or some other error code if we fail. The OS will automatically

	3685 ** delete the temporary file when it is closed.

	3686 **

	3687 ** The flags passed to the VFS layer xOpen() call are those specified

	3688 ** by parameter vfsFlags ORed with the following:

	3689 **

	3690 ** SQLITE_OPEN_READWRITE

	3691 ** SQLITE_OPEN_CREATE

	3692 ** SQLITE_OPEN_EXCLUSIVE

	3693 ** SQLITE_OPEN_DELETEONCLOSE

	3694 */

	3695 static int pagerOpentemp(

	3696 Pager pPager, / The pager object */

	3697 sqlite3_file pFile, / Write the file descriptor here */

	3698 int vfsFlags /* Flags passed through to the VFS */

	3699 ){

	3700 int rc; /* Return code */

	3701

	3702 #ifdef SQLITE_TEST

	3703 sqlite3_opentemp_count++; /* Used for testing and analysis only */

	3704 #endif

	3705

	3706 vfsFlags \|= SQLITE_OPEN_READWRITE \| SQLITE_OPEN_CREATE \|

	3707 SQLITE_OPEN_EXCLUSIVE \| SQLITE_OPEN_DELETEONCLOSE;

	3708 rc = sqlite3OsOpen(pPager->pVfs, 0, pFile, vfsFlags, 0);

	3709 assert( rc!=SQLITE_OK \|\| isOpen(pFile) );

	3710 return rc;

	3711 }

	3712

	3713 /*

	3714 ** Set the busy handler function.

	3715 **

	3716 ** The pager invokes the busy-handler if sqlite3OsLock() returns

	3717 ** SQLITE_BUSY when trying to upgrade from no-lock to a SHARED lock,

	3718 ** or when trying to upgrade from a RESERVED lock to an EXCLUSIVE

	3719 ** lock. It does not invoke the busy handler when upgrading from

	3720 ** SHARED to RESERVED, or when upgrading from SHARED to EXCLUSIVE

	3721 ** (which occurs during hot-journal rollback). Summary:

	3722 **

	3723 ** Transition \| Invokes xBusyHandler

	3724 ** --------------------------------------------------------

	3725 ** NO_LOCK -> SHARED_LOCK \| Yes

	3726 ** SHARED_LOCK -> RESERVED_LOCK \| No

	3727 ** SHARED_LOCK -> EXCLUSIVE_LOCK \| No

	3728 ** RESERVED_LOCK -> EXCLUSIVE_LOCK \| Yes

	3729 **

	3730 ** If the busy-handler callback returns non-zero, the lock is

	3731 ** retried. If it returns zero, then the SQLITE_BUSY error is

	3732 ** returned to the caller of the pager API function.

	3733 */

	3734 SQLITE_PRIVATE void sqlite3PagerSetBusyhandler(

	3735 Pager pPager, / Pager object */

	3736 int (xBusyHandler)(void ), /* Pointer to busy-handler function */

	3737 void pBusyHandlerArg / Argument to pass to xBusyHandler */

	3738 ){

	3739 pPager->xBusyHandler = xBusyHandler;

	3740 pPager->pBusyHandlerArg = pBusyHandlerArg;

	3741

	3742 if( isOpen(pPager->fd) ){

	3743 void ap = (void )&pPager->xBusyHandler;

	3744 assert( ((int()(void ))(ap[0]))==xBusyHandler );

	3745 assert( ap[1]==pBusyHandlerArg );

	3746 sqlite3OsFileControlHint(pPager->fd, SQLITE_FCNTL_BUSYHANDLER, (void *)ap);

	3747 }

	3748 }

	3749

	3750 /*

	3751 ** Change the page size used by the Pager object. The new page size

	3752 ** is passed in *pPageSize.

	3753 **

	3754 ** If the pager is in the error state when this function is called, it

	3755 ** is a no-op. The value returned is the error state error code (i.e.

	3756 ** one of SQLITE_IOERR, an SQLITE_IOERR_xxx sub-code or SQLITE_FULL).

	3757 **

	3758 ** Otherwise, if all of the following are true:

	3759 **

	3760 ** * the new page size (value of *pPageSize) is valid (a power

	3761 ** of two between 512 and SQLITE_MAX_PAGE_SIZE, inclusive), and

	3762 **

	3763 ** * there are no outstanding page references, and

	3764 **

	3765 ** * the database is either not an in-memory database or it is

	3766 ** an in-memory database that currently consists of zero pages.

	3767 **

	3768 ** then the pager object page size is set to *pPageSize.

	3769 **

	3770 ** If the page size is changed, then this function uses sqlite3PagerMalloc()

	3771 ** to obtain a new Pager.pTmpSpace buffer. If this allocation attempt

	3772 ** fails, SQLITE_NOMEM is returned and the page size remains unchanged.

	3773 ** In all other cases, SQLITE_OK is returned.

	3774 **

	3775 ** If the page size is not changed, either because one of the enumerated

	3776 ** conditions above is not true, the pager was in error state when this

	3777 ** function was called, or because the memory allocation attempt failed,

	3778 ** then *pPageSize is set to the old, retained page size before returning.

	3779 */

	3780 SQLITE_PRIVATE int sqlite3PagerSetPagesize(Pager pPager, u32 pPageSize, int nR eserve){

	3781 int rc = SQLITE_OK;

	3782

	3783 /* It is not possible to do a full assert_pager_state() here, as this

	3784 ** function may be called from within PagerOpen(), before the state

	3785 ** of the Pager object is internally consistent.

	3786 **

	3787 ** At one point this function returned an error if the pager was in

	3788 ** PAGER_ERROR state. But since PAGER_ERROR state guarantees that

	3789 ** there is at least one outstanding page reference, this function

	3790 ** is a no-op for that case anyhow.

	3791 */

	3792

	3793 u32 pageSize = *pPageSize;

	3794 assert( pageSize==0 \|\| (pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE) );

	3795 if( (pPager->memDb==0 \|\| pPager->dbSize==0)

	3796 && sqlite3PcacheRefCount(pPager->pPCache)==0

	3797 && pageSize && pageSize!=(u32)pPager->pageSize

	3798 ){

	3799 char pNew = NULL; / New temp space */

	3800 i64 nByte = 0;

	3801

	3802 if( pPager->eState>PAGER_OPEN && isOpen(pPager->fd) ){

	3803 rc = sqlite3OsFileSize(pPager->fd, &nByte);

	3804 }

	3805 if( rc==SQLITE_OK ){

	3806 pNew = (char *)sqlite3PageMalloc(pageSize);

	3807 if( !pNew ) rc = SQLITE_NOMEM;

	3808 }

	3809

	3810 if( rc==SQLITE_OK ){

	3811 pager_reset(pPager);

	3812 rc = sqlite3PcacheSetPageSize(pPager->pPCache, pageSize);

	3813 }

	3814 if( rc==SQLITE_OK ){

	3815 sqlite3PageFree(pPager->pTmpSpace);

	3816 pPager->pTmpSpace = pNew;

	3817 pPager->dbSize = (Pgno)((nByte+pageSize-1)/pageSize);

	3818 pPager->pageSize = pageSize;

	3819 }else{

	3820 sqlite3PageFree(pNew);

	3821 }

	3822 }

	3823

	3824 *pPageSize = pPager->pageSize;

	3825 if( rc==SQLITE_OK ){

	3826 if( nReserve<0 ) nReserve = pPager->nReserve;

	3827 assert( nReserve>=0 && nReserve<1000 );

	3828 pPager->nReserve = (i16)nReserve;

	3829 pagerReportSize(pPager);

	3830 pagerFixMaplimit(pPager);

	3831 }

	3832 return rc;

	3833 }

	3834

	3835 /*

	3836 ** Return a pointer to the "temporary page" buffer held internally

	3837 ** by the pager. This is a buffer that is big enough to hold the

	3838 ** entire content of a database page. This buffer is used internally

	3839 ** during rollback and will be overwritten whenever a rollback

	3840 ** occurs. But other modules are free to use it too, as long as

	3841 ** no rollbacks are happening.

	3842 */

	3843 SQLITE_PRIVATE void sqlite3PagerTempSpace(Pager pPager){

	3844 return pPager->pTmpSpace;

	3845 }

	3846

	3847 /*

	3848 ** Attempt to set the maximum database page count if mxPage is positive.

	3849 ** Make no changes if mxPage is zero or negative. And never reduce the

	3850 ** maximum page count below the current size of the database.

	3851 **

	3852 ** Regardless of mxPage, return the current maximum page count.

	3853 */

	3854 SQLITE_PRIVATE int sqlite3PagerMaxPageCount(Pager *pPager, int mxPage){

	3855 if( mxPage>0 ){

	3856 pPager->mxPgno = mxPage;

	3857 }

	3858 assert( pPager->eState!=PAGER_OPEN ); /* Called only by OP_MaxPgcnt */

	3859 assert( pPager->mxPgno>=pPager->dbSize ); /* OP_MaxPgcnt enforces this */

	3860 return pPager->mxPgno;

	3861 }

	3862

	/*
	** The following set of routines are used to disable the simulated
	** I/O error mechanism.  These routines are used to avoid simulated
	** errors in places where we do not care about errors.
	**
	** Unless -DSQLITE_TEST=1 is used, these routines are all no-ops
	** and generate no code.
	**
	** disable_simulated_io_errors() saves the current value of
	** sqlite3_io_error_pending and sets it to -1;
	** enable_simulated_io_errors() restores the saved value.  There is a
	** single save slot, so disable/enable pairs do not nest.
	*/
	#ifdef SQLITE_TEST
	SQLITE_API extern int sqlite3_io_error_pending;
	SQLITE_API extern int sqlite3_io_error_hit;
	static int saved_cnt;
	void disable_simulated_io_errors(void){
	  saved_cnt = sqlite3_io_error_pending;
	  sqlite3_io_error_pending = -1;
	}
	void enable_simulated_io_errors(void){
	  sqlite3_io_error_pending = saved_cnt;
	}
	#else
	# define disable_simulated_io_errors()
	# define enable_simulated_io_errors()
	#endif

	3886

	3887 /*

	3888 ** Read the first N bytes from the beginning of the file into memory

	3889 ** that pDest points to.

	3890 **

	3891 ** If the pager was opened on a transient file (zFilename==""), or

	3892 ** opened on a file less than N bytes in size, the output buffer is

	3893 ** zeroed and SQLITE_OK returned. The rationale for this is that this

	3894 ** function is used to read database headers, and a new transient or

	3895 ** zero sized database has a header than consists entirely of zeroes.

	3896 **

	3897 ** If any IO error apart from SQLITE_IOERR_SHORT_READ is encountered,

	3898 ** the error code is returned to the caller and the contents of the

	3899 ** output buffer undefined.

	3900 */

	3901 SQLITE_PRIVATE int sqlite3PagerReadFileheader(Pager pPager, int N, unsigned cha r pDest){

	3902 int rc = SQLITE_OK;

	3903 memset(pDest, 0, N);

	3904 assert( isOpen(pPager->fd) \|\| pPager->tempFile );

	3905

	3906 /* This routine is only called by btree immediately after creating

	3907 ** the Pager object. There has not been an opportunity to transition

	3908 ** to WAL mode yet.

	3909 */

	3910 assert( !pagerUseWal(pPager) );

	3911

	3912 if( isOpen(pPager->fd) ){

	3913 IOTRACE(("DBHDR %p 0 %d\n", pPager, N))

	3914 rc = sqlite3OsRead(pPager->fd, pDest, N, 0);

	3915 if( rc==SQLITE_IOERR_SHORT_READ ){

	3916 rc = SQLITE_OK;

	3917 }

	3918 }

	3919 return rc;

	3920 }

	3921

	3922 /*

	3923 ** This function may only be called when a read-transaction is open on

	3924 ** the pager. It returns the total number of pages in the database.

	3925 **

	3926 ** However, if the file is between 1 and <page-size> bytes in size, then

	3927 ** this is considered a 1 page file.

	3928 */

	3929 SQLITE_PRIVATE void sqlite3PagerPagecount(Pager pPager, int pnPage){

	3930 assert( pPager->eState>=PAGER_READER );

	3931 assert( pPager->eState!=PAGER_WRITER_FINISHED );

	3932 *pnPage = (int)pPager->dbSize;

	3933 }

	3934

	3935

	3936 /*

	3937 ** Try to obtain a lock of type locktype on the database file. If

	3938 ** a similar or greater lock is already held, this function is a no-op

	3939 ** (returning SQLITE_OK immediately).

	3940 **

	3941 ** Otherwise, attempt to obtain the lock using sqlite3OsLock(). Invoke

	3942 ** the busy callback if the lock is currently not available. Repeat

	3943 ** until the busy callback returns false or until the attempt to

	3944 ** obtain the lock succeeds.

	3945 **

	3946 ** Return SQLITE_OK on success and an error code if we cannot obtain

	3947 ** the lock. If the lock is obtained successfully, set the Pager.state

	3948 ** variable to locktype before returning.

	3949 */

	3950 static int pager_wait_on_lock(Pager *pPager, int locktype){

	3951 int rc; /* Return code */

	3952

	3953 /* Check that this is either a no-op (because the requested lock is

	3954 ** already held), or one of the transitions that the busy-handler

	3955 ** may be invoked during, according to the comment above

	3956 ** sqlite3PagerSetBusyhandler().

	3957 */

	3958 assert( (pPager->eLock>=locktype)

	3959 \|\| (pPager->eLock==NO_LOCK && locktype==SHARED_LOCK)

	3960 \|\| (pPager->eLock==RESERVED_LOCK && locktype==EXCLUSIVE_LOCK)

	3961 );

	3962

	3963 do {

	3964 rc = pagerLockDb(pPager, locktype);

	3965 }while( rc==SQLITE_BUSY && pPager->xBusyHandler(pPager->pBusyHandlerArg) );

	3966 return rc;

	3967 }

	3968

	/*
	** Function assertTruncateConstraint(pPager) checks that one of the
	** following is true for all dirty pages currently in the page-cache:
	**
	**   a) The page number is less than or equal to the size of the
	**      current database image, in pages, OR
	**
	**   b) if the page content were written at this time, it would not
	**      be necessary to write the current content out to the sub-journal
	**      (as determined by function subjRequiresPage()).
	**
	** If the condition asserted by this function were not true, and the
	** dirty page were to be discarded from the cache via the pagerStress()
	** routine, pagerStress() would not write the current page content to
	** the database file.  If a savepoint transaction were rolled back after
	** this happened, the correct behavior would be to restore the current
	** content of the page.  However, since this content is not present in either
	** the database file or the portion of the rollback journal and
	** sub-journal rolled back the content could not be restored and the
	** database image would become corrupt.  It is therefore fortunate that
	** this circumstance cannot arise.
	**
	** The check exists only in SQLITE_DEBUG builds; otherwise the macro
	** expands to nothing.
	*/
	#if defined(SQLITE_DEBUG)
	/* Per-page callback: every page visited must be dirty and satisfy
	** condition (a) or (b) above. */
	static void assertTruncateConstraintCb(PgHdr *pPg){
	  assert( pPg->flags&PGHDR_DIRTY );
	  assert( !subjRequiresPage(pPg) || pPg->pgno<=pPg->pPager->dbSize );
	}
	static void assertTruncateConstraint(Pager *pPager){
	  sqlite3PcacheIterateDirty(pPager->pPCache, assertTruncateConstraintCb);
	}
	#else
	# define assertTruncateConstraint(pPager)
	#endif

	4002

	4003 /*

	4004 ** Truncate the in-memory database file image to nPage pages. This

	4005 ** function does not actually modify the database file on disk. It

	4006 ** just sets the internal state of the pager object so that the

	4007 ** truncation will be done when the current transaction is committed.

	4008 **

	4009 ** This function is only called right before committing a transaction.

	4010 ** Once this function has been called, the transaction must either be

	4011 ** rolled back or committed. It is not safe to call this function and

	4012 ** then continue writing to the database.

	4013 */

	4014 SQLITE_PRIVATE void sqlite3PagerTruncateImage(Pager *pPager, Pgno nPage){

	4015 assert( pPager->dbSize>=nPage );

	4016 assert( pPager->eState>=PAGER_WRITER_CACHEMOD );

	4017 pPager->dbSize = nPage;

	4018

	4019 /* At one point the code here called assertTruncateConstraint() to

	4020 ** ensure that all pages being truncated away by this operation are,

	4021 ** if one or more savepoints are open, present in the savepoint

	4022 ** journal so that they can be restored if the savepoint is rolled

	4023 ** back. This is no longer necessary as this function is now only

	4024 ** called right before committing a transaction. So although the

	4025 ** Pager object may still have open savepoints (Pager.nSavepoint!=0),

	4026 ** they cannot be rolled back. So the assertTruncateConstraint() call

	4027 ** is no longer correct. */

	4028 }

	4029

	4030

	4031 /*

	4032 ** This function is called before attempting a hot-journal rollback. It

	4033 ** syncs the journal file to disk, then sets pPager->journalHdr to the

	4034 ** size of the journal file so that the pager_playback() routine knows

	4035 ** that the entire journal file has been synced.

	4036 **

	4037 ** Syncing a hot-journal to disk before attempting to roll it back ensures

	4038 ** that if a power-failure occurs during the rollback, the process that

	4039 ** attempts rollback following system recovery sees the same journal

	4040 ** content as this process.

	4041 **

	4042 ** If everything goes as planned, SQLITE_OK is returned. Otherwise,

	4043 ** an SQLite error code.

	4044 */

	4045 static int pagerSyncHotJournal(Pager *pPager){

	4046 int rc = SQLITE_OK;

	4047 if( !pPager->noSync ){

	4048 rc = sqlite3OsSync(pPager->jfd, SQLITE_SYNC_NORMAL);

	4049 }

	4050 if( rc==SQLITE_OK ){

	4051 rc = sqlite3OsFileSize(pPager->jfd, &pPager->journalHdr);

	4052 }

	4053 return rc;

	4054 }

	4055

	4056 /*

	4057 ** Obtain a reference to a memory mapped page object for page number pgno.

	4058 ** The new object will use the pointer pData, obtained from xFetch().

	4059 ** If successful, set *ppPage to point to the new page reference

	4060 ** and return SQLITE_OK. Otherwise, return an SQLite error code and set

	4061 ** *ppPage to zero.

	4062 **

	4063 ** Page references obtained by calling this function should be released

	4064 ** by calling pagerReleaseMapPage().

	4065 */

	4066 static int pagerAcquireMapPage(

	4067 Pager pPager, / Pager object */

	4068 Pgno pgno, /* Page number */

	4069 void pData, / xFetch()'d data for this page */

	4070 PgHdr *ppPage / OUT: Acquired page object */

	4071 ){

	4072 PgHdr p; / Memory mapped page to return */

	4073

	4074 if( pPager->pMmapFreelist ){

	4075 *ppPage = p = pPager->pMmapFreelist;

	4076 pPager->pMmapFreelist = p->pDirty;

	4077 p->pDirty = 0;

	4078 memset(p->pExtra, 0, pPager->nExtra);

	4079 }else{

	4080 ppPage = p = (PgHdr )sqlite3MallocZero(sizeof(PgHdr) + pPager->nExtra);

	4081 if( p==0 ){

	4082 sqlite3OsUnfetch(pPager->fd, (i64)(pgno-1) * pPager->pageSize, pData);

	4083 return SQLITE_NOMEM;

	4084 }

	4085 p->pExtra = (void *)&p[1];

	4086 p->flags = PGHDR_MMAP;

	4087 p->nRef = 1;

	4088 p->pPager = pPager;

	4089 }

	4090

	4091 assert( p->pExtra==(void *)&p[1] );

	4092 assert( p->pPage==0 );

	4093 assert( p->flags==PGHDR_MMAP );

	4094 assert( p->pPager==pPager );

	4095 assert( p->nRef==1 );

	4096

	4097 p->pgno = pgno;

	4098 p->pData = pData;

	4099 pPager->nMmapOut++;

	4100

	4101 return SQLITE_OK;

	4102 }

	4103

	4104 /*

	4105 ** Release a reference to page pPg. pPg must have been returned by an

	4106 ** earlier call to pagerAcquireMapPage().

	4107 */

	4108 static void pagerReleaseMapPage(PgHdr *pPg){

	4109 Pager *pPager = pPg->pPager;

	4110 pPager->nMmapOut--;

	4111 pPg->pDirty = pPager->pMmapFreelist;

	4112 pPager->pMmapFreelist = pPg;

	4113

	4114 assert( pPager->fd->pMethods->iVersion>=3 );

	4115 sqlite3OsUnfetch(pPager->fd, (i64)(pPg->pgno-1)*pPager->pageSize, pPg->pData);

	4116 }

	4117

	4118 /*

	4119 ** Free all PgHdr objects stored in the Pager.pMmapFreelist list.

	4120 */

	4121 static void pagerFreeMapHdrs(Pager *pPager){

	4122 PgHdr *p;

	4123 PgHdr *pNext;

	4124 for(p=pPager->pMmapFreelist; p; p=pNext){

	4125 pNext = p->pDirty;

	4126 sqlite3_free(p);

	4127 }

	4128 }

	4129

	4130

	4131 /*

	4132 ** Shutdown the page cache. Free all memory and close all files.

	4133 **

	4134 ** If a transaction was in progress when this routine is called, that

	4135 ** transaction is rolled back. All outstanding pages are invalidated

	4136 ** and their memory is freed. Any attempt to use a page associated

	4137 ** with this page cache after this function returns will likely

	4138 ** result in a coredump.

	4139 **

	4140 ** This function always succeeds. If a transaction is active an attempt

	4141 ** is made to roll it back. If an error occurs during the rollback

	4142 ** a hot journal may be left in the filesystem but no error is returned

	4143 ** to the caller.

	4144 */

	4145 SQLITE_PRIVATE int sqlite3PagerClose(Pager *pPager){

	4146 u8 pTmp = (u8 )pPager->pTmpSpace;

	4147

	4148 assert( assert_pager_state(pPager) );

	4149 disable_simulated_io_errors();

	4150 sqlite3BeginBenignMalloc();

	4151 pagerFreeMapHdrs(pPager);

	4152 /* pPager->errCode = 0; */

	4153 pPager->exclusiveMode = 0;

	4154 #ifndef SQLITE_OMIT_WAL

	4155 sqlite3WalClose(pPager->pWal, pPager->ckptSyncFlags, pPager->pageSize, pTmp);

	4156 pPager->pWal = 0;

	4157 #endif

	4158 pager_reset(pPager);

	4159 if( MEMDB ){

	4160 pager_unlock(pPager);

	4161 }else{

	4162 /* If it is open, sync the journal file before calling UnlockAndRollback.

	4163 ** If this is not done, then an unsynced portion of the open journal

	4164 ** file may be played back into the database. If a power failure occurs

	4165 ** while this is happening, the database could become corrupt.

	4166 **

	4167 ** If an error occurs while trying to sync the journal, shift the pager

	4168 ** into the ERROR state. This causes UnlockAndRollback to unlock the

	4169 ** database and close the journal file without attempting to roll it

	4170 ** back or finalize it. The next database user will have to do hot-journal

	4171 ** rollback before accessing the database file.

	4172 */

	4173 if( isOpen(pPager->jfd) ){

	4174 pager_error(pPager, pagerSyncHotJournal(pPager));

	4175 }

	4176 pagerUnlockAndRollback(pPager);

	4177 }

	4178 sqlite3EndBenignMalloc();

	4179 enable_simulated_io_errors();

	4180 PAGERTRACE(("CLOSE %d\n", PAGERID(pPager)));

	4181 IOTRACE(("CLOSE %p\n", pPager))

	4182 sqlite3OsClose(pPager->jfd);

	4183 sqlite3OsClose(pPager->fd);

	4184 sqlite3PageFree(pTmp);

	4185 sqlite3PcacheClose(pPager->pPCache);

	4186

	4187 #ifdef SQLITE_HAS_CODEC

	4188 if( pPager->xCodecFree ) pPager->xCodecFree(pPager->pCodec);

	4189 #endif

	4190

	4191 assert( !pPager->aSavepoint && !pPager->pInJournal );

	4192 assert( !isOpen(pPager->jfd) && !isOpen(pPager->sjfd) );

	4193

	4194 sqlite3_free(pPager);

	4195 return SQLITE_OK;

	4196 }

	4197

	4198 #if !defined(NDEBUG) \|\| defined(SQLITE_TEST)

	4199 /*

	4200 ** Return the page number for page pPg.

	4201 */

	4202 SQLITE_PRIVATE Pgno sqlite3PagerPagenumber(DbPage *pPg){

	4203 return pPg->pgno;

	4204 }

	4205 #endif

	4206

	4207 /*

	4208 ** Increment the reference count for page pPg.

	4209 */

	4210 SQLITE_PRIVATE void sqlite3PagerRef(DbPage *pPg){

	4211 sqlite3PcacheRef(pPg);

	4212 }

	4213

	4214 /*

	4215 ** Sync the journal. In other words, make sure all the pages that have

	4216 ** been written to the journal have actually reached the surface of the

	4217 ** disk and can be restored in the event of a hot-journal rollback.

	4218 **

	4219 ** If the Pager.noSync flag is set, then this function is a no-op.

	4220 ** Otherwise, the actions required depend on the journal-mode and the

	4221 ** device characteristics of the file-system, as follows:

	4222 **

	4223 ** * If the journal file is an in-memory journal file, no action need

	4224 ** be taken.

	4225 **

	4226 ** * Otherwise, if the device does not support the SAFE_APPEND property,

	4227 ** then the nRec field of the most recently written journal header

	4228 ** is updated to contain the number of journal records that have

	4229 ** been written following it. If the pager is operating in full-sync

	4230 ** mode, then the journal file is synced before this field is updated.

	4231 **

	4232 ** * If the device does not support the SEQUENTIAL property, then

	4233 ** journal file is synced.

	4234 **

	4235 ** Or, in pseudo-code:

	4236 **

	4237 ** if( NOT <in-memory journal> ){

	4238 ** if( NOT SAFE_APPEND ){

	4239 ** if( <full-sync mode> ) xSync(<journal file>);

	4240 ** <update nRec field>

	4241 ** }

	4242 ** if( NOT SEQUENTIAL ) xSync(<journal file>);

	4243 ** }

	4244 **

	4245 ** If successful, this routine clears the PGHDR_NEED_SYNC flag of every

	4246 ** page currently held in memory before returning SQLITE_OK. If an IO

	4247 ** error is encountered, then the IO error code is returned to the caller.

	4248 */

	4249 static int syncJournal(Pager *pPager, int newHdr){

	4250 int rc; /* Return code */

	4251

	4252 assert( pPager->eState==PAGER_WRITER_CACHEMOD

	4253 \|\| pPager->eState==PAGER_WRITER_DBMOD

	4254 );

	4255 assert( assert_pager_state(pPager) );

	4256 assert( !pagerUseWal(pPager) );

	4257

	4258 rc = sqlite3PagerExclusiveLock(pPager);

	4259 if( rc!=SQLITE_OK ) return rc;

	4260

	4261 if( !pPager->noSync ){

	4262 assert( !pPager->tempFile );

	4263 if( isOpen(pPager->jfd) && pPager->journalMode!=PAGER_JOURNALMODE_MEMORY ){

	4264 const int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);

	4265 assert( isOpen(pPager->jfd) );

	4266

	4267 if( 0==(iDc&SQLITE_IOCAP_SAFE_APPEND) ){

	4268 /* This block deals with an obscure problem. If the last connection

	4269 ** that wrote to this database was operating in persistent-journal

	4270 ** mode, then the journal file may at this point actually be larger

	4271 ** than Pager.journalOff bytes. If the next thing in the journal

	4272 ** file happens to be a journal-header (written as part of the

	4273 ** previous connection's transaction), and a crash or power-failure

	4274 ** occurs after nRec is updated but before this connection writes

	4275 ** anything else to the journal file (or commits/rolls back its

	4276 ** transaction), then SQLite may become confused when doing the

	4277 ** hot-journal rollback following recovery. It may roll back all

	4278 ** of this connections data, then proceed to rolling back the old,

	4279 ** out-of-date data that follows it. Database corruption.

	4280 **

	4281 ** To work around this, if the journal file does appear to contain

	4282 ** a valid header following Pager.journalOff, then write a 0x00

	4283 ** byte to the start of it to prevent it from being recognized.

	4284 **

	4285 ** Variable iNextHdrOffset is set to the offset at which this

	4286 ** problematic header will occur, if it exists. aMagic is used

	4287 ** as a temporary buffer to inspect the first couple of bytes of

	4288 ** the potential journal header.

	4289 */

	4290 i64 iNextHdrOffset;

	4291 u8 aMagic[8];

	4292 u8 zHeader[sizeof(aJournalMagic)+4];

	4293

	4294 memcpy(zHeader, aJournalMagic, sizeof(aJournalMagic));

	4295 put32bits(&zHeader[sizeof(aJournalMagic)], pPager->nRec);

	4296

	4297 iNextHdrOffset = journalHdrOffset(pPager);

	4298 rc = sqlite3OsRead(pPager->jfd, aMagic, 8, iNextHdrOffset);

	4299 if( rc==SQLITE_OK && 0==memcmp(aMagic, aJournalMagic, 8) ){

	4300 static const u8 zerobyte = 0;

	4301 rc = sqlite3OsWrite(pPager->jfd, &zerobyte, 1, iNextHdrOffset);

	4302 }

	4303 if( rc!=SQLITE_OK && rc!=SQLITE_IOERR_SHORT_READ ){

	4304 return rc;

	4305 }

	4306

	4307 /* Write the nRec value into the journal file header. If in

	4308 ** full-synchronous mode, sync the journal first. This ensures that

	4309 ** all data has really hit the disk before nRec is updated to mark

	4310 ** it as a candidate for rollback.

	4311 **

	4312 ** This is not required if the persistent media supports the

	4313 ** SAFE_APPEND property. Because in this case it is not possible

	4314 ** for garbage data to be appended to the file, the nRec field

	4315 ** is populated with 0xFFFFFFFF when the journal header is written

	4316 ** and never needs to be updated.

	4317 */

	4318 if( pPager->fullSync && 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){

	4319 PAGERTRACE(("SYNC journal of %d\n", PAGERID(pPager)));

	4320 IOTRACE(("JSYNC %p\n", pPager))

	4321 rc = sqlite3OsSync(pPager->jfd, pPager->syncFlags);

	4322 if( rc!=SQLITE_OK ) return rc;

	4323 }

	4324 IOTRACE(("JHDR %p %lld\n", pPager, pPager->journalHdr));

	4325 rc = sqlite3OsWrite(

	4326 pPager->jfd, zHeader, sizeof(zHeader), pPager->journalHdr

	4327 );

	4328 if( rc!=SQLITE_OK ) return rc;

	4329 }

	4330 if( 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){

	4331 PAGERTRACE(("SYNC journal of %d\n", PAGERID(pPager)));

	4332 IOTRACE(("JSYNC %p\n", pPager))

	4333 rc = sqlite3OsSync(pPager->jfd, pPager->syncFlags\|

	4334 (pPager->syncFlags==SQLITE_SYNC_FULL?SQLITE_SYNC_DATAONLY:0)

	4335 );

	4336 if( rc!=SQLITE_OK ) return rc;

	4337 }

	4338

	4339 pPager->journalHdr = pPager->journalOff;

	4340 if( newHdr && 0==(iDc&SQLITE_IOCAP_SAFE_APPEND) ){

	4341 pPager->nRec = 0;

	4342 rc = writeJournalHdr(pPager);

	4343 if( rc!=SQLITE_OK ) return rc;

	4344 }

	4345 }else{

	4346 pPager->journalHdr = pPager->journalOff;

	4347 }

	4348 }

	4349

	4350 /* Unless the pager is in noSync mode, the journal file was just

	4351 ** successfully synced. Either way, clear the PGHDR_NEED_SYNC flag on

	4352 ** all pages.

	4353 */

	4354 sqlite3PcacheClearSyncFlags(pPager->pPCache);

	4355 pPager->eState = PAGER_WRITER_DBMOD;

	4356 assert( assert_pager_state(pPager) );

	4357 return SQLITE_OK;

	4358 }

	4359

	4360 /*

	4361 ** The argument is the first in a linked list of dirty pages connected

	4362 ** by the PgHdr.pDirty pointer. This function writes each one of the

	4363 ** in-memory pages in the list to the database file. The argument may

	4364 ** be NULL, representing an empty list. In this case this function is

	4365 ** a no-op.

	4366 **

	4367 ** The pager must hold at least a RESERVED lock when this function

	4368 ** is called. Before writing anything to the database file, this lock

	4369 ** is upgraded to an EXCLUSIVE lock. If the lock cannot be obtained,

	4370 ** SQLITE_BUSY is returned and no data is written to the database file.

	4371 **

	4372 ** If the pager is a temp-file pager and the actual file-system file

	4373 ** is not yet open, it is created and opened before any data is

	4374 ** written out.

	4375 **

	4376 ** Once the lock has been upgraded and, if necessary, the file opened,

	4377 ** the pages are written out to the database file in list order. Writing

	4378 ** a page is skipped if it meets either of the following criteria:

	4379 **

	4380 ** * The page number is greater than Pager.dbSize, or

	4381 ** * The PGHDR_DONT_WRITE flag is set on the page.

	4382 **

	4383 ** If writing out a page causes the database file to grow, Pager.dbFileSize

	4384 ** is updated accordingly. If page 1 is written out, then the value cached

	4385 ** in Pager.dbFileVers[] is updated to match the new value stored in

	4386 ** the database file.

	4387 **

	4388 ** If everything is successful, SQLITE_OK is returned. If an IO error

	4389 ** occurs, an IO error code is returned. Or, if the EXCLUSIVE lock cannot

	4390 ** be obtained, SQLITE_BUSY is returned.

	4391 */

	4392 static int pager_write_pagelist(Pager pPager, PgHdr pList){

	4393 int rc = SQLITE_OK; /* Return code */

	4394

	4395 /* This function is only called for rollback pagers in WRITER_DBMOD state. */

	4396 assert( !pagerUseWal(pPager) );

	4397 assert( pPager->eState==PAGER_WRITER_DBMOD );

	4398 assert( pPager->eLock==EXCLUSIVE_LOCK );

	4399

	4400 /* If the file is a temp-file has not yet been opened, open it now. It

	4401 ** is not possible for rc to be other than SQLITE_OK if this branch

	4402 ** is taken, as pager_wait_on_lock() is a no-op for temp-files.

	4403 */

	4404 if( !isOpen(pPager->fd) ){

	4405 assert( pPager->tempFile && rc==SQLITE_OK );

	4406 rc = pagerOpentemp(pPager, pPager->fd, pPager->vfsFlags);

	4407 }

	4408

	4409 /* Before the first write, give the VFS a hint of what the final

	4410 ** file size will be.

	4411 */

	4412 assert( rc!=SQLITE_OK \|\| isOpen(pPager->fd) );

	4413 if( rc==SQLITE_OK

	4414 && pPager->dbHintSize<pPager->dbSize

	4415 && (pList->pDirty \|\| pList->pgno>pPager->dbHintSize)

	4416 ){

	4417 sqlite3_int64 szFile = pPager->pageSize * (sqlite3_int64)pPager->dbSize;

	4418 sqlite3OsFileControlHint(pPager->fd, SQLITE_FCNTL_SIZE_HINT, &szFile);

	4419 pPager->dbHintSize = pPager->dbSize;

	4420 }

	4421

	4422 while( rc==SQLITE_OK && pList ){

	4423 Pgno pgno = pList->pgno;

	4424

	4425 /* If there are dirty pages in the page cache with page numbers greater

	4426 ** than Pager.dbSize, this means sqlite3PagerTruncateImage() was called to

	4427 ** make the file smaller (presumably by auto-vacuum code). Do not write

	4428 ** any such pages to the file.

	4429 **

	4430 ** Also, do not write out any page that has the PGHDR_DONT_WRITE flag

	4431 ** set (set by sqlite3PagerDontWrite()).

	4432 */

	4433 if( pgno<=pPager->dbSize && 0==(pList->flags&PGHDR_DONT_WRITE) ){

	4434 i64 offset = (pgno-1)(i64)pPager->pageSize; / Offset to write */

	4435 char pData; / Data to write */

	4436

	4437 assert( (pList->flags&PGHDR_NEED_SYNC)==0 );

	4438 if( pList->pgno==1 ) pager_write_changecounter(pList);

	4439

	4440 /* Encode the database */

	4441 CODEC2(pPager, pList->pData, pgno, 6, return SQLITE_NOMEM, pData);

	4442

	4443 /* Write out the page data. */

	4444 rc = sqlite3OsWrite(pPager->fd, pData, pPager->pageSize, offset);

	4445

	4446 /* If page 1 was just written, update Pager.dbFileVers to match

	4447 ** the value now stored in the database file. If writing this

	4448 ** page caused the database file to grow, update dbFileSize.

	4449 */

	4450 if( pgno==1 ){

	4451 memcpy(&pPager->dbFileVers, &pData[24], sizeof(pPager->dbFileVers));

	4452 }

	4453 if( pgno>pPager->dbFileSize ){

	4454 pPager->dbFileSize = pgno;

	4455 }

	4456 pPager->aStat[PAGER_STAT_WRITE]++;

	4457

	4458 /* Update any backup objects copying the contents of this pager. */

	4459 sqlite3BackupUpdate(pPager->pBackup, pgno, (u8*)pList->pData);

	4460

	4461 PAGERTRACE(("STORE %d page %d hash(%08x)\n",

	4462 PAGERID(pPager), pgno, pager_pagehash(pList)));

	4463 IOTRACE(("PGOUT %p %d\n", pPager, pgno));

	4464 PAGER_INCR(sqlite3_pager_writedb_count);

	4465 }else{

	4466 PAGERTRACE(("NOSTORE %d page %d\n", PAGERID(pPager), pgno));

	4467 }

	4468 pager_set_pagehash(pList);

	4469 pList = pList->pDirty;

	4470 }

	4471

	4472 return rc;

	4473 }

	4474

	4475 /*

	4476 ** Ensure that the sub-journal file is open. If it is already open, this

	4477 ** function is a no-op.

	4478 **

	4479 ** SQLITE_OK is returned if everything goes according to plan. An

	4480 ** SQLITE_IOERR_XXX error code is returned if a call to sqlite3OsOpen()

	4481 ** fails.

	4482 */

	4483 static int openSubJournal(Pager *pPager){

	4484 int rc = SQLITE_OK;

	4485 if( !isOpen(pPager->sjfd) ){

	4486 if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY \|\| pPager->subjInMemory ){

	4487 sqlite3MemJournalOpen(pPager->sjfd);

	4488 }else{

	4489 rc = pagerOpentemp(pPager, pPager->sjfd, SQLITE_OPEN_SUBJOURNAL);

	4490 }

	4491 }

	4492 return rc;

	4493 }

	4494

	4495 /*

	4496 ** Append a record of the current state of page pPg to the sub-journal.

	4497 **

	4498 ** If successful, set the bit corresponding to pPg->pgno in the bitvecs

	4499 ** for all open savepoints before returning.

	4500 **

	4501 ** This function returns SQLITE_OK if everything is successful, an IO

	4502 ** error code if the attempt to write to the sub-journal fails, or

	4503 ** SQLITE_NOMEM if a malloc fails while setting a bit in a savepoint

	4504 ** bitvec.

	4505 */

	4506 static int subjournalPage(PgHdr *pPg){

	4507 int rc = SQLITE_OK;

	4508 Pager *pPager = pPg->pPager;

	4509 if( pPager->journalMode!=PAGER_JOURNALMODE_OFF ){

	4510

	4511 /* Open the sub-journal, if it has not already been opened */

	4512 assert( pPager->useJournal );

	4513 assert( isOpen(pPager->jfd) \|\| pagerUseWal(pPager) );

	4514 assert( isOpen(pPager->sjfd) \|\| pPager->nSubRec==0 );

	4515 assert( pagerUseWal(pPager)

	4516 \|\| pageInJournal(pPager, pPg)

	4517 \|\| pPg->pgno>pPager->dbOrigSize

	4518 );

	4519 rc = openSubJournal(pPager);

	4520

	4521 /* If the sub-journal was opened successfully (or was already open),

	4522 ** write the journal record into the file. */

	4523 if( rc==SQLITE_OK ){

	4524 void *pData = pPg->pData;

	4525 i64 offset = (i64)pPager->nSubRec*(4+pPager->pageSize);

	4526 char *pData2;

	4527

	4528 CODEC2(pPager, pData, pPg->pgno, 7, return SQLITE_NOMEM, pData2);

	4529 PAGERTRACE(("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno));

	4530 rc = write32bits(pPager->sjfd, offset, pPg->pgno);

	4531 if( rc==SQLITE_OK ){

	4532 rc = sqlite3OsWrite(pPager->sjfd, pData2, pPager->pageSize, offset+4);

	4533 }

	4534 }

	4535 }

	4536 if( rc==SQLITE_OK ){

	4537 pPager->nSubRec++;

	4538 assert( pPager->nSavepoint>0 );

	4539 rc = addToSavepointBitvecs(pPager, pPg->pgno);

	4540 }

	4541 return rc;

	4542 }

	4543 static int subjournalPageIfRequired(PgHdr *pPg){

	4544 if( subjRequiresPage(pPg) ){

	4545 return subjournalPage(pPg);

	4546 }else{

	4547 return SQLITE_OK;

	4548 }

	4549 }

	4550

	4551 /*

	4552 ** This function is called by the pcache layer when it has reached some

	4553 ** soft memory limit. The first argument is a pointer to a Pager object

	4554 ** (cast as a void*). The pager is always 'purgeable' (not an in-memory

	4555 ** database). The second argument is a reference to a page that is

	4556 ** currently dirty but has no outstanding references. The page

	4557 ** is always associated with the Pager object passed as the first

	4558 ** argument.

	4559 **

	4560 ** The job of this function is to make pPg clean by writing its contents

	4561 ** out to the database file, if possible. This may involve syncing the

	4562 ** journal file.

	4563 **

	4564 ** If successful, sqlite3PcacheMakeClean() is called on the page and

	4565 ** SQLITE_OK returned. If an IO error occurs while trying to make the

	4566 ** page clean, the IO error code is returned. If the page cannot be

	4567 ** made clean for some other reason, but no error occurs, then SQLITE_OK

	4568 ** is returned by sqlite3PcacheMakeClean() is not called.

	4569 */

	4570 static int pagerStress(void p, PgHdr pPg){

	4571 Pager pPager = (Pager )p;

	4572 int rc = SQLITE_OK;

	4573

	4574 assert( pPg->pPager==pPager );

	4575 assert( pPg->flags&PGHDR_DIRTY );

	4576

	4577 /* The doNotSpill NOSYNC bit is set during times when doing a sync of

	4578 ** journal (and adding a new header) is not allowed. This occurs

	4579 ** during calls to sqlite3PagerWrite() while trying to journal multiple

	4580 ** pages belonging to the same sector.

	4581 **

	4582 ** The doNotSpill ROLLBACK and OFF bits inhibits all cache spilling

	4583 ** regardless of whether or not a sync is required. This is set during

	4584 ** a rollback or by user request, respectively.

	4585 **

	4586 ** Spilling is also prohibited when in an error state since that could

	4587 ** lead to database corruption. In the current implementation it

	4588 ** is impossible for sqlite3PcacheFetch() to be called with createFlag==3

	4589 ** while in the error state, hence it is impossible for this routine to

	4590 ** be called in the error state. Nevertheless, we include a NEVER()

	4591 ** test for the error state as a safeguard against future changes.

	4592 */

	4593 if( NEVER(pPager->errCode) ) return SQLITE_OK;

	4594 testcase( pPager->doNotSpill & SPILLFLAG_ROLLBACK );

	4595 testcase( pPager->doNotSpill & SPILLFLAG_OFF );

	4596 testcase( pPager->doNotSpill & SPILLFLAG_NOSYNC );

	4597 if( pPager->doNotSpill

	4598 && ((pPager->doNotSpill & (SPILLFLAG_ROLLBACK\|SPILLFLAG_OFF))!=0

	4599 \|\| (pPg->flags & PGHDR_NEED_SYNC)!=0)

	4600 ){

	4601 return SQLITE_OK;

	4602 }

	4603

	4604 pPg->pDirty = 0;

	4605 if( pagerUseWal(pPager) ){

	4606 /* Write a single frame for this page to the log. */

	4607 rc = subjournalPageIfRequired(pPg);

	4608 if( rc==SQLITE_OK ){

	4609 rc = pagerWalFrames(pPager, pPg, 0, 0);

	4610 }

	4611 }else{

	4612

	4613 /* Sync the journal file if required. */

	4614 if( pPg->flags&PGHDR_NEED_SYNC

	4615 \|\| pPager->eState==PAGER_WRITER_CACHEMOD

	4616 ){

	4617 rc = syncJournal(pPager, 1);

	4618 }

	4619

	4620 /* Write the contents of the page out to the database file. */

	4621 if( rc==SQLITE_OK ){

	4622 assert( (pPg->flags&PGHDR_NEED_SYNC)==0 );

	4623 rc = pager_write_pagelist(pPager, pPg);

	4624 }

	4625 }

	4626

	4627 /* Mark the page as clean. */

	4628 if( rc==SQLITE_OK ){

	4629 PAGERTRACE(("STRESS %d page %d\n", PAGERID(pPager), pPg->pgno));

	4630 sqlite3PcacheMakeClean(pPg);

	4631 }

	4632

	4633 return pager_error(pPager, rc);

	4634 }

	4635

	4636 /*

	4637 ** Flush all unreferenced dirty pages to disk.

	4638 */

	4639 SQLITE_PRIVATE int sqlite3PagerFlush(Pager *pPager){

	4640 int rc = pPager->errCode;

	4641 if( !MEMDB ){

	4642 PgHdr *pList = sqlite3PcacheDirtyList(pPager->pPCache);

	4643 assert( assert_pager_state(pPager) );

	4644 while( rc==SQLITE_OK && pList ){

	4645 PgHdr *pNext = pList->pDirty;

	4646 if( pList->nRef==0 ){

	4647 rc = pagerStress((void*)pPager, pList);

	4648 }

	4649 pList = pNext;

	4650 }

	4651 }

	4652

	4653 return rc;

	4654 }

	4655

	4656 /*

	4657 ** Allocate and initialize a new Pager object and put a pointer to it

	4658 ** in *ppPager. The pager should eventually be freed by passing it

	4659 ** to sqlite3PagerClose().

	4660 **

	4661 ** The zFilename argument is the path to the database file to open.

	4662 ** If zFilename is NULL then a randomly-named temporary file is created

	4663 ** and used as the file to be cached. Temporary files are be deleted

	4664 ** automatically when they are closed. If zFilename is ":memory:" then

	4665 ** all information is held in cache. It is never written to disk.

	4666 ** This can be used to implement an in-memory database.

	4667 **

	4668 ** The nExtra parameter specifies the number of bytes of space allocated

	4669 ** along with each page reference. This space is available to the user

	4670 ** via the sqlite3PagerGetExtra() API.

	4671 **

	4672 ** The flags argument is used to specify properties that affect the

	4673 ** operation of the pager. It should be passed some bitwise combination

	4674 ** of the PAGER_* flags.

	4675 **

	4676 ** The vfsFlags parameter is a bitmask to pass to the flags parameter

	4677 ** of the xOpen() method of the supplied VFS when opening files.

	4678 **

	4679 ** If the pager object is allocated and the specified file opened

	4680 ** successfully, SQLITE_OK is returned and *ppPager set to point to

	4681 ** the new pager object. If an error occurs, *ppPager is set to NULL

	4682 ** and error code returned. This function may return SQLITE_NOMEM

	4683 ** (sqlite3Malloc() is used to allocate memory), SQLITE_CANTOPEN or

	4684 ** various SQLITE_IO_XXX errors.

	4685 */

	4686 SQLITE_PRIVATE int sqlite3PagerOpen(

	4687 sqlite3_vfs pVfs, / The virtual file system to use */

	4688 Pager *ppPager, / OUT: Return the Pager structure here */

	4689 const char zFilename, / Name of the database file to open */

	4690 int nExtra, /* Extra bytes append to each in-memory page */

	4691 int flags, /* flags controlling this file */

	4692 int vfsFlags, /* flags passed through to sqlite3_vfs.xOpen() */

	4693 void (xReinit)(DbPage) /* Function to reinitialize pages */

	4694 ){

	4695 u8 *pPtr;

	4696 Pager pPager = 0; / Pager object to allocate and return */

	4697 int rc = SQLITE_OK; /* Return code */

	4698 int tempFile = 0; /* True for temp files (incl. in-memory files) */

	4699 int memDb = 0; /* True if this is an in-memory file */

	4700 int readOnly = 0; /* True if this is a read-only file */

	4701 int journalFileSize; /* Bytes to allocate for each journal fd */

	4702 char zPathname = 0; / Full path to database file */

	4703 int nPathname = 0; /* Number of bytes in zPathname */

	4704 int useJournal = (flags & PAGER_OMIT_JOURNAL)==0; /* False to omit journal */

	4705 int pcacheSize = sqlite3PcacheSize(); /* Bytes to allocate for PCache */

	4706 u32 szPageDflt = SQLITE_DEFAULT_PAGE_SIZE; /* Default page size */

	4707 const char zUri = 0; / URI args to copy */

	4708 int nUri = 0; /* Number of bytes of URI args at zUri /

	4709

	4710 /* Figure out how much space is required for each journal file-handle

	4711 ** (there are two of them, the main journal and the sub-journal). This

	4712 ** is the maximum space required for an in-memory journal file handle

	4713 ** and a regular journal file-handle. Note that a "regular journal-handle"

	4714 ** may be a wrapper capable of caching the first portion of the journal

	4715 ** file in memory to implement the atomic-write optimization (see

	4716 ** source file journal.c).

	4717 */

	4718 if( sqlite3JournalSize(pVfs)>sqlite3MemJournalSize() ){

	4719 journalFileSize = ROUND8(sqlite3JournalSize(pVfs));

	4720 }else{

	4721 journalFileSize = ROUND8(sqlite3MemJournalSize());

	4722 }

	4723

	4724 /* Set the output variable to NULL in case an error occurs. */

	4725 *ppPager = 0;

	4726

	4727 #ifndef SQLITE_OMIT_MEMORYDB

	4728 if( flags & PAGER_MEMORY ){

	4729 memDb = 1;

	4730 if( zFilename && zFilename[0] ){

	4731 zPathname = sqlite3DbStrDup(0, zFilename);

	4732 if( zPathname==0 ) return SQLITE_NOMEM;

	4733 nPathname = sqlite3Strlen30(zPathname);

	4734 zFilename = 0;

	4735 }

	4736 }

	4737 #endif

	4738

	4739 /* Compute and store the full pathname in an allocated buffer pointed

	4740 ** to by zPathname, length nPathname. Or, if this is a temporary file,

	4741 ** leave both nPathname and zPathname set to 0.

	4742 */

	4743 if( zFilename && zFilename[0] ){

	4744 const char *z;

	4745 nPathname = pVfs->mxPathname+1;

	4746 zPathname = sqlite3DbMallocRaw(0, nPathname*2);

	4747 if( zPathname==0 ){

	4748 return SQLITE_NOMEM;

	4749 }

	4750 zPathname[0] = 0; /* Make sure initialized even if FullPathname() fails */

	4751 rc = sqlite3OsFullPathname(pVfs, zFilename, nPathname, zPathname);

	4752 nPathname = sqlite3Strlen30(zPathname);

	4753 z = zUri = &zFilename[sqlite3Strlen30(zFilename)+1];

	4754 while( *z ){

	4755 z += sqlite3Strlen30(z)+1;

	4756 z += sqlite3Strlen30(z)+1;

	4757 }

	4758 nUri = (int)(&z[1] - zUri);

	4759 assert( nUri>=0 );

	4760 if( rc==SQLITE_OK && nPathname+8>pVfs->mxPathname ){

	4761 /* This branch is taken when the journal path required by

	4762 ** the database being opened will be more than pVfs->mxPathname

	4763 ** bytes in length. This means the database cannot be opened,

	4764 ** as it will not be possible to open the journal file or even

	4765 ** check for a hot-journal before reading.

	4766 */

	4767 rc = SQLITE_CANTOPEN_BKPT;

	4768 }

	4769 if( rc!=SQLITE_OK ){

	4770 sqlite3DbFree(0, zPathname);

	4771 return rc;

	4772 }

	4773 }

	4774

	4775 /* Allocate memory for the Pager structure, PCache object, the

	4776 ** three file descriptors, the database file name and the journal

	4777 ** file name. The layout in memory is as follows:

	4778 **

	4779 ** Pager object (sizeof(Pager) bytes)

	4780 ** PCache object (sqlite3PcacheSize() bytes)

	4781 ** Database file handle (pVfs->szOsFile bytes)

	4782 ** Sub-journal file handle (journalFileSize bytes)

	4783 ** Main journal file handle (journalFileSize bytes)

	4784 ** Database file name (nPathname+1 bytes)

	4785 ** Journal file name (nPathname+8+1 bytes)

	4786 */

	4787 pPtr = (u8 *)sqlite3MallocZero(

	4788 ROUND8(sizeof(pPager)) + / Pager structure */

	4789 ROUND8(pcacheSize) + /* PCache object */

	4790 ROUND8(pVfs->szOsFile) + /* The main db file */

	4791 journalFileSize * 2 + /* The two journal files */

	4792 nPathname + 1 + nUri + /* zFilename */

	4793 nPathname + 8 + 2 /* zJournal */

	4794 #ifndef SQLITE_OMIT_WAL

	4795 + nPathname + 4 + 2 /* zWal */

	4796 #endif

	4797 );

	4798 assert( EIGHT_BYTE_ALIGNMENT(SQLITE_INT_TO_PTR(journalFileSize)) );

	4799 if( !pPtr ){

	4800 sqlite3DbFree(0, zPathname);

	4801 return SQLITE_NOMEM;

	4802 }

	4803 pPager = (Pager*)(pPtr);

	4804 pPager->pPCache = (PCache)(pPtr += ROUND8(sizeof(pPager)));

	4805 pPager->fd = (sqlite3_file*)(pPtr += ROUND8(pcacheSize));

	4806 pPager->sjfd = (sqlite3_file*)(pPtr += ROUND8(pVfs->szOsFile));

	4807 pPager->jfd = (sqlite3_file*)(pPtr += journalFileSize);

	4808 pPager->zFilename = (char*)(pPtr += journalFileSize);

	4809 assert( EIGHT_BYTE_ALIGNMENT(pPager->jfd) );

	4810

	4811 /* Fill in the Pager.zFilename and Pager.zJournal buffers, if required. */

	4812 if( zPathname ){

	4813 assert( nPathname>0 );

	4814 pPager->zJournal = (char*)(pPtr += nPathname + 1 + nUri);

	4815 memcpy(pPager->zFilename, zPathname, nPathname);

	4816 if( nUri ) memcpy(&pPager->zFilename[nPathname+1], zUri, nUri);

	4817 memcpy(pPager->zJournal, zPathname, nPathname);

	4818 memcpy(&pPager->zJournal[nPathname], "-journal\000", 8+2);

	4819 sqlite3FileSuffix3(pPager->zFilename, pPager->zJournal);

	4820 #ifndef SQLITE_OMIT_WAL

	4821 pPager->zWal = &pPager->zJournal[nPathname+8+1];

	4822 memcpy(pPager->zWal, zPathname, nPathname);

	4823 memcpy(&pPager->zWal[nPathname], "-wal\000", 4+1);

	4824 sqlite3FileSuffix3(pPager->zFilename, pPager->zWal);

	4825 #endif

	4826 sqlite3DbFree(0, zPathname);

	4827 }

	4828 pPager->pVfs = pVfs;

	4829 pPager->vfsFlags = vfsFlags;

	4830

	4831 /* Open the pager file.

	4832 */

	4833 if( zFilename && zFilename[0] ){

	4834 int fout = 0; /* VFS flags returned by xOpen() */

	4835 rc = sqlite3OsOpen(pVfs, pPager->zFilename, pPager->fd, vfsFlags, &fout);

	4836 assert( !memDb );

	4837 readOnly = (fout&SQLITE_OPEN_READONLY);

	4838

	4839 /* If the file was successfully opened for read/write access,

	4840 ** choose a default page size in case we have to create the

	4841 ** database file. The default page size is the maximum of:

	4842 **

	4843 ** + SQLITE_DEFAULT_PAGE_SIZE,

	4844 ** + The value returned by sqlite3OsSectorSize()

	4845 ** + The largest page size that can be written atomically.

	4846 */

	4847 if( rc==SQLITE_OK ){

	4848 int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);

	4849 if( !readOnly ){

	4850 setSectorSize(pPager);

	4851 assert(SQLITE_DEFAULT_PAGE_SIZE<=SQLITE_MAX_DEFAULT_PAGE_SIZE);

	4852 if( szPageDflt<pPager->sectorSize ){

	4853 if( pPager->sectorSize>SQLITE_MAX_DEFAULT_PAGE_SIZE ){

	4854 szPageDflt = SQLITE_MAX_DEFAULT_PAGE_SIZE;

	4855 }else{

	4856 szPageDflt = (u32)pPager->sectorSize;

	4857 }

	4858 }

	4859 #ifdef SQLITE_ENABLE_ATOMIC_WRITE

	4860 {

	4861 int ii;

	4862 assert(SQLITE_IOCAP_ATOMIC512==(512>>8));

	4863 assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8));

	4864 assert(SQLITE_MAX_DEFAULT_PAGE_SIZE<=65536);

	4865 for(ii=szPageDflt; ii<=SQLITE_MAX_DEFAULT_PAGE_SIZE; ii=ii*2){

	4866 if( iDc&(SQLITE_IOCAP_ATOMIC\|(ii>>8)) ){

	4867 szPageDflt = ii;

	4868 }

	4869 }

	4870 }

	4871 #endif

	4872 }

	4873 pPager->noLock = sqlite3_uri_boolean(zFilename, "nolock", 0);

	4874 if( (iDc & SQLITE_IOCAP_IMMUTABLE)!=0

	4875 \|\| sqlite3_uri_boolean(zFilename, "immutable", 0) ){

	4876 vfsFlags \|= SQLITE_OPEN_READONLY;

	4877 goto act_like_temp_file;

	4878 }

	4879 }

	4880 }else{

	4881 /* If a temporary file is requested, it is not opened immediately.

	4882 ** In this case we accept the default page size and delay actually

	4883 ** opening the file until the first call to OsWrite().

	4884 **

	4885 ** This branch is also run for an in-memory database. An in-memory

	4886 ** database is the same as a temp-file that is never written out to

	4887 ** disk and uses an in-memory rollback journal.

	4888 **

	4889 ** This branch also runs for files marked as immutable.

	4890 */

	4891 act_like_temp_file:

	4892 tempFile = 1;

	4893 pPager->eState = PAGER_READER; /* Pretend we already have a lock */

	4894 pPager->eLock = EXCLUSIVE_LOCK; /* Pretend we are in EXCLUSIVE mode */

	4895 pPager->noLock = 1; /* Do no locking */

	4896 readOnly = (vfsFlags&SQLITE_OPEN_READONLY);

	4897 }

	4898

	4899 /* The following call to PagerSetPagesize() serves to set the value of

	4900 ** Pager.pageSize and to allocate the Pager.pTmpSpace buffer.

	4901 */

	4902 if( rc==SQLITE_OK ){

	4903 assert( pPager->memDb==0 );

	4904 rc = sqlite3PagerSetPagesize(pPager, &szPageDflt, -1);

	4905 testcase( rc!=SQLITE_OK );

	4906 }

	4907

	4908 /* Initialize the PCache object. */

	4909 if( rc==SQLITE_OK ){

	4910 assert( nExtra<1000 );

	4911 nExtra = ROUND8(nExtra);

	4912 rc = sqlite3PcacheOpen(szPageDflt, nExtra, !memDb,

	4913 !memDb?pagerStress:0, (void *)pPager, pPager->pPCache);

	4914 }

	4915

	4916 /* If an error occurred above, free the Pager structure and close the file.

	4917 */

	4918 if( rc!=SQLITE_OK ){

	4919 sqlite3OsClose(pPager->fd);

	4920 sqlite3PageFree(pPager->pTmpSpace);

	4921 sqlite3_free(pPager);

	4922 return rc;

	4923 }

	4924

	4925 PAGERTRACE(("OPEN %d %s\n", FILEHANDLEID(pPager->fd), pPager->zFilename));

	4926 IOTRACE(("OPEN %p %s\n", pPager, pPager->zFilename))

	4927

	4928 pPager->useJournal = (u8)useJournal;

	4929 /* pPager->stmtOpen = 0; */

	4930 /* pPager->stmtInUse = 0; */

	4931 /* pPager->nRef = 0; */

	4932 /* pPager->stmtSize = 0; */

	4933 /* pPager->stmtJSize = 0; */

	4934 /* pPager->nPage = 0; */

	4935 pPager->mxPgno = SQLITE_MAX_PAGE_COUNT;

	4936 /* pPager->state = PAGER_UNLOCK; */

	4937 /* pPager->errMask = 0; */

	4938 pPager->tempFile = (u8)tempFile;

	4939 assert( tempFile==PAGER_LOCKINGMODE_NORMAL

	4940 \|\| tempFile==PAGER_LOCKINGMODE_EXCLUSIVE );

	4941 assert( PAGER_LOCKINGMODE_EXCLUSIVE==1 );

	4942 pPager->exclusiveMode = (u8)tempFile;

	4943 pPager->changeCountDone = pPager->tempFile;

	4944 pPager->memDb = (u8)memDb;

	4945 pPager->readOnly = (u8)readOnly;

	4946 assert( useJournal \|\| pPager->tempFile );

	4947 pPager->noSync = pPager->tempFile;

	4948 if( pPager->noSync ){

	4949 assert( pPager->fullSync==0 );

	4950 assert( pPager->syncFlags==0 );

	4951 assert( pPager->walSyncFlags==0 );

	4952 assert( pPager->ckptSyncFlags==0 );

	4953 }else{

	4954 pPager->fullSync = 1;

	4955 pPager->syncFlags = SQLITE_SYNC_NORMAL;

	4956 pPager->walSyncFlags = SQLITE_SYNC_NORMAL \| WAL_SYNC_TRANSACTIONS;

	4957 pPager->ckptSyncFlags = SQLITE_SYNC_NORMAL;

	4958 }

	4959 /* pPager->pFirst = 0; */

	4960 /* pPager->pFirstSynced = 0; */

	4961 /* pPager->pLast = 0; */

	4962 pPager->nExtra = (u16)nExtra;

	4963 pPager->journalSizeLimit = SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT;

	4964 assert( isOpen(pPager->fd) \|\| tempFile );

	4965 setSectorSize(pPager);

	4966 if( !useJournal ){

	4967 pPager->journalMode = PAGER_JOURNALMODE_OFF;

	4968 }else if( memDb ){

	4969 pPager->journalMode = PAGER_JOURNALMODE_MEMORY;

	4970 }

	4971 /* pPager->xBusyHandler = 0; */

	4972 /* pPager->pBusyHandlerArg = 0; */

	4973 pPager->xReiniter = xReinit;

	4974 /* memset(pPager->aHash, 0, sizeof(pPager->aHash)); */

	4975 /* pPager->szMmap = SQLITE_DEFAULT_MMAP_SIZE // will be set by btree.c */

	4976

	4977 *ppPager = pPager;

	4978 return SQLITE_OK;

	4979 }

	4980

	4981

	4982 /* Verify that the database file has not be deleted or renamed out from

	4983 ** under the pager. Return SQLITE_OK if the database is still were it ought

	4984 ** to be on disk. Return non-zero (SQLITE_READONLY_DBMOVED or some other error

	4985 ** code from sqlite3OsAccess()) if the database has gone missing.

	4986 */

	4987 static int databaseIsUnmoved(Pager *pPager){

	4988 int bHasMoved = 0;

	4989 int rc;

	4990

	4991 if( pPager->tempFile ) return SQLITE_OK;

	4992 if( pPager->dbSize==0 ) return SQLITE_OK;

	4993 assert( pPager->zFilename && pPager->zFilename[0] );

	4994 rc = sqlite3OsFileControl(pPager->fd, SQLITE_FCNTL_HAS_MOVED, &bHasMoved);

	4995 if( rc==SQLITE_NOTFOUND ){

	4996 /* If the HAS_MOVED file-control is unimplemented, assume that the file

	4997 ** has not been moved. That is the historical behavior of SQLite: prior to

	4998 ** version 3.8.3, it never checked */

	4999 rc = SQLITE_OK;

	5000 }else if( rc==SQLITE_OK && bHasMoved ){

	5001 rc = SQLITE_READONLY_DBMOVED;

	5002 }

	5003 return rc;

	5004 }

	5005

	5006

	5007 /*

	5008 ** This function is called after transitioning from PAGER_UNLOCK to

	5009 ** PAGER_SHARED state. It tests if there is a hot journal present in

	5010 ** the file-system for the given pager. A hot journal is one that

	5011 ** needs to be played back. According to this function, a hot-journal

	5012 ** file exists if the following criteria are met:

	5013 **

	5014 ** * The journal file exists in the file system, and

	5015 ** * No process holds a RESERVED or greater lock on the database file, and

	5016 ** * The database file itself is greater than 0 bytes in size, and

	5017 ** * The first byte of the journal file exists and is not 0x00.

	5018 **

	5019 ** If the current size of the database file is 0 but a journal file

	5020 ** exists, that is probably an old journal left over from a prior

	5021 ** database with the same name. In this case the journal file is

	5022 ** just deleted using OsDelete, *pExists is set to 0 and SQLITE_OK

	5023 ** is returned.

	5024 **

	5025 ** This routine does not check if there is a master journal filename

	5026 ** at the end of the file. If there is, and that master journal file

	5027 ** does not exist, then the journal file is not really hot. In this

	5028 ** case this routine will return a false-positive. The pager_playback()

	5029 ** routine will discover that the journal file is not really hot and

	5030 ** will not roll it back.

	5031 **

	5032 ** If a hot-journal file is found to exist, *pExists is set to 1 and

	5033 ** SQLITE_OK returned. If no hot-journal file is present, *pExists is

	5034 ** set to 0 and SQLITE_OK returned. If an IO error occurs while trying

	5035 ** to determine whether or not a hot-journal file exists, the IO error

	5036 ** code is returned and the value of *pExists is undefined.

	5037 */

	5038 static int hasHotJournal(Pager pPager, int pExists){

	5039 sqlite3_vfs * const pVfs = pPager->pVfs;

	5040 int rc = SQLITE_OK; /* Return code */

	5041 int exists = 1; /* True if a journal file is present */

	5042 int jrnlOpen = !!isOpen(pPager->jfd);

	5043

	5044 assert( pPager->useJournal );

	5045 assert( isOpen(pPager->fd) );

	5046 assert( pPager->eState==PAGER_OPEN );

	5047

	5048 assert( jrnlOpen==0 \|\| ( sqlite3OsDeviceCharacteristics(pPager->jfd) &

	5049 SQLITE_IOCAP_UNDELETABLE_WHEN_OPEN

	5050 ));

	5051

	5052 *pExists = 0;

	5053 if( !jrnlOpen ){

	5054 rc = sqlite3OsAccess(pVfs, pPager->zJournal, SQLITE_ACCESS_EXISTS, &exists);

	5055 }

	5056 if( rc==SQLITE_OK && exists ){

	5057 int locked = 0; /* True if some process holds a RESERVED lock */

	5058

	5059 /* Race condition here: Another process might have been holding the

	5060 ** the RESERVED lock and have a journal open at the sqlite3OsAccess()

	5061 ** call above, but then delete the journal and drop the lock before

	5062 ** we get to the following sqlite3OsCheckReservedLock() call. If that

	5063 ** is the case, this routine might think there is a hot journal when

	5064 ** in fact there is none. This results in a false-positive which will

	5065 ** be dealt with by the playback routine. Ticket #3883.

	5066 */

	5067 rc = sqlite3OsCheckReservedLock(pPager->fd, &locked);

	5068 if( rc==SQLITE_OK && !locked ){

	5069 Pgno nPage; /* Number of pages in database file */

	5070

	5071 rc = pagerPagecount(pPager, &nPage);

	5072 if( rc==SQLITE_OK ){

	5073 /* If the database is zero pages in size, that means that either (1) the

	5074 ** journal is a remnant from a prior database with the same name where

	5075 ** the database file but not the journal was deleted, or (2) the initial

	5076 ** transaction that populates a new database is being rolled back.

	5077 ** In either case, the journal file can be deleted. However, take care

	5078 ** not to delete the journal file if it is already open due to

	5079 ** journal_mode=PERSIST.

	5080 */

	5081 if( nPage==0 && !jrnlOpen ){

	5082 sqlite3BeginBenignMalloc();

	5083 if( pagerLockDb(pPager, RESERVED_LOCK)==SQLITE_OK ){

	5084 sqlite3OsDelete(pVfs, pPager->zJournal, 0);

	5085 if( !pPager->exclusiveMode ) pagerUnlockDb(pPager, SHARED_LOCK);

	5086 }

	5087 sqlite3EndBenignMalloc();

	5088 }else{

	5089 /* The journal file exists and no other connection has a reserved

	5090 ** or greater lock on the database file. Now check that there is

	5091 ** at least one non-zero bytes at the start of the journal file.

	5092 ** If there is, then we consider this journal to be hot. If not,

	5093 ** it can be ignored.

	5094 */

	5095 if( !jrnlOpen ){

	5096 int f = SQLITE_OPEN_READONLY\|SQLITE_OPEN_MAIN_JOURNAL;

	5097 rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, f, &f);

	5098 }

	5099 if( rc==SQLITE_OK ){

	5100 u8 first = 0;

	5101 rc = sqlite3OsRead(pPager->jfd, (void *)&first, 1, 0);

	5102 if( rc==SQLITE_IOERR_SHORT_READ ){

	5103 rc = SQLITE_OK;

	5104 }

	5105 if( !jrnlOpen ){

	5106 sqlite3OsClose(pPager->jfd);

	5107 }

	5108 *pExists = (first!=0);

	5109 }else if( rc==SQLITE_CANTOPEN ){

	5110 /* If we cannot open the rollback journal file in order to see if

	5111 ** it has a zero header, that might be due to an I/O error, or

	5112 ** it might be due to the race condition described above and in

	5113 ** ticket #3883. Either way, assume that the journal is hot.

	5114 ** This might be a false positive. But if it is, then the

	5115 ** automatic journal playback and recovery mechanism will deal

	5116 ** with it under an EXCLUSIVE lock where we do not need to

	5117 ** worry so much with race conditions.

	5118 */

	5119 *pExists = 1;

	5120 rc = SQLITE_OK;

	5121 }

	5122 }

	5123 }

	5124 }

	5125 }

	5126

	5127 return rc;

	5128 }

	5129

	5130 /*

	5131 ** This function is called to obtain a shared lock on the database file.

	5132 ** It is illegal to call sqlite3PagerGet() until after this function

	5133 ** has been successfully called. If a shared-lock is already held when

	5134 ** this function is called, it is a no-op.

	5135 **

	5136 ** The following operations are also performed by this function.

	5137 **

	5138 ** 1) If the pager is currently in PAGER_OPEN state (no lock held

	5139 ** on the database file), then an attempt is made to obtain a

	5140 ** SHARED lock on the database file. Immediately after obtaining

	5141 ** the SHARED lock, the file-system is checked for a hot-journal,

	5142 ** which is played back if present. Following any hot-journal

	5143 ** rollback, the contents of the cache are validated by checking

	5144 ** the 'change-counter' field of the database file header and

	5145 ** discarded if they are found to be invalid.

	5146 **

	5147 ** 2) If the pager is running in exclusive-mode, and there are currently

	5148 ** no outstanding references to any pages, and is in the error state,

	5149 ** then an attempt is made to clear the error state by discarding

	5150 ** the contents of the page cache and rolling back any open journal

	5151 ** file.

	5152 **

	5153 ** If everything is successful, SQLITE_OK is returned. If an IO error

	5154 ** occurs while locking the database, checking for a hot-journal file or

	5155 ** rolling back a journal file, the IO error code is returned.

	5156 */

	5157 SQLITE_PRIVATE int sqlite3PagerSharedLock(Pager *pPager){

	5158 int rc = SQLITE_OK; /* Return code */

	5159

	5160 /* This routine is only called from b-tree and only when there are no

	5161 ** outstanding pages. This implies that the pager state should either

	5162 ** be OPEN or READER. READER is only possible if the pager is or was in

	5163 ** exclusive access mode.

	5164 */

	5165 assert( sqlite3PcacheRefCount(pPager->pPCache)==0 );

	5166 assert( assert_pager_state(pPager) );

	5167 assert( pPager->eState==PAGER_OPEN \|\| pPager->eState==PAGER_READER );

	5168 if( NEVER(MEMDB && pPager->errCode) ){ return pPager->errCode; }

	5169

	5170 if( !pagerUseWal(pPager) && pPager->eState==PAGER_OPEN ){

	5171 int bHotJournal = 1; /* True if there exists a hot journal-file */

	5172

	5173 assert( !MEMDB );

	5174

	5175 rc = pager_wait_on_lock(pPager, SHARED_LOCK);

	5176 if( rc!=SQLITE_OK ){

	5177 assert( pPager->eLock==NO_LOCK \|\| pPager->eLock==UNKNOWN_LOCK );

	5178 goto failed;

	5179 }

	5180

	5181 /* If a journal file exists, and there is no RESERVED lock on the

	5182 ** database file, then it either needs to be played back or deleted.

	5183 */

	5184 if( pPager->eLock<=SHARED_LOCK ){

	5185 rc = hasHotJournal(pPager, &bHotJournal);

	5186 }

	5187 if( rc!=SQLITE_OK ){

	5188 goto failed;

	5189 }

	5190 if( bHotJournal ){

	5191 if( pPager->readOnly ){

	5192 rc = SQLITE_READONLY_ROLLBACK;

	5193 goto failed;

	5194 }

	5195

	5196 /* Get an EXCLUSIVE lock on the database file. At this point it is

	5197 ** important that a RESERVED lock is not obtained on the way to the

	5198 ** EXCLUSIVE lock. If it were, another process might open the

	5199 ** database file, detect the RESERVED lock, and conclude that the

	5200 ** database is safe to read while this process is still rolling the

	5201 ** hot-journal back.

	5202 **

	5203 ** Because the intermediate RESERVED lock is not requested, any

	5204 ** other process attempting to access the database file will get to

	5205 ** this point in the code and fail to obtain its own EXCLUSIVE lock

	5206 ** on the database file.

	5207 **

	5208 ** Unless the pager is in locking_mode=exclusive mode, the lock is

	5209 ** downgraded to SHARED_LOCK before this function returns.

	5210 */

	5211 rc = pagerLockDb(pPager, EXCLUSIVE_LOCK);

	5212 if( rc!=SQLITE_OK ){

	5213 goto failed;

	5214 }

	5215

	5216 /* If it is not already open and the file exists on disk, open the

	5217 ** journal for read/write access. Write access is required because

	5218 ** in exclusive-access mode the file descriptor will be kept open

	5219 ** and possibly used for a transaction later on. Also, write-access

	5220 ** is usually required to finalize the journal in journal_mode=persist

	5221 ** mode (and also for journal_mode=truncate on some systems).

	5222 **

	5223 ** If the journal does not exist, it usually means that some

	5224 ** other connection managed to get in and roll it back before

	5225 ** this connection obtained the exclusive lock above. Or, it

	5226 ** may mean that the pager was in the error-state when this

	5227 ** function was called and the journal file does not exist.

	5228 */

	5229 if( !isOpen(pPager->jfd) ){

	5230 sqlite3_vfs * const pVfs = pPager->pVfs;

	5231 int bExists; /* True if journal file exists */

	5232 rc = sqlite3OsAccess(

	5233 pVfs, pPager->zJournal, SQLITE_ACCESS_EXISTS, &bExists);

	5234 if( rc==SQLITE_OK && bExists ){

	5235 int fout = 0;

	5236 int f = SQLITE_OPEN_READWRITE\|SQLITE_OPEN_MAIN_JOURNAL;

	5237 assert( !pPager->tempFile );

	5238 rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, f, &fout);

	5239 assert( rc!=SQLITE_OK \|\| isOpen(pPager->jfd) );

	5240 if( rc==SQLITE_OK && fout&SQLITE_OPEN_READONLY ){

	5241 rc = SQLITE_CANTOPEN_BKPT;

	5242 sqlite3OsClose(pPager->jfd);

	5243 }

	5244 }

	5245 }

	5246

	5247 /* Playback and delete the journal. Drop the database write

	5248 ** lock and reacquire the read lock. Purge the cache before

	5249 ** playing back the hot-journal so that we don't end up with

	5250 ** an inconsistent cache. Sync the hot journal before playing

	5251 ** it back since the process that crashed and left the hot journal

	5252 ** probably did not sync it and we are required to always sync

	5253 ** the journal before playing it back.

	5254 */

	5255 if( isOpen(pPager->jfd) ){

	5256 assert( rc==SQLITE_OK );

	5257 rc = pagerSyncHotJournal(pPager);

	5258 if( rc==SQLITE_OK ){

	5259 rc = pager_playback(pPager, 1);

	5260 pPager->eState = PAGER_OPEN;

	5261 }

	5262 }else if( !pPager->exclusiveMode ){

	5263 pagerUnlockDb(pPager, SHARED_LOCK);

	5264 }

	5265

	5266 if( rc!=SQLITE_OK ){

	5267 /* This branch is taken if an error occurs while trying to open

	5268 ** or roll back a hot-journal while holding an EXCLUSIVE lock. The

	5269 ** pager_unlock() routine will be called before returning to unlock

	5270 ** the file. If the unlock attempt fails, then Pager.eLock must be

	5271 ** set to UNKNOWN_LOCK (see the comment above the #define for

	5272 ** UNKNOWN_LOCK above for an explanation).

	5273 **

	5274 ** In order to get pager_unlock() to do this, set Pager.eState to

	5275 ** PAGER_ERROR now. This is not actually counted as a transition

	5276 ** to ERROR state in the state diagram at the top of this file,

	5277 ** since we know that the same call to pager_unlock() will very

	5278 ** shortly transition the pager object to the OPEN state. Calling

	5279 ** assert_pager_state() would fail now, as it should not be possible

	5280 ** to be in ERROR state when there are zero outstanding page

	5281 ** references.

	5282 */

	5283 pager_error(pPager, rc);

	5284 goto failed;

	5285 }

	5286

	5287 assert( pPager->eState==PAGER_OPEN );

	5288 assert( (pPager->eLock==SHARED_LOCK)

	5289 \|\| (pPager->exclusiveMode && pPager->eLock>SHARED_LOCK)

	5290 );

	5291 }

	5292

	5293 if( !pPager->tempFile && pPager->hasHeldSharedLock ){

	5294 /* The shared-lock has just been acquired then check to

	5295 ** see if the database has been modified. If the database has changed,

	5296 ** flush the cache. The hasHeldSharedLock flag prevents this from

	5297 ** occurring on the very first access to a file, in order to save a

	5298 ** single unnecessary sqlite3OsRead() call at the start-up.

	5299 **

	5300 ** Database changes are detected by looking at 15 bytes beginning

	5301 ** at offset 24 into the file. The first 4 of these 16 bytes are

	5302 ** a 32-bit counter that is incremented with each change. The

	5303 ** other bytes change randomly with each file change when

	5304 ** a codec is in use.

	5305 **

	5306 ** There is a vanishingly small chance that a change will not be

	5307 ** detected. The chance of an undetected change is so small that

	5308 ** it can be neglected.

	5309 */

	5310 Pgno nPage = 0;

	5311 char dbFileVers[sizeof(pPager->dbFileVers)];

	5312

	5313 rc = pagerPagecount(pPager, &nPage);

	5314 if( rc ) goto failed;

	5315

	5316 if( nPage>0 ){

	5317 IOTRACE(("CKVERS %p %d\n", pPager, sizeof(dbFileVers)));

	5318 rc = sqlite3OsRead(pPager->fd, &dbFileVers, sizeof(dbFileVers), 24);

	5319 if( rc!=SQLITE_OK && rc!=SQLITE_IOERR_SHORT_READ ){

	5320 goto failed;

	5321 }

	5322 }else{

	5323 memset(dbFileVers, 0, sizeof(dbFileVers));

	5324 }

	5325

	5326 if( memcmp(pPager->dbFileVers, dbFileVers, sizeof(dbFileVers))!=0 ){

	5327 pager_reset(pPager);

	5328

	5329 /* Unmap the database file. It is possible that external processes

	5330 ** may have truncated the database file and then extended it back

	5331 ** to its original size while this process was not holding a lock.

	5332 ** In this case there may exist a Pager.pMap mapping that appears

	5333 ** to be the right size but is not actually valid. Avoid this

	5334 ** possibility by unmapping the db here. */

	5335 if( USEFETCH(pPager) ){

	5336 sqlite3OsUnfetch(pPager->fd, 0, 0);

	5337 }

	5338 }

	5339 }

	5340

	5341 /* If there is a WAL file in the file-system, open this database in WAL

	5342 ** mode. Otherwise, the following function call is a no-op.

	5343 */

	5344 rc = pagerOpenWalIfPresent(pPager);

	5345 #ifndef SQLITE_OMIT_WAL

	5346 assert( pPager->pWal==0 \|\| rc==SQLITE_OK );

	5347 #endif

	5348 }

	5349

	5350 if( pagerUseWal(pPager) ){

	5351 assert( rc==SQLITE_OK );

	5352 rc = pagerBeginReadTransaction(pPager);

	5353 }

	5354

	5355 if( pPager->eState==PAGER_OPEN && rc==SQLITE_OK ){

	5356 rc = pagerPagecount(pPager, &pPager->dbSize);

	5357 }

	5358

	5359 failed:

	5360 if( rc!=SQLITE_OK ){

	5361 assert( !MEMDB );

	5362 pager_unlock(pPager);

	5363 assert( pPager->eState==PAGER_OPEN );

	5364 }else{

	5365 pPager->eState = PAGER_READER;

	5366 pPager->hasHeldSharedLock = 1;

	5367 }

	5368 return rc;

	5369 }

	5370

	5371 /*

	5372 ** If the reference count has reached zero, rollback any active

	5373 ** transaction and unlock the pager.

	5374 **

	5375 ** Except, in locking_mode=EXCLUSIVE when there is nothing to in

	5376 ** the rollback journal, the unlock is not performed and there is

	5377 ** nothing to rollback, so this routine is a no-op.

	5378 */

	5379 static void pagerUnlockIfUnused(Pager *pPager){

	5380 if( pPager->nMmapOut==0 && (sqlite3PcacheRefCount(pPager->pPCache)==0) ){

	5381 pagerUnlockAndRollback(pPager);

	5382 }

	5383 }

	5384

	5385 /*

	5386 ** Acquire a reference to page number pgno in pager pPager (a page

	5387 ** reference has type DbPage*). If the requested reference is

	5388 ** successfully obtained, it is copied to *ppPage and SQLITE_OK returned.

	5389 **

	5390 ** If the requested page is already in the cache, it is returned.

	5391 ** Otherwise, a new page object is allocated and populated with data

	5392 ** read from the database file. In some cases, the pcache module may

	5393 ** choose not to allocate a new page object and may reuse an existing

	5394 ** object with no outstanding references.

	5395 **

	5396 ** The extra data appended to a page is always initialized to zeros the

	5397 ** first time a page is loaded into memory. If the page requested is

	5398 ** already in the cache when this function is called, then the extra

	5399 ** data is left as it was when the page object was last used.

	5400 **

	5401 ** If the database image is smaller than the requested page or if a

	5402 ** non-zero value is passed as the noContent parameter and the

	5403 ** requested page is not already stored in the cache, then no

	5404 ** actual disk read occurs. In this case the memory image of the

	5405 ** page is initialized to all zeros.

	5406 **

	5407 ** If noContent is true, it means that we do not care about the contents

	5408 ** of the page. This occurs in two scenarios:

	5409 **

	5410 ** a) When reading a free-list leaf page from the database, and

	5411 **

	5412 ** b) When a savepoint is being rolled back and we need to load

	5413 ** a new page into the cache to be filled with the data read

	5414 ** from the savepoint journal.

	5415 **

	5416 ** If noContent is true, then the data returned is zeroed instead of

	5417 ** being read from the database. Additionally, the bits corresponding

	5418 ** to pgno in Pager.pInJournal (bitvec of pages already written to the

	5419 ** journal file) and the PagerSavepoint.pInSavepoint bitvecs of any open

	5420 ** savepoints are set. This means if the page is made writable at any

	5421 ** point in the future, using a call to sqlite3PagerWrite(), its contents

	5422 ** will not be journaled. This saves IO.

	5423 **

	5424 ** The acquisition might fail for several reasons. In all cases,

	5425 ** an appropriate error code is returned and *ppPage is set to NULL.

	5426 **

	5427 ** See also sqlite3PagerLookup(). Both this routine and Lookup() attempt

	5428 ** to find a page in the in-memory cache first. If the page is not already

	5429 ** in memory, this routine goes to disk to read it in whereas Lookup()

	5430 ** just returns 0. This routine acquires a read-lock the first time it

	5431 ** has to go to disk, and could also playback an old journal if necessary.

	5432 ** Since Lookup() never goes to disk, it never has to deal with locks

	5433 ** or journal files.

	5434 */

	5435 SQLITE_PRIVATE int sqlite3PagerGet(

	5436 Pager pPager, / The pager open on the database file */

	5437 Pgno pgno, /* Page number to fetch */

	5438 DbPage *ppPage, / Write a pointer to the page here */

	5439 int flags /* PAGER_GET_XXX flags */

	5440 ){

	5441 int rc = SQLITE_OK;

	5442 PgHdr *pPg = 0;

	5443 u32 iFrame = 0; /* Frame to read from WAL file */

	5444 const int noContent = (flags & PAGER_GET_NOCONTENT);

	5445

	5446 /* It is acceptable to use a read-only (mmap) page for any page except

	5447 ** page 1 if there is no write-transaction open or the ACQUIRE_READONLY

	5448 ** flag was specified by the caller. And so long as the db is not a

	5449 ** temporary or in-memory database. */

	5450 const int bMmapOk = (pgno>1 && USEFETCH(pPager)

	5451 && (pPager->eState==PAGER_READER \|\| (flags & PAGER_GET_READONLY))

	5452 #ifdef SQLITE_HAS_CODEC

	5453 && pPager->xCodec==0

	5454 #endif

	5455 );

	5456

	5457 /* Optimization note: Adding the "pgno<=1" term before "pgno==0" here

	5458 ** allows the compiler optimizer to reuse the results of the "pgno>1"

	5459 ** test in the previous statement, and avoid testing pgno==0 in the

	5460 ** common case where pgno is large. */

	5461 if( pgno<=1 && pgno==0 ){

	5462 return SQLITE_CORRUPT_BKPT;

	5463 }

	5464 assert( pPager->eState>=PAGER_READER );

	5465 assert( assert_pager_state(pPager) );

	5466 assert( noContent==0 \|\| bMmapOk==0 );

	5467

	5468 assert( pPager->hasHeldSharedLock==1 );

	5469

	5470 /* If the pager is in the error state, return an error immediately.

	5471 ** Otherwise, request the page from the PCache layer. */

	5472 if( pPager->errCode!=SQLITE_OK ){

	5473 rc = pPager->errCode;

	5474 }else{

	5475 if( bMmapOk && pagerUseWal(pPager) ){

	5476 rc = sqlite3WalFindFrame(pPager->pWal, pgno, &iFrame);

	5477 if( rc!=SQLITE_OK ) goto pager_acquire_err;

	5478 }

	5479

	5480 if( bMmapOk && iFrame==0 ){

	5481 void *pData = 0;

	5482

	5483 rc = sqlite3OsFetch(pPager->fd,

	5484 (i64)(pgno-1) * pPager->pageSize, pPager->pageSize, &pData

	5485 );

	5486

	5487 if( rc==SQLITE_OK && pData ){

	5488 if( pPager->eState>PAGER_READER ){

	5489 pPg = sqlite3PagerLookup(pPager, pgno);

	5490 }

	5491 if( pPg==0 ){

	5492 rc = pagerAcquireMapPage(pPager, pgno, pData, &pPg);

	5493 }else{

	5494 sqlite3OsUnfetch(pPager->fd, (i64)(pgno-1)*pPager->pageSize, pData);

	5495 }

	5496 if( pPg ){

	5497 assert( rc==SQLITE_OK );

	5498 *ppPage = pPg;

	5499 return SQLITE_OK;

	5500 }

	5501 }

	5502 if( rc!=SQLITE_OK ){

	5503 goto pager_acquire_err;

	5504 }

	5505 }

	5506

	5507 {

	5508 sqlite3_pcache_page *pBase;

	5509 pBase = sqlite3PcacheFetch(pPager->pPCache, pgno, 3);

	5510 if( pBase==0 ){

	5511 rc = sqlite3PcacheFetchStress(pPager->pPCache, pgno, &pBase);

	5512 if( rc!=SQLITE_OK ) goto pager_acquire_err;

	5513 if( pBase==0 ){

	5514 pPg = *ppPage = 0;

	5515 rc = SQLITE_NOMEM;

	5516 goto pager_acquire_err;

	5517 }

	5518 }

	5519 pPg = *ppPage = sqlite3PcacheFetchFinish(pPager->pPCache, pgno, pBase);

	5520 assert( pPg!=0 );

	5521 }

	5522 }

	5523

	5524 if( rc!=SQLITE_OK ){

	5525 /* Either the call to sqlite3PcacheFetch() returned an error or the

	5526 ** pager was already in the error-state when this function was called.

	5527 ** Set pPg to 0 and jump to the exception handler. */

	5528 pPg = 0;

	5529 goto pager_acquire_err;

	5530 }

	5531 assert( pPg==(*ppPage) );

	5532 assert( pPg->pgno==pgno );

	5533 assert( pPg->pPager==pPager \|\| pPg->pPager==0 );

	5534

	5535 if( pPg->pPager && !noContent ){

	5536 /* In this case the pcache already contains an initialized copy of

	5537 ** the page. Return without further ado. */

	5538 assert( pgno<=PAGER_MAX_PGNO && pgno!=PAGER_MJ_PGNO(pPager) );

	5539 pPager->aStat[PAGER_STAT_HIT]++;

	5540 return SQLITE_OK;

	5541

	5542 }else{

	5543 /* The pager cache has created a new page. Its content needs to

	5544 ** be initialized. */

	5545

	5546 pPg->pPager = pPager;

	5547

	5548 /* The maximum page number is 2^31. Return SQLITE_CORRUPT if a page

	5549 ** number greater than this, or the unused locking-page, is requested. */

	5550 if( pgno>PAGER_MAX_PGNO \|\| pgno==PAGER_MJ_PGNO(pPager) ){

	5551 rc = SQLITE_CORRUPT_BKPT;

	5552 goto pager_acquire_err;

	5553 }

	5554

	5555 if( MEMDB \|\| pPager->dbSize<pgno \|\| noContent \|\| !isOpen(pPager->fd) ){

	5556 if( pgno>pPager->mxPgno ){

	5557 rc = SQLITE_FULL;

	5558 goto pager_acquire_err;

	5559 }

	5560 if( noContent ){

	5561 /* Failure to set the bits in the InJournal bit-vectors is benign.

	5562 ** It merely means that we might do some extra work to journal a

	5563 ** page that does not need to be journaled. Nevertheless, be sure

	5564 ** to test the case where a malloc error occurs while trying to set

	5565 ** a bit in a bit vector.

	5566 */

	5567 sqlite3BeginBenignMalloc();

	5568 if( pgno<=pPager->dbOrigSize ){

	5569 TESTONLY( rc = ) sqlite3BitvecSet(pPager->pInJournal, pgno);

	5570 testcase( rc==SQLITE_NOMEM );

	5571 }

	5572 TESTONLY( rc = ) addToSavepointBitvecs(pPager, pgno);

	5573 testcase( rc==SQLITE_NOMEM );

	5574 sqlite3EndBenignMalloc();

	5575 }

	5576 memset(pPg->pData, 0, pPager->pageSize);

	5577 IOTRACE(("ZERO %p %d\n", pPager, pgno));

	5578 }else{

	5579 if( pagerUseWal(pPager) && bMmapOk==0 ){

	5580 rc = sqlite3WalFindFrame(pPager->pWal, pgno, &iFrame);

	5581 if( rc!=SQLITE_OK ) goto pager_acquire_err;

	5582 }

	5583 assert( pPg->pPager==pPager );

	5584 pPager->aStat[PAGER_STAT_MISS]++;

	5585 rc = readDbPage(pPg, iFrame);

	5586 if( rc!=SQLITE_OK ){

	5587 goto pager_acquire_err;

	5588 }

	5589 }

	5590 pager_set_pagehash(pPg);

	5591 }

	5592

	5593 return SQLITE_OK;

	5594

	5595 pager_acquire_err:

	5596 assert( rc!=SQLITE_OK );

	5597 if( pPg ){

	5598 sqlite3PcacheDrop(pPg);

	5599 }

	5600 pagerUnlockIfUnused(pPager);

	5601

	5602 *ppPage = 0;

	5603 return rc;

	5604 }

	5605

	5606 /*

	5607 ** Acquire a page if it is already in the in-memory cache. Do

	5608 ** not read the page from disk. Return a pointer to the page,

	5609 ** or 0 if the page is not in cache.

	5610 **

	5611 ** See also sqlite3PagerGet(). The difference between this routine

	5612 ** and sqlite3PagerGet() is that _get() will go to the disk and read

	5613 ** in the page if the page is not already in cache. This routine

	5614 ** returns NULL if the page is not in cache or if a disk I/O error

	5615 ** has ever happened.

	5616 */

	5617 SQLITE_PRIVATE DbPage sqlite3PagerLookup(Pager pPager, Pgno pgno){

	5618 sqlite3_pcache_page *pPage;

	5619 assert( pPager!=0 );

	5620 assert( pgno!=0 );

	5621 assert( pPager->pPCache!=0 );

	5622 pPage = sqlite3PcacheFetch(pPager->pPCache, pgno, 0);

	5623 assert( pPage==0 \|\| pPager->hasHeldSharedLock );

	5624 if( pPage==0 ) return 0;

	5625 return sqlite3PcacheFetchFinish(pPager->pPCache, pgno, pPage);

	5626 }

	5627

	5628 /*

	5629 ** Release a page reference.

	5630 **

	5631 ** If the number of references to the page drop to zero, then the

	5632 ** page is added to the LRU list. When all references to all pages

	5633 ** are released, a rollback occurs and the lock on the database is

	5634 ** removed.

	5635 */

	5636 SQLITE_PRIVATE void sqlite3PagerUnrefNotNull(DbPage *pPg){

	5637 Pager *pPager;

	5638 assert( pPg!=0 );

	5639 pPager = pPg->pPager;

	5640 if( pPg->flags & PGHDR_MMAP ){

	5641 pagerReleaseMapPage(pPg);

	5642 }else{

	5643 sqlite3PcacheRelease(pPg);

	5644 }

	5645 pagerUnlockIfUnused(pPager);

	5646 }

	5647 SQLITE_PRIVATE void sqlite3PagerUnref(DbPage *pPg){

	5648 if( pPg ) sqlite3PagerUnrefNotNull(pPg);

	5649 }

	5650

	#if defined(__APPLE__)
	/*
	** Create and return a CFURLRef given a cstring containing the path to a file.
	**
	** Returns NULL if filePath is NULL or if either Core Foundation object
	** cannot be created. A non-NULL result is owned by the caller, who must
	** release it with CFRelease().
	*/
	static CFURLRef create_cfurl_from_cstring(const char* filePath){
	  CFStringRef urlString;
	  CFURLRef urlRef;

	  if( filePath==NULL ) return NULL;
	  urlString = CFStringCreateWithFileSystemRepresentation(
	      kCFAllocatorDefault, filePath);
	  /* CFRelease(NULL) crashes, so bail out before touching a NULL string. */
	  if( urlString==NULL ) return NULL;
	  urlRef = CFURLCreateWithFileSystemPath(kCFAllocatorDefault,
	      urlString, kCFURLPOSIXPathStyle, FALSE);
	  CFRelease(urlString);
	  return urlRef;
	}
	#endif

	5664

	5665 /*

	5666 ** This function is called at the start of every write transaction.

	5667 ** There must already be a RESERVED or EXCLUSIVE lock on the database

	5668 ** file when this routine is called.

	5669 **

	5670 ** Open the journal file for pager pPager and write a journal header

	5671 ** to the start of it. If there are active savepoints, open the sub-journal

	5672 ** as well. This function is only used when the journal file is being

	5673 ** opened to write a rollback log for a transaction. It is not used

	5674 ** when opening a hot journal file to roll it back.

	5675 **

	5676 ** If the journal file is already open (as it may be in exclusive mode),

	5677 ** then this function just writes a journal header to the start of the

	5678 ** already open file.

	5679 **

	5680 ** Whether or not the journal file is opened by this function, the

	5681 ** Pager.pInJournal bitvec structure is allocated.

	5682 **

	5683 ** Return SQLITE_OK if everything is successful. Otherwise, return

	5684 ** SQLITE_NOMEM if the attempt to allocate Pager.pInJournal fails, or

	5685 ** an IO error code if opening or writing the journal file fails.

	5686 */

	5687 static int pager_open_journal(Pager *pPager){

	5688 int rc = SQLITE_OK; /* Return code */

	5689 sqlite3_vfs * const pVfs = pPager->pVfs; /* Local cache of vfs pointer */

	5690

	5691 assert( pPager->eState==PAGER_WRITER_LOCKED );

	5692 assert( assert_pager_state(pPager) );

	5693 assert( pPager->pInJournal==0 );

	5694

	5695 /* If already in the error state, this function is a no-op. But on

	5696 ** the other hand, this routine is never called if we are already in

	5697 ** an error state. */

	5698 if( NEVER(pPager->errCode) ) return pPager->errCode;

	5699

	5700 if( !pagerUseWal(pPager) && pPager->journalMode!=PAGER_JOURNALMODE_OFF ){

	5701 pPager->pInJournal = sqlite3BitvecCreate(pPager->dbSize);

	5702 if( pPager->pInJournal==0 ){

	5703 return SQLITE_NOMEM;

	5704 }

	5705

	5706 /* Open the journal file if it is not already open. */

	5707 if( !isOpen(pPager->jfd) ){

	5708 if( pPager->journalMode==PAGER_JOURNALMODE_MEMORY ){

	5709 sqlite3MemJournalOpen(pPager->jfd);

	5710 }else{

	5711 const int flags = /* VFS flags to open journal file */

	5712 SQLITE_OPEN_READWRITE\|SQLITE_OPEN_CREATE\|

	5713 (pPager->tempFile ?

	5714 (SQLITE_OPEN_DELETEONCLOSE\|SQLITE_OPEN_TEMP_JOURNAL):

	5715 (SQLITE_OPEN_MAIN_JOURNAL)

	5716 );

	5717

	5718 /* Verify that the database still has the same name as it did when

	5719 ** it was originally opened. */

	5720 rc = databaseIsUnmoved(pPager);

	5721 if( rc==SQLITE_OK ){

	5722 #ifdef SQLITE_ENABLE_ATOMIC_WRITE

	5723 rc = sqlite3JournalOpen(

	5724 pVfs, pPager->zJournal, pPager->jfd, flags, jrnlBufferSize(pPager)

	5725 );

	5726 #else

	5727 rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, flags, 0);

	5728 #endif

	5729 #if defined(__APPLE__)

	5730 /* Set the TimeMachine exclusion metadata for the journal if it has

	5731 ** been set for the database. Only do this for unix-type vfs

	5732 ** implementations. */

	5733 if( rc==SQLITE_OK && pPager->zFilename!=NULL

	5734 && strlen(pPager->zFilename)>0

	5735 && strncmp(pVfs->zName, "unix", 4)==0

	5736 && ( pVfs->zName[4]=='-' \|\| pVfs->zName[4]=='\0' ) ){

	5737 CFURLRef database = create_cfurl_from_cstring(pPager->zFilename);

	5738 if( CSBackupIsItemExcluded(database, NULL) ){

	5739 CFURLRef journal = create_cfurl_from_cstring(pPager->zJournal);

	5740 /* Ignore errors from the following exclusion call. */

	5741 CSBackupSetItemExcluded(journal, TRUE, FALSE);

	5742 CFRelease(journal);

	5743 }

	5744 CFRelease(database);

	5745 }

	5746 #endif

	5747 }

	5748 }

	5749 assert( rc!=SQLITE_OK \|\| isOpen(pPager->jfd) );

	5750 }

	5751

	5752

	5753 /* Write the first journal header to the journal file and open

	5754 ** the sub-journal if necessary.

	5755 */

	5756 if( rc==SQLITE_OK ){

	5757 /* TODO: Check if all of these are really required. */

	5758 pPager->nRec = 0;

	5759 pPager->journalOff = 0;

	5760 pPager->setMaster = 0;

	5761 pPager->journalHdr = 0;

	5762 rc = writeJournalHdr(pPager);

	5763 }

	5764 }

	5765

	5766 if( rc!=SQLITE_OK ){

	5767 sqlite3BitvecDestroy(pPager->pInJournal);

	5768 pPager->pInJournal = 0;

	5769 }else{

	5770 assert( pPager->eState==PAGER_WRITER_LOCKED );

	5771 pPager->eState = PAGER_WRITER_CACHEMOD;

	5772 }

	5773

	5774 return rc;

	5775 }

	5776

	5777 /*

	5778 ** Begin a write-transaction on the specified pager object. If a

	5779 ** write-transaction has already been opened, this function is a no-op.

	5780 **

	5781 ** If the exFlag argument is false, then acquire at least a RESERVED

	5782 ** lock on the database file. If exFlag is true, then acquire at least

	5783 ** an EXCLUSIVE lock. If such a lock is already held, no locking

	5784 ** functions need be called.

	5785 **

	5786 ** If the subjInMemory argument is non-zero, then any sub-journal opened

	5787 ** within this transaction will be opened as an in-memory file. This

	5788 ** has no effect if the sub-journal is already opened (as it may be when

	5789 ** running in exclusive mode) or if the transaction does not require a

	5790 ** sub-journal. If the subjInMemory argument is zero, then any required

	5791 ** sub-journal is implemented in-memory if pPager is an in-memory database,

	5792 ** or using a temporary file otherwise.

	5793 */

	5794 SQLITE_PRIVATE int sqlite3PagerBegin(Pager *pPager, int exFlag, int subjInMemory ){

	5795 int rc = SQLITE_OK;

	5796

	5797 if( pPager->errCode ) return pPager->errCode;

	5798 assert( pPager->eState>=PAGER_READER && pPager->eState<PAGER_ERROR );

	5799 pPager->subjInMemory = (u8)subjInMemory;

	5800

	5801 if( ALWAYS(pPager->eState==PAGER_READER) ){

	5802 assert( pPager->pInJournal==0 );

	5803

	5804 if( pagerUseWal(pPager) ){

	5805 /* If the pager is configured to use locking_mode=exclusive, and an

	5806 ** exclusive lock on the database is not already held, obtain it now.

	5807 */

	5808 if( pPager->exclusiveMode && sqlite3WalExclusiveMode(pPager->pWal, -1) ){

	5809 rc = pagerLockDb(pPager, EXCLUSIVE_LOCK);

	5810 if( rc!=SQLITE_OK ){

	5811 return rc;

	5812 }

	5813 (void)sqlite3WalExclusiveMode(pPager->pWal, 1);

	5814 }

	5815

	5816 /* Grab the write lock on the log file. If successful, upgrade to

	5817 ** PAGER_RESERVED state. Otherwise, return an error code to the caller.

	5818 ** The busy-handler is not invoked if another connection already

	5819 ** holds the write-lock. If possible, the upper layer will call it.

	5820 */

	5821 rc = sqlite3WalBeginWriteTransaction(pPager->pWal);

	5822 }else{

	5823 /* Obtain a RESERVED lock on the database file. If the exFlag parameter

	5824 ** is true, then immediately upgrade this to an EXCLUSIVE lock. The

	5825 ** busy-handler callback can be used when upgrading to the EXCLUSIVE

	5826 ** lock, but not when obtaining the RESERVED lock.

	5827 */

	5828 rc = pagerLockDb(pPager, RESERVED_LOCK);

	5829 if( rc==SQLITE_OK && exFlag ){

	5830 rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);

	5831 }

	5832 }

	5833

	5834 if( rc==SQLITE_OK ){

	5835 /* Change to WRITER_LOCKED state.

	5836 **

	5837 ** WAL mode sets Pager.eState to PAGER_WRITER_LOCKED or CACHEMOD

	5838 ** when it has an open transaction, but never to DBMOD or FINISHED.

	5839 ** This is because in those states the code to roll back savepoint

	5840 ** transactions may copy data from the sub-journal into the database

	5841 ** file as well as into the page cache. Which would be incorrect in

	5842 ** WAL mode.

	5843 */

	5844 pPager->eState = PAGER_WRITER_LOCKED;

	5845 pPager->dbHintSize = pPager->dbSize;

	5846 pPager->dbFileSize = pPager->dbSize;

	5847 pPager->dbOrigSize = pPager->dbSize;

	5848 pPager->journalOff = 0;

	5849 }

	5850

	5851 assert( rc==SQLITE_OK \|\| pPager->eState==PAGER_READER );

	5852 assert( rc!=SQLITE_OK \|\| pPager->eState==PAGER_WRITER_LOCKED );

	5853 assert( assert_pager_state(pPager) );

	5854 }

	5855

	5856 PAGERTRACE(("TRANSACTION %d\n", PAGERID(pPager)));

	5857 return rc;

	5858 }

	5859

	5860 /*

	5861 ** Write page pPg onto the end of the rollback journal.

	5862 */

	5863 static SQLITE_NOINLINE int pagerAddPageToRollbackJournal(PgHdr *pPg){

	5864 Pager *pPager = pPg->pPager;

	5865 int rc;

	5866 u32 cksum;

	5867 char *pData2;

	5868 i64 iOff = pPager->journalOff;

	5869

	5870 /* We should never write to the journal file the page that

	5871 ** contains the database locks. The following assert verifies

	5872 ** that we do not. */

	5873 assert( pPg->pgno!=PAGER_MJ_PGNO(pPager) );

	5874

	5875 assert( pPager->journalHdr<=pPager->journalOff );

	5876 CODEC2(pPager, pPg->pData, pPg->pgno, 7, return SQLITE_NOMEM, pData2);

	5877 cksum = pager_cksum(pPager, (u8*)pData2);

	5878

	5879 /* Even if an IO or diskfull error occurs while journalling the

	5880 ** page in the block above, set the need-sync flag for the page.

	5881 ** Otherwise, when the transaction is rolled back, the logic in

	5882 ** playback_one_page() will think that the page needs to be restored

	5883 ** in the database file. And if an IO error occurs while doing so,

	5884 ** then corruption may follow.

	5885 */

	5886 pPg->flags \|= PGHDR_NEED_SYNC;

	5887

	5888 rc = write32bits(pPager->jfd, iOff, pPg->pgno);

	5889 if( rc!=SQLITE_OK ) return rc;

	5890 rc = sqlite3OsWrite(pPager->jfd, pData2, pPager->pageSize, iOff+4);

	5891 if( rc!=SQLITE_OK ) return rc;

	5892 rc = write32bits(pPager->jfd, iOff+pPager->pageSize+4, cksum);

	5893 if( rc!=SQLITE_OK ) return rc;

	5894

	5895 IOTRACE(("JOUT %p %d %lld %d\n", pPager, pPg->pgno,

	5896 pPager->journalOff, pPager->pageSize));

	5897 PAGER_INCR(sqlite3_pager_writej_count);

	5898 PAGERTRACE(("JOURNAL %d page %d needSync=%d hash(%08x)\n",

	5899 PAGERID(pPager), pPg->pgno,

	5900 ((pPg->flags&PGHDR_NEED_SYNC)?1:0), pager_pagehash(pPg)));

	5901

	5902 pPager->journalOff += 8 + pPager->pageSize;

	5903 pPager->nRec++;

	5904 assert( pPager->pInJournal!=0 );

	5905 rc = sqlite3BitvecSet(pPager->pInJournal, pPg->pgno);

	5906 testcase( rc==SQLITE_NOMEM );

	5907 assert( rc==SQLITE_OK \|\| rc==SQLITE_NOMEM );

	5908 rc \|= addToSavepointBitvecs(pPager, pPg->pgno);

	5909 assert( rc==SQLITE_OK \|\| rc==SQLITE_NOMEM );

	5910 return rc;

	5911 }

	5912

	5913 /*

	5914 ** Mark a single data page as writeable. The page is written into the

	5915 ** main journal or sub-journal as required. If the page is written into

	5916 ** one of the journals, the corresponding bit is set in the

	5917 ** Pager.pInJournal bitvec and the PagerSavepoint.pInSavepoint bitvecs

	5918 ** of any open savepoints as appropriate.

	5919 */

	5920 static int pager_write(PgHdr *pPg){

	5921 Pager *pPager = pPg->pPager;

	5922 int rc = SQLITE_OK;

	5923

	5924 /* This routine is not called unless a write-transaction has already

	5925 ** been started. The journal file may or may not be open at this point.

	5926 ** It is never called in the ERROR state.

	5927 */

	5928 assert( pPager->eState==PAGER_WRITER_LOCKED

	5929 \|\| pPager->eState==PAGER_WRITER_CACHEMOD

	5930 \|\| pPager->eState==PAGER_WRITER_DBMOD

	5931 );

	5932 assert( assert_pager_state(pPager) );

	5933 assert( pPager->errCode==0 );

	5934 assert( pPager->readOnly==0 );

	5935 CHECK_PAGE(pPg);

	5936

	5937 /* The journal file needs to be opened. Higher level routines have already

	5938 ** obtained the necessary locks to begin the write-transaction, but the

	5939 ** rollback journal might not yet be open. Open it now if this is the case.

	5940 **

	5941 ** This is done before calling sqlite3PcacheMakeDirty() on the page.

	5942 ** Otherwise, if it were done after calling sqlite3PcacheMakeDirty(), then

	5943 ** an error might occur and the pager would end up in WRITER_LOCKED state

	5944 ** with pages marked as dirty in the cache.

	5945 */

	5946 if( pPager->eState==PAGER_WRITER_LOCKED ){

	5947 rc = pager_open_journal(pPager);

	5948 if( rc!=SQLITE_OK ) return rc;

	5949 }

	5950 assert( pPager->eState>=PAGER_WRITER_CACHEMOD );

	5951 assert( assert_pager_state(pPager) );

	5952

	5953 /* Mark the page that is about to be modified as dirty. */

	5954 sqlite3PcacheMakeDirty(pPg);

	5955

	5956 /* If a rollback journal is in use, then make sure the page that is about

	5957 ** to change is in the rollback journal, or if the page is a new page off

	5958 ** the end of the file, make sure it is marked as PGHDR_NEED_SYNC.

	5959 */

	5960 assert( (pPager->pInJournal!=0) == isOpen(pPager->jfd) );

	5961 if( pPager->pInJournal!=0

	5962 && sqlite3BitvecTestNotNull(pPager->pInJournal, pPg->pgno)==0

	5963 ){

	5964 assert( pagerUseWal(pPager)==0 );

	5965 if( pPg->pgno<=pPager->dbOrigSize ){

	5966 rc = pagerAddPageToRollbackJournal(pPg);

	5967 if( rc!=SQLITE_OK ){

	5968 return rc;

	5969 }

	5970 }else{

	5971 if( pPager->eState!=PAGER_WRITER_DBMOD ){

	5972 pPg->flags \|= PGHDR_NEED_SYNC;

	5973 }

	5974 PAGERTRACE(("APPEND %d page %d needSync=%d\n",

	5975 PAGERID(pPager), pPg->pgno,

	5976 ((pPg->flags&PGHDR_NEED_SYNC)?1:0)));

	5977 }

	5978 }

	5979

	5980 /* The PGHDR_DIRTY bit is set above when the page was added to the dirty-list

	5981 ** and before writing the page into the rollback journal. Wait until now,

	5982 ** after the page has been successfully journalled, before setting the

	5983 ** PGHDR_WRITEABLE bit that indicates that the page can be safely modified.

	5984 */

	5985 pPg->flags \|= PGHDR_WRITEABLE;

	5986

	5987 /* If the statement journal is open and the page is not in it,

	5988 ** then write the page into the statement journal.

	5989 */

	5990 if( pPager->nSavepoint>0 ){

	5991 rc = subjournalPageIfRequired(pPg);

	5992 }

	5993

	5994 /* Update the database size and return. */

	5995 if( pPager->dbSize<pPg->pgno ){

	5996 pPager->dbSize = pPg->pgno;

	5997 }

	5998 return rc;

	5999 }

	6000

	6001 /*

	6002 ** This is a variant of sqlite3PagerWrite() that runs when the sector size

	6003 ** is larger than the page size. SQLite makes the (reasonable) assumption that

	6004 ** all bytes of a sector are written together by hardware. Hence, all bytes of

	6005 ** a sector need to be journalled in case of a power loss in the middle of

	6006 ** a write.

	6007 **

	6008 ** Usually, the sector size is less than or equal to the page size, in which

	6009 ** case pages can be individually written. This routine only runs in the

	6010 ** exceptional case where the page size is smaller than the sector size.

	6011 */

	6012 static SQLITE_NOINLINE int pagerWriteLargeSector(PgHdr *pPg){

	6013 int rc = SQLITE_OK; /* Return code */

	6014 Pgno nPageCount; /* Total number of pages in database file */

	6015 Pgno pg1; /* First page of the sector pPg is located on. */

	6016 int nPage = 0; /* Number of pages starting at pg1 to journal */

	6017 int ii; /* Loop counter */

	6018 int needSync = 0; /* True if any page has PGHDR_NEED_SYNC */

	6019 Pager *pPager = pPg->pPager; /* The pager that owns pPg */

	6020 Pgno nPagePerSector = (pPager->sectorSize/pPager->pageSize);

	6021

	6022 /* Set the doNotSpill NOSYNC bit to 1. This is because we cannot allow

	6023 ** a journal header to be written between the pages journaled by

	6024 ** this function.

	6025 */

	6026 assert( !MEMDB );

	6027 assert( (pPager->doNotSpill & SPILLFLAG_NOSYNC)==0 );

	6028 pPager->doNotSpill \|= SPILLFLAG_NOSYNC;

	6029

	6030 /* This trick assumes that both the page-size and sector-size are

	6031 ** an integer power of 2. It sets variable pg1 to the identifier

	6032 ** of the first page of the sector pPg is located on.

	6033 */

	6034 pg1 = ((pPg->pgno-1) & ~(nPagePerSector-1)) + 1;

	6035

	6036 nPageCount = pPager->dbSize;

	6037 if( pPg->pgno>nPageCount ){

	6038 nPage = (pPg->pgno - pg1)+1;

	6039 }else if( (pg1+nPagePerSector-1)>nPageCount ){

	6040 nPage = nPageCount+1-pg1;

	6041 }else{

	6042 nPage = nPagePerSector;

	6043 }

	6044 assert(nPage>0);

	6045 assert(pg1<=pPg->pgno);

	6046 assert((pg1+nPage)>pPg->pgno);

	6047

	6048 for(ii=0; ii<nPage && rc==SQLITE_OK; ii++){

	6049 Pgno pg = pg1+ii;

	6050 PgHdr *pPage;

	6051 if( pg==pPg->pgno \|\| !sqlite3BitvecTest(pPager->pInJournal, pg) ){

	6052 if( pg!=PAGER_MJ_PGNO(pPager) ){

	6053 rc = sqlite3PagerGet(pPager, pg, &pPage, 0);

	6054 if( rc==SQLITE_OK ){

	6055 rc = pager_write(pPage);

	6056 if( pPage->flags&PGHDR_NEED_SYNC ){

	6057 needSync = 1;

	6058 }

	6059 sqlite3PagerUnrefNotNull(pPage);

	6060 }

	6061 }

	6062 }else if( (pPage = sqlite3PagerLookup(pPager, pg))!=0 ){

	6063 if( pPage->flags&PGHDR_NEED_SYNC ){

	6064 needSync = 1;

	6065 }

	6066 sqlite3PagerUnrefNotNull(pPage);

	6067 }

	6068 }

	6069

	6070 /* If the PGHDR_NEED_SYNC flag is set for any of the nPage pages

	6071 ** starting at pg1, then it needs to be set for all of them. Because

	6072 ** writing to any of these nPage pages may damage the others, the

	6073 ** journal file must contain sync()ed copies of all of them

	6074 ** before any of them can be written out to the database file.

	6075 */

	6076 if( rc==SQLITE_OK && needSync ){

	6077 assert( !MEMDB );

	6078 for(ii=0; ii<nPage; ii++){

	6079 PgHdr *pPage = sqlite3PagerLookup(pPager, pg1+ii);

	6080 if( pPage ){

	6081 pPage->flags \|= PGHDR_NEED_SYNC;

	6082 sqlite3PagerUnrefNotNull(pPage);

	6083 }

	6084 }

	6085 }

	6086

	6087 assert( (pPager->doNotSpill & SPILLFLAG_NOSYNC)!=0 );

	6088 pPager->doNotSpill &= ~SPILLFLAG_NOSYNC;

	6089 return rc;

	6090 }

	6091

	6092 /*

	6093 ** Mark a data page as writeable. This routine must be called before

	6094 ** making changes to a page. The caller must check the return value

	6095 ** of this function and be careful not to change any page data unless

	6096 ** this routine returns SQLITE_OK.

	6097 **

	6098 ** The difference between this function and pager_write() is that this

	6099 ** function also deals with the special case where 2 or more pages

	6100 ** fit on a single disk sector. In this case all co-resident pages

	6101 ** must have been written to the journal file before returning.

	6102 **

	6103 ** If an error occurs, SQLITE_NOMEM or an IO error code is returned

	6104 ** as appropriate. Otherwise, SQLITE_OK.

	6105 */

	6106 SQLITE_PRIVATE int sqlite3PagerWrite(PgHdr *pPg){

	6107 Pager *pPager = pPg->pPager;

	6108 assert( (pPg->flags & PGHDR_MMAP)==0 );

	6109 assert( pPager->eState>=PAGER_WRITER_LOCKED );

	6110 assert( assert_pager_state(pPager) );

	6111 if( pPager->errCode ){

	6112 return pPager->errCode;

	6113 }else if( (pPg->flags & PGHDR_WRITEABLE)!=0 && pPager->dbSize>=pPg->pgno ){

	6114 if( pPager->nSavepoint ) return subjournalPageIfRequired(pPg);

	6115 return SQLITE_OK;

	6116 }else if( pPager->sectorSize > (u32)pPager->pageSize ){

	6117 return pagerWriteLargeSector(pPg);

	6118 }else{

	6119 return pager_write(pPg);

	6120 }

	6121 }

	6122

	6123 /*

	6124 ** Return TRUE if the page given in the argument was previously passed

	6125 ** to sqlite3PagerWrite(). In other words, return TRUE if it is ok

	6126 ** to change the content of the page.

	6127 */

	6128 #ifndef NDEBUG

	6129 SQLITE_PRIVATE int sqlite3PagerIswriteable(DbPage *pPg){

	6130 return pPg->flags & PGHDR_WRITEABLE;

	6131 }

	6132 #endif

	6133

	6134 /*

	6135 ** A call to this routine tells the pager that it is not necessary to

	6136 ** write the information on page pPg back to the disk, even though

	6137 ** that page might be marked as dirty. This happens, for example, when

	6138 ** the page has been added as a leaf of the freelist and so its

	6139 ** content no longer matters.

	6140 **

	6141 ** The overlying software layer calls this routine when all of the data

	6142 ** on the given page is unused. The pager marks the page as clean so

	6143 ** that it does not get written to disk.

	6144 **

	6145 ** Tests show that this optimization can quadruple the speed of large

	6146 ** DELETE operations.

	6147 */

	6148 SQLITE_PRIVATE void sqlite3PagerDontWrite(PgHdr *pPg){

	6149 Pager *pPager = pPg->pPager;

	6150 if( (pPg->flags&PGHDR_DIRTY) && pPager->nSavepoint==0 ){

	6151 PAGERTRACE(("DONT_WRITE page %d of %d\n", pPg->pgno, PAGERID(pPager)));

	6152 IOTRACE(("CLEAN %p %d\n", pPager, pPg->pgno))

	6153 pPg->flags \|= PGHDR_DONT_WRITE;

	6154 pPg->flags &= ~PGHDR_WRITEABLE;

	6155 pager_set_pagehash(pPg);

	6156 }

	6157 }

	6158

	6159 /*

	6160 ** This routine is called to increment the value of the database file

	6161 ** change-counter, stored as a 4-byte big-endian integer starting at

	6162 ** byte offset 24 of the pager file. The secondary change counter at

	6163 ** 92 is also updated, as is the SQLite version number at offset 96.

	6164 **

	6165 ** But this only happens if the pPager->changeCountDone flag is false.

	6166 ** To avoid excess churning of page 1, the update only happens once.

	6167 ** See also the pager_write_changecounter() routine that does an

	6168 ** unconditional update of the change counters.

	6169 **

	6170 ** If the isDirectMode flag is zero, then this is done by calling

	6171 ** sqlite3PagerWrite() on page 1, then modifying the contents of the

	6172 ** page data. In this case the file will be updated when the current

	6173 ** transaction is committed.

	6174 **

	6175 ** The isDirectMode flag may only be non-zero if the library was compiled

	6176 ** with the SQLITE_ENABLE_ATOMIC_WRITE macro defined. In this case,

	6177 ** if isDirect is non-zero, then the database file is updated directly

	6178 ** by writing an updated version of page 1 using a call to the

	6179 ** sqlite3OsWrite() function.

	6180 */

	6181 static int pager_incr_changecounter(Pager *pPager, int isDirectMode){

	6182 int rc = SQLITE_OK;

	6183

	6184 assert( pPager->eState==PAGER_WRITER_CACHEMOD

	6185 \|\| pPager->eState==PAGER_WRITER_DBMOD

	6186 );

	6187 assert( assert_pager_state(pPager) );

	6188

	6189 /* Declare and initialize constant integer 'isDirect'. If the

	6190 ** atomic-write optimization is enabled in this build, then isDirect

	6191 ** is initialized to the value passed as the isDirectMode parameter

	6192 ** to this function. Otherwise, it is always set to zero.

	6193 **

	6194 ** The idea is that if the atomic-write optimization is not

	6195 ** enabled at compile time, the compiler can omit the tests of

	6196 ** 'isDirect' below, as well as the block enclosed in the

	6197 ** "if( isDirect )" condition.

	6198 */

	6199 #ifndef SQLITE_ENABLE_ATOMIC_WRITE

	6200 # define DIRECT_MODE 0

	6201 assert( isDirectMode==0 );

	6202 UNUSED_PARAMETER(isDirectMode);

	6203 #else

	6204 # define DIRECT_MODE isDirectMode

	6205 #endif

	6206

	6207 if( !pPager->changeCountDone && ALWAYS(pPager->dbSize>0) ){

	6208 PgHdr *pPgHdr; /* Reference to page 1 */

	6209

	6210 assert( !pPager->tempFile && isOpen(pPager->fd) );

	6211

	6212 /* Open page 1 of the file for writing. */

	6213 rc = sqlite3PagerGet(pPager, 1, &pPgHdr, 0);

	6214 assert( pPgHdr==0 \|\| rc==SQLITE_OK );

	6215

	6216 /* If page one was fetched successfully, and this function is not

	6217 ** operating in direct-mode, make page 1 writable. When not in

	6218 ** direct mode, page 1 is always held in cache and hence the PagerGet()

	6219 ** above is always successful - hence the ALWAYS on rc==SQLITE_OK.

	6220 */

	6221 if( !DIRECT_MODE && ALWAYS(rc==SQLITE_OK) ){

	6222 rc = sqlite3PagerWrite(pPgHdr);

	6223 }

	6224

	6225 if( rc==SQLITE_OK ){

	6226 /* Actually do the update of the change counter */

	6227 pager_write_changecounter(pPgHdr);

	6228

	6229 /* If running in direct mode, write the contents of page 1 to the file. */

	6230 if( DIRECT_MODE ){

	6231 const void *zBuf;

	6232 assert( pPager->dbFileSize>0 );

	6233 CODEC2(pPager, pPgHdr->pData, 1, 6, rc=SQLITE_NOMEM, zBuf);

	6234 if( rc==SQLITE_OK ){

	6235 rc = sqlite3OsWrite(pPager->fd, zBuf, pPager->pageSize, 0);

	6236 pPager->aStat[PAGER_STAT_WRITE]++;

	6237 }

	6238 if( rc==SQLITE_OK ){

	6239 /* Update the pager's copy of the change-counter. Otherwise, the

	6240 ** next time a read transaction is opened the cache will be

	6241 ** flushed (as the change-counter values will not match). */

	6242 const void *pCopy = (const void *)&((const char *)zBuf)[24];

	6243 memcpy(&pPager->dbFileVers, pCopy, sizeof(pPager->dbFileVers));

	6244 pPager->changeCountDone = 1;

	6245 }

	6246 }else{

	6247 pPager->changeCountDone = 1;

	6248 }

	6249 }

	6250

	6251 /* Release the page reference. */

	6252 sqlite3PagerUnref(pPgHdr);

	6253 }

	6254 return rc;

	6255 }

	6256

	6257 /*

	6258 ** Sync the database file to disk. This is a no-op for in-memory databases

	6259 ** or pagers with the Pager.noSync flag set.

	6260 **

	6261 ** If successful, or if called on a pager for which it is a no-op, this

	6262 ** function returns SQLITE_OK. Otherwise, an IO error code is returned.

	6263 */

	6264 SQLITE_PRIVATE int sqlite3PagerSync(Pager *pPager, const char *zMaster){

	6265 int rc = SQLITE_OK;

	6266

	6267 if( isOpen(pPager->fd) ){

	6268 void *pArg = (void*)zMaster;

	6269 rc = sqlite3OsFileControl(pPager->fd, SQLITE_FCNTL_SYNC, pArg);

	6270 if( rc==SQLITE_NOTFOUND ) rc = SQLITE_OK;

	6271 }

	6272 if( rc==SQLITE_OK && !pPager->noSync ){

	6273 assert( !MEMDB );

	6274 rc = sqlite3OsSync(pPager->fd, pPager->syncFlags);

	6275 }

	6276 return rc;

	6277 }

	6278

	6279 /*

	6280 ** This function may only be called while a write-transaction is active in

	6281 ** rollback. If the connection is in WAL mode, this call is a no-op.

	6282 ** Otherwise, if the connection does not already have an EXCLUSIVE lock on

	6283 ** the database file, an attempt is made to obtain one.

	6284 **

	6285 ** If the EXCLUSIVE lock is already held or the attempt to obtain it is

	6286 ** successful, or the connection is in WAL mode, SQLITE_OK is returned.

	6287 ** Otherwise, either SQLITE_BUSY or an SQLITE_IOERR_XXX error code is

	6288 ** returned.

	6289 */

	6290 SQLITE_PRIVATE int sqlite3PagerExclusiveLock(Pager *pPager){

	6291 int rc = pPager->errCode;

	6292 assert( assert_pager_state(pPager) );

	6293 if( rc==SQLITE_OK ){

	6294 assert( pPager->eState==PAGER_WRITER_CACHEMOD

	6295 \|\| pPager->eState==PAGER_WRITER_DBMOD

	6296 \|\| pPager->eState==PAGER_WRITER_LOCKED

	6297 );

	6298 assert( assert_pager_state(pPager) );

	6299 if( 0==pagerUseWal(pPager) ){

	6300 rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);

	6301 }

	6302 }

	6303 return rc;

	6304 }

	6305

	6306 /*

	6307 ** Sync the database file for the pager pPager. zMaster points to the name

	6308 ** of a master journal file that should be written into the individual

	6309 ** journal file. zMaster may be NULL, which is interpreted as no master

	6310 ** journal (a single database transaction).

	6311 **

	6312 ** This routine ensures that:

	6313 **

	6314 ** * The database file change-counter is updated,

	6315 ** * the journal is synced (unless the atomic-write optimization is used),

	6316 ** * all dirty pages are written to the database file,

	6317 ** * the database file is truncated (if required), and

	6318 ** * the database file synced.

	6319 **

	6320 ** The only thing that remains to commit the transaction is to finalize

	6321 ** (delete, truncate or zero the first part of) the journal file (or

	6322 ** delete the master journal file if specified).

	6323 **

	6324 ** Note that if zMaster==NULL, this does not overwrite a previous value

	6325 ** passed to an sqlite3PagerCommitPhaseOne() call.

	6326 **

	6327 ** If the final parameter - noSync - is true, then the database file itself

	6328 ** is not synced. The caller must call sqlite3PagerSync() directly to

	6329 ** sync the database file before calling CommitPhaseTwo() to delete the

	6330 ** journal file in this case.

	6331 */

	6332 SQLITE_PRIVATE int sqlite3PagerCommitPhaseOne(

	6333 Pager *pPager, /* Pager object */

	6334 const char *zMaster, /* If not NULL, the master journal name */

	6335 int noSync /* True to omit the xSync on the db file */

	6336 ){

	6337 int rc = SQLITE_OK; /* Return code */

	6338

	6339 assert( pPager->eState==PAGER_WRITER_LOCKED

	6340 \|\| pPager->eState==PAGER_WRITER_CACHEMOD

	6341 \|\| pPager->eState==PAGER_WRITER_DBMOD

	6342 \|\| pPager->eState==PAGER_ERROR

	6343 );

	6344 assert( assert_pager_state(pPager) );

	6345

	6346 /* If a prior error occurred, report that error again. */

	6347 if( NEVER(pPager->errCode) ) return pPager->errCode;

	6348

	6349 PAGERTRACE(("DATABASE SYNC: File=%s zMaster=%s nSize=%d\n",

	6350 pPager->zFilename, zMaster, pPager->dbSize));

	6351

	6352 /* If no database changes have been made, return early. */

	6353 if( pPager->eState<PAGER_WRITER_CACHEMOD ) return SQLITE_OK;

	6354

	6355 if( MEMDB ){

	6356 /* If this is an in-memory db, or no pages have been written to, or this

	6357 ** function has already been called, it is mostly a no-op. However, any

	6358 ** backup in progress needs to be restarted.

	6359 */

	6360 sqlite3BackupRestart(pPager->pBackup);

	6361 }else{

	6362 if( pagerUseWal(pPager) ){

	6363 PgHdr *pList = sqlite3PcacheDirtyList(pPager->pPCache);

	6364 PgHdr *pPageOne = 0;

	6365 if( pList==0 ){

	6366 /* Must have at least one page for the WAL commit flag.

	6367 ** Ticket [2d1a5c67dfc2363e44f29d9bbd57f] 2011-05-18 */

	6368 rc = sqlite3PagerGet(pPager, 1, &pPageOne, 0);

	6369 pList = pPageOne;

	6370 pList->pDirty = 0;

	6371 }

	6372 assert( rc==SQLITE_OK );

	6373 if( ALWAYS(pList) ){

	6374 rc = pagerWalFrames(pPager, pList, pPager->dbSize, 1);

	6375 }

	6376 sqlite3PagerUnref(pPageOne);

	6377 if( rc==SQLITE_OK ){

	6378 sqlite3PcacheCleanAll(pPager->pPCache);

	6379 }

	6380 }else{

	6381 /* The following block updates the change-counter. Exactly how it

	6382 ** does this depends on whether or not the atomic-update optimization

	6383 ** was enabled at compile time, and if this transaction meets the

	6384 ** runtime criteria to use the operation:

	6385 **

	6386 ** * The file-system supports the atomic-write property for

	6387 ** blocks of size page-size, and

	6388 ** * This commit is not part of a multi-file transaction, and

	6389 ** * Exactly one page has been modified and stored in the journal file.

	6390 **

	6391 ** If the optimization was not enabled at compile time, then the

	6392 ** pager_incr_changecounter() function is called to update the change

	6393 ** counter in 'indirect-mode'. If the optimization is compiled in but

	6394 ** is not applicable to this transaction, call sqlite3JournalCreate()

	6395 ** to make sure the journal file has actually been created, then call

	6396 ** pager_incr_changecounter() to update the change-counter in indirect

	6397 ** mode.

	6398 **

	6399 ** Otherwise, if the optimization is both enabled and applicable,

	6400 ** then call pager_incr_changecounter() to update the change-counter

	6401 ** in 'direct' mode. In this case the journal file will never be

	6402 ** created for this transaction.

	6403 */

	6404 #ifdef SQLITE_ENABLE_ATOMIC_WRITE

	6405 PgHdr *pPg;

	6406 assert( isOpen(pPager->jfd)

	6407 \|\| pPager->journalMode==PAGER_JOURNALMODE_OFF

	6408 \|\| pPager->journalMode==PAGER_JOURNALMODE_WAL

	6409 );

	6410 if( !zMaster && isOpen(pPager->jfd)

	6411 && pPager->journalOff==jrnlBufferSize(pPager)

	6412 && pPager->dbSize>=pPager->dbOrigSize

	6413 && (0==(pPg = sqlite3PcacheDirtyList(pPager->pPCache)) \|\| 0==pPg->pDirty)

	6414 ){

	6415 /* Update the db file change counter via the direct-write method. The

	6416 ** following call will modify the in-memory representation of page 1

	6417 ** to include the updated change counter and then write page 1

	6418 ** directly to the database file. Because of the atomic-write

	6419 ** property of the host file-system, this is safe.

	6420 */

	6421 rc = pager_incr_changecounter(pPager, 1);

	6422 }else{

	6423 rc = sqlite3JournalCreate(pPager->jfd);

	6424 if( rc==SQLITE_OK ){

	6425 rc = pager_incr_changecounter(pPager, 0);

	6426 }

	6427 }

	6428 #else

	6429 rc = pager_incr_changecounter(pPager, 0);

	6430 #endif

	6431 if( rc!=SQLITE_OK ) goto commit_phase_one_exit;

	6432

	6433 /* Write the master journal name into the journal file. If a master

	6434 ** journal file name has already been written to the journal file,

	6435 ** or if zMaster is NULL (no master journal), then this call is a no-op.

	6436 */

	6437 rc = writeMasterJournal(pPager, zMaster);

	6438 if( rc!=SQLITE_OK ) goto commit_phase_one_exit;

	6439

	6440 /* Sync the journal file and write all dirty pages to the database.

	6441 ** If the atomic-update optimization is being used, this sync will not

	6442 ** create the journal file or perform any real IO.

	6443 **

	6444 ** Because the change-counter page was just modified, unless the

	6445 ** atomic-update optimization is used it is almost certain that the

	6446 ** journal requires a sync here. However, in locking_mode=exclusive

	6447 ** on a system under memory pressure it is just possible that this is

	6448 ** not the case. In this case it is likely enough that the redundant

	6449 ** xSync() call will be changed to a no-op by the OS anyhow.

	6450 */

	6451 rc = syncJournal(pPager, 0);

	6452 if( rc!=SQLITE_OK ) goto commit_phase_one_exit;

	6453

	6454 rc = pager_write_pagelist(pPager,sqlite3PcacheDirtyList(pPager->pPCache));

	6455 if( rc!=SQLITE_OK ){

	6456 assert( rc!=SQLITE_IOERR_BLOCKED );

	6457 goto commit_phase_one_exit;

	6458 }

	6459 sqlite3PcacheCleanAll(pPager->pPCache);

	6460

	6461 /* If the file on disk is smaller than the database image, use

	6462 ** pager_truncate to grow the file here. This can happen if the database

	6463 ** image was extended as part of the current transaction and then the

	6464 ** last page in the db image moved to the free-list. In this case the

	6465 ** last page is never written out to disk, leaving the database file

	6466 ** undersized. Fix this now if it is the case. */

	6467 if( pPager->dbSize>pPager->dbFileSize ){

	6468 Pgno nNew = pPager->dbSize - (pPager->dbSize==PAGER_MJ_PGNO(pPager));

	6469 assert( pPager->eState==PAGER_WRITER_DBMOD );

	6470 rc = pager_truncate(pPager, nNew);

	6471 if( rc!=SQLITE_OK ) goto commit_phase_one_exit;

	6472 }

	6473

	6474 /* Finally, sync the database file. */

	6475 if( !noSync ){

	6476 rc = sqlite3PagerSync(pPager, zMaster);

	6477 }

	6478 IOTRACE(("DBSYNC %p\n", pPager))

	6479 }

	6480 }

	6481

	6482 commit_phase_one_exit:

	6483 if( rc==SQLITE_OK && !pagerUseWal(pPager) ){

	6484 pPager->eState = PAGER_WRITER_FINISHED;

	6485 }

	6486 return rc;

	6487 }

	6488

	6489

	6490 /*

	6491 ** When this function is called, the database file has been completely

	6492 ** updated to reflect the changes made by the current transaction and

	6493 ** synced to disk. The journal file still exists in the file-system

	6494 ** though, and if a failure occurs at this point it will eventually

	6495 ** be used as a hot-journal and the current transaction rolled back.

	6496 **

	6497 ** This function finalizes the journal file, either by deleting,

	6498 ** truncating or partially zeroing it, so that it cannot be used

	6499 ** for hot-journal rollback. Once this is done the transaction is

	6500 ** irrevocably committed.

	6501 **

	6502 ** If an error occurs, an IO error code is returned and the pager

	6503 ** moves into the error state. Otherwise, SQLITE_OK is returned.

	6504 */

	6505 SQLITE_PRIVATE int sqlite3PagerCommitPhaseTwo(Pager *pPager){

	6506 int rc = SQLITE_OK; /* Return code */

	6507

	6508 /* This routine should not be called if a prior error has occurred.

	6509 ** But if (due to a coding error elsewhere in the system) it does get

	6510 ** called, just return the same error code without doing anything. */

	6511 if( NEVER(pPager->errCode) ) return pPager->errCode;

	6512

	6513 assert( pPager->eState==PAGER_WRITER_LOCKED

	6514 \|\| pPager->eState==PAGER_WRITER_FINISHED

	6515 \|\| (pagerUseWal(pPager) && pPager->eState==PAGER_WRITER_CACHEMOD)

	6516 );

	6517 assert( assert_pager_state(pPager) );

	6518

	6519 /* An optimization. If the database was not actually modified during

	6520 ** this transaction, the pager is running in exclusive-mode and is

	6521 ** using persistent journals, then this function is a no-op.

	6522 **

	6523 ** The start of the journal file currently contains a single journal

	6524 ** header with the nRec field set to 0. If such a journal is used as

	6525 ** a hot-journal during hot-journal rollback, 0 changes will be made

	6526 ** to the database file. So there is no need to zero the journal

	6527 ** header. Since the pager is in exclusive mode, there is no need

	6528 ** to drop any locks either.

	6529 */

	6530 if( pPager->eState==PAGER_WRITER_LOCKED

	6531 && pPager->exclusiveMode

	6532 && pPager->journalMode==PAGER_JOURNALMODE_PERSIST

	6533 ){

	6534 assert( pPager->journalOff==JOURNAL_HDR_SZ(pPager) \|\| !pPager->journalOff );

	6535 pPager->eState = PAGER_READER;

	6536 return SQLITE_OK;

	6537 }

	6538

	6539 PAGERTRACE(("COMMIT %d\n", PAGERID(pPager)));

	6540 pPager->iDataVersion++;

	6541 rc = pager_end_transaction(pPager, pPager->setMaster, 1);

	6542 return pager_error(pPager, rc);

	6543 }

	6544

	6545 /*

	6546 ** If a write transaction is open, then all changes made within the

	6547 ** transaction are reverted and the current write-transaction is closed.

	6548 ** The pager falls back to PAGER_READER state if successful, or PAGER_ERROR

	6549 ** state if an error occurs.

	6550 **

	6551 ** If the pager is already in PAGER_ERROR state when this function is called,

	6552 ** it returns Pager.errCode immediately. No work is performed in this case.

	6553 **

	6554 ** Otherwise, in rollback mode, this function performs two functions:

	6555 **

	6556 ** 1) It rolls back the journal file, restoring all database file and

	6557 ** in-memory cache pages to the state they were in when the transaction

	6558 ** was opened, and

	6559 **

	6560 ** 2) It finalizes the journal file, so that it is not used for hot

	6561 ** rollback at any point in the future.

	6562 **

	6563 ** Finalization of the journal file (task 2) is only performed if the

	6564 ** rollback is successful.

	6565 **

	6566 ** In WAL mode, all cache-entries containing data modified within the

	6567 ** current transaction are either expelled from the cache or reverted to

	6568 ** their pre-transaction state by re-reading data from the database or

	6569 ** WAL files. The WAL transaction is then closed.

	6570 */

	6571 SQLITE_PRIVATE int sqlite3PagerRollback(Pager *pPager){

	6572 int rc = SQLITE_OK; /* Return code */

	6573 PAGERTRACE(("ROLLBACK %d\n", PAGERID(pPager)));

	6574

	6575 /* PagerRollback() is a no-op if called in READER or OPEN state. If

	6576 ** the pager is already in the ERROR state, the rollback is not

	6577 ** attempted here. Instead, the error code is returned to the caller.

	6578 */

	6579 assert( assert_pager_state(pPager) );

	6580 if( pPager->eState==PAGER_ERROR ) return pPager->errCode;

	6581 if( pPager->eState<=PAGER_READER ) return SQLITE_OK;

	6582

	6583 if( pagerUseWal(pPager) ){

	6584 int rc2;

	6585 rc = sqlite3PagerSavepoint(pPager, SAVEPOINT_ROLLBACK, -1);

	6586 rc2 = pager_end_transaction(pPager, pPager->setMaster, 0);

	6587 if( rc==SQLITE_OK ) rc = rc2;

	6588 }else if( !isOpen(pPager->jfd) \|\| pPager->eState==PAGER_WRITER_LOCKED ){

	6589 int eState = pPager->eState;

	6590 rc = pager_end_transaction(pPager, 0, 0);

	6591 if( !MEMDB && eState>PAGER_WRITER_LOCKED ){

	6592 /* This can happen using journal_mode=off. Move the pager to the error

	6593 ** state to indicate that the contents of the cache may not be trusted.

	6594 ** Any active readers will get SQLITE_ABORT.

	6595 */

	6596 pPager->errCode = SQLITE_ABORT;

	6597 pPager->eState = PAGER_ERROR;

	6598 return rc;

	6599 }

	6600 }else{

	6601 rc = pager_playback(pPager, 0);

	6602 }

	6603

	6604 assert( pPager->eState==PAGER_READER \|\| rc!=SQLITE_OK );

	6605 assert( rc==SQLITE_OK \|\| rc==SQLITE_FULL \|\| rc==SQLITE_CORRUPT

	6606 \|\| rc==SQLITE_NOMEM \|\| (rc&0xFF)==SQLITE_IOERR

	6607 \|\| rc==SQLITE_CANTOPEN

	6608 );

	6609

	6610 /* If an error occurs during a ROLLBACK, we can no longer trust the pager

	6611 ** cache. So call pager_error() on the way out to make any error persistent.

	6612 */

	6613 return pager_error(pPager, rc);

	6614 }

	6615

	6616 /*

	6617 ** Return TRUE if the database file is opened read-only. Return FALSE

	6618 ** if the database is (in theory) writable.

	6619 */

	6620 SQLITE_PRIVATE u8 sqlite3PagerIsreadonly(Pager *pPager){

	6621 return pPager->readOnly;

	6622 }

	6623

	6624 #ifdef SQLITE_DEBUG

	6625 /*

	6626 ** Return the sum of the reference counts for all pages held by pPager.

	6627 */

	6628 SQLITE_PRIVATE int sqlite3PagerRefcount(Pager *pPager){

	6629 return sqlite3PcacheRefCount(pPager->pPCache);

	6630 }

	6631 #endif

	6632

	6633 /*

	6634 ** Return the approximate number of bytes of memory currently

	6635 ** used by the pager and its associated cache.

	6636 */

	6637 SQLITE_PRIVATE int sqlite3PagerMemUsed(Pager *pPager){

	6638 int perPageSize = pPager->pageSize + pPager->nExtra + sizeof(PgHdr)

	6639 + 5*sizeof(void*);

	6640 return perPageSize*sqlite3PcachePagecount(pPager->pPCache)

	6641 + sqlite3MallocSize(pPager)

	6642 + pPager->pageSize;

	6643 }

	6644

	6645 /*

	6646 ** Return the number of references to the specified page.

	6647 */

	6648 SQLITE_PRIVATE int sqlite3PagerPageRefcount(DbPage *pPage){

	6649 return sqlite3PcachePageRefcount(pPage);

	6650 }

	6651

	6652 #ifdef SQLITE_TEST

	6653 /*

	6654 ** This routine is used for testing and analysis only.

	6655 */

	6656 SQLITE_PRIVATE int *sqlite3PagerStats(Pager *pPager){

	6657 static int a[11];

	6658 a[0] = sqlite3PcacheRefCount(pPager->pPCache);

	6659 a[1] = sqlite3PcachePagecount(pPager->pPCache);

	6660 a[2] = sqlite3PcacheGetCachesize(pPager->pPCache);

	6661 a[3] = pPager->eState==PAGER_OPEN ? -1 : (int) pPager->dbSize;

	6662 a[4] = pPager->eState;

	6663 a[5] = pPager->errCode;

	6664 a[6] = pPager->aStat[PAGER_STAT_HIT];

	6665 a[7] = pPager->aStat[PAGER_STAT_MISS];

	6666 a[8] = 0; /* Used to be pPager->nOvfl */

	6667 a[9] = pPager->nRead;

	6668 a[10] = pPager->aStat[PAGER_STAT_WRITE];

	6669 return a;

	6670 }

	6671 #endif

	6672

	6673 /*

	6674 ** Parameter eStat must be either SQLITE_DBSTATUS_CACHE_HIT or

	6675 ** SQLITE_DBSTATUS_CACHE_MISS. Before returning, *pnVal is incremented by the

	6676 ** current cache hit or miss count, according to the value of eStat. If the

	6677 ** reset parameter is non-zero, the cache hit or miss count is zeroed before

	6678 ** returning.

	6679 */

	6680 SQLITE_PRIVATE void sqlite3PagerCacheStat(Pager *pPager, int eStat, int reset, int *pnVal){

	6681

	6682 assert( eStat==SQLITE_DBSTATUS_CACHE_HIT

	6683 \|\| eStat==SQLITE_DBSTATUS_CACHE_MISS

	6684 \|\| eStat==SQLITE_DBSTATUS_CACHE_WRITE

	6685 );

	6686

	6687 assert( SQLITE_DBSTATUS_CACHE_HIT+1==SQLITE_DBSTATUS_CACHE_MISS );

	6688 assert( SQLITE_DBSTATUS_CACHE_HIT+2==SQLITE_DBSTATUS_CACHE_WRITE );

	6689 assert( PAGER_STAT_HIT==0 && PAGER_STAT_MISS==1 && PAGER_STAT_WRITE==2 );

	6690

	6691 *pnVal += pPager->aStat[eStat - SQLITE_DBSTATUS_CACHE_HIT];

	6692 if( reset ){

	6693 pPager->aStat[eStat - SQLITE_DBSTATUS_CACHE_HIT] = 0;

	6694 }

	6695 }

	6696

	6697 /*

	6698 ** Return true if this is an in-memory pager.

	6699 */

	6700 SQLITE_PRIVATE int sqlite3PagerIsMemdb(Pager *pPager){

	6701 return MEMDB;

	6702 }

	6703

	6704 /*

	6705 ** Check that there are at least nSavepoint savepoints open. If there are

	6706 ** currently less than nSavepoints open, then open one or more savepoints

	6707 ** to make up the difference. If the number of savepoints is already

	6708 ** equal to nSavepoint, then this function is a no-op.

	6709 **

	6710 ** If a memory allocation fails, SQLITE_NOMEM is returned. If an error

	6711 ** occurs while opening the sub-journal file, then an IO error code is

	6712 ** returned. Otherwise, SQLITE_OK.

	6713 */

	6714 static SQLITE_NOINLINE int pagerOpenSavepoint(Pager *pPager, int nSavepoint){

	6715 int rc = SQLITE_OK; /* Return code */

	6716 int nCurrent = pPager->nSavepoint; /* Current number of savepoints */

	6717 int ii; /* Iterator variable */

	6718 PagerSavepoint aNew; / New Pager.aSavepoint array */

	6719

	6720 assert( pPager->eState>=PAGER_WRITER_LOCKED );

	6721 assert( assert_pager_state(pPager) );

	6722 assert( nSavepoint>nCurrent && pPager->useJournal );

	6723

	6724 /* Grow the Pager.aSavepoint array using realloc(). Return SQLITE_NOMEM

	6725 ** if the allocation fails. Otherwise, zero the new portion in case a

	6726 ** malloc failure occurs while populating it in the for(...) loop below.

	6727 */

	6728 aNew = (PagerSavepoint *)sqlite3Realloc(

	6729 pPager->aSavepoint, sizeof(PagerSavepoint)*nSavepoint

	6730 );

	6731 if( !aNew ){

	6732 return SQLITE_NOMEM;

	6733 }

	6734 memset(&aNew[nCurrent], 0, (nSavepoint-nCurrent) * sizeof(PagerSavepoint));

	6735 pPager->aSavepoint = aNew;

	6736

	6737 /* Populate the PagerSavepoint structures just allocated. */

	6738 for(ii=nCurrent; ii<nSavepoint; ii++){

	6739 aNew[ii].nOrig = pPager->dbSize;

	6740 if( isOpen(pPager->jfd) && pPager->journalOff>0 ){

	6741 aNew[ii].iOffset = pPager->journalOff;

	6742 }else{

	6743 aNew[ii].iOffset = JOURNAL_HDR_SZ(pPager);

	6744 }

	6745 aNew[ii].iSubRec = pPager->nSubRec;

	6746 aNew[ii].pInSavepoint = sqlite3BitvecCreate(pPager->dbSize);

	6747 if( !aNew[ii].pInSavepoint ){

	6748 return SQLITE_NOMEM;

	6749 }

	6750 if( pagerUseWal(pPager) ){

	6751 sqlite3WalSavepoint(pPager->pWal, aNew[ii].aWalData);

	6752 }

	6753 pPager->nSavepoint = ii+1;

	6754 }

	6755 assert( pPager->nSavepoint==nSavepoint );

	6756 assertTruncateConstraint(pPager);

	6757 return rc;

	6758 }

	6759 SQLITE_PRIVATE int sqlite3PagerOpenSavepoint(Pager *pPager, int nSavepoint){

	6760 assert( pPager->eState>=PAGER_WRITER_LOCKED );

	6761 assert( assert_pager_state(pPager) );

	6762

	6763 if( nSavepoint>pPager->nSavepoint && pPager->useJournal ){

	6764 return pagerOpenSavepoint(pPager, nSavepoint);

	6765 }else{

	6766 return SQLITE_OK;

	6767 }

	6768 }

	6769

	6770

	6771 /*

	6772 ** This function is called to rollback or release (commit) a savepoint.

	6773 ** The savepoint to release or rollback need not be the most recently

	6774 ** created savepoint.

	6775 **

	6776 ** Parameter op is always either SAVEPOINT_ROLLBACK or SAVEPOINT_RELEASE.

	6777 ** If it is SAVEPOINT_RELEASE, then release and destroy the savepoint with

	6778 ** index iSavepoint. If it is SAVEPOINT_ROLLBACK, then rollback all changes

	6779 ** that have occurred since the specified savepoint was created.

	6780 **

	6781 ** The savepoint to rollback or release is identified by parameter

	6782 ** iSavepoint. A value of 0 means to operate on the outermost savepoint

	6783 ** (the first created). A value of (Pager.nSavepoint-1) means operate

	6784 ** on the most recently created savepoint. If iSavepoint is greater than

	6785 ** (Pager.nSavepoint-1), then this function is a no-op.

	6786 **

	6787 ** If a negative value is passed to this function, then the current

	6788 ** transaction is rolled back. This is different to calling

	6789 ** sqlite3PagerRollback() because this function does not terminate

	6790 ** the transaction or unlock the database, it just restores the

	6791 ** contents of the database to its original state.

	6792 **

	6793 ** In any case, all savepoints with an index greater than iSavepoint

	6794 ** are destroyed. If this is a release operation (op==SAVEPOINT_RELEASE),

	6795 ** then savepoint iSavepoint is also destroyed.

	6796 **

	6797 ** This function may return SQLITE_NOMEM if a memory allocation fails,

	6798 ** or an IO error code if an IO error occurs while rolling back a

	6799 ** savepoint. If no errors occur, SQLITE_OK is returned.

	6800 */

	6801 SQLITE_PRIVATE int sqlite3PagerSavepoint(Pager *pPager, int op, int iSavepoint){

	6802 int rc = pPager->errCode; /* Return code */

	6803

	6804 assert( op==SAVEPOINT_RELEASE \|\| op==SAVEPOINT_ROLLBACK );

	6805 assert( iSavepoint>=0 \|\| op==SAVEPOINT_ROLLBACK );

	6806

	6807 if( rc==SQLITE_OK && iSavepoint<pPager->nSavepoint ){

	6808 int ii; /* Iterator variable */

	6809 int nNew; /* Number of remaining savepoints after this op. */

	6810

	6811 /* Figure out how many savepoints will still be active after this

	6812 ** operation. Store this value in nNew. Then free resources associated

	6813 ** with any savepoints that are destroyed by this operation.

	6814 */

	6815 nNew = iSavepoint + (( op==SAVEPOINT_RELEASE ) ? 0 : 1);

	6816 for(ii=nNew; ii<pPager->nSavepoint; ii++){

	6817 sqlite3BitvecDestroy(pPager->aSavepoint[ii].pInSavepoint);

	6818 }

	6819 pPager->nSavepoint = nNew;

	6820

	6821 /* If this is a release of the outermost savepoint, truncate

	6822 ** the sub-journal to zero bytes in size. */

	6823 if( op==SAVEPOINT_RELEASE ){

	6824 if( nNew==0 && isOpen(pPager->sjfd) ){

	6825 /* Only truncate if it is an in-memory sub-journal. */

	6826 if( sqlite3IsMemJournal(pPager->sjfd) ){

	6827 rc = sqlite3OsTruncate(pPager->sjfd, 0);

	6828 assert( rc==SQLITE_OK );

	6829 }

	6830 pPager->nSubRec = 0;

	6831 }

	6832 }

	6833 /* Else this is a rollback operation, playback the specified savepoint.

	6834 ** If this is a temp-file, it is possible that the journal file has

	6835 ** not yet been opened. In this case there have been no changes to

	6836 ** the database file, so the playback operation can be skipped.

	6837 */

	6838 else if( pagerUseWal(pPager) \|\| isOpen(pPager->jfd) ){

	6839 PagerSavepoint *pSavepoint = (nNew==0)?0:&pPager->aSavepoint[nNew-1];

	6840 rc = pagerPlaybackSavepoint(pPager, pSavepoint);

	6841 assert(rc!=SQLITE_DONE);

	6842 }

	6843 }

	6844

	6845 return rc;

	6846 }

	6847

	6848 /*

	6849 ** Return the full pathname of the database file.

	6850 **

	6851 ** Except, if the pager is in-memory only, then return an empty string if

	6852 ** nullIfMemDb is true. This routine is called with nullIfMemDb==1 when

	6853 ** used to report the filename to the user, for compatibility with legacy

	6854 ** behavior. But when the Btree needs to know the filename for matching to

	6855 ** shared cache, it uses nullIfMemDb==0 so that in-memory databases can

	6856 ** participate in shared-cache.

	6857 */

	6858 SQLITE_PRIVATE const char sqlite3PagerFilename(Pager pPager, int nullIfMemDb){

	6859 return (nullIfMemDb && pPager->memDb) ? "" : pPager->zFilename;

	6860 }

	6861

	6862 /*

	6863 ** Return the VFS structure for the pager.

	6864 */

	6865 SQLITE_PRIVATE sqlite3_vfs sqlite3PagerVfs(Pager pPager){

	6866 return pPager->pVfs;

	6867 }

	6868

	6869 /*

	6870 ** Return the file handle for the database file associated

	6871 ** with the pager. This might return NULL if the file has

	6872 ** not yet been opened.

	6873 */

	6874 SQLITE_PRIVATE sqlite3_file sqlite3PagerFile(Pager pPager){

	6875 return pPager->fd;

	6876 }

	6877

	6878 /*

	6879 ** Return the file handle for the journal file (if it exists).

	6880 ** This will be either the rollback journal or the WAL file.

	6881 */

	6882 SQLITE_PRIVATE sqlite3_file sqlite3PagerJrnlFile(Pager pPager){

	6883 #if SQLITE_OMIT_WAL

	6884 return pPager->jfd;

	6885 #else

	6886 return pPager->pWal ? sqlite3WalFile(pPager->pWal) : pPager->jfd;

	6887 #endif

	6888 }

	6889

	6890 /*

	6891 ** Return the full pathname of the journal file.

	6892 */

	6893 SQLITE_PRIVATE const char sqlite3PagerJournalname(Pager pPager){

	6894 return pPager->zJournal;

	6895 }

	6896

	6897 /*

	6898 ** Return true if fsync() calls are disabled for this pager. Return FALSE

	6899 ** if fsync()s are executed normally.

	6900 */

	6901 SQLITE_PRIVATE int sqlite3PagerNosync(Pager *pPager){

	6902 return pPager->noSync;

	6903 }

	6904

	6905 #ifdef SQLITE_HAS_CODEC

	6906 /*

	6907 ** Set or retrieve the codec for this pager

	6908 */

	6909 SQLITE_PRIVATE void sqlite3PagerSetCodec(

	6910 Pager *pPager,

	6911 void (xCodec)(void,void,Pgno,int),

	6912 void (xCodecSizeChng)(void,int,int),

	6913 void (xCodecFree)(void),

	6914 void *pCodec

	6915 ){

	6916 if( pPager->xCodecFree ) pPager->xCodecFree(pPager->pCodec);

	6917 pPager->xCodec = pPager->memDb ? 0 : xCodec;

	6918 pPager->xCodecSizeChng = xCodecSizeChng;

	6919 pPager->xCodecFree = xCodecFree;

	6920 pPager->pCodec = pCodec;

	6921 pagerReportSize(pPager);

	6922 }

	6923 SQLITE_PRIVATE void sqlite3PagerGetCodec(Pager pPager){

	6924 return pPager->pCodec;

	6925 }

	6926

	6927 /*

	6928 ** This function is called by the wal module when writing page content

	6929 ** into the log file.

	6930 **

	6931 ** This function returns a pointer to a buffer containing the encrypted

	6932 ** page content. If a malloc fails, this function may return NULL.

	6933 */

	6934 SQLITE_PRIVATE void sqlite3PagerCodec(PgHdr pPg){

	6935 void *aData = 0;

	6936 CODEC2(pPg->pPager, pPg->pData, pPg->pgno, 6, return 0, aData);

	6937 return aData;

	6938 }

	6939

	6940 /*

	6941 ** Return the current pager state

	6942 */

	6943 SQLITE_PRIVATE int sqlite3PagerState(Pager *pPager){

	6944 return pPager->eState;

	6945 }

	6946 #endif /* SQLITE_HAS_CODEC */

	6947

	6948 #ifndef SQLITE_OMIT_AUTOVACUUM

	6949 /*

	6950 ** Move the page pPg to location pgno in the file.

	6951 **

	6952 ** There must be no references to the page previously located at

	6953 ** pgno (which we call pPgOld) though that page is allowed to be

	6954 ** in cache. If the page previously located at pgno is not already

	6955 ** in the rollback journal, it is not put there by by this routine.

	6956 **

	6957 ** References to the page pPg remain valid. Updating any

	6958 ** meta-data associated with pPg (i.e. data stored in the nExtra bytes

	6959 ** allocated along with the page) is the responsibility of the caller.

	6960 **

	6961 ** A transaction must be active when this routine is called. It used to be

	6962 ** required that a statement transaction was not active, but this restriction

	6963 ** has been removed (CREATE INDEX needs to move a page when a statement

	6964 ** transaction is active).

	6965 **

	6966 ** If the fourth argument, isCommit, is non-zero, then this page is being

	6967 ** moved as part of a database reorganization just before the transaction

	6968 ** is being committed. In this case, it is guaranteed that the database page

	6969 ** pPg refers to will not be written to again within this transaction.

	6970 **

	6971 ** This function may return SQLITE_NOMEM or an IO error code if an error

	6972 ** occurs. Otherwise, it returns SQLITE_OK.

	6973 */

	6974 SQLITE_PRIVATE int sqlite3PagerMovepage(Pager pPager, DbPage pPg, Pgno pgno, i nt isCommit){

	6975 PgHdr pPgOld; / The page being overwritten. */

	6976 Pgno needSyncPgno = 0; /* Old value of pPg->pgno, if sync is required */

	6977 int rc; /* Return code */

	6978 Pgno origPgno; /* The original page number */

	6979

	6980 assert( pPg->nRef>0 );

	6981 assert( pPager->eState==PAGER_WRITER_CACHEMOD

	6982 \|\| pPager->eState==PAGER_WRITER_DBMOD

	6983 );

	6984 assert( assert_pager_state(pPager) );

	6985

	6986 /* In order to be able to rollback, an in-memory database must journal

	6987 ** the page we are moving from.

	6988 */

	6989 if( MEMDB ){

	6990 rc = sqlite3PagerWrite(pPg);

	6991 if( rc ) return rc;

	6992 }

	6993

	6994 /* If the page being moved is dirty and has not been saved by the latest

	6995 ** savepoint, then save the current contents of the page into the

	6996 ** sub-journal now. This is required to handle the following scenario:

	6997 **

	6998 ** BEGIN;

	6999 ** <journal page X, then modify it in memory>

	7000 ** SAVEPOINT one;

	7001 ** <Move page X to location Y>

	7002 ** ROLLBACK TO one;

	7003 **

	7004 ** If page X were not written to the sub-journal here, it would not

	7005 ** be possible to restore its contents when the "ROLLBACK TO one"

	7006 ** statement were is processed.

	7007 **

	7008 ** subjournalPage() may need to allocate space to store pPg->pgno into

	7009 ** one or more savepoint bitvecs. This is the reason this function

	7010 ** may return SQLITE_NOMEM.

	7011 */

	7012 if( (pPg->flags & PGHDR_DIRTY)!=0

	7013 && SQLITE_OK!=(rc = subjournalPageIfRequired(pPg))

	7014 ){

	7015 return rc;

	7016 }

	7017

	7018 PAGERTRACE(("MOVE %d page %d (needSync=%d) moves to %d\n",

	7019 PAGERID(pPager), pPg->pgno, (pPg->flags&PGHDR_NEED_SYNC)?1:0, pgno));

	7020 IOTRACE(("MOVE %p %d %d\n", pPager, pPg->pgno, pgno))

	7021

	7022 /* If the journal needs to be sync()ed before page pPg->pgno can

	7023 ** be written to, store pPg->pgno in local variable needSyncPgno.

	7024 **

	7025 ** If the isCommit flag is set, there is no need to remember that

	7026 ** the journal needs to be sync()ed before database page pPg->pgno

	7027 ** can be written to. The caller has already promised not to write to it.

	7028 */

	7029 if( (pPg->flags&PGHDR_NEED_SYNC) && !isCommit ){

	7030 needSyncPgno = pPg->pgno;

	7031 assert( pPager->journalMode==PAGER_JOURNALMODE_OFF \|\|

	7032 pageInJournal(pPager, pPg) \|\| pPg->pgno>pPager->dbOrigSize );

	7033 assert( pPg->flags&PGHDR_DIRTY );

	7034 }

	7035

	7036 /* If the cache contains a page with page-number pgno, remove it

	7037 ** from its hash chain. Also, if the PGHDR_NEED_SYNC flag was set for

	7038 ** page pgno before the 'move' operation, it needs to be retained

	7039 ** for the page moved there.

	7040 */

	7041 pPg->flags &= ~PGHDR_NEED_SYNC;

	7042 pPgOld = sqlite3PagerLookup(pPager, pgno);

	7043 assert( !pPgOld \|\| pPgOld->nRef==1 );

	7044 if( pPgOld ){

	7045 pPg->flags \|= (pPgOld->flags&PGHDR_NEED_SYNC);

	7046 if( MEMDB ){

	7047 /* Do not discard pages from an in-memory database since we might

	7048 ** need to rollback later. Just move the page out of the way. */

	7049 sqlite3PcacheMove(pPgOld, pPager->dbSize+1);

	7050 }else{

	7051 sqlite3PcacheDrop(pPgOld);

	7052 }

	7053 }

	7054

	7055 origPgno = pPg->pgno;

	7056 sqlite3PcacheMove(pPg, pgno);

	7057 sqlite3PcacheMakeDirty(pPg);

	7058

	7059 /* For an in-memory database, make sure the original page continues

	7060 ** to exist, in case the transaction needs to roll back. Use pPgOld

	7061 ** as the original page since it has already been allocated.

	7062 */

	7063 if( MEMDB ){

	7064 assert( pPgOld );

	7065 sqlite3PcacheMove(pPgOld, origPgno);

	7066 sqlite3PagerUnrefNotNull(pPgOld);

	7067 }

	7068

	7069 if( needSyncPgno ){

	7070 /* If needSyncPgno is non-zero, then the journal file needs to be

	7071 ** sync()ed before any data is written to database file page needSyncPgno.

	7072 ** Currently, no such page exists in the page-cache and the

	7073 ** "is journaled" bitvec flag has been set. This needs to be remedied by

	7074 ** loading the page into the pager-cache and setting the PGHDR_NEED_SYNC

	7075 ** flag.

	7076 **

	7077 ** If the attempt to load the page into the page-cache fails, (due

	7078 ** to a malloc() or IO failure), clear the bit in the pInJournal[]

	7079 ** array. Otherwise, if the page is loaded and written again in

	7080 ** this transaction, it may be written to the database file before

	7081 ** it is synced into the journal file. This way, it may end up in

	7082 ** the journal file twice, but that is not a problem.

	7083 */

	7084 PgHdr *pPgHdr;

	7085 rc = sqlite3PagerGet(pPager, needSyncPgno, &pPgHdr, 0);

	7086 if( rc!=SQLITE_OK ){

	7087 if( needSyncPgno<=pPager->dbOrigSize ){

	7088 assert( pPager->pTmpSpace!=0 );

	7089 sqlite3BitvecClear(pPager->pInJournal, needSyncPgno, pPager->pTmpSpace);

	7090 }

	7091 return rc;

	7092 }

	7093 pPgHdr->flags \|= PGHDR_NEED_SYNC;

	7094 sqlite3PcacheMakeDirty(pPgHdr);

	7095 sqlite3PagerUnrefNotNull(pPgHdr);

	7096 }

	7097

	7098 return SQLITE_OK;

	7099 }

	7100 #endif

	7101

	7102 /*

	7103 ** The page handle passed as the first argument refers to a dirty page

	7104 ** with a page number other than iNew. This function changes the page's

	7105 ** page number to iNew and sets the value of the PgHdr.flags field to

	7106 ** the value passed as the third parameter.

	7107 */

	7108 SQLITE_PRIVATE void sqlite3PagerRekey(DbPage *pPg, Pgno iNew, u16 flags){

	7109 assert( pPg->pgno!=iNew );

	7110 pPg->flags = flags;

	7111 sqlite3PcacheMove(pPg, iNew);

	7112 }

	7113

	7114 /*

	7115 ** Return a pointer to the data for the specified page.

	7116 */

	7117 SQLITE_PRIVATE void sqlite3PagerGetData(DbPage pPg){

	7118 assert( pPg->nRef>0 \|\| pPg->pPager->memDb );

	7119 return pPg->pData;

	7120 }

	7121

	7122 /*

	7123 ** Return a pointer to the Pager.nExtra bytes of "extra" space

	7124 ** allocated along with the specified page.

	7125 */

	7126 SQLITE_PRIVATE void sqlite3PagerGetExtra(DbPage pPg){

	7127 return pPg->pExtra;

	7128 }

	7129

	7130 /*

	7131 ** Get/set the locking-mode for this pager. Parameter eMode must be one

	7132 ** of PAGER_LOCKINGMODE_QUERY, PAGER_LOCKINGMODE_NORMAL or

	7133 ** PAGER_LOCKINGMODE_EXCLUSIVE. If the parameter is not _QUERY, then

	7134 ** the locking-mode is set to the value specified.

	7135 **

	7136 ** The returned value is either PAGER_LOCKINGMODE_NORMAL or

	7137 ** PAGER_LOCKINGMODE_EXCLUSIVE, indicating the current (possibly updated)

	7138 ** locking-mode.

	7139 */

	7140 SQLITE_PRIVATE int sqlite3PagerLockingMode(Pager *pPager, int eMode){

	7141 assert( eMode==PAGER_LOCKINGMODE_QUERY

	7142 \|\| eMode==PAGER_LOCKINGMODE_NORMAL

	7143 \|\| eMode==PAGER_LOCKINGMODE_EXCLUSIVE );

	7144 assert( PAGER_LOCKINGMODE_QUERY<0 );

	7145 assert( PAGER_LOCKINGMODE_NORMAL>=0 && PAGER_LOCKINGMODE_EXCLUSIVE>=0 );

	7146 assert( pPager->exclusiveMode \|\| 0==sqlite3WalHeapMemory(pPager->pWal) );

	7147 if( eMode>=0 && !pPager->tempFile && !sqlite3WalHeapMemory(pPager->pWal) ){

	7148 pPager->exclusiveMode = (u8)eMode;

	7149 }

	7150 return (int)pPager->exclusiveMode;

	7151 }

	7152

	7153 /*

	7154 ** Set the journal-mode for this pager. Parameter eMode must be one of:

	7155 **

	7156 ** PAGER_JOURNALMODE_DELETE

	7157 ** PAGER_JOURNALMODE_TRUNCATE

	7158 ** PAGER_JOURNALMODE_PERSIST

	7159 ** PAGER_JOURNALMODE_OFF

	7160 ** PAGER_JOURNALMODE_MEMORY

	7161 ** PAGER_JOURNALMODE_WAL

	7162 **

	7163 ** The journalmode is set to the value specified if the change is allowed.

	7164 ** The change may be disallowed for the following reasons:

	7165 **

	7166 ** * An in-memory database can only have its journal_mode set to _OFF

	7167 ** or _MEMORY.

	7168 **

	7169 ** * Temporary databases cannot have _WAL journalmode.

	7170 **

	7171 ** The returned indicate the current (possibly updated) journal-mode.

	7172 */

	7173 SQLITE_PRIVATE int sqlite3PagerSetJournalMode(Pager *pPager, int eMode){

	7174 u8 eOld = pPager->journalMode; /* Prior journalmode */

	7175

	7176 #ifdef SQLITE_DEBUG

	7177 /* The print_pager_state() routine is intended to be used by the debugger

	7178 ** only. We invoke it once here to suppress a compiler warning. */

	7179 print_pager_state(pPager);

	7180 #endif

	7181

	7182

	7183 /* The eMode parameter is always valid */

	7184 assert( eMode==PAGER_JOURNALMODE_DELETE

	7185 \|\| eMode==PAGER_JOURNALMODE_TRUNCATE

	7186 \|\| eMode==PAGER_JOURNALMODE_PERSIST

	7187 \|\| eMode==PAGER_JOURNALMODE_OFF

	7188 \|\| eMode==PAGER_JOURNALMODE_WAL

	7189 \|\| eMode==PAGER_JOURNALMODE_MEMORY );

	7190

	7191 /* This routine is only called from the OP_JournalMode opcode, and

	7192 ** the logic there will never allow a temporary file to be changed

	7193 ** to WAL mode.

	7194 */

	7195 assert( pPager->tempFile==0 \|\| eMode!=PAGER_JOURNALMODE_WAL );

	7196

	7197 /* Do allow the journalmode of an in-memory database to be set to

	7198 ** anything other than MEMORY or OFF

	7199 */

	7200 if( MEMDB ){

	7201 assert( eOld==PAGER_JOURNALMODE_MEMORY \|\| eOld==PAGER_JOURNALMODE_OFF );

	7202 if( eMode!=PAGER_JOURNALMODE_MEMORY && eMode!=PAGER_JOURNALMODE_OFF ){

	7203 eMode = eOld;

	7204 }

	7205 }

	7206

	7207 if( eMode!=eOld ){

	7208

	7209 /* Change the journal mode. */

	7210 assert( pPager->eState!=PAGER_ERROR );

	7211 pPager->journalMode = (u8)eMode;

	7212

	7213 /* When transistioning from TRUNCATE or PERSIST to any other journal

	7214 ** mode except WAL, unless the pager is in locking_mode=exclusive mode,

	7215 ** delete the journal file.

	7216 */

	7217 assert( (PAGER_JOURNALMODE_TRUNCATE & 5)==1 );

	7218 assert( (PAGER_JOURNALMODE_PERSIST & 5)==1 );

	7219 assert( (PAGER_JOURNALMODE_DELETE & 5)==0 );

	7220 assert( (PAGER_JOURNALMODE_MEMORY & 5)==4 );

	7221 assert( (PAGER_JOURNALMODE_OFF & 5)==0 );

	7222 assert( (PAGER_JOURNALMODE_WAL & 5)==5 );

	7223

	7224 assert( isOpen(pPager->fd) \|\| pPager->exclusiveMode );

	7225 if( !pPager->exclusiveMode && (eOld & 5)==1 && (eMode & 1)==0 ){

	7226

	7227 /* In this case we would like to delete the journal file. If it is

	7228 ** not possible, then that is not a problem. Deleting the journal file

	7229 ** here is an optimization only.

	7230 **

	7231 ** Before deleting the journal file, obtain a RESERVED lock on the

	7232 ** database file. This ensures that the journal file is not deleted

	7233 ** while it is in use by some other client.

	7234 */

	7235 sqlite3OsClose(pPager->jfd);

	7236 if( pPager->eLock>=RESERVED_LOCK ){

	7237 sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0);

	7238 }else{

	7239 int rc = SQLITE_OK;

	7240 int state = pPager->eState;

	7241 assert( state==PAGER_OPEN \|\| state==PAGER_READER );

	7242 if( state==PAGER_OPEN ){

	7243 rc = sqlite3PagerSharedLock(pPager);

	7244 }

	7245 if( pPager->eState==PAGER_READER ){

	7246 assert( rc==SQLITE_OK );

	7247 rc = pagerLockDb(pPager, RESERVED_LOCK);

	7248 }

	7249 if( rc==SQLITE_OK ){

	7250 sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0);

	7251 }

	7252 if( rc==SQLITE_OK && state==PAGER_READER ){

	7253 pagerUnlockDb(pPager, SHARED_LOCK);

	7254 }else if( state==PAGER_OPEN ){

	7255 pager_unlock(pPager);

	7256 }

	7257 assert( state==pPager->eState );

	7258 }

	7259 }else if( eMode==PAGER_JOURNALMODE_OFF ){

	7260 sqlite3OsClose(pPager->jfd);

	7261 }

	7262 }

	7263

	7264 /* Return the new journal mode */

	7265 return (int)pPager->journalMode;

	7266 }

	7267

	7268 /*

	7269 ** Return the current journal mode.

	7270 */

	7271 SQLITE_PRIVATE int sqlite3PagerGetJournalMode(Pager *pPager){

	7272 return (int)pPager->journalMode;

	7273 }

	7274

	7275 /*

	7276 ** Return TRUE if the pager is in a state where it is OK to change the

	7277 ** journalmode. Journalmode changes can only happen when the database

	7278 ** is unmodified.

	7279 */

	7280 SQLITE_PRIVATE int sqlite3PagerOkToChangeJournalMode(Pager *pPager){

	7281 assert( assert_pager_state(pPager) );

	7282 if( pPager->eState>=PAGER_WRITER_CACHEMOD ) return 0;

	7283 if( NEVER(isOpen(pPager->jfd) && pPager->journalOff>0) ) return 0;

	7284 return 1;

	7285 }

	7286

	7287 /*

	7288 ** Get/set the size-limit used for persistent journal files.

	7289 **

	7290 ** Setting the size limit to -1 means no limit is enforced.

	7291 ** An attempt to set a limit smaller than -1 is a no-op.

	7292 */

	7293 SQLITE_PRIVATE i64 sqlite3PagerJournalSizeLimit(Pager *pPager, i64 iLimit){

	7294 if( iLimit>=-1 ){

	7295 pPager->journalSizeLimit = iLimit;

	7296 sqlite3WalLimit(pPager->pWal, iLimit);

	7297 }

	7298 return pPager->journalSizeLimit;

	7299 }

	7300

	7301 /*

	7302 ** Return a pointer to the pPager->pBackup variable. The backup module

	7303 ** in backup.c maintains the content of this variable. This module

	7304 ** uses it opaquely as an argument to sqlite3BackupRestart() and

	7305 ** sqlite3BackupUpdate() only.

	7306 */

	7307 SQLITE_PRIVATE sqlite3_backup *sqlite3PagerBackupPtr(Pager pPager){

	7308 return &pPager->pBackup;

	7309 }

	7310

	7311 #ifndef SQLITE_OMIT_VACUUM

	7312 /*

	7313 ** Unless this is an in-memory or temporary database, clear the pager cache.

	7314 */

	7315 SQLITE_PRIVATE void sqlite3PagerClearCache(Pager *pPager){

	7316 if( !MEMDB && pPager->tempFile==0 ) pager_reset(pPager);

	7317 }

	7318 #endif

	7319

	7320 #ifndef SQLITE_OMIT_WAL

	7321 /*

	7322 ** This function is called when the user invokes "PRAGMA wal_checkpoint",

	7323 ** "PRAGMA wal_blocking_checkpoint" or calls the sqlite3_wal_checkpoint()

	7324 ** or wal_blocking_checkpoint() API functions.

	7325 **

	7326 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.

	7327 */

	7328 SQLITE_PRIVATE int sqlite3PagerCheckpoint(Pager pPager, int eMode, int pnLog, int *pnCkpt){

	7329 int rc = SQLITE_OK;

	7330 if( pPager->pWal ){

	7331 rc = sqlite3WalCheckpoint(pPager->pWal, eMode,

	7332 (eMode==SQLITE_CHECKPOINT_PASSIVE ? 0 : pPager->xBusyHandler),

	7333 pPager->pBusyHandlerArg,

	7334 pPager->ckptSyncFlags, pPager->pageSize, (u8 *)pPager->pTmpSpace,

	7335 pnLog, pnCkpt

	7336 );

	7337 }

	7338 return rc;

	7339 }

	7340

	7341 SQLITE_PRIVATE int sqlite3PagerWalCallback(Pager *pPager){

	7342 return sqlite3WalCallback(pPager->pWal);

	7343 }

	7344

	7345 /*

	7346 ** Return true if the underlying VFS for the given pager supports the

	7347 ** primitives necessary for write-ahead logging.

	7348 */

	7349 SQLITE_PRIVATE int sqlite3PagerWalSupported(Pager *pPager){

	7350 const sqlite3_io_methods *pMethods = pPager->fd->pMethods;

	7351 return pPager->exclusiveMode \|\| (pMethods->iVersion>=2 && pMethods->xShmMap);

	7352 }

	7353

	7354 /*

	7355 ** Attempt to take an exclusive lock on the database file. If a PENDING lock

	7356 ** is obtained instead, immediately release it.

	7357 */

	7358 static int pagerExclusiveLock(Pager *pPager){

	7359 int rc; /* Return code */

	7360

	7361 assert( pPager->eLock==SHARED_LOCK \|\| pPager->eLock==EXCLUSIVE_LOCK );

	7362 rc = pagerLockDb(pPager, EXCLUSIVE_LOCK);

	7363 if( rc!=SQLITE_OK ){

	7364 /* If the attempt to grab the exclusive lock failed, release the

	7365 ** pending lock that may have been obtained instead. */

	7366 pagerUnlockDb(pPager, SHARED_LOCK);

	7367 }

	7368

	7369 return rc;

	7370 }

	7371

	7372 /*

	7373 ** Call sqlite3WalOpen() to open the WAL handle. If the pager is in

	7374 ** exclusive-locking mode when this function is called, take an EXCLUSIVE

	7375 ** lock on the database file and use heap-memory to store the wal-index

	7376 ** in. Otherwise, use the normal shared-memory.

	7377 */

	7378 static int pagerOpenWal(Pager *pPager){

	7379 int rc = SQLITE_OK;

	7380

	7381 assert( pPager->pWal==0 && pPager->tempFile==0 );

	7382 assert( pPager->eLock==SHARED_LOCK \|\| pPager->eLock==EXCLUSIVE_LOCK );

	7383

	7384 /* If the pager is already in exclusive-mode, the WAL module will use

	7385 ** heap-memory for the wal-index instead of the VFS shared-memory

	7386 ** implementation. Take the exclusive lock now, before opening the WAL

	7387 ** file, to make sure this is safe.

	7388 */

	7389 if( pPager->exclusiveMode ){

	7390 rc = pagerExclusiveLock(pPager);

	7391 }

	7392

	7393 /* Open the connection to the log file. If this operation fails,

	7394 ** (e.g. due to malloc() failure), return an error code.

	7395 */

	7396 if( rc==SQLITE_OK ){

	7397 rc = sqlite3WalOpen(pPager->pVfs,

	7398 pPager->fd, pPager->zWal, pPager->exclusiveMode,

	7399 pPager->journalSizeLimit, &pPager->pWal

	7400 );

	7401 }

	7402 pagerFixMaplimit(pPager);

	7403

	7404 return rc;

	7405 }

	7406

	7407

	7408 /*

	7409 ** The caller must be holding a SHARED lock on the database file to call

	7410 ** this function.

	7411 **

	7412 ** If the pager passed as the first argument is open on a real database

	7413 ** file (not a temp file or an in-memory database), and the WAL file

	7414 ** is not already open, make an attempt to open it now. If successful,

	7415 ** return SQLITE_OK. If an error occurs or the VFS used by the pager does

	7416 ** not support the xShmXXX() methods, return an error code. *pbOpen is

	7417 ** not modified in either case.

	7418 **

	7419 ** If the pager is open on a temp-file (or in-memory database), or if

	7420 ** the WAL file is already open, set *pbOpen to 1 and return SQLITE_OK

	7421 ** without doing anything.

	7422 */

	7423 SQLITE_PRIVATE int sqlite3PagerOpenWal(

	7424 Pager pPager, / Pager object */

	7425 int pbOpen / OUT: Set to true if call is a no-op */

	7426 ){

	7427 int rc = SQLITE_OK; /* Return code */

	7428

	7429 assert( assert_pager_state(pPager) );

	7430 assert( pPager->eState==PAGER_OPEN \|\| pbOpen );

	7431 assert( pPager->eState==PAGER_READER \|\| !pbOpen );

	7432 assert( pbOpen==0 \|\| *pbOpen==0 );

	7433 assert( pbOpen!=0 \|\| (!pPager->tempFile && !pPager->pWal) );

	7434

	7435 if( !pPager->tempFile && !pPager->pWal ){

	7436 if( !sqlite3PagerWalSupported(pPager) ) return SQLITE_CANTOPEN;

	7437

	7438 /* Close any rollback journal previously open */

	7439 sqlite3OsClose(pPager->jfd);

	7440

	7441 rc = pagerOpenWal(pPager);

	7442 if( rc==SQLITE_OK ){

	7443 pPager->journalMode = PAGER_JOURNALMODE_WAL;

	7444 pPager->eState = PAGER_OPEN;

	7445 }

	7446 }else{

	7447 *pbOpen = 1;

	7448 }

	7449

	7450 return rc;

	7451 }

	7452

	7453 /*

	7454 ** This function is called to close the connection to the log file prior

	7455 ** to switching from WAL to rollback mode.

	7456 **

	7457 ** Before closing the log file, this function attempts to take an

	7458 ** EXCLUSIVE lock on the database file. If this cannot be obtained, an

	7459 ** error (SQLITE_BUSY) is returned and the log connection is not closed.

	7460 ** If successful, the EXCLUSIVE lock is not released before returning.

	7461 */

	7462 SQLITE_PRIVATE int sqlite3PagerCloseWal(Pager *pPager){

	7463 int rc = SQLITE_OK;

	7464

	7465 assert( pPager->journalMode==PAGER_JOURNALMODE_WAL );

	7466

	7467 /* If the log file is not already open, but does exist in the file-system,

	7468 ** it may need to be checkpointed before the connection can switch to

	7469 ** rollback mode. Open it now so this can happen.

	7470 */

	7471 if( !pPager->pWal ){

	7472 int logexists = 0;

	7473 rc = pagerLockDb(pPager, SHARED_LOCK);

	7474 if( rc==SQLITE_OK ){

	7475 rc = sqlite3OsAccess(

	7476 pPager->pVfs, pPager->zWal, SQLITE_ACCESS_EXISTS, &logexists

	7477 );

	7478 }

	7479 if( rc==SQLITE_OK && logexists ){

	7480 rc = pagerOpenWal(pPager);

	7481 }

	7482 }

	7483

	7484 /* Checkpoint and close the log. Because an EXCLUSIVE lock is held on

	7485 ** the database file, the log and log-summary files will be deleted.

	7486 */

	7487 if( rc==SQLITE_OK && pPager->pWal ){

	7488 rc = pagerExclusiveLock(pPager);

	7489 if( rc==SQLITE_OK ){

	7490 rc = sqlite3WalClose(pPager->pWal, pPager->ckptSyncFlags,

	7491 pPager->pageSize, (u8*)pPager->pTmpSpace);

	7492 pPager->pWal = 0;

	7493 pagerFixMaplimit(pPager);

	7494 }

	7495 }

	7496 return rc;

	7497 }

	7498

	7499 #ifdef SQLITE_ENABLE_SNAPSHOT

	7500 /*

	7501 ** If this is a WAL database, obtain a snapshot handle for the snapshot

	7502 ** currently open. Otherwise, return an error.

	7503 */

	7504 SQLITE_PRIVATE int sqlite3PagerSnapshotGet(Pager *pPager, sqlite3_snapshot **ppSnapshot){

	7505 int rc = SQLITE_ERROR;

	7506 if( pPager->pWal ){

	7507 rc = sqlite3WalSnapshotGet(pPager->pWal, ppSnapshot);

	7508 }

	7509 return rc;

	7510 }

	7511

	7512 /*

	7513 ** If this is a WAL database, store a pointer to pSnapshot. Next time a

	7514 ** read transaction is opened, attempt to read from the snapshot it

	7515 ** identifies. If this is not a WAL database, return an error.

	7516 */

	7517 SQLITE_PRIVATE int sqlite3PagerSnapshotOpen(Pager *pPager, sqlite3_snapshot *pSnapshot){

	7518 int rc = SQLITE_OK;

	7519 if( pPager->pWal ){

	7520 sqlite3WalSnapshotOpen(pPager->pWal, pSnapshot);

	7521 }else{

	7522 rc = SQLITE_ERROR;

	7523 }

	7524 return rc;

	7525 }

	7526 #endif /* SQLITE_ENABLE_SNAPSHOT */

	7527 #endif /* !SQLITE_OMIT_WAL */

	7528

	7529 #ifdef SQLITE_ENABLE_ZIPVFS

	7530 /*

	7531 ** A read-lock must be held on the pager when this function is called. If

	7532 ** the pager is in WAL mode and the WAL file currently contains one or more

	7533 ** frames, return the size in bytes of the page images stored within the

	7534 ** WAL frames. Otherwise, if this is not a WAL database or the WAL file

	7535 ** is empty, return 0.

	7536 */

	7537 SQLITE_PRIVATE int sqlite3PagerWalFramesize(Pager *pPager){

	7538 assert( pPager->eState>=PAGER_READER );

	7539 return sqlite3WalFramesize(pPager->pWal);

	7540 }

	7541 #endif

	7542

	7543

	7544 #endif /* SQLITE_OMIT_DISKIO */

	7545

	7546 /************ End of pager.c *********************************************/

	7547 /************ Begin file wal.c *******************************************/

	7548 /*

	7549 ** 2010 February 1

	7550 **

	7551 ** The author disclaims copyright to this source code. In place of

	7552 ** a legal notice, here is a blessing:

	7553 **

	7554 ** May you do good and not evil.

	7555 ** May you find forgiveness for yourself and forgive others.

	7556 ** May you share freely, never taking more than you give.

	7557 **

	7558 *************************************************************************

	7559 **

	7560 ** This file contains the implementation of a write-ahead log (WAL) used in

	7561 ** "journal_mode=WAL" mode.

	7562 **

	7563 ** WRITE-AHEAD LOG (WAL) FILE FORMAT

	7564 **

	7565 ** A WAL file consists of a header followed by zero or more "frames".

	7566 ** Each frame records the revised content of a single page from the

	7567 ** database file. All changes to the database are recorded by writing

	7568 ** frames into the WAL. Transactions commit when a frame is written that

	7569 ** contains a commit marker. A single WAL can and usually does record

	7570 ** multiple transactions. Periodically, the content of the WAL is

	7571 ** transferred back into the database file in an operation called a

	7572 ** "checkpoint".

	7573 **

	7574 ** A single WAL file can be used multiple times. In other words, the

	7575 ** WAL can fill up with frames and then be checkpointed and then new

	7576 ** frames can overwrite the old ones. A WAL always grows from beginning

	7577 ** toward the end. Checksums and counters attached to each frame are

	7578 ** used to determine which frames within the WAL are valid and which

	7579 ** are leftovers from prior checkpoints.

	7580 **

	7581 ** The WAL header is 32 bytes in size and consists of the following eight

	7582 ** big-endian 32-bit unsigned integer values:

	7583 **

	7584 ** 0: Magic number. 0x377f0682 or 0x377f0683

	7585 ** 4: File format version. Currently 3007000

	7586 ** 8: Database page size. Example: 1024

	7587 ** 12: Checkpoint sequence number

	7588 ** 16: Salt-1, random integer incremented with each checkpoint

	7589 ** 20: Salt-2, a different random integer changing with each ckpt

	7590 ** 24: Checksum-1 (first part of checksum for first 24 bytes of header).

	7591 ** 28: Checksum-2 (second part of checksum for first 24 bytes of header).

	7592 **

	7593 ** Immediately following the wal-header are zero or more frames. Each

	7594 ** frame consists of a 24-byte frame-header followed by <page-size> bytes

	7595 ** of page data. The frame-header is six big-endian 32-bit unsigned

	7596 ** integer values, as follows:

	7597 **

	7598 ** 0: Page number.

	7599 ** 4: For commit records, the size of the database image in pages

	7600 ** after the commit. For all other records, zero.

	7601 ** 8: Salt-1 (copied from the header)

	7602 ** 12: Salt-2 (copied from the header)

	7603 ** 16: Checksum-1.

	7604 ** 20: Checksum-2.

	7605 **

	7606 ** A frame is considered valid if and only if the following conditions are

	7607 ** true:

	7608 **

	7609 ** (1) The salt-1 and salt-2 values in the frame-header match

	7610 ** salt values in the wal-header

	7611 **

	7612 ** (2) The checksum values in the final 8 bytes of the frame-header

	7613 ** exactly match the checksum computed consecutively on the

	7614 ** WAL header and the first 8 bytes and the content of all frames

	7615 ** up to and including the current frame.

	7616 **

	7617 ** The checksum is computed using 32-bit big-endian integers if the

	7618 ** magic number in the first 4 bytes of the WAL is 0x377f0683 and it

	7619 ** is computed using little-endian if the magic number is 0x377f0682.

	7620 ** The checksum values are always stored in the frame header in a

	7621 ** big-endian format regardless of which byte order is used to compute

	7622 ** the checksum. The checksum is computed by interpreting the input as

	7623 ** an even number of unsigned 32-bit integers: x[0] through x[N]. The

	7624 ** algorithm used for the checksum is as follows:

	7625 **

	7626 ** for i from 0 to n-1 step 2:

	7627 ** s0 += x[i] + s1;

	7628 ** s1 += x[i+1] + s0;

	7629 ** endfor

	7630 **

	7631 ** Note that s0 and s1 are both weighted checksums using fibonacci weights

	7632 ** in reverse order (the largest fibonacci weight occurs on the first element

	7633 ** of the sequence being summed.) The s1 value spans all 32-bit

	7634 ** terms of the sequence whereas s0 omits the final term.

	7635 **

	7636 ** On a checkpoint, the WAL is first VFS.xSync-ed, then valid content of the

	7637 ** WAL is transferred into the database, then the database is VFS.xSync-ed.

	7638 ** The VFS.xSync operations serve as write barriers - all writes launched

	7639 ** before the xSync must complete before any write that launches after the

	7640 ** xSync begins.

	7641 **

	7642 ** After each checkpoint, the salt-1 value is incremented and the salt-2

	7643 ** value is randomized. This prevents old and new frames in the WAL from

	7644 ** being considered valid at the same time and being checkpointed together

	7645 ** following a crash.

	7646 **

	7647 ** READER ALGORITHM

	7648 **

	7649 ** To read a page from the database (call it page number P), a reader

	7650 ** first checks the WAL to see if it contains page P. If so, then the

	7651 ** last valid instance of page P that is followed by a commit frame

	7652 ** or is a commit frame itself becomes the value read. If the WAL

	7653 ** contains no copies of page P that are valid and which are a commit

	7654 ** frame or are followed by a commit frame, then page P is read from

	7655 ** the database file.

	7656 **

	7657 ** To start a read transaction, the reader records the index of the last

	7658 ** valid frame in the WAL. The reader uses this recorded "mxFrame" value

	7659 ** for all subsequent read operations. New transactions can be appended

	7660 ** to the WAL, but as long as the reader uses its original mxFrame value

	7661 ** and ignores the newly appended content, it will see a consistent snapshot

	7662 ** of the database from a single point in time. This technique allows

	7663 ** multiple concurrent readers to view different versions of the database

	7664 ** content simultaneously.

	7665 **

	7666 ** The reader algorithm in the previous paragraphs works correctly, but

	7667 ** because frames for page P can appear anywhere within the WAL, the

	7668 ** reader has to scan the entire WAL looking for page P frames. If the

	7669 ** WAL is large (multiple megabytes is typical) that scan can be slow,

	7670 ** and read performance suffers. To overcome this problem, a separate

	7671 ** data structure called the wal-index is maintained to expedite the

	7672 ** search for frames of a particular page.

	7673 **

	7674 ** WAL-INDEX FORMAT

	7675 **

	7676 ** Conceptually, the wal-index is shared memory, though VFS implementations

	7677 ** might choose to implement the wal-index using a mmapped file. Because

	7678 ** the wal-index is shared memory, SQLite does not support journal_mode=WAL

	7679 ** on a network filesystem. All users of the database must be able to

	7680 ** share memory.

	7681 **

	7682 ** The wal-index is transient. After a crash, the wal-index can (and should

	7683 ** be) reconstructed from the original WAL file. In fact, the VFS is required

	7684 ** to either truncate or zero the header of the wal-index when the last

	7685 ** connection to it closes. Because the wal-index is transient, it can

	7686 ** use an architecture-specific format; it does not have to be cross-platform.

	7687 ** Hence, unlike the database and WAL file formats which store all values

	7688 ** as big endian, the wal-index can store multi-byte values in the native

	7689 ** byte order of the host computer.

	7690 **

	7691 ** The purpose of the wal-index is to answer this question quickly: Given

	7692 ** a page number P and a maximum frame index M, return the index of the

	7693 ** last frame in the wal before frame M for page P in the WAL, or return

	7694 ** NULL if there are no frames for page P in the WAL prior to M.

	7695 **

	7696 ** The wal-index consists of a header region, followed by one or

	7697 ** more index blocks.

	7698 **

	7699 ** The wal-index header contains the total number of frames within the WAL

	7700 ** in the mxFrame field.

	7701 **

	7702 ** Each index block except for the first contains information on

	7703 ** HASHTABLE_NPAGE frames. The first index block contains information on

	7704 ** HASHTABLE_NPAGE_ONE frames. The values of HASHTABLE_NPAGE_ONE and

	7705 ** HASHTABLE_NPAGE are selected so that together the wal-index header and

	7706 ** first index block are the same size as all other index blocks in the

	7707 ** wal-index.

	7708 **

	7709 ** Each index block contains two sections, a page-mapping that contains the

	7710 ** database page number associated with each wal frame, and a hash-table

	7711 ** that allows readers to query an index block for a specific page number.

	7712 ** The page-mapping is an array of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE

	7713 ** for the first index block) 32-bit page numbers. The first entry in the

	7714 ** first index-block contains the database page number corresponding to the

	7715 ** first frame in the WAL file. The first entry in the second index block

	7716 ** in the WAL file corresponds to the (HASHTABLE_NPAGE_ONE+1)th frame in

	7717 ** the log, and so on.

	7718 **

	7719 ** The last index block in a wal-index usually contains less than the full

	7720 ** complement of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE) page-numbers,

	7721 ** depending on the contents of the WAL file. This does not change the

	7722 ** allocated size of the page-mapping array - the page-mapping array merely

	7723 ** contains unused entries.

	7724 **

	7725 ** Even without using the hash table, the last frame for page P

	7726 ** can be found by scanning the page-mapping sections of each index block

	7727 ** starting with the last index block and moving toward the first, and

	7728 ** within each index block, starting at the end and moving toward the

	7729 ** beginning. The first entry that equals P corresponds to the frame

	7730 ** holding the content for that page.

	7731 **

	7732 ** The hash table consists of HASHTABLE_NSLOT 16-bit unsigned integers.

	7733 ** HASHTABLE_NSLOT = 2*HASHTABLE_NPAGE, and there is one entry in the

	7734 ** hash table for each page number in the mapping section, so the hash

	7735 ** table is never more than half full. The expected number of collisions

	7736 ** prior to finding a match is 1. Each entry of the hash table is an

	7737 ** 1-based index of an entry in the mapping section of the same

	7738 ** index block. Let K be the 1-based index of the largest entry in

	7739 ** the mapping section. (For index blocks other than the last, K will

	7740 ** always be exactly HASHTABLE_NPAGE (4096) and for the last index block

	7741 ** K will be (mxFrame%HASHTABLE_NPAGE).) Unused slots of the hash table

	7742 ** contain a value of 0.

	7743 **

	7744 ** To look for page P in the hash table, first compute a hash iKey on

	7745 ** P as follows:

	7746 **

	7747 ** iKey = (P * 383) % HASHTABLE_NSLOT

	7748 **

	7749 ** Then start scanning entries of the hash table, starting with iKey

	7750 ** (wrapping around to the beginning when the end of the hash table is

	7751 ** reached) until an unused hash slot is found. Let the first unused slot

	7752 ** be at index iUnused. (iUnused might be less than iKey if there was

	7753 ** wrap-around.) Because the hash table is never more than half full,

	7754 ** the search is guaranteed to eventually hit an unused entry. Let

	7755 ** iMax be the value between iKey and iUnused, closest to iUnused,

	7756 ** where aHash[iMax]==P. If there is no iMax entry (if there exists

	7757 ** no hash slot such that aHash[i]==p) then page P is not in the

	7758 ** current index block. Otherwise the iMax-th mapping entry of the

	7759 ** current index block corresponds to the last entry that references

	7760 ** page P.

	7761 **

	7762 ** A hash search begins with the last index block and moves toward the

	7763 ** first index block, looking for entries corresponding to page P. On

	7764 ** average, only two or three slots in each index block need to be

	7765 ** examined in order to either find the last entry for page P, or to

	7766 ** establish that no such entry exists in the block. Each index block

	7767 ** holds over 4000 entries. So two or three index blocks are sufficient

	7768 ** to cover a typical 10 megabyte WAL file, assuming 1K pages. 8 or 10

	7769 ** comparisons (on average) suffice to either locate a frame in the

	7770 ** WAL or to establish that the frame does not exist in the WAL. This

	7771 ** is much faster than scanning the entire 10MB WAL.

	7772 **

	7773 ** Note that entries are added in order of increasing K. Hence, one

	7774 ** reader might be using some value K0 and a second reader that started

	7775 ** at a later time (after additional transactions were added to the WAL

	7776 ** and to the wal-index) might be using a different value K1, where K1>K0.

	7777 ** Both readers can use the same hash table and mapping section to get

	7778 ** the correct result. There may be entries in the hash table with

	7779 ** K>K0 but to the first reader, those entries will appear to be unused

	7780 ** slots in the hash table and so the first reader will get an answer as

	7781 ** if no values greater than K0 had ever been inserted into the hash table

	7782 ** in the first place - which is what reader one wants. Meanwhile, the

	7783 ** second reader using K1 will see additional values that were inserted

	7784 ** later, which is exactly what reader two wants.

	7785 **

	7786 ** When a rollback occurs, the value of K is decreased. Hash table entries

	7787 ** that correspond to frames greater than the new K value are removed

	7788 ** from the hash table at this point.

	7789 */

	7790 #ifndef SQLITE_OMIT_WAL

	7791

	7792 /* #include "wal.h" */

	7793

	7794 /*

	7795 ** Trace output macros

	7796 */

	7797 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)

	7798 SQLITE_PRIVATE int sqlite3WalTrace = 0;

	7799 # define WALTRACE(X) if(sqlite3WalTrace) sqlite3DebugPrintf X

	7800 #else

	7801 # define WALTRACE(X)

	7802 #endif

	7803

	7804 /*

	7805 ** The maximum (and only) versions of the wal and wal-index formats

	7806 ** that may be interpreted by this version of SQLite.

	7807 **

	7808 ** If a client begins recovering a WAL file and finds that (a) the checksum

	7809 ** values in the wal-header are correct and (b) the version field is not

	7810 ** WAL_MAX_VERSION, recovery fails and SQLite returns SQLITE_CANTOPEN.

	7811 **

	7812 ** Similarly, if a client successfully reads a wal-index header (i.e. the

	7813 ** checksum test is successful) and finds that the version field is not

	7814 ** WALINDEX_MAX_VERSION, then no read-transaction is opened and SQLite

	7815 ** returns SQLITE_CANTOPEN.

	7816 */

	7817 #define WAL_MAX_VERSION 3007000

	7818 #define WALINDEX_MAX_VERSION 3007000

	7819

	7820 /*

	7821 ** Indices of various locking bytes. WAL_NREADER is the number

	7822 ** of available reader locks and should be at least 3. The default

	7823 ** is SQLITE_SHM_NLOCK==8 and WAL_NREADER==5.

	7824 */

	7825 #define WAL_WRITE_LOCK 0

	7826 #define WAL_ALL_BUT_WRITE 1

	7827 #define WAL_CKPT_LOCK 1

	7828 #define WAL_RECOVER_LOCK 2

	7829 #define WAL_READ_LOCK(I) (3+(I))

	7830 #define WAL_NREADER (SQLITE_SHM_NLOCK-3)

	7831

	7832

	7833 /* Object declarations */

	7834 typedef struct WalIndexHdr WalIndexHdr;

	7835 typedef struct WalIterator WalIterator;

	7836 typedef struct WalCkptInfo WalCkptInfo;

	7837

	7838

	7839 /*

	7840 ** The following object holds a copy of the wal-index header content.

	7841 **

	7842 ** The actual header in the wal-index consists of two copies of this

	7843 ** object followed by one instance of the WalCkptInfo object.

	7844 ** For all versions of SQLite through 3.10.0 and probably beyond,

	7845 ** the locking bytes (WalCkptInfo.aLock) start at offset 120 and

	7846 ** the total header size is 136 bytes.

	7847 **

	7848 ** The szPage value can be any power of 2 between 512 and 32768, inclusive.

	7849 ** Or it can be 1 to represent a 65536-byte page. The latter case was

	7850 ** added in 3.7.1 when support for 64K pages was added.

	7851 */

	7852 struct WalIndexHdr {

	7853 u32 iVersion; /* Wal-index version */

	7854 u32 unused; /* Unused (padding) field */

	7855 u32 iChange; /* Counter incremented each transaction */

	7856 u8 isInit; /* 1 when initialized */

	7857 u8 bigEndCksum; /* True if checksums in WAL are big-endian */

	7858 u16 szPage; /* Database page size in bytes. 1==64K */

	7859 u32 mxFrame; /* Index of last valid frame in the WAL */

	7860 u32 nPage; /* Size of database in pages */

	7861 u32 aFrameCksum[2]; /* Checksum of last frame in log */

	7862 u32 aSalt[2]; /* Two salt values copied from WAL header */

	7863 u32 aCksum[2]; /* Checksum over all prior fields */

	7864 };

	7865

	7866 /*

	7867 ** A copy of the following object occurs in the wal-index immediately

	7868 ** following the second copy of the WalIndexHdr. This object stores

	7869 ** information used by checkpoint.

	7870 **

	7871 ** nBackfill is the number of frames in the WAL that have been written

	7872 ** back into the database. (We call the act of moving content from WAL to

	7873 ** database "backfilling".) The nBackfill number is never greater than

	7874 ** WalIndexHdr.mxFrame. nBackfill can only be increased by threads

	7875 ** holding the WAL_CKPT_LOCK lock (which includes a recovery thread).

	7876 ** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from

	7877 ** mxFrame back to zero when the WAL is reset.

	7878 **

	7879 ** nBackfillAttempted is the largest value of nBackfill that a checkpoint

	7880 ** has attempted to achieve. Normally nBackfill==nBackfillAttempted, however

	7881 ** the nBackfillAttempted is set before any backfilling is done and the

	7882 ** nBackfill is only set after all backfilling completes. So if a checkpoint

	7883 ** crashes, nBackfillAttempted might be larger than nBackfill. The

	7884 ** WalIndexHdr.mxFrame must never be less than nBackfillAttempted.

	7885 **

	7886 ** The aLock[] field is a set of bytes used for locking. These bytes should

	7887 ** never be read or written.

	7888 **

	7889 ** There is one entry in aReadMark[] for each reader lock. If a reader

	7890 ** holds read-lock K, then the value in aReadMark[K] is no greater than

	7891 ** the mxFrame for that reader. The value READMARK_NOT_USED (0xffffffff)

	7892 ** for any aReadMark[] means that entry is unused. aReadMark[0] is

	7893 ** a special case; its value is never used and it exists as a place-holder

	7894 ** to avoid having to offset aReadMark[] indexes by one. Readers holding

	7895 ** WAL_READ_LOCK(0) always ignore the entire WAL and read all content

	7896 ** directly from the database.

	7897 **

	7898 ** The value of aReadMark[K] may only be changed by a thread that

	7899 ** is holding an exclusive lock on WAL_READ_LOCK(K). Thus, the value of

	7900 ** aReadMark[K] cannot be changed while there is a reader using that mark

	7901 ** since the reader will be holding a shared lock on WAL_READ_LOCK(K).

	7902 **

	7903 ** The checkpointer may only transfer frames from WAL to database where

	7904 ** the frame numbers are less than or equal to every aReadMark[] that is

	7905 ** in use (that is, every aReadMark[j] for which there is a corresponding

	7906 ** WAL_READ_LOCK(j)). New readers (usually) pick the aReadMark[] with the

	7907 ** largest value and will increase an unused aReadMark[] to mxFrame if there

	7908 ** is not already an aReadMark[] equal to mxFrame. The exception to the

	7909 ** previous sentence is when nBackfill equals mxFrame (meaning that everything

	7910 ** in the WAL has been backfilled into the database) then new readers

	7911 ** will choose aReadMark[0] which has value 0 and hence such readers will

	7912 ** get all their content directly from the database file and ignore

	7913 ** the WAL.

	7914 **

	7915 ** Writers normally append new frames to the end of the WAL. However,

	7916 ** if nBackfill equals mxFrame (meaning that all WAL content has been

	7917 ** written back into the database) and if no readers are using the WAL

	7918 ** (in other words, if there are no WAL_READ_LOCK(i) where i>0) then

	7919 ** the writer will first "reset" the WAL back to the beginning and start

	7920 ** writing new content beginning at frame 1.

	7921 **

	7922 ** We assume that 32-bit loads are atomic and so no locks are needed in

	7923 ** order to read from any aReadMark[] entries.

	7924 */

	7925 struct WalCkptInfo {

	7926 u32 nBackfill; /* Number of WAL frames backfilled into DB */

	7927 u32 aReadMark[WAL_NREADER]; /* Reader marks */

	7928 u8 aLock[SQLITE_SHM_NLOCK]; /* Reserved space for locks */

	7929 u32 nBackfillAttempted; /* WAL frames perhaps written, or maybe not */

	7930 u32 notUsed0; /* Available for future enhancements */

	7931 };

	7932 #define READMARK_NOT_USED 0xffffffff

	7933

	7934

	7935 /* A block of WALINDEX_LOCK_RESERVED bytes beginning at

	7936 ** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems

	7937 ** only support mandatory file-locks, we do not read or write data

	7938 ** from the region of the file on which locks are applied.

	7939 */

	7940 #define WALINDEX_LOCK_OFFSET (sizeof(WalIndexHdr)*2+offsetof(WalCkptInfo,aLock))

	7941 #define WALINDEX_HDR_SIZE (sizeof(WalIndexHdr)*2+sizeof(WalCkptInfo))

	7942

	7943 /* Size of header before each frame in wal */

	7944 #define WAL_FRAME_HDRSIZE 24

	7945

	7946 /* Size of write ahead log header, including checksum. */

	7947 /* #define WAL_HDRSIZE 24 */

	7948 #define WAL_HDRSIZE 32

	7949

	7950 /* WAL magic value. Either this value, or the same value with the least

	7951 ** significant bit also set (WAL_MAGIC | 0x00000001) is stored in 32-bit

	7952 ** big-endian format in the first 4 bytes of a WAL file.

	7953 **

	7954 ** If the LSB is set, then the checksums for each frame within the WAL

	7955 ** file are calculated by treating all data as an array of 32-bit

	7956 ** big-endian words. Otherwise, they are calculated by interpreting

	7957 ** all data as 32-bit little-endian words.

	7958 */

	7959 #define WAL_MAGIC 0x377f0682

	7960

	7961 /*

	7962 ** Return the offset of frame iFrame in the write-ahead log file,

	7963 ** assuming a database page size of szPage bytes. The offset returned

	7964 ** is to the start of the write-ahead log frame-header.

	7965 */

	7966 #define walFrameOffset(iFrame, szPage) ( \

	7967 WAL_HDRSIZE + ((iFrame)-1)*(i64)((szPage)+WAL_FRAME_HDRSIZE) \

	7968 )

	7969

	7970 /*

	7971 ** An open write-ahead log file is represented by an instance of the

	7972 ** following object.

	7973 */

	7974 struct Wal {

	7975 sqlite3_vfs *pVfs; /* The VFS used to create pDbFd */

	7976 sqlite3_file *pDbFd; /* File handle for the database file */

	7977 sqlite3_file *pWalFd; /* File handle for WAL file */

	7978 u32 iCallback; /* Value to pass to log callback (or 0) */

	7979 i64 mxWalSize; /* Truncate WAL to this size upon reset */

	7980 int nWiData; /* Size of array apWiData */

	7981 int szFirstBlock; /* Size of first block written to WAL file */

	7982 volatile u32 **apWiData; /* Pointer to wal-index content in memory */

	7983 u32 szPage; /* Database page size */

	7984 i16 readLock; /* Which read lock is being held. -1 for none */

	7985 u8 syncFlags; /* Flags to use to sync header writes */

	7986 u8 exclusiveMode; /* Non-zero if connection is in exclusive mode */

	7987 u8 writeLock; /* True if in a write transaction */

	7988 u8 ckptLock; /* True if holding a checkpoint lock */

	7989 u8 readOnly; /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */

	7990 u8 truncateOnCommit; /* True to truncate WAL file on commit */

	7991 u8 syncHeader; /* Fsync the WAL header if true */

	7992 u8 padToSectorBoundary; /* Pad transactions out to the next sector */

	7993 WalIndexHdr hdr; /* Wal-index header for current transaction */

	7994 u32 minFrame; /* Ignore wal frames before this one */

	7995 const char *zWalName; /* Name of WAL file */

	7996 u32 nCkpt; /* Checkpoint sequence counter in the wal-header */

	7997 #ifdef SQLITE_DEBUG

	7998 u8 lockError; /* True if a locking error has occurred */

	7999 #endif

	8000 #ifdef SQLITE_ENABLE_SNAPSHOT

	8001 WalIndexHdr *pSnapshot; /* Start transaction here if not NULL */

	8002 #endif

	8003 };

	8004

	8005 /*

	8006 ** Candidate values for Wal.exclusiveMode.

	8007 */

	8008 #define WAL_NORMAL_MODE 0

	8009 #define WAL_EXCLUSIVE_MODE 1

	8010 #define WAL_HEAPMEMORY_MODE 2

	8011

	8012 /*

	8013 ** Possible values for WAL.readOnly

	8014 */

	8015 #define WAL_RDWR 0 /* Normal read/write connection */

	8016 #define WAL_RDONLY 1 /* The WAL file is readonly */

	8017 #define WAL_SHM_RDONLY 2 /* The SHM file is readonly */

	8018

	8019 /*

	8020 ** Each page of the wal-index mapping contains a hash-table made up of

	8021 ** an array of HASHTABLE_NSLOT elements of the following type.

	8022 */

	8023 typedef u16 ht_slot;

	8024

	8025 /*

	8026 ** This structure is used to implement an iterator that loops through

	8027 ** all frames in the WAL in database page order. Where two or more frames

	8028 ** correspond to the same database page, the iterator visits only the

	8029 ** frame most recently written to the WAL (in other words, the frame with

	8030 ** the largest index).

	8031 **

	8032 ** The internals of this structure are only accessed by:

	8033 **

	8034 ** walIteratorInit() - Create a new iterator,

	8035 ** walIteratorNext() - Step an iterator,

	8036 ** walIteratorFree() - Free an iterator.

	8037 **

	8038 ** This functionality is used by the checkpoint code (see walCheckpoint()).

	8039 */

	8040 struct WalIterator {

	8041 int iPrior; /* Last result returned from the iterator */

	8042 int nSegment; /* Number of entries in aSegment[] */

	8043 struct WalSegment {

	8044 int iNext; /* Next slot in aIndex[] not yet returned */

	8045 ht_slot *aIndex; /* i0, i1, i2... such that aPgno[iN] ascend */

	8046 u32 *aPgno; /* Array of page numbers. */

	8047 int nEntry; /* Nr. of entries in aPgno[] and aIndex[] */

	8048 int iZero; /* Frame number associated with aPgno[0] */

	8049 } aSegment[1]; /* One for every 32KB page in the wal-index */

	8050 };

	8051

	8052 /*

	8053 ** Define the parameters of the hash tables in the wal-index file. There

	8054 ** is a hash-table following every HASHTABLE_NPAGE page numbers in the

	8055 ** wal-index.

	8056 **

	8057 ** Changing any of these constants will alter the wal-index format and

	8058 ** create incompatibilities.

	8059 */

	8060 #define HASHTABLE_NPAGE 4096 /* Must be power of 2 */

	8061 #define HASHTABLE_HASH_1 383 /* Should be prime */

	8062 #define HASHTABLE_NSLOT (HASHTABLE_NPAGE*2) /* Must be a power of 2 */

	8063

	8064 /*

	8065 ** The block of page numbers associated with the first hash-table in a

	8066 ** wal-index is smaller than usual. This is so that there is a complete

	8067 ** hash-table on each aligned 32KB page of the wal-index.

	8068 */

	8069 #define HASHTABLE_NPAGE_ONE (HASHTABLE_NPAGE - (WALINDEX_HDR_SIZE/sizeof(u32)))

	8070

	8071 /* The wal-index is divided into pages of WALINDEX_PGSZ bytes each. */

	8072 #define WALINDEX_PGSZ ( \

	8073 sizeof(ht_slot)*HASHTABLE_NSLOT + HASHTABLE_NPAGE*sizeof(u32) \

	8074 )

	8075

	8076 /*

	8077 ** Obtain a pointer to the iPage'th page of the wal-index. The wal-index

	8078 ** is broken into pages of WALINDEX_PGSZ bytes. Wal-index pages are

	8079 ** numbered from zero.

	8080 **

	8081 ** If this call is successful, *ppPage is set to point to the wal-index

	8082 ** page and SQLITE_OK is returned. If an error (an OOM or VFS error) occurs,

	8083 ** then an SQLite error code is returned and *ppPage is set to 0.

	8084 */

	8085 static int walIndexPage(Wal *pWal, int iPage, volatile u32 **ppPage){

	8086 int rc = SQLITE_OK;

	8087

	8088 /* Enlarge the pWal->apWiData[] array if required */

	8089 if( pWal->nWiData<=iPage ){

	8090 int nByte = sizeof(u32*)*(iPage+1);

	8091 volatile u32 **apNew;

	8092 apNew = (volatile u32 **)sqlite3_realloc64((void *)pWal->apWiData, nByte);

	8093 if( !apNew ){

	8094 *ppPage = 0;

	8095 return SQLITE_NOMEM;

	8096 }

	8097 memset((void*)&apNew[pWal->nWiData], 0,

	8098 sizeof(u32*)*(iPage+1-pWal->nWiData));

	8099 pWal->apWiData = apNew;

	8100 pWal->nWiData = iPage+1;

	8101 }

	8102

	8103 /* Request a pointer to the required page from the VFS */

	8104 if( pWal->apWiData[iPage]==0 ){

	8105 if( pWal->exclusiveMode==WAL_HEAPMEMORY_MODE ){

	8106 pWal->apWiData[iPage] = (u32 volatile *)sqlite3MallocZero(WALINDEX_PGSZ);

	8107 if( !pWal->apWiData[iPage] ) rc = SQLITE_NOMEM;

	8108 }else{

	8109 rc = sqlite3OsShmMap(pWal->pDbFd, iPage, WALINDEX_PGSZ,

	8110 pWal->writeLock, (void volatile **)&pWal->apWiData[iPage]

	8111 );

	8112 if( rc==SQLITE_READONLY ){

	8113 pWal->readOnly |= WAL_SHM_RDONLY;

	8114 rc = SQLITE_OK;

	8115 }

	8116 }

	8117 }

	8118

	8119 *ppPage = pWal->apWiData[iPage];

	8120 assert( iPage==0 || *ppPage || rc!=SQLITE_OK );

	8121 return rc;

	8122 }

	8123

	8124 /*

	8125 ** Return a pointer to the WalCkptInfo structure in the wal-index.

	8126 */

	8127 static volatile WalCkptInfo *walCkptInfo(Wal *pWal){

	8128 assert( pWal->nWiData>0 && pWal->apWiData[0] );

	8129 return (volatile WalCkptInfo*)&(pWal->apWiData[0][sizeof(WalIndexHdr)/2]);

	8130 }

	8131

	8132 /*

	8133 ** Return a pointer to the WalIndexHdr structure in the wal-index.

	8134 */

	8135 static volatile WalIndexHdr *walIndexHdr(Wal *pWal){

	8136 assert( pWal->nWiData>0 && pWal->apWiData[0] );

	8137 return (volatile WalIndexHdr*)pWal->apWiData[0];

	8138 }

	8139

	/*
	** The argument to this macro must be of type u32. On a little-endian
	** architecture, it returns the u32 value that results from interpreting
	** the 4 bytes as a big-endian value. On a big-endian architecture, it
	** returns the value that would be produced by interpreting the 4 bytes
	** of the input value as a little-endian integer.
	**
	** The argument is evaluated four times, so it must not have side effects.
	*/
	#define BYTESWAP32(x) ( \
	    (((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8)  \
	  + (((x)&0x00FF0000)>>8)  + (((x)&0xFF000000)>>24) \
	)

	8151

	8152 /*

	8153 ** Generate or extend an 8 byte checksum based on the data in

	8154 ** array aByte[] and the initial values of aIn[0] and aIn[1] (or

	8155 ** initial values of 0 and 0 if aIn==NULL).

	8156 **

	8157 ** The checksum is written back into aOut[] before returning.

	8158 **

	8159 ** nByte must be a positive multiple of 8.

	8160 */

	8161 static void walChecksumBytes(

	8162 int nativeCksum, /* True for native byte-order, false for non-native */

	8163 u8 a, / Content to be checksummed */

	8164 int nByte, /* Bytes of content in a[]. Must be a multiple of 8. */

	8165 const u32 aIn, / Initial checksum value input */

	8166 u32 aOut / OUT: Final checksum value output */

	8167 ){

	8168 u32 s1, s2;

	8169 u32 aData = (u32 )a;

	8170 u32 aEnd = (u32 )&a[nByte];

	8171

	8172 if( aIn ){

	8173 s1 = aIn[0];

	8174 s2 = aIn[1];

	8175 }else{

	8176 s1 = s2 = 0;

	8177 }

	8178

	8179 assert( nByte>=8 );

	8180 assert( (nByte&0x00000007)==0 );

	8181

	8182 if( nativeCksum ){

	8183 do {

	8184 s1 += *aData++ + s2;

	8185 s2 += *aData++ + s1;

	8186 }while( aData<aEnd );

	8187 }else{

	8188 do {

	8189 s1 += BYTESWAP32(aData[0]) + s2;

	8190 s2 += BYTESWAP32(aData[1]) + s1;

	8191 aData += 2;

	8192 }while( aData<aEnd );

	8193 }

	8194

	8195 aOut[0] = s1;

	8196 aOut[1] = s2;

	8197 }

	8198

	8199 static void walShmBarrier(Wal *pWal){

	8200 if( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE ){

	8201 sqlite3OsShmBarrier(pWal->pDbFd);

	8202 }

	8203 }

	8204

	8205 /*

	8206 ** Write the header information in pWal->hdr into the wal-index.

	8207 **

	8208 ** The checksum on pWal->hdr is updated before it is written.

	8209 */

	8210 static void walIndexWriteHdr(Wal *pWal){

	8211 volatile WalIndexHdr *aHdr = walIndexHdr(pWal);

	8212 const int nCksum = offsetof(WalIndexHdr, aCksum);

	8213

	8214 assert( pWal->writeLock );

	8215 pWal->hdr.isInit = 1;

	8216 pWal->hdr.iVersion = WALINDEX_MAX_VERSION;

	8217 walChecksumBytes(1, (u8*)&pWal->hdr, nCksum, 0, pWal->hdr.aCksum);

	8218 memcpy((void)&aHdr[1], (const void)&pWal->hdr, sizeof(WalIndexHdr));

	8219 walShmBarrier(pWal);

	8220 memcpy((void)&aHdr[0], (const void)&pWal->hdr, sizeof(WalIndexHdr));

	8221 }

	8222

	8223 /*

	8224 ** This function encodes a single frame header and writes it to a buffer

	8225 ** supplied by the caller. A frame-header is made up of a series of

	8226 ** 4-byte big-endian integers, as follows:

	8227 **

	8228 ** 0: Page number.

	8229 ** 4: For commit records, the size of the database image in pages

	8230 ** after the commit. For all other records, zero.

	8231 ** 8: Salt-1 (copied from the wal-header)

	8232 ** 12: Salt-2 (copied from the wal-header)

	8233 ** 16: Checksum-1.

	8234 ** 20: Checksum-2.

	8235 */

	8236 static void walEncodeFrame(

	8237 Wal pWal, / The write-ahead log */

	8238 u32 iPage, /* Database page number for frame */

	8239 u32 nTruncate, /* New db size (or 0 for non-commit frames) */

	8240 u8 aData, / Pointer to page data */

	8241 u8 aFrame / OUT: Write encoded frame here */

	8242 ){

	8243 int nativeCksum; /* True for native byte-order checksums */

	8244 u32 *aCksum = pWal->hdr.aFrameCksum;

	8245 assert( WAL_FRAME_HDRSIZE==24 );

	8246 sqlite3Put4byte(&aFrame[0], iPage);

	8247 sqlite3Put4byte(&aFrame[4], nTruncate);

	8248 memcpy(&aFrame[8], pWal->hdr.aSalt, 8);

	8249

	8250 nativeCksum = (pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN);

	8251 walChecksumBytes(nativeCksum, aFrame, 8, aCksum, aCksum);

	8252 walChecksumBytes(nativeCksum, aData, pWal->szPage, aCksum, aCksum);

	8253

	8254 sqlite3Put4byte(&aFrame[16], aCksum[0]);

	8255 sqlite3Put4byte(&aFrame[20], aCksum[1]);

	8256 }

	8257

	8258 /*

	8259 ** Check to see if the frame with header in aFrame[] and content

	8260 ** in aData[] is valid. If it is a valid frame, fill *piPage and

	8261 ** *pnTruncate and return true. Return if the frame is not valid.

	8262 */

	8263 static int walDecodeFrame(

	8264 Wal pWal, / The write-ahead log */

	8265 u32 piPage, / OUT: Database page number for frame */

	8266 u32 pnTruncate, / OUT: New db size (or 0 if not commit) */

	8267 u8 aData, / Pointer to page data (for checksum) */

	8268 u8 aFrame / Frame data */

	8269 ){

	8270 int nativeCksum; /* True for native byte-order checksums */

	8271 u32 *aCksum = pWal->hdr.aFrameCksum;

	8272 u32 pgno; /* Page number of the frame */

	8273 assert( WAL_FRAME_HDRSIZE==24 );

	8274

	8275 /* A frame is only valid if the salt values in the frame-header

	8276 ** match the salt values in the wal-header.

	8277 */

	8278 if( memcmp(&pWal->hdr.aSalt, &aFrame[8], 8)!=0 ){

	8279 return 0;

	8280 }

	8281

	8282 /* A frame is only valid if the page number is creater than zero.

	8283 */

	8284 pgno = sqlite3Get4byte(&aFrame[0]);

	8285 if( pgno==0 ){

	8286 return 0;

	8287 }

	8288

	8289 /* A frame is only valid if a checksum of the WAL header,

	8290 ** all prior frams, the first 16 bytes of this frame-header,

	8291 ** and the frame-data matches the checksum in the last 8

	8292 ** bytes of this frame-header.

	8293 */

	8294 nativeCksum = (pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN);

	8295 walChecksumBytes(nativeCksum, aFrame, 8, aCksum, aCksum);

	8296 walChecksumBytes(nativeCksum, aData, pWal->szPage, aCksum, aCksum);

	8297 if( aCksum[0]!=sqlite3Get4byte(&aFrame[16])

	8298 \|\| aCksum[1]!=sqlite3Get4byte(&aFrame[20])

	8299 ){

	8300 /* Checksum failed. */

	8301 return 0;

	8302 }

	8303

	8304 /* If we reach this point, the frame is valid. Return the page number

	8305 ** and the new database size.

	8306 */

	8307 *piPage = pgno;

	8308 *pnTruncate = sqlite3Get4byte(&aFrame[4]);

	8309 return 1;

	8310 }

	8311

	8312

	#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
	/*
	** Names of locks.  This routine is used to provide debugging output and is not
	** a part of an ordinary build.
	**
	** The name of a read-lock is formatted into a function-static buffer, so
	** the pointer returned for one read-lock is overwritten by the next call
	** that names a read-lock.
	*/
	static const char *walLockName(int lockIdx){
	  if( lockIdx==WAL_WRITE_LOCK ){
	    return "WRITE-LOCK";
	  }else if( lockIdx==WAL_CKPT_LOCK ){
	    return "CKPT-LOCK";
	  }else if( lockIdx==WAL_RECOVER_LOCK ){
	    return "RECOVER-LOCK";
	  }else{
	    static char zName[15];
	    sqlite3_snprintf(sizeof(zName), zName, "READ-LOCK[%d]",
	                     lockIdx-WAL_READ_LOCK(0));
	    return zName;
	  }
	}
	#endif /* defined(SQLITE_TEST) && defined(SQLITE_DEBUG) */

	8333

	8334

	8335 /*

	8336 ** Set or release locks on the WAL. Locks are either shared or exclusive.

	8337 ** A lock cannot be moved directly between shared and exclusive - it must go

	8338 ** through the unlocked state first.

	8339 **

	8340 ** In locking_mode=EXCLUSIVE, all of these routines become no-ops.

	8341 */

	8342 static int walLockShared(Wal *pWal, int lockIdx){

	8343 int rc;

	8344 if( pWal->exclusiveMode ) return SQLITE_OK;

	8345 rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1,

	8346 SQLITE_SHM_LOCK \| SQLITE_SHM_SHARED);

	8347 WALTRACE(("WAL%p: acquire SHARED-%s %s\n", pWal,

	8348 walLockName(lockIdx), rc ? "failed" : "ok"));

	8349 VVA_ONLY( pWal->lockError = (u8)(rc!=SQLITE_OK && rc!=SQLITE_BUSY); )

	8350 return rc;

	8351 }

	8352 static void walUnlockShared(Wal *pWal, int lockIdx){

	8353 if( pWal->exclusiveMode ) return;

	8354 (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1,

	8355 SQLITE_SHM_UNLOCK \| SQLITE_SHM_SHARED);

	8356 WALTRACE(("WAL%p: release SHARED-%s\n", pWal, walLockName(lockIdx)));

	8357 }

	8358 static int walLockExclusive(Wal *pWal, int lockIdx, int n){

	8359 int rc;

	8360 if( pWal->exclusiveMode ) return SQLITE_OK;

	8361 rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,

	8362 SQLITE_SHM_LOCK \| SQLITE_SHM_EXCLUSIVE);

	8363 WALTRACE(("WAL%p: acquire EXCLUSIVE-%s cnt=%d %s\n", pWal,

	8364 walLockName(lockIdx), n, rc ? "failed" : "ok"));

	8365 VVA_ONLY( pWal->lockError = (u8)(rc!=SQLITE_OK && rc!=SQLITE_BUSY); )

	8366 return rc;

	8367 }

	8368 static void walUnlockExclusive(Wal *pWal, int lockIdx, int n){

	8369 if( pWal->exclusiveMode ) return;

	8370 (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,

	8371 SQLITE_SHM_UNLOCK \| SQLITE_SHM_EXCLUSIVE);

	8372 WALTRACE(("WAL%p: release EXCLUSIVE-%s cnt=%d\n", pWal,

	8373 walLockName(lockIdx), n));

	8374 }

	8375

	8376 /*

	8377 ** Compute a hash on a page number. The resulting hash value must land

	8378 ** between 0 and (HASHTABLE_NSLOT-1). The walHashNext() function advances

	8379 ** the hash to the next value in the event of a collision.

	8380 */

	8381 static int walHash(u32 iPage){

	8382 assert( iPage>0 );

	8383 assert( (HASHTABLE_NSLOT & (HASHTABLE_NSLOT-1))==0 );

	8384 return (iPage*HASHTABLE_HASH_1) & (HASHTABLE_NSLOT-1);

	8385 }

	8386 static int walNextHash(int iPriorHash){

	8387 return (iPriorHash+1)&(HASHTABLE_NSLOT-1);

	8388 }

	8389

	8390 /*

	8391 ** Return pointers to the hash table and page number array stored on

	8392 ** page iHash of the wal-index. The wal-index is broken into 32KB pages

	8393 ** numbered starting from 0.

	8394 **

	8395 ** Set output variable *paHash to point to the start of the hash table

	8396 ** in the wal-index file. Set *piZero to one less than the frame

	8397 ** number of the first frame indexed by this hash table. If a

	8398 ** slot in the hash table is set to N, it refers to frame number

	8399 ** (*piZero+N) in the log.

	8400 **

	8401 ** Finally, set paPgno so that paPgno[1] is the page number of the

	8402 ** first frame indexed by the hash table, frame (*piZero+1).

	8403 */

	8404 static int walHashGet(

	8405 Wal pWal, / WAL handle */

	8406 int iHash, /* Find the iHash'th table */

	8407 volatile ht_slot *paHash, / OUT: Pointer to hash index */

	8408 volatile u32 *paPgno, / OUT: Pointer to page number array */

	8409 u32 piZero / OUT: Frame associated with paPgno[0] /

	8410 ){

	8411 int rc; /* Return code */

	8412 volatile u32 *aPgno;

	8413

	8414 rc = walIndexPage(pWal, iHash, &aPgno);

	8415 assert( rc==SQLITE_OK \|\| iHash>0 );

	8416

	8417 if( rc==SQLITE_OK ){

	8418 u32 iZero;

	8419 volatile ht_slot *aHash;

	8420

	8421 aHash = (volatile ht_slot *)&aPgno[HASHTABLE_NPAGE];

	8422 if( iHash==0 ){

	8423 aPgno = &aPgno[WALINDEX_HDR_SIZE/sizeof(u32)];

	8424 iZero = 0;

	8425 }else{

	8426 iZero = HASHTABLE_NPAGE_ONE + (iHash-1)*HASHTABLE_NPAGE;

	8427 }

	8428

	8429 *paPgno = &aPgno[-1];

	8430 *paHash = aHash;

	8431 *piZero = iZero;

	8432 }

	8433 return rc;

	8434 }

	8435

	8436 /*

	8437 ** Return the number of the wal-index page that contains the hash-table

	8438 ** and page-number array that contain entries corresponding to WAL frame

	8439 ** iFrame. The wal-index is broken up into 32KB pages. Wal-index pages

	8440 ** are numbered starting from 0.

	8441 */

	8442 static int walFramePage(u32 iFrame){

	8443 int iHash = (iFrame+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1) / HASHTABLE_NPAGE;

	8444 assert( (iHash==0 \|\| iFrame>HASHTABLE_NPAGE_ONE)

	8445 && (iHash>=1 \|\| iFrame<=HASHTABLE_NPAGE_ONE)

	8446 && (iHash<=1 \|\| iFrame>(HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE))

	8447 && (iHash>=2 \|\| iFrame<=HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE)

	8448 && (iHash<=2 \|\| iFrame>(HASHTABLE_NPAGE_ONE+2*HASHTABLE_NPAGE))

	8449 );

	8450 return iHash;

	8451 }

	8452

	8453 /*

	8454 ** Return the page number associated with frame iFrame in this WAL.

	8455 */

	8456 static u32 walFramePgno(Wal *pWal, u32 iFrame){

	8457 int iHash = walFramePage(iFrame);

	8458 if( iHash==0 ){

	8459 return pWal->apWiData[0][WALINDEX_HDR_SIZE/sizeof(u32) + iFrame - 1];

	8460 }

	8461 return pWal->apWiData[iHash][(iFrame-1-HASHTABLE_NPAGE_ONE)%HASHTABLE_NPAGE];

	8462 }

	8463

	8464 /*

	8465 ** Remove entries from the hash table that point to WAL slots greater

	8466 ** than pWal->hdr.mxFrame.

	8467 **

	8468 ** This function is called whenever pWal->hdr.mxFrame is decreased due

	8469 ** to a rollback or savepoint.

	8470 **

	8471 ** At most only the hash table containing pWal->hdr.mxFrame needs to be

	8472 ** updated. Any later hash tables will be automatically cleared when

	8473 ** pWal->hdr.mxFrame advances to the point where those hash tables are

	8474 ** actually needed.

	8475 */

	8476 static void walCleanupHash(Wal *pWal){

	8477 volatile ht_slot aHash = 0; / Pointer to hash table to clear */

	8478 volatile u32 aPgno = 0; / Page number array for hash table */

	8479 u32 iZero = 0; /* frame == (aHash[x]+iZero) */

	8480 int iLimit = 0; /* Zero values greater than this */

	8481 int nByte; /* Number of bytes to zero in aPgno[] */

	8482 int i; /* Used to iterate through aHash[] */

	8483

	8484 assert( pWal->writeLock );

	8485 testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE-1 );

	8486 testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE );

	8487 testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE+1 );

	8488

	8489 if( pWal->hdr.mxFrame==0 ) return;

	8490

	8491 /* Obtain pointers to the hash-table and page-number array containing

	8492 ** the entry that corresponds to frame pWal->hdr.mxFrame. It is guaranteed

	8493 ** that the page said hash-table and array reside on is already mapped.

	8494 */

	8495 assert( pWal->nWiData>walFramePage(pWal->hdr.mxFrame) );

	8496 assert( pWal->apWiData[walFramePage(pWal->hdr.mxFrame)] );

	8497 walHashGet(pWal, walFramePage(pWal->hdr.mxFrame), &aHash, &aPgno, &iZero);

	8498

	8499 /* Zero all hash-table entries that correspond to frame numbers greater

	8500 ** than pWal->hdr.mxFrame.

	8501 */

	8502 iLimit = pWal->hdr.mxFrame - iZero;

	8503 assert( iLimit>0 );

	8504 for(i=0; i<HASHTABLE_NSLOT; i++){

	8505 if( aHash[i]>iLimit ){

	8506 aHash[i] = 0;

	8507 }

	8508 }

	8509

	8510 /* Zero the entries in the aPgno array that correspond to frames with

	8511 ** frame numbers greater than pWal->hdr.mxFrame.

	8512 */

	8513 nByte = (int)((char )aHash - (char )&aPgno[iLimit+1]);

	8514 memset((void *)&aPgno[iLimit+1], 0, nByte);

	8515

	8516 #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT

	8517 /* Verify that the every entry in the mapping region is still reachable

	8518 ** via the hash table even after the cleanup.

	8519 */

	8520 if( iLimit ){

	8521 int j; /* Loop counter */

	8522 int iKey; /* Hash key */

	8523 for(j=1; j<=iLimit; j++){

	8524 for(iKey=walHash(aPgno[j]); aHash[iKey]; iKey=walNextHash(iKey)){

	8525 if( aHash[iKey]==j ) break;

	8526 }

	8527 assert( aHash[iKey]==j );

	8528 }

	8529 }

	8530 #endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */

	8531 }

	8532

	8533

	8534 /*

	8535 ** Set an entry in the wal-index that will map database page number

	8536 ** pPage into WAL frame iFrame.

	8537 */

	8538 static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){

	8539 int rc; /* Return code */

	8540 u32 iZero = 0; /* One less than frame number of aPgno[1] */

	8541 volatile u32 aPgno = 0; / Page number array */

	8542 volatile ht_slot aHash = 0; / Hash table */

	8543

	8544 rc = walHashGet(pWal, walFramePage(iFrame), &aHash, &aPgno, &iZero);

	8545

	8546 /* Assuming the wal-index file was successfully mapped, populate the

	8547 ** page number array and hash table entry.

	8548 */

	8549 if( rc==SQLITE_OK ){

	8550 int iKey; /* Hash table key */

	8551 int idx; /* Value to write to hash-table slot */

	8552 int nCollide; /* Number of hash collisions */

	8553

	8554 idx = iFrame - iZero;

	8555 assert( idx <= HASHTABLE_NSLOT/2 + 1 );

	8556

	8557 /* If this is the first entry to be added to this hash-table, zero the

	8558 ** entire hash table and aPgno[] array before proceeding.

	8559 */

	8560 if( idx==1 ){

	8561 int nByte = (int)((u8 )&aHash[HASHTABLE_NSLOT] - (u8 )&aPgno[1]);

	8562 memset((void*)&aPgno[1], 0, nByte);

	8563 }

	8564

	8565 /* If the entry in aPgno[] is already set, then the previous writer

	8566 ** must have exited unexpectedly in the middle of a transaction (after

	8567 ** writing one or more dirty pages to the WAL to free up memory).

	8568 ** Remove the remnants of that writers uncommitted transaction from

	8569 ** the hash-table before writing any new entries.

	8570 */

	8571 if( aPgno[idx] ){

	8572 walCleanupHash(pWal);

	8573 assert( !aPgno[idx] );

	8574 }

	8575

	8576 /* Write the aPgno[] array entry and the hash-table slot. */

	8577 nCollide = idx;

	8578 for(iKey=walHash(iPage); aHash[iKey]; iKey=walNextHash(iKey)){

	8579 if( (nCollide--)==0 ) return SQLITE_CORRUPT_BKPT;

	8580 }

	8581 aPgno[idx] = iPage;

	8582 aHash[iKey] = (ht_slot)idx;

	8583

	8584 #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT

	8585 /* Verify that the number of entries in the hash table exactly equals

	8586 ** the number of entries in the mapping region.

	8587 */

	8588 {

	8589 int i; /* Loop counter */

	8590 int nEntry = 0; /* Number of entries in the hash table */

	8591 for(i=0; i<HASHTABLE_NSLOT; i++){ if( aHash[i] ) nEntry++; }

	8592 assert( nEntry==idx );

	8593 }

	8594

	8595 /* Verify that the every entry in the mapping region is reachable

	8596 ** via the hash table. This turns out to be a really, really expensive

	8597 ** thing to check, so only do this occasionally - not on every

	8598 ** iteration.

	8599 */

	8600 if( (idx&0x3ff)==0 ){

	8601 int i; /* Loop counter */

	8602 for(i=1; i<=idx; i++){

	8603 for(iKey=walHash(aPgno[i]); aHash[iKey]; iKey=walNextHash(iKey)){

	8604 if( aHash[iKey]==i ) break;

	8605 }

	8606 assert( aHash[iKey]==i );

	8607 }

	8608 }

	8609 #endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */

	8610 }

	8611

	8612

	8613 return rc;

	8614 }

	8615

	8616

	8617 /*

	8618 ** Recover the wal-index by reading the write-ahead log file.

	8619 **

	8620 ** This routine first tries to establish an exclusive lock on the

	8621 ** wal-index to prevent other threads/processes from doing anything

	8622 ** with the WAL or wal-index while recovery is running. The

	8623 ** WAL_RECOVER_LOCK is also held so that other threads will know

	8624 ** that this thread is running recovery. If unable to establish

	8625 ** the necessary locks, this routine returns SQLITE_BUSY.

	8626 */

	8627 static int walIndexRecover(Wal *pWal){

	8628 int rc; /* Return Code */

	8629 i64 nSize; /* Size of log file */

	8630 u32 aFrameCksum[2] = {0, 0};

	8631 int iLock; /* Lock offset to lock for checkpoint */

	8632 int nLock; /* Number of locks to hold */

	8633

	8634 /* Obtain an exclusive lock on all byte in the locking range not already

	8635 ** locked by the caller. The caller is guaranteed to have locked the

	8636 ** WAL_WRITE_LOCK byte, and may have also locked the WAL_CKPT_LOCK byte.

	8637 ** If successful, the same bytes that are locked here are unlocked before

	8638 ** this function returns.

	8639 */

	8640 assert( pWal->ckptLock==1 \|\| pWal->ckptLock==0 );

	8641 assert( WAL_ALL_BUT_WRITE==WAL_WRITE_LOCK+1 );

	8642 assert( WAL_CKPT_LOCK==WAL_ALL_BUT_WRITE );

	8643 assert( pWal->writeLock );

	8644 iLock = WAL_ALL_BUT_WRITE + pWal->ckptLock;

	8645 nLock = SQLITE_SHM_NLOCK - iLock;

	8646 rc = walLockExclusive(pWal, iLock, nLock);

	8647 if( rc ){

	8648 return rc;

	8649 }

	8650 WALTRACE(("WAL%p: recovery begin...\n", pWal));

	8651

	8652 memset(&pWal->hdr, 0, sizeof(WalIndexHdr));

	8653

	8654 rc = sqlite3OsFileSize(pWal->pWalFd, &nSize);

	8655 if( rc!=SQLITE_OK ){

	8656 goto recovery_error;

	8657 }

	8658

	8659 if( nSize>WAL_HDRSIZE ){

	8660 u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */

	8661 u8 aFrame = 0; / Malloc'd buffer to load entire frame */

	8662 int szFrame; /* Number of bytes in buffer aFrame[] */

	8663 u8 aData; / Pointer to data part of aFrame buffer */

	8664 int iFrame; /* Index of last frame read */

	8665 i64 iOffset; /* Next offset to read from log file */

	8666 int szPage; /* Page size according to the log */

	8667 u32 magic; /* Magic value read from WAL header */

	8668 u32 version; /* Magic value read from WAL header */

	8669 int isValid; /* True if this frame is valid */

	8670

	8671 /* Read in the WAL header. */

	8672 rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0);

	8673 if( rc!=SQLITE_OK ){

	8674 goto recovery_error;

	8675 }

	8676

	8677 /* If the database page size is not a power of two, or is greater than

	8678 ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid

	8679 ** data. Similarly, if the 'magic' value is invalid, ignore the whole

	8680 ** WAL file.

	8681 */

	8682 magic = sqlite3Get4byte(&aBuf[0]);

	8683 szPage = sqlite3Get4byte(&aBuf[8]);

	8684 if( (magic&0xFFFFFFFE)!=WAL_MAGIC

	8685 \|\| szPage&(szPage-1)

	8686 \|\| szPage>SQLITE_MAX_PAGE_SIZE

	8687 \|\| szPage<512

	8688 ){

	8689 goto finished;

	8690 }

	8691 pWal->hdr.bigEndCksum = (u8)(magic&0x00000001);

	8692 pWal->szPage = szPage;

	8693 pWal->nCkpt = sqlite3Get4byte(&aBuf[12]);

	8694 memcpy(&pWal->hdr.aSalt, &aBuf[16], 8);

	8695

	8696 /* Verify that the WAL header checksum is correct */

	8697 walChecksumBytes(pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN,

	8698 aBuf, WAL_HDRSIZE-2*4, 0, pWal->hdr.aFrameCksum

	8699 );

	8700 if( pWal->hdr.aFrameCksum[0]!=sqlite3Get4byte(&aBuf[24])

	8701 \|\| pWal->hdr.aFrameCksum[1]!=sqlite3Get4byte(&aBuf[28])

	8702 ){

	8703 goto finished;

	8704 }

	8705

	8706 /* Verify that the version number on the WAL format is one that

	8707 ** are able to understand */

	8708 version = sqlite3Get4byte(&aBuf[4]);

	8709 if( version!=WAL_MAX_VERSION ){

	8710 rc = SQLITE_CANTOPEN_BKPT;

	8711 goto finished;

	8712 }

	8713

	8714 /* Malloc a buffer to read frames into. */

	8715 szFrame = szPage + WAL_FRAME_HDRSIZE;

	8716 aFrame = (u8 *)sqlite3_malloc64(szFrame);

	8717 if( !aFrame ){

	8718 rc = SQLITE_NOMEM;

	8719 goto recovery_error;

	8720 }

	8721 aData = &aFrame[WAL_FRAME_HDRSIZE];

	8722

	8723 /* Read all frames from the log file. */

	8724 iFrame = 0;

	8725 for(iOffset=WAL_HDRSIZE; (iOffset+szFrame)<=nSize; iOffset+=szFrame){

	8726 u32 pgno; /* Database page number for frame */

	8727 u32 nTruncate; /* dbsize field from frame header */

	8728

	8729 /* Read and decode the next log frame. */

	8730 iFrame++;

	8731 rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset);

	8732 if( rc!=SQLITE_OK ) break;

	8733 isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame);

	8734 if( !isValid ) break;

	8735 rc = walIndexAppend(pWal, iFrame, pgno);

	8736 if( rc!=SQLITE_OK ) break;

	8737

	8738 /* If nTruncate is non-zero, this is a commit record. */

	8739 if( nTruncate ){

	8740 pWal->hdr.mxFrame = iFrame;

	8741 pWal->hdr.nPage = nTruncate;

	8742 pWal->hdr.szPage = (u16)((szPage&0xff00) \| (szPage>>16));

	8743 testcase( szPage<=32768 );

	8744 testcase( szPage>=65536 );

	8745 aFrameCksum[0] = pWal->hdr.aFrameCksum[0];

	8746 aFrameCksum[1] = pWal->hdr.aFrameCksum[1];

	8747 }

	8748 }

	8749

	8750 sqlite3_free(aFrame);

	8751 }

	8752

	8753 finished:

	8754 if( rc==SQLITE_OK ){

	8755 volatile WalCkptInfo *pInfo;

	8756 int i;

	8757 pWal->hdr.aFrameCksum[0] = aFrameCksum[0];

	8758 pWal->hdr.aFrameCksum[1] = aFrameCksum[1];

	8759 walIndexWriteHdr(pWal);

	8760

	8761 /* Reset the checkpoint-header. This is safe because this thread is

	8762 ** currently holding locks that exclude all other readers, writers and

	8763 ** checkpointers.

	8764 */

	8765 pInfo = walCkptInfo(pWal);

	8766 pInfo->nBackfill = 0;

	8767 pInfo->nBackfillAttempted = pWal->hdr.mxFrame;

	8768 pInfo->aReadMark[0] = 0;

	8769 for(i=1; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;

	8770 if( pWal->hdr.mxFrame ) pInfo->aReadMark[1] = pWal->hdr.mxFrame;

	8771

	8772 /* If more than one frame was recovered from the log file, report an

	8773 ** event via sqlite3_log(). This is to help with identifying performance

	8774 ** problems caused by applications routinely shutting down without

	8775 ** checkpointing the log file.

	8776 */

	8777 if( pWal->hdr.nPage ){

	8778 sqlite3_log(SQLITE_NOTICE_RECOVER_WAL,

	8779 "recovered %d frames from WAL file %s",

	8780 pWal->hdr.mxFrame, pWal->zWalName

	8781 );

	8782 }

	8783 }

	8784

	8785 recovery_error:

	8786 WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok"));

	8787 walUnlockExclusive(pWal, iLock, nLock);

	8788 return rc;

	8789 }

	8790

	8791 /*

	8792 ** Close an open wal-index.

	8793 */

	8794 static void walIndexClose(Wal *pWal, int isDelete){

	8795 if( pWal->exclusiveMode==WAL_HEAPMEMORY_MODE ){

	8796 int i;

	8797 for(i=0; i<pWal->nWiData; i++){

	8798 sqlite3_free((void *)pWal->apWiData[i]);

	8799 pWal->apWiData[i] = 0;

	8800 }

	8801 }else{

	8802 sqlite3OsShmUnmap(pWal->pDbFd, isDelete);

	8803 }

	8804 }

	8805

	8806 /*

	8807 ** Open a connection to the WAL file zWalName. The database file must

	8808 ** already be opened on connection pDbFd. The buffer that zWalName points

	8809 ** to must remain valid for the lifetime of the returned Wal* handle.

	8810 **

	8811 ** A SHARED lock should be held on the database file when this function

	8812 ** is called. The purpose of this SHARED lock is to prevent any other

	8813 ** client from unlinking the WAL or wal-index file. If another process

	8814 ** were to do this just after this client opened one of these files, the

	8815 ** system would be badly broken.

	8816 **

	8817 ** If the log file is successfully opened, SQLITE_OK is returned and

	8818 ** *ppWal is set to point to a new WAL handle. If an error occurs,

	8819 ** an SQLite error code is returned and *ppWal is left unmodified.

	8820 */

	8821 SQLITE_PRIVATE int sqlite3WalOpen(

	8822 sqlite3_vfs pVfs, / vfs module to open wal and wal-index */

	8823 sqlite3_file pDbFd, / The open database file */

	8824 const char zWalName, / Name of the WAL file */

	8825 int bNoShm, /* True to run in heap-memory mode */

	8826 i64 mxWalSize, /* Truncate WAL to this size on reset */

	8827 Wal *ppWal / OUT: Allocated Wal handle */

	8828 ){

	8829 int rc; /* Return Code */

	8830 Wal pRet; / Object to allocate and return */

	8831 int flags; /* Flags passed to OsOpen() */

	8832

	8833 assert( zWalName && zWalName[0] );

	8834 assert( pDbFd );

	8835

	8836 /* In the amalgamation, the os_unix.c and os_win.c source files come before

	8837 ** this source file. Verify that the #defines of the locking byte offsets

	8838 ** in os_unix.c and os_win.c agree with the WALINDEX_LOCK_OFFSET value.

	8839 ** For that matter, if the lock offset ever changes from its initial design

	8840 ** value of 120, we need to know that so there is an assert() to check it.

	8841 */

	8842 assert( 120==WALINDEX_LOCK_OFFSET );

	8843 assert( 136==WALINDEX_HDR_SIZE );

	8844 #ifdef WIN_SHM_BASE

	8845 assert( WIN_SHM_BASE==WALINDEX_LOCK_OFFSET );

	8846 #endif

	8847 #ifdef UNIX_SHM_BASE

	8848 assert( UNIX_SHM_BASE==WALINDEX_LOCK_OFFSET );

	8849 #endif

	8850

	8851

	8852 /* Allocate an instance of struct Wal to return. */

	8853 *ppWal = 0;

	8854 pRet = (Wal*)sqlite3MallocZero(sizeof(Wal) + pVfs->szOsFile);

	8855 if( !pRet ){

	8856 return SQLITE_NOMEM;

	8857 }

	8858

	8859 pRet->pVfs = pVfs;

	8860 pRet->pWalFd = (sqlite3_file *)&pRet[1];

	8861 pRet->pDbFd = pDbFd;

	8862 pRet->readLock = -1;

	8863 pRet->mxWalSize = mxWalSize;

	8864 pRet->zWalName = zWalName;

	8865 pRet->syncHeader = 1;

	8866 pRet->padToSectorBoundary = 1;

	8867 pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE);

	8868

	8869 /* Open file handle on the write-ahead log file. */

	8870 flags = (SQLITE_OPEN_READWRITE\|SQLITE_OPEN_CREATE\|SQLITE_OPEN_WAL);

	8871 rc = sqlite3OsOpen(pVfs, zWalName, pRet->pWalFd, flags, &flags);

	8872 if( rc==SQLITE_OK && flags&SQLITE_OPEN_READONLY ){

	8873 pRet->readOnly = WAL_RDONLY;

	8874 }

	8875

	8876 if( rc!=SQLITE_OK ){

	8877 walIndexClose(pRet, 0);

	8878 sqlite3OsClose(pRet->pWalFd);

	8879 sqlite3_free(pRet);

	8880 }else{

	8881 int iDC = sqlite3OsDeviceCharacteristics(pDbFd);

	8882 if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->syncHeader = 0; }

	8883 if( iDC & SQLITE_IOCAP_POWERSAFE_OVERWRITE ){

	8884 pRet->padToSectorBoundary = 0;

	8885 }

	8886 *ppWal = pRet;

	8887 WALTRACE(("WAL%d: opened\n", pRet));

	8888 }

	8889 return rc;

	8890 }

	8891

	8892 /*

	8893 ** Change the size to which the WAL file is trucated on each reset.

	8894 */

	8895 SQLITE_PRIVATE void sqlite3WalLimit(Wal *pWal, i64 iLimit){

	8896 if( pWal ) pWal->mxWalSize = iLimit;

	8897 }

	8898

	8899 /*

	8900 ** Find the smallest page number out of all pages held in the WAL that

	8901 ** has not been returned by any prior invocation of this method on the

	8902 ** same WalIterator object. Write into *piFrame the frame index where

	8903 ** that page was last written into the WAL. Write into *piPage the page

	8904 ** number.

	8905 **

	8906 ** Return 0 on success. If there are no pages in the WAL with a page

	8907 ** number larger than *piPage, then return 1.

	8908 */

	8909 static int walIteratorNext(

	8910 WalIterator p, / Iterator */

	8911 u32 piPage, / OUT: The page number of the next page */

	8912 u32 piFrame / OUT: Wal frame index of next page */

	8913 ){

	8914 u32 iMin; /* Result pgno must be greater than iMin */

	8915 u32 iRet = 0xFFFFFFFF; /* 0xffffffff is never a valid page number */

	8916 int i; /* For looping through segments */

	8917

	8918 iMin = p->iPrior;

	8919 assert( iMin<0xffffffff );

	8920 for(i=p->nSegment-1; i>=0; i--){

	8921 struct WalSegment *pSegment = &p->aSegment[i];

	8922 while( pSegment->iNext<pSegment->nEntry ){

	8923 u32 iPg = pSegment->aPgno[pSegment->aIndex[pSegment->iNext]];

	8924 if( iPg>iMin ){

	8925 if( iPg<iRet ){

	8926 iRet = iPg;

	8927 *piFrame = pSegment->iZero + pSegment->aIndex[pSegment->iNext];

	8928 }

	8929 break;

	8930 }

	8931 pSegment->iNext++;

	8932 }

	8933 }

	8934

	8935 *piPage = p->iPrior = iRet;

	8936 return (iRet==0xFFFFFFFF);

	8937 }

	8938

	8939 /*

	8940 ** This function merges two sorted lists into a single sorted list.

	8941 **

	8942 ** aLeft[] and aRight[] are arrays of indices. The sort key is

	8943 ** aContent[aLeft[]] and aContent[aRight[]]. Upon entry, the following

	8944 ** is guaranteed for all J<K:

	8945 **

	8946 ** aContent[aLeft[J]] < aContent[aLeft[K]]

	8947 ** aContent[aRight[J]] < aContent[aRight[K]]

	8948 **

	8949 ** This routine overwrites aRight[] with a new (probably longer) sequence

	8950 ** of indices such that the aRight[] contains every index that appears in

	8951 ** either aLeft[] or the old aRight[] and such that the second condition

	8952 ** above is still met.

	8953 **

	8954 ** The aContent[aLeft[X]] values will be unique for all X. And the

	8955 ** aContent[aRight[X]] values will be unique too. But there might be

	8956 ** one or more combinations of X and Y such that

	8957 **

	8958 ** aLeft[X]!=aRight[Y] && aContent[aLeft[X]] == aContent[aRight[Y]]

	8959 **

	8960 ** When that happens, omit the aLeft[X] and use the aRight[Y] index.

	8961 */

	8962 static void walMerge(

	8963 const u32 aContent, / Pages in wal - keys for the sort */

	8964 ht_slot aLeft, / IN: Left hand input list */

	8965 int nLeft, /* IN: Elements in array paLeft /

	8966 ht_slot *paRight, / IN/OUT: Right hand input list */

	8967 int pnRight, / IN/OUT: Elements in paRight /

	8968 ht_slot aTmp / Temporary buffer */

	8969 ){

	8970 int iLeft = 0; /* Current index in aLeft */

	8971 int iRight = 0; /* Current index in aRight */

	8972 int iOut = 0; /* Current index in output buffer */

	8973 int nRight = *pnRight;

	8974 ht_slot aRight = paRight;

	8975

	8976 assert( nLeft>0 && nRight>0 );

	8977 while( iRight<nRight \|\| iLeft<nLeft ){

	8978 ht_slot logpage;

	8979 Pgno dbpage;

	8980

	8981 if( (iLeft<nLeft)

	8982 && (iRight>=nRight \|\| aContent[aLeft[iLeft]]<aContent[aRight[iRight]])

	8983 ){

	8984 logpage = aLeft[iLeft++];

	8985 }else{

	8986 logpage = aRight[iRight++];

	8987 }

	8988 dbpage = aContent[logpage];

	8989

	8990 aTmp[iOut++] = logpage;

	8991 if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;

	8992

	8993 assert( iLeft>=nLeft \|\| aContent[aLeft[iLeft]]>dbpage );

	8994 assert( iRight>=nRight \|\| aContent[aRight[iRight]]>dbpage );

	8995 }

	8996

	8997 *paRight = aLeft;

	8998 *pnRight = iOut;

	8999 memcpy(aLeft, aTmp, sizeof(aTmp[0])*iOut);

	9000 }

	9001

	9002 /*

	9003 ** Sort the elements in list aList using aContent[] as the sort key.

	9004 ** Remove elements with duplicate keys, preferring to keep the

	9005 ** larger aList[] values.

	9006 **

	9007 ** The aList[] entries are indices into aContent[]. The values in

	9008 ** aList[] are to be sorted so that for all J<K:

	9009 **

	9010 ** aContent[aList[J]] < aContent[aList[K]]

	9011 **

	9012 ** For any X and Y such that

	9013 **

	9014 ** aContent[aList[X]] == aContent[aList[Y]]

	9015 **

	9016 ** Keep the larger of the two values aList[X] and aList[Y] and discard

	9017 ** the smaller.

	9018 */

	9019 static void walMergesort(

	9020 const u32 aContent, / Pages in wal */

	9021 ht_slot aBuffer, / Buffer of at least pnList items to use /

	9022 ht_slot aList, / IN/OUT: List to sort */

	9023 int pnList / IN/OUT: Number of elements in aList[] */

	9024 ){

	9025 struct Sublist {

	9026 int nList; /* Number of elements in aList */

	9027 ht_slot aList; / Pointer to sub-list content */

	9028 };

	9029

	9030 const int nList = pnList; / Size of input list */

	9031 int nMerge = 0; /* Number of elements in list aMerge */

	9032 ht_slot aMerge = 0; / List to be merged */

	9033 int iList; /* Index into input list */

	9034 u32 iSub = 0; /* Index into aSub array */

	9035 struct Sublist aSub[13]; /* Array of sub-lists */

	9036

	9037 memset(aSub, 0, sizeof(aSub));

	9038 assert( nList<=HASHTABLE_NPAGE && nList>0 );

	9039 assert( HASHTABLE_NPAGE==(1<<(ArraySize(aSub)-1)) );

	9040

	9041 for(iList=0; iList<nList; iList++){

	9042 nMerge = 1;

	9043 aMerge = &aList[iList];

	9044 for(iSub=0; iList & (1<<iSub); iSub++){

	9045 struct Sublist *p;

	9046 assert( iSub<ArraySize(aSub) );

	9047 p = &aSub[iSub];

	9048 assert( p->aList && p->nList<=(1<<iSub) );

	9049 assert( p->aList==&aList[iList&~((2<<iSub)-1)] );

	9050 walMerge(aContent, p->aList, p->nList, &aMerge, &nMerge, aBuffer);

	9051 }

	9052 aSub[iSub].aList = aMerge;

	9053 aSub[iSub].nList = nMerge;

	9054 }

	9055

	9056 for(iSub++; iSub<ArraySize(aSub); iSub++){

	9057 if( nList & (1<<iSub) ){

	9058 struct Sublist *p;

	9059 assert( iSub<ArraySize(aSub) );

	9060 p = &aSub[iSub];

	9061 assert( p->nList<=(1<<iSub) );

	9062 assert( p->aList==&aList[nList&~((2<<iSub)-1)] );

	9063 walMerge(aContent, p->aList, p->nList, &aMerge, &nMerge, aBuffer);

	9064 }

	9065 }

	9066 assert( aMerge==aList );

	9067 *pnList = nMerge;

	9068

	9069 #ifdef SQLITE_DEBUG

	9070 {

	9071 int i;

	9072 for(i=1; i<*pnList; i++){

	9073 assert( aContent[aList[i]] > aContent[aList[i-1]] );

	9074 }

	9075 }

	9076 #endif

	9077 }

	9078

	9079 /*

	9080 ** Free an iterator allocated by walIteratorInit().

	9081 */

	9082 static void walIteratorFree(WalIterator *p){

	9083 sqlite3_free(p);

	9084 }

	9085

	9086 /*

	9087 ** Construct a WalInterator object that can be used to loop over all

	9088 ** pages in the WAL in ascending order. The caller must hold the checkpoint

	9089 ** lock.

	9090 **

	9091 ** On success, make *pp point to the newly allocated WalInterator object

	9092 ** return SQLITE_OK. Otherwise, return an error code. If this routine

	9093 ** returns an error, the value of *pp is undefined.

	9094 **

	9095 ** The calling routine should invoke walIteratorFree() to destroy the

	9096 ** WalIterator object when it has finished with it.

	9097 */

	9098 static int walIteratorInit(Wal pWal, WalIterator *pp){

	9099 WalIterator p; / Return value */

	9100 int nSegment; /* Number of segments to merge */

	9101 u32 iLast; /* Last frame in log */

	9102 int nByte; /* Number of bytes to allocate */

	9103 int i; /* Iterator variable */

	9104 ht_slot aTmp; / Temp space used by merge-sort */

	9105 int rc = SQLITE_OK; /* Return Code */

	9106

	9107 /* This routine only runs while holding the checkpoint lock. And

	9108 ** it only runs if there is actually content in the log (mxFrame>0).

	9109 */

	9110 assert( pWal->ckptLock && pWal->hdr.mxFrame>0 );

	9111 iLast = pWal->hdr.mxFrame;

	9112

	9113 /* Allocate space for the WalIterator object. */

	9114 nSegment = walFramePage(iLast) + 1;

	9115 nByte = sizeof(WalIterator)

	9116 + (nSegment-1)*sizeof(struct WalSegment)

	9117 + iLast*sizeof(ht_slot);

	9118 p = (WalIterator *)sqlite3_malloc64(nByte);

	9119 if( !p ){

	9120 return SQLITE_NOMEM;

	9121 }

	9122 memset(p, 0, nByte);

	9123 p->nSegment = nSegment;

	9124

	9125 /* Allocate temporary space used by the merge-sort routine. This block

	9126 ** of memory will be freed before this function returns.

	9127 */

	9128 aTmp = (ht_slot *)sqlite3_malloc64(

	9129 sizeof(ht_slot) * (iLast>HASHTABLE_NPAGE?HASHTABLE_NPAGE:iLast)

	9130 );

	9131 if( !aTmp ){

	9132 rc = SQLITE_NOMEM;

	9133 }

	9134

	9135 for(i=0; rc==SQLITE_OK && i<nSegment; i++){

	9136 volatile ht_slot *aHash;

	9137 u32 iZero;

	9138 volatile u32 *aPgno;

	9139

	9140 rc = walHashGet(pWal, i, &aHash, &aPgno, &iZero);

	9141 if( rc==SQLITE_OK ){

	9142 int j; /* Counter variable */

	9143 int nEntry; /* Number of entries in this segment */

	9144 ht_slot aIndex; / Sorted index for this segment */

	9145

	9146 aPgno++;

	9147 if( (i+1)==nSegment ){

	9148 nEntry = (int)(iLast - iZero);

	9149 }else{

	9150 nEntry = (int)((u32)aHash - (u32)aPgno);

	9151 }

	9152 aIndex = &((ht_slot *)&p->aSegment[p->nSegment])[iZero];

	9153 iZero++;

	9154

	9155 for(j=0; j<nEntry; j++){

	9156 aIndex[j] = (ht_slot)j;

	9157 }

	9158 walMergesort((u32 *)aPgno, aTmp, aIndex, &nEntry);

	9159 p->aSegment[i].iZero = iZero;

	9160 p->aSegment[i].nEntry = nEntry;

	9161 p->aSegment[i].aIndex = aIndex;

	9162 p->aSegment[i].aPgno = (u32 *)aPgno;

	9163 }

	9164 }

	9165 sqlite3_free(aTmp);

	9166

	9167 if( rc!=SQLITE_OK ){

	9168 walIteratorFree(p);

	9169 }

	9170 *pp = p;

	9171 return rc;

	9172 }

	9173

	9174 /*

	9175 ** Attempt to obtain the exclusive WAL lock defined by parameters lockIdx and

	9176 ** n. If the attempt fails and parameter xBusy is not NULL, then it is a

	9177 ** busy-handler function. Invoke it and retry the lock until either the

	9178 ** lock is successfully obtained or the busy-handler returns 0.

	9179 */

	9180 static int walBusyLock(

	9181 Wal pWal, / WAL connection */

	9182 int (xBusy)(void), /* Function to call when busy */

	9183 void pBusyArg, / Context argument for xBusyHandler */

	9184 int lockIdx, /* Offset of first byte to lock */

	9185 int n /* Number of bytes to lock */

	9186 ){

	9187 int rc;

	9188 do {

	9189 rc = walLockExclusive(pWal, lockIdx, n);

	9190 }while( xBusy && rc==SQLITE_BUSY && xBusy(pBusyArg) );

	9191 return rc;

	9192 }

	9193

	9194 /*

	9195 ** The cache of the wal-index header must be valid to call this function.

	9196 ** Return the page-size in bytes used by the database.

	9197 */

	9198 static int walPagesize(Wal *pWal){

	9199 return (pWal->hdr.szPage&0xfe00) + ((pWal->hdr.szPage&0x0001)<<16);

	9200 }

	9201

	9202 /*

	9203 ** The following is guaranteed when this function is called:

	9204 **

	9205 ** a) the WRITER lock is held,

	9206 ** b) the entire log file has been checkpointed, and

	9207 ** c) any existing readers are reading exclusively from the database

	9208 ** file - there are no readers that may attempt to read a frame from

	9209 ** the log file.

	9210 **

	9211 ** This function updates the shared-memory structures so that the next

	9212 ** client to write to the database (which may be this one) does so by

	9213 ** writing frames into the start of the log file.

	9214 **

	9215 ** The value of parameter salt1 is used as the aSalt[1] value in the

	9216 ** new wal-index header. It should be passed a pseudo-random value (i.e.

	9217 ** one obtained from sqlite3_randomness()).

	9218 */

	9219 static void walRestartHdr(Wal *pWal, u32 salt1){

	9220 volatile WalCkptInfo *pInfo = walCkptInfo(pWal);

	9221 int i; /* Loop counter */

	9222 u32 aSalt = pWal->hdr.aSalt; / Big-endian salt values */

	9223 pWal->nCkpt++;

	9224 pWal->hdr.mxFrame = 0;

	9225 sqlite3Put4byte((u8)&aSalt[0], 1 + sqlite3Get4byte((u8)&aSalt[0]));

	9226 memcpy(&pWal->hdr.aSalt[1], &salt1, 4);

	9227 walIndexWriteHdr(pWal);

	9228 pInfo->nBackfill = 0;

	9229 pInfo->nBackfillAttempted = 0;

	9230 pInfo->aReadMark[1] = 0;

	9231 for(i=2; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;

	9232 assert( pInfo->aReadMark[0]==0 );

	9233 }

	9234

	9235 /*

	9236 ** Copy as much content as we can from the WAL back into the database file

	9237 ** in response to an sqlite3_wal_checkpoint() request or the equivalent.

	9238 **

	9239 ** The amount of information copies from WAL to database might be limited

	9240 ** by active readers. This routine will never overwrite a database page

	9241 ** that a concurrent reader might be using.

	9242 **

	9243 ** All I/O barrier operations (a.k.a fsyncs) occur in this routine when

	9244 ** SQLite is in WAL-mode in synchronous=NORMAL. That means that if

	9245 ** checkpoints are always run by a background thread or background

	9246 ** process, foreground threads will never block on a lengthy fsync call.

	9247 **

	9248 ** Fsync is called on the WAL before writing content out of the WAL and

	9249 ** into the database. This ensures that if the new content is persistent

	9250 ** in the WAL and can be recovered following a power-loss or hard reset.

	9251 **

	9252 ** Fsync is also called on the database file if (and only if) the entire

	9253 ** WAL content is copied into the database file. This second fsync makes

	9254 ** it safe to delete the WAL since the new content will persist in the

	9255 ** database file.

	9256 **

	9257 ** This routine uses and updates the nBackfill field of the wal-index header.

	9258 ** This is the only routine that will increase the value of nBackfill.

	9259 ** (A WAL reset or recovery will revert nBackfill to zero, but not increase

	9260 ** its value.)

	9261 **

	9262 ** The caller must be holding sufficient locks to ensure that no other

	9263 ** checkpoint is running (in any other thread or process) at the same

	9264 ** time.

	9265 */

	9266 static int walCheckpoint(

	9267 Wal pWal, / Wal connection */

	9268 int eMode, /* One of PASSIVE, FULL or RESTART */

	9269 int (xBusy)(void), /* Function to call when busy */

	9270 void pBusyArg, / Context argument for xBusyHandler */

	9271 int sync_flags, /* Flags for OsSync() (or 0) */

	9272 u8 zBuf / Temporary buffer to use */

	9273 ){

	9274 int rc = SQLITE_OK; /* Return code */

	9275 int szPage; /* Database page-size */

	9276 WalIterator pIter = 0; / Wal iterator context */

	9277 u32 iDbpage = 0; /* Next database page to write */

	9278 u32 iFrame = 0; /* Wal frame containing data for iDbpage */

	9279 u32 mxSafeFrame; /* Max frame that can be backfilled */

	9280 u32 mxPage; /* Max database page to write */

	9281 int i; /* Loop counter */

	9282 volatile WalCkptInfo pInfo; / The checkpoint status information */

	9283

	9284 szPage = walPagesize(pWal);

	9285 testcase( szPage<=32768 );

	9286 testcase( szPage>=65536 );

	9287 pInfo = walCkptInfo(pWal);

	9288 if( pInfo->nBackfill<pWal->hdr.mxFrame ){

	9289

	9290 /* Allocate the iterator */

	9291 rc = walIteratorInit(pWal, &pIter);

	9292 if( rc!=SQLITE_OK ){

	9293 return rc;

	9294 }

	9295 assert( pIter );

	9296

	9297 /* EVIDENCE-OF: R-62920-47450 The busy-handler callback is never invoked

	9298 ** in the SQLITE_CHECKPOINT_PASSIVE mode. */

	9299 assert( eMode!=SQLITE_CHECKPOINT_PASSIVE \|\| xBusy==0 );

	9300

	9301 /* Compute in mxSafeFrame the index of the last frame of the WAL that is

	9302 ** safe to write into the database. Frames beyond mxSafeFrame might

	9303 ** overwrite database pages that are in use by active readers and thus

	9304 ** cannot be backfilled from the WAL.

	9305 */

	9306 mxSafeFrame = pWal->hdr.mxFrame;

	9307 mxPage = pWal->hdr.nPage;

	9308 for(i=1; i<WAL_NREADER; i++){

	9309 /* Thread-sanitizer reports that the following is an unsafe read,

	9310 ** as some other thread may be in the process of updating the value

	9311 ** of the aReadMark[] slot. The assumption here is that if that is

	9312 ** happening, the other client may only be increasing the value,

	9313 ** not decreasing it. So assuming either that either the "old" or

	9314 ** "new" version of the value is read, and not some arbitrary value

	9315 ** that would never be written by a real client, things are still

	9316 ** safe. */

	9317 u32 y = pInfo->aReadMark[i];

	9318 if( mxSafeFrame>y ){

	9319 assert( y<=pWal->hdr.mxFrame );

	9320 rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(i), 1);

	9321 if( rc==SQLITE_OK ){

	9322 pInfo->aReadMark[i] = (i==1 ? mxSafeFrame : READMARK_NOT_USED);

	9323 walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);

	9324 }else if( rc==SQLITE_BUSY ){

	9325 mxSafeFrame = y;

	9326 xBusy = 0;

	9327 }else{

	9328 goto walcheckpoint_out;

	9329 }

	9330 }

	9331 }

	9332

	9333 if( pInfo->nBackfill<mxSafeFrame

	9334 && (rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(0),1))==SQLITE_OK

	9335 ){

	9336 i64 nSize; /* Current size of database file */

	9337 u32 nBackfill = pInfo->nBackfill;

	9338

	9339 pInfo->nBackfillAttempted = mxSafeFrame;

	9340

	9341 /* Sync the WAL to disk */

	9342 if( sync_flags ){

	9343 rc = sqlite3OsSync(pWal->pWalFd, sync_flags);

	9344 }

	9345

	9346 /* If the database may grow as a result of this checkpoint, hint

	9347 ** about the eventual size of the db file to the VFS layer.

	9348 */

	9349 if( rc==SQLITE_OK ){

	9350 i64 nReq = ((i64)mxPage * szPage);

	9351 rc = sqlite3OsFileSize(pWal->pDbFd, &nSize);

	9352 if( rc==SQLITE_OK && nSize<nReq ){

	9353 sqlite3OsFileControlHint(pWal->pDbFd, SQLITE_FCNTL_SIZE_HINT, &nReq);

	9354 }

	9355 }

	9356

	9357

	9358 /* Iterate through the contents of the WAL, copying data to the db file */

	9359 while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){

	9360 i64 iOffset;

	9361 assert( walFramePgno(pWal, iFrame)==iDbpage );

	9362 if( iFrame<=nBackfill \|\| iFrame>mxSafeFrame \|\| iDbpage>mxPage ){

	9363 continue;

	9364 }

	9365 iOffset = walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE;

	9366 /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL file */

	9367 rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, iOffset);

	9368 if( rc!=SQLITE_OK ) break;

	9369 iOffset = (iDbpage-1)*(i64)szPage;

	9370 testcase( IS_BIG_INT(iOffset) );

	9371 rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, iOffset);

	9372 if( rc!=SQLITE_OK ) break;

	9373 }

	9374

	9375 /* If work was actually accomplished... */

	9376 if( rc==SQLITE_OK ){

	9377 if( mxSafeFrame==walIndexHdr(pWal)->mxFrame ){

	9378 i64 szDb = pWal->hdr.nPage*(i64)szPage;

	9379 testcase( IS_BIG_INT(szDb) );

	9380 rc = sqlite3OsTruncate(pWal->pDbFd, szDb);

	9381 if( rc==SQLITE_OK && sync_flags ){

	9382 rc = sqlite3OsSync(pWal->pDbFd, sync_flags);

	9383 }

	9384 }

	9385 if( rc==SQLITE_OK ){

	9386 pInfo->nBackfill = mxSafeFrame;

	9387 }

	9388 }

	9389

	9390 /* Release the reader lock held while backfilling */

	9391 walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1);

	9392 }

	9393

	9394 if( rc==SQLITE_BUSY ){

	9395 /* Reset the return code so as not to report a checkpoint failure

	9396 ** just because there are active readers. */

	9397 rc = SQLITE_OK;

	9398 }

	9399 }

	9400

	9401 /* If this is an SQLITE_CHECKPOINT_RESTART or TRUNCATE operation, and the

	9402 ** entire wal file has been copied into the database file, then block

	9403 ** until all readers have finished using the wal file. This ensures that

	9404 ** the next process to write to the database restarts the wal file.

	9405 */

	9406 if( rc==SQLITE_OK && eMode!=SQLITE_CHECKPOINT_PASSIVE ){

	9407 assert( pWal->writeLock );

	9408 if( pInfo->nBackfill<pWal->hdr.mxFrame ){

	9409 rc = SQLITE_BUSY;

	9410 }else if( eMode>=SQLITE_CHECKPOINT_RESTART ){

	9411 u32 salt1;

	9412 sqlite3_randomness(4, &salt1);

	9413 assert( pInfo->nBackfill==pWal->hdr.mxFrame );

	9414 rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(1), WAL_NREADER-1);

	9415 if( rc==SQLITE_OK ){

	9416 if( eMode==SQLITE_CHECKPOINT_TRUNCATE ){

	9417 /* IMPLEMENTATION-OF: R-44699-57140 This mode works the same way as

	9418 ** SQLITE_CHECKPOINT_RESTART with the addition that it also

	9419 ** truncates the log file to zero bytes just prior to a

	9420 ** successful return.

	9421 **

	9422 ** In theory, it might be safe to do this without updating the

	9423 ** wal-index header in shared memory, as all subsequent reader or

	9424 ** writer clients should see that the entire log file has been

	9425 ** checkpointed and behave accordingly. This seems unsafe though,

	9426 ** as it would leave the system in a state where the contents of

	9427 ** the wal-index header do not match the contents of the

	9428 ** file-system. To avoid this, update the wal-index header to

	9429 ** indicate that the log file contains zero valid frames. */

	9430 walRestartHdr(pWal, salt1);

	9431 rc = sqlite3OsTruncate(pWal->pWalFd, 0);

	9432 }

	9433 walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);

	9434 }

	9435 }

	9436 }

	9437

	9438 walcheckpoint_out:

	9439 walIteratorFree(pIter);

	9440 return rc;

	9441 }

	9442

	9443 /*

	9444 ** If the WAL file is currently larger than nMax bytes in size, truncate

	9445 ** it to exactly nMax bytes. If an error occurs while doing so, ignore it.

	9446 */

	9447 static void walLimitSize(Wal *pWal, i64 nMax){

	9448 i64 sz;

	9449 int rx;

	9450 sqlite3BeginBenignMalloc();

	9451 rx = sqlite3OsFileSize(pWal->pWalFd, &sz);

	9452 if( rx==SQLITE_OK && (sz > nMax ) ){

	9453 rx = sqlite3OsTruncate(pWal->pWalFd, nMax);

	9454 }

	9455 sqlite3EndBenignMalloc();

	9456 if( rx ){

	9457 sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName);

	9458 }

	9459 }

	9460

	9461 /*

	9462 ** Close a connection to a log file.

	9463 */

	9464 SQLITE_PRIVATE int sqlite3WalClose(

	9465 Wal pWal, / Wal to close */

	9466 int sync_flags, /* Flags to pass to OsSync() (or 0) */

	9467 int nBuf,

	9468 u8 zBuf / Buffer of at least nBuf bytes */

	9469 ){

	9470 int rc = SQLITE_OK;

	9471 if( pWal ){

	9472 int isDelete = 0; /* True to unlink wal and wal-index files */

	9473

	9474 /* If an EXCLUSIVE lock can be obtained on the database file (using the

	9475 ** ordinary, rollback-mode locking methods, this guarantees that the

	9476 ** connection associated with this log file is the only connection to

	9477 ** the database. In this case checkpoint the database and unlink both

	9478 ** the wal and wal-index files.

	9479 **

	9480 ** The EXCLUSIVE lock is not released before returning.

	9481 */

	9482 rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE);

	9483 if( rc==SQLITE_OK ){

	9484 if( pWal->exclusiveMode==WAL_NORMAL_MODE ){

	9485 pWal->exclusiveMode = WAL_EXCLUSIVE_MODE;

	9486 }

	9487 rc = sqlite3WalCheckpoint(

	9488 pWal, SQLITE_CHECKPOINT_PASSIVE, 0, 0, sync_flags, nBuf, zBuf, 0, 0

	9489 );

	9490 if( rc==SQLITE_OK ){

	9491 int bPersist = -1;

	9492 sqlite3OsFileControlHint(

	9493 pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersist

	9494 );

	9495 if( bPersist!=1 ){

	9496 /* Try to delete the WAL file if the checkpoint completed and

	9497 ** fsyned (rc==SQLITE_OK) and if we are not in persistent-wal

	9498 ** mode (!bPersist) */

	9499 isDelete = 1;

	9500 }else if( pWal->mxWalSize>=0 ){

	9501 /* Try to truncate the WAL file to zero bytes if the checkpoint

	9502 ** completed and fsynced (rc==SQLITE_OK) and we are in persistent

	9503 ** WAL mode (bPersist) and if the PRAGMA journal_size_limit is a

	9504 ** non-negative value (pWal->mxWalSize>=0). Note that we truncate

	9505 ** to zero bytes as truncating to the journal_size_limit might

	9506 ** leave a corrupt WAL file on disk. */

	9507 walLimitSize(pWal, 0);

	9508 }

	9509 }

	9510 }

	9511

	9512 walIndexClose(pWal, isDelete);

	9513 sqlite3OsClose(pWal->pWalFd);

	9514 if( isDelete ){

	9515 sqlite3BeginBenignMalloc();

	9516 sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0);

	9517 sqlite3EndBenignMalloc();

	9518 }

	9519 WALTRACE(("WAL%p: closed\n", pWal));

	9520 sqlite3_free((void *)pWal->apWiData);

	9521 sqlite3_free(pWal);

	9522 }

	9523 return rc;

	9524 }

	9525

	9526 /*

	9527 ** Try to read the wal-index header. Return 0 on success and 1 if

	9528 ** there is a problem.

	9529 **

	9530 ** The wal-index is in shared memory. Another thread or process might

	9531 ** be writing the header at the same time this procedure is trying to

	9532 ** read it, which might result in inconsistency. A dirty read is detected

	9533 ** by verifying that both copies of the header are the same and also by

	9534 ** a checksum on the header.

	9535 **

	9536 ** If and only if the read is consistent and the header is different from

	9537 ** pWal->hdr, then pWal->hdr is updated to the content of the new header

	9538 ** and *pChanged is set to 1.

	9539 **

	9540 ** If the checksum cannot be verified return non-zero. If the header

	9541 ** is read successfully and the checksum verified, return zero.

	9542 */

	9543 static int walIndexTryHdr(Wal pWal, int pChanged){

	9544 u32 aCksum[2]; /* Checksum on the header content */

	9545 WalIndexHdr h1, h2; /* Two copies of the header content */

	9546 WalIndexHdr volatile aHdr; / Header in shared memory */

	9547

	9548 /* The first page of the wal-index must be mapped at this point. */

	9549 assert( pWal->nWiData>0 && pWal->apWiData[0] );

	9550

	9551 /* Read the header. This might happen concurrently with a write to the

	9552 ** same area of shared memory on a different CPU in a SMP,

	9553 ** meaning it is possible that an inconsistent snapshot is read

	9554 ** from the file. If this happens, return non-zero.

	9555 **

	9556 ** There are two copies of the header at the beginning of the wal-index.

	9557 ** When reading, read [0] first then [1]. Writes are in the reverse order.

	9558 ** Memory barriers are used to prevent the compiler or the hardware from

	9559 ** reordering the reads and writes.

	9560 */

	9561 aHdr = walIndexHdr(pWal);

	9562 memcpy(&h1, (void *)&aHdr[0], sizeof(h1));

	9563 walShmBarrier(pWal);

	9564 memcpy(&h2, (void *)&aHdr[1], sizeof(h2));

	9565

	9566 if( memcmp(&h1, &h2, sizeof(h1))!=0 ){

	9567 return 1; /* Dirty read */

	9568 }

	9569 if( h1.isInit==0 ){

	9570 return 1; /* Malformed header - probably all zeros */

	9571 }

	9572 walChecksumBytes(1, (u8*)&h1, sizeof(h1)-sizeof(h1.aCksum), 0, aCksum);

	9573 if( aCksum[0]!=h1.aCksum[0] \|\| aCksum[1]!=h1.aCksum[1] ){

	9574 return 1; /* Checksum does not match */

	9575 }

	9576

	9577 if( memcmp(&pWal->hdr, &h1, sizeof(WalIndexHdr)) ){

	9578 *pChanged = 1;

	9579 memcpy(&pWal->hdr, &h1, sizeof(WalIndexHdr));

	9580 pWal->szPage = (pWal->hdr.szPage&0xfe00) + ((pWal->hdr.szPage&0x0001)<<16);

	9581 testcase( pWal->szPage<=32768 );

	9582 testcase( pWal->szPage>=65536 );

	9583 }

	9584

	9585 /* The header was successfully read. Return zero. */

	9586 return 0;

	9587 }

	9588

	9589 /*

	9590 ** Read the wal-index header from the wal-index and into pWal->hdr.

	9591 ** If the wal-header appears to be corrupt, try to reconstruct the

	9592 ** wal-index from the WAL before returning.

	9593 **

	9594 ** Set *pChanged to 1 if the wal-index header value in pWal->hdr is

	9595 ** changed by this operation. If pWal->hdr is unchanged, set *pChanged

	9596 ** to 0.

	9597 **

	9598 ** If the wal-index header is successfully read, return SQLITE_OK.

	9599 ** Otherwise an SQLite error code.

	9600 */

	9601 static int walIndexReadHdr(Wal pWal, int pChanged){

	9602 int rc; /* Return code */

	9603 int badHdr; /* True if a header read failed */

	9604 volatile u32 page0; / Chunk of wal-index containing header */

	9605

	9606 /* Ensure that page 0 of the wal-index (the page that contains the

	9607 ** wal-index header) is mapped. Return early if an error occurs here.

	9608 */

	9609 assert( pChanged );

	9610 rc = walIndexPage(pWal, 0, &page0);

	9611 if( rc!=SQLITE_OK ){

	9612 return rc;

	9613 };

	9614 assert( page0 \|\| pWal->writeLock==0 );

	9615

	9616 /* If the first page of the wal-index has been mapped, try to read the

	9617 ** wal-index header immediately, without holding any lock. This usually

	9618 ** works, but may fail if the wal-index header is corrupt or currently

	9619 ** being modified by another thread or process.

	9620 */

	9621 badHdr = (page0 ? walIndexTryHdr(pWal, pChanged) : 1);

	9622

	9623 /* If the first attempt failed, it might have been due to a race

	9624 ** with a writer. So get a WRITE lock and try again.

	9625 */

	9626 assert( badHdr==0 \|\| pWal->writeLock==0 );

	9627 if( badHdr ){

	9628 if( pWal->readOnly & WAL_SHM_RDONLY ){

	9629 if( SQLITE_OK==(rc = walLockShared(pWal, WAL_WRITE_LOCK)) ){

	9630 walUnlockShared(pWal, WAL_WRITE_LOCK);

	9631 rc = SQLITE_READONLY_RECOVERY;

	9632 }

	9633 }else if( SQLITE_OK==(rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1)) ){

	9634 pWal->writeLock = 1;

	9635 if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){

	9636 badHdr = walIndexTryHdr(pWal, pChanged);

	9637 if( badHdr ){

	9638 /* If the wal-index header is still malformed even while holding

	9639 ** a WRITE lock, it can only mean that the header is corrupted and

	9640 ** needs to be reconstructed. So run recovery to do exactly that.

	9641 */

	9642 rc = walIndexRecover(pWal);

	9643 *pChanged = 1;

	9644 }

	9645 }

	9646 pWal->writeLock = 0;

	9647 walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);

	9648 }

	9649 }

	9650

	9651 /* If the header is read successfully, check the version number to make

	9652 ** sure the wal-index was not constructed with some future format that

	9653 ** this version of SQLite cannot understand.

	9654 */

	9655 if( badHdr==0 && pWal->hdr.iVersion!=WALINDEX_MAX_VERSION ){

	9656 rc = SQLITE_CANTOPEN_BKPT;

	9657 }

	9658

	9659 return rc;

	9660 }

	9661

	/*
	** This is the value that walTryBeginRead returns when it needs to
	** be retried.
	*/
	#define WAL_RETRY  (-1)

	9667

	9668 /*

	9669 ** Attempt to start a read transaction. This might fail due to a race or

	9670 ** other transient condition. When that happens, it returns WAL_RETRY to

	9671 ** indicate to the caller that it is safe to retry immediately.

	9672 **

	9673 ** On success return SQLITE_OK. On a permanent failure (such an

	9674 ** I/O error or an SQLITE_BUSY because another process is running

	9675 ** recovery) return a positive error code.

	9676 **

	9677 ** The useWal parameter is true to force the use of the WAL and disable

	9678 ** the case where the WAL is bypassed because it has been completely

	9679 ** checkpointed. If useWal==0 then this routine calls walIndexReadHdr()

	9680 ** to make a copy of the wal-index header into pWal->hdr. If the

	9681 ** wal-index header has changed, *pChanged is set to 1 (as an indication

	9682 ** to the caller that the local page cache is obsolete and needs to be

	9683 ** flushed.) When useWal==1, the wal-index header is assumed to already

	9684 ** be loaded and the pChanged parameter is unused.

	9685 **

	9686 ** The caller must set the cnt parameter to the number of prior calls to

	9687 ** this routine during the current read attempt that returned WAL_RETRY.

	9688 ** This routine will start taking more aggressive measures to clear the

	9689 ** race conditions after multiple WAL_RETRY returns, and after an excessive

	9690 ** number of errors will ultimately return SQLITE_PROTOCOL. The

	9691 ** SQLITE_PROTOCOL return indicates that some other process has gone rogue

	9692 ** and is not honoring the locking protocol. There is a vanishingly small

	9693 ** chance that SQLITE_PROTOCOL could be returned because of a run of really

	9694 ** bad luck when there is lots of contention for the wal-index, but that

	9695 ** possibility is so small that it can be safely neglected, we believe.

	9696 **

	9697 ** On success, this routine obtains a read lock on

	9698 ** WAL_READ_LOCK(pWal->readLock). The pWal->readLock integer is

	9699 ** in the range 0 <= pWal->readLock < WAL_NREADER. If pWal->readLock==(-1)

	9700 ** that means the Wal does not hold any read lock. The reader must not

	9701 ** access any database page that is modified by a WAL frame up to and

	9702 ** including frame number aReadMark[pWal->readLock]. The reader will

	9703 ** use WAL frames up to and including pWal->hdr.mxFrame if pWal->readLock>0

	9704 ** Or if pWal->readLock==0, then the reader will ignore the WAL

	9705 ** completely and get all content directly from the database file.

	9706 ** If the useWal parameter is 1 then the WAL will never be ignored and

	9707 ** this routine will always set pWal->readLock>0 on success.

	9708 ** When the read transaction is completed, the caller must release the

	9709 ** lock on WAL_READ_LOCK(pWal->readLock) and set pWal->readLock to -1.

	9710 **

	9711 ** This routine uses the nBackfill and aReadMark[] fields of the header

	9712 ** to select a particular WAL_READ_LOCK() that strives to let the

	9713 ** checkpoint process do as much work as possible. This routine might

	9714 ** update values of the aReadMark[] array in the header, but if it does

	9715 ** so it takes care to hold an exclusive lock on the corresponding

	9716 ** WAL_READ_LOCK() while changing values.

	9717 */

	9718 static int walTryBeginRead(Wal pWal, int pChanged, int useWal, int cnt){

	9719 volatile WalCkptInfo pInfo; / Checkpoint information in wal-index */

	9720 u32 mxReadMark; /* Largest aReadMark[] value */

	9721 int mxI; /* Index of largest aReadMark[] value */

	9722 int i; /* Loop counter */

	9723 int rc = SQLITE_OK; /* Return code */

	9724 u32 mxFrame; /* Wal frame to lock to */

	9725

	9726 assert( pWal->readLock<0 ); /* Not currently locked */

	9727

	9728 /* Take steps to avoid spinning forever if there is a protocol error.

	9729 **

	9730 ** Circumstances that cause a RETRY should only last for the briefest

	9731 ** instances of time. No I/O or other system calls are done while the

	9732 ** locks are held, so the locks should not be held for very long. But

	9733 ** if we are unlucky, another process that is holding a lock might get

	9734 ** paged out or take a page-fault that is time-consuming to resolve,

	9735 ** during the few nanoseconds that it is holding the lock. In that case,

	9736 ** it might take longer than normal for the lock to free.

	9737 **

	9738 ** After 5 RETRYs, we begin calling sqlite3OsSleep(). The first few

	9739 ** calls to sqlite3OsSleep() have a delay of 1 microsecond. Really this

	9740 ** is more of a scheduler yield than an actual delay. But on the 10th

	9741 ** an subsequent retries, the delays start becoming longer and longer,

	9742 ** so that on the 100th (and last) RETRY we delay for 323 milliseconds.

	9743 ** The total delay time before giving up is less than 10 seconds.

	9744 */

	9745 if( cnt>5 ){

	9746 int nDelay = 1; /* Pause time in microseconds */

	9747 if( cnt>100 ){

	9748 VVA_ONLY( pWal->lockError = 1; )

	9749 return SQLITE_PROTOCOL;

	9750 }

	9751 if( cnt>=10 ) nDelay = (cnt-9)(cnt-9)39;

	9752 sqlite3OsSleep(pWal->pVfs, nDelay);

	9753 }

	9754

	9755 if( !useWal ){

	9756 rc = walIndexReadHdr(pWal, pChanged);

	9757 if( rc==SQLITE_BUSY ){

	9758 /* If there is not a recovery running in another thread or process

	9759 ** then convert BUSY errors to WAL_RETRY. If recovery is known to

	9760 ** be running, convert BUSY to BUSY_RECOVERY. There is a race here

	9761 ** which might cause WAL_RETRY to be returned even if BUSY_RECOVERY

	9762 ** would be technically correct. But the race is benign since with

	9763 ** WAL_RETRY this routine will be called again and will probably be

	9764 ** right on the second iteration.

	9765 */

	9766 if( pWal->apWiData[0]==0 ){

	9767 /* This branch is taken when the xShmMap() method returns SQLITE_BUSY.

	9768 ** We assume this is a transient condition, so return WAL_RETRY. The

	9769 ** xShmMap() implementation used by the default unix and win32 VFS

	9770 ** modules may return SQLITE_BUSY due to a race condition in the

	9771 ** code that determines whether or not the shared-memory region

	9772 ** must be zeroed before the requested page is returned.

	9773 */

	9774 rc = WAL_RETRY;

	9775 }else if( SQLITE_OK==(rc = walLockShared(pWal, WAL_RECOVER_LOCK)) ){

	9776 walUnlockShared(pWal, WAL_RECOVER_LOCK);

	9777 rc = WAL_RETRY;

	9778 }else if( rc==SQLITE_BUSY ){

	9779 rc = SQLITE_BUSY_RECOVERY;

	9780 }

	9781 }

	9782 if( rc!=SQLITE_OK ){

	9783 return rc;

	9784 }

	9785 }

	9786

	9787 pInfo = walCkptInfo(pWal);

	9788 if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame

	9789 #ifdef SQLITE_ENABLE_SNAPSHOT

	9790 && (pWal->pSnapshot==0 \|\| pWal->hdr.mxFrame==0

	9791 \|\| 0==memcmp(&pWal->hdr, pWal->pSnapshot, sizeof(WalIndexHdr)))

	9792 #endif

	9793 ){

	9794 /* The WAL has been completely backfilled (or it is empty)

	9795 ** and can be safely ignored.

	9796 */

	9797 rc = walLockShared(pWal, WAL_READ_LOCK(0));

	9798 walShmBarrier(pWal);

	9799 if( rc==SQLITE_OK ){

	9800 if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){

	9801 /* It is not safe to allow the reader to continue here if frames

	9802 ** may have been appended to the log before READ_LOCK(0) was obtained.

	9803 ** When holding READ_LOCK(0), the reader ignores the entire log file,

	9804 ** which implies that the database file contains a trustworthy

	9805 ** snapshot. Since holding READ_LOCK(0) prevents a checkpoint from

	9806 ** happening, this is usually correct.

	9807 **

	9808 ** However, if frames have been appended to the log (or if the log

	9809 ** is wrapped and written for that matter) before the READ_LOCK(0)

	9810 ** is obtained, that is not necessarily true. A checkpointer may

	9811 ** have started to backfill the appended frames but crashed before

	9812 ** it finished. Leaving a corrupt image in the database file.

	9813 */

	9814 walUnlockShared(pWal, WAL_READ_LOCK(0));

	9815 return WAL_RETRY;

	9816 }

	9817 pWal->readLock = 0;

	9818 return SQLITE_OK;

	9819 }else if( rc!=SQLITE_BUSY ){

	9820 return rc;

	9821 }

	9822 }

	9823

	9824 /* If we get this far, it means that the reader will want to use

	9825 ** the WAL to get at content from recent commits. The job now is

	9826 ** to select one of the aReadMark[] entries that is closest to

	9827 ** but not exceeding pWal->hdr.mxFrame and lock that entry.

	9828 */

	9829 mxReadMark = 0;

	9830 mxI = 0;

	9831 mxFrame = pWal->hdr.mxFrame;

	9832 #ifdef SQLITE_ENABLE_SNAPSHOT

	9833 if( pWal->pSnapshot && pWal->pSnapshot->mxFrame<mxFrame ){

	9834 mxFrame = pWal->pSnapshot->mxFrame;

	9835 }

	9836 #endif

	9837 for(i=1; i<WAL_NREADER; i++){

	9838 u32 thisMark = pInfo->aReadMark[i];

	9839 if( mxReadMark<=thisMark && thisMark<=mxFrame ){

	9840 assert( thisMark!=READMARK_NOT_USED );

	9841 mxReadMark = thisMark;

	9842 mxI = i;

	9843 }

	9844 }

	9845 if( (pWal->readOnly & WAL_SHM_RDONLY)==0

	9846 && (mxReadMark<mxFrame \|\| mxI==0)

	9847 ){

	9848 for(i=1; i<WAL_NREADER; i++){

	9849 rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);

	9850 if( rc==SQLITE_OK ){

	9851 mxReadMark = pInfo->aReadMark[i] = mxFrame;

	9852 mxI = i;

	9853 walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);

	9854 break;

	9855 }else if( rc!=SQLITE_BUSY ){

	9856 return rc;

	9857 }

	9858 }

	9859 }

	9860 if( mxI==0 ){

	9861 assert( rc==SQLITE_BUSY \|\| (pWal->readOnly & WAL_SHM_RDONLY)!=0 );

	9862 return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTLOCK;

	9863 }

	9864

	9865 rc = walLockShared(pWal, WAL_READ_LOCK(mxI));

	9866 if( rc ){

	9867 return rc==SQLITE_BUSY ? WAL_RETRY : rc;

	9868 }

	9869 /* Now that the read-lock has been obtained, check that neither the

	9870 ** value in the aReadMark[] array or the contents of the wal-index

	9871 ** header have changed.

	9872 **

	9873 ** It is necessary to check that the wal-index header did not change

	9874 ** between the time it was read and when the shared-lock was obtained

	9875 ** on WAL_READ_LOCK(mxI), to account for the possibility

	9876 ** that the log file may have been wrapped by a writer, or that frames

	9877 ** that occur later in the log than pWal->hdr.mxFrame may have been

	9878 ** copied into the database by a checkpointer. If either of these things

	9879 ** happened, then reading the database with the current value of

	9880 ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry

	9881 ** instead.

	9882 **

	9883 ** Before checking that the live wal-index header has not changed

	9884 ** since it was read, set Wal.minFrame to the first frame in the wal

	9885 ** file that has not yet been checkpointed. This client will not need

	9886 ** to read any frames earlier than minFrame from the wal file - they

	9887 ** can be safely read directly from the database file.

	9888 **

	9889 ** Because a ShmBarrier() call is made between taking the copy of

	9890 ** nBackfill and checking that the wal-header in shared-memory still

	9891 ** matches the one cached in pWal->hdr, it is guaranteed that the

	9892 ** checkpointer that set nBackfill was not working with a wal-index

	9893 ** header newer than that cached in pWal->hdr. If it were, that could

	9894 ** cause a problem. The checkpointer could omit to checkpoint

	9895 ** a version of page X that lies before pWal->minFrame (call that version

	9896 ** A) on the basis that there is a newer version (version B) of the same

	9897 ** page later in the wal file. But if version B happens to lie past

	9898 ** frame pWal->hdr.mxFrame - then the client would incorrectly assume

	9899 ** that it can read version A from the database file. However, since

	9900 ** we can guarantee that the checkpointer that set nBackfill could not

	9901 ** see any pages past pWal->hdr.mxFrame, this problem does not come up.

	9902 */

	9903 pWal->minFrame = pInfo->nBackfill+1;

	9904 walShmBarrier(pWal);

	9905 if( pInfo->aReadMark[mxI]!=mxReadMark

	9906 \|\| memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr))

	9907 ){

	9908 walUnlockShared(pWal, WAL_READ_LOCK(mxI));

	9909 return WAL_RETRY;

	9910 }else{

	9911 assert( mxReadMark<=pWal->hdr.mxFrame );

	9912 pWal->readLock = (i16)mxI;

	9913 }

	9914 return rc;

	9915 }

	9916

	9917 /*

	9918 ** Begin a read transaction on the database.

	9919 **

	9920 ** This routine used to be called sqlite3OpenSnapshot() and with good reason:

	9921 ** it takes a snapshot of the state of the WAL and wal-index for the current

	9922 ** instant in time. The current thread will continue to use this snapshot.

	9923 ** Other threads might append new content to the WAL and wal-index but

	9924 ** that extra content is ignored by the current thread.

	9925 **

	9926 ** If the database contents have changes since the previous read

	9927 ** transaction, then *pChanged is set to 1 before returning. The

	9928 ** Pager layer will use this to know that is cache is stale and

	9929 ** needs to be flushed.

	9930 */

	9931 SQLITE_PRIVATE int sqlite3WalBeginReadTransaction(Wal pWal, int pChanged){

	9932 int rc; /* Return code */

	9933 int cnt = 0; /* Number of TryBeginRead attempts */

	9934

	9935 #ifdef SQLITE_ENABLE_SNAPSHOT

	9936 int bChanged = 0;

	9937 WalIndexHdr *pSnapshot = pWal->pSnapshot;

	9938 if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){

	9939 bChanged = 1;

	9940 }

	9941 #endif

	9942

	9943 do{

	9944 rc = walTryBeginRead(pWal, pChanged, 0, ++cnt);

	9945 }while( rc==WAL_RETRY );

	9946 testcase( (rc&0xff)==SQLITE_BUSY );

	9947 testcase( (rc&0xff)==SQLITE_IOERR );

	9948 testcase( rc==SQLITE_PROTOCOL );

	9949 testcase( rc==SQLITE_OK );

	9950

	9951 #ifdef SQLITE_ENABLE_SNAPSHOT

	9952 if( rc==SQLITE_OK ){

	9953 if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){

	9954 /* At this point the client has a lock on an aReadMark[] slot holding

	9955 ** a value equal to or smaller than pSnapshot->mxFrame, but pWal->hdr

	9956 ** is populated with the wal-index header corresponding to the head

	9957 ** of the wal file. Verify that pSnapshot is still valid before

	9958 ** continuing. Reasons why pSnapshot might no longer be valid:

	9959 **

	9960 ** (1) The WAL file has been reset since the snapshot was taken.

	9961 ** In this case, the salt will have changed.

	9962 **

	9963 ** (2) A checkpoint as been attempted that wrote frames past

	9964 ** pSnapshot->mxFrame into the database file. Note that the

	9965 ** checkpoint need not have completed for this to cause problems.

	9966 */

	9967 volatile WalCkptInfo *pInfo = walCkptInfo(pWal);

	9968

	9969 assert( pWal->readLock>0 \|\| pWal->hdr.mxFrame==0 );

	9970 assert( pInfo->aReadMark[pWal->readLock]<=pSnapshot->mxFrame );

	9971

	9972 /* It is possible that there is a checkpointer thread running

	9973 ** concurrent with this code. If this is the case, it may be that the

	9974 ** checkpointer has already determined that it will checkpoint

	9975 ** snapshot X, where X is later in the wal file than pSnapshot, but

	9976 ** has not yet set the pInfo->nBackfillAttempted variable to indicate

	9977 ** its intent. To avoid the race condition this leads to, ensure that

	9978 ** there is no checkpointer process by taking a shared CKPT lock

	9979 ** before checking pInfo->nBackfillAttempted. */

	9980 rc = walLockShared(pWal, WAL_CKPT_LOCK);

	9981

	9982 if( rc==SQLITE_OK ){

	9983 /* Check that the wal file has not been wrapped. Assuming that it has

	9984 ** not, also check that no checkpointer has attempted to checkpoint any

	9985 ** frames beyond pSnapshot->mxFrame. If either of these conditions are

	9986 ** true, return SQLITE_BUSY_SNAPSHOT. Otherwise, overwrite pWal->hdr

	9987 ** with pSnapshot and set pChanged as appropriate for opening the

	9988 ** snapshot. */

	9989 if( !memcmp(pSnapshot->aSalt, pWal->hdr.aSalt, sizeof(pWal->hdr.aSalt))

	9990 && pSnapshot->mxFrame>=pInfo->nBackfillAttempted

	9991 ){

	9992 assert( pWal->readLock>0 );

	9993 memcpy(&pWal->hdr, pSnapshot, sizeof(WalIndexHdr));

	9994 *pChanged = bChanged;

	9995 }else{

	9996 rc = SQLITE_BUSY_SNAPSHOT;

	9997 }

	9998

	9999 /* Release the shared CKPT lock obtained above. */

	10000 walUnlockShared(pWal, WAL_CKPT_LOCK);

	10001 }

	10002

	10003

	10004 if( rc!=SQLITE_OK ){

	10005 sqlite3WalEndReadTransaction(pWal);

	10006 }

	10007 }

	10008 }

	10009 #endif

	10010 return rc;

	10011 }

	10012

	10013 /*

	10014 ** Finish with a read transaction. All this does is release the

	10015 ** read-lock.

	10016 */

	10017 SQLITE_PRIVATE void sqlite3WalEndReadTransaction(Wal *pWal){

	10018 sqlite3WalEndWriteTransaction(pWal);

	10019 if( pWal->readLock>=0 ){

	10020 walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));

	10021 pWal->readLock = -1;

	10022 }

	10023 }

	10024

	10025 /*

	10026 ** Search the wal file for page pgno. If found, set *piRead to the frame that

	10027 ** contains the page. Otherwise, if pgno is not in the wal file, set *piRead

	10028 ** to zero.

	10029 **

	10030 ** Return SQLITE_OK if successful, or an error code if an error occurs. If an

	10031 ** error does occur, the final value of *piRead is undefined.

	10032 */

	10033 SQLITE_PRIVATE int sqlite3WalFindFrame(

	10034 Wal pWal, / WAL handle */

	10035 Pgno pgno, /* Database page number to read data for */

	10036 u32 piRead / OUT: Frame number (or zero) */

	10037 ){

	10038 u32 iRead = 0; /* If !=0, WAL frame to return data from */

	10039 u32 iLast = pWal->hdr.mxFrame; /* Last page in WAL for this reader */

	10040 int iHash; /* Used to loop through N hash tables */

	10041 int iMinHash;

	10042

	10043 /* This routine is only be called from within a read transaction. */

	10044 assert( pWal->readLock>=0 \|\| pWal->lockError );

	10045

	10046 /* If the "last page" field of the wal-index header snapshot is 0, then

	10047 ** no data will be read from the wal under any circumstances. Return early

	10048 ** in this case as an optimization. Likewise, if pWal->readLock==0,

	10049 ** then the WAL is ignored by the reader so return early, as if the

	10050 ** WAL were empty.

	10051 */

	10052 if( iLast==0 \|\| pWal->readLock==0 ){

	10053 *piRead = 0;

	10054 return SQLITE_OK;

	10055 }

	10056

	10057 /* Search the hash table or tables for an entry matching page number

	10058 ** pgno. Each iteration of the following for() loop searches one

	10059 ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames).

	10060 **

	10061 ** This code might run concurrently to the code in walIndexAppend()

	10062 ** that adds entries to the wal-index (and possibly to this hash

	10063 ** table). This means the value just read from the hash

	10064 ** slot (aHash[iKey]) may have been added before or after the

	10065 ** current read transaction was opened. Values added after the

	10066 ** read transaction was opened may have been written incorrectly -

	10067 ** i.e. these slots may contain garbage data. However, we assume

	10068 ** that any slots written before the current read transaction was

	10069 ** opened remain unmodified.

	10070 **

	10071 ** For the reasons above, the if(...) condition featured in the inner

	10072 ** loop of the following block is more stringent that would be required

	10073 ** if we had exclusive access to the hash-table:

	10074 **

	10075 ** (aPgno[iFrame]==pgno):

	10076 ** This condition filters out normal hash-table collisions.

	10077 **

	10078 ** (iFrame<=iLast):

	10079 ** This condition filters out entries that were added to the hash

	10080 ** table after the current read-transaction had started.

	10081 */

	10082 iMinHash = walFramePage(pWal->minFrame);

	10083 for(iHash=walFramePage(iLast); iHash>=iMinHash && iRead==0; iHash--){

	10084 volatile ht_slot aHash; / Pointer to hash table */

	10085 volatile u32 aPgno; / Pointer to array of page numbers */

	10086 u32 iZero; /* Frame number corresponding to aPgno[0] */

	10087 int iKey; /* Hash slot index */

	10088 int nCollide; /* Number of hash collisions remaining */

	10089 int rc; /* Error code */

	10090

	10091 rc = walHashGet(pWal, iHash, &aHash, &aPgno, &iZero);

	10092 if( rc!=SQLITE_OK ){

	10093 return rc;

	10094 }

	10095 nCollide = HASHTABLE_NSLOT;

	10096 for(iKey=walHash(pgno); aHash[iKey]; iKey=walNextHash(iKey)){

	10097 u32 iFrame = aHash[iKey] + iZero;

	10098 if( iFrame<=iLast && iFrame>=pWal->minFrame && aPgno[aHash[iKey]]==pgno ){

	10099 assert( iFrame>iRead \|\| CORRUPT_DB );

	10100 iRead = iFrame;

	10101 }

	10102 if( (nCollide--)==0 ){

	10103 return SQLITE_CORRUPT_BKPT;

	10104 }

	10105 }

	10106 }

	10107

	10108 #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT

	10109 /* If expensive assert() statements are available, do a linear search

	10110 ** of the wal-index file content. Make sure the results agree with the

	10111 ** result obtained using the hash indexes above. */

	10112 {

	10113 u32 iRead2 = 0;

	10114 u32 iTest;

	10115 assert( pWal->minFrame>0 );

	10116 for(iTest=iLast; iTest>=pWal->minFrame; iTest--){

	10117 if( walFramePgno(pWal, iTest)==pgno ){

	10118 iRead2 = iTest;

	10119 break;

	10120 }

	10121 }

	10122 assert( iRead==iRead2 );

	10123 }

	10124 #endif

	10125

	10126 *piRead = iRead;

	10127 return SQLITE_OK;

	10128 }

	10129

	10130 /*

	10131 ** Read the contents of frame iRead from the wal file into buffer pOut

	10132 ** (which is nOut bytes in size). Return SQLITE_OK if successful, or an

	10133 ** error code otherwise.

	10134 */

	10135 SQLITE_PRIVATE int sqlite3WalReadFrame(

	10136 Wal pWal, / WAL handle */

	10137 u32 iRead, /* Frame to read */

	10138 int nOut, /* Size of buffer pOut in bytes */

	10139 u8 pOut / Buffer to write page data to */

	10140 ){

	10141 int sz;

	10142 i64 iOffset;

	10143 sz = pWal->hdr.szPage;

	10144 sz = (sz&0xfe00) + ((sz&0x0001)<<16);

	10145 testcase( sz<=32768 );

	10146 testcase( sz>=65536 );

	10147 iOffset = walFrameOffset(iRead, sz) + WAL_FRAME_HDRSIZE;

	10148 /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */

	10149 return sqlite3OsRead(pWal->pWalFd, pOut, (nOut>sz ? sz : nOut), iOffset);

	10150 }

	10151

	10152 /*

	10153 ** Return the size of the database in pages (or zero, if unknown).

	10154 */

	10155 SQLITE_PRIVATE Pgno sqlite3WalDbsize(Wal *pWal){

	10156 if( pWal && ALWAYS(pWal->readLock>=0) ){

	10157 return pWal->hdr.nPage;

	10158 }

	10159 return 0;

	10160 }

	10161

	10162

	10163 /*

	10164 ** This function starts a write transaction on the WAL.

	10165 **

	10166 ** A read transaction must have already been started by a prior call

	10167 ** to sqlite3WalBeginReadTransaction().

	10168 **

	10169 ** If another thread or process has written into the database since

	10170 ** the read transaction was started, then it is not possible for this

	10171 ** thread to write as doing so would cause a fork. So this routine

	10172 ** returns SQLITE_BUSY in that case and no write transaction is started.

	10173 **

	10174 ** There can only be a single writer active at a time.

	10175 */

	10176 SQLITE_PRIVATE int sqlite3WalBeginWriteTransaction(Wal *pWal){

	10177 int rc;

	10178

	10179 /* Cannot start a write transaction without first holding a read

	10180 ** transaction. */

	10181 assert( pWal->readLock>=0 );

	10182

	10183 if( pWal->readOnly ){

	10184 return SQLITE_READONLY;

	10185 }

	10186

	10187 /* Only one writer allowed at a time. Get the write lock. Return

	10188 ** SQLITE_BUSY if unable.

	10189 */

	10190 rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);

	10191 if( rc ){

	10192 return rc;

	10193 }

	10194 pWal->writeLock = 1;

	10195

	10196 /* If another connection has written to the database file since the

	10197 ** time the read transaction on this connection was started, then

	10198 ** the write is disallowed.

	10199 */

	10200 if( memcmp(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr))!=0 ){

	10201 walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);

	10202 pWal->writeLock = 0;

	10203 rc = SQLITE_BUSY_SNAPSHOT;

	10204 }

	10205

	10206 return rc;

	10207 }

	10208

	10209 /*

	10210 ** End a write transaction. The commit has already been done. This

	10211 ** routine merely releases the lock.

	10212 */

	10213 SQLITE_PRIVATE int sqlite3WalEndWriteTransaction(Wal *pWal){

	10214 if( pWal->writeLock ){

	10215 walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);

	10216 pWal->writeLock = 0;

	10217 pWal->truncateOnCommit = 0;

	10218 }

	10219 return SQLITE_OK;

	10220 }

	10221

	10222 /*

	10223 ** If any data has been written (but not committed) to the log file, this

	10224 ** function moves the write-pointer back to the start of the transaction.

	10225 **

	10226 ** Additionally, the callback function is invoked for each frame written

	10227 ** to the WAL since the start of the transaction. If the callback returns

	10228 ** other than SQLITE_OK, it is not invoked again and the error code is

	10229 ** returned to the caller.

	10230 **

	10231 ** Otherwise, if the callback function does not return an error, this

	10232 ** function returns SQLITE_OK.

	10233 */

	10234 SQLITE_PRIVATE int sqlite3WalUndo(Wal pWal, int (xUndo)(void , Pgno), void p UndoCtx){

	10235 int rc = SQLITE_OK;

	10236 if( ALWAYS(pWal->writeLock) ){

	10237 Pgno iMax = pWal->hdr.mxFrame;

	10238 Pgno iFrame;

	10239

	10240 /* Restore the clients cache of the wal-index header to the state it

	10241 ** was in before the client began writing to the database.

	10242 */

	10243 memcpy(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr));

	10244

	10245 for(iFrame=pWal->hdr.mxFrame+1;

	10246 ALWAYS(rc==SQLITE_OK) && iFrame<=iMax;

	10247 iFrame++

	10248 ){

	10249 /* This call cannot fail. Unless the page for which the page number

	10250 ** is passed as the second argument is (a) in the cache and

	10251 ** (b) has an outstanding reference, then xUndo is either a no-op

	10252 ** (if (a) is false) or simply expels the page from the cache (if (b)

	10253 ** is false).

	10254 **

	10255 ** If the upper layer is doing a rollback, it is guaranteed that there

	10256 ** are no outstanding references to any page other than page 1. And

	10257 ** page 1 is never written to the log until the transaction is

	10258 ** committed. As a result, the call to xUndo may not fail.

	10259 */

	10260 assert( walFramePgno(pWal, iFrame)!=1 );

	10261 rc = xUndo(pUndoCtx, walFramePgno(pWal, iFrame));

	10262 }

	10263 if( iMax!=pWal->hdr.mxFrame ) walCleanupHash(pWal);

	10264 }

	10265 return rc;

	10266 }

	10267

	10268 /*

	10269 ** Argument aWalData must point to an array of WAL_SAVEPOINT_NDATA u32

	10270 ** values. This function populates the array with values required to

	10271 ** "rollback" the write position of the WAL handle back to the current

	10272 ** point in the event of a savepoint rollback (via WalSavepointUndo()).

	10273 */

	10274 SQLITE_PRIVATE void sqlite3WalSavepoint(Wal pWal, u32 aWalData){

	10275 assert( pWal->writeLock );

	10276 aWalData[0] = pWal->hdr.mxFrame;

	10277 aWalData[1] = pWal->hdr.aFrameCksum[0];

	10278 aWalData[2] = pWal->hdr.aFrameCksum[1];

	10279 aWalData[3] = pWal->nCkpt;

	10280 }

	10281

	10282 /*

	10283 ** Move the write position of the WAL back to the point identified by

	10284 ** the values in the aWalData[] array. aWalData must point to an array

	10285 ** of WAL_SAVEPOINT_NDATA u32 values that has been previously populated

	10286 ** by a call to WalSavepoint().

	10287 */

	10288 SQLITE_PRIVATE int sqlite3WalSavepointUndo(Wal pWal, u32 aWalData){

	10289 int rc = SQLITE_OK;

	10290

	10291 assert( pWal->writeLock );

	10292 assert( aWalData[3]!=pWal->nCkpt \|\| aWalData[0]<=pWal->hdr.mxFrame );

	10293

	10294 if( aWalData[3]!=pWal->nCkpt ){

	10295 /* This savepoint was opened immediately after the write-transaction

	10296 ** was started. Right after that, the writer decided to wrap around

	10297 ** to the start of the log. Update the savepoint values to match.

	10298 */

	10299 aWalData[0] = 0;

	10300 aWalData[3] = pWal->nCkpt;

	10301 }

	10302

	10303 if( aWalData[0]<pWal->hdr.mxFrame ){

	10304 pWal->hdr.mxFrame = aWalData[0];

	10305 pWal->hdr.aFrameCksum[0] = aWalData[1];

	10306 pWal->hdr.aFrameCksum[1] = aWalData[2];

	10307 walCleanupHash(pWal);

	10308 }

	10309

	10310 return rc;

	10311 }

	10312

	10313 /*

	10314 ** This function is called just before writing a set of frames to the log

	10315 ** file (see sqlite3WalFrames()). It checks to see if, instead of appending

	10316 ** to the current log file, it is possible to overwrite the start of the

	10317 ** existing log file with the new frames (i.e. "reset" the log). If so,

	10318 ** it sets pWal->hdr.mxFrame to 0. Otherwise, pWal->hdr.mxFrame is left

	10319 ** unchanged.

	10320 **

	10321 ** SQLITE_OK is returned if no error is encountered (regardless of whether

	10322 ** or not pWal->hdr.mxFrame is modified). An SQLite error code is returned

	10323 ** if an error occurs.

	10324 */

	10325 static int walRestartLog(Wal *pWal){

	10326 int rc = SQLITE_OK;

	10327 int cnt;

	10328

	10329 if( pWal->readLock==0 ){

	10330 volatile WalCkptInfo *pInfo = walCkptInfo(pWal);

	10331 assert( pInfo->nBackfill==pWal->hdr.mxFrame );

	10332 if( pInfo->nBackfill>0 ){

	10333 u32 salt1;

	10334 sqlite3_randomness(4, &salt1);

	10335 rc = walLockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);

	10336 if( rc==SQLITE_OK ){

	10337 /* If all readers are using WAL_READ_LOCK(0) (in other words if no

	10338 ** readers are currently using the WAL), then the transactions

	10339 ** frames will overwrite the start of the existing log. Update the

	10340 ** wal-index header to reflect this.

	10341 **

	10342 ** In theory it would be Ok to update the cache of the header only

	10343 ** at this point. But updating the actual wal-index header is also

	10344 ** safe and means there is no special case for sqlite3WalUndo()

	10345 ** to handle if this transaction is rolled back. */

	10346 walRestartHdr(pWal, salt1);

	10347 walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);

	10348 }else if( rc!=SQLITE_BUSY ){

	10349 return rc;

	10350 }

	10351 }

	10352 walUnlockShared(pWal, WAL_READ_LOCK(0));

	10353 pWal->readLock = -1;

	10354 cnt = 0;

	10355 do{

	10356 int notUsed;

	10357 rc = walTryBeginRead(pWal, &notUsed, 1, ++cnt);

	10358 }while( rc==WAL_RETRY );

	10359 assert( (rc&0xff)!=SQLITE_BUSY ); /* BUSY not possible when useWal==1 */

	10360 testcase( (rc&0xff)==SQLITE_IOERR );

	10361 testcase( rc==SQLITE_PROTOCOL );

	10362 testcase( rc==SQLITE_OK );

	10363 }

	10364 return rc;

	10365 }

	10366

	10367 /*

	10368 ** Information about the current state of the WAL file and where

	10369 ** the next fsync should occur - passed from sqlite3WalFrames() into

	10370 ** walWriteToLog().

	10371 */

	10372 typedef struct WalWriter {

	10373 Wal pWal; / The complete WAL information */

	10374 sqlite3_file pFd; / The WAL file to which we write */

	10375 sqlite3_int64 iSyncPoint; /* Fsync at this offset */

	10376 int syncFlags; /* Flags for the fsync */

	10377 int szPage; /* Size of one page */

	10378 } WalWriter;

	10379

	10380 /*

	10381 ** Write iAmt bytes of content into the WAL file beginning at iOffset.

	10382 ** Do a sync when crossing the p->iSyncPoint boundary.

	10383 **

	10384 ** In other words, if iSyncPoint is in between iOffset and iOffset+iAmt,

	10385 ** first write the part before iSyncPoint, then sync, then write the

	10386 ** rest.

	10387 */

	10388 static int walWriteToLog(

	10389 WalWriter p, / WAL to write to */

	10390 void pContent, / Content to be written */

	10391 int iAmt, /* Number of bytes to write */

	10392 sqlite3_int64 iOffset /* Start writing at this offset */

	10393 ){

	10394 int rc;

	10395 if( iOffset<p->iSyncPoint && iOffset+iAmt>=p->iSyncPoint ){

	10396 int iFirstAmt = (int)(p->iSyncPoint - iOffset);

	10397 rc = sqlite3OsWrite(p->pFd, pContent, iFirstAmt, iOffset);

	10398 if( rc ) return rc;

	10399 iOffset += iFirstAmt;

	10400 iAmt -= iFirstAmt;

	10401 pContent = (void)(iFirstAmt + (char)pContent);

	10402 assert( p->syncFlags & (SQLITE_SYNC_NORMAL\|SQLITE_SYNC_FULL) );

	10403 rc = sqlite3OsSync(p->pFd, p->syncFlags & SQLITE_SYNC_MASK);

	10404 if( iAmt==0 \|\| rc ) return rc;

	10405 }

	10406 rc = sqlite3OsWrite(p->pFd, pContent, iAmt, iOffset);

	10407 return rc;

	10408 }

	10409

	10410 /*

	10411 ** Write out a single frame of the WAL

	10412 */

	10413 static int walWriteOneFrame(

	10414 WalWriter p, / Where to write the frame */

	10415 PgHdr pPage, / The page of the frame to be written */

	10416 int nTruncate, /* The commit flag. Usually 0. >0 for commit */

	10417 sqlite3_int64 iOffset /* Byte offset at which to write */

	10418 ){

	10419 int rc; /* Result code from subfunctions */

	10420 void pData; / Data actually written */

	10421 u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */

	10422 #if defined(SQLITE_HAS_CODEC)

	10423 if( (pData = sqlite3PagerCodec(pPage))==0 ) return SQLITE_NOMEM;

	10424 #else

	10425 pData = pPage->pData;

	10426 #endif

	10427 walEncodeFrame(p->pWal, pPage->pgno, nTruncate, pData, aFrame);

	10428 rc = walWriteToLog(p, aFrame, sizeof(aFrame), iOffset);

	10429 if( rc ) return rc;

	10430 /* Write the page data */

	10431 rc = walWriteToLog(p, pData, p->szPage, iOffset+sizeof(aFrame));

	10432 return rc;

	10433 }

	10434

	10435 /*

	10436 ** Write a set of frames to the log. The caller must hold the write-lock

	10437 ** on the log file (obtained using sqlite3WalBeginWriteTransaction()).

	10438 */

	10439 SQLITE_PRIVATE int sqlite3WalFrames(

	10440 Wal pWal, / Wal handle to write to */

	10441 int szPage, /* Database page-size in bytes */

	10442 PgHdr pList, / List of dirty pages to write */

	10443 Pgno nTruncate, /* Database size after this commit */

	10444 int isCommit, /* True if this is a commit */

	10445 int sync_flags /* Flags to pass to OsSync() (or 0) */

	10446 ){

	10447 int rc; /* Used to catch return codes */

	10448 u32 iFrame; /* Next frame address */

	10449 PgHdr p; / Iterator to run through pList with. */

	10450 PgHdr pLast = 0; / Last frame in list */

	10451 int nExtra = 0; /* Number of extra copies of last page */

	10452 int szFrame; /* The size of a single frame */

	10453 i64 iOffset; /* Next byte to write in WAL file */

	10454 WalWriter w; /* The writer */

	10455

	10456 assert( pList );

	10457 assert( pWal->writeLock );

	10458

	10459 /* If this frame set completes a transaction, then nTruncate>0. If

	10460 ** nTruncate==0 then this frame set does not complete the transaction. */

	10461 assert( (isCommit!=0)==(nTruncate!=0) );

	10462

	10463 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)

	10464 { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){}

	10465 WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n",

	10466 pWal, cnt, pWal->hdr.mxFrame, isCommit ? "Commit" : "Spill"));

	10467 }

	10468 #endif

	10469

	10470 /* See if it is possible to write these frames into the start of the

	10471 ** log file, instead of appending to it at pWal->hdr.mxFrame.

	10472 */

	10473 if( SQLITE_OK!=(rc = walRestartLog(pWal)) ){

	10474 return rc;

	10475 }

	10476

	10477 /* If this is the first frame written into the log, write the WAL

	10478 ** header to the start of the WAL file. See comments at the top of

	10479 ** this source file for a description of the WAL header format.

	10480 */

	10481 iFrame = pWal->hdr.mxFrame;

	10482 if( iFrame==0 ){

	10483 u8 aWalHdr[WAL_HDRSIZE]; /* Buffer to assemble wal-header in */

	10484 u32 aCksum[2]; /* Checksum for wal-header */

	10485

	10486 sqlite3Put4byte(&aWalHdr[0], (WAL_MAGIC \| SQLITE_BIGENDIAN));

	10487 sqlite3Put4byte(&aWalHdr[4], WAL_MAX_VERSION);

	10488 sqlite3Put4byte(&aWalHdr[8], szPage);

	10489 sqlite3Put4byte(&aWalHdr[12], pWal->nCkpt);

	10490 if( pWal->nCkpt==0 ) sqlite3_randomness(8, pWal->hdr.aSalt);

	10491 memcpy(&aWalHdr[16], pWal->hdr.aSalt, 8);

	10492 walChecksumBytes(1, aWalHdr, WAL_HDRSIZE-2*4, 0, aCksum);

	10493 sqlite3Put4byte(&aWalHdr[24], aCksum[0]);

	10494 sqlite3Put4byte(&aWalHdr[28], aCksum[1]);

	10495

	10496 pWal->szPage = szPage;

	10497 pWal->hdr.bigEndCksum = SQLITE_BIGENDIAN;

	10498 pWal->hdr.aFrameCksum[0] = aCksum[0];

	10499 pWal->hdr.aFrameCksum[1] = aCksum[1];

	10500 pWal->truncateOnCommit = 1;

	10501

	10502 rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0);

	10503 WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok"));

	10504 if( rc!=SQLITE_OK ){

	10505 return rc;

	10506 }

	10507

	10508 /* Sync the header (unless SQLITE_IOCAP_SEQUENTIAL is true or unless

	10509 ** all syncing is turned off by PRAGMA synchronous=OFF). Otherwise

	10510 ** an out-of-order write following a WAL restart could result in

	10511 ** database corruption. See the ticket:

	10512 **

	10513 ** http://localhost:591/sqlite/info/ff5be73dee

	10514 */

	10515 if( pWal->syncHeader && sync_flags ){

	10516 rc = sqlite3OsSync(pWal->pWalFd, sync_flags & SQLITE_SYNC_MASK);

	10517 if( rc ) return rc;

	10518 }

	10519 }

	10520 assert( (int)pWal->szPage==szPage );

	10521

	10522 /* Setup information needed to write frames into the WAL */

	10523 w.pWal = pWal;

	10524 w.pFd = pWal->pWalFd;

	10525 w.iSyncPoint = 0;

	10526 w.syncFlags = sync_flags;

	10527 w.szPage = szPage;

	10528 iOffset = walFrameOffset(iFrame+1, szPage);

	10529 szFrame = szPage + WAL_FRAME_HDRSIZE;

	10530

	10531 /* Write all frames into the log file exactly once */

	10532 for(p=pList; p; p=p->pDirty){

	10533 int nDbSize; /* 0 normally. Positive == commit flag */

	10534 iFrame++;

	10535 assert( iOffset==walFrameOffset(iFrame, szPage) );

	10536 nDbSize = (isCommit && p->pDirty==0) ? nTruncate : 0;

	10537 rc = walWriteOneFrame(&w, p, nDbSize, iOffset);

	10538 if( rc ) return rc;

	10539 pLast = p;

	10540 iOffset += szFrame;

	10541 }

	10542

	10543 /* If this is the end of a transaction, then we might need to pad

	10544 ** the transaction and/or sync the WAL file.

	10545 **

	10546 ** Padding and syncing only occur if this set of frames complete a

	10547 ** transaction and if PRAGMA synchronous=FULL. If synchronous==NORMAL

	10548 ** or synchronous==OFF, then no padding or syncing are needed.

	10549 **

	10550 ** If SQLITE_IOCAP_POWERSAFE_OVERWRITE is defined, then padding is not

	10551 ** needed and only the sync is done. If padding is needed, then the

	10552 ** final frame is repeated (with its commit mark) until the next sector

	10553 ** boundary is crossed. Only the part of the WAL prior to the last

	10554 ** sector boundary is synced; the part of the last frame that extends

	10555 ** past the sector boundary is written after the sync.

	10556 */

	10557 if( isCommit && (sync_flags & WAL_SYNC_TRANSACTIONS)!=0 ){

	10558 if( pWal->padToSectorBoundary ){

	10559 int sectorSize = sqlite3SectorSize(pWal->pWalFd);

	10560 w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize;

	10561 while( iOffset<w.iSyncPoint ){

	10562 rc = walWriteOneFrame(&w, pLast, nTruncate, iOffset);

	10563 if( rc ) return rc;

	10564 iOffset += szFrame;

	10565 nExtra++;

	10566 }

	10567 }else{

	10568 rc = sqlite3OsSync(w.pFd, sync_flags & SQLITE_SYNC_MASK);

	10569 }

	10570 }

	10571

	10572 /* If this frame set completes the first transaction in the WAL and

	10573 ** if PRAGMA journal_size_limit is set, then truncate the WAL to the

	10574 ** journal size limit, if possible.

	10575 */

	10576 if( isCommit && pWal->truncateOnCommit && pWal->mxWalSize>=0 ){

	10577 i64 sz = pWal->mxWalSize;

	10578 if( walFrameOffset(iFrame+nExtra+1, szPage)>pWal->mxWalSize ){

	10579 sz = walFrameOffset(iFrame+nExtra+1, szPage);

	10580 }

	10581 walLimitSize(pWal, sz);

	10582 pWal->truncateOnCommit = 0;

	10583 }

	10584

	10585 /* Append data to the wal-index. It is not necessary to lock the

	10586 ** wal-index to do this as the SQLITE_SHM_WRITE lock held on the wal-index

	10587 ** guarantees that there are no other writers, and no data that may

	10588 ** be in use by existing readers is being overwritten.

	10589 */

	10590 iFrame = pWal->hdr.mxFrame;

	10591 for(p=pList; p && rc==SQLITE_OK; p=p->pDirty){

	10592 iFrame++;

	10593 rc = walIndexAppend(pWal, iFrame, p->pgno);

	10594 }

	10595 while( rc==SQLITE_OK && nExtra>0 ){

	10596 iFrame++;

	10597 nExtra--;

	10598 rc = walIndexAppend(pWal, iFrame, pLast->pgno);

	10599 }

	10600

	10601 if( rc==SQLITE_OK ){

	10602 /* Update the private copy of the header. */

	10603 pWal->hdr.szPage = (u16)((szPage&0xff00) \| (szPage>>16));

	10604 testcase( szPage<=32768 );

	10605 testcase( szPage>=65536 );

	10606 pWal->hdr.mxFrame = iFrame;

	10607 if( isCommit ){

	10608 pWal->hdr.iChange++;

	10609 pWal->hdr.nPage = nTruncate;

	10610 }

	10611 /* If this is a commit, update the wal-index header too. */

	10612 if( isCommit ){

	10613 walIndexWriteHdr(pWal);

	10614 pWal->iCallback = iFrame;

	10615 }

	10616 }

	10617

	10618 WALTRACE(("WAL%p: frame write %s\n", pWal, rc ? "failed" : "ok"));

	10619 return rc;

	10620 }

	10621

	10622 /*

	10623 ** This routine is called to implement sqlite3_wal_checkpoint() and

	10624 ** related interfaces.

	10625 **

	10626 ** Obtain a CHECKPOINT lock and then backfill as much information as

	10627 ** we can from WAL into the database.

	10628 **

	10629 ** If parameter xBusy is not NULL, it is a pointer to a busy-handler

	10630 ** callback. In this case this function runs a blocking checkpoint.

	10631 */

	10632 SQLITE_PRIVATE int sqlite3WalCheckpoint(

	10633 Wal pWal, / Wal connection */

	10634 int eMode, /* PASSIVE, FULL, RESTART, or TRUNCATE */

	10635 int (xBusy)(void), /* Function to call when busy */

	10636 void pBusyArg, / Context argument for xBusyHandler */

	10637 int sync_flags, /* Flags to sync db file with (or 0) */

	10638 int nBuf, /* Size of temporary buffer */

	10639 u8 zBuf, / Temporary buffer to use */

	10640 int pnLog, / OUT: Number of frames in WAL */

	10641 int pnCkpt / OUT: Number of backfilled frames in WAL */

	10642 ){

	10643 int rc; /* Return code */

	10644 int isChanged = 0; /* True if a new wal-index header is loaded */

	10645 int eMode2 = eMode; /* Mode to pass to walCheckpoint() */

	10646 int (xBusy2)(void) = xBusy; /* Busy handler for eMode2 */

	10647

	10648 assert( pWal->ckptLock==0 );

	10649 assert( pWal->writeLock==0 );

	10650

	10651 /* EVIDENCE-OF: R-62920-47450 The busy-handler callback is never invoked

	10652 ** in the SQLITE_CHECKPOINT_PASSIVE mode. */

	10653 assert( eMode!=SQLITE_CHECKPOINT_PASSIVE \|\| xBusy==0 );

	10654

	10655 if( pWal->readOnly ) return SQLITE_READONLY;

	10656 WALTRACE(("WAL%p: checkpoint begins\n", pWal));

	10657

	10658 /* IMPLEMENTATION-OF: R-62028-47212 All calls obtain an exclusive

	10659 ** "checkpoint" lock on the database file. */

	10660 rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1);

	10661 if( rc ){

	10662 /* EVIDENCE-OF: R-10421-19736 If any other process is running a

	10663 ** checkpoint operation at the same time, the lock cannot be obtained and

	10664 ** SQLITE_BUSY is returned.

	10665 ** EVIDENCE-OF: R-53820-33897 Even if there is a busy-handler configured,

	10666 ** it will not be invoked in this case.

	10667 */

	10668 testcase( rc==SQLITE_BUSY );

	10669 testcase( xBusy!=0 );

	10670 return rc;

	10671 }

	10672 pWal->ckptLock = 1;

	10673

	10674 /* IMPLEMENTATION-OF: R-59782-36818 The SQLITE_CHECKPOINT_FULL, RESTART and

	10675 ** TRUNCATE modes also obtain the exclusive "writer" lock on the database

	10676 ** file.

	10677 **

	10678 ** EVIDENCE-OF: R-60642-04082 If the writer lock cannot be obtained

	10679 ** immediately, and a busy-handler is configured, it is invoked and the

	10680 ** writer lock retried until either the busy-handler returns 0 or the

	10681 ** lock is successfully obtained.

	10682 */

	10683 if( eMode!=SQLITE_CHECKPOINT_PASSIVE ){

	10684 rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_WRITE_LOCK, 1);

	10685 if( rc==SQLITE_OK ){

	10686 pWal->writeLock = 1;

	10687 }else if( rc==SQLITE_BUSY ){

	10688 eMode2 = SQLITE_CHECKPOINT_PASSIVE;

	10689 xBusy2 = 0;

	10690 rc = SQLITE_OK;

	10691 }

	10692 }

	10693

	10694 /* Read the wal-index header. */

	10695 if( rc==SQLITE_OK ){

	10696 rc = walIndexReadHdr(pWal, &isChanged);

	10697 if( isChanged && pWal->pDbFd->pMethods->iVersion>=3 ){

	10698 sqlite3OsUnfetch(pWal->pDbFd, 0, 0);

	10699 }

	10700 }

	10701

	10702 /* Copy data from the log to the database file. */

	10703 if( rc==SQLITE_OK ){

	10704 if( pWal->hdr.mxFrame && walPagesize(pWal)!=nBuf ){

	10705 rc = SQLITE_CORRUPT_BKPT;

	10706 }else{

	10707 rc = walCheckpoint(pWal, eMode2, xBusy2, pBusyArg, sync_flags, zBuf);

	10708 }

	10709

	10710 /* If no error occurred, set the output variables. */

	10711 if( rc==SQLITE_OK \|\| rc==SQLITE_BUSY ){

	10712 if( pnLog ) *pnLog = (int)pWal->hdr.mxFrame;

	10713 if( pnCkpt ) *pnCkpt = (int)(walCkptInfo(pWal)->nBackfill);

	10714 }

	10715 }

	10716

	10717 if( isChanged ){

	10718 /* If a new wal-index header was loaded before the checkpoint was

	10719 ** performed, then the pager-cache associated with pWal is now

	10720 ** out of date. So zero the cached wal-index header to ensure that

	10721 ** next time the pager opens a snapshot on this database it knows that

	10722 ** the cache needs to be reset.

	10723 */

	10724 memset(&pWal->hdr, 0, sizeof(WalIndexHdr));

	10725 }

	10726

	10727 /* Release the locks. */

	10728 sqlite3WalEndWriteTransaction(pWal);

	10729 walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1);

	10730 pWal->ckptLock = 0;

	10731 WALTRACE(("WAL%p: checkpoint %s\n", pWal, rc ? "failed" : "ok"));

	10732 return (rc==SQLITE_OK && eMode!=eMode2 ? SQLITE_BUSY : rc);

	10733 }

	10734

	10735 /* Return the value to pass to a sqlite3_wal_hook callback, the

	10736 ** number of frames in the WAL at the point of the last commit since

	10737 ** sqlite3WalCallback() was called. If no commits have occurred since

	10738 ** the last call, then return 0.

	10739 */

	10740 SQLITE_PRIVATE int sqlite3WalCallback(Wal *pWal){

	10741 u32 ret = 0;

	10742 if( pWal ){

	10743 ret = pWal->iCallback;

	10744 pWal->iCallback = 0;

	10745 }

	10746 return (int)ret;

	10747 }

	10748

	10749 /*

	10750 ** This function is called to change the WAL subsystem into or out

	10751 ** of locking_mode=EXCLUSIVE.

	10752 **

	10753 ** If op is zero, then attempt to change from locking_mode=EXCLUSIVE

	10754 ** into locking_mode=NORMAL. This means that we must acquire a lock

	10755 ** on the pWal->readLock byte. If the WAL is already in locking_mode=NORMAL

	10756 ** or if the acquisition of the lock fails, then return 0. If the

	10757 ** transition out of exclusive-mode is successful, return 1. This

	10758 ** operation must occur while the pager is still holding the exclusive

	10759 ** lock on the main database file.

	10760 **

	10761 ** If op is one, then change from locking_mode=NORMAL into

	10762 ** locking_mode=EXCLUSIVE. This means that the pWal->readLock must

	10763 ** be released. Return 1 if the transition is made and 0 if the

	10764 ** WAL is already in exclusive-locking mode - meaning that this

	10765 ** routine is a no-op. The pager must already hold the exclusive lock

	10766 ** on the main database file before invoking this operation.

	10767 **

	10768 ** If op is negative, then do a dry-run of the op==1 case but do

	10769 ** not actually change anything. The pager uses this to see if it

	10770 ** should acquire the database exclusive lock prior to invoking

	10771 ** the op==1 case.

	10772 */

	10773 SQLITE_PRIVATE int sqlite3WalExclusiveMode(Wal *pWal, int op){

	10774 int rc;

	10775 assert( pWal->writeLock==0 );

	10776 assert( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE \|\| op==-1 );

	10777

	10778 /* pWal->readLock is usually set, but might be -1 if there was a

	10779 ** prior error while attempting to acquire are read-lock. This cannot

	10780 ** happen if the connection is actually in exclusive mode (as no xShmLock

	10781 ** locks are taken in this case). Nor should the pager attempt to

	10782 ** upgrade to exclusive-mode following such an error.

	10783 */

	10784 assert( pWal->readLock>=0 \|\| pWal->lockError );

	10785 assert( pWal->readLock>=0 \|\| (op<=0 && pWal->exclusiveMode==0) );

	10786

	10787 if( op==0 ){

	10788 if( pWal->exclusiveMode ){

	10789 pWal->exclusiveMode = 0;

	10790 if( walLockShared(pWal, WAL_READ_LOCK(pWal->readLock))!=SQLITE_OK ){

	10791 pWal->exclusiveMode = 1;

	10792 }

	10793 rc = pWal->exclusiveMode==0;

	10794 }else{

	10795 /* Already in locking_mode=NORMAL */

	10796 rc = 0;

	10797 }

	10798 }else if( op>0 ){

	10799 assert( pWal->exclusiveMode==0 );

	10800 assert( pWal->readLock>=0 );

	10801 walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));

	10802 pWal->exclusiveMode = 1;

	10803 rc = 1;

	10804 }else{

	10805 rc = pWal->exclusiveMode==0;

	10806 }

	10807 return rc;

	10808 }

	10809

	10810 /*

	10811 ** Return true if the argument is non-NULL and the WAL module is using

	10812 ** heap-memory for the wal-index. Otherwise, if the argument is NULL or the

	10813 ** WAL module is using shared-memory, return false.

	10814 */

	10815 SQLITE_PRIVATE int sqlite3WalHeapMemory(Wal *pWal){

	10816 return (pWal && pWal->exclusiveMode==WAL_HEAPMEMORY_MODE );

	10817 }

	10818

	10819 #ifdef SQLITE_ENABLE_SNAPSHOT

	10820 /* Create a snapshot object. The content of a snapshot is opaque to

	10821 ** every other subsystem, so the WAL module can put whatever it needs

	10822 ** in the object.

	10823 */

	10824 SQLITE_PRIVATE int sqlite3WalSnapshotGet(Wal pWal, sqlite3_snapshot *ppSnapsho t){

	10825 int rc = SQLITE_OK;

	10826 WalIndexHdr *pRet;

	10827

	10828 assert( pWal->readLock>=0 && pWal->writeLock==0 );

	10829

	10830 pRet = (WalIndexHdr*)sqlite3_malloc(sizeof(WalIndexHdr));

	10831 if( pRet==0 ){

	10832 rc = SQLITE_NOMEM;

	10833 }else{

	10834 memcpy(pRet, &pWal->hdr, sizeof(WalIndexHdr));

	10835 ppSnapshot = (sqlite3_snapshot)pRet;

	10836 }

	10837

	10838 return rc;

	10839 }

	10840

	10841 /* Try to open on pSnapshot when the next read-transaction starts

	10842 */

	10843 SQLITE_PRIVATE void sqlite3WalSnapshotOpen(Wal pWal, sqlite3_snapshot pSnapsho t){

	10844 pWal->pSnapshot = (WalIndexHdr*)pSnapshot;

	10845 }

	10846 #endif /* SQLITE_ENABLE_SNAPSHOT */

	10847

	10848 #ifdef SQLITE_ENABLE_ZIPVFS

	10849 /*

	10850 ** If the argument is not NULL, it points to a Wal object that holds a

	10851 ** read-lock. This function returns the database page-size if it is known,

	10852 ** or zero if it is not (or if pWal is NULL).

	10853 */

	10854 SQLITE_PRIVATE int sqlite3WalFramesize(Wal *pWal){

	10855 assert( pWal==0 \|\| pWal->readLock>=0 );

	10856 return (pWal ? pWal->szPage : 0);

	10857 }

	10858 #endif

	10859

	10860 /* Return the sqlite3_file object for the WAL file

	10861 */

	10862 SQLITE_PRIVATE sqlite3_file sqlite3WalFile(Wal pWal){

	10863 return pWal->pWalFd;

	10864 }

	10865

	10866 #endif /* #ifndef SQLITE_OMIT_WAL */

	10867

	10868 /************ End of wal.c ***********************************************/

	10869 /************ Begin file btmutex.c ***************************************/

	10870 /*

	10871 ** 2007 August 27

	10872 **

	10873 ** The author disclaims copyright to this source code. In place of

	10874 ** a legal notice, here is a blessing:

	10875 **

	10876 ** May you do good and not evil.

	10877 ** May you find forgiveness for yourself and forgive others.

	10878 ** May you share freely, never taking more than you give.

	10879 **

	10880 *************************************************************************

	10881 **

	10882 ** This file contains code used to implement mutexes on Btree objects.

	10883 ** This code really belongs in btree.c. But btree.c is getting too

	10884 ** big and we want to break it down some. This package seemed like

	10885 ** a good breakout.

	10886 */

	10887 /************ Include btreeInt.h in the middle of btmutex.c **************/

	10888 /************ Begin file btreeInt.h **************************************/

	10889 /*

	10890 ** 2004 April 6

	10891 **

	10892 ** The author disclaims copyright to this source code. In place of

	10893 ** a legal notice, here is a blessing:

	10894 **

	10895 ** May you do good and not evil.

	10896 ** May you find forgiveness for yourself and forgive others.

	10897 ** May you share freely, never taking more than you give.

	10898 **

	10899 *************************************************************************

	10900 ** This file implements an external (disk-based) database using BTrees.

	10901 ** For a detailed discussion of BTrees, refer to

	10902 **

	10903 ** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:

	10904 ** "Sorting And Searching", pages 473-480. Addison-Wesley

	10905 ** Publishing Company, Reading, Massachusetts.

	10906 **

	10907 ** The basic idea is that each page of the file contains N database

	10908 ** entries and N+1 pointers to subpages.

	10909 **

	10910 ** ----------------------------------------------------------------

	10911 ** |  Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N-1) | Ptr(N) |

	10912 ** ----------------------------------------------------------------

	10913 **

	10914 ** All of the keys on the page that Ptr(0) points to have values less

	10915 ** than Key(0). All of the keys on page Ptr(1) and its subpages have

	10916 ** values greater than Key(0) and less than Key(1). All of the keys

	10917 ** on Ptr(N) and its subpages have values greater than Key(N-1). And

	10918 ** so forth.

	10919 **

	10920 ** Finding a particular key requires reading O(log(M)) pages from the

	10921 ** disk where M is the number of entries in the tree.

	10922 **

	10923 ** In this implementation, a single file can hold one or more separate

	10924 ** BTrees. Each BTree is identified by the index of its root page. The

	10925 ** key and data for any entry are combined to form the "payload". A

	10926 ** fixed amount of payload can be carried directly on the database

	10927 ** page. If the payload is larger than the preset amount then surplus

	10928 ** bytes are stored on overflow pages. The payload for an entry

	10929 ** and the preceding pointer are combined to form a "Cell". Each

	10930 ** page has a small header which contains the Ptr(N) pointer and other

	10931 ** information such as the size of key and data.

	10932 **

	10933 ** FORMAT DETAILS

	10934 **

	10935 ** The file is divided into pages. The first page is called page 1,

	10936 ** the second is page 2, and so forth. A page number of zero indicates

	10937 ** "no such page". The page size can be any power of 2 between 512 and 65536.

	10938 ** Each page can be either a btree page, a freelist page, an overflow

	10939 ** page, or a pointer-map page.

	10940 **

	10941 ** The first page is always a btree page. The first 100 bytes of the first

	10942 ** page contain a special header (the "file header") that describes the file.

	10943 ** The format of the file header is as follows:

	10944 **

	10945 ** OFFSET SIZE DESCRIPTION

	10946 ** 0 16 Header string: "SQLite format 3\000"

	10947 ** 16 2 Page size in bytes. (1 means 65536)

	10948 ** 18 1 File format write version

	10949 ** 19 1 File format read version

	10950 ** 20 1 Bytes of unused space at the end of each page

	10951 ** 21 1 Max embedded payload fraction (must be 64)

	10952 ** 22 1 Min embedded payload fraction (must be 32)

	10953 ** 23 1 Min leaf payload fraction (must be 32)

	10954 ** 24 4 File change counter

	10955 ** 28 4 Reserved for future use

	10956 ** 32 4 First freelist page

	10957 ** 36 4 Number of freelist pages in the file

	10958 ** 40 60 15 4-byte meta values passed to higher layers

	10959 **

	10960 ** 40 4 Schema cookie

	10961 ** 44 4 File format of schema layer

	10962 ** 48 4 Size of page cache

	10963 ** 52 4 Largest root-page (auto/incr_vacuum)

	10964 ** 56 4 1=UTF-8 2=UTF16le 3=UTF16be

	10965 ** 60 4 User version

	10966 ** 64 4 Incremental vacuum mode

	10967 ** 68 4 Application-ID

	10968 ** 72 20 unused

	10969 ** 92 4 The version-valid-for number

	10970 ** 96 4 SQLITE_VERSION_NUMBER

	10971 **

	10972 ** All of the integer values are big-endian (most significant byte first).

	10973 **

	10974 ** The file change counter is incremented when the database is changed.

	10975 ** This counter allows other processes to know when the file has changed

	10976 ** and thus when they need to flush their cache.

	10977 **

	10978 ** The max embedded payload fraction is the amount of the total usable

	10979 ** space in a page that can be consumed by a single cell for standard

	10980 ** B-tree (non-LEAFDATA) tables. A value of 255 means 100%. The default

	10981 ** is to limit the maximum cell size so that at least 4 cells will fit

	10982 ** on one page. Thus the default max embedded payload fraction is 64.

	10983 **

	10984 ** If the payload for a cell is larger than the max payload, then extra

	10985 ** payload is spilled to overflow pages. Once an overflow page is allocated,

	10986 ** as many bytes as possible are moved into the overflow pages without letting

	10987 ** the cell size drop below the min embedded payload fraction.

	10988 **

	10989 ** The min leaf payload fraction is like the min embedded payload fraction

	10990 ** except that it applies to leaf nodes in a LEAFDATA tree. The maximum

	10991 ** payload fraction for a LEAFDATA tree is always 100% (or 255) and is

	10992 ** not specified in the header.

	10993 **

	10994 ** Each btree page is divided into three sections: The header, the

	10995 ** cell pointer array, and the cell content area. Page 1 also has a 100-byte

	10996 ** file header that occurs before the page header.

	10997 **

	10998 **      |----------------|

	10999 **      | file header    |   100 bytes.  Page 1 only.

	11000 **      |----------------|

	11001 **      | page header    |   8 bytes for leaves.  12 bytes for interior nodes

	11002 **      |----------------|

	11003 **      | cell pointer   |   |  2 bytes per cell.  Sorted order.

	11004 **      | array          |   |  Grows downward

	11005 **      |                |   v

	11006 **      |----------------|

	11007 **      | unallocated    |

	11008 **      | space          |

	11009 **      |----------------|   ^  Grows upwards

	11010 **      | cell content   |   |  Arbitrary order interspersed with freeblocks.

	11011 **      | area           |   |  and free space fragments.

	11012 **      |----------------|

	11013 **

	11014 ** The page header looks like this:

	11015 **

	11016 ** OFFSET SIZE DESCRIPTION

	11017 ** 0 1 Flags. 1: intkey, 2: zerodata, 4: leafdata, 8: leaf

	11018 ** 1 2 byte offset to the first freeblock

	11019 ** 3 2 number of cells on this page

	11020 ** 5 2 first byte of the cell content area

	11021 ** 7 1 number of fragmented free bytes

	11022 ** 8 4 Right child (the Ptr(N) value). Omitted on leaves.

	11023 **

	11024 ** The flags define the format of this btree page. The leaf flag means that

	11025 ** this page has no children. The zerodata flag means that this page carries

	11026 ** only keys and no data. The intkey flag means that the key is an integer

	11027 ** which is stored in the key size entry of the cell header rather than in

	11028 ** the payload area.

	11029 **

	11030 ** The cell pointer array begins on the first byte after the page header.

	11031 ** The cell pointer array contains zero or more 2-byte numbers which are

	11032 ** offsets from the beginning of the page to the cell content in the cell

	11033 ** content area. The cell pointers occur in sorted order. The system strives

	11034 ** to keep free space after the last cell pointer so that new cells can

	11035 ** be easily added without having to defragment the page.

	11036 **

	11037 ** Cell content is stored at the very end of the page and grows toward the

	11038 ** beginning of the page.

	11039 **

	11040 ** Unused space within the cell content area is collected into a linked list of

	11041 ** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset

	11042 ** to the first freeblock is given in the header. Freeblocks occur in

	11043 ** increasing order. Because a freeblock must be at least 4 bytes in size,

	11044 ** any group of 3 or fewer unused bytes in the cell content area cannot

	11045 ** exist on the freeblock chain. A group of 3 or fewer free bytes is called

	11046 ** a fragment. The total number of bytes in all fragments is recorded

	11047 ** in the page header at offset 7.

	11048 **

	11049 ** SIZE DESCRIPTION

	11050 ** 2 Byte offset of the next freeblock

	11051 ** 2 Bytes in this freeblock

	11052 **

	11053 ** Cells are of variable length. Cells are stored in the cell content area at

	11054 ** the end of the page. Pointers to the cells are in the cell pointer array

	11055 ** that immediately follows the page header. Cells are not necessarily

	11056 ** contiguous or in order, but cell pointers are contiguous and in order.

	11057 **

	11058 ** Cell content makes use of variable length integers. A variable

	11059 ** length integer is 1 to 9 bytes where the lower 7 bits of each

	11060 ** byte are used. The integer consists of all bytes that have bit 8 set and

	11061 ** the first byte with bit 8 clear. The most significant byte of the integer

	11062 ** appears first. A variable-length integer may not be more than 9 bytes long.

	11063 ** As a special case, all 8 bits of the 9th byte are used as data. This

	11064 ** allows a 64-bit integer to be encoded in 9 bytes.

	11065 **

	11066 ** 0x00 becomes 0x00000000

	11067 ** 0x7f becomes 0x0000007f

	11068 ** 0x81 0x00 becomes 0x00000080

	11069 ** 0x82 0x00 becomes 0x00000100

	11070 ** 0x80 0x7f becomes 0x0000007f

	11071 ** 0x8a 0x91 0xd1 0xac 0x78 becomes 0x12345678

	11072 ** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081

	11073 **

	11074 ** Variable length integers are used for rowids and to hold the number of

	11075 ** bytes of key and data in a btree cell.

	11076 **

	11077 ** The content of a cell looks like this:

	11078 **

	11079 ** SIZE DESCRIPTION

	11080 ** 4 Page number of the left child. Omitted if leaf flag is set.

	11081 ** var Number of bytes of data. Omitted if the zerodata flag is set.

	11082 ** var Number of bytes of key. Or the key itself if intkey flag is set.

	11083 ** * Payload

	11084 ** 4 First page of the overflow chain. Omitted if no overflow

	11085 **

	11086 ** Overflow pages form a linked list. Each page except the last is completely

	11087 ** filled with data (pagesize - 4 bytes). The last page can have as little

	11088 ** as 1 byte of data.

	11089 **

	11090 ** SIZE DESCRIPTION

	11091 ** 4 Page number of next overflow page

	11092 ** * Data

	11093 **

	11094 ** Freelist pages come in two subtypes: trunk pages and leaf pages. The

	11095 ** file header points to the first in a linked list of trunk pages. Each trunk

	11096 ** page points to multiple leaf pages. The content of a leaf page is

	11097 ** unspecified. A trunk page looks like this:

	11098 **

	11099 ** SIZE DESCRIPTION

	11100 ** 4 Page number of next trunk page

	11101 ** 4 Number of leaf pointers on this page

	11102 ** * zero or more page numbers of leaves

	11103 */

	11104 /* #include "sqliteInt.h" */

	11105

	11106

	/* The following value is the maximum cell size assuming a maximum page
	** size given above.
	**
	** pBt may be any expression that evaluates to a pointer to an object
	** with a pageSize member; the argument is parenthesized so that
	** expressions such as &bt or p+0 expand correctly.  The result is
	** pageSize-8, cast to int.
	*/
	#define MX_CELL_SIZE(pBt)  ((int)((pBt)->pageSize-8))

	11111

	/* The maximum number of cells on a single page of the database.  This
	** assumes a minimum cell size of 6 bytes  (4 bytes for the cell itself
	** plus 2 bytes for the index to the cell in the page header).  Such
	** small cells will be rare, but they are possible.
	**
	** pBt may be any expression that evaluates to a pointer to an object
	** with a pageSize member; the argument is parenthesized so that
	** expressions such as &bt expand correctly.
	*/
	#define MX_CELL(pBt) (((pBt)->pageSize-8)/6)

	11118

	/* Incomplete-type declarations so the structures that follow can refer
	** to one another by their short typedef names. */
	typedef struct MemPage  MemPage;
	typedef struct BtLock   BtLock;
	typedef struct CellInfo CellInfo;

	11123

	/*
	** Magic string found at the start of every SQLite database file; it
	** identifies the file as a real database.
	**
	** The value can be overridden at compile-time with
	** -DSQLITE_FILE_HEADER="..." on the compiler command-line.  The header
	** must occupy exactly 16 bytes including the zero-terminator, so the
	** string itself should be 15 characters long.  A library built with a
	** custom header cannot read databases produced by the standard tools,
	** and the standard tools cannot read databases it creates.
	*/
	#if !defined(SQLITE_FILE_HEADER)
	#  define SQLITE_FILE_HEADER "SQLite format 3"  /* 15 characters + NUL */
	#endif

	11139

	/*
	** Page type flags.  The first byte of the on-disk image of every BTree
	** page holds an ORed combination of these bits.
	*/
	#define PTF_INTKEY    0x01   /* key is an integer */
	#define PTF_ZERODATA  0x02   /* page carries only keys and no data */
	#define PTF_LEAFDATA  0x04
	#define PTF_LEAF      0x08   /* page has no children */

	11148

	11149 /*

	11150 ** As each page of the file is loaded into memory, an instance of the following

	11151 ** structure is appended and initialized to zero. This structure stores

	11152 ** information about the page that is decoded from the raw file page.

	11153 **

	11154 ** The pParent field points back to the parent page. This allows us to

	11155 ** walk up the BTree from any leaf to the root. Care must be taken to

	11156 ** unref() the parent page pointer when this page is no longer referenced.

	11157 ** The pageDestructor() routine handles that chore.

	11158 **

	11159 ** Access to all fields of this structure is controlled by the mutex

	11160 ** stored in MemPage.pBt->mutex.

	11161 */

	11162 struct MemPage {

	11163 u8 isInit; /* True if previously initialized. MUST BE FIRST! */

	11164 u8 nOverflow; /* Number of overflow cell bodies in aCell[] */

	11165 u8 intKey; /* True if table b-trees. False for index b-trees */

	11166 u8 intKeyLeaf; /* True if the leaf of an intKey table */

	11167 u8 noPayload; /* True if internal intKey page (thus w/o data) */

	11168 u8 leaf; /* True if a leaf page */

	11169 u8 hdrOffset; /* 100 for page 1. 0 otherwise */

	11170 u8 childPtrSize; /* 0 if leaf==1. 4 if leaf==0 */

	11171 u8 max1bytePayload; /* min(maxLocal,127) */

	11172 u8 bBusy; /* Prevent endless loops on corrupt database files */

	11173 u16 maxLocal; /* Copy of BtShared.maxLocal or BtShared.maxLeaf */

	11174 u16 minLocal; /* Copy of BtShared.minLocal or BtShared.minLeaf */

	11175 u16 cellOffset; /* Index in aData of first cell pointer */

	11176 u16 nFree; /* Number of free bytes on the page */

	11177 u16 nCell; /* Number of cells on this page, local and ovfl */

	11178 u16 maskPage; /* Mask for page offset */

	11179 u16 aiOvfl[5]; /* Insert the i-th overflow cell before the aiOvfl-th

	11180 ** non-overflow cell */

	11181 u8 apOvfl[5]; / Pointers to the body of overflow cells */

	11182 BtShared pBt; / Pointer to BtShared that this page is part of */

	11183 u8 aData; / Pointer to disk image of the page data */

	11184 u8 aDataEnd; / One byte past the end of usable data */

	11185 u8 aCellIdx; / The cell index area */

	11186 u8 aDataOfst; / Same as aData for leaves. aData+4 for interior */

	11187 DbPage pDbPage; / Pager page handle */

	11188 u16 (xCellSize)(MemPage,u8); / cellSizePtr method */

	11189 void (xParseCell)(MemPage,u8,CellInfo); /* btreeParseCell method */

	11190 Pgno pgno; /* Page number for this page */

	11191 };

	11192

	/*
	** Auxiliary information is appended to the end of the in-memory image
	** of each disk page.  EXTRA_SIZE is the number of bytes needed to hold
	** that information, which is the size of one MemPage object.
	*/
	#define EXTRA_SIZE sizeof(MemPage)

	11199

	11200 /*

	11201 ** A linked list of the following structures is stored at BtShared.pLock.

	11202 ** Locks are added (or upgraded from READ_LOCK to WRITE_LOCK) when a cursor

	11203 ** is opened on the table with root page BtShared.iTable. Locks are removed

	11204 ** from this list when a transaction is committed or rolled back, or when

	11205 ** a btree handle is closed.

	11206 */

	11207 struct BtLock {

	11208 Btree pBtree; / Btree handle holding this lock */

	11209 Pgno iTable; /* Root page of table */

	11210 u8 eLock; /* READ_LOCK or WRITE_LOCK */

	11211 BtLock pNext; / Next in BtShared.pLock list */

	11212 };

	11213

	/* The two values that the BtLock.eLock field may hold. */
	#define READ_LOCK     1
	#define WRITE_LOCK    2

	11217

	11218 /* A Btree handle

	11219 **

	11220 ** A database connection contains a pointer to an instance of

	11221 ** this object for every database file that it has open. This structure

	11222 ** is opaque to the database connection. The database connection cannot

	11223 ** see the internals of this structure and only deals with pointers to

	11224 ** this structure.

	11225 **

	11226 ** For some database files, the same underlying database cache might be

	11227 ** shared between multiple connections. In that case, each connection

	11228 ** has it own instance of this object. But each instance of this object

	11229 ** points to the same BtShared object. The database cache and the

	11230 ** schema associated with the database file are all contained within

	11231 ** the BtShared object.

	11232 **

	11233 ** All fields in this structure are accessed under sqlite3.mutex.

	11234 ** The pBt pointer itself may not be changed while there exists cursors

	11235 ** in the referenced BtShared that point back to this Btree since those

	11236 ** cursors have to go through this Btree to find their BtShared and

	11237 ** they often do so without holding sqlite3.mutex.

	11238 */

	11239 struct Btree {

	11240 sqlite3 db; / The database connection holding this btree */

	11241 BtShared pBt; / Sharable content of this btree */

	11242 u8 inTrans; /* TRANS_NONE, TRANS_READ or TRANS_WRITE */

	11243 u8 sharable; /* True if we can share pBt with another db */

	11244 u8 locked; /* True if db currently has pBt locked */

	11245 u8 hasIncrblobCur; /* True if there are one or more Incrblob cursors */

	11246 int wantToLock; /* Number of nested calls to sqlite3BtreeEnter() */

	11247 int nBackup; /* Number of backup operations reading this btree */

	11248 u32 iDataVersion; /* Combines with pBt->pPager->iDataVersion */

	11249 Btree pNext; / List of other sharable Btrees from the same db */

	11250 Btree pPrev; / Back pointer of the same list */

	11251 #ifndef SQLITE_OMIT_SHARED_CACHE

	11252 BtLock lock; /* Object used to lock page 1 */

	11253 #endif

	11254 };

	11255

	11256 /*

	11257 ** Btree.inTrans may take one of the following values.

	11258 **

	11259 ** If the shared-data extension is enabled, there may be multiple users

	11260 ** of the Btree structure. At most one of these may open a write transaction,

	11261 ** but any number may have active read transactions.

	11262 */

	#define TRANS_NONE  0
	#define TRANS_READ  1
	#define TRANS_WRITE 2

	11266

	11267 /*

	11268 ** An instance of this object represents a single database file.

	11269 **

	11270 ** A single database file can be in use at the same time by two

	11271 ** or more database connections. When two or more connections are

	11272 ** sharing the same database file, each connection has its own

	11273 ** private Btree object for the file and each of those Btrees points

	11274 ** to this one BtShared object. BtShared.nRef is the number of

	11275 ** connections currently sharing this database file.

	11276 **

	11277 ** Fields in this structure are accessed under the BtShared.mutex

	11278 ** mutex, except for nRef and pNext which are accessed under the

	11279 ** global SQLITE_MUTEX_STATIC_MASTER mutex. The pPager field

	11280 ** may not be modified once it is initially set as long as nRef>0.

	11281 ** The pSchema field may be set once under BtShared.mutex and

	11282 ** thereafter is unchanged as long as nRef>0.

	11283 **

	11284 ** isPending:

	11285 **

	11286 ** If a BtShared client fails to obtain a write-lock on a database

	11287 ** table (because there exists one or more read-locks on the table),

	11288 ** the shared-cache enters 'pending-lock' state and isPending is

	11289 ** set to true.

	11290 **

	11291 ** The shared-cache leaves the 'pending lock' state when either of

	11292 ** the following occur:

	11293 **

	11294 ** 1) The current writer (BtShared.pWriter) concludes its transaction, OR

	11295 ** 2) The number of locks held by other connections drops to zero.

	11296 **

	11297 ** while in the 'pending-lock' state, no connection may start a new

	11298 ** transaction.

	11299 **

	11300 ** This feature is included to help prevent writer-starvation.

	11301 */

	11302 struct BtShared {

	11303 Pager pPager; / The page cache */

	11304 sqlite3 db; / Database connection currently using this Btree */

	11305 BtCursor pCursor; / A list of all open cursors */

	11306 MemPage pPage1; / First page of the database */

	11307 u8 openFlags; /* Flags to sqlite3BtreeOpen() */

	11308 #ifndef SQLITE_OMIT_AUTOVACUUM

	11309 u8 autoVacuum; /* True if auto-vacuum is enabled */

	11310 u8 incrVacuum; /* True if incr-vacuum is enabled */

	11311 u8 bDoTruncate; /* True to truncate db on commit */

	11312 #endif

	11313 u8 inTransaction; /* Transaction state */

	11314 u8 max1bytePayload; /* Maximum first byte of cell for a 1-byte payload */

	11315 #ifdef SQLITE_HAS_CODEC

	11316 u8 optimalReserve; /* Desired amount of reserved space per page */

	11317 #endif

	11318 u16 btsFlags; /* Boolean parameters. See BTS_* macros below */

	11319 u16 maxLocal; /* Maximum local payload in non-LEAFDATA tables */

	11320 u16 minLocal; /* Minimum local payload in non-LEAFDATA tables */

	11321 u16 maxLeaf; /* Maximum local payload in a LEAFDATA table */

	11322 u16 minLeaf; /* Minimum local payload in a LEAFDATA table */

	11323 u32 pageSize; /* Total number of bytes on a page */

	11324 u32 usableSize; /* Number of usable bytes on each page */

	11325 int nTransaction; /* Number of open transactions (read + write) */

	11326 u32 nPage; /* Number of pages in the database */

	11327 void pSchema; / Pointer to space allocated by sqlite3BtreeSchema() */

	11328 void (xFreeSchema)(void); /* Destructor for BtShared.pSchema */

	11329 sqlite3_mutex mutex; / Non-recursive mutex required to access this object */

	11330 Bitvec pHasContent; / Set of pages moved to free-list this transaction */

	11331 #ifndef SQLITE_OMIT_SHARED_CACHE

	11332 int nRef; /* Number of references to this structure */

	11333 BtShared pNext; / Next on a list of sharable BtShared structs */

	11334 BtLock pLock; / List of locks held on this shared-btree struct */

	11335 Btree pWriter; / Btree with currently open write transaction */

	11336 #endif

	11337 u8 pTmpSpace; / Temp space sufficient to hold a single cell */

	11338 };

	11339

	11340 /*

	11341 ** Allowed values for BtShared.btsFlags

	11342 */

	#define BTS_READ_ONLY        0x0001   /* Underlying file is readonly */
	#define BTS_PAGESIZE_FIXED   0x0002   /* Page size can no longer be changed */
	#define BTS_SECURE_DELETE    0x0004   /* PRAGMA secure_delete is enabled */
	#define BTS_INITIALLY_EMPTY  0x0008   /* Database was empty at trans start */
	#define BTS_NO_WAL           0x0010   /* Do not open write-ahead-log files */
	#define BTS_EXCLUSIVE        0x0020   /* pWriter has an exclusive lock */
	#define BTS_PENDING          0x0040   /* Waiting for read-locks to clear */

	11350

	11351 /*

	11352 ** An instance of the following structure is used to hold information

	11353 ** about a cell. The parseCellPtr() function fills in this structure

	11354 ** based on information extracted from the raw disk page.

	11355 */

	11356 struct CellInfo {

	11357 i64 nKey; /* The key for INTKEY tables, or nPayload otherwise */

	11358 u8 pPayload; / Pointer to the start of payload */

	11359 u32 nPayload; /* Bytes of payload */

	11360 u16 nLocal; /* Amount of payload held locally, not on overflow */

	11361 u16 nSize; /* Size of the cell content on the main b-tree page */

	11362 };

	11363

	11364 /*

	11365 ** Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than

	11366 ** this will be declared corrupt. This value is calculated based on a

	11367 ** maximum database size of 2^31 pages and a minimum fanout of 2 for a

	11368 ** root-node and 3 for all other internal nodes.

	11369 **

	11370 ** If a tree that appears to be taller than this is encountered, it is

	11371 ** assumed that the database is corrupt.

	11372 */

	#define BTCURSOR_MAX_DEPTH 20

	11374

	11375 /*

	11376 ** A cursor is a pointer to a particular entry within a particular

	11377 ** b-tree within a database file.

	11378 **

	11379 ** The entry is identified by its MemPage and the index in

	11380 ** MemPage.aCell[] of the entry.

	11381 **

	11382 ** A single database file can be shared by two or more database connections,

	11383 ** but cursors cannot be shared. Each cursor is associated with a

	11384 ** particular database connection identified by BtCursor.pBtree.db.

	11385 **

	11386 ** Fields in this structure are accessed under the BtShared.mutex

	11387 ** found at self->pBt->mutex.

	11388 **

	11389 ** skipNext meaning:

	11390 ** eState==SKIPNEXT && skipNext>0: Next sqlite3BtreeNext() is no-op.

	11391 ** eState==SKIPNEXT && skipNext<0: Next sqlite3BtreePrevious() is no-op.

	11392 ** eState==FAULT: Cursor fault with skipNext as error code.

	11393 */

	11394 struct BtCursor {

	11395 Btree pBtree; / The Btree to which this cursor belongs */

	11396 BtShared pBt; / The BtShared this cursor points to */

	11397 BtCursor pNext; / Forms a linked list of all cursors */

	11398 Pgno aOverflow; / Cache of overflow page locations */

	11399 CellInfo info; /* A parse of the cell we are pointing at */

	11400 i64 nKey; /* Size of pKey, or last integer key */

	11401 void pKey; / Saved key that was cursor last known position */

	11402 Pgno pgnoRoot; /* The root page of this tree */

	11403 int nOvflAlloc; /* Allocated size of aOverflow[] array */

	11404 int skipNext; /* Prev() is noop if negative. Next() is noop if positive.

	11405 ** Error code if eState==CURSOR_FAULT */

	11406 u8 curFlags; /* zero or more BTCF_* flags defined below */

	11407 u8 curPagerFlags; /* Flags to send to sqlite3PagerGet() */

	11408 u8 eState; /* One of the CURSOR_XXX constants (see below) */

	11409 u8 hints; /* As configured by CursorSetHints() */

	11410 /* All fields above are zeroed when the cursor is allocated. See

	11411 ** sqlite3BtreeCursorZero(). Fields that follow must be manually

	11412 ** initialized. */

	11413 i8 iPage; /* Index of current page in apPage */

	11414 u8 curIntKey; /* Value of apPage[0]->intKey */

	11415 struct KeyInfo pKeyInfo; / Argument passed to comparison function */

	11416 void padding1; / Make object size a multiple of 16 */

	11417 u16 aiIdx[BTCURSOR_MAX_DEPTH]; /* Current index in apPage[i] */

	11418 MemPage apPage[BTCURSOR_MAX_DEPTH]; / Pages from root to current page */

	11419 };

	11420

	11421 /*

	11422 ** Legal values for BtCursor.curFlags

	11423 */

	#define BTCF_WriteFlag    0x01   /* True if a write cursor */
	#define BTCF_ValidNKey    0x02   /* True if info.nKey is valid */
	#define BTCF_ValidOvfl    0x04   /* True if aOverflow is valid */
	#define BTCF_AtLast       0x08   /* Cursor is pointing to the last entry */
	#define BTCF_Incrblob     0x10   /* True if an incremental I/O handle */
	#define BTCF_Multiple     0x20   /* Maybe another cursor on the same btree */

	11430

	11431 /*

	11432 ** Potential values for BtCursor.eState.

	11433 **

	11434 ** CURSOR_INVALID:

	11435 ** Cursor does not point to a valid entry. This can happen (for example)

	11436 ** because the table is empty or because BtreeCursorFirst() has not been

	11437 ** called.

	11438 **

	11439 ** CURSOR_VALID:

	11440 ** Cursor points to a valid entry. getPayload() etc. may be called.

	11441 **

	11442 ** CURSOR_SKIPNEXT:

	11443 ** Cursor is valid except that the Cursor.skipNext field is non-zero

	11444 ** indicating that the next sqlite3BtreeNext() or sqlite3BtreePrevious()

	11445 ** operation should be a no-op.

	11446 **

	11447 ** CURSOR_REQUIRESEEK:

	11448 ** The table that this cursor was opened on still exists, but has been

	11449 ** modified since the cursor was last used. The cursor position is saved

	11450 ** in variables BtCursor.pKey and BtCursor.nKey. When a cursor is in

	11451 ** this state, restoreCursorPosition() can be called to attempt to

	11452 ** seek the cursor to the saved position.

	11453 **

	11454 ** CURSOR_FAULT:

	11455 ** An unrecoverable error (an I/O error or a malloc failure) has occurred

	11456 ** on a different connection that shares the BtShared cache with this

	11457 ** cursor. The error has left the cache in an inconsistent state.

	11458 ** Do nothing else with this cursor. Any attempt to use the cursor

	11459 ** should return the error code stored in BtCursor.skipNext

	11460 */

	#define CURSOR_INVALID           0
	#define CURSOR_VALID             1
	#define CURSOR_SKIPNEXT          2
	#define CURSOR_REQUIRESEEK       3
	#define CURSOR_FAULT             4

	11466

	11467 /*

	11468 ** The database page the PENDING_BYTE occupies. This page is never used.

	11469 */

	# define PENDING_BYTE_PAGE(pBt) PAGER_MJ_PGNO(pBt)

	11471

	11472 /*

	11473 ** These macros define the location of the pointer-map entry for a

	11474 ** database page. The first argument to each is the number of usable

	11475 ** bytes on each page of the database (often 1024). The second is the

	11476 ** page number to look up in the pointer map.

	11477 **

	11478 ** PTRMAP_PAGENO returns the database page number of the pointer-map

	11479 ** page that stores the required pointer. PTRMAP_PTROFFSET returns

	11480 ** the offset of the requested map entry.

	11481 **

	11482 ** If the pgno argument passed to PTRMAP_PAGENO is a pointer-map page,

	11483 ** then pgno is returned. So (pgno==PTRMAP_PAGENO(pgsz, pgno)) can be

	11484 ** used to test if pgno is a pointer-map page. PTRMAP_ISPAGE implements

	11485 ** this test.

	11486 */

	#define PTRMAP_PAGENO(pBt, pgno) ptrmapPageno(pBt, pgno)
	/* Each pointer-map entry is 5 bytes; arguments are parenthesized so that
	** compound expressions such as (a+b) expand with the intended grouping. */
	#define PTRMAP_PTROFFSET(pgptrmap, pgno) (5*((pgno)-(pgptrmap)-1))
	#define PTRMAP_ISPAGE(pBt, pgno) (PTRMAP_PAGENO((pBt),(pgno))==(pgno))

	11490

	11491 /*

	11492 ** The pointer map is a lookup table that identifies the parent page for

	11493 ** each child page in the database file. The parent page is the page that

	11494 ** contains a pointer to the child. Every page in the database contains

	11495 ** 0 or 1 parent pages. (In this context 'database page' refers

	11496 ** to any page that is not part of the pointer map itself.) Each pointer map

	11497 ** entry consists of a single byte 'type' and a 4 byte parent page number.

	11498 ** The PTRMAP_XXX identifiers below are the valid types.

	11499 **

	11500 ** The purpose of the pointer map is to facilitate moving pages from one

	11501 ** position in the file to another as part of autovacuum. When a page

	11502 ** is moved, the pointer in its parent must be updated to point to the

	11503 ** new location. The pointer map is used to locate the parent page quickly.

	11504 **

	11505 ** PTRMAP_ROOTPAGE: The database page is a root-page. The page-number is not

	11506 ** used in this case.

	11507 **

	11508 ** PTRMAP_FREEPAGE: The database page is an unused (free) page. The page-number

	11509 ** is not used in this case.

	11510 **

	11511 ** PTRMAP_OVERFLOW1: The database page is the first page in a list of

	11512 ** overflow pages. The page number identifies the page that

	11513 ** contains the cell with a pointer to this overflow page.

	11514 **

	11515 ** PTRMAP_OVERFLOW2: The database page is the second or later page in a list of

	11516 ** overflow pages. The page-number identifies the previous

	11517 ** page in the overflow page list.

	11518 **

	11519 ** PTRMAP_BTREE: The database page is a non-root btree page. The page number

	11520 ** identifies the parent page in the btree.

	11521 */

	#define PTRMAP_ROOTPAGE 1
	#define PTRMAP_FREEPAGE 2
	#define PTRMAP_OVERFLOW1 3
	#define PTRMAP_OVERFLOW2 4
	#define PTRMAP_BTREE 5

	11527

	11528 /* A bunch of assert() statements to check the transaction state variables

	11529 ** of handle p (type Btree*) are internally consistent.

	11530 */

	/* Expands to two assert() statements; the argument is parenthesized so that
	** any pointer-valued expression may be passed. */
	#define btreeIntegrity(p) \
	  assert( (p)->pBt->inTransaction!=TRANS_NONE || (p)->pBt->nTransaction==0 ); \
	  assert( (p)->pBt->inTransaction>=(p)->inTrans );

	11534

	11535

	11536 /*

	11537 ** The ISAUTOVACUUM macro is used within balance_nonroot() to determine

	11538 ** if the database supports auto-vacuum or not. Because it is used

	11539 ** within an expression that is an argument to another macro

	11540 ** (sqliteMallocRaw), it is not possible to use conditional compilation.

	11541 ** So, this macro is defined instead.

	11542 */

	/* Note: the non-omit form reads a variable named pBt at the point of use. */
	#ifndef SQLITE_OMIT_AUTOVACUUM
	#define ISAUTOVACUUM (pBt->autoVacuum)
	#else
	#define ISAUTOVACUUM 0
	#endif

	11548

	11549

	11550 /*

	11551 ** This structure is passed around through all the sanity checking routines

	11552 ** in order to keep track of some global state information.

	11553 **

	11554 ** The aRef[] array is allocated so that there is 1 bit for each page in

	11555 ** the database. As the integrity-check proceeds, for each page used in

	11556 ** the database the corresponding bit is set. This allows integrity-check to

	11557 ** detect pages that are used twice and orphaned pages (both of which

	11558 ** indicate corruption).

	11559 */

	11560 typedef struct IntegrityCk IntegrityCk;

	11561 struct IntegrityCk {

	11562 BtShared pBt; / The tree being checked out */

	11563 Pager pPager; / The associated pager. Also accessible by pBt->pPager */

	11564 u8 aPgRef; / 1 bit per page in the db (see above) */

	11565 Pgno nPage; /* Number of pages in the database */

	11566 int mxErr; /* Stop accumulating errors when this reaches zero */

	11567 int nErr; /* Number of messages written to zErrMsg so far */

	11568 int mallocFailed; /* A memory allocation error has occurred */

	11569 const char zPfx; / Error message prefix */

	11570 int v1, v2; /* Values for up to two %d fields in zPfx */

	11571 StrAccum errMsg; /* Accumulate the error message text here */

	11572 u32 heap; / Min-heap used for analyzing cell coverage */

	11573 };

	11574

	11575 /*

	11576 ** Routines to read or write two- and four-byte big-endian integer values.

	11577 */

	/* x and p are expected to index as unsigned bytes; the high byte comes first. */
	#define get2byte(x)   ((x)[0]<<8 | (x)[1])
	#define put2byte(p,v) ((p)[0] = (u8)((v)>>8), (p)[1] = (u8)(v))
	#define get4byte sqlite3Get4byte
	#define put4byte sqlite3Put4byte

	11582

	11583 /*

	11584 ** get2byteAligned(), unlike get2byte(), requires that its argument point to a

	11585 ** two-byte aligned address. get2byteAligned() is only used for accessing the

	11586 ** cell addresses in a btree header.

	11587 */

	/* The first three variants load a u16 through the (aligned) pointer and
	** byte-swap it where the host is little-endian; the fallback assembles the
	** value one byte at a time. */
	#if SQLITE_BYTEORDER==4321
	# define get2byteAligned(x)  (*(u16*)(x))
	#elif SQLITE_BYTEORDER==1234 && !defined(SQLITE_DISABLE_INTRINSIC) \
	    && GCC_VERSION>=4008000
	# define get2byteAligned(x)  __builtin_bswap16(*(u16*)(x))
	#elif SQLITE_BYTEORDER==1234 && !defined(SQLITE_DISABLE_INTRINSIC) \
	    && defined(_MSC_VER) && _MSC_VER>=1300
	# define get2byteAligned(x)  _byteswap_ushort(*(u16*)(x))
	#else
	# define get2byteAligned(x)  ((x)[0]<<8 | (x)[1])
	#endif

	11599

	11600 /************ End of btreeInt.h ******************************************/

	11601 /************ Continuing where we left off in btmutex.c ******************/

	11602 #ifndef SQLITE_OMIT_SHARED_CACHE

	11603 #if SQLITE_THREADSAFE

	11604

	11605 /*

	11606 ** Obtain the BtShared mutex associated with B-Tree handle p. Also,

	11607 ** set BtShared.db to the database handle associated with p and the

	11608 ** p->locked boolean to true.

	11609 */

	11610 static void lockBtreeMutex(Btree *p){

	11611 assert( p->locked==0 );

	11612 assert( sqlite3_mutex_notheld(p->pBt->mutex) );

	11613 assert( sqlite3_mutex_held(p->db->mutex) );

	11614

	11615 sqlite3_mutex_enter(p->pBt->mutex);

	11616 p->pBt->db = p->db;

	11617 p->locked = 1;

	11618 }

	11619

	11620 /*

	11621 ** Release the BtShared mutex associated with B-Tree handle p and

	11622 ** clear the p->locked boolean.

	11623 */

	11624 static void SQLITE_NOINLINE unlockBtreeMutex(Btree *p){

	11625 BtShared *pBt = p->pBt;

	11626 assert( p->locked==1 );

	11627 assert( sqlite3_mutex_held(pBt->mutex) );

	11628 assert( sqlite3_mutex_held(p->db->mutex) );

	11629 assert( p->db==pBt->db );

	11630

	11631 sqlite3_mutex_leave(pBt->mutex);

	11632 p->locked = 0;

	11633 }

	11634

	11635 /* Forward reference */

	11636 static void SQLITE_NOINLINE btreeLockCarefully(Btree *p);

	11637

	11638 /*

	11639 ** Enter a mutex on the given BTree object.

	11640 **

	11641 ** If the object is not sharable, then no mutex is ever required

	11642 ** and this routine is a no-op. The underlying mutex is non-recursive.

	11643 ** But we keep a reference count in Btree.wantToLock so the behavior

	11644 ** of this interface is recursive.

	11645 **

	11646 ** To avoid deadlocks, multiple Btrees are locked in the same order

	11647 ** by all database connections. The p->pNext is a list of other

	11648 ** Btrees belonging to the same database connection as the p Btree

	11649 ** which need to be locked after p. If we cannot get a lock on

	11650 ** p, then first unlock all of the others on p->pNext, then wait

	11651 ** for the lock to become available on p, then relock all of the

	11652 ** subsequent Btrees that desire a lock.

	11653 */

	11654 SQLITE_PRIVATE void sqlite3BtreeEnter(Btree *p){

	11655 /* Some basic sanity checking on the Btree. The list of Btrees

	11656 ** connected by pNext and pPrev should be in sorted order by

	11657 ** Btree.pBt value. All elements of the list should belong to

	11658 ** the same connection. Only shared Btrees are on the list. */

	11659 assert( p->pNext==0 \|\| p->pNext->pBt>p->pBt );

	11660 assert( p->pPrev==0 \|\| p->pPrev->pBt<p->pBt );

	11661 assert( p->pNext==0 \|\| p->pNext->db==p->db );

	11662 assert( p->pPrev==0 \|\| p->pPrev->db==p->db );

	11663 assert( p->sharable \|\| (p->pNext==0 && p->pPrev==0) );

	11664

	11665 /* Check for locking consistency */

	11666 assert( !p->locked \|\| p->wantToLock>0 );

	11667 assert( p->sharable \|\| p->wantToLock==0 );

	11668

	11669 /* We should already hold a lock on the database connection */

	11670 assert( sqlite3_mutex_held(p->db->mutex) );

	11671

	11672 /* Unless the database is sharable and unlocked, then BtShared.db

	11673 ** should already be set correctly. */

	11674 assert( (p->locked==0 && p->sharable) \|\| p->pBt->db==p->db );

	11675

	11676 if( !p->sharable ) return;

	11677 p->wantToLock++;

	11678 if( p->locked ) return;

	11679 btreeLockCarefully(p);

	11680 }

	11681

	11682 /* This is a helper function for sqlite3BtreeEnter(). By moving

	11683 ** complex, but seldom used logic, out of sqlite3BtreeEnter() and

	11684 ** into this routine, we avoid unnecessary stack pointer changes

	11685 ** and thus help the sqlite3BtreeEnter() routine to run much faster

	11686 ** in the common case.

	11687 */

	11688 static void SQLITE_NOINLINE btreeLockCarefully(Btree *p){

	11689 Btree *pLater;

	11690

	11691 /* In most cases, we should be able to acquire the lock we

	11692 ** want without having to go through the ascending lock

	11693 ** procedure that follows. Just be sure not to block.

	11694 */

	11695 if( sqlite3_mutex_try(p->pBt->mutex)==SQLITE_OK ){

	11696 p->pBt->db = p->db;

	11697 p->locked = 1;

	11698 return;

	11699 }

	11700

	11701 /* To avoid deadlock, first release all locks with a larger

	11702 ** BtShared address. Then acquire our lock. Then reacquire

	11703 ** the other BtShared locks that we used to hold in ascending

	11704 ** order.

	11705 */

	11706 for(pLater=p->pNext; pLater; pLater=pLater->pNext){

	11707 assert( pLater->sharable );

	11708 assert( pLater->pNext==0 \|\| pLater->pNext->pBt>pLater->pBt );

	11709 assert( !pLater->locked \|\| pLater->wantToLock>0 );

	11710 if( pLater->locked ){

	11711 unlockBtreeMutex(pLater);

	11712 }

	11713 }

	11714 lockBtreeMutex(p);

	11715 for(pLater=p->pNext; pLater; pLater=pLater->pNext){

	11716 if( pLater->wantToLock ){

	11717 lockBtreeMutex(pLater);

	11718 }

	11719 }

	11720 }

	11721

	11722

	11723 /*

	11724 ** Exit the recursive mutex on a Btree.

	11725 */

	11726 SQLITE_PRIVATE void sqlite3BtreeLeave(Btree *p){

	11727 assert( sqlite3_mutex_held(p->db->mutex) );

	11728 if( p->sharable ){

	11729 assert( p->wantToLock>0 );

	11730 p->wantToLock--;

	11731 if( p->wantToLock==0 ){

	11732 unlockBtreeMutex(p);

	11733 }

	11734 }

	11735 }

	11736

	11737 #ifndef NDEBUG

	11738 /*

	11739 ** Return true if the BtShared mutex is held on the btree, or if the

	11740 ** B-Tree is not marked as sharable.

	11741 **

	11742 ** This routine is used only from within assert() statements.

	11743 */

	11744 SQLITE_PRIVATE int sqlite3BtreeHoldsMutex(Btree *p){

	11745 assert( p->sharable==0 \|\| p->locked==0 \|\| p->wantToLock>0 );

	11746 assert( p->sharable==0 \|\| p->locked==0 \|\| p->db==p->pBt->db );

	11747 assert( p->sharable==0 \|\| p->locked==0 \|\| sqlite3_mutex_held(p->pBt->mutex) );

	11748 assert( p->sharable==0 \|\| p->locked==0 \|\| sqlite3_mutex_held(p->db->mutex) );

	11749

	11750 return (p->sharable==0 \|\| p->locked);

	11751 }

	11752 #endif

	11753

	11754

	11755 #ifndef SQLITE_OMIT_INCRBLOB

	11756 /*

	11757 ** Enter and leave a mutex on a Btree given a cursor owned by that

	11758 ** Btree. These entry points are used by incremental I/O and can be

	11759 ** omitted if that module is not used.

	11760 */

	11761 SQLITE_PRIVATE void sqlite3BtreeEnterCursor(BtCursor *pCur){

	11762 sqlite3BtreeEnter(pCur->pBtree);

	11763 }

	11764 SQLITE_PRIVATE void sqlite3BtreeLeaveCursor(BtCursor *pCur){

	11765 sqlite3BtreeLeave(pCur->pBtree);

	11766 }

	11767 #endif /* SQLITE_OMIT_INCRBLOB */

	11768

	11769

	11770 /*

	11771 ** Enter the mutex on every Btree associated with a database

	11772 ** connection. This is needed (for example) prior to parsing

	11773 ** a statement since we will be comparing table and column names

	11774 ** against all schemas and we do not want those schemas being

	11775 ** reset out from under us.

	11776 **

	11777 ** There is a corresponding leave-all procedure.

	11778 **

	11779 ** Enter the mutexes in ascending order by BtShared pointer address

	11780 ** to avoid the possibility of deadlock when two threads with

	11781 ** two or more btrees in common both try to lock all their btrees

	11782 ** at the same instant.

	11783 */

	11784 SQLITE_PRIVATE void sqlite3BtreeEnterAll(sqlite3 *db){

	11785 int i;

	11786 Btree *p;

	11787 assert( sqlite3_mutex_held(db->mutex) );

	11788 for(i=0; i<db->nDb; i++){

	11789 p = db->aDb[i].pBt;

	11790 if( p ) sqlite3BtreeEnter(p);

	11791 }

	11792 }

	11793 SQLITE_PRIVATE void sqlite3BtreeLeaveAll(sqlite3 *db){

	11794 int i;

	11795 Btree *p;

	11796 assert( sqlite3_mutex_held(db->mutex) );

	11797 for(i=0; i<db->nDb; i++){

	11798 p = db->aDb[i].pBt;

	11799 if( p ) sqlite3BtreeLeave(p);

	11800 }

	11801 }

	11802

	11803 /*

	11804 ** Return true if a particular Btree requires a lock. Return FALSE if

	11805 ** no lock is ever required since it is not sharable.

	11806 */

	11807 SQLITE_PRIVATE int sqlite3BtreeSharable(Btree *p){

	11808 return p->sharable;

	11809 }

	11810

	11811 #ifndef NDEBUG

	11812 /*

	11813 ** Return true if the current thread holds the database connection

	11814 ** mutex and all required BtShared mutexes.

	11815 **

	11816 ** This routine is used inside assert() statements only.

	11817 */

	11818 SQLITE_PRIVATE int sqlite3BtreeHoldsAllMutexes(sqlite3 *db){

	11819 int i;

	11820 if( !sqlite3_mutex_held(db->mutex) ){

	11821 return 0;

	11822 }

	11823 for(i=0; i<db->nDb; i++){

	11824 Btree *p;

	11825 p = db->aDb[i].pBt;

	11826 if( p && p->sharable &&

	11827 (p->wantToLock==0 \|\| !sqlite3_mutex_held(p->pBt->mutex)) ){

	11828 return 0;

	11829 }

	11830 }

	11831 return 1;

	11832 }

	11833 #endif /* NDEBUG */

	11834

	11835 #ifndef NDEBUG

	11836 /*

	11837 ** Return true if the correct mutexes are held for accessing the

	11838 ** db->aDb[iDb].pSchema structure. The mutexes required for schema

	11839 ** access are:

	11840 **

	11841 ** (1) The mutex on db

	11842 ** (2) if iDb!=1, then the mutex on db->aDb[iDb].pBt.

	11843 **

	11844 ** If pSchema is not NULL, then iDb is computed from pSchema and

	11845 ** db using sqlite3SchemaToIndex().

	11846 */

	11847 SQLITE_PRIVATE int sqlite3SchemaMutexHeld(sqlite3 db, int iDb, Schema pSchema) {

	11848 Btree *p;

	11849 assert( db!=0 );

	11850 if( pSchema ) iDb = sqlite3SchemaToIndex(db, pSchema);

	11851 assert( iDb>=0 && iDb<db->nDb );

	11852 if( !sqlite3_mutex_held(db->mutex) ) return 0;

	11853 if( iDb==1 ) return 1;

	11854 p = db->aDb[iDb].pBt;

	11855 assert( p!=0 );

	11856 return p->sharable==0 \|\| p->locked==1;

	11857 }

	11858 #endif /* NDEBUG */

	11859

	11860 #else /* SQLITE_THREADSAFE>0 above. SQLITE_THREADSAFE==0 below */

	11861 /*

	11862 ** The following are special cases for mutex enter routines for use

	11863 ** in single threaded applications that use shared cache. Except for

	11864 ** these two routines, all mutex operations are no-ops in that case and

	11865 ** are null #defines in btree.h.

	11866 **

	11867 ** If shared cache is disabled, then all btree mutex routines, including

	11868 ** the ones below, are no-ops and are null #defines in btree.h.

	11869 */

	11870

	11871 SQLITE_PRIVATE void sqlite3BtreeEnter(Btree *p){

	11872 p->pBt->db = p->db;

	11873 }

	11874 SQLITE_PRIVATE void sqlite3BtreeEnterAll(sqlite3 *db){

	11875 int i;

	11876 for(i=0; i<db->nDb; i++){

	11877 Btree *p = db->aDb[i].pBt;

	11878 if( p ){

	11879 p->pBt->db = p->db;

	11880 }

	11881 }

	11882 }

	11883 #endif /* if SQLITE_THREADSAFE */

	11884 #endif /* ifndef SQLITE_OMIT_SHARED_CACHE */

	11885

	11886 /************ End of btmutex.c *******************************************/

	11887 /************ Begin file btree.c *****************************************/

	11888 /*

	11889 ** 2004 April 6

	11890 **

	11891 ** The author disclaims copyright to this source code. In place of

	11892 ** a legal notice, here is a blessing:

	11893 **

	11894 ** May you do good and not evil.

	11895 ** May you find forgiveness for yourself and forgive others.

	11896 ** May you share freely, never taking more than you give.

	11897 **

	11898 *************************************************************************

	11899 ** This file implements an external (disk-based) database using BTrees.

	11900 ** See the header comment on "btreeInt.h" for additional information.

	11901 ** Including a description of file format and an overview of operation.

	11902 */

	11903 /* #include "btreeInt.h" */

	11904

	11905 /*

	11906 ** The header string that appears at the beginning of every

	11907 ** SQLite database.

	11908 */

	11909 static const char zMagicHeader[] = SQLITE_FILE_HEADER;

	11910

	11911 /*

	11912 ** Set this global variable to 1 to enable tracing using the TRACE

	11913 ** macro.

	11914 */

	/* Change the "#if 0" to "#if 1" to compile in btree tracing; as written,
	** TRACE(X) expands to nothing and its argument is never evaluated. */
	#if 0
	int sqlite3BtreeTrace=1;  /* True to enable tracing */
	# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
	#else
	# define TRACE(X)
	#endif

	11921

	11922 /*

	11923 ** Extract a 2-byte big-endian integer from an array of unsigned bytes.

	11924 ** But if the value is zero, make it 65536.

	11925 **

	11926 ** This routine is used to extract the "offset to cell content area" value

	11927 ** from the header of a btree page. If the page size is 65536 and the page

	11928 ** is empty, the offset should be 65536, but the 2-byte value stores zero.

	11929 ** This routine makes the necessary adjustment to 65536.

	11930 */

	/* Subtracting 1, masking to 16 bits and adding 1 maps 0 to 65536 and leaves
	** 1..65535 unchanged. */
	#define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)

	11932

	11933 /*

	11934 ** Values passed as the 5th argument to allocateBtreePage()

	11935 */

	#define BTALLOC_ANY   0           /* Allocate any page */
	#define BTALLOC_EXACT 1           /* Allocate exact page if possible */
	#define BTALLOC_LE    2           /* Allocate any page <= the parameter */

	11939

	11940 /*

	11941 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not

	11942 ** defined, or 0 if it is. For example:

	11943 **

	11944 ** bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);

	11945 */

	#ifndef SQLITE_OMIT_AUTOVACUUM
	#define IfNotOmitAV(expr) (expr)
	#else
	#define IfNotOmitAV(expr) 0
	#endif

	11951

	11952 #ifndef SQLITE_OMIT_SHARED_CACHE

	11953 /*

	11954 ** A list of BtShared objects that are eligible for participation

	11955 ** in shared cache. This variable has file scope during normal builds,

	11956 ** but the test harness needs to access it so we make it global for

	11957 ** test builds.

	11958 **

	11959 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.

	11960 */

	11961 #ifdef SQLITE_TEST

	11962 SQLITE_PRIVATE BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;

	11963 #else

	11964 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;

	11965 #endif

	11966 #endif /* SQLITE_OMIT_SHARED_CACHE */

	11967

	11968 #ifndef SQLITE_OMIT_SHARED_CACHE

	11969 /*

	11970 ** Enable or disable the shared pager and schema features.

	11971 **

	11972 ** This routine has no effect on existing database connections.

	11973 ** The shared cache setting effects only future calls to

	11974 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().

	11975 */

	11976 SQLITE_API int SQLITE_STDCALL sqlite3_enable_shared_cache(int enable){

	11977 sqlite3GlobalConfig.sharedCacheEnabled = enable;

	11978 return SQLITE_OK;

	11979 }

	11980 #endif

	11981

	11982

	11983

	11984 #ifdef SQLITE_OMIT_SHARED_CACHE

	11985 /*

	11986 ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),

	11987 ** and clearAllSharedCacheTableLocks()

	11988 ** manipulate entries in the BtShared.pLock linked list used to store

	11989 ** shared-cache table level locks. If the library is compiled with the

	11990 ** shared-cache feature disabled, then there is only ever one user

	11991 ** of each BtShared structure and so this locking is not necessary.

	11992 ** So define the lock related functions as no-ops.

	11993 */

	11994 #define querySharedCacheTableLock(a,b,c) SQLITE_OK

	11995 #define setSharedCacheTableLock(a,b,c) SQLITE_OK

	11996 #define clearAllSharedCacheTableLocks(a)

	11997 #define downgradeAllSharedCacheTableLocks(a)

	11998 #define hasSharedCacheTableLock(a,b,c,d) 1

	11999 #define hasReadConflicts(a, b) 0

	12000 #endif

	12001

	12002 #ifndef SQLITE_OMIT_SHARED_CACHE

	12003

	12004 #ifdef SQLITE_DEBUG

	12005 /*

	12006 ** This function is only used as part of an assert() statement. *

	12007 **

	12008 ** Check to see if pBtree holds the required locks to read or write to the

	12009 ** table with root page iRoot. Return 1 if it does and 0 if not.

	12010 **

	12011 ** For example, when writing to a table with root-page iRoot via

	12012 ** Btree connection pBtree:

	12013 **

	12014 ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );

	12015 **

	12016 ** When writing to an index that resides in a sharable database, the

	12017 ** caller should have first obtained a lock specifying the root page of

	12018 ** the corresponding table. This makes things a bit more complicated,

	12019 ** as this module treats each table as a separate structure. To determine

	12020 ** the table corresponding to the index being written, this

	12021 ** function has to search through the database schema.

	12022 **

	12023 ** Instead of a lock on the table/index rooted at page iRoot, the caller may

	12024 ** hold a write-lock on the schema table (root page 1). This is also

	12025 ** acceptable.

	12026 */

	12027 static int hasSharedCacheTableLock(

	12028 Btree pBtree, / Handle that must hold lock */

	12029 Pgno iRoot, /* Root page of b-tree */

	12030 int isIndex, /* True if iRoot is the root of an index b-tree */

	12031 int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */

	12032 ){

	12033 Schema pSchema = (Schema )pBtree->pBt->pSchema;

	12034 Pgno iTab = 0;

	12035 BtLock *pLock;

	12036

	12037 /* If this database is not shareable, or if the client is reading

	12038 ** and has the read-uncommitted flag set, then no lock is required.

	12039 ** Return true immediately.

	12040 */

	12041 if( (pBtree->sharable==0)

	12042 \|\| (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))

	12043 ){

	12044 return 1;

	12045 }

	12046

	12047 /* If the client is reading or writing an index and the schema is

	12048 ** not loaded, then it is too difficult to actually check to see if

	12049 ** the correct locks are held. So do not bother - just return true.

	12050 ** This case does not come up very often anyhow.

	12051 */

	12052 if( isIndex && (!pSchema \|\| (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){

	12053 return 1;

	12054 }

	12055

	12056 /* Figure out the root-page that the lock should be held on. For table

	12057 ** b-trees, this is just the root page of the b-tree being read or

	12058 ** written. For index b-trees, it is the root page of the associated

	12059 ** table. */

	12060 if( isIndex ){

	12061 HashElem *p;

	12062 for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){

	12063 Index pIdx = (Index )sqliteHashData(p);

	12064 if( pIdx->tnum==(int)iRoot ){

	12065 if( iTab ){

	12066 /* Two or more indexes share the same root page. There must

	12067 ** be imposter tables. So just return true. The assert is not

	12068 ** useful in that case. */

	12069 return 1;

	12070 }

	12071 iTab = pIdx->pTable->tnum;

	12072 }

	12073 }

	12074 }else{

	12075 iTab = iRoot;

	12076 }

	12077

	12078 /* Search for the required lock. Either a write-lock on root-page iTab, a

	12079 ** write-lock on the schema table, or (if the client is reading) a

	12080 ** read-lock on iTab will suffice. Return 1 if any of these are found. */

	12081 for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){

	12082 if( pLock->pBtree==pBtree

	12083 && (pLock->iTable==iTab \|\| (pLock->eLock==WRITE_LOCK && pLock->iTable==1))

	12084 && pLock->eLock>=eLockType

	12085 ){

	12086 return 1;

	12087 }

	12088 }

	12089

	12090 /* Failed to find the required lock. */

	12091 return 0;

	12092 }

	12093 #endif /* SQLITE_DEBUG */

	12094

	12095 #ifdef SQLITE_DEBUG

	12096 /*

	12097 ** This function may be used as part of assert() statements only. **

	12098 **

	12099 ** Return true if it would be illegal for pBtree to write into the

	12100 ** table or index rooted at iRoot because other shared connections are

	12101 ** simultaneously reading that same table or index.

	12102 **

	12103 ** It is illegal for pBtree to write if some other Btree object that

	12104 ** shares the same BtShared object is currently reading or writing

	12105 ** the iRoot table. Except, if the other Btree object has the

	12106 ** read-uncommitted flag set, then it is OK for the other object to

	12107 ** have a read cursor.

	12108 **

	12109 ** For example, before writing to any part of the table or index

	12110 ** rooted at page iRoot, one should call:

	12111 **

	12112 ** assert( !hasReadConflicts(pBtree, iRoot) );

	12113 */

	12114 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){

	12115 BtCursor *p;

	12116 for(p=pBtree->pBt->pCursor; p; p=p->pNext){

	12117 if( p->pgnoRoot==iRoot

	12118 && p->pBtree!=pBtree

	12119 && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)

	12120 ){

	12121 return 1;

	12122 }

	12123 }

	12124 return 0;

	12125 }

	12126 #endif /* #ifdef SQLITE_DEBUG */

	12127

	12128 /*

	12129 ** Query to see if Btree handle p may obtain a lock of type eLock

	12130 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return

	12131 ** SQLITE_OK if the lock may be obtained (by calling

	12132 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.

	12133 */

	12134 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){

	12135 BtShared *pBt = p->pBt;

	12136 BtLock *pIter;

	12137

	12138 assert( sqlite3BtreeHoldsMutex(p) );

	12139 assert( eLock==READ_LOCK \|\| eLock==WRITE_LOCK );

	12140 assert( p->db!=0 );

	12141 assert( !(p->db->flags&SQLITE_ReadUncommitted)\|\|eLock==WRITE_LOCK\|\|iTab==1 );

	12142

	12143 /* If requesting a write-lock, then the Btree must have an open write

	12144 ** transaction on this file. And, obviously, for this to be so there

	12145 ** must be an open write transaction on the file itself.

	12146 */

	12147 assert( eLock==READ_LOCK \|\| (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );

	12148 assert( eLock==READ_LOCK \|\| pBt->inTransaction==TRANS_WRITE );

	12149

	12150 /* This routine is a no-op if the shared-cache is not enabled */

	12151 if( !p->sharable ){

	12152 return SQLITE_OK;

	12153 }

	12154

	12155 /* If some other connection is holding an exclusive lock, the

	12156 ** requested lock may not be obtained.

	12157 */

	12158 if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){

	12159 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);

	12160 return SQLITE_LOCKED_SHAREDCACHE;

	12161 }

	12162

	12163 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){

	12164 /* The condition (pIter->eLock!=eLock) in the following if(...)

	12165 ** statement is a simplification of:

	12166 **

	12167 ** (eLock==WRITE_LOCK \|\| pIter->eLock==WRITE_LOCK)

	12168 **

	12169 ** since we know that if eLock==WRITE_LOCK, then no other connection

	12170 ** may hold a WRITE_LOCK on any table in this file (since there can

	12171 ** only be a single writer).

	12172 */

	12173 assert( pIter->eLock==READ_LOCK \|\| pIter->eLock==WRITE_LOCK );

	12174 assert( eLock==READ_LOCK \|\| pIter->pBtree==p \|\| pIter->eLock==READ_LOCK);

	12175 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){

	12176 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);

	12177 if( eLock==WRITE_LOCK ){

	12178 assert( p==pBt->pWriter );

	12179 pBt->btsFlags \|= BTS_PENDING;

	12180 }

	12181 return SQLITE_LOCKED_SHAREDCACHE;

	12182 }

	12183 }

	12184 return SQLITE_OK;

	12185 }

	12186 #endif /* !SQLITE_OMIT_SHARED_CACHE */

	12187

	12188 #ifndef SQLITE_OMIT_SHARED_CACHE

	12189 /*

	12190 ** Add a lock on the table with root-page iTable to the shared-btree used

	12191 ** by Btree handle p. Parameter eLock must be either READ_LOCK or

	12192 ** WRITE_LOCK.

	12193 **

	12194 ** This function assumes the following:

	12195 **

	12196 ** (a) The specified Btree object p is connected to a sharable

	12197 ** database (one with the BtShared.sharable flag set), and

	12198 **

	12199 ** (b) No other Btree objects hold a lock that conflicts

	12200 ** with the requested lock (i.e. querySharedCacheTableLock() has

	12201 ** already been called and returned SQLITE_OK).

	12202 **

	12203 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM

	12204 ** is returned if a malloc attempt fails.

	12205 */

	12206 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){

	12207 BtShared *pBt = p->pBt;

	12208 BtLock *pLock = 0;

	12209 BtLock *pIter;

	12210

	12211 assert( sqlite3BtreeHoldsMutex(p) );

	12212 assert( eLock==READ_LOCK \|\| eLock==WRITE_LOCK );

	12213 assert( p->db!=0 );

	12214

	12215 /* A connection with the read-uncommitted flag set will never try to

	12216 ** obtain a read-lock using this function. The only read-lock obtained

	12217 ** by a connection in read-uncommitted mode is on the sqlite_master

	12218 ** table, and that lock is obtained in BtreeBeginTrans(). */

	12219 assert( 0==(p->db->flags&SQLITE_ReadUncommitted) \|\| eLock==WRITE_LOCK );

	12220

	12221 /* This function should only be called on a sharable b-tree after it

	12222 ** has been determined that no other b-tree holds a conflicting lock. */

	12223 assert( p->sharable );

	12224 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );

	12225

	12226 /* First search the list for an existing lock on this table. */

	12227 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){

	12228 if( pIter->iTable==iTable && pIter->pBtree==p ){

	12229 pLock = pIter;

	12230 break;

	12231 }

	12232 }

	12233

	12234 /* If the above search did not find a BtLock struct associating Btree p

	12235 ** with table iTable, allocate one and link it into the list.

	12236 */

	12237 if( !pLock ){

	12238 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));

	12239 if( !pLock ){

	12240 return SQLITE_NOMEM;

	12241 }

	12242 pLock->iTable = iTable;

	12243 pLock->pBtree = p;

	12244 pLock->pNext = pBt->pLock;

	12245 pBt->pLock = pLock;

	12246 }

	12247

	12248 /* Set the BtLock.eLock variable to the maximum of the current lock

	12249 ** and the requested lock. This means if a write-lock was already held

	12250 ** and a read-lock requested, we don't incorrectly downgrade the lock.

	12251 */

	12252 assert( WRITE_LOCK>READ_LOCK );

	12253 if( eLock>pLock->eLock ){

	12254 pLock->eLock = eLock;

	12255 }

	12256

	12257 return SQLITE_OK;

	12258 }

	12259 #endif /* !SQLITE_OMIT_SHARED_CACHE */

	12260

	12261 #ifndef SQLITE_OMIT_SHARED_CACHE

	12262 /*

	12263 ** Release all the table locks (locks obtained via calls to

	12264 ** the setSharedCacheTableLock() procedure) held by Btree object p.

	12265 **

	12266 ** This function assumes that Btree p has an open read or write

	12267 ** transaction. If it does not, then the BTS_PENDING flag

	12268 ** may be incorrectly cleared.

	12269 */

	12270 static void clearAllSharedCacheTableLocks(Btree *p){

	12271 BtShared *pBt = p->pBt;

	12272 BtLock **ppIter = &pBt->pLock;

	12273

	12274 assert( sqlite3BtreeHoldsMutex(p) );

	12275 assert( p->sharable \|\| 0==*ppIter );

	12276 assert( p->inTrans>0 );

	12277

	12278 while( *ppIter ){

	12279 BtLock pLock = ppIter;

	12280 assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 \|\| pBt->pWriter==pLock->pBtree );

	12281 assert( pLock->pBtree->inTrans>=pLock->eLock );

	12282 if( pLock->pBtree==p ){

	12283 *ppIter = pLock->pNext;

	12284 assert( pLock->iTable!=1 \|\| pLock==&p->lock );

	12285 if( pLock->iTable!=1 ){

	12286 sqlite3_free(pLock);

	12287 }

	12288 }else{

	12289 ppIter = &pLock->pNext;

	12290 }

	12291 }

	12292

	12293 assert( (pBt->btsFlags & BTS_PENDING)==0 \|\| pBt->pWriter );

	12294 if( pBt->pWriter==p ){

	12295 pBt->pWriter = 0;

	12296 pBt->btsFlags &= ~(BTS_EXCLUSIVE\|BTS_PENDING);

	12297 }else if( pBt->nTransaction==2 ){

	12298 /* This function is called when Btree p is concluding its

	12299 ** transaction. If there currently exists a writer, and p is not

	12300 ** that writer, then the number of locks held by connections other

	12301 ** than the writer must be about to drop to zero. In this case

	12302 ** set the BTS_PENDING flag to 0.

	12303 **

	12304 ** If there is not currently a writer, then BTS_PENDING must

	12305 ** be zero already. So this next line is harmless in that case.

	12306 */

	12307 pBt->btsFlags &= ~BTS_PENDING;

	12308 }

	12309 }

	12310

	12311 /*

	12312 ** This function changes all write-locks held by Btree p into read-locks.

	12313 */

	12314 static void downgradeAllSharedCacheTableLocks(Btree *p){

	12315 BtShared *pBt = p->pBt;

	12316 if( pBt->pWriter==p ){

	12317 BtLock *pLock;

	12318 pBt->pWriter = 0;

	12319 pBt->btsFlags &= ~(BTS_EXCLUSIVE\|BTS_PENDING);

	12320 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){

	12321 assert( pLock->eLock==READ_LOCK \|\| pLock->pBtree==p );

	12322 pLock->eLock = READ_LOCK;

	12323 }

	12324 }

	12325 }

	12326

	12327 #endif /* SQLITE_OMIT_SHARED_CACHE */

	12328

	12329 static void releasePage(MemPage pPage); / Forward reference */

	12330

	/*
	**** This routine is used inside of assert() only ****
	**
	** Verify that the cursor holds the mutex on its BtShared. The result
	** is whatever sqlite3_mutex_held() reports for the BtShared mutex.
	*/
	#ifdef SQLITE_DEBUG
	static int cursorHoldsMutex(BtCursor *p){
	  BtShared *pBt = p->pBt;
	  return sqlite3_mutex_held(pBt->mutex);
	}
	#endif

	12341

	/*
	** Invalidate the overflow cache of the cursor passed as the first argument
	** by clearing the BTCF_ValidOvfl bit in its curFlags field.
	**
	** The argument is parenthesized so that any pointer-valued expression
	** (not just a plain identifier) may be passed.
	*/
	#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)

	12347

	12348 /*

	12349 ** Invalidate the overflow page-list cache for all cursors opened

	12350 ** on the shared btree structure pBt.

	12351 */

	12352 static void invalidateAllOverflowCache(BtShared *pBt){

	12353 BtCursor *p;

	12354 assert( sqlite3_mutex_held(pBt->mutex) );

	12355 for(p=pBt->pCursor; p; p=p->pNext){

	12356 invalidateOverflowCache(p);

	12357 }

	12358 }

	12359

	12360 #ifndef SQLITE_OMIT_INCRBLOB

	12361 /*

	12362 ** This function is called before modifying the contents of a table

	12363 ** to invalidate any incrblob cursors that are open on the

	12364 ** row or one of the rows being modified.

	12365 **

	12366 ** If argument isClearTable is true, then the entire contents of the

	12367 ** table is about to be deleted. In this case invalidate all incrblob

	12368 ** cursors open on any row within the table with root-page pgnoRoot.

	12369 **

	12370 ** Otherwise, if argument isClearTable is false, then the row with

	12371 ** rowid iRow is being replaced or deleted. In this case invalidate

	12372 ** only those incrblob cursors open on that specific row.

	12373 */

	12374 static void invalidateIncrblobCursors(

	12375 Btree pBtree, / The database file to check */

	12376 i64 iRow, /* The rowid that might be changing */

	12377 int isClearTable /* True if all rows are being deleted */

	12378 ){

	12379 BtCursor *p;

	12380 if( pBtree->hasIncrblobCur==0 ) return;

	12381 assert( sqlite3BtreeHoldsMutex(pBtree) );

	12382 pBtree->hasIncrblobCur = 0;

	12383 for(p=pBtree->pBt->pCursor; p; p=p->pNext){

	12384 if( (p->curFlags & BTCF_Incrblob)!=0 ){

	12385 pBtree->hasIncrblobCur = 1;

	12386 if( isClearTable \|\| p->info.nKey==iRow ){

	12387 p->eState = CURSOR_INVALID;

	12388 }

	12389 }

	12390 }

	12391 }

	12392

	12393 #else

	12394 /* Stub function when INCRBLOB is omitted */

	12395 #define invalidateIncrblobCursors(x,y,z)

	12396 #endif /* SQLITE_OMIT_INCRBLOB */

	12397

	12398 /*

	12399 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called

	12400 ** when a page that previously contained data becomes a free-list leaf

	12401 ** page.

	12402 **

	12403 ** The BtShared.pHasContent bitvec exists to work around an obscure

	12404 ** bug caused by the interaction of two useful IO optimizations surrounding

	12405 ** free-list leaf pages:

	12406 **

	12407 ** 1) When all data is deleted from a page and the page becomes

	12408 ** a free-list leaf page, the page is not written to the database

	12409 ** (as free-list leaf pages contain no meaningful data). Sometimes

	12410 ** such a page is not even journalled (as it will not be modified,

	12411 ** why bother journalling it?).

	12412 **

	12413 ** 2) When a free-list leaf page is reused, its content is not read

	12414 ** from the database or written to the journal file (why should it

	12415 ** be, if it is not at all meaningful?).

	12416 **

	12417 ** By themselves, these optimizations work fine and provide a handy

	12418 ** performance boost to bulk delete or insert operations. However, if

	12419 ** a page is moved to the free-list and then reused within the same

	12420 ** transaction, a problem comes up. If the page is not journalled when

	12421 ** it is moved to the free-list and it is also not journalled when it

	12422 ** is extracted from the free-list and reused, then the original data

	12423 ** may be lost. In the event of a rollback, it may not be possible

	12424 ** to restore the database to its original configuration.

	12425 **

	12426 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is

	12427 ** moved to become a free-list leaf page, the corresponding bit is

	12428 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,

	12429 ** optimization 2 above is omitted if the corresponding bit is already

	12430 ** set in BtShared.pHasContent. The contents of the bitvec are cleared

	12431 ** at the end of every transaction.

	12432 */

	12433 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){

	12434 int rc = SQLITE_OK;

	12435 if( !pBt->pHasContent ){

	12436 assert( pgno<=pBt->nPage );

	12437 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);

	12438 if( !pBt->pHasContent ){

	12439 rc = SQLITE_NOMEM;

	12440 }

	12441 }

	12442 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){

	12443 rc = sqlite3BitvecSet(pBt->pHasContent, pgno);

	12444 }

	12445 return rc;

	12446 }

	12447

	12448 /*

	12449 ** Query the BtShared.pHasContent vector.

	12450 **

	12451 ** This function is called when a free-list leaf page is removed from the

	12452 ** free-list for reuse. It returns false if it is safe to retrieve the

	12453 ** page from the pager layer with the 'no-content' flag set. True otherwise.

	12454 */

	12455 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){

	12456 Bitvec *p = pBt->pHasContent;

	12457 return (p && (pgno>sqlite3BitvecSize(p) \|\| sqlite3BitvecTest(p, pgno)));

	12458 }

	12459

	12460 /*

	12461 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be

	12462 ** invoked at the conclusion of each write-transaction.

	12463 */

	12464 static void btreeClearHasContent(BtShared *pBt){

	12465 sqlite3BitvecDestroy(pBt->pHasContent);

	12466 pBt->pHasContent = 0;

	12467 }

	12468

	12469 /*

	12470 ** Release all of the apPage[] pages for a cursor.

	12471 */

	12472 static void btreeReleaseAllCursorPages(BtCursor *pCur){

	12473 int i;

	12474 for(i=0; i<=pCur->iPage; i++){

	12475 releasePage(pCur->apPage[i]);

	12476 pCur->apPage[i] = 0;

	12477 }

	12478 pCur->iPage = -1;

	12479 }

	12480

	12481 /*

	12482 ** The cursor passed as the only argument must point to a valid entry

	12483 ** when this function is called (i.e. have eState==CURSOR_VALID). This

	12484 ** function saves the current cursor key in variables pCur->nKey and

	12485 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error

	12486 ** code otherwise.

	12487 **

	12488 ** If the cursor is open on an intkey table, then the integer key

	12489 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to

	12490 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is

	12491 ** set to point to a malloced buffer pCur->nKey bytes in size containing

	12492 ** the key.

	12493 */

	12494 static int saveCursorKey(BtCursor *pCur){

	12495 int rc;

	12496 assert( CURSOR_VALID==pCur->eState );

	12497 assert( 0==pCur->pKey );

	12498 assert( cursorHoldsMutex(pCur) );

	12499

	12500 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);

	12501 assert( rc==SQLITE_OK ); /* KeySize() cannot fail */

	12502

	12503 /* If this is an intKey table, then the above call to BtreeKeySize()

	12504 ** stores the integer key in pCur->nKey. In this case this value is

	12505 ** all that is required. Otherwise, if pCur is not open on an intKey

	12506 ** table, then malloc space for and store the pCur->nKey bytes of key

	12507 ** data. */

	12508 if( 0==pCur->curIntKey ){

	12509 void *pKey = sqlite3Malloc( pCur->nKey );

	12510 if( pKey ){

	12511 rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);

	12512 if( rc==SQLITE_OK ){

	12513 pCur->pKey = pKey;

	12514 }else{

	12515 sqlite3_free(pKey);

	12516 }

	12517 }else{

	12518 rc = SQLITE_NOMEM;

	12519 }

	12520 }

	12521 assert( !pCur->curIntKey \|\| !pCur->pKey );

	12522 return rc;

	12523 }

	12524

	12525 /*

	12526 ** Save the current cursor position in the variables BtCursor.nKey

	12527 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.

	12528 **

	12529 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)

	12530 ** prior to calling this routine.

	12531 */

	12532 static int saveCursorPosition(BtCursor *pCur){

	12533 int rc;

	12534

	12535 assert( CURSOR_VALID==pCur->eState \|\| CURSOR_SKIPNEXT==pCur->eState );

	12536 assert( 0==pCur->pKey );

	12537 assert( cursorHoldsMutex(pCur) );

	12538

	12539 if( pCur->eState==CURSOR_SKIPNEXT ){

	12540 pCur->eState = CURSOR_VALID;

	12541 }else{

	12542 pCur->skipNext = 0;

	12543 }

	12544

	12545 rc = saveCursorKey(pCur);

	12546 if( rc==SQLITE_OK ){

	12547 btreeReleaseAllCursorPages(pCur);

	12548 pCur->eState = CURSOR_REQUIRESEEK;

	12549 }

	12550

	12551 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl\|BTCF_AtLast);

	12552 return rc;

	12553 }

	12554

	12555 /* Forward reference */

	12556 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor,Pgno,BtCursor);

	12557

	12558 /*

	12559 ** Save the positions of all cursors (except pExcept) that are open on

	12560 ** the table with root-page iRoot. "Saving the cursor position" means that

	12561 ** the location in the btree is remembered in such a way that it can be

	12562 ** moved back to the same spot after the btree has been modified. This

	12563 ** routine is called just before cursor pExcept is used to modify the

	12564 ** table, for example in BtreeDelete() or BtreeInsert().

	12565 **

	12566 ** If there are two or more cursors on the same btree, then all such

	12567 ** cursors should have their BTCF_Multiple flag set. The btreeCursor()

	12568 ** routine enforces that rule. This routine only needs to be called in

	12569 ** the uncommon case when pExpect has the BTCF_Multiple flag set.

	12570 **

	12571 ** If pExpect!=NULL and if no other cursors are found on the same root-page,

	12572 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another

	12573 ** pointless call to this routine.

	12574 **

	12575 ** Implementation note: This routine merely checks to see if any cursors

	12576 ** need to be saved. It calls out to saveCursorsOnList() in the (unusual)

	12577 ** event that cursors are in need to being saved.

	12578 */

	12579 static int saveAllCursors(BtShared pBt, Pgno iRoot, BtCursor pExcept){

	12580 BtCursor *p;

	12581 assert( sqlite3_mutex_held(pBt->mutex) );

	12582 assert( pExcept==0 \|\| pExcept->pBt==pBt );

	12583 for(p=pBt->pCursor; p; p=p->pNext){

	12584 if( p!=pExcept && (0==iRoot \|\| p->pgnoRoot==iRoot) ) break;

	12585 }

	12586 if( p ) return saveCursorsOnList(p, iRoot, pExcept);

	12587 if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;

	12588 return SQLITE_OK;

	12589 }

	12590

	12591 /* This helper routine to saveAllCursors does the actual work of saving

	12592 ** the cursors if and when a cursor is found that actually requires saving.

	12593 ** The common case is that no cursors need to be saved, so this routine is

	12594 ** broken out from its caller to avoid unnecessary stack pointer movement.

	12595 */

	12596 static int SQLITE_NOINLINE saveCursorsOnList(

	12597 BtCursor p, / The first cursor that needs saving */

	12598 Pgno iRoot, /* Only save cursor with this iRoot. Save all if zero */

	12599 BtCursor pExcept / Do not save this cursor */

	12600 ){

	12601 do{

	12602 if( p!=pExcept && (0==iRoot \|\| p->pgnoRoot==iRoot) ){

	12603 if( p->eState==CURSOR_VALID \|\| p->eState==CURSOR_SKIPNEXT ){

	12604 int rc = saveCursorPosition(p);

	12605 if( SQLITE_OK!=rc ){

	12606 return rc;

	12607 }

	12608 }else{

	12609 testcase( p->iPage>0 );

	12610 btreeReleaseAllCursorPages(p);

	12611 }

	12612 }

	12613 p = p->pNext;

	12614 }while( p );

	12615 return SQLITE_OK;

	12616 }

	12617

	12618 /*

	12619 ** Clear the current cursor position.

	12620 */

	12621 SQLITE_PRIVATE void sqlite3BtreeClearCursor(BtCursor *pCur){

	12622 assert( cursorHoldsMutex(pCur) );

	12623 sqlite3_free(pCur->pKey);

	12624 pCur->pKey = 0;

	12625 pCur->eState = CURSOR_INVALID;

	12626 }

	12627

	12628 /*

	12629 ** In this version of BtreeMoveto, pKey is a packed index record

	12630 ** such as is generated by the OP_MakeRecord opcode. Unpack the

	12631 ** record and then call BtreeMovetoUnpacked() to do the work.

	12632 */

	12633 static int btreeMoveto(

	12634 BtCursor pCur, / Cursor open on the btree to be searched */

	12635 const void pKey, / Packed key if the btree is an index */

	12636 i64 nKey, /* Integer key for tables. Size of pKey for indices */

	12637 int bias, /* Bias search to the high end */

	12638 int pRes / Write search results here */

	12639 ){

	12640 int rc; /* Status code */

	12641 UnpackedRecord pIdxKey; / Unpacked index key */

	12642 char aSpace[200]; /* Temp space for pIdxKey - to avoid a malloc */

	12643 char *pFree = 0;

	12644

	12645 if( pKey ){

	12646 assert( nKey==(i64)(int)nKey );

	12647 pIdxKey = sqlite3VdbeAllocUnpackedRecord(

	12648 pCur->pKeyInfo, aSpace, sizeof(aSpace), &pFree

	12649 );

	12650 if( pIdxKey==0 ) return SQLITE_NOMEM;

	12651 sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);

	12652 if( pIdxKey->nField==0 ){

	12653 sqlite3DbFree(pCur->pKeyInfo->db, pFree);

	12654 return SQLITE_CORRUPT_BKPT;

	12655 }

	12656 }else{

	12657 pIdxKey = 0;

	12658 }

	12659 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);

	12660 if( pFree ){

	12661 sqlite3DbFree(pCur->pKeyInfo->db, pFree);

	12662 }

	12663 return rc;

	12664 }

	12665

	12666 /*

	12667 ** Restore the cursor to the position it was in (or as close to as possible)

	12668 ** when saveCursorPosition() was called. Note that this call deletes the

	12669 ** saved position info stored by saveCursorPosition(), so there can be

	12670 ** at most one effective restoreCursorPosition() call after each

	12671 ** saveCursorPosition().

	12672 */

	12673 static int btreeRestoreCursorPosition(BtCursor *pCur){

	12674 int rc;

	12675 int skipNext;

	12676 assert( cursorHoldsMutex(pCur) );

	12677 assert( pCur->eState>=CURSOR_REQUIRESEEK );

	12678 if( pCur->eState==CURSOR_FAULT ){

	12679 return pCur->skipNext;

	12680 }

	12681 pCur->eState = CURSOR_INVALID;

	12682 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);

	12683 if( rc==SQLITE_OK ){

	12684 sqlite3_free(pCur->pKey);

	12685 pCur->pKey = 0;

	12686 assert( pCur->eState==CURSOR_VALID \|\| pCur->eState==CURSOR_INVALID );

	12687 pCur->skipNext \|= skipNext;

	12688 if( pCur->skipNext && pCur->eState==CURSOR_VALID ){

	12689 pCur->eState = CURSOR_SKIPNEXT;

	12690 }

	12691 }

	12692 return rc;

	12693 }

	12694

	/*
	** Restore cursor p only if its state is CURSOR_REQUIRESEEK or greater;
	** otherwise evaluate to SQLITE_OK without calling anything. The argument
	** is evaluated twice when a restore is needed, so it must be free of
	** side effects.
	*/
	#define restoreCursorPosition(p) \
	  ((p)->eState>=CURSOR_REQUIRESEEK ? \
	         btreeRestoreCursorPosition(p) : \
	         SQLITE_OK)

	12699

	12700 /*

	12701 ** Determine whether or not a cursor has moved from the position where

	12702 ** it was last placed, or has been invalidated for any other reason.

	12703 ** Cursors can move when the row they are pointing at is deleted out

	12704 ** from under them, for example. Cursor might also move if a btree

	12705 ** is rebalanced.

	12706 **

	12707 ** Calling this routine with a NULL cursor pointer returns false.

	12708 **

	12709 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor

	12710 ** back to where it ought to be if this routine returns true.

	12711 */

	12712 SQLITE_PRIVATE int sqlite3BtreeCursorHasMoved(BtCursor *pCur){

	12713 return pCur->eState!=CURSOR_VALID;

	12714 }

	12715

	12716 /*

	12717 ** This routine restores a cursor back to its original position after it

	12718 ** has been moved by some outside activity (such as a btree rebalance or

	12719 ** a row having been deleted out from under the cursor).

	12720 **

	12721 ** On success, the *pDifferentRow parameter is false if the cursor is left

	12722 ** pointing at exactly the same row. *pDifferntRow is the row the cursor

	12723 ** was pointing to has been deleted, forcing the cursor to point to some

	12724 ** nearby row.

	12725 **

	12726 ** This routine should only be called for a cursor that just returned

	12727 ** TRUE from sqlite3BtreeCursorHasMoved().

	12728 */

	12729 SQLITE_PRIVATE int sqlite3BtreeCursorRestore(BtCursor pCur, int pDifferentRow) {

	12730 int rc;

	12731

	12732 assert( pCur!=0 );

	12733 assert( pCur->eState!=CURSOR_VALID );

	12734 rc = restoreCursorPosition(pCur);

	12735 if( rc ){

	12736 *pDifferentRow = 1;

	12737 return rc;

	12738 }

	12739 if( pCur->eState!=CURSOR_VALID ){

	12740 *pDifferentRow = 1;

	12741 }else{

	12742 assert( pCur->skipNext==0 );

	12743 *pDifferentRow = 0;

	12744 }

	12745 return SQLITE_OK;

	12746 }

	12747

	#ifdef SQLITE_ENABLE_CURSOR_HINTS
	/*
	** Provide hints to the cursor. The particular hint given (and the type
	** and number of the varargs parameters) is determined by the eHintType
	** parameter. See the definitions of the BTREE_HINT_* macros for details.
	**
	** This implementation ignores every hint. The interface is used only by
	** systems that substitute their own storage engine.
	*/
	SQLITE_PRIVATE void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
	  (void)pCur;       /* Intentionally unused */
	  (void)eHintType;  /* Intentionally unused */
	}
	#endif

	12758

	12759 /*

	12760 ** Provide flag hints to the cursor.

	12761 */

	12762 SQLITE_PRIVATE void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){

	12763 assert( x==BTREE_SEEK_EQ \|\| x==BTREE_BULKLOAD \|\| x==0 );

	12764 pCur->hints = x;

	12765 }

	12766

	12767

	12768 #ifndef SQLITE_OMIT_AUTOVACUUM

	12769 /*

	12770 ** Given a page number of a regular database page, return the page

	12771 ** number for the pointer-map page that contains the entry for the

	12772 ** input page number.

	12773 **

	12774 ** Return 0 (not a valid page) for pgno==1 since there is

	12775 ** no pointer map associated with page 1. The integrity_check logic

	12776 ** requires that ptrmapPageno(*,1)!=1.

	12777 */

	12778 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){

	12779 int nPagesPerMapPage;

	12780 Pgno iPtrMap, ret;

	12781 assert( sqlite3_mutex_held(pBt->mutex) );

	12782 if( pgno<2 ) return 0;

	12783 nPagesPerMapPage = (pBt->usableSize/5)+1;

	12784 iPtrMap = (pgno-2)/nPagesPerMapPage;

	12785 ret = (iPtrMap*nPagesPerMapPage) + 2;

	12786 if( ret==PENDING_BYTE_PAGE(pBt) ){

	12787 ret++;

	12788 }

	12789 return ret;

	12790 }

	12791

	12792 /*

	12793 ** Write an entry into the pointer map.

	12794 **

	12795 ** This routine updates the pointer map entry for page number 'key'

	12796 ** so that it maps to type 'eType' and parent page number 'pgno'.

	12797 **

	12798 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is

	12799 ** a no-op. If an error occurs, the appropriate error code is written

	12800 ** into *pRC.

	12801 */

	12802 static void ptrmapPut(BtShared pBt, Pgno key, u8 eType, Pgno parent, int pRC){

	12803 DbPage pDbPage; / The pointer map page */

	12804 u8 pPtrmap; / The pointer map data */

	12805 Pgno iPtrmap; /* The pointer map page number */

	12806 int offset; /* Offset in pointer map page */

	12807 int rc; /* Return code from subfunctions */

	12808

	12809 if( *pRC ) return;

	12810

	12811 assert( sqlite3_mutex_held(pBt->mutex) );

	12812 /* The master-journal page number must never be used as a pointer map page */

	12813 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );

	12814

	12815 assert( pBt->autoVacuum );

	12816 if( key==0 ){

	12817 *pRC = SQLITE_CORRUPT_BKPT;

	12818 return;

	12819 }

	12820 iPtrmap = PTRMAP_PAGENO(pBt, key);

	12821 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);

	12822 if( rc!=SQLITE_OK ){

	12823 *pRC = rc;

	12824 return;

	12825 }

	12826 offset = PTRMAP_PTROFFSET(iPtrmap, key);

	12827 if( offset<0 ){

	12828 *pRC = SQLITE_CORRUPT_BKPT;

	12829 goto ptrmap_exit;

	12830 }

	12831 assert( offset <= (int)pBt->usableSize-5 );

	12832 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);

	12833

	12834 if( eType!=pPtrmap[offset] \|\| get4byte(&pPtrmap[offset+1])!=parent ){

	12835 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));

	12836 *pRC= rc = sqlite3PagerWrite(pDbPage);

	12837 if( rc==SQLITE_OK ){

	12838 pPtrmap[offset] = eType;

	12839 put4byte(&pPtrmap[offset+1], parent);

	12840 }

	12841 }

	12842

	12843 ptrmap_exit:

	12844 sqlite3PagerUnref(pDbPage);

	12845 }

	12846

	12847 /*

	12848 ** Read an entry from the pointer map.

	12849 **

	12850 ** This routine retrieves the pointer map entry for page 'key', writing

	12851 ** the type and parent page number to pEType and pPgno respectively.

	12852 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.

	12853 */

	12854 static int ptrmapGet(BtShared pBt, Pgno key, u8 pEType, Pgno *pPgno){

	12855 DbPage pDbPage; / The pointer map page */

	12856 int iPtrmap; /* Pointer map page index */

	12857 u8 pPtrmap; / Pointer map page data */

	12858 int offset; /* Offset of entry in pointer map */

	12859 int rc;

	12860

	12861 assert( sqlite3_mutex_held(pBt->mutex) );

	12862

	12863 iPtrmap = PTRMAP_PAGENO(pBt, key);

	12864 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);

	12865 if( rc!=0 ){

	12866 return rc;

	12867 }

	12868 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);

	12869

	12870 offset = PTRMAP_PTROFFSET(iPtrmap, key);

	12871 if( offset<0 ){

	12872 sqlite3PagerUnref(pDbPage);

	12873 return SQLITE_CORRUPT_BKPT;

	12874 }

	12875 assert( offset <= (int)pBt->usableSize-5 );

	12876 assert( pEType!=0 );

	12877 *pEType = pPtrmap[offset];

	12878 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);

	12879

	12880 sqlite3PagerUnref(pDbPage);

	12881 if( pEType<1 \|\| pEType>5 ) return SQLITE_CORRUPT_BKPT;

	12882 return SQLITE_OK;

	12883 }

	12884

	12885 #else /* if defined SQLITE_OMIT_AUTOVACUUM */

	/* Auto-vacuum is compiled out: ptrmapPut() and ptrmapPutOvflPtr() expand
	** to nothing, and ptrmapGet() evaluates to the constant SQLITE_OK. */
	12886 #define ptrmapPut(w,x,y,z,rc)

	12887 #define ptrmapGet(w,x,y,z) SQLITE_OK

	12888 #define ptrmapPutOvflPtr(x, y, rc)

	12889 #endif

	12890

	12891 /*

	12892 ** Given a btree page and a cell index (0 means the first cell on

	12893 ** the page, 1 means the second cell, and so forth) return a pointer

	12894 ** to the cell content.

	12895 **

	12896 ** findCellPastPtr() does the same except it skips past the initial

	12897 ** 4-byte child pointer found on interior pages, if there is one.

	12898 **

	12899 ** This routine works only for pages that do not contain overflow cells.

	12900 */

	/* Both macros expand P three times and I once, so P must be an expression
	** without side effects.  They differ only in the base pointer used:
	** aData for findCell() and aDataOfst for findCellPastPtr(). */
	12901 #define findCell(P,I) \

	12902 ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))

	12903 #define findCellPastPtr(P,I) \

	12904 ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))

	12905

	12906

	12907 /*

	12908 ** This is common tail processing for btreeParseCellPtr() and

	12909 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely

	12910 ** on a single B-tree page. Make necessary adjustments to the CellInfo

	12911 ** structure.

	12912 */

	12913 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(

	12914 MemPage pPage, / Page containing the cell */

	12915 u8 pCell, / Pointer to the cell text. */

	12916 CellInfo pInfo / Fill in this structure */

	12917 ){

	12918 /* If the payload will not fit completely on the local page, we have

	12919 ** to decide how much to store locally and how much to spill onto

	12920 ** overflow pages. The strategy is to minimize the amount of unused

	12921 ** space on overflow pages while keeping the amount of local storage

	12922 ** in between minLocal and maxLocal.

	12923 **

	12924 ** Warning: changing the way overflow payload is distributed in any

	12925 ** way will result in an incompatible file format.

	12926 */

	12927 int minLocal; /* Minimum amount of payload held locally */

	12928 int maxLocal; /* Maximum amount of payload held locally */

	12929 int surplus; /* Overflow payload available for local storage */

	12930

	12931 minLocal = pPage->minLocal;

	12932 maxLocal = pPage->maxLocal;

	12933 surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);

	12934 testcase( surplus==maxLocal );

	12935 testcase( surplus==maxLocal+1 );

	12936 if( surplus <= maxLocal ){

	12937 pInfo->nLocal = (u16)surplus;

	12938 }else{

	12939 pInfo->nLocal = (u16)minLocal;

	12940 }

	12941 pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;

	12942 }

	12943

	12944 /*

	12945 ** The following routines are implementations of the MemPage.xParseCell()

	12946 ** method.

	12947 **

	12948 ** Parse a cell content block and fill in the CellInfo structure.

	12949 **

	12950 ** btreeParseCellPtr() => table btree leaf nodes

	12951 ** btreeParseCellNoPayload() => table btree internal nodes

	12952 ** btreeParseCellPtrIndex() => index btree nodes

	12953 **

	12954 ** There is also a wrapper function btreeParseCell() that works for

	12955 ** all MemPage types and that references the cell by index rather than

	12956 ** by pointer.

	12957 */

	12958 static void btreeParseCellPtrNoPayload(

	12959 MemPage pPage, / Page containing the cell */

	12960 u8 pCell, / Pointer to the cell text. */

	12961 CellInfo pInfo / Fill in this structure */

	12962 ){

	12963 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	12964 assert( pPage->leaf==0 );

	12965 assert( pPage->noPayload );

	12966 assert( pPage->childPtrSize==4 );

	12967 #ifndef SQLITE_DEBUG

	12968 UNUSED_PARAMETER(pPage);

	12969 #endif

	12970 pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);

	12971 pInfo->nPayload = 0;

	12972 pInfo->nLocal = 0;

	12973 pInfo->pPayload = 0;

	12974 return;

	12975 }

	12976 static void btreeParseCellPtr(

	12977 MemPage pPage, / Page containing the cell */

	12978 u8 pCell, / Pointer to the cell text. */

	12979 CellInfo pInfo / Fill in this structure */

	12980 ){

	12981 u8 pIter; / For scanning through pCell */

	12982 u32 nPayload; /* Number of bytes of cell payload */

	12983 u64 iKey; /* Extracted Key value */

	12984

	12985 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	12986 assert( pPage->leaf==0 \|\| pPage->leaf==1 );

	12987 assert( pPage->intKeyLeaf \|\| pPage->noPayload );

	12988 assert( pPage->noPayload==0 );

	12989 assert( pPage->intKeyLeaf );

	12990 assert( pPage->childPtrSize==0 );

	12991 pIter = pCell;

	12992

	12993 /* The next block of code is equivalent to:

	12994 **

	12995 ** pIter += getVarint32(pIter, nPayload);

	12996 **

	12997 ** The code is inlined to avoid a function call.

	12998 */

	12999 nPayload = *pIter;

	13000 if( nPayload>=0x80 ){

	13001 u8 *pEnd = &pIter[8];

	13002 nPayload &= 0x7f;

	13003 do{

	13004 nPayload = (nPayload<<7) \| (*++pIter & 0x7f);

	13005 }while( (*pIter)>=0x80 && pIter<pEnd );

	13006 }

	13007 pIter++;

	13008

	13009 /* The next block of code is equivalent to:

	13010 **

	13011 ** pIter += getVarint(pIter, (u64*)&pInfo->nKey);

	13012 **

	13013 ** The code is inlined to avoid a function call.

	13014 */

	13015 iKey = *pIter;

	13016 if( iKey>=0x80 ){

	13017 u8 *pEnd = &pIter[7];

	13018 iKey &= 0x7f;

	13019 while(1){

	13020 iKey = (iKey<<7) \| (*++pIter & 0x7f);

	13021 if( (*pIter)<0x80 ) break;

	13022 if( pIter>=pEnd ){

	13023 iKey = (iKey<<8) \| *++pIter;

	13024 break;

	13025 }

	13026 }

	13027 }

	13028 pIter++;

	13029

	13030 pInfo->nKey = (i64)&iKey;

	13031 pInfo->nPayload = nPayload;

	13032 pInfo->pPayload = pIter;

	13033 testcase( nPayload==pPage->maxLocal );

	13034 testcase( nPayload==pPage->maxLocal+1 );

	13035 if( nPayload<=pPage->maxLocal ){

	13036 /* This is the (easy) common case where the entire payload fits

	13037 ** on the local page. No overflow is required.

	13038 */

	13039 pInfo->nSize = nPayload + (u16)(pIter - pCell);

	13040 if( pInfo->nSize<4 ) pInfo->nSize = 4;

	13041 pInfo->nLocal = (u16)nPayload;

	13042 }else{

	13043 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);

	13044 }

	13045 }

	13046 static void btreeParseCellPtrIndex(

	13047 MemPage pPage, / Page containing the cell */

	13048 u8 pCell, / Pointer to the cell text. */

	13049 CellInfo pInfo / Fill in this structure */

	13050 ){

	13051 u8 pIter; / For scanning through pCell */

	13052 u32 nPayload; /* Number of bytes of cell payload */

	13053

	13054 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	13055 assert( pPage->leaf==0 \|\| pPage->leaf==1 );

	13056 assert( pPage->intKeyLeaf==0 );

	13057 assert( pPage->noPayload==0 );

	13058 pIter = pCell + pPage->childPtrSize;

	13059 nPayload = *pIter;

	13060 if( nPayload>=0x80 ){

	13061 u8 *pEnd = &pIter[8];

	13062 nPayload &= 0x7f;

	13063 do{

	13064 nPayload = (nPayload<<7) \| (*++pIter & 0x7f);

	13065 }while( *(pIter)>=0x80 && pIter<pEnd );

	13066 }

	13067 pIter++;

	13068 pInfo->nKey = nPayload;

	13069 pInfo->nPayload = nPayload;

	13070 pInfo->pPayload = pIter;

	13071 testcase( nPayload==pPage->maxLocal );

	13072 testcase( nPayload==pPage->maxLocal+1 );

	13073 if( nPayload<=pPage->maxLocal ){

	13074 /* This is the (easy) common case where the entire payload fits

	13075 ** on the local page. No overflow is required.

	13076 */

	13077 pInfo->nSize = nPayload + (u16)(pIter - pCell);

	13078 if( pInfo->nSize<4 ) pInfo->nSize = 4;

	13079 pInfo->nLocal = (u16)nPayload;

	13080 }else{

	13081 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);

	13082 }

	13083 }

	13084 static void btreeParseCell(

	13085 MemPage pPage, / Page containing the cell */

	13086 int iCell, /* The cell index. First cell is 0 */

	13087 CellInfo pInfo / Fill in this structure */

	13088 ){

	13089 pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);

	13090 }

	13091

	13092 /*

	13093 ** The following routines are implementations of the MemPage.xCellSize

	13094 ** method.

	13095 **

	13096 ** Compute the total number of bytes that a Cell needs in the cell

	13097 ** data area of the btree-page. The return number includes the cell

	13098 ** data header and the local payload, but not any overflow page or

	13099 ** the space used by the cell pointer.

	13100 **

	13101 ** cellSizePtrNoPayload() => table internal nodes

	13102 ** cellSizePtr() => all index nodes & table leaf nodes

	13103 */

	13104 static u16 cellSizePtr(MemPage pPage, u8 pCell){

	13105 u8 pIter = pCell + pPage->childPtrSize; / For looping over bytes of pCell */

	13106 u8 pEnd; / End mark for a varint */

	13107 u32 nSize; /* Size value to return */

	13108

	13109 #ifdef SQLITE_DEBUG

	13110 /* The value returned by this function should always be the same as

	13111 ** the (CellInfo.nSize) value found by doing a full parse of the

	13112 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of

	13113 ** this function verifies that this invariant is not violated. */

	13114 CellInfo debuginfo;

	13115 pPage->xParseCell(pPage, pCell, &debuginfo);

	13116 #endif

	13117

	13118 assert( pPage->noPayload==0 );

	13119 nSize = *pIter;

	13120 if( nSize>=0x80 ){

	13121 pEnd = &pIter[8];

	13122 nSize &= 0x7f;

	13123 do{

	13124 nSize = (nSize<<7) \| (*++pIter & 0x7f);

	13125 }while( *(pIter)>=0x80 && pIter<pEnd );

	13126 }

	13127 pIter++;

	13128 if( pPage->intKey ){

	13129 /* pIter now points at the 64-bit integer key value, a variable length

	13130 ** integer. The following block moves pIter to point at the first byte

	13131 ** past the end of the key value. */

	13132 pEnd = &pIter[9];

	13133 while( (*pIter++)&0x80 && pIter<pEnd );

	13134 }

	13135 testcase( nSize==pPage->maxLocal );

	13136 testcase( nSize==pPage->maxLocal+1 );

	13137 if( nSize<=pPage->maxLocal ){

	13138 nSize += (u32)(pIter - pCell);

	13139 if( nSize<4 ) nSize = 4;

	13140 }else{

	13141 int minLocal = pPage->minLocal;

	13142 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);

	13143 testcase( nSize==pPage->maxLocal );

	13144 testcase( nSize==pPage->maxLocal+1 );

	13145 if( nSize>pPage->maxLocal ){

	13146 nSize = minLocal;

	13147 }

	13148 nSize += 4 + (u16)(pIter - pCell);

	13149 }

	13150 assert( nSize==debuginfo.nSize \|\| CORRUPT_DB );

	13151 return (u16)nSize;

	13152 }

	13153 static u16 cellSizePtrNoPayload(MemPage pPage, u8 pCell){

	13154 u8 pIter = pCell + 4; / For looping over bytes of pCell */

	13155 u8 pEnd; / End mark for a varint */

	13156

	13157 #ifdef SQLITE_DEBUG

	13158 /* The value returned by this function should always be the same as

	13159 ** the (CellInfo.nSize) value found by doing a full parse of the

	13160 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of

	13161 ** this function verifies that this invariant is not violated. */

	13162 CellInfo debuginfo;

	13163 pPage->xParseCell(pPage, pCell, &debuginfo);

	13164 #else

	13165 UNUSED_PARAMETER(pPage);

	13166 #endif

	13167

	13168 assert( pPage->childPtrSize==4 );

	13169 pEnd = pIter + 9;

	13170 while( (*pIter++)&0x80 && pIter<pEnd );

	13171 assert( debuginfo.nSize==(u16)(pIter - pCell) \|\| CORRUPT_DB );

	13172 return (u16)(pIter - pCell);

	13173 }

	13174

	13175

	#ifdef SQLITE_DEBUG
	/*
	** Index-based variant of the xCellSize methods: look up the iCell-th
	** cell of pPage and return its size.  Used inside of assert()
	** statements only.
	*/
	static u16 cellSize(MemPage *pPage, int iCell){
	  u8 *pCell = findCell(pPage, iCell);   /* Start of the iCell-th cell */
	  return pPage->xCellSize(pPage, pCell);
	}
	#endif

	13183

	13184 #ifndef SQLITE_OMIT_AUTOVACUUM

	13185 /*

	13186 ** If the cell pCell, part of page pPage contains a pointer

	13187 ** to an overflow page, insert an entry into the pointer-map

	13188 ** for the overflow page.

	13189 */

	13190 static void ptrmapPutOvflPtr(MemPage pPage, u8 pCell, int *pRC){

	13191 CellInfo info;

	13192 if( *pRC ) return;

	13193 assert( pCell!=0 );

	13194 pPage->xParseCell(pPage, pCell, &info);

	13195 if( info.nLocal<info.nPayload ){

	13196 Pgno ovfl = get4byte(&pCell[info.nSize-4]);

	13197 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);

	13198 }

	13199 }

	13200 #endif

	13201

	13202

	13203 /*

	13204 ** Defragment the page given. All Cells are moved to the

	13205 ** end of the page and all free space is collected into one

	13206 ** big FreeBlk that occurs in between the header and cell

	13207 ** pointer array and the cell content area.

	13208 **

	13209 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a

	13210 ** b-tree page so that there are no freeblocks or fragment bytes, all

	13211 ** unused bytes are contained in the unallocated space region, and all

	13212 ** cells are packed tightly at the end of the page.

	13213 */

	13214 static int defragmentPage(MemPage *pPage){

	13215 int i; /* Loop counter */

	13216 int pc; /* Address of the i-th cell */

	13217 int hdr; /* Offset to the page header */

	13218 int size; /* Size of a cell */

	13219 int usableSize; /* Number of usable bytes on a page */

	13220 int cellOffset; /* Offset to the cell pointer array */

	13221 int cbrk; /* Offset to the cell content area */

	13222 int nCell; /* Number of cells on the page */

	13223 unsigned char data; / The page data */

	13224 unsigned char temp; / Temp area for cell content */

	13225 unsigned char src; / Source of content */

	13226 int iCellFirst; /* First allowable cell index */

	13227 int iCellLast; /* Last possible cell index */

	13228

	13229

	13230 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	13231 assert( pPage->pBt!=0 );

	13232 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );

	13233 assert( pPage->nOverflow==0 );

	13234 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	13235 temp = 0;

	13236 src = data = pPage->aData;

	13237 hdr = pPage->hdrOffset;

	13238 cellOffset = pPage->cellOffset;

	13239 nCell = pPage->nCell;

	13240 assert( nCell==get2byte(&data[hdr+3]) );

	13241 usableSize = pPage->pBt->usableSize;

	13242 cbrk = usableSize;

	13243 iCellFirst = cellOffset + 2*nCell;

	13244 iCellLast = usableSize - 4;

	13245 for(i=0; i<nCell; i++){

	13246 u8 pAddr; / The i-th cell pointer */

	13247 pAddr = &data[cellOffset + i*2];

	13248 pc = get2byte(pAddr);

	13249 testcase( pc==iCellFirst );

	13250 testcase( pc==iCellLast );

	13251 /* These conditions have already been verified in btreeInitPage()

	13252 ** if PRAGMA cell_size_check=ON.

	13253 */

	13254 if( pc<iCellFirst \|\| pc>iCellLast ){

	13255 return SQLITE_CORRUPT_BKPT;

	13256 }

	13257 assert( pc>=iCellFirst && pc<=iCellLast );

	13258 size = pPage->xCellSize(pPage, &src[pc]);

	13259 cbrk -= size;

	13260 if( cbrk<iCellFirst \|\| pc+size>usableSize ){

	13261 return SQLITE_CORRUPT_BKPT;

	13262 }

	13263 assert( cbrk+size<=usableSize && cbrk>=iCellFirst );

	13264 testcase( cbrk+size==usableSize );

	13265 testcase( pc+size==usableSize );

	13266 put2byte(pAddr, cbrk);

	13267 if( temp==0 ){

	13268 int x;

	13269 if( cbrk==pc ) continue;

	13270 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);

	13271 x = get2byte(&data[hdr+5]);

	13272 memcpy(&temp[x], &data[x], (cbrk+size) - x);

	13273 src = temp;

	13274 }

	13275 memcpy(&data[cbrk], &src[pc], size);

	13276 }

	13277 assert( cbrk>=iCellFirst );

	13278 put2byte(&data[hdr+5], cbrk);

	13279 data[hdr+1] = 0;

	13280 data[hdr+2] = 0;

	13281 data[hdr+7] = 0;

	13282 memset(&data[iCellFirst], 0, cbrk-iCellFirst);

	13283 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	13284 if( cbrk-iCellFirst!=pPage->nFree ){

	13285 return SQLITE_CORRUPT_BKPT;

	13286 }

	13287 return SQLITE_OK;

	13288 }

	13289

	13290 /*

	13291 ** Search the free-list on page pPg for space to store a cell nByte bytes in

	13292 ** size. If one can be found, return a pointer to the space and remove it

	13293 ** from the free-list.

	13294 **

	13295 ** If no suitable space can be found on the free-list, return NULL.

	13296 **

	13297 ** This function may detect corruption within pPg. If corruption is

	13298 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.

	13299 **

	13300 ** Slots on the free list that are between 1 and 3 bytes larger than nByte

	13301 ** will be ignored if adding the extra space to the fragmentation count

	13302 ** causes the fragmentation count to exceed 60.

	13303 */

	13304 static u8 pageFindSlot(MemPage pPg, int nByte, int *pRc){

	13305 const int hdr = pPg->hdrOffset;

	13306 u8 * const aData = pPg->aData;

	13307 int iAddr = hdr + 1;

	13308 int pc = get2byte(&aData[iAddr]);

	13309 int x;

	13310 int usableSize = pPg->pBt->usableSize;

	13311

	13312 assert( pc>0 );

	13313 do{

	13314 int size; /* Size of the free slot */

	13315 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of

	13316 ** increasing offset. */

	13317 if( pc>usableSize-4 \|\| pc<iAddr+4 ){

	13318 *pRc = SQLITE_CORRUPT_BKPT;

	13319 return 0;

	13320 }

	13321 /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each

	13322 ** freeblock form a big-endian integer which is the size of the freeblock

	13323 ** in bytes, including the 4-byte header. */

	13324 size = get2byte(&aData[pc+2]);

	13325 if( (x = size - nByte)>=0 ){

	13326 testcase( x==4 );

	13327 testcase( x==3 );

	13328 if( pc < pPg->cellOffset+2*pPg->nCell \|\| size+pc > usableSize ){

	13329 *pRc = SQLITE_CORRUPT_BKPT;

	13330 return 0;

	13331 }else if( x<4 ){

	13332 /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total

	13333 ** number of bytes in fragments may not exceed 60. */

	13334 if( aData[hdr+7]>57 ) return 0;

	13335

	13336 /* Remove the slot from the free-list. Update the number of

	13337 ** fragmented bytes within the page. */

	13338 memcpy(&aData[iAddr], &aData[pc], 2);

	13339 aData[hdr+7] += (u8)x;

	13340 }else{

	13341 /* The slot remains on the free-list. Reduce its size to account

	13342 ** for the portion used by the new allocation. */

	13343 put2byte(&aData[pc+2], x);

	13344 }

	13345 return &aData[pc + x];

	13346 }

	13347 iAddr = pc;

	13348 pc = get2byte(&aData[pc]);

	13349 }while( pc );

	13350

	13351 return 0;

	13352 }

	13353

	13354 /*

	13355 ** Allocate nByte bytes of space from within the B-Tree page passed

	13356 ** as the first argument. Write into *pIdx the index into pPage->aData[]

	13357 ** of the first byte of allocated space. Return either SQLITE_OK or

	13358 ** an error code (usually SQLITE_CORRUPT).

	13359 **

	13360 ** The caller guarantees that there is sufficient space to make the

	13361 ** allocation. This routine might need to defragment in order to bring

	13362 ** all the space together, however. This routine will avoid using

	13363 ** the first two bytes past the cell pointer area since presumably this

	13364 ** allocation is being made in order to insert a new cell, so we will

	13365 ** also end up needing a new cell pointer.

	13366 */

	13367 static int allocateSpace(MemPage pPage, int nByte, int pIdx){

	13368 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */

	13369 u8 * const data = pPage->aData; /* Local cache of pPage->aData */

	13370 int top; /* First byte of cell content area */

	13371 int rc = SQLITE_OK; /* Integer return code */

	13372 int gap; /* First byte of gap between cell pointers and cell content */

	13373

	13374 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	13375 assert( pPage->pBt );

	13376 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	13377 assert( nByte>=0 ); /* Minimum cell size is 4 */

	13378 assert( pPage->nFree>=nByte );

	13379 assert( pPage->nOverflow==0 );

	13380 assert( nByte < (int)(pPage->pBt->usableSize-8) );

	13381

	13382 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );

	13383 gap = pPage->cellOffset + 2*pPage->nCell;

	13384 assert( gap<=65536 );

	13385 /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size

	13386 ** and the reserved space is zero (the usual value for reserved space)

	13387 ** then the cell content offset of an empty page wants to be 65536.

	13388 ** However, that integer is too large to be stored in a 2-byte unsigned

	13389 ** integer, so a value of 0 is used in its place. */

	13390 top = get2byte(&data[hdr+5]);

	13391 assert( top<=(int)pPage->pBt->usableSize ); /* Prevent by getAndInitPage() */

	13392 if( gap>top ){

	13393 if( top==0 && pPage->pBt->usableSize==65536 ){

	13394 top = 65536;

	13395 }else{

	13396 return SQLITE_CORRUPT_BKPT;

	13397 }

	13398 }

	13399

	13400 /* If there is enough space between gap and top for one more cell pointer

	13401 ** array entry offset, and if the freelist is not empty, then search the

	13402 ** freelist looking for a free slot big enough to satisfy the request.

	13403 */

	13404 testcase( gap+2==top );

	13405 testcase( gap+1==top );

	13406 testcase( gap==top );

	13407 if( (data[hdr+2] \|\| data[hdr+1]) && gap+2<=top ){

	13408 u8 *pSpace = pageFindSlot(pPage, nByte, &rc);

	13409 if( pSpace ){

	13410 assert( pSpace>=data && (pSpace - data)<65536 );

	13411 *pIdx = (int)(pSpace - data);

	13412 return SQLITE_OK;

	13413 }else if( rc ){

	13414 return rc;

	13415 }

	13416 }

	13417

	13418 /* The request could not be fulfilled using a freelist slot. Check

	13419 ** to see if defragmentation is necessary.

	13420 */

	13421 testcase( gap+2+nByte==top );

	13422 if( gap+2+nByte>top ){

	13423 assert( pPage->nCell>0 \|\| CORRUPT_DB );

	13424 rc = defragmentPage(pPage);

	13425 if( rc ) return rc;

	13426 top = get2byteNotZero(&data[hdr+5]);

	13427 assert( gap+nByte<=top );

	13428 }

	13429

	13430

	13431 /* Allocate memory from the gap in between the cell pointer array

	13432 ** and the cell content area. The btreeInitPage() call has already

	13433 ** validated the freelist. Given that the freelist is valid, there

	13434 ** is no way that the allocation can extend off the end of the page.

	13435 ** The assert() below verifies the previous sentence.

	13436 */

	13437 top -= nByte;

	13438 put2byte(&data[hdr+5], top);

	13439 assert( top+nByte <= (int)pPage->pBt->usableSize );

	13440 *pIdx = top;

	13441 return SQLITE_OK;

	13442 }

	13443

	13444 /*

	13445 ** Return a section of the pPage->aData to the freelist.

	13446 ** The first byte of the new free block is pPage->aData[iStart]

	13447 ** and the size of the block is iSize bytes.

	13448 **

	13449 ** Adjacent freeblocks are coalesced.

	13450 **

	13451 ** Note that even though the freeblock list was checked by btreeInitPage(),

	13452 ** that routine will not detect overlap between cells or freeblocks. Nor

	13453 ** does it detect cells or freeblocks that encrouch into the reserved bytes

	13454 ** at the end of the page. So do additional corruption checks inside this

	13455 ** routine and return SQLITE_CORRUPT if any problems are found.

	13456 */

	13457 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){

	13458 u16 iPtr; /* Address of ptr to next freeblock */

	13459 u16 iFreeBlk; /* Address of the next freeblock */

	13460 u8 hdr; /* Page header size. 0 or 100 */

	13461 u8 nFrag = 0; /* Reduction in fragmentation */

	13462 u16 iOrigSize = iSize; /* Original value of iSize */

	13463 u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */

	13464 u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */

	13465 unsigned char data = pPage->aData; / Page content */

	13466

	13467 assert( pPage->pBt!=0 );

	13468 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	13469 assert( CORRUPT_DB \|\| iStart>=pPage->hdrOffset+6+pPage->childPtrSize );

	13470 assert( CORRUPT_DB \|\| iEnd <= pPage->pBt->usableSize );

	13471 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	13472 assert( iSize>=4 ); /* Minimum cell size is 4 */

	13473 assert( iStart<=iLast );

	13474

	13475 /* Overwrite deleted information with zeros when the secure_delete

	13476 ** option is enabled */

	13477 if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){

	13478 memset(&data[iStart], 0, iSize);

	13479 }

	13480

	13481 /* The list of freeblocks must be in ascending order. Find the

	13482 ** spot on the list where iStart should be inserted.

	13483 */

	13484 hdr = pPage->hdrOffset;

	13485 iPtr = hdr + 1;

	13486 if( data[iPtr+1]==0 && data[iPtr]==0 ){

	13487 iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */

	13488 }else{

	13489 while( (iFreeBlk = get2byte(&data[iPtr]))>0 && iFreeBlk<iStart ){

	13490 if( iFreeBlk<iPtr+4 ) return SQLITE_CORRUPT_BKPT;

	13491 iPtr = iFreeBlk;

	13492 }

	13493 if( iFreeBlk>iLast ) return SQLITE_CORRUPT_BKPT;

	13494 assert( iFreeBlk>iPtr \|\| iFreeBlk==0 );

	13495

	13496 /* At this point:

	13497 ** iFreeBlk: First freeblock after iStart, or zero if none

	13498 ** iPtr: The address of a pointer to iFreeBlk

	13499 **

	13500 ** Check to see if iFreeBlk should be coalesced onto the end of iStart.

	13501 */

	13502 if( iFreeBlk && iEnd+3>=iFreeBlk ){

	13503 nFrag = iFreeBlk - iEnd;

	13504 if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT;

	13505 iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);

	13506 if( iEnd > pPage->pBt->usableSize ) return SQLITE_CORRUPT_BKPT;

	13507 iSize = iEnd - iStart;

	13508 iFreeBlk = get2byte(&data[iFreeBlk]);

	13509 }

	13510

	13511 /* If iPtr is another freeblock (that is, if iPtr is not the freelist

	13512 ** pointer in the page header) then check to see if iStart should be

	13513 ** coalesced onto the end of iPtr.

	13514 */

	13515 if( iPtr>hdr+1 ){

	13516 int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);

	13517 if( iPtrEnd+3>=iStart ){

	13518 if( iPtrEnd>iStart ) return SQLITE_CORRUPT_BKPT;

	13519 nFrag += iStart - iPtrEnd;

	13520 iSize = iEnd - iPtr;

	13521 iStart = iPtr;

	13522 }

	13523 }

	13524 if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_BKPT;

	13525 data[hdr+7] -= nFrag;

	13526 }

	13527 if( iStart==get2byte(&data[hdr+5]) ){

	13528 /* The new freeblock is at the beginning of the cell content area,

	13529 ** so just extend the cell content area rather than create another

	13530 ** freelist entry */

	13531 if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_BKPT;

	13532 put2byte(&data[hdr+1], iFreeBlk);

	13533 put2byte(&data[hdr+5], iEnd);

	13534 }else{

	13535 /* Insert the new freeblock into the freelist */

	13536 put2byte(&data[iPtr], iStart);

	13537 put2byte(&data[iStart], iFreeBlk);

	13538 put2byte(&data[iStart+2], iSize);

	13539 }

	13540 pPage->nFree += iOrigSize;

	13541 return SQLITE_OK;

	13542 }

	13543

	13544 /*

	13545 ** Decode the flags byte (the first byte of the header) for a page

	13546 ** and initialize fields of the MemPage structure accordingly.

	13547 **

	13548 ** Only the following combinations are supported. Anything different

	13549 ** indicates a corrupt database files:

	13550 **

	13551 ** PTF_ZERODATA

	13552 ** PTF_ZERODATA \| PTF_LEAF

	13553 ** PTF_LEAFDATA \| PTF_INTKEY

	13554 ** PTF_LEAFDATA \| PTF_INTKEY \| PTF_LEAF

	13555 */

	13556 static int decodeFlags(MemPage *pPage, int flagByte){

	13557 BtShared pBt; / A copy of pPage->pBt */

	13558

	13559 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );

	13560 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	13561 pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 );

	13562 flagByte &= ~PTF_LEAF;

	13563 pPage->childPtrSize = 4-4*pPage->leaf;

	13564 pPage->xCellSize = cellSizePtr;

	13565 pBt = pPage->pBt;

	13566 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){

	13567 /* EVIDENCE-OF: R-03640-13415 A value of 5 means the page is an interior

	13568 ** table b-tree page. */

	13569 assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );

	13570 /* EVIDENCE-OF: R-20501-61796 A value of 13 means the page is a leaf

	13571 ** table b-tree page. */

	13572 assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );

	13573 pPage->intKey = 1;

	13574 if( pPage->leaf ){

	13575 pPage->intKeyLeaf = 1;

	13576 pPage->noPayload = 0;

	13577 pPage->xParseCell = btreeParseCellPtr;

	13578 }else{

	13579 pPage->intKeyLeaf = 0;

	13580 pPage->noPayload = 1;

	13581 pPage->xCellSize = cellSizePtrNoPayload;

	13582 pPage->xParseCell = btreeParseCellPtrNoPayload;

	13583 }

	13584 pPage->maxLocal = pBt->maxLeaf;

	13585 pPage->minLocal = pBt->minLeaf;

	13586 }else if( flagByte==PTF_ZERODATA ){

	13587 /* EVIDENCE-OF: R-27225-53936 A value of 2 means the page is an interior

	13588 ** index b-tree page. */

	13589 assert( (PTF_ZERODATA)==2 );

	13590 /* EVIDENCE-OF: R-16571-11615 A value of 10 means the page is a leaf

	13591 ** index b-tree page. */

	13592 assert( (PTF_ZERODATA|PTF_LEAF)==10 );

	13593 pPage->intKey = 0;

	13594 pPage->intKeyLeaf = 0;

	13595 pPage->noPayload = 0;

	13596 pPage->xParseCell = btreeParseCellPtrIndex;

	13597 pPage->maxLocal = pBt->maxLocal;

	13598 pPage->minLocal = pBt->minLocal;

	13599 }else{

	13600 /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is

	13601 ** an error. */

	13602 return SQLITE_CORRUPT_BKPT;

	13603 }

	13604 pPage->max1bytePayload = pBt->max1bytePayload;

	13605 return SQLITE_OK;

	13606 }

	13607

	13608 /*

	13609 ** Initialize the auxiliary information for a disk block.

	13610 **

	13611 ** Return SQLITE_OK on success. If we see that the page does

	13612 ** not contain a well-formed database page, then return

	13613 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not

	13614 ** guarantee that the page is well-formed. It only shows that

	13615 ** we failed to detect any corruption.

	13616 */

	13617 static int btreeInitPage(MemPage *pPage){

	13618

	13619 assert( pPage->pBt!=0 );

	13620 assert( pPage->pBt->db!=0 );

	13621 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	13622 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );

	13623 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );

	13624 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );

	13625

	13626 if( !pPage->isInit ){

	13627 u16 pc; /* Address of a freeblock within pPage->aData[] */

	13628 u8 hdr; /* Offset to beginning of page header */

	13629 u8 *data; /* Equal to pPage->aData */

	13630 BtShared *pBt; /* The main btree structure */

	13631 int usableSize; /* Amount of usable space on each page */

	13632 u16 cellOffset; /* Offset from start of page to first cell pointer */

	13633 int nFree; /* Number of unused bytes on the page */

	13634 int top; /* First byte of the cell content area */

	13635 int iCellFirst; /* First allowable cell or freeblock offset */

	13636 int iCellLast; /* Last possible cell or freeblock offset */

	13637

	13638 pBt = pPage->pBt;

	13639

	13640 hdr = pPage->hdrOffset;

	13641 data = pPage->aData;

	13642 /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating

	13643 ** the b-tree page type. */

	13644 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;

	13645 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );

	13646 pPage->maskPage = (u16)(pBt->pageSize - 1);

	13647 pPage->nOverflow = 0;

	13648 usableSize = pBt->usableSize;

	13649 pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;

	13650 pPage->aDataEnd = &data[usableSize];

	13651 pPage->aCellIdx = &data[cellOffset];

	13652 pPage->aDataOfst = &data[pPage->childPtrSize];

	13653 /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates

	13654 ** the start of the cell content area. A zero value for this integer is

	13655 ** interpreted as 65536. */

	13656 top = get2byteNotZero(&data[hdr+5]);

	13657 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the

	13658 ** number of cells on the page. */

	13659 pPage->nCell = get2byte(&data[hdr+3]);

	13660 if( pPage->nCell>MX_CELL(pBt) ){

	13661 /* Too many cells for a single page. The page must be corrupt */

	13662 return SQLITE_CORRUPT_BKPT;

	13663 }

	13664 testcase( pPage->nCell==MX_CELL(pBt) );

	13665 /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only

	13666 ** possible for a root page of a table that contains no rows) then the

	13667 ** offset to the cell content area will equal the page size minus the

	13668 ** bytes of reserved space. */

	13669 assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB );

	13670

	13671 /* A malformed database page might cause us to read past the end

	13672 ** of page when parsing a cell.

	13673 **

	13674 ** The following block of code checks early to see if a cell extends

	13675 ** past the end of a page boundary and causes SQLITE_CORRUPT to be

	13676 ** returned if it does.

	13677 */

	13678 iCellFirst = cellOffset + 2*pPage->nCell;

	13679 iCellLast = usableSize - 4;

	13680 if( pBt->db->flags & SQLITE_CellSizeCk ){

	13681 int i; /* Index into the cell pointer array */

	13682 int sz; /* Size of a cell */

	13683

	13684 if( !pPage->leaf ) iCellLast--;

	13685 for(i=0; i<pPage->nCell; i++){

	13686 pc = get2byteAligned(&data[cellOffset+i*2]);

	13687 testcase( pc==iCellFirst );

	13688 testcase( pc==iCellLast );

	13689 if( pc<iCellFirst || pc>iCellLast ){

	13690 return SQLITE_CORRUPT_BKPT;

	13691 }

	13692 sz = pPage->xCellSize(pPage, &data[pc]);

	13693 testcase( pc+sz==usableSize );

	13694 if( pc+sz>usableSize ){

	13695 return SQLITE_CORRUPT_BKPT;

	13696 }

	13697 }

	13698 if( !pPage->leaf ) iCellLast++;

	13699 }

	13700

	13701 /* Compute the total free space on the page

	13702 ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the

	13703 ** start of the first freeblock on the page, or is zero if there are no

	13704 ** freeblocks. */

	13705 pc = get2byte(&data[hdr+1]);

	13706 nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */

	13707 while( pc>0 ){

	13708 u16 next, size;

	13709 if( pc<iCellFirst || pc>iCellLast ){

	13710 /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will

	13711 ** always be at least one cell before the first freeblock.

	13712 **

	13713 ** Or, the freeblock is off the end of the page

	13714 */

	13715 return SQLITE_CORRUPT_BKPT;

	13716 }

	13717 next = get2byte(&data[pc]);

	13718 size = get2byte(&data[pc+2]);

	13719 if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){

	13720 /* Free blocks must be in ascending order. And the last byte of

	13721 ** the free-block must lie on the database page. */

	13722 return SQLITE_CORRUPT_BKPT;

	13723 }

	13724 nFree = nFree + size;

	13725 pc = next;

	13726 }

	13727

	13728 /* At this point, nFree contains the sum of the offset to the start

	13729 ** of the cell-content area plus the number of free bytes within

	13730 ** the cell-content area. If this is greater than the usable-size

	13731 ** of the page, then the page must be corrupted. This check also

	13732 ** serves to verify that the offset to the start of the cell-content

	13733 ** area, according to the page header, lies within the page.

	13734 */

	13735 if( nFree>usableSize ){

	13736 return SQLITE_CORRUPT_BKPT;

	13737 }

	13738 pPage->nFree = (u16)(nFree - iCellFirst);

	13739 pPage->isInit = 1;

	13740 }

	13741 return SQLITE_OK;

	13742 }

	13743

	13744 /*

	13745 ** Set up a raw page so that it looks like a database page holding

	13746 ** no entries.

	13747 */

	13748 static void zeroPage(MemPage *pPage, int flags){

	13749 unsigned char *data = pPage->aData;

	13750 BtShared *pBt = pPage->pBt;

	13751 u8 hdr = pPage->hdrOffset;

	13752 u16 first;

	13753

	13754 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );

	13755 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );

	13756 assert( sqlite3PagerGetData(pPage->pDbPage) == data );

	13757 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	13758 assert( sqlite3_mutex_held(pBt->mutex) );

	13759 if( pBt->btsFlags & BTS_SECURE_DELETE ){

	13760 memset(&data[hdr], 0, pBt->usableSize - hdr);

	13761 }

	13762 data[hdr] = (char)flags;

	13763 first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);

	13764 memset(&data[hdr+1], 0, 4);

	13765 data[hdr+7] = 0;

	13766 put2byte(&data[hdr+5], pBt->usableSize);

	13767 pPage->nFree = (u16)(pBt->usableSize - first);

	13768 decodeFlags(pPage, flags);

	13769 pPage->cellOffset = first;

	13770 pPage->aDataEnd = &data[pBt->usableSize];

	13771 pPage->aCellIdx = &data[first];

	13772 pPage->aDataOfst = &data[pPage->childPtrSize];

	13773 pPage->nOverflow = 0;

	13774 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );

	13775 pPage->maskPage = (u16)(pBt->pageSize - 1);

	13776 pPage->nCell = 0;

	13777 pPage->isInit = 1;

	13778 }

	13779

	13780

	13781 /*

	13782 ** Convert a DbPage obtained from the pager into a MemPage used by

	13783 ** the btree layer.

	13784 */

	13785 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){

	13786 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);

	13787 if( pgno!=pPage->pgno ){

	13788 pPage->aData = sqlite3PagerGetData(pDbPage);

	13789 pPage->pDbPage = pDbPage;

	13790 pPage->pBt = pBt;

	13791 pPage->pgno = pgno;

	13792 pPage->hdrOffset = pgno==1 ? 100 : 0;

	13793 }

	13794 assert( pPage->aData==sqlite3PagerGetData(pDbPage) );

	13795 return pPage;

	13796 }

	13797

	13798 /*

	13799 ** Get a page from the pager. Initialize the MemPage.pBt and

	13800 ** MemPage.aData elements if needed. See also: btreeGetUnusedPage().

	13801 **

	13802 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care

	13803 ** about the content of the page at this time. So do not go to the disk

	13804 ** to fetch the content. Just fill in the content with zeros for now.

	13805 ** If in the future we call sqlite3PagerWrite() on this page, that

	13806 ** means we have started to be concerned about content and the disk

	13807 ** read should occur at that point.

	13808 */

	13809 static int btreeGetPage(

	13810 BtShared *pBt, /* The btree */

	13811 Pgno pgno, /* Number of the page to fetch */

	13812 MemPage **ppPage, /* Return the page in this parameter */

	13813 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */

	13814 ){

	13815 int rc;

	13816 DbPage *pDbPage;

	13817

	13818 assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );

	13819 assert( sqlite3_mutex_held(pBt->mutex) );

	13820 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);

	13821 if( rc ) return rc;

	13822 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);

	13823 return SQLITE_OK;

	13824 }

	13825

	13826 /*

	13827 ** Retrieve a page from the pager cache. If the requested page is not

	13828 ** already in the pager cache return NULL. Initialize the MemPage.pBt and

	13829 ** MemPage.aData elements if needed.

	13830 */

	13831 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){

	13832 DbPage *pDbPage;

	13833 assert( sqlite3_mutex_held(pBt->mutex) );

	13834 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);

	13835 if( pDbPage ){

	13836 return btreePageFromDbPage(pDbPage, pgno, pBt);

	13837 }

	13838 return 0;

	13839 }

	13840

	13841 /*

	13842 ** Return the size of the database file in pages. If there is any kind of

	13843 ** error, return ((unsigned int)-1).

	13844 */

	13845 static Pgno btreePagecount(BtShared *pBt){

	13846 return pBt->nPage;

	13847 }

	13848 SQLITE_PRIVATE u32 sqlite3BtreeLastPage(Btree *p){

	13849 assert( sqlite3BtreeHoldsMutex(p) );

	13850 assert( ((p->pBt->nPage)&0x8000000)==0 );

	13851 return btreePagecount(p->pBt);

	13852 }

	13853

	13854 /*

	13855 ** Get a page from the pager and initialize it.

	13856 **

	13857 ** If pCur!=0 then the page is being fetched as part of a moveToChild()

	13858 ** call. Do additional sanity checking on the page in this case.

	13859 ** And if the fetch fails, this routine must decrement pCur->iPage.

	13860 **

	13861 ** The page is fetched as read-write unless pCur is not NULL and is

	13862 ** a read-only cursor.

	13863 **

	13864 ** If an error occurs, then *ppPage is undefined. It

	13865 ** may remain unchanged, or it may be set to an invalid value.

	13866 */

	13867 static int getAndInitPage(

	13868 BtShared *pBt, /* The database file */

	13869 Pgno pgno, /* Number of the page to get */

	13870 MemPage **ppPage, /* Write the page pointer here */

	13871 BtCursor *pCur, /* Cursor to receive the page, or NULL */

	13872 int bReadOnly /* True for a read-only page */

	13873 ){

	13874 int rc;

	13875 DbPage *pDbPage;

	13876 assert( sqlite3_mutex_held(pBt->mutex) );

	13877 assert( pCur==0 || ppPage==&pCur->apPage[pCur->iPage] );

	13878 assert( pCur==0 || bReadOnly==pCur->curPagerFlags );

	13879 assert( pCur==0 || pCur->iPage>0 );

	13880

	13881 if( pgno>btreePagecount(pBt) ){

	13882 rc = SQLITE_CORRUPT_BKPT;

	13883 goto getAndInitPage_error;

	13884 }

	13885 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);

	13886 if( rc ){

	13887 goto getAndInitPage_error;

	13888 }

	13889 *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);

	13890 if( (*ppPage)->isInit==0 ){

	13891 btreePageFromDbPage(pDbPage, pgno, pBt);

	13892 rc = btreeInitPage(*ppPage);

	13893 if( rc!=SQLITE_OK ){

	13894 releasePage(*ppPage);

	13895 goto getAndInitPage_error;

	13896 }

	13897 }

	13898 assert( (*ppPage)->pgno==pgno );

	13899 assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );

	13900

	13901 /* If obtaining a child page for a cursor, we must verify that the page is

	13902 ** compatible with the root page. */

	13903 if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){

	13904 rc = SQLITE_CORRUPT_BKPT;

	13905 releasePage(*ppPage);

	13906 goto getAndInitPage_error;

	13907 }

	13908 return SQLITE_OK;

	13909

	13910 getAndInitPage_error:

	13911 if( pCur ) pCur->iPage--;

	13912 testcase( pgno==0 );

	13913 assert( pgno!=0 || rc==SQLITE_CORRUPT );

	13914 return rc;

	13915 }

	13916

	13917 /*

	13918 ** Release a MemPage. This should be called once for each prior

	13919 ** call to btreeGetPage.

	13920 */

	13921 static void releasePageNotNull(MemPage *pPage){

	13922 assert( pPage->aData );

	13923 assert( pPage->pBt );

	13924 assert( pPage->pDbPage!=0 );

	13925 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );

	13926 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );

	13927 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	13928 sqlite3PagerUnrefNotNull(pPage->pDbPage);

	13929 }

	13930 static void releasePage(MemPage *pPage){

	13931 if( pPage ) releasePageNotNull(pPage);

	13932 }

	13933

	13934 /*

	13935 ** Get an unused page.

	13936 **

	13937 ** This works just like btreeGetPage() with the addition:

	13938 **

	13939 ** * If the page is already in use for some other purpose, immediately

	13940 ** release it and return an SQLITE_CORRUPT error.

	13941 ** * Make sure the isInit flag is clear

	13942 */

	13943 static int btreeGetUnusedPage(

	13944 BtShared *pBt, /* The btree */

	13945 Pgno pgno, /* Number of the page to fetch */

	13946 MemPage **ppPage, /* Return the page in this parameter */

	13947 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */

	13948 ){

	13949 int rc = btreeGetPage(pBt, pgno, ppPage, flags);

	13950 if( rc==SQLITE_OK ){

	13951 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){

	13952 releasePage(*ppPage);

	13953 *ppPage = 0;

	13954 return SQLITE_CORRUPT_BKPT;

	13955 }

	13956 (*ppPage)->isInit = 0;

	13957 }else{

	13958 *ppPage = 0;

	13959 }

	13960 return rc;

	13961 }

	13962

	13963

	13964 /*

	13965 ** During a rollback, when the pager reloads information into the cache

	13966 ** so that the cache is restored to its original state at the start of

	13967 ** the transaction, for each page restored this routine is called.

	13968 **

	13969 ** This routine needs to reset the extra data section at the end of the

	13970 ** page to agree with the restored data.

	13971 */

	13972 static void pageReinit(DbPage *pData){

	13973 MemPage *pPage;

	13974 pPage = (MemPage *)sqlite3PagerGetExtra(pData);

	13975 assert( sqlite3PagerPageRefcount(pData)>0 );

	13976 if( pPage->isInit ){

	13977 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	13978 pPage->isInit = 0;

	13979 if( sqlite3PagerPageRefcount(pData)>1 ){

	13980 /* pPage might not be a btree page; it might be an overflow page

	13981 ** or ptrmap page or a free page. In those cases, the following

	13982 ** call to btreeInitPage() will likely return SQLITE_CORRUPT.

	13983 ** But no harm is done by this. And it is very important that

	13984 ** btreeInitPage() be called on every btree page so we make

	13985 ** the call for every page that comes in for re-initing. */

	13986 btreeInitPage(pPage);

	13987 }

	13988 }

	13989 }

	13990

	13991 /*

	13992 ** Invoke the busy handler for a btree.

	13993 */

	13994 static int btreeInvokeBusyHandler(void *pArg){

	13995 BtShared *pBt = (BtShared*)pArg;

	13996 assert( pBt->db );

	13997 assert( sqlite3_mutex_held(pBt->db->mutex) );

	13998 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);

	13999 }

	14000

	14001 /*

	14002 ** Open a database file.

	14003 **

	14004 ** zFilename is the name of the database file. If zFilename is NULL

	14005 ** then an ephemeral database is created. The ephemeral database might

	14006 ** be exclusively in memory, or it might use a disk-based memory cache.

	14007 ** Either way, the ephemeral database will be automatically deleted

	14008 ** when sqlite3BtreeClose() is called.

	14009 **

	14010 ** If zFilename is ":memory:" then an in-memory database is created

	14011 ** that is automatically destroyed when it is closed.

	14012 **

	14013 ** The "flags" parameter is a bitmask that might contain bits like

	14014 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.

	14015 **

	14016 ** If the database is already opened in the same database connection

	14017 ** and we are in shared cache mode, then the open will fail with an

	14018 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared

	14019 ** objects in the same database connection since doing so will lead

	14020 ** to problems with locking.

	14021 */

	14022 SQLITE_PRIVATE int sqlite3BtreeOpen(

	14023 sqlite3_vfs *pVfs, /* VFS to use for this b-tree */

	14024 const char *zFilename, /* Name of the file containing the BTree database */

	14025 sqlite3 *db, /* Associated database handle */

	14026 Btree **ppBtree, /* Pointer to new Btree object written here */

	14027 int flags, /* Options */

	14028 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */

	14029 ){

	14030 BtShared *pBt = 0; /* Shared part of btree structure */

	14031 Btree *p; /* Handle to return */

	14032 sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. Ticket #3537 */

	14033 int rc = SQLITE_OK; /* Result code from this function */

	14034 u8 nReserve; /* Byte of unused space on each page */

	14035 unsigned char zDbHeader[100]; /* Database header content */

	14036

	14037 /* True if opening an ephemeral, temporary database */

	14038 const int isTempDb = zFilename==0 || zFilename[0]==0;

	14039

	14040 /* Set the variable isMemdb to true for an in-memory database, or

	14041 ** false for a file-based database.

	14042 */

	14043 #ifdef SQLITE_OMIT_MEMORYDB

	14044 const int isMemdb = 0;

	14045 #else

	14046 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)

	14047 || (isTempDb && sqlite3TempInMemory(db))

	14048 || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;

	14049 #endif

	14050

	14051 assert( db!=0 );

	14052 assert( pVfs!=0 );

	14053 assert( sqlite3_mutex_held(db->mutex) );

	14054 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */

	14055

	14056 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */

	14057 assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );

	14058

	14059 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */

	14060 assert( (flags & BTREE_SINGLE)==0 || isTempDb );

	14061

	14062 if( isMemdb ){

	14063 flags |= BTREE_MEMORY;

	14064 }

	14065 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){

	14066 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;

	14067 }

	14068 p = sqlite3MallocZero(sizeof(Btree));

	14069 if( !p ){

	14070 return SQLITE_NOMEM;

	14071 }

	14072 p->inTrans = TRANS_NONE;

	14073 p->db = db;

	14074 #ifndef SQLITE_OMIT_SHARED_CACHE

	14075 p->lock.pBtree = p;

	14076 p->lock.iTable = 1;

	14077 #endif

	14078

	14079 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)

	14080 /*

	14081 ** If this Btree is a candidate for shared cache, try to find an

	14082 ** existing BtShared object that we can share with

	14083 */

	14084 if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){

	14085 if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){

	14086 int nFilename = sqlite3Strlen30(zFilename)+1;

	14087 int nFullPathname = pVfs->mxPathname+1;

	14088 char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));

	14089 MUTEX_LOGIC( sqlite3_mutex *mutexShared; )

	14090

	14091 p->sharable = 1;

	14092 if( !zFullPathname ){

	14093 sqlite3_free(p);

	14094 return SQLITE_NOMEM;

	14095 }

	14096 if( isMemdb ){

	14097 memcpy(zFullPathname, zFilename, nFilename);

	14098 }else{

	14099 rc = sqlite3OsFullPathname(pVfs, zFilename,

	14100 nFullPathname, zFullPathname);

	14101 if( rc ){

	14102 sqlite3_free(zFullPathname);

	14103 sqlite3_free(p);

	14104 return rc;

	14105 }

	14106 }

	14107 #if SQLITE_THREADSAFE

	14108 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);

	14109 sqlite3_mutex_enter(mutexOpen);

	14110 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);

	14111 sqlite3_mutex_enter(mutexShared);

	14112 #endif

	14113 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){

	14114 assert( pBt->nRef>0 );

	14115 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))

	14116 && sqlite3PagerVfs(pBt->pPager)==pVfs ){

	14117 int iDb;

	14118 for(iDb=db->nDb-1; iDb>=0; iDb--){

	14119 Btree *pExisting = db->aDb[iDb].pBt;

	14120 if( pExisting && pExisting->pBt==pBt ){

	14121 sqlite3_mutex_leave(mutexShared);

	14122 sqlite3_mutex_leave(mutexOpen);

	14123 sqlite3_free(zFullPathname);

	14124 sqlite3_free(p);

	14125 return SQLITE_CONSTRAINT;

	14126 }

	14127 }

	14128 p->pBt = pBt;

	14129 pBt->nRef++;

	14130 break;

	14131 }

	14132 }

	14133 sqlite3_mutex_leave(mutexShared);

	14134 sqlite3_free(zFullPathname);

	14135 }

	14136 #ifdef SQLITE_DEBUG

	14137 else{

	14138 /* In debug mode, we mark all persistent databases as sharable

	14139 ** even when they are not. This exercises the locking code and

	14140 ** gives more opportunity for asserts(sqlite3_mutex_held())

	14141 ** statements to find locking problems.

	14142 */

	14143 p->sharable = 1;

	14144 }

	14145 #endif

	14146 }

	14147 #endif

	14148 if( pBt==0 ){

	14149 /*

	14150 ** The following asserts make sure that structures used by the btree are

	14151 ** the right size. This is to guard against size changes that result

	14152 ** when compiling on a different architecture.

	14153 */

	14154 assert( sizeof(i64)==8 );

	14155 assert( sizeof(u64)==8 );

	14156 assert( sizeof(u32)==4 );

	14157 assert( sizeof(u16)==2 );

	14158 assert( sizeof(Pgno)==4 );

	14159

	14160 pBt = sqlite3MallocZero( sizeof(*pBt) );

	14161 if( pBt==0 ){

	14162 rc = SQLITE_NOMEM;

	14163 goto btree_open_out;

	14164 }

	14165 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,

	14166 EXTRA_SIZE, flags, vfsFlags, pageReinit);

	14167 if( rc==SQLITE_OK ){

	14168 sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);

	14169 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);

	14170 }

	14171 if( rc!=SQLITE_OK ){

	14172 goto btree_open_out;

	14173 }

	14174 pBt->openFlags = (u8)flags;

	14175 pBt->db = db;

	14176 sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);

	14177 p->pBt = pBt;

	14178

	14179 pBt->pCursor = 0;

	14180 pBt->pPage1 = 0;

	14181 if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;

	14182 #ifdef SQLITE_SECURE_DELETE

	14183 pBt->btsFlags |= BTS_SECURE_DELETE;

	14184 #endif

	14185 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is

	14186 ** determined by the 2-byte integer located at an offset of 16 bytes from

	14187 ** the beginning of the database file. */

	14188 pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);

	14189 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE

	14190 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){

	14191 pBt->pageSize = 0;

	14192 #ifndef SQLITE_OMIT_AUTOVACUUM

	14193 /* If the magic name ":memory:" will create an in-memory database, then

	14194 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if

	14195 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if

	14196 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a

	14197 ** regular file-name. In this case the auto-vacuum applies as per normal.

	14198 */

	14199 if( zFilename && !isMemdb ){

	14200 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);

	14201 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);

	14202 }

	14203 #endif

	14204 nReserve = 0;

	14205 }else{

	14206 /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is

	14207 ** determined by the one-byte unsigned integer found at an offset of 20

	14208 ** into the database file header. */

	14209 nReserve = zDbHeader[20];

	14210 pBt->btsFlags |= BTS_PAGESIZE_FIXED;

	14211 #ifndef SQLITE_OMIT_AUTOVACUUM

	14212 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);

	14213 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);

	14214 #endif

	14215 }

	14216 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);

	14217 if( rc ) goto btree_open_out;

	14218 pBt->usableSize = pBt->pageSize - nReserve;

	14219 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */

	14220

	14221 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)

	14222 /* Add the new BtShared object to the linked list of sharable BtShareds.

	14223 */

	14224 if( p->sharable ){

	14225 MUTEX_LOGIC( sqlite3_mutex *mutexShared; )

	14226 pBt->nRef = 1;

	14227 MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)

	14228 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){

	14229 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);

	14230 if( pBt->mutex==0 ){

	14231 rc = SQLITE_NOMEM;

	14232 db->mallocFailed = 0;

	14233 goto btree_open_out;

	14234 }

	14235 }

	14236 sqlite3_mutex_enter(mutexShared);

	14237 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);

	14238 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;

	14239 sqlite3_mutex_leave(mutexShared);

	14240 }

	14241 #endif

	14242 }

	14243

	14244 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)

	14245 /* If the new Btree uses a sharable pBtShared, then link the new

	14246 ** Btree into the list of all sharable Btrees for the same connection.

	14247 ** The list is kept in ascending order by pBt address.

	14248 */

	14249 if( p->sharable ){

	14250 int i;

	14251 Btree *pSib;

	14252 for(i=0; i<db->nDb; i++){

	14253 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){

	14254 while( pSib->pPrev ){ pSib = pSib->pPrev; }

	14255 if( p->pBt<pSib->pBt ){

	14256 p->pNext = pSib;

	14257 p->pPrev = 0;

	14258 pSib->pPrev = p;

	14259 }else{

	14260 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){

	14261 pSib = pSib->pNext;

	14262 }

	14263 p->pNext = pSib->pNext;

	14264 p->pPrev = pSib;

	14265 if( p->pNext ){

	14266 p->pNext->pPrev = p;

	14267 }

	14268 pSib->pNext = p;

	14269 }

	14270 break;

	14271 }

	14272 }

	14273 }

	14274 #endif

	14275 *ppBtree = p;

	14276

	14277 btree_open_out:

	14278 if( rc!=SQLITE_OK ){

	14279 if( pBt && pBt->pPager ){

	14280 sqlite3PagerClose(pBt->pPager);

	14281 }

	14282 sqlite3_free(pBt);

	14283 sqlite3_free(p);

	14284 *ppBtree = 0;

	14285 }else{

	14286 /* If the B-Tree was successfully opened, set the pager-cache size to the

	14287 ** default value. Except, when opening on an existing shared pager-cache,

	14288 ** do not change the pager-cache size.

	14289 */

	14290 if( sqlite3BtreeSchema(p, 0, 0)==0 ){

	14291 sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);

	14292 }

	14293 }

	14294 if( mutexOpen ){

	14295 assert( sqlite3_mutex_held(mutexOpen) );

	14296 sqlite3_mutex_leave(mutexOpen);

	14297 }

	14298 return rc;

	14299 }

	14300

	14301 /*

	14302 ** Decrement the BtShared.nRef counter. When it reaches zero,

	14303 ** remove the BtShared structure from the sharing list. Return

	14304 ** true if the BtShared.nRef counter reaches zero and return

	14305 ** false if it is still positive.

	14306 */

	14307 static int removeFromSharingList(BtShared *pBt){

	14308 #ifndef SQLITE_OMIT_SHARED_CACHE

	14309 MUTEX_LOGIC( sqlite3_mutex *pMaster; )

	14310 BtShared *pList;

	14311 int removed = 0;

	14312

	14313 assert( sqlite3_mutex_notheld(pBt->mutex) );

	14314 MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )

	14315 sqlite3_mutex_enter(pMaster);

	14316 pBt->nRef--;

	14317 if( pBt->nRef<=0 ){

	14318 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){

	14319 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;

	14320 }else{

	14321 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);

	14322 while( ALWAYS(pList) && pList->pNext!=pBt ){

	14323 pList=pList->pNext;

	14324 }

	14325 if( ALWAYS(pList) ){

	14326 pList->pNext = pBt->pNext;

	14327 }

	14328 }

	14329 if( SQLITE_THREADSAFE ){

	14330 sqlite3_mutex_free(pBt->mutex);

	14331 }

	14332 removed = 1;

	14333 }

	14334 sqlite3_mutex_leave(pMaster);

	14335 return removed;

	14336 #else

	14337 return 1;

	14338 #endif

	14339 }

	14340

	14341 /*

	14342 ** Make sure pBt->pTmpSpace points to an allocation of

	14343 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child

	14344 ** pointer.

	14345 */

	14346 static void allocateTempSpace(BtShared *pBt){

	14347 if( !pBt->pTmpSpace ){

	14348 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );

	14349

	14350 /* One of the uses of pBt->pTmpSpace is to format cells before

	14351 ** inserting them into a leaf page (function fillInCell()). If

	14352 ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes

	14353 ** by the various routines that manipulate binary cells. Which

	14354 ** can mean that fillInCell() only initializes the first 2 or 3

	14355 ** bytes of pTmpSpace, but that the first 4 bytes are copied from

	14356 ** it into a database page. This is not actually a problem, but it

	14357 ** does cause a valgrind error when the 1 or 2 bytes of uninitialized

	14358 ** data is passed to system call write(). So to avoid this error,

	14359 ** zero the first 4 bytes of temp space here.

	14360 **

	14361 ** Also: Provide four bytes of initialized space before the

	14362 ** beginning of pTmpSpace as an area available to prepend the

	14363 ** left-child pointer to the beginning of a cell.

	14364 */

	14365 if( pBt->pTmpSpace ){

	14366 memset(pBt->pTmpSpace, 0, 8);

	14367 pBt->pTmpSpace += 4;

	14368 }

	14369 }

	14370 }

	14371

	14372 /*

	14373 ** Free the pBt->pTmpSpace allocation

	14374 */

	14375 static void freeTempSpace(BtShared *pBt){

	14376 if( pBt->pTmpSpace ){

	14377 pBt->pTmpSpace -= 4;

	14378 sqlite3PageFree(pBt->pTmpSpace);

	14379 pBt->pTmpSpace = 0;

	14380 }

	14381 }

	14382

	14383 /*

	14384 ** Close an open database and invalidate all cursors.

	14385 */

	14386 SQLITE_PRIVATE int sqlite3BtreeClose(Btree *p){

	14387 BtShared *pBt = p->pBt;

	14388 BtCursor *pCur;

	14389

	14390 /* Close all cursors opened via this handle. */

	14391 assert( sqlite3_mutex_held(p->db->mutex) );

	14392 sqlite3BtreeEnter(p);

	14393 pCur = pBt->pCursor;

	14394 while( pCur ){

	14395 BtCursor *pTmp = pCur;

	14396 pCur = pCur->pNext;

	14397 if( pTmp->pBtree==p ){

	14398 sqlite3BtreeCloseCursor(pTmp);

	14399 }

	14400 }

	14401

	14402 /* Rollback any active transaction and free the handle structure.

	14403 ** The call to sqlite3BtreeRollback() drops any table-locks held by

	14404 ** this handle.

	14405 */

	14406 sqlite3BtreeRollback(p, SQLITE_OK, 0);

	14407 sqlite3BtreeLeave(p);

	14408

	14409 /* If there are still other outstanding references to the shared-btree

	14410 ** structure, return now. The remainder of this procedure cleans

	14411 ** up the shared-btree.

	14412 */

	14413 assert( p->wantToLock==0 && p->locked==0 );

	14414 if( !p->sharable \|\| removeFromSharingList(pBt) ){

	14415 /* The pBt is no longer on the sharing list, so we can access

	14416 ** it without having to hold the mutex.

	14417 **

	14418 ** Clean out and delete the BtShared object.

	14419 */

	14420 assert( !pBt->pCursor );

	14421 sqlite3PagerClose(pBt->pPager);

	14422 if( pBt->xFreeSchema && pBt->pSchema ){

	14423 pBt->xFreeSchema(pBt->pSchema);

	14424 }

	14425 sqlite3DbFree(0, pBt->pSchema);

	14426 freeTempSpace(pBt);

	14427 sqlite3_free(pBt);

	14428 }

	14429

	14430 #ifndef SQLITE_OMIT_SHARED_CACHE

	14431 assert( p->wantToLock==0 );

	14432 assert( p->locked==0 );

	14433 if( p->pPrev ) p->pPrev->pNext = p->pNext;

	14434 if( p->pNext ) p->pNext->pPrev = p->pPrev;

	14435 #endif

	14436

	14437 sqlite3_free(p);

	14438 return SQLITE_OK;

	14439 }

	14440

	14441 /*

	14442 ** Change the "soft" limit on the number of pages in the cache.

	14443 ** Unused and unmodified pages will be recycled when the number of

	14444 ** pages in the cache exceeds this soft limit. But the size of the

	14445 ** cache is allowed to grow larger than this limit if it contains

	14446 ** dirty pages or pages still in active use.

	14447 */

	14448 SQLITE_PRIVATE int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){

	14449 BtShared *pBt = p->pBt;

	14450 assert( sqlite3_mutex_held(p->db->mutex) );

	14451 sqlite3BtreeEnter(p);

	14452 sqlite3PagerSetCachesize(pBt->pPager, mxPage);

	14453 sqlite3BtreeLeave(p);

	14454 return SQLITE_OK;

	14455 }

	14456

	14457 /*

	14458 ** Change the "spill" limit on the number of pages in the cache.

	14459 ** If the number of pages exceeds this limit during a write transaction,

	14460 ** the pager might attempt to "spill" pages to the journal early in

	14461 ** order to free up memory.

	14462 **

	14463 ** The value returned is the current spill size. If zero is passed

	14464 ** as an argument, no changes are made to the spill size setting, so

	14465 ** using mxPage of 0 is a way to query the current spill size.

	14466 */

	14467 SQLITE_PRIVATE int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){

	14468 BtShared *pBt = p->pBt;

	14469 int res;

	14470 assert( sqlite3_mutex_held(p->db->mutex) );

	14471 sqlite3BtreeEnter(p);

	14472 res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);

	14473 sqlite3BtreeLeave(p);

	14474 return res;

	14475 }

	14476

	#if SQLITE_MAX_MMAP_SIZE>0
	/*
	** Set the limit on how much of the database file may be memory mapped.
	**
	** szMmap is passed through to sqlite3PagerSetMmapLimit() while the
	** b-tree is entered.  The caller must hold p->db->mutex.  Always
	** returns SQLITE_OK.
	*/
	SQLITE_PRIVATE int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
	  assert( sqlite3_mutex_held(p->db->mutex) );
	  sqlite3BtreeEnter(p);
	  sqlite3PagerSetMmapLimit(p->pBt->pPager, szMmap);
	  sqlite3BtreeLeave(p);
	  return SQLITE_OK;
	}
	#endif /* SQLITE_MAX_MMAP_SIZE>0 */

	14491

	14492 /*

	14493 ** Change the way data is synced to disk in order to increase or decrease

	14494 ** how well the database resists damage due to OS crashes and power

	14495 ** failures. Level 1 is the same as asynchronous (no syncs() occur and

	14496 ** there is a high probability of damage) Level 2 is the default. There

	14497 ** is a very low but non-zero probability of damage. Level 3 reduces the

	14498 ** probability of damage to near zero but with a write performance reduction.

	14499 */

	14500 #ifndef SQLITE_OMIT_PAGER_PRAGMAS

	14501 SQLITE_PRIVATE int sqlite3BtreeSetPagerFlags(

	14502 Btree p, / The btree to set the safety level on */

	14503 unsigned pgFlags /* Various PAGER_* flags */

	14504 ){

	14505 BtShared *pBt = p->pBt;

	14506 assert( sqlite3_mutex_held(p->db->mutex) );

	14507 sqlite3BtreeEnter(p);

	14508 sqlite3PagerSetFlags(pBt->pPager, pgFlags);

	14509 sqlite3BtreeLeave(p);

	14510 return SQLITE_OK;

	14511 }

	14512 #endif

	14513

	14514 /*

	14515 ** Return TRUE if the given btree is set to safety level 1. In other

	14516 ** words, return TRUE if no sync() occurs on the disk files.

	14517 */

	14518 SQLITE_PRIVATE int sqlite3BtreeSyncDisabled(Btree *p){

	14519 BtShared *pBt = p->pBt;

	14520 int rc;

	14521 assert( sqlite3_mutex_held(p->db->mutex) );

	14522 sqlite3BtreeEnter(p);

	14523 assert( pBt && pBt->pPager );

	14524 rc = sqlite3PagerNosync(pBt->pPager);

	14525 sqlite3BtreeLeave(p);

	14526 return rc;

	14527 }

	14528

	14529 /*

	14530 ** Change the default pages size and the number of reserved bytes per page.

	14531 ** Or, if the page size has already been fixed, return SQLITE_READONLY

	14532 ** without changing anything.

	14533 **

	14534 ** The page size must be a power of 2 between 512 and 65536. If the page

	14535 ** size supplied does not meet this constraint then the page size is not

	14536 ** changed.

	14537 **

	14538 ** Page sizes are constrained to be a power of two so that the region

	14539 ** of the database file used for locking (beginning at PENDING_BYTE,

	14540 ** the first byte past the 1GB boundary, 0x40000000) needs to occur

	14541 ** at the beginning of a page.

	14542 **

	14543 ** If parameter nReserve is less than zero, then the number of reserved

	14544 ** bytes per page is left unchanged.

	14545 **

	14546 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size

	14547 ** and autovacuum mode can no longer be changed.

	14548 */

	14549 SQLITE_PRIVATE int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){

	14550 int rc = SQLITE_OK;

	14551 BtShared *pBt = p->pBt;

	14552 assert( nReserve>=-1 && nReserve<=255 );

	14553 sqlite3BtreeEnter(p);

	14554 #if SQLITE_HAS_CODEC

	14555 if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;

	14556 #endif

	14557 if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){

	14558 sqlite3BtreeLeave(p);

	14559 return SQLITE_READONLY;

	14560 }

	14561 if( nReserve<0 ){

	14562 nReserve = pBt->pageSize - pBt->usableSize;

	14563 }

	14564 assert( nReserve>=0 && nReserve<=255 );

	14565 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&

	14566 ((pageSize-1)&pageSize)==0 ){

	14567 assert( (pageSize & 7)==0 );

	14568 assert( !pBt->pCursor );

	14569 pBt->pageSize = (u32)pageSize;

	14570 freeTempSpace(pBt);

	14571 }

	14572 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);

	14573 pBt->usableSize = pBt->pageSize - (u16)nReserve;

	14574 if( iFix ) pBt->btsFlags \|= BTS_PAGESIZE_FIXED;

	14575 sqlite3BtreeLeave(p);

	14576 return rc;

	14577 }

	14578

	14579 /*

	14580 ** Return the currently defined page size

	14581 */

	14582 SQLITE_PRIVATE int sqlite3BtreeGetPageSize(Btree *p){

	14583 return p->pBt->pageSize;

	14584 }

	14585

	14586 /*

	14587 ** This function is similar to sqlite3BtreeGetReserve(), except that it

	14588 ** may only be called if it is guaranteed that the b-tree mutex is already

	14589 ** held.

	14590 **

	14591 ** This is useful in one special case in the backup API code where it is

	14592 ** known that the shared b-tree mutex is held, but the mutex on the

	14593 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()

	14594 ** were to be called, it might collide with some other operation on the

	14595 ** database handle that owns *p, causing undefined behavior.

	14596 */

	14597 SQLITE_PRIVATE int sqlite3BtreeGetReserveNoMutex(Btree *p){

	14598 int n;

	14599 assert( sqlite3_mutex_held(p->pBt->mutex) );

	14600 n = p->pBt->pageSize - p->pBt->usableSize;

	14601 return n;

	14602 }

	14603

	14604 /*

	14605 ** Return the number of bytes of space at the end of every page that

	14606 ** are intentually left unused. This is the "reserved" space that is

	14607 ** sometimes used by extensions.

	14608 **

	14609 ** If SQLITE_HAS_MUTEX is defined then the number returned is the

	14610 ** greater of the current reserved space and the maximum requested

	14611 ** reserve space.

	14612 */

	14613 SQLITE_PRIVATE int sqlite3BtreeGetOptimalReserve(Btree *p){

	14614 int n;

	14615 sqlite3BtreeEnter(p);

	14616 n = sqlite3BtreeGetReserveNoMutex(p);

	14617 #ifdef SQLITE_HAS_CODEC

	14618 if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;

	14619 #endif

	14620 sqlite3BtreeLeave(p);

	14621 return n;

	14622 }

	14623

	14624

	14625 /*

	14626 ** Set the maximum page count for a database if mxPage is positive.

	14627 ** No changes are made if mxPage is 0 or negative.

	14628 ** Regardless of the value of mxPage, return the maximum page count.

	14629 */

	14630 SQLITE_PRIVATE int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){

	14631 int n;

	14632 sqlite3BtreeEnter(p);

	14633 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);

	14634 sqlite3BtreeLeave(p);

	14635 return n;

	14636 }

	14637

	14638 /*

	14639 ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1. If newFlag is -1,

	14640 ** then make no changes. Always return the value of the BTS_SECURE_DELETE

	14641 ** setting after the change.

	14642 */

	14643 SQLITE_PRIVATE int sqlite3BtreeSecureDelete(Btree *p, int newFlag){

	14644 int b;

	14645 if( p==0 ) return 0;

	14646 sqlite3BtreeEnter(p);

	14647 if( newFlag>=0 ){

	14648 p->pBt->btsFlags &= ~BTS_SECURE_DELETE;

	14649 if( newFlag ) p->pBt->btsFlags \|= BTS_SECURE_DELETE;

	14650 }

	14651 b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;

	14652 sqlite3BtreeLeave(p);

	14653 return b;

	14654 }

	14655

	14656 /*

	14657 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'

	14658 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it

	14659 ** is disabled. The default value for the auto-vacuum property is

	14660 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.

	14661 */

	14662 SQLITE_PRIVATE int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){

	14663 #ifdef SQLITE_OMIT_AUTOVACUUM

	14664 return SQLITE_READONLY;

	14665 #else

	14666 BtShared *pBt = p->pBt;

	14667 int rc = SQLITE_OK;

	14668 u8 av = (u8)autoVacuum;

	14669

	14670 sqlite3BtreeEnter(p);

	14671 if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){

	14672 rc = SQLITE_READONLY;

	14673 }else{

	14674 pBt->autoVacuum = av ?1:0;

	14675 pBt->incrVacuum = av==2 ?1:0;

	14676 }

	14677 sqlite3BtreeLeave(p);

	14678 return rc;

	14679 #endif

	14680 }

	14681

	14682 /*

	14683 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is

	14684 ** enabled 1 is returned. Otherwise 0.

	14685 */

	14686 SQLITE_PRIVATE int sqlite3BtreeGetAutoVacuum(Btree *p){

	14687 #ifdef SQLITE_OMIT_AUTOVACUUM

	14688 return BTREE_AUTOVACUUM_NONE;

	14689 #else

	14690 int rc;

	14691 sqlite3BtreeEnter(p);

	14692 rc = (

	14693 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:

	14694 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:

	14695 BTREE_AUTOVACUUM_INCR

	14696 );

	14697 sqlite3BtreeLeave(p);

	14698 return rc;

	14699 #endif

	14700 }

	14701

	14702

	14703 /*

	14704 ** Get a reference to pPage1 of the database file. This will

	14705 ** also acquire a readlock on that file.

	14706 **

	14707 ** SQLITE_OK is returned on success. If the file is not a

	14708 ** well-formed database file, then SQLITE_CORRUPT is returned.

	14709 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM

	14710 ** is returned if we run out of memory.

	14711 */

	14712 static int lockBtree(BtShared *pBt){

	14713 int rc; /* Result code from subfunctions */

	14714 MemPage pPage1; / Page 1 of the database file */

	14715 int nPage; /* Number of pages in the database */

	14716 int nPageFile = 0; /* Number of pages in the database file */

	14717 int nPageHeader; /* Number of pages in the database according to hdr */

	14718

	14719 assert( sqlite3_mutex_held(pBt->mutex) );

	14720 assert( pBt->pPage1==0 );

	14721 rc = sqlite3PagerSharedLock(pBt->pPager);

	14722 if( rc!=SQLITE_OK ) return rc;

	14723 rc = btreeGetPage(pBt, 1, &pPage1, 0);

	14724 if( rc!=SQLITE_OK ) return rc;

	14725

	14726 /* Do some checking to help insure the file we opened really is

	14727 ** a valid database file.

	14728 */

	14729 nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);

	14730 sqlite3PagerPagecount(pBt->pPager, &nPageFile);

	14731 if( nPage==0 \|\| memcmp(24+(u8)pPage1->aData, 92+(u8)pPage1->aData,4)!=0 ){

	14732 nPage = nPageFile;

	14733 }

	14734 if( nPage>0 ){

	14735 u32 pageSize;

	14736 u32 usableSize;

	14737 u8 *page1 = pPage1->aData;

	14738 rc = SQLITE_NOTADB;

	14739 /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins

	14740 ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d

	14741 ** 61 74 20 33 00. */

	14742 if( memcmp(page1, zMagicHeader, 16)!=0 ){

	14743 goto page1_init_failed;

	14744 }

	14745

	14746 #ifdef SQLITE_OMIT_WAL

	14747 if( page1[18]>1 ){

	14748 pBt->btsFlags \|= BTS_READ_ONLY;

	14749 }

	14750 if( page1[19]>1 ){

	14751 goto page1_init_failed;

	14752 }

	14753 #else

	14754 if( page1[18]>2 ){

	14755 pBt->btsFlags \|= BTS_READ_ONLY;

	14756 }

	14757 if( page1[19]>2 ){

	14758 goto page1_init_failed;

	14759 }

	14760

	14761 /* If the write version is set to 2, this database should be accessed

	14762 ** in WAL mode. If the log is not already open, open it now. Then

	14763 ** return SQLITE_OK and return without populating BtShared.pPage1.

	14764 ** The caller detects this and calls this function again. This is

	14765 ** required as the version of page 1 currently in the page1 buffer

	14766 ** may not be the latest version - there may be a newer one in the log

	14767 ** file.

	14768 */

	14769 if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){

	14770 int isOpen = 0;

	14771 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);

	14772 if( rc!=SQLITE_OK ){

	14773 goto page1_init_failed;

	14774 }else if( isOpen==0 ){

	14775 releasePage(pPage1);

	14776 return SQLITE_OK;

	14777 }

	14778 rc = SQLITE_NOTADB;

	14779 }

	14780 #endif

	14781

	14782 /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload

	14783 ** fractions and the leaf payload fraction values must be 64, 32, and 32.

	14784 **

	14785 ** The original design allowed these amounts to vary, but as of

	14786 ** version 3.6.0, we require them to be fixed.

	14787 */

	14788 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){

	14789 goto page1_init_failed;

	14790 }

	14791 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is

	14792 ** determined by the 2-byte integer located at an offset of 16 bytes from

	14793 ** the beginning of the database file. */

	14794 pageSize = (page1[16]<<8) \| (page1[17]<<16);

	14795 /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two

	14796 ** between 512 and 65536 inclusive. */

	14797 if( ((pageSize-1)&pageSize)!=0

	14798 \|\| pageSize>SQLITE_MAX_PAGE_SIZE

	14799 \|\| pageSize<=256

	14800 ){

	14801 goto page1_init_failed;

	14802 }

	14803 assert( (pageSize & 7)==0 );

	14804 /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte

	14805 ** integer at offset 20 is the number of bytes of space at the end of

	14806 ** each page to reserve for extensions.

	14807 **

	14808 ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is

	14809 ** determined by the one-byte unsigned integer found at an offset of 20

	14810 ** into the database file header. */

	14811 usableSize = pageSize - page1[20];

	14812 if( (u32)pageSize!=pBt->pageSize ){

	14813 /* After reading the first page of the database assuming a page size

	14814 ** of BtShared.pageSize, we have discovered that the page-size is

	14815 ** actually pageSize. Unlock the database, leave pBt->pPage1 at

	14816 ** zero and return SQLITE_OK. The caller will call this function

	14817 ** again with the correct page-size.

	14818 */

	14819 releasePage(pPage1);

	14820 pBt->usableSize = usableSize;

	14821 pBt->pageSize = pageSize;

	14822 freeTempSpace(pBt);

	14823 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,

	14824 pageSize-usableSize);

	14825 return rc;

	14826 }

	14827 if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){

	14828 rc = SQLITE_CORRUPT_BKPT;

	14829 goto page1_init_failed;

	14830 }

	14831 /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to

	14832 ** be less than 480. In other words, if the page size is 512, then the

	14833 ** reserved space size cannot exceed 32. */

	14834 if( usableSize<480 ){

	14835 goto page1_init_failed;

	14836 }

	14837 pBt->pageSize = pageSize;

	14838 pBt->usableSize = usableSize;

	14839 #ifndef SQLITE_OMIT_AUTOVACUUM

	14840 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);

	14841 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);

	14842 #endif

	14843 }

	14844

	14845 /* maxLocal is the maximum amount of payload to store locally for

	14846 ** a cell. Make sure it is small enough so that at least minFanout

	14847 ** cells can will fit on one page. We assume a 10-byte page header.

	14848 ** Besides the payload, the cell must store:

	14849 ** 2-byte pointer to the cell

	14850 ** 4-byte child pointer

	14851 ** 9-byte nKey value

	14852 ** 4-byte nData value

	14853 ** 4-byte overflow page pointer

	14854 ** So a cell consists of a 2-byte pointer, a header which is as much as

	14855 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow

	14856 ** page pointer.

	14857 */

	14858 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);

	14859 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);

	14860 pBt->maxLeaf = (u16)(pBt->usableSize - 35);

	14861 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);

	14862 if( pBt->maxLocal>127 ){

	14863 pBt->max1bytePayload = 127;

	14864 }else{

	14865 pBt->max1bytePayload = (u8)pBt->maxLocal;

	14866 }

	14867 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );

	14868 pBt->pPage1 = pPage1;

	14869 pBt->nPage = nPage;

	14870 return SQLITE_OK;

	14871

	14872 page1_init_failed:

	14873 releasePage(pPage1);

	14874 pBt->pPage1 = 0;

	14875 return rc;

	14876 }

	14877

	14878 #ifndef NDEBUG

	14879 /*

	14880 ** Return the number of cursors open on pBt. This is for use

	14881 ** in assert() expressions, so it is only compiled if NDEBUG is not

	14882 ** defined.

	14883 **

	14884 ** Only write cursors are counted if wrOnly is true. If wrOnly is

	14885 ** false then all cursors are counted.

	14886 **

	14887 ** For the purposes of this routine, a cursor is any cursor that

	14888 ** is capable of reading or writing to the database. Cursors that

	14889 ** have been tripped into the CURSOR_FAULT state are not counted.

	14890 */

	14891 static int countValidCursors(BtShared *pBt, int wrOnly){

	14892 BtCursor *pCur;

	14893 int r = 0;

	14894 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){

	14895 if( (wrOnly==0 \|\| (pCur->curFlags & BTCF_WriteFlag)!=0)

	14896 && pCur->eState!=CURSOR_FAULT ) r++;

	14897 }

	14898 return r;

	14899 }

	14900 #endif

	14901

	14902 /*

	14903 ** If there are no outstanding cursors and we are not in the middle

	14904 ** of a transaction but there is a read lock on the database, then

	14905 ** this routine unrefs the first page of the database file which

	14906 ** has the effect of releasing the read lock.

	14907 **

	14908 ** If there is a transaction in progress, this routine is a no-op.

	14909 */

	14910 static void unlockBtreeIfUnused(BtShared *pBt){

	14911 assert( sqlite3_mutex_held(pBt->mutex) );

	14912 assert( countValidCursors(pBt,0)==0 \|\| pBt->inTransaction>TRANS_NONE );

	14913 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){

	14914 MemPage *pPage1 = pBt->pPage1;

	14915 assert( pPage1->aData );

	14916 assert( sqlite3PagerRefcount(pBt->pPager)==1 );

	14917 pBt->pPage1 = 0;

	14918 releasePageNotNull(pPage1);

	14919 }

	14920 }

	14921

	14922 /*

	14923 ** If pBt points to an empty file then convert that empty file

	14924 ** into a new empty database by initializing the first page of

	14925 ** the database.

	14926 */

	14927 static int newDatabase(BtShared *pBt){

	14928 MemPage *pP1;

	14929 unsigned char *data;

	14930 int rc;

	14931

	14932 assert( sqlite3_mutex_held(pBt->mutex) );

	14933 if( pBt->nPage>0 ){

	14934 return SQLITE_OK;

	14935 }

	14936 pP1 = pBt->pPage1;

	14937 assert( pP1!=0 );

	14938 data = pP1->aData;

	14939 rc = sqlite3PagerWrite(pP1->pDbPage);

	14940 if( rc ) return rc;

	14941 memcpy(data, zMagicHeader, sizeof(zMagicHeader));

	14942 assert( sizeof(zMagicHeader)==16 );

	14943 data[16] = (u8)((pBt->pageSize>>8)&0xff);

	14944 data[17] = (u8)((pBt->pageSize>>16)&0xff);

	14945 data[18] = 1;

	14946 data[19] = 1;

	14947 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);

	14948 data[20] = (u8)(pBt->pageSize - pBt->usableSize);

	14949 data[21] = 64;

	14950 data[22] = 32;

	14951 data[23] = 32;

	14952 memset(&data[24], 0, 100-24);

	14953 zeroPage(pP1, PTF_INTKEY\|PTF_LEAF\|PTF_LEAFDATA );

	14954 pBt->btsFlags \|= BTS_PAGESIZE_FIXED;

	14955 #ifndef SQLITE_OMIT_AUTOVACUUM

	14956 assert( pBt->autoVacuum==1 \|\| pBt->autoVacuum==0 );

	14957 assert( pBt->incrVacuum==1 \|\| pBt->incrVacuum==0 );

	14958 put4byte(&data[36 + 4*4], pBt->autoVacuum);

	14959 put4byte(&data[36 + 7*4], pBt->incrVacuum);

	14960 #endif

	14961 pBt->nPage = 1;

	14962 data[31] = 1;

	14963 return SQLITE_OK;

	14964 }

	14965

	14966 /*

	14967 ** Initialize the first page of the database file (creating a database

	14968 ** consisting of a single page and no schema objects). Return SQLITE_OK

	14969 ** if successful, or an SQLite error code otherwise.

	14970 */

	14971 SQLITE_PRIVATE int sqlite3BtreeNewDb(Btree *p){

	14972 int rc;

	14973 sqlite3BtreeEnter(p);

	14974 p->pBt->nPage = 0;

	14975 rc = newDatabase(p->pBt);

	14976 sqlite3BtreeLeave(p);

	14977 return rc;

	14978 }

	14979

	14980 /*

	14981 ** Attempt to start a new transaction. A write-transaction

	14982 ** is started if the second argument is nonzero, otherwise a read-

	14983 ** transaction. If the second argument is 2 or more and exclusive

	14984 ** transaction is started, meaning that no other process is allowed

	14985 ** to access the database. A preexisting transaction may not be

	14986 ** upgraded to exclusive by calling this routine a second time - the

	14987 ** exclusivity flag only works for a new transaction.

	14988 **

	14989 ** A write-transaction must be started before attempting any

	14990 ** changes to the database. None of the following routines

	14991 ** will work unless a transaction is started first:

	14992 **

	14993 ** sqlite3BtreeCreateTable()

	14994 ** sqlite3BtreeCreateIndex()

	14995 ** sqlite3BtreeClearTable()

	14996 ** sqlite3BtreeDropTable()

	14997 ** sqlite3BtreeInsert()

	14998 ** sqlite3BtreeDelete()

	14999 ** sqlite3BtreeUpdateMeta()

	15000 **

	15001 ** If an initial attempt to acquire the lock fails because of lock contention

	15002 ** and the database was previously unlocked, then invoke the busy handler

	15003 ** if there is one. But if there was previously a read-lock, do not

	15004 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is

	15005 ** returned when there is already a read-lock in order to avoid a deadlock.

	15006 **

	15007 ** Suppose there are two processes A and B. A has a read lock and B has

	15008 ** a reserved lock. B tries to promote to exclusive but is blocked because

	15009 ** of A's read lock. A tries to promote to reserved but is blocked by B.

	15010 ** One or the other of the two processes must give way or there can be

	15011 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback

	15012 ** when A already has a read lock, we encourage A to give up and let B

	15013 ** proceed.

	15014 */

	15015 SQLITE_PRIVATE int sqlite3BtreeBeginTrans(Btree *p, int wrflag){

	15016 sqlite3 *pBlock = 0;

	15017 BtShared *pBt = p->pBt;

	15018 int rc = SQLITE_OK;

	15019

	15020 sqlite3BtreeEnter(p);

	15021 btreeIntegrity(p);

	15022

	15023 /* If the btree is already in a write-transaction, or it

	15024 ** is already in a read-transaction and a read-transaction

	15025 ** is requested, this is a no-op.

	15026 */

	15027 if( p->inTrans==TRANS_WRITE \|\| (p->inTrans==TRANS_READ && !wrflag) ){

	15028 goto trans_begun;

	15029 }

	15030 assert( pBt->inTransaction==TRANS_WRITE \|\| IfNotOmitAV(pBt->bDoTruncate)==0 );

	15031

	15032 /* Write transactions are not possible on a read-only database */

	15033 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){

	15034 rc = SQLITE_READONLY;

	15035 goto trans_begun;

	15036 }

	15037

	15038 #ifndef SQLITE_OMIT_SHARED_CACHE

	15039 /* If another database handle has already opened a write transaction

	15040 ** on this shared-btree structure and a second write transaction is

	15041 ** requested, return SQLITE_LOCKED.

	15042 */

	15043 if( (wrflag && pBt->inTransaction==TRANS_WRITE)

	15044 \|\| (pBt->btsFlags & BTS_PENDING)!=0

	15045 ){

	15046 pBlock = pBt->pWriter->db;

	15047 }else if( wrflag>1 ){

	15048 BtLock *pIter;

	15049 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){

	15050 if( pIter->pBtree!=p ){

	15051 pBlock = pIter->pBtree->db;

	15052 break;

	15053 }

	15054 }

	15055 }

	15056 if( pBlock ){

	15057 sqlite3ConnectionBlocked(p->db, pBlock);

	15058 rc = SQLITE_LOCKED_SHAREDCACHE;

	15059 goto trans_begun;

	15060 }

	15061 #endif

	15062

	15063 /* Any read-only or read-write transaction implies a read-lock on

	15064 ** page 1. So if some other shared-cache client already has a write-lock

	15065 ** on page 1, the transaction cannot be opened. */

	15066 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);

	15067 if( SQLITE_OK!=rc ) goto trans_begun;

	15068

	15069 pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;

	15070 if( pBt->nPage==0 ) pBt->btsFlags \|= BTS_INITIALLY_EMPTY;

	15071 do {

	15072 /* Call lockBtree() until either pBt->pPage1 is populated or

	15073 ** lockBtree() returns something other than SQLITE_OK. lockBtree()

	15074 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after

	15075 ** reading page 1 it discovers that the page-size of the database

	15076 ** file is not pBt->pageSize. In this case lockBtree() will update

	15077 ** pBt->pageSize to the page-size of the file on disk.

	15078 */

	15079 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

	15080

	15081 if( rc==SQLITE_OK && wrflag ){

	15082 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){

	15083 rc = SQLITE_READONLY;

	15084 }else{

	15085 rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));

	15086 if( rc==SQLITE_OK ){

	15087 rc = newDatabase(pBt);

	15088 }

	15089 }

	15090 }

	15091

	15092 if( rc!=SQLITE_OK ){

	15093 unlockBtreeIfUnused(pBt);

	15094 }

	15095 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&

	15096 btreeInvokeBusyHandler(pBt) );

	15097

	15098 if( rc==SQLITE_OK ){

	15099 if( p->inTrans==TRANS_NONE ){

	15100 pBt->nTransaction++;

	15101 #ifndef SQLITE_OMIT_SHARED_CACHE

	15102 if( p->sharable ){

	15103 assert( p->lock.pBtree==p && p->lock.iTable==1 );

	15104 p->lock.eLock = READ_LOCK;

	15105 p->lock.pNext = pBt->pLock;

	15106 pBt->pLock = &p->lock;

	15107 }

	15108 #endif

	15109 }

	15110 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);

	15111 if( p->inTrans>pBt->inTransaction ){

	15112 pBt->inTransaction = p->inTrans;

	15113 }

	15114 if( wrflag ){

	15115 MemPage *pPage1 = pBt->pPage1;

	15116 #ifndef SQLITE_OMIT_SHARED_CACHE

	15117 assert( !pBt->pWriter );

	15118 pBt->pWriter = p;

	15119 pBt->btsFlags &= ~BTS_EXCLUSIVE;

	15120 if( wrflag>1 ) pBt->btsFlags \|= BTS_EXCLUSIVE;

	15121 #endif

	15122

	15123 /* If the db-size header field is incorrect (as it may be if an old

	15124 ** client has been writing the database file), update it now. Doing

	15125 ** this sooner rather than later means the database size can safely

	15126 ** re-read the database size from page 1 if a savepoint or transaction

	15127 ** rollback occurs within the transaction.

	15128 */

	15129 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){

	15130 rc = sqlite3PagerWrite(pPage1->pDbPage);

	15131 if( rc==SQLITE_OK ){

	15132 put4byte(&pPage1->aData[28], pBt->nPage);

	15133 }

	15134 }

	15135 }

	15136 }

	15137

	15138

	15139 trans_begun:

	15140 if( rc==SQLITE_OK && wrflag ){

	15141 /* This call makes sure that the pager has the correct number of

	15142 ** open savepoints. If the second parameter is greater than 0 and

	15143 ** the sub-journal is not already open, then it will be opened here.

	15144 */

	15145 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);

	15146 }

	15147

	15148 btreeIntegrity(p);

	15149 sqlite3BtreeLeave(p);

	15150 return rc;

	15151 }

	15152

	15153 #ifndef SQLITE_OMIT_AUTOVACUUM

	15154

	15155 /*

	15156 ** Set the pointer-map entries for all children of page pPage. Also, if

	15157 ** pPage contains cells that point to overflow pages, set the pointer

	15158 ** map entries for the overflow pages as well.

	15159 */

	15160 static int setChildPtrmaps(MemPage *pPage){

	15161 int i; /* Counter variable */

	15162 int nCell; /* Number of cells in page pPage */

	15163 int rc; /* Return code */

	15164 BtShared *pBt = pPage->pBt;

	15165 u8 isInitOrig = pPage->isInit;

	15166 Pgno pgno = pPage->pgno;

	15167

	15168 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	15169 rc = btreeInitPage(pPage);

	15170 if( rc!=SQLITE_OK ){

	15171 goto set_child_ptrmaps_out;

	15172 }

	15173 nCell = pPage->nCell;

	15174

	15175 for(i=0; i<nCell; i++){

	15176 u8 *pCell = findCell(pPage, i);

	15177

	15178 ptrmapPutOvflPtr(pPage, pCell, &rc);

	15179

	15180 if( !pPage->leaf ){

	15181 Pgno childPgno = get4byte(pCell);

	15182 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);

	15183 }

	15184 }

	15185

	15186 if( !pPage->leaf ){

	15187 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	15188 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);

	15189 }

	15190

	15191 set_child_ptrmaps_out:

	15192 pPage->isInit = isInitOrig;

	15193 return rc;

	15194 }

	15195

	15196 /*

	15197 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so

	15198 ** that it points to iTo. Parameter eType describes the type of pointer to

	15199 ** be modified, as follows:

	15200 **

	15201 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child

	15202 ** page of pPage.

	15203 **

	15204 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow

	15205 ** page pointed to by one of the cells on pPage.

	15206 **

	15207 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next

	15208 ** overflow page in the list.

	15209 */

	15210 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){

	15211 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	15212 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	15213 if( eType==PTRMAP_OVERFLOW2 ){

	15214 /* The pointer is always the first 4 bytes of the page in this case. */

	15215 if( get4byte(pPage->aData)!=iFrom ){

	15216 return SQLITE_CORRUPT_BKPT;

	15217 }

	15218 put4byte(pPage->aData, iTo);

	15219 }else{

	15220 u8 isInitOrig = pPage->isInit;

	15221 int i;

	15222 int nCell;

	15223 int rc;

	15224

	15225 rc = btreeInitPage(pPage);

	15226 if( rc ) return rc;

	15227 nCell = pPage->nCell;

	15228

	15229 for(i=0; i<nCell; i++){

	15230 u8 *pCell = findCell(pPage, i);

	15231 if( eType==PTRMAP_OVERFLOW1 ){

	15232 CellInfo info;

	15233 pPage->xParseCell(pPage, pCell, &info);

	15234 if( info.nLocal<info.nPayload

	15235 && pCell+info.nSize-1<=pPage->aData+pPage->maskPage

	15236 && iFrom==get4byte(pCell+info.nSize-4)

	15237 ){

	15238 put4byte(pCell+info.nSize-4, iTo);

	15239 break;

	15240 }

	15241 }else{

	15242 if( get4byte(pCell)==iFrom ){

	15243 put4byte(pCell, iTo);

	15244 break;

	15245 }

	15246 }

	15247 }

	15248

	15249 if( i==nCell ){

	15250 if( eType!=PTRMAP_BTREE \|\|

	15251 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){

	15252 return SQLITE_CORRUPT_BKPT;

	15253 }

	15254 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);

	15255 }

	15256

	15257 pPage->isInit = isInitOrig;

	15258 }

	15259 return SQLITE_OK;

	15260 }

	15261

	15262

	15263 /*

	15264 ** Move the open database page pDbPage to location iFreePage in the

	15265 ** database. The pDbPage reference remains valid.

	15266 **

	15267 ** The isCommit flag indicates that there is no need to remember that

	15268 ** the journal needs to be sync()ed before database page pDbPage->pgno

	15269 ** can be written to. The caller has already promised not to write to that

	15270 ** page.

	15271 */

	15272 static int relocatePage(

	15273 BtShared pBt, / Btree */

	15274 MemPage pDbPage, / Open page to move */

	15275 u8 eType, /* Pointer map 'type' entry for pDbPage */

	15276 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */

	15277 Pgno iFreePage, /* The location to move pDbPage to */

	15278 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */

	15279 ){

	15280 MemPage pPtrPage; / The page that contains a pointer to pDbPage */

	15281 Pgno iDbPage = pDbPage->pgno;

	15282 Pager *pPager = pBt->pPager;

	15283 int rc;

	15284

	15285 assert( eType==PTRMAP_OVERFLOW2 \|\| eType==PTRMAP_OVERFLOW1 \|\|

	15286 eType==PTRMAP_BTREE \|\| eType==PTRMAP_ROOTPAGE );

	15287 assert( sqlite3_mutex_held(pBt->mutex) );

	15288 assert( pDbPage->pBt==pBt );

	15289

	15290 /* Move page iDbPage from its current location to page number iFreePage */

	15291 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",

	15292 iDbPage, iFreePage, iPtrPage, eType));

	15293 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);

	15294 if( rc!=SQLITE_OK ){

	15295 return rc;

	15296 }

	15297 pDbPage->pgno = iFreePage;

	15298

	15299 /* If pDbPage was a btree-page, then it may have child pages and/or cells

	15300 ** that point to overflow pages. The pointer map entries for all these

	15301 ** pages need to be changed.

	15302 **

	15303 ** If pDbPage is an overflow page, then the first 4 bytes may store a

	15304 ** pointer to a subsequent overflow page. If this is the case, then

	15305 ** the pointer map needs to be updated for the subsequent overflow page.

	15306 */

	15307 if( eType==PTRMAP_BTREE \|\| eType==PTRMAP_ROOTPAGE ){

	15308 rc = setChildPtrmaps(pDbPage);

	15309 if( rc!=SQLITE_OK ){

	15310 return rc;

	15311 }

	15312 }else{

	15313 Pgno nextOvfl = get4byte(pDbPage->aData);

	15314 if( nextOvfl!=0 ){

	15315 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);

	15316 if( rc!=SQLITE_OK ){

	15317 return rc;

	15318 }

	15319 }

	15320 }

	15321

	15322 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so

	15323 ** that it points at iFreePage. Also fix the pointer map entry for

	15324 ** iPtrPage.

	15325 */

	15326 if( eType!=PTRMAP_ROOTPAGE ){

	15327 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);

	15328 if( rc!=SQLITE_OK ){

	15329 return rc;

	15330 }

	15331 rc = sqlite3PagerWrite(pPtrPage->pDbPage);

	15332 if( rc!=SQLITE_OK ){

	15333 releasePage(pPtrPage);

	15334 return rc;

	15335 }

	15336 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);

	15337 releasePage(pPtrPage);

	15338 if( rc==SQLITE_OK ){

	15339 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);

	15340 }

	15341 }

	15342 return rc;

	15343 }

	15344

	15345 /* Forward declaration required by incrVacuumStep(). */

	15346 static int allocateBtreePage(BtShared , MemPage , Pgno , Pgno, u8);

	15347

	15348 /*

	15349 ** Perform a single step of an incremental-vacuum. If successful, return

	15350 ** SQLITE_OK. If there is no work to do (and therefore no point in

	15351 ** calling this function again), return SQLITE_DONE. Or, if an error

	15352 ** occurs, return some other error code.

	15353 **

	15354 ** More specifically, this function attempts to re-organize the database so

	15355 ** that the last page of the file currently in use is no longer in use.

	15356 **

	15357 ** Parameter nFin is the number of pages that this database would contain

	15358 ** were this function called until it returns SQLITE_DONE.

	15359 **

	15360 ** If the bCommit parameter is non-zero, this function assumes that the

	15361 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE

	15362 ** or an error. bCommit is passed true for an auto-vacuum-on-commit

	15363 ** operation, or false for an incremental vacuum.

	15364 */

	15365 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){

	15366 Pgno nFreeList; /* Number of pages still on the free-list */

	15367 int rc;

	15368

	15369 assert( sqlite3_mutex_held(pBt->mutex) );

	15370 assert( iLastPg>nFin );

	15371

	15372 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){

	15373 u8 eType;

	15374 Pgno iPtrPage;

	15375

	15376 nFreeList = get4byte(&pBt->pPage1->aData[36]);

	15377 if( nFreeList==0 ){

	15378 return SQLITE_DONE;

	15379 }

	15380

	15381 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);

	15382 if( rc!=SQLITE_OK ){

	15383 return rc;

	15384 }

	15385 if( eType==PTRMAP_ROOTPAGE ){

	15386 return SQLITE_CORRUPT_BKPT;

	15387 }

	15388

	15389 if( eType==PTRMAP_FREEPAGE ){

	15390 if( bCommit==0 ){

	15391 /* Remove the page from the files free-list. This is not required

	15392 ** if bCommit is non-zero. In that case, the free-list will be

	15393 ** truncated to zero after this function returns, so it doesn't

	15394 ** matter if it still contains some garbage entries.

	15395 */

	15396 Pgno iFreePg;

	15397 MemPage *pFreePg;

	15398 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);

	15399 if( rc!=SQLITE_OK ){

	15400 return rc;

	15401 }

	15402 assert( iFreePg==iLastPg );

	15403 releasePage(pFreePg);

	15404 }

	15405 } else {

	15406 Pgno iFreePg; /* Index of free page to move pLastPg to */

	15407 MemPage *pLastPg;

	15408 u8 eMode = BTALLOC_ANY; /* Mode parameter for allocateBtreePage() */

	15409 Pgno iNear = 0; /* nearby parameter for allocateBtreePage() */

	15410

	15411 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);

	15412 if( rc!=SQLITE_OK ){

	15413 return rc;

	15414 }

	15415

	15416 /* If bCommit is zero, this loop runs exactly once and page pLastPg

	15417 ** is swapped with the first free page pulled off the free list.

	15418 **

	15419 ** On the other hand, if bCommit is greater than zero, then keep

	15420 ** looping until a free-page located within the first nFin pages

	15421 ** of the file is found.

	15422 */

	15423 if( bCommit==0 ){

	15424 eMode = BTALLOC_LE;

	15425 iNear = nFin;

	15426 }

	15427 do {

	15428 MemPage *pFreePg;

	15429 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);

	15430 if( rc!=SQLITE_OK ){

	15431 releasePage(pLastPg);

	15432 return rc;

	15433 }

	15434 releasePage(pFreePg);

	15435 }while( bCommit && iFreePg>nFin );

	15436 assert( iFreePg<iLastPg );

	15437

	15438 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);

	15439 releasePage(pLastPg);

	15440 if( rc!=SQLITE_OK ){

	15441 return rc;

	15442 }

	15443 }

	15444 }

	15445

	15446 if( bCommit==0 ){

	15447 do {

	15448 iLastPg--;

	15449 }while( iLastPg==PENDING_BYTE_PAGE(pBt) \|\| PTRMAP_ISPAGE(pBt, iLastPg) );

	15450 pBt->bDoTruncate = 1;

	15451 pBt->nPage = iLastPg;

	15452 }

	15453 return SQLITE_OK;

	15454 }

	15455

	15456 /*

	15457 ** The database opened by the first argument is an auto-vacuum database

	15458 ** nOrig pages in size containing nFree free pages. Return the expected

	15459 ** size of the database in pages following an auto-vacuum operation.

	15460 */

	15461 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){

	15462 int nEntry; /* Number of entries on one ptrmap page */

	15463 Pgno nPtrmap; /* Number of PtrMap pages to be freed */

	15464 Pgno nFin; /* Return value */

	15465

	15466 nEntry = pBt->usableSize/5;

	15467 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;

	15468 nFin = nOrig - nFree - nPtrmap;

	15469 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){

	15470 nFin--;

	15471 }

	15472 while( PTRMAP_ISPAGE(pBt, nFin) \|\| nFin==PENDING_BYTE_PAGE(pBt) ){

	15473 nFin--;

	15474 }

	15475

	15476 return nFin;

	15477 }

	15478

	15479 /*

	15480 ** A write-transaction must be opened before calling this function.

	15481 ** It performs a single unit of work towards an incremental vacuum.

	15482 **

	15483 ** If the incremental vacuum is finished after this function has run,

	15484 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,

	15485 ** SQLITE_OK is returned. Otherwise an SQLite error code.

	15486 */

	15487 SQLITE_PRIVATE int sqlite3BtreeIncrVacuum(Btree *p){

	15488 int rc;

	15489 BtShared *pBt = p->pBt;

	15490

	15491 sqlite3BtreeEnter(p);

	15492 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );

	15493 if( !pBt->autoVacuum ){

	15494 rc = SQLITE_DONE;

	15495 }else{

	15496 Pgno nOrig = btreePagecount(pBt);

	15497 Pgno nFree = get4byte(&pBt->pPage1->aData[36]);

	15498 Pgno nFin = finalDbSize(pBt, nOrig, nFree);

	15499

	15500 if( nOrig<nFin ){

	15501 rc = SQLITE_CORRUPT_BKPT;

	15502 }else if( nFree>0 ){

	15503 rc = saveAllCursors(pBt, 0, 0);

	15504 if( rc==SQLITE_OK ){

	15505 invalidateAllOverflowCache(pBt);

	15506 rc = incrVacuumStep(pBt, nFin, nOrig, 0);

	15507 }

	15508 if( rc==SQLITE_OK ){

	15509 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	15510 put4byte(&pBt->pPage1->aData[28], pBt->nPage);

	15511 }

	15512 }else{

	15513 rc = SQLITE_DONE;

	15514 }

	15515 }

	15516 sqlite3BtreeLeave(p);

	15517 return rc;

	15518 }

	15519

	15520 /*

	15521 ** This routine is called prior to sqlite3PagerCommit when a transaction

	15522 ** is committed for an auto-vacuum database.

	15523 **

	15524 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages

	15525 ** the database file should be truncated to during the commit process.

	15526 ** i.e. the database has been reorganized so that only the first *pnTrunc

	15527 ** pages are in use.

	15528 */

	15529 static int autoVacuumCommit(BtShared *pBt){

	15530 int rc = SQLITE_OK;

	15531 Pager *pPager = pBt->pPager;

	15532 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )

	15533

	15534 assert( sqlite3_mutex_held(pBt->mutex) );

	15535 invalidateAllOverflowCache(pBt);

	15536 assert(pBt->autoVacuum);

	15537 if( !pBt->incrVacuum ){

	15538 Pgno nFin; /* Number of pages in database after autovacuuming */

	15539 Pgno nFree; /* Number of pages on the freelist initially */

	15540 Pgno iFree; /* The next page to be freed */

	15541 Pgno nOrig; /* Database size before freeing */

	15542

	15543 nOrig = btreePagecount(pBt);

	15544 if( PTRMAP_ISPAGE(pBt, nOrig) \|\| nOrig==PENDING_BYTE_PAGE(pBt) ){

	15545 /* It is not possible to create a database for which the final page

	15546 ** is either a pointer-map page or the pending-byte page. If one

	15547 ** is encountered, this indicates corruption.

	15548 */

	15549 return SQLITE_CORRUPT_BKPT;

	15550 }

	15551

	15552 nFree = get4byte(&pBt->pPage1->aData[36]);

	15553 nFin = finalDbSize(pBt, nOrig, nFree);

	15554 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;

	15555 if( nFin<nOrig ){

	15556 rc = saveAllCursors(pBt, 0, 0);

	15557 }

	15558 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){

	15559 rc = incrVacuumStep(pBt, nFin, iFree, 1);

	15560 }

	15561 if( (rc==SQLITE_DONE \|\| rc==SQLITE_OK) && nFree>0 ){

	15562 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	15563 put4byte(&pBt->pPage1->aData[32], 0);

	15564 put4byte(&pBt->pPage1->aData[36], 0);

	15565 put4byte(&pBt->pPage1->aData[28], nFin);

	15566 pBt->bDoTruncate = 1;

	15567 pBt->nPage = nFin;

	15568 }

	15569 if( rc!=SQLITE_OK ){

	15570 sqlite3PagerRollback(pPager);

	15571 }

	15572 }

	15573

	15574 assert( nRef>=sqlite3PagerRefcount(pPager) );

	15575 return rc;

	15576 }

	15577

	15578 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */

	15579 # define setChildPtrmaps(x) SQLITE_OK

	15580 #endif

	15581

	15582 /*

	15583 ** This routine does the first phase of a two-phase commit. This routine

	15584 ** causes a rollback journal to be created (if it does not already exist)

	15585 ** and populated with enough information so that if a power loss occurs

	15586 ** the database can be restored to its original state by playing back

	15587 ** the journal. Then the contents of the journal are flushed out to

	15588 ** the disk. After the journal is safely on oxide, the changes to the

	15589 ** database are written into the database file and flushed to oxide.

	15590 ** At the end of this call, the rollback journal still exists on the

	15591 ** disk and we are still holding all locks, so the transaction has not

	15592 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the

	15593 ** commit process.

	15594 **

	15595 ** This call is a no-op if no write-transaction is currently active on pBt.

	15596 **

	15597 ** Otherwise, sync the database file for the btree pBt. zMaster points to

	15598 ** the name of a master journal file that should be written into the

	15599 ** individual journal file, or is NULL, indicating no master journal file

	15600 ** (single database transaction).

	15601 **

	15602 ** When this is called, the master journal should already have been

	15603 ** created, populated with this journal pointer and synced to disk.

	15604 **

	15605 ** Once this is routine has returned, the only thing required to commit

	15606 ** the write-transaction for this database file is to delete the journal.

	15607 */

	15608 SQLITE_PRIVATE int sqlite3BtreeCommitPhaseOne(Btree p, const char zMaster){

	15609 int rc = SQLITE_OK;

	15610 if( p->inTrans==TRANS_WRITE ){

	15611 BtShared *pBt = p->pBt;

	15612 sqlite3BtreeEnter(p);

	15613 #ifndef SQLITE_OMIT_AUTOVACUUM

	15614 if( pBt->autoVacuum ){

	15615 rc = autoVacuumCommit(pBt);

	15616 if( rc!=SQLITE_OK ){

	15617 sqlite3BtreeLeave(p);

	15618 return rc;

	15619 }

	15620 }

	15621 if( pBt->bDoTruncate ){

	15622 sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);

	15623 }

	15624 #endif

	15625 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);

	15626 sqlite3BtreeLeave(p);

	15627 }

	15628 return rc;

	15629 }

	15630

	15631 /*

	15632 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()

	15633 ** at the conclusion of a transaction.

	15634 */

	15635 static void btreeEndTransaction(Btree *p){

	15636 BtShared *pBt = p->pBt;

	15637 sqlite3 *db = p->db;

	15638 assert( sqlite3BtreeHoldsMutex(p) );

	15639

	15640 #ifndef SQLITE_OMIT_AUTOVACUUM

	15641 pBt->bDoTruncate = 0;

	15642 #endif

	15643 if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){

	15644 /* If there are other active statements that belong to this database

	15645 ** handle, downgrade to a read-only transaction. The other statements

	15646 ** may still be reading from the database. */

	15647 downgradeAllSharedCacheTableLocks(p);

	15648 p->inTrans = TRANS_READ;

	15649 }else{

	15650 /* If the handle had any kind of transaction open, decrement the

	15651 ** transaction count of the shared btree. If the transaction count

	15652 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()

	15653 ** call below will unlock the pager. */

	15654 if( p->inTrans!=TRANS_NONE ){

	15655 clearAllSharedCacheTableLocks(p);

	15656 pBt->nTransaction--;

	15657 if( 0==pBt->nTransaction ){

	15658 pBt->inTransaction = TRANS_NONE;

	15659 }

	15660 }

	15661

	15662 /* Set the current transaction state to TRANS_NONE and unlock the

	15663 ** pager if this call closed the only read or write transaction. */

	15664 p->inTrans = TRANS_NONE;

	15665 unlockBtreeIfUnused(pBt);

	15666 }

	15667

	15668 btreeIntegrity(p);

	15669 }

	15670

	15671 /*

	15672 ** Commit the transaction currently in progress.

	15673 **

	15674 ** This routine implements the second phase of a 2-phase commit. The

	15675 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should

	15676 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne()

	15677 ** routine did all the work of writing information out to disk and flushing the

	15678 ** contents so that they are written onto the disk platter. All this

	15679 ** routine has to do is delete or truncate or zero the header in the

	15680 ** the rollback journal (which causes the transaction to commit) and

	15681 ** drop locks.

	15682 **

	15683 ** Normally, if an error occurs while the pager layer is attempting to

	15684 ** finalize the underlying journal file, this function returns an error and

	15685 ** the upper layer will attempt a rollback. However, if the second argument

	15686 ** is non-zero then this b-tree transaction is part of a multi-file

	15687 ** transaction. In this case, the transaction has already been committed

	15688 ** (by deleting a master journal file) and the caller will ignore this

	15689 ** functions return code. So, even if an error occurs in the pager layer,

	15690 ** reset the b-tree objects internal state to indicate that the write

	15691 ** transaction has been closed. This is quite safe, as the pager will have

	15692 ** transitioned to the error state.

	15693 **

	15694 ** This will release the write lock on the database file. If there

	15695 ** are no active cursors, it also releases the read lock.

	15696 */

	15697 SQLITE_PRIVATE int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){

	15698

	15699 if( p->inTrans==TRANS_NONE ) return SQLITE_OK;

	15700 sqlite3BtreeEnter(p);

	15701 btreeIntegrity(p);

	15702

	15703 /* If the handle has a write-transaction open, commit the shared-btrees

	15704 ** transaction and set the shared state to TRANS_READ.

	15705 */

	15706 if( p->inTrans==TRANS_WRITE ){

	15707 int rc;

	15708 BtShared *pBt = p->pBt;

	15709 assert( pBt->inTransaction==TRANS_WRITE );

	15710 assert( pBt->nTransaction>0 );

	15711 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);

	15712 if( rc!=SQLITE_OK && bCleanup==0 ){

	15713 sqlite3BtreeLeave(p);

	15714 return rc;

	15715 }

	15716 p->iDataVersion--; /* Compensate for pPager->iDataVersion++; */

	15717 pBt->inTransaction = TRANS_READ;

	15718 btreeClearHasContent(pBt);

	15719 }

	15720

	15721 btreeEndTransaction(p);

	15722 sqlite3BtreeLeave(p);

	15723 return SQLITE_OK;

	15724 }

	15725

	15726 /*

	15727 ** Do both phases of a commit.

	15728 */

	15729 SQLITE_PRIVATE int sqlite3BtreeCommit(Btree *p){

	15730 int rc;

	15731 sqlite3BtreeEnter(p);

	15732 rc = sqlite3BtreeCommitPhaseOne(p, 0);

	15733 if( rc==SQLITE_OK ){

	15734 rc = sqlite3BtreeCommitPhaseTwo(p, 0);

	15735 }

	15736 sqlite3BtreeLeave(p);

	15737 return rc;

	15738 }

	15739

	15740 /*

	15741 ** This routine sets the state to CURSOR_FAULT and the error

	15742 ** code to errCode for every cursor on any BtShared that pBtree

	15743 ** references. Or if the writeOnly flag is set to 1, then only

	15744 ** trip write cursors and leave read cursors unchanged.

	15745 **

	15746 ** Every cursor is a candidate to be tripped, including cursors

	15747 ** that belong to other database connections that happen to be

	15748 ** sharing the cache with pBtree.

	15749 **

	15750 ** This routine gets called when a rollback occurs. If the writeOnly

	15751 ** flag is true, then only write-cursors need be tripped - read-only

	15752 ** cursors save their current positions so that they may continue

	15753 ** following the rollback. Or, if writeOnly is false, all cursors are

	15754 ** tripped. In general, writeOnly is false if the transaction being

	15755 ** rolled back modified the database schema. In this case b-tree root

	15756 ** pages may be moved or deleted from the database altogether, making

	15757 ** it unsafe for read cursors to continue.

	15758 **

	15759 ** If the writeOnly flag is true and an error is encountered while

	15760 ** saving the current position of a read-only cursor, all cursors,

	15761 ** including all read-cursors are tripped.

	15762 **

	15763 ** SQLITE_OK is returned if successful, or if an error occurs while

	15764 ** saving a cursor position, an SQLite error code.

	15765 */

	15766 SQLITE_PRIVATE int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int wr iteOnly){

	15767 BtCursor *p;

	15768 int rc = SQLITE_OK;

	15769

	15770 assert( (writeOnly==0 \|\| writeOnly==1) && BTCF_WriteFlag==1 );

	15771 if( pBtree ){

	15772 sqlite3BtreeEnter(pBtree);

	15773 for(p=pBtree->pBt->pCursor; p; p=p->pNext){

	15774 int i;

	15775 if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){

	15776 if( p->eState==CURSOR_VALID \|\| p->eState==CURSOR_SKIPNEXT ){

	15777 rc = saveCursorPosition(p);

	15778 if( rc!=SQLITE_OK ){

	15779 (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);

	15780 break;

	15781 }

	15782 }

	15783 }else{

	15784 sqlite3BtreeClearCursor(p);

	15785 p->eState = CURSOR_FAULT;

	15786 p->skipNext = errCode;

	15787 }

	15788 for(i=0; i<=p->iPage; i++){

	15789 releasePage(p->apPage[i]);

	15790 p->apPage[i] = 0;

	15791 }

	15792 }

	15793 sqlite3BtreeLeave(pBtree);

	15794 }

	15795 return rc;

	15796 }

	15797

	15798 /*

	15799 ** Rollback the transaction in progress.

	15800 **

	15801 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).

	15802 ** Only write cursors are tripped if writeOnly is true but all cursors are

	15803 ** tripped if writeOnly is false. Any attempt to use

	15804 ** a tripped cursor will result in an error.

	15805 **

	15806 ** This will release the write lock on the database file. If there

	15807 ** are no active cursors, it also releases the read lock.

	15808 */

	15809 SQLITE_PRIVATE int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){

	15810 int rc;

	15811 BtShared *pBt = p->pBt;

	15812 MemPage *pPage1;

	15813

	15814 assert( writeOnly==1 \|\| writeOnly==0 );

	15815 assert( tripCode==SQLITE_ABORT_ROLLBACK \|\| tripCode==SQLITE_OK );

	15816 sqlite3BtreeEnter(p);

	15817 if( tripCode==SQLITE_OK ){

	15818 rc = tripCode = saveAllCursors(pBt, 0, 0);

	15819 if( rc ) writeOnly = 0;

	15820 }else{

	15821 rc = SQLITE_OK;

	15822 }

	15823 if( tripCode ){

	15824 int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);

	15825 assert( rc==SQLITE_OK \|\| (writeOnly==0 && rc2==SQLITE_OK) );

	15826 if( rc2!=SQLITE_OK ) rc = rc2;

	15827 }

	15828 btreeIntegrity(p);

	15829

	15830 if( p->inTrans==TRANS_WRITE ){

	15831 int rc2;

	15832

	15833 assert( TRANS_WRITE==pBt->inTransaction );

	15834 rc2 = sqlite3PagerRollback(pBt->pPager);

	15835 if( rc2!=SQLITE_OK ){

	15836 rc = rc2;

	15837 }

	15838

	15839 /* The rollback may have destroyed the pPage1->aData value. So

	15840 ** call btreeGetPage() on page 1 again to make

	15841 ** sure pPage1->aData is set correctly. */

	15842 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){

	15843 int nPage = get4byte(28+(u8*)pPage1->aData);

	15844 testcase( nPage==0 );

	15845 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);

	15846 testcase( pBt->nPage!=nPage );

	15847 pBt->nPage = nPage;

	15848 releasePage(pPage1);

	15849 }

	15850 assert( countValidCursors(pBt, 1)==0 );

	15851 pBt->inTransaction = TRANS_READ;

	15852 btreeClearHasContent(pBt);

	15853 }

	15854

	15855 btreeEndTransaction(p);

	15856 sqlite3BtreeLeave(p);

	15857 return rc;

	15858 }

	15859

	15860 /*

	15861 ** Start a statement subtransaction. The subtransaction can be rolled

	15862 ** back independently of the main transaction. You must start a transaction

	15863 ** before starting a subtransaction. The subtransaction is ended automatically

	15864 ** if the main transaction commits or rolls back.

	15865 **

	15866 ** Statement subtransactions are used around individual SQL statements

	15867 ** that are contained within a BEGIN...COMMIT block. If a constraint

	15868 ** error occurs within the statement, the effect of that one statement

	15869 ** can be rolled back without having to rollback the entire transaction.

	15870 **

	15871 ** A statement sub-transaction is implemented as an anonymous savepoint. The

	15872 ** value passed as the second parameter is the total number of savepoints,

	15873 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there

	15874 ** are no active savepoints and no other statement-transactions open,

	15875 ** iStatement is 1. This anonymous savepoint can be released or rolled back

	15876 ** using the sqlite3BtreeSavepoint() function.

	15877 */

	15878 SQLITE_PRIVATE int sqlite3BtreeBeginStmt(Btree *p, int iStatement){

	15879 int rc;

	15880 BtShared *pBt = p->pBt;

	15881 sqlite3BtreeEnter(p);

	15882 assert( p->inTrans==TRANS_WRITE );

	15883 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

	15884 assert( iStatement>0 );

	15885 assert( iStatement>p->db->nSavepoint );

	15886 assert( pBt->inTransaction==TRANS_WRITE );

	15887 /* At the pager level, a statement transaction is a savepoint with

	15888 ** an index greater than all savepoints created explicitly using

	15889 ** SQL statements. It is illegal to open, release or rollback any

	15890 ** such savepoints while the statement transaction savepoint is active.

	15891 */

	15892 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);

	15893 sqlite3BtreeLeave(p);

	15894 return rc;

	15895 }

	15896

	15897 /*

	15898 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK

	15899 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the

	15900 ** savepoint identified by parameter iSavepoint, depending on the value

	15901 ** of op.

	15902 **

	15903 ** Normally, iSavepoint is greater than or equal to zero. However, if op is

	15904 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the

	15905 ** contents of the entire transaction are rolled back. This is different

	15906 ** from a normal transaction rollback, as no locks are released and the

	15907 ** transaction remains open.

	15908 */

	15909 SQLITE_PRIVATE int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){

	15910 int rc = SQLITE_OK;

	15911 if( p && p->inTrans==TRANS_WRITE ){

	15912 BtShared *pBt = p->pBt;

	15913 assert( op==SAVEPOINT_RELEASE \|\| op==SAVEPOINT_ROLLBACK );

	15914 assert( iSavepoint>=0 \|\| (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );

	15915 sqlite3BtreeEnter(p);

	15916 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);

	15917 if( rc==SQLITE_OK ){

	15918 if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){

	15919 pBt->nPage = 0;

	15920 }

	15921 rc = newDatabase(pBt);

	15922 pBt->nPage = get4byte(28 + pBt->pPage1->aData);

	15923

	15924 /* The database size was written into the offset 28 of the header

	15925 ** when the transaction started, so we know that the value at offset

	15926 ** 28 is nonzero. */

	15927 assert( pBt->nPage>0 );

	15928 }

	15929 sqlite3BtreeLeave(p);

	15930 }

	15931 return rc;

	15932 }

	15933

	15934 /*

	15935 ** Create a new cursor for the BTree whose root is on the page

	15936 ** iTable. If a read-only cursor is requested, it is assumed that

	15937 ** the caller already has at least a read-only transaction open

	15938 ** on the database already. If a write-cursor is requested, then

	15939 ** the caller is assumed to have an open write transaction.

	15940 **

	15941 ** If wrFlag==0, then the cursor can only be used for reading.

	15942 ** If wrFlag==1, then the cursor can be used for reading or for

	15943 ** writing if other conditions for writing are also met. These

	15944 ** are the conditions that must be met in order for writing to

	15945 ** be allowed:

	15946 **

	15947 ** 1: The cursor must have been opened with wrFlag==1

	15948 **

	15949 ** 2: Other database connections that share the same pager cache

	15950 ** but which are not in the READ_UNCOMMITTED state may not have

	15951 ** cursors open with wrFlag==0 on the same table. Otherwise

	15952 ** the changes made by this write cursor would be visible to

	15953 ** the read cursors in the other database connection.

	15954 **

	15955 ** 3: The database must be writable (not on read-only media)

	15956 **

	15957 ** 4: There must be an active transaction.

	15958 **

	15959 ** No checking is done to make sure that page iTable really is the

	15960 ** root page of a b-tree. If it is not, then the cursor acquired

	15961 ** will not work correctly.

	15962 **

	15963 ** It is assumed that the sqlite3BtreeCursorZero() has been called

	15964 ** on pCur to initialize the memory space prior to invoking this routine.

	15965 */

	15966 static int btreeCursor(

	15967 Btree p, / The btree */

	15968 int iTable, /* Root page of table to open */

	15969 int wrFlag, /* 1 to write. 0 read-only */

	15970 struct KeyInfo pKeyInfo, / First arg to comparison function */

	15971 BtCursor pCur / Space for new cursor */

	15972 ){

	15973 BtShared pBt = p->pBt; / Shared b-tree handle */

	15974 BtCursor pX; / Looping over other all cursors */

	15975

	15976 assert( sqlite3BtreeHoldsMutex(p) );

	15977 assert( wrFlag==0

	15978 \|\| wrFlag==BTREE_WRCSR

	15979 \|\| wrFlag==(BTREE_WRCSR\|BTREE_FORDELETE)

	15980 );

	15981

	15982 /* The following assert statements verify that if this is a sharable

	15983 ** b-tree database, the connection is holding the required table locks,

	15984 ** and that no other connection has any open cursor that conflicts with

	15985 ** this lock. */

	15986 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) );

	15987 assert( wrFlag==0 \|\| !hasReadConflicts(p, iTable) );

	15988

	15989 /* Assert that the caller has opened the required transaction. */

	15990 assert( p->inTrans>TRANS_NONE );

	15991 assert( wrFlag==0 \|\| p->inTrans==TRANS_WRITE );

	15992 assert( pBt->pPage1 && pBt->pPage1->aData );

	15993 assert( wrFlag==0 \|\| (pBt->btsFlags & BTS_READ_ONLY)==0 );

	15994

	15995 if( wrFlag ){

	15996 allocateTempSpace(pBt);

	15997 if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM;

	15998 }

	15999 if( iTable==1 && btreePagecount(pBt)==0 ){

	16000 assert( wrFlag==0 );

	16001 iTable = 0;

	16002 }

	16003

	16004 /* Now that no other errors can occur, finish filling in the BtCursor

	16005 ** variables and link the cursor into the BtShared list. */

	16006 pCur->pgnoRoot = (Pgno)iTable;

	16007 pCur->iPage = -1;

	16008 pCur->pKeyInfo = pKeyInfo;

	16009 pCur->pBtree = p;

	16010 pCur->pBt = pBt;

	16011 pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;

	16012 pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;

	16013 /* If there are two or more cursors on the same btree, then all such

	16014 ** cursors must have the BTCF_Multiple flag set. */

	16015 for(pX=pBt->pCursor; pX; pX=pX->pNext){

	16016 if( pX->pgnoRoot==(Pgno)iTable ){

	16017 pX->curFlags \|= BTCF_Multiple;

	16018 pCur->curFlags \|= BTCF_Multiple;

	16019 }

	16020 }

	16021 pCur->pNext = pBt->pCursor;

	16022 pBt->pCursor = pCur;

	16023 pCur->eState = CURSOR_INVALID;

	16024 return SQLITE_OK;

	16025 }

	16026 SQLITE_PRIVATE int sqlite3BtreeCursor(

	16027 Btree p, / The btree */

	16028 int iTable, /* Root page of table to open */

	16029 int wrFlag, /* 1 to write. 0 read-only */

	16030 struct KeyInfo pKeyInfo, / First arg to xCompare() */

	16031 BtCursor pCur / Write new cursor here */

	16032 ){

	16033 int rc;

	16034 if( iTable<1 ){

	16035 rc = SQLITE_CORRUPT_BKPT;

	16036 }else{

	16037 sqlite3BtreeEnter(p);

	16038 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);

	16039 sqlite3BtreeLeave(p);

	16040 }

	16041 return rc;

	16042 }

	16043

	16044 /*

	16045 ** Return the size of a BtCursor object in bytes.

	16046 **

	16047 ** This interfaces is needed so that users of cursors can preallocate

	16048 ** sufficient storage to hold a cursor. The BtCursor object is opaque

	16049 ** to users so they cannot do the sizeof() themselves - they must call

	16050 ** this routine.

	16051 */

	16052 SQLITE_PRIVATE int sqlite3BtreeCursorSize(void){

	16053 return ROUND8(sizeof(BtCursor));

	16054 }

	16055

	16056 /*

	16057 ** Initialize memory that will be converted into a BtCursor object.

	16058 **

	16059 ** The simple approach here would be to memset() the entire object

	16060 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays

	16061 ** do not need to be zeroed and they are large, so we can save a lot

	16062 ** of run-time by skipping the initialization of those elements.

	16063 */

	16064 SQLITE_PRIVATE void sqlite3BtreeCursorZero(BtCursor *p){

	16065 memset(p, 0, offsetof(BtCursor, iPage));

	16066 }

	16067

	16068 /*

	16069 ** Close a cursor. The read lock on the database file is released

	16070 ** when the last cursor is closed.

	16071 */

	16072 SQLITE_PRIVATE int sqlite3BtreeCloseCursor(BtCursor *pCur){

	16073 Btree *pBtree = pCur->pBtree;

	16074 if( pBtree ){

	16075 int i;

	16076 BtShared *pBt = pCur->pBt;

	16077 sqlite3BtreeEnter(pBtree);

	16078 sqlite3BtreeClearCursor(pCur);

	16079 assert( pBt->pCursor!=0 );

	16080 if( pBt->pCursor==pCur ){

	16081 pBt->pCursor = pCur->pNext;

	16082 }else{

	16083 BtCursor *pPrev = pBt->pCursor;

	16084 do{

	16085 if( pPrev->pNext==pCur ){

	16086 pPrev->pNext = pCur->pNext;

	16087 break;

	16088 }

	16089 pPrev = pPrev->pNext;

	16090 }while( ALWAYS(pPrev) );

	16091 }

	16092 for(i=0; i<=pCur->iPage; i++){

	16093 releasePage(pCur->apPage[i]);

	16094 }

	16095 unlockBtreeIfUnused(pBt);

	16096 sqlite3_free(pCur->aOverflow);

	16097 /* sqlite3_free(pCur); */

	16098 sqlite3BtreeLeave(pBtree);

	16099 }

	16100 return SQLITE_OK;

	16101 }

	16102

	16103 /*

	16104 ** Make sure the BtCursor* given in the argument has a valid

	16105 ** BtCursor.info structure. If it is not already valid, call

	16106 ** btreeParseCell() to fill it in.

	16107 **

	16108 ** BtCursor.info is a cache of the information in the current cell.

	16109 ** Using this cache reduces the number of calls to btreeParseCell().

	16110 */

	16111 #ifndef NDEBUG

	16112 static void assertCellInfo(BtCursor *pCur){

	16113 CellInfo info;

	16114 int iPage = pCur->iPage;

	16115 memset(&info, 0, sizeof(info));

	16116 btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);

	16117 assert( CORRUPT_DB \|\| memcmp(&info, &pCur->info, sizeof(info))==0 );

	16118 }

	16119 #else

	16120 #define assertCellInfo(x)

	16121 #endif

	16122 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){

	16123 if( pCur->info.nSize==0 ){

	16124 int iPage = pCur->iPage;

	16125 pCur->curFlags \|= BTCF_ValidNKey;

	16126 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);

	16127 }else{

	16128 assertCellInfo(pCur);

	16129 }

	16130 }

	16131

	16132 #ifndef NDEBUG /* The next routine used only within assert() statements */

	16133 /*

	16134 ** Return true if the given BtCursor is valid. A valid cursor is one

	16135 ** that is currently pointing to a row in a (non-empty) table.

	16136 ** This is a verification routine is used only within assert() statements.

	16137 */

	16138 SQLITE_PRIVATE int sqlite3BtreeCursorIsValid(BtCursor *pCur){

	16139 return pCur && pCur->eState==CURSOR_VALID;

	16140 }

	16141 #endif /* NDEBUG */

	16142

	16143 /*

	16144 ** Set *pSize to the size of the buffer needed to hold the value of

	16145 ** the key for the current entry. If the cursor is not pointing

	16146 ** to a valid entry, *pSize is set to 0.

	16147 **

	16148 ** For a table with the INTKEY flag set, this routine returns the key

	16149 ** itself, not the number of bytes in the key.

	16150 **

	16151 ** The caller must position the cursor prior to invoking this routine.

	16152 **

	16153 ** This routine cannot fail. It always returns SQLITE_OK.

	16154 */

	16155 SQLITE_PRIVATE int sqlite3BtreeKeySize(BtCursor pCur, i64 pSize){

	16156 assert( cursorHoldsMutex(pCur) );

	16157 assert( pCur->eState==CURSOR_VALID );

	16158 getCellInfo(pCur);

	16159 *pSize = pCur->info.nKey;

	16160 return SQLITE_OK;

	16161 }

	16162

	16163 /*

	16164 ** Set *pSize to the number of bytes of data in the entry the

	16165 ** cursor currently points to.

	16166 **

	16167 ** The caller must guarantee that the cursor is pointing to a non-NULL

	16168 ** valid entry. In other words, the calling procedure must guarantee

	16169 ** that the cursor has Cursor.eState==CURSOR_VALID.

	16170 **

	16171 ** Failure is not possible. This function always returns SQLITE_OK.

	16172 ** It might just as well be a procedure (returning void) but we continue

	16173 ** to return an integer result code for historical reasons.

	16174 */

	16175 SQLITE_PRIVATE int sqlite3BtreeDataSize(BtCursor pCur, u32 pSize){

	16176 assert( cursorHoldsMutex(pCur) );

	16177 assert( pCur->eState==CURSOR_VALID );

	16178 assert( pCur->iPage>=0 );

	16179 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );

	16180 assert( pCur->apPage[pCur->iPage]->intKeyLeaf==1 );

	16181 getCellInfo(pCur);

	16182 *pSize = pCur->info.nPayload;

	16183 return SQLITE_OK;

	16184 }

	16185

	16186 /*

	16187 ** Given the page number of an overflow page in the database (parameter

	16188 ** ovfl), this function finds the page number of the next page in the

	16189 ** linked list of overflow pages. If possible, it uses the auto-vacuum

	16190 ** pointer-map data instead of reading the content of page ovfl to do so.

	16191 **

	16192 ** If an error occurs an SQLite error code is returned. Otherwise:

	16193 **

	16194 ** The page number of the next overflow page in the linked list is

	16195 ** written to *pPgnoNext. If page ovfl is the last page in its linked

	16196 ** list, *pPgnoNext is set to zero.

	16197 **

	16198 ** If ppPage is not NULL, and a reference to the MemPage object corresponding

	16199 ** to page number pOvfl was obtained, then *ppPage is set to point to that

	16200 ** reference. It is the responsibility of the caller to call releasePage()

	16201 ** on *ppPage to free the reference. In no reference was obtained (because

	16202 ** the pointer-map was used to obtain the value for *pPgnoNext), then

	16203 ** *ppPage is set to zero.

	16204 */

	16205 static int getOverflowPage(

	16206 BtShared pBt, / The database file */

	16207 Pgno ovfl, /* Current overflow page number */

	16208 MemPage *ppPage, / OUT: MemPage handle (may be NULL) */

	16209 Pgno pPgnoNext / OUT: Next overflow page number */

	16210 ){

	16211 Pgno next = 0;

	16212 MemPage *pPage = 0;

	16213 int rc = SQLITE_OK;

	16214

	16215 assert( sqlite3_mutex_held(pBt->mutex) );

	16216 assert(pPgnoNext);

	16217

	16218 #ifndef SQLITE_OMIT_AUTOVACUUM

	16219 /* Try to find the next page in the overflow list using the

	16220 ** autovacuum pointer-map pages. Guess that the next page in

	16221 ** the overflow list is page number (ovfl+1). If that guess turns

	16222 ** out to be wrong, fall back to loading the data of page

	16223 ** number ovfl to determine the next page number.

	16224 */

	16225 if( pBt->autoVacuum ){

	16226 Pgno pgno;

	16227 Pgno iGuess = ovfl+1;

	16228 u8 eType;

	16229

	16230 while( PTRMAP_ISPAGE(pBt, iGuess) \|\| iGuess==PENDING_BYTE_PAGE(pBt) ){

	16231 iGuess++;

	16232 }

	16233

	16234 if( iGuess<=btreePagecount(pBt) ){

	16235 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);

	16236 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){

	16237 next = iGuess;

	16238 rc = SQLITE_DONE;

	16239 }

	16240 }

	16241 }

	16242 #endif

	16243

	16244 assert( next==0 \|\| rc==SQLITE_DONE );

	16245 if( rc==SQLITE_OK ){

	16246 rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);

	16247 assert( rc==SQLITE_OK \|\| pPage==0 );

	16248 if( rc==SQLITE_OK ){

	16249 next = get4byte(pPage->aData);

	16250 }

	16251 }

	16252

	16253 *pPgnoNext = next;

	16254 if( ppPage ){

	16255 *ppPage = pPage;

	16256 }else{

	16257 releasePage(pPage);

	16258 }

	16259 return (rc==SQLITE_DONE ? SQLITE_OK : rc);

	16260 }

	16261

	16262 /*

	16263 ** Copy data from a buffer to a page, or from a page to a buffer.

	16264 **

	16265 ** pPayload is a pointer to data stored on database page pDbPage.

	16266 ** If argument eOp is false, then nByte bytes of data are copied

	16267 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,

	16268 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes

	16269 ** of data are copied from the buffer pBuf to pPayload.

	16270 **

	16271 ** SQLITE_OK is returned on success, otherwise an error code.

	16272 */

	16273 static int copyPayload(

	16274 void pPayload, / Pointer to page data */

	16275 void pBuf, / Pointer to buffer */

	16276 int nByte, /* Number of bytes to copy */

	16277 int eOp, /* 0 -> copy from page, 1 -> copy to page */

	16278 DbPage pDbPage / Page containing pPayload */

	16279 ){

	16280 if( eOp ){

	16281 /* Copy data from buffer to page (a write operation) */

	16282 int rc = sqlite3PagerWrite(pDbPage);

	16283 if( rc!=SQLITE_OK ){

	16284 return rc;

	16285 }

	16286 memcpy(pPayload, pBuf, nByte);

	16287 }else{

	16288 /* Copy data from page to buffer (a read operation) */

	16289 memcpy(pBuf, pPayload, nByte);

	16290 }

	16291 return SQLITE_OK;

	16292 }

	16293

	16294 /*

	16295 ** This function is used to read or overwrite payload information

	16296 ** for the entry that the pCur cursor is pointing to. The eOp

	16297 ** argument is interpreted as follows:

	16298 **

	16299 ** 0: The operation is a read. Populate the overflow cache.

	16300 ** 1: The operation is a write. Populate the overflow cache.

	16301 ** 2: The operation is a read. Do not populate the overflow cache.

	16302 **

	16303 ** A total of "amt" bytes are read or written beginning at "offset".

	16304 ** Data is read to or from the buffer pBuf.

	16305 **

	16306 ** The content being read or written might appear on the main page

	16307 ** or be scattered out on multiple overflow pages.

	16308 **

	16309 ** If the current cursor entry uses one or more overflow pages and the

	16310 ** eOp argument is not 2, this function may allocate space for and lazily

	16311 ** populates the overflow page-list cache array (BtCursor.aOverflow).

	16312 ** Subsequent calls use this cache to make seeking to the supplied offset

	16313 ** more efficient.

	16314 **

	16315 ** Once an overflow page-list cache has been allocated, it may be

	16316 ** invalidated if some other cursor writes to the same table, or if

	16317 ** the cursor is moved to a different row. Additionally, in auto-vacuum

	16318 ** mode, the following events may invalidate an overflow page-list cache.

	16319 **

	16320 ** * An incremental vacuum,

	16321 ** * A commit in auto_vacuum="full" mode,

	16322 ** * Creating a table (may require moving an overflow page).

	16323 */

	16324 static int accessPayload(

	16325 BtCursor pCur, / Cursor pointing to entry to read from */

	16326 u32 offset, /* Begin reading this far into payload */

	16327 u32 amt, /* Read this many bytes */

	16328 unsigned char pBuf, / Write the bytes into this buffer */

	16329 int eOp /* zero to read. non-zero to write. */

	16330 ){

	16331 unsigned char *aPayload;

	16332 int rc = SQLITE_OK;

	16333 int iIdx = 0;

	16334 MemPage pPage = pCur->apPage[pCur->iPage]; / Btree page of current entry */

	16335 BtShared pBt = pCur->pBt; / Btree this cursor belongs to */

	16336 #ifdef SQLITE_DIRECT_OVERFLOW_READ

	16337 unsigned char * const pBufStart = pBuf;

	16338 int bEnd; /* True if reading to end of data */

	16339 #endif

	16340

	16341 assert( pPage );

	16342 assert( pCur->eState==CURSOR_VALID );

	16343 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );

	16344 assert( cursorHoldsMutex(pCur) );

	16345 assert( eOp!=2 \|\| offset==0 ); /* Always start from beginning for eOp==2 */

	16346

	16347 getCellInfo(pCur);

	16348 aPayload = pCur->info.pPayload;

	16349 #ifdef SQLITE_DIRECT_OVERFLOW_READ

	16350 bEnd = offset+amt==pCur->info.nPayload;

	16351 #endif

	16352 assert( offset+amt <= pCur->info.nPayload );

	16353

	16354 if( &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] ){

	16355 /* Trying to read or write past the end of the data is an error */

	16356 return SQLITE_CORRUPT_BKPT;

	16357 }

	16358

	16359 /* Check if data must be read/written to/from the btree page itself. */

	16360 if( offset<pCur->info.nLocal ){

	16361 int a = amt;

	16362 if( a+offset>pCur->info.nLocal ){

	16363 a = pCur->info.nLocal - offset;

	16364 }

	16365 rc = copyPayload(&aPayload[offset], pBuf, a, (eOp & 0x01), pPage->pDbPage);

	16366 offset = 0;

	16367 pBuf += a;

	16368 amt -= a;

	16369 }else{

	16370 offset -= pCur->info.nLocal;

	16371 }

	16372

	16373

	16374 if( rc==SQLITE_OK && amt>0 ){

	16375 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */

	16376 Pgno nextPage;

	16377

	16378 nextPage = get4byte(&aPayload[pCur->info.nLocal]);

	16379

	16380 /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.

	16381 ** Except, do not allocate aOverflow[] for eOp==2.

	16382 **

	16383 ** The aOverflow[] array is sized at one entry for each overflow page

	16384 ** in the overflow chain. The page number of the first overflow page is

	16385 ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array

	16386 ** means "not yet known" (the cache is lazily populated).

	16387 */

	16388 if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){

	16389 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;

	16390 if( nOvfl>pCur->nOvflAlloc ){

	16391 Pgno aNew = (Pgno)sqlite3Realloc(

	16392 pCur->aOverflow, nOvfl2sizeof(Pgno)

	16393 );

	16394 if( aNew==0 ){

	16395 rc = SQLITE_NOMEM;

	16396 }else{

	16397 pCur->nOvflAlloc = nOvfl*2;

	16398 pCur->aOverflow = aNew;

	16399 }

	16400 }

	16401 if( rc==SQLITE_OK ){

	16402 memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));

	16403 pCur->curFlags \|= BTCF_ValidOvfl;

	16404 }

	16405 }

	16406

	16407 /* If the overflow page-list cache has been allocated and the

	16408 ** entry for the first required overflow page is valid, skip

	16409 ** directly to it.

	16410 */

	16411 if( (pCur->curFlags & BTCF_ValidOvfl)!=0

	16412 && pCur->aOverflow[offset/ovflSize]

	16413 ){

	16414 iIdx = (offset/ovflSize);

	16415 nextPage = pCur->aOverflow[iIdx];

	16416 offset = (offset%ovflSize);

	16417 }

	16418

	16419 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){

	16420

	16421 /* If required, populate the overflow page-list cache. */

	16422 if( (pCur->curFlags & BTCF_ValidOvfl)!=0 ){

	16423 assert( pCur->aOverflow[iIdx]==0

	16424 \|\| pCur->aOverflow[iIdx]==nextPage

	16425 \|\| CORRUPT_DB );

	16426 pCur->aOverflow[iIdx] = nextPage;

	16427 }

	16428

	16429 if( offset>=ovflSize ){

	16430 /* The only reason to read this page is to obtain the page

	16431 ** number for the next page in the overflow chain. The page

	16432 ** data is not required. So first try to lookup the overflow

	16433 ** page-list cache, if any, then fall back to the getOverflowPage()

	16434 ** function.

	16435 **

	16436 ** Note that the aOverflow[] array must be allocated because eOp!=2

	16437 ** here. If eOp==2, then offset==0 and this branch is never taken.

	16438 */

	16439 assert( eOp!=2 );

	16440 assert( pCur->curFlags & BTCF_ValidOvfl );

	16441 assert( pCur->pBtree->db==pBt->db );

	16442 if( pCur->aOverflow[iIdx+1] ){

	16443 nextPage = pCur->aOverflow[iIdx+1];

	16444 }else{

	16445 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);

	16446 }

	16447 offset -= ovflSize;

	16448 }else{

	16449 /* Need to read this page properly. It contains some of the

	16450 ** range of data that is being read (eOp==0) or written (eOp!=0).

	16451 */

	16452 #ifdef SQLITE_DIRECT_OVERFLOW_READ

	16453 sqlite3_file *fd;

	16454 #endif

	16455 int a = amt;

	16456 if( a + offset > ovflSize ){

	16457 a = ovflSize - offset;

	16458 }

	16459

	16460 #ifdef SQLITE_DIRECT_OVERFLOW_READ

	16461 /* If all the following are true:

	16462 **

	16463 ** 1) this is a read operation, and

	16464 ** 2) data is required from the start of this overflow page, and

	16465 ** 3) the database is file-backed, and

	16466 ** 4) there is no open write-transaction, and

	16467 ** 5) the database is not a WAL database,

	16468 ** 6) all data from the page is being read.

	16469 ** 7) at least 4 bytes have already been read into the output buffer

	16470 **

	16471 ** then data can be read directly from the database file into the

	16472 ** output buffer, bypassing the page-cache altogether. This speeds

	16473 ** up loading large records that span many overflow pages.

	16474 */

	16475 if( (eOp&0x01)==0 /* (1) */

	16476 && offset==0 /* (2) */

	16477 && (bEnd \|\| a==ovflSize) /* (6) */

	16478 && pBt->inTransaction==TRANS_READ /* (4) */

	16479 && (fd = sqlite3PagerFile(pBt->pPager))->pMethods /* (3) */

	16480 && pBt->pPage1->aData[19]==0x01 /* (5) */

	16481 && &pBuf[-4]>=pBufStart /* (7) */

	16482 ){

	16483 u8 aSave[4];

	16484 u8 *aWrite = &pBuf[-4];

	16485 assert( aWrite>=pBufStart ); /* hence (7) */

	16486 memcpy(aSave, aWrite, 4);

	16487 rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));

	16488 nextPage = get4byte(aWrite);

	16489 memcpy(aWrite, aSave, 4);

	16490 }else

	16491 #endif

	16492

	16493 {

	16494 DbPage *pDbPage;

	16495 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,

	16496 ((eOp&0x01)==0 ? PAGER_GET_READONLY : 0)

	16497 );

	16498 if( rc==SQLITE_OK ){

	16499 aPayload = sqlite3PagerGetData(pDbPage);

	16500 nextPage = get4byte(aPayload);

	16501 rc = copyPayload(&aPayload[offset+4], pBuf, a, (eOp&0x01), pDbPage);

	16502 sqlite3PagerUnref(pDbPage);

	16503 offset = 0;

	16504 }

	16505 }

	16506 amt -= a;

	16507 pBuf += a;

	16508 }

	16509 }

	16510 }

	16511

	16512 if( rc==SQLITE_OK && amt>0 ){

	16513 return SQLITE_CORRUPT_BKPT;

	16514 }

	16515 return rc;

	16516 }

	16517

	16518 /*

	16519 ** Read part of the key associated with cursor pCur. Exactly

	16520 ** "amt" bytes will be transferred into pBuf[]. The transfer

	16521 ** begins at "offset".

	16522 **

	16523 ** The caller must ensure that pCur is pointing to a valid row

	16524 ** in the table.

	16525 **

	16526 ** Return SQLITE_OK on success or an error code if anything goes

	16527 ** wrong. An error is returned if "offset+amt" is larger than

	16528 ** the available payload.

	16529 */

	16530 SQLITE_PRIVATE int sqlite3BtreeKey(BtCursor pCur, u32 offset, u32 amt, void pB uf){

	16531 assert( cursorHoldsMutex(pCur) );

	16532 assert( pCur->eState==CURSOR_VALID );

	16533 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );

	16534 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	16535 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);

	16536 }

	16537

	16538 /*

	16539 ** Read part of the data associated with cursor pCur. Exactly

	16540 ** "amt" bytes will be transfered into pBuf[]. The transfer

	16541 ** begins at "offset".

	16542 **

	16543 ** Return SQLITE_OK on success or an error code if anything goes

	16544 ** wrong. An error is returned if "offset+amt" is larger than

	16545 ** the available payload.

	16546 */

	16547 SQLITE_PRIVATE int sqlite3BtreeData(BtCursor pCur, u32 offset, u32 amt, void p Buf){

	16548 int rc;

	16549

	16550 #ifndef SQLITE_OMIT_INCRBLOB

	16551 if ( pCur->eState==CURSOR_INVALID ){

	16552 return SQLITE_ABORT;

	16553 }

	16554 #endif

	16555

	16556 assert( cursorHoldsMutex(pCur) );

	16557 rc = restoreCursorPosition(pCur);

	16558 if( rc==SQLITE_OK ){

	16559 assert( pCur->eState==CURSOR_VALID );

	16560 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );

	16561 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	16562 rc = accessPayload(pCur, offset, amt, pBuf, 0);

	16563 }

	16564 return rc;

	16565 }

	16566

	16567 /*

	16568 ** Return a pointer to payload information from the entry that the

	16569 ** pCur cursor is pointing to. The pointer is to the beginning of

	16570 ** the key if index btrees (pPage->intKey==0) and is the data for

	16571 ** table btrees (pPage->intKey==1). The number of bytes of available

	16572 ** key/data is written into pAmt. If pAmt==0, then the value

	16573 ** returned will not be a valid pointer.

	16574 **

	16575 ** This routine is an optimization. It is common for the entire key

	16576 ** and data to fit on the local page and for there to be no overflow

	16577 ** pages. When that is so, this routine can be used to access the

	16578 ** key and data without making a copy. If the key and/or data spills

	16579 ** onto overflow pages, then accessPayload() must be used to reassemble

	16580 ** the key/data and copy it into a preallocated buffer.

	16581 **

	16582 ** The pointer returned by this routine looks directly into the cached

	16583 ** page of the database. The data might change or move the next time

	16584 ** any btree routine is called.

	16585 */

	16586 static const void *fetchPayload(

	16587 BtCursor pCur, / Cursor pointing to entry to read from */

	16588 u32 pAmt / Write the number of available bytes here */

	16589 ){

	16590 u32 amt;

	16591 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);

	16592 assert( pCur->eState==CURSOR_VALID );

	16593 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

	16594 assert( cursorHoldsMutex(pCur) );

	16595 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	16596 assert( pCur->info.nSize>0 );

	16597 assert( pCur->info.pPayload>pCur->apPage[pCur->iPage]->aData \|\| CORRUPT_DB );

	16598 assert( pCur->info.pPayload<pCur->apPage[pCur->iPage]->aDataEnd \|\|CORRUPT_DB);

	16599 amt = (int)(pCur->apPage[pCur->iPage]->aDataEnd - pCur->info.pPayload);

	16600 if( pCur->info.nLocal<amt ) amt = pCur->info.nLocal;

	16601 *pAmt = amt;

	16602 return (void*)pCur->info.pPayload;

	16603 }

	16604

	16605

	16606 /*

	16607 ** For the entry that cursor pCur is point to, return as

	16608 ** many bytes of the key or data as are available on the local

	16609 ** b-tree page. Write the number of available bytes into *pAmt.

	16610 **

	16611 ** The pointer returned is ephemeral. The key/data may move

	16612 ** or be destroyed on the next call to any Btree routine,

	16613 ** including calls from other threads against the same cache.

	16614 ** Hence, a mutex on the BtShared should be held prior to calling

	16615 ** this routine.

	16616 **

	16617 ** These routines is used to get quick access to key and data

	16618 ** in the common case where no overflow pages are used.

	16619 */

	16620 SQLITE_PRIVATE const void sqlite3BtreeKeyFetch(BtCursor pCur, u32 *pAmt){

	16621 return fetchPayload(pCur, pAmt);

	16622 }

	16623 SQLITE_PRIVATE const void sqlite3BtreeDataFetch(BtCursor pCur, u32 *pAmt){

	16624 return fetchPayload(pCur, pAmt);

	16625 }

	16626

	16627

	16628 /*

	16629 ** Move the cursor down to a new child page. The newPgno argument is the

	16630 ** page number of the child page to move to.

	16631 **

	16632 ** This function returns SQLITE_CORRUPT if the page-header flags field of

	16633 ** the new child page does not match the flags field of the parent (i.e.

	16634 ** if an intkey page appears to be the parent of a non-intkey page, or

	16635 ** vice-versa).

	16636 */

	16637 static int moveToChild(BtCursor *pCur, u32 newPgno){

	16638 BtShared *pBt = pCur->pBt;

	16639

	16640 assert( cursorHoldsMutex(pCur) );

	16641 assert( pCur->eState==CURSOR_VALID );

	16642 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );

	16643 assert( pCur->iPage>=0 );

	16644 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){

	16645 return SQLITE_CORRUPT_BKPT;

	16646 }

	16647 pCur->info.nSize = 0;

	16648 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

	16649 pCur->iPage++;

	16650 pCur->aiIdx[pCur->iPage] = 0;

	16651 return getAndInitPage(pBt, newPgno, &pCur->apPage[pCur->iPage],

	16652 pCur, pCur->curPagerFlags);

	16653 }

	16654

	#if SQLITE_DEBUG
	/*
	** Page pParent is an internal (non-leaf) tree page.  This function
	** asserts that page number iChild is the left-child of the iIdx'th
	** cell in page pParent.  Or, if iIdx is equal to the total number of
	** cells in pParent, that page number iChild is the right-child of
	** the page.
	**
	** When SQLITE_DEBUG is not enabled this is a no-op macro.
	*/
	static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
	  if( CORRUPT_DB ) return;  /* The conditions tested below might not be true
	                            ** in a corrupt database */
	  assert( iIdx<=pParent->nCell );
	  if( iIdx==pParent->nCell ){
	    /* Right-child pointer: 4 bytes at offset 8 of the page header. */
	    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
	  }else{
	    assert( get4byte(findCell(pParent, iIdx))==iChild );
	  }
	}
	#else
	#  define assertParentIndex(x,y,z)
	#endif

	16676

	16677 /*

	16678 ** Move the cursor up to the parent page.

	16679 **

	16680 ** pCur->idx is set to the cell index that contains the pointer

	16681 ** to the page we are coming from. If we are coming from the

	16682 ** right-most child page then pCur->idx is set to one more than

	16683 ** the largest cell index.

	16684 */

	16685 static void moveToParent(BtCursor *pCur){

	16686 assert( cursorHoldsMutex(pCur) );

	16687 assert( pCur->eState==CURSOR_VALID );

	16688 assert( pCur->iPage>0 );

	16689 assert( pCur->apPage[pCur->iPage] );

	16690 assertParentIndex(

	16691 pCur->apPage[pCur->iPage-1],

	16692 pCur->aiIdx[pCur->iPage-1],

	16693 pCur->apPage[pCur->iPage]->pgno

	16694 );

	16695 testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );

	16696 pCur->info.nSize = 0;

	16697 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

	16698 releasePageNotNull(pCur->apPage[pCur->iPage--]);

	16699 }

	16700

	16701 /*

	16702 ** Move the cursor to point to the root page of its b-tree structure.

	16703 **

	16704 ** If the table has a virtual root page, then the cursor is moved to point

	16705 ** to the virtual root page instead of the actual root page. A table has a

	16706 ** virtual root page when the actual root page contains no cells and a

	16707 ** single child page. This can only happen with the table rooted at page 1.

	16708 **

	16709 ** If the b-tree structure is empty, the cursor state is set to

	16710 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first

	16711 ** cell located on the root (or virtual root) page and the cursor state

	16712 ** is set to CURSOR_VALID.

	16713 **

	16714 ** If this function returns successfully, it may be assumed that the

	16715 ** page-header flags indicate that the [virtual] root-page is the expected

	16716 ** kind of b-tree page (i.e. if when opening the cursor the caller did not

	16717 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,

	16718 ** indicating a table b-tree, or if the caller did specify a KeyInfo

	16719 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index

	16720 ** b-tree).

	16721 */

	16722 static int moveToRoot(BtCursor *pCur){

	16723 MemPage *pRoot;

	16724 int rc = SQLITE_OK;

	16725

	16726 assert( cursorHoldsMutex(pCur) );

	16727 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );

	16728 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );

	16729 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );

	16730 if( pCur->eState>=CURSOR_REQUIRESEEK ){

	16731 if( pCur->eState==CURSOR_FAULT ){

	16732 assert( pCur->skipNext!=SQLITE_OK );

	16733 return pCur->skipNext;

	16734 }

	16735 sqlite3BtreeClearCursor(pCur);

	16736 }

	16737

	16738 if( pCur->iPage>=0 ){

	16739 while( pCur->iPage ){

	16740 assert( pCur->apPage[pCur->iPage]!=0 );

	16741 releasePageNotNull(pCur->apPage[pCur->iPage--]);

	16742 }

	16743 }else if( pCur->pgnoRoot==0 ){

	16744 pCur->eState = CURSOR_INVALID;

	16745 return SQLITE_OK;

	16746 }else{

	16747 assert( pCur->iPage==(-1) );

	16748 rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->apPage[0],

	16749 0, pCur->curPagerFlags);

	16750 if( rc!=SQLITE_OK ){

	16751 pCur->eState = CURSOR_INVALID;

	16752 return rc;

	16753 }

	16754 pCur->iPage = 0;

	16755 pCur->curIntKey = pCur->apPage[0]->intKey;

	16756 }

	16757 pRoot = pCur->apPage[0];

	16758 assert( pRoot->pgno==pCur->pgnoRoot );

	16759

	16760 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor

	16761 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is

	16762 ** NULL, the caller expects a table b-tree. If this is not the case,

	16763 ** return an SQLITE_CORRUPT error.

	16764 **

	16765 ** Earlier versions of SQLite assumed that this test could not fail

	16766 ** if the root page was already loaded when this function was called (i.e.

	16767 ** if pCur->iPage>=0). But this is not so if the database is corrupted

	16768 ** in such a way that page pRoot is linked into a second b-tree table

	16769 ** (or the freelist). */

	16770 assert( pRoot->intKey==1 \|\| pRoot->intKey==0 );

	16771 if( pRoot->isInit==0 \|\| (pCur->pKeyInfo==0)!=pRoot->intKey ){

	16772 return SQLITE_CORRUPT_BKPT;

	16773 }

	16774

	16775 pCur->aiIdx[0] = 0;

	16776 pCur->info.nSize = 0;

	16777 pCur->curFlags &= ~(BTCF_AtLast\|BTCF_ValidNKey\|BTCF_ValidOvfl);

	16778

	16779 if( pRoot->nCell>0 ){

	16780 pCur->eState = CURSOR_VALID;

	16781 }else if( !pRoot->leaf ){

	16782 Pgno subpage;

	16783 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;

	16784 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);

	16785 pCur->eState = CURSOR_VALID;

	16786 rc = moveToChild(pCur, subpage);

	16787 }else{

	16788 pCur->eState = CURSOR_INVALID;

	16789 }

	16790 return rc;

	16791 }

	16792

	16793 /*

	16794 ** Move the cursor down to the left-most leaf entry beneath the

	16795 ** entry to which it is currently pointing.

	16796 **

	16797 ** The left-most leaf is the one with the smallest key - the first

	16798 ** in ascending order.

	16799 */

	16800 static int moveToLeftmost(BtCursor *pCur){

	16801 Pgno pgno;

	16802 int rc = SQLITE_OK;

	16803 MemPage *pPage;

	16804

	16805 assert( cursorHoldsMutex(pCur) );

	16806 assert( pCur->eState==CURSOR_VALID );

	16807 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){

	16808 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );

	16809 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));

	16810 rc = moveToChild(pCur, pgno);

	16811 }

	16812 return rc;

	16813 }

	16814

	16815 /*

	16816 ** Move the cursor down to the right-most leaf entry beneath the

	16817 ** page to which it is currently pointing. Notice the difference

	16818 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()

	16819 ** finds the left-most entry beneath the entry whereas moveToRightmost()

	16820 ** finds the right-most entry beneath the page.

	16821 **

	16822 ** The right-most entry is the one with the largest key - the last

	16823 ** key in ascending order.

	16824 */

	16825 static int moveToRightmost(BtCursor *pCur){

	16826 Pgno pgno;

	16827 int rc = SQLITE_OK;

	16828 MemPage *pPage = 0;

	16829

	16830 assert( cursorHoldsMutex(pCur) );

	16831 assert( pCur->eState==CURSOR_VALID );

	16832 while( !(pPage = pCur->apPage[pCur->iPage])->leaf ){

	16833 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	16834 pCur->aiIdx[pCur->iPage] = pPage->nCell;

	16835 rc = moveToChild(pCur, pgno);

	16836 if( rc ) return rc;

	16837 }

	16838 pCur->aiIdx[pCur->iPage] = pPage->nCell-1;

	16839 assert( pCur->info.nSize==0 );

	16840 assert( (pCur->curFlags & BTCF_ValidNKey)==0 );

	16841 return SQLITE_OK;

	16842 }

	16843

	16844 /* Move the cursor to the first entry in the table. Return SQLITE_OK

	16845 ** on success. Set *pRes to 0 if the cursor actually points to something

	16846 ** or set *pRes to 1 if the table is empty.

	16847 */

	16848 SQLITE_PRIVATE int sqlite3BtreeFirst(BtCursor pCur, int pRes){

	16849 int rc;

	16850

	16851 assert( cursorHoldsMutex(pCur) );

	16852 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

	16853 rc = moveToRoot(pCur);

	16854 if( rc==SQLITE_OK ){

	16855 if( pCur->eState==CURSOR_INVALID ){

	16856 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->nCell==0 );

	16857 *pRes = 1;

	16858 }else{

	16859 assert( pCur->apPage[pCur->iPage]->nCell>0 );

	16860 *pRes = 0;

	16861 rc = moveToLeftmost(pCur);

	16862 }

	16863 }

	16864 return rc;

	16865 }

	16866

	16867 /* Move the cursor to the last entry in the table. Return SQLITE_OK

	16868 ** on success. Set *pRes to 0 if the cursor actually points to something

	16869 ** or set *pRes to 1 if the table is empty.

	16870 */

	16871 SQLITE_PRIVATE int sqlite3BtreeLast(BtCursor pCur, int pRes){

	16872 int rc;

	16873

	16874 assert( cursorHoldsMutex(pCur) );

	16875 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

	16876

	16877 /* If the cursor already points to the last entry, this is a no-op. */

	16878 if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){

	16879 #ifdef SQLITE_DEBUG

	16880 /* This block serves to assert() that the cursor really does point

	16881 ** to the last entry in the b-tree. */

	16882 int ii;

	16883 for(ii=0; ii<pCur->iPage; ii++){

	16884 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );

	16885 }

	16886 assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );

	16887 assert( pCur->apPage[pCur->iPage]->leaf );

	16888 #endif

	16889 return SQLITE_OK;

	16890 }

	16891

	16892 rc = moveToRoot(pCur);

	16893 if( rc==SQLITE_OK ){

	16894 if( CURSOR_INVALID==pCur->eState ){

	16895 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->nCell==0 );

	16896 *pRes = 1;

	16897 }else{

	16898 assert( pCur->eState==CURSOR_VALID );

	16899 *pRes = 0;

	16900 rc = moveToRightmost(pCur);

	16901 if( rc==SQLITE_OK ){

	16902 pCur->curFlags \|= BTCF_AtLast;

	16903 }else{

	16904 pCur->curFlags &= ~BTCF_AtLast;

	16905 }

	16906

	16907 }

	16908 }

	16909 return rc;

	16910 }

	16911

	16912 /* Move the cursor so that it points to an entry near the key

	16913 ** specified by pIdxKey or intKey. Return a success code.

	16914 **

	16915 ** For INTKEY tables, the intKey parameter is used. pIdxKey

	16916 ** must be NULL. For index tables, pIdxKey is used and intKey

	16917 ** is ignored.

	16918 **

	16919 ** If an exact match is not found, then the cursor is always

	16920 ** left pointing at a leaf page which would hold the entry if it

	16921 ** were present. The cursor might point to an entry that comes

	16922 ** before or after the key.

	16923 **

	16924 ** An integer is written into *pRes which is the result of

	16925 ** comparing the key with the entry to which the cursor is

	16926 ** pointing. The meaning of the integer written into

	16927 ** *pRes is as follows:

	16928 **

	16929 ** *pRes<0 The cursor is left pointing at an entry that

	16930 ** is smaller than intKey/pIdxKey or if the table is empty

	16931 ** and the cursor is therefore left point to nothing.

	16932 **

	16933 ** *pRes==0 The cursor is left pointing at an entry that

	16934 ** exactly matches intKey/pIdxKey.

	16935 **

	16936 ** *pRes>0 The cursor is left pointing at an entry that

	16937 ** is larger than intKey/pIdxKey.

	16938 **

	16939 ** For index tables, the pIdxKey->eqSeen field is set to 1 if there

	16940 ** exists an entry in the table that exactly matches pIdxKey.

	16941 */

	16942 SQLITE_PRIVATE int sqlite3BtreeMovetoUnpacked(

	16943 BtCursor pCur, / The cursor to be moved */

	16944 UnpackedRecord pIdxKey, / Unpacked index key */

	16945 i64 intKey, /* The table key */

	16946 int biasRight, /* If true, bias the search to the high end */

	16947 int pRes / Write search results here */

	16948 ){

	16949 int rc;

	16950 RecordCompare xRecordCompare;

	16951

	16952 assert( cursorHoldsMutex(pCur) );

	16953 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

	16954 assert( pRes );

	16955 assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );

	16956

	16957 /* If the cursor is already positioned at the point we are trying

	16958 ** to move to, then just return without doing any work */

	16959 if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0

	16960 && pCur->curIntKey

	16961 ){

	16962 if( pCur->info.nKey==intKey ){

	16963 *pRes = 0;

	16964 return SQLITE_OK;

	16965 }

	16966 if( (pCur->curFlags & BTCF_AtLast)!=0 && pCur->info.nKey<intKey ){

	16967 *pRes = -1;

	16968 return SQLITE_OK;

	16969 }

	16970 }

	16971

	16972 if( pIdxKey ){

	16973 xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);

	16974 pIdxKey->errCode = 0;

	16975 assert( pIdxKey->default_rc==1

	16976 \|\| pIdxKey->default_rc==0

	16977 \|\| pIdxKey->default_rc==-1

	16978 );

	16979 }else{

	16980 xRecordCompare = 0; /* All keys are integers */

	16981 }

	16982

	16983 rc = moveToRoot(pCur);

	16984 if( rc ){

	16985 return rc;

	16986 }

	16987 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage] );

	16988 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->isInit );

	16989 assert( pCur->eState==CURSOR_INVALID \|\| pCur->apPage[pCur->iPage]->nCell>0 );

	16990 if( pCur->eState==CURSOR_INVALID ){

	16991 *pRes = -1;

	16992 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->nCell==0 );

	16993 return SQLITE_OK;

	16994 }

	16995 assert( pCur->apPage[0]->intKey==pCur->curIntKey );

	16996 assert( pCur->curIntKey \|\| pIdxKey );

	16997 for(;;){

	16998 int lwr, upr, idx, c;

	16999 Pgno chldPg;

	17000 MemPage *pPage = pCur->apPage[pCur->iPage];

	17001 u8 pCell; / Pointer to current cell in pPage */

	17002

	17003 /* pPage->nCell must be greater than zero. If this is the root-page

	17004 ** the cursor would have been INVALID above and this for(;;) loop

	17005 ** not run. If this is not the root-page, then the moveToChild() routine

	17006 ** would have already detected db corruption. Similarly, pPage must

	17007 ** be the right kind (index or table) of b-tree page. Otherwise

	17008 ** a moveToChild() or moveToRoot() call would have detected corruption. */

	17009 assert( pPage->nCell>0 );

	17010 assert( pPage->intKey==(pIdxKey==0) );

	17011 lwr = 0;

	17012 upr = pPage->nCell-1;

	17013 assert( biasRight==0 \|\| biasRight==1 );

	17014 idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */

	17015 pCur->aiIdx[pCur->iPage] = (u16)idx;

	17016 if( xRecordCompare==0 ){

	17017 for(;;){

	17018 i64 nCellKey;

	17019 pCell = findCellPastPtr(pPage, idx);

	17020 if( pPage->intKeyLeaf ){

	17021 while( 0x80 <= *(pCell++) ){

	17022 if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;

	17023 }

	17024 }

	17025 getVarint(pCell, (u64*)&nCellKey);

	17026 if( nCellKey<intKey ){

	17027 lwr = idx+1;

	17028 if( lwr>upr ){ c = -1; break; }

	17029 }else if( nCellKey>intKey ){

	17030 upr = idx-1;

	17031 if( lwr>upr ){ c = +1; break; }

	17032 }else{

	17033 assert( nCellKey==intKey );

	17034 pCur->curFlags \|= BTCF_ValidNKey;

	17035 pCur->info.nKey = nCellKey;

	17036 pCur->aiIdx[pCur->iPage] = (u16)idx;

	17037 if( !pPage->leaf ){

	17038 lwr = idx;

	17039 goto moveto_next_layer;

	17040 }else{

	17041 *pRes = 0;

	17042 rc = SQLITE_OK;

	17043 goto moveto_finish;

	17044 }

	17045 }

	17046 assert( lwr+upr>=0 );

	17047 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */

	17048 }

	17049 }else{

	17050 for(;;){

	17051 int nCell; /* Size of the pCell cell in bytes */

	17052 pCell = findCellPastPtr(pPage, idx);

	17053

	17054 /* The maximum supported page-size is 65536 bytes. This means that

	17055 ** the maximum number of record bytes stored on an index B-Tree

	17056 ** page is less than 16384 bytes and may be stored as a 2-byte

	17057 ** varint. This information is used to attempt to avoid parsing

	17058 ** the entire cell by checking for the cases where the record is

	17059 ** stored entirely within the b-tree page by inspecting the first

	17060 ** 2 bytes of the cell.

	17061 */

	17062 nCell = pCell[0];

	17063 if( nCell<=pPage->max1bytePayload ){

	17064 /* This branch runs if the record-size field of the cell is a

	17065 ** single byte varint and the record fits entirely on the main

	17066 ** b-tree page. */

	17067 testcase( pCell+nCell+1==pPage->aDataEnd );

	17068 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);

	17069 }else if( !(pCell[1] & 0x80)

	17070 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal

	17071 ){

	17072 /* The record-size field is a 2 byte varint and the record

	17073 ** fits entirely on the main b-tree page. */

	17074 testcase( pCell+nCell+2==pPage->aDataEnd );

	17075 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);

	17076 }else{

	17077 /* The record flows over onto one or more overflow pages. In

	17078 ** this case the whole cell needs to be parsed, a buffer allocated

	17079 ** and accessPayload() used to retrieve the record into the

	17080 ** buffer before VdbeRecordCompare() can be called.

	17081 **

	17082 ** If the record is corrupt, the xRecordCompare routine may read

	17083 ** up to two varints past the end of the buffer. An extra 18

	17084 ** bytes of padding is allocated at the end of the buffer in

	17085 ** case this happens. */

	17086 void *pCellKey;

	17087 u8 * const pCellBody = pCell - pPage->childPtrSize;

	17088 pPage->xParseCell(pPage, pCellBody, &pCur->info);

	17089 nCell = (int)pCur->info.nKey;

	17090 testcase( nCell<0 ); /* True if key size is 2^32 or more */

	17091 testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */

	17092 testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */

	17093 testcase( nCell==2 ); /* Minimum legal index key size */

	17094 if( nCell<2 ){

	17095 rc = SQLITE_CORRUPT_BKPT;

	17096 goto moveto_finish;

	17097 }

	17098 pCellKey = sqlite3Malloc( nCell+18 );

	17099 if( pCellKey==0 ){

	17100 rc = SQLITE_NOMEM;

	17101 goto moveto_finish;

	17102 }

	17103 pCur->aiIdx[pCur->iPage] = (u16)idx;

	17104 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 2);

	17105 if( rc ){

	17106 sqlite3_free(pCellKey);

	17107 goto moveto_finish;

	17108 }

	17109 c = xRecordCompare(nCell, pCellKey, pIdxKey);

	17110 sqlite3_free(pCellKey);

	17111 }

	17112 assert(

	17113 (pIdxKey->errCode!=SQLITE_CORRUPT \|\| c==0)

	17114 && (pIdxKey->errCode!=SQLITE_NOMEM \|\| pCur->pBtree->db->mallocFailed)

	17115 );

	17116 if( c<0 ){

	17117 lwr = idx+1;

	17118 }else if( c>0 ){

	17119 upr = idx-1;

	17120 }else{

	17121 assert( c==0 );

	17122 *pRes = 0;

	17123 rc = SQLITE_OK;

	17124 pCur->aiIdx[pCur->iPage] = (u16)idx;

	17125 if( pIdxKey->errCode ) rc = SQLITE_CORRUPT;

	17126 goto moveto_finish;

	17127 }

	17128 if( lwr>upr ) break;

	17129 assert( lwr+upr>=0 );

	17130 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */

	17131 }

	17132 }

	17133 assert( lwr==upr+1 \|\| (pPage->intKey && !pPage->leaf) );

	17134 assert( pPage->isInit );

	17135 if( pPage->leaf ){

	17136 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	17137 pCur->aiIdx[pCur->iPage] = (u16)idx;

	17138 *pRes = c;

	17139 rc = SQLITE_OK;

	17140 goto moveto_finish;

	17141 }

	17142 moveto_next_layer:

	17143 if( lwr>=pPage->nCell ){

	17144 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	17145 }else{

	17146 chldPg = get4byte(findCell(pPage, lwr));

	17147 }

	17148 pCur->aiIdx[pCur->iPage] = (u16)lwr;

	17149 rc = moveToChild(pCur, chldPg);

	17150 if( rc ) break;

	17151 }

	17152 moveto_finish:

	17153 pCur->info.nSize = 0;

	17154 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

	17155 return rc;

	17156 }

	17157

	17158

	17159 /*

	17160 ** Return TRUE if the cursor is not pointing at an entry of the table.

	17161 **

	17162 ** TRUE will be returned after a call to sqlite3BtreeNext() moves

	17163 ** past the last entry in the table or sqlite3BtreePrev() moves past

	17164 ** the first entry. TRUE is also returned if the table is empty.

	17165 */

	17166 SQLITE_PRIVATE int sqlite3BtreeEof(BtCursor *pCur){

	17167 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries

	17168 ** have been deleted? This API will need to change to return an error code

	17169 ** as well as the boolean result value.

	17170 */

	17171 return (CURSOR_VALID!=pCur->eState);

	17172 }

	17173

	17174 /*

	17175 ** Advance the cursor to the next entry in the database. If

	17176 ** successful then set *pRes=0. If the cursor

	17177 ** was already pointing to the last entry in the database before

	17178 ** this routine was called, then set *pRes=1.

	17179 **

	17180 ** The main entry point is sqlite3BtreeNext(). That routine is optimized

	17181 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx

	17182 ** to the next cell on the current page. The (slower) btreeNext() helper

	17183 ** routine is called when it is necessary to move to a different page or

	17184 ** to restore the cursor.

	17185 **

	17186 ** The calling function will set pRes to 0 or 1. The initial pRes value

	17187 ** will be 1 if the cursor being stepped corresponds to an SQL index and

	17188 ** if this routine could have been skipped if that SQL index had been

	17189 ** a unique index. Otherwise the caller will have set *pRes to zero.

	17190 ** Zero is the common case. The btree implementation is free to use the

	17191 ** initial *pRes value as a hint to improve performance, but the current

	17192 ** SQLite btree implementation does not. (Note that the comdb2 btree

	17193 ** implementation does use this hint, however.)

	17194 */

	17195 static SQLITE_NOINLINE int btreeNext(BtCursor pCur, int pRes){

	17196 int rc;

	17197 int idx;

	17198 MemPage *pPage;

	17199

	17200 assert( cursorHoldsMutex(pCur) );

	17201 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

	17202 assert( *pRes==0 );

	17203 if( pCur->eState!=CURSOR_VALID ){

	17204 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );

	17205 rc = restoreCursorPosition(pCur);

	17206 if( rc!=SQLITE_OK ){

	17207 return rc;

	17208 }

	17209 if( CURSOR_INVALID==pCur->eState ){

	17210 *pRes = 1;

	17211 return SQLITE_OK;

	17212 }

	17213 if( pCur->skipNext ){

	17214 assert( pCur->eState==CURSOR_VALID \|\| pCur->eState==CURSOR_SKIPNEXT );

	17215 pCur->eState = CURSOR_VALID;

	17216 if( pCur->skipNext>0 ){

	17217 pCur->skipNext = 0;

	17218 return SQLITE_OK;

	17219 }

	17220 pCur->skipNext = 0;

	17221 }

	17222 }

	17223

	17224 pPage = pCur->apPage[pCur->iPage];

	17225 idx = ++pCur->aiIdx[pCur->iPage];

	17226 assert( pPage->isInit );

	17227

	17228 /* If the database file is corrupt, it is possible for the value of idx

	17229 ** to be invalid here. This can only occur if a second cursor modifies

	17230 ** the page while cursor pCur is holding a reference to it. Which can

	17231 ** only happen if the database is corrupt in such a way as to link the

	17232 ** page into more than one b-tree structure. */

	17233 testcase( idx>pPage->nCell );

	17234

	17235 if( idx>=pPage->nCell ){

	17236 if( !pPage->leaf ){

	17237 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));

	17238 if( rc ) return rc;

	17239 return moveToLeftmost(pCur);

	17240 }

	17241 do{

	17242 if( pCur->iPage==0 ){

	17243 *pRes = 1;

	17244 pCur->eState = CURSOR_INVALID;

	17245 return SQLITE_OK;

	17246 }

	17247 moveToParent(pCur);

	17248 pPage = pCur->apPage[pCur->iPage];

	17249 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );

	17250 if( pPage->intKey ){

	17251 return sqlite3BtreeNext(pCur, pRes);

	17252 }else{

	17253 return SQLITE_OK;

	17254 }

	17255 }

	17256 if( pPage->leaf ){

	17257 return SQLITE_OK;

	17258 }else{

	17259 return moveToLeftmost(pCur);

	17260 }

	17261 }

	17262 SQLITE_PRIVATE int sqlite3BtreeNext(BtCursor pCur, int pRes){

	17263 MemPage *pPage;

	17264 assert( cursorHoldsMutex(pCur) );

	17265 assert( pRes!=0 );

	17266 assert( pRes==0 \|\| pRes==1 );

	17267 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

	17268 pCur->info.nSize = 0;

	17269 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

	17270 *pRes = 0;

	17271 if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);

	17272 pPage = pCur->apPage[pCur->iPage];

	17273 if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){

	17274 pCur->aiIdx[pCur->iPage]--;

	17275 return btreeNext(pCur, pRes);

	17276 }

	17277 if( pPage->leaf ){

	17278 return SQLITE_OK;

	17279 }else{

	17280 return moveToLeftmost(pCur);

	17281 }

	17282 }

	17283

	17284 /*

	17285 ** Step the cursor to the back to the previous entry in the database. If

	17286 ** successful then set *pRes=0. If the cursor

	17287 ** was already pointing to the first entry in the database before

	17288 ** this routine was called, then set *pRes=1.

	17289 **

	17290 ** The main entry point is sqlite3BtreePrevious(). That routine is optimized

	17291 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx

	17292 ** to the previous cell on the current page. The (slower) btreePrevious()

	17293 ** helper routine is called when it is necessary to move to a different page

	17294 ** or to restore the cursor.

	17295 **

	17296 ** The calling function will set pRes to 0 or 1. The initial pRes value

	17297 ** will be 1 if the cursor being stepped corresponds to an SQL index and

	17298 ** if this routine could have been skipped if that SQL index had been

	17299 ** a unique index. Otherwise the caller will have set *pRes to zero.

	17300 ** Zero is the common case. The btree implementation is free to use the

	17301 ** initial *pRes value as a hint to improve performance, but the current

	17302 ** SQLite btree implementation does not. (Note that the comdb2 btree

	17303 ** implementation does use this hint, however.)

	17304 */

	17305 static SQLITE_NOINLINE int btreePrevious(BtCursor pCur, int pRes){

	17306 int rc;

	17307 MemPage *pPage;

	17308

	17309 assert( cursorHoldsMutex(pCur) );

	17310 assert( pRes!=0 );

	17311 assert( *pRes==0 );

	17312 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

	17313 assert( (pCur->curFlags & (BTCF_AtLast\|BTCF_ValidOvfl\|BTCF_ValidNKey))==0 );

	17314 assert( pCur->info.nSize==0 );

	17315 if( pCur->eState!=CURSOR_VALID ){

	17316 rc = restoreCursorPosition(pCur);

	17317 if( rc!=SQLITE_OK ){

	17318 return rc;

	17319 }

	17320 if( CURSOR_INVALID==pCur->eState ){

	17321 *pRes = 1;

	17322 return SQLITE_OK;

	17323 }

	17324 if( pCur->skipNext ){

	17325 assert( pCur->eState==CURSOR_VALID \|\| pCur->eState==CURSOR_SKIPNEXT );

	17326 pCur->eState = CURSOR_VALID;

	17327 if( pCur->skipNext<0 ){

	17328 pCur->skipNext = 0;

	17329 return SQLITE_OK;

	17330 }

	17331 pCur->skipNext = 0;

	17332 }

	17333 }

	17334

	17335 pPage = pCur->apPage[pCur->iPage];

	17336 assert( pPage->isInit );

	17337 if( !pPage->leaf ){

	17338 int idx = pCur->aiIdx[pCur->iPage];

	17339 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));

	17340 if( rc ) return rc;

	17341 rc = moveToRightmost(pCur);

	17342 }else{

	17343 while( pCur->aiIdx[pCur->iPage]==0 ){

	17344 if( pCur->iPage==0 ){

	17345 pCur->eState = CURSOR_INVALID;

	17346 *pRes = 1;

	17347 return SQLITE_OK;

	17348 }

	17349 moveToParent(pCur);

	17350 }

	17351 assert( pCur->info.nSize==0 );

	17352 assert( (pCur->curFlags & (BTCF_ValidNKey\|BTCF_ValidOvfl))==0 );

	17353

	17354 pCur->aiIdx[pCur->iPage]--;

	17355 pPage = pCur->apPage[pCur->iPage];

	17356 if( pPage->intKey && !pPage->leaf ){

	17357 rc = sqlite3BtreePrevious(pCur, pRes);

	17358 }else{

	17359 rc = SQLITE_OK;

	17360 }

	17361 }

	17362 return rc;

	17363 }

	17364 SQLITE_PRIVATE int sqlite3BtreePrevious(BtCursor pCur, int pRes){

	17365 assert( cursorHoldsMutex(pCur) );

	17366 assert( pRes!=0 );

	17367 assert( pRes==0 \|\| pRes==1 );

	17368 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

	17369 *pRes = 0;

	17370 pCur->curFlags &= ~(BTCF_AtLast\|BTCF_ValidOvfl\|BTCF_ValidNKey);

	17371 pCur->info.nSize = 0;

	17372 if( pCur->eState!=CURSOR_VALID

	17373 \|\| pCur->aiIdx[pCur->iPage]==0

	17374 \|\| pCur->apPage[pCur->iPage]->leaf==0

	17375 ){

	17376 return btreePrevious(pCur, pRes);

	17377 }

	17378 pCur->aiIdx[pCur->iPage]--;

	17379 return SQLITE_OK;

	17380 }

	17381

	17382 /*

	17383 ** Allocate a new page from the database file.

	17384 **

	17385 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()

	17386 ** has already been called on the new page.) The new page has also

	17387 ** been referenced and the calling routine is responsible for calling

	17388 ** sqlite3PagerUnref() on the new page when it is done.

	17389 **

	17390 ** SQLITE_OK is returned on success. Any other return value indicates

	17391 ** an error. *ppPage is set to NULL in the event of an error.

	17392 **

	17393 ** If the "nearby" parameter is not 0, then an effort is made to

	17394 ** locate a page close to the page number "nearby". This can be used in an

	17395 ** attempt to keep related pages close to each other in the database file,

	17396 ** which in turn can make database access faster.

	17397 **

	17398 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists

	17399 ** anywhere on the free-list, then it is guaranteed to be returned. If

	17400 ** eMode is BTALLOC_LE then the page returned will be less than or equal

	17401 ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there

	17402 ** are no restrictions on which page is returned.

	17403 */

	17404 static int allocateBtreePage(

	17405 BtShared pBt, / The btree */

	17406 MemPage *ppPage, / Store pointer to the allocated page here */

	17407 Pgno pPgno, / Store the page number here */

	17408 Pgno nearby, /* Search for a page near this one */

	17409 u8 eMode /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */

	17410 ){

	17411 MemPage *pPage1;

	17412 int rc;

	17413 u32 n; /* Number of pages on the freelist */

	17414 u32 k; /* Number of leaves on the trunk of the freelist */

	17415 MemPage *pTrunk = 0;

	17416 MemPage *pPrevTrunk = 0;

	17417 Pgno mxPage; /* Total size of the database file */

	17418

	17419 assert( sqlite3_mutex_held(pBt->mutex) );

	17420 assert( eMode==BTALLOC_ANY \|\| (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );

	17421 pPage1 = pBt->pPage1;

	17422 mxPage = btreePagecount(pBt);

	17423 /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36

	17424 ** stores stores the total number of pages on the freelist. */

	17425 n = get4byte(&pPage1->aData[36]);

	17426 testcase( n==mxPage-1 );

	17427 if( n>=mxPage ){

	17428 return SQLITE_CORRUPT_BKPT;

	17429 }

	17430 if( n>0 ){

	17431 /* There are pages on the freelist. Reuse one of those pages. */

	17432 Pgno iTrunk;

	17433 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */

	17434 u32 nSearch = 0; /* Count of the number of search attempts */

	17435

	17436 /* If eMode==BTALLOC_EXACT and a query of the pointer-map

	17437 ** shows that the page 'nearby' is somewhere on the free-list, then

	17438 ** the entire-list will be searched for that page.

	17439 */

	17440 #ifndef SQLITE_OMIT_AUTOVACUUM

	17441 if( eMode==BTALLOC_EXACT ){

	17442 if( nearby<=mxPage ){

	17443 u8 eType;

	17444 assert( nearby>0 );

	17445 assert( pBt->autoVacuum );

	17446 rc = ptrmapGet(pBt, nearby, &eType, 0);

	17447 if( rc ) return rc;

	17448 if( eType==PTRMAP_FREEPAGE ){

	17449 searchList = 1;

	17450 }

	17451 }

	17452 }else if( eMode==BTALLOC_LE ){

	17453 searchList = 1;

	17454 }

	17455 #endif

	17456

	17457 /* Decrement the free-list count by 1. Set iTrunk to the index of the

	17458 ** first free-list trunk page. iPrevTrunk is initially 1.

	17459 */

	17460 rc = sqlite3PagerWrite(pPage1->pDbPage);

	17461 if( rc ) return rc;

	17462 put4byte(&pPage1->aData[36], n-1);

	17463

	17464 /* The code within this loop is run only once if the 'searchList' variable

	17465 ** is not true. Otherwise, it runs once for each trunk-page on the

	17466 ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)

	17467 ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)

	17468 */

	17469 do {

	17470 pPrevTrunk = pTrunk;

	17471 if( pPrevTrunk ){

	17472 /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page

	17473 ** is the page number of the next freelist trunk page in the list or

	17474 ** zero if this is the last freelist trunk page. */

	17475 iTrunk = get4byte(&pPrevTrunk->aData[0]);

	17476 }else{

	17477 /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32

	17478 ** stores the page number of the first page of the freelist, or zero if

	17479 ** the freelist is empty. */

	17480 iTrunk = get4byte(&pPage1->aData[32]);

	17481 }

	17482 testcase( iTrunk==mxPage );

	17483 if( iTrunk>mxPage \|\| nSearch++ > n ){

	17484 rc = SQLITE_CORRUPT_BKPT;

	17485 }else{

	17486 rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);

	17487 }

	17488 if( rc ){

	17489 pTrunk = 0;

	17490 goto end_allocate_page;

	17491 }

	17492 assert( pTrunk!=0 );

	17493 assert( pTrunk->aData!=0 );

	17494 /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page

	17495 ** is the number of leaf page pointers to follow. */

	17496 k = get4byte(&pTrunk->aData[4]);

	17497 if( k==0 && !searchList ){

	17498 /* The trunk has no leaves and the list is not being searched.

	17499 ** So extract the trunk page itself and use it as the newly

	17500 ** allocated page */

	17501 assert( pPrevTrunk==0 );

	17502 rc = sqlite3PagerWrite(pTrunk->pDbPage);

	17503 if( rc ){

	17504 goto end_allocate_page;

	17505 }

	17506 *pPgno = iTrunk;

	17507 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);

	17508 *ppPage = pTrunk;

	17509 pTrunk = 0;

	17510 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));

	17511 }else if( k>(u32)(pBt->usableSize/4 - 2) ){

	17512 /* Value of k is out of range. Database corruption */

	17513 rc = SQLITE_CORRUPT_BKPT;

	17514 goto end_allocate_page;

	17515 #ifndef SQLITE_OMIT_AUTOVACUUM

	17516 }else if( searchList

	17517 && (nearby==iTrunk \|\| (iTrunk<nearby && eMode==BTALLOC_LE))

	17518 ){

	17519 /* The list is being searched and this trunk page is the page

	17520 ** to allocate, regardless of whether it has leaves.

	17521 */

	17522 *pPgno = iTrunk;

	17523 *ppPage = pTrunk;

	17524 searchList = 0;

	17525 rc = sqlite3PagerWrite(pTrunk->pDbPage);

	17526 if( rc ){

	17527 goto end_allocate_page;

	17528 }

	17529 if( k==0 ){

	17530 if( !pPrevTrunk ){

	17531 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);

	17532 }else{

	17533 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);

	17534 if( rc!=SQLITE_OK ){

	17535 goto end_allocate_page;

	17536 }

	17537 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);

	17538 }

	17539 }else{

	17540 /* The trunk page is required by the caller but it contains

	17541 ** pointers to free-list leaves. The first leaf becomes a trunk

	17542 ** page in this case.

	17543 */

	17544 MemPage *pNewTrunk;

	17545 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);

	17546 if( iNewTrunk>mxPage ){

	17547 rc = SQLITE_CORRUPT_BKPT;

	17548 goto end_allocate_page;

	17549 }

	17550 testcase( iNewTrunk==mxPage );

	17551 rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);

	17552 if( rc!=SQLITE_OK ){

	17553 goto end_allocate_page;

	17554 }

	17555 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);

	17556 if( rc!=SQLITE_OK ){

	17557 releasePage(pNewTrunk);

	17558 goto end_allocate_page;

	17559 }

	17560 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);

	17561 put4byte(&pNewTrunk->aData[4], k-1);

	17562 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);

	17563 releasePage(pNewTrunk);

	17564 if( !pPrevTrunk ){

	17565 assert( sqlite3PagerIswriteable(pPage1->pDbPage) );

	17566 put4byte(&pPage1->aData[32], iNewTrunk);

	17567 }else{

	17568 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);

	17569 if( rc ){

	17570 goto end_allocate_page;

	17571 }

	17572 put4byte(&pPrevTrunk->aData[0], iNewTrunk);

	17573 }

	17574 }

	17575 pTrunk = 0;

	17576 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));

	17577 #endif

	17578 }else if( k>0 ){

	17579 /* Extract a leaf from the trunk */

	17580 u32 closest;

	17581 Pgno iPage;

	17582 unsigned char *aData = pTrunk->aData;

	17583 if( nearby>0 ){

	17584 u32 i;

	17585 closest = 0;

	17586 if( eMode==BTALLOC_LE ){

	17587 for(i=0; i<k; i++){

	17588 iPage = get4byte(&aData[8+i*4]);

	17589 if( iPage<=nearby ){

	17590 closest = i;

	17591 break;

	17592 }

	17593 }

	17594 }else{

	17595 int dist;

	17596 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);

	17597 for(i=1; i<k; i++){

	17598 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);

	17599 if( d2<dist ){

	17600 closest = i;

	17601 dist = d2;

	17602 }

	17603 }

	17604 }

	17605 }else{

	17606 closest = 0;

	17607 }

	17608

	17609 iPage = get4byte(&aData[8+closest*4]);

	17610 testcase( iPage==mxPage );

	17611 if( iPage>mxPage ){

	17612 rc = SQLITE_CORRUPT_BKPT;

	17613 goto end_allocate_page;

	17614 }

	17615 testcase( iPage==mxPage );

	17616 if( !searchList

	17617 \|\| (iPage==nearby \|\| (iPage<nearby && eMode==BTALLOC_LE))

	17618 ){

	17619 int noContent;

	17620 *pPgno = iPage;

	17621 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"

	17622 ": %d more free pages\n",

	17623 *pPgno, closest+1, k, pTrunk->pgno, n-1));

	17624 rc = sqlite3PagerWrite(pTrunk->pDbPage);

	17625 if( rc ) goto end_allocate_page;

	17626 if( closest<k-1 ){

	17627 memcpy(&aData[8+closest4], &aData[4+k4], 4);

	17628 }

	17629 put4byte(&aData[4], k-1);

	17630 noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;

	17631 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);

	17632 if( rc==SQLITE_OK ){

	17633 rc = sqlite3PagerWrite((*ppPage)->pDbPage);

	17634 if( rc!=SQLITE_OK ){

	17635 releasePage(*ppPage);

	17636 *ppPage = 0;

	17637 }

	17638 }

	17639 searchList = 0;

	17640 }

	17641 }

	17642 releasePage(pPrevTrunk);

	17643 pPrevTrunk = 0;

	17644 }while( searchList );

	17645 }else{

	17646 /* There are no pages on the freelist, so append a new page to the

	17647 ** database image.

	17648 **

	17649 ** Normally, new pages allocated by this block can be requested from the

	17650 ** pager layer with the 'no-content' flag set. This prevents the pager

	17651 ** from trying to read the pages content from disk. However, if the

	17652 ** current transaction has already run one or more incremental-vacuum

	17653 ** steps, then the page we are about to allocate may contain content

	17654 ** that is required in the event of a rollback. In this case, do

	17655 ** not set the no-content flag. This causes the pager to load and journal

	17656 ** the current page content before overwriting it.

	17657 **

	17658 ** Note that the pager will not actually attempt to load or journal

	17659 ** content for any page that really does lie past the end of the database

	17660 ** file on disk. So the effects of disabling the no-content optimization

	17661 ** here are confined to those pages that lie between the end of the

	17662 ** database image and the end of the database file.

	17663 */

	17664 int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;

	17665

	17666 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	17667 if( rc ) return rc;

	17668 pBt->nPage++;

	17669 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;

	17670

	17671 #ifndef SQLITE_OMIT_AUTOVACUUM

	17672 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){

	17673 /* If *pPgno refers to a pointer-map page, allocate two new pages

	17674 ** at the end of the file instead of one. The first allocated page

	17675 ** becomes a new pointer-map page, the second is used by the caller.

	17676 */

	17677 MemPage *pPg = 0;

	17678 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));

	17679 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );

	17680 rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);

	17681 if( rc==SQLITE_OK ){

	17682 rc = sqlite3PagerWrite(pPg->pDbPage);

	17683 releasePage(pPg);

	17684 }

	17685 if( rc ) return rc;

	17686 pBt->nPage++;

	17687 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }

	17688 }

	17689 #endif

	17690 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);

	17691 *pPgno = pBt->nPage;

	17692

	17693 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );

	17694 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);

	17695 if( rc ) return rc;

	17696 rc = sqlite3PagerWrite((*ppPage)->pDbPage);

	17697 if( rc!=SQLITE_OK ){

	17698 releasePage(*ppPage);

	17699 *ppPage = 0;

	17700 }

	17701 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));

	17702 }

	17703

	17704 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );

	17705

	17706 end_allocate_page:

	17707 releasePage(pTrunk);

	17708 releasePage(pPrevTrunk);

	17709 assert( rc!=SQLITE_OK \|\| sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );

	17710 assert( rc!=SQLITE_OK \|\| (*ppPage)->isInit==0 );

	17711 return rc;

	17712 }

	17713

	17714 /*

	17715 ** This function is used to add page iPage to the database file free-list.

	17716 ** It is assumed that the page is not already a part of the free-list.

	17717 **

	17718 ** The value passed as the second argument to this function is optional.

	17719 ** If the caller happens to have a pointer to the MemPage object

	17720 ** corresponding to page iPage handy, it may pass it as the second value.

	17721 ** Otherwise, it may pass NULL.

	17722 **

	17723 ** If a pointer to a MemPage object is passed as the second argument,

	17724 ** its reference count is not altered by this function.

	17725 */

	17726 static int freePage2(BtShared pBt, MemPage pMemPage, Pgno iPage){

	17727 MemPage pTrunk = 0; / Free-list trunk page */

	17728 Pgno iTrunk = 0; /* Page number of free-list trunk page */

	17729 MemPage pPage1 = pBt->pPage1; / Local reference to page 1 */

	17730 MemPage pPage; / Page being freed. May be NULL. */

	17731 int rc; /* Return Code */

	17732 int nFree; /* Initial number of pages on free-list */

	17733

	17734 assert( sqlite3_mutex_held(pBt->mutex) );

	17735 assert( CORRUPT_DB \|\| iPage>1 );

	17736 assert( !pMemPage \|\| pMemPage->pgno==iPage );

	17737

	17738 if( iPage<2 ) return SQLITE_CORRUPT_BKPT;

	17739 if( pMemPage ){

	17740 pPage = pMemPage;

	17741 sqlite3PagerRef(pPage->pDbPage);

	17742 }else{

	17743 pPage = btreePageLookup(pBt, iPage);

	17744 }

	17745

	17746 /* Increment the free page count on pPage1 */

	17747 rc = sqlite3PagerWrite(pPage1->pDbPage);

	17748 if( rc ) goto freepage_out;

	17749 nFree = get4byte(&pPage1->aData[36]);

	17750 put4byte(&pPage1->aData[36], nFree+1);

	17751

	17752 if( pBt->btsFlags & BTS_SECURE_DELETE ){

	17753 /* If the secure_delete option is enabled, then

	17754 ** always fully overwrite deleted information with zeros.

	17755 */

	17756 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )

	17757 \|\| ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)

	17758 ){

	17759 goto freepage_out;

	17760 }

	17761 memset(pPage->aData, 0, pPage->pBt->pageSize);

	17762 }

	17763

	17764 /* If the database supports auto-vacuum, write an entry in the pointer-map

	17765 ** to indicate that the page is free.

	17766 */

	17767 if( ISAUTOVACUUM ){

	17768 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);

	17769 if( rc ) goto freepage_out;

	17770 }

	17771

	17772 /* Now manipulate the actual database free-list structure. There are two

	17773 ** possibilities. If the free-list is currently empty, or if the first

	17774 ** trunk page in the free-list is full, then this page will become a

	17775 ** new free-list trunk page. Otherwise, it will become a leaf of the

	17776 ** first trunk page in the current free-list. This block tests if it

	17777 ** is possible to add the page as a new free-list leaf.

	17778 */

	17779 if( nFree!=0 ){

	17780 u32 nLeaf; /* Initial number of leaf cells on trunk page */

	17781

	17782 iTrunk = get4byte(&pPage1->aData[32]);

	17783 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);

	17784 if( rc!=SQLITE_OK ){

	17785 goto freepage_out;

	17786 }

	17787

	17788 nLeaf = get4byte(&pTrunk->aData[4]);

	17789 assert( pBt->usableSize>32 );

	17790 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){

	17791 rc = SQLITE_CORRUPT_BKPT;

	17792 goto freepage_out;

	17793 }

	17794 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){

	17795 /* In this case there is room on the trunk page to insert the page

	17796 ** being freed as a new leaf.

	17797 **

	17798 ** Note that the trunk page is not really full until it contains

	17799 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have

	17800 ** coded. But due to a coding error in versions of SQLite prior to

	17801 ** 3.6.0, databases with freelist trunk pages holding more than

	17802 ** usableSize/4 - 8 entries will be reported as corrupt. In order

	17803 ** to maintain backwards compatibility with older versions of SQLite,

	17804 ** we will continue to restrict the number of entries to usableSize/4 - 8

	17805 ** for now. At some point in the future (once everyone has upgraded

	17806 ** to 3.6.0 or later) we should consider fixing the conditional above

	17807 ** to read "usableSize/4-2" instead of "usableSize/4-8".

	17808 **

	17809 ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still

	17810 ** avoid using the last six entries in the freelist trunk page array in

	17811 ** order that database files created by newer versions of SQLite can be

	17812 ** read by older versions of SQLite.

	17813 */

	17814 rc = sqlite3PagerWrite(pTrunk->pDbPage);

	17815 if( rc==SQLITE_OK ){

	17816 put4byte(&pTrunk->aData[4], nLeaf+1);

	17817 put4byte(&pTrunk->aData[8+nLeaf*4], iPage);

	17818 if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){

	17819 sqlite3PagerDontWrite(pPage->pDbPage);

	17820 }

	17821 rc = btreeSetHasContent(pBt, iPage);

	17822 }

	17823 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));

	17824 goto freepage_out;

	17825 }

	17826 }

	17827

	17828 /* If control flows to this point, then it was not possible to add the

	17829 ** the page being freed as a leaf page of the first trunk in the free-list.

	17830 ** Possibly because the free-list is empty, or possibly because the

	17831 ** first trunk in the free-list is full. Either way, the page being freed

	17832 ** will become the new first trunk page in the free-list.

	17833 */

	17834 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){

	17835 goto freepage_out;

	17836 }

	17837 rc = sqlite3PagerWrite(pPage->pDbPage);

	17838 if( rc!=SQLITE_OK ){

	17839 goto freepage_out;

	17840 }

	17841 put4byte(pPage->aData, iTrunk);

	17842 put4byte(&pPage->aData[4], 0);

	17843 put4byte(&pPage1->aData[32], iPage);

	17844 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));

	17845

	17846 freepage_out:

	17847 if( pPage ){

	17848 pPage->isInit = 0;

	17849 }

	17850 releasePage(pPage);

	17851 releasePage(pTrunk);

	17852 return rc;

	17853 }

	17854 static void freePage(MemPage pPage, int pRC){

	17855 if( (*pRC)==SQLITE_OK ){

	17856 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);

	17857 }

	17858 }

	17859

	17860 /*

	17861 ** Free any overflow pages associated with the given Cell. Write the

	17862 ** local Cell size (the number of bytes on the original page, omitting

	17863 ** overflow) into *pnSize.

	17864 */

	17865 static int clearCell(

	17866 MemPage pPage, / The page that contains the Cell */

	17867 unsigned char pCell, / First byte of the Cell */

	17868 u16 pnSize / Write the size of the Cell here */

	17869 ){

	17870 BtShared *pBt = pPage->pBt;

	17871 CellInfo info;

	17872 Pgno ovflPgno;

	17873 int rc;

	17874 int nOvfl;

	17875 u32 ovflPageSize;

	17876

	17877 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	17878 pPage->xParseCell(pPage, pCell, &info);

	17879 *pnSize = info.nSize;

	17880 if( info.nLocal==info.nPayload ){

	17881 return SQLITE_OK; /* No overflow pages. Return without doing anything */

	17882 }

	17883 if( pCell+info.nSize-1 > pPage->aData+pPage->maskPage ){

	17884 return SQLITE_CORRUPT_BKPT; /* Cell extends past end of page */

	17885 }

	17886 ovflPgno = get4byte(pCell + info.nSize - 4);

	17887 assert( pBt->usableSize > 4 );

	17888 ovflPageSize = pBt->usableSize - 4;

	17889 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;

	17890 assert( nOvfl>0 \|\|

	17891 (CORRUPT_DB && (info.nPayload + ovflPageSize)<ovflPageSize)

	17892 );

	17893 while( nOvfl-- ){

	17894 Pgno iNext = 0;

	17895 MemPage *pOvfl = 0;

	17896 if( ovflPgno<2 \|\| ovflPgno>btreePagecount(pBt) ){

	17897 /* 0 is not a legal page number and page 1 cannot be an

	17898 ** overflow page. Therefore if ovflPgno<2 or past the end of the

	17899 ** file the database must be corrupt. */

	17900 return SQLITE_CORRUPT_BKPT;

	17901 }

	17902 if( nOvfl ){

	17903 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);

	17904 if( rc ) return rc;

	17905 }

	17906

	17907 if( ( pOvfl \|\| ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )

	17908 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1

	17909 ){

	17910 /* There is no reason any cursor should have an outstanding reference

	17911 ** to an overflow page belonging to a cell that is being deleted/updated.

	17912 ** So if there exists more than one reference to this page, then it

	17913 ** must not really be an overflow page and the database must be corrupt.

	17914 ** It is helpful to detect this before calling freePage2(), as

	17915 ** freePage2() may zero the page contents if secure-delete mode is

	17916 ** enabled. If this 'overflow' page happens to be a page that the

	17917 ** caller is iterating through or using in some other way, this

	17918 ** can be problematic.

	17919 */

	17920 rc = SQLITE_CORRUPT_BKPT;

	17921 }else{

	17922 rc = freePage2(pBt, pOvfl, ovflPgno);

	17923 }

	17924

	17925 if( pOvfl ){

	17926 sqlite3PagerUnref(pOvfl->pDbPage);

	17927 }

	17928 if( rc ) return rc;

	17929 ovflPgno = iNext;

	17930 }

	17931 return SQLITE_OK;

	17932 }

	17933

	17934 /*

	17935 ** Create the byte sequence used to represent a cell on page pPage

	17936 ** and write that byte sequence into pCell[]. Overflow pages are

	17937 ** allocated and filled in as necessary. The calling procedure

	17938 ** is responsible for making sure sufficient space has been allocated

	17939 ** for pCell[].

	17940 **

	17941 ** Note that pCell does not necessary need to point to the pPage->aData

	17942 ** area. pCell might point to some temporary storage. The cell will

	17943 ** be constructed in this temporary area then copied into pPage->aData

	17944 ** later.

	17945 */

	17946 static int fillInCell(

	17947 MemPage pPage, / The page that contains the cell */

	17948 unsigned char pCell, / Complete text of the cell */

	17949 const void pKey, i64 nKey, / The key */

	17950 const void pData,int nData, / The data */

	17951 int nZero, /* Extra zero bytes to append to pData */

	17952 int pnSize / Write cell size here */

	17953 ){

	17954 int nPayload;

	17955 const u8 *pSrc;

	17956 int nSrc, n, rc;

	17957 int spaceLeft;

	17958 MemPage *pOvfl = 0;

	17959 MemPage *pToRelease = 0;

	17960 unsigned char *pPrior;

	17961 unsigned char *pPayload;

	17962 BtShared *pBt = pPage->pBt;

	17963 Pgno pgnoOvfl = 0;

	17964 int nHeader;

	17965

	17966 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	17967

	17968 /* pPage is not necessarily writeable since pCell might be auxiliary

	17969 ** buffer space that is separate from the pPage buffer area */

	17970 assert( pCell<pPage->aData \|\| pCell>=&pPage->aData[pBt->pageSize]

	17971 \|\| sqlite3PagerIswriteable(pPage->pDbPage) );

	17972

	17973 /* Fill in the header. */

	17974 nHeader = pPage->childPtrSize;

	17975 nPayload = nData + nZero;

	17976 if( pPage->intKeyLeaf ){

	17977 nHeader += putVarint32(&pCell[nHeader], nPayload);

	17978 }else{

	17979 assert( nData==0 );

	17980 assert( nZero==0 );

	17981 }

	17982 nHeader += putVarint(&pCell[nHeader], (u64)&nKey);

	17983

	17984 /* Fill in the payload size */

	17985 if( pPage->intKey ){

	17986 pSrc = pData;

	17987 nSrc = nData;

	17988 nData = 0;

	17989 }else{

	17990 assert( nKey<=0x7fffffff && pKey!=0 );

	17991 nPayload = (int)nKey;

	17992 pSrc = pKey;

	17993 nSrc = (int)nKey;

	17994 }

	17995 if( nPayload<=pPage->maxLocal ){

	17996 n = nHeader + nPayload;

	17997 testcase( n==3 );

	17998 testcase( n==4 );

	17999 if( n<4 ) n = 4;

	18000 *pnSize = n;

	18001 spaceLeft = nPayload;

	18002 pPrior = pCell;

	18003 }else{

	18004 int mn = pPage->minLocal;

	18005 n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);

	18006 testcase( n==pPage->maxLocal );

	18007 testcase( n==pPage->maxLocal+1 );

	18008 if( n > pPage->maxLocal ) n = mn;

	18009 spaceLeft = n;

	18010 *pnSize = n + nHeader + 4;

	18011 pPrior = &pCell[nHeader+n];

	18012 }

	18013 pPayload = &pCell[nHeader];

	18014

	18015 /* At this point variables should be set as follows:

	18016 **

	18017 ** nPayload Total payload size in bytes

	18018 ** pPayload Begin writing payload here

	18019 ** spaceLeft Space available at pPayload. If nPayload>spaceLeft,

	18020 ** that means content must spill into overflow pages.

	18021 ** *pnSize Size of the local cell (not counting overflow pages)

	18022 ** pPrior Where to write the pgno of the first overflow page

	18023 **

	18024 ** Use a call to btreeParseCellPtr() to verify that the values above

	18025 ** were computed correctly.

	18026 */

	18027 #if SQLITE_DEBUG

	18028 {

	18029 CellInfo info;

	18030 pPage->xParseCell(pPage, pCell, &info);

	18031 assert( nHeader=(int)(info.pPayload - pCell) );

	18032 assert( info.nKey==nKey );

	18033 assert( *pnSize == info.nSize );

	18034 assert( spaceLeft == info.nLocal );

	18035 }

	18036 #endif

	18037

	18038 /* Write the payload into the local Cell and any extra into overflow pages */

	18039 while( nPayload>0 ){

	18040 if( spaceLeft==0 ){

	18041 #ifndef SQLITE_OMIT_AUTOVACUUM

	18042 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */

	18043 if( pBt->autoVacuum ){

	18044 do{

	18045 pgnoOvfl++;

	18046 } while(

	18047 PTRMAP_ISPAGE(pBt, pgnoOvfl) \|\| pgnoOvfl==PENDING_BYTE_PAGE(pBt)

	18048 );

	18049 }

	18050 #endif

	18051 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);

	18052 #ifndef SQLITE_OMIT_AUTOVACUUM

	18053 /* If the database supports auto-vacuum, and the second or subsequent

	18054 ** overflow page is being allocated, add an entry to the pointer-map

	18055 ** for that page now.

	18056 **

	18057 ** If this is the first overflow page, then write a partial entry

	18058 ** to the pointer-map. If we write nothing to this pointer-map slot,

	18059 ** then the optimistic overflow chain processing in clearCell()

	18060 ** may misinterpret the uninitialized values and delete the

	18061 ** wrong pages from the database.

	18062 */

	18063 if( pBt->autoVacuum && rc==SQLITE_OK ){

	18064 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);

	18065 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);

	18066 if( rc ){

	18067 releasePage(pOvfl);

	18068 }

	18069 }

	18070 #endif

	18071 if( rc ){

	18072 releasePage(pToRelease);

	18073 return rc;

	18074 }

	18075

	18076 /* If pToRelease is not zero than pPrior points into the data area

	18077 ** of pToRelease. Make sure pToRelease is still writeable. */

	18078 assert( pToRelease==0 \|\| sqlite3PagerIswriteable(pToRelease->pDbPage) );

	18079

	18080 /* If pPrior is part of the data area of pPage, then make sure pPage

	18081 ** is still writeable */

	18082 assert( pPrior<pPage->aData \|\| pPrior>=&pPage->aData[pBt->pageSize]

	18083 \|\| sqlite3PagerIswriteable(pPage->pDbPage) );

	18084

	18085 put4byte(pPrior, pgnoOvfl);

	18086 releasePage(pToRelease);

	18087 pToRelease = pOvfl;

	18088 pPrior = pOvfl->aData;

	18089 put4byte(pPrior, 0);

	18090 pPayload = &pOvfl->aData[4];

	18091 spaceLeft = pBt->usableSize - 4;

	18092 }

	18093 n = nPayload;

	18094 if( n>spaceLeft ) n = spaceLeft;

	18095

	18096 /* If pToRelease is not zero than pPayload points into the data area

	18097 ** of pToRelease. Make sure pToRelease is still writeable. */

	18098 assert( pToRelease==0 \|\| sqlite3PagerIswriteable(pToRelease->pDbPage) );

	18099

	18100 /* If pPayload is part of the data area of pPage, then make sure pPage

	18101 ** is still writeable */

	18102 assert( pPayload<pPage->aData \|\| pPayload>=&pPage->aData[pBt->pageSize]

	18103 \|\| sqlite3PagerIswriteable(pPage->pDbPage) );

	18104

	18105 if( nSrc>0 ){

	18106 if( n>nSrc ) n = nSrc;

	18107 assert( pSrc );

	18108 memcpy(pPayload, pSrc, n);

	18109 }else{

	18110 memset(pPayload, 0, n);

	18111 }

	18112 nPayload -= n;

	18113 pPayload += n;

	18114 pSrc += n;

	18115 nSrc -= n;

	18116 spaceLeft -= n;

	18117 if( nSrc==0 ){

	18118 nSrc = nData;

	18119 pSrc = pData;

	18120 }

	18121 }

	18122 releasePage(pToRelease);

	18123 return SQLITE_OK;

	18124 }

	18125

	18126 /*

	18127 ** Remove the i-th cell from pPage. This routine effects pPage only.

	18128 ** The cell content is not freed or deallocated. It is assumed that

	18129 ** the cell content has been copied someplace else. This routine just

	18130 ** removes the reference to the cell from pPage.

	18131 **

	18132 ** "sz" must be the number of bytes in the cell.

	18133 */

	18134 static void dropCell(MemPage pPage, int idx, int sz, int pRC){

	18135 u32 pc; /* Offset to cell content of cell being deleted */

	18136 u8 data; / pPage->aData */

	18137 u8 ptr; / Used to move bytes around within data[] */

	18138 int rc; /* The return code */

	18139 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */

	18140

	18141 if( *pRC ) return;

	18142

	18143 assert( idx>=0 && idx<pPage->nCell );

	18144 assert( CORRUPT_DB \|\| sz==cellSize(pPage, idx) );

	18145 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	18146 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	18147 data = pPage->aData;

	18148 ptr = &pPage->aCellIdx[2*idx];

	18149 pc = get2byte(ptr);

	18150 hdr = pPage->hdrOffset;

	18151 testcase( pc==get2byte(&data[hdr+5]) );

	18152 testcase( pc+sz==pPage->pBt->usableSize );

	18153 if( pc < (u32)get2byte(&data[hdr+5]) \|\| pc+sz > pPage->pBt->usableSize ){

	18154 *pRC = SQLITE_CORRUPT_BKPT;

	18155 return;

	18156 }

	18157 rc = freeSpace(pPage, pc, sz);

	18158 if( rc ){

	18159 *pRC = rc;

	18160 return;

	18161 }

	18162 pPage->nCell--;

	18163 if( pPage->nCell==0 ){

	18164 memset(&data[hdr+1], 0, 4);

	18165 data[hdr+7] = 0;

	18166 put2byte(&data[hdr+5], pPage->pBt->usableSize);

	18167 pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset

	18168 - pPage->childPtrSize - 8;

	18169 }else{

	18170 memmove(ptr, ptr+2, 2*(pPage->nCell - idx));

	18171 put2byte(&data[hdr+3], pPage->nCell);

	18172 pPage->nFree += 2;

	18173 }

	18174 }

	18175

	18176 /*

	18177 ** Insert a new cell on pPage at cell index "i". pCell points to the

	18178 ** content of the cell.

	18179 **

	18180 ** If the cell content will fit on the page, then put it there. If it

	18181 ** will not fit, then make a copy of the cell content into pTemp if

	18182 ** pTemp is not null. Regardless of pTemp, allocate a new entry

	18183 ** in pPage->apOvfl[] and make it point to the cell content (either

	18184 ** in pTemp or the original pCell) and also record its index.

	18185 ** Allocating a new entry in pPage->aCell[] implies that

	18186 ** pPage->nOverflow is incremented.

	18187 */

	18188 static void insertCell(

	18189 MemPage pPage, / Page into which we are copying */

	18190 int i, /* New cell becomes the i-th cell of the page */

	18191 u8 pCell, / Content of the new cell */

	18192 int sz, /* Bytes of content in pCell */

	18193 u8 pTemp, / Temp storage space for pCell, if needed */

	18194 Pgno iChild, /* If non-zero, replace first 4 bytes with this value */

	18195 int pRC / Read and write return code from here */

	18196 ){

	18197 int idx = 0; /* Where to write new cell content in data[] */

	18198 int j; /* Loop counter */

	18199 u8 data; / The content of the whole page */

	18200 u8 pIns; / The point in pPage->aCellIdx[] where no cell inserted */

	18201

	18202 if( *pRC ) return;

	18203

	18204 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );

	18205 assert( MX_CELL(pPage->pBt)<=10921 );

	18206 assert( pPage->nCell<=MX_CELL(pPage->pBt) \|\| CORRUPT_DB );

	18207 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );

	18208 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );

	18209 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	18210 /* The cell should normally be sized correctly. However, when moving a

	18211 ** malformed cell from a leaf page to an interior page, if the cell size

	18212 ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size

	18213 ** might be less than 8 (leaf-size + pointer) on the interior node. Hence

	18214 ** the term after the \|\| in the following assert(). */

	18215 assert( sz==pPage->xCellSize(pPage, pCell) \|\| (sz==8 && iChild>0) );

	18216 if( pPage->nOverflow \|\| sz+2>pPage->nFree ){

	18217 if( pTemp ){

	18218 memcpy(pTemp, pCell, sz);

	18219 pCell = pTemp;

	18220 }

	18221 if( iChild ){

	18222 put4byte(pCell, iChild);

	18223 }

	18224 j = pPage->nOverflow++;

	18225 assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) );

	18226 pPage->apOvfl[j] = pCell;

	18227 pPage->aiOvfl[j] = (u16)i;

	18228

	18229 /* When multiple overflows occur, they are always sequential and in

	18230 ** sorted order. This invariants arise because multiple overflows can

	18231 ** only occur when inserting divider cells into the parent page during

	18232 ** balancing, and the dividers are adjacent and sorted.

	18233 */

	18234 assert( j==0 \|\| pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */

	18235 assert( j==0 \|\| i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */

	18236 }else{

	18237 int rc = sqlite3PagerWrite(pPage->pDbPage);

	18238 if( rc!=SQLITE_OK ){

	18239 *pRC = rc;

	18240 return;

	18241 }

	18242 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

	18243 data = pPage->aData;

	18244 assert( &data[pPage->cellOffset]==pPage->aCellIdx );

	18245 rc = allocateSpace(pPage, sz, &idx);

	18246 if( rc ){ *pRC = rc; return; }

	18247 /* The allocateSpace() routine guarantees the following properties

	18248 ** if it returns successfully */

	18249 assert( idx >= 0 );

	18250 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 \|\| CORRUPT_DB );

	18251 assert( idx+sz <= (int)pPage->pBt->usableSize );

	18252 pPage->nFree -= (u16)(2 + sz);

	18253 memcpy(&data[idx], pCell, sz);

	18254 if( iChild ){

	18255 put4byte(&data[idx], iChild);

	18256 }

	18257 pIns = pPage->aCellIdx + i*2;

	18258 memmove(pIns+2, pIns, 2*(pPage->nCell - i));

	18259 put2byte(pIns, idx);

	18260 pPage->nCell++;

	18261 /* increment the cell count */

	18262 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;

	18263 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell );

	18264 #ifndef SQLITE_OMIT_AUTOVACUUM

	18265 if( pPage->pBt->autoVacuum ){

	18266 /* The cell may contain a pointer to an overflow page. If so, write

	18267 ** the entry for the overflow page into the pointer map.

	18268 */

	18269 ptrmapPutOvflPtr(pPage, pCell, pRC);

	18270 }

	18271 #endif

	18272 }

	18273 }

	18274

	18275 /*

	18276 ** A CellArray object contains a cache of pointers and sizes for a

	18277 ** consecutive sequence of cells that might be held multiple pages.

	18278 */

	18279 typedef struct CellArray CellArray;

	18280 struct CellArray {

	18281 int nCell; /* Number of cells in apCell[] */

	18282 MemPage pRef; / Reference page */

	18283 u8 *apCell; / All cells begin balanced */

	18284 u16 szCell; / Local size of all cells in apCell[] */

	18285 };

	18286

	18287 /*

	18288 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been

	18289 ** computed.

	18290 */

	18291 static void populateCellCache(CellArray *p, int idx, int N){

	18292 assert( idx>=0 && idx+N<=p->nCell );

	18293 while( N>0 ){

	18294 assert( p->apCell[idx]!=0 );

	18295 if( p->szCell[idx]==0 ){

	18296 p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);

	18297 }else{

	18298 assert( CORRUPT_DB \|\|

	18299 p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );

	18300 }

	18301 idx++;

	18302 N--;

	18303 }

	18304 }

	18305

	18306 /*

	18307 ** Return the size of the Nth element of the cell array

	18308 */

	18309 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){

	18310 assert( N>=0 && N<p->nCell );

	18311 assert( p->szCell[N]==0 );

	18312 p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);

	18313 return p->szCell[N];

	18314 }

	18315 static u16 cachedCellSize(CellArray *p, int N){

	18316 assert( N>=0 && N<p->nCell );

	18317 if( p->szCell[N] ) return p->szCell[N];

	18318 return computeCellSize(p, N);

	18319 }

	18320

	18321 /*

	18322 ** Array apCell[] contains pointers to nCell b-tree page cells. The

	18323 ** szCell[] array contains the size in bytes of each cell. This function

	18324 ** replaces the current contents of page pPg with the contents of the cell

	18325 ** array.

	18326 **

	18327 ** Some of the cells in apCell[] may currently be stored in pPg. This

	18328 ** function works around problems caused by this by making a copy of any

	18329 ** such cells before overwriting the page data.

	18330 **

	18331 ** The MemPage.nFree field is invalidated by this function. It is the

	18332 ** responsibility of the caller to set it correctly.

	18333 */

	18334 static int rebuildPage(

	18335 MemPage pPg, / Edit this page */

	18336 int nCell, /* Final number of cells on page */

	18337 u8 *apCell, / Array of cells */

	18338 u16 szCell / Array of cell sizes */

	18339 ){

	18340 const int hdr = pPg->hdrOffset; /* Offset of header on pPg */

	18341 u8 * const aData = pPg->aData; /* Pointer to data for pPg */

	18342 const int usableSize = pPg->pBt->usableSize;

	18343 u8 * const pEnd = &aData[usableSize];

	18344 int i;

	18345 u8 *pCellptr = pPg->aCellIdx;

	18346 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);

	18347 u8 *pData;

	18348

	18349 i = get2byte(&aData[hdr+5]);

	18350 memcpy(&pTmp[i], &aData[i], usableSize - i);

	18351

	18352 pData = pEnd;

	18353 for(i=0; i<nCell; i++){

	18354 u8 *pCell = apCell[i];

	18355 if( SQLITE_WITHIN(pCell,aData,pEnd) ){

	18356 pCell = &pTmp[pCell - aData];

	18357 }

	18358 pData -= szCell[i];

	18359 put2byte(pCellptr, (pData - aData));

	18360 pCellptr += 2;

	18361 if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;

	18362 memcpy(pData, pCell, szCell[i]);

	18363 assert( szCell[i]==pPg->xCellSize(pPg, pCell) \|\| CORRUPT_DB );

	18364 testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) );

	18365 }

	18366

	18367 /* The pPg->nFree field is now set incorrectly. The caller will fix it. */

	18368 pPg->nCell = nCell;

	18369 pPg->nOverflow = 0;

	18370

	18371 put2byte(&aData[hdr+1], 0);

	18372 put2byte(&aData[hdr+3], pPg->nCell);

	18373 put2byte(&aData[hdr+5], pData - aData);

	18374 aData[hdr+7] = 0x00;

	18375 return SQLITE_OK;

	18376 }

	18377

	18378 /*

	18379 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell

	18380 ** contains the size in bytes of each such cell. This function attempts to

	18381 ** add the cells stored in the array to page pPg. If it cannot (because

	18382 ** the page needs to be defragmented before the cells will fit), non-zero

	18383 ** is returned. Otherwise, if the cells are added successfully, zero is

	18384 ** returned.

	18385 **

	18386 ** Argument pCellptr points to the first entry in the cell-pointer array

	18387 ** (part of page pPg) to populate. After cell apCell[0] is written to the

	18388 ** page body, a 16-bit offset is written to pCellptr. And so on, for each

	18389 ** cell in the array. It is the responsibility of the caller to ensure

	18390 ** that it is safe to overwrite this part of the cell-pointer array.

	18391 **

	18392 ** When this function is called, *ppData points to the start of the

	18393 ** content area on page pPg. If the size of the content area is extended,

	18394 ** *ppData is updated to point to the new start of the content area

	18395 ** before returning.

	18396 **

	18397 ** Finally, argument pBegin points to the byte immediately following the

	18398 ** end of the space required by this page for the cell-pointer area (for

	18399 ** all cells - not just those inserted by the current call). If the content

	18400 ** area must be extended to before this point in order to accomodate all

	18401 ** cells in apCell[], then the cells do not fit and non-zero is returned.

	18402 */

	18403 static int pageInsertArray(

	18404 MemPage pPg, / Page to add cells to */

	18405 u8 pBegin, / End of cell-pointer array */

	18406 u8 *ppData, / IN/OUT: Page content -area pointer */

	18407 u8 pCellptr, / Pointer to cell-pointer area */

	18408 int iFirst, /* Index of first cell to add */

	18409 int nCell, /* Number of cells to add to pPg */

	18410 CellArray pCArray / Array of cells */

	18411 ){

	18412 int i;

	18413 u8 *aData = pPg->aData;

	18414 u8 pData = ppData;

	18415 int iEnd = iFirst + nCell;

	18416 assert( CORRUPT_DB \|\| pPg->hdrOffset==0 ); /* Never called on page 1 */

	18417 for(i=iFirst; i<iEnd; i++){

	18418 int sz, rc;

	18419 u8 *pSlot;

	18420 sz = cachedCellSize(pCArray, i);

	18421 if( (aData[1]==0 && aData[2]==0) \|\| (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){

	18422 pData -= sz;

	18423 if( pData<pBegin ) return 1;

	18424 pSlot = pData;

	18425 }

	18426 /* pSlot and pCArray->apCell[i] will never overlap on a well-formed

	18427 ** database. But they might for a corrupt database. Hence use memmove()

	18428 ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */

	18429 assert( (pSlot+sz)<=pCArray->apCell[i]

	18430 \|\| pSlot>=(pCArray->apCell[i]+sz)

	18431 \|\| CORRUPT_DB );

	18432 memmove(pSlot, pCArray->apCell[i], sz);

	18433 put2byte(pCellptr, (pSlot - aData));

	18434 pCellptr += 2;

	18435 }

	18436 *ppData = pData;

	18437 return 0;

	18438 }

	18439

	18440 /*

	18441 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell

	18442 ** contains the size in bytes of each such cell. This function adds the

	18443 ** space associated with each cell in the array that is currently stored

	18444 ** within the body of pPg to the pPg free-list. The cell-pointers and other

	18445 ** fields of the page are not updated.

	18446 **

	18447 ** This function returns the total number of cells added to the free-list.

	18448 */

	18449 static int pageFreeArray(

	18450 MemPage pPg, / Page to edit */

	18451 int iFirst, /* First cell to delete */

	18452 int nCell, /* Cells to delete */

	18453 CellArray pCArray / Array of cells */

	18454 ){

	18455 u8 * const aData = pPg->aData;

	18456 u8 * const pEnd = &aData[pPg->pBt->usableSize];

	18457 u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];

	18458 int nRet = 0;

	18459 int i;

	18460 int iEnd = iFirst + nCell;

	18461 u8 *pFree = 0;

	18462 int szFree = 0;

	18463

	18464 for(i=iFirst; i<iEnd; i++){

	18465 u8 *pCell = pCArray->apCell[i];

	18466 if( SQLITE_WITHIN(pCell, pStart, pEnd) ){

	18467 int sz;

	18468 /* No need to use cachedCellSize() here. The sizes of all cells that

	18469 ** are to be freed have already been computing while deciding which

	18470 ** cells need freeing */

	18471 sz = pCArray->szCell[i]; assert( sz>0 );

	18472 if( pFree!=(pCell + sz) ){

	18473 if( pFree ){

	18474 assert( pFree>aData && (pFree - aData)<65536 );

	18475 freeSpace(pPg, (u16)(pFree - aData), szFree);

	18476 }

	18477 pFree = pCell;

	18478 szFree = sz;

	18479 if( pFree+sz>pEnd ) return 0;

	18480 }else{

	18481 pFree = pCell;

	18482 szFree += sz;

	18483 }

	18484 nRet++;

	18485 }

	18486 }

	18487 if( pFree ){

	18488 assert( pFree>aData && (pFree - aData)<65536 );

	18489 freeSpace(pPg, (u16)(pFree - aData), szFree);

	18490 }

	18491 return nRet;

	18492 }

	18493

	18494 /*

	18495 ** apCell[] and szCell[] contains pointers to and sizes of all cells in the

	18496 ** pages being balanced. The current page, pPg, has pPg->nCell cells starting

	18497 ** with apCell[iOld]. After balancing, this page should hold nNew cells

	18498 ** starting at apCell[iNew].

	18499 **

	18500 ** This routine makes the necessary adjustments to pPg so that it contains

	18501 ** the correct cells after being balanced.

	18502 **

	18503 ** The pPg->nFree field is invalid when this function returns. It is the

	18504 ** responsibility of the caller to set it correctly.

	18505 */

	18506 static int editPage(

	18507 MemPage pPg, / Edit this page */

	18508 int iOld, /* Index of first cell currently on page */

	18509 int iNew, /* Index of new first cell on page */

	18510 int nNew, /* Final number of cells on page */

	18511 CellArray pCArray / Array of cells and sizes */

	18512 ){

	18513 u8 * const aData = pPg->aData;

	18514 const int hdr = pPg->hdrOffset;

	18515 u8 pBegin = &pPg->aCellIdx[nNew 2];

	18516 int nCell = pPg->nCell; /* Cells stored on pPg */

	18517 u8 *pData;

	18518 u8 *pCellptr;

	18519 int i;

	18520 int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;

	18521 int iNewEnd = iNew + nNew;

	18522

	18523 #ifdef SQLITE_DEBUG

	18524 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);

	18525 memcpy(pTmp, aData, pPg->pBt->usableSize);

	18526 #endif

	18527

	18528 /* Remove cells from the start and end of the page */

	18529 if( iOld<iNew ){

	18530 int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);

	18531 memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift2], nCell2);

	18532 nCell -= nShift;

	18533 }

	18534 if( iNewEnd < iOldEnd ){

	18535 nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);

	18536 }

	18537

	18538 pData = &aData[get2byteNotZero(&aData[hdr+5])];

	18539 if( pData<pBegin ) goto editpage_fail;

	18540

	18541 /* Add cells to the start of the page */

	18542 if( iNew<iOld ){

	18543 int nAdd = MIN(nNew,iOld-iNew);

	18544 assert( (iOld-iNew)<nNew \|\| nCell==0 \|\| CORRUPT_DB );

	18545 pCellptr = pPg->aCellIdx;

	18546 memmove(&pCellptr[nAdd2], pCellptr, nCell2);

	18547 if( pageInsertArray(

	18548 pPg, pBegin, &pData, pCellptr,

	18549 iNew, nAdd, pCArray

	18550 ) ) goto editpage_fail;

	18551 nCell += nAdd;

	18552 }

	18553

	18554 /* Add any overflow cells */

	18555 for(i=0; i<pPg->nOverflow; i++){

	18556 int iCell = (iOld + pPg->aiOvfl[i]) - iNew;

	18557 if( iCell>=0 && iCell<nNew ){

	18558 pCellptr = &pPg->aCellIdx[iCell * 2];

	18559 memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);

	18560 nCell++;

	18561 if( pageInsertArray(

	18562 pPg, pBegin, &pData, pCellptr,

	18563 iCell+iNew, 1, pCArray

	18564 ) ) goto editpage_fail;

	18565 }

	18566 }

	18567

	18568 /* Append cells to the end of the page */

	18569 pCellptr = &pPg->aCellIdx[nCell*2];

	18570 if( pageInsertArray(

	18571 pPg, pBegin, &pData, pCellptr,

	18572 iNew+nCell, nNew-nCell, pCArray

	18573 ) ) goto editpage_fail;

	18574

	18575 pPg->nCell = nNew;

	18576 pPg->nOverflow = 0;

	18577

	18578 put2byte(&aData[hdr+3], pPg->nCell);

	18579 put2byte(&aData[hdr+5], pData - aData);

	18580

	18581 #ifdef SQLITE_DEBUG

	18582 for(i=0; i<nNew && !CORRUPT_DB; i++){

	18583 u8 *pCell = pCArray->apCell[i+iNew];

	18584 int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);

	18585 if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){

	18586 pCell = &pTmp[pCell - aData];

	18587 }

	18588 assert( 0==memcmp(pCell, &aData[iOff],

	18589 pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );

	18590 }

	18591 #endif

	18592

	18593 return SQLITE_OK;

	18594 editpage_fail:

	18595 /* Unable to edit this page. Rebuild it from scratch instead. */

	18596 populateCellCache(pCArray, iNew, nNew);

	18597 return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]);

	18598 }

	18599

	/*
	** The following parameters determine how many adjacent pages get involved
	** in a balancing operation.  NN is the number of neighbors on either side
	** of the page that participate in the balancing operation.  NB is the
	** total number of pages that participate, including the target page and
	** NN neighbors on either side.
	**
	** The minimum value of NN is 1 (of course).  Increasing NN above 1
	** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
	** in exchange for a larger degradation in INSERT and UPDATE performance.
	** The value of NN chosen below (1) appears to give the best results
	** overall.
	*/
	#define NN 1             /* Number of neighbors on either side of pPage */
	#define NB (NN*2+1)      /* Total pages involved in the balance */

	18614

	18615

	18616 #ifndef SQLITE_OMIT_QUICKBALANCE

	18617 /*

	18618 ** This version of balance() handles the common special case where

	18619 ** a new entry is being inserted on the extreme right-end of the

	18620 ** tree, in other words, when the new entry will become the largest

	18621 ** entry in the tree.

	18622 **

	18623 ** Instead of trying to balance the 3 right-most leaf pages, just add

	18624 ** a new page to the right-hand side and put the one new entry in

	18625 ** that page. This leaves the right side of the tree somewhat

	18626 ** unbalanced. But odds are that we will be inserting new entries

	18627 ** at the end soon afterwards so the nearly empty page will quickly

	18628 ** fill up. On average.

	18629 **

	18630 ** pPage is the leaf page which is the right-most page in the tree.

	18631 ** pParent is its parent. pPage must have a single overflow entry

	18632 ** which is also the right-most entry on the page.

	18633 **

	18634 ** The pSpace buffer is used to store a temporary copy of the divider

	18635 ** cell that will be inserted into pParent. Such a cell consists of a 4

	18636 ** byte page number followed by a variable length integer. In other

	18637 ** words, at most 13 bytes. Hence the pSpace buffer must be at

	18638 ** least 13 bytes in size.

	18639 */

	18640 static int balance_quick(MemPage pParent, MemPage pPage, u8 *pSpace){

	18641 BtShared const pBt = pPage->pBt; / B-Tree Database */

	18642 MemPage pNew; / Newly allocated page */

	18643 int rc; /* Return Code */

	18644 Pgno pgnoNew; /* Page number of pNew */

	18645

	18646 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

	18647 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

	18648 assert( pPage->nOverflow==1 );

	18649

	18650 /* This error condition is now caught prior to reaching this function */

	18651 if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;

	18652

	18653 /* Allocate a new page. This page will become the right-sibling of

	18654 ** pPage. Make the parent page writable, so that the new divider cell

	18655 ** may be inserted. If both these operations are successful, proceed.

	18656 */

	18657 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);

	18658

	18659 if( rc==SQLITE_OK ){

	18660

	18661 u8 *pOut = &pSpace[4];

	18662 u8 *pCell = pPage->apOvfl[0];

	18663 u16 szCell = pPage->xCellSize(pPage, pCell);

	18664 u8 *pStop;

	18665

	18666 assert( sqlite3PagerIswriteable(pNew->pDbPage) );

	18667 assert( pPage->aData[0]==(PTF_INTKEY\|PTF_LEAFDATA\|PTF_LEAF) );

	18668 zeroPage(pNew, PTF_INTKEY\|PTF_LEAFDATA\|PTF_LEAF);

	18669 rc = rebuildPage(pNew, 1, &pCell, &szCell);

	18670 if( NEVER(rc) ) return rc;

	18671 pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;

	18672

	18673 /* If this is an auto-vacuum database, update the pointer map

	18674 ** with entries for the new page, and any pointer from the

	18675 ** cell on the page to an overflow page. If either of these

	18676 ** operations fails, the return code is set, but the contents

	18677 ** of the parent page are still manipulated by thh code below.

	18678 ** That is Ok, at this point the parent page is guaranteed to

	18679 ** be marked as dirty. Returning an error code will cause a

	18680 ** rollback, undoing any changes made to the parent page.

	18681 */

	18682 if( ISAUTOVACUUM ){

	18683 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);

	18684 if( szCell>pNew->minLocal ){

	18685 ptrmapPutOvflPtr(pNew, pCell, &rc);

	18686 }

	18687 }

	18688

	18689 /* Create a divider cell to insert into pParent. The divider cell

	18690 ** consists of a 4-byte page number (the page number of pPage) and

	18691 ** a variable length key value (which must be the same value as the

	18692 ** largest key on pPage).

	18693 **

	18694 ** To find the largest key value on pPage, first find the right-most

	18695 ** cell on pPage. The first two fields of this cell are the

	18696 ** record-length (a variable length integer at most 32-bits in size)

	18697 ** and the key value (a variable length integer, may have any value).

	18698 ** The first of the while(...) loops below skips over the record-length

	18699 ** field. The second while(...) loop copies the key value from the

	18700 ** cell on pPage into the pSpace buffer.

	18701 */

	18702 pCell = findCell(pPage, pPage->nCell-1);

	18703 pStop = &pCell[9];

	18704 while( (*(pCell++)&0x80) && pCell<pStop );

	18705 pStop = &pCell[9];

	18706 while( (((pOut++) = (pCell++))&0x80) && pCell<pStop );

	18707

	18708 /* Insert the new divider cell into pParent. */

	18709 insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),

	18710 0, pPage->pgno, &rc);

	18711

	18712 /* Set the right-child pointer of pParent to point to the new page. */

	18713 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);

	18714

	18715 /* Release the reference to the new page. */

	18716 releasePage(pNew);

	18717 }

	18718

	18719 return rc;

	18720 }

	18721 #endif /* SQLITE_OMIT_QUICKBALANCE */

	18722

	18723 #if 0

	18724 /*

	18725 ** This function does not contribute anything to the operation of SQLite.

	18726 ** it is sometimes activated temporarily while debugging code responsible

	18727 ** for setting pointer-map entries.

	18728 */

	18729 static int ptrmapCheckPages(MemPage **apPage, int nPage){

	18730 int i, j;

	18731 for(i=0; i<nPage; i++){

	18732 Pgno n;

	18733 u8 e;

	18734 MemPage *pPage = apPage[i];

	18735 BtShared *pBt = pPage->pBt;

	18736 assert( pPage->isInit );

	18737

	18738 for(j=0; j<pPage->nCell; j++){

	18739 CellInfo info;

	18740 u8 *z;

	18741

	18742 z = findCell(pPage, j);

	18743 pPage->xParseCell(pPage, z, &info);

	18744 if( info.nLocal<info.nPayload ){

	18745 Pgno ovfl = get4byte(&z[info.nSize-4]);

	18746 ptrmapGet(pBt, ovfl, &e, &n);

	18747 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );

	18748 }

	18749 if( !pPage->leaf ){

	18750 Pgno child = get4byte(z);

	18751 ptrmapGet(pBt, child, &e, &n);

	18752 assert( n==pPage->pgno && e==PTRMAP_BTREE );

	18753 }

	18754 }

	18755 if( !pPage->leaf ){

	18756 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);

	18757 ptrmapGet(pBt, child, &e, &n);

	18758 assert( n==pPage->pgno && e==PTRMAP_BTREE );

	18759 }

	18760 }

	18761 return 1;

	18762 }

	18763 #endif

	18764

	18765 /*

	18766 ** This function is used to copy the contents of the b-tree node stored

	18767 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then

	18768 ** the pointer-map entries for each child page are updated so that the

	18769 ** parent page stored in the pointer map is page pTo. If pFrom contained

	18770 ** any cells with overflow page pointers, then the corresponding pointer

	18771 ** map entries are also updated so that the parent page is page pTo.

	18772 **

	18773 ** If pFrom is currently carrying any overflow cells (entries in the

	18774 ** MemPage.apOvfl[] array), they are not copied to pTo.

	18775 **

	18776 ** Before returning, page pTo is reinitialized using btreeInitPage().

	18777 **

	18778 ** The performance of this function is not critical. It is only used by

	18779 ** the balance_shallower() and balance_deeper() procedures, neither of

	18780 ** which are called often under normal circumstances.

	18781 */

	18782 static void copyNodeContent(MemPage pFrom, MemPage pTo, int *pRC){

	18783 if( (*pRC)==SQLITE_OK ){

	18784 BtShared * const pBt = pFrom->pBt;

	18785 u8 * const aFrom = pFrom->aData;

	18786 u8 * const aTo = pTo->aData;

	18787 int const iFromHdr = pFrom->hdrOffset;

	18788 int const iToHdr = ((pTo->pgno==1) ? 100 : 0);

	18789 int rc;

	18790 int iData;

	18791

	18792

	18793 assert( pFrom->isInit );

	18794 assert( pFrom->nFree>=iToHdr );

	18795 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );

	18796

	18797 /* Copy the b-tree node content from page pFrom to page pTo. */

	18798 iData = get2byte(&aFrom[iFromHdr+5]);

	18799 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);

	18800 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);

	18801

	18802 /* Reinitialize page pTo so that the contents of the MemPage structure

	18803 ** match the new data. The initialization of pTo can actually fail under

	18804 ** fairly obscure circumstances, even though it is a copy of initialized

	18805 ** page pFrom.

	18806 */

	18807 pTo->isInit = 0;

	18808 rc = btreeInitPage(pTo);

	18809 if( rc!=SQLITE_OK ){

	18810 *pRC = rc;

	18811 return;

	18812 }

	18813

	18814 /* If this is an auto-vacuum database, update the pointer-map entries

	18815 ** for any b-tree or overflow pages that pTo now contains the pointers to.

	18816 */

	18817 if( ISAUTOVACUUM ){

	18818 *pRC = setChildPtrmaps(pTo);

	18819 }

	18820 }

	18821 }

	18822

	18823 /*

	18824 ** This routine redistributes cells on the iParentIdx'th child of pParent

	18825 ** (hereafter "the page") and up to 2 siblings so that all pages have about the

	18826 ** same amount of free space. Usually a single sibling on either side of the

	18827 ** page are used in the balancing, though both siblings might come from one

	18828 ** side if the page is the first or last child of its parent. If the page

	18829 ** has fewer than 2 siblings (something which can only happen if the page

	18830 ** is a root page or a child of a root page) then all available siblings

	18831 ** participate in the balancing.

	18832 **

	18833 ** The number of siblings of the page might be increased or decreased by

	18834 ** one or two in an effort to keep pages nearly full but not over full.

	18835 **

	18836 ** Note that when this routine is called, some of the cells on the page

	18837 ** might not actually be stored in MemPage.aData[]. This can happen

	18838 ** if the page is overfull. This routine ensures that all cells allocated

	18839 ** to the page and its siblings fit into MemPage.aData[] before returning.

	18840 **

	18841 ** In the course of balancing the page and its siblings, cells may be

	18842 ** inserted into or removed from the parent page (pParent). Doing so

	18843 ** may cause the parent page to become overfull or underfull. If this

	18844 ** happens, it is the responsibility of the caller to invoke the correct

	18845 ** balancing routine to fix this problem (see the balance() routine).

	18846 **

	18847 ** If this routine fails for any reason, it might leave the database

	18848 ** in a corrupted state. So if this routine fails, the database should

	18849 ** be rolled back.

	18850 **

	18851 ** The third argument to this function, aOvflSpace, is a pointer to a

	18852 ** buffer big enough to hold one page. If while inserting cells into the parent

	18853 ** page (pParent) the parent page becomes overfull, this buffer is

	18854 ** used to store the parent's overflow cells. Because this function inserts

	18855 ** a maximum of four divider cells into the parent page, and the maximum

	18856 ** size of a cell stored within an internal node is always less than 1/4

	18857 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large

	18858 ** enough for all overflow cells.

	18859 **

	18860 ** If aOvflSpace is set to a null pointer, this function returns

	18861 ** SQLITE_NOMEM.

	18862 */

	18863 static int balance_nonroot(

	18864 MemPage pParent, / Parent page of siblings being balanced */

	18865 int iParentIdx, /* Index of "the page" in pParent */

	18866 u8 aOvflSpace, / page-size bytes of space for parent ovfl */

	18867 int isRoot, /* True if pParent is a root-page */

	18868 int bBulk /* True if this call is part of a bulk load */

	18869 ){

	18870 BtShared pBt; / The whole database */

	18871 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */

	18872 int nNew = 0; /* Number of pages in apNew[] */

	18873 int nOld; /* Number of pages in apOld[] */

	18874 int i, j, k; /* Loop counters */

	18875 int nxDiv; /* Next divider slot in pParent->aCell[] */

	18876 int rc = SQLITE_OK; /* The return code */

	18877 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */

	18878 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */

	18879 int usableSpace; /* Bytes in pPage beyond the header */

	18880 int pageFlags; /* Value of pPage->aData[0] */

	18881 int iSpace1 = 0; /* First unused byte of aSpace1[] */

	18882 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */

	18883 int szScratch; /* Size of scratch memory requested */

	18884 MemPage apOld[NB]; / pPage and up to two siblings */

	18885 MemPage apNew[NB+2]; / pPage and up to NB siblings after balancing */

	18886 u8 pRight; / Location in parent of right-sibling pointer */

	18887 u8 apDiv[NB-1]; / Divider cells in pParent */

	18888 int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */

	18889 int cntOld[NB+2]; /* Old index in b.apCell[] */

	18890 int szNew[NB+2]; /* Combined size of cells placed on i-th page */

	18891 u8 aSpace1; / Space for copies of dividers cells */

	18892 Pgno pgno; /* Temp var to store a page number in */

	18893 u8 abDone[NB+2]; /* True after i'th new page is populated */

	18894 Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */

	18895 Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */

	18896 u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */

	18897 CellArray b; /* Parsed information on cells being balanced */

	18898

	18899 memset(abDone, 0, sizeof(abDone));

	18900 b.nCell = 0;

	18901 b.apCell = 0;

	18902 pBt = pParent->pBt;

	18903 assert( sqlite3_mutex_held(pBt->mutex) );

	18904 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

	18905

	18906 #if 0

	18907 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));

	18908 #endif

	18909

	18910 /* At this point pParent may have at most one overflow cell. And if

	18911 ** this overflow cell is present, it must be the cell with

	18912 ** index iParentIdx. This scenario comes about when this function

	18913 ** is called (indirectly) from sqlite3BtreeDelete().

	18914 */

	18915 assert( pParent->nOverflow==0 \|\| pParent->nOverflow==1 );

	18916 assert( pParent->nOverflow==0 \|\| pParent->aiOvfl[0]==iParentIdx );

	18917

	18918 if( !aOvflSpace ){

	18919 return SQLITE_NOMEM;

	18920 }

	18921

	18922 /* Find the sibling pages to balance. Also locate the cells in pParent

	18923 ** that divide the siblings. An attempt is made to find NN siblings on

	18924 ** either side of pPage. More siblings are taken from one side, however,

	18925 ** if there are fewer than NN siblings on the other side. If pParent

	18926 ** has NB or fewer children then all children of pParent are taken.

	18927 **

	18928 ** This loop also drops the divider cells from the parent page. This

	18929 ** way, the remainder of the function does not have to deal with any

	18930 ** overflow cells in the parent page, since if any existed they will

	18931 ** have already been removed.

	18932 */

	18933 i = pParent->nOverflow + pParent->nCell;

	18934 if( i<2 ){

	18935 nxDiv = 0;

	18936 }else{

	18937 assert( bBulk==0 \|\| bBulk==1 );

	18938 if( iParentIdx==0 ){

	18939 nxDiv = 0;

	18940 }else if( iParentIdx==i ){

	18941 nxDiv = i-2+bBulk;

	18942 }else{

	18943 nxDiv = iParentIdx-1;

	18944 }

	18945 i = 2-bBulk;

	18946 }

	18947 nOld = i+1;

	18948 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){

	18949 pRight = &pParent->aData[pParent->hdrOffset+8];

	18950 }else{

	18951 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);

	18952 }

	18953 pgno = get4byte(pRight);

	18954 while( 1 ){

	18955 rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);

	18956 if( rc ){

	18957 memset(apOld, 0, (i+1)sizeof(MemPage));

	18958 goto balance_cleanup;

	18959 }

	18960 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;

	18961 if( (i--)==0 ) break;

	18962

	18963 if( i+nxDiv==pParent->aiOvfl[0] && pParent->nOverflow ){

	18964 apDiv[i] = pParent->apOvfl[0];

	18965 pgno = get4byte(apDiv[i]);

	18966 szNew[i] = pParent->xCellSize(pParent, apDiv[i]);

	18967 pParent->nOverflow = 0;

	18968 }else{

	18969 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);

	18970 pgno = get4byte(apDiv[i]);

	18971 szNew[i] = pParent->xCellSize(pParent, apDiv[i]);

	18972

	18973 /* Drop the cell from the parent page. apDiv[i] still points to

	18974 ** the cell within the parent, even though it has been dropped.

	18975 ** This is safe because dropping a cell only overwrites the first

	18976 ** four bytes of it, and this function does not need the first

	18977 ** four bytes of the divider cell. So the pointer is safe to use

	18978 ** later on.

	18979 **

	18980 ** But not if we are in secure-delete mode. In secure-delete mode,

	18981 ** the dropCell() routine will overwrite the entire cell with zeroes.

	18982 ** In this case, temporarily copy the cell into the aOvflSpace[]

	18983 ** buffer. It will be copied out again as soon as the aSpace[] buffer

	18984 ** is allocated. */

	18985 if( pBt->btsFlags & BTS_SECURE_DELETE ){

	18986 int iOff;

	18987

	18988 iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);

	18989 if( (iOff+szNew[i])>(int)pBt->usableSize ){

	18990 rc = SQLITE_CORRUPT_BKPT;

	18991 memset(apOld, 0, (i+1)sizeof(MemPage));

	18992 goto balance_cleanup;

	18993 }else{

	18994 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);

	18995 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];

	18996 }

	18997 }

	18998 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);

	18999 }

	19000 }

	19001

	19002 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte

	19003 ** alignment */

	19004 nMaxCells = (nMaxCells + 3)&~3;

	19005

	19006 /*

	19007 ** Allocate space for memory structures

	19008 */

	19009 szScratch =

	19010 nMaxCellssizeof(u8) /* b.apCell */

	19011 + nMaxCellssizeof(u16) / b.szCell */

	19012 + pBt->pageSize; /* aSpace1 */

	19013

	19014 /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer

	19015 ** that is more than 6 times the database page size. */

	19016 assert( szScratch<=6*(int)pBt->pageSize );

	19017 b.apCell = sqlite3ScratchMalloc( szScratch );

	19018 if( b.apCell==0 ){

	19019 rc = SQLITE_NOMEM;

	19020 goto balance_cleanup;

	19021 }

	19022 b.szCell = (u16*)&b.apCell[nMaxCells];

	19023 aSpace1 = (u8*)&b.szCell[nMaxCells];

	19024 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );

	19025

	19026 /*

	19027 ** Load pointers to all cells on sibling pages and the divider cells

	19028 ** into the local b.apCell[] array. Make copies of the divider cells

	19029 ** into space obtained from aSpace1[]. The divider cells have already

	19030 ** been removed from pParent.

	19031 **

	19032 ** If the siblings are on leaf pages, then the child pointers of the

	19033 ** divider cells are stripped from the cells before they are copied

	19034 ** into aSpace1[]. In this way, all cells in b.apCell[] are without

	19035 ** child pointers. If siblings are not leaves, then all cell in

	19036 ** b.apCell[] include child pointers. Either way, all cells in b.apCell[]

	19037 ** are alike.

	19038 **

	19039 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.

	19040 ** leafData: 1 if pPage holds key+data and pParent holds only keys.

	19041 */

	19042 b.pRef = apOld[0];

	19043 leafCorrection = b.pRef->leaf*4;

	19044 leafData = b.pRef->intKeyLeaf;

	19045 for(i=0; i<nOld; i++){

	19046 MemPage *pOld = apOld[i];

	19047 int limit = pOld->nCell;

	19048 u8 *aData = pOld->aData;

	19049 u16 maskPage = pOld->maskPage;

	19050 u8 *piCell = aData + pOld->cellOffset;

	19051 u8 *piEnd;

	19052

	19053 /* Verify that all sibling pages are of the same "type" (table-leaf,

	19054 ** table-interior, index-leaf, or index-interior).

	19055 */

	19056 if( pOld->aData[0]!=apOld[0]->aData[0] ){

	19057 rc = SQLITE_CORRUPT_BKPT;

	19058 goto balance_cleanup;

	19059 }

	19060

	19061 /* Load b.apCell[] with pointers to all cells in pOld. If pOld

	19062 ** constains overflow cells, include them in the b.apCell[] array

	19063 ** in the correct spot.

	19064 **

	19065 ** Note that when there are multiple overflow cells, it is always the

	19066 ** case that they are sequential and adjacent. This invariant arises

	19067 ** because multiple overflows can only occurs when inserting divider

	19068 ** cells into a parent on a prior balance, and divider cells are always

	19069 ** adjacent and are inserted in order. There is an assert() tagged

	19070 ** with "NOTE 1" in the overflow cell insertion loop to prove this

	19071 ** invariant.

	19072 **

	19073 ** This must be done in advance. Once the balance starts, the cell

	19074 ** offset section of the btree page will be overwritten and we will no

	19075 ** long be able to find the cells if a pointer to each cell is not saved

	19076 ** first.

	19077 */

	19078 memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*limit);

	19079 if( pOld->nOverflow>0 ){

	19080 memset(&b.szCell[b.nCell+limit], 0, sizeof(b.szCell[0])*pOld->nOverflow);

	19081 limit = pOld->aiOvfl[0];

	19082 for(j=0; j<limit; j++){

	19083 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));

	19084 piCell += 2;

	19085 b.nCell++;

	19086 }

	19087 for(k=0; k<pOld->nOverflow; k++){

	19088 assert( k==0 \|\| pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */

	19089 b.apCell[b.nCell] = pOld->apOvfl[k];

	19090 b.nCell++;

	19091 }

	19092 }

	19093 piEnd = aData + pOld->cellOffset + 2*pOld->nCell;

	19094 while( piCell<piEnd ){

	19095 assert( b.nCell<nMaxCells );

	19096 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));

	19097 piCell += 2;

	19098 b.nCell++;

	19099 }

	19100

	19101 cntOld[i] = b.nCell;

	19102 if( i<nOld-1 && !leafData){

	19103 u16 sz = (u16)szNew[i];

	19104 u8 *pTemp;

	19105 assert( b.nCell<nMaxCells );

	19106 b.szCell[b.nCell] = sz;

	19107 pTemp = &aSpace1[iSpace1];

	19108 iSpace1 += sz;

	19109 assert( sz<=pBt->maxLocal+23 );

	19110 assert( iSpace1 <= (int)pBt->pageSize );

	19111 memcpy(pTemp, apDiv[i], sz);

	19112 b.apCell[b.nCell] = pTemp+leafCorrection;

	19113 assert( leafCorrection==0 \|\| leafCorrection==4 );

	19114 b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;

	19115 if( !pOld->leaf ){

	19116 assert( leafCorrection==0 );

	19117 assert( pOld->hdrOffset==0 );

	19118 /* The right pointer of the child page pOld becomes the left

	19119 ** pointer of the divider cell */

	19120 memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);

	19121 }else{

	19122 assert( leafCorrection==4 );

	19123 while( b.szCell[b.nCell]<4 ){

	19124 /* Do not allow any cells smaller than 4 bytes. If a smaller cell

	19125 ** does exist, pad it with 0x00 bytes. */

	19126 assert( b.szCell[b.nCell]==3 \|\| CORRUPT_DB );

	19127 assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] \|\| CORRUPT_DB );

	19128 aSpace1[iSpace1++] = 0x00;

	19129 b.szCell[b.nCell]++;

	19130 }

	19131 }

	19132 b.nCell++;

	19133 }

	19134 }

	19135

	19136 /*

	19137 ** Figure out the number of pages needed to hold all b.nCell cells.

	19138 ** Store this number in "k". Also compute szNew[] which is the total

	19139 ** size of all cells on the i-th page and cntNew[] which is the index

	19140 ** in b.apCell[] of the cell that divides page i from page i+1.

	19141 ** cntNew[k] should equal b.nCell.

	19142 **

	19143 ** Values computed by this block:

	19144 **

	19145 ** k: The total number of sibling pages

	19146 ** szNew[i]: Spaced used on the i-th sibling page.

	19147 ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to

	19148 ** the right of the i-th sibling page.

	19149 ** usableSpace: Number of bytes of space available on each sibling.

	19150 **

	19151 */

	19152 usableSpace = pBt->usableSize - 12 + leafCorrection;

	19153 for(i=0; i<nOld; i++){

	19154 MemPage *p = apOld[i];

	19155 szNew[i] = usableSpace - p->nFree;

	19156 if( szNew[i]<0 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }

	19157 for(j=0; j<p->nOverflow; j++){

	19158 szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);

	19159 }

	19160 cntNew[i] = cntOld[i];

	19161 }

	19162 k = nOld;

	19163 for(i=0; i<k; i++){

	19164 int sz;

	19165 while( szNew[i]>usableSpace ){

	19166 if( i+1>=k ){

	19167 k = i+2;

	19168 if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }

	19169 szNew[k-1] = 0;

	19170 cntNew[k-1] = b.nCell;

	19171 }

	19172 sz = 2 + cachedCellSize(&b, cntNew[i]-1);

	19173 szNew[i] -= sz;

	19174 if( !leafData ){

	19175 if( cntNew[i]<b.nCell ){

	19176 sz = 2 + cachedCellSize(&b, cntNew[i]);

	19177 }else{

	19178 sz = 0;

	19179 }

	19180 }

	19181 szNew[i+1] += sz;

	19182 cntNew[i]--;

	19183 }

	19184 while( cntNew[i]<b.nCell ){

	19185 sz = 2 + cachedCellSize(&b, cntNew[i]);

	19186 if( szNew[i]+sz>usableSpace ) break;

	19187 szNew[i] += sz;

	19188 cntNew[i]++;

	19189 if( !leafData ){

	19190 if( cntNew[i]<b.nCell ){

	19191 sz = 2 + cachedCellSize(&b, cntNew[i]);

	19192 }else{

	19193 sz = 0;

	19194 }

	19195 }

	19196 szNew[i+1] -= sz;

	19197 }

	19198 if( cntNew[i]>=b.nCell ){

	19199 k = i+1;

	19200 }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){

	19201 rc = SQLITE_CORRUPT_BKPT;

	19202 goto balance_cleanup;

	19203 }

	19204 }

	19205

	19206 /*

	19207 ** The packing computed by the previous block is biased toward the siblings

	19208 ** on the left side (siblings with smaller keys). The left siblings are

	19209 ** always nearly full, while the right-most sibling might be nearly empty.

	19210 ** The next block of code attempts to adjust the packing of siblings to

	19211 ** get a better balance.

	19212 **

	19213 ** This adjustment is more than an optimization. The packing above might

	19214 ** be so out of balance as to be illegal. For example, the right-most

	19215 ** sibling might be completely empty. This adjustment is not optional.

	19216 */

	19217 for(i=k-1; i>0; i--){

	19218 int szRight = szNew[i]; /* Size of sibling on the right */

	19219 int szLeft = szNew[i-1]; /* Size of sibling on the left */

	19220 int r; /* Index of right-most cell in left sibling */

	19221 int d; /* Index of first cell to the left of right sibling */

	19222

	19223 r = cntNew[i-1] - 1;

	19224 d = r + 1 - leafData;

	19225 (void)cachedCellSize(&b, d);

	19226 do{

	19227 assert( d<nMaxCells );

	19228 assert( r<nMaxCells );

	19229 (void)cachedCellSize(&b, r);

	19230 if( szRight!=0

	19231 && (bBulk \|\| szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+2)) ){

	19232 break;

	19233 }

	19234 szRight += b.szCell[d] + 2;

	19235 szLeft -= b.szCell[r] + 2;

	19236 cntNew[i-1] = r;

	19237 r--;

	19238 d--;

	19239 }while( r>=0 );

	19240 szNew[i] = szRight;

	19241 szNew[i-1] = szLeft;

	19242 if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){

	19243 rc = SQLITE_CORRUPT_BKPT;

	19244 goto balance_cleanup;

	19245 }

	19246 }

	19247

	19248 /* Sanity check: For a non-corrupt database file one of the following

	19249 ** must be true:

	19250 ** (1) We found one or more cells (cntNew[0]>0), or

	19251 ** (2) pPage is a virtual root page. A virtual root page is when

	19252 ** the real root page is page 1 and we are the only child of

	19253 ** that page.

	19254 */

	19255 assert( cntNew[0]>0 \|\| (pParent->pgno==1 && pParent->nCell==0) \|\| CORRUPT_DB);

	19256 TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",

	19257 apOld[0]->pgno, apOld[0]->nCell,

	19258 nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,

	19259 nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0

	19260 ));

	19261

	19262 /*

	19263 ** Allocate k new pages. Reuse old pages where possible.

	19264 */

	19265 pageFlags = apOld[0]->aData[0];

	19266 for(i=0; i<k; i++){

	19267 MemPage *pNew;

	19268 if( i<nOld ){

	19269 pNew = apNew[i] = apOld[i];

	19270 apOld[i] = 0;

	19271 rc = sqlite3PagerWrite(pNew->pDbPage);

	19272 nNew++;

	19273 if( rc ) goto balance_cleanup;

	19274 }else{

	19275 assert( i>0 );

	19276 rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);

	19277 if( rc ) goto balance_cleanup;

	19278 zeroPage(pNew, pageFlags);

	19279 apNew[i] = pNew;

	19280 nNew++;

	19281 cntOld[i] = b.nCell;

	19282

	19283 /* Set the pointer-map entry for the new sibling page. */

	19284 if( ISAUTOVACUUM ){

	19285 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);

	19286 if( rc!=SQLITE_OK ){

	19287 goto balance_cleanup;

	19288 }

	19289 }

	19290 }

	19291 }

	19292

	19293 /*

	19294 ** Reassign page numbers so that the new pages are in ascending order.

	19295 ** This helps to keep entries in the disk file in order so that a scan

	19296 ** of the table is closer to a linear scan through the file. That in turn

	19297 ** helps the operating system to deliver pages from the disk more rapidly.

	19298 **

	19299 ** An O(n^2) insertion sort algorithm is used, but since n is never more

	19300 ** than (NB+2) (a small constant), that should not be a problem.

	19301 **

	19302 ** When NB==3, this one optimization makes the database about 25% faster

	19303 ** for large insertions and deletions.

	19304 */

	19305 for(i=0; i<nNew; i++){

	19306 aPgOrder[i] = aPgno[i] = apNew[i]->pgno;

	19307 aPgFlags[i] = apNew[i]->pDbPage->flags;

	19308 for(j=0; j<i; j++){

	19309 if( aPgno[j]==aPgno[i] ){

	19310 /* This branch is taken if the set of sibling pages somehow contains

	19311 ** duplicate entries. This can happen if the database is corrupt.

	19312 ** It would be simpler to detect this as part of the loop below, but

	19313 ** we do the detection here in order to avoid populating the pager

	19314 ** cache with two separate objects associated with the same

	19315 ** page number. */

	19316 assert( CORRUPT_DB );

	19317 rc = SQLITE_CORRUPT_BKPT;

	19318 goto balance_cleanup;

	19319 }

	19320 }

	19321 }

	19322 for(i=0; i<nNew; i++){

	19323 int iBest = 0; /* aPgno[] index of page number to use */

	19324 for(j=1; j<nNew; j++){

	19325 if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;

	19326 }

	19327 pgno = aPgOrder[iBest];

	19328 aPgOrder[iBest] = 0xffffffff;

	19329 if( iBest!=i ){

	19330 if( iBest>i ){

	19331 sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);

	19332 }

	19333 sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);

	19334 apNew[i]->pgno = pgno;

	19335 }

	19336 }

	19337

	19338 TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "

	19339 "%d(%d nc=%d) %d(%d nc=%d)\n",

	19340 apNew[0]->pgno, szNew[0], cntNew[0],

	19341 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,

	19342 nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,

	19343 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,

	19344 nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,

	19345 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,

	19346 nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,

	19347 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,

	19348 nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0

	19349 ));

	19350

	19351 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

	19352 put4byte(pRight, apNew[nNew-1]->pgno);

	19353

	19354 /* If the sibling pages are not leaves, ensure that the right-child pointer

	19355 ** of the right-most new sibling page is set to the value that was

	19356 ** originally in the same field of the right-most old sibling page. */

	19357 if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){

	19358 MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];

	19359 memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);

	19360 }

	19361

	19362 /* Make any required updates to pointer map entries associated with

	19363 ** cells stored on sibling pages following the balance operation. Pointer

	19364 ** map entries associated with divider cells are set by the insertCell()

	19365 ** routine. The associated pointer map entries are:

	19366 **

	19367 ** a) if the cell contains a reference to an overflow chain, the

	19368 ** entry associated with the first page in the overflow chain, and

	19369 **

	19370 ** b) if the sibling pages are not leaves, the child page associated

	19371 ** with the cell.

	19372 **

	19373 ** If the sibling pages are not leaves, then the pointer map entry

	19374 ** associated with the right-child of each sibling may also need to be

	19375 ** updated. This happens below, after the sibling pages have been

	19376 ** populated, not here.

	19377 */

	19378 if( ISAUTOVACUUM ){

	19379 MemPage *pNew = apNew[0];

	19380 u8 *aOld = pNew->aData;

	19381 int cntOldNext = pNew->nCell + pNew->nOverflow;

	19382 int usableSize = pBt->usableSize;

	19383 int iNew = 0;

	19384 int iOld = 0;

	19385

	19386 for(i=0; i<b.nCell; i++){

	19387 u8 *pCell = b.apCell[i];

	19388 if( i==cntOldNext ){

	19389 MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];

	19390 cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;

	19391 aOld = pOld->aData;

	19392 }

	19393 if( i==cntNew[iNew] ){

	19394 pNew = apNew[++iNew];

	19395 if( !leafData ) continue;

	19396 }

	19397

	19398 /* Cell pCell is destined for new sibling page pNew. Originally, it

	19399 ** was either part of sibling page iOld (possibly an overflow cell),

	19400 ** or else the divider cell to the left of sibling page iOld. So,

	19401 ** if sibling page iOld had the same page number as pNew, and if

	19402 ** pCell really was a part of sibling page iOld (not a divider or

	19403 ** overflow cell), we can skip updating the pointer map entries. */

	19404 if( iOld>=nNew

	19405 \|\| pNew->pgno!=aPgno[iOld]

	19406 \|\| !SQLITE_WITHIN(pCell,aOld,&aOld[usableSize])

	19407 ){

	19408 if( !leafCorrection ){

	19409 ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);

	19410 }

	19411 if( cachedCellSize(&b,i)>pNew->minLocal ){

	19412 ptrmapPutOvflPtr(pNew, pCell, &rc);

	19413 }

	19414 if( rc ) goto balance_cleanup;

	19415 }

	19416 }

	19417 }

	19418

	19419 /* Insert new divider cells into pParent. */

	19420 for(i=0; i<nNew-1; i++){

	19421 u8 *pCell;

	19422 u8 *pTemp;

	19423 int sz;

	19424 MemPage *pNew = apNew[i];

	19425 j = cntNew[i];

	19426

	19427 assert( j<nMaxCells );

	19428 assert( b.apCell[j]!=0 );

	19429 pCell = b.apCell[j];

	19430 sz = b.szCell[j] + leafCorrection;

	19431 pTemp = &aOvflSpace[iOvflSpace];

	19432 if( !pNew->leaf ){

	19433 memcpy(&pNew->aData[8], pCell, 4);

	19434 }else if( leafData ){

	19435 /* If the tree is a leaf-data tree, and the siblings are leaves,

	19436 ** then there is no divider cell in b.apCell[]. Instead, the divider

	19437 ** cell consists of the integer key for the right-most cell of

	19438 ** the sibling-page assembled above only.

	19439 */

	19440 CellInfo info;

	19441 j--;

	19442 pNew->xParseCell(pNew, b.apCell[j], &info);

	19443 pCell = pTemp;

	19444 sz = 4 + putVarint(&pCell[4], info.nKey);

	19445 pTemp = 0;

	19446 }else{

	19447 pCell -= 4;

	19448 /* Obscure case for non-leaf-data trees: If the cell at pCell was

	19449 ** previously stored on a leaf node, and its reported size was 4

	19450 ** bytes, then it may actually be smaller than this

	19451 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of

	19452 ** any cell). But it is important to pass the correct size to

	19453 ** insertCell(), so reparse the cell now.

	19454 **

	19455 ** Note that this can never happen in an SQLite data file, as all

	19456 ** cells are at least 4 bytes. It only happens in b-trees used

	19457 ** to evaluate "IN (SELECT ...)" and similar clauses.

	19458 */

	19459 if( b.szCell[j]==4 ){

	19460 assert(leafCorrection==4);

	19461 sz = pParent->xCellSize(pParent, pCell);

	19462 }

	19463 }

	19464 iOvflSpace += sz;

	19465 assert( sz<=pBt->maxLocal+23 );

	19466 assert( iOvflSpace <= (int)pBt->pageSize );

	19467 insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);

	19468 if( rc!=SQLITE_OK ) goto balance_cleanup;

	19469 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

	19470 }

	19471

	19472 /* Now update the actual sibling pages. The order in which they are updated

	19473 ** is important, as this code needs to avoid disrupting any page from which

	19474 ** cells may still need to be read. In practice, this means:

	19475 **

	19476 ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])

	19477 ** then it is not safe to update page apNew[iPg] until after

	19478 ** the left-hand sibling apNew[iPg-1] has been updated.

	19479 **

	19480 ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])

	19481 ** then it is not safe to update page apNew[iPg] until after

	19482 ** the right-hand sibling apNew[iPg+1] has been updated.

	19483 **

	19484 ** If neither of the above apply, the page is safe to update.

	19485 **

	19486 ** The iPg value in the following loop starts at nNew-1 goes down

	19487 ** to 0, then back up to nNew-1 again, thus making two passes over

	19488 ** the pages. On the initial downward pass, only condition (1) above

	19489 ** needs to be tested because (2) will always be true from the previous

	19490 ** step. On the upward pass, both conditions are always true, so the

	19491 ** upwards pass simply processes pages that were missed on the downward

	19492 ** pass.

	19493 */

	19494 for(i=1-nNew; i<nNew; i++){

	19495 int iPg = i<0 ? -i : i;

	19496 assert( iPg>=0 && iPg<nNew );

	19497 if( abDone[iPg] ) continue; /* Skip pages already processed */

	19498 if( i>=0 /* On the upwards pass, or... */

	19499 \|\| cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */

	19500 ){

	19501 int iNew;

	19502 int iOld;

	19503 int nNewCell;

	19504

	19505 /* Verify condition (1): If cells are moving left, update iPg

	19506 ** only after iPg-1 has already been updated. */

	19507 assert( iPg==0 \|\| cntOld[iPg-1]>=cntNew[iPg-1] \|\| abDone[iPg-1] );

	19508

	19509 /* Verify condition (2): If cells are moving right, update iPg

	19510 ** only after iPg+1 has already been updated. */

	19511 assert( cntNew[iPg]>=cntOld[iPg] \|\| abDone[iPg+1] );

	19512

	19513 if( iPg==0 ){

	19514 iNew = iOld = 0;

	19515 nNewCell = cntNew[0];

	19516 }else{

	19517 iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;

	19518 iNew = cntNew[iPg-1] + !leafData;

	19519 nNewCell = cntNew[iPg] - iNew;

	19520 }

	19521

	19522 rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);

	19523 if( rc ) goto balance_cleanup;

	19524 abDone[iPg]++;

	19525 apNew[iPg]->nFree = usableSpace-szNew[iPg];

	19526 assert( apNew[iPg]->nOverflow==0 );

	19527 assert( apNew[iPg]->nCell==nNewCell );

	19528 }

	19529 }

	19530

	19531 /* All pages have been processed exactly once */

	19532 assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );

	19533

	19534 assert( nOld>0 );

	19535 assert( nNew>0 );

	19536

	19537 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){

	19538 /* The root page of the b-tree now contains no cells. The only sibling

	19539 ** page is the right-child of the parent. Copy the contents of the

	19540 ** child page into the parent, decreasing the overall height of the

	19541 ** b-tree structure by one. This is described as the "balance-shallower"

	19542 ** sub-algorithm in some documentation.

	19543 **

	19544 ** If this is an auto-vacuum database, the call to copyNodeContent()

	19545 ** sets all pointer-map entries corresponding to database image pages

	19546 ** for which the pointer is stored within the content being copied.

	19547 **

	19548 ** It is critical that the child page be defragmented before being

	19549 ** copied into the parent, because if the parent is page 1 then it will

	19550 ** be smaller than the child due to the database header, and so all the

	19551 ** free space needs to be up front.

	19552 */

	19553 assert( nNew==1 \|\| CORRUPT_DB );

	19554 rc = defragmentPage(apNew[0]);

	19555 testcase( rc!=SQLITE_OK );

	19556 assert( apNew[0]->nFree ==

	19557 (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)

	19558 \|\| rc!=SQLITE_OK

	19559 );

	19560 copyNodeContent(apNew[0], pParent, &rc);

	19561 freePage(apNew[0], &rc);

	19562 }else if( ISAUTOVACUUM && !leafCorrection ){

	19563 /* Fix the pointer map entries associated with the right-child of each

	19564 ** sibling page. All other pointer map entries have already been taken

	19565 ** care of. */

	19566 for(i=0; i<nNew; i++){

	19567 u32 key = get4byte(&apNew[i]->aData[8]);

	19568 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);

	19569 }

	19570 }

	19571

	19572 assert( pParent->isInit );

	19573 TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",

	19574 nOld, nNew, b.nCell));

	19575

	19576 /* Free any old pages that were not reused as new pages.

	19577 */

	19578 for(i=nNew; i<nOld; i++){

	19579 freePage(apOld[i], &rc);

	19580 }

	19581

	19582 #if 0

	19583 if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){

	19584 /* The ptrmapCheckPages() contains assert() statements that verify that

	19585 ** all pointer map pages are set correctly. This is helpful while

	19586 ** debugging. This is usually disabled because a corrupt database may

	19587 ** cause an assert() statement to fail. */

	19588 ptrmapCheckPages(apNew, nNew);

	19589 ptrmapCheckPages(&pParent, 1);

	19590 }

	19591 #endif

	19592

	19593 /*

	19594 ** Cleanup before returning.

	19595 */

	19596 balance_cleanup:

	19597 sqlite3ScratchFree(b.apCell);

	19598 for(i=0; i<nOld; i++){

	19599 releasePage(apOld[i]);

	19600 }

	19601 for(i=0; i<nNew; i++){

	19602 releasePage(apNew[i]);

	19603 }

	19604

	19605 return rc;

	19606 }

	19607

	19608

	19609 /*

	19610 ** This function is called when the root page of a b-tree structure is

	19611 ** overfull (has one or more overflow pages).

	19612 **

	19613 ** A new child page is allocated and the contents of the current root

	19614 ** page, including overflow cells, are copied into the child. The root

	19615 ** page is then overwritten to make it an empty page with the right-child

	19616 ** pointer pointing to the new page.

	19617 **

	19618 ** Before returning, all pointer-map entries corresponding to pages

	19619 ** that the new child-page now contains pointers to are updated. The

	19620 ** entry corresponding to the new right-child pointer of the root

	19621 ** page is also updated.

	19622 **

	19623 ** If successful, *ppChild is set to contain a reference to the child

	19624 ** page and SQLITE_OK is returned. In this case the caller is required

	19625 ** to call releasePage() on *ppChild exactly once. If an error occurs,

	19626 ** an error code is returned and *ppChild is set to 0.

	19627 */

	19628 static int balance_deeper(MemPage pRoot, MemPage *ppChild){

	19629 int rc; /* Return value from subprocedures */

	19630 MemPage pChild = 0; / Pointer to a new child page */

	19631 Pgno pgnoChild = 0; /* Page number of the new child page */

	19632 BtShared pBt = pRoot->pBt; / The BTree */

	19633

	19634 assert( pRoot->nOverflow>0 );

	19635 assert( sqlite3_mutex_held(pBt->mutex) );

	19636

	19637 /* Make pRoot, the root page of the b-tree, writable. Allocate a new

	19638 ** page that will become the new right-child of pPage. Copy the contents

	19639 ** of the node stored on pRoot into the new child page.

	19640 */

	19641 rc = sqlite3PagerWrite(pRoot->pDbPage);

	19642 if( rc==SQLITE_OK ){

	19643 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);

	19644 copyNodeContent(pRoot, pChild, &rc);

	19645 if( ISAUTOVACUUM ){

	19646 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);

	19647 }

	19648 }

	19649 if( rc ){

	19650 *ppChild = 0;

	19651 releasePage(pChild);

	19652 return rc;

	19653 }

	19654 assert( sqlite3PagerIswriteable(pChild->pDbPage) );

	19655 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );

	19656 assert( pChild->nCell==pRoot->nCell );

	19657

	19658 TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));

	19659

	19660 /* Copy the overflow cells from pRoot to pChild */

	19661 memcpy(pChild->aiOvfl, pRoot->aiOvfl,

	19662 pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));

	19663 memcpy(pChild->apOvfl, pRoot->apOvfl,

	19664 pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));

	19665 pChild->nOverflow = pRoot->nOverflow;

	19666

	19667 /* Zero the contents of pRoot. Then install pChild as the right-child. */

	19668 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);

	19669 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);

	19670

	19671 *ppChild = pChild;

	19672 return SQLITE_OK;

	19673 }

	19674

	19675 /*

	19676 ** The page that pCur currently points to has just been modified in

	19677 ** some way. This function figures out if this modification means the

	19678 ** tree needs to be balanced, and if so calls the appropriate balancing

	19679 ** routine. Balancing routines are:

	19680 **

	19681 ** balance_quick()

	19682 ** balance_deeper()

	19683 ** balance_nonroot()

	19684 */

	19685 static int balance(BtCursor *pCur){

	19686 int rc = SQLITE_OK;

	19687 const int nMin = pCur->pBt->usableSize * 2 / 3;

	19688 u8 aBalanceQuickSpace[13];

	19689 u8 *pFree = 0;

	19690

	19691 TESTONLY( int balance_quick_called = 0 );

	19692 TESTONLY( int balance_deeper_called = 0 );

	19693

	19694 do {

	19695 int iPage = pCur->iPage;

	19696 MemPage *pPage = pCur->apPage[iPage];

	19697

	19698 if( iPage==0 ){

	19699 if( pPage->nOverflow ){

	19700 /* The root page of the b-tree is overfull. In this case call the

	19701 ** balance_deeper() function to create a new child for the root-page

	19702 ** and copy the current contents of the root-page to it. The

	19703 ** next iteration of the do-loop will balance the child page.

	19704 */

	19705 assert( (balance_deeper_called++)==0 );

	19706 rc = balance_deeper(pPage, &pCur->apPage[1]);

	19707 if( rc==SQLITE_OK ){

	19708 pCur->iPage = 1;

	19709 pCur->aiIdx[0] = 0;

	19710 pCur->aiIdx[1] = 0;

	19711 assert( pCur->apPage[1]->nOverflow );

	19712 }

	19713 }else{

	19714 break;

	19715 }

	19716 }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){

	19717 break;

	19718 }else{

	19719 MemPage * const pParent = pCur->apPage[iPage-1];

	19720 int const iIdx = pCur->aiIdx[iPage-1];

	19721

	19722 rc = sqlite3PagerWrite(pParent->pDbPage);

	19723 if( rc==SQLITE_OK ){

	19724 #ifndef SQLITE_OMIT_QUICKBALANCE

	19725 if( pPage->intKeyLeaf

	19726 && pPage->nOverflow==1

	19727 && pPage->aiOvfl[0]==pPage->nCell

	19728 && pParent->pgno!=1

	19729 && pParent->nCell==iIdx

	19730 ){

	19731 /* Call balance_quick() to create a new sibling of pPage on which

	19732 ** to store the overflow cell. balance_quick() inserts a new cell

	19733 ** into pParent, which may cause pParent overflow. If this

	19734 ** happens, the next iteration of the do-loop will balance pParent

	19735 ** use either balance_nonroot() or balance_deeper(). Until this

	19736 ** happens, the overflow cell is stored in the aBalanceQuickSpace[]

	19737 ** buffer.

	19738 **

	19739 ** The purpose of the following assert() is to check that only a

	19740 ** single call to balance_quick() is made for each call to this

	19741 ** function. If this were not verified, a subtle bug involving reuse

	19742 ** of the aBalanceQuickSpace[] might sneak in.

	19743 */

	19744 assert( (balance_quick_called++)==0 );

	19745 rc = balance_quick(pParent, pPage, aBalanceQuickSpace);

	19746 }else

	19747 #endif

	19748 {

	19749 /* In this case, call balance_nonroot() to redistribute cells

	19750 ** between pPage and up to 2 of its sibling pages. This involves

	19751 ** modifying the contents of pParent, which may cause pParent to

	19752 ** become overfull or underfull. The next iteration of the do-loop

	19753 ** will balance the parent page to correct this.

	19754 **

	19755 ** If the parent page becomes overfull, the overflow cell or cells

	19756 ** are stored in the pSpace buffer allocated immediately below.

	19757 ** A subsequent iteration of the do-loop will deal with this by

	19758 ** calling balance_nonroot() (balance_deeper() may be called first,

	19759 ** but it doesn't deal with overflow cells - just moves them to a

	19760 ** different page). Once this subsequent call to balance_nonroot()

	19761 ** has completed, it is safe to release the pSpace buffer used by

	19762 ** the previous call, as the overflow cell data will have been

	19763 ** copied either into the body of a database page or into the new

	19764 ** pSpace buffer passed to the latter call to balance_nonroot().

	19765 */

	19766 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);

	19767 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,

	19768 pCur->hints&BTREE_BULKLOAD);

	19769 if( pFree ){

	19770 /* If pFree is not NULL, it points to the pSpace buffer used

	19771 ** by a previous call to balance_nonroot(). Its contents are

	19772 ** now stored either on real database pages or within the

	19773 ** new pSpace buffer, so it may be safely freed here. */

	19774 sqlite3PageFree(pFree);

	19775 }

	19776

	19777 /* The pSpace buffer will be freed after the next call to

	19778 ** balance_nonroot(), or just before this function returns, whichever

	19779 ** comes first. */

	19780 pFree = pSpace;

	19781 }

	19782 }

	19783

	19784 pPage->nOverflow = 0;

	19785

	19786 /* The next iteration of the do-loop balances the parent page. */

	19787 releasePage(pPage);

	19788 pCur->iPage--;

	19789 assert( pCur->iPage>=0 );

	19790 }

	19791 }while( rc==SQLITE_OK );

	19792

	19793 if( pFree ){

	19794 sqlite3PageFree(pFree);

	19795 }

	19796 return rc;

	19797 }

	19798

	19799

	19800 /*

	19801 ** Insert a new record into the BTree. The key is given by (pKey,nKey)

	19802 ** and the data is given by (pData,nData). The cursor is used only to

	19803 ** define what table the record should be inserted into. The cursor

	19804 ** is left pointing at a random location.

	19805 **

	19806 ** For an INTKEY table, only the nKey value of the key is used. pKey is

	19807 ** ignored. For a ZERODATA table, the pData and nData are both ignored.

	19808 **

	19809 ** If the seekResult parameter is non-zero, then a successful call to

	19810 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already

	19811 ** been performed. seekResult is the search result returned (a negative

	19812 ** number if pCur points at an entry that is smaller than (pKey, nKey), or

	19813 ** a positive value if pCur points at an entry that is larger than

	19814 ** (pKey, nKey)).

	19815 **

	19816 ** If the seekResult parameter is non-zero, then the caller guarantees that

	19817 ** cursor pCur is pointing at the existing copy of a row that is to be

	19818 ** overwritten. If the seekResult parameter is 0, then cursor pCur may

	19819 ** point to any entry or to no entry at all and so this function has to seek

	19820 ** the cursor before the new key can be inserted.

	19821 */

	19822 SQLITE_PRIVATE int sqlite3BtreeInsert(

	19823 BtCursor pCur, / Insert data into the table of this cursor */

	19824 const void pKey, i64 nKey, / The key of the new record */

	19825 const void pData, int nData, / The data of the new record */

	19826 int nZero, /* Number of extra 0 bytes to append to data */

	19827 int appendBias, /* True if this is likely an append */

	19828 int seekResult /* Result of prior MovetoUnpacked() call */

	19829 ){

	19830 int rc;

	19831 int loc = seekResult; /* -1: before desired location +1: after */

	19832 int szNew = 0;

	19833 int idx;

	19834 MemPage *pPage;

	19835 Btree *p = pCur->pBtree;

	19836 BtShared *pBt = p->pBt;

	19837 unsigned char *oldCell;

	19838 unsigned char *newCell = 0;

	19839

	19840 if( pCur->eState==CURSOR_FAULT ){

	19841 assert( pCur->skipNext!=SQLITE_OK );

	19842 return pCur->skipNext;

	19843 }

	19844

	19845 assert( cursorHoldsMutex(pCur) );

	19846 assert( (pCur->curFlags & BTCF_WriteFlag)!=0

	19847 && pBt->inTransaction==TRANS_WRITE

	19848 && (pBt->btsFlags & BTS_READ_ONLY)==0 );

	19849 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );

	19850

	19851 /* Assert that the caller has been consistent. If this cursor was opened

	19852 ** expecting an index b-tree, then the caller should be inserting blob

	19853 ** keys with no associated data. If the cursor was opened expecting an

	19854 ** intkey table, the caller should be inserting integer keys with a

	19855 ** blob of associated data. */

	19856 assert( (pKey==0)==(pCur->pKeyInfo==0) );

	19857

	19858 /* Save the positions of any other cursors open on this table.

	19859 **

	19860 ** In some cases, the call to btreeMoveto() below is a no-op. For

	19861 ** example, when inserting data into a table with auto-generated integer

	19862 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the

	19863 ** integer key to use. It then calls this function to actually insert the

	19864 ** data into the intkey B-Tree. In this case btreeMoveto() recognizes

	19865 ** that the cursor is already where it needs to be and returns without

	19866 ** doing any work. To avoid thwarting these optimizations, it is important

	19867 ** not to clear the cursor here.

	19868 */

	19869 if( pCur->curFlags & BTCF_Multiple ){

	19870 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);

	19871 if( rc ) return rc;

	19872 }

	19873

	19874 if( pCur->pKeyInfo==0 ){

	19875 assert( pKey==0 );

	19876 /* If this is an insert into a table b-tree, invalidate any incrblob

	19877 ** cursors open on the row being replaced */

	19878 invalidateIncrblobCursors(p, nKey, 0);

	19879

	19880 /* If the cursor is currently on the last row and we are appending a

	19881 ** new row onto the end, set the "loc" to avoid an unnecessary

	19882 ** btreeMoveto() call */

	19883 if( (pCur->curFlags&BTCF_ValidNKey)!=0 && nKey>0

	19884 && pCur->info.nKey==nKey-1 ){

	19885 loc = -1;

	19886 }else if( loc==0 ){

	19887 rc = sqlite3BtreeMovetoUnpacked(pCur, 0, nKey, appendBias, &loc);

	19888 if( rc ) return rc;

	19889 }

	19890 }else if( loc==0 ){

	19891 rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);

	19892 if( rc ) return rc;

	19893 }

	19894 assert( pCur->eState==CURSOR_VALID \|\| (pCur->eState==CURSOR_INVALID && loc) );

	19895

	19896 pPage = pCur->apPage[pCur->iPage];

	19897 assert( pPage->intKey \|\| nKey>=0 );

	19898 assert( pPage->leaf \|\| !pPage->intKey );

	19899

	19900 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",

	19901 pCur->pgnoRoot, nKey, nData, pPage->pgno,

	19902 loc==0 ? "overwrite" : "new entry"));

	19903 assert( pPage->isInit );

	19904 newCell = pBt->pTmpSpace;

	19905 assert( newCell!=0 );

	19906 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);

	19907 if( rc ) goto end_insert;

	19908 assert( szNew==pPage->xCellSize(pPage, newCell) );

	19909 assert( szNew <= MX_CELL_SIZE(pBt) );

	19910 idx = pCur->aiIdx[pCur->iPage];

	19911 if( loc==0 ){

	19912 u16 szOld;

	19913 assert( idx<pPage->nCell );

	19914 rc = sqlite3PagerWrite(pPage->pDbPage);

	19915 if( rc ){

	19916 goto end_insert;

	19917 }

	19918 oldCell = findCell(pPage, idx);

	19919 if( !pPage->leaf ){

	19920 memcpy(newCell, oldCell, 4);

	19921 }

	19922 rc = clearCell(pPage, oldCell, &szOld);

	19923 dropCell(pPage, idx, szOld, &rc);

	19924 if( rc ) goto end_insert;

	19925 }else if( loc<0 && pPage->nCell>0 ){

	19926 assert( pPage->leaf );

	19927 idx = ++pCur->aiIdx[pCur->iPage];

	19928 }else{

	19929 assert( pPage->leaf );

	19930 }

	19931 insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);

	19932 assert( rc!=SQLITE_OK \|\| pPage->nCell>0 \|\| pPage->nOverflow>0 );

	19933

	19934 /* If no error has occurred and pPage has an overflow cell, call balance()

	19935 ** to redistribute the cells within the tree. Since balance() may move

	19936 ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey

	19937 ** variables.

	19938 **

	19939 ** Previous versions of SQLite called moveToRoot() to move the cursor

	19940 ** back to the root page as balance() used to invalidate the contents

	19941 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,

	19942 ** set the cursor state to "invalid". This makes common insert operations

	19943 ** slightly faster.

	19944 **

	19945 ** There is a subtle but important optimization here too. When inserting

	19946 ** multiple records into an intkey b-tree using a single cursor (as can

	19947 ** happen while processing an "INSERT INTO ... SELECT" statement), it

	19948 ** is advantageous to leave the cursor pointing to the last entry in

	19949 ** the b-tree if possible. If the cursor is left pointing to the last

	19950 ** entry in the table, and the next row inserted has an integer key

	19951 ** larger than the largest existing key, it is possible to insert the

	19952 ** row without seeking the cursor. This can be a big performance boost.

	19953 */

	19954 pCur->info.nSize = 0;

	19955 if( rc==SQLITE_OK && pPage->nOverflow ){

	19956 pCur->curFlags &= ~(BTCF_ValidNKey);

	19957 rc = balance(pCur);

	19958

	19959 /* Must make sure nOverflow is reset to zero even if the balance()

	19960 ** fails. Internal data structure corruption will result otherwise.

	19961 ** Also, set the cursor state to invalid. This stops saveCursorPosition()

	19962 ** from trying to save the current position of the cursor. */

	19963 pCur->apPage[pCur->iPage]->nOverflow = 0;

	19964 pCur->eState = CURSOR_INVALID;

	19965 }

	19966 assert( pCur->apPage[pCur->iPage]->nOverflow==0 );

	19967

	19968 end_insert:

	19969 return rc;

	19970 }

	19971

	19972 /*

	19973 ** Delete the entry that the cursor is pointing to.

	19974 **

	19975 ** If the second parameter is zero, then the cursor is left pointing at an

	19976 ** arbitrary location after the delete. If it is non-zero, then the cursor

	19977 ** is left in a state such that the next call to BtreeNext() or BtreePrev()

	19978 ** moves it to the same row as it would if the call to BtreeDelete() had

	19979 ** been omitted.

	19980 */

	19981 SQLITE_PRIVATE int sqlite3BtreeDelete(BtCursor *pCur, int bPreserve){

	19982 Btree *p = pCur->pBtree;

	19983 BtShared *pBt = p->pBt;

	19984 int rc; /* Return code */

	19985 MemPage *pPage; /* Page to delete cell from */

	19986 unsigned char *pCell; /* Pointer to cell to delete */

	19987 int iCellIdx; /* Index of cell to delete */

	19988 int iCellDepth; /* Depth of node containing pCell */

	19989 u16 szCell; /* Size of the cell being deleted */

	19990 int bSkipnext = 0; /* Leaf cursor in SKIPNEXT state */

	19991

	19992 assert( cursorHoldsMutex(pCur) );

	19993 assert( pBt->inTransaction==TRANS_WRITE );

	19994 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

	19995 assert( pCur->curFlags & BTCF_WriteFlag );

	19996 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );

	19997 assert( !hasReadConflicts(p, pCur->pgnoRoot) );

	19998 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

	19999 assert( pCur->eState==CURSOR_VALID );

	20000

	20001 iCellDepth = pCur->iPage;

	20002 iCellIdx = pCur->aiIdx[iCellDepth];

	20003 pPage = pCur->apPage[iCellDepth];

	20004 pCell = findCell(pPage, iCellIdx);

	20005

	20006 /* If the page containing the entry to delete is not a leaf page, move

	20007 ** the cursor to the largest entry in the tree that is smaller than

	20008 ** the entry being deleted. This cell will replace the cell being deleted

	20009 ** from the internal node. The 'previous' entry is used for this instead

	20010 ** of the 'next' entry, as the previous entry is always a part of the

	20011 ** sub-tree headed by the child page of the cell being deleted. This makes

	20012 ** balancing the tree following the delete operation easier. */

	20013 if( !pPage->leaf ){

	20014 int notUsed = 0;

	20015 rc = sqlite3BtreePrevious(pCur, &notUsed);

	20016 if( rc ) return rc;

	20017 }

	20018

	20019 /* Save the positions of any other cursors open on this table before

	20020 ** making any modifications. */

	20021 if( pCur->curFlags & BTCF_Multiple ){

	20022 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);

	20023 if( rc ) return rc;

	20024 }

	20025

	20026 /* If this is a delete operation to remove a row from a table b-tree,

	20027 ** invalidate any incrblob cursors open on the row being deleted. */

	20028 if( pCur->pKeyInfo==0 ){

	20029 invalidateIncrblobCursors(p, pCur->info.nKey, 0);

	20030 }

	20031

	20032 /* If the bPreserve flag is set to true, then the cursor position must

	20033 ** be preserved following this delete operation. If the current delete

	20034 ** will cause a b-tree rebalance, then this is done by saving the cursor

	20035 ** key and leaving the cursor in CURSOR_REQUIRESEEK state before

	20036 ** returning.

	20037 **

	20038 ** Or, if the current delete will not cause a rebalance, then the cursor

	20039 ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately

	20040 ** before or after the deleted entry. In this case set bSkipnext to true. */

	20041 if( bPreserve ){

	20042 if( !pPage->leaf

	20043 \|\| (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)

	20044 ){

	20045 /* A b-tree rebalance will be required after deleting this entry.

	20046 ** Save the cursor key. */

	20047 rc = saveCursorKey(pCur);

	20048 if( rc ) return rc;

	20049 }else{

	20050 bSkipnext = 1;

	20051 }

	20052 }

	20053

	20054 /* Make the page containing the entry to be deleted writable. Then free any

	20055 ** overflow pages associated with the entry and finally remove the cell

	20056 ** itself from within the page. */

	20057 rc = sqlite3PagerWrite(pPage->pDbPage);

	20058 if( rc ) return rc;

	20059 rc = clearCell(pPage, pCell, &szCell);

	20060 dropCell(pPage, iCellIdx, szCell, &rc);

	20061 if( rc ) return rc;

	20062

	20063 /* If the cell deleted was not located on a leaf page, then the cursor

	20064 ** is currently pointing to the largest entry in the sub-tree headed

	20065 ** by the child-page of the cell that was just deleted from an internal

	20066 ** node. The cell from the leaf node needs to be moved to the internal

	20067 ** node to replace the deleted cell. */

	20068 if( !pPage->leaf ){

	20069 MemPage *pLeaf = pCur->apPage[pCur->iPage];

	20070 int nCell;

	20071 Pgno n = pCur->apPage[iCellDepth+1]->pgno;

	20072 unsigned char *pTmp;

	20073

	20074 pCell = findCell(pLeaf, pLeaf->nCell-1);

	20075 if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;

	20076 nCell = pLeaf->xCellSize(pLeaf, pCell);

	20077 assert( MX_CELL_SIZE(pBt) >= nCell );

	20078 pTmp = pBt->pTmpSpace;

	20079 assert( pTmp!=0 );

	20080 rc = sqlite3PagerWrite(pLeaf->pDbPage);

	20081 insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);

	20082 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);

	20083 if( rc ) return rc;

	20084 }

	20085

	20086 /* Balance the tree. If the entry deleted was located on a leaf page,

	20087 ** then the cursor still points to that page. In this case the first

	20088 ** call to balance() repairs the tree, and the if(...) condition is

	20089 ** never true.

	20090 **

	20091 ** Otherwise, if the entry deleted was on an internal node page, then

	20092 ** pCur is pointing to the leaf page from which a cell was removed to

	20093 ** replace the cell deleted from the internal node. This is slightly

	20094 ** tricky as the leaf node may be underfull, and the internal node may

	20095 ** be either under or overfull. In this case run the balancing algorithm

	20096 ** on the leaf node first. If the balance proceeds far enough up the

	20097 ** tree that we can be sure that any problem in the internal node has

	20098 ** been corrected, so be it. Otherwise, after balancing the leaf node,

	20099 ** walk the cursor up the tree to the internal node and balance it as

	20100 ** well. */

	20101 rc = balance(pCur);

	20102 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){

	20103 while( pCur->iPage>iCellDepth ){

	20104 releasePage(pCur->apPage[pCur->iPage--]);

	20105 }

	20106 rc = balance(pCur);

	20107 }

	20108

	20109 if( rc==SQLITE_OK ){

	20110 if( bSkipnext ){

	20111 assert( bPreserve && (pCur->iPage==iCellDepth \|\| CORRUPT_DB) );

	20112 assert( pPage==pCur->apPage[pCur->iPage] );

	20113 assert( (pPage->nCell>0 \|\| CORRUPT_DB) && iCellIdx<=pPage->nCell );

	20114 pCur->eState = CURSOR_SKIPNEXT;

	20115 if( iCellIdx>=pPage->nCell ){

	20116 pCur->skipNext = -1;

	20117 pCur->aiIdx[iCellDepth] = pPage->nCell-1;

	20118 }else{

	20119 pCur->skipNext = 1;

	20120 }

	20121 }else{

	20122 rc = moveToRoot(pCur);

	20123 if( bPreserve ){

	20124 pCur->eState = CURSOR_REQUIRESEEK;

	20125 }

	20126 }

	20127 }

	20128 return rc;

	20129 }

	20130

	20131 /*

	20132 ** Create a new BTree table. Write into *piTable the page

	20133 ** number for the root page of the new table.

	20134 **

	20135 ** The type of table is determined by the flags parameter. Only the

	20136 ** following values of flags are currently in use. Other values for

	20137 ** flags might not work:

	20138 **

	20139 ** BTREE_INTKEY\|BTREE_LEAFDATA Used for SQL tables with rowid keys

	20140 ** BTREE_ZERODATA Used for SQL indices

	20141 */

	20142 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){

	20143 BtShared *pBt = p->pBt;

	20144 MemPage *pRoot;

	20145 Pgno pgnoRoot;

	20146 int rc;

	20147 int ptfFlags; /* Page-type flags for the root page of new table */

	20148

	20149 assert( sqlite3BtreeHoldsMutex(p) );

	20150 assert( pBt->inTransaction==TRANS_WRITE );

	20151 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

	20152

	20153 #ifdef SQLITE_OMIT_AUTOVACUUM

	20154 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);

	20155 if( rc ){

	20156 return rc;

	20157 }

	20158 #else

	20159 if( pBt->autoVacuum ){

	20160 Pgno pgnoMove; /* Move a page here to make room for the root-page */

	20161 MemPage *pPageMove; /* The page to move to. */

	20162

	20163 /* Creating a new table may probably require moving an existing database

	20164 ** page to make room for the new table's root page. In case this page turns

	20165 ** out to be an overflow page, delete all overflow page-map caches

	20166 ** held by open cursors.

	20167 */

	20168 invalidateAllOverflowCache(pBt);

	20169

	20170 /* Read the value of meta[3] from the database to determine where the

	20171 ** root page of the new table should go. meta[3] is the largest root-page

	20172 ** created so far, so the new root-page is (meta[3]+1).

	20173 */

	20174 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);

	20175 pgnoRoot++;

	20176

	20177 /* The new root-page may not be allocated on a pointer-map page, or the

	20178 ** PENDING_BYTE page.

	20179 */

	20180 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) \|\|

	20181 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){

	20182 pgnoRoot++;

	20183 }

	20184 assert( pgnoRoot>=3 \|\| CORRUPT_DB );

	20185 testcase( pgnoRoot<3 );

	20186

	20187 /* Allocate a page. The page that currently resides at pgnoRoot will

	20188 ** be moved to the allocated page (unless the allocated page happens

	20189 ** to reside at pgnoRoot).

	20190 */

	20191 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);

	20192 if( rc!=SQLITE_OK ){

	20193 return rc;

	20194 }

	20195

	20196 if( pgnoMove!=pgnoRoot ){

	20197 /* pgnoRoot is the page that will be used for the root-page of

	20198 ** the new table (assuming an error did not occur). But we were

	20199 ** allocated pgnoMove. If required (i.e. if it was not allocated

	20200 ** by extending the file), the current page at position pgnoMove

	20201 ** is already journaled.

	20202 */

	20203 u8 eType = 0;

	20204 Pgno iPtrPage = 0;

	20205

	20206 /* Save the positions of any open cursors. This is required in

	20207 ** case they are holding a reference to an xFetch reference

	20208 ** corresponding to page pgnoRoot. */

	20209 rc = saveAllCursors(pBt, 0, 0);

	20210 releasePage(pPageMove);

	20211 if( rc!=SQLITE_OK ){

	20212 return rc;

	20213 }

	20214

	20215 /* Move the page currently at pgnoRoot to pgnoMove. */

	20216 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);

	20217 if( rc!=SQLITE_OK ){

	20218 return rc;

	20219 }

	20220 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);

	20221 if( eType==PTRMAP_ROOTPAGE \|\| eType==PTRMAP_FREEPAGE ){

	20222 rc = SQLITE_CORRUPT_BKPT;

	20223 }

	20224 if( rc!=SQLITE_OK ){

	20225 releasePage(pRoot);

	20226 return rc;

	20227 }

	20228 assert( eType!=PTRMAP_ROOTPAGE );

	20229 assert( eType!=PTRMAP_FREEPAGE );

	20230 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);

	20231 releasePage(pRoot);

	20232

	20233 /* Obtain the page at pgnoRoot */

	20234 if( rc!=SQLITE_OK ){

	20235 return rc;

	20236 }

	20237 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);

	20238 if( rc!=SQLITE_OK ){

	20239 return rc;

	20240 }

	20241 rc = sqlite3PagerWrite(pRoot->pDbPage);

	20242 if( rc!=SQLITE_OK ){

	20243 releasePage(pRoot);

	20244 return rc;

	20245 }

	20246 }else{

	20247 pRoot = pPageMove;

	20248 }

	20249

	20250 /* Update the pointer-map and meta-data with the new root-page number. */

	20251 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);

	20252 if( rc ){

	20253 releasePage(pRoot);

	20254 return rc;

	20255 }

	20256

	20257 /* When the new root page was allocated, page 1 was made writable in

	20258 ** order either to increase the database filesize, or to decrement the

	20259 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail.

	20260 */

	20261 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );

	20262 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);

	20263 if( NEVER(rc) ){

	20264 releasePage(pRoot);

	20265 return rc;

	20266 }

	20267

	20268 }else{

	20269 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);

	20270 if( rc ) return rc;

	20271 }

	20272 #endif

	20273 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );

	20274 if( createTabFlags & BTREE_INTKEY ){

	20275 ptfFlags = PTF_INTKEY \| PTF_LEAFDATA \| PTF_LEAF;

	20276 }else{

	20277 ptfFlags = PTF_ZERODATA \| PTF_LEAF;

	20278 }

	20279 zeroPage(pRoot, ptfFlags);

	20280 sqlite3PagerUnref(pRoot->pDbPage);

	20281 assert( (pBt->openFlags & BTREE_SINGLE)==0 \|\| pgnoRoot==2 );

	20282 *piTable = (int)pgnoRoot;

	20283 return SQLITE_OK;

	20284 }

	20285 SQLITE_PRIVATE int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){

	20286 int rc;

	20287 sqlite3BtreeEnter(p);

	20288 rc = btreeCreateTable(p, piTable, flags);

	20289 sqlite3BtreeLeave(p);

	20290 return rc;

	20291 }

	20292

	20293 /*

	20294 ** Erase the given database page and all its children. Return

	20295 ** the page to the freelist.

	20296 */

	20297 static int clearDatabasePage(

	20298 BtShared *pBt, /* The BTree that contains the table */

	20299 Pgno pgno, /* Page number to clear */

	20300 int freePageFlag, /* Deallocate page if true */

	20301 int *pnChange /* Add number of Cells freed to this counter */

	20302 ){

	20303 MemPage *pPage;

	20304 int rc;

	20305 unsigned char *pCell;

	20306 int i;

	20307 int hdr;

	20308 u16 szCell;

	20309

	20310 assert( sqlite3_mutex_held(pBt->mutex) );

	20311 if( pgno>btreePagecount(pBt) ){

	20312 return SQLITE_CORRUPT_BKPT;

	20313 }

	20314 rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);

	20315 if( rc ) return rc;

	20316 if( pPage->bBusy ){

	20317 rc = SQLITE_CORRUPT_BKPT;

	20318 goto cleardatabasepage_out;

	20319 }

	20320 pPage->bBusy = 1;

	20321 hdr = pPage->hdrOffset;

	20322 for(i=0; i<pPage->nCell; i++){

	20323 pCell = findCell(pPage, i);

	20324 if( !pPage->leaf ){

	20325 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);

	20326 if( rc ) goto cleardatabasepage_out;

	20327 }

	20328 rc = clearCell(pPage, pCell, &szCell);

	20329 if( rc ) goto cleardatabasepage_out;

	20330 }

	20331 if( !pPage->leaf ){

	20332 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);

	20333 if( rc ) goto cleardatabasepage_out;

	20334 }else if( pnChange ){

	20335 assert( pPage->intKey \|\| CORRUPT_DB );

	20336 testcase( !pPage->intKey );

	20337 *pnChange += pPage->nCell;

	20338 }

	20339 if( freePageFlag ){

	20340 freePage(pPage, &rc);

	20341 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){

	20342 zeroPage(pPage, pPage->aData[hdr] \| PTF_LEAF);

	20343 }

	20344

	20345 cleardatabasepage_out:

	20346 pPage->bBusy = 0;

	20347 releasePage(pPage);

	20348 return rc;

	20349 }

	20350

	20351 /*

	20352 ** Delete all information from a single table in the database. iTable is

	20353 ** the page number of the root of the table. After this routine returns,

	20354 ** the root page is empty, but still exists.

	20355 **

	20356 ** This routine will fail with SQLITE_LOCKED if there are any open

	20357 ** read cursors on the table. Open write cursors are moved to the

	20358 ** root of the table.

	20359 **

	20360 ** If pnChange is not NULL, then table iTable must be an intkey table. The

	20361 ** integer value pointed to by pnChange is incremented by the number of

	20362 ** entries in the table.

	20363 */

	20364 SQLITE_PRIVATE int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){

	20365 int rc;

	20366 BtShared *pBt = p->pBt;

	20367 sqlite3BtreeEnter(p);

	20368 assert( p->inTrans==TRANS_WRITE );

	20369

	20370 rc = saveAllCursors(pBt, (Pgno)iTable, 0);

	20371

	20372 if( SQLITE_OK==rc ){

	20373 /* Invalidate all incrblob cursors open on table iTable (assuming iTable

	20374 ** is the root of a table b-tree - if it is not, the following call is

	20375 ** a no-op). */

	20376 invalidateIncrblobCursors(p, 0, 1);

	20377 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);

	20378 }

	20379 sqlite3BtreeLeave(p);

	20380 return rc;

	20381 }

	20382

	20383 /*

	20384 ** Delete all information from the single table that pCur is open on.

	20385 **

	20386 ** This routine only works for pCur on an ephemeral table.

	20387 */

	20388 SQLITE_PRIVATE int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){

	20389 return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);

	20390 }

	20391

	20392 /*

	20393 ** Erase all information in a table and add the root of the table to

	20394 ** the freelist. Except, the root of the principal table (the one on

	20395 ** page 1) is never added to the freelist.

	20396 **

	20397 ** This routine will fail with SQLITE_LOCKED if there are any open

	20398 ** cursors on the table.

	20399 **

	20400 ** If AUTOVACUUM is enabled and the page at iTable is not the last

	20401 ** root page in the database file, then the last root page

	20402 ** in the database file is moved into the slot formerly occupied by

	20403 ** iTable and that last slot formerly occupied by the last root page

	20404 ** is added to the freelist instead of iTable. In this way, all

	20405 ** root pages are kept at the beginning of the database file, which

	20406 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the

	20407 ** page number that used to be the last root page in the file before

	20408 ** the move. If no page gets moved, *piMoved is set to 0.

	20409 ** The last root page is recorded in meta[3] and the value of

	20410 ** meta[3] is updated by this procedure.

	20411 */

	20412 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){

	20413 int rc;

	20414 MemPage *pPage = 0;

	20415 BtShared *pBt = p->pBt;

	20416

	20417 assert( sqlite3BtreeHoldsMutex(p) );

	20418 assert( p->inTrans==TRANS_WRITE );

	20419

	20420 /* It is illegal to drop a table if any cursors are open on the

	20421 ** database. This is because in auto-vacuum mode the backend may

	20422 ** need to move another root-page to fill a gap left by the deleted

	20423 ** root page. If an open cursor was using this page a problem would

	20424 ** occur.

	20425 **

	20426 ** This error is caught long before control reaches this point.

	20427 */

	20428 if( NEVER(pBt->pCursor) ){

	20429 sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);

	20430 return SQLITE_LOCKED_SHAREDCACHE;

	20431 }

	20432

	20433 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);

	20434 if( rc ) return rc;

	20435 rc = sqlite3BtreeClearTable(p, iTable, 0);

	20436 if( rc ){

	20437 releasePage(pPage);

	20438 return rc;

	20439 }

	20440

	20441 *piMoved = 0;

	20442

	20443 if( iTable>1 ){

	20444 #ifdef SQLITE_OMIT_AUTOVACUUM

	20445 freePage(pPage, &rc);

	20446 releasePage(pPage);

	20447 #else

	20448 if( pBt->autoVacuum ){

	20449 Pgno maxRootPgno;

	20450 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);

	20451

	20452 if( iTable==maxRootPgno ){

	20453 /* If the table being dropped is the table with the largest root-page

	20454 ** number in the database, put the root page on the free list.

	20455 */

	20456 freePage(pPage, &rc);

	20457 releasePage(pPage);

	20458 if( rc!=SQLITE_OK ){

	20459 return rc;

	20460 }

	20461 }else{

	20462 /* The table being dropped does not have the largest root-page

	20463 ** number in the database. So move the page that does into the

	20464 ** gap left by the deleted root-page.

	20465 */

	20466 MemPage *pMove;

	20467 releasePage(pPage);

	20468 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);

	20469 if( rc!=SQLITE_OK ){

	20470 return rc;

	20471 }

	20472 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);

	20473 releasePage(pMove);

	20474 if( rc!=SQLITE_OK ){

	20475 return rc;

	20476 }

	20477 pMove = 0;

	20478 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);

	20479 freePage(pMove, &rc);

	20480 releasePage(pMove);

	20481 if( rc!=SQLITE_OK ){

	20482 return rc;

	20483 }

	20484 *piMoved = maxRootPgno;

	20485 }

	20486

	20487 /* Set the new 'max-root-page' value in the database header. This

	20488 ** is the old value less one, less one more if that happens to

	20489 ** be a root-page number, less one again if that is the

	20490 ** PENDING_BYTE_PAGE.

	20491 */

	20492 maxRootPgno--;

	20493 while( maxRootPgno==PENDING_BYTE_PAGE(pBt)

	20494 \|\| PTRMAP_ISPAGE(pBt, maxRootPgno) ){

	20495 maxRootPgno--;

	20496 }

	20497 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );

	20498

	20499 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);

	20500 }else{

	20501 freePage(pPage, &rc);

	20502 releasePage(pPage);

	20503 }

	20504 #endif

	20505 }else{

	20506 /* If sqlite3BtreeDropTable was called on page 1.

	20507 ** This really never should happen except in a corrupt

	20508 ** database.

	20509 */

	20510 zeroPage(pPage, PTF_INTKEY\|PTF_LEAF );

	20511 releasePage(pPage);

	20512 }

	20513 return rc;

	20514 }

	20515 SQLITE_PRIVATE int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){

	20516 int rc;

	20517 sqlite3BtreeEnter(p);

	20518 rc = btreeDropTable(p, iTable, piMoved);

	20519 sqlite3BtreeLeave(p);

	20520 return rc;

	20521 }

	20522

	20523

	20524 /*

	20525 ** This function may only be called if the b-tree connection already

	20526 ** has a read or write transaction open on the database.

	20527 **

	20528 ** Read the meta-information out of a database file. Meta[0]

	20529 ** is the number of free pages currently in the database. Meta[1]

	20530 ** through meta[15] are available for use by higher layers. Meta[0]

	20531 ** is read-only, the others are read/write.

	20532 **

	20533 ** The schema layer numbers meta values differently. At the schema

	20534 ** layer (and the SetCookie and ReadCookie opcodes) the number of

	20535 ** free pages is not visible. So Cookie[0] is the same as Meta[1].

	20536 **

	20537 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead

	20538 ** of reading the value out of the header, it instead loads the "DataVersion"

	20539 ** from the pager. The BTREE_DATA_VERSION value is not actually stored in the

	20540 ** database file. It is a number computed by the pager. But its access

	20541 ** pattern is the same as header meta values, and so it is convenient to

	20542 ** read it from this routine.

	20543 */

	20544 SQLITE_PRIVATE void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){

	20545 BtShared *pBt = p->pBt;

	20546

	20547 sqlite3BtreeEnter(p);

	20548 assert( p->inTrans>TRANS_NONE );

	20549 assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );

	20550 assert( pBt->pPage1 );

	20551 assert( idx>=0 && idx<=15 );

	20552

	20553 if( idx==BTREE_DATA_VERSION ){

	20554 *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;

	20555 }else{

	20556 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);

	20557 }

	20558

	20559 /* If auto-vacuum is disabled in this build and this is an auto-vacuum

	20560 ** database, mark the database as read-only. */

	20561 #ifdef SQLITE_OMIT_AUTOVACUUM

	20562 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){

	20563 pBt->btsFlags \|= BTS_READ_ONLY;

	20564 }

	20565 #endif

	20566

	20567 sqlite3BtreeLeave(p);

	20568 }

	20569

	20570 /*

	20571 ** Write meta-information back into the database. Meta[0] is

	20572 ** read-only and may not be written.

	20573 */

	20574 SQLITE_PRIVATE int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){

	20575 BtShared *pBt = p->pBt;

	20576 unsigned char *pP1;

	20577 int rc;

	20578 assert( idx>=1 && idx<=15 );

	20579 sqlite3BtreeEnter(p);

	20580 assert( p->inTrans==TRANS_WRITE );

	20581 assert( pBt->pPage1!=0 );

	20582 pP1 = pBt->pPage1->aData;

	20583 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	20584 if( rc==SQLITE_OK ){

	20585 put4byte(&pP1[36 + idx*4], iMeta);

	20586 #ifndef SQLITE_OMIT_AUTOVACUUM

	20587 if( idx==BTREE_INCR_VACUUM ){

	20588 assert( pBt->autoVacuum \|\| iMeta==0 );

	20589 assert( iMeta==0 \|\| iMeta==1 );

	20590 pBt->incrVacuum = (u8)iMeta;

	20591 }

	20592 #endif

	20593 }

	20594 sqlite3BtreeLeave(p);

	20595 return rc;

	20596 }

	20597

	20598 #ifndef SQLITE_OMIT_BTREECOUNT

	20599 /*

	20600 ** The first argument, pCur, is a cursor opened on some b-tree. Count the

	20601 ** number of entries in the b-tree and write the result to *pnEntry.

	20602 **

	20603 ** SQLITE_OK is returned if the operation is successfully executed.

	20604 ** Otherwise, if an error is encountered (i.e. an IO error or database

	20605 ** corruption) an SQLite error code is returned.

	20606 */

	20607 SQLITE_PRIVATE int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){

	20608 i64 nEntry = 0; /* Value to return in *pnEntry */

	20609 int rc; /* Return code */

	20610

	20611 if( pCur->pgnoRoot==0 ){

	20612 *pnEntry = 0;

	20613 return SQLITE_OK;

	20614 }

	20615 rc = moveToRoot(pCur);

	20616

	20617 /* Unless an error occurs, the following loop runs one iteration for each

	20618 ** page in the B-Tree structure (not including overflow pages).

	20619 */

	20620 while( rc==SQLITE_OK ){

	20621 int iIdx; /* Index of child node in parent */

	20622 MemPage *pPage; /* Current page of the b-tree */

	20623

	20624 /* If this is a leaf page or the tree is not an int-key tree, then

	20625 ** this page contains countable entries. Increment the entry counter

	20626 ** accordingly.

	20627 */

	20628 pPage = pCur->apPage[pCur->iPage];

	20629 if( pPage->leaf \|\| !pPage->intKey ){

	20630 nEntry += pPage->nCell;

	20631 }

	20632

	20633 /* pPage is a leaf node. This loop navigates the cursor so that it

	20634 ** points to the first interior cell that is the parent of

	20635 ** the next page in the tree that has not yet been visited. The

	20636 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell

	20637 ** of the page, or to the number of cells in the page if the next page

	20638 ** to visit is the right-child of its parent.

	20639 **

	20640 ** If all pages in the tree have been visited, return SQLITE_OK to the

	20641 ** caller.

	20642 */

	20643 if( pPage->leaf ){

	20644 do {

	20645 if( pCur->iPage==0 ){

	20646 /* All pages of the b-tree have been visited. Return successfully. */

	20647 *pnEntry = nEntry;

	20648 return moveToRoot(pCur);

	20649 }

	20650 moveToParent(pCur);

	20651 }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );

	20652

	20653 pCur->aiIdx[pCur->iPage]++;

	20654 pPage = pCur->apPage[pCur->iPage];

	20655 }

	20656

	20657 /* Descend to the child node of the cell that the cursor currently

	20658 ** points at. This is the right-child if (iIdx==pPage->nCell).

	20659 */

	20660 iIdx = pCur->aiIdx[pCur->iPage];

	20661 if( iIdx==pPage->nCell ){

	20662 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));

	20663 }else{

	20664 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));

	20665 }

	20666 }

	20667

	20668 /* An error has occurred. Return an error code. */

	20669 return rc;

	20670 }

	20671 #endif

	20672

	20673 /*

	20674 ** Return the pager associated with a BTree. This routine is used for

	20675 ** testing and debugging only.

	20676 */

	20677 SQLITE_PRIVATE Pager *sqlite3BtreePager(Btree *p){

	20678 return p->pBt->pPager;

	20679 }

	20680

	20681 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

	20682 /*

	20683 ** Append a message to the error message string.

	20684 */

	20685 static void checkAppendMsg(

	20686 IntegrityCk *pCheck,

	20687 const char *zFormat,

	20688 ...

	20689 ){

	20690 va_list ap;

	20691 if( !pCheck->mxErr ) return;

	20692 pCheck->mxErr--;

	20693 pCheck->nErr++;

	20694 va_start(ap, zFormat);

	20695 if( pCheck->errMsg.nChar ){

	20696 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);

	20697 }

	20698 if( pCheck->zPfx ){

	20699 sqlite3XPrintf(&pCheck->errMsg, 0, pCheck->zPfx, pCheck->v1, pCheck->v2);

	20700 }

	20701 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);

	20702 va_end(ap);

	20703 if( pCheck->errMsg.accError==STRACCUM_NOMEM ){

	20704 pCheck->mallocFailed = 1;

	20705 }

	20706 }

	20707 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

	20708

	20709 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

	20710

	20711 /*

	20712 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that

	20713 ** corresponds to page iPg is already set.

	20714 */

	20715 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){

	20716 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );

	20717 return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));

	20718 }

	20719

	20720 /*

	20721 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.

	20722 */

	20723 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){

	20724 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );

	20725 pCheck->aPgRef[iPg/8] \|= (1 << (iPg & 0x07));

	20726 }

	20727

	20728

	20729 /*

	20730 ** Add 1 to the reference count for page iPage. If this is the second

	20731 ** reference to the page, add an error message to pCheck->zErrMsg.

	20732 ** Return 1 if there are 2 or more references to the page and 0 if

	20733 ** this is the first reference to the page.

	20734 **

	20735 ** Also check that the page number is in bounds.

	20736 */

	20737 static int checkRef(IntegrityCk *pCheck, Pgno iPage){

	20738 if( iPage==0 ) return 1;

	20739 if( iPage>pCheck->nPage ){

	20740 checkAppendMsg(pCheck, "invalid page number %d", iPage);

	20741 return 1;

	20742 }

	20743 if( getPageReferenced(pCheck, iPage) ){

	20744 checkAppendMsg(pCheck, "2nd reference to page %d", iPage);

	20745 return 1;

	20746 }

	20747 setPageReferenced(pCheck, iPage);

	20748 return 0;

	20749 }

	20750

	20751 #ifndef SQLITE_OMIT_AUTOVACUUM

	20752 /*

	20753 ** Check that the entry in the pointer-map for page iChild maps to

	20754 ** page iParent, pointer type ptrType. If not, append an error message

	20755 ** to pCheck.

	20756 */

	20757 static void checkPtrmap(

	20758 IntegrityCk *pCheck, /* Integrity check context */

	20759 Pgno iChild, /* Child page number */

	20760 u8 eType, /* Expected pointer map type */

	20761 Pgno iParent /* Expected pointer map parent page number */

	20762 ){

	20763 int rc;

	20764 u8 ePtrmapType;

	20765 Pgno iPtrmapParent;

	20766

	20767 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);

	20768 if( rc!=SQLITE_OK ){

	20769 if( rc==SQLITE_NOMEM \|\| rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;

	20770 checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);

	20771 return;

	20772 }

	20773

	20774 if( ePtrmapType!=eType \|\| iPtrmapParent!=iParent ){

	20775 checkAppendMsg(pCheck,

	20776 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",

	20777 iChild, eType, iParent, ePtrmapType, iPtrmapParent);

	20778 }

	20779 }

	20780 #endif

	20781

	20782 /*

	20783 ** Check the integrity of the freelist or of an overflow page list.

	20784 ** Verify that the number of pages on the list is N.

	20785 */

	20786 static void checkList(

	20787 IntegrityCk *pCheck, /* Integrity checking context */

	20788 int isFreeList, /* True for a freelist. False for overflow page list */

	20789 int iPage, /* Page number for first page in the list */

	20790 int N /* Expected number of pages in the list */

	20791 ){

	20792 int i;

	20793 int expected = N;

	20794 int iFirst = iPage;

	20795 while( N-- > 0 && pCheck->mxErr ){

	20796 DbPage *pOvflPage;

	20797 unsigned char *pOvflData;

	20798 if( iPage<1 ){

	20799 checkAppendMsg(pCheck,

	20800 "%d of %d pages missing from overflow list starting at %d",

	20801 N+1, expected, iFirst);

	20802 break;

	20803 }

	20804 if( checkRef(pCheck, iPage) ) break;

	20805 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){

	20806 checkAppendMsg(pCheck, "failed to get page %d", iPage);

	20807 break;

	20808 }

	20809 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);

	20810 if( isFreeList ){

	20811 int n = get4byte(&pOvflData[4]);

	20812 #ifndef SQLITE_OMIT_AUTOVACUUM

	20813 if( pCheck->pBt->autoVacuum ){

	20814 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);

	20815 }

	20816 #endif

	20817 if( n>(int)pCheck->pBt->usableSize/4-2 ){

	20818 checkAppendMsg(pCheck,

	20819 "freelist leaf count too big on page %d", iPage);

	20820 N--;

	20821 }else{

	20822 for(i=0; i<n; i++){

	20823 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);

	20824 #ifndef SQLITE_OMIT_AUTOVACUUM

	20825 if( pCheck->pBt->autoVacuum ){

	20826 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);

	20827 }

	20828 #endif

	20829 checkRef(pCheck, iFreePage);

	20830 }

	20831 N -= n;

	20832 }

	20833 }

	20834 #ifndef SQLITE_OMIT_AUTOVACUUM

	20835 else{

	20836 /* If this database supports auto-vacuum and iPage is not the last

	20837 ** page in this overflow list, check that the pointer-map entry for

	20838 ** the following page matches iPage.

	20839 */

	20840 if( pCheck->pBt->autoVacuum && N>0 ){

	20841 i = get4byte(pOvflData);

	20842 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);

	20843 }

	20844 }

	20845 #endif

	20846 iPage = get4byte(pOvflData);

	20847 sqlite3PagerUnref(pOvflPage);

	20848

	20849 if( isFreeList && N<(iPage!=0) ){

	20850 checkAppendMsg(pCheck, "free-page count in header is too small");

	20851 }

	20852 }

	20853 }

	20854 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

	20855

	20856 /*

	20857 ** An implementation of a min-heap.

	20858 **

	20859 ** aHeap[0] is the number of elements on the heap. aHeap[1] is the

	20860 ** root element. The daughter nodes of aHeap[N] are aHeap[N*2]

	20861 ** and aHeap[N*2+1].

	20862 **

	20863 ** The heap property is this: Every node is less than or equal to both

	20864 ** of its daughter nodes. A consequence of the heap property is that the

	20865 ** root node aHeap[1] is always the minimum value currently in the heap.

	20866 **

	20867 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto

	20868 ** the heap, preserving the heap property. The btreeHeapPull() routine

	20869 ** removes the root element from the heap (the minimum value in the heap)

	20870 ** and then moves other nodes around as necessary to preserve the heap

	20871 ** property.

	20872 **

	20873 ** This heap is used for cell overlap and coverage testing. Each u32

	20874 ** entry represents the span of a cell or freeblock on a btree page.

	20875 ** The upper 16 bits are the index of the first byte of a range and the

	20876 ** lower 16 bits are the index of the last byte of that range.

	20877 */

	20878 static void btreeHeapInsert(u32 *aHeap, u32 x){

	20879 u32 j, i = ++aHeap[0];

	20880 aHeap[i] = x;

	20881 while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){

	20882 x = aHeap[j];

	20883 aHeap[j] = aHeap[i];

	20884 aHeap[i] = x;

	20885 i = j;

	20886 }

	20887 }

	20888 static int btreeHeapPull(u32 aHeap, u32 pOut){

	20889 u32 j, i, x;

	20890 if( (x = aHeap[0])==0 ) return 0;

	20891 *pOut = aHeap[1];

	20892 aHeap[1] = aHeap[x];

	20893 aHeap[x] = 0xffffffff;

	20894 aHeap[0]--;

	20895 i = 1;

	20896 while( (j = i*2)<=aHeap[0] ){

	20897 if( aHeap[j]>aHeap[j+1] ) j++;

	20898 if( aHeap[i]<aHeap[j] ) break;

	20899 x = aHeap[i];

	20900 aHeap[i] = aHeap[j];

	20901 aHeap[j] = x;

	20902 i = j;

	20903 }

	20904 return 1;

	20905 }

	20906

	20907 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

	20908 /*

	20909 ** Do various sanity checks on a single page of a tree. Return

	20910 ** the tree depth. Root pages return 0. Parents of root pages

	20911 ** return 1, and so forth.

	20912 **

	20913 ** These checks are done:

	20914 **

	20915 ** 1. Make sure that cells and freeblocks do not overlap

	20916 ** but combine to completely cover the page.

	20917 ** 2. Make sure integer cell keys are in order.

	20918 ** 3. Check the integrity of overflow pages.

	20919 ** 4. Recursively call checkTreePage on all children.

	20920 ** 5. Verify that the depth of all children is the same.

	20921 */

	20922 static int checkTreePage(

	20923 IntegrityCk pCheck, / Context for the sanity check */

	20924 int iPage, /* Page number of the page to check */

	20925 i64 piMinKey, / Write minimum integer primary key here */

	20926 i64 maxKey /* Error if integer primary key greater than this */

	20927 ){

	20928 MemPage pPage = 0; / The page being analyzed */

	20929 int i; /* Loop counter */

	20930 int rc; /* Result code from subroutine call */

	20931 int depth = -1, d2; /* Depth of a subtree */

	20932 int pgno; /* Page number */

	20933 int nFrag; /* Number of fragmented bytes on the page */

	20934 int hdr; /* Offset to the page header */

	20935 int cellStart; /* Offset to the start of the cell pointer array */

	20936 int nCell; /* Number of cells */

	20937 int doCoverageCheck = 1; /* True if cell coverage checking should be done */

	20938 int keyCanBeEqual = 1; /* True if IPK can be equal to maxKey

	20939 ** False if IPK must be strictly less than maxKey */

	20940 u8 data; / Page content */

	20941 u8 pCell; / Cell content */

	20942 u8 pCellIdx; / Next element of the cell pointer array */

	20943 BtShared pBt; / The BtShared object that owns pPage */

	20944 u32 pc; /* Address of a cell */

	20945 u32 usableSize; /* Usable size of the page */

	20946 u32 contentOffset; /* Offset to the start of the cell content area */

	20947 u32 heap = 0; / Min-heap used for checking cell coverage */

	20948 u32 x, prev = 0; /* Next and previous entry on the min-heap */

	20949 const char *saved_zPfx = pCheck->zPfx;

	20950 int saved_v1 = pCheck->v1;

	20951 int saved_v2 = pCheck->v2;

	20952 u8 savedIsInit = 0;

	20953

	20954 /* Check that the page exists

	20955 */

	20956 pBt = pCheck->pBt;

	20957 usableSize = pBt->usableSize;

	20958 if( iPage==0 ) return 0;

	20959 if( checkRef(pCheck, iPage) ) return 0;

	20960 pCheck->zPfx = "Page %d: ";

	20961 pCheck->v1 = iPage;

	20962 if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){

	20963 checkAppendMsg(pCheck,

	20964 "unable to get the page. error code=%d", rc);

	20965 goto end_of_check;

	20966 }

	20967

	20968 /* Clear MemPage.isInit to make sure the corruption detection code in

	20969 ** btreeInitPage() is executed. */

	20970 savedIsInit = pPage->isInit;

	20971 pPage->isInit = 0;

	20972 if( (rc = btreeInitPage(pPage))!=0 ){

	20973 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */

	20974 checkAppendMsg(pCheck,

	20975 "btreeInitPage() returns error code %d", rc);

	20976 goto end_of_check;

	20977 }

	20978 data = pPage->aData;

	20979 hdr = pPage->hdrOffset;

	20980

	20981 /* Set up for cell analysis */

	20982 pCheck->zPfx = "On tree page %d cell %d: ";

	20983 contentOffset = get2byteNotZero(&data[hdr+5]);

	20984 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */

	20985

	20986 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the

	20987 ** number of cells on the page. */

	20988 nCell = get2byte(&data[hdr+3]);

	20989 assert( pPage->nCell==nCell );

	20990

	20991 /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page

	20992 ** immediately follows the b-tree page header. */

	20993 cellStart = hdr + 12 - 4*pPage->leaf;

	20994 assert( pPage->aCellIdx==&data[cellStart] );

	20995 pCellIdx = &data[cellStart + 2*(nCell-1)];

	20996

	20997 if( !pPage->leaf ){

	20998 /* Analyze the right-child page of internal pages */

	20999 pgno = get4byte(&data[hdr+8]);

	21000 #ifndef SQLITE_OMIT_AUTOVACUUM

	21001 if( pBt->autoVacuum ){

	21002 pCheck->zPfx = "On page %d at right child: ";

	21003 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);

	21004 }

	21005 #endif

	21006 depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);

	21007 keyCanBeEqual = 0;

	21008 }else{

	21009 /* For leaf pages, the coverage check will occur in the same loop

	21010 ** as the other cell checks, so initialize the heap. */

	21011 heap = pCheck->heap;

	21012 heap[0] = 0;

	21013 }

	21014

	21015 /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte

	21016 ** integer offsets to the cell contents. */

	21017 for(i=nCell-1; i>=0 && pCheck->mxErr; i--){

	21018 CellInfo info;

	21019

	21020 /* Check cell size */

	21021 pCheck->v2 = i;

	21022 assert( pCellIdx==&data[cellStart + i*2] );

	21023 pc = get2byteAligned(pCellIdx);

	21024 pCellIdx -= 2;

	21025 if( pc<contentOffset \|\| pc>usableSize-4 ){

	21026 checkAppendMsg(pCheck, "Offset %d out of range %d..%d",

	21027 pc, contentOffset, usableSize-4);

	21028 doCoverageCheck = 0;

	21029 continue;

	21030 }

	21031 pCell = &data[pc];

	21032 pPage->xParseCell(pPage, pCell, &info);

	21033 if( pc+info.nSize>usableSize ){

	21034 checkAppendMsg(pCheck, "Extends off end of page");

	21035 doCoverageCheck = 0;

	21036 continue;

	21037 }

	21038

	21039 /* Check for integer primary key out of range */

	21040 if( pPage->intKey ){

	21041 if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){

	21042 checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);

	21043 }

	21044 maxKey = info.nKey;

	21045 }

	21046

	21047 /* Check the content overflow list */

	21048 if( info.nPayload>info.nLocal ){

	21049 int nPage; /* Number of pages on the overflow chain */

	21050 Pgno pgnoOvfl; /* First page of the overflow chain */

	21051 assert( pc + info.nSize - 4 <= usableSize );

	21052 nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);

	21053 pgnoOvfl = get4byte(&pCell[info.nSize - 4]);

	21054 #ifndef SQLITE_OMIT_AUTOVACUUM

	21055 if( pBt->autoVacuum ){

	21056 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);

	21057 }

	21058 #endif

	21059 checkList(pCheck, 0, pgnoOvfl, nPage);

	21060 }

	21061

	21062 if( !pPage->leaf ){

	21063 /* Check sanity of left child page for internal pages */

	21064 pgno = get4byte(pCell);

	21065 #ifndef SQLITE_OMIT_AUTOVACUUM

	21066 if( pBt->autoVacuum ){

	21067 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);

	21068 }

	21069 #endif

	21070 d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);

	21071 keyCanBeEqual = 0;

	21072 if( d2!=depth ){

	21073 checkAppendMsg(pCheck, "Child page depth differs");

	21074 depth = d2;

	21075 }

	21076 }else{

	21077 /* Populate the coverage-checking heap for leaf pages */

	21078 btreeHeapInsert(heap, (pc<<16)\|(pc+info.nSize-1));

	21079 }

	21080 }

	21081 *piMinKey = maxKey;

	21082

	21083 /* Check for complete coverage of the page

	21084 */

	21085 pCheck->zPfx = 0;

	21086 if( doCoverageCheck && pCheck->mxErr>0 ){

	21087 /* For leaf pages, the min-heap has already been initialized and the

	21088 ** cells have already been inserted. But for internal pages, that has

	21089 ** not yet been done, so do it now */

	21090 if( !pPage->leaf ){

	21091 heap = pCheck->heap;

	21092 heap[0] = 0;

	21093 for(i=nCell-1; i>=0; i--){

	21094 u32 size;

	21095 pc = get2byteAligned(&data[cellStart+i*2]);

	21096 size = pPage->xCellSize(pPage, &data[pc]);

	21097 btreeHeapInsert(heap, (pc<<16)\|(pc+size-1));

	21098 }

	21099 }

	21100 /* Add the freeblocks to the min-heap

	21101 **

	21102 ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header

	21103 ** is the offset of the first freeblock, or zero if there are no

	21104 ** freeblocks on the page.

	21105 */

	21106 i = get2byte(&data[hdr+1]);

	21107 while( i>0 ){

	21108 int size, j;

	21109 assert( (u32)i<=usableSize-4 ); /* Enforced by btreeInitPage() */

	21110 size = get2byte(&data[i+2]);

	21111 assert( (u32)(i+size)<=usableSize ); /* Enforced by btreeInitPage() */

	21112 btreeHeapInsert(heap, (((u32)i)<<16)\|(i+size-1));

	21113 /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a

	21114 ** big-endian integer which is the offset in the b-tree page of the next

	21115 ** freeblock in the chain, or zero if the freeblock is the last on the

	21116 ** chain. */

	21117 j = get2byte(&data[i]);

	21118 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of

	21119 ** increasing offset. */

	21120 assert( j==0 \|\| j>i+size ); /* Enforced by btreeInitPage() */

	21121 assert( (u32)j<=usableSize-4 ); /* Enforced by btreeInitPage() */

	21122 i = j;

	21123 }

	21124 /* Analyze the min-heap looking for overlap between cells and/or

	21125 ** freeblocks, and counting the number of untracked bytes in nFrag.

	21126 **

	21127 ** Each min-heap entry is of the form: (start_address<<16)\|end_address.

	21128 ** There is an implied first entry the covers the page header, the cell

	21129 ** pointer index, and the gap between the cell pointer index and the start

	21130 ** of cell content.

	21131 **

	21132 ** The loop below pulls entries from the min-heap in order and compares

	21133 ** the start_address against the previous end_address. If there is an

	21134 ** overlap, that means bytes are used multiple times. If there is a gap,

	21135 ** that gap is added to the fragmentation count.

	21136 */

	21137 nFrag = 0;

	21138 prev = contentOffset - 1; /* Implied first min-heap entry */

	21139 while( btreeHeapPull(heap,&x) ){

	21140 if( (prev&0xffff)>=(x>>16) ){

	21141 checkAppendMsg(pCheck,

	21142 "Multiple uses for byte %u of page %d", x>>16, iPage);

	21143 break;

	21144 }else{

	21145 nFrag += (x>>16) - (prev&0xffff) - 1;

	21146 prev = x;

	21147 }

	21148 }

	21149 nFrag += usableSize - (prev&0xffff) - 1;

	21150 /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments

	21151 ** is stored in the fifth field of the b-tree page header.

	21152 ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the

	21153 ** number of fragmented free bytes within the cell content area.

	21154 */

	21155 if( heap[0]==0 && nFrag!=data[hdr+7] ){

	21156 checkAppendMsg(pCheck,

	21157 "Fragmentation of %d bytes reported as %d on page %d",

	21158 nFrag, data[hdr+7], iPage);

	21159 }

	21160 }

	21161

	21162 end_of_check:

	21163 if( !doCoverageCheck ) pPage->isInit = savedIsInit;

	21164 releasePage(pPage);

	21165 pCheck->zPfx = saved_zPfx;

	21166 pCheck->v1 = saved_v1;

	21167 pCheck->v2 = saved_v2;

	21168 return depth+1;

	21169 }

	21170 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

	21171

	21172 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

	21173 /*

	21174 ** This routine does a complete check of the given BTree file. aRoot[] is

	21175 ** an array of pages numbers were each page number is the root page of

	21176 ** a table. nRoot is the number of entries in aRoot.

	21177 **

	21178 ** A read-only or read-write transaction must be opened before calling

	21179 ** this function.

	21180 **

	21181 ** Write the number of error seen in *pnErr. Except for some memory

	21182 ** allocation errors, an error message held in memory obtained from

	21183 ** malloc is returned if pnErr is non-zero. If pnErr==0 then NULL is

	21184 ** returned. If a memory allocation error occurs, NULL is returned.

	21185 */

	21186 SQLITE_PRIVATE char *sqlite3BtreeIntegrityCheck(

	21187 Btree p, / The btree to be checked */

	21188 int aRoot, / An array of root pages numbers for individual trees */

	21189 int nRoot, /* Number of entries in aRoot[] */

	21190 int mxErr, /* Stop reporting errors after this many */

	21191 int pnErr / Write number of errors seen to this variable */

	21192 ){

	21193 Pgno i;

	21194 IntegrityCk sCheck;

	21195 BtShared *pBt = p->pBt;

	21196 int savedDbFlags = pBt->db->flags;

	21197 char zErr[100];

	21198 VVA_ONLY( int nRef );

	21199

	21200 sqlite3BtreeEnter(p);

	21201 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );

	21202 assert( (nRef = sqlite3PagerRefcount(pBt->pPager))>=0 );

	21203 sCheck.pBt = pBt;

	21204 sCheck.pPager = pBt->pPager;

	21205 sCheck.nPage = btreePagecount(sCheck.pBt);

	21206 sCheck.mxErr = mxErr;

	21207 sCheck.nErr = 0;

	21208 sCheck.mallocFailed = 0;

	21209 sCheck.zPfx = 0;

	21210 sCheck.v1 = 0;

	21211 sCheck.v2 = 0;

	21212 sCheck.aPgRef = 0;

	21213 sCheck.heap = 0;

	21214 sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);

	21215 if( sCheck.nPage==0 ){

	21216 goto integrity_ck_cleanup;

	21217 }

	21218

	21219 sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);

	21220 if( !sCheck.aPgRef ){

	21221 sCheck.mallocFailed = 1;

	21222 goto integrity_ck_cleanup;

	21223 }

	21224 sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );

	21225 if( sCheck.heap==0 ){

	21226 sCheck.mallocFailed = 1;

	21227 goto integrity_ck_cleanup;

	21228 }

	21229

	21230 i = PENDING_BYTE_PAGE(pBt);

	21231 if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);

	21232

	21233 /* Check the integrity of the freelist

	21234 */

	21235 sCheck.zPfx = "Main freelist: ";

	21236 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),

	21237 get4byte(&pBt->pPage1->aData[36]));

	21238 sCheck.zPfx = 0;

	21239

	21240 /* Check all the tables.

	21241 */

	21242 testcase( pBt->db->flags & SQLITE_CellSizeCk );

	21243 pBt->db->flags &= ~SQLITE_CellSizeCk;

	21244 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){

	21245 i64 notUsed;

	21246 if( aRoot[i]==0 ) continue;

	21247 #ifndef SQLITE_OMIT_AUTOVACUUM

	21248 if( pBt->autoVacuum && aRoot[i]>1 ){

	21249 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);

	21250 }

	21251 #endif

	21252 checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);

	21253 }

	21254 pBt->db->flags = savedDbFlags;

	21255

	21256 /* Make sure every page in the file is referenced

	21257 */

	21258 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){

	21259 #ifdef SQLITE_OMIT_AUTOVACUUM

	21260 if( getPageReferenced(&sCheck, i)==0 ){

	21261 checkAppendMsg(&sCheck, "Page %d is never used", i);

	21262 }

	21263 #else

	21264 /* If the database supports auto-vacuum, make sure no tables contain

	21265 ** references to pointer-map pages.

	21266 */

	21267 if( getPageReferenced(&sCheck, i)==0 &&

	21268 (PTRMAP_PAGENO(pBt, i)!=i \|\| !pBt->autoVacuum) ){

	21269 checkAppendMsg(&sCheck, "Page %d is never used", i);

	21270 }

	21271 if( getPageReferenced(&sCheck, i)!=0 &&

	21272 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){

	21273 checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);

	21274 }

	21275 #endif

	21276 }

	21277

	21278 /* Clean up and report errors.

	21279 */

	21280 integrity_ck_cleanup:

	21281 sqlite3PageFree(sCheck.heap);

	21282 sqlite3_free(sCheck.aPgRef);

	21283 if( sCheck.mallocFailed ){

	21284 sqlite3StrAccumReset(&sCheck.errMsg);

	21285 sCheck.nErr++;

	21286 }

	21287 *pnErr = sCheck.nErr;

	21288 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);

	21289 /* Make sure this analysis did not leave any unref() pages. */

	21290 assert( nRef==sqlite3PagerRefcount(pBt->pPager) );

	21291 sqlite3BtreeLeave(p);

	21292 return sqlite3StrAccumFinish(&sCheck.errMsg);

	21293 }

	21294 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

	21295

	21296 /*

	21297 ** Return the full pathname of the underlying database file. Return

	21298 ** an empty string if the database is in-memory or a TEMP database.

	21299 **

	21300 ** The pager filename is invariant as long as the pager is

	21301 ** open so it is safe to access without the BtShared mutex.

	21302 */

	21303 SQLITE_PRIVATE const char sqlite3BtreeGetFilename(Btree p){

	21304 assert( p->pBt->pPager!=0 );

	21305 return sqlite3PagerFilename(p->pBt->pPager, 1);

	21306 }

	21307

	21308 /*

	21309 ** Return the pathname of the journal file for this database. The return

	21310 ** value of this routine is the same regardless of whether the journal file

	21311 ** has been created or not.

	21312 **

	21313 ** The pager journal filename is invariant as long as the pager is

	21314 ** open so it is safe to access without the BtShared mutex.

	21315 */

	21316 SQLITE_PRIVATE const char sqlite3BtreeGetJournalname(Btree p){

	21317 assert( p->pBt->pPager!=0 );

	21318 return sqlite3PagerJournalname(p->pBt->pPager);

	21319 }

	21320

	21321 /*

	21322 ** Return non-zero if a transaction is active.

	21323 */

	21324 SQLITE_PRIVATE int sqlite3BtreeIsInTrans(Btree *p){

	21325 assert( p==0 \|\| sqlite3_mutex_held(p->db->mutex) );

	21326 return (p && (p->inTrans==TRANS_WRITE));

	21327 }

	21328

	21329 #ifndef SQLITE_OMIT_WAL

	21330 /*

	21331 ** Run a checkpoint on the Btree passed as the first argument.

	21332 **

	21333 ** Return SQLITE_LOCKED if this or any other connection has an open

	21334 ** transaction on the shared-cache the argument Btree is connected to.

	21335 **

	21336 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.

	21337 */

	21338 SQLITE_PRIVATE int sqlite3BtreeCheckpoint(Btree p, int eMode, int pnLog, int * pnCkpt){

	21339 int rc = SQLITE_OK;

	21340 if( p ){

	21341 BtShared *pBt = p->pBt;

	21342 sqlite3BtreeEnter(p);

	21343 if( pBt->inTransaction!=TRANS_NONE ){

	21344 rc = SQLITE_LOCKED;

	21345 }else{

	21346 rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt);

	21347 }

	21348 sqlite3BtreeLeave(p);

	21349 }

	21350 return rc;

	21351 }

	21352 #endif

	21353

	21354 /*

	21355 ** Return non-zero if a read (or write) transaction is active.

	21356 */

	21357 SQLITE_PRIVATE int sqlite3BtreeIsInReadTrans(Btree *p){

	21358 assert( p );

	21359 assert( sqlite3_mutex_held(p->db->mutex) );

	21360 return p->inTrans!=TRANS_NONE;

	21361 }

	21362

	21363 SQLITE_PRIVATE int sqlite3BtreeIsInBackup(Btree *p){

	21364 assert( p );

	21365 assert( sqlite3_mutex_held(p->db->mutex) );

	21366 return p->nBackup!=0;

	21367 }

	21368

	21369 /*

	21370 ** This function returns a pointer to a blob of memory associated with

	21371 ** a single shared-btree. The memory is used by client code for its own

	21372 ** purposes (for example, to store a high-level schema associated with

	21373 ** the shared-btree). The btree layer manages reference counting issues.

	21374 **

	21375 ** The first time this is called on a shared-btree, nBytes bytes of memory

	21376 ** are allocated, zeroed, and returned to the caller. For each subsequent

	21377 ** call the nBytes parameter is ignored and a pointer to the same blob

	21378 ** of memory returned.

	21379 **

	21380 ** If the nBytes parameter is 0 and the blob of memory has not yet been

	21381 ** allocated, a null pointer is returned. If the blob has already been

	21382 ** allocated, it is returned as normal.

	21383 **

	21384 ** Just before the shared-btree is closed, the function passed as the

	21385 ** xFree argument when the memory allocation was made is invoked on the

	21386 ** blob of allocated memory. The xFree function should not call sqlite3_free()

	21387 ** on the memory, the btree layer does that.

	21388 */

	21389 SQLITE_PRIVATE void sqlite3BtreeSchema(Btree p, int nBytes, void(xFree)(void )){

	21390 BtShared *pBt = p->pBt;

	21391 sqlite3BtreeEnter(p);

	21392 if( !pBt->pSchema && nBytes ){

	21393 pBt->pSchema = sqlite3DbMallocZero(0, nBytes);

	21394 pBt->xFreeSchema = xFree;

	21395 }

	21396 sqlite3BtreeLeave(p);

	21397 return pBt->pSchema;

	21398 }

	21399

	21400 /*

	21401 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared

	21402 ** btree as the argument handle holds an exclusive lock on the

	21403 ** sqlite_master table. Otherwise SQLITE_OK.

	21404 */

	21405 SQLITE_PRIVATE int sqlite3BtreeSchemaLocked(Btree *p){

	21406 int rc;

	21407 assert( sqlite3_mutex_held(p->db->mutex) );

	21408 sqlite3BtreeEnter(p);

	21409 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);

	21410 assert( rc==SQLITE_OK \|\| rc==SQLITE_LOCKED_SHAREDCACHE );

	21411 sqlite3BtreeLeave(p);

	21412 return rc;

	21413 }

	21414

	21415

	21416 #ifndef SQLITE_OMIT_SHARED_CACHE

	21417 /*

	21418 ** Obtain a lock on the table whose root page is iTab. The

	21419 ** lock is a write lock if isWritelock is true or a read lock

	21420 ** if it is false.

	21421 */

	21422 SQLITE_PRIVATE int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){

	21423 int rc = SQLITE_OK;

	21424 assert( p->inTrans!=TRANS_NONE );

	21425 if( p->sharable ){

	21426 u8 lockType = READ_LOCK + isWriteLock;

	21427 assert( READ_LOCK+1==WRITE_LOCK );

	21428 assert( isWriteLock==0 \|\| isWriteLock==1 );

	21429

	21430 sqlite3BtreeEnter(p);

	21431 rc = querySharedCacheTableLock(p, iTab, lockType);

	21432 if( rc==SQLITE_OK ){

	21433 rc = setSharedCacheTableLock(p, iTab, lockType);

	21434 }

	21435 sqlite3BtreeLeave(p);

	21436 }

	21437 return rc;

	21438 }

	21439 #endif

	21440

	21441 #ifndef SQLITE_OMIT_INCRBLOB

	21442 /*

	21443 ** Argument pCsr must be a cursor opened for writing on an

	21444 ** INTKEY table currently pointing at a valid table entry.

	21445 ** This function modifies the data stored as part of that entry.

	21446 **

	21447 ** Only the data content may only be modified, it is not possible to

	21448 ** change the length of the data stored. If this function is called with

	21449 ** parameters that attempt to write past the end of the existing data,

	21450 ** no modifications are made and SQLITE_CORRUPT is returned.

	21451 */

	21452 SQLITE_PRIVATE int sqlite3BtreePutData(BtCursor pCsr, u32 offset, u32 amt, void z){

	21453 int rc;

	21454 assert( cursorHoldsMutex(pCsr) );

	21455 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );

	21456 assert( pCsr->curFlags & BTCF_Incrblob );

	21457

	21458 rc = restoreCursorPosition(pCsr);

	21459 if( rc!=SQLITE_OK ){

	21460 return rc;

	21461 }

	21462 assert( pCsr->eState!=CURSOR_REQUIRESEEK );

	21463 if( pCsr->eState!=CURSOR_VALID ){

	21464 return SQLITE_ABORT;

	21465 }

	21466

	21467 /* Save the positions of all other cursors open on this table. This is

	21468 ** required in case any of them are holding references to an xFetch

	21469 ** version of the b-tree page modified by the accessPayload call below.

	21470 **

	21471 ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()

	21472 ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence

	21473 ** saveAllCursors can only return SQLITE_OK.

	21474 */

	21475 VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);

	21476 assert( rc==SQLITE_OK );

	21477

	21478 /* Check some assumptions:

	21479 ** (a) the cursor is open for writing,

	21480 ** (b) there is a read/write transaction open,

	21481 ** (c) the connection holds a write-lock on the table (if required),

	21482 ** (d) there are no conflicting read-locks, and

	21483 ** (e) the cursor points at a valid row of an intKey table.

	21484 */

	21485 if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){

	21486 return SQLITE_READONLY;

	21487 }

	21488 assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0

	21489 && pCsr->pBt->inTransaction==TRANS_WRITE );

	21490 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );

	21491 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );

	21492 assert( pCsr->apPage[pCsr->iPage]->intKey );

	21493

	21494 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);

	21495 }

	21496

	21497 /*

	21498 ** Mark this cursor as an incremental blob cursor.

	21499 */

	21500 SQLITE_PRIVATE void sqlite3BtreeIncrblobCursor(BtCursor *pCur){

	21501 pCur->curFlags \|= BTCF_Incrblob;

	21502 pCur->pBtree->hasIncrblobCur = 1;

	21503 }

	21504 #endif

	21505

	21506 /*

	21507 ** Set both the "read version" (single byte at byte offset 18) and

	21508 ** "write version" (single byte at byte offset 19) fields in the database

	21509 ** header to iVersion.

	21510 */

	21511 SQLITE_PRIVATE int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){

	21512 BtShared *pBt = pBtree->pBt;

	21513 int rc; /* Return code */

	21514

	21515 assert( iVersion==1 \|\| iVersion==2 );

	21516

	21517 /* If setting the version fields to 1, do not automatically open the

	21518 ** WAL connection, even if the version fields are currently set to 2.

	21519 */

	21520 pBt->btsFlags &= ~BTS_NO_WAL;

	21521 if( iVersion==1 ) pBt->btsFlags \|= BTS_NO_WAL;

	21522

	21523 rc = sqlite3BtreeBeginTrans(pBtree, 0);

	21524 if( rc==SQLITE_OK ){

	21525 u8 *aData = pBt->pPage1->aData;

	21526 if( aData[18]!=(u8)iVersion \|\| aData[19]!=(u8)iVersion ){

	21527 rc = sqlite3BtreeBeginTrans(pBtree, 2);

	21528 if( rc==SQLITE_OK ){

	21529 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

	21530 if( rc==SQLITE_OK ){

	21531 aData[18] = (u8)iVersion;

	21532 aData[19] = (u8)iVersion;

	21533 }

	21534 }

	21535 }

	21536 }

	21537

	21538 pBt->btsFlags &= ~BTS_NO_WAL;

	21539 return rc;

	21540 }

	21541

	21542 /*

	21543 ** Return true if the cursor has a hint specified. This routine is

	21544 ** only used from within assert() statements

	21545 */

	21546 SQLITE_PRIVATE int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){

	21547 return (pCsr->hints & mask)!=0;

	21548 }

	21549

	21550 /*

	21551 ** Return true if the given Btree is read-only.

	21552 */

	21553 SQLITE_PRIVATE int sqlite3BtreeIsReadonly(Btree *p){

	21554 return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;

	21555 }

	21556

	21557 /*

	21558 ** Return the size of the header added to each page by this module.

	21559 */

	21560 SQLITE_PRIVATE int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }

	21561

	21562 /************ End of btree.c *********************************************/

	21563 /************ Begin file backup.c ****************************************/

	21564 /*

	21565 ** 2009 January 28

	21566 **

	21567 ** The author disclaims copyright to this source code. In place of

	21568 ** a legal notice, here is a blessing:

	21569 **

	21570 ** May you do good and not evil.

	21571 ** May you find forgiveness for yourself and forgive others.

	21572 ** May you share freely, never taking more than you give.

	21573 **

	21574 *************************************************************************

	21575 ** This file contains the implementation of the sqlite3_backup_XXX()

	21576 ** API functions and the related features.

	21577 */

	21578 /* #include "sqliteInt.h" */

	21579 /* #include "btreeInt.h" */

	21580

	21581 /*

	21582 ** Structure allocated for each backup operation.

	21583 */

	21584 struct sqlite3_backup {

	21585 sqlite3* pDestDb; /* Destination database handle */

	21586 Btree pDest; / Destination b-tree file */

	21587 u32 iDestSchema; /* Original schema cookie in destination */

	21588 int bDestLocked; /* True once a write-transaction is open on pDest */

	21589

	21590 Pgno iNext; /* Page number of the next source page to copy */

	21591 sqlite3* pSrcDb; /* Source database handle */

	21592 Btree pSrc; / Source b-tree file */

	21593

	21594 int rc; /* Backup process error code */

	21595

	21596 /* These two variables are set by every call to backup_step(). They are

	21597 ** read by calls to backup_remaining() and backup_pagecount().

	21598 */

	21599 Pgno nRemaining; /* Number of pages left to copy */

	21600 Pgno nPagecount; /* Total number of pages to copy */

	21601

	21602 int isAttached; /* True once backup has been registered with pager */

	21603 sqlite3_backup pNext; / Next backup associated with source pager */

	21604 };

	21605

	21606 /*

	21607 ** THREAD SAFETY NOTES:

	21608 **

	21609 ** Once it has been created using backup_init(), a single sqlite3_backup

	21610 ** structure may be accessed via two groups of thread-safe entry points:

	21611 **

	21612 ** * Via the sqlite3_backup_XXX() API function backup_step() and

	21613 ** backup_finish(). Both these functions obtain the source database

	21614 ** handle mutex and the mutex associated with the source BtShared

	21615 ** structure, in that order.

	21616 **

	21617 ** * Via the BackupUpdate() and BackupRestart() functions, which are

	21618 ** invoked by the pager layer to report various state changes in

	21619 ** the page cache associated with the source database. The mutex

	21620 ** associated with the source database BtShared structure will always

	21621 ** be held when either of these functions are invoked.

	21622 **

	21623 ** The other sqlite3_backup_XXX() API functions, backup_remaining() and

	21624 ** backup_pagecount() are not thread-safe functions. If they are called

	21625 ** while some other thread is calling backup_step() or backup_finish(),

	21626 ** the values returned may be invalid. There is no way for a call to

	21627 ** BackupUpdate() or BackupRestart() to interfere with backup_remaining()

	21628 ** or backup_pagecount().

	21629 **

	21630 ** Depending on the SQLite configuration, the database handles and/or

	21631 ** the Btree objects may have their own mutexes that require locking.

	21632 ** Non-sharable Btrees (in-memory databases for example), do not have

	21633 ** associated mutexes.

	21634 */

	21635

	21636 /*

	21637 ** Return a pointer corresponding to database zDb (i.e. "main", "temp")

	21638 ** in connection handle pDb. If such a database cannot be found, return

	21639 ** a NULL pointer and write an error message to pErrorDb.

	21640 **

	21641 ** If the "temp" database is requested, it may need to be opened by this

	21642 ** function. If an error occurs while doing so, return 0 and write an

	21643 ** error message to pErrorDb.

	21644 */

	21645 static Btree *findBtree(sqlite3 *pErrorDb, sqlite3 *pDb, const char *zDb){

	21646 int i = sqlite3FindDbName(pDb, zDb);

	21647

	21648 if( i==1 ){

	21649 Parse *pParse;

	21650 int rc = 0;

	21651 pParse = sqlite3StackAllocZero(pErrorDb, sizeof(*pParse));

	21652 if( pParse==0 ){

	21653 sqlite3ErrorWithMsg(pErrorDb, SQLITE_NOMEM, "out of memory");

	21654 rc = SQLITE_NOMEM;

	21655 }else{

	21656 pParse->db = pDb;

	21657 if( sqlite3OpenTempDatabase(pParse) ){

	21658 sqlite3ErrorWithMsg(pErrorDb, pParse->rc, "%s", pParse->zErrMsg);

	21659 rc = SQLITE_ERROR;

	21660 }

	21661 sqlite3DbFree(pErrorDb, pParse->zErrMsg);

	21662 sqlite3ParserReset(pParse);

	21663 sqlite3StackFree(pErrorDb, pParse);

	21664 }

	21665 if( rc ){

	21666 return 0;

	21667 }

	21668 }

	21669

	21670 if( i<0 ){

	21671 sqlite3ErrorWithMsg(pErrorDb, SQLITE_ERROR, "unknown database %s", zDb);

	21672 return 0;

	21673 }

	21674

	21675 return pDb->aDb[i].pBt;

	21676 }

	21677

	21678 /*

	21679 ** Attempt to set the page size of the destination to match the page size

	21680 ** of the source.

	21681 */

	21682 static int setDestPgsz(sqlite3_backup *p){

	21683 int rc;

	21684 rc = sqlite3BtreeSetPageSize(p->pDest,sqlite3BtreeGetPageSize(p->pSrc),-1,0);

	21685 return rc;

	21686 }

	21687

	21688 /*

	21689 ** Check that there is no open read-transaction on the b-tree passed as the

	21690 ** second argument. If there is not, return SQLITE_OK. Otherwise, if there

	21691 ** is an open read-transaction, return SQLITE_ERROR and leave an error

	21692 ** message in database handle db.

	21693 */

	21694 static int checkReadTransaction(sqlite3 *db, Btree *p){

	21695 if( sqlite3BtreeIsInReadTrans(p) ){

	21696 sqlite3ErrorWithMsg(db, SQLITE_ERROR, "destination database is in use");

	21697 return SQLITE_ERROR;

	21698 }

	21699 return SQLITE_OK;

	21700 }

	21701

	21702 /*

	21703 ** Create an sqlite3_backup process to copy the contents of zSrcDb from

	21704 ** connection handle pSrcDb to zDestDb in pDestDb. If successful, return

	21705 ** a pointer to the new sqlite3_backup object.

	21706 **

	21707 ** If an error occurs, NULL is returned and an error code and error message

	21708 ** are stored in database handle pDestDb.

	21709 */

	21710 SQLITE_API sqlite3_backup *SQLITE_STDCALL sqlite3_backup_init(

	21711 sqlite3* pDestDb, /* Database to write to */

	21712 const char *zDestDb, /* Name of database within pDestDb */

	21713 sqlite3* pSrcDb, /* Database connection to read from */

	21714 const char *zSrcDb /* Name of database within pSrcDb */

	21715 ){

	21716 sqlite3_backup *p; /* Value to return */

	21717

	21718 #ifdef SQLITE_ENABLE_API_ARMOR

	21719 if( !sqlite3SafetyCheckOk(pSrcDb)||!sqlite3SafetyCheckOk(pDestDb) ){

	21720 (void)SQLITE_MISUSE_BKPT;

	21721 return 0;

	21722 }

	21723 #endif

	21724

	21725 /* Lock the source database handle. The destination database

	21726 ** handle is not locked in this routine, but it is locked in

	21727 ** sqlite3_backup_step(). The user is required to ensure that no

	21728 ** other thread accesses the destination handle for the duration

	21729 ** of the backup operation. Any attempt to use the destination

	21730 ** database connection while a backup is in progress may cause

	21731 ** a malfunction or a deadlock.

	21732 */

	21733 sqlite3_mutex_enter(pSrcDb->mutex);

	21734 sqlite3_mutex_enter(pDestDb->mutex);

	21735

	21736 if( pSrcDb==pDestDb ){

	21737 sqlite3ErrorWithMsg(

	21738 pDestDb, SQLITE_ERROR, "source and destination must be distinct"

	21739 );

	21740 p = 0;

	21741 }else {

	21742 /* Allocate space for a new sqlite3_backup object...

	21743 ** EVIDENCE-OF: R-64852-21591 The sqlite3_backup object is created by a

	21744 ** call to sqlite3_backup_init() and is destroyed by a call to

	21745 ** sqlite3_backup_finish(). */

	21746 p = (sqlite3_backup *)sqlite3MallocZero(sizeof(sqlite3_backup));

	21747 if( !p ){

	21748 sqlite3Error(pDestDb, SQLITE_NOMEM);

	21749 }

	21750 }

	21751

	21752 /* If the allocation succeeded, populate the new object. */

	21753 if( p ){

	21754 p->pSrc = findBtree(pDestDb, pSrcDb, zSrcDb);

	21755 p->pDest = findBtree(pDestDb, pDestDb, zDestDb);

	21756 p->pDestDb = pDestDb;

	21757 p->pSrcDb = pSrcDb;

	21758 p->iNext = 1;

	21759 p->isAttached = 0;

	21760

	21761 if( 0==p->pSrc || 0==p->pDest

	21762 || setDestPgsz(p)==SQLITE_NOMEM

	21763 || checkReadTransaction(pDestDb, p->pDest)!=SQLITE_OK

	21764 ){

	21765 /* One (or both) of the named databases did not exist or an OOM

	21766 ** error was hit. Or there is a transaction open on the destination

	21767 ** database. The error has already been written into the pDestDb

	21768 ** handle. All that is left to do here is free the sqlite3_backup

	21769 ** structure. */

	21770 sqlite3_free(p);

	21771 p = 0;

	21772 }

	21773 }

	21774 if( p ){

	21775 p->pSrc->nBackup++;

	21776 }

	21777

	21778 sqlite3_mutex_leave(pDestDb->mutex);

	21779 sqlite3_mutex_leave(pSrcDb->mutex);

	21780 return p;

	21781 }

	21782

	21783 /*

	21784 ** Argument rc is an SQLite error code. Return true if this error is

	21785 ** considered fatal if encountered during a backup operation. All errors

	21786 ** are considered fatal except for SQLITE_BUSY and SQLITE_LOCKED.

	21787 */

	21788 static int isFatalError(int rc){

	21789 return (rc!=SQLITE_OK && rc!=SQLITE_BUSY && ALWAYS(rc!=SQLITE_LOCKED));

	21790 }

	21791

	21792 /*

	21793 ** Parameter zSrcData points to a buffer containing the data for

	21794 ** page iSrcPg from the source database. Copy this data into the

	21795 ** destination database.

	21796 */

	21797 static int backupOnePage(

	21798 sqlite3_backup *p, /* Backup handle */

	21799 Pgno iSrcPg, /* Source database page to backup */

	21800 const u8 *zSrcData, /* Source database page data */

	21801 int bUpdate /* True for an update, false otherwise */

	21802 ){

	21803 Pager * const pDestPager = sqlite3BtreePager(p->pDest);

	21804 const int nSrcPgsz = sqlite3BtreeGetPageSize(p->pSrc);

	21805 int nDestPgsz = sqlite3BtreeGetPageSize(p->pDest);

	21806 const int nCopy = MIN(nSrcPgsz, nDestPgsz);

	21807 const i64 iEnd = (i64)iSrcPg*(i64)nSrcPgsz;

	21808 #ifdef SQLITE_HAS_CODEC

	21809 /* Use BtreeGetReserveNoMutex() for the source b-tree, as although it is

	21810 ** guaranteed that the shared-mutex is held by this thread, handle

	21811 ** p->pSrc may not actually be the owner. */

	21812 int nSrcReserve = sqlite3BtreeGetReserveNoMutex(p->pSrc);

	21813 int nDestReserve = sqlite3BtreeGetOptimalReserve(p->pDest);

	21814 #endif

	21815 int rc = SQLITE_OK;

	21816 i64 iOff;

	21817

	21818 assert( sqlite3BtreeGetReserveNoMutex(p->pSrc)>=0 );

	21819 assert( p->bDestLocked );

	21820 assert( !isFatalError(p->rc) );

	21821 assert( iSrcPg!=PENDING_BYTE_PAGE(p->pSrc->pBt) );

	21822 assert( zSrcData );

	21823

	21824 /* Catch the case where the destination is an in-memory database and the

	21825 ** page sizes of the source and destination differ.

	21826 */

	21827 if( nSrcPgsz!=nDestPgsz && sqlite3PagerIsMemdb(pDestPager) ){

	21828 rc = SQLITE_READONLY;

	21829 }

	21830

	21831 #ifdef SQLITE_HAS_CODEC

	21832 /* Backup is not possible if the page size of the destination is changing

	21833 ** and a codec is in use.

	21834 */

	21835 if( nSrcPgsz!=nDestPgsz && sqlite3PagerGetCodec(pDestPager)!=0 ){

	21836 rc = SQLITE_READONLY;

	21837 }

	21838

	21839 /* Backup is not possible if the number of bytes of reserve space differ

	21840 ** between source and destination. If there is a difference, try to

	21841 ** fix the destination to agree with the source. If that is not possible,

	21842 ** then the backup cannot proceed.

	21843 */

	21844 if( nSrcReserve!=nDestReserve ){

	21845 u32 newPgsz = nSrcPgsz;

	21846 rc = sqlite3PagerSetPagesize(pDestPager, &newPgsz, nSrcReserve);

	21847 if( rc==SQLITE_OK && newPgsz!=nSrcPgsz ) rc = SQLITE_READONLY;

	21848 }

	21849 #endif

	21850

	21851 /* This loop runs once for each destination page spanned by the source

	21852 ** page. For each iteration, variable iOff is set to the byte offset

	21853 ** of the destination page.

	21854 */

	21855 for(iOff=iEnd-(i64)nSrcPgsz; rc==SQLITE_OK && iOff<iEnd; iOff+=nDestPgsz){

	21856 DbPage *pDestPg = 0;

	21857 Pgno iDest = (Pgno)(iOff/nDestPgsz)+1;

	21858 if( iDest==PENDING_BYTE_PAGE(p->pDest->pBt) ) continue;

	21859 if( SQLITE_OK==(rc = sqlite3PagerGet(pDestPager, iDest, &pDestPg, 0))

	21860 && SQLITE_OK==(rc = sqlite3PagerWrite(pDestPg))

	21861 ){

	21862 const u8 *zIn = &zSrcData[iOff%nSrcPgsz];

	21863 u8 *zDestData = sqlite3PagerGetData(pDestPg);

	21864 u8 *zOut = &zDestData[iOff%nDestPgsz];

	21865

	21866 /* Copy the data from the source page into the destination page.

	21867 ** Then clear the Btree layer MemPage.isInit flag. Both this module

	21868 ** and the pager code use this trick (clearing the first byte

	21869 ** of the page 'extra' space to invalidate the Btree layers

	21870 ** cached parse of the page). MemPage.isInit is marked

	21871 ** "MUST BE FIRST" for this purpose.

	21872 */

	21873 memcpy(zOut, zIn, nCopy);

	21874 ((u8 *)sqlite3PagerGetExtra(pDestPg))[0] = 0;

	21875 if( iOff==0 && bUpdate==0 ){

	21876 sqlite3Put4byte(&zOut[28], sqlite3BtreeLastPage(p->pSrc));

	21877 }

	21878 }

	21879 sqlite3PagerUnref(pDestPg);

	21880 }

	21881

	21882 return rc;

	21883 }

	21884

	21885 /*

	21886 ** If pFile is currently larger than iSize bytes, then truncate it to

	21887 ** exactly iSize bytes. If pFile is not larger than iSize bytes, then

	21888 ** this function is a no-op.

	21889 **

	21890 ** Return SQLITE_OK if everything is successful, or an SQLite error

	21891 ** code if an error occurs.

	21892 */

	21893 static int backupTruncateFile(sqlite3_file *pFile, i64 iSize){

	21894 i64 iCurrent;

	21895 int rc = sqlite3OsFileSize(pFile, &iCurrent);

	21896 if( rc==SQLITE_OK && iCurrent>iSize ){

	21897 rc = sqlite3OsTruncate(pFile, iSize);

	21898 }

	21899 return rc;

	21900 }

	21901

	21902 /*

	21903 ** Register this backup object with the associated source pager for

	21904 ** callbacks when pages are changed or the cache invalidated.

	21905 */

	21906 static void attachBackupObject(sqlite3_backup *p){

	21907 sqlite3_backup **pp;

	21908 assert( sqlite3BtreeHoldsMutex(p->pSrc) );

	21909 pp = sqlite3PagerBackupPtr(sqlite3BtreePager(p->pSrc));

	21910 p->pNext = *pp;

	21911 *pp = p;

	21912 p->isAttached = 1;

	21913 }

	21914

	21915 /*

	21916 ** Copy nPage pages from the source b-tree to the destination.

	21917 */

	21918 SQLITE_API int SQLITE_STDCALL sqlite3_backup_step(sqlite3_backup *p, int nPage){

	21919 int rc;

	21920 int destMode; /* Destination journal mode */

	21921 int pgszSrc = 0; /* Source page size */

	21922 int pgszDest = 0; /* Destination page size */

	21923

	21924 #ifdef SQLITE_ENABLE_API_ARMOR

	21925 if( p==0 ) return SQLITE_MISUSE_BKPT;

	21926 #endif

	21927 sqlite3_mutex_enter(p->pSrcDb->mutex);

	21928 sqlite3BtreeEnter(p->pSrc);

	21929 if( p->pDestDb ){

	21930 sqlite3_mutex_enter(p->pDestDb->mutex);

	21931 }

	21932

	21933 rc = p->rc;

	21934 if( !isFatalError(rc) ){

	21935 Pager * const pSrcPager = sqlite3BtreePager(p->pSrc); /* Source pager */

	21936 Pager * const pDestPager = sqlite3BtreePager(p->pDest); /* Dest pager */

	21937 int ii; /* Iterator variable */

	21938 int nSrcPage = -1; /* Size of source db in pages */

	21939 int bCloseTrans = 0; /* True if src db requires unlocking */

	21940

	21941 /* If the source pager is currently in a write-transaction, return

	21942 ** SQLITE_BUSY immediately.

	21943 */

	21944 if( p->pDestDb && p->pSrc->pBt->inTransaction==TRANS_WRITE ){

	21945 rc = SQLITE_BUSY;

	21946 }else{

	21947 rc = SQLITE_OK;

	21948 }

	21949

	21950 /* Lock the destination database, if it is not locked already. */

	21951 if( SQLITE_OK==rc && p->bDestLocked==0

	21952 && SQLITE_OK==(rc = sqlite3BtreeBeginTrans(p->pDest, 2))

	21953 ){

	21954 p->bDestLocked = 1;

	21955 sqlite3BtreeGetMeta(p->pDest, BTREE_SCHEMA_VERSION, &p->iDestSchema);

	21956 }

	21957

	21958 /* If there is no open read-transaction on the source database, open

	21959 ** one now. If a transaction is opened here, then it will be closed

	21960 ** before this function exits.

	21961 */

	21962 if( rc==SQLITE_OK && 0==sqlite3BtreeIsInReadTrans(p->pSrc) ){

	21963 rc = sqlite3BtreeBeginTrans(p->pSrc, 0);

	21964 bCloseTrans = 1;

	21965 }

	21966

	21967 /* Do not allow backup if the destination database is in WAL mode

	21968 ** and the page sizes are different between source and destination */

	21969 pgszSrc = sqlite3BtreeGetPageSize(p->pSrc);

	21970 pgszDest = sqlite3BtreeGetPageSize(p->pDest);

	21971 destMode = sqlite3PagerGetJournalMode(sqlite3BtreePager(p->pDest));

	21972 if( SQLITE_OK==rc && destMode==PAGER_JOURNALMODE_WAL && pgszSrc!=pgszDest ){

	21973 rc = SQLITE_READONLY;

	21974 }

	21975

	21976 /* Now that there is a read-lock on the source database, query the

	21977 ** source pager for the number of pages in the database.

	21978 */

	21979 nSrcPage = (int)sqlite3BtreeLastPage(p->pSrc);

	21980 assert( nSrcPage>=0 );

	21981 for(ii=0; (nPage<0 || ii<nPage) && p->iNext<=(Pgno)nSrcPage && !rc; ii++){

	21982 const Pgno iSrcPg = p->iNext; /* Source page number */

	21983 if( iSrcPg!=PENDING_BYTE_PAGE(p->pSrc->pBt) ){

	21984 DbPage *pSrcPg; /* Source page object */

	21985 rc = sqlite3PagerGet(pSrcPager, iSrcPg, &pSrcPg,PAGER_GET_READONLY);

	21986 if( rc==SQLITE_OK ){

	21987 rc = backupOnePage(p, iSrcPg, sqlite3PagerGetData(pSrcPg), 0);

	21988 sqlite3PagerUnref(pSrcPg);

	21989 }

	21990 }

	21991 p->iNext++;

	21992 }

	21993 if( rc==SQLITE_OK ){

	21994 p->nPagecount = nSrcPage;

	21995 p->nRemaining = nSrcPage+1-p->iNext;

	21996 if( p->iNext>(Pgno)nSrcPage ){

	21997 rc = SQLITE_DONE;

	21998 }else if( !p->isAttached ){

	21999 attachBackupObject(p);

	22000 }

	22001 }

	22002

	22003 /* Update the schema version field in the destination database. This

	22004 ** is to make sure that the schema-version really does change in

	22005 ** the case where the source and destination databases have the

	22006 ** same schema version.

	22007 */

	22008 if( rc==SQLITE_DONE ){

	22009 if( nSrcPage==0 ){

	22010 rc = sqlite3BtreeNewDb(p->pDest);

	22011 nSrcPage = 1;

	22012 }

	22013 if( rc==SQLITE_OK || rc==SQLITE_DONE ){

	22014 rc = sqlite3BtreeUpdateMeta(p->pDest,1,p->iDestSchema+1);

	22015 }

	22016 if( rc==SQLITE_OK ){

	22017 if( p->pDestDb ){

	22018 sqlite3ResetAllSchemasOfConnection(p->pDestDb);

	22019 }

	22020 if( destMode==PAGER_JOURNALMODE_WAL ){

	22021 rc = sqlite3BtreeSetVersion(p->pDest, 2);

	22022 }

	22023 }

	22024 if( rc==SQLITE_OK ){

	22025 int nDestTruncate;

	22026 /* Set nDestTruncate to the final number of pages in the destination

	22027 ** database. The complication here is that the destination page

	22028 ** size may be different to the source page size.

	22029 **

	22030 ** If the source page size is smaller than the destination page size,

	22031 ** round up. In this case the call to sqlite3OsTruncate() below will

	22032 ** fix the size of the file. However it is important to call

	22033 ** sqlite3PagerTruncateImage() here so that any pages in the

	22034 ** destination file that lie beyond the nDestTruncate page mark are

	22035 ** journalled by PagerCommitPhaseOne() before they are destroyed

	22036 ** by the file truncation.

	22037 */

	22038 assert( pgszSrc==sqlite3BtreeGetPageSize(p->pSrc) );

	22039 assert( pgszDest==sqlite3BtreeGetPageSize(p->pDest) );

	22040 if( pgszSrc<pgszDest ){

	22041 int ratio = pgszDest/pgszSrc;

	22042 nDestTruncate = (nSrcPage+ratio-1)/ratio;

	22043 if( nDestTruncate==(int)PENDING_BYTE_PAGE(p->pDest->pBt) ){

	22044 nDestTruncate--;

	22045 }

	22046 }else{

	22047 nDestTruncate = nSrcPage * (pgszSrc/pgszDest);

	22048 }

	22049 assert( nDestTruncate>0 );

	22050

	22051 if( pgszSrc<pgszDest ){

	22052 /* If the source page-size is smaller than the destination page-size,

	22053 ** two extra things may need to happen:

	22054 **

	22055 ** * The destination may need to be truncated, and

	22056 **

	22057 ** * Data stored on the pages immediately following the

	22058 ** pending-byte page in the source database may need to be

	22059 ** copied into the destination database.

	22060 */

	22061 const i64 iSize = (i64)pgszSrc * (i64)nSrcPage;

	22062 sqlite3_file * const pFile = sqlite3PagerFile(pDestPager);

	22063 Pgno iPg;

	22064 int nDstPage;

	22065 i64 iOff;

	22066 i64 iEnd;

	22067

	22068 assert( pFile );

	22069 assert( nDestTruncate==0

	22070 || (i64)nDestTruncate*(i64)pgszDest >= iSize || (

	22071 nDestTruncate==(int)(PENDING_BYTE_PAGE(p->pDest->pBt)-1)

	22072 && iSize>=PENDING_BYTE && iSize<=PENDING_BYTE+pgszDest

	22073 ));

	22074

	22075 /* This block ensures that all data required to recreate the original

	22076 ** database has been stored in the journal for pDestPager and the

	22077 ** journal synced to disk. So at this point we may safely modify

	22078 ** the database file in any way, knowing that if a power failure

	22079 ** occurs, the original database will be reconstructed from the

	22080 ** journal file. */

	22081 sqlite3PagerPagecount(pDestPager, &nDstPage);

	22082 for(iPg=nDestTruncate; rc==SQLITE_OK && iPg<=(Pgno)nDstPage; iPg++){

	22083 if( iPg!=PENDING_BYTE_PAGE(p->pDest->pBt) ){

	22084 DbPage *pPg;

	22085 rc = sqlite3PagerGet(pDestPager, iPg, &pPg, 0);

	22086 if( rc==SQLITE_OK ){

	22087 rc = sqlite3PagerWrite(pPg);

	22088 sqlite3PagerUnref(pPg);

	22089 }

	22090 }

	22091 }

	22092 if( rc==SQLITE_OK ){

	22093 rc = sqlite3PagerCommitPhaseOne(pDestPager, 0, 1);

	22094 }

	22095

	22096 /* Write the extra pages and truncate the database file as required */

	22097 iEnd = MIN(PENDING_BYTE + pgszDest, iSize);

	22098 for(

	22099 iOff=PENDING_BYTE+pgszSrc;

	22100 rc==SQLITE_OK && iOff<iEnd;

	22101 iOff+=pgszSrc

	22102 ){

	22103 PgHdr *pSrcPg = 0;

	22104 const Pgno iSrcPg = (Pgno)((iOff/pgszSrc)+1);

	22105 rc = sqlite3PagerGet(pSrcPager, iSrcPg, &pSrcPg, 0);

	22106 if( rc==SQLITE_OK ){

	22107 u8 *zData = sqlite3PagerGetData(pSrcPg);

	22108 rc = sqlite3OsWrite(pFile, zData, pgszSrc, iOff);

	22109 }

	22110 sqlite3PagerUnref(pSrcPg);

	22111 }

	22112 if( rc==SQLITE_OK ){

	22113 rc = backupTruncateFile(pFile, iSize);

	22114 }

	22115

	22116 /* Sync the database file to disk. */

	22117 if( rc==SQLITE_OK ){

	22118 rc = sqlite3PagerSync(pDestPager, 0);

	22119 }

	22120 }else{

	22121 sqlite3PagerTruncateImage(pDestPager, nDestTruncate);

	22122 rc = sqlite3PagerCommitPhaseOne(pDestPager, 0, 0);

	22123 }

	22124

	22125 /* Finish committing the transaction to the destination database. */

	22126 if( SQLITE_OK==rc

	22127 && SQLITE_OK==(rc = sqlite3BtreeCommitPhaseTwo(p->pDest, 0))

	22128 ){

	22129 rc = SQLITE_DONE;

	22130 }

	22131 }

	22132 }

	22133

	22134 /* If bCloseTrans is true, then this function opened a read transaction

	22135 ** on the source database. Close the read transaction here. There is

	22136 ** no need to check the return values of the btree methods here, as

	22137 ** "committing" a read-only transaction cannot fail.

	22138 */

	22139 if( bCloseTrans ){

	22140 TESTONLY( int rc2 );

	22141 TESTONLY( rc2 = ) sqlite3BtreeCommitPhaseOne(p->pSrc, 0);

	22142 TESTONLY( rc2 |= ) sqlite3BtreeCommitPhaseTwo(p->pSrc, 0);

	22143 assert( rc2==SQLITE_OK );

	22144 }

	22145

	22146 if( rc==SQLITE_IOERR_NOMEM ){

	22147 rc = SQLITE_NOMEM;

	22148 }

	22149 p->rc = rc;

	22150 }

	22151 if( p->pDestDb ){

	22152 sqlite3_mutex_leave(p->pDestDb->mutex);

	22153 }

	22154 sqlite3BtreeLeave(p->pSrc);

	22155 sqlite3_mutex_leave(p->pSrcDb->mutex);

	22156 return rc;

	22157 }

	22158

	22159 /*

	22160 ** Release all resources associated with an sqlite3_backup* handle.

	22161 */

	22162 SQLITE_API int SQLITE_STDCALL sqlite3_backup_finish(sqlite3_backup *p){

	22163 sqlite3_backup **pp; /* Ptr to head of pagers backup list */

	22164 sqlite3 *pSrcDb; /* Source database connection */

	22165 int rc; /* Value to return */

	22166

	22167 /* Enter the mutexes */

	22168 if( p==0 ) return SQLITE_OK;

	22169 pSrcDb = p->pSrcDb;

	22170 sqlite3_mutex_enter(pSrcDb->mutex);

	22171 sqlite3BtreeEnter(p->pSrc);

	22172 if( p->pDestDb ){

	22173 sqlite3_mutex_enter(p->pDestDb->mutex);

	22174 }

	22175

	22176 /* Detach this backup from the source pager. */

	22177 if( p->pDestDb ){

	22178 p->pSrc->nBackup--;

	22179 }

	22180 if( p->isAttached ){

	22181 pp = sqlite3PagerBackupPtr(sqlite3BtreePager(p->pSrc));

	22182 while( *pp!=p ){

	22183 pp = &(*pp)->pNext;

	22184 }

	22185 *pp = p->pNext;

	22186 }

	22187

	22188 /* If a transaction is still open on the Btree, roll it back. */

	22189 sqlite3BtreeRollback(p->pDest, SQLITE_OK, 0);

	22190

	22191 /* Set the error code of the destination database handle. */

	22192 rc = (p->rc==SQLITE_DONE) ? SQLITE_OK : p->rc;

	22193 if( p->pDestDb ){

	22194 sqlite3Error(p->pDestDb, rc);

	22195

	22196 /* Exit the mutexes and free the backup context structure. */

	22197 sqlite3LeaveMutexAndCloseZombie(p->pDestDb);

	22198 }

	22199 sqlite3BtreeLeave(p->pSrc);

	22200 if( p->pDestDb ){

	22201 /* EVIDENCE-OF: R-64852-21591 The sqlite3_backup object is created by a

	22202 ** call to sqlite3_backup_init() and is destroyed by a call to

	22203 ** sqlite3_backup_finish(). */

	22204 sqlite3_free(p);

	22205 }

	22206 sqlite3LeaveMutexAndCloseZombie(pSrcDb);

	22207 return rc;

	22208 }

	22209

	22210 /*

	22211 ** Return the number of pages still to be backed up as of the most recent

	22212 ** call to sqlite3_backup_step().

	22213 */

	22214 SQLITE_API int SQLITE_STDCALL sqlite3_backup_remaining(sqlite3_backup *p){

	22215 #ifdef SQLITE_ENABLE_API_ARMOR

	22216 if( p==0 ){

	22217 (void)SQLITE_MISUSE_BKPT;

	22218 return 0;

	22219 }

	22220 #endif

	22221 return p->nRemaining;

	22222 }

	22223

	22224 /*

	22225 ** Return the total number of pages in the source database as of the most

	22226 ** recent call to sqlite3_backup_step().

	22227 */

	22228 SQLITE_API int SQLITE_STDCALL sqlite3_backup_pagecount(sqlite3_backup *p){

	22229 #ifdef SQLITE_ENABLE_API_ARMOR

	22230 if( p==0 ){

	22231 (void)SQLITE_MISUSE_BKPT;

	22232 return 0;

	22233 }

	22234 #endif

	22235 return p->nPagecount;

	22236 }

	22237

	22238 /*

	22239 ** This function is called after the contents of page iPage of the

	22240 ** source database have been modified. If page iPage has already been

	22241 ** copied into the destination database, then the data written to the

	22242 ** destination is now invalidated. The destination copy of iPage needs

	22243 ** to be updated with the new data before the backup operation is

	22244 ** complete.

	22245 **

	22246 ** It is assumed that the mutex associated with the BtShared object

	22247 ** corresponding to the source database is held when this function is

	22248 ** called.

	22249 */

	22250 static SQLITE_NOINLINE void backupUpdate(

	22251 sqlite3_backup *p,

	22252 Pgno iPage,

	22253 const u8 *aData

	22254 ){

	22255 assert( p!=0 );

	22256 do{

	22257 assert( sqlite3_mutex_held(p->pSrc->pBt->mutex) );

	22258 if( !isFatalError(p->rc) && iPage<p->iNext ){

	22259 /* The backup process p has already copied page iPage. But now it

	22260 ** has been modified by a transaction on the source pager. Copy

	22261 ** the new data into the backup.

	22262 */

	22263 int rc;

	22264 assert( p->pDestDb );

	22265 sqlite3_mutex_enter(p->pDestDb->mutex);

	22266 rc = backupOnePage(p, iPage, aData, 1);

	22267 sqlite3_mutex_leave(p->pDestDb->mutex);

	22268 assert( rc!=SQLITE_BUSY && rc!=SQLITE_LOCKED );

	22269 if( rc!=SQLITE_OK ){

	22270 p->rc = rc;

	22271 }

	22272 }

	22273 }while( (p = p->pNext)!=0 );

	22274 }

	22275 SQLITE_PRIVATE void sqlite3BackupUpdate(sqlite3_backup *pBackup, Pgno iPage, const u8 *aData){

	22276 if( pBackup ) backupUpdate(pBackup, iPage, aData);

	22277 }

	22278

	22279 /*

	22280 ** Restart the backup process. This is called when the pager layer

	22281 ** detects that the database has been modified by an external database

	22282 ** connection. In this case there is no way of knowing which of the

	22283 ** pages that have been copied into the destination database are still

	22284 ** valid and which are not, so the entire process needs to be restarted.

	22285 **

	22286 ** It is assumed that the mutex associated with the BtShared object

	22287 ** corresponding to the source database is held when this function is

	22288 ** called.

	22289 */

	22290 SQLITE_PRIVATE void sqlite3BackupRestart(sqlite3_backup *pBackup){

	22291 sqlite3_backup *p; /* Iterator variable */

	22292 for(p=pBackup; p; p=p->pNext){

	22293 assert( sqlite3_mutex_held(p->pSrc->pBt->mutex) );

	22294 p->iNext = 1;

	22295 }

	22296 }

	22297

	22298 #ifndef SQLITE_OMIT_VACUUM

	22299 /*

	22300 ** Copy the complete content of pBtFrom into pBtTo. A transaction

	22301 ** must be active for both files.

	22302 **

	22303 ** The size of file pTo may be reduced by this operation. If anything

	22304 ** goes wrong, the transaction on pTo is rolled back. If successful, the

	22305 ** transaction is committed before returning.

	22306 */

	22307 SQLITE_PRIVATE int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){

	22308 int rc;

	22309 sqlite3_file *pFd; /* File descriptor for database pTo */

	22310 sqlite3_backup b;

	22311 sqlite3BtreeEnter(pTo);

	22312 sqlite3BtreeEnter(pFrom);

	22313

	22314 assert( sqlite3BtreeIsInTrans(pTo) );

	22315 pFd = sqlite3PagerFile(sqlite3BtreePager(pTo));

	22316 if( pFd->pMethods ){

	22317 i64 nByte = sqlite3BtreeGetPageSize(pFrom)*(i64)sqlite3BtreeLastPage(pFrom);

	22318 rc = sqlite3OsFileControl(pFd, SQLITE_FCNTL_OVERWRITE, &nByte);

	22319 if( rc==SQLITE_NOTFOUND ) rc = SQLITE_OK;

	22320 if( rc ) goto copy_finished;

	22321 }

	22322

	22323 /* Set up an sqlite3_backup object. sqlite3_backup.pDestDb must be set

	22324 ** to 0. This is used by the implementations of sqlite3_backup_step()

	22325 ** and sqlite3_backup_finish() to detect that they are being called

	22326 ** from this function, not directly by the user.

	22327 */

	22328 memset(&b, 0, sizeof(b));

	22329 b.pSrcDb = pFrom->db;

	22330 b.pSrc = pFrom;

	22331 b.pDest = pTo;

	22332 b.iNext = 1;

	22333

	22334 #ifdef SQLITE_HAS_CODEC

	22335 sqlite3PagerAlignReserve(sqlite3BtreePager(pTo), sqlite3BtreePager(pFrom));

	22336 #endif

	22337

	22338 /* 0x7FFFFFFF is the hard limit for the number of pages in a database

	22339 ** file. By passing this as the number of pages to copy to

	22340 ** sqlite3_backup_step(), we can guarantee that the copy finishes

	22341 ** within a single call (unless an error occurs). The assert() statement

	22342 ** checks this assumption - (p->rc) should be set to either SQLITE_DONE

	22343 ** or an error code.

	22344 */

	22345 sqlite3_backup_step(&b, 0x7FFFFFFF);

	22346 assert( b.rc!=SQLITE_OK );

	22347 rc = sqlite3_backup_finish(&b);

	22348 if( rc==SQLITE_OK ){

	22349 pTo->pBt->btsFlags &= ~BTS_PAGESIZE_FIXED;

	22350 }else{

	22351 sqlite3PagerClearCache(sqlite3BtreePager(b.pDest));

	22352 }

	22353

	22354 assert( sqlite3BtreeIsInTrans(pTo)==0 );

	22355 copy_finished:

	22356 sqlite3BtreeLeave(pFrom);

	22357 sqlite3BtreeLeave(pTo);

	22358 return rc;

	22359 }

	22360 #endif /* SQLITE_OMIT_VACUUM */

	22361

	22362 /************ End of backup.c ********************************************/

	22363

	22364 /* Chain include. */

	22365 #include "sqlite3.03.c"

OLD	NEW

« no previous file with comments | « third_party/sqlite/amalgamation/sqlite3.01.c ('k') | third_party/sqlite/amalgamation/sqlite3.03.c » ('j') | no next file with comments »