third_party/sqlite/sqlite-src-3100200/src/btree.c - Issue 2846743003: [sql] Remove SQLite 3.10.2 reference directory.

Side by Side Diff: third_party/sqlite/sqlite-src-3100200/src/btree.c

Issue 2846743003: [sql] Remove SQLite 3.10.2 reference directory. (Closed)

Patch Set: Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /*

2 ** 2004 April 6

3 **

4 ** The author disclaims copyright to this source code. In place of

5 ** a legal notice, here is a blessing:

6 **

7 ** May you do good and not evil.

8 ** May you find forgiveness for yourself and forgive others.

9 ** May you share freely, never taking more than you give.

10 **

11 *************************************************************************

12 ** This file implements an external (disk-based) database using BTrees.

13 ** See the header comment on "btreeInt.h" for additional information.

14 ** Including a description of file format and an overview of operation.

15 */

16 #include "btreeInt.h"

17

18 /*

19 ** The header string that appears at the beginning of every

20 ** SQLite database.

21 */

22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;

23

24 /*

25 ** Set this global variable to 1 to enable tracing using the TRACE

26 ** macro.

27 */

28 #if 0

29 int sqlite3BtreeTrace=1; /* True to enable tracing */

30 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);}

31 #else

32 # define TRACE(X)

33 #endif

34

35 /*

36 ** Extract a 2-byte big-endian integer from an array of unsigned bytes.

37 ** But if the value is zero, make it 65536.

38 **

39 ** This routine is used to extract the "offset to cell content area" value

40 ** from the header of a btree page. If the page size is 65536 and the page

41 ** is empty, the offset should be 65536, but the 2-byte value stores zero.

42 ** This routine makes the necessary adjustment to 65536.

43 */

44 #define get2byteNotZero(X) (((((int)get2byte(X))-1)&0xffff)+1)

45

46 /*

47 ** Values passed as the 5th argument to allocateBtreePage()

48 */

49 #define BTALLOC_ANY 0 /* Allocate any page */

50 #define BTALLOC_EXACT 1 /* Allocate exact page if possible */

51 #define BTALLOC_LE 2 /* Allocate any page <= the parameter */

52

53 /*

54 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not

55 ** defined, or 0 if it is. For example:

56 **

57 ** bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);

58 */

59 #ifndef SQLITE_OMIT_AUTOVACUUM

60 #define IfNotOmitAV(expr) (expr)

61 #else

62 #define IfNotOmitAV(expr) 0

63 #endif

64

65 #ifndef SQLITE_OMIT_SHARED_CACHE

66 /*

67 ** A list of BtShared objects that are eligible for participation

68 ** in shared cache. This variable has file scope during normal builds,

69 ** but the test harness needs to access it so we make it global for

70 ** test builds.

71 **

72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.

73 */

74 #ifdef SQLITE_TEST

75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;

76 #else

77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;

78 #endif

79 #endif /* SQLITE_OMIT_SHARED_CACHE */

80

81 #ifndef SQLITE_OMIT_SHARED_CACHE

82 /*

83 ** Enable or disable the shared pager and schema features.

84 **

85 ** This routine has no effect on existing database connections.

86 ** The shared cache setting effects only future calls to

87 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().

88 */

89 int sqlite3_enable_shared_cache(int enable){

90 sqlite3GlobalConfig.sharedCacheEnabled = enable;

91 return SQLITE_OK;

92 }

93 #endif

94

95

96

#ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** querySharedCacheTableLock(), setSharedCacheTableLock() and
  ** clearAllSharedCacheTableLocks() maintain the BtShared.pLock linked
  ** list that holds shared-cache table level locks.  When the library
  ** is built without the shared-cache feature every BtShared structure
  ** has exactly one user, so no such locking is needed.  The lock
  ** related routines are therefore defined as no-ops here: the query
  ** and set operations always succeed, the assert() helpers report
  ** "lock held" and "no conflict", and the clear/downgrade operations
  ** expand to nothing.
  */
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
#endif

114

115 #ifndef SQLITE_OMIT_SHARED_CACHE

116

#ifdef SQLITE_DEBUG
/*
**** This function is only used as part of an assert() statement. ***
**
** Check to see if pBtree holds the required locks to read or write to the
** table with root page iRoot.   Return 1 if it does and 0 if not.
**
** For example, when writing to a table with root-page iRoot via
** Btree connection pBtree:
**
**    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
**
** When writing to an index that resides in a sharable database, the
** caller should have first obtained a lock specifying the root page of
** the corresponding table. This makes things a bit more complicated,
** as this module treats each table as a separate structure. To determine
** the table corresponding to the index being written, this
** function has to search through the database schema.
**
** Instead of a lock on the table/index rooted at page iRoot, the caller may
** hold a write-lock on the schema table (root page 1). This is also
** acceptable.
*/
static int hasSharedCacheTableLock(
  Btree *pBtree,         /* Handle that must hold lock */
  Pgno iRoot,            /* Root page of b-tree */
  int isIndex,           /* True if iRoot is the root of an index b-tree */
  int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
){
  Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
  Pgno iTab = 0;
  BtLock *pLock;

  /* If this database is not shareable, or if the client is reading
  ** and has the read-uncommitted flag set, then no lock is required.
  ** Return true immediately.
  */
  if( (pBtree->sharable==0)
   || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
  ){
    return 1;
  }

  /* If the client is reading or writing an index and the schema is
  ** not loaded, then it is too difficult to actually check to see if
  ** the correct locks are held.  So do not bother - just return true.
  ** This case does not come up very often anyhow.
  */
  if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
    return 1;
  }

  /* Figure out the root-page that the lock should be held on. For table
  ** b-trees, this is just the root page of the b-tree being read or
  ** written. For index b-trees, it is the root page of the associated
  ** table.  */
  if( isIndex ){
    HashElem *p;
    for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
      Index *pIdx = (Index *)sqliteHashData(p);
      if( pIdx->tnum==(int)iRoot ){
        if( iTab ){
          /* Two or more indexes share the same root page.  There must
          ** be imposter tables.  So just return true.  The assert is not
          ** useful in that case. */
          return 1;
        }
        iTab = pIdx->pTable->tnum;
      }
    }
  }else{
    iTab = iRoot;
  }

  /* Search for the required lock. Either a write-lock on root-page iTab, a
  ** write-lock on the schema table, or (if the client is reading) a
  ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
  for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
    if( pLock->pBtree==pBtree
     && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
     && pLock->eLock>=eLockType
    ){
      return 1;
    }
  }

  /* Failed to find the required lock. */
  return 0;
}
#endif /* SQLITE_DEBUG */

207

#ifdef SQLITE_DEBUG
/*
**** This function may be used as part of assert() statements only. ****
**
** Return true if pBtree must not write into the table or index rooted
** at iRoot because other shared connections are reading that same
** table or index at the same time.
**
** A write by pBtree is illegal whenever some other Btree object that
** shares the same BtShared object has a cursor open on the iRoot
** table.  The exception is that a cursor belonging to a Btree whose
** connection has the read-uncommitted flag set does not count.
**
** For example, before writing to any part of the table or index
** rooted at page iRoot, one should call:
**
**    assert( !hasReadConflicts(pBtree, iRoot) );
*/
static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
  BtCursor *pCsr = pBtree->pBt->pCursor;
  while( pCsr ){
    if( pCsr->pgnoRoot==iRoot
     && pCsr->pBtree!=pBtree
     && (pCsr->pBtree->db->flags & SQLITE_ReadUncommitted)==0
    ){
      return 1;
    }
    pCsr = pCsr->pNext;
  }
  return 0;
}
#endif    /* #ifdef SQLITE_DEBUG */

240

241 /*

242 ** Query to see if Btree handle p may obtain a lock of type eLock

243 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return

244 ** SQLITE_OK if the lock may be obtained (by calling

245 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.

246 */

247 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){

248 BtShared *pBt = p->pBt;

249 BtLock *pIter;

250

251 assert( sqlite3BtreeHoldsMutex(p) );

252 assert( eLock==READ_LOCK \|\| eLock==WRITE_LOCK );

253 assert( p->db!=0 );

254 assert( !(p->db->flags&SQLITE_ReadUncommitted)\|\|eLock==WRITE_LOCK\|\|iTab==1 );

255

256 /* If requesting a write-lock, then the Btree must have an open write

257 ** transaction on this file. And, obviously, for this to be so there

258 ** must be an open write transaction on the file itself.

259 */

260 assert( eLock==READ_LOCK \|\| (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );

261 assert( eLock==READ_LOCK \|\| pBt->inTransaction==TRANS_WRITE );

262

263 /* This routine is a no-op if the shared-cache is not enabled */

264 if( !p->sharable ){

265 return SQLITE_OK;

266 }

267

268 /* If some other connection is holding an exclusive lock, the

269 ** requested lock may not be obtained.

270 */

271 if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){

272 sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);

273 return SQLITE_LOCKED_SHAREDCACHE;

274 }

275

276 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){

277 /* The condition (pIter->eLock!=eLock) in the following if(...)

278 ** statement is a simplification of:

279 **

280 ** (eLock==WRITE_LOCK \|\| pIter->eLock==WRITE_LOCK)

281 **

282 ** since we know that if eLock==WRITE_LOCK, then no other connection

283 ** may hold a WRITE_LOCK on any table in this file (since there can

284 ** only be a single writer).

285 */

286 assert( pIter->eLock==READ_LOCK \|\| pIter->eLock==WRITE_LOCK );

287 assert( eLock==READ_LOCK \|\| pIter->pBtree==p \|\| pIter->eLock==READ_LOCK);

288 if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){

289 sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);

290 if( eLock==WRITE_LOCK ){

291 assert( p==pBt->pWriter );

292 pBt->btsFlags \|= BTS_PENDING;

293 }

294 return SQLITE_LOCKED_SHAREDCACHE;

295 }

296 }

297 return SQLITE_OK;

298 }

299 #endif /* !SQLITE_OMIT_SHARED_CACHE */

300

301 #ifndef SQLITE_OMIT_SHARED_CACHE

302 /*

303 ** Add a lock on the table with root-page iTable to the shared-btree used

304 ** by Btree handle p. Parameter eLock must be either READ_LOCK or

305 ** WRITE_LOCK.

306 **

307 ** This function assumes the following:

308 **

309 ** (a) The specified Btree object p is connected to a sharable

310 ** database (one with the BtShared.sharable flag set), and

311 **

312 ** (b) No other Btree objects hold a lock that conflicts

313 ** with the requested lock (i.e. querySharedCacheTableLock() has

314 ** already been called and returned SQLITE_OK).

315 **

316 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM

317 ** is returned if a malloc attempt fails.

318 */

319 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){

320 BtShared *pBt = p->pBt;

321 BtLock *pLock = 0;

322 BtLock *pIter;

323

324 assert( sqlite3BtreeHoldsMutex(p) );

325 assert( eLock==READ_LOCK \|\| eLock==WRITE_LOCK );

326 assert( p->db!=0 );

327

328 /* A connection with the read-uncommitted flag set will never try to

329 ** obtain a read-lock using this function. The only read-lock obtained

330 ** by a connection in read-uncommitted mode is on the sqlite_master

331 ** table, and that lock is obtained in BtreeBeginTrans(). */

332 assert( 0==(p->db->flags&SQLITE_ReadUncommitted) \|\| eLock==WRITE_LOCK );

333

334 /* This function should only be called on a sharable b-tree after it

335 ** has been determined that no other b-tree holds a conflicting lock. */

336 assert( p->sharable );

337 assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );

338

339 /* First search the list for an existing lock on this table. */

340 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){

341 if( pIter->iTable==iTable && pIter->pBtree==p ){

342 pLock = pIter;

343 break;

344 }

345 }

346

347 /* If the above search did not find a BtLock struct associating Btree p

348 ** with table iTable, allocate one and link it into the list.

349 */

350 if( !pLock ){

351 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));

352 if( !pLock ){

353 return SQLITE_NOMEM;

354 }

355 pLock->iTable = iTable;

356 pLock->pBtree = p;

357 pLock->pNext = pBt->pLock;

358 pBt->pLock = pLock;

359 }

360

361 /* Set the BtLock.eLock variable to the maximum of the current lock

362 ** and the requested lock. This means if a write-lock was already held

363 ** and a read-lock requested, we don't incorrectly downgrade the lock.

364 */

365 assert( WRITE_LOCK>READ_LOCK );

366 if( eLock>pLock->eLock ){

367 pLock->eLock = eLock;

368 }

369

370 return SQLITE_OK;

371 }

372 #endif /* !SQLITE_OMIT_SHARED_CACHE */

373

374 #ifndef SQLITE_OMIT_SHARED_CACHE

375 /*

376 ** Release all the table locks (locks obtained via calls to

377 ** the setSharedCacheTableLock() procedure) held by Btree object p.

378 **

379 ** This function assumes that Btree p has an open read or write

380 ** transaction. If it does not, then the BTS_PENDING flag

381 ** may be incorrectly cleared.

382 */

383 static void clearAllSharedCacheTableLocks(Btree *p){

384 BtShared *pBt = p->pBt;

385 BtLock **ppIter = &pBt->pLock;

386

387 assert( sqlite3BtreeHoldsMutex(p) );

388 assert( p->sharable \|\| 0==*ppIter );

389 assert( p->inTrans>0 );

390

391 while( *ppIter ){

392 BtLock pLock = ppIter;

393 assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 \|\| pBt->pWriter==pLock->pBtree );

394 assert( pLock->pBtree->inTrans>=pLock->eLock );

395 if( pLock->pBtree==p ){

396 *ppIter = pLock->pNext;

397 assert( pLock->iTable!=1 \|\| pLock==&p->lock );

398 if( pLock->iTable!=1 ){

399 sqlite3_free(pLock);

400 }

401 }else{

402 ppIter = &pLock->pNext;

403 }

404 }

405

406 assert( (pBt->btsFlags & BTS_PENDING)==0 \|\| pBt->pWriter );

407 if( pBt->pWriter==p ){

408 pBt->pWriter = 0;

409 pBt->btsFlags &= ~(BTS_EXCLUSIVE\|BTS_PENDING);

410 }else if( pBt->nTransaction==2 ){

411 /* This function is called when Btree p is concluding its

412 ** transaction. If there currently exists a writer, and p is not

413 ** that writer, then the number of locks held by connections other

414 ** than the writer must be about to drop to zero. In this case

415 ** set the BTS_PENDING flag to 0.

416 **

417 ** If there is not currently a writer, then BTS_PENDING must

418 ** be zero already. So this next line is harmless in that case.

419 */

420 pBt->btsFlags &= ~BTS_PENDING;

421 }

422 }

423

424 /*

425 ** This function changes all write-locks held by Btree p into read-locks.

426 */

427 static void downgradeAllSharedCacheTableLocks(Btree *p){

428 BtShared *pBt = p->pBt;

429 if( pBt->pWriter==p ){

430 BtLock *pLock;

431 pBt->pWriter = 0;

432 pBt->btsFlags &= ~(BTS_EXCLUSIVE\|BTS_PENDING);

433 for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){

434 assert( pLock->eLock==READ_LOCK \|\| pLock->pBtree==p );

435 pLock->eLock = READ_LOCK;

436 }

437 }

438 }

439

440 #endif /* SQLITE_OMIT_SHARED_CACHE */

441

442 static void releasePage(MemPage pPage); / Forward reference */

443

#ifdef SQLITE_DEBUG
/*
***** This routine is used inside of assert() only ****
**
** Report whether the mutex of the BtShared that cursor p belongs to is
** currently held, as judged by sqlite3_mutex_held().
*/
static int cursorHoldsMutex(BtCursor *p){
  BtShared *pShared = p->pBt;
  return sqlite3_mutex_held(pShared->mutex);
}
#endif

454

/*
** Invalidate the overflow cache of the cursor passed as the first
** argument by clearing its BTCF_ValidOvfl flag.  The argument is
** parenthesized so that any pointer-valued expression may be passed.
*/
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)

460

461 /*

462 ** Invalidate the overflow page-list cache for all cursors opened

463 ** on the shared btree structure pBt.

464 */

465 static void invalidateAllOverflowCache(BtShared *pBt){

466 BtCursor *p;

467 assert( sqlite3_mutex_held(pBt->mutex) );

468 for(p=pBt->pCursor; p; p=p->pNext){

469 invalidateOverflowCache(p);

470 }

471 }

472

473 #ifndef SQLITE_OMIT_INCRBLOB

474 /*

475 ** This function is called before modifying the contents of a table

476 ** to invalidate any incrblob cursors that are open on the

477 ** row or one of the rows being modified.

478 **

479 ** If argument isClearTable is true, then the entire contents of the

480 ** table is about to be deleted. In this case invalidate all incrblob

481 ** cursors open on any row within the table with root-page pgnoRoot.

482 **

483 ** Otherwise, if argument isClearTable is false, then the row with

484 ** rowid iRow is being replaced or deleted. In this case invalidate

485 ** only those incrblob cursors open on that specific row.

486 */

487 static void invalidateIncrblobCursors(

488 Btree pBtree, / The database file to check */

489 i64 iRow, /* The rowid that might be changing */

490 int isClearTable /* True if all rows are being deleted */

491 ){

492 BtCursor *p;

493 if( pBtree->hasIncrblobCur==0 ) return;

494 assert( sqlite3BtreeHoldsMutex(pBtree) );

495 pBtree->hasIncrblobCur = 0;

496 for(p=pBtree->pBt->pCursor; p; p=p->pNext){

497 if( (p->curFlags & BTCF_Incrblob)!=0 ){

498 pBtree->hasIncrblobCur = 1;

499 if( isClearTable \|\| p->info.nKey==iRow ){

500 p->eState = CURSOR_INVALID;

501 }

502 }

503 }

504 }

505

506 #else

507 /* Stub function when INCRBLOB is omitted */

508 #define invalidateIncrblobCursors(x,y,z)

509 #endif /* SQLITE_OMIT_INCRBLOB */

510

511 /*

512 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called

513 ** when a page that previously contained data becomes a free-list leaf

514 ** page.

515 **

516 ** The BtShared.pHasContent bitvec exists to work around an obscure

517 ** bug caused by the interaction of two useful IO optimizations surrounding

518 ** free-list leaf pages:

519 **

520 ** 1) When all data is deleted from a page and the page becomes

521 ** a free-list leaf page, the page is not written to the database

522 ** (as free-list leaf pages contain no meaningful data). Sometimes

523 ** such a page is not even journalled (as it will not be modified,

524 ** why bother journalling it?).

525 **

526 ** 2) When a free-list leaf page is reused, its content is not read

527 ** from the database or written to the journal file (why should it

528 ** be, if it is not at all meaningful?).

529 **

530 ** By themselves, these optimizations work fine and provide a handy

531 ** performance boost to bulk delete or insert operations. However, if

532 ** a page is moved to the free-list and then reused within the same

533 ** transaction, a problem comes up. If the page is not journalled when

534 ** it is moved to the free-list and it is also not journalled when it

535 ** is extracted from the free-list and reused, then the original data

536 ** may be lost. In the event of a rollback, it may not be possible

537 ** to restore the database to its original configuration.

538 **

539 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is

540 ** moved to become a free-list leaf page, the corresponding bit is

541 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,

542 ** optimization 2 above is omitted if the corresponding bit is already

543 ** set in BtShared.pHasContent. The contents of the bitvec are cleared

544 ** at the end of every transaction.

545 */

546 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){

547 int rc = SQLITE_OK;

548 if( !pBt->pHasContent ){

549 assert( pgno<=pBt->nPage );

550 pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);

551 if( !pBt->pHasContent ){

552 rc = SQLITE_NOMEM;

553 }

554 }

555 if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){

556 rc = sqlite3BitvecSet(pBt->pHasContent, pgno);

557 }

558 return rc;

559 }

560

561 /*

562 ** Query the BtShared.pHasContent vector.

563 **

564 ** This function is called when a free-list leaf page is removed from the

565 ** free-list for reuse. It returns false if it is safe to retrieve the

566 ** page from the pager layer with the 'no-content' flag set. True otherwise.

567 */

568 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){

569 Bitvec *p = pBt->pHasContent;

570 return (p && (pgno>sqlite3BitvecSize(p) \|\| sqlite3BitvecTest(p, pgno)));

571 }

572

573 /*

574 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be

575 ** invoked at the conclusion of each write-transaction.

576 */

577 static void btreeClearHasContent(BtShared *pBt){

578 sqlite3BitvecDestroy(pBt->pHasContent);

579 pBt->pHasContent = 0;

580 }

581

582 /*

583 ** Release all of the apPage[] pages for a cursor.

584 */

585 static void btreeReleaseAllCursorPages(BtCursor *pCur){

586 int i;

587 for(i=0; i<=pCur->iPage; i++){

588 releasePage(pCur->apPage[i]);

589 pCur->apPage[i] = 0;

590 }

591 pCur->iPage = -1;

592 }

593

594 /*

595 ** The cursor passed as the only argument must point to a valid entry

596 ** when this function is called (i.e. have eState==CURSOR_VALID). This

597 ** function saves the current cursor key in variables pCur->nKey and

598 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error

599 ** code otherwise.

600 **

601 ** If the cursor is open on an intkey table, then the integer key

602 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to

603 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is

604 ** set to point to a malloced buffer pCur->nKey bytes in size containing

605 ** the key.

606 */

607 static int saveCursorKey(BtCursor *pCur){

608 int rc;

609 assert( CURSOR_VALID==pCur->eState );

610 assert( 0==pCur->pKey );

611 assert( cursorHoldsMutex(pCur) );

612

613 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);

614 assert( rc==SQLITE_OK ); /* KeySize() cannot fail */

615

616 /* If this is an intKey table, then the above call to BtreeKeySize()

617 ** stores the integer key in pCur->nKey. In this case this value is

618 ** all that is required. Otherwise, if pCur is not open on an intKey

619 ** table, then malloc space for and store the pCur->nKey bytes of key

620 ** data. */

621 if( 0==pCur->curIntKey ){

622 void *pKey = sqlite3Malloc( pCur->nKey );

623 if( pKey ){

624 rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);

625 if( rc==SQLITE_OK ){

626 pCur->pKey = pKey;

627 }else{

628 sqlite3_free(pKey);

629 }

630 }else{

631 rc = SQLITE_NOMEM;

632 }

633 }

634 assert( !pCur->curIntKey \|\| !pCur->pKey );

635 return rc;

636 }

637

638 /*

639 ** Save the current cursor position in the variables BtCursor.nKey

640 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.

641 **

642 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)

643 ** prior to calling this routine.

644 */

645 static int saveCursorPosition(BtCursor *pCur){

646 int rc;

647

648 assert( CURSOR_VALID==pCur->eState \|\| CURSOR_SKIPNEXT==pCur->eState );

649 assert( 0==pCur->pKey );

650 assert( cursorHoldsMutex(pCur) );

651

652 if( pCur->eState==CURSOR_SKIPNEXT ){

653 pCur->eState = CURSOR_VALID;

654 }else{

655 pCur->skipNext = 0;

656 }

657

658 rc = saveCursorKey(pCur);

659 if( rc==SQLITE_OK ){

660 btreeReleaseAllCursorPages(pCur);

661 pCur->eState = CURSOR_REQUIRESEEK;

662 }

663

664 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl\|BTCF_AtLast);

665 return rc;

666 }

667

668 /* Forward reference */

669 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor,Pgno,BtCursor);

670

671 /*

672 ** Save the positions of all cursors (except pExcept) that are open on

673 ** the table with root-page iRoot. "Saving the cursor position" means that

674 ** the location in the btree is remembered in such a way that it can be

675 ** moved back to the same spot after the btree has been modified. This

676 ** routine is called just before cursor pExcept is used to modify the

677 ** table, for example in BtreeDelete() or BtreeInsert().

678 **

679 ** If there are two or more cursors on the same btree, then all such

680 ** cursors should have their BTCF_Multiple flag set. The btreeCursor()

681 ** routine enforces that rule. This routine only needs to be called in

682 ** the uncommon case when pExpect has the BTCF_Multiple flag set.

683 **

684 ** If pExpect!=NULL and if no other cursors are found on the same root-page,

685 ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another

686 ** pointless call to this routine.

687 **

688 ** Implementation note: This routine merely checks to see if any cursors

689 ** need to be saved. It calls out to saveCursorsOnList() in the (unusual)

690 ** event that cursors are in need to being saved.

691 */

692 static int saveAllCursors(BtShared pBt, Pgno iRoot, BtCursor pExcept){

693 BtCursor *p;

694 assert( sqlite3_mutex_held(pBt->mutex) );

695 assert( pExcept==0 \|\| pExcept->pBt==pBt );

696 for(p=pBt->pCursor; p; p=p->pNext){

697 if( p!=pExcept && (0==iRoot \|\| p->pgnoRoot==iRoot) ) break;

698 }

699 if( p ) return saveCursorsOnList(p, iRoot, pExcept);

700 if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;

701 return SQLITE_OK;

702 }

703

704 /* This helper routine to saveAllCursors does the actual work of saving

705 ** the cursors if and when a cursor is found that actually requires saving.

706 ** The common case is that no cursors need to be saved, so this routine is

707 ** broken out from its caller to avoid unnecessary stack pointer movement.

708 */

709 static int SQLITE_NOINLINE saveCursorsOnList(

710 BtCursor p, / The first cursor that needs saving */

711 Pgno iRoot, /* Only save cursor with this iRoot. Save all if zero */

712 BtCursor pExcept / Do not save this cursor */

713 ){

714 do{

715 if( p!=pExcept && (0==iRoot \|\| p->pgnoRoot==iRoot) ){

716 if( p->eState==CURSOR_VALID \|\| p->eState==CURSOR_SKIPNEXT ){

717 int rc = saveCursorPosition(p);

718 if( SQLITE_OK!=rc ){

719 return rc;

720 }

721 }else{

722 testcase( p->iPage>0 );

723 btreeReleaseAllCursorPages(p);

724 }

725 }

726 p = p->pNext;

727 }while( p );

728 return SQLITE_OK;

729 }

730

731 /*

732 ** Clear the current cursor position.

733 */

734 void sqlite3BtreeClearCursor(BtCursor *pCur){

735 assert( cursorHoldsMutex(pCur) );

736 sqlite3_free(pCur->pKey);

737 pCur->pKey = 0;

738 pCur->eState = CURSOR_INVALID;

739 }

740

741 /*

742 ** In this version of BtreeMoveto, pKey is a packed index record

743 ** such as is generated by the OP_MakeRecord opcode. Unpack the

744 ** record and then call BtreeMovetoUnpacked() to do the work.

745 */

746 static int btreeMoveto(

747 BtCursor pCur, / Cursor open on the btree to be searched */

748 const void pKey, / Packed key if the btree is an index */

749 i64 nKey, /* Integer key for tables. Size of pKey for indices */

750 int bias, /* Bias search to the high end */

751 int pRes / Write search results here */

752 ){

753 int rc; /* Status code */

754 UnpackedRecord pIdxKey; / Unpacked index key */

755 char aSpace[200]; /* Temp space for pIdxKey - to avoid a malloc */

756 char *pFree = 0;

757

758 if( pKey ){

759 assert( nKey==(i64)(int)nKey );

760 pIdxKey = sqlite3VdbeAllocUnpackedRecord(

761 pCur->pKeyInfo, aSpace, sizeof(aSpace), &pFree

762 );

763 if( pIdxKey==0 ) return SQLITE_NOMEM;

764 sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);

765 if( pIdxKey->nField==0 ){

766 sqlite3DbFree(pCur->pKeyInfo->db, pFree);

767 return SQLITE_CORRUPT_BKPT;

768 }

769 }else{

770 pIdxKey = 0;

771 }

772 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);

773 if( pFree ){

774 sqlite3DbFree(pCur->pKeyInfo->db, pFree);

775 }

776 return rc;

777 }

778

779 /*

780 ** Restore the cursor to the position it was in (or as close to as possible)

781 ** when saveCursorPosition() was called. Note that this call deletes the

782 ** saved position info stored by saveCursorPosition(), so there can be

783 ** at most one effective restoreCursorPosition() call after each

784 ** saveCursorPosition().

785 */

786 static int btreeRestoreCursorPosition(BtCursor *pCur){

787 int rc;

788 int skipNext;

789 assert( cursorHoldsMutex(pCur) );

790 assert( pCur->eState>=CURSOR_REQUIRESEEK );

791 if( pCur->eState==CURSOR_FAULT ){

792 return pCur->skipNext;

793 }

794 pCur->eState = CURSOR_INVALID;

795 rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);

796 if( rc==SQLITE_OK ){

797 sqlite3_free(pCur->pKey);

798 pCur->pKey = 0;

799 assert( pCur->eState==CURSOR_VALID \|\| pCur->eState==CURSOR_INVALID );

800 pCur->skipNext \|= skipNext;

801 if( pCur->skipNext && pCur->eState==CURSOR_VALID ){

802 pCur->eState = CURSOR_SKIPNEXT;

803 }

804 }

805 return rc;

806 }

807

/*
** Restore cursor p only if its state is CURSOR_REQUIRESEEK or greater;
** otherwise evaluate to SQLITE_OK without calling anything.  The
** argument appears twice in the expansion, so it should be free of
** side effects.  It is parenthesized so that any pointer-valued
** expression may be passed.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)

812

813 /*

814 ** Determine whether or not a cursor has moved from the position where

815 ** it was last placed, or has been invalidated for any other reason.

816 ** Cursors can move when the row they are pointing at is deleted out

817 ** from under them, for example. Cursor might also move if a btree

818 ** is rebalanced.

819 **

820 ** Calling this routine with a NULL cursor pointer returns false.

821 **

822 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor

823 ** back to where it ought to be if this routine returns true.

824 */

825 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){

826 return pCur->eState!=CURSOR_VALID;

827 }

828

829 /*

830 ** This routine restores a cursor back to its original position after it

831 ** has been moved by some outside activity (such as a btree rebalance or

832 ** a row having been deleted out from under the cursor).

833 **

834 ** On success, the *pDifferentRow parameter is false if the cursor is left

835 ** pointing at exactly the same row. *pDifferntRow is the row the cursor

836 ** was pointing to has been deleted, forcing the cursor to point to some

837 ** nearby row.

838 **

839 ** This routine should only be called for a cursor that just returned

840 ** TRUE from sqlite3BtreeCursorHasMoved().

841 */

842 int sqlite3BtreeCursorRestore(BtCursor pCur, int pDifferentRow){

843 int rc;

844

845 assert( pCur!=0 );

846 assert( pCur->eState!=CURSOR_VALID );

847 rc = restoreCursorPosition(pCur);

848 if( rc ){

849 *pDifferentRow = 1;

850 return rc;

851 }

852 if( pCur->eState!=CURSOR_VALID ){

853 *pDifferentRow = 1;

854 }else{

855 assert( pCur->skipNext==0 );

856 *pDifferentRow = 0;

857 }

858 return SQLITE_OK;

859 }

860

#ifdef SQLITE_ENABLE_CURSOR_HINTS
/*
** Pass a hint to the cursor.  eHintType selects the hint; the type and
** number of the variadic arguments that follow depend on it.  See the
** definitions of the BTREE_HINT_* macros for details.
**
** This implementation ignores every hint.
*/
void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
  /* Used only by systems that substitute their own storage engine */
  (void)pCur;
  (void)eHintType;
}
#endif

871

872 /*

873 ** Provide flag hints to the cursor.

874 */

875 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){

876 assert( x==BTREE_SEEK_EQ \|\| x==BTREE_BULKLOAD \|\| x==0 );

877 pCur->hints = x;

878 }

879

880

881 #ifndef SQLITE_OMIT_AUTOVACUUM

882 /*

883 ** Given a page number of a regular database page, return the page

884 ** number for the pointer-map page that contains the entry for the

885 ** input page number.

886 **

887 ** Return 0 (not a valid page) for pgno==1 since there is

888 ** no pointer map associated with page 1. The integrity_check logic

889 ** requires that ptrmapPageno(*,1)!=1.

890 */

891 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){

892 int nPagesPerMapPage;

893 Pgno iPtrMap, ret;

894 assert( sqlite3_mutex_held(pBt->mutex) );

895 if( pgno<2 ) return 0;

896 nPagesPerMapPage = (pBt->usableSize/5)+1;

897 iPtrMap = (pgno-2)/nPagesPerMapPage;

898 ret = (iPtrMap*nPagesPerMapPage) + 2;

899 if( ret==PENDING_BYTE_PAGE(pBt) ){

900 ret++;

901 }

902 return ret;

903 }

904

905 /*

906 ** Write an entry into the pointer map.

907 **

908 ** This routine updates the pointer map entry for page number 'key'

909 ** so that it maps to type 'eType' and parent page number 'pgno'.

910 **

911 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is

912 ** a no-op. If an error occurs, the appropriate error code is written

913 ** into *pRC.

914 */

915 static void ptrmapPut(BtShared pBt, Pgno key, u8 eType, Pgno parent, int pRC){

916 DbPage pDbPage; / The pointer map page */

917 u8 pPtrmap; / The pointer map data */

918 Pgno iPtrmap; /* The pointer map page number */

919 int offset; /* Offset in pointer map page */

920 int rc; /* Return code from subfunctions */

921

922 if( *pRC ) return;

923

924 assert( sqlite3_mutex_held(pBt->mutex) );

925 /* The master-journal page number must never be used as a pointer map page */

926 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );

927

928 assert( pBt->autoVacuum );

929 if( key==0 ){

930 *pRC = SQLITE_CORRUPT_BKPT;

931 return;

932 }

933 iPtrmap = PTRMAP_PAGENO(pBt, key);

934 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);

935 if( rc!=SQLITE_OK ){

936 *pRC = rc;

937 return;

938 }

939 offset = PTRMAP_PTROFFSET(iPtrmap, key);

940 if( offset<0 ){

941 *pRC = SQLITE_CORRUPT_BKPT;

942 goto ptrmap_exit;

943 }

944 assert( offset <= (int)pBt->usableSize-5 );

945 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);

946

947 if( eType!=pPtrmap[offset] \|\| get4byte(&pPtrmap[offset+1])!=parent ){

948 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));

949 *pRC= rc = sqlite3PagerWrite(pDbPage);

950 if( rc==SQLITE_OK ){

951 pPtrmap[offset] = eType;

952 put4byte(&pPtrmap[offset+1], parent);

953 }

954 }

955

956 ptrmap_exit:

957 sqlite3PagerUnref(pDbPage);

958 }

959

960 /*

961 ** Read an entry from the pointer map.

962 **

963 ** This routine retrieves the pointer map entry for page 'key', writing

964 ** the type and parent page number to pEType and pPgno respectively.

965 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.

966 */

967 static int ptrmapGet(BtShared pBt, Pgno key, u8 pEType, Pgno *pPgno){

968 DbPage pDbPage; / The pointer map page */

969 int iPtrmap; /* Pointer map page index */

970 u8 pPtrmap; / Pointer map page data */

971 int offset; /* Offset of entry in pointer map */

972 int rc;

973

974 assert( sqlite3_mutex_held(pBt->mutex) );

975

976 iPtrmap = PTRMAP_PAGENO(pBt, key);

977 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);

978 if( rc!=0 ){

979 return rc;

980 }

981 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);

982

983 offset = PTRMAP_PTROFFSET(iPtrmap, key);

984 if( offset<0 ){

985 sqlite3PagerUnref(pDbPage);

986 return SQLITE_CORRUPT_BKPT;

987 }

988 assert( offset <= (int)pBt->usableSize-5 );

989 assert( pEType!=0 );

990 *pEType = pPtrmap[offset];

991 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);

992

993 sqlite3PagerUnref(pDbPage);

994 if( pEType<1 \|\| pEType>5 ) return SQLITE_CORRUPT_BKPT;

995 return SQLITE_OK;

996 }

997

998 #else /* if defined SQLITE_OMIT_AUTOVACUUM */

/* Stand-ins used when SQLITE_OMIT_AUTOVACUUM is defined: ptrmapPut() and
** ptrmapPutOvflPtr() expand to nothing, and ptrmapGet() always evaluates
** to SQLITE_OK without touching its arguments. */
#define ptrmapPut(w,x,y,z,rc)
#define ptrmapGet(w,x,y,z) SQLITE_OK
#define ptrmapPutOvflPtr(x, y, rc)

1002 #endif

1003

/*
** findCell(P,I) yields a pointer to the content of the I-th cell on btree
** page P (I==0 is the first cell, I==1 the second, and so forth).  The
** 2-byte entry at P->aCellIdx[2*I] is read, masked with P->maskPage, and
** used as an offset from P->aData.
**
** findCellPastPtr(P,I) is the same except that the offset is applied to
** P->aDataOfst, so the result skips past the initial 4-byte child pointer
** found on interior pages, if there is one.
**
** These macros work only for pages that do not contain overflow cells.
** Both arguments are expanded more than once.
*/
#define findCell(P,I) \
  ((P)->aData + (get2byteAligned((P)->aCellIdx + 2*(I)) & (P)->maskPage))
#define findCellPastPtr(P,I) \
  ((P)->aDataOfst + (get2byteAligned((P)->aCellIdx + 2*(I)) & (P)->maskPage))

1018

1019

1020 /*

1021 ** This is common tail processing for btreeParseCellPtr() and

1022 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely

1023 ** on a single B-tree page. Make necessary adjustments to the CellInfo

1024 ** structure.

1025 */

1026 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(

1027 MemPage pPage, / Page containing the cell */

1028 u8 pCell, / Pointer to the cell text. */

1029 CellInfo pInfo / Fill in this structure */

1030 ){

1031 /* If the payload will not fit completely on the local page, we have

1032 ** to decide how much to store locally and how much to spill onto

1033 ** overflow pages. The strategy is to minimize the amount of unused

1034 ** space on overflow pages while keeping the amount of local storage

1035 ** in between minLocal and maxLocal.

1036 **

1037 ** Warning: changing the way overflow payload is distributed in any

1038 ** way will result in an incompatible file format.

1039 */

1040 int minLocal; /* Minimum amount of payload held locally */

1041 int maxLocal; /* Maximum amount of payload held locally */

1042 int surplus; /* Overflow payload available for local storage */

1043

1044 minLocal = pPage->minLocal;

1045 maxLocal = pPage->maxLocal;

1046 surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);

1047 testcase( surplus==maxLocal );

1048 testcase( surplus==maxLocal+1 );

1049 if( surplus <= maxLocal ){

1050 pInfo->nLocal = (u16)surplus;

1051 }else{

1052 pInfo->nLocal = (u16)minLocal;

1053 }

1054 pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;

1055 }

1056

1057 /*

1058 ** The following routines are implementations of the MemPage.xParseCell()

1059 ** method.

1060 **

1061 ** Parse a cell content block and fill in the CellInfo structure.

1062 **

1063 ** btreeParseCellPtr() => table btree leaf nodes

1064 ** btreeParseCellNoPayload() => table btree internal nodes

1065 ** btreeParseCellPtrIndex() => index btree nodes

1066 **

1067 ** There is also a wrapper function btreeParseCell() that works for

1068 ** all MemPage types and that references the cell by index rather than

1069 ** by pointer.

1070 */

1071 static void btreeParseCellPtrNoPayload(

1072 MemPage pPage, / Page containing the cell */

1073 u8 pCell, / Pointer to the cell text. */

1074 CellInfo pInfo / Fill in this structure */

1075 ){

1076 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

1077 assert( pPage->leaf==0 );

1078 assert( pPage->noPayload );

1079 assert( pPage->childPtrSize==4 );

1080 #ifndef SQLITE_DEBUG

1081 UNUSED_PARAMETER(pPage);

1082 #endif

1083 pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);

1084 pInfo->nPayload = 0;

1085 pInfo->nLocal = 0;

1086 pInfo->pPayload = 0;

1087 return;

1088 }

1089 static void btreeParseCellPtr(

1090 MemPage pPage, / Page containing the cell */

1091 u8 pCell, / Pointer to the cell text. */

1092 CellInfo pInfo / Fill in this structure */

1093 ){

1094 u8 pIter; / For scanning through pCell */

1095 u32 nPayload; /* Number of bytes of cell payload */

1096 u64 iKey; /* Extracted Key value */

1097

1098 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

1099 assert( pPage->leaf==0 \|\| pPage->leaf==1 );

1100 assert( pPage->intKeyLeaf \|\| pPage->noPayload );

1101 assert( pPage->noPayload==0 );

1102 assert( pPage->intKeyLeaf );

1103 assert( pPage->childPtrSize==0 );

1104 pIter = pCell;

1105

1106 /* The next block of code is equivalent to:

1107 **

1108 ** pIter += getVarint32(pIter, nPayload);

1109 **

1110 ** The code is inlined to avoid a function call.

1111 */

1112 nPayload = *pIter;

1113 if( nPayload>=0x80 ){

1114 u8 *pEnd = &pIter[8];

1115 nPayload &= 0x7f;

1116 do{

1117 nPayload = (nPayload<<7) \| (*++pIter & 0x7f);

1118 }while( (*pIter)>=0x80 && pIter<pEnd );

1119 }

1120 pIter++;

1121

1122 /* The next block of code is equivalent to:

1123 **

1124 ** pIter += getVarint(pIter, (u64*)&pInfo->nKey);

1125 **

1126 ** The code is inlined to avoid a function call.

1127 */

1128 iKey = *pIter;

1129 if( iKey>=0x80 ){

1130 u8 *pEnd = &pIter[7];

1131 iKey &= 0x7f;

1132 while(1){

1133 iKey = (iKey<<7) \| (*++pIter & 0x7f);

1134 if( (*pIter)<0x80 ) break;

1135 if( pIter>=pEnd ){

1136 iKey = (iKey<<8) \| *++pIter;

1137 break;

1138 }

1139 }

1140 }

1141 pIter++;

1142

1143 pInfo->nKey = (i64)&iKey;

1144 pInfo->nPayload = nPayload;

1145 pInfo->pPayload = pIter;

1146 testcase( nPayload==pPage->maxLocal );

1147 testcase( nPayload==pPage->maxLocal+1 );

1148 if( nPayload<=pPage->maxLocal ){

1149 /* This is the (easy) common case where the entire payload fits

1150 ** on the local page. No overflow is required.

1151 */

1152 pInfo->nSize = nPayload + (u16)(pIter - pCell);

1153 if( pInfo->nSize<4 ) pInfo->nSize = 4;

1154 pInfo->nLocal = (u16)nPayload;

1155 }else{

1156 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);

1157 }

1158 }

1159 static void btreeParseCellPtrIndex(

1160 MemPage pPage, / Page containing the cell */

1161 u8 pCell, / Pointer to the cell text. */

1162 CellInfo pInfo / Fill in this structure */

1163 ){

1164 u8 pIter; / For scanning through pCell */

1165 u32 nPayload; /* Number of bytes of cell payload */

1166

1167 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

1168 assert( pPage->leaf==0 \|\| pPage->leaf==1 );

1169 assert( pPage->intKeyLeaf==0 );

1170 assert( pPage->noPayload==0 );

1171 pIter = pCell + pPage->childPtrSize;

1172 nPayload = *pIter;

1173 if( nPayload>=0x80 ){

1174 u8 *pEnd = &pIter[8];

1175 nPayload &= 0x7f;

1176 do{

1177 nPayload = (nPayload<<7) \| (*++pIter & 0x7f);

1178 }while( *(pIter)>=0x80 && pIter<pEnd );

1179 }

1180 pIter++;

1181 pInfo->nKey = nPayload;

1182 pInfo->nPayload = nPayload;

1183 pInfo->pPayload = pIter;

1184 testcase( nPayload==pPage->maxLocal );

1185 testcase( nPayload==pPage->maxLocal+1 );

1186 if( nPayload<=pPage->maxLocal ){

1187 /* This is the (easy) common case where the entire payload fits

1188 ** on the local page. No overflow is required.

1189 */

1190 pInfo->nSize = nPayload + (u16)(pIter - pCell);

1191 if( pInfo->nSize<4 ) pInfo->nSize = 4;

1192 pInfo->nLocal = (u16)nPayload;

1193 }else{

1194 btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);

1195 }

1196 }

1197 static void btreeParseCell(

1198 MemPage pPage, / Page containing the cell */

1199 int iCell, /* The cell index. First cell is 0 */

1200 CellInfo pInfo / Fill in this structure */

1201 ){

1202 pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);

1203 }

1204

1205 /*

1206 ** The following routines are implementations of the MemPage.xCellSize

1207 ** method.

1208 **

1209 ** Compute the total number of bytes that a Cell needs in the cell

1210 ** data area of the btree-page. The return number includes the cell

1211 ** data header and the local payload, but not any overflow page or

1212 ** the space used by the cell pointer.

1213 **

1214 ** cellSizePtrNoPayload() => table internal nodes

1215 ** cellSizePtr() => all index nodes & table leaf nodes

1216 */

1217 static u16 cellSizePtr(MemPage pPage, u8 pCell){

1218 u8 pIter = pCell + pPage->childPtrSize; / For looping over bytes of pCell */

1219 u8 pEnd; / End mark for a varint */

1220 u32 nSize; /* Size value to return */

1221

1222 #ifdef SQLITE_DEBUG

1223 /* The value returned by this function should always be the same as

1224 ** the (CellInfo.nSize) value found by doing a full parse of the

1225 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of

1226 ** this function verifies that this invariant is not violated. */

1227 CellInfo debuginfo;

1228 pPage->xParseCell(pPage, pCell, &debuginfo);

1229 #endif

1230

1231 assert( pPage->noPayload==0 );

1232 nSize = *pIter;

1233 if( nSize>=0x80 ){

1234 pEnd = &pIter[8];

1235 nSize &= 0x7f;

1236 do{

1237 nSize = (nSize<<7) \| (*++pIter & 0x7f);

1238 }while( *(pIter)>=0x80 && pIter<pEnd );

1239 }

1240 pIter++;

1241 if( pPage->intKey ){

1242 /* pIter now points at the 64-bit integer key value, a variable length

1243 ** integer. The following block moves pIter to point at the first byte

1244 ** past the end of the key value. */

1245 pEnd = &pIter[9];

1246 while( (*pIter++)&0x80 && pIter<pEnd );

1247 }

1248 testcase( nSize==pPage->maxLocal );

1249 testcase( nSize==pPage->maxLocal+1 );

1250 if( nSize<=pPage->maxLocal ){

1251 nSize += (u32)(pIter - pCell);

1252 if( nSize<4 ) nSize = 4;

1253 }else{

1254 int minLocal = pPage->minLocal;

1255 nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);

1256 testcase( nSize==pPage->maxLocal );

1257 testcase( nSize==pPage->maxLocal+1 );

1258 if( nSize>pPage->maxLocal ){

1259 nSize = minLocal;

1260 }

1261 nSize += 4 + (u16)(pIter - pCell);

1262 }

1263 assert( nSize==debuginfo.nSize \|\| CORRUPT_DB );

1264 return (u16)nSize;

1265 }

1266 static u16 cellSizePtrNoPayload(MemPage pPage, u8 pCell){

1267 u8 pIter = pCell + 4; / For looping over bytes of pCell */

1268 u8 pEnd; / End mark for a varint */

1269

1270 #ifdef SQLITE_DEBUG

1271 /* The value returned by this function should always be the same as

1272 ** the (CellInfo.nSize) value found by doing a full parse of the

1273 ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of

1274 ** this function verifies that this invariant is not violated. */

1275 CellInfo debuginfo;

1276 pPage->xParseCell(pPage, pCell, &debuginfo);

1277 #else

1278 UNUSED_PARAMETER(pPage);

1279 #endif

1280

1281 assert( pPage->childPtrSize==4 );

1282 pEnd = pIter + 9;

1283 while( (*pIter++)&0x80 && pIter<pEnd );

1284 assert( debuginfo.nSize==(u16)(pIter - pCell) \|\| CORRUPT_DB );

1285 return (u16)(pIter - pCell);

1286 }

1287

1288

#ifdef SQLITE_DEBUG
/* Index-based variant of the xCellSize methods: looks up the iCell-th
** cell on pPage with findCell() and returns its size as reported by
** pPage->xCellSize().  Used inside of assert() statements only. */
static u16 cellSize(MemPage *pPage, int iCell){
  u8 *pCell = findCell(pPage, iCell);
  return pPage->xCellSize(pPage, pCell);
}
#endif

1296

1297 #ifndef SQLITE_OMIT_AUTOVACUUM

1298 /*

1299 ** If the cell pCell, part of page pPage contains a pointer

1300 ** to an overflow page, insert an entry into the pointer-map

1301 ** for the overflow page.

1302 */

1303 static void ptrmapPutOvflPtr(MemPage pPage, u8 pCell, int *pRC){

1304 CellInfo info;

1305 if( *pRC ) return;

1306 assert( pCell!=0 );

1307 pPage->xParseCell(pPage, pCell, &info);

1308 if( info.nLocal<info.nPayload ){

1309 Pgno ovfl = get4byte(&pCell[info.nSize-4]);

1310 ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);

1311 }

1312 }

1313 #endif

1314

1315

1316 /*

1317 ** Defragment the page given. All Cells are moved to the

1318 ** end of the page and all free space is collected into one

1319 ** big FreeBlk that occurs in between the header and cell

1320 ** pointer array and the cell content area.

1321 **

1322 ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a

1323 ** b-tree page so that there are no freeblocks or fragment bytes, all

1324 ** unused bytes are contained in the unallocated space region, and all

1325 ** cells are packed tightly at the end of the page.

1326 */

1327 static int defragmentPage(MemPage *pPage){

1328 int i; /* Loop counter */

1329 int pc; /* Address of the i-th cell */

1330 int hdr; /* Offset to the page header */

1331 int size; /* Size of a cell */

1332 int usableSize; /* Number of usable bytes on a page */

1333 int cellOffset; /* Offset to the cell pointer array */

1334 int cbrk; /* Offset to the cell content area */

1335 int nCell; /* Number of cells on the page */

1336 unsigned char data; / The page data */

1337 unsigned char temp; / Temp area for cell content */

1338 unsigned char src; / Source of content */

1339 int iCellFirst; /* First allowable cell index */

1340 int iCellLast; /* Last possible cell index */

1341

1342

1343 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

1344 assert( pPage->pBt!=0 );

1345 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );

1346 assert( pPage->nOverflow==0 );

1347 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

1348 temp = 0;

1349 src = data = pPage->aData;

1350 hdr = pPage->hdrOffset;

1351 cellOffset = pPage->cellOffset;

1352 nCell = pPage->nCell;

1353 assert( nCell==get2byte(&data[hdr+3]) );

1354 usableSize = pPage->pBt->usableSize;

1355 cbrk = usableSize;

1356 iCellFirst = cellOffset + 2*nCell;

1357 iCellLast = usableSize - 4;

1358 for(i=0; i<nCell; i++){

1359 u8 pAddr; / The i-th cell pointer */

1360 pAddr = &data[cellOffset + i*2];

1361 pc = get2byte(pAddr);

1362 testcase( pc==iCellFirst );

1363 testcase( pc==iCellLast );

1364 /* These conditions have already been verified in btreeInitPage()

1365 ** if PRAGMA cell_size_check=ON.

1366 */

1367 if( pc<iCellFirst \|\| pc>iCellLast ){

1368 return SQLITE_CORRUPT_BKPT;

1369 }

1370 assert( pc>=iCellFirst && pc<=iCellLast );

1371 size = pPage->xCellSize(pPage, &src[pc]);

1372 cbrk -= size;

1373 if( cbrk<iCellFirst \|\| pc+size>usableSize ){

1374 return SQLITE_CORRUPT_BKPT;

1375 }

1376 assert( cbrk+size<=usableSize && cbrk>=iCellFirst );

1377 testcase( cbrk+size==usableSize );

1378 testcase( pc+size==usableSize );

1379 put2byte(pAddr, cbrk);

1380 if( temp==0 ){

1381 int x;

1382 if( cbrk==pc ) continue;

1383 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);

1384 x = get2byte(&data[hdr+5]);

1385 memcpy(&temp[x], &data[x], (cbrk+size) - x);

1386 src = temp;

1387 }

1388 memcpy(&data[cbrk], &src[pc], size);

1389 }

1390 assert( cbrk>=iCellFirst );

1391 put2byte(&data[hdr+5], cbrk);

1392 data[hdr+1] = 0;

1393 data[hdr+2] = 0;

1394 data[hdr+7] = 0;

1395 memset(&data[iCellFirst], 0, cbrk-iCellFirst);

1396 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

1397 if( cbrk-iCellFirst!=pPage->nFree ){

1398 return SQLITE_CORRUPT_BKPT;

1399 }

1400 return SQLITE_OK;

1401 }

1402

1403 /*

1404 ** Search the free-list on page pPg for space to store a cell nByte bytes in

1405 ** size. If one can be found, return a pointer to the space and remove it

1406 ** from the free-list.

1407 **

1408 ** If no suitable space can be found on the free-list, return NULL.

1409 **

1410 ** This function may detect corruption within pPg. If corruption is

1411 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.

1412 **

1413 ** Slots on the free list that are between 1 and 3 bytes larger than nByte

1414 ** will be ignored if adding the extra space to the fragmentation count

1415 ** causes the fragmentation count to exceed 60.

1416 */

1417 static u8 pageFindSlot(MemPage pPg, int nByte, int *pRc){

1418 const int hdr = pPg->hdrOffset;

1419 u8 * const aData = pPg->aData;

1420 int iAddr = hdr + 1;

1421 int pc = get2byte(&aData[iAddr]);

1422 int x;

1423 int usableSize = pPg->pBt->usableSize;

1424

1425 assert( pc>0 );

1426 do{

1427 int size; /* Size of the free slot */

1428 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of

1429 ** increasing offset. */

1430 if( pc>usableSize-4 \|\| pc<iAddr+4 ){

1431 *pRc = SQLITE_CORRUPT_BKPT;

1432 return 0;

1433 }

1434 /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each

1435 ** freeblock form a big-endian integer which is the size of the freeblock

1436 ** in bytes, including the 4-byte header. */

1437 size = get2byte(&aData[pc+2]);

1438 if( (x = size - nByte)>=0 ){

1439 testcase( x==4 );

1440 testcase( x==3 );

1441 if( pc < pPg->cellOffset+2*pPg->nCell \|\| size+pc > usableSize ){

1442 *pRc = SQLITE_CORRUPT_BKPT;

1443 return 0;

1444 }else if( x<4 ){

1445 /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total

1446 ** number of bytes in fragments may not exceed 60. */

1447 if( aData[hdr+7]>57 ) return 0;

1448

1449 /* Remove the slot from the free-list. Update the number of

1450 ** fragmented bytes within the page. */

1451 memcpy(&aData[iAddr], &aData[pc], 2);

1452 aData[hdr+7] += (u8)x;

1453 }else{

1454 /* The slot remains on the free-list. Reduce its size to account

1455 ** for the portion used by the new allocation. */

1456 put2byte(&aData[pc+2], x);

1457 }

1458 return &aData[pc + x];

1459 }

1460 iAddr = pc;

1461 pc = get2byte(&aData[pc]);

1462 }while( pc );

1463

1464 return 0;

1465 }

1466

1467 /*

1468 ** Allocate nByte bytes of space from within the B-Tree page passed

1469 ** as the first argument. Write into *pIdx the index into pPage->aData[]

1470 ** of the first byte of allocated space. Return either SQLITE_OK or

1471 ** an error code (usually SQLITE_CORRUPT).

1472 **

1473 ** The caller guarantees that there is sufficient space to make the

1474 ** allocation. This routine might need to defragment in order to bring

1475 ** all the space together, however. This routine will avoid using

1476 ** the first two bytes past the cell pointer area since presumably this

1477 ** allocation is being made in order to insert a new cell, so we will

1478 ** also end up needing a new cell pointer.

1479 */

1480 static int allocateSpace(MemPage pPage, int nByte, int pIdx){

1481 const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */

1482 u8 * const data = pPage->aData; /* Local cache of pPage->aData */

1483 int top; /* First byte of cell content area */

1484 int rc = SQLITE_OK; /* Integer return code */

1485 int gap; /* First byte of gap between cell pointers and cell content */

1486

1487 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

1488 assert( pPage->pBt );

1489 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

1490 assert( nByte>=0 ); /* Minimum cell size is 4 */

1491 assert( pPage->nFree>=nByte );

1492 assert( pPage->nOverflow==0 );

1493 assert( nByte < (int)(pPage->pBt->usableSize-8) );

1494

1495 assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );

1496 gap = pPage->cellOffset + 2*pPage->nCell;

1497 assert( gap<=65536 );

1498 /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size

1499 ** and the reserved space is zero (the usual value for reserved space)

1500 ** then the cell content offset of an empty page wants to be 65536.

1501 ** However, that integer is too large to be stored in a 2-byte unsigned

1502 ** integer, so a value of 0 is used in its place. */

1503 top = get2byte(&data[hdr+5]);

1504 assert( top<=(int)pPage->pBt->usableSize ); /* Prevent by getAndInitPage() */

1505 if( gap>top ){

1506 if( top==0 && pPage->pBt->usableSize==65536 ){

1507 top = 65536;

1508 }else{

1509 return SQLITE_CORRUPT_BKPT;

1510 }

1511 }

1512

1513 /* If there is enough space between gap and top for one more cell pointer

1514 ** array entry offset, and if the freelist is not empty, then search the

1515 ** freelist looking for a free slot big enough to satisfy the request.

1516 */

1517 testcase( gap+2==top );

1518 testcase( gap+1==top );

1519 testcase( gap==top );

1520 if( (data[hdr+2] \|\| data[hdr+1]) && gap+2<=top ){

1521 u8 *pSpace = pageFindSlot(pPage, nByte, &rc);

1522 if( pSpace ){

1523 assert( pSpace>=data && (pSpace - data)<65536 );

1524 *pIdx = (int)(pSpace - data);

1525 return SQLITE_OK;

1526 }else if( rc ){

1527 return rc;

1528 }

1529 }

1530

1531 /* The request could not be fulfilled using a freelist slot. Check

1532 ** to see if defragmentation is necessary.

1533 */

1534 testcase( gap+2+nByte==top );

1535 if( gap+2+nByte>top ){

1536 assert( pPage->nCell>0 \|\| CORRUPT_DB );

1537 rc = defragmentPage(pPage);

1538 if( rc ) return rc;

1539 top = get2byteNotZero(&data[hdr+5]);

1540 assert( gap+nByte<=top );

1541 }

1542

1543

1544 /* Allocate memory from the gap in between the cell pointer array

1545 ** and the cell content area. The btreeInitPage() call has already

1546 ** validated the freelist. Given that the freelist is valid, there

1547 ** is no way that the allocation can extend off the end of the page.

1548 ** The assert() below verifies the previous sentence.

1549 */

1550 top -= nByte;

1551 put2byte(&data[hdr+5], top);

1552 assert( top+nByte <= (int)pPage->pBt->usableSize );

1553 *pIdx = top;

1554 return SQLITE_OK;

1555 }

1556

1557 /*

1558 ** Return a section of the pPage->aData to the freelist.

1559 ** The first byte of the new free block is pPage->aData[iStart]

1560 ** and the size of the block is iSize bytes.

1561 **

1562 ** Adjacent freeblocks are coalesced.

1563 **

1564 ** Note that even though the freeblock list was checked by btreeInitPage(),

1565 ** that routine will not detect overlap between cells or freeblocks. Nor

1566 ** does it detect cells or freeblocks that encrouch into the reserved bytes

1567 ** at the end of the page. So do additional corruption checks inside this

1568 ** routine and return SQLITE_CORRUPT if any problems are found.

1569 */

1570 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){

1571 u16 iPtr; /* Address of ptr to next freeblock */

1572 u16 iFreeBlk; /* Address of the next freeblock */

1573 u8 hdr; /* Page header size. 0 or 100 */

1574 u8 nFrag = 0; /* Reduction in fragmentation */

1575 u16 iOrigSize = iSize; /* Original value of iSize */

1576 u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */

1577 u32 iEnd = iStart + iSize; /* First byte past the iStart buffer */

1578 unsigned char data = pPage->aData; / Page content */

1579

1580 assert( pPage->pBt!=0 );

1581 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

1582 assert( CORRUPT_DB \|\| iStart>=pPage->hdrOffset+6+pPage->childPtrSize );

1583 assert( CORRUPT_DB \|\| iEnd <= pPage->pBt->usableSize );

1584 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

1585 assert( iSize>=4 ); /* Minimum cell size is 4 */

1586 assert( iStart<=iLast );

1587

1588 /* Overwrite deleted information with zeros when the secure_delete

1589 ** option is enabled */

1590 if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){

1591 memset(&data[iStart], 0, iSize);

1592 }

1593

1594 /* The list of freeblocks must be in ascending order. Find the

1595 ** spot on the list where iStart should be inserted.

1596 */

1597 hdr = pPage->hdrOffset;

1598 iPtr = hdr + 1;

1599 if( data[iPtr+1]==0 && data[iPtr]==0 ){

1600 iFreeBlk = 0; /* Shortcut for the case when the freelist is empty */

1601 }else{

1602 while( (iFreeBlk = get2byte(&data[iPtr]))>0 && iFreeBlk<iStart ){

1603 if( iFreeBlk<iPtr+4 ) return SQLITE_CORRUPT_BKPT;

1604 iPtr = iFreeBlk;

1605 }

1606 if( iFreeBlk>iLast ) return SQLITE_CORRUPT_BKPT;

1607 assert( iFreeBlk>iPtr \|\| iFreeBlk==0 );

1608

1609 /* At this point:

1610 ** iFreeBlk: First freeblock after iStart, or zero if none

1611 ** iPtr: The address of a pointer to iFreeBlk

1612 **

1613 ** Check to see if iFreeBlk should be coalesced onto the end of iStart.

1614 */

1615 if( iFreeBlk && iEnd+3>=iFreeBlk ){

1616 nFrag = iFreeBlk - iEnd;

1617 if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT;

1618 iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);

1619 if( iEnd > pPage->pBt->usableSize ) return SQLITE_CORRUPT_BKPT;

1620 iSize = iEnd - iStart;

1621 iFreeBlk = get2byte(&data[iFreeBlk]);

1622 }

1623

1624 /* If iPtr is another freeblock (that is, if iPtr is not the freelist

1625 ** pointer in the page header) then check to see if iStart should be

1626 ** coalesced onto the end of iPtr.

1627 */

1628 if( iPtr>hdr+1 ){

1629 int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);

1630 if( iPtrEnd+3>=iStart ){

1631 if( iPtrEnd>iStart ) return SQLITE_CORRUPT_BKPT;

1632 nFrag += iStart - iPtrEnd;

1633 iSize = iEnd - iPtr;

1634 iStart = iPtr;

1635 }

1636 }

1637 if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_BKPT;

1638 data[hdr+7] -= nFrag;

1639 }

1640 if( iStart==get2byte(&data[hdr+5]) ){

1641 /* The new freeblock is at the beginning of the cell content area,

1642 ** so just extend the cell content area rather than create another

1643 ** freelist entry */

1644 if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_BKPT;

1645 put2byte(&data[hdr+1], iFreeBlk);

1646 put2byte(&data[hdr+5], iEnd);

1647 }else{

1648 /* Insert the new freeblock into the freelist */

1649 put2byte(&data[iPtr], iStart);

1650 put2byte(&data[iStart], iFreeBlk);

1651 put2byte(&data[iStart+2], iSize);

1652 }

1653 pPage->nFree += iOrigSize;

1654 return SQLITE_OK;

1655 }

1656

1657 /*

1658 ** Decode the flags byte (the first byte of the header) for a page

1659 ** and initialize fields of the MemPage structure accordingly.

1660 **

1661 ** Only the following combinations are supported. Anything different

1662 ** indicates a corrupt database files:

1663 **

1664 ** PTF_ZERODATA

1665 ** PTF_ZERODATA \| PTF_LEAF

1666 ** PTF_LEAFDATA \| PTF_INTKEY

1667 ** PTF_LEAFDATA \| PTF_INTKEY \| PTF_LEAF

1668 */

1669 static int decodeFlags(MemPage *pPage, int flagByte){

1670 BtShared pBt; / A copy of pPage->pBt */

1671

1672 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );

1673 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

1674 pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 );

1675 flagByte &= ~PTF_LEAF;

1676 pPage->childPtrSize = 4-4*pPage->leaf;

1677 pPage->xCellSize = cellSizePtr;

1678 pBt = pPage->pBt;

1679 if( flagByte==(PTF_LEAFDATA \| PTF_INTKEY) ){

1680 /* EVIDENCE-OF: R-03640-13415 A value of 5 means the page is an interior

1681 ** table b-tree page. */

1682 assert( (PTF_LEAFDATA\|PTF_INTKEY)==5 );

1683 /* EVIDENCE-OF: R-20501-61796 A value of 13 means the page is a leaf

1684 ** table b-tree page. */

1685 assert( (PTF_LEAFDATA\|PTF_INTKEY\|PTF_LEAF)==13 );

1686 pPage->intKey = 1;

1687 if( pPage->leaf ){

1688 pPage->intKeyLeaf = 1;

1689 pPage->noPayload = 0;

1690 pPage->xParseCell = btreeParseCellPtr;

1691 }else{

1692 pPage->intKeyLeaf = 0;

1693 pPage->noPayload = 1;

1694 pPage->xCellSize = cellSizePtrNoPayload;

1695 pPage->xParseCell = btreeParseCellPtrNoPayload;

1696 }

1697 pPage->maxLocal = pBt->maxLeaf;

1698 pPage->minLocal = pBt->minLeaf;

1699 }else if( flagByte==PTF_ZERODATA ){

1700 /* EVIDENCE-OF: R-27225-53936 A value of 2 means the page is an interior

1701 ** index b-tree page. */

1702 assert( (PTF_ZERODATA)==2 );

1703 /* EVIDENCE-OF: R-16571-11615 A value of 10 means the page is a leaf

1704 ** index b-tree page. */

1705 assert( (PTF_ZERODATA\|PTF_LEAF)==10 );

1706 pPage->intKey = 0;

1707 pPage->intKeyLeaf = 0;

1708 pPage->noPayload = 0;

1709 pPage->xParseCell = btreeParseCellPtrIndex;

1710 pPage->maxLocal = pBt->maxLocal;

1711 pPage->minLocal = pBt->minLocal;

1712 }else{

1713 /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is

1714 ** an error. */

1715 return SQLITE_CORRUPT_BKPT;

1716 }

1717 pPage->max1bytePayload = pBt->max1bytePayload;

1718 return SQLITE_OK;

1719 }

1720

1721 /*

1722 ** Initialize the auxiliary information for a disk block.

1723 **

1724 ** Return SQLITE_OK on success. If we see that the page does

1725 ** not contain a well-formed database page, then return

1726 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not

1727 ** guarantee that the page is well-formed. It only shows that

1728 ** we failed to detect any corruption.

1729 */

1730 static int btreeInitPage(MemPage *pPage){

1731

1732 assert( pPage->pBt!=0 );

1733 assert( pPage->pBt->db!=0 );

1734 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

1735 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );

1736 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );

1737 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );

1738

1739 if( !pPage->isInit ){

1740 u16 pc; /* Address of a freeblock within pPage->aData[] */

1741 u8 hdr; /* Offset to beginning of page header */

1742 u8 data; / Equal to pPage->aData */

1743 BtShared pBt; / The main btree structure */

1744 int usableSize; /* Amount of usable space on each page */

1745 u16 cellOffset; /* Offset from start of page to first cell pointer */

1746 int nFree; /* Number of unused bytes on the page */

1747 int top; /* First byte of the cell content area */

1748 int iCellFirst; /* First allowable cell or freeblock offset */

1749 int iCellLast; /* Last possible cell or freeblock offset */

1750

1751 pBt = pPage->pBt;

1752

1753 hdr = pPage->hdrOffset;

1754 data = pPage->aData;

1755 /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating

1756 ** the b-tree page type. */

1757 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;

1758 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );

1759 pPage->maskPage = (u16)(pBt->pageSize - 1);

1760 pPage->nOverflow = 0;

1761 usableSize = pBt->usableSize;

1762 pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;

1763 pPage->aDataEnd = &data[usableSize];

1764 pPage->aCellIdx = &data[cellOffset];

1765 pPage->aDataOfst = &data[pPage->childPtrSize];

1766 /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates

1767 ** the start of the cell content area. A zero value for this integer is

1768 ** interpreted as 65536. */

1769 top = get2byteNotZero(&data[hdr+5]);

1770 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the

1771 ** number of cells on the page. */

1772 pPage->nCell = get2byte(&data[hdr+3]);

1773 if( pPage->nCell>MX_CELL(pBt) ){

1774 /* To many cells for a single page. The page must be corrupt */

1775 return SQLITE_CORRUPT_BKPT;

1776 }

1777 testcase( pPage->nCell==MX_CELL(pBt) );

1778 /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only

1779 ** possible for a root page of a table that contains no rows) then the

1780 ** offset to the cell content area will equal the page size minus the

1781 ** bytes of reserved space. */

1782 assert( pPage->nCell>0 \|\| top==usableSize \|\| CORRUPT_DB );

1783

1784 /* A malformed database page might cause us to read past the end

1785 ** of page when parsing a cell.

1786 **

1787 ** The following block of code checks early to see if a cell extends

1788 ** past the end of a page boundary and causes SQLITE_CORRUPT to be

1789 ** returned if it does.

1790 */

1791 iCellFirst = cellOffset + 2*pPage->nCell;

1792 iCellLast = usableSize - 4;

1793 if( pBt->db->flags & SQLITE_CellSizeCk ){

1794 int i; /* Index into the cell pointer array */

1795 int sz; /* Size of a cell */

1796

1797 if( !pPage->leaf ) iCellLast--;

1798 for(i=0; i<pPage->nCell; i++){

1799 pc = get2byteAligned(&data[cellOffset+i*2]);

1800 testcase( pc==iCellFirst );

1801 testcase( pc==iCellLast );

1802 if( pc<iCellFirst \|\| pc>iCellLast ){

1803 return SQLITE_CORRUPT_BKPT;

1804 }

1805 sz = pPage->xCellSize(pPage, &data[pc]);

1806 testcase( pc+sz==usableSize );

1807 if( pc+sz>usableSize ){

1808 return SQLITE_CORRUPT_BKPT;

1809 }

1810 }

1811 if( !pPage->leaf ) iCellLast++;

1812 }

1813

1814 /* Compute the total free space on the page

1815 ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the

1816 ** start of the first freeblock on the page, or is zero if there are no

1817 ** freeblocks. */

1818 pc = get2byte(&data[hdr+1]);

1819 nFree = data[hdr+7] + top; /* Init nFree to non-freeblock free space */

1820 while( pc>0 ){

1821 u16 next, size;

1822 if( pc<iCellFirst \|\| pc>iCellLast ){

1823 /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will

1824 ** always be at least one cell before the first freeblock.

1825 **

1826 ** Or, the freeblock is off the end of the page

1827 */

1828 return SQLITE_CORRUPT_BKPT;

1829 }

1830 next = get2byte(&data[pc]);

1831 size = get2byte(&data[pc+2]);

1832 if( (next>0 && next<=pc+size+3) \|\| pc+size>usableSize ){

1833 /* Free blocks must be in ascending order. And the last byte of

1834 ** the free-block must lie on the database page. */

1835 return SQLITE_CORRUPT_BKPT;

1836 }

1837 nFree = nFree + size;

1838 pc = next;

1839 }

1840

1841 /* At this point, nFree contains the sum of the offset to the start

1842 ** of the cell-content area plus the number of free bytes within

1843 ** the cell-content area. If this is greater than the usable-size

1844 ** of the page, then the page must be corrupted. This check also

1845 ** serves to verify that the offset to the start of the cell-content

1846 ** area, according to the page header, lies within the page.

1847 */

1848 if( nFree>usableSize ){

1849 return SQLITE_CORRUPT_BKPT;

1850 }

1851 pPage->nFree = (u16)(nFree - iCellFirst);

1852 pPage->isInit = 1;

1853 }

1854 return SQLITE_OK;

1855 }

1856

1857 /*

1858 ** Set up a raw page so that it looks like a database page holding

1859 ** no entries.

1860 */

1861 static void zeroPage(MemPage *pPage, int flags){

1862 unsigned char *data = pPage->aData;

1863 BtShared *pBt = pPage->pBt;

1864 u8 hdr = pPage->hdrOffset;

1865 u16 first;

1866

1867 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );

1868 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );

1869 assert( sqlite3PagerGetData(pPage->pDbPage) == data );

1870 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

1871 assert( sqlite3_mutex_held(pBt->mutex) );

1872 if( pBt->btsFlags & BTS_SECURE_DELETE ){

1873 memset(&data[hdr], 0, pBt->usableSize - hdr);

1874 }

1875 data[hdr] = (char)flags;

1876 first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);

1877 memset(&data[hdr+1], 0, 4);

1878 data[hdr+7] = 0;

1879 put2byte(&data[hdr+5], pBt->usableSize);

1880 pPage->nFree = (u16)(pBt->usableSize - first);

1881 decodeFlags(pPage, flags);

1882 pPage->cellOffset = first;

1883 pPage->aDataEnd = &data[pBt->usableSize];

1884 pPage->aCellIdx = &data[first];

1885 pPage->aDataOfst = &data[pPage->childPtrSize];

1886 pPage->nOverflow = 0;

1887 assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );

1888 pPage->maskPage = (u16)(pBt->pageSize - 1);

1889 pPage->nCell = 0;

1890 pPage->isInit = 1;

1891 }

1892

1893

1894 /*

1895 ** Convert a DbPage obtained from the pager into a MemPage used by

1896 ** the btree layer.

1897 */

1898 static MemPage btreePageFromDbPage(DbPage pDbPage, Pgno pgno, BtShared *pBt){

1899 MemPage pPage = (MemPage)sqlite3PagerGetExtra(pDbPage);

1900 if( pgno!=pPage->pgno ){

1901 pPage->aData = sqlite3PagerGetData(pDbPage);

1902 pPage->pDbPage = pDbPage;

1903 pPage->pBt = pBt;

1904 pPage->pgno = pgno;

1905 pPage->hdrOffset = pgno==1 ? 100 : 0;

1906 }

1907 assert( pPage->aData==sqlite3PagerGetData(pDbPage) );

1908 return pPage;

1909 }

1910

1911 /*

1912 ** Get a page from the pager. Initialize the MemPage.pBt and

1913 ** MemPage.aData elements if needed. See also: btreeGetUnusedPage().

1914 **

1915 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care

1916 ** about the content of the page at this time. So do not go to the disk

1917 ** to fetch the content. Just fill in the content with zeros for now.

1918 ** If in the future we call sqlite3PagerWrite() on this page, that

1919 ** means we have started to be concerned about content and the disk

1920 ** read should occur at that point.

1921 */

1922 static int btreeGetPage(

1923 BtShared pBt, / The btree */

1924 Pgno pgno, /* Number of the page to fetch */

1925 MemPage *ppPage, / Return the page in this parameter */

1926 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */

1927 ){

1928 int rc;

1929 DbPage *pDbPage;

1930

1931 assert( flags==0 \|\| flags==PAGER_GET_NOCONTENT \|\| flags==PAGER_GET_READONLY );

1932 assert( sqlite3_mutex_held(pBt->mutex) );

1933 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);

1934 if( rc ) return rc;

1935 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);

1936 return SQLITE_OK;

1937 }

1938

1939 /*

1940 ** Retrieve a page from the pager cache. If the requested page is not

1941 ** already in the pager cache return NULL. Initialize the MemPage.pBt and

1942 ** MemPage.aData elements if needed.

1943 */

1944 static MemPage btreePageLookup(BtShared pBt, Pgno pgno){

1945 DbPage *pDbPage;

1946 assert( sqlite3_mutex_held(pBt->mutex) );

1947 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);

1948 if( pDbPage ){

1949 return btreePageFromDbPage(pDbPage, pgno, pBt);

1950 }

1951 return 0;

1952 }

1953

1954 /*

1955 ** Return the size of the database file in pages. If there is any kind of

1956 ** error, return ((unsigned int)-1).

1957 */

1958 static Pgno btreePagecount(BtShared *pBt){

1959 return pBt->nPage;

1960 }

1961 u32 sqlite3BtreeLastPage(Btree *p){

1962 assert( sqlite3BtreeHoldsMutex(p) );

1963 assert( ((p->pBt->nPage)&0x8000000)==0 );

1964 return btreePagecount(p->pBt);

1965 }

1966

1967 /*

1968 ** Get a page from the pager and initialize it.

1969 **

1970 ** If pCur!=0 then the page is being fetched as part of a moveToChild()

1971 ** call. Do additional sanity checking on the page in this case.

1972 ** And if the fetch fails, this routine must decrement pCur->iPage.

1973 **

1974 ** The page is fetched as read-write unless pCur is not NULL and is

1975 ** a read-only cursor.

1976 **

1977 ** If an error occurs, then *ppPage is undefined. It

1978 ** may remain unchanged, or it may be set to an invalid value.

1979 */

1980 static int getAndInitPage(

1981 BtShared pBt, / The database file */

1982 Pgno pgno, /* Number of the page to get */

1983 MemPage *ppPage, / Write the page pointer here */

1984 BtCursor pCur, / Cursor to receive the page, or NULL */

1985 int bReadOnly /* True for a read-only page */

1986 ){

1987 int rc;

1988 DbPage *pDbPage;

1989 assert( sqlite3_mutex_held(pBt->mutex) );

1990 assert( pCur==0 \|\| ppPage==&pCur->apPage[pCur->iPage] );

1991 assert( pCur==0 \|\| bReadOnly==pCur->curPagerFlags );

1992 assert( pCur==0 \|\| pCur->iPage>0 );

1993

1994 if( pgno>btreePagecount(pBt) ){

1995 rc = SQLITE_CORRUPT_BKPT;

1996 goto getAndInitPage_error;

1997 }

1998 rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);

1999 if( rc ){

2000 goto getAndInitPage_error;

2001 }

2002 ppPage = (MemPage)sqlite3PagerGetExtra(pDbPage);

2003 if( (*ppPage)->isInit==0 ){

2004 btreePageFromDbPage(pDbPage, pgno, pBt);

2005 rc = btreeInitPage(*ppPage);

2006 if( rc!=SQLITE_OK ){

2007 releasePage(*ppPage);

2008 goto getAndInitPage_error;

2009 }

2010 }

2011 assert( (*ppPage)->pgno==pgno );

2012 assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );

2013

2014 /* If obtaining a child page for a cursor, we must verify that the page is

2015 ** compatible with the root page. */

2016 if( pCur && ((ppPage)->nCell<1 \|\| (ppPage)->intKey!=pCur->curIntKey) ){

2017 rc = SQLITE_CORRUPT_BKPT;

2018 releasePage(*ppPage);

2019 goto getAndInitPage_error;

2020 }

2021 return SQLITE_OK;

2022

2023 getAndInitPage_error:

2024 if( pCur ) pCur->iPage--;

2025 testcase( pgno==0 );

2026 assert( pgno!=0 \|\| rc==SQLITE_CORRUPT );

2027 return rc;

2028 }

2029

2030 /*

2031 ** Release a MemPage. This should be called once for each prior

2032 ** call to btreeGetPage.

2033 */

2034 static void releasePageNotNull(MemPage *pPage){

2035 assert( pPage->aData );

2036 assert( pPage->pBt );

2037 assert( pPage->pDbPage!=0 );

2038 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );

2039 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );

2040 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

2041 sqlite3PagerUnrefNotNull(pPage->pDbPage);

2042 }

2043 static void releasePage(MemPage *pPage){

2044 if( pPage ) releasePageNotNull(pPage);

2045 }

2046

2047 /*

2048 ** Get an unused page.

2049 **

2050 ** This works just like btreeGetPage() with the addition:

2051 **

2052 ** * If the page is already in use for some other purpose, immediately

2053 ** release it and return an SQLITE_CURRUPT error.

2054 ** * Make sure the isInit flag is clear

2055 */

2056 static int btreeGetUnusedPage(

2057 BtShared pBt, / The btree */

2058 Pgno pgno, /* Number of the page to fetch */

2059 MemPage *ppPage, / Return the page in this parameter */

2060 int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */

2061 ){

2062 int rc = btreeGetPage(pBt, pgno, ppPage, flags);

2063 if( rc==SQLITE_OK ){

2064 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){

2065 releasePage(*ppPage);

2066 *ppPage = 0;

2067 return SQLITE_CORRUPT_BKPT;

2068 }

2069 (*ppPage)->isInit = 0;

2070 }else{

2071 *ppPage = 0;

2072 }

2073 return rc;

2074 }

2075

2076

2077 /*

2078 ** During a rollback, when the pager reloads information into the cache

2079 ** so that the cache is restored to its original state at the start of

2080 ** the transaction, for each page restored this routine is called.

2081 **

2082 ** This routine needs to reset the extra data section at the end of the

2083 ** page to agree with the restored data.

2084 */

2085 static void pageReinit(DbPage *pData){

2086 MemPage *pPage;

2087 pPage = (MemPage *)sqlite3PagerGetExtra(pData);

2088 assert( sqlite3PagerPageRefcount(pData)>0 );

2089 if( pPage->isInit ){

2090 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

2091 pPage->isInit = 0;

2092 if( sqlite3PagerPageRefcount(pData)>1 ){

2093 /* pPage might not be a btree page; it might be an overflow page

2094 ** or ptrmap page or a free page. In those cases, the following

2095 ** call to btreeInitPage() will likely return SQLITE_CORRUPT.

2096 ** But no harm is done by this. And it is very important that

2097 ** btreeInitPage() be called on every btree page so we make

2098 ** the call for every page that comes in for re-initing. */

2099 btreeInitPage(pPage);

2100 }

2101 }

2102 }

2103

2104 /*

2105 ** Invoke the busy handler for a btree.

2106 */

2107 static int btreeInvokeBusyHandler(void *pArg){

2108 BtShared pBt = (BtShared)pArg;

2109 assert( pBt->db );

2110 assert( sqlite3_mutex_held(pBt->db->mutex) );

2111 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);

2112 }

2113

2114 /*

2115 ** Open a database file.

2116 **

2117 ** zFilename is the name of the database file. If zFilename is NULL

2118 ** then an ephemeral database is created. The ephemeral database might

2119 ** be exclusively in memory, or it might use a disk-based memory cache.

2120 ** Either way, the ephemeral database will be automatically deleted

2121 ** when sqlite3BtreeClose() is called.

2122 **

2123 ** If zFilename is ":memory:" then an in-memory database is created

2124 ** that is automatically destroyed when it is closed.

2125 **

2126 ** The "flags" parameter is a bitmask that might contain bits like

2127 ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.

2128 **

2129 ** If the database is already opened in the same database connection

2130 ** and we are in shared cache mode, then the open will fail with an

2131 ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared

2132 ** objects in the same database connection since doing so will lead

2133 ** to problems with locking.

2134 */

2135 int sqlite3BtreeOpen(

2136 sqlite3_vfs pVfs, / VFS to use for this b-tree */

2137 const char zFilename, / Name of the file containing the BTree database */

2138 sqlite3 db, / Associated database handle */

2139 Btree *ppBtree, / Pointer to new Btree object written here */

2140 int flags, /* Options */

2141 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */

2142 ){

2143 BtShared pBt = 0; / Shared part of btree structure */

2144 Btree p; / Handle to return */

2145 sqlite3_mutex mutexOpen = 0; / Prevents a race condition. Ticket #3537 */

2146 int rc = SQLITE_OK; /* Result code from this function */

2147 u8 nReserve; /* Byte of unused space on each page */

2148 unsigned char zDbHeader[100]; /* Database header content */

2149

2150 /* True if opening an ephemeral, temporary database */

2151 const int isTempDb = zFilename==0 \|\| zFilename[0]==0;

2152

2153 /* Set the variable isMemdb to true for an in-memory database, or

2154 ** false for a file-based database.

2155 */

2156 #ifdef SQLITE_OMIT_MEMORYDB

2157 const int isMemdb = 0;

2158 #else

2159 const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)

2160 \|\| (isTempDb && sqlite3TempInMemory(db))

2161 \|\| (vfsFlags & SQLITE_OPEN_MEMORY)!=0;

2162 #endif

2163

2164 assert( db!=0 );

2165 assert( pVfs!=0 );

2166 assert( sqlite3_mutex_held(db->mutex) );

2167 assert( (flags&0xff)==flags ); /* flags fit in 8 bits */

2168

2169 /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */

2170 assert( (flags & BTREE_UNORDERED)==0 \|\| (flags & BTREE_SINGLE)!=0 );

2171

2172 /* A BTREE_SINGLE database is always a temporary and/or ephemeral */

2173 assert( (flags & BTREE_SINGLE)==0 \|\| isTempDb );

2174

2175 if( isMemdb ){

2176 flags \|= BTREE_MEMORY;

2177 }

2178 if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb \|\| isTempDb) ){

2179 vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) \| SQLITE_OPEN_TEMP_DB;

2180 }

2181 p = sqlite3MallocZero(sizeof(Btree));

2182 if( !p ){

2183 return SQLITE_NOMEM;

2184 }

2185 p->inTrans = TRANS_NONE;

2186 p->db = db;

2187 #ifndef SQLITE_OMIT_SHARED_CACHE

2188 p->lock.pBtree = p;

2189 p->lock.iTable = 1;

2190 #endif

2191

2192 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)

2193 /*

2194 ** If this Btree is a candidate for shared cache, try to find an

2195 ** existing BtShared object that we can share with

2196 */

2197 if( isTempDb==0 && (isMemdb==0 \|\| (vfsFlags&SQLITE_OPEN_URI)!=0) ){

2198 if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){

2199 int nFilename = sqlite3Strlen30(zFilename)+1;

2200 int nFullPathname = pVfs->mxPathname+1;

2201 char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));

2202 MUTEX_LOGIC( sqlite3_mutex *mutexShared; )

2203

2204 p->sharable = 1;

2205 if( !zFullPathname ){

2206 sqlite3_free(p);

2207 return SQLITE_NOMEM;

2208 }

2209 if( isMemdb ){

2210 memcpy(zFullPathname, zFilename, nFilename);

2211 }else{

2212 rc = sqlite3OsFullPathname(pVfs, zFilename,

2213 nFullPathname, zFullPathname);

2214 if( rc ){

2215 sqlite3_free(zFullPathname);

2216 sqlite3_free(p);

2217 return rc;

2218 }

2219 }

2220 #if SQLITE_THREADSAFE

2221 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);

2222 sqlite3_mutex_enter(mutexOpen);

2223 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);

2224 sqlite3_mutex_enter(mutexShared);

2225 #endif

2226 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){

2227 assert( pBt->nRef>0 );

2228 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))

2229 && sqlite3PagerVfs(pBt->pPager)==pVfs ){

2230 int iDb;

2231 for(iDb=db->nDb-1; iDb>=0; iDb--){

2232 Btree *pExisting = db->aDb[iDb].pBt;

2233 if( pExisting && pExisting->pBt==pBt ){

2234 sqlite3_mutex_leave(mutexShared);

2235 sqlite3_mutex_leave(mutexOpen);

2236 sqlite3_free(zFullPathname);

2237 sqlite3_free(p);

2238 return SQLITE_CONSTRAINT;

2239 }

2240 }

2241 p->pBt = pBt;

2242 pBt->nRef++;

2243 break;

2244 }

2245 }

2246 sqlite3_mutex_leave(mutexShared);

2247 sqlite3_free(zFullPathname);

2248 }

2249 #ifdef SQLITE_DEBUG

2250 else{

2251 /* In debug mode, we mark all persistent databases as sharable

2252 ** even when they are not. This exercises the locking code and

2253 ** gives more opportunity for asserts(sqlite3_mutex_held())

2254 ** statements to find locking problems.

2255 */

2256 p->sharable = 1;

2257 }

2258 #endif

2259 }

2260 #endif

2261 if( pBt==0 ){

2262 /*

2263 ** The following asserts make sure that structures used by the btree are

2264 ** the right size. This is to guard against size changes that result

2265 ** when compiling on a different architecture.

2266 */

2267 assert( sizeof(i64)==8 );

2268 assert( sizeof(u64)==8 );

2269 assert( sizeof(u32)==4 );

2270 assert( sizeof(u16)==2 );

2271 assert( sizeof(Pgno)==4 );

2272

2273 pBt = sqlite3MallocZero( sizeof(*pBt) );

2274 if( pBt==0 ){

2275 rc = SQLITE_NOMEM;

2276 goto btree_open_out;

2277 }

2278 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,

2279 EXTRA_SIZE, flags, vfsFlags, pageReinit);

2280 if( rc==SQLITE_OK ){

2281 sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);

2282 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);

2283 }

2284 if( rc!=SQLITE_OK ){

2285 goto btree_open_out;

2286 }

2287 pBt->openFlags = (u8)flags;

2288 pBt->db = db;

2289 sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);

2290 p->pBt = pBt;

2291

2292 pBt->pCursor = 0;

2293 pBt->pPage1 = 0;

2294 if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags \|= BTS_READ_ONLY;

2295 #ifdef SQLITE_SECURE_DELETE

2296 pBt->btsFlags \|= BTS_SECURE_DELETE;

2297 #endif

2298 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is

2299 ** determined by the 2-byte integer located at an offset of 16 bytes from

2300 ** the beginning of the database file. */

2301 pBt->pageSize = (zDbHeader[16]<<8) \| (zDbHeader[17]<<16);

2302 if( pBt->pageSize<512 \|\| pBt->pageSize>SQLITE_MAX_PAGE_SIZE

2303 \|\| ((pBt->pageSize-1)&pBt->pageSize)!=0 ){

2304 pBt->pageSize = 0;

2305 #ifndef SQLITE_OMIT_AUTOVACUUM

2306 /* If the magic name ":memory:" will create an in-memory database, then

2307 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if

2308 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if

2309 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a

2310 ** regular file-name. In this case the auto-vacuum applies as per normal.

2311 */

2312 if( zFilename && !isMemdb ){

2313 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);

2314 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);

2315 }

2316 #endif

2317 nReserve = 0;

2318 }else{

2319 /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is

2320 ** determined by the one-byte unsigned integer found at an offset of 20

2321 ** into the database file header. */

2322 nReserve = zDbHeader[20];

2323 pBt->btsFlags \|= BTS_PAGESIZE_FIXED;

2324 #ifndef SQLITE_OMIT_AUTOVACUUM

2325 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);

2326 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);

2327 #endif

2328 }

2329 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);

2330 if( rc ) goto btree_open_out;

2331 pBt->usableSize = pBt->pageSize - nReserve;

2332 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */

2333

2334 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)

2335 /* Add the new BtShared object to the linked list sharable BtShareds.

2336 */

2337 if( p->sharable ){

2338 MUTEX_LOGIC( sqlite3_mutex *mutexShared; )

2339 pBt->nRef = 1;

2340 MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)

2341 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){

2342 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);

2343 if( pBt->mutex==0 ){

2344 rc = SQLITE_NOMEM;

2345 db->mallocFailed = 0;

2346 goto btree_open_out;

2347 }

2348 }

2349 sqlite3_mutex_enter(mutexShared);

2350 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);

2351 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;

2352 sqlite3_mutex_leave(mutexShared);

2353 }

2354 #endif

2355 }

2356

2357 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)

2358 /* If the new Btree uses a sharable pBtShared, then link the new

2359 ** Btree into the list of all sharable Btrees for the same connection.

2360 ** The list is kept in ascending order by pBt address.

2361 */

2362 if( p->sharable ){

2363 int i;

2364 Btree *pSib;

2365 for(i=0; i<db->nDb; i++){

2366 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){

2367 while( pSib->pPrev ){ pSib = pSib->pPrev; }

2368 if( p->pBt<pSib->pBt ){

2369 p->pNext = pSib;

2370 p->pPrev = 0;

2371 pSib->pPrev = p;

2372 }else{

2373 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){

2374 pSib = pSib->pNext;

2375 }

2376 p->pNext = pSib->pNext;

2377 p->pPrev = pSib;

2378 if( p->pNext ){

2379 p->pNext->pPrev = p;

2380 }

2381 pSib->pNext = p;

2382 }

2383 break;

2384 }

2385 }

2386 }

2387 #endif

2388 *ppBtree = p;

2389

2390 btree_open_out:

2391 if( rc!=SQLITE_OK ){

2392 if( pBt && pBt->pPager ){

2393 sqlite3PagerClose(pBt->pPager);

2394 }

2395 sqlite3_free(pBt);

2396 sqlite3_free(p);

2397 *ppBtree = 0;

2398 }else{

2399 /* If the B-Tree was successfully opened, set the pager-cache size to the

2400 ** default value. Except, when opening on an existing shared pager-cache,

2401 ** do not change the pager-cache size.

2402 */

2403 if( sqlite3BtreeSchema(p, 0, 0)==0 ){

2404 sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);

2405 }

2406 }

2407 if( mutexOpen ){

2408 assert( sqlite3_mutex_held(mutexOpen) );

2409 sqlite3_mutex_leave(mutexOpen);

2410 }

2411 return rc;

2412 }

2413

2414 /*

2415 ** Decrement the BtShared.nRef counter. When it reaches zero,

2416 ** remove the BtShared structure from the sharing list. Return

2417 ** true if the BtShared.nRef counter reaches zero and return

2418 ** false if it is still positive.

2419 */

2420 static int removeFromSharingList(BtShared *pBt){

2421 #ifndef SQLITE_OMIT_SHARED_CACHE

2422 MUTEX_LOGIC( sqlite3_mutex *pMaster; )

2423 BtShared *pList;

2424 int removed = 0;

2425

2426 assert( sqlite3_mutex_notheld(pBt->mutex) );

2427 MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )

2428 sqlite3_mutex_enter(pMaster);

2429 pBt->nRef--;

2430 if( pBt->nRef<=0 ){

2431 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){

2432 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;

2433 }else{

2434 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);

2435 while( ALWAYS(pList) && pList->pNext!=pBt ){

2436 pList=pList->pNext;

2437 }

2438 if( ALWAYS(pList) ){

2439 pList->pNext = pBt->pNext;

2440 }

2441 }

2442 if( SQLITE_THREADSAFE ){

2443 sqlite3_mutex_free(pBt->mutex);

2444 }

2445 removed = 1;

2446 }

2447 sqlite3_mutex_leave(pMaster);

2448 return removed;

2449 #else

2450 return 1;

2451 #endif

2452 }

2453

2454 /*

2455 ** Make sure pBt->pTmpSpace points to an allocation of

2456 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child

2457 ** pointer.

2458 */

2459 static void allocateTempSpace(BtShared *pBt){

2460 if( !pBt->pTmpSpace ){

2461 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );

2462

2463 /* One of the uses of pBt->pTmpSpace is to format cells before

2464 ** inserting them into a leaf page (function fillInCell()). If

2465 ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes

2466 ** by the various routines that manipulate binary cells. Which

2467 ** can mean that fillInCell() only initializes the first 2 or 3

2468 ** bytes of pTmpSpace, but that the first 4 bytes are copied from

2469 ** it into a database page. This is not actually a problem, but it

2470 ** does cause a valgrind error when the 1 or 2 bytes of unitialized

2471 ** data is passed to system call write(). So to avoid this error,

2472 ** zero the first 4 bytes of temp space here.

2473 **

2474 ** Also: Provide four bytes of initialized space before the

2475 ** beginning of pTmpSpace as an area available to prepend the

2476 ** left-child pointer to the beginning of a cell.

2477 */

2478 if( pBt->pTmpSpace ){

2479 memset(pBt->pTmpSpace, 0, 8);

2480 pBt->pTmpSpace += 4;

2481 }

2482 }

2483 }

2484

2485 /*

2486 ** Free the pBt->pTmpSpace allocation

2487 */

2488 static void freeTempSpace(BtShared *pBt){

2489 if( pBt->pTmpSpace ){

2490 pBt->pTmpSpace -= 4;

2491 sqlite3PageFree(pBt->pTmpSpace);

2492 pBt->pTmpSpace = 0;

2493 }

2494 }

2495

2496 /*

2497 ** Close an open database and invalidate all cursors.

2498 */

2499 int sqlite3BtreeClose(Btree *p){

2500 BtShared *pBt = p->pBt;

2501 BtCursor *pCur;

2502

2503 /* Close all cursors opened via this handle. */

2504 assert( sqlite3_mutex_held(p->db->mutex) );

2505 sqlite3BtreeEnter(p);

2506 pCur = pBt->pCursor;

2507 while( pCur ){

2508 BtCursor *pTmp = pCur;

2509 pCur = pCur->pNext;

2510 if( pTmp->pBtree==p ){

2511 sqlite3BtreeCloseCursor(pTmp);

2512 }

2513 }

2514

2515 /* Rollback any active transaction and free the handle structure.

2516 ** The call to sqlite3BtreeRollback() drops any table-locks held by

2517 ** this handle.

2518 */

2519 sqlite3BtreeRollback(p, SQLITE_OK, 0);

2520 sqlite3BtreeLeave(p);

2521

2522 /* If there are still other outstanding references to the shared-btree

2523 ** structure, return now. The remainder of this procedure cleans

2524 ** up the shared-btree.

2525 */

2526 assert( p->wantToLock==0 && p->locked==0 );

2527 if( !p->sharable \|\| removeFromSharingList(pBt) ){

2528 /* The pBt is no longer on the sharing list, so we can access

2529 ** it without having to hold the mutex.

2530 **

2531 ** Clean out and delete the BtShared object.

2532 */

2533 assert( !pBt->pCursor );

2534 sqlite3PagerClose(pBt->pPager);

2535 if( pBt->xFreeSchema && pBt->pSchema ){

2536 pBt->xFreeSchema(pBt->pSchema);

2537 }

2538 sqlite3DbFree(0, pBt->pSchema);

2539 freeTempSpace(pBt);

2540 sqlite3_free(pBt);

2541 }

2542

2543 #ifndef SQLITE_OMIT_SHARED_CACHE

2544 assert( p->wantToLock==0 );

2545 assert( p->locked==0 );

2546 if( p->pPrev ) p->pPrev->pNext = p->pNext;

2547 if( p->pNext ) p->pNext->pPrev = p->pPrev;

2548 #endif

2549

2550 sqlite3_free(p);

2551 return SQLITE_OK;

2552 }

2553

2554 /*

2555 ** Change the "soft" limit on the number of pages in the cache.

2556 ** Unused and unmodified pages will be recycled when the number of

2557 ** pages in the cache exceeds this soft limit. But the size of the

2558 ** cache is allowed to grow larger than this limit if it contains

2559 ** dirty pages or pages still in active use.

2560 */

2561 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){

2562 BtShared *pBt = p->pBt;

2563 assert( sqlite3_mutex_held(p->db->mutex) );

2564 sqlite3BtreeEnter(p);

2565 sqlite3PagerSetCachesize(pBt->pPager, mxPage);

2566 sqlite3BtreeLeave(p);

2567 return SQLITE_OK;

2568 }

2569

2570 /*

2571 ** Change the "spill" limit on the number of pages in the cache.

2572 ** If the number of pages exceeds this limit during a write transaction,

2573 ** the pager might attempt to "spill" pages to the journal early in

2574 ** order to free up memory.

2575 **

2576 ** The value returned is the current spill size. If zero is passed

2577 ** as an argument, no changes are made to the spill size setting, so

2578 ** using mxPage of 0 is a way to query the current spill size.

2579 */

2580 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){

2581 BtShared *pBt = p->pBt;

2582 int res;

2583 assert( sqlite3_mutex_held(p->db->mutex) );

2584 sqlite3BtreeEnter(p);

2585 res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);

2586 sqlite3BtreeLeave(p);

2587 return res;

2588 }

2589

#if SQLITE_MAX_MMAP_SIZE>0
/*
** Set the upper bound on how much of the database file may be memory
** mapped.  The new limit szMmap is simply forwarded to the pager while
** the b-tree mutex is held.
**
** The caller must hold the mutex of the owning database connection.
** Always returns SQLITE_OK.
*/
int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
  assert( sqlite3_mutex_held(p->db->mutex) );
  sqlite3BtreeEnter(p);
  sqlite3PagerSetMmapLimit(p->pBt->pPager, szMmap);
  sqlite3BtreeLeave(p);
  return SQLITE_OK;
}
#endif /* SQLITE_MAX_MMAP_SIZE>0 */

2604

2605 /*

2606 ** Change the way data is synced to disk in order to increase or decrease

2607 ** how well the database resists damage due to OS crashes and power

2608 ** failures. Level 1 is the same as asynchronous (no syncs() occur and

2609 ** there is a high probability of damage) Level 2 is the default. There

2610 ** is a very low but non-zero probability of damage. Level 3 reduces the

2611 ** probability of damage to near zero but with a write performance reduction.

2612 */

2613 #ifndef SQLITE_OMIT_PAGER_PRAGMAS

2614 int sqlite3BtreeSetPagerFlags(

2615 Btree p, / The btree to set the safety level on */

2616 unsigned pgFlags /* Various PAGER_* flags */

2617 ){

2618 BtShared *pBt = p->pBt;

2619 assert( sqlite3_mutex_held(p->db->mutex) );

2620 sqlite3BtreeEnter(p);

2621 sqlite3PagerSetFlags(pBt->pPager, pgFlags);

2622 sqlite3BtreeLeave(p);

2623 return SQLITE_OK;

2624 }

2625 #endif

2626

2627 /*

2628 ** Return TRUE if the given btree is set to safety level 1. In other

2629 ** words, return TRUE if no sync() occurs on the disk files.

2630 */

2631 int sqlite3BtreeSyncDisabled(Btree *p){

2632 BtShared *pBt = p->pBt;

2633 int rc;

2634 assert( sqlite3_mutex_held(p->db->mutex) );

2635 sqlite3BtreeEnter(p);

2636 assert( pBt && pBt->pPager );

2637 rc = sqlite3PagerNosync(pBt->pPager);

2638 sqlite3BtreeLeave(p);

2639 return rc;

2640 }

2641

2642 /*

2643 ** Change the default pages size and the number of reserved bytes per page.

2644 ** Or, if the page size has already been fixed, return SQLITE_READONLY

2645 ** without changing anything.

2646 **

2647 ** The page size must be a power of 2 between 512 and 65536. If the page

2648 ** size supplied does not meet this constraint then the page size is not

2649 ** changed.

2650 **

2651 ** Page sizes are constrained to be a power of two so that the region

2652 ** of the database file used for locking (beginning at PENDING_BYTE,

2653 ** the first byte past the 1GB boundary, 0x40000000) needs to occur

2654 ** at the beginning of a page.

2655 **

2656 ** If parameter nReserve is less than zero, then the number of reserved

2657 ** bytes per page is left unchanged.

2658 **

2659 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size

2660 ** and autovacuum mode can no longer be changed.

2661 */

2662 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){

2663 int rc = SQLITE_OK;

2664 BtShared *pBt = p->pBt;

2665 assert( nReserve>=-1 && nReserve<=255 );

2666 sqlite3BtreeEnter(p);

2667 #if SQLITE_HAS_CODEC

2668 if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;

2669 #endif

2670 if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){

2671 sqlite3BtreeLeave(p);

2672 return SQLITE_READONLY;

2673 }

2674 if( nReserve<0 ){

2675 nReserve = pBt->pageSize - pBt->usableSize;

2676 }

2677 assert( nReserve>=0 && nReserve<=255 );

2678 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&

2679 ((pageSize-1)&pageSize)==0 ){

2680 assert( (pageSize & 7)==0 );

2681 assert( !pBt->pCursor );

2682 pBt->pageSize = (u32)pageSize;

2683 freeTempSpace(pBt);

2684 }

2685 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);

2686 pBt->usableSize = pBt->pageSize - (u16)nReserve;

2687 if( iFix ) pBt->btsFlags \|= BTS_PAGESIZE_FIXED;

2688 sqlite3BtreeLeave(p);

2689 return rc;

2690 }

2691

2692 /*

2693 ** Return the currently defined page size

2694 */

2695 int sqlite3BtreeGetPageSize(Btree *p){

2696 return p->pBt->pageSize;

2697 }

2698

2699 /*

2700 ** This function is similar to sqlite3BtreeGetReserve(), except that it

2701 ** may only be called if it is guaranteed that the b-tree mutex is already

2702 ** held.

2703 **

2704 ** This is useful in one special case in the backup API code where it is

2705 ** known that the shared b-tree mutex is held, but the mutex on the

2706 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()

2707 ** were to be called, it might collide with some other operation on the

2708 ** database handle that owns *p, causing undefined behavior.

2709 */

2710 int sqlite3BtreeGetReserveNoMutex(Btree *p){

2711 int n;

2712 assert( sqlite3_mutex_held(p->pBt->mutex) );

2713 n = p->pBt->pageSize - p->pBt->usableSize;

2714 return n;

2715 }

2716

2717 /*

2718 ** Return the number of bytes of space at the end of every page that

2719 ** are intentually left unused. This is the "reserved" space that is

2720 ** sometimes used by extensions.

2721 **

2722 ** If SQLITE_HAS_MUTEX is defined then the number returned is the

2723 ** greater of the current reserved space and the maximum requested

2724 ** reserve space.

2725 */

2726 int sqlite3BtreeGetOptimalReserve(Btree *p){

2727 int n;

2728 sqlite3BtreeEnter(p);

2729 n = sqlite3BtreeGetReserveNoMutex(p);

2730 #ifdef SQLITE_HAS_CODEC

2731 if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;

2732 #endif

2733 sqlite3BtreeLeave(p);

2734 return n;

2735 }

2736

2737

2738 /*

2739 ** Set the maximum page count for a database if mxPage is positive.

2740 ** No changes are made if mxPage is 0 or negative.

2741 ** Regardless of the value of mxPage, return the maximum page count.

2742 */

2743 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){

2744 int n;

2745 sqlite3BtreeEnter(p);

2746 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);

2747 sqlite3BtreeLeave(p);

2748 return n;

2749 }

2750

2751 /*

2752 ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1. If newFlag is -1,

2753 ** then make no changes. Always return the value of the BTS_SECURE_DELETE

2754 ** setting after the change.

2755 */

2756 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){

2757 int b;

2758 if( p==0 ) return 0;

2759 sqlite3BtreeEnter(p);

2760 if( newFlag>=0 ){

2761 p->pBt->btsFlags &= ~BTS_SECURE_DELETE;

2762 if( newFlag ) p->pBt->btsFlags \|= BTS_SECURE_DELETE;

2763 }

2764 b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;

2765 sqlite3BtreeLeave(p);

2766 return b;

2767 }

2768

2769 /*

2770 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'

2771 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it

2772 ** is disabled. The default value for the auto-vacuum property is

2773 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.

2774 */

2775 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){

2776 #ifdef SQLITE_OMIT_AUTOVACUUM

2777 return SQLITE_READONLY;

2778 #else

2779 BtShared *pBt = p->pBt;

2780 int rc = SQLITE_OK;

2781 u8 av = (u8)autoVacuum;

2782

2783 sqlite3BtreeEnter(p);

2784 if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){

2785 rc = SQLITE_READONLY;

2786 }else{

2787 pBt->autoVacuum = av ?1:0;

2788 pBt->incrVacuum = av==2 ?1:0;

2789 }

2790 sqlite3BtreeLeave(p);

2791 return rc;

2792 #endif

2793 }

2794

2795 /*

2796 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is

2797 ** enabled 1 is returned. Otherwise 0.

2798 */

2799 int sqlite3BtreeGetAutoVacuum(Btree *p){

2800 #ifdef SQLITE_OMIT_AUTOVACUUM

2801 return BTREE_AUTOVACUUM_NONE;

2802 #else

2803 int rc;

2804 sqlite3BtreeEnter(p);

2805 rc = (

2806 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:

2807 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:

2808 BTREE_AUTOVACUUM_INCR

2809 );

2810 sqlite3BtreeLeave(p);

2811 return rc;

2812 #endif

2813 }

2814

2815

2816 /*

2817 ** Get a reference to pPage1 of the database file. This will

2818 ** also acquire a readlock on that file.

2819 **

2820 ** SQLITE_OK is returned on success. If the file is not a

2821 ** well-formed database file, then SQLITE_CORRUPT is returned.

2822 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM

2823 ** is returned if we run out of memory.

2824 */

2825 static int lockBtree(BtShared *pBt){

2826 int rc; /* Result code from subfunctions */

2827 MemPage pPage1; / Page 1 of the database file */

2828 int nPage; /* Number of pages in the database */

2829 int nPageFile = 0; /* Number of pages in the database file */

2830 int nPageHeader; /* Number of pages in the database according to hdr */

2831

2832 assert( sqlite3_mutex_held(pBt->mutex) );

2833 assert( pBt->pPage1==0 );

2834 rc = sqlite3PagerSharedLock(pBt->pPager);

2835 if( rc!=SQLITE_OK ) return rc;

2836 rc = btreeGetPage(pBt, 1, &pPage1, 0);

2837 if( rc!=SQLITE_OK ) return rc;

2838

2839 /* Do some checking to help insure the file we opened really is

2840 ** a valid database file.

2841 */

2842 nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);

2843 sqlite3PagerPagecount(pBt->pPager, &nPageFile);

2844 if( nPage==0 \|\| memcmp(24+(u8)pPage1->aData, 92+(u8)pPage1->aData,4)!=0 ){

2845 nPage = nPageFile;

2846 }

2847 if( nPage>0 ){

2848 u32 pageSize;

2849 u32 usableSize;

2850 u8 *page1 = pPage1->aData;

2851 rc = SQLITE_NOTADB;

2852 /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins

2853 ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d

2854 ** 61 74 20 33 00. */

2855 if( memcmp(page1, zMagicHeader, 16)!=0 ){

2856 goto page1_init_failed;

2857 }

2858

2859 #ifdef SQLITE_OMIT_WAL

2860 if( page1[18]>1 ){

2861 pBt->btsFlags \|= BTS_READ_ONLY;

2862 }

2863 if( page1[19]>1 ){

2864 goto page1_init_failed;

2865 }

2866 #else

2867 if( page1[18]>2 ){

2868 pBt->btsFlags \|= BTS_READ_ONLY;

2869 }

2870 if( page1[19]>2 ){

2871 goto page1_init_failed;

2872 }

2873

2874 /* If the write version is set to 2, this database should be accessed

2875 ** in WAL mode. If the log is not already open, open it now. Then

2876 ** return SQLITE_OK and return without populating BtShared.pPage1.

2877 ** The caller detects this and calls this function again. This is

2878 ** required as the version of page 1 currently in the page1 buffer

2879 ** may not be the latest version - there may be a newer one in the log

2880 ** file.

2881 */

2882 if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){

2883 int isOpen = 0;

2884 rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);

2885 if( rc!=SQLITE_OK ){

2886 goto page1_init_failed;

2887 }else if( isOpen==0 ){

2888 releasePage(pPage1);

2889 return SQLITE_OK;

2890 }

2891 rc = SQLITE_NOTADB;

2892 }

2893 #endif

2894

2895 /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload

2896 ** fractions and the leaf payload fraction values must be 64, 32, and 32.

2897 **

2898 ** The original design allowed these amounts to vary, but as of

2899 ** version 3.6.0, we require them to be fixed.

2900 */

2901 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){

2902 goto page1_init_failed;

2903 }

2904 /* EVIDENCE-OF: R-51873-39618 The page size for a database file is

2905 ** determined by the 2-byte integer located at an offset of 16 bytes from

2906 ** the beginning of the database file. */

2907 pageSize = (page1[16]<<8) \| (page1[17]<<16);

2908 /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two

2909 ** between 512 and 65536 inclusive. */

2910 if( ((pageSize-1)&pageSize)!=0

2911 \|\| pageSize>SQLITE_MAX_PAGE_SIZE

2912 \|\| pageSize<=256

2913 ){

2914 goto page1_init_failed;

2915 }

2916 assert( (pageSize & 7)==0 );

2917 /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte

2918 ** integer at offset 20 is the number of bytes of space at the end of

2919 ** each page to reserve for extensions.

2920 **

2921 ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is

2922 ** determined by the one-byte unsigned integer found at an offset of 20

2923 ** into the database file header. */

2924 usableSize = pageSize - page1[20];

2925 if( (u32)pageSize!=pBt->pageSize ){

2926 /* After reading the first page of the database assuming a page size

2927 ** of BtShared.pageSize, we have discovered that the page-size is

2928 ** actually pageSize. Unlock the database, leave pBt->pPage1 at

2929 ** zero and return SQLITE_OK. The caller will call this function

2930 ** again with the correct page-size.

2931 */

2932 releasePage(pPage1);

2933 pBt->usableSize = usableSize;

2934 pBt->pageSize = pageSize;

2935 freeTempSpace(pBt);

2936 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,

2937 pageSize-usableSize);

2938 return rc;

2939 }

2940 if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){

2941 rc = SQLITE_CORRUPT_BKPT;

2942 goto page1_init_failed;

2943 }

2944 /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to

2945 ** be less than 480. In other words, if the page size is 512, then the

2946 ** reserved space size cannot exceed 32. */

2947 if( usableSize<480 ){

2948 goto page1_init_failed;

2949 }

2950 pBt->pageSize = pageSize;

2951 pBt->usableSize = usableSize;

2952 #ifndef SQLITE_OMIT_AUTOVACUUM

2953 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);

2954 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);

2955 #endif

2956 }

2957

2958 /* maxLocal is the maximum amount of payload to store locally for

2959 ** a cell. Make sure it is small enough so that at least minFanout

2960 ** cells can will fit on one page. We assume a 10-byte page header.

2961 ** Besides the payload, the cell must store:

2962 ** 2-byte pointer to the cell

2963 ** 4-byte child pointer

2964 ** 9-byte nKey value

2965 ** 4-byte nData value

2966 ** 4-byte overflow page pointer

2967 ** So a cell consists of a 2-byte pointer, a header which is as much as

2968 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow

2969 ** page pointer.

2970 */

2971 pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);

2972 pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);

2973 pBt->maxLeaf = (u16)(pBt->usableSize - 35);

2974 pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);

2975 if( pBt->maxLocal>127 ){

2976 pBt->max1bytePayload = 127;

2977 }else{

2978 pBt->max1bytePayload = (u8)pBt->maxLocal;

2979 }

2980 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );

2981 pBt->pPage1 = pPage1;

2982 pBt->nPage = nPage;

2983 return SQLITE_OK;

2984

2985 page1_init_failed:

2986 releasePage(pPage1);

2987 pBt->pPage1 = 0;

2988 return rc;

2989 }

2990

2991 #ifndef NDEBUG

2992 /*

2993 ** Return the number of cursors open on pBt. This is for use

2994 ** in assert() expressions, so it is only compiled if NDEBUG is not

2995 ** defined.

2996 **

2997 ** Only write cursors are counted if wrOnly is true. If wrOnly is

2998 ** false then all cursors are counted.

2999 **

3000 ** For the purposes of this routine, a cursor is any cursor that

3001 ** is capable of reading or writing to the database. Cursors that

3002 ** have been tripped into the CURSOR_FAULT state are not counted.

3003 */

3004 static int countValidCursors(BtShared *pBt, int wrOnly){

3005 BtCursor *pCur;

3006 int r = 0;

3007 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){

3008 if( (wrOnly==0 \|\| (pCur->curFlags & BTCF_WriteFlag)!=0)

3009 && pCur->eState!=CURSOR_FAULT ) r++;

3010 }

3011 return r;

3012 }

3013 #endif

3014

3015 /*

3016 ** If there are no outstanding cursors and we are not in the middle

3017 ** of a transaction but there is a read lock on the database, then

3018 ** this routine unrefs the first page of the database file which

3019 ** has the effect of releasing the read lock.

3020 **

3021 ** If there is a transaction in progress, this routine is a no-op.

3022 */

3023 static void unlockBtreeIfUnused(BtShared *pBt){

3024 assert( sqlite3_mutex_held(pBt->mutex) );

3025 assert( countValidCursors(pBt,0)==0 \|\| pBt->inTransaction>TRANS_NONE );

3026 if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){

3027 MemPage *pPage1 = pBt->pPage1;

3028 assert( pPage1->aData );

3029 assert( sqlite3PagerRefcount(pBt->pPager)==1 );

3030 pBt->pPage1 = 0;

3031 releasePageNotNull(pPage1);

3032 }

3033 }

3034

3035 /*

3036 ** If pBt points to an empty file then convert that empty file

3037 ** into a new empty database by initializing the first page of

3038 ** the database.

3039 */

3040 static int newDatabase(BtShared *pBt){

3041 MemPage *pP1;

3042 unsigned char *data;

3043 int rc;

3044

3045 assert( sqlite3_mutex_held(pBt->mutex) );

3046 if( pBt->nPage>0 ){

3047 return SQLITE_OK;

3048 }

3049 pP1 = pBt->pPage1;

3050 assert( pP1!=0 );

3051 data = pP1->aData;

3052 rc = sqlite3PagerWrite(pP1->pDbPage);

3053 if( rc ) return rc;

3054 memcpy(data, zMagicHeader, sizeof(zMagicHeader));

3055 assert( sizeof(zMagicHeader)==16 );

3056 data[16] = (u8)((pBt->pageSize>>8)&0xff);

3057 data[17] = (u8)((pBt->pageSize>>16)&0xff);

3058 data[18] = 1;

3059 data[19] = 1;

3060 assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);

3061 data[20] = (u8)(pBt->pageSize - pBt->usableSize);

3062 data[21] = 64;

3063 data[22] = 32;

3064 data[23] = 32;

3065 memset(&data[24], 0, 100-24);

3066 zeroPage(pP1, PTF_INTKEY\|PTF_LEAF\|PTF_LEAFDATA );

3067 pBt->btsFlags \|= BTS_PAGESIZE_FIXED;

3068 #ifndef SQLITE_OMIT_AUTOVACUUM

3069 assert( pBt->autoVacuum==1 \|\| pBt->autoVacuum==0 );

3070 assert( pBt->incrVacuum==1 \|\| pBt->incrVacuum==0 );

3071 put4byte(&data[36 + 4*4], pBt->autoVacuum);

3072 put4byte(&data[36 + 7*4], pBt->incrVacuum);

3073 #endif

3074 pBt->nPage = 1;

3075 data[31] = 1;

3076 return SQLITE_OK;

3077 }

3078

3079 /*

3080 ** Initialize the first page of the database file (creating a database

3081 ** consisting of a single page and no schema objects). Return SQLITE_OK

3082 ** if successful, or an SQLite error code otherwise.

3083 */

3084 int sqlite3BtreeNewDb(Btree *p){

3085 int rc;

3086 sqlite3BtreeEnter(p);

3087 p->pBt->nPage = 0;

3088 rc = newDatabase(p->pBt);

3089 sqlite3BtreeLeave(p);

3090 return rc;

3091 }

3092

3093 /*

3094 ** Attempt to start a new transaction. A write-transaction

3095 ** is started if the second argument is nonzero, otherwise a read-

3096 ** transaction. If the second argument is 2 or more and exclusive

3097 ** transaction is started, meaning that no other process is allowed

3098 ** to access the database. A preexisting transaction may not be

3099 ** upgraded to exclusive by calling this routine a second time - the

3100 ** exclusivity flag only works for a new transaction.

3101 **

3102 ** A write-transaction must be started before attempting any

3103 ** changes to the database. None of the following routines

3104 ** will work unless a transaction is started first:

3105 **

3106 ** sqlite3BtreeCreateTable()

3107 ** sqlite3BtreeCreateIndex()

3108 ** sqlite3BtreeClearTable()

3109 ** sqlite3BtreeDropTable()

3110 ** sqlite3BtreeInsert()

3111 ** sqlite3BtreeDelete()

3112 ** sqlite3BtreeUpdateMeta()

3113 **

3114 ** If an initial attempt to acquire the lock fails because of lock contention

3115 ** and the database was previously unlocked, then invoke the busy handler

3116 ** if there is one. But if there was previously a read-lock, do not

3117 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is

3118 ** returned when there is already a read-lock in order to avoid a deadlock.

3119 **

3120 ** Suppose there are two processes A and B. A has a read lock and B has

3121 ** a reserved lock. B tries to promote to exclusive but is blocked because

3122 ** of A's read lock. A tries to promote to reserved but is blocked by B.

3123 ** One or the other of the two processes must give way or there can be

3124 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback

3125 ** when A already has a read lock, we encourage A to give up and let B

3126 ** proceed.

3127 */

3128 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){

3129 sqlite3 *pBlock = 0;

3130 BtShared *pBt = p->pBt;

3131 int rc = SQLITE_OK;

3132

3133 sqlite3BtreeEnter(p);

3134 btreeIntegrity(p);

3135

3136 /* If the btree is already in a write-transaction, or it

3137 ** is already in a read-transaction and a read-transaction

3138 ** is requested, this is a no-op.

3139 */

3140 if( p->inTrans==TRANS_WRITE \|\| (p->inTrans==TRANS_READ && !wrflag) ){

3141 goto trans_begun;

3142 }

3143 assert( pBt->inTransaction==TRANS_WRITE \|\| IfNotOmitAV(pBt->bDoTruncate)==0 );

3144

3145 /* Write transactions are not possible on a read-only database */

3146 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){

3147 rc = SQLITE_READONLY;

3148 goto trans_begun;

3149 }

3150

3151 #ifndef SQLITE_OMIT_SHARED_CACHE

3152 /* If another database handle has already opened a write transaction

3153 ** on this shared-btree structure and a second write transaction is

3154 ** requested, return SQLITE_LOCKED.

3155 */

3156 if( (wrflag && pBt->inTransaction==TRANS_WRITE)

3157 \|\| (pBt->btsFlags & BTS_PENDING)!=0

3158 ){

3159 pBlock = pBt->pWriter->db;

3160 }else if( wrflag>1 ){

3161 BtLock *pIter;

3162 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){

3163 if( pIter->pBtree!=p ){

3164 pBlock = pIter->pBtree->db;

3165 break;

3166 }

3167 }

3168 }

3169 if( pBlock ){

3170 sqlite3ConnectionBlocked(p->db, pBlock);

3171 rc = SQLITE_LOCKED_SHAREDCACHE;

3172 goto trans_begun;

3173 }

3174 #endif

3175

3176 /* Any read-only or read-write transaction implies a read-lock on

3177 ** page 1. So if some other shared-cache client already has a write-lock

3178 ** on page 1, the transaction cannot be opened. */

3179 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);

3180 if( SQLITE_OK!=rc ) goto trans_begun;

3181

3182 pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;

3183 if( pBt->nPage==0 ) pBt->btsFlags \|= BTS_INITIALLY_EMPTY;

3184 do {

3185 /* Call lockBtree() until either pBt->pPage1 is populated or

3186 ** lockBtree() returns something other than SQLITE_OK. lockBtree()

3187 ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after

3188 ** reading page 1 it discovers that the page-size of the database

3189 ** file is not pBt->pageSize. In this case lockBtree() will update

3190 ** pBt->pageSize to the page-size of the file on disk.

3191 */

3192 while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

3193

3194 if( rc==SQLITE_OK && wrflag ){

3195 if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){

3196 rc = SQLITE_READONLY;

3197 }else{

3198 rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));

3199 if( rc==SQLITE_OK ){

3200 rc = newDatabase(pBt);

3201 }

3202 }

3203 }

3204

3205 if( rc!=SQLITE_OK ){

3206 unlockBtreeIfUnused(pBt);

3207 }

3208 }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&

3209 btreeInvokeBusyHandler(pBt) );

3210

3211 if( rc==SQLITE_OK ){

3212 if( p->inTrans==TRANS_NONE ){

3213 pBt->nTransaction++;

3214 #ifndef SQLITE_OMIT_SHARED_CACHE

3215 if( p->sharable ){

3216 assert( p->lock.pBtree==p && p->lock.iTable==1 );

3217 p->lock.eLock = READ_LOCK;

3218 p->lock.pNext = pBt->pLock;

3219 pBt->pLock = &p->lock;

3220 }

3221 #endif

3222 }

3223 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);

3224 if( p->inTrans>pBt->inTransaction ){

3225 pBt->inTransaction = p->inTrans;

3226 }

3227 if( wrflag ){

3228 MemPage *pPage1 = pBt->pPage1;

3229 #ifndef SQLITE_OMIT_SHARED_CACHE

3230 assert( !pBt->pWriter );

3231 pBt->pWriter = p;

3232 pBt->btsFlags &= ~BTS_EXCLUSIVE;

3233 if( wrflag>1 ) pBt->btsFlags \|= BTS_EXCLUSIVE;

3234 #endif

3235

3236 /* If the db-size header field is incorrect (as it may be if an old

3237 ** client has been writing the database file), update it now. Doing

3238 ** this sooner rather than later means the database size can safely

3239 ** re-read the database size from page 1 if a savepoint or transaction

3240 ** rollback occurs within the transaction.

3241 */

3242 if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){

3243 rc = sqlite3PagerWrite(pPage1->pDbPage);

3244 if( rc==SQLITE_OK ){

3245 put4byte(&pPage1->aData[28], pBt->nPage);

3246 }

3247 }

3248 }

3249 }

3250

3251

3252 trans_begun:

3253 if( rc==SQLITE_OK && wrflag ){

3254 /* This call makes sure that the pager has the correct number of

3255 ** open savepoints. If the second parameter is greater than 0 and

3256 ** the sub-journal is not already open, then it will be opened here.

3257 */

3258 rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);

3259 }

3260

3261 btreeIntegrity(p);

3262 sqlite3BtreeLeave(p);

3263 return rc;

3264 }

3265

3266 #ifndef SQLITE_OMIT_AUTOVACUUM

3267

3268 /*

3269 ** Set the pointer-map entries for all children of page pPage. Also, if

3270 ** pPage contains cells that point to overflow pages, set the pointer

3271 ** map entries for the overflow pages as well.

3272 */

3273 static int setChildPtrmaps(MemPage *pPage){

3274 int i; /* Counter variable */

3275 int nCell; /* Number of cells in page pPage */

3276 int rc; /* Return code */

3277 BtShared *pBt = pPage->pBt;

3278 u8 isInitOrig = pPage->isInit;

3279 Pgno pgno = pPage->pgno;

3280

3281 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

3282 rc = btreeInitPage(pPage);

3283 if( rc!=SQLITE_OK ){

3284 goto set_child_ptrmaps_out;

3285 }

3286 nCell = pPage->nCell;

3287

3288 for(i=0; i<nCell; i++){

3289 u8 *pCell = findCell(pPage, i);

3290

3291 ptrmapPutOvflPtr(pPage, pCell, &rc);

3292

3293 if( !pPage->leaf ){

3294 Pgno childPgno = get4byte(pCell);

3295 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);

3296 }

3297 }

3298

3299 if( !pPage->leaf ){

3300 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);

3301 ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);

3302 }

3303

3304 set_child_ptrmaps_out:

3305 pPage->isInit = isInitOrig;

3306 return rc;

3307 }

3308

3309 /*

3310 ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so

3311 ** that it points to iTo. Parameter eType describes the type of pointer to

3312 ** be modified, as follows:

3313 **

3314 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child

3315 ** page of pPage.

3316 **

3317 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow

3318 ** page pointed to by one of the cells on pPage.

3319 **

3320 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next

3321 ** overflow page in the list.

3322 */

3323 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){

3324 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

3325 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

3326 if( eType==PTRMAP_OVERFLOW2 ){

3327 /* The pointer is always the first 4 bytes of the page in this case. */

3328 if( get4byte(pPage->aData)!=iFrom ){

3329 return SQLITE_CORRUPT_BKPT;

3330 }

3331 put4byte(pPage->aData, iTo);

3332 }else{

3333 u8 isInitOrig = pPage->isInit;

3334 int i;

3335 int nCell;

3336 int rc;

3337

3338 rc = btreeInitPage(pPage);

3339 if( rc ) return rc;

3340 nCell = pPage->nCell;

3341

3342 for(i=0; i<nCell; i++){

3343 u8 *pCell = findCell(pPage, i);

3344 if( eType==PTRMAP_OVERFLOW1 ){

3345 CellInfo info;

3346 pPage->xParseCell(pPage, pCell, &info);

3347 if( info.nLocal<info.nPayload

3348 && pCell+info.nSize-1<=pPage->aData+pPage->maskPage

3349 && iFrom==get4byte(pCell+info.nSize-4)

3350 ){

3351 put4byte(pCell+info.nSize-4, iTo);

3352 break;

3353 }

3354 }else{

3355 if( get4byte(pCell)==iFrom ){

3356 put4byte(pCell, iTo);

3357 break;

3358 }

3359 }

3360 }

3361

3362 if( i==nCell ){

3363 if( eType!=PTRMAP_BTREE \|\|

3364 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){

3365 return SQLITE_CORRUPT_BKPT;

3366 }

3367 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);

3368 }

3369

3370 pPage->isInit = isInitOrig;

3371 }

3372 return SQLITE_OK;

3373 }

3374

3375

3376 /*

3377 ** Move the open database page pDbPage to location iFreePage in the

3378 ** database. The pDbPage reference remains valid.

3379 **

3380 ** The isCommit flag indicates that there is no need to remember that

3381 ** the journal needs to be sync()ed before database page pDbPage->pgno

3382 ** can be written to. The caller has already promised not to write to that

3383 ** page.

3384 */

3385 static int relocatePage(

3386 BtShared pBt, / Btree */

3387 MemPage pDbPage, / Open page to move */

3388 u8 eType, /* Pointer map 'type' entry for pDbPage */

3389 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */

3390 Pgno iFreePage, /* The location to move pDbPage to */

3391 int isCommit /* isCommit flag passed to sqlite3PagerMovepage */

3392 ){

3393 MemPage pPtrPage; / The page that contains a pointer to pDbPage */

3394 Pgno iDbPage = pDbPage->pgno;

3395 Pager *pPager = pBt->pPager;

3396 int rc;

3397

3398 assert( eType==PTRMAP_OVERFLOW2 \|\| eType==PTRMAP_OVERFLOW1 \|\|

3399 eType==PTRMAP_BTREE \|\| eType==PTRMAP_ROOTPAGE );

3400 assert( sqlite3_mutex_held(pBt->mutex) );

3401 assert( pDbPage->pBt==pBt );

3402

3403 /* Move page iDbPage from its current location to page number iFreePage */

3404 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",

3405 iDbPage, iFreePage, iPtrPage, eType));

3406 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);

3407 if( rc!=SQLITE_OK ){

3408 return rc;

3409 }

3410 pDbPage->pgno = iFreePage;

3411

3412 /* If pDbPage was a btree-page, then it may have child pages and/or cells

3413 ** that point to overflow pages. The pointer map entries for all these

3414 ** pages need to be changed.

3415 **

3416 ** If pDbPage is an overflow page, then the first 4 bytes may store a

3417 ** pointer to a subsequent overflow page. If this is the case, then

3418 ** the pointer map needs to be updated for the subsequent overflow page.

3419 */

3420 if( eType==PTRMAP_BTREE \|\| eType==PTRMAP_ROOTPAGE ){

3421 rc = setChildPtrmaps(pDbPage);

3422 if( rc!=SQLITE_OK ){

3423 return rc;

3424 }

3425 }else{

3426 Pgno nextOvfl = get4byte(pDbPage->aData);

3427 if( nextOvfl!=0 ){

3428 ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);

3429 if( rc!=SQLITE_OK ){

3430 return rc;

3431 }

3432 }

3433 }

3434

3435 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so

3436 ** that it points at iFreePage. Also fix the pointer map entry for

3437 ** iPtrPage.

3438 */

3439 if( eType!=PTRMAP_ROOTPAGE ){

3440 rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);

3441 if( rc!=SQLITE_OK ){

3442 return rc;

3443 }

3444 rc = sqlite3PagerWrite(pPtrPage->pDbPage);

3445 if( rc!=SQLITE_OK ){

3446 releasePage(pPtrPage);

3447 return rc;

3448 }

3449 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);

3450 releasePage(pPtrPage);

3451 if( rc==SQLITE_OK ){

3452 ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);

3453 }

3454 }

3455 return rc;

3456 }

3457

3458 /* Forward declaration required by incrVacuumStep(). */

3459 static int allocateBtreePage(BtShared , MemPage , Pgno , Pgno, u8);

3460

3461 /*

3462 ** Perform a single step of an incremental-vacuum. If successful, return

3463 ** SQLITE_OK. If there is no work to do (and therefore no point in

3464 ** calling this function again), return SQLITE_DONE. Or, if an error

3465 ** occurs, return some other error code.

3466 **

3467 ** More specifically, this function attempts to re-organize the database so

3468 ** that the last page of the file currently in use is no longer in use.

3469 **

3470 ** Parameter nFin is the number of pages that this database would contain

3471 ** were this function called until it returns SQLITE_DONE.

3472 **

3473 ** If the bCommit parameter is non-zero, this function assumes that the

3474 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE

3475 ** or an error. bCommit is passed true for an auto-vacuum-on-commit

3476 ** operation, or false for an incremental vacuum.

3477 */

3478 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){

3479 Pgno nFreeList; /* Number of pages still on the free-list */

3480 int rc;

3481

3482 assert( sqlite3_mutex_held(pBt->mutex) );

3483 assert( iLastPg>nFin );

3484

3485 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){

3486 u8 eType;

3487 Pgno iPtrPage;

3488

3489 nFreeList = get4byte(&pBt->pPage1->aData[36]);

3490 if( nFreeList==0 ){

3491 return SQLITE_DONE;

3492 }

3493

3494 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);

3495 if( rc!=SQLITE_OK ){

3496 return rc;

3497 }

3498 if( eType==PTRMAP_ROOTPAGE ){

3499 return SQLITE_CORRUPT_BKPT;

3500 }

3501

3502 if( eType==PTRMAP_FREEPAGE ){

3503 if( bCommit==0 ){

3504 /* Remove the page from the files free-list. This is not required

3505 ** if bCommit is non-zero. In that case, the free-list will be

3506 ** truncated to zero after this function returns, so it doesn't

3507 ** matter if it still contains some garbage entries.

3508 */

3509 Pgno iFreePg;

3510 MemPage *pFreePg;

3511 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);

3512 if( rc!=SQLITE_OK ){

3513 return rc;

3514 }

3515 assert( iFreePg==iLastPg );

3516 releasePage(pFreePg);

3517 }

3518 } else {

3519 Pgno iFreePg; /* Index of free page to move pLastPg to */

3520 MemPage *pLastPg;

3521 u8 eMode = BTALLOC_ANY; /* Mode parameter for allocateBtreePage() */

3522 Pgno iNear = 0; /* nearby parameter for allocateBtreePage() */

3523

3524 rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);

3525 if( rc!=SQLITE_OK ){

3526 return rc;

3527 }

3528

3529 /* If bCommit is zero, this loop runs exactly once and page pLastPg

3530 ** is swapped with the first free page pulled off the free list.

3531 **

3532 ** On the other hand, if bCommit is greater than zero, then keep

3533 ** looping until a free-page located within the first nFin pages

3534 ** of the file is found.

3535 */

3536 if( bCommit==0 ){

3537 eMode = BTALLOC_LE;

3538 iNear = nFin;

3539 }

3540 do {

3541 MemPage *pFreePg;

3542 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);

3543 if( rc!=SQLITE_OK ){

3544 releasePage(pLastPg);

3545 return rc;

3546 }

3547 releasePage(pFreePg);

3548 }while( bCommit && iFreePg>nFin );

3549 assert( iFreePg<iLastPg );

3550

3551 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);

3552 releasePage(pLastPg);

3553 if( rc!=SQLITE_OK ){

3554 return rc;

3555 }

3556 }

3557 }

3558

3559 if( bCommit==0 ){

3560 do {

3561 iLastPg--;

3562 }while( iLastPg==PENDING_BYTE_PAGE(pBt) \|\| PTRMAP_ISPAGE(pBt, iLastPg) );

3563 pBt->bDoTruncate = 1;

3564 pBt->nPage = iLastPg;

3565 }

3566 return SQLITE_OK;

3567 }

3568

3569 /*

3570 ** The database opened by the first argument is an auto-vacuum database

3571 ** nOrig pages in size containing nFree free pages. Return the expected

3572 ** size of the database in pages following an auto-vacuum operation.

3573 */

3574 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){

3575 int nEntry; /* Number of entries on one ptrmap page */

3576 Pgno nPtrmap; /* Number of PtrMap pages to be freed */

3577 Pgno nFin; /* Return value */

3578

3579 nEntry = pBt->usableSize/5;

3580 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;

3581 nFin = nOrig - nFree - nPtrmap;

3582 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){

3583 nFin--;

3584 }

3585 while( PTRMAP_ISPAGE(pBt, nFin) \|\| nFin==PENDING_BYTE_PAGE(pBt) ){

3586 nFin--;

3587 }

3588

3589 return nFin;

3590 }

3591

3592 /*

3593 ** A write-transaction must be opened before calling this function.

3594 ** It performs a single unit of work towards an incremental vacuum.

3595 **

3596 ** If the incremental vacuum is finished after this function has run,

3597 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,

3598 ** SQLITE_OK is returned. Otherwise an SQLite error code.

3599 */

3600 int sqlite3BtreeIncrVacuum(Btree *p){

3601 int rc;

3602 BtShared *pBt = p->pBt;

3603

3604 sqlite3BtreeEnter(p);

3605 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );

3606 if( !pBt->autoVacuum ){

3607 rc = SQLITE_DONE;

3608 }else{

3609 Pgno nOrig = btreePagecount(pBt);

3610 Pgno nFree = get4byte(&pBt->pPage1->aData[36]);

3611 Pgno nFin = finalDbSize(pBt, nOrig, nFree);

3612

3613 if( nOrig<nFin ){

3614 rc = SQLITE_CORRUPT_BKPT;

3615 }else if( nFree>0 ){

3616 rc = saveAllCursors(pBt, 0, 0);

3617 if( rc==SQLITE_OK ){

3618 invalidateAllOverflowCache(pBt);

3619 rc = incrVacuumStep(pBt, nFin, nOrig, 0);

3620 }

3621 if( rc==SQLITE_OK ){

3622 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

3623 put4byte(&pBt->pPage1->aData[28], pBt->nPage);

3624 }

3625 }else{

3626 rc = SQLITE_DONE;

3627 }

3628 }

3629 sqlite3BtreeLeave(p);

3630 return rc;

3631 }

3632

3633 /*

3634 ** This routine is called prior to sqlite3PagerCommit when a transaction

3635 ** is committed for an auto-vacuum database.

3636 **

3637 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages

3638 ** the database file should be truncated to during the commit process.

3639 ** i.e. the database has been reorganized so that only the first *pnTrunc

3640 ** pages are in use.

3641 */

3642 static int autoVacuumCommit(BtShared *pBt){

3643 int rc = SQLITE_OK;

3644 Pager *pPager = pBt->pPager;

3645 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )

3646

3647 assert( sqlite3_mutex_held(pBt->mutex) );

3648 invalidateAllOverflowCache(pBt);

3649 assert(pBt->autoVacuum);

3650 if( !pBt->incrVacuum ){

3651 Pgno nFin; /* Number of pages in database after autovacuuming */

3652 Pgno nFree; /* Number of pages on the freelist initially */

3653 Pgno iFree; /* The next page to be freed */

3654 Pgno nOrig; /* Database size before freeing */

3655

3656 nOrig = btreePagecount(pBt);

3657 if( PTRMAP_ISPAGE(pBt, nOrig) \|\| nOrig==PENDING_BYTE_PAGE(pBt) ){

3658 /* It is not possible to create a database for which the final page

3659 ** is either a pointer-map page or the pending-byte page. If one

3660 ** is encountered, this indicates corruption.

3661 */

3662 return SQLITE_CORRUPT_BKPT;

3663 }

3664

3665 nFree = get4byte(&pBt->pPage1->aData[36]);

3666 nFin = finalDbSize(pBt, nOrig, nFree);

3667 if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;

3668 if( nFin<nOrig ){

3669 rc = saveAllCursors(pBt, 0, 0);

3670 }

3671 for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){

3672 rc = incrVacuumStep(pBt, nFin, iFree, 1);

3673 }

3674 if( (rc==SQLITE_DONE \|\| rc==SQLITE_OK) && nFree>0 ){

3675 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

3676 put4byte(&pBt->pPage1->aData[32], 0);

3677 put4byte(&pBt->pPage1->aData[36], 0);

3678 put4byte(&pBt->pPage1->aData[28], nFin);

3679 pBt->bDoTruncate = 1;

3680 pBt->nPage = nFin;

3681 }

3682 if( rc!=SQLITE_OK ){

3683 sqlite3PagerRollback(pPager);

3684 }

3685 }

3686

3687 assert( nRef>=sqlite3PagerRefcount(pPager) );

3688 return rc;

3689 }

3690

3691 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */

3692 # define setChildPtrmaps(x) SQLITE_OK

3693 #endif

3694

3695 /*

3696 ** This routine does the first phase of a two-phase commit. This routine

3697 ** causes a rollback journal to be created (if it does not already exist)

3698 ** and populated with enough information so that if a power loss occurs

3699 ** the database can be restored to its original state by playing back

3700 ** the journal. Then the contents of the journal are flushed out to

3701 ** the disk. After the journal is safely on oxide, the changes to the

3702 ** database are written into the database file and flushed to oxide.

3703 ** At the end of this call, the rollback journal still exists on the

3704 ** disk and we are still holding all locks, so the transaction has not

3705 ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the

3706 ** commit process.

3707 **

3708 ** This call is a no-op if no write-transaction is currently active on pBt.

3709 **

3710 ** Otherwise, sync the database file for the btree pBt. zMaster points to

3711 ** the name of a master journal file that should be written into the

3712 ** individual journal file, or is NULL, indicating no master journal file

3713 ** (single database transaction).

3714 **

3715 ** When this is called, the master journal should already have been

3716 ** created, populated with this journal pointer and synced to disk.

3717 **

3718 ** Once this is routine has returned, the only thing required to commit

3719 ** the write-transaction for this database file is to delete the journal.

3720 */

3721 int sqlite3BtreeCommitPhaseOne(Btree p, const char zMaster){

3722 int rc = SQLITE_OK;

3723 if( p->inTrans==TRANS_WRITE ){

3724 BtShared *pBt = p->pBt;

3725 sqlite3BtreeEnter(p);

3726 #ifndef SQLITE_OMIT_AUTOVACUUM

3727 if( pBt->autoVacuum ){

3728 rc = autoVacuumCommit(pBt);

3729 if( rc!=SQLITE_OK ){

3730 sqlite3BtreeLeave(p);

3731 return rc;

3732 }

3733 }

3734 if( pBt->bDoTruncate ){

3735 sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);

3736 }

3737 #endif

3738 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);

3739 sqlite3BtreeLeave(p);

3740 }

3741 return rc;

3742 }

3743

3744 /*

3745 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()

3746 ** at the conclusion of a transaction.

3747 */

3748 static void btreeEndTransaction(Btree *p){

3749 BtShared *pBt = p->pBt;

3750 sqlite3 *db = p->db;

3751 assert( sqlite3BtreeHoldsMutex(p) );

3752

3753 #ifndef SQLITE_OMIT_AUTOVACUUM

3754 pBt->bDoTruncate = 0;

3755 #endif

3756 if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){

3757 /* If there are other active statements that belong to this database

3758 ** handle, downgrade to a read-only transaction. The other statements

3759 ** may still be reading from the database. */

3760 downgradeAllSharedCacheTableLocks(p);

3761 p->inTrans = TRANS_READ;

3762 }else{

3763 /* If the handle had any kind of transaction open, decrement the

3764 ** transaction count of the shared btree. If the transaction count

3765 ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()

3766 ** call below will unlock the pager. */

3767 if( p->inTrans!=TRANS_NONE ){

3768 clearAllSharedCacheTableLocks(p);

3769 pBt->nTransaction--;

3770 if( 0==pBt->nTransaction ){

3771 pBt->inTransaction = TRANS_NONE;

3772 }

3773 }

3774

3775 /* Set the current transaction state to TRANS_NONE and unlock the

3776 ** pager if this call closed the only read or write transaction. */

3777 p->inTrans = TRANS_NONE;

3778 unlockBtreeIfUnused(pBt);

3779 }

3780

3781 btreeIntegrity(p);

3782 }

3783

3784 /*

3785 ** Commit the transaction currently in progress.

3786 **

3787 ** This routine implements the second phase of a 2-phase commit. The

3788 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should

3789 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne()

3790 ** routine did all the work of writing information out to disk and flushing the

3791 ** contents so that they are written onto the disk platter. All this

3792 ** routine has to do is delete or truncate or zero the header in the

3793 ** the rollback journal (which causes the transaction to commit) and

3794 ** drop locks.

3795 **

3796 ** Normally, if an error occurs while the pager layer is attempting to

3797 ** finalize the underlying journal file, this function returns an error and

3798 ** the upper layer will attempt a rollback. However, if the second argument

3799 ** is non-zero then this b-tree transaction is part of a multi-file

3800 ** transaction. In this case, the transaction has already been committed

3801 ** (by deleting a master journal file) and the caller will ignore this

3802 ** functions return code. So, even if an error occurs in the pager layer,

3803 ** reset the b-tree objects internal state to indicate that the write

3804 ** transaction has been closed. This is quite safe, as the pager will have

3805 ** transitioned to the error state.

3806 **

3807 ** This will release the write lock on the database file. If there

3808 ** are no active cursors, it also releases the read lock.

3809 */

3810 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){

3811

3812 if( p->inTrans==TRANS_NONE ) return SQLITE_OK;

3813 sqlite3BtreeEnter(p);

3814 btreeIntegrity(p);

3815

3816 /* If the handle has a write-transaction open, commit the shared-btrees

3817 ** transaction and set the shared state to TRANS_READ.

3818 */

3819 if( p->inTrans==TRANS_WRITE ){

3820 int rc;

3821 BtShared *pBt = p->pBt;

3822 assert( pBt->inTransaction==TRANS_WRITE );

3823 assert( pBt->nTransaction>0 );

3824 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);

3825 if( rc!=SQLITE_OK && bCleanup==0 ){

3826 sqlite3BtreeLeave(p);

3827 return rc;

3828 }

3829 p->iDataVersion--; /* Compensate for pPager->iDataVersion++; */

3830 pBt->inTransaction = TRANS_READ;

3831 btreeClearHasContent(pBt);

3832 }

3833

3834 btreeEndTransaction(p);

3835 sqlite3BtreeLeave(p);

3836 return SQLITE_OK;

3837 }

3838

3839 /*

3840 ** Do both phases of a commit.

3841 */

3842 int sqlite3BtreeCommit(Btree *p){

3843 int rc;

3844 sqlite3BtreeEnter(p);

3845 rc = sqlite3BtreeCommitPhaseOne(p, 0);

3846 if( rc==SQLITE_OK ){

3847 rc = sqlite3BtreeCommitPhaseTwo(p, 0);

3848 }

3849 sqlite3BtreeLeave(p);

3850 return rc;

3851 }

3852

3853 /*

3854 ** This routine sets the state to CURSOR_FAULT and the error

3855 ** code to errCode for every cursor on any BtShared that pBtree

3856 ** references. Or if the writeOnly flag is set to 1, then only

3857 ** trip write cursors and leave read cursors unchanged.

3858 **

3859 ** Every cursor is a candidate to be tripped, including cursors

3860 ** that belong to other database connections that happen to be

3861 ** sharing the cache with pBtree.

3862 **

3863 ** This routine gets called when a rollback occurs. If the writeOnly

3864 ** flag is true, then only write-cursors need be tripped - read-only

3865 ** cursors save their current positions so that they may continue

3866 ** following the rollback. Or, if writeOnly is false, all cursors are

3867 ** tripped. In general, writeOnly is false if the transaction being

3868 ** rolled back modified the database schema. In this case b-tree root

3869 ** pages may be moved or deleted from the database altogether, making

3870 ** it unsafe for read cursors to continue.

3871 **

3872 ** If the writeOnly flag is true and an error is encountered while

3873 ** saving the current position of a read-only cursor, all cursors,

3874 ** including all read-cursors are tripped.

3875 **

3876 ** SQLITE_OK is returned if successful, or if an error occurs while

3877 ** saving a cursor position, an SQLite error code.

3878 */

3879 int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){

3880 BtCursor *p;

3881 int rc = SQLITE_OK;

3882

3883 assert( (writeOnly==0 \|\| writeOnly==1) && BTCF_WriteFlag==1 );

3884 if( pBtree ){

3885 sqlite3BtreeEnter(pBtree);

3886 for(p=pBtree->pBt->pCursor; p; p=p->pNext){

3887 int i;

3888 if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){

3889 if( p->eState==CURSOR_VALID \|\| p->eState==CURSOR_SKIPNEXT ){

3890 rc = saveCursorPosition(p);

3891 if( rc!=SQLITE_OK ){

3892 (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);

3893 break;

3894 }

3895 }

3896 }else{

3897 sqlite3BtreeClearCursor(p);

3898 p->eState = CURSOR_FAULT;

3899 p->skipNext = errCode;

3900 }

3901 for(i=0; i<=p->iPage; i++){

3902 releasePage(p->apPage[i]);

3903 p->apPage[i] = 0;

3904 }

3905 }

3906 sqlite3BtreeLeave(pBtree);

3907 }

3908 return rc;

3909 }

3910

3911 /*

3912 ** Rollback the transaction in progress.

3913 **

3914 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).

3915 ** Only write cursors are tripped if writeOnly is true but all cursors are

3916 ** tripped if writeOnly is false. Any attempt to use

3917 ** a tripped cursor will result in an error.

3918 **

3919 ** This will release the write lock on the database file. If there

3920 ** are no active cursors, it also releases the read lock.

3921 */

3922 int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){

3923 int rc;

3924 BtShared *pBt = p->pBt;

3925 MemPage *pPage1;

3926

3927 assert( writeOnly==1 \|\| writeOnly==0 );

3928 assert( tripCode==SQLITE_ABORT_ROLLBACK \|\| tripCode==SQLITE_OK );

3929 sqlite3BtreeEnter(p);

3930 if( tripCode==SQLITE_OK ){

3931 rc = tripCode = saveAllCursors(pBt, 0, 0);

3932 if( rc ) writeOnly = 0;

3933 }else{

3934 rc = SQLITE_OK;

3935 }

3936 if( tripCode ){

3937 int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);

3938 assert( rc==SQLITE_OK \|\| (writeOnly==0 && rc2==SQLITE_OK) );

3939 if( rc2!=SQLITE_OK ) rc = rc2;

3940 }

3941 btreeIntegrity(p);

3942

3943 if( p->inTrans==TRANS_WRITE ){

3944 int rc2;

3945

3946 assert( TRANS_WRITE==pBt->inTransaction );

3947 rc2 = sqlite3PagerRollback(pBt->pPager);

3948 if( rc2!=SQLITE_OK ){

3949 rc = rc2;

3950 }

3951

3952 /* The rollback may have destroyed the pPage1->aData value. So

3953 ** call btreeGetPage() on page 1 again to make

3954 ** sure pPage1->aData is set correctly. */

3955 if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){

3956 int nPage = get4byte(28+(u8*)pPage1->aData);

3957 testcase( nPage==0 );

3958 if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);

3959 testcase( pBt->nPage!=nPage );

3960 pBt->nPage = nPage;

3961 releasePage(pPage1);

3962 }

3963 assert( countValidCursors(pBt, 1)==0 );

3964 pBt->inTransaction = TRANS_READ;

3965 btreeClearHasContent(pBt);

3966 }

3967

3968 btreeEndTransaction(p);

3969 sqlite3BtreeLeave(p);

3970 return rc;

3971 }

3972

3973 /*

3974 ** Start a statement subtransaction. The subtransaction can be rolled

3975 ** back independently of the main transaction. You must start a transaction

3976 ** before starting a subtransaction. The subtransaction is ended automatically

3977 ** if the main transaction commits or rolls back.

3978 **

3979 ** Statement subtransactions are used around individual SQL statements

3980 ** that are contained within a BEGIN...COMMIT block. If a constraint

3981 ** error occurs within the statement, the effect of that one statement

3982 ** can be rolled back without having to rollback the entire transaction.

3983 **

3984 ** A statement sub-transaction is implemented as an anonymous savepoint. The

3985 ** value passed as the second parameter is the total number of savepoints,

3986 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there

3987 ** are no active savepoints and no other statement-transactions open,

3988 ** iStatement is 1. This anonymous savepoint can be released or rolled back

3989 ** using the sqlite3BtreeSavepoint() function.

3990 */

3991 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){

3992 int rc;

3993 BtShared *pBt = p->pBt;

3994 sqlite3BtreeEnter(p);

3995 assert( p->inTrans==TRANS_WRITE );

3996 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

3997 assert( iStatement>0 );

3998 assert( iStatement>p->db->nSavepoint );

3999 assert( pBt->inTransaction==TRANS_WRITE );

4000 /* At the pager level, a statement transaction is a savepoint with

4001 ** an index greater than all savepoints created explicitly using

4002 ** SQL statements. It is illegal to open, release or rollback any

4003 ** such savepoints while the statement transaction savepoint is active.

4004 */

4005 rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);

4006 sqlite3BtreeLeave(p);

4007 return rc;

4008 }

4009

4010 /*

4011 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK

4012 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the

4013 ** savepoint identified by parameter iSavepoint, depending on the value

4014 ** of op.

4015 **

4016 ** Normally, iSavepoint is greater than or equal to zero. However, if op is

4017 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the

4018 ** contents of the entire transaction are rolled back. This is different

4019 ** from a normal transaction rollback, as no locks are released and the

4020 ** transaction remains open.

4021 */

4022 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){

4023 int rc = SQLITE_OK;

4024 if( p && p->inTrans==TRANS_WRITE ){

4025 BtShared *pBt = p->pBt;

4026 assert( op==SAVEPOINT_RELEASE \|\| op==SAVEPOINT_ROLLBACK );

4027 assert( iSavepoint>=0 \|\| (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );

4028 sqlite3BtreeEnter(p);

4029 rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);

4030 if( rc==SQLITE_OK ){

4031 if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){

4032 pBt->nPage = 0;

4033 }

4034 rc = newDatabase(pBt);

4035 pBt->nPage = get4byte(28 + pBt->pPage1->aData);

4036

4037 /* The database size was written into the offset 28 of the header

4038 ** when the transaction started, so we know that the value at offset

4039 ** 28 is nonzero. */

4040 assert( pBt->nPage>0 );

4041 }

4042 sqlite3BtreeLeave(p);

4043 }

4044 return rc;

4045 }

4046

4047 /*

4048 ** Create a new cursor for the BTree whose root is on the page

4049 ** iTable. If a read-only cursor is requested, it is assumed that

4050 ** the caller already has at least a read-only transaction open

4051 ** on the database already. If a write-cursor is requested, then

4052 ** the caller is assumed to have an open write transaction.

4053 **

4054 ** If wrFlag==0, then the cursor can only be used for reading.

4055 ** If wrFlag==1, then the cursor can be used for reading or for

4056 ** writing if other conditions for writing are also met. These

4057 ** are the conditions that must be met in order for writing to

4058 ** be allowed:

4059 **

4060 ** 1: The cursor must have been opened with wrFlag==1

4061 **

4062 ** 2: Other database connections that share the same pager cache

4063 ** but which are not in the READ_UNCOMMITTED state may not have

4064 ** cursors open with wrFlag==0 on the same table. Otherwise

4065 ** the changes made by this write cursor would be visible to

4066 ** the read cursors in the other database connection.

4067 **

4068 ** 3: The database must be writable (not on read-only media)

4069 **

4070 ** 4: There must be an active transaction.

4071 **

4072 ** No checking is done to make sure that page iTable really is the

4073 ** root page of a b-tree. If it is not, then the cursor acquired

4074 ** will not work correctly.

4075 **

4076 ** It is assumed that the sqlite3BtreeCursorZero() has been called

4077 ** on pCur to initialize the memory space prior to invoking this routine.

4078 */

4079 static int btreeCursor(

4080 Btree p, / The btree */

4081 int iTable, /* Root page of table to open */

4082 int wrFlag, /* 1 to write. 0 read-only */

4083 struct KeyInfo pKeyInfo, / First arg to comparison function */

4084 BtCursor pCur / Space for new cursor */

4085 ){

4086 BtShared pBt = p->pBt; / Shared b-tree handle */

4087 BtCursor pX; / Looping over other all cursors */

4088

4089 assert( sqlite3BtreeHoldsMutex(p) );

4090 assert( wrFlag==0

4091 \|\| wrFlag==BTREE_WRCSR

4092 \|\| wrFlag==(BTREE_WRCSR\|BTREE_FORDELETE)

4093 );

4094

4095 /* The following assert statements verify that if this is a sharable

4096 ** b-tree database, the connection is holding the required table locks,

4097 ** and that no other connection has any open cursor that conflicts with

4098 ** this lock. */

4099 assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) );

4100 assert( wrFlag==0 \|\| !hasReadConflicts(p, iTable) );

4101

4102 /* Assert that the caller has opened the required transaction. */

4103 assert( p->inTrans>TRANS_NONE );

4104 assert( wrFlag==0 \|\| p->inTrans==TRANS_WRITE );

4105 assert( pBt->pPage1 && pBt->pPage1->aData );

4106 assert( wrFlag==0 \|\| (pBt->btsFlags & BTS_READ_ONLY)==0 );

4107

4108 if( wrFlag ){

4109 allocateTempSpace(pBt);

4110 if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM;

4111 }

4112 if( iTable==1 && btreePagecount(pBt)==0 ){

4113 assert( wrFlag==0 );

4114 iTable = 0;

4115 }

4116

4117 /* Now that no other errors can occur, finish filling in the BtCursor

4118 ** variables and link the cursor into the BtShared list. */

4119 pCur->pgnoRoot = (Pgno)iTable;

4120 pCur->iPage = -1;

4121 pCur->pKeyInfo = pKeyInfo;

4122 pCur->pBtree = p;

4123 pCur->pBt = pBt;

4124 pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;

4125 pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;

4126 /* If there are two or more cursors on the same btree, then all such

4127 ** cursors must have the BTCF_Multiple flag set. */

4128 for(pX=pBt->pCursor; pX; pX=pX->pNext){

4129 if( pX->pgnoRoot==(Pgno)iTable ){

4130 pX->curFlags \|= BTCF_Multiple;

4131 pCur->curFlags \|= BTCF_Multiple;

4132 }

4133 }

4134 pCur->pNext = pBt->pCursor;

4135 pBt->pCursor = pCur;

4136 pCur->eState = CURSOR_INVALID;

4137 return SQLITE_OK;

4138 }

4139 int sqlite3BtreeCursor(

4140 Btree p, / The btree */

4141 int iTable, /* Root page of table to open */

4142 int wrFlag, /* 1 to write. 0 read-only */

4143 struct KeyInfo pKeyInfo, / First arg to xCompare() */

4144 BtCursor pCur / Write new cursor here */

4145 ){

4146 int rc;

4147 if( iTable<1 ){

4148 rc = SQLITE_CORRUPT_BKPT;

4149 }else{

4150 sqlite3BtreeEnter(p);

4151 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);

4152 sqlite3BtreeLeave(p);

4153 }

4154 return rc;

4155 }

4156

4157 /*

4158 ** Return the size of a BtCursor object in bytes.

4159 **

4160 ** This interfaces is needed so that users of cursors can preallocate

4161 ** sufficient storage to hold a cursor. The BtCursor object is opaque

4162 ** to users so they cannot do the sizeof() themselves - they must call

4163 ** this routine.

4164 */

4165 int sqlite3BtreeCursorSize(void){

4166 return ROUND8(sizeof(BtCursor));

4167 }

4168

4169 /*

4170 ** Initialize memory that will be converted into a BtCursor object.

4171 **

4172 ** The simple approach here would be to memset() the entire object

4173 ** to zero. But it turns out that the apPage[] and aiIdx[] arrays

4174 ** do not need to be zeroed and they are large, so we can save a lot

4175 ** of run-time by skipping the initialization of those elements.

4176 */

4177 void sqlite3BtreeCursorZero(BtCursor *p){

4178 memset(p, 0, offsetof(BtCursor, iPage));

4179 }

4180

4181 /*

4182 ** Close a cursor. The read lock on the database file is released

4183 ** when the last cursor is closed.

4184 */

4185 int sqlite3BtreeCloseCursor(BtCursor *pCur){

4186 Btree *pBtree = pCur->pBtree;

4187 if( pBtree ){

4188 int i;

4189 BtShared *pBt = pCur->pBt;

4190 sqlite3BtreeEnter(pBtree);

4191 sqlite3BtreeClearCursor(pCur);

4192 assert( pBt->pCursor!=0 );

4193 if( pBt->pCursor==pCur ){

4194 pBt->pCursor = pCur->pNext;

4195 }else{

4196 BtCursor *pPrev = pBt->pCursor;

4197 do{

4198 if( pPrev->pNext==pCur ){

4199 pPrev->pNext = pCur->pNext;

4200 break;

4201 }

4202 pPrev = pPrev->pNext;

4203 }while( ALWAYS(pPrev) );

4204 }

4205 for(i=0; i<=pCur->iPage; i++){

4206 releasePage(pCur->apPage[i]);

4207 }

4208 unlockBtreeIfUnused(pBt);

4209 sqlite3_free(pCur->aOverflow);

4210 /* sqlite3_free(pCur); */

4211 sqlite3BtreeLeave(pBtree);

4212 }

4213 return SQLITE_OK;

4214 }

4215

4216 /*

4217 ** Make sure the BtCursor* given in the argument has a valid

4218 ** BtCursor.info structure. If it is not already valid, call

4219 ** btreeParseCell() to fill it in.

4220 **

4221 ** BtCursor.info is a cache of the information in the current cell.

4222 ** Using this cache reduces the number of calls to btreeParseCell().

4223 */

4224 #ifndef NDEBUG

4225 static void assertCellInfo(BtCursor *pCur){

4226 CellInfo info;

4227 int iPage = pCur->iPage;

4228 memset(&info, 0, sizeof(info));

4229 btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);

4230 assert( CORRUPT_DB \|\| memcmp(&info, &pCur->info, sizeof(info))==0 );

4231 }

4232 #else

4233 #define assertCellInfo(x)

4234 #endif

4235 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){

4236 if( pCur->info.nSize==0 ){

4237 int iPage = pCur->iPage;

4238 pCur->curFlags \|= BTCF_ValidNKey;

4239 btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);

4240 }else{

4241 assertCellInfo(pCur);

4242 }

4243 }

4244

4245 #ifndef NDEBUG /* The next routine used only within assert() statements */

4246 /*

4247 ** Return true if the given BtCursor is valid. A valid cursor is one

4248 ** that is currently pointing to a row in a (non-empty) table.

4249 ** This is a verification routine is used only within assert() statements.

4250 */

4251 int sqlite3BtreeCursorIsValid(BtCursor *pCur){

4252 return pCur && pCur->eState==CURSOR_VALID;

4253 }

4254 #endif /* NDEBUG */

4255

4256 /*

4257 ** Set *pSize to the size of the buffer needed to hold the value of

4258 ** the key for the current entry. If the cursor is not pointing

4259 ** to a valid entry, *pSize is set to 0.

4260 **

4261 ** For a table with the INTKEY flag set, this routine returns the key

4262 ** itself, not the number of bytes in the key.

4263 **

4264 ** The caller must position the cursor prior to invoking this routine.

4265 **

4266 ** This routine cannot fail. It always returns SQLITE_OK.

4267 */

4268 int sqlite3BtreeKeySize(BtCursor pCur, i64 pSize){

4269 assert( cursorHoldsMutex(pCur) );

4270 assert( pCur->eState==CURSOR_VALID );

4271 getCellInfo(pCur);

4272 *pSize = pCur->info.nKey;

4273 return SQLITE_OK;

4274 }

4275

4276 /*

4277 ** Set *pSize to the number of bytes of data in the entry the

4278 ** cursor currently points to.

4279 **

4280 ** The caller must guarantee that the cursor is pointing to a non-NULL

4281 ** valid entry. In other words, the calling procedure must guarantee

4282 ** that the cursor has Cursor.eState==CURSOR_VALID.

4283 **

4284 ** Failure is not possible. This function always returns SQLITE_OK.

4285 ** It might just as well be a procedure (returning void) but we continue

4286 ** to return an integer result code for historical reasons.

4287 */

4288 int sqlite3BtreeDataSize(BtCursor pCur, u32 pSize){

4289 assert( cursorHoldsMutex(pCur) );

4290 assert( pCur->eState==CURSOR_VALID );

4291 assert( pCur->iPage>=0 );

4292 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );

4293 assert( pCur->apPage[pCur->iPage]->intKeyLeaf==1 );

4294 getCellInfo(pCur);

4295 *pSize = pCur->info.nPayload;

4296 return SQLITE_OK;

4297 }

4298

4299 /*

4300 ** Given the page number of an overflow page in the database (parameter

4301 ** ovfl), this function finds the page number of the next page in the

4302 ** linked list of overflow pages. If possible, it uses the auto-vacuum

4303 ** pointer-map data instead of reading the content of page ovfl to do so.

4304 **

4305 ** If an error occurs an SQLite error code is returned. Otherwise:

4306 **

4307 ** The page number of the next overflow page in the linked list is

4308 ** written to *pPgnoNext. If page ovfl is the last page in its linked

4309 ** list, *pPgnoNext is set to zero.

4310 **

4311 ** If ppPage is not NULL, and a reference to the MemPage object corresponding

4312 ** to page number pOvfl was obtained, then *ppPage is set to point to that

4313 ** reference. It is the responsibility of the caller to call releasePage()

4314 ** on *ppPage to free the reference. In no reference was obtained (because

4315 ** the pointer-map was used to obtain the value for *pPgnoNext), then

4316 ** *ppPage is set to zero.

4317 */

4318 static int getOverflowPage(

4319 BtShared pBt, / The database file */

4320 Pgno ovfl, /* Current overflow page number */

4321 MemPage *ppPage, / OUT: MemPage handle (may be NULL) */

4322 Pgno pPgnoNext / OUT: Next overflow page number */

4323 ){

4324 Pgno next = 0;

4325 MemPage *pPage = 0;

4326 int rc = SQLITE_OK;

4327

4328 assert( sqlite3_mutex_held(pBt->mutex) );

4329 assert(pPgnoNext);

4330

4331 #ifndef SQLITE_OMIT_AUTOVACUUM

4332 /* Try to find the next page in the overflow list using the

4333 ** autovacuum pointer-map pages. Guess that the next page in

4334 ** the overflow list is page number (ovfl+1). If that guess turns

4335 ** out to be wrong, fall back to loading the data of page

4336 ** number ovfl to determine the next page number.

4337 */

4338 if( pBt->autoVacuum ){

4339 Pgno pgno;

4340 Pgno iGuess = ovfl+1;

4341 u8 eType;

4342

4343 while( PTRMAP_ISPAGE(pBt, iGuess) \|\| iGuess==PENDING_BYTE_PAGE(pBt) ){

4344 iGuess++;

4345 }

4346

4347 if( iGuess<=btreePagecount(pBt) ){

4348 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);

4349 if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){

4350 next = iGuess;

4351 rc = SQLITE_DONE;

4352 }

4353 }

4354 }

4355 #endif

4356

4357 assert( next==0 \|\| rc==SQLITE_DONE );

4358 if( rc==SQLITE_OK ){

4359 rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);

4360 assert( rc==SQLITE_OK \|\| pPage==0 );

4361 if( rc==SQLITE_OK ){

4362 next = get4byte(pPage->aData);

4363 }

4364 }

4365

4366 *pPgnoNext = next;

4367 if( ppPage ){

4368 *ppPage = pPage;

4369 }else{

4370 releasePage(pPage);

4371 }

4372 return (rc==SQLITE_DONE ? SQLITE_OK : rc);

4373 }

4374

4375 /*

4376 ** Copy data from a buffer to a page, or from a page to a buffer.

4377 **

4378 ** pPayload is a pointer to data stored on database page pDbPage.

4379 ** If argument eOp is false, then nByte bytes of data are copied

4380 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,

4381 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes

4382 ** of data are copied from the buffer pBuf to pPayload.

4383 **

4384 ** SQLITE_OK is returned on success, otherwise an error code.

4385 */

4386 static int copyPayload(

4387 void pPayload, / Pointer to page data */

4388 void pBuf, / Pointer to buffer */

4389 int nByte, /* Number of bytes to copy */

4390 int eOp, /* 0 -> copy from page, 1 -> copy to page */

4391 DbPage pDbPage / Page containing pPayload */

4392 ){

4393 if( eOp ){

4394 /* Copy data from buffer to page (a write operation) */

4395 int rc = sqlite3PagerWrite(pDbPage);

4396 if( rc!=SQLITE_OK ){

4397 return rc;

4398 }

4399 memcpy(pPayload, pBuf, nByte);

4400 }else{

4401 /* Copy data from page to buffer (a read operation) */

4402 memcpy(pBuf, pPayload, nByte);

4403 }

4404 return SQLITE_OK;

4405 }

4406

4407 /*

4408 ** This function is used to read or overwrite payload information

4409 ** for the entry that the pCur cursor is pointing to. The eOp

4410 ** argument is interpreted as follows:

4411 **

4412 ** 0: The operation is a read. Populate the overflow cache.

4413 ** 1: The operation is a write. Populate the overflow cache.

4414 ** 2: The operation is a read. Do not populate the overflow cache.

4415 **

4416 ** A total of "amt" bytes are read or written beginning at "offset".

4417 ** Data is read to or from the buffer pBuf.

4418 **

4419 ** The content being read or written might appear on the main page

4420 ** or be scattered out on multiple overflow pages.

4421 **

4422 ** If the current cursor entry uses one or more overflow pages and the

4423 ** eOp argument is not 2, this function may allocate space for and lazily

4424 ** populates the overflow page-list cache array (BtCursor.aOverflow).

4425 ** Subsequent calls use this cache to make seeking to the supplied offset

4426 ** more efficient.

4427 **

4428 ** Once an overflow page-list cache has been allocated, it may be

4429 ** invalidated if some other cursor writes to the same table, or if

4430 ** the cursor is moved to a different row. Additionally, in auto-vacuum

4431 ** mode, the following events may invalidate an overflow page-list cache.

4432 **

4433 ** * An incremental vacuum,

4434 ** * A commit in auto_vacuum="full" mode,

4435 ** * Creating a table (may require moving an overflow page).

4436 */

4437 static int accessPayload(

4438 BtCursor pCur, / Cursor pointing to entry to read from */

4439 u32 offset, /* Begin reading this far into payload */

4440 u32 amt, /* Read this many bytes */

4441 unsigned char pBuf, / Write the bytes into this buffer */

4442 int eOp /* zero to read. non-zero to write. */

4443 ){

4444 unsigned char *aPayload;

4445 int rc = SQLITE_OK;

4446 int iIdx = 0;

4447 MemPage pPage = pCur->apPage[pCur->iPage]; / Btree page of current entry */

4448 BtShared pBt = pCur->pBt; / Btree this cursor belongs to */

4449 #ifdef SQLITE_DIRECT_OVERFLOW_READ

4450 unsigned char * const pBufStart = pBuf;

4451 int bEnd; /* True if reading to end of data */

4452 #endif

4453

4454 assert( pPage );

4455 assert( pCur->eState==CURSOR_VALID );

4456 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );

4457 assert( cursorHoldsMutex(pCur) );

4458 assert( eOp!=2 \|\| offset==0 ); /* Always start from beginning for eOp==2 */

4459

4460 getCellInfo(pCur);

4461 aPayload = pCur->info.pPayload;

4462 #ifdef SQLITE_DIRECT_OVERFLOW_READ

4463 bEnd = offset+amt==pCur->info.nPayload;

4464 #endif

4465 assert( offset+amt <= pCur->info.nPayload );

4466

4467 if( &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] ){

4468 /* Trying to read or write past the end of the data is an error */

4469 return SQLITE_CORRUPT_BKPT;

4470 }

4471

4472 /* Check if data must be read/written to/from the btree page itself. */

4473 if( offset<pCur->info.nLocal ){

4474 int a = amt;

4475 if( a+offset>pCur->info.nLocal ){

4476 a = pCur->info.nLocal - offset;

4477 }

4478 rc = copyPayload(&aPayload[offset], pBuf, a, (eOp & 0x01), pPage->pDbPage);

4479 offset = 0;

4480 pBuf += a;

4481 amt -= a;

4482 }else{

4483 offset -= pCur->info.nLocal;

4484 }

4485

4486

4487 if( rc==SQLITE_OK && amt>0 ){

4488 const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */

4489 Pgno nextPage;

4490

4491 nextPage = get4byte(&aPayload[pCur->info.nLocal]);

4492

4493 /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.

4494 ** Except, do not allocate aOverflow[] for eOp==2.

4495 **

4496 ** The aOverflow[] array is sized at one entry for each overflow page

4497 ** in the overflow chain. The page number of the first overflow page is

4498 ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array

4499 ** means "not yet known" (the cache is lazily populated).

4500 */

4501 if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){

4502 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;

4503 if( nOvfl>pCur->nOvflAlloc ){

4504 Pgno aNew = (Pgno)sqlite3Realloc(

4505 pCur->aOverflow, nOvfl2sizeof(Pgno)

4506 );

4507 if( aNew==0 ){

4508 rc = SQLITE_NOMEM;

4509 }else{

4510 pCur->nOvflAlloc = nOvfl*2;

4511 pCur->aOverflow = aNew;

4512 }

4513 }

4514 if( rc==SQLITE_OK ){

4515 memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));

4516 pCur->curFlags \|= BTCF_ValidOvfl;

4517 }

4518 }

4519

4520 /* If the overflow page-list cache has been allocated and the

4521 ** entry for the first required overflow page is valid, skip

4522 ** directly to it.

4523 */

4524 if( (pCur->curFlags & BTCF_ValidOvfl)!=0

4525 && pCur->aOverflow[offset/ovflSize]

4526 ){

4527 iIdx = (offset/ovflSize);

4528 nextPage = pCur->aOverflow[iIdx];

4529 offset = (offset%ovflSize);

4530 }

4531

4532 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){

4533

4534 /* If required, populate the overflow page-list cache. */

4535 if( (pCur->curFlags & BTCF_ValidOvfl)!=0 ){

4536 assert( pCur->aOverflow[iIdx]==0

4537 \|\| pCur->aOverflow[iIdx]==nextPage

4538 \|\| CORRUPT_DB );

4539 pCur->aOverflow[iIdx] = nextPage;

4540 }

4541

4542 if( offset>=ovflSize ){

4543 /* The only reason to read this page is to obtain the page

4544 ** number for the next page in the overflow chain. The page

4545 ** data is not required. So first try to lookup the overflow

4546 ** page-list cache, if any, then fall back to the getOverflowPage()

4547 ** function.

4548 **

4549 ** Note that the aOverflow[] array must be allocated because eOp!=2

4550 ** here. If eOp==2, then offset==0 and this branch is never taken.

4551 */

4552 assert( eOp!=2 );

4553 assert( pCur->curFlags & BTCF_ValidOvfl );

4554 assert( pCur->pBtree->db==pBt->db );

4555 if( pCur->aOverflow[iIdx+1] ){

4556 nextPage = pCur->aOverflow[iIdx+1];

4557 }else{

4558 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);

4559 }

4560 offset -= ovflSize;

4561 }else{

4562 /* Need to read this page properly. It contains some of the

4563 ** range of data that is being read (eOp==0) or written (eOp!=0).

4564 */

4565 #ifdef SQLITE_DIRECT_OVERFLOW_READ

4566 sqlite3_file *fd;

4567 #endif

4568 int a = amt;

4569 if( a + offset > ovflSize ){

4570 a = ovflSize - offset;

4571 }

4572

4573 #ifdef SQLITE_DIRECT_OVERFLOW_READ

4574 /* If all the following are true:

4575 **

4576 ** 1) this is a read operation, and

4577 ** 2) data is required from the start of this overflow page, and

4578 ** 3) the database is file-backed, and

4579 ** 4) there is no open write-transaction, and

4580 ** 5) the database is not a WAL database,

4581 ** 6) all data from the page is being read.

4582 ** 7) at least 4 bytes have already been read into the output buffer

4583 **

4584 ** then data can be read directly from the database file into the

4585 ** output buffer, bypassing the page-cache altogether. This speeds

4586 ** up loading large records that span many overflow pages.

4587 */

4588 if( (eOp&0x01)==0 /* (1) */

4589 && offset==0 /* (2) */

4590 && (bEnd \|\| a==ovflSize) /* (6) */

4591 && pBt->inTransaction==TRANS_READ /* (4) */

4592 && (fd = sqlite3PagerFile(pBt->pPager))->pMethods /* (3) */

4593 && pBt->pPage1->aData[19]==0x01 /* (5) */

4594 && &pBuf[-4]>=pBufStart /* (7) */

4595 ){

4596 u8 aSave[4];

4597 u8 *aWrite = &pBuf[-4];

4598 assert( aWrite>=pBufStart ); /* hence (7) */

4599 memcpy(aSave, aWrite, 4);

4600 rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));

4601 nextPage = get4byte(aWrite);

4602 memcpy(aWrite, aSave, 4);

4603 }else

4604 #endif

4605

4606 {

4607 DbPage *pDbPage;

4608 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,

4609 ((eOp&0x01)==0 ? PAGER_GET_READONLY : 0)

4610 );

4611 if( rc==SQLITE_OK ){

4612 aPayload = sqlite3PagerGetData(pDbPage);

4613 nextPage = get4byte(aPayload);

4614 rc = copyPayload(&aPayload[offset+4], pBuf, a, (eOp&0x01), pDbPage);

4615 sqlite3PagerUnref(pDbPage);

4616 offset = 0;

4617 }

4618 }

4619 amt -= a;

4620 pBuf += a;

4621 }

4622 }

4623 }

4624

4625 if( rc==SQLITE_OK && amt>0 ){

4626 return SQLITE_CORRUPT_BKPT;

4627 }

4628 return rc;

4629 }

4630

4631 /*

4632 ** Read part of the key associated with cursor pCur. Exactly

4633 ** "amt" bytes will be transferred into pBuf[]. The transfer

4634 ** begins at "offset".

4635 **

4636 ** The caller must ensure that pCur is pointing to a valid row

4637 ** in the table.

4638 **

4639 ** Return SQLITE_OK on success or an error code if anything goes

4640 ** wrong. An error is returned if "offset+amt" is larger than

4641 ** the available payload.

4642 */

4643 int sqlite3BtreeKey(BtCursor pCur, u32 offset, u32 amt, void pBuf){

4644 assert( cursorHoldsMutex(pCur) );

4645 assert( pCur->eState==CURSOR_VALID );

4646 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );

4647 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

4648 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);

4649 }

4650

4651 /*

4652 ** Read part of the data associated with cursor pCur. Exactly

4653 ** "amt" bytes will be transfered into pBuf[]. The transfer

4654 ** begins at "offset".

4655 **

4656 ** Return SQLITE_OK on success or an error code if anything goes

4657 ** wrong. An error is returned if "offset+amt" is larger than

4658 ** the available payload.

4659 */

4660 int sqlite3BtreeData(BtCursor pCur, u32 offset, u32 amt, void pBuf){

4661 int rc;

4662

4663 #ifndef SQLITE_OMIT_INCRBLOB

4664 if ( pCur->eState==CURSOR_INVALID ){

4665 return SQLITE_ABORT;

4666 }

4667 #endif

4668

4669 assert( cursorHoldsMutex(pCur) );

4670 rc = restoreCursorPosition(pCur);

4671 if( rc==SQLITE_OK ){

4672 assert( pCur->eState==CURSOR_VALID );

4673 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );

4674 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

4675 rc = accessPayload(pCur, offset, amt, pBuf, 0);

4676 }

4677 return rc;

4678 }

4679

4680 /*

4681 ** Return a pointer to payload information from the entry that the

4682 ** pCur cursor is pointing to. The pointer is to the beginning of

4683 ** the key if index btrees (pPage->intKey==0) and is the data for

4684 ** table btrees (pPage->intKey==1). The number of bytes of available

4685 ** key/data is written into pAmt. If pAmt==0, then the value

4686 ** returned will not be a valid pointer.

4687 **

4688 ** This routine is an optimization. It is common for the entire key

4689 ** and data to fit on the local page and for there to be no overflow

4690 ** pages. When that is so, this routine can be used to access the

4691 ** key and data without making a copy. If the key and/or data spills

4692 ** onto overflow pages, then accessPayload() must be used to reassemble

4693 ** the key/data and copy it into a preallocated buffer.

4694 **

4695 ** The pointer returned by this routine looks directly into the cached

4696 ** page of the database. The data might change or move the next time

4697 ** any btree routine is called.

4698 */

4699 static const void *fetchPayload(

4700 BtCursor pCur, / Cursor pointing to entry to read from */

4701 u32 pAmt / Write the number of available bytes here */

4702 ){

4703 u32 amt;

4704 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);

4705 assert( pCur->eState==CURSOR_VALID );

4706 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

4707 assert( cursorHoldsMutex(pCur) );

4708 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

4709 assert( pCur->info.nSize>0 );

4710 assert( pCur->info.pPayload>pCur->apPage[pCur->iPage]->aData \|\| CORRUPT_DB );

4711 assert( pCur->info.pPayload<pCur->apPage[pCur->iPage]->aDataEnd \|\|CORRUPT_DB);

4712 amt = (int)(pCur->apPage[pCur->iPage]->aDataEnd - pCur->info.pPayload);

4713 if( pCur->info.nLocal<amt ) amt = pCur->info.nLocal;

4714 *pAmt = amt;

4715 return (void*)pCur->info.pPayload;

4716 }

4717

4718

4719 /*

4720 ** For the entry that cursor pCur is point to, return as

4721 ** many bytes of the key or data as are available on the local

4722 ** b-tree page. Write the number of available bytes into *pAmt.

4723 **

4724 ** The pointer returned is ephemeral. The key/data may move

4725 ** or be destroyed on the next call to any Btree routine,

4726 ** including calls from other threads against the same cache.

4727 ** Hence, a mutex on the BtShared should be held prior to calling

4728 ** this routine.

4729 **

4730 ** These routines is used to get quick access to key and data

4731 ** in the common case where no overflow pages are used.

4732 */

4733 const void sqlite3BtreeKeyFetch(BtCursor pCur, u32 *pAmt){

4734 return fetchPayload(pCur, pAmt);

4735 }

4736 const void sqlite3BtreeDataFetch(BtCursor pCur, u32 *pAmt){

4737 return fetchPayload(pCur, pAmt);

4738 }

4739

4740

4741 /*

4742 ** Move the cursor down to a new child page. The newPgno argument is the

4743 ** page number of the child page to move to.

4744 **

4745 ** This function returns SQLITE_CORRUPT if the page-header flags field of

4746 ** the new child page does not match the flags field of the parent (i.e.

4747 ** if an intkey page appears to be the parent of a non-intkey page, or

4748 ** vice-versa).

4749 */

4750 static int moveToChild(BtCursor *pCur, u32 newPgno){

4751 BtShared *pBt = pCur->pBt;

4752

4753 assert( cursorHoldsMutex(pCur) );

4754 assert( pCur->eState==CURSOR_VALID );

4755 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );

4756 assert( pCur->iPage>=0 );

4757 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){

4758 return SQLITE_CORRUPT_BKPT;

4759 }

4760 pCur->info.nSize = 0;

4761 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

4762 pCur->iPage++;

4763 pCur->aiIdx[pCur->iPage] = 0;

4764 return getAndInitPage(pBt, newPgno, &pCur->apPage[pCur->iPage],

4765 pCur, pCur->curPagerFlags);

4766 }

4767

#if SQLITE_DEBUG
/*
** Debug-only sanity check.  pParent is an internal (non-leaf) tree page.
** Assert that iChild is the left-child page number stored in cell iIdx
** of pParent or, when iIdx equals the number of cells on pParent, that
** iChild is the right-child pointer stored in the page header.
**
** Nothing is checked when CORRUPT_DB is true, since the conditions
** might legitimately fail in a corrupt database.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  Pgno expectedChild;        /* Child page number recorded in pParent */

  if( CORRUPT_DB ) return;
  assert( iIdx<=pParent->nCell );
  expectedChild = (iIdx==pParent->nCell)
      ? get4byte(&pParent->aData[pParent->hdrOffset+8])
      : get4byte(findCell(pParent, iIdx));
  assert( expectedChild==iChild );
}
#else
#  define assertParentIndex(x,y,z)
#endif

4789

4790 /*

4791 ** Move the cursor up to the parent page.

4792 **

4793 ** pCur->idx is set to the cell index that contains the pointer

4794 ** to the page we are coming from. If we are coming from the

4795 ** right-most child page then pCur->idx is set to one more than

4796 ** the largest cell index.

4797 */

4798 static void moveToParent(BtCursor *pCur){

4799 assert( cursorHoldsMutex(pCur) );

4800 assert( pCur->eState==CURSOR_VALID );

4801 assert( pCur->iPage>0 );

4802 assert( pCur->apPage[pCur->iPage] );

4803 assertParentIndex(

4804 pCur->apPage[pCur->iPage-1],

4805 pCur->aiIdx[pCur->iPage-1],

4806 pCur->apPage[pCur->iPage]->pgno

4807 );

4808 testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );

4809 pCur->info.nSize = 0;

4810 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

4811 releasePageNotNull(pCur->apPage[pCur->iPage--]);

4812 }

4813

4814 /*

4815 ** Move the cursor to point to the root page of its b-tree structure.

4816 **

4817 ** If the table has a virtual root page, then the cursor is moved to point

4818 ** to the virtual root page instead of the actual root page. A table has a

4819 ** virtual root page when the actual root page contains no cells and a

4820 ** single child page. This can only happen with the table rooted at page 1.

4821 **

4822 ** If the b-tree structure is empty, the cursor state is set to

4823 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first

4824 ** cell located on the root (or virtual root) page and the cursor state

4825 ** is set to CURSOR_VALID.

4826 **

4827 ** If this function returns successfully, it may be assumed that the

4828 ** page-header flags indicate that the [virtual] root-page is the expected

4829 ** kind of b-tree page (i.e. if when opening the cursor the caller did not

4830 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,

4831 ** indicating a table b-tree, or if the caller did specify a KeyInfo

4832 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index

4833 ** b-tree).

4834 */

4835 static int moveToRoot(BtCursor *pCur){

4836 MemPage *pRoot;

4837 int rc = SQLITE_OK;

4838

4839 assert( cursorHoldsMutex(pCur) );

4840 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );

4841 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );

4842 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );

4843 if( pCur->eState>=CURSOR_REQUIRESEEK ){

4844 if( pCur->eState==CURSOR_FAULT ){

4845 assert( pCur->skipNext!=SQLITE_OK );

4846 return pCur->skipNext;

4847 }

4848 sqlite3BtreeClearCursor(pCur);

4849 }

4850

4851 if( pCur->iPage>=0 ){

4852 while( pCur->iPage ){

4853 assert( pCur->apPage[pCur->iPage]!=0 );

4854 releasePageNotNull(pCur->apPage[pCur->iPage--]);

4855 }

4856 }else if( pCur->pgnoRoot==0 ){

4857 pCur->eState = CURSOR_INVALID;

4858 return SQLITE_OK;

4859 }else{

4860 assert( pCur->iPage==(-1) );

4861 rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->apPage[0],

4862 0, pCur->curPagerFlags);

4863 if( rc!=SQLITE_OK ){

4864 pCur->eState = CURSOR_INVALID;

4865 return rc;

4866 }

4867 pCur->iPage = 0;

4868 pCur->curIntKey = pCur->apPage[0]->intKey;

4869 }

4870 pRoot = pCur->apPage[0];

4871 assert( pRoot->pgno==pCur->pgnoRoot );

4872

4873 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor

4874 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is

4875 ** NULL, the caller expects a table b-tree. If this is not the case,

4876 ** return an SQLITE_CORRUPT error.

4877 **

4878 ** Earlier versions of SQLite assumed that this test could not fail

4879 ** if the root page was already loaded when this function was called (i.e.

4880 ** if pCur->iPage>=0). But this is not so if the database is corrupted

4881 ** in such a way that page pRoot is linked into a second b-tree table

4882 ** (or the freelist). */

4883 assert( pRoot->intKey==1 \|\| pRoot->intKey==0 );

4884 if( pRoot->isInit==0 \|\| (pCur->pKeyInfo==0)!=pRoot->intKey ){

4885 return SQLITE_CORRUPT_BKPT;

4886 }

4887

4888 pCur->aiIdx[0] = 0;

4889 pCur->info.nSize = 0;

4890 pCur->curFlags &= ~(BTCF_AtLast\|BTCF_ValidNKey\|BTCF_ValidOvfl);

4891

4892 if( pRoot->nCell>0 ){

4893 pCur->eState = CURSOR_VALID;

4894 }else if( !pRoot->leaf ){

4895 Pgno subpage;

4896 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;

4897 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);

4898 pCur->eState = CURSOR_VALID;

4899 rc = moveToChild(pCur, subpage);

4900 }else{

4901 pCur->eState = CURSOR_INVALID;

4902 }

4903 return rc;

4904 }

4905

4906 /*

4907 ** Move the cursor down to the left-most leaf entry beneath the

4908 ** entry to which it is currently pointing.

4909 **

4910 ** The left-most leaf is the one with the smallest key - the first

4911 ** in ascending order.

4912 */

4913 static int moveToLeftmost(BtCursor *pCur){

4914 Pgno pgno;

4915 int rc = SQLITE_OK;

4916 MemPage *pPage;

4917

4918 assert( cursorHoldsMutex(pCur) );

4919 assert( pCur->eState==CURSOR_VALID );

4920 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){

4921 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );

4922 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));

4923 rc = moveToChild(pCur, pgno);

4924 }

4925 return rc;

4926 }

4927

4928 /*

4929 ** Move the cursor down to the right-most leaf entry beneath the

4930 ** page to which it is currently pointing. Notice the difference

4931 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()

4932 ** finds the left-most entry beneath the entry whereas moveToRightmost()

4933 ** finds the right-most entry beneath the page.

4934 **

4935 ** The right-most entry is the one with the largest key - the last

4936 ** key in ascending order.

4937 */

4938 static int moveToRightmost(BtCursor *pCur){

4939 Pgno pgno;

4940 int rc = SQLITE_OK;

4941 MemPage *pPage = 0;

4942

4943 assert( cursorHoldsMutex(pCur) );

4944 assert( pCur->eState==CURSOR_VALID );

4945 while( !(pPage = pCur->apPage[pCur->iPage])->leaf ){

4946 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);

4947 pCur->aiIdx[pCur->iPage] = pPage->nCell;

4948 rc = moveToChild(pCur, pgno);

4949 if( rc ) return rc;

4950 }

4951 pCur->aiIdx[pCur->iPage] = pPage->nCell-1;

4952 assert( pCur->info.nSize==0 );

4953 assert( (pCur->curFlags & BTCF_ValidNKey)==0 );

4954 return SQLITE_OK;

4955 }

4956

4957 /* Move the cursor to the first entry in the table. Return SQLITE_OK

4958 ** on success. Set *pRes to 0 if the cursor actually points to something

4959 ** or set *pRes to 1 if the table is empty.

4960 */

4961 int sqlite3BtreeFirst(BtCursor pCur, int pRes){

4962 int rc;

4963

4964 assert( cursorHoldsMutex(pCur) );

4965 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

4966 rc = moveToRoot(pCur);

4967 if( rc==SQLITE_OK ){

4968 if( pCur->eState==CURSOR_INVALID ){

4969 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->nCell==0 );

4970 *pRes = 1;

4971 }else{

4972 assert( pCur->apPage[pCur->iPage]->nCell>0 );

4973 *pRes = 0;

4974 rc = moveToLeftmost(pCur);

4975 }

4976 }

4977 return rc;

4978 }

4979

4980 /* Move the cursor to the last entry in the table. Return SQLITE_OK

4981 ** on success. Set *pRes to 0 if the cursor actually points to something

4982 ** or set *pRes to 1 if the table is empty.

4983 */

4984 int sqlite3BtreeLast(BtCursor pCur, int pRes){

4985 int rc;

4986

4987 assert( cursorHoldsMutex(pCur) );

4988 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

4989

4990 /* If the cursor already points to the last entry, this is a no-op. */

4991 if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){

4992 #ifdef SQLITE_DEBUG

4993 /* This block serves to assert() that the cursor really does point

4994 ** to the last entry in the b-tree. */

4995 int ii;

4996 for(ii=0; ii<pCur->iPage; ii++){

4997 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );

4998 }

4999 assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );

5000 assert( pCur->apPage[pCur->iPage]->leaf );

5001 #endif

5002 return SQLITE_OK;

5003 }

5004

5005 rc = moveToRoot(pCur);

5006 if( rc==SQLITE_OK ){

5007 if( CURSOR_INVALID==pCur->eState ){

5008 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->nCell==0 );

5009 *pRes = 1;

5010 }else{

5011 assert( pCur->eState==CURSOR_VALID );

5012 *pRes = 0;

5013 rc = moveToRightmost(pCur);

5014 if( rc==SQLITE_OK ){

5015 pCur->curFlags \|= BTCF_AtLast;

5016 }else{

5017 pCur->curFlags &= ~BTCF_AtLast;

5018 }

5019

5020 }

5021 }

5022 return rc;

5023 }

5024

5025 /* Move the cursor so that it points to an entry near the key

5026 ** specified by pIdxKey or intKey. Return a success code.

5027 **

5028 ** For INTKEY tables, the intKey parameter is used. pIdxKey

5029 ** must be NULL. For index tables, pIdxKey is used and intKey

5030 ** is ignored.

5031 **

5032 ** If an exact match is not found, then the cursor is always

5033 ** left pointing at a leaf page which would hold the entry if it

5034 ** were present. The cursor might point to an entry that comes

5035 ** before or after the key.

5036 **

5037 ** An integer is written into *pRes which is the result of

5038 ** comparing the key with the entry to which the cursor is

5039 ** pointing. The meaning of the integer written into

5040 ** *pRes is as follows:

5041 **

5042 ** *pRes<0 The cursor is left pointing at an entry that

5043 ** is smaller than intKey/pIdxKey or if the table is empty

5044 ** and the cursor is therefore left point to nothing.

5045 **

5046 ** *pRes==0 The cursor is left pointing at an entry that

5047 ** exactly matches intKey/pIdxKey.

5048 **

5049 ** *pRes>0 The cursor is left pointing at an entry that

5050 ** is larger than intKey/pIdxKey.

5051 **

5052 ** For index tables, the pIdxKey->eqSeen field is set to 1 if there

5053 ** exists an entry in the table that exactly matches pIdxKey.

5054 */

5055 int sqlite3BtreeMovetoUnpacked(

5056 BtCursor pCur, / The cursor to be moved */

5057 UnpackedRecord pIdxKey, / Unpacked index key */

5058 i64 intKey, /* The table key */

5059 int biasRight, /* If true, bias the search to the high end */

5060 int pRes / Write search results here */

5061 ){

5062 int rc;

5063 RecordCompare xRecordCompare;

5064

5065 assert( cursorHoldsMutex(pCur) );

5066 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );

5067 assert( pRes );

5068 assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );

5069

5070 /* If the cursor is already positioned at the point we are trying

5071 ** to move to, then just return without doing any work */

5072 if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0

5073 && pCur->curIntKey

5074 ){

5075 if( pCur->info.nKey==intKey ){

5076 *pRes = 0;

5077 return SQLITE_OK;

5078 }

5079 if( (pCur->curFlags & BTCF_AtLast)!=0 && pCur->info.nKey<intKey ){

5080 *pRes = -1;

5081 return SQLITE_OK;

5082 }

5083 }

5084

5085 if( pIdxKey ){

5086 xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);

5087 pIdxKey->errCode = 0;

5088 assert( pIdxKey->default_rc==1

5089 \|\| pIdxKey->default_rc==0

5090 \|\| pIdxKey->default_rc==-1

5091 );

5092 }else{

5093 xRecordCompare = 0; /* All keys are integers */

5094 }

5095

5096 rc = moveToRoot(pCur);

5097 if( rc ){

5098 return rc;

5099 }

5100 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage] );

5101 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->isInit );

5102 assert( pCur->eState==CURSOR_INVALID \|\| pCur->apPage[pCur->iPage]->nCell>0 );

5103 if( pCur->eState==CURSOR_INVALID ){

5104 *pRes = -1;

5105 assert( pCur->pgnoRoot==0 \|\| pCur->apPage[pCur->iPage]->nCell==0 );

5106 return SQLITE_OK;

5107 }

5108 assert( pCur->apPage[0]->intKey==pCur->curIntKey );

5109 assert( pCur->curIntKey \|\| pIdxKey );

5110 for(;;){

5111 int lwr, upr, idx, c;

5112 Pgno chldPg;

5113 MemPage *pPage = pCur->apPage[pCur->iPage];

5114 u8 pCell; / Pointer to current cell in pPage */

5115

5116 /* pPage->nCell must be greater than zero. If this is the root-page

5117 ** the cursor would have been INVALID above and this for(;;) loop

5118 ** not run. If this is not the root-page, then the moveToChild() routine

5119 ** would have already detected db corruption. Similarly, pPage must

5120 ** be the right kind (index or table) of b-tree page. Otherwise

5121 ** a moveToChild() or moveToRoot() call would have detected corruption. */

5122 assert( pPage->nCell>0 );

5123 assert( pPage->intKey==(pIdxKey==0) );

5124 lwr = 0;

5125 upr = pPage->nCell-1;

5126 assert( biasRight==0 \|\| biasRight==1 );

5127 idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */

5128 pCur->aiIdx[pCur->iPage] = (u16)idx;

5129 if( xRecordCompare==0 ){

5130 for(;;){

5131 i64 nCellKey;

5132 pCell = findCellPastPtr(pPage, idx);

5133 if( pPage->intKeyLeaf ){

5134 while( 0x80 <= *(pCell++) ){

5135 if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;

5136 }

5137 }

5138 getVarint(pCell, (u64*)&nCellKey);

5139 if( nCellKey<intKey ){

5140 lwr = idx+1;

5141 if( lwr>upr ){ c = -1; break; }

5142 }else if( nCellKey>intKey ){

5143 upr = idx-1;

5144 if( lwr>upr ){ c = +1; break; }

5145 }else{

5146 assert( nCellKey==intKey );

5147 pCur->curFlags \|= BTCF_ValidNKey;

5148 pCur->info.nKey = nCellKey;

5149 pCur->aiIdx[pCur->iPage] = (u16)idx;

5150 if( !pPage->leaf ){

5151 lwr = idx;

5152 goto moveto_next_layer;

5153 }else{

5154 *pRes = 0;

5155 rc = SQLITE_OK;

5156 goto moveto_finish;

5157 }

5158 }

5159 assert( lwr+upr>=0 );

5160 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */

5161 }

5162 }else{

5163 for(;;){

5164 int nCell; /* Size of the pCell cell in bytes */

5165 pCell = findCellPastPtr(pPage, idx);

5166

5167 /* The maximum supported page-size is 65536 bytes. This means that

5168 ** the maximum number of record bytes stored on an index B-Tree

5169 ** page is less than 16384 bytes and may be stored as a 2-byte

5170 ** varint. This information is used to attempt to avoid parsing

5171 ** the entire cell by checking for the cases where the record is

5172 ** stored entirely within the b-tree page by inspecting the first

5173 ** 2 bytes of the cell.

5174 */

5175 nCell = pCell[0];

5176 if( nCell<=pPage->max1bytePayload ){

5177 /* This branch runs if the record-size field of the cell is a

5178 ** single byte varint and the record fits entirely on the main

5179 ** b-tree page. */

5180 testcase( pCell+nCell+1==pPage->aDataEnd );

5181 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);

5182 }else if( !(pCell[1] & 0x80)

5183 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal

5184 ){

5185 /* The record-size field is a 2 byte varint and the record

5186 ** fits entirely on the main b-tree page. */

5187 testcase( pCell+nCell+2==pPage->aDataEnd );

5188 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);

5189 }else{

5190 /* The record flows over onto one or more overflow pages. In

5191 ** this case the whole cell needs to be parsed, a buffer allocated

5192 ** and accessPayload() used to retrieve the record into the

5193 ** buffer before VdbeRecordCompare() can be called.

5194 **

5195 ** If the record is corrupt, the xRecordCompare routine may read

5196 ** up to two varints past the end of the buffer. An extra 18

5197 ** bytes of padding is allocated at the end of the buffer in

5198 ** case this happens. */

5199 void *pCellKey;

5200 u8 * const pCellBody = pCell - pPage->childPtrSize;

5201 pPage->xParseCell(pPage, pCellBody, &pCur->info);

5202 nCell = (int)pCur->info.nKey;

5203 testcase( nCell<0 ); /* True if key size is 2^32 or more */

5204 testcase( nCell==0 ); /* Invalid key size: 0x80 0x80 0x00 */

5205 testcase( nCell==1 ); /* Invalid key size: 0x80 0x80 0x01 */

5206 testcase( nCell==2 ); /* Minimum legal index key size */

5207 if( nCell<2 ){

5208 rc = SQLITE_CORRUPT_BKPT;

5209 goto moveto_finish;

5210 }

5211 pCellKey = sqlite3Malloc( nCell+18 );

5212 if( pCellKey==0 ){

5213 rc = SQLITE_NOMEM;

5214 goto moveto_finish;

5215 }

5216 pCur->aiIdx[pCur->iPage] = (u16)idx;

5217 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 2);

5218 if( rc ){

5219 sqlite3_free(pCellKey);

5220 goto moveto_finish;

5221 }

5222 c = xRecordCompare(nCell, pCellKey, pIdxKey);

5223 sqlite3_free(pCellKey);

5224 }

5225 assert(

5226 (pIdxKey->errCode!=SQLITE_CORRUPT \|\| c==0)

5227 && (pIdxKey->errCode!=SQLITE_NOMEM \|\| pCur->pBtree->db->mallocFailed)

5228 );

5229 if( c<0 ){

5230 lwr = idx+1;

5231 }else if( c>0 ){

5232 upr = idx-1;

5233 }else{

5234 assert( c==0 );

5235 *pRes = 0;

5236 rc = SQLITE_OK;

5237 pCur->aiIdx[pCur->iPage] = (u16)idx;

5238 if( pIdxKey->errCode ) rc = SQLITE_CORRUPT;

5239 goto moveto_finish;

5240 }

5241 if( lwr>upr ) break;

5242 assert( lwr+upr>=0 );

5243 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */

5244 }

5245 }

5246 assert( lwr==upr+1 \|\| (pPage->intKey && !pPage->leaf) );

5247 assert( pPage->isInit );

5248 if( pPage->leaf ){

5249 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

5250 pCur->aiIdx[pCur->iPage] = (u16)idx;

5251 *pRes = c;

5252 rc = SQLITE_OK;

5253 goto moveto_finish;

5254 }

5255 moveto_next_layer:

5256 if( lwr>=pPage->nCell ){

5257 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);

5258 }else{

5259 chldPg = get4byte(findCell(pPage, lwr));

5260 }

5261 pCur->aiIdx[pCur->iPage] = (u16)lwr;

5262 rc = moveToChild(pCur, chldPg);

5263 if( rc ) break;

5264 }

5265 moveto_finish:

5266 pCur->info.nSize = 0;

5267 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

5268 return rc;

5269 }

5270

5271

5272 /*

5273 ** Return TRUE if the cursor is not pointing at an entry of the table.

5274 **

5275 ** TRUE will be returned after a call to sqlite3BtreeNext() moves

5276 ** past the last entry in the table or sqlite3BtreePrev() moves past

5277 ** the first entry. TRUE is also returned if the table is empty.

5278 */

5279 int sqlite3BtreeEof(BtCursor *pCur){

5280 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries

5281 ** have been deleted? This API will need to change to return an error code

5282 ** as well as the boolean result value.

5283 */

5284 return (CURSOR_VALID!=pCur->eState);

5285 }

5286

5287 /*

5288 ** Advance the cursor to the next entry in the database. If

5289 ** successful then set *pRes=0. If the cursor

5290 ** was already pointing to the last entry in the database before

5291 ** this routine was called, then set *pRes=1.

5292 **

5293 ** The main entry point is sqlite3BtreeNext(). That routine is optimized

5294 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx

5295 ** to the next cell on the current page. The (slower) btreeNext() helper

5296 ** routine is called when it is necessary to move to a different page or

5297 ** to restore the cursor.

5298 **

5299 ** The calling function will set pRes to 0 or 1. The initial pRes value

5300 ** will be 1 if the cursor being stepped corresponds to an SQL index and

5301 ** if this routine could have been skipped if that SQL index had been

5302 ** a unique index. Otherwise the caller will have set *pRes to zero.

5303 ** Zero is the common case. The btree implementation is free to use the

5304 ** initial *pRes value as a hint to improve performance, but the current

5305 ** SQLite btree implementation does not. (Note that the comdb2 btree

5306 ** implementation does use this hint, however.)

5307 */

5308 static SQLITE_NOINLINE int btreeNext(BtCursor pCur, int pRes){

5309 int rc;

5310 int idx;

5311 MemPage *pPage;

5312

5313 assert( cursorHoldsMutex(pCur) );

5314 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

5315 assert( *pRes==0 );

5316 if( pCur->eState!=CURSOR_VALID ){

5317 assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );

5318 rc = restoreCursorPosition(pCur);

5319 if( rc!=SQLITE_OK ){

5320 return rc;

5321 }

5322 if( CURSOR_INVALID==pCur->eState ){

5323 *pRes = 1;

5324 return SQLITE_OK;

5325 }

5326 if( pCur->skipNext ){

5327 assert( pCur->eState==CURSOR_VALID \|\| pCur->eState==CURSOR_SKIPNEXT );

5328 pCur->eState = CURSOR_VALID;

5329 if( pCur->skipNext>0 ){

5330 pCur->skipNext = 0;

5331 return SQLITE_OK;

5332 }

5333 pCur->skipNext = 0;

5334 }

5335 }

5336

5337 pPage = pCur->apPage[pCur->iPage];

5338 idx = ++pCur->aiIdx[pCur->iPage];

5339 assert( pPage->isInit );

5340

5341 /* If the database file is corrupt, it is possible for the value of idx

5342 ** to be invalid here. This can only occur if a second cursor modifies

5343 ** the page while cursor pCur is holding a reference to it. Which can

5344 ** only happen if the database is corrupt in such a way as to link the

5345 ** page into more than one b-tree structure. */

5346 testcase( idx>pPage->nCell );

5347

5348 if( idx>=pPage->nCell ){

5349 if( !pPage->leaf ){

5350 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));

5351 if( rc ) return rc;

5352 return moveToLeftmost(pCur);

5353 }

5354 do{

5355 if( pCur->iPage==0 ){

5356 *pRes = 1;

5357 pCur->eState = CURSOR_INVALID;

5358 return SQLITE_OK;

5359 }

5360 moveToParent(pCur);

5361 pPage = pCur->apPage[pCur->iPage];

5362 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );

5363 if( pPage->intKey ){

5364 return sqlite3BtreeNext(pCur, pRes);

5365 }else{

5366 return SQLITE_OK;

5367 }

5368 }

5369 if( pPage->leaf ){

5370 return SQLITE_OK;

5371 }else{

5372 return moveToLeftmost(pCur);

5373 }

5374 }

5375 int sqlite3BtreeNext(BtCursor pCur, int pRes){

5376 MemPage *pPage;

5377 assert( cursorHoldsMutex(pCur) );

5378 assert( pRes!=0 );

5379 assert( pRes==0 \|\| pRes==1 );

5380 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

5381 pCur->info.nSize = 0;

5382 pCur->curFlags &= ~(BTCF_ValidNKey\|BTCF_ValidOvfl);

5383 *pRes = 0;

5384 if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);

5385 pPage = pCur->apPage[pCur->iPage];

5386 if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){

5387 pCur->aiIdx[pCur->iPage]--;

5388 return btreeNext(pCur, pRes);

5389 }

5390 if( pPage->leaf ){

5391 return SQLITE_OK;

5392 }else{

5393 return moveToLeftmost(pCur);

5394 }

5395 }

5396

5397 /*

5398 ** Step the cursor to the back to the previous entry in the database. If

5399 ** successful then set *pRes=0. If the cursor

5400 ** was already pointing to the first entry in the database before

5401 ** this routine was called, then set *pRes=1.

5402 **

5403 ** The main entry point is sqlite3BtreePrevious(). That routine is optimized

5404 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx

5405 ** to the previous cell on the current page. The (slower) btreePrevious()

5406 ** helper routine is called when it is necessary to move to a different page

5407 ** or to restore the cursor.

5408 **

5409 ** The calling function will set pRes to 0 or 1. The initial pRes value

5410 ** will be 1 if the cursor being stepped corresponds to an SQL index and

5411 ** if this routine could have been skipped if that SQL index had been

5412 ** a unique index. Otherwise the caller will have set *pRes to zero.

5413 ** Zero is the common case. The btree implementation is free to use the

5414 ** initial *pRes value as a hint to improve performance, but the current

5415 ** SQLite btree implementation does not. (Note that the comdb2 btree

5416 ** implementation does use this hint, however.)

5417 */

5418 static SQLITE_NOINLINE int btreePrevious(BtCursor pCur, int pRes){

5419 int rc;

5420 MemPage *pPage;

5421

5422 assert( cursorHoldsMutex(pCur) );

5423 assert( pRes!=0 );

5424 assert( *pRes==0 );

5425 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

5426 assert( (pCur->curFlags & (BTCF_AtLast\|BTCF_ValidOvfl\|BTCF_ValidNKey))==0 );

5427 assert( pCur->info.nSize==0 );

5428 if( pCur->eState!=CURSOR_VALID ){

5429 rc = restoreCursorPosition(pCur);

5430 if( rc!=SQLITE_OK ){

5431 return rc;

5432 }

5433 if( CURSOR_INVALID==pCur->eState ){

5434 *pRes = 1;

5435 return SQLITE_OK;

5436 }

5437 if( pCur->skipNext ){

5438 assert( pCur->eState==CURSOR_VALID \|\| pCur->eState==CURSOR_SKIPNEXT );

5439 pCur->eState = CURSOR_VALID;

5440 if( pCur->skipNext<0 ){

5441 pCur->skipNext = 0;

5442 return SQLITE_OK;

5443 }

5444 pCur->skipNext = 0;

5445 }

5446 }

5447

5448 pPage = pCur->apPage[pCur->iPage];

5449 assert( pPage->isInit );

5450 if( !pPage->leaf ){

5451 int idx = pCur->aiIdx[pCur->iPage];

5452 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));

5453 if( rc ) return rc;

5454 rc = moveToRightmost(pCur);

5455 }else{

5456 while( pCur->aiIdx[pCur->iPage]==0 ){

5457 if( pCur->iPage==0 ){

5458 pCur->eState = CURSOR_INVALID;

5459 *pRes = 1;

5460 return SQLITE_OK;

5461 }

5462 moveToParent(pCur);

5463 }

5464 assert( pCur->info.nSize==0 );

5465 assert( (pCur->curFlags & (BTCF_ValidNKey\|BTCF_ValidOvfl))==0 );

5466

5467 pCur->aiIdx[pCur->iPage]--;

5468 pPage = pCur->apPage[pCur->iPage];

5469 if( pPage->intKey && !pPage->leaf ){

5470 rc = sqlite3BtreePrevious(pCur, pRes);

5471 }else{

5472 rc = SQLITE_OK;

5473 }

5474 }

5475 return rc;

5476 }

5477 int sqlite3BtreePrevious(BtCursor pCur, int pRes){

5478 assert( cursorHoldsMutex(pCur) );

5479 assert( pRes!=0 );

5480 assert( pRes==0 \|\| pRes==1 );

5481 assert( pCur->skipNext==0 \|\| pCur->eState!=CURSOR_VALID );

5482 *pRes = 0;

5483 pCur->curFlags &= ~(BTCF_AtLast\|BTCF_ValidOvfl\|BTCF_ValidNKey);

5484 pCur->info.nSize = 0;

5485 if( pCur->eState!=CURSOR_VALID

5486 \|\| pCur->aiIdx[pCur->iPage]==0

5487 \|\| pCur->apPage[pCur->iPage]->leaf==0

5488 ){

5489 return btreePrevious(pCur, pRes);

5490 }

5491 pCur->aiIdx[pCur->iPage]--;

5492 return SQLITE_OK;

5493 }

5494

5495 /*

5496 ** Allocate a new page from the database file.

5497 **

5498 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()

5499 ** has already been called on the new page.) The new page has also

5500 ** been referenced and the calling routine is responsible for calling

5501 ** sqlite3PagerUnref() on the new page when it is done.

5502 **

5503 ** SQLITE_OK is returned on success. Any other return value indicates

5504 ** an error. *ppPage is set to NULL in the event of an error.

5505 **

5506 ** If the "nearby" parameter is not 0, then an effort is made to

5507 ** locate a page close to the page number "nearby". This can be used in an

5508 ** attempt to keep related pages close to each other in the database file,

5509 ** which in turn can make database access faster.

5510 **

5511 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists

5512 ** anywhere on the free-list, then it is guaranteed to be returned. If

5513 ** eMode is BTALLOC_LT then the page returned will be less than or equal

5514 ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there

5515 ** are no restrictions on which page is returned.

5516 */

5517 static int allocateBtreePage(

5518 BtShared pBt, / The btree */

5519 MemPage *ppPage, / Store pointer to the allocated page here */

5520 Pgno pPgno, / Store the page number here */

5521 Pgno nearby, /* Search for a page near this one */

5522 u8 eMode /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */

5523 ){

5524 MemPage *pPage1;

5525 int rc;

5526 u32 n; /* Number of pages on the freelist */

5527 u32 k; /* Number of leaves on the trunk of the freelist */

5528 MemPage *pTrunk = 0;

5529 MemPage *pPrevTrunk = 0;

5530 Pgno mxPage; /* Total size of the database file */

5531

5532 assert( sqlite3_mutex_held(pBt->mutex) );

5533 assert( eMode==BTALLOC_ANY \|\| (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );

5534 pPage1 = pBt->pPage1;

5535 mxPage = btreePagecount(pBt);

5536 /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36

5537 ** stores stores the total number of pages on the freelist. */

5538 n = get4byte(&pPage1->aData[36]);

5539 testcase( n==mxPage-1 );

5540 if( n>=mxPage ){

5541 return SQLITE_CORRUPT_BKPT;

5542 }

5543 if( n>0 ){

5544 /* There are pages on the freelist. Reuse one of those pages. */

5545 Pgno iTrunk;

5546 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */

5547 u32 nSearch = 0; /* Count of the number of search attempts */

5548

5549 /* If eMode==BTALLOC_EXACT and a query of the pointer-map

5550 ** shows that the page 'nearby' is somewhere on the free-list, then

5551 ** the entire-list will be searched for that page.

5552 */

5553 #ifndef SQLITE_OMIT_AUTOVACUUM

5554 if( eMode==BTALLOC_EXACT ){

5555 if( nearby<=mxPage ){

5556 u8 eType;

5557 assert( nearby>0 );

5558 assert( pBt->autoVacuum );

5559 rc = ptrmapGet(pBt, nearby, &eType, 0);

5560 if( rc ) return rc;

5561 if( eType==PTRMAP_FREEPAGE ){

5562 searchList = 1;

5563 }

5564 }

5565 }else if( eMode==BTALLOC_LE ){

5566 searchList = 1;

5567 }

5568 #endif

5569

5570 /* Decrement the free-list count by 1. Set iTrunk to the index of the

5571 ** first free-list trunk page. iPrevTrunk is initially 1.

5572 */

5573 rc = sqlite3PagerWrite(pPage1->pDbPage);

5574 if( rc ) return rc;

5575 put4byte(&pPage1->aData[36], n-1);

5576

5577 /* The code within this loop is run only once if the 'searchList' variable

5578 ** is not true. Otherwise, it runs once for each trunk-page on the

5579 ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)

5580 ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)

5581 */

5582 do {

5583 pPrevTrunk = pTrunk;

5584 if( pPrevTrunk ){

5585 /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page

5586 ** is the page number of the next freelist trunk page in the list or

5587 ** zero if this is the last freelist trunk page. */

5588 iTrunk = get4byte(&pPrevTrunk->aData[0]);

5589 }else{

5590 /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32

5591 ** stores the page number of the first page of the freelist, or zero if

5592 ** the freelist is empty. */

5593 iTrunk = get4byte(&pPage1->aData[32]);

5594 }

5595 testcase( iTrunk==mxPage );

5596 if( iTrunk>mxPage \|\| nSearch++ > n ){

5597 rc = SQLITE_CORRUPT_BKPT;

5598 }else{

5599 rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);

5600 }

5601 if( rc ){

5602 pTrunk = 0;

5603 goto end_allocate_page;

5604 }

5605 assert( pTrunk!=0 );

5606 assert( pTrunk->aData!=0 );

5607 /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page

5608 ** is the number of leaf page pointers to follow. */

5609 k = get4byte(&pTrunk->aData[4]);

5610 if( k==0 && !searchList ){

5611 /* The trunk has no leaves and the list is not being searched.

5612 ** So extract the trunk page itself and use it as the newly

5613 ** allocated page */

5614 assert( pPrevTrunk==0 );

5615 rc = sqlite3PagerWrite(pTrunk->pDbPage);

5616 if( rc ){

5617 goto end_allocate_page;

5618 }

5619 *pPgno = iTrunk;

5620 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);

5621 *ppPage = pTrunk;

5622 pTrunk = 0;

5623 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));

5624 }else if( k>(u32)(pBt->usableSize/4 - 2) ){

5625 /* Value of k is out of range. Database corruption */

5626 rc = SQLITE_CORRUPT_BKPT;

5627 goto end_allocate_page;

5628 #ifndef SQLITE_OMIT_AUTOVACUUM

5629 }else if( searchList

5630 && (nearby==iTrunk \|\| (iTrunk<nearby && eMode==BTALLOC_LE))

5631 ){

5632 /* The list is being searched and this trunk page is the page

5633 ** to allocate, regardless of whether it has leaves.

5634 */

5635 *pPgno = iTrunk;

5636 *ppPage = pTrunk;

5637 searchList = 0;

5638 rc = sqlite3PagerWrite(pTrunk->pDbPage);

5639 if( rc ){

5640 goto end_allocate_page;

5641 }

5642 if( k==0 ){

5643 if( !pPrevTrunk ){

5644 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);

5645 }else{

5646 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);

5647 if( rc!=SQLITE_OK ){

5648 goto end_allocate_page;

5649 }

5650 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);

5651 }

5652 }else{

5653 /* The trunk page is required by the caller but it contains

5654 ** pointers to free-list leaves. The first leaf becomes a trunk

5655 ** page in this case.

5656 */

5657 MemPage *pNewTrunk;

5658 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);

5659 if( iNewTrunk>mxPage ){

5660 rc = SQLITE_CORRUPT_BKPT;

5661 goto end_allocate_page;

5662 }

5663 testcase( iNewTrunk==mxPage );

5664 rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);

5665 if( rc!=SQLITE_OK ){

5666 goto end_allocate_page;

5667 }

5668 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);

5669 if( rc!=SQLITE_OK ){

5670 releasePage(pNewTrunk);

5671 goto end_allocate_page;

5672 }

5673 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);

5674 put4byte(&pNewTrunk->aData[4], k-1);

5675 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);

5676 releasePage(pNewTrunk);

5677 if( !pPrevTrunk ){

5678 assert( sqlite3PagerIswriteable(pPage1->pDbPage) );

5679 put4byte(&pPage1->aData[32], iNewTrunk);

5680 }else{

5681 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);

5682 if( rc ){

5683 goto end_allocate_page;

5684 }

5685 put4byte(&pPrevTrunk->aData[0], iNewTrunk);

5686 }

5687 }

5688 pTrunk = 0;

5689 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));

5690 #endif

5691 }else if( k>0 ){

5692 /* Extract a leaf from the trunk */

5693 u32 closest;

5694 Pgno iPage;

5695 unsigned char *aData = pTrunk->aData;

5696 if( nearby>0 ){

5697 u32 i;

5698 closest = 0;

5699 if( eMode==BTALLOC_LE ){

5700 for(i=0; i<k; i++){

5701 iPage = get4byte(&aData[8+i*4]);

5702 if( iPage<=nearby ){

5703 closest = i;

5704 break;

5705 }

5706 }

5707 }else{

5708 int dist;

5709 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);

5710 for(i=1; i<k; i++){

5711 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);

5712 if( d2<dist ){

5713 closest = i;

5714 dist = d2;

5715 }

5716 }

5717 }

5718 }else{

5719 closest = 0;

5720 }

5721

5722 iPage = get4byte(&aData[8+closest*4]);

5723 testcase( iPage==mxPage );

5724 if( iPage>mxPage ){

5725 rc = SQLITE_CORRUPT_BKPT;

5726 goto end_allocate_page;

5727 }

5728 testcase( iPage==mxPage );

5729 if( !searchList

5730 \|\| (iPage==nearby \|\| (iPage<nearby && eMode==BTALLOC_LE))

5731 ){

5732 int noContent;

5733 *pPgno = iPage;

5734 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"

5735 ": %d more free pages\n",

5736 *pPgno, closest+1, k, pTrunk->pgno, n-1));

5737 rc = sqlite3PagerWrite(pTrunk->pDbPage);

5738 if( rc ) goto end_allocate_page;

5739 if( closest<k-1 ){

5740 memcpy(&aData[8+closest4], &aData[4+k4], 4);

5741 }

5742 put4byte(&aData[4], k-1);

5743 noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;

5744 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);

5745 if( rc==SQLITE_OK ){

5746 rc = sqlite3PagerWrite((*ppPage)->pDbPage);

5747 if( rc!=SQLITE_OK ){

5748 releasePage(*ppPage);

5749 *ppPage = 0;

5750 }

5751 }

5752 searchList = 0;

5753 }

5754 }

5755 releasePage(pPrevTrunk);

5756 pPrevTrunk = 0;

5757 }while( searchList );

5758 }else{

5759 /* There are no pages on the freelist, so append a new page to the

5760 ** database image.

5761 **

5762 ** Normally, new pages allocated by this block can be requested from the

5763 ** pager layer with the 'no-content' flag set. This prevents the pager

5764 ** from trying to read the pages content from disk. However, if the

5765 ** current transaction has already run one or more incremental-vacuum

5766 ** steps, then the page we are about to allocate may contain content

5767 ** that is required in the event of a rollback. In this case, do

5768 ** not set the no-content flag. This causes the pager to load and journal

5769 ** the current page content before overwriting it.

5770 **

5771 ** Note that the pager will not actually attempt to load or journal

5772 ** content for any page that really does lie past the end of the database

5773 ** file on disk. So the effects of disabling the no-content optimization

5774 ** here are confined to those pages that lie between the end of the

5775 ** database image and the end of the database file.

5776 */

5777 int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;

5778

5779 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

5780 if( rc ) return rc;

5781 pBt->nPage++;

5782 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;

5783

5784 #ifndef SQLITE_OMIT_AUTOVACUUM

5785 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){

5786 /* If *pPgno refers to a pointer-map page, allocate two new pages

5787 ** at the end of the file instead of one. The first allocated page

5788 ** becomes a new pointer-map page, the second is used by the caller.

5789 */

5790 MemPage *pPg = 0;

5791 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));

5792 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );

5793 rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);

5794 if( rc==SQLITE_OK ){

5795 rc = sqlite3PagerWrite(pPg->pDbPage);

5796 releasePage(pPg);

5797 }

5798 if( rc ) return rc;

5799 pBt->nPage++;

5800 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }

5801 }

5802 #endif

5803 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);

5804 *pPgno = pBt->nPage;

5805

5806 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );

5807 rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);

5808 if( rc ) return rc;

5809 rc = sqlite3PagerWrite((*ppPage)->pDbPage);

5810 if( rc!=SQLITE_OK ){

5811 releasePage(*ppPage);

5812 *ppPage = 0;

5813 }

5814 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));

5815 }

5816

5817 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );

5818

5819 end_allocate_page:

5820 releasePage(pTrunk);

5821 releasePage(pPrevTrunk);

5822 assert( rc!=SQLITE_OK \|\| sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );

5823 assert( rc!=SQLITE_OK \|\| (*ppPage)->isInit==0 );

5824 return rc;

5825 }

5826

5827 /*

5828 ** This function is used to add page iPage to the database file free-list.

5829 ** It is assumed that the page is not already a part of the free-list.

5830 **

5831 ** The value passed as the second argument to this function is optional.

5832 ** If the caller happens to have a pointer to the MemPage object

5833 ** corresponding to page iPage handy, it may pass it as the second value.

5834 ** Otherwise, it may pass NULL.

5835 **

5836 ** If a pointer to a MemPage object is passed as the second argument,

5837 ** its reference count is not altered by this function.

5838 */

5839 static int freePage2(BtShared pBt, MemPage pMemPage, Pgno iPage){

5840 MemPage pTrunk = 0; / Free-list trunk page */

5841 Pgno iTrunk = 0; /* Page number of free-list trunk page */

5842 MemPage pPage1 = pBt->pPage1; / Local reference to page 1 */

5843 MemPage pPage; / Page being freed. May be NULL. */

5844 int rc; /* Return Code */

5845 int nFree; /* Initial number of pages on free-list */

5846

5847 assert( sqlite3_mutex_held(pBt->mutex) );

5848 assert( CORRUPT_DB \|\| iPage>1 );

5849 assert( !pMemPage \|\| pMemPage->pgno==iPage );

5850

5851 if( iPage<2 ) return SQLITE_CORRUPT_BKPT;

5852 if( pMemPage ){

5853 pPage = pMemPage;

5854 sqlite3PagerRef(pPage->pDbPage);

5855 }else{

5856 pPage = btreePageLookup(pBt, iPage);

5857 }

5858

5859 /* Increment the free page count on pPage1 */

5860 rc = sqlite3PagerWrite(pPage1->pDbPage);

5861 if( rc ) goto freepage_out;

5862 nFree = get4byte(&pPage1->aData[36]);

5863 put4byte(&pPage1->aData[36], nFree+1);

5864

5865 if( pBt->btsFlags & BTS_SECURE_DELETE ){

5866 /* If the secure_delete option is enabled, then

5867 ** always fully overwrite deleted information with zeros.

5868 */

5869 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )

5870 \|\| ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)

5871 ){

5872 goto freepage_out;

5873 }

5874 memset(pPage->aData, 0, pPage->pBt->pageSize);

5875 }

5876

5877 /* If the database supports auto-vacuum, write an entry in the pointer-map

5878 ** to indicate that the page is free.

5879 */

5880 if( ISAUTOVACUUM ){

5881 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);

5882 if( rc ) goto freepage_out;

5883 }

5884

5885 /* Now manipulate the actual database free-list structure. There are two

5886 ** possibilities. If the free-list is currently empty, or if the first

5887 ** trunk page in the free-list is full, then this page will become a

5888 ** new free-list trunk page. Otherwise, it will become a leaf of the

5889 ** first trunk page in the current free-list. This block tests if it

5890 ** is possible to add the page as a new free-list leaf.

5891 */

5892 if( nFree!=0 ){

5893 u32 nLeaf; /* Initial number of leaf cells on trunk page */

5894

5895 iTrunk = get4byte(&pPage1->aData[32]);

5896 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);

5897 if( rc!=SQLITE_OK ){

5898 goto freepage_out;

5899 }

5900

5901 nLeaf = get4byte(&pTrunk->aData[4]);

5902 assert( pBt->usableSize>32 );

5903 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){

5904 rc = SQLITE_CORRUPT_BKPT;

5905 goto freepage_out;

5906 }

5907 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){

5908 /* In this case there is room on the trunk page to insert the page

5909 ** being freed as a new leaf.

5910 **

5911 ** Note that the trunk page is not really full until it contains

5912 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have

5913 ** coded. But due to a coding error in versions of SQLite prior to

5914 ** 3.6.0, databases with freelist trunk pages holding more than

5915 ** usableSize/4 - 8 entries will be reported as corrupt. In order

5916 ** to maintain backwards compatibility with older versions of SQLite,

5917 ** we will continue to restrict the number of entries to usableSize/4 - 8

5918 ** for now. At some point in the future (once everyone has upgraded

5919 ** to 3.6.0 or later) we should consider fixing the conditional above

5920 ** to read "usableSize/4-2" instead of "usableSize/4-8".

5921 **

5922 ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still

5923 ** avoid using the last six entries in the freelist trunk page array in

5924 ** order that database files created by newer versions of SQLite can be

5925 ** read by older versions of SQLite.

5926 */

5927 rc = sqlite3PagerWrite(pTrunk->pDbPage);

5928 if( rc==SQLITE_OK ){

5929 put4byte(&pTrunk->aData[4], nLeaf+1);

5930 put4byte(&pTrunk->aData[8+nLeaf*4], iPage);

5931 if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){

5932 sqlite3PagerDontWrite(pPage->pDbPage);

5933 }

5934 rc = btreeSetHasContent(pBt, iPage);

5935 }

5936 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));

5937 goto freepage_out;

5938 }

5939 }

5940

5941 /* If control flows to this point, then it was not possible to add the

5942 ** the page being freed as a leaf page of the first trunk in the free-list.

5943 ** Possibly because the free-list is empty, or possibly because the

5944 ** first trunk in the free-list is full. Either way, the page being freed

5945 ** will become the new first trunk page in the free-list.

5946 */

5947 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){

5948 goto freepage_out;

5949 }

5950 rc = sqlite3PagerWrite(pPage->pDbPage);

5951 if( rc!=SQLITE_OK ){

5952 goto freepage_out;

5953 }

5954 put4byte(pPage->aData, iTrunk);

5955 put4byte(&pPage->aData[4], 0);

5956 put4byte(&pPage1->aData[32], iPage);

5957 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));

5958

5959 freepage_out:

5960 if( pPage ){

5961 pPage->isInit = 0;

5962 }

5963 releasePage(pPage);

5964 releasePage(pTrunk);

5965 return rc;

5966 }

5967 static void freePage(MemPage pPage, int pRC){

5968 if( (*pRC)==SQLITE_OK ){

5969 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);

5970 }

5971 }

5972

5973 /*

5974 ** Free any overflow pages associated with the given Cell. Write the

5975 ** local Cell size (the number of bytes on the original page, omitting

5976 ** overflow) into *pnSize.

5977 */

5978 static int clearCell(

5979 MemPage pPage, / The page that contains the Cell */

5980 unsigned char pCell, / First byte of the Cell */

5981 u16 pnSize / Write the size of the Cell here */

5982 ){

5983 BtShared *pBt = pPage->pBt;

5984 CellInfo info;

5985 Pgno ovflPgno;

5986 int rc;

5987 int nOvfl;

5988 u32 ovflPageSize;

5989

5990 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

5991 pPage->xParseCell(pPage, pCell, &info);

5992 *pnSize = info.nSize;

5993 if( info.nLocal==info.nPayload ){

5994 return SQLITE_OK; /* No overflow pages. Return without doing anything */

5995 }

5996 if( pCell+info.nSize-1 > pPage->aData+pPage->maskPage ){

5997 return SQLITE_CORRUPT_BKPT; /* Cell extends past end of page */

5998 }

5999 ovflPgno = get4byte(pCell + info.nSize - 4);

6000 assert( pBt->usableSize > 4 );

6001 ovflPageSize = pBt->usableSize - 4;

6002 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;

6003 assert( nOvfl>0 \|\|

6004 (CORRUPT_DB && (info.nPayload + ovflPageSize)<ovflPageSize)

6005 );

6006 while( nOvfl-- ){

6007 Pgno iNext = 0;

6008 MemPage *pOvfl = 0;

6009 if( ovflPgno<2 \|\| ovflPgno>btreePagecount(pBt) ){

6010 /* 0 is not a legal page number and page 1 cannot be an

6011 ** overflow page. Therefore if ovflPgno<2 or past the end of the

6012 ** file the database must be corrupt. */

6013 return SQLITE_CORRUPT_BKPT;

6014 }

6015 if( nOvfl ){

6016 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);

6017 if( rc ) return rc;

6018 }

6019

6020 if( ( pOvfl \|\| ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )

6021 && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1

6022 ){

6023 /* There is no reason any cursor should have an outstanding reference

6024 ** to an overflow page belonging to a cell that is being deleted/updated.

6025 ** So if there exists more than one reference to this page, then it

6026 ** must not really be an overflow page and the database must be corrupt.

6027 ** It is helpful to detect this before calling freePage2(), as

6028 ** freePage2() may zero the page contents if secure-delete mode is

6029 ** enabled. If this 'overflow' page happens to be a page that the

6030 ** caller is iterating through or using in some other way, this

6031 ** can be problematic.

6032 */

6033 rc = SQLITE_CORRUPT_BKPT;

6034 }else{

6035 rc = freePage2(pBt, pOvfl, ovflPgno);

6036 }

6037

6038 if( pOvfl ){

6039 sqlite3PagerUnref(pOvfl->pDbPage);

6040 }

6041 if( rc ) return rc;

6042 ovflPgno = iNext;

6043 }

6044 return SQLITE_OK;

6045 }

6046

6047 /*

6048 ** Create the byte sequence used to represent a cell on page pPage

6049 ** and write that byte sequence into pCell[]. Overflow pages are

6050 ** allocated and filled in as necessary. The calling procedure

6051 ** is responsible for making sure sufficient space has been allocated

6052 ** for pCell[].

6053 **

6054 ** Note that pCell does not necessarily need to point to the pPage->aData

6055 ** area. pCell might point to some temporary storage. The cell will

6056 ** be constructed in this temporary area then copied into pPage->aData

6057 ** later.

6058 */

6059 static int fillInCell(

6060 MemPage pPage, / The page that contains the cell */

6061 unsigned char pCell, / Complete text of the cell */

6062 const void pKey, i64 nKey, / The key */

6063 const void pData,int nData, / The data */

6064 int nZero, /* Extra zero bytes to append to pData */

6065 int pnSize / Write cell size here */

6066 ){

6067 int nPayload;

6068 const u8 *pSrc;

6069 int nSrc, n, rc;

6070 int spaceLeft;

6071 MemPage *pOvfl = 0;

6072 MemPage *pToRelease = 0;

6073 unsigned char *pPrior;

6074 unsigned char *pPayload;

6075 BtShared *pBt = pPage->pBt;

6076 Pgno pgnoOvfl = 0;

6077 int nHeader;

6078

6079 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

6080

6081 /* pPage is not necessarily writeable since pCell might be auxiliary

6082 ** buffer space that is separate from the pPage buffer area */

6083 assert( pCell<pPage->aData \|\| pCell>=&pPage->aData[pBt->pageSize]

6084 \|\| sqlite3PagerIswriteable(pPage->pDbPage) );

6085

6086 /* Fill in the header. */

6087 nHeader = pPage->childPtrSize;

6088 nPayload = nData + nZero;

6089 if( pPage->intKeyLeaf ){

6090 nHeader += putVarint32(&pCell[nHeader], nPayload);

6091 }else{

6092 assert( nData==0 );

6093 assert( nZero==0 );

6094 }

6095 nHeader += putVarint(&pCell[nHeader], (u64)&nKey);

6096

6097 /* Fill in the payload size */

6098 if( pPage->intKey ){

6099 pSrc = pData;

6100 nSrc = nData;

6101 nData = 0;

6102 }else{

6103 assert( nKey<=0x7fffffff && pKey!=0 );

6104 nPayload = (int)nKey;

6105 pSrc = pKey;

6106 nSrc = (int)nKey;

6107 }

6108 if( nPayload<=pPage->maxLocal ){

6109 n = nHeader + nPayload;

6110 testcase( n==3 );

6111 testcase( n==4 );

6112 if( n<4 ) n = 4;

6113 *pnSize = n;

6114 spaceLeft = nPayload;

6115 pPrior = pCell;

6116 }else{

6117 int mn = pPage->minLocal;

6118 n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);

6119 testcase( n==pPage->maxLocal );

6120 testcase( n==pPage->maxLocal+1 );

6121 if( n > pPage->maxLocal ) n = mn;

6122 spaceLeft = n;

6123 *pnSize = n + nHeader + 4;

6124 pPrior = &pCell[nHeader+n];

6125 }

6126 pPayload = &pCell[nHeader];

6127

6128 /* At this point variables should be set as follows:

6129 **

6130 ** nPayload Total payload size in bytes

6131 ** pPayload Begin writing payload here

6132 ** spaceLeft Space available at pPayload. If nPayload>spaceLeft,

6133 ** that means content must spill into overflow pages.

6134 ** *pnSize Size of the local cell (not counting overflow pages)

6135 ** pPrior Where to write the pgno of the first overflow page

6136 **

6137 ** Use a call to btreeParseCellPtr() to verify that the values above

6138 ** were computed correctly.

6139 */

6140 #if SQLITE_DEBUG

6141 {

6142 CellInfo info;

6143 pPage->xParseCell(pPage, pCell, &info);

6144 assert( nHeader=(int)(info.pPayload - pCell) );

6145 assert( info.nKey==nKey );

6146 assert( *pnSize == info.nSize );

6147 assert( spaceLeft == info.nLocal );

6148 }

6149 #endif

6150

6151 /* Write the payload into the local Cell and any extra into overflow pages */

6152 while( nPayload>0 ){

6153 if( spaceLeft==0 ){

6154 #ifndef SQLITE_OMIT_AUTOVACUUM

6155 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */

6156 if( pBt->autoVacuum ){

6157 do{

6158 pgnoOvfl++;

6159 } while(

6160 PTRMAP_ISPAGE(pBt, pgnoOvfl) \|\| pgnoOvfl==PENDING_BYTE_PAGE(pBt)

6161 );

6162 }

6163 #endif

6164 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);

6165 #ifndef SQLITE_OMIT_AUTOVACUUM

6166 /* If the database supports auto-vacuum, and the second or subsequent

6167 ** overflow page is being allocated, add an entry to the pointer-map

6168 ** for that page now.

6169 **

6170 ** If this is the first overflow page, then write a partial entry

6171 ** to the pointer-map. If we write nothing to this pointer-map slot,

6172 ** then the optimistic overflow chain processing in clearCell()

6173 ** may misinterpret the uninitialized values and delete the

6174 ** wrong pages from the database.

6175 */

6176 if( pBt->autoVacuum && rc==SQLITE_OK ){

6177 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);

6178 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);

6179 if( rc ){

6180 releasePage(pOvfl);

6181 }

6182 }

6183 #endif

6184 if( rc ){

6185 releasePage(pToRelease);

6186 return rc;

6187 }

6188

6189 /* If pToRelease is not zero than pPrior points into the data area

6190 ** of pToRelease. Make sure pToRelease is still writeable. */

6191 assert( pToRelease==0 \|\| sqlite3PagerIswriteable(pToRelease->pDbPage) );

6192

6193 /* If pPrior is part of the data area of pPage, then make sure pPage

6194 ** is still writeable */

6195 assert( pPrior<pPage->aData \|\| pPrior>=&pPage->aData[pBt->pageSize]

6196 \|\| sqlite3PagerIswriteable(pPage->pDbPage) );

6197

6198 put4byte(pPrior, pgnoOvfl);

6199 releasePage(pToRelease);

6200 pToRelease = pOvfl;

6201 pPrior = pOvfl->aData;

6202 put4byte(pPrior, 0);

6203 pPayload = &pOvfl->aData[4];

6204 spaceLeft = pBt->usableSize - 4;

6205 }

6206 n = nPayload;

6207 if( n>spaceLeft ) n = spaceLeft;

6208

6209 /* If pToRelease is not zero than pPayload points into the data area

6210 ** of pToRelease. Make sure pToRelease is still writeable. */

6211 assert( pToRelease==0 \|\| sqlite3PagerIswriteable(pToRelease->pDbPage) );

6212

6213 /* If pPayload is part of the data area of pPage, then make sure pPage

6214 ** is still writeable */

6215 assert( pPayload<pPage->aData \|\| pPayload>=&pPage->aData[pBt->pageSize]

6216 \|\| sqlite3PagerIswriteable(pPage->pDbPage) );

6217

6218 if( nSrc>0 ){

6219 if( n>nSrc ) n = nSrc;

6220 assert( pSrc );

6221 memcpy(pPayload, pSrc, n);

6222 }else{

6223 memset(pPayload, 0, n);

6224 }

6225 nPayload -= n;

6226 pPayload += n;

6227 pSrc += n;

6228 nSrc -= n;

6229 spaceLeft -= n;

6230 if( nSrc==0 ){

6231 nSrc = nData;

6232 pSrc = pData;

6233 }

6234 }

6235 releasePage(pToRelease);

6236 return SQLITE_OK;

6237 }

6238

6239 /*

6240 ** Remove the i-th cell from pPage. This routine affects pPage only.

6241 ** The cell content is not freed or deallocated. It is assumed that

6242 ** the cell content has been copied someplace else. This routine just

6243 ** removes the reference to the cell from pPage.

6244 **

6245 ** "sz" must be the number of bytes in the cell.

6246 */

6247 static void dropCell(MemPage pPage, int idx, int sz, int pRC){

6248 u32 pc; /* Offset to cell content of cell being deleted */

6249 u8 data; / pPage->aData */

6250 u8 ptr; / Used to move bytes around within data[] */

6251 int rc; /* The return code */

6252 int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */

6253

6254 if( *pRC ) return;

6255

6256 assert( idx>=0 && idx<pPage->nCell );

6257 assert( CORRUPT_DB \|\| sz==cellSize(pPage, idx) );

6258 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

6259 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

6260 data = pPage->aData;

6261 ptr = &pPage->aCellIdx[2*idx];

6262 pc = get2byte(ptr);

6263 hdr = pPage->hdrOffset;

6264 testcase( pc==get2byte(&data[hdr+5]) );

6265 testcase( pc+sz==pPage->pBt->usableSize );

6266 if( pc < (u32)get2byte(&data[hdr+5]) \|\| pc+sz > pPage->pBt->usableSize ){

6267 *pRC = SQLITE_CORRUPT_BKPT;

6268 return;

6269 }

6270 rc = freeSpace(pPage, pc, sz);

6271 if( rc ){

6272 *pRC = rc;

6273 return;

6274 }

6275 pPage->nCell--;

6276 if( pPage->nCell==0 ){

6277 memset(&data[hdr+1], 0, 4);

6278 data[hdr+7] = 0;

6279 put2byte(&data[hdr+5], pPage->pBt->usableSize);

6280 pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset

6281 - pPage->childPtrSize - 8;

6282 }else{

6283 memmove(ptr, ptr+2, 2*(pPage->nCell - idx));

6284 put2byte(&data[hdr+3], pPage->nCell);

6285 pPage->nFree += 2;

6286 }

6287 }

6288

6289 /*

6290 ** Insert a new cell on pPage at cell index "i". pCell points to the

6291 ** content of the cell.

6292 **

6293 ** If the cell content will fit on the page, then put it there. If it

6294 ** will not fit, then make a copy of the cell content into pTemp if

6295 ** pTemp is not null. Regardless of pTemp, allocate a new entry

6296 ** in pPage->apOvfl[] and make it point to the cell content (either

6297 ** in pTemp or the original pCell) and also record its index.

6298 ** Allocating a new entry in pPage->aCell[] implies that

6299 ** pPage->nOverflow is incremented.

6300 */

6301 static void insertCell(

6302 MemPage pPage, / Page into which we are copying */

6303 int i, /* New cell becomes the i-th cell of the page */

6304 u8 pCell, / Content of the new cell */

6305 int sz, /* Bytes of content in pCell */

6306 u8 pTemp, / Temp storage space for pCell, if needed */

6307 Pgno iChild, /* If non-zero, replace first 4 bytes with this value */

6308 int pRC / Read and write return code from here */

6309 ){

6310 int idx = 0; /* Where to write new cell content in data[] */

6311 int j; /* Loop counter */

6312 u8 data; / The content of the whole page */

6313 u8 pIns; / The point in pPage->aCellIdx[] where no cell inserted */

6314

6315 if( *pRC ) return;

6316

6317 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );

6318 assert( MX_CELL(pPage->pBt)<=10921 );

6319 assert( pPage->nCell<=MX_CELL(pPage->pBt) \|\| CORRUPT_DB );

6320 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );

6321 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );

6322 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

6323 /* The cell should normally be sized correctly. However, when moving a

6324 ** malformed cell from a leaf page to an interior page, if the cell size

6325 ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size

6326 ** might be less than 8 (leaf-size + pointer) on the interior node. Hence

6327 ** the term after the \|\| in the following assert(). */

6328 assert( sz==pPage->xCellSize(pPage, pCell) \|\| (sz==8 && iChild>0) );

6329 if( pPage->nOverflow \|\| sz+2>pPage->nFree ){

6330 if( pTemp ){

6331 memcpy(pTemp, pCell, sz);

6332 pCell = pTemp;

6333 }

6334 if( iChild ){

6335 put4byte(pCell, iChild);

6336 }

6337 j = pPage->nOverflow++;

6338 assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) );

6339 pPage->apOvfl[j] = pCell;

6340 pPage->aiOvfl[j] = (u16)i;

6341

6342 /* When multiple overflows occur, they are always sequential and in

6343 ** sorted order. This invariants arise because multiple overflows can

6344 ** only occur when inserting divider cells into the parent page during

6345 ** balancing, and the dividers are adjacent and sorted.

6346 */

6347 assert( j==0 \|\| pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */

6348 assert( j==0 \|\| i==pPage->aiOvfl[j-1]+1 ); /* Overflows are sequential */

6349 }else{

6350 int rc = sqlite3PagerWrite(pPage->pDbPage);

6351 if( rc!=SQLITE_OK ){

6352 *pRC = rc;

6353 return;

6354 }

6355 assert( sqlite3PagerIswriteable(pPage->pDbPage) );

6356 data = pPage->aData;

6357 assert( &data[pPage->cellOffset]==pPage->aCellIdx );

6358 rc = allocateSpace(pPage, sz, &idx);

6359 if( rc ){ *pRC = rc; return; }

6360 /* The allocateSpace() routine guarantees the following properties

6361 ** if it returns successfully */

6362 assert( idx >= 0 );

6363 assert( idx >= pPage->cellOffset+2*pPage->nCell+2 \|\| CORRUPT_DB );

6364 assert( idx+sz <= (int)pPage->pBt->usableSize );

6365 pPage->nFree -= (u16)(2 + sz);

6366 memcpy(&data[idx], pCell, sz);

6367 if( iChild ){

6368 put4byte(&data[idx], iChild);

6369 }

6370 pIns = pPage->aCellIdx + i*2;

6371 memmove(pIns+2, pIns, 2*(pPage->nCell - i));

6372 put2byte(pIns, idx);

6373 pPage->nCell++;

6374 /* increment the cell count */

6375 if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;

6376 assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell );

6377 #ifndef SQLITE_OMIT_AUTOVACUUM

6378 if( pPage->pBt->autoVacuum ){

6379 /* The cell may contain a pointer to an overflow page. If so, write

6380 ** the entry for the overflow page into the pointer map.

6381 */

6382 ptrmapPutOvflPtr(pPage, pCell, pRC);

6383 }

6384 #endif

6385 }

6386 }

6387

6388 /*

6389 ** A CellArray object contains a cache of pointers and sizes for a

6390 ** consecutive sequence of cells that might be held on multiple pages.

6391 */

6392 typedef struct CellArray CellArray;

6393 struct CellArray {

6394 int nCell; /* Number of cells in apCell[] */

6395 MemPage pRef; / Reference page */

6396 u8 *apCell; / All cells begin balanced */

6397 u16 szCell; / Local size of all cells in apCell[] */

6398 };

6399

6400 /*

6401 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been

6402 ** computed.

6403 */

6404 static void populateCellCache(CellArray *p, int idx, int N){

6405 assert( idx>=0 && idx+N<=p->nCell );

6406 while( N>0 ){

6407 assert( p->apCell[idx]!=0 );

6408 if( p->szCell[idx]==0 ){

6409 p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);

6410 }else{

6411 assert( CORRUPT_DB \|\|

6412 p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );

6413 }

6414 idx++;

6415 N--;

6416 }

6417 }

6418

6419 /*

6420 ** Return the size of the Nth element of the cell array

6421 */

6422 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){

6423 assert( N>=0 && N<p->nCell );

6424 assert( p->szCell[N]==0 );

6425 p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);

6426 return p->szCell[N];

6427 }

6428 static u16 cachedCellSize(CellArray *p, int N){

6429 assert( N>=0 && N<p->nCell );

6430 if( p->szCell[N] ) return p->szCell[N];

6431 return computeCellSize(p, N);

6432 }

6433

6434 /*

6435 ** Array apCell[] contains pointers to nCell b-tree page cells. The

6436 ** szCell[] array contains the size in bytes of each cell. This function

6437 ** replaces the current contents of page pPg with the contents of the cell

6438 ** array.

6439 **

6440 ** Some of the cells in apCell[] may currently be stored in pPg. This

6441 ** function works around problems caused by this by making a copy of any

6442 ** such cells before overwriting the page data.

6443 **

6444 ** The MemPage.nFree field is invalidated by this function. It is the

6445 ** responsibility of the caller to set it correctly.

6446 */

6447 static int rebuildPage(

6448 MemPage pPg, / Edit this page */

6449 int nCell, /* Final number of cells on page */

6450 u8 *apCell, / Array of cells */

6451 u16 szCell / Array of cell sizes */

6452 ){

6453 const int hdr = pPg->hdrOffset; /* Offset of header on pPg */

6454 u8 * const aData = pPg->aData; /* Pointer to data for pPg */

6455 const int usableSize = pPg->pBt->usableSize;

6456 u8 * const pEnd = &aData[usableSize];

6457 int i;

6458 u8 *pCellptr = pPg->aCellIdx;

6459 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);

6460 u8 *pData;

6461

6462 i = get2byte(&aData[hdr+5]);

6463 memcpy(&pTmp[i], &aData[i], usableSize - i);

6464

6465 pData = pEnd;

6466 for(i=0; i<nCell; i++){

6467 u8 *pCell = apCell[i];

6468 if( SQLITE_WITHIN(pCell,aData,pEnd) ){

6469 pCell = &pTmp[pCell - aData];

6470 }

6471 pData -= szCell[i];

6472 put2byte(pCellptr, (pData - aData));

6473 pCellptr += 2;

6474 if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;

6475 memcpy(pData, pCell, szCell[i]);

6476 assert( szCell[i]==pPg->xCellSize(pPg, pCell) \|\| CORRUPT_DB );

6477 testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) );

6478 }

6479

6480 /* The pPg->nFree field is now set incorrectly. The caller will fix it. */

6481 pPg->nCell = nCell;

6482 pPg->nOverflow = 0;

6483

6484 put2byte(&aData[hdr+1], 0);

6485 put2byte(&aData[hdr+3], pPg->nCell);

6486 put2byte(&aData[hdr+5], pData - aData);

6487 aData[hdr+7] = 0x00;

6488 return SQLITE_OK;

6489 }

6490

6491 /*

6492 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell

6493 ** contains the size in bytes of each such cell. This function attempts to

6494 ** add the cells stored in the array to page pPg. If it cannot (because

6495 ** the page needs to be defragmented before the cells will fit), non-zero

6496 ** is returned. Otherwise, if the cells are added successfully, zero is

6497 ** returned.

6498 **

6499 ** Argument pCellptr points to the first entry in the cell-pointer array

6500 ** (part of page pPg) to populate. After cell apCell[0] is written to the

6501 ** page body, a 16-bit offset is written to pCellptr. And so on, for each

6502 ** cell in the array. It is the responsibility of the caller to ensure

6503 ** that it is safe to overwrite this part of the cell-pointer array.

6504 **

6505 ** When this function is called, *ppData points to the start of the

6506 ** content area on page pPg. If the size of the content area is extended,

6507 ** *ppData is updated to point to the new start of the content area

6508 ** before returning.

6509 **

6510 ** Finally, argument pBegin points to the byte immediately following the

6511 ** end of the space required by this page for the cell-pointer area (for

6512 ** all cells - not just those inserted by the current call). If the content

6513 ** area must be extended to before this point in order to accommodate all

6514 ** cells in apCell[], then the cells do not fit and non-zero is returned.

6515 */

6516 static int pageInsertArray(

6517 MemPage pPg, / Page to add cells to */

6518 u8 pBegin, / End of cell-pointer array */

6519 u8 *ppData, / IN/OUT: Page content -area pointer */

6520 u8 pCellptr, / Pointer to cell-pointer area */

6521 int iFirst, /* Index of first cell to add */

6522 int nCell, /* Number of cells to add to pPg */

6523 CellArray pCArray / Array of cells */

6524 ){

6525 int i;

6526 u8 *aData = pPg->aData;

6527 u8 pData = ppData;

6528 int iEnd = iFirst + nCell;

6529 assert( CORRUPT_DB \|\| pPg->hdrOffset==0 ); /* Never called on page 1 */

6530 for(i=iFirst; i<iEnd; i++){

6531 int sz, rc;

6532 u8 *pSlot;

6533 sz = cachedCellSize(pCArray, i);

6534 if( (aData[1]==0 && aData[2]==0) \|\| (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){

6535 pData -= sz;

6536 if( pData<pBegin ) return 1;

6537 pSlot = pData;

6538 }

6539 /* pSlot and pCArray->apCell[i] will never overlap on a well-formed

6540 ** database. But they might for a corrupt database. Hence use memmove()

6541 ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */

6542 assert( (pSlot+sz)<=pCArray->apCell[i]

6543 \|\| pSlot>=(pCArray->apCell[i]+sz)

6544 \|\| CORRUPT_DB );

6545 memmove(pSlot, pCArray->apCell[i], sz);

6546 put2byte(pCellptr, (pSlot - aData));

6547 pCellptr += 2;

6548 }

6549 *ppData = pData;

6550 return 0;

6551 }

6552

6553 /*

6554 ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell

6555 ** contains the size in bytes of each such cell. This function adds the

6556 ** space associated with each cell in the array that is currently stored

6557 ** within the body of pPg to the pPg free-list. The cell-pointers and other

6558 ** fields of the page are not updated.

6559 **

6560 ** This function returns the total number of cells added to the free-list.

6561 */

6562 static int pageFreeArray(

6563 MemPage pPg, / Page to edit */

6564 int iFirst, /* First cell to delete */

6565 int nCell, /* Cells to delete */

6566 CellArray pCArray / Array of cells */

6567 ){

6568 u8 * const aData = pPg->aData;

6569 u8 * const pEnd = &aData[pPg->pBt->usableSize];

6570 u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];

6571 int nRet = 0;

6572 int i;

6573 int iEnd = iFirst + nCell;

6574 u8 *pFree = 0;

6575 int szFree = 0;

6576

6577 for(i=iFirst; i<iEnd; i++){

6578 u8 *pCell = pCArray->apCell[i];

6579 if( SQLITE_WITHIN(pCell, pStart, pEnd) ){

6580 int sz;

6581 /* No need to use cachedCellSize() here. The sizes of all cells that

6582 ** are to be freed have already been computing while deciding which

6583 ** cells need freeing */

6584 sz = pCArray->szCell[i]; assert( sz>0 );

6585 if( pFree!=(pCell + sz) ){

6586 if( pFree ){

6587 assert( pFree>aData && (pFree - aData)<65536 );

6588 freeSpace(pPg, (u16)(pFree - aData), szFree);

6589 }

6590 pFree = pCell;

6591 szFree = sz;

6592 if( pFree+sz>pEnd ) return 0;

6593 }else{

6594 pFree = pCell;

6595 szFree += sz;

6596 }

6597 nRet++;

6598 }

6599 }

6600 if( pFree ){

6601 assert( pFree>aData && (pFree - aData)<65536 );

6602 freeSpace(pPg, (u16)(pFree - aData), szFree);

6603 }

6604 return nRet;

6605 }

6606

6607 /*

6608 ** apCell[] and szCell[] contains pointers to and sizes of all cells in the

6609 ** pages being balanced. The current page, pPg, has pPg->nCell cells starting

6610 ** with apCell[iOld]. After balancing, this page should hold nNew cells

6611 ** starting at apCell[iNew].

6612 **

6613 ** This routine makes the necessary adjustments to pPg so that it contains

6614 ** the correct cells after being balanced.

6615 **

6616 ** The pPg->nFree field is invalid when this function returns. It is the

6617 ** responsibility of the caller to set it correctly.

6618 */

6619 static int editPage(

6620 MemPage pPg, / Edit this page */

6621 int iOld, /* Index of first cell currently on page */

6622 int iNew, /* Index of new first cell on page */

6623 int nNew, /* Final number of cells on page */

6624 CellArray pCArray / Array of cells and sizes */

6625 ){

6626 u8 * const aData = pPg->aData;

6627 const int hdr = pPg->hdrOffset;

6628 u8 pBegin = &pPg->aCellIdx[nNew 2];

6629 int nCell = pPg->nCell; /* Cells stored on pPg */

6630 u8 *pData;

6631 u8 *pCellptr;

6632 int i;

6633 int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;

6634 int iNewEnd = iNew + nNew;

6635

6636 #ifdef SQLITE_DEBUG

6637 u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);

6638 memcpy(pTmp, aData, pPg->pBt->usableSize);

6639 #endif

6640

6641 /* Remove cells from the start and end of the page */

6642 if( iOld<iNew ){

6643 int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);

6644 memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift2], nCell2);

6645 nCell -= nShift;

6646 }

6647 if( iNewEnd < iOldEnd ){

6648 nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);

6649 }

6650

6651 pData = &aData[get2byteNotZero(&aData[hdr+5])];

6652 if( pData<pBegin ) goto editpage_fail;

6653

6654 /* Add cells to the start of the page */

6655 if( iNew<iOld ){

6656 int nAdd = MIN(nNew,iOld-iNew);

6657 assert( (iOld-iNew)<nNew \|\| nCell==0 \|\| CORRUPT_DB );

6658 pCellptr = pPg->aCellIdx;

6659 memmove(&pCellptr[nAdd2], pCellptr, nCell2);

6660 if( pageInsertArray(

6661 pPg, pBegin, &pData, pCellptr,

6662 iNew, nAdd, pCArray

6663 ) ) goto editpage_fail;

6664 nCell += nAdd;

6665 }

6666

6667 /* Add any overflow cells */

6668 for(i=0; i<pPg->nOverflow; i++){

6669 int iCell = (iOld + pPg->aiOvfl[i]) - iNew;

6670 if( iCell>=0 && iCell<nNew ){

6671 pCellptr = &pPg->aCellIdx[iCell * 2];

6672 memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);

6673 nCell++;

6674 if( pageInsertArray(

6675 pPg, pBegin, &pData, pCellptr,

6676 iCell+iNew, 1, pCArray

6677 ) ) goto editpage_fail;

6678 }

6679 }

6680

6681 /* Append cells to the end of the page */

6682 pCellptr = &pPg->aCellIdx[nCell*2];

6683 if( pageInsertArray(

6684 pPg, pBegin, &pData, pCellptr,

6685 iNew+nCell, nNew-nCell, pCArray

6686 ) ) goto editpage_fail;

6687

6688 pPg->nCell = nNew;

6689 pPg->nOverflow = 0;

6690

6691 put2byte(&aData[hdr+3], pPg->nCell);

6692 put2byte(&aData[hdr+5], pData - aData);

6693

6694 #ifdef SQLITE_DEBUG

6695 for(i=0; i<nNew && !CORRUPT_DB; i++){

6696 u8 *pCell = pCArray->apCell[i+iNew];

6697 int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);

6698 if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){

6699 pCell = &pTmp[pCell - aData];

6700 }

6701 assert( 0==memcmp(pCell, &aData[iOff],

6702 pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );

6703 }

6704 #endif

6705

6706 return SQLITE_OK;

6707 editpage_fail:

6708 /* Unable to edit this page. Rebuild it from scratch instead. */

6709 populateCellCache(pCArray, iNew, nNew);

6710 return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]);

6711 }

6712

6713 /*

6714 ** The following parameters determine how many adjacent pages get involved

6715 ** in a balancing operation. NN is the number of neighbors on either side

6716 ** of the page that participate in the balancing operation. NB is the

6717 ** total number of pages that participate, including the target page and

6718 ** NN neighbors on either side.

6719 **

6720 ** The minimum value of NN is 1 (of course). Increasing NN above 1

6721 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance

6722 ** in exchange for a larger degradation in INSERT and UPDATE performance.

6723 ** The value of NN appears to give the best results overall.

6724 */

#define NN 1             /* Number of neighbors on either side of pPage */
#define NB (NN*2+1)      /* Total pages involved in the balance */

6727

6728

6729 #ifndef SQLITE_OMIT_QUICKBALANCE

6730 /*

6731 ** This version of balance() handles the common special case where

6732 ** a new entry is being inserted on the extreme right-end of the

6733 ** tree, in other words, when the new entry will become the largest

6734 ** entry in the tree.

6735 **

6736 ** Instead of trying to balance the 3 right-most leaf pages, just add

6737 ** a new page to the right-hand side and put the one new entry in

6738 ** that page. This leaves the right side of the tree somewhat

6739 ** unbalanced. But odds are that we will be inserting new entries

6740 ** at the end soon afterwards so the nearly empty page will quickly

6741 ** fill up. On average.

6742 **

6743 ** pPage is the leaf page which is the right-most page in the tree.

6744 ** pParent is its parent. pPage must have a single overflow entry

6745 ** which is also the right-most entry on the page.

6746 **

6747 ** The pSpace buffer is used to store a temporary copy of the divider

6748 ** cell that will be inserted into pParent. Such a cell consists of a 4

6749 ** byte page number followed by a variable length integer. In other

6750 ** words, at most 13 bytes. Hence the pSpace buffer must be at

6751 ** least 13 bytes in size.

6752 */

6753 static int balance_quick(MemPage pParent, MemPage pPage, u8 *pSpace){

6754 BtShared const pBt = pPage->pBt; / B-Tree Database */

6755 MemPage pNew; / Newly allocated page */

6756 int rc; /* Return Code */

6757 Pgno pgnoNew; /* Page number of pNew */

6758

6759 assert( sqlite3_mutex_held(pPage->pBt->mutex) );

6760 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

6761 assert( pPage->nOverflow==1 );

6762

6763 /* This error condition is now caught prior to reaching this function */

6764 if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;

6765

6766 /* Allocate a new page. This page will become the right-sibling of

6767 ** pPage. Make the parent page writable, so that the new divider cell

6768 ** may be inserted. If both these operations are successful, proceed.

6769 */

6770 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);

6771

6772 if( rc==SQLITE_OK ){

6773

6774 u8 *pOut = &pSpace[4];

6775 u8 *pCell = pPage->apOvfl[0];

6776 u16 szCell = pPage->xCellSize(pPage, pCell);

6777 u8 *pStop;

6778

6779 assert( sqlite3PagerIswriteable(pNew->pDbPage) );

6780 assert( pPage->aData[0]==(PTF_INTKEY\|PTF_LEAFDATA\|PTF_LEAF) );

6781 zeroPage(pNew, PTF_INTKEY\|PTF_LEAFDATA\|PTF_LEAF);

6782 rc = rebuildPage(pNew, 1, &pCell, &szCell);

6783 if( NEVER(rc) ) return rc;

6784 pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;

6785

6786 /* If this is an auto-vacuum database, update the pointer map

6787 ** with entries for the new page, and any pointer from the

6788 ** cell on the page to an overflow page. If either of these

6789 ** operations fails, the return code is set, but the contents

6790 ** of the parent page are still manipulated by thh code below.

6791 ** That is Ok, at this point the parent page is guaranteed to

6792 ** be marked as dirty. Returning an error code will cause a

6793 ** rollback, undoing any changes made to the parent page.

6794 */

6795 if( ISAUTOVACUUM ){

6796 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);

6797 if( szCell>pNew->minLocal ){

6798 ptrmapPutOvflPtr(pNew, pCell, &rc);

6799 }

6800 }

6801

6802 /* Create a divider cell to insert into pParent. The divider cell

6803 ** consists of a 4-byte page number (the page number of pPage) and

6804 ** a variable length key value (which must be the same value as the

6805 ** largest key on pPage).

6806 **

6807 ** To find the largest key value on pPage, first find the right-most

6808 ** cell on pPage. The first two fields of this cell are the

6809 ** record-length (a variable length integer at most 32-bits in size)

6810 ** and the key value (a variable length integer, may have any value).

6811 ** The first of the while(...) loops below skips over the record-length

6812 ** field. The second while(...) loop copies the key value from the

6813 ** cell on pPage into the pSpace buffer.

6814 */

6815 pCell = findCell(pPage, pPage->nCell-1);

6816 pStop = &pCell[9];

6817 while( (*(pCell++)&0x80) && pCell<pStop );

6818 pStop = &pCell[9];

6819 while( (((pOut++) = (pCell++))&0x80) && pCell<pStop );

6820

6821 /* Insert the new divider cell into pParent. */

6822 insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),

6823 0, pPage->pgno, &rc);

6824

6825 /* Set the right-child pointer of pParent to point to the new page. */

6826 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);

6827

6828 /* Release the reference to the new page. */

6829 releasePage(pNew);

6830 }

6831

6832 return rc;

6833 }

6834 #endif /* SQLITE_OMIT_QUICKBALANCE */

6835

6836 #if 0

6837 /*

6838 ** This function does not contribute anything to the operation of SQLite.

6839 ** it is sometimes activated temporarily while debugging code responsible

6840 ** for setting pointer-map entries.

6841 */

6842 static int ptrmapCheckPages(MemPage **apPage, int nPage){

6843 int i, j;

6844 for(i=0; i<nPage; i++){

6845 Pgno n;

6846 u8 e;

6847 MemPage *pPage = apPage[i];

6848 BtShared *pBt = pPage->pBt;

6849 assert( pPage->isInit );

6850

6851 for(j=0; j<pPage->nCell; j++){

6852 CellInfo info;

6853 u8 *z;

6854

6855 z = findCell(pPage, j);

6856 pPage->xParseCell(pPage, z, &info);

6857 if( info.nLocal<info.nPayload ){

6858 Pgno ovfl = get4byte(&z[info.nSize-4]);

6859 ptrmapGet(pBt, ovfl, &e, &n);

6860 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );

6861 }

6862 if( !pPage->leaf ){

6863 Pgno child = get4byte(z);

6864 ptrmapGet(pBt, child, &e, &n);

6865 assert( n==pPage->pgno && e==PTRMAP_BTREE );

6866 }

6867 }

6868 if( !pPage->leaf ){

6869 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);

6870 ptrmapGet(pBt, child, &e, &n);

6871 assert( n==pPage->pgno && e==PTRMAP_BTREE );

6872 }

6873 }

6874 return 1;

6875 }

6876 #endif

6877

6878 /*

6879 ** This function is used to copy the contents of the b-tree node stored

6880 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then

6881 ** the pointer-map entries for each child page are updated so that the

6882 ** parent page stored in the pointer map is page pTo. If pFrom contained

6883 ** any cells with overflow page pointers, then the corresponding pointer

6884 ** map entries are also updated so that the parent page is page pTo.

6885 **

6886 ** If pFrom is currently carrying any overflow cells (entries in the

6887 ** MemPage.apOvfl[] array), they are not copied to pTo.

6888 **

6889 ** Before returning, page pTo is reinitialized using btreeInitPage().

6890 **

6891 ** The performance of this function is not critical. It is only used by

6892 ** the balance_shallower() and balance_deeper() procedures, neither of

6893 ** which are called often under normal circumstances.

6894 */

6895 static void copyNodeContent(MemPage pFrom, MemPage pTo, int *pRC){

6896 if( (*pRC)==SQLITE_OK ){

6897 BtShared * const pBt = pFrom->pBt;

6898 u8 * const aFrom = pFrom->aData;

6899 u8 * const aTo = pTo->aData;

6900 int const iFromHdr = pFrom->hdrOffset;

6901 int const iToHdr = ((pTo->pgno==1) ? 100 : 0);

6902 int rc;

6903 int iData;

6904

6905

6906 assert( pFrom->isInit );

6907 assert( pFrom->nFree>=iToHdr );

6908 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );

6909

6910 /* Copy the b-tree node content from page pFrom to page pTo. */

6911 iData = get2byte(&aFrom[iFromHdr+5]);

6912 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);

6913 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);

6914

6915 /* Reinitialize page pTo so that the contents of the MemPage structure

6916 ** match the new data. The initialization of pTo can actually fail under

6917 ** fairly obscure circumstances, even though it is a copy of initialized

6918 ** page pFrom.

6919 */

6920 pTo->isInit = 0;

6921 rc = btreeInitPage(pTo);

6922 if( rc!=SQLITE_OK ){

6923 *pRC = rc;

6924 return;

6925 }

6926

6927 /* If this is an auto-vacuum database, update the pointer-map entries

6928 ** for any b-tree or overflow pages that pTo now contains the pointers to.

6929 */

6930 if( ISAUTOVACUUM ){

6931 *pRC = setChildPtrmaps(pTo);

6932 }

6933 }

6934 }

6935

6936 /*

6937 ** This routine redistributes cells on the iParentIdx'th child of pParent

6938 ** (hereafter "the page") and up to 2 siblings so that all pages have about the

6939 ** same amount of free space. Usually a single sibling on either side of the

6940 ** page are used in the balancing, though both siblings might come from one

6941 ** side if the page is the first or last child of its parent. If the page

6942 ** has fewer than 2 siblings (something which can only happen if the page

6943 ** is a root page or a child of a root page) then all available siblings

6944 ** participate in the balancing.

6945 **

6946 ** The number of siblings of the page might be increased or decreased by

6947 ** one or two in an effort to keep pages nearly full but not over full.

6948 **

6949 ** Note that when this routine is called, some of the cells on the page

6950 ** might not actually be stored in MemPage.aData[]. This can happen

6951 ** if the page is overfull. This routine ensures that all cells allocated

6952 ** to the page and its siblings fit into MemPage.aData[] before returning.

6953 **

6954 ** In the course of balancing the page and its siblings, cells may be

6955 ** inserted into or removed from the parent page (pParent). Doing so

6956 ** may cause the parent page to become overfull or underfull. If this

6957 ** happens, it is the responsibility of the caller to invoke the correct

6958 ** balancing routine to fix this problem (see the balance() routine).

6959 **

6960 ** If this routine fails for any reason, it might leave the database

6961 ** in a corrupted state. So if this routine fails, the database should

6962 ** be rolled back.

6963 **

6964 ** The third argument to this function, aOvflSpace, is a pointer to a

6965 ** buffer big enough to hold one page. If while inserting cells into the parent

6966 ** page (pParent) the parent page becomes overfull, this buffer is

6967 ** used to store the parent's overflow cells. Because this function inserts

6968 ** a maximum of four divider cells into the parent page, and the maximum

6969 ** size of a cell stored within an internal node is always less than 1/4

6970 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large

6971 ** enough for all overflow cells.

6972 **

6973 ** If aOvflSpace is set to a null pointer, this function returns

6974 ** SQLITE_NOMEM.

6975 */

6976 static int balance_nonroot(

6977 MemPage pParent, / Parent page of siblings being balanced */

6978 int iParentIdx, /* Index of "the page" in pParent */

6979 u8 aOvflSpace, / page-size bytes of space for parent ovfl */

6980 int isRoot, /* True if pParent is a root-page */

6981 int bBulk /* True if this call is part of a bulk load */

6982 ){

6983 BtShared pBt; / The whole database */

6984 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */

6985 int nNew = 0; /* Number of pages in apNew[] */

6986 int nOld; /* Number of pages in apOld[] */

6987 int i, j, k; /* Loop counters */

6988 int nxDiv; /* Next divider slot in pParent->aCell[] */

6989 int rc = SQLITE_OK; /* The return code */

6990 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */

6991 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */

6992 int usableSpace; /* Bytes in pPage beyond the header */

6993 int pageFlags; /* Value of pPage->aData[0] */

6994 int iSpace1 = 0; /* First unused byte of aSpace1[] */

6995 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */

6996 int szScratch; /* Size of scratch memory requested */

6997 MemPage apOld[NB]; / pPage and up to two siblings */

6998 MemPage apNew[NB+2]; / pPage and up to NB siblings after balancing */

6999 u8 pRight; / Location in parent of right-sibling pointer */

7000 u8 apDiv[NB-1]; / Divider cells in pParent */

7001 int cntNew[NB+2]; /* Index in b.paCell[] of cell after i-th page */

7002 int cntOld[NB+2]; /* Old index in b.apCell[] */

7003 int szNew[NB+2]; /* Combined size of cells placed on i-th page */

7004 u8 aSpace1; / Space for copies of dividers cells */

7005 Pgno pgno; /* Temp var to store a page number in */

7006 u8 abDone[NB+2]; /* True after i'th new page is populated */

7007 Pgno aPgno[NB+2]; /* Page numbers of new pages before shuffling */

7008 Pgno aPgOrder[NB+2]; /* Copy of aPgno[] used for sorting pages */

7009 u16 aPgFlags[NB+2]; /* flags field of new pages before shuffling */

7010 CellArray b; /* Parsed information on cells being balanced */

7011

7012 memset(abDone, 0, sizeof(abDone));

7013 b.nCell = 0;

7014 b.apCell = 0;

7015 pBt = pParent->pBt;

7016 assert( sqlite3_mutex_held(pBt->mutex) );

7017 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

7018

7019 #if 0

7020 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));

7021 #endif

7022

7023 /* At this point pParent may have at most one overflow cell. And if

7024 ** this overflow cell is present, it must be the cell with

7025 ** index iParentIdx. This scenario comes about when this function

7026 ** is called (indirectly) from sqlite3BtreeDelete().

7027 */

7028 assert( pParent->nOverflow==0 \|\| pParent->nOverflow==1 );

7029 assert( pParent->nOverflow==0 \|\| pParent->aiOvfl[0]==iParentIdx );

7030

7031 if( !aOvflSpace ){

7032 return SQLITE_NOMEM;

7033 }

7034

7035 /* Find the sibling pages to balance. Also locate the cells in pParent

7036 ** that divide the siblings. An attempt is made to find NN siblings on

7037 ** either side of pPage. More siblings are taken from one side, however,

7038 ** if there are fewer than NN siblings on the other side. If pParent

7039 ** has NB or fewer children then all children of pParent are taken.

7040 **

7041 ** This loop also drops the divider cells from the parent page. This

7042 ** way, the remainder of the function does not have to deal with any

7043 ** overflow cells in the parent page, since if any existed they will

7044 ** have already been removed.

7045 */

7046 i = pParent->nOverflow + pParent->nCell;

7047 if( i<2 ){

7048 nxDiv = 0;

7049 }else{

7050 assert( bBulk==0 \|\| bBulk==1 );

7051 if( iParentIdx==0 ){

7052 nxDiv = 0;

7053 }else if( iParentIdx==i ){

7054 nxDiv = i-2+bBulk;

7055 }else{

7056 nxDiv = iParentIdx-1;

7057 }

7058 i = 2-bBulk;

7059 }

7060 nOld = i+1;

7061 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){

7062 pRight = &pParent->aData[pParent->hdrOffset+8];

7063 }else{

7064 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);

7065 }

7066 pgno = get4byte(pRight);

7067 while( 1 ){

7068 rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);

7069 if( rc ){

7070 memset(apOld, 0, (i+1)sizeof(MemPage));

7071 goto balance_cleanup;

7072 }

7073 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;

7074 if( (i--)==0 ) break;

7075

7076 if( i+nxDiv==pParent->aiOvfl[0] && pParent->nOverflow ){

7077 apDiv[i] = pParent->apOvfl[0];

7078 pgno = get4byte(apDiv[i]);

7079 szNew[i] = pParent->xCellSize(pParent, apDiv[i]);

7080 pParent->nOverflow = 0;

7081 }else{

7082 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);

7083 pgno = get4byte(apDiv[i]);

7084 szNew[i] = pParent->xCellSize(pParent, apDiv[i]);

7085

7086 /* Drop the cell from the parent page. apDiv[i] still points to

7087 ** the cell within the parent, even though it has been dropped.

7088 ** This is safe because dropping a cell only overwrites the first

7089 ** four bytes of it, and this function does not need the first

7090 ** four bytes of the divider cell. So the pointer is safe to use

7091 ** later on.

7092 **

7093 ** But not if we are in secure-delete mode. In secure-delete mode,

7094 ** the dropCell() routine will overwrite the entire cell with zeroes.

7095 ** In this case, temporarily copy the cell into the aOvflSpace[]

7096 ** buffer. It will be copied out again as soon as the aSpace[] buffer

7097 ** is allocated. */

7098 if( pBt->btsFlags & BTS_SECURE_DELETE ){

7099 int iOff;

7100

7101 iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);

7102 if( (iOff+szNew[i])>(int)pBt->usableSize ){

7103 rc = SQLITE_CORRUPT_BKPT;

7104 memset(apOld, 0, (i+1)sizeof(MemPage));

7105 goto balance_cleanup;

7106 }else{

7107 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);

7108 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];

7109 }

7110 }

7111 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);

7112 }

7113 }

7114

7115 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte

7116 ** alignment */

7117 nMaxCells = (nMaxCells + 3)&~3;

7118

7119 /*

7120 ** Allocate space for memory structures

7121 */

7122 szScratch =

7123 nMaxCellssizeof(u8) /* b.apCell */

7124 + nMaxCellssizeof(u16) / b.szCell */

7125 + pBt->pageSize; /* aSpace1 */

7126

7127 /* EVIDENCE-OF: R-28375-38319 SQLite will never request a scratch buffer

7128 ** that is more than 6 times the database page size. */

7129 assert( szScratch<=6*(int)pBt->pageSize );

7130 b.apCell = sqlite3ScratchMalloc( szScratch );

7131 if( b.apCell==0 ){

7132 rc = SQLITE_NOMEM;

7133 goto balance_cleanup;

7134 }

7135 b.szCell = (u16*)&b.apCell[nMaxCells];

7136 aSpace1 = (u8*)&b.szCell[nMaxCells];

7137 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );

7138

7139 /*

7140 ** Load pointers to all cells on sibling pages and the divider cells

7141 ** into the local b.apCell[] array. Make copies of the divider cells

7142 ** into space obtained from aSpace1[]. The divider cells have already

7143 ** been removed from pParent.

7144 **

7145 ** If the siblings are on leaf pages, then the child pointers of the

7146 ** divider cells are stripped from the cells before they are copied

7147 ** into aSpace1[]. In this way, all cells in b.apCell[] are without

7148 ** child pointers. If siblings are not leaves, then all cell in

7149 ** b.apCell[] include child pointers. Either way, all cells in b.apCell[]

7150 ** are alike.

7151 **

7152 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.

7153 ** leafData: 1 if pPage holds key+data and pParent holds only keys.

7154 */

7155 b.pRef = apOld[0];

7156 leafCorrection = b.pRef->leaf*4;

7157 leafData = b.pRef->intKeyLeaf;

7158 for(i=0; i<nOld; i++){

7159 MemPage *pOld = apOld[i];

7160 int limit = pOld->nCell;

7161 u8 *aData = pOld->aData;

7162 u16 maskPage = pOld->maskPage;

7163 u8 *piCell = aData + pOld->cellOffset;

7164 u8 *piEnd;

7165

7166 /* Verify that all sibling pages are of the same "type" (table-leaf,

7167 ** table-interior, index-leaf, or index-interior).

7168 */

7169 if( pOld->aData[0]!=apOld[0]->aData[0] ){

7170 rc = SQLITE_CORRUPT_BKPT;

7171 goto balance_cleanup;

7172 }

7173

7174 /* Load b.apCell[] with pointers to all cells in pOld. If pOld

7175 ** constains overflow cells, include them in the b.apCell[] array

7176 ** in the correct spot.

7177 **

7178 ** Note that when there are multiple overflow cells, it is always the

7179 ** case that they are sequential and adjacent. This invariant arises

7180 ** because multiple overflows can only occurs when inserting divider

7181 ** cells into a parent on a prior balance, and divider cells are always

7182 ** adjacent and are inserted in order. There is an assert() tagged

7183 ** with "NOTE 1" in the overflow cell insertion loop to prove this

7184 ** invariant.

7185 **

7186 ** This must be done in advance. Once the balance starts, the cell

7187 ** offset section of the btree page will be overwritten and we will no

7188 ** long be able to find the cells if a pointer to each cell is not saved

7189 ** first.

7190 */

7191 memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*limit);

7192 if( pOld->nOverflow>0 ){

7193 memset(&b.szCell[b.nCell+limit], 0, sizeof(b.szCell[0])*pOld->nOverflow);

7194 limit = pOld->aiOvfl[0];

7195 for(j=0; j<limit; j++){

7196 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));

7197 piCell += 2;

7198 b.nCell++;

7199 }

7200 for(k=0; k<pOld->nOverflow; k++){

7201 assert( k==0 \|\| pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */

7202 b.apCell[b.nCell] = pOld->apOvfl[k];

7203 b.nCell++;

7204 }

7205 }

7206 piEnd = aData + pOld->cellOffset + 2*pOld->nCell;

7207 while( piCell<piEnd ){

7208 assert( b.nCell<nMaxCells );

7209 b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));

7210 piCell += 2;

7211 b.nCell++;

7212 }

7213

7214 cntOld[i] = b.nCell;

7215 if( i<nOld-1 && !leafData){

7216 u16 sz = (u16)szNew[i];

7217 u8 *pTemp;

7218 assert( b.nCell<nMaxCells );

7219 b.szCell[b.nCell] = sz;

7220 pTemp = &aSpace1[iSpace1];

7221 iSpace1 += sz;

7222 assert( sz<=pBt->maxLocal+23 );

7223 assert( iSpace1 <= (int)pBt->pageSize );

7224 memcpy(pTemp, apDiv[i], sz);

7225 b.apCell[b.nCell] = pTemp+leafCorrection;

7226 assert( leafCorrection==0 \|\| leafCorrection==4 );

7227 b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;

7228 if( !pOld->leaf ){

7229 assert( leafCorrection==0 );

7230 assert( pOld->hdrOffset==0 );

7231 /* The right pointer of the child page pOld becomes the left

7232 ** pointer of the divider cell */

7233 memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);

7234 }else{

7235 assert( leafCorrection==4 );

7236 while( b.szCell[b.nCell]<4 ){

7237 /* Do not allow any cells smaller than 4 bytes. If a smaller cell

7238 ** does exist, pad it with 0x00 bytes. */

7239 assert( b.szCell[b.nCell]==3 \|\| CORRUPT_DB );

7240 assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] \|\| CORRUPT_DB );

7241 aSpace1[iSpace1++] = 0x00;

7242 b.szCell[b.nCell]++;

7243 }

7244 }

7245 b.nCell++;

7246 }

7247 }

7248

7249 /*

7250 ** Figure out the number of pages needed to hold all b.nCell cells.

7251 ** Store this number in "k". Also compute szNew[] which is the total

7252 ** size of all cells on the i-th page and cntNew[] which is the index

7253 ** in b.apCell[] of the cell that divides page i from page i+1.

7254 ** cntNew[k] should equal b.nCell.

7255 **

7256 ** Values computed by this block:

7257 **

7258 ** k: The total number of sibling pages

7259 ** szNew[i]: Spaced used on the i-th sibling page.

7260 ** cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to

7261 ** the right of the i-th sibling page.

7262 ** usableSpace: Number of bytes of space available on each sibling.

7263 **

7264 */

7265 usableSpace = pBt->usableSize - 12 + leafCorrection;

7266 for(i=0; i<nOld; i++){

7267 MemPage *p = apOld[i];

7268 szNew[i] = usableSpace - p->nFree;

7269 if( szNew[i]<0 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }

7270 for(j=0; j<p->nOverflow; j++){

7271 szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);

7272 }

7273 cntNew[i] = cntOld[i];

7274 }

7275 k = nOld;

7276 for(i=0; i<k; i++){

7277 int sz;

7278 while( szNew[i]>usableSpace ){

7279 if( i+1>=k ){

7280 k = i+2;

7281 if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }

7282 szNew[k-1] = 0;

7283 cntNew[k-1] = b.nCell;

7284 }

7285 sz = 2 + cachedCellSize(&b, cntNew[i]-1);

7286 szNew[i] -= sz;

7287 if( !leafData ){

7288 if( cntNew[i]<b.nCell ){

7289 sz = 2 + cachedCellSize(&b, cntNew[i]);

7290 }else{

7291 sz = 0;

7292 }

7293 }

7294 szNew[i+1] += sz;

7295 cntNew[i]--;

7296 }

7297 while( cntNew[i]<b.nCell ){

7298 sz = 2 + cachedCellSize(&b, cntNew[i]);

7299 if( szNew[i]+sz>usableSpace ) break;

7300 szNew[i] += sz;

7301 cntNew[i]++;

7302 if( !leafData ){

7303 if( cntNew[i]<b.nCell ){

7304 sz = 2 + cachedCellSize(&b, cntNew[i]);

7305 }else{

7306 sz = 0;

7307 }

7308 }

7309 szNew[i+1] -= sz;

7310 }

7311 if( cntNew[i]>=b.nCell ){

7312 k = i+1;

7313 }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){

7314 rc = SQLITE_CORRUPT_BKPT;

7315 goto balance_cleanup;

7316 }

7317 }

7318

7319 /*

7320 ** The packing computed by the previous block is biased toward the siblings

7321 ** on the left side (siblings with smaller keys). The left siblings are

7322 ** always nearly full, while the right-most sibling might be nearly empty.

7323 ** The next block of code attempts to adjust the packing of siblings to

7324 ** get a better balance.

7325 **

7326 ** This adjustment is more than an optimization. The packing above might

7327 ** be so out of balance as to be illegal. For example, the right-most

7328 ** sibling might be completely empty. This adjustment is not optional.

7329 */

7330 for(i=k-1; i>0; i--){

7331 int szRight = szNew[i]; /* Size of sibling on the right */

7332 int szLeft = szNew[i-1]; /* Size of sibling on the left */

7333 int r; /* Index of right-most cell in left sibling */

7334 int d; /* Index of first cell to the left of right sibling */

7335

7336 r = cntNew[i-1] - 1;

7337 d = r + 1 - leafData;

7338 (void)cachedCellSize(&b, d);

7339 do{

7340 assert( d<nMaxCells );

7341 assert( r<nMaxCells );

7342 (void)cachedCellSize(&b, r);

7343 if( szRight!=0

7344 && (bBulk \|\| szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+2)) ){

7345 break;

7346 }

7347 szRight += b.szCell[d] + 2;

7348 szLeft -= b.szCell[r] + 2;

7349 cntNew[i-1] = r;

7350 r--;

7351 d--;

7352 }while( r>=0 );

7353 szNew[i] = szRight;

7354 szNew[i-1] = szLeft;

7355 if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){

7356 rc = SQLITE_CORRUPT_BKPT;

7357 goto balance_cleanup;

7358 }

7359 }

7360

7361 /* Sanity check: For a non-corrupt database file one of the follwing

7362 ** must be true:

7363 ** (1) We found one or more cells (cntNew[0])>0), or

7364 ** (2) pPage is a virtual root page. A virtual root page is when

7365 ** the real root page is page 1 and we are the only child of

7366 ** that page.

7367 */

7368 assert( cntNew[0]>0 \|\| (pParent->pgno==1 && pParent->nCell==0) \|\| CORRUPT_DB);

7369 TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",

7370 apOld[0]->pgno, apOld[0]->nCell,

7371 nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,

7372 nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0

7373 ));

7374

7375 /*

7376 ** Allocate k new pages. Reuse old pages where possible.

7377 */

7378 pageFlags = apOld[0]->aData[0];

7379 for(i=0; i<k; i++){

7380 MemPage *pNew;

7381 if( i<nOld ){

7382 pNew = apNew[i] = apOld[i];

7383 apOld[i] = 0;

7384 rc = sqlite3PagerWrite(pNew->pDbPage);

7385 nNew++;

7386 if( rc ) goto balance_cleanup;

7387 }else{

7388 assert( i>0 );

7389 rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);

7390 if( rc ) goto balance_cleanup;

7391 zeroPage(pNew, pageFlags);

7392 apNew[i] = pNew;

7393 nNew++;

7394 cntOld[i] = b.nCell;

7395

7396 /* Set the pointer-map entry for the new sibling page. */

7397 if( ISAUTOVACUUM ){

7398 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);

7399 if( rc!=SQLITE_OK ){

7400 goto balance_cleanup;

7401 }

7402 }

7403 }

7404 }

7405

7406 /*

7407 ** Reassign page numbers so that the new pages are in ascending order.

7408 ** This helps to keep entries in the disk file in order so that a scan

7409 ** of the table is closer to a linear scan through the file. That in turn

7410 ** helps the operating system to deliver pages from the disk more rapidly.

7411 **

7412 ** An O(n^2) insertion sort algorithm is used, but since n is never more

7413 ** than (NB+2) (a small constant), that should not be a problem.

7414 **

7415 ** When NB==3, this one optimization makes the database about 25% faster

7416 ** for large insertions and deletions.

7417 */

7418 for(i=0; i<nNew; i++){

7419 aPgOrder[i] = aPgno[i] = apNew[i]->pgno;

7420 aPgFlags[i] = apNew[i]->pDbPage->flags;

7421 for(j=0; j<i; j++){

7422 if( aPgno[j]==aPgno[i] ){

7423 /* This branch is taken if the set of sibling pages somehow contains

7424 ** duplicate entries. This can happen if the database is corrupt.

7425 ** It would be simpler to detect this as part of the loop below, but

7426 ** we do the detection here in order to avoid populating the pager

7427 ** cache with two separate objects associated with the same

7428 ** page number. */

7429 assert( CORRUPT_DB );

7430 rc = SQLITE_CORRUPT_BKPT;

7431 goto balance_cleanup;

7432 }

7433 }

7434 }

7435 for(i=0; i<nNew; i++){

7436 int iBest = 0; /* aPgno[] index of page number to use */

7437 for(j=1; j<nNew; j++){

7438 if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;

7439 }

7440 pgno = aPgOrder[iBest];

7441 aPgOrder[iBest] = 0xffffffff;

7442 if( iBest!=i ){

7443 if( iBest>i ){

7444 sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);

7445 }

7446 sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);

7447 apNew[i]->pgno = pgno;

7448 }

7449 }

7450

7451 TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "

7452 "%d(%d nc=%d) %d(%d nc=%d)\n",

7453 apNew[0]->pgno, szNew[0], cntNew[0],

7454 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,

7455 nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,

7456 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,

7457 nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,

7458 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,

7459 nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,

7460 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,

7461 nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0

7462 ));

7463

7464 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

7465 put4byte(pRight, apNew[nNew-1]->pgno);

7466

7467 /* If the sibling pages are not leaves, ensure that the right-child pointer

7468 ** of the right-most new sibling page is set to the value that was

7469 ** originally in the same field of the right-most old sibling page. */

7470 if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){

7471 MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];

7472 memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);

7473 }

7474

7475 /* Make any required updates to pointer map entries associated with

7476 ** cells stored on sibling pages following the balance operation. Pointer

7477 ** map entries associated with divider cells are set by the insertCell()

7478 ** routine. The associated pointer map entries are:

7479 **

7480 ** a) if the cell contains a reference to an overflow chain, the

7481 ** entry associated with the first page in the overflow chain, and

7482 **

7483 ** b) if the sibling pages are not leaves, the child page associated

7484 ** with the cell.

7485 **

7486 ** If the sibling pages are not leaves, then the pointer map entry

7487 ** associated with the right-child of each sibling may also need to be

7488 ** updated. This happens below, after the sibling pages have been

7489 ** populated, not here.

7490 */

7491 if( ISAUTOVACUUM ){

7492 MemPage *pNew = apNew[0];

7493 u8 *aOld = pNew->aData;

7494 int cntOldNext = pNew->nCell + pNew->nOverflow;

7495 int usableSize = pBt->usableSize;

7496 int iNew = 0;

7497 int iOld = 0;

7498

7499 for(i=0; i<b.nCell; i++){

7500 u8 *pCell = b.apCell[i];

7501 if( i==cntOldNext ){

7502 MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];

7503 cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;

7504 aOld = pOld->aData;

7505 }

7506 if( i==cntNew[iNew] ){

7507 pNew = apNew[++iNew];

7508 if( !leafData ) continue;

7509 }

7510

7511 /* Cell pCell is destined for new sibling page pNew. Originally, it

7512 ** was either part of sibling page iOld (possibly an overflow cell),

7513 ** or else the divider cell to the left of sibling page iOld. So,

7514 ** if sibling page iOld had the same page number as pNew, and if

7515 ** pCell really was a part of sibling page iOld (not a divider or

7516 ** overflow cell), we can skip updating the pointer map entries. */

7517 if( iOld>=nNew

7518 \|\| pNew->pgno!=aPgno[iOld]

7519 \|\| !SQLITE_WITHIN(pCell,aOld,&aOld[usableSize])

7520 ){

7521 if( !leafCorrection ){

7522 ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);

7523 }

7524 if( cachedCellSize(&b,i)>pNew->minLocal ){

7525 ptrmapPutOvflPtr(pNew, pCell, &rc);

7526 }

7527 if( rc ) goto balance_cleanup;

7528 }

7529 }

7530 }

7531

7532 /* Insert new divider cells into pParent. */

7533 for(i=0; i<nNew-1; i++){

7534 u8 *pCell;

7535 u8 *pTemp;

7536 int sz;

7537 MemPage *pNew = apNew[i];

7538 j = cntNew[i];

7539

7540 assert( j<nMaxCells );

7541 assert( b.apCell[j]!=0 );

7542 pCell = b.apCell[j];

7543 sz = b.szCell[j] + leafCorrection;

7544 pTemp = &aOvflSpace[iOvflSpace];

7545 if( !pNew->leaf ){

7546 memcpy(&pNew->aData[8], pCell, 4);

7547 }else if( leafData ){

7548 /* If the tree is a leaf-data tree, and the siblings are leaves,

7549 ** then there is no divider cell in b.apCell[]. Instead, the divider

7550 ** cell consists of the integer key for the right-most cell of

7551 ** the sibling-page assembled above only.

7552 */

7553 CellInfo info;

7554 j--;

7555 pNew->xParseCell(pNew, b.apCell[j], &info);

7556 pCell = pTemp;

7557 sz = 4 + putVarint(&pCell[4], info.nKey);

7558 pTemp = 0;

7559 }else{

7560 pCell -= 4;

7561 /* Obscure case for non-leaf-data trees: If the cell at pCell was

7562 ** previously stored on a leaf node, and its reported size was 4

7563 ** bytes, then it may actually be smaller than this

7564 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of

7565 ** any cell). But it is important to pass the correct size to

7566 ** insertCell(), so reparse the cell now.

7567 **

7568 ** Note that this can never happen in an SQLite data file, as all

7569 ** cells are at least 4 bytes. It only happens in b-trees used

7570 ** to evaluate "IN (SELECT ...)" and similar clauses.

7571 */

7572 if( b.szCell[j]==4 ){

7573 assert(leafCorrection==4);

7574 sz = pParent->xCellSize(pParent, pCell);

7575 }

7576 }

7577 iOvflSpace += sz;

7578 assert( sz<=pBt->maxLocal+23 );

7579 assert( iOvflSpace <= (int)pBt->pageSize );

7580 insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);

7581 if( rc!=SQLITE_OK ) goto balance_cleanup;

7582 assert( sqlite3PagerIswriteable(pParent->pDbPage) );

7583 }

7584

7585 /* Now update the actual sibling pages. The order in which they are updated

7586 ** is important, as this code needs to avoid disrupting any page from which

7587 ** cells may still to be read. In practice, this means:

7588 **

7589 ** (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])

7590 ** then it is not safe to update page apNew[iPg] until after

7591 ** the left-hand sibling apNew[iPg-1] has been updated.

7592 **

7593 ** (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])

7594 ** then it is not safe to update page apNew[iPg] until after

7595 ** the right-hand sibling apNew[iPg+1] has been updated.

7596 **

7597 ** If neither of the above apply, the page is safe to update.

7598 **

7599 ** The iPg value in the following loop starts at nNew-1 goes down

7600 ** to 0, then back up to nNew-1 again, thus making two passes over

7601 ** the pages. On the initial downward pass, only condition (1) above

7602 ** needs to be tested because (2) will always be true from the previous

7603 ** step. On the upward pass, both conditions are always true, so the

7604 ** upwards pass simply processes pages that were missed on the downward

7605 ** pass.

7606 */

7607 for(i=1-nNew; i<nNew; i++){

7608 int iPg = i<0 ? -i : i;

7609 assert( iPg>=0 && iPg<nNew );

7610 if( abDone[iPg] ) continue; /* Skip pages already processed */

7611 if( i>=0 /* On the upwards pass, or... */

7612 \|\| cntOld[iPg-1]>=cntNew[iPg-1] /* Condition (1) is true */

7613 ){

7614 int iNew;

7615 int iOld;

7616 int nNewCell;

7617

7618 /* Verify condition (1): If cells are moving left, update iPg

7619 ** only after iPg-1 has already been updated. */

7620 assert( iPg==0 \|\| cntOld[iPg-1]>=cntNew[iPg-1] \|\| abDone[iPg-1] );

7621

7622 /* Verify condition (2): If cells are moving right, update iPg

7623 ** only after iPg+1 has already been updated. */

7624 assert( cntNew[iPg]>=cntOld[iPg] \|\| abDone[iPg+1] );

7625

7626 if( iPg==0 ){

7627 iNew = iOld = 0;

7628 nNewCell = cntNew[0];

7629 }else{

7630 iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;

7631 iNew = cntNew[iPg-1] + !leafData;

7632 nNewCell = cntNew[iPg] - iNew;

7633 }

7634

7635 rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);

7636 if( rc ) goto balance_cleanup;

7637 abDone[iPg]++;

7638 apNew[iPg]->nFree = usableSpace-szNew[iPg];

7639 assert( apNew[iPg]->nOverflow==0 );

7640 assert( apNew[iPg]->nCell==nNewCell );

7641 }

7642 }

7643

7644 /* All pages have been processed exactly once */

7645 assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );

7646

7647 assert( nOld>0 );

7648 assert( nNew>0 );

7649

7650 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){

7651 /* The root page of the b-tree now contains no cells. The only sibling

7652 ** page is the right-child of the parent. Copy the contents of the

7653 ** child page into the parent, decreasing the overall height of the

7654 ** b-tree structure by one. This is described as the "balance-shallower"

7655 ** sub-algorithm in some documentation.

7656 **

7657 ** If this is an auto-vacuum database, the call to copyNodeContent()

7658 ** sets all pointer-map entries corresponding to database image pages

7659 ** for which the pointer is stored within the content being copied.

7660 **

7661 ** It is critical that the child page be defragmented before being

7662 ** copied into the parent, because if the parent is page 1 then it will

7663 ** by smaller than the child due to the database header, and so all the

7664 ** free space needs to be up front.

7665 */

7666 assert( nNew==1 \|\| CORRUPT_DB );

7667 rc = defragmentPage(apNew[0]);

7668 testcase( rc!=SQLITE_OK );

7669 assert( apNew[0]->nFree ==

7670 (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)

7671 \|\| rc!=SQLITE_OK

7672 );

7673 copyNodeContent(apNew[0], pParent, &rc);

7674 freePage(apNew[0], &rc);

7675 }else if( ISAUTOVACUUM && !leafCorrection ){

7676 /* Fix the pointer map entries associated with the right-child of each

7677 ** sibling page. All other pointer map entries have already been taken

7678 ** care of. */

7679 for(i=0; i<nNew; i++){

7680 u32 key = get4byte(&apNew[i]->aData[8]);

7681 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);

7682 }

7683 }

7684

7685 assert( pParent->isInit );

7686 TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",

7687 nOld, nNew, b.nCell));

7688

7689 /* Free any old pages that were not reused as new pages.

7690 */

7691 for(i=nNew; i<nOld; i++){

7692 freePage(apOld[i], &rc);

7693 }

7694

7695 #if 0

7696 if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){

7697 /* The ptrmapCheckPages() contains assert() statements that verify that

7698 ** all pointer map pages are set correctly. This is helpful while

7699 ** debugging. This is usually disabled because a corrupt database may

7700 ** cause an assert() statement to fail. */

7701 ptrmapCheckPages(apNew, nNew);

7702 ptrmapCheckPages(&pParent, 1);

7703 }

7704 #endif

7705

7706 /*

7707 ** Cleanup before returning.

7708 */

7709 balance_cleanup:

7710 sqlite3ScratchFree(b.apCell);

7711 for(i=0; i<nOld; i++){

7712 releasePage(apOld[i]);

7713 }

7714 for(i=0; i<nNew; i++){

7715 releasePage(apNew[i]);

7716 }

7717

7718 return rc;

7719 }

7720

7721

7722 /*

7723 ** This function is called when the root page of a b-tree structure is

7724 ** overfull (has one or more overflow pages).

7725 **

7726 ** A new child page is allocated and the contents of the current root

7727 ** page, including overflow cells, are copied into the child. The root

7728 ** page is then overwritten to make it an empty page with the right-child

7729 ** pointer pointing to the new page.

7730 **

7731 ** Before returning, all pointer-map entries corresponding to pages

7732 ** that the new child-page now contains pointers to are updated. The

7733 ** entry corresponding to the new right-child pointer of the root

7734 ** page is also updated.

7735 **

7736 ** If successful, *ppChild is set to contain a reference to the child

7737 ** page and SQLITE_OK is returned. In this case the caller is required

7738 ** to call releasePage() on *ppChild exactly once. If an error occurs,

7739 ** an error code is returned and *ppChild is set to 0.

7740 */

7741 static int balance_deeper(MemPage pRoot, MemPage *ppChild){

7742 int rc; /* Return value from subprocedures */

7743 MemPage pChild = 0; / Pointer to a new child page */

7744 Pgno pgnoChild = 0; /* Page number of the new child page */

7745 BtShared pBt = pRoot->pBt; / The BTree */

7746

7747 assert( pRoot->nOverflow>0 );

7748 assert( sqlite3_mutex_held(pBt->mutex) );

7749

7750 /* Make pRoot, the root page of the b-tree, writable. Allocate a new

7751 ** page that will become the new right-child of pPage. Copy the contents

7752 ** of the node stored on pRoot into the new child page.

7753 */

7754 rc = sqlite3PagerWrite(pRoot->pDbPage);

7755 if( rc==SQLITE_OK ){

7756 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);

7757 copyNodeContent(pRoot, pChild, &rc);

7758 if( ISAUTOVACUUM ){

7759 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);

7760 }

7761 }

7762 if( rc ){

7763 *ppChild = 0;

7764 releasePage(pChild);

7765 return rc;

7766 }

7767 assert( sqlite3PagerIswriteable(pChild->pDbPage) );

7768 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );

7769 assert( pChild->nCell==pRoot->nCell );

7770

7771 TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));

7772

7773 /* Copy the overflow cells from pRoot to pChild */

7774 memcpy(pChild->aiOvfl, pRoot->aiOvfl,

7775 pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));

7776 memcpy(pChild->apOvfl, pRoot->apOvfl,

7777 pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));

7778 pChild->nOverflow = pRoot->nOverflow;

7779

7780 /* Zero the contents of pRoot. Then install pChild as the right-child. */

7781 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);

7782 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);

7783

7784 *ppChild = pChild;

7785 return SQLITE_OK;

7786 }

7787

7788 /*

7789 ** The page that pCur currently points to has just been modified in

7790 ** some way. This function figures out if this modification means the

7791 ** tree needs to be balanced, and if so calls the appropriate balancing

7792 ** routine. Balancing routines are:

7793 **

7794 ** balance_quick()

7795 ** balance_deeper()

7796 ** balance_nonroot()

7797 */

7798 static int balance(BtCursor *pCur){

7799 int rc = SQLITE_OK;

7800 const int nMin = pCur->pBt->usableSize * 2 / 3;

7801 u8 aBalanceQuickSpace[13];

7802 u8 *pFree = 0;

7803

7804 TESTONLY( int balance_quick_called = 0 );

7805 TESTONLY( int balance_deeper_called = 0 );

7806

7807 do {

7808 int iPage = pCur->iPage;

7809 MemPage *pPage = pCur->apPage[iPage];

7810

7811 if( iPage==0 ){

7812 if( pPage->nOverflow ){

7813 /* The root page of the b-tree is overfull. In this case call the

7814 ** balance_deeper() function to create a new child for the root-page

7815 ** and copy the current contents of the root-page to it. The

7816 ** next iteration of the do-loop will balance the child page.

7817 */

7818 assert( (balance_deeper_called++)==0 );

7819 rc = balance_deeper(pPage, &pCur->apPage[1]);

7820 if( rc==SQLITE_OK ){

7821 pCur->iPage = 1;

7822 pCur->aiIdx[0] = 0;

7823 pCur->aiIdx[1] = 0;

7824 assert( pCur->apPage[1]->nOverflow );

7825 }

7826 }else{

7827 break;

7828 }

7829 }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){

7830 break;

7831 }else{

7832 MemPage * const pParent = pCur->apPage[iPage-1];

7833 int const iIdx = pCur->aiIdx[iPage-1];

7834

7835 rc = sqlite3PagerWrite(pParent->pDbPage);

7836 if( rc==SQLITE_OK ){

7837 #ifndef SQLITE_OMIT_QUICKBALANCE

7838 if( pPage->intKeyLeaf

7839 && pPage->nOverflow==1

7840 && pPage->aiOvfl[0]==pPage->nCell

7841 && pParent->pgno!=1

7842 && pParent->nCell==iIdx

7843 ){

7844 /* Call balance_quick() to create a new sibling of pPage on which

7845 ** to store the overflow cell. balance_quick() inserts a new cell

7846 ** into pParent, which may cause pParent overflow. If this

7847 ** happens, the next iteration of the do-loop will balance pParent

7848 ** use either balance_nonroot() or balance_deeper(). Until this

7849 ** happens, the overflow cell is stored in the aBalanceQuickSpace[]

7850 ** buffer.

7851 **

7852 ** The purpose of the following assert() is to check that only a

7853 ** single call to balance_quick() is made for each call to this

7854 ** function. If this were not verified, a subtle bug involving reuse

7855 ** of the aBalanceQuickSpace[] might sneak in.

7856 */

7857 assert( (balance_quick_called++)==0 );

7858 rc = balance_quick(pParent, pPage, aBalanceQuickSpace);

7859 }else

7860 #endif

7861 {

7862 /* In this case, call balance_nonroot() to redistribute cells

7863 ** between pPage and up to 2 of its sibling pages. This involves

7864 ** modifying the contents of pParent, which may cause pParent to

7865 ** become overfull or underfull. The next iteration of the do-loop

7866 ** will balance the parent page to correct this.

7867 **

7868 ** If the parent page becomes overfull, the overflow cell or cells

7869 ** are stored in the pSpace buffer allocated immediately below.

7870 ** A subsequent iteration of the do-loop will deal with this by

7871 ** calling balance_nonroot() (balance_deeper() may be called first,

7872 ** but it doesn't deal with overflow cells - just moves them to a

7873 ** different page). Once this subsequent call to balance_nonroot()

7874 ** has completed, it is safe to release the pSpace buffer used by

7875 ** the previous call, as the overflow cell data will have been

7876 ** copied either into the body of a database page or into the new

7877 ** pSpace buffer passed to the latter call to balance_nonroot().

7878 */

7879 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);

7880 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,

7881 pCur->hints&BTREE_BULKLOAD);

7882 if( pFree ){

7883 /* If pFree is not NULL, it points to the pSpace buffer used

7884 ** by a previous call to balance_nonroot(). Its contents are

7885 ** now stored either on real database pages or within the

7886 ** new pSpace buffer, so it may be safely freed here. */

7887 sqlite3PageFree(pFree);

7888 }

7889

7890 /* The pSpace buffer will be freed after the next call to

7891 ** balance_nonroot(), or just before this function returns, whichever

7892 ** comes first. */

7893 pFree = pSpace;

7894 }

7895 }

7896

7897 pPage->nOverflow = 0;

7898

7899 /* The next iteration of the do-loop balances the parent page. */

7900 releasePage(pPage);

7901 pCur->iPage--;

7902 assert( pCur->iPage>=0 );

7903 }

7904 }while( rc==SQLITE_OK );

7905

7906 if( pFree ){

7907 sqlite3PageFree(pFree);

7908 }

7909 return rc;

7910 }

7911

7912

7913 /*

7914 ** Insert a new record into the BTree. The key is given by (pKey,nKey)

7915 ** and the data is given by (pData,nData). The cursor is used only to

7916 ** define what table the record should be inserted into. The cursor

7917 ** is left pointing at a random location.

7918 **

7919 ** For an INTKEY table, only the nKey value of the key is used. pKey is

7920 ** ignored. For a ZERODATA table, the pData and nData are both ignored.

7921 **

7922 ** If the seekResult parameter is non-zero, then a successful call to

7923 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already

7924 ** been performed. seekResult is the search result returned (a negative

7925 ** number if pCur points at an entry that is smaller than (pKey, nKey), or

7926 ** a positive value if pCur points at an entry that is larger than

7927 ** (pKey, nKey)).

7928 **

7929 ** If the seekResult parameter is non-zero, then the caller guarantees that

7930 ** cursor pCur is pointing at the existing copy of a row that is to be

7931 ** overwritten. If the seekResult parameter is 0, then cursor pCur may

7932 ** point to any entry or to no entry at all and so this function has to seek

7933 ** the cursor before the new key can be inserted.

7934 */

7935 int sqlite3BtreeInsert(

7936 BtCursor pCur, / Insert data into the table of this cursor */

7937 const void pKey, i64 nKey, / The key of the new record */

7938 const void pData, int nData, / The data of the new record */

7939 int nZero, /* Number of extra 0 bytes to append to data */

7940 int appendBias, /* True if this is likely an append */

7941 int seekResult /* Result of prior MovetoUnpacked() call */

7942 ){

7943 int rc;

7944 int loc = seekResult; /* -1: before desired location +1: after */

7945 int szNew = 0;

7946 int idx;

7947 MemPage *pPage;

7948 Btree *p = pCur->pBtree;

7949 BtShared *pBt = p->pBt;

7950 unsigned char *oldCell;

7951 unsigned char *newCell = 0;

7952

7953 if( pCur->eState==CURSOR_FAULT ){

7954 assert( pCur->skipNext!=SQLITE_OK );

7955 return pCur->skipNext;

7956 }

7957

7958 assert( cursorHoldsMutex(pCur) );

7959 assert( (pCur->curFlags & BTCF_WriteFlag)!=0

7960 && pBt->inTransaction==TRANS_WRITE

7961 && (pBt->btsFlags & BTS_READ_ONLY)==0 );

7962 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );

7963

7964 /* Assert that the caller has been consistent. If this cursor was opened

7965 ** expecting an index b-tree, then the caller should be inserting blob

7966 ** keys with no associated data. If the cursor was opened expecting an

7967 ** intkey table, the caller should be inserting integer keys with a

7968 ** blob of associated data. */

7969 assert( (pKey==0)==(pCur->pKeyInfo==0) );

7970

7971 /* Save the positions of any other cursors open on this table.

7972 **

7973 ** In some cases, the call to btreeMoveto() below is a no-op. For

7974 ** example, when inserting data into a table with auto-generated integer

7975 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the

7976 ** integer key to use. It then calls this function to actually insert the

7977 ** data into the intkey B-Tree. In this case btreeMoveto() recognizes

7978 ** that the cursor is already where it needs to be and returns without

7979 ** doing any work. To avoid thwarting these optimizations, it is important

7980 ** not to clear the cursor here.

7981 */

7982 if( pCur->curFlags & BTCF_Multiple ){

7983 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);

7984 if( rc ) return rc;

7985 }

7986

7987 if( pCur->pKeyInfo==0 ){

7988 assert( pKey==0 );

7989 /* If this is an insert into a table b-tree, invalidate any incrblob

7990 ** cursors open on the row being replaced */

7991 invalidateIncrblobCursors(p, nKey, 0);

7992

7993 /* If the cursor is currently on the last row and we are appending a

7994 ** new row onto the end, set the "loc" to avoid an unnecessary

7995 ** btreeMoveto() call */

7996 if( (pCur->curFlags&BTCF_ValidNKey)!=0 && nKey>0

7997 && pCur->info.nKey==nKey-1 ){

7998 loc = -1;

7999 }else if( loc==0 ){

8000 rc = sqlite3BtreeMovetoUnpacked(pCur, 0, nKey, appendBias, &loc);

8001 if( rc ) return rc;

8002 }

8003 }else if( loc==0 ){

8004 rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);

8005 if( rc ) return rc;

8006 }

8007 assert( pCur->eState==CURSOR_VALID \|\| (pCur->eState==CURSOR_INVALID && loc) );

8008

8009 pPage = pCur->apPage[pCur->iPage];

8010 assert( pPage->intKey \|\| nKey>=0 );

8011 assert( pPage->leaf \|\| !pPage->intKey );

8012

8013 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",

8014 pCur->pgnoRoot, nKey, nData, pPage->pgno,

8015 loc==0 ? "overwrite" : "new entry"));

8016 assert( pPage->isInit );

8017 newCell = pBt->pTmpSpace;

8018 assert( newCell!=0 );

8019 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);

8020 if( rc ) goto end_insert;

8021 assert( szNew==pPage->xCellSize(pPage, newCell) );

8022 assert( szNew <= MX_CELL_SIZE(pBt) );

8023 idx = pCur->aiIdx[pCur->iPage];

8024 if( loc==0 ){

8025 u16 szOld;

8026 assert( idx<pPage->nCell );

8027 rc = sqlite3PagerWrite(pPage->pDbPage);

8028 if( rc ){

8029 goto end_insert;

8030 }

8031 oldCell = findCell(pPage, idx);

8032 if( !pPage->leaf ){

8033 memcpy(newCell, oldCell, 4);

8034 }

8035 rc = clearCell(pPage, oldCell, &szOld);

8036 dropCell(pPage, idx, szOld, &rc);

8037 if( rc ) goto end_insert;

8038 }else if( loc<0 && pPage->nCell>0 ){

8039 assert( pPage->leaf );

8040 idx = ++pCur->aiIdx[pCur->iPage];

8041 }else{

8042 assert( pPage->leaf );

8043 }

8044 insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);

8045 assert( rc!=SQLITE_OK \|\| pPage->nCell>0 \|\| pPage->nOverflow>0 );

8046

8047 /* If no error has occurred and pPage has an overflow cell, call balance()

8048 ** to redistribute the cells within the tree. Since balance() may move

8049 ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey

8050 ** variables.

8051 **

8052 ** Previous versions of SQLite called moveToRoot() to move the cursor

8053 ** back to the root page as balance() used to invalidate the contents

8054 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,

8055 ** set the cursor state to "invalid". This makes common insert operations

8056 ** slightly faster.

8057 **

8058 ** There is a subtle but important optimization here too. When inserting

8059 ** multiple records into an intkey b-tree using a single cursor (as can

8060 ** happen while processing an "INSERT INTO ... SELECT" statement), it

8061 ** is advantageous to leave the cursor pointing to the last entry in

8062 ** the b-tree if possible. If the cursor is left pointing to the last

8063 ** entry in the table, and the next row inserted has an integer key

8064 ** larger than the largest existing key, it is possible to insert the

8065 ** row without seeking the cursor. This can be a big performance boost.

8066 */

8067 pCur->info.nSize = 0;

8068 if( rc==SQLITE_OK && pPage->nOverflow ){

8069 pCur->curFlags &= ~(BTCF_ValidNKey);

8070 rc = balance(pCur);

8071

8072 /* Must make sure nOverflow is reset to zero even if the balance()

8073 ** fails. Internal data structure corruption will result otherwise.

8074 ** Also, set the cursor state to invalid. This stops saveCursorPosition()

8075 ** from trying to save the current position of the cursor. */

8076 pCur->apPage[pCur->iPage]->nOverflow = 0;

8077 pCur->eState = CURSOR_INVALID;

8078 }

8079 assert( pCur->apPage[pCur->iPage]->nOverflow==0 );

8080

8081 end_insert:

8082 return rc;

8083 }

8084

8085 /*

8086 ** Delete the entry that the cursor is pointing to.

8087 **

8088 ** If the second parameter is zero, then the cursor is left pointing at an

8089 ** arbitrary location after the delete. If it is non-zero, then the cursor

8090 ** is left in a state such that the next call to BtreeNext() or BtreePrev()

8091 ** moves it to the same row as it would if the call to BtreeDelete() had

8092 ** been omitted.

8093 */

8094 int sqlite3BtreeDelete(BtCursor *pCur, int bPreserve){

8095 Btree *p = pCur->pBtree;

8096 BtShared *pBt = p->pBt;

8097 int rc; /* Return code */

8098 MemPage pPage; / Page to delete cell from */

8099 unsigned char pCell; / Pointer to cell to delete */

8100 int iCellIdx; /* Index of cell to delete */

8101 int iCellDepth; /* Depth of node containing pCell */

8102 u16 szCell; /* Size of the cell being deleted */

8103 int bSkipnext = 0; /* Leaf cursor in SKIPNEXT state */

8104

8105 assert( cursorHoldsMutex(pCur) );

8106 assert( pBt->inTransaction==TRANS_WRITE );

8107 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

8108 assert( pCur->curFlags & BTCF_WriteFlag );

8109 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );

8110 assert( !hasReadConflicts(p, pCur->pgnoRoot) );

8111 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );

8112 assert( pCur->eState==CURSOR_VALID );

8113

8114 iCellDepth = pCur->iPage;

8115 iCellIdx = pCur->aiIdx[iCellDepth];

8116 pPage = pCur->apPage[iCellDepth];

8117 pCell = findCell(pPage, iCellIdx);

8118

8119 /* If the page containing the entry to delete is not a leaf page, move

8120 ** the cursor to the largest entry in the tree that is smaller than

8121 ** the entry being deleted. This cell will replace the cell being deleted

8122 ** from the internal node. The 'previous' entry is used for this instead

8123 ** of the 'next' entry, as the previous entry is always a part of the

8124 ** sub-tree headed by the child page of the cell being deleted. This makes

8125 ** balancing the tree following the delete operation easier. */

8126 if( !pPage->leaf ){

8127 int notUsed = 0;

8128 rc = sqlite3BtreePrevious(pCur, &notUsed);

8129 if( rc ) return rc;

8130 }

8131

8132 /* Save the positions of any other cursors open on this table before

8133 ** making any modifications. */

8134 if( pCur->curFlags & BTCF_Multiple ){

8135 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);

8136 if( rc ) return rc;

8137 }

8138

8139 /* If this is a delete operation to remove a row from a table b-tree,

8140 ** invalidate any incrblob cursors open on the row being deleted. */

8141 if( pCur->pKeyInfo==0 ){

8142 invalidateIncrblobCursors(p, pCur->info.nKey, 0);

8143 }

8144

8145 /* If the bPreserve flag is set to true, then the cursor position must

8146 ** be preserved following this delete operation. If the current delete

8147 ** will cause a b-tree rebalance, then this is done by saving the cursor

8148 ** key and leaving the cursor in CURSOR_REQUIRESEEK state before

8149 ** returning.

8150 **

8151 ** Or, if the current delete will not cause a rebalance, then the cursor

8152 ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately

8153 ** before or after the deleted entry. In this case set bSkipnext to true. */

8154 if( bPreserve ){

8155 if( !pPage->leaf

8156 \|\| (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)

8157 ){

8158 /* A b-tree rebalance will be required after deleting this entry.

8159 ** Save the cursor key. */

8160 rc = saveCursorKey(pCur);

8161 if( rc ) return rc;

8162 }else{

8163 bSkipnext = 1;

8164 }

8165 }

8166

8167 /* Make the page containing the entry to be deleted writable. Then free any

8168 ** overflow pages associated with the entry and finally remove the cell

8169 ** itself from within the page. */

8170 rc = sqlite3PagerWrite(pPage->pDbPage);

8171 if( rc ) return rc;

8172 rc = clearCell(pPage, pCell, &szCell);

8173 dropCell(pPage, iCellIdx, szCell, &rc);

8174 if( rc ) return rc;

8175

8176 /* If the cell deleted was not located on a leaf page, then the cursor

8177 ** is currently pointing to the largest entry in the sub-tree headed

8178 ** by the child-page of the cell that was just deleted from an internal

8179 ** node. The cell from the leaf node needs to be moved to the internal

8180 ** node to replace the deleted cell. */

8181 if( !pPage->leaf ){

8182 MemPage *pLeaf = pCur->apPage[pCur->iPage];

8183 int nCell;

8184 Pgno n = pCur->apPage[iCellDepth+1]->pgno;

8185 unsigned char *pTmp;

8186

8187 pCell = findCell(pLeaf, pLeaf->nCell-1);

8188 if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;

8189 nCell = pLeaf->xCellSize(pLeaf, pCell);

8190 assert( MX_CELL_SIZE(pBt) >= nCell );

8191 pTmp = pBt->pTmpSpace;

8192 assert( pTmp!=0 );

8193 rc = sqlite3PagerWrite(pLeaf->pDbPage);

8194 insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);

8195 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);

8196 if( rc ) return rc;

8197 }

8198

8199 /* Balance the tree. If the entry deleted was located on a leaf page,

8200 ** then the cursor still points to that page. In this case the first

8201 ** call to balance() repairs the tree, and the if(...) condition is

8202 ** never true.

8203 **

8204 ** Otherwise, if the entry deleted was on an internal node page, then

8205 ** pCur is pointing to the leaf page from which a cell was removed to

8206 ** replace the cell deleted from the internal node. This is slightly

8207 ** tricky as the leaf node may be underfull, and the internal node may

8208 ** be either under or overfull. In this case run the balancing algorithm

8209 ** on the leaf node first. If the balance proceeds far enough up the

8210 ** tree that we can be sure that any problem in the internal node has

8211 ** been corrected, so be it. Otherwise, after balancing the leaf node,

8212 ** walk the cursor up the tree to the internal node and balance it as

8213 ** well. */

8214 rc = balance(pCur);

8215 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){

8216 while( pCur->iPage>iCellDepth ){

8217 releasePage(pCur->apPage[pCur->iPage--]);

8218 }

8219 rc = balance(pCur);

8220 }

8221

8222 if( rc==SQLITE_OK ){

8223 if( bSkipnext ){

8224 assert( bPreserve && (pCur->iPage==iCellDepth \|\| CORRUPT_DB) );

8225 assert( pPage==pCur->apPage[pCur->iPage] );

8226 assert( (pPage->nCell>0 \|\| CORRUPT_DB) && iCellIdx<=pPage->nCell );

8227 pCur->eState = CURSOR_SKIPNEXT;

8228 if( iCellIdx>=pPage->nCell ){

8229 pCur->skipNext = -1;

8230 pCur->aiIdx[iCellDepth] = pPage->nCell-1;

8231 }else{

8232 pCur->skipNext = 1;

8233 }

8234 }else{

8235 rc = moveToRoot(pCur);

8236 if( bPreserve ){

8237 pCur->eState = CURSOR_REQUIRESEEK;

8238 }

8239 }

8240 }

8241 return rc;

8242 }

8243

8244 /*

8245 ** Create a new BTree table. Write into *piTable the page

8246 ** number for the root page of the new table.

8247 **

8248 ** The type of type is determined by the flags parameter. Only the

8249 ** following values of flags are currently in use. Other values for

8250 ** flags might not work:

8251 **

8252 ** BTREE_INTKEY\|BTREE_LEAFDATA Used for SQL tables with rowid keys

8253 ** BTREE_ZERODATA Used for SQL indices

8254 */

8255 static int btreeCreateTable(Btree p, int piTable, int createTabFlags){

8256 BtShared *pBt = p->pBt;

8257 MemPage *pRoot;

8258 Pgno pgnoRoot;

8259 int rc;

8260 int ptfFlags; /* Page-type flage for the root page of new table */

8261

8262 assert( sqlite3BtreeHoldsMutex(p) );

8263 assert( pBt->inTransaction==TRANS_WRITE );

8264 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

8265

8266 #ifdef SQLITE_OMIT_AUTOVACUUM

8267 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);

8268 if( rc ){

8269 return rc;

8270 }

8271 #else

8272 if( pBt->autoVacuum ){

8273 Pgno pgnoMove; /* Move a page here to make room for the root-page */

8274 MemPage pPageMove; / The page to move to. */

8275

8276 /* Creating a new table may probably require moving an existing database

8277 ** to make room for the new tables root page. In case this page turns

8278 ** out to be an overflow page, delete all overflow page-map caches

8279 ** held by open cursors.

8280 */

8281 invalidateAllOverflowCache(pBt);

8282

8283 /* Read the value of meta[3] from the database to determine where the

8284 ** root page of the new table should go. meta[3] is the largest root-page

8285 ** created so far, so the new root-page is (meta[3]+1).

8286 */

8287 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);

8288 pgnoRoot++;

8289

8290 /* The new root-page may not be allocated on a pointer-map page, or the

8291 ** PENDING_BYTE page.

8292 */

8293 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) \|\|

8294 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){

8295 pgnoRoot++;

8296 }

8297 assert( pgnoRoot>=3 \|\| CORRUPT_DB );

8298 testcase( pgnoRoot<3 );

8299

8300 /* Allocate a page. The page that currently resides at pgnoRoot will

8301 ** be moved to the allocated page (unless the allocated page happens

8302 ** to reside at pgnoRoot).

8303 */

8304 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);

8305 if( rc!=SQLITE_OK ){

8306 return rc;

8307 }

8308

8309 if( pgnoMove!=pgnoRoot ){

8310 /* pgnoRoot is the page that will be used for the root-page of

8311 ** the new table (assuming an error did not occur). But we were

8312 ** allocated pgnoMove. If required (i.e. if it was not allocated

8313 ** by extending the file), the current page at position pgnoMove

8314 ** is already journaled.

8315 */

8316 u8 eType = 0;

8317 Pgno iPtrPage = 0;

8318

8319 /* Save the positions of any open cursors. This is required in

8320 ** case they are holding a reference to an xFetch reference

8321 ** corresponding to page pgnoRoot. */

8322 rc = saveAllCursors(pBt, 0, 0);

8323 releasePage(pPageMove);

8324 if( rc!=SQLITE_OK ){

8325 return rc;

8326 }

8327

8328 /* Move the page currently at pgnoRoot to pgnoMove. */

8329 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);

8330 if( rc!=SQLITE_OK ){

8331 return rc;

8332 }

8333 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);

8334 if( eType==PTRMAP_ROOTPAGE \|\| eType==PTRMAP_FREEPAGE ){

8335 rc = SQLITE_CORRUPT_BKPT;

8336 }

8337 if( rc!=SQLITE_OK ){

8338 releasePage(pRoot);

8339 return rc;

8340 }

8341 assert( eType!=PTRMAP_ROOTPAGE );

8342 assert( eType!=PTRMAP_FREEPAGE );

8343 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);

8344 releasePage(pRoot);

8345

8346 /* Obtain the page at pgnoRoot */

8347 if( rc!=SQLITE_OK ){

8348 return rc;

8349 }

8350 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);

8351 if( rc!=SQLITE_OK ){

8352 return rc;

8353 }

8354 rc = sqlite3PagerWrite(pRoot->pDbPage);

8355 if( rc!=SQLITE_OK ){

8356 releasePage(pRoot);

8357 return rc;

8358 }

8359 }else{

8360 pRoot = pPageMove;

8361 }

8362

8363 /* Update the pointer-map and meta-data with the new root-page number. */

8364 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);

8365 if( rc ){

8366 releasePage(pRoot);

8367 return rc;

8368 }

8369

8370 /* When the new root page was allocated, page 1 was made writable in

8371 ** order either to increase the database filesize, or to decrement the

8372 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail.

8373 */

8374 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );

8375 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);

8376 if( NEVER(rc) ){

8377 releasePage(pRoot);

8378 return rc;

8379 }

8380

8381 }else{

8382 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);

8383 if( rc ) return rc;

8384 }

8385 #endif

8386 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );

8387 if( createTabFlags & BTREE_INTKEY ){

8388 ptfFlags = PTF_INTKEY \| PTF_LEAFDATA \| PTF_LEAF;

8389 }else{

8390 ptfFlags = PTF_ZERODATA \| PTF_LEAF;

8391 }

8392 zeroPage(pRoot, ptfFlags);

8393 sqlite3PagerUnref(pRoot->pDbPage);

8394 assert( (pBt->openFlags & BTREE_SINGLE)==0 \|\| pgnoRoot==2 );

8395 *piTable = (int)pgnoRoot;

8396 return SQLITE_OK;

8397 }

8398 int sqlite3BtreeCreateTable(Btree p, int piTable, int flags){

8399 int rc;

8400 sqlite3BtreeEnter(p);

8401 rc = btreeCreateTable(p, piTable, flags);

8402 sqlite3BtreeLeave(p);

8403 return rc;

8404 }

8405

8406 /*

8407 ** Erase the given database page and all its children. Return

8408 ** the page to the freelist.

8409 */

8410 static int clearDatabasePage(

8411 BtShared pBt, / The BTree that contains the table */

8412 Pgno pgno, /* Page number to clear */

8413 int freePageFlag, /* Deallocate page if true */

8414 int pnChange / Add number of Cells freed to this counter */

8415 ){

8416 MemPage *pPage;

8417 int rc;

8418 unsigned char *pCell;

8419 int i;

8420 int hdr;

8421 u16 szCell;

8422

8423 assert( sqlite3_mutex_held(pBt->mutex) );

8424 if( pgno>btreePagecount(pBt) ){

8425 return SQLITE_CORRUPT_BKPT;

8426 }

8427 rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);

8428 if( rc ) return rc;

8429 if( pPage->bBusy ){

8430 rc = SQLITE_CORRUPT_BKPT;

8431 goto cleardatabasepage_out;

8432 }

8433 pPage->bBusy = 1;

8434 hdr = pPage->hdrOffset;

8435 for(i=0; i<pPage->nCell; i++){

8436 pCell = findCell(pPage, i);

8437 if( !pPage->leaf ){

8438 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);

8439 if( rc ) goto cleardatabasepage_out;

8440 }

8441 rc = clearCell(pPage, pCell, &szCell);

8442 if( rc ) goto cleardatabasepage_out;

8443 }

8444 if( !pPage->leaf ){

8445 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);

8446 if( rc ) goto cleardatabasepage_out;

8447 }else if( pnChange ){

8448 assert( pPage->intKey \|\| CORRUPT_DB );

8449 testcase( !pPage->intKey );

8450 *pnChange += pPage->nCell;

8451 }

8452 if( freePageFlag ){

8453 freePage(pPage, &rc);

8454 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){

8455 zeroPage(pPage, pPage->aData[hdr] \| PTF_LEAF);

8456 }

8457

8458 cleardatabasepage_out:

8459 pPage->bBusy = 0;

8460 releasePage(pPage);

8461 return rc;

8462 }

8463

8464 /*

8465 ** Delete all information from a single table in the database. iTable is

8466 ** the page number of the root of the table. After this routine returns,

8467 ** the root page is empty, but still exists.

8468 **

8469 ** This routine will fail with SQLITE_LOCKED if there are any open

8470 ** read cursors on the table. Open write cursors are moved to the

8471 ** root of the table.

8472 **

8473 ** If pnChange is not NULL, then table iTable must be an intkey table. The

8474 ** integer value pointed to by pnChange is incremented by the number of

8475 ** entries in the table.

8476 */

8477 int sqlite3BtreeClearTable(Btree p, int iTable, int pnChange){

8478 int rc;

8479 BtShared *pBt = p->pBt;

8480 sqlite3BtreeEnter(p);

8481 assert( p->inTrans==TRANS_WRITE );

8482

8483 rc = saveAllCursors(pBt, (Pgno)iTable, 0);

8484

8485 if( SQLITE_OK==rc ){

8486 /* Invalidate all incrblob cursors open on table iTable (assuming iTable

8487 ** is the root of a table b-tree - if it is not, the following call is

8488 ** a no-op). */

8489 invalidateIncrblobCursors(p, 0, 1);

8490 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);

8491 }

8492 sqlite3BtreeLeave(p);

8493 return rc;

8494 }

8495

8496 /*

8497 ** Delete all information from the single table that pCur is open on.

8498 **

8499 ** This routine only work for pCur on an ephemeral table.

8500 */

8501 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){

8502 return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);

8503 }

8504

8505 /*

8506 ** Erase all information in a table and add the root of the table to

8507 ** the freelist. Except, the root of the principle table (the one on

8508 ** page 1) is never added to the freelist.

8509 **

8510 ** This routine will fail with SQLITE_LOCKED if there are any open

8511 ** cursors on the table.

8512 **

8513 ** If AUTOVACUUM is enabled and the page at iTable is not the last

8514 ** root page in the database file, then the last root page

8515 ** in the database file is moved into the slot formerly occupied by

8516 ** iTable and that last slot formerly occupied by the last root page

8517 ** is added to the freelist instead of iTable. In this say, all

8518 ** root pages are kept at the beginning of the database file, which

8519 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the

8520 ** page number that used to be the last root page in the file before

8521 ** the move. If no page gets moved, *piMoved is set to 0.

8522 ** The last root page is recorded in meta[3] and the value of

8523 ** meta[3] is updated by this procedure.

8524 */

8525 static int btreeDropTable(Btree p, Pgno iTable, int piMoved){

8526 int rc;

8527 MemPage *pPage = 0;

8528 BtShared *pBt = p->pBt;

8529

8530 assert( sqlite3BtreeHoldsMutex(p) );

8531 assert( p->inTrans==TRANS_WRITE );

8532

8533 /* It is illegal to drop a table if any cursors are open on the

8534 ** database. This is because in auto-vacuum mode the backend may

8535 ** need to move another root-page to fill a gap left by the deleted

8536 ** root page. If an open cursor was using this page a problem would

8537 ** occur.

8538 **

8539 ** This error is caught long before control reaches this point.

8540 */

8541 if( NEVER(pBt->pCursor) ){

8542 sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);

8543 return SQLITE_LOCKED_SHAREDCACHE;

8544 }

8545

8546 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);

8547 if( rc ) return rc;

8548 rc = sqlite3BtreeClearTable(p, iTable, 0);

8549 if( rc ){

8550 releasePage(pPage);

8551 return rc;

8552 }

8553

8554 *piMoved = 0;

8555

8556 if( iTable>1 ){

8557 #ifdef SQLITE_OMIT_AUTOVACUUM

8558 freePage(pPage, &rc);

8559 releasePage(pPage);

8560 #else

8561 if( pBt->autoVacuum ){

8562 Pgno maxRootPgno;

8563 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);

8564

8565 if( iTable==maxRootPgno ){

8566 /* If the table being dropped is the table with the largest root-page

8567 ** number in the database, put the root page on the free list.

8568 */

8569 freePage(pPage, &rc);

8570 releasePage(pPage);

8571 if( rc!=SQLITE_OK ){

8572 return rc;

8573 }

8574 }else{

8575 /* The table being dropped does not have the largest root-page

8576 ** number in the database. So move the page that does into the

8577 ** gap left by the deleted root-page.

8578 */

8579 MemPage *pMove;

8580 releasePage(pPage);

8581 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);

8582 if( rc!=SQLITE_OK ){

8583 return rc;

8584 }

8585 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);

8586 releasePage(pMove);

8587 if( rc!=SQLITE_OK ){

8588 return rc;

8589 }

8590 pMove = 0;

8591 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);

8592 freePage(pMove, &rc);

8593 releasePage(pMove);

8594 if( rc!=SQLITE_OK ){

8595 return rc;

8596 }

8597 *piMoved = maxRootPgno;

8598 }

8599

8600 /* Set the new 'max-root-page' value in the database header. This

8601 ** is the old value less one, less one more if that happens to

8602 ** be a root-page number, less one again if that is the

8603 ** PENDING_BYTE_PAGE.

8604 */

8605 maxRootPgno--;

8606 while( maxRootPgno==PENDING_BYTE_PAGE(pBt)

8607 \|\| PTRMAP_ISPAGE(pBt, maxRootPgno) ){

8608 maxRootPgno--;

8609 }

8610 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );

8611

8612 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);

8613 }else{

8614 freePage(pPage, &rc);

8615 releasePage(pPage);

8616 }

8617 #endif

8618 }else{

8619 /* If sqlite3BtreeDropTable was called on page 1.

8620 ** This really never should happen except in a corrupt

8621 ** database.

8622 */

8623 zeroPage(pPage, PTF_INTKEY\|PTF_LEAF );

8624 releasePage(pPage);

8625 }

8626 return rc;

8627 }

8628 int sqlite3BtreeDropTable(Btree p, int iTable, int piMoved){

8629 int rc;

8630 sqlite3BtreeEnter(p);

8631 rc = btreeDropTable(p, iTable, piMoved);

8632 sqlite3BtreeLeave(p);

8633 return rc;

8634 }

8635

8636

8637 /*

8638 ** This function may only be called if the b-tree connection already

8639 ** has a read or write transaction open on the database.

8640 **

8641 ** Read the meta-information out of a database file. Meta[0]

8642 ** is the number of free pages currently in the database. Meta[1]

8643 ** through meta[15] are available for use by higher layers. Meta[0]

8644 ** is read-only, the others are read/write.

8645 **

8646 ** The schema layer numbers meta values differently. At the schema

8647 ** layer (and the SetCookie and ReadCookie opcodes) the number of

8648 ** free pages is not visible. So Cookie[0] is the same as Meta[1].

8649 **

8650 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case. Instead

8651 ** of reading the value out of the header, it instead loads the "DataVersion"

8652 ** from the pager. The BTREE_DATA_VERSION value is not actually stored in the

8653 ** database file. It is a number computed by the pager. But its access

8654 ** pattern is the same as header meta values, and so it is convenient to

8655 ** read it from this routine.

8656 */

8657 void sqlite3BtreeGetMeta(Btree p, int idx, u32 pMeta){

8658 BtShared *pBt = p->pBt;

8659

8660 sqlite3BtreeEnter(p);

8661 assert( p->inTrans>TRANS_NONE );

8662 assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );

8663 assert( pBt->pPage1 );

8664 assert( idx>=0 && idx<=15 );

8665

8666 if( idx==BTREE_DATA_VERSION ){

8667 *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;

8668 }else{

8669 pMeta = get4byte(&pBt->pPage1->aData[36 + idx4]);

8670 }

8671

8672 /* If auto-vacuum is disabled in this build and this is an auto-vacuum

8673 ** database, mark the database as read-only. */

8674 #ifdef SQLITE_OMIT_AUTOVACUUM

8675 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){

8676 pBt->btsFlags \|= BTS_READ_ONLY;

8677 }

8678 #endif

8679

8680 sqlite3BtreeLeave(p);

8681 }

8682

8683 /*

8684 ** Write meta-information back into the database. Meta[0] is

8685 ** read-only and may not be written.

8686 */

8687 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){

8688 BtShared *pBt = p->pBt;

8689 unsigned char *pP1;

8690 int rc;

8691 assert( idx>=1 && idx<=15 );

8692 sqlite3BtreeEnter(p);

8693 assert( p->inTrans==TRANS_WRITE );

8694 assert( pBt->pPage1!=0 );

8695 pP1 = pBt->pPage1->aData;

8696 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

8697 if( rc==SQLITE_OK ){

8698 put4byte(&pP1[36 + idx*4], iMeta);

8699 #ifndef SQLITE_OMIT_AUTOVACUUM

8700 if( idx==BTREE_INCR_VACUUM ){

8701 assert( pBt->autoVacuum \|\| iMeta==0 );

8702 assert( iMeta==0 \|\| iMeta==1 );

8703 pBt->incrVacuum = (u8)iMeta;

8704 }

8705 #endif

8706 }

8707 sqlite3BtreeLeave(p);

8708 return rc;

8709 }

8710

8711 #ifndef SQLITE_OMIT_BTREECOUNT

8712 /*

8713 ** The first argument, pCur, is a cursor opened on some b-tree. Count the

8714 ** number of entries in the b-tree and write the result to *pnEntry.

8715 **

8716 ** SQLITE_OK is returned if the operation is successfully executed.

8717 ** Otherwise, if an error is encountered (i.e. an IO error or database

8718 ** corruption) an SQLite error code is returned.

8719 */

8720 int sqlite3BtreeCount(BtCursor pCur, i64 pnEntry){

8721 i64 nEntry = 0; /* Value to return in pnEntry /

8722 int rc; /* Return code */

8723

8724 if( pCur->pgnoRoot==0 ){

8725 *pnEntry = 0;

8726 return SQLITE_OK;

8727 }

8728 rc = moveToRoot(pCur);

8729

8730 /* Unless an error occurs, the following loop runs one iteration for each

8731 ** page in the B-Tree structure (not including overflow pages).

8732 */

8733 while( rc==SQLITE_OK ){

8734 int iIdx; /* Index of child node in parent */

8735 MemPage pPage; / Current page of the b-tree */

8736

8737 /* If this is a leaf page or the tree is not an int-key tree, then

8738 ** this page contains countable entries. Increment the entry counter

8739 ** accordingly.

8740 */

8741 pPage = pCur->apPage[pCur->iPage];

8742 if( pPage->leaf \|\| !pPage->intKey ){

8743 nEntry += pPage->nCell;

8744 }

8745

8746 /* pPage is a leaf node. This loop navigates the cursor so that it

8747 ** points to the first interior cell that it points to the parent of

8748 ** the next page in the tree that has not yet been visited. The

8749 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell

8750 ** of the page, or to the number of cells in the page if the next page

8751 ** to visit is the right-child of its parent.

8752 **

8753 ** If all pages in the tree have been visited, return SQLITE_OK to the

8754 ** caller.

8755 */

8756 if( pPage->leaf ){

8757 do {

8758 if( pCur->iPage==0 ){

8759 /* All pages of the b-tree have been visited. Return successfully. */

8760 *pnEntry = nEntry;

8761 return moveToRoot(pCur);

8762 }

8763 moveToParent(pCur);

8764 }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );

8765

8766 pCur->aiIdx[pCur->iPage]++;

8767 pPage = pCur->apPage[pCur->iPage];

8768 }

8769

8770 /* Descend to the child node of the cell that the cursor currently

8771 ** points at. This is the right-child if (iIdx==pPage->nCell).

8772 */

8773 iIdx = pCur->aiIdx[pCur->iPage];

8774 if( iIdx==pPage->nCell ){

8775 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));

8776 }else{

8777 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));

8778 }

8779 }

8780

8781 /* An error has occurred. Return an error code. */

8782 return rc;

8783 }

8784 #endif

8785

8786 /*

8787 ** Return the pager associated with a BTree. This routine is used for

8788 ** testing and debugging only.

8789 */

8790 Pager sqlite3BtreePager(Btree p){

8791 return p->pBt->pPager;

8792 }

8793

8794 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

8795 /*

8796 ** Append a message to the error message string.

8797 */

8798 static void checkAppendMsg(

8799 IntegrityCk *pCheck,

8800 const char *zFormat,

8801 ...

8802 ){

8803 va_list ap;

8804 if( !pCheck->mxErr ) return;

8805 pCheck->mxErr--;

8806 pCheck->nErr++;

8807 va_start(ap, zFormat);

8808 if( pCheck->errMsg.nChar ){

8809 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);

8810 }

8811 if( pCheck->zPfx ){

8812 sqlite3XPrintf(&pCheck->errMsg, 0, pCheck->zPfx, pCheck->v1, pCheck->v2);

8813 }

8814 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);

8815 va_end(ap);

8816 if( pCheck->errMsg.accError==STRACCUM_NOMEM ){

8817 pCheck->mallocFailed = 1;

8818 }

8819 }

8820 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

8821

8822 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

8823

8824 /*

8825 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that

8826 ** corresponds to page iPg is already set.

8827 */

8828 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){

8829 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );

8830 return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));

8831 }

8832

8833 /*

8834 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.

8835 */

8836 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){

8837 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );

8838 pCheck->aPgRef[iPg/8] \|= (1 << (iPg & 0x07));

8839 }

8840

8841

8842 /*

8843 ** Add 1 to the reference count for page iPage. If this is the second

8844 ** reference to the page, add an error message to pCheck->zErrMsg.

8845 ** Return 1 if there are 2 or more references to the page and 0 if

8846 ** if this is the first reference to the page.

8847 **

8848 ** Also check that the page number is in bounds.

8849 */

8850 static int checkRef(IntegrityCk *pCheck, Pgno iPage){

8851 if( iPage==0 ) return 1;

8852 if( iPage>pCheck->nPage ){

8853 checkAppendMsg(pCheck, "invalid page number %d", iPage);

8854 return 1;

8855 }

8856 if( getPageReferenced(pCheck, iPage) ){

8857 checkAppendMsg(pCheck, "2nd reference to page %d", iPage);

8858 return 1;

8859 }

8860 setPageReferenced(pCheck, iPage);

8861 return 0;

8862 }

8863

8864 #ifndef SQLITE_OMIT_AUTOVACUUM

8865 /*

8866 ** Check that the entry in the pointer-map for page iChild maps to

8867 ** page iParent, pointer type ptrType. If not, append an error message

8868 ** to pCheck.

8869 */

8870 static void checkPtrmap(

8871 IntegrityCk pCheck, / Integrity check context */

8872 Pgno iChild, /* Child page number */

8873 u8 eType, /* Expected pointer map type */

8874 Pgno iParent /* Expected pointer map parent page number */

8875 ){

8876 int rc;

8877 u8 ePtrmapType;

8878 Pgno iPtrmapParent;

8879

8880 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);

8881 if( rc!=SQLITE_OK ){

8882 if( rc==SQLITE_NOMEM \|\| rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;

8883 checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);

8884 return;

8885 }

8886

8887 if( ePtrmapType!=eType \|\| iPtrmapParent!=iParent ){

8888 checkAppendMsg(pCheck,

8889 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",

8890 iChild, eType, iParent, ePtrmapType, iPtrmapParent);

8891 }

8892 }

8893 #endif

8894

8895 /*

8896 ** Check the integrity of the freelist or of an overflow page list.

8897 ** Verify that the number of pages on the list is N.

8898 */

8899 static void checkList(

8900 IntegrityCk pCheck, / Integrity checking context */

8901 int isFreeList, /* True for a freelist. False for overflow page list */

8902 int iPage, /* Page number for first page in the list */

8903 int N /* Expected number of pages in the list */

8904 ){

8905 int i;

8906 int expected = N;

8907 int iFirst = iPage;

8908 while( N-- > 0 && pCheck->mxErr ){

8909 DbPage *pOvflPage;

8910 unsigned char *pOvflData;

8911 if( iPage<1 ){

8912 checkAppendMsg(pCheck,

8913 "%d of %d pages missing from overflow list starting at %d",

8914 N+1, expected, iFirst);

8915 break;

8916 }

8917 if( checkRef(pCheck, iPage) ) break;

8918 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){

8919 checkAppendMsg(pCheck, "failed to get page %d", iPage);

8920 break;

8921 }

8922 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);

8923 if( isFreeList ){

8924 int n = get4byte(&pOvflData[4]);

8925 #ifndef SQLITE_OMIT_AUTOVACUUM

8926 if( pCheck->pBt->autoVacuum ){

8927 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);

8928 }

8929 #endif

8930 if( n>(int)pCheck->pBt->usableSize/4-2 ){

8931 checkAppendMsg(pCheck,

8932 "freelist leaf count too big on page %d", iPage);

8933 N--;

8934 }else{

8935 for(i=0; i<n; i++){

8936 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);

8937 #ifndef SQLITE_OMIT_AUTOVACUUM

8938 if( pCheck->pBt->autoVacuum ){

8939 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);

8940 }

8941 #endif

8942 checkRef(pCheck, iFreePage);

8943 }

8944 N -= n;

8945 }

8946 }

8947 #ifndef SQLITE_OMIT_AUTOVACUUM

8948 else{

8949 /* If this database supports auto-vacuum and iPage is not the last

8950 ** page in this overflow list, check that the pointer-map entry for

8951 ** the following page matches iPage.

8952 */

8953 if( pCheck->pBt->autoVacuum && N>0 ){

8954 i = get4byte(pOvflData);

8955 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);

8956 }

8957 }

8958 #endif

8959 iPage = get4byte(pOvflData);

8960 sqlite3PagerUnref(pOvflPage);

8961

8962 if( isFreeList && N<(iPage!=0) ){

8963 checkAppendMsg(pCheck, "free-page count in header is too small");

8964 }

8965 }

8966 }

8967 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

8968

8969 /*

8970 ** An implementation of a min-heap.

8971 **

8972 ** aHeap[0] is the number of elements on the heap. aHeap[1] is the

8973 ** root element. The daughter nodes of aHeap[N] are aHeap[N*2]

8974 ** and aHeap[N*2+1].

8975 **

8976 ** The heap property is this: Every node is less than or equal to both

8977 ** of its daughter nodes. A consequence of the heap property is that the

8978 ** root node aHeap[1] is always the minimum value currently in the heap.

8979 **

8980 ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto

8981 ** the heap, preserving the heap property. The btreeHeapPull() routine

8982 ** removes the root element from the heap (the minimum value in the heap)

8983 ** and then moves other nodes around as necessary to preserve the heap

8984 ** property.

8985 **

8986 ** This heap is used for cell overlap and coverage testing. Each u32

8987 ** entry represents the span of a cell or freeblock on a btree page.

8988 ** The upper 16 bits are the index of the first byte of a range and the

8989 ** lower 16 bits are the index of the last byte of that range.

8990 */

8991 static void btreeHeapInsert(u32 *aHeap, u32 x){

8992 u32 j, i = ++aHeap[0];

8993 aHeap[i] = x;

8994 while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){

8995 x = aHeap[j];

8996 aHeap[j] = aHeap[i];

8997 aHeap[i] = x;

8998 i = j;

8999 }

9000 }

/*
** Remove the minimum element from the min-heap aHeap[] and write it
** to *pOut.  aHeap[0] is the element count and aHeap[1] is the root.
**
** Returns 1 if an element was removed, or 0 if the heap was empty, in
** which case *pOut is left unchanged.
**
** The last element is moved to the root and sifted down.  The vacated
** last slot is overwritten with 0xffffffff so that the unconditional
** read of aHeap[j+1] in the sift-down loop sees a maximal value when
** node j has no right sibling.
*/
static int btreeHeapPull(u32 *aHeap, u32 *pOut){
  u32 j, i, x;
  if( (x = aHeap[0])==0 ) return 0;
  *pOut = aHeap[1];
  aHeap[1] = aHeap[x];
  aHeap[x] = 0xffffffff;
  aHeap[0]--;
  i = 1;
  while( (j = i*2)<=aHeap[0] ){
    if( aHeap[j]>aHeap[j+1] ) j++;   /* Pick the smaller daughter */
    if( aHeap[i]<aHeap[j] ) break;   /* Heap property restored */
    x = aHeap[i];
    aHeap[i] = aHeap[j];
    aHeap[j] = x;
    i = j;
  }
  return 1;
}

9019

9020 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

9021 /*

9022 ** Do various sanity checks on a single page of a tree. Return

9023 ** the tree depth. Root pages return 0. Parents of root pages

9024 ** return 1, and so forth.

9025 **

9026 ** These checks are done:

9027 **

9028 ** 1. Make sure that cells and freeblocks do not overlap

9029 ** but combine to completely cover the page.

9030 ** 2. Make sure integer cell keys are in order.

9031 ** 3. Check the integrity of overflow pages.

9032 ** 4. Recursively call checkTreePage on all children.

9033 ** 5. Verify that the depth of all children is the same.

9034 */

9035 static int checkTreePage(

9036 IntegrityCk pCheck, / Context for the sanity check */

9037 int iPage, /* Page number of the page to check */

9038 i64 piMinKey, / Write minimum integer primary key here */

9039 i64 maxKey /* Error if integer primary key greater than this */

9040 ){

9041 MemPage pPage = 0; / The page being analyzed */

9042 int i; /* Loop counter */

9043 int rc; /* Result code from subroutine call */

9044 int depth = -1, d2; /* Depth of a subtree */

9045 int pgno; /* Page number */

9046 int nFrag; /* Number of fragmented bytes on the page */

9047 int hdr; /* Offset to the page header */

9048 int cellStart; /* Offset to the start of the cell pointer array */

9049 int nCell; /* Number of cells */

9050 int doCoverageCheck = 1; /* True if cell coverage checking should be done */

9051 int keyCanBeEqual = 1; /* True if IPK can be equal to maxKey

9052 ** False if IPK must be strictly less than maxKey */

9053 u8 data; / Page content */

9054 u8 pCell; / Cell content */

9055 u8 pCellIdx; / Next element of the cell pointer array */

9056 BtShared pBt; / The BtShared object that owns pPage */

9057 u32 pc; /* Address of a cell */

9058 u32 usableSize; /* Usable size of the page */

9059 u32 contentOffset; /* Offset to the start of the cell content area */

9060 u32 heap = 0; / Min-heap used for checking cell coverage */

9061 u32 x, prev = 0; /* Next and previous entry on the min-heap */

9062 const char *saved_zPfx = pCheck->zPfx;

9063 int saved_v1 = pCheck->v1;

9064 int saved_v2 = pCheck->v2;

9065 u8 savedIsInit = 0;

9066

9067 /* Check that the page exists

9068 */

9069 pBt = pCheck->pBt;

9070 usableSize = pBt->usableSize;

9071 if( iPage==0 ) return 0;

9072 if( checkRef(pCheck, iPage) ) return 0;

9073 pCheck->zPfx = "Page %d: ";

9074 pCheck->v1 = iPage;

9075 if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){

9076 checkAppendMsg(pCheck,

9077 "unable to get the page. error code=%d", rc);

9078 goto end_of_check;

9079 }

9080

9081 /* Clear MemPage.isInit to make sure the corruption detection code in

9082 ** btreeInitPage() is executed. */

9083 savedIsInit = pPage->isInit;

9084 pPage->isInit = 0;

9085 if( (rc = btreeInitPage(pPage))!=0 ){

9086 assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */

9087 checkAppendMsg(pCheck,

9088 "btreeInitPage() returns error code %d", rc);

9089 goto end_of_check;

9090 }

9091 data = pPage->aData;

9092 hdr = pPage->hdrOffset;

9093

9094 /* Set up for cell analysis */

9095 pCheck->zPfx = "On tree page %d cell %d: ";

9096 contentOffset = get2byteNotZero(&data[hdr+5]);

9097 assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */

9098

9099 /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the

9100 ** number of cells on the page. */

9101 nCell = get2byte(&data[hdr+3]);

9102 assert( pPage->nCell==nCell );

9103

9104 /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page

9105 ** immediately follows the b-tree page header. */

9106 cellStart = hdr + 12 - 4*pPage->leaf;

9107 assert( pPage->aCellIdx==&data[cellStart] );

9108 pCellIdx = &data[cellStart + 2*(nCell-1)];

9109

9110 if( !pPage->leaf ){

9111 /* Analyze the right-child page of internal pages */

9112 pgno = get4byte(&data[hdr+8]);

9113 #ifndef SQLITE_OMIT_AUTOVACUUM

9114 if( pBt->autoVacuum ){

9115 pCheck->zPfx = "On page %d at right child: ";

9116 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);

9117 }

9118 #endif

9119 depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);

9120 keyCanBeEqual = 0;

9121 }else{

9122 /* For leaf pages, the coverage check will occur in the same loop

9123 ** as the other cell checks, so initialize the heap. */

9124 heap = pCheck->heap;

9125 heap[0] = 0;

9126 }

9127

9128 /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte

9129 ** integer offsets to the cell contents. */

9130 for(i=nCell-1; i>=0 && pCheck->mxErr; i--){

9131 CellInfo info;

9132

9133 /* Check cell size */

9134 pCheck->v2 = i;

9135 assert( pCellIdx==&data[cellStart + i*2] );

9136 pc = get2byteAligned(pCellIdx);

9137 pCellIdx -= 2;

9138 if( pc<contentOffset \|\| pc>usableSize-4 ){

9139 checkAppendMsg(pCheck, "Offset %d out of range %d..%d",

9140 pc, contentOffset, usableSize-4);

9141 doCoverageCheck = 0;

9142 continue;

9143 }

9144 pCell = &data[pc];

9145 pPage->xParseCell(pPage, pCell, &info);

9146 if( pc+info.nSize>usableSize ){

9147 checkAppendMsg(pCheck, "Extends off end of page");

9148 doCoverageCheck = 0;

9149 continue;

9150 }

9151

9152 /* Check for integer primary key out of range */

9153 if( pPage->intKey ){

9154 if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){

9155 checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);

9156 }

9157 maxKey = info.nKey;

9158 }

9159

9160 /* Check the content overflow list */

9161 if( info.nPayload>info.nLocal ){

9162 int nPage; /* Number of pages on the overflow chain */

9163 Pgno pgnoOvfl; /* First page of the overflow chain */

9164 assert( pc + info.nSize - 4 <= usableSize );

9165 nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);

9166 pgnoOvfl = get4byte(&pCell[info.nSize - 4]);

9167 #ifndef SQLITE_OMIT_AUTOVACUUM

9168 if( pBt->autoVacuum ){

9169 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);

9170 }

9171 #endif

9172 checkList(pCheck, 0, pgnoOvfl, nPage);

9173 }

9174

9175 if( !pPage->leaf ){

9176 /* Check sanity of left child page for internal pages */

9177 pgno = get4byte(pCell);

9178 #ifndef SQLITE_OMIT_AUTOVACUUM

9179 if( pBt->autoVacuum ){

9180 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);

9181 }

9182 #endif

9183 d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);

9184 keyCanBeEqual = 0;

9185 if( d2!=depth ){

9186 checkAppendMsg(pCheck, "Child page depth differs");

9187 depth = d2;

9188 }

9189 }else{

9190 /* Populate the coverage-checking heap for leaf pages */

9191 btreeHeapInsert(heap, (pc<<16)\|(pc+info.nSize-1));

9192 }

9193 }

9194 *piMinKey = maxKey;

9195

9196 /* Check for complete coverage of the page

9197 */

9198 pCheck->zPfx = 0;

9199 if( doCoverageCheck && pCheck->mxErr>0 ){

9200 /* For leaf pages, the min-heap has already been initialized and the

9201 ** cells have already been inserted. But for internal pages, that has

9202 ** not yet been done, so do it now */

9203 if( !pPage->leaf ){

9204 heap = pCheck->heap;

9205 heap[0] = 0;

9206 for(i=nCell-1; i>=0; i--){

9207 u32 size;

9208 pc = get2byteAligned(&data[cellStart+i*2]);

9209 size = pPage->xCellSize(pPage, &data[pc]);

9210 btreeHeapInsert(heap, (pc<<16)\|(pc+size-1));

9211 }

9212 }

9213 /* Add the freeblocks to the min-heap

9214 **

9215 ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header

9216 ** is the offset of the first freeblock, or zero if there are no

9217 ** freeblocks on the page.

9218 */

9219 i = get2byte(&data[hdr+1]);

9220 while( i>0 ){

9221 int size, j;

9222 assert( (u32)i<=usableSize-4 ); /* Enforced by btreeInitPage() */

9223 size = get2byte(&data[i+2]);

9224 assert( (u32)(i+size)<=usableSize ); /* Enforced by btreeInitPage() */

9225 btreeHeapInsert(heap, (((u32)i)<<16)\|(i+size-1));

9226 /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a

9227 ** big-endian integer which is the offset in the b-tree page of the next

9228 ** freeblock in the chain, or zero if the freeblock is the last on the

9229 ** chain. */

9230 j = get2byte(&data[i]);

9231 /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of

9232 ** increasing offset. */

9233 assert( j==0 \|\| j>i+size ); /* Enforced by btreeInitPage() */

9234 assert( (u32)j<=usableSize-4 ); /* Enforced by btreeInitPage() */

9235 i = j;

9236 }

9237 /* Analyze the min-heap looking for overlap between cells and/or

9238 ** freeblocks, and counting the number of untracked bytes in nFrag.

9239 **

9240 ** Each min-heap entry is of the form: (start_address<<16)\|end_address.

9241 ** There is an implied first entry the covers the page header, the cell

9242 ** pointer index, and the gap between the cell pointer index and the start

9243 ** of cell content.

9244 **

9245 ** The loop below pulls entries from the min-heap in order and compares

9246 ** the start_address against the previous end_address. If there is an

9247 ** overlap, that means bytes are used multiple times. If there is a gap,

9248 ** that gap is added to the fragmentation count.

9249 */

9250 nFrag = 0;

9251 prev = contentOffset - 1; /* Implied first min-heap entry */

9252 while( btreeHeapPull(heap,&x) ){

9253 if( (prev&0xffff)>=(x>>16) ){

9254 checkAppendMsg(pCheck,

9255 "Multiple uses for byte %u of page %d", x>>16, iPage);

9256 break;

9257 }else{

9258 nFrag += (x>>16) - (prev&0xffff) - 1;

9259 prev = x;

9260 }

9261 }

9262 nFrag += usableSize - (prev&0xffff) - 1;

9263 /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments

9264 ** is stored in the fifth field of the b-tree page header.

9265 ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the

9266 ** number of fragmented free bytes within the cell content area.

9267 */

9268 if( heap[0]==0 && nFrag!=data[hdr+7] ){

9269 checkAppendMsg(pCheck,

9270 "Fragmentation of %d bytes reported as %d on page %d",

9271 nFrag, data[hdr+7], iPage);

9272 }

9273 }

9274

9275 end_of_check:

9276 if( !doCoverageCheck ) pPage->isInit = savedIsInit;

9277 releasePage(pPage);

9278 pCheck->zPfx = saved_zPfx;

9279 pCheck->v1 = saved_v1;

9280 pCheck->v2 = saved_v2;

9281 return depth+1;

9282 }

9283 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

9284

9285 #ifndef SQLITE_OMIT_INTEGRITY_CHECK

9286 /*

9287 ** This routine does a complete check of the given BTree file. aRoot[] is

9288 ** an array of pages numbers were each page number is the root page of

9289 ** a table. nRoot is the number of entries in aRoot.

9290 **

9291 ** A read-only or read-write transaction must be opened before calling

9292 ** this function.

9293 **

9294 ** Write the number of error seen in *pnErr. Except for some memory

9295 ** allocation errors, an error message held in memory obtained from

9296 ** malloc is returned if pnErr is non-zero. If pnErr==0 then NULL is

9297 ** returned. If a memory allocation error occurs, NULL is returned.

9298 */

9299 char *sqlite3BtreeIntegrityCheck(

9300 Btree p, / The btree to be checked */

9301 int aRoot, / An array of root pages numbers for individual trees */

9302 int nRoot, /* Number of entries in aRoot[] */

9303 int mxErr, /* Stop reporting errors after this many */

9304 int pnErr / Write number of errors seen to this variable */

9305 ){

9306 Pgno i;

9307 IntegrityCk sCheck;

9308 BtShared *pBt = p->pBt;

9309 int savedDbFlags = pBt->db->flags;

9310 char zErr[100];

9311 VVA_ONLY( int nRef );

9312

9313 sqlite3BtreeEnter(p);

9314 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );

9315 assert( (nRef = sqlite3PagerRefcount(pBt->pPager))>=0 );

9316 sCheck.pBt = pBt;

9317 sCheck.pPager = pBt->pPager;

9318 sCheck.nPage = btreePagecount(sCheck.pBt);

9319 sCheck.mxErr = mxErr;

9320 sCheck.nErr = 0;

9321 sCheck.mallocFailed = 0;

9322 sCheck.zPfx = 0;

9323 sCheck.v1 = 0;

9324 sCheck.v2 = 0;

9325 sCheck.aPgRef = 0;

9326 sCheck.heap = 0;

9327 sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);

9328 if( sCheck.nPage==0 ){

9329 goto integrity_ck_cleanup;

9330 }

9331

9332 sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);

9333 if( !sCheck.aPgRef ){

9334 sCheck.mallocFailed = 1;

9335 goto integrity_ck_cleanup;

9336 }

9337 sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );

9338 if( sCheck.heap==0 ){

9339 sCheck.mallocFailed = 1;

9340 goto integrity_ck_cleanup;

9341 }

9342

9343 i = PENDING_BYTE_PAGE(pBt);

9344 if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);

9345

9346 /* Check the integrity of the freelist

9347 */

9348 sCheck.zPfx = "Main freelist: ";

9349 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),

9350 get4byte(&pBt->pPage1->aData[36]));

9351 sCheck.zPfx = 0;

9352

9353 /* Check all the tables.

9354 */

9355 testcase( pBt->db->flags & SQLITE_CellSizeCk );

9356 pBt->db->flags &= ~SQLITE_CellSizeCk;

9357 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){

9358 i64 notUsed;

9359 if( aRoot[i]==0 ) continue;

9360 #ifndef SQLITE_OMIT_AUTOVACUUM

9361 if( pBt->autoVacuum && aRoot[i]>1 ){

9362 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);

9363 }

9364 #endif

9365 checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);

9366 }

9367 pBt->db->flags = savedDbFlags;

9368

9369 /* Make sure every page in the file is referenced

9370 */

9371 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){

9372 #ifdef SQLITE_OMIT_AUTOVACUUM

9373 if( getPageReferenced(&sCheck, i)==0 ){

9374 checkAppendMsg(&sCheck, "Page %d is never used", i);

9375 }

9376 #else

9377 /* If the database supports auto-vacuum, make sure no tables contain

9378 ** references to pointer-map pages.

9379 */

9380 if( getPageReferenced(&sCheck, i)==0 &&

9381 (PTRMAP_PAGENO(pBt, i)!=i \|\| !pBt->autoVacuum) ){

9382 checkAppendMsg(&sCheck, "Page %d is never used", i);

9383 }

9384 if( getPageReferenced(&sCheck, i)!=0 &&

9385 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){

9386 checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);

9387 }

9388 #endif

9389 }

9390

9391 /* Clean up and report errors.

9392 */

9393 integrity_ck_cleanup:

9394 sqlite3PageFree(sCheck.heap);

9395 sqlite3_free(sCheck.aPgRef);

9396 if( sCheck.mallocFailed ){

9397 sqlite3StrAccumReset(&sCheck.errMsg);

9398 sCheck.nErr++;

9399 }

9400 *pnErr = sCheck.nErr;

9401 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);

9402 /* Make sure this analysis did not leave any unref() pages. */

9403 assert( nRef==sqlite3PagerRefcount(pBt->pPager) );

9404 sqlite3BtreeLeave(p);

9405 return sqlite3StrAccumFinish(&sCheck.errMsg);

9406 }

9407 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */

9408

9409 /*

9410 ** Return the full pathname of the underlying database file. Return

9411 ** an empty string if the database is in-memory or a TEMP database.

9412 **

9413 ** The pager filename is invariant as long as the pager is

9414 ** open so it is safe to access without the BtShared mutex.

9415 */

9416 const char sqlite3BtreeGetFilename(Btree p){

9417 assert( p->pBt->pPager!=0 );

9418 return sqlite3PagerFilename(p->pBt->pPager, 1);

9419 }

9420

9421 /*

9422 ** Return the pathname of the journal file for this database. The return

9423 ** value of this routine is the same regardless of whether the journal file

9424 ** has been created or not.

9425 **

9426 ** The pager journal filename is invariant as long as the pager is

9427 ** open so it is safe to access without the BtShared mutex.

9428 */

9429 const char sqlite3BtreeGetJournalname(Btree p){

9430 assert( p->pBt->pPager!=0 );

9431 return sqlite3PagerJournalname(p->pBt->pPager);

9432 }

9433

9434 /*

9435 ** Return non-zero if a transaction is active.

9436 */

9437 int sqlite3BtreeIsInTrans(Btree *p){

9438 assert( p==0 \|\| sqlite3_mutex_held(p->db->mutex) );

9439 return (p && (p->inTrans==TRANS_WRITE));

9440 }

9441

9442 #ifndef SQLITE_OMIT_WAL

9443 /*

9444 ** Run a checkpoint on the Btree passed as the first argument.

9445 **

9446 ** Return SQLITE_LOCKED if this or any other connection has an open

9447 ** transaction on the shared-cache the argument Btree is connected to.

9448 **

9449 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.

9450 */

9451 int sqlite3BtreeCheckpoint(Btree p, int eMode, int pnLog, int *pnCkpt){

9452 int rc = SQLITE_OK;

9453 if( p ){

9454 BtShared *pBt = p->pBt;

9455 sqlite3BtreeEnter(p);

9456 if( pBt->inTransaction!=TRANS_NONE ){

9457 rc = SQLITE_LOCKED;

9458 }else{

9459 rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt);

9460 }

9461 sqlite3BtreeLeave(p);

9462 }

9463 return rc;

9464 }

9465 #endif

9466

9467 /*

9468 ** Return non-zero if a read (or write) transaction is active.

9469 */

9470 int sqlite3BtreeIsInReadTrans(Btree *p){

9471 assert( p );

9472 assert( sqlite3_mutex_held(p->db->mutex) );

9473 return p->inTrans!=TRANS_NONE;

9474 }

9475

9476 int sqlite3BtreeIsInBackup(Btree *p){

9477 assert( p );

9478 assert( sqlite3_mutex_held(p->db->mutex) );

9479 return p->nBackup!=0;

9480 }

9481

9482 /*

9483 ** This function returns a pointer to a blob of memory associated with

9484 ** a single shared-btree. The memory is used by client code for its own

9485 ** purposes (for example, to store a high-level schema associated with

9486 ** the shared-btree). The btree layer manages reference counting issues.

9487 **

9488 ** The first time this is called on a shared-btree, nBytes bytes of memory

9489 ** are allocated, zeroed, and returned to the caller. For each subsequent

9490 ** call the nBytes parameter is ignored and a pointer to the same blob

9491 ** of memory returned.

9492 **

9493 ** If the nBytes parameter is 0 and the blob of memory has not yet been

9494 ** allocated, a null pointer is returned. If the blob has already been

9495 ** allocated, it is returned as normal.

9496 **

9497 ** Just before the shared-btree is closed, the function passed as the

9498 ** xFree argument when the memory allocation was made is invoked on the

9499 ** blob of allocated memory. The xFree function should not call sqlite3_free()

9500 ** on the memory, the btree layer does that.

9501 */

9502 void sqlite3BtreeSchema(Btree p, int nBytes, void(xFree)(void )){

9503 BtShared *pBt = p->pBt;

9504 sqlite3BtreeEnter(p);

9505 if( !pBt->pSchema && nBytes ){

9506 pBt->pSchema = sqlite3DbMallocZero(0, nBytes);

9507 pBt->xFreeSchema = xFree;

9508 }

9509 sqlite3BtreeLeave(p);

9510 return pBt->pSchema;

9511 }

9512

9513 /*

9514 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared

9515 ** btree as the argument handle holds an exclusive lock on the

9516 ** sqlite_master table. Otherwise SQLITE_OK.

9517 */

9518 int sqlite3BtreeSchemaLocked(Btree *p){

9519 int rc;

9520 assert( sqlite3_mutex_held(p->db->mutex) );

9521 sqlite3BtreeEnter(p);

9522 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);

9523 assert( rc==SQLITE_OK \|\| rc==SQLITE_LOCKED_SHAREDCACHE );

9524 sqlite3BtreeLeave(p);

9525 return rc;

9526 }

9527

9528

9529 #ifndef SQLITE_OMIT_SHARED_CACHE

9530 /*

9531 ** Obtain a lock on the table whose root page is iTab. The

9532 ** lock is a write lock if isWritelock is true or a read lock

9533 ** if it is false.

9534 */

9535 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){

9536 int rc = SQLITE_OK;

9537 assert( p->inTrans!=TRANS_NONE );

9538 if( p->sharable ){

9539 u8 lockType = READ_LOCK + isWriteLock;

9540 assert( READ_LOCK+1==WRITE_LOCK );

9541 assert( isWriteLock==0 \|\| isWriteLock==1 );

9542

9543 sqlite3BtreeEnter(p);

9544 rc = querySharedCacheTableLock(p, iTab, lockType);

9545 if( rc==SQLITE_OK ){

9546 rc = setSharedCacheTableLock(p, iTab, lockType);

9547 }

9548 sqlite3BtreeLeave(p);

9549 }

9550 return rc;

9551 }

9552 #endif

9553

9554 #ifndef SQLITE_OMIT_INCRBLOB

9555 /*

9556 ** Argument pCsr must be a cursor opened for writing on an

9557 ** INTKEY table currently pointing at a valid table entry.

9558 ** This function modifies the data stored as part of that entry.

9559 **

9560 ** Only the data content may only be modified, it is not possible to

9561 ** change the length of the data stored. If this function is called with

9562 ** parameters that attempt to write past the end of the existing data,

9563 ** no modifications are made and SQLITE_CORRUPT is returned.

9564 */

9565 int sqlite3BtreePutData(BtCursor pCsr, u32 offset, u32 amt, void z){

9566 int rc;

9567 assert( cursorHoldsMutex(pCsr) );

9568 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );

9569 assert( pCsr->curFlags & BTCF_Incrblob );

9570

9571 rc = restoreCursorPosition(pCsr);

9572 if( rc!=SQLITE_OK ){

9573 return rc;

9574 }

9575 assert( pCsr->eState!=CURSOR_REQUIRESEEK );

9576 if( pCsr->eState!=CURSOR_VALID ){

9577 return SQLITE_ABORT;

9578 }

9579

9580 /* Save the positions of all other cursors open on this table. This is

9581 ** required in case any of them are holding references to an xFetch

9582 ** version of the b-tree page modified by the accessPayload call below.

9583 **

9584 ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()

9585 ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence

9586 ** saveAllCursors can only return SQLITE_OK.

9587 */

9588 VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);

9589 assert( rc==SQLITE_OK );

9590

9591 /* Check some assumptions:

9592 ** (a) the cursor is open for writing,

9593 ** (b) there is a read/write transaction open,

9594 ** (c) the connection holds a write-lock on the table (if required),

9595 ** (d) there are no conflicting read-locks, and

9596 ** (e) the cursor points at a valid row of an intKey table.

9597 */

9598 if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){

9599 return SQLITE_READONLY;

9600 }

9601 assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0

9602 && pCsr->pBt->inTransaction==TRANS_WRITE );

9603 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );

9604 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );

9605 assert( pCsr->apPage[pCsr->iPage]->intKey );

9606

9607 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);

9608 }

9609

9610 /*

9611 ** Mark this cursor as an incremental blob cursor.

9612 */

9613 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){

9614 pCur->curFlags \|= BTCF_Incrblob;

9615 pCur->pBtree->hasIncrblobCur = 1;

9616 }

9617 #endif

9618

9619 /*

9620 ** Set both the "read version" (single byte at byte offset 18) and

9621 ** "write version" (single byte at byte offset 19) fields in the database

9622 ** header to iVersion.

9623 */

9624 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){

9625 BtShared *pBt = pBtree->pBt;

9626 int rc; /* Return code */

9627

9628 assert( iVersion==1 \|\| iVersion==2 );

9629

9630 /* If setting the version fields to 1, do not automatically open the

9631 ** WAL connection, even if the version fields are currently set to 2.

9632 */

9633 pBt->btsFlags &= ~BTS_NO_WAL;

9634 if( iVersion==1 ) pBt->btsFlags \|= BTS_NO_WAL;

9635

9636 rc = sqlite3BtreeBeginTrans(pBtree, 0);

9637 if( rc==SQLITE_OK ){

9638 u8 *aData = pBt->pPage1->aData;

9639 if( aData[18]!=(u8)iVersion \|\| aData[19]!=(u8)iVersion ){

9640 rc = sqlite3BtreeBeginTrans(pBtree, 2);

9641 if( rc==SQLITE_OK ){

9642 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);

9643 if( rc==SQLITE_OK ){

9644 aData[18] = (u8)iVersion;

9645 aData[19] = (u8)iVersion;

9646 }

9647 }

9648 }

9649 }

9650

9651 pBt->btsFlags &= ~BTS_NO_WAL;

9652 return rc;

9653 }

9654

9655 /*

9656 ** Return true if the cursor has a hint specified. This routine is

9657 ** only used from within assert() statements

9658 */

9659 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){

9660 return (pCsr->hints & mask)!=0;

9661 }

9662

9663 /*

9664 ** Return true if the given Btree is read-only.

9665 */

9666 int sqlite3BtreeIsReadonly(Btree *p){

9667 return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;

9668 }

9669

9670 /*

9671 ** Return the size of the header added to each page by this module.

9672 */

9673 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }

OLD	NEW

« no previous file with comments | « third_party/sqlite/sqlite-src-3100200/src/btree.h ('k') | third_party/sqlite/sqlite-src-3100200/src/btreeInt.h » ('j') | no next file with comments »