| OLD | NEW | 
 | (Empty) | 
|     1 /* |  | 
|     2 ** 2004 April 6 |  | 
|     3 ** |  | 
|     4 ** The author disclaims copyright to this source code.  In place of |  | 
|     5 ** a legal notice, here is a blessing: |  | 
|     6 ** |  | 
|     7 **    May you do good and not evil. |  | 
|     8 **    May you find forgiveness for yourself and forgive others. |  | 
|     9 **    May you share freely, never taking more than you give. |  | 
|    10 ** |  | 
|    11 ************************************************************************* |  | 
|    12 ** $Id: btree.c,v 1.705 2009/08/10 03:57:58 shane Exp $ |  | 
|    13 ** |  | 
|    14 ** This file implements an external (disk-based) database using BTrees. |  | 
|    15 ** See the header comment on "btreeInt.h" for additional information. |  | 
|    16 ** Including a description of file format and an overview of operation. |  | 
|    17 */ |  | 
|    18 #include "btreeInt.h" |  | 
|    19  |  | 
|    20 /* |  | 
|    21 ** The header string that appears at the beginning of every |  | 
|    22 ** SQLite database.  Its value comes from the SQLITE_FILE_HEADER macro. |  | 
|    23 */ |  | 
|    24 static const char zMagicHeader[] = SQLITE_FILE_HEADER; |  | 
|    25  |  | 
|    26 /* |  | 
|    27 ** The header string that appears at the beginning of a SQLite |  | 
|    28 ** database which has been poisoned. |  | 
|    29 */ |  | 
|    30 static const char zPoisonHeader[] = "SQLite poison 3"; |  | 
|    31  |  | 
|    32 /* |  | 
|    33 ** Tracing support.  Change the #if 0 below to #if 1 so that TRACE(X) |  | 
|    34 ** runs printf X and flushes stdout while sqlite3BtreeTrace is non-zero. |  | 
|    35 */ |  | 
|    36 #if 0 |  | 
|    37 int sqlite3BtreeTrace=1;  /* True to enable tracing */ |  | 
|    38 # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);} |  | 
|    39 #else |  | 
|    40 # define TRACE(X)  /* tracing compiled out: expands to nothing */ |  | 
|    41 #endif |  | 
|    42  |  | 
|    43  |  | 
|    44  |  | 
|    45 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|    46 /* |  | 
|    47 ** A list of BtShared objects that are eligible for participation |  | 
|    48 ** in shared cache.  The pointer starts out as 0.  This variable has |  | 
|    49 ** file scope during normal builds, but the test harness needs to access |  | 
|    50 ** it so we make it global for test builds (SQLITE_TEST defined). |  | 
|    51 ** |  | 
|    52 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER. |  | 
|    53 */ |  | 
|    54 #ifdef SQLITE_TEST |  | 
|    55 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;  /* test build: not static */ |  | 
|    56 #else |  | 
|    57 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;  /* normal build */ |  | 
|    58 #endif  /* SQLITE_TEST */ |  | 
|    59 #endif /* SQLITE_OMIT_SHARED_CACHE */ |  | 
|    60  |  | 
|    61 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|    62 /* |  | 
|    63 ** Enable or disable the shared pager and schema features by storing |  | 
|    64 ** the "enable" argument in sqlite3GlobalConfig.sharedCacheEnabled. |  | 
|    65 ** Always returns SQLITE_OK.  This routine has no effect on existing |  | 
|    66 ** database connections.  The shared cache setting affects only future |  | 
|    67 ** calls to sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). |  | 
|    68 */ |  | 
|    69 int sqlite3_enable_shared_cache(int enable){ |  | 
|    70   sqlite3GlobalConfig.sharedCacheEnabled = enable; |  | 
|    71   return SQLITE_OK; |  | 
|    72 } |  | 
|    73 #endif  /* !SQLITE_OMIT_SHARED_CACHE */ |  | 
|    74  |  | 
|    75  |  | 
|    76  |  | 
|    77 #ifdef SQLITE_OMIT_SHARED_CACHE |  | 
|    78   /* |  | 
|    79   ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(), |  | 
|    80   ** clearAllSharedCacheTableLocks() and downgradeAllSharedCacheTableLocks() |  | 
|    81   ** manipulate entries in the BtShared.pLock linked list used to store |  | 
|    82   ** shared-cache table level locks. If the library is compiled with the |  | 
|    83   ** shared-cache feature disabled, then there is only ever one user |  | 
|    84   ** of each BtShared structure and so this locking is not necessary.  |  | 
|    85   ** So define the lock related functions as no-ops. |  | 
|    86   */ |  | 
|    87   #define querySharedCacheTableLock(a,b,c) SQLITE_OK |  | 
|    88   #define setSharedCacheTableLock(a,b,c) SQLITE_OK |  | 
|    89   #define clearAllSharedCacheTableLocks(a) |  | 
|    90   #define downgradeAllSharedCacheTableLocks(a) |  | 
|    91   #define hasSharedCacheTableLock(a,b,c,d) 1  /* assert() helper: always true */ |  | 
|    92   #define hasReadConflicts(a, b) 0            /* assert() helper: always false */ |  | 
|    93 #endif  /* SQLITE_OMIT_SHARED_CACHE */ |  | 
|    94  |  | 
|    95 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|    96  |  | 
|    97 #ifdef SQLITE_DEBUG |  | 
|    98 /* |  | 
|    99 ** This function is only used as part of an assert() statement. It checks |  | 
|   100 ** that connection pBtree holds the required locks to read or write to the |  | 
|   101 ** b-tree with root page iRoot. If so, true is returned. Otherwise, false.  |  | 
|   102 ** For example, when writing to a table b-tree with root-page iRoot via  |  | 
|   103 ** Btree connection pBtree: |  | 
|   104 ** |  | 
|   105 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) ); |  | 
|   106 ** |  | 
|   107 ** When writing to an index b-tree that resides in a sharable database, the  |  | 
|   108 ** caller should have first obtained a lock specifying the root page of |  | 
|   109 ** the corresponding table b-tree. This makes things a bit more complicated, |  | 
|   110 ** as this module treats each b-tree as a separate structure. To determine |  | 
|   111 ** the table b-tree corresponding to the index b-tree being written, this |  | 
|   112 ** function has to search through the database schema. |  | 
|   113 ** |  | 
|   114 ** Instead of a lock on the b-tree rooted at page iRoot, the caller may |  | 
|   115 ** hold a write-lock on the schema table (root page 1). This is also |  | 
|   116 ** acceptable. |  | 
|   117 */ |  | 
|   118 static int hasSharedCacheTableLock( |  | 
|   119   Btree *pBtree,         /* Handle that must hold lock */ |  | 
|   120   Pgno iRoot,            /* Root page of b-tree */ |  | 
|   121   int isIndex,           /* True if iRoot is the root of an index b-tree */ |  | 
|   122   int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */ |  | 
|   123 ){ |  | 
|   124   Schema *pSchema = (Schema *)pBtree->pBt->pSchema; |  | 
|   125   Pgno iTab = 0;         /* Root page to look for; stays 0 if no index matches */ |  | 
|   126   BtLock *pLock;         /* Iterator over the pBt->pLock list */ |  | 
|   127  |  | 
|   128   /* If this b-tree database is not shareable, or if the client is reading |  | 
|   129   ** and has the read-uncommitted flag set, then no lock is required.  |  | 
|   130   ** In these cases return true immediately.  If the client is reading  |  | 
|   131   ** or writing an index b-tree, but the schema is not loaded, then return |  | 
|   132   ** true also. In this case the lock is required, but it is too difficult |  | 
|   133   ** to check if the client actually holds it. This doesn't happen very |  | 
|   134   ** often.  */ |  | 
|   135   if( (pBtree->sharable==0) |  | 
|   136    || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted)) |  | 
|   137    || (isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0 )) |  | 
|   138   ){ |  | 
|   139     return 1; |  | 
|   140   } |  | 
|   141  |  | 
|   142   /* Figure out the root-page that the lock should be held on. For table |  | 
|   143   ** b-trees, this is just the root page of the b-tree being read or |  | 
|   144   ** written. For index b-trees, it is the root page of the associated |  | 
|   145   ** table.  */ |  | 
|   146   if( isIndex ){ |  | 
|   147     HashElem *p; |  | 
|   148     for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){ |  | 
|   149       Index *pIdx = (Index *)sqliteHashData(p); |  | 
|   150       if( pIdx->tnum==(int)iRoot ){  /* the scan does not stop: last match wins */ |  | 
|   151         iTab = pIdx->pTable->tnum; |  | 
|   152       } |  | 
|   153     } |  | 
|   154   }else{ |  | 
|   155     iTab = iRoot; |  | 
|   156   } |  | 
|   157  |  | 
|   158   /* Search for the required lock. Either a write-lock on root-page iTab, a  |  | 
|   159   ** write-lock on the schema table, or (if the client is reading) a |  | 
|   160   ** read-lock on iTab will suffice. Return 1 if any of these are found.  */ |  | 
|   161   for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){ |  | 
|   162     if( pLock->pBtree==pBtree  |  | 
|   163      && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1)) |  | 
|   164      && pLock->eLock>=eLockType  |  | 
|   165     ){ |  | 
|   166       return 1; |  | 
|   167     } |  | 
|   168   } |  | 
|   169  |  | 
|   170   /* Failed to find the required lock. */ |  | 
|   171   return 0; |  | 
|   172 } |  | 
|   173  |  | 
|   174 /* |  | 
|   175 ** This function is also used as part of assert() statements only. It  |  | 
|   176 ** returns true if there exist one or more cursors open on the table  |  | 
|   177 ** with root page iRoot that belong neither to connection pBtree nor |  | 
|   178 ** to a connection that has the read-uncommitted flag set. |  | 
|   179 ** |  | 
|   180 ** For example, before writing to page iRoot: |  | 
|   181 ** |  | 
|   182 **    assert( !hasReadConflicts(pBtree, iRoot) ); |  | 
|   183 */ |  | 
|   184 static int hasReadConflicts(Btree *pBtree, Pgno iRoot){ |  | 
|   185   BtCursor *p;  /* Iterator over every cursor in pBtree->pBt->pCursor */ |  | 
|   186   for(p=pBtree->pBt->pCursor; p; p=p->pNext){ |  | 
|   187     if( p->pgnoRoot==iRoot  |  | 
|   188      && p->pBtree!=pBtree |  | 
|   189      && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted) |  | 
|   190     ){ |  | 
|   191       return 1;  /* a conflicting reader exists */ |  | 
|   192     } |  | 
|   193   } |  | 
|   194   return 0; |  | 
|   195 } |  | 
|   196 #endif    /* #ifdef SQLITE_DEBUG */ |  | 
|   197  |  | 
|   198 /* |  | 
|   199 ** Query to see if btree handle p may obtain a lock of type eLock  |  | 
|   200 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return |  | 
|   201 ** SQLITE_OK if the lock may be obtained (by calling |  | 
|   202 ** setSharedCacheTableLock()), or SQLITE_LOCKED_SHAREDCACHE if not. |  | 
|   203 */ |  | 
|   204 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){ |  | 
|   205   BtShared *pBt = p->pBt; |  | 
|   206   BtLock *pIter;  /* Iterator over the pBt->pLock list */ |  | 
|   207  |  | 
|   208   assert( sqlite3BtreeHoldsMutex(p) ); |  | 
|   209   assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); |  | 
|   210   assert( p->db!=0 ); |  | 
|   211   assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 ); |  | 
|   212    |  | 
|   213   /* If requesting a write-lock, then the Btree must have an open write |  | 
|   214   ** transaction on this file. And, obviously, for this to be so there  |  | 
|   215   ** must be an open write transaction on the file itself. |  | 
|   216   */ |  | 
|   217   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) ); |  | 
|   218   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE ); |  | 
|   219    |  | 
|   220   /* This is a no-op if the shared-cache is not enabled */ |  | 
|   221   if( !p->sharable ){ |  | 
|   222     return SQLITE_OK; |  | 
|   223   } |  | 
|   224  |  | 
|   225   /* If some other connection is holding an exclusive lock, the |  | 
|   226   ** requested lock may not be obtained. |  | 
|   227   */ |  | 
|   228   if( pBt->pWriter!=p && pBt->isExclusive ){ |  | 
|   229     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db); |  | 
|   230     return SQLITE_LOCKED_SHAREDCACHE; |  | 
|   231   } |  | 
|   232  |  | 
|   233   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ |  | 
|   234     /* The condition (pIter->eLock!=eLock) in the following if(...)  |  | 
|   235     ** statement is a simplification of: |  | 
|   236     ** |  | 
|   237     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK) |  | 
|   238     ** |  | 
|   239     ** since we know that if eLock==WRITE_LOCK, then no other connection |  | 
|   240     ** may hold a WRITE_LOCK on any table in this file (since there can |  | 
|   241     ** only be a single writer). |  | 
|   242     */ |  | 
|   243     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK ); |  | 
|   244     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK); |  | 
|   245     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){ |  | 
|   246       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db); |  | 
|   247       if( eLock==WRITE_LOCK ){ |  | 
|   248         assert( p==pBt->pWriter ); |  | 
|   249         pBt->isPending = 1;  /* side effect: a blocked write request sets isPending */ |  | 
|   250       } |  | 
|   251       return SQLITE_LOCKED_SHAREDCACHE; |  | 
|   252     } |  | 
|   253   } |  | 
|   254   return SQLITE_OK; |  | 
|   255 } |  | 
|   256 #endif /* !SQLITE_OMIT_SHARED_CACHE */ |  | 
|   257  |  | 
|   258 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|   259 /* |  | 
|   260 ** Add a lock on the table with root-page iTable to the shared-btree used |  | 
|   261 ** by Btree handle p. Parameter eLock must be either READ_LOCK or  |  | 
|   262 ** WRITE_LOCK. |  | 
|   263 ** |  | 
|   264 ** This function assumes the following: |  | 
|   265 ** |  | 
|   266 **   (a) The specified b-tree connection handle is connected to a sharable |  | 
|   267 **       b-tree database (one with the Btree.sharable flag set), and |  | 
|   268 ** |  | 
|   269 **   (b) No other b-tree connection handle holds a lock that conflicts |  | 
|   270 **       with the requested lock (i.e. querySharedCacheTableLock() has |  | 
|   271 **       already been called and returned SQLITE_OK). |  | 
|   272 ** |  | 
|   273 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM  |  | 
|   274 ** is returned if a malloc attempt fails. |  | 
|   275 */ |  | 
|   276 static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){ |  | 
|   277   BtShared *pBt = p->pBt; |  | 
|   278   BtLock *pLock = 0;  /* Lock entry for (p, iTable); 0 until found or created */ |  | 
|   279   BtLock *pIter;      /* Iterator over the pBt->pLock list */ |  | 
|   280  |  | 
|   281   assert( sqlite3BtreeHoldsMutex(p) ); |  | 
|   282   assert( eLock==READ_LOCK || eLock==WRITE_LOCK ); |  | 
|   283   assert( p->db!=0 ); |  | 
|   284  |  | 
|   285   /* A connection with the read-uncommitted flag set will never try to |  | 
|   286   ** obtain a read-lock using this function. The only read-lock obtained |  | 
|   287   ** by a connection in read-uncommitted mode is on the sqlite_master  |  | 
|   288   ** table, and that lock is obtained in BtreeBeginTrans().  */ |  | 
|   289   assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK ); |  | 
|   290  |  | 
|   291   /* This function should only be called on a sharable b-tree after it  |  | 
|   292   ** has been determined that no other b-tree holds a conflicting lock.  */ |  | 
|   293   assert( p->sharable ); |  | 
|   294   assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) ); |  | 
|   295  |  | 
|   296   /* First search the list for an existing lock on this table. */ |  | 
|   297   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ |  | 
|   298     if( pIter->iTable==iTable && pIter->pBtree==p ){ |  | 
|   299       pLock = pIter; |  | 
|   300       break; |  | 
|   301     } |  | 
|   302   } |  | 
|   303  |  | 
|   304   /* If the above search did not find a BtLock struct associating Btree p |  | 
|   305   ** with table iTable, allocate one and link it into the list. |  | 
|   306   */ |  | 
|   307   if( !pLock ){ |  | 
|   308     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock)); |  | 
|   309     if( !pLock ){ |  | 
|   310       return SQLITE_NOMEM; |  | 
|   311     } |  | 
|   312     pLock->iTable = iTable; |  | 
|   313     pLock->pBtree = p; |  | 
|   314     pLock->pNext = pBt->pLock;  /* new entry becomes the first list element */ |  | 
|   315     pBt->pLock = pLock; |  | 
|   316   } |  | 
|   317  |  | 
|   318   /* Set the BtLock.eLock variable to the maximum of the current lock |  | 
|   319   ** and the requested lock. This means if a write-lock was already held |  | 
|   320   ** and a read-lock requested, we don't incorrectly downgrade the lock. |  | 
|   321   */ |  | 
|   322   assert( WRITE_LOCK>READ_LOCK ); |  | 
|   323   if( eLock>pLock->eLock ){ |  | 
|   324     pLock->eLock = eLock; |  | 
|   325   } |  | 
|   326  |  | 
|   327   return SQLITE_OK; |  | 
|   328 } |  | 
|   329 #endif /* !SQLITE_OMIT_SHARED_CACHE */ |  | 
|   330  |  | 
|   331 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|   332 /* |  | 
|   333 ** Release all the table locks (locks obtained via calls to |  | 
|   334 ** the setSharedCacheTableLock() procedure) held by Btree handle p. |  | 
|   335 ** |  | 
|   336 ** This function assumes that handle p has an open read or write  |  | 
|   337 ** transaction. If it does not, then the BtShared.isPending variable |  | 
|   338 ** may be incorrectly cleared. |  | 
|   339 */ |  | 
|   340 static void clearAllSharedCacheTableLocks(Btree *p){ |  | 
|   341   BtShared *pBt = p->pBt; |  | 
|   342   BtLock **ppIter = &pBt->pLock;  /* Address of the link to the current entry */ |  | 
|   343  |  | 
|   344   assert( sqlite3BtreeHoldsMutex(p) ); |  | 
|   345   assert( p->sharable || 0==*ppIter ); |  | 
|   346   assert( p->inTrans>0 ); |  | 
|   347  |  | 
|   348   while( *ppIter ){ |  | 
|   349     BtLock *pLock = *ppIter; |  | 
|   350     assert( pBt->isExclusive==0 || pBt->pWriter==pLock->pBtree ); |  | 
|   351     assert( pLock->pBtree->inTrans>=pLock->eLock ); |  | 
|   352     if( pLock->pBtree==p ){ |  | 
|   353       *ppIter = pLock->pNext;  /* unlink; ppIter now refers to the next entry */ |  | 
|   354       assert( pLock->iTable!=1 || pLock==&p->lock ); |  | 
|   355       if( pLock->iTable!=1 ){  /* the table-1 entry is p->lock itself: not freed */ |  | 
|   356         sqlite3_free(pLock); |  | 
|   357       } |  | 
|   358     }else{ |  | 
|   359       ppIter = &pLock->pNext; |  | 
|   360     } |  | 
|   361   } |  | 
|   362  |  | 
|   363   assert( pBt->isPending==0 || pBt->pWriter ); |  | 
|   364   if( pBt->pWriter==p ){ |  | 
|   365     pBt->pWriter = 0; |  | 
|   366     pBt->isExclusive = 0; |  | 
|   367     pBt->isPending = 0; |  | 
|   368   }else if( pBt->nTransaction==2 ){ |  | 
|   369     /* This function is called when connection p is concluding its  |  | 
|   370     ** transaction. If there currently exists a writer, and p is not |  | 
|   371     ** that writer, then the number of locks held by connections other |  | 
|   372     ** than the writer must be about to drop to zero. In this case |  | 
|   373     ** set the isPending flag to 0. |  | 
|   374     ** |  | 
|   375     ** If there is not currently a writer, then BtShared.isPending must |  | 
|   376     ** be zero already. So this next line is harmless in that case. |  | 
|   377     */ |  | 
|   378     pBt->isPending = 0; |  | 
|   379   } |  | 
|   380 } |  | 
|   381  |  | 
|   382 /* |  | 
|   383 ** This function changes all write-locks held by connection p to read-locks. |  | 
|   384 */ |  | 
|   385 static void downgradeAllSharedCacheTableLocks(Btree *p){ |  | 
|   386   BtShared *pBt = p->pBt; |  | 
|   387   if( pBt->pWriter==p ){  /* nothing is done unless p is the current writer */ |  | 
|   388     BtLock *pLock; |  | 
|   389     pBt->pWriter = 0;     /* p also stops being the writer ... */ |  | 
|   390     pBt->isExclusive = 0; /* ... and the exclusive and pending flags are reset */ |  | 
|   391     pBt->isPending = 0; |  | 
|   392     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){ |  | 
|   393       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p ); |  | 
|   394       pLock->eLock = READ_LOCK;  /* applied to every entry in the list */ |  | 
|   395     } |  | 
|   396   } |  | 
|   397 } |  | 
|   398  |  | 
|   399 #endif /* SQLITE_OMIT_SHARED_CACHE */ |  | 
|   400  |  | 
|   401 static void releasePage(MemPage *pPage);  /* Forward reference */ |  | 
|   402  |  | 
|   403 /* |  | 
|   404 ** Return sqlite3_mutex_held() for the mutex of the BtShared of cursor p. |  | 
|   405 */ |  | 
|   406 #ifndef NDEBUG  /* helper exists only when NDEBUG is not defined */ |  | 
|   407 static int cursorHoldsMutex(BtCursor *p){ |  | 
|   408   return sqlite3_mutex_held(p->pBt->mutex); |  | 
|   409 } |  | 
|   410 #endif |  | 
|   411  |  | 
|   412  |  | 
|   413 #ifndef SQLITE_OMIT_INCRBLOB |  | 
|   414 /* |  | 
|   415 ** Free the overflow page-list cache (aOverflow) of cursor pCur and set it to 0. |  | 
|   416 */ |  | 
|   417 static void invalidateOverflowCache(BtCursor *pCur){ |  | 
|   418   assert( cursorHoldsMutex(pCur) ); |  | 
|   419   sqlite3_free(pCur->aOverflow); |  | 
|   420   pCur->aOverflow = 0; |  | 
|   421 } |  | 
|   422  |  | 
|   423 /* |  | 
|   424 ** Invalidate the overflow page-list cache for all cursors opened |  | 
|   425 ** on the shared btree structure pBt (every entry of pBt->pCursor). |  | 
|   426 */ |  | 
|   427 static void invalidateAllOverflowCache(BtShared *pBt){ |  | 
|   428   BtCursor *p; |  | 
|   429   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|   430   for(p=pBt->pCursor; p; p=p->pNext){ |  | 
|   431     invalidateOverflowCache(p); |  | 
|   432   } |  | 
|   433 } |  | 
|   434  |  | 
|   435 /* |  | 
|   436 ** This function is called before modifying the contents of a table |  | 
|   437 ** b-tree to invalidate any incrblob cursors that are open on the |  | 
|   438 ** row or one of the rows being modified. |  | 
|   439 ** |  | 
|   440 ** If argument isClearTable is true, then the entire contents of the |  | 
|   441 ** table is about to be deleted. In this case every incrblob cursor in |  | 
|   442 ** the pBt->pCursor list is invalidated; no root page is compared. |  | 
|   443 ** |  | 
|   444 ** Otherwise, if argument isClearTable is false, then the row with |  | 
|   445 ** rowid iRow is being replaced or deleted. In this case invalidate |  | 
|   446 ** only those incrblob cursors whose info.nKey equals iRow. |  | 
|   447 */ |  | 
|   448 static void invalidateIncrblobCursors( |  | 
|   449   Btree *pBtree,          /* The database file to check */ |  | 
|   450   i64 iRow,               /* The rowid that might be changing */ |  | 
|   451   int isClearTable        /* True if all rows are being deleted */ |  | 
|   452 ){ |  | 
|   453   BtCursor *p; |  | 
|   454   BtShared *pBt = pBtree->pBt; |  | 
|   455   assert( sqlite3BtreeHoldsMutex(pBtree) ); |  | 
|   456   for(p=pBt->pCursor; p; p=p->pNext){ |  | 
|   457     if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){ |  | 
|   458       p->eState = CURSOR_INVALID; |  | 
|   459     } |  | 
|   460   } |  | 
|   461 } |  | 
|   462  |  | 
|   463 #else |  | 
|   464   #define invalidateOverflowCache(x)        /* SQLITE_OMIT_INCRBLOB: no-op */ |  | 
|   465   #define invalidateAllOverflowCache(x)     /* SQLITE_OMIT_INCRBLOB: no-op */ |  | 
|   466   #define invalidateIncrblobCursors(x,y,z)  /* SQLITE_OMIT_INCRBLOB: no-op */ |  | 
|   467 #endif |  | 
|   468  |  | 
|   469 /* |  | 
|   470 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called  |  | 
|   471 ** when a page that previously contained data becomes a free-list leaf  |  | 
|   472 ** page. |  | 
|   473 ** |  | 
|   474 ** The BtShared.pHasContent bitvec exists to work around an obscure |  | 
|   475 ** bug caused by the interaction of two useful IO optimizations surrounding |  | 
|   476 ** free-list leaf pages: |  | 
|   477 ** |  | 
|   478 **   1) When all data is deleted from a page and the page becomes |  | 
|   479 **      a free-list leaf page, the page is not written to the database |  | 
|   480 **      (as free-list leaf pages contain no meaningful data). Sometimes |  | 
|   481 **      such a page is not even journalled (as it will not be modified, |  | 
|   482 **      why bother journalling it?). |  | 
|   483 ** |  | 
|   484 **   2) When a free-list leaf page is reused, its content is not read |  | 
|   485 **      from the database or written to the journal file (why should it |  | 
|   486 **      be, if it is not at all meaningful?). |  | 
|   487 ** |  | 
|   488 ** By themselves, these optimizations work fine and provide a handy |  | 
|   489 ** performance boost to bulk delete or insert operations. However, if |  | 
|   490 ** a page is moved to the free-list and then reused within the same |  | 
|   491 ** transaction, a problem comes up. If the page is not journalled when |  | 
|   492 ** it is moved to the free-list and it is also not journalled when it |  | 
|   493 ** is extracted from the free-list and reused, then the original data |  | 
|   494 ** may be lost. In the event of a rollback, it may not be possible |  | 
|   495 ** to restore the database to its original configuration. |  | 
|   496 ** |  | 
|   497 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is  |  | 
|   498 ** moved to become a free-list leaf page, the corresponding bit is |  | 
|   499 ** set in the bitvec. Whenever a leaf page is extracted from the free-list, |  | 
|   500 ** optimization 2 above is omitted if the corresponding bit is already |  | 
|   501 ** set in BtShared.pHasContent. The contents of the bitvec are cleared |  | 
|   502 ** at the end of every transaction. |  | 
|   503 */ |  | 
|   504 static int btreeSetHasContent(BtShared *pBt, Pgno pgno){ |  | 
|   505   int rc = SQLITE_OK; |  | 
|   506   if( !pBt->pHasContent ){  /* bitvec is created on first use */ |  | 
|   507     int nPage = 100; |  | 
|   508     sqlite3PagerPagecount(pBt->pPager, &nPage); |  | 
|   509     /* If sqlite3PagerPagecount() fails there is no harm because the |  | 
|   510     ** nPage variable is unchanged from its default value of 100 */ |  | 
|   511     pBt->pHasContent = sqlite3BitvecCreate((u32)nPage); |  | 
|   512     if( !pBt->pHasContent ){ |  | 
|   513       rc = SQLITE_NOMEM; |  | 
|   514     } |  | 
|   515   } |  | 
|   516   if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){  /* larger pgno: not recorded */ |  | 
|   517     rc = sqlite3BitvecSet(pBt->pHasContent, pgno); |  | 
|   518   } |  | 
|   519   return rc; |  | 
|   520 } |  | 
|   521  |  | 
|   522 /* |  | 
|   523 ** Query the BtShared.pHasContent vector.  The result is false when no |  | 
|   524 ** bitvec exists, and true for any pgno larger than the bitvec size. |  | 
|   525 ** This function is called when a free-list leaf page is removed from the |  | 
|   526 ** free-list for reuse. It returns false if it is safe to retrieve the |  | 
|   527 ** page from the pager layer with the 'no-content' flag set. True otherwise. |  | 
|   528 */ |  | 
|   529 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){ |  | 
|   530   Bitvec *p = pBt->pHasContent; |  | 
|   531   return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno))); |  | 
|   532 } |  | 
|   533  |  | 
|   534 /* |  | 
|   535 ** Clear (destroy) the BtShared.pHasContent bitvec and reset the pointer |  | 
|   536 ** to 0. This should be invoked at the conclusion of each write-transaction. |  | 
|   537 */ |  | 
|   538 static void btreeClearHasContent(BtShared *pBt){ |  | 
|   539   sqlite3BitvecDestroy(pBt->pHasContent); |  | 
|   540   pBt->pHasContent = 0; |  | 
|   541 } |  | 
|   542  |  | 
|   543 /* |  | 
|   544 ** Save the current cursor position in the variables BtCursor.nKey  |  | 
|   545 ** and BtCursor.pKey. On success the cursor's pages are released and its |  | 
|   546 ** state is set to CURSOR_REQUIRESEEK; on failure the state is unchanged. |  | 
|   547 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID) |  | 
|   548 ** prior to calling this routine.   |  | 
|   549 */ |  | 
|   550 static int saveCursorPosition(BtCursor *pCur){ |  | 
|   551   int rc; |  | 
|   552  |  | 
|   553   assert( CURSOR_VALID==pCur->eState ); |  | 
|   554   assert( 0==pCur->pKey ); |  | 
|   555   assert( cursorHoldsMutex(pCur) ); |  | 
|   556  |  | 
|   557   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey); |  | 
|   558   assert( rc==SQLITE_OK );  /* KeySize() cannot fail */ |  | 
|   559  |  | 
|   560   /* If this is an intKey table, then the above call to BtreeKeySize() |  | 
|   561   ** stores the integer key in pCur->nKey. In this case this value is |  | 
|   562   ** all that is required. Otherwise, if pCur is not open on an intKey |  | 
|   563   ** table, then malloc space for and store the pCur->nKey bytes of key  |  | 
|   564   ** data. |  | 
|   565   */ |  | 
|   566   if( 0==pCur->apPage[0]->intKey ){ |  | 
|   567     void *pKey = sqlite3Malloc( (int)pCur->nKey ); |  | 
|   568     if( pKey ){ |  | 
|   569       rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey); |  | 
|   570       if( rc==SQLITE_OK ){ |  | 
|   571         pCur->pKey = pKey;  /* the cursor now owns the key copy */ |  | 
|   572       }else{ |  | 
|   573         sqlite3_free(pKey); |  | 
|   574       } |  | 
|   575     }else{ |  | 
|   576       rc = SQLITE_NOMEM; |  | 
|   577     } |  | 
|   578   } |  | 
|   579   assert( !pCur->apPage[0]->intKey || !pCur->pKey ); |  | 
|   580  |  | 
|   581   if( rc==SQLITE_OK ){ |  | 
|   582     int i; |  | 
|   583     for(i=0; i<=pCur->iPage; i++){ |  | 
|   584       releasePage(pCur->apPage[i]); |  | 
|   585       pCur->apPage[i] = 0; |  | 
|   586     } |  | 
|   587     pCur->iPage = -1; |  | 
|   588     pCur->eState = CURSOR_REQUIRESEEK; |  | 
|   589   } |  | 
|   590  |  | 
|   591   invalidateOverflowCache(pCur);  /* reached on the error paths as well */ |  | 
|   592   return rc; |  | 
|   593 } |  | 
|   594  |  | 
|   595 /* |  | 
|   596 ** Save the positions of all cursors except pExcept open on the table  |  | 
|   597 ** with root-page iRoot (or on any table if iRoot is 0). Usually called just |  | 
|   598 ** before pExcept is used to modify the table (BtreeDelete()/BtreeInsert()). |  | 
|   599 */ |  | 
|   600 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){ |  | 
|   601   BtCursor *p; |  | 
|   602   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|   603   assert( pExcept==0 || pExcept->pBt==pBt ); |  | 
|   604   for(p=pBt->pCursor; p; p=p->pNext){ |  | 
|   605     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&  |  | 
|   606         p->eState==CURSOR_VALID ){  /* only CURSOR_VALID cursors are saved */ |  | 
|   607       int rc = saveCursorPosition(p); |  | 
|   608       if( SQLITE_OK!=rc ){ |  | 
|   609         return rc;  /* stop at the first failure; later cursors are not visited */ |  | 
|   610       } |  | 
|   611     } |  | 
|   612   } |  | 
|   613   return SQLITE_OK; |  | 
|   614 } |  | 
|   615  |  | 
|   616 /* |  | 
|   617 ** Clear the current cursor position: free pKey and set eState to CURSOR_INVALID. |  | 
|   618 */ |  | 
|   619 void sqlite3BtreeClearCursor(BtCursor *pCur){ |  | 
|   620   assert( cursorHoldsMutex(pCur) ); |  | 
|   621   sqlite3_free(pCur->pKey); |  | 
|   622   pCur->pKey = 0; |  | 
|   623   pCur->eState = CURSOR_INVALID; |  | 
|   624 } |  | 
|   625  |  | 
|   626 /* |  | 
|   627 ** In this version of BtreeMoveto, pKey is a packed index record |  | 
|   628 ** such as is generated by the OP_MakeRecord opcode.  Unpack the record (if |  | 
|   629 ** pKey is not 0) and then call BtreeMovetoUnpacked() to do the work. |  | 
|   630 */ |  | 
|   631 static int btreeMoveto( |  | 
|   632   BtCursor *pCur,     /* Cursor open on the btree to be searched */ |  | 
|   633   const void *pKey,   /* Packed key if the btree is an index */ |  | 
|   634   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */ |  | 
|   635   int bias,           /* Bias search to the high end */ |  | 
|   636   int *pRes           /* Write search results here */ |  | 
|   637 ){ |  | 
|   638   int rc;                    /* Status code */ |  | 
|   639   UnpackedRecord *pIdxKey;   /* Unpacked index key */ |  | 
|   640   char aSpace[150];          /* Temp space for pIdxKey - to avoid a malloc */ |  | 
|   641  |  | 
|   642   if( pKey ){ |  | 
|   643     assert( nKey==(i64)(int)nKey ); |  | 
|   644     pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, |  | 
|   645                                       aSpace, sizeof(aSpace)); |  | 
|   646     if( pIdxKey==0 ) return SQLITE_NOMEM;  /* a 0 result is reported as out-of-memory */ |  | 
|   647   }else{ |  | 
|   648     pIdxKey = 0;  /* no packed key: only nKey is handed on */ |  | 
|   649   } |  | 
|   650   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes); |  | 
|   651   if( pKey ){ |  | 
|   652     sqlite3VdbeDeleteUnpackedRecord(pIdxKey); |  | 
|   653   } |  | 
|   654   return rc; |  | 
|   655 } |  | 
|   656  |  | 
|   657 /* |  | 
|   658 ** Restore the cursor to the position it was in (or as close to as possible) |  | 
|   659 ** when saveCursorPosition() was called. Note that this call deletes the  |  | 
|   660 ** saved position info stored by saveCursorPosition(), so there can be |  | 
|   661 ** at most one effective restoreCursorPosition() call after each  |  | 
|   662 ** saveCursorPosition(). |  | 
|   663 */ |  | 
|   664 static int btreeRestoreCursorPosition(BtCursor *pCur){ |  | 
|   665   int rc; |  | 
|   666   assert( cursorHoldsMutex(pCur) ); |  | 
|   667   assert( pCur->eState>=CURSOR_REQUIRESEEK ); |  | 
|   668   if( pCur->eState==CURSOR_FAULT ){ |  | 
|   669     return pCur->skipNext;  /* in the CURSOR_FAULT state skipNext is the return code */ |  | 
|   670   } |  | 
|   671   pCur->eState = CURSOR_INVALID; |  | 
|   672   rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext); |  | 
|   673   if( rc==SQLITE_OK ){  /* the saved key is freed only when the seek succeeds */ |  | 
|   674     sqlite3_free(pCur->pKey); |  | 
|   675     pCur->pKey = 0; |  | 
|   676     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID ); |  | 
|   677   } |  | 
|   678   return rc; |  | 
|   679 } |  | 
|   680  |  | 
/*
** Evaluate to the result of btreeRestoreCursorPosition(p) when cursor p
** is in state CURSOR_REQUIRESEEK or higher, and to SQLITE_OK otherwise.
**
** The argument is parenthesized so that any pointer-valued expression
** (for example "pBase+1") may be passed.  It is evaluated twice when a
** re-seek is required, so it must not have side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
|   685  |  | 
|   686 /* |  | 
|   687 ** Determine whether or not a cursor has moved from the position it |  | 
|   688 ** was last placed at.  Cursors can move when the row they are pointing |  | 
|   689 ** at is deleted out from under them. |  | 
|   690 ** |  | 
|   691 ** This routine returns an error code if something goes wrong.  The |  | 
|   692 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not. |  | 
|   693 */ |  | 
|   694 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){ |  | 
|   695   int rc; |  | 
|   696  |  | 
|   697   rc = restoreCursorPosition(pCur); |  | 
|   698   if( rc ){ |  | 
|   699     *pHasMoved = 1; |  | 
|   700     return rc; |  | 
|   701   } |  | 
|   702   if( pCur->eState!=CURSOR_VALID || pCur->skipNext!=0 ){ |  | 
|   703     *pHasMoved = 1; |  | 
|   704   }else{ |  | 
|   705     *pHasMoved = 0; |  | 
|   706   } |  | 
|   707   return SQLITE_OK; |  | 
|   708 } |  | 
|   709  |  | 
|   710 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|   711 /* |  | 
|   712 ** Given a page number of a regular database page, return the page |  | 
|   713 ** number for the pointer-map page that contains the entry for the |  | 
|   714 ** input page number. |  | 
|   715 */ |  | 
|   716 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){ |  | 
|   717   int nPagesPerMapPage; |  | 
|   718   Pgno iPtrMap, ret; |  | 
|   719   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|   720   nPagesPerMapPage = (pBt->usableSize/5)+1; |  | 
|   721   iPtrMap = (pgno-2)/nPagesPerMapPage; |  | 
|   722   ret = (iPtrMap*nPagesPerMapPage) + 2;  |  | 
|   723   if( ret==PENDING_BYTE_PAGE(pBt) ){ |  | 
|   724     ret++; |  | 
|   725   } |  | 
|   726   return ret; |  | 
|   727 } |  | 
|   728  |  | 
|   729 /* |  | 
|   730 ** Write an entry into the pointer map. |  | 
|   731 ** |  | 
|   732 ** This routine updates the pointer map entry for page number 'key' |  | 
|   733 ** so that it maps to type 'eType' and parent page number 'pgno'. |  | 
|   734 ** |  | 
|   735 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is |  | 
|   736 ** a no-op.  If an error occurs, the appropriate error code is written |  | 
|   737 ** into *pRC. |  | 
|   738 */ |  | 
|   739 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){ |  | 
|   740   DbPage *pDbPage;  /* The pointer map page */ |  | 
|   741   u8 *pPtrmap;      /* The pointer map data */ |  | 
|   742   Pgno iPtrmap;     /* The pointer map page number */ |  | 
|   743   int offset;       /* Offset in pointer map page */ |  | 
|   744   int rc;           /* Return code from subfunctions */ |  | 
|   745  |  | 
|   746   if( *pRC ) return; |  | 
|   747  |  | 
|   748   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|   749   /* The master-journal page number must never be used as a pointer map page */ |  | 
|   750   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) ); |  | 
|   751  |  | 
|   752   assert( pBt->autoVacuum ); |  | 
|   753   if( key==0 ){ |  | 
|   754     *pRC = SQLITE_CORRUPT_BKPT; |  | 
|   755     return; |  | 
|   756   } |  | 
|   757   iPtrmap = PTRMAP_PAGENO(pBt, key); |  | 
|   758   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); |  | 
|   759   if( rc!=SQLITE_OK ){ |  | 
|   760     *pRC = rc; |  | 
|   761     return; |  | 
|   762   } |  | 
|   763   offset = PTRMAP_PTROFFSET(iPtrmap, key); |  | 
|   764   if( offset<0 ){ |  | 
|   765     *pRC = SQLITE_CORRUPT_BKPT; |  | 
|   766     goto ptrmap_exit; |  | 
|   767   } |  | 
|   768   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); |  | 
|   769  |  | 
|   770   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ |  | 
|   771     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent)); |  | 
|   772     *pRC= rc = sqlite3PagerWrite(pDbPage); |  | 
|   773     if( rc==SQLITE_OK ){ |  | 
|   774       pPtrmap[offset] = eType; |  | 
|   775       put4byte(&pPtrmap[offset+1], parent); |  | 
|   776     } |  | 
|   777   } |  | 
|   778  |  | 
|   779 ptrmap_exit: |  | 
|   780   sqlite3PagerUnref(pDbPage); |  | 
|   781 } |  | 
|   782  |  | 
|   783 /* |  | 
|   784 ** Read an entry from the pointer map. |  | 
|   785 ** |  | 
|   786 ** This routine retrieves the pointer map entry for page 'key', writing |  | 
|   787 ** the type and parent page number to *pEType and *pPgno respectively. |  | 
|   788 ** An error code is returned if something goes wrong, otherwise SQLITE_OK. |  | 
|   789 */ |  | 
|   790 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){ |  | 
|   791   DbPage *pDbPage;   /* The pointer map page */ |  | 
|   792   int iPtrmap;       /* Pointer map page index */ |  | 
|   793   u8 *pPtrmap;       /* Pointer map page data */ |  | 
|   794   int offset;        /* Offset of entry in pointer map */ |  | 
|   795   int rc; |  | 
|   796  |  | 
|   797   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|   798  |  | 
|   799   iPtrmap = PTRMAP_PAGENO(pBt, key); |  | 
|   800   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage); |  | 
|   801   if( rc!=0 ){ |  | 
|   802     return rc; |  | 
|   803   } |  | 
|   804   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage); |  | 
|   805  |  | 
|   806   offset = PTRMAP_PTROFFSET(iPtrmap, key); |  | 
|   807   assert( pEType!=0 ); |  | 
|   808   *pEType = pPtrmap[offset]; |  | 
|   809   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); |  | 
|   810  |  | 
|   811   sqlite3PagerUnref(pDbPage); |  | 
|   812   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; |  | 
|   813   return SQLITE_OK; |  | 
|   814 } |  | 
|   815  |  | 
|   816 #else /* if defined SQLITE_OMIT_AUTOVACUUM */ |  | 
|   817   #define ptrmapPut(w,x,y,z,rc) |  | 
|   818   #define ptrmapGet(w,x,y,z) SQLITE_OK |  | 
|   819   #define ptrmapPutOvflPtr(x, y, rc) |  | 
|   820 #endif |  | 
|   821  |  | 
|   822 /* |  | 
|   823 ** Given a btree page and a cell index (0 means the first cell on |  | 
|   824 ** the page, 1 means the second cell, and so forth) return a pointer |  | 
|   825 ** to the cell content. |  | 
|   826 ** |  | 
|   827 ** This routine works only for pages that do not contain overflow cells. |  | 
|   828 */ |  | 
#define findCell(pPg,iCell) \
  ((pPg)->aData + \
    ((pPg)->maskPage & get2byte(&(pPg)->aData[(pPg)->cellOffset+2*(iCell)])))
|   831  |  | 
|   832 /* |  | 
|   833 ** This a more complex version of findCell() that works for |  | 
|   834 ** pages that do contain overflow cells. |  | 
|   835 */ |  | 
|   836 static u8 *findOverflowCell(MemPage *pPage, int iCell){ |  | 
|   837   int i; |  | 
|   838   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|   839   for(i=pPage->nOverflow-1; i>=0; i--){ |  | 
|   840     int k; |  | 
|   841     struct _OvflCell *pOvfl; |  | 
|   842     pOvfl = &pPage->aOvfl[i]; |  | 
|   843     k = pOvfl->idx; |  | 
|   844     if( k<=iCell ){ |  | 
|   845       if( k==iCell ){ |  | 
|   846         return pOvfl->pCell; |  | 
|   847       } |  | 
|   848       iCell--; |  | 
|   849     } |  | 
|   850   } |  | 
|   851   return findCell(pPage, iCell); |  | 
|   852 } |  | 
|   853  |  | 
|   854 /* |  | 
|   855 ** Parse a cell content block and fill in the CellInfo structure.  There |  | 
|   856 ** are two versions of this function.  btreeParseCell() takes a  |  | 
|   857 ** cell index as the second argument and btreeParseCellPtr()  |  | 
|   858 ** takes a pointer to the body of the cell as its second argument. |  | 
|   859 ** |  | 
|   860 ** Within this file, the parseCell() macro can be called instead of |  | 
|   861 ** btreeParseCellPtr(). Using some compilers, this will be faster. |  | 
|   862 */ |  | 
|   863 static void btreeParseCellPtr( |  | 
|   864   MemPage *pPage,         /* Page containing the cell */ |  | 
|   865   u8 *pCell,              /* Pointer to the cell text. */ |  | 
|   866   CellInfo *pInfo         /* Fill in this structure */ |  | 
|   867 ){ |  | 
|   868   u16 n;                  /* Number bytes in cell content header */ |  | 
|   869   u32 nPayload;           /* Number of bytes of cell payload */ |  | 
|   870  |  | 
|   871   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|   872  |  | 
|   873   pInfo->pCell = pCell; |  | 
|   874   assert( pPage->leaf==0 || pPage->leaf==1 ); |  | 
|   875   n = pPage->childPtrSize; |  | 
|   876   assert( n==4-4*pPage->leaf ); |  | 
|   877   if( pPage->intKey ){ |  | 
|   878     if( pPage->hasData ){ |  | 
|   879       n += getVarint32(&pCell[n], nPayload); |  | 
|   880     }else{ |  | 
|   881       nPayload = 0; |  | 
|   882     } |  | 
|   883     n += getVarint(&pCell[n], (u64*)&pInfo->nKey); |  | 
|   884     pInfo->nData = nPayload; |  | 
|   885   }else{ |  | 
|   886     pInfo->nData = 0; |  | 
|   887     n += getVarint32(&pCell[n], nPayload); |  | 
|   888     pInfo->nKey = nPayload; |  | 
|   889   } |  | 
|   890   pInfo->nPayload = nPayload; |  | 
|   891   pInfo->nHeader = n; |  | 
|   892   testcase( nPayload==pPage->maxLocal ); |  | 
|   893   testcase( nPayload==pPage->maxLocal+1 ); |  | 
|   894   if( likely(nPayload<=pPage->maxLocal) ){ |  | 
|   895     /* This is the (easy) common case where the entire payload fits |  | 
|   896     ** on the local page.  No overflow is required. |  | 
|   897     */ |  | 
|   898     int nSize;          /* Total size of cell content in bytes */ |  | 
|   899     nSize = nPayload + n; |  | 
|   900     pInfo->nLocal = (u16)nPayload; |  | 
|   901     pInfo->iOverflow = 0; |  | 
|   902     if( (nSize & ~3)==0 ){ |  | 
|   903       nSize = 4;        /* Minimum cell size is 4 */ |  | 
|   904     } |  | 
|   905     pInfo->nSize = (u16)nSize; |  | 
|   906   }else{ |  | 
|   907     /* If the payload will not fit completely on the local page, we have |  | 
|   908     ** to decide how much to store locally and how much to spill onto |  | 
|   909     ** overflow pages.  The strategy is to minimize the amount of unused |  | 
|   910     ** space on overflow pages while keeping the amount of local storage |  | 
|   911     ** in between minLocal and maxLocal. |  | 
|   912     ** |  | 
|   913     ** Warning:  changing the way overflow payload is distributed in any |  | 
|   914     ** way will result in an incompatible file format. |  | 
|   915     */ |  | 
|   916     int minLocal;  /* Minimum amount of payload held locally */ |  | 
|   917     int maxLocal;  /* Maximum amount of payload held locally */ |  | 
|   918     int surplus;   /* Overflow payload available for local storage */ |  | 
|   919  |  | 
|   920     minLocal = pPage->minLocal; |  | 
|   921     maxLocal = pPage->maxLocal; |  | 
|   922     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4); |  | 
|   923     testcase( surplus==maxLocal ); |  | 
|   924     testcase( surplus==maxLocal+1 ); |  | 
|   925     if( surplus <= maxLocal ){ |  | 
|   926       pInfo->nLocal = (u16)surplus; |  | 
|   927     }else{ |  | 
|   928       pInfo->nLocal = (u16)minLocal; |  | 
|   929     } |  | 
|   930     pInfo->iOverflow = (u16)(pInfo->nLocal + n); |  | 
|   931     pInfo->nSize = pInfo->iOverflow + 4; |  | 
|   932   } |  | 
|   933 } |  | 
|   934 #define parseCell(pPage, iCell, pInfo) \ |  | 
|   935   btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo)) |  | 
|   936 static void btreeParseCell( |  | 
|   937   MemPage *pPage,         /* Page containing the cell */ |  | 
|   938   int iCell,              /* The cell index.  First cell is 0 */ |  | 
|   939   CellInfo *pInfo         /* Fill in this structure */ |  | 
|   940 ){ |  | 
|   941   parseCell(pPage, iCell, pInfo); |  | 
|   942 } |  | 
|   943  |  | 
|   944 /* |  | 
|   945 ** Compute the total number of bytes that a Cell needs in the cell |  | 
|   946 ** data area of the btree-page.  The return number includes the cell |  | 
|   947 ** data header and the local payload, but not any overflow page or |  | 
|   948 ** the space used by the cell pointer. |  | 
|   949 */ |  | 
|   950 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ |  | 
|   951   u8 *pIter = &pCell[pPage->childPtrSize]; |  | 
|   952   u32 nSize; |  | 
|   953  |  | 
|   954 #ifdef SQLITE_DEBUG |  | 
|   955   /* The value returned by this function should always be the same as |  | 
|   956   ** the (CellInfo.nSize) value found by doing a full parse of the |  | 
|   957   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of |  | 
|   958   ** this function verifies that this invariant is not violated. */ |  | 
|   959   CellInfo debuginfo; |  | 
|   960   btreeParseCellPtr(pPage, pCell, &debuginfo); |  | 
|   961 #endif |  | 
|   962  |  | 
|   963   if( pPage->intKey ){ |  | 
|   964     u8 *pEnd; |  | 
|   965     if( pPage->hasData ){ |  | 
|   966       pIter += getVarint32(pIter, nSize); |  | 
|   967     }else{ |  | 
|   968       nSize = 0; |  | 
|   969     } |  | 
|   970  |  | 
|   971     /* pIter now points at the 64-bit integer key value, a variable length  |  | 
|   972     ** integer. The following block moves pIter to point at the first byte |  | 
|   973     ** past the end of the key value. */ |  | 
|   974     pEnd = &pIter[9]; |  | 
|   975     while( (*pIter++)&0x80 && pIter<pEnd ); |  | 
|   976   }else{ |  | 
|   977     pIter += getVarint32(pIter, nSize); |  | 
|   978   } |  | 
|   979  |  | 
|   980   testcase( nSize==pPage->maxLocal ); |  | 
|   981   testcase( nSize==pPage->maxLocal+1 ); |  | 
|   982   if( nSize>pPage->maxLocal ){ |  | 
|   983     int minLocal = pPage->minLocal; |  | 
|   984     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4); |  | 
|   985     testcase( nSize==pPage->maxLocal ); |  | 
|   986     testcase( nSize==pPage->maxLocal+1 ); |  | 
|   987     if( nSize>pPage->maxLocal ){ |  | 
|   988       nSize = minLocal; |  | 
|   989     } |  | 
|   990     nSize += 4; |  | 
|   991   } |  | 
|   992   nSize += (u32)(pIter - pCell); |  | 
|   993  |  | 
|   994   /* The minimum size of any cell is 4 bytes. */ |  | 
|   995   if( nSize<4 ){ |  | 
|   996     nSize = 4; |  | 
|   997   } |  | 
|   998  |  | 
|   999   assert( nSize==debuginfo.nSize ); |  | 
|  1000   return (u16)nSize; |  | 
|  1001 } |  | 
|  1002 #ifndef NDEBUG |  | 
|  1003 static u16 cellSize(MemPage *pPage, int iCell){ |  | 
|  1004   return cellSizePtr(pPage, findCell(pPage, iCell)); |  | 
|  1005 } |  | 
|  1006 #endif |  | 
|  1007  |  | 
|  1008 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  1009 /* |  | 
|  1010 ** If the cell pCell, part of page pPage contains a pointer |  | 
|  1011 ** to an overflow page, insert an entry into the pointer-map |  | 
|  1012 ** for the overflow page. |  | 
|  1013 */ |  | 
|  1014 static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){ |  | 
|  1015   CellInfo info; |  | 
|  1016   if( *pRC ) return; |  | 
|  1017   assert( pCell!=0 ); |  | 
|  1018   btreeParseCellPtr(pPage, pCell, &info); |  | 
|  1019   assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); |  | 
|  1020   if( info.iOverflow ){ |  | 
|  1021     Pgno ovfl = get4byte(&pCell[info.iOverflow]); |  | 
|  1022     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC); |  | 
|  1023   } |  | 
|  1024 } |  | 
|  1025 #endif |  | 
|  1026  |  | 
|  1027  |  | 
|  1028 /* |  | 
|  1029 ** Defragment the page given.  All Cells are moved to the |  | 
|  1030 ** end of the page and all free space is collected into one |  | 
|  1031 ** big FreeBlk that occurs in between the header and cell |  | 
|  1032 ** pointer array and the cell content area. |  | 
|  1033 */ |  | 
|  1034 static int defragmentPage(MemPage *pPage){ |  | 
|  1035   int i;                     /* Loop counter */ |  | 
|  1036   int pc;                    /* Address of a i-th cell */ |  | 
|  1037   int hdr;                   /* Offset to the page header */ |  | 
|  1038   int size;                  /* Size of a cell */ |  | 
|  1039   int usableSize;            /* Number of usable bytes on a page */ |  | 
|  1040   int cellOffset;            /* Offset to the cell pointer array */ |  | 
|  1041   int cbrk;                  /* Offset to the cell content area */ |  | 
|  1042   int nCell;                 /* Number of cells on the page */ |  | 
|  1043   unsigned char *data;       /* The page data */ |  | 
|  1044   unsigned char *temp;       /* Temp area for cell content */ |  | 
|  1045   int iCellFirst;            /* First allowable cell index */ |  | 
|  1046   int iCellLast;             /* Last possible cell index */ |  | 
|  1047  |  | 
|  1048  |  | 
|  1049   assert( sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  1050   assert( pPage->pBt!=0 ); |  | 
|  1051   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE ); |  | 
|  1052   assert( pPage->nOverflow==0 ); |  | 
|  1053   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  1054   temp = sqlite3PagerTempSpace(pPage->pBt->pPager); |  | 
|  1055   data = pPage->aData; |  | 
|  1056   hdr = pPage->hdrOffset; |  | 
|  1057   cellOffset = pPage->cellOffset; |  | 
|  1058   nCell = pPage->nCell; |  | 
|  1059   assert( nCell==get2byte(&data[hdr+3]) ); |  | 
|  1060   usableSize = pPage->pBt->usableSize; |  | 
|  1061   cbrk = get2byte(&data[hdr+5]); |  | 
|  1062   memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk); |  | 
|  1063   cbrk = usableSize; |  | 
|  1064   iCellFirst = cellOffset + 2*nCell; |  | 
|  1065   iCellLast = usableSize - 4; |  | 
|  1066   for(i=0; i<nCell; i++){ |  | 
|  1067     u8 *pAddr;     /* The i-th cell pointer */ |  | 
|  1068     pAddr = &data[cellOffset + i*2]; |  | 
|  1069     pc = get2byte(pAddr); |  | 
|  1070     testcase( pc==iCellFirst ); |  | 
|  1071     testcase( pc==iCellLast ); |  | 
|  1072 #if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) |  | 
|  1073     /* These conditions have already been verified in btreeInitPage() |  | 
|  1074     ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined  |  | 
|  1075     */ |  | 
|  1076     if( pc<iCellFirst || pc>iCellLast ){ |  | 
|  1077       return SQLITE_CORRUPT_BKPT; |  | 
|  1078     } |  | 
|  1079 #endif |  | 
|  1080     assert( pc>=iCellFirst && pc<=iCellLast ); |  | 
|  1081     size = cellSizePtr(pPage, &temp[pc]); |  | 
|  1082     cbrk -= size; |  | 
|  1083 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) |  | 
|  1084     if( cbrk<iCellFirst ){ |  | 
|  1085       return SQLITE_CORRUPT_BKPT; |  | 
|  1086     } |  | 
|  1087 #else |  | 
|  1088     if( cbrk<iCellFirst || pc+size>usableSize ){ |  | 
|  1089       return SQLITE_CORRUPT_BKPT; |  | 
|  1090     } |  | 
|  1091 #endif |  | 
|  1092     assert( cbrk+size<=usableSize && cbrk>=iCellFirst ); |  | 
|  1093     testcase( cbrk+size==usableSize ); |  | 
|  1094     testcase( pc+size==usableSize ); |  | 
|  1095     memcpy(&data[cbrk], &temp[pc], size); |  | 
|  1096     put2byte(pAddr, cbrk); |  | 
|  1097   } |  | 
|  1098   assert( cbrk>=iCellFirst ); |  | 
|  1099   put2byte(&data[hdr+5], cbrk); |  | 
|  1100   data[hdr+1] = 0; |  | 
|  1101   data[hdr+2] = 0; |  | 
|  1102   data[hdr+7] = 0; |  | 
|  1103   memset(&data[iCellFirst], 0, cbrk-iCellFirst); |  | 
|  1104   assert( sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  1105   if( cbrk-iCellFirst!=pPage->nFree ){ |  | 
|  1106     return SQLITE_CORRUPT_BKPT; |  | 
|  1107   } |  | 
|  1108   return SQLITE_OK; |  | 
|  1109 } |  | 
|  1110  |  | 
|  1111 /* |  | 
|  1112 ** Allocate nByte bytes of space from within the B-Tree page passed |  | 
|  1113 ** as the first argument. Write into *pIdx the index into pPage->aData[] |  | 
|  1114 ** of the first byte of allocated space. Return either SQLITE_OK or |  | 
|  1115 ** an error code (usually SQLITE_CORRUPT). |  | 
|  1116 ** |  | 
|  1117 ** The caller guarantees that there is sufficient space to make the |  | 
|  1118 ** allocation.  This routine might need to defragment in order to bring |  | 
|  1119 ** all the space together, however.  This routine will avoid using |  | 
|  1120 ** the first two bytes past the cell pointer area since presumably this |  | 
|  1121 ** allocation is being made in order to insert a new cell, so we will |  | 
|  1122 ** also end up needing a new cell pointer. |  | 
|  1123 */ |  | 
|  1124 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){ |  | 
|  1125   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */ |  | 
|  1126   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */ |  | 
|  1127   int nFrag;                           /* Number of fragmented bytes on pPage */ |  | 
|  1128   int top;                             /* First byte of cell content area */ |  | 
|  1129   int gap;        /* First byte of gap between cell pointers and cell content */ |  | 
|  1130   int rc;         /* Integer return code */ |  | 
|  1131    |  | 
|  1132   assert( sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  1133   assert( pPage->pBt ); |  | 
|  1134   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  1135   assert( nByte>=0 );  /* Minimum cell size is 4 */ |  | 
|  1136   assert( pPage->nFree>=nByte ); |  | 
|  1137   assert( pPage->nOverflow==0 ); |  | 
|  1138   assert( nByte<pPage->pBt->usableSize-8 ); |  | 
|  1139  |  | 
|  1140   nFrag = data[hdr+7]; |  | 
|  1141   assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf ); |  | 
|  1142   gap = pPage->cellOffset + 2*pPage->nCell; |  | 
|  1143   top = get2byte(&data[hdr+5]); |  | 
|  1144   if( gap>top ) return SQLITE_CORRUPT_BKPT; |  | 
|  1145   testcase( gap+2==top ); |  | 
|  1146   testcase( gap+1==top ); |  | 
|  1147   testcase( gap==top ); |  | 
|  1148  |  | 
|  1149   if( nFrag>=60 ){ |  | 
|  1150     /* Always defragment highly fragmented pages */ |  | 
|  1151     rc = defragmentPage(pPage); |  | 
|  1152     if( rc ) return rc; |  | 
|  1153     top = get2byte(&data[hdr+5]); |  | 
|  1154   }else if( gap+2<=top ){ |  | 
|  1155     /* Search the freelist looking for a free slot big enough to satisfy  |  | 
|  1156     ** the request. The allocation is made from the first free slot in  |  | 
|  1157     ** the list that is large enough to accomadate it. |  | 
|  1158     */ |  | 
|  1159     int pc, addr; |  | 
|  1160     for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){ |  | 
|  1161       int size = get2byte(&data[pc+2]);     /* Size of free slot */ |  | 
|  1162       if( size>=nByte ){ |  | 
|  1163         int x = size - nByte; |  | 
|  1164         testcase( x==4 ); |  | 
|  1165         testcase( x==3 ); |  | 
|  1166         if( x<4 ){ |  | 
|  1167           /* Remove the slot from the free-list. Update the number of |  | 
|  1168           ** fragmented bytes within the page. */ |  | 
|  1169           memcpy(&data[addr], &data[pc], 2); |  | 
|  1170           data[hdr+7] = (u8)(nFrag + x); |  | 
|  1171         }else{ |  | 
|  1172           /* The slot remains on the free-list. Reduce its size to account |  | 
|  1173           ** for the portion used by the new allocation. */ |  | 
|  1174           put2byte(&data[pc+2], x); |  | 
|  1175         } |  | 
|  1176         *pIdx = pc + x; |  | 
|  1177         return SQLITE_OK; |  | 
|  1178       } |  | 
|  1179     } |  | 
|  1180   } |  | 
|  1181  |  | 
|  1182   /* Check to make sure there is enough space in the gap to satisfy |  | 
|  1183   ** the allocation.  If not, defragment. |  | 
|  1184   */ |  | 
|  1185   testcase( gap+2+nByte==top ); |  | 
|  1186   if( gap+2+nByte>top ){ |  | 
|  1187     rc = defragmentPage(pPage); |  | 
|  1188     if( rc ) return rc; |  | 
|  1189     top = get2byte(&data[hdr+5]); |  | 
|  1190     assert( gap+nByte<=top ); |  | 
|  1191   } |  | 
|  1192  |  | 
|  1193  |  | 
|  1194   /* Allocate memory from the gap in between the cell pointer array |  | 
|  1195   ** and the cell content area.  The btreeInitPage() call has already |  | 
|  1196   ** validated the freelist.  Given that the freelist is valid, there |  | 
|  1197   ** is no way that the allocation can extend off the end of the page. |  | 
|  1198   ** The assert() below verifies the previous sentence. |  | 
|  1199   */ |  | 
|  1200   top -= nByte; |  | 
|  1201   put2byte(&data[hdr+5], top); |  | 
|  1202   assert( top+nByte <= pPage->pBt->usableSize ); |  | 
|  1203   *pIdx = top; |  | 
|  1204   return SQLITE_OK; |  | 
|  1205 } |  | 
|  1206  |  | 
|  1207 /* |  | 
|  1208 ** Return a section of the pPage->aData to the freelist. |  | 
|  1209 ** The first byte of the new free block is pPage->aDisk[start] |  | 
|  1210 ** and the size of the block is "size" bytes. |  | 
|  1211 ** |  | 
|  1212 ** Most of the effort here is involved in coalesing adjacent |  | 
|  1213 ** free blocks into a single big free block. |  | 
|  1214 */ |  | 
|  1215 static int freeSpace(MemPage *pPage, int start, int size){ |  | 
|  1216   int addr, pbegin, hdr; |  | 
|  1217   int iLast;                        /* Largest possible freeblock offset */ |  | 
|  1218   unsigned char *data = pPage->aData; |  | 
|  1219  |  | 
|  1220   assert( pPage->pBt!=0 ); |  | 
|  1221   assert( sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  1222   assert( start>=pPage->hdrOffset+6+pPage->childPtrSize ); |  | 
|  1223   assert( (start + size)<=pPage->pBt->usableSize ); |  | 
|  1224   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  1225   assert( size>=0 );   /* Minimum cell size is 4 */ |  | 
|  1226  |  | 
|  1227 #ifdef SQLITE_SECURE_DELETE |  | 
|  1228   /* Overwrite deleted information with zeros when the SECURE_DELETE  |  | 
|  1229   ** option is enabled at compile-time */ |  | 
|  1230   memset(&data[start], 0, size); |  | 
|  1231 #endif |  | 
|  1232  |  | 
|  1233   /* Add the space back into the linked list of freeblocks.  Note that |  | 
|  1234   ** even though the freeblock list was checked by btreeInitPage(), |  | 
|  1235   ** btreeInitPage() did not detect overlapping cells or |  | 
|  1236   ** freeblocks that overlapped cells.   Nor does it detect when the |  | 
|  1237   ** cell content area exceeds the value in the page header.  If these |  | 
|  1238   ** situations arise, then subsequent insert operations might corrupt |  | 
|  1239   ** the freelist.  So we do need to check for corruption while scanning |  | 
|  1240   ** the freelist. |  | 
|  1241   */ |  | 
|  1242   hdr = pPage->hdrOffset; |  | 
|  1243   addr = hdr + 1; |  | 
|  1244   iLast = pPage->pBt->usableSize - 4; |  | 
|  1245   assert( start<=iLast ); |  | 
|  1246   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){ |  | 
|  1247     if( pbegin<addr+4 ){ |  | 
|  1248       return SQLITE_CORRUPT_BKPT; |  | 
|  1249     } |  | 
|  1250     addr = pbegin; |  | 
|  1251   } |  | 
|  1252   if( pbegin>iLast ){ |  | 
|  1253     return SQLITE_CORRUPT_BKPT; |  | 
|  1254   } |  | 
|  1255   assert( pbegin>addr || pbegin==0 ); |  | 
|  1256   put2byte(&data[addr], start); |  | 
|  1257   put2byte(&data[start], pbegin); |  | 
|  1258   put2byte(&data[start+2], size); |  | 
|  1259   pPage->nFree = pPage->nFree + (u16)size; |  | 
|  1260  |  | 
|  1261   /* Coalesce adjacent free blocks */ |  | 
|  1262   addr = hdr + 1; |  | 
|  1263   while( (pbegin = get2byte(&data[addr]))>0 ){ |  | 
|  1264     int pnext, psize, x; |  | 
|  1265     assert( pbegin>addr ); |  | 
|  1266     assert( pbegin<=pPage->pBt->usableSize-4 ); |  | 
|  1267     pnext = get2byte(&data[pbegin]); |  | 
|  1268     psize = get2byte(&data[pbegin+2]); |  | 
|  1269     if( pbegin + psize + 3 >= pnext && pnext>0 ){ |  | 
|  1270       int frag = pnext - (pbegin+psize); |  | 
|  1271       if( (frag<0) || (frag>(int)data[hdr+7]) ){ |  | 
|  1272         return SQLITE_CORRUPT_BKPT; |  | 
|  1273       } |  | 
|  1274       data[hdr+7] -= (u8)frag; |  | 
|  1275       x = get2byte(&data[pnext]); |  | 
|  1276       put2byte(&data[pbegin], x); |  | 
|  1277       x = pnext + get2byte(&data[pnext+2]) - pbegin; |  | 
|  1278       put2byte(&data[pbegin+2], x); |  | 
|  1279     }else{ |  | 
|  1280       addr = pbegin; |  | 
|  1281     } |  | 
|  1282   } |  | 
|  1283  |  | 
|  1284   /* If the cell content area begins with a freeblock, remove it. */ |  | 
|  1285   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){ |  | 
|  1286     int top; |  | 
|  1287     pbegin = get2byte(&data[hdr+1]); |  | 
|  1288     memcpy(&data[hdr+1], &data[pbegin], 2); |  | 
|  1289     top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]); |  | 
|  1290     put2byte(&data[hdr+5], top); |  | 
|  1291   } |  | 
|  1292   assert( sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  1293   return SQLITE_OK; |  | 
|  1294 } |  | 
|  1295  |  | 
|  1296 /* |  | 
|  1297 ** Decode the flags byte (the first byte of the header) for a page |  | 
|  1298 ** and initialize fields of the MemPage structure accordingly. |  | 
|  1299 ** |  | 
|  1300 ** Only the following combinations are supported.  Anything different |  | 
|  1301 ** indicates a corrupt database files: |  | 
|  1302 ** |  | 
|  1303 **         PTF_ZERODATA |  | 
|  1304 **         PTF_ZERODATA | PTF_LEAF |  | 
|  1305 **         PTF_LEAFDATA | PTF_INTKEY |  | 
|  1306 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF |  | 
|  1307 */ |  | 
|  1308 static int decodeFlags(MemPage *pPage, int flagByte){ |  | 
|  1309   BtShared *pBt;     /* A copy of pPage->pBt */ |  | 
|  1310  |  | 
|  1311   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) ); |  | 
|  1312   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  1313   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 ); |  | 
|  1314   flagByte &= ~PTF_LEAF; |  | 
|  1315   pPage->childPtrSize = 4-4*pPage->leaf; |  | 
|  1316   pBt = pPage->pBt; |  | 
|  1317   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){ |  | 
|  1318     pPage->intKey = 1; |  | 
|  1319     pPage->hasData = pPage->leaf; |  | 
|  1320     pPage->maxLocal = pBt->maxLeaf; |  | 
|  1321     pPage->minLocal = pBt->minLeaf; |  | 
|  1322   }else if( flagByte==PTF_ZERODATA ){ |  | 
|  1323     pPage->intKey = 0; |  | 
|  1324     pPage->hasData = 0; |  | 
|  1325     pPage->maxLocal = pBt->maxLocal; |  | 
|  1326     pPage->minLocal = pBt->minLocal; |  | 
|  1327   }else{ |  | 
|  1328     return SQLITE_CORRUPT_BKPT; |  | 
|  1329   } |  | 
|  1330   return SQLITE_OK; |  | 
|  1331 } |  | 
|  1332  |  | 
|  1333 /* |  | 
|  1334 ** Initialize the auxiliary information for a disk block. |  | 
|  1335 ** |  | 
|  1336 ** Return SQLITE_OK on success.  If we see that the page does |  | 
|  1337 ** not contain a well-formed database page, then return  |  | 
|  1338 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not |  | 
|  1339 ** guarantee that the page is well-formed.  It only shows that |  | 
|  1340 ** we failed to detect any corruption. |  | 
|  1341 */ |  | 
|  1342 static int btreeInitPage(MemPage *pPage){ |  | 
|  1343  |  | 
|  1344   assert( pPage->pBt!=0 ); |  | 
|  1345   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  1346   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) ); |  | 
|  1347   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) ); |  | 
|  1348   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) ); |  | 
|  1349  |  | 
|  1350   if( !pPage->isInit ){ |  | 
|  1351     u16 pc;            /* Address of a freeblock within pPage->aData[] */ |  | 
|  1352     u8 hdr;            /* Offset to beginning of page header */ |  | 
|  1353     u8 *data;          /* Equal to pPage->aData */ |  | 
|  1354     BtShared *pBt;        /* The main btree structure */ |  | 
|  1355     u16 usableSize;    /* Amount of usable space on each page */ |  | 
|  1356     u16 cellOffset;    /* Offset from start of page to first cell pointer */ |  | 
|  1357     u16 nFree;         /* Number of unused bytes on the page */ |  | 
|  1358     u16 top;           /* First byte of the cell content area */ |  | 
|  1359     int iCellFirst;    /* First allowable cell or freeblock offset */ |  | 
|  1360     int iCellLast;     /* Last possible cell or freeblock offset */ |  | 
|  1361  |  | 
|  1362     pBt = pPage->pBt; |  | 
|  1363  |  | 
|  1364     hdr = pPage->hdrOffset; |  | 
|  1365     data = pPage->aData; |  | 
|  1366     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT; |  | 
|  1367     assert( pBt->pageSize>=512 && pBt->pageSize<=32768 ); |  | 
|  1368     pPage->maskPage = pBt->pageSize - 1; |  | 
|  1369     pPage->nOverflow = 0; |  | 
|  1370     usableSize = pBt->usableSize; |  | 
|  1371     pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; |  | 
|  1372     top = get2byte(&data[hdr+5]); |  | 
|  1373     pPage->nCell = get2byte(&data[hdr+3]); |  | 
|  1374     if( pPage->nCell>MX_CELL(pBt) ){ |  | 
|  1375       /* To many cells for a single page.  The page must be corrupt */ |  | 
|  1376       return SQLITE_CORRUPT_BKPT; |  | 
|  1377     } |  | 
|  1378     testcase( pPage->nCell==MX_CELL(pBt) ); |  | 
|  1379  |  | 
|  1380     /* A malformed database page might cause us to read past the end |  | 
|  1381     ** of page when parsing a cell.   |  | 
|  1382     ** |  | 
|  1383     ** The following block of code checks early to see if a cell extends |  | 
|  1384     ** past the end of a page boundary and causes SQLITE_CORRUPT to be  |  | 
|  1385     ** returned if it does. |  | 
|  1386     */ |  | 
|  1387     iCellFirst = cellOffset + 2*pPage->nCell; |  | 
|  1388     iCellLast = usableSize - 4; |  | 
|  1389 #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK) |  | 
|  1390     { |  | 
|  1391       int i;            /* Index into the cell pointer array */ |  | 
|  1392       int sz;           /* Size of a cell */ |  | 
|  1393  |  | 
|  1394       if( !pPage->leaf ) iCellLast--; |  | 
|  1395       for(i=0; i<pPage->nCell; i++){ |  | 
|  1396         pc = get2byte(&data[cellOffset+i*2]); |  | 
|  1397         testcase( pc==iCellFirst ); |  | 
|  1398         testcase( pc==iCellLast ); |  | 
|  1399         if( pc<iCellFirst || pc>iCellLast ){ |  | 
|  1400           return SQLITE_CORRUPT_BKPT; |  | 
|  1401         } |  | 
|  1402         sz = cellSizePtr(pPage, &data[pc]); |  | 
|  1403         testcase( pc+sz==usableSize ); |  | 
|  1404         if( pc+sz>usableSize ){ |  | 
|  1405           return SQLITE_CORRUPT_BKPT; |  | 
|  1406         } |  | 
|  1407       } |  | 
|  1408       if( !pPage->leaf ) iCellLast++; |  | 
|  1409     }   |  | 
|  1410 #endif |  | 
|  1411  |  | 
|  1412     /* Compute the total free space on the page */ |  | 
|  1413     pc = get2byte(&data[hdr+1]); |  | 
|  1414     nFree = data[hdr+7] + top; |  | 
|  1415     while( pc>0 ){ |  | 
|  1416       u16 next, size; |  | 
|  1417       if( pc<iCellFirst || pc>iCellLast ){ |  | 
|  1418         /* Start of free block is off the page */ |  | 
|  1419         return SQLITE_CORRUPT_BKPT;  |  | 
|  1420       } |  | 
|  1421       next = get2byte(&data[pc]); |  | 
|  1422       size = get2byte(&data[pc+2]); |  | 
|  1423       if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){ |  | 
|  1424         /* Free blocks must be in ascending order. And the last byte of |  | 
|  1425         ** the free-block must lie on the database page.  */ |  | 
|  1426         return SQLITE_CORRUPT_BKPT;  |  | 
|  1427       } |  | 
|  1428       nFree = nFree + size; |  | 
|  1429       pc = next; |  | 
|  1430     } |  | 
|  1431  |  | 
|  1432     /* At this point, nFree contains the sum of the offset to the start |  | 
|  1433     ** of the cell-content area plus the number of free bytes within |  | 
|  1434     ** the cell-content area. If this is greater than the usable-size |  | 
|  1435     ** of the page, then the page must be corrupted. This check also |  | 
|  1436     ** serves to verify that the offset to the start of the cell-content |  | 
|  1437     ** area, according to the page header, lies within the page. |  | 
|  1438     */ |  | 
|  1439     if( nFree>usableSize ){ |  | 
|  1440       return SQLITE_CORRUPT_BKPT;  |  | 
|  1441     } |  | 
|  1442     pPage->nFree = (u16)(nFree - iCellFirst); |  | 
|  1443     pPage->isInit = 1; |  | 
|  1444   } |  | 
|  1445   return SQLITE_OK; |  | 
|  1446 } |  | 
|  1447  |  | 
|  1448 /* |  | 
|  1449 ** Set up a raw page so that it looks like a database page holding |  | 
|  1450 ** no entries. |  | 
|  1451 */ |  | 
|  1452 static void zeroPage(MemPage *pPage, int flags){ |  | 
|  1453   unsigned char *data = pPage->aData; |  | 
|  1454   BtShared *pBt = pPage->pBt; |  | 
|  1455   u8 hdr = pPage->hdrOffset; |  | 
|  1456   u16 first; |  | 
|  1457  |  | 
|  1458   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno ); |  | 
|  1459   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); |  | 
|  1460   assert( sqlite3PagerGetData(pPage->pDbPage) == data ); |  | 
|  1461   assert( sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  1462   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  1463   /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/ |  | 
|  1464   data[hdr] = (char)flags; |  | 
|  1465   first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0); |  | 
|  1466   memset(&data[hdr+1], 0, 4); |  | 
|  1467   data[hdr+7] = 0; |  | 
|  1468   put2byte(&data[hdr+5], pBt->usableSize); |  | 
|  1469   pPage->nFree = pBt->usableSize - first; |  | 
|  1470   decodeFlags(pPage, flags); |  | 
|  1471   pPage->hdrOffset = hdr; |  | 
|  1472   pPage->cellOffset = first; |  | 
|  1473   pPage->nOverflow = 0; |  | 
|  1474   assert( pBt->pageSize>=512 && pBt->pageSize<=32768 ); |  | 
|  1475   pPage->maskPage = pBt->pageSize - 1; |  | 
|  1476   pPage->nCell = 0; |  | 
|  1477   pPage->isInit = 1; |  | 
|  1478 } |  | 
|  1479  |  | 
|  1480  |  | 
|  1481 /* |  | 
|  1482 ** Convert a DbPage obtained from the pager into a MemPage used by |  | 
|  1483 ** the btree layer. |  | 
|  1484 */ |  | 
|  1485 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){ |  | 
|  1486   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage); |  | 
|  1487   pPage->aData = sqlite3PagerGetData(pDbPage); |  | 
|  1488   pPage->pDbPage = pDbPage; |  | 
|  1489   pPage->pBt = pBt; |  | 
|  1490   pPage->pgno = pgno; |  | 
|  1491   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; |  | 
|  1492   return pPage;  |  | 
|  1493 } |  | 
|  1494  |  | 
|  1495 /* |  | 
|  1496 ** Get a page from the pager.  Initialize the MemPage.pBt and |  | 
|  1497 ** MemPage.aData elements if needed. |  | 
|  1498 ** |  | 
|  1499 ** If the noContent flag is set, it means that we do not care about |  | 
|  1500 ** the content of the page at this time.  So do not go to the disk |  | 
|  1501 ** to fetch the content.  Just fill in the content with zeros for now. |  | 
|  1502 ** If in the future we call sqlite3PagerWrite() on this page, that |  | 
|  1503 ** means we have started to be concerned about content and the disk |  | 
|  1504 ** read should occur at that point. |  | 
|  1505 */ |  | 
|  1506 static int btreeGetPage( |  | 
|  1507   BtShared *pBt,       /* The btree */ |  | 
|  1508   Pgno pgno,           /* Number of the page to fetch */ |  | 
|  1509   MemPage **ppPage,    /* Return the page in this parameter */ |  | 
|  1510   int noContent        /* Do not load page content if true */ |  | 
|  1511 ){ |  | 
|  1512   int rc; |  | 
|  1513   DbPage *pDbPage; |  | 
|  1514  |  | 
|  1515   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  1516   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent); |  | 
|  1517   if( rc ) return rc; |  | 
|  1518   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt); |  | 
|  1519   return SQLITE_OK; |  | 
|  1520 } |  | 
|  1521  |  | 
|  1522 /* |  | 
|  1523 ** Retrieve a page from the pager cache. If the requested page is not |  | 
|  1524 ** already in the pager cache return NULL. Initialize the MemPage.pBt and |  | 
|  1525 ** MemPage.aData elements if needed. |  | 
|  1526 */ |  | 
|  1527 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){ |  | 
|  1528   DbPage *pDbPage; |  | 
|  1529   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  1530   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno); |  | 
|  1531   if( pDbPage ){ |  | 
|  1532     return btreePageFromDbPage(pDbPage, pgno, pBt); |  | 
|  1533   } |  | 
|  1534   return 0; |  | 
|  1535 } |  | 
|  1536  |  | 
|  1537 /* |  | 
|  1538 ** Return the size of the database file in pages. If there is any kind of |  | 
|  1539 ** error, return ((unsigned int)-1). |  | 
|  1540 */ |  | 
|  1541 static Pgno pagerPagecount(BtShared *pBt){ |  | 
|  1542   int nPage = -1; |  | 
|  1543   int rc; |  | 
|  1544   assert( pBt->pPage1 ); |  | 
|  1545   rc = sqlite3PagerPagecount(pBt->pPager, &nPage); |  | 
|  1546   assert( rc==SQLITE_OK || nPage==-1 ); |  | 
|  1547   return (Pgno)nPage; |  | 
|  1548 } |  | 
|  1549  |  | 
|  1550 /* |  | 
|  1551 ** Get a page from the pager and initialize it.  This routine is just a |  | 
|  1552 ** convenience wrapper around separate calls to btreeGetPage() and  |  | 
|  1553 ** btreeInitPage(). |  | 
|  1554 ** |  | 
|  1555 ** If an error occurs, then the value *ppPage is set to is undefined. It |  | 
|  1556 ** may remain unchanged, or it may be set to an invalid value. |  | 
|  1557 */ |  | 
|  1558 static int getAndInitPage( |  | 
|  1559   BtShared *pBt,          /* The database file */ |  | 
|  1560   Pgno pgno,           /* Number of the page to get */ |  | 
|  1561   MemPage **ppPage     /* Write the page pointer here */ |  | 
|  1562 ){ |  | 
|  1563   int rc; |  | 
|  1564   TESTONLY( Pgno iLastPg = pagerPagecount(pBt); ) |  | 
|  1565   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  1566  |  | 
|  1567   rc = btreeGetPage(pBt, pgno, ppPage, 0); |  | 
|  1568   if( rc==SQLITE_OK ){ |  | 
|  1569     rc = btreeInitPage(*ppPage); |  | 
|  1570     if( rc!=SQLITE_OK ){ |  | 
|  1571       releasePage(*ppPage); |  | 
|  1572     } |  | 
|  1573   } |  | 
|  1574  |  | 
|  1575   /* If the requested page number was either 0 or greater than the page |  | 
|  1576   ** number of the last page in the database, this function should return |  | 
|  1577   ** SQLITE_CORRUPT or some other error (i.e. SQLITE_FULL). Check that this |  | 
|  1578   ** is the case.  */ |  | 
|  1579   assert( (pgno>0 && pgno<=iLastPg) || rc!=SQLITE_OK ); |  | 
|  1580   testcase( pgno==0 ); |  | 
|  1581   testcase( pgno==iLastPg ); |  | 
|  1582  |  | 
|  1583   return rc; |  | 
|  1584 } |  | 
|  1585  |  | 
|  1586 /* |  | 
|  1587 ** Release a MemPage.  This should be called once for each prior |  | 
|  1588 ** call to btreeGetPage. |  | 
|  1589 */ |  | 
|  1590 static void releasePage(MemPage *pPage){ |  | 
|  1591   if( pPage ){ |  | 
|  1592     assert( pPage->nOverflow==0 || sqlite3PagerPageRefcount(pPage->pDbPage)>1 ); |  | 
|  1593     assert( pPage->aData ); |  | 
|  1594     assert( pPage->pBt ); |  | 
|  1595     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage ); |  | 
|  1596     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData ); |  | 
|  1597     assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  1598     sqlite3PagerUnref(pPage->pDbPage); |  | 
|  1599   } |  | 
|  1600 } |  | 
|  1601  |  | 
|  1602 /* |  | 
|  1603 ** During a rollback, when the pager reloads information into the cache |  | 
|  1604 ** so that the cache is restored to its original state at the start of |  | 
|  1605 ** the transaction, for each page restored this routine is called. |  | 
|  1606 ** |  | 
|  1607 ** This routine needs to reset the extra data section at the end of the |  | 
|  1608 ** page to agree with the restored data. |  | 
|  1609 */ |  | 
|  1610 static void pageReinit(DbPage *pData){ |  | 
|  1611   MemPage *pPage; |  | 
|  1612   pPage = (MemPage *)sqlite3PagerGetExtra(pData); |  | 
|  1613   assert( sqlite3PagerPageRefcount(pData)>0 ); |  | 
|  1614   if( pPage->isInit ){ |  | 
|  1615     assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  1616     pPage->isInit = 0; |  | 
|  1617     if( sqlite3PagerPageRefcount(pData)>1 ){ |  | 
|  1618       /* pPage might not be a btree page;  it might be an overflow page |  | 
|  1619       ** or ptrmap page or a free page.  In those cases, the following |  | 
|  1620       ** call to btreeInitPage() will likely return SQLITE_CORRUPT. |  | 
|  1621       ** But no harm is done by this.  And it is very important that |  | 
|  1622       ** btreeInitPage() be called on every btree page so we make |  | 
|  1623       ** the call for every page that comes in for re-initing. */ |  | 
|  1624       btreeInitPage(pPage); |  | 
|  1625     } |  | 
|  1626   } |  | 
|  1627 } |  | 
|  1628  |  | 
|  1629 /* |  | 
|  1630 ** Invoke the busy handler for a btree. |  | 
|  1631 */ |  | 
|  1632 static int btreeInvokeBusyHandler(void *pArg){ |  | 
|  1633   BtShared *pBt = (BtShared*)pArg; |  | 
|  1634   assert( pBt->db ); |  | 
|  1635   assert( sqlite3_mutex_held(pBt->db->mutex) ); |  | 
|  1636   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); |  | 
|  1637 } |  | 
|  1638  |  | 
|  1639 /* |  | 
|  1640 ** Open a database file. |  | 
|  1641 **  |  | 
|  1642 ** zFilename is the name of the database file.  If zFilename is NULL |  | 
|  1643 ** a new database with a random name is created.  This randomly named |  | 
|  1644 ** database file will be deleted when sqlite3BtreeClose() is called. |  | 
|  1645 ** If zFilename is ":memory:" then an in-memory database is created |  | 
|  1646 ** that is automatically destroyed when it is closed. |  | 
|  1647 ** |  | 
|  1648 ** If the database is already opened in the same database connection |  | 
|  1649 ** and we are in shared cache mode, then the open will fail with an |  | 
|  1650 ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared |  | 
|  1651 ** objects in the same database connection since doing so will lead |  | 
|  1652 ** to problems with locking. |  | 
|  1653 */ |  | 
|  1654 int sqlite3BtreeOpen( |  | 
|  1655   const char *zFilename,  /* Name of the file containing the BTree database */ |  | 
|  1656   sqlite3 *db,            /* Associated database handle */ |  | 
|  1657   Btree **ppBtree,        /* Pointer to new Btree object written here */ |  | 
|  1658   int flags,              /* Options */ |  | 
|  1659   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */ |  | 
|  1660 ){ |  | 
|  1661   sqlite3_vfs *pVfs;             /* The VFS to use for this btree */ |  | 
|  1662   BtShared *pBt = 0;             /* Shared part of btree structure */ |  | 
|  1663   Btree *p;                      /* Handle to return */ |  | 
|  1664   sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */ |  | 
|  1665   int rc = SQLITE_OK;            /* Result code from this function */ |  | 
|  1666   u8 nReserve;                   /* Byte of unused space on each page */ |  | 
|  1667   unsigned char zDbHeader[100];  /* Database header content */ |  | 
|  1668  |  | 
|  1669   /* Set the variable isMemdb to true for an in-memory database, or  |  | 
|  1670   ** false for a file-based database. This symbol is only required if |  | 
|  1671   ** either of the shared-data or autovacuum features are compiled  |  | 
|  1672   ** into the library. |  | 
|  1673   */ |  | 
|  1674 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM) |  | 
|  1675   #ifdef SQLITE_OMIT_MEMORYDB |  | 
|  1676     const int isMemdb = 0; |  | 
|  1677   #else |  | 
|  1678     const int isMemdb = zFilename && !strcmp(zFilename, ":memory:"); |  | 
|  1679   #endif |  | 
|  1680 #endif |  | 
|  1681  |  | 
|  1682   assert( db!=0 ); |  | 
|  1683   assert( sqlite3_mutex_held(db->mutex) ); |  | 
|  1684  |  | 
|  1685   pVfs = db->pVfs; |  | 
|  1686   p = sqlite3MallocZero(sizeof(Btree)); |  | 
|  1687   if( !p ){ |  | 
|  1688     return SQLITE_NOMEM; |  | 
|  1689   } |  | 
|  1690   p->inTrans = TRANS_NONE; |  | 
|  1691   p->db = db; |  | 
|  1692 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|  1693   p->lock.pBtree = p; |  | 
|  1694   p->lock.iTable = 1; |  | 
|  1695 #endif |  | 
|  1696  |  | 
|  1697 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) |  | 
|  1698   /* |  | 
|  1699   ** If this Btree is a candidate for shared cache, try to find an |  | 
|  1700   ** existing BtShared object that we can share with |  | 
|  1701   */ |  | 
|  1702   if( isMemdb==0 && zFilename && zFilename[0] ){ |  | 
|  1703     if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){ |  | 
|  1704       int nFullPathname = pVfs->mxPathname+1; |  | 
|  1705       char *zFullPathname = sqlite3Malloc(nFullPathname); |  | 
|  1706       sqlite3_mutex *mutexShared; |  | 
|  1707       p->sharable = 1; |  | 
|  1708       if( !zFullPathname ){ |  | 
|  1709         sqlite3_free(p); |  | 
|  1710         return SQLITE_NOMEM; |  | 
|  1711       } |  | 
|  1712       sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname); |  | 
|  1713       mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN); |  | 
|  1714       sqlite3_mutex_enter(mutexOpen); |  | 
|  1715       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); |  | 
|  1716       sqlite3_mutex_enter(mutexShared); |  | 
|  1717       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){ |  | 
|  1718         assert( pBt->nRef>0 ); |  | 
|  1719         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager)) |  | 
|  1720                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){ |  | 
|  1721           int iDb; |  | 
|  1722           for(iDb=db->nDb-1; iDb>=0; iDb--){ |  | 
|  1723             Btree *pExisting = db->aDb[iDb].pBt; |  | 
|  1724             if( pExisting && pExisting->pBt==pBt ){ |  | 
|  1725               sqlite3_mutex_leave(mutexShared); |  | 
|  1726               sqlite3_mutex_leave(mutexOpen); |  | 
|  1727               sqlite3_free(zFullPathname); |  | 
|  1728               sqlite3_free(p); |  | 
|  1729               return SQLITE_CONSTRAINT; |  | 
|  1730             } |  | 
|  1731           } |  | 
|  1732           p->pBt = pBt; |  | 
|  1733           pBt->nRef++; |  | 
|  1734           break; |  | 
|  1735         } |  | 
|  1736       } |  | 
|  1737       sqlite3_mutex_leave(mutexShared); |  | 
|  1738       sqlite3_free(zFullPathname); |  | 
|  1739     } |  | 
|  1740 #ifdef SQLITE_DEBUG |  | 
|  1741     else{ |  | 
|  1742       /* In debug mode, we mark all persistent databases as sharable |  | 
|  1743       ** even when they are not.  This exercises the locking code and |  | 
|  1744       ** gives more opportunity for asserts(sqlite3_mutex_held()) |  | 
|  1745       ** statements to find locking problems. |  | 
|  1746       */ |  | 
|  1747       p->sharable = 1; |  | 
|  1748     } |  | 
|  1749 #endif |  | 
|  1750   } |  | 
|  1751 #endif |  | 
|  1752   if( pBt==0 ){ |  | 
|  1753     /* |  | 
|  1754     ** The following asserts make sure that structures used by the btree are |  | 
|  1755     ** the right size.  This is to guard against size changes that result |  | 
|  1756     ** when compiling on a different architecture. |  | 
|  1757     */ |  | 
|  1758     assert( sizeof(i64)==8 || sizeof(i64)==4 ); |  | 
|  1759     assert( sizeof(u64)==8 || sizeof(u64)==4 ); |  | 
|  1760     assert( sizeof(u32)==4 ); |  | 
|  1761     assert( sizeof(u16)==2 ); |  | 
|  1762     assert( sizeof(Pgno)==4 ); |  | 
|  1763    |  | 
|  1764     pBt = sqlite3MallocZero( sizeof(*pBt) ); |  | 
|  1765     if( pBt==0 ){ |  | 
|  1766       rc = SQLITE_NOMEM; |  | 
|  1767       goto btree_open_out; |  | 
|  1768     } |  | 
|  1769     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename, |  | 
|  1770                           EXTRA_SIZE, flags, vfsFlags, pageReinit); |  | 
|  1771     if( rc==SQLITE_OK ){ |  | 
|  1772       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader); |  | 
|  1773     } |  | 
|  1774     if( rc!=SQLITE_OK ){ |  | 
|  1775       goto btree_open_out; |  | 
|  1776     } |  | 
|  1777     pBt->db = db; |  | 
|  1778     sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt); |  | 
|  1779     p->pBt = pBt; |  | 
|  1780    |  | 
|  1781     pBt->pCursor = 0; |  | 
|  1782     pBt->pPage1 = 0; |  | 
|  1783     pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager); |  | 
|  1784     pBt->pageSize = get2byte(&zDbHeader[16]); |  | 
|  1785     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE |  | 
|  1786          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){ |  | 
|  1787       pBt->pageSize = 0; |  | 
|  1788 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  1789       /* If the magic name ":memory:" will create an in-memory database, then |  | 
|  1790       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if |  | 
|  1791       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if |  | 
|  1792       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a |  | 
|  1793       ** regular file-name. In this case the auto-vacuum applies as per normal. |  | 
|  1794       */ |  | 
|  1795       if( zFilename && !isMemdb ){ |  | 
|  1796         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0); |  | 
|  1797         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0); |  | 
|  1798       } |  | 
|  1799 #endif |  | 
|  1800       nReserve = 0; |  | 
|  1801     }else{ |  | 
|  1802       nReserve = zDbHeader[20]; |  | 
|  1803       pBt->pageSizeFixed = 1; |  | 
|  1804 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  1805       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0); |  | 
|  1806       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0); |  | 
|  1807 #endif |  | 
|  1808     } |  | 
|  1809     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); |  | 
|  1810     if( rc ) goto btree_open_out; |  | 
|  1811     pBt->usableSize = pBt->pageSize - nReserve; |  | 
|  1812     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */ |  | 
|  1813     |  | 
|  1814 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) |  | 
|  1815     /* Add the new BtShared object to the linked list sharable BtShareds. |  | 
|  1816     */ |  | 
|  1817     if( p->sharable ){ |  | 
|  1818       sqlite3_mutex *mutexShared; |  | 
|  1819       pBt->nRef = 1; |  | 
|  1820       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); |  | 
|  1821       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){ |  | 
|  1822         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST); |  | 
|  1823         if( pBt->mutex==0 ){ |  | 
|  1824           rc = SQLITE_NOMEM; |  | 
|  1825           db->mallocFailed = 0; |  | 
|  1826           goto btree_open_out; |  | 
|  1827         } |  | 
|  1828       } |  | 
|  1829       sqlite3_mutex_enter(mutexShared); |  | 
|  1830       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList); |  | 
|  1831       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt; |  | 
|  1832       sqlite3_mutex_leave(mutexShared); |  | 
|  1833     } |  | 
|  1834 #endif |  | 
|  1835   } |  | 
|  1836  |  | 
|  1837 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO) |  | 
|  1838   /* If the new Btree uses a sharable pBtShared, then link the new |  | 
|  1839   ** Btree into the list of all sharable Btrees for the same connection. |  | 
|  1840   ** The list is kept in ascending order by pBt address. |  | 
|  1841   */ |  | 
|  1842   if( p->sharable ){ |  | 
|  1843     int i; |  | 
|  1844     Btree *pSib; |  | 
|  1845     for(i=0; i<db->nDb; i++){ |  | 
|  1846       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){ |  | 
|  1847         while( pSib->pPrev ){ pSib = pSib->pPrev; } |  | 
|  1848         if( p->pBt<pSib->pBt ){ |  | 
|  1849           p->pNext = pSib; |  | 
|  1850           p->pPrev = 0; |  | 
|  1851           pSib->pPrev = p; |  | 
|  1852         }else{ |  | 
|  1853           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){ |  | 
|  1854             pSib = pSib->pNext; |  | 
|  1855           } |  | 
|  1856           p->pNext = pSib->pNext; |  | 
|  1857           p->pPrev = pSib; |  | 
|  1858           if( p->pNext ){ |  | 
|  1859             p->pNext->pPrev = p; |  | 
|  1860           } |  | 
|  1861           pSib->pNext = p; |  | 
|  1862         } |  | 
|  1863         break; |  | 
|  1864       } |  | 
|  1865     } |  | 
|  1866   } |  | 
|  1867 #endif |  | 
|  1868   *ppBtree = p; |  | 
|  1869  |  | 
|  1870 btree_open_out: |  | 
|  1871   if( rc!=SQLITE_OK ){ |  | 
|  1872     if( pBt && pBt->pPager ){ |  | 
|  1873       sqlite3PagerClose(pBt->pPager); |  | 
|  1874     } |  | 
|  1875     sqlite3_free(pBt); |  | 
|  1876     sqlite3_free(p); |  | 
|  1877     *ppBtree = 0; |  | 
|  1878   } |  | 
|  1879   if( mutexOpen ){ |  | 
|  1880     assert( sqlite3_mutex_held(mutexOpen) ); |  | 
|  1881     sqlite3_mutex_leave(mutexOpen); |  | 
|  1882   } |  | 
|  1883   return rc; |  | 
|  1884 } |  | 
|  1885  |  | 
|  1886 /* |  | 
|  1887 ** Decrement the BtShared.nRef counter.  When it reaches zero, |  | 
|  1888 ** remove the BtShared structure from the sharing list.  Return |  | 
|  1889 ** true if the BtShared.nRef counter reaches zero and return |  | 
|  1890 ** false if it is still positive. |  | 
|  1891 */ |  | 
|  1892 static int removeFromSharingList(BtShared *pBt){ |  | 
|  1893 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|  1894   sqlite3_mutex *pMaster; |  | 
|  1895   BtShared *pList; |  | 
|  1896   int removed = 0; |  | 
|  1897  |  | 
|  1898   assert( sqlite3_mutex_notheld(pBt->mutex) ); |  | 
|  1899   pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); |  | 
|  1900   sqlite3_mutex_enter(pMaster); |  | 
|  1901   pBt->nRef--; |  | 
|  1902   if( pBt->nRef<=0 ){ |  | 
|  1903     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){ |  | 
|  1904       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext; |  | 
|  1905     }else{ |  | 
|  1906       pList = GLOBAL(BtShared*,sqlite3SharedCacheList); |  | 
|  1907       while( ALWAYS(pList) && pList->pNext!=pBt ){ |  | 
|  1908         pList=pList->pNext; |  | 
|  1909       } |  | 
|  1910       if( ALWAYS(pList) ){ |  | 
|  1911         pList->pNext = pBt->pNext; |  | 
|  1912       } |  | 
|  1913     } |  | 
|  1914     if( SQLITE_THREADSAFE ){ |  | 
|  1915       sqlite3_mutex_free(pBt->mutex); |  | 
|  1916     } |  | 
|  1917     removed = 1; |  | 
|  1918   } |  | 
|  1919   sqlite3_mutex_leave(pMaster); |  | 
|  1920   return removed; |  | 
|  1921 #else |  | 
|  1922   return 1; |  | 
|  1923 #endif |  | 
|  1924 } |  | 
|  1925  |  | 
|  1926 /* |  | 
|  1927 ** Make sure pBt->pTmpSpace points to an allocation of  |  | 
|  1928 ** MX_CELL_SIZE(pBt) bytes. |  | 
|  1929 */ |  | 
|  1930 static void allocateTempSpace(BtShared *pBt){ |  | 
|  1931   if( !pBt->pTmpSpace ){ |  | 
|  1932     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); |  | 
|  1933   } |  | 
|  1934 } |  | 
|  1935  |  | 
|  1936 /* |  | 
|  1937 ** Free the pBt->pTmpSpace allocation |  | 
|  1938 */ |  | 
|  1939 static void freeTempSpace(BtShared *pBt){ |  | 
|  1940   sqlite3PageFree( pBt->pTmpSpace); |  | 
|  1941   pBt->pTmpSpace = 0; |  | 
|  1942 } |  | 
|  1943  |  | 
|  1944 /* |  | 
|  1945 ** Close an open database and invalidate all cursors. |  | 
|  1946 */ |  | 
|  1947 int sqlite3BtreeClose(Btree *p){ |  | 
|  1948   BtShared *pBt = p->pBt; |  | 
|  1949   BtCursor *pCur; |  | 
|  1950  |  | 
|  1951   /* Close all cursors opened via this handle.  */ |  | 
|  1952   assert( sqlite3_mutex_held(p->db->mutex) ); |  | 
|  1953   sqlite3BtreeEnter(p); |  | 
|  1954   pCur = pBt->pCursor; |  | 
|  1955   while( pCur ){ |  | 
|  1956     BtCursor *pTmp = pCur; |  | 
|  1957     pCur = pCur->pNext; |  | 
|  1958     if( pTmp->pBtree==p ){ |  | 
|  1959       sqlite3BtreeCloseCursor(pTmp); |  | 
|  1960     } |  | 
|  1961   } |  | 
|  1962  |  | 
|  1963   /* Rollback any active transaction and free the handle structure. |  | 
|  1964   ** The call to sqlite3BtreeRollback() drops any table-locks held by |  | 
|  1965   ** this handle. |  | 
|  1966   */ |  | 
|  1967   sqlite3BtreeRollback(p); |  | 
|  1968   sqlite3BtreeLeave(p); |  | 
|  1969  |  | 
|  1970   /* If there are still other outstanding references to the shared-btree |  | 
|  1971   ** structure, return now. The remainder of this procedure cleans  |  | 
|  1972   ** up the shared-btree. |  | 
|  1973   */ |  | 
|  1974   assert( p->wantToLock==0 && p->locked==0 ); |  | 
|  1975   if( !p->sharable || removeFromSharingList(pBt) ){ |  | 
|  1976     /* The pBt is no longer on the sharing list, so we can access |  | 
|  1977     ** it without having to hold the mutex. |  | 
|  1978     ** |  | 
|  1979     ** Clean out and delete the BtShared object. |  | 
|  1980     */ |  | 
|  1981     assert( !pBt->pCursor ); |  | 
|  1982     sqlite3PagerClose(pBt->pPager); |  | 
|  1983     if( pBt->xFreeSchema && pBt->pSchema ){ |  | 
|  1984       pBt->xFreeSchema(pBt->pSchema); |  | 
|  1985     } |  | 
|  1986     sqlite3_free(pBt->pSchema); |  | 
|  1987     freeTempSpace(pBt); |  | 
|  1988     sqlite3_free(pBt); |  | 
|  1989   } |  | 
|  1990  |  | 
|  1991 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|  1992   assert( p->wantToLock==0 ); |  | 
|  1993   assert( p->locked==0 ); |  | 
|  1994   if( p->pPrev ) p->pPrev->pNext = p->pNext; |  | 
|  1995   if( p->pNext ) p->pNext->pPrev = p->pPrev; |  | 
|  1996 #endif |  | 
|  1997  |  | 
|  1998   sqlite3_free(p); |  | 
|  1999   return SQLITE_OK; |  | 
|  2000 } |  | 
|  2001  |  | 
|  2002 /* |  | 
|  2003 ** Change the limit on the number of pages allowed in the cache. |  | 
|  2004 ** |  | 
|  2005 ** The maximum number of cache pages is set to the absolute |  | 
|  2006 ** value of mxPage.  If mxPage is negative, the pager will |  | 
|  2007 ** operate asynchronously - it will not stop to do fsync()s |  | 
|  2008 ** to insure data is written to the disk surface before |  | 
|  2009 ** continuing.  Transactions still work if synchronous is off, |  | 
|  2010 ** and the database cannot be corrupted if this program |  | 
|  2011 ** crashes.  But if the operating system crashes or there is |  | 
|  2012 ** an abrupt power failure when synchronous is off, the database |  | 
|  2013 ** could be left in an inconsistent and unrecoverable state. |  | 
|  2014 ** Synchronous is on by default so database corruption is not |  | 
|  2015 ** normally a worry. |  | 
|  2016 */ |  | 
|  2017 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){ |  | 
|  2018   BtShared *pBt = p->pBt; |  | 
|  2019   assert( sqlite3_mutex_held(p->db->mutex) ); |  | 
|  2020   sqlite3BtreeEnter(p); |  | 
|  2021   sqlite3PagerSetCachesize(pBt->pPager, mxPage); |  | 
|  2022   sqlite3BtreeLeave(p); |  | 
|  2023   return SQLITE_OK; |  | 
|  2024 } |  | 
|  2025  |  | 
|  2026 /* |  | 
|  2027 ** Change the way data is synced to disk in order to increase or decrease |  | 
|  2028 ** how well the database resists damage due to OS crashes and power |  | 
|  2029 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and |  | 
|  2030 ** there is a high probability of damage)  Level 2 is the default.  There |  | 
|  2031 ** is a very low but non-zero probability of damage.  Level 3 reduces the |  | 
|  2032 ** probability of damage to near zero but with a write performance reduction. |  | 
|  2033 */ |  | 
|  2034 #ifndef SQLITE_OMIT_PAGER_PRAGMAS |  | 
|  2035 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){ |  | 
|  2036   BtShared *pBt = p->pBt; |  | 
|  2037   assert( sqlite3_mutex_held(p->db->mutex) ); |  | 
|  2038   sqlite3BtreeEnter(p); |  | 
|  2039   sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync); |  | 
|  2040   sqlite3BtreeLeave(p); |  | 
|  2041   return SQLITE_OK; |  | 
|  2042 } |  | 
|  2043 #endif |  | 
|  2044  |  | 
|  2045 /* |  | 
|  2046 ** Return TRUE if the given btree is set to safety level 1.  In other |  | 
|  2047 ** words, return TRUE if no sync() occurs on the disk files. |  | 
|  2048 */ |  | 
|  2049 int sqlite3BtreeSyncDisabled(Btree *p){ |  | 
|  2050   BtShared *pBt = p->pBt; |  | 
|  2051   int rc; |  | 
|  2052   assert( sqlite3_mutex_held(p->db->mutex) );   |  | 
|  2053   sqlite3BtreeEnter(p); |  | 
|  2054   assert( pBt && pBt->pPager ); |  | 
|  2055   rc = sqlite3PagerNosync(pBt->pPager); |  | 
|  2056   sqlite3BtreeLeave(p); |  | 
|  2057   return rc; |  | 
|  2058 } |  | 
|  2059  |  | 
|  2060 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) |  | 
|  2061 /* |  | 
|  2062 ** Change the default pages size and the number of reserved bytes per page. |  | 
|  2063 ** Or, if the page size has already been fixed, return SQLITE_READONLY  |  | 
|  2064 ** without changing anything. |  | 
|  2065 ** |  | 
|  2066 ** The page size must be a power of 2 between 512 and 65536.  If the page |  | 
|  2067 ** size supplied does not meet this constraint then the page size is not |  | 
|  2068 ** changed. |  | 
|  2069 ** |  | 
|  2070 ** Page sizes are constrained to be a power of two so that the region |  | 
|  2071 ** of the database file used for locking (beginning at PENDING_BYTE, |  | 
|  2072 ** the first byte past the 1GB boundary, 0x40000000) needs to occur |  | 
|  2073 ** at the beginning of a page. |  | 
|  2074 ** |  | 
|  2075 ** If parameter nReserve is less than zero, then the number of reserved |  | 
|  2076 ** bytes per page is left unchanged. |  | 
|  2077 ** |  | 
|  2078 ** If the iFix!=0 then the pageSizeFixed flag is set so that the page size |  | 
|  2079 ** and autovacuum mode can no longer be changed. |  | 
|  2080 */ |  | 
|  2081 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){ |  | 
|  2082   int rc = SQLITE_OK; |  | 
|  2083   BtShared *pBt = p->pBt; |  | 
|  2084   assert( nReserve>=-1 && nReserve<=255 ); |  | 
|  2085   sqlite3BtreeEnter(p); |  | 
|  2086   if( pBt->pageSizeFixed ){ |  | 
|  2087     sqlite3BtreeLeave(p); |  | 
|  2088     return SQLITE_READONLY; |  | 
|  2089   } |  | 
|  2090   if( nReserve<0 ){ |  | 
|  2091     nReserve = pBt->pageSize - pBt->usableSize; |  | 
|  2092   } |  | 
|  2093   assert( nReserve>=0 && nReserve<=255 ); |  | 
|  2094   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE && |  | 
|  2095         ((pageSize-1)&pageSize)==0 ){ |  | 
|  2096     assert( (pageSize & 7)==0 ); |  | 
|  2097     assert( !pBt->pPage1 && !pBt->pCursor ); |  | 
|  2098     pBt->pageSize = (u16)pageSize; |  | 
|  2099     freeTempSpace(pBt); |  | 
|  2100   } |  | 
|  2101   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve); |  | 
|  2102   pBt->usableSize = pBt->pageSize - (u16)nReserve; |  | 
|  2103   if( iFix ) pBt->pageSizeFixed = 1; |  | 
|  2104   sqlite3BtreeLeave(p); |  | 
|  2105   return rc; |  | 
|  2106 } |  | 
|  2107  |  | 
|  2108 /* |  | 
|  2109 ** Return the currently defined page size |  | 
|  2110 */ |  | 
|  2111 int sqlite3BtreeGetPageSize(Btree *p){ |  | 
|  2112   return p->pBt->pageSize; |  | 
|  2113 } |  | 
|  2114  |  | 
|  2115 /* |  | 
|  2116 ** Return the number of bytes of space at the end of every page that |  | 
|  2117 ** are intentually left unused.  This is the "reserved" space that is |  | 
|  2118 ** sometimes used by extensions. |  | 
|  2119 */ |  | 
|  2120 int sqlite3BtreeGetReserve(Btree *p){ |  | 
|  2121   int n; |  | 
|  2122   sqlite3BtreeEnter(p); |  | 
|  2123   n = p->pBt->pageSize - p->pBt->usableSize; |  | 
|  2124   sqlite3BtreeLeave(p); |  | 
|  2125   return n; |  | 
|  2126 } |  | 
|  2127  |  | 
|  2128 /* |  | 
|  2129 ** Set the maximum page count for a database if mxPage is positive. |  | 
|  2130 ** No changes are made if mxPage is 0 or negative. |  | 
|  2131 ** Regardless of the value of mxPage, return the maximum page count. |  | 
|  2132 */ |  | 
|  2133 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){ |  | 
|  2134   int n; |  | 
|  2135   sqlite3BtreeEnter(p); |  | 
|  2136   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); |  | 
|  2137   sqlite3BtreeLeave(p); |  | 
|  2138   return n; |  | 
|  2139 } |  | 
|  2140 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */ |  | 
|  2141  |  | 
|  2142 /* |  | 
|  2143 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum' |  | 
|  2144 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it |  | 
|  2145 ** is disabled. The default value for the auto-vacuum property is  |  | 
|  2146 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. |  | 
|  2147 */ |  | 
|  2148 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){ |  | 
|  2149 #ifdef SQLITE_OMIT_AUTOVACUUM |  | 
|  2150   return SQLITE_READONLY; |  | 
|  2151 #else |  | 
|  2152   BtShared *pBt = p->pBt; |  | 
|  2153   int rc = SQLITE_OK; |  | 
|  2154   u8 av = (u8)autoVacuum; |  | 
|  2155  |  | 
|  2156   sqlite3BtreeEnter(p); |  | 
|  2157   if( pBt->pageSizeFixed && (av ?1:0)!=pBt->autoVacuum ){ |  | 
|  2158     rc = SQLITE_READONLY; |  | 
|  2159   }else{ |  | 
|  2160     pBt->autoVacuum = av ?1:0; |  | 
|  2161     pBt->incrVacuum = av==2 ?1:0; |  | 
|  2162   } |  | 
|  2163   sqlite3BtreeLeave(p); |  | 
|  2164   return rc; |  | 
|  2165 #endif |  | 
|  2166 } |  | 
|  2167  |  | 
|  2168 /* |  | 
|  2169 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is  |  | 
|  2170 ** enabled 1 is returned. Otherwise 0. |  | 
|  2171 */ |  | 
|  2172 int sqlite3BtreeGetAutoVacuum(Btree *p){ |  | 
|  2173 #ifdef SQLITE_OMIT_AUTOVACUUM |  | 
|  2174   return BTREE_AUTOVACUUM_NONE; |  | 
|  2175 #else |  | 
|  2176   int rc; |  | 
|  2177   sqlite3BtreeEnter(p); |  | 
|  2178   rc = ( |  | 
|  2179     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE: |  | 
|  2180     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL: |  | 
|  2181     BTREE_AUTOVACUUM_INCR |  | 
|  2182   ); |  | 
|  2183   sqlite3BtreeLeave(p); |  | 
|  2184   return rc; |  | 
|  2185 #endif |  | 
|  2186 } |  | 
|  2187  |  | 
|  2188  |  | 
|  2189 /* |  | 
|  2190 ** Get a reference to pPage1 of the database file.  This will |  | 
|  2191 ** also acquire a readlock on that file. |  | 
|  2192 ** |  | 
|  2193 ** SQLITE_OK is returned on success.  If the file is not a |  | 
|  2194 ** well-formed database file, then SQLITE_CORRUPT is returned. |  | 
|  2195 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM |  | 
|  2196 ** is returned if we run out of memory.  |  | 
|  2197 */ |  | 
|  2198 static int lockBtree(BtShared *pBt){ |  | 
|  2199   int rc; |  | 
|  2200   MemPage *pPage1; |  | 
|  2201   int nPage; |  | 
|  2202  |  | 
|  2203   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  2204   assert( pBt->pPage1==0 ); |  | 
|  2205   rc = sqlite3PagerSharedLock(pBt->pPager); |  | 
|  2206   if( rc!=SQLITE_OK ) return rc; |  | 
|  2207   rc = btreeGetPage(pBt, 1, &pPage1, 0); |  | 
|  2208   if( rc!=SQLITE_OK ) return rc; |  | 
|  2209  |  | 
|  2210   /* Do some checking to help insure the file we opened really is |  | 
|  2211   ** a valid database file.  |  | 
|  2212   */ |  | 
|  2213   rc = sqlite3PagerPagecount(pBt->pPager, &nPage); |  | 
|  2214   if( rc!=SQLITE_OK ){ |  | 
|  2215     goto page1_init_failed; |  | 
|  2216   }else if( nPage>0 ){ |  | 
|  2217     int pageSize; |  | 
|  2218     int usableSize; |  | 
|  2219     u8 *page1 = pPage1->aData; |  | 
|  2220     rc = SQLITE_NOTADB; |  | 
|  2221     if( memcmp(page1, zMagicHeader, 16)!=0 ){ |  | 
|  2222       goto page1_init_failed; |  | 
|  2223     } |  | 
|  2224     if( page1[18]>1 ){ |  | 
|  2225       pBt->readOnly = 1; |  | 
|  2226     } |  | 
|  2227     if( page1[19]>1 ){ |  | 
|  2228       goto page1_init_failed; |  | 
|  2229     } |  | 
|  2230  |  | 
|  2231     /* The maximum embedded fraction must be exactly 25%.  And the minimum |  | 
|  2232     ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data. |  | 
|  2233     ** The original design allowed these amounts to vary, but as of |  | 
|  2234     ** version 3.6.0, we require them to be fixed. |  | 
|  2235     */ |  | 
|  2236     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){ |  | 
|  2237       goto page1_init_failed; |  | 
|  2238     } |  | 
|  2239     pageSize = get2byte(&page1[16]); |  | 
|  2240     if( ((pageSize-1)&pageSize)!=0 || pageSize<512 || |  | 
|  2241         (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE) |  | 
|  2242     ){ |  | 
|  2243       goto page1_init_failed; |  | 
|  2244     } |  | 
|  2245     assert( (pageSize & 7)==0 ); |  | 
|  2246     usableSize = pageSize - page1[20]; |  | 
|  2247     if( pageSize!=pBt->pageSize ){ |  | 
|  2248       /* After reading the first page of the database assuming a page size |  | 
|  2249       ** of BtShared.pageSize, we have discovered that the page-size is |  | 
|  2250       ** actually pageSize. Unlock the database, leave pBt->pPage1 at |  | 
|  2251       ** zero and return SQLITE_OK. The caller will call this function |  | 
|  2252       ** again with the correct page-size. |  | 
|  2253       */ |  | 
|  2254       releasePage(pPage1); |  | 
|  2255       pBt->usableSize = (u16)usableSize; |  | 
|  2256       pBt->pageSize = (u16)pageSize; |  | 
|  2257       freeTempSpace(pBt); |  | 
|  2258       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, |  | 
|  2259                                    pageSize-usableSize); |  | 
|  2260       return rc; |  | 
|  2261     } |  | 
|  2262     if( usableSize<480 ){ |  | 
|  2263       goto page1_init_failed; |  | 
|  2264     } |  | 
|  2265     pBt->pageSize = (u16)pageSize; |  | 
|  2266     pBt->usableSize = (u16)usableSize; |  | 
|  2267 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  2268     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0); |  | 
|  2269     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0); |  | 
|  2270 #endif |  | 
|  2271   } |  | 
|  2272  |  | 
|  2273   /* maxLocal is the maximum amount of payload to store locally for |  | 
|  2274   ** a cell.  Make sure it is small enough so that at least minFanout |  | 
|  2275   ** cells can will fit on one page.  We assume a 10-byte page header. |  | 
|  2276   ** Besides the payload, the cell must store: |  | 
|  2277   **     2-byte pointer to the cell |  | 
|  2278   **     4-byte child pointer |  | 
|  2279   **     9-byte nKey value |  | 
|  2280   **     4-byte nData value |  | 
|  2281   **     4-byte overflow page pointer |  | 
|  2282   ** So a cell consists of a 2-byte poiner, a header which is as much as |  | 
|  2283   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow |  | 
|  2284   ** page pointer. |  | 
|  2285   */ |  | 
|  2286   pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23; |  | 
|  2287   pBt->minLocal = (pBt->usableSize-12)*32/255 - 23; |  | 
|  2288   pBt->maxLeaf = pBt->usableSize - 35; |  | 
|  2289   pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23; |  | 
|  2290   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) ); |  | 
|  2291   pBt->pPage1 = pPage1; |  | 
|  2292   return SQLITE_OK; |  | 
|  2293  |  | 
|  2294 page1_init_failed: |  | 
|  2295   releasePage(pPage1); |  | 
|  2296   pBt->pPage1 = 0; |  | 
|  2297   return rc; |  | 
|  2298 } |  | 
|  2299  |  | 
|  2300 /* |  | 
|  2301 ** If there are no outstanding cursors and we are not in the middle |  | 
|  2302 ** of a transaction but there is a read lock on the database, then |  | 
|  2303 ** this routine unrefs the first page of the database file which  |  | 
|  2304 ** has the effect of releasing the read lock. |  | 
|  2305 ** |  | 
|  2306 ** If there is a transaction in progress, this routine is a no-op. |  | 
|  2307 */ |  | 
|  2308 static void unlockBtreeIfUnused(BtShared *pBt){ |  | 
|  2309   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  2310   assert( pBt->pCursor==0 || pBt->inTransaction>TRANS_NONE ); |  | 
|  2311   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){ |  | 
|  2312     assert( pBt->pPage1->aData ); |  | 
|  2313     assert( sqlite3PagerRefcount(pBt->pPager)==1 ); |  | 
|  2314     assert( pBt->pPage1->aData ); |  | 
|  2315     releasePage(pBt->pPage1); |  | 
|  2316     pBt->pPage1 = 0; |  | 
|  2317   } |  | 
|  2318 } |  | 
|  2319  |  | 
|  2320 /* |  | 
|  2321 ** If pBt points to an empty file then convert that empty file |  | 
|  2322 ** into a new empty database by initializing the first page of |  | 
|  2323 ** the database. |  | 
|  2324 */ |  | 
|  2325 static int newDatabase(BtShared *pBt){ |  | 
|  2326   MemPage *pP1; |  | 
|  2327   unsigned char *data; |  | 
|  2328   int rc; |  | 
|  2329   int nPage; |  | 
|  2330  |  | 
|  2331   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  2332   /* The database size has already been measured and cached, so failure |  | 
|  2333   ** is impossible here.  If the original size measurement failed, then |  | 
|  2334   ** processing aborts before entering this routine. */ |  | 
|  2335   rc = sqlite3PagerPagecount(pBt->pPager, &nPage); |  | 
|  2336   if( NEVER(rc!=SQLITE_OK) || nPage>0 ){ |  | 
|  2337     return rc; |  | 
|  2338   } |  | 
|  2339   pP1 = pBt->pPage1; |  | 
|  2340   assert( pP1!=0 ); |  | 
|  2341   data = pP1->aData; |  | 
|  2342   rc = sqlite3PagerWrite(pP1->pDbPage); |  | 
|  2343   if( rc ) return rc; |  | 
|  2344   memcpy(data, zMagicHeader, sizeof(zMagicHeader)); |  | 
|  2345   assert( sizeof(zMagicHeader)==16 ); |  | 
|  2346   assert( sizeof(zMagicHeader)==sizeof(zPoisonHeader) ); |  | 
|  2347   put2byte(&data[16], pBt->pageSize); |  | 
|  2348   data[18] = 1; |  | 
|  2349   data[19] = 1; |  | 
|  2350   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize); |  | 
|  2351   data[20] = (u8)(pBt->pageSize - pBt->usableSize); |  | 
|  2352   data[21] = 64; |  | 
|  2353   data[22] = 32; |  | 
|  2354   data[23] = 32; |  | 
|  2355   memset(&data[24], 0, 100-24); |  | 
|  2356   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA ); |  | 
|  2357   pBt->pageSizeFixed = 1; |  | 
|  2358 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  2359   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 ); |  | 
|  2360   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 ); |  | 
|  2361   put4byte(&data[36 + 4*4], pBt->autoVacuum); |  | 
|  2362   put4byte(&data[36 + 7*4], pBt->incrVacuum); |  | 
|  2363 #endif |  | 
|  2364   return SQLITE_OK; |  | 
|  2365 } |  | 
|  2366  |  | 
|  2367 /* |  | 
|  2368 ** Attempt to start a new transaction. A write-transaction |  | 
|  2369 ** is started if the second argument is nonzero, otherwise a read- |  | 
|  2370 ** transaction.  If the second argument is 2 or more and exclusive |  | 
|  2371 ** transaction is started, meaning that no other process is allowed |  | 
|  2372 ** to access the database.  A preexisting transaction may not be |  | 
|  2373 ** upgraded to exclusive by calling this routine a second time - the |  | 
|  2374 ** exclusivity flag only works for a new transaction. |  | 
|  2375 ** |  | 
|  2376 ** A write-transaction must be started before attempting any  |  | 
|  2377 ** changes to the database.  None of the following routines  |  | 
|  2378 ** will work unless a transaction is started first: |  | 
|  2379 ** |  | 
|  2380 **      sqlite3BtreeCreateTable() |  | 
|  2381 **      sqlite3BtreeCreateIndex() |  | 
|  2382 **      sqlite3BtreeClearTable() |  | 
|  2383 **      sqlite3BtreeDropTable() |  | 
|  2384 **      sqlite3BtreeInsert() |  | 
|  2385 **      sqlite3BtreeDelete() |  | 
|  2386 **      sqlite3BtreeUpdateMeta() |  | 
|  2387 ** |  | 
|  2388 ** If an initial attempt to acquire the lock fails because of lock contention |  | 
|  2389 ** and the database was previously unlocked, then invoke the busy handler |  | 
|  2390 ** if there is one.  But if there was previously a read-lock, do not |  | 
|  2391 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is  |  | 
|  2392 ** returned when there is already a read-lock in order to avoid a deadlock. |  | 
|  2393 ** |  | 
|  2394 ** Suppose there are two processes A and B.  A has a read lock and B has |  | 
|  2395 ** a reserved lock.  B tries to promote to exclusive but is blocked because |  | 
|  2396 ** of A's read lock.  A tries to promote to reserved but is blocked by B. |  | 
|  2397 ** One or the other of the two processes must give way or there can be |  | 
|  2398 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback |  | 
|  2399 ** when A already has a read lock, we encourage A to give up and let B |  | 
|  2400 ** proceed. |  | 
|  2401 */ |  | 
|  2402 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){ |  | 
|  2403   sqlite3 *pBlock = 0; |  | 
|  2404   BtShared *pBt = p->pBt; |  | 
|  2405   int rc = SQLITE_OK; |  | 
|  2406  |  | 
|  2407   sqlite3BtreeEnter(p); |  | 
|  2408   btreeIntegrity(p); |  | 
|  2409  |  | 
|  2410   /* If the btree is already in a write-transaction, or it |  | 
|  2411   ** is already in a read-transaction and a read-transaction |  | 
|  2412   ** is requested, this is a no-op. |  | 
|  2413   */ |  | 
|  2414   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){ |  | 
|  2415     goto trans_begun; |  | 
|  2416   } |  | 
|  2417  |  | 
|  2418   /* Write transactions are not possible on a read-only database */ |  | 
|  2419   if( pBt->readOnly && wrflag ){ |  | 
|  2420     rc = SQLITE_READONLY; |  | 
|  2421     goto trans_begun; |  | 
|  2422   } |  | 
|  2423  |  | 
|  2424 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|  2425   /* If another database handle has already opened a write transaction  |  | 
|  2426   ** on this shared-btree structure and a second write transaction is |  | 
|  2427   ** requested, return SQLITE_LOCKED. |  | 
|  2428   */ |  | 
|  2429   if( (wrflag && pBt->inTransaction==TRANS_WRITE) || pBt->isPending ){ |  | 
|  2430     pBlock = pBt->pWriter->db; |  | 
|  2431   }else if( wrflag>1 ){ |  | 
|  2432     BtLock *pIter; |  | 
|  2433     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){ |  | 
|  2434       if( pIter->pBtree!=p ){ |  | 
|  2435         pBlock = pIter->pBtree->db; |  | 
|  2436         break; |  | 
|  2437       } |  | 
|  2438     } |  | 
|  2439   } |  | 
|  2440   if( pBlock ){ |  | 
|  2441     sqlite3ConnectionBlocked(p->db, pBlock); |  | 
|  2442     rc = SQLITE_LOCKED_SHAREDCACHE; |  | 
|  2443     goto trans_begun; |  | 
|  2444   } |  | 
|  2445 #endif |  | 
|  2446  |  | 
|  2447   /* Any read-only or read-write transaction implies a read-lock on  |  | 
|  2448   ** page 1. So if some other shared-cache client already has a write-lock  |  | 
|  2449   ** on page 1, the transaction cannot be opened. */ |  | 
|  2450   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); |  | 
|  2451   if( SQLITE_OK!=rc ) goto trans_begun; |  | 
|  2452  |  | 
|  2453   do { |  | 
|  2454     /* Call lockBtree() until either pBt->pPage1 is populated or |  | 
|  2455     ** lockBtree() returns something other than SQLITE_OK. lockBtree() |  | 
|  2456     ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after |  | 
|  2457     ** reading page 1 it discovers that the page-size of the database  |  | 
|  2458     ** file is not pBt->pageSize. In this case lockBtree() will update |  | 
|  2459     ** pBt->pageSize to the page-size of the file on disk. |  | 
|  2460     */ |  | 
|  2461     while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) ); |  | 
|  2462  |  | 
|  2463     if( rc==SQLITE_OK && wrflag ){ |  | 
|  2464       if( pBt->readOnly ){ |  | 
|  2465         rc = SQLITE_READONLY; |  | 
|  2466       }else{ |  | 
|  2467         rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db)); |  | 
|  2468         if( rc==SQLITE_OK ){ |  | 
|  2469           rc = newDatabase(pBt); |  | 
|  2470         } |  | 
|  2471       } |  | 
|  2472     } |  | 
|  2473    |  | 
|  2474     if( rc!=SQLITE_OK ){ |  | 
|  2475       unlockBtreeIfUnused(pBt); |  | 
|  2476     } |  | 
|  2477   }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && |  | 
|  2478           btreeInvokeBusyHandler(pBt) ); |  | 
|  2479  |  | 
|  2480   if( rc==SQLITE_OK ){ |  | 
|  2481     if( p->inTrans==TRANS_NONE ){ |  | 
|  2482       pBt->nTransaction++; |  | 
|  2483 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|  2484       if( p->sharable ){ |  | 
|  2485         assert( p->lock.pBtree==p && p->lock.iTable==1 ); |  | 
|  2486         p->lock.eLock = READ_LOCK; |  | 
|  2487         p->lock.pNext = pBt->pLock; |  | 
|  2488         pBt->pLock = &p->lock; |  | 
|  2489       } |  | 
|  2490 #endif |  | 
|  2491     } |  | 
|  2492     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ); |  | 
|  2493     if( p->inTrans>pBt->inTransaction ){ |  | 
|  2494       pBt->inTransaction = p->inTrans; |  | 
|  2495     } |  | 
|  2496 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|  2497     if( wrflag ){ |  | 
|  2498       assert( !pBt->pWriter ); |  | 
|  2499       pBt->pWriter = p; |  | 
|  2500       pBt->isExclusive = (u8)(wrflag>1); |  | 
|  2501     } |  | 
|  2502 #endif |  | 
|  2503   } |  | 
|  2504  |  | 
|  2505  |  | 
|  2506 trans_begun: |  | 
|  2507   if( rc==SQLITE_OK && wrflag ){ |  | 
|  2508     /* This call makes sure that the pager has the correct number of |  | 
|  2509     ** open savepoints. If the second parameter is greater than 0 and |  | 
|  2510     ** the sub-journal is not already open, then it will be opened here. |  | 
|  2511     */ |  | 
|  2512     rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint); |  | 
|  2513   } |  | 
|  2514  |  | 
|  2515   btreeIntegrity(p); |  | 
|  2516   sqlite3BtreeLeave(p); |  | 
|  2517   return rc; |  | 
|  2518 } |  | 
|  2519  |  | 
|  2520 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  2521  |  | 
|  2522 /* |  | 
|  2523 ** Set the pointer-map entries for all children of page pPage. Also, if |  | 
|  2524 ** pPage contains cells that point to overflow pages, set the pointer |  | 
|  2525 ** map entries for the overflow pages as well. |  | 
|  2526 */ |  | 
|  2527 static int setChildPtrmaps(MemPage *pPage){ |  | 
|  2528   int i;                             /* Counter variable */ |  | 
|  2529   int nCell;                         /* Number of cells in page pPage */ |  | 
|  2530   int rc;                            /* Return code */ |  | 
|  2531   BtShared *pBt = pPage->pBt; |  | 
|  2532   u8 isInitOrig = pPage->isInit; |  | 
|  2533   Pgno pgno = pPage->pgno; |  | 
|  2534  |  | 
|  2535   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  2536   rc = btreeInitPage(pPage); |  | 
|  2537   if( rc!=SQLITE_OK ){ |  | 
|  2538     goto set_child_ptrmaps_out; |  | 
|  2539   } |  | 
|  2540   nCell = pPage->nCell; |  | 
|  2541  |  | 
|  2542   for(i=0; i<nCell; i++){ |  | 
|  2543     u8 *pCell = findCell(pPage, i); |  | 
|  2544  |  | 
|  2545     ptrmapPutOvflPtr(pPage, pCell, &rc); |  | 
|  2546  |  | 
|  2547     if( !pPage->leaf ){ |  | 
|  2548       Pgno childPgno = get4byte(pCell); |  | 
|  2549       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); |  | 
|  2550     } |  | 
|  2551   } |  | 
|  2552  |  | 
|  2553   if( !pPage->leaf ){ |  | 
|  2554     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); |  | 
|  2555     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc); |  | 
|  2556   } |  | 
|  2557  |  | 
|  2558 set_child_ptrmaps_out: |  | 
|  2559   pPage->isInit = isInitOrig; |  | 
|  2560   return rc; |  | 
|  2561 } |  | 
|  2562  |  | 
|  2563 /* |  | 
|  2564 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so |  | 
|  2565 ** that it points to iTo. Parameter eType describes the type of pointer to |  | 
|  2566 ** be modified, as  follows: |  | 
|  2567 ** |  | 
|  2568 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child  |  | 
|  2569 **                   page of pPage. |  | 
|  2570 ** |  | 
|  2571 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow |  | 
|  2572 **                   page pointed to by one of the cells on pPage. |  | 
|  2573 ** |  | 
|  2574 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next |  | 
|  2575 **                   overflow page in the list. |  | 
|  2576 */ |  | 
|  2577 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){ |  | 
|  2578   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  2579   assert( sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  2580   if( eType==PTRMAP_OVERFLOW2 ){ |  | 
|  2581     /* The pointer is always the first 4 bytes of the page in this case.  */ |  | 
|  2582     if( get4byte(pPage->aData)!=iFrom ){ |  | 
|  2583       return SQLITE_CORRUPT_BKPT; |  | 
|  2584     } |  | 
|  2585     put4byte(pPage->aData, iTo); |  | 
|  2586   }else{ |  | 
|  2587     u8 isInitOrig = pPage->isInit; |  | 
|  2588     int i; |  | 
|  2589     int nCell; |  | 
|  2590  |  | 
|  2591     btreeInitPage(pPage); |  | 
|  2592     nCell = pPage->nCell; |  | 
|  2593  |  | 
|  2594     for(i=0; i<nCell; i++){ |  | 
|  2595       u8 *pCell = findCell(pPage, i); |  | 
|  2596       if( eType==PTRMAP_OVERFLOW1 ){ |  | 
|  2597         CellInfo info; |  | 
|  2598         btreeParseCellPtr(pPage, pCell, &info); |  | 
|  2599         if( info.iOverflow ){ |  | 
|  2600           if( iFrom==get4byte(&pCell[info.iOverflow]) ){ |  | 
|  2601             put4byte(&pCell[info.iOverflow], iTo); |  | 
|  2602             break; |  | 
|  2603           } |  | 
|  2604         } |  | 
|  2605       }else{ |  | 
|  2606         if( get4byte(pCell)==iFrom ){ |  | 
|  2607           put4byte(pCell, iTo); |  | 
|  2608           break; |  | 
|  2609         } |  | 
|  2610       } |  | 
|  2611     } |  | 
|  2612    |  | 
|  2613     if( i==nCell ){ |  | 
|  2614       if( eType!=PTRMAP_BTREE ||  |  | 
|  2615           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){ |  | 
|  2616         return SQLITE_CORRUPT_BKPT; |  | 
|  2617       } |  | 
|  2618       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo); |  | 
|  2619     } |  | 
|  2620  |  | 
|  2621     pPage->isInit = isInitOrig; |  | 
|  2622   } |  | 
|  2623   return SQLITE_OK; |  | 
|  2624 } |  | 
|  2625  |  | 
|  2626  |  | 
|  2627 /* |  | 
|  2628 ** Move the open database page pDbPage to location iFreePage in the  |  | 
|  2629 ** database. The pDbPage reference remains valid. |  | 
|  2630 ** |  | 
|  2631 ** The isCommit flag indicates that there is no need to remember that |  | 
|  2632 ** the journal needs to be sync()ed before database page pDbPage->pgno  |  | 
|  2633 ** can be written to. The caller has already promised not to write to that |  | 
|  2634 ** page. |  | 
|  2635 */ |  | 
|  2636 static int relocatePage( |  | 
|  2637   BtShared *pBt,           /* Btree */ |  | 
|  2638   MemPage *pDbPage,        /* Open page to move */ |  | 
|  2639   u8 eType,                /* Pointer map 'type' entry for pDbPage */ |  | 
|  2640   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */ |  | 
|  2641   Pgno iFreePage,          /* The location to move pDbPage to */ |  | 
|  2642   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */ |  | 
|  2643 ){ |  | 
|  2644   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */ |  | 
|  2645   Pgno iDbPage = pDbPage->pgno; |  | 
|  2646   Pager *pPager = pBt->pPager; |  | 
|  2647   int rc; |  | 
|  2648  |  | 
|  2649   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||  |  | 
|  2650       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ); |  | 
|  2651   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  2652   assert( pDbPage->pBt==pBt ); |  | 
|  2653  |  | 
|  2654   /* Move page iDbPage from its current location to page number iFreePage */ |  | 
|  2655   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",  |  | 
|  2656       iDbPage, iFreePage, iPtrPage, eType)); |  | 
|  2657   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit); |  | 
|  2658   if( rc!=SQLITE_OK ){ |  | 
|  2659     return rc; |  | 
|  2660   } |  | 
|  2661   pDbPage->pgno = iFreePage; |  | 
|  2662  |  | 
|  2663   /* If pDbPage was a btree-page, then it may have child pages and/or cells |  | 
|  2664   ** that point to overflow pages. The pointer map entries for all these |  | 
|  2665   ** pages need to be changed. |  | 
|  2666   ** |  | 
|  2667   ** If pDbPage is an overflow page, then the first 4 bytes may store a |  | 
|  2668   ** pointer to a subsequent overflow page. If this is the case, then |  | 
|  2669   ** the pointer map needs to be updated for the subsequent overflow page. |  | 
|  2670   */ |  | 
|  2671   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){ |  | 
|  2672     rc = setChildPtrmaps(pDbPage); |  | 
|  2673     if( rc!=SQLITE_OK ){ |  | 
|  2674       return rc; |  | 
|  2675     } |  | 
|  2676   }else{ |  | 
|  2677     Pgno nextOvfl = get4byte(pDbPage->aData); |  | 
|  2678     if( nextOvfl!=0 ){ |  | 
|  2679       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc); |  | 
|  2680       if( rc!=SQLITE_OK ){ |  | 
|  2681         return rc; |  | 
|  2682       } |  | 
|  2683     } |  | 
|  2684   } |  | 
|  2685  |  | 
|  2686   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so |  | 
|  2687   ** that it points at iFreePage. Also fix the pointer map entry for |  | 
|  2688   ** iPtrPage. |  | 
|  2689   */ |  | 
|  2690   if( eType!=PTRMAP_ROOTPAGE ){ |  | 
|  2691     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0); |  | 
|  2692     if( rc!=SQLITE_OK ){ |  | 
|  2693       return rc; |  | 
|  2694     } |  | 
|  2695     rc = sqlite3PagerWrite(pPtrPage->pDbPage); |  | 
|  2696     if( rc!=SQLITE_OK ){ |  | 
|  2697       releasePage(pPtrPage); |  | 
|  2698       return rc; |  | 
|  2699     } |  | 
|  2700     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType); |  | 
|  2701     releasePage(pPtrPage); |  | 
|  2702     if( rc==SQLITE_OK ){ |  | 
|  2703       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc); |  | 
|  2704     } |  | 
|  2705   } |  | 
|  2706   return rc; |  | 
|  2707 } |  | 
|  2708  |  | 
|  2709 /* Forward declaration required by incrVacuumStep(). */ |  | 
|  2710 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8); |  | 
|  2711  |  | 
|  2712 /* |  | 
|  2713 ** Perform a single step of an incremental-vacuum. If successful, |  | 
|  2714 ** return SQLITE_OK. If there is no work to do (and therefore no |  | 
|  2715 ** point in calling this function again), return SQLITE_DONE. |  | 
|  2716 ** |  | 
|  2717 ** More specifically, this function attempts to re-organize the  |  | 
|  2718 ** database so that the last page of the file currently in use |  | 
|  2719 ** is no longer in use. |  | 
|  2720 ** |  | 
|  2721 ** If the nFin parameter is non-zero, this function assumes |  | 
|  2722 ** that the caller will keep calling incrVacuumStep() until |  | 
|  2723 ** it returns SQLITE_DONE or an error, and that nFin is the |  | 
|  2724 ** number of pages the database file will contain after this  |  | 
|  2725 ** process is complete.  If nFin is zero, it is assumed that |  | 
|  2726 ** incrVacuumStep() will be called a finite number of times |  | 
|  2727 ** which may or may not empty the freelist.  A full autovacuum |  | 
|  2728 ** has nFin>0.  A "PRAGMA incremental_vacuum" has nFin==0. |  | 
|  2729 */ |  | 
|  2730 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg){ |  | 
|  2731   Pgno nFreeList;           /* Number of pages still on the free-list */ |  | 
|  2732  |  | 
|  2733   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  2734   assert( iLastPg>nFin ); |  | 
|  2735  |  | 
|  2736   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){ |  | 
|  2737     int rc; |  | 
|  2738     u8 eType; |  | 
|  2739     Pgno iPtrPage; |  | 
|  2740  |  | 
|  2741     nFreeList = get4byte(&pBt->pPage1->aData[36]); |  | 
|  2742     if( nFreeList==0 ){ |  | 
|  2743       return SQLITE_DONE; |  | 
|  2744     } |  | 
|  2745  |  | 
|  2746     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage); |  | 
|  2747     if( rc!=SQLITE_OK ){ |  | 
|  2748       return rc; |  | 
|  2749     } |  | 
|  2750     if( eType==PTRMAP_ROOTPAGE ){ |  | 
|  2751       return SQLITE_CORRUPT_BKPT; |  | 
|  2752     } |  | 
|  2753  |  | 
|  2754     if( eType==PTRMAP_FREEPAGE ){ |  | 
|  2755       if( nFin==0 ){ |  | 
|  2756         /* Remove the page from the files free-list. This is not required |  | 
|  2757         ** if nFin is non-zero. In that case, the free-list will be |  | 
|  2758         ** truncated to zero after this function returns, so it doesn't  |  | 
|  2759         ** matter if it still contains some garbage entries. |  | 
|  2760         */ |  | 
|  2761         Pgno iFreePg; |  | 
|  2762         MemPage *pFreePg; |  | 
|  2763         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1); |  | 
|  2764         if( rc!=SQLITE_OK ){ |  | 
|  2765           return rc; |  | 
|  2766         } |  | 
|  2767         assert( iFreePg==iLastPg ); |  | 
|  2768         releasePage(pFreePg); |  | 
|  2769       } |  | 
|  2770     } else { |  | 
|  2771       Pgno iFreePg;             /* Index of free page to move pLastPg to */ |  | 
|  2772       MemPage *pLastPg; |  | 
|  2773  |  | 
|  2774       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0); |  | 
|  2775       if( rc!=SQLITE_OK ){ |  | 
|  2776         return rc; |  | 
|  2777       } |  | 
|  2778  |  | 
|  2779       /* If nFin is zero, this loop runs exactly once and page pLastPg |  | 
|  2780       ** is swapped with the first free page pulled off the free list. |  | 
|  2781       ** |  | 
|  2782       ** On the other hand, if nFin is greater than zero, then keep |  | 
|  2783       ** looping until a free-page located within the first nFin pages |  | 
|  2784       ** of the file is found. |  | 
|  2785       */ |  | 
|  2786       do { |  | 
|  2787         MemPage *pFreePg; |  | 
|  2788         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0); |  | 
|  2789         if( rc!=SQLITE_OK ){ |  | 
|  2790           releasePage(pLastPg); |  | 
|  2791           return rc; |  | 
|  2792         } |  | 
|  2793         releasePage(pFreePg); |  | 
|  2794       }while( nFin!=0 && iFreePg>nFin ); |  | 
|  2795       assert( iFreePg<iLastPg ); |  | 
|  2796        |  | 
|  2797       rc = sqlite3PagerWrite(pLastPg->pDbPage); |  | 
|  2798       if( rc==SQLITE_OK ){ |  | 
|  2799         rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0); |  | 
|  2800       } |  | 
|  2801       releasePage(pLastPg); |  | 
|  2802       if( rc!=SQLITE_OK ){ |  | 
|  2803         return rc; |  | 
|  2804       } |  | 
|  2805     } |  | 
|  2806   } |  | 
|  2807  |  | 
|  2808   if( nFin==0 ){ |  | 
|  2809     iLastPg--; |  | 
|  2810     while( iLastPg==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, iLastPg) ){ |  | 
|  2811       if( PTRMAP_ISPAGE(pBt, iLastPg) ){ |  | 
|  2812         MemPage *pPg; |  | 
|  2813         int rc = btreeGetPage(pBt, iLastPg, &pPg, 0); |  | 
|  2814         if( rc!=SQLITE_OK ){ |  | 
|  2815           return rc; |  | 
|  2816         } |  | 
|  2817         rc = sqlite3PagerWrite(pPg->pDbPage); |  | 
|  2818         releasePage(pPg); |  | 
|  2819         if( rc!=SQLITE_OK ){ |  | 
|  2820           return rc; |  | 
|  2821         } |  | 
|  2822       } |  | 
|  2823       iLastPg--; |  | 
|  2824     } |  | 
|  2825     sqlite3PagerTruncateImage(pBt->pPager, iLastPg); |  | 
|  2826   } |  | 
|  2827   return SQLITE_OK; |  | 
|  2828 } |  | 
|  2829  |  | 
|  2830 /* |  | 
|  2831 ** A write-transaction must be opened before calling this function. |  | 
|  2832 ** It performs a single unit of work towards an incremental vacuum. |  | 
|  2833 ** |  | 
|  2834 ** If the incremental vacuum is finished after this function has run, |  | 
|  2835 ** SQLITE_DONE is returned. If it is not finished, but no error occurred, |  | 
|  2836 ** SQLITE_OK is returned. Otherwise an SQLite error code.  |  | 
|  2837 */ |  | 
|  2838 int sqlite3BtreeIncrVacuum(Btree *p){ |  | 
|  2839   int rc; |  | 
|  2840   BtShared *pBt = p->pBt; |  | 
|  2841  |  | 
|  2842   sqlite3BtreeEnter(p); |  | 
|  2843   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE ); |  | 
|  2844   if( !pBt->autoVacuum ){ |  | 
|  2845     rc = SQLITE_DONE; |  | 
|  2846   }else{ |  | 
|  2847     invalidateAllOverflowCache(pBt); |  | 
|  2848     rc = incrVacuumStep(pBt, 0, pagerPagecount(pBt)); |  | 
|  2849   } |  | 
|  2850   sqlite3BtreeLeave(p); |  | 
|  2851   return rc; |  | 
|  2852 } |  | 
|  2853  |  | 
|  2854 /* |  | 
|  2855 ** This routine is called prior to sqlite3PagerCommit when a transaction |  | 
|  2856 ** is commited for an auto-vacuum database. |  | 
|  2857 ** |  | 
|  2858 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages |  | 
|  2859 ** the database file should be truncated to during the commit process.  |  | 
|  2860 ** i.e. the database has been reorganized so that only the first *pnTrunc |  | 
|  2861 ** pages are in use. |  | 
|  2862 */ |  | 
|  2863 static int autoVacuumCommit(BtShared *pBt){ |  | 
|  2864   int rc = SQLITE_OK; |  | 
|  2865   Pager *pPager = pBt->pPager; |  | 
|  2866   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) ); |  | 
|  2867  |  | 
|  2868   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  2869   invalidateAllOverflowCache(pBt); |  | 
|  2870   assert(pBt->autoVacuum); |  | 
|  2871   if( !pBt->incrVacuum ){ |  | 
|  2872     Pgno nFin;         /* Number of pages in database after autovacuuming */ |  | 
|  2873     Pgno nFree;        /* Number of pages on the freelist initially */ |  | 
|  2874     Pgno nPtrmap;      /* Number of PtrMap pages to be freed */ |  | 
|  2875     Pgno iFree;        /* The next page to be freed */ |  | 
|  2876     int nEntry;        /* Number of entries on one ptrmap page */ |  | 
|  2877     Pgno nOrig;        /* Database size before freeing */ |  | 
|  2878  |  | 
|  2879     nOrig = pagerPagecount(pBt); |  | 
|  2880     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){ |  | 
|  2881       /* It is not possible to create a database for which the final page |  | 
|  2882       ** is either a pointer-map page or the pending-byte page. If one |  | 
|  2883       ** is encountered, this indicates corruption. |  | 
|  2884       */ |  | 
|  2885       return SQLITE_CORRUPT_BKPT; |  | 
|  2886     } |  | 
|  2887  |  | 
|  2888     nFree = get4byte(&pBt->pPage1->aData[36]); |  | 
|  2889     nEntry = pBt->usableSize/5; |  | 
|  2890     nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry; |  | 
|  2891     nFin = nOrig - nFree - nPtrmap; |  | 
|  2892     if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){ |  | 
|  2893       nFin--; |  | 
|  2894     } |  | 
|  2895     while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){ |  | 
|  2896       nFin--; |  | 
|  2897     } |  | 
|  2898     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT; |  | 
|  2899  |  | 
|  2900     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){ |  | 
|  2901       rc = incrVacuumStep(pBt, nFin, iFree); |  | 
|  2902     } |  | 
|  2903     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){ |  | 
|  2904       rc = SQLITE_OK; |  | 
|  2905       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); |  | 
|  2906       put4byte(&pBt->pPage1->aData[32], 0); |  | 
|  2907       put4byte(&pBt->pPage1->aData[36], 0); |  | 
|  2908       sqlite3PagerTruncateImage(pBt->pPager, nFin); |  | 
|  2909     } |  | 
|  2910     if( rc!=SQLITE_OK ){ |  | 
|  2911       sqlite3PagerRollback(pPager); |  | 
|  2912     } |  | 
|  2913   } |  | 
|  2914  |  | 
|  2915   assert( nRef==sqlite3PagerRefcount(pPager) ); |  | 
|  2916   return rc; |  | 
|  2917 } |  | 
|  2918  |  | 
|  2919 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */ |  | 
|  2920 # define setChildPtrmaps(x) SQLITE_OK |  | 
|  2921 #endif |  | 
|  2922  |  | 
|  2923 /* |  | 
|  2924 ** This routine does the first phase of a two-phase commit.  This routine |  | 
|  2925 ** causes a rollback journal to be created (if it does not already exist) |  | 
|  2926 ** and populated with enough information so that if a power loss occurs |  | 
|  2927 ** the database can be restored to its original state by playing back |  | 
|  2928 ** the journal.  Then the contents of the journal are flushed out to |  | 
|  2929 ** the disk.  After the journal is safely on oxide, the changes to the |  | 
|  2930 ** database are written into the database file and flushed to oxide. |  | 
|  2931 ** At the end of this call, the rollback journal still exists on the |  | 
|  2932 ** disk and we are still holding all locks, so the transaction has not |  | 
|  2933 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the |  | 
|  2934 ** commit process. |  | 
|  2935 ** |  | 
|  2936 ** This call is a no-op if no write-transaction is currently active on pBt. |  | 
|  2937 ** |  | 
|  2938 ** Otherwise, sync the database file for the btree pBt. zMaster points to |  | 
|  2939 ** the name of a master journal file that should be written into the |  | 
|  2940 ** individual journal file, or is NULL, indicating no master journal file  |  | 
|  2941 ** (single database transaction). |  | 
|  2942 ** |  | 
|  2943 ** When this is called, the master journal should already have been |  | 
|  2944 ** created, populated with this journal pointer and synced to disk. |  | 
|  2945 ** |  | 
|  2946 ** Once this is routine has returned, the only thing required to commit |  | 
|  2947 ** the write-transaction for this database file is to delete the journal. |  | 
|  2948 */ |  | 
|  2949 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){ |  | 
|  2950   int rc = SQLITE_OK; |  | 
|  2951   if( p->inTrans==TRANS_WRITE ){ |  | 
|  2952     BtShared *pBt = p->pBt; |  | 
|  2953     sqlite3BtreeEnter(p); |  | 
|  2954 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  2955     if( pBt->autoVacuum ){ |  | 
|  2956       rc = autoVacuumCommit(pBt); |  | 
|  2957       if( rc!=SQLITE_OK ){ |  | 
|  2958         sqlite3BtreeLeave(p); |  | 
|  2959         return rc; |  | 
|  2960       } |  | 
|  2961     } |  | 
|  2962 #endif |  | 
|  2963     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0); |  | 
|  2964     sqlite3BtreeLeave(p); |  | 
|  2965   } |  | 
|  2966   return rc; |  | 
|  2967 } |  | 
|  2968  |  | 
|  2969 /* |  | 
|  2970 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback() |  | 
|  2971 ** at the conclusion of a transaction. |  | 
|  2972 */ |  | 
|  2973 static void btreeEndTransaction(Btree *p){ |  | 
|  2974   BtShared *pBt = p->pBt; |  | 
|  2975   BtCursor *pCsr; |  | 
|  2976   assert( sqlite3BtreeHoldsMutex(p) ); |  | 
|  2977  |  | 
|  2978   /* Search for a cursor held open by this b-tree connection. If one exists, |  | 
|  2979   ** then the transaction will be downgraded to a read-only transaction |  | 
|  2980   ** instead of actually concluded. A subsequent call to CommitPhaseTwo()  |  | 
|  2981   ** or Rollback() will finish the transaction and unlock the database.  */ |  | 
|  2982   for(pCsr=pBt->pCursor; pCsr && pCsr->pBtree!=p; pCsr=pCsr->pNext); |  | 
|  2983   assert( pCsr==0 || p->inTrans>TRANS_NONE ); |  | 
|  2984  |  | 
|  2985   btreeClearHasContent(pBt); |  | 
|  2986   if( pCsr ){ |  | 
|  2987     downgradeAllSharedCacheTableLocks(p); |  | 
|  2988     p->inTrans = TRANS_READ; |  | 
|  2989   }else{ |  | 
|  2990     /* If the handle had any kind of transaction open, decrement the  |  | 
|  2991     ** transaction count of the shared btree. If the transaction count  |  | 
|  2992     ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused() |  | 
|  2993     ** call below will unlock the pager.  */ |  | 
|  2994     if( p->inTrans!=TRANS_NONE ){ |  | 
|  2995       clearAllSharedCacheTableLocks(p); |  | 
|  2996       pBt->nTransaction--; |  | 
|  2997       if( 0==pBt->nTransaction ){ |  | 
|  2998         pBt->inTransaction = TRANS_NONE; |  | 
|  2999       } |  | 
|  3000     } |  | 
|  3001  |  | 
|  3002     /* Set the current transaction state to TRANS_NONE and unlock the  |  | 
|  3003     ** pager if this call closed the only read or write transaction.  */ |  | 
|  3004     p->inTrans = TRANS_NONE; |  | 
|  3005     unlockBtreeIfUnused(pBt); |  | 
|  3006   } |  | 
|  3007  |  | 
|  3008   btreeIntegrity(p); |  | 
|  3009 } |  | 
|  3010  |  | 
|  3011 /* |  | 
|  3012 ** Commit the transaction currently in progress. |  | 
|  3013 ** |  | 
|  3014 ** This routine implements the second phase of a 2-phase commit.  The |  | 
|  3015 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should |  | 
|  3016 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne() |  | 
|  3017 ** routine did all the work of writing information out to disk and flushing the |  | 
|  3018 ** contents so that they are written onto the disk platter.  All this |  | 
|  3019 ** routine has to do is delete or truncate or zero the header in the |  | 
|  3020 ** the rollback journal (which causes the transaction to commit) and |  | 
|  3021 ** drop locks. |  | 
|  3022 ** |  | 
|  3023 ** This will release the write lock on the database file.  If there |  | 
|  3024 ** are no active cursors, it also releases the read lock. |  | 
|  3025 */ |  | 
|  3026 int sqlite3BtreeCommitPhaseTwo(Btree *p){ |  | 
|  3027   BtShared *pBt = p->pBt; |  | 
|  3028  |  | 
|  3029   sqlite3BtreeEnter(p); |  | 
|  3030   btreeIntegrity(p); |  | 
|  3031  |  | 
|  3032   /* If the handle has a write-transaction open, commit the shared-btrees  |  | 
|  3033   ** transaction and set the shared state to TRANS_READ. |  | 
|  3034   */ |  | 
|  3035   if( p->inTrans==TRANS_WRITE ){ |  | 
|  3036     int rc; |  | 
|  3037     assert( pBt->inTransaction==TRANS_WRITE ); |  | 
|  3038     assert( pBt->nTransaction>0 ); |  | 
|  3039     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager); |  | 
|  3040     if( rc!=SQLITE_OK ){ |  | 
|  3041       sqlite3BtreeLeave(p); |  | 
|  3042       return rc; |  | 
|  3043     } |  | 
|  3044     pBt->inTransaction = TRANS_READ; |  | 
|  3045   } |  | 
|  3046  |  | 
|  3047   btreeEndTransaction(p); |  | 
|  3048   sqlite3BtreeLeave(p); |  | 
|  3049   return SQLITE_OK; |  | 
|  3050 } |  | 
|  3051  |  | 
|  3052 /* |  | 
|  3053 ** Do both phases of a commit. |  | 
|  3054 */ |  | 
|  3055 int sqlite3BtreeCommit(Btree *p){ |  | 
|  3056   int rc; |  | 
|  3057   sqlite3BtreeEnter(p); |  | 
|  3058   rc = sqlite3BtreeCommitPhaseOne(p, 0); |  | 
|  3059   if( rc==SQLITE_OK ){ |  | 
|  3060     rc = sqlite3BtreeCommitPhaseTwo(p); |  | 
|  3061   } |  | 
|  3062   sqlite3BtreeLeave(p); |  | 
|  3063   return rc; |  | 
|  3064 } |  | 
|  3065  |  | 
|  3066 #ifndef NDEBUG |  | 
|  3067 /* |  | 
|  3068 ** Return the number of write-cursors open on this handle. This is for use |  | 
|  3069 ** in assert() expressions, so it is only compiled if NDEBUG is not |  | 
|  3070 ** defined. |  | 
|  3071 ** |  | 
|  3072 ** For the purposes of this routine, a write-cursor is any cursor that |  | 
|  3073 ** is capable of writing to the databse.  That means the cursor was |  | 
|  3074 ** originally opened for writing and the cursor has not be disabled |  | 
|  3075 ** by having its state changed to CURSOR_FAULT. |  | 
|  3076 */ |  | 
|  3077 static int countWriteCursors(BtShared *pBt){ |  | 
|  3078   BtCursor *pCur; |  | 
|  3079   int r = 0; |  | 
|  3080   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ |  | 
|  3081     if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;  |  | 
|  3082   } |  | 
|  3083   return r; |  | 
|  3084 } |  | 
|  3085 #endif |  | 
|  3086  |  | 
|  3087 /* |  | 
|  3088 ** This routine sets the state to CURSOR_FAULT and the error |  | 
|  3089 ** code to errCode for every cursor on BtShared that pBtree |  | 
|  3090 ** references. |  | 
|  3091 ** |  | 
|  3092 ** Every cursor is tripped, including cursors that belong |  | 
|  3093 ** to other database connections that happen to be sharing |  | 
|  3094 ** the cache with pBtree. |  | 
|  3095 ** |  | 
|  3096 ** This routine gets called when a rollback occurs. |  | 
|  3097 ** All cursors using the same cache must be tripped |  | 
|  3098 ** to prevent them from trying to use the btree after |  | 
|  3099 ** the rollback.  The rollback may have deleted tables |  | 
|  3100 ** or moved root pages, so it is not sufficient to |  | 
|  3101 ** save the state of the cursor.  The cursor must be |  | 
|  3102 ** invalidated. |  | 
|  3103 */ |  | 
|  3104 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){ |  | 
|  3105   BtCursor *p; |  | 
|  3106   sqlite3BtreeEnter(pBtree); |  | 
|  3107   for(p=pBtree->pBt->pCursor; p; p=p->pNext){ |  | 
|  3108     int i; |  | 
|  3109     sqlite3BtreeClearCursor(p); |  | 
|  3110     p->eState = CURSOR_FAULT; |  | 
|  3111     p->skipNext = errCode; |  | 
|  3112     for(i=0; i<=p->iPage; i++){ |  | 
|  3113       releasePage(p->apPage[i]); |  | 
|  3114       p->apPage[i] = 0; |  | 
|  3115     } |  | 
|  3116   } |  | 
|  3117   sqlite3BtreeLeave(pBtree); |  | 
|  3118 } |  | 
|  3119  |  | 
|  3120 /* |  | 
|  3121 ** Rollback the transaction in progress.  All cursors will be |  | 
|  3122 ** invalided by this operation.  Any attempt to use a cursor |  | 
|  3123 ** that was open at the beginning of this operation will result |  | 
|  3124 ** in an error. |  | 
|  3125 ** |  | 
|  3126 ** This will release the write lock on the database file.  If there |  | 
|  3127 ** are no active cursors, it also releases the read lock. |  | 
|  3128 */ |  | 
|  3129 int sqlite3BtreeRollback(Btree *p){ |  | 
|  3130   int rc; |  | 
|  3131   BtShared *pBt = p->pBt; |  | 
|  3132   MemPage *pPage1; |  | 
|  3133  |  | 
|  3134   sqlite3BtreeEnter(p); |  | 
|  3135   rc = saveAllCursors(pBt, 0, 0); |  | 
|  3136 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|  3137   if( rc!=SQLITE_OK ){ |  | 
|  3138     /* This is a horrible situation. An IO or malloc() error occurred whilst |  | 
|  3139     ** trying to save cursor positions. If this is an automatic rollback (as |  | 
|  3140     ** the result of a constraint, malloc() failure or IO error) then  |  | 
|  3141     ** the cache may be internally inconsistent (not contain valid trees) so |  | 
|  3142     ** we cannot simply return the error to the caller. Instead, abort  |  | 
|  3143     ** all queries that may be using any of the cursors that failed to save. |  | 
|  3144     */ |  | 
|  3145     sqlite3BtreeTripAllCursors(p, rc); |  | 
|  3146   } |  | 
|  3147 #endif |  | 
|  3148   btreeIntegrity(p); |  | 
|  3149  |  | 
|  3150   if( p->inTrans==TRANS_WRITE ){ |  | 
|  3151     int rc2; |  | 
|  3152  |  | 
|  3153     assert( TRANS_WRITE==pBt->inTransaction ); |  | 
|  3154     rc2 = sqlite3PagerRollback(pBt->pPager); |  | 
|  3155     if( rc2!=SQLITE_OK ){ |  | 
|  3156       rc = rc2; |  | 
|  3157     } |  | 
|  3158  |  | 
|  3159     /* The rollback may have destroyed the pPage1->aData value.  So |  | 
|  3160     ** call btreeGetPage() on page 1 again to make |  | 
|  3161     ** sure pPage1->aData is set correctly. */ |  | 
|  3162     if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){ |  | 
|  3163       releasePage(pPage1); |  | 
|  3164     } |  | 
|  3165     assert( countWriteCursors(pBt)==0 ); |  | 
|  3166     pBt->inTransaction = TRANS_READ; |  | 
|  3167   } |  | 
|  3168  |  | 
|  3169   btreeEndTransaction(p); |  | 
|  3170   sqlite3BtreeLeave(p); |  | 
|  3171   return rc; |  | 
|  3172 } |  | 
|  3173  |  | 
|  3174 /* |  | 
|  3175 ** Start a statement subtransaction. The subtransaction can can be rolled |  | 
|  3176 ** back independently of the main transaction. You must start a transaction  |  | 
|  3177 ** before starting a subtransaction. The subtransaction is ended automatically  |  | 
|  3178 ** if the main transaction commits or rolls back. |  | 
|  3179 ** |  | 
|  3180 ** Statement subtransactions are used around individual SQL statements |  | 
|  3181 ** that are contained within a BEGIN...COMMIT block.  If a constraint |  | 
|  3182 ** error occurs within the statement, the effect of that one statement |  | 
|  3183 ** can be rolled back without having to rollback the entire transaction. |  | 
|  3184 ** |  | 
|  3185 ** A statement sub-transaction is implemented as an anonymous savepoint. The |  | 
|  3186 ** value passed as the second parameter is the total number of savepoints, |  | 
|  3187 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there |  | 
|  3188 ** are no active savepoints and no other statement-transactions open, |  | 
|  3189 ** iStatement is 1. This anonymous savepoint can be released or rolled back |  | 
|  3190 ** using the sqlite3BtreeSavepoint() function. |  | 
|  3191 */ |  | 
|  3192 int sqlite3BtreeBeginStmt(Btree *p, int iStatement){ |  | 
|  3193   int rc; |  | 
|  3194   BtShared *pBt = p->pBt; |  | 
|  3195   sqlite3BtreeEnter(p); |  | 
|  3196   assert( p->inTrans==TRANS_WRITE ); |  | 
|  3197   assert( pBt->readOnly==0 ); |  | 
|  3198   assert( iStatement>0 ); |  | 
|  3199   assert( iStatement>p->db->nSavepoint ); |  | 
|  3200   if( NEVER(p->inTrans!=TRANS_WRITE || pBt->readOnly) ){ |  | 
|  3201     rc = SQLITE_INTERNAL; |  | 
|  3202   }else{ |  | 
|  3203     assert( pBt->inTransaction==TRANS_WRITE ); |  | 
|  3204     /* At the pager level, a statement transaction is a savepoint with |  | 
|  3205     ** an index greater than all savepoints created explicitly using |  | 
|  3206     ** SQL statements. It is illegal to open, release or rollback any |  | 
|  3207     ** such savepoints while the statement transaction savepoint is active. |  | 
|  3208     */ |  | 
|  3209     rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement); |  | 
|  3210   } |  | 
|  3211   sqlite3BtreeLeave(p); |  | 
|  3212   return rc; |  | 
|  3213 } |  | 
|  3214  |  | 
|  3215 /* |  | 
|  3216 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK |  | 
|  3217 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the |  | 
|  3218 ** savepoint identified by parameter iSavepoint, depending on the value  |  | 
|  3219 ** of op. |  | 
|  3220 ** |  | 
|  3221 ** Normally, iSavepoint is greater than or equal to zero. However, if op is |  | 
|  3222 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the  |  | 
|  3223 ** contents of the entire transaction are rolled back. This is different |  | 
|  3224 ** from a normal transaction rollback, as no locks are released and the |  | 
|  3225 ** transaction remains open. |  | 
|  3226 */ |  | 
|  3227 int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){ |  | 
|  3228   int rc = SQLITE_OK; |  | 
|  3229   if( p && p->inTrans==TRANS_WRITE ){ |  | 
|  3230     BtShared *pBt = p->pBt; |  | 
|  3231     assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK ); |  | 
|  3232     assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) ); |  | 
|  3233     sqlite3BtreeEnter(p); |  | 
|  3234     rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint); |  | 
|  3235     if( rc==SQLITE_OK ){ |  | 
|  3236       rc = newDatabase(pBt); |  | 
|  3237     } |  | 
|  3238     sqlite3BtreeLeave(p); |  | 
|  3239   } |  | 
|  3240   return rc; |  | 
|  3241 } |  | 
|  3242  |  | 
|  3243 /* |  | 
|  3244 ** Create a new cursor for the BTree whose root is on the page |  | 
|  3245 ** iTable. If a read-only cursor is requested, it is assumed that |  | 
|  3246 ** the caller already has at least a read-only transaction open |  | 
|  3247 ** on the database already. If a write-cursor is requested, then |  | 
|  3248 ** the caller is assumed to have an open write transaction. |  | 
|  3249 ** |  | 
|  3250 ** If wrFlag==0, then the cursor can only be used for reading. |  | 
|  3251 ** If wrFlag==1, then the cursor can be used for reading or for |  | 
|  3252 ** writing if other conditions for writing are also met.  These |  | 
|  3253 ** are the conditions that must be met in order for writing to |  | 
|  3254 ** be allowed: |  | 
|  3255 ** |  | 
|  3256 ** 1:  The cursor must have been opened with wrFlag==1 |  | 
|  3257 ** |  | 
|  3258 ** 2:  Other database connections that share the same pager cache |  | 
|  3259 **     but which are not in the READ_UNCOMMITTED state may not have |  | 
|  3260 **     cursors open with wrFlag==0 on the same table.  Otherwise |  | 
|  3261 **     the changes made by this write cursor would be visible to |  | 
|  3262 **     the read cursors in the other database connection. |  | 
|  3263 ** |  | 
|  3264 ** 3:  The database must be writable (not on read-only media) |  | 
|  3265 ** |  | 
|  3266 ** 4:  There must be an active transaction. |  | 
|  3267 ** |  | 
|  3268 ** No checking is done to make sure that page iTable really is the |  | 
|  3269 ** root page of a b-tree.  If it is not, then the cursor acquired |  | 
|  3270 ** will not work correctly. |  | 
|  3271 ** |  | 
|  3272 ** It is assumed that the sqlite3BtreeCursorSize() bytes of memory  |  | 
|  3273 ** pointed to by pCur have been zeroed by the caller. |  | 
|  3274 */ |  | 
|  3275 static int btreeCursor( |  | 
|  3276   Btree *p,                              /* The btree */ |  | 
|  3277   int iTable,                            /* Root page of table to open */ |  | 
|  3278   int wrFlag,                            /* 1 to write. 0 read-only */ |  | 
|  3279   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */ |  | 
|  3280   BtCursor *pCur                         /* Space for new cursor */ |  | 
|  3281 ){ |  | 
|  3282   BtShared *pBt = p->pBt;                /* Shared b-tree handle */ |  | 
|  3283  |  | 
|  3284   assert( sqlite3BtreeHoldsMutex(p) ); |  | 
|  3285   assert( wrFlag==0 || wrFlag==1 ); |  | 
|  3286  |  | 
|  3287   /* The following assert statements verify that if this is a sharable  |  | 
|  3288   ** b-tree database, the connection is holding the required table locks,  |  | 
|  3289   ** and that no other connection has any open cursor that conflicts with  |  | 
|  3290   ** this lock.  */ |  | 
|  3291   assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) ); |  | 
|  3292   assert( wrFlag==0 || !hasReadConflicts(p, iTable) ); |  | 
|  3293  |  | 
|  3294   /* Assert that the caller has opened the required transaction. */ |  | 
|  3295   assert( p->inTrans>TRANS_NONE ); |  | 
|  3296   assert( wrFlag==0 || p->inTrans==TRANS_WRITE ); |  | 
|  3297   assert( pBt->pPage1 && pBt->pPage1->aData ); |  | 
|  3298  |  | 
|  3299   if( NEVER(wrFlag && pBt->readOnly) ){ |  | 
|  3300     return SQLITE_READONLY; |  | 
|  3301   } |  | 
|  3302   if( iTable==1 && pagerPagecount(pBt)==0 ){ |  | 
|  3303     return SQLITE_EMPTY; |  | 
|  3304   } |  | 
|  3305  |  | 
|  3306   /* Now that no other errors can occur, finish filling in the BtCursor |  | 
|  3307   ** variables and link the cursor into the BtShared list.  */ |  | 
|  3308   pCur->pgnoRoot = (Pgno)iTable; |  | 
|  3309   pCur->iPage = -1; |  | 
|  3310   pCur->pKeyInfo = pKeyInfo; |  | 
|  3311   pCur->pBtree = p; |  | 
|  3312   pCur->pBt = pBt; |  | 
|  3313   pCur->wrFlag = (u8)wrFlag; |  | 
|  3314   pCur->pNext = pBt->pCursor; |  | 
|  3315   if( pCur->pNext ){ |  | 
|  3316     pCur->pNext->pPrev = pCur; |  | 
|  3317   } |  | 
|  3318   pBt->pCursor = pCur; |  | 
|  3319   pCur->eState = CURSOR_INVALID; |  | 
|  3320   pCur->cachedRowid = 0; |  | 
|  3321   return SQLITE_OK; |  | 
|  3322 } |  | 
|  3323 int sqlite3BtreeCursor( |  | 
|  3324   Btree *p,                                   /* The btree */ |  | 
|  3325   int iTable,                                 /* Root page of table to open */ |  | 
|  3326   int wrFlag,                                 /* 1 to write. 0 read-only */ |  | 
|  3327   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */ |  | 
|  3328   BtCursor *pCur                              /* Write new cursor here */ |  | 
|  3329 ){ |  | 
|  3330   int rc; |  | 
|  3331   sqlite3BtreeEnter(p); |  | 
|  3332   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur); |  | 
|  3333   sqlite3BtreeLeave(p); |  | 
|  3334   return rc; |  | 
|  3335 } |  | 
|  3336  |  | 
|  3337 /* |  | 
|  3338 ** Return the size of a BtCursor object in bytes. |  | 
|  3339 ** |  | 
|  3340 ** This interfaces is needed so that users of cursors can preallocate |  | 
|  3341 ** sufficient storage to hold a cursor.  The BtCursor object is opaque |  | 
|  3342 ** to users so they cannot do the sizeof() themselves - they must call |  | 
|  3343 ** this routine. |  | 
|  3344 */ |  | 
|  3345 int sqlite3BtreeCursorSize(void){ |  | 
|  3346   return sizeof(BtCursor); |  | 
|  3347 } |  | 
|  3348  |  | 
|  3349 /* |  | 
|  3350 ** Set the cached rowid value of every cursor in the same database file |  | 
|  3351 ** as pCur and having the same root page number as pCur.  The value is |  | 
|  3352 ** set to iRowid. |  | 
|  3353 ** |  | 
|  3354 ** Only positive rowid values are considered valid for this cache. |  | 
|  3355 ** The cache is initialized to zero, indicating an invalid cache. |  | 
|  3356 ** A btree will work fine with zero or negative rowids.  We just cannot |  | 
|  3357 ** cache zero or negative rowids, which means tables that use zero or |  | 
|  3358 ** negative rowids might run a little slower.  But in practice, zero |  | 
|  3359 ** or negative rowids are very uncommon so this should not be a problem. |  | 
|  3360 */ |  | 
|  3361 void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){ |  | 
|  3362   BtCursor *p; |  | 
|  3363   for(p=pCur->pBt->pCursor; p; p=p->pNext){ |  | 
|  3364     if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid; |  | 
|  3365   } |  | 
|  3366   assert( pCur->cachedRowid==iRowid ); |  | 
|  3367 } |  | 
|  3368  |  | 
|  3369 /* |  | 
|  3370 ** Return the cached rowid for the given cursor.  A negative or zero |  | 
|  3371 ** return value indicates that the rowid cache is invalid and should be |  | 
|  3372 ** ignored.  If the rowid cache has never before been set, then a |  | 
|  3373 ** zero is returned. |  | 
|  3374 */ |  | 
|  3375 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){ |  | 
|  3376   return pCur->cachedRowid; |  | 
|  3377 } |  | 
|  3378  |  | 
|  3379 /* |  | 
|  3380 ** Close a cursor.  The read lock on the database file is released |  | 
|  3381 ** when the last cursor is closed. |  | 
|  3382 */ |  | 
|  3383 int sqlite3BtreeCloseCursor(BtCursor *pCur){ |  | 
|  3384   Btree *pBtree = pCur->pBtree; |  | 
|  3385   if( pBtree ){ |  | 
|  3386     int i; |  | 
|  3387     BtShared *pBt = pCur->pBt; |  | 
|  3388     sqlite3BtreeEnter(pBtree); |  | 
|  3389     sqlite3BtreeClearCursor(pCur); |  | 
|  3390     if( pCur->pPrev ){ |  | 
|  3391       pCur->pPrev->pNext = pCur->pNext; |  | 
|  3392     }else{ |  | 
|  3393       pBt->pCursor = pCur->pNext; |  | 
|  3394     } |  | 
|  3395     if( pCur->pNext ){ |  | 
|  3396       pCur->pNext->pPrev = pCur->pPrev; |  | 
|  3397     } |  | 
|  3398     for(i=0; i<=pCur->iPage; i++){ |  | 
|  3399       releasePage(pCur->apPage[i]); |  | 
|  3400     } |  | 
|  3401     unlockBtreeIfUnused(pBt); |  | 
|  3402     invalidateOverflowCache(pCur); |  | 
|  3403     /* sqlite3_free(pCur); */ |  | 
|  3404     sqlite3BtreeLeave(pBtree); |  | 
|  3405   } |  | 
|  3406   return SQLITE_OK; |  | 
|  3407 } |  | 
|  3408  |  | 
|  3409 /* |  | 
|  3410 ** Make sure the BtCursor* given in the argument has a valid |  | 
|  3411 ** BtCursor.info structure.  If it is not already valid, call |  | 
|  3412 ** btreeParseCell() to fill it in. |  | 
|  3413 ** |  | 
|  3414 ** BtCursor.info is a cache of the information in the current cell. |  | 
|  3415 ** Using this cache reduces the number of calls to btreeParseCell(). |  | 
|  3416 ** |  | 
|  3417 ** 2007-06-25:  There is a bug in some versions of MSVC that cause the |  | 
|  3418 ** compiler to crash when getCellInfo() is implemented as a macro. |  | 
|  3419 ** But there is a measureable speed advantage to using the macro on gcc |  | 
|  3420 ** (when less compiler optimizations like -Os or -O0 are used and the |  | 
|  3421 ** compiler is not doing agressive inlining.)  So we use a real function |  | 
|  3422 ** for MSVC and a macro for everything else.  Ticket #2457. |  | 
|  3423 */ |  | 
|  3424 #ifndef NDEBUG |  | 
|  3425   static void assertCellInfo(BtCursor *pCur){ |  | 
|  3426     CellInfo info; |  | 
|  3427     int iPage = pCur->iPage; |  | 
|  3428     memset(&info, 0, sizeof(info)); |  | 
|  3429     btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info); |  | 
|  3430     assert( memcmp(&info, &pCur->info, sizeof(info))==0 ); |  | 
|  3431   } |  | 
|  3432 #else |  | 
|  3433   #define assertCellInfo(x) |  | 
|  3434 #endif |  | 
|  3435 #ifdef _MSC_VER |  | 
|  3436   /* Use a real function in MSVC to work around bugs in that compiler. */ |  | 
|  3437   static void getCellInfo(BtCursor *pCur){ |  | 
|  3438     if( pCur->info.nSize==0 ){ |  | 
|  3439       int iPage = pCur->iPage; |  | 
|  3440       btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); |  | 
|  3441       pCur->validNKey = 1; |  | 
|  3442     }else{ |  | 
|  3443       assertCellInfo(pCur); |  | 
|  3444     } |  | 
|  3445   } |  | 
|  3446 #else /* if not _MSC_VER */ |  | 
|  3447   /* Use a macro in all other compilers so that the function is inlined */ |  | 
|  3448 #define getCellInfo(pCur)                                                      \ |  | 
|  3449   if( pCur->info.nSize==0 ){                                                   \ |  | 
|  3450     int iPage = pCur->iPage;                                                   \ |  | 
|  3451     btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \ |  | 
|  3452     pCur->validNKey = 1;                                                       \ |  | 
|  3453   }else{                                                                       \ |  | 
|  3454     assertCellInfo(pCur);                                                      \ |  | 
|  3455   } |  | 
|  3456 #endif /* _MSC_VER */ |  | 
|  3457  |  | 
|  3458 #ifndef NDEBUG  /* The next routine used only within assert() statements */ |  | 
|  3459 /* |  | 
|  3460 ** Return true if the given BtCursor is valid.  A valid cursor is one |  | 
|  3461 ** that is currently pointing to a row in a (non-empty) table. |  | 
|  3462 ** This is a verification routine is used only within assert() statements. |  | 
|  3463 */ |  | 
|  3464 int sqlite3BtreeCursorIsValid(BtCursor *pCur){ |  | 
|  3465   return pCur && pCur->eState==CURSOR_VALID; |  | 
|  3466 } |  | 
|  3467 #endif /* NDEBUG */ |  | 
|  3468  |  | 
|  3469 /* |  | 
|  3470 ** Set *pSize to the size of the buffer needed to hold the value of |  | 
|  3471 ** the key for the current entry.  If the cursor is not pointing |  | 
|  3472 ** to a valid entry, *pSize is set to 0.  |  | 
|  3473 ** |  | 
|  3474 ** For a table with the INTKEY flag set, this routine returns the key |  | 
|  3475 ** itself, not the number of bytes in the key. |  | 
|  3476 ** |  | 
|  3477 ** The caller must position the cursor prior to invoking this routine. |  | 
|  3478 **  |  | 
|  3479 ** This routine cannot fail.  It always returns SQLITE_OK.   |  | 
|  3480 */ |  | 
|  3481 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){ |  | 
|  3482   assert( cursorHoldsMutex(pCur) ); |  | 
|  3483   assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID ); |  | 
|  3484   if( pCur->eState!=CURSOR_VALID ){ |  | 
|  3485     *pSize = 0; |  | 
|  3486   }else{ |  | 
|  3487     getCellInfo(pCur); |  | 
|  3488     *pSize = pCur->info.nKey; |  | 
|  3489   } |  | 
|  3490   return SQLITE_OK; |  | 
|  3491 } |  | 
|  3492  |  | 
|  3493 /* |  | 
|  3494 ** Set *pSize to the number of bytes of data in the entry the |  | 
|  3495 ** cursor currently points to. |  | 
|  3496 ** |  | 
|  3497 ** The caller must guarantee that the cursor is pointing to a non-NULL |  | 
|  3498 ** valid entry.  In other words, the calling procedure must guarantee |  | 
|  3499 ** that the cursor has Cursor.eState==CURSOR_VALID. |  | 
|  3500 ** |  | 
|  3501 ** Failure is not possible.  This function always returns SQLITE_OK. |  | 
|  3502 ** It might just as well be a procedure (returning void) but we continue |  | 
|  3503 ** to return an integer result code for historical reasons. |  | 
|  3504 */ |  | 
|  3505 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){ |  | 
|  3506   assert( cursorHoldsMutex(pCur) ); |  | 
|  3507   assert( pCur->eState==CURSOR_VALID ); |  | 
|  3508   getCellInfo(pCur); |  | 
|  3509   *pSize = pCur->info.nData; |  | 
|  3510   return SQLITE_OK; |  | 
|  3511 } |  | 
|  3512  |  | 
|  3513 /* |  | 
|  3514 ** Given the page number of an overflow page in the database (parameter |  | 
|  3515 ** ovfl), this function finds the page number of the next page in the  |  | 
|  3516 ** linked list of overflow pages. If possible, it uses the auto-vacuum |  | 
|  3517 ** pointer-map data instead of reading the content of page ovfl to do so.  |  | 
|  3518 ** |  | 
|  3519 ** If an error occurs an SQLite error code is returned. Otherwise: |  | 
|  3520 ** |  | 
|  3521 ** The page number of the next overflow page in the linked list is  |  | 
|  3522 ** written to *pPgnoNext. If page ovfl is the last page in its linked  |  | 
|  3523 ** list, *pPgnoNext is set to zero.  |  | 
|  3524 ** |  | 
|  3525 ** If ppPage is not NULL, and a reference to the MemPage object corresponding |  | 
|  3526 ** to page number pOvfl was obtained, then *ppPage is set to point to that |  | 
|  3527 ** reference. It is the responsibility of the caller to call releasePage() |  | 
|  3528 ** on *ppPage to free the reference. In no reference was obtained (because |  | 
|  3529 ** the pointer-map was used to obtain the value for *pPgnoNext), then |  | 
|  3530 ** *ppPage is set to zero. |  | 
|  3531 */ |  | 
|  3532 static int getOverflowPage( |  | 
|  3533   BtShared *pBt,               /* The database file */ |  | 
|  3534   Pgno ovfl,                   /* Current overflow page number */ |  | 
|  3535   MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */ |  | 
|  3536   Pgno *pPgnoNext              /* OUT: Next overflow page number */ |  | 
|  3537 ){ |  | 
|  3538   Pgno next = 0; |  | 
|  3539   MemPage *pPage = 0; |  | 
|  3540   int rc = SQLITE_OK; |  | 
|  3541  |  | 
|  3542   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  3543   assert(pPgnoNext); |  | 
|  3544  |  | 
|  3545 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  3546   /* Try to find the next page in the overflow list using the |  | 
|  3547   ** autovacuum pointer-map pages. Guess that the next page in  |  | 
|  3548   ** the overflow list is page number (ovfl+1). If that guess turns  |  | 
|  3549   ** out to be wrong, fall back to loading the data of page  |  | 
|  3550   ** number ovfl to determine the next page number. |  | 
|  3551   */ |  | 
|  3552   if( pBt->autoVacuum ){ |  | 
|  3553     Pgno pgno; |  | 
|  3554     Pgno iGuess = ovfl+1; |  | 
|  3555     u8 eType; |  | 
|  3556  |  | 
|  3557     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ |  | 
|  3558       iGuess++; |  | 
|  3559     } |  | 
|  3560  |  | 
|  3561     if( iGuess<=pagerPagecount(pBt) ){ |  | 
|  3562       rc = ptrmapGet(pBt, iGuess, &eType, &pgno); |  | 
|  3563       if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ |  | 
|  3564         next = iGuess; |  | 
|  3565         rc = SQLITE_DONE; |  | 
|  3566       } |  | 
|  3567     } |  | 
|  3568   } |  | 
|  3569 #endif |  | 
|  3570  |  | 
|  3571   assert( next==0 || rc==SQLITE_DONE ); |  | 
|  3572   if( rc==SQLITE_OK ){ |  | 
|  3573     rc = btreeGetPage(pBt, ovfl, &pPage, 0); |  | 
|  3574     assert( rc==SQLITE_OK || pPage==0 ); |  | 
|  3575     if( rc==SQLITE_OK ){ |  | 
|  3576       next = get4byte(pPage->aData); |  | 
|  3577     } |  | 
|  3578   } |  | 
|  3579  |  | 
|  3580   *pPgnoNext = next; |  | 
|  3581   if( ppPage ){ |  | 
|  3582     *ppPage = pPage; |  | 
|  3583   }else{ |  | 
|  3584     releasePage(pPage); |  | 
|  3585   } |  | 
|  3586   return (rc==SQLITE_DONE ? SQLITE_OK : rc); |  | 
|  3587 } |  | 
|  3588  |  | 
|  3589 /* |  | 
|  3590 ** Copy data from a buffer to a page, or from a page to a buffer. |  | 
|  3591 ** |  | 
|  3592 ** pPayload is a pointer to data stored on database page pDbPage. |  | 
|  3593 ** If argument eOp is false, then nByte bytes of data are copied |  | 
|  3594 ** from pPayload to the buffer pointed at by pBuf. If eOp is true, |  | 
|  3595 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes |  | 
|  3596 ** of data are copied from the buffer pBuf to pPayload. |  | 
|  3597 ** |  | 
|  3598 ** SQLITE_OK is returned on success, otherwise an error code. |  | 
|  3599 */ |  | 
|  3600 static int copyPayload( |  | 
|  3601   void *pPayload,           /* Pointer to page data */ |  | 
|  3602   void *pBuf,               /* Pointer to buffer */ |  | 
|  3603   int nByte,                /* Number of bytes to copy */ |  | 
|  3604   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */ |  | 
|  3605   DbPage *pDbPage           /* Page containing pPayload */ |  | 
|  3606 ){ |  | 
|  3607   if( eOp ){ |  | 
|  3608     /* Copy data from buffer to page (a write operation) */ |  | 
|  3609     int rc = sqlite3PagerWrite(pDbPage); |  | 
|  3610     if( rc!=SQLITE_OK ){ |  | 
|  3611       return rc; |  | 
|  3612     } |  | 
|  3613     memcpy(pPayload, pBuf, nByte); |  | 
|  3614   }else{ |  | 
|  3615     /* Copy data from page to buffer (a read operation) */ |  | 
|  3616     memcpy(pBuf, pPayload, nByte); |  | 
|  3617   } |  | 
|  3618   return SQLITE_OK; |  | 
|  3619 } |  | 
|  3620  |  | 
|  3621 /* |  | 
|  3622 ** This function is used to read or overwrite payload information |  | 
|  3623 ** for the entry that the pCur cursor is pointing to. If the eOp |  | 
|  3624 ** parameter is 0, this is a read operation (data copied into |  | 
|  3625 ** buffer pBuf). If it is non-zero, a write (data copied from |  | 
|  3626 ** buffer pBuf). |  | 
|  3627 ** |  | 
|  3628 ** A total of "amt" bytes are read or written beginning at "offset". |  | 
|  3629 ** Data is read to or from the buffer pBuf. |  | 
|  3630 ** |  | 
|  3631 ** The content being read or written might appear on the main page |  | 
|  3632 ** or be scattered out on multiple overflow pages. |  | 
|  3633 ** |  | 
|  3634 ** If the BtCursor.isIncrblobHandle flag is set, and the current |  | 
|  3635 ** cursor entry uses one or more overflow pages, this function |  | 
|  3636 ** allocates space for and lazily popluates the overflow page-list  |  | 
|  3637 ** cache array (BtCursor.aOverflow). Subsequent calls use this |  | 
|  3638 ** cache to make seeking to the supplied offset more efficient. |  | 
|  3639 ** |  | 
|  3640 ** Once an overflow page-list cache has been allocated, it may be |  | 
|  3641 ** invalidated if some other cursor writes to the same table, or if |  | 
|  3642 ** the cursor is moved to a different row. Additionally, in auto-vacuum |  | 
|  3643 ** mode, the following events may invalidate an overflow page-list cache. |  | 
|  3644 ** |  | 
|  3645 **   * An incremental vacuum, |  | 
|  3646 **   * A commit in auto_vacuum="full" mode, |  | 
|  3647 **   * Creating a table (may require moving an overflow page). |  | 
|  3648 */ |  | 
|  3649 static int accessPayload( |  | 
|  3650   BtCursor *pCur,      /* Cursor pointing to entry to read from */ |  | 
|  3651   u32 offset,          /* Begin reading this far into payload */ |  | 
|  3652   u32 amt,             /* Read this many bytes */ |  | 
|  3653   unsigned char *pBuf, /* Write the bytes into this buffer */  |  | 
|  3654   int eOp              /* zero to read. non-zero to write. */ |  | 
|  3655 ){ |  | 
|  3656   unsigned char *aPayload; |  | 
|  3657   int rc = SQLITE_OK; |  | 
|  3658   u32 nKey; |  | 
|  3659   int iIdx = 0; |  | 
|  3660   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */ |  | 
|  3661   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */ |  | 
|  3662  |  | 
|  3663   assert( pPage ); |  | 
|  3664   assert( pCur->eState==CURSOR_VALID ); |  | 
|  3665   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); |  | 
|  3666   assert( cursorHoldsMutex(pCur) ); |  | 
|  3667  |  | 
|  3668   getCellInfo(pCur); |  | 
|  3669   aPayload = pCur->info.pCell + pCur->info.nHeader; |  | 
|  3670   nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey); |  | 
|  3671  |  | 
|  3672   if( NEVER(offset+amt > nKey+pCur->info.nData)  |  | 
|  3673    || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] |  | 
|  3674   ){ |  | 
|  3675     /* Trying to read or write past the end of the data is an error */ |  | 
|  3676     return SQLITE_CORRUPT_BKPT; |  | 
|  3677   } |  | 
|  3678  |  | 
|  3679   /* Check if data must be read/written to/from the btree page itself. */ |  | 
|  3680   if( offset<pCur->info.nLocal ){ |  | 
|  3681     int a = amt; |  | 
|  3682     if( a+offset>pCur->info.nLocal ){ |  | 
|  3683       a = pCur->info.nLocal - offset; |  | 
|  3684     } |  | 
|  3685     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage); |  | 
|  3686     offset = 0; |  | 
|  3687     pBuf += a; |  | 
|  3688     amt -= a; |  | 
|  3689   }else{ |  | 
|  3690     offset -= pCur->info.nLocal; |  | 
|  3691   } |  | 
|  3692  |  | 
|  3693   if( rc==SQLITE_OK && amt>0 ){ |  | 
|  3694     const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */ |  | 
|  3695     Pgno nextPage; |  | 
|  3696  |  | 
|  3697     nextPage = get4byte(&aPayload[pCur->info.nLocal]); |  | 
|  3698  |  | 
|  3699 #ifndef SQLITE_OMIT_INCRBLOB |  | 
|  3700     /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[] |  | 
|  3701     ** has not been allocated, allocate it now. The array is sized at |  | 
|  3702     ** one entry for each overflow page in the overflow chain. The |  | 
|  3703     ** page number of the first overflow page is stored in aOverflow[0], |  | 
|  3704     ** etc. A value of 0 in the aOverflow[] array means "not yet known" |  | 
|  3705     ** (the cache is lazily populated). |  | 
|  3706     */ |  | 
|  3707     if( pCur->isIncrblobHandle && !pCur->aOverflow ){ |  | 
|  3708       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; |  | 
|  3709       pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl); |  | 
|  3710       /* nOvfl is always positive.  If it were zero, fetchPayload would have |  | 
|  3711       ** been used instead of this routine. */ |  | 
|  3712       if( ALWAYS(nOvfl) && !pCur->aOverflow ){ |  | 
|  3713         rc = SQLITE_NOMEM; |  | 
|  3714       } |  | 
|  3715     } |  | 
|  3716  |  | 
|  3717     /* If the overflow page-list cache has been allocated and the |  | 
|  3718     ** entry for the first required overflow page is valid, skip |  | 
|  3719     ** directly to it. |  | 
|  3720     */ |  | 
|  3721     if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){ |  | 
|  3722       iIdx = (offset/ovflSize); |  | 
|  3723       nextPage = pCur->aOverflow[iIdx]; |  | 
|  3724       offset = (offset%ovflSize); |  | 
|  3725     } |  | 
|  3726 #endif |  | 
|  3727  |  | 
|  3728     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){ |  | 
|  3729  |  | 
|  3730 #ifndef SQLITE_OMIT_INCRBLOB |  | 
|  3731       /* If required, populate the overflow page-list cache. */ |  | 
|  3732       if( pCur->aOverflow ){ |  | 
|  3733         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage); |  | 
|  3734         pCur->aOverflow[iIdx] = nextPage; |  | 
|  3735       } |  | 
|  3736 #endif |  | 
|  3737  |  | 
|  3738       if( offset>=ovflSize ){ |  | 
|  3739         /* The only reason to read this page is to obtain the page |  | 
|  3740         ** number for the next page in the overflow chain. The page |  | 
|  3741         ** data is not required. So first try to lookup the overflow |  | 
|  3742         ** page-list cache, if any, then fall back to the getOverflowPage() |  | 
|  3743         ** function. |  | 
|  3744         */ |  | 
|  3745 #ifndef SQLITE_OMIT_INCRBLOB |  | 
|  3746         if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){ |  | 
|  3747           nextPage = pCur->aOverflow[iIdx+1]; |  | 
|  3748         } else  |  | 
|  3749 #endif |  | 
|  3750           rc = getOverflowPage(pBt, nextPage, 0, &nextPage); |  | 
|  3751         offset -= ovflSize; |  | 
|  3752       }else{ |  | 
|  3753         /* Need to read this page properly. It contains some of the |  | 
|  3754         ** range of data that is being read (eOp==0) or written (eOp!=0). |  | 
|  3755         */ |  | 
|  3756         DbPage *pDbPage; |  | 
|  3757         int a = amt; |  | 
|  3758         rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage); |  | 
|  3759         if( rc==SQLITE_OK ){ |  | 
|  3760           aPayload = sqlite3PagerGetData(pDbPage); |  | 
|  3761           nextPage = get4byte(aPayload); |  | 
|  3762           if( a + offset > ovflSize ){ |  | 
|  3763             a = ovflSize - offset; |  | 
|  3764           } |  | 
|  3765           rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); |  | 
|  3766           sqlite3PagerUnref(pDbPage); |  | 
|  3767           offset = 0; |  | 
|  3768           amt -= a; |  | 
|  3769           pBuf += a; |  | 
|  3770         } |  | 
|  3771       } |  | 
|  3772     } |  | 
|  3773   } |  | 
|  3774  |  | 
|  3775   if( rc==SQLITE_OK && amt>0 ){ |  | 
|  3776     return SQLITE_CORRUPT_BKPT; |  | 
|  3777   } |  | 
|  3778   return rc; |  | 
|  3779 } |  | 
|  3780  |  | 
|  3781 /* |  | 
|  3782 ** Read part of the key associated with cursor pCur.  Exactly |  | 
|  3783 ** "amt" bytes will be transfered into pBuf[].  The transfer |  | 
|  3784 ** begins at "offset". |  | 
|  3785 ** |  | 
|  3786 ** The caller must ensure that pCur is pointing to a valid row |  | 
|  3787 ** in the table. |  | 
|  3788 ** |  | 
|  3789 ** Return SQLITE_OK on success or an error code if anything goes |  | 
|  3790 ** wrong.  An error is returned if "offset+amt" is larger than |  | 
|  3791 ** the available payload. |  | 
|  3792 */ |  | 
|  3793 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ |  | 
|  3794   assert( cursorHoldsMutex(pCur) ); |  | 
|  3795   assert( pCur->eState==CURSOR_VALID ); |  | 
|  3796   assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); |  | 
|  3797   assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); |  | 
|  3798   return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); |  | 
|  3799 } |  | 
|  3800  |  | 
|  3801 /* |  | 
|  3802 ** Read part of the data associated with cursor pCur.  Exactly |  | 
|  3803 ** "amt" bytes will be transfered into pBuf[].  The transfer |  | 
|  3804 ** begins at "offset". |  | 
|  3805 ** |  | 
|  3806 ** Return SQLITE_OK on success or an error code if anything goes |  | 
|  3807 ** wrong.  An error is returned if "offset+amt" is larger than |  | 
|  3808 ** the available payload. |  | 
|  3809 */ |  | 
|  3810 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ |  | 
|  3811   int rc; |  | 
|  3812  |  | 
|  3813 #ifndef SQLITE_OMIT_INCRBLOB |  | 
|  3814   if ( pCur->eState==CURSOR_INVALID ){ |  | 
|  3815     return SQLITE_ABORT; |  | 
|  3816   } |  | 
|  3817 #endif |  | 
|  3818  |  | 
|  3819   assert( cursorHoldsMutex(pCur) ); |  | 
|  3820   rc = restoreCursorPosition(pCur); |  | 
|  3821   if( rc==SQLITE_OK ){ |  | 
|  3822     assert( pCur->eState==CURSOR_VALID ); |  | 
|  3823     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); |  | 
|  3824     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); |  | 
|  3825     rc = accessPayload(pCur, offset, amt, pBuf, 0); |  | 
|  3826   } |  | 
|  3827   return rc; |  | 
|  3828 } |  | 
|  3829  |  | 
|  3830 /* |  | 
|  3831 ** Return a pointer to payload information from the entry that the  |  | 
|  3832 ** pCur cursor is pointing to.  The pointer is to the beginning of |  | 
|  3833 ** the key if skipKey==0 and it points to the beginning of data if |  | 
|  3834 ** skipKey==1.  The number of bytes of available key/data is written |  | 
|  3835 ** into *pAmt.  If *pAmt==0, then the value returned will not be |  | 
|  3836 ** a valid pointer. |  | 
|  3837 ** |  | 
|  3838 ** This routine is an optimization.  It is common for the entire key |  | 
|  3839 ** and data to fit on the local page and for there to be no overflow |  | 
|  3840 ** pages.  When that is so, this routine can be used to access the |  | 
|  3841 ** key and data without making a copy.  If the key and/or data spills |  | 
|  3842 ** onto overflow pages, then accessPayload() must be used to reassemble |  | 
|  3843 ** the key/data and copy it into a preallocated buffer. |  | 
|  3844 ** |  | 
|  3845 ** The pointer returned by this routine looks directly into the cached |  | 
|  3846 ** page of the database.  The data might change or move the next time |  | 
|  3847 ** any btree routine is called. |  | 
|  3848 */ |  | 
|  3849 static const unsigned char *fetchPayload( |  | 
|  3850   BtCursor *pCur,      /* Cursor pointing to entry to read from */ |  | 
|  3851   int *pAmt,           /* Write the number of available bytes here */ |  | 
|  3852   int skipKey          /* read beginning at data if this is true */ |  | 
|  3853 ){ |  | 
|  3854   unsigned char *aPayload; |  | 
|  3855   MemPage *pPage; |  | 
|  3856   u32 nKey; |  | 
|  3857   u32 nLocal; |  | 
|  3858  |  | 
|  3859   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]); |  | 
|  3860   assert( pCur->eState==CURSOR_VALID ); |  | 
|  3861   assert( cursorHoldsMutex(pCur) ); |  | 
|  3862   pPage = pCur->apPage[pCur->iPage]; |  | 
|  3863   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); |  | 
|  3864   if( NEVER(pCur->info.nSize==0) ){ |  | 
|  3865     btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage], |  | 
|  3866                    &pCur->info); |  | 
|  3867   } |  | 
|  3868   aPayload = pCur->info.pCell; |  | 
|  3869   aPayload += pCur->info.nHeader; |  | 
|  3870   if( pPage->intKey ){ |  | 
|  3871     nKey = 0; |  | 
|  3872   }else{ |  | 
|  3873     nKey = (int)pCur->info.nKey; |  | 
|  3874   } |  | 
|  3875   if( skipKey ){ |  | 
|  3876     aPayload += nKey; |  | 
|  3877     nLocal = pCur->info.nLocal - nKey; |  | 
|  3878   }else{ |  | 
|  3879     nLocal = pCur->info.nLocal; |  | 
|  3880     assert( nLocal<=nKey ); |  | 
|  3881   } |  | 
|  3882   *pAmt = nLocal; |  | 
|  3883   return aPayload; |  | 
|  3884 } |  | 
|  3885  |  | 
|  3886  |  | 
|  3887 /* |  | 
|  3888 ** For the entry that cursor pCur is point to, return as |  | 
|  3889 ** many bytes of the key or data as are available on the local |  | 
|  3890 ** b-tree page.  Write the number of available bytes into *pAmt. |  | 
|  3891 ** |  | 
|  3892 ** The pointer returned is ephemeral.  The key/data may move |  | 
|  3893 ** or be destroyed on the next call to any Btree routine, |  | 
|  3894 ** including calls from other threads against the same cache. |  | 
|  3895 ** Hence, a mutex on the BtShared should be held prior to calling |  | 
|  3896 ** this routine. |  | 
|  3897 ** |  | 
|  3898 ** These routines is used to get quick access to key and data |  | 
|  3899 ** in the common case where no overflow pages are used. |  | 
|  3900 */ |  | 
|  3901 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){ |  | 
|  3902   const void *p = 0; |  | 
|  3903   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); |  | 
|  3904   assert( cursorHoldsMutex(pCur) ); |  | 
|  3905   if( ALWAYS(pCur->eState==CURSOR_VALID) ){ |  | 
|  3906     p = (const void*)fetchPayload(pCur, pAmt, 0); |  | 
|  3907   } |  | 
|  3908   return p; |  | 
|  3909 } |  | 
|  3910 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){ |  | 
|  3911   const void *p = 0; |  | 
|  3912   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); |  | 
|  3913   assert( cursorHoldsMutex(pCur) ); |  | 
|  3914   if( ALWAYS(pCur->eState==CURSOR_VALID) ){ |  | 
|  3915     p = (const void*)fetchPayload(pCur, pAmt, 1); |  | 
|  3916   } |  | 
|  3917   return p; |  | 
|  3918 } |  | 
|  3919  |  | 
|  3920  |  | 
|  3921 /* |  | 
|  3922 ** Move the cursor down to a new child page.  The newPgno argument is the |  | 
|  3923 ** page number of the child page to move to. |  | 
|  3924 ** |  | 
|  3925 ** This function returns SQLITE_CORRUPT if the page-header flags field of |  | 
|  3926 ** the new child page does not match the flags field of the parent (i.e. |  | 
|  3927 ** if an intkey page appears to be the parent of a non-intkey page, or |  | 
|  3928 ** vice-versa). |  | 
|  3929 */ |  | 
|  3930 static int moveToChild(BtCursor *pCur, u32 newPgno){ |  | 
|  3931   int rc; |  | 
|  3932   int i = pCur->iPage; |  | 
|  3933   MemPage *pNewPage; |  | 
|  3934   BtShared *pBt = pCur->pBt; |  | 
|  3935  |  | 
|  3936   assert( cursorHoldsMutex(pCur) ); |  | 
|  3937   assert( pCur->eState==CURSOR_VALID ); |  | 
|  3938   assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); |  | 
|  3939   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ |  | 
|  3940     return SQLITE_CORRUPT_BKPT; |  | 
|  3941   } |  | 
|  3942   rc = getAndInitPage(pBt, newPgno, &pNewPage); |  | 
|  3943   if( rc ) return rc; |  | 
|  3944   pCur->apPage[i+1] = pNewPage; |  | 
|  3945   pCur->aiIdx[i+1] = 0; |  | 
|  3946   pCur->iPage++; |  | 
|  3947  |  | 
|  3948   pCur->info.nSize = 0; |  | 
|  3949   pCur->validNKey = 0; |  | 
|  3950   if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){ |  | 
|  3951     return SQLITE_CORRUPT_BKPT; |  | 
|  3952   } |  | 
|  3953   return SQLITE_OK; |  | 
|  3954 } |  | 
|  3955  |  | 
|  3956 #ifndef NDEBUG |  | 
|  3957 /* |  | 
|  3958 ** Page pParent is an internal (non-leaf) tree page. This function  |  | 
|  3959 ** asserts that page number iChild is the left-child if the iIdx'th |  | 
|  3960 ** cell in page pParent. Or, if iIdx is equal to the total number of |  | 
|  3961 ** cells in pParent, that page number iChild is the right-child of |  | 
|  3962 ** the page. |  | 
|  3963 */ |  | 
|  3964 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ |  | 
|  3965   assert( iIdx<=pParent->nCell ); |  | 
|  3966   if( iIdx==pParent->nCell ){ |  | 
|  3967     assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); |  | 
|  3968   }else{ |  | 
|  3969     assert( get4byte(findCell(pParent, iIdx))==iChild ); |  | 
|  3970   } |  | 
|  3971 } |  | 
|  3972 #else |  | 
|  3973 #  define assertParentIndex(x,y,z)  |  | 
|  3974 #endif |  | 
|  3975  |  | 
|  3976 /* |  | 
|  3977 ** Move the cursor up to the parent page. |  | 
|  3978 ** |  | 
|  3979 ** pCur->idx is set to the cell index that contains the pointer |  | 
|  3980 ** to the page we are coming from.  If we are coming from the |  | 
|  3981 ** right-most child page then pCur->idx is set to one more than |  | 
|  3982 ** the largest cell index. |  | 
|  3983 */ |  | 
|  3984 static void moveToParent(BtCursor *pCur){ |  | 
|  3985   assert( cursorHoldsMutex(pCur) ); |  | 
|  3986   assert( pCur->eState==CURSOR_VALID ); |  | 
|  3987   assert( pCur->iPage>0 ); |  | 
|  3988   assert( pCur->apPage[pCur->iPage] ); |  | 
|  3989   assertParentIndex( |  | 
|  3990     pCur->apPage[pCur->iPage-1],  |  | 
|  3991     pCur->aiIdx[pCur->iPage-1],  |  | 
|  3992     pCur->apPage[pCur->iPage]->pgno |  | 
|  3993   ); |  | 
|  3994   releasePage(pCur->apPage[pCur->iPage]); |  | 
|  3995   pCur->iPage--; |  | 
|  3996   pCur->info.nSize = 0; |  | 
|  3997   pCur->validNKey = 0; |  | 
|  3998 } |  | 
|  3999  |  | 
|  4000 /* |  | 
|  4001 ** Move the cursor to point to the root page of its b-tree structure. |  | 
|  4002 ** |  | 
|  4003 ** If the table has a virtual root page, then the cursor is moved to point |  | 
|  4004 ** to the virtual root page instead of the actual root page. A table has a |  | 
|  4005 ** virtual root page when the actual root page contains no cells and a  |  | 
|  4006 ** single child page. This can only happen with the table rooted at page 1. |  | 
|  4007 ** |  | 
|  4008 ** If the b-tree structure is empty, the cursor state is set to  |  | 
|  4009 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first |  | 
|  4010 ** cell located on the root (or virtual root) page and the cursor state |  | 
|  4011 ** is set to CURSOR_VALID. |  | 
|  4012 ** |  | 
|  4013 ** If this function returns successfully, it may be assumed that the |  | 
|  4014 ** page-header flags indicate that the [virtual] root-page is the expected  |  | 
|  4015 ** kind of b-tree page (i.e. if when opening the cursor the caller did not |  | 
|  4016 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D, |  | 
|  4017 ** indicating a table b-tree, or if the caller did specify a KeyInfo  |  | 
|  4018 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index |  | 
|  4019 ** b-tree). |  | 
|  4020 */ |  | 
|  4021 static int moveToRoot(BtCursor *pCur){ |  | 
|  4022   MemPage *pRoot; |  | 
|  4023   int rc = SQLITE_OK; |  | 
|  4024   Btree *p = pCur->pBtree; |  | 
|  4025   BtShared *pBt = p->pBt; |  | 
|  4026  |  | 
|  4027   assert( cursorHoldsMutex(pCur) ); |  | 
|  4028   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); |  | 
|  4029   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK ); |  | 
|  4030   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK ); |  | 
|  4031   if( pCur->eState>=CURSOR_REQUIRESEEK ){ |  | 
|  4032     if( pCur->eState==CURSOR_FAULT ){ |  | 
|  4033       assert( pCur->skipNext!=SQLITE_OK ); |  | 
|  4034       return pCur->skipNext; |  | 
|  4035     } |  | 
|  4036     sqlite3BtreeClearCursor(pCur); |  | 
|  4037   } |  | 
|  4038  |  | 
|  4039   if( pCur->iPage>=0 ){ |  | 
|  4040     int i; |  | 
|  4041     for(i=1; i<=pCur->iPage; i++){ |  | 
|  4042       releasePage(pCur->apPage[i]); |  | 
|  4043     } |  | 
|  4044     pCur->iPage = 0; |  | 
|  4045   }else{ |  | 
|  4046     rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]); |  | 
|  4047     if( rc!=SQLITE_OK ){ |  | 
|  4048       pCur->eState = CURSOR_INVALID; |  | 
|  4049       return rc; |  | 
|  4050     } |  | 
|  4051     pCur->iPage = 0; |  | 
|  4052  |  | 
|  4053     /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor |  | 
|  4054     ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is |  | 
|  4055     ** NULL, the caller expects a table b-tree. If this is not the case, |  | 
|  4056     ** return an SQLITE_CORRUPT error.  */ |  | 
|  4057     assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 ); |  | 
|  4058     if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){ |  | 
|  4059       return SQLITE_CORRUPT_BKPT; |  | 
|  4060     } |  | 
|  4061   } |  | 
|  4062  |  | 
|  4063   /* Assert that the root page is of the correct type. This must be the |  | 
|  4064   ** case as the call to this function that loaded the root-page (either |  | 
|  4065   ** this call or a previous invocation) would have detected corruption  |  | 
|  4066   ** if the assumption were not true, and it is not possible for the flags  |  | 
|  4067   ** byte to have been modified while this cursor is holding a reference |  | 
|  4068   ** to the page.  */ |  | 
|  4069   pRoot = pCur->apPage[0]; |  | 
|  4070   assert( pRoot->pgno==pCur->pgnoRoot ); |  | 
|  4071   assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey ); |  | 
|  4072  |  | 
|  4073   pCur->aiIdx[0] = 0; |  | 
|  4074   pCur->info.nSize = 0; |  | 
|  4075   pCur->atLast = 0; |  | 
|  4076   pCur->validNKey = 0; |  | 
|  4077  |  | 
|  4078   if( pRoot->nCell==0 && !pRoot->leaf ){ |  | 
|  4079     Pgno subpage; |  | 
|  4080     if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT; |  | 
|  4081     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); |  | 
|  4082     pCur->eState = CURSOR_VALID; |  | 
|  4083     rc = moveToChild(pCur, subpage); |  | 
|  4084   }else{ |  | 
|  4085     pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID); |  | 
|  4086   } |  | 
|  4087   return rc; |  | 
|  4088 } |  | 
|  4089  |  | 
|  4090 /* |  | 
|  4091 ** Move the cursor down to the left-most leaf entry beneath the |  | 
|  4092 ** entry to which it is currently pointing. |  | 
|  4093 ** |  | 
|  4094 ** The left-most leaf is the one with the smallest key - the first |  | 
|  4095 ** in ascending order. |  | 
|  4096 */ |  | 
|  4097 static int moveToLeftmost(BtCursor *pCur){ |  | 
|  4098   Pgno pgno; |  | 
|  4099   int rc = SQLITE_OK; |  | 
|  4100   MemPage *pPage; |  | 
|  4101  |  | 
|  4102   assert( cursorHoldsMutex(pCur) ); |  | 
|  4103   assert( pCur->eState==CURSOR_VALID ); |  | 
|  4104   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ |  | 
|  4105     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); |  | 
|  4106     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage])); |  | 
|  4107     rc = moveToChild(pCur, pgno); |  | 
|  4108   } |  | 
|  4109   return rc; |  | 
|  4110 } |  | 
|  4111  |  | 
|  4112 /* |  | 
|  4113 ** Move the cursor down to the right-most leaf entry beneath the |  | 
|  4114 ** page to which it is currently pointing.  Notice the difference |  | 
|  4115 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost() |  | 
|  4116 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() |  | 
|  4117 ** finds the right-most entry beneath the *page*. |  | 
|  4118 ** |  | 
|  4119 ** The right-most entry is the one with the largest key - the last |  | 
|  4120 ** key in ascending order. |  | 
|  4121 */ |  | 
|  4122 static int moveToRightmost(BtCursor *pCur){ |  | 
|  4123   Pgno pgno; |  | 
|  4124   int rc = SQLITE_OK; |  | 
|  4125   MemPage *pPage = 0; |  | 
|  4126  |  | 
|  4127   assert( cursorHoldsMutex(pCur) ); |  | 
|  4128   assert( pCur->eState==CURSOR_VALID ); |  | 
|  4129   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ |  | 
|  4130     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); |  | 
|  4131     pCur->aiIdx[pCur->iPage] = pPage->nCell; |  | 
|  4132     rc = moveToChild(pCur, pgno); |  | 
|  4133   } |  | 
|  4134   if( rc==SQLITE_OK ){ |  | 
|  4135     pCur->aiIdx[pCur->iPage] = pPage->nCell-1; |  | 
|  4136     pCur->info.nSize = 0; |  | 
|  4137     pCur->validNKey = 0; |  | 
|  4138   } |  | 
|  4139   return rc; |  | 
|  4140 } |  | 
|  4141  |  | 
|  4142 /* Move the cursor to the first entry in the table.  Return SQLITE_OK |  | 
|  4143 ** on success.  Set *pRes to 0 if the cursor actually points to something |  | 
|  4144 ** or set *pRes to 1 if the table is empty. |  | 
|  4145 */ |  | 
|  4146 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ |  | 
|  4147   int rc; |  | 
|  4148  |  | 
|  4149   assert( cursorHoldsMutex(pCur) ); |  | 
|  4150   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); |  | 
|  4151   rc = moveToRoot(pCur); |  | 
|  4152   if( rc==SQLITE_OK ){ |  | 
|  4153     if( pCur->eState==CURSOR_INVALID ){ |  | 
|  4154       assert( pCur->apPage[pCur->iPage]->nCell==0 ); |  | 
|  4155       *pRes = 1; |  | 
|  4156       rc = SQLITE_OK; |  | 
|  4157     }else{ |  | 
|  4158       assert( pCur->apPage[pCur->iPage]->nCell>0 ); |  | 
|  4159       *pRes = 0; |  | 
|  4160       rc = moveToLeftmost(pCur); |  | 
|  4161     } |  | 
|  4162   } |  | 
|  4163   return rc; |  | 
|  4164 } |  | 
|  4165  |  | 
|  4166 /* Move the cursor to the last entry in the table.  Return SQLITE_OK |  | 
|  4167 ** on success.  Set *pRes to 0 if the cursor actually points to something |  | 
|  4168 ** or set *pRes to 1 if the table is empty. |  | 
|  4169 */ |  | 
|  4170 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ |  | 
|  4171   int rc; |  | 
|  4172   |  | 
|  4173   assert( cursorHoldsMutex(pCur) ); |  | 
|  4174   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); |  | 
|  4175  |  | 
|  4176   /* If the cursor already points to the last entry, this is a no-op. */ |  | 
|  4177   if( CURSOR_VALID==pCur->eState && pCur->atLast ){ |  | 
|  4178 #ifdef SQLITE_DEBUG |  | 
|  4179     /* This block serves to assert() that the cursor really does point  |  | 
|  4180     ** to the last entry in the b-tree. */ |  | 
|  4181     int ii; |  | 
|  4182     for(ii=0; ii<pCur->iPage; ii++){ |  | 
|  4183       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell ); |  | 
|  4184     } |  | 
|  4185     assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 ); |  | 
|  4186     assert( pCur->apPage[pCur->iPage]->leaf ); |  | 
|  4187 #endif |  | 
|  4188     return SQLITE_OK; |  | 
|  4189   } |  | 
|  4190  |  | 
|  4191   rc = moveToRoot(pCur); |  | 
|  4192   if( rc==SQLITE_OK ){ |  | 
|  4193     if( CURSOR_INVALID==pCur->eState ){ |  | 
|  4194       assert( pCur->apPage[pCur->iPage]->nCell==0 ); |  | 
|  4195       *pRes = 1; |  | 
|  4196     }else{ |  | 
|  4197       assert( pCur->eState==CURSOR_VALID ); |  | 
|  4198       *pRes = 0; |  | 
|  4199       rc = moveToRightmost(pCur); |  | 
|  4200       pCur->atLast = rc==SQLITE_OK ?1:0; |  | 
|  4201     } |  | 
|  4202   } |  | 
|  4203   return rc; |  | 
|  4204 } |  | 
|  4205  |  | 
|  4206 /* Move the cursor so that it points to an entry near the key  |  | 
|  4207 ** specified by pIdxKey or intKey.   Return a success code. |  | 
|  4208 ** |  | 
|  4209 ** For INTKEY tables, the intKey parameter is used.  pIdxKey  |  | 
|  4210 ** must be NULL.  For index tables, pIdxKey is used and intKey |  | 
|  4211 ** is ignored. |  | 
|  4212 ** |  | 
|  4213 ** If an exact match is not found, then the cursor is always |  | 
|  4214 ** left pointing at a leaf page which would hold the entry if it |  | 
|  4215 ** were present.  The cursor might point to an entry that comes |  | 
|  4216 ** before or after the key. |  | 
|  4217 ** |  | 
|  4218 ** An integer is written into *pRes which is the result of |  | 
|  4219 ** comparing the key with the entry to which the cursor is  |  | 
|  4220 ** pointing.  The meaning of the integer written into |  | 
|  4221 ** *pRes is as follows: |  | 
|  4222 ** |  | 
|  4223 **     *pRes<0      The cursor is left pointing at an entry that |  | 
|  4224 **                  is smaller than intKey/pIdxKey or if the table is empty |  | 
|  4225 **                  and the cursor is therefore left point to nothing. |  | 
|  4226 ** |  | 
|  4227 **     *pRes==0     The cursor is left pointing at an entry that |  | 
|  4228 **                  exactly matches intKey/pIdxKey. |  | 
|  4229 ** |  | 
|  4230 **     *pRes>0      The cursor is left pointing at an entry that |  | 
|  4231 **                  is larger than intKey/pIdxKey. |  | 
|  4232 ** |  | 
|  4233 */ |  | 
|  4234 int sqlite3BtreeMovetoUnpacked( |  | 
|  4235   BtCursor *pCur,          /* The cursor to be moved */ |  | 
|  4236   UnpackedRecord *pIdxKey, /* Unpacked index key */ |  | 
|  4237   i64 intKey,              /* The table key */ |  | 
|  4238   int biasRight,           /* If true, bias the search to the high end */ |  | 
|  4239   int *pRes                /* Write search results here */ |  | 
|  4240 ){ |  | 
|  4241   int rc; |  | 
|  4242  |  | 
|  4243   assert( cursorHoldsMutex(pCur) ); |  | 
|  4244   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); |  | 
|  4245   assert( pRes ); |  | 
|  4246   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) ); |  | 
|  4247  |  | 
|  4248   /* If the cursor is already positioned at the point we are trying |  | 
|  4249   ** to move to, then just return without doing any work */ |  | 
|  4250   if( pCur->eState==CURSOR_VALID && pCur->validNKey  |  | 
|  4251    && pCur->apPage[0]->intKey  |  | 
|  4252   ){ |  | 
|  4253     if( pCur->info.nKey==intKey ){ |  | 
|  4254       *pRes = 0; |  | 
|  4255       return SQLITE_OK; |  | 
|  4256     } |  | 
|  4257     if( pCur->atLast && pCur->info.nKey<intKey ){ |  | 
|  4258       *pRes = -1; |  | 
|  4259       return SQLITE_OK; |  | 
|  4260     } |  | 
|  4261   } |  | 
|  4262  |  | 
|  4263   rc = moveToRoot(pCur); |  | 
|  4264   if( rc ){ |  | 
|  4265     return rc; |  | 
|  4266   } |  | 
|  4267   assert( pCur->apPage[pCur->iPage] ); |  | 
|  4268   assert( pCur->apPage[pCur->iPage]->isInit ); |  | 
|  4269   assert( pCur->apPage[pCur->iPage]->nCell>0 || pCur->eState==CURSOR_INVALID ); |  | 
|  4270   if( pCur->eState==CURSOR_INVALID ){ |  | 
|  4271     *pRes = -1; |  | 
|  4272     assert( pCur->apPage[pCur->iPage]->nCell==0 ); |  | 
|  4273     return SQLITE_OK; |  | 
|  4274   } |  | 
|  4275   assert( pCur->apPage[0]->intKey || pIdxKey ); |  | 
|  4276   for(;;){ |  | 
|  4277     int lwr, upr; |  | 
|  4278     Pgno chldPg; |  | 
|  4279     MemPage *pPage = pCur->apPage[pCur->iPage]; |  | 
|  4280     int c; |  | 
|  4281  |  | 
|  4282     /* pPage->nCell must be greater than zero. If this is the root-page |  | 
|  4283     ** the cursor would have been INVALID above and this for(;;) loop |  | 
|  4284     ** not run. If this is not the root-page, then the moveToChild() routine |  | 
|  4285     ** would have already detected db corruption. Similarly, pPage must |  | 
|  4286     ** be the right kind (index or table) of b-tree page. Otherwise |  | 
|  4287     ** a moveToChild() or moveToRoot() call would have detected corruption.  */ |  | 
|  4288     assert( pPage->nCell>0 ); |  | 
|  4289     assert( pPage->intKey==(pIdxKey==0) ); |  | 
|  4290     lwr = 0; |  | 
|  4291     upr = pPage->nCell-1; |  | 
|  4292     if( biasRight ){ |  | 
|  4293       pCur->aiIdx[pCur->iPage] = (u16)upr; |  | 
|  4294     }else{ |  | 
|  4295       pCur->aiIdx[pCur->iPage] = (u16)((upr+lwr)/2); |  | 
|  4296     } |  | 
|  4297     for(;;){ |  | 
|  4298       int idx = pCur->aiIdx[pCur->iPage]; /* Index of current cell in pPage */ |  | 
|  4299       u8 *pCell;                          /* Pointer to current cell in pPage */ |  | 
|  4300  |  | 
|  4301       pCur->info.nSize = 0; |  | 
|  4302       pCell = findCell(pPage, idx) + pPage->childPtrSize; |  | 
|  4303       if( pPage->intKey ){ |  | 
|  4304         i64 nCellKey; |  | 
|  4305         if( pPage->hasData ){ |  | 
|  4306           u32 dummy; |  | 
|  4307           pCell += getVarint32(pCell, dummy); |  | 
|  4308         } |  | 
|  4309         getVarint(pCell, (u64*)&nCellKey); |  | 
|  4310         if( nCellKey==intKey ){ |  | 
|  4311           c = 0; |  | 
|  4312         }else if( nCellKey<intKey ){ |  | 
|  4313           c = -1; |  | 
|  4314         }else{ |  | 
|  4315           assert( nCellKey>intKey ); |  | 
|  4316           c = +1; |  | 
|  4317         } |  | 
|  4318         pCur->validNKey = 1; |  | 
|  4319         pCur->info.nKey = nCellKey; |  | 
|  4320       }else{ |  | 
|  4321         /* The maximum supported page-size is 32768 bytes. This means that |  | 
|  4322         ** the maximum number of record bytes stored on an index B-Tree |  | 
|  4323         ** page is at most 8198 bytes, which may be stored as a 2-byte |  | 
|  4324         ** varint. This information is used to attempt to avoid parsing  |  | 
|  4325         ** the entire cell by checking for the cases where the record is  |  | 
|  4326         ** stored entirely within the b-tree page by inspecting the first  |  | 
|  4327         ** 2 bytes of the cell. |  | 
|  4328         */ |  | 
|  4329         int nCell = pCell[0]; |  | 
|  4330         if( !(nCell & 0x80) && nCell<=pPage->maxLocal ){ |  | 
|  4331           /* This branch runs if the record-size field of the cell is a |  | 
|  4332           ** single byte varint and the record fits entirely on the main |  | 
|  4333           ** b-tree page.  */ |  | 
|  4334           c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey); |  | 
|  4335         }else if( !(pCell[1] & 0x80)  |  | 
|  4336           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal |  | 
|  4337         ){ |  | 
|  4338           /* The record-size field is a 2 byte varint and the record  |  | 
|  4339           ** fits entirely on the main b-tree page.  */ |  | 
|  4340           c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey); |  | 
|  4341         }else{ |  | 
|  4342           /* The record flows over onto one or more overflow pages. In |  | 
|  4343           ** this case the whole cell needs to be parsed, a buffer allocated |  | 
|  4344           ** and accessPayload() used to retrieve the record into the |  | 
|  4345           ** buffer before VdbeRecordCompare() can be called. */ |  | 
|  4346           void *pCellKey; |  | 
|  4347           u8 * const pCellBody = pCell - pPage->childPtrSize; |  | 
|  4348           btreeParseCellPtr(pPage, pCellBody, &pCur->info); |  | 
|  4349           nCell = (int)pCur->info.nKey; |  | 
|  4350           pCellKey = sqlite3Malloc( nCell ); |  | 
|  4351           if( pCellKey==0 ){ |  | 
|  4352             rc = SQLITE_NOMEM; |  | 
|  4353             goto moveto_finish; |  | 
|  4354           } |  | 
|  4355           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0); |  | 
|  4356           if( rc ){ |  | 
|  4357             sqlite3_free(pCellKey); |  | 
|  4358             goto moveto_finish; |  | 
|  4359           } |  | 
|  4360           c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey); |  | 
|  4361           sqlite3_free(pCellKey); |  | 
|  4362         } |  | 
|  4363       } |  | 
|  4364       if( c==0 ){ |  | 
|  4365         if( pPage->intKey && !pPage->leaf ){ |  | 
|  4366           lwr = idx; |  | 
|  4367           upr = lwr - 1; |  | 
|  4368           break; |  | 
|  4369         }else{ |  | 
|  4370           *pRes = 0; |  | 
|  4371           rc = SQLITE_OK; |  | 
|  4372           goto moveto_finish; |  | 
|  4373         } |  | 
|  4374       } |  | 
|  4375       if( c<0 ){ |  | 
|  4376         lwr = idx+1; |  | 
|  4377       }else{ |  | 
|  4378         upr = idx-1; |  | 
|  4379       } |  | 
|  4380       if( lwr>upr ){ |  | 
|  4381         break; |  | 
|  4382       } |  | 
|  4383       pCur->aiIdx[pCur->iPage] = (u16)((lwr+upr)/2); |  | 
|  4384     } |  | 
|  4385     assert( lwr==upr+1 ); |  | 
|  4386     assert( pPage->isInit ); |  | 
|  4387     if( pPage->leaf ){ |  | 
|  4388       chldPg = 0; |  | 
|  4389     }else if( lwr>=pPage->nCell ){ |  | 
|  4390       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); |  | 
|  4391     }else{ |  | 
|  4392       chldPg = get4byte(findCell(pPage, lwr)); |  | 
|  4393     } |  | 
|  4394     if( chldPg==0 ){ |  | 
|  4395       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); |  | 
|  4396       *pRes = c; |  | 
|  4397       rc = SQLITE_OK; |  | 
|  4398       goto moveto_finish; |  | 
|  4399     } |  | 
|  4400     pCur->aiIdx[pCur->iPage] = (u16)lwr; |  | 
|  4401     pCur->info.nSize = 0; |  | 
|  4402     pCur->validNKey = 0; |  | 
|  4403     rc = moveToChild(pCur, chldPg); |  | 
|  4404     if( rc ) goto moveto_finish; |  | 
|  4405   } |  | 
|  4406 moveto_finish: |  | 
|  4407   return rc; |  | 
|  4408 } |  | 
|  4409  |  | 
|  4410  |  | 
|  4411 /* |  | 
|  4412 ** Return TRUE if the cursor is not pointing at an entry of the table. |  | 
|  4413 ** |  | 
|  4414 ** TRUE will be returned after a call to sqlite3BtreeNext() moves |  | 
|  4415 ** past the last entry in the table or sqlite3BtreePrev() moves past |  | 
|  4416 ** the first entry.  TRUE is also returned if the table is empty. |  | 
|  4417 */ |  | 
|  4418 int sqlite3BtreeEof(BtCursor *pCur){ |  | 
|  4419   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries |  | 
|  4420   ** have been deleted? This API will need to change to return an error code |  | 
|  4421   ** as well as the boolean result value. |  | 
|  4422   */ |  | 
|  4423   return (CURSOR_VALID!=pCur->eState); |  | 
|  4424 } |  | 
|  4425  |  | 
|  4426 /* |  | 
|  4427 ** Advance the cursor to the next entry in the database.  If |  | 
|  4428 ** successful then set *pRes=0.  If the cursor |  | 
|  4429 ** was already pointing to the last entry in the database before |  | 
|  4430 ** this routine was called, then set *pRes=1. |  | 
|  4431 */ |  | 
|  4432 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){ |  | 
|  4433   int rc; |  | 
|  4434   int idx; |  | 
|  4435   MemPage *pPage; |  | 
|  4436  |  | 
|  4437   assert( cursorHoldsMutex(pCur) ); |  | 
|  4438   rc = restoreCursorPosition(pCur); |  | 
|  4439   if( rc!=SQLITE_OK ){ |  | 
|  4440     return rc; |  | 
|  4441   } |  | 
|  4442   assert( pRes!=0 ); |  | 
|  4443   if( CURSOR_INVALID==pCur->eState ){ |  | 
|  4444     *pRes = 1; |  | 
|  4445     return SQLITE_OK; |  | 
|  4446   } |  | 
|  4447   if( pCur->skipNext>0 ){ |  | 
|  4448     pCur->skipNext = 0; |  | 
|  4449     *pRes = 0; |  | 
|  4450     return SQLITE_OK; |  | 
|  4451   } |  | 
|  4452   pCur->skipNext = 0; |  | 
|  4453  |  | 
|  4454   pPage = pCur->apPage[pCur->iPage]; |  | 
|  4455   idx = ++pCur->aiIdx[pCur->iPage]; |  | 
|  4456   assert( pPage->isInit ); |  | 
|  4457   assert( idx<=pPage->nCell ); |  | 
|  4458  |  | 
|  4459   pCur->info.nSize = 0; |  | 
|  4460   pCur->validNKey = 0; |  | 
|  4461   if( idx>=pPage->nCell ){ |  | 
|  4462     if( !pPage->leaf ){ |  | 
|  4463       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); |  | 
|  4464       if( rc ) return rc; |  | 
|  4465       rc = moveToLeftmost(pCur); |  | 
|  4466       *pRes = 0; |  | 
|  4467       return rc; |  | 
|  4468     } |  | 
|  4469     do{ |  | 
|  4470       if( pCur->iPage==0 ){ |  | 
|  4471         *pRes = 1; |  | 
|  4472         pCur->eState = CURSOR_INVALID; |  | 
|  4473         return SQLITE_OK; |  | 
|  4474       } |  | 
|  4475       moveToParent(pCur); |  | 
|  4476       pPage = pCur->apPage[pCur->iPage]; |  | 
|  4477     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell ); |  | 
|  4478     *pRes = 0; |  | 
|  4479     if( pPage->intKey ){ |  | 
|  4480       rc = sqlite3BtreeNext(pCur, pRes); |  | 
|  4481     }else{ |  | 
|  4482       rc = SQLITE_OK; |  | 
|  4483     } |  | 
|  4484     return rc; |  | 
|  4485   } |  | 
|  4486   *pRes = 0; |  | 
|  4487   if( pPage->leaf ){ |  | 
|  4488     return SQLITE_OK; |  | 
|  4489   } |  | 
|  4490   rc = moveToLeftmost(pCur); |  | 
|  4491   return rc; |  | 
|  4492 } |  | 
|  4493  |  | 
|  4494  |  | 
|  4495 /* |  | 
|  4496 ** Step the cursor to the back to the previous entry in the database.  If |  | 
|  4497 ** successful then set *pRes=0.  If the cursor |  | 
|  4498 ** was already pointing to the first entry in the database before |  | 
|  4499 ** this routine was called, then set *pRes=1. |  | 
|  4500 */ |  | 
|  4501 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){ |  | 
|  4502   int rc; |  | 
|  4503   MemPage *pPage; |  | 
|  4504  |  | 
|  4505   assert( cursorHoldsMutex(pCur) ); |  | 
|  4506   rc = restoreCursorPosition(pCur); |  | 
|  4507   if( rc!=SQLITE_OK ){ |  | 
|  4508     return rc; |  | 
|  4509   } |  | 
|  4510   pCur->atLast = 0; |  | 
|  4511   if( CURSOR_INVALID==pCur->eState ){ |  | 
|  4512     *pRes = 1; |  | 
|  4513     return SQLITE_OK; |  | 
|  4514   } |  | 
|  4515   if( pCur->skipNext<0 ){ |  | 
|  4516     pCur->skipNext = 0; |  | 
|  4517     *pRes = 0; |  | 
|  4518     return SQLITE_OK; |  | 
|  4519   } |  | 
|  4520   pCur->skipNext = 0; |  | 
|  4521  |  | 
|  4522   pPage = pCur->apPage[pCur->iPage]; |  | 
|  4523   assert( pPage->isInit ); |  | 
|  4524   if( !pPage->leaf ){ |  | 
|  4525     int idx = pCur->aiIdx[pCur->iPage]; |  | 
|  4526     rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); |  | 
|  4527     if( rc ){ |  | 
|  4528       return rc; |  | 
|  4529     } |  | 
|  4530     rc = moveToRightmost(pCur); |  | 
|  4531   }else{ |  | 
|  4532     while( pCur->aiIdx[pCur->iPage]==0 ){ |  | 
|  4533       if( pCur->iPage==0 ){ |  | 
|  4534         pCur->eState = CURSOR_INVALID; |  | 
|  4535         *pRes = 1; |  | 
|  4536         return SQLITE_OK; |  | 
|  4537       } |  | 
|  4538       moveToParent(pCur); |  | 
|  4539     } |  | 
|  4540     pCur->info.nSize = 0; |  | 
|  4541     pCur->validNKey = 0; |  | 
|  4542  |  | 
|  4543     pCur->aiIdx[pCur->iPage]--; |  | 
|  4544     pPage = pCur->apPage[pCur->iPage]; |  | 
|  4545     if( pPage->intKey && !pPage->leaf ){ |  | 
|  4546       rc = sqlite3BtreePrevious(pCur, pRes); |  | 
|  4547     }else{ |  | 
|  4548       rc = SQLITE_OK; |  | 
|  4549     } |  | 
|  4550   } |  | 
|  4551   *pRes = 0; |  | 
|  4552   return rc; |  | 
|  4553 } |  | 
|  4554  |  | 
|  4555 /* |  | 
|  4556 ** Allocate a new page from the database file. |  | 
|  4557 ** |  | 
|  4558 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite() |  | 
|  4559 ** has already been called on the new page.)  The new page has also |  | 
|  4560 ** been referenced and the calling routine is responsible for calling |  | 
|  4561 ** sqlite3PagerUnref() on the new page when it is done. |  | 
|  4562 ** |  | 
|  4563 ** SQLITE_OK is returned on success.  Any other return value indicates |  | 
|  4564 ** an error.  *ppPage and *pPgno are undefined in the event of an error. |  | 
|  4565 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned. |  | 
|  4566 ** |  | 
|  4567 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to  |  | 
|  4568 ** locate a page close to the page number "nearby".  This can be used in an |  | 
|  4569 ** attempt to keep related pages close to each other in the database file, |  | 
|  4570 ** which in turn can make database access faster. |  | 
|  4571 ** |  | 
|  4572 ** If the "exact" parameter is not 0, and the page-number nearby exists  |  | 
|  4573 ** anywhere on the free-list, then it is guaranteed to be returned. This |  | 
|  4574 ** is only used by auto-vacuum databases when allocating a new table. |  | 
|  4575 */ |  | 
|  4576 static int allocateBtreePage( |  | 
|  4577   BtShared *pBt,  |  | 
|  4578   MemPage **ppPage,  |  | 
|  4579   Pgno *pPgno,  |  | 
|  4580   Pgno nearby, |  | 
|  4581   u8 exact |  | 
|  4582 ){ |  | 
|  4583   MemPage *pPage1; |  | 
|  4584   int rc; |  | 
|  4585   u32 n;     /* Number of pages on the freelist */ |  | 
|  4586   u32 k;     /* Number of leaves on the trunk of the freelist */ |  | 
|  4587   MemPage *pTrunk = 0; |  | 
|  4588   MemPage *pPrevTrunk = 0; |  | 
|  4589   Pgno mxPage;     /* Total size of the database file */ |  | 
|  4590  |  | 
|  4591   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  4592   pPage1 = pBt->pPage1; |  | 
|  4593   mxPage = pagerPagecount(pBt); |  | 
|  4594   n = get4byte(&pPage1->aData[36]); |  | 
|  4595   testcase( n==mxPage-1 ); |  | 
|  4596   if( n>=mxPage ){ |  | 
|  4597     return SQLITE_CORRUPT_BKPT; |  | 
|  4598   } |  | 
|  4599   if( n>0 ){ |  | 
|  4600     /* There are pages on the freelist.  Reuse one of those pages. */ |  | 
|  4601     Pgno iTrunk; |  | 
|  4602     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ |  | 
|  4603      |  | 
|  4604     /* If the 'exact' parameter was true and a query of the pointer-map |  | 
|  4605     ** shows that the page 'nearby' is somewhere on the free-list, then |  | 
|  4606     ** the entire-list will be searched for that page. |  | 
|  4607     */ |  | 
|  4608 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  4609     if( exact && nearby<=mxPage ){ |  | 
|  4610       u8 eType; |  | 
|  4611       assert( nearby>0 ); |  | 
|  4612       assert( pBt->autoVacuum ); |  | 
|  4613       rc = ptrmapGet(pBt, nearby, &eType, 0); |  | 
|  4614       if( rc ) return rc; |  | 
|  4615       if( eType==PTRMAP_FREEPAGE ){ |  | 
|  4616         searchList = 1; |  | 
|  4617       } |  | 
|  4618       *pPgno = nearby; |  | 
|  4619     } |  | 
|  4620 #endif |  | 
|  4621  |  | 
|  4622     /* Decrement the free-list count by 1. Set iTrunk to the index of the |  | 
|  4623     ** first free-list trunk page. iPrevTrunk is initially 1. |  | 
|  4624     */ |  | 
|  4625     rc = sqlite3PagerWrite(pPage1->pDbPage); |  | 
|  4626     if( rc ) return rc; |  | 
|  4627     put4byte(&pPage1->aData[36], n-1); |  | 
|  4628  |  | 
|  4629     /* The code within this loop is run only once if the 'searchList' variable |  | 
|  4630     ** is not true. Otherwise, it runs once for each trunk-page on the |  | 
|  4631     ** free-list until the page 'nearby' is located. |  | 
|  4632     */ |  | 
|  4633     do { |  | 
|  4634       pPrevTrunk = pTrunk; |  | 
|  4635       if( pPrevTrunk ){ |  | 
|  4636         iTrunk = get4byte(&pPrevTrunk->aData[0]); |  | 
|  4637       }else{ |  | 
|  4638         iTrunk = get4byte(&pPage1->aData[32]); |  | 
|  4639       } |  | 
|  4640       testcase( iTrunk==mxPage ); |  | 
|  4641       if( iTrunk>mxPage ){ |  | 
|  4642         rc = SQLITE_CORRUPT_BKPT; |  | 
|  4643       }else{ |  | 
|  4644         rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); |  | 
|  4645       } |  | 
|  4646       if( rc ){ |  | 
|  4647         pTrunk = 0; |  | 
|  4648         goto end_allocate_page; |  | 
|  4649       } |  | 
|  4650  |  | 
|  4651       k = get4byte(&pTrunk->aData[4]); |  | 
|  4652       if( k==0 && !searchList ){ |  | 
|  4653         /* The trunk has no leaves and the list is not being searched.  |  | 
|  4654         ** So extract the trunk page itself and use it as the newly  |  | 
|  4655         ** allocated page */ |  | 
|  4656         assert( pPrevTrunk==0 ); |  | 
|  4657         rc = sqlite3PagerWrite(pTrunk->pDbPage); |  | 
|  4658         if( rc ){ |  | 
|  4659           goto end_allocate_page; |  | 
|  4660         } |  | 
|  4661         *pPgno = iTrunk; |  | 
|  4662         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); |  | 
|  4663         *ppPage = pTrunk; |  | 
|  4664         pTrunk = 0; |  | 
|  4665         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); |  | 
|  4666       }else if( k>(u32)(pBt->usableSize/4 - 2) ){ |  | 
|  4667         /* Value of k is out of range.  Database corruption */ |  | 
|  4668         rc = SQLITE_CORRUPT_BKPT; |  | 
|  4669         goto end_allocate_page; |  | 
|  4670 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  4671       }else if( searchList && nearby==iTrunk ){ |  | 
|  4672         /* The list is being searched and this trunk page is the page |  | 
|  4673         ** to allocate, regardless of whether it has leaves. |  | 
|  4674         */ |  | 
|  4675         assert( *pPgno==iTrunk ); |  | 
|  4676         *ppPage = pTrunk; |  | 
|  4677         searchList = 0; |  | 
|  4678         rc = sqlite3PagerWrite(pTrunk->pDbPage); |  | 
|  4679         if( rc ){ |  | 
|  4680           goto end_allocate_page; |  | 
|  4681         } |  | 
|  4682         if( k==0 ){ |  | 
|  4683           if( !pPrevTrunk ){ |  | 
|  4684             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); |  | 
|  4685           }else{ |  | 
|  4686             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); |  | 
|  4687           } |  | 
|  4688         }else{ |  | 
|  4689           /* The trunk page is required by the caller but it contains  |  | 
|  4690           ** pointers to free-list leaves. The first leaf becomes a trunk |  | 
|  4691           ** page in this case. |  | 
|  4692           */ |  | 
|  4693           MemPage *pNewTrunk; |  | 
|  4694           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); |  | 
|  4695           if( iNewTrunk>mxPage ){  |  | 
|  4696             rc = SQLITE_CORRUPT_BKPT; |  | 
|  4697             goto end_allocate_page; |  | 
|  4698           } |  | 
|  4699           testcase( iNewTrunk==mxPage ); |  | 
|  4700           rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); |  | 
|  4701           if( rc!=SQLITE_OK ){ |  | 
|  4702             goto end_allocate_page; |  | 
|  4703           } |  | 
|  4704           rc = sqlite3PagerWrite(pNewTrunk->pDbPage); |  | 
|  4705           if( rc!=SQLITE_OK ){ |  | 
|  4706             releasePage(pNewTrunk); |  | 
|  4707             goto end_allocate_page; |  | 
|  4708           } |  | 
|  4709           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); |  | 
|  4710           put4byte(&pNewTrunk->aData[4], k-1); |  | 
|  4711           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); |  | 
|  4712           releasePage(pNewTrunk); |  | 
|  4713           if( !pPrevTrunk ){ |  | 
|  4714             assert( sqlite3PagerIswriteable(pPage1->pDbPage) ); |  | 
|  4715             put4byte(&pPage1->aData[32], iNewTrunk); |  | 
|  4716           }else{ |  | 
|  4717             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); |  | 
|  4718             if( rc ){ |  | 
|  4719               goto end_allocate_page; |  | 
|  4720             } |  | 
|  4721             put4byte(&pPrevTrunk->aData[0], iNewTrunk); |  | 
|  4722           } |  | 
|  4723         } |  | 
|  4724         pTrunk = 0; |  | 
|  4725         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); |  | 
|  4726 #endif |  | 
|  4727       }else if( k>0 ){ |  | 
|  4728         /* Extract a leaf from the trunk */ |  | 
|  4729         u32 closest; |  | 
|  4730         Pgno iPage; |  | 
|  4731         unsigned char *aData = pTrunk->aData; |  | 
|  4732         rc = sqlite3PagerWrite(pTrunk->pDbPage); |  | 
|  4733         if( rc ){ |  | 
|  4734           goto end_allocate_page; |  | 
|  4735         } |  | 
|  4736         if( nearby>0 ){ |  | 
|  4737           u32 i; |  | 
|  4738           int dist; |  | 
|  4739           closest = 0; |  | 
|  4740           dist = get4byte(&aData[8]) - nearby; |  | 
|  4741           if( dist<0 ) dist = -dist; |  | 
|  4742           for(i=1; i<k; i++){ |  | 
|  4743             int d2 = get4byte(&aData[8+i*4]) - nearby; |  | 
|  4744             if( d2<0 ) d2 = -d2; |  | 
|  4745             if( d2<dist ){ |  | 
|  4746               closest = i; |  | 
|  4747               dist = d2; |  | 
|  4748             } |  | 
|  4749           } |  | 
|  4750         }else{ |  | 
|  4751           closest = 0; |  | 
|  4752         } |  | 
|  4753  |  | 
|  4754         iPage = get4byte(&aData[8+closest*4]); |  | 
|  4755         testcase( iPage==mxPage ); |  | 
|  4756         if( iPage>mxPage ){ |  | 
|  4757           rc = SQLITE_CORRUPT_BKPT; |  | 
|  4758           goto end_allocate_page; |  | 
|  4759         } |  | 
|  4760         testcase( iPage==mxPage ); |  | 
|  4761         if( !searchList || iPage==nearby ){ |  | 
|  4762           int noContent; |  | 
|  4763           *pPgno = iPage; |  | 
|  4764           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" |  | 
|  4765                  ": %d more free pages\n", |  | 
|  4766                  *pPgno, closest+1, k, pTrunk->pgno, n-1)); |  | 
|  4767           if( closest<k-1 ){ |  | 
|  4768             memcpy(&aData[8+closest*4], &aData[4+k*4], 4); |  | 
|  4769           } |  | 
|  4770           put4byte(&aData[4], k-1); |  | 
|  4771           assert( sqlite3PagerIswriteable(pTrunk->pDbPage) ); |  | 
|  4772           noContent = !btreeGetHasContent(pBt, *pPgno); |  | 
|  4773           rc = btreeGetPage(pBt, *pPgno, ppPage, noContent); |  | 
|  4774           if( rc==SQLITE_OK ){ |  | 
|  4775             rc = sqlite3PagerWrite((*ppPage)->pDbPage); |  | 
|  4776             if( rc!=SQLITE_OK ){ |  | 
|  4777               releasePage(*ppPage); |  | 
|  4778             } |  | 
|  4779           } |  | 
|  4780           searchList = 0; |  | 
|  4781         } |  | 
|  4782       } |  | 
|  4783       releasePage(pPrevTrunk); |  | 
|  4784       pPrevTrunk = 0; |  | 
|  4785     }while( searchList ); |  | 
|  4786   }else{ |  | 
|  4787     /* There are no pages on the freelist, so create a new page at the |  | 
|  4788     ** end of the file */ |  | 
|  4789     int nPage = pagerPagecount(pBt); |  | 
|  4790     *pPgno = nPage + 1; |  | 
|  4791  |  | 
|  4792     if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ |  | 
|  4793       (*pPgno)++; |  | 
|  4794     } |  | 
|  4795  |  | 
|  4796 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  4797     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){ |  | 
|  4798       /* If *pPgno refers to a pointer-map page, allocate two new pages |  | 
|  4799       ** at the end of the file instead of one. The first allocated page |  | 
|  4800       ** becomes a new pointer-map page, the second is used by the caller. |  | 
|  4801       */ |  | 
|  4802       MemPage *pPg = 0; |  | 
|  4803       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno)); |  | 
|  4804       assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); |  | 
|  4805       rc = btreeGetPage(pBt, *pPgno, &pPg, 0); |  | 
|  4806       if( rc==SQLITE_OK ){ |  | 
|  4807         rc = sqlite3PagerWrite(pPg->pDbPage); |  | 
|  4808         releasePage(pPg); |  | 
|  4809       } |  | 
|  4810       if( rc ) return rc; |  | 
|  4811       (*pPgno)++; |  | 
|  4812       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; } |  | 
|  4813     } |  | 
|  4814 #endif |  | 
|  4815  |  | 
|  4816     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); |  | 
|  4817     rc = btreeGetPage(pBt, *pPgno, ppPage, 0); |  | 
|  4818     if( rc ) return rc; |  | 
|  4819     rc = sqlite3PagerWrite((*ppPage)->pDbPage); |  | 
|  4820     if( rc!=SQLITE_OK ){ |  | 
|  4821       releasePage(*ppPage); |  | 
|  4822     } |  | 
|  4823     TRACE(("ALLOCATE: %d from end of file\n", *pPgno)); |  | 
|  4824   } |  | 
|  4825  |  | 
|  4826   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); |  | 
|  4827  |  | 
|  4828 end_allocate_page: |  | 
|  4829   releasePage(pTrunk); |  | 
|  4830   releasePage(pPrevTrunk); |  | 
|  4831   if( rc==SQLITE_OK ){ |  | 
|  4832     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ |  | 
|  4833       releasePage(*ppPage); |  | 
|  4834       return SQLITE_CORRUPT_BKPT; |  | 
|  4835     } |  | 
|  4836     (*ppPage)->isInit = 0; |  | 
|  4837   }else{ |  | 
|  4838     *ppPage = 0; |  | 
|  4839   } |  | 
|  4840   return rc; |  | 
|  4841 } |  | 
|  4842  |  | 
|  4843 /* |  | 
|  4844 ** This function is used to add page iPage to the database file free-list.  |  | 
|  4845 ** It is assumed that the page is not already a part of the free-list. |  | 
|  4846 ** |  | 
|  4847 ** The value passed as the second argument to this function is optional. |  | 
|  4848 ** If the caller happens to have a pointer to the MemPage object  |  | 
|  4849 ** corresponding to page iPage handy, it may pass it as the second value.  |  | 
|  4850 ** Otherwise, it may pass NULL. |  | 
|  4851 ** |  | 
|  4852 ** If a pointer to a MemPage object is passed as the second argument, |  | 
|  4853 ** its reference count is not altered by this function. |  | 
|  4854 */ |  | 
|  4855 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ |  | 
|  4856   MemPage *pTrunk = 0;                /* Free-list trunk page */ |  | 
|  4857   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */  |  | 
|  4858   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */ |  | 
|  4859   MemPage *pPage;                     /* Page being freed. May be NULL. */ |  | 
|  4860   int rc;                             /* Return Code */ |  | 
|  4861   int nFree;                          /* Initial number of pages on free-list */ |  | 
|  4862  |  | 
|  4863   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  4864   assert( iPage>1 ); |  | 
|  4865   assert( !pMemPage || pMemPage->pgno==iPage ); |  | 
|  4866  |  | 
|  4867   if( pMemPage ){ |  | 
|  4868     pPage = pMemPage; |  | 
|  4869     sqlite3PagerRef(pPage->pDbPage); |  | 
|  4870   }else{ |  | 
|  4871     pPage = btreePageLookup(pBt, iPage); |  | 
|  4872   } |  | 
|  4873  |  | 
|  4874   /* Increment the free page count on pPage1 */ |  | 
|  4875   rc = sqlite3PagerWrite(pPage1->pDbPage); |  | 
|  4876   if( rc ) goto freepage_out; |  | 
|  4877   nFree = get4byte(&pPage1->aData[36]); |  | 
|  4878   put4byte(&pPage1->aData[36], nFree+1); |  | 
|  4879  |  | 
|  4880 #ifdef SQLITE_SECURE_DELETE |  | 
|  4881   /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then |  | 
|  4882   ** always fully overwrite deleted information with zeros. |  | 
|  4883   */ |  | 
|  4884   if( (!pPage && (rc = btreeGetPage(pBt, iPage, &pPage, 0))) |  | 
|  4885    ||            (rc = sqlite3PagerWrite(pPage->pDbPage)) |  | 
|  4886   ){ |  | 
|  4887     goto freepage_out; |  | 
|  4888   } |  | 
|  4889   memset(pPage->aData, 0, pPage->pBt->pageSize); |  | 
|  4890 #endif |  | 
|  4891  |  | 
|  4892   /* If the database supports auto-vacuum, write an entry in the pointer-map |  | 
|  4893   ** to indicate that the page is free. |  | 
|  4894   */ |  | 
|  4895   if( ISAUTOVACUUM ){ |  | 
|  4896     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc); |  | 
|  4897     if( rc ) goto freepage_out; |  | 
|  4898   } |  | 
|  4899  |  | 
|  4900   /* Now manipulate the actual database free-list structure. There are two |  | 
|  4901   ** possibilities. If the free-list is currently empty, or if the first |  | 
|  4902   ** trunk page in the free-list is full, then this page will become a |  | 
|  4903   ** new free-list trunk page. Otherwise, it will become a leaf of the |  | 
|  4904   ** first trunk page in the current free-list. This block tests if it |  | 
|  4905   ** is possible to add the page as a new free-list leaf. |  | 
|  4906   */ |  | 
|  4907   if( nFree!=0 ){ |  | 
|  4908     u32 nLeaf;                /* Initial number of leaf cells on trunk page */ |  | 
|  4909  |  | 
|  4910     iTrunk = get4byte(&pPage1->aData[32]); |  | 
|  4911     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); |  | 
|  4912     if( rc!=SQLITE_OK ){ |  | 
|  4913       goto freepage_out; |  | 
|  4914     } |  | 
|  4915  |  | 
|  4916     nLeaf = get4byte(&pTrunk->aData[4]); |  | 
|  4917     assert( pBt->usableSize>32 ); |  | 
|  4918     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){ |  | 
|  4919       rc = SQLITE_CORRUPT_BKPT; |  | 
|  4920       goto freepage_out; |  | 
|  4921     } |  | 
|  4922     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){ |  | 
|  4923       /* In this case there is room on the trunk page to insert the page |  | 
|  4924       ** being freed as a new leaf. |  | 
|  4925       ** |  | 
|  4926       ** Note that the trunk page is not really full until it contains |  | 
|  4927       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have |  | 
|  4928       ** coded.  But due to a coding error in versions of SQLite prior to |  | 
|  4929       ** 3.6.0, databases with freelist trunk pages holding more than |  | 
|  4930       ** usableSize/4 - 8 entries will be reported as corrupt.  In order |  | 
|  4931       ** to maintain backwards compatibility with older versions of SQLite, |  | 
|  4932       ** we will continue to restrict the number of entries to usableSize/4 - 8 |  | 
|  4933       ** for now.  At some point in the future (once everyone has upgraded |  | 
|  4934       ** to 3.6.0 or later) we should consider fixing the conditional above |  | 
|  4935       ** to read "usableSize/4-2" instead of "usableSize/4-8". |  | 
|  4936       */ |  | 
|  4937       rc = sqlite3PagerWrite(pTrunk->pDbPage); |  | 
|  4938       if( rc==SQLITE_OK ){ |  | 
|  4939         put4byte(&pTrunk->aData[4], nLeaf+1); |  | 
|  4940         put4byte(&pTrunk->aData[8+nLeaf*4], iPage); |  | 
|  4941 #ifndef SQLITE_SECURE_DELETE |  | 
|  4942         if( pPage ){ |  | 
|  4943           sqlite3PagerDontWrite(pPage->pDbPage); |  | 
|  4944         } |  | 
|  4945 #endif |  | 
|  4946         rc = btreeSetHasContent(pBt, iPage); |  | 
|  4947       } |  | 
|  4948       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); |  | 
|  4949       goto freepage_out; |  | 
|  4950     } |  | 
|  4951   } |  | 
|  4952  |  | 
|  4953   /* If control flows to this point, then it was not possible to add the |  | 
|  4954   ** the page being freed as a leaf page of the first trunk in the free-list. |  | 
|  4955   ** Possibly because the free-list is empty, or possibly because the  |  | 
|  4956   ** first trunk in the free-list is full. Either way, the page being freed |  | 
|  4957   ** will become the new first trunk page in the free-list. |  | 
|  4958   */ |  | 
|  4959   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){ |  | 
|  4960     goto freepage_out; |  | 
|  4961   } |  | 
|  4962   rc = sqlite3PagerWrite(pPage->pDbPage); |  | 
|  4963   if( rc!=SQLITE_OK ){ |  | 
|  4964     goto freepage_out; |  | 
|  4965   } |  | 
|  4966   put4byte(pPage->aData, iTrunk); |  | 
|  4967   put4byte(&pPage->aData[4], 0); |  | 
|  4968   put4byte(&pPage1->aData[32], iPage); |  | 
|  4969   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk)); |  | 
|  4970  |  | 
|  4971 freepage_out: |  | 
|  4972   if( pPage ){ |  | 
|  4973     pPage->isInit = 0; |  | 
|  4974   } |  | 
|  4975   releasePage(pPage); |  | 
|  4976   releasePage(pTrunk); |  | 
|  4977   return rc; |  | 
|  4978 } |  | 
|  4979 static void freePage(MemPage *pPage, int *pRC){ |  | 
|  4980   if( (*pRC)==SQLITE_OK ){ |  | 
|  4981     *pRC = freePage2(pPage->pBt, pPage, pPage->pgno); |  | 
|  4982   } |  | 
|  4983 } |  | 
|  4984  |  | 
|  4985 /* |  | 
|  4986 ** Free any overflow pages associated with the given Cell. |  | 
|  4987 */ |  | 
|  4988 static int clearCell(MemPage *pPage, unsigned char *pCell){ |  | 
|  4989   BtShared *pBt = pPage->pBt; |  | 
|  4990   CellInfo info; |  | 
|  4991   Pgno ovflPgno; |  | 
|  4992   int rc; |  | 
|  4993   int nOvfl; |  | 
|  4994   u16 ovflPageSize; |  | 
|  4995  |  | 
|  4996   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  4997   btreeParseCellPtr(pPage, pCell, &info); |  | 
|  4998   if( info.iOverflow==0 ){ |  | 
|  4999     return SQLITE_OK;  /* No overflow pages. Return without doing anything */ |  | 
|  5000   } |  | 
|  5001   ovflPgno = get4byte(&pCell[info.iOverflow]); |  | 
|  5002   assert( pBt->usableSize > 4 ); |  | 
|  5003   ovflPageSize = pBt->usableSize - 4; |  | 
|  5004   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize; |  | 
|  5005   assert( ovflPgno==0 || nOvfl>0 ); |  | 
|  5006   while( nOvfl-- ){ |  | 
|  5007     Pgno iNext = 0; |  | 
|  5008     MemPage *pOvfl = 0; |  | 
|  5009     if( ovflPgno<2 || ovflPgno>pagerPagecount(pBt) ){ |  | 
|  5010       /* 0 is not a legal page number and page 1 cannot be an  |  | 
|  5011       ** overflow page. Therefore if ovflPgno<2 or past the end of the  |  | 
|  5012       ** file the database must be corrupt. */ |  | 
|  5013       return SQLITE_CORRUPT_BKPT; |  | 
|  5014     } |  | 
|  5015     if( nOvfl ){ |  | 
|  5016       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext); |  | 
|  5017       if( rc ) return rc; |  | 
|  5018     } |  | 
|  5019     rc = freePage2(pBt, pOvfl, ovflPgno); |  | 
|  5020     if( pOvfl ){ |  | 
|  5021       sqlite3PagerUnref(pOvfl->pDbPage); |  | 
|  5022     } |  | 
|  5023     if( rc ) return rc; |  | 
|  5024     ovflPgno = iNext; |  | 
|  5025   } |  | 
|  5026   return SQLITE_OK; |  | 
|  5027 } |  | 
|  5028  |  | 
|  5029 /* |  | 
|  5030 ** Create the byte sequence used to represent a cell on page pPage |  | 
|  5031 ** and write that byte sequence into pCell[].  Overflow pages are |  | 
|  5032 ** allocated and filled in as necessary.  The calling procedure |  | 
|  5033 ** is responsible for making sure sufficient space has been allocated |  | 
|  5034 ** for pCell[]. |  | 
|  5035 ** |  | 
|  5036 ** Note that pCell does not necessary need to point to the pPage->aData |  | 
|  5037 ** area.  pCell might point to some temporary storage.  The cell will |  | 
|  5038 ** be constructed in this temporary area then copied into pPage->aData |  | 
|  5039 ** later. |  | 
|  5040 */ |  | 
|  5041 static int fillInCell( |  | 
|  5042   MemPage *pPage,                /* The page that contains the cell */ |  | 
|  5043   unsigned char *pCell,          /* Complete text of the cell */ |  | 
|  5044   const void *pKey, i64 nKey,    /* The key */ |  | 
|  5045   const void *pData,int nData,   /* The data */ |  | 
|  5046   int nZero,                     /* Extra zero bytes to append to pData */ |  | 
|  5047   int *pnSize                    /* Write cell size here */ |  | 
|  5048 ){ |  | 
|  5049   int nPayload; |  | 
|  5050   const u8 *pSrc; |  | 
|  5051   int nSrc, n, rc; |  | 
|  5052   int spaceLeft; |  | 
|  5053   MemPage *pOvfl = 0; |  | 
|  5054   MemPage *pToRelease = 0; |  | 
|  5055   unsigned char *pPrior; |  | 
|  5056   unsigned char *pPayload; |  | 
|  5057   BtShared *pBt = pPage->pBt; |  | 
|  5058   Pgno pgnoOvfl = 0; |  | 
|  5059   int nHeader; |  | 
|  5060   CellInfo info; |  | 
|  5061  |  | 
|  5062   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  5063  |  | 
|  5064   /* pPage is not necessarily writeable since pCell might be auxiliary |  | 
|  5065   ** buffer space that is separate from the pPage buffer area */ |  | 
|  5066   assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize] |  | 
|  5067             || sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  5068  |  | 
|  5069   /* Fill in the header. */ |  | 
|  5070   nHeader = 0; |  | 
|  5071   if( !pPage->leaf ){ |  | 
|  5072     nHeader += 4; |  | 
|  5073   } |  | 
|  5074   if( pPage->hasData ){ |  | 
|  5075     nHeader += putVarint(&pCell[nHeader], nData+nZero); |  | 
|  5076   }else{ |  | 
|  5077     nData = nZero = 0; |  | 
|  5078   } |  | 
|  5079   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey); |  | 
|  5080   btreeParseCellPtr(pPage, pCell, &info); |  | 
|  5081   assert( info.nHeader==nHeader ); |  | 
|  5082   assert( info.nKey==nKey ); |  | 
|  5083   assert( info.nData==(u32)(nData+nZero) ); |  | 
|  5084    |  | 
|  5085   /* Fill in the payload */ |  | 
|  5086   nPayload = nData + nZero; |  | 
|  5087   if( pPage->intKey ){ |  | 
|  5088     pSrc = pData; |  | 
|  5089     nSrc = nData; |  | 
|  5090     nData = 0; |  | 
|  5091   }else{  |  | 
|  5092     if( NEVER(nKey>0x7fffffff || pKey==0) ){ |  | 
|  5093       return SQLITE_CORRUPT_BKPT; |  | 
|  5094     } |  | 
|  5095     nPayload += (int)nKey; |  | 
|  5096     pSrc = pKey; |  | 
|  5097     nSrc = (int)nKey; |  | 
|  5098   } |  | 
|  5099   *pnSize = info.nSize; |  | 
|  5100   spaceLeft = info.nLocal; |  | 
|  5101   pPayload = &pCell[nHeader]; |  | 
|  5102   pPrior = &pCell[info.iOverflow]; |  | 
|  5103  |  | 
|  5104   while( nPayload>0 ){ |  | 
|  5105     if( spaceLeft==0 ){ |  | 
|  5106 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  5107       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ |  | 
|  5108       if( pBt->autoVacuum ){ |  | 
|  5109         do{ |  | 
|  5110           pgnoOvfl++; |  | 
|  5111         } while(  |  | 
|  5112           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)  |  | 
|  5113         ); |  | 
|  5114       } |  | 
|  5115 #endif |  | 
|  5116       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0); |  | 
|  5117 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  5118       /* If the database supports auto-vacuum, and the second or subsequent |  | 
|  5119       ** overflow page is being allocated, add an entry to the pointer-map |  | 
|  5120       ** for that page now.  |  | 
|  5121       ** |  | 
|  5122       ** If this is the first overflow page, then write a partial entry  |  | 
|  5123       ** to the pointer-map. If we write nothing to this pointer-map slot, |  | 
|  5124       ** then the optimistic overflow chain processing in clearCell() |  | 
|  5125       ** may misinterpret the uninitialised values and delete the |  | 
|  5126       ** wrong pages from the database. |  | 
|  5127       */ |  | 
|  5128       if( pBt->autoVacuum && rc==SQLITE_OK ){ |  | 
|  5129         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); |  | 
|  5130         ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc); |  | 
|  5131         if( rc ){ |  | 
|  5132           releasePage(pOvfl); |  | 
|  5133         } |  | 
|  5134       } |  | 
|  5135 #endif |  | 
|  5136       if( rc ){ |  | 
|  5137         releasePage(pToRelease); |  | 
|  5138         return rc; |  | 
|  5139       } |  | 
|  5140  |  | 
|  5141       /* If pToRelease is not zero than pPrior points into the data area |  | 
|  5142       ** of pToRelease.  Make sure pToRelease is still writeable. */ |  | 
|  5143       assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); |  | 
|  5144  |  | 
|  5145       /* If pPrior is part of the data area of pPage, then make sure pPage |  | 
|  5146       ** is still writeable */ |  | 
|  5147       assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize] |  | 
|  5148             || sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  5149  |  | 
|  5150       put4byte(pPrior, pgnoOvfl); |  | 
|  5151       releasePage(pToRelease); |  | 
|  5152       pToRelease = pOvfl; |  | 
|  5153       pPrior = pOvfl->aData; |  | 
|  5154       put4byte(pPrior, 0); |  | 
|  5155       pPayload = &pOvfl->aData[4]; |  | 
|  5156       spaceLeft = pBt->usableSize - 4; |  | 
|  5157     } |  | 
|  5158     n = nPayload; |  | 
|  5159     if( n>spaceLeft ) n = spaceLeft; |  | 
|  5160  |  | 
|  5161     /* If pToRelease is not zero than pPayload points into the data area |  | 
|  5162     ** of pToRelease.  Make sure pToRelease is still writeable. */ |  | 
|  5163     assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); |  | 
|  5164  |  | 
|  5165     /* If pPayload is part of the data area of pPage, then make sure pPage |  | 
|  5166     ** is still writeable */ |  | 
|  5167     assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize] |  | 
|  5168             || sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  5169  |  | 
|  5170     if( nSrc>0 ){ |  | 
|  5171       if( n>nSrc ) n = nSrc; |  | 
|  5172       assert( pSrc ); |  | 
|  5173       memcpy(pPayload, pSrc, n); |  | 
|  5174     }else{ |  | 
|  5175       memset(pPayload, 0, n); |  | 
|  5176     } |  | 
|  5177     nPayload -= n; |  | 
|  5178     pPayload += n; |  | 
|  5179     pSrc += n; |  | 
|  5180     nSrc -= n; |  | 
|  5181     spaceLeft -= n; |  | 
|  5182     if( nSrc==0 ){ |  | 
|  5183       nSrc = nData; |  | 
|  5184       pSrc = pData; |  | 
|  5185     } |  | 
|  5186   } |  | 
|  5187   releasePage(pToRelease); |  | 
|  5188   return SQLITE_OK; |  | 
|  5189 } |  | 
|  5190  |  | 
|  5191 /* |  | 
|  5192 ** Remove the i-th cell from pPage.  This routine effects pPage only. |  | 
|  5193 ** The cell content is not freed or deallocated.  It is assumed that |  | 
|  5194 ** the cell content has been copied someplace else.  This routine just |  | 
|  5195 ** removes the reference to the cell from pPage. |  | 
|  5196 ** |  | 
|  5197 ** "sz" must be the number of bytes in the cell. |  | 
|  5198 */ |  | 
|  5199 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ |  | 
|  5200   int i;          /* Loop counter */ |  | 
|  5201   int pc;         /* Offset to cell content of cell being deleted */ |  | 
|  5202   u8 *data;       /* pPage->aData */ |  | 
|  5203   u8 *ptr;        /* Used to move bytes around within data[] */ |  | 
|  5204   int rc;         /* The return code */ |  | 
|  5205   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */ |  | 
|  5206  |  | 
|  5207   if( *pRC ) return; |  | 
|  5208  |  | 
|  5209   assert( idx>=0 && idx<pPage->nCell ); |  | 
|  5210   assert( sz==cellSize(pPage, idx) ); |  | 
|  5211   assert( sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  5212   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  5213   data = pPage->aData; |  | 
|  5214   ptr = &data[pPage->cellOffset + 2*idx]; |  | 
|  5215   pc = get2byte(ptr); |  | 
|  5216   hdr = pPage->hdrOffset; |  | 
|  5217   testcase( pc==get2byte(&data[hdr+5]) ); |  | 
|  5218   testcase( pc+sz==pPage->pBt->usableSize ); |  | 
|  5219   if( pc < get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){ |  | 
|  5220     *pRC = SQLITE_CORRUPT_BKPT; |  | 
|  5221     return; |  | 
|  5222   } |  | 
|  5223   rc = freeSpace(pPage, pc, sz); |  | 
|  5224   if( rc ){ |  | 
|  5225     *pRC = rc; |  | 
|  5226     return; |  | 
|  5227   } |  | 
|  5228   for(i=idx+1; i<pPage->nCell; i++, ptr+=2){ |  | 
|  5229     ptr[0] = ptr[2]; |  | 
|  5230     ptr[1] = ptr[3]; |  | 
|  5231   } |  | 
|  5232   pPage->nCell--; |  | 
|  5233   put2byte(&data[hdr+3], pPage->nCell); |  | 
|  5234   pPage->nFree += 2; |  | 
|  5235 } |  | 
|  5236  |  | 
|  5237 /* |  | 
|  5238 ** Insert a new cell on pPage at cell index "i".  pCell points to the |  | 
|  5239 ** content of the cell. |  | 
|  5240 ** |  | 
|  5241 ** If the cell content will fit on the page, then put it there.  If it |  | 
|  5242 ** will not fit, then make a copy of the cell content into pTemp if |  | 
|  5243 ** pTemp is not null.  Regardless of pTemp, allocate a new entry |  | 
|  5244 ** in pPage->aOvfl[] and make it point to the cell content (either |  | 
|  5245 ** in pTemp or the original pCell) and also record its index.  |  | 
|  5246 ** Allocating a new entry in pPage->aCell[] implies that  |  | 
|  5247 ** pPage->nOverflow is incremented. |  | 
|  5248 ** |  | 
|  5249 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the |  | 
|  5250 ** cell. The caller will overwrite them after this function returns. If |  | 
|  5251 ** nSkip is non-zero, then pCell may not point to an invalid memory location  |  | 
|  5252 ** (but pCell+nSkip is always valid). |  | 
|  5253 */ |  | 
|  5254 static void insertCell( |  | 
|  5255   MemPage *pPage,   /* Page into which we are copying */ |  | 
|  5256   int i,            /* New cell becomes the i-th cell of the page */ |  | 
|  5257   u8 *pCell,        /* Content of the new cell */ |  | 
|  5258   int sz,           /* Bytes of content in pCell */ |  | 
|  5259   u8 *pTemp,        /* Temp storage space for pCell, if needed */ |  | 
|  5260   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */ |  | 
|  5261   int *pRC          /* Read and write return code from here */ |  | 
|  5262 ){ |  | 
|  5263   int idx;          /* Where to write new cell content in data[] */ |  | 
|  5264   int j;            /* Loop counter */ |  | 
|  5265   int end;          /* First byte past the last cell pointer in data[] */ |  | 
|  5266   int ins;          /* Index in data[] where new cell pointer is inserted */ |  | 
|  5267   int cellOffset;   /* Address of first cell pointer in data[] */ |  | 
|  5268   u8 *data;         /* The content of the whole page */ |  | 
|  5269   u8 *ptr;          /* Used for moving information around in data[] */ |  | 
|  5270  |  | 
|  5271   int nSkip = (iChild ? 4 : 0); |  | 
|  5272  |  | 
|  5273   if( *pRC ) return; |  | 
|  5274  |  | 
|  5275   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); |  | 
|  5276   assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 ); |  | 
|  5277   assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) ); |  | 
|  5278   assert( sz==cellSizePtr(pPage, pCell) ); |  | 
|  5279   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  5280   if( pPage->nOverflow || sz+2>pPage->nFree ){ |  | 
|  5281     if( pTemp ){ |  | 
|  5282       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); |  | 
|  5283       pCell = pTemp; |  | 
|  5284     } |  | 
|  5285     if( iChild ){ |  | 
|  5286       put4byte(pCell, iChild); |  | 
|  5287     } |  | 
|  5288     j = pPage->nOverflow++; |  | 
|  5289     assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) ); |  | 
|  5290     pPage->aOvfl[j].pCell = pCell; |  | 
|  5291     pPage->aOvfl[j].idx = (u16)i; |  | 
|  5292   }else{ |  | 
|  5293     int rc = sqlite3PagerWrite(pPage->pDbPage); |  | 
|  5294     if( rc!=SQLITE_OK ){ |  | 
|  5295       *pRC = rc; |  | 
|  5296       return; |  | 
|  5297     } |  | 
|  5298     assert( sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  5299     data = pPage->aData; |  | 
|  5300     cellOffset = pPage->cellOffset; |  | 
|  5301     end = cellOffset + 2*pPage->nCell; |  | 
|  5302     ins = cellOffset + 2*i; |  | 
|  5303     rc = allocateSpace(pPage, sz, &idx); |  | 
|  5304     if( rc ){ *pRC = rc; return; } |  | 
|  5305     /* The allocateSpace() routine guarantees the following two properties |  | 
|  5306     ** if it returns success */ |  | 
|  5307     assert( idx >= end+2 ); |  | 
|  5308     assert( idx+sz <= pPage->pBt->usableSize ); |  | 
|  5309     pPage->nCell++; |  | 
|  5310     pPage->nFree -= (u16)(2 + sz); |  | 
|  5311     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip); |  | 
|  5312     if( iChild ){ |  | 
|  5313       put4byte(&data[idx], iChild); |  | 
|  5314     } |  | 
|  5315     for(j=end, ptr=&data[j]; j>ins; j-=2, ptr-=2){ |  | 
|  5316       ptr[0] = ptr[-2]; |  | 
|  5317       ptr[1] = ptr[-1]; |  | 
|  5318     } |  | 
|  5319     put2byte(&data[ins], idx); |  | 
|  5320     put2byte(&data[pPage->hdrOffset+3], pPage->nCell); |  | 
|  5321 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  5322     if( pPage->pBt->autoVacuum ){ |  | 
|  5323       /* The cell may contain a pointer to an overflow page. If so, write |  | 
|  5324       ** the entry for the overflow page into the pointer map. |  | 
|  5325       */ |  | 
|  5326       ptrmapPutOvflPtr(pPage, pCell, pRC); |  | 
|  5327     } |  | 
|  5328 #endif |  | 
|  5329   } |  | 
|  5330 } |  | 
|  5331  |  | 
|  5332 /* |  | 
|  5333 ** Add a list of cells to a page.  The page should be initially empty. |  | 
|  5334 ** The cells are guaranteed to fit on the page. |  | 
|  5335 */ |  | 
|  5336 static void assemblePage( |  | 
|  5337   MemPage *pPage,   /* The page to be assemblied */ |  | 
|  5338   int nCell,        /* The number of cells to add to this page */ |  | 
|  5339   u8 **apCell,      /* Pointers to cell bodies */ |  | 
|  5340   u16 *aSize        /* Sizes of the cells */ |  | 
|  5341 ){ |  | 
|  5342   int i;            /* Loop counter */ |  | 
|  5343   u8 *pCellptr;     /* Address of next cell pointer */ |  | 
|  5344   int cellbody;     /* Address of next cell body */ |  | 
|  5345   u8 * const data = pPage->aData;             /* Pointer to data for pPage */ |  | 
|  5346   const int hdr = pPage->hdrOffset;           /* Offset of header on pPage */ |  | 
|  5347   const int nUsable = pPage->pBt->usableSize; /* Usable size of page */ |  | 
|  5348  |  | 
|  5349   assert( pPage->nOverflow==0 ); |  | 
|  5350   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  5351   assert( nCell>=0 && nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 ); |  | 
|  5352   assert( sqlite3PagerIswriteable(pPage->pDbPage) ); |  | 
|  5353  |  | 
|  5354   /* Check that the page has just been zeroed by zeroPage() */ |  | 
|  5355   assert( pPage->nCell==0 ); |  | 
|  5356   assert( get2byte(&data[hdr+5])==nUsable ); |  | 
|  5357  |  | 
|  5358   pCellptr = &data[pPage->cellOffset + nCell*2]; |  | 
|  5359   cellbody = nUsable; |  | 
|  5360   for(i=nCell-1; i>=0; i--){ |  | 
|  5361     pCellptr -= 2; |  | 
|  5362     cellbody -= aSize[i]; |  | 
|  5363     put2byte(pCellptr, cellbody); |  | 
|  5364     memcpy(&data[cellbody], apCell[i], aSize[i]); |  | 
|  5365   } |  | 
|  5366   put2byte(&data[hdr+3], nCell); |  | 
|  5367   put2byte(&data[hdr+5], cellbody); |  | 
|  5368   pPage->nFree -= (nCell*2 + nUsable - cellbody); |  | 
|  5369   pPage->nCell = (u16)nCell; |  | 
|  5370 } |  | 
|  5371  |  | 
|  5372 /* |  | 
|  5373 ** The following parameters determine how many adjacent pages get involved |  | 
|  5374 ** in a balancing operation.  NN is the number of neighbors on either side |  | 
|  5375 ** of the page that participate in the balancing operation.  NB is the |  | 
|  5376 ** total number of pages that participate, including the target page and |  | 
|  5377 ** NN neighbors on either side. |  | 
|  5378 ** |  | 
|  5379 ** The minimum value of NN is 1 (of course).  Increasing NN above 1 |  | 
|  5380 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance |  | 
|  5381 ** in exchange for a larger degradation in INSERT and UPDATE performance. |  | 
|  5382 ** The value of NN defined below (1) appears to give the best results overall. |  | 
|  5383 */ |  | 
|  5384 #define NN 1             /* Number of neighbors on either side of pPage */ |  | 
|  5385 #define NB (NN*2+1)      /* Total pages involved in the balance */ |  | 
|  5386  |  | 
|  5387  |  | 
|  5388 #ifndef SQLITE_OMIT_QUICKBALANCE |  | 
|  5389 /* |  | 
|  5390 ** This version of balance() handles the common special case where |  | 
|  5391 ** a new entry is being inserted on the extreme right-end of the |  | 
|  5392 ** tree, in other words, when the new entry will become the largest |  | 
|  5393 ** entry in the tree. |  | 
|  5394 ** |  | 
|  5395 ** Instead of trying to balance the 3 right-most leaf pages, just add |  | 
|  5396 ** a new page to the right-hand side and put the one new entry in |  | 
|  5397 ** that page.  This leaves the right side of the tree somewhat |  | 
|  5398 ** unbalanced.  But odds are that we will be inserting new entries |  | 
|  5399 ** at the end soon afterwards so the nearly empty page will quickly |  | 
|  5400 ** fill up.  On average. |  | 
|  5401 ** |  | 
|  5402 ** pPage is the leaf page which is the right-most page in the tree. |  | 
|  5403 ** pParent is its parent.  pPage must have a single overflow entry |  | 
|  5404 ** which is also the right-most entry on the page. |  | 
|  5405 ** |  | 
|  5406 ** The pSpace buffer is used to store a temporary copy of the divider |  | 
|  5407 ** cell that will be inserted into pParent. Such a cell consists of a 4 |  | 
|  5408 ** byte page number followed by a variable length integer. In other |  | 
|  5409 ** words, at most 13 bytes. Hence the pSpace buffer must be at |  | 
|  5410 ** least 13 bytes in size. |  | 
|  5411 */ |  | 
|  5412 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ |  | 
|  5413   BtShared *const pBt = pPage->pBt;    /* B-Tree Database */ |  | 
|  5414   MemPage *pNew;                       /* Newly allocated page */ |  | 
|  5415   int rc;                              /* Return Code */ |  | 
|  5416   Pgno pgnoNew;                        /* Page number of pNew */ |  | 
|  5417  |  | 
|  5418   assert( sqlite3_mutex_held(pPage->pBt->mutex) ); |  | 
|  5419   assert( sqlite3PagerIswriteable(pParent->pDbPage) ); |  | 
|  5420   assert( pPage->nOverflow==1 ); |  | 
|  5421  |  | 
|  5422   if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT; |  | 
|  5423  |  | 
|  5424   /* Allocate a new page. This page will become the right-sibling of  |  | 
|  5425   ** pPage. Make the parent page writable, so that the new divider cell |  | 
|  5426   ** may be inserted. If both these operations are successful, proceed. |  | 
|  5427   */ |  | 
|  5428   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); |  | 
|  5429  |  | 
|  5430   if( rc==SQLITE_OK ){ |  | 
|  5431  |  | 
|  5432     u8 *pOut = &pSpace[4]; |  | 
|  5433     u8 *pCell = pPage->aOvfl[0].pCell; |  | 
|  5434     u16 szCell = cellSizePtr(pPage, pCell); |  | 
|  5435     u8 *pStop; |  | 
|  5436  |  | 
|  5437     assert( sqlite3PagerIswriteable(pNew->pDbPage) ); |  | 
|  5438     assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); |  | 
|  5439     zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); |  | 
|  5440     assemblePage(pNew, 1, &pCell, &szCell); |  | 
|  5441  |  | 
|  5442     /* If this is an auto-vacuum database, update the pointer map |  | 
|  5443     ** with entries for the new page, and any pointer from the  |  | 
|  5444     ** cell on the page to an overflow page. If either of these |  | 
|  5445     ** operations fails, the return code is set, but the contents |  | 
|  5446     ** of the parent page are still manipulated by thh code below. |  | 
|  5447     ** That is Ok, at this point the parent page is guaranteed to |  | 
|  5448     ** be marked as dirty. Returning an error code will cause a |  | 
|  5449     ** rollback, undoing any changes made to the parent page. |  | 
|  5450     */ |  | 
|  5451     if( ISAUTOVACUUM ){ |  | 
|  5452       ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc); |  | 
|  5453       if( szCell>pNew->minLocal ){ |  | 
|  5454         ptrmapPutOvflPtr(pNew, pCell, &rc); |  | 
|  5455       } |  | 
|  5456     } |  | 
|  5457    |  | 
|  5458     /* Create a divider cell to insert into pParent. The divider cell |  | 
|  5459     ** consists of a 4-byte page number (the page number of pPage) and |  | 
|  5460     ** a variable length key value (which must be the same value as the |  | 
|  5461     ** largest key on pPage). |  | 
|  5462     ** |  | 
|  5463     ** To find the largest key value on pPage, first find the right-most  |  | 
|  5464     ** cell on pPage. The first two fields of this cell are the  |  | 
|  5465     ** record-length (a variable length integer at most 32-bits in size) |  | 
|  5466     ** and the key value (a variable length integer, may have any value). |  | 
|  5467     ** The first of the while(...) loops below skips over the record-length |  | 
|  5468     ** field. The second while(...) loop copies the key value from the |  | 
|  5469     ** cell on pPage into the pSpace buffer. |  | 
|  5470     */ |  | 
|  5471     pCell = findCell(pPage, pPage->nCell-1); |  | 
|  5472     pStop = &pCell[9]; |  | 
|  5473     while( (*(pCell++)&0x80) && pCell<pStop ); |  | 
|  5474     pStop = &pCell[9]; |  | 
|  5475     while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop ); |  | 
|  5476  |  | 
|  5477     /* Insert the new divider cell into pParent. */ |  | 
|  5478     insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace), |  | 
|  5479                0, pPage->pgno, &rc); |  | 
|  5480  |  | 
|  5481     /* Set the right-child pointer of pParent to point to the new page. */ |  | 
|  5482     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); |  | 
|  5483    |  | 
|  5484     /* Release the reference to the new page. */ |  | 
|  5485     releasePage(pNew); |  | 
|  5486   } |  | 
|  5487  |  | 
|  5488   return rc; |  | 
|  5489 } |  | 
|  5490 #endif /* SQLITE_OMIT_QUICKBALANCE */ |  | 
|  5491  |  | 
|  5492 #if 0 |  | 
|  5493 /* |  | 
|  5494 ** This function does not contribute anything to the operation of SQLite. |  | 
|  5495 ** it is sometimes activated temporarily while debugging code responsible  |  | 
|  5496 ** for setting pointer-map entries. |  | 
|  5497 */ |  | 
|  5498 static int ptrmapCheckPages(MemPage **apPage, int nPage){ |  | 
|  5499   int i, j; |  | 
|  5500   for(i=0; i<nPage; i++){ |  | 
|  5501     Pgno n; |  | 
|  5502     u8 e; |  | 
|  5503     MemPage *pPage = apPage[i]; |  | 
|  5504     BtShared *pBt = pPage->pBt; |  | 
|  5505     assert( pPage->isInit ); |  | 
|  5506  |  | 
|  5507     for(j=0; j<pPage->nCell; j++){ |  | 
|  5508       CellInfo info; |  | 
|  5509       u8 *z; |  | 
|  5510       |  | 
|  5511       z = findCell(pPage, j); |  | 
|  5512       btreeParseCellPtr(pPage, z, &info); |  | 
|  5513       if( info.iOverflow ){ |  | 
|  5514         Pgno ovfl = get4byte(&z[info.iOverflow]); |  | 
|  5515         ptrmapGet(pBt, ovfl, &e, &n); |  | 
|  5516         assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 ); |  | 
|  5517       } |  | 
|  5518       if( !pPage->leaf ){ |  | 
|  5519         Pgno child = get4byte(z); |  | 
|  5520         ptrmapGet(pBt, child, &e, &n); |  | 
|  5521         assert( n==pPage->pgno && e==PTRMAP_BTREE ); |  | 
|  5522       } |  | 
|  5523     } |  | 
|  5524     if( !pPage->leaf ){ |  | 
|  5525       Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]); |  | 
|  5526       ptrmapGet(pBt, child, &e, &n); |  | 
|  5527       assert( n==pPage->pgno && e==PTRMAP_BTREE ); |  | 
|  5528     } |  | 
|  5529   } |  | 
|  5530   return 1; |  | 
|  5531 } |  | 
|  5532 #endif |  | 
|  5533  |  | 
|  5534 /* |  | 
|  5535 ** This function is used to copy the contents of the b-tree node stored  |  | 
|  5536 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then |  | 
|  5537 ** the pointer-map entries for each child page are updated so that the |  | 
|  5538 ** parent page stored in the pointer map is page pTo. If pFrom contained |  | 
|  5539 ** any cells with overflow page pointers, then the corresponding pointer |  | 
|  5540 ** map entries are also updated so that the parent page is page pTo. |  | 
|  5541 ** |  | 
|  5542 ** If pFrom is currently carrying any overflow cells (entries in the |  | 
|  5543 ** MemPage.aOvfl[] array), they are not copied to pTo.  |  | 
|  5544 ** |  | 
|  5545 ** Before returning, page pTo is reinitialized using btreeInitPage(). |  | 
|  5546 ** |  | 
|  5547 ** The performance of this function is not critical. It is only used by  |  | 
|  5548 ** the balance_shallower() and balance_deeper() procedures, neither of |  | 
|  5549 ** which are called often under normal circumstances. |  | 
|  5550 */ |  | 
|  5551 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ |  | 
|  5552   if( (*pRC)==SQLITE_OK ){ |  | 
|  5553     BtShared * const pBt = pFrom->pBt; |  | 
|  5554     u8 * const aFrom = pFrom->aData; |  | 
|  5555     u8 * const aTo = pTo->aData; |  | 
|  5556     int const iFromHdr = pFrom->hdrOffset; |  | 
|  5557     int const iToHdr = ((pTo->pgno==1) ? 100 : 0); |  | 
|  5558     TESTONLY(int rc;) |  | 
|  5559     int iData; |  | 
|  5560    |  | 
|  5561    |  | 
|  5562     assert( pFrom->isInit ); |  | 
|  5563     assert( pFrom->nFree>=iToHdr ); |  | 
|  5564     assert( get2byte(&aFrom[iFromHdr+5])<=pBt->usableSize ); |  | 
|  5565    |  | 
|  5566     /* Copy the b-tree node content from page pFrom to page pTo. */ |  | 
|  5567     iData = get2byte(&aFrom[iFromHdr+5]); |  | 
|  5568     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); |  | 
|  5569     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); |  | 
|  5570    |  | 
|  5571     /* Reinitialize page pTo so that the contents of the MemPage structure |  | 
|  5572     ** match the new data. The initialization of pTo "cannot" fail, as the |  | 
|  5573     ** data copied from pFrom is known to be valid.  */ |  | 
|  5574     pTo->isInit = 0; |  | 
|  5575     TESTONLY(rc = ) btreeInitPage(pTo); |  | 
|  5576     assert( rc==SQLITE_OK ); |  | 
|  5577    |  | 
|  5578     /* If this is an auto-vacuum database, update the pointer-map entries |  | 
|  5579     ** for any b-tree or overflow pages that pTo now contains the pointers to. |  | 
|  5580     */ |  | 
|  5581     if( ISAUTOVACUUM ){ |  | 
|  5582       *pRC = setChildPtrmaps(pTo); |  | 
|  5583     } |  | 
|  5584   } |  | 
|  5585 } |  | 
|  5586  |  | 
|  5587 /* |  | 
|  5588 ** This routine redistributes cells on the iParentIdx'th child of pParent |  | 
|  5589 ** (hereafter "the page") and up to 2 siblings so that all pages have about the |  | 
|  5590 ** same amount of free space. Usually a single sibling on either side of the |  | 
|  5591 ** page is used in the balancing, though both siblings might come from one |  | 
|  5592 ** side if the page is the first or last child of its parent. If the page  |  | 
|  5593 ** has fewer than 2 siblings (something which can only happen if the page |  | 
|  5594 ** is a root page or a child of a root page) then all available siblings |  | 
|  5595 ** participate in the balancing. |  | 
|  5596 ** |  | 
|  5597 ** The number of siblings of the page might be increased or decreased by  |  | 
|  5598 ** one or two in an effort to keep pages nearly full but not over full.  |  | 
|  5599 ** |  | 
|  5600 ** Note that when this routine is called, some of the cells on the page |  | 
|  5601 ** might not actually be stored in MemPage.aData[]. This can happen |  | 
|  5602 ** if the page is overfull. This routine ensures that all cells allocated |  | 
|  5603 ** to the page and its siblings fit into MemPage.aData[] before returning. |  | 
|  5604 ** |  | 
|  5605 ** In the course of balancing the page and its siblings, cells may be |  | 
|  5606 ** inserted into or removed from the parent page (pParent). Doing so |  | 
|  5607 ** may cause the parent page to become overfull or underfull. If this |  | 
|  5608 ** happens, it is the responsibility of the caller to invoke the correct |  | 
|  5609 ** balancing routine to fix this problem (see the balance() routine).  |  | 
|  5610 ** |  | 
|  5611 ** If this routine fails for any reason, it might leave the database |  | 
|  5612 ** in a corrupted state. So if this routine fails, the database should |  | 
|  5613 ** be rolled back. |  | 
|  5614 ** |  | 
|  5615 ** The third argument to this function, aOvflSpace, is a pointer to a |  | 
|  5616 ** buffer big enough to hold one page. If while inserting cells into the parent |  | 
|  5617 ** page (pParent) the parent page becomes overfull, this buffer is |  | 
|  5618 ** used to store the parent's overflow cells. Because this function inserts |  | 
|  5619 ** a maximum of four divider cells into the parent page, and the maximum |  | 
|  5620 ** size of a cell stored within an internal node is always less than 1/4 |  | 
|  5621 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large |  | 
|  5622 ** enough for all overflow cells. |  | 
|  5623 ** |  | 
|  5624 ** If aOvflSpace is set to a null pointer, this function returns  |  | 
|  5625 ** SQLITE_NOMEM. |  | 
|  5626 */ |  | 
|  5627 static int balance_nonroot( |  | 
|  5628   MemPage *pParent,               /* Parent page of siblings being balanced */ |  | 
|  5629   int iParentIdx,                 /* Index of "the page" in pParent */ |  | 
|  5630   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */ |  | 
|  5631   int isRoot                      /* True if pParent is a root-page */ |  | 
|  5632 ){ |  | 
|  5633   BtShared *pBt;               /* The whole database */ |  | 
|  5634   int nCell = 0;               /* Number of cells in apCell[] */ |  | 
|  5635   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */ |  | 
|  5636   int nNew = 0;                /* Number of pages in apNew[] */ |  | 
|  5637   int nOld;                    /* Number of pages in apOld[] */ |  | 
|  5638   int i, j, k;                 /* Loop counters */ |  | 
|  5639   int nxDiv;                   /* Next divider slot in pParent->aCell[] */ |  | 
|  5640   int rc = SQLITE_OK;          /* The return code */ |  | 
|  5641   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */ |  | 
|  5642   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */ |  | 
|  5643   int usableSpace;             /* Bytes in pPage beyond the header */ |  | 
|  5644   int pageFlags;               /* Value of pPage->aData[0] */ |  | 
|  5645   int subtotal;                /* Subtotal of bytes in cells on one page */ |  | 
|  5646   int iSpace1 = 0;             /* First unused byte of aSpace1[] */ |  | 
|  5647   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */ |  | 
|  5648   int szScratch;               /* Size of scratch memory requested */ |  | 
|  5649   MemPage *apOld[NB];          /* pPage and up to two siblings */ |  | 
|  5650   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */ |  | 
|  5651   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */ |  | 
|  5652   u8 *pRight;                  /* Location in parent of right-sibling pointer */ |  | 
|  5653   u8 *apDiv[NB-1];             /* Divider cells in pParent */ |  | 
|  5654   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */ |  | 
|  5655   int szNew[NB+2];             /* Combined size of cells place on i-th page */ |  | 
|  5656   u8 **apCell = 0;             /* All cells begin balanced */ |  | 
|  5657   u16 *szCell;                 /* Local size of all cells in apCell[] */ |  | 
|  5658   u8 *aSpace1;                 /* Space for copies of dividers cells */ |  | 
|  5659   Pgno pgno;                   /* Temp var to store a page number in */ |  | 
|  5660  |  | 
|  5661   pBt = pParent->pBt; |  | 
|  5662   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  5663   assert( sqlite3PagerIswriteable(pParent->pDbPage) ); |  | 
|  5664  |  | 
|  5665 #if 0 |  | 
|  5666   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); |  | 
|  5667 #endif |  | 
|  5668  |  | 
|  5669   /* At this point pParent may have at most one overflow cell. And if |  | 
|  5670   ** this overflow cell is present, it must be the cell with  |  | 
|  5671   ** index iParentIdx. This scenario comes about when this function |  | 
|  5672   ** is called (indirectly) from sqlite3BtreeDelete(). |  | 
|  5673   */ |  | 
|  5674   assert( pParent->nOverflow==0 || pParent->nOverflow==1 ); |  | 
|  5675   assert( pParent->nOverflow==0 || pParent->aOvfl[0].idx==iParentIdx ); |  | 
|  5676  |  | 
|  5677   if( !aOvflSpace ){ |  | 
|  5678     return SQLITE_NOMEM; |  | 
|  5679   } |  | 
|  5680  |  | 
|  5681   /* Find the sibling pages to balance. Also locate the cells in pParent  |  | 
|  5682   ** that divide the siblings. An attempt is made to find NN siblings on  |  | 
|  5683   ** either side of pPage. More siblings are taken from one side, however,  |  | 
|  5684   ** if there are fewer than NN siblings on the other side. If pParent |  | 
|  5685   ** has NB or fewer children then all children of pParent are taken.   |  | 
|  5686   ** |  | 
|  5687   ** This loop also drops the divider cells from the parent page. This |  | 
|  5688   ** way, the remainder of the function does not have to deal with any |  | 
|  5689   ** overflow cells in the parent page, since if any existed they will |  | 
|  5690   ** have already been removed. |  | 
|  5691   */ |  | 
|  5692   i = pParent->nOverflow + pParent->nCell; |  | 
|  5693   if( i<2 ){ |  | 
|  5694     nxDiv = 0; |  | 
|  5695     nOld = i+1; |  | 
|  5696   }else{ |  | 
|  5697     nOld = 3; |  | 
|  5698     if( iParentIdx==0 ){                  |  | 
|  5699       nxDiv = 0; |  | 
|  5700     }else if( iParentIdx==i ){ |  | 
|  5701       nxDiv = i-2; |  | 
|  5702     }else{ |  | 
|  5703       nxDiv = iParentIdx-1; |  | 
|  5704     } |  | 
|  5705     i = 2; |  | 
|  5706   } |  | 
|  5707   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){ |  | 
|  5708     pRight = &pParent->aData[pParent->hdrOffset+8]; |  | 
|  5709   }else{ |  | 
|  5710     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow); |  | 
|  5711   } |  | 
|  5712   pgno = get4byte(pRight); |  | 
|  5713   while( 1 ){ |  | 
|  5714     rc = getAndInitPage(pBt, pgno, &apOld[i]); |  | 
|  5715     if( rc ){ |  | 
|  5716       memset(apOld, 0, (i+1)*sizeof(MemPage*)); |  | 
|  5717       goto balance_cleanup; |  | 
|  5718     } |  | 
|  5719     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow; |  | 
|  5720     if( (i--)==0 ) break; |  | 
|  5721  |  | 
|  5722     if( i+nxDiv==pParent->aOvfl[0].idx && pParent->nOverflow ){ |  | 
|  5723       apDiv[i] = pParent->aOvfl[0].pCell; |  | 
|  5724       pgno = get4byte(apDiv[i]); |  | 
|  5725       szNew[i] = cellSizePtr(pParent, apDiv[i]); |  | 
|  5726       pParent->nOverflow = 0; |  | 
|  5727     }else{ |  | 
|  5728       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); |  | 
|  5729       pgno = get4byte(apDiv[i]); |  | 
|  5730       szNew[i] = cellSizePtr(pParent, apDiv[i]); |  | 
|  5731  |  | 
|  5732       /* Drop the cell from the parent page. apDiv[i] still points to |  | 
|  5733       ** the cell within the parent, even though it has been dropped. |  | 
|  5734       ** This is safe because dropping a cell only overwrites the first |  | 
|  5735       ** four bytes of it, and this function does not need the first |  | 
|  5736       ** four bytes of the divider cell. So the pointer is safe to use |  | 
|  5737       ** later on.   |  | 
|  5738       ** |  | 
|  5739       ** Unless SQLite is compiled in secure-delete mode. In this case, |  | 
|  5740       ** the dropCell() routine will overwrite the entire cell with zeroes. |  | 
|  5741       ** In this case, temporarily copy the cell into the aOvflSpace[] |  | 
|  5742       ** buffer. It will be copied out again as soon as the aSpace[] buffer |  | 
|  5743       ** is allocated.  */ |  | 
|  5744 #ifdef SQLITE_SECURE_DELETE |  | 
|  5745       memcpy(&aOvflSpace[apDiv[i]-pParent->aData], apDiv[i], szNew[i]); |  | 
|  5746       apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData]; |  | 
|  5747 #endif |  | 
|  5748       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc); |  | 
|  5749     } |  | 
|  5750   } |  | 
|  5751  |  | 
|  5752   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte |  | 
|  5753   ** alignment */ |  | 
|  5754   nMaxCells = (nMaxCells + 3)&~3; |  | 
|  5755  |  | 
|  5756   /* |  | 
|  5757   ** Allocate space for memory structures |  | 
|  5758   */ |  | 
|  5759   k = pBt->pageSize + ROUND8(sizeof(MemPage)); |  | 
|  5760   szScratch = |  | 
|  5761        nMaxCells*sizeof(u8*)                       /* apCell */ |  | 
|  5762      + nMaxCells*sizeof(u16)                       /* szCell */ |  | 
|  5763      + pBt->pageSize                               /* aSpace1 */ |  | 
|  5764      + k*nOld;                                     /* Page copies (apCopy) */ |  | 
|  5765   apCell = sqlite3ScratchMalloc( szScratch );  |  | 
|  5766   if( apCell==0 ){ |  | 
|  5767     rc = SQLITE_NOMEM; |  | 
|  5768     goto balance_cleanup; |  | 
|  5769   } |  | 
|  5770   szCell = (u16*)&apCell[nMaxCells]; |  | 
|  5771   aSpace1 = (u8*)&szCell[nMaxCells]; |  | 
|  5772   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); |  | 
|  5773  |  | 
|  5774   /* |  | 
|  5775   ** Load pointers to all cells on sibling pages and the divider cells |  | 
|  5776   ** into the local apCell[] array.  Make copies of the divider cells |  | 
|  5777   ** into space obtained from aSpace1[] and remove the divider cells |  | 
|  5778   ** from pParent. |  | 
|  5779   ** |  | 
|  5780   ** If the siblings are on leaf pages, then the child pointers of the |  | 
|  5781   ** divider cells are stripped from the cells before they are copied |  | 
|  5782   ** into aSpace1[].  In this way, all cells in apCell[] are without |  | 
|  5783   ** child pointers.  If siblings are not leaves, then all cell in |  | 
|  5784   ** apCell[] include child pointers.  Either way, all cells in apCell[] |  | 
|  5785   ** are alike. |  | 
|  5786   ** |  | 
|  5787   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf. |  | 
|  5788   **       leafData:  1 if pPage holds key+data and pParent holds only keys. |  | 
|  5789   */ |  | 
|  5790   leafCorrection = apOld[0]->leaf*4; |  | 
|  5791   leafData = apOld[0]->hasData; |  | 
|  5792   for(i=0; i<nOld; i++){ |  | 
|  5793     int limit; |  | 
|  5794      |  | 
|  5795     /* Before doing anything else, take a copy of the i'th original sibling |  | 
|  5796     ** The rest of this function will use data from the copies rather |  | 
|  5797     ** than the original pages since the original pages will be in the |  | 
|  5798     ** process of being overwritten.  */ |  | 
|  5799     MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i]; |  | 
|  5800     memcpy(pOld, apOld[i], sizeof(MemPage)); |  | 
|  5801     pOld->aData = (void*)&pOld[1]; |  | 
|  5802     memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize); |  | 
|  5803  |  | 
|  5804     limit = pOld->nCell+pOld->nOverflow; |  | 
|  5805     for(j=0; j<limit; j++){ |  | 
|  5806       assert( nCell<nMaxCells ); |  | 
|  5807       apCell[nCell] = findOverflowCell(pOld, j); |  | 
|  5808       szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); |  | 
|  5809       nCell++; |  | 
|  5810     } |  | 
|  5811     if( i<nOld-1 && !leafData){ |  | 
|  5812       u16 sz = (u16)szNew[i]; |  | 
|  5813       u8 *pTemp; |  | 
|  5814       assert( nCell<nMaxCells ); |  | 
|  5815       szCell[nCell] = sz; |  | 
|  5816       pTemp = &aSpace1[iSpace1]; |  | 
|  5817       iSpace1 += sz; |  | 
|  5818       assert( sz<=pBt->pageSize/4 ); |  | 
|  5819       assert( iSpace1<=pBt->pageSize ); |  | 
|  5820       memcpy(pTemp, apDiv[i], sz); |  | 
|  5821       apCell[nCell] = pTemp+leafCorrection; |  | 
|  5822       assert( leafCorrection==0 || leafCorrection==4 ); |  | 
|  5823       szCell[nCell] = szCell[nCell] - leafCorrection; |  | 
|  5824       if( !pOld->leaf ){ |  | 
|  5825         assert( leafCorrection==0 ); |  | 
|  5826         assert( pOld->hdrOffset==0 ); |  | 
|  5827         /* The right pointer of the child page pOld becomes the left |  | 
|  5828         ** pointer of the divider cell */ |  | 
|  5829         memcpy(apCell[nCell], &pOld->aData[8], 4); |  | 
|  5830       }else{ |  | 
|  5831         assert( leafCorrection==4 ); |  | 
|  5832         if( szCell[nCell]<4 ){ |  | 
|  5833           /* Do not allow any cells smaller than 4 bytes. */ |  | 
|  5834           szCell[nCell] = 4; |  | 
|  5835         } |  | 
|  5836       } |  | 
|  5837       nCell++; |  | 
|  5838     } |  | 
|  5839   } |  | 
|  5840  |  | 
|  5841   /* |  | 
|  5842   ** Figure out the number of pages needed to hold all nCell cells. |  | 
|  5843   ** Store this number in "k".  Also compute szNew[] which is the total |  | 
|  5844   ** size of all cells on the i-th page and cntNew[] which is the index |  | 
|  5845   ** in apCell[] of the cell that divides page i from page i+1.   |  | 
|  5846   ** cntNew[k] should equal nCell. |  | 
|  5847   ** |  | 
|  5848   ** Values computed by this block: |  | 
|  5849   ** |  | 
|  5850   **           k: The total number of sibling pages |  | 
|  5851   **    szNew[i]: Space used on the i-th sibling page. |  | 
|  5852   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to |  | 
|  5853   **              the right of the i-th sibling page. |  | 
|  5854   ** usableSpace: Number of bytes of space available on each sibling. |  | 
|  5855   **  |  | 
|  5856   */ |  | 
|  5857   usableSpace = pBt->usableSize - 12 + leafCorrection; |  | 
|  5858   for(subtotal=k=i=0; i<nCell; i++){ |  | 
|  5859     assert( i<nMaxCells ); |  | 
|  5860     subtotal += szCell[i] + 2; |  | 
|  5861     if( subtotal > usableSpace ){ |  | 
|  5862       szNew[k] = subtotal - szCell[i]; |  | 
|  5863       cntNew[k] = i; |  | 
|  5864       if( leafData ){ i--; } |  | 
|  5865       subtotal = 0; |  | 
|  5866       k++; |  | 
|  5867       if( k>NB+1 ){ rc = SQLITE_CORRUPT; goto balance_cleanup; } |  | 
|  5868     } |  | 
|  5869   } |  | 
|  5870   szNew[k] = subtotal; |  | 
|  5871   cntNew[k] = nCell; |  | 
|  5872   k++; |  | 
|  5873  |  | 
|  5874   /* |  | 
|  5875   ** The packing computed by the previous block is biased toward the siblings |  | 
|  5876   ** on the left side.  The left siblings are always nearly full, while the |  | 
|  5877   ** right-most sibling might be nearly empty.  This block of code attempts |  | 
|  5878   ** to adjust the packing of siblings to get a better balance. |  | 
|  5879   ** |  | 
|  5880   ** This adjustment is more than an optimization.  The packing above might |  | 
|  5881   ** be so out of balance as to be illegal.  For example, the right-most |  | 
|  5882   ** sibling might be completely empty.  This adjustment is not optional. |  | 
|  5883   */ |  | 
|  5884   for(i=k-1; i>0; i--){ |  | 
|  5885     int szRight = szNew[i];  /* Size of sibling on the right */ |  | 
|  5886     int szLeft = szNew[i-1]; /* Size of sibling on the left */ |  | 
|  5887     int r;              /* Index of right-most cell in left sibling */ |  | 
|  5888     int d;              /* Index of first cell to the left of right sibling */ |  | 
|  5889  |  | 
|  5890     r = cntNew[i-1] - 1; |  | 
|  5891     d = r + 1 - leafData; |  | 
|  5892     assert( d<nMaxCells ); |  | 
|  5893     assert( r<nMaxCells ); |  | 
|  5894     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){ |  | 
|  5895       szRight += szCell[d] + 2; |  | 
|  5896       szLeft -= szCell[r] + 2; |  | 
|  5897       cntNew[i-1]--; |  | 
|  5898       r = cntNew[i-1] - 1; |  | 
|  5899       d = r + 1 - leafData; |  | 
|  5900     } |  | 
|  5901     szNew[i] = szRight; |  | 
|  5902     szNew[i-1] = szLeft; |  | 
|  5903   } |  | 
|  5904  |  | 
|  5905   /* Either we found one or more cells (cntnew[0])>0) or pPage is |  | 
|  5906   ** a virtual root page.  A virtual root page is when the real root |  | 
|  5907   ** page is page 1 and we are the only child of that page. |  | 
|  5908   */ |  | 
|  5909   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); |  | 
|  5910  |  | 
|  5911   TRACE(("BALANCE: old: %d %d %d  ", |  | 
|  5912     apOld[0]->pgno,  |  | 
|  5913     nOld>=2 ? apOld[1]->pgno : 0, |  | 
|  5914     nOld>=3 ? apOld[2]->pgno : 0 |  | 
|  5915   )); |  | 
|  5916  |  | 
|  5917   /* |  | 
|  5918   ** Allocate k new pages.  Reuse old pages where possible. |  | 
|  5919   */ |  | 
|  5920   if( apOld[0]->pgno<=1 ){ |  | 
|  5921     rc = SQLITE_CORRUPT; |  | 
|  5922     goto balance_cleanup; |  | 
|  5923   } |  | 
|  5924   pageFlags = apOld[0]->aData[0]; |  | 
|  5925   for(i=0; i<k; i++){ |  | 
|  5926     MemPage *pNew; |  | 
|  5927     if( i<nOld ){ |  | 
|  5928       pNew = apNew[i] = apOld[i]; |  | 
|  5929       apOld[i] = 0; |  | 
|  5930       rc = sqlite3PagerWrite(pNew->pDbPage); |  | 
|  5931       nNew++; |  | 
|  5932       if( rc ) goto balance_cleanup; |  | 
|  5933     }else{ |  | 
|  5934       assert( i>0 ); |  | 
|  5935       rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0); |  | 
|  5936       if( rc ) goto balance_cleanup; |  | 
|  5937       apNew[i] = pNew; |  | 
|  5938       nNew++; |  | 
|  5939  |  | 
|  5940       /* Set the pointer-map entry for the new sibling page. */ |  | 
|  5941       if( ISAUTOVACUUM ){ |  | 
|  5942         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); |  | 
|  5943         if( rc!=SQLITE_OK ){ |  | 
|  5944           goto balance_cleanup; |  | 
|  5945         } |  | 
|  5946       } |  | 
|  5947     } |  | 
|  5948   } |  | 
|  5949  |  | 
|  5950   /* Free any old pages that were not reused as new pages. |  | 
|  5951   */ |  | 
|  5952   while( i<nOld ){ |  | 
|  5953     freePage(apOld[i], &rc); |  | 
|  5954     if( rc ) goto balance_cleanup; |  | 
|  5955     releasePage(apOld[i]); |  | 
|  5956     apOld[i] = 0; |  | 
|  5957     i++; |  | 
|  5958   } |  | 
|  5959  |  | 
|  5960   /* |  | 
|  5961   ** Put the new pages in ascending order.  This helps to |  | 
|  5962   ** keep entries in the disk file in order so that a scan |  | 
|  5963   ** of the table is a linear scan through the file.  That |  | 
|  5964   ** in turn helps the operating system to deliver pages |  | 
|  5965   ** from the disk more rapidly. |  | 
|  5966   ** |  | 
|  5967   ** An O(n^2) insertion sort algorithm is used, but since |  | 
|  5968   ** n is never more than NB (a small constant), that should |  | 
|  5969   ** not be a problem. |  | 
|  5970   ** |  | 
|  5971   ** When NB==3, this one optimization makes the database |  | 
|  5972   ** about 25% faster for large insertions and deletions. |  | 
|  5973   */ |  | 
|  5974   for(i=0; i<k-1; i++){ |  | 
|  5975     int minV = apNew[i]->pgno; |  | 
|  5976     int minI = i; |  | 
|  5977     for(j=i+1; j<k; j++){ |  | 
|  5978       if( apNew[j]->pgno<(unsigned)minV ){ |  | 
|  5979         minI = j; |  | 
|  5980         minV = apNew[j]->pgno; |  | 
|  5981       } |  | 
|  5982     } |  | 
|  5983     if( minI>i ){ |  | 
|  5984       int t; |  | 
|  5985       MemPage *pT; |  | 
|  5986       t = apNew[i]->pgno; |  | 
|  5987       pT = apNew[i]; |  | 
|  5988       apNew[i] = apNew[minI]; |  | 
|  5989       apNew[minI] = pT; |  | 
|  5990     } |  | 
|  5991   } |  | 
|  5992   TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", |  | 
|  5993     apNew[0]->pgno, szNew[0], |  | 
|  5994     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, |  | 
|  5995     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, |  | 
|  5996     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, |  | 
|  5997     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0)); |  | 
|  5998  |  | 
|  5999   assert( sqlite3PagerIswriteable(pParent->pDbPage) ); |  | 
|  6000   put4byte(pRight, apNew[nNew-1]->pgno); |  | 
|  6001  |  | 
|  6002   /* |  | 
|  6003   ** Evenly distribute the data in apCell[] across the new pages. |  | 
|  6004   ** Insert divider cells into pParent as necessary. |  | 
|  6005   */ |  | 
|  6006   j = 0; |  | 
|  6007   for(i=0; i<nNew; i++){ |  | 
|  6008     /* Assemble the new sibling page. */ |  | 
|  6009     MemPage *pNew = apNew[i]; |  | 
|  6010     assert( j<nMaxCells ); |  | 
|  6011     zeroPage(pNew, pageFlags); |  | 
|  6012     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); |  | 
|  6013     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); |  | 
|  6014     assert( pNew->nOverflow==0 ); |  | 
|  6015  |  | 
|  6016     j = cntNew[i]; |  | 
|  6017  |  | 
|  6018     /* If the sibling page assembled above was not the right-most sibling, |  | 
|  6019     ** insert a divider cell into the parent page. |  | 
|  6020     */ |  | 
|  6021     assert( i<nNew-1 || j==nCell ); |  | 
|  6022     if( j<nCell ){ |  | 
|  6023       u8 *pCell; |  | 
|  6024       u8 *pTemp; |  | 
|  6025       int sz; |  | 
|  6026  |  | 
|  6027       assert( j<nMaxCells ); |  | 
|  6028       pCell = apCell[j]; |  | 
|  6029       sz = szCell[j] + leafCorrection; |  | 
|  6030       pTemp = &aOvflSpace[iOvflSpace]; |  | 
|  6031       if( !pNew->leaf ){ |  | 
|  6032         memcpy(&pNew->aData[8], pCell, 4); |  | 
|  6033       }else if( leafData ){ |  | 
|  6034         /* If the tree is a leaf-data tree, and the siblings are leaves,  |  | 
|  6035         ** then there is no divider cell in apCell[]. Instead, the divider  |  | 
|  6036         ** cell consists of the integer key for the right-most cell of  |  | 
|  6037         ** the sibling-page assembled above only. |  | 
|  6038         */ |  | 
|  6039         CellInfo info; |  | 
|  6040         j--; |  | 
|  6041         btreeParseCellPtr(pNew, apCell[j], &info); |  | 
|  6042         pCell = pTemp; |  | 
|  6043         sz = 4 + putVarint(&pCell[4], info.nKey); |  | 
|  6044         pTemp = 0; |  | 
|  6045       }else{ |  | 
|  6046         pCell -= 4; |  | 
|  6047         /* Obscure case for non-leaf-data trees: If the cell at pCell was |  | 
|  6048         ** previously stored on a leaf node, and its reported size was 4 |  | 
|  6049         ** bytes, then it may actually be smaller than this  |  | 
|  6050         ** (see btreeParseCellPtr(), 4 bytes is the minimum size of |  | 
|  6051         ** any cell). But it is important to pass the correct size to  |  | 
|  6052         ** insertCell(), so reparse the cell now. |  | 
|  6053         ** |  | 
|  6054         ** Note that this can never happen in an SQLite data file, as all |  | 
|  6055         ** cells are at least 4 bytes. It only happens in b-trees used |  | 
|  6056         ** to evaluate "IN (SELECT ...)" and similar clauses. |  | 
|  6057         */ |  | 
|  6058         if( szCell[j]==4 ){ |  | 
|  6059           assert(leafCorrection==4); |  | 
|  6060           sz = cellSizePtr(pParent, pCell); |  | 
|  6061         } |  | 
|  6062       } |  | 
|  6063       iOvflSpace += sz; |  | 
|  6064       assert( sz<=pBt->pageSize/4 ); |  | 
|  6065       assert( iOvflSpace<=pBt->pageSize ); |  | 
|  6066       insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc); |  | 
|  6067       if( rc!=SQLITE_OK ) goto balance_cleanup; |  | 
|  6068       assert( sqlite3PagerIswriteable(pParent->pDbPage) ); |  | 
|  6069  |  | 
|  6070       j++; |  | 
|  6071       nxDiv++; |  | 
|  6072     } |  | 
|  6073   } |  | 
|  6074   assert( j==nCell ); |  | 
|  6075   assert( nOld>0 ); |  | 
|  6076   assert( nNew>0 ); |  | 
|  6077   if( (pageFlags & PTF_LEAF)==0 ){ |  | 
|  6078     u8 *zChild = &apCopy[nOld-1]->aData[8]; |  | 
|  6079     memcpy(&apNew[nNew-1]->aData[8], zChild, 4); |  | 
|  6080   } |  | 
|  6081  |  | 
|  6082   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ |  | 
|  6083     /* The root page of the b-tree now contains no cells. The only sibling |  | 
|  6084     ** page is the right-child of the parent. Copy the contents of the |  | 
|  6085     ** child page into the parent, decreasing the overall height of the |  | 
|  6086     ** b-tree structure by one. This is described as the "balance-shallower" |  | 
|  6087     ** sub-algorithm in some documentation. |  | 
|  6088     ** |  | 
|  6089     ** If this is an auto-vacuum database, the call to copyNodeContent()  |  | 
|  6090     ** sets all pointer-map entries corresponding to database image pages  |  | 
|  6091     ** for which the pointer is stored within the content being copied. |  | 
|  6092     ** |  | 
|  6093     ** The second assert below verifies that the child page is defragmented |  | 
|  6094     ** (it must be, as it was just reconstructed using assemblePage()). This |  | 
|  6095     ** is important if the parent page happens to be page 1 of the database |  | 
|  6096     ** image.  */ |  | 
|  6097     assert( nNew==1 ); |  | 
|  6098     assert( apNew[0]->nFree ==  |  | 
|  6099         (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)  |  | 
|  6100     ); |  | 
|  6101     copyNodeContent(apNew[0], pParent, &rc); |  | 
|  6102     freePage(apNew[0], &rc); |  | 
|  6103   }else if( ISAUTOVACUUM ){ |  | 
|  6104     /* Fix the pointer-map entries for all the cells that were shifted around.  |  | 
|  6105     ** There are several different types of pointer-map entries that need to |  | 
|  6106     ** be dealt with by this routine. Some of these have been set already, but |  | 
|  6107     ** many have not. The following is a summary: |  | 
|  6108     ** |  | 
|  6109     **   1) The entries associated with new sibling pages that were not |  | 
|  6110     **      siblings when this function was called. These have already |  | 
|  6111     **      been set. We don't need to worry about old siblings that were |  | 
|  6112     **      moved to the free-list - the freePage() code has taken care |  | 
|  6113     **      of those. |  | 
|  6114     ** |  | 
|  6115     **   2) The pointer-map entries associated with the first overflow |  | 
|  6116     **      page in any overflow chains used by new divider cells. These  |  | 
|  6117     **      have also already been taken care of by the insertCell() code. |  | 
|  6118     ** |  | 
|  6119     **   3) If the sibling pages are not leaves, then the child pages of |  | 
|  6120     **      cells stored on the sibling pages may need to be updated. |  | 
|  6121     ** |  | 
|  6122     **   4) If the sibling pages are not internal intkey nodes, then any |  | 
|  6123     **      overflow pages used by these cells may need to be updated |  | 
|  6124     **      (internal intkey nodes never contain pointers to overflow pages). |  | 
|  6125     ** |  | 
|  6126     **   5) If the sibling pages are not leaves, then the pointer-map |  | 
|  6127     **      entries for the right-child pages of each sibling may need |  | 
|  6128     **      to be updated. |  | 
|  6129     ** |  | 
|  6130     ** Cases 1 and 2 are dealt with above by other code. The next |  | 
|  6131     ** block deals with cases 3 and 4 and the one after that, case 5. Since |  | 
|  6132     ** setting a pointer map entry is a relatively expensive operation, this |  | 
|  6133     ** code only sets pointer map entries for child or overflow pages that have |  | 
|  6134     ** actually moved between pages.  */ |  | 
|  6135     MemPage *pNew = apNew[0]; |  | 
|  6136     MemPage *pOld = apCopy[0]; |  | 
|  6137     int nOverflow = pOld->nOverflow; |  | 
|  6138     int iNextOld = pOld->nCell + nOverflow; |  | 
|  6139     int iOverflow = (nOverflow ? pOld->aOvfl[0].idx : -1); |  | 
|  6140     j = 0;                             /* Current 'old' sibling page */ |  | 
|  6141     k = 0;                             /* Current 'new' sibling page */ |  | 
|  6142     for(i=0; i<nCell; i++){ |  | 
|  6143       int isDivider = 0; |  | 
|  6144       while( i==iNextOld ){ |  | 
|  6145         /* Cell i is the cell immediately following the last cell on old |  | 
|  6146         ** sibling page j. If the siblings are not leaf pages of an |  | 
|  6147         ** intkey b-tree, then cell i was a divider cell. */ |  | 
|  6148         pOld = apCopy[++j]; |  | 
|  6149         iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow; |  | 
|  6150         if( pOld->nOverflow ){ |  | 
|  6151           nOverflow = pOld->nOverflow; |  | 
|  6152           iOverflow = i + !leafData + pOld->aOvfl[0].idx; |  | 
|  6153         } |  | 
|  6154         isDivider = !leafData;   |  | 
|  6155       } |  | 
|  6156  |  | 
|  6157       assert(nOverflow>0 || iOverflow<i ); |  | 
|  6158       assert(nOverflow<2 || pOld->aOvfl[0].idx==pOld->aOvfl[1].idx-1); |  | 
|  6159       assert(nOverflow<3 || pOld->aOvfl[1].idx==pOld->aOvfl[2].idx-1); |  | 
|  6160       if( i==iOverflow ){ |  | 
|  6161         isDivider = 1; |  | 
|  6162         if( (--nOverflow)>0 ){ |  | 
|  6163           iOverflow++; |  | 
|  6164         } |  | 
|  6165       } |  | 
|  6166  |  | 
|  6167       if( i==cntNew[k] ){ |  | 
|  6168         /* Cell i is the cell immediately following the last cell on new |  | 
|  6169         ** sibling page k. If the siblings are not leaf pages of an |  | 
|  6170         ** intkey b-tree, then cell i is a divider cell.  */ |  | 
|  6171         pNew = apNew[++k]; |  | 
|  6172         if( !leafData ) continue; |  | 
|  6173       } |  | 
|  6174       assert( j<nOld ); |  | 
|  6175       assert( k<nNew ); |  | 
|  6176  |  | 
|  6177       /* If the cell was originally divider cell (and is not now) or |  | 
|  6178       ** an overflow cell, or if the cell was located on a different sibling |  | 
|  6179       ** page before the balancing, then the pointer map entries associated |  | 
|  6180       ** with any child or overflow pages need to be updated.  */ |  | 
|  6181       if( isDivider || pOld->pgno!=pNew->pgno ){ |  | 
|  6182         if( !leafCorrection ){ |  | 
|  6183           ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc); |  | 
|  6184         } |  | 
|  6185         if( szCell[i]>pNew->minLocal ){ |  | 
|  6186           ptrmapPutOvflPtr(pNew, apCell[i], &rc); |  | 
|  6187         } |  | 
|  6188       } |  | 
|  6189     } |  | 
|  6190  |  | 
|  6191     if( !leafCorrection ){ |  | 
|  6192       for(i=0; i<nNew; i++){ |  | 
|  6193         u32 key = get4byte(&apNew[i]->aData[8]); |  | 
|  6194         ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); |  | 
|  6195       } |  | 
|  6196     } |  | 
|  6197  |  | 
|  6198 #if 0 |  | 
|  6199     /* The ptrmapCheckPages() contains assert() statements that verify that |  | 
|  6200     ** all pointer map pages are set correctly. This is helpful while  |  | 
|  6201     ** debugging. This is usually disabled because a corrupt database may |  | 
|  6202     ** cause an assert() statement to fail.  */ |  | 
|  6203     ptrmapCheckPages(apNew, nNew); |  | 
|  6204     ptrmapCheckPages(&pParent, 1); |  | 
|  6205 #endif |  | 
|  6206   } |  | 
|  6207  |  | 
|  6208   assert( pParent->isInit ); |  | 
|  6209   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", |  | 
|  6210           nOld, nNew, nCell)); |  | 
|  6211  |  | 
|  6212   /* |  | 
|  6213   ** Cleanup before returning. |  | 
|  6214   */ |  | 
|  6215 balance_cleanup: |  | 
|  6216   sqlite3ScratchFree(apCell); |  | 
|  6217   for(i=0; i<nOld; i++){ |  | 
|  6218     releasePage(apOld[i]); |  | 
|  6219   } |  | 
|  6220   for(i=0; i<nNew; i++){ |  | 
|  6221     releasePage(apNew[i]); |  | 
|  6222   } |  | 
|  6223  |  | 
|  6224   return rc; |  | 
|  6225 } |  | 
|  6226  |  | 
|  6227  |  | 
|  6228 /* |  | 
|  6229 ** This function is called when the root page of a b-tree structure is |  | 
|  6230 ** overfull (has one or more overflow cells). |  | 
|  6231 ** |  | 
|  6232 ** A new child page is allocated and the contents of the current root |  | 
|  6233 ** page, including overflow cells, are copied into the child. The root |  | 
|  6234 ** page is then overwritten to make it an empty page with the right-child  |  | 
|  6235 ** pointer pointing to the new page. |  | 
|  6236 ** |  | 
|  6237 ** Before returning, all pointer-map entries corresponding to pages  |  | 
|  6238 ** that the new child-page now contains pointers to are updated. The |  | 
|  6239 ** entry corresponding to the new right-child pointer of the root |  | 
|  6240 ** page is also updated (this is done for auto-vacuum databases only). |  | 
|  6241 ** |  | 
|  6242 ** If successful, *ppChild is set to contain a reference to the child  |  | 
|  6243 ** page and SQLITE_OK is returned. In this case the caller is required |  | 
|  6244 ** to call releasePage() on *ppChild exactly once. If an error occurs, |  | 
|  6245 ** an error code is returned and *ppChild is set to 0. |  | 
|  6246 */ |  | 
|  6247 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ |  | 
|  6248   int rc;                        /* Return value from subprocedures */ |  | 
|  6249   MemPage *pChild = 0;           /* Pointer to a new child page */ |  | 
|  6250   Pgno pgnoChild = 0;            /* Page number of the new child page */ |  | 
|  6251   BtShared *pBt = pRoot->pBt;    /* The BTree */ |  | 
|  6252   /* Preconditions: pRoot has overflow cells; the BtShared mutex is held. */ |  | 
|  6253   assert( pRoot->nOverflow>0 ); |  | 
|  6254   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  6255  |  | 
|  6256   /* Make pRoot, the root page of the b-tree, writable. Allocate a new  |  | 
|  6257   ** page that will become the new right-child of pRoot. Copy the contents |  | 
|  6258   ** of the node stored on pRoot into the new child page. NOTE(review): |  | 
|  6259   ** presumably copyNodeContent()/ptrmapPut() skip work if rc!=0 - confirm. */ |  | 
|  6260   rc = sqlite3PagerWrite(pRoot->pDbPage); |  | 
|  6261   if( rc==SQLITE_OK ){ |  | 
|  6262     rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0); |  | 
|  6263     copyNodeContent(pRoot, pChild, &rc); |  | 
|  6264     if( ISAUTOVACUUM ){ |  | 
|  6265       ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc); |  | 
|  6266     } |  | 
|  6267   } |  | 
|  6268   if( rc ){                      /* Any failure above: hand back no child */ |  | 
|  6269     *ppChild = 0; |  | 
|  6270     releasePage(pChild); |  | 
|  6271     return rc; |  | 
|  6272   } |  | 
|  6273   assert( sqlite3PagerIswriteable(pChild->pDbPage) ); |  | 
|  6274   assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); |  | 
|  6275   assert( pChild->nCell==pRoot->nCell ); |  | 
|  6276  |  | 
|  6277   TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno)); |  | 
|  6278  |  | 
|  6279   /* Copy the overflow cells (the aOvfl[] entries) from pRoot to pChild */ |  | 
|  6280   memcpy(pChild->aOvfl, pRoot->aOvfl, pRoot->nOverflow*sizeof(pRoot->aOvfl[0])); |  | 
|  6281   pChild->nOverflow = pRoot->nOverflow; |  | 
|  6282   /* Zero the contents of pRoot, re-initializing it with the child's flag |  | 
|  6283   ** byte minus PTF_LEAF. Then install pChild as the right-child. */ |  | 
|  6284   zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF); |  | 
|  6285   put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild); |  | 
|  6286  |  | 
|  6287   *ppChild = pChild; |  | 
|  6288   return SQLITE_OK; |  | 
|  6289 } |  | 
|  6290  |  | 
|  6291 /* |  | 
|  6292 ** The page that pCur currently points to has just been modified in |  | 
|  6293 ** some way. This function figures out if this modification means the |  | 
|  6294 ** tree needs to be balanced, and if so calls the appropriate balancing  |  | 
|  6295 ** routine: balance_quick(), balance_deeper() or balance_nonroot(). |  | 
|  6296 ** |  | 
|  6297 ** Balancing proceeds from pCur's current page up toward the root; each |  | 
|  6298 ** non-root page that is balanced is released and pCur->iPage decremented. |  | 
|  6299 ** Returns SQLITE_OK, or the error code that stopped the loop. |  | 
|  6300 */ |  | 
|  6301 static int balance(BtCursor *pCur){ |  | 
|  6302   int rc = SQLITE_OK;            /* Return code */ |  | 
|  6303   const int nMin = pCur->pBt->usableSize * 2 / 3;  /* Max nFree left as-is */ |  | 
|  6304   u8 aBalanceQuickSpace[13];     /* Space passed to balance_quick() */ |  | 
|  6305   u8 *pFree = 0;                 /* pSpace of previous balance_nonroot() */ |  | 
|  6306   /* The two counters below are used only by assert() statements. */ |  | 
|  6307   TESTONLY( int balance_quick_called = 0 ); |  | 
|  6308   TESTONLY( int balance_deeper_called = 0 ); |  | 
|  6309  |  | 
|  6310   do { |  | 
|  6311     int iPage = pCur->iPage; |  | 
|  6312     MemPage *pPage = pCur->apPage[iPage]; |  | 
|  6313  |  | 
|  6314     if( iPage==0 ){ |  | 
|  6315       if( pPage->nOverflow ){ |  | 
|  6316         /* The root page of the b-tree is overfull. In this case call the |  | 
|  6317         ** balance_deeper() function to create a new child for the root-page |  | 
|  6318         ** and copy the current contents of the root-page to it. The |  | 
|  6319         ** next iteration of the do-loop will balance the child page. |  | 
|  6320         */  |  | 
|  6321         assert( (balance_deeper_called++)==0 ); |  | 
|  6322         rc = balance_deeper(pPage, &pCur->apPage[1]); |  | 
|  6323         if( rc==SQLITE_OK ){ |  | 
|  6324           pCur->iPage = 1;       /* Cursor now sits on the new child page */ |  | 
|  6325           pCur->aiIdx[0] = 0; |  | 
|  6326           pCur->aiIdx[1] = 0; |  | 
|  6327           assert( pCur->apPage[1]->nOverflow ); |  | 
|  6328         } |  | 
|  6329       }else{ |  | 
|  6330         break;                   /* Root page is not overfull: done */ |  | 
|  6331       } |  | 
|  6332     }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){ |  | 
|  6333       break;                     /* No overflow cells and nFree<=nMin: done */ |  | 
|  6334     }else{ |  | 
|  6335       MemPage * const pParent = pCur->apPage[iPage-1]; |  | 
|  6336       int const iIdx = pCur->aiIdx[iPage-1];  /* Cursor's index in pParent */ |  | 
|  6337  |  | 
|  6338       rc = sqlite3PagerWrite(pParent->pDbPage); |  | 
|  6339       if( rc==SQLITE_OK ){ |  | 
|  6340 #ifndef SQLITE_OMIT_QUICKBALANCE |  | 
|  6341         if( pPage->hasData |  | 
|  6342          && pPage->nOverflow==1 |  | 
|  6343          && pPage->aOvfl[0].idx==pPage->nCell |  | 
|  6344          && pParent->pgno!=1 |  | 
|  6345          && pParent->nCell==iIdx |  | 
|  6346         ){ |  | 
|  6347           /* Call balance_quick() to create a new sibling of pPage on which |  | 
|  6348           ** to store the overflow cell. balance_quick() inserts a new cell |  | 
|  6349           ** into pParent, which may cause pParent to overflow. If this |  | 
|  6350           ** happens, the next iteration of the do-loop will balance pParent  |  | 
|  6351           ** using either balance_nonroot() or balance_deeper(). Until this |  | 
|  6352           ** happens, the overflow cell is stored in the aBalanceQuickSpace[] |  | 
|  6353           ** buffer.  |  | 
|  6354           ** |  | 
|  6355           ** The purpose of the following assert() is to check that only a |  | 
|  6356           ** single call to balance_quick() is made for each call to this |  | 
|  6357           ** function. If this were not verified, a subtle bug involving reuse |  | 
|  6358           ** of the aBalanceQuickSpace[] might sneak in. |  | 
|  6359           */ |  | 
|  6360           assert( (balance_quick_called++)==0 ); |  | 
|  6361           rc = balance_quick(pParent, pPage, aBalanceQuickSpace); |  | 
|  6362         }else |  | 
|  6363 #endif |  | 
|  6364         { |  | 
|  6365           /* In this case, call balance_nonroot() to redistribute cells |  | 
|  6366           ** between pPage and up to 2 of its sibling pages. This involves |  | 
|  6367           ** modifying the contents of pParent, which may cause pParent to |  | 
|  6368           ** become overfull or underfull. The next iteration of the do-loop |  | 
|  6369           ** will balance the parent page to correct this. |  | 
|  6370           **  |  | 
|  6371           ** If the parent page becomes overfull, the overflow cell or cells |  | 
|  6372           ** are stored in the pSpace buffer allocated immediately below.  |  | 
|  6373           ** A subsequent iteration of the do-loop will deal with this by |  | 
|  6374           ** calling balance_nonroot() (balance_deeper() may be called first, |  | 
|  6375           ** but it doesn't deal with overflow cells - just moves them to a |  | 
|  6376           ** different page). Once this subsequent call to balance_nonroot()  |  | 
|  6377           ** has completed, it is safe to release the pSpace buffer used by |  | 
|  6378           ** the previous call, as the overflow cell data will have been  |  | 
|  6379           ** copied either into the body of a database page or into the new |  | 
|  6380           ** pSpace buffer passed to the latter call to balance_nonroot(). |  | 
|  6381           ** NOTE(review): pSpace is not NULL-checked here - confirm callee does. */ |  | 
|  6382           u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); |  | 
|  6383           rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1); |  | 
|  6384           if( pFree ){ |  | 
|  6385             /* If pFree is not NULL, it points to the pSpace buffer used  |  | 
|  6386             ** by a previous call to balance_nonroot(). Its contents are |  | 
|  6387             ** now stored either on real database pages or within the  |  | 
|  6388             ** new pSpace buffer, so it may be safely freed here. */ |  | 
|  6389             sqlite3PageFree(pFree); |  | 
|  6390           } |  | 
|  6391  |  | 
|  6392           /* The pSpace buffer will be freed after the next call to |  | 
|  6393           ** balance_nonroot(), or just before this function returns, whichever |  | 
|  6394           ** comes first. */ |  | 
|  6395           pFree = pSpace; |  | 
|  6396         } |  | 
|  6397       } |  | 
|  6398       /* Reached even when rc holds an error: clear pPage's overflow count. */ |  | 
|  6399       pPage->nOverflow = 0; |  | 
|  6400  |  | 
|  6401       /* The next iteration of the do-loop balances the parent page. */ |  | 
|  6402       releasePage(pPage); |  | 
|  6403       pCur->iPage--; |  | 
|  6404     } |  | 
|  6405   }while( rc==SQLITE_OK ); |  | 
|  6406   /* Release the buffer used by the last balance_nonroot() call, if any. */ |  | 
|  6407   if( pFree ){ |  | 
|  6408     sqlite3PageFree(pFree); |  | 
|  6409   } |  | 
|  6410   return rc; |  | 
|  6411 } |  | 
|  6412  |  | 
|  6413  |  | 
|  6414 /* |  | 
|  6415 ** Insert a new record into the BTree.  The key is given by (pKey,nKey) |  | 
|  6416 ** and the data is given by (pData,nData).  The cursor is used only to |  | 
|  6417 ** define what table the record should be inserted into.  The cursor |  | 
|  6418 ** is left pointing at an arbitrary location. |  | 
|  6419 ** |  | 
|  6420 ** For an INTKEY table, only the nKey value of the key is used.  pKey is |  | 
|  6421 ** ignored.  For a ZERODATA table, the pData and nData are both ignored. |  | 
|  6422 ** |  | 
|  6423 ** If the seekResult parameter is non-zero, then a successful call to |  | 
|  6424 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already |  | 
|  6425 ** been performed. seekResult is the search result returned (a negative |  | 
|  6426 ** number if pCur points at an entry that is smaller than (pKey, nKey), or |  | 
|  6427 ** a positive value if pCur points at an entry that is larger than  |  | 
|  6428 ** (pKey, nKey)).  |  | 
|  6429 ** |  | 
|  6430 ** A non-zero seekResult therefore always produces a new entry: this |  | 
|  6431 ** function overwrites an existing entry only when its own seek reports an |  | 
|  6432 ** exact match.  If the seekResult parameter is 0, then cursor pCur may |  | 
|  6433 ** point to any entry or to no entry at all and so this function has to seek |  | 
|  6434 ** the cursor before the new key can be inserted. |  | 
|  6435 */ |  | 
|  6436 int sqlite3BtreeInsert( |  | 
|  6437   BtCursor *pCur,                /* Insert data into the table of this cursor */ |  | 
|  6438   const void *pKey, i64 nKey,    /* The key of the new record */ |  | 
|  6439   const void *pData, int nData,  /* The data of the new record */ |  | 
|  6440   int nZero,                     /* Number of extra 0 bytes to append to data */ |  | 
|  6441   int appendBias,                /* True if this is likely an append */ |  | 
|  6442   int seekResult                 /* Result of prior MovetoUnpacked() call */ |  | 
|  6443 ){ |  | 
|  6444   int rc;                        /* Return code */ |  | 
|  6445   int loc = seekResult;          /* -1: before desired location  +1: after */ |  | 
|  6446   int szNew;                     /* Size of the new cell */ |  | 
|  6447   int idx;                       /* Cell index on pPage used for the insert */ |  | 
|  6448   MemPage *pPage;                /* Page the cursor currently points at */ |  | 
|  6449   Btree *p = pCur->pBtree; |  | 
|  6450   BtShared *pBt = p->pBt; |  | 
|  6451   unsigned char *oldCell;        /* Cell being replaced (overwrite case) */ |  | 
|  6452   unsigned char *newCell = 0;    /* New cell, built in pBt->pTmpSpace */ |  | 
|  6453   /* A cursor in the fault state just reports its saved error code. */ |  | 
|  6454   if( pCur->eState==CURSOR_FAULT ){ |  | 
|  6455     assert( pCur->skipNext!=SQLITE_OK ); |  | 
|  6456     return pCur->skipNext; |  | 
|  6457   } |  | 
|  6458  |  | 
|  6459   assert( cursorHoldsMutex(pCur) ); |  | 
|  6460   assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE && !pBt->readOnly ); |  | 
|  6461   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); |  | 
|  6462  |  | 
|  6463   /* Assert that the caller has been consistent. If this cursor was opened |  | 
|  6464   ** expecting an index b-tree, then the caller should be inserting blob |  | 
|  6465   ** keys with no associated data. If the cursor was opened expecting an |  | 
|  6466   ** intkey table, the caller should be inserting integer keys with a |  | 
|  6467   ** blob of associated data.  */ |  | 
|  6468   assert( (pKey==0)==(pCur->pKeyInfo==0) ); |  | 
|  6469  |  | 
|  6470   /* If this is an insert into a table b-tree, invalidate any incrblob  |  | 
|  6471   ** cursors open on the row being replaced (assuming this is a replace |  | 
|  6472   ** operation - if it is not, the following is a no-op).  */ |  | 
|  6473   if( pCur->pKeyInfo==0 ){ |  | 
|  6474     invalidateIncrblobCursors(p, nKey, 0); |  | 
|  6475   } |  | 
|  6476  |  | 
|  6477   /* Save the positions of any other cursors open on this table. |  | 
|  6478   ** |  | 
|  6479   ** In some cases, the call to btreeMoveto() below is a no-op. For |  | 
|  6480   ** example, when inserting data into a table with auto-generated integer |  | 
|  6481   ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the  |  | 
|  6482   ** integer key to use. It then calls this function to actually insert the  |  | 
|  6483   ** data into the intkey B-Tree. In this case btreeMoveto() recognizes |  | 
|  6484   ** that the cursor is already where it needs to be and returns without |  | 
|  6485   ** doing any work. To avoid thwarting these optimizations, it is important |  | 
|  6486   ** not to clear the cursor here. |  | 
|  6487   */ |  | 
|  6488   rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); |  | 
|  6489   if( rc ) return rc; |  | 
|  6490   if( !loc ){                    /* No prior seek result: seek here */ |  | 
|  6491     rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc); |  | 
|  6492     if( rc ) return rc; |  | 
|  6493   } |  | 
|  6494   assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) ); |  | 
|  6495  |  | 
|  6496   pPage = pCur->apPage[pCur->iPage]; |  | 
|  6497   assert( pPage->intKey || nKey>=0 ); |  | 
|  6498   assert( pPage->leaf || !pPage->intKey ); |  | 
|  6499  |  | 
|  6500   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", |  | 
|  6501           pCur->pgnoRoot, nKey, nData, pPage->pgno, |  | 
|  6502           loc==0 ? "overwrite" : "new entry")); |  | 
|  6503   assert( pPage->isInit ); |  | 
|  6504   allocateTempSpace(pBt); |  | 
|  6505   newCell = pBt->pTmpSpace;      /* Assemble the new cell in pBt->pTmpSpace */ |  | 
|  6506   if( newCell==0 ) return SQLITE_NOMEM; |  | 
|  6507   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew); |  | 
|  6508   if( rc ) goto end_insert; |  | 
|  6509   assert( szNew==cellSizePtr(pPage, newCell) ); |  | 
|  6510   assert( szNew<=MX_CELL_SIZE(pBt) ); |  | 
|  6511   idx = pCur->aiIdx[pCur->iPage]; |  | 
|  6512   if( loc==0 ){                  /* Exact match: replace the existing cell */ |  | 
|  6513     u16 szOld; |  | 
|  6514     assert( idx<pPage->nCell ); |  | 
|  6515     rc = sqlite3PagerWrite(pPage->pDbPage); |  | 
|  6516     if( rc ){ |  | 
|  6517       goto end_insert; |  | 
|  6518     } |  | 
|  6519     oldCell = findCell(pPage, idx); |  | 
|  6520     if( !pPage->leaf ){          /* Non-leaf: keep old cell's first 4 bytes */ |  | 
|  6521       memcpy(newCell, oldCell, 4); |  | 
|  6522     } |  | 
|  6523     szOld = cellSizePtr(pPage, oldCell);  /* Measured before clearCell() */ |  | 
|  6524     rc = clearCell(pPage, oldCell); |  | 
|  6525     dropCell(pPage, idx, szOld, &rc);  /* NOTE(review): no-op if rc!=0? */ |  | 
|  6526     if( rc ) goto end_insert; |  | 
|  6527   }else if( loc<0 && pPage->nCell>0 ){ |  | 
|  6528     assert( pPage->leaf ); |  | 
|  6529     idx = ++pCur->aiIdx[pCur->iPage];  /* Insert just after cursor's cell */ |  | 
|  6530   }else{ |  | 
|  6531     assert( pPage->leaf );       /* Insert at the cursor's current index */ |  | 
|  6532   } |  | 
|  6533   insertCell(pPage, idx, newCell, szNew, 0, 0, &rc); |  | 
|  6534   assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); |  | 
|  6535  |  | 
|  6536   /* If no error has occurred and pPage has an overflow cell, call balance()  |  | 
|  6537   ** to redistribute the cells within the tree. Since balance() may move |  | 
|  6538   ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey |  | 
|  6539   ** variables. |  | 
|  6540   ** |  | 
|  6541   ** Previous versions of SQLite called moveToRoot() to move the cursor |  | 
|  6542   ** back to the root page as balance() used to invalidate the contents |  | 
|  6543   ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that, |  | 
|  6544   ** set the cursor state to "invalid". This makes common insert operations |  | 
|  6545   ** slightly faster. |  | 
|  6546   ** |  | 
|  6547   ** There is a subtle but important optimization here too. When inserting |  | 
|  6548   ** multiple records into an intkey b-tree using a single cursor (as can |  | 
|  6549   ** happen while processing an "INSERT INTO ... SELECT" statement), it |  | 
|  6550   ** is advantageous to leave the cursor pointing to the last entry in |  | 
|  6551   ** the b-tree if possible. If the cursor is left pointing to the last |  | 
|  6552   ** entry in the table, and the next row inserted has an integer key |  | 
|  6553   ** larger than the largest existing key, it is possible to insert the |  | 
|  6554   ** row without seeking the cursor. This can be a big performance boost. |  | 
|  6555   */ |  | 
|  6556   pCur->info.nSize = 0; |  | 
|  6557   pCur->validNKey = 0; |  | 
|  6558   if( rc==SQLITE_OK && pPage->nOverflow ){ |  | 
|  6559     rc = balance(pCur); |  | 
|  6560  |  | 
|  6561     /* Must make sure nOverflow is reset to zero even if the balance() |  | 
|  6562     ** fails. Internal data structure corruption will result otherwise.  |  | 
|  6563     ** Also, set the cursor state to invalid. This stops saveCursorPosition() |  | 
|  6564     ** from trying to save the current position of the cursor.  */ |  | 
|  6565     pCur->apPage[pCur->iPage]->nOverflow = 0; |  | 
|  6566     pCur->eState = CURSOR_INVALID; |  | 
|  6567   } |  | 
|  6568   assert( pCur->apPage[pCur->iPage]->nOverflow==0 ); |  | 
|  6569  |  | 
|  6570 end_insert: |  | 
|  6571   return rc; |  | 
|  6572 } |  | 
|  6573  |  | 
|  6574 /* |  | 
|  6575 ** Delete the entry that the cursor is pointing to.  The cursor |  | 
|  6576 ** is left pointing at an arbitrary location. |  | 
|  6577 */ |  | 
|  6578 int sqlite3BtreeDelete(BtCursor *pCur){ |  | 
|  6579   Btree *p = pCur->pBtree; |  | 
|  6580   BtShared *pBt = p->pBt;               |  | 
|  6581   int rc;                              /* Return code */ |  | 
|  6582   MemPage *pPage;                      /* Page to delete cell from */ |  | 
|  6583   unsigned char *pCell;                /* Pointer to cell to delete */ |  | 
|  6584   int iCellIdx;                        /* Index of cell to delete */ |  | 
|  6585   int iCellDepth;                      /* Depth of node containing pCell */  |  | 
|  6586  |  | 
|  6587   assert( cursorHoldsMutex(pCur) ); |  | 
|  6588   assert( pBt->inTransaction==TRANS_WRITE ); |  | 
|  6589   assert( !pBt->readOnly ); |  | 
|  6590   assert( pCur->wrFlag ); |  | 
|  6591   assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); |  | 
|  6592   assert( !hasReadConflicts(p, pCur->pgnoRoot) ); |  | 
|  6593  |  | 
|  6594   if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell)  |  | 
|  6595    || NEVER(pCur->eState!=CURSOR_VALID) |  | 
|  6596   ){ |  | 
|  6597     return SQLITE_ERROR;  /* Something has gone awry. */ |  | 
|  6598   } |  | 
|  6599  |  | 
|  6600   /* If this is a delete operation to remove a row from a table b-tree, |  | 
|  6601   ** invalidate any incrblob cursors open on the row being deleted.  */ |  | 
|  6602   if( pCur->pKeyInfo==0 ){ |  | 
|  6603     invalidateIncrblobCursors(p, pCur->info.nKey, 0); |  | 
|  6604   } |  | 
|  6605  |  | 
|  6606   iCellDepth = pCur->iPage; |  | 
|  6607   iCellIdx = pCur->aiIdx[iCellDepth]; |  | 
|  6608   pPage = pCur->apPage[iCellDepth]; |  | 
|  6609   pCell = findCell(pPage, iCellIdx); |  | 
|  6610  |  | 
|  6611   /* If the page containing the entry to delete is not a leaf page, move |  | 
|  6612   ** the cursor to the largest entry in the tree that is smaller than |  | 
|  6613   ** the entry being deleted. This cell will replace the cell being deleted |  | 
|  6614   ** from the internal node. The 'previous' entry is used for this instead |  | 
|  6615   ** of the 'next' entry, as the previous entry is always a part of the |  | 
|  6616   ** sub-tree headed by the child page of the cell being deleted. This makes |  | 
|  6617   ** balancing the tree following the delete operation easier.  */ |  | 
|  6618   if( !pPage->leaf ){ |  | 
|  6619     int notUsed; |  | 
|  6620     rc = sqlite3BtreePrevious(pCur, &notUsed); |  | 
|  6621     if( rc ) return rc; |  | 
|  6622   } |  | 
|  6623  |  | 
|  6624   /* Save the positions of any other cursors open on this table before |  | 
|  6625   ** making any modifications. Make the page containing the entry to be  |  | 
|  6626   ** deleted writable. Then free any overflow pages associated with the  |  | 
|  6627   ** entry and finally remove the cell itself from within the page.   |  | 
|  6628   */ |  | 
|  6629   rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); |  | 
|  6630   if( rc ) return rc; |  | 
|  6631   rc = sqlite3PagerWrite(pPage->pDbPage); |  | 
|  6632   if( rc ) return rc; |  | 
|  6633   rc = clearCell(pPage, pCell); |  | 
|  6634   dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc); |  | 
|  6635   if( rc ) return rc; |  | 
|  6636  |  | 
|  6637   /* If the cell deleted was not located on a leaf page, then the cursor |  | 
|  6638   ** is currently pointing to the largest entry in the sub-tree headed |  | 
|  6639   ** by the child-page of the cell that was just deleted from an internal |  | 
|  6640   ** node. The cell from the leaf node needs to be moved to the internal |  | 
|  6641   ** node to replace the deleted cell.  */ |  | 
|  6642   if( !pPage->leaf ){ |  | 
|  6643     MemPage *pLeaf = pCur->apPage[pCur->iPage]; |  | 
|  6644     int nCell; |  | 
|  6645     Pgno n = pCur->apPage[iCellDepth+1]->pgno; |  | 
|  6646     unsigned char *pTmp; |  | 
|  6647  |  | 
|  6648     pCell = findCell(pLeaf, pLeaf->nCell-1); |  | 
|  6649     nCell = cellSizePtr(pLeaf, pCell); |  | 
|  6650     assert( MX_CELL_SIZE(pBt)>=nCell ); |  | 
|  6651  |  | 
|  6652     allocateTempSpace(pBt); |  | 
|  6653     pTmp = pBt->pTmpSpace; |  | 
|  6654  |  | 
|  6655     rc = sqlite3PagerWrite(pLeaf->pDbPage); |  | 
|  6656     insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc); |  | 
|  6657     dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); |  | 
|  6658     if( rc ) return rc; |  | 
|  6659   } |  | 
|  6660  |  | 
|  6661   /* Balance the tree. If the entry deleted was located on a leaf page, |  | 
|  6662   ** then the cursor still points to that page. In this case the first |  | 
|  6663   ** call to balance() repairs the tree, and the if(...) condition is |  | 
|  6664   ** never true. |  | 
|  6665   ** |  | 
|  6666   ** Otherwise, if the entry deleted was on an internal node page, then |  | 
|  6667   ** pCur is pointing to the leaf page from which a cell was removed to |  | 
|  6668   ** replace the cell deleted from the internal node. This is slightly |  | 
|  6669   ** tricky as the leaf node may be underfull, and the internal node may |  | 
|  6670   ** be either under or overfull. In this case run the balancing algorithm |  | 
|  6671   ** on the leaf node first. If the balance proceeds far enough up the |  | 
|  6672   ** tree that we can be sure that any problem in the internal node has |  | 
|  6673   ** been corrected, so be it. Otherwise, after balancing the leaf node, |  | 
|  6674   ** walk the cursor up the tree to the internal node and balance it as  |  | 
|  6675   ** well.  */ |  | 
|  6676   rc = balance(pCur); |  | 
|  6677   if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ |  | 
|  6678     while( pCur->iPage>iCellDepth ){ |  | 
|  6679       releasePage(pCur->apPage[pCur->iPage--]); |  | 
|  6680     } |  | 
|  6681     rc = balance(pCur); |  | 
|  6682   } |  | 
|  6683  |  | 
|  6684   if( rc==SQLITE_OK ){ |  | 
|  6685     moveToRoot(pCur); |  | 
|  6686   } |  | 
|  6687   return rc; |  | 
|  6688 } |  | 
|  6689  |  | 
|  6690 /* |  | 
|  6691 ** Create a new BTree table.  Write into *piTable the page |  | 
|  6692 ** number for the root page of the new table. |  | 
|  6693 ** |  | 
|  6694 ** The type of table is determined by the flags parameter.  Only the |  | 
|  6695 ** following values of flags are currently in use.  Other values for |  | 
|  6696 ** flags might not work: |  | 
|  6697 ** |  | 
|  6698 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys |  | 
|  6699 **     BTREE_ZERODATA                  Used for SQL indices |  | 
|       ** |  | 
|       ** Preconditions (asserted below): the caller holds the Btree mutex, the |  | 
|       ** shared b-tree is inside a write transaction and is not read-only. |  | 
|       ** |  | 
|       ** When auto-vacuum is compiled in and enabled for this database, the |  | 
|       ** new root is placed at the first page number after the stored |  | 
|       ** BTREE_LARGEST_ROOT_PAGE value that is neither a pointer-map page nor |  | 
|       ** the PENDING_BYTE page; whatever page currently lives there is |  | 
|       ** relocated to a freshly allocated page first.  Otherwise the root is |  | 
|       ** simply the page handed back by allocateBtreePage(). |  | 
|       ** |  | 
|       ** Returns SQLITE_OK on success.  On any failure the error code of the |  | 
|       ** failing step is returned and *piTable is not written. |  | 
|  6700 */ |  | 
|  6701 static int btreeCreateTable(Btree *p, int *piTable, int flags){ |  | 
|  6702   BtShared *pBt = p->pBt; |  | 
|  6703   MemPage *pRoot; |  | 
|  6704   Pgno pgnoRoot; |  | 
|  6705   int rc; |  | 
|  6706  |  | 
|  6707   assert( sqlite3BtreeHoldsMutex(p) ); |  | 
|  6708   assert( pBt->inTransaction==TRANS_WRITE ); |  | 
|  6709   assert( !pBt->readOnly ); |  | 
|  6710  |  | 
|  6711 #ifdef SQLITE_OMIT_AUTOVACUUM |  | 
|  6712   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); |  | 
|  6713   if( rc ){ |  | 
|  6714     return rc; |  | 
|  6715   } |  | 
|  6716 #else |  | 
|  6717   if( pBt->autoVacuum ){ |  | 
|  6718     Pgno pgnoMove;      /* Move a page here to make room for the root-page */ |  | 
|  6719     MemPage *pPageMove; /* The page to move to. */ |  | 
|  6720  |  | 
|  6721     /* Creating a new table may require moving an existing database page |  | 
|  6722     ** to make room for the new table's root page. In case this page turns |  | 
|  6723     ** out to be an overflow page, delete all overflow page-map caches |  | 
|  6724     ** held by open cursors. |  | 
|  6725     */ |  | 
|  6726     invalidateAllOverflowCache(pBt); |  | 
|  6727  |  | 
|  6728     /* Read the value of meta[3] from the database to determine where the |  | 
|  6729     ** root page of the new table should go. meta[3] is the largest root-page |  | 
|  6730     ** created so far, so the new root-page is (meta[3]+1). |  | 
|  6731     */ |  | 
|  6732     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot); |  | 
|  6733     pgnoRoot++; |  | 
|  6734  |  | 
|  6735     /* The new root-page may not be allocated on a pointer-map page, or the |  | 
|  6736     ** PENDING_BYTE page. |  | 
|  6737     */ |  | 
|  6738     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || |  | 
|  6739         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ |  | 
|  6740       pgnoRoot++; |  | 
|  6741     } |  | 
|  6742     assert( pgnoRoot>=3 ); |  | 
|  6743  |  | 
|  6744     /* Allocate a page. The page that currently resides at pgnoRoot will |  | 
|  6745     ** be moved to the allocated page (unless the allocated page happens |  | 
|  6746     ** to reside at pgnoRoot). |  | 
|  6747     */ |  | 
|  6748     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1); |  | 
|  6749     if( rc!=SQLITE_OK ){ |  | 
|  6750       return rc; |  | 
|  6751     } |  | 
|  6752  |  | 
|  6753     if( pgnoMove!=pgnoRoot ){ |  | 
|  6754       /* pgnoRoot is the page that will be used for the root-page of |  | 
|  6755       ** the new table (assuming an error did not occur). But we were |  | 
|  6756       ** allocated pgnoMove. If required (i.e. if it was not allocated |  | 
|  6757       ** by extending the file), the current page at position pgnoMove |  | 
|  6758       ** is already journaled. |  | 
|  6759       */ |  | 
|  6760       u8 eType = 0; |  | 
|  6761       Pgno iPtrPage = 0; |  | 
|  6762  |  | 
|  6763       releasePage(pPageMove);  /* Only the page number pgnoMove is used from here on */ |  | 
|  6764  |  | 
|  6765       /* Move the page currently at pgnoRoot to pgnoMove. */ |  | 
|  6766       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); |  | 
|  6767       if( rc!=SQLITE_OK ){ |  | 
|  6768         return rc; |  | 
|  6769       } |  | 
|  6770       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); |  | 
|  6771       if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ |  | 
|  6772         rc = SQLITE_CORRUPT_BKPT;  /* A root or free page at the target slot is treated as corruption */ |  | 
|  6773       } |  | 
|  6774       if( rc!=SQLITE_OK ){ |  | 
|  6775         releasePage(pRoot); |  | 
|  6776         return rc; |  | 
|  6777       } |  | 
|  6778       assert( eType!=PTRMAP_ROOTPAGE ); |  | 
|  6779       assert( eType!=PTRMAP_FREEPAGE ); |  | 
|  6780       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); |  | 
|  6781       releasePage(pRoot); |  | 
|  6782  |  | 
|  6783       /* Obtain the page at pgnoRoot */ |  | 
|  6784       if( rc!=SQLITE_OK ){ |  | 
|  6785         return rc; |  | 
|  6786       } |  | 
|  6787       rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); |  | 
|  6788       if( rc!=SQLITE_OK ){ |  | 
|  6789         return rc; |  | 
|  6790       } |  | 
|  6791       rc = sqlite3PagerWrite(pRoot->pDbPage); |  | 
|  6792       if( rc!=SQLITE_OK ){ |  | 
|  6793         releasePage(pRoot); |  | 
|  6794         return rc; |  | 
|  6795       } |  | 
|  6796     }else{ |  | 
|  6797       pRoot = pPageMove;  /* The allocator returned the wanted slot; nothing to relocate */ |  | 
|  6798     }  |  | 
|  6799  |  | 
|  6800     /* Update the pointer-map and meta-data with the new root-page number. */ |  | 
|  6801     ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc); |  | 
|  6802     if( rc ){ |  | 
|  6803       releasePage(pRoot); |  | 
|  6804       return rc; |  | 
|  6805     } |  | 
|  6806     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);  /* NOTE(review): 4 is presumably BTREE_LARGEST_ROOT_PAGE - confirm */ |  | 
|  6807     if( rc ){ |  | 
|  6808       releasePage(pRoot); |  | 
|  6809       return rc; |  | 
|  6810     } |  | 
|  6811  |  | 
|  6812   }else{ |  | 
|  6813     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); |  | 
|  6814     if( rc ) return rc; |  | 
|  6815   } |  | 
|  6816 #endif |  | 
|  6817   assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); |  | 
|  6818   zeroPage(pRoot, flags | PTF_LEAF);  /* Format the new root as a leaf page with the requested flags */ |  | 
|  6819   sqlite3PagerUnref(pRoot->pDbPage);  /* Drop this routine's reference to the root page */ |  | 
|  6820   *piTable = (int)pgnoRoot; |  | 
|  6821   return SQLITE_OK; |  | 
|  6822 } |  | 
|  6823 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){  /* Non-static wrapper: runs btreeCreateTable() between sqlite3BtreeEnter() and sqlite3BtreeLeave() */ |  | 
|  6824   int rc;  /* Result of btreeCreateTable(), returned unchanged */ |  | 
|  6825   sqlite3BtreeEnter(p); |  | 
|  6826   rc = btreeCreateTable(p, piTable, flags); |  | 
|  6827   sqlite3BtreeLeave(p); |  | 
|  6828   return rc; |  | 
|  6829 } |  | 
|  6830  |  | 
|  6831 /* |  | 
|  6832 ** Erase the given database page and all its children.  Return |  | 
|  6833 ** the page to the freelist if freePageFlag is true. |  | 
|       ** |  | 
|       ** Every cell on the page is passed to clearCell().  On a non-leaf page |  | 
|       ** each child (the page number stored at the start of every cell, then |  | 
|       ** the page number stored at aData[8]) is cleared recursively with |  | 
|       ** freePageFlag set.  If freePageFlag is false for this page, the page |  | 
|       ** is instead made writable and re-initialised by zeroPage() as a leaf. |  | 
|       ** |  | 
|       ** Returns SQLITE_CORRUPT_BKPT if pgno is larger than the page count, |  | 
|       ** otherwise SQLITE_OK or the first error code encountered.  The page |  | 
|       ** reference obtained here is released on every path after the fetch. |  | 
|  6834 */ |  | 
|  6835 static int clearDatabasePage( |  | 
|  6836   BtShared *pBt,           /* The BTree that contains the table */ |  | 
|  6837   Pgno pgno,            /* Page number to clear */ |  | 
|  6838   int freePageFlag,     /* Deallocate page if true */ |  | 
|  6839   int *pnChange         /* If not NULL, incremented by the cell count of each leaf page visited */ |  | 
|  6840 ){ |  | 
|  6841   MemPage *pPage; |  | 
|  6842   int rc; |  | 
|  6843   unsigned char *pCell; |  | 
|  6844   int i; |  | 
|  6845  |  | 
|  6846   assert( sqlite3_mutex_held(pBt->mutex) ); |  | 
|  6847   if( pgno>pagerPagecount(pBt) ){ |  | 
|  6848     return SQLITE_CORRUPT_BKPT; |  | 
|  6849   } |  | 
|  6850  |  | 
|  6851   rc = getAndInitPage(pBt, pgno, &pPage); |  | 
|  6852   if( rc ) return rc; |  | 
|  6853   for(i=0; i<pPage->nCell; i++){ |  | 
|  6854     pCell = findCell(pPage, i); |  | 
|  6855     if( !pPage->leaf ){ |  | 
|  6856       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);  /* Child pages are always freed */ |  | 
|  6857       if( rc ) goto cleardatabasepage_out; |  | 
|  6858     } |  | 
|  6859     rc = clearCell(pPage, pCell); |  | 
|  6860     if( rc ) goto cleardatabasepage_out; |  | 
|  6861   } |  | 
|  6862   if( !pPage->leaf ){ |  | 
|  6863     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange); |  | 
|  6864     if( rc ) goto cleardatabasepage_out; |  | 
|  6865   }else if( pnChange ){ |  | 
|  6866     assert( pPage->intKey );  /* Counting is only requested for int-key tables */ |  | 
|  6867     *pnChange += pPage->nCell; |  | 
|  6868   } |  | 
|  6869   if( freePageFlag ){ |  | 
|  6870     freePage(pPage, &rc); |  | 
|  6871   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ |  | 
|  6872     zeroPage(pPage, pPage->aData[0] | PTF_LEAF); |  | 
|  6873   } |  | 
|  6874  |  | 
|  6875 cleardatabasepage_out: |  | 
|  6876   releasePage(pPage); |  | 
|  6877   return rc; |  | 
|  6878 } |  | 
|  6879  |  | 
|  6880 /* |  | 
|  6881 ** Delete all information from a single table in the database.  iTable is |  | 
|  6882 ** the page number of the root of the table.  After this routine returns, |  | 
|  6883 ** the root page is empty, but still exists. |  | 
|  6884 ** |  | 
|  6885 ** This routine will fail with SQLITE_LOCKED if there are any open |  | 
|  6886 ** read cursors on the table.  Open write cursors are moved to the |  | 
|  6887 ** root of the table. |  | 
|  6888 ** |  | 
|  6889 ** If pnChange is not NULL, then table iTable must be an intkey table. The |  | 
|  6890 ** integer value pointed to by pnChange is incremented by the number of |  | 
|  6891 ** entries in the table. |  | 
|       ** |  | 
|       ** The call is bracketed by sqlite3BtreeEnter()/sqlite3BtreeLeave() and a |  | 
|       ** write transaction must already be open (asserted).  Returns SQLITE_OK, |  | 
|       ** or the error code from saveAllCursors() or clearDatabasePage(). |  | 
|  6892 */ |  | 
|  6893 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){ |  | 
|  6894   int rc; |  | 
|  6895   BtShared *pBt = p->pBt; |  | 
|  6896   sqlite3BtreeEnter(p); |  | 
|  6897   assert( p->inTrans==TRANS_WRITE ); |  | 
|  6898  |  | 
|  6899   /* Invalidate all incrblob cursors open on table iTable (assuming iTable |  | 
|  6900   ** is the root of a table b-tree - if it is not, the following call is |  | 
|  6901   ** a no-op).  */ |  | 
|  6902   invalidateIncrblobCursors(p, 0, 1); |  | 
|  6903  |  | 
|  6904   rc = saveAllCursors(pBt, (Pgno)iTable, 0); |  | 
|  6905   if( SQLITE_OK==rc ){ |  | 
|  6906     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);  /* freePageFlag=0: the root page itself is kept */ |  | 
|  6907   } |  | 
|  6908   sqlite3BtreeLeave(p); |  | 
|  6909   return rc; |  | 
|  6910 } |  | 
|  6911  |  | 
|  6912 /* |  | 
|  6913 ** Erase all information in a table and add the root of the table to |  | 
|  6914 ** the freelist.  Except, the root of the principal table (the one on |  | 
|  6915 ** page 1) is never added to the freelist. |  | 
|  6916 ** |  | 
|  6917 ** This routine will fail with SQLITE_LOCKED if there are any open |  | 
|  6918 ** cursors on the table. |  | 
|  6919 ** |  | 
|  6920 ** If AUTOVACUUM is enabled and the page at iTable is not the last |  | 
|  6921 ** root page in the database file, then the last root page  |  | 
|  6922 ** in the database file is moved into the slot formerly occupied by |  | 
|  6923 ** iTable and that last slot formerly occupied by the last root page |  | 
|  6924 ** is added to the freelist instead of iTable.  In this way, all |  | 
|  6925 ** root pages are kept at the beginning of the database file, which |  | 
|  6926 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the  |  | 
|  6927 ** page number that used to be the last root page in the file before |  | 
|  6928 ** the move.  If no page gets moved, *piMoved is set to 0. |  | 
|  6929 ** The last root page is recorded in meta[3] and the value of |  | 
|  6930 ** meta[3] is updated by this procedure. |  | 
|       ** |  | 
|       ** Returns SQLITE_OK on success, SQLITE_LOCKED_SHAREDCACHE if |  | 
|       ** pBt->pCursor is non-NULL, or the error code of whichever step failed. |  | 
|       ** *piMoved is only written once the table has been cleared successfully. |  | 
|       ** The caller must hold the Btree mutex inside a write transaction |  | 
|       ** (asserted). |  | 
|  6931 */ |  | 
|  6932 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ |  | 
|  6933   int rc; |  | 
|  6934   MemPage *pPage = 0; |  | 
|  6935   BtShared *pBt = p->pBt; |  | 
|  6936  |  | 
|  6937   assert( sqlite3BtreeHoldsMutex(p) ); |  | 
|  6938   assert( p->inTrans==TRANS_WRITE ); |  | 
|  6939  |  | 
|  6940   /* It is illegal to drop a table if any cursors are open on the |  | 
|  6941   ** database. This is because in auto-vacuum mode the backend may |  | 
|  6942   ** need to move another root-page to fill a gap left by the deleted |  | 
|  6943   ** root page. If an open cursor was using this page a problem would  |  | 
|  6944   ** occur. |  | 
|  6945   ** |  | 
|  6946   ** This error is caught long before control reaches this point. |  | 
|  6947   */ |  | 
|  6948   if( NEVER(pBt->pCursor) ){ |  | 
|  6949     sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db); |  | 
|  6950     return SQLITE_LOCKED_SHAREDCACHE; |  | 
|  6951   } |  | 
|  6952  |  | 
|  6953   rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); |  | 
|  6954   if( rc ) return rc; |  | 
|  6955   rc = sqlite3BtreeClearTable(p, iTable, 0); |  | 
|  6956   if( rc ){ |  | 
|  6957     releasePage(pPage); |  | 
|  6958     return rc; |  | 
|  6959   } |  | 
|  6960  |  | 
|  6961   *piMoved = 0; |  | 
|  6962  |  | 
|  6963   if( iTable>1 ){ |  | 
|  6964 #ifdef SQLITE_OMIT_AUTOVACUUM |  | 
|  6965     freePage(pPage, &rc); |  | 
|  6966     releasePage(pPage); |  | 
|  6967 #else |  | 
|  6968     if( pBt->autoVacuum ){ |  | 
|  6969       Pgno maxRootPgno; |  | 
|  6970       sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno); |  | 
|  6971  |  | 
|  6972       if( iTable==maxRootPgno ){ |  | 
|  6973         /* If the table being dropped is the table with the largest root-page |  | 
|  6974         ** number in the database, put the root page on the free list.  |  | 
|  6975         */ |  | 
|  6976         freePage(pPage, &rc); |  | 
|  6977         releasePage(pPage); |  | 
|  6978         if( rc!=SQLITE_OK ){ |  | 
|  6979           return rc; |  | 
|  6980         } |  | 
|  6981       }else{ |  | 
|  6982         /* The table being dropped does not have the largest root-page |  | 
|  6983         ** number in the database. So move the page that does into the  |  | 
|  6984         ** gap left by the deleted root-page. |  | 
|  6985         */ |  | 
|  6986         MemPage *pMove; |  | 
|  6987         releasePage(pPage); |  | 
|  6988         rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); |  | 
|  6989         if( rc!=SQLITE_OK ){ |  | 
|  6990           return rc; |  | 
|  6991         } |  | 
|  6992         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0); |  | 
|  6993         releasePage(pMove); |  | 
|  6994         if( rc!=SQLITE_OK ){ |  | 
|  6995           return rc; |  | 
|  6996         } |  | 
|  6997         pMove = 0; |  | 
|  6998         rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); |  | 
|  6999         freePage(pMove, &rc);  /* NOTE(review): relies on freePage() doing nothing when rc!=SQLITE_OK - confirm */ |  | 
|  7000         releasePage(pMove); |  | 
|  7001         if( rc!=SQLITE_OK ){ |  | 
|  7002           return rc; |  | 
|  7003         } |  | 
|  7004         *piMoved = maxRootPgno; |  | 
|  7005       } |  | 
|  7006  |  | 
|  7007       /* Set the new 'max-root-page' value in the database header. This |  | 
|  7008       ** is the old value less one, less one more if that happens to |  | 
|  7009       ** be a pointer-map page, less one again if that is the |  | 
|  7010       ** PENDING_BYTE_PAGE. |  | 
|  7011       */ |  | 
|  7012       maxRootPgno--; |  | 
|  7013       while( maxRootPgno==PENDING_BYTE_PAGE(pBt) |  | 
|  7014              || PTRMAP_ISPAGE(pBt, maxRootPgno) ){ |  | 
|  7015         maxRootPgno--; |  | 
|  7016       } |  | 
|  7017       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); |  | 
|  7018  |  | 
|  7019       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);  /* NOTE(review): 4 is presumably BTREE_LARGEST_ROOT_PAGE - confirm */ |  | 
|  7020     }else{ |  | 
|  7021       freePage(pPage, &rc); |  | 
|  7022       releasePage(pPage); |  | 
|  7023     } |  | 
|  7024 #endif |  | 
|  7025   }else{ |  | 
|  7026     /* If sqlite3BtreeDropTable was called on page 1. |  | 
|  7027     ** This really never should happen except in a corrupt |  | 
|  7028     ** database.  |  | 
|  7029     */ |  | 
|  7030     zeroPage(pPage, PTF_INTKEY|PTF_LEAF ); |  | 
|  7031     releasePage(pPage); |  | 
|  7032   } |  | 
|  7033   return rc;   |  | 
|  7034 } |  | 
|  7035 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){  /* Non-static wrapper: runs btreeDropTable() between sqlite3BtreeEnter() and sqlite3BtreeLeave() */ |  | 
|  7036   int rc;  /* Result of btreeDropTable(), returned unchanged */ |  | 
|  7037   sqlite3BtreeEnter(p); |  | 
|  7038   rc = btreeDropTable(p, iTable, piMoved); |  | 
|  7039   sqlite3BtreeLeave(p); |  | 
|  7040   return rc; |  | 
|  7041 } |  | 
|  7042  |  | 
|  7043  |  | 
|  7044 /* |  | 
|  7045 ** This function may only be called if the b-tree connection already |  | 
|  7046 ** has a read or write transaction open on the database. |  | 
|  7047 ** |  | 
|  7048 ** Read the meta-information out of a database file.  Meta[0] |  | 
|  7049 ** is the number of free pages currently in the database.  Meta[1] |  | 
|  7050 ** through meta[15] are available for use by higher layers.  Meta[0] |  | 
|  7051 ** is read-only, the others are read/write. |  | 
|  7052 **  |  | 
|  7053 ** The schema layer numbers meta values differently.  At the schema |  | 
|  7054 ** layer (and the SetCookie and ReadCookie opcodes) the number of |  | 
|  7055 ** free pages is not visible.  So Cookie[0] is the same as Meta[1]. |  | 
|       ** |  | 
|       ** idx must be in the range 0..15 (asserted).  The value is read with |  | 
|       ** get4byte() from byte offset 36+idx*4 of page 1 and stored in *pMeta. |  | 
|       ** The call is bracketed by sqlite3BtreeEnter()/sqlite3BtreeLeave(). |  | 
|  7056 */ |  | 
|  7057 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ |  | 
|  7058   BtShared *pBt = p->pBt; |  | 
|  7059  |  | 
|  7060   sqlite3BtreeEnter(p); |  | 
|  7061   assert( p->inTrans>TRANS_NONE ); |  | 
|  7062   assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) ); |  | 
|  7063   assert( pBt->pPage1 ); |  | 
|  7064   assert( idx>=0 && idx<=15 ); |  | 
|  7065  |  | 
|  7066   *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); |  | 
|  7067  |  | 
|  7068   /* If auto-vacuum is disabled in this build and this is an auto-vacuum |  | 
|  7069   ** database, mark the database as read-only.  */ |  | 
|  7070 #ifdef SQLITE_OMIT_AUTOVACUUM |  | 
|  7071   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ) pBt->readOnly = 1; |  | 
|  7072 #endif |  | 
|  7073  |  | 
|  7074   sqlite3BtreeLeave(p); |  | 
|  7075 } |  | 
|  7076  |  | 
|  7077 /* |  | 
|  7078 ** Write meta-information back into the database.  Meta[0] is |  | 
|  7079 ** read-only and may not be written. |  | 
|       ** |  | 
|       ** idx must be in the range 1..15 and a write transaction must be open |  | 
|       ** (both asserted).  Page 1 is made writable first; iMeta is stored at |  | 
|       ** byte offset 36+idx*4 of page 1 only if that succeeds.  When auto-vacuum |  | 
|       ** is compiled in and idx is BTREE_INCR_VACUUM, pBt->incrVacuum is updated |  | 
|       ** to match.  Returns the result of sqlite3PagerWrite() on page 1. |  | 
|  7080 */ |  | 
|  7081 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ |  | 
|  7082   BtShared *pBt = p->pBt; |  | 
|  7083   unsigned char *pP1; |  | 
|  7084   int rc; |  | 
|  7085   assert( idx>=1 && idx<=15 ); |  | 
|  7086   sqlite3BtreeEnter(p); |  | 
|  7087   assert( p->inTrans==TRANS_WRITE ); |  | 
|  7088   assert( pBt->pPage1!=0 ); |  | 
|  7089   pP1 = pBt->pPage1->aData; |  | 
|  7090   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); |  | 
|  7091   if( rc==SQLITE_OK ){ |  | 
|  7092     put4byte(&pP1[36 + idx*4], iMeta); |  | 
|  7093 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  7094     if( idx==BTREE_INCR_VACUUM ){ |  | 
|  7095       assert( pBt->autoVacuum || iMeta==0 ); |  | 
|  7096       assert( iMeta==0 || iMeta==1 ); |  | 
|  7097       pBt->incrVacuum = (u8)iMeta;  /* Keep the in-memory flag in step with the value just written */ |  | 
|  7098     } |  | 
|  7099 #endif |  | 
|  7100   } |  | 
|  7101   sqlite3BtreeLeave(p); |  | 
|  7102   return rc; |  | 
|  7103 } |  | 
|  7104  |  | 
|  7105 #ifndef SQLITE_OMIT_BTREECOUNT |  | 
|  7106 /* |  | 
|  7107 ** The first argument, pCur, is a cursor opened on some b-tree. Count the |  | 
|  7108 ** number of entries in the b-tree and write the result to *pnEntry. |  | 
|  7109 ** |  | 
|  7110 ** SQLITE_OK is returned if the operation is successfully executed.  |  | 
|  7111 ** Otherwise, if an error is encountered (i.e. an IO error or database |  | 
|  7112 ** corruption) an SQLite error code is returned. |  | 
|       ** |  | 
|       ** The cursor is moved to the root first and is repositioned throughout |  | 
|       ** the traversal.  *pnEntry is written only when the traversal completes |  | 
|       ** successfully.  On int-key trees only the cells of leaf pages are |  | 
|       ** counted; on other trees the cells of every page are counted. |  | 
|  7113 */ |  | 
|  7114 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){ |  | 
|  7115   i64 nEntry = 0;                      /* Value to return in *pnEntry */ |  | 
|  7116   int rc;                              /* Return code */ |  | 
|  7117   rc = moveToRoot(pCur); |  | 
|  7118  |  | 
|  7119   /* Unless an error occurs, the following loop runs one iteration for each |  | 
|  7120   ** page in the B-Tree structure (not including overflow pages).  |  | 
|  7121   */ |  | 
|  7122   while( rc==SQLITE_OK ){ |  | 
|  7123     int iIdx;                          /* Index of child node in parent */ |  | 
|  7124     MemPage *pPage;                    /* Current page of the b-tree */ |  | 
|  7125  |  | 
|  7126     /* If this is a leaf page or the tree is not an int-key tree, then  |  | 
|  7127     ** this page contains countable entries. Increment the entry counter |  | 
|  7128     ** accordingly. |  | 
|  7129     */ |  | 
|  7130     pPage = pCur->apPage[pCur->iPage]; |  | 
|  7131     if( pPage->leaf || !pPage->intKey ){ |  | 
|  7132       nEntry += pPage->nCell; |  | 
|  7133     } |  | 
|  7134  |  | 
|  7135     /* pPage is a leaf node. This loop navigates the cursor so that it  |  | 
|  7136     ** points to the first interior cell that is the parent of |  | 
|  7137     ** the next page in the tree that has not yet been visited. The |  | 
|  7138     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell |  | 
|  7139     ** of the page, or to the number of cells in the page if the next page |  | 
|  7140     ** to visit is the right-child of its parent. |  | 
|  7141     ** |  | 
|  7142     ** If all pages in the tree have been visited, return SQLITE_OK to the |  | 
|  7143     ** caller. |  | 
|  7144     */ |  | 
|  7145     if( pPage->leaf ){ |  | 
|  7146       do { |  | 
|  7147         if( pCur->iPage==0 ){ |  | 
|  7148           /* All pages of the b-tree have been visited. Return successfully. */ |  | 
|  7149           *pnEntry = nEntry; |  | 
|  7150           return SQLITE_OK; |  | 
|  7151         } |  | 
|  7152         moveToParent(pCur); |  | 
|  7153       }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );  /* Keep climbing while the right-child of this parent was the subtree just finished */ |  | 
|  7154  |  | 
|  7155       pCur->aiIdx[pCur->iPage]++;  /* Advance to the next child of this ancestor */ |  | 
|  7156       pPage = pCur->apPage[pCur->iPage]; |  | 
|  7157     } |  | 
|  7158  |  | 
|  7159     /* Descend to the child node of the cell that the cursor currently  |  | 
|  7160     ** points at. This is the right-child if (iIdx==pPage->nCell). |  | 
|  7161     */ |  | 
|  7162     iIdx = pCur->aiIdx[pCur->iPage]; |  | 
|  7163     if( iIdx==pPage->nCell ){ |  | 
|  7164       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); |  | 
|  7165     }else{ |  | 
|  7166       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx))); |  | 
|  7167     } |  | 
|  7168   } |  | 
|  7169  |  | 
|  7170   /* An error has occurred. Return an error code. */ |  | 
|  7171   return rc; |  | 
|  7172 } |  | 
|  7173 #endif |  | 
|  7174  |  | 
|  7175 /* |  | 
|  7176 ** Return the pager associated with a BTree.  This routine is used for |  | 
|  7177 ** testing and debugging only. |  | 
|       ** |  | 
|       ** The value returned is the pPager field of the BtShared object that |  | 
|       ** p points to.  This routine does not call sqlite3BtreeEnter() and does |  | 
|       ** nothing other than read that field. |  | 
|  7178 */ |  | 
|  7179 Pager *sqlite3BtreePager(Btree *p){ |  | 
|  7180   return p->pBt->pPager; |  | 
|  7181 } |  | 
|  7182  |  | 
|  7183 #ifndef SQLITE_OMIT_INTEGRITY_CHECK |  | 
|  7184 /* |  | 
|  7185 ** Append a message to the error message string. |  | 
|       ** |  | 
|       ** Does nothing when pCheck->mxErr is zero.  Otherwise mxErr is |  | 
|       ** decremented, nErr is incremented and the message is appended to |  | 
|       ** pCheck->errMsg: a newline first if the accumulator already holds |  | 
|       ** text, then zMsg1 (if not NULL), then the printf-style expansion of |  | 
|       ** zFormat and the variable arguments.  If the accumulator reports an |  | 
|       ** allocation failure, pCheck->mallocFailed is set. |  | 
|  7186 */ |  | 
|  7187 static void checkAppendMsg( |  | 
|  7188   IntegrityCk *pCheck,    /* Integrity check context that collects the messages */ |  | 
|  7189   char *zMsg1,            /* Optional prefix text; may be NULL */ |  | 
|  7190   const char *zFormat,    /* Format string handed to sqlite3VXPrintf() */ |  | 
|  7191   ... |  | 
|  7192 ){ |  | 
|  7193   va_list ap; |  | 
|  7194   if( !pCheck->mxErr ) return; |  | 
|  7195   pCheck->mxErr--; |  | 
|  7196   pCheck->nErr++; |  | 
|  7197   va_start(ap, zFormat); |  | 
|  7198   if( pCheck->errMsg.nChar ){ |  | 
|  7199     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1); |  | 
|  7200   } |  | 
|  7201   if( zMsg1 ){ |  | 
|  7202     sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1); |  | 
|  7203   } |  | 
|  7204   sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap); |  | 
|  7205   va_end(ap); |  | 
|  7206   if( pCheck->errMsg.mallocFailed ){ |  | 
|  7207     pCheck->mallocFailed = 1; |  | 
|  7208   } |  | 
|  7209 } |  | 
|  7210 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ |  | 
|  7211  |  | 
|  7212 #ifndef SQLITE_OMIT_INTEGRITY_CHECK |  | 
|  7213 /* |  | 
|  7214 ** Add 1 to the reference count for page iPage.  If this is the second |  | 
|  7215 ** reference to the page, add an error message to pCheck->errMsg. |  | 
|  7216 ** Return 1 if there are 2 or more references to the page and 0 if |  | 
|  7217 ** this is the first reference to the page. |  | 
|  7218 ** |  | 
|  7219 ** Also check that the page number is in bounds. |  | 
|       ** |  | 
|       ** A page number of 0 returns 1 without any message.  A page number |  | 
|       ** greater than pCheck->nPage appends an "invalid page number" message |  | 
|       ** and returns 1.  zContext is the prefix passed on to checkAppendMsg(). |  | 
|  7220 */ |  | 
|  7221 static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){ |  | 
|  7222   if( iPage==0 ) return 1; |  | 
|  7223   if( iPage>pCheck->nPage ){ |  | 
|  7224     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage); |  | 
|  7225     return 1; |  | 
|  7226   } |  | 
|  7227   if( pCheck->anRef[iPage]==1 ){ |  | 
|  7228     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage); |  | 
|  7229     return 1;  /* The count is left at 1 in this case */ |  | 
|  7230   } |  | 
|  7231   return  (pCheck->anRef[iPage]++)>1;  /* Post-increment: yields 0 when the count was 0, nonzero only if it was already above 1 */ |  | 
|  7232 } |  | 
|  7233  |  | 
|  7234 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  7235 /* |  | 
|  7236 ** Check that the entry in the pointer-map for page iChild maps to  |  | 
|  7237 ** page iParent, pointer type ptrType. If not, append an error message |  | 
|  7238 ** to pCheck. |  | 
|       ** |  | 
|       ** The expected pointer type is passed in the eType parameter.  If |  | 
|       ** ptrmapGet() itself fails, a "Failed to read ptrmap" message is |  | 
|       ** appended instead (and pCheck->mallocFailed is set when the error is |  | 
|       ** SQLITE_NOMEM or SQLITE_IOERR_NOMEM) and no comparison is made. |  | 
|  7239 */ |  | 
|  7240 static void checkPtrmap( |  | 
|  7241   IntegrityCk *pCheck,   /* Integrity check context */ |  | 
|  7242   Pgno iChild,           /* Child page number */ |  | 
|  7243   u8 eType,              /* Expected pointer map type */ |  | 
|  7244   Pgno iParent,          /* Expected pointer map parent page number */ |  | 
|  7245   char *zContext         /* Context description (used for error msg) */ |  | 
|  7246 ){ |  | 
|  7247   int rc; |  | 
|  7248   u8 ePtrmapType;        /* Type actually recorded in the pointer-map */ |  | 
|  7249   Pgno iPtrmapParent;    /* Parent actually recorded in the pointer-map */ |  | 
|  7250  |  | 
|  7251   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); |  | 
|  7252   if( rc!=SQLITE_OK ){ |  | 
|  7253     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1; |  | 
|  7254     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild); |  | 
|  7255     return; |  | 
|  7256   } |  | 
|  7257  |  | 
|  7258   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ |  | 
|  7259     checkAppendMsg(pCheck, zContext,  |  | 
|  7260       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",  |  | 
|  7261       iChild, eType, iParent, ePtrmapType, iPtrmapParent); |  | 
|  7262   } |  | 
|  7263 } |  | 
|  7264 #endif |  | 
|  7265  |  | 
|  7266 /* |  | 
|  7267 ** Check the integrity of the freelist or of an overflow page list. |  | 
|  7268 ** Verify that the number of pages on the list is N. |  | 
|       ** |  | 
|       ** The first 4 bytes of every list page hold the number of the next page |  | 
|       ** in the list.  For a freelist, each list page also holds a count at |  | 
|       ** byte offset 4 followed by that many 4-byte page numbers starting at |  | 
|       ** offset 8; every such page is reference-checked and counted against N. |  | 
|       ** In an auto-vacuum database the matching pointer-map entries are |  | 
|       ** verified as well.  Checking stops when N pages have been accounted |  | 
|       ** for, when pCheck->mxErr reaches zero, or at the first page that is |  | 
|       ** missing, already referenced, or cannot be fetched. |  | 
|  7269 */ |  | 
|  7270 static void checkList( |  | 
|  7271   IntegrityCk *pCheck,  /* Integrity checking context */ |  | 
|  7272   int isFreeList,       /* True for a freelist.  False for overflow page list */ |  | 
|  7273   int iPage,            /* Page number for first page in the list */ |  | 
|  7274   int N,                /* Expected number of pages in the list */ |  | 
|  7275   char *zContext        /* Context for error messages */ |  | 
|  7276 ){ |  | 
|  7277   int i; |  | 
|  7278   int expected = N;     /* Original N, kept for the error message */ |  | 
|  7279   int iFirst = iPage;   /* First page of the list, kept for the error message */ |  | 
|  7280   while( N-- > 0 && pCheck->mxErr ){ |  | 
|  7281     DbPage *pOvflPage; |  | 
|  7282     unsigned char *pOvflData; |  | 
|  7283     if( iPage<1 ){  /* List ended early; the message says "overflow list" even when isFreeList is set */ |  | 
|  7284       checkAppendMsg(pCheck, zContext, |  | 
|  7285          "%d of %d pages missing from overflow list starting at %d", |  | 
|  7286           N+1, expected, iFirst); |  | 
|  7287       break; |  | 
|  7288     } |  | 
|  7289     if( checkRef(pCheck, iPage, zContext) ) break; |  | 
|  7290     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){ |  | 
|  7291       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage); |  | 
|  7292       break; |  | 
|  7293     } |  | 
|  7294     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); |  | 
|  7295     if( isFreeList ){ |  | 
|  7296       int n = get4byte(&pOvflData[4]);  /* NOTE(review): n is a signed int; if get4byte() can return a value above INT_MAX (corrupt page), n goes negative and passes the bound check below - confirm */ |  | 
|  7297 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  7298       if( pCheck->pBt->autoVacuum ){ |  | 
|  7299         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext); |  | 
|  7300       } |  | 
|  7301 #endif |  | 
|  7302       if( n>pCheck->pBt->usableSize/4-2 ){ |  | 
|  7303         checkAppendMsg(pCheck, zContext, |  | 
|  7304            "freelist leaf count too big on page %d", iPage); |  | 
|  7305         N--; |  | 
|  7306       }else{ |  | 
|  7307         for(i=0; i<n; i++){ |  | 
|  7308           Pgno iFreePage = get4byte(&pOvflData[8+i*4]); |  | 
|  7309 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  7310           if( pCheck->pBt->autoVacuum ){ |  | 
|  7311             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext); |  | 
|  7312           } |  | 
|  7313 #endif |  | 
|  7314           checkRef(pCheck, iFreePage, zContext); |  | 
|  7315         } |  | 
|  7316         N -= n;  /* The pages listed on this page count toward the expected total */ |  | 
|  7317       } |  | 
|  7318     } |  | 
|  7319 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  7320     else{ |  | 
|  7321       /* If this database supports auto-vacuum and iPage is not the last |  | 
|  7322       ** page in this overflow list, check that the pointer-map entry for |  | 
|  7323       ** the following page matches iPage. |  | 
|  7324       */ |  | 
|  7325       if( pCheck->pBt->autoVacuum && N>0 ){ |  | 
|  7326         i = get4byte(pOvflData); |  | 
|  7327         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext); |  | 
|  7328       } |  | 
|  7329     } |  | 
|  7330 #endif |  | 
|  7331     iPage = get4byte(pOvflData);  /* Follow the link to the next page in the list */ |  | 
|  7332     sqlite3PagerUnref(pOvflPage); |  | 
|  7333   } |  | 
|  7334 } |  | 
|  7335 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ |  | 
|  7336  |  | 
|  7337 #ifndef SQLITE_OMIT_INTEGRITY_CHECK |  | 
|  7338 /* |  | 
|  7339 ** Do various sanity checks on a single page of a tree.  Return |  | 
|  7340 ** the tree depth.  Root pages return 0.  Parents of root pages |  | 
|  7341 ** return 1, and so forth. |  | 
|  7342 **  |  | 
|  7343 ** These checks are done: |  | 
|  7344 ** |  | 
|  7345 **      1.  Make sure that cells and freeblocks do not overlap |  | 
|  7346 **          but combine to completely cover the page. |  | 
|  7347 **  NO  2.  Make sure cell keys are in order. |  | 
|  7348 **  NO  3.  Make sure no key is less than or equal to zLowerBound. |  | 
|  7349 **  NO  4.  Make sure no key is greater than or equal to zUpperBound. |  | 
|  7350 **      5.  Check the integrity of overflow pages. |  | 
|  7351 **      6.  Recursively call checkTreePage on all children. |  | 
|  7352 **      7.  Verify that the depth of all children is the same. |  | 
|  7353 **      8.  Make sure this page is at least 33% full or else it is |  | 
|  7354 **          the root of the tree. |  | 
|  7355 */ |  | 
|  7356 static int checkTreePage( |  | 
|  7357   IntegrityCk *pCheck,  /* Context for the sanity check */ |  | 
|  7358   int iPage,            /* Page number of the page to check */ |  | 
|  7359   char *zParentContext  /* Parent context */ |  | 
|  7360 ){ |  | 
|  7361   MemPage *pPage; |  | 
|  7362   int i, rc, depth, d2, pgno, cnt; |  | 
|  7363   int hdr, cellStart; |  | 
|  7364   int nCell; |  | 
|  7365   u8 *data; |  | 
|  7366   BtShared *pBt; |  | 
|  7367   int usableSize; |  | 
|  7368   char zContext[100]; |  | 
|  7369   char *hit = 0; |  | 
|  7370  |  | 
|  7371   sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage); |  | 
|  7372  |  | 
|  7373   /* Check that the page exists |  | 
|  7374   */ |  | 
|  7375   pBt = pCheck->pBt; |  | 
|  7376   usableSize = pBt->usableSize; |  | 
|  7377   if( iPage==0 ) return 0; |  | 
|  7378   if( checkRef(pCheck, iPage, zParentContext) ) return 0; |  | 
|  7379   if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){ |  | 
|  7380     checkAppendMsg(pCheck, zContext, |  | 
|  7381        "unable to get the page. error code=%d", rc); |  | 
|  7382     return 0; |  | 
|  7383   } |  | 
|  7384  |  | 
|  7385   /* Clear MemPage.isInit to make sure the corruption detection code in |  | 
|  7386   ** btreeInitPage() is executed.  */ |  | 
|  7387   pPage->isInit = 0; |  | 
|  7388   if( (rc = btreeInitPage(pPage))!=0 ){ |  | 
|  7389     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */ |  | 
|  7390     checkAppendMsg(pCheck, zContext,  |  | 
|  7391                    "btreeInitPage() returns error code %d", rc); |  | 
|  7392     releasePage(pPage); |  | 
|  7393     return 0; |  | 
|  7394   } |  | 
|  7395  |  | 
|  7396   /* Check out all the cells. |  | 
|  7397   */ |  | 
|  7398   depth = 0; |  | 
|  7399   for(i=0; i<pPage->nCell && pCheck->mxErr; i++){ |  | 
|  7400     u8 *pCell; |  | 
|  7401     u32 sz; |  | 
|  7402     CellInfo info; |  | 
|  7403  |  | 
|  7404     /* Check payload overflow pages |  | 
|  7405     */ |  | 
|  7406     sqlite3_snprintf(sizeof(zContext), zContext, |  | 
|  7407              "On tree page %d cell %d: ", iPage, i); |  | 
|  7408     pCell = findCell(pPage,i); |  | 
|  7409     btreeParseCellPtr(pPage, pCell, &info); |  | 
|  7410     sz = info.nData; |  | 
|  7411     if( !pPage->intKey ) sz += (int)info.nKey; |  | 
|  7412     assert( sz==info.nPayload ); |  | 
|  7413     if( (sz>info.nLocal)  |  | 
|  7414      && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize]) |  | 
|  7415     ){ |  | 
|  7416       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4); |  | 
|  7417       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]); |  | 
|  7418 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  7419       if( pBt->autoVacuum ){ |  | 
|  7420         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext); |  | 
|  7421       } |  | 
|  7422 #endif |  | 
|  7423       checkList(pCheck, 0, pgnoOvfl, nPage, zContext); |  | 
|  7424     } |  | 
|  7425  |  | 
|  7426     /* Check sanity of left child page. |  | 
|  7427     */ |  | 
|  7428     if( !pPage->leaf ){ |  | 
|  7429       pgno = get4byte(pCell); |  | 
|  7430 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  7431       if( pBt->autoVacuum ){ |  | 
|  7432         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext); |  | 
|  7433       } |  | 
|  7434 #endif |  | 
|  7435       d2 = checkTreePage(pCheck, pgno, zContext); |  | 
|  7436       if( i>0 && d2!=depth ){ |  | 
|  7437         checkAppendMsg(pCheck, zContext, "Child page depth differs"); |  | 
|  7438       } |  | 
|  7439       depth = d2; |  | 
|  7440     } |  | 
|  7441   } |  | 
|  7442   if( !pPage->leaf ){ |  | 
|  7443     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); |  | 
|  7444     sqlite3_snprintf(sizeof(zContext), zContext,  |  | 
|  7445                      "On page %d at right child: ", iPage); |  | 
|  7446 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  7447     if( pBt->autoVacuum ){ |  | 
|  7448       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0); |  | 
|  7449     } |  | 
|  7450 #endif |  | 
|  7451     checkTreePage(pCheck, pgno, zContext); |  | 
|  7452   } |  | 
|  7453   |  | 
|  7454   /* Check for complete coverage of the page |  | 
|  7455   */ |  | 
|  7456   data = pPage->aData; |  | 
|  7457   hdr = pPage->hdrOffset; |  | 
|  7458   hit = sqlite3PageMalloc( pBt->pageSize ); |  | 
|  7459   if( hit==0 ){ |  | 
|  7460     pCheck->mallocFailed = 1; |  | 
|  7461   }else{ |  | 
|  7462     u16 contentOffset = get2byte(&data[hdr+5]); |  | 
|  7463     assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */ |  | 
|  7464     memset(hit+contentOffset, 0, usableSize-contentOffset); |  | 
|  7465     memset(hit, 1, contentOffset); |  | 
|  7466     nCell = get2byte(&data[hdr+3]); |  | 
|  7467     cellStart = hdr + 12 - 4*pPage->leaf; |  | 
|  7468     for(i=0; i<nCell; i++){ |  | 
|  7469       int pc = get2byte(&data[cellStart+i*2]); |  | 
|  7470       u16 size = 1024; |  | 
|  7471       int j; |  | 
|  7472       if( pc<=usableSize-4 ){ |  | 
|  7473         size = cellSizePtr(pPage, &data[pc]); |  | 
|  7474       } |  | 
|  7475       if( (pc+size-1)>=usableSize ){ |  | 
|  7476         checkAppendMsg(pCheck, 0,  |  | 
|  7477             "Corruption detected in cell %d on page %d",i,iPage,0); |  | 
|  7478       }else{ |  | 
|  7479         for(j=pc+size-1; j>=pc; j--) hit[j]++; |  | 
|  7480       } |  | 
|  7481     } |  | 
|  7482     i = get2byte(&data[hdr+1]); |  | 
|  7483     while( i>0 ){ |  | 
|  7484       int size, j; |  | 
|  7485       assert( i<=usableSize-4 );     /* Enforced by btreeInitPage() */ |  | 
|  7486       size = get2byte(&data[i+2]); |  | 
|  7487       assert( i+size<=usableSize );  /* Enforced by btreeInitPage() */ |  | 
|  7488       for(j=i+size-1; j>=i; j--) hit[j]++; |  | 
|  7489       j = get2byte(&data[i]); |  | 
|  7490       assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */ |  | 
|  7491       assert( j<=usableSize-4 );   /* Enforced by btreeInitPage() */ |  | 
|  7492       i = j; |  | 
|  7493     } |  | 
|  7494     for(i=cnt=0; i<usableSize; i++){ |  | 
|  7495       if( hit[i]==0 ){ |  | 
|  7496         cnt++; |  | 
|  7497       }else if( hit[i]>1 ){ |  | 
|  7498         checkAppendMsg(pCheck, 0, |  | 
|  7499           "Multiple uses for byte %d of page %d", i, iPage); |  | 
|  7500         break; |  | 
|  7501       } |  | 
|  7502     } |  | 
|  7503     if( cnt!=data[hdr+7] ){ |  | 
|  7504       checkAppendMsg(pCheck, 0,  |  | 
|  7505           "Fragmentation of %d bytes reported as %d on page %d", |  | 
|  7506           cnt, data[hdr+7], iPage); |  | 
|  7507     } |  | 
|  7508   } |  | 
|  7509   sqlite3PageFree(hit); |  | 
|  7510   releasePage(pPage); |  | 
|  7511   return depth+1; |  | 
|  7512 } |  | 
|  7513 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ |  | 
|  7514  |  | 
|  7515 #ifndef SQLITE_OMIT_INTEGRITY_CHECK |  | 
|  7516 /* |  | 
|  7517 ** This routine does a complete check of the given BTree file.  aRoot[] is |  | 
|  7518 ** an array of pages numbers were each page number is the root page of |  | 
|  7519 ** a table.  nRoot is the number of entries in aRoot. |  | 
|  7520 ** |  | 
|  7521 ** A read-only or read-write transaction must be opened before calling |  | 
|  7522 ** this function. |  | 
|  7523 ** |  | 
|  7524 ** Write the number of error seen in *pnErr.  Except for some memory |  | 
|  7525 ** allocation errors,  an error message held in memory obtained from |  | 
|  7526 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is |  | 
|  7527 ** returned.  If a memory allocation error occurs, NULL is returned. |  | 
|  7528 */ |  | 
|  7529 char *sqlite3BtreeIntegrityCheck( |  | 
|  7530   Btree *p,     /* The btree to be checked */ |  | 
|  7531   int *aRoot,   /* An array of root pages numbers for individual trees */ |  | 
|  7532   int nRoot,    /* Number of entries in aRoot[] */ |  | 
|  7533   int mxErr,    /* Stop reporting errors after this many */ |  | 
|  7534   int *pnErr    /* Write number of errors seen to this variable */ |  | 
|  7535 ){ |  | 
|  7536   Pgno i; |  | 
|  7537   int nRef; |  | 
|  7538   IntegrityCk sCheck; |  | 
|  7539   BtShared *pBt = p->pBt; |  | 
|  7540   char zErr[100]; |  | 
|  7541  |  | 
|  7542   sqlite3BtreeEnter(p); |  | 
|  7543   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); |  | 
|  7544   nRef = sqlite3PagerRefcount(pBt->pPager); |  | 
|  7545   sCheck.pBt = pBt; |  | 
|  7546   sCheck.pPager = pBt->pPager; |  | 
|  7547   sCheck.nPage = pagerPagecount(sCheck.pBt); |  | 
|  7548   sCheck.mxErr = mxErr; |  | 
|  7549   sCheck.nErr = 0; |  | 
|  7550   sCheck.mallocFailed = 0; |  | 
|  7551   *pnErr = 0; |  | 
|  7552   if( sCheck.nPage==0 ){ |  | 
|  7553     sqlite3BtreeLeave(p); |  | 
|  7554     return 0; |  | 
|  7555   } |  | 
|  7556   sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); |  | 
|  7557   if( !sCheck.anRef ){ |  | 
|  7558     *pnErr = 1; |  | 
|  7559     sqlite3BtreeLeave(p); |  | 
|  7560     return 0; |  | 
|  7561   } |  | 
|  7562   for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; } |  | 
|  7563   i = PENDING_BYTE_PAGE(pBt); |  | 
|  7564   if( i<=sCheck.nPage ){ |  | 
|  7565     sCheck.anRef[i] = 1; |  | 
|  7566   } |  | 
|  7567   sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000); |  | 
|  7568  |  | 
|  7569   /* Check the integrity of the freelist |  | 
|  7570   */ |  | 
|  7571   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), |  | 
|  7572             get4byte(&pBt->pPage1->aData[36]), "Main freelist: "); |  | 
|  7573  |  | 
|  7574   /* Check all the tables. |  | 
|  7575   */ |  | 
|  7576   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ |  | 
|  7577     if( aRoot[i]==0 ) continue; |  | 
|  7578 #ifndef SQLITE_OMIT_AUTOVACUUM |  | 
|  7579     if( pBt->autoVacuum && aRoot[i]>1 ){ |  | 
|  7580       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0); |  | 
|  7581     } |  | 
|  7582 #endif |  | 
|  7583     checkTreePage(&sCheck, aRoot[i], "List of tree roots: "); |  | 
|  7584   } |  | 
|  7585  |  | 
|  7586   /* Make sure every page in the file is referenced |  | 
|  7587   */ |  | 
|  7588   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ |  | 
|  7589 #ifdef SQLITE_OMIT_AUTOVACUUM |  | 
|  7590     if( sCheck.anRef[i]==0 ){ |  | 
|  7591       checkAppendMsg(&sCheck, 0, "Page %d is never used", i); |  | 
|  7592     } |  | 
|  7593 #else |  | 
|  7594     /* If the database supports auto-vacuum, make sure no tables contain |  | 
|  7595     ** references to pointer-map pages. |  | 
|  7596     */ |  | 
|  7597     if( sCheck.anRef[i]==0 &&  |  | 
|  7598        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ |  | 
|  7599       checkAppendMsg(&sCheck, 0, "Page %d is never used", i); |  | 
|  7600     } |  | 
|  7601     if( sCheck.anRef[i]!=0 &&  |  | 
|  7602        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ |  | 
|  7603       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i); |  | 
|  7604     } |  | 
|  7605 #endif |  | 
|  7606   } |  | 
|  7607  |  | 
|  7608   /* Make sure this analysis did not leave any unref() pages. |  | 
|  7609   ** This is an internal consistency check; an integrity check |  | 
|  7610   ** of the integrity check. |  | 
|  7611   */ |  | 
|  7612   if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){ |  | 
|  7613     checkAppendMsg(&sCheck, 0,  |  | 
|  7614       "Outstanding page count goes from %d to %d during this analysis", |  | 
|  7615       nRef, sqlite3PagerRefcount(pBt->pPager) |  | 
|  7616     ); |  | 
|  7617   } |  | 
|  7618  |  | 
|  7619   /* Clean  up and report errors. |  | 
|  7620   */ |  | 
|  7621   sqlite3BtreeLeave(p); |  | 
|  7622   sqlite3_free(sCheck.anRef); |  | 
|  7623   if( sCheck.mallocFailed ){ |  | 
|  7624     sqlite3StrAccumReset(&sCheck.errMsg); |  | 
|  7625     *pnErr = sCheck.nErr+1; |  | 
|  7626     return 0; |  | 
|  7627   } |  | 
|  7628   *pnErr = sCheck.nErr; |  | 
|  7629   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg); |  | 
|  7630   return sqlite3StrAccumFinish(&sCheck.errMsg); |  | 
|  7631 } |  | 
|  7632 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ |  | 
|  7633  |  | 
|  7634 /* |  | 
|  7635 ** Return the full pathname of the underlying database file. |  | 
|  7636 ** |  | 
|  7637 ** The pager filename is invariant as long as the pager is |  | 
|  7638 ** open so it is safe to access without the BtShared mutex. |  | 
|  7639 */ |  | 
|  7640 const char *sqlite3BtreeGetFilename(Btree *p){ |  | 
|  7641   assert( p->pBt->pPager!=0 ); |  | 
|  7642   return sqlite3PagerFilename(p->pBt->pPager); |  | 
|  7643 } |  | 
|  7644  |  | 
|  7645 /* |  | 
|  7646 ** Return the pathname of the journal file for this database. The return |  | 
|  7647 ** value of this routine is the same regardless of whether the journal file |  | 
|  7648 ** has been created or not. |  | 
|  7649 ** |  | 
|  7650 ** The pager journal filename is invariant as long as the pager is |  | 
|  7651 ** open so it is safe to access without the BtShared mutex. |  | 
|  7652 */ |  | 
|  7653 const char *sqlite3BtreeGetJournalname(Btree *p){ |  | 
|  7654   assert( p->pBt->pPager!=0 ); |  | 
|  7655   return sqlite3PagerJournalname(p->pBt->pPager); |  | 
|  7656 } |  | 
|  7657  |  | 
|  7658 /* |  | 
|  7659 ** Return non-zero if a transaction is active. |  | 
|  7660 */ |  | 
|  7661 int sqlite3BtreeIsInTrans(Btree *p){ |  | 
|  7662   assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); |  | 
|  7663   return (p && (p->inTrans==TRANS_WRITE)); |  | 
|  7664 } |  | 
|  7665  |  | 
|  7666 /* |  | 
|  7667 ** Return non-zero if a read (or write) transaction is active. |  | 
|  7668 */ |  | 
|  7669 int sqlite3BtreeIsInReadTrans(Btree *p){ |  | 
|  7670   assert( p ); |  | 
|  7671   assert( sqlite3_mutex_held(p->db->mutex) ); |  | 
|  7672   return p->inTrans!=TRANS_NONE; |  | 
|  7673 } |  | 
|  7674  |  | 
|  7675 int sqlite3BtreeIsInBackup(Btree *p){ |  | 
|  7676   assert( p ); |  | 
|  7677   assert( sqlite3_mutex_held(p->db->mutex) ); |  | 
|  7678   return p->nBackup!=0; |  | 
|  7679 } |  | 
|  7680  |  | 
|  7681 /* |  | 
|  7682 ** This function returns a pointer to a blob of memory associated with |  | 
|  7683 ** a single shared-btree. The memory is used by client code for its own |  | 
|  7684 ** purposes (for example, to store a high-level schema associated with  |  | 
|  7685 ** the shared-btree). The btree layer manages reference counting issues. |  | 
|  7686 ** |  | 
|  7687 ** The first time this is called on a shared-btree, nBytes bytes of memory |  | 
|  7688 ** are allocated, zeroed, and returned to the caller. For each subsequent  |  | 
|  7689 ** call the nBytes parameter is ignored and a pointer to the same blob |  | 
|  7690 ** of memory returned.  |  | 
|  7691 ** |  | 
|  7692 ** If the nBytes parameter is 0 and the blob of memory has not yet been |  | 
|  7693 ** allocated, a null pointer is returned. If the blob has already been |  | 
|  7694 ** allocated, it is returned as normal. |  | 
|  7695 ** |  | 
|  7696 ** Just before the shared-btree is closed, the function passed as the  |  | 
|  7697 ** xFree argument when the memory allocation was made is invoked on the  |  | 
|  7698 ** blob of allocated memory. This function should not call sqlite3_free() |  | 
|  7699 ** on the memory, the btree layer does that. |  | 
|  7700 */ |  | 
|  7701 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ |  | 
|  7702   BtShared *pBt = p->pBt; |  | 
|  7703   sqlite3BtreeEnter(p); |  | 
|  7704   if( !pBt->pSchema && nBytes ){ |  | 
|  7705     pBt->pSchema = sqlite3MallocZero(nBytes); |  | 
|  7706     pBt->xFreeSchema = xFree; |  | 
|  7707   } |  | 
|  7708   sqlite3BtreeLeave(p); |  | 
|  7709   return pBt->pSchema; |  | 
|  7710 } |  | 
|  7711  |  | 
|  7712 /* |  | 
|  7713 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared  |  | 
|  7714 ** btree as the argument handle holds an exclusive lock on the  |  | 
|  7715 ** sqlite_master table. Otherwise SQLITE_OK. |  | 
|  7716 */ |  | 
|  7717 int sqlite3BtreeSchemaLocked(Btree *p){ |  | 
|  7718   int rc; |  | 
|  7719   assert( sqlite3_mutex_held(p->db->mutex) ); |  | 
|  7720   sqlite3BtreeEnter(p); |  | 
|  7721   rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); |  | 
|  7722   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE ); |  | 
|  7723   sqlite3BtreeLeave(p); |  | 
|  7724   return rc; |  | 
|  7725 } |  | 
|  7726  |  | 
|  7727  |  | 
|  7728 #ifndef SQLITE_OMIT_SHARED_CACHE |  | 
|  7729 /* |  | 
|  7730 ** Obtain a lock on the table whose root page is iTab.  The |  | 
|  7731 ** lock is a write lock if isWritelock is true or a read lock |  | 
|  7732 ** if it is false. |  | 
|  7733 */ |  | 
|  7734 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ |  | 
|  7735   int rc = SQLITE_OK; |  | 
|  7736   assert( p->inTrans!=TRANS_NONE ); |  | 
|  7737   if( p->sharable ){ |  | 
|  7738     u8 lockType = READ_LOCK + isWriteLock; |  | 
|  7739     assert( READ_LOCK+1==WRITE_LOCK ); |  | 
|  7740     assert( isWriteLock==0 || isWriteLock==1 ); |  | 
|  7741  |  | 
|  7742     sqlite3BtreeEnter(p); |  | 
|  7743     rc = querySharedCacheTableLock(p, iTab, lockType); |  | 
|  7744     if( rc==SQLITE_OK ){ |  | 
|  7745       rc = setSharedCacheTableLock(p, iTab, lockType); |  | 
|  7746     } |  | 
|  7747     sqlite3BtreeLeave(p); |  | 
|  7748   } |  | 
|  7749   return rc; |  | 
|  7750 } |  | 
|  7751 #endif |  | 
|  7752  |  | 
|  7753 #ifndef SQLITE_OMIT_INCRBLOB |  | 
|  7754 /* |  | 
|  7755 ** Argument pCsr must be a cursor opened for writing on an  |  | 
|  7756 ** INTKEY table currently pointing at a valid table entry.  |  | 
|  7757 ** This function modifies the data stored as part of that entry. |  | 
|  7758 ** |  | 
|  7759 ** Only the data content may only be modified, it is not possible to  |  | 
|  7760 ** change the length of the data stored. If this function is called with |  | 
|  7761 ** parameters that attempt to write past the end of the existing data, |  | 
|  7762 ** no modifications are made and SQLITE_CORRUPT is returned. |  | 
|  7763 */ |  | 
|  7764 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ |  | 
|  7765   int rc; |  | 
|  7766   assert( cursorHoldsMutex(pCsr) ); |  | 
|  7767   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); |  | 
|  7768   assert( pCsr->isIncrblobHandle ); |  | 
|  7769  |  | 
|  7770   rc = restoreCursorPosition(pCsr); |  | 
|  7771   if( rc!=SQLITE_OK ){ |  | 
|  7772     return rc; |  | 
|  7773   } |  | 
|  7774   assert( pCsr->eState!=CURSOR_REQUIRESEEK ); |  | 
|  7775   if( pCsr->eState!=CURSOR_VALID ){ |  | 
|  7776     return SQLITE_ABORT; |  | 
|  7777   } |  | 
|  7778  |  | 
|  7779   /* Check some assumptions:  |  | 
|  7780   **   (a) the cursor is open for writing, |  | 
|  7781   **   (b) there is a read/write transaction open, |  | 
|  7782   **   (c) the connection holds a write-lock on the table (if required), |  | 
|  7783   **   (d) there are no conflicting read-locks, and |  | 
|  7784   **   (e) the cursor points at a valid row of an intKey table. |  | 
|  7785   */ |  | 
|  7786   if( !pCsr->wrFlag ){ |  | 
|  7787     return SQLITE_READONLY; |  | 
|  7788   } |  | 
|  7789   assert( !pCsr->pBt->readOnly && pCsr->pBt->inTransaction==TRANS_WRITE ); |  | 
|  7790   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) ); |  | 
|  7791   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) ); |  | 
|  7792   assert( pCsr->apPage[pCsr->iPage]->intKey ); |  | 
|  7793  |  | 
|  7794   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1); |  | 
|  7795 } |  | 
|  7796  |  | 
|  7797 /*  |  | 
|  7798 ** Set a flag on this cursor to cache the locations of pages from the  |  | 
|  7799 ** overflow list for the current row. This is used by cursors opened |  | 
|  7800 ** for incremental blob IO only. |  | 
|  7801 ** |  | 
|  7802 ** This function sets a flag only. The actual page location cache |  | 
|  7803 ** (stored in BtCursor.aOverflow[]) is allocated and used by function |  | 
|  7804 ** accessPayload() (the worker function for sqlite3BtreeData() and |  | 
|  7805 ** sqlite3BtreePutData()). |  | 
|  7806 */ |  | 
|  7807 void sqlite3BtreeCacheOverflow(BtCursor *pCur){ |  | 
|  7808   assert( cursorHoldsMutex(pCur) ); |  | 
|  7809   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); |  | 
|  7810   assert(!pCur->isIncrblobHandle); |  | 
|  7811   assert(!pCur->aOverflow); |  | 
|  7812   pCur->isIncrblobHandle = 1; |  | 
|  7813 } |  | 
|  7814  |  | 
|  7815 /* Poison the db so that other clients error out as quickly as |  | 
|  7816 ** possible. |  | 
|  7817 */ |  | 
|  7818 int sqlite3Poison(sqlite3 *db){ |  | 
|  7819   int rc; |  | 
|  7820   Btree *p; |  | 
|  7821   BtShared *pBt; |  | 
|  7822   unsigned char *pP1; |  | 
|  7823  |  | 
|  7824   if( db == NULL) return SQLITE_OK; |  | 
|  7825  |  | 
|  7826   /* Database 0 corrosponds to the main database. */ |  | 
|  7827   if( db->nDb<1 ) return SQLITE_OK; |  | 
|  7828   p = db->aDb[0].pBt; |  | 
|  7829   pBt = p->pBt; |  | 
|  7830  |  | 
|  7831   /* If in a transaction, roll it back.  Committing any changes to a |  | 
|  7832   ** corrupt database may mess up evidence, we definitely don't want |  | 
|  7833   ** to allow poisoning to be rolled back, and the database is anyhow |  | 
|  7834   ** going bye-bye RSN. |  | 
|  7835   */ |  | 
|  7836   /* TODO(shess): Figure out if this might release the lock and let |  | 
|  7837   ** someone else get in there, which might deny us the lock a couple |  | 
|  7838   ** lines down. |  | 
|  7839   */ |  | 
|  7840   if( sqlite3BtreeIsInTrans(p) ) sqlite3BtreeRollback(p); |  | 
|  7841  |  | 
|  7842   /* Start an exclusive transaction.  This will check the headers, so |  | 
|  7843   ** if someone else poisoned the database we should get an error. |  | 
|  7844   */ |  | 
|  7845   rc = sqlite3BtreeBeginTrans(p, 2); |  | 
|  7846   /* TODO(shess): Handle SQLITE_BUSY? */ |  | 
|  7847   if( rc!=SQLITE_OK ) return rc; |  | 
|  7848  |  | 
|  7849   /* Copied from sqlite3BtreeUpdateMeta().  Writing the old version of |  | 
|  7850   ** the page to the journal may be overkill, but it probably won't |  | 
|  7851   ** hurt. |  | 
|  7852   */ |  | 
|  7853   assert( pBt->inTrans==TRANS_WRITE ); |  | 
|  7854   assert( pBt->pPage1!=0 ); |  | 
|  7855   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); |  | 
|  7856   if( rc ) goto err; |  | 
|  7857  |  | 
|  7858   /* "SQLite format 3" changes to |  | 
|  7859   ** "SQLite poison 3".  Be extra paranoid about making this change. |  | 
|  7860   */ |  | 
|  7861   if( sizeof(zMagicHeader)!=16 || |  | 
|  7862       sizeof(zPoisonHeader)!=sizeof(zMagicHeader) ){ |  | 
|  7863     rc = SQLITE_ERROR; |  | 
|  7864     goto err; |  | 
|  7865   } |  | 
|  7866   pP1 = pBt->pPage1->aData; |  | 
|  7867   if( memcmp(pP1, zMagicHeader, 16)!=0 ){ |  | 
|  7868     rc = SQLITE_CORRUPT; |  | 
|  7869     goto err; |  | 
|  7870   } |  | 
|  7871   memcpy(pP1, zPoisonHeader, 16); |  | 
|  7872  |  | 
|  7873   /* Push it to the database file. */ |  | 
|  7874   return sqlite3BtreeCommit(p); |  | 
|  7875  |  | 
|  7876  err: |  | 
|  7877   /* TODO(shess): What about errors, here? */ |  | 
|  7878   sqlite3BtreeRollback(p); |  | 
|  7879   return rc; |  | 
|  7880 } |  | 
|  7881  |  | 
|  7882 #endif |  | 
| OLD | NEW |