Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(189)

Unified Diff: third_party/sqlite/src/src/wal.c

Issue 1610963002: Import SQLite 3.10.2. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/sqlite/src/src/wal.h ('k') | third_party/sqlite/src/src/walker.c » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/sqlite/src/src/wal.c
diff --git a/third_party/sqlite/src/src/wal.c b/third_party/sqlite/src/src/wal.c
index d134a8b52a31089b555ada725aa259c94646286c..f38e24a961a37b367f996849a9d2b33a970065a1 100644
--- a/third_party/sqlite/src/src/wal.c
+++ b/third_party/sqlite/src/src/wal.c
@@ -272,7 +272,8 @@ int sqlite3WalTrace = 0;
/*
** Indices of various locking bytes. WAL_NREADER is the number
-** of available reader locks and should be at least 3.
+** of available reader locks and should be at least 3. The default
+** is SQLITE_SHM_NLOCK==8 and WAL_NREADER==5.
*/
#define WAL_WRITE_LOCK 0
#define WAL_ALL_BUT_WRITE 1
@@ -292,7 +293,10 @@ typedef struct WalCkptInfo WalCkptInfo;
** The following object holds a copy of the wal-index header content.
**
** The actual header in the wal-index consists of two copies of this
-** object.
+** object followed by one instance of the WalCkptInfo object.
+** For all versions of SQLite through 3.10.0 and probably beyond,
+** the locking bytes (WalCkptInfo.aLock) start at offset 120 and
+** the total header size is 136 bytes.
**
** The szPage value can be any power of 2 between 512 and 32768, inclusive.
** Or it can be 1 to represent a 65536-byte page. The latter case was
@@ -325,6 +329,16 @@ struct WalIndexHdr {
** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from
** mxFrame back to zero when the WAL is reset.
**
+** nBackfillAttempted is the largest value of nBackfill that a checkpoint
+** has attempted to achieve. Normally nBackfill==nBackfillAtempted, however
+** the nBackfillAttempted is set before any backfilling is done and the
+** nBackfill is only set after all backfilling completes. So if a checkpoint
+** crashes, nBackfillAttempted might be larger than nBackfill. The
+** WalIndexHdr.mxFrame must never be less than nBackfillAttempted.
+**
+** The aLock[] field is a set of bytes used for locking. These bytes should
+** never be read or written.
+**
** There is one entry in aReadMark[] for each reader lock. If a reader
** holds read-lock K, then the value in aReadMark[K] is no greater than
** the mxFrame for that reader. The value READMARK_NOT_USED (0xffffffff)
@@ -364,6 +378,9 @@ struct WalIndexHdr {
struct WalCkptInfo {
u32 nBackfill; /* Number of WAL frames backfilled into DB */
u32 aReadMark[WAL_NREADER]; /* Reader marks */
+ u8 aLock[SQLITE_SHM_NLOCK]; /* Reserved space for locks */
+ u32 nBackfillAttempted; /* WAL frames perhaps written, or maybe not */
+ u32 notUsed0; /* Available for future enhancements */
};
#define READMARK_NOT_USED 0xffffffff
@@ -373,9 +390,8 @@ struct WalCkptInfo {
** only support mandatory file-locks, we do not read or write data
** from the region of the file on which locks are applied.
*/
-#define WALINDEX_LOCK_OFFSET (sizeof(WalIndexHdr)*2 + sizeof(WalCkptInfo))
-#define WALINDEX_LOCK_RESERVED 16
-#define WALINDEX_HDR_SIZE (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)
+#define WALINDEX_LOCK_OFFSET (sizeof(WalIndexHdr)*2+offsetof(WalCkptInfo,aLock))
+#define WALINDEX_HDR_SIZE (sizeof(WalIndexHdr)*2+sizeof(WalCkptInfo))
/* Size of header before each frame in wal */
#define WAL_FRAME_HDRSIZE 24
@@ -428,11 +444,15 @@ struct Wal {
u8 syncHeader; /* Fsync the WAL header if true */
u8 padToSectorBoundary; /* Pad transactions out to the next sector */
WalIndexHdr hdr; /* Wal-index header for current transaction */
+ u32 minFrame; /* Ignore wal frames before this one */
const char *zWalName; /* Name of WAL file */
u32 nCkpt; /* Checkpoint sequence counter in the wal-header */
#ifdef SQLITE_DEBUG
u8 lockError; /* True if a locking error has occurred */
#endif
+#ifdef SQLITE_ENABLE_SNAPSHOT
+ WalIndexHdr *pSnapshot; /* Start transaction here if not NULL */
+#endif
};
/*
@@ -522,7 +542,7 @@ static int walIndexPage(Wal *pWal, int iPage, volatile u32 **ppPage){
if( pWal->nWiData<=iPage ){
int nByte = sizeof(u32*)*(iPage+1);
volatile u32 **apNew;
- apNew = (volatile u32 **)sqlite3_realloc((void *)pWal->apWiData, nByte);
+ apNew = (volatile u32 **)sqlite3_realloc64((void *)pWal->apWiData, nByte);
if( !apNew ){
*ppPage = 0;
return SQLITE_NOMEM;
@@ -648,9 +668,9 @@ static void walIndexWriteHdr(Wal *pWal){
pWal->hdr.isInit = 1;
pWal->hdr.iVersion = WALINDEX_MAX_VERSION;
walChecksumBytes(1, (u8*)&pWal->hdr, nCksum, 0, pWal->hdr.aCksum);
- memcpy((void *)&aHdr[1], (void *)&pWal->hdr, sizeof(WalIndexHdr));
+ memcpy((void*)&aHdr[1], (const void*)&pWal->hdr, sizeof(WalIndexHdr));
walShmBarrier(pWal);
- memcpy((void *)&aHdr[0], (void *)&pWal->hdr, sizeof(WalIndexHdr));
+ memcpy((void*)&aHdr[0], (const void*)&pWal->hdr, sizeof(WalIndexHdr));
}
/*
@@ -951,13 +971,13 @@ static void walCleanupHash(Wal *pWal){
** via the hash table even after the cleanup.
*/
if( iLimit ){
- int i; /* Loop counter */
+ int j; /* Loop counter */
int iKey; /* Hash key */
- for(i=1; i<=iLimit; i++){
- for(iKey=walHash(aPgno[i]); aHash[iKey]; iKey=walNextHash(iKey)){
- if( aHash[iKey]==i ) break;
+ for(j=1; j<=iLimit; j++){
+ for(iKey=walHash(aPgno[j]); aHash[iKey]; iKey=walNextHash(iKey)){
+ if( aHash[iKey]==j ) break;
}
- assert( aHash[iKey]==i );
+ assert( aHash[iKey]==j );
}
}
#endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */
@@ -1146,7 +1166,7 @@ static int walIndexRecover(Wal *pWal){
/* Malloc a buffer to read frames into. */
szFrame = szPage + WAL_FRAME_HDRSIZE;
- aFrame = (u8 *)sqlite3_malloc(szFrame);
+ aFrame = (u8 *)sqlite3_malloc64(szFrame);
if( !aFrame ){
rc = SQLITE_NOMEM;
goto recovery_error;
@@ -1197,6 +1217,7 @@ finished:
*/
pInfo = walCkptInfo(pWal);
pInfo->nBackfill = 0;
+ pInfo->nBackfillAttempted = pWal->hdr.mxFrame;
pInfo->aReadMark[0] = 0;
for(i=1; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;
if( pWal->hdr.mxFrame ) pInfo->aReadMark[1] = pWal->hdr.mxFrame;
@@ -1268,7 +1289,11 @@ int sqlite3WalOpen(
/* In the amalgamation, the os_unix.c and os_win.c source files come before
** this source file. Verify that the #defines of the locking byte offsets
** in os_unix.c and os_win.c agree with the WALINDEX_LOCK_OFFSET value.
+ ** For that matter, if the lock offset ever changes from its initial design
+ ** value of 120, we need to know that so there is an assert() to check it.
*/
+ assert( 120==WALINDEX_LOCK_OFFSET );
+ assert( 136==WALINDEX_HDR_SIZE );
#ifdef WIN_SHM_BASE
assert( WIN_SHM_BASE==WALINDEX_LOCK_OFFSET );
#endif
@@ -1459,7 +1484,7 @@ static void walMergesort(
int nMerge = 0; /* Number of elements in list aMerge */
ht_slot *aMerge = 0; /* List to be merged */
int iList; /* Index into input list */
- int iSub = 0; /* Index into aSub array */
+ u32 iSub = 0; /* Index into aSub array */
struct Sublist aSub[13]; /* Array of sub-lists */
memset(aSub, 0, sizeof(aSub));
@@ -1470,7 +1495,9 @@ static void walMergesort(
nMerge = 1;
aMerge = &aList[iList];
for(iSub=0; iList & (1<<iSub); iSub++){
- struct Sublist *p = &aSub[iSub];
+ struct Sublist *p;
+ assert( iSub<ArraySize(aSub) );
+ p = &aSub[iSub];
assert( p->aList && p->nList<=(1<<iSub) );
assert( p->aList==&aList[iList&~((2<<iSub)-1)] );
walMerge(aContent, p->aList, p->nList, &aMerge, &nMerge, aBuffer);
@@ -1481,7 +1508,9 @@ static void walMergesort(
for(iSub++; iSub<ArraySize(aSub); iSub++){
if( nList & (1<<iSub) ){
- struct Sublist *p = &aSub[iSub];
+ struct Sublist *p;
+ assert( iSub<ArraySize(aSub) );
+ p = &aSub[iSub];
assert( p->nList<=(1<<iSub) );
assert( p->aList==&aList[nList&~((2<<iSub)-1)] );
walMerge(aContent, p->aList, p->nList, &aMerge, &nMerge, aBuffer);
@@ -1504,7 +1533,7 @@ static void walMergesort(
** Free an iterator allocated by walIteratorInit().
*/
static void walIteratorFree(WalIterator *p){
- sqlite3ScratchFree(p);
+ sqlite3_free(p);
}
/*
@@ -1539,7 +1568,7 @@ static int walIteratorInit(Wal *pWal, WalIterator **pp){
nByte = sizeof(WalIterator)
+ (nSegment-1)*sizeof(struct WalSegment)
+ iLast*sizeof(ht_slot);
- p = (WalIterator *)sqlite3ScratchMalloc(nByte);
+ p = (WalIterator *)sqlite3_malloc64(nByte);
if( !p ){
return SQLITE_NOMEM;
}
@@ -1549,7 +1578,7 @@ static int walIteratorInit(Wal *pWal, WalIterator **pp){
/* Allocate temporary space used by the merge-sort routine. This block
** of memory will be freed before this function returns.
*/
- aTmp = (ht_slot *)sqlite3ScratchMalloc(
+ aTmp = (ht_slot *)sqlite3_malloc64(
sizeof(ht_slot) * (iLast>HASHTABLE_NPAGE?HASHTABLE_NPAGE:iLast)
);
if( !aTmp ){
@@ -1586,7 +1615,7 @@ static int walIteratorInit(Wal *pWal, WalIterator **pp){
p->aSegment[i].aPgno = (u32 *)aPgno;
}
}
- sqlite3ScratchFree(aTmp);
+ sqlite3_free(aTmp);
if( rc!=SQLITE_OK ){
walIteratorFree(p);
@@ -1624,6 +1653,39 @@ static int walPagesize(Wal *pWal){
}
/*
+** The following is guaranteed when this function is called:
+**
+** a) the WRITER lock is held,
+** b) the entire log file has been checkpointed, and
+** c) any existing readers are reading exclusively from the database
+** file - there are no readers that may attempt to read a frame from
+** the log file.
+**
+** This function updates the shared-memory structures so that the next
+** client to write to the database (which may be this one) does so by
+** writing frames into the start of the log file.
+**
+** The value of parameter salt1 is used as the aSalt[1] value in the
+** new wal-index header. It should be passed a pseudo-random value (i.e.
+** one obtained from sqlite3_randomness()).
+*/
+static void walRestartHdr(Wal *pWal, u32 salt1){
+ volatile WalCkptInfo *pInfo = walCkptInfo(pWal);
+ int i; /* Loop counter */
+ u32 *aSalt = pWal->hdr.aSalt; /* Big-endian salt values */
+ pWal->nCkpt++;
+ pWal->hdr.mxFrame = 0;
+ sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0]));
+ memcpy(&pWal->hdr.aSalt[1], &salt1, 4);
+ walIndexWriteHdr(pWal);
+ pInfo->nBackfill = 0;
+ pInfo->nBackfillAttempted = 0;
+ pInfo->aReadMark[1] = 0;
+ for(i=2; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;
+ assert( pInfo->aReadMark[0]==0 );
+}
+
+/*
** Copy as much content as we can from the WAL back into the database file
** in response to an sqlite3_wal_checkpoint() request or the equivalent.
**
@@ -1657,12 +1719,12 @@ static int walPagesize(Wal *pWal){
static int walCheckpoint(
Wal *pWal, /* Wal connection */
int eMode, /* One of PASSIVE, FULL or RESTART */
- int (*xBusyCall)(void*), /* Function to call when busy */
+ int (*xBusy)(void*), /* Function to call when busy */
void *pBusyArg, /* Context argument for xBusyHandler */
int sync_flags, /* Flags for OsSync() (or 0) */
u8 *zBuf /* Temporary buffer to use */
){
- int rc; /* Return code */
+ int rc = SQLITE_OK; /* Return code */
int szPage; /* Database page-size */
WalIterator *pIter = 0; /* Wal iterator context */
u32 iDbpage = 0; /* Next database page to write */
@@ -1671,123 +1733,156 @@ static int walCheckpoint(
u32 mxPage; /* Max database page to write */
int i; /* Loop counter */
volatile WalCkptInfo *pInfo; /* The checkpoint status information */
- int (*xBusy)(void*) = 0; /* Function to call when waiting for locks */
szPage = walPagesize(pWal);
testcase( szPage<=32768 );
testcase( szPage>=65536 );
pInfo = walCkptInfo(pWal);
- if( pInfo->nBackfill>=pWal->hdr.mxFrame ) return SQLITE_OK;
+ if( pInfo->nBackfill<pWal->hdr.mxFrame ){
- /* Allocate the iterator */
- rc = walIteratorInit(pWal, &pIter);
- if( rc!=SQLITE_OK ){
- return rc;
- }
- assert( pIter );
+ /* Allocate the iterator */
+ rc = walIteratorInit(pWal, &pIter);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+ assert( pIter );
- if( eMode!=SQLITE_CHECKPOINT_PASSIVE ) xBusy = xBusyCall;
+ /* EVIDENCE-OF: R-62920-47450 The busy-handler callback is never invoked
+ ** in the SQLITE_CHECKPOINT_PASSIVE mode. */
+ assert( eMode!=SQLITE_CHECKPOINT_PASSIVE || xBusy==0 );
- /* Compute in mxSafeFrame the index of the last frame of the WAL that is
- ** safe to write into the database. Frames beyond mxSafeFrame might
- ** overwrite database pages that are in use by active readers and thus
- ** cannot be backfilled from the WAL.
- */
- mxSafeFrame = pWal->hdr.mxFrame;
- mxPage = pWal->hdr.nPage;
- for(i=1; i<WAL_NREADER; i++){
- u32 y = pInfo->aReadMark[i];
- if( mxSafeFrame>y ){
- assert( y<=pWal->hdr.mxFrame );
- rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(i), 1);
- if( rc==SQLITE_OK ){
- pInfo->aReadMark[i] = (i==1 ? mxSafeFrame : READMARK_NOT_USED);
- walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
- }else if( rc==SQLITE_BUSY ){
- mxSafeFrame = y;
- xBusy = 0;
- }else{
- goto walcheckpoint_out;
+ /* Compute in mxSafeFrame the index of the last frame of the WAL that is
+ ** safe to write into the database. Frames beyond mxSafeFrame might
+ ** overwrite database pages that are in use by active readers and thus
+ ** cannot be backfilled from the WAL.
+ */
+ mxSafeFrame = pWal->hdr.mxFrame;
+ mxPage = pWal->hdr.nPage;
+ for(i=1; i<WAL_NREADER; i++){
+ /* Thread-sanitizer reports that the following is an unsafe read,
+ ** as some other thread may be in the process of updating the value
+ ** of the aReadMark[] slot. The assumption here is that if that is
+ ** happening, the other client may only be increasing the value,
+ ** not decreasing it. So assuming either that either the "old" or
+ ** "new" version of the value is read, and not some arbitrary value
+ ** that would never be written by a real client, things are still
+ ** safe. */
+ u32 y = pInfo->aReadMark[i];
+ if( mxSafeFrame>y ){
+ assert( y<=pWal->hdr.mxFrame );
+ rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(i), 1);
+ if( rc==SQLITE_OK ){
+ pInfo->aReadMark[i] = (i==1 ? mxSafeFrame : READMARK_NOT_USED);
+ walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
+ }else if( rc==SQLITE_BUSY ){
+ mxSafeFrame = y;
+ xBusy = 0;
+ }else{
+ goto walcheckpoint_out;
+ }
}
}
- }
- if( pInfo->nBackfill<mxSafeFrame
- && (rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(0), 1))==SQLITE_OK
- ){
- i64 nSize; /* Current size of database file */
- u32 nBackfill = pInfo->nBackfill;
+ if( pInfo->nBackfill<mxSafeFrame
+ && (rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(0),1))==SQLITE_OK
+ ){
+ i64 nSize; /* Current size of database file */
+ u32 nBackfill = pInfo->nBackfill;
- /* Sync the WAL to disk */
- if( sync_flags ){
- rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
- }
+ pInfo->nBackfillAttempted = mxSafeFrame;
- /* If the database may grow as a result of this checkpoint, hint
- ** about the eventual size of the db file to the VFS layer.
- */
- if( rc==SQLITE_OK ){
- i64 nReq = ((i64)mxPage * szPage);
- rc = sqlite3OsFileSize(pWal->pDbFd, &nSize);
- if( rc==SQLITE_OK && nSize<nReq ){
- sqlite3OsFileControlHint(pWal->pDbFd, SQLITE_FCNTL_SIZE_HINT, &nReq);
+ /* Sync the WAL to disk */
+ if( sync_flags ){
+ rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
}
- }
+ /* If the database may grow as a result of this checkpoint, hint
+ ** about the eventual size of the db file to the VFS layer.
+ */
+ if( rc==SQLITE_OK ){
+ i64 nReq = ((i64)mxPage * szPage);
+ rc = sqlite3OsFileSize(pWal->pDbFd, &nSize);
+ if( rc==SQLITE_OK && nSize<nReq ){
+ sqlite3OsFileControlHint(pWal->pDbFd, SQLITE_FCNTL_SIZE_HINT, &nReq);
+ }
+ }
- /* Iterate through the contents of the WAL, copying data to the db file. */
- while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
- i64 iOffset;
- assert( walFramePgno(pWal, iFrame)==iDbpage );
- if( iFrame<=nBackfill || iFrame>mxSafeFrame || iDbpage>mxPage ) continue;
- iOffset = walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE;
- /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL file */
- rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, iOffset);
- if( rc!=SQLITE_OK ) break;
- iOffset = (iDbpage-1)*(i64)szPage;
- testcase( IS_BIG_INT(iOffset) );
- rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, iOffset);
- if( rc!=SQLITE_OK ) break;
- }
- /* If work was actually accomplished... */
- if( rc==SQLITE_OK ){
- if( mxSafeFrame==walIndexHdr(pWal)->mxFrame ){
- i64 szDb = pWal->hdr.nPage*(i64)szPage;
- testcase( IS_BIG_INT(szDb) );
- rc = sqlite3OsTruncate(pWal->pDbFd, szDb);
- if( rc==SQLITE_OK && sync_flags ){
- rc = sqlite3OsSync(pWal->pDbFd, sync_flags);
+ /* Iterate through the contents of the WAL, copying data to the db file */
+ while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
+ i64 iOffset;
+ assert( walFramePgno(pWal, iFrame)==iDbpage );
+ if( iFrame<=nBackfill || iFrame>mxSafeFrame || iDbpage>mxPage ){
+ continue;
}
+ iOffset = walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE;
+ /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL file */
+ rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, iOffset);
+ if( rc!=SQLITE_OK ) break;
+ iOffset = (iDbpage-1)*(i64)szPage;
+ testcase( IS_BIG_INT(iOffset) );
+ rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, iOffset);
+ if( rc!=SQLITE_OK ) break;
}
+
+ /* If work was actually accomplished... */
if( rc==SQLITE_OK ){
- pInfo->nBackfill = mxSafeFrame;
+ if( mxSafeFrame==walIndexHdr(pWal)->mxFrame ){
+ i64 szDb = pWal->hdr.nPage*(i64)szPage;
+ testcase( IS_BIG_INT(szDb) );
+ rc = sqlite3OsTruncate(pWal->pDbFd, szDb);
+ if( rc==SQLITE_OK && sync_flags ){
+ rc = sqlite3OsSync(pWal->pDbFd, sync_flags);
+ }
+ }
+ if( rc==SQLITE_OK ){
+ pInfo->nBackfill = mxSafeFrame;
+ }
}
- }
- /* Release the reader lock held while backfilling */
- walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1);
- }
+ /* Release the reader lock held while backfilling */
+ walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1);
+ }
- if( rc==SQLITE_BUSY ){
- /* Reset the return code so as not to report a checkpoint failure
- ** just because there are active readers. */
- rc = SQLITE_OK;
+ if( rc==SQLITE_BUSY ){
+ /* Reset the return code so as not to report a checkpoint failure
+ ** just because there are active readers. */
+ rc = SQLITE_OK;
+ }
}
- /* If this is an SQLITE_CHECKPOINT_RESTART operation, and the entire wal
- ** file has been copied into the database file, then block until all
- ** readers have finished using the wal file. This ensures that the next
- ** process to write to the database restarts the wal file.
+ /* If this is an SQLITE_CHECKPOINT_RESTART or TRUNCATE operation, and the
+ ** entire wal file has been copied into the database file, then block
+ ** until all readers have finished using the wal file. This ensures that
+ ** the next process to write to the database restarts the wal file.
*/
if( rc==SQLITE_OK && eMode!=SQLITE_CHECKPOINT_PASSIVE ){
assert( pWal->writeLock );
if( pInfo->nBackfill<pWal->hdr.mxFrame ){
rc = SQLITE_BUSY;
- }else if( eMode==SQLITE_CHECKPOINT_RESTART ){
- assert( mxSafeFrame==pWal->hdr.mxFrame );
+ }else if( eMode>=SQLITE_CHECKPOINT_RESTART ){
+ u32 salt1;
+ sqlite3_randomness(4, &salt1);
+ assert( pInfo->nBackfill==pWal->hdr.mxFrame );
rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(1), WAL_NREADER-1);
if( rc==SQLITE_OK ){
+ if( eMode==SQLITE_CHECKPOINT_TRUNCATE ){
+ /* IMPLEMENTATION-OF: R-44699-57140 This mode works the same way as
+ ** SQLITE_CHECKPOINT_RESTART with the addition that it also
+ ** truncates the log file to zero bytes just prior to a
+ ** successful return.
+ **
+ ** In theory, it might be safe to do this without updating the
+ ** wal-index header in shared memory, as all subsequent reader or
+ ** writer clients should see that the entire log file has been
+ ** checkpointed and behave accordingly. This seems unsafe though,
+ ** as it would leave the system in a state where the contents of
+ ** the wal-index header do not match the contents of the
+ ** file-system. To avoid this, update the wal-index header to
+ ** indicate that the log file contains zero valid frames. */
+ walRestartHdr(pWal, salt1);
+ rc = sqlite3OsTruncate(pWal->pWalFd, 0);
+ }
walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
}
}
@@ -2079,6 +2174,7 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
int mxI; /* Index of largest aReadMark[] value */
int i; /* Loop counter */
int rc = SQLITE_OK; /* Return code */
+ u32 mxFrame; /* Wal frame to lock to */
assert( pWal->readLock<0 ); /* Not currently locked */
@@ -2142,7 +2238,12 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
}
pInfo = walCkptInfo(pWal);
- if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame ){
+ if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame
+#ifdef SQLITE_ENABLE_SNAPSHOT
+ && (pWal->pSnapshot==0 || pWal->hdr.mxFrame==0
+ || 0==memcmp(&pWal->hdr, pWal->pSnapshot, sizeof(WalIndexHdr)))
+#endif
+ ){
/* The WAL has been completely backfilled (or it is empty).
** and can be safely ignored.
*/
@@ -2180,70 +2281,88 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
*/
mxReadMark = 0;
mxI = 0;
+ mxFrame = pWal->hdr.mxFrame;
+#ifdef SQLITE_ENABLE_SNAPSHOT
+ if( pWal->pSnapshot && pWal->pSnapshot->mxFrame<mxFrame ){
+ mxFrame = pWal->pSnapshot->mxFrame;
+ }
+#endif
for(i=1; i<WAL_NREADER; i++){
u32 thisMark = pInfo->aReadMark[i];
- if( mxReadMark<=thisMark && thisMark<=pWal->hdr.mxFrame ){
+ if( mxReadMark<=thisMark && thisMark<=mxFrame ){
assert( thisMark!=READMARK_NOT_USED );
mxReadMark = thisMark;
mxI = i;
}
}
- /* There was once an "if" here. The extra "{" is to preserve indentation. */
- {
- if( (pWal->readOnly & WAL_SHM_RDONLY)==0
- && (mxReadMark<pWal->hdr.mxFrame || mxI==0)
- ){
- for(i=1; i<WAL_NREADER; i++){
- rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
- if( rc==SQLITE_OK ){
- mxReadMark = pInfo->aReadMark[i] = pWal->hdr.mxFrame;
- mxI = i;
- walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
- break;
- }else if( rc!=SQLITE_BUSY ){
- return rc;
- }
+ if( (pWal->readOnly & WAL_SHM_RDONLY)==0
+ && (mxReadMark<mxFrame || mxI==0)
+ ){
+ for(i=1; i<WAL_NREADER; i++){
+ rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
+ if( rc==SQLITE_OK ){
+ mxReadMark = pInfo->aReadMark[i] = mxFrame;
+ mxI = i;
+ walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
+ break;
+ }else if( rc!=SQLITE_BUSY ){
+ return rc;
}
}
- if( mxI==0 ){
- assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 );
- return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTLOCK;
- }
+ }
+ if( mxI==0 ){
+ assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 );
+ return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTLOCK;
+ }
- rc = walLockShared(pWal, WAL_READ_LOCK(mxI));
- if( rc ){
- return rc==SQLITE_BUSY ? WAL_RETRY : rc;
- }
- /* Now that the read-lock has been obtained, check that neither the
- ** value in the aReadMark[] array or the contents of the wal-index
- ** header have changed.
- **
- ** It is necessary to check that the wal-index header did not change
- ** between the time it was read and when the shared-lock was obtained
- ** on WAL_READ_LOCK(mxI) was obtained to account for the possibility
- ** that the log file may have been wrapped by a writer, or that frames
- ** that occur later in the log than pWal->hdr.mxFrame may have been
- ** copied into the database by a checkpointer. If either of these things
- ** happened, then reading the database with the current value of
- ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry
- ** instead.
- **
- ** This does not guarantee that the copy of the wal-index header is up to
- ** date before proceeding. That would not be possible without somehow
- ** blocking writers. It only guarantees that a dangerous checkpoint or
- ** log-wrap (either of which would require an exclusive lock on
- ** WAL_READ_LOCK(mxI)) has not occurred since the snapshot was valid.
- */
- walShmBarrier(pWal);
- if( pInfo->aReadMark[mxI]!=mxReadMark
- || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr))
- ){
- walUnlockShared(pWal, WAL_READ_LOCK(mxI));
- return WAL_RETRY;
- }else{
- assert( mxReadMark<=pWal->hdr.mxFrame );
- pWal->readLock = (i16)mxI;
- }
+ rc = walLockShared(pWal, WAL_READ_LOCK(mxI));
+ if( rc ){
+ return rc==SQLITE_BUSY ? WAL_RETRY : rc;
+ }
+ /* Now that the read-lock has been obtained, check that neither the
+ ** value in the aReadMark[] array or the contents of the wal-index
+ ** header have changed.
+ **
+ ** It is necessary to check that the wal-index header did not change
+ ** between the time it was read and when the shared-lock was obtained
+ ** on WAL_READ_LOCK(mxI) was obtained to account for the possibility
+ ** that the log file may have been wrapped by a writer, or that frames
+ ** that occur later in the log than pWal->hdr.mxFrame may have been
+ ** copied into the database by a checkpointer. If either of these things
+ ** happened, then reading the database with the current value of
+ ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry
+ ** instead.
+ **
+ ** Before checking that the live wal-index header has not changed
+ ** since it was read, set Wal.minFrame to the first frame in the wal
+ ** file that has not yet been checkpointed. This client will not need
+ ** to read any frames earlier than minFrame from the wal file - they
+ ** can be safely read directly from the database file.
+ **
+ ** Because a ShmBarrier() call is made between taking the copy of
+ ** nBackfill and checking that the wal-header in shared-memory still
+ ** matches the one cached in pWal->hdr, it is guaranteed that the
+ ** checkpointer that set nBackfill was not working with a wal-index
+ ** header newer than that cached in pWal->hdr. If it were, that could
+ ** cause a problem. The checkpointer could omit to checkpoint
+ ** a version of page X that lies before pWal->minFrame (call that version
+ ** A) on the basis that there is a newer version (version B) of the same
+ ** page later in the wal file. But if version B happens to like past
+ ** frame pWal->hdr.mxFrame - then the client would incorrectly assume
+ ** that it can read version A from the database file. However, since
+ ** we can guarantee that the checkpointer that set nBackfill could not
+ ** see any pages past pWal->hdr.mxFrame, this problem does not come up.
+ */
+ pWal->minFrame = pInfo->nBackfill+1;
+ walShmBarrier(pWal);
+ if( pInfo->aReadMark[mxI]!=mxReadMark
+ || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr))
+ ){
+ walUnlockShared(pWal, WAL_READ_LOCK(mxI));
+ return WAL_RETRY;
+ }else{
+ assert( mxReadMark<=pWal->hdr.mxFrame );
+ pWal->readLock = (i16)mxI;
}
return rc;
}
@@ -2266,6 +2385,14 @@ int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){
int rc; /* Return code */
int cnt = 0; /* Number of TryBeginRead attempts */
+#ifdef SQLITE_ENABLE_SNAPSHOT
+ int bChanged = 0;
+ WalIndexHdr *pSnapshot = pWal->pSnapshot;
+ if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){
+ bChanged = 1;
+ }
+#endif
+
do{
rc = walTryBeginRead(pWal, pChanged, 0, ++cnt);
}while( rc==WAL_RETRY );
@@ -2273,6 +2400,66 @@ int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){
testcase( (rc&0xff)==SQLITE_IOERR );
testcase( rc==SQLITE_PROTOCOL );
testcase( rc==SQLITE_OK );
+
+#ifdef SQLITE_ENABLE_SNAPSHOT
+ if( rc==SQLITE_OK ){
+ if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){
+ /* At this point the client has a lock on an aReadMark[] slot holding
+ ** a value equal to or smaller than pSnapshot->mxFrame, but pWal->hdr
+ ** is populated with the wal-index header corresponding to the head
+ ** of the wal file. Verify that pSnapshot is still valid before
+ ** continuing. Reasons why pSnapshot might no longer be valid:
+ **
+ ** (1) The WAL file has been reset since the snapshot was taken.
+ ** In this case, the salt will have changed.
+ **
+ ** (2) A checkpoint as been attempted that wrote frames past
+ ** pSnapshot->mxFrame into the database file. Note that the
+ ** checkpoint need not have completed for this to cause problems.
+ */
+ volatile WalCkptInfo *pInfo = walCkptInfo(pWal);
+
+ assert( pWal->readLock>0 || pWal->hdr.mxFrame==0 );
+ assert( pInfo->aReadMark[pWal->readLock]<=pSnapshot->mxFrame );
+
+ /* It is possible that there is a checkpointer thread running
+ ** concurrent with this code. If this is the case, it may be that the
+ ** checkpointer has already determined that it will checkpoint
+ ** snapshot X, where X is later in the wal file than pSnapshot, but
+ ** has not yet set the pInfo->nBackfillAttempted variable to indicate
+ ** its intent. To avoid the race condition this leads to, ensure that
+ ** there is no checkpointer process by taking a shared CKPT lock
+ ** before checking pInfo->nBackfillAttempted. */
+ rc = walLockShared(pWal, WAL_CKPT_LOCK);
+
+ if( rc==SQLITE_OK ){
+ /* Check that the wal file has not been wrapped. Assuming that it has
+ ** not, also check that no checkpointer has attempted to checkpoint any
+ ** frames beyond pSnapshot->mxFrame. If either of these conditions are
+ ** true, return SQLITE_BUSY_SNAPSHOT. Otherwise, overwrite pWal->hdr
+ ** with *pSnapshot and set *pChanged as appropriate for opening the
+ ** snapshot. */
+ if( !memcmp(pSnapshot->aSalt, pWal->hdr.aSalt, sizeof(pWal->hdr.aSalt))
+ && pSnapshot->mxFrame>=pInfo->nBackfillAttempted
+ ){
+ assert( pWal->readLock>0 );
+ memcpy(&pWal->hdr, pSnapshot, sizeof(WalIndexHdr));
+ *pChanged = bChanged;
+ }else{
+ rc = SQLITE_BUSY_SNAPSHOT;
+ }
+
+ /* Release the shared CKPT lock obtained above. */
+ walUnlockShared(pWal, WAL_CKPT_LOCK);
+ }
+
+
+ if( rc!=SQLITE_OK ){
+ sqlite3WalEndReadTransaction(pWal);
+ }
+ }
+ }
+#endif
return rc;
}
@@ -2304,6 +2491,7 @@ int sqlite3WalFindFrame(
u32 iRead = 0; /* If !=0, WAL frame to return data from */
u32 iLast = pWal->hdr.mxFrame; /* Last page in WAL for this reader */
int iHash; /* Used to loop through N hash tables */
+ int iMinHash;
/* This routine is only be called from within a read transaction. */
assert( pWal->readLock>=0 || pWal->lockError );
@@ -2344,7 +2532,8 @@ int sqlite3WalFindFrame(
** This condition filters out entries that were added to the hash
** table after the current read-transaction had started.
*/
- for(iHash=walFramePage(iLast); iHash>=0 && iRead==0; iHash--){
+ iMinHash = walFramePage(pWal->minFrame);
+ for(iHash=walFramePage(iLast); iHash>=iMinHash && iRead==0; iHash--){
volatile ht_slot *aHash; /* Pointer to hash table */
volatile u32 *aPgno; /* Pointer to array of page numbers */
u32 iZero; /* Frame number corresponding to aPgno[0] */
@@ -2359,8 +2548,8 @@ int sqlite3WalFindFrame(
nCollide = HASHTABLE_NSLOT;
for(iKey=walHash(pgno); aHash[iKey]; iKey=walNextHash(iKey)){
u32 iFrame = aHash[iKey] + iZero;
- if( iFrame<=iLast && aPgno[aHash[iKey]]==pgno ){
- /* assert( iFrame>iRead ); -- not true if there is corruption */
+ if( iFrame<=iLast && iFrame>=pWal->minFrame && aPgno[aHash[iKey]]==pgno ){
+ assert( iFrame>iRead || CORRUPT_DB );
iRead = iFrame;
}
if( (nCollide--)==0 ){
@@ -2376,7 +2565,8 @@ int sqlite3WalFindFrame(
{
u32 iRead2 = 0;
u32 iTest;
- for(iTest=iLast; iTest>0; iTest--){
+ assert( pWal->minFrame>0 );
+ for(iTest=iLast; iTest>=pWal->minFrame; iTest--){
if( walFramePgno(pWal, iTest)==pgno ){
iRead2 = iTest;
break;
@@ -2573,7 +2763,6 @@ int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){
return rc;
}
-
/*
** This function is called just before writing a set of frames to the log
** file (see sqlite3WalFrames()). It checks to see if, instead of appending
@@ -2606,20 +2795,8 @@ static int walRestartLog(Wal *pWal){
** In theory it would be Ok to update the cache of the header only
** at this point. But updating the actual wal-index header is also
** safe and means there is no special case for sqlite3WalUndo()
- ** to handle if this transaction is rolled back.
- */
- int i; /* Loop counter */
- u32 *aSalt = pWal->hdr.aSalt; /* Big-endian salt values */
-
- pWal->nCkpt++;
- pWal->hdr.mxFrame = 0;
- sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0]));
- aSalt[1] = salt1;
- walIndexWriteHdr(pWal);
- pInfo->nBackfill = 0;
- pInfo->aReadMark[1] = 0;
- for(i=2; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;
- assert( pInfo->aReadMark[0]==0 );
+ ** to handle if this transaction is rolled back. */
+ walRestartHdr(pWal, salt1);
walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
}else if( rc!=SQLITE_BUSY ){
return rc;
@@ -2907,7 +3084,7 @@ int sqlite3WalFrames(
*/
int sqlite3WalCheckpoint(
Wal *pWal, /* Wal connection */
- int eMode, /* PASSIVE, FULL or RESTART */
+ int eMode, /* PASSIVE, FULL, RESTART, or TRUNCATE */
int (*xBusy)(void*), /* Function to call when busy */
void *pBusyArg, /* Context argument for xBusyHandler */
int sync_flags, /* Flags to sync db file with (or 0) */
@@ -2919,29 +3096,42 @@ int sqlite3WalCheckpoint(
int rc; /* Return code */
int isChanged = 0; /* True if a new wal-index header is loaded */
int eMode2 = eMode; /* Mode to pass to walCheckpoint() */
+ int (*xBusy2)(void*) = xBusy; /* Busy handler for eMode2 */
assert( pWal->ckptLock==0 );
assert( pWal->writeLock==0 );
+ /* EVIDENCE-OF: R-62920-47450 The busy-handler callback is never invoked
+ ** in the SQLITE_CHECKPOINT_PASSIVE mode. */
+ assert( eMode!=SQLITE_CHECKPOINT_PASSIVE || xBusy==0 );
+
if( pWal->readOnly ) return SQLITE_READONLY;
WALTRACE(("WAL%p: checkpoint begins\n", pWal));
+
+ /* IMPLEMENTATION-OF: R-62028-47212 All calls obtain an exclusive
+ ** "checkpoint" lock on the database file. */
rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1);
if( rc ){
- /* Usually this is SQLITE_BUSY meaning that another thread or process
- ** is already running a checkpoint, or maybe a recovery. But it might
- ** also be SQLITE_IOERR. */
+ /* EVIDENCE-OF: R-10421-19736 If any other process is running a
+ ** checkpoint operation at the same time, the lock cannot be obtained and
+ ** SQLITE_BUSY is returned.
+ ** EVIDENCE-OF: R-53820-33897 Even if there is a busy-handler configured,
+ ** it will not be invoked in this case.
+ */
+ testcase( rc==SQLITE_BUSY );
+ testcase( xBusy!=0 );
return rc;
}
pWal->ckptLock = 1;
- /* If this is a blocking-checkpoint, then obtain the write-lock as well
- ** to prevent any writers from running while the checkpoint is underway.
- ** This has to be done before the call to walIndexReadHdr() below.
+ /* IMPLEMENTATION-OF: R-59782-36818 The SQLITE_CHECKPOINT_FULL, RESTART and
+ ** TRUNCATE modes also obtain the exclusive "writer" lock on the database
+ ** file.
**
- ** If the writer lock cannot be obtained, then a passive checkpoint is
- ** run instead. Since the checkpointer is not holding the writer lock,
- ** there is no point in blocking waiting for any readers. Assuming no
- ** other error occurs, this function will return SQLITE_BUSY to the caller.
+ ** EVIDENCE-OF: R-60642-04082 If the writer lock cannot be obtained
+ ** immediately, and a busy-handler is configured, it is invoked and the
+ ** writer lock retried until either the busy-handler returns 0 or the
+ ** lock is successfully obtained.
*/
if( eMode!=SQLITE_CHECKPOINT_PASSIVE ){
rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_WRITE_LOCK, 1);
@@ -2949,6 +3139,7 @@ int sqlite3WalCheckpoint(
pWal->writeLock = 1;
}else if( rc==SQLITE_BUSY ){
eMode2 = SQLITE_CHECKPOINT_PASSIVE;
+ xBusy2 = 0;
rc = SQLITE_OK;
}
}
@@ -2966,7 +3157,7 @@ int sqlite3WalCheckpoint(
if( pWal->hdr.mxFrame && walPagesize(pWal)!=nBuf ){
rc = SQLITE_CORRUPT_BKPT;
}else{
- rc = walCheckpoint(pWal, eMode2, xBusy, pBusyArg, sync_flags, zBuf);
+ rc = walCheckpoint(pWal, eMode2, xBusy2, pBusyArg, sync_flags, zBuf);
}
/* If no error occurred, set the output variables. */
@@ -3078,6 +3269,35 @@ int sqlite3WalHeapMemory(Wal *pWal){
return (pWal && pWal->exclusiveMode==WAL_HEAPMEMORY_MODE );
}
+#ifdef SQLITE_ENABLE_SNAPSHOT
+/* Create a snapshot object. The content of a snapshot is opaque to
+** every other subsystem, so the WAL module can put whatever it needs
+** in the object.
+*/
+int sqlite3WalSnapshotGet(Wal *pWal, sqlite3_snapshot **ppSnapshot){
+ int rc = SQLITE_OK;
+ WalIndexHdr *pRet;
+
+ assert( pWal->readLock>=0 && pWal->writeLock==0 );
+
+ pRet = (WalIndexHdr*)sqlite3_malloc(sizeof(WalIndexHdr));
+ if( pRet==0 ){
+ rc = SQLITE_NOMEM;
+ }else{
+ memcpy(pRet, &pWal->hdr, sizeof(WalIndexHdr));
+ *ppSnapshot = (sqlite3_snapshot*)pRet;
+ }
+
+ return rc;
+}
+
+/* Try to open on pSnapshot when the next read-transaction starts
+*/
+void sqlite3WalSnapshotOpen(Wal *pWal, sqlite3_snapshot *pSnapshot){
+ pWal->pSnapshot = (WalIndexHdr*)pSnapshot;
+}
+#endif /* SQLITE_ENABLE_SNAPSHOT */
+
#ifdef SQLITE_ENABLE_ZIPVFS
/*
** If the argument is not NULL, it points to a Wal object that holds a
@@ -3090,4 +3310,10 @@ int sqlite3WalFramesize(Wal *pWal){
}
#endif
+/* Return the sqlite3_file object for the WAL file
+*/
+sqlite3_file *sqlite3WalFile(Wal *pWal){
+ return pWal->pWalFd;
+}
+
#endif /* #ifndef SQLITE_OMIT_WAL */
« no previous file with comments | « third_party/sqlite/src/src/wal.h ('k') | third_party/sqlite/src/src/walker.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698