| Index: third_party/sqlite/src/src/wal.c
|
| diff --git a/third_party/sqlite/src/src/wal.c b/third_party/sqlite/src/src/wal.c
|
| index 51ea18fb21677476c9f4ac626c41c1b1a8ef6267..d134a8b52a31089b555ada725aa259c94646286c 100644
|
| --- a/third_party/sqlite/src/src/wal.c
|
| +++ b/third_party/sqlite/src/src/wal.c
|
| @@ -142,14 +142,15 @@
|
| ** byte order of the host computer.
|
| **
|
| ** The purpose of the wal-index is to answer this question quickly: Given
|
| -** a page number P, return the index of the last frame for page P in the WAL,
|
| -** or return NULL if there are no frames for page P in the WAL.
|
| +** a page number P and a maximum frame index M, return the index of the
|
| +** last frame for page P in the WAL prior to frame M, or return NULL if
|
| +** there are no frames for page P in the WAL prior to M.
|
| **
|
| ** The wal-index consists of a header region, followed by an one or
|
| ** more index blocks.
|
| **
|
| ** The wal-index header contains the total number of frames within the WAL
|
| -** in the the mxFrame field.
|
| +** in the mxFrame field.
|
| **
|
| ** Each index block except for the first contains information on
|
| ** HASHTABLE_NPAGE frames. The first index block contains information on
|
| @@ -412,14 +413,20 @@ struct Wal {
|
| sqlite3_file *pDbFd; /* File handle for the database file */
|
| sqlite3_file *pWalFd; /* File handle for WAL file */
|
| u32 iCallback; /* Value to pass to log callback (or 0) */
|
| + i64 mxWalSize; /* Truncate WAL to this size upon reset */
|
| int nWiData; /* Size of array apWiData */
|
| + int szFirstBlock; /* Size of first block written to WAL file */
|
| volatile u32 **apWiData; /* Pointer to wal-index content in memory */
|
| u32 szPage; /* Database page size */
|
| i16 readLock; /* Which read lock is being held. -1 for none */
|
| + u8 syncFlags; /* Flags to use to sync header writes */
|
| u8 exclusiveMode; /* Non-zero if connection is in exclusive mode */
|
| u8 writeLock; /* True if in a write transaction */
|
| u8 ckptLock; /* True if holding a checkpoint lock */
|
| - u8 readOnly; /* True if the WAL file is open read-only */
|
| + u8 readOnly; /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */
|
| + u8 truncateOnCommit; /* True to truncate WAL file on commit */
|
| + u8 syncHeader; /* Fsync the WAL header if true */
|
| + u8 padToSectorBoundary; /* Pad transactions out to the next sector */
|
| WalIndexHdr hdr; /* Wal-index header for current transaction */
|
| const char *zWalName; /* Name of WAL file */
|
| u32 nCkpt; /* Checkpoint sequence counter in the wal-header */
|
| @@ -436,6 +443,13 @@ struct Wal {
|
| #define WAL_HEAPMEMORY_MODE 2
|
|
|
| /*
|
| +** Possible values for WAL.readOnly
|
| +*/
|
| +#define WAL_RDWR 0 /* Normal read/write connection */
|
| +#define WAL_RDONLY 1 /* The WAL file is readonly */
|
| +#define WAL_SHM_RDONLY 2 /* The SHM file is readonly */
|
| +
|
| +/*
|
| ** Each page of the wal-index mapping contains a hash-table made up of
|
| ** an array of HASHTABLE_NSLOT elements of the following type.
|
| */
|
| @@ -528,6 +542,10 @@ static int walIndexPage(Wal *pWal, int iPage, volatile u32 **ppPage){
|
| rc = sqlite3OsShmMap(pWal->pDbFd, iPage, WALINDEX_PGSZ,
|
| pWal->writeLock, (void volatile **)&pWal->apWiData[iPage]
|
| );
|
| + if( rc==SQLITE_READONLY ){
|
| + pWal->readOnly |= WAL_SHM_RDONLY;
|
| + rc = SQLITE_OK;
|
| + }
|
| }
|
| }
|
|
|
| @@ -556,7 +574,7 @@ static volatile WalIndexHdr *walIndexHdr(Wal *pWal){
|
| ** The argument to this macro must be of type u32. On a little-endian
|
| ** architecture, it returns the u32 value that results from interpreting
|
| ** the 4 bytes as a big-endian value. On a big-endian architecture, it
|
| -** returns the value that would be produced by intepreting the 4 bytes
|
| +** returns the value that would be produced by interpreting the 4 bytes
|
| ** of the input value as a little-endian integer.
|
| */
|
| #define BYTESWAP32(x) ( \
|
| @@ -970,7 +988,7 @@ static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
|
| assert( idx <= HASHTABLE_NSLOT/2 + 1 );
|
|
|
| /* If this is the first entry to be added to this hash-table, zero the
|
| - ** entire hash table and aPgno[] array before proceding.
|
| + ** entire hash table and aPgno[] array before proceeding.
|
| */
|
| if( idx==1 ){
|
| int nByte = (int)((u8 *)&aHash[HASHTABLE_NSLOT] - (u8 *)&aPgno[1]);
|
| @@ -1081,6 +1099,7 @@ static int walIndexRecover(Wal *pWal){
|
| int szPage; /* Page size according to the log */
|
| u32 magic; /* Magic value read from WAL header */
|
| u32 version; /* Magic value read from WAL header */
|
| + int isValid; /* True if this frame is valid */
|
|
|
| /* Read in the WAL header. */
|
| rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0);
|
| @@ -1139,14 +1158,14 @@ static int walIndexRecover(Wal *pWal){
|
| for(iOffset=WAL_HDRSIZE; (iOffset+szFrame)<=nSize; iOffset+=szFrame){
|
| u32 pgno; /* Database page number for frame */
|
| u32 nTruncate; /* dbsize field from frame header */
|
| - int isValid; /* True if this frame is valid */
|
|
|
| /* Read and decode the next log frame. */
|
| + iFrame++;
|
| rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset);
|
| if( rc!=SQLITE_OK ) break;
|
| isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame);
|
| if( !isValid ) break;
|
| - rc = walIndexAppend(pWal, ++iFrame, pgno);
|
| + rc = walIndexAppend(pWal, iFrame, pgno);
|
| if( rc!=SQLITE_OK ) break;
|
|
|
| /* If nTruncate is non-zero, this is a commit record. */
|
| @@ -1180,6 +1199,7 @@ finished:
|
| pInfo->nBackfill = 0;
|
| pInfo->aReadMark[0] = 0;
|
| for(i=1; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;
|
| + if( pWal->hdr.mxFrame ) pInfo->aReadMark[1] = pWal->hdr.mxFrame;
|
|
|
| /* If more than one frame was recovered from the log file, report an
|
| ** event via sqlite3_log(). This is to help with identifying performance
|
| @@ -1187,8 +1207,9 @@ finished:
|
| ** checkpointing the log file.
|
| */
|
| if( pWal->hdr.nPage ){
|
| - sqlite3_log(SQLITE_OK, "Recovered %d frames from WAL file %s",
|
| - pWal->hdr.nPage, pWal->zWalName
|
| + sqlite3_log(SQLITE_NOTICE_RECOVER_WAL,
|
| + "recovered %d frames from WAL file %s",
|
| + pWal->hdr.mxFrame, pWal->zWalName
|
| );
|
| }
|
| }
|
| @@ -1234,6 +1255,7 @@ int sqlite3WalOpen(
|
| sqlite3_file *pDbFd, /* The open database file */
|
| const char *zWalName, /* Name of the WAL file */
|
| int bNoShm, /* True to run in heap-memory mode */
|
| + i64 mxWalSize, /* Truncate WAL to this size on reset */
|
| Wal **ppWal /* OUT: Allocated Wal handle */
|
| ){
|
| int rc; /* Return Code */
|
| @@ -1266,14 +1288,17 @@ int sqlite3WalOpen(
|
| pRet->pWalFd = (sqlite3_file *)&pRet[1];
|
| pRet->pDbFd = pDbFd;
|
| pRet->readLock = -1;
|
| + pRet->mxWalSize = mxWalSize;
|
| pRet->zWalName = zWalName;
|
| + pRet->syncHeader = 1;
|
| + pRet->padToSectorBoundary = 1;
|
| pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE);
|
|
|
| /* Open file handle on the write-ahead log file. */
|
| flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL);
|
| rc = sqlite3OsOpen(pVfs, zWalName, pRet->pWalFd, flags, &flags);
|
| if( rc==SQLITE_OK && flags&SQLITE_OPEN_READONLY ){
|
| - pRet->readOnly = 1;
|
| + pRet->readOnly = WAL_RDONLY;
|
| }
|
|
|
| if( rc!=SQLITE_OK ){
|
| @@ -1281,6 +1306,11 @@ int sqlite3WalOpen(
|
| sqlite3OsClose(pRet->pWalFd);
|
| sqlite3_free(pRet);
|
| }else{
|
| + int iDC = sqlite3OsDeviceCharacteristics(pDbFd);
|
| + if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->syncHeader = 0; }
|
| + if( iDC & SQLITE_IOCAP_POWERSAFE_OVERWRITE ){
|
| + pRet->padToSectorBoundary = 0;
|
| + }
|
| *ppWal = pRet;
|
| WALTRACE(("WAL%d: opened\n", pRet));
|
| }
|
| @@ -1288,6 +1318,13 @@ int sqlite3WalOpen(
|
| }
|
|
|
| /*
|
| +** Change the size to which the WAL file is truncated on each reset.
|
| +*/
|
| +void sqlite3WalLimit(Wal *pWal, i64 iLimit){
|
| + if( pWal ) pWal->mxWalSize = iLimit;
|
| +}
|
| +
|
| +/*
|
| ** Find the smallest page number out of all pages held in the WAL that
|
| ** has not been returned by any prior invocation of this method on the
|
| ** same WalIterator object. Write into *piFrame the frame index where
|
| @@ -1609,7 +1646,7 @@ static int walPagesize(Wal *pWal){
|
| ** database file.
|
| **
|
| ** This routine uses and updates the nBackfill field of the wal-index header.
|
| -** This is the only routine tha will increase the value of nBackfill.
|
| +** This is the only routine that will increase the value of nBackfill.
|
| ** (A WAL reset or recovery will revert nBackfill to zero, but not increase
|
| ** its value.)
|
| **
|
| @@ -1664,7 +1701,7 @@ static int walCheckpoint(
|
| assert( y<=pWal->hdr.mxFrame );
|
| rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(i), 1);
|
| if( rc==SQLITE_OK ){
|
| - pInfo->aReadMark[i] = READMARK_NOT_USED;
|
| + pInfo->aReadMark[i] = (i==1 ? mxSafeFrame : READMARK_NOT_USED);
|
| walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
|
| }else if( rc==SQLITE_BUSY ){
|
| mxSafeFrame = y;
|
| @@ -1686,17 +1723,18 @@ static int walCheckpoint(
|
| rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
|
| }
|
|
|
| - /* If the database file may grow as a result of this checkpoint, hint
|
| - ** about the eventual size of the db file to the VFS layer.
|
| + /* If the database may grow as a result of this checkpoint, hint
|
| + ** about the eventual size of the db file to the VFS layer.
|
| */
|
| if( rc==SQLITE_OK ){
|
| i64 nReq = ((i64)mxPage * szPage);
|
| rc = sqlite3OsFileSize(pWal->pDbFd, &nSize);
|
| if( rc==SQLITE_OK && nSize<nReq ){
|
| - sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_SIZE_HINT, &nReq);
|
| + sqlite3OsFileControlHint(pWal->pDbFd, SQLITE_FCNTL_SIZE_HINT, &nReq);
|
| }
|
| }
|
|
|
| +
|
| /* Iterate through the contents of the WAL, copying data to the db file. */
|
| while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
|
| i64 iOffset;
|
| @@ -1761,6 +1799,24 @@ static int walCheckpoint(
|
| }
|
|
|
| /*
|
| +** If the WAL file is currently larger than nMax bytes in size, truncate
|
| +** it to exactly nMax bytes. If an error occurs while doing so, ignore it.
|
| +*/
|
| +static void walLimitSize(Wal *pWal, i64 nMax){
|
| + i64 sz;
|
| + int rx;
|
| + sqlite3BeginBenignMalloc();
|
| + rx = sqlite3OsFileSize(pWal->pWalFd, &sz);
|
| + if( rx==SQLITE_OK && (sz > nMax ) ){
|
| + rx = sqlite3OsTruncate(pWal->pWalFd, nMax);
|
| + }
|
| + sqlite3EndBenignMalloc();
|
| + if( rx ){
|
| + sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName);
|
| + }
|
| +}
|
| +
|
| +/*
|
| ** Close a connection to a log file.
|
| */
|
| int sqlite3WalClose(
|
| @@ -1790,14 +1846,33 @@ int sqlite3WalClose(
|
| pWal, SQLITE_CHECKPOINT_PASSIVE, 0, 0, sync_flags, nBuf, zBuf, 0, 0
|
| );
|
| if( rc==SQLITE_OK ){
|
| - isDelete = 1;
|
| + int bPersist = -1;
|
| + sqlite3OsFileControlHint(
|
| + pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersist
|
| + );
|
| + if( bPersist!=1 ){
|
| + /* Try to delete the WAL file if the checkpoint completed and
|
| + ** fsynced (rc==SQLITE_OK) and if we are not in persistent-wal
|
| + ** mode (bPersist!=1) */
|
| + isDelete = 1;
|
| + }else if( pWal->mxWalSize>=0 ){
|
| + /* Try to truncate the WAL file to zero bytes if the checkpoint
|
| + ** completed and fsynced (rc==SQLITE_OK) and we are in persistent
|
| + ** WAL mode (bPersist) and if the PRAGMA journal_size_limit is a
|
| + ** non-negative value (pWal->mxWalSize>=0). Note that we truncate
|
| + ** to zero bytes as truncating to the journal_size_limit might
|
| + ** leave a corrupt WAL file on disk. */
|
| + walLimitSize(pWal, 0);
|
| + }
|
| }
|
| }
|
|
|
| walIndexClose(pWal, isDelete);
|
| sqlite3OsClose(pWal->pWalFd);
|
| if( isDelete ){
|
| + sqlite3BeginBenignMalloc();
|
| sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0);
|
| + sqlite3EndBenignMalloc();
|
| }
|
| WALTRACE(("WAL%p: closed\n", pWal));
|
| sqlite3_free((void *)pWal->apWiData);
|
| @@ -1875,7 +1950,7 @@ static int walIndexTryHdr(Wal *pWal, int *pChanged){
|
| ** wal-index from the WAL before returning.
|
| **
|
| ** Set *pChanged to 1 if the wal-index header value in pWal->hdr is
|
| -** changed by this opertion. If pWal->hdr is unchanged, set *pChanged
|
| +** changed by this operation. If pWal->hdr is unchanged, set *pChanged
|
| ** to 0.
|
| **
|
| ** If the wal-index header is successfully read, return SQLITE_OK.
|
| @@ -1907,21 +1982,28 @@ static int walIndexReadHdr(Wal *pWal, int *pChanged){
|
| ** with a writer. So get a WRITE lock and try again.
|
| */
|
| assert( badHdr==0 || pWal->writeLock==0 );
|
| - if( badHdr && SQLITE_OK==(rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1)) ){
|
| - pWal->writeLock = 1;
|
| - if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){
|
| - badHdr = walIndexTryHdr(pWal, pChanged);
|
| - if( badHdr ){
|
| - /* If the wal-index header is still malformed even while holding
|
| - ** a WRITE lock, it can only mean that the header is corrupted and
|
| - ** needs to be reconstructed. So run recovery to do exactly that.
|
| - */
|
| - rc = walIndexRecover(pWal);
|
| - *pChanged = 1;
|
| + if( badHdr ){
|
| + if( pWal->readOnly & WAL_SHM_RDONLY ){
|
| + if( SQLITE_OK==(rc = walLockShared(pWal, WAL_WRITE_LOCK)) ){
|
| + walUnlockShared(pWal, WAL_WRITE_LOCK);
|
| + rc = SQLITE_READONLY_RECOVERY;
|
| + }
|
| + }else if( SQLITE_OK==(rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1)) ){
|
| + pWal->writeLock = 1;
|
| + if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){
|
| + badHdr = walIndexTryHdr(pWal, pChanged);
|
| + if( badHdr ){
|
| + /* If the wal-index header is still malformed even while holding
|
| + ** a WRITE lock, it can only mean that the header is corrupted and
|
| + ** needs to be reconstructed. So run recovery to do exactly that.
|
| + */
|
| + rc = walIndexRecover(pWal);
|
| + *pChanged = 1;
|
| + }
|
| }
|
| + pWal->writeLock = 0;
|
| + walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
|
| }
|
| - pWal->writeLock = 0;
|
| - walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
|
| }
|
|
|
| /* If the header is read successfully, check the version number to make
|
| @@ -2014,8 +2096,8 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
|
| ** calls to sqlite3OsSleep() have a delay of 1 microsecond. Really this
|
| ** is more of a scheduler yield than an actual delay. But on the 10th
|
| ** an subsequent retries, the delays start becoming longer and longer,
|
| - ** so that on the 100th (and last) RETRY we delay for 21 milliseconds.
|
| - ** The total delay time before giving up is less than 1 second.
|
| + ** so that on the 100th (and last) RETRY we delay for 323 milliseconds.
|
| + ** The total delay time before giving up is less than 10 seconds.
|
| */
|
| if( cnt>5 ){
|
| int nDelay = 1; /* Pause time in microseconds */
|
| @@ -2023,7 +2105,7 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
|
| VVA_ONLY( pWal->lockError = 1; )
|
| return SQLITE_PROTOCOL;
|
| }
|
| - if( cnt>=10 ) nDelay = (cnt-9)*238; /* Max delay 21ms. Total delay 996ms */
|
| + if( cnt>=10 ) nDelay = (cnt-9)*(cnt-9)*39;
|
| sqlite3OsSleep(pWal->pVfs, nDelay);
|
| }
|
|
|
| @@ -2072,7 +2154,7 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
|
| ** may have been appended to the log before READ_LOCK(0) was obtained.
|
| ** When holding READ_LOCK(0), the reader ignores the entire log file,
|
| ** which implies that the database file contains a trustworthy
|
| - ** snapshoT. Since holding READ_LOCK(0) prevents a checkpoint from
|
| + ** snapshot. Since holding READ_LOCK(0) prevents a checkpoint from
|
| ** happening, this is usually correct.
|
| **
|
| ** However, if frames have been appended to the log (or if the log
|
| @@ -2108,7 +2190,9 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
|
| }
|
| /* There was once an "if" here. The extra "{" is to preserve indentation. */
|
| {
|
| - if( mxReadMark < pWal->hdr.mxFrame || mxI==0 ){
|
| + if( (pWal->readOnly & WAL_SHM_RDONLY)==0
|
| + && (mxReadMark<pWal->hdr.mxFrame || mxI==0)
|
| + ){
|
| for(i=1; i<WAL_NREADER; i++){
|
| rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
|
| if( rc==SQLITE_OK ){
|
| @@ -2122,8 +2206,8 @@ static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
|
| }
|
| }
|
| if( mxI==0 ){
|
| - assert( rc==SQLITE_BUSY );
|
| - return WAL_RETRY;
|
| + assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 );
|
| + return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTLOCK;
|
| }
|
|
|
| rc = walLockShared(pWal, WAL_READ_LOCK(mxI));
|
| @@ -2205,19 +2289,17 @@ void sqlite3WalEndReadTransaction(Wal *pWal){
|
| }
|
|
|
| /*
|
| -** Read a page from the WAL, if it is present in the WAL and if the
|
| -** current read transaction is configured to use the WAL.
|
| +** Search the wal file for page pgno. If found, set *piRead to the frame that
|
| +** contains the page. Otherwise, if pgno is not in the wal file, set *piRead
|
| +** to zero.
|
| **
|
| -** The *pInWal is set to 1 if the requested page is in the WAL and
|
| -** has been loaded. Or *pInWal is set to 0 if the page was not in
|
| -** the WAL and needs to be read out of the database.
|
| +** Return SQLITE_OK if successful, or an error code if an error occurs. If an
|
| +** error does occur, the final value of *piRead is undefined.
|
| */
|
| -int sqlite3WalRead(
|
| +int sqlite3WalFindFrame(
|
| Wal *pWal, /* WAL handle */
|
| Pgno pgno, /* Database page number to read data for */
|
| - int *pInWal, /* OUT: True if data is read from WAL */
|
| - int nOut, /* Size of buffer pOut in bytes */
|
| - u8 *pOut /* Buffer to write page data to */
|
| + u32 *piRead /* OUT: Frame number (or zero) */
|
| ){
|
| u32 iRead = 0; /* If !=0, WAL frame to return data from */
|
| u32 iLast = pWal->hdr.mxFrame; /* Last page in WAL for this reader */
|
| @@ -2233,7 +2315,7 @@ int sqlite3WalRead(
|
| ** WAL were empty.
|
| */
|
| if( iLast==0 || pWal->readLock==0 ){
|
| - *pInWal = 0;
|
| + *piRead = 0;
|
| return SQLITE_OK;
|
| }
|
|
|
| @@ -2278,7 +2360,7 @@ int sqlite3WalRead(
|
| for(iKey=walHash(pgno); aHash[iKey]; iKey=walNextHash(iKey)){
|
| u32 iFrame = aHash[iKey] + iZero;
|
| if( iFrame<=iLast && aPgno[aHash[iKey]]==pgno ){
|
| - assert( iFrame>iRead );
|
| + /* assert( iFrame>iRead ); -- not true if there is corruption */
|
| iRead = iFrame;
|
| }
|
| if( (nCollide--)==0 ){
|
| @@ -2304,26 +2386,31 @@ int sqlite3WalRead(
|
| }
|
| #endif
|
|
|
| - /* If iRead is non-zero, then it is the log frame number that contains the
|
| - ** required page. Read and return data from the log file.
|
| - */
|
| - if( iRead ){
|
| - int sz;
|
| - i64 iOffset;
|
| - sz = pWal->hdr.szPage;
|
| - sz = (pWal->hdr.szPage&0xfe00) + ((pWal->hdr.szPage&0x0001)<<16);
|
| - testcase( sz<=32768 );
|
| - testcase( sz>=65536 );
|
| - iOffset = walFrameOffset(iRead, sz) + WAL_FRAME_HDRSIZE;
|
| - *pInWal = 1;
|
| - /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
|
| - return sqlite3OsRead(pWal->pWalFd, pOut, nOut, iOffset);
|
| - }
|
| -
|
| - *pInWal = 0;
|
| + *piRead = iRead;
|
| return SQLITE_OK;
|
| }
|
|
|
| +/*
|
| +** Read the contents of frame iRead from the wal file into buffer pOut
|
| +** (which is nOut bytes in size). Return SQLITE_OK if successful, or an
|
| +** error code otherwise.
|
| +*/
|
| +int sqlite3WalReadFrame(
|
| + Wal *pWal, /* WAL handle */
|
| + u32 iRead, /* Frame to read */
|
| + int nOut, /* Size of buffer pOut in bytes */
|
| + u8 *pOut /* Buffer to write page data to */
|
| +){
|
| + int sz;
|
| + i64 iOffset;
|
| + sz = pWal->hdr.szPage;
|
| + sz = (sz&0xfe00) + ((sz&0x0001)<<16);
|
| + testcase( sz<=32768 );
|
| + testcase( sz>=65536 );
|
| + iOffset = walFrameOffset(iRead, sz) + WAL_FRAME_HDRSIZE;
|
| + /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
|
| + return sqlite3OsRead(pWal->pWalFd, pOut, (nOut>sz ? sz : nOut), iOffset);
|
| +}
|
|
|
| /*
|
| ** Return the size of the database in pages (or zero, if unknown).
|
| @@ -2376,7 +2463,7 @@ int sqlite3WalBeginWriteTransaction(Wal *pWal){
|
| if( memcmp(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr))!=0 ){
|
| walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
|
| pWal->writeLock = 0;
|
| - rc = SQLITE_BUSY;
|
| + rc = SQLITE_BUSY_SNAPSHOT;
|
| }
|
|
|
| return rc;
|
| @@ -2390,6 +2477,7 @@ int sqlite3WalEndWriteTransaction(Wal *pWal){
|
| if( pWal->writeLock ){
|
| walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
|
| pWal->writeLock = 0;
|
| + pWal->truncateOnCommit = 0;
|
| }
|
| return SQLITE_OK;
|
| }
|
| @@ -2435,9 +2523,8 @@ int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){
|
| assert( walFramePgno(pWal, iFrame)!=1 );
|
| rc = xUndo(pUndoCtx, walFramePgno(pWal, iFrame));
|
| }
|
| - walCleanupHash(pWal);
|
| + if( iMax!=pWal->hdr.mxFrame ) walCleanupHash(pWal);
|
| }
|
| - assert( rc==SQLITE_OK );
|
| return rc;
|
| }
|
|
|
| @@ -2486,6 +2573,7 @@ int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){
|
| return rc;
|
| }
|
|
|
| +
|
| /*
|
| ** This function is called just before writing a set of frames to the log
|
| ** file (see sqlite3WalFrames()). It checks to see if, instead of appending
|
| @@ -2522,13 +2610,15 @@ static int walRestartLog(Wal *pWal){
|
| */
|
| int i; /* Loop counter */
|
| u32 *aSalt = pWal->hdr.aSalt; /* Big-endian salt values */
|
| +
|
| pWal->nCkpt++;
|
| pWal->hdr.mxFrame = 0;
|
| sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0]));
|
| aSalt[1] = salt1;
|
| walIndexWriteHdr(pWal);
|
| pInfo->nBackfill = 0;
|
| - for(i=1; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;
|
| + pInfo->aReadMark[1] = 0;
|
| + for(i=2; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;
|
| assert( pInfo->aReadMark[0]==0 );
|
| walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
|
| }else if( rc!=SQLITE_BUSY ){
|
| @@ -2550,6 +2640,74 @@ static int walRestartLog(Wal *pWal){
|
| return rc;
|
| }
|
|
|
| +/*
|
| +** Information about the current state of the WAL file and where
|
| +** the next fsync should occur - passed from sqlite3WalFrames() into
|
| +** walWriteToLog().
|
| +*/
|
| +typedef struct WalWriter {
|
| + Wal *pWal; /* The complete WAL information */
|
| + sqlite3_file *pFd; /* The WAL file to which we write */
|
| + sqlite3_int64 iSyncPoint; /* Fsync at this offset */
|
| + int syncFlags; /* Flags for the fsync */
|
| + int szPage; /* Size of one page */
|
| +} WalWriter;
|
| +
|
| +/*
|
| +** Write iAmt bytes of content into the WAL file beginning at iOffset.
|
| +** Do a sync when crossing the p->iSyncPoint boundary.
|
| +**
|
| +** In other words, if iSyncPoint is in between iOffset and iOffset+iAmt,
|
| +** first write the part before iSyncPoint, then sync, then write the
|
| +** rest.
|
| +*/
|
| +static int walWriteToLog(
|
| + WalWriter *p, /* WAL to write to */
|
| + void *pContent, /* Content to be written */
|
| + int iAmt, /* Number of bytes to write */
|
| + sqlite3_int64 iOffset /* Start writing at this offset */
|
| +){
|
| + int rc;
|
| + if( iOffset<p->iSyncPoint && iOffset+iAmt>=p->iSyncPoint ){
|
| + int iFirstAmt = (int)(p->iSyncPoint - iOffset);
|
| + rc = sqlite3OsWrite(p->pFd, pContent, iFirstAmt, iOffset);
|
| + if( rc ) return rc;
|
| + iOffset += iFirstAmt;
|
| + iAmt -= iFirstAmt;
|
| + pContent = (void*)(iFirstAmt + (char*)pContent);
|
| + assert( p->syncFlags & (SQLITE_SYNC_NORMAL|SQLITE_SYNC_FULL) );
|
| + rc = sqlite3OsSync(p->pFd, p->syncFlags & SQLITE_SYNC_MASK);
|
| + if( iAmt==0 || rc ) return rc;
|
| + }
|
| + rc = sqlite3OsWrite(p->pFd, pContent, iAmt, iOffset);
|
| + return rc;
|
| +}
|
| +
|
| +/*
|
| +** Write out a single frame of the WAL
|
| +*/
|
| +static int walWriteOneFrame(
|
| + WalWriter *p, /* Where to write the frame */
|
| + PgHdr *pPage, /* The page of the frame to be written */
|
| + int nTruncate, /* The commit flag. Usually 0. >0 for commit */
|
| + sqlite3_int64 iOffset /* Byte offset at which to write */
|
| +){
|
| + int rc; /* Result code from subfunctions */
|
| + void *pData; /* Data actually written */
|
| + u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */
|
| +#if defined(SQLITE_HAS_CODEC)
|
| + if( (pData = sqlite3PagerCodec(pPage))==0 ) return SQLITE_NOMEM;
|
| +#else
|
| + pData = pPage->pData;
|
| +#endif
|
| + walEncodeFrame(p->pWal, pPage->pgno, nTruncate, pData, aFrame);
|
| + rc = walWriteToLog(p, aFrame, sizeof(aFrame), iOffset);
|
| + if( rc ) return rc;
|
| + /* Write the page data */
|
| + rc = walWriteToLog(p, pData, p->szPage, iOffset+sizeof(aFrame));
|
| + return rc;
|
| +}
|
| +
|
| /*
|
| ** Write a set of frames to the log. The caller must hold the write-lock
|
| ** on the log file (obtained using sqlite3WalBeginWriteTransaction()).
|
| @@ -2564,14 +2722,20 @@ int sqlite3WalFrames(
|
| ){
|
| int rc; /* Used to catch return codes */
|
| u32 iFrame; /* Next frame address */
|
| - u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */
|
| PgHdr *p; /* Iterator to run through pList with. */
|
| PgHdr *pLast = 0; /* Last frame in list */
|
| - int nLast = 0; /* Number of extra copies of last page */
|
| + int nExtra = 0; /* Number of extra copies of last page */
|
| + int szFrame; /* The size of a single frame */
|
| + i64 iOffset; /* Next byte to write in WAL file */
|
| + WalWriter w; /* The writer */
|
|
|
| assert( pList );
|
| assert( pWal->writeLock );
|
|
|
| + /* If this frame set completes a transaction, then nTruncate>0. If
|
| + ** nTruncate==0 then this frame set does not complete the transaction. */
|
| + assert( (isCommit!=0)==(nTruncate!=0) );
|
| +
|
| #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
|
| { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){}
|
| WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n",
|
| @@ -2599,7 +2763,7 @@ int sqlite3WalFrames(
|
| sqlite3Put4byte(&aWalHdr[4], WAL_MAX_VERSION);
|
| sqlite3Put4byte(&aWalHdr[8], szPage);
|
| sqlite3Put4byte(&aWalHdr[12], pWal->nCkpt);
|
| - sqlite3_randomness(8, pWal->hdr.aSalt);
|
| + if( pWal->nCkpt==0 ) sqlite3_randomness(8, pWal->hdr.aSalt);
|
| memcpy(&aWalHdr[16], pWal->hdr.aSalt, 8);
|
| walChecksumBytes(1, aWalHdr, WAL_HDRSIZE-2*4, 0, aCksum);
|
| sqlite3Put4byte(&aWalHdr[24], aCksum[0]);
|
| @@ -2609,77 +2773,89 @@ int sqlite3WalFrames(
|
| pWal->hdr.bigEndCksum = SQLITE_BIGENDIAN;
|
| pWal->hdr.aFrameCksum[0] = aCksum[0];
|
| pWal->hdr.aFrameCksum[1] = aCksum[1];
|
| + pWal->truncateOnCommit = 1;
|
|
|
| rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0);
|
| WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok"));
|
| if( rc!=SQLITE_OK ){
|
| return rc;
|
| }
|
| +
|
| + /* Sync the header (unless SQLITE_IOCAP_SEQUENTIAL is true or unless
|
| + ** all syncing is turned off by PRAGMA synchronous=OFF). Otherwise
|
| + ** an out-of-order write following a WAL restart could result in
|
| + ** database corruption. See the ticket:
|
| + **
|
| + ** http://www.sqlite.org/src/info/ff5be73dee
|
| + */
|
| + if( pWal->syncHeader && sync_flags ){
|
| + rc = sqlite3OsSync(pWal->pWalFd, sync_flags & SQLITE_SYNC_MASK);
|
| + if( rc ) return rc;
|
| + }
|
| }
|
| assert( (int)pWal->szPage==szPage );
|
|
|
| - /* Write the log file. */
|
| - for(p=pList; p; p=p->pDirty){
|
| - u32 nDbsize; /* Db-size field for frame header */
|
| - i64 iOffset; /* Write offset in log file */
|
| - void *pData;
|
| -
|
| - iOffset = walFrameOffset(++iFrame, szPage);
|
| - /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
|
| -
|
| - /* Populate and write the frame header */
|
| - nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0;
|
| -#if defined(SQLITE_HAS_CODEC)
|
| - if( (pData = sqlite3PagerCodec(p))==0 ) return SQLITE_NOMEM;
|
| -#else
|
| - pData = p->pData;
|
| -#endif
|
| - walEncodeFrame(pWal, p->pgno, nDbsize, pData, aFrame);
|
| - rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset);
|
| - if( rc!=SQLITE_OK ){
|
| - return rc;
|
| - }
|
| + /* Setup information needed to write frames into the WAL */
|
| + w.pWal = pWal;
|
| + w.pFd = pWal->pWalFd;
|
| + w.iSyncPoint = 0;
|
| + w.syncFlags = sync_flags;
|
| + w.szPage = szPage;
|
| + iOffset = walFrameOffset(iFrame+1, szPage);
|
| + szFrame = szPage + WAL_FRAME_HDRSIZE;
|
|
|
| - /* Write the page data */
|
| - rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOffset+sizeof(aFrame));
|
| - if( rc!=SQLITE_OK ){
|
| - return rc;
|
| - }
|
| + /* Write all frames into the log file exactly once */
|
| + for(p=pList; p; p=p->pDirty){
|
| + int nDbSize; /* 0 normally. Positive == commit flag */
|
| + iFrame++;
|
| + assert( iOffset==walFrameOffset(iFrame, szPage) );
|
| + nDbSize = (isCommit && p->pDirty==0) ? nTruncate : 0;
|
| + rc = walWriteOneFrame(&w, p, nDbSize, iOffset);
|
| + if( rc ) return rc;
|
| pLast = p;
|
| + iOffset += szFrame;
|
| }
|
|
|
| - /* Sync the log file if the 'isSync' flag was specified. */
|
| - if( sync_flags ){
|
| - i64 iSegment = sqlite3OsSectorSize(pWal->pWalFd);
|
| - i64 iOffset = walFrameOffset(iFrame+1, szPage);
|
| -
|
| - assert( isCommit );
|
| - assert( iSegment>0 );
|
| -
|
| - iSegment = (((iOffset+iSegment-1)/iSegment) * iSegment);
|
| - while( iOffset<iSegment ){
|
| - void *pData;
|
| -#if defined(SQLITE_HAS_CODEC)
|
| - if( (pData = sqlite3PagerCodec(pLast))==0 ) return SQLITE_NOMEM;
|
| -#else
|
| - pData = pLast->pData;
|
| -#endif
|
| - walEncodeFrame(pWal, pLast->pgno, nTruncate, pData, aFrame);
|
| - /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
|
| - rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset);
|
| - if( rc!=SQLITE_OK ){
|
| - return rc;
|
| - }
|
| - iOffset += WAL_FRAME_HDRSIZE;
|
| - rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOffset);
|
| - if( rc!=SQLITE_OK ){
|
| - return rc;
|
| + /* If this is the end of a transaction, then we might need to pad
|
| + ** the transaction and/or sync the WAL file.
|
| + **
|
| + ** Padding and syncing only occur if this set of frames completes a
|
| + ** transaction and if PRAGMA synchronous=FULL. If synchronous==NORMAL
|
| + ** or synchronous==OFF, then no padding or syncing are needed.
|
| + **
|
| + ** If SQLITE_IOCAP_POWERSAFE_OVERWRITE is defined, then padding is not
|
| + ** needed and only the sync is done. If padding is needed, then the
|
| + ** final frame is repeated (with its commit mark) until the next sector
|
| + ** boundary is crossed. Only the part of the WAL prior to the last
|
| + ** sector boundary is synced; the part of the last frame that extends
|
| + ** past the sector boundary is written after the sync.
|
| + */
|
| + if( isCommit && (sync_flags & WAL_SYNC_TRANSACTIONS)!=0 ){
|
| + if( pWal->padToSectorBoundary ){
|
| + int sectorSize = sqlite3SectorSize(pWal->pWalFd);
|
| + w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize;
|
| + while( iOffset<w.iSyncPoint ){
|
| + rc = walWriteOneFrame(&w, pLast, nTruncate, iOffset);
|
| + if( rc ) return rc;
|
| + iOffset += szFrame;
|
| + nExtra++;
|
| }
|
| - nLast++;
|
| - iOffset += szPage;
|
| + }else{
|
| + rc = sqlite3OsSync(w.pFd, sync_flags & SQLITE_SYNC_MASK);
|
| }
|
| + }
|
|
|
| - rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
|
| + /* If this frame set completes the first transaction in the WAL and
|
| + ** if PRAGMA journal_size_limit is set, then truncate the WAL to the
|
| + ** journal size limit, if possible.
|
| + */
|
| + if( isCommit && pWal->truncateOnCommit && pWal->mxWalSize>=0 ){
|
| + i64 sz = pWal->mxWalSize;
|
| + if( walFrameOffset(iFrame+nExtra+1, szPage)>pWal->mxWalSize ){
|
| + sz = walFrameOffset(iFrame+nExtra+1, szPage);
|
| + }
|
| + walLimitSize(pWal, sz);
|
| + pWal->truncateOnCommit = 0;
|
| }
|
|
|
| /* Append data to the wal-index. It is not necessary to lock the
|
| @@ -2692,9 +2868,9 @@ int sqlite3WalFrames(
|
| iFrame++;
|
| rc = walIndexAppend(pWal, iFrame, p->pgno);
|
| }
|
| - while( nLast>0 && rc==SQLITE_OK ){
|
| + while( rc==SQLITE_OK && nExtra>0 ){
|
| iFrame++;
|
| - nLast--;
|
| + nExtra--;
|
| rc = walIndexAppend(pWal, iFrame, pLast->pgno);
|
| }
|
|
|
| @@ -2747,6 +2923,7 @@ int sqlite3WalCheckpoint(
|
| assert( pWal->ckptLock==0 );
|
| assert( pWal->writeLock==0 );
|
|
|
| + if( pWal->readOnly ) return SQLITE_READONLY;
|
| WALTRACE(("WAL%p: checkpoint begins\n", pWal));
|
| rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1);
|
| if( rc ){
|
| @@ -2779,6 +2956,9 @@ int sqlite3WalCheckpoint(
|
| /* Read the wal-index header. */
|
| if( rc==SQLITE_OK ){
|
| rc = walIndexReadHdr(pWal, &isChanged);
|
| + if( isChanged && pWal->pDbFd->pMethods->iVersion>=3 ){
|
| + sqlite3OsUnfetch(pWal->pDbFd, 0, 0);
|
| + }
|
| }
|
|
|
| /* Copy data from the log to the database file. */
|
| @@ -2898,4 +3078,16 @@ int sqlite3WalHeapMemory(Wal *pWal){
|
| return (pWal && pWal->exclusiveMode==WAL_HEAPMEMORY_MODE );
|
| }
|
|
|
| +#ifdef SQLITE_ENABLE_ZIPVFS
|
| +/*
|
| +** If the argument is not NULL, it points to a Wal object that holds a
|
| +** read-lock. This function returns the database page-size if it is known,
|
| +** or zero if it is not (or if pWal is NULL).
|
| +*/
|
| +int sqlite3WalFramesize(Wal *pWal){
|
| + assert( pWal==0 || pWal->readLock>=0 );
|
| + return (pWal ? pWal->szPage : 0);
|
| +}
|
| +#endif
|
| +
|
| #endif /* #ifndef SQLITE_OMIT_WAL */
|
|
|