| Index: sdch/open_vcdiff/depot/opensource/open-vcdiff/src/blockhash.h
|
| ===================================================================
|
| --- sdch/open_vcdiff/depot/opensource/open-vcdiff/src/blockhash.h (revision 2678)
|
| +++ sdch/open_vcdiff/depot/opensource/open-vcdiff/src/blockhash.h (working copy)
|
| @@ -1,494 +0,0 @@
|
| -// Copyright 2006 Google Inc.
|
| -// Authors: Sanjay Ghemawat, Jeff Dean, Chandra Chereddi, Lincoln Smith
|
| -//
|
| -// Licensed under the Apache License, Version 2.0 (the "License");
|
| -// you may not use this file except in compliance with the License.
|
| -// You may obtain a copy of the License at
|
| -//
|
| -// http://www.apache.org/licenses/LICENSE-2.0
|
| -//
|
| -// Unless required by applicable law or agreed to in writing, software
|
| -// distributed under the License is distributed on an "AS IS" BASIS,
|
| -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| -// See the License for the specific language governing permissions and
|
| -// limitations under the License.
|
| -//
|
| -// Implementation of the Bentley/McIlroy algorithm for finding differences.
|
| -// Bentley, McIlroy. DCC 1999. Data Compression Using Long Common Strings.
|
| -// http://citeseer.ist.psu.edu/555557.html
|
| -
|
| -#ifndef OPEN_VCDIFF_BLOCKHASH_H_
|
| -#define OPEN_VCDIFF_BLOCKHASH_H_
|
| -
|
| -#include <config.h>
|
| -#include <stdint.h> // uint32_t
|
| -#include <cstddef> // size_t
|
| -#include <vector>
|
| -
|
| -namespace open_vcdiff {
|
| -
|
| -// A generic hash table which will be used to keep track of byte runs
|
| -// of size kBlockSize in both the incrementally processed target data
|
| -// and the preprocessed source dictionary.
|
| -//
|
| -// A custom hash table implementation is used instead of the standard
|
| -// hash_map template because we know that there will be exactly one
|
| -// entry in the BlockHash corresponding to each kBlockSize bytes
|
| -// in the source data, which makes certain optimizations possible:
|
| -// * The memory for the hash table and for all hash entries can be allocated
|
| -// in one step rather than incrementally for each insert operation.
|
| -// * A single integer can be used to represent both
|
| -// the index of the next hash entry in the chain
|
| -// and the position of the entry within the source data
|
| -// (== kBlockSize * block_number). This greatly reduces the size
|
| -// of a hash entry.
|
| -//
|
| -class BlockHash {
|
| - public:
|
| - // Block size as per Bentley/McIlroy; must be a power of two.
|
| - //
|
| - // Using (for example) kBlockSize = 4 guarantees that no match smaller
|
| - // than size 4 will be identified, that some matches having sizes
|
| - // 4, 5, or 6 may be identified, and that all matches
|
| - // having size 7 or greater will be identified (because any string of
|
| - // 7 bytes must contain a complete aligned block of 4 bytes.)
|
| - //
|
| - // Increasing kBlockSize by a factor of two will halve the amount of
|
| - // memory needed for the next block table, and will halve the setup time
|
| - // for a new BlockHash. However, it also doubles the minimum
|
| - // match length that is guaranteed to be found in FindBestMatch(),
|
| - // so that function will be less effective in finding matches.
|
| - //
|
| - // Computational effort in FindBestMatch (which is the inner loop of
|
| - // the encoding algorithm) will be proportional to the number of
|
| - // matches found, and a low value of kBlockSize will waste time
|
| - // tracking down small matches. On the other hand, if this value
|
| - // is set too high, no matches will be found at all.
|
| - //
|
| - // It is suggested that different values of kBlockSize be tried against
|
| - // a representative data set to find the best tradeoff between
|
| - // memory/CPU and the effectiveness of FindBestMatch().
|
| - //
|
| - // If you change kBlockSize to a smaller value, please increase
|
| - // kMaxMatchesToCheck accordingly.
|
| - static const int kBlockSize = 32;
|
| -
|
| - // This class is used to store the best match found by FindBestMatch()
|
| - // and return it to the caller.
|
| - class Match {
|
| - public:
|
| - Match() : size_(0), source_offset_(-1), target_offset_(-1) { }
|
| -
|
| - void ReplaceIfBetterMatch(size_t candidate_size,
|
| - int candidate_source_offset,
|
| - int candidate_target_offset) {
|
| - if (candidate_size > size_) {
|
| - size_ = candidate_size;
|
| - source_offset_ = candidate_source_offset;
|
| - target_offset_ = candidate_target_offset;
|
| - }
|
| - }
|
| -
|
| - size_t size() const { return size_; }
|
| - int source_offset() const { return source_offset_; }
|
| - int target_offset() const { return target_offset_; }
|
| -
|
| - private:
|
| - // The size of the best (longest) match passed to ReplaceIfBetterMatch().
|
| - size_t size_;
|
| -
|
| - // The source offset of the match, including the starting_offset_
|
| - // of the BlockHash for which the match was found.
|
| - int source_offset_;
|
| -
|
| - // The target offset of the match. An offset of 0 corresponds to the
|
| - // data at target_start, which is an argument of FindBestMatch().
|
| - int target_offset_;
|
| -
|
| - // Making these private avoids implicit copy constructor
|
| - // & assignment operator
|
| - Match(const Match&); // NOLINT
|
| - void operator=(const Match&);
|
| - };
|
| -
|
| - // A BlockHash is created using a buffer of source data. The hash table
|
| - // will contain one entry for each kBlockSize-byte block in the
|
| - // source data.
|
| - //
|
| - // See the comments for starting_offset_, below, for a description of
|
| - // the starting_offset argument. For a hash of source (dictionary) data,
|
| - // starting_offset_ will be zero; for a hash of previously encoded
|
| - // target data, starting_offset_ will be equal to the dictionary size.
|
| - //
|
| - BlockHash(const char* source_data, size_t source_size, int starting_offset);
|
| -
|
| - ~BlockHash();
|
| -
|
| - // Initializes the object before use.
|
| - // This method must be called after constructing a BlockHash object,
|
| - // and before any other method may be called. This is because
|
| - // Init() dynamically allocates hash_table_ and next_block_table_.
|
| - // Returns true if initialization succeeded, or false if an error occurred,
|
| - // in which case no other method except the destructor may then be used
|
| - // on the object.
|
| - //
|
| - // If populate_hash_table is true, then AddAllBlocks() will be called
|
| - // to populate the hash table. If populate_hash_table is false, then
|
| - // classes that inherit from BlockHash are expected to call AddBlock()
|
| - // to incrementally populate individual blocks of data.
|
| - //
|
| - bool Init(bool populate_hash_table);
|
| -
|
| - // In the context of the open-vcdiff encoder, BlockHash is used for two
|
| - // purposes: to hash the source (dictionary) data, and to hash
|
| - // the previously encoded target data. The main differences between
|
| - // a dictionary BlockHash and a target BlockHash are as follows:
|
| - //
|
| - // 1. The best_match->source_offset() returned from FindBestMatch()
|
| - // for a target BlockHash is computed in the following manner:
|
| - // the starting offset of the first byte in the target data
|
| - // is equal to the dictionary size. FindBestMatch() will add
|
| - // starting_offset_ to any best_match->source_offset() value it returns,
|
| - // in order to produce the correct offset value for a target BlockHash.
|
| - // 2. For a dictionary BlockHash, the entire data set is hashed at once
|
| - // when Init() is called with the parameter populate_hash_table = true.
|
| - // For a target BlockHash, because the previously encoded target data
|
| - // includes only the data seen up to the current encoding position,
|
| - // the data blocks are hashed incrementally as the encoding position
|
| - // advances, using AddOneIndexHash() and AddAllBlocksThroughIndex().
|
| - //
|
| - // The following two factory functions can be used to create BlockHash
|
| - // objects for each of these two purposes. Each factory function calls
|
| - // the object constructor and also calls Init(). If an error occurs,
|
| - // NULL is returned; otherwise a valid BlockHash object is returned.
|
| - // Since a dictionary BlockHash is not expected to be modified after
|
| - // initialization, a const object is returned.
|
| - // The caller is responsible for deleting the returned object
|
| - // (using the C++ delete operator) once it is no longer needed.
|
| - static const BlockHash* CreateDictionaryHash(const char* dictionary_data,
|
| - size_t dictionary_size);
|
| - static BlockHash* CreateTargetHash(const char* target_data,
|
| - size_t target_size,
|
| - size_t dictionary_size);
|
| -
|
| - const size_t table_size() const { return hash_table_.size(); }
|
| - const int starting_offset() const { return starting_offset_; }
|
| -
|
| - // This function will be called to add blocks incrementally to the target hash
|
| - // as the encoding position advances through the target data. It will be
|
| - // called for every kBlockSize-byte block in the target data, regardless
|
| - // of whether the block is aligned evenly on a block boundary. The
|
| - // BlockHash will only store hash entries for the evenly-aligned blocks.
|
| - //
|
| - void AddOneIndexHash(int index, uint32_t hash_value) {
|
| - if (index == NextIndexToAdd()) {
|
| - AddBlock(hash_value);
|
| - }
|
| - }
|
| -
|
| - // Calls AddBlock() for each kBlockSize-byte block in the range
|
| - // (last_block_added_ * kBlockSize, end_index), exclusive of the endpoints.
|
| - // If end_index <= the last index added (last_block_added_ * kBlockSize),
|
| - // this function does nothing.
|
| - //
|
| - // A partial block beginning anywhere up to (end_index - 1) is also added,
|
| - // unless it extends outside the end of the source data. Like AddAllBlocks(),
|
| - // this function computes the hash value for each of the blocks in question
|
| - // from scratch, so it is not a good option if the hash values have already
|
| - // been computed for some other purpose.
|
| - //
|
| - // Example: assume kBlockSize = 4, last_block_added_ = 1, and there are
|
| - // 14 bytes of source data.
|
| - // If AddAllBlocksThroughIndex(9) is invoked, then it will call AddBlock()
|
| - // only for block number 2 (at index 8).
|
| - // If, after that, AddAllBlocksThroughIndex(14) is invoked, it will not call
|
| - // AddBlock() at all, because block 3 (beginning at index 12) would
|
| - // fall outside the range of source data.
|
| - //
|
| - // VCDiffEngine::Encode (in vcdiffengine.cc) uses this function to
|
| - // add a whole range of data to a target hash when a COPY instruction
|
| - // is generated.
|
| - void AddAllBlocksThroughIndex(int end_index);
|
| -
|
| - // FindBestMatch takes a position within the unencoded target data
|
| - // (target_candidate_start) and the hash value of the kBlockSize bytes
|
| - // beginning at that position (hash_value). It attempts to find a matching
|
| - // set of bytes within the source (== dictionary) data, expanding
|
| - // the match both below and above the target block. It cannot expand
|
| - // the match outside the bounds of the source data, or below
|
| - // target_start within the target data, or past
|
| - // the end limit of (target_start + target_length).
|
| - //
|
| - // target_candidate_start is the start of the candidate block within the
|
| - // target data for which a match will be sought, while
|
| - // target_start (which is <= target_candidate_start)
|
| - // is the start of the target data that has yet to be encoded.
|
| - //
|
| - // If a match is found whose size is greater than the size
|
| - // of best_match, this function populates *best_match with the
|
| - // size, source_offset, and target_offset of the match found.
|
| - // best_match->source_offset() will contain the index of the start of the
|
| - // matching source data, plus starting_offset_
|
| - // (see description of starting_offset_ for details);
|
| - // best_match->target_offset() will contain the offset of the match
|
| - // beginning with target_start = offset 0, such that
|
| - // 0 <= best_match->target_offset()
|
| - // <= (target_candidate_start - target_start);
|
| - // and best_match->size() will contain the size of the match.
|
| - // If no such match is found, this function leaves *best_match unmodified.
|
| - //
|
| - // On calling FindBestMatch(), best_match must
|
| - // point to a valid Match object, and cannot be NULL.
|
| - // The same Match object can be passed
|
| - // when calling FindBestMatch() on a different BlockHash object
|
| - // for the same candidate data block, in order to find
|
| - // the best match possible across both objects. For example:
|
| - //
|
| - // open_vcdiff::BlockHash::Match best_match;
|
| - // uint32_t hash_value =
|
| - // RollingHash<BlockHash::kBlockSize>::Hash(target_candidate_start);
|
| - // bh1.FindBestMatch(hash_value,
|
| - // target_candidate_start,
|
| - // target_start,
|
| - // target_length,
|
| - // &best_match);
|
| - // bh2.FindBestMatch(hash_value,
|
| - // target_candidate_start,
|
| - // target_start,
|
| - // target_length,
|
| - // &best_match);
|
| - // if (best_size >= 0) {
|
| - // // a match was found; its size, source offset, and target offset
|
| - // // can be found in best_match
|
| - // }
|
| - //
|
| - // hash_value is passed as a separate parameter from target_candidate_start,
|
| - // (rather than calculated within FindBestMatch) in order to take
|
| - // advantage of the rolling hash, which quickly calculates the hash value
|
| - // of the block starting at target_candidate_start based on
|
| - // the known hash value of the block starting at (target_candidate_start - 1).
|
| - // See vcdiffengine.cc for more details.
|
| - //
|
| - // Example:
|
| - // kBlockSize: 4
|
| - // target text: "ANDREW LLOYD WEBBER"
|
| - // 1^ 5^2^ 3^
|
| - // dictionary: "INSURANCE : LLOYDS OF LONDON"
|
| - // 4^
|
| - // hashed dictionary blocks:
|
| - // "INSU", "RANC", "E : ", "LLOY", "DS O", "F LON"
|
| - //
|
| - // 1: target_start (beginning of unencoded data)
|
| - // 2: target_candidate_start (for the block "LLOY")
|
| - // 3: target_length (points one byte beyond the last byte of data.)
|
| - // 4: best_match->source_offset() (after calling FindBestMatch)
|
| - // 5: best_match->target_offset() (after calling FindBestMatch)
|
| - //
|
| - // Under these conditions, FindBestMatch will find a matching
|
| - // hashed dictionary block for "LLOY", and will extend the beginning of
|
| - // this match backwards by one byte, and the end of the match forwards
|
| - // by one byte, finding that the best match is " LLOYD"
|
| - // with best_match->source_offset() = 10
|
| - // (offset of " LLOYD" in the source string),
|
| - // best_match->target_offset() = 6
|
| - // (offset of " LLOYD" in the target string),
|
| - // and best_match->size() = 6.
|
| - //
|
| - void FindBestMatch(uint32_t hash_value,
|
| - const char* target_candidate_start,
|
| - const char* target_start,
|
| - size_t target_size,
|
| - Match* best_match) const;
|
| -
|
| - protected:
|
| - // FindBestMatch() will not process more than this number
|
| - // of matching hash entries.
|
| - //
|
| - // It is necessary to have a limit on the maximum number of matches
|
| - // that will be checked in order to avoid the worst-case performance
|
| - // possible if, for example, all the blocks in the dictionary have
|
| - // the same hash value. See the unit test SearchStringFindsTooManyMatches
|
| - // for an example of such a case. The encoder uses a loop in
|
| - // VCDiffEngine::Encode over each target byte, containing a loop in
|
| - // BlockHash::FindBestMatch over the number of matches (up to a maximum
|
| - // of the number of source blocks), containing two loops that extend
|
| - // the match forwards and backwards up to the number of source bytes.
|
| - // Total complexity in the worst case is
|
| - // O([target size] * source_size_ * source_size_
|
| - // Placing a limit on the possible number of matches checked changes this to
|
| - // O([target size] * source_size_ * kMaxMatchesToCheck)
|
| - //
|
| - // In empirical testing on real HTML text, using a block size of 4,
|
| - // the number of true matches per call to FindBestMatch() did not exceed 78;
|
| - // with a block size of 32, the number of matches did not exceed 3.
|
| - //
|
| - // The expected number of true matches scales super-linearly
|
| - // with the inverse of kBlockSize, but here a linear scale is used
|
| - // for block sizes smaller than 32.
|
| - static const int kMaxMatchesToCheck = (kBlockSize >= 32) ? 8 :
|
| - (8 * (32 / kBlockSize));
|
| -
|
| - // Do not skip more than this number of non-matching hash collisions
|
| - // to find the next matching entry in the hash chain.
|
| - static const int kMaxProbes = 16;
|
| -
|
| - // Internal routine which calculates a hash table size based on kBlockSize and
|
| - // the dictionary_size. Will return a power of two if successful, or 0 if an
|
| - // internal error occurs. Some calculations (such as GetHashTableIndex())
|
| - // depend on the table size being a power of two.
|
| - static const size_t CalcTableSize(const size_t dictionary_size);
|
| -
|
| - const size_t GetNumberOfBlocks() const {
|
| - return source_size_ / kBlockSize;
|
| - }
|
| -
|
| - // Use the lowest-order bits of the hash value
|
| - // as the index into the hash table.
|
| - int GetHashTableIndex(uint32_t hash_value) const {
|
| - return hash_value & (table_size() - 1);
|
| - }
|
| -
|
| - // The index within source_data_ of the next block
|
| - // for which AddBlock() should be called.
|
| - int NextIndexToAdd() const {
|
| - return (last_block_added_ + 1) * kBlockSize;
|
| - }
|
| -
|
| - static inline bool TooManyMatches(int* match_counter);
|
| -
|
| - const char* const source_data() { return source_data_; }
|
| - const size_t source_size() { return source_size_; }
|
| -
|
| - // Adds an entry to the hash table for one block of source data of length
|
| - // kBlockSize, starting at source_data_[block_number * kBlockSize],
|
| - // where block_number is always (last_block_added_ + 1). That is,
|
| - // AddBlock() must be called once for each block in source_data_
|
| - // in increasing order.
|
| - void AddBlock(uint32_t hash_value);
|
| -
|
| - // Calls AddBlock() for each complete kBlockSize-byte block between
|
| - // source_data_ and (source_data_ + source_size_). It is equivalent
|
| - // to calling AddAllBlocksThroughIndex(source_data + source_size).
|
| - // This function is called when Init(true) is invoked.
|
| - void AddAllBlocks();
|
| -
|
| - // Returns true if the contents of the kBlockSize-byte block
|
| - // beginning at block1 are identical to the contents of
|
| - // the block beginning at block2; false otherwise.
|
| - static bool BlockContentsMatch(const char* block1, const char* block2);
|
| -
|
| - // Finds the first block number within the hashed data
|
| - // that represents a match for the given hash value.
|
| - // Returns -1 if no match was found.
|
| - //
|
| - // Init() must have been called and returned true before using
|
| - // FirstMatchingBlock or NextMatchingBlock. No check is performed
|
| - // for this condition; the code will crash if this condition is violated.
|
| - //
|
| - // The hash table is initially populated with -1 (not found) values,
|
| - // so if this function is called before the hash table has been populated
|
| - // using AddAllBlocks() or AddBlock(), it will simply return -1
|
| - // for any value of hash_value.
|
| - int FirstMatchingBlock(uint32_t hash_value, const char* block_ptr) const;
|
| -
|
| - // Given a block number returned by FirstMatchingBlock()
|
| - // or by a previous call to NextMatchingBlock(), returns
|
| - // the next block number that matches the same hash value.
|
| - // Returns -1 if no match was found.
|
| - int NextMatchingBlock(int block_number, const char* block_ptr) const;
|
| -
|
| - // Inline versions of BlockContentsMatch and FirstMatchingBlock.
|
| - // These save the cost of a function call
|
| - // when these routines are called from within the module.
|
| - // The external (non-inlined) versions are called only by unit tests.
|
| - static inline bool BlockContentsMatchInline(const char* block1,
|
| - const char* block2);
|
| - inline int FirstMatchingBlockInline(uint32_t hash_value,
|
| - const char* block_ptr) const;
|
| -
|
| - // Walk through the hash entry chain, skipping over any false matches
|
| - // (for which the lowest bits of the fingerprints match,
|
| - // but the actual block data does not.) Returns the block number of
|
| - // the first true match found, or -1 if no true match was found.
|
| - // If block_number is a matching block, the function will return block_number
|
| - // without skipping to the next block.
|
| - int SkipNonMatchingBlocks(int block_number, const char* block_ptr) const;
|
| -
|
| - // Returns the number of bytes to the left of source_match_start
|
| - // that match the corresponding bytes to the left of target_match_start.
|
| - // Will not examine more than max_bytes bytes, which is to say that
|
| - // the return value will be in the range [0, max_bytes] inclusive.
|
| - static int MatchingBytesToLeft(const char* source_match_start,
|
| - const char* target_match_start,
|
| - int max_bytes);
|
| -
|
| - // Returns the number of bytes starting at source_match_end
|
| - // that match the corresponding bytes starting at target_match_end.
|
| - // Will not examine more than max_bytes bytes, which is to say that
|
| - // the return value will be in the range [0, max_bytes] inclusive.
|
| - static int MatchingBytesToRight(const char* source_match_end,
|
| - const char* target_match_end,
|
| - int max_bytes);
|
| -
|
| - // The protected functions BlockContentsMatch, FirstMatchingBlock,
|
| - // NextMatchingBlock, MatchingBytesToLeft, and MatchingBytesToRight
|
| - // should be made accessible to unit tests.
|
| - friend class BlockHashTest;
|
| -
|
| - private:
|
| - const char* const source_data_;
|
| - const size_t source_size_;
|
| -
|
| - // The size of this array is determined using CalcTableSize(). It has at
|
| - // least one element for each kBlockSize-byte block in the source data.
|
| - // GetHashTableIndex() returns an index into this table for a given hash
|
| - // value. The value of each element of hash_table_ is the lowest block
|
| - // number in the source data whose hash value would return the same value from
|
| - // GetHashTableIndex(), or -1 if there is no matching block. This value can
|
| - // then be used as an index into next_block_table_ to retrieve the entire set
|
| - // of matching block numbers.
|
| - std::vector<int> hash_table_;
|
| -
|
| - // An array containing one element for each source block. Each element is
|
| - // either -1 (== not found) or the index of the next block whose hash value
|
| - // would produce a matching result from GetHashTableIndex().
|
| - std::vector<int> next_block_table_;
|
| -
|
| - // This array has the same size as hash_table_. For every block number B that
|
| - // is referenced in hash_table_, last_block_table_[B] will contain the maximum
|
| - // block number among all blocks that have the same GetHashTableIndex() value
|
| - // as the block B. This number may be B itself. For a block number B' that
|
| - // is not referenced in hash_table_, the value of last_block_table_[B'] is -1.
|
| - // This table is used only while populating the hash table, not while looking
|
| - // up hash values in the table. Keeping track of the last block number in the
|
| - // chain allows us to construct the block chains as FIFO rather than LIFO
|
| - // lists, so that the match with the lowest index is returned first. This
|
| - // should result in a more compact encoding because the VCDIFF format favors
|
| - // smaller index values and repeated index values.
|
| - std::vector<int> last_block_table_;
|
| -
|
| - // The offset of the first byte of source data (the data at source_data_[0]).
|
| - // For the purpose of computing offsets, the source data and target data
|
| - // are considered to be concatenated -- not literally in a single memory
|
| - // buffer, but conceptually as described in the RFC.
|
| - // The first byte of the previously encoded target data
|
| - // has an offset that is equal to dictionary_size, i.e., just after
|
| - // the last byte of source data.
|
| - // For a hash of source (dictionary) data, starting_offset_ will be zero;
|
| - // for a hash of previously encoded target data, starting_offset_ will be
|
| - // equal to the dictionary size.
|
| - const int starting_offset_;
|
| -
|
| - // The last index added by AddBlock(). This determines the block number
|
| - // for successive calls to AddBlock(), and is also
|
| - // used to determine the starting block for AddAllBlocksThroughIndex().
|
| - int last_block_added_;
|
| -
|
| - // Making these private avoids implicit copy constructor & assignment operator
|
| - BlockHash(const BlockHash&); // NOLINT
|
| - void operator=(const BlockHash&);
|
| -};
|
| -
|
| -} // namespace open_vcdiff
|
| -
|
| -#endif // OPEN_VCDIFF_BLOCKHASH_H_
|
|
|