| Index: sdch/open_vcdiff/depot/opensource/open-vcdiff/src/rolling_hash.h
|
| ===================================================================
|
| --- sdch/open_vcdiff/depot/opensource/open-vcdiff/src/rolling_hash.h (revision 2678)
|
| +++ sdch/open_vcdiff/depot/opensource/open-vcdiff/src/rolling_hash.h (working copy)
|
| @@ -1,237 +0,0 @@
|
| -// Copyright 2007, 2008 Google Inc.
|
| -// Authors: Jeff Dean, Sanjay Ghemawat, Lincoln Smith
|
| -//
|
| -// Licensed under the Apache License, Version 2.0 (the "License");
|
| -// you may not use this file except in compliance with the License.
|
| -// You may obtain a copy of the License at
|
| -//
|
| -// http://www.apache.org/licenses/LICENSE-2.0
|
| -//
|
| -// Unless required by applicable law or agreed to in writing, software
|
| -// distributed under the License is distributed on an "AS IS" BASIS,
|
| -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| -// See the License for the specific language governing permissions and
|
| -// limitations under the License.
|
| -
|
| -#ifndef OPEN_VCDIFF_ROLLING_HASH_H_
|
| -#define OPEN_VCDIFF_ROLLING_HASH_H_
|
| -
|
| -#include <config.h>
|
| -#include <stdint.h> // uint32_t
|
| -#include "logging.h"
|
| -
|
| -namespace open_vcdiff {
|
| -
|
| -// Rabin-Karp hasher module -- this is a faster version with different
|
| -// constants, so it's not quite Rabin-Karp fingerprinting, but its behavior is
|
| -// close enough for most applications.
|
| -
|
| -// Definitions common to all hash window sizes.
|
| -class RollingHashUtil {
|
| - public:
|
| - // Multiplier for incremental hashing. The compiler should be smart enough to
|
| - // convert (val * kMult) into ((val << 8) + val).
|
| - static const uint32_t kMult = 257;
|
| -
|
| - // All hashes are returned modulo "kBase". Current implementation requires
|
| - // kBase <= 2^32/kMult to avoid overflow. Also, kBase must be a power of two
|
| - // so that we can compute modulus efficiently.
|
| - static const uint32_t kBase = (1 << 23);
|
| -
|
| - // Returns operand % kBase, assuming that kBase is a power of two.
|
| - static inline uint32_t ModBase(uint32_t operand) {
|
| - return operand & (kBase - 1);
|
| - }
|
| -
|
| - // Given an unsigned integer "operand", returns an unsigned integer "result"
|
| - // such that
|
| - // result < kBase
|
| - // and
|
| - // ModBase(operand + result) == 0
|
| - static inline uint32_t FindModBaseInverse(uint32_t operand) {
|
| - // The subtraction (0 - operand) produces an unsigned underflow for any
|
| - // operand except 0. The underflow results in a (very large) unsigned
|
| - // number. Binary subtraction is used instead of unary negation because
|
| - // some compilers (e.g. Visual Studio 7+) produce a warning if an unsigned
|
| - // value is negated.
|
| - //
|
| - // The C++ mod operation (operand % kBase) may produce different results for
|
| - // different compilers if operand is negative. That is not a problem in
|
| - // this case, since all numbers used are unsigned, and ModBase does its work
|
| - // using bitwise arithmetic rather than the % operator.
|
| - return ModBase(uint32_t(0) - operand);
|
| - }
|
| -
|
| - // Here's the heart of the hash algorithm. Start with a partial_hash value of
|
| - // 0, and run this HashStep once against each byte in the data window to be
|
| - // hashed. The result will be the hash value for the entire data window. The
|
| - // Hash() function, below, does exactly this, albeit with some refinements.
|
| - static inline uint32_t HashStep(uint32_t partial_hash,
|
| - unsigned char next_byte) {
|
| - return ModBase((partial_hash * kMult) + next_byte);
|
| - }
|
| -
|
| - // Use this function to start computing a new hash value based on the first
|
| - // two bytes in the window. It is equivalent to calling
|
| - // HashStep(HashStep(0, ptr[0]), ptr[1])
|
| - // but takes advantage of the fact that the maximum value of
|
| - // (ptr[0] * kMult) + ptr[1] is not large enough to exceed kBase, thus
|
| - // avoiding an unnecessary ModBase operation.
|
| - static inline uint32_t HashFirstTwoBytes(const char* ptr) {
|
| - return (static_cast<unsigned char>(ptr[0]) * kMult)
|
| - + static_cast<unsigned char>(ptr[1]);
|
| - }
|
| - private:
|
| - // Making these private avoids copy constructor and assignment operator.
|
| - // No objects of this type should be constructed.
|
| - RollingHashUtil();
|
| - RollingHashUtil(const RollingHashUtil&); // NOLINT
|
| - void operator=(const RollingHashUtil&);
|
| -};
|
| -
|
| -// window_size must be >= 2.
|
| -template<int window_size>
|
| -class RollingHash {
|
| - public:
|
| - // Perform global initialization that is required in order to instantiate a
|
| - // RollingHash. This function *must* be called (preferably on startup) by any
|
| - // program that uses a RollingHash. It is harmless to call this function more
|
| - // than once. It is not thread-safe, but calling it from two different
|
| - // threads at the same time can only cause a memory leak, not incorrect
|
| - // behavior. Make sure to call it before spawning any threads that could use
|
| - // RollingHash. The function returns true if initialization succeeds, or
|
| - // false if initialization fails, in which case the caller should not proceed
|
| - // to construct any objects of type RollingHash.
|
| - static bool Init();
|
| -
|
| - // Initialize hasher to maintain a window of the specified size. You need an
|
| - // instance of this type to use UpdateHash(), but Hash() does not depend on
|
| - // remove_table_, so it is static.
|
| - RollingHash() {
|
| - if (!remove_table_) {
|
| - LOG(DFATAL) << "RollingHash object instantiated"
|
| - " before calling RollingHash::Init()" << LOG_ENDL;
|
| - }
|
| - }
|
| -
|
| - // Compute a hash of the window "ptr[0, window_size - 1]".
|
| - static uint32_t Hash(const char* ptr) {
|
| - uint32_t h = RollingHashUtil::HashFirstTwoBytes(ptr);
|
| - for (int i = 2; i < window_size; ++i) {
|
| - h = RollingHashUtil::HashStep(h, ptr[i]);
|
| - }
|
| - return h;
|
| - }
|
| -
|
| - // Update a hash by removing the oldest byte and adding a new byte.
|
| - //
|
| - // UpdateHash takes the hash value of buffer[0] ... buffer[window_size -1]
|
| - // along with the value of buffer[0] (the "old_first_byte" argument)
|
| - // and the value of buffer[window_size] (the "new_last_byte" argument).
|
| - // It quickly computes the hash value of buffer[1] ... buffer[window_size]
|
| - // without having to run Hash() on the entire window.
|
| - //
|
| - // The larger the window, the more advantage comes from using UpdateHash()
|
| - // (which runs in time independent of window_size) instead of Hash().
|
| - // Each time window_size doubles, the time to execute Hash() also doubles,
|
| - // while the time to execute UpdateHash() remains constant. Empirical tests
|
| - // have borne out this statement.
|
| - uint32_t UpdateHash(uint32_t old_hash,
|
| - const char old_first_byte,
|
| - const char new_last_byte) const {
|
| - uint32_t partial_hash = RemoveFirstByteFromHash(old_hash, old_first_byte);
|
| - return RollingHashUtil::HashStep(partial_hash, new_last_byte);
|
| - }
|
| -
|
| - protected:
|
| - // Given a full hash value for buffer[0] ... buffer[window_size -1], plus the
|
| - // value of the first byte buffer[0], this function returns a *partial* hash
|
| - // value for buffer[1] ... buffer[window_size -1]. See the comments in
|
| - // Init(), below, for a description of how the contents of remove_table_ are
|
| - // computed.
|
| - static uint32_t RemoveFirstByteFromHash(uint32_t full_hash,
|
| - unsigned char first_byte) {
|
| - return RollingHashUtil::ModBase(full_hash + remove_table_[first_byte]);
|
| - }
|
| -
|
| - private:
|
| - // We keep a table that maps from any byte "b" to
|
| - // (- b * pow(kMult, window_size - 1)) % kBase
|
| - static const uint32_t* remove_table_;
|
| -};
|
| -
|
| -// For each window_size, fill a 256-entry table such that
|
| -// the hash value of buffer[0] ... buffer[window_size - 1]
|
| -// + remove_table_[buffer[0]]
|
| -// == the hash value of buffer[1] ... buffer[window_size - 1]
|
| -// See the comments in Init(), below, for a description of how the contents of
|
| -// remove_table_ are computed.
|
| -template<int window_size>
|
| -const uint32_t* RollingHash<window_size>::remove_table_ = NULL;
|
| -
|
| -// Init() checks to make sure that the static object remove_table_ has been
|
| -// initialized; if not, it does the considerable work of populating it. Once
|
| -// it's ready, the table can be used for any number of RollingHash objects of
|
| -// the same window_size.
|
| -//
|
| -template<int window_size>
|
| -bool RollingHash<window_size>::Init() {
|
| - if (window_size < 2) {
|
| - LOG(ERROR) << "RollingHash window size " << window_size
|
| - << " is too small" << LOG_ENDL;
|
| - return false;
|
| - }
|
| - if (remove_table_ == NULL) {
|
| - // The new object is placed into a local pointer instead of directly into
|
| - // remove_table_, for two reasons:
|
| - // 1. remove_table_ is a pointer to const. The table is populated using
|
| - // the non-const local pointer and then assigned to the global const
|
| - // pointer once it's ready.
|
| - // 2. No other thread will ever see remove_table_ pointing to a
|
| - // partially-initialized table. If two threads happen to call Init()
|
| - // at the same time, two tables with the same contents may be created
|
| - // (causing a memory leak), but the results will be consistent
|
| - // no matter which of the two tables is used.
|
| - uint32_t* new_remove_table = new uint32_t[256];
|
| - // Compute multiplier. Concisely, it is:
|
| - // pow(kMult, (window_size - 1)) % kBase,
|
| - // but we compute the power in integer form.
|
| - uint32_t multiplier = 1;
|
| - for (int i = 0; i < window_size - 1; ++i) {
|
| - multiplier =
|
| - RollingHashUtil::ModBase(multiplier * RollingHashUtil::kMult);
|
| - }
|
| - // For each character removed_byte, compute
|
| - // remove_table_[removed_byte] ==
|
| - // (- (removed_byte * pow(kMult, (window_size - 1)))) % kBase
|
| - // where the power operator "pow" is taken in integer form.
|
| - //
|
| - // If you take a hash value fp representing the hash of
|
| - // buffer[0] ... buffer[window_size - 1]
|
| - // and add the value of remove_table_[buffer[0]] to it, the result will be
|
| - // a partial hash value for
|
| - // buffer[1] ... buffer[window_size - 1]
|
| - // that is to say, it no longer includes buffer[0].
|
| - //
|
| - // The following byte at buffer[window_size] can then be merged with this
|
| - // partial hash value to arrive quickly at the hash value for a window that
|
| - // has advanced by one byte, to
|
| - // buffer[1] ... buffer[window_size]
|
| - // In fact, that is precisely what happens in UpdateHash, above.
|
| - uint32_t byte_times_multiplier = 0;
|
| - for (int removed_byte = 0; removed_byte < 256; ++removed_byte) {
|
| - new_remove_table[removed_byte] =
|
| - RollingHashUtil::FindModBaseInverse(byte_times_multiplier);
|
| - // Iteratively adding the multiplier in this loop is equivalent to
|
| - // computing (removed_byte * multiplier), and is faster
|
| - byte_times_multiplier =
|
| - RollingHashUtil::ModBase(byte_times_multiplier + multiplier);
|
| - }
|
| - remove_table_ = new_remove_table;
|
| - }
|
| - return true;
|
| -}
|
| -
|
| -} // namespace open_vcdiff
|
| -
|
| -#endif // OPEN_VCDIFF_ROLLING_HASH_H_
|
|
|