| Index: sdch/open_vcdiff/depot/opensource/open-vcdiff/src/blockhash.cc
|
| ===================================================================
|
| --- sdch/open_vcdiff/depot/opensource/open-vcdiff/src/blockhash.cc (revision 2678)
|
| +++ sdch/open_vcdiff/depot/opensource/open-vcdiff/src/blockhash.cc (working copy)
|
| @@ -1,434 +0,0 @@
|
| -// Copyright 2006, 2008 Google Inc.
|
| -// Authors: Chandra Chereddi, Lincoln Smith
|
| -//
|
| -// Licensed under the Apache License, Version 2.0 (the "License");
|
| -// you may not use this file except in compliance with the License.
|
| -// You may obtain a copy of the License at
|
| -//
|
| -// http://www.apache.org/licenses/LICENSE-2.0
|
| -//
|
| -// Unless required by applicable law or agreed to in writing, software
|
| -// distributed under the License is distributed on an "AS IS" BASIS,
|
| -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| -// See the License for the specific language governing permissions and
|
| -// limitations under the License.
|
| -
|
| -#include <config.h>
|
| -#include "blockhash.h"
|
| -#include "compile_assert.h"
|
| -#include <stdint.h> // uint32_t
|
| -#include "logging.h"
|
| -#include "rolling_hash.h"
|
| -
|
| -namespace open_vcdiff {
|
| -
|
| -typedef unsigned long uword_t; // a machine word NOLINT
|
| -
|
| -BlockHash::BlockHash(const char* source_data,
|
| - size_t source_size,
|
| - int starting_offset)
|
| - : source_data_(source_data),
|
| - source_size_(source_size),
|
| - starting_offset_(starting_offset),
|
| - last_block_added_(-1) {
|
| -}
|
| -
|
| -BlockHash::~BlockHash() { }
|
| -
|
| -// kBlockSize must be at least 2 to be meaningful. Since it's a compile-time
|
| -// constant, check its value at compile time rather than wasting CPU cycles
|
| -// on runtime checks.
|
| -COMPILE_ASSERT(BlockHash::kBlockSize >= 2, kBlockSize_must_be_at_least_2);
|
| -
|
| -// kBlockSize is required to be a power of 2 because multiplication
|
| -// (n * kBlockSize), division (n / kBlockSize) and MOD (n % kBlockSize)
|
| -// are commonly-used operations. If kBlockSize is a compile-time
|
| -// constant and a power of 2, the compiler can convert these three operations
|
| -// into bit-shift (>> or <<) and bitwise-AND (&) operations, which are much
|
| -// more efficient than executing full integer multiply, divide, or remainder
|
| -// instructions.
|
| -COMPILE_ASSERT((BlockHash::kBlockSize & (BlockHash::kBlockSize - 1)) == 0,
|
| - kBlockSize_must_be_a_power_of_2);
|
| -
|
| -bool BlockHash::Init(bool populate_hash_table) {
|
| - if (!hash_table_.empty() ||
|
| - !next_block_table_.empty() ||
|
| - !last_block_table_.empty()) {
|
| - LOG(DFATAL) << "Init() called twice for same BlockHash object" << LOG_ENDL;
|
| - return false;
|
| - }
|
| - const size_t table_size = CalcTableSize(source_size_);
|
| - if (table_size == 0) {
|
| - LOG(DFATAL) << "Error finding table size for source size " << source_size_
|
| - << LOG_ENDL;
|
| - return false;
|
| - }
|
| - hash_table_.resize(table_size, -1);
|
| - next_block_table_.resize(GetNumberOfBlocks(), -1);
|
| - last_block_table_.resize(GetNumberOfBlocks(), -1);
|
| - if (populate_hash_table) {
|
| - AddAllBlocks();
|
| - }
|
| - return true;
|
| -}
|
| -
|
| -const BlockHash* BlockHash::CreateDictionaryHash(const char* dictionary_data,
|
| - size_t dictionary_size) {
|
| - BlockHash* new_dictionary_hash = new BlockHash(dictionary_data,
|
| - dictionary_size,
|
| - 0);
|
| - if (!new_dictionary_hash->Init(/* populate_hash_table = */ true)) {
|
| - delete new_dictionary_hash;
|
| - return NULL;
|
| - } else {
|
| - return new_dictionary_hash;
|
| - }
|
| -}
|
| -
|
| -BlockHash* BlockHash::CreateTargetHash(const char* target_data,
|
| - size_t target_size,
|
| - size_t dictionary_size) {
|
| - BlockHash* new_target_hash = new BlockHash(target_data,
|
| - target_size,
|
| - static_cast<int>(dictionary_size));
|
| - if (!new_target_hash->Init(/* populate_hash_table = */ false)) {
|
| - delete new_target_hash;
|
| - return NULL;
|
| - } else {
|
| - return new_target_hash;
|
| - }
|
| -}
|
| -
|
| -// Returns zero if an error occurs.
|
| -const size_t BlockHash::CalcTableSize(const size_t dictionary_size) {
|
| - // Overallocate the hash table by making it the same size (in bytes)
|
| - // as the source data. This is a trade-off between space and time:
|
| - // the empty entries in the hash table will reduce the
|
| - // probability of a hash collision to (sizeof(int) / kblockSize),
|
| - // and so save time comparing false matches.
|
| - const size_t min_size = (dictionary_size / sizeof(int)) + 1; // NOLINT
|
| - size_t table_size = 1;
|
| - // Find the smallest power of 2 that is >= min_size, and assign
|
| - // that value to table_size.
|
| - while (table_size < min_size) {
|
| - table_size <<= 1;
|
| - // Guard against an infinite loop
|
| - if (table_size <= 0) {
|
| - LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
|
| - << dictionary_size
|
| - << "): resulting table_size " << table_size
|
| - << " is zero or negative" << LOG_ENDL;
|
| - return 0;
|
| - }
|
| - }
|
| - // Check size sanity
|
| - if ((table_size & (table_size - 1)) != 0) {
|
| - LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
|
| - << dictionary_size
|
| - << "): resulting table_size " << table_size
|
| - << " is not a power of 2" << LOG_ENDL;
|
| - return 0;
|
| - }
|
| - // The loop above tries to find the smallest power of 2 that is >= min_size.
|
| - // That value must lie somewhere between min_size and (min_size * 2),
|
| - // except for the case (dictionary_size == 0, table_size == 1).
|
| - if ((dictionary_size > 0) && (table_size > (min_size * 2))) {
|
| - LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
|
| - << dictionary_size
|
| - << "): resulting table_size " << table_size
|
| - << " is too large" << LOG_ENDL;
|
| - return 0;
|
| - }
|
| - return table_size;
|
| -}
|
| -
|
| -// If the hash value is already available from the rolling hash,
|
| -// call this function to save time.
|
| -void BlockHash::AddBlock(uint32_t hash_value) {
|
| - if (hash_table_.empty()) {
|
| - LOG(DFATAL) << "BlockHash::AddBlock() called before BlockHash::Init()"
|
| - << LOG_ENDL;
|
| - return;
|
| - }
|
| - // The initial value of last_block_added_ is -1.
|
| - int block_number = last_block_added_ + 1;
|
| - const int total_blocks =
|
| - static_cast<int>(source_size_ / kBlockSize); // round down
|
| - if (block_number >= total_blocks) {
|
| - LOG(DFATAL) << "BlockHash::AddBlock() called"
|
| - " with block number " << block_number
|
| - << " that is past last block " << (total_blocks - 1)
|
| - << LOG_ENDL;
|
| - return;
|
| - }
|
| - if (next_block_table_[block_number] != -1) {
|
| - LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "
|
| - "block number = " << block_number
|
| - << ", next block should be -1 but is "
|
| - << next_block_table_[block_number] << LOG_ENDL;
|
| - return;
|
| - }
|
| - const int hash_table_index = GetHashTableIndex(hash_value);
|
| - const int first_matching_block = hash_table_[hash_table_index];
|
| - if (first_matching_block < 0) {
|
| - // This is the first entry with this hash value
|
| - hash_table_[hash_table_index] = block_number;
|
| - last_block_table_[block_number] = block_number;
|
| - } else {
|
| - // Add this entry at the end of the chain of matching blocks
|
| - const int last_matching_block = last_block_table_[first_matching_block];
|
| - if (next_block_table_[last_matching_block] != -1) {
|
| - LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "
|
| - "first matching block = " << first_matching_block
|
| - << ", last matching block = " << last_matching_block
|
| - << ", next block should be -1 but is "
|
| - << next_block_table_[last_matching_block] << LOG_ENDL;
|
| - return;
|
| - }
|
| - next_block_table_[last_matching_block] = block_number;
|
| - last_block_table_[first_matching_block] = block_number;
|
| - }
|
| - last_block_added_ = block_number;
|
| -}
|
| -
|
| -void BlockHash::AddAllBlocks() {
|
| - AddAllBlocksThroughIndex(static_cast<int>(source_size_));
|
| -}
|
| -
|
| -void BlockHash::AddAllBlocksThroughIndex(int end_index) {
|
| - if (end_index > static_cast<int>(source_size_)) {
|
| - LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"
|
| - " with index " << end_index
|
| - << " higher than end index " << source_size_ << LOG_ENDL;
|
| - return;
|
| - }
|
| - const int last_index_added = last_block_added_ * kBlockSize;
|
| - if (end_index <= last_index_added) {
|
| - LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"
|
| - " with index " << end_index
|
| - << " <= last index added ( " << last_index_added
|
| - << ")" << LOG_ENDL;
|
| - return;
|
| - }
|
| - int end_limit = end_index;
|
| - // Don't allow reading any indices at or past source_size_.
|
| - // The Hash function extends (kBlockSize - 1) bytes past the index,
|
| - // so leave a margin of that size.
|
| - int last_legal_hash_index = static_cast<int>(source_size() - kBlockSize);
|
| - if (end_limit > last_legal_hash_index) {
|
| - end_limit = last_legal_hash_index + 1;
|
| - }
|
| - const char* block_ptr = source_data() + NextIndexToAdd();
|
| - const char* const end_ptr = source_data() + end_limit;
|
| - while (block_ptr < end_ptr) {
|
| - AddBlock(RollingHash<kBlockSize>::Hash(block_ptr));
|
| - block_ptr += kBlockSize;
|
| - }
|
| -}
|
| -
|
| -COMPILE_ASSERT((BlockHash::kBlockSize % sizeof(uword_t)) == 0,
|
| - kBlockSize_must_be_a_multiple_of_machine_word_size);
|
| -
|
| -// A recursive template to compare a fixed number
|
| -// of (possibly unaligned) machine words starting
|
| -// at addresses block1 and block2. Returns true or false
|
| -// depending on whether an exact match was found.
|
| -template<int number_of_words>
|
| -inline bool CompareWholeWordValues(const char* block1,
|
| - const char* block2) {
|
| - return CompareWholeWordValues<1>(block1, block2) &&
|
| - CompareWholeWordValues<number_of_words - 1>(block1 + sizeof(uword_t),
|
| - block2 + sizeof(uword_t));
|
| -}
|
| -
|
| -// The base of the recursive template: compare one pair of machine words.
|
| -template<>
|
| -inline bool CompareWholeWordValues<1>(const char* word1,
|
| - const char* word2) {
|
| - uword_t aligned_word1, aligned_word2;
|
| - memcpy(&aligned_word1, word1, sizeof(aligned_word1));
|
| - memcpy(&aligned_word2, word2, sizeof(aligned_word2));
|
| - return aligned_word1 == aligned_word2;
|
| -}
|
| -
|
| -// Calling BlockContentsMatch is equivalent to the following memcmp call:
|
| -//
|
| -// memcmp(block1, block2, BlockHash::kBlockSize);
|
| -//
|
| -// This function will only be called for two blocks whose hash values match.
|
| -// For that reason, it is very likely that either the blocks are identical,
|
| -// or they have different first bytes. (Given a good hash function, if two
|
| -// non-identical blocks have the same hash value, the probability that their
|
| -// first bytes differ is the same as the probability of two random bytes
|
| -// differing: 255/256, or rather smaller for text data.) So optimize
|
| -// for these two common cases.
|
| -//
|
| -// A block must be composed of an integral number of machine words
|
| -// (uword_t values.) This function takes advantage of that fact
|
| -// by comparing the blocks as series of (possibly unaligned) word values.
|
| -// A word-sized comparison can be performed as a single
|
| -// machine instruction. Comparing words instead of bytes means that,
|
| -// on a 64-bit platform, this function will use 8 times fewer test-and-branch
|
| -// instructions than a byte-by-byte comparison. Even with the extra
|
| -// cost of the calls to memcpy, this method is still at least twice as fast
|
| -// as memcmp (measured using gcc on a 64-bit platform, with a block size
|
| -// of 32.) For blocks with identical contents (a common case), this method
|
| -// is over six times faster than memcmp.
|
| -inline bool BlockHash::BlockContentsMatchInline(const char* block1,
|
| - const char* block2) {
|
| - // Optimize for mismatch in first byte, as described above
|
| - if (*block1 != *block2) {
|
| - return false;
|
| - }
|
| - static const size_t kWordsPerBlock = BlockHash::kBlockSize / sizeof(uword_t);
|
| - return CompareWholeWordValues<kWordsPerBlock>(block1, block2);
|
| -}
|
| -
|
| -bool BlockHash::BlockContentsMatch(const char* block1, const char* block2) {
|
| - return BlockContentsMatchInline(block1, block2);
|
| -}
|
| -
|
| -inline int BlockHash::SkipNonMatchingBlocks(int block_number,
|
| - const char* block_ptr) const {
|
| - int probes = 0;
|
| - while ((block_number != -1) &&
|
| - !BlockContentsMatchInline(block_ptr,
|
| - &source_data_[block_number * kBlockSize])) {
|
| - if (++probes > kMaxProbes) {
|
| - return -1; // Avoid too much chaining
|
| - }
|
| - block_number = next_block_table_[block_number];
|
| - }
|
| - return block_number;
|
| -}
|
| -
|
| -// Init() must have been called and returned true before using
|
| -// FirstMatchingBlock or NextMatchingBlock. No check is performed
|
| -// for this condition; the code will crash if this condition is violated.
|
| -inline int BlockHash::FirstMatchingBlockInline(uint32_t hash_value,
|
| - const char* block_ptr) const {
|
| - return SkipNonMatchingBlocks(hash_table_[GetHashTableIndex(hash_value)],
|
| - block_ptr);
|
| -}
|
| -
|
| -int BlockHash::FirstMatchingBlock(uint32_t hash_value,
|
| - const char* block_ptr) const {
|
| - return FirstMatchingBlockInline(hash_value, block_ptr);
|
| -}
|
| -
|
| -int BlockHash::NextMatchingBlock(int block_number,
|
| - const char* block_ptr) const {
|
| - if (static_cast<size_t>(block_number) >= GetNumberOfBlocks()) {
|
| - LOG(DFATAL) << "NextMatchingBlock called for invalid block number "
|
| - << block_number << LOG_ENDL;
|
| - return -1;
|
| - }
|
| - return SkipNonMatchingBlocks(next_block_table_[block_number], block_ptr);
|
| -}
|
| -
|
| -// Keep a count of the number of matches found. This will throttle the
|
| -// number of iterations in FindBestMatch. For example, if the entire
|
| -// dictionary is made up of spaces (' ') and the search string is also
|
| -// made up of spaces, there will be one match for each block in the
|
| -// dictionary.
|
| -inline bool BlockHash::TooManyMatches(int* match_counter) {
|
| - ++(*match_counter);
|
| - return (*match_counter) > kMaxMatchesToCheck;
|
| -}
|
| -
|
| -// Returns the number of bytes to the left of source_match_start
|
| -// that match the corresponding bytes to the left of target_match_start.
|
| -// Will not examine more than max_bytes bytes, which is to say that
|
| -// the return value will be in the range [0, max_bytes] inclusive.
|
| -int BlockHash::MatchingBytesToLeft(const char* source_match_start,
|
| - const char* target_match_start,
|
| - int max_bytes) {
|
| - const char* source_ptr = source_match_start;
|
| - const char* target_ptr = target_match_start;
|
| - int bytes_found = 0;
|
| - while (bytes_found < max_bytes) {
|
| - --source_ptr;
|
| - --target_ptr;
|
| - if (*source_ptr != *target_ptr) {
|
| - break;
|
| - }
|
| - ++bytes_found;
|
| - }
|
| - return bytes_found;
|
| -}
|
| -
|
| -// Returns the number of bytes starting at source_match_end
|
| -// that match the corresponding bytes starting at target_match_end.
|
| -// Will not examine more than max_bytes bytes, which is to say that
|
| -// the return value will be in the range [0, max_bytes] inclusive.
|
| -int BlockHash::MatchingBytesToRight(const char* source_match_end,
|
| - const char* target_match_end,
|
| - int max_bytes) {
|
| - const char* source_ptr = source_match_end;
|
| - const char* target_ptr = target_match_end;
|
| - int bytes_found = 0;
|
| - while ((bytes_found < max_bytes) && (*source_ptr == *target_ptr)) {
|
| - ++bytes_found;
|
| - ++source_ptr;
|
| - ++target_ptr;
|
| - }
|
| - return bytes_found;
|
| -}
|
| -
|
| -// No NULL checks are performed on the pointer arguments. The caller
|
| -// must guarantee that none of the arguments is NULL, or a crash will occur.
|
| -//
|
| -// The vast majority of calls to FindBestMatch enter the loop *zero* times,
|
| -// which is to say that most candidate blocks find no matches in the dictionary.
|
| -// The important sections for optimization are therefore the code outside the
|
| -// loop and the code within the loop conditions. Keep this to a minimum.
|
| -void BlockHash::FindBestMatch(uint32_t hash_value,
|
| - const char* target_candidate_start,
|
| - const char* target_start,
|
| - size_t target_size,
|
| - Match* best_match) const {
|
| - int match_counter = 0;
|
| - for (int block_number = FirstMatchingBlockInline(hash_value,
|
| - target_candidate_start);
|
| - (block_number >= 0) && !TooManyMatches(&match_counter);
|
| - block_number = NextMatchingBlock(block_number, target_candidate_start)) {
|
| - int source_match_offset = block_number * kBlockSize;
|
| - const int source_match_end = source_match_offset + kBlockSize;
|
| -
|
| - int target_match_offset =
|
| - static_cast<int>(target_candidate_start - target_start);
|
| - const int target_match_end = target_match_offset + kBlockSize;
|
| -
|
| - size_t match_size = kBlockSize;
|
| - {
|
| - // Extend match start towards beginning of unencoded data
|
| - const int limit_bytes_to_left = std::min(source_match_offset,
|
| - target_match_offset);
|
| - const int matching_bytes_to_left =
|
| - MatchingBytesToLeft(source_data_ + source_match_offset,
|
| - target_start + target_match_offset,
|
| - limit_bytes_to_left);
|
| - source_match_offset -= matching_bytes_to_left;
|
| - target_match_offset -= matching_bytes_to_left;
|
| - match_size += matching_bytes_to_left;
|
| - }
|
| - {
|
| - // Extend match end towards end of unencoded data
|
| - const size_t source_bytes_to_right = source_size_ - source_match_end;
|
| - const size_t target_bytes_to_right = target_size - target_match_end;
|
| - const size_t limit_bytes_to_right = std::min(source_bytes_to_right,
|
| - target_bytes_to_right);
|
| - match_size +=
|
| - MatchingBytesToRight(source_data_ + source_match_end,
|
| - target_start + target_match_end,
|
| - static_cast<int>(limit_bytes_to_right));
|
| - }
|
| - // Update in/out parameter if the best match found was better
|
| - // than any match already stored in *best_match.
|
| - best_match->ReplaceIfBetterMatch(match_size,
|
| - source_match_offset + starting_offset_,
|
| - target_match_offset);
|
| - }
|
| -}
|
| -
|
| -} // namespace open_vcdiff
|
|
|