| OLD | NEW |
| (Empty) |
| 1 // Copyright 2006, 2008 Google Inc. | |
| 2 // Authors: Chandra Chereddi, Lincoln Smith | |
| 3 // | |
| 4 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 // you may not use this file except in compliance with the License. | |
| 6 // You may obtain a copy of the License at | |
| 7 // | |
| 8 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 9 // | |
| 10 // Unless required by applicable law or agreed to in writing, software | |
| 11 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 // See the License for the specific language governing permissions and | |
| 14 // limitations under the License. | |
| 15 | |
#include <config.h>
#include "blockhash.h"
#include <stdint.h>  // uint32_t
#include <string.h>  // memcpy
#include <algorithm>  // std::min
#include "compile_assert.h"
#include "logging.h"
#include "rolling_hash.h"
| 22 | |
| 23 namespace open_vcdiff { | |
| 24 | |
| 25 typedef unsigned long uword_t; // a machine word NOLINT | |
| 26 | |
// Stores the pointer and sizes only; does not copy or take ownership of
// source_data, which must outlive this BlockHash object.
// starting_offset is 0 for a dictionary hash (see CreateDictionaryHash)
// and the dictionary size for a target hash (see CreateTargetHash); it is
// added to source-local offsets reported through FindBestMatch.
BlockHash::BlockHash(const char* source_data,
                     size_t source_size,
                     int starting_offset)
    : source_data_(source_data),
      source_size_(source_size),
      starting_offset_(starting_offset),
      last_block_added_(-1) {  // -1: no blocks have been added yet
}
| 35 | |
// Nothing explicit to release: the tables are standard containers and
// source_data_ is not owned by this object.
BlockHash::~BlockHash() { }
| 37 | |
// kBlockSize must be at least 2 to be meaningful.  Since it's a compile-time
// constant, check its value at compile time rather than wasting CPU cycles
// on runtime checks.
COMPILE_ASSERT(BlockHash::kBlockSize >= 2, kBlockSize_must_be_at_least_2);

// kBlockSize is required to be a power of 2 because multiplication
// (n * kBlockSize), division (n / kBlockSize) and MOD (n % kBlockSize)
// are commonly-used operations.  If kBlockSize is a compile-time
// constant and a power of 2, the compiler can convert these three operations
// into bit-shift (>> or <<) and bitwise-AND (&) operations, which are much
// more efficient than executing full integer multiply, divide, or remainder
// instructions.  The expression (x & (x - 1)) == 0 is true exactly when
// x (>= 1) is a power of 2.
COMPILE_ASSERT((BlockHash::kBlockSize & (BlockHash::kBlockSize - 1)) == 0,
               kBlockSize_must_be_a_power_of_2);
| 52 | |
| 53 bool BlockHash::Init(bool populate_hash_table) { | |
| 54 if (!hash_table_.empty() || | |
| 55 !next_block_table_.empty() || | |
| 56 !last_block_table_.empty()) { | |
| 57 LOG(DFATAL) << "Init() called twice for same BlockHash object" << LOG_ENDL; | |
| 58 return false; | |
| 59 } | |
| 60 const size_t table_size = CalcTableSize(source_size_); | |
| 61 if (table_size == 0) { | |
| 62 LOG(DFATAL) << "Error finding table size for source size " << source_size_ | |
| 63 << LOG_ENDL; | |
| 64 return false; | |
| 65 } | |
| 66 hash_table_.resize(table_size, -1); | |
| 67 next_block_table_.resize(GetNumberOfBlocks(), -1); | |
| 68 last_block_table_.resize(GetNumberOfBlocks(), -1); | |
| 69 if (populate_hash_table) { | |
| 70 AddAllBlocks(); | |
| 71 } | |
| 72 return true; | |
| 73 } | |
| 74 | |
| 75 const BlockHash* BlockHash::CreateDictionaryHash(const char* dictionary_data, | |
| 76 size_t dictionary_size) { | |
| 77 BlockHash* new_dictionary_hash = new BlockHash(dictionary_data, | |
| 78 dictionary_size, | |
| 79 0); | |
| 80 if (!new_dictionary_hash->Init(/* populate_hash_table = */ true)) { | |
| 81 delete new_dictionary_hash; | |
| 82 return NULL; | |
| 83 } else { | |
| 84 return new_dictionary_hash; | |
| 85 } | |
| 86 } | |
| 87 | |
| 88 BlockHash* BlockHash::CreateTargetHash(const char* target_data, | |
| 89 size_t target_size, | |
| 90 size_t dictionary_size) { | |
| 91 BlockHash* new_target_hash = new BlockHash(target_data, | |
| 92 target_size, | |
| 93 static_cast<int>(dictionary_size)); | |
| 94 if (!new_target_hash->Init(/* populate_hash_table = */ false)) { | |
| 95 delete new_target_hash; | |
| 96 return NULL; | |
| 97 } else { | |
| 98 return new_target_hash; | |
| 99 } | |
| 100 } | |
| 101 | |
| 102 // Returns zero if an error occurs. | |
| 103 const size_t BlockHash::CalcTableSize(const size_t dictionary_size) { | |
| 104 // Overallocate the hash table by making it the same size (in bytes) | |
| 105 // as the source data. This is a trade-off between space and time: | |
| 106 // the empty entries in the hash table will reduce the | |
| 107 // probability of a hash collision to (sizeof(int) / kblockSize), | |
| 108 // and so save time comparing false matches. | |
| 109 const size_t min_size = (dictionary_size / sizeof(int)) + 1; // NOLINT | |
| 110 size_t table_size = 1; | |
| 111 // Find the smallest power of 2 that is >= min_size, and assign | |
| 112 // that value to table_size. | |
| 113 while (table_size < min_size) { | |
| 114 table_size <<= 1; | |
| 115 // Guard against an infinite loop | |
| 116 if (table_size <= 0) { | |
| 117 LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = " | |
| 118 << dictionary_size | |
| 119 << "): resulting table_size " << table_size | |
| 120 << " is zero or negative" << LOG_ENDL; | |
| 121 return 0; | |
| 122 } | |
| 123 } | |
| 124 // Check size sanity | |
| 125 if ((table_size & (table_size - 1)) != 0) { | |
| 126 LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = " | |
| 127 << dictionary_size | |
| 128 << "): resulting table_size " << table_size | |
| 129 << " is not a power of 2" << LOG_ENDL; | |
| 130 return 0; | |
| 131 } | |
| 132 // The loop above tries to find the smallest power of 2 that is >= min_size. | |
| 133 // That value must lie somewhere between min_size and (min_size * 2), | |
| 134 // except for the case (dictionary_size == 0, table_size == 1). | |
| 135 if ((dictionary_size > 0) && (table_size > (min_size * 2))) { | |
| 136 LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = " | |
| 137 << dictionary_size | |
| 138 << "): resulting table_size " << table_size | |
| 139 << " is too large" << LOG_ENDL; | |
| 140 return 0; | |
| 141 } | |
| 142 return table_size; | |
| 143 } | |
| 144 | |
// If the hash value is already available from the rolling hash,
// call this function to save time.
// Appends the next sequential block (the one after last_block_added_) to the
// chain of blocks sharing hash_value's hash table index.  Blocks must
// therefore be added strictly in order of their position in the source data.
void BlockHash::AddBlock(uint32_t hash_value) {
  if (hash_table_.empty()) {
    LOG(DFATAL) << "BlockHash::AddBlock() called before BlockHash::Init()"
                << LOG_ENDL;
    return;
  }
  // The initial value of last_block_added_ is -1.
  int block_number = last_block_added_ + 1;
  const int total_blocks =
      static_cast<int>(source_size_ / kBlockSize);  // round down
  if (block_number >= total_blocks) {
    LOG(DFATAL) << "BlockHash::AddBlock() called"
                   " with block number " << block_number
                << " that is past last block " << (total_blocks - 1)
                << LOG_ENDL;
    return;
  }
  // A block about to be added must not already be linked into a chain.
  if (next_block_table_[block_number] != -1) {
    LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "
                   "block number = " << block_number
                << ", next block should be -1 but is "
                << next_block_table_[block_number] << LOG_ENDL;
    return;
  }
  const int hash_table_index = GetHashTableIndex(hash_value);
  const int first_matching_block = hash_table_[hash_table_index];
  if (first_matching_block < 0) {
    // This is the first entry with this hash value
    hash_table_[hash_table_index] = block_number;
    last_block_table_[block_number] = block_number;
  } else {
    // Add this entry at the end of the chain of matching blocks.
    // last_block_table_, indexed by the chain's first block, caches the
    // chain's current tail so appends take O(1) instead of walking the chain.
    const int last_matching_block = last_block_table_[first_matching_block];
    if (next_block_table_[last_matching_block] != -1) {
      LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "
                     "first matching block = " << first_matching_block
                  << ", last matching block = " << last_matching_block
                  << ", next block should be -1 but is "
                  << next_block_table_[last_matching_block] << LOG_ENDL;
      return;
    }
    next_block_table_[last_matching_block] = block_number;
    last_block_table_[first_matching_block] = block_number;
  }
  last_block_added_ = block_number;
}
| 193 | |
// Hashes and adds every whole block of the source data to the hash table.
// Any partial block at the end is ignored (AddAllBlocksThroughIndex clamps
// to the last position where a full kBlockSize-byte hash can be taken).
void BlockHash::AddAllBlocks() {
  AddAllBlocksThroughIndex(static_cast<int>(source_size_));
}
| 197 | |
// Adds all not-yet-added blocks that start before end_index to the hash
// table, resuming where the previous call left off (NextIndexToAdd()).
// end_index is a byte index into the source data, not a block number; it
// must not exceed source_size_ and must be past the last block added.
void BlockHash::AddAllBlocksThroughIndex(int end_index) {
  if (end_index > static_cast<int>(source_size_)) {
    LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"
                   " with index " << end_index
                << " higher than end index " << source_size_ << LOG_ENDL;
    return;
  }
  const int last_index_added = last_block_added_ * kBlockSize;
  if (end_index <= last_index_added) {
    LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"
                   " with index " << end_index
                << " <= last index added ( " << last_index_added
                << ")" << LOG_ENDL;
    return;
  }
  int end_limit = end_index;
  // Don't allow reading any indices at or past source_size_.
  // The Hash function extends (kBlockSize - 1) bytes past the index,
  // so leave a margin of that size.
  // NOTE(review): if source_size() < kBlockSize, this unsigned subtraction
  // wraps before the int cast; on typical platforms the cast then produces a
  // negative limit, so the loop below is skipped -- confirm this holds on
  // all supported platforms.
  int last_legal_hash_index = static_cast<int>(source_size() - kBlockSize);
  if (end_limit > last_legal_hash_index) {
    end_limit = last_legal_hash_index + 1;
  }
  const char* block_ptr = source_data() + NextIndexToAdd();
  const char* const end_ptr = source_data() + end_limit;
  while (block_ptr < end_ptr) {
    // Hash each whole block in turn and append it to its hash chain.
    AddBlock(RollingHash<kBlockSize>::Hash(block_ptr));
    block_ptr += kBlockSize;
  }
}
| 228 | |
// The word-at-a-time comparison in CompareWholeWordValues (below) requires
// each block to consist of a whole number of machine words.
COMPILE_ASSERT((BlockHash::kBlockSize % sizeof(uword_t)) == 0,
               kBlockSize_must_be_a_multiple_of_machine_word_size);
| 231 | |
// A recursive template to compare a fixed number
// of (possibly unaligned) machine words starting
// at addresses block1 and block2.  Returns true or false
// depending on whether an exact match was found.
// The first word is compared via the <1> base-case specialization (defined
// below, and visible by the time this template is instantiated); the
// remaining (number_of_words - 1) words are handled by recursion.  The &&
// short-circuits, so comparison stops at the first mismatching word.
template<int number_of_words>
inline bool CompareWholeWordValues(const char* block1,
                                   const char* block2) {
  return CompareWholeWordValues<1>(block1, block2) &&
         CompareWholeWordValues<number_of_words - 1>(block1 + sizeof(uword_t),
                                                     block2 + sizeof(uword_t));
}
| 243 | |
| 244 // The base of the recursive template: compare one pair of machine words. | |
| 245 template<> | |
| 246 inline bool CompareWholeWordValues<1>(const char* word1, | |
| 247 const char* word2) { | |
| 248 uword_t aligned_word1, aligned_word2; | |
| 249 memcpy(&aligned_word1, word1, sizeof(aligned_word1)); | |
| 250 memcpy(&aligned_word2, word2, sizeof(aligned_word2)); | |
| 251 return aligned_word1 == aligned_word2; | |
| 252 } | |
| 253 | |
| 254 // Calling BlockContentsMatch is equivalent to the following memcmp call: | |
| 255 // | |
| 256 // memcmp(block1, block2, BlockHash::kBlockSize); | |
| 257 // | |
| 258 // This function will only be called for two blocks whose hash values match. | |
| 259 // For that reason, it is very likely that either the blocks are identical, | |
| 260 // or they have different first bytes. (Given a good hash function, if two | |
| 261 // non-identical blocks have the same hash value, the probability that their | |
| 262 // first bytes differ is the same as the probability of two random bytes | |
| 263 // differing: 255/256, or rather smaller for text data.) So optimize | |
| 264 // for these two common cases. | |
| 265 // | |
| 266 // A block must be composed of an integral number of machine words | |
| 267 // (uword_t values.) This function takes advantage of that fact | |
| 268 // by comparing the blocks as series of (possibly unaligned) word values. | |
| 269 // A word-sized comparison can be performed as a single | |
| 270 // machine instruction. Comparing words instead of bytes means that, | |
| 271 // on a 64-bit platform, this function will use 8 times fewer test-and-branch | |
| 272 // instructions than a byte-by-byte comparison. Even with the extra | |
| 273 // cost of the calls to memcpy, this method is still at least twice as fast | |
| 274 // as memcmp (measured using gcc on a 64-bit platform, with a block size | |
| 275 // of 32.) For blocks with identical contents (a common case), this method | |
| 276 // is over six times faster than memcmp. | |
| 277 inline bool BlockHash::BlockContentsMatchInline(const char* block1, | |
| 278 const char* block2) { | |
| 279 // Optimize for mismatch in first byte, as described above | |
| 280 if (*block1 != *block2) { | |
| 281 return false; | |
| 282 } | |
| 283 static const size_t kWordsPerBlock = BlockHash::kBlockSize / sizeof(uword_t); | |
| 284 return CompareWholeWordValues<kWordsPerBlock>(block1, block2); | |
| 285 } | |
| 286 | |
// Non-inline public wrapper around BlockContentsMatchInline, for callers
// outside this translation unit.
bool BlockHash::BlockContentsMatch(const char* block1, const char* block2) {
  return BlockContentsMatchInline(block1, block2);
}
| 290 | |
| 291 inline int BlockHash::SkipNonMatchingBlocks(int block_number, | |
| 292 const char* block_ptr) const { | |
| 293 int probes = 0; | |
| 294 while ((block_number != -1) && | |
| 295 !BlockContentsMatchInline(block_ptr, | |
| 296 &source_data_[block_number * kBlockSize])) { | |
| 297 if (++probes > kMaxProbes) { | |
| 298 return -1; // Avoid too much chaining | |
| 299 } | |
| 300 block_number = next_block_table_[block_number]; | |
| 301 } | |
| 302 return block_number; | |
| 303 } | |
| 304 | |
| 305 // Init() must have been called and returned true before using | |
| 306 // FirstMatchingBlock or NextMatchingBlock. No check is performed | |
| 307 // for this condition; the code will crash if this condition is violated. | |
| 308 inline int BlockHash::FirstMatchingBlockInline(uint32_t hash_value, | |
| 309 const char* block_ptr) const { | |
| 310 return SkipNonMatchingBlocks(hash_table_[GetHashTableIndex(hash_value)], | |
| 311 block_ptr); | |
| 312 } | |
| 313 | |
// Non-inline public wrapper around FirstMatchingBlockInline, for callers
// outside this translation unit.
int BlockHash::FirstMatchingBlock(uint32_t hash_value,
                                  const char* block_ptr) const {
  return FirstMatchingBlockInline(hash_value, block_ptr);
}
| 318 | |
| 319 int BlockHash::NextMatchingBlock(int block_number, | |
| 320 const char* block_ptr) const { | |
| 321 if (static_cast<size_t>(block_number) >= GetNumberOfBlocks()) { | |
| 322 LOG(DFATAL) << "NextMatchingBlock called for invalid block number " | |
| 323 << block_number << LOG_ENDL; | |
| 324 return -1; | |
| 325 } | |
| 326 return SkipNonMatchingBlocks(next_block_table_[block_number], block_ptr); | |
| 327 } | |
| 328 | |
| 329 // Keep a count of the number of matches found. This will throttle the | |
| 330 // number of iterations in FindBestMatch. For example, if the entire | |
| 331 // dictionary is made up of spaces (' ') and the search string is also | |
| 332 // made up of spaces, there will be one match for each block in the | |
| 333 // dictionary. | |
| 334 inline bool BlockHash::TooManyMatches(int* match_counter) { | |
| 335 ++(*match_counter); | |
| 336 return (*match_counter) > kMaxMatchesToCheck; | |
| 337 } | |
| 338 | |
| 339 // Returns the number of bytes to the left of source_match_start | |
| 340 // that match the corresponding bytes to the left of target_match_start. | |
| 341 // Will not examine more than max_bytes bytes, which is to say that | |
| 342 // the return value will be in the range [0, max_bytes] inclusive. | |
| 343 int BlockHash::MatchingBytesToLeft(const char* source_match_start, | |
| 344 const char* target_match_start, | |
| 345 int max_bytes) { | |
| 346 const char* source_ptr = source_match_start; | |
| 347 const char* target_ptr = target_match_start; | |
| 348 int bytes_found = 0; | |
| 349 while (bytes_found < max_bytes) { | |
| 350 --source_ptr; | |
| 351 --target_ptr; | |
| 352 if (*source_ptr != *target_ptr) { | |
| 353 break; | |
| 354 } | |
| 355 ++bytes_found; | |
| 356 } | |
| 357 return bytes_found; | |
| 358 } | |
| 359 | |
| 360 // Returns the number of bytes starting at source_match_end | |
| 361 // that match the corresponding bytes starting at target_match_end. | |
| 362 // Will not examine more than max_bytes bytes, which is to say that | |
| 363 // the return value will be in the range [0, max_bytes] inclusive. | |
| 364 int BlockHash::MatchingBytesToRight(const char* source_match_end, | |
| 365 const char* target_match_end, | |
| 366 int max_bytes) { | |
| 367 const char* source_ptr = source_match_end; | |
| 368 const char* target_ptr = target_match_end; | |
| 369 int bytes_found = 0; | |
| 370 while ((bytes_found < max_bytes) && (*source_ptr == *target_ptr)) { | |
| 371 ++bytes_found; | |
| 372 ++source_ptr; | |
| 373 ++target_ptr; | |
| 374 } | |
| 375 return bytes_found; | |
| 376 } | |
| 377 | |
// No NULL checks are performed on the pointer arguments.  The caller
// must guarantee that none of the arguments is NULL, or a crash will occur.
//
// The vast majority of calls to FindBestMatch enter the loop *zero* times,
// which is to say that most candidate blocks find no matches in the dictionary.
// The important sections for optimization are therefore the code outside the
// loop and the code within the loop conditions.  Keep this to a minimum.
//
// hash_value: rolling hash of the kBlockSize bytes at target_candidate_start.
// target_candidate_start: candidate match position within the target data.
// target_start / target_size: bounds of the target data.
// best_match: in/out; replaced only if a longer match is found here.
void BlockHash::FindBestMatch(uint32_t hash_value,
                              const char* target_candidate_start,
                              const char* target_start,
                              size_t target_size,
                              Match* best_match) const {
  int match_counter = 0;
  // Walk the chain of source blocks whose hash matches hash_value, giving
  // up after kMaxMatchesToCheck candidates (see TooManyMatches).
  for (int block_number = FirstMatchingBlockInline(hash_value,
                                                   target_candidate_start);
       (block_number >= 0) && !TooManyMatches(&match_counter);
       block_number = NextMatchingBlock(block_number, target_candidate_start)) {
    int source_match_offset = block_number * kBlockSize;
    const int source_match_end = source_match_offset + kBlockSize;

    int target_match_offset =
        static_cast<int>(target_candidate_start - target_start);
    const int target_match_end = target_match_offset + kBlockSize;

    // Start from the guaranteed block-sized match, then grow it both ways.
    size_t match_size = kBlockSize;
    {
      // Extend match start towards beginning of unencoded data,
      // limited by whichever of the two regions begins sooner.
      const int limit_bytes_to_left = std::min(source_match_offset,
                                               target_match_offset);
      const int matching_bytes_to_left =
          MatchingBytesToLeft(source_data_ + source_match_offset,
                              target_start + target_match_offset,
                              limit_bytes_to_left);
      source_match_offset -= matching_bytes_to_left;
      target_match_offset -= matching_bytes_to_left;
      match_size += matching_bytes_to_left;
    }
    {
      // Extend match end towards end of unencoded data,
      // limited by whichever of the two regions ends sooner.
      const size_t source_bytes_to_right = source_size_ - source_match_end;
      const size_t target_bytes_to_right = target_size - target_match_end;
      const size_t limit_bytes_to_right = std::min(source_bytes_to_right,
                                                   target_bytes_to_right);
      match_size +=
          MatchingBytesToRight(source_data_ + source_match_end,
                               target_start + target_match_end,
                               static_cast<int>(limit_bytes_to_right));
    }
    // Update in/out parameter if the best match found was better
    // than any match already stored in *best_match.  starting_offset_
    // (0 for a dictionary hash, the dictionary size for a target hash;
    // see the constructor) converts the source-local offset into the
    // offset space expected by the caller.
    best_match->ReplaceIfBetterMatch(match_size,
                                     source_match_offset + starting_offset_,
                                     target_match_offset);
  }
}
| 433 | |
| 434 } // namespace open_vcdiff | |
| OLD | NEW |