Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(84)

Side by Side Diff: sdch/open_vcdiff/depot/opensource/open-vcdiff/src/blockhash.cc

Issue 5203: Transition to pulling open-vcdiff from repository, instead of using snapshot... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: Created 12 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright 2006, 2008 Google Inc.
2 // Authors: Chandra Chereddi, Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15
16 #include <config.h>
17 #include "blockhash.h"
18 #include "compile_assert.h"
19 #include <stdint.h> // uint32_t
20 #include "logging.h"
21 #include "rolling_hash.h"
22
23 namespace open_vcdiff {
24
25 typedef unsigned long uword_t; // a machine word NOLINT
26
27 BlockHash::BlockHash(const char* source_data,
28 size_t source_size,
29 int starting_offset)
30 : source_data_(source_data),
31 source_size_(source_size),
32 starting_offset_(starting_offset),
33 last_block_added_(-1) {
34 }
35
36 BlockHash::~BlockHash() { }
37
38 // kBlockSize must be at least 2 to be meaningful. Since it's a compile-time
39 // constant, check its value at compile time rather than wasting CPU cycles
40 // on runtime checks.
41 COMPILE_ASSERT(BlockHash::kBlockSize >= 2, kBlockSize_must_be_at_least_2);
42
43 // kBlockSize is required to be a power of 2 because multiplication
44 // (n * kBlockSize), division (n / kBlockSize) and MOD (n % kBlockSize)
45 // are commonly-used operations. If kBlockSize is a compile-time
46 // constant and a power of 2, the compiler can convert these three operations
47 // into bit-shift (>> or <<) and bitwise-AND (&) operations, which are much
48 // more efficient than executing full integer multiply, divide, or remainder
49 // instructions.
50 COMPILE_ASSERT((BlockHash::kBlockSize & (BlockHash::kBlockSize - 1)) == 0,
51 kBlockSize_must_be_a_power_of_2);
52
53 bool BlockHash::Init(bool populate_hash_table) {
54 if (!hash_table_.empty() ||
55 !next_block_table_.empty() ||
56 !last_block_table_.empty()) {
57 LOG(DFATAL) << "Init() called twice for same BlockHash object" << LOG_ENDL;
58 return false;
59 }
60 const size_t table_size = CalcTableSize(source_size_);
61 if (table_size == 0) {
62 LOG(DFATAL) << "Error finding table size for source size " << source_size_
63 << LOG_ENDL;
64 return false;
65 }
66 hash_table_.resize(table_size, -1);
67 next_block_table_.resize(GetNumberOfBlocks(), -1);
68 last_block_table_.resize(GetNumberOfBlocks(), -1);
69 if (populate_hash_table) {
70 AddAllBlocks();
71 }
72 return true;
73 }
74
75 const BlockHash* BlockHash::CreateDictionaryHash(const char* dictionary_data,
76 size_t dictionary_size) {
77 BlockHash* new_dictionary_hash = new BlockHash(dictionary_data,
78 dictionary_size,
79 0);
80 if (!new_dictionary_hash->Init(/* populate_hash_table = */ true)) {
81 delete new_dictionary_hash;
82 return NULL;
83 } else {
84 return new_dictionary_hash;
85 }
86 }
87
88 BlockHash* BlockHash::CreateTargetHash(const char* target_data,
89 size_t target_size,
90 size_t dictionary_size) {
91 BlockHash* new_target_hash = new BlockHash(target_data,
92 target_size,
93 static_cast<int>(dictionary_size));
94 if (!new_target_hash->Init(/* populate_hash_table = */ false)) {
95 delete new_target_hash;
96 return NULL;
97 } else {
98 return new_target_hash;
99 }
100 }
101
102 // Returns zero if an error occurs.
103 const size_t BlockHash::CalcTableSize(const size_t dictionary_size) {
104 // Overallocate the hash table by making it the same size (in bytes)
105 // as the source data. This is a trade-off between space and time:
106 // the empty entries in the hash table will reduce the
107 // probability of a hash collision to (sizeof(int) / kblockSize),
108 // and so save time comparing false matches.
109 const size_t min_size = (dictionary_size / sizeof(int)) + 1; // NOLINT
110 size_t table_size = 1;
111 // Find the smallest power of 2 that is >= min_size, and assign
112 // that value to table_size.
113 while (table_size < min_size) {
114 table_size <<= 1;
115 // Guard against an infinite loop
116 if (table_size <= 0) {
117 LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
118 << dictionary_size
119 << "): resulting table_size " << table_size
120 << " is zero or negative" << LOG_ENDL;
121 return 0;
122 }
123 }
124 // Check size sanity
125 if ((table_size & (table_size - 1)) != 0) {
126 LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
127 << dictionary_size
128 << "): resulting table_size " << table_size
129 << " is not a power of 2" << LOG_ENDL;
130 return 0;
131 }
132 // The loop above tries to find the smallest power of 2 that is >= min_size.
133 // That value must lie somewhere between min_size and (min_size * 2),
134 // except for the case (dictionary_size == 0, table_size == 1).
135 if ((dictionary_size > 0) && (table_size > (min_size * 2))) {
136 LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
137 << dictionary_size
138 << "): resulting table_size " << table_size
139 << " is too large" << LOG_ENDL;
140 return 0;
141 }
142 return table_size;
143 }
144
145 // If the hash value is already available from the rolling hash,
146 // call this function to save time.
147 void BlockHash::AddBlock(uint32_t hash_value) {
148 if (hash_table_.empty()) {
149 LOG(DFATAL) << "BlockHash::AddBlock() called before BlockHash::Init()"
150 << LOG_ENDL;
151 return;
152 }
153 // The initial value of last_block_added_ is -1.
154 int block_number = last_block_added_ + 1;
155 const int total_blocks =
156 static_cast<int>(source_size_ / kBlockSize); // round down
157 if (block_number >= total_blocks) {
158 LOG(DFATAL) << "BlockHash::AddBlock() called"
159 " with block number " << block_number
160 << " that is past last block " << (total_blocks - 1)
161 << LOG_ENDL;
162 return;
163 }
164 if (next_block_table_[block_number] != -1) {
165 LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "
166 "block number = " << block_number
167 << ", next block should be -1 but is "
168 << next_block_table_[block_number] << LOG_ENDL;
169 return;
170 }
171 const int hash_table_index = GetHashTableIndex(hash_value);
172 const int first_matching_block = hash_table_[hash_table_index];
173 if (first_matching_block < 0) {
174 // This is the first entry with this hash value
175 hash_table_[hash_table_index] = block_number;
176 last_block_table_[block_number] = block_number;
177 } else {
178 // Add this entry at the end of the chain of matching blocks
179 const int last_matching_block = last_block_table_[first_matching_block];
180 if (next_block_table_[last_matching_block] != -1) {
181 LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "
182 "first matching block = " << first_matching_block
183 << ", last matching block = " << last_matching_block
184 << ", next block should be -1 but is "
185 << next_block_table_[last_matching_block] << LOG_ENDL;
186 return;
187 }
188 next_block_table_[last_matching_block] = block_number;
189 last_block_table_[first_matching_block] = block_number;
190 }
191 last_block_added_ = block_number;
192 }
193
194 void BlockHash::AddAllBlocks() {
195 AddAllBlocksThroughIndex(static_cast<int>(source_size_));
196 }
197
198 void BlockHash::AddAllBlocksThroughIndex(int end_index) {
199 if (end_index > static_cast<int>(source_size_)) {
200 LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"
201 " with index " << end_index
202 << " higher than end index " << source_size_ << LOG_ENDL;
203 return;
204 }
205 const int last_index_added = last_block_added_ * kBlockSize;
206 if (end_index <= last_index_added) {
207 LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"
208 " with index " << end_index
209 << " <= last index added ( " << last_index_added
210 << ")" << LOG_ENDL;
211 return;
212 }
213 int end_limit = end_index;
214 // Don't allow reading any indices at or past source_size_.
215 // The Hash function extends (kBlockSize - 1) bytes past the index,
216 // so leave a margin of that size.
217 int last_legal_hash_index = static_cast<int>(source_size() - kBlockSize);
218 if (end_limit > last_legal_hash_index) {
219 end_limit = last_legal_hash_index + 1;
220 }
221 const char* block_ptr = source_data() + NextIndexToAdd();
222 const char* const end_ptr = source_data() + end_limit;
223 while (block_ptr < end_ptr) {
224 AddBlock(RollingHash<kBlockSize>::Hash(block_ptr));
225 block_ptr += kBlockSize;
226 }
227 }
228
229 COMPILE_ASSERT((BlockHash::kBlockSize % sizeof(uword_t)) == 0,
230 kBlockSize_must_be_a_multiple_of_machine_word_size);
231
232 // A recursive template to compare a fixed number
233 // of (possibly unaligned) machine words starting
234 // at addresses block1 and block2. Returns true or false
235 // depending on whether an exact match was found.
236 template<int number_of_words>
237 inline bool CompareWholeWordValues(const char* block1,
238 const char* block2) {
239 return CompareWholeWordValues<1>(block1, block2) &&
240 CompareWholeWordValues<number_of_words - 1>(block1 + sizeof(uword_t),
241 block2 + sizeof(uword_t));
242 }
243
244 // The base of the recursive template: compare one pair of machine words.
245 template<>
246 inline bool CompareWholeWordValues<1>(const char* word1,
247 const char* word2) {
248 uword_t aligned_word1, aligned_word2;
249 memcpy(&aligned_word1, word1, sizeof(aligned_word1));
250 memcpy(&aligned_word2, word2, sizeof(aligned_word2));
251 return aligned_word1 == aligned_word2;
252 }
253
254 // Calling BlockContentsMatch is equivalent to the following memcmp call:
255 //
256 // memcmp(block1, block2, BlockHash::kBlockSize);
257 //
258 // This function will only be called for two blocks whose hash values match.
259 // For that reason, it is very likely that either the blocks are identical,
260 // or they have different first bytes. (Given a good hash function, if two
261 // non-identical blocks have the same hash value, the probability that their
262 // first bytes differ is the same as the probability of two random bytes
263 // differing: 255/256, or rather smaller for text data.) So optimize
264 // for these two common cases.
265 //
266 // A block must be composed of an integral number of machine words
267 // (uword_t values.) This function takes advantage of that fact
268 // by comparing the blocks as series of (possibly unaligned) word values.
269 // A word-sized comparison can be performed as a single
270 // machine instruction. Comparing words instead of bytes means that,
271 // on a 64-bit platform, this function will use 8 times fewer test-and-branch
272 // instructions than a byte-by-byte comparison. Even with the extra
273 // cost of the calls to memcpy, this method is still at least twice as fast
274 // as memcmp (measured using gcc on a 64-bit platform, with a block size
275 // of 32.) For blocks with identical contents (a common case), this method
276 // is over six times faster than memcmp.
277 inline bool BlockHash::BlockContentsMatchInline(const char* block1,
278 const char* block2) {
279 // Optimize for mismatch in first byte, as described above
280 if (*block1 != *block2) {
281 return false;
282 }
283 static const size_t kWordsPerBlock = BlockHash::kBlockSize / sizeof(uword_t);
284 return CompareWholeWordValues<kWordsPerBlock>(block1, block2);
285 }
286
287 bool BlockHash::BlockContentsMatch(const char* block1, const char* block2) {
288 return BlockContentsMatchInline(block1, block2);
289 }
290
291 inline int BlockHash::SkipNonMatchingBlocks(int block_number,
292 const char* block_ptr) const {
293 int probes = 0;
294 while ((block_number != -1) &&
295 !BlockContentsMatchInline(block_ptr,
296 &source_data_[block_number * kBlockSize])) {
297 if (++probes > kMaxProbes) {
298 return -1; // Avoid too much chaining
299 }
300 block_number = next_block_table_[block_number];
301 }
302 return block_number;
303 }
304
305 // Init() must have been called and returned true before using
306 // FirstMatchingBlock or NextMatchingBlock. No check is performed
307 // for this condition; the code will crash if this condition is violated.
308 inline int BlockHash::FirstMatchingBlockInline(uint32_t hash_value,
309 const char* block_ptr) const {
310 return SkipNonMatchingBlocks(hash_table_[GetHashTableIndex(hash_value)],
311 block_ptr);
312 }
313
314 int BlockHash::FirstMatchingBlock(uint32_t hash_value,
315 const char* block_ptr) const {
316 return FirstMatchingBlockInline(hash_value, block_ptr);
317 }
318
319 int BlockHash::NextMatchingBlock(int block_number,
320 const char* block_ptr) const {
321 if (static_cast<size_t>(block_number) >= GetNumberOfBlocks()) {
322 LOG(DFATAL) << "NextMatchingBlock called for invalid block number "
323 << block_number << LOG_ENDL;
324 return -1;
325 }
326 return SkipNonMatchingBlocks(next_block_table_[block_number], block_ptr);
327 }
328
329 // Keep a count of the number of matches found. This will throttle the
330 // number of iterations in FindBestMatch. For example, if the entire
331 // dictionary is made up of spaces (' ') and the search string is also
332 // made up of spaces, there will be one match for each block in the
333 // dictionary.
334 inline bool BlockHash::TooManyMatches(int* match_counter) {
335 ++(*match_counter);
336 return (*match_counter) > kMaxMatchesToCheck;
337 }
338
339 // Returns the number of bytes to the left of source_match_start
340 // that match the corresponding bytes to the left of target_match_start.
341 // Will not examine more than max_bytes bytes, which is to say that
342 // the return value will be in the range [0, max_bytes] inclusive.
343 int BlockHash::MatchingBytesToLeft(const char* source_match_start,
344 const char* target_match_start,
345 int max_bytes) {
346 const char* source_ptr = source_match_start;
347 const char* target_ptr = target_match_start;
348 int bytes_found = 0;
349 while (bytes_found < max_bytes) {
350 --source_ptr;
351 --target_ptr;
352 if (*source_ptr != *target_ptr) {
353 break;
354 }
355 ++bytes_found;
356 }
357 return bytes_found;
358 }
359
360 // Returns the number of bytes starting at source_match_end
361 // that match the corresponding bytes starting at target_match_end.
362 // Will not examine more than max_bytes bytes, which is to say that
363 // the return value will be in the range [0, max_bytes] inclusive.
364 int BlockHash::MatchingBytesToRight(const char* source_match_end,
365 const char* target_match_end,
366 int max_bytes) {
367 const char* source_ptr = source_match_end;
368 const char* target_ptr = target_match_end;
369 int bytes_found = 0;
370 while ((bytes_found < max_bytes) && (*source_ptr == *target_ptr)) {
371 ++bytes_found;
372 ++source_ptr;
373 ++target_ptr;
374 }
375 return bytes_found;
376 }
377
378 // No NULL checks are performed on the pointer arguments. The caller
379 // must guarantee that none of the arguments is NULL, or a crash will occur.
380 //
381 // The vast majority of calls to FindBestMatch enter the loop *zero* times,
382 // which is to say that most candidate blocks find no matches in the dictionary.
383 // The important sections for optimization are therefore the code outside the
384 // loop and the code within the loop conditions. Keep this to a minimum.
385 void BlockHash::FindBestMatch(uint32_t hash_value,
386 const char* target_candidate_start,
387 const char* target_start,
388 size_t target_size,
389 Match* best_match) const {
390 int match_counter = 0;
391 for (int block_number = FirstMatchingBlockInline(hash_value,
392 target_candidate_start);
393 (block_number >= 0) && !TooManyMatches(&match_counter);
394 block_number = NextMatchingBlock(block_number, target_candidate_start)) {
395 int source_match_offset = block_number * kBlockSize;
396 const int source_match_end = source_match_offset + kBlockSize;
397
398 int target_match_offset =
399 static_cast<int>(target_candidate_start - target_start);
400 const int target_match_end = target_match_offset + kBlockSize;
401
402 size_t match_size = kBlockSize;
403 {
404 // Extend match start towards beginning of unencoded data
405 const int limit_bytes_to_left = std::min(source_match_offset,
406 target_match_offset);
407 const int matching_bytes_to_left =
408 MatchingBytesToLeft(source_data_ + source_match_offset,
409 target_start + target_match_offset,
410 limit_bytes_to_left);
411 source_match_offset -= matching_bytes_to_left;
412 target_match_offset -= matching_bytes_to_left;
413 match_size += matching_bytes_to_left;
414 }
415 {
416 // Extend match end towards end of unencoded data
417 const size_t source_bytes_to_right = source_size_ - source_match_end;
418 const size_t target_bytes_to_right = target_size - target_match_end;
419 const size_t limit_bytes_to_right = std::min(source_bytes_to_right,
420 target_bytes_to_right);
421 match_size +=
422 MatchingBytesToRight(source_data_ + source_match_end,
423 target_start + target_match_end,
424 static_cast<int>(limit_bytes_to_right));
425 }
426 // Update in/out parameter if the best match found was better
427 // than any match already stored in *best_match.
428 best_match->ReplaceIfBetterMatch(match_size,
429 source_match_offset + starting_offset_,
430 target_match_offset);
431 }
432 }
433
434 } // namespace open_vcdiff
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698