sdch/open_vcdiff/depot/opensource/open-vcdiff/src/blockhash.cc - Issue 5203: Transition to pulling open-vcdiff from repository, instead of using snapshot...

Side by Side Diff: sdch/open_vcdiff/depot/opensource/open-vcdiff/src/blockhash.cc

Issue 5203: Transition to pulling open-vcdiff from repository, instead of using snapshot... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: Created 12 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « sdch/open_vcdiff/depot/opensource/open-vcdiff/src/blockhash.h ('k') | sdch/open_vcdiff/depot/opensource/open-vcdiff/src/blockhash_test.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 // Copyright 2006, 2008 Google Inc.

2 // Authors: Chandra Chereddi, Lincoln Smith

3 //

4 // Licensed under the Apache License, Version 2.0 (the "License");

5 // you may not use this file except in compliance with the License.

6 // You may obtain a copy of the License at

7 //

8 // http://www.apache.org/licenses/LICENSE-2.0

9 //

10 // Unless required by applicable law or agreed to in writing, software

11 // distributed under the License is distributed on an "AS IS" BASIS,

12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

13 // See the License for the specific language governing permissions and

14 // limitations under the License.

15

16 #include <config.h>

17 #include "blockhash.h"

18 #include "compile_assert.h"

19 #include <stdint.h> // uint32_t

20 #include "logging.h"

21 #include "rolling_hash.h"

22

23 namespace open_vcdiff {

24

25 typedef unsigned long uword_t; // a machine word NOLINT

26

27 BlockHash::BlockHash(const char* source_data,

28 size_t source_size,

29 int starting_offset)

30 : source_data_(source_data),

31 source_size_(source_size),

32 starting_offset_(starting_offset),

33 last_block_added_(-1) {

34 }

35

36 BlockHash::~BlockHash() { }

37

38 // kBlockSize must be at least 2 to be meaningful. Since it's a compile-time

39 // constant, check its value at compile time rather than wasting CPU cycles

40 // on runtime checks.

41 COMPILE_ASSERT(BlockHash::kBlockSize >= 2, kBlockSize_must_be_at_least_2);

42

43 // kBlockSize is required to be a power of 2 because multiplication

44 // (n * kBlockSize), division (n / kBlockSize) and MOD (n % kBlockSize)

45 // are commonly-used operations. If kBlockSize is a compile-time

46 // constant and a power of 2, the compiler can convert these three operations

47 // into bit-shift (>> or <<) and bitwise-AND (&) operations, which are much

48 // more efficient than executing full integer multiply, divide, or remainder

49 // instructions.

50 COMPILE_ASSERT((BlockHash::kBlockSize & (BlockHash::kBlockSize - 1)) == 0,

51 kBlockSize_must_be_a_power_of_2);

52

53 bool BlockHash::Init(bool populate_hash_table) {

54 if (!hash_table_.empty() \|\|

55 !next_block_table_.empty() \|\|

56 !last_block_table_.empty()) {

57 LOG(DFATAL) << "Init() called twice for same BlockHash object" << LOG_ENDL;

58 return false;

59 }

60 const size_t table_size = CalcTableSize(source_size_);

61 if (table_size == 0) {

62 LOG(DFATAL) << "Error finding table size for source size " << source_size_

63 << LOG_ENDL;

64 return false;

65 }

66 hash_table_.resize(table_size, -1);

67 next_block_table_.resize(GetNumberOfBlocks(), -1);

68 last_block_table_.resize(GetNumberOfBlocks(), -1);

69 if (populate_hash_table) {

70 AddAllBlocks();

71 }

72 return true;

73 }

74

75 const BlockHash* BlockHash::CreateDictionaryHash(const char* dictionary_data,

76 size_t dictionary_size) {

77 BlockHash* new_dictionary_hash = new BlockHash(dictionary_data,

78 dictionary_size,

79 0);

80 if (!new_dictionary_hash->Init(/* populate_hash_table = */ true)) {

81 delete new_dictionary_hash;

82 return NULL;

83 } else {

84 return new_dictionary_hash;

85 }

86 }

87

88 BlockHash* BlockHash::CreateTargetHash(const char* target_data,

89 size_t target_size,

90 size_t dictionary_size) {

91 BlockHash* new_target_hash = new BlockHash(target_data,

92 target_size,

93 static_cast<int>(dictionary_size));

94 if (!new_target_hash->Init(/* populate_hash_table = */ false)) {

95 delete new_target_hash;

96 return NULL;

97 } else {

98 return new_target_hash;

99 }

100 }

101

102 // Returns zero if an error occurs.

103 const size_t BlockHash::CalcTableSize(const size_t dictionary_size) {

104 // Overallocate the hash table by making it the same size (in bytes)

105 // as the source data. This is a trade-off between space and time:

106 // the empty entries in the hash table will reduce the

107 // probability of a hash collision to (sizeof(int) / kblockSize),

108 // and so save time comparing false matches.

109 const size_t min_size = (dictionary_size / sizeof(int)) + 1; // NOLINT

110 size_t table_size = 1;

111 // Find the smallest power of 2 that is >= min_size, and assign

112 // that value to table_size.

113 while (table_size < min_size) {

114 table_size <<= 1;

115 // Guard against an infinite loop

116 if (table_size <= 0) {

117 LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "

118 << dictionary_size

119 << "): resulting table_size " << table_size

120 << " is zero or negative" << LOG_ENDL;

121 return 0;

122 }

123 }

124 // Check size sanity

125 if ((table_size & (table_size - 1)) != 0) {

126 LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "

127 << dictionary_size

128 << "): resulting table_size " << table_size

129 << " is not a power of 2" << LOG_ENDL;

130 return 0;

131 }

132 // The loop above tries to find the smallest power of 2 that is >= min_size.

133 // That value must lie somewhere between min_size and (min_size * 2),

134 // except for the case (dictionary_size == 0, table_size == 1).

135 if ((dictionary_size > 0) && (table_size > (min_size * 2))) {

136 LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "

137 << dictionary_size

138 << "): resulting table_size " << table_size

139 << " is too large" << LOG_ENDL;

140 return 0;

141 }

142 return table_size;

143 }

144

145 // If the hash value is already available from the rolling hash,

146 // call this function to save time.

147 void BlockHash::AddBlock(uint32_t hash_value) {

148 if (hash_table_.empty()) {

149 LOG(DFATAL) << "BlockHash::AddBlock() called before BlockHash::Init()"

150 << LOG_ENDL;

151 return;

152 }

153 // The initial value of last_block_added_ is -1.

154 int block_number = last_block_added_ + 1;

155 const int total_blocks =

156 static_cast<int>(source_size_ / kBlockSize); // round down

157 if (block_number >= total_blocks) {

158 LOG(DFATAL) << "BlockHash::AddBlock() called"

159 " with block number " << block_number

160 << " that is past last block " << (total_blocks - 1)

161 << LOG_ENDL;

162 return;

163 }

164 if (next_block_table_[block_number] != -1) {

165 LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "

166 "block number = " << block_number

167 << ", next block should be -1 but is "

168 << next_block_table_[block_number] << LOG_ENDL;

169 return;

170 }

171 const int hash_table_index = GetHashTableIndex(hash_value);

172 const int first_matching_block = hash_table_[hash_table_index];

173 if (first_matching_block < 0) {

174 // This is the first entry with this hash value

175 hash_table_[hash_table_index] = block_number;

176 last_block_table_[block_number] = block_number;

177 } else {

178 // Add this entry at the end of the chain of matching blocks

179 const int last_matching_block = last_block_table_[first_matching_block];

180 if (next_block_table_[last_matching_block] != -1) {

181 LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "

182 "first matching block = " << first_matching_block

183 << ", last matching block = " << last_matching_block

184 << ", next block should be -1 but is "

185 << next_block_table_[last_matching_block] << LOG_ENDL;

186 return;

187 }

188 next_block_table_[last_matching_block] = block_number;

189 last_block_table_[first_matching_block] = block_number;

190 }

191 last_block_added_ = block_number;

192 }

193

194 void BlockHash::AddAllBlocks() {

195 AddAllBlocksThroughIndex(static_cast<int>(source_size_));

196 }

197

198 void BlockHash::AddAllBlocksThroughIndex(int end_index) {

199 if (end_index > static_cast<int>(source_size_)) {

200 LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"

201 " with index " << end_index

202 << " higher than end index " << source_size_ << LOG_ENDL;

203 return;

204 }

205 const int last_index_added = last_block_added_ * kBlockSize;

206 if (end_index <= last_index_added) {

207 LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"

208 " with index " << end_index

209 << " <= last index added ( " << last_index_added

210 << ")" << LOG_ENDL;

211 return;

212 }

213 int end_limit = end_index;

214 // Don't allow reading any indices at or past source_size_.

215 // The Hash function extends (kBlockSize - 1) bytes past the index,

216 // so leave a margin of that size.

217 int last_legal_hash_index = static_cast<int>(source_size() - kBlockSize);

218 if (end_limit > last_legal_hash_index) {

219 end_limit = last_legal_hash_index + 1;

220 }

221 const char* block_ptr = source_data() + NextIndexToAdd();

222 const char* const end_ptr = source_data() + end_limit;

223 while (block_ptr < end_ptr) {

224 AddBlock(RollingHash<kBlockSize>::Hash(block_ptr));

225 block_ptr += kBlockSize;

226 }

227 }

228

229 COMPILE_ASSERT((BlockHash::kBlockSize % sizeof(uword_t)) == 0,

230 kBlockSize_must_be_a_multiple_of_machine_word_size);

231

232 // A recursive template to compare a fixed number

233 // of (possibly unaligned) machine words starting

234 // at addresses block1 and block2. Returns true or false

235 // depending on whether an exact match was found.

236 template<int number_of_words>

237 inline bool CompareWholeWordValues(const char* block1,

238 const char* block2) {

239 return CompareWholeWordValues<1>(block1, block2) &&

240 CompareWholeWordValues<number_of_words - 1>(block1 + sizeof(uword_t),

241 block2 + sizeof(uword_t));

242 }

243

244 // The base of the recursive template: compare one pair of machine words.

245 template<>

246 inline bool CompareWholeWordValues<1>(const char* word1,

247 const char* word2) {

248 uword_t aligned_word1, aligned_word2;

249 memcpy(&aligned_word1, word1, sizeof(aligned_word1));

250 memcpy(&aligned_word2, word2, sizeof(aligned_word2));

251 return aligned_word1 == aligned_word2;

252 }

253

254 // Calling BlockContentsMatch is equivalent to the following memcmp call:

255 //

256 // memcmp(block1, block2, BlockHash::kBlockSize);

257 //

258 // This function will only be called for two blocks whose hash values match.

259 // For that reason, it is very likely that either the blocks are identical,

260 // or they have different first bytes. (Given a good hash function, if two

261 // non-identical blocks have the same hash value, the probability that their

262 // first bytes differ is the same as the probability of two random bytes

263 // differing: 255/256, or rather smaller for text data.) So optimize

264 // for these two common cases.

265 //

266 // A block must be composed of an integral number of machine words

267 // (uword_t values.) This function takes advantage of that fact

268 // by comparing the blocks as series of (possibly unaligned) word values.

269 // A word-sized comparison can be performed as a single

270 // machine instruction. Comparing words instead of bytes means that,

271 // on a 64-bit platform, this function will use 8 times fewer test-and-branch

272 // instructions than a byte-by-byte comparison. Even with the extra

273 // cost of the calls to memcpy, this method is still at least twice as fast

274 // as memcmp (measured using gcc on a 64-bit platform, with a block size

275 // of 32.) For blocks with identical contents (a common case), this method

276 // is over six times faster than memcmp.

277 inline bool BlockHash::BlockContentsMatchInline(const char* block1,

278 const char* block2) {

279 // Optimize for mismatch in first byte, as described above

280 if (block1 != block2) {

281 return false;

282 }

283 static const size_t kWordsPerBlock = BlockHash::kBlockSize / sizeof(uword_t);

284 return CompareWholeWordValues<kWordsPerBlock>(block1, block2);

285 }

286

287 bool BlockHash::BlockContentsMatch(const char* block1, const char* block2) {

288 return BlockContentsMatchInline(block1, block2);

289 }

290

291 inline int BlockHash::SkipNonMatchingBlocks(int block_number,

292 const char* block_ptr) const {

293 int probes = 0;

294 while ((block_number != -1) &&

295 !BlockContentsMatchInline(block_ptr,

296 &source_data_[block_number * kBlockSize])) {

297 if (++probes > kMaxProbes) {

298 return -1; // Avoid too much chaining

299 }

300 block_number = next_block_table_[block_number];

301 }

302 return block_number;

303 }

304

305 // Init() must have been called and returned true before using

306 // FirstMatchingBlock or NextMatchingBlock. No check is performed

307 // for this condition; the code will crash if this condition is violated.

308 inline int BlockHash::FirstMatchingBlockInline(uint32_t hash_value,

309 const char* block_ptr) const {

310 return SkipNonMatchingBlocks(hash_table_[GetHashTableIndex(hash_value)],

311 block_ptr);

312 }

313

314 int BlockHash::FirstMatchingBlock(uint32_t hash_value,

315 const char* block_ptr) const {

316 return FirstMatchingBlockInline(hash_value, block_ptr);

317 }

318

319 int BlockHash::NextMatchingBlock(int block_number,

320 const char* block_ptr) const {

321 if (static_cast<size_t>(block_number) >= GetNumberOfBlocks()) {

322 LOG(DFATAL) << "NextMatchingBlock called for invalid block number "

323 << block_number << LOG_ENDL;

324 return -1;

325 }

326 return SkipNonMatchingBlocks(next_block_table_[block_number], block_ptr);

327 }

328

329 // Keep a count of the number of matches found. This will throttle the

330 // number of iterations in FindBestMatch. For example, if the entire

331 // dictionary is made up of spaces (' ') and the search string is also

332 // made up of spaces, there will be one match for each block in the

333 // dictionary.

334 inline bool BlockHash::TooManyMatches(int* match_counter) {

335 ++(*match_counter);

336 return (*match_counter) > kMaxMatchesToCheck;

337 }

338

339 // Returns the number of bytes to the left of source_match_start

340 // that match the corresponding bytes to the left of target_match_start.

341 // Will not examine more than max_bytes bytes, which is to say that

342 // the return value will be in the range [0, max_bytes] inclusive.

343 int BlockHash::MatchingBytesToLeft(const char* source_match_start,

344 const char* target_match_start,

345 int max_bytes) {

346 const char* source_ptr = source_match_start;

347 const char* target_ptr = target_match_start;

348 int bytes_found = 0;

349 while (bytes_found < max_bytes) {

350 --source_ptr;

351 --target_ptr;

352 if (source_ptr != target_ptr) {

353 break;

354 }

355 ++bytes_found;

356 }

357 return bytes_found;

358 }

359

360 // Returns the number of bytes starting at source_match_end

361 // that match the corresponding bytes starting at target_match_end.

362 // Will not examine more than max_bytes bytes, which is to say that

363 // the return value will be in the range [0, max_bytes] inclusive.

364 int BlockHash::MatchingBytesToRight(const char* source_match_end,

365 const char* target_match_end,

366 int max_bytes) {

367 const char* source_ptr = source_match_end;

368 const char* target_ptr = target_match_end;

369 int bytes_found = 0;

370 while ((bytes_found < max_bytes) && (source_ptr == target_ptr)) {

371 ++bytes_found;

372 ++source_ptr;

373 ++target_ptr;

374 }

375 return bytes_found;

376 }

377

378 // No NULL checks are performed on the pointer arguments. The caller

379 // must guarantee that none of the arguments is NULL, or a crash will occur.

380 //

381 // The vast majority of calls to FindBestMatch enter the loop zero times,

382 // which is to say that most candidate blocks find no matches in the dictionary.

383 // The important sections for optimization are therefore the code outside the

384 // loop and the code within the loop conditions. Keep this to a minimum.

385 void BlockHash::FindBestMatch(uint32_t hash_value,

386 const char* target_candidate_start,

387 const char* target_start,

388 size_t target_size,

389 Match* best_match) const {

390 int match_counter = 0;

391 for (int block_number = FirstMatchingBlockInline(hash_value,

392 target_candidate_start);

393 (block_number >= 0) && !TooManyMatches(&match_counter);

394 block_number = NextMatchingBlock(block_number, target_candidate_start)) {

395 int source_match_offset = block_number * kBlockSize;

396 const int source_match_end = source_match_offset + kBlockSize;

397

398 int target_match_offset =

399 static_cast<int>(target_candidate_start - target_start);

400 const int target_match_end = target_match_offset + kBlockSize;

401

402 size_t match_size = kBlockSize;

403 {

404 // Extend match start towards beginning of unencoded data

405 const int limit_bytes_to_left = std::min(source_match_offset,

406 target_match_offset);

407 const int matching_bytes_to_left =

408 MatchingBytesToLeft(source_data_ + source_match_offset,

409 target_start + target_match_offset,

410 limit_bytes_to_left);

411 source_match_offset -= matching_bytes_to_left;

412 target_match_offset -= matching_bytes_to_left;

413 match_size += matching_bytes_to_left;

414 }

415 {

416 // Extend match end towards end of unencoded data

417 const size_t source_bytes_to_right = source_size_ - source_match_end;

418 const size_t target_bytes_to_right = target_size - target_match_end;

419 const size_t limit_bytes_to_right = std::min(source_bytes_to_right,

420 target_bytes_to_right);

421 match_size +=

422 MatchingBytesToRight(source_data_ + source_match_end,

423 target_start + target_match_end,

424 static_cast<int>(limit_bytes_to_right));

425 }

426 // Update in/out parameter if the best match found was better

427 // than any match already stored in *best_match.

428 best_match->ReplaceIfBetterMatch(match_size,

429 source_match_offset + starting_offset_,

430 target_match_offset);

431 }

432 }

433

434 } // namespace open_vcdiff

OLD	NEW