// Copyright 2006 Google Inc.
// Authors: Sanjay Ghemawat, Jeff Dean, Chandra Chereddi, Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Implementation of the Bentley/McIlroy algorithm for finding differences.
// Bentley, McIlroy.  DCC 1999.  Data Compression Using Long Common Strings.
// http://citeseer.ist.psu.edu/555557.html

#ifndef OPEN_VCDIFF_BLOCKHASH_H_
#define OPEN_VCDIFF_BLOCKHASH_H_

#include <config.h>
#include <stdint.h>  // uint32_t
#include <cstddef>   // size_t
#include <vector>

namespace open_vcdiff {

// A generic hash table which will be used to keep track of byte runs
// of size kBlockSize in both the incrementally processed target data
// and the preprocessed source dictionary.
//
// A custom hash table implementation is used instead of the standard
// hash_map template because we know that there will be exactly one
// entry in the BlockHash corresponding to each kBlockSize bytes
// in the source data, which makes certain optimizations possible:
// * The memory for the hash table and for all hash entries can be allocated
//   in one step rather than incrementally for each insert operation.
// * A single integer can be used to represent both
//   the index of the next hash entry in the chain
//   and the position of the entry within the source data
//   (== kBlockSize * block_number).  This greatly reduces the size
//   of a hash entry.
//
class BlockHash {
 public:
  // Block size as per Bentley/McIlroy; must be a power of two.
  //
  // Using (for example) kBlockSize = 4 guarantees that no match smaller
  // than size 4 will be identified, that some matches having sizes
  // 4, 5, or 6 may be identified, and that all matches
  // having size 7 or greater will be identified (because any string of
  // 7 bytes must contain a complete aligned block of 4 bytes.)
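  // In general, for block size B every match of length (2 * B) - 1 or more
  // is guaranteed to be identified, since any string that long must contain
  // a complete aligned B-byte block; with the default kBlockSize = 32, that
  // threshold is 63 bytes.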
  //
  // Increasing kBlockSize by a factor of two will halve the amount of
  // memory needed for the next block table, and will halve the setup time
  // for a new BlockHash.  However, it also doubles the minimum
  // match length that is guaranteed to be found in FindBestMatch(),
  // so that function will be less effective in finding matches.
  //
  // Computational effort in FindBestMatch (which is the inner loop of
  // the encoding algorithm) will be proportional to the number of
  // matches found, and a low value of kBlockSize will waste time
  // tracking down small matches.  On the other hand, if this value
  // is set too high, no matches will be found at all.
  //
  // It is suggested that different values of kBlockSize be tried against
  // a representative data set to find the best tradeoff between
  // memory/CPU and the effectiveness of FindBestMatch().
  //
  // If you change kBlockSize to a smaller value, please increase
  // kMaxMatchesToCheck accordingly.
  static const int kBlockSize = 32;

  // This class is used to store the best match found by FindBestMatch()
  // and return it to the caller.
  class Match {
   public:
    Match() : size_(0), source_offset_(-1), target_offset_(-1) { }

    void ReplaceIfBetterMatch(size_t candidate_size,
                              int candidate_source_offset,
                              int candidate_target_offset) {
      if (candidate_size > size_) {
        size_ = candidate_size;
        source_offset_ = candidate_source_offset;
        target_offset_ = candidate_target_offset;
      }
    }

    size_t size() const { return size_; }
    int source_offset() const { return source_offset_; }
    int target_offset() const { return target_offset_; }

   private:
    // The size of the best (longest) match passed to ReplaceIfBetterMatch().
    size_t size_;

    // The source offset of the match, including the starting_offset_
    // of the BlockHash for which the match was found.
    int source_offset_;

    // The target offset of the match.  An offset of 0 corresponds to the
    // data at target_start, which is an argument of FindBestMatch().
    int target_offset_;

    // Making these private avoids implicit copy constructor
    // & assignment operator
    Match(const Match&);  // NOLINT
    void operator=(const Match&);
  };

  // A BlockHash is created using a buffer of source data.  The hash table
  // will contain one entry for each kBlockSize-byte block in the
  // source data.
  //
  // See the comments for starting_offset_, below, for a description of
  // the starting_offset argument.  For a hash of source (dictionary) data,
  // starting_offset_ will be zero; for a hash of previously encoded
  // target data, starting_offset_ will be equal to the dictionary size.
  //
  BlockHash(const char* source_data, size_t source_size, int starting_offset);

  ~BlockHash();

  // Initializes the object before use.
  // This method must be called after constructing a BlockHash object,
  // and before any other method may be called.  This is because
  // Init() dynamically allocates hash_table_ and next_block_table_.
  // Returns true if initialization succeeded, or false if an error occurred,
  // in which case no other method except the destructor may then be used
  // on the object.
  //
  // If populate_hash_table is true, then AddAllBlocks() will be called
  // to populate the hash table.  If populate_hash_table is false, then
  // classes that inherit from BlockHash are expected to call AddBlock()
  // to incrementally populate individual blocks of data.
  //
  bool Init(bool populate_hash_table);

  // In the context of the open-vcdiff encoder, BlockHash is used for two
  // purposes: to hash the source (dictionary) data, and to hash
  // the previously encoded target data.  The main differences between
  // a dictionary BlockHash and a target BlockHash are as follows:
  //
  //   1. The best_match->source_offset() returned from FindBestMatch()
  //      for a target BlockHash is computed in the following manner:
  //      the starting offset of the first byte in the target data
  //      is equal to the dictionary size.  FindBestMatch() will add
  //      starting_offset_ to any best_match->source_offset() value it
  //      returns, in order to produce the correct offset value for a
  //      target BlockHash.
  //   2. For a dictionary BlockHash, the entire data set is hashed at once
  //      when Init() is called with the parameter populate_hash_table = true.
  //      For a target BlockHash, because the previously encoded target data
  //      includes only the data seen up to the current encoding position,
  //      the data blocks are hashed incrementally as the encoding position
  //      advances, using AddOneIndexHash() and AddAllBlocksThroughIndex().
  //
  // The following two factory functions can be used to create BlockHash
  // objects for each of these two purposes.  Each factory function calls
  // the object constructor and also calls Init().  If an error occurs,
  // NULL is returned; otherwise a valid BlockHash object is returned.
  // Since a dictionary BlockHash is not expected to be modified after
  // initialization, a const object is returned.
  // The caller is responsible for deleting the returned object
  // (using the C++ delete operator) once it is no longer needed.
  static const BlockHash* CreateDictionaryHash(const char* dictionary_data,
                                               size_t dictionary_size);
  static BlockHash* CreateTargetHash(const char* target_data,
                                     size_t target_size,
                                     size_t dictionary_size);
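  //
  // A minimal usage sketch (here "dictionary" and "target" are hypothetical
  // std::string buffers, not part of this interface):
  //
  //   const BlockHash* dictionary_hash =
  //       BlockHash::CreateDictionaryHash(dictionary.data(),
  //                                       dictionary.size());
  //   BlockHash* target_hash = BlockHash::CreateTargetHash(
  //       target.data(), target.size(), dictionary.size());
  //   if (dictionary_hash == NULL || target_hash == NULL) {
  //     // handle the error; each factory returns NULL on failure
  //   }
  //   ...
  //   delete target_hash;
  //   delete dictionary_hash;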

  const size_t table_size() const { return hash_table_.size(); }
  const int starting_offset() const { return starting_offset_; }

  // This function will be called to add blocks incrementally to the target
  // hash as the encoding position advances through the target data.  It will
  // be called for every kBlockSize-byte block in the target data, regardless
  // of whether the block is aligned evenly on a block boundary.  The
  // BlockHash will only store hash entries for the evenly-aligned blocks.
  //
  void AddOneIndexHash(int index, uint32_t hash_value) {
    if (index == NextIndexToAdd()) {
      AddBlock(hash_value);
    }
  }
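  // For illustration, a sketch of the caller-side loop that offers each
  // window of target data to AddOneIndexHash() as the encoding position
  // advances one byte at a time.  The rolling-hash helper ("hasher"), its
  // Hash()/UpdateHash() methods, and the target_data/target_size variables
  // are assumptions made for this sketch, modeled on the rolling hash used
  // by the encoder (see vcdiffengine.cc), not a guaranteed API.  It assumes
  // target_size >= BlockHash::kBlockSize.
  //
  //   int index = 0;
  //   uint32_t hash_value = hasher.Hash(&target_data[0]);
  //   for (;;) {
  //     target_hash->AddOneIndexHash(index, hash_value);
  //     if (index + BlockHash::kBlockSize >= target_size) break;
  //     hash_value = hasher.UpdateHash(
  //         hash_value, target_data[index],
  //         target_data[index + BlockHash::kBlockSize]);
  //     ++index;
  //   }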

  // Calls AddBlock() for each kBlockSize-byte block in the range
  // (last_block_added_ * kBlockSize, end_index), exclusive of the endpoints.
  // If end_index <= the last index added (last_block_added_ * kBlockSize),
  // this function does nothing.
  //
  // A partial block beginning anywhere up to (end_index - 1) is also added,
  // unless it extends outside the end of the source data.  Like
  // AddAllBlocks(), this function computes the hash value for each of the
  // blocks in question from scratch, so it is not a good option if the hash
  // values have already been computed for some other purpose.
  //
  // Example: assume kBlockSize = 4, last_block_added_ = 1, and there are
  // 14 bytes of source data.
  // If AddAllBlocksThroughIndex(9) is invoked, then it will call AddBlock()
  // only for block number 2 (at index 8).
  // If, after that, AddAllBlocksThroughIndex(14) is invoked, it will not call
  // AddBlock() at all, because block 3 (beginning at index 12) would
  // fall outside the range of source data.
  //
  // VCDiffEngine::Encode (in vcdiffengine.cc) uses this function to
  // add a whole range of data to a target hash when a COPY instruction
  // is generated.
  void AddAllBlocksThroughIndex(int end_index);

  // FindBestMatch takes a position within the unencoded target data
  // (target_candidate_start) and the hash value of the kBlockSize bytes
  // beginning at that position (hash_value).  It attempts to find a matching
  // set of bytes within the source (== dictionary) data, expanding
  // the match both below and above the target block.  It cannot expand
  // the match outside the bounds of the source data, or below
  // target_start within the target data, or past
  // the end limit of (target_start + target_length).
  //
  // target_candidate_start is the start of the candidate block within the
  // target data for which a match will be sought, while
  // target_start (which is <= target_candidate_start)
  // is the start of the target data that has yet to be encoded.
  //
  // If a match is found whose size is greater than the size
  // of best_match, this function populates *best_match with the
  // size, source_offset, and target_offset of the match found.
  // best_match->source_offset() will contain the index of the start of the
  // matching source data, plus starting_offset_
  // (see description of starting_offset_ for details);
  // best_match->target_offset() will contain the offset of the match
  // beginning with target_start = offset 0, such that
  //     0 <= best_match->target_offset()
  //          <= (target_candidate_start - target_start);
  // and best_match->size() will contain the size of the match.
  // If no such match is found, this function leaves *best_match unmodified.
  //
  // On calling FindBestMatch(), best_match must
  // point to a valid Match object, and cannot be NULL.
  // The same Match object can be passed
  // when calling FindBestMatch() on a different BlockHash object
  // for the same candidate data block, in order to find
  // the best match possible across both objects.  For example:
  //
  //   open_vcdiff::BlockHash::Match best_match;
  //   uint32_t hash_value =
  //       RollingHash<BlockHash::kBlockSize>::Hash(target_candidate_start);
  //   bh1.FindBestMatch(hash_value,
  //                     target_candidate_start,
  //                     target_start,
  //                     target_length,
  //                     &best_match);
  //   bh2.FindBestMatch(hash_value,
  //                     target_candidate_start,
  //                     target_start,
  //                     target_length,
  //                     &best_match);
  //   if (best_match.size() > 0) {
  //     // a match was found; its size, source offset, and target offset
  //     // can be found in best_match
  //   }
  //
  // hash_value is passed as a separate parameter from target_candidate_start
  // (rather than calculated within FindBestMatch) in order to take
  // advantage of the rolling hash, which quickly calculates the hash value
  // of the block starting at target_candidate_start based on the known hash
  // value of the block starting at (target_candidate_start - 1).
  // See vcdiffengine.cc for more details.
  //
  // Example:
  //    kBlockSize: 4
  //    target text:   "ANDREW LLOYD WEBBER"
  //                   1^    5^2^         3^
  //    dictionary:    "INSURANCE : LLOYDS OF LONDON"
  //                              4^
  //    hashed dictionary blocks:
  //        "INSU", "RANC", "E : ", "LLOY", "DS O", "F LO", "NDON"
  //
  //    1: target_start (beginning of unencoded data)
  //    2: target_candidate_start (for the block "LLOY")
  //    3: target_length (points one byte beyond the last byte of data.)
  //    4: best_match->source_offset() (after calling FindBestMatch)
  //    5: best_match->target_offset() (after calling FindBestMatch)
  //
  // Under these conditions, FindBestMatch will find a matching
  // hashed dictionary block for "LLOY", and will extend the beginning of
  // this match backwards by one byte, and the end of the match forwards
  // by one byte, finding that the best match is " LLOYD"
  // with best_match->source_offset() = 11
  //                  (offset of " LLOYD" in the source string),
  //      best_match->target_offset() = 6
  //                  (offset of " LLOYD" in the target string),
  //      and best_match->size() = 6.
  //
  void FindBestMatch(uint32_t hash_value,
                     const char* target_candidate_start,
                     const char* target_start,
                     size_t target_size,
                     Match* best_match) const;

 protected:
  // FindBestMatch() will not process more than this number
  // of matching hash entries.
  //
  // It is necessary to have a limit on the maximum number of matches
  // that will be checked in order to avoid the worst-case performance
  // possible if, for example, all the blocks in the dictionary have
  // the same hash value.  See the unit test SearchStringFindsTooManyMatches
  // for an example of such a case.  The encoder uses a loop in
  // VCDiffEngine::Encode over each target byte, containing a loop in
  // BlockHash::FindBestMatch over the number of matches (up to a maximum
  // of the number of source blocks), containing two loops that extend
  // the match forwards and backwards up to the number of source bytes.
  // Total complexity in the worst case is
  //     O([target size] * source_size_ * source_size_)
  // Placing a limit on the possible number of matches checked changes this to
  //     O([target size] * source_size_ * kMaxMatchesToCheck)
  //
  // In empirical testing on real HTML text, using a block size of 4,
  // the number of true matches per call to FindBestMatch() did not exceed 78;
  // with a block size of 32, the number of matches did not exceed 3.
  //
  // The expected number of true matches scales super-linearly
  // with the inverse of kBlockSize, but here a linear scale is used
  // for block sizes smaller than 32.
  static const int kMaxMatchesToCheck = (kBlockSize >= 32) ? 8 :
                                        (8 * (32 / kBlockSize));
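  // (With this formula, kBlockSize = 32 checks at most 8 matches,
  // kBlockSize = 16 at most 16, and kBlockSize = 4 at most 64.)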

  // Do not skip more than this number of non-matching hash collisions
  // to find the next matching entry in the hash chain.
  static const int kMaxProbes = 16;

  // Internal routine which calculates a hash table size based on kBlockSize
  // and the dictionary_size.  Will return a power of two if successful, or
  // 0 if an internal error occurs.  Some calculations (such as
  // GetHashTableIndex()) depend on the table size being a power of two.
  static const size_t CalcTableSize(const size_t dictionary_size);

  const size_t GetNumberOfBlocks() const {
    return source_size_ / kBlockSize;
  }

  // Use the lowest-order bits of the hash value
  // as the index into the hash table.
  int GetHashTableIndex(uint32_t hash_value) const {
    return hash_value & (table_size() - 1);
  }
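  // For example, if table_size() == 8, the mask is 0x7, so a hash value of
  // 0x1234ABCD maps to index 0x5.  Masking works as a modulo here only
  // because CalcTableSize() guarantees a power-of-two table size.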

  // The index within source_data_ of the next block
  // for which AddBlock() should be called.
  int NextIndexToAdd() const {
    return (last_block_added_ + 1) * kBlockSize;
  }

  static inline bool TooManyMatches(int* match_counter);

  const char* const source_data() { return source_data_; }
  const size_t source_size() { return source_size_; }

  // Adds an entry to the hash table for one block of source data of length
  // kBlockSize, starting at source_data_[block_number * kBlockSize],
  // where block_number is always (last_block_added_ + 1).  That is,
  // AddBlock() must be called once for each block in source_data_
  // in increasing order.
  void AddBlock(uint32_t hash_value);

  // Calls AddBlock() for each complete kBlockSize-byte block between
  // source_data_ and (source_data_ + source_size_).  It is equivalent
  // to calling AddAllBlocksThroughIndex(source_size_).
  // This function is called when Init(true) is invoked.
  void AddAllBlocks();

  // Returns true if the contents of the kBlockSize-byte block
  // beginning at block1 are identical to the contents of
  // the block beginning at block2; false otherwise.
  static bool BlockContentsMatch(const char* block1, const char* block2);

  // Finds the first block number within the hashed data
  // that represents a match for the given hash value.
  // Returns -1 if no match was found.
  //
  // Init() must have been called and returned true before using
  // FirstMatchingBlock or NextMatchingBlock.  No check is performed
  // for this condition; the code will crash if this condition is violated.
  //
  // The hash table is initially populated with -1 (not found) values,
  // so if this function is called before the hash table has been populated
  // using AddAllBlocks() or AddBlock(), it will simply return -1
  // for any value of hash_value.
  int FirstMatchingBlock(uint32_t hash_value, const char* block_ptr) const;

  // Given a block number returned by FirstMatchingBlock()
  // or by a previous call to NextMatchingBlock(), returns
  // the next block number that matches the same hash value.
  // Returns -1 if no match was found.
  int NextMatchingBlock(int block_number, const char* block_ptr) const;
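  // Taken together, these two functions enumerate all true matches for a
  // candidate block.  A sketch of the lookup loop (illustrative only;
  // internal callers use the inline variants declared below where available):
  //
  //   for (int block_number = FirstMatchingBlock(hash_value, candidate_block);
  //        block_number >= 0;
  //        block_number = NextMatchingBlock(block_number, candidate_block)) {
  //     // block_number * kBlockSize is the offset of a matching source block
  //   }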

  // Inline versions of BlockContentsMatch and FirstMatchingBlock.
  // These save the cost of a function call
  // when these routines are called from within the module.
  // The external (non-inlined) versions are called only by unit tests.
  static inline bool BlockContentsMatchInline(const char* block1,
                                              const char* block2);
  inline int FirstMatchingBlockInline(uint32_t hash_value,
                                      const char* block_ptr) const;

  // Walk through the hash entry chain, skipping over any false matches
  // (for which the lowest bits of the fingerprints match,
  // but the actual block data does not.)  Returns the block number of
  // the first true match found, or -1 if no true match was found.
  // If block_number is a matching block, the function will return
  // block_number without skipping to the next block.
  int SkipNonMatchingBlocks(int block_number, const char* block_ptr) const;

  // Returns the number of bytes to the left of source_match_start
  // that match the corresponding bytes to the left of target_match_start.
  // Will not examine more than max_bytes bytes, which is to say that
  // the return value will be in the range [0, max_bytes] inclusive.
  static int MatchingBytesToLeft(const char* source_match_start,
                                 const char* target_match_start,
                                 int max_bytes);

  // Returns the number of bytes starting at source_match_end
  // that match the corresponding bytes starting at target_match_end.
  // Will not examine more than max_bytes bytes, which is to say that
  // the return value will be in the range [0, max_bytes] inclusive.
  static int MatchingBytesToRight(const char* source_match_end,
                                  const char* target_match_end,
                                  int max_bytes);

  // The protected functions BlockContentsMatch, FirstMatchingBlock,
  // NextMatchingBlock, MatchingBytesToLeft, and MatchingBytesToRight
  // should be made accessible to unit tests.
  friend class BlockHashTest;

 private:
  const char* const source_data_;
  const size_t source_size_;

  // The size of this array is determined using CalcTableSize().  It has at
  // least one element for each kBlockSize-byte block in the source data.
  // GetHashTableIndex() returns an index into this table for a given hash
  // value.  The value of each element of hash_table_ is the lowest block
  // number in the source data whose hash value would return the same value
  // from GetHashTableIndex(), or -1 if there is no matching block.  This
  // value can then be used as an index into next_block_table_ to retrieve
  // the entire set of matching block numbers.
  std::vector<int> hash_table_;

  // An array containing one element for each source block.  Each element is
  // either -1 (== not found) or the index of the next block whose hash value
  // would produce a matching result from GetHashTableIndex().
  std::vector<int> next_block_table_;

  // This array has the same size as hash_table_.  For every block number B
  // that is referenced in hash_table_, last_block_table_[B] will contain the
  // maximum block number among all blocks that have the same
  // GetHashTableIndex() value as the block B.  This number may be B itself.
  // For a block number B' that is not referenced in hash_table_, the value
  // of last_block_table_[B'] is -1.  This table is used only while
  // populating the hash table, not while looking up hash values in the
  // table.  Keeping track of the last block number in the chain allows us to
  // construct the block chains as FIFO rather than LIFO lists, so that the
  // match with the lowest index is returned first.  This should result in a
  // more compact encoding because the VCDIFF format favors smaller index
  // values and repeated index values.
  std::vector<int> last_block_table_;
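  //
  // For example, suppose blocks 2, 5, and 9 are the only blocks whose hash
  // values map to a particular table index h under GetHashTableIndex().
  // Once all blocks have been added:
  //     hash_table_[h] == 2             (lowest matching block number)
  //     next_block_table_[2] == 5
  //     next_block_table_[5] == 9
  //     next_block_table_[9] == -1      (end of chain)
  //     last_block_table_[2] == 9       (tail of the chain; used only while
  //                                      populating the tables)
  // so a lookup for that hash value visits blocks 2, 5, and 9 in that order.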

  // The offset of the first byte of source data (the data at source_data_[0]).
  // For the purpose of computing offsets, the source data and target data
  // are considered to be concatenated -- not literally in a single memory
  // buffer, but conceptually as described in the RFC.
  // The first byte of the previously encoded target data
  // has an offset that is equal to dictionary_size, i.e., just after
  // the last byte of source data.
  // For a hash of source (dictionary) data, starting_offset_ will be zero;
  // for a hash of previously encoded target data, starting_offset_ will be
  // equal to the dictionary size.
  const int starting_offset_;

  // The block number of the last block added via AddBlock().  This determines
  // the block number for successive calls to AddBlock(), and is also
  // used to determine the starting block for AddAllBlocksThroughIndex().
  int last_block_added_;

  // Making these private avoids implicit copy constructor & assignment operator
  BlockHash(const BlockHash&);  // NOLINT
  void operator=(const BlockHash&);
};

}  // namespace open_vcdiff

#endif  // OPEN_VCDIFF_BLOCKHASH_H_