Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(9)

Side by Side Diff: third_party/brotli/enc/hash.h

Issue 1956893002: Added brotli enc/ and tools/ directories. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Updated to most recent build tools Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/brotli/enc/find_match_length.h ('k') | third_party/brotli/enc/histogram.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /* Copyright 2010 Google Inc. All Rights Reserved.
2
3 Distributed under MIT license.
4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6
7 // A (forgetful) hash table to the data seen by the compressor, to
8 // help create backward references to previous data.
9
10 #ifndef BROTLI_ENC_HASH_H_
11 #define BROTLI_ENC_HASH_H_
12
13 #include <sys/types.h>
14 #include <algorithm>
15 #include <cstring>
16 #include <limits>
17
18 #include "./dictionary_hash.h"
19 #include "./fast_log.h"
20 #include "./find_match_length.h"
21 #include "./port.h"
22 #include "./prefix.h"
23 #include "./static_dict.h"
24 #include "./transform.h"
25 #include "./types.h"
26
27 namespace brotli {
28
// Maximum depth of the binary-tree search in HashToBinaryTree.
static const size_t kMaxTreeSearchDepth = 64;
// Length of the sequences compared/stored by HashToBinaryTree.
static const size_t kMaxTreeCompLength = 128;

// The 16 distance "short codes": kDistanceCacheIndex[i] selects which of the
// last distances (distance_cache) to start from, and kDistanceCacheOffset[i]
// is added to it (see the "Try last distance first" loop in
// HashLongestMatch::FindLongestMatch).
static const uint32_t kDistanceCacheIndex[] = {
  0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
};
static const int kDistanceCacheOffset[] = {
  0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
};

// Transform ids used when a static-dictionary word is matched only partially:
// kCutoffTransforms[len - matchlen] gives the transform for a match that is
// (len - matchlen) bytes shorter than the dictionary word. Indices must stay
// below kCutoffTransformsCount.
static const uint32_t kCutoffTransformsCount = 10;
static const uint8_t kCutoffTransforms[] = {
  0, 12, 27, 23, 42, 63, 56, 48, 59, 64
};

// kHashMul32 multiplier has these properties:
// * The multiplier must be odd. Otherwise we may lose the highest bit.
// * No long streaks of 1s or 0s.
// * There is no effort to ensure that it is a prime, the oddity is enough
//   for this use.
// * The number has been tuned heuristically against compression benchmarks.
static const uint32_t kHashMul32 = 0x1e35a7bd;
51
52 template<int kShiftBits>
53 inline uint32_t Hash(const uint8_t *data) {
54 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
55 // The higher bits contain more mixture from the multiplication,
56 // so we take our results from there.
57 return h >> (32 - kShiftBits);
58 }
59
60 // Usually, we always choose the longest backward reference. This function
61 // allows for the exception of that rule.
62 //
63 // If we choose a backward reference that is further away, it will
64 // usually be coded with more bits. We approximate this by assuming
65 // log2(distance). If the distance can be expressed in terms of the
66 // last four distances, we use some heuristic constants to estimate
67 // the bits cost. For the first up to four literals we use the bit
68 // cost of the literals from the literal cost model, after that we
69 // use the average bit cost of the cost model.
70 //
71 // This function is used to sometimes discard a longer backward reference
72 // when it is not much longer and the bit cost for encoding it is more
73 // than the saved literals.
74 //
75 // backward_reference_offset MUST be positive.
76 inline double BackwardReferenceScore(size_t copy_length,
77 size_t backward_reference_offset) {
78 return 5.4 * static_cast<double>(copy_length) -
79 1.20 * Log2FloorNonZero(backward_reference_offset);
80 }
81
// Score for a match encoded with one of the 16 distance "short codes"
// (see kDistanceCacheIndex/kDistanceCacheOffset). Each short code has a
// fixed heuristic bit cost; code 0 (exact last distance) is so cheap it
// carries a small bonus (negative cost).
inline double BackwardReferenceScoreUsingLastDistance(size_t copy_length,
                                                      size_t distance_short_code) {
  static const double kDistanceShortCodeBitCost[16] = {
    -0.6, 0.95, 1.17, 1.27,
    0.93, 0.93, 0.96, 0.96, 0.99, 0.99,
    1.05, 1.05, 1.15, 1.15, 1.25, 1.25
  };
  const double copy_gain = 5.4 * static_cast<double>(copy_length);
  return copy_gain - kDistanceShortCodeBitCost[distance_short_code];
}
92
// One backward-match candidate: the distance to the copy source plus the
// match length, with an optional 5-bit dictionary length code packed into
// the low bits of length_and_code.
struct BackwardMatch {
  // Empty match (zero distance, zero length).
  BackwardMatch(void) : distance(0), length_and_code(0) {}

  // Plain match: the emitted length code equals the copied length.
  BackwardMatch(size_t dist, size_t len)
      : distance(static_cast<uint32_t>(dist)),
        length_and_code(static_cast<uint32_t>(len << 5)) {}

  // Dictionary match whose emitted length code may differ from the copied
  // length; a code equal to the length is stored as 0 (see length_code()).
  BackwardMatch(size_t dist, size_t len, size_t len_code)
      : distance(static_cast<uint32_t>(dist)),
        length_and_code(static_cast<uint32_t>(
            (len << 5) | (len == len_code ? 0 : len_code))) {}

  // Copied length in bytes (upper 27 bits).
  size_t length(void) const { return length_and_code >> 5; }

  // Length code to emit; a stored code of 0 means "same as length()".
  size_t length_code(void) const {
    const size_t code = length_and_code & 31;
    return code == 0 ? length() : code;
  }

  uint32_t distance;
  uint32_t length_and_code;
};
116
// A (forgetful) hash table to the data seen by the compressor, to
// help create backward references to previous data.
//
// This is a hash map of fixed size (kBucketSize). Starting from the
// given index, kBucketSweep buckets are used to store values of a key.
template <int kBucketBits, int kBucketSweep, bool kUseDictionary>
class HashLongestMatchQuickly {
 public:
  HashLongestMatchQuickly(void) {
    Reset();
  }
  // Marks the table for lazy (re-)initialization and clears the running
  // static-dictionary lookup statistics. Buckets are cleared in Init().
  void Reset(void) {
    need_init_ = true;
    num_dict_lookups_ = 0;
    num_dict_matches_ = 0;
  }
  void Init(void) {
    if (need_init_) {
      // It is not strictly necessary to fill this buffer here, but
      // not filling will make the results of the compression stochastic
      // (but correct). This is because random data would cause the
      // system to find accidentally good backward references here and there.
      memset(&buckets_[0], 0, sizeof(buckets_));
      need_init_ = false;
    }
  }
  // Partial initialization: clears only the bucket runs that the first
  // `num` bytes of `data` hash to, instead of the whole table.
  void InitForData(const uint8_t* data, size_t num) {
    for (size_t i = 0; i < num; ++i) {
      const uint32_t key = HashBytes(&data[i]);
      memset(&buckets_[key], 0, kBucketSweep * sizeof(buckets_[0]));
      need_init_ = false;
    }
  }
  // Look at 5 bytes at data (kHashLength == 5, see HashBytes).
  // Compute a hash from these, and store ix in one of the kBucketSweep
  // slots starting at the hashed key; the slot is picked from bits of ix.
  inline void Store(const uint8_t *data, const uint32_t ix) {
    const uint32_t key = HashBytes(data);
    // Wiggle the value with the bucket sweep range.
    const uint32_t off = (ix >> 3) % kBucketSweep;
    buckets_[key + off] = ix;
  }

  // Find a longest backward match of &ring_buffer[cur_ix & ring_buffer_mask]
  // up to the length of max_length and stores the position cur_ix in the
  // hash table.
  //
  // Does not look for matches longer than max_length.
  // Does not look for matches further away than max_backward.
  // Writes the best found match length into best_len_out.
  // Writes the index (&data[index]) of the start of the best match into
  // best_distance_out.
  // Returns true if a match better than the incoming *best_score_out was
  // found (the out-params are only updated in that case).
  inline bool FindLongestMatch(const uint8_t * __restrict ring_buffer,
                               const size_t ring_buffer_mask,
                               const int* __restrict distance_cache,
                               const size_t cur_ix,
                               const size_t max_length,
                               const size_t max_backward,
                               size_t * __restrict best_len_out,
                               size_t * __restrict best_len_code_out,
                               size_t * __restrict best_distance_out,
                               double* __restrict best_score_out) {
    const size_t best_len_in = *best_len_out;
    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    const uint32_t key = HashBytes(&ring_buffer[cur_ix_masked]);
    // Cheap pre-filter: a candidate can only beat best_len if it matches
    // at least at position best_len, so compare that byte first.
    int compare_char = ring_buffer[cur_ix_masked + best_len_in];
    double best_score = *best_score_out;
    size_t best_len = best_len_in;
    size_t cached_backward = static_cast<size_t>(distance_cache[0]);
    size_t prev_ix = cur_ix - cached_backward;
    bool match_found = false;
    // First try re-using the most recent distance (short code 0).
    if (prev_ix < cur_ix) {
      prev_ix &= static_cast<uint32_t>(ring_buffer_mask);
      if (compare_char == ring_buffer[prev_ix + best_len]) {
        size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
                                              &ring_buffer[cur_ix_masked],
                                              max_length);
        if (len >= 4) {
          best_score = BackwardReferenceScoreUsingLastDistance(len, 0);
          best_len = len;
          *best_len_out = len;
          *best_len_code_out = len;
          *best_distance_out = cached_backward;
          *best_score_out = best_score;
          compare_char = ring_buffer[cur_ix_masked + best_len];
          if (kBucketSweep == 1) {
            buckets_[key] = static_cast<uint32_t>(cur_ix);
            return true;
          } else {
            match_found = true;
          }
        }
      }
    }
    if (kBucketSweep == 1) {
      // Only one to look for, don't bother to prepare for a loop.
      prev_ix = buckets_[key];
      buckets_[key] = static_cast<uint32_t>(cur_ix);
      size_t backward = cur_ix - prev_ix;
      prev_ix &= static_cast<uint32_t>(ring_buffer_mask);
      if (compare_char != ring_buffer[prev_ix + best_len_in]) {
        return false;
      }
      if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
        return false;
      }
      const size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
                                                  &ring_buffer[cur_ix_masked],
                                                  max_length);
      if (len >= 4) {
        *best_len_out = len;
        *best_len_code_out = len;
        *best_distance_out = backward;
        *best_score_out = BackwardReferenceScore(len, backward);
        return true;
      }
    } else {
      // Sweep over the kBucketSweep candidate slots for this key.
      uint32_t *bucket = buckets_ + key;
      prev_ix = *bucket++;
      for (int i = 0; i < kBucketSweep; ++i, prev_ix = *bucket++) {
        const size_t backward = cur_ix - prev_ix;
        prev_ix &= static_cast<uint32_t>(ring_buffer_mask);
        if (compare_char != ring_buffer[prev_ix + best_len]) {
          continue;
        }
        if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
          continue;
        }
        const size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
                                                    &ring_buffer[cur_ix_masked],
                                                    max_length);
        if (len >= 4) {
          const double score = BackwardReferenceScore(len, backward);
          if (best_score < score) {
            best_score = score;
            best_len = len;
            *best_len_out = best_len;
            *best_len_code_out = best_len;
            *best_distance_out = backward;
            *best_score_out = score;
            compare_char = ring_buffer[cur_ix_masked + best_len];
            match_found = true;
          }
        }
      }
    }
    // Optional static-dictionary fallback, throttled by the observed hit
    // rate: only keep looking while at least ~1/128 of lookups matched.
    if (kUseDictionary && !match_found &&
        num_dict_matches_ >= (num_dict_lookups_ >> 7)) {
      ++num_dict_lookups_;
      const uint32_t dict_key = Hash<14>(&ring_buffer[cur_ix_masked]) << 1;
      const uint16_t v = kStaticDictionaryHash[dict_key];
      if (v > 0) {
        const uint32_t len = v & 31;
        const uint32_t dist = v >> 5;
        const size_t offset =
            kBrotliDictionaryOffsetsByLength[len] + len * dist;
        if (len <= max_length) {
          const size_t matchlen =
              FindMatchLengthWithLimit(&ring_buffer[cur_ix_masked],
                                       &kBrotliDictionary[offset], len);
          // Partial dictionary matches are allowed when a cutoff transform
          // exists for the missing (len - matchlen) tail bytes.
          if (matchlen + kCutoffTransformsCount > len && matchlen > 0) {
            const size_t transform_id = kCutoffTransforms[len - matchlen];
            const size_t word_id =
                transform_id * (1u << kBrotliDictionarySizeBitsByLength[len]) +
                dist;
            // Dictionary words are addressed as distances just beyond the
            // regular backward window.
            const size_t backward = max_backward + word_id + 1;
            const double score = BackwardReferenceScore(matchlen, backward);
            if (best_score < score) {
              ++num_dict_matches_;
              best_score = score;
              best_len = matchlen;
              *best_len_out = best_len;
              *best_len_code_out = len;
              *best_distance_out = backward;
              *best_score_out = best_score;
              match_found = true;
            }
          }
        }
      }
    }
    // Finally store cur_ix for future lookups, like Store() does.
    const uint32_t off = (cur_ix >> 3) % kBucketSweep;
    buckets_[key + off] = static_cast<uint32_t>(cur_ix);
    return match_found;
  }

  enum { kHashLength = 5 };
  enum { kHashTypeLength = 8 };
  // HashBytes is the function that chooses the bucket to place
  // the address in. The HashLongestMatch and HashLongestMatchQuickly
  // classes have separate, different implementations of hashing.
  static uint32_t HashBytes(const uint8_t *data) {
    // Computing a hash based on 5 bytes works much better for
    // qualities 1 and 3, where the next hash value is likely to replace
    // the stored one before it is used again.
    // The << 24 drops the top 3 of the 8 loaded bytes, leaving 5.
    uint64_t h = (BROTLI_UNALIGNED_LOAD64(data) << 24) * kHashMul32;
    // The higher bits contain more mixture from the multiplication,
    // so we take our results from there.
    return static_cast<uint32_t>(h >> (64 - kBucketBits));
  }

  enum { kHashMapSize = 4 << kBucketBits };

 private:
  static const uint32_t kBucketSize = 1 << kBucketBits;
  // kBucketSweep extra slots so Store()'s key + off never goes out of range.
  uint32_t buckets_[kBucketSize + kBucketSweep];
  // True if buckets_ array needs to be initialized.
  bool need_init_;
  // Running counters used to throttle static-dictionary lookups.
  size_t num_dict_lookups_;
  size_t num_dict_matches_;
};
327
// A (forgetful) hash table to the data seen by the compressor, to
// help create backward references to previous data.
//
// This is a hash map of fixed size (kBucketSize) to a ring buffer of
// fixed size (kBlockSize). The ring buffer contains the last kBlockSize
// index positions of the given hash key in the compressed data.
template <int kBucketBits,
          int kBlockBits,
          int kNumLastDistancesToCheck>
class HashLongestMatch {
 public:
  HashLongestMatch(void) {
    Reset();
  }

  // Marks the table for lazy (re-)initialization and clears the running
  // static-dictionary lookup statistics.
  void Reset(void) {
    need_init_ = true;
    num_dict_lookups_ = 0;
    num_dict_matches_ = 0;
  }

  // Zeroing num_ is enough: stale buckets_ entries are never read once
  // their count is 0.
  void Init(void) {
    if (need_init_) {
      memset(&num_[0], 0, sizeof(num_));
      need_init_ = false;
    }
  }

  // Partial initialization: resets only the buckets the first `num` bytes
  // of `data` hash to.
  void InitForData(const uint8_t* data, size_t num) {
    for (size_t i = 0; i < num; ++i) {
      const uint32_t key = HashBytes(&data[i]);
      num_[key] = 0;
      need_init_ = false;
    }
  }

  // Look at 4 bytes at data.
  // Compute a hash from these, and store the value of ix at that position.
  inline void Store(const uint8_t *data, const uint32_t ix) {
    const uint32_t key = HashBytes(data);
    // num_[key] may exceed kBlockSize; the block is a ring buffer.
    const int minor_ix = num_[key] & kBlockMask;
    buckets_[key][minor_ix] = ix;
    ++num_[key];
  }

  // Find a longest backward match of &data[cur_ix] up to the length of
  // max_length and stores the position cur_ix in the hash table.
  //
  // Does not look for matches longer than max_length.
  // Does not look for matches further away than max_backward.
  // Writes the best found match length into best_len_out.
  // Writes the index (&data[index]) offset from the start of the best match
  // into best_distance_out.
  // Write the score of the best match into best_score_out.
  // Returns true if a match better than the incoming *best_score_out was
  // found.
  bool FindLongestMatch(const uint8_t * __restrict data,
                        const size_t ring_buffer_mask,
                        const int* __restrict distance_cache,
                        const size_t cur_ix,
                        const size_t max_length,
                        const size_t max_backward,
                        size_t * __restrict best_len_out,
                        size_t * __restrict best_len_code_out,
                        size_t * __restrict best_distance_out,
                        double * __restrict best_score_out) {
    *best_len_code_out = 0;
    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    bool match_found = false;
    // Don't accept a short copy from far away.
    double best_score = *best_score_out;
    size_t best_len = *best_len_out;
    *best_len_out = 0;
    // Try last distance first.
    for (size_t i = 0; i < kNumLastDistancesToCheck; ++i) {
      // Each short code is a cached distance plus a small offset (see
      // kDistanceCacheIndex / kDistanceCacheOffset).
      const size_t idx = kDistanceCacheIndex[i];
      const size_t backward =
          static_cast<size_t>(distance_cache[idx] + kDistanceCacheOffset[i]);
      size_t prev_ix = static_cast<size_t>(cur_ix - backward);
      if (prev_ix >= cur_ix) {
        continue;
      }
      if (PREDICT_FALSE(backward > max_backward)) {
        continue;
      }
      prev_ix &= ring_buffer_mask;

      // Quick reject: candidate must at least match at position best_len.
      if (cur_ix_masked + best_len > ring_buffer_mask ||
          prev_ix + best_len > ring_buffer_mask ||
          data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
        continue;
      }
      const size_t len = FindMatchLengthWithLimit(&data[prev_ix],
                                                  &data[cur_ix_masked],
                                                  max_length);
      if (len >= 3 || (len == 2 && i < 2)) {
        // Comparing for >= 2 does not change the semantics, but just saves for
        // a few unnecessary binary logarithms in backward reference score,
        // since we are not interested in such short matches.
        double score = BackwardReferenceScoreUsingLastDistance(len, i);
        if (best_score < score) {
          best_score = score;
          best_len = len;
          *best_len_out = best_len;
          *best_len_code_out = best_len;
          *best_distance_out = backward;
          *best_score_out = best_score;
          match_found = true;
        }
      }
    }
    // Then walk this key's block of stored positions, newest first.
    const uint32_t key = HashBytes(&data[cur_ix_masked]);
    const uint32_t * __restrict const bucket = &buckets_[key][0];
    const size_t down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
    for (size_t i = num_[key]; i > down;) {
      --i;
      size_t prev_ix = bucket[i & kBlockMask];
      const size_t backward = cur_ix - prev_ix;
      if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
        // Entries are stored in increasing position order, so all older
        // ones are out of range too.
        break;
      }
      prev_ix &= ring_buffer_mask;
      if (cur_ix_masked + best_len > ring_buffer_mask ||
          prev_ix + best_len > ring_buffer_mask ||
          data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
        continue;
      }
      const size_t len = FindMatchLengthWithLimit(&data[prev_ix],
                                                  &data[cur_ix_masked],
                                                  max_length);
      if (len >= 4) {
        // Comparing for >= 3 does not change the semantics, but just saves
        // for a few unnecessary binary logarithms in backward reference
        // score, since we are not interested in such short matches.
        double score = BackwardReferenceScore(len, backward);
        if (best_score < score) {
          best_score = score;
          best_len = len;
          *best_len_out = best_len;
          *best_len_code_out = best_len;
          *best_distance_out = backward;
          *best_score_out = best_score;
          match_found = true;
        }
      }
    }
    // Register the current position before the (optional) dictionary pass.
    buckets_[key][num_[key] & kBlockMask] = static_cast<uint32_t>(cur_ix);
    ++num_[key];
    // Static-dictionary fallback, throttled by the observed hit rate:
    // only keep looking while at least ~1/128 of past lookups matched.
    if (!match_found && num_dict_matches_ >= (num_dict_lookups_ >> 7)) {
      size_t dict_key = Hash<14>(&data[cur_ix_masked]) << 1;
      // Two consecutive hash slots are probed per position.
      for (int k = 0; k < 2; ++k, ++dict_key) {
        ++num_dict_lookups_;
        const uint16_t v = kStaticDictionaryHash[dict_key];
        if (v > 0) {
          const size_t len = v & 31;
          const size_t dist = v >> 5;
          const size_t offset =
              kBrotliDictionaryOffsetsByLength[len] + len * dist;
          if (len <= max_length) {
            const size_t matchlen =
                FindMatchLengthWithLimit(&data[cur_ix_masked],
                                         &kBrotliDictionary[offset], len);
            // Partial dictionary matches are allowed when a cutoff transform
            // exists for the missing (len - matchlen) tail bytes.
            if (matchlen + kCutoffTransformsCount > len && matchlen > 0) {
              const size_t transform_id = kCutoffTransforms[len - matchlen];
              const size_t word_id =
                  transform_id * (1 << kBrotliDictionarySizeBitsByLength[len]) +
                  dist;
              // Dictionary words are addressed as distances just beyond the
              // regular backward window.
              const size_t backward = max_backward + word_id + 1;
              double score = BackwardReferenceScore(matchlen, backward);
              if (best_score < score) {
                ++num_dict_matches_;
                best_score = score;
                best_len = matchlen;
                *best_len_out = best_len;
                *best_len_code_out = len;
                *best_distance_out = backward;
                *best_score_out = best_score;
                match_found = true;
              }
            }
          }
        }
      }
    }
    return match_found;
  }

  // Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the
  // length of max_length and stores the position cur_ix in the hash table.
  //
  // Returns the number of matches found, and stores the found
  // matches in matches[0] to matches[return value - 1]. The matches will be
  // sorted by strictly increasing length and (non-strictly) increasing
  // distance.
  size_t FindAllMatches(const uint8_t* data,
                        const size_t ring_buffer_mask,
                        const size_t cur_ix,
                        const size_t max_length,
                        const size_t max_backward,
                        BackwardMatch* matches) {
    BackwardMatch* const orig_matches = matches;
    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    size_t best_len = 1;
    // Short (len 2) matches are only found by brute force over the last
    // 63 positions; the hash covers 4 bytes and cannot see them.
    size_t stop = cur_ix - 64;
    if (cur_ix < 64) { stop = 0; }
    for (size_t i = cur_ix - 1; i > stop && best_len <= 2; --i) {
      size_t prev_ix = i;
      const size_t backward = cur_ix - prev_ix;
      if (PREDICT_FALSE(backward > max_backward)) {
        break;
      }
      prev_ix &= ring_buffer_mask;
      if (data[cur_ix_masked] != data[prev_ix] ||
          data[cur_ix_masked + 1] != data[prev_ix + 1]) {
        continue;
      }
      const size_t len =
          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                   max_length);
      if (len > best_len) {
        best_len = len;
        *matches++ = BackwardMatch(backward, len);
      }
    }
    // Walk this key's block of stored positions, newest first.
    const uint32_t key = HashBytes(&data[cur_ix_masked]);
    const uint32_t * __restrict const bucket = &buckets_[key][0];
    const size_t down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
    for (size_t i = num_[key]; i > down;) {
      --i;
      size_t prev_ix = bucket[i & kBlockMask];
      const size_t backward = cur_ix - prev_ix;
      if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
        break;
      }
      prev_ix &= ring_buffer_mask;
      if (cur_ix_masked + best_len > ring_buffer_mask ||
          prev_ix + best_len > ring_buffer_mask ||
          data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
        continue;
      }
      const size_t len =
          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                   max_length);
      if (len > best_len) {
        best_len = len;
        *matches++ = BackwardMatch(backward, len);
      }
    }
    buckets_[key][num_[key] & kBlockMask] = static_cast<uint32_t>(cur_ix);
    ++num_[key];
    // Finally collect static-dictionary matches longer than anything found.
    uint32_t dict_matches[kMaxDictionaryMatchLen + 1];
    for (size_t i = 0; i <= kMaxDictionaryMatchLen; ++i) {
      dict_matches[i] = kInvalidMatch;
    }
    size_t minlen = std::max<size_t>(4, best_len + 1);
    if (FindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen, max_length,
                                       &dict_matches[0])) {
      size_t maxlen = std::min<size_t>(kMaxDictionaryMatchLen, max_length);
      for (size_t l = minlen; l <= maxlen; ++l) {
        uint32_t dict_id = dict_matches[l];
        if (dict_id < kInvalidMatch) {
          // dict_id packs (word index << 5) | transform length code.
          *matches++ = BackwardMatch(max_backward + (dict_id >> 5) + 1, l,
                                     dict_id & 31);
        }
      }
    }
    return static_cast<size_t>(matches - orig_matches);
  }

  enum { kHashLength = 4 };
  enum { kHashTypeLength = 4 };

  // HashBytes is the function that chooses the bucket to place
  // the address in. The HashLongestMatch and HashLongestMatchQuickly
  // classes have separate, different implementations of hashing.
  static uint32_t HashBytes(const uint8_t *data) {
    uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
    // The higher bits contain more mixture from the multiplication,
    // so we take our results from there.
    return h >> (32 - kBucketBits);
  }

  enum { kHashMapSize = 2 << kBucketBits };

  // Upper bound on matches FindAllMatches can emit: up to 64 short matches
  // from the brute-force scan plus one per stored block entry.
  static const size_t kMaxNumMatches = 64 + (1 << kBlockBits);

 private:
  // Number of hash buckets.
  static const uint32_t kBucketSize = 1 << kBucketBits;

  // Only kBlockSize newest backward references are kept,
  // and the older are forgotten.
  static const uint32_t kBlockSize = 1 << kBlockBits;

  // Mask for accessing entries in a block (in a ringbuffer manner).
  static const uint32_t kBlockMask = (1 << kBlockBits) - 1;

  // Number of entries in a particular bucket.
  uint16_t num_[kBucketSize];

  // Buckets containing kBlockSize of backward references.
  uint32_t buckets_[kBucketSize][kBlockSize];

  // True if num_ array needs to be initialized.
  bool need_init_;

  // Running counters used to throttle static-dictionary lookups.
  size_t num_dict_lookups_;
  size_t num_dict_matches_;
};
635
636 // A (forgetful) hash table where each hash bucket contains a binary tree of
637 // sequences whose first 4 bytes share the same hash code.
638 // Each sequence is kMaxTreeCompLength long and is identified by its starting
639 // position in the input data. The binary tree is sorted by the lexicographic
640 // order of the sequences, and it is also a max-heap with respect to the
641 // starting positions.
642 class HashToBinaryTree {
643 public:
644 HashToBinaryTree() : forest_(NULL) {
645 Reset();
646 }
647
648 ~HashToBinaryTree() {
649 delete[] forest_;
650 }
651
652 void Reset() {
653 need_init_ = true;
654 }
655
656 void Init(int lgwin, size_t position, size_t bytes, bool is_last) {
657 if (need_init_) {
658 window_mask_ = (1u << lgwin) - 1u;
659 invalid_pos_ = static_cast<uint32_t>(-window_mask_);
660 for (uint32_t i = 0; i < kBucketSize; i++) {
661 buckets_[i] = invalid_pos_;
662 }
663 size_t num_nodes = (position == 0 && is_last) ? bytes : window_mask_ + 1;
664 forest_ = new uint32_t[2 * num_nodes];
665 need_init_ = false;
666 }
667 }
668
669 // Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the
670 // length of max_length and stores the position cur_ix in the hash table.
671 //
672 // Sets *num_matches to the number of matches found, and stores the found
673 // matches in matches[0] to matches[*num_matches - 1]. The matches will be
674 // sorted by strictly increasing length and (non-strictly) increasing
675 // distance.
676 size_t FindAllMatches(const uint8_t* data,
677 const size_t ring_buffer_mask,
678 const size_t cur_ix,
679 const size_t max_length,
680 const size_t max_backward,
681 BackwardMatch* matches) {
682 BackwardMatch* const orig_matches = matches;
683 const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
684 size_t best_len = 1;
685 size_t stop = cur_ix - 64;
686 if (cur_ix < 64) { stop = 0; }
687 for (size_t i = cur_ix - 1; i > stop && best_len <= 2; --i) {
688 size_t prev_ix = i;
689 const size_t backward = cur_ix - prev_ix;
690 if (PREDICT_FALSE(backward > max_backward)) {
691 break;
692 }
693 prev_ix &= ring_buffer_mask;
694 if (data[cur_ix_masked] != data[prev_ix] ||
695 data[cur_ix_masked + 1] != data[prev_ix + 1]) {
696 continue;
697 }
698 const size_t len =
699 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
700 max_length);
701 if (len > best_len) {
702 best_len = len;
703 *matches++ = BackwardMatch(backward, len);
704 }
705 }
706 if (best_len < max_length) {
707 matches = StoreAndFindMatches(data, cur_ix, ring_buffer_mask,
708 max_length, &best_len, matches);
709 }
710 uint32_t dict_matches[kMaxDictionaryMatchLen + 1];
711 for (size_t i = 0; i <= kMaxDictionaryMatchLen; ++i) {
712 dict_matches[i] = kInvalidMatch;
713 }
714 size_t minlen = std::max<size_t>(4, best_len + 1);
715 if (FindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen, max_length,
716 &dict_matches[0])) {
717 size_t maxlen = std::min<size_t>(kMaxDictionaryMatchLen, max_length);
718 for (size_t l = minlen; l <= maxlen; ++l) {
719 uint32_t dict_id = dict_matches[l];
720 if (dict_id < kInvalidMatch) {
721 *matches++ = BackwardMatch(max_backward + (dict_id >> 5) + 1, l,
722 dict_id & 31);
723 }
724 }
725 }
726 return static_cast<size_t>(matches - orig_matches);
727 }
728
729 // Stores the hash of the next 4 bytes and re-roots the binary tree at the
730 // current sequence, without returning any matches.
731 // REQUIRES: cur_ix + kMaxTreeCompLength <= end-of-current-block
732 void Store(const uint8_t* data,
733 const size_t ring_buffer_mask,
734 const size_t cur_ix) {
735 size_t best_len = 0;
736 StoreAndFindMatches(data, cur_ix, ring_buffer_mask, kMaxTreeCompLength,
737 &best_len, NULL);
738 }
739
740 void StitchToPreviousBlock(size_t num_bytes,
741 size_t position,
742 const uint8_t* ringbuffer,
743 size_t ringbuffer_mask) {
744 if (num_bytes >= 3 && position >= kMaxTreeCompLength) {
745 // Store the last `kMaxTreeCompLength - 1` positions in the hasher.
746 // These could not be calculated before, since they require knowledge
747 // of both the previous and the current block.
748 const size_t i_start = position - kMaxTreeCompLength + 1;
749 const size_t i_end = std::min(position, i_start + num_bytes);
750 for (size_t i = i_start; i < i_end; ++i) {
751 // We know that i + kMaxTreeCompLength <= position + num_bytes, i.e. the
752 // end of the current block and that we have at least
753 // kMaxTreeCompLength tail in the ringbuffer.
754 Store(ringbuffer, ringbuffer_mask, i);
755 }
756 }
757 }
758
759 static const size_t kMaxNumMatches = 64 + kMaxTreeSearchDepth;
760
761 private:
762 // Stores the hash of the next 4 bytes and in a single tree-traversal, the
763 // hash bucket's binary tree is searched for matches and is re-rooted at the
764 // current position.
765 //
766 // If less than kMaxTreeCompLength data is available, the hash bucket of the
767 // current position is searched for matches, but the state of the hash table
768 // is not changed, since we can not know the final sorting order of the
769 // current (incomplete) sequence.
770 //
771 // This function must be called with increasing cur_ix positions.
772 BackwardMatch* StoreAndFindMatches(const uint8_t* const __restrict data,
773 const size_t cur_ix,
774 const size_t ring_buffer_mask,
775 const size_t max_length,
776 size_t* const __restrict best_len,
777 BackwardMatch* __restrict matches) {
778 const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
779 const size_t max_backward = window_mask_ - 15;
780 const size_t max_comp_len = std::min(max_length, kMaxTreeCompLength);
781 const bool reroot_tree = max_length >= kMaxTreeCompLength;
782 const uint32_t key = HashBytes(&data[cur_ix_masked]);
783 size_t prev_ix = buckets_[key];
784 // The forest index of the rightmost node of the left subtree of the new
785 // root, updated as we traverse and reroot the tree of the hash bucket.
786 size_t node_left = LeftChildIndex(cur_ix);
787 // The forest index of the leftmost node of the right subtree of the new
788 // root, updated as we traverse and reroot the tree of the hash bucket.
789 size_t node_right = RightChildIndex(cur_ix);
790 // The match length of the rightmost node of the left subtree of the new
791 // root, updated as we traverse and reroot the tree of the hash bucket.
792 size_t best_len_left = 0;
793 // The match length of the leftmost node of the right subtree of the new
794 // root, updated as we traverse and reroot the tree of the hash bucket.
795 size_t best_len_right = 0;
796 if (reroot_tree) {
797 buckets_[key] = static_cast<uint32_t>(cur_ix);
798 }
799 for (size_t depth_remaining = kMaxTreeSearchDepth; ; --depth_remaining) {
800 const size_t backward = cur_ix - prev_ix;
801 const size_t prev_ix_masked = prev_ix & ring_buffer_mask;
802 if (backward == 0 || backward > max_backward || depth_remaining == 0) {
803 if (reroot_tree) {
804 forest_[node_left] = invalid_pos_;
805 forest_[node_right] = invalid_pos_;
806 }
807 break;
808 }
809 const size_t cur_len = std::min(best_len_left, best_len_right);
810 const size_t len = cur_len +
811 FindMatchLengthWithLimit(&data[cur_ix_masked + cur_len],
812 &data[prev_ix_masked + cur_len],
813 max_length - cur_len);
814 if (len > *best_len) {
815 *best_len = len;
816 if (matches) {
817 *matches++ = BackwardMatch(backward, len);
818 }
819 if (len >= max_comp_len) {
820 if (reroot_tree) {
821 forest_[node_left] = forest_[LeftChildIndex(prev_ix)];
822 forest_[node_right] = forest_[RightChildIndex(prev_ix)];
823 }
824 break;
825 }
826 }
827 if (data[cur_ix_masked + len] > data[prev_ix_masked + len]) {
828 best_len_left = len;
829 if (reroot_tree) {
830 forest_[node_left] = static_cast<uint32_t>(prev_ix);
831 }
832 node_left = RightChildIndex(prev_ix);
833 prev_ix = forest_[node_left];
834 } else {
835 best_len_right = len;
836 if (reroot_tree) {
837 forest_[node_right] = static_cast<uint32_t>(prev_ix);
838 }
839 node_right = LeftChildIndex(prev_ix);
840 prev_ix = forest_[node_right];
841 }
842 }
843 return matches;
844 }
845
846 inline size_t LeftChildIndex(const size_t pos) {
847 return 2 * (pos & window_mask_);
848 }
849
850 inline size_t RightChildIndex(const size_t pos) {
851 return 2 * (pos & window_mask_) + 1;
852 }
853
854 static uint32_t HashBytes(const uint8_t *data) {
855 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
856 // The higher bits contain more mixture from the multiplication,
857 // so we take our results from there.
858 return h >> (32 - kBucketBits);
859 }
860
861 static const int kBucketBits = 17;
862 static const size_t kBucketSize = 1 << kBucketBits;
863
864 // The window size minus 1
865 size_t window_mask_;
866
867 // Hash table that maps the 4-byte hashes of the sequence to the last
868 // position where this hash was found, which is the root of the binary
869 // tree of sequences that share this hash bucket.
870 uint32_t buckets_[kBucketSize];
871
872 // The union of the binary trees of each hash bucket. The root of the tree
873 // corresponding to a hash is a sequence starting at buckets_[hash] and
874 // the left and right children of a sequence starting at pos are
875 // forest_[2 * pos] and forest_[2 * pos + 1].
876 uint32_t* forest_;
877
878 // A position used to mark a non-existent sequence, i.e. a tree is empty if
879 // its root is at invalid_pos_ and a node is a leaf if both its children
880 // are at invalid_pos_.
881 uint32_t invalid_pos_;
882
883 bool need_init_;
884 };
885
// Owns one hasher instance per quality-dependent hasher type (2..10).
// Only the slot selected by Init(type) is ever allocated; all other
// pointers stay NULL. The destructor frees whichever were allocated
// (delete on NULL is a no-op).
struct Hashers {
  // For kBucketSweep == 1, enabling the dictionary lookup makes compression
  // a little faster (0.5% - 1%) and it compresses 0.15% better on small text
  // and html inputs.
  typedef HashLongestMatchQuickly<16, 1, true> H2;
  typedef HashLongestMatchQuickly<16, 2, false> H3;
  typedef HashLongestMatchQuickly<17, 4, true> H4;
  typedef HashLongestMatch<14, 4, 4> H5;
  typedef HashLongestMatch<14, 5, 4> H6;
  typedef HashLongestMatch<15, 6, 10> H7;
  typedef HashLongestMatch<15, 7, 10> H8;
  typedef HashLongestMatch<15, 8, 16> H9;
  typedef HashToBinaryTree H10;

  Hashers(void) : hash_h2(0), hash_h3(0), hash_h4(0), hash_h5(0),
                  hash_h6(0), hash_h7(0), hash_h8(0), hash_h9(0), hash_h10(0) {}

  ~Hashers(void) {
    delete hash_h2;
    delete hash_h3;
    delete hash_h4;
    delete hash_h5;
    delete hash_h6;
    delete hash_h7;
    delete hash_h8;
    delete hash_h9;
    delete hash_h10;
  }

  // Allocates the hasher selected by `type` (2..10); other types are
  // ignored. NOTE(review): calling Init twice with the same type would
  // leak the first instance — presumably callers Init exactly once.
  void Init(int type) {
    switch (type) {
      case 2: hash_h2 = new H2; break;
      case 3: hash_h3 = new H3; break;
      case 4: hash_h4 = new H4; break;
      case 5: hash_h5 = new H5; break;
      case 6: hash_h6 = new H6; break;
      case 7: hash_h7 = new H7; break;
      case 8: hash_h8 = new H8; break;
      case 9: hash_h9 = new H9; break;
      case 10: hash_h10 = new H10; break;
      default: break;
    }
  }

  // Feeds every position of `dict` (that has kHashTypeLength bytes of
  // lookahead) into `hasher`, so later lookups can match into it.
  template<typename Hasher>
  void WarmupHash(const size_t size, const uint8_t* dict, Hasher* hasher) {
    hasher->Init();
    for (size_t i = 0; i + Hasher::kHashTypeLength - 1 < size; i++) {
      hasher->Store(&dict[i], static_cast<uint32_t>(i));
    }
  }

  // Custom LZ77 window.
  // Warms up the hasher selected by `type` with the custom dictionary;
  // `lgwin` is only used by the binary-tree hasher (type 10).
  void PrependCustomDictionary(
      int type, int lgwin, const size_t size, const uint8_t* dict) {
    switch (type) {
      case 2: WarmupHash(size, dict, hash_h2); break;
      case 3: WarmupHash(size, dict, hash_h3); break;
      case 4: WarmupHash(size, dict, hash_h4); break;
      case 5: WarmupHash(size, dict, hash_h5); break;
      case 6: WarmupHash(size, dict, hash_h6); break;
      case 7: WarmupHash(size, dict, hash_h7); break;
      case 8: WarmupHash(size, dict, hash_h8); break;
      case 9: WarmupHash(size, dict, hash_h9); break;
      case 10:
        // H10 needs its forest sized for the window; the size_t max mask
        // makes the ring buffer effectively non-wrapping for the dict.
        hash_h10->Init(lgwin, 0, size, false);
        for (size_t i = 0; i + kMaxTreeCompLength - 1 < size; ++i) {
          hash_h10->Store(dict, std::numeric_limits<size_t>::max(), i);
        }
        break;
      default: break;
    }
  }


  // Owning pointers; at most one is non-NULL after Init(type).
  H2* hash_h2;
  H3* hash_h3;
  H4* hash_h4;
  H5* hash_h5;
  H6* hash_h6;
  H7* hash_h7;
  H8* hash_h8;
  H9* hash_h9;
  H10* hash_h10;
};
971
972 } // namespace brotli
973
974 #endif // BROTLI_ENC_HASH_H_
OLDNEW
« no previous file with comments | « third_party/brotli/enc/find_match_length.h ('k') | third_party/brotli/enc/histogram.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698