Chromium Code Reviews

Side by Side Diff: third_party/brotli/enc/hash.h

Issue 2537133002: Update brotli to v1.0.0-snapshot. (Closed)
Patch Set: Fixed typo | Created 4 years ago
1 /* Copyright 2010 Google Inc. All Rights Reserved. 1 /* Copyright 2010 Google Inc. All Rights Reserved.
2 2
3 Distributed under MIT license. 3 Distributed under MIT license.
4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */ 5 */
6 6
7 // A (forgetful) hash table to the data seen by the compressor, to 7 /* A (forgetful) hash table to the data seen by the compressor, to
8 // help create backward references to previous data. 8 help create backward references to previous data. */
9 9
10 #ifndef BROTLI_ENC_HASH_H_ 10 #ifndef BROTLI_ENC_HASH_H_
11 #define BROTLI_ENC_HASH_H_ 11 #define BROTLI_ENC_HASH_H_
12 12
13 #include <sys/types.h> 13 #include <string.h> /* memcmp, memset */
14 #include <algorithm>
15 #include <cstring>
16 #include <limits>
17 14
15 #include "../common/constants.h"
16 #include "../common/dictionary.h"
17 #include <brotli/types.h>
18 #include "./dictionary_hash.h" 18 #include "./dictionary_hash.h"
19 #include "./fast_log.h" 19 #include "./fast_log.h"
20 #include "./find_match_length.h" 20 #include "./find_match_length.h"
21 #include "./memory.h"
21 #include "./port.h" 22 #include "./port.h"
22 #include "./prefix.h" 23 #include "./quality.h"
23 #include "./static_dict.h" 24 #include "./static_dict.h"
24 #include "./transform.h"
25 #include "./types.h"
26 25
27 namespace brotli { 26 #if defined(__cplusplus) || defined(c_plusplus)
27 extern "C" {
28 #endif
28 29
29 static const size_t kMaxTreeSearchDepth = 64; 30 #define MAX_TREE_SEARCH_DEPTH 64
30 static const size_t kMaxTreeCompLength = 128; 31 #define MAX_TREE_COMP_LENGTH 128
32 #define score_t size_t
31 33
32 static const uint32_t kDistanceCacheIndex[] = { 34 static const uint32_t kDistanceCacheIndex[] = {
33 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 35 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
34 }; 36 };
35 static const int kDistanceCacheOffset[] = { 37 static const int kDistanceCacheOffset[] = {
36 0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3 38 0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
37 }; 39 };
38 40
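[Note: the two tables above drive the "try the last distances first" probe used by the HashLongestMatch hashers: candidate i reuses distance_cache[kDistanceCacheIndex[i]] shifted by kDistanceCacheOffset[i]. A minimal sketch of how the 16 candidates are enumerated; it is not part of the patch and the cache values are made up:]

```c
/* Sketch: enumerating the 16 short-code distance candidates from the last
   four distances. The distance_cache contents are hypothetical. */
#include <stdio.h>

static const unsigned kIndex[16] = {
  0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1
};
static const int kOffset[16] = {
  0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
};

int main(void) {
  int distance_cache[4] = { 11, 47, 190, 2001 };  /* last four distances */
  for (int i = 0; i < 16; ++i) {
    int backward = distance_cache[kIndex[i]] + kOffset[i];
    printf("short code %2d -> backward %d\n", i, backward);
  }
  return 0;
}
```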
39 static const uint32_t kCutoffTransformsCount = 10; 41 static const uint32_t kCutoffTransformsCount = 10;
40 static const uint8_t kCutoffTransforms[] = { 42 static const uint8_t kCutoffTransforms[] = {
41 0, 12, 27, 23, 42, 63, 56, 48, 59, 64 43 0, 12, 27, 23, 42, 63, 56, 48, 59, 64
42 }; 44 };
43 45
44 // kHashMul32 multiplier has these properties: 46 typedef struct HasherSearchResult {
45 // * The multiplier must be odd. Otherwise we may lose the highest bit. 47 size_t len;
46 // * No long streaks of 1s or 0s. 48 size_t len_x_code; /* == len ^ len_code */
47 // * There is no effort to ensure that it is a prime, the oddity is enough 49 size_t distance;
48 // for this use. 50 score_t score;
49 // * The number has been tuned heuristically against compression benchmarks. 51 } HasherSearchResult;
52
53 typedef struct DictionarySearchStatictics {
54 size_t num_lookups;
55 size_t num_matches;
56 } DictionarySearchStatictics;
57
58 /* kHashMul32 multiplier has these properties:
59 * The multiplier must be odd. Otherwise we may lose the highest bit.
60 * No long streaks of ones or zeros.
61 * There is no effort to ensure that it is a prime, the oddity is enough
62 for this use.
63 * The number has been tuned heuristically against compression benchmarks. */
50 static const uint32_t kHashMul32 = 0x1e35a7bd; 64 static const uint32_t kHashMul32 = 0x1e35a7bd;
51 65
52 template<int kShiftBits> 66 static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) {
53 inline uint32_t Hash(const uint8_t *data) {
54 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32; 67 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
55 // The higher bits contain more mixture from the multiplication, 68 /* The higher bits contain more mixture from the multiplication,
56 // so we take our results from there. 69 so we take our results from there. */
57 return h >> (32 - kShiftBits); 70 return h >> (32 - 14);
58 } 71 }
59 72
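[Note: the old templated Hash&lt;14&gt; and the new Hash14 compute the same multiplicative hash. A standalone sketch of the computation, not from the patch, with memcpy standing in for BROTLI_UNALIGNED_LOAD32 so it stays portable:]

```c
/* Sketch of the 14-bit multiplicative hash above. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint32_t Hash14Sketch(const uint8_t* data) {
  uint32_t word;
  memcpy(&word, data, sizeof(word));  /* unaligned 4-byte load */
  uint32_t h = word * 0x1e35a7bdU;    /* kHashMul32: odd, benchmark-tuned */
  return h >> (32 - 14);              /* keep the well-mixed high bits */
}

int main(void) {
  const uint8_t text[] = "the quick brown fox";
  printf("bucket for \"the \": %u\n", Hash14Sketch(text));
  return 0;
}
```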
60 // Usually, we always choose the longest backward reference. This function 73 #define BROTLI_LITERAL_BYTE_SCORE 540
61 // allows for the exception of that rule. 74 #define BROTLI_DISTANCE_BIT_PENALTY 120
62 // 75 /* Score must be positive after applying maximal penalty. */
63 // If we choose a backward reference that is further away, it will 76 #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))
64 // usually be coded with more bits. We approximate this by assuming 77
65 // log2(distance). If the distance can be expressed in terms of the 78 /* Usually, we always choose the longest backward reference. This function
66 // last four distances, we use some heuristic constants to estimate 79 allows for the exception of that rule.
67 // the bits cost. For the first up to four literals we use the bit 80
68 // cost of the literals from the literal cost model, after that we 81 If we choose a backward reference that is further away, it will
69 // use the average bit cost of the cost model. 82 usually be coded with more bits. We approximate this by assuming
70 // 83 log2(distance). If the distance can be expressed in terms of the
71 // This function is used to sometimes discard a longer backward reference 84 last four distances, we use some heuristic constants to estimate
72 // when it is not much longer and the bit cost for encoding it is more 85 the bits cost. For the first up to four literals we use the bit
73 // than the saved literals. 86 cost of the literals from the literal cost model, after that we
74 // 87 use the average bit cost of the cost model.
75 // backward_reference_offset MUST be positive. 88
76 inline double BackwardReferenceScore(size_t copy_length, 89 This function is used to sometimes discard a longer backward reference
77 size_t backward_reference_offset) { 90 when it is not much longer and the bit cost for encoding it is more
78 return 5.4 * static_cast<double>(copy_length) - 91 than the saved literals.
79 1.20 * Log2FloorNonZero(backward_reference_offset); 92
80 } 93 backward_reference_offset MUST be positive. */
81 94 static BROTLI_INLINE score_t BackwardReferenceScore(
82 inline double BackwardReferenceScoreUsingLastDistance(size_t copy_length, 95 size_t copy_length, size_t backward_reference_offset) {
83 size_t distance_short_code) { 96 return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length -
84 static const double kDistanceShortCodeBitCost[16] = { 97 BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset);
85 -0.6, 0.95, 1.17, 1.27, 98 }
86 0.93, 0.93, 0.96, 0.96, 0.99, 0.99, 99
87 1.05, 1.05, 1.15, 1.15, 1.25, 1.25 100 static const score_t kDistanceShortCodeCost[BROTLI_NUM_DISTANCE_SHORT_CODES] = {
88 }; 101 /* Repeat last */
89 return 5.4 * static_cast<double>(copy_length) - 102 BROTLI_SCORE_BASE + 60,
90 kDistanceShortCodeBitCost[distance_short_code]; 103 /* 2nd, 3rd, 4th last */
91 } 104 BROTLI_SCORE_BASE - 95,
92 105 BROTLI_SCORE_BASE - 117,
93 struct BackwardMatch { 106 BROTLI_SCORE_BASE - 127,
94 BackwardMatch(void) : distance(0), length_and_code(0) {} 107 /* Last with offset */
95 108 BROTLI_SCORE_BASE - 93,
96 BackwardMatch(size_t dist, size_t len) 109 BROTLI_SCORE_BASE - 93,
97 : distance(static_cast<uint32_t>(dist)) 110 BROTLI_SCORE_BASE - 96,
98 , length_and_code(static_cast<uint32_t>(len << 5)) {} 111 BROTLI_SCORE_BASE - 96,
99 112 BROTLI_SCORE_BASE - 99,
100 BackwardMatch(size_t dist, size_t len, size_t len_code) 113 BROTLI_SCORE_BASE - 99,
101 : distance(static_cast<uint32_t>(dist)) 114 /* 2nd last with offset */
102 , length_and_code(static_cast<uint32_t>( 115 BROTLI_SCORE_BASE - 105,
103 (len << 5) | (len == len_code ? 0 : len_code))) {} 116 BROTLI_SCORE_BASE - 105,
104 117 BROTLI_SCORE_BASE - 115,
105 size_t length(void) const { 118 BROTLI_SCORE_BASE - 115,
106 return length_and_code >> 5; 119 BROTLI_SCORE_BASE - 125,
107 } 120 BROTLI_SCORE_BASE - 125
108 size_t length_code(void) const { 121 };
109 size_t code = length_and_code & 31; 122
110 return code ? code : length(); 123 static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance(
111 } 124 size_t copy_length, size_t distance_short_code) {
112 125 return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length +
126 kDistanceShortCodeCost[distance_short_code];
127 }
128
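[Note: the C rewrite replaces the old floating-point score (5.4 * len - 1.20 * log2(dist)) with scaled integer arithmetic: 540 per copied byte against 120 per distance bit, offset by BROTLI_SCORE_BASE so the result stays positive. A worked sketch, not from the patch, with Log2FloorNonZero re-implemented as a plain loop, showing a nearby 9-byte match outscoring a 10-byte match from a megabyte away:]

```c
/* Sketch: integer backward-reference scoring with illustrative inputs. */
#include <stddef.h>
#include <stdio.h>

#define LITERAL_BYTE_SCORE 540
#define DISTANCE_BIT_PENALTY 120
#define SCORE_BASE (DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))

static size_t Log2Floor(size_t n) {  /* simplified stand-in */
  size_t r = 0;
  while (n >>= 1) ++r;
  return r;
}

static size_t Score(size_t copy_length, size_t backward) {
  return SCORE_BASE + LITERAL_BYTE_SCORE * copy_length -
         DISTANCE_BIT_PENALTY * Log2Floor(backward);
}

int main(void) {
  /* A 10-byte copy from 1 MiB away vs. a 9-byte copy from 64 bytes away. */
  printf("far  len=10: %zu\n", Score(10, (size_t)1 << 20)); /* -120 * 20 */
  printf("near len=9 : %zu\n", Score(9, 64));               /* -120 * 6  */
  return 0;
}
```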
129 static BROTLI_INLINE void DictionarySearchStaticticsReset(
130 DictionarySearchStatictics* self) {
131 self->num_lookups = 0;
132 self->num_matches = 0;
133 }
134
135 static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
136 size_t item, const uint8_t* data, size_t max_length, size_t max_backward,
137 HasherSearchResult* out) {
138 size_t len;
139 size_t dist;
140 size_t offset;
141 size_t matchlen;
142 size_t backward;
143 score_t score;
144 len = item & 31;
145 dist = item >> 5;
146 offset = kBrotliDictionaryOffsetsByLength[len] + len * dist;
147 if (len > max_length) {
148 return BROTLI_FALSE;
149 }
150
151 matchlen = FindMatchLengthWithLimit(data, &kBrotliDictionary[offset], len);
152 if (matchlen + kCutoffTransformsCount <= len || matchlen == 0) {
153 return BROTLI_FALSE;
154 }
155 {
156 size_t transform_id = kCutoffTransforms[len - matchlen];
157 backward = max_backward + dist + 1 +
158 (transform_id << kBrotliDictionarySizeBitsByLength[len]);
159 }
160 score = BackwardReferenceScore(matchlen, backward);
161 if (score < out->score) {
162 return BROTLI_FALSE;
163 }
164 out->len = matchlen;
165 out->len_x_code = len ^ matchlen;
166 out->distance = backward;
167 out->score = score;
168 return BROTLI_TRUE;
169 }
170
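[Note: TestStaticDictionaryItem unpacks a 5-bit length and a word index from the hash item, then encodes a hit as a backward distance just past the regular window, with the cutoff-transform id folded into the high bits. An illustrative sketch; all numbers are hypothetical, not real dictionary data:]

```c
/* Sketch: decoding a static-dictionary hash item into a distance. */
#include <stddef.h>
#include <stdio.h>

int main(void) {
  size_t item = (417u << 5) | 6u;  /* hypothetical: word #417, length 6 */
  size_t len  = item & 31;         /* -> 6 */
  size_t dist = item >> 5;         /* -> 417 */
  size_t size_bits = 10;           /* stand-in for kBrotliDictionarySizeBitsByLength[6] */
  size_t max_backward = ((size_t)1 << 22) - 16;  /* window size minus gap */
  size_t transform_id = 27;        /* kCutoffTransforms[6 - 4]: matched 4 of 6 bytes */
  size_t backward = max_backward + dist + 1 + (transform_id << size_bits);
  printf("dictionary reference encoded as distance %zu\n", backward);
  return 0;
}
```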
171 static BROTLI_INLINE BROTLI_BOOL SearchInStaticDictionary(
172 DictionarySearchStatictics* self, const uint8_t* data, size_t max_length,
173 size_t max_backward, HasherSearchResult* out, BROTLI_BOOL shallow) {
174 size_t key;
175 size_t i;
176 BROTLI_BOOL is_match_found = BROTLI_FALSE;
177 if (self->num_matches < (self->num_lookups >> 7)) {
178 return BROTLI_FALSE;
179 }
180 key = Hash14(data) << 1;
181 for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
182 size_t item = kStaticDictionaryHash[key];
183 self->num_lookups++;
184 if (item != 0 &&
185 TestStaticDictionaryItem(item, data, max_length, max_backward, out)) {
186 self->num_matches++;
187 is_match_found = BROTLI_TRUE;
188 }
189 }
190 return is_match_found;
191 }
192
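[Note: the first test in SearchInStaticDictionary is a self-throttling heuristic: once fewer than one lookup in 128 has produced a match, further dictionary probing is skipped. A tiny sketch of that gate:]

```c
/* Sketch: the hit-rate gate on static-dictionary lookups. */
#include <stddef.h>
#include <stdio.h>

static int should_probe(size_t num_matches, size_t num_lookups) {
  return num_matches >= (num_lookups >> 7);  /* at least 1 hit per 128 */
}

int main(void) {
  printf("%d\n", should_probe(10, 1000));  /* 10 >= 7  -> keep probing */
  printf("%d\n", should_probe(10, 2000));  /* 10 >= 15 -> stop probing */
  return 0;
}
```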
193 typedef struct BackwardMatch {
113 uint32_t distance; 194 uint32_t distance;
114 uint32_t length_and_code; 195 uint32_t length_and_code;
115 }; 196 } BackwardMatch;
116 197
117 // A (forgetful) hash table to the data seen by the compressor, to 198 static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self,
118 // help create backward references to previous data. 199 size_t dist, size_t len) {
119 // 200 self->distance = (uint32_t)dist;
120 // This is a hash map of fixed size (kBucketSize). Starting from the 201 self->length_and_code = (uint32_t)(len << 5);
121 // given index, kBucketSweep buckets are used to store values of a key. 202 }
122 template <int kBucketBits, int kBucketSweep, bool kUseDictionary> 203
123 class HashLongestMatchQuickly { 204 static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self,
124 public: 205 size_t dist, size_t len, size_t len_code) {
125 HashLongestMatchQuickly(void) { 206 self->distance = (uint32_t)dist;
126 Reset(); 207 self->length_and_code =
127 } 208 (uint32_t)((len << 5) | (len == len_code ? 0 : len_code));
128 void Reset(void) { 209 }
129 need_init_ = true; 210
130 num_dict_lookups_ = 0; 211 static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) {
131 num_dict_matches_ = 0; 212 return self->length_and_code >> 5;
132 } 213 }
133 void Init(void) { 214
134 if (need_init_) { 215 static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) {
135 // It is not strictly necessary to fill this buffer here, but 216 size_t code = self->length_and_code & 31;
136 // not filling will make the results of the compression stochastic 217 return code ? code : BackwardMatchLength(self);
137 // (but correct). This is because random data would cause the 218 }
138 // system to find accidentally good backward references here and there. 219
139 memset(&buckets_[0], 0, sizeof(buckets_)); 220 #define EXPAND_CAT(a, b) CAT(a, b)
140 need_init_ = false; 221 #define CAT(a, b) a ## b
141 } 222 #define FN(X) EXPAND_CAT(X, HASHER())
142 } 223
143 void InitForData(const uint8_t* data, size_t num) { 224 #define MAX_NUM_MATCHES_H10 (64 + MAX_TREE_SEARCH_DEPTH)
144 for (size_t i = 0; i < num; ++i) { 225
145 const uint32_t key = HashBytes(&data[i]); 226 #define HASHER() H10
146 memset(&buckets_[key], 0, kBucketSweep * sizeof(buckets_[0])); 227 #define HashToBinaryTree HASHER()
147 need_init_ = false; 228
148 } 229 #define BUCKET_BITS 17
149 } 230 #define BUCKET_SIZE (1 << BUCKET_BITS)
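[Note: the EXPAND_CAT/CAT/FN macros above are the C replacement for the old C++ templates: a function body written once with FN(Name) is compiled several times under different HASHER() definitions, each expansion pasting the hasher id onto the name. A self-contained sketch with dummy bodies, not the patch's code:]

```c
/* Sketch: FN(Store) expands to StoreH10 or StoreH2 via token pasting. */
#include <stdio.h>

#define EXPAND_CAT(a, b) CAT(a, b)  /* extra level forces b to expand */
#define CAT(a, b) a ## b
#define FN(X) EXPAND_CAT(X, HASHER())

#define HASHER() H10
static void FN(Store)(int pos) { printf("StoreH10(%d)\n", pos); }
#undef HASHER

#define HASHER() H2
static void FN(Store)(int pos) { printf("StoreH2(%d)\n", pos); }
#undef HASHER

int main(void) {
  StoreH10(1);  /* the pasted names are ordinary identifiers */
  StoreH2(2);
  return 0;
}
```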
150 // Look at 4 bytes at data. 231
151 // Compute a hash from these, and store the value somewhere within 232 static size_t FN(HashTypeLength)(void) { return 4; }
152 // [ix .. ix+3]. 233 static size_t FN(StoreLookahead)(void) { return MAX_TREE_COMP_LENGTH; }
153 inline void Store(const uint8_t *data, const uint32_t ix) { 234
154 const uint32_t key = HashBytes(data); 235 static uint32_t FN(HashBytes)(const uint8_t *data) {
155 // Wiggle the value with the bucket sweep range. 236 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
156 const uint32_t off = (ix >> 3) % kBucketSweep; 237 /* The higher bits contain more mixture from the multiplication,
157 buckets_[key + off] = ix; 238 so we take our results from there. */
158 } 239 return h >> (32 - BUCKET_BITS);
159 240 }
160 // Find a longest backward match of &ring_buffer[cur_ix & ring_buffer_mask] 241
161 // up to the length of max_length and stores the position cur_ix in the 242 /* A (forgetful) hash table where each hash bucket contains a binary tree of
162 // hash table. 243 sequences whose first 4 bytes share the same hash code.
163 // 244 Each sequence is MAX_TREE_COMP_LENGTH long and is identified by its starting
164 // Does not look for matches longer than max_length. 245 position in the input data. The binary tree is sorted by the lexicographic
165 // Does not look for matches further away than max_backward. 246 order of the sequences, and it is also a max-heap with respect to the
166 // Writes the best found match length into best_len_out. 247 starting positions. */
167 // Writes the index (&data[index]) of the start of the best match into 248 typedef struct HashToBinaryTree {
168 // best_distance_out. 249 /* The window size minus 1 */
169 inline bool FindLongestMatch(const uint8_t * __restrict ring_buffer, 250 size_t window_mask_;
170 const size_t ring_buffer_mask, 251
171 const int* __restrict distance_cache, 252 /* Hash table that maps the 4-byte hashes of the sequence to the last
172 const size_t cur_ix, 253 position where this hash was found, which is the root of the binary
173 const size_t max_length, 254 tree of sequences that share this hash bucket. */
174 const size_t max_backward, 255 uint32_t buckets_[BUCKET_SIZE];
175 size_t * __restrict best_len_out, 256
176 size_t * __restrict best_len_code_out, 257 /* The union of the binary trees of each hash bucket. The root of the tree
177 size_t * __restrict best_distance_out, 258 corresponding to a hash is a sequence starting at buckets_[hash] and
178 double* __restrict best_score_out) { 259 the left and right children of a sequence starting at pos are
179 const size_t best_len_in = *best_len_out; 260 forest_[2 * pos] and forest_[2 * pos + 1]. */
180 const size_t cur_ix_masked = cur_ix & ring_buffer_mask; 261 uint32_t* forest_;
181 const uint32_t key = HashBytes(&ring_buffer[cur_ix_masked]); 262
182 int compare_char = ring_buffer[cur_ix_masked + best_len_in]; 263 /* A position used to mark a non-existent sequence, i.e. a tree is empty if
183 double best_score = *best_score_out; 264 its root is at invalid_pos_ and a node is a leaf if both its children
184 size_t best_len = best_len_in; 265 are at invalid_pos_. */
185 size_t cached_backward = static_cast<size_t>(distance_cache[0]); 266 uint32_t invalid_pos_;
186 size_t prev_ix = cur_ix - cached_backward; 267
187 bool match_found = false; 268 size_t forest_size_;
188 if (prev_ix < cur_ix) { 269 BROTLI_BOOL is_dirty_;
189 prev_ix &= static_cast<uint32_t>(ring_buffer_mask); 270 } HashToBinaryTree;
190 if (compare_char == ring_buffer[prev_ix + best_len]) { 271
191 size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix], 272 static void FN(Reset)(HashToBinaryTree* self) {
192 &ring_buffer[cur_ix_masked], 273 self->is_dirty_ = BROTLI_TRUE;
193 max_length); 274 }
194 if (len >= 4) { 275
195 best_score = BackwardReferenceScoreUsingLastDistance(len, 0); 276 static void FN(Initialize)(HashToBinaryTree* self) {
196 best_len = len; 277 self->forest_ = NULL;
197 *best_len_out = len; 278 self->forest_size_ = 0;
198 *best_len_code_out = len; 279 FN(Reset)(self);
199 *best_distance_out = cached_backward; 280 }
200 *best_score_out = best_score; 281
201 compare_char = ring_buffer[cur_ix_masked + best_len]; 282 static void FN(Cleanup)(MemoryManager* m, HashToBinaryTree* self) {
202 if (kBucketSweep == 1) { 283 BROTLI_FREE(m, self->forest_);
203 buckets_[key] = static_cast<uint32_t>(cur_ix); 284 }
204 return true; 285
205 } else { 286 static void FN(Init)(
206 match_found = true; 287 MemoryManager* m, HashToBinaryTree* self, const uint8_t* data,
207 } 288 const BrotliEncoderParams* params, size_t position, size_t bytes,
289 BROTLI_BOOL is_last) {
290 if (self->is_dirty_) {
291 uint32_t invalid_pos;
292 size_t num_nodes;
293 uint32_t i;
294 BROTLI_UNUSED(data);
295 self->window_mask_ = (1u << params->lgwin) - 1u;
296 invalid_pos = (uint32_t)(0 - self->window_mask_);
297 self->invalid_pos_ = invalid_pos;
298 for (i = 0; i < BUCKET_SIZE; i++) {
299 self->buckets_[i] = invalid_pos;
300 }
301 num_nodes = (position == 0 && is_last) ? bytes : self->window_mask_ + 1;
302 if (num_nodes > self->forest_size_) {
303 BROTLI_FREE(m, self->forest_);
304 self->forest_ = BROTLI_ALLOC(m, uint32_t, 2 * num_nodes);
305 if (BROTLI_IS_OOM(m)) return;
306 self->forest_size_ = num_nodes;
307 }
308 self->is_dirty_ = BROTLI_FALSE;
309 }
310 }
311
312 static BROTLI_INLINE size_t FN(LeftChildIndex)(HashToBinaryTree* self,
313 const size_t pos) {
314 return 2 * (pos & self->window_mask_);
315 }
316
317 static BROTLI_INLINE size_t FN(RightChildIndex)(HashToBinaryTree* self,
318 const size_t pos) {
319 return 2 * (pos & self->window_mask_) + 1;
320 }
321
322 /* Stores the hash of the next 4 bytes and in a single tree-traversal, the
323 hash bucket's binary tree is searched for matches and is re-rooted at the
324 current position.
325
326 If less than MAX_TREE_COMP_LENGTH data is available, the hash bucket of the
327 current position is searched for matches, but the state of the hash table
328 is not changed, since we can not know the final sorting order of the
329 current (incomplete) sequence.
330
331 This function must be called with increasing cur_ix positions. */
332 static BROTLI_INLINE BackwardMatch* FN(StoreAndFindMatches)(
333 HashToBinaryTree* self, const uint8_t* const BROTLI_RESTRICT data,
334 const size_t cur_ix, const size_t ring_buffer_mask, const size_t max_length,
335 const size_t max_backward, size_t* const BROTLI_RESTRICT best_len,
336 BackwardMatch* BROTLI_RESTRICT matches) {
337 const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
338 const size_t max_comp_len =
339 BROTLI_MIN(size_t, max_length, MAX_TREE_COMP_LENGTH);
340 const BROTLI_BOOL should_reroot_tree =
341 TO_BROTLI_BOOL(max_length >= MAX_TREE_COMP_LENGTH);
342 const uint32_t key = FN(HashBytes)(&data[cur_ix_masked]);
343 size_t prev_ix = self->buckets_[key];
344 /* The forest index of the rightmost node of the left subtree of the new
345 root, updated as we traverse and re-root the tree of the hash bucket. */
346 size_t node_left = FN(LeftChildIndex)(self, cur_ix);
347 /* The forest index of the leftmost node of the right subtree of the new
348 root, updated as we traverse and re-root the tree of the hash bucket. */
349 size_t node_right = FN(RightChildIndex)(self, cur_ix);
350 /* The match length of the rightmost node of the left subtree of the new
351 root, updated as we traverse and re-root the tree of the hash bucket. */
352 size_t best_len_left = 0;
353 /* The match length of the leftmost node of the right subtree of the new
354 root, updated as we traverse and re-root the tree of the hash bucket. */
355 size_t best_len_right = 0;
356 size_t depth_remaining;
357 if (should_reroot_tree) {
358 self->buckets_[key] = (uint32_t)cur_ix;
359 }
360 for (depth_remaining = MAX_TREE_SEARCH_DEPTH; ; --depth_remaining) {
361 const size_t backward = cur_ix - prev_ix;
362 const size_t prev_ix_masked = prev_ix & ring_buffer_mask;
363 if (backward == 0 || backward > max_backward || depth_remaining == 0) {
364 if (should_reroot_tree) {
365 self->forest_[node_left] = self->invalid_pos_;
366 self->forest_[node_right] = self->invalid_pos_;
367 }
368 break;
369 }
370 {
371 const size_t cur_len = BROTLI_MIN(size_t, best_len_left, best_len_right);
372 size_t len;
373 assert(cur_len <= MAX_TREE_COMP_LENGTH);
374 len = cur_len +
375 FindMatchLengthWithLimit(&data[cur_ix_masked + cur_len],
376 &data[prev_ix_masked + cur_len],
377 max_length - cur_len);
378 assert(0 == memcmp(&data[cur_ix_masked], &data[prev_ix_masked], len));
379 if (matches && len > *best_len) {
380 *best_len = len;
381 InitBackwardMatch(matches++, backward, len);
382 }
383 if (len >= max_comp_len) {
384 if (should_reroot_tree) {
385 self->forest_[node_left] =
386 self->forest_[FN(LeftChildIndex)(self, prev_ix)];
387 self->forest_[node_right] =
388 self->forest_[FN(RightChildIndex)(self, prev_ix)];
208 } 389 }
209 }
210 }
211 if (kBucketSweep == 1) {
212 // Only one to look for, don't bother to prepare for a loop.
213 prev_ix = buckets_[key];
214 buckets_[key] = static_cast<uint32_t>(cur_ix);
215 size_t backward = cur_ix - prev_ix;
216 prev_ix &= static_cast<uint32_t>(ring_buffer_mask);
217 if (compare_char != ring_buffer[prev_ix + best_len_in]) {
218 return false;
219 }
220 if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
221 return false;
222 }
223 const size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
224 &ring_buffer[cur_ix_masked],
225 max_length);
226 if (len >= 4) {
227 *best_len_out = len;
228 *best_len_code_out = len;
229 *best_distance_out = backward;
230 *best_score_out = BackwardReferenceScore(len, backward);
231 return true;
232 }
233 } else {
234 uint32_t *bucket = buckets_ + key;
235 prev_ix = *bucket++;
236 for (int i = 0; i < kBucketSweep; ++i, prev_ix = *bucket++) {
237 const size_t backward = cur_ix - prev_ix;
238 prev_ix &= static_cast<uint32_t>(ring_buffer_mask);
239 if (compare_char != ring_buffer[prev_ix + best_len]) {
240 continue;
241 }
242 if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
243 continue;
244 }
245 const size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
246 &ring_buffer[cur_ix_masked],
247 max_length);
248 if (len >= 4) {
249 const double score = BackwardReferenceScore(len, backward);
250 if (best_score < score) {
251 best_score = score;
252 best_len = len;
253 *best_len_out = best_len;
254 *best_len_code_out = best_len;
255 *best_distance_out = backward;
256 *best_score_out = score;
257 compare_char = ring_buffer[cur_ix_masked + best_len];
258 match_found = true;
259 }
260 }
261 }
262 }
263 if (kUseDictionary && !match_found &&
264 num_dict_matches_ >= (num_dict_lookups_ >> 7)) {
265 ++num_dict_lookups_;
266 const uint32_t dict_key = Hash<14>(&ring_buffer[cur_ix_masked]) << 1;
267 const uint16_t v = kStaticDictionaryHash[dict_key];
268 if (v > 0) {
269 const uint32_t len = v & 31;
270 const uint32_t dist = v >> 5;
271 const size_t offset =
272 kBrotliDictionaryOffsetsByLength[len] + len * dist;
273 if (len <= max_length) {
274 const size_t matchlen =
275 FindMatchLengthWithLimit(&ring_buffer[cur_ix_masked],
276 &kBrotliDictionary[offset], len);
277 if (matchlen + kCutoffTransformsCount > len && matchlen > 0) {
278 const size_t transform_id = kCutoffTransforms[len - matchlen];
279 const size_t word_id =
280 transform_id * (1u << kBrotliDictionarySizeBitsByLength[len]) +
281 dist;
282 const size_t backward = max_backward + word_id + 1;
283 const double score = BackwardReferenceScore(matchlen, backward);
284 if (best_score < score) {
285 ++num_dict_matches_;
286 best_score = score;
287 best_len = matchlen;
288 *best_len_out = best_len;
289 *best_len_code_out = len;
290 *best_distance_out = backward;
291 *best_score_out = best_score;
292 match_found = true;
293 }
294 }
295 }
296 }
297 }
298 const uint32_t off = (cur_ix >> 3) % kBucketSweep;
299 buckets_[key + off] = static_cast<uint32_t>(cur_ix);
300 return match_found;
301 }
302
303 enum { kHashLength = 5 };
304 enum { kHashTypeLength = 8 };
305 // HashBytes is the function that chooses the bucket to place
306 // the address in. The HashLongestMatch and HashLongestMatchQuickly
307 // classes have separate, different implementations of hashing.
308 static uint32_t HashBytes(const uint8_t *data) {
309 // Computing a hash based on 5 bytes works much better for
310 // qualities 1 and 3, where the next hash value is likely to replace
311 uint64_t h = (BROTLI_UNALIGNED_LOAD64(data) << 24) * kHashMul32;
312 // The higher bits contain more mixture from the multiplication,
313 // so we take our results from there.
314 return static_cast<uint32_t>(h >> (64 - kBucketBits));
315 }
316
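[Note: this variant hashes 5 bytes instead of 4: shifting the 64-bit load left by 24 bits drops the three most significant bytes, so exactly five input bytes feed the multiply (the first five on a little-endian machine). A sketch, not from the patch, with memcpy standing in for BROTLI_UNALIGNED_LOAD64; the collision shown assumes little-endian:]

```c
/* Sketch: the shift-by-24 trick in the 5-byte hash above. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint32_t Hash5Bytes(const uint8_t* data, int bucket_bits) {
  uint64_t word;
  memcpy(&word, data, sizeof(word));          /* reads 8 bytes */
  uint64_t h = (word << 24) * 0x1e35a7bdULL;  /* only 5 bytes survive the shift */
  return (uint32_t)(h >> (64 - bucket_bits));
}

int main(void) {
  const uint8_t a[] = "abcdeXYZ";
  const uint8_t b[] = "abcdeQRS";  /* same first 5 bytes, different tail */
  printf("%u == %u\n", Hash5Bytes(a, 14), Hash5Bytes(b, 14));
  return 0;
}
```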
317 enum { kHashMapSize = 4 << kBucketBits };
318
319 private:
320 static const uint32_t kBucketSize = 1 << kBucketBits;
321 uint32_t buckets_[kBucketSize + kBucketSweep];
322 // True if buckets_ array needs to be initialized.
323 bool need_init_;
324 size_t num_dict_lookups_;
325 size_t num_dict_matches_;
326 };
327
328 // A (forgetful) hash table to the data seen by the compressor, to
329 // help create backward references to previous data.
330 //
331 // This is a hash map of fixed size (kBucketSize) to a ring buffer of
332 // fixed size (kBlockSize). The ring buffer contains the last kBlockSize
333 // index positions of the given hash key in the compressed data.
334 template <int kBucketBits,
335 int kBlockBits,
336 int kNumLastDistancesToCheck>
337 class HashLongestMatch {
338 public:
339 HashLongestMatch(void) {
340 Reset();
341 }
342
343 void Reset(void) {
344 need_init_ = true;
345 num_dict_lookups_ = 0;
346 num_dict_matches_ = 0;
347 }
348
349 void Init(void) {
350 if (need_init_) {
351 memset(&num_[0], 0, sizeof(num_));
352 need_init_ = false;
353 }
354 }
355
356 void InitForData(const uint8_t* data, size_t num) {
357 for (size_t i = 0; i < num; ++i) {
358 const uint32_t key = HashBytes(&data[i]);
359 num_[key] = 0;
360 need_init_ = false;
361 }
362 }
363
364 // Look at 3 bytes at data.
365 // Compute a hash from these, and store the value of ix at that position.
366 inline void Store(const uint8_t *data, const uint32_t ix) {
367 const uint32_t key = HashBytes(data);
368 const int minor_ix = num_[key] & kBlockMask;
369 buckets_[key][minor_ix] = ix;
370 ++num_[key];
371 }
372
373 // Find a longest backward match of &data[cur_ix] up to the length of
374 // max_length and stores the position cur_ix in the hash table.
375 //
376 // Does not look for matches longer than max_length.
377 // Does not look for matches further away than max_backward.
378 // Writes the best found match length into best_len_out.
379 // Writes the index (&data[index]) offset from the start of the best match
380 // into best_distance_out.
381 // Write the score of the best match into best_score_out.
382 bool FindLongestMatch(const uint8_t * __restrict data,
383 const size_t ring_buffer_mask,
384 const int* __restrict distance_cache,
385 const size_t cur_ix,
386 const size_t max_length,
387 const size_t max_backward,
388 size_t * __restrict best_len_out,
389 size_t * __restrict best_len_code_out,
390 size_t * __restrict best_distance_out,
391 double * __restrict best_score_out) {
392 *best_len_code_out = 0;
393 const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
394 bool match_found = false;
395 // Don't accept a short copy from far away.
396 double best_score = *best_score_out;
397 size_t best_len = *best_len_out;
398 *best_len_out = 0;
399 // Try last distance first.
400 for (size_t i = 0; i < kNumLastDistancesToCheck; ++i) {
401 const size_t idx = kDistanceCacheIndex[i];
402 const size_t backward =
403 static_cast<size_t>(distance_cache[idx] + kDistanceCacheOffset[i]);
404 size_t prev_ix = static_cast<size_t>(cur_ix - backward);
405 if (prev_ix >= cur_ix) {
406 continue;
407 }
408 if (PREDICT_FALSE(backward > max_backward)) {
409 continue;
410 }
411 prev_ix &= ring_buffer_mask;
412
413 if (cur_ix_masked + best_len > ring_buffer_mask ||
414 prev_ix + best_len > ring_buffer_mask ||
415 data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
416 continue;
417 }
418 const size_t len = FindMatchLengthWithLimit(&data[prev_ix],
419 &data[cur_ix_masked],
420 max_length);
421 if (len >= 3 || (len == 2 && i < 2)) {
422 // Comparing for >= 2 does not change the semantics, but just saves for
423 // a few unnecessary binary logarithms in backward reference score,
424 // since we are not interested in such short matches.
425 double score = BackwardReferenceScoreUsingLastDistance(len, i);
426 if (best_score < score) {
427 best_score = score;
428 best_len = len;
429 *best_len_out = best_len;
430 *best_len_code_out = best_len;
431 *best_distance_out = backward;
432 *best_score_out = best_score;
433 match_found = true;
434 }
435 }
436 }
437 const uint32_t key = HashBytes(&data[cur_ix_masked]);
438 const uint32_t * __restrict const bucket = &buckets_[key][0];
439 const size_t down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
440 for (size_t i = num_[key]; i > down;) {
441 --i;
442 size_t prev_ix = bucket[i & kBlockMask];
443 const size_t backward = cur_ix - prev_ix;
444 if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
445 break; 390 break;
446 } 391 }
447 prev_ix &= ring_buffer_mask; 392 if (data[cur_ix_masked + len] > data[prev_ix_masked + len]) {
448 if (cur_ix_masked + best_len > ring_buffer_mask || 393 best_len_left = len;
449 prev_ix + best_len > ring_buffer_mask || 394 if (should_reroot_tree) {
450 data[cur_ix_masked + best_len] != data[prev_ix + best_len]) { 395 self->forest_[node_left] = (uint32_t)prev_ix;
451 continue; 396 }
397 node_left = FN(RightChildIndex)(self, prev_ix);
398 prev_ix = self->forest_[node_left];
399 } else {
400 best_len_right = len;
401 if (should_reroot_tree) {
402 self->forest_[node_right] = (uint32_t)prev_ix;
403 }
404 node_right = FN(LeftChildIndex)(self, prev_ix);
405 prev_ix = self->forest_[node_right];
452 } 406 }
453 const size_t len = FindMatchLengthWithLimit(&data[prev_ix], 407 }
454 &data[cur_ix_masked], 408 }
455 max_length); 409 return matches;
456 if (len >= 4) { 410 }
457 // Comparing for >= 3 does not change the semantics, but just saves 411
458 // for a few unnecessary binary logarithms in backward reference 412 /* Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the
459 // score, since we are not interested in such short matches. 413 length of max_length and stores the position cur_ix in the hash table.
460 double score = BackwardReferenceScore(len, backward); 414
461 if (best_score < score) { 415 Sets *num_matches to the number of matches found, and stores the found
462 best_score = score; 416 matches in matches[0] to matches[*num_matches - 1]. The matches will be
463 best_len = len; 417 sorted by strictly increasing length and (non-strictly) increasing
464 *best_len_out = best_len; 418 distance. */
465 *best_len_code_out = best_len; 419 static BROTLI_INLINE size_t FN(FindAllMatches)(HashToBinaryTree* self,
466 *best_distance_out = backward; 420 const uint8_t* data, const size_t ring_buffer_mask, const size_t cur_ix,
467 *best_score_out = best_score; 421 const size_t max_length, const size_t max_backward,
468 match_found = true; 422 const BrotliEncoderParams* params, BackwardMatch* matches) {
469 } 423 BackwardMatch* const orig_matches = matches;
470 } 424 const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
471 } 425 size_t best_len = 1;
472 buckets_[key][num_[key] & kBlockMask] = static_cast<uint32_t>(cur_ix); 426 const size_t short_match_max_backward =
473 ++num_[key]; 427 params->quality != HQ_ZOPFLIFICATION_QUALITY ? 16 : 64;
474 if (!match_found && num_dict_matches_ >= (num_dict_lookups_ >> 7)) { 428 size_t stop = cur_ix - short_match_max_backward;
475 size_t dict_key = Hash<14>(&data[cur_ix_masked]) << 1; 429 uint32_t dict_matches[BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1];
476 for (int k = 0; k < 2; ++k, ++dict_key) { 430 size_t i;
477 ++num_dict_lookups_; 431 if (cur_ix < short_match_max_backward) { stop = 0; }
478 const uint16_t v = kStaticDictionaryHash[dict_key]; 432 for (i = cur_ix - 1; i > stop && best_len <= 2; --i) {
479 if (v > 0) { 433 size_t prev_ix = i;
480 const size_t len = v & 31; 434 const size_t backward = cur_ix - prev_ix;
481 const size_t dist = v >> 5; 435 if (BROTLI_PREDICT_FALSE(backward > max_backward)) {
482 const size_t offset = 436 break;
483 kBrotliDictionaryOffsetsByLength[len] + len * dist; 437 }
484 if (len <= max_length) { 438 prev_ix &= ring_buffer_mask;
485 const size_t matchlen = 439 if (data[cur_ix_masked] != data[prev_ix] ||
486 FindMatchLengthWithLimit(&data[cur_ix_masked], 440 data[cur_ix_masked + 1] != data[prev_ix + 1]) {
487 &kBrotliDictionary[offset], len); 441 continue;
488 if (matchlen + kCutoffTransformsCount > len && matchlen > 0) { 442 }
489 const size_t transform_id = kCutoffTransforms[len - matchlen]; 443 {
490 const size_t word_id =
491 transform_id * (1 << kBrotliDictionarySizeBitsByLength[len]) +
492 dist;
493 const size_t backward = max_backward + word_id + 1;
494 double score = BackwardReferenceScore(matchlen, backward);
495 if (best_score < score) {
496 ++num_dict_matches_;
497 best_score = score;
498 best_len = matchlen;
499 *best_len_out = best_len;
500 *best_len_code_out = len;
501 *best_distance_out = backward;
502 *best_score_out = best_score;
503 match_found = true;
504 }
505 }
506 }
507 }
508 }
509 }
510 return match_found;
511 }
512
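[Note: HashLongestMatch keeps, per key, a ring of the last kBlockSize positions: num_[key] counts insertions and i & kBlockMask indexes the ring, so lookups walk newest-first and stop after one block. A sketch with illustrative sizes, not the encoder's:]

```c
/* Sketch: one bucket's position ring, stored and walked newest-first. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE 4
#define BLOCK_MASK (BLOCK_SIZE - 1)

static uint32_t bucket[BLOCK_SIZE];
static uint16_t num = 0;

static void store(uint32_t pos) {
  bucket[num & BLOCK_MASK] = pos;  /* overwrite the oldest slot */
  ++num;
}

int main(void) {
  for (uint32_t pos = 100; pos < 106; ++pos) store(pos);
  /* Walk the ring newest-first; stop after at most BLOCK_SIZE entries. */
  size_t down = (num > BLOCK_SIZE) ? (num - BLOCK_SIZE) : 0;
  for (size_t i = num; i > down;) {
    --i;
    printf("candidate position %u\n", bucket[i & BLOCK_MASK]);
  }
  return 0;
}
```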
513 // Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the
514 // length of max_length and stores the position cur_ix in the hash table.
515 //
516 // Sets *num_matches to the number of matches found, and stores the found
517 // matches in matches[0] to matches[*num_matches - 1]. The matches will be
518 // sorted by strictly increasing length and (non-strictly) increasing
519 // distance.
520 size_t FindAllMatches(const uint8_t* data,
521 const size_t ring_buffer_mask,
522 const size_t cur_ix,
523 const size_t max_length,
524 const size_t max_backward,
525 BackwardMatch* matches) {
526 BackwardMatch* const orig_matches = matches;
527 const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
528 size_t best_len = 1;
529 size_t stop = cur_ix - 64;
530 if (cur_ix < 64) { stop = 0; }
531 for (size_t i = cur_ix - 1; i > stop && best_len <= 2; --i) {
532 size_t prev_ix = i;
533 const size_t backward = cur_ix - prev_ix;
534 if (PREDICT_FALSE(backward > max_backward)) {
535 break;
536 }
537 prev_ix &= ring_buffer_mask;
538 if (data[cur_ix_masked] != data[prev_ix] ||
539 data[cur_ix_masked + 1] != data[prev_ix + 1]) {
540 continue;
541 }
542 const size_t len = 444 const size_t len =
543 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], 445 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
544 max_length); 446 max_length);
545 if (len > best_len) { 447 if (len > best_len) {
546 best_len = len; 448 best_len = len;
547 *matches++ = BackwardMatch(backward, len); 449 InitBackwardMatch(matches++, backward, len);
548 } 450 }
549 } 451 }
550 const uint32_t key = HashBytes(&data[cur_ix_masked]); 452 }
551 const uint32_t * __restrict const bucket = &buckets_[key][0]; 453 if (best_len < max_length) {
552 const size_t down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0; 454 matches = FN(StoreAndFindMatches)(self, data, cur_ix, ring_buffer_mask,
553 for (size_t i = num_[key]; i > down;) { 455 max_length, max_backward, &best_len, matches);
554 --i; 456 }
555 size_t prev_ix = bucket[i & kBlockMask]; 457 for (i = 0; i <= BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN; ++i) {
556 const size_t backward = cur_ix - prev_ix; 458 dict_matches[i] = kInvalidMatch;
557 if (PREDICT_FALSE(backward == 0 || backward > max_backward)) { 459 }
558 break; 460 {
559 } 461 size_t minlen = BROTLI_MAX(size_t, 4, best_len + 1);
560 prev_ix &= ring_buffer_mask; 462 if (BrotliFindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen,
561 if (cur_ix_masked + best_len > ring_buffer_mask || 463 max_length, &dict_matches[0])) {
562 prev_ix + best_len > ring_buffer_mask || 464 size_t maxlen = BROTLI_MIN(
563 data[cur_ix_masked + best_len] != data[prev_ix + best_len]) { 465 size_t, BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN, max_length);
564 continue; 466 size_t l;
565 } 467 for (l = minlen; l <= maxlen; ++l) {
566 const size_t len =
567 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
568 max_length);
569 if (len > best_len) {
570 best_len = len;
571 *matches++ = BackwardMatch(backward, len);
572 }
573 }
574 buckets_[key][num_[key] & kBlockMask] = static_cast<uint32_t>(cur_ix);
575 ++num_[key];
576 uint32_t dict_matches[kMaxDictionaryMatchLen + 1];
577 for (size_t i = 0; i <= kMaxDictionaryMatchLen; ++i) {
578 dict_matches[i] = kInvalidMatch;
579 }
580 size_t minlen = std::max<size_t>(4, best_len + 1);
581 if (FindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen, max_length,
582 &dict_matches[0])) {
583 size_t maxlen = std::min<size_t>(kMaxDictionaryMatchLen, max_length);
584 for (size_t l = minlen; l <= maxlen; ++l) {
585 uint32_t dict_id = dict_matches[l]; 468 uint32_t dict_id = dict_matches[l];
586 if (dict_id < kInvalidMatch) { 469 if (dict_id < kInvalidMatch) {
587 *matches++ = BackwardMatch(max_backward + (dict_id >> 5) + 1, l, 470 InitDictionaryBackwardMatch(matches++,
588 dict_id & 31); 471 max_backward + (dict_id >> 5) + 1, l, dict_id & 31);
589 } 472 }
590 } 473 }
591 } 474 }
592 return static_cast<size_t>(matches - orig_matches); 475 }
593 } 476 return (size_t)(matches - orig_matches);
594 477 }
595 enum { kHashLength = 4 }; 478
596 enum { kHashTypeLength = 4 }; 479 /* Stores the hash of the next 4 bytes and re-roots the binary tree at the
597 480 current sequence, without returning any matches.
598 // HashBytes is the function that chooses the bucket to place 481 REQUIRES: ix + MAX_TREE_COMP_LENGTH <= end-of-current-block */
599 // the address in. The HashLongestMatch and HashLongestMatchQuickly 482 static BROTLI_INLINE void FN(Store)(HashToBinaryTree* self, const uint8_t *data,
600 // classes have separate, different implementations of hashing. 483 const size_t mask, const size_t ix) {
601 static uint32_t HashBytes(const uint8_t *data) { 484 /* Maximum distance is window size - 16, see section 9.1. of the spec. */
602 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32; 485 const size_t max_backward = self->window_mask_ - BROTLI_WINDOW_GAP + 1;
603 // The higher bits contain more mixture from the multiplication, 486 FN(StoreAndFindMatches)(self, data, ix, mask, MAX_TREE_COMP_LENGTH,
604 // so we take our results from there. 487 max_backward, NULL, NULL);
605 return h >> (32 - kBucketBits); 488 }
606 } 489
607 490 static BROTLI_INLINE void FN(StoreRange)(HashToBinaryTree* self,
608 enum { kHashMapSize = 2 << kBucketBits }; 491 const uint8_t *data, const size_t mask, const size_t ix_start,
609 492 const size_t ix_end) {
610 static const size_t kMaxNumMatches = 64 + (1 << kBlockBits); 493 size_t i = ix_start;
611 494 size_t j = ix_start;
612 private: 495 if (ix_start + 63 <= ix_end) {
613 // Number of hash buckets. 496 i = ix_end - 63;
614 static const uint32_t kBucketSize = 1 << kBucketBits; 497 }
615 498 if (ix_start + 512 <= i) {
616 // Only kBlockSize newest backward references are kept, 499 for (; j < i; j += 8) {
617 // and the older are forgotten. 500 FN(Store)(self, data, mask, j);
618 static const uint32_t kBlockSize = 1 << kBlockBits; 501 }
619 502 }
620 // Mask for accessing entries in a block (in a ringbuffer manner). 503 for (; i < ix_end; ++i) {
621 static const uint32_t kBlockMask = (1 << kBlockBits) - 1; 504 FN(Store)(self, data, mask, i);
622 505 }
623 // Number of entries in a particular bucket. 506 }
624 uint16_t num_[kBucketSize]; 507
625 508 static BROTLI_INLINE void FN(StitchToPreviousBlock)(HashToBinaryTree* self,
626 // Buckets containing kBlockSize of backward references. 509 size_t num_bytes, size_t position, const uint8_t* ringbuffer,
627 uint32_t buckets_[kBucketSize][kBlockSize]; 510 size_t ringbuffer_mask) {
628 511 if (num_bytes >= FN(HashTypeLength)() - 1 &&
629 // True if num_ array needs to be initialized. 512 position >= MAX_TREE_COMP_LENGTH) {
630 bool need_init_; 513 /* Store the last `MAX_TREE_COMP_LENGTH - 1` positions in the hasher.
631 514 These could not be calculated before, since they require knowledge
632 size_t num_dict_lookups_; 515 of both the previous and the current block. */
633 size_t num_dict_matches_; 516 const size_t i_start = position - MAX_TREE_COMP_LENGTH + 1;
634 }; 517 const size_t i_end = BROTLI_MIN(size_t, position, i_start + num_bytes);
635 518 size_t i;
636 // A (forgetful) hash table where each hash bucket contains a binary tree of 519 for (i = i_start; i < i_end; ++i) {
637 // sequences whose first 4 bytes share the same hash code. 520 /* Maximum distance is window size - 16, see section 9.1. of the spec.
638 // Each sequence is kMaxTreeCompLength long and is identified by its starting 521 Furthermore, we have to make sure that we don't look further back
639 // position in the input data. The binary tree is sorted by the lexicographic 522 from the start of the next block than the window size, otherwise we
640 // order of the sequences, and it is also a max-heap with respect to the 523 could access already overwritten areas of the ring-buffer. */
641 // starting positions. 524 const size_t max_backward =
642 class HashToBinaryTree { 525 self->window_mask_ - BROTLI_MAX(size_t,
643 public: 526 BROTLI_WINDOW_GAP - 1,
644 HashToBinaryTree() : forest_(NULL) { 527 position - i);
645 Reset(); 528 /* We know that i + MAX_TREE_COMP_LENGTH <= position + num_bytes, i.e. the
646 } 529 end of the current block and that we have at least
647 530 MAX_TREE_COMP_LENGTH tail in the ring-buffer. */
648 ~HashToBinaryTree() { 531 FN(StoreAndFindMatches)(self, ringbuffer, i, ringbuffer_mask,
649 delete[] forest_; 532 MAX_TREE_COMP_LENGTH, max_backward, NULL, NULL);
650 } 533 }
651 534 }
652 void Reset() { 535 }
653 need_init_ = true; 536
654 } 537 #undef BUCKET_SIZE
655 538 #undef BUCKET_BITS
656 void Init(int lgwin, size_t position, size_t bytes, bool is_last) { 539
657 if (need_init_) { 540 #undef HASHER
658 window_mask_ = (1u << lgwin) - 1u; 541
659 invalid_pos_ = static_cast<uint32_t>(0 - window_mask_); 542 /* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression
660 for (uint32_t i = 0; i < kBucketSize; i++) { 543 a little faster (0.5% - 1%) and it compresses 0.15% better on small text
661 buckets_[i] = invalid_pos_; 544 and HTML inputs. */
662 } 545
663 size_t num_nodes = (position == 0 && is_last) ? bytes : window_mask_ + 1; 546 #define HASHER() H2
664 forest_ = new uint32_t[2 * num_nodes]; 547 #define BUCKET_BITS 16
665 need_init_ = false; 548 #define BUCKET_SWEEP 1
666 } 549 #define USE_DICTIONARY 1
667 } 550 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
668 551 #undef BUCKET_SWEEP
669 // Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the 552 #undef USE_DICTIONARY
670 // length of max_length and stores the position cur_ix in the hash table. 553 #undef HASHER
671 // 554
672 // Sets *num_matches to the number of matches found, and stores the found 555 #define HASHER() H3
673 // matches in matches[0] to matches[*num_matches - 1]. The matches will be 556 #define BUCKET_SWEEP 2
674 // sorted by strictly increasing length and (non-strictly) increasing 557 #define USE_DICTIONARY 0
675 // distance. 558 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
676 size_t FindAllMatches(const uint8_t* data, 559 #undef USE_DICTIONARY
677 const size_t ring_buffer_mask, 560 #undef BUCKET_SWEEP
678 const size_t cur_ix, 561 #undef BUCKET_BITS
679 const size_t max_length, 562 #undef HASHER
680 const size_t max_backward, 563
681 BackwardMatch* matches) { 564 #define HASHER() H4
682 BackwardMatch* const orig_matches = matches; 565 #define BUCKET_BITS 17
683 const size_t cur_ix_masked = cur_ix & ring_buffer_mask; 566 #define BUCKET_SWEEP 4
684 size_t best_len = 1; 567 #define USE_DICTIONARY 1
685 size_t stop = cur_ix - 64; 568 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
686 if (cur_ix < 64) { stop = 0; } 569 #undef USE_DICTIONARY
687 for (size_t i = cur_ix - 1; i > stop && best_len <= 2; --i) { 570 #undef BUCKET_SWEEP
688 size_t prev_ix = i; 571 #undef BUCKET_BITS
689 const size_t backward = cur_ix - prev_ix; 572 #undef HASHER
690 if (PREDICT_FALSE(backward > max_backward)) { 573
691 break; 574 #define HASHER() H5
692 } 575 #define BUCKET_BITS 14
693 prev_ix &= ring_buffer_mask; 576 #define BLOCK_BITS 4
694 if (data[cur_ix_masked] != data[prev_ix] || 577 #define NUM_LAST_DISTANCES_TO_CHECK 4
695 data[cur_ix_masked + 1] != data[prev_ix + 1]) { 578 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */
696 continue; 579 #undef BLOCK_BITS
697 } 580 #undef HASHER
698 const size_t len = 581
699 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], 582 #define HASHER() H6
700 max_length); 583 #define BLOCK_BITS 5
701 if (len > best_len) { 584 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */
702 best_len = len; 585 #undef NUM_LAST_DISTANCES_TO_CHECK
703 *matches++ = BackwardMatch(backward, len); 586 #undef BLOCK_BITS
704 } 587 #undef BUCKET_BITS
705 } 588 #undef HASHER
706 if (best_len < max_length) { 589
707 matches = StoreAndFindMatches(data, cur_ix, ring_buffer_mask, 590 #define HASHER() H7
708 max_length, &best_len, matches); 591 #define BUCKET_BITS 15
709 } 592 #define BLOCK_BITS 6
710 uint32_t dict_matches[kMaxDictionaryMatchLen + 1]; 593 #define NUM_LAST_DISTANCES_TO_CHECK 10
711 for (size_t i = 0; i <= kMaxDictionaryMatchLen; ++i) { 594 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */
712 dict_matches[i] = kInvalidMatch; 595 #undef BLOCK_BITS
713 } 596 #undef HASHER
714 size_t minlen = std::max<size_t>(4, best_len + 1); 597
715 if (FindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen, max_length, 598 #define HASHER() H8
716 &dict_matches[0])) { 599 #define BLOCK_BITS 7
717 size_t maxlen = std::min<size_t>(kMaxDictionaryMatchLen, max_length); 600 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */
718 for (size_t l = minlen; l <= maxlen; ++l) { 601 #undef NUM_LAST_DISTANCES_TO_CHECK
719 uint32_t dict_id = dict_matches[l]; 602 #undef BLOCK_BITS
720 if (dict_id < kInvalidMatch) { 603 #undef HASHER
721 *matches++ = BackwardMatch(max_backward + (dict_id >> 5) + 1, l, 604
722 dict_id & 31); 605 #define HASHER() H9
723 } 606 #define BLOCK_BITS 8
724 } 607 #define NUM_LAST_DISTANCES_TO_CHECK 16
725 } 608 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */
726 return static_cast<size_t>(matches - orig_matches); 609 #undef NUM_LAST_DISTANCES_TO_CHECK
727 } 610 #undef BLOCK_BITS
728 611 #undef BUCKET_BITS
729 // Stores the hash of the next 4 bytes and re-roots the binary tree at the 612 #undef HASHER
730 // current sequence, without returning any matches. 613
731 // REQUIRES: cur_ix + kMaxTreeCompLength <= end-of-current-block 614 #define BUCKET_BITS 15
732 void Store(const uint8_t* data, 615
733 const size_t ring_buffer_mask, 616 #define NUM_LAST_DISTANCES_TO_CHECK 4
734 const size_t cur_ix) { 617 #define NUM_BANKS 1
735 size_t best_len = 0; 618 #define BANK_BITS 16
736 StoreAndFindMatches(data, cur_ix, ring_buffer_mask, kMaxTreeCompLength, 619 #define HASHER() H40
737 &best_len, NULL); 620 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */
738 } 621 #undef HASHER
739 622 #undef NUM_LAST_DISTANCES_TO_CHECK
740 void StitchToPreviousBlock(size_t num_bytes, 623
741 size_t position, 624 #define NUM_LAST_DISTANCES_TO_CHECK 10
742 const uint8_t* ringbuffer, 625 #define HASHER() H41
743 size_t ringbuffer_mask) { 626 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */
744 if (num_bytes >= 3 && position >= kMaxTreeCompLength) { 627 #undef HASHER
745 // Store the last `kMaxTreeCompLength - 1` positions in the hasher. 628 #undef NUM_LAST_DISTANCES_TO_CHECK
746 // These could not be calculated before, since they require knowledge 629 #undef NUM_BANKS
747 // of both the previous and the current block. 630 #undef BANK_BITS
748 const size_t i_start = position - kMaxTreeCompLength + 1; 631
749 const size_t i_end = std::min(position, i_start + num_bytes); 632 #define NUM_LAST_DISTANCES_TO_CHECK 16
750 for (size_t i = i_start; i < i_end; ++i) { 633 #define NUM_BANKS 512
751 // We know that i + kMaxTreeCompLength <= position + num_bytes, i.e. the 634 #define BANK_BITS 9
752 // end of the current block and that we have at least 635 #define HASHER() H42
753 // kMaxTreeCompLength tail in the ringbuffer. 636 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */
754 Store(ringbuffer, ringbuffer_mask, i); 637 #undef HASHER
755 } 638 #undef NUM_LAST_DISTANCES_TO_CHECK
756 } 639 #undef NUM_BANKS
757 } 640 #undef BANK_BITS
758 641
759 static const size_t kMaxNumMatches = 64 + kMaxTreeSearchDepth; 642 #undef BUCKET_BITS
760 643
761 private: 644 #undef FN
762 // Stores the hash of the next 4 bytes and in a single tree-traversal, the 645 #undef CAT
763 // hash bucket's binary tree is searched for matches and is re-rooted at the 646 #undef EXPAND_CAT
764 // current position. 647
765 // 648 #define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(7) H(8) H(9) \
766 // If less than kMaxTreeCompLength data is available, the hash bucket of the 649 H(40) H(41) H(42)
767 // current position is searched for matches, but the state of the hash table 650 #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10)
768 // is not changed, since we can not know the final sorting order of the 651
769 // current (incomplete) sequence. 652 typedef struct Hashers {
770 // 653 #define MEMBER_(N) H ## N* h ## N;
771 // This function must be called with increasing cur_ix positions. 654 FOR_ALL_HASHERS(MEMBER_)
772 BackwardMatch* StoreAndFindMatches(const uint8_t* const __restrict data, 655 #undef MEMBER_
773 const size_t cur_ix, 656 } Hashers;
774 const size_t ring_buffer_mask, 657
775 const size_t max_length, 658 static BROTLI_INLINE void InitHashers(Hashers* self) {
776 size_t* const __restrict best_len, 659 #define INIT_(N) self->h ## N = 0;
777 BackwardMatch* __restrict matches) { 660 FOR_ALL_HASHERS(INIT_)
778 const size_t cur_ix_masked = cur_ix & ring_buffer_mask; 661 #undef INIT_
779 const size_t max_backward = window_mask_ - 15; 662 }
780 const size_t max_comp_len = std::min(max_length, kMaxTreeCompLength); 663
781 const bool reroot_tree = max_length >= kMaxTreeCompLength; 664 static BROTLI_INLINE void DestroyHashers(MemoryManager* m, Hashers* self) {
782 const uint32_t key = HashBytes(&data[cur_ix_masked]); 665 if (self->h10) CleanupH10(m, self->h10);
783 size_t prev_ix = buckets_[key]; 666 #define CLEANUP_(N) BROTLI_FREE(m, self->h ## N)
784 // The forest index of the rightmost node of the left subtree of the new 667 FOR_ALL_HASHERS(CLEANUP_)
785 // root, updated as we traverse and reroot the tree of the hash bucket. 668 #undef CLEANUP_
786 size_t node_left = LeftChildIndex(cur_ix); 669 }
787 // The forest index of the leftmost node of the right subtree of the new 670
788 // root, updated as we traverse and reroot the tree of the hash bucket. 671 static BROTLI_INLINE void HashersReset(Hashers* self, int type) {
789 size_t node_right = RightChildIndex(cur_ix); 672 switch (type) {
790 // The match length of the rightmost node of the left subtree of the new 673 #define RESET_(N) case N: ResetH ## N(self->h ## N); break;
791 // root, updated as we traverse and reroot the tree of the hash bucket. 674 FOR_ALL_HASHERS(RESET_)
792 size_t best_len_left = 0; 675 #undef RESET_
793 // The match length of the leftmost node of the right subtree of the new 676 default: break;
794 // root, updated as we traverse and reroot the tree of the hash bucket. 677 }
795 size_t best_len_right = 0; 678 }
796 if (reroot_tree) { 679
797 buckets_[key] = static_cast<uint32_t>(cur_ix); 680 static BROTLI_INLINE void HashersSetup(
798 } 681 MemoryManager* m, Hashers* self, int type) {
799 for (size_t depth_remaining = kMaxTreeSearchDepth; ; --depth_remaining) { 682 switch (type) {
800 const size_t backward = cur_ix - prev_ix; 683 #define SETUP_(N) case N: self->h ## N = BROTLI_ALLOC(m, H ## N, 1); break;
801 const size_t prev_ix_masked = prev_ix & ring_buffer_mask; 684 FOR_ALL_HASHERS(SETUP_)
802 if (backward == 0 || backward > max_backward || depth_remaining == 0) { 685 #undef SETUP_
803 if (reroot_tree) { 686 default: break;
804 forest_[node_left] = invalid_pos_; 687 }
805 forest_[node_right] = invalid_pos_; 688 if (BROTLI_IS_OOM(m)) return;
806 } 689 if (type == 10) InitializeH10(self->h10);
807 break; 690 HashersReset(self, type);
808 } 691 }
809 const size_t cur_len = std::min(best_len_left, best_len_right); 692
810 const size_t len = cur_len + 693 #define WARMUP_HASH_(N) \
811 FindMatchLengthWithLimit(&data[cur_ix_masked + cur_len], 694 static BROTLI_INLINE void WarmupHashH ## N(MemoryManager* m, \
812 &data[prev_ix_masked + cur_len], 695 const BrotliEncoderParams* params, const size_t size, const uint8_t* dict, \
813 max_length - cur_len); 696 H ## N* hasher) { \
814 if (len > *best_len) { 697 size_t overlap = (StoreLookaheadH ## N()) - 1; \
815 *best_len = len; 698 size_t i; \
816 if (matches) { 699 InitH ## N(m, hasher, dict, params, 0, size, BROTLI_FALSE); \
817 *matches++ = BackwardMatch(backward, len); 700 if (BROTLI_IS_OOM(m)) return; \
818 } 701 for (i = 0; i + overlap < size; i++) { \
819 if (len >= max_comp_len) { 702 StoreH ## N(hasher, dict, ~(size_t)0, i); \
820 if (reroot_tree) { 703 } \
821 forest_[node_left] = forest_[LeftChildIndex(prev_ix)]; 704 }
822 forest_[node_right] = forest_[RightChildIndex(prev_ix)]; 705 FOR_ALL_HASHERS(WARMUP_HASH_)
823 } 706 #undef WARMUP_HASH_
824 break; 707
825 } 708 /* Custom LZ77 window. */
826 } 709 static BROTLI_INLINE void HashersPrependCustomDictionary(
827 if (data[cur_ix_masked + len] > data[prev_ix_masked + len]) { 710 MemoryManager* m, Hashers* self, const BrotliEncoderParams* params,
828 best_len_left = len; 711 const size_t size, const uint8_t* dict) {
829 if (reroot_tree) { 712 int hasher_type = ChooseHasher(params);
830 forest_[node_left] = static_cast<uint32_t>(prev_ix); 713 switch (hasher_type) {
831 } 714 #define PREPEND_(N) \
832 node_left = RightChildIndex(prev_ix); 715 case N: WarmupHashH ## N(m, params, size, dict, self->h ## N); break;
833 prev_ix = forest_[node_left]; 716 FOR_ALL_HASHERS(PREPEND_)
834 } else { 717 #undef PREPEND_
835 best_len_right = len; 718 default: break;
836 if (reroot_tree) { 719 }
837 forest_[node_right] = static_cast<uint32_t>(prev_ix); 720 if (BROTLI_IS_OOM(m)) return;
838 } 721 }
839 node_right = LeftChildIndex(prev_ix); 722
840 prev_ix = forest_[node_right]; 723
841 } 724 #if defined(__cplusplus) || defined(c_plusplus)
842 } 725 } /* extern "C" */
843 return matches; 726 #endif
844 } 727
845 728 #endif /* BROTLI_ENC_HASH_H_ */
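[Note: StoreAndFindMatches, in both the old and new versions, performs a top-down split: while searching the bucket's binary tree it re-hangs every visited node on either the rightmost spine of the new root's left subtree or the leftmost spine of its right subtree, so the current position becomes the root in one pass. The real code orders sequences lexicographically and tracks best_len_left/best_len_right so each comparison can skip the already-matched prefix; the sketch below, not from the patch, shows the same re-rooting on plain integer keys:]

```c
/* Sketch: re-rooting a binary search tree at a fresh node in one pass. */
#include <stdio.h>

typedef struct Node { int key; struct Node *left, *right; } Node;

/* Nodes smaller than fresh->key become the rightmost spine of its left
   subtree; larger ones the leftmost spine of its right subtree. */
static Node* reroot(Node* root, Node* fresh) {
  Node **node_left = &fresh->left, **node_right = &fresh->right;
  while (root != NULL) {
    if (root->key < fresh->key) {
      *node_left = root;      /* hang root (and its left subtree) on the left spine */
      node_left = &root->right;
      root = root->right;
    } else {
      *node_right = root;     /* hang root (and its right subtree) on the right spine */
      node_right = &root->left;
      root = root->left;
    }
  }
  *node_left = NULL;          /* terminate both spines */
  *node_right = NULL;
  return fresh;
}

int main(void) {
  /* Build 3 <- 5 -> 8, then re-root at 6: left subtree {3,5}, right {8}. */
  Node n3 = {3, 0, 0}, n8 = {8, 0, 0}, n5 = {5, &n3, &n8}, n6 = {6, 0, 0};
  Node* root = reroot(&n5, &n6);
  printf("root %d, left %d, right %d\n", root->key,
         root->left->key, root->right->key);
  return 0;
}
```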
846 inline size_t LeftChildIndex(const size_t pos) {
847 return 2 * (pos & window_mask_);
848 }
849
850 inline size_t RightChildIndex(const size_t pos) {
851 return 2 * (pos & window_mask_) + 1;
852 }
853
854 static uint32_t HashBytes(const uint8_t *data) {
855 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
856 // The higher bits contain more mixture from the multiplication,
857 // so we take our results from there.
858 return h >> (32 - kBucketBits);
859 }
860
861 static const int kBucketBits = 17;
862 static const size_t kBucketSize = 1 << kBucketBits;
863
864 // The window size minus 1
865 size_t window_mask_;
866
867 // Hash table that maps the 4-byte hashes of the sequence to the last
868 // position where this hash was found, which is the root of the binary
869 // tree of sequences that share this hash bucket.
870 uint32_t buckets_[kBucketSize];
871
872 // The union of the binary trees of each hash bucket. The root of the tree
873 // corresponding to a hash is a sequence starting at buckets_[hash] and
874 // the left and right children of a sequence starting at pos are
875 // forest_[2 * pos] and forest_[2 * pos + 1].
876 uint32_t* forest_;
877
878 // A position used to mark a non-existent sequence, i.e. a tree is empty if
879 // its root is at invalid_pos_ and a node is a leaf if both its children
880 // are at invalid_pos_.
881 uint32_t invalid_pos_;
882
883 bool need_init_;
884 };
885
886 struct Hashers {
887 // For kBucketSweep == 1, enabling the dictionary lookup makes compression
888 // a little faster (0.5% - 1%) and it compresses 0.15% better on small text
889 // and html inputs.
890 typedef HashLongestMatchQuickly<16, 1, true> H2;
891 typedef HashLongestMatchQuickly<16, 2, false> H3;
892 typedef HashLongestMatchQuickly<17, 4, true> H4;
893 typedef HashLongestMatch<14, 4, 4> H5;
894 typedef HashLongestMatch<14, 5, 4> H6;
895 typedef HashLongestMatch<15, 6, 10> H7;
896 typedef HashLongestMatch<15, 7, 10> H8;
897 typedef HashLongestMatch<15, 8, 16> H9;
898 typedef HashToBinaryTree H10;
899
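[Note: these typedefs are the template-parameter view of what the C port expresses with BUCKET_BITS/BUCKET_SWEEP/USE_DICTIONARY defines around an #include, and the new Hashers struct is generated with an X-macro, so member declarations, init, and cleanup all derive from one FOR_ALL_HASHERS list. A compilable sketch of that pattern with dummy hasher types and a shortened list:]

```c
/* Sketch: the X-macro pattern behind the C port's Hashers struct. */
#include <stddef.h>
#include <stdio.h>

#define FOR_ALL_HASHERS(H) H(2) H(3) H(10)

typedef struct H2 { int dummy; } H2;
typedef struct H3 { int dummy; } H3;
typedef struct H10 { int dummy; } H10;

typedef struct Hashers {
#define MEMBER_(N) H ## N* h ## N;
  FOR_ALL_HASHERS(MEMBER_)   /* expands to: H2* h2; H3* h3; H10* h10; */
#undef MEMBER_
} Hashers;

int main(void) {
  Hashers hashers;
#define INIT_(N) hashers.h ## N = NULL;
  FOR_ALL_HASHERS(INIT_)     /* one line initializes every member */
#undef INIT_
  printf("h2=%p h3=%p h10=%p\n", (void*)hashers.h2,
         (void*)hashers.h3, (void*)hashers.h10);
  return 0;
}
```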
900 Hashers(void) : hash_h2(0), hash_h3(0), hash_h4(0), hash_h5(0),
901 hash_h6(0), hash_h7(0), hash_h8(0), hash_h9(0), hash_h10(0) {}
902
903 ~Hashers(void) {
904 delete hash_h2;
905 delete hash_h3;
906 delete hash_h4;
907 delete hash_h5;
908 delete hash_h6;
909 delete hash_h7;
910 delete hash_h8;
911 delete hash_h9;
912 delete hash_h10;
913 }
914
915 void Init(int type) {
916 switch (type) {
917 case 2: hash_h2 = new H2; break;
918 case 3: hash_h3 = new H3; break;
919 case 4: hash_h4 = new H4; break;
920 case 5: hash_h5 = new H5; break;
921 case 6: hash_h6 = new H6; break;
922 case 7: hash_h7 = new H7; break;
923 case 8: hash_h8 = new H8; break;
924 case 9: hash_h9 = new H9; break;
925 case 10: hash_h10 = new H10; break;
926 default: break;
927 }
928 }
929
930 template<typename Hasher>
931 void WarmupHash(const size_t size, const uint8_t* dict, Hasher* hasher) {
932 hasher->Init();
933 for (size_t i = 0; i + Hasher::kHashTypeLength - 1 < size; i++) {
934 hasher->Store(&dict[i], static_cast<uint32_t>(i));
935 }
936 }
937
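[Note: WarmupHash, like the per-hasher WarmupHashH##N macros in the C version, primes a hasher from a caller-supplied dictionary by storing every position whose full hash window fits, so later input can back-reference into the dictionary. A sketch of the loop-bound logic; the names are illustrative stand-ins:]

```c
/* Sketch: warming a hasher with a custom dictionary before compression. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define HASH_TYPE_LENGTH 4  /* bytes each Store() reads, as for H5-H9 */

static void StoreSketch(const uint8_t* data, uint32_t ix) {
  /* A real hasher would insert `ix` into the bucket for hash(data). */
  (void)data;
  printf("stored position %u\n", ix);
}

static void WarmupSketch(const uint8_t* dict, size_t size) {
  /* Stop early so Store() never reads past the end of the dictionary. */
  for (size_t i = 0; i + HASH_TYPE_LENGTH - 1 < size; ++i) {
    StoreSketch(&dict[i], (uint32_t)i);
  }
}

int main(void) {
  const uint8_t dict[] = "common phrases";
  WarmupSketch(dict, sizeof(dict) - 1);
  return 0;
}
```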
938 // Custom LZ77 window.
939 void PrependCustomDictionary(
940 int type, int lgwin, const size_t size, const uint8_t* dict) {
941 switch (type) {
942 case 2: WarmupHash(size, dict, hash_h2); break;
943 case 3: WarmupHash(size, dict, hash_h3); break;
944 case 4: WarmupHash(size, dict, hash_h4); break;
945 case 5: WarmupHash(size, dict, hash_h5); break;
946 case 6: WarmupHash(size, dict, hash_h6); break;
947 case 7: WarmupHash(size, dict, hash_h7); break;
948 case 8: WarmupHash(size, dict, hash_h8); break;
949 case 9: WarmupHash(size, dict, hash_h9); break;
950 case 10:
951 hash_h10->Init(lgwin, 0, size, false);
952 for (size_t i = 0; i + kMaxTreeCompLength - 1 < size; ++i) {
953 hash_h10->Store(dict, std::numeric_limits<size_t>::max(), i);
954 }
955 break;
956 default: break;
957 }
958 }
959
960
961 H2* hash_h2;
962 H3* hash_h3;
963 H4* hash_h4;
964 H5* hash_h5;
965 H6* hash_h6;
966 H7* hash_h7;
967 H8* hash_h8;
968 H9* hash_h9;
969 H10* hash_h10;
970 };
971
972 } // namespace brotli
973
974 #endif // BROTLI_ENC_HASH_H_
