| OLD | NEW |
| 1 /* Copyright 2010 Google Inc. All Rights Reserved. | 1 /* Copyright 2010 Google Inc. All Rights Reserved. |
| 2 | 2 |
| 3 Distributed under MIT license. | 3 Distributed under MIT license. |
| 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT | 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
| 5 */ | 5 */ |
| 6 | 6 |
| 7 // A (forgetful) hash table to the data seen by the compressor, to | 7 /* A (forgetful) hash table to the data seen by the compressor, to |
| 8 // help create backward references to previous data. | 8 help create backward references to previous data. */ |
| 9 | 9 |
| 10 #ifndef BROTLI_ENC_HASH_H_ | 10 #ifndef BROTLI_ENC_HASH_H_ |
| 11 #define BROTLI_ENC_HASH_H_ | 11 #define BROTLI_ENC_HASH_H_ |
| 12 | 12 |
| 13 #include <sys/types.h> | 13 #include <string.h> /* memcmp, memset */ |
| 14 #include <algorithm> | |
| 15 #include <cstring> | |
| 16 #include <limits> | |
| 17 | 14 |
| 15 #include "../common/constants.h" |
| 16 #include "../common/dictionary.h" |
| 17 #include <brotli/types.h> |
| 18 #include "./dictionary_hash.h" | 18 #include "./dictionary_hash.h" |
| 19 #include "./fast_log.h" | 19 #include "./fast_log.h" |
| 20 #include "./find_match_length.h" | 20 #include "./find_match_length.h" |
| 21 #include "./memory.h" |
| 21 #include "./port.h" | 22 #include "./port.h" |
| 22 #include "./prefix.h" | 23 #include "./quality.h" |
| 23 #include "./static_dict.h" | 24 #include "./static_dict.h" |
| 24 #include "./transform.h" | |
| 25 #include "./types.h" | |
| 26 | 25 |
| 27 namespace brotli { | 26 #if defined(__cplusplus) || defined(c_plusplus) |
| 27 extern "C" { |
| 28 #endif |
| 28 | 29 |
| 29 static const size_t kMaxTreeSearchDepth = 64; | 30 #define MAX_TREE_SEARCH_DEPTH 64 |
| 30 static const size_t kMaxTreeCompLength = 128; | 31 #define MAX_TREE_COMP_LENGTH 128 |
| 32 #define score_t size_t |
| 31 | 33 |
| 32 static const uint32_t kDistanceCacheIndex[] = { | 34 static const uint32_t kDistanceCacheIndex[] = { |
| 33 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, | 35 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, |
| 34 }; | 36 }; |
| 35 static const int kDistanceCacheOffset[] = { | 37 static const int kDistanceCacheOffset[] = { |
| 36 0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3 | 38 0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3 |
| 37 }; | 39 }; |
| 38 | 40 |
| 39 static const uint32_t kCutoffTransformsCount = 10; | 41 static const uint32_t kCutoffTransformsCount = 10; |
| 40 static const uint8_t kCutoffTransforms[] = { | 42 static const uint8_t kCutoffTransforms[] = { |
| 41 0, 12, 27, 23, 42, 63, 56, 48, 59, 64 | 43 0, 12, 27, 23, 42, 63, 56, 48, 59, 64 |
| 42 }; | 44 }; |
| 43 | 45 |
| 44 // kHashMul32 multiplier has these properties: | 46 typedef struct HasherSearchResult { |
| 45 // * The multiplier must be odd. Otherwise we may lose the highest bit. | 47 size_t len; |
| 46 // * No long streaks of 1s or 0s. | 48 size_t len_x_code; /* == len ^ len_code */ |
| 47 // * There is no effort to ensure that it is a prime, the oddity is enough | 49 size_t distance; |
| 48 // for this use. | 50 score_t score; |
| 49 // * The number has been tuned heuristically against compression benchmarks. | 51 } HasherSearchResult; |
| 52 |
| 53 typedef struct DictionarySearchStatictics { |
| 54 size_t num_lookups; |
| 55 size_t num_matches; |
| 56 } DictionarySearchStatictics; |
| 57 |
| 58 /* kHashMul32 multiplier has these properties: |
| 59 * The multiplier must be odd. Otherwise we may lose the highest bit. |
| 60 * No long streaks of ones or zeros. |
| 61 * There is no effort to ensure that it is a prime, the oddity is enough |
| 62 for this use. |
| 63 * The number has been tuned heuristically against compression benchmarks. */ |
| 50 static const uint32_t kHashMul32 = 0x1e35a7bd; | 64 static const uint32_t kHashMul32 = 0x1e35a7bd; |
| 51 | 65 |
| 52 template<int kShiftBits> | 66 static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) { |
| 53 inline uint32_t Hash(const uint8_t *data) { | |
| 54 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32; | 67 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32; |
| 55 // The higher bits contain more mixture from the multiplication, | 68 /* The higher bits contain more mixture from the multiplication, |
| 56 // so we take our results from there. | 69 so we take our results from there. */ |
| 57 return h >> (32 - kShiftBits); | 70 return h >> (32 - 14); |
| 58 } | 71 } |
| 59 | 72 |
| 60 // Usually, we always choose the longest backward reference. This function | 73 #define BROTLI_LITERAL_BYTE_SCORE 540 |
| 61 // allows for the exception of that rule. | 74 #define BROTLI_DISTANCE_BIT_PENALTY 120 |
| 62 // | 75 /* Score must be positive after applying maximal penalty. */ |
| 63 // If we choose a backward reference that is further away, it will | 76 #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t)) |
| 64 // usually be coded with more bits. We approximate this by assuming | 77 |
| 65 // log2(distance). If the distance can be expressed in terms of the | 78 /* Usually, we always choose the longest backward reference. This function |
| 66 // last four distances, we use some heuristic constants to estimate | 79 allows for the exception of that rule. |
| 67 // the bits cost. For the first up to four literals we use the bit | 80 |
| 68 // cost of the literals from the literal cost model, after that we | 81 If we choose a backward reference that is further away, it will |
| 69 // use the average bit cost of the cost model. | 82 usually be coded with more bits. We approximate this by assuming |
| 70 // | 83 log2(distance). If the distance can be expressed in terms of the |
| 71 // This function is used to sometimes discard a longer backward reference | 84 last four distances, we use some heuristic constants to estimate |
| 72 // when it is not much longer and the bit cost for encoding it is more | 85 the bits cost. For the first up to four literals we use the bit |
| 73 // than the saved literals. | 86 cost of the literals from the literal cost model, after that we |
| 74 // | 87 use the average bit cost of the cost model. |
| 75 // backward_reference_offset MUST be positive. | 88 |
| 76 inline double BackwardReferenceScore(size_t copy_length, | 89 This function is used to sometimes discard a longer backward reference |
| 77 size_t backward_reference_offset) { | 90 when it is not much longer and the bit cost for encoding it is more |
| 78 return 5.4 * static_cast<double>(copy_length) - | 91 than the saved literals. |
| 79 1.20 * Log2FloorNonZero(backward_reference_offset); | 92 |
| 80 } | 93 backward_reference_offset MUST be positive. */ |
| 81 | 94 static BROTLI_INLINE score_t BackwardReferenceScore( |
| 82 inline double BackwardReferenceScoreUsingLastDistance(size_t copy_length, | 95 size_t copy_length, size_t backward_reference_offset) { |
| 83 size_t distance_short_code) { | 96 return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length - |
| 84 static const double kDistanceShortCodeBitCost[16] = { | 97 BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset); |
| 85 -0.6, 0.95, 1.17, 1.27, | 98 } |
| 86 0.93, 0.93, 0.96, 0.96, 0.99, 0.99, | 99 |
| 87 1.05, 1.05, 1.15, 1.15, 1.25, 1.25 | 100 static const score_t kDistanceShortCodeCost[BROTLI_NUM_DISTANCE_SHORT_CODES] = { |
| 88 }; | 101 /* Repeat last */ |
| 89 return 5.4 * static_cast<double>(copy_length) - | 102 BROTLI_SCORE_BASE + 60, |
| 90 kDistanceShortCodeBitCost[distance_short_code]; | 103 /* 2nd, 3rd, 4th last */ |
| 91 } | 104 BROTLI_SCORE_BASE - 95, |
| 92 | 105 BROTLI_SCORE_BASE - 117, |
| 93 struct BackwardMatch { | 106 BROTLI_SCORE_BASE - 127, |
| 94 BackwardMatch(void) : distance(0), length_and_code(0) {} | 107 /* Last with offset */ |
| 95 | 108 BROTLI_SCORE_BASE - 93, |
| 96 BackwardMatch(size_t dist, size_t len) | 109 BROTLI_SCORE_BASE - 93, |
| 97 : distance(static_cast<uint32_t>(dist)) | 110 BROTLI_SCORE_BASE - 96, |
| 98 , length_and_code(static_cast<uint32_t>(len << 5)) {} | 111 BROTLI_SCORE_BASE - 96, |
| 99 | 112 BROTLI_SCORE_BASE - 99, |
| 100 BackwardMatch(size_t dist, size_t len, size_t len_code) | 113 BROTLI_SCORE_BASE - 99, |
| 101 : distance(static_cast<uint32_t>(dist)) | 114 /* 2nd last with offset */ |
| 102 , length_and_code(static_cast<uint32_t>( | 115 BROTLI_SCORE_BASE - 105, |
| 103 (len << 5) | (len == len_code ? 0 : len_code))) {} | 116 BROTLI_SCORE_BASE - 105, |
| 104 | 117 BROTLI_SCORE_BASE - 115, |
| 105 size_t length(void) const { | 118 BROTLI_SCORE_BASE - 115, |
| 106 return length_and_code >> 5; | 119 BROTLI_SCORE_BASE - 125, |
| 107 } | 120 BROTLI_SCORE_BASE - 125 |
| 108 size_t length_code(void) const { | 121 }; |
| 109 size_t code = length_and_code & 31; | 122 |
| 110 return code ? code : length(); | 123 static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance( |
| 111 } | 124 size_t copy_length, size_t distance_short_code) { |
| 112 | 125 return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length + |
| 126 kDistanceShortCodeCost[distance_short_code]; |
| 127 } |
| 128 |
| 129 static BROTLI_INLINE void DictionarySearchStaticticsReset( |
| 130 DictionarySearchStatictics* self) { |
| 131 self->num_lookups = 0; |
| 132 self->num_matches = 0; |
| 133 } |
| 134 |
| 135 static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem( |
| 136 size_t item, const uint8_t* data, size_t max_length, size_t max_backward, |
| 137 HasherSearchResult* out) { |
| 138 size_t len; |
| 139 size_t dist; |
| 140 size_t offset; |
| 141 size_t matchlen; |
| 142 size_t backward; |
| 143 score_t score; |
| 144 len = item & 31; |
| 145 dist = item >> 5; |
| 146 offset = kBrotliDictionaryOffsetsByLength[len] + len * dist; |
| 147 if (len > max_length) { |
| 148 return BROTLI_FALSE; |
| 149 } |
| 150 |
| 151 matchlen = FindMatchLengthWithLimit(data, &kBrotliDictionary[offset], len); |
| 152 if (matchlen + kCutoffTransformsCount <= len || matchlen == 0) { |
| 153 return BROTLI_FALSE; |
| 154 } |
| 155 { |
| 156 size_t transform_id = kCutoffTransforms[len - matchlen]; |
| 157 backward = max_backward + dist + 1 + |
| 158 (transform_id << kBrotliDictionarySizeBitsByLength[len]); |
| 159 } |
| 160 score = BackwardReferenceScore(matchlen, backward); |
| 161 if (score < out->score) { |
| 162 return BROTLI_FALSE; |
| 163 } |
| 164 out->len = matchlen; |
| 165 out->len_x_code = len ^ matchlen; |
| 166 out->distance = backward; |
| 167 out->score = score; |
| 168 return BROTLI_TRUE; |
| 169 } |
| 170 |
| 171 static BROTLI_INLINE BROTLI_BOOL SearchInStaticDictionary( |
| 172 DictionarySearchStatictics* self, const uint8_t* data, size_t max_length, |
| 173 size_t max_backward, HasherSearchResult* out, BROTLI_BOOL shallow) { |
| 174 size_t key; |
| 175 size_t i; |
| 176 BROTLI_BOOL is_match_found = BROTLI_FALSE; |
| 177 if (self->num_matches < (self->num_lookups >> 7)) { |
| 178 return BROTLI_FALSE; |
| 179 } |
| 180 key = Hash14(data) << 1; |
| 181 for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) { |
| 182 size_t item = kStaticDictionaryHash[key]; |
| 183 self->num_lookups++; |
| 184 if (item != 0 && |
| 185 TestStaticDictionaryItem(item, data, max_length, max_backward, out)) { |
| 186 self->num_matches++; |
| 187 is_match_found = BROTLI_TRUE; |
| 188 } |
| 189 } |
| 190 return is_match_found; |
| 191 } |
| 192 |
| 193 typedef struct BackwardMatch { |
| 113 uint32_t distance; | 194 uint32_t distance; |
| 114 uint32_t length_and_code; | 195 uint32_t length_and_code; |
| 115 }; | 196 } BackwardMatch; |
| 116 | 197 |
| 117 // A (forgetful) hash table to the data seen by the compressor, to | 198 static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self, |
| 118 // help create backward references to previous data. | 199 size_t dist, size_t len) { |
| 119 // | 200 self->distance = (uint32_t)dist; |
| 120 // This is a hash map of fixed size (kBucketSize). Starting from the | 201 self->length_and_code = (uint32_t)(len << 5); |
| 121 // given index, kBucketSweep buckets are used to store values of a key. | 202 } |
| 122 template <int kBucketBits, int kBucketSweep, bool kUseDictionary> | 203 |
| 123 class HashLongestMatchQuickly { | 204 static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self, |
| 124 public: | 205 size_t dist, size_t len, size_t len_code) { |
| 125 HashLongestMatchQuickly(void) { | 206 self->distance = (uint32_t)dist; |
| 126 Reset(); | 207 self->length_and_code = |
| 127 } | 208 (uint32_t)((len << 5) | (len == len_code ? 0 : len_code)); |
| 128 void Reset(void) { | 209 } |
| 129 need_init_ = true; | 210 |
| 130 num_dict_lookups_ = 0; | 211 static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) { |
| 131 num_dict_matches_ = 0; | 212 return self->length_and_code >> 5; |
| 132 } | 213 } |
| 133 void Init(void) { | 214 |
| 134 if (need_init_) { | 215 static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) { |
| 135 // It is not strictly necessary to fill this buffer here, but | 216 size_t code = self->length_and_code & 31; |
| 136 // not filling will make the results of the compression stochastic | 217 return code ? code : BackwardMatchLength(self); |
| 137 // (but correct). This is because random data would cause the | 218 } |
| 138 // system to find accidentally good backward references here and there. | 219 |
| 139 memset(&buckets_[0], 0, sizeof(buckets_)); | 220 #define EXPAND_CAT(a, b) CAT(a, b) |
| 140 need_init_ = false; | 221 #define CAT(a, b) a ## b |
| 141 } | 222 #define FN(X) EXPAND_CAT(X, HASHER()) |
| 142 } | 223 |
| 143 void InitForData(const uint8_t* data, size_t num) { | 224 #define MAX_NUM_MATCHES_H10 (64 + MAX_TREE_SEARCH_DEPTH) |
| 144 for (size_t i = 0; i < num; ++i) { | 225 |
| 145 const uint32_t key = HashBytes(&data[i]); | 226 #define HASHER() H10 |
| 146 memset(&buckets_[key], 0, kBucketSweep * sizeof(buckets_[0])); | 227 #define HashToBinaryTree HASHER() |
| 147 need_init_ = false; | 228 |
| 148 } | 229 #define BUCKET_BITS 17 |
| 149 } | 230 #define BUCKET_SIZE (1 << BUCKET_BITS) |
| 150 // Look at 4 bytes at data. | 231 |
| 151 // Compute a hash from these, and store the value somewhere within | 232 static size_t FN(HashTypeLength)(void) { return 4; } |
| 152 // [ix .. ix+3]. | 233 static size_t FN(StoreLookahead)(void) { return MAX_TREE_COMP_LENGTH; } |
| 153 inline void Store(const uint8_t *data, const uint32_t ix) { | 234 |
| 154 const uint32_t key = HashBytes(data); | 235 static uint32_t FN(HashBytes)(const uint8_t *data) { |
| 155 // Wiggle the value with the bucket sweep range. | 236 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32; |
| 156 const uint32_t off = (ix >> 3) % kBucketSweep; | 237 /* The higher bits contain more mixture from the multiplication, |
| 157 buckets_[key + off] = ix; | 238 so we take our results from there. */ |
| 158 } | 239 return h >> (32 - BUCKET_BITS); |
| 159 | 240 } |
| 160 // Find a longest backward match of &ring_buffer[cur_ix & ring_buffer_mask] | 241 |
| 161 // up to the length of max_length and stores the position cur_ix in the | 242 /* A (forgetful) hash table where each hash bucket contains a binary tree of |
| 162 // hash table. | 243 sequences whose first 4 bytes share the same hash code. |
| 163 // | 244 Each sequence is MAX_TREE_COMP_LENGTH long and is identified by its starting |
| 164 // Does not look for matches longer than max_length. | 245 position in the input data. The binary tree is sorted by the lexicographic |
| 165 // Does not look for matches further away than max_backward. | 246 order of the sequences, and it is also a max-heap with respect to the |
| 166 // Writes the best found match length into best_len_out. | 247 starting positions. */ |
| 167 // Writes the index (&data[index]) of the start of the best match into | 248 typedef struct HashToBinaryTree { |
| 168 // best_distance_out. | 249 /* The window size minus 1 */ |
| 169 inline bool FindLongestMatch(const uint8_t * __restrict ring_buffer, | 250 size_t window_mask_; |
| 170 const size_t ring_buffer_mask, | 251 |
| 171 const int* __restrict distance_cache, | 252 /* Hash table that maps the 4-byte hashes of the sequence to the last |
| 172 const size_t cur_ix, | 253 position where this hash was found, which is the root of the binary |
| 173 const size_t max_length, | 254 tree of sequences that share this hash bucket. */ |
| 174 const size_t max_backward, | 255 uint32_t buckets_[BUCKET_SIZE]; |
| 175 size_t * __restrict best_len_out, | 256 |
| 176 size_t * __restrict best_len_code_out, | 257 /* The union of the binary trees of each hash bucket. The root of the tree |
| 177 size_t * __restrict best_distance_out, | 258 corresponding to a hash is a sequence starting at buckets_[hash] and |
| 178 double* __restrict best_score_out) { | 259 the left and right children of a sequence starting at pos are |
| 179 const size_t best_len_in = *best_len_out; | 260 forest_[2 * pos] and forest_[2 * pos + 1]. */ |
| 180 const size_t cur_ix_masked = cur_ix & ring_buffer_mask; | 261 uint32_t* forest_; |
| 181 const uint32_t key = HashBytes(&ring_buffer[cur_ix_masked]); | 262 |
| 182 int compare_char = ring_buffer[cur_ix_masked + best_len_in]; | 263 /* A position used to mark a non-existent sequence, i.e. a tree is empty if |
| 183 double best_score = *best_score_out; | 264 its root is at invalid_pos_ and a node is a leaf if both its children |
| 184 size_t best_len = best_len_in; | 265 are at invalid_pos_. */ |
| 185 size_t cached_backward = static_cast<size_t>(distance_cache[0]); | 266 uint32_t invalid_pos_; |
| 186 size_t prev_ix = cur_ix - cached_backward; | 267 |
| 187 bool match_found = false; | 268 size_t forest_size_; |
| 188 if (prev_ix < cur_ix) { | 269 BROTLI_BOOL is_dirty_; |
| 189 prev_ix &= static_cast<uint32_t>(ring_buffer_mask); | 270 } HashToBinaryTree; |
| 190 if (compare_char == ring_buffer[prev_ix + best_len]) { | 271 |
| 191 size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix], | 272 static void FN(Reset)(HashToBinaryTree* self) { |
| 192 &ring_buffer[cur_ix_masked], | 273 self->is_dirty_ = BROTLI_TRUE; |
| 193 max_length); | 274 } |
| 194 if (len >= 4) { | 275 |
| 195 best_score = BackwardReferenceScoreUsingLastDistance(len, 0); | 276 static void FN(Initialize)(HashToBinaryTree* self) { |
| 196 best_len = len; | 277 self->forest_ = NULL; |
| 197 *best_len_out = len; | 278 self->forest_size_ = 0; |
| 198 *best_len_code_out = len; | 279 FN(Reset)(self); |
| 199 *best_distance_out = cached_backward; | 280 } |
| 200 *best_score_out = best_score; | 281 |
| 201 compare_char = ring_buffer[cur_ix_masked + best_len]; | 282 static void FN(Cleanup)(MemoryManager* m, HashToBinaryTree* self) { |
| 202 if (kBucketSweep == 1) { | 283 BROTLI_FREE(m, self->forest_); |
| 203 buckets_[key] = static_cast<uint32_t>(cur_ix); | 284 } |
| 204 return true; | 285 |
| 205 } else { | 286 static void FN(Init)( |
| 206 match_found = true; | 287 MemoryManager* m, HashToBinaryTree* self, const uint8_t* data, |
| 207 } | 288 const BrotliEncoderParams* params, size_t position, size_t bytes, |
| 289 BROTLI_BOOL is_last) { |
| 290 if (self->is_dirty_) { |
| 291 uint32_t invalid_pos; |
| 292 size_t num_nodes; |
| 293 uint32_t i; |
| 294 BROTLI_UNUSED(data); |
| 295 self->window_mask_ = (1u << params->lgwin) - 1u; |
| 296 invalid_pos = (uint32_t)(0 - self->window_mask_); |
| 297 self->invalid_pos_ = invalid_pos; |
| 298 for (i = 0; i < BUCKET_SIZE; i++) { |
| 299 self->buckets_[i] = invalid_pos; |
| 300 } |
| 301 num_nodes = (position == 0 && is_last) ? bytes : self->window_mask_ + 1; |
| 302 if (num_nodes > self->forest_size_) { |
| 303 BROTLI_FREE(m, self->forest_); |
| 304 self->forest_ = BROTLI_ALLOC(m, uint32_t, 2 * num_nodes); |
| 305 if (BROTLI_IS_OOM(m)) return; |
| 306 self->forest_size_ = num_nodes; |
| 307 } |
| 308 self->is_dirty_ = BROTLI_FALSE; |
| 309 } |
| 310 } |
| 311 |
| 312 static BROTLI_INLINE size_t FN(LeftChildIndex)(HashToBinaryTree* self, |
| 313 const size_t pos) { |
| 314 return 2 * (pos & self->window_mask_); |
| 315 } |
| 316 |
| 317 static BROTLI_INLINE size_t FN(RightChildIndex)(HashToBinaryTree* self, |
| 318 const size_t pos) { |
| 319 return 2 * (pos & self->window_mask_) + 1; |
| 320 } |
| 321 |
| 322 /* Stores the hash of the next 4 bytes and in a single tree-traversal, the |
| 323 hash bucket's binary tree is searched for matches and is re-rooted at the |
| 324 current position. |
| 325 |
| 326 If less than MAX_TREE_COMP_LENGTH data is available, the hash bucket of the |
| 327 current position is searched for matches, but the state of the hash table |
| 328 is not changed, since we can not know the final sorting order of the |
| 329 current (incomplete) sequence. |
| 330 |
| 331 This function must be called with increasing cur_ix positions. */ |
| 332 static BROTLI_INLINE BackwardMatch* FN(StoreAndFindMatches)( |
| 333 HashToBinaryTree* self, const uint8_t* const BROTLI_RESTRICT data, |
| 334 const size_t cur_ix, const size_t ring_buffer_mask, const size_t max_length, |
| 335 const size_t max_backward, size_t* const BROTLI_RESTRICT best_len, |
| 336 BackwardMatch* BROTLI_RESTRICT matches) { |
| 337 const size_t cur_ix_masked = cur_ix & ring_buffer_mask; |
| 338 const size_t max_comp_len = |
| 339 BROTLI_MIN(size_t, max_length, MAX_TREE_COMP_LENGTH); |
| 340 const BROTLI_BOOL should_reroot_tree = |
| 341 TO_BROTLI_BOOL(max_length >= MAX_TREE_COMP_LENGTH); |
| 342 const uint32_t key = FN(HashBytes)(&data[cur_ix_masked]); |
| 343 size_t prev_ix = self->buckets_[key]; |
| 344 /* The forest index of the rightmost node of the left subtree of the new |
| 345 root, updated as we traverse and re-root the tree of the hash bucket. */ |
| 346 size_t node_left = FN(LeftChildIndex)(self, cur_ix); |
| 347 /* The forest index of the leftmost node of the right subtree of the new |
| 348 root, updated as we traverse and re-root the tree of the hash bucket. */ |
| 349 size_t node_right = FN(RightChildIndex)(self, cur_ix); |
| 350 /* The match length of the rightmost node of the left subtree of the new |
| 351 root, updated as we traverse and re-root the tree of the hash bucket. */ |
| 352 size_t best_len_left = 0; |
| 353 /* The match length of the leftmost node of the right subtree of the new |
| 354 root, updated as we traverse and re-root the tree of the hash bucket. */ |
| 355 size_t best_len_right = 0; |
| 356 size_t depth_remaining; |
| 357 if (should_reroot_tree) { |
| 358 self->buckets_[key] = (uint32_t)cur_ix; |
| 359 } |
| 360 for (depth_remaining = MAX_TREE_SEARCH_DEPTH; ; --depth_remaining) { |
| 361 const size_t backward = cur_ix - prev_ix; |
| 362 const size_t prev_ix_masked = prev_ix & ring_buffer_mask; |
| 363 if (backward == 0 || backward > max_backward || depth_remaining == 0) { |
| 364 if (should_reroot_tree) { |
| 365 self->forest_[node_left] = self->invalid_pos_; |
| 366 self->forest_[node_right] = self->invalid_pos_; |
| 367 } |
| 368 break; |
| 369 } |
| 370 { |
| 371 const size_t cur_len = BROTLI_MIN(size_t, best_len_left, best_len_right); |
| 372 size_t len; |
| 373 assert(cur_len <= MAX_TREE_COMP_LENGTH); |
| 374 len = cur_len + |
| 375 FindMatchLengthWithLimit(&data[cur_ix_masked + cur_len], |
| 376 &data[prev_ix_masked + cur_len], |
| 377 max_length - cur_len); |
| 378 assert(0 == memcmp(&data[cur_ix_masked], &data[prev_ix_masked], len)); |
| 379 if (matches && len > *best_len) { |
| 380 *best_len = len; |
| 381 InitBackwardMatch(matches++, backward, len); |
| 382 } |
| 383 if (len >= max_comp_len) { |
| 384 if (should_reroot_tree) { |
| 385 self->forest_[node_left] = |
| 386 self->forest_[FN(LeftChildIndex)(self, prev_ix)]; |
| 387 self->forest_[node_right] = |
| 388 self->forest_[FN(RightChildIndex)(self, prev_ix)]; |
| 208 } | 389 } |
| 209 } | |
| 210 } | |
| 211 if (kBucketSweep == 1) { | |
| 212 // Only one to look for, don't bother to prepare for a loop. | |
| 213 prev_ix = buckets_[key]; | |
| 214 buckets_[key] = static_cast<uint32_t>(cur_ix); | |
| 215 size_t backward = cur_ix - prev_ix; | |
| 216 prev_ix &= static_cast<uint32_t>(ring_buffer_mask); | |
| 217 if (compare_char != ring_buffer[prev_ix + best_len_in]) { | |
| 218 return false; | |
| 219 } | |
| 220 if (PREDICT_FALSE(backward == 0 || backward > max_backward)) { | |
| 221 return false; | |
| 222 } | |
| 223 const size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix], | |
| 224 &ring_buffer[cur_ix_masked], | |
| 225 max_length); | |
| 226 if (len >= 4) { | |
| 227 *best_len_out = len; | |
| 228 *best_len_code_out = len; | |
| 229 *best_distance_out = backward; | |
| 230 *best_score_out = BackwardReferenceScore(len, backward); | |
| 231 return true; | |
| 232 } | |
| 233 } else { | |
| 234 uint32_t *bucket = buckets_ + key; | |
| 235 prev_ix = *bucket++; | |
| 236 for (int i = 0; i < kBucketSweep; ++i, prev_ix = *bucket++) { | |
| 237 const size_t backward = cur_ix - prev_ix; | |
| 238 prev_ix &= static_cast<uint32_t>(ring_buffer_mask); | |
| 239 if (compare_char != ring_buffer[prev_ix + best_len]) { | |
| 240 continue; | |
| 241 } | |
| 242 if (PREDICT_FALSE(backward == 0 || backward > max_backward)) { | |
| 243 continue; | |
| 244 } | |
| 245 const size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix], | |
| 246 &ring_buffer[cur_ix_masked], | |
| 247 max_length); | |
| 248 if (len >= 4) { | |
| 249 const double score = BackwardReferenceScore(len, backward); | |
| 250 if (best_score < score) { | |
| 251 best_score = score; | |
| 252 best_len = len; | |
| 253 *best_len_out = best_len; | |
| 254 *best_len_code_out = best_len; | |
| 255 *best_distance_out = backward; | |
| 256 *best_score_out = score; | |
| 257 compare_char = ring_buffer[cur_ix_masked + best_len]; | |
| 258 match_found = true; | |
| 259 } | |
| 260 } | |
| 261 } | |
| 262 } | |
| 263 if (kUseDictionary && !match_found && | |
| 264 num_dict_matches_ >= (num_dict_lookups_ >> 7)) { | |
| 265 ++num_dict_lookups_; | |
| 266 const uint32_t dict_key = Hash<14>(&ring_buffer[cur_ix_masked]) << 1; | |
| 267 const uint16_t v = kStaticDictionaryHash[dict_key]; | |
| 268 if (v > 0) { | |
| 269 const uint32_t len = v & 31; | |
| 270 const uint32_t dist = v >> 5; | |
| 271 const size_t offset = | |
| 272 kBrotliDictionaryOffsetsByLength[len] + len * dist; | |
| 273 if (len <= max_length) { | |
| 274 const size_t matchlen = | |
| 275 FindMatchLengthWithLimit(&ring_buffer[cur_ix_masked], | |
| 276 &kBrotliDictionary[offset], len); | |
| 277 if (matchlen + kCutoffTransformsCount > len && matchlen > 0) { | |
| 278 const size_t transform_id = kCutoffTransforms[len - matchlen]; | |
| 279 const size_t word_id = | |
| 280 transform_id * (1u << kBrotliDictionarySizeBitsByLength[len]) + | |
| 281 dist; | |
| 282 const size_t backward = max_backward + word_id + 1; | |
| 283 const double score = BackwardReferenceScore(matchlen, backward); | |
| 284 if (best_score < score) { | |
| 285 ++num_dict_matches_; | |
| 286 best_score = score; | |
| 287 best_len = matchlen; | |
| 288 *best_len_out = best_len; | |
| 289 *best_len_code_out = len; | |
| 290 *best_distance_out = backward; | |
| 291 *best_score_out = best_score; | |
| 292 match_found = true; | |
| 293 } | |
| 294 } | |
| 295 } | |
| 296 } | |
| 297 } | |
| 298 const uint32_t off = (cur_ix >> 3) % kBucketSweep; | |
| 299 buckets_[key + off] = static_cast<uint32_t>(cur_ix); | |
| 300 return match_found; | |
| 301 } | |
| 302 | |
| 303 enum { kHashLength = 5 }; | |
| 304 enum { kHashTypeLength = 8 }; | |
| 305 // HashBytes is the function that chooses the bucket to place | |
| 306 // the address in. The HashLongestMatch and HashLongestMatchQuickly | |
| 307 // classes have separate, different implementations of hashing. | |
| 308 static uint32_t HashBytes(const uint8_t *data) { | |
| 309 // Computing a hash based on 5 bytes works much better for | |
| 310 // qualities 1 and 3, where the next hash value is likely to replace | |
| 311 uint64_t h = (BROTLI_UNALIGNED_LOAD64(data) << 24) * kHashMul32; | |
| 312 // The higher bits contain more mixture from the multiplication, | |
| 313 // so we take our results from there. | |
| 314 return static_cast<uint32_t>(h >> (64 - kBucketBits)); | |
| 315 } | |
| 316 | |
| 317 enum { kHashMapSize = 4 << kBucketBits }; | |
| 318 | |
| 319 private: | |
| 320 static const uint32_t kBucketSize = 1 << kBucketBits; | |
| 321 uint32_t buckets_[kBucketSize + kBucketSweep]; | |
| 322 // True if buckets_ array needs to be initialized. | |
| 323 bool need_init_; | |
| 324 size_t num_dict_lookups_; | |
| 325 size_t num_dict_matches_; | |
| 326 }; | |
| 327 | |
| 328 // A (forgetful) hash table to the data seen by the compressor, to | |
| 329 // help create backward references to previous data. | |
| 330 // | |
| 331 // This is a hash map of fixed size (kBucketSize) to a ring buffer of | |
| 332 // fixed size (kBlockSize). The ring buffer contains the last kBlockSize | |
| 333 // index positions of the given hash key in the compressed data. | |
| 334 template <int kBucketBits, | |
| 335 int kBlockBits, | |
| 336 int kNumLastDistancesToCheck> | |
| 337 class HashLongestMatch { | |
| 338 public: | |
| 339 HashLongestMatch(void) { | |
| 340 Reset(); | |
| 341 } | |
| 342 | |
| 343 void Reset(void) { | |
| 344 need_init_ = true; | |
| 345 num_dict_lookups_ = 0; | |
| 346 num_dict_matches_ = 0; | |
| 347 } | |
| 348 | |
| 349 void Init(void) { | |
| 350 if (need_init_) { | |
| 351 memset(&num_[0], 0, sizeof(num_)); | |
| 352 need_init_ = false; | |
| 353 } | |
| 354 } | |
| 355 | |
| 356 void InitForData(const uint8_t* data, size_t num) { | |
| 357 for (size_t i = 0; i < num; ++i) { | |
| 358 const uint32_t key = HashBytes(&data[i]); | |
| 359 num_[key] = 0; | |
| 360 need_init_ = false; | |
| 361 } | |
| 362 } | |
| 363 | |
| 364 // Look at 3 bytes at data. | |
| 365 // Compute a hash from these, and store the value of ix at that position. | |
| 366 inline void Store(const uint8_t *data, const uint32_t ix) { | |
| 367 const uint32_t key = HashBytes(data); | |
| 368 const int minor_ix = num_[key] & kBlockMask; | |
| 369 buckets_[key][minor_ix] = ix; | |
| 370 ++num_[key]; | |
| 371 } | |
| 372 | |
| 373 // Find a longest backward match of &data[cur_ix] up to the length of | |
| 374 // max_length and stores the position cur_ix in the hash table. | |
| 375 // | |
| 376 // Does not look for matches longer than max_length. | |
| 377 // Does not look for matches further away than max_backward. | |
| 378 // Writes the best found match length into best_len_out. | |
| 379 // Writes the index (&data[index]) offset from the start of the best match | |
| 380 // into best_distance_out. | |
| 381 // Write the score of the best match into best_score_out. | |
| 382 bool FindLongestMatch(const uint8_t * __restrict data, | |
| 383 const size_t ring_buffer_mask, | |
| 384 const int* __restrict distance_cache, | |
| 385 const size_t cur_ix, | |
| 386 const size_t max_length, | |
| 387 const size_t max_backward, | |
| 388 size_t * __restrict best_len_out, | |
| 389 size_t * __restrict best_len_code_out, | |
| 390 size_t * __restrict best_distance_out, | |
| 391 double * __restrict best_score_out) { | |
| 392 *best_len_code_out = 0; | |
| 393 const size_t cur_ix_masked = cur_ix & ring_buffer_mask; | |
| 394 bool match_found = false; | |
| 395 // Don't accept a short copy from far away. | |
| 396 double best_score = *best_score_out; | |
| 397 size_t best_len = *best_len_out; | |
| 398 *best_len_out = 0; | |
| 399 // Try last distance first. | |
| 400 for (size_t i = 0; i < kNumLastDistancesToCheck; ++i) { | |
| 401 const size_t idx = kDistanceCacheIndex[i]; | |
| 402 const size_t backward = | |
| 403 static_cast<size_t>(distance_cache[idx] + kDistanceCacheOffset[i]); | |
| 404 size_t prev_ix = static_cast<size_t>(cur_ix - backward); | |
| 405 if (prev_ix >= cur_ix) { | |
| 406 continue; | |
| 407 } | |
| 408 if (PREDICT_FALSE(backward > max_backward)) { | |
| 409 continue; | |
| 410 } | |
| 411 prev_ix &= ring_buffer_mask; | |
| 412 | |
| 413 if (cur_ix_masked + best_len > ring_buffer_mask || | |
| 414 prev_ix + best_len > ring_buffer_mask || | |
| 415 data[cur_ix_masked + best_len] != data[prev_ix + best_len]) { | |
| 416 continue; | |
| 417 } | |
| 418 const size_t len = FindMatchLengthWithLimit(&data[prev_ix], | |
| 419 &data[cur_ix_masked], | |
| 420 max_length); | |
| 421 if (len >= 3 || (len == 2 && i < 2)) { | |
| 422 // Comparing for >= 2 does not change the semantics, but just saves for | |
| 423 // a few unnecessary binary logarithms in backward reference score, | |
| 424 // since we are not interested in such short matches. | |
| 425 double score = BackwardReferenceScoreUsingLastDistance(len, i); | |
| 426 if (best_score < score) { | |
| 427 best_score = score; | |
| 428 best_len = len; | |
| 429 *best_len_out = best_len; | |
| 430 *best_len_code_out = best_len; | |
| 431 *best_distance_out = backward; | |
| 432 *best_score_out = best_score; | |
| 433 match_found = true; | |
| 434 } | |
| 435 } | |
| 436 } | |
| 437 const uint32_t key = HashBytes(&data[cur_ix_masked]); | |
| 438 const uint32_t * __restrict const bucket = &buckets_[key][0]; | |
| 439 const size_t down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0; | |
| 440 for (size_t i = num_[key]; i > down;) { | |
| 441 --i; | |
| 442 size_t prev_ix = bucket[i & kBlockMask]; | |
| 443 const size_t backward = cur_ix - prev_ix; | |
| 444 if (PREDICT_FALSE(backward == 0 || backward > max_backward)) { | |
| 445 break; | 390 break; |
| 446 } | 391 } |
| 447 prev_ix &= ring_buffer_mask; | 392 if (data[cur_ix_masked + len] > data[prev_ix_masked + len]) { |
| 448 if (cur_ix_masked + best_len > ring_buffer_mask || | 393 best_len_left = len; |
| 449 prev_ix + best_len > ring_buffer_mask || | 394 if (should_reroot_tree) { |
| 450 data[cur_ix_masked + best_len] != data[prev_ix + best_len]) { | 395 self->forest_[node_left] = (uint32_t)prev_ix; |
| 451 continue; | 396 } |
| 397 node_left = FN(RightChildIndex)(self, prev_ix); |
| 398 prev_ix = self->forest_[node_left]; |
| 399 } else { |
| 400 best_len_right = len; |
| 401 if (should_reroot_tree) { |
| 402 self->forest_[node_right] = (uint32_t)prev_ix; |
| 403 } |
| 404 node_right = FN(LeftChildIndex)(self, prev_ix); |
| 405 prev_ix = self->forest_[node_right]; |
| 452 } | 406 } |
| 453 const size_t len = FindMatchLengthWithLimit(&data[prev_ix], | 407 } |
| 454 &data[cur_ix_masked], | 408 } |
| 455 max_length); | 409 return matches; |
| 456 if (len >= 4) { | 410 } |
| 457 // Comparing for >= 3 does not change the semantics, but just saves | 411 |
| 458 // for a few unnecessary binary logarithms in backward reference | 412 /* Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the |
| 459 // score, since we are not interested in such short matches. | 413 length of max_length and stores the position cur_ix in the hash table. |
| 460 double score = BackwardReferenceScore(len, backward); | 414 |
| 461 if (best_score < score) { | 415 Sets *num_matches to the number of matches found, and stores the found |
| 462 best_score = score; | 416 matches in matches[0] to matches[*num_matches - 1]. The matches will be |
| 463 best_len = len; | 417 sorted by strictly increasing length and (non-strictly) increasing |
| 464 *best_len_out = best_len; | 418 distance. */ |
| 465 *best_len_code_out = best_len; | 419 static BROTLI_INLINE size_t FN(FindAllMatches)(HashToBinaryTree* self, |
| 466 *best_distance_out = backward; | 420 const uint8_t* data, const size_t ring_buffer_mask, const size_t cur_ix, |
| 467 *best_score_out = best_score; | 421 const size_t max_length, const size_t max_backward, |
| 468 match_found = true; | 422 const BrotliEncoderParams* params, BackwardMatch* matches) { |
| 469 } | 423 BackwardMatch* const orig_matches = matches; |
| 470 } | 424 const size_t cur_ix_masked = cur_ix & ring_buffer_mask; |
| 471 } | 425 size_t best_len = 1; |
| 472 buckets_[key][num_[key] & kBlockMask] = static_cast<uint32_t>(cur_ix); | 426 const size_t short_match_max_backward = |
| 473 ++num_[key]; | 427 params->quality != HQ_ZOPFLIFICATION_QUALITY ? 16 : 64; |
| 474 if (!match_found && num_dict_matches_ >= (num_dict_lookups_ >> 7)) { | 428 size_t stop = cur_ix - short_match_max_backward; |
| 475 size_t dict_key = Hash<14>(&data[cur_ix_masked]) << 1; | 429 uint32_t dict_matches[BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1]; |
| 476 for (int k = 0; k < 2; ++k, ++dict_key) { | 430 size_t i; |
| 477 ++num_dict_lookups_; | 431 if (cur_ix < short_match_max_backward) { stop = 0; } |
| 478 const uint16_t v = kStaticDictionaryHash[dict_key]; | 432 for (i = cur_ix - 1; i > stop && best_len <= 2; --i) { |
| 479 if (v > 0) { | 433 size_t prev_ix = i; |
| 480 const size_t len = v & 31; | 434 const size_t backward = cur_ix - prev_ix; |
| 481 const size_t dist = v >> 5; | 435 if (BROTLI_PREDICT_FALSE(backward > max_backward)) { |
| 482 const size_t offset = | 436 break; |
| 483 kBrotliDictionaryOffsetsByLength[len] + len * dist; | 437 } |
| 484 if (len <= max_length) { | 438 prev_ix &= ring_buffer_mask; |
| 485 const size_t matchlen = | 439 if (data[cur_ix_masked] != data[prev_ix] || |
| 486 FindMatchLengthWithLimit(&data[cur_ix_masked], | 440 data[cur_ix_masked + 1] != data[prev_ix + 1]) { |
| 487 &kBrotliDictionary[offset], len); | 441 continue; |
| 488 if (matchlen + kCutoffTransformsCount > len && matchlen > 0) { | 442 } |
| 489 const size_t transform_id = kCutoffTransforms[len - matchlen]; | 443 { |
| 490 const size_t word_id = | |
| 491 transform_id * (1 << kBrotliDictionarySizeBitsByLength[len]) + | |
| 492 dist; | |
| 493 const size_t backward = max_backward + word_id + 1; | |
| 494 double score = BackwardReferenceScore(matchlen, backward); | |
| 495 if (best_score < score) { | |
| 496 ++num_dict_matches_; | |
| 497 best_score = score; | |
| 498 best_len = matchlen; | |
| 499 *best_len_out = best_len; | |
| 500 *best_len_code_out = len; | |
| 501 *best_distance_out = backward; | |
| 502 *best_score_out = best_score; | |
| 503 match_found = true; | |
| 504 } | |
| 505 } | |
| 506 } | |
| 507 } | |
| 508 } | |
| 509 } | |
| 510 return match_found; | |
| 511 } | |
| 512 | |
| 513 // Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the | |
| 514 // length of max_length and stores the position cur_ix in the hash table. | |
| 515 // | |
| 516 // Sets *num_matches to the number of matches found, and stores the found | |
| 517 // matches in matches[0] to matches[*num_matches - 1]. The matches will be | |
| 518 // sorted by strictly increasing length and (non-strictly) increasing | |
| 519 // distance. | |
| 520 size_t FindAllMatches(const uint8_t* data, | |
| 521 const size_t ring_buffer_mask, | |
| 522 const size_t cur_ix, | |
| 523 const size_t max_length, | |
| 524 const size_t max_backward, | |
| 525 BackwardMatch* matches) { | |
| 526 BackwardMatch* const orig_matches = matches; | |
| 527 const size_t cur_ix_masked = cur_ix & ring_buffer_mask; | |
| 528 size_t best_len = 1; | |
| 529 size_t stop = cur_ix - 64; | |
| 530 if (cur_ix < 64) { stop = 0; } | |
| 531 for (size_t i = cur_ix - 1; i > stop && best_len <= 2; --i) { | |
| 532 size_t prev_ix = i; | |
| 533 const size_t backward = cur_ix - prev_ix; | |
| 534 if (PREDICT_FALSE(backward > max_backward)) { | |
| 535 break; | |
| 536 } | |
| 537 prev_ix &= ring_buffer_mask; | |
| 538 if (data[cur_ix_masked] != data[prev_ix] || | |
| 539 data[cur_ix_masked + 1] != data[prev_ix + 1]) { | |
| 540 continue; | |
| 541 } | |
| 542 const size_t len = | 444 const size_t len = |
| 543 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], | 445 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], |
| 544 max_length); | 446 max_length); |
| 545 if (len > best_len) { | 447 if (len > best_len) { |
| 546 best_len = len; | 448 best_len = len; |
| 547 *matches++ = BackwardMatch(backward, len); | 449 InitBackwardMatch(matches++, backward, len); |
| 548 } | 450 } |
| 549 } | 451 } |
| 550 const uint32_t key = HashBytes(&data[cur_ix_masked]); | 452 } |
| 551 const uint32_t * __restrict const bucket = &buckets_[key][0]; | 453 if (best_len < max_length) { |
| 552 const size_t down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0; | 454 matches = FN(StoreAndFindMatches)(self, data, cur_ix, ring_buffer_mask, |
| 553 for (size_t i = num_[key]; i > down;) { | 455 max_length, max_backward, &best_len, matches); |
| 554 --i; | 456 } |
| 555 size_t prev_ix = bucket[i & kBlockMask]; | 457 for (i = 0; i <= BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN; ++i) { |
| 556 const size_t backward = cur_ix - prev_ix; | 458 dict_matches[i] = kInvalidMatch; |
| 557 if (PREDICT_FALSE(backward == 0 || backward > max_backward)) { | 459 } |
| 558 break; | 460 { |
| 559 } | 461 size_t minlen = BROTLI_MAX(size_t, 4, best_len + 1); |
| 560 prev_ix &= ring_buffer_mask; | 462 if (BrotliFindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen, |
| 561 if (cur_ix_masked + best_len > ring_buffer_mask || | 463 max_length, &dict_matches[0])) { |
| 562 prev_ix + best_len > ring_buffer_mask || | 464 size_t maxlen = BROTLI_MIN( |
| 563 data[cur_ix_masked + best_len] != data[prev_ix + best_len]) { | 465 size_t, BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN, max_length); |
| 564 continue; | 466 size_t l; |
| 565 } | 467 for (l = minlen; l <= maxlen; ++l) { |
| 566 const size_t len = | |
| 567 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], | |
| 568 max_length); | |
| 569 if (len > best_len) { | |
| 570 best_len = len; | |
| 571 *matches++ = BackwardMatch(backward, len); | |
| 572 } | |
| 573 } | |
| 574 buckets_[key][num_[key] & kBlockMask] = static_cast<uint32_t>(cur_ix); | |
| 575 ++num_[key]; | |
| 576 uint32_t dict_matches[kMaxDictionaryMatchLen + 1]; | |
| 577 for (size_t i = 0; i <= kMaxDictionaryMatchLen; ++i) { | |
| 578 dict_matches[i] = kInvalidMatch; | |
| 579 } | |
| 580 size_t minlen = std::max<size_t>(4, best_len + 1); | |
| 581 if (FindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen, max_length, | |
| 582 &dict_matches[0])) { | |
| 583 size_t maxlen = std::min<size_t>(kMaxDictionaryMatchLen, max_length); | |
| 584 for (size_t l = minlen; l <= maxlen; ++l) { | |
| 585 uint32_t dict_id = dict_matches[l]; | 468 uint32_t dict_id = dict_matches[l]; |
| 586 if (dict_id < kInvalidMatch) { | 469 if (dict_id < kInvalidMatch) { |
| 587 *matches++ = BackwardMatch(max_backward + (dict_id >> 5) + 1, l, | 470 InitDictionaryBackwardMatch(matches++, |
| 588 dict_id & 31); | 471 max_backward + (dict_id >> 5) + 1, l, dict_id & 31); |
| 589 } | 472 } |
| 590 } | 473 } |
| 591 } | 474 } |
| 592 return static_cast<size_t>(matches - orig_matches); | 475 } |
| 593 } | 476 return (size_t)(matches - orig_matches); |
| 594 | 477 } |
| 595 enum { kHashLength = 4 }; | 478 |
| 596 enum { kHashTypeLength = 4 }; | 479 /* Stores the hash of the next 4 bytes and re-roots the binary tree at the |
| 597 | 480 current sequence, without returning any matches. |
| 598 // HashBytes is the function that chooses the bucket to place | 481 REQUIRES: ix + MAX_TREE_COMP_LENGTH <= end-of-current-block */ |
| 599 // the address in. The HashLongestMatch and HashLongestMatchQuickly | 482 static BROTLI_INLINE void FN(Store)(HashToBinaryTree* self, const uint8_t *data, |
| 600 // classes have separate, different implementations of hashing. | 483 const size_t mask, const size_t ix) { |
| 601 static uint32_t HashBytes(const uint8_t *data) { | 484 /* Maximum distance is window size - 16, see section 9.1. of the spec. */ |
| 602 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32; | 485 const size_t max_backward = self->window_mask_ - BROTLI_WINDOW_GAP + 1; |
| 603 // The higher bits contain more mixture from the multiplication, | 486 FN(StoreAndFindMatches)(self, data, ix, mask, MAX_TREE_COMP_LENGTH, |
| 604 // so we take our results from there. | 487 max_backward, NULL, NULL); |
| 605 return h >> (32 - kBucketBits); | 488 } |
| 606 } | 489 |
| 607 | 490 static BROTLI_INLINE void FN(StoreRange)(HashToBinaryTree* self, |
| 608 enum { kHashMapSize = 2 << kBucketBits }; | 491 const uint8_t *data, const size_t mask, const size_t ix_start, |
| 609 | 492 const size_t ix_end) { |
| 610 static const size_t kMaxNumMatches = 64 + (1 << kBlockBits); | 493 size_t i = ix_start; |
| 611 | 494 size_t j = ix_start; |
| 612 private: | 495 if (ix_start + 63 <= ix_end) { |
| 613 // Number of hash buckets. | 496 i = ix_end - 63; |
| 614 static const uint32_t kBucketSize = 1 << kBucketBits; | 497 } |
| 615 | 498 if (ix_start + 512 <= i) { |
| 616 // Only kBlockSize newest backward references are kept, | 499 for (; j < i; j += 8) { |
| 617 // and the older are forgotten. | 500 FN(Store)(self, data, mask, j); |
| 618 static const uint32_t kBlockSize = 1 << kBlockBits; | 501 } |
| 619 | 502 } |
| 620 // Mask for accessing entries in a block (in a ringbuffer manner). | 503 for (; i < ix_end; ++i) { |
| 621 static const uint32_t kBlockMask = (1 << kBlockBits) - 1; | 504 FN(Store)(self, data, mask, i); |
| 622 | 505 } |
| 623 // Number of entries in a particular bucket. | 506 } |
| 624 uint16_t num_[kBucketSize]; | 507 |
| 625 | 508 static BROTLI_INLINE void FN(StitchToPreviousBlock)(HashToBinaryTree* self, |
| 626 // Buckets containing kBlockSize of backward references. | 509 size_t num_bytes, size_t position, const uint8_t* ringbuffer, |
| 627 uint32_t buckets_[kBucketSize][kBlockSize]; | 510 size_t ringbuffer_mask) { |
| 628 | 511 if (num_bytes >= FN(HashTypeLength)() - 1 && |
| 629 // True if num_ array needs to be initialized. | 512 position >= MAX_TREE_COMP_LENGTH) { |
| 630 bool need_init_; | 513 /* Store the last `MAX_TREE_COMP_LENGTH - 1` positions in the hasher. |
| 631 | 514 These could not be calculated before, since they require knowledge |
| 632 size_t num_dict_lookups_; | 515 of both the previous and the current block. */ |
| 633 size_t num_dict_matches_; | 516 const size_t i_start = position - MAX_TREE_COMP_LENGTH + 1; |
| 634 }; | 517 const size_t i_end = BROTLI_MIN(size_t, position, i_start + num_bytes); |
| 635 | 518 size_t i; |
| 636 // A (forgetful) hash table where each hash bucket contains a binary tree of | 519 for (i = i_start; i < i_end; ++i) { |
| 637 // sequences whose first 4 bytes share the same hash code. | 520 /* Maximum distance is window size - 16, see section 9.1. of the spec. |
| 638 // Each sequence is kMaxTreeCompLength long and is identified by its starting | 521 Furthermore, we have to make sure that we don't look further back |
| 639 // position in the input data. The binary tree is sorted by the lexicographic | 522 from the start of the next block than the window size, otherwise we |
| 640 // order of the sequences, and it is also a max-heap with respect to the | 523 could access already overwritten areas of the ring-buffer. */ |
| 641 // starting positions. | 524 const size_t max_backward = |
| 642 class HashToBinaryTree { | 525 self->window_mask_ - BROTLI_MAX(size_t, |
| 643 public: | 526 BROTLI_WINDOW_GAP - 1, |
| 644 HashToBinaryTree() : forest_(NULL) { | 527 position - i); |
| 645 Reset(); | 528 /* We know that i + MAX_TREE_COMP_LENGTH <= position + num_bytes, i.e. the |
| 646 } | 529 end of the current block and that we have at least |
| 647 | 530 MAX_TREE_COMP_LENGTH tail in the ring-buffer. */ |
| 648 ~HashToBinaryTree() { | 531 FN(StoreAndFindMatches)(self, ringbuffer, i, ringbuffer_mask, |
| 649 delete[] forest_; | 532 MAX_TREE_COMP_LENGTH, max_backward, NULL, NULL); |
| 650 } | 533 } |
| 651 | 534 } |
| 652 void Reset() { | 535 } |
| 653 need_init_ = true; | 536 |
| 654 } | 537 #undef BUCKET_SIZE |
| 655 | 538 #undef BUCKET_BITS |
| 656 void Init(int lgwin, size_t position, size_t bytes, bool is_last) { | 539 |
| 657 if (need_init_) { | 540 #undef HASHER |
| 658 window_mask_ = (1u << lgwin) - 1u; | 541 |
| 659 invalid_pos_ = static_cast<uint32_t>(0 - window_mask_); | 542 /* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression |
| 660 for (uint32_t i = 0; i < kBucketSize; i++) { | 543 a little faster (0.5% - 1%) and it compresses 0.15% better on small text |
| 661 buckets_[i] = invalid_pos_; | 544 and HTML inputs. */ |
| 662 } | 545 |
| 663 size_t num_nodes = (position == 0 && is_last) ? bytes : window_mask_ + 1; | 546 #define HASHER() H2 |
| 664 forest_ = new uint32_t[2 * num_nodes]; | 547 #define BUCKET_BITS 16 |
| 665 need_init_ = false; | 548 #define BUCKET_SWEEP 1 |
| 666 } | 549 #define USE_DICTIONARY 1 |
| 667 } | 550 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ |
| 668 | 551 #undef BUCKET_SWEEP |
| 669 // Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the | 552 #undef USE_DICTIONARY |
| 670 // length of max_length and stores the position cur_ix in the hash table. | 553 #undef HASHER |
| 671 // | 554 |
| 672 // Sets *num_matches to the number of matches found, and stores the found | 555 #define HASHER() H3 |
| 673 // matches in matches[0] to matches[*num_matches - 1]. The matches will be | 556 #define BUCKET_SWEEP 2 |
| 674 // sorted by strictly increasing length and (non-strictly) increasing | 557 #define USE_DICTIONARY 0 |
| 675 // distance. | 558 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ |
| 676 size_t FindAllMatches(const uint8_t* data, | 559 #undef USE_DICTIONARY |
| 677 const size_t ring_buffer_mask, | 560 #undef BUCKET_SWEEP |
| 678 const size_t cur_ix, | 561 #undef BUCKET_BITS |
| 679 const size_t max_length, | 562 #undef HASHER |
| 680 const size_t max_backward, | 563 |
| 681 BackwardMatch* matches) { | 564 #define HASHER() H4 |
| 682 BackwardMatch* const orig_matches = matches; | 565 #define BUCKET_BITS 17 |
| 683 const size_t cur_ix_masked = cur_ix & ring_buffer_mask; | 566 #define BUCKET_SWEEP 4 |
| 684 size_t best_len = 1; | 567 #define USE_DICTIONARY 1 |
| 685 size_t stop = cur_ix - 64; | 568 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ |
| 686 if (cur_ix < 64) { stop = 0; } | 569 #undef USE_DICTIONARY |
| 687 for (size_t i = cur_ix - 1; i > stop && best_len <= 2; --i) { | 570 #undef BUCKET_SWEEP |
| 688 size_t prev_ix = i; | 571 #undef BUCKET_BITS |
| 689 const size_t backward = cur_ix - prev_ix; | 572 #undef HASHER |
| 690 if (PREDICT_FALSE(backward > max_backward)) { | 573 |
| 691 break; | 574 #define HASHER() H5 |
| 692 } | 575 #define BUCKET_BITS 14 |
| 693 prev_ix &= ring_buffer_mask; | 576 #define BLOCK_BITS 4 |
| 694 if (data[cur_ix_masked] != data[prev_ix] || | 577 #define NUM_LAST_DISTANCES_TO_CHECK 4 |
| 695 data[cur_ix_masked + 1] != data[prev_ix + 1]) { | 578 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */ |
| 696 continue; | 579 #undef BLOCK_BITS |
| 697 } | 580 #undef HASHER |
| 698 const size_t len = | 581 |
| 699 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], | 582 #define HASHER() H6 |
| 700 max_length); | 583 #define BLOCK_BITS 5 |
| 701 if (len > best_len) { | 584 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */ |
| 702 best_len = len; | 585 #undef NUM_LAST_DISTANCES_TO_CHECK |
| 703 *matches++ = BackwardMatch(backward, len); | 586 #undef BLOCK_BITS |
| 704 } | 587 #undef BUCKET_BITS |
| 705 } | 588 #undef HASHER |
| 706 if (best_len < max_length) { | 589 |
| 707 matches = StoreAndFindMatches(data, cur_ix, ring_buffer_mask, | 590 #define HASHER() H7 |
| 708 max_length, &best_len, matches); | 591 #define BUCKET_BITS 15 |
| 709 } | 592 #define BLOCK_BITS 6 |
| 710 uint32_t dict_matches[kMaxDictionaryMatchLen + 1]; | 593 #define NUM_LAST_DISTANCES_TO_CHECK 10 |
| 711 for (size_t i = 0; i <= kMaxDictionaryMatchLen; ++i) { | 594 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */ |
| 712 dict_matches[i] = kInvalidMatch; | 595 #undef BLOCK_BITS |
| 713 } | 596 #undef HASHER |
| 714 size_t minlen = std::max<size_t>(4, best_len + 1); | 597 |
| 715 if (FindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen, max_length, | 598 #define HASHER() H8 |
| 716 &dict_matches[0])) { | 599 #define BLOCK_BITS 7 |
| 717 size_t maxlen = std::min<size_t>(kMaxDictionaryMatchLen, max_length); | 600 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */ |
| 718 for (size_t l = minlen; l <= maxlen; ++l) { | 601 #undef NUM_LAST_DISTANCES_TO_CHECK |
| 719 uint32_t dict_id = dict_matches[l]; | 602 #undef BLOCK_BITS |
| 720 if (dict_id < kInvalidMatch) { | 603 #undef HASHER |
| 721 *matches++ = BackwardMatch(max_backward + (dict_id >> 5) + 1, l, | 604 |
| 722 dict_id & 31); | 605 #define HASHER() H9 |
| 723 } | 606 #define BLOCK_BITS 8 |
| 724 } | 607 #define NUM_LAST_DISTANCES_TO_CHECK 16 |
| 725 } | 608 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */ |
| 726 return static_cast<size_t>(matches - orig_matches); | 609 #undef NUM_LAST_DISTANCES_TO_CHECK |
| 727 } | 610 #undef BLOCK_BITS |
| 728 | 611 #undef BUCKET_BITS |
| 729 // Stores the hash of the next 4 bytes and re-roots the binary tree at the | 612 #undef HASHER |
| 730 // current sequence, without returning any matches. | 613 |
| 731 // REQUIRES: cur_ix + kMaxTreeCompLength <= end-of-current-block | 614 #define BUCKET_BITS 15 |
| 732 void Store(const uint8_t* data, | 615 |
| 733 const size_t ring_buffer_mask, | 616 #define NUM_LAST_DISTANCES_TO_CHECK 4 |
| 734 const size_t cur_ix) { | 617 #define NUM_BANKS 1 |
| 735 size_t best_len = 0; | 618 #define BANK_BITS 16 |
| 736 StoreAndFindMatches(data, cur_ix, ring_buffer_mask, kMaxTreeCompLength, | 619 #define HASHER() H40 |
| 737 &best_len, NULL); | 620 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */ |
| 738 } | 621 #undef HASHER |
| 739 | 622 #undef NUM_LAST_DISTANCES_TO_CHECK |
| 740 void StitchToPreviousBlock(size_t num_bytes, | 623 |
| 741 size_t position, | 624 #define NUM_LAST_DISTANCES_TO_CHECK 10 |
| 742 const uint8_t* ringbuffer, | 625 #define HASHER() H41 |
| 743 size_t ringbuffer_mask) { | 626 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */ |
| 744 if (num_bytes >= 3 && position >= kMaxTreeCompLength) { | 627 #undef HASHER |
| 745 // Store the last `kMaxTreeCompLength - 1` positions in the hasher. | 628 #undef NUM_LAST_DISTANCES_TO_CHECK |
| 746 // These could not be calculated before, since they require knowledge | 629 #undef NUM_BANKS |
| 747 // of both the previous and the current block. | 630 #undef BANK_BITS |
| 748 const size_t i_start = position - kMaxTreeCompLength + 1; | 631 |
| 749 const size_t i_end = std::min(position, i_start + num_bytes); | 632 #define NUM_LAST_DISTANCES_TO_CHECK 16 |
| 750 for (size_t i = i_start; i < i_end; ++i) { | 633 #define NUM_BANKS 512 |
| 751 // We know that i + kMaxTreeCompLength <= position + num_bytes, i.e. the | 634 #define BANK_BITS 9 |
| 752 // end of the current block and that we have at least | 635 #define HASHER() H42 |
| 753 // kMaxTreeCompLength tail in the ringbuffer. | 636 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */ |
| 754 Store(ringbuffer, ringbuffer_mask, i); | 637 #undef HASHER |
| 755 } | 638 #undef NUM_LAST_DISTANCES_TO_CHECK |
| 756 } | 639 #undef NUM_BANKS |
| 757 } | 640 #undef BANK_BITS |
| 758 | 641 |
| 759 static const size_t kMaxNumMatches = 64 + kMaxTreeSearchDepth; | 642 #undef BUCKET_BITS |
| 760 | 643 |
| 761 private: | 644 #undef FN |
| 762 // Stores the hash of the next 4 bytes and in a single tree-traversal, the | 645 #undef CAT |
| 763 // hash bucket's binary tree is searched for matches and is re-rooted at the | 646 #undef EXPAND_CAT |
| 764 // current position. | 647 |
| 765 // | 648 #define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(7) H(8) H(9) \ |
| 766 // If less than kMaxTreeCompLength data is available, the hash bucket of the | 649 H(40) H(41) H(42) |
| 767 // current position is searched for matches, but the state of the hash table | 650 #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10) |
| 768 // is not changed, since we can not know the final sorting order of the | 651 |
| 769 // current (incomplete) sequence. | 652 typedef struct Hashers { |
| 770 // | 653 #define MEMBER_(N) H ## N* h ## N; |
| 771 // This function must be called with increasing cur_ix positions. | 654 FOR_ALL_HASHERS(MEMBER_) |
| 772 BackwardMatch* StoreAndFindMatches(const uint8_t* const __restrict data, | 655 #undef MEMBER_ |
| 773 const size_t cur_ix, | 656 } Hashers; |
| 774 const size_t ring_buffer_mask, | 657 |
| 775 const size_t max_length, | 658 static BROTLI_INLINE void InitHashers(Hashers* self) { |
| 776 size_t* const __restrict best_len, | 659 #define INIT_(N) self->h ## N = 0; |
| 777 BackwardMatch* __restrict matches) { | 660 FOR_ALL_HASHERS(INIT_) |
| 778 const size_t cur_ix_masked = cur_ix & ring_buffer_mask; | 661 #undef INIT_ |
| 779 const size_t max_backward = window_mask_ - 15; | 662 } |
| 780 const size_t max_comp_len = std::min(max_length, kMaxTreeCompLength); | 663 |
| 781 const bool reroot_tree = max_length >= kMaxTreeCompLength; | 664 static BROTLI_INLINE void DestroyHashers(MemoryManager* m, Hashers* self) { |
| 782 const uint32_t key = HashBytes(&data[cur_ix_masked]); | 665 if (self->h10) CleanupH10(m, self->h10); |
| 783 size_t prev_ix = buckets_[key]; | 666 #define CLEANUP_(N) BROTLI_FREE(m, self->h ## N) |
| 784 // The forest index of the rightmost node of the left subtree of the new | 667 FOR_ALL_HASHERS(CLEANUP_) |
| 785 // root, updated as we traverse and reroot the tree of the hash bucket. | 668 #undef CLEANUP_ |
| 786 size_t node_left = LeftChildIndex(cur_ix); | 669 } |
| 787 // The forest index of the leftmost node of the right subtree of the new | 670 |
| 788 // root, updated as we traverse and reroot the tree of the hash bucket. | 671 static BROTLI_INLINE void HashersReset(Hashers* self, int type) { |
| 789 size_t node_right = RightChildIndex(cur_ix); | 672 switch (type) { |
| 790 // The match length of the rightmost node of the left subtree of the new | 673 #define RESET_(N) case N: ResetH ## N(self->h ## N); break; |
| 791 // root, updated as we traverse and reroot the tree of the hash bucket. | 674 FOR_ALL_HASHERS(RESET_) |
| 792 size_t best_len_left = 0; | 675 #undef RESET_ |
| 793 // The match length of the leftmost node of the right subtree of the new | 676 default: break; |
| 794 // root, updated as we traverse and reroot the tree of the hash bucket. | 677 } |
| 795 size_t best_len_right = 0; | 678 } |
| 796 if (reroot_tree) { | 679 |
| 797 buckets_[key] = static_cast<uint32_t>(cur_ix); | 680 static BROTLI_INLINE void HashersSetup( |
| 798 } | 681 MemoryManager* m, Hashers* self, int type) { |
| 799 for (size_t depth_remaining = kMaxTreeSearchDepth; ; --depth_remaining) { | 682 switch (type) { |
| 800 const size_t backward = cur_ix - prev_ix; | 683 #define SETUP_(N) case N: self->h ## N = BROTLI_ALLOC(m, H ## N, 1); break; |
| 801 const size_t prev_ix_masked = prev_ix & ring_buffer_mask; | 684 FOR_ALL_HASHERS(SETUP_) |
| 802 if (backward == 0 || backward > max_backward || depth_remaining == 0) { | 685 #undef SETUP_ |
| 803 if (reroot_tree) { | 686 default: break; |
| 804 forest_[node_left] = invalid_pos_; | 687 } |
| 805 forest_[node_right] = invalid_pos_; | 688 if (BROTLI_IS_OOM(m)) return; |
| 806 } | 689 if (type == 10) InitializeH10(self->h10); |
| 807 break; | 690 HashersReset(self, type); |
| 808 } | 691 } |
| 809 const size_t cur_len = std::min(best_len_left, best_len_right); | 692 |
| 810 const size_t len = cur_len + | 693 #define WARMUP_HASH_(N) \ |
| 811 FindMatchLengthWithLimit(&data[cur_ix_masked + cur_len], | 694 static BROTLI_INLINE void WarmupHashH ## N(MemoryManager* m, \ |
| 812 &data[prev_ix_masked + cur_len], | 695 const BrotliEncoderParams* params, const size_t size, const uint8_t* dict, \ |
| 813 max_length - cur_len); | 696 H ## N* hasher) { \ |
| 814 if (len > *best_len) { | 697 size_t overlap = (StoreLookaheadH ## N()) - 1; \ |
| 815 *best_len = len; | 698 size_t i; \ |
| 816 if (matches) { | 699 InitH ## N(m, hasher, dict, params, 0, size, BROTLI_FALSE); \ |
| 817 *matches++ = BackwardMatch(backward, len); | 700 if (BROTLI_IS_OOM(m)) return; \ |
| 818 } | 701 for (i = 0; i + overlap < size; i++) { \ |
| 819 if (len >= max_comp_len) { | 702 StoreH ## N(hasher, dict, ~(size_t)0, i); \ |
| 820 if (reroot_tree) { | 703 } \ |
| 821 forest_[node_left] = forest_[LeftChildIndex(prev_ix)]; | 704 } |
| 822 forest_[node_right] = forest_[RightChildIndex(prev_ix)]; | 705 FOR_ALL_HASHERS(WARMUP_HASH_) |
| 823 } | 706 #undef WARMUP_HASH_ |
| 824 break; | 707 |
| 825 } | 708 /* Custom LZ77 window. */ |
| 826 } | 709 static BROTLI_INLINE void HashersPrependCustomDictionary( |
| 827 if (data[cur_ix_masked + len] > data[prev_ix_masked + len]) { | 710 MemoryManager* m, Hashers* self, const BrotliEncoderParams* params, |
| 828 best_len_left = len; | 711 const size_t size, const uint8_t* dict) { |
| 829 if (reroot_tree) { | 712 int hasher_type = ChooseHasher(params); |
| 830 forest_[node_left] = static_cast<uint32_t>(prev_ix); | 713 switch (hasher_type) { |
| 831 } | 714 #define PREPEND_(N) \ |
| 832 node_left = RightChildIndex(prev_ix); | 715 case N: WarmupHashH ## N(m, params, size, dict, self->h ## N); break; |
| 833 prev_ix = forest_[node_left]; | 716 FOR_ALL_HASHERS(PREPEND_) |
| 834 } else { | 717 #undef PREPEND_ |
| 835 best_len_right = len; | 718 default: break; |
| 836 if (reroot_tree) { | 719 } |
| 837 forest_[node_right] = static_cast<uint32_t>(prev_ix); | 720 if (BROTLI_IS_OOM(m)) return; |
| 838 } | 721 } |
| 839 node_right = LeftChildIndex(prev_ix); | 722 |
| 840 prev_ix = forest_[node_right]; | 723 |
| 841 } | 724 #if defined(__cplusplus) || defined(c_plusplus) |
| 842 } | 725 } /* extern "C" */ |
| 843 return matches; | 726 #endif |
| 844 } | 727 |
| 845 | 728 #endif /* BROTLI_ENC_HASH_H_ */ |
| 846 inline size_t LeftChildIndex(const size_t pos) { | |
| 847 return 2 * (pos & window_mask_); | |
| 848 } | |
| 849 | |
| 850 inline size_t RightChildIndex(const size_t pos) { | |
| 851 return 2 * (pos & window_mask_) + 1; | |
| 852 } | |
| 853 | |
| 854 static uint32_t HashBytes(const uint8_t *data) { | |
| 855 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32; | |
| 856 // The higher bits contain more mixture from the multiplication, | |
| 857 // so we take our results from there. | |
| 858 return h >> (32 - kBucketBits); | |
| 859 } | |
| 860 | |
| 861 static const int kBucketBits = 17; | |
| 862 static const size_t kBucketSize = 1 << kBucketBits; | |
| 863 | |
| 864 // The window size minus 1 | |
| 865 size_t window_mask_; | |
| 866 | |
| 867 // Hash table that maps the 4-byte hashes of the sequence to the last | |
| 868 // position where this hash was found, which is the root of the binary | |
| 869 // tree of sequences that share this hash bucket. | |
| 870 uint32_t buckets_[kBucketSize]; | |
| 871 | |
| 872 // The union of the binary trees of each hash bucket. The root of the tree | |
| 873 // corresponding to a hash is a sequence starting at buckets_[hash] and | |
| 874 // the left and right children of a sequence starting at pos are | |
| 875 // forest_[2 * pos] and forest_[2 * pos + 1]. | |
| 876 uint32_t* forest_; | |
| 877 | |
| 878 // A position used to mark a non-existent sequence, i.e. a tree is empty if | |
| 879 // its root is at invalid_pos_ and a node is a leaf if both its children | |
| 880 // are at invalid_pos_. | |
| 881 uint32_t invalid_pos_; | |
| 882 | |
| 883 bool need_init_; | |
| 884 }; | |
| 885 | |
| 886 struct Hashers { | |
| 887 // For kBucketSweep == 1, enabling the dictionary lookup makes compression | |
| 888 // a little faster (0.5% - 1%) and it compresses 0.15% better on small text | |
| 889 // and html inputs. | |
| 890 typedef HashLongestMatchQuickly<16, 1, true> H2; | |
| 891 typedef HashLongestMatchQuickly<16, 2, false> H3; | |
| 892 typedef HashLongestMatchQuickly<17, 4, true> H4; | |
| 893 typedef HashLongestMatch<14, 4, 4> H5; | |
| 894 typedef HashLongestMatch<14, 5, 4> H6; | |
| 895 typedef HashLongestMatch<15, 6, 10> H7; | |
| 896 typedef HashLongestMatch<15, 7, 10> H8; | |
| 897 typedef HashLongestMatch<15, 8, 16> H9; | |
| 898 typedef HashToBinaryTree H10; | |
| 899 | |
| 900 Hashers(void) : hash_h2(0), hash_h3(0), hash_h4(0), hash_h5(0), | |
| 901 hash_h6(0), hash_h7(0), hash_h8(0), hash_h9(0), hash_h10(0) {} | |
| 902 | |
| 903 ~Hashers(void) { | |
| 904 delete hash_h2; | |
| 905 delete hash_h3; | |
| 906 delete hash_h4; | |
| 907 delete hash_h5; | |
| 908 delete hash_h6; | |
| 909 delete hash_h7; | |
| 910 delete hash_h8; | |
| 911 delete hash_h9; | |
| 912 delete hash_h10; | |
| 913 } | |
| 914 | |
| 915 void Init(int type) { | |
| 916 switch (type) { | |
| 917 case 2: hash_h2 = new H2; break; | |
| 918 case 3: hash_h3 = new H3; break; | |
| 919 case 4: hash_h4 = new H4; break; | |
| 920 case 5: hash_h5 = new H5; break; | |
| 921 case 6: hash_h6 = new H6; break; | |
| 922 case 7: hash_h7 = new H7; break; | |
| 923 case 8: hash_h8 = new H8; break; | |
| 924 case 9: hash_h9 = new H9; break; | |
| 925 case 10: hash_h10 = new H10; break; | |
| 926 default: break; | |
| 927 } | |
| 928 } | |
| 929 | |
| 930 template<typename Hasher> | |
| 931 void WarmupHash(const size_t size, const uint8_t* dict, Hasher* hasher) { | |
| 932 hasher->Init(); | |
| 933 for (size_t i = 0; i + Hasher::kHashTypeLength - 1 < size; i++) { | |
| 934 hasher->Store(&dict[i], static_cast<uint32_t>(i)); | |
| 935 } | |
| 936 } | |
| 937 | |
| 938 // Custom LZ77 window. | |
| 939 void PrependCustomDictionary( | |
| 940 int type, int lgwin, const size_t size, const uint8_t* dict) { | |
| 941 switch (type) { | |
| 942 case 2: WarmupHash(size, dict, hash_h2); break; | |
| 943 case 3: WarmupHash(size, dict, hash_h3); break; | |
| 944 case 4: WarmupHash(size, dict, hash_h4); break; | |
| 945 case 5: WarmupHash(size, dict, hash_h5); break; | |
| 946 case 6: WarmupHash(size, dict, hash_h6); break; | |
| 947 case 7: WarmupHash(size, dict, hash_h7); break; | |
| 948 case 8: WarmupHash(size, dict, hash_h8); break; | |
| 949 case 9: WarmupHash(size, dict, hash_h9); break; | |
| 950 case 10: | |
| 951 hash_h10->Init(lgwin, 0, size, false); | |
| 952 for (size_t i = 0; i + kMaxTreeCompLength - 1 < size; ++i) { | |
| 953 hash_h10->Store(dict, std::numeric_limits<size_t>::max(), i); | |
| 954 } | |
| 955 break; | |
| 956 default: break; | |
| 957 } | |
| 958 } | |
| 959 | |
| 960 | |
| 961 H2* hash_h2; | |
| 962 H3* hash_h3; | |
| 963 H4* hash_h4; | |
| 964 H5* hash_h5; | |
| 965 H6* hash_h6; | |
| 966 H7* hash_h7; | |
| 967 H8* hash_h8; | |
| 968 H9* hash_h9; | |
| 969 H10* hash_h10; | |
| 970 }; | |
| 971 | |
| 972 } // namespace brotli | |
| 973 | |
| 974 #endif // BROTLI_ENC_HASH_H_ | |
| OLD | NEW |