| OLD | NEW |
| (Empty) | |
| 1 /* Copyright 2010 Google Inc. All Rights Reserved. |
| 2 |
| 3 Distributed under MIT license. |
| 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
| 5 */ |
| 6 |
| 7 // A (forgetful) hash table to the data seen by the compressor, to |
| 8 // help create backward references to previous data. |
| 9 |
| 10 #ifndef BROTLI_ENC_HASH_H_ |
| 11 #define BROTLI_ENC_HASH_H_ |
| 12 |
| 13 #include <sys/types.h> |
| 14 #include <algorithm> |
| 15 #include <cstring> |
| 16 #include <limits> |
| 17 |
| 18 #include "./dictionary_hash.h" |
| 19 #include "./fast_log.h" |
| 20 #include "./find_match_length.h" |
| 21 #include "./port.h" |
| 22 #include "./prefix.h" |
| 23 #include "./static_dict.h" |
| 24 #include "./transform.h" |
| 25 #include "./types.h" |
| 26 |
| 27 namespace brotli { |
| 28 |
// Limits for the binary-tree hasher (HashToBinaryTree): maximum number of
// tree nodes visited per lookup, and maximum compared sequence length.
static const size_t kMaxTreeSearchDepth = 64;
static const size_t kMaxTreeCompLength = 128;

// For each of the 16 distance short codes: the index into the last-distances
// cache and the delta to add to that cached distance.
static const uint32_t kDistanceCacheIndex[] = {
  0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
};
static const int kDistanceCacheOffset[] = {
  0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
};

// Transform ids used when a static dictionary word is matched only
// partially; kCutoffTransforms[i] is the transform for a match that is
// i bytes shorter than the dictionary word.
static const uint32_t kCutoffTransformsCount = 10;
static const uint8_t kCutoffTransforms[] = {
  0, 12, 27, 23, 42, 63, 56, 48, 59, 64
};

// kHashMul32 multiplier has these properties:
// * The multiplier must be odd. Otherwise we may lose the highest bit.
// * No long streaks of 1s or 0s.
// * There is no effort to ensure that it is a prime, the oddity is enough
//   for this use.
// * The number has been tuned heuristically against compression benchmarks.
static const uint32_t kHashMul32 = 0x1e35a7bd;
| 51 |
| 52 template<int kShiftBits> |
| 53 inline uint32_t Hash(const uint8_t *data) { |
| 54 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32; |
| 55 // The higher bits contain more mixture from the multiplication, |
| 56 // so we take our results from there. |
| 57 return h >> (32 - kShiftBits); |
| 58 } |
| 59 |
| 60 // Usually, we always choose the longest backward reference. This function |
| 61 // allows for the exception of that rule. |
| 62 // |
| 63 // If we choose a backward reference that is further away, it will |
| 64 // usually be coded with more bits. We approximate this by assuming |
| 65 // log2(distance). If the distance can be expressed in terms of the |
| 66 // last four distances, we use some heuristic constants to estimate |
| 67 // the bits cost. For the first up to four literals we use the bit |
| 68 // cost of the literals from the literal cost model, after that we |
| 69 // use the average bit cost of the cost model. |
| 70 // |
| 71 // This function is used to sometimes discard a longer backward reference |
| 72 // when it is not much longer and the bit cost for encoding it is more |
| 73 // than the saved literals. |
| 74 // |
| 75 // backward_reference_offset MUST be positive. |
| 76 inline double BackwardReferenceScore(size_t copy_length, |
| 77 size_t backward_reference_offset) { |
| 78 return 5.4 * static_cast<double>(copy_length) - |
| 79 1.20 * Log2FloorNonZero(backward_reference_offset); |
| 80 } |
| 81 |
| 82 inline double BackwardReferenceScoreUsingLastDistance(size_t copy_length, |
| 83 size_t distance_short_code) { |
| 84 static const double kDistanceShortCodeBitCost[16] = { |
| 85 -0.6, 0.95, 1.17, 1.27, |
| 86 0.93, 0.93, 0.96, 0.96, 0.99, 0.99, |
| 87 1.05, 1.05, 1.15, 1.15, 1.25, 1.25 |
| 88 }; |
| 89 return 5.4 * static_cast<double>(copy_length) - |
| 90 kDistanceShortCodeBitCost[distance_short_code]; |
| 91 } |
| 92 |
| 93 struct BackwardMatch { |
| 94 BackwardMatch(void) : distance(0), length_and_code(0) {} |
| 95 |
| 96 BackwardMatch(size_t dist, size_t len) |
| 97 : distance(static_cast<uint32_t>(dist)) |
| 98 , length_and_code(static_cast<uint32_t>(len << 5)) {} |
| 99 |
| 100 BackwardMatch(size_t dist, size_t len, size_t len_code) |
| 101 : distance(static_cast<uint32_t>(dist)) |
| 102 , length_and_code(static_cast<uint32_t>( |
| 103 (len << 5) | (len == len_code ? 0 : len_code))) {} |
| 104 |
| 105 size_t length(void) const { |
| 106 return length_and_code >> 5; |
| 107 } |
| 108 size_t length_code(void) const { |
| 109 size_t code = length_and_code & 31; |
| 110 return code ? code : length(); |
| 111 } |
| 112 |
| 113 uint32_t distance; |
| 114 uint32_t length_and_code; |
| 115 }; |
| 116 |
// A (forgetful) hash table to the data seen by the compressor, to
// help create backward references to previous data.
//
// This is a hash map of fixed size (kBucketSize). Starting from the
// given index, kBucketSweep buckets are used to store values of a key.
template <int kBucketBits, int kBucketSweep, bool kUseDictionary>
class HashLongestMatchQuickly {
 public:
  HashLongestMatchQuickly(void) {
    Reset();
  }
  // Marks the bucket array as requiring initialization and resets the
  // static dictionary lookup statistics.
  void Reset(void) {
    need_init_ = true;
    num_dict_lookups_ = 0;
    num_dict_matches_ = 0;
  }
  void Init(void) {
    if (need_init_) {
      // It is not strictly necessary to fill this buffer here, but
      // not filling will make the results of the compression stochastic
      // (but correct). This is because random data would cause the
      // system to find accidentally good backward references here and there.
      memset(&buckets_[0], 0, sizeof(buckets_));
      need_init_ = false;
    }
  }
  // Clears the buckets that the first num positions of data hash to, so
  // that stale entries cannot alias those keys.
  void InitForData(const uint8_t* data, size_t num) {
    for (size_t i = 0; i < num; ++i) {
      const uint32_t key = HashBytes(&data[i]);
      memset(&buckets_[key], 0, kBucketSweep * sizeof(buckets_[0]));
      need_init_ = false;
    }
  }
  // Look at 4 bytes at data.
  // Compute a hash from these, and store the value somewhere within
  // [ix .. ix+3].
  inline void Store(const uint8_t *data, const uint32_t ix) {
    const uint32_t key = HashBytes(data);
    // Wiggle the value with the bucket sweep range.
    const uint32_t off = (ix >> 3) % kBucketSweep;
    buckets_[key + off] = ix;
  }

  // Find a longest backward match of &ring_buffer[cur_ix & ring_buffer_mask]
  // up to the length of max_length and stores the position cur_ix in the
  // hash table.
  //
  // Does not look for matches longer than max_length.
  // Does not look for matches further away than max_backward.
  // Writes the best found match length into best_len_out.
  // Writes the index (&data[index]) of the start of the best match into
  // best_distance_out.
  // Returns true if a match improving on the incoming *best_score_out
  // was found (including a static dictionary match when kUseDictionary).
  inline bool FindLongestMatch(const uint8_t * __restrict ring_buffer,
                               const size_t ring_buffer_mask,
                               const int* __restrict distance_cache,
                               const size_t cur_ix,
                               const size_t max_length,
                               const size_t max_backward,
                               size_t * __restrict best_len_out,
                               size_t * __restrict best_len_code_out,
                               size_t * __restrict best_distance_out,
                               double* __restrict best_score_out) {
    const size_t best_len_in = *best_len_out;
    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    const uint32_t key = HashBytes(&ring_buffer[cur_ix_masked]);
    // A candidate can only beat best_len if it also matches this byte.
    int compare_char = ring_buffer[cur_ix_masked + best_len_in];
    double best_score = *best_score_out;
    size_t best_len = best_len_in;
    // Try the most recent distance first; re-using it is cheap to encode.
    size_t cached_backward = static_cast<size_t>(distance_cache[0]);
    size_t prev_ix = cur_ix - cached_backward;
    bool match_found = false;
    if (prev_ix < cur_ix) {
      prev_ix &= static_cast<uint32_t>(ring_buffer_mask);
      if (compare_char == ring_buffer[prev_ix + best_len]) {
        size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
                                              &ring_buffer[cur_ix_masked],
                                              max_length);
        if (len >= 4) {
          best_score = BackwardReferenceScoreUsingLastDistance(len, 0);
          best_len = len;
          *best_len_out = len;
          *best_len_code_out = len;
          *best_distance_out = cached_backward;
          *best_score_out = best_score;
          compare_char = ring_buffer[cur_ix_masked + best_len];
          if (kBucketSweep == 1) {
            buckets_[key] = static_cast<uint32_t>(cur_ix);
            return true;
          } else {
            match_found = true;
          }
        }
      }
    }
    if (kBucketSweep == 1) {
      // Only one to look for, don't bother to prepare for a loop.
      prev_ix = buckets_[key];
      buckets_[key] = static_cast<uint32_t>(cur_ix);
      size_t backward = cur_ix - prev_ix;
      prev_ix &= static_cast<uint32_t>(ring_buffer_mask);
      if (compare_char != ring_buffer[prev_ix + best_len_in]) {
        return false;
      }
      if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
        return false;
      }
      const size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
                                                  &ring_buffer[cur_ix_masked],
                                                  max_length);
      if (len >= 4) {
        *best_len_out = len;
        *best_len_code_out = len;
        *best_distance_out = backward;
        *best_score_out = BackwardReferenceScore(len, backward);
        return true;
      }
    } else {
      // Sweep over kBucketSweep consecutive buckets for this key.
      uint32_t *bucket = buckets_ + key;
      prev_ix = *bucket++;
      for (int i = 0; i < kBucketSweep; ++i, prev_ix = *bucket++) {
        const size_t backward = cur_ix - prev_ix;
        prev_ix &= static_cast<uint32_t>(ring_buffer_mask);
        if (compare_char != ring_buffer[prev_ix + best_len]) {
          continue;
        }
        if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
          continue;
        }
        const size_t len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
                                                    &ring_buffer[cur_ix_masked],
                                                    max_length);
        if (len >= 4) {
          const double score = BackwardReferenceScore(len, backward);
          if (best_score < score) {
            best_score = score;
            best_len = len;
            *best_len_out = best_len;
            *best_len_code_out = best_len;
            *best_distance_out = backward;
            *best_score_out = score;
            compare_char = ring_buffer[cur_ix_masked + best_len];
            match_found = true;
          }
        }
      }
    }
    if (kUseDictionary && !match_found &&
        num_dict_matches_ >= (num_dict_lookups_ >> 7)) {
      // Fall back to the static dictionary, but only while at least
      // 1 in 128 of the previous lookups produced a match.
      ++num_dict_lookups_;
      const uint32_t dict_key = Hash<14>(&ring_buffer[cur_ix_masked]) << 1;
      const uint16_t v = kStaticDictionaryHash[dict_key];
      if (v > 0) {
        const uint32_t len = v & 31;   // dictionary word length
        const uint32_t dist = v >> 5;  // word index within its length class
        const size_t offset =
            kBrotliDictionaryOffsetsByLength[len] + len * dist;
        if (len <= max_length) {
          const size_t matchlen =
              FindMatchLengthWithLimit(&ring_buffer[cur_ix_masked],
                                       &kBrotliDictionary[offset], len);
          if (matchlen + kCutoffTransformsCount > len && matchlen > 0) {
            // Partial match: encode it via a cut-off transform of the word.
            const size_t transform_id = kCutoffTransforms[len - matchlen];
            const size_t word_id =
                transform_id * (1u << kBrotliDictionarySizeBitsByLength[len]) +
                dist;
            // Dictionary references are addressed just past the window.
            const size_t backward = max_backward + word_id + 1;
            const double score = BackwardReferenceScore(matchlen, backward);
            if (best_score < score) {
              ++num_dict_matches_;
              best_score = score;
              best_len = matchlen;
              *best_len_out = best_len;
              *best_len_code_out = len;
              *best_distance_out = backward;
              *best_score_out = best_score;
              match_found = true;
            }
          }
        }
      }
    }
    // Finally store the current position in the hash table.
    const uint32_t off = (cur_ix >> 3) % kBucketSweep;
    buckets_[key + off] = static_cast<uint32_t>(cur_ix);
    return match_found;
  }

  enum { kHashLength = 5 };
  enum { kHashTypeLength = 8 };
  // HashBytes is the function that chooses the bucket to place
  // the address in. The HashLongestMatch and HashLongestMatchQuickly
  // classes have separate, different implementations of hashing.
  static uint32_t HashBytes(const uint8_t *data) {
    // Computing a hash based on 5 bytes works much better for
    // qualities 1 and 3, where the next hash value is likely to replace
    // the current one.
    uint64_t h = (BROTLI_UNALIGNED_LOAD64(data) << 24) * kHashMul32;
    // The higher bits contain more mixture from the multiplication,
    // so we take our results from there.
    return static_cast<uint32_t>(h >> (64 - kBucketBits));
  }

  enum { kHashMapSize = 4 << kBucketBits };

 private:
  static const uint32_t kBucketSize = 1 << kBucketBits;
  // One slot per bucket plus kBucketSweep overflow slots for the sweep.
  uint32_t buckets_[kBucketSize + kBucketSweep];
  // True if buckets_ array needs to be initialized.
  bool need_init_;
  // Counters used to throttle static dictionary lookups.
  size_t num_dict_lookups_;
  size_t num_dict_matches_;
};
| 327 |
// A (forgetful) hash table to the data seen by the compressor, to
// help create backward references to previous data.
//
// This is a hash map of fixed size (kBucketSize) to a ring buffer of
// fixed size (kBlockSize). The ring buffer contains the last kBlockSize
// index positions of the given hash key in the compressed data.
template <int kBucketBits,
          int kBlockBits,
          int kNumLastDistancesToCheck>
class HashLongestMatch {
 public:
  HashLongestMatch(void) {
    Reset();
  }

  // Marks the table as requiring initialization and resets the static
  // dictionary lookup statistics.
  void Reset(void) {
    need_init_ = true;
    num_dict_lookups_ = 0;
    num_dict_matches_ = 0;
  }

  void Init(void) {
    if (need_init_) {
      // Zeroing the counts is sufficient; bucket contents are only read
      // up to num_[key] entries.
      memset(&num_[0], 0, sizeof(num_));
      need_init_ = false;
    }
  }

  // Resets the entry counts of the buckets that the first num positions
  // of data hash to.
  void InitForData(const uint8_t* data, size_t num) {
    for (size_t i = 0; i < num; ++i) {
      const uint32_t key = HashBytes(&data[i]);
      num_[key] = 0;
      need_init_ = false;
    }
  }

  // Look at 4 bytes at data.
  // Compute a hash from these, and store the value of ix at that position.
  inline void Store(const uint8_t *data, const uint32_t ix) {
    const uint32_t key = HashBytes(data);
    const int minor_ix = num_[key] & kBlockMask;
    buckets_[key][minor_ix] = ix;
    ++num_[key];
  }

  // Find a longest backward match of &data[cur_ix] up to the length of
  // max_length and stores the position cur_ix in the hash table.
  //
  // Does not look for matches longer than max_length.
  // Does not look for matches further away than max_backward.
  // Writes the best found match length into best_len_out.
  // Writes the index (&data[index]) offset from the start of the best match
  // into best_distance_out.
  // Write the score of the best match into best_score_out.
  // Returns true if a match improving on the incoming *best_score_out
  // was found (including a static dictionary match).
  bool FindLongestMatch(const uint8_t * __restrict data,
                        const size_t ring_buffer_mask,
                        const int* __restrict distance_cache,
                        const size_t cur_ix,
                        const size_t max_length,
                        const size_t max_backward,
                        size_t * __restrict best_len_out,
                        size_t * __restrict best_len_code_out,
                        size_t * __restrict best_distance_out,
                        double * __restrict best_score_out) {
    *best_len_code_out = 0;
    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    bool match_found = false;
    // Don't accept a short copy from far away.
    double best_score = *best_score_out;
    size_t best_len = *best_len_out;
    *best_len_out = 0;
    // Try last distance first.
    for (size_t i = 0; i < kNumLastDistancesToCheck; ++i) {
      const size_t idx = kDistanceCacheIndex[i];
      const size_t backward =
          static_cast<size_t>(distance_cache[idx] + kDistanceCacheOffset[i]);
      size_t prev_ix = static_cast<size_t>(cur_ix - backward);
      if (prev_ix >= cur_ix) {
        continue;
      }
      if (PREDICT_FALSE(backward > max_backward)) {
        continue;
      }
      prev_ix &= ring_buffer_mask;

      // Quick reject: a candidate can only beat best_len if it also
      // matches at offset best_len and does not run off the ring buffer.
      if (cur_ix_masked + best_len > ring_buffer_mask ||
          prev_ix + best_len > ring_buffer_mask ||
          data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
        continue;
      }
      const size_t len = FindMatchLengthWithLimit(&data[prev_ix],
                                                  &data[cur_ix_masked],
                                                  max_length);
      if (len >= 3 || (len == 2 && i < 2)) {
        // Comparing for >= 2 does not change the semantics, but just saves for
        // a few unnecessary binary logarithms in backward reference score,
        // since we are not interested in such short matches.
        double score = BackwardReferenceScoreUsingLastDistance(len, i);
        if (best_score < score) {
          best_score = score;
          best_len = len;
          *best_len_out = best_len;
          *best_len_code_out = best_len;
          *best_distance_out = backward;
          *best_score_out = best_score;
          match_found = true;
        }
      }
    }
    const uint32_t key = HashBytes(&data[cur_ix_masked]);
    const uint32_t * __restrict const bucket = &buckets_[key][0];
    // Walk the newest kBlockSize entries of this bucket, newest first.
    const size_t down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
    for (size_t i = num_[key]; i > down;) {
      --i;
      size_t prev_ix = bucket[i & kBlockMask];
      const size_t backward = cur_ix - prev_ix;
      if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
        // Entries only get older from here, so stop searching.
        break;
      }
      prev_ix &= ring_buffer_mask;
      if (cur_ix_masked + best_len > ring_buffer_mask ||
          prev_ix + best_len > ring_buffer_mask ||
          data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
        continue;
      }
      const size_t len = FindMatchLengthWithLimit(&data[prev_ix],
                                                  &data[cur_ix_masked],
                                                  max_length);
      if (len >= 4) {
        // Comparing for >= 3 does not change the semantics, but just saves
        // for a few unnecessary binary logarithms in backward reference
        // score, since we are not interested in such short matches.
        double score = BackwardReferenceScore(len, backward);
        if (best_score < score) {
          best_score = score;
          best_len = len;
          *best_len_out = best_len;
          *best_len_code_out = best_len;
          *best_distance_out = backward;
          *best_score_out = best_score;
          match_found = true;
        }
      }
    }
    // Store the current position in the hash table.
    buckets_[key][num_[key] & kBlockMask] = static_cast<uint32_t>(cur_ix);
    ++num_[key];
    if (!match_found && num_dict_matches_ >= (num_dict_lookups_ >> 7)) {
      // Try the static dictionary, but only while at least 1 in 128 of
      // the previous lookups produced a match.
      size_t dict_key = Hash<14>(&data[cur_ix_masked]) << 1;
      for (int k = 0; k < 2; ++k, ++dict_key) {
        ++num_dict_lookups_;
        const uint16_t v = kStaticDictionaryHash[dict_key];
        if (v > 0) {
          const size_t len = v & 31;   // dictionary word length
          const size_t dist = v >> 5;  // word index within its length class
          const size_t offset =
              kBrotliDictionaryOffsetsByLength[len] + len * dist;
          if (len <= max_length) {
            const size_t matchlen =
                FindMatchLengthWithLimit(&data[cur_ix_masked],
                                         &kBrotliDictionary[offset], len);
            if (matchlen + kCutoffTransformsCount > len && matchlen > 0) {
              // Partial match: encode it via a cut-off transform of the word.
              const size_t transform_id = kCutoffTransforms[len - matchlen];
              const size_t word_id =
                  transform_id * (1 << kBrotliDictionarySizeBitsByLength[len]) +
                  dist;
              // Dictionary references are addressed just past the window.
              const size_t backward = max_backward + word_id + 1;
              double score = BackwardReferenceScore(matchlen, backward);
              if (best_score < score) {
                ++num_dict_matches_;
                best_score = score;
                best_len = matchlen;
                *best_len_out = best_len;
                *best_len_code_out = len;
                *best_distance_out = backward;
                *best_score_out = best_score;
                match_found = true;
              }
            }
          }
        }
      }
    }
    return match_found;
  }

  // Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the
  // length of max_length and stores the position cur_ix in the hash table.
  //
  // Returns the number of matches found, and stores the found matches in
  // matches[0] to matches[return value - 1]. The matches will be sorted by
  // strictly increasing length and (non-strictly) increasing distance.
  size_t FindAllMatches(const uint8_t* data,
                        const size_t ring_buffer_mask,
                        const size_t cur_ix,
                        const size_t max_length,
                        const size_t max_backward,
                        BackwardMatch* matches) {
    BackwardMatch* const orig_matches = matches;
    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    size_t best_len = 1;
    // Length-2 matches are only considered within the last 64 positions,
    // scanned directly.
    size_t stop = cur_ix - 64;
    if (cur_ix < 64) { stop = 0; }
    for (size_t i = cur_ix - 1; i > stop && best_len <= 2; --i) {
      size_t prev_ix = i;
      const size_t backward = cur_ix - prev_ix;
      if (PREDICT_FALSE(backward > max_backward)) {
        break;
      }
      prev_ix &= ring_buffer_mask;
      if (data[cur_ix_masked] != data[prev_ix] ||
          data[cur_ix_masked + 1] != data[prev_ix + 1]) {
        continue;
      }
      const size_t len =
          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                   max_length);
      if (len > best_len) {
        best_len = len;
        *matches++ = BackwardMatch(backward, len);
      }
    }
    const uint32_t key = HashBytes(&data[cur_ix_masked]);
    const uint32_t * __restrict const bucket = &buckets_[key][0];
    // Walk the newest kBlockSize entries of this bucket, newest first.
    const size_t down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
    for (size_t i = num_[key]; i > down;) {
      --i;
      size_t prev_ix = bucket[i & kBlockMask];
      const size_t backward = cur_ix - prev_ix;
      if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
        break;
      }
      prev_ix &= ring_buffer_mask;
      if (cur_ix_masked + best_len > ring_buffer_mask ||
          prev_ix + best_len > ring_buffer_mask ||
          data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
        continue;
      }
      const size_t len =
          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                   max_length);
      if (len > best_len) {
        best_len = len;
        *matches++ = BackwardMatch(backward, len);
      }
    }
    buckets_[key][num_[key] & kBlockMask] = static_cast<uint32_t>(cur_ix);
    ++num_[key];
    // Finally, collect static dictionary matches longer than the best
    // match found so far.
    uint32_t dict_matches[kMaxDictionaryMatchLen + 1];
    for (size_t i = 0; i <= kMaxDictionaryMatchLen; ++i) {
      dict_matches[i] = kInvalidMatch;
    }
    size_t minlen = std::max<size_t>(4, best_len + 1);
    if (FindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen, max_length,
                                       &dict_matches[0])) {
      size_t maxlen = std::min<size_t>(kMaxDictionaryMatchLen, max_length);
      for (size_t l = minlen; l <= maxlen; ++l) {
        uint32_t dict_id = dict_matches[l];
        if (dict_id < kInvalidMatch) {
          *matches++ = BackwardMatch(max_backward + (dict_id >> 5) + 1, l,
                                     dict_id & 31);
        }
      }
    }
    return static_cast<size_t>(matches - orig_matches);
  }

  enum { kHashLength = 4 };
  enum { kHashTypeLength = 4 };

  // HashBytes is the function that chooses the bucket to place
  // the address in. The HashLongestMatch and HashLongestMatchQuickly
  // classes have separate, different implementations of hashing.
  static uint32_t HashBytes(const uint8_t *data) {
    uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
    // The higher bits contain more mixture from the multiplication,
    // so we take our results from there.
    return h >> (32 - kBucketBits);
  }

  enum { kHashMapSize = 2 << kBucketBits };

  static const size_t kMaxNumMatches = 64 + (1 << kBlockBits);

 private:
  // Number of hash buckets.
  static const uint32_t kBucketSize = 1 << kBucketBits;

  // Only kBlockSize newest backward references are kept,
  // and the older are forgotten.
  static const uint32_t kBlockSize = 1 << kBlockBits;

  // Mask for accessing entries in a block (in a ringbuffer manner).
  static const uint32_t kBlockMask = (1 << kBlockBits) - 1;

  // Number of entries in a particular bucket.
  uint16_t num_[kBucketSize];

  // Buckets containing kBlockSize of backward references.
  uint32_t buckets_[kBucketSize][kBlockSize];

  // True if num_ array needs to be initialized.
  bool need_init_;

  // Counters used to throttle static dictionary lookups.
  size_t num_dict_lookups_;
  size_t num_dict_matches_;
};
| 635 |
| 636 // A (forgetful) hash table where each hash bucket contains a binary tree of |
| 637 // sequences whose first 4 bytes share the same hash code. |
| 638 // Each sequence is kMaxTreeCompLength long and is identified by its starting |
| 639 // position in the input data. The binary tree is sorted by the lexicographic |
| 640 // order of the sequences, and it is also a max-heap with respect to the |
| 641 // starting positions. |
| 642 class HashToBinaryTree { |
| 643 public: |
  HashToBinaryTree() : forest_(NULL) {
    Reset();
  }

  ~HashToBinaryTree() {
    delete[] forest_;
  }

  // Marks the hasher as requiring (re-)initialization via Init().
  void Reset() {
    need_init_ = true;
  }
| 655 |
| 656 void Init(int lgwin, size_t position, size_t bytes, bool is_last) { |
| 657 if (need_init_) { |
| 658 window_mask_ = (1u << lgwin) - 1u; |
| 659 invalid_pos_ = static_cast<uint32_t>(-window_mask_); |
| 660 for (uint32_t i = 0; i < kBucketSize; i++) { |
| 661 buckets_[i] = invalid_pos_; |
| 662 } |
| 663 size_t num_nodes = (position == 0 && is_last) ? bytes : window_mask_ + 1; |
| 664 forest_ = new uint32_t[2 * num_nodes]; |
| 665 need_init_ = false; |
| 666 } |
| 667 } |
| 668 |
  // Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the
  // length of max_length and stores the position cur_ix in the hash table.
  //
  // Returns the number of matches found, and stores the found matches in
  // matches[0] to matches[return value - 1]. The matches will be sorted by
  // strictly increasing length and (non-strictly) increasing distance.
  size_t FindAllMatches(const uint8_t* data,
                        const size_t ring_buffer_mask,
                        const size_t cur_ix,
                        const size_t max_length,
                        const size_t max_backward,
                        BackwardMatch* matches) {
    BackwardMatch* const orig_matches = matches;
    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    size_t best_len = 1;
    // Length-2 matches are only considered within the last 64 positions,
    // scanned directly.
    size_t stop = cur_ix - 64;
    if (cur_ix < 64) { stop = 0; }
    for (size_t i = cur_ix - 1; i > stop && best_len <= 2; --i) {
      size_t prev_ix = i;
      const size_t backward = cur_ix - prev_ix;
      if (PREDICT_FALSE(backward > max_backward)) {
        break;
      }
      prev_ix &= ring_buffer_mask;
      if (data[cur_ix_masked] != data[prev_ix] ||
          data[cur_ix_masked + 1] != data[prev_ix + 1]) {
        continue;
      }
      const size_t len =
          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                   max_length);
      if (len > best_len) {
        best_len = len;
        *matches++ = BackwardMatch(backward, len);
      }
    }
    // Search the binary tree of this position's hash bucket (and re-root
    // it at cur_ix when enough lookahead is available).
    if (best_len < max_length) {
      matches = StoreAndFindMatches(data, cur_ix, ring_buffer_mask,
                                    max_length, &best_len, matches);
    }
    // Finally, collect static dictionary matches longer than the best
    // match found so far.
    uint32_t dict_matches[kMaxDictionaryMatchLen + 1];
    for (size_t i = 0; i <= kMaxDictionaryMatchLen; ++i) {
      dict_matches[i] = kInvalidMatch;
    }
    size_t minlen = std::max<size_t>(4, best_len + 1);
    if (FindAllStaticDictionaryMatches(&data[cur_ix_masked], minlen, max_length,
                                       &dict_matches[0])) {
      size_t maxlen = std::min<size_t>(kMaxDictionaryMatchLen, max_length);
      for (size_t l = minlen; l <= maxlen; ++l) {
        uint32_t dict_id = dict_matches[l];
        if (dict_id < kInvalidMatch) {
          *matches++ = BackwardMatch(max_backward + (dict_id >> 5) + 1, l,
                                     dict_id & 31);
        }
      }
    }
    return static_cast<size_t>(matches - orig_matches);
  }
| 728 |
  // Stores the hash of the next 4 bytes and re-roots the binary tree at the
  // current sequence, without returning any matches.
  // REQUIRES: cur_ix + kMaxTreeCompLength <= end-of-current-block
  void Store(const uint8_t* data,
             const size_t ring_buffer_mask,
             const size_t cur_ix) {
    size_t best_len = 0;
    // Matches are discarded (NULL sink); this call only updates the tree.
    StoreAndFindMatches(data, cur_ix, ring_buffer_mask, kMaxTreeCompLength,
                        &best_len, NULL);
  }
| 739 |
  // Inserts into the hasher the positions near the previous block's end
  // that could only be stored once data from the current block was known.
  void StitchToPreviousBlock(size_t num_bytes,
                             size_t position,
                             const uint8_t* ringbuffer,
                             size_t ringbuffer_mask) {
    if (num_bytes >= 3 && position >= kMaxTreeCompLength) {
      // Store the last `kMaxTreeCompLength - 1` positions in the hasher.
      // These could not be calculated before, since they require knowledge
      // of both the previous and the current block.
      const size_t i_start = position - kMaxTreeCompLength + 1;
      const size_t i_end = std::min(position, i_start + num_bytes);
      for (size_t i = i_start; i < i_end; ++i) {
        // We know that i + kMaxTreeCompLength <= position + num_bytes, i.e. the
        // end of the current block and that we have at least
        // kMaxTreeCompLength tail in the ringbuffer.
        Store(ringbuffer, ringbuffer_mask, i);
      }
    }
  }
| 758 |
| 759 static const size_t kMaxNumMatches = 64 + kMaxTreeSearchDepth; |
| 760 |
| 761 private: |
  // Stores the hash of the next 4 bytes and in a single tree-traversal, the
  // hash bucket's binary tree is searched for matches and is re-rooted at the
  // current position.
  //
  // If less than kMaxTreeCompLength data is available, the hash bucket of the
  // current position is searched for matches, but the state of the hash table
  // is not changed, since we can not know the final sorting order of the
  // current (incomplete) sequence.
  //
  // This function must be called with increasing cur_ix positions.
  BackwardMatch* StoreAndFindMatches(const uint8_t* const __restrict data,
                                     const size_t cur_ix,
                                     const size_t ring_buffer_mask,
                                     const size_t max_length,
                                     size_t* const __restrict best_len,
                                     BackwardMatch* __restrict matches) {
    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    const size_t max_backward = window_mask_ - 15;
    const size_t max_comp_len = std::min(max_length, kMaxTreeCompLength);
    // Only mutate the tree when the full comparison length is available.
    const bool reroot_tree = max_length >= kMaxTreeCompLength;
    const uint32_t key = HashBytes(&data[cur_ix_masked]);
    size_t prev_ix = buckets_[key];
    // The forest index of the rightmost node of the left subtree of the new
    // root, updated as we traverse and reroot the tree of the hash bucket.
    size_t node_left = LeftChildIndex(cur_ix);
    // The forest index of the leftmost node of the right subtree of the new
    // root, updated as we traverse and reroot the tree of the hash bucket.
    size_t node_right = RightChildIndex(cur_ix);
    // The match length of the rightmost node of the left subtree of the new
    // root, updated as we traverse and reroot the tree of the hash bucket.
    size_t best_len_left = 0;
    // The match length of the leftmost node of the right subtree of the new
    // root, updated as we traverse and reroot the tree of the hash bucket.
    size_t best_len_right = 0;
    if (reroot_tree) {
      buckets_[key] = static_cast<uint32_t>(cur_ix);
    }
    for (size_t depth_remaining = kMaxTreeSearchDepth; ; --depth_remaining) {
      const size_t backward = cur_ix - prev_ix;
      const size_t prev_ix_masked = prev_ix & ring_buffer_mask;
      if (backward == 0 || backward > max_backward || depth_remaining == 0) {
        // Reached the end of the usable tree; terminate both subtrees.
        if (reroot_tree) {
          forest_[node_left] = invalid_pos_;
          forest_[node_right] = invalid_pos_;
        }
        break;
      }
      // Both subtree boundaries match the current sequence for at least
      // cur_len bytes, so the comparison can skip that prefix.
      const size_t cur_len = std::min(best_len_left, best_len_right);
      const size_t len = cur_len +
          FindMatchLengthWithLimit(&data[cur_ix_masked + cur_len],
                                   &data[prev_ix_masked + cur_len],
                                   max_length - cur_len);
      if (len > *best_len) {
        *best_len = len;
        if (matches) {
          *matches++ = BackwardMatch(backward, len);
        }
        if (len >= max_comp_len) {
          // Full-length match: splice prev_ix's children into the new root
          // and stop.
          if (reroot_tree) {
            forest_[node_left] = forest_[LeftChildIndex(prev_ix)];
            forest_[node_right] = forest_[RightChildIndex(prev_ix)];
          }
          break;
        }
      }
      // Descend towards the subtree whose sequences sort on the same side
      // of the current sequence as prev_ix does.
      if (data[cur_ix_masked + len] > data[prev_ix_masked + len]) {
        best_len_left = len;
        if (reroot_tree) {
          forest_[node_left] = static_cast<uint32_t>(prev_ix);
        }
        node_left = RightChildIndex(prev_ix);
        prev_ix = forest_[node_left];
      } else {
        best_len_right = len;
        if (reroot_tree) {
          forest_[node_right] = static_cast<uint32_t>(prev_ix);
        }
        node_right = LeftChildIndex(prev_ix);
        prev_ix = forest_[node_right];
      }
    }
    return matches;
  }
| 845 |
| 846 inline size_t LeftChildIndex(const size_t pos) { |
| 847 return 2 * (pos & window_mask_); |
| 848 } |
| 849 |
| 850 inline size_t RightChildIndex(const size_t pos) { |
| 851 return 2 * (pos & window_mask_) + 1; |
| 852 } |
| 853 |
| 854 static uint32_t HashBytes(const uint8_t *data) { |
| 855 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32; |
| 856 // The higher bits contain more mixture from the multiplication, |
| 857 // so we take our results from there. |
| 858 return h >> (32 - kBucketBits); |
| 859 } |
| 860 |
  // Number of hash bits used to index buckets_; the table has
  // 2^kBucketBits entries.
  static const int kBucketBits = 17;
  static const size_t kBucketSize = 1 << kBucketBits;

  // The window size minus 1
  size_t window_mask_;

  // Hash table that maps the 4-byte hashes of the sequence to the last
  // position where this hash was found, which is the root of the binary
  // tree of sequences that share this hash bucket.
  uint32_t buckets_[kBucketSize];

  // The union of the binary trees of each hash bucket. The root of the tree
  // corresponding to a hash is a sequence starting at buckets_[hash] and
  // the left and right children of a sequence starting at pos are
  // forest_[2 * (pos & window_mask_)] and forest_[2 * (pos & window_mask_) + 1]
  // (see LeftChildIndex / RightChildIndex).
  uint32_t* forest_;

  // A position used to mark a non-existent sequence, i.e. a tree is empty if
  // its root is at invalid_pos_ and a node is a leaf if both its children
  // are at invalid_pos_.
  uint32_t invalid_pos_;

  // NOTE(review): presumably marks that buckets_ still has to be reset to
  // invalid_pos_ before first use -- confirm in the (not shown) Init code.
  bool need_init_;
| 884 }; |
| 885 |
| 886 struct Hashers { |
| 887 // For kBucketSweep == 1, enabling the dictionary lookup makes compression |
| 888 // a little faster (0.5% - 1%) and it compresses 0.15% better on small text |
| 889 // and html inputs. |
| 890 typedef HashLongestMatchQuickly<16, 1, true> H2; |
| 891 typedef HashLongestMatchQuickly<16, 2, false> H3; |
| 892 typedef HashLongestMatchQuickly<17, 4, true> H4; |
| 893 typedef HashLongestMatch<14, 4, 4> H5; |
| 894 typedef HashLongestMatch<14, 5, 4> H6; |
| 895 typedef HashLongestMatch<15, 6, 10> H7; |
| 896 typedef HashLongestMatch<15, 7, 10> H8; |
| 897 typedef HashLongestMatch<15, 8, 16> H9; |
| 898 typedef HashToBinaryTree H10; |
| 899 |
| 900 Hashers(void) : hash_h2(0), hash_h3(0), hash_h4(0), hash_h5(0), |
| 901 hash_h6(0), hash_h7(0), hash_h8(0), hash_h9(0), hash_h10(0) {} |
| 902 |
| 903 ~Hashers(void) { |
| 904 delete hash_h2; |
| 905 delete hash_h3; |
| 906 delete hash_h4; |
| 907 delete hash_h5; |
| 908 delete hash_h6; |
| 909 delete hash_h7; |
| 910 delete hash_h8; |
| 911 delete hash_h9; |
| 912 delete hash_h10; |
| 913 } |
| 914 |
| 915 void Init(int type) { |
| 916 switch (type) { |
| 917 case 2: hash_h2 = new H2; break; |
| 918 case 3: hash_h3 = new H3; break; |
| 919 case 4: hash_h4 = new H4; break; |
| 920 case 5: hash_h5 = new H5; break; |
| 921 case 6: hash_h6 = new H6; break; |
| 922 case 7: hash_h7 = new H7; break; |
| 923 case 8: hash_h8 = new H8; break; |
| 924 case 9: hash_h9 = new H9; break; |
| 925 case 10: hash_h10 = new H10; break; |
| 926 default: break; |
| 927 } |
| 928 } |
| 929 |
| 930 template<typename Hasher> |
| 931 void WarmupHash(const size_t size, const uint8_t* dict, Hasher* hasher) { |
| 932 hasher->Init(); |
| 933 for (size_t i = 0; i + Hasher::kHashTypeLength - 1 < size; i++) { |
| 934 hasher->Store(&dict[i], static_cast<uint32_t>(i)); |
| 935 } |
| 936 } |
| 937 |
| 938 // Custom LZ77 window. |
| 939 void PrependCustomDictionary( |
| 940 int type, int lgwin, const size_t size, const uint8_t* dict) { |
| 941 switch (type) { |
| 942 case 2: WarmupHash(size, dict, hash_h2); break; |
| 943 case 3: WarmupHash(size, dict, hash_h3); break; |
| 944 case 4: WarmupHash(size, dict, hash_h4); break; |
| 945 case 5: WarmupHash(size, dict, hash_h5); break; |
| 946 case 6: WarmupHash(size, dict, hash_h6); break; |
| 947 case 7: WarmupHash(size, dict, hash_h7); break; |
| 948 case 8: WarmupHash(size, dict, hash_h8); break; |
| 949 case 9: WarmupHash(size, dict, hash_h9); break; |
| 950 case 10: |
| 951 hash_h10->Init(lgwin, 0, size, false); |
| 952 for (size_t i = 0; i + kMaxTreeCompLength - 1 < size; ++i) { |
| 953 hash_h10->Store(dict, std::numeric_limits<size_t>::max(), i); |
| 954 } |
| 955 break; |
| 956 default: break; |
| 957 } |
| 958 } |
| 959 |
| 960 |
| 961 H2* hash_h2; |
| 962 H3* hash_h3; |
| 963 H4* hash_h4; |
| 964 H5* hash_h5; |
| 965 H6* hash_h6; |
| 966 H7* hash_h7; |
| 967 H8* hash_h8; |
| 968 H9* hash_h9; |
| 969 H10* hash_h10; |
| 970 }; |
| 971 |
| 972 } // namespace brotli |
| 973 |
| 974 #endif // BROTLI_ENC_HASH_H_ |
| OLD | NEW |