| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "sync/internal_api/public/base/unique_position.h" | |
| 6 | |
| 7 #include <stddef.h> | |
| 8 #include <stdint.h> | |
| 9 | |
| 10 #include <algorithm> | |
| 11 #include <limits> | |
| 12 | |
| 13 #include "base/logging.h" | |
| 14 #include "base/rand_util.h" | |
| 15 #include "base/stl_util.h" | |
| 16 #include "base/strings/string_number_conversions.h" | |
| 17 #include "sync/protocol/unique_position.pb.h" | |
| 18 #include "third_party/zlib/zlib.h" | |
| 19 | |
| 20 namespace syncer { | |
| 21 | |
| 22 const size_t UniquePosition::kSuffixLength = 28; | |
| 23 const size_t UniquePosition::kCompressBytesThreshold = 128; | |
| 24 | |
| 25 // static. | |
| 26 bool UniquePosition::IsValidSuffix(const std::string& suffix) { | |
| 27 // The suffix must be exactly the specified length, otherwise unique suffixes | |
| 28 // are not sufficient to guarantee unique positions (because prefix + suffix | |
| 29 // == p + refixsuffix). | |
| 30 return suffix.length() == kSuffixLength | |
| 31 && suffix[kSuffixLength-1] != 0; | |
| 32 } | |
| 33 | |
| 34 // static. | |
| 35 bool UniquePosition::IsValidBytes(const std::string& bytes) { | |
| 36 // The first condition ensures that our suffix uniqueness is sufficient to | |
| 37 // guarantee position uniqueness. Otherwise, it's possible the end of some | |
| 38 // prefix + some short suffix == some long suffix. | |
| 39 // The second condition ensures that FindSmallerWithSuffix can always return a | |
| 40 // result. | |
| 41 return bytes.length() >= kSuffixLength | |
| 42 && bytes[bytes.length()-1] != 0; | |
| 43 } | |
| 44 | |
| 45 // static. | |
| 46 std::string UniquePosition::RandomSuffix() { | |
| 47 // Users random data for all but the last byte. The last byte must not be | |
| 48 // zero. We arbitrarily set it to 0x7f. | |
| 49 return base::RandBytesAsString(kSuffixLength - 1) + "\x7f"; | |
| 50 } | |
| 51 | |
| 52 // static. | |
| 53 UniquePosition UniquePosition::CreateInvalid() { | |
| 54 UniquePosition pos; | |
| 55 DCHECK(!pos.IsValid()); | |
| 56 return pos; | |
| 57 } | |
| 58 | |
| 59 // static. | |
| 60 UniquePosition UniquePosition::FromProto(const sync_pb::UniquePosition& proto) { | |
| 61 if (proto.has_custom_compressed_v1()) { | |
| 62 return UniquePosition(proto.custom_compressed_v1()); | |
| 63 } else if (proto.has_value() && !proto.value().empty()) { | |
| 64 return UniquePosition(Compress(proto.value())); | |
| 65 } else if (proto.has_compressed_value() && proto.has_uncompressed_length()) { | |
| 66 uLongf uncompressed_len = proto.uncompressed_length(); | |
| 67 std::string un_gzipped; | |
| 68 | |
| 69 un_gzipped.resize(uncompressed_len); | |
| 70 int result = uncompress( | |
| 71 reinterpret_cast<Bytef*>(string_as_array(&un_gzipped)), | |
| 72 &uncompressed_len, | |
| 73 reinterpret_cast<const Bytef*>(proto.compressed_value().data()), | |
| 74 proto.compressed_value().size()); | |
| 75 if (result != Z_OK) { | |
| 76 DLOG(ERROR) << "Unzip failed " << result; | |
| 77 return UniquePosition::CreateInvalid(); | |
| 78 } | |
| 79 if (uncompressed_len != proto.uncompressed_length()) { | |
| 80 DLOG(ERROR) | |
| 81 << "Uncompressed length " << uncompressed_len | |
| 82 << " did not match specified length " << proto.uncompressed_length(); | |
| 83 return UniquePosition::CreateInvalid(); | |
| 84 } | |
| 85 return UniquePosition(Compress(un_gzipped)); | |
| 86 } else { | |
| 87 return UniquePosition::CreateInvalid(); | |
| 88 } | |
| 89 } | |
| 90 | |
| 91 // static. | |
| 92 UniquePosition UniquePosition::FromInt64(int64_t x, const std::string& suffix) { | |
| 93 uint64_t y = static_cast<uint64_t>(x); | |
| 94 y ^= 0x8000000000000000ULL; // Make it non-negative. | |
| 95 std::string bytes(8, 0); | |
| 96 for (int i = 7; i >= 0; --i) { | |
| 97 bytes[i] = static_cast<uint8_t>(y); | |
| 98 y >>= 8; | |
| 99 } | |
| 100 return UniquePosition(bytes + suffix, suffix); | |
| 101 } | |
| 102 | |
| 103 // static. | |
| 104 UniquePosition UniquePosition::InitialPosition( | |
| 105 const std::string& suffix) { | |
| 106 DCHECK(IsValidSuffix(suffix)); | |
| 107 return UniquePosition(suffix, suffix); | |
| 108 } | |
| 109 | |
| 110 // static. | |
| 111 UniquePosition UniquePosition::Before( | |
| 112 const UniquePosition& x, | |
| 113 const std::string& suffix) { | |
| 114 DCHECK(IsValidSuffix(suffix)); | |
| 115 DCHECK(x.IsValid()); | |
| 116 const std::string& before = FindSmallerWithSuffix( | |
| 117 Uncompress(x.compressed_), suffix); | |
| 118 return UniquePosition(before + suffix, suffix); | |
| 119 } | |
| 120 | |
| 121 // static. | |
| 122 UniquePosition UniquePosition::After( | |
| 123 const UniquePosition& x, | |
| 124 const std::string& suffix) { | |
| 125 DCHECK(IsValidSuffix(suffix)); | |
| 126 DCHECK(x.IsValid()); | |
| 127 const std::string& after = FindGreaterWithSuffix( | |
| 128 Uncompress(x.compressed_), suffix); | |
| 129 return UniquePosition(after + suffix, suffix); | |
| 130 } | |
| 131 | |
| 132 // static. | |
| 133 UniquePosition UniquePosition::Between( | |
| 134 const UniquePosition& before, | |
| 135 const UniquePosition& after, | |
| 136 const std::string& suffix) { | |
| 137 DCHECK(before.IsValid()); | |
| 138 DCHECK(after.IsValid()); | |
| 139 DCHECK(before.LessThan(after)); | |
| 140 DCHECK(IsValidSuffix(suffix)); | |
| 141 const std::string& mid = FindBetweenWithSuffix( | |
| 142 Uncompress(before.compressed_), | |
| 143 Uncompress(after.compressed_), | |
| 144 suffix); | |
| 145 return UniquePosition(mid + suffix, suffix); | |
| 146 } | |
| 147 | |
| 148 UniquePosition::UniquePosition() : is_valid_(false) {} | |
| 149 | |
| 150 bool UniquePosition::LessThan(const UniquePosition& other) const { | |
| 151 DCHECK(this->IsValid()); | |
| 152 DCHECK(other.IsValid()); | |
| 153 | |
| 154 return compressed_ < other.compressed_; | |
| 155 } | |
| 156 | |
| 157 bool UniquePosition::Equals(const UniquePosition& other) const { | |
| 158 if (!this->IsValid() && !other.IsValid()) | |
| 159 return true; | |
| 160 | |
| 161 return compressed_ == other.compressed_; | |
| 162 } | |
| 163 | |
| 164 void UniquePosition::ToProto(sync_pb::UniquePosition* proto) const { | |
| 165 proto->Clear(); | |
| 166 | |
| 167 // This is the current preferred foramt. | |
| 168 proto->set_custom_compressed_v1(compressed_); | |
| 169 | |
| 170 // Older clients used to write other formats. We don't bother doing that | |
| 171 // anymore because that form of backwards compatibility is expensive. We no | |
| 172 // longer want to pay that price just too support clients that have been | |
| 173 // obsolete for a long time. See the proto definition for details. | |
| 174 } | |
| 175 | |
| 176 void UniquePosition::SerializeToString(std::string* blob) const { | |
| 177 DCHECK(blob); | |
| 178 sync_pb::UniquePosition proto; | |
| 179 ToProto(&proto); | |
| 180 proto.SerializeToString(blob); | |
| 181 } | |
| 182 | |
| 183 int64_t UniquePosition::ToInt64() const { | |
| 184 uint64_t y = 0; | |
| 185 const std::string& s = Uncompress(compressed_); | |
| 186 size_t l = sizeof(int64_t); | |
| 187 if (s.length() < l) { | |
| 188 NOTREACHED(); | |
| 189 l = s.length(); | |
| 190 } | |
| 191 for (size_t i = 0; i < l; ++i) { | |
| 192 const uint8_t byte = s[l - i - 1]; | |
| 193 y |= static_cast<uint64_t>(byte) << (i * 8); | |
| 194 } | |
| 195 y ^= 0x8000000000000000ULL; | |
| 196 // This is technically implementation-defined if y > INT64_MAX, so | |
| 197 // we're assuming that we're on a twos-complement machine. | |
| 198 return static_cast<int64_t>(y); | |
| 199 } | |
| 200 | |
| 201 bool UniquePosition::IsValid() const { | |
| 202 return is_valid_; | |
| 203 } | |
| 204 | |
| 205 std::string UniquePosition::ToDebugString() const { | |
| 206 const std::string bytes = Uncompress(compressed_); | |
| 207 if (bytes.empty()) | |
| 208 return std::string("INVALID[]"); | |
| 209 | |
| 210 std::string debug_string = base::HexEncode(bytes.data(), bytes.length()); | |
| 211 if (!IsValid()) { | |
| 212 debug_string = "INVALID[" + debug_string + "]"; | |
| 213 } | |
| 214 | |
| 215 std::string compressed_string = | |
| 216 base::HexEncode(compressed_.data(), compressed_.length()); | |
| 217 debug_string.append(", compressed: " + compressed_string); | |
| 218 return debug_string; | |
| 219 } | |
| 220 | |
| 221 std::string UniquePosition::GetSuffixForTest() const { | |
| 222 const std::string bytes = Uncompress(compressed_); | |
| 223 const size_t prefix_len = bytes.length() - kSuffixLength; | |
| 224 return bytes.substr(prefix_len, std::string::npos); | |
| 225 } | |
| 226 | |
| 227 std::string UniquePosition::FindSmallerWithSuffix( | |
| 228 const std::string& reference, | |
| 229 const std::string& suffix) { | |
| 230 size_t ref_zeroes = reference.find_first_not_of('\0'); | |
| 231 size_t suffix_zeroes = suffix.find_first_not_of('\0'); | |
| 232 | |
| 233 // Neither of our inputs are allowed to have trailing zeroes, so the following | |
| 234 // must be true. | |
| 235 DCHECK_NE(ref_zeroes, std::string::npos); | |
| 236 DCHECK_NE(suffix_zeroes, std::string::npos); | |
| 237 | |
| 238 if (suffix_zeroes > ref_zeroes) { | |
| 239 // Implies suffix < ref. | |
| 240 return std::string(); | |
| 241 } | |
| 242 | |
| 243 if (suffix.substr(suffix_zeroes) < reference.substr(ref_zeroes)) { | |
| 244 // Prepend zeroes so the result has as many zero digits as |reference|. | |
| 245 return std::string(ref_zeroes - suffix_zeroes, '\0'); | |
| 246 } else if (suffix_zeroes > 1) { | |
| 247 // Prepend zeroes so the result has one more zero digit than |reference|. | |
| 248 // We could also take the "else" branch below, but taking this branch will | |
| 249 // give us a smaller result. | |
| 250 return std::string(ref_zeroes - suffix_zeroes + 1, '\0'); | |
| 251 } else { | |
| 252 // Prepend zeroes to match those in the |reference|, then something smaller | |
| 253 // than the first non-zero digit in |reference|. | |
| 254 char lt_digit = static_cast<uint8_t>(reference[ref_zeroes]) / 2; | |
| 255 return std::string(ref_zeroes, '\0') + lt_digit; | |
| 256 } | |
| 257 } | |
| 258 | |
| 259 // static | |
| 260 std::string UniquePosition::FindGreaterWithSuffix( | |
| 261 const std::string& reference, | |
| 262 const std::string& suffix) { | |
| 263 size_t ref_FFs = | |
| 264 reference.find_first_not_of(std::numeric_limits<uint8_t>::max()); | |
| 265 size_t suffix_FFs = | |
| 266 suffix.find_first_not_of(std::numeric_limits<uint8_t>::max()); | |
| 267 | |
| 268 if (ref_FFs == std::string::npos) { | |
| 269 ref_FFs = reference.length(); | |
| 270 } | |
| 271 if (suffix_FFs == std::string::npos) { | |
| 272 suffix_FFs = suffix.length(); | |
| 273 } | |
| 274 | |
| 275 if (suffix_FFs > ref_FFs) { | |
| 276 // Implies suffix > reference. | |
| 277 return std::string(); | |
| 278 } | |
| 279 | |
| 280 if (suffix.substr(suffix_FFs) > reference.substr(ref_FFs)) { | |
| 281 // Prepend FF digits to match those in |reference|. | |
| 282 return std::string(ref_FFs - suffix_FFs, | |
| 283 std::numeric_limits<uint8_t>::max()); | |
| 284 } else if (suffix_FFs > 1) { | |
| 285 // Prepend enough leading FF digits so result has one more of them than | |
| 286 // |reference| does. We could also take the "else" branch below, but this | |
| 287 // gives us a smaller result. | |
| 288 return std::string(ref_FFs - suffix_FFs + 1, | |
| 289 std::numeric_limits<uint8_t>::max()); | |
| 290 } else { | |
| 291 // Prepend FF digits to match those in |reference|, then something larger | |
| 292 // than the first non-FF digit in |reference|. | |
| 293 char gt_digit = static_cast<uint8_t>(reference[ref_FFs]) + | |
| 294 (std::numeric_limits<uint8_t>::max() - | |
| 295 static_cast<uint8_t>(reference[ref_FFs]) + 1) / | |
| 296 2; | |
| 297 return std::string(ref_FFs, std::numeric_limits<uint8_t>::max()) + gt_digit; | |
| 298 } | |
| 299 } | |
| 300 | |
| 301 // static | |
| 302 std::string UniquePosition::FindBetweenWithSuffix( | |
| 303 const std::string& before, | |
| 304 const std::string& after, | |
| 305 const std::string& suffix) { | |
| 306 DCHECK(IsValidSuffix(suffix)); | |
| 307 DCHECK_NE(before, after); | |
| 308 DCHECK_LT(before, after); | |
| 309 | |
| 310 std::string mid; | |
| 311 | |
| 312 // Sometimes our suffix puts us where we want to be. | |
| 313 if (before < suffix && suffix < after) { | |
| 314 return std::string(); | |
| 315 } | |
| 316 | |
| 317 size_t i = 0; | |
| 318 for ( ; i < std::min(before.length(), after.length()); ++i) { | |
| 319 uint8_t a_digit = before[i]; | |
| 320 uint8_t b_digit = after[i]; | |
| 321 | |
| 322 if (b_digit - a_digit >= 2) { | |
| 323 mid.push_back(a_digit + (b_digit - a_digit)/2); | |
| 324 return mid; | |
| 325 } else if (a_digit == b_digit) { | |
| 326 mid.push_back(a_digit); | |
| 327 | |
| 328 // Both strings are equal so far. Will appending the suffix at this point | |
| 329 // give us the comparison we're looking for? | |
| 330 if (before.substr(i+1) < suffix && suffix < after.substr(i+1)) { | |
| 331 return mid; | |
| 332 } | |
| 333 } else { | |
| 334 DCHECK_EQ(b_digit - a_digit, 1); // Implied by above if branches. | |
| 335 | |
| 336 // The two options are off by one digit. The choice of whether to round | |
| 337 // up or down here will have consequences on what we do with the remaining | |
| 338 // digits. Exploring both options is an optimization and is not required | |
| 339 // for the correctness of this algorithm. | |
| 340 | |
| 341 // Option A: Round down the current digit. This makes our |mid| < | |
| 342 // |after|, no matter what we append afterwards. We then focus on | |
| 343 // appending digits until |mid| > |before|. | |
| 344 std::string mid_a = mid; | |
| 345 mid_a.push_back(a_digit); | |
| 346 mid_a.append(FindGreaterWithSuffix(before.substr(i+1), suffix)); | |
| 347 | |
| 348 // Option B: Round up the current digit. This makes our |mid| > |before|, | |
| 349 // no matter what we append afterwards. We then focus on appending digits | |
| 350 // until |mid| < |after|. Note that this option may not be viable if the | |
| 351 // current digit is the last one in |after|, so we skip the option in that | |
| 352 // case. | |
| 353 if (after.length() > i+1) { | |
| 354 std::string mid_b = mid; | |
| 355 mid_b.push_back(b_digit); | |
| 356 mid_b.append(FindSmallerWithSuffix(after.substr(i+1), suffix)); | |
| 357 | |
| 358 // Does this give us a shorter position value? If so, use it. | |
| 359 if (mid_b.length() < mid_a.length()) { | |
| 360 return mid_b; | |
| 361 } | |
| 362 } | |
| 363 return mid_a; | |
| 364 } | |
| 365 } | |
| 366 | |
| 367 // If we haven't found a midpoint yet, the following must be true: | |
| 368 DCHECK_EQ(before.substr(0, i), after.substr(0, i)); | |
| 369 DCHECK_EQ(before, mid); | |
| 370 DCHECK_LT(before.length(), after.length()); | |
| 371 | |
| 372 // We know that we'll need to append at least one more byte to |mid| in the | |
| 373 // process of making it < |after|. Appending any digit, regardless of the | |
| 374 // value, will make |before| < |mid|. Therefore, the following will get us a | |
| 375 // valid position. | |
| 376 | |
| 377 mid.append(FindSmallerWithSuffix(after.substr(i), suffix)); | |
| 378 return mid; | |
| 379 } | |
| 380 | |
| 381 UniquePosition::UniquePosition(const std::string& internal_rep) | |
| 382 : compressed_(internal_rep), | |
| 383 is_valid_(IsValidBytes(Uncompress(internal_rep))) { | |
| 384 } | |
| 385 | |
| 386 UniquePosition::UniquePosition( | |
| 387 const std::string& uncompressed, | |
| 388 const std::string& suffix) | |
| 389 : compressed_(Compress(uncompressed)), | |
| 390 is_valid_(IsValidBytes(uncompressed)) { | |
| 391 DCHECK(uncompressed.rfind(suffix) + kSuffixLength == uncompressed.length()); | |
| 392 DCHECK(IsValidSuffix(suffix)); | |
| 393 DCHECK(IsValid()); | |
| 394 } | |
| 395 | |
| 396 // On custom compression: | |
| 397 // | |
| 398 // Let C(x) be the compression function and U(x) be the uncompression function. | |
| 399 // | |
| 400 // This compression scheme has a few special properties. For one, it is | |
| 401 // order-preserving. For any two valid position strings x and y: | |
| 402 // x < y <=> C(x) < C(y) | |
| 403 // This allows us keep the position strings compressed as we sort them. | |
| 404 // | |
| 405 // The compressed format and the decode algorithm: | |
| 406 // | |
| 407 // The compressed string is a series of blocks, almost all of which are 8 bytes | |
| 408 // in length. The only exception is the last block in the compressed string, | |
| 409 // which may be a remainder block, which has length no greater than 7. The | |
| 410 // full-length blocks are either repeated character blocks or plain data blocks. | |
| 411 // All blocks are entirely self-contained. Their decoded values are independent | |
| 412 // from that of their neighbours. | |
| 413 // | |
| 414 // A repeated character block is encoded into eight bytes and represents between | |
| 415 // 4 and 2^31 repeated instances of a given character in the unencoded stream. | |
| 416 // The encoding consists of a single character repeated four times, followed by | |
| 417 // an encoded count. The encoded count is stored as a big-endian 32 bit | |
| 418 // integer. There are 2^31 possible count values, and two encodings for each. | |
| 419 // The high encoding is 'enc = kuint32max - count'; the low encoding is 'enc = | |
| 420 // count'. At compression time, the algorithm will choose between the two | |
| 421 // encodings based on which of the two will maintain the appropriate sort | |
| 422 // ordering (by a process which will be described below). The decompression | |
| 423 // algorithm need not concern itself with which encoding was used; it needs only | |
| 424 // to decode it. The decoded value of this block is "count" instances of the | |
| 425 // character that was repeated four times in the first half of this block. | |
| 426 // | |
| 427 // A plain data block is encoded into eight bytes and represents exactly eight | |
| 428 // bytes of data in the unencoded stream. The plain data block must not begin | |
| 429 // with the same character repeated four times. It is allowed to contain such a | |
| 430 // four-character sequence, just not at the start of the block. The decoded | |
| 431 // value of a plain data block is identical to its encoded value. | |
| 432 // | |
| 433 // A remainder block has length of at most seven. It is a shorter version of | |
| 434 // the plain data block. It occurs only at the end of the encoded stream and | |
| 435 // represents exactly as many bytes of unencoded data as its own length. Like a | |
| 436 // plain data block, the remainder block never begins with the same character | |
| 437 // repeated four times. The decoded value of this block is identical to its | |
| 438 // encoded value. | |
| 439 // | |
| 440 // The encode algorithm: | |
| 441 // | |
| 442 // From the above description, it can be seen that there may be more than one | |
| 443 // way to encode a given input string. The encoder must be careful to choose | |
| 444 // the encoding that guarantees sort ordering. | |
| 445 // | |
| 446 // The rules for the encoder are as follows: | |
| 447 // 1. Iterate through the input string and produce output blocks one at a time. | |
| 448 // 2. Where possible (ie. where the next four bytes of input consist of the | |
| 449 // same character repeated four times), produce a repeated data block of | |
| 450 // maximum possible length. | |
| 451 // 3. If there is at least 8 bytes of data remaining and it is not possible | |
| 452 // to produce a repeated character block, produce a plain data block. | |
| 453 // 4. If there are less than 8 bytes of data remaining and it is not possible | |
| 454 // to produce a repeated character block, produce a remainder block. | |
| 455 // 5. When producing a repeated character block, the count encoding must be | |
| 456 // chosen in such a way that the sort ordering is maintained. The choice is | |
| 457 // best illustrated by way of example: | |
| 458 // | |
| 459 // When comparing two strings, the first of which begins with of 8 | |
| 460 // instances of the letter 'B' and the second with 10 instances of the | |
| 461 // letter 'B', which of the two should compare lower? The result depends | |
| 462 // on the 9th character of the first string, since it will be compared | |
| 463 // against the 9th 'B' in the second string. If that character is an 'A', | |
| 464 // then the first string will compare lower. If it is a 'C', then the | |
| 465 // first string will compare higher. | |
| 466 // | |
| 467 // The key insight is that the comparison value of a repeated character block | |
| 468 // depends on the value of the character that follows it. If the character | |
| 469 // follows the repeated character has a value greater than the repeated | |
| 470 // character itself, then a shorter run length should translate to a higher | |
| 471 // comparison value. Therefore, we encode its count using the low encoding. | |
| 472 // Similarly, if the following character is lower, we use the high encoding. | |
| 473 | |
| 474 namespace { | |
| 475 | |
| 476 // Appends an encoded run length to |output_str|. | |
| 477 static void WriteEncodedRunLength(uint32_t length, | |
| 478 bool high_encoding, | |
| 479 std::string* output_str) { | |
| 480 CHECK_GE(length, 4U); | |
| 481 CHECK_LT(length, 0x80000000); | |
| 482 | |
| 483 // Step 1: Invert the count, if necessary, to account for the following digit. | |
| 484 uint32_t encoded_length; | |
| 485 if (high_encoding) { | |
| 486 encoded_length = 0xffffffff - length; | |
| 487 } else { | |
| 488 encoded_length = length; | |
| 489 } | |
| 490 | |
| 491 // Step 2: Write it as big-endian so it compares correctly with memcmp(3). | |
| 492 output_str->append(1, 0xff & (encoded_length >> 24U)); | |
| 493 output_str->append(1, 0xff & (encoded_length >> 16U)); | |
| 494 output_str->append(1, 0xff & (encoded_length >> 8U)); | |
| 495 output_str->append(1, 0xff & (encoded_length >> 0U)); | |
| 496 } | |
| 497 | |
| 498 // Reads an encoded run length for |str| at position |i|. | |
| 499 static uint32_t ReadEncodedRunLength(const std::string& str, size_t i) { | |
| 500 DCHECK_LE(i + 4, str.length()); | |
| 501 | |
| 502 // Step 1: Extract the big-endian count. | |
| 503 uint32_t encoded_length = | |
| 504 ((uint8_t)(str[i + 3]) << 0) | ((uint8_t)(str[i + 2]) << 8) | | |
| 505 ((uint8_t)(str[i + 1]) << 16) | ((uint8_t)(str[i + 0]) << 24); | |
| 506 | |
| 507 // Step 2: If this was an inverted count, un-invert it. | |
| 508 uint32_t length; | |
| 509 if (encoded_length & 0x80000000) { | |
| 510 length = 0xffffffff - encoded_length; | |
| 511 } else { | |
| 512 length = encoded_length; | |
| 513 } | |
| 514 | |
| 515 return length; | |
| 516 } | |
| 517 | |
| 518 // A series of four identical chars at the beginning of a block indicates | |
| 519 // the beginning of a repeated character block. | |
| 520 static bool IsRepeatedCharPrefix(const std::string& chars, size_t start_index) { | |
| 521 return chars[start_index] == chars[start_index+1] | |
| 522 && chars[start_index] == chars[start_index+2] | |
| 523 && chars[start_index] == chars[start_index+3]; | |
| 524 } | |
| 525 | |
| 526 } // namespace | |
| 527 | |
| 528 // static | |
| 529 // Wraps the CompressImpl function with a bunch of DCHECKs. | |
| 530 std::string UniquePosition::Compress(const std::string& str) { | |
| 531 DCHECK(IsValidBytes(str)); | |
| 532 std::string compressed = CompressImpl(str); | |
| 533 DCHECK(IsValidCompressed(compressed)); | |
| 534 DCHECK_EQ(str, Uncompress(compressed)); | |
| 535 return compressed; | |
| 536 } | |
| 537 | |
| 538 // static | |
| 539 // Performs the order preserving run length compression of a given input string. | |
| 540 std::string UniquePosition::CompressImpl(const std::string& str) { | |
| 541 std::string output; | |
| 542 | |
| 543 // The compressed length will usually be at least as long as the suffix (28), | |
| 544 // since the suffix bytes are mostly random. Most are a few bytes longer; a | |
| 545 // small few are tens of bytes longer. Some early tests indicated that | |
| 546 // roughly 99% had length 40 or smaller. We guess that pre-sizing for 48 is a | |
| 547 // good trade-off, but that has not been confirmed through profiling. | |
| 548 output.reserve(48); | |
| 549 | |
| 550 // Each loop iteration will consume 8, or N bytes, where N >= 4 and is the | |
| 551 // length of a string of identical digits starting at i. | |
| 552 for (size_t i = 0; i < str.length(); ) { | |
| 553 if (i + 4 <= str.length() && IsRepeatedCharPrefix(str, i)) { | |
| 554 // Four identical bytes in a row at this position means that we must start | |
| 555 // a repeated character block. Begin by outputting those four bytes. | |
| 556 output.append(str, i, 4); | |
| 557 | |
| 558 // Determine the size of the run. | |
| 559 const char rep_digit = str[i]; | |
| 560 const size_t runs_until = str.find_first_not_of(rep_digit, i+4); | |
| 561 | |
| 562 // Handle the 'runs until end' special case specially. | |
| 563 size_t run_length; | |
| 564 bool encode_high; // True if the next byte is greater than |rep_digit|. | |
| 565 if (runs_until == std::string::npos) { | |
| 566 run_length = str.length() - i; | |
| 567 encode_high = false; | |
| 568 } else { | |
| 569 run_length = runs_until - i; | |
| 570 encode_high = static_cast<uint8_t>(str[runs_until]) > | |
| 571 static_cast<uint8_t>(rep_digit); | |
| 572 } | |
| 573 DCHECK_LT(run_length, | |
| 574 static_cast<size_t>(std::numeric_limits<int32_t>::max())) | |
| 575 << "This implementation can't encode run-lengths greater than 2^31."; | |
| 576 | |
| 577 WriteEncodedRunLength(run_length, encode_high, &output); | |
| 578 i += run_length; // Jump forward by the size of the run length. | |
| 579 } else { | |
| 580 // Output up to eight bytes without any encoding. | |
| 581 const size_t len = std::min(static_cast<size_t>(8), str.length() - i); | |
| 582 output.append(str, i, len); | |
| 583 i += len; // Jump forward by the amount of input consumed (usually 8). | |
| 584 } | |
| 585 } | |
| 586 | |
| 587 return output; | |
| 588 } | |
| 589 | |
| 590 // static | |
| 591 // Uncompresses strings that were compresed with UniquePosition::Compress. | |
| 592 std::string UniquePosition::Uncompress(const std::string& str) { | |
| 593 std::string output; | |
| 594 size_t i = 0; | |
| 595 // Iterate through the compressed string one block at a time. | |
| 596 for (i = 0; i + 8 <= str.length(); i += 8) { | |
| 597 if (IsRepeatedCharPrefix(str, i)) { | |
| 598 // Found a repeated character block. Expand it. | |
| 599 const char rep_digit = str[i]; | |
| 600 uint32_t length = ReadEncodedRunLength(str, i + 4); | |
| 601 output.append(length, rep_digit); | |
| 602 } else { | |
| 603 // Found a regular block. Copy it. | |
| 604 output.append(str, i, 8); | |
| 605 } | |
| 606 } | |
| 607 // Copy the remaining bytes that were too small to form a block. | |
| 608 output.append(str, i, std::string::npos); | |
| 609 return output; | |
| 610 } | |
| 611 | |
| 612 bool UniquePosition::IsValidCompressed(const std::string& str) { | |
| 613 for (size_t i = 0; i + 8 <= str.length(); i += 8) { | |
| 614 if (IsRepeatedCharPrefix(str, i)) { | |
| 615 uint32_t count = ReadEncodedRunLength(str, i + 4); | |
| 616 if (count < 4) { | |
| 617 // A repeated character block should at least represent the four | |
| 618 // characters that started it. | |
| 619 return false; | |
| 620 } | |
| 621 if (str[i] == str[i+4]) { | |
| 622 // Does the next digit after a count match the repeated character? Then | |
| 623 // this is not the highest possible count. | |
| 624 return false; | |
| 625 } | |
| 626 } | |
| 627 } | |
| 628 // We don't bother looking for the existence or checking the validity of | |
| 629 // any partial blocks. There's no way they could be invalid anyway. | |
| 630 return true; | |
| 631 } | |
| 632 | |
| 633 } // namespace syncer | |
| OLD | NEW |