OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "sync/internal_api/public/base/unique_position.h" | |
6 | |
7 #include <stddef.h> | |
8 #include <stdint.h> | |
9 | |
10 #include <algorithm> | |
11 #include <limits> | |
12 | |
13 #include "base/logging.h" | |
14 #include "base/rand_util.h" | |
15 #include "base/stl_util.h" | |
16 #include "base/strings/string_number_conversions.h" | |
17 #include "sync/protocol/unique_position.pb.h" | |
18 #include "third_party/zlib/zlib.h" | |
19 | |
20 namespace syncer { | |
21 | |
22 const size_t UniquePosition::kSuffixLength = 28; | |
23 const size_t UniquePosition::kCompressBytesThreshold = 128; | |
24 | |
25 // static. | |
26 bool UniquePosition::IsValidSuffix(const std::string& suffix) { | |
27 // The suffix must be exactly the specified length, otherwise unique suffixes | |
28 // are not sufficient to guarantee unique positions (because prefix + suffix | |
29 // == p + refixsuffix). | |
30 return suffix.length() == kSuffixLength | |
31 && suffix[kSuffixLength-1] != 0; | |
32 } | |
33 | |
34 // static. | |
35 bool UniquePosition::IsValidBytes(const std::string& bytes) { | |
36 // The first condition ensures that our suffix uniqueness is sufficient to | |
37 // guarantee position uniqueness. Otherwise, it's possible the end of some | |
38 // prefix + some short suffix == some long suffix. | |
39 // The second condition ensures that FindSmallerWithSuffix can always return a | |
40 // result. | |
41 return bytes.length() >= kSuffixLength | |
42 && bytes[bytes.length()-1] != 0; | |
43 } | |
44 | |
45 // static. | |
46 std::string UniquePosition::RandomSuffix() { | |
47 // Users random data for all but the last byte. The last byte must not be | |
48 // zero. We arbitrarily set it to 0x7f. | |
49 return base::RandBytesAsString(kSuffixLength - 1) + "\x7f"; | |
50 } | |
51 | |
52 // static. | |
53 UniquePosition UniquePosition::CreateInvalid() { | |
54 UniquePosition pos; | |
55 DCHECK(!pos.IsValid()); | |
56 return pos; | |
57 } | |
58 | |
59 // static. | |
60 UniquePosition UniquePosition::FromProto(const sync_pb::UniquePosition& proto) { | |
61 if (proto.has_custom_compressed_v1()) { | |
62 return UniquePosition(proto.custom_compressed_v1()); | |
63 } else if (proto.has_value() && !proto.value().empty()) { | |
64 return UniquePosition(Compress(proto.value())); | |
65 } else if (proto.has_compressed_value() && proto.has_uncompressed_length()) { | |
66 uLongf uncompressed_len = proto.uncompressed_length(); | |
67 std::string un_gzipped; | |
68 | |
69 un_gzipped.resize(uncompressed_len); | |
70 int result = uncompress( | |
71 reinterpret_cast<Bytef*>(string_as_array(&un_gzipped)), | |
72 &uncompressed_len, | |
73 reinterpret_cast<const Bytef*>(proto.compressed_value().data()), | |
74 proto.compressed_value().size()); | |
75 if (result != Z_OK) { | |
76 DLOG(ERROR) << "Unzip failed " << result; | |
77 return UniquePosition::CreateInvalid(); | |
78 } | |
79 if (uncompressed_len != proto.uncompressed_length()) { | |
80 DLOG(ERROR) | |
81 << "Uncompressed length " << uncompressed_len | |
82 << " did not match specified length " << proto.uncompressed_length(); | |
83 return UniquePosition::CreateInvalid(); | |
84 } | |
85 return UniquePosition(Compress(un_gzipped)); | |
86 } else { | |
87 return UniquePosition::CreateInvalid(); | |
88 } | |
89 } | |
90 | |
91 // static. | |
92 UniquePosition UniquePosition::FromInt64(int64_t x, const std::string& suffix) { | |
93 uint64_t y = static_cast<uint64_t>(x); | |
94 y ^= 0x8000000000000000ULL; // Make it non-negative. | |
95 std::string bytes(8, 0); | |
96 for (int i = 7; i >= 0; --i) { | |
97 bytes[i] = static_cast<uint8_t>(y); | |
98 y >>= 8; | |
99 } | |
100 return UniquePosition(bytes + suffix, suffix); | |
101 } | |
102 | |
103 // static. | |
104 UniquePosition UniquePosition::InitialPosition( | |
105 const std::string& suffix) { | |
106 DCHECK(IsValidSuffix(suffix)); | |
107 return UniquePosition(suffix, suffix); | |
108 } | |
109 | |
110 // static. | |
111 UniquePosition UniquePosition::Before( | |
112 const UniquePosition& x, | |
113 const std::string& suffix) { | |
114 DCHECK(IsValidSuffix(suffix)); | |
115 DCHECK(x.IsValid()); | |
116 const std::string& before = FindSmallerWithSuffix( | |
117 Uncompress(x.compressed_), suffix); | |
118 return UniquePosition(before + suffix, suffix); | |
119 } | |
120 | |
121 // static. | |
122 UniquePosition UniquePosition::After( | |
123 const UniquePosition& x, | |
124 const std::string& suffix) { | |
125 DCHECK(IsValidSuffix(suffix)); | |
126 DCHECK(x.IsValid()); | |
127 const std::string& after = FindGreaterWithSuffix( | |
128 Uncompress(x.compressed_), suffix); | |
129 return UniquePosition(after + suffix, suffix); | |
130 } | |
131 | |
132 // static. | |
133 UniquePosition UniquePosition::Between( | |
134 const UniquePosition& before, | |
135 const UniquePosition& after, | |
136 const std::string& suffix) { | |
137 DCHECK(before.IsValid()); | |
138 DCHECK(after.IsValid()); | |
139 DCHECK(before.LessThan(after)); | |
140 DCHECK(IsValidSuffix(suffix)); | |
141 const std::string& mid = FindBetweenWithSuffix( | |
142 Uncompress(before.compressed_), | |
143 Uncompress(after.compressed_), | |
144 suffix); | |
145 return UniquePosition(mid + suffix, suffix); | |
146 } | |
147 | |
148 UniquePosition::UniquePosition() : is_valid_(false) {} | |
149 | |
150 bool UniquePosition::LessThan(const UniquePosition& other) const { | |
151 DCHECK(this->IsValid()); | |
152 DCHECK(other.IsValid()); | |
153 | |
154 return compressed_ < other.compressed_; | |
155 } | |
156 | |
157 bool UniquePosition::Equals(const UniquePosition& other) const { | |
158 if (!this->IsValid() && !other.IsValid()) | |
159 return true; | |
160 | |
161 return compressed_ == other.compressed_; | |
162 } | |
163 | |
164 void UniquePosition::ToProto(sync_pb::UniquePosition* proto) const { | |
165 proto->Clear(); | |
166 | |
167 // This is the current preferred foramt. | |
168 proto->set_custom_compressed_v1(compressed_); | |
169 | |
170 // Older clients used to write other formats. We don't bother doing that | |
171 // anymore because that form of backwards compatibility is expensive. We no | |
172 // longer want to pay that price just too support clients that have been | |
173 // obsolete for a long time. See the proto definition for details. | |
174 } | |
175 | |
176 void UniquePosition::SerializeToString(std::string* blob) const { | |
177 DCHECK(blob); | |
178 sync_pb::UniquePosition proto; | |
179 ToProto(&proto); | |
180 proto.SerializeToString(blob); | |
181 } | |
182 | |
183 int64_t UniquePosition::ToInt64() const { | |
184 uint64_t y = 0; | |
185 const std::string& s = Uncompress(compressed_); | |
186 size_t l = sizeof(int64_t); | |
187 if (s.length() < l) { | |
188 NOTREACHED(); | |
189 l = s.length(); | |
190 } | |
191 for (size_t i = 0; i < l; ++i) { | |
192 const uint8_t byte = s[l - i - 1]; | |
193 y |= static_cast<uint64_t>(byte) << (i * 8); | |
194 } | |
195 y ^= 0x8000000000000000ULL; | |
196 // This is technically implementation-defined if y > INT64_MAX, so | |
197 // we're assuming that we're on a twos-complement machine. | |
198 return static_cast<int64_t>(y); | |
199 } | |
200 | |
201 bool UniquePosition::IsValid() const { | |
202 return is_valid_; | |
203 } | |
204 | |
205 std::string UniquePosition::ToDebugString() const { | |
206 const std::string bytes = Uncompress(compressed_); | |
207 if (bytes.empty()) | |
208 return std::string("INVALID[]"); | |
209 | |
210 std::string debug_string = base::HexEncode(bytes.data(), bytes.length()); | |
211 if (!IsValid()) { | |
212 debug_string = "INVALID[" + debug_string + "]"; | |
213 } | |
214 | |
215 std::string compressed_string = | |
216 base::HexEncode(compressed_.data(), compressed_.length()); | |
217 debug_string.append(", compressed: " + compressed_string); | |
218 return debug_string; | |
219 } | |
220 | |
221 std::string UniquePosition::GetSuffixForTest() const { | |
222 const std::string bytes = Uncompress(compressed_); | |
223 const size_t prefix_len = bytes.length() - kSuffixLength; | |
224 return bytes.substr(prefix_len, std::string::npos); | |
225 } | |
226 | |
227 std::string UniquePosition::FindSmallerWithSuffix( | |
228 const std::string& reference, | |
229 const std::string& suffix) { | |
230 size_t ref_zeroes = reference.find_first_not_of('\0'); | |
231 size_t suffix_zeroes = suffix.find_first_not_of('\0'); | |
232 | |
233 // Neither of our inputs are allowed to have trailing zeroes, so the following | |
234 // must be true. | |
235 DCHECK_NE(ref_zeroes, std::string::npos); | |
236 DCHECK_NE(suffix_zeroes, std::string::npos); | |
237 | |
238 if (suffix_zeroes > ref_zeroes) { | |
239 // Implies suffix < ref. | |
240 return std::string(); | |
241 } | |
242 | |
243 if (suffix.substr(suffix_zeroes) < reference.substr(ref_zeroes)) { | |
244 // Prepend zeroes so the result has as many zero digits as |reference|. | |
245 return std::string(ref_zeroes - suffix_zeroes, '\0'); | |
246 } else if (suffix_zeroes > 1) { | |
247 // Prepend zeroes so the result has one more zero digit than |reference|. | |
248 // We could also take the "else" branch below, but taking this branch will | |
249 // give us a smaller result. | |
250 return std::string(ref_zeroes - suffix_zeroes + 1, '\0'); | |
251 } else { | |
252 // Prepend zeroes to match those in the |reference|, then something smaller | |
253 // than the first non-zero digit in |reference|. | |
254 char lt_digit = static_cast<uint8_t>(reference[ref_zeroes]) / 2; | |
255 return std::string(ref_zeroes, '\0') + lt_digit; | |
256 } | |
257 } | |
258 | |
259 // static | |
260 std::string UniquePosition::FindGreaterWithSuffix( | |
261 const std::string& reference, | |
262 const std::string& suffix) { | |
263 size_t ref_FFs = | |
264 reference.find_first_not_of(std::numeric_limits<uint8_t>::max()); | |
265 size_t suffix_FFs = | |
266 suffix.find_first_not_of(std::numeric_limits<uint8_t>::max()); | |
267 | |
268 if (ref_FFs == std::string::npos) { | |
269 ref_FFs = reference.length(); | |
270 } | |
271 if (suffix_FFs == std::string::npos) { | |
272 suffix_FFs = suffix.length(); | |
273 } | |
274 | |
275 if (suffix_FFs > ref_FFs) { | |
276 // Implies suffix > reference. | |
277 return std::string(); | |
278 } | |
279 | |
280 if (suffix.substr(suffix_FFs) > reference.substr(ref_FFs)) { | |
281 // Prepend FF digits to match those in |reference|. | |
282 return std::string(ref_FFs - suffix_FFs, | |
283 std::numeric_limits<uint8_t>::max()); | |
284 } else if (suffix_FFs > 1) { | |
285 // Prepend enough leading FF digits so result has one more of them than | |
286 // |reference| does. We could also take the "else" branch below, but this | |
287 // gives us a smaller result. | |
288 return std::string(ref_FFs - suffix_FFs + 1, | |
289 std::numeric_limits<uint8_t>::max()); | |
290 } else { | |
291 // Prepend FF digits to match those in |reference|, then something larger | |
292 // than the first non-FF digit in |reference|. | |
293 char gt_digit = static_cast<uint8_t>(reference[ref_FFs]) + | |
294 (std::numeric_limits<uint8_t>::max() - | |
295 static_cast<uint8_t>(reference[ref_FFs]) + 1) / | |
296 2; | |
297 return std::string(ref_FFs, std::numeric_limits<uint8_t>::max()) + gt_digit; | |
298 } | |
299 } | |
300 | |
301 // static | |
302 std::string UniquePosition::FindBetweenWithSuffix( | |
303 const std::string& before, | |
304 const std::string& after, | |
305 const std::string& suffix) { | |
306 DCHECK(IsValidSuffix(suffix)); | |
307 DCHECK_NE(before, after); | |
308 DCHECK_LT(before, after); | |
309 | |
310 std::string mid; | |
311 | |
312 // Sometimes our suffix puts us where we want to be. | |
313 if (before < suffix && suffix < after) { | |
314 return std::string(); | |
315 } | |
316 | |
317 size_t i = 0; | |
318 for ( ; i < std::min(before.length(), after.length()); ++i) { | |
319 uint8_t a_digit = before[i]; | |
320 uint8_t b_digit = after[i]; | |
321 | |
322 if (b_digit - a_digit >= 2) { | |
323 mid.push_back(a_digit + (b_digit - a_digit)/2); | |
324 return mid; | |
325 } else if (a_digit == b_digit) { | |
326 mid.push_back(a_digit); | |
327 | |
328 // Both strings are equal so far. Will appending the suffix at this point | |
329 // give us the comparison we're looking for? | |
330 if (before.substr(i+1) < suffix && suffix < after.substr(i+1)) { | |
331 return mid; | |
332 } | |
333 } else { | |
334 DCHECK_EQ(b_digit - a_digit, 1); // Implied by above if branches. | |
335 | |
336 // The two options are off by one digit. The choice of whether to round | |
337 // up or down here will have consequences on what we do with the remaining | |
338 // digits. Exploring both options is an optimization and is not required | |
339 // for the correctness of this algorithm. | |
340 | |
341 // Option A: Round down the current digit. This makes our |mid| < | |
342 // |after|, no matter what we append afterwards. We then focus on | |
343 // appending digits until |mid| > |before|. | |
344 std::string mid_a = mid; | |
345 mid_a.push_back(a_digit); | |
346 mid_a.append(FindGreaterWithSuffix(before.substr(i+1), suffix)); | |
347 | |
348 // Option B: Round up the current digit. This makes our |mid| > |before|, | |
349 // no matter what we append afterwards. We then focus on appending digits | |
350 // until |mid| < |after|. Note that this option may not be viable if the | |
351 // current digit is the last one in |after|, so we skip the option in that | |
352 // case. | |
353 if (after.length() > i+1) { | |
354 std::string mid_b = mid; | |
355 mid_b.push_back(b_digit); | |
356 mid_b.append(FindSmallerWithSuffix(after.substr(i+1), suffix)); | |
357 | |
358 // Does this give us a shorter position value? If so, use it. | |
359 if (mid_b.length() < mid_a.length()) { | |
360 return mid_b; | |
361 } | |
362 } | |
363 return mid_a; | |
364 } | |
365 } | |
366 | |
367 // If we haven't found a midpoint yet, the following must be true: | |
368 DCHECK_EQ(before.substr(0, i), after.substr(0, i)); | |
369 DCHECK_EQ(before, mid); | |
370 DCHECK_LT(before.length(), after.length()); | |
371 | |
372 // We know that we'll need to append at least one more byte to |mid| in the | |
373 // process of making it < |after|. Appending any digit, regardless of the | |
374 // value, will make |before| < |mid|. Therefore, the following will get us a | |
375 // valid position. | |
376 | |
377 mid.append(FindSmallerWithSuffix(after.substr(i), suffix)); | |
378 return mid; | |
379 } | |
380 | |
381 UniquePosition::UniquePosition(const std::string& internal_rep) | |
382 : compressed_(internal_rep), | |
383 is_valid_(IsValidBytes(Uncompress(internal_rep))) { | |
384 } | |
385 | |
386 UniquePosition::UniquePosition( | |
387 const std::string& uncompressed, | |
388 const std::string& suffix) | |
389 : compressed_(Compress(uncompressed)), | |
390 is_valid_(IsValidBytes(uncompressed)) { | |
391 DCHECK(uncompressed.rfind(suffix) + kSuffixLength == uncompressed.length()); | |
392 DCHECK(IsValidSuffix(suffix)); | |
393 DCHECK(IsValid()); | |
394 } | |
395 | |
396 // On custom compression: | |
397 // | |
398 // Let C(x) be the compression function and U(x) be the uncompression function. | |
399 // | |
400 // This compression scheme has a few special properties. For one, it is | |
401 // order-preserving. For any two valid position strings x and y: | |
402 // x < y <=> C(x) < C(y) | |
403 // This allows us keep the position strings compressed as we sort them. | |
404 // | |
405 // The compressed format and the decode algorithm: | |
406 // | |
407 // The compressed string is a series of blocks, almost all of which are 8 bytes | |
408 // in length. The only exception is the last block in the compressed string, | |
409 // which may be a remainder block, which has length no greater than 7. The | |
410 // full-length blocks are either repeated character blocks or plain data blocks. | |
411 // All blocks are entirely self-contained. Their decoded values are independent | |
412 // from that of their neighbours. | |
413 // | |
414 // A repeated character block is encoded into eight bytes and represents between | |
415 // 4 and 2^31 repeated instances of a given character in the unencoded stream. | |
416 // The encoding consists of a single character repeated four times, followed by | |
417 // an encoded count. The encoded count is stored as a big-endian 32 bit | |
418 // integer. There are 2^31 possible count values, and two encodings for each. | |
419 // The high encoding is 'enc = kuint32max - count'; the low encoding is 'enc = | |
420 // count'. At compression time, the algorithm will choose between the two | |
421 // encodings based on which of the two will maintain the appropriate sort | |
422 // ordering (by a process which will be described below). The decompression | |
423 // algorithm need not concern itself with which encoding was used; it needs only | |
424 // to decode it. The decoded value of this block is "count" instances of the | |
425 // character that was repeated four times in the first half of this block. | |
426 // | |
427 // A plain data block is encoded into eight bytes and represents exactly eight | |
428 // bytes of data in the unencoded stream. The plain data block must not begin | |
429 // with the same character repeated four times. It is allowed to contain such a | |
430 // four-character sequence, just not at the start of the block. The decoded | |
431 // value of a plain data block is identical to its encoded value. | |
432 // | |
433 // A remainder block has length of at most seven. It is a shorter version of | |
434 // the plain data block. It occurs only at the end of the encoded stream and | |
435 // represents exactly as many bytes of unencoded data as its own length. Like a | |
436 // plain data block, the remainder block never begins with the same character | |
437 // repeated four times. The decoded value of this block is identical to its | |
438 // encoded value. | |
439 // | |
440 // The encode algorithm: | |
441 // | |
442 // From the above description, it can be seen that there may be more than one | |
443 // way to encode a given input string. The encoder must be careful to choose | |
444 // the encoding that guarantees sort ordering. | |
445 // | |
446 // The rules for the encoder are as follows: | |
447 // 1. Iterate through the input string and produce output blocks one at a time. | |
448 // 2. Where possible (ie. where the next four bytes of input consist of the | |
449 // same character repeated four times), produce a repeated data block of | |
450 // maximum possible length. | |
451 // 3. If there is at least 8 bytes of data remaining and it is not possible | |
452 // to produce a repeated character block, produce a plain data block. | |
453 // 4. If there are less than 8 bytes of data remaining and it is not possible | |
454 // to produce a repeated character block, produce a remainder block. | |
455 // 5. When producing a repeated character block, the count encoding must be | |
456 // chosen in such a way that the sort ordering is maintained. The choice is | |
457 // best illustrated by way of example: | |
458 // | |
459 // When comparing two strings, the first of which begins with of 8 | |
460 // instances of the letter 'B' and the second with 10 instances of the | |
461 // letter 'B', which of the two should compare lower? The result depends | |
462 // on the 9th character of the first string, since it will be compared | |
463 // against the 9th 'B' in the second string. If that character is an 'A', | |
464 // then the first string will compare lower. If it is a 'C', then the | |
465 // first string will compare higher. | |
466 // | |
467 // The key insight is that the comparison value of a repeated character block | |
468 // depends on the value of the character that follows it. If the character | |
469 // follows the repeated character has a value greater than the repeated | |
470 // character itself, then a shorter run length should translate to a higher | |
471 // comparison value. Therefore, we encode its count using the low encoding. | |
472 // Similarly, if the following character is lower, we use the high encoding. | |
473 | |
474 namespace { | |
475 | |
476 // Appends an encoded run length to |output_str|. | |
477 static void WriteEncodedRunLength(uint32_t length, | |
478 bool high_encoding, | |
479 std::string* output_str) { | |
480 CHECK_GE(length, 4U); | |
481 CHECK_LT(length, 0x80000000); | |
482 | |
483 // Step 1: Invert the count, if necessary, to account for the following digit. | |
484 uint32_t encoded_length; | |
485 if (high_encoding) { | |
486 encoded_length = 0xffffffff - length; | |
487 } else { | |
488 encoded_length = length; | |
489 } | |
490 | |
491 // Step 2: Write it as big-endian so it compares correctly with memcmp(3). | |
492 output_str->append(1, 0xff & (encoded_length >> 24U)); | |
493 output_str->append(1, 0xff & (encoded_length >> 16U)); | |
494 output_str->append(1, 0xff & (encoded_length >> 8U)); | |
495 output_str->append(1, 0xff & (encoded_length >> 0U)); | |
496 } | |
497 | |
498 // Reads an encoded run length for |str| at position |i|. | |
499 static uint32_t ReadEncodedRunLength(const std::string& str, size_t i) { | |
500 DCHECK_LE(i + 4, str.length()); | |
501 | |
502 // Step 1: Extract the big-endian count. | |
503 uint32_t encoded_length = | |
504 ((uint8_t)(str[i + 3]) << 0) | ((uint8_t)(str[i + 2]) << 8) | | |
505 ((uint8_t)(str[i + 1]) << 16) | ((uint8_t)(str[i + 0]) << 24); | |
506 | |
507 // Step 2: If this was an inverted count, un-invert it. | |
508 uint32_t length; | |
509 if (encoded_length & 0x80000000) { | |
510 length = 0xffffffff - encoded_length; | |
511 } else { | |
512 length = encoded_length; | |
513 } | |
514 | |
515 return length; | |
516 } | |
517 | |
518 // A series of four identical chars at the beginning of a block indicates | |
519 // the beginning of a repeated character block. | |
520 static bool IsRepeatedCharPrefix(const std::string& chars, size_t start_index) { | |
521 return chars[start_index] == chars[start_index+1] | |
522 && chars[start_index] == chars[start_index+2] | |
523 && chars[start_index] == chars[start_index+3]; | |
524 } | |
525 | |
526 } // namespace | |
527 | |
528 // static | |
529 // Wraps the CompressImpl function with a bunch of DCHECKs. | |
530 std::string UniquePosition::Compress(const std::string& str) { | |
531 DCHECK(IsValidBytes(str)); | |
532 std::string compressed = CompressImpl(str); | |
533 DCHECK(IsValidCompressed(compressed)); | |
534 DCHECK_EQ(str, Uncompress(compressed)); | |
535 return compressed; | |
536 } | |
537 | |
538 // static | |
539 // Performs the order preserving run length compression of a given input string. | |
540 std::string UniquePosition::CompressImpl(const std::string& str) { | |
541 std::string output; | |
542 | |
543 // The compressed length will usually be at least as long as the suffix (28), | |
544 // since the suffix bytes are mostly random. Most are a few bytes longer; a | |
545 // small few are tens of bytes longer. Some early tests indicated that | |
546 // roughly 99% had length 40 or smaller. We guess that pre-sizing for 48 is a | |
547 // good trade-off, but that has not been confirmed through profiling. | |
548 output.reserve(48); | |
549 | |
550 // Each loop iteration will consume 8, or N bytes, where N >= 4 and is the | |
551 // length of a string of identical digits starting at i. | |
552 for (size_t i = 0; i < str.length(); ) { | |
553 if (i + 4 <= str.length() && IsRepeatedCharPrefix(str, i)) { | |
554 // Four identical bytes in a row at this position means that we must start | |
555 // a repeated character block. Begin by outputting those four bytes. | |
556 output.append(str, i, 4); | |
557 | |
558 // Determine the size of the run. | |
559 const char rep_digit = str[i]; | |
560 const size_t runs_until = str.find_first_not_of(rep_digit, i+4); | |
561 | |
562 // Handle the 'runs until end' special case specially. | |
563 size_t run_length; | |
564 bool encode_high; // True if the next byte is greater than |rep_digit|. | |
565 if (runs_until == std::string::npos) { | |
566 run_length = str.length() - i; | |
567 encode_high = false; | |
568 } else { | |
569 run_length = runs_until - i; | |
570 encode_high = static_cast<uint8_t>(str[runs_until]) > | |
571 static_cast<uint8_t>(rep_digit); | |
572 } | |
573 DCHECK_LT(run_length, | |
574 static_cast<size_t>(std::numeric_limits<int32_t>::max())) | |
575 << "This implementation can't encode run-lengths greater than 2^31."; | |
576 | |
577 WriteEncodedRunLength(run_length, encode_high, &output); | |
578 i += run_length; // Jump forward by the size of the run length. | |
579 } else { | |
580 // Output up to eight bytes without any encoding. | |
581 const size_t len = std::min(static_cast<size_t>(8), str.length() - i); | |
582 output.append(str, i, len); | |
583 i += len; // Jump forward by the amount of input consumed (usually 8). | |
584 } | |
585 } | |
586 | |
587 return output; | |
588 } | |
589 | |
590 // static | |
591 // Uncompresses strings that were compresed with UniquePosition::Compress. | |
592 std::string UniquePosition::Uncompress(const std::string& str) { | |
593 std::string output; | |
594 size_t i = 0; | |
595 // Iterate through the compressed string one block at a time. | |
596 for (i = 0; i + 8 <= str.length(); i += 8) { | |
597 if (IsRepeatedCharPrefix(str, i)) { | |
598 // Found a repeated character block. Expand it. | |
599 const char rep_digit = str[i]; | |
600 uint32_t length = ReadEncodedRunLength(str, i + 4); | |
601 output.append(length, rep_digit); | |
602 } else { | |
603 // Found a regular block. Copy it. | |
604 output.append(str, i, 8); | |
605 } | |
606 } | |
607 // Copy the remaining bytes that were too small to form a block. | |
608 output.append(str, i, std::string::npos); | |
609 return output; | |
610 } | |
611 | |
612 bool UniquePosition::IsValidCompressed(const std::string& str) { | |
613 for (size_t i = 0; i + 8 <= str.length(); i += 8) { | |
614 if (IsRepeatedCharPrefix(str, i)) { | |
615 uint32_t count = ReadEncodedRunLength(str, i + 4); | |
616 if (count < 4) { | |
617 // A repeated character block should at least represent the four | |
618 // characters that started it. | |
619 return false; | |
620 } | |
621 if (str[i] == str[i+4]) { | |
622 // Does the next digit after a count match the repeated character? Then | |
623 // this is not the highest possible count. | |
624 return false; | |
625 } | |
626 } | |
627 } | |
628 // We don't bother looking for the existence or checking the validity of | |
629 // any partial blocks. There's no way they could be invalid anyway. | |
630 return true; | |
631 } | |
632 | |
633 } // namespace syncer | |
OLD | NEW |