Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1017)

Side by Side Diff: third_party/brotli/enc/utf8_util.cc

Issue 2537133002: Update brotli to v1.0.0-snapshot. (Closed)
Patch Set: Fixed typo Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/brotli/enc/utf8_util.c ('k') | third_party/brotli/enc/write_bits.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /* Copyright 2013 Google Inc. All Rights Reserved.
2
3 Distributed under MIT license.
4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6
7 // Heuristics for deciding about the UTF8-ness of strings.
8
9 #include "./utf8_util.h"
10
11 #include "./types.h"
12
13 namespace brotli {
14
15 namespace {
16
// Decodes one symbol from the start of |input|, of which |size| bytes may be
// read, and stores it in |*symbol|. Returns the number of bytes consumed.
//
// A well-formed, shortest-form UTF-8 sequence of 1 to 4 bytes yields its code
// point (1..0x10ffff) and its length in bytes. Anything else -- a NUL byte,
// an overlong encoding, a sequence truncated by |size|, a stray continuation
// byte, or a value above 0x10ffff -- consumes exactly one byte and yields
// 0x110000 | input[0], a value above the Unicode code space that callers can
// recognise as "not UTF-8".
//
// Precondition: size >= 1, because input[0] is read unconditionally.
// Note: 3-byte sequences that decode to 0xd800..0xdfff are not rejected.
size_t ParseAsUTF8(int* symbol, const uint8_t* input, size_t size) {
  // ASCII (a NUL byte is deliberately not accepted here and falls through to
  // the "not UTF8" case at the bottom).
  if ((input[0] & 0x80) == 0) {
    *symbol = input[0];
    if (*symbol > 0) {
      return 1;
    }
  }
  // 2-byte UTF8
  if (size > 1u &&
      (input[0] & 0xe0) == 0xc0 &&
      (input[1] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x1f) << 6) |
               (input[1] & 0x3f));
    // Values that fit in one byte would be an overlong encoding.
    if (*symbol > 0x7f) {
      return 2;
    }
  }
  // 3-byte UTF8
  if (size > 2u &&
      (input[0] & 0xf0) == 0xe0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x0f) << 12) |
               ((input[1] & 0x3f) << 6) |
               (input[2] & 0x3f));
    // Values that fit in two bytes would be an overlong encoding.
    if (*symbol > 0x7ff) {
      return 3;
    }
  }
  // 4-byte UTF8
  if (size > 3u &&
      (input[0] & 0xf8) == 0xf0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80 &&
      (input[3] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x07) << 18) |
               ((input[1] & 0x3f) << 12) |
               ((input[2] & 0x3f) << 6) |
               (input[3] & 0x3f));
    // Reject overlong encodings and values beyond the last code point.
    if (*symbol > 0xffff && *symbol <= 0x10ffff) {
      return 4;
    }
  }
  // Not UTF8, emit a special symbol above the UTF8-code space
  *symbol = 0x110000 | input[0];
  return 1;
}
65
66 } // namespace
67
68 // Returns true if at least min_fraction of the data is UTF8-encoded.
69 bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask,
70 const size_t length, const double min_fraction) {
71 size_t size_utf8 = 0;
72 size_t i = 0;
73 while (i < length) {
74 int symbol;
75 size_t bytes_read = ParseAsUTF8(
76 &symbol, &data[(pos + i) & mask], length - i);
77 i += bytes_read;
78 if (symbol < 0x110000) size_utf8 += bytes_read;
79 }
80 return size_utf8 > min_fraction * static_cast<double>(length);
81 }
82
83 } // namespace brotli
OLDNEW
« no previous file with comments | « third_party/brotli/enc/utf8_util.c ('k') | third_party/brotli/enc/write_bits.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698