OLD | NEW |
| (Empty) |
/* Copyright 2013 Google Inc. All Rights Reserved.

   Distributed under MIT license.
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
*/

// Heuristics for deciding about the UTF8-ness of strings.
8 | |
9 #include "./utf8_util.h" | |
10 | |
11 #include "./types.h" | |
12 | |
namespace brotli {

namespace {

// Tries to decode one UTF8-encoded code point from the start of |input|.
//
// symbol: out-parameter; receives the decoded code point on success, or
//         0x110000 | input[0] (a value above the Unicode code space) when
//         the bytes at |input| do not form an accepted sequence.
// input:  bytes to decode; input[0] is always read, so |size| must be >= 1.
// size:   number of bytes available at |input|; longer encodings are only
//         attempted when enough bytes are available.
//
// Returns the number of bytes consumed: 1..4 for an accepted sequence, and
// always 1 for a rejected one, so a caller's cursor always advances.
//
// Overlong encodings are rejected (each branch requires the decoded value to
// exceed the range of the shorter forms), as are values above 0x10ffff and
// the NUL byte. Surrogate code points in 3-byte form are not filtered out.
size_t ParseAsUTF8(int* symbol, const uint8_t* input, size_t size) {
  // ASCII
  if ((input[0] & 0x80) == 0) {
    *symbol = input[0];
    // A zero byte is deliberately not accepted here; it falls through to the
    // "not UTF8" case at the bottom since no multi-byte lead pattern matches.
    if (*symbol > 0) {
      return 1;
    }
  }
  // 2-byte UTF8
  if (size > 1u &&
      (input[0] & 0xe0) == 0xc0 &&
      (input[1] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x1f) << 6) |
               (input[1] & 0x3f));
    // Values <= 0x7f would be an overlong encoding of an ASCII character.
    if (*symbol > 0x7f) {
      return 2;
    }
  }
  // 3-byte UTF8
  if (size > 2u &&
      (input[0] & 0xf0) == 0xe0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x0f) << 12) |
               ((input[1] & 0x3f) << 6) |
               (input[2] & 0x3f));
    // Values <= 0x7ff fit in two bytes, so this form would be overlong.
    if (*symbol > 0x7ff) {
      return 3;
    }
  }
  // 4-byte UTF8
  if (size > 3u &&
      (input[0] & 0xf8) == 0xf0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80 &&
      (input[3] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x07) << 18) |
               ((input[1] & 0x3f) << 12) |
               ((input[2] & 0x3f) << 6) |
               (input[3] & 0x3f));
    // Reject overlong forms and anything beyond the last Unicode code point.
    if (*symbol > 0xffff && *symbol <= 0x10ffff) {
      return 4;
    }
  }
  // Not UTF8, emit a special symbol above the UTF8-code space
  *symbol = 0x110000 | input[0];
  return 1;
}

}  // namespace

// Returns true if more than min_fraction of the data is UTF8-encoded.
//
// Examines |length| bytes, where the i-th byte is read starting at
// data[(pos + i) & mask], and counts the bytes that belong to sequences
// accepted by ParseAsUTF8. The result is true when that count is strictly
// greater than min_fraction * length; in particular it is false for
// length == 0.
//
// NOTE(review): presumably |data| is a ring buffer of size mask + 1. Only the
// start of each sequence is masked; the following bytes of a multi-byte
// sequence are read contiguously, so bytes just past the wrap point must be
// readable -- confirm against the caller.
bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask,
                  const size_t length, const double min_fraction) {
  size_t size_utf8 = 0;
  size_t i = 0;
  while (i < length) {
    int symbol;
    // length - i is always >= 1 here, satisfying ParseAsUTF8's precondition.
    size_t bytes_read = ParseAsUTF8(
        &symbol, &data[(pos + i) & mask], length - i);
    i += bytes_read;
    // Symbols at or above 0x110000 mark bytes that failed to parse.
    if (symbol < 0x110000) size_utf8 += bytes_read;
  }
  return size_utf8 > min_fraction * static_cast<double>(length);
}

}  // namespace brotli
OLD | NEW |