OLD | NEW |
1 /* Copyright 2013 Google Inc. All Rights Reserved. | 1 /* Copyright 2013 Google Inc. All Rights Reserved. |
2 | 2 |
3 Distributed under MIT license. | 3 Distributed under MIT license. |
4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT | 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
5 */ | 5 */ |
6 | 6 |
7 // Heuristics for deciding about the UTF8-ness of strings. | 7 /* Heuristics for deciding about the UTF8-ness of strings. */ |
8 | 8 |
9 #include "./utf8_util.h" | 9 #include "./utf8_util.h" |
10 | 10 |
11 #include "./types.h" | 11 #include <brotli/types.h> |
12 | 12 |
13 namespace brotli { | 13 #if defined(__cplusplus) || defined(c_plusplus) |
| 14 extern "C" { |
| 15 #endif |
14 | 16 |
15 namespace { | 17 static size_t BrotliParseAsUTF8( |
16 | 18 int* symbol, const uint8_t* input, size_t size) { |
17 size_t ParseAsUTF8(int* symbol, const uint8_t* input, size_t size) { | 19 /* ASCII */ |
18 // ASCII | |
19 if ((input[0] & 0x80) == 0) { | 20 if ((input[0] & 0x80) == 0) { |
20 *symbol = input[0]; | 21 *symbol = input[0]; |
21 if (*symbol > 0) { | 22 if (*symbol > 0) { |
22 return 1; | 23 return 1; |
23 } | 24 } |
24 } | 25 } |
25 // 2-byte UTF8 | 26 /* 2-byte UTF8 */ |
26 if (size > 1u && | 27 if (size > 1u && |
27 (input[0] & 0xe0) == 0xc0 && | 28 (input[0] & 0xe0) == 0xc0 && |
28 (input[1] & 0xc0) == 0x80) { | 29 (input[1] & 0xc0) == 0x80) { |
29 *symbol = (((input[0] & 0x1f) << 6) | | 30 *symbol = (((input[0] & 0x1f) << 6) | |
30 (input[1] & 0x3f)); | 31 (input[1] & 0x3f)); |
31 if (*symbol > 0x7f) { | 32 if (*symbol > 0x7f) { |
32 return 2; | 33 return 2; |
33 } | 34 } |
34 } | 35 } |
35 // 3-byte UTF8 | 36 /* 3-byte UTF8 */ |
36 if (size > 2u && | 37 if (size > 2u && |
37 (input[0] & 0xf0) == 0xe0 && | 38 (input[0] & 0xf0) == 0xe0 && |
38 (input[1] & 0xc0) == 0x80 && | 39 (input[1] & 0xc0) == 0x80 && |
39 (input[2] & 0xc0) == 0x80) { | 40 (input[2] & 0xc0) == 0x80) { |
40 *symbol = (((input[0] & 0x0f) << 12) | | 41 *symbol = (((input[0] & 0x0f) << 12) | |
41 ((input[1] & 0x3f) << 6) | | 42 ((input[1] & 0x3f) << 6) | |
42 (input[2] & 0x3f)); | 43 (input[2] & 0x3f)); |
43 if (*symbol > 0x7ff) { | 44 if (*symbol > 0x7ff) { |
44 return 3; | 45 return 3; |
45 } | 46 } |
46 } | 47 } |
47 // 4-byte UTF8 | 48 /* 4-byte UTF8 */ |
48 if (size > 3u && | 49 if (size > 3u && |
49 (input[0] & 0xf8) == 0xf0 && | 50 (input[0] & 0xf8) == 0xf0 && |
50 (input[1] & 0xc0) == 0x80 && | 51 (input[1] & 0xc0) == 0x80 && |
51 (input[2] & 0xc0) == 0x80 && | 52 (input[2] & 0xc0) == 0x80 && |
52 (input[3] & 0xc0) == 0x80) { | 53 (input[3] & 0xc0) == 0x80) { |
53 *symbol = (((input[0] & 0x07) << 18) | | 54 *symbol = (((input[0] & 0x07) << 18) | |
54 ((input[1] & 0x3f) << 12) | | 55 ((input[1] & 0x3f) << 12) | |
55 ((input[2] & 0x3f) << 6) | | 56 ((input[2] & 0x3f) << 6) | |
56 (input[3] & 0x3f)); | 57 (input[3] & 0x3f)); |
57 if (*symbol > 0xffff && *symbol <= 0x10ffff) { | 58 if (*symbol > 0xffff && *symbol <= 0x10ffff) { |
58 return 4; | 59 return 4; |
59 } | 60 } |
60 } | 61 } |
61 // Not UTF8, emit a special symbol above the UTF8-code space | 62 /* Not UTF8, emit a special symbol above the UTF8-code space */ |
62 *symbol = 0x110000 | input[0]; | 63 *symbol = 0x110000 | input[0]; |
63 return 1; | 64 return 1; |
64 } | 65 } |
65 | 66 |
66 } // namespace | 67 /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/ |
67 | 68 BROTLI_BOOL BrotliIsMostlyUTF8( |
68 // Returns true if at least min_fraction of the data is UTF8-encoded. | 69 const uint8_t* data, const size_t pos, const size_t mask, |
69 bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask, | 70 const size_t length, const double min_fraction) { |
70 const size_t length, const double min_fraction) { | |
71 size_t size_utf8 = 0; | 71 size_t size_utf8 = 0; |
72 size_t i = 0; | 72 size_t i = 0; |
73 while (i < length) { | 73 while (i < length) { |
74 int symbol; | 74 int symbol; |
75 size_t bytes_read = ParseAsUTF8( | 75 size_t bytes_read = |
76 &symbol, &data[(pos + i) & mask], length - i); | 76 BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i); |
77 i += bytes_read; | 77 i += bytes_read; |
78 if (symbol < 0x110000) size_utf8 += bytes_read; | 78 if (symbol < 0x110000) size_utf8 += bytes_read; |
79 } | 79 } |
80 return size_utf8 > min_fraction * static_cast<double>(length); | 80 return TO_BROTLI_BOOL(size_utf8 > min_fraction * (double)length); |
81 } | 81 } |
82 | 82 |
83 } // namespace brotli | 83 #if defined(__cplusplus) || defined(c_plusplus) |
| 84 } /* extern "C" */ |
| 85 #endif |
OLD | NEW |