| OLD | NEW |
| (Empty) | |
| 1 /* Copyright 2013 Google Inc. All Rights Reserved. |
| 2 |
| 3 Distributed under MIT license. |
| 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
| 5 */ |
| 6 |
| 7 // Heuristics for deciding about the UTF8-ness of strings. |
| 8 |
| 9 #include "./utf8_util.h" |
| 10 |
| 11 #include "./types.h" |
| 12 |
| 13 namespace brotli { |
| 14 |
| 15 namespace { |
| 16 |
// Returns true when |byte| has the 10xxxxxx bit pattern of a UTF-8
// continuation byte.
inline bool IsContinuationByte(uint8_t byte) {
  return (byte & 0xc0) == 0x80;
}

// Decodes one symbol from the front of |input|, of which at most |size| bytes
// are examined. input[0] is always read, so |input| must hold at least one
// byte.
//
// On success stores the decoded code point in |*symbol| and returns the
// number of bytes the sequence occupies (1 to 4). A sequence is rejected when
// it is truncated, has a malformed continuation byte, is an overlong
// encoding, decodes to a value above 0x10ffff, or is a single NUL byte.
// On rejection stores 0x110000 | input[0] in |*symbol|, a value above the
// code point range, and returns 1 so the caller advances by one byte.
size_t ParseAsUTF8(int* symbol, const uint8_t* input, size_t size) {
  const int lead = input[0];

  if ((lead & 0x80) == 0) {
    // 1-byte UTF8 (ASCII). NUL is not accepted and falls through to the
    // rejection path below.
    if (lead > 0) {
      *symbol = lead;
      return 1;
    }
  } else if ((lead & 0xe0) == 0xc0) {
    // 2-byte UTF8: 110xxxxx 10xxxxxx.
    if (size > 1u && IsContinuationByte(input[1])) {
      const int decoded = ((lead & 0x1f) << 6) | (input[1] & 0x3f);
      // Values up to 0x7f fit in one byte, so this would be overlong.
      if (decoded > 0x7f) {
        *symbol = decoded;
        return 2;
      }
    }
  } else if ((lead & 0xf0) == 0xe0) {
    // 3-byte UTF8: 1110xxxx 10xxxxxx 10xxxxxx.
    if (size > 2u &&
        IsContinuationByte(input[1]) &&
        IsContinuationByte(input[2])) {
      const int decoded = ((lead & 0x0f) << 12) |
                          ((input[1] & 0x3f) << 6) |
                          (input[2] & 0x3f);
      // Values up to 0x7ff fit in two bytes, so this would be overlong.
      if (decoded > 0x7ff) {
        *symbol = decoded;
        return 3;
      }
    }
  } else if ((lead & 0xf8) == 0xf0) {
    // 4-byte UTF8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    if (size > 3u &&
        IsContinuationByte(input[1]) &&
        IsContinuationByte(input[2]) &&
        IsContinuationByte(input[3])) {
      const int decoded = ((lead & 0x07) << 18) |
                          ((input[1] & 0x3f) << 12) |
                          ((input[2] & 0x3f) << 6) |
                          (input[3] & 0x3f);
      // Must need four bytes and must not exceed the last code point.
      if (decoded > 0xffff && decoded <= 0x10ffff) {
        *symbol = decoded;
        return 4;
      }
    }
  }

  // Not UTF8, emit a special symbol above the UTF8-code space.
  *symbol = 0x110000 | lead;
  return 1;
}
| 65 |
| 66 } // namespace |
| 67 |
| 68 // Returns true if at least min_fraction of the data is UTF8-encoded. |
| 69 bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask, |
| 70 const size_t length, const double min_fraction) { |
| 71 size_t size_utf8 = 0; |
| 72 size_t i = 0; |
| 73 while (i < length) { |
| 74 int symbol; |
| 75 size_t bytes_read = ParseAsUTF8( |
| 76 &symbol, &data[(pos + i) & mask], length - i); |
| 77 i += bytes_read; |
| 78 if (symbol < 0x110000) size_utf8 += bytes_read; |
| 79 } |
| 80 return size_utf8 > min_fraction * static_cast<double>(length); |
| 81 } |
| 82 |
| 83 } // namespace brotli |
| OLD | NEW |