| OLD | NEW |
| (Empty) |
| 1 /* Copyright 2013 Google Inc. All Rights Reserved. | |
| 2 | |
| 3 Distributed under MIT license. | |
| 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT | |
| 5 */ | |
| 6 | |
| 7 // Heuristics for deciding about the UTF8-ness of strings. | |
| 8 | |
| 9 #include "./utf8_util.h" | |
| 10 | |
| 11 #include "./types.h" | |
| 12 | |
| 13 namespace brotli { | |
| 14 | |
| 15 namespace { | |
| 16 | |
// Returns true when |byte| has the 10xxxxxx bit pattern of a UTF-8
// continuation byte.
bool IsContinuationByte(uint8_t byte) {
  return (byte & 0xc0) == 0x80;
}

// Decodes a single symbol from the front of |input|.
//
// |input[0]| is always read; bytes at index 1..3 are read only when |size|
// says they are available. On success |*symbol| receives the decoded code
// point and the number of bytes consumed (1..4) is returned. Sequences that
// are truncated, use a longer encoding than necessary, or decode to a value
// above 0x10ffff are rejected, as is a zero byte. No check is made for the
// surrogate range in the 3-byte form.
//
// On rejection exactly one byte is consumed and |*symbol| is set to
// 0x110000 | input[0], a value above the UTF-8 code space, so the caller can
// tell the two outcomes apart by comparing |*symbol| against 0x110000.
size_t ParseAsUTF8(int* symbol, const uint8_t* input, size_t size) {
  const int lead = input[0];

  // The four lead-byte patterns below are mutually exclusive, so at most one
  // branch is entered; a branch that does not return falls to the fallback.
  if ((lead & 0x80) == 0) {
    // ASCII: accepted unless it is the zero byte.
    if (lead > 0) {
      *symbol = lead;
      return 1;
    }
  } else if ((lead & 0xe0) == 0xc0) {
    // 2-byte UTF8
    if (size > 1u && IsContinuationByte(input[1])) {
      const int code = ((lead & 0x1f) << 6) | (input[1] & 0x3f);
      if (code > 0x7f) {
        *symbol = code;
        return 2;
      }
    }
  } else if ((lead & 0xf0) == 0xe0) {
    // 3-byte UTF8
    if (size > 2u &&
        IsContinuationByte(input[1]) &&
        IsContinuationByte(input[2])) {
      const int code = ((lead & 0x0f) << 12) |
                       ((input[1] & 0x3f) << 6) |
                       (input[2] & 0x3f);
      if (code > 0x7ff) {
        *symbol = code;
        return 3;
      }
    }
  } else if ((lead & 0xf8) == 0xf0) {
    // 4-byte UTF8
    if (size > 3u &&
        IsContinuationByte(input[1]) &&
        IsContinuationByte(input[2]) &&
        IsContinuationByte(input[3])) {
      const int code = ((lead & 0x07) << 18) |
                       ((input[1] & 0x3f) << 12) |
                       ((input[2] & 0x3f) << 6) |
                       (input[3] & 0x3f);
      if (code > 0xffff && code <= 0x10ffff) {
        *symbol = code;
        return 4;
      }
    }
  }

  // Not UTF8, emit a special symbol above the UTF8-code space
  *symbol = 0x110000 | lead;
  return 1;
}
| 65 | |
| 66 } // namespace | |
| 67 | |
| 68 // Returns true if at least min_fraction of the data is UTF8-encoded. | |
| 69 bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask, | |
| 70 const size_t length, const double min_fraction) { | |
| 71 size_t size_utf8 = 0; | |
| 72 size_t i = 0; | |
| 73 while (i < length) { | |
| 74 int symbol; | |
| 75 size_t bytes_read = ParseAsUTF8( | |
| 76 &symbol, &data[(pos + i) & mask], length - i); | |
| 77 i += bytes_read; | |
| 78 if (symbol < 0x110000) size_utf8 += bytes_read; | |
| 79 } | |
| 80 return size_utf8 > min_fraction * static_cast<double>(length); | |
| 81 } | |
| 82 | |
| 83 } // namespace brotli | |
| OLD | NEW |