| Index: third_party/brotli/enc/utf8_util.cc
|
| diff --git a/third_party/brotli/enc/utf8_util.cc b/third_party/brotli/enc/utf8_util.cc
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..a2b5c3a6776477af0b1132667fb8d6b05e4ccee6
|
| --- /dev/null
|
| +++ b/third_party/brotli/enc/utf8_util.cc
|
| @@ -0,0 +1,83 @@
|
| +/* Copyright 2013 Google Inc. All Rights Reserved.
|
| +
|
| + Distributed under MIT license.
|
| + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
|
| +*/
|
| +
|
| +// Heuristics for deciding about the UTF8-ness of strings.
|
| +
|
| +#include "./utf8_util.h"
|
| +
|
| +#include "./types.h"
|
| +
|
| +namespace brotli {
|
| +
|
| +namespace {
|
| +// Decodes one symbol at input (size must be >= 1); returns bytes consumed.
|
| +size_t ParseAsUTF8(int* symbol, const uint8_t* input, size_t size) {
|
| + // ASCII: high bit clear. A zero byte is rejected and falls through to the end.
|
| + if ((input[0] & 0x80) == 0) {
|
| + *symbol = input[0];
|
| + if (*symbol > 0) {
|
| + return 1;
|
| + }
|
| + }
|
| + // 2-byte UTF8: 110xxxxx 10xxxxxx; results <= 0x7f are rejected as overlong.
|
| + if (size > 1u &&
|
| + (input[0] & 0xe0) == 0xc0 &&
|
| + (input[1] & 0xc0) == 0x80) {
|
| + *symbol = (((input[0] & 0x1f) << 6) |
|
| + (input[1] & 0x3f));
|
| + if (*symbol > 0x7f) {
|
| + return 2;
|
| + }
|
| + }
|
| + // 3-byte UTF8: 1110xxxx 10xxxxxx 10xxxxxx; results <= 0x7ff are rejected.
|
| + if (size > 2u &&
|
| + (input[0] & 0xf0) == 0xe0 &&
|
| + (input[1] & 0xc0) == 0x80 &&
|
| + (input[2] & 0xc0) == 0x80) {
|
| + *symbol = (((input[0] & 0x0f) << 12) |
|
| + ((input[1] & 0x3f) << 6) |
|
| + (input[2] & 0x3f));
|
| + if (*symbol > 0x7ff) {
|
| + return 3;
|
| + }
|
| + }
|
| + // 4-byte UTF8: 11110xxx then three 10xxxxxx; must be in (0xffff, 0x10ffff].
|
| + if (size > 3u &&
|
| + (input[0] & 0xf8) == 0xf0 &&
|
| + (input[1] & 0xc0) == 0x80 &&
|
| + (input[2] & 0xc0) == 0x80 &&
|
| + (input[3] & 0xc0) == 0x80) {
|
| + *symbol = (((input[0] & 0x07) << 18) |
|
| + ((input[1] & 0x3f) << 12) |
|
| + ((input[2] & 0x3f) << 6) |
|
| + (input[3] & 0x3f));
|
| + if (*symbol > 0xffff && *symbol <= 0x10ffff) {
|
| + return 4;
|
| + }
|
| + }
|
| + // Not UTF8: emit a special symbol above the UTF8 code space, consume 1 byte.
|
| + *symbol = 0x110000 | input[0];
|
| + return 1;
|
| +}
|
| +
|
| +} // namespace
|
| +
|
| +// Returns true if more than min_fraction of the length bytes parse as UTF8.
|
| +bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask,
|
| + const size_t length, const double min_fraction) {
|
| + size_t size_utf8 = 0;  // bytes covered by successfully parsed UTF8 symbols
|
| + size_t i = 0;  // byte offset from pos
|
| + while (i < length) {
|
| + int symbol;
|
| + size_t bytes_read = ParseAsUTF8(
|
| + &symbol, &data[(pos + i) & mask], length - i);  // NOTE(review): only the start index is masked; confirm data has slack past mask.
|
| + i += bytes_read;
|
| + if (symbol < 0x110000) size_utf8 += bytes_read;  // >= 0x110000 marks a non-UTF8 byte
|
| + }
|
| + return size_utf8 > min_fraction * static_cast<double>(length);
|
| +}
|
| +
|
| +} // namespace brotli
|
|
|