Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(45)

Unified Diff: third_party/brotli/enc/utf8_util.cc

Issue 1956893002: Added brotli enc/ and tools/ directories. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Updated to most recent build tools Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/brotli/enc/utf8_util.h ('k') | third_party/brotli/enc/write_bits.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/brotli/enc/utf8_util.cc
diff --git a/third_party/brotli/enc/utf8_util.cc b/third_party/brotli/enc/utf8_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2b5c3a6776477af0b1132667fb8d6b05e4ccee6
--- /dev/null
+++ b/third_party/brotli/enc/utf8_util.cc
@@ -0,0 +1,83 @@
+/* Copyright 2013 Google Inc. All Rights Reserved.
+
+ Distributed under MIT license.
+ See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
+*/
+
+// Heuristics for deciding about the UTF8-ness of strings.
+
+#include "./utf8_util.h"
+
+#include "./types.h"
+
+namespace brotli {
+
+namespace {
+
// Decodes a single symbol from the front of |input|.
//
// |size| is the number of readable bytes starting at |input|. It must be at
// least 1, because input[0] is read unconditionally.
//
// On success, stores the decoded code point in |*symbol| and returns the
// number of bytes of its encoding (1 to 4). A sequence is accepted only when:
//   - every continuation byte has the form 10xxxxxx,
//   - the value could not have been encoded in fewer bytes (no overlong
//     forms),
//   - the value is at most U+10FFFF and is not a UTF-16 surrogate
//     (U+D800..U+DFFF).
// A NUL byte (0x00) is not accepted as ASCII; it takes the failure path.
//
// On failure, stores 0x110000 | input[0] in |*symbol| (a value above the
// Unicode code space) and returns 1, so that the caller can resume parsing at
// the following byte.
size_t ParseAsUTF8(int* symbol, const uint8_t* input, size_t size) {
  // ASCII
  if ((input[0] & 0x80) == 0) {
    *symbol = input[0];
    if (*symbol > 0) {
      return 1;
    }
  }
  // 2-byte UTF8
  if (size > 1u &&
      (input[0] & 0xe0) == 0xc0 &&
      (input[1] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x1f) << 6) |
               (input[1] & 0x3f));
    // Values up to 0x7f fit in one byte, so a 2-byte form would be overlong.
    if (*symbol > 0x7f) {
      return 2;
    }
  }
  // 3-byte UTF8
  if (size > 2u &&
      (input[0] & 0xf0) == 0xe0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x0f) << 12) |
               ((input[1] & 0x3f) << 6) |
               (input[2] & 0x3f));
    // Values up to 0x7ff would be overlong here. U+D800..U+DFFF are UTF-16
    // surrogates, which are not valid code points in well-formed UTF-8.
    if (*symbol > 0x7ff && (*symbol < 0xd800 || *symbol > 0xdfff)) {
      return 3;
    }
  }
  // 4-byte UTF8
  if (size > 3u &&
      (input[0] & 0xf8) == 0xf0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80 &&
      (input[3] & 0xc0) == 0x80) {
    *symbol = (((input[0] & 0x07) << 18) |
               ((input[1] & 0x3f) << 12) |
               ((input[2] & 0x3f) << 6) |
               (input[3] & 0x3f));
    // Reject overlong forms and anything beyond the last code point U+10FFFF.
    if (*symbol > 0xffff && *symbol <= 0x10ffff) {
      return 4;
    }
  }
  // Not UTF8, emit a special symbol above the UTF8-code space
  *symbol = 0x110000 | input[0];
  return 1;
}
+
+} // namespace
+
+// Returns true if at least min_fraction of the data is UTF8-encoded.
+bool IsMostlyUTF8(const uint8_t* data, const size_t pos, const size_t mask,
+ const size_t length, const double min_fraction) {
+ size_t size_utf8 = 0;
+ size_t i = 0;
+ while (i < length) {
+ int symbol;
+ size_t bytes_read = ParseAsUTF8(
+ &symbol, &data[(pos + i) & mask], length - i);
+ i += bytes_read;
+ if (symbol < 0x110000) size_utf8 += bytes_read;
+ }
+ return size_utf8 > min_fraction * static_cast<double>(length);
+}
+
+} // namespace brotli
« no previous file with comments | « third_party/brotli/enc/utf8_util.h ('k') | third_party/brotli/enc/write_bits.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698