Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(39)

Unified Diff: third_party/protobuf/js/binary/decoder.js

Issue 2495533002: third_party/protobuf: Update to HEAD (83d681ee2c) (Closed)
Patch Set: Update to new HEAD (b7632464b4) + restore GOOGLE_PROTOBUF_NO_STATIC_INITIALIZER Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/protobuf/js/binary/decoder.js
diff --git a/third_party/protobuf/js/binary/decoder.js b/third_party/protobuf/js/binary/decoder.js
index 41094a36881ece2dee8208bdd9b2b690a16536c1..040cf7153fcda2652a55588c9349cbc9f398afb0 100644
--- a/third_party/protobuf/js/binary/decoder.js
+++ b/third_party/protobuf/js/binary/decoder.js
@@ -895,11 +895,9 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
/**
* Reads and parses a UTF-8 encoded unicode string from the stream.
- * The code is inspired by maps.vectortown.parse.StreamedDataViewReader, with
- * the exception that the implementation here does not get confused if it
- * encounters characters longer than three bytes. These characters are ignored
- * though, as they are extremely rare: three UTF-8 bytes cover virtually all
- * characters in common use (http://en.wikipedia.org/wiki/UTF-8).
+ * The code is inspired by maps.vectortown.parse.StreamedDataViewReader.
+ * Supports codepoints from U+0000 up to U+10FFFF.
+ * (http://en.wikipedia.org/wiki/UTF-8).
* @param {number} length The length of the string to read.
* @return {string} The decoded string.
*/
@@ -907,30 +905,45 @@ jspb.BinaryDecoder.prototype.readString = function(length) {
var bytes = this.bytes_;
var cursor = this.cursor_;
var end = cursor + length;
- var chars = [];
+ var codeUnits = [];
while (cursor < end) {
var c = bytes[cursor++];
if (c < 128) { // Regular 7-bit ASCII.
- chars.push(c);
+ codeUnits.push(c);
} else if (c < 192) {
// UTF-8 continuation mark. We are out of sync. This
// might happen if we attempted to read a character
- // with more than three bytes.
+ // with more than four bytes.
continue;
} else if (c < 224) { // UTF-8 with two bytes.
var c2 = bytes[cursor++];
- chars.push(((c & 31) << 6) | (c2 & 63));
+ codeUnits.push(((c & 31) << 6) | (c2 & 63));
} else if (c < 240) { // UTF-8 with three bytes.
var c2 = bytes[cursor++];
var c3 = bytes[cursor++];
- chars.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
+ codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
+ } else if (c < 248) { // UTF-8 with 4 bytes.
+ var c2 = bytes[cursor++];
+ var c3 = bytes[cursor++];
+ var c4 = bytes[cursor++];
+ // Characters written on 4 bytes have 21 bits for a codepoint.
+ // We can't fit that on 16bit characters, so we use surrogates.
+ var codepoint = ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
+ // Surrogates formula from wikipedia.
+ // 1. Subtract 0x10000 from codepoint
+ codepoint -= 0x10000;
+ // 2. Split this into the high 10-bit value and the low 10-bit value
+ // 3. Add 0xD800 to the high value to form the high surrogate
+ // 4. Add 0xDC00 to the low value to form the low surrogate:
+ var low = (codepoint & 1023) + 0xDC00;
+ var high = ((codepoint >> 10) & 1023) + 0xD800;
+ codeUnits.push(high, low)
}
}
-
// String.fromCharCode.apply is faster than manually appending characters on
// Chrome 25+, and generates no additional cons string garbage.
- var result = String.fromCharCode.apply(null, chars);
+ var result = String.fromCharCode.apply(null, codeUnits);
this.cursor_ = cursor;
return result;
};

Powered by Google App Engine
This is Rietveld 408576698