Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(158)

Unified Diff: mojo/public/dart/third_party/html/lib/src/char_encodings.dart

Issue 1346773002: Stop running pub get at gclient sync time and fix build bugs (Closed) Base URL: git@github.com:domokit/mojo.git@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: mojo/public/dart/third_party/html/lib/src/char_encodings.dart
diff --git a/mojo/public/dart/third_party/html/lib/src/char_encodings.dart b/mojo/public/dart/third_party/html/lib/src/char_encodings.dart
new file mode 100644
index 0000000000000000000000000000000000000000..5ddf7279679d08c1907dfe5b0ee466dce2de10e4
--- /dev/null
+++ b/mojo/public/dart/third_party/html/lib/src/char_encodings.dart
@@ -0,0 +1,223 @@
+/// Decodes bytes using the correct name. See [decodeBytes].
+library char_encodings;
+
+import 'dart:collection';
+import 'package:utf/utf.dart';
+
+// TODO(jmesserly): this function is conspicuously absent from dart:utf.
+/// Returns true if the [bytes] starts with a UTF-8 byte order mark.
+/// Since UTF-8 doesn't have byte order, it's somewhat of a misnomer, but it is
+/// used in HTML to detect the UTF-
+bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
+ int end = length != null ? offset + length : bytes.length;
+ return (offset + 3) <= end &&
+ bytes[offset] == 0xEF &&
+ bytes[offset + 1] == 0xBB &&
+ bytes[offset + 2] == 0xBF;
+}
+
+// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire
+// file, but dart:utf does not expose stream-based decoders yet.
+/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
+/// the codepoints. Supports the major unicode encodings as well as ascii and
+/// and windows-1252 encodings.
+Iterable<int> decodeBytes(String encoding, List<int> bytes, [int offset = 0,
+ int length,
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
+ if (length == null) length = bytes.length;
+ final replace = replacementCodepoint;
+ switch (encoding) {
+ case 'ascii':
+ bytes = bytes.sublist(offset, offset + length);
+ // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
+ for (int byte in bytes) {
+ if (byte > 127) {
+ // TODO(jmesserly): ideally this would be DecoderException, like the
+ // one thrown in runtime/bin/string_stream.dart, but we don't want to
+ // depend on dart:io.
+ throw new FormatException("Illegal ASCII character $byte");
+ }
+ }
+ return bytes;
+
+ case 'windows-1252':
+ case 'cp1252':
+ return decodeWindows1252AsIterable(bytes, offset, length, replace);
+
+ case 'utf-8':
+ // NOTE: to match the behavior of the other decode functions, we eat the
+ // utf-8 BOM here.
+ if (hasUtf8Bom(bytes, offset, length)) {
+ offset += 3;
+ length -= 3;
+ }
+ return decodeUtf8AsIterable(bytes, offset, length, replace);
+
+ case 'utf-16':
+ return decodeUtf16AsIterable(bytes, offset, length, replace);
+ case 'utf-16-be':
+ return decodeUtf16beAsIterable(bytes, offset, length, true, replace);
+ case 'utf-16-le':
+ return decodeUtf16leAsIterable(bytes, offset, length, true, replace);
+
+ case 'utf-32':
+ return decodeUtf32AsIterable(bytes, offset, length, replace);
+ case 'utf-32-be':
+ return decodeUtf32beAsIterable(bytes, offset, length, true, replace);
+ case 'utf-32-le':
+ return decodeUtf32leAsIterable(bytes, offset, length, true, replace);
+
+ default:
+ throw new ArgumentError('Encoding $encoding not supported');
+ }
+}
+
+// TODO(jmesserly): use dart:utf once http://dartbug.com/6476 is fixed.
+/// Returns the code points for the [input]. This works like [String.charCodes]
+/// but it decodes UTF-16 surrogate pairs.
+List<int> toCodepoints(String input) {
+ var newCodes = <int>[];
+ for (int i = 0; i < input.length; i++) {
+ var c = input.codeUnitAt(i);
+ if (0xD800 <= c && c <= 0xDBFF) {
+ int next = i + 1;
+ if (next < input.length) {
+ var d = input.codeUnitAt(next);
+ if (0xDC00 <= d && d <= 0xDFFF) {
+ c = 0x10000 + ((c - 0xD800) << 10) + (d - 0xDC00);
+ i = next;
+ }
+ }
+ }
+ newCodes.add(c);
+ }
+ return newCodes;
+}
+
+/// Decodes [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) bytes as
+/// an iterable. Thus, the consumer can only convert as much of the input as
+/// needed. Set the [replacementCharacter] to null to throw an [ArgumentError]
+/// rather than replace the bad value.
+IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
+ [int offset = 0, int length,
+ int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
+ return new IterableWindows1252Decoder(
+ bytes, offset, length, replacementCodepoint);
+}
+
+/// Return type of [decodeWindows1252AsIterable] and variants. The Iterable type
+/// provides an iterator on demand and the iterator will only translate bytes
+/// as requested by the user of the iterator. (Note: results are not cached.)
+class IterableWindows1252Decoder extends IterableBase<int> {
+ final List<int> bytes;
+ final int offset;
+ final int length;
+ final int replacementCodepoint;
+
+ IterableWindows1252Decoder(List<int> this.bytes, [int this.offset = 0,
+ int this.length = null,
+ int this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);
+
+ Windows1252Decoder get iterator =>
+ new Windows1252Decoder(bytes, offset, length, replacementCodepoint);
+}
+
+/// Provides an iterator of Unicode codepoints from windows-1252 encoded bytes.
+/// The parameters can set an offset into a list of bytes (as int), limit the
+/// length of the values to be decoded, and override the default Unicode
+/// replacement character. Set the replacementCharacter to null to throw an
+/// ArgumentError rather than replace the bad value. The return value
+/// from this method can be used as an Iterable (e.g. in a for-loop).
+class Windows1252Decoder implements Iterator<int> {
+ final int replacementCodepoint;
+ final List<int> _bytes;
+ int _offset;
+ final int _length;
+
+ Windows1252Decoder(List<int> bytes, [int offset = 0, int length,
+ this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
+ : _bytes = bytes,
+ _offset = offset - 1,
+ _length = length == null ? bytes.length : length;
+
+ bool get _inRange => _offset >= 0 && _offset < _length;
+ int get current => _inRange ? _mapChar(_bytes[_offset]) : null;
+
+ bool moveNext() {
+ _offset++;
+ return _inRange;
+ }
+
+ int _mapChar(int char) {
+ // TODO(jmesserly): this is duplicating entitiesWindows1252 and
+ // replacementCharacters from constants.dart
+ switch (char) {
+ case 0x80:
+ return 0x20AC; // EURO SIGN
+ case 0x82:
+ return 0x201A; // SINGLE LOW-9 QUOTATION MARK
+ case 0x83:
+ return 0x0192; // LATIN SMALL LETTER F WITH HOOK
+ case 0x84:
+ return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
+ case 0x85:
+ return 0x2026; // HORIZONTAL ELLIPSIS
+ case 0x86:
+ return 0x2020; // DAGGER
+ case 0x87:
+ return 0x2021; // DOUBLE DAGGER
+ case 0x88:
+ return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
+ case 0x89:
+ return 0x2030; // PER MILLE SIGN
+ case 0x8A:
+ return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
+ case 0x8B:
+ return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+ case 0x8C:
+ return 0x0152; // LATIN CAPITAL LIGATURE OE
+ case 0x8E:
+ return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
+ case 0x91:
+ return 0x2018; // LEFT SINGLE QUOTATION MARK
+ case 0x92:
+ return 0x2019; // RIGHT SINGLE QUOTATION MARK
+ case 0x93:
+ return 0x201C; // LEFT DOUBLE QUOTATION MARK
+ case 0x94:
+ return 0x201D; // RIGHT DOUBLE QUOTATION MARK
+ case 0x95:
+ return 0x2022; // BULLET
+ case 0x96:
+ return 0x2013; // EN DASH
+ case 0x97:
+ return 0x2014; // EM DASH
+ case 0x98:
+ return 0x02DC; // SMALL TILDE
+ case 0x99:
+ return 0x2122; // TRADE MARK SIGN
+ case 0x9A:
+ return 0x0161; // LATIN SMALL LETTER S WITH CARON
+ case 0x9B:
+ return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+ case 0x9C:
+ return 0x0153; // LATIN SMALL LIGATURE OE
+ case 0x9E:
+ return 0x017E; // LATIN SMALL LETTER Z WITH CARON
+ case 0x9F:
+ return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS
+
+ case 0x81:
+ case 0x8D:
+ case 0x8F:
+ case 0x90:
+ case 0x9D:
+ if (replacementCodepoint == null) {
+ throw new ArgumentError(
+ "Invalid windows-1252 code point $char at $_offset");
+ }
+ return replacementCodepoint;
+ }
+ return char;
+ }
+}
« no previous file with comments | « mojo/public/dart/third_party/html/lib/parser_console.dart ('k') | mojo/public/dart/third_party/html/lib/src/constants.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698