mojo/public/dart/third_party/html/lib/src/char_encodings.dart - Issue 1346773002: Stop running pub get at gclient sync time and fix build bugs

Side by Side Diff: mojo/public/dart/third_party/html/lib/src/char_encodings.dart

Issue 1346773002: Stop running pub get at gclient sync time and fix build bugs (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /// Decodes bytes using the correct name. See [decodeBytes].

	2 library char_encodings;

	3

	4 import 'dart:collection';

	5 import 'package:utf/utf.dart';

	6

	// TODO(jmesserly): this function is conspicuously absent from dart:utf.
	/// Reports whether the examined window of [bytes] begins with the three-byte
	/// sequence `EF BB BF`, the UTF-8 byte order mark.
	///
	/// UTF-8 has no byte order, so "byte order mark" is something of a misnomer;
	/// HTML nevertheless uses the marker to detect the UTF-8 encoding.
	///
	/// The window starts at [offset]. It covers [length] bytes when [length] is
	/// given, and otherwise extends to the end of [bytes]. Windows shorter than
	/// three bytes never match.
	bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
	  // Without an explicit [length] the window runs to the end of [bytes].
	  final limit = length == null ? bytes.length : offset + length;
	  if (limit - offset < 3) return false;

	  const marker = const [0xEF, 0xBB, 0xBF];
	  for (var i = 0; i < marker.length; i++) {
	    if (bytes[offset + i] != marker[i]) return false;
	  }
	  return true;
	}

	18

	// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire
	// file, but dart:utf does not expose stream-based decoders yet.
	/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
	/// the codepoints. Supports the major unicode encodings as well as ascii and
	/// windows-1252 encodings.
	///
	/// Recognized [encoding] names are `ascii`, `windows-1252`, `cp1252`, `utf-8`,
	/// `utf-16`, `utf-16-be`, `utf-16-le`, `utf-32`, `utf-32-be` and `utf-32-le`.
	///
	/// Decoding starts at [offset]. When [length] is omitted, everything from
	/// [offset] to the end of [bytes] is decoded. For `utf-8`, a leading byte
	/// order mark is skipped. For `windows-1252`/`cp1252`, [offset] and [length]
	/// are handed to [decodeWindows1252AsIterable] exactly as the caller gave
	/// them.
	///
	/// [replacementCodepoint] is forwarded to the selected decoder; the `ascii`
	/// path does not use it.
	///
	/// Throws a [FormatException] if `ascii` input contains a byte above 127, and
	/// an [ArgumentError] if [encoding] is not one of the names listed above.
	Iterable<int> decodeBytes(String encoding, List<int> bytes, [int offset = 0,
	    int length,
	    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
	  // Remember what the caller actually passed (possibly null) for the
	  // windows-1252 decoder, which applies its own default to a null length.
	  final requestedLength = length;

	  // The default must be the number of bytes remaining after [offset]. Using
	  // the full list length here makes `offset + length` overshoot the list
	  // whenever a non-zero offset is supplied without a length.
	  if (length == null) length = bytes.length - offset;
	  final replace = replacementCodepoint;

	  switch (encoding) {
	    case 'ascii':
	      bytes = bytes.sublist(offset, offset + length);
	      // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
	      for (int byte in bytes) {
	        if (byte > 127) {
	          // TODO(jmesserly): ideally this would be DecoderException, like the
	          // one thrown in runtime/bin/string_stream.dart, but we don't want to
	          // depend on dart:io.
	          throw new FormatException("Illegal ASCII character $byte");
	        }
	      }
	      return bytes;

	    case 'windows-1252':
	    case 'cp1252':
	      return decodeWindows1252AsIterable(
	          bytes, offset, requestedLength, replace);

	    case 'utf-8':
	      // NOTE: to match the behavior of the other decode functions, we eat the
	      // utf-8 BOM here.
	      if (hasUtf8Bom(bytes, offset, length)) {
	        offset += 3;
	        length -= 3;
	      }
	      return decodeUtf8AsIterable(bytes, offset, length, replace);

	    case 'utf-16':
	      return decodeUtf16AsIterable(bytes, offset, length, replace);
	    case 'utf-16-be':
	      return decodeUtf16beAsIterable(bytes, offset, length, true, replace);
	    case 'utf-16-le':
	      return decodeUtf16leAsIterable(bytes, offset, length, true, replace);

	    case 'utf-32':
	      return decodeUtf32AsIterable(bytes, offset, length, replace);
	    case 'utf-32-be':
	      return decodeUtf32beAsIterable(bytes, offset, length, true, replace);
	    case 'utf-32-le':
	      return decodeUtf32leAsIterable(bytes, offset, length, true, replace);

	    default:
	      throw new ArgumentError('Encoding $encoding not supported');
	  }
	}

	74

	// TODO(jmesserly): use dart:utf once http://dartbug.com/6476 is fixed.
	/// Converts [input] into a list of Unicode code points.
	///
	/// Each UTF-16 code unit becomes one entry, except that a lead surrogate
	/// (0xD800-0xDBFF) immediately followed by a trail surrogate (0xDC00-0xDFFF)
	/// is combined into the single supplementary code point the pair encodes.
	/// Surrogates that are not part of such a pair are emitted unchanged.
	List<int> toCodepoints(String input) {
	  final codepoints = <int>[];
	  final unitCount = input.length;
	  var index = 0;
	  while (index < unitCount) {
	    var unit = input.codeUnitAt(index++);
	    final isLead = unit >= 0xD800 && unit <= 0xDBFF;
	    // [index] now points at the unit after the one just read.
	    if (isLead && index < unitCount) {
	      final trail = input.codeUnitAt(index);
	      if (trail >= 0xDC00 && trail <= 0xDFFF) {
	        unit = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00);
	        index++; // The trail surrogate has been consumed as well.
	      }
	    }
	    codepoints.add(unit);
	  }
	  return codepoints;
	}

	96

	/// Wraps [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) encoded
	/// [bytes] in a lazily decoded iterable of code points, so a consumer only
	/// pays for the part of the input it actually reads.
	///
	/// [offset], [length] and [replacementCodepoint] are forwarded unchanged to
	/// [IterableWindows1252Decoder]. Pass null as [replacementCodepoint] to have
	/// an [ArgumentError] thrown for a bad value instead of replacing it.
	IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
	        [int offset = 0, int length,
	        int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
	    new IterableWindows1252Decoder(
	        bytes, offset, length, replacementCodepoint);

	107

	/// The iterable handed out by [decodeWindows1252AsIterable] and variants.
	///
	/// It only stores the decoding arguments. Every access to [iterator] creates
	/// a fresh [Windows1252Decoder] over them, and bytes are translated only as
	/// that iterator is advanced. (Note: results are not cached.)
	class IterableWindows1252Decoder extends IterableBase<int> {
	  /// The encoded input.
	  final List<int> bytes;

	  /// The offset passed on to each [Windows1252Decoder].
	  final int offset;

	  /// The length passed on to each [Windows1252Decoder]; may be null.
	  final int length;

	  /// The replacement code point passed on to each [Windows1252Decoder].
	  final int replacementCodepoint;

	  IterableWindows1252Decoder(this.bytes, [this.offset = 0, this.length,
	      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

	  @override
	  Windows1252Decoder get iterator {
	    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint);
	  }
	}

	124

	/// An iterator of Unicode codepoints decoded from windows-1252 encoded bytes.
	///
	/// The constructor takes the bytes (as ints), an optional `offset` at which
	/// decoding starts, an optional `length` giving the number of bytes to decode
	/// from that offset, and an optional [replacementCodepoint].
	///
	/// When `length` is omitted, decoding runs to the end of the bytes. A window
	/// that would extend past the end of the bytes is truncated to the bytes that
	/// are actually available.
	///
	/// The bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D are treated as invalid: they
	/// decode to [replacementCodepoint], or cause an [ArgumentError] when
	/// [replacementCodepoint] is null.
	///
	/// To use the decoder in a for-in loop, wrap it with
	/// [IterableWindows1252Decoder] or call [decodeWindows1252AsIterable].
	class Windows1252Decoder implements Iterator<int> {
	  /// Code point substituted for invalid bytes, or null to throw instead.
	  final int replacementCodepoint;
	  final List<int> _bytes;

	  /// Index of the first byte of the decoding window.
	  final int _start;

	  /// Index one past the last byte of the decoding window.
	  final int _end;

	  /// Index of the byte [current] refers to; `_start - 1` before iteration.
	  int _offset;

	  Windows1252Decoder(List<int> bytes, [int offset = 0, int length,
	      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
	      : _bytes = bytes,
	        _start = offset,
	        _end = _windowEnd(bytes, offset, length),
	        _offset = offset - 1;

	  /// Computes the exclusive end index of the window of [length] bytes starting
	  /// at [offset], never exceeding the size of [bytes].
	  static int _windowEnd(List<int> bytes, int offset, int length) {
	    if (length == null) return bytes.length;
	    final end = offset + length;
	    return end > bytes.length ? bytes.length : end;
	  }

	  // The lower bound is the window start, not 0: before the first [moveNext]
	  // the cursor sits at `_start - 1`, which is a valid list index whenever the
	  // window does not begin at the start of the list.
	  bool get _inRange => _offset >= _start && _offset < _end;

	  /// The decoded code point at the cursor, or null when the cursor is not on
	  /// a byte of the window (before the first [moveNext] or after the end).
	  int get current => _inRange ? _mapChar(_bytes[_offset]) : null;

	  /// Advances to the next byte; returns false once the window is exhausted.
	  bool moveNext() {
	    // Stop advancing at the end so repeated calls keep the cursor in place.
	    if (_offset < _end) _offset++;
	    return _inRange;
	  }

	  /// Maps one windows-1252 byte to its Unicode code point. Bytes without a
	  /// case below map to themselves.
	  int _mapChar(int char) {
	    // TODO(jmesserly): this is duplicating entitiesWindows1252 and
	    // replacementCharacters from constants.dart
	    switch (char) {
	      case 0x80:
	        return 0x20AC; // EURO SIGN
	      case 0x82:
	        return 0x201A; // SINGLE LOW-9 QUOTATION MARK
	      case 0x83:
	        return 0x0192; // LATIN SMALL LETTER F WITH HOOK
	      case 0x84:
	        return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
	      case 0x85:
	        return 0x2026; // HORIZONTAL ELLIPSIS
	      case 0x86:
	        return 0x2020; // DAGGER
	      case 0x87:
	        return 0x2021; // DOUBLE DAGGER
	      case 0x88:
	        return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
	      case 0x89:
	        return 0x2030; // PER MILLE SIGN
	      case 0x8A:
	        return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
	      case 0x8B:
	        return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
	      case 0x8C:
	        return 0x0152; // LATIN CAPITAL LIGATURE OE
	      case 0x8E:
	        return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
	      case 0x91:
	        return 0x2018; // LEFT SINGLE QUOTATION MARK
	      case 0x92:
	        return 0x2019; // RIGHT SINGLE QUOTATION MARK
	      case 0x93:
	        return 0x201C; // LEFT DOUBLE QUOTATION MARK
	      case 0x94:
	        return 0x201D; // RIGHT DOUBLE QUOTATION MARK
	      case 0x95:
	        return 0x2022; // BULLET
	      case 0x96:
	        return 0x2013; // EN DASH
	      case 0x97:
	        return 0x2014; // EM DASH
	      case 0x98:
	        return 0x02DC; // SMALL TILDE
	      case 0x99:
	        return 0x2122; // TRADE MARK SIGN
	      case 0x9A:
	        return 0x0161; // LATIN SMALL LETTER S WITH CARON
	      case 0x9B:
	        return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
	      case 0x9C:
	        return 0x0153; // LATIN SMALL LIGATURE OE
	      case 0x9E:
	        return 0x017E; // LATIN SMALL LETTER Z WITH CARON
	      case 0x9F:
	        return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS

	      // Bytes with no mapping in this table: replace, or throw when no
	      // replacement code point was configured.
	      case 0x81:
	      case 0x8D:
	      case 0x8F:
	      case 0x90:
	      case 0x9D:
	        if (replacementCodepoint == null) {
	          throw new ArgumentError(
	              "Invalid windows-1252 code point $char at $_offset");
	        }
	        return replacementCodepoint;
	    }
	    return char;
	  }
	}

OLD	NEW