pkg/third_party/html5lib/lib/src/char_encodings.dart - Issue 22375011: move html5lib code into dart svn repo

Side by Side Diff: pkg/third_party/html5lib/lib/src/char_encodings.dart

Issue 22375011: move html5lib code into dart svn repo (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: change location of html5lib to pkg/third_party/html5lib Created 7 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 /** Decodes bytes using the character encoding with the given name. See [decodeBytes]. */

	2 library char_encodings;

	3

	4 import 'dart:collection';

	5 import 'dart:utf';

	6

	7 // TODO(jmesserly): this function is conspicuously absent from dart:utf.

	/**
	 * Reports whether [bytes] begins with the UTF-8 byte order mark
	 * (`0xEF 0xBB 0xBF`) at [offset].
	 *
	 * Only the window of [length] bytes starting at [offset] is examined; when
	 * [length] is omitted the window runs to the end of [bytes]. Since UTF-8 has
	 * no byte order the name is somewhat of a misnomer, but the marker is used in
	 * HTML to detect the UTF-8 encoding.
	 */
	bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
	  final int limit = length == null ? bytes.length : offset + length;
	  // The window is too short to hold the three-byte marker.
	  if (limit - offset < 3) return false;
	  if (bytes[offset] != 0xEF) return false;
	  if (bytes[offset + 1] != 0xBB) return false;
	  return bytes[offset + 2] == 0xBF;
	}

	20

	21 // TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire

	22 // file, but dart:utf does not expose stream-based decoders yet.

	/**
	 * Decodes the [bytes] with the provided [encoding] and returns an iterable for
	 * the codepoints. Supports the major unicode encodings as well as the ascii
	 * and windows-1252 encodings.
	 *
	 * Only the [length] bytes starting at [offset] are decoded; when [length] is
	 * omitted, everything from [offset] to the end of [bytes] is decoded. The
	 * [replacementCodepoint] is forwarded to the underlying decoder for every
	 * encoding except 'ascii'.
	 *
	 * Throws a [FormatException] if 'ascii' input contains a byte above 127, and
	 * an [ArgumentError] if [encoding] is not one of the names handled below.
	 */
	Iterable<int> decodeBytes(String encoding, List<int> bytes,
	    [int offset = 0, int length,
	    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
	  // Number of bytes to decode. The default has to exclude the bytes skipped
	  // by [offset]; using plain `bytes.length` would run past the end of the
	  // list whenever a non-zero offset is given without a length.
	  int count = length != null ? length : bytes.length - offset;
	  final replace = replacementCodepoint;
	  switch (encoding) {
	    case 'ascii':
	      bytes = bytes.sublist(offset, offset + count);
	      // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
	      for (int byte in bytes) {
	        if (byte > 127) {
	          // TODO(jmesserly): ideally this would be DecoderException, like the
	          // one thrown in runtime/bin/string_stream.dart, but we don't want to
	          // depend on dart:io.
	          throw new FormatException("Illegal ASCII character $byte");
	        }
	      }
	      return bytes;

	    case 'windows-1252':
	    case 'cp1252':
	      // Forward the caller's [length] untouched (possibly null) so the
	      // decoder applies its own "to the end of the list" default.
	      return decodeWindows1252AsIterable(bytes, offset, length, replace);

	    case 'utf-8':
	      // NOTE: to match the behavior of the other decode functions, we eat the
	      // utf-8 BOM here.
	      if (hasUtf8Bom(bytes, offset, count)) {
	        offset += 3;
	        count -= 3;
	      }
	      return decodeUtf8AsIterable(bytes, offset, count, replace);

	    case 'utf-16':
	      return decodeUtf16AsIterable(bytes, offset, count, replace);
	    case 'utf-16-be':
	      return decodeUtf16beAsIterable(bytes, offset, count, true, replace);
	    case 'utf-16-le':
	      return decodeUtf16leAsIterable(bytes, offset, count, true, replace);

	    case 'utf-32':
	      return decodeUtf32AsIterable(bytes, offset, count, replace);
	    case 'utf-32-be':
	      return decodeUtf32beAsIterable(bytes, offset, count, true, replace);
	    case 'utf-32-le':
	      return decodeUtf32leAsIterable(bytes, offset, count, true, replace);

	    default:
	      throw new ArgumentError('Encoding $encoding not supported');
	  }
	}

	78

	79

	80 // TODO(jmesserly): use dart:utf once http://dartbug.com/6476 is fixed.

	/**
	 * Converts [input] into a list of Unicode code points.
	 *
	 * This reads the string one UTF-16 code unit at a time, except that a high
	 * surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
	 * (0xDC00-0xDFFF) is combined into the single code point the pair encodes.
	 * Surrogates that are not part of such a pair are passed through unchanged.
	 */
	List<int> toCodepoints(String input) {
	  final codepoints = <int>[];
	  final int total = input.length;
	  int index = 0;
	  while (index < total) {
	    int unit = input.codeUnitAt(index);
	    index++;
	    final bool isLead = unit >= 0xD800 && unit <= 0xDBFF;
	    if (isLead && index < total) {
	      final int trail = input.codeUnitAt(index);
	      if (trail >= 0xDC00 && trail <= 0xDFFF) {
	        // Merge the pair and consume the trailing unit as well.
	        unit = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00);
	        index++;
	      }
	    }
	    codepoints.add(unit);
	  }
	  return codepoints;
	}

	103

	104

	/**
	 * Wraps [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) encoded
	 * [bytes] in an [IterableWindows1252Decoder], so the consumer only converts
	 * as much of the input as it actually iterates over.
	 *
	 * Set [replacementCodepoint] to null to have an [ArgumentError] thrown for an
	 * undefined byte rather than having it replaced.
	 */
	IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
	    [int offset = 0, int length,
	    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
	    new IterableWindows1252Decoder(bytes, offset, length, replacementCodepoint);

	117

	118

	/**
	 * The iterable returned by [decodeWindows1252AsIterable].
	 *
	 * Nothing is decoded up front: each request for [iterator] creates a new
	 * [Windows1252Decoder] over the same bytes, which translates bytes only as
	 * the caller advances it. (Note: results are not cached.)
	 */
	class IterableWindows1252Decoder extends IterableBase<int> {
	  final List<int> bytes;
	  final int offset;
	  final int length;
	  final int replacementCodepoint;

	  IterableWindows1252Decoder(this.bytes,
	      [this.offset = 0,
	      this.length,
	      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

	  /** A fresh decoder positioned before the first byte to decode. */
	  Windows1252Decoder get iterator {
	    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint);
	  }
	}

	137

	138

	/**
	 * Provides an iterator of Unicode codepoints from windows-1252 encoded bytes.
	 *
	 * The optional parameters set an offset into the list of bytes, limit the
	 * number of bytes decoded from that offset, and override the default Unicode
	 * replacement character. When the length is omitted, decoding runs to the end
	 * of the list; a length that would reach past the end of the list is clamped
	 * to it. Set [replacementCodepoint] to null to throw an [ArgumentError]
	 * rather than replace an undefined byte.
	 */
	class Windows1252Decoder implements Iterator<int> {
	  final int replacementCodepoint;
	  final List<int> _bytes;

	  /** Index of the current byte; starts one before the first byte. */
	  int _offset;

	  /** Exclusive index at which decoding stops. */
	  final int _end;

	  Windows1252Decoder(List<int> bytes, [int offset = 0, int length,
	      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
	      : _bytes = bytes,
	        _offset = offset - 1,
	        _end = _endIndex(bytes, offset, length);

	  /**
	   * Turns the ([offset], [length]) pair into an exclusive end index.
	   *
	   * [length] counts bytes from [offset], so the end is `offset + length`, not
	   * `length` itself. The result never exceeds `bytes.length`.
	   */
	  static int _endIndex(List<int> bytes, int offset, int length) {
	    if (length == null) return bytes.length;
	    final int end = offset + length;
	    return end < bytes.length ? end : bytes.length;
	  }

	  bool get _inRange => _offset >= 0 && _offset < _end;

	  /** The decoded codepoint at the current position, or null if out of range. */
	  int get current => _inRange ? _mapChar(_bytes[_offset]) : null;

	  /** Advances by one byte; returns false once the end has been reached. */
	  bool moveNext() {
	    _offset++;
	    return _inRange;
	  }

	  /**
	   * Maps one windows-1252 byte to its Unicode codepoint.
	   *
	   * Bytes listed in the table below are translated; the five bytes that
	   * windows-1252 leaves undefined yield [replacementCodepoint], or throw an
	   * [ArgumentError] when it is null. Every other value is returned unchanged.
	   */
	  int _mapChar(int char) {
	    // TODO(jmesserly): this is duplicating entitiesWindows1252 and
	    // replacementCharacters from constants.dart
	    switch (char) {
	      case 0x80: return 0x20AC; // EURO SIGN
	      case 0x82: return 0x201A; // SINGLE LOW-9 QUOTATION MARK
	      case 0x83: return 0x0192; // LATIN SMALL LETTER F WITH HOOK
	      case 0x84: return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
	      case 0x85: return 0x2026; // HORIZONTAL ELLIPSIS
	      case 0x86: return 0x2020; // DAGGER
	      case 0x87: return 0x2021; // DOUBLE DAGGER
	      case 0x88: return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
	      case 0x89: return 0x2030; // PER MILLE SIGN
	      case 0x8A: return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
	      case 0x8B: return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
	      case 0x8C: return 0x0152; // LATIN CAPITAL LIGATURE OE
	      case 0x8E: return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
	      case 0x91: return 0x2018; // LEFT SINGLE QUOTATION MARK
	      case 0x92: return 0x2019; // RIGHT SINGLE QUOTATION MARK
	      case 0x93: return 0x201C; // LEFT DOUBLE QUOTATION MARK
	      case 0x94: return 0x201D; // RIGHT DOUBLE QUOTATION MARK
	      case 0x95: return 0x2022; // BULLET
	      case 0x96: return 0x2013; // EN DASH
	      case 0x97: return 0x2014; // EM DASH
	      case 0x98: return 0x02DC; // SMALL TILDE
	      case 0x99: return 0x2122; // TRADE MARK SIGN
	      case 0x9A: return 0x0161; // LATIN SMALL LETTER S WITH CARON
	      case 0x9B: return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
	      case 0x9C: return 0x0153; // LATIN SMALL LIGATURE OE
	      case 0x9E: return 0x017E; // LATIN SMALL LETTER Z WITH CARON
	      case 0x9F: return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS

	      // Bytes with no assigned character in windows-1252.
	      case 0x81:
	      case 0x8D:
	      case 0x8F:
	      case 0x90:
	      case 0x9D:
	        if (replacementCodepoint == null) {
	          throw new ArgumentError(
	              "Invalid windows-1252 code point $char at $_offset");
	        }
	        return replacementCodepoint;
	    }
	    return char;
	  }
	}

OLD	NEW