Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(77)

Side by Side Diff: pkg/third_party/html5lib/lib/src/char_encodings.dart

Issue 22375011: move html5lib code into dart svn repo (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: change location of html5lib to pkg/third_party/html5lib Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 /** Decodes bytes using the correct name. See [decodeBytes]. */
2 library char_encodings;
3
4 import 'dart:collection';
5 import 'dart:utf';
6
7 // TODO(jmesserly): this function is conspicuously absent from dart:utf.
/**
 * Returns true if [bytes] contains a UTF-8 byte order mark (the three bytes
 * `0xEF 0xBB 0xBF`) starting at [offset].
 *
 * Since UTF-8 doesn't have byte order, "byte order mark" is somewhat of a
 * misnomer, but the sequence is used in HTML to detect the UTF-8 encoding.
 *
 * Only the range that starts at [offset] and spans [length] bytes is
 * examined; when [length] is omitted the range runs to the end of [bytes].
 * A range that extends past the end of [bytes] is clamped to the list, so
 * the function returns false instead of throwing a [RangeError].
 */
bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
  int end = length != null ? offset + length : bytes.length;
  // Never index past the real end of the list, whatever [length] claims.
  if (end > bytes.length) end = bytes.length;
  return (offset + 3) <= end &&
      bytes[offset] == 0xEF &&
      bytes[offset + 1] == 0xBB &&
      bytes[offset + 2] == 0xBF;
}
20
21 // TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire
22 // file, but dart:utf does not expose stream-based decoders yet.
/**
 * Decodes the [bytes] with the provided [encoding] and returns an iterable for
 * the codepoints. Supports the major unicode encodings as well as the ascii
 * and windows-1252 encodings.
 *
 * Decoding starts at [offset] and covers [length] bytes; when [length] is
 * omitted it covers everything from [offset] to the end of [bytes].
 * [replacementCodepoint] is forwarded to the selected decoder (it is not used
 * by the 'ascii' branch).
 *
 * For 'utf-8' a leading byte order mark is skipped. Throws a
 * [FormatException] if [encoding] is 'ascii' and a byte is greater than 127,
 * and an [ArgumentError] if [encoding] is not one of the supported names.
 */
Iterable<int> decodeBytes(String encoding, List<int> bytes,
    [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // The windows-1252 decoder resolves a missing length itself, so it is handed
  // the caller's original value rather than the defaulted one below.
  final requestedLength = length;
  // A missing length means "the rest of the list", not the whole list;
  // otherwise any non-zero offset would run past the end of [bytes].
  if (length == null) length = bytes.length - offset;
  final replace = replacementCodepoint;
  switch (encoding) {
    case 'ascii':
      bytes = bytes.sublist(offset, offset + length);
      // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
      for (int byte in bytes) {
        if (byte > 127) {
          // TODO(jmesserly): ideally this would be DecoderException, like the
          // one thrown in runtime/bin/string_stream.dart, but we don't want to
          // depend on dart:io.
          throw new FormatException("Illegal ASCII character $byte");
        }
      }
      return bytes;

    case 'windows-1252':
    case 'cp1252':
      return decodeWindows1252AsIterable(
          bytes, offset, requestedLength, replace);

    case 'utf-8':
      // NOTE: to match the behavior of the other decode functions, we eat the
      // utf-8 BOM here.
      if (hasUtf8Bom(bytes, offset, length)) {
        offset += 3;
        length -= 3;
      }
      return decodeUtf8AsIterable(bytes, offset, length, replace);

    case 'utf-16':
      return decodeUtf16AsIterable(bytes, offset, length, replace);
    case 'utf-16-be':
      return decodeUtf16beAsIterable(bytes, offset, length, true, replace);
    case 'utf-16-le':
      return decodeUtf16leAsIterable(bytes, offset, length, true, replace);

    case 'utf-32':
      return decodeUtf32AsIterable(bytes, offset, length, replace);
    case 'utf-32-be':
      return decodeUtf32beAsIterable(bytes, offset, length, true, replace);
    case 'utf-32-le':
      return decodeUtf32leAsIterable(bytes, offset, length, true, replace);

    default:
      throw new ArgumentError('Encoding $encoding not supported');
  }
}
78
79
80 // TODO(jmesserly): use dart:utf once http://dartbug.com/6476 is fixed.
/**
 * Converts [input] into a list of Unicode code points.
 *
 * The string is walked one UTF-16 code unit at a time. A high surrogate
 * (0xD800-0xDBFF) immediately followed by a low surrogate (0xDC00-0xDFFF) is
 * combined into a single code point; every other code unit, including an
 * unpaired surrogate, is copied to the result unchanged.
 */
List<int> toCodepoints(String input) {
  final codepoints = <int>[];
  final count = input.length;
  var index = 0;
  while (index < count) {
    var unit = input.codeUnitAt(index++);
    final isLead = unit >= 0xD800 && unit <= 0xDBFF;
    if (isLead && index < count) {
      final trail = input.codeUnitAt(index);
      if (trail >= 0xDC00 && trail <= 0xDFFF) {
        // Merge the pair and step over the trailing unit as well.
        unit = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00);
        index++;
      }
    }
    codepoints.add(unit);
  }
  return codepoints;
}
103
104
/**
 * Wraps [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) encoded
 * [bytes] in an [IterableWindows1252Decoder], so the consumer converts only
 * as much of the input as it actually iterates over.
 *
 * [offset], [length] and [replacementCodepoint] are passed to the decoder
 * unchanged. Set [replacementCodepoint] to null to get an [ArgumentError]
 * for bytes that have no windows-1252 mapping rather than a replacement.
 */
IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
    [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableWindows1252Decoder(
        bytes, offset, length, replacementCodepoint);
117
118
/**
 * Return type of [decodeWindows1252AsIterable] and variants.
 *
 * This iterable only stores the decoding arguments. Each read of [iterator]
 * creates a new [Windows1252Decoder] over them, which translates bytes as
 * the user of the iterator asks for them. (Note: results are not cached.)
 */
class IterableWindows1252Decoder extends IterableBase<int> {
  final List<int> bytes;
  final int offset;
  final int length;
  final int replacementCodepoint;

  IterableWindows1252Decoder(this.bytes, [this.offset = 0, this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  Windows1252Decoder get iterator {
    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint);
  }
}
137
138
/**
 * An iterator of Unicode codepoints decoded from windows-1252 encoded bytes.
 *
 * The constructor arguments set an offset into a list of bytes (as int),
 * limit the number of bytes to be decoded, and override the default Unicode
 * replacement codepoint. Decoding covers `length` bytes starting at `offset`;
 * when `length` is omitted, or reaches past the end of the list, decoding
 * stops at the end of the list. Set [replacementCodepoint] to null to throw
 * an [ArgumentError] for an unmapped byte rather than replace it.
 *
 * [current] is null before the first call to [moveNext] and after the range
 * has been exhausted.
 */
class Windows1252Decoder implements Iterator<int> {
  final int replacementCodepoint;
  final List<int> _bytes;

  /** Index of the first byte to decode (never negative). */
  final int _start;

  /** Index one past the last byte to decode (never beyond the list). */
  final int _end;

  /** Index of the byte exposed by [current]; starts one before the range. */
  int _offset;

  Windows1252Decoder(List<int> bytes, [int offset = 0, int length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : _bytes = bytes,
        _start = offset < 0 ? 0 : offset,
        _end = _endOf(bytes, offset, length),
        _offset = offset - 1;

  /**
   * Returns the exclusive end index for a range of [length] bytes starting at
   * [offset], clamped so that it never exceeds `bytes.length`.
   */
  static int _endOf(List<int> bytes, int offset, int length) {
    if (length == null) return bytes.length;
    final end = offset + length;
    return end < bytes.length ? end : bytes.length;
  }

  // The lower bound is the start of the range, not 0: otherwise [current]
  // would expose the byte just before the range until [moveNext] is called.
  bool get _inRange => _offset >= _start && _offset < _end;
  int get current => _inRange ? _mapChar(_bytes[_offset]) : null;

  bool moveNext() {
    _offset++;
    return _inRange;
  }

  /**
   * Maps a single windows-1252 byte to its Unicode codepoint. Bytes without
   * an entry in the table below are returned unchanged; the five unassigned
   * bytes yield [replacementCodepoint], or throw an [ArgumentError] when it
   * is null.
   */
  int _mapChar(int char) {
    // TODO(jmesserly): this is duplicating entitiesWindows1252 and
    // replacementCharacters from constants.dart
    switch (char) {
      case 0x80: return 0x20AC; // EURO SIGN
      case 0x82: return 0x201A; // SINGLE LOW-9 QUOTATION MARK
      case 0x83: return 0x0192; // LATIN SMALL LETTER F WITH HOOK
      case 0x84: return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
      case 0x85: return 0x2026; // HORIZONTAL ELLIPSIS
      case 0x86: return 0x2020; // DAGGER
      case 0x87: return 0x2021; // DOUBLE DAGGER
      case 0x88: return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
      case 0x89: return 0x2030; // PER MILLE SIGN
      case 0x8A: return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
      case 0x8B: return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
      case 0x8C: return 0x0152; // LATIN CAPITAL LIGATURE OE
      case 0x8E: return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
      case 0x91: return 0x2018; // LEFT SINGLE QUOTATION MARK
      case 0x92: return 0x2019; // RIGHT SINGLE QUOTATION MARK
      case 0x93: return 0x201C; // LEFT DOUBLE QUOTATION MARK
      case 0x94: return 0x201D; // RIGHT DOUBLE QUOTATION MARK
      case 0x95: return 0x2022; // BULLET
      case 0x96: return 0x2013; // EN DASH
      case 0x97: return 0x2014; // EM DASH
      case 0x98: return 0x02DC; // SMALL TILDE
      case 0x99: return 0x2122; // TRADE MARK SIGN
      case 0x9A: return 0x0161; // LATIN SMALL LETTER S WITH CARON
      case 0x9B: return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
      case 0x9C: return 0x0153; // LATIN SMALL LIGATURE OE
      case 0x9E: return 0x017E; // LATIN SMALL LETTER Z WITH CARON
      case 0x9F: return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS

      // Bytes that windows-1252 leaves unassigned.
      case 0x81:
      case 0x8D:
      case 0x8F:
      case 0x90:
      case 0x9D:
        if (replacementCodepoint == null) {
          throw new ArgumentError(
              "Invalid windows-1252 code point $char at $_offset");
        }
        return replacementCodepoint;
    }
    return char;
  }
}
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698