Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(19)

Side by Side Diff: html/lib/src/char_encodings.dart

Issue 1400473008: Roll Observatory packages and add a roll script (Closed) Base URL: git@github.com:dart-lang/observatory_pub_packages.git@master
Patch Set: Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « html/lib/parser_console.dart ('k') | html/lib/src/constants.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 /// Decodes bytes using the correct name. See [decodeBytes].
2 library char_encodings;
3
4 import 'dart:collection';
5 import 'package:utf/utf.dart';
6
// TODO(jmesserly): this function is conspicuously absent from dart:utf.
/// Whether [bytes] holds the UTF-8 byte order mark (0xEF 0xBB 0xBF) at
/// [offset].
///
/// Since UTF-8 doesn't have byte order, "byte order mark" is somewhat of a
/// misnomer, but it is used in HTML to detect UTF-8 encoded content.
///
/// Only the window of [length] bytes starting at [offset] is examined; when
/// [length] is omitted the window runs to the end of [bytes]. Returns false
/// when the window holds fewer than three bytes.
bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
  final end = length == null ? bytes.length : offset + length;
  // The mark is three bytes long, so a shorter window can never match.
  if (end - offset < 3) return false;
  return bytes[offset] == 0xEF &&
      bytes[offset + 1] == 0xBB &&
      bytes[offset + 2] == 0xBF;
}
18
// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire
// file, but dart:utf does not expose stream-based decoders yet.
/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
/// the codepoints.
///
/// Supports the major unicode encodings as well as the ascii and windows-1252
/// encodings. Decoding starts at [offset] and covers [length] bytes; when
/// [length] is omitted it covers everything from [offset] to the end of
/// [bytes]. [replacementCodepoint] is forwarded to the underlying decoder.
///
/// For 'utf-8' a leading byte order mark is skipped.
///
/// Throws a [FormatException] if [encoding] is 'ascii' and a byte above 127 is
/// found, and an [ArgumentError] if [encoding] is not supported.
Iterable<int> decodeBytes(String encoding, List<int> bytes, [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // The length exactly as the caller supplied it (possibly null). It is
  // forwarded to the windows-1252 decoder so that decoder applies its own
  // default when no length was given.
  final requestedLength = length;
  // Default to the bytes remaining after [offset] rather than the whole list;
  // otherwise `offset + length` overshoots the list for any non-zero offset.
  if (length == null) length = bytes.length - offset;
  final replace = replacementCodepoint;
  switch (encoding) {
    case 'ascii':
      bytes = bytes.sublist(offset, offset + length);
      // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
      for (int byte in bytes) {
        if (byte > 127) {
          // TODO(jmesserly): ideally this would be DecoderException, like the
          // one thrown in runtime/bin/string_stream.dart, but we don't want to
          // depend on dart:io.
          throw new FormatException("Illegal ASCII character $byte");
        }
      }
      return bytes;

    case 'windows-1252':
    case 'cp1252':
      return decodeWindows1252AsIterable(
          bytes, offset, requestedLength, replace);

    case 'utf-8':
      // NOTE: to match the behavior of the other decode functions, we eat the
      // utf-8 BOM here.
      if (hasUtf8Bom(bytes, offset, length)) {
        offset += 3;
        length -= 3;
      }
      return decodeUtf8AsIterable(bytes, offset, length, replace);

    case 'utf-16':
      return decodeUtf16AsIterable(bytes, offset, length, replace);
    case 'utf-16-be':
      return decodeUtf16beAsIterable(bytes, offset, length, true, replace);
    case 'utf-16-le':
      return decodeUtf16leAsIterable(bytes, offset, length, true, replace);

    case 'utf-32':
      return decodeUtf32AsIterable(bytes, offset, length, replace);
    case 'utf-32-be':
      return decodeUtf32beAsIterable(bytes, offset, length, true, replace);
    case 'utf-32-le':
      return decodeUtf32leAsIterable(bytes, offset, length, true, replace);

    default:
      throw new ArgumentError('Encoding $encoding not supported');
  }
}
74
// TODO(jmesserly): use dart:utf once http://dartbug.com/6476 is fixed.
/// Returns the code points for the [input].
///
/// This works like [String.codeUnits], except that a UTF-16 high surrogate
/// immediately followed by a low surrogate is combined into the single code
/// point the pair encodes. Surrogates that are not part of such a pair are
/// returned unchanged.
List<int> toCodepoints(String input) {
  final codepoints = <int>[];
  var index = 0;
  while (index < input.length) {
    var unit = input.codeUnitAt(index++);
    final isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
    // `index` now points at the unit following the one just read.
    if (isHighSurrogate && index < input.length) {
      final trail = input.codeUnitAt(index);
      if (trail >= 0xDC00 && trail <= 0xDFFF) {
        unit = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00);
        // Consume the low surrogate as well.
        index++;
      }
    }
    codepoints.add(unit);
  }
  return codepoints;
}
96
/// Lazily decodes [windows-1252](http://en.wikipedia.org/wiki/Windows-1252)
/// [bytes] into Unicode codepoints.
///
/// The result is an iterable, so the consumer converts only as much of the
/// input as it actually iterates over. [offset], [length] and
/// [replacementCodepoint] are passed through to [IterableWindows1252Decoder].
/// Set [replacementCodepoint] to null to throw an [ArgumentError] rather than
/// replace a bad value.
IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
    [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableWindows1252Decoder(bytes, offset, length, replacementCodepoint);
107
/// Return type of [decodeWindows1252AsIterable] and variants.
///
/// This iterable hands out a fresh [Windows1252Decoder] each time [iterator]
/// is read, and that iterator only translates bytes as they are requested.
/// (Note: results are not cached.)
class IterableWindows1252Decoder extends IterableBase<int> {
  /// The windows-1252 encoded input.
  final List<int> bytes;

  /// Passed unchanged to each [Windows1252Decoder] as its offset argument.
  final int offset;

  /// Passed unchanged to each [Windows1252Decoder] as its length argument;
  /// may be null.
  final int length;

  /// Passed unchanged to each [Windows1252Decoder] as its replacement
  /// codepoint.
  final int replacementCodepoint;

  IterableWindows1252Decoder(this.bytes, [this.offset = 0, this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  Windows1252Decoder get iterator {
    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint);
  }
}
124
/// Provides an iterator of Unicode codepoints from windows-1252 encoded bytes.
///
/// The constructor parameters set an offset into a list of bytes (as int),
/// limit the number of bytes to be decoded, and override the default Unicode
/// replacement character. When no length is given, decoding runs to the end of
/// the list; a length that would reach past the end of the list is clamped to
/// it. Set [replacementCodepoint] to null to throw an [ArgumentError] rather
/// than replace a byte that has no windows-1252 mapping.
///
/// As with any [Iterator], [current] is null until [moveNext] has returned
/// true, and again once it has returned false.
class Windows1252Decoder implements Iterator<int> {
  /// Codepoint emitted for bytes with no windows-1252 mapping, or null to
  /// throw an [ArgumentError] for such bytes instead.
  final int replacementCodepoint;
  final List<int> _bytes;

  /// Index of the first byte to decode.
  final int _start;

  /// Index one past the last byte to decode.
  final int _end;

  /// Index of the byte exposed by [current]; `_start - 1` before iteration.
  int _offset;

  Windows1252Decoder(List<int> bytes, [int offset = 0, int length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : _bytes = bytes,
        _start = offset,
        _end = _endIndex(bytes, offset, length),
        _offset = offset - 1;

  /// Computes the exclusive end index for decoding [length] bytes of [bytes]
  /// starting at [offset], never exceeding the end of the list.
  static int _endIndex(List<int> bytes, int offset, int length) {
    if (length == null) return bytes.length;
    final end = offset + length;
    return end > bytes.length ? bytes.length : end;
  }

  // Compare against [_start] rather than 0 so that [current] stays null
  // before the first [moveNext] even when decoding starts at a non-zero
  // offset.
  bool get _inRange => _offset >= _start && _offset < _end;
  int get current => _inRange ? _mapChar(_bytes[_offset]) : null;

  bool moveNext() {
    // Stop advancing once the end has been reached.
    if (_offset < _end) _offset++;
    return _inRange;
  }

  /// Maps a single windows-1252 byte to its Unicode codepoint.
  ///
  /// Bytes without an explicit case below are returned unchanged. The five
  /// unassigned bytes yield [replacementCodepoint], or throw an
  /// [ArgumentError] when it is null.
  int _mapChar(int char) {
    // TODO(jmesserly): this is duplicating entitiesWindows1252 and
    // replacementCharacters from constants.dart
    switch (char) {
      case 0x80:
        return 0x20AC; // EURO SIGN
      case 0x82:
        return 0x201A; // SINGLE LOW-9 QUOTATION MARK
      case 0x83:
        return 0x0192; // LATIN SMALL LETTER F WITH HOOK
      case 0x84:
        return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
      case 0x85:
        return 0x2026; // HORIZONTAL ELLIPSIS
      case 0x86:
        return 0x2020; // DAGGER
      case 0x87:
        return 0x2021; // DOUBLE DAGGER
      case 0x88:
        return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
      case 0x89:
        return 0x2030; // PER MILLE SIGN
      case 0x8A:
        return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
      case 0x8B:
        return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
      case 0x8C:
        return 0x0152; // LATIN CAPITAL LIGATURE OE
      case 0x8E:
        return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
      case 0x91:
        return 0x2018; // LEFT SINGLE QUOTATION MARK
      case 0x92:
        return 0x2019; // RIGHT SINGLE QUOTATION MARK
      case 0x93:
        return 0x201C; // LEFT DOUBLE QUOTATION MARK
      case 0x94:
        return 0x201D; // RIGHT DOUBLE QUOTATION MARK
      case 0x95:
        return 0x2022; // BULLET
      case 0x96:
        return 0x2013; // EN DASH
      case 0x97:
        return 0x2014; // EM DASH
      case 0x98:
        return 0x02DC; // SMALL TILDE
      case 0x99:
        return 0x2122; // TRADE MARK SIGN
      case 0x9A:
        return 0x0161; // LATIN SMALL LETTER S WITH CARON
      case 0x9B:
        return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
      case 0x9C:
        return 0x0153; // LATIN SMALL LIGATURE OE
      case 0x9E:
        return 0x017E; // LATIN SMALL LETTER Z WITH CARON
      case 0x9F:
        return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS

      // Bytes that windows-1252 leaves unassigned.
      case 0x81:
      case 0x8D:
      case 0x8F:
      case 0x90:
      case 0x9D:
        if (replacementCodepoint == null) {
          throw new ArgumentError(
              "Invalid windows-1252 code point $char at $_offset");
        }
        return replacementCodepoint;
    }
    return char;
  }
}
OLDNEW
« no previous file with comments | « html/lib/parser_console.dart ('k') | html/lib/src/constants.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698