OLD | NEW |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of dart.utf; | 5 part of dart.utf; |
6 | 6 |
7 /** | 7 /** |
8 * Decodes the UTF-16 bytes as an iterable. Thus, the consumer can only convert | 8 * Decodes the UTF-16 bytes as an iterable. Thus, the consumer can only convert |
9 * as much of the input as needed. Determines the byte order from the BOM, | 9 * as much of the input as needed. Determines the byte order from the BOM, |
10 * or uses big-endian as a default. This method always strips a leading BOM. | 10 * or uses big-endian as a default. This method always strips a leading BOM. |
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
106 List<int> encodeUtf16(String str) => | 106 List<int> encodeUtf16(String str) => |
107 encodeUtf16be(str, true); | 107 encodeUtf16be(str, true); |
108 | 108 |
109 /** | 109 /** |
110 * Produce a list of UTF-16BE encoded bytes. By default, this method produces | 110 * Produce a list of UTF-16BE encoded bytes. By default, this method produces |
111 * UTF-16BE bytes with no BOM. | 111 * UTF-16BE bytes with no BOM. |
112 */ | 112 */ |
113 List<int> encodeUtf16be(String str, [bool writeBOM = false]) { | 113 List<int> encodeUtf16be(String str, [bool writeBOM = false]) { |
114 List<int> utf16CodeUnits = _stringToUtf16CodeUnits(str); | 114 List<int> utf16CodeUnits = _stringToUtf16CodeUnits(str); |
115 List<int> encoding = | 115 List<int> encoding = |
116 new List<int>(2 * utf16CodeUnits.length + (writeBOM ? 2 : 0)); | 116 new List<int>.fixedLength(2 * utf16CodeUnits.length + (writeBOM ? 2 : 0)); |
117 int i = 0; | 117 int i = 0; |
118 if (writeBOM) { | 118 if (writeBOM) { |
119 encoding[i++] = UNICODE_UTF_BOM_HI; | 119 encoding[i++] = UNICODE_UTF_BOM_HI; |
120 encoding[i++] = UNICODE_UTF_BOM_LO; | 120 encoding[i++] = UNICODE_UTF_BOM_LO; |
121 } | 121 } |
122 for (int unit in utf16CodeUnits) { | 122 for (int unit in utf16CodeUnits) { |
123 encoding[i++] = (unit & UNICODE_BYTE_ONE_MASK) >> 8; | 123 encoding[i++] = (unit & UNICODE_BYTE_ONE_MASK) >> 8; |
124 encoding[i++] = unit & UNICODE_BYTE_ZERO_MASK; | 124 encoding[i++] = unit & UNICODE_BYTE_ZERO_MASK; |
125 } | 125 } |
126 return encoding; | 126 return encoding; |
127 } | 127 } |
128 | 128 |
129 /** | 129 /** |
130 * Produce a list of UTF-16LE encoded bytes. By default, this method produces | 130 * Produce a list of UTF-16LE encoded bytes. By default, this method produces |
131 * UTF-16LE bytes with no BOM. | 131 * UTF-16LE bytes with no BOM. |
132 */ | 132 */ |
133 List<int> encodeUtf16le(String str, [bool writeBOM = false]) { | 133 List<int> encodeUtf16le(String str, [bool writeBOM = false]) { |
134 List<int> utf16CodeUnits = _stringToUtf16CodeUnits(str); | 134 List<int> utf16CodeUnits = _stringToUtf16CodeUnits(str); |
135 List<int> encoding = | 135 List<int> encoding = |
136 new List<int>(2 * utf16CodeUnits.length + (writeBOM ? 2 : 0)); | 136 new List<int>.fixedLength(2 * utf16CodeUnits.length + (writeBOM ? 2 : 0)); |
137 int i = 0; | 137 int i = 0; |
138 if (writeBOM) { | 138 if (writeBOM) { |
139 encoding[i++] = UNICODE_UTF_BOM_LO; | 139 encoding[i++] = UNICODE_UTF_BOM_LO; |
140 encoding[i++] = UNICODE_UTF_BOM_HI; | 140 encoding[i++] = UNICODE_UTF_BOM_HI; |
141 } | 141 } |
142 for (int unit in utf16CodeUnits) { | 142 for (int unit in utf16CodeUnits) { |
143 encoding[i++] = unit & UNICODE_BYTE_ZERO_MASK; | 143 encoding[i++] = unit & UNICODE_BYTE_ZERO_MASK; |
144 encoding[i++] = (unit & UNICODE_BYTE_ONE_MASK) >> 8; | 144 encoding[i++] = (unit & UNICODE_BYTE_ONE_MASK) >> 8; |
145 } | 145 } |
146 return encoding; | 146 return encoding; |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
181 return _codepointsToUtf16CodeUnits(str.charCodes); | 181 return _codepointsToUtf16CodeUnits(str.charCodes); |
182 } | 182 } |
183 | 183 |
184 typedef _ListRangeIterator _CodeUnitsProvider(); | 184 typedef _ListRangeIterator _CodeUnitsProvider(); |
185 | 185 |
186 /** | 186 /** |
187 * Return type of [decodeUtf16AsIterable] and variants. The Iterable type | 187 * Return type of [decodeUtf16AsIterable] and variants. The Iterable type |
188 * provides an iterator on demand and the iterator will only translate bytes | 188 * provides an iterator on demand and the iterator will only translate bytes |
189 * as requested by the user of the iterator. (Note: results are not cached.) | 189 * as requested by the user of the iterator. (Note: results are not cached.) |
190 */ | 190 */ |
191 class IterableUtf16Decoder implements Iterable<int> { | 191 // TODO(floitsch): Consider removing the extend and switch to implements since |
| 192 // that's cheaper to allocate. |
| 193 class IterableUtf16Decoder extends Iterable<int> { |
192 final _CodeUnitsProvider codeunitsProvider; | 194 final _CodeUnitsProvider codeunitsProvider; |
193 final int replacementCodepoint; | 195 final int replacementCodepoint; |
194 | 196 |
195 IterableUtf16Decoder._(this.codeunitsProvider, this.replacementCodepoint); | 197 IterableUtf16Decoder._(this.codeunitsProvider, this.replacementCodepoint); |
196 | 198 |
197 Utf16CodeUnitDecoder iterator() => | 199 Utf16CodeUnitDecoder get iterator => |
198 new Utf16CodeUnitDecoder.fromListRangeIterator(codeunitsProvider(), | 200 new Utf16CodeUnitDecoder.fromListRangeIterator(codeunitsProvider(), |
199 replacementCodepoint); | 201 replacementCodepoint); |
200 } | 202 } |
201 | 203 |
202 /** | 204 /** |
203 * Convert UTF-16 encoded bytes to UTF-16 code units by grouping 1-2 bytes | 205 * Convert UTF-16 encoded bytes to UTF-16 code units by grouping 1-2 bytes |
204 * to produce the code unit (0-(2^16)-1). Relies on BOM to determine | 206 * to produce the code unit (0-(2^16)-1). Relies on BOM to determine |
205 * endian-ness, and defaults to BE. | 207 * endian-ness, and defaults to BE. |
206 */ | 208 */ |
207 class Utf16BytesToCodeUnitsDecoder implements _ListRangeIterator { | 209 class Utf16BytesToCodeUnitsDecoder implements _ListRangeIterator { |
208 final _ListRangeIterator utf16EncodedBytesIterator; | 210 final _ListRangeIterator utf16EncodedBytesIterator; |
209 final int replacementCodepoint; | 211 final int replacementCodepoint; |
| 212 int _current = null; |
210 | 213 |
211 Utf16BytesToCodeUnitsDecoder._fromListRangeIterator( | 214 Utf16BytesToCodeUnitsDecoder._fromListRangeIterator( |
212 this.utf16EncodedBytesIterator, this.replacementCodepoint); | 215 this.utf16EncodedBytesIterator, this.replacementCodepoint); |
213 | 216 |
214 factory Utf16BytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [ | 217 factory Utf16BytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [ |
215 int offset = 0, int length, | 218 int offset = 0, int length, |
216 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 219 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
217 if (length == null) { | 220 if (length == null) { |
218 length = utf16EncodedBytes.length - offset; | 221 length = utf16EncodedBytes.length - offset; |
219 } | 222 } |
220 if (hasUtf16beBom(utf16EncodedBytes, offset, length)) { | 223 if (hasUtf16beBom(utf16EncodedBytes, offset, length)) { |
221 return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2, | 224 return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2, |
222 length - 2, false, replacementCodepoint); | 225 length - 2, false, replacementCodepoint); |
223 } else if (hasUtf16leBom(utf16EncodedBytes, offset, length)) { | 226 } else if (hasUtf16leBom(utf16EncodedBytes, offset, length)) { |
224 return new Utf16leBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2, | 227 return new Utf16leBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2, |
225 length - 2, false, replacementCodepoint); | 228 length - 2, false, replacementCodepoint); |
226 } else { | 229 } else { |
227 return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset, | 230 return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset, |
228 length, false, replacementCodepoint); | 231 length, false, replacementCodepoint); |
229 } | 232 } |
230 } | 233 } |
231 | 234 |
232 /** | 235 /** |
233 * Provides a fast way to decode the rest of the source bytes in a single | 236 * Provides a fast way to decode the rest of the source bytes in a single |
234 * call. This method trades memory for improved speed in that it potentially | 237 * call. This method trades memory for improved speed in that it potentially |
235 * over-allocates the List containing results. | 238 * over-allocates the List containing results. |
236 */ | 239 */ |
237 List<int> decodeRest() { | 240 List<int> decodeRest() { |
238 List<int> codeunits = new List<int>(remaining); | 241 List<int> codeunits = new List<int>.fixedLength(remaining); |
239 int i = 0; | 242 int i = 0; |
240 while (hasNext) { | 243 while (moveNext()) { |
241 codeunits[i++] = next(); | 244 codeunits[i++] = current; |
242 } | 245 } |
243 if (i == codeunits.length) { | 246 if (i == codeunits.length) { |
244 return codeunits; | 247 return codeunits; |
245 } else { | 248 } else { |
246 List<int> truncCodeunits = new List<int>(i); | 249 List<int> truncCodeunits = new List<int>.fixedLength(i); |
247 truncCodeunits.setRange(0, i, codeunits); | 250 truncCodeunits.setRange(0, i, codeunits); |
248 return truncCodeunits; | 251 return truncCodeunits; |
249 } | 252 } |
250 } | 253 } |
251 | 254 |
252 bool get hasNext => utf16EncodedBytesIterator.hasNext; | 255 int get current => _current; |
253 | 256 |
254 int next() { | 257 bool moveNext() { |
| 258 _current = null; |
255 if (utf16EncodedBytesIterator.remaining < 2) { | 259 if (utf16EncodedBytesIterator.remaining < 2) { |
256 utf16EncodedBytesIterator.next(); | 260 utf16EncodedBytesIterator.moveNext(); |
257 if (replacementCodepoint != null) { | 261 if (replacementCodepoint != null) { |
258 return replacementCodepoint; | 262 _current = replacementCodepoint; |
| 263 return true; |
259 } else { | 264 } else { |
260 throw new ArgumentError( | 265 throw new ArgumentError( |
261 "Invalid UTF16 at ${utf16EncodedBytesIterator.position}"); | 266 "Invalid UTF16 at ${utf16EncodedBytesIterator.position}"); |
262 } | 267 } |
263 } else { | 268 } else { |
264 return decode(); | 269 _current = decode(); |
| 270 return true; |
265 } | 271 } |
266 } | 272 } |
267 | 273 |
268 int get position => utf16EncodedBytesIterator.position ~/ 2; | 274 int get position => utf16EncodedBytesIterator.position ~/ 2; |
269 | 275 |
270 void backup([int by = 1]) { | 276 void backup([int by = 1]) { |
271 utf16EncodedBytesIterator.backup(2 * by); | 277 utf16EncodedBytesIterator.backup(2 * by); |
272 } | 278 } |
273 | 279 |
274 int get remaining => (utf16EncodedBytesIterator.remaining + 1) ~/ 2; | 280 int get remaining => (utf16EncodedBytesIterator.remaining + 1) ~/ 2; |
275 | 281 |
276 void skip([int count = 1]) { | 282 void skip([int count = 1]) { |
277 utf16EncodedBytesIterator.skip(2 * count); | 283 utf16EncodedBytesIterator.skip(2 * count); |
278 } | 284 } |
279 | 285 |
280 int decode(); | 286 int decode(); |
281 } | 287 } |
282 | 288 |
283 /** | 289 /** |
284 * Convert UTF-16BE encoded bytes to utf16 code units by grouping 1-2 bytes | 290 * Convert UTF-16BE encoded bytes to utf16 code units by grouping 1-2 bytes |
285 * to produce the code unit (0-(2^16)-1). | 291 * to produce the code unit (0-(2^16)-1). |
286 */ | 292 */ |
287 class Utf16beBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder { | 293 class Utf16beBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder { |
288 Utf16beBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [ | 294 Utf16beBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [ |
289 int offset = 0, int length, bool stripBom = true, | 295 int offset = 0, int length, bool stripBom = true, |
290 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 296 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
291 super._fromListRangeIterator((new _ListRange(utf16EncodedBytes, offset, | 297 super._fromListRangeIterator( |
292 length)).iterator(), replacementCodepoint) { | 298 (new _ListRange(utf16EncodedBytes, offset, length)).iterator, |
| 299 replacementCodepoint) { |
293 if (stripBom && hasUtf16beBom(utf16EncodedBytes, offset, length)) { | 300 if (stripBom && hasUtf16beBom(utf16EncodedBytes, offset, length)) { |
294 skip(); | 301 skip(); |
295 } | 302 } |
296 } | 303 } |
297 | 304 |
298 int decode() { | 305 int decode() { |
299 int hi = utf16EncodedBytesIterator.next(); | 306 utf16EncodedBytesIterator.moveNext(); |
300 int lo = utf16EncodedBytesIterator.next(); | 307 int hi = utf16EncodedBytesIterator.current; |
| 308 utf16EncodedBytesIterator.moveNext(); |
| 309 int lo = utf16EncodedBytesIterator.current; |
301 return (hi << 8) + lo; | 310 return (hi << 8) + lo; |
302 } | 311 } |
303 } | 312 } |
304 | 313 |
305 /** | 314 /** |
306 * Convert UTF-16LE encoded bytes to utf16 code units by grouping 1-2 bytes | 315 * Convert UTF-16LE encoded bytes to utf16 code units by grouping 1-2 bytes |
307 * to produce the code unit (0-(2^16)-1). | 316 * to produce the code unit (0-(2^16)-1). |
308 */ | 317 */ |
309 class Utf16leBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder { | 318 class Utf16leBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder { |
310 Utf16leBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [ | 319 Utf16leBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [ |
311 int offset = 0, int length, bool stripBom = true, | 320 int offset = 0, int length, bool stripBom = true, |
312 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 321 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
313 super._fromListRangeIterator((new _ListRange(utf16EncodedBytes, offset, | 322 super._fromListRangeIterator( |
314 length)).iterator(), replacementCodepoint) { | 323 (new _ListRange(utf16EncodedBytes, offset, length)).iterator, |
| 324 replacementCodepoint) { |
315 if (stripBom && hasUtf16leBom(utf16EncodedBytes, offset, length)) { | 325 if (stripBom && hasUtf16leBom(utf16EncodedBytes, offset, length)) { |
316 skip(); | 326 skip(); |
317 } | 327 } |
318 } | 328 } |
319 | 329 |
320 int decode() { | 330 int decode() { |
321 int lo = utf16EncodedBytesIterator.next(); | 331 utf16EncodedBytesIterator.moveNext(); |
322 int hi = utf16EncodedBytesIterator.next(); | 332 int lo = utf16EncodedBytesIterator.current; |
| 333 utf16EncodedBytesIterator.moveNext(); |
| 334 int hi = utf16EncodedBytesIterator.current; |
323 return (hi << 8) + lo; | 335 return (hi << 8) + lo; |
324 } | 336 } |
325 } | 337 } |
OLD | NEW |