Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: sdk/lib/convert/utf.dart

Issue 25463003: Fix UTF8 encoder for Unicode runes > 0xFFFF. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | tests/lib/convert/unicode_tests.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of dart.convert; 5 part of dart.convert;
6 6
7 /** The Unicode Replacement character `U+FFFD` (�). */ 7 /** The Unicode Replacement character `U+FFFD` (�). */
8 const UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; 8 const UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;
9 9
10 /** The Unicode Byte Order Mark (BOM) character `U+FEFF`. */
11 const UNICODE_BOM_CHARACTER_RUNE = 0xFEFF;
12
10 /** 13 /**
11 * An instance of the default implementation of the [Utf8Codec]. 14 * An instance of the default implementation of the [Utf8Codec].
12 * 15 *
13 * This instance provides convenient access to the most common UTF-8 16 * This instance provides convenient access to the most common UTF-8
14 * use cases. 17 * use cases.
15 * 18 *
16 * Examples: 19 * Examples:
17 * 20 *
18 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ"); 21 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ");
19 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6, 22 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,
(...skipping 21 matching lines...) Expand all
41 */ 44 */
42 const Utf8Codec({ bool allowMalformed: false }) 45 const Utf8Codec({ bool allowMalformed: false })
43 : _allowMalformed = allowMalformed; 46 : _allowMalformed = allowMalformed;
44 47
45 String get name => "utf-8"; 48 String get name => "utf-8";
46 49
47 /** 50 /**
48 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the 51 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
49 * corresponding string. 52 * corresponding string.
50 * 53 *
54 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE], this
Anders Johnsen 2013/10/01 11:32:29 Is this really happening? I don't see any usage of
floitsch 2013/10/01 11:37:42 There was a private copy of it. I removed that one
55 * character is discarded.
56 *
51 * If [allowMalformed] is `true` the decoder replaces invalid (or 57 * If [allowMalformed] is `true` the decoder replaces invalid (or
52 * unterminated) character sequences with the Unicode Replacement character 58 * unterminated) character sequences with the Unicode Replacement character
53 * `U+FFFD` (�). Otherwise it throws a [FormatException]. 59 * `U+FFFD` (�). Otherwise it throws a [FormatException].
54 * 60 *
55 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that 61 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that
56 * was used to instantiate `this`. 62 * was used to instantiate `this`.
57 */ 63 */
58 String decode(List<int> codeUnits, { bool allowMalformed }) { 64 String decode(List<int> codeUnits, { bool allowMalformed }) {
59 if (allowMalformed == null) allowMalformed = _allowMalformed; 65 if (allowMalformed == null) allowMalformed = _allowMalformed;
60 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits); 66 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits);
(...skipping 235 matching lines...) Expand 10 before | Expand all | Expand 10 after
296 * If it is `true` [convert] replaces invalid (or unterminated) character 302 * If it is `true` [convert] replaces invalid (or unterminated) character
297 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise 303 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise
298 * it throws a [FormatException]. 304 * it throws a [FormatException].
299 */ 305 */
300 const Utf8Decoder({ bool allowMalformed: false }) 306 const Utf8Decoder({ bool allowMalformed: false })
301 : this._allowMalformed = allowMalformed; 307 : this._allowMalformed = allowMalformed;
302 308
303 /** 309 /**
304 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the 310 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
305 * corresponding string. 311 * corresponding string.
312 *
313 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE], this
314 * character is discarded.
306 */ 315 */
307 String convert(List<int> codeUnits) { 316 String convert(List<int> codeUnits) {
308 StringBuffer buffer = new StringBuffer(); 317 StringBuffer buffer = new StringBuffer();
309 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed); 318 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed);
310 decoder.convert(codeUnits, 0, codeUnits.length); 319 decoder.convert(codeUnits, 0, codeUnits.length);
311 decoder.close(); 320 decoder.close();
312 return buffer.toString(); 321 return buffer.toString();
313 } 322 }
314 323
315 /** 324 /**
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
349 const int _REPLACEMENT_CHARACTER = 0xFFFD; 358 const int _REPLACEMENT_CHARACTER = 0xFFFD;
350 const int _BOM_CHARACTER = 0xFEFF; 359 const int _BOM_CHARACTER = 0xFEFF;
351 360
352 bool _isSurrogate(int codeUnit) => 361 bool _isSurrogate(int codeUnit) =>
353 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN; 362 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;
354 bool _isLeadSurrogate(int codeUnit) => 363 bool _isLeadSurrogate(int codeUnit) =>
355 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN; 364 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;
356 bool _isTailSurrogate(int codeUnit) => 365 bool _isTailSurrogate(int codeUnit) =>
357 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN; 366 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;
358 int _combineSurrogatePair(int lead, int tail) => 367 int _combineSurrogatePair(int lead, int tail) =>
359 0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10) 368 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10)
360 | (tail & _SURROGATE_VALUE_MASK); 369 | (tail & _SURROGATE_VALUE_MASK);
361 370
362 371
363 /** 372 /**
364 * Decodes UTF-8. 373 * Decodes UTF-8.
365 * 374 *
366 * The decoder handles chunked input. 375 * The decoder handles chunked input.
367 */ 376 */
368 // TODO(floitsch): make this class public. 377 // TODO(floitsch): make this class public.
369 class _Utf8Decoder { 378 class _Utf8Decoder {
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after
507 } 516 }
508 break loop; 517 break loop;
509 } 518 }
510 if (expectedUnits > 0) { 519 if (expectedUnits > 0) {
511 _value = value; 520 _value = value;
512 _expectedUnits = expectedUnits; 521 _expectedUnits = expectedUnits;
513 _extraUnits = extraUnits; 522 _extraUnits = extraUnits;
514 } 523 }
515 } 524 }
516 } 525 }
OLDNEW
« no previous file with comments | « no previous file | tests/lib/convert/unicode_tests.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698