sdk/lib/convert/utf.dart - Issue 25463003: Fix UTF8 encoder for Unicode runes > 0xFFFF.

Side by Side Diff: sdk/lib/convert/utf.dart

Issue 25463003: Fix UTF8 encoder for Unicode runes > 0xFFFF. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 part of dart.convert;	5 part of dart.convert;

6	6

7 /** The Unicode Replacement character `U+FFFD` (�). */	7 /** The Unicode Replacement character `U+FFFD` (�). */

8 const UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;	8 const UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;

9	9

	10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */

	11 const UNICODE_BOM_CHARACTER_RUNE = 0xFEFF;

	12

10 /**	13 /**

11 * An instance of the default implementation of the [Utf8Codec].	14 * An instance of the default implementation of the [Utf8Codec].

12 *	15 *

13 * This instance provides a convenient access to the most common UTF-8	16 * This instance provides a convenient access to the most common UTF-8

14 * use cases.	17 * use cases.

15 *	18 *

16 * Examples:	19 * Examples:

17 *	20 *

18 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ");	21 * var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ");

19 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,	22 * var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,

(...skipping 21 matching lines...) Expand all Loading...
41 */	44 */

42 const Utf8Codec({ bool allowMalformed: false })	45 const Utf8Codec({ bool allowMalformed: false })

43 : _allowMalformed = allowMalformed;	46 : _allowMalformed = allowMalformed;

44	47

45 String get name => "utf-8";	48 String get name => "utf-8";

46	49

47 /**	50 /**

48 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the	51 * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the

49 * corresponding string.	52 * corresponding string.

50 *	53 *

	54 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE] this
	Anders Johnsen 2013/10/01 11:32:29: Is this really happening? I don't see any usage of 'UNICODE_BOM_CHARACTER_RUNE'.
	floitsch 2013/10/01 11:37:42: On 2013/10/01 11:32:29, Anders Johnsen wrote: > Is this really happening? I don't see any usage of 'UNICODE_BOM_CHARACTER_RUNE'. There was a private copy of it. I removed that one.
	55 * character is discarded.

	56 *

51 * If [allowMalformed] is `true` the decoder replaces invalid (or	57 * If [allowMalformed] is `true` the decoder replaces invalid (or

52 * unterminated) character sequences with the Unicode Replacement character	58 * unterminated) character sequences with the Unicode Replacement character

53 * `U+FFFD` (�). Otherwise it throws a [FormatException].	59 * `U+FFFD` (�). Otherwise it throws a [FormatException].

54 *	60 *

55 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that	61 * If [allowMalformed] is not given, it defaults to the `allowMalformed` that

56 * was used to instantiate `this`.	62 * was used to instantiate `this`.

57 */	63 */

58 String decode(List<int> codeUnits, { bool allowMalformed }) {	64 String decode(List<int> codeUnits, { bool allowMalformed }) {

59 if (allowMalformed == null) allowMalformed = _allowMalformed;	65 if (allowMalformed == null) allowMalformed = _allowMalformed;

60 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits);	66 return new Utf8Decoder(allowMalformed: allowMalformed).convert(codeUnits);

(...skipping 235 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
296 * If it is `true` [convert] replaces invalid (or unterminated) character	302 * If it is `true` [convert] replaces invalid (or unterminated) character

297 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise	303 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise

298 * it throws a [FormatException].	304 * it throws a [FormatException].

299 */	305 */

300 const Utf8Decoder({ bool allowMalformed: false })	306 const Utf8Decoder({ bool allowMalformed: false })

301 : this._allowMalformed = allowMalformed;	307 : this._allowMalformed = allowMalformed;

302	308

303 /**	309 /**

304 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the	310 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the

305 * corresponding string.	311 * corresponding string.

	312 *

	313 * If the [codeUnits] start with a leading [UNICODE_BOM_CHARACTER_RUNE] this

	314 * character is discarded.

306 */	315 */

307 String convert(List<int> codeUnits) {	316 String convert(List<int> codeUnits) {

308 StringBuffer buffer = new StringBuffer();	317 StringBuffer buffer = new StringBuffer();

309 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed);	318 _Utf8Decoder decoder = new _Utf8Decoder(buffer, _allowMalformed);

310 decoder.convert(codeUnits, 0, codeUnits.length);	319 decoder.convert(codeUnits, 0, codeUnits.length);

311 decoder.close();	320 decoder.close();

312 return buffer.toString();	321 return buffer.toString();

313 }	322 }

314	323

315 /**	324 /**

(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
349 const int _REPLACEMENT_CHARACTER = 0xFFFD;	358 const int _REPLACEMENT_CHARACTER = 0xFFFD;

350 const int _BOM_CHARACTER = 0xFEFF;	359 const int _BOM_CHARACTER = 0xFEFF;

351	360

352 bool _isSurrogate(int codeUnit) =>	361 bool _isSurrogate(int codeUnit) =>

353 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;	362 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;

354 bool _isLeadSurrogate(int codeUnit) =>	363 bool _isLeadSurrogate(int codeUnit) =>

355 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;	364 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;

356 bool _isTailSurrogate(int codeUnit) =>	365 bool _isTailSurrogate(int codeUnit) =>

357 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;	366 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;

358 int _combineSurrogatePair(int lead, int tail) =>	367 int _combineSurrogatePair(int lead, int tail) =>

359 0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10)	368 0x10000 + ((lead & _SURROGATE_VALUE_MASK) << 10)

360 | (tail & _SURROGATE_VALUE_MASK);	369 | (tail & _SURROGATE_VALUE_MASK);

361	370

362	371

363 /**	372 /**

364 * Decodes UTF-8.	373 * Decodes UTF-8.

365 *	374 *

366 * The decoder handles chunked input.	375 * The decoder handles chunked input.

367 */	376 */

368 // TODO(floitsch): make this class public.	377 // TODO(floitsch): make this class public.

369 class _Utf8Decoder {	378 class _Utf8Decoder {

(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
507 }	516 }

508 break loop;	517 break loop;

509 }	518 }

510 if (expectedUnits > 0) {	519 if (expectedUnits > 0) {

511 _value = value;	520 _value = value;

512 _expectedUnits = expectedUnits;	521 _expectedUnits = expectedUnits;

513 _extraUnits = extraUnits;	522 _extraUnits = extraUnits;

514 }	523 }

515 }	524 }

516 }	525 }

OLD	NEW

« no previous file with comments | « no previous file | tests/lib/convert/unicode_tests.dart » ('j') | no next file with comments »