pkg/utf/lib/utf16.dart - Issue 68563004: Move unicode tests to utf package.

Side by Side Diff: pkg/utf/lib/utf16.dart

Issue 68563004: Move unicode tests to utf package. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Simplify test. Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 part of utf;	5 part of utf;

6	6

	7 // TODO(jmesserly): would be nice to have this on String (dartbug.com/6501).
	floitsch 2013/11/18 17:08:17: Whole section copied verbatim.
	Lasse Reichstein Nielsen 2013/11/19 07:42:38: I don't think I want to add anything new to the utf package now. It was kept for backwards compatibility, but new features should go into dart:convert instead. The utf package isn't something we want to continue developing.
	floitsch 2013/11/19 10:40:32: On 2013/11/19 07:42:38, Lasse Reichstein Nielsen wrote:
	> I don't think I want to add anything new to the utf package now.
	> It was kept for backwards compatibility, but new features should go into
	> dart:convert instead. The utf package isn't something we want to continue
	> developing.
	I don't agree. The utf package contains many more features than convert. I would like to see it improved as a package. When I say "copied verbatim" I meant from within the same package.
	Lasse Reichstein Nielsen 2013/11/19 12:25:43: From the same package - in that case, LGTM.
	8 /**

	9 * Provide a list of Unicode codepoints for a given string.

	10 */

	11 List<int> stringToCodepoints(String str) {

	12 // Note: str.codeUnits gives us 16-bit code units on all Dart implementations.

	13 // So we need to convert.

	14 return _utf16CodeUnitsToCodepoints(str.codeUnits);
	Lasse Reichstein Nielsen 2013/11/19 07:42:38: If we keep it (and I don't think we should - if it's used by the tests, put it there instead), then update it:
	- Change this to "return str.runes.toList();" and remove the comment above.
	- If _utf16CodeUnitsToCodepoints isn't used in other places, it can be removed then.
	floitsch 2013/11/19 10:40:32: On 2013/11/19 07:42:38, Lasse Reichstein Nielsen wrote:
	> If we keep it (and I don't think we should - if it's used by the tests, put it
	> there instead), then update it:
	> - Change this to "return str.runes.toList();" and remove the comment above.
	> - If _utf16CodeUnitsToCodepoints isn't used in other places, it can be removed
	> then.
	This was code that already existed in the package. Not changing in this CL.
	15 }

	16

	17 /**

	18 * Generate a string from the provided Unicode codepoints.

	19 *

	20 * Deprecated Use [String.fromCharCodes] instead.
	Lasse Reichstein Nielsen 2013/11/19 07:42:38: Ditto - remove this. Definitely remove the "Deprecated".
	floitsch 2013/11/19 10:40:32: On 2013/11/19 07:42:38, Lasse Reichstein Nielsen wrote:
	> Ditto - remove this. Definitely remove the "Deprecated".
	Not in this CL.
	21 */

	22 String codepointsToString(List<int> codepoints) {

	23 return new String.fromCharCodes(codepoints);

	24 }

	25

	26 /**

	27 * An Iterator<int> of codepoints built on an Iterator of UTF-16 code units.

	28 * The parameters can override the default Unicode replacement character. Set

	29 * the replacementCharacter to null to throw an ArgumentError

	30 * rather than replace the bad value.

	31 */

	32 class Utf16CodeUnitDecoder implements Iterator<int> {
	Lasse Reichstein Nielsen 2013/11/19 07:42:38: Do we have a way to use a Converter to go from input iterator to output iterator? If not, we probably should.
	floitsch 2013/11/19 10:40:32: On 2013/11/19 07:42:38, Lasse Reichstein Nielsen wrote:
	> Do we have a way to use a Converter to go from input iterator to output
	> iterator? If not, we probably should.
	Again, this is code that already existed. Not changing in this CL.
	33 final _ListRangeIterator utf16CodeUnitIterator;

	34 final int replacementCodepoint;

	35 int _current = null;

	36

	37 Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length,

	38 int this.replacementCodepoint =

	39 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :

	40 utf16CodeUnitIterator =

	41 (new _ListRange(utf16CodeUnits, offset, length)).iterator;

	42

	43 Utf16CodeUnitDecoder.fromListRangeIterator(

	44 _ListRangeIterator this.utf16CodeUnitIterator,

	45 int this.replacementCodepoint);

	46

	47 Iterator<int> get iterator => this;

	48

	49 int get current => _current;

	50

	51 bool moveNext() {

	52 _current = null;

	53 if (!utf16CodeUnitIterator.moveNext()) return false;

	54

	55 int value = utf16CodeUnitIterator.current;

	56 if (value < 0) {

	57 if (replacementCodepoint != null) {

	58 _current = replacementCodepoint;

	59 } else {

	60 throw new ArgumentError(

	61 "Invalid UTF16 at ${utf16CodeUnitIterator.position}");

	62 }

	63 } else if (value < UNICODE_UTF16_RESERVED_LO ||

	64 (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {

	65 // transfer directly

	66 _current = value;

	67 } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&

	68 utf16CodeUnitIterator.moveNext()) {

	69 // merge surrogate pair

	70 int nextValue = utf16CodeUnitIterator.current;

	71 if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&

	72 nextValue <= UNICODE_UTF16_RESERVED_HI) {

	73 value = (value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10;

	74 value += UNICODE_UTF16_OFFSET +

	75 (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE);

	76 _current = value;

	77 } else {

	78 if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_0_BASE &&

	79 nextValue < UNICODE_UTF16_SURROGATE_UNIT_1_BASE) {

	80 utf16CodeUnitIterator.backup();

	81 }

	82 if (replacementCodepoint != null) {

	83 _current = replacementCodepoint;

	84 } else {

	85 throw new ArgumentError(

	86 "Invalid UTF16 at ${utf16CodeUnitIterator.position}");

	87 }

	88 }

	89 } else if (replacementCodepoint != null) {

	90 _current = replacementCodepoint;

	91 } else {

	92 throw new ArgumentError(

	93 "Invalid UTF16 at ${utf16CodeUnitIterator.position}");

	94 }

	95 return true;

	96 }

	97 }

	98

	99 /**

	100 * Encode code points as UTF16 code units.

	101 */

	102 List<int> _codepointsToUtf16CodeUnits(
	Lasse Reichstein Nielsen 2013/11/19 07:42:38: If this isn't used, remove it. If it is, consider whether it can be replaced by:
	new String.fromCharCodes(codePoints.getRange(offset, length)).codeUnits;
	floitsch 2013/11/19 10:40:32: On 2013/11/19 07:42:38, Lasse Reichstein Nielsen wrote:
	> If this isn't used, remove it. If it is, consider whether it can be replaced by:
	> new String.fromCharCodes(codePoints.getRange(offset, length)).codeUnits;
	Ditto.
	103 List<int> codepoints,

	104 [int offset = 0,

	105 int length,

	106 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {

	107

	108 _ListRange listRange = new _ListRange(codepoints, offset, length);

	109 int encodedLength = 0;

	110 for (int value in listRange) {

	111 if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) ||

	112 (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {

	113 encodedLength++;

	114 } else if (value > UNICODE_PLANE_ONE_MAX &&

	115 value <= UNICODE_VALID_RANGE_MAX) {

	116 encodedLength += 2;

	117 } else {

	118 encodedLength++;

	119 }

	120 }

	121

	122 List<int> codeUnitsBuffer = new List<int>(encodedLength);

	123 int j = 0;

	124 for (int value in listRange) {

	125 if ((value >= 0 && value < UNICODE_UTF16_RESERVED_LO) ||

	126 (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {

	127 codeUnitsBuffer[j++] = value;

	128 } else if (value > UNICODE_PLANE_ONE_MAX &&

	129 value <= UNICODE_VALID_RANGE_MAX) {

	130 int base = value - UNICODE_UTF16_OFFSET;

	131 codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE +

	132 ((base & UNICODE_UTF16_HI_MASK) >> 10);

	133 codeUnitsBuffer[j++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE +

	134 (base & UNICODE_UTF16_LO_MASK);

	135 } else if (replacementCodepoint != null) {

	136 codeUnitsBuffer[j++] = replacementCodepoint;

	137 } else {

	138 throw new ArgumentError("Invalid encoding");

	139 }

	140 }

	141 return codeUnitsBuffer;

	142 }

	143

	144 /**

	145 * Decodes the utf16 codeunits to codepoints.

	146 */

	147 List<int> _utf16CodeUnitsToCodepoints(
	Lasse Reichstein Nielsen 2013/11/19 07:42:38: If not used, remove. If used, consider rewriting as:
	new String.fromCharCodes(utf16CodeUnits.getRange(offset, length)).runes.toList();
	floitsch 2013/11/19 10:40:32: On 2013/11/19 07:42:38, Lasse Reichstein Nielsen wrote:
	> If not used, remove.
	> If used, consider rewriting as
	> new String.fromCharCodes(utf16CodeUnits.getRange(offset,
	> length)).runes.toList();
	Ditto.
	148 List<int> utf16CodeUnits, [int offset = 0, int length,

	149 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {

	150 _ListRangeIterator source =

	151 (new _ListRange(utf16CodeUnits, offset, length)).iterator;

	152 Utf16CodeUnitDecoder decoder = new Utf16CodeUnitDecoder

	153 .fromListRangeIterator(source, replacementCodepoint);

	154 List<int> codepoints = new List<int>(source.remaining);

	155 int i = 0;

	156 while (decoder.moveNext()) {

	157 codepoints[i++] = decoder.current;

	158 }

	159 if (i == codepoints.length) {

	160 return codepoints;

	161 } else {

	162 List<int> codepointTrunc = new List<int>(i);

	163 codepointTrunc.setRange(0, i, codepoints);

	164 return codepointTrunc;

	165 }

	166 }

	167

7 /**	168 /**

8 * Decodes the UTF-16 bytes as an iterable. Thus, the consumer can only convert	169 * Decodes the UTF-16 bytes as an iterable. Thus, the consumer can only convert

9 * as much of the input as needed. Determines the byte order from the BOM,	170 * as much of the input as needed. Determines the byte order from the BOM,

10 * or uses big-endian as a default. This method always strips a leading BOM.	171 * or uses big-endian as a default. This method always strips a leading BOM.

11 * Set the [replacementCodepoint] to null to throw an ArgumentError	172 * Set the [replacementCodepoint] to null to throw an ArgumentError

12 * rather than replace the bad value. The default value for	173 * rather than replace the bad value. The default value for

13 * [replacementCodepoint] is U+FFFD.	174 * [replacementCodepoint] is U+FFFD.

14 */	175 */

15 IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, [int offset = 0,	176 IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, [int offset = 0,

16 int length, int replacementCodepoint =	177 int length, int replacementCodepoint =

(...skipping 232 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
249 List<int> truncCodeunits = new List<int>(i);	410 List<int> truncCodeunits = new List<int>(i);

250 truncCodeunits.setRange(0, i, codeunits);	411 truncCodeunits.setRange(0, i, codeunits);

251 return truncCodeunits;	412 return truncCodeunits;

252 }	413 }

253 }	414 }

254	415

255 int get current => _current;	416 int get current => _current;

256	417

257 bool moveNext() {	418 bool moveNext() {

258 _current = null;	419 _current = null;

259 if (utf16EncodedBytesIterator.remaining < 2) {	420 int remaining = utf16EncodedBytesIterator.remaining;

	421 if (remaining == 0) {

	422 _current = null;

	423 return false;

	424 }

	425 if (remaining == 1) {

260 utf16EncodedBytesIterator.moveNext();	426 utf16EncodedBytesIterator.moveNext();

261 if (replacementCodepoint != null) {	427 if (replacementCodepoint != null) {

262 _current = replacementCodepoint;	428 _current = replacementCodepoint;

263 return true;	429 return true;

264 } else {	430 } else {

265 throw new ArgumentError(	431 throw new ArgumentError(

266 "Invalid UTF16 at ${utf16EncodedBytesIterator.position}");	432 "Invalid UTF16 at ${utf16EncodedBytesIterator.position}");

267 }	433 }

268 } else {

269 _current = decode();

270 return true;

271 }	434 }

	435 _current = decode();

	436 return true;

272 }	437 }

273	438

274 int get position => utf16EncodedBytesIterator.position ~/ 2;	439 int get position => utf16EncodedBytesIterator.position ~/ 2;

275	440

276 void backup([int by = 1]) {	441 void backup([int by = 1]) {

277 utf16EncodedBytesIterator.backup(2 * by);	442 utf16EncodedBytesIterator.backup(2 * by);

278 }	443 }

279	444

280 int get remaining => (utf16EncodedBytesIterator.remaining + 1) ~/ 2;	445 int get remaining => (utf16EncodedBytesIterator.remaining + 1) ~/ 2;

281	446

(...skipping 46 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
328 }	493 }

329	494

330 int decode() {	495 int decode() {

331 utf16EncodedBytesIterator.moveNext();	496 utf16EncodedBytesIterator.moveNext();

332 int lo = utf16EncodedBytesIterator.current;	497 int lo = utf16EncodedBytesIterator.current;

333 utf16EncodedBytesIterator.moveNext();	498 utf16EncodedBytesIterator.moveNext();

334 int hi = utf16EncodedBytesIterator.current;	499 int hi = utf16EncodedBytesIterator.current;

335 return (hi << 8) + lo;	500 return (hi << 8) + lo;

336 }	501 }

337 }	502 }

OLD	NEW

« no previous file with comments | « pkg/utf/lib/utf.dart ('k') | pkg/utf/lib/utf32.dart » ('j') | pkg/utf/test/utf16_test.dart » ('J')