Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(419)

Side by Side Diff: sdk/lib/core/string.dart

Issue 12094056: Runes, a bi-directional code-point iterator/iterable. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Hooked up to String and added tests. Created 7 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « sdk/lib/core/iterable.dart ('k') | tests/corelib/string_runes_test.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of dart.core; 5 part of dart.core;
6 6
7 /** 7 /**
8 * The String class represents sequences of characters. Strings are 8 * The String class represents sequences of characters. Strings are
9 * immutable. A string is represented by a sequence of Unicode UTF-16 9 * immutable. A string is represented by a sequence of Unicode UTF-16
10 * code units accessible through the [codeUnitAt] or the 10 * code units accessible through the [codeUnitAt] or the
(...skipping 262 matching lines...) Expand 10 before | Expand all | Expand 10 after
273 // TODO(floitsch): make it a bidirectional iterator. 273 // TODO(floitsch): make it a bidirectional iterator.
274 Iterable<int> get codeUnits; 274 Iterable<int> get codeUnits;
275 275
276 /** 276 /**
277 * Returns an iterable of Unicode code-points of this string. 277 * Returns an iterable of Unicode code-points of this string.
278 * 278 *
279 * If the string contains surrogate pairs, they will be combined and returned 279 * If the string contains surrogate pairs, they will be combined and returned
280 * as one integer by this iterator. Unmatched surrogate halves are treated 280 * as one integer by this iterator. Unmatched surrogate halves are treated
281 * like valid 16-bit code-units. 281 * like valid 16-bit code-units.
282 */ 282 */
283 // TODO(floitsch): make it a Runes class. 283 Runes get runes;
284 Iterable<int> get runes;
285 284
286 /** 285 /**
287 * If this string is not already all lower case, returns a new string 286 * If this string is not already all lower case, returns a new string
288 * where all characters are made lower case. Returns [:this:] otherwise. 287 * where all characters are made lower case. Returns [:this:] otherwise.
289 */ 288 */
290 // TODO(floitsch): document better. (See EcmaScript for description). 289 // TODO(floitsch): document better. (See EcmaScript for description).
291 String toLowerCase(); 290 String toLowerCase();
292 291
293 /** 292 /**
294 * If this string is not already all upper case, returns a new string 293 * If this string is not already all upper case, returns a new string
295 * where all characters are made upper case. Returns [:this:] otherwise. 294 * where all characters are made upper case. Returns [:this:] otherwise.
296 */ 295 */
297 // TODO(floitsch): document better. (See EcmaScript for description). 296 // TODO(floitsch): document better. (See EcmaScript for description).
298 String toUpperCase(); 297 String toUpperCase();
299 } 298 }
299
300 /**
301 * The runes of a [String].
302 */
303 class Runes extends Iterable<int> {
304 final String string;
305 Runes(this.string);
306
307 RuneIterator get iterator => new RuneIterator(string);
308
309 int get first {
floitsch 2013/02/12 14:23:10 You think it's worth specializing "first" ?
Lasse Reichstein Nielsen 2013/02/12 15:13:23 Probably not. Let's drop it.
310 if (string.length == 0) {
311 throw new StateError("No elements.");
312 }
313 int code = string.charCodeAt(0);
314 if (_isLeadSurrogate(code) && string.length > 1) {
315 int nextCode = string.charCodeAt(1);
316 if (_isTrailSurrogate(nextCode)) {
317 return _combineSurrogatePair(code, nextCode);
318 }
319 }
320 return code;
321 }
322
323 int get last {
324 if (string.length == 0) {
325 throw new StateError("No elements.");
326 }
327 int length = string.length;
328 int code = string.charCodeAt(length - 1);
329 if (_isTrailSurrogate(code) && string.length > 1) {
330 int previousCode = string.charCodeAt(length - 2);
331 if (_isLeadSurrogate(previousCode)) {
332 return _combineSurrogatePair(previousCode, code);
333 }
334 }
335 return code;
336 }
337
338 }
339
340 // Is the code (a 16-bit unsigned integer) a UTF-16 lead surrogate?
341 bool _isLeadSurrogate(int code) => (code & 0xFC00) == 0xD800;
342
343 // Is the code (a 16-bit unsigned integer) a UTF-16 trail surrogate?
344 bool _isTrailSurrogate(int code) => (code & 0xFC00) == 0xDC00;
345
346 // Combine a lead and a trail surrogate value into a single code point.
347 int _combineSurrogatePair(int start, int end) {
348 return 0x10000 + ((start & 0x3FF) << 10) + (end & 0x3FF);
349 }
350
351 /** [Iterator] for reading Unicode code points out of a Dart string. */
352 class RuneIterator implements BiDirectionalIterator<int> {
353 /** String being iterated. */
354 final String string;
355 /** Position before the current code point. */
356 int _position;
357 /** Position after the current code point. */
358 int _nextPosition;
359 /**
360 * Current code point.
361 *
362 * If the iterator has hit either end, the [_currentCodePoint] is null
363 * and [: _position == _nextPosition :].
364 */
365 int _currentCodePoint;
366
367 /** Create an iterator positioned at the beginning of the string. */
368 RuneIterator(String string)
369 : string = string, _position = 0, _nextPosition = 0;
floitsch 2013/02/12 14:23:10 not necessary, but reads better with "this.string
Lasse Reichstein Nielsen 2013/02/12 15:13:23 Done.
370
371 /**
372 * Create an iterator positioned before the [index]th code unit of the string.
373 *
374 * A [moveNext] will make the following code point the current value, and a
floitsch 2013/02/12 14:23:10 don't use "code point", but "rune". Not perfect ei
Lasse Reichstein Nielsen 2013/02/12 15:13:23 Reworded.
375 * [movePrevious] will make the preceding code point the current value.
floitsch 2013/02/12 14:23:10 point.
Lasse Reichstein Nielsen 2013/02/12 15:13:23 Done.
376 *
377 * It is an error if the [index] position is in the middle of a surrogate
378 * pair.
379 */
380 RuneIterator.at(String string, int index)
381 : string = string, _position = index, _nextPosition = index {
382 if (index < 0 || index > string.length) {
383 throw new RangeError.range(index, 0, string.length);
384 }
385 _checkSplitSurrogate(index);
386 }
387
388 /** Throw an error if the index is in the middle of a surrogate pair. */
389 void _checkSplitSurrogate(int index) {
390 if (index > 0 && index < string.length &&
391 _isLeadSurrogate(string.charCodeAt(index - 1)) &&
392 _isTrailSurrogate(string.charCodeAt(index))) {
393 throw new ArgumentError("Index inside surrogate pair: $index");
394 }
395 }
396
397 /**
398 * Returns the starting position of the current rune in the string.
399 *
400 * If the current rune is null, this is the index of the rune that
floitsch 2013/02/12 14:23:10 I would prefer if rawIndex was null or -1 then.
Lasse Reichstein Nielsen 2013/02/12 15:13:23 This way, someone can give you an uninitialized it
Lasse Reichstein Nielsen 2013/02/12 15:20:23 I've changed it to return null anyway, if there is
401 * will become current after a call to [moveNext].
402 */
403 int get rawIndex => _position;
404
405 /**
406 * Resets the iterator to the rune at the specified index of the string.
407 *
408 * Setting a negative [rawIndex], or one greater than [:string.length:],
floitsch 2013/02/12 14:23:10 If we allow "string.length" we should maybe allow
Lasse Reichstein Nielsen 2013/02/12 15:13:23 I'd disallow string.length then. Positions in st
floitsch 2013/02/13 10:12:19 *Nobody* except the implementors see it this way.
409 * is an error. So is setting it in the middle of a surrogate pair.
410 *
411 * Setting the position to the end of the string will set [current] to null.
412 */
413 void set rawIndex(int rawIndex) {
414 reset(rawIndex);
415 moveNext();
416 }
417
418 /**
419 * Resets the iterator to the given index into the string.
420 *
421 * After this the [current] value is unset.
422 * You must call [moveNext] to make the rune at the position current,
423 * or [movePrevious] for the last rune before the position.
424 *
425 * Setting a negative [rawIndex], or one greater than [:string.length:],
floitsch 2013/02/12 14:23:10 ditto. Maybe we should allow "-1".
Lasse Reichstein Nielsen 2013/02/12 15:13:23 For what? reset(0) is a reset to the beginning of
floitsch 2013/02/13 10:12:19 If I reset to (0) and then moveNext I don't expect
Lasse Reichstein Nielsen 2013/02/13 17:20:42 That's not how RuneIterator.reset works. Are you
floitsch 2013/02/13 17:34:50 Ok for original behavior for reset (including allo
426 * is an error. So is setting it in the middle of a surrogate pair.
427 */
428 void reset([int rawIndex = 0]) {
429 if (rawIndex < 0 || rawIndex > string.length) {
430 throw new RangeError.range(rawIndex, 0, string.length);
431 }
432 _checkSplitSurrogate(rawIndex);
433 _position = _nextPosition = rawIndex;
434 _currentCodePoint = null;
435 }
436
437 /** The rune starting at the current position in the string. */
438 int get current => _currentCodePoint;
439
440 /**
441 * The number of code units comprising the current rune.
442 *
443 * Returns zero if there is no current rune (that is, if [current] is null).
floitsch 2013/02/12 14:23:10 unfinished sentence.
Lasse Reichstein Nielsen 2013/02/12 15:13:23 Done.
444 */
445 int get currentSize => _nextPosition - _position;
446
447 /**
448 * A string containing the current rune.
449 *
450 * For runes outside the basic multilingual plane, this will be
451 * a two-character String.
452 *
453 * Returns null if [current] is null.
454 */
455 String get currentAsString {
456 if (_position == _nextPosition) return null;
457 if (_position + 1 == _nextPosition) return string[_position];
458 return string.substring(_position, _nextPosition);
459 }
460
461
462 bool moveNext() {
463 _position = _nextPosition;
464 if (_nextPosition == string.length) {
floitsch 2013/02/12 14:23:10 _position. No need for _nextPosition anymore.
Lasse Reichstein Nielsen 2013/02/12 15:13:23 Done.
465 _currentCodePoint = null;
466 return false;
467 }
468 int codeUnit = string.charCodeAt(_position);
469 int nextPosition = _position + 1;
470 if (_isLeadSurrogate(codeUnit) && nextPosition < string.length) {
471 int nextCodeUnit = string.charCodeAt(nextPosition);
472 if (_isTrailSurrogate(nextCodeUnit)) {
473 _nextPosition = nextPosition + 1;
474 _currentCodePoint = _combineSurrogatePair(codeUnit, nextCodeUnit);
475 return true;
476 }
477 }
478 _nextPosition = nextPosition;
479 _currentCodePoint = codeUnit;
480 return true;
481 }
482
483 bool movePrevious() {
484 _nextPosition = _position;
485 if (_position == 0) {
486 _currentCodePoint = null;
487 return false;
488 }
489 int position = _position - 1;
490 int codeUnit = string.charCodeAt(position);
491 if (_isTrailSurrogate(codeUnit) && position > 0) {
492 int prevCodeUnit = string.charCodeAt(position - 1);
493 if (_isLeadSurrogate(prevCodeUnit)) {
494 _position = position - 1;
495 _currentCodePoint = _combineSurrogatePair(prevCodeUnit, codeUnit);
496 return true;
497 }
498 }
499 _position = position;
500 _currentCodePoint = codeUnit;
501 return true;
502 }
503 }
OLDNEW
« no previous file with comments | « sdk/lib/core/iterable.dart ('k') | tests/corelib/string_runes_test.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698