Index: sdk/lib/core/string.dart |
diff --git a/sdk/lib/core/string.dart b/sdk/lib/core/string.dart |
index 6fb1b9035abe13dced4a3fe0c69424c88e589242..c15e3063622b3900dbe6c4235bdf2086a3311973 100644 |
--- a/sdk/lib/core/string.dart |
+++ b/sdk/lib/core/string.dart |
@@ -280,8 +280,7 @@ abstract class String implements Comparable, Pattern { |
* as one integer by this iterator. Unmatched surrogate halves are treated |
* like valid 16-bit code-units. |
*/ |
- // TODO(floitsch): make it a Runes class. |
- Iterable<int> get runes; |
+ Runes get runes; |
/** |
* If this string is not already all lower case, returns a new string |
@@ -297,3 +296,208 @@ abstract class String implements Comparable, Pattern { |
// TODO(floitsch): document better. (See EcmaScript for description). |
String toUpperCase(); |
} |
+ |
+/** |
+ * The runes of a [String]. |
+ */ |
+class Runes extends Iterable<int> { |
+ final String string; |
+ Runes(this.string); |
+ |
+ RuneIterator get iterator => new RuneIterator(string); |
+ |
+ int get first { |
floitsch
2013/02/12 14:23:10
You think it's worth specializing "first" ?
Lasse Reichstein Nielsen
2013/02/12 15:13:23
Probably not. Let's drop it.
|
+ if (string.length == 0) { |
+ throw new StateError("No elements."); |
+ } |
+ int code = string.charCodeAt(0); |
+ if (_isLeadSurrogate(code) && string.length > 1) { |
+ int nextCode = string.charCodeAt(1); |
+ if (_isTrailSurrogate(nextCode)) { |
+ return _combineSurrogatePair(code, nextCode); |
+ } |
+ } |
+ return code; |
+ } |
+ |
+ int get last { |
+ if (string.length == 0) { |
+ throw new StateError("No elements."); |
+ } |
+ int length = string.length; |
+ int code = string.charCodeAt(length - 1); |
+ if (_isTrailSurrogate(code) && string.length > 1) { |
+ int previousCode = string.charCodeAt(length - 2); |
+ if (_isLeadSurrogate(previousCode)) { |
+ return _combineSurrogatePair(previousCode, code); |
+ } |
+ } |
+ return code; |
+ } |
+ |
+} |
+ |
+// Whether the code (a 16-bit unsigned integer) is a UTF-16 lead surrogate. |
+bool _isLeadSurrogate(int code) => (code & 0xFC00) == 0xD800; |
+ |
+// Whether the code (a 16-bit unsigned integer) is a UTF-16 trail surrogate. |
+bool _isTrailSurrogate(int code) => (code & 0xFC00) == 0xDC00; |
+ |
+// Combine a lead and a trail surrogate value into a single code point. |
+int _combineSurrogatePair(int start, int end) { |
+ return 0x10000 + ((start & 0x3FF) << 10) + (end & 0x3FF); |
+} |
+ |
+/** [Iterator] for reading Unicode code points out of a Dart string. */ |
+class RuneIterator implements BiDirectionalIterator<int> { |
+ /** String being iterated. */ |
+ final String string; |
+ /** Position before the current code point. */ |
+ int _position; |
+ /** Position after the current code point. */ |
+ int _nextPosition; |
+ /** |
+ * Current code point. |
+ * |
+ * If the iterator has hit either end, the [_currentCodePoint] is null |
+ * and [: _position == _nextPosition :]. |
+ */ |
+ int _currentCodePoint; |
+ |
+ /** Create an iterator positioned at the beginning of the string. */ |
+ RuneIterator(String string) |
+ : string = string, _position = 0, _nextPosition = 0; |
floitsch
2013/02/12 14:23:10
not necessary, but reads better with "this.string
Lasse Reichstein Nielsen
2013/02/12 15:13:23
Done.
|
+ |
+ /** |
+ * Create an iterator positioned before the [index]th code unit of the string. |
+ * |
+ * A [moveNext] will make the following code point the current value, and a |
floitsch
2013/02/12 14:23:10
don't use "code point", but "rune".
Not perfect ei
Lasse Reichstein Nielsen
2013/02/12 15:13:23
Reworded.
|
+ * [movePrevious] will make the preceding code point the current value. |
floitsch
2013/02/12 14:23:10
point.
Lasse Reichstein Nielsen
2013/02/12 15:13:23
Done.
|
+ * |
+ * It is an error if the [index] position is in the middle of a surrogate |
+ * pair. |
+ */ |
+ RuneIterator.at(String string, int index) |
+ : string = string, _position = index, _nextPosition = index { |
+ if (index < 0 || index > string.length) { |
+ throw new RangeError.range(index, 0, string.length); |
+ } |
+ _checkSplitSurrogate(index); |
+ } |
+ |
+ /** Throw an error if the index is in the middle of a surrogate pair. */ |
+ void _checkSplitSurrogate(int index) { |
+ if (index > 0 && index < string.length && |
+ _isLeadSurrogate(string.charCodeAt(index - 1)) && |
+ _isTrailSurrogate(string.charCodeAt(index))) { |
+ throw new ArgumentError("Index inside surrogate pair: $index"); |
+ } |
+ } |
+ |
+ /** |
+ * Returns the starting position of the current rune in the string. |
+ * |
+ * If the current rune is null, this is the index of the rune that |
floitsch
2013/02/12 14:23:10
I would prefer if rawIndex was null or -1 then.
Lasse Reichstein Nielsen
2013/02/12 15:13:23
This way, someone can give you an uninitialized it
Lasse Reichstein Nielsen
2013/02/12 15:20:23
I've changed it to return null anyway, if there is
|
+ * will become current after a call to [moveNext]. |
+ */ |
+ int get rawIndex => _position; |
+ |
+ /** |
+ * Resets the iterator to the rune at the specified index of the string. |
+ * |
+ * Setting a negative [rawIndex], or one greater than [:string.length:], |
floitsch
2013/02/12 14:23:10
If we allow "string.length" we should maybe allow
Lasse Reichstein Nielsen
2013/02/12 15:13:23
I'd disallow string.length then.
Positions in st
floitsch
2013/02/13 10:12:19
*Nobody* except the implementors see it this way.
|
+ * is an error. So is setting it in the middle of a surrogate pair. |
+ * |
+ * Setting the position to the end of the string will set [current] to null. |
+ */ |
+ void set rawIndex(int rawIndex) { |
+ reset(rawIndex); |
+ moveNext(); |
+ } |
+ |
+ /** |
+ * Resets the iterator to the given index into the string. |
+ * |
+ * After this the [current] value is unset. |
+ * You must call [moveNext] to make the rune at the position current, |
+ * or [movePrevious] for the last rune before the position. |
+ * |
+ * Setting a negative [rawIndex], or one greater than [:string.length:], |
floitsch
2013/02/12 14:23:10
ditto. Maybe we should allow "-1".
Lasse Reichstein Nielsen
2013/02/12 15:13:23
For what?
reset(0) is a reset to the beginning of
floitsch
2013/02/13 10:12:19
If I reset to (0) and then moveNext I don't expect
Lasse Reichstein Nielsen
2013/02/13 17:20:42
That's not how RuneIterator.reset works.
Are you
floitsch
2013/02/13 17:34:50
Ok for original behavior for reset (including allo
|
+ * is an error. So is setting it in the middle of a surrogate pair. |
+ */ |
+ void reset([int rawIndex = 0]) { |
+ if (rawIndex < 0 || rawIndex > string.length) { |
+ throw new RangeError.range(rawIndex, 0, string.length); |
+ } |
+ _checkSplitSurrogate(rawIndex); |
+ _position = _nextPosition = rawIndex; |
+ _currentCodePoint = null; |
+ } |
+ |
+ /** The rune starting at the current position in the string. */ |
+ int get current => _currentCodePoint; |
+ |
+ /** |
+ * The number of code units comprising the current rune. |
+ * |
+ * Returns zero if there is no current rune ([current] is null). |
floitsch
2013/02/12 14:23:10
unfinished sentence.
Lasse Reichstein Nielsen
2013/02/12 15:13:23
Done.
|
+ */ |
+ int get currentSize => _nextPosition - _position; |
+ |
+ /** |
+ * A string containing the current rune. |
+ * |
+ * For runes outside the basic multilingual plane, this will be |
+ * a two-character String. |
+ * |
+ * Returns null if [current] is null. |
+ */ |
+ String get currentAsString { |
+ if (_position == _nextPosition) return null; |
+ if (_position + 1 == _nextPosition) return string[_position]; |
+ return string.substring(_position, _nextPosition); |
+ } |
+ |
+ |
+ bool moveNext() { |
+ _position = _nextPosition; |
+ if (_nextPosition == string.length) { |
floitsch
2013/02/12 14:23:10
_position. No need for _nextPosition anymore.
Lasse Reichstein Nielsen
2013/02/12 15:13:23
Done.
|
+ _currentCodePoint = null; |
+ return false; |
+ } |
+ int codeUnit = string.charCodeAt(_position); |
+ int nextPosition = _position + 1; |
+ if (_isLeadSurrogate(codeUnit) && nextPosition < string.length) { |
+ int nextCodeUnit = string.charCodeAt(nextPosition); |
+ if (_isTrailSurrogate(nextCodeUnit)) { |
+ _nextPosition = nextPosition + 1; |
+ _currentCodePoint = _combineSurrogatePair(codeUnit, nextCodeUnit); |
+ return true; |
+ } |
+ } |
+ _nextPosition = nextPosition; |
+ _currentCodePoint = codeUnit; |
+ return true; |
+ } |
+ |
+ bool movePrevious() { |
+ _nextPosition = _position; |
+ if (_position == 0) { |
+ _currentCodePoint = null; |
+ return false; |
+ } |
+ int position = _position - 1; |
+ int codeUnit = string.charCodeAt(position); |
+ if (_isTrailSurrogate(codeUnit) && position > 0) { |
+ int prevCodeUnit = string.charCodeAt(position - 1); |
+ if (_isLeadSurrogate(prevCodeUnit)) { |
+ _position = position - 1; |
+ _currentCodePoint = _combineSurrogatePair(prevCodeUnit, codeUnit); |
+ return true; |
+ } |
+ } |
+ _position = position; |
+ _currentCodePoint = codeUnit; |
+ return true; |
+ } |
+} |