Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(251)

Unified Diff: sdk/lib/core/string.dart

Issue 12094056: Runes, a bi-directional code-point iterator/iterable. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Hooked up to String and added tests. Created 7 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « sdk/lib/core/iterable.dart ('k') | tests/corelib/string_runes_test.dart » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: sdk/lib/core/string.dart
diff --git a/sdk/lib/core/string.dart b/sdk/lib/core/string.dart
index 6fb1b9035abe13dced4a3fe0c69424c88e589242..c15e3063622b3900dbe6c4235bdf2086a3311973 100644
--- a/sdk/lib/core/string.dart
+++ b/sdk/lib/core/string.dart
@@ -280,8 +280,7 @@ abstract class String implements Comparable, Pattern {
* as one integer by this iterator. Unmatched surrogate halves are treated
* like valid 16-bit code-units.
*/
- // TODO(floitsch): make it a Runes class.
- Iterable<int> get runes;
+ Runes get runes;
/**
* If this string is not already all lower case, returns a new string
@@ -297,3 +296,208 @@ abstract class String implements Comparable, Pattern {
// TODO(floitsch): document better. (See EcmaScript for description).
String toUpperCase();
}
+
+/**
+ * The runes of a [String].
+ */
+class Runes extends Iterable<int> {
+ final String string;
+ Runes(this.string);
+
+ RuneIterator get iterator => new RuneIterator(string);
+
+ int get first {
floitsch 2013/02/12 14:23:10 You think it's worth specializing "first" ?
Lasse Reichstein Nielsen 2013/02/12 15:13:23 Probably not. Let's drop it.
+ if (string.length == 0) {
+ throw new StateError("No elements.");
+ }
+ int code = string.charCodeAt(0);
+ if (_isLeadSurrogate(code) && string.length > 1) {
+ int nextCode = string.charCodeAt(1);
+ if (_isTrailSurrogate(nextCode)) {
+ return _combineSurrogatePair(code, nextCode);
+ }
+ }
+ return code;
+ }
+
+ int get last {
+ if (string.length == 0) {
+ throw new StateError("No elements.");
+ }
+ int length = string.length;
+ int code = string.charCodeAt(length - 1);
+ if (_isTrailSurrogate(code) && string.length > 1) {
+ int previousCode = string.charCodeAt(length - 2);
+ if (_isLeadSurrogate(previousCode)) {
+ return _combineSurrogatePair(previousCode, code);
+ }
+ }
+ return code;
+ }
+
+}
+
+// Is the code (a 16-bit unsigned integer) a UTF-16 lead surrogate?
+bool _isLeadSurrogate(int code) => (code & 0xFC00) == 0xD800;
+
+// Is the code (a 16-bit unsigned integer) a UTF-16 trail surrogate?
+bool _isTrailSurrogate(int code) => (code & 0xFC00) == 0xDC00;
+
+// Combine a lead and a trail surrogate value into a single code point.
+int _combineSurrogatePair(int start, int end) {
+ return 0x10000 + ((start & 0x3FF) << 10) + (end & 0x3FF);
+}
+
+/** [Iterator] for reading Unicode code points out of a Dart string. */
+class RuneIterator implements BiDirectionalIterator<int> {
+ /** String being iterated. */
+ final String string;
+ /** Position before the current code point. */
+ int _position;
+ /** Position after the current code point. */
+ int _nextPosition;
+ /**
+ * Current code point.
+ *
+ * If the iterator has hit either end, the [_currentCodePoint] is null
+ * and [: _position == _nextPosition :].
+ */
+ int _currentCodePoint;
+
+ /** Create an iterator positioned at the beginning of the string. */
+ RuneIterator(String string)
+ : string = string, _position = 0, _nextPosition = 0;
floitsch 2013/02/12 14:23:10 not necessary, but reads better with "this.string
Lasse Reichstein Nielsen 2013/02/12 15:13:23 Done.
+
+ /**
+ * Create an iterator positioned before the [index]th code unit of the string.
+ *
+ * A [moveNext] will make the following code point the current value, and a
floitsch 2013/02/12 14:23:10 don't use "code point", but "rune". Not perfect ei
Lasse Reichstein Nielsen 2013/02/12 15:13:23 Reworded.
+ * [movePrevious] will make the preceding code pount the current value.
floitsch 2013/02/12 14:23:10 point.
Lasse Reichstein Nielsen 2013/02/12 15:13:23 Done.
+ *
+ * It is an error if the [index] position is in the middle of a surrogate
+ * pair.
+ */
+ RuneIterator.at(String string, int index)
+ : string = string, _position = index, _nextPosition = index {
+ if (index < 0 || index > string.length) {
+ throw new RangeError.range(index, 0, string.length);
+ }
+ _checkSplitSurrogate(index);
+ }
+
+ /** Throw an error if the index is in the middle of a surrogate pair. */
+ void _checkSplitSurrogate(int index) {
+ if (index > 0 && index < string.length &&
+ _isLeadSurrogate(string.charCodeAt(index - 1)) &&
+ _isTrailSurrogate(string.charCodeAt(index))) {
+ throw new ArgumentError("Index inside surrogate pair: $index");
+ }
+ }
+
+ /**
+ * Returns the starting position of the current rune in the string.
+ *
+ * If the current rune is null, this is the index of the rune that
floitsch 2013/02/12 14:23:10 I would prefer if rawIndex was null or -1 then.
Lasse Reichstein Nielsen 2013/02/12 15:13:23 This way, someone can give you an uninitialized it
Lasse Reichstein Nielsen 2013/02/12 15:20:23 I've changed it to return null anyway, if there is
+ * will become current after a call to [moveNext].
+ */
+ int get rawIndex => _position;
+
+ /**
+ * Resets the iterator to the rune at the specified index of the string.
+ *
+ * Setting a negative [rawIndex], or one greater than [:string.length:],
floitsch 2013/02/12 14:23:10 If we allow "string.length" we should maybe allow
Lasse Reichstein Nielsen 2013/02/12 15:13:23 I'd disallow string.length then. Positions in st
floitsch 2013/02/13 10:12:19 *Nobody* except the implementors see it this way.
+ * is an error. So is setting it in the middle of a surrogate pair.
+ *
+ * Setting the position to the end of the string will set [current] to null.
+ */
+ void set rawIndex(int rawIndex) {
+ reset(rawIndex);
+ moveNext();
+ }
+
+ /**
+ * Resets the iterator to the given index into the string.
+ *
+ * After this the [current] value is unset.
+ * You must call [moveNext] to make the rune at the position current,
+ * or [movePrevious] for the last rune before the position.
+ *
+ * Setting a negative [rawIndex], or one greater than [:string.length:],
floitsch 2013/02/12 14:23:10 ditto. Maybe we should allow "-1".
Lasse Reichstein Nielsen 2013/02/12 15:13:23 For what? reset(0) is a reset to the beginning of
floitsch 2013/02/13 10:12:19 If I reset to (0) and then moveNext I don't expect
Lasse Reichstein Nielsen 2013/02/13 17:20:42 That's not how RuneIterator.reset works. Are you
floitsch 2013/02/13 17:34:50 Ok for original behavior for reset (including allo
+ * is an error. So is setting it in the middle of a surrogate pair.
+ */
+ void reset([int rawIndex = 0]) {
+ if (rawIndex < 0 || rawIndex > string.length) {
+ throw new RangeError.range(rawIndex, 0, string.length);
+ }
+ _checkSplitSurrogate(rawIndex);
+ _position = _nextPosition = rawIndex;
+ _currentCodePoint = null;
+ }
+
+ /** The rune starting at the current position in the string. */
+ int get current => _currentCodePoint;
+
+ /**
+ * The number of code units comprising the current rune.
+ *
+ * Returns zero if the current rune
floitsch 2013/02/12 14:23:10 unfinished sentence.
Lasse Reichstein Nielsen 2013/02/12 15:13:23 Done.
+ */
+ int get currentSize => _nextPosition - _position;
+
+ /**
+ * A string containing the current rune.
+ *
+ * For runes outside the basic multilingual plane, this will be
+ * a two-character String.
+ *
+ * Returns null if [current] is null.
+ */
+ String get currentAsString {
+ if (_position == _nextPosition) return null;
+ if (_position + 1 == _nextPosition) return string[_position];
+ return string.substring(_position, _nextPosition);
+ }
+
+
+ bool moveNext() {
+ _position = _nextPosition;
+ if (_nextPosition == string.length) {
floitsch 2013/02/12 14:23:10 _position. No need for _nextPosition anymore.
Lasse Reichstein Nielsen 2013/02/12 15:13:23 Done.
+ _currentCodePoint = null;
+ return false;
+ }
+ int codeUnit = string.charCodeAt(_position);
+ int nextPosition = _position + 1;
+ if (_isLeadSurrogate(codeUnit) && nextPosition < string.length) {
+ int nextCodeUnit = string.charCodeAt(nextPosition);
+ if (_isTrailSurrogate(nextCodeUnit)) {
+ _nextPosition = nextPosition + 1;
+ _currentCodePoint = _combineSurrogatePair(codeUnit, nextCodeUnit);
+ return true;
+ }
+ }
+ _nextPosition = nextPosition;
+ _currentCodePoint = codeUnit;
+ return true;
+ }
+
+ bool movePrevious() {
+ _nextPosition = _position;
+ if (_position == 0) {
+ _currentCodePoint = null;
+ return false;
+ }
+ int position = _position - 1;
+ int codeUnit = string.charCodeAt(position);
+ if (_isTrailSurrogate(codeUnit) && position > 0) {
+ int prevCodeUnit = string.charCodeAt(position - 1);
+ if (_isLeadSurrogate(prevCodeUnit)) {
+ _position = position - 1;
+ _currentCodePoint = _combineSurrogatePair(prevCodeUnit, codeUnit);
+ return true;
+ }
+ }
+ _position = position;
+ _currentCodePoint = codeUnit;
+ return true;
+ }
+}
« no previous file with comments | « sdk/lib/core/iterable.dart ('k') | tests/corelib/string_runes_test.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698