Chromium Code Reviews| Index: runtime/lib/string_base.dart |
| diff --git a/runtime/lib/string_base.dart b/runtime/lib/string_base.dart |
| index 22e17b9662c0098a0a15de1770e430b36c459db3..1f901e7e2c759de6628c1f1126d9bede84649fa5 100644 |
| --- a/runtime/lib/string_base.dart |
| +++ b/runtime/lib/string_base.dart |
| @@ -15,30 +15,26 @@ class _StringBase { |
| int get hashCode native "String_getHashCode"; |
| - /** |
| - * Create the most efficient string representation for specified |
| - * [codePoints]. |
| - */ |
| - static String createFromCharCodes(List<int> charCodes) { |
| + static String createFromUtf16(List<int> codeUnits) { |
|
siva
2012/11/16 22:32:04
The comment
"Create the .... for specified UTF-16
erikcorry
2012/11/19 12:40:41
Done.
|
| _ObjectArray objectArray; |
| - if (charCodes is _ObjectArray) { |
| - objectArray = charCodes; |
| + if (codeUnits is _ObjectArray) { |
| + objectArray = codeUnits; |
| } else { |
| - int len = charCodes.length; |
| + int len = codeUnits.length; |
| objectArray = new _ObjectArray(len); |
| for (int i = 0; i < len; i++) { |
| - objectArray[i] = charCodes[i]; |
| + objectArray[i] = codeUnits[i]; |
| } |
| } |
| - return _createFromCodePoints(objectArray); |
| + return _createFromUtf16(objectArray); |
| } |
| - static String _createFromCodePoints(List<int> codePoints) |
| - native "StringBase_createFromCodePoints"; |
| + static String _createFromUtf16(List<int> codeUnits) |
| + native "StringBase_createFromUtf16"; |
| String operator [](int index) native "String_charAt"; |
| - int charCodeAt(int index) native "String_charCodeAt"; |
| + int codeUnitAt(int index) native "String_codeUnitAt"; |
| int get length native "String_getLength"; |
| @@ -69,12 +65,12 @@ class _StringBase { |
| int otherLength = other.length; |
| int len = (thisLength < otherLength) ? thisLength : otherLength; |
| for (int i = 0; i < len; i++) { |
| - int thisCodePoint = this.charCodeAt(i); |
| - int otherCodePoint = other.charCodeAt(i); |
| - if (thisCodePoint < otherCodePoint) { |
| + int thisCodeUnit = this.codeUnitAt(i); |
| + int otherCodeUnit = other.codeUnitAt(i); |
| + if (thisCodeUnit < otherCodeUnit) { |
| return -1; |
| } |
| - if (thisCodePoint > otherCodePoint) { |
| + if (thisCodeUnit > otherCodeUnit) { |
| return 1; |
| } |
| } |
| @@ -93,7 +89,7 @@ class _StringBase { |
| return false; |
| } |
| for (int i = 0; i < len; i++) { |
| - if (this.charCodeAt(i + start) != other.charCodeAt(i)) { |
| + if (this.codeUnitAt(i + start) != other.codeUnitAt(i)) { |
| return false; |
| } |
| } |
| @@ -162,7 +158,9 @@ class _StringBase { |
| final int len = this.length; |
| int first = 0; |
| for (; first < len; first++) { |
| - if (!_isWhitespace(this.charCodeAt(first))) { |
| + // There are no whitespace characters that are outside the BMP so we |
| + // can use code units here for efficiency. |
| + if (!_isWhitespace(this.codeUnitAt(first))) { |
| break; |
| } |
| } |
| @@ -172,7 +170,7 @@ class _StringBase { |
| } |
| int last = len - 1; |
| for (; last >= first; last--) { |
| - if (!_isWhitespace(this.charCodeAt(last))) { |
| + if (!_isWhitespace(this.codeUnitAt(last))) { |
| break; |
| } |
| } |
| @@ -293,20 +291,35 @@ class _StringBase { |
| return result; |
| } |
| + // TODO(erikcorry): Fix this to use the new code point iterator when it is |
| + // available. |
| List<String> splitChars() { |
| int len = this.length; |
| final result = new List<String>(len); |
| - for (int i = 0; i < len; i++) { |
| - result[i] = this[i]; |
| + int i, j; |
| + for (i = j = 0; i < len; i++, j++) { |
| + int c = charCodeAt(i); |
| + // Check for non-basic plane character encoded as a UTF-16 surrogate pair. |
| + if (c >= String.SMP_CODE_POINT_BASE) { |
| + i++; |
| + } |
| + result[j] = new String.fromCharCodes([c]); |
| } |
| - return result; |
| + if (i == j) return result; |
| + // If we saw some non-basic plane characters, then we have to return a |
| + // slightly smaller array than expected (we can't trim the original one |
| + // because it is non-extendable). This rarely happens so this is preferable |
| + // to having a separate pass over the string to count the code points. |
| + final newResult = new List<String>(j); |
| + for (i = 0; i < j; i++) newResult[i] = result[i]; |
| + return newResult; |
|
siva
2012/11/16 22:32:04
This piece of code is repeated 3 times in this CL,
erikcorry
2012/11/19 12:40:41
One copy removed, last three lines changed to a ge
|
| } |
| - List<int> get charCodes { |
| + List<int> get codeUnits { |
| int len = this.length; |
| final result = new List<int>(len); |
| for (int i = 0; i < len; i++) { |
| - result[i] = this.charCodeAt(i); |
| + result[i] = this.codeUnitAt(i); |
| } |
| return result; |
| } |
| @@ -360,36 +373,94 @@ class _OneByteString extends _StringBase implements String { |
| "_OneByteString can only be allocated by the VM"); |
| } |
| - // Checks for one-byte whitespaces only. |
| - // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid |
| - // whitespaces for one byte strings. |
| bool _isWhitespace(int codePoint) { |
| return |
| - (codePoint == 32) || // Space. |
| + (codePoint == 32) || // Space. |
| + (codePoint == 0xa0) || // No-break space. |
| ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. |
| } |
| + int charCodeAt(int index) => codeUnitAt(index); |
| + |
| + List<int> get charCodes => codeUnits; |
| } |
| -class _TwoByteString extends _StringBase implements String { |
| - factory _TwoByteString._uninstantiable() { |
| +class _TwoByteStringBase extends _StringBase { |
|
siva
2012/11/16 22:32:04
Maybe add a TODO here to get rid of this class and
erikcorry
2012/11/19 12:40:41
I don't see why we would want to do that.
|
| + factory _TwoByteStringBase._uninstantiable() { |
| throw new UnsupportedError( |
| - "_TwoByteString can only be allocated by the VM"); |
| + "_TwoByteStringBase can't be instaniated"); |
| } |
| - // Checks for one-byte whitespaces only. |
| - // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid |
| - // whitespaces. Add checking for multi-byte whitespace codepoints. |
| + // Works for both code points and code units since all spaces are in the BMP. |
| bool _isWhitespace(int codePoint) { |
| return |
| - (codePoint == 32) || // Space. |
| - ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. |
| + (codePoint == 32) || // Space. |
| + (codePoint == 0xa0) || // No-break space. |
| + ((9 <= codePoint) && (codePoint <= 13)) || // CR, LF, TAB, etc. |
| + (codePoint >= 0x1680 && // Optimization. |
| + (codePoint == 0x1680 || // Ogham space mark. |
| + codePoint == 0x180e || // Mongolian vowel separator. |
| + (codePoint >= 0x2000 && codePoint <= 0x200a) || // Wide/narrow spaces. |
| + codePoint == 0x2028 || // Line separator. |
| + codePoint == 0x2029 || // Paragraph separator. |
| + codePoint == 0x202f || // Narrow no-break space. |
| + codePoint == 0x205f || // Medium mathematical space. |
| + codePoint == 0x3000 || // Ideographic space. |
| + codePoint == 0xfeff)); // BOM code. |
| + } |
| + |
| + int charCodeAt(int index) { |
| + const int LEAD_SURROGATE_BASE = 0xd800; |
| + const int LEAD_SURROGATE_END = 0xdbff; |
| + const int TRAIL_SURROGATE_BASE = 0xdc00; |
| + const int TRAIL_SURROGATE_END = 0xdfff; |
| + const int MASK = 0x3ff; |
| + int code = codeUnitAt(index); |
| + if (code < LEAD_SURROGATE_BASE || code > LEAD_SURROGATE_END) return code; |
| + if (index + 1 >= length) return code; |
| + int trail = codeUnitAt(index + 1); |
| + if (trail < TRAIL_SURROGATE_BASE || trail > TRAIL_SURROGATE_END) { |
| + return code; |
| + } |
| + return String.SMP_CODE_POINT_BASE + ((code & MASK) << 10) + (trail & MASK); |
| + } |
| + |
| + List<int> get charCodes { |
|
siva
2012/11/16 22:32:04
TODO, fix this to use the new code point iterator
erikcorry
2012/11/19 12:40:41
Done.
|
| + int len = this.length; |
| + final result = new List<int>(len); |
| + int i, j; |
| + for (i = j = 0; i < len; i++, j++) { |
| + int c = this.charCodeAt(i); |
| + // Check for supplementary plane character encoded as a UTF-16 surrogate |
| + // pair. |
| + if (c >= String.SMP_CODE_POINT_BASE) { |
| + i++; |
| + } |
| + result[j] = c; |
| + } |
| + if (i == j) return result; |
|
siva
2012/11/16 22:32:04
I find the (i == j) condition here a little unread
erikcorry
2012/11/19 12:40:41
Done.
|
| + // If we saw some non-basic plane characters, then we have to return a |
| + // slightly smaller array than expected (we can't trim the original one |
| + // because it is non-extendable). This rarely happens so this is preferable |
| + // to having a separate pass over the string to count the code points. |
| + final newResult = new List<int>(j); |
| + for (i = 0; i < j; i++) newResult[i] = result[i]; |
| + return newResult; |
| } |
|
siva
2012/11/16 22:32:04
This code here and in splitChars above seem to be
erikcorry
2012/11/19 12:40:41
Last three lines fixed to use getRange, but splitC
|
| } |
| -class _FourByteString extends _StringBase implements String { |
| +class _TwoByteString extends _TwoByteStringBase implements String { |
| + factory _TwoByteString._uninstantiable() { |
| + throw new UnsupportedError( |
| + "_TwoByteString can only be allocated by the VM"); |
| + } |
| +} |
| + |
| + |
| +// TODO(erikcorry): This is going away. |
|
siva
2012/11/16 22:32:04
(why not remove it in this CL itself)?
erikcorry
2012/11/19 12:40:41
Done.
|
| +class _FourByteString extends _StringBase { |
| factory _FourByteString._uninstantiable() { |
| throw new UnsupportedError( |
| "_FourByteString can only be allocated by the VM"); |
| @@ -412,35 +483,29 @@ class _ExternalOneByteString extends _StringBase implements String { |
| "_ExternalOneByteString can only be allocated by the VM"); |
| } |
| - // Checks for one-byte whitespaces only. |
| - // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid |
| - // whitespaces for one byte strings. |
| bool _isWhitespace(int codePoint) { |
| return |
| - (codePoint == 32) || // Space. |
| + (codePoint == 32) || // Space. |
| + (codePoint == 0xa0) || // No-break space. |
| ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. |
| } |
| + |
| + int charCodeAt(int index) => codeUnitAt(index); |
| + |
| + List<int> get charCodes => codeUnits; |
| } |
| -class _ExternalTwoByteString extends _StringBase implements String { |
| +class _ExternalTwoByteString extends _TwoByteStringBase implements String { |
| factory _ExternalTwoByteString._uninstantiable() { |
| throw new UnsupportedError( |
| "_ExternalTwoByteString can only be allocated by the VM"); |
| } |
| - |
| - // Checks for one-byte whitespaces only. |
| - // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid |
| - // whitespaces. Add checking for multi-byte whitespace codepoints. |
| - bool _isWhitespace(int codePoint) { |
| - return |
| - (codePoint == 32) || // Space. |
| - ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. |
| - } |
| } |
| -class _ExternalFourByteString extends _StringBase implements String { |
| +// TODO(erikcorry): This is going away. |
|
siva
2012/11/16 22:32:04
Ditto comment.
erikcorry
2012/11/19 12:40:41
Done.
|
| +class _ExternalFourByteString extends _StringBase { |
| factory _ExternalFourByteString._uninstantiable() { |
| throw new UnsupportedError( |
| "ExternalFourByteString can only be allocated by the VM"); |