| Index: runtime/lib/string_base.dart
|
| diff --git a/runtime/lib/string_base.dart b/runtime/lib/string_base.dart
|
| index 22e17b9662c0098a0a15de1770e430b36c459db3..3513f8545f2a3d5a2ccccae98de8a9c26176954c 100644
|
| --- a/runtime/lib/string_base.dart
|
| +++ b/runtime/lib/string_base.dart
|
| @@ -16,29 +16,29 @@ class _StringBase {
|
| int get hashCode native "String_getHashCode";
|
|
|
| /**
|
| - * Create the most efficient string representation for specified
|
| - * [codePoints].
|
| + * Create the most efficient string representation for the specified UTF-16
|
| + * [codeUnits].
|
| */
|
| - static String createFromCharCodes(List<int> charCodes) {
|
| + static String createFromUtf16(List<int> codeUnits) {
|
| _ObjectArray objectArray;
|
| - if (charCodes is _ObjectArray) {
|
| - objectArray = charCodes;
|
| + if (codeUnits is _ObjectArray) {
|
| + objectArray = codeUnits;
|
| } else {
|
| - int len = charCodes.length;
|
| + int len = codeUnits.length;
|
| objectArray = new _ObjectArray(len);
|
| for (int i = 0; i < len; i++) {
|
| - objectArray[i] = charCodes[i];
|
| + objectArray[i] = codeUnits[i];
|
| }
|
| }
|
| - return _createFromCodePoints(objectArray);
|
| + return _createFromUtf16(objectArray);
|
| }
|
|
|
| - static String _createFromCodePoints(List<int> codePoints)
|
| - native "StringBase_createFromCodePoints";
|
| + static String _createFromUtf16(List<int> codeUnits)
|
| + native "StringBase_createFromUtf16";
|
|
|
| String operator [](int index) native "String_charAt";
|
|
|
| - int charCodeAt(int index) native "String_charCodeAt";
|
| + int codeUnitAt(int index) native "String_codeUnitAt";
|
|
|
| int get length native "String_getLength";
|
|
|
| @@ -69,12 +69,12 @@ class _StringBase {
|
| int otherLength = other.length;
|
| int len = (thisLength < otherLength) ? thisLength : otherLength;
|
| for (int i = 0; i < len; i++) {
|
| - int thisCodePoint = this.charCodeAt(i);
|
| - int otherCodePoint = other.charCodeAt(i);
|
| - if (thisCodePoint < otherCodePoint) {
|
| + int thisCodeUnit = this.codeUnitAt(i);
|
| + int otherCodeUnit = other.codeUnitAt(i);
|
| + if (thisCodeUnit < otherCodeUnit) {
|
| return -1;
|
| }
|
| - if (thisCodePoint > otherCodePoint) {
|
| + if (thisCodeUnit > otherCodeUnit) {
|
| return 1;
|
| }
|
| }
|
| @@ -93,7 +93,7 @@ class _StringBase {
|
| return false;
|
| }
|
| for (int i = 0; i < len; i++) {
|
| - if (this.charCodeAt(i + start) != other.charCodeAt(i)) {
|
| + if (this.codeUnitAt(i + start) != other.codeUnitAt(i)) {
|
| return false;
|
| }
|
| }
|
| @@ -162,7 +162,9 @@ class _StringBase {
|
| final int len = this.length;
|
| int first = 0;
|
| for (; first < len; first++) {
|
| - if (!_isWhitespace(this.charCodeAt(first))) {
|
| + // There are no whitespace characters that are outside the BMP so we
|
| + // can use code units here for efficiency.
|
| + if (!_isWhitespace(this.codeUnitAt(first))) {
|
| break;
|
| }
|
| }
|
| @@ -172,7 +174,7 @@ class _StringBase {
|
| }
|
| int last = len - 1;
|
| for (; last >= first; last--) {
|
| - if (!_isWhitespace(this.charCodeAt(last))) {
|
| + if (!_isWhitespace(this.codeUnitAt(last))) {
|
| break;
|
| }
|
| }
|
| @@ -293,20 +295,35 @@ class _StringBase {
|
| return result;
|
| }
|
|
|
| + // TODO(erikcorry): Fix this to use the new code point iterator when it is
|
| + // available.
|
| List<String> splitChars() {
|
| int len = this.length;
|
| final result = new List<String>(len);
|
| - for (int i = 0; i < len; i++) {
|
| - result[i] = this[i];
|
| + bool smpCharacterSeen = false;
|
| + int i, j;
|
| + for (i = j = 0; i < len; i++, j++) {
|
| + int c = charCodeAt(i);
|
| + // Check for non-basic plane character encoded as a UTF-16 surrogate pair.
|
| + if (c >= String.SMP_CODE_POINT_BASE) {
|
| + i++;
|
| + smpCharacterSeen = true;
|
| + }
|
| + result[j] = new String.fromCharCodes([c]);
|
| }
|
| - return result;
|
| + if (!smpCharacterSeen) return result;
|
| + // If we saw some non-basic plane characters, then we have to return a
|
| + // slightly smaller array than expected (we can't trim the original one
|
| + // because it is non-extendable). This rarely happens so this is preferable
|
| + // to having a separate pass over the string to count the code points.
|
| + return result.getRange(0, j);
|
| }
|
|
|
| - List<int> get charCodes {
|
| + List<int> get codeUnits {
|
| int len = this.length;
|
| final result = new List<int>(len);
|
| for (int i = 0; i < len; i++) {
|
| - result[i] = this.charCodeAt(i);
|
| + result[i] = this.codeUnitAt(i);
|
| }
|
| return result;
|
| }
|
| @@ -360,48 +377,90 @@ class _OneByteString extends _StringBase implements String {
|
| "_OneByteString can only be allocated by the VM");
|
| }
|
|
|
| - // Checks for one-byte whitespaces only.
|
| - // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
|
| - // whitespaces for one byte strings.
|
| bool _isWhitespace(int codePoint) {
|
| return
|
| - (codePoint == 32) || // Space.
|
| + (codePoint == 32) || // Space.
|
| + (codePoint == 0xa0) || // No-break space.
|
| ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
|
| }
|
|
|
| + int charCodeAt(int index) => codeUnitAt(index);
|
| +
|
| + List<int> get charCodes => codeUnits;
|
| }
|
|
|
|
|
| -class _TwoByteString extends _StringBase implements String {
|
| - factory _TwoByteString._uninstantiable() {
|
| +class _TwoByteStringBase extends _StringBase {
|
| + factory _TwoByteStringBase._uninstantiable() {
|
| throw new UnsupportedError(
|
| - "_TwoByteString can only be allocated by the VM");
|
| + "_TwoByteStringBase can't be instaniated");
|
| }
|
|
|
| - // Checks for one-byte whitespaces only.
|
| - // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
|
| - // whitespaces. Add checking for multi-byte whitespace codepoints.
|
| + // Works for both code points and code units since all spaces are in the BMP.
|
| bool _isWhitespace(int codePoint) {
|
| return
|
| - (codePoint == 32) || // Space.
|
| - ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
|
| + (codePoint == 32) || // Space.
|
| + (codePoint == 0xa0) || // No-break space.
|
| + ((9 <= codePoint) && (codePoint <= 13)) || // CR, LF, TAB, etc.
|
| + (codePoint >= 0x1680 && // Optimization.
|
| + (codePoint == 0x1680 || // Ogham space mark.
|
| + codePoint == 0x180e || // Mongolian vowel separator.
|
| + (codePoint >= 0x2000 && codePoint <= 0x200a) || // Wide/narrow spaces.
|
| + codePoint == 0x2028 || // Line separator.
|
| + codePoint == 0x2029 || // Paragraph separator.
|
| + codePoint == 0x202f || // Narrow no-break space.
|
| + codePoint == 0x205f || // Medium mathematical space.
|
| + codePoint == 0x3000 || // Ideographic space.
|
| + codePoint == 0xfeff)); // BOM code.
|
| + }
|
| +
|
| + int charCodeAt(int index) {
|
| + const int LEAD_SURROGATE_BASE = 0xd800;
|
| + const int LEAD_SURROGATE_END = 0xdbff;
|
| + const int TRAIL_SURROGATE_BASE = 0xdc00;
|
| + const int TRAIL_SURROGATE_END = 0xdfff;
|
| + const int MASK = 0x3ff;
|
| + int code = codeUnitAt(index);
|
| + if (code < LEAD_SURROGATE_BASE || code > LEAD_SURROGATE_END) return code;
|
| + if (index + 1 >= length) return code;
|
| + int trail = codeUnitAt(index + 1);
|
| + if (trail < TRAIL_SURROGATE_BASE || trail > TRAIL_SURROGATE_END) {
|
| + return code;
|
| + }
|
| + return String.SMP_CODE_POINT_BASE + ((code & MASK) << 10) + (trail & MASK);
|
| + }
|
| +
|
| + // TODO(erikcorry): Fix this to use the new code point iterator when it is
|
| + // available.
|
| + List<int> get charCodes {
|
| + int len = this.length;
|
| + final result = new List<int>(len);
|
| + bool smpCharacterSeen = false;
|
| + int i, j;
|
| + for (i = j = 0; i < len; i++, j++) {
|
| + int c = this.charCodeAt(i);
|
| + // Check for supplementary plane character encoded as a UTF-16 surrogate
|
| + // pair.
|
| + if (c >= String.SMP_CODE_POINT_BASE) {
|
| + i++;
|
| + smpCharacterSeen = true;
|
| + }
|
| + result[j] = c;
|
| + }
|
| + if (!smpCharacterSeen) return result;
|
| + // If we saw some non-basic plane characters, then we have to return a
|
| + // slightly smaller array than expected (we can't trim the original one
|
| + // because it is non-extendable). This rarely happens so this is preferable
|
| + // to having a separate pass over the string to count the code points.
|
| + return result.getRange(0, j);
|
| }
|
| }
|
|
|
|
|
| -class _FourByteString extends _StringBase implements String {
|
| - factory _FourByteString._uninstantiable() {
|
| +class _TwoByteString extends _TwoByteStringBase implements String {
|
| + factory _TwoByteString._uninstantiable() {
|
| throw new UnsupportedError(
|
| - "_FourByteString can only be allocated by the VM");
|
| - }
|
| -
|
| - // Checks for one-byte whitespaces only.
|
| - // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
|
| - // whitespaces. Add checking for multi-byte whitespace codepoints.
|
| - bool _isWhitespace(int codePoint) {
|
| - return
|
| - (codePoint == 32) || // Space.
|
| - ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
|
| + "_TwoByteString can only be allocated by the VM");
|
| }
|
| }
|
|
|
| @@ -412,47 +471,23 @@ class _ExternalOneByteString extends _StringBase implements String {
|
| "_ExternalOneByteString can only be allocated by the VM");
|
| }
|
|
|
| - // Checks for one-byte whitespaces only.
|
| - // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
|
| - // whitespaces for one byte strings.
|
| bool _isWhitespace(int codePoint) {
|
| return
|
| - (codePoint == 32) || // Space.
|
| + (codePoint == 32) || // Space.
|
| + (codePoint == 0xa0) || // No-break space.
|
| ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
|
| }
|
| -}
|
|
|
| + int charCodeAt(int index) => codeUnitAt(index);
|
|
|
| -class _ExternalTwoByteString extends _StringBase implements String {
|
| - factory _ExternalTwoByteString._uninstantiable() {
|
| - throw new UnsupportedError(
|
| - "_ExternalTwoByteString can only be allocated by the VM");
|
| - }
|
| -
|
| - // Checks for one-byte whitespaces only.
|
| - // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
|
| - // whitespaces. Add checking for multi-byte whitespace codepoints.
|
| - bool _isWhitespace(int codePoint) {
|
| - return
|
| - (codePoint == 32) || // Space.
|
| - ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
|
| - }
|
| + List<int> get charCodes => codeUnits;
|
| }
|
|
|
|
|
| -class _ExternalFourByteString extends _StringBase implements String {
|
| - factory _ExternalFourByteString._uninstantiable() {
|
| +class _ExternalTwoByteString extends _TwoByteStringBase implements String {
|
| + factory _ExternalTwoByteString._uninstantiable() {
|
| throw new UnsupportedError(
|
| - "ExternalFourByteString can only be allocated by the VM");
|
| - }
|
| -
|
| - // Checks for one-byte whitespaces only.
|
| - // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
|
| - // whitespaces. Add checking for multi-byte whitespace codepoints.
|
| - bool _isWhitespace(int codePoint) {
|
| - return
|
| - (codePoint == 32) || // Space.
|
| - ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
|
| + "_ExternalTwoByteString can only be allocated by the VM");
|
| }
|
| }
|
|
|
|
|