Index: sdk/lib/core/string.dart |
diff --git a/sdk/lib/core/string.dart b/sdk/lib/core/string.dart |
index 3de2b8cfa17071c38b26dbd669b2ba09dfef2394..f2d0e613203d238145d68ef8416cddb30d6c79b6 100644 |
--- a/sdk/lib/core/string.dart |
+++ b/sdk/lib/core/string.dart |
@@ -4,103 +4,34 @@ |
/** |
* The String class represents character strings. Strings are |
- * immutable. A string is represented by a list of 16-bit UTF-16 |
- * code units accessible through the [codeUnitAt] or the [codeUnits] |
- * methods. The corresponding Unicode code points are available with |
- * [charCodeAt] or the [charCodes] method. |
+ * immutable. A string is represented by a list of 32-bit Unicode |
+ * scalar character codes accessible through the [charCodeAt] method |
+ * or the [charCodes] getter. |
*/ |
abstract class String implements Comparable, Pattern, Sequence<String> { |
- // Unicode does not allow for code points above this limit. |
- static const int MAX_CODE_POINT = 0x10ffff; |
- // A Dart string is represented by UTF-16 code units which must be <= 0xffff. |
- static const int MAX_CODE_UNIT = 0xffff; |
- // Unicode does not allow for code points in this range. |
- static const int UNICODE_RESERVED_AREA_START = 0xd800; |
- static const int UNICODE_RESERVED_AREA_END = 0xdfff; |
- // Unicode code points above this limit are coded as two code units in Dart's |
- // UTF-16 string. |
- static const int SUPPLEMENTARY_CODE_POINT_BASE = 0x10000; |
- |
- /** |
- * Allocates a new String for the specified 21 bit Unicode [codePoints]. |
- * Throws an ArgumentError if any of the codePoints are not ints between 0 and |
- * MAX_CODE_POINT. Also throws an ArgumentError if any of the code points |
- * are in the area reserved for UTF-16 surrogate pairs. |
- */ |
- factory String.fromCharCodes(List<int> charCodes) { |
- int pairs = 0; |
- // There is some duplication of constants here relative to the ones in |
- // lib/utf/utf16.dart because we don't want core to depend on the utf |
- // library. |
- const int MASK = 0x3ff; |
- const int LEAD_SURROGATE_BASE = UNICODE_RESERVED_AREA_START; |
- const int TRAIL_SURROGATE_BASE = 0xdc00; |
- for (var code in charCodes) { |
- if (code is !int || code < 0) throw new ArgumentError(charCodes); |
- if (code >= UNICODE_RESERVED_AREA_START) { |
- if (code > MAX_CODE_UNIT) { |
- pairs++; |
- } |
- if (code <= UNICODE_RESERVED_AREA_END || code > MAX_CODE_POINT) { |
- // No surrogates or out-of-range code points allowed in the input. |
- throw new ArgumentError(charCodes); |
- } |
- } |
- } |
- // Fast case - there are no surrogate pairs. |
- if (pairs == 0) return new String.fromCodeUnits(charCodes); |
- var codeUnits = new List<int>(pairs + charCodes.length); |
- int j = 0; |
- for (int code in charCodes) { |
- if (code >= SUPPLEMENTARY_CODE_POINT_BASE) { |
- codeUnits[j++] = LEAD_SURROGATE_BASE + |
- (((code - SUPPLEMENTARY_CODE_POINT_BASE) >> 10) & MASK); |
- codeUnits[j++] = TRAIL_SURROGATE_BASE + (code & MASK); |
- } else { |
- codeUnits[j++] = code; |
- } |
- } |
- return new String.fromCodeUnits(codeUnits); |
- } |
- |
- /** |
- * Allocates a new String for the specified 16 bit UTF-16 [codeUnits]. |
- */ |
- external factory String.fromCodeUnits(List<int> codeUnits); |
- |
- /** |
- * Gets the Unicode character (as [String]) at the given [index]. This |
- * routine can return a single combining character (accent) that would |
- * normally be displayed together with the character it is modifying. |
- * If the index corresponds to a surrogate code unit then a one-code-unit |
- * string is returned containing that unpaired surrogate code unit. |
+ /** |
+ * Allocates a new String for the specified [charCodes]. |
*/ |
- String operator [](int index); |
+ external factory String.fromCharCodes(List<int> charCodes); |
/** |
- * Gets the 21 bit Unicode code point at the given [index]. If the code units |
- * at index and index + 1 form a valid surrogate pair then this function |
- * returns the non-basic plane code point that they represent. If the code |
- * unit at index is a trailing surrogate or a leading surrogate that is not |
- * followed by a trailing surrogate then the raw code unit is returned. |
+ * Gets the character (as [String]) at the given [index]. |
*/ |
- int charCodeAt(int index); |
+ String operator [](int index); |
/** |
- * Gets the 16 bit UTF-16 code unit at the given index. |
+ * Gets the scalar character code at the given [index]. |
*/ |
- int codeUnitAt(int index); |
- |
+ int charCodeAt(int index); |
/** |
- * The length of the string, measured in UTF-16 code units. |
+ * The length of the string. |
*/ |
int get length; |
/** |
* Returns whether the two strings are equal. This method compares |
- * each individual UTF-16 code unit. No Unicode normalization is |
- * performed (accent composition/decomposition). |
+ * each individual scalar character code of the strings. |
*/ |
bool operator ==(String other); |
@@ -145,13 +76,10 @@ abstract class String implements Comparable, Pattern, Sequence<String> { |
String substring(int startIndex, [int endIndex]); |
/** |
- * Removes leading and trailing whitespace from a string. If the string |
- * contains leading or trailing whitespace a new string with no leading and |
- * no trailing whitespace is returned. Otherwise, the string itself is |
- * returned. Whitespace is defined as every Unicode character in the Zs, Zl |
- * and Zp categories (this includes no-break space), the spacing control |
- * characters from 9 to 13 (tab, lf, vtab, ff and cr), and 0xfeff the BOM |
- * character. |
+ * Removes leading and trailing whitespace from a string. If the |
+ * string contains leading or trailing whitespace a new string with |
+ * no leading and no trailing whitespace is returned. Otherwise, the |
+ * string itself is returned. |
*/ |
String trim(); |
@@ -180,23 +108,16 @@ abstract class String implements Comparable, Pattern, Sequence<String> { |
List<String> split(Pattern pattern); |
/** |
- * Returns a list of the characters of this string. No string normalization |
- * is performed so unprecomposed combining characters (accents) may be found |
- * in the list. Valid surrogate pairs are returned as one string. |
+ * Returns a list of the characters of this string. |
*/ |
List<String> splitChars(); |
/** |
- * Returns a list of the 21 bit Unicode code points of this string. |
+ * Returns a list of the scalar character codes of this string. |
*/ |
List<int> get charCodes; |
/** |
- * Returns a list of the 16 bit UTF-16 code units of this string. |
- */ |
- List<int> get codeUnits; |
- |
- /** |
* If this string is not already all lower case, returns a new string |
* where all characters are made lower case. Returns [:this:] otherwise. |
*/ |