Chromium Code Reviews| Index: sdk/lib/core/string.dart |
| diff --git a/sdk/lib/core/string.dart b/sdk/lib/core/string.dart |
| index f2d0e613203d238145d68ef8416cddb30d6c79b6..b8731eb223e4500f345119eb029947bb62db3a07 100644 |
| --- a/sdk/lib/core/string.dart |
| +++ b/sdk/lib/core/string.dart |
| @@ -4,34 +4,103 @@ |
| /** |
| * The String class represents character strings. Strings are |
| - * immutable. A string is represented by a list of 32-bit Unicode |
| - * scalar character codes accessible through the [charCodeAt] or the |
| - * [charCodes] method. |
| + * immutable. A string is represented by a list of 16-bit UTF-16 |
| + * code units accessible through the [codeUnitAt] method or the [codeUnits]
| + * getter. The corresponding Unicode code points are available through the
| + * [charCodeAt] method or the [charCodes] getter.
| */ |
| abstract class String implements Comparable, Pattern, Sequence<String> { |
| - /** |
| - * Allocates a new String for the specified [charCodes]. |
| + // Unicode does not allow for code points above this limit. |
| + static const int MAX_CODE_POINT = 0x10ffff; |
| + // A Dart string is represented by UTF-16 code units which must be <= 0xffff. |
| + static const int MAX_CODE_UNIT = 0xffff; |
| + // Code points in this range are reserved for UTF-16 surrogate code units.
| + static const int UNICODE_RESERVED_AREA_START = 0xd800; |
| + static const int UNICODE_RESERVED_AREA_END = 0xdfff; |
| + // Unicode code points above this limit are coded as two code units in Dart's |
| + // UTF-16 string. |
| + static const int SMP_CODE_POINT_BASE = 0x10000; |
| + |
| + /** |
| + * Allocates a new String for the 21 bit Unicode code points in [charCodes].
| + * Throws an ArgumentError if any of the code points are not ints between 0
| + * and [MAX_CODE_POINT]. Also throws an ArgumentError if any of the code
| + * points are in the area reserved for UTF-16 surrogate pairs.
| + */ |
| + factory String.fromCharCodes(List<int> charCodes) { |
|
siva
2012/11/16 22:32:04
why not call this createFromUtf32
erikcorry
2012/11/19 12:40:41
The exact name of this is to be determined by floi
|
| + int pairs = 0; |
| + // There is some duplication of constants here relative to the ones in |
| + // lib/utf/utf16.dart because we don't want core to depend on the utf |
| + // library. |
| + const int MASK = 0x3ff; |
| + const int LEAD_SURROGATE_BASE = UNICODE_RESERVED_AREA_START; |
| + const int TRAIL_SURROGATE_BASE = 0xdc00; |
| + for (var code in charCodes) { |
| + if (code is !int || code < 0) throw new ArgumentError(charCodes); |
| + if (code >= UNICODE_RESERVED_AREA_START) { |
| + if (code > MAX_CODE_UNIT) { |
| + pairs++; |
| + } |
| + if (code <= UNICODE_RESERVED_AREA_END || code > MAX_CODE_POINT) { |
| + // No surrogates or out-of-range code points allowed in the input. |
| + throw new ArgumentError(charCodes); |
| + } |
| + } |
| + } |
| + // Fast case - there are no surrogate pairs. |
| + if (pairs == 0) return new String.fromCodeUnits(charCodes); |
| + var codeUnits = new List<int>(pairs + charCodes.length); |
| + int j = 0; |
| + for (int code in charCodes) { |
| + if (code >= SMP_CODE_POINT_BASE) { |
| + codeUnits[j++] = |
| + LEAD_SURROGATE_BASE + (((code - SMP_CODE_POINT_BASE) >> 10) & MASK); |
| + codeUnits[j++] = TRAIL_SURROGATE_BASE + (code & MASK); |
| + } else { |
| + codeUnits[j++] = code; |
| + } |
| + } |
| + return new String.fromCodeUnits(codeUnits); |
| + } |
| + |
| + /** |
| + * Allocates a new String for the specified 16 bit UTF-16 [codeUnits]. |
| + */ |
| + external factory String.fromCodeUnits(List<int> codeUnits); |
|
siva
2012/11/16 22:32:04
Should be called createFromUtf16 to avoid confusi
erikcorry
2012/11/19 12:40:41
As above, up to floitsch.
|
| + |
| + /** |
| + * Gets the Unicode character (as [String]) at the given [index]. This |
| + * routine can return a single combining character (accent) that would |
| + * normally be displayed together with the character it is modifying. |
| + * If the index corresponds to a surrogate code unit then a one-code-unit |
| + * string is returned containing that unpaired surrogate code unit. |
| */ |
| - external factory String.fromCharCodes(List<int> charCodes); |
| + String operator [](int index); |
| /** |
| - * Gets the character (as [String]) at the given [index]. |
| + * Gets the 21 bit Unicode code point at the given [index]. If the code units |
| + * at index and index + 1 form a valid surrogate pair then this function |
| + * returns the non-basic plane code point that they represent. If the code |
| + * unit at index is a trailing surrogate or a leading surrogate that is not |
| + * followed by a trailing surrogate then the raw code unit is returned. |
| */ |
| - String operator [](int index); |
| + int charCodeAt(int index); |
| /** |
| - * Gets the scalar character code at the given [index]. |
| + * Gets the 16 bit UTF-16 code unit at the given index. |
| */ |
| - int charCodeAt(int index); |
| + int codeUnitAt(int index); |
| + |
| /** |
| - * The length of the string. |
| + * The length of the string, measured in UTF-16 code units. |
| */ |
| int get length; |
| /** |
| * Returns whether the two strings are equal. This method compares |
| - * each individual scalar character codes of the strings. |
| + * each individual UTF-16 code unit. No Unicode normalization is |
| + * performed (accent composition/decomposition). |
| */ |
| bool operator ==(String other); |
| @@ -76,10 +145,13 @@ abstract class String implements Comparable, Pattern, Sequence<String> { |
| String substring(int startIndex, [int endIndex]); |
| /** |
| - * Removes leading and trailing whitespace from a string. If the |
| - * string contains leading or trailing whitespace a new string with |
| - * no leading and no trailing whitespace is returned. Otherwise, the |
| - * string itself is returned. |
| + * Removes leading and trailing whitespace from a string. If the string |
| + * contains leading or trailing whitespace a new string with no leading and |
| + * no trailing whitespace is returned. Otherwise, the string itself is |
| + * returned. Whitespace is defined as every Unicode character in the Zs, Zl |
| + * and Zp categories (this includes no-break space), the spacing control |
| + * characters from 9 to 13 (tab, lf, vtab, ff and cr), and the BOM
| + * character 0xfeff.
| */ |
| String trim(); |
| @@ -108,14 +180,42 @@ abstract class String implements Comparable, Pattern, Sequence<String> { |
| List<String> split(Pattern pattern); |
| /** |
| - * Returns a list of the characters of this string. |
| + * Returns a list of the characters of this string. No string normalization |
| + * is performed so combining characters (accents) that are not precomposed
| + * may be found in the list. Valid surrogate pairs are returned as one string.
| */ |
| List<String> splitChars(); |
| /** |
| - * Returns a list of the scalar character codes of this string. |
| + * Returns a list of the 21 bit Unicode code points of this string. |
| + */ |
| + List<int> get charCodes { |
|
siva
2012/11/16 22:32:04
TODO use codepoint iterator instead of charCodeAt
erikcorry
2012/11/19 12:40:41
This method was a dupe. Removed.
|
| + int len = this.length; |
| + final result = new List<int>(len); |
| + int i, j; |
| + for (i = j = 0; i < len; i++, j++) { |
| + int c = this.charCodeAt(i); |
| + // Check for non-basic plane character encoded as a UTF-16 surrogate pair. |
| + if (c > 0xffff) { |
| + i++; |
| + } |
| + result[j] = c; |
| + } |
| + if (i == j) return result; |
| + // If we saw some non-basic plane characters, then we have to return a |
| + // slightly smaller array than expected (we can't trim the original one |
| + // because it is non-extendable). This rarely happens so this is preferable |
| + // to having a separate pass over the string to count the code points. |
| + final newResult = new List<int>(j); |
| + for (i = 0; i < j; i++) newResult[i] = result[i]; |
| + return newResult; |
|
siva
2012/11/16 22:32:04
This piece of code was repeated earlier too, maybe
erikcorry
2012/11/19 12:40:41
Ditto.
|
| + } |
| + |
| + |
| + /** |
| + * Returns a list of the 16 bit UTF-16 code units of this string. |
| */ |
| - List<int> get charCodes; |
| + List<int> get codeUnits; |
| /** |
| * If this string is not already all lower case, returns a new string |