Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(214)

Unified Diff: sdk/lib/core/string.dart

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Implemented feedback from patch set 3 Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: sdk/lib/core/string.dart
diff --git a/sdk/lib/core/string.dart b/sdk/lib/core/string.dart
index f2d0e613203d238145d68ef8416cddb30d6c79b6..7fce25849bd5357dc641c837f86f3e8813c5ee30 100644
--- a/sdk/lib/core/string.dart
+++ b/sdk/lib/core/string.dart
@@ -4,34 +4,103 @@
/**
* The String class represents character strings. Strings are
- * immutable. A string is represented by a list of 32-bit Unicode
- * scalar character codes accessible through the [charCodeAt] or the
- * [charCodes] method.
+ * immutable. A string is represented by a list of 16-bit UTF-16
+ * code units accessible through the [codeUnitAt] method or the
+ * [codeUnits] getter. The corresponding Unicode code points are
+ * available through the [charCodeAt] method or the [charCodes] getter.
*/
abstract class String implements Comparable, Pattern, Sequence<String> {
- /**
- * Allocates a new String for the specified [charCodes].
+ // Unicode does not allow for code points above this limit.
+ static const int MAX_CODE_POINT = 0x10ffff;
+ // A Dart string is represented by UTF-16 code units which must be <= 0xffff.
+ static const int MAX_CODE_UNIT = 0xffff;
+ // Unicode does not allow for code points in this range.
+ static const int UNICODE_RESERVED_AREA_START = 0xd800;
+ static const int UNICODE_RESERVED_AREA_END = 0xdfff;
+ // Unicode code points above this limit are coded as two code units in Dart's
+ // UTF-16 string.
+ static const int SMP_CODE_POINT_BASE = 0x10000;
+
+ /**
+ * Allocates a new String for the specified 21 bit Unicode [charCodes].
+ * Throws an ArgumentError if any element of [charCodes] is not an int
+ * between 0 and [MAX_CODE_POINT]. Also throws an ArgumentError if any of
+ * the code points are in the area reserved for UTF-16 surrogate pairs.
+ */
+ factory String.fromCharCodes(List<int> charCodes) {
+   // The surrogate constants below duplicate the ones in lib/utf/utf16.dart
+   // because core must not depend on the utf library.
+   const int LEAD_SURROGATE_BASE = UNICODE_RESERVED_AREA_START;
+   const int TRAIL_SURROGATE_BASE = 0xdc00;
+   const int MASK = 0x3ff;
+   // Pass 1: validate every element and count the code points that have to
+   // be encoded as a surrogate pair.
+   int surrogatePairCount = 0;
+   for (var value in charCodes) {
+     if (value is !int || value < 0) throw new ArgumentError(charCodes);
+     if (value < UNICODE_RESERVED_AREA_START) continue;
+     if (value > MAX_CODE_UNIT) surrogatePairCount++;
+     // No surrogates or out-of-range code points allowed in the input.
+     if (value <= UNICODE_RESERVED_AREA_END || value > MAX_CODE_POINT) {
+       throw new ArgumentError(charCodes);
+     }
+   }
+   // Fast case - nothing needs to be split, so the input is already a list
+   // of code units.
+   if (surrogatePairCount == 0) return new String.fromCodeUnits(charCodes);
+   // Pass 2: split each supplementary code point into a surrogate pair.
+   var units = new List<int>(charCodes.length + surrogatePairCount);
+   int next = 0;
+   for (int value in charCodes) {
+     if (value < SMP_CODE_POINT_BASE) {
+       units[next++] = value;
+       continue;
+     }
+     int offset = value - SMP_CODE_POINT_BASE;
+     units[next++] = LEAD_SURROGATE_BASE + ((offset >> 10) & MASK);
+     units[next++] = TRAIL_SURROGATE_BASE + (value & MASK);
+   }
+   return new String.fromCodeUnits(units);
+ }
+
+ /**
+ * Allocates a new String for the specified 16 bit UTF-16 [codeUnits].
+ */
+ external factory String.fromCodeUnits(List<int> codeUnits);
+
+ /**
+ * Gets the Unicode character (as [String]) at the given [index]. This
+ * routine can return a single combining character (accent) that would
+ * normally be displayed together with the character it is modifying.
+ * If the index corresponds to a surrogate code unit then a one-code-unit
+ * string is returned containing that unpaired surrogate code unit.
*/
- external factory String.fromCharCodes(List<int> charCodes);
+ String operator [](int index);
/**
- * Gets the character (as [String]) at the given [index].
+ * Gets the 21 bit Unicode code point at the given [index]. If the code units
+ * at index and index + 1 form a valid surrogate pair then this function
+ * returns the non-basic plane code point that they represent. If the code
+ * unit at index is a trailing surrogate or a leading surrogate that is not
+ * followed by a trailing surrogate then the raw code unit is returned.
*/
- String operator [](int index);
+ int charCodeAt(int index);
/**
- * Gets the scalar character code at the given [index].
+ * Gets the 16 bit UTF-16 code unit at the given [index].
*/
- int charCodeAt(int index);
+ int codeUnitAt(int index);
+
/**
- * The length of the string.
+ * The length of the string, measured in UTF-16 code units.
*/
int get length;
/**
* Returns whether the two strings are equal. This method compares
- * each individual scalar character codes of the strings.
+ * each individual UTF-16 code unit. No Unicode normalization is
+ * performed (accent composition/decomposition).
*/
bool operator ==(String other);
@@ -76,10 +145,13 @@ abstract class String implements Comparable, Pattern, Sequence<String> {
String substring(int startIndex, [int endIndex]);
/**
- * Removes leading and trailing whitespace from a string. If the
- * string contains leading or trailing whitespace a new string with
- * no leading and no trailing whitespace is returned. Otherwise, the
- * string itself is returned.
+ * Removes leading and trailing whitespace from a string. If the string
+ * contains leading or trailing whitespace a new string with no leading and
+ * no trailing whitespace is returned. Otherwise, the string itself is
+ * returned. Whitespace is defined as every Unicode character in the Zs, Zl
+ * and Zp categories (this includes no-break space), the spacing control
+ * characters from 9 to 13 (tab, lf, vtab, ff and cr), and the BOM
+ * character (0xfeff).
*/
String trim();
@@ -108,16 +180,23 @@ abstract class String implements Comparable, Pattern, Sequence<String> {
List<String> split(Pattern pattern);
/**
- * Returns a list of the characters of this string.
+ * Returns a list of the characters of this string. No string normalization
+ * is performed, so uncomposed combining characters (accents) may be found
+ * in the list. Each valid surrogate pair is returned as one string.
*/
List<String> splitChars();
/**
- * Returns a list of the scalar character codes of this string.
+ * Returns a list of the 21 bit Unicode code points of this string.
*/
List<int> get charCodes;
/**
+ * Returns a list of the 16 bit UTF-16 code units of this string.
+ */
+ List<int> get codeUnits;
+
+ /**
* If this string is not already all lower case, returns a new string
* where all characters are made lower case. Returns [:this:] otherwise.
*/

Powered by Google App Engine
This is Rietveld 408576698