sdk/lib/core/string.dart - Issue 11368138: Add some support for the code-point code-unit distinction.

Unified Diff: sdk/lib/core/string.dart

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Implemented feedback from patch set 2. Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: sdk/lib/core/string.dart

diff --git a/sdk/lib/core/string.dart b/sdk/lib/core/string.dart

index f2d0e613203d238145d68ef8416cddb30d6c79b6..b8731eb223e4500f345119eb029947bb62db3a07 100644

--- a/sdk/lib/core/string.dart

+++ b/sdk/lib/core/string.dart

@@ -4,34 +4,103 @@

/**

* The String class represents character strings. Strings are

- * immutable. A string is represented by a list of 32-bit Unicode

- * scalar character codes accessible through the [charCodeAt] or the

- * [charCodes] method.

+ * immutable. A string is represented by a list of 16-bit UTF-16

+ * code units accessible through the [codeUnitAt] or the [codeUnits]

+ * methods. The corresponding Unicode code points are available with

+ * [charCodeAt] or the [charCodes] method.

abstract class String implements Comparable, Pattern, Sequence<String> {

- /**

- * Allocates a new String for the specified [charCodes].

+ // Unicode does not allow for code points above this limit.

+ static const int MAX_CODE_POINT = 0x10ffff;

+ // A Dart string is represented by UTF-16 code units which must be <= 0xffff.

+ static const int MAX_CODE_UNIT = 0xffff;

+ // Unicode does not allow for code points in this range.

+ static const int UNICODE_RESERVED_AREA_START = 0xd800;

+ static const int UNICODE_RESERVED_AREA_END = 0xdfff;

+ // Unicode code points above this limit are coded as two code units in Dart's

+ // UTF-16 string.

+ static const int SMP_CODE_POINT_BASE = 0x10000;

+ /**

+ * Allocates a new String for the specified 21 bit Unicode [charCodes].

+ * Throws an ArgumentError if any of the codePoints are not ints between 0 and

+ * MAX_CODE_POINT. Also throws an ArgumentError if any of the code points

+ * are in the area reserved for UTF-16 surrogate pairs.

+ */

+ factory String.fromCharCodes(List<int> charCodes) {

siva 2012/11/16 22:32:04 why not call this createFromUtf32

erikcorry 2012/11/19 12:40:41 The exact name of this is to be determined by floi

+ int pairs = 0;

+ // There is some duplication of constants here relative to the ones in

+ // lib/utf/utf16.dart because we don't want core to depend on the utf

+ // library.

+ const int MASK = 0x3ff;

+ const int LEAD_SURROGATE_BASE = UNICODE_RESERVED_AREA_START;

+ const int TRAIL_SURROGATE_BASE = 0xdc00;

+ for (var code in charCodes) {

+ if (code is !int || code < 0) throw new ArgumentError(charCodes);

+ if (code >= UNICODE_RESERVED_AREA_START) {

+ if (code > MAX_CODE_UNIT) {

+ pairs++;

+ }

+ if (code <= UNICODE_RESERVED_AREA_END || code > MAX_CODE_POINT) {

+ // No surrogates or out-of-range code points allowed in the input.

+ throw new ArgumentError(charCodes);

+ }

+ // Fast case - there are no surrogate pairs.

+ if (pairs == 0) return new String.fromCodeUnits(charCodes);

+ var codeUnits = new List<int>(pairs + charCodes.length);

+ int j = 0;

+ for (int code in charCodes) {

+ if (code >= SMP_CODE_POINT_BASE) {

+ codeUnits[j++] =

+ LEAD_SURROGATE_BASE + (((code - SMP_CODE_POINT_BASE) >> 10) & MASK);

+ codeUnits[j++] = TRAIL_SURROGATE_BASE + (code & MASK);

+ } else {

+ codeUnits[j++] = code;

+ }

+ return new String.fromCodeUnits(codeUnits);

+ }

+ /**

+ * Allocates a new String for the specified 16 bit UTF-16 [codeUnits].

+ */

+ external factory String.fromCodeUnits(List<int> codeUnits);

siva 2012/11/16 22:32:04 Should be called createFromUtf16 to avoid confusi

erikcorry 2012/11/19 12:40:41 As above, up to floitsch.

+ /**

+ * Gets the Unicode character (as [String]) at the given [index]. This

+ * routine can return a single combining character (accent) that would

+ * normally be displayed together with the character it is modifying.

+ * If the index corresponds to a surrogate code unit then a one-code-unit

+ * string is returned containing that unpaired surrogate code unit.

- external factory String.fromCharCodes(List<int> charCodes);

+ String operator [](int index);

/**

- * Gets the character (as [String]) at the given [index].

+ * Gets the 21 bit Unicode code point at the given [index]. If the code units

+ * at index and index + 1 form a valid surrogate pair then this function

+ * returns the non-basic plane code point that they represent. If the code

+ * unit at index is a trailing surrogate or a leading surrogate that is not

+ * followed by a trailing surrogate then the raw code unit is returned.

- String operator [](int index);

+ int charCodeAt(int index);

/**

- * Gets the scalar character code at the given [index].

+ * Gets the 16 bit UTF-16 code unit at the given index.

- int charCodeAt(int index);

+ int codeUnitAt(int index);

/**

- * The length of the string.

+ * The length of the string, measured in UTF-16 code units.

int get length;

/**

* Returns whether the two strings are equal. This method compares

- * each individual scalar character codes of the strings.

+ * each individual UTF-16 code unit. No Unicode normalization is

+ * performed (accent composition/decomposition).

bool operator ==(String other);

@@ -76,10 +145,13 @@ abstract class String implements Comparable, Pattern, Sequence<String> {

String substring(int startIndex, [int endIndex]);

/**

- * Removes leading and trailing whitespace from a string. If the

- * string contains leading or trailing whitespace a new string with

- * no leading and no trailing whitespace is returned. Otherwise, the

- * string itself is returned.

+ * Removes leading and trailing whitespace from a string. If the string

+ * contains leading or trailing whitespace a new string with no leading and

+ * no trailing whitespace is returned. Otherwise, the string itself is

+ * returned. Whitespace is defined as every Unicode character in the Zs, Zl

+ * and Zp categories (this includes no-break space), the spacing control

+ * characters from 9 to 13 (tab, lf, vtab, ff and cr), and 0xfeff, the BOM

+ * character.

String trim();

@@ -108,14 +180,42 @@ abstract class String implements Comparable, Pattern, Sequence<String> {

List<String> split(Pattern pattern);

/**

- * Returns a list of the characters of this string.

+ * Returns a list of the characters of this string. No string normalization

+ * is performed, so combining characters (accents) that are not precomposed

+ * may be found in the list. Valid surrogate pairs are returned as one string.

List<String> splitChars();

/**

- * Returns a list of the scalar character codes of this string.

+ * Returns a list of the 21 bit Unicode code points of this string.

+ */

+ List<int> get charCodes {

siva 2012/11/16 22:32:04 TODO use codepoint iterator instead of charCodeAt

erikcorry 2012/11/19 12:40:41 This method was a dupe. Removed.

+ int len = this.length;

+ final result = new List<int>(len);

+ int i, j;

+ for (i = j = 0; i < len; i++, j++) {

+ int c = this.charCodeAt(i);

+ // Check for non-basic plane character encoded as a UTF-16 surrogate pair.

+ if (c > 0xffff) {

+ i++;

+ }

+ result[j] = c;

+ }

+ if (i == j) return result;

+ // If we saw some non-basic plane characters, then we have to return a

+ // slightly smaller array than expected (we can't trim the original one

+ // because it is non-extendable). This rarely happens so this is preferable

+ // to having a separate pass over the string to count the code points.

+ final newResult = new List<int>(j);

+ for (i = 0; i < j; i++) newResult[i] = result[i];

+ return newResult;

siva 2012/11/16 22:32:04 This piece of code was repeated earlier too, maybe

erikcorry 2012/11/19 12:40:41 Ditto.

+ }

+ /**

+ * Returns a list of the 16 bit UTF-16 code units of this string.

- List<int> get charCodes;

+ List<int> get codeUnits;

/**

* If this string is not already all lower case, returns a new string

« runtime/vm/object.cc ('K') | « sdk/lib/_internal/compiler/implementation/util/util.dart ('k') | sdk/lib/io/string_stream.dart » ('j') | no next file with comments »