sdk/lib/core/string.dart - Issue 11368138: Add some support for the code-point code-unit distinction.

Side by Side Diff: sdk/lib/core/string.dart

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Implemented feedback from patch set 2. Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 /**	5 /**

6 * The String class represents character strings. Strings are	6 * The String class represents character strings. Strings are

7 * immutable. A string is represented by a list of 32-bit Unicode	7 * immutable. A string is represented by a list of 16-bit UTF-16

8 * scalar character codes accessible through the [charCodeAt] or the	8 * code units accessible through the [codeUnitAt] or the [codeUnits]

9 * [charCodes] method.	9 * methods. The corresponding Unicode code points are available with

	10 * [charCodeAt] or the [charCodes] method.

10 */	11 */

11 abstract class String implements Comparable, Pattern, Sequence<String> {	12 abstract class String implements Comparable, Pattern, Sequence<String> {

12 /**	13 // Unicode does not allow for code points above this limit.

13 * Allocates a new String for the specified [charCodes].	14 static const int MAX_CODE_POINT = 0x10ffff;

14 */	15 // A Dart string is represented by UTF-16 code units which must be <= 0xffff.

15 external factory String.fromCharCodes(List<int> charCodes);	16 static const int MAX_CODE_UNIT = 0xffff;

	17 // Unicode does not allow for code points in this range.

	18 static const int UNICODE_RESERVED_AREA_START = 0xd800;

	19 static const int UNICODE_RESERVED_AREA_END = 0xdfff;

	20 // Unicode code points above this limit are coded as two code units in Dart's

	21 // UTF-16 string.

	22 static const int SMP_CODE_POINT_BASE = 0x10000;

16	23

17 /**	24 /**

18 * Gets the character (as [String]) at the given [index].	25 * Allocates a new String for the specified 21 bit Unicode [codePoints].

	26 * Throws an ArgumentError if any of the codePoints are not ints between 0 and

	27 * MAX_CODE_POINT. Also throws an ArgumentError if any of the code points

	28 * are in the area reserved for UTF-16 surrogate pairs.

	29 */

	30 factory String.fromCharCodes(List<int> charCodes) {
	[siva, 2012/11/16 22:32:04] why not call this createFromUtf32
	[erikcorry, 2012/11/19 12:40:41] On 2012/11/16 22:32:04, siva wrote: > why not call this createFromUtf32 -- The exact name of this is to be determined by floitsch and I already have his LGTM. I'm sure he'll rename it when he has decided on a name.
	31 int pairs = 0;

	32 // There is some duplication of constants here relative to the ones in

	33 // lib/utf/utf16.dart because we don't want core to depend on the utf

	34 // library.

	35 const int MASK = 0x3ff;

	36 const int LEAD_SURROGATE_BASE = UNICODE_RESERVED_AREA_START;

	37 const int TRAIL_SURROGATE_BASE = 0xdc00;

	38 for (var code in charCodes) {

	39 if (code is !int || code < 0) throw new ArgumentError(charCodes);

	40 if (code >= UNICODE_RESERVED_AREA_START) {

	41 if (code > MAX_CODE_UNIT) {

	42 pairs++;

	43 }

	44 if (code <= UNICODE_RESERVED_AREA_END || code > MAX_CODE_POINT) {

	45 // No surrogates or out-of-range code points allowed in the input.

	46 throw new ArgumentError(charCodes);

	47 }

	48 }

	49 }

	50 // Fast case - there are no surrogate pairs.

	51 if (pairs == 0) return new String.fromCodeUnits(charCodes);

	52 var codeUnits = new List<int>(pairs + charCodes.length);

	53 int j = 0;

	54 for (int code in charCodes) {

	55 if (code >= SMP_CODE_POINT_BASE) {

	56 codeUnits[j++] =

	57 LEAD_SURROGATE_BASE + (((code - SMP_CODE_POINT_BASE) >> 10) & MASK);

	58 codeUnits[j++] = TRAIL_SURROGATE_BASE + (code & MASK);

	59 } else {

	60 codeUnits[j++] = code;

	61 }

	62 }

	63 return new String.fromCodeUnits(codeUnits);

	64 }

	65

	66 /**

	67 * Allocates a new String for the specified 16 bit UTF-16 [codeUnits].

	68 */

	69 external factory String.fromCodeUnits(List<int> codeUnits);
	[siva, 2012/11/16 22:32:04] Should be called createFromUtf16 to avoid confusion; you have already renamed the VM method.
	[erikcorry, 2012/11/19 12:40:41] On 2012/11/16 22:32:04, siva wrote: > Should be called createFromUtf16 to avoid confusion > you have already renamed the VM method. -- As above, up to floitsch.
	70

	71 /**

	72 * Gets the Unicode character (as [String]) at the given [index]. This

	73 * routine can return a single combining character (accent) that would

	74 * normally be displayed together with the character it is modifying.

	75 * If the index corresponds to a surrogate code unit then a one-code-unit

	76 * string is returned containing that unpaired surrogate code unit.

19 */	77 */

20 String operator [](int index);	78 String operator [](int index);

21	79

22 /**	80 /**

23 * Gets the scalar character code at the given [index].	81 * Gets the 21 bit Unicode code point at the given [index]. If the code units

	82 * at index and index + 1 form a valid surrogate pair then this function

	83 * returns the non-basic plane code point that they represent. If the code

	84 * unit at index is a trailing surrogate or a leading surrogate that is not

	85 * followed by a trailing surrogate then the raw code unit is returned.

24 */	86 */

25 int charCodeAt(int index);	87 int charCodeAt(int index);

26	88

27 /**	89 /**

28 * The length of the string.	90 * Gets the 16 bit UTF-16 code unit at the given index.

	91 */

	92 int codeUnitAt(int index);

	93

	94

	95 /**

	96 * The length of the string, measured in UTF-16 code units.

29 */	97 */

30 int get length;	98 int get length;

31	99

32 /**	100 /**

33 * Returns whether the two strings are equal. This method compares	101 * Returns whether the two strings are equal. This method compares

34 * each individual scalar character codes of the strings.	102 * each individual UTF-16 code unit. No Unicode normalization is

	103 * performed (accent composition/decomposition).

35 */	104 */

36 bool operator ==(String other);	105 bool operator ==(String other);

37	106

38 /**	107 /**

39 * Returns whether this string ends with [other].	108 * Returns whether this string ends with [other].

40 */	109 */

41 bool endsWith(String other);	110 bool endsWith(String other);

42	111

43 /**	112 /**

44 * Returns whether this string starts with [other].	113 * Returns whether this string starts with [other].

(...skipping 24 matching lines...) Expand all Loading...
69 */	138 */

70 String concat(String other);	139 String concat(String other);

71	140

72 /**	141 /**

73 * Returns a substring of this string in the given range.	142 * Returns a substring of this string in the given range.

74 * [startIndex] is inclusive and [endIndex] is exclusive.	143 * [startIndex] is inclusive and [endIndex] is exclusive.

75 */	144 */

76 String substring(int startIndex, [int endIndex]);	145 String substring(int startIndex, [int endIndex]);

77	146

78 /**	147 /**

79 * Removes leading and trailing whitespace from a string. If the	148 * Removes leading and trailing whitespace from a string. If the string

80 * string contains leading or trailing whitespace a new string with	149 * contains leading or trailing whitespace a new string with no leading and

81 * no leading and no trailing whitespace is returned. Otherwise, the	150 * no trailing whitespace is returned. Otherwise, the string itself is

82 * string itself is returned.	151 * returned. Whitespace is defined as every Unicode character in the Zs, Zl

	152 * and Zp categories (this includes no-break space), the spacing control

	153 * characters from 9 to 13 (tab, lf, vtab, ff and cr), and 0xfeff the BOM

	154 * character.

83 */	155 */

84 String trim();	156 String trim();

85	157

86 /**	158 /**

87 * Returns whether this string contains [other] starting	159 * Returns whether this string contains [other] starting

88 * at [startIndex] (inclusive).	160 * at [startIndex] (inclusive).

89 */	161 */

90 bool contains(Pattern other, [int startIndex]);	162 bool contains(Pattern other, [int startIndex]);

91	163

92 /**	164 /**

93 * Returns a new string where the first occurence of [from] in this string	165 * Returns a new string where the first occurence of [from] in this string

94 * is replaced with [to].	166 * is replaced with [to].

95 */	167 */

96 String replaceFirst(Pattern from, String to);	168 String replaceFirst(Pattern from, String to);

97	169

98 /**	170 /**

99 * Returns a new string where all occurences of [from] in this string	171 * Returns a new string where all occurences of [from] in this string

100 * are replaced with [to].	172 * are replaced with [to].

101 */	173 */

102 String replaceAll(Pattern from, String to);	174 String replaceAll(Pattern from, String to);

103	175

104 /**	176 /**

105 * Splits the string around matches of [pattern]. Returns	177 * Splits the string around matches of [pattern]. Returns

106 * a list of substrings.	178 * a list of substrings.

107 */	179 */

108 List<String> split(Pattern pattern);	180 List<String> split(Pattern pattern);

109	181

110 /**	182 /**

111 * Returns a list of the characters of this string.	183 * Returns a list of the characters of this string. No string normalization

	184 * is performed so unprecomposed combining characters (accents) may be found

	185 * in the list. Valid surrogate pairs are returned as one string.

112 */	186 */

113 List<String> splitChars();	187 List<String> splitChars();

114	188

115 /**	189 /**

116 * Returns a list of the scalar character codes of this string.	190 * Returns a list of the 21 bit Unicode code points of this string.

117 */	191 */

118 List<int> get charCodes;	192 List<int> get charCodes {
	[siva, 2012/11/16 22:32:04] TODO use codepoint iterator instead of charCodeAt here?
	[erikcorry, 2012/11/19 12:40:41] On 2012/11/16 22:32:04, siva wrote: > TODO use codepoint iterator instead of charCodeAt here? -- This method was a dupe. Removed.
	193 int len = this.length;

	194 final result = new List<int>(len);

	195 int i, j;

	196 for (i = j = 0; i < len; i++, j++) {

	197 int c = this.charCodeAt(i);

	198 // Check for non-basic plane character encoded as a UTF-16 surrogate pair.

	199 if (c > 0xffff) {

	200 i++;

	201 }

	202 result[j] = c;

	203 }

	204 if (i == j) return result;

	205 // If we saw some non-basic plane characters, then we have to return a

	206 // slightly smaller array than expected (we can't trim the original one

	207 // because it is non-extendable). This rarely happens so this is preferable

	208 // to having a separate pass over the string to count the code points.

	209 final newResult = new List<int>(j);

	210 for (i = 0; i < j; i++) newResult[i] = result[i];

	211 return newResult;
	[siva, 2012/11/16 22:32:04] This piece of code was repeated earlier too, maybe abstract it out.
	[erikcorry, 2012/11/19 12:40:41] On 2012/11/16 22:32:04, siva wrote: > This piece of code was repeated earlier too, maybe abstract > it out. -- Ditto.
	212 }

	213

	214

	215 /**

	216 * Returns a list of the 16 bit UTF-16 code units of this string.

	217 */

	218 List<int> get codeUnits;

119	219

120 /**	220 /**

121 * If this string is not already all lower case, returns a new string	221 * If this string is not already all lower case, returns a new string

122 * where all characters are made lower case. Returns [:this:] otherwise.	222 * where all characters are made lower case. Returns [:this:] otherwise.

123 */	223 */

124 String toLowerCase();	224 String toLowerCase();

125	225

126 /**	226 /**

127 * If this string is not already all uper case, returns a new string	227 * If this string is not already all uper case, returns a new string

128 * where all characters are made upper case. Returns [:this:] otherwise.	228 * where all characters are made upper case. Returns [:this:] otherwise.

129 */	229 */

130 String toUpperCase();	230 String toUpperCase();

131 }	231 }

OLD	NEW

« runtime/vm/object.cc ('K') | « sdk/lib/_internal/compiler/implementation/util/util.dart ('k') | sdk/lib/io/string_stream.dart » ('j') | no next file with comments »