Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: sdk/lib/core/string.dart

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Implemented feedback from patch set 2. Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 /** 5 /**
6 * The String class represents character strings. Strings are 6 * The String class represents character strings. Strings are
7 * immutable. A string is represented by a list of 32-bit Unicode 7 * immutable. A string is represented by a list of 16-bit UTF-16
8 * scalar character codes accessible through the [charCodeAt] or the 8 * code units accessible through the [codeUnitAt] or the [codeUnits]
9 * [charCodes] method. 9 * methods. The corresponding Unicode code points are available with
10 * [charCodeAt] or the [charCodes] method.
10 */ 11 */
11 abstract class String implements Comparable, Pattern, Sequence<String> { 12 abstract class String implements Comparable, Pattern, Sequence<String> {
12 /** 13 // Unicode does not allow for code points above this limit.
13 * Allocates a new String for the specified [charCodes]. 14 static const int MAX_CODE_POINT = 0x10ffff;
14 */ 15 // A Dart string is represented by UTF-16 code units which must be <= 0xffff.
15 external factory String.fromCharCodes(List<int> charCodes); 16 static const int MAX_CODE_UNIT = 0xffff;
17 // Unicode does not allow for code points in this range.
18 static const int UNICODE_RESERVED_AREA_START = 0xd800;
19 static const int UNICODE_RESERVED_AREA_END = 0xdfff;
20 // Unicode code points above this limit are coded as two code units in Dart's
21 // UTF-16 string.
22 static const int SMP_CODE_POINT_BASE = 0x10000;
16 23
17 /** 24 /**
18 * Gets the character (as [String]) at the given [index]. 25 * Allocates a new String for the specified 21 bit Unicode code points in [charCodes].
26 * Throws an ArgumentError if any of the code points are not ints between 0 and
27 * MAX_CODE_POINT. Also throws an ArgumentError if any of the code points
28 * are in the area reserved for UTF-16 surrogate pairs.
29 */
30 factory String.fromCharCodes(List<int> charCodes) {
siva 2012/11/16 22:32:04 why not call this createFromUtf32
erikcorry 2012/11/19 12:40:41 The exact name of this is to be determined by floi
31 int pairs = 0;
32 // There is some duplication of constants here relative to the ones in
33 // lib/utf/utf16.dart because we don't want core to depend on the utf
34 // library.
35 const int MASK = 0x3ff;
36 const int LEAD_SURROGATE_BASE = UNICODE_RESERVED_AREA_START;
37 const int TRAIL_SURROGATE_BASE = 0xdc00;
38 for (var code in charCodes) {
39 if (code is !int || code < 0) throw new ArgumentError(charCodes);
40 if (code >= UNICODE_RESERVED_AREA_START) {
41 if (code > MAX_CODE_UNIT) {
42 pairs++;
43 }
44 if (code <= UNICODE_RESERVED_AREA_END || code > MAX_CODE_POINT) {
45 // No surrogates or out-of-range code points allowed in the input.
46 throw new ArgumentError(charCodes);
47 }
48 }
49 }
50 // Fast case - there are no surrogate pairs.
51 if (pairs == 0) return new String.fromCodeUnits(charCodes);
52 var codeUnits = new List<int>(pairs + charCodes.length);
53 int j = 0;
54 for (int code in charCodes) {
55 if (code >= SMP_CODE_POINT_BASE) {
56 codeUnits[j++] =
57 LEAD_SURROGATE_BASE + (((code - SMP_CODE_POINT_BASE) >> 10) & MASK);
58 codeUnits[j++] = TRAIL_SURROGATE_BASE + (code & MASK);
59 } else {
60 codeUnits[j++] = code;
61 }
62 }
63 return new String.fromCodeUnits(codeUnits);
64 }
65
66 /**
67 * Allocates a new String for the specified 16 bit UTF-16 [codeUnits].
68 */
69 external factory String.fromCodeUnits(List<int> codeUnits);
siva 2012/11/16 22:32:04 Should be called createFromUtf16 to avoid confusi
erikcorry 2012/11/19 12:40:41 As above, up to floitsch.
70
71 /**
72 * Gets the Unicode character (as [String]) at the given [index]. This
73 * routine can return a single combining character (accent) that would
74 * normally be displayed together with the character it is modifying.
75 * If the index corresponds to a surrogate code unit then a one-code-unit
76 * string is returned containing that unpaired surrogate code unit.
19 */ 77 */
20 String operator [](int index); 78 String operator [](int index);
21 79
22 /** 80 /**
23 * Gets the scalar character code at the given [index]. 81 * Gets the 21 bit Unicode code point at the given [index]. If the code units
82 * at index and index + 1 form a valid surrogate pair then this function
83 * returns the non-basic plane code point that they represent. If the code
84 * unit at index is a trailing surrogate or a leading surrogate that is not
85 * followed by a trailing surrogate then the raw code unit is returned.
24 */ 86 */
25 int charCodeAt(int index); 87 int charCodeAt(int index);
26 88
27 /** 89 /**
28 * The length of the string. 90 * Gets the 16 bit UTF-16 code unit at the given index.
91 */
92 int codeUnitAt(int index);
93
94
95 /**
96 * The length of the string, measured in UTF-16 code units.
29 */ 97 */
30 int get length; 98 int get length;
31 99
32 /** 100 /**
33 * Returns whether the two strings are equal. This method compares 101 * Returns whether the two strings are equal. This method compares
34 * each individual scalar character codes of the strings. 102 * each individual UTF-16 code unit. No Unicode normalization is
103 * performed (accent composition/decomposition).
35 */ 104 */
36 bool operator ==(String other); 105 bool operator ==(String other);
37 106
38 /** 107 /**
39 * Returns whether this string ends with [other]. 108 * Returns whether this string ends with [other].
40 */ 109 */
41 bool endsWith(String other); 110 bool endsWith(String other);
42 111
43 /** 112 /**
44 * Returns whether this string starts with [other]. 113 * Returns whether this string starts with [other].
(...skipping 24 matching lines...) Expand all
69 */ 138 */
70 String concat(String other); 139 String concat(String other);
71 140
72 /** 141 /**
73 * Returns a substring of this string in the given range. 142 * Returns a substring of this string in the given range.
74 * [startIndex] is inclusive and [endIndex] is exclusive. 143 * [startIndex] is inclusive and [endIndex] is exclusive.
75 */ 144 */
76 String substring(int startIndex, [int endIndex]); 145 String substring(int startIndex, [int endIndex]);
77 146
78 /** 147 /**
79 * Removes leading and trailing whitespace from a string. If the 148 * Removes leading and trailing whitespace from a string. If the string
80 * string contains leading or trailing whitespace a new string with 149 * contains leading or trailing whitespace a new string with no leading and
81 * no leading and no trailing whitespace is returned. Otherwise, the 150 * no trailing whitespace is returned. Otherwise, the string itself is
82 * string itself is returned. 151 * returned. Whitespace is defined as every Unicode character in the Zs, Zl
152 * and Zp categories (this includes no-break space), the spacing control
153 * characters from 9 to 13 (tab, lf, vtab, ff and cr), and 0xfeff the BOM
154 * character.
83 */ 155 */
84 String trim(); 156 String trim();
85 157
86 /** 158 /**
87 * Returns whether this string contains [other] starting 159 * Returns whether this string contains [other] starting
88 * at [startIndex] (inclusive). 160 * at [startIndex] (inclusive).
89 */ 161 */
90 bool contains(Pattern other, [int startIndex]); 162 bool contains(Pattern other, [int startIndex]);
91 163
92 /** 164 /**
93 * Returns a new string where the first occurrence of [from] in this string 165 * Returns a new string where the first occurrence of [from] in this string
94 * is replaced with [to]. 166 * is replaced with [to].
95 */ 167 */
96 String replaceFirst(Pattern from, String to); 168 String replaceFirst(Pattern from, String to);
97 169
98 /** 170 /**
99 * Returns a new string where all occurrences of [from] in this string 171 * Returns a new string where all occurrences of [from] in this string
100 * are replaced with [to]. 172 * are replaced with [to].
101 */ 173 */
102 String replaceAll(Pattern from, String to); 174 String replaceAll(Pattern from, String to);
103 175
104 /** 176 /**
105 * Splits the string around matches of [pattern]. Returns 177 * Splits the string around matches of [pattern]. Returns
106 * a list of substrings. 178 * a list of substrings.
107 */ 179 */
108 List<String> split(Pattern pattern); 180 List<String> split(Pattern pattern);
109 181
110 /** 182 /**
111 * Returns a list of the characters of this string. 183 * Returns a list of the characters of this string. No string normalization
184 * is performed so unprecomposed combining characters (accents) may be found
185 * in the list. Valid surrogate pairs are returned as one string.
112 */ 186 */
113 List<String> splitChars(); 187 List<String> splitChars();
114 188
115 /** 189 /**
116 * Returns a list of the scalar character codes of this string. 190 * Returns a list of the 21 bit Unicode code points of this string.
117 */ 191 */
118 List<int> get charCodes; 192 List<int> get charCodes {
siva 2012/11/16 22:32:04 TODO use codepoint iterator instead of charCodeAt
erikcorry 2012/11/19 12:40:41 This method was a dupe. Removed.
193 int len = this.length;
194 final result = new List<int>(len);
195 int i, j;
196 for (i = j = 0; i < len; i++, j++) {
197 int c = this.charCodeAt(i);
198 // Check for non-basic plane character encoded as a UTF-16 surrogate pair.
199 if (c > 0xffff) {
200 i++;
201 }
202 result[j] = c;
203 }
204 if (i == j) return result;
205 // If we saw some non-basic plane characters, then we have to return a
206 // slightly smaller array than expected (we can't trim the original one
207 // because it is non-extendable). This rarely happens so this is preferable
208 // to having a separate pass over the string to count the code points.
209 final newResult = new List<int>(j);
210 for (i = 0; i < j; i++) newResult[i] = result[i];
211 return newResult;
siva 2012/11/16 22:32:04 This piece of code was repeated earlier too, maybe
erikcorry 2012/11/19 12:40:41 Ditto.
212 }
213
214
215 /**
216 * Returns a list of the 16 bit UTF-16 code units of this string.
217 */
218 List<int> get codeUnits;
119 219
120 /** 220 /**
121 * If this string is not already all lower case, returns a new string 221 * If this string is not already all lower case, returns a new string
122 * where all characters are made lower case. Returns [:this:] otherwise. 222 * where all characters are made lower case. Returns [:this:] otherwise.
123 */ 223 */
124 String toLowerCase(); 224 String toLowerCase();
125 225
126 /** 226 /**
127 * If this string is not already all upper case, returns a new string 227 * If this string is not already all upper case, returns a new string
128 * where all characters are made upper case. Returns [:this:] otherwise. 228 * where all characters are made upper case. Returns [:this:] otherwise.
129 */ 229 */
130 String toUpperCase(); 230 String toUpperCase();
131 } 231 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698