OLD | NEW |
---|---|
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 /** | 5 /** |
6 * The String class represents character strings. Strings are | 6 * The String class represents character strings. Strings are |
7 * immutable. A string is represented by a list of 32-bit Unicode | 7 * immutable. A string is represented by a list of 16-bit UTF-16 |
8 * scalar character codes accessible through the [charCodeAt] or the | 8 * code units accessible through the [codeUnitAt] or the [codeUnits] |
9 * [charCodes] method. | 9 * methods. The corresponding Unicode code points are available with |
10 * [charCodeAt] or the [charCodes] method. | |
10 */ | 11 */ |
11 abstract class String implements Comparable, Pattern, Sequence<String> { | 12 abstract class String implements Comparable, Pattern, Sequence<String> { |
12 /** | 13 // Unicode does not allow for code points above this limit. |
13 * Allocates a new String for the specified [charCodes]. | 14 static const int MAX_CODE_POINT = 0x10ffff; |
14 */ | 15 // A Dart string is represented by UTF-16 code units which must be <= 0xffff. |
15 external factory String.fromCharCodes(List<int> charCodes); | 16 static const int MAX_CODE_UNIT = 0xffff; |
17 // Unicode does not allow for code points in this range. | |
18 static const int UNICODE_RESERVED_AREA_START = 0xd800; | |
19 static const int UNICODE_RESERVED_AREA_END = 0xdfff; | |
20 // Unicode code points above this limit are coded as two code units in Dart's | |
21 // UTF-16 string. | |
22 static const int SMP_CODE_POINT_BASE = 0x10000; | |
16 | 23 |
17 /** | 24 /** |
18 * Gets the character (as [String]) at the given [index]. | 25 * Allocates a new String for the specified 21-bit Unicode code points in [charCodes]. |
26 * Throws an ArgumentError if any of the [charCodes] are not ints between 0 and | |
27 * [MAX_CODE_POINT]. Also throws an ArgumentError if any of the code points | |
28 * are in the area reserved for UTF-16 surrogate pairs. | |
29 */ | |
30 factory String.fromCharCodes(List<int> charCodes) { | |
siva
2012/11/16 22:32:04
Why not call this createFromUtf32?
erikcorry
2012/11/19 12:40:41
The exact name of this is to be determined by floitsch.
| |
31 int pairs = 0; | |
32 // There is some duplication of constants here relative to the ones in | |
33 // lib/utf/utf16.dart because we don't want core to depend on the utf | |
34 // library. | |
35 const int MASK = 0x3ff; | |
36 const int LEAD_SURROGATE_BASE = UNICODE_RESERVED_AREA_START; | |
37 const int TRAIL_SURROGATE_BASE = 0xdc00; | |
38 for (var code in charCodes) { | |
39 if (code is !int || code < 0) throw new ArgumentError(charCodes); | |
40 if (code >= UNICODE_RESERVED_AREA_START) { | |
41 if (code > MAX_CODE_UNIT) { | |
42 pairs++; | |
43 } | |
44 if (code <= UNICODE_RESERVED_AREA_END || code > MAX_CODE_POINT) { | |
45 // No surrogates or out-of-range code points allowed in the input. | |
46 throw new ArgumentError(charCodes); | |
47 } | |
48 } | |
49 } | |
50 // Fast case - there are no surrogate pairs. | |
51 if (pairs == 0) return new String.fromCodeUnits(charCodes); | |
52 var codeUnits = new List<int>(pairs + charCodes.length); | |
53 int j = 0; | |
54 for (int code in charCodes) { | |
55 if (code >= SMP_CODE_POINT_BASE) { | |
56 codeUnits[j++] = | |
57 LEAD_SURROGATE_BASE + (((code - SMP_CODE_POINT_BASE) >> 10) & MASK); | |
58 codeUnits[j++] = TRAIL_SURROGATE_BASE + (code & MASK); | |
59 } else { | |
60 codeUnits[j++] = code; | |
61 } | |
62 } | |
63 return new String.fromCodeUnits(codeUnits); | |
64 } | |
65 | |
66 /** | |
67 * Allocates a new String for the specified 16 bit UTF-16 [codeUnits]. | |
68 */ | |
69 external factory String.fromCodeUnits(List<int> codeUnits); | |
siva
2012/11/16 22:32:04
Should be called createFromUtf16 to avoid confusion…
erikcorry
2012/11/19 12:40:41
As above, up to floitsch.
| |
70 | |
71 /** | |
72 * Gets the Unicode character (as [String]) at the given [index]. This | |
73 * routine can return a single combining character (accent) that would | |
74 * normally be displayed together with the character it is modifying. | |
75 * If the index corresponds to a surrogate code unit then a one-code-unit | |
76 * string is returned containing that unpaired surrogate code unit. | |
19 */ | 77 */ |
20 String operator [](int index); | 78 String operator [](int index); |
21 | 79 |
22 /** | 80 /** |
23 * Gets the scalar character code at the given [index]. | 81 * Gets the 21 bit Unicode code point at the given [index]. If the code units |
82 * at index and index + 1 form a valid surrogate pair then this function | |
83 * returns the non-basic plane code point that they represent. If the code | |
84 * unit at index is a trailing surrogate or a leading surrogate that is not | |
85 * followed by a trailing surrogate then the raw code unit is returned. | |
24 */ | 86 */ |
25 int charCodeAt(int index); | 87 int charCodeAt(int index); |
26 | 88 |
27 /** | 89 /** |
28 * The length of the string. | 90 * Gets the 16 bit UTF-16 code unit at the given index. |
91 */ | |
92 int codeUnitAt(int index); | |
93 | |
94 | |
95 /** | |
96 * The length of the string, measured in UTF-16 code units. | |
29 */ | 97 */ |
30 int get length; | 98 int get length; |
31 | 99 |
32 /** | 100 /** |
33 * Returns whether the two strings are equal. This method compares | 101 * Returns whether the two strings are equal. This method compares |
34 * each individual scalar character codes of the strings. | 102 * each individual UTF-16 code unit. No Unicode normalization is |
103 * performed (accent composition/decomposition). | |
35 */ | 104 */ |
36 bool operator ==(String other); | 105 bool operator ==(String other); |
37 | 106 |
38 /** | 107 /** |
39 * Returns whether this string ends with [other]. | 108 * Returns whether this string ends with [other]. |
40 */ | 109 */ |
41 bool endsWith(String other); | 110 bool endsWith(String other); |
42 | 111 |
43 /** | 112 /** |
44 * Returns whether this string starts with [other]. | 113 * Returns whether this string starts with [other]. |
(...skipping 24 matching lines...) Expand all Loading... | |
69 */ | 138 */ |
70 String concat(String other); | 139 String concat(String other); |
71 | 140 |
72 /** | 141 /** |
73 * Returns a substring of this string in the given range. | 142 * Returns a substring of this string in the given range. |
74 * [startIndex] is inclusive and [endIndex] is exclusive. | 143 * [startIndex] is inclusive and [endIndex] is exclusive. |
75 */ | 144 */ |
76 String substring(int startIndex, [int endIndex]); | 145 String substring(int startIndex, [int endIndex]); |
77 | 146 |
78 /** | 147 /** |
79 * Removes leading and trailing whitespace from a string. If the | 148 * Removes leading and trailing whitespace from a string. If the string |
80 * string contains leading or trailing whitespace a new string with | 149 * contains leading or trailing whitespace a new string with no leading and |
81 * no leading and no trailing whitespace is returned. Otherwise, the | 150 * no trailing whitespace is returned. Otherwise, the string itself is |
82 * string itself is returned. | 151 * returned. Whitespace is defined as every Unicode character in the Zs, Zl |
152 * and Zp categories (this includes no-break space), the spacing control | |
153 * characters from 9 to 13 (tab, lf, vtab, ff and cr), and 0xfeff, the BOM | |
154 * character. | |
83 */ | 155 */ |
84 String trim(); | 156 String trim(); |
85 | 157 |
86 /** | 158 /** |
87 * Returns whether this string contains [other] starting | 159 * Returns whether this string contains [other] starting |
88 * at [startIndex] (inclusive). | 160 * at [startIndex] (inclusive). |
89 */ | 161 */ |
90 bool contains(Pattern other, [int startIndex]); | 162 bool contains(Pattern other, [int startIndex]); |
91 | 163 |
92 /** | 164 /** |
93 * Returns a new string where the first occurrence of [from] in this string | 165 * Returns a new string where the first occurrence of [from] in this string |
94 * is replaced with [to]. | 166 * is replaced with [to]. |
95 */ | 167 */ |
96 String replaceFirst(Pattern from, String to); | 168 String replaceFirst(Pattern from, String to); |
97 | 169 |
98 /** | 170 /** |
99 * Returns a new string where all occurrences of [from] in this string | 171 * Returns a new string where all occurrences of [from] in this string |
100 * are replaced with [to]. | 172 * are replaced with [to]. |
101 */ | 173 */ |
102 String replaceAll(Pattern from, String to); | 174 String replaceAll(Pattern from, String to); |
103 | 175 |
104 /** | 176 /** |
105 * Splits the string around matches of [pattern]. Returns | 177 * Splits the string around matches of [pattern]. Returns |
106 * a list of substrings. | 178 * a list of substrings. |
107 */ | 179 */ |
108 List<String> split(Pattern pattern); | 180 List<String> split(Pattern pattern); |
109 | 181 |
110 /** | 182 /** |
111 * Returns a list of the characters of this string. | 183 * Returns a list of the characters of this string. No string normalization |
184 * is performed so unprecomposed combining characters (accents) may be found | |
185 * in the list. Valid surrogate pairs are returned as one string. | |
112 */ | 186 */ |
113 List<String> splitChars(); | 187 List<String> splitChars(); |
114 | 188 |
115 /** | 189 /** |
116 * Returns a list of the scalar character codes of this string. | 190 * Returns a list of the 21 bit Unicode code points of this string. |
117 */ | 191 */ |
118 List<int> get charCodes; | 192 List<int> get charCodes { |
siva
2012/11/16 22:32:04
TODO use codepoint iterator instead of charCodeAt
erikcorry
2012/11/19 12:40:41
This method was a dupe. Removed.
| |
193 int len = this.length; | |
194 final result = new List<int>(len); | |
195 int i, j; | |
196 for (i = j = 0; i < len; i++, j++) { | |
197 int c = this.charCodeAt(i); | |
198 // Check for non-basic plane character encoded as a UTF-16 surrogate pair. | |
199 if (c > 0xffff) { | |
200 i++; | |
201 } | |
202 result[j] = c; | |
203 } | |
204 if (i == j) return result; | |
205 // If we saw some non-basic plane characters, then we have to return a | |
206 // slightly smaller array than expected (we can't trim the original one | |
207 // because it is non-extendable). This rarely happens so this is preferable | |
208 // to having a separate pass over the string to count the code points. | |
209 final newResult = new List<int>(j); | |
210 for (i = 0; i < j; i++) newResult[i] = result[i]; | |
211 return newResult; | |
siva
2012/11/16 22:32:04
This piece of code was repeated earlier too, maybe
erikcorry
2012/11/19 12:40:41
Ditto.
| |
212 } | |
213 | |
214 | |
215 /** | |
216 * Returns a list of the 16 bit UTF-16 code units of this string. | |
217 */ | |
218 List<int> get codeUnits; | |
119 | 219 |
120 /** | 220 /** |
121 * If this string is not already all lower case, returns a new string | 221 * If this string is not already all lower case, returns a new string |
122 * where all characters are made lower case. Returns [:this:] otherwise. | 222 * where all characters are made lower case. Returns [:this:] otherwise. |
123 */ | 223 */ |
124 String toLowerCase(); | 224 String toLowerCase(); |
125 | 225 |
126 /** | 226 /** |
127 * If this string is not already all upper case, returns a new string | 227 * If this string is not already all upper case, returns a new string |
128 * where all characters are made upper case. Returns [:this:] otherwise. | 228 * where all characters are made upper case. Returns [:this:] otherwise. |
129 */ | 229 */ |
130 String toUpperCase(); | 230 String toUpperCase(); |
131 } | 231 } |
OLD | NEW |