Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(29)

Side by Side Diff: runtime/lib/string_base.dart

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 /** 5 /**
6 * [_StringBase] contains common methods used by concrete String 6 * [_StringBase] contains common methods used by concrete String
7 * implementations, e.g., _OneByteString. 7 * implementations, e.g., _OneByteString.
8 */ 8 */
9 class _StringBase { 9 class _StringBase {
10 10
11 factory _StringBase._uninstantiable() { 11 factory _StringBase._uninstantiable() {
12 throw new UnsupportedError( 12 throw new UnsupportedError(
13 "_StringBase can't be instaniated"); 13 "_StringBase can't be instaniated");
14 } 14 }
15 15
16 int get hashCode native "String_getHashCode"; 16 int get hashCode native "String_getHashCode";
17 17
18 /** 18 /**
19 * Create the most efficient string representation for specified 19 * Create the most efficient string representation for specified
20 * [codePoints]. 20 * [codePoints].
21 */ 21 */
22 static String createFromCharCodes(List<int> charCodes) { 22 static String createFromCharCodes(List<int> codePoints) {
23 _ObjectArray objectArray; 23 _ObjectArray objectArray;
24 if (charCodes is _ObjectArray) { 24 if (codePoints is _ObjectArray) {
25 objectArray = charCodes; 25 objectArray = codePoints;
26 } else { 26 } else {
27 int len = charCodes.length; 27 int len = codePoints.length;
28 objectArray = new _ObjectArray(len); 28 objectArray = new _ObjectArray(len);
29 for (int i = 0; i < len; i++) { 29 for (int i = 0; i < len; i++) {
30 objectArray[i] = charCodes[i]; 30 objectArray[i] = codePoints[i];
31 } 31 }
32 } 32 }
33 return _createFromCodePoints(objectArray); 33 return _createFromCodePoints(objectArray);
34 } 34 }
35 35
36 static String _createFromCodePoints(List<int> codePoints) 36 static String _createFromCodePoints(List<int> codePoints)
37 native "StringBase_createFromCodePoints"; 37 native "StringBase_createFromCodePoints";
38 38
39 static String createFromCodeUnits(List<int> codeUnits) {
40 _ObjectArray objectArray;
41 if (codeUnits is _ObjectArray) {
42 objectArray = codeUnits;
43 } else {
44 int len = codeUnits.length;
45 objectArray = new _ObjectArray(len);
46 for (int i = 0; i < len; i++) {
47 objectArray[i] = codeUnits[i];
48 }
49 }
50 return _createFromCodeUnits(objectArray);
51 }
52
53 static String _createFromCodeUnits(List<int> codeUnits)
54 native "StringBase_createFromCodeUnits";
55
39 String operator [](int index) native "String_charAt"; 56 String operator [](int index) native "String_charAt";
40 57
41 int charCodeAt(int index) native "String_charCodeAt"; 58 int charCodeAt(int index) native "String_charCodeAt";
42 59
60 int codeUnitAt(int index) native "String_codeUnitAt";
61
43 int get length native "String_getLength"; 62 int get length native "String_getLength";
44 63
45 bool get isEmpty { 64 bool get isEmpty {
46 return this.length === 0; 65 return this.length === 0;
47 } 66 }
48 67
49 String concat(String other) native "String_concat"; 68 String concat(String other) native "String_concat";
50 69
51 String toString() { 70 String toString() {
52 return this; 71 return this;
53 } 72 }
54 73
55 bool operator ==(Object other) { 74 bool operator ==(Object other) {
56 if (this === other) { 75 if (this === other) {
57 return true; 76 return true;
58 } 77 }
59 if ((other is !String) || 78 if ((other is !String) ||
60 (this.length != other.length)) { 79 (this.length != other.length)) {
61 // TODO(5413632): Compare hash codes when both are present. 80 // TODO(5413632): Compare hash codes when both are present.
62 return false; 81 return false;
63 } 82 }
64 return this.compareTo(other) === 0; 83 return this.compareTo(other) === 0;
65 } 84 }
66 85
67 int compareTo(String other) { 86 int compareTo(String other) {
68 int thisLength = this.length; 87 int thisLength = this.length;
69 int otherLength = other.length; 88 int otherLength = other.length;
70 int len = (thisLength < otherLength) ? thisLength : otherLength; 89 int len = (thisLength < otherLength) ? thisLength : otherLength;
71 for (int i = 0; i < len; i++) { 90 for (int i = 0; i < len; i++) {
72 int thisCodePoint = this.charCodeAt(i); 91 int thisCodeUnit = this.codeUnitAt(i);
73 int otherCodePoint = other.charCodeAt(i); 92 int otherCodeUnit = other.codeUnitAt(i);
74 if (thisCodePoint < otherCodePoint) { 93 if (thisCodeUnit < otherCodeUnit) {
75 return -1; 94 return -1;
76 } 95 }
77 if (thisCodePoint > otherCodePoint) { 96 if (thisCodeUnit > otherCodeUnit) {
78 return 1; 97 return 1;
79 } 98 }
80 } 99 }
81 if (thisLength < otherLength) return -1; 100 if (thisLength < otherLength) return -1;
82 if (thisLength > otherLength) return 1; 101 if (thisLength > otherLength) return 1;
83 return 0; 102 return 0;
84 } 103 }
85 104
86 bool _substringMatches(int start, String other) { 105 bool _substringMatches(int start, String other) {
87 if (other.isEmpty) return true; 106 if (other.isEmpty) return true;
88 if ((start < 0) || (start >= this.length)) { 107 if ((start < 0) || (start >= this.length)) {
89 return false; 108 return false;
90 } 109 }
91 final int len = other.length; 110 final int len = other.length;
92 if ((start + len) > this.length) { 111 if ((start + len) > this.length) {
93 return false; 112 return false;
94 } 113 }
95 for (int i = 0; i < len; i++) { 114 for (int i = 0; i < len; i++) {
96 if (this.charCodeAt(i + start) != other.charCodeAt(i)) { 115 if (this.codeUnitAt(i + start) != other.codeUnitAt(i)) {
97 return false; 116 return false;
98 } 117 }
99 } 118 }
100 return true; 119 return true;
101 } 120 }
102 121
103 bool endsWith(String other) { 122 bool endsWith(String other) {
104 return _substringMatches(this.length - other.length, other); 123 return _substringMatches(this.length - other.length, other);
105 } 124 }
106 125
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
155 return _substringUnchecked(startIndex, endIndex); 174 return _substringUnchecked(startIndex, endIndex);
156 } 175 }
157 176
158 String _substringUnchecked(int startIndex, int endIndex) 177 String _substringUnchecked(int startIndex, int endIndex)
159 native "StringBase_substringUnchecked"; 178 native "StringBase_substringUnchecked";
160 179
161 String trim() { 180 String trim() {
162 final int len = this.length; 181 final int len = this.length;
163 int first = 0; 182 int first = 0;
164 for (; first < len; first++) { 183 for (; first < len; first++) {
165 if (!_isWhitespace(this.charCodeAt(first))) { 184 // There are no whitespace characters that are outside the BMP so we
185 // can use code units here for efficiency.
186 if (!_isWhitespace(this.codeUnitAt(first))) {
166 break; 187 break;
167 } 188 }
168 } 189 }
169 if (len == first) { 190 if (len == first) {
170 // String contains only whitespaces. 191 // String contains only whitespaces.
171 return ""; 192 return "";
172 } 193 }
173 int last = len - 1; 194 int last = len - 1;
174 for (; last >= first; last--) { 195 for (; last >= first; last--) {
175 if (!_isWhitespace(this.charCodeAt(last))) { 196 if (!_isWhitespace(this.codeUnitAt(last))) {
176 break; 197 break;
177 } 198 }
178 } 199 }
179 if ((first == 0) && (last == (len - 1))) { 200 if ((first == 0) && (last == (len - 1))) {
180 // Returns this string if it does not have leading or trailing 201 // Returns this string if it does not have leading or trailing
181 // whitespaces. 202 // whitespaces.
182 return this; 203 return this;
183 } else { 204 } else {
184 return _substringUnchecked(first, last + 1); 205 return _substringUnchecked(first, last + 1);
185 } 206 }
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after
286 if (startIndex == endIndex && endIndex == previousIndex) { 307 if (startIndex == endIndex && endIndex == previousIndex) {
287 ++startIndex; // empty match, advance and restart 308 ++startIndex; // empty match, advance and restart
288 continue; 309 continue;
289 } 310 }
290 result.add(this.substring(previousIndex, match.start)); 311 result.add(this.substring(previousIndex, match.start));
291 startIndex = previousIndex = endIndex; 312 startIndex = previousIndex = endIndex;
292 } 313 }
293 return result; 314 return result;
294 } 315 }
295 316
317 // TODO(erikcorry): Fix this to use the new code point iterator when it is
318 // available.
296 List<String> splitChars() { 319 List<String> splitChars() {
297 int len = this.length; 320 int len = this.length;
298 final result = new List<String>(len); 321 final result = new List<String>(len);
299 for (int i = 0; i < len; i++) { 322 int i, j;
300 result[i] = this[i]; 323 for (i = j = 0; i < len; i++, j++) {
324 int c = charCodeAt(i);
325 // Check for non-basic plane character encoded as a UTF-16 surrogate pair.
326 if (c > 0xffff) {
floitsch 2012/11/08 15:28:21 Can't you use Utf16::IsSurrogate(c)?
erikcorry 2012/11/15 13:28:25 No, that's a C++ function. I added some named con
327 i++;
328 }
329 result[j] = new String.fromCharCodes([c]);
301 } 330 }
302 return result; 331 if (i == j) return result;
332 // If we saw some non-basic plane characters, then we have to return a
333 // slightly smaller array than expected (we can't trim the original one
334 // because it is non-extendable). This rarely happens so this is preferable
335 // to having a separate pass over the string to count the code points.
336 final newResult = new List<String>(j);
337 for (i = 0; i < j; i++) newResult[i] = result[i];
338 return newResult;
303 } 339 }
304 340
305 List<int> get charCodes { 341 List<int> get charCodes {
306 int len = this.length; 342 int len = this.length;
307 final result = new List<int>(len); 343 final result = new List<int>(len);
344 int i, j;
345 for (i = j = 0; i < len; i++, j++) {
346 int c = this.charCodeAt(i);
347 // Check for non-basic plane character encoded as a UTF-16 surrogate pair.
348 if (c > 0xffff) {
349 i++;
350 }
351 result[j] = c;
352 }
353 if (i == j) return result;
354 // If we saw some non-basic plane characters, then we have to return a
355 // slightly smaller array than expected (we can't trim the original one
356 // because it is non-extendable). This rarely happens so this is preferable
357 // to having a separate pass over the string to count the code points.
358 final newResult = new List<int>(j);
359 for (i = 0; i < j; i++) newResult[i] = result[i];
360 return newResult;
361 }
362
363 List<int> get codeUnits {
364 int len = this.length;
365 final result = new List<int>(len);
308 for (int i = 0; i < len; i++) { 366 for (int i = 0; i < len; i++) {
309 result[i] = this.charCodeAt(i); 367 result[i] = this.codeUnitAt(i);
310 } 368 }
311 return result; 369 return result;
312 } 370 }
313 371
314 String toUpperCase() native "String_toUpperCase"; 372 String toUpperCase() native "String_toUpperCase";
315 373
316 String toLowerCase() native "String_toLowerCase"; 374 String toLowerCase() native "String_toLowerCase";
317 375
318 // Implementations of Strings methods follow below. 376 // Implementations of Strings methods follow below.
319 static String join(List<String> strings, String separator) { 377 static String join(List<String> strings, String separator) {
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
353 native "Strings_concatAll"; 411 native "Strings_concatAll";
354 } 412 }
355 413
356 414
357 class _OneByteString extends _StringBase implements String { 415 class _OneByteString extends _StringBase implements String {
358 factory _OneByteString._uninstantiable() { 416 factory _OneByteString._uninstantiable() {
359 throw new UnsupportedError( 417 throw new UnsupportedError(
360 "_OneByteString can only be allocated by the VM"); 418 "_OneByteString can only be allocated by the VM");
361 } 419 }
362 420
363 // Checks for one-byte whitespaces only.
364 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
365 // whitespaces for one byte strings.
366 bool _isWhitespace(int codePoint) { 421 bool _isWhitespace(int codePoint) {
367 return 422 return
368 (codePoint === 32) || // Space. 423 (codePoint == 32) || // Space.
424 (codePoint == 0xa0) || // No-break space.
369 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. 425 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
370 } 426 }
371 427
372 } 428 }
373 429
374 430
375 class _TwoByteString extends _StringBase implements String { 431 class _TwoByteString extends _StringBase implements String {
376 factory _TwoByteString._uninstantiable() { 432 factory _TwoByteString._uninstantiable() {
377 throw new UnsupportedError( 433 throw new UnsupportedError(
378 "_TwoByteString can only be allocated by the VM"); 434 "_TwoByteString can only be allocated by the VM");
379 } 435 }
380 436
381 // Checks for one-byte whitespaces only. 437 // Works for both code points and code units since all spaces are in the BMP.
382 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
383 // whitespaces. Add checking for multi-byte whitespace codepoints.
384 bool _isWhitespace(int codePoint) { 438 bool _isWhitespace(int codePoint) {
385 return 439 return
386 (codePoint === 32) || // Space. 440 (codePoint == 32) || // Space.
387 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. 441 (codePoint == 0xa0) || // No-break space.
442 ((9 <= codePoint) && (codePoint <= 13)) || // CR, LF, TAB, etc.
443 (codePoint >= 0x1680 && // Optimization.
444 (codePoint == 0x1680 || // Ogham space mark.
445 codePoint == 0x180e || // Mongolian vowel separator.
446 (codePoint >= 0x2000 && codePoint <= 0x200a) || // Wide/narrow spaces.
447 codePoint == 0x202f || // Narrow no-break space.
448 codePoint == 0x205f || // Medium mathematical space.
449 codePoint == 0x3000)); // Ideographic space.
388 } 450 }
389 } 451 }
390 452
391 453
454 // TODO(erikcorry): This is going away.
392 class _FourByteString extends _StringBase implements String { 455 class _FourByteString extends _StringBase implements String {
393 factory _FourByteString._uninstantiable() { 456 factory _FourByteString._uninstantiable() {
394 throw new UnsupportedError( 457 throw new UnsupportedError(
395 "_FourByteString can only be allocated by the VM"); 458 "_FourByteString can only be allocated by the VM");
396 } 459 }
397 460
398 // Checks for one-byte whitespaces only. 461 // Checks for one-byte whitespaces only.
399 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid 462 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
400 // whitespaces. Add checking for multi-byte whitespace codepoints. 463 // whitespaces. Add checking for multi-byte whitespace codepoints.
401 bool _isWhitespace(int codePoint) { 464 bool _isWhitespace(int codePoint) {
402 return 465 return
403 (codePoint === 32) || // Space. 466 (codePoint === 32) || // Space.
404 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. 467 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
405 } 468 }
406 } 469 }
407 470
408 471
409 class _ExternalOneByteString extends _StringBase implements String { 472 class _ExternalOneByteString extends _StringBase implements String {
410 factory _ExternalOneByteString._uninstantiable() { 473 factory _ExternalOneByteString._uninstantiable() {
411 throw new UnsupportedError( 474 throw new UnsupportedError(
412 "_ExternalOneByteString can only be allocated by the VM"); 475 "_ExternalOneByteString can only be allocated by the VM");
413 } 476 }
414 477
415 // Checks for one-byte whitespaces only.
416 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
417 // whitespaces for one byte strings.
418 bool _isWhitespace(int codePoint) { 478 bool _isWhitespace(int codePoint) {
419 return 479 return
420 (codePoint === 32) || // Space. 480 (codePoint == 32) || // Space.
481 (codePoint == 0xa0) || // No-break space.
421 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. 482 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
422 } 483 }
423 } 484 }
424 485
425 486
426 class _ExternalTwoByteString extends _StringBase implements String { 487 class _ExternalTwoByteString extends _StringBase implements String {
427 factory ExternalTwoByteString._uninstantiable() { 488 factory ExternalTwoByteString._uninstantiable() {
428 throw new UnsupportedError( 489 throw new UnsupportedError(
429 "_ExternalTwoByteString can only be allocated by the VM"); 490 "_ExternalTwoByteString can only be allocated by the VM");
430 } 491 }
431 492
432 // Checks for one-byte whitespaces only. 493 // Works for both code points and code units since all spaces are in the BMP.
433 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
434 // whitespaces. Add checking for multi-byte whitespace codepoints.
435 bool _isWhitespace(int codePoint) { 494 bool _isWhitespace(int codePoint) {
436 return 495 return
437 (codePoint === 32) || // Space. 496 (codePoint == 32) || // Space.
438 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. 497 (codePoint == 0xa0) || // No-break space.
498 ((9 <= codePoint) && (codePoint <= 13)) || // CR, LF, TAB, etc.
499 (codePoint >= 0x1680 && // Optimization.
500 (codePoint == 0x1680 || // Ogham space mark.
501 codePoint == 0x180e || // Mongolian vowel separator.
502 (codePoint >= 0x2000 && codePoint <= 0x200a) || // Wide/narrow spaces.
503 codePoint == 0x202f || // Narrow no-break space.
504 codePoint == 0x205f || // Medium mathematical space.
505 codePoint == 0x3000)); // Ideographic space.
439 } 506 }
440 } 507 }
441 508
442 509
510 // TODO(erikcorry): This is going away.
443 class _ExternalFourByteString extends _StringBase implements String { 511 class _ExternalFourByteString extends _StringBase implements String {
444 factory _ExternalFourByteString._uninstantiable() { 512 factory _ExternalFourByteString._uninstantiable() {
445 throw new UnsupportedError( 513 throw new UnsupportedError(
446 "ExternalFourByteString can only be allocated by the VM"); 514 "ExternalFourByteString can only be allocated by the VM");
447 } 515 }
448 516
449 // Checks for one-byte whitespaces only. 517 // Checks for one-byte whitespaces only.
450 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid 518 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
451 // whitespaces. Add checking for multi-byte whitespace codepoints. 519 // whitespaces. Add checking for multi-byte whitespace codepoints.
452 bool _isWhitespace(int codePoint) { 520 bool _isWhitespace(int codePoint) {
(...skipping 25 matching lines...) Expand all
478 for (int g in groups) { 546 for (int g in groups) {
479 result.add(group(g)); 547 result.add(group(g));
480 } 548 }
481 return result; 549 return result;
482 } 550 }
483 551
484 final int start; 552 final int start;
485 final String str; 553 final String str;
486 final String pattern; 554 final String pattern;
487 } 555 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698