Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(722)

Side by Side Diff: runtime/lib/string_base.dart

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: New version integrates feedback, adds less to standard String class. Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLD | NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 /** 5 /**
6 * [_StringBase] contains common methods used by concrete String 6 * [_StringBase] contains common methods used by concrete String
7 * implementations, e.g., _OneByteString. 7 * implementations, e.g., _OneByteString.
8 */ 8 */
9 class _StringBase { 9 class _StringBase {
10 10
11 factory _StringBase._uninstantiable() { 11 factory _StringBase._uninstantiable() {
12 throw new UnsupportedError( 12 throw new UnsupportedError(
13 "_StringBase can't be instaniated"); 13 "_StringBase can't be instaniated");
14 } 14 }
15 15
16 int get hashCode native "String_getHashCode"; 16 int get hashCode native "String_getHashCode";
17 17
18 /** 18 static String createFromCodeUnits(List<int> codeUnits) {
cshapiro 2012/11/15 20:14:51 Please stick with naming this by the input type.
erikcorry 2012/11/15 23:47:05 Done.
19 * Create the most efficient string representation for specified
20 * [codePoints].
21 */
22 static String createFromCharCodes(List<int> charCodes) {
23 _ObjectArray objectArray; 19 _ObjectArray objectArray;
24 if (charCodes is _ObjectArray) { 20 if (codeUnits is _ObjectArray) {
25 objectArray = charCodes; 21 objectArray = codeUnits;
26 } else { 22 } else {
27 int len = charCodes.length; 23 int len = codeUnits.length;
28 objectArray = new _ObjectArray(len); 24 objectArray = new _ObjectArray(len);
29 for (int i = 0; i < len; i++) { 25 for (int i = 0; i < len; i++) {
30 objectArray[i] = charCodes[i]; 26 objectArray[i] = codeUnits[i];
31 } 27 }
32 } 28 }
33 return _createFromCodePoints(objectArray); 29 return _createFromCodeUnits(objectArray);
34 } 30 }
35 31
36 static String _createFromCodePoints(List<int> codePoints) 32 static String _createFromCodeUnits(List<int> codeUnits)
37 native "StringBase_createFromCodePoints"; 33 native "StringBase_createFromCodeUnits";
38 34
39 String operator [](int index) native "String_charAt"; 35 String operator [](int index) native "String_charAt";
40 36
41 int charCodeAt(int index) native "String_charCodeAt"; 37 int codeUnitAt(int index) native "String_codeUnitAt";
42 38
43 int get length native "String_getLength"; 39 int get length native "String_getLength";
44 40
45 bool get isEmpty { 41 bool get isEmpty {
46 return this.length == 0; 42 return this.length == 0;
47 } 43 }
48 44
49 String concat(String other) native "String_concat"; 45 String concat(String other) native "String_concat";
50 46
51 String toString() { 47 String toString() {
(...skipping 10 matching lines...) Expand all
62 return false; 58 return false;
63 } 59 }
64 return this.compareTo(other) == 0; 60 return this.compareTo(other) == 0;
65 } 61 }
66 62
67 int compareTo(String other) { 63 int compareTo(String other) {
68 int thisLength = this.length; 64 int thisLength = this.length;
69 int otherLength = other.length; 65 int otherLength = other.length;
70 int len = (thisLength < otherLength) ? thisLength : otherLength; 66 int len = (thisLength < otherLength) ? thisLength : otherLength;
71 for (int i = 0; i < len; i++) { 67 for (int i = 0; i < len; i++) {
72 int thisCodePoint = this.charCodeAt(i); 68 int thisCodeUnit = this.codeUnitAt(i);
73 int otherCodePoint = other.charCodeAt(i); 69 int otherCodeUnit = other.codeUnitAt(i);
74 if (thisCodePoint < otherCodePoint) { 70 if (thisCodeUnit < otherCodeUnit) {
75 return -1; 71 return -1;
76 } 72 }
77 if (thisCodePoint > otherCodePoint) { 73 if (thisCodeUnit > otherCodeUnit) {
78 return 1; 74 return 1;
79 } 75 }
80 } 76 }
81 if (thisLength < otherLength) return -1; 77 if (thisLength < otherLength) return -1;
82 if (thisLength > otherLength) return 1; 78 if (thisLength > otherLength) return 1;
83 return 0; 79 return 0;
84 } 80 }
85 81
86 bool _substringMatches(int start, String other) { 82 bool _substringMatches(int start, String other) {
87 if (other.isEmpty) return true; 83 if (other.isEmpty) return true;
88 if ((start < 0) || (start >= this.length)) { 84 if ((start < 0) || (start >= this.length)) {
89 return false; 85 return false;
90 } 86 }
91 final int len = other.length; 87 final int len = other.length;
92 if ((start + len) > this.length) { 88 if ((start + len) > this.length) {
93 return false; 89 return false;
94 } 90 }
95 for (int i = 0; i < len; i++) { 91 for (int i = 0; i < len; i++) {
96 if (this.charCodeAt(i + start) != other.charCodeAt(i)) { 92 if (this.codeUnitAt(i + start) != other.codeUnitAt(i)) {
97 return false; 93 return false;
98 } 94 }
99 } 95 }
100 return true; 96 return true;
101 } 97 }
102 98
103 bool endsWith(String other) { 99 bool endsWith(String other) {
104 return _substringMatches(this.length - other.length, other); 100 return _substringMatches(this.length - other.length, other);
105 } 101 }
106 102
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
155 return _substringUnchecked(startIndex, endIndex); 151 return _substringUnchecked(startIndex, endIndex);
156 } 152 }
157 153
158 String _substringUnchecked(int startIndex, int endIndex) 154 String _substringUnchecked(int startIndex, int endIndex)
159 native "StringBase_substringUnchecked"; 155 native "StringBase_substringUnchecked";
160 156
161 String trim() { 157 String trim() {
162 final int len = this.length; 158 final int len = this.length;
163 int first = 0; 159 int first = 0;
164 for (; first < len; first++) { 160 for (; first < len; first++) {
165 if (!_isWhitespace(this.charCodeAt(first))) { 161 // There are no whitespace characters that are outside the BMP so we
162 // can use code units here for efficiency.
163 if (!_isWhitespace(this.codeUnitAt(first))) {
166 break; 164 break;
167 } 165 }
168 } 166 }
169 if (len == first) { 167 if (len == first) {
170 // String contains only whitespaces. 168 // String contains only whitespaces.
171 return ""; 169 return "";
172 } 170 }
173 int last = len - 1; 171 int last = len - 1;
174 for (; last >= first; last--) { 172 for (; last >= first; last--) {
175 if (!_isWhitespace(this.charCodeAt(last))) { 173 if (!_isWhitespace(this.codeUnitAt(last))) {
176 break; 174 break;
177 } 175 }
178 } 176 }
179 if ((first == 0) && (last == (len - 1))) { 177 if ((first == 0) && (last == (len - 1))) {
180 // Returns this string if it does not have leading or trailing 178 // Returns this string if it does not have leading or trailing
181 // whitespaces. 179 // whitespaces.
182 return this; 180 return this;
183 } else { 181 } else {
184 return _substringUnchecked(first, last + 1); 182 return _substringUnchecked(first, last + 1);
185 } 183 }
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after
286 if (startIndex == endIndex && endIndex == previousIndex) { 284 if (startIndex == endIndex && endIndex == previousIndex) {
287 ++startIndex; // empty match, advance and restart 285 ++startIndex; // empty match, advance and restart
288 continue; 286 continue;
289 } 287 }
290 result.add(this.substring(previousIndex, match.start)); 288 result.add(this.substring(previousIndex, match.start));
291 startIndex = previousIndex = endIndex; 289 startIndex = previousIndex = endIndex;
292 } 290 }
293 return result; 291 return result;
294 } 292 }
295 293
294 // TODO(erikcorry): Fix this to use the new code point iterator when it is
295 // available.
296 List<String> splitChars() { 296 List<String> splitChars() {
297 int len = this.length; 297 int len = this.length;
298 final result = new List<String>(len); 298 final result = new List<String>(len);
299 int i, j;
300 for (i = j = 0; i < len; i++, j++) {
301 int c = charCodeAt(i);
302 // Check for non-basic plane character encoded as a UTF-16 surrogate pair.
303 if (c >= String.SMP_CODE_POINT_BASE) {
304 i++;
305 }
306 result[j] = new String.fromCharCodes([c]);
307 }
308 if (i == j) return result;
309 // If we saw some non-basic plane characters, then we have to return a
310 // slightly smaller array than expected (we can't trim the original one
311 // because it is non-extendable). This rarely happens so this is preferable
312 // to having a separate pass over the string to count the code points.
313 final newResult = new List<String>(j);
314 for (i = 0; i < j; i++) newResult[i] = result[i];
315 return newResult;
316 }
317
318 List<int> get codeUnits {
319 int len = this.length;
320 final result = new List<int>(len);
299 for (int i = 0; i < len; i++) { 321 for (int i = 0; i < len; i++) {
300 result[i] = this[i]; 322 result[i] = this.codeUnitAt(i);
301 } 323 }
302 return result; 324 return result;
303 } 325 }
304
305 List<int> get charCodes {
306 int len = this.length;
307 final result = new List<int>(len);
308 for (int i = 0; i < len; i++) {
309 result[i] = this.charCodeAt(i);
310 }
311 return result;
312 }
313 326
314 String toUpperCase() native "String_toUpperCase"; 327 String toUpperCase() native "String_toUpperCase";
315 328
316 String toLowerCase() native "String_toLowerCase"; 329 String toLowerCase() native "String_toLowerCase";
317 330
318 // Implementations of Strings methods follow below. 331 // Implementations of Strings methods follow below.
319 static String join(List<String> strings, String separator) { 332 static String join(List<String> strings, String separator) {
320 final int length = strings.length; 333 final int length = strings.length;
321 if (length == 0) { 334 if (length == 0) {
322 return ""; 335 return "";
(...skipping 30 matching lines...) Expand all
353 native "Strings_concatAll"; 366 native "Strings_concatAll";
354 } 367 }
355 368
356 369
357 class _OneByteString extends _StringBase implements String { 370 class _OneByteString extends _StringBase implements String {
358 factory _OneByteString._uninstantiable() { 371 factory _OneByteString._uninstantiable() {
359 throw new UnsupportedError( 372 throw new UnsupportedError(
360 "_OneByteString can only be allocated by the VM"); 373 "_OneByteString can only be allocated by the VM");
361 } 374 }
362 375
363 // Checks for one-byte whitespaces only.
364 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
365 // whitespaces for one byte strings.
366 bool _isWhitespace(int codePoint) { 376 bool _isWhitespace(int codePoint) {
367 return 377 return
368 (codePoint == 32) || // Space. 378 (codePoint == 32) || // Space.
379 (codePoint == 0xa0) || // No-break space.
369 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. 380 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
370 } 381 }
371 382
383 int charCodeAt(int index) => codeUnitAt(index);
384
385 List<int> get charCodes => codeUnits;
372 } 386 }
373 387
374 388
375 class _TwoByteString extends _StringBase implements String { 389 class _TwoByteStringBase extends _StringBase {
390 factory _TwoByteStringBase._uninstantiable() {
391 throw new UnsupportedError(
392 "_TwoByteStringBase can't be instaniated");
393 }
394
395 // Works for both code points and code units since all spaces are in the BMP.
396 bool _isWhitespace(int codePoint) {
397 return
398 (codePoint == 32) || // Space.
399 (codePoint == 0xa0) || // No-break space.
400 ((9 <= codePoint) && (codePoint <= 13)) || // CR, LF, TAB, etc.
401 (codePoint >= 0x1680 && // Optimization.
402 (codePoint == 0x1680 || // Ogham space mark.
403 codePoint == 0x180e || // Mongolian vowel separator.
404 (codePoint >= 0x2000 && codePoint <= 0x200a) || // Wide/narrow spaces.
405 codePoint == 0x2028 || // Line separator.
406 codePoint == 0x2029 || // Paragraph separator.
407 codePoint == 0x202f || // Narrow no-break space.
408 codePoint == 0x205f || // Medium mathematical space.
409 codePoint == 0x3000 || // Ideographic space.
410 codePoint == 0xfeff)); // BOM code.
411 }
412
413 int charCodeAt(int index) {
414 const int LEAD_SURROGATE_BASE = 0xd800;
415 const int LEAD_SURROGATE_END = 0xdbff;
416 const int TRAIL_SURROGATE_BASE = 0xdc00;
417 const int TRAIL_SURROGATE_END = 0xdfff;
418 const int MASK = 0x3ff;
419 int code = codeUnitAt(index);
420 if (code < LEAD_SURROGATE_BASE || code > LEAD_SURROGATE_END) return code;
421 if (index + 1 >= length) return code;
422 int trail = codeUnitAt(index + 1);
423 if (trail < TRAIL_SURROGATE_BASE || trail > TRAIL_SURROGATE_END) {
424 return code;
425 }
426 return String.SMP_CODE_POINT_BASE + ((code & MASK) << 10) + (trail & MASK);
427 }
428
429 List<int> get charCodes {
430 int len = this.length;
431 final result = new List<int>(len);
432 int i, j;
433 for (i = j = 0; i < len; i++, j++) {
434 int c = this.charCodeAt(i);
435 // Check for supplementary plane character encoded as a UTF-16 surrogate
436 // pair.
437 if (c >= String.SMP_CODE_POINT_BASE) {
438 i++;
439 }
440 result[j] = c;
441 }
442 if (i == j) return result;
443 // If we saw some non-basic plane characters, then we have to return a
444 // slightly smaller array than expected (we can't trim the original one
445 // because it is non-extendable). This rarely happens so this is preferable
446 // to having a separate pass over the string to count the code points.
447 final newResult = new List<int>(j);
448 for (i = 0; i < j; i++) newResult[i] = result[i];
449 return newResult;
450 }
451 }
452
453
454 class _TwoByteString extends _TwoByteStringBase implements String {
376 factory _TwoByteString._uninstantiable() { 455 factory _TwoByteString._uninstantiable() {
377 throw new UnsupportedError( 456 throw new UnsupportedError(
378 "_TwoByteString can only be allocated by the VM"); 457 "_TwoByteString can only be allocated by the VM");
379 } 458 }
380
381 // Checks for one-byte whitespaces only.
382 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
383 // whitespaces. Add checking for multi-byte whitespace codepoints.
384 bool _isWhitespace(int codePoint) {
385 return
386 (codePoint == 32) || // Space.
387 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
388 }
389 } 459 }
390 460
391 461
392 class _FourByteString extends _StringBase implements String { 462 // TODO(erikcorry): This is going away.
463 class _FourByteString extends _StringBase {
393 factory _FourByteString._uninstantiable() { 464 factory _FourByteString._uninstantiable() {
394 throw new UnsupportedError( 465 throw new UnsupportedError(
395 "_FourByteString can only be allocated by the VM"); 466 "_FourByteString can only be allocated by the VM");
396 } 467 }
397 468
398 // Checks for one-byte whitespaces only. 469 // Checks for one-byte whitespaces only.
399 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid 470 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
400 // whitespaces. Add checking for multi-byte whitespace codepoints. 471 // whitespaces. Add checking for multi-byte whitespace codepoints.
401 bool _isWhitespace(int codePoint) { 472 bool _isWhitespace(int codePoint) {
402 return 473 return
403 (codePoint == 32) || // Space. 474 (codePoint == 32) || // Space.
404 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. 475 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
405 } 476 }
406 } 477 }
407 478
408 479
409 class _ExternalOneByteString extends _StringBase implements String { 480 class _ExternalOneByteString extends _StringBase implements String {
410 factory _ExternalOneByteString._uninstantiable() { 481 factory _ExternalOneByteString._uninstantiable() {
411 throw new UnsupportedError( 482 throw new UnsupportedError(
412 "_ExternalOneByteString can only be allocated by the VM"); 483 "_ExternalOneByteString can only be allocated by the VM");
413 } 484 }
414 485
415 // Checks for one-byte whitespaces only.
416 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
417 // whitespaces for one byte strings.
418 bool _isWhitespace(int codePoint) { 486 bool _isWhitespace(int codePoint) {
419 return 487 return
420 (codePoint == 32) || // Space. 488 (codePoint == 32) || // Space.
489 (codePoint == 0xa0) || // No-break space.
421 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. 490 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
422 } 491 }
492
493 int charCodeAt(int index) => codeUnitAt(index);
494
495 List<int> get charCodes => codeUnits;
496 }
497
498
499 class _ExternalTwoByteString extends _TwoByteStringBase implements String {
500 factory _ExternalTwoByteString._uninstantiable() {
501 throw new UnsupportedError(
502 "_ExternalTwoByteString can only be allocated by the VM");
503 }
423 } 504 }
424 505
425 506
426 class _ExternalTwoByteString extends _StringBase implements String { 507 // TODO(erikcorry): This is going away.
427 factory _ExternalTwoByteString._uninstantiable() { 508 class _ExternalFourByteString extends _StringBase {
428 throw new UnsupportedError(
429 "_ExternalTwoByteString can only be allocated by the VM");
430 }
431
432 // Checks for one-byte whitespaces only.
433 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
434 // whitespaces. Add checking for multi-byte whitespace codepoints.
435 bool _isWhitespace(int codePoint) {
436 return
437 (codePoint == 32) || // Space.
438 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
439 }
440 }
441
442
443 class _ExternalFourByteString extends _StringBase implements String {
444 factory _ExternalFourByteString._uninstantiable() { 509 factory _ExternalFourByteString._uninstantiable() {
445 throw new UnsupportedError( 510 throw new UnsupportedError(
446 "ExternalFourByteString can only be allocated by the VM"); 511 "ExternalFourByteString can only be allocated by the VM");
447 } 512 }
448 513
449 // Checks for one-byte whitespaces only. 514 // Checks for one-byte whitespaces only.
450 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid 515 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
451 // whitespaces. Add checking for multi-byte whitespace codepoints. 516 // whitespaces. Add checking for multi-byte whitespace codepoints.
452 bool _isWhitespace(int codePoint) { 517 bool _isWhitespace(int codePoint) {
453 return 518 return
(...skipping 24 matching lines...) Expand all
478 for (int g in groups) { 543 for (int g in groups) {
479 result.add(group(g)); 544 result.add(group(g));
480 } 545 }
481 return result; 546 return result;
482 } 547 }
483 548
484 final int start; 549 final int start;
485 final String str; 550 final String str;
486 final String pattern; 551 final String pattern;
487 } 552 }
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698