Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(308)

Side by Side Diff: runtime/lib/string_base.dart

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Implemented feedback from patch set 3 Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 /** 5 /**
6 * [_StringBase] contains common methods used by concrete String 6 * [_StringBase] contains common methods used by concrete String
7 * implementations, e.g., _OneByteString. 7 * implementations, e.g., _OneByteString.
8 */ 8 */
9 class _StringBase { 9 class _StringBase {
10 10
11 factory _StringBase._uninstantiable() { 11 factory _StringBase._uninstantiable() {
12 throw new UnsupportedError( 12 throw new UnsupportedError(
13 "_StringBase can't be instaniated"); 13 "_StringBase can't be instaniated");
14 } 14 }
15 15
16 int get hashCode native "String_getHashCode"; 16 int get hashCode native "String_getHashCode";
17 17
18 /** 18 /**
19 * Create the most efficient string representation for specified 19 * Create the most efficient string representation for the specified UTF-16
20 * [codePoints]. 20 * [codeUnits].
21 */ 21 */
22 static String createFromCharCodes(List<int> charCodes) { 22 static String createFromUtf16(List<int> codeUnits) {
23 _ObjectArray objectArray; 23 _ObjectArray objectArray;
24 if (charCodes is _ObjectArray) { 24 if (codeUnits is _ObjectArray) {
25 objectArray = charCodes; 25 objectArray = codeUnits;
26 } else { 26 } else {
27 int len = charCodes.length; 27 int len = codeUnits.length;
28 objectArray = new _ObjectArray(len); 28 objectArray = new _ObjectArray(len);
29 for (int i = 0; i < len; i++) { 29 for (int i = 0; i < len; i++) {
30 objectArray[i] = charCodes[i]; 30 objectArray[i] = codeUnits[i];
31 } 31 }
32 } 32 }
33 return _createFromCodePoints(objectArray); 33 return _createFromUtf16(objectArray);
34 } 34 }
35 35
36 static String _createFromCodePoints(List<int> codePoints) 36 static String _createFromUtf16(List<int> codeUnits)
37 native "StringBase_createFromCodePoints"; 37 native "StringBase_createFromUtf16";
38 38
39 String operator [](int index) native "String_charAt"; 39 String operator [](int index) native "String_charAt";
40 40
41 int charCodeAt(int index) native "String_charCodeAt"; 41 int codeUnitAt(int index) native "String_codeUnitAt";
42 42
43 int get length native "String_getLength"; 43 int get length native "String_getLength";
44 44
45 bool get isEmpty { 45 bool get isEmpty {
46 return this.length == 0; 46 return this.length == 0;
47 } 47 }
48 48
49 String concat(String other) native "String_concat"; 49 String concat(String other) native "String_concat";
50 50
51 String toString() { 51 String toString() {
(...skipping 10 matching lines...) Expand all
62 return false; 62 return false;
63 } 63 }
64 return this.compareTo(other) == 0; 64 return this.compareTo(other) == 0;
65 } 65 }
66 66
67 int compareTo(String other) { 67 int compareTo(String other) {
68 int thisLength = this.length; 68 int thisLength = this.length;
69 int otherLength = other.length; 69 int otherLength = other.length;
70 int len = (thisLength < otherLength) ? thisLength : otherLength; 70 int len = (thisLength < otherLength) ? thisLength : otherLength;
71 for (int i = 0; i < len; i++) { 71 for (int i = 0; i < len; i++) {
72 int thisCodePoint = this.charCodeAt(i); 72 int thisCodeUnit = this.codeUnitAt(i);
73 int otherCodePoint = other.charCodeAt(i); 73 int otherCodeUnit = other.codeUnitAt(i);
74 if (thisCodePoint < otherCodePoint) { 74 if (thisCodeUnit < otherCodeUnit) {
75 return -1; 75 return -1;
76 } 76 }
77 if (thisCodePoint > otherCodePoint) { 77 if (thisCodeUnit > otherCodeUnit) {
78 return 1; 78 return 1;
79 } 79 }
80 } 80 }
81 if (thisLength < otherLength) return -1; 81 if (thisLength < otherLength) return -1;
82 if (thisLength > otherLength) return 1; 82 if (thisLength > otherLength) return 1;
83 return 0; 83 return 0;
84 } 84 }
85 85
86 bool _substringMatches(int start, String other) { 86 bool _substringMatches(int start, String other) {
87 if (other.isEmpty) return true; 87 if (other.isEmpty) return true;
88 if ((start < 0) || (start >= this.length)) { 88 if ((start < 0) || (start >= this.length)) {
89 return false; 89 return false;
90 } 90 }
91 final int len = other.length; 91 final int len = other.length;
92 if ((start + len) > this.length) { 92 if ((start + len) > this.length) {
93 return false; 93 return false;
94 } 94 }
95 for (int i = 0; i < len; i++) { 95 for (int i = 0; i < len; i++) {
96 if (this.charCodeAt(i + start) != other.charCodeAt(i)) { 96 if (this.codeUnitAt(i + start) != other.codeUnitAt(i)) {
97 return false; 97 return false;
98 } 98 }
99 } 99 }
100 return true; 100 return true;
101 } 101 }
102 102
103 bool endsWith(String other) { 103 bool endsWith(String other) {
104 return _substringMatches(this.length - other.length, other); 104 return _substringMatches(this.length - other.length, other);
105 } 105 }
106 106
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
155 return _substringUnchecked(startIndex, endIndex); 155 return _substringUnchecked(startIndex, endIndex);
156 } 156 }
157 157
158 String _substringUnchecked(int startIndex, int endIndex) 158 String _substringUnchecked(int startIndex, int endIndex)
159 native "StringBase_substringUnchecked"; 159 native "StringBase_substringUnchecked";
160 160
161 String trim() { 161 String trim() {
162 final int len = this.length; 162 final int len = this.length;
163 int first = 0; 163 int first = 0;
164 for (; first < len; first++) { 164 for (; first < len; first++) {
165 if (!_isWhitespace(this.charCodeAt(first))) { 165 // There are no whitespace characters that are outside the BMP so we
166 // can use code units here for efficiency.
167 if (!_isWhitespace(this.codeUnitAt(first))) {
166 break; 168 break;
167 } 169 }
168 } 170 }
169 if (len == first) { 171 if (len == first) {
170 // String contains only whitespace. 172 // String contains only whitespace.
171 return ""; 173 return "";
172 } 174 }
173 int last = len - 1; 175 int last = len - 1;
174 for (; last >= first; last--) { 176 for (; last >= first; last--) {
175 if (!_isWhitespace(this.charCodeAt(last))) { 177 if (!_isWhitespace(this.codeUnitAt(last))) {
176 break; 178 break;
177 } 179 }
178 } 180 }
179 if ((first == 0) && (last == (len - 1))) { 181 if ((first == 0) && (last == (len - 1))) {
180 // Returns this string if it does not have leading or trailing 182 // Returns this string if it does not have leading or trailing
181 // whitespace. 183 // whitespace.
182 return this; 184 return this;
183 } else { 185 } else {
184 return _substringUnchecked(first, last + 1); 186 return _substringUnchecked(first, last + 1);
185 } 187 }
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after
286 if (startIndex == endIndex && endIndex == previousIndex) { 288 if (startIndex == endIndex && endIndex == previousIndex) {
287 ++startIndex; // empty match, advance and restart 289 ++startIndex; // empty match, advance and restart
288 continue; 290 continue;
289 } 291 }
290 result.add(this.substring(previousIndex, match.start)); 292 result.add(this.substring(previousIndex, match.start));
291 startIndex = previousIndex = endIndex; 293 startIndex = previousIndex = endIndex;
292 } 294 }
293 return result; 295 return result;
294 } 296 }
295 297
298 // TODO(erikcorry): Fix this to use the new code point iterator when it is
299 // available.
296 List<String> splitChars() { 300 List<String> splitChars() {
297 int len = this.length; 301 int len = this.length;
298 final result = new List<String>(len); 302 final result = new List<String>(len);
303 bool smpCharacterSeen = false;
304 int i, j;
305 for (i = j = 0; i < len; i++, j++) {
306 int c = charCodeAt(i);
307 // Check for non-basic plane character encoded as a UTF-16 surrogate pair.
308 if (c >= String.SMP_CODE_POINT_BASE) {
309 i++;
310 smpCharacterSeen = true;
311 }
312 result[j] = new String.fromCharCodes([c]);
313 }
314 if (!smpCharacterSeen) return result;
315 // If we saw some non-basic plane characters, then we have to return a
316 // slightly smaller array than expected (we can't trim the original one
317 // because it is non-extendable). This rarely happens so this is preferable
318 // to having a separate pass over the string to count the code points.
319 return result.getRange(0, j);
320 }
321
322 List<int> get codeUnits {
323 int len = this.length;
324 final result = new List<int>(len);
299 for (int i = 0; i < len; i++) { 325 for (int i = 0; i < len; i++) {
300 result[i] = this[i]; 326 result[i] = this.codeUnitAt(i);
301 } 327 }
302 return result; 328 return result;
303 } 329 }
304
305 List<int> get charCodes {
306 int len = this.length;
307 final result = new List<int>(len);
308 for (int i = 0; i < len; i++) {
309 result[i] = this.charCodeAt(i);
310 }
311 return result;
312 }
313 330
314 String toUpperCase() native "String_toUpperCase"; 331 String toUpperCase() native "String_toUpperCase";
315 332
316 String toLowerCase() native "String_toLowerCase"; 333 String toLowerCase() native "String_toLowerCase";
317 334
318 // Implementations of Strings methods follow below. 335 // Implementations of Strings methods follow below.
319 static String join(List<String> strings, String separator) { 336 static String join(List<String> strings, String separator) {
320 final int length = strings.length; 337 final int length = strings.length;
321 if (length == 0) { 338 if (length == 0) {
322 return ""; 339 return "";
(...skipping 30 matching lines...) Expand all
353 native "Strings_concatAll"; 370 native "Strings_concatAll";
354 } 371 }
355 372
356 373
357 class _OneByteString extends _StringBase implements String { 374 class _OneByteString extends _StringBase implements String {
358 factory _OneByteString._uninstantiable() { 375 factory _OneByteString._uninstantiable() {
359 throw new UnsupportedError( 376 throw new UnsupportedError(
360 "_OneByteString can only be allocated by the VM"); 377 "_OneByteString can only be allocated by the VM");
361 } 378 }
362 379
363 // Checks for one-byte whitespaces only.
364 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
365 // whitespaces for one byte strings.
366 bool _isWhitespace(int codePoint) { 380 bool _isWhitespace(int codePoint) {
367 return 381 return
368 (codePoint == 32) || // Space. 382 (codePoint == 32) || // Space.
383 (codePoint == 0xa0) || // No-break space.
369 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. 384 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
370 } 385 }
371 386
387 int charCodeAt(int index) => codeUnitAt(index);
388
389 List<int> get charCodes => codeUnits;
372 } 390 }
373 391
374 392
375 class _TwoByteString extends _StringBase implements String { 393 class _TwoByteStringBase extends _StringBase {
394 factory _TwoByteStringBase._uninstantiable() {
395 throw new UnsupportedError(
396 "_TwoByteStringBase can't be instaniated");
397 }
398
399 // Works for both code points and code units since all spaces are in the BMP.
400 bool _isWhitespace(int codePoint) {
401 return
402 (codePoint == 32) || // Space.
403 (codePoint == 0xa0) || // No-break space.
404 ((9 <= codePoint) && (codePoint <= 13)) || // CR, LF, TAB, etc.
405 (codePoint >= 0x1680 && // Optimization.
406 (codePoint == 0x1680 || // Ogham space mark.
407 codePoint == 0x180e || // Mongolian vowel separator.
408 (codePoint >= 0x2000 && codePoint <= 0x200a) || // Wide/narrow spaces.
409 codePoint == 0x2028 || // Line separator.
410 codePoint == 0x2029 || // Paragraph separator.
411 codePoint == 0x202f || // Narrow no-break space.
412 codePoint == 0x205f || // Medium mathematical space.
413 codePoint == 0x3000 || // Ideographic space.
414 codePoint == 0xfeff)); // BOM code.
415 }
416
417 int charCodeAt(int index) {
418 const int LEAD_SURROGATE_BASE = 0xd800;
419 const int LEAD_SURROGATE_END = 0xdbff;
420 const int TRAIL_SURROGATE_BASE = 0xdc00;
421 const int TRAIL_SURROGATE_END = 0xdfff;
422 const int MASK = 0x3ff;
423 int code = codeUnitAt(index);
424 if (code < LEAD_SURROGATE_BASE || code > LEAD_SURROGATE_END) return code;
425 if (index + 1 >= length) return code;
426 int trail = codeUnitAt(index + 1);
427 if (trail < TRAIL_SURROGATE_BASE || trail > TRAIL_SURROGATE_END) {
428 return code;
429 }
430 return String.SMP_CODE_POINT_BASE + ((code & MASK) << 10) + (trail & MASK);
431 }
432
433 // TODO(erikcorry): Fix this to use the new code point iterator when it is
434 // available.
435 List<int> get charCodes {
436 int len = this.length;
437 final result = new List<int>(len);
438 bool smpCharacterSeen = false;
439 int i, j;
440 for (i = j = 0; i < len; i++, j++) {
441 int c = this.charCodeAt(i);
442 // Check for supplementary plane character encoded as a UTF-16 surrogate
443 // pair.
444 if (c >= String.SMP_CODE_POINT_BASE) {
445 i++;
446 smpCharacterSeen = true;
447 }
448 result[j] = c;
449 }
450 if (!smpCharacterSeen) return result;
451 // If we saw some non-basic plane characters, then we have to return a
452 // slightly smaller array than expected (we can't trim the original one
453 // because it is non-extendable). This rarely happens so this is preferable
454 // to having a separate pass over the string to count the code points.
455 return result.getRange(0, j);
456 }
457 }
458
459
460 class _TwoByteString extends _TwoByteStringBase implements String {
376 factory _TwoByteString._uninstantiable() { 461 factory _TwoByteString._uninstantiable() {
377 throw new UnsupportedError( 462 throw new UnsupportedError(
378 "_TwoByteString can only be allocated by the VM"); 463 "_TwoByteString can only be allocated by the VM");
379 } 464 }
380
381 // Checks for one-byte whitespaces only.
382 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
383 // whitespaces. Add checking for multi-byte whitespace codepoints.
384 bool _isWhitespace(int codePoint) {
385 return
386 (codePoint == 32) || // Space.
387 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
388 }
389 } 465 }
390 466
391
392 class _FourByteString extends _StringBase implements String {
393 factory _FourByteString._uninstantiable() {
394 throw new UnsupportedError(
395 "_FourByteString can only be allocated by the VM");
396 }
397
398 // Checks for one-byte whitespaces only.
399 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
400 // whitespaces. Add checking for multi-byte whitespace codepoints.
401 bool _isWhitespace(int codePoint) {
402 return
403 (codePoint == 32) || // Space.
404 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
405 }
406 }
407
408 467
409 class _ExternalOneByteString extends _StringBase implements String { 468 class _ExternalOneByteString extends _StringBase implements String {
410 factory _ExternalOneByteString._uninstantiable() { 469 factory _ExternalOneByteString._uninstantiable() {
411 throw new UnsupportedError( 470 throw new UnsupportedError(
412 "_ExternalOneByteString can only be allocated by the VM"); 471 "_ExternalOneByteString can only be allocated by the VM");
413 } 472 }
414 473
415 // Checks for one-byte whitespaces only.
416 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
417 // whitespaces for one byte strings.
418 bool _isWhitespace(int codePoint) { 474 bool _isWhitespace(int codePoint) {
419 return 475 return
420 (codePoint == 32) || // Space. 476 (codePoint == 32) || // Space.
477 (codePoint == 0xa0) || // No-break space.
421 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc. 478 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
422 } 479 }
480
481 int charCodeAt(int index) => codeUnitAt(index);
482
483 List<int> get charCodes => codeUnits;
423 } 484 }
424 485
425 486
426 class _ExternalTwoByteString extends _StringBase implements String { 487 class _ExternalTwoByteString extends _TwoByteStringBase implements String {
427 factory _ExternalTwoByteString._uninstantiable() { 488 factory _ExternalTwoByteString._uninstantiable() {
428 throw new UnsupportedError( 489 throw new UnsupportedError(
429 "_ExternalTwoByteString can only be allocated by the VM"); 490 "_ExternalTwoByteString can only be allocated by the VM");
430 } 491 }
431
432 // Checks for one-byte whitespaces only.
433 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
434 // whitespaces. Add checking for multi-byte whitespace codepoints.
435 bool _isWhitespace(int codePoint) {
436 return
437 (codePoint == 32) || // Space.
438 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
439 }
440 } 492 }
441 493
442
443 class _ExternalFourByteString extends _StringBase implements String {
444 factory _ExternalFourByteString._uninstantiable() {
445 throw new UnsupportedError(
446 "ExternalFourByteString can only be allocated by the VM");
447 }
448
449 // Checks for one-byte whitespaces only.
450 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid
451 // whitespaces. Add checking for multi-byte whitespace codepoints.
452 bool _isWhitespace(int codePoint) {
453 return
454 (codePoint == 32) || // Space.
455 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.
456 }
457 }
458
459 494
460 class _StringMatch implements Match { 495 class _StringMatch implements Match {
461 const _StringMatch(int this.start, 496 const _StringMatch(int this.start,
462 String this.str, 497 String this.str,
463 String this.pattern); 498 String this.pattern);
464 499
465 int get end => start + pattern.length; 500 int get end => start + pattern.length;
466 String operator[](int g) => group(g); 501 String operator[](int g) => group(g);
467 int get groupCount => 0; 502 int get groupCount => 0;
468 503
469 String group(int group) { 504 String group(int group) {
470 if (group != 0) { 505 if (group != 0) {
471 throw new RangeError.value(group); 506 throw new RangeError.value(group);
472 } 507 }
473 return pattern; 508 return pattern;
474 } 509 }
475 510
476 List<String> groups(List<int> groups) { 511 List<String> groups(List<int> groups) {
477 List<String> result = new List<String>(); 512 List<String> result = new List<String>();
478 for (int g in groups) { 513 for (int g in groups) {
479 result.add(group(g)); 514 result.add(group(g));
480 } 515 }
481 return result; 516 return result;
482 } 517 }
483 518
484 final int start; 519 final int start;
485 final String str; 520 final String str;
486 final String pattern; 521 final String pattern;
487 } 522 }
OLDNEW
« no previous file with comments | « runtime/lib/string.cc ('k') | runtime/lib/string_patch.dart » ('j') | runtime/vm/scanner.cc » ('J')

Powered by Google App Engine
This is Rietveld 408576698