runtime/lib/string_base.dart - Issue 11368138: Add some support for the code-point code-unit distinction.

Side by Side Diff: runtime/lib/string_base.dart

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 /**	5 /**

6 * [_StringBase] contains common methods used by concrete String	6 * [_StringBase] contains common methods used by concrete String

7 * implementations, e.g., _OneByteString.	7 * implementations, e.g., _OneByteString.

8 */	8 */

9 class _StringBase {	9 class _StringBase {

10	10

11 factory _StringBase._uninstantiable() {	11 factory _StringBase._uninstantiable() {

12 throw new UnsupportedError(	12 throw new UnsupportedError(

13 "_StringBase can't be instaniated");	13 "_StringBase can't be instaniated");

14 }	14 }

15	15

16 int get hashCode native "String_getHashCode";	16 int get hashCode native "String_getHashCode";

17	17

18 /**	18 /**

19 * Create the most efficient string representation for specified	19 * Create the most efficient string representation for specified

20 * [codePoints].	20 * [codePoints].

21 */	21 */

22 static String createFromCharCodes(List<int> charCodes) {	22 static String createFromCharCodes(List<int> codePoints) {

23 _ObjectArray objectArray;	23 _ObjectArray objectArray;

24 if (charCodes is _ObjectArray) {	24 if (codePoints is _ObjectArray) {

25 objectArray = charCodes;	25 objectArray = codePoints;

26 } else {	26 } else {

27 int len = charCodes.length;	27 int len = codePoints.length;

28 objectArray = new _ObjectArray(len);	28 objectArray = new _ObjectArray(len);

29 for (int i = 0; i < len; i++) {	29 for (int i = 0; i < len; i++) {

30 objectArray[i] = charCodes[i];	30 objectArray[i] = codePoints[i];

31 }	31 }

32 }	32 }

33 return _createFromCodePoints(objectArray);	33 return _createFromCodePoints(objectArray);

34 }	34 }

35	35

36 static String _createFromCodePoints(List<int> codePoints)	36 static String _createFromCodePoints(List<int> codePoints)

37 native "StringBase_createFromCodePoints";	37 native "StringBase_createFromCodePoints";

38	38

	39 static String createFromCodeUnits(List<int> codeUnits) {

	40 _ObjectArray objectArray;

	41 if (codeUnits is _ObjectArray) {

	42 objectArray = codeUnits;

	43 } else {

	44 int len = codeUnits.length;

	45 objectArray = new _ObjectArray(len);

	46 for (int i = 0; i < len; i++) {

	47 objectArray[i] = codeUnits[i];

	48 }

	49 }

	50 return _createFromCodeUnits(objectArray);

	51 }

	52

	53 static String _createFromCodeUnits(List<int> codeUnits)

	54 native "StringBase_createFromCodeUnits";

	55

39 String operator [](int index) native "String_charAt";	56 String operator [](int index) native "String_charAt";

40	57

41 int charCodeAt(int index) native "String_charCodeAt";	58 int charCodeAt(int index) native "String_charCodeAt";

42	59

	60 int codeUnitAt(int index) native "String_codeUnitAt";

	61

43 int get length native "String_getLength";	62 int get length native "String_getLength";

44	63

45 bool get isEmpty {	64 bool get isEmpty {

46 return this.length === 0;	65 return this.length === 0;

47 }	66 }

48	67

49 String concat(String other) native "String_concat";	68 String concat(String other) native "String_concat";

50	69

51 String toString() {	70 String toString() {

52 return this;	71 return this;

53 }	72 }

54	73

55 bool operator ==(Object other) {	74 bool operator ==(Object other) {

56 if (this === other) {	75 if (this === other) {

57 return true;	76 return true;

58 }	77 }

59 if ((other is !String) \|\|	78 if ((other is !String) \|\|

60 (this.length != other.length)) {	79 (this.length != other.length)) {

61 // TODO(5413632): Compare hash codes when both are present.	80 // TODO(5413632): Compare hash codes when both are present.

62 return false;	81 return false;

63 }	82 }

64 return this.compareTo(other) === 0;	83 return this.compareTo(other) === 0;

65 }	84 }

66	85

67 int compareTo(String other) {	86 int compareTo(String other) {

68 int thisLength = this.length;	87 int thisLength = this.length;

69 int otherLength = other.length;	88 int otherLength = other.length;

70 int len = (thisLength < otherLength) ? thisLength : otherLength;	89 int len = (thisLength < otherLength) ? thisLength : otherLength;

71 for (int i = 0; i < len; i++) {	90 for (int i = 0; i < len; i++) {

72 int thisCodePoint = this.charCodeAt(i);	91 int thisCodeUnit = this.codeUnitAt(i);

73 int otherCodePoint = other.charCodeAt(i);	92 int otherCodeUnit = other.codeUnitAt(i);

74 if (thisCodePoint < otherCodePoint) {	93 if (thisCodeUnit < otherCodeUnit) {

75 return -1;	94 return -1;

76 }	95 }

77 if (thisCodePoint > otherCodePoint) {	96 if (thisCodeUnit > otherCodeUnit) {

78 return 1;	97 return 1;

79 }	98 }

80 }	99 }

81 if (thisLength < otherLength) return -1;	100 if (thisLength < otherLength) return -1;

82 if (thisLength > otherLength) return 1;	101 if (thisLength > otherLength) return 1;

83 return 0;	102 return 0;

84 }	103 }

85	104

86 bool _substringMatches(int start, String other) {	105 bool _substringMatches(int start, String other) {

87 if (other.isEmpty) return true;	106 if (other.isEmpty) return true;

88 if ((start < 0) \|\| (start >= this.length)) {	107 if ((start < 0) \|\| (start >= this.length)) {

89 return false;	108 return false;

90 }	109 }

91 final int len = other.length;	110 final int len = other.length;

92 if ((start + len) > this.length) {	111 if ((start + len) > this.length) {

93 return false;	112 return false;

94 }	113 }

95 for (int i = 0; i < len; i++) {	114 for (int i = 0; i < len; i++) {

96 if (this.charCodeAt(i + start) != other.charCodeAt(i)) {	115 if (this.codeUnitAt(i + start) != other.codeUnitAt(i)) {

97 return false;	116 return false;

98 }	117 }

99 }	118 }

100 return true;	119 return true;

101 }	120 }

102	121

103 bool endsWith(String other) {	122 bool endsWith(String other) {

104 return _substringMatches(this.length - other.length, other);	123 return _substringMatches(this.length - other.length, other);

105 }	124 }

106	125

(...skipping 48 matching lines...)
155 return _substringUnchecked(startIndex, endIndex);	174 return _substringUnchecked(startIndex, endIndex);

156 }	175 }

157	176

158 String _substringUnchecked(int startIndex, int endIndex)	177 String _substringUnchecked(int startIndex, int endIndex)

159 native "StringBase_substringUnchecked";	178 native "StringBase_substringUnchecked";

160	179

161 String trim() {	180 String trim() {

162 final int len = this.length;	181 final int len = this.length;

163 int first = 0;	182 int first = 0;

164 for (; first < len; first++) {	183 for (; first < len; first++) {

165 if (!_isWhitespace(this.charCodeAt(first))) {	184 // There are no whitespace characters that are outside the BMP so we

	185 // can use code units here for efficiency.

	186 if (!_isWhitespace(this.codeUnitAt(first))) {

166 break;	187 break;

167 }	188 }

168 }	189 }

169 if (len == first) {	190 if (len == first) {

170 // String contains only whitespaces.	191 // String contains only whitespaces.

171 return "";	192 return "";

172 }	193 }

173 int last = len - 1;	194 int last = len - 1;

174 for (; last >= first; last--) {	195 for (; last >= first; last--) {

175 if (!_isWhitespace(this.charCodeAt(last))) {	196 if (!_isWhitespace(this.codeUnitAt(last))) {

176 break;	197 break;

177 }	198 }

178 }	199 }

179 if ((first == 0) && (last == (len - 1))) {	200 if ((first == 0) && (last == (len - 1))) {

180 // Returns this string if it does not have leading or trailing	201 // Returns this string if it does not have leading or trailing

181 // whitespaces.	202 // whitespaces.

182 return this;	203 return this;

183 } else {	204 } else {

184 return _substringUnchecked(first, last + 1);	205 return _substringUnchecked(first, last + 1);

185 }	206 }

(...skipping 100 matching lines...)
286 if (startIndex == endIndex && endIndex == previousIndex) {	307 if (startIndex == endIndex && endIndex == previousIndex) {

287 ++startIndex; // empty match, advance and restart	308 ++startIndex; // empty match, advance and restart

288 continue;	309 continue;

289 }	310 }

290 result.add(this.substring(previousIndex, match.start));	311 result.add(this.substring(previousIndex, match.start));

291 startIndex = previousIndex = endIndex;	312 startIndex = previousIndex = endIndex;

292 }	313 }

293 return result;	314 return result;

294 }	315 }

295	316

	317 // TODO(erikcorry): Fix this to use the new code point iterator when it is

	318 // available.

296 List<String> splitChars() {	319 List<String> splitChars() {

297 int len = this.length;	320 int len = this.length;

298 final result = new List<String>(len);	321 final result = new List<String>(len);

299 for (int i = 0; i < len; i++) {	322 int i, j;

300 result[i] = this[i];	323 for (i = j = 0; i < len; i++, j++) {

	324 int c = charCodeAt(i);

	325 // Check for non-basic plane character encoded as a UTF-16 surrogate pair.

	326 if (c > 0xffff) {
	floitsch 2012/11/08 15:28:21: Can't you use Utf16::IsSurrogate(c)?
	erikcorry 2012/11/15 13:28:25 (in reply to floitsch): No, that's a C++ function. I added some named constants to make this less ugly.
	327 i++;

	328 }

	329 result[j] = new String.fromCharCodes([c]);

301 }	330 }

302 return result;	331 if (i == j) return result;

	332 // If we saw some non-basic plane characters, then we have to return a

	333 // slightly smaller array than expected (we can't trim the original one

	334 // because it is non-extendable). This rarely happens so this is preferable

	335 // to having a separate pass over the string to count the code points.

	336 final newResult = new List<String>(j);

	337 for (i = 0; i < j; i++) newResult[i] = result[i];

	338 return newResult;

303 }	339 }

304	340

305 List<int> get charCodes {	341 List<int> get charCodes {

306 int len = this.length;	342 int len = this.length;

307 final result = new List<int>(len);	343 final result = new List<int>(len);

	344 int i, j;

	345 for (i = j = 0; i < len; i++, j++) {

	346 int c = this.charCodeAt(i);

	347 // Check for non-basic plane character encoded as a UTF-16 surrogate pair.

	348 if (c > 0xffff) {

	349 i++;

	350 }

	351 result[j] = c;

	352 }

	353 if (i == j) return result;

	354 // If we saw some non-basic plane characters, then we have to return a

	355 // slightly smaller array than expected (we can't trim the original one

	356 // because it is non-extendable). This rarely happens so this is preferable

	357 // to having a separate pass over the string to count the code points.

	358 final newResult = new List<int>(j);

	359 for (i = 0; i < j; i++) newResult[i] = result[i];

	360 return newResult;

	361 }

	362

	363 List<int> get codeUnits {

	364 int len = this.length;

	365 final result = new List<int>(len);

308 for (int i = 0; i < len; i++) {	366 for (int i = 0; i < len; i++) {

309 result[i] = this.charCodeAt(i);	367 result[i] = this.codeUnitAt(i);

310 }	368 }

311 return result;	369 return result;

312 }	370 }

313	371

314 String toUpperCase() native "String_toUpperCase";	372 String toUpperCase() native "String_toUpperCase";

315	373

316 String toLowerCase() native "String_toLowerCase";	374 String toLowerCase() native "String_toLowerCase";

317	375

318 // Implementations of Strings methods follow below.	376 // Implementations of Strings methods follow below.

319 static String join(List<String> strings, String separator) {	377 static String join(List<String> strings, String separator) {

(...skipping 33 matching lines...)
353 native "Strings_concatAll";	411 native "Strings_concatAll";

354 }	412 }

355	413

356	414

357 class _OneByteString extends _StringBase implements String {	415 class _OneByteString extends _StringBase implements String {

358 factory _OneByteString._uninstantiable() {	416 factory _OneByteString._uninstantiable() {

359 throw new UnsupportedError(	417 throw new UnsupportedError(

360 "_OneByteString can only be allocated by the VM");	418 "_OneByteString can only be allocated by the VM");

361 }	419 }

362	420

363 // Checks for one-byte whitespaces only.

364 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

365 // whitespaces for one byte strings.

366 bool _isWhitespace(int codePoint) {	421 bool _isWhitespace(int codePoint) {

367 return	422 return

368 (codePoint === 32) \|\| // Space.	423 (codePoint == 32) \|\| // Space.

	424 (codePoint == 0xa0) \|\| // No-break space.

369 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.	425 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

370 }	426 }

371	427

372 }	428 }

373	429

374	430

375 class _TwoByteString extends _StringBase implements String {	431 class _TwoByteString extends _StringBase implements String {

376 factory _TwoByteString._uninstantiable() {	432 factory _TwoByteString._uninstantiable() {

377 throw new UnsupportedError(	433 throw new UnsupportedError(

378 "_TwoByteString can only be allocated by the VM");	434 "_TwoByteString can only be allocated by the VM");

379 }	435 }

380	436

381 // Checks for one-byte whitespaces only.	437 // Works for both code points and code units since all spaces are in the BMP.

382 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

383 // whitespaces. Add checking for multi-byte whitespace codepoints.

384 bool _isWhitespace(int codePoint) {	438 bool _isWhitespace(int codePoint) {

385 return	439 return

386 (codePoint === 32) \|\| // Space.	440 (codePoint == 32) \|\| // Space.

387 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.	441 (codePoint == 0xa0) \|\| // No-break space.

	442 ((9 <= codePoint) && (codePoint <= 13)) \|\| // CR, LF, TAB, etc.

	443 (codePoint >= 0x1680 && // Optimization.

	444 (codePoint == 0x1680 \|\| // Ogham space mark.

	445 codePoint == 0x180e \|\| // Mongolian vowel separator.

	446 (codePoint >= 0x2000 && codePoint <= 0x200a) \|\| // Wide/narrow spaces.

	447 codePoint == 0x202f \|\| // Narrow no-break space.

	448 codePoint == 0x205f \|\| // Medium mathematical space.

	449 codePoint == 0x3000)); // Ideographic space.

388 }	450 }

389 }	451 }

390	452

391	453

	454 // TODO(erikcorry): This is going away.

392 class _FourByteString extends _StringBase implements String {	455 class _FourByteString extends _StringBase implements String {

393 factory _FourByteString._uninstantiable() {	456 factory _FourByteString._uninstantiable() {

394 throw new UnsupportedError(	457 throw new UnsupportedError(

395 "_FourByteString can only be allocated by the VM");	458 "_FourByteString can only be allocated by the VM");

396 }	459 }

397	460

398 // Checks for one-byte whitespaces only.	461 // Checks for one-byte whitespaces only.

399 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid	462 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

400 // whitespaces. Add checking for multi-byte whitespace codepoints.	463 // whitespaces. Add checking for multi-byte whitespace codepoints.

401 bool _isWhitespace(int codePoint) {	464 bool _isWhitespace(int codePoint) {

402 return	465 return

403 (codePoint === 32) \|\| // Space.	466 (codePoint === 32) \|\| // Space.

404 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.	467 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

405 }	468 }

406 }	469 }

407	470

408	471

409 class _ExternalOneByteString extends _StringBase implements String {	472 class _ExternalOneByteString extends _StringBase implements String {

410 factory _ExternalOneByteString._uninstantiable() {	473 factory _ExternalOneByteString._uninstantiable() {

411 throw new UnsupportedError(	474 throw new UnsupportedError(

412 "_ExternalOneByteString can only be allocated by the VM");	475 "_ExternalOneByteString can only be allocated by the VM");

413 }	476 }

414	477

415 // Checks for one-byte whitespaces only.

416 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

417 // whitespaces for one byte strings.

418 bool _isWhitespace(int codePoint) {	478 bool _isWhitespace(int codePoint) {

419 return	479 return

420 (codePoint === 32) \|\| // Space.	480 (codePoint == 32) \|\| // Space.

	481 (codePoint == 0xa0) \|\| // No-break space.

421 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.	482 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

422 }	483 }

423 }	484 }

424	485

425	486

426 class _ExternalTwoByteString extends _StringBase implements String {	487 class _ExternalTwoByteString extends _StringBase implements String {

427 factory ExternalTwoByteString._uninstantiable() {	488 factory ExternalTwoByteString._uninstantiable() {

428 throw new UnsupportedError(	489 throw new UnsupportedError(

429 "_ExternalTwoByteString can only be allocated by the VM");	490 "_ExternalTwoByteString can only be allocated by the VM");

430 }	491 }

431	492

432 // Checks for one-byte whitespaces only.	493 // Works for both code points and code units since all spaces are in the BMP.

433 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

434 // whitespaces. Add checking for multi-byte whitespace codepoints.

435 bool _isWhitespace(int codePoint) {	494 bool _isWhitespace(int codePoint) {

436 return	495 return

437 (codePoint === 32) \|\| // Space.	496 (codePoint == 32) \|\| // Space.

438 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.	497 (codePoint == 0xa0) \|\| // No-break space.

	498 ((9 <= codePoint) && (codePoint <= 13)) \|\| // CR, LF, TAB, etc.

	499 (codePoint >= 0x1680 && // Optimization.

	500 (codePoint == 0x1680 \|\| // Ogham space mark.

	501 codePoint == 0x180e \|\| // Mongolian vowel separator.

	502 (codePoint >= 0x2000 && codePoint <= 0x200a) \|\| // Wide/narrow spaces.

	503 codePoint == 0x202f \|\| // Narrow no-break space.

	504 codePoint == 0x205f \|\| // Medium mathematical space.

	505 codePoint == 0x3000)); // Ideographic space.

439 }	506 }

440 }	507 }

441	508

442	509

	510 // TODO(erikcorry): This is going away.

443 class _ExternalFourByteString extends _StringBase implements String {	511 class _ExternalFourByteString extends _StringBase implements String {

444 factory _ExternalFourByteString._uninstantiable() {	512 factory _ExternalFourByteString._uninstantiable() {

445 throw new UnsupportedError(	513 throw new UnsupportedError(

446 "ExternalFourByteString can only be allocated by the VM");	514 "ExternalFourByteString can only be allocated by the VM");

447 }	515 }

448	516

449 // Checks for one-byte whitespaces only.	517 // Checks for one-byte whitespaces only.

450 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid	518 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

451 // whitespaces. Add checking for multi-byte whitespace codepoints.	519 // whitespaces. Add checking for multi-byte whitespace codepoints.

452 bool _isWhitespace(int codePoint) {	520 bool _isWhitespace(int codePoint) {

(...skipping 25 matching lines...)
478 for (int g in groups) {	546 for (int g in groups) {

479 result.add(group(g));	547 result.add(group(g));

480 }	548 }

481 return result;	549 return result;

482 }	550 }

483	551

484 final int start;	552 final int start;

485 final String str;	553 final String str;

486 final String pattern;	554 final String pattern;

487 }	555 }

OLD	NEW

« runtime/lib/string.cc ('K') | « runtime/lib/string.cc ('k') | runtime/lib/string_patch.dart » ('j') | runtime/vm/object.h » ('J')