runtime/lib/string_base.dart - Issue 11368138: Add some support for the code-point code-unit distinction.

Side by Side Diff: runtime/lib/string_base.dart

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Implemented feedback from patch set 3 Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 /**	5 /**

6 * [_StringBase] contains common methods used by concrete String	6 * [_StringBase] contains common methods used by concrete String

7 * implementations, e.g., _OneByteString.	7 * implementations, e.g., _OneByteString.

8 */	8 */

9 class _StringBase {	9 class _StringBase {

10	10

11 factory _StringBase._uninstantiable() {	11 factory _StringBase._uninstantiable() {

12 throw new UnsupportedError(	12 throw new UnsupportedError(

13 "_StringBase can't be instaniated");	13 "_StringBase can't be instaniated");

14 }	14 }

15	15

16 int get hashCode native "String_getHashCode";	16 int get hashCode native "String_getHashCode";

17	17

18 /**	18 /**

19 * Create the most efficient string representation for specified	19 * Create the most efficient string representation for the specified UTF-16

20 * [codePoints].	20 * [codeUnits].

21 */	21 */

22 static String createFromCharCodes(List<int> charCodes) {	22 static String createFromUtf16(List<int> codeUnits) {

23 _ObjectArray objectArray;	23 _ObjectArray objectArray;

24 if (charCodes is _ObjectArray) {	24 if (codeUnits is _ObjectArray) {

25 objectArray = charCodes;	25 objectArray = codeUnits;

26 } else {	26 } else {

27 int len = charCodes.length;	27 int len = codeUnits.length;

28 objectArray = new _ObjectArray(len);	28 objectArray = new _ObjectArray(len);

29 for (int i = 0; i < len; i++) {	29 for (int i = 0; i < len; i++) {

30 objectArray[i] = charCodes[i];	30 objectArray[i] = codeUnits[i];

31 }	31 }

32 }	32 }

33 return _createFromCodePoints(objectArray);	33 return _createFromUtf16(objectArray);

34 }	34 }

35	35

36 static String _createFromCodePoints(List<int> codePoints)	36 static String _createFromUtf16(List<int> codeUnits)

37 native "StringBase_createFromCodePoints";	37 native "StringBase_createFromUtf16";

38	38

39 String operator [](int index) native "String_charAt";	39 String operator [](int index) native "String_charAt";

40	40

41 int charCodeAt(int index) native "String_charCodeAt";	41 int codeUnitAt(int index) native "String_codeUnitAt";

42	42

43 int get length native "String_getLength";	43 int get length native "String_getLength";

44	44

45 bool get isEmpty {	45 bool get isEmpty {

46 return this.length == 0;	46 return this.length == 0;

47 }	47 }

48	48

49 String concat(String other) native "String_concat";	49 String concat(String other) native "String_concat";

50	50

51 String toString() {	51 String toString() {

(...skipping 10 matching lines...) Expand all Loading...
62 return false;	62 return false;

63 }	63 }

64 return this.compareTo(other) == 0;	64 return this.compareTo(other) == 0;

65 }	65 }

66	66

67 int compareTo(String other) {	67 int compareTo(String other) {

68 int thisLength = this.length;	68 int thisLength = this.length;

69 int otherLength = other.length;	69 int otherLength = other.length;

70 int len = (thisLength < otherLength) ? thisLength : otherLength;	70 int len = (thisLength < otherLength) ? thisLength : otherLength;

71 for (int i = 0; i < len; i++) {	71 for (int i = 0; i < len; i++) {

72 int thisCodePoint = this.charCodeAt(i);	72 int thisCodeUnit = this.codeUnitAt(i);

73 int otherCodePoint = other.charCodeAt(i);	73 int otherCodeUnit = other.codeUnitAt(i);

74 if (thisCodePoint < otherCodePoint) {	74 if (thisCodeUnit < otherCodeUnit) {

75 return -1;	75 return -1;

76 }	76 }

77 if (thisCodePoint > otherCodePoint) {	77 if (thisCodeUnit > otherCodeUnit) {

78 return 1;	78 return 1;

79 }	79 }

80 }	80 }

81 if (thisLength < otherLength) return -1;	81 if (thisLength < otherLength) return -1;

82 if (thisLength > otherLength) return 1;	82 if (thisLength > otherLength) return 1;

83 return 0;	83 return 0;

84 }	84 }

85	85

86 bool _substringMatches(int start, String other) {	86 bool _substringMatches(int start, String other) {

87 if (other.isEmpty) return true;	87 if (other.isEmpty) return true;

88 if ((start < 0) || (start >= this.length)) {	88 if ((start < 0) || (start >= this.length)) {

89 return false;	89 return false;

90 }	90 }

91 final int len = other.length;	91 final int len = other.length;

92 if ((start + len) > this.length) {	92 if ((start + len) > this.length) {

93 return false;	93 return false;

94 }	94 }

95 for (int i = 0; i < len; i++) {	95 for (int i = 0; i < len; i++) {

96 if (this.charCodeAt(i + start) != other.charCodeAt(i)) {	96 if (this.codeUnitAt(i + start) != other.codeUnitAt(i)) {

97 return false;	97 return false;

98 }	98 }

99 }	99 }

100 return true;	100 return true;

101 }	101 }

102	102

103 bool endsWith(String other) {	103 bool endsWith(String other) {

104 return _substringMatches(this.length - other.length, other);	104 return _substringMatches(this.length - other.length, other);

105 }	105 }

106	106

(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
155 return _substringUnchecked(startIndex, endIndex);	155 return _substringUnchecked(startIndex, endIndex);

156 }	156 }

157	157

158 String _substringUnchecked(int startIndex, int endIndex)	158 String _substringUnchecked(int startIndex, int endIndex)

159 native "StringBase_substringUnchecked";	159 native "StringBase_substringUnchecked";

160	160

161 String trim() {	161 String trim() {

162 final int len = this.length;	162 final int len = this.length;

163 int first = 0;	163 int first = 0;

164 for (; first < len; first++) {	164 for (; first < len; first++) {

165 if (!_isWhitespace(this.charCodeAt(first))) {	165 // There are no whitespace characters that are outside the BMP so we

	166 // can use code units here for efficiency.

	167 if (!_isWhitespace(this.codeUnitAt(first))) {

166 break;	168 break;

167 }	169 }

168 }	170 }

169 if (len == first) {	171 if (len == first) {

170 // String contains only whitespaces.	172 // String contains only whitespaces.

171 return "";	173 return "";

172 }	174 }

173 int last = len - 1;	175 int last = len - 1;

174 for (; last >= first; last--) {	176 for (; last >= first; last--) {

175 if (!_isWhitespace(this.charCodeAt(last))) {	177 if (!_isWhitespace(this.codeUnitAt(last))) {

176 break;	178 break;

177 }	179 }

178 }	180 }

179 if ((first == 0) && (last == (len - 1))) {	181 if ((first == 0) && (last == (len - 1))) {

180 // Returns this string if it does not have leading or trailing	182 // Returns this string if it does not have leading or trailing

181 // whitespaces.	183 // whitespaces.

182 return this;	184 return this;

183 } else {	185 } else {

184 return _substringUnchecked(first, last + 1);	186 return _substringUnchecked(first, last + 1);

185 }	187 }

(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
286 if (startIndex == endIndex && endIndex == previousIndex) {	288 if (startIndex == endIndex && endIndex == previousIndex) {

287 ++startIndex; // empty match, advance and restart	289 ++startIndex; // empty match, advance and restart

288 continue;	290 continue;

289 }	291 }

290 result.add(this.substring(previousIndex, match.start));	292 result.add(this.substring(previousIndex, match.start));

291 startIndex = previousIndex = endIndex;	293 startIndex = previousIndex = endIndex;

292 }	294 }

293 return result;	295 return result;

294 }	296 }

295	297

	298 // TODO(erikcorry): Fix this to use the new code point iterator when it is

	299 // available.

296 List<String> splitChars() {	300 List<String> splitChars() {

297 int len = this.length;	301 int len = this.length;

298 final result = new List<String>(len);	302 final result = new List<String>(len);

	303 bool smpCharacterSeen = false;

	304 int i, j;

	305 for (i = j = 0; i < len; i++, j++) {

	306 int c = charCodeAt(i);

	307 // Check for non-basic plane character encoded as a UTF-16 surrogate pair.

	308 if (c >= String.SMP_CODE_POINT_BASE) {

	309 i++;

	310 smpCharacterSeen = true;

	311 }

	312 result[j] = new String.fromCharCodes([c]);

	313 }

	314 if (!smpCharacterSeen) return result;

	315 // If we saw some non-basic plane characters, then we have to return a

	316 // slightly smaller array than expected (we can't trim the original one

	317 // because it is non-extendable). This rarely happens so this is preferable

	318 // to having a separate pass over the string to count the code points.

	319 return result.getRange(0, j);

	320 }

	321

	322 List<int> get codeUnits {

	323 int len = this.length;

	324 final result = new List<int>(len);

299 for (int i = 0; i < len; i++) {	325 for (int i = 0; i < len; i++) {

300 result[i] = this[i];	326 result[i] = this.codeUnitAt(i);

301 }	327 }

302 return result;	328 return result;

303 }	329 }

304

305 List<int> get charCodes {

306 int len = this.length;

307 final result = new List<int>(len);

308 for (int i = 0; i < len; i++) {

309 result[i] = this.charCodeAt(i);

310 }

311 return result;

312 }

313	330

314 String toUpperCase() native "String_toUpperCase";	331 String toUpperCase() native "String_toUpperCase";

315	332

316 String toLowerCase() native "String_toLowerCase";	333 String toLowerCase() native "String_toLowerCase";

317	334

318 // Implementations of Strings methods follow below.	335 // Implementations of Strings methods follow below.

319 static String join(List<String> strings, String separator) {	336 static String join(List<String> strings, String separator) {

320 final int length = strings.length;	337 final int length = strings.length;

321 if (length == 0) {	338 if (length == 0) {

322 return "";	339 return "";

(...skipping 30 matching lines...) Expand all Loading...
353 native "Strings_concatAll";	370 native "Strings_concatAll";

354 }	371 }

355	372

356	373

357 class _OneByteString extends _StringBase implements String {	374 class _OneByteString extends _StringBase implements String {

358 factory _OneByteString._uninstantiable() {	375 factory _OneByteString._uninstantiable() {

359 throw new UnsupportedError(	376 throw new UnsupportedError(

360 "_OneByteString can only be allocated by the VM");	377 "_OneByteString can only be allocated by the VM");

361 }	378 }

362	379

363 // Checks for one-byte whitespaces only.

364 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

365 // whitespaces for one byte strings.

366 bool _isWhitespace(int codePoint) {	380 bool _isWhitespace(int codePoint) {

367 return	381 return

368 (codePoint == 32) || // Space.	382 (codePoint == 32) || // Space.

	383 (codePoint == 0xa0) || // No-break space.

369 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.	384 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

370 }	385 }

371	386

	387 int charCodeAt(int index) => codeUnitAt(index);

	388

	389 List<int> get charCodes => codeUnits;

372 }	390 }

373	391

374	392

375 class _TwoByteString extends _StringBase implements String {	393 class _TwoByteStringBase extends _StringBase {

	394 factory _TwoByteStringBase._uninstantiable() {

	395 throw new UnsupportedError(

	396 "_TwoByteStringBase can't be instaniated");

	397 }

	398

	399 // Works for both code points and code units since all spaces are in the BMP.

	400 bool _isWhitespace(int codePoint) {

	401 return

	402 (codePoint == 32) || // Space.

	403 (codePoint == 0xa0) || // No-break space.

	404 ((9 <= codePoint) && (codePoint <= 13)) || // CR, LF, TAB, etc.

	405 (codePoint >= 0x1680 && // Optimization.

	406 (codePoint == 0x1680 || // Ogham space mark.

	407 codePoint == 0x180e || // Mongolian vowel separator.

	408 (codePoint >= 0x2000 && codePoint <= 0x200a) || // Wide/narrow spaces.

	409 codePoint == 0x2028 || // Line separator.

	410 codePoint == 0x2029 || // Paragraph separator.

	411 codePoint == 0x202f || // Narrow no-break space.

	412 codePoint == 0x205f || // Medium mathematical space.

	413 codePoint == 0x3000 || // Ideographic space.

	414 codePoint == 0xfeff)); // BOM code.

	415 }

	416

	417 int charCodeAt(int index) {

	418 const int LEAD_SURROGATE_BASE = 0xd800;

	419 const int LEAD_SURROGATE_END = 0xdbff;

	420 const int TRAIL_SURROGATE_BASE = 0xdc00;

	421 const int TRAIL_SURROGATE_END = 0xdfff;

	422 const int MASK = 0x3ff;

	423 int code = codeUnitAt(index);

	424 if (code < LEAD_SURROGATE_BASE || code > LEAD_SURROGATE_END) return code;

	425 if (index + 1 >= length) return code;

	426 int trail = codeUnitAt(index + 1);

	427 if (trail < TRAIL_SURROGATE_BASE || trail > TRAIL_SURROGATE_END) {

	428 return code;

	429 }

	430 return String.SMP_CODE_POINT_BASE + ((code & MASK) << 10) + (trail & MASK);

	431 }

	432

	433 // TODO(erikcorry): Fix this to use the new code point iterator when it is

	434 // available.

	435 List<int> get charCodes {

	436 int len = this.length;

	437 final result = new List<int>(len);

	438 bool smpCharacterSeen = false;

	439 int i, j;

	440 for (i = j = 0; i < len; i++, j++) {

	441 int c = this.charCodeAt(i);

	442 // Check for supplementary plane character encoded as a UTF-16 surrogate

	443 // pair.

	444 if (c >= String.SMP_CODE_POINT_BASE) {

	445 i++;

	446 smpCharacterSeen = true;

	447 }

	448 result[j] = c;

	449 }

	450 if (!smpCharacterSeen) return result;

	451 // If we saw some non-basic plane characters, then we have to return a

	452 // slightly smaller array than expected (we can't trim the original one

	453 // because it is non-extendable). This rarely happens so this is preferable

	454 // to having a separate pass over the string to count the code points.

	455 return result.getRange(0, j);

	456 }

	457 }

	458

	459

	460 class _TwoByteString extends _TwoByteStringBase implements String {

376 factory _TwoByteString._uninstantiable() {	461 factory _TwoByteString._uninstantiable() {

377 throw new UnsupportedError(	462 throw new UnsupportedError(

378 "_TwoByteString can only be allocated by the VM");	463 "_TwoByteString can only be allocated by the VM");

379 }	464 }

380

381 // Checks for one-byte whitespaces only.

382 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

383 // whitespaces. Add checking for multi-byte whitespace codepoints.

384 bool _isWhitespace(int codePoint) {

385 return

386 (codePoint == 32) || // Space.

387 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

388 }

389 }	465 }

390	466

391

392 class _FourByteString extends _StringBase implements String {

393 factory _FourByteString._uninstantiable() {

394 throw new UnsupportedError(

395 "_FourByteString can only be allocated by the VM");

396 }

397

398 // Checks for one-byte whitespaces only.

399 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

400 // whitespaces. Add checking for multi-byte whitespace codepoints.

401 bool _isWhitespace(int codePoint) {

402 return

403 (codePoint == 32) || // Space.

404 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

405 }

406 }

407

408	467

409 class _ExternalOneByteString extends _StringBase implements String {	468 class _ExternalOneByteString extends _StringBase implements String {

410 factory _ExternalOneByteString._uninstantiable() {	469 factory _ExternalOneByteString._uninstantiable() {

411 throw new UnsupportedError(	470 throw new UnsupportedError(

412 "_ExternalOneByteString can only be allocated by the VM");	471 "_ExternalOneByteString can only be allocated by the VM");

413 }	472 }

414	473

415 // Checks for one-byte whitespaces only.

416 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

417 // whitespaces for one byte strings.

418 bool _isWhitespace(int codePoint) {	474 bool _isWhitespace(int codePoint) {

419 return	475 return

420 (codePoint == 32) || // Space.	476 (codePoint == 32) || // Space.

	477 (codePoint == 0xa0) || // No-break space.

421 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.	478 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

422 }	479 }

	480

	481 int charCodeAt(int index) => codeUnitAt(index);

	482

	483 List<int> get charCodes => codeUnits;

423 }	484 }

424	485

425	486

426 class _ExternalTwoByteString extends _StringBase implements String {	487 class _ExternalTwoByteString extends _TwoByteStringBase implements String {

427 factory _ExternalTwoByteString._uninstantiable() {	488 factory _ExternalTwoByteString._uninstantiable() {

428 throw new UnsupportedError(	489 throw new UnsupportedError(

429 "_ExternalTwoByteString can only be allocated by the VM");	490 "_ExternalTwoByteString can only be allocated by the VM");

430 }	491 }

431

432 // Checks for one-byte whitespaces only.

433 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

434 // whitespaces. Add checking for multi-byte whitespace codepoints.

435 bool _isWhitespace(int codePoint) {

436 return

437 (codePoint == 32) || // Space.

438 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

439 }

440 }	492 }

441	493

442

443 class _ExternalFourByteString extends _StringBase implements String {

444 factory _ExternalFourByteString._uninstantiable() {

445 throw new UnsupportedError(

446 "ExternalFourByteString can only be allocated by the VM");

447 }

448

449 // Checks for one-byte whitespaces only.

450 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

451 // whitespaces. Add checking for multi-byte whitespace codepoints.

452 bool _isWhitespace(int codePoint) {

453 return

454 (codePoint == 32) || // Space.

455 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

456 }

457 }

458

459	494

460 class _StringMatch implements Match {	495 class _StringMatch implements Match {

461 const _StringMatch(int this.start,	496 const _StringMatch(int this.start,

462 String this.str,	497 String this.str,

463 String this.pattern);	498 String this.pattern);

464	499

465 int get end => start + pattern.length;	500 int get end => start + pattern.length;

466 String operator[](int g) => group(g);	501 String operator[](int g) => group(g);

467 int get groupCount => 0;	502 int get groupCount => 0;

468	503

469 String group(int group) {	504 String group(int group) {

470 if (group != 0) {	505 if (group != 0) {

471 throw new RangeError.value(group);	506 throw new RangeError.value(group);

472 }	507 }

473 return pattern;	508 return pattern;

474 }	509 }

475	510

476 List<String> groups(List<int> groups) {	511 List<String> groups(List<int> groups) {

477 List<String> result = new List<String>();	512 List<String> result = new List<String>();

478 for (int g in groups) {	513 for (int g in groups) {

479 result.add(group(g));	514 result.add(group(g));

480 }	515 }

481 return result;	516 return result;

482 }	517 }

483	518

484 final int start;	519 final int start;

485 final String str;	520 final String str;

486 final String pattern;	521 final String pattern;

487 }	522 }

OLD	NEW

« no previous file with comments | « runtime/lib/string.cc ('k') | runtime/lib/string_patch.dart » ('j') | runtime/vm/scanner.cc » ('J')