runtime/lib/string_base.dart - Issue 11368138: Add some support for the code-point code-unit distinction.

Side by Side Diff: runtime/lib/string_base.dart

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Implemented feedback from patch set 2. Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 /**	5 /**

6 * [_StringBase] contains common methods used by concrete String	6 * [_StringBase] contains common methods used by concrete String

7 * implementations, e.g., _OneByteString.	7 * implementations, e.g., _OneByteString.

8 */	8 */

9 class _StringBase {	9 class _StringBase {

10	10

11 factory _StringBase._uninstantiable() {	11 factory _StringBase._uninstantiable() {

12 throw new UnsupportedError(	12 throw new UnsupportedError(

13 "_StringBase can't be instaniated");	13 "_StringBase can't be instaniated");

14 }	14 }

15	15

16 int get hashCode native "String_getHashCode";	16 int get hashCode native "String_getHashCode";

17	17

18 /**	18 static String createFromUtf16(List<int> codeUnits) {
	siva 2012/11/16 22:32:04: The comment "Create the .... for specified UTF-16 code units" would still be valid, right? erikcorry 2012/11/19 12:40:41: Done.
19 * Create the most efficient string representation for specified

20 * [codePoints].

21 */

22 static String createFromCharCodes(List<int> charCodes) {

23 _ObjectArray objectArray;	19 _ObjectArray objectArray;

24 if (charCodes is _ObjectArray) {	20 if (codeUnits is _ObjectArray) {

25 objectArray = charCodes;	21 objectArray = codeUnits;

26 } else {	22 } else {

27 int len = charCodes.length;	23 int len = codeUnits.length;

28 objectArray = new _ObjectArray(len);	24 objectArray = new _ObjectArray(len);

29 for (int i = 0; i < len; i++) {	25 for (int i = 0; i < len; i++) {

30 objectArray[i] = charCodes[i];	26 objectArray[i] = codeUnits[i];

31 }	27 }

32 }	28 }

33 return _createFromCodePoints(objectArray);	29 return _createFromUtf16(objectArray);

34 }	30 }

35	31

36 static String _createFromCodePoints(List<int> codePoints)	32 static String _createFromUtf16(List<int> codeUnits)

37 native "StringBase_createFromCodePoints";	33 native "StringBase_createFromUtf16";

38	34

39 String operator [](int index) native "String_charAt";	35 String operator [](int index) native "String_charAt";

40	36

41 int charCodeAt(int index) native "String_charCodeAt";	37 int codeUnitAt(int index) native "String_codeUnitAt";

42	38

43 int get length native "String_getLength";	39 int get length native "String_getLength";

44	40

45 bool get isEmpty {	41 bool get isEmpty {

46 return this.length == 0;	42 return this.length == 0;

47 }	43 }

48	44

49 String concat(String other) native "String_concat";	45 String concat(String other) native "String_concat";

50	46

51 String toString() {	47 String toString() {

(...skipping 10 matching lines...) Expand all Loading...
62 return false;	58 return false;

63 }	59 }

64 return this.compareTo(other) == 0;	60 return this.compareTo(other) == 0;

65 }	61 }

66	62

67 int compareTo(String other) {	63 int compareTo(String other) {

68 int thisLength = this.length;	64 int thisLength = this.length;

69 int otherLength = other.length;	65 int otherLength = other.length;

70 int len = (thisLength < otherLength) ? thisLength : otherLength;	66 int len = (thisLength < otherLength) ? thisLength : otherLength;

71 for (int i = 0; i < len; i++) {	67 for (int i = 0; i < len; i++) {

72 int thisCodePoint = this.charCodeAt(i);	68 int thisCodeUnit = this.codeUnitAt(i);

73 int otherCodePoint = other.charCodeAt(i);	69 int otherCodeUnit = other.codeUnitAt(i);

74 if (thisCodePoint < otherCodePoint) {	70 if (thisCodeUnit < otherCodeUnit) {

75 return -1;	71 return -1;

76 }	72 }

77 if (thisCodePoint > otherCodePoint) {	73 if (thisCodeUnit > otherCodeUnit) {

78 return 1;	74 return 1;

79 }	75 }

80 }	76 }

81 if (thisLength < otherLength) return -1;	77 if (thisLength < otherLength) return -1;

82 if (thisLength > otherLength) return 1;	78 if (thisLength > otherLength) return 1;

83 return 0;	79 return 0;

84 }	80 }

85	81

86 bool _substringMatches(int start, String other) {	82 bool _substringMatches(int start, String other) {

87 if (other.isEmpty) return true;	83 if (other.isEmpty) return true;

88 if ((start < 0) \|\| (start >= this.length)) {	84 if ((start < 0) \|\| (start >= this.length)) {

89 return false;	85 return false;

90 }	86 }

91 final int len = other.length;	87 final int len = other.length;

92 if ((start + len) > this.length) {	88 if ((start + len) > this.length) {

93 return false;	89 return false;

94 }	90 }

95 for (int i = 0; i < len; i++) {	91 for (int i = 0; i < len; i++) {

96 if (this.charCodeAt(i + start) != other.charCodeAt(i)) {	92 if (this.codeUnitAt(i + start) != other.codeUnitAt(i)) {

97 return false;	93 return false;

98 }	94 }

99 }	95 }

100 return true;	96 return true;

101 }	97 }

102	98

103 bool endsWith(String other) {	99 bool endsWith(String other) {

104 return _substringMatches(this.length - other.length, other);	100 return _substringMatches(this.length - other.length, other);

105 }	101 }

106	102

(...skipping 48 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
155 return _substringUnchecked(startIndex, endIndex);	151 return _substringUnchecked(startIndex, endIndex);

156 }	152 }

157	153

158 String _substringUnchecked(int startIndex, int endIndex)	154 String _substringUnchecked(int startIndex, int endIndex)

159 native "StringBase_substringUnchecked";	155 native "StringBase_substringUnchecked";

160	156

161 String trim() {	157 String trim() {

162 final int len = this.length;	158 final int len = this.length;

163 int first = 0;	159 int first = 0;

164 for (; first < len; first++) {	160 for (; first < len; first++) {

165 if (!_isWhitespace(this.charCodeAt(first))) {	161 // There are no whitespace characters that are outside the BMP so we

	162 // can use code units here for efficiency.

	163 if (!_isWhitespace(this.codeUnitAt(first))) {

166 break;	164 break;

167 }	165 }

168 }	166 }

169 if (len == first) {	167 if (len == first) {

170 // String contains only whitespaces.	168 // String contains only whitespaces.

171 return "";	169 return "";

172 }	170 }

173 int last = len - 1;	171 int last = len - 1;

174 for (; last >= first; last--) {	172 for (; last >= first; last--) {

175 if (!_isWhitespace(this.charCodeAt(last))) {	173 if (!_isWhitespace(this.codeUnitAt(last))) {

176 break;	174 break;

177 }	175 }

178 }	176 }

179 if ((first == 0) && (last == (len - 1))) {	177 if ((first == 0) && (last == (len - 1))) {

180 // Returns this string if it does not have leading or trailing	178 // Returns this string if it does not have leading or trailing

181 // whitespaces.	179 // whitespaces.

182 return this;	180 return this;

183 } else {	181 } else {

184 return _substringUnchecked(first, last + 1);	182 return _substringUnchecked(first, last + 1);

185 }	183 }

(...skipping 100 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
286 if (startIndex == endIndex && endIndex == previousIndex) {	284 if (startIndex == endIndex && endIndex == previousIndex) {

287 ++startIndex; // empty match, advance and restart	285 ++startIndex; // empty match, advance and restart

288 continue;	286 continue;

289 }	287 }

290 result.add(this.substring(previousIndex, match.start));	288 result.add(this.substring(previousIndex, match.start));

291 startIndex = previousIndex = endIndex;	289 startIndex = previousIndex = endIndex;

292 }	290 }

293 return result;	291 return result;

294 }	292 }

295	293

	294 // TODO(erikcorry): Fix this to use the new code point iterator when it is

	295 // available.

296 List<String> splitChars() {	296 List<String> splitChars() {

297 int len = this.length;	297 int len = this.length;

298 final result = new List<String>(len);	298 final result = new List<String>(len);

	299 int i, j;

	300 for (i = j = 0; i < len; i++, j++) {

	301 int c = charCodeAt(i);

	302 // Check for non-basic plane character encoded as a UTF-16 surrogate pair.

	303 if (c >= String.SMP_CODE_POINT_BASE) {

	304 i++;

	305 }

	306 result[j] = new String.fromCharCodes([c]);

	307 }

	308 if (i == j) return result;

	309 // If we saw some non-basic plane characters, then we have to return a

	310 // slightly smaller array than expected (we can't trim the original one

	311 // because it is non-extendable). This rarely happens so this is preferable

	312 // to having a separate pass over the string to count the code points.

	313 final newResult = new List<String>(j);

	314 for (i = 0; i < j; i++) newResult[i] = result[i];

	315 return newResult;
	siva 2012/11/16 22:32:04: This piece of code is repeated 3 times in this CL; why not abstract it out? erikcorry 2012/11/19 12:40:41: One copy removed, last three lines changed to a getRange call.
	316 }

	317

	318 List<int> get codeUnits {

	319 int len = this.length;

	320 final result = new List<int>(len);

299 for (int i = 0; i < len; i++) {	321 for (int i = 0; i < len; i++) {

300 result[i] = this[i];	322 result[i] = this.codeUnitAt(i);

301 }	323 }

302 return result;	324 return result;

303 }	325 }

304

305 List<int> get charCodes {

306 int len = this.length;

307 final result = new List<int>(len);

308 for (int i = 0; i < len; i++) {

309 result[i] = this.charCodeAt(i);

310 }

311 return result;

312 }

313	326

314 String toUpperCase() native "String_toUpperCase";	327 String toUpperCase() native "String_toUpperCase";

315	328

316 String toLowerCase() native "String_toLowerCase";	329 String toLowerCase() native "String_toLowerCase";

317	330

318 // Implementations of Strings methods follow below.	331 // Implementations of Strings methods follow below.

319 static String join(List<String> strings, String separator) {	332 static String join(List<String> strings, String separator) {

320 final int length = strings.length;	333 final int length = strings.length;

321 if (length == 0) {	334 if (length == 0) {

322 return "";	335 return "";

(...skipping 30 matching lines...) Expand all Loading...
353 native "Strings_concatAll";	366 native "Strings_concatAll";

354 }	367 }

355	368

356	369

357 class _OneByteString extends _StringBase implements String {	370 class _OneByteString extends _StringBase implements String {

358 factory _OneByteString._uninstantiable() {	371 factory _OneByteString._uninstantiable() {

359 throw new UnsupportedError(	372 throw new UnsupportedError(

360 "_OneByteString can only be allocated by the VM");	373 "_OneByteString can only be allocated by the VM");

361 }	374 }

362	375

363 // Checks for one-byte whitespaces only.

364 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

365 // whitespaces for one byte strings.

366 bool _isWhitespace(int codePoint) {	376 bool _isWhitespace(int codePoint) {

367 return	377 return

368 (codePoint == 32) \|\| // Space.	378 (codePoint == 32) \|\| // Space.

	379 (codePoint == 0xa0) \|\| // No-break space.

369 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.	380 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

370 }	381 }

371	382

	383 int charCodeAt(int index) => codeUnitAt(index);

	384

	385 List<int> get charCodes => codeUnits;

372 }	386 }

373	387

374	388

375 class _TwoByteString extends _StringBase implements String {	389 class _TwoByteStringBase extends _StringBase {
	siva 2012/11/16 22:32:04: Maybe add a TODO here to get rid of this class and use mixins instead once they are implemented in Dart. erikcorry 2012/11/19 12:40:41: I don't see why we would want to do that.
	390 factory _TwoByteStringBase._uninstantiable() {

	391 throw new UnsupportedError(

	392 "_TwoByteStringBase can't be instaniated");

	393 }

	394

	395 // Works for both code points and code units since all spaces are in the BMP.

	396 bool _isWhitespace(int codePoint) {

	397 return

	398 (codePoint == 32) \|\| // Space.

	399 (codePoint == 0xa0) \|\| // No-break space.

	400 ((9 <= codePoint) && (codePoint <= 13)) \|\| // CR, LF, TAB, etc.

	401 (codePoint >= 0x1680 && // Optimization.

	402 (codePoint == 0x1680 \|\| // Ogham space mark.

	403 codePoint == 0x180e \|\| // Mongolian vowel separator.

	404 (codePoint >= 0x2000 && codePoint <= 0x200a) \|\| // Wide/narrow spaces.

	405 codePoint == 0x2028 \|\| // Line separator.

	406 codePoint == 0x2029 \|\| // Paragraph separator.

	407 codePoint == 0x202f \|\| // Narrow no-break space.

	408 codePoint == 0x205f \|\| // Medium mathematical space.

	409 codePoint == 0x3000 \|\| // Ideographic space.

	410 codePoint == 0xfeff)); // BOM code.

	411 }

	412

	413 int charCodeAt(int index) {

	414 const int LEAD_SURROGATE_BASE = 0xd800;

	415 const int LEAD_SURROGATE_END = 0xdbff;

	416 const int TRAIL_SURROGATE_BASE = 0xdc00;

	417 const int TRAIL_SURROGATE_END = 0xdfff;

	418 const int MASK = 0x3ff;

	419 int code = codeUnitAt(index);

	420 if (code < LEAD_SURROGATE_BASE \|\| code > LEAD_SURROGATE_END) return code;

	421 if (index + 1 >= length) return code;

	422 int trail = codeUnitAt(index + 1);

	423 if (trail < TRAIL_SURROGATE_BASE \|\| trail > TRAIL_SURROGATE_END) {

	424 return code;

	425 }

	426 return String.SMP_CODE_POINT_BASE + ((code & MASK) << 10) + (trail & MASK);

	427 }

	428

	429 List<int> get charCodes {
	siva 2012/11/16 22:32:04: TODO: fix this to use the new code point iterator when it is available? erikcorry 2012/11/19 12:40:41: Done.
	430 int len = this.length;

	431 final result = new List<int>(len);

	432 int i, j;

	433 for (i = j = 0; i < len; i++, j++) {

	434 int c = this.charCodeAt(i);

	435 // Check for supplementary plane character encoded as a UTF-16 surrogate

	436 // pair.

	437 if (c >= String.SMP_CODE_POINT_BASE) {

	438 i++;

	439 }

	440 result[j] = c;

	441 }

	442 if (i == j) return result;
	siva 2012/11/16 22:32:04: I find the (i == j) condition here a little unreadable; why not introduce a boolean variable non_bmp_char_seen and test for that? The same comment also holds for the splitChars case. erikcorry 2012/11/19 12:40:41: Done.
	443 // If we saw some non-basic plane characters, then we have to return a

	444 // slightly smaller array than expected (we can't trim the original one

	445 // because it is non-extendable). This rarely happens so this is preferable

	446 // to having a separate pass over the string to count the code points.

	447 final newResult = new List<int>(j);

	448 for (i = 0; i < j; i++) newResult[i] = result[i];

	449 return newResult;

	450 }
	siva 2012/11/16 22:32:04: This code here and in splitChars above seem to be identical; it might make sense to abstract it out (maybe a static method that is passed in a string object). erikcorry 2012/11/19 12:40:41: Last three lines fixed to use getRange, but splitChars is not really identical since it returns a list of strings, whereas this one returns a list of ints.
	451 }

	452

	453

	454 class _TwoByteString extends _TwoByteStringBase implements String {

376 factory _TwoByteString._uninstantiable() {	455 factory _TwoByteString._uninstantiable() {

377 throw new UnsupportedError(	456 throw new UnsupportedError(

378 "_TwoByteString can only be allocated by the VM");	457 "_TwoByteString can only be allocated by the VM");

379 }	458 }

380

381 // Checks for one-byte whitespaces only.

382 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

383 // whitespaces. Add checking for multi-byte whitespace codepoints.

384 bool _isWhitespace(int codePoint) {

385 return

386 (codePoint == 32) \|\| // Space.

387 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

388 }

389 }	459 }

390	460

391	461

392 class _FourByteString extends _StringBase implements String {	462 // TODO(erikcorry): This is going away.
	siva 2012/11/16 22:32:04: Why not remove it in this CL itself? erikcorry 2012/11/19 12:40:41: Done.
	463 class _FourByteString extends _StringBase {

393 factory _FourByteString._uninstantiable() {	464 factory _FourByteString._uninstantiable() {

394 throw new UnsupportedError(	465 throw new UnsupportedError(

395 "_FourByteString can only be allocated by the VM");	466 "_FourByteString can only be allocated by the VM");

396 }	467 }

397	468

398 // Checks for one-byte whitespaces only.	469 // Checks for one-byte whitespaces only.

399 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid	470 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

400 // whitespaces. Add checking for multi-byte whitespace codepoints.	471 // whitespaces. Add checking for multi-byte whitespace codepoints.

401 bool _isWhitespace(int codePoint) {	472 bool _isWhitespace(int codePoint) {

402 return	473 return

403 (codePoint == 32) \|\| // Space.	474 (codePoint == 32) \|\| // Space.

404 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.	475 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

405 }	476 }

406 }	477 }

407	478

408	479

409 class _ExternalOneByteString extends _StringBase implements String {	480 class _ExternalOneByteString extends _StringBase implements String {

410 factory _ExternalOneByteString._uninstantiable() {	481 factory _ExternalOneByteString._uninstantiable() {

411 throw new UnsupportedError(	482 throw new UnsupportedError(

412 "_ExternalOneByteString can only be allocated by the VM");	483 "_ExternalOneByteString can only be allocated by the VM");

413 }	484 }

414	485

415 // Checks for one-byte whitespaces only.

416 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

417 // whitespaces for one byte strings.

418 bool _isWhitespace(int codePoint) {	486 bool _isWhitespace(int codePoint) {

419 return	487 return

420 (codePoint == 32) \|\| // Space.	488 (codePoint == 32) \|\| // Space.

	489 (codePoint == 0xa0) \|\| // No-break space.

421 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.	490 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

422 }	491 }

	492

	493 int charCodeAt(int index) => codeUnitAt(index);

	494

	495 List<int> get charCodes => codeUnits;

	496 }

	497

	498

	499 class _ExternalTwoByteString extends _TwoByteStringBase implements String {

	500 factory _ExternalTwoByteString._uninstantiable() {

	501 throw new UnsupportedError(

	502 "_ExternalTwoByteString can only be allocated by the VM");

	503 }

423 }	504 }

424	505

425	506

426 class _ExternalTwoByteString extends _StringBase implements String {	507 // TODO(erikcorry): This is going away.
	siva 2012/11/16 22:32:04: Ditto comment. erikcorry 2012/11/19 12:40:41: Done.
427 factory _ExternalTwoByteString._uninstantiable() {	508 class _ExternalFourByteString extends _StringBase {

428 throw new UnsupportedError(

429 "_ExternalTwoByteString can only be allocated by the VM");

430 }

431

432 // Checks for one-byte whitespaces only.

433 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

434 // whitespaces. Add checking for multi-byte whitespace codepoints.

435 bool _isWhitespace(int codePoint) {

436 return

437 (codePoint == 32) \|\| // Space.

438 ((9 <= codePoint) && (codePoint <= 13)); // CR, LF, TAB, etc.

439 }

440 }

441

442

443 class _ExternalFourByteString extends _StringBase implements String {

444 factory _ExternalFourByteString._uninstantiable() {	509 factory _ExternalFourByteString._uninstantiable() {

445 throw new UnsupportedError(	510 throw new UnsupportedError(

446 "ExternalFourByteString can only be allocated by the VM");	511 "ExternalFourByteString can only be allocated by the VM");

447 }	512 }

448	513

449 // Checks for one-byte whitespaces only.	514 // Checks for one-byte whitespaces only.

450 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid	515 // TODO(srdjan): Investigate if 0x85 (NEL) and 0xA0 (NBSP) are valid

451 // whitespaces. Add checking for multi-byte whitespace codepoints.	516 // whitespaces. Add checking for multi-byte whitespace codepoints.

452 bool _isWhitespace(int codePoint) {	517 bool _isWhitespace(int codePoint) {

453 return	518 return

(...skipping 24 matching lines...) Expand all Loading...
478 for (int g in groups) {	543 for (int g in groups) {

479 result.add(group(g));	544 result.add(group(g));

480 }	545 }

481 return result;	546 return result;

482 }	547 }

483	548

484 final int start;	549 final int start;

485 final String str;	550 final String str;

486 final String pattern;	551 final String pattern;

487 }	552 }

OLD	NEW

« no previous file with comments | « runtime/lib/string.cc ('k') | runtime/lib/string_patch.dart » ('j') | runtime/lib/string_patch.dart » ('J')