src/json-parser.h - Issue 7241023: Improve JSON.parse to use less memory when using escaped and non-ascii...

Side by Side Diff: src/json-parser.h

Issue 7241023: Improve JSON.parse to use less memory when using escaped and non-ascii... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: '' Created 9 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 37 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
48 }	48 }

49	49

50 static const int kEndOfString = -1;	50 static const int kEndOfString = -1;

51	51

52 private:	52 private:

53 // Parse a string containing a single JSON value.	53 // Parse a string containing a single JSON value.

54 Handle<Object> ParseJson(Handle<String> source);	54 Handle<Object> ParseJson(Handle<String> source);

55	55

56 inline void Advance() {	56 inline void Advance() {

57 position_++;	57 position_++;

58 if (position_ > source_length_) {	58 if (position_ >= source_length_) {

59 c0_ = kEndOfString;	59 c0_ = kEndOfString;

60 } else if (seq_ascii) {	60 } else if (seq_ascii) {

61 c0_ = seq_source_->SeqAsciiStringGet(position_);	61 c0_ = seq_source_->SeqAsciiStringGet(position_);

62 } else {	62 } else {

63 c0_ = source_->Get(position_);	63 c0_ = source_->Get(position_);

64 }	64 }

65 }	65 }

66	66

67 // The JSON lexical grammar is specified in the ECMAScript 5 standard,	67 // The JSON lexical grammar is specified in the ECMAScript 5 standard,

68 // section 15.12.1.1. The only allowed whitespace characters between tokens	68 // section 15.12.1.1. The only allowed whitespace characters between tokens

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
100 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and	100 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and

101 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.	101 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.

102 Handle<String> ParseJsonString() {	102 Handle<String> ParseJsonString() {

103 return ScanJsonString<false>();	103 return ScanJsonString<false>();

104 }	104 }

105 Handle<String> ParseJsonSymbol() {	105 Handle<String> ParseJsonSymbol() {

106 return ScanJsonString<true>();	106 return ScanJsonString<true>();

107 }	107 }

108 template <bool is_symbol>	108 template <bool is_symbol>

109 Handle<String> ScanJsonString();	109 Handle<String> ScanJsonString();

110 // Slow version for unicode support, uses the first ascii_count characters,	110 // Slow version for backslash and unicode support, uses the characters from

111 // as first part of a ConsString	111 // start to end in prefix as the first part of the resulting string.

112 Handle<String> SlowScanJsonString(int beg_pos);	112 template <typename StringType, typename SinkChar>

	113 Handle<String> SlowScanJsonString(Handle<String> prefix, int start, int end);
	Lasse Reichstein (2011/06/29 09:27:30): Slow version of what? Just say what the function does, and only after that maybe say which function is supposed to use it. So it creates a new string, copies prefix[start..end] into the beginning of it (and I assume start can be non-zero when prefix is source_) and starts scanning the string and adding it after the prefix. sandholm (2011/06/29 10:44:39): Done.
113	114

114 // A JSON number (production JSONNumber) is a subset of the valid JavaScript	115 // A JSON number (production JSONNumber) is a subset of the valid JavaScript

115 // decimal number literals.	116 // decimal number literals.

116 // It includes an optional minus sign, must have at least one	117 // It includes an optional minus sign, must have at least one

117 // digit before and after a decimal point, may not have prefixed zeros (unless	118 // digit before and after a decimal point, may not have prefixed zeros (unless

118 // the integer part is zero), and may include an exponent part (e.g., "e-10").	119 // the integer part is zero), and may include an exponent part (e.g., "e-10").

119 // Hexadecimal and octal numbers are not allowed.	120 // Hexadecimal and octal numbers are not allowed.

120 Handle<Object> ParseJsonNumber();	121 Handle<Object> ParseJsonNumber();

121	122

122 // Parse a single JSON value from input (grammar production JSONValue).	123 // Parse a single JSON value from input (grammar production JSONValue).

(...skipping 18 matching lines...) Expand all Loading...
141	142

142	143

143 // Mark that a parsing error has happened at the current token, and	144 // Mark that a parsing error has happened at the current token, and

144 // return a null handle. Primarily for readability.	145 // return a null handle. Primarily for readability.

145 inline Handle<Object> ReportUnexpectedCharacter() {	146 inline Handle<Object> ReportUnexpectedCharacter() {

146 return Handle<Object>::null();	147 return Handle<Object>::null();

147 }	148 }

148	149

149 inline Isolate* isolate() { return isolate_; }	150 inline Isolate* isolate() { return isolate_; }

150	151

151 static const int kInitialSpecialStringSize = 1024;	152 static const int kInitialSpecialStringLength = 1024;

152	153

153	154

154 private:	155 private:

155 Handle<String> source_;	156 Handle<String> source_;

156 int source_length_;	157 int source_length_;

157 Handle<SeqAsciiString> seq_source_;	158 Handle<SeqAsciiString> seq_source_;

158	159

159 Isolate* isolate_;	160 Isolate* isolate_;

160 uc32 c0_;	161 uc32 c0_;

161 int position_;	162 int position_;

162 };	163 };

163	164

164 template <bool seq_ascii>	165 template <bool seq_ascii>

165 Handle<Object> JsonParser<seq_ascii>::ParseJson(Handle<String> source) {	166 Handle<Object> JsonParser<seq_ascii>::ParseJson(Handle<String> source) {

166 isolate_ = source->map()->isolate();	167 isolate_ = source->map()->isolate();

167 source_ = Handle<String>(source->TryFlattenGetString());	168 source_ = Handle<String>(source->TryFlattenGetString());

168 source_length_ = source_->length() - 1;	169 source_length_ = source_->length();

169	170

170 // Optimized fast case where we only have ascii characters.	171 // Optimized fast case where we only have ascii characters.
	Lasse Reichstein (2011/06/29 09:27:30): ASCII is an acronym when used in prose. sandholm (2011/06/29 10:44:39): Done.
171 if (seq_ascii) {	172 if (seq_ascii) {

172 seq_source_ = Handle<SeqAsciiString>::cast(source_);	173 seq_source_ = Handle<SeqAsciiString>::cast(source_);

173 }	174 }

174	175

175 // Set initial position right before the string.	176 // Set initial position right before the string.

176 position_ = -1;	177 position_ = -1;

177 // Advance to the first character (posibly EOS)	178 // Advance to the first character (posibly EOS)

178 AdvanceSkipWhitespace();	179 AdvanceSkipWhitespace();

179 Handle<Object> result = ParseJsonValue();	180 Handle<Object> result = ParseJsonValue();

180 if (result.is_null() \|\| c0_ != kEndOfString) {	181 if (result.is_null() \|\| c0_ != kEndOfString) {

(...skipping 222 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
403 number = StringToDouble(isolate()->unicode_cache(),	404 number = StringToDouble(isolate()->unicode_cache(),

404 result,	405 result,

405 NO_FLAGS, // Hex, octal or trailing junk.	406 NO_FLAGS, // Hex, octal or trailing junk.

406 0.0);	407 0.0);

407 buffer.Dispose();	408 buffer.Dispose();

408 }	409 }

409 SkipWhitespace();	410 SkipWhitespace();

410 return isolate()->factory()->NewNumber(number);	411 return isolate()->factory()->NewNumber(number);

411 }	412 }

412	413

	414

	415 template <typename StringType>

	416 inline void SeqStringSet(Handle<StringType> seq_str, int i, uc32 c);

	417

	418 template <>

	419 inline void SeqStringSet(Handle<SeqTwoByteString> seq_str, int i, uc32 c) {

	420 seq_str->SeqTwoByteStringSet(i, c);

	421 }

	422

	423 template <>

	424 inline void SeqStringSet(Handle<SeqAsciiString> seq_str, int i, uc32 c) {

	425 seq_str->SeqAsciiStringSet(i, c);

	426 }

	427

	428 template <typename StringType>

	429 inline Handle<StringType> NewRawString(Factory* factory, int length);

	430

	431 template <>

	432 inline Handle<SeqTwoByteString> NewRawString(Factory* factory, int length) {

	433 return factory->NewRawTwoByteString(length, NOT_TENURED);

	434 }

	435

	436 template <>

	437 inline Handle<SeqAsciiString> NewRawString(Factory* factory, int length) {

	438 return factory->NewRawAsciiString(length, NOT_TENURED);

	439 }

	440

	441

	442 // Scans the rest of a JSON string starting from position_ and writes

	443 // substring(prefix, start, end) along with the scanned characters into a

	444 // sequential string of type StringType.

413 template <bool seq_ascii>	445 template <bool seq_ascii>

414 Handle<String> JsonParser<seq_ascii>::SlowScanJsonString(int beg_pos) {	446 template <typename StringType, typename SinkChar>

415 // The currently scanned ascii characters.	447 Handle<String> JsonParser<seq_ascii>::SlowScanJsonString(

416 Handle<String> ascii(isolate()->factory()->NewProperSubString(source_,	448 Handle<String> prefix, int start, int end) {

417 beg_pos,	449 int count = end - start;

418 position_));	450 int length = Min(count + source_length_ - position_,

419 Handle<String> two_byte =	451 Max(kInitialSpecialStringLength, 2 * count));

420 isolate()->factory()->NewRawTwoByteString(kInitialSpecialStringSize,	452 Handle<StringType> seq_str = NewRawString<StringType>(isolate()->factory(),

421 NOT_TENURED);	453 length);

422 Handle<SeqTwoByteString> seq_two_byte =	454 // Copy prefix into seq_str.

423 Handle<SeqTwoByteString>::cast(two_byte);	455 SinkChar* dest = seq_str->GetChars();

424	456 String::WriteToFlat(*prefix, dest, start, end);

425 int allocation_count = 1;

426 int count = 0;

427	457

428 while (c0_ != '"') {	458 while (c0_ != '"') {

429 // Create new seq string	459 // Create new seq string
	Lasse Reichstein (2011/06/29 09:27:30): Move comment to after "if" line, so it only applies if the condition is true (or make it apply to the check itself, like below: "Check whether we need to create a new string."). Comments should be full sentences (so remember "." at the end). No, we don't always follow that in practice :) sandholm (2011/06/29 10:44:39): Done.
430 if (count >= kInitialSpecialStringSize * allocation_count) {	460 if (count >= length) {

431 allocation_count = allocation_count * 2;	461 return this->SlowScanJsonString<StringType, SinkChar>(seq_str, 0, count);

432 int new_size = allocation_count * kInitialSpecialStringSize;

433 Handle<String> new_two_byte =

434 isolate()->factory()->NewRawTwoByteString(new_size,

435 NOT_TENURED);

436 uc16* char_start =

437 Handle<SeqTwoByteString>::cast(new_two_byte)->GetChars();

438 String::WriteToFlat(*seq_two_byte, char_start, 0, count);

439 seq_two_byte = Handle<SeqTwoByteString>::cast(new_two_byte);

440 }	462 }

441

442 // Check for control character (0x00-0x1f) or unterminated string (<0).	463 // Check for control character (0x00-0x1f) or unterminated string (<0).

443 if (c0_ < 0x20) return Handle<String>::null();	464 if (c0_ < 0x20) return Handle<String>::null();

444 if (c0_ != '\\') {	465 if (c0_ != '\\') {

445 seq_two_byte->SeqTwoByteStringSet(count++, c0_);	466 if (sizeof(char) != sizeof(SinkChar) \|\|
	Lasse Reichstein (2011/06/29 09:27:30): Does it lint? (Generally, use kCharSize instead of sizeof(char)). For readability, I'd prefer the operands in the opposite order: if (sizeof(SinkChar) != kCharSize \|\| and you should probably do: if (sizeof(SinkChar) == kUC16Size \|\| since you know that there are only those two possibilities. sandholm (2011/06/29 10:44:39): Done.
446 Advance();	467 seq_ascii \|\|
	Lasse Reichstein (2011/06/29 09:27:30): This could use a comment: If the sink can contain UC16 characters, or source_ contains only ASCII characters, there's no need to test whether we can store the character. Otherwise check whether the UC16 source character can fit in the ASCII sink. sandholm (2011/06/29 10:44:39): Done.
	468 c0_ <= kMaxAsciiCharCode) {

	469 SeqStringSet(seq_str, count++, c0_);

	470 Advance();

	471 } else {

	472 // StringType is SeqAsciiString and we just read a non-ascii char.
	Lasse Reichstein (2011/06/29 09:27:30): non-ASCII. sandholm (2011/06/29 10:44:39): Done.
	473 return this->SlowScanJsonString<SeqTwoByteString, uc16>(seq_str,

	474 0,

	475 count);

	476 }

447 } else {	477 } else {

448 Advance();	478 Advance(); // Advance past the \.

449 switch (c0_) {	479 switch (c0_) {

450 case '"':	480 case '"':

451 case '\\':	481 case '\\':

452 case '/':	482 case '/':

453 seq_two_byte->SeqTwoByteStringSet(count++, c0_);	483 SeqStringSet(seq_str, count++, c0_);

454 break;	484 break;

455 case 'b':	485 case 'b':

456 seq_two_byte->SeqTwoByteStringSet(count++, '\x08');	486 SeqStringSet(seq_str, count++, '\x08');

457 break;	487 break;

458 case 'f':	488 case 'f':

459 seq_two_byte->SeqTwoByteStringSet(count++, '\x0c');	489 SeqStringSet(seq_str, count++, '\x0c');

460 break;	490 break;

461 case 'n':	491 case 'n':

462 seq_two_byte->SeqTwoByteStringSet(count++, '\x0a');	492 SeqStringSet(seq_str, count++, '\x0a');

463 break;	493 break;

464 case 'r':	494 case 'r':

465 seq_two_byte->SeqTwoByteStringSet(count++, '\x0d');	495 SeqStringSet(seq_str, count++, '\x0d');

466 break;	496 break;

467 case 't':	497 case 't':

468 seq_two_byte->SeqTwoByteStringSet(count++, '\x09');	498 SeqStringSet(seq_str, count++, '\x09');

469 break;	499 break;

470 case 'u': {	500 case 'u': {

471 uc32 value = 0;	501 uc32 value = 0;

472 for (int i = 0; i < 4; i++) {	502 for (int i = 0; i < 4; i++) {

473 Advance();	503 Advance();

474 int digit = HexValue(c0_);	504 int digit = HexValue(c0_);

475 if (digit < 0) {	505 if (digit < 0) {

476 return Handle<String>::null();	506 return Handle<String>::null();

477 }	507 }

478 value = value * 16 + digit;	508 value = value * 16 + digit;

479 }	509 }

480 seq_two_byte->SeqTwoByteStringSet(count++, value);	510 if (sizeof(char) != sizeof(SinkChar) \|\| value <= kMaxAsciiCharCode) {
	Lasse Reichstein (2011/06/29 09:27:30): sizeof(SinkChar) == kUC16Size sandholm (2011/06/29 10:44:39): Done.
481 break;	511 SeqStringSet(seq_str, count++, value);

	512 break;

	513 } else {

	514 // StringType is SeqAsciiString and we just read a non-ascii char.

	515 position_ -= 6; // Rewind position to \ in \uxxxx.

	516 Advance();

	517 return this->SlowScanJsonString<SeqTwoByteString, uc16>(seq_str,

	518 0,

	519 count);

	520 }

482 }	521 }

483 default:	522 default:

484 return Handle<String>::null();	523 return Handle<String>::null();

485 }	524 }

486 Advance();	525 Advance();

487 }	526 }

488 }	527 }

489 // Advance past the last '"'.	528 // Shrink seq_string length to count.

490 ASSERT_EQ('"', c0_);	529 if (isolate()->heap()->InNewSpace(*seq_str)) {

491 AdvanceSkipWhitespace();

492

493 // Shrink the the string to our length.

494 if (isolate()->heap()->InNewSpace(*seq_two_byte)) {

495 isolate()->heap()->new_space()->	530 isolate()->heap()->new_space()->

496 template ShrinkStringAtAllocationBoundary<SeqTwoByteString>(	531 template ShrinkStringAtAllocationBoundary<StringType>(
	Lasse Reichstein (2011/06/29 09:27:30): Do ASSERT that the string is at the allocation boundary. (It should be, there have been no allocations or possible interruptions since it was allocated). sandholm (2011/06/29 10:44:39): That ASSERT is in ShrinkStringAtAllocationBoundary. That should suffice? Lasse Reichstein (2011/06/29 10:53:00): It'll do :) It's probably better to keep it there, than to start introducing the top of newspace here, when we don't otherwise use it.
497 *seq_two_byte, count);	532 *seq_str, count);

498 } else {	533 } else {

499 int string_size = SeqTwoByteString::SizeFor(count);	534 int string_size = StringType::SizeFor(count);

500 int allocated_string_size =	535 int allocated_string_size = StringType::SizeFor(length);

501 SeqTwoByteString::SizeFor(kInitialSpecialStringSize * allocation_count);

502 int delta = allocated_string_size - string_size;	536 int delta = allocated_string_size - string_size;

503 Address start_filler_object = seq_two_byte->address() + string_size;	537 Address start_filler_object = seq_str->address() + string_size;

504 seq_two_byte->set_length(count);	538 seq_str->set_length(count);

505 isolate()->heap()->CreateFillerObjectAt(start_filler_object, delta);	539 isolate()->heap()->CreateFillerObjectAt(start_filler_object, delta);

506 }	540 }

507 return isolate()->factory()->NewConsString(ascii, seq_two_byte);	541 ASSERT_EQ('"', c0_);

	542 // Advance past the last '"'.

	543 AdvanceSkipWhitespace();

	544 return seq_str;

508 }	545 }

509	546

	547

510 template <bool seq_ascii>	548 template <bool seq_ascii>

511 template <bool is_symbol>	549 template <bool is_symbol>

512 Handle<String> JsonParser<seq_ascii>::ScanJsonString() {	550 Handle<String> JsonParser<seq_ascii>::ScanJsonString() {

513 ASSERT_EQ('"', c0_);	551 ASSERT_EQ('"', c0_);

514 Advance();	552 Advance();

	553 if (c0_ == '"') {

	554 AdvanceSkipWhitespace();

	555 return Handle<String>(isolate()->heap()->empty_string());

	556 }

515 int beg_pos = position_;	557 int beg_pos = position_;

516 // Fast case for ascii only without escape characters.	558 // Fast case for ascii only without escape characters.

517 while (c0_ != '"') {	559 do {

518 // Check for control character (0x00-0x1f) or unterminated string (<0).	560 // Check for control character (0x00-0x1f) or unterminated string (<0).

519 if (c0_ < 0x20) return Handle<String>::null();	561 if (c0_ < 0x20) return Handle<String>::null();

520 if (c0_ != '\\' && (seq_ascii \|\| c0_ < kMaxAsciiCharCode)) {	562 if (c0_ != '\\' && (seq_ascii \|\| c0_ <= kMaxAsciiCharCode)) {

521 Advance();	563 Advance();

522 } else {	564 } else {

523 return this->SlowScanJsonString(beg_pos);	565 return this->SlowScanJsonString<SeqAsciiString, char>(source_,

	566 beg_pos,

	567 position_);

524 }	568 }

	569 } while (c0_ != '"');

	570 int length = position_ - beg_pos;

	571 Handle<String> result;

	572 if (seq_ascii && is_symbol) {

	573 result = isolate()->factory()->LookupAsciiSymbol(seq_source_,

	574 beg_pos,

	575 length);

	576 } else {

	577 result = isolate()->factory()->NewRawAsciiString(length);

	578 char* dest = SeqAsciiString::cast(*result)->GetChars();

	579 String::WriteToFlat(*source_, dest, beg_pos, position_);

525 }	580 }

526 ASSERT_EQ('"', c0_);	581 ASSERT_EQ('"', c0_);

527 int end_pos = position_;

528 // Advance past the last '"'.	582 // Advance past the last '"'.

529 AdvanceSkipWhitespace();	583 AdvanceSkipWhitespace();

530 if (seq_ascii && is_symbol) {	584 return result;

531 return isolate()->factory()->LookupAsciiSymbol(seq_source_,

532 beg_pos,

533 end_pos - beg_pos);

534 } else {

535 return isolate()->factory()->NewProperSubString(source_,

536 beg_pos,

537 end_pos);

538 }

539 }	585 }

540	586

541 } } // namespace v8::internal	587 } } // namespace v8::internal

542	588

543 #endif // V8_JSON_PARSER_H_	589 #endif // V8_JSON_PARSER_H_

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »