Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(570)

Side by Side Diff: src/json-parser.h

Issue 7241023: Improve JSON.parse to use less memory when using escaped and non-ascii... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: '' Created 9 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
48 } 48 }
49 49
50 static const int kEndOfString = -1; 50 static const int kEndOfString = -1;
51 51
52 private: 52 private:
53 // Parse a string containing a single JSON value. 53 // Parse a string containing a single JSON value.
54 Handle<Object> ParseJson(Handle<String> source); 54 Handle<Object> ParseJson(Handle<String> source);
55 55
56 inline void Advance() { 56 inline void Advance() {
57 position_++; 57 position_++;
58 if (position_ > source_length_) { 58 if (position_ >= source_length_) {
59 c0_ = kEndOfString; 59 c0_ = kEndOfString;
60 } else if (seq_ascii) { 60 } else if (seq_ascii) {
61 c0_ = seq_source_->SeqAsciiStringGet(position_); 61 c0_ = seq_source_->SeqAsciiStringGet(position_);
62 } else { 62 } else {
63 c0_ = source_->Get(position_); 63 c0_ = source_->Get(position_);
64 } 64 }
65 } 65 }
66 66
67 // The JSON lexical grammar is specified in the ECMAScript 5 standard, 67 // The JSON lexical grammar is specified in the ECMAScript 5 standard,
68 // section 15.12.1.1. The only allowed whitespace characters between tokens 68 // section 15.12.1.1. The only allowed whitespace characters between tokens
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
100 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and 100 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
101 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. 101 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
102 Handle<String> ParseJsonString() { 102 Handle<String> ParseJsonString() {
103 return ScanJsonString<false>(); 103 return ScanJsonString<false>();
104 } 104 }
105 Handle<String> ParseJsonSymbol() { 105 Handle<String> ParseJsonSymbol() {
106 return ScanJsonString<true>(); 106 return ScanJsonString<true>();
107 } 107 }
108 template <bool is_symbol> 108 template <bool is_symbol>
109 Handle<String> ScanJsonString(); 109 Handle<String> ScanJsonString();
110 // Slow version for unicode support, uses the first ascii_count characters, 110 // Slow version for backslash and unicode support, uses the characters from
111 // as first part of a ConsString 111 // start to end in prefix as the first part of the resulting string.
112 Handle<String> SlowScanJsonString(int beg_pos); 112 template <typename StringType, typename SinkChar>
113 Handle<String> SlowScanJsonString(Handle<String> prefix, int start, int end);
Lasse Reichstein 2011/06/29 09:27:30 Slow version of what? Just say what the function d
sandholm 2011/06/29 10:44:39 Done.
113 114
114 // A JSON number (production JSONNumber) is a subset of the valid JavaScript 115 // A JSON number (production JSONNumber) is a subset of the valid JavaScript
115 // decimal number literals. 116 // decimal number literals.
116 // It includes an optional minus sign, must have at least one 117 // It includes an optional minus sign, must have at least one
117 // digit before and after a decimal point, may not have prefixed zeros (unless 118 // digit before and after a decimal point, may not have prefixed zeros (unless
118 // the integer part is zero), and may include an exponent part (e.g., "e-10"). 119 // the integer part is zero), and may include an exponent part (e.g., "e-10").
119 // Hexadecimal and octal numbers are not allowed. 120 // Hexadecimal and octal numbers are not allowed.
120 Handle<Object> ParseJsonNumber(); 121 Handle<Object> ParseJsonNumber();
121 122
122 // Parse a single JSON value from input (grammar production JSONValue). 123 // Parse a single JSON value from input (grammar production JSONValue).
(...skipping 18 matching lines...) Expand all
141 142
142 143
143 // Mark that a parsing error has happened at the current token, and 144 // Mark that a parsing error has happened at the current token, and
144 // return a null handle. Primarily for readability. 145 // return a null handle. Primarily for readability.
145 inline Handle<Object> ReportUnexpectedCharacter() { 146 inline Handle<Object> ReportUnexpectedCharacter() {
146 return Handle<Object>::null(); 147 return Handle<Object>::null();
147 } 148 }
148 149
149 inline Isolate* isolate() { return isolate_; } 150 inline Isolate* isolate() { return isolate_; }
150 151
151 static const int kInitialSpecialStringSize = 1024; 152 static const int kInitialSpecialStringLength = 1024;
152 153
153 154
154 private: 155 private:
155 Handle<String> source_; 156 Handle<String> source_;
156 int source_length_; 157 int source_length_;
157 Handle<SeqAsciiString> seq_source_; 158 Handle<SeqAsciiString> seq_source_;
158 159
159 Isolate* isolate_; 160 Isolate* isolate_;
160 uc32 c0_; 161 uc32 c0_;
161 int position_; 162 int position_;
162 }; 163 };
163 164
164 template <bool seq_ascii> 165 template <bool seq_ascii>
165 Handle<Object> JsonParser<seq_ascii>::ParseJson(Handle<String> source) { 166 Handle<Object> JsonParser<seq_ascii>::ParseJson(Handle<String> source) {
166 isolate_ = source->map()->isolate(); 167 isolate_ = source->map()->isolate();
167 source_ = Handle<String>(source->TryFlattenGetString()); 168 source_ = Handle<String>(source->TryFlattenGetString());
168 source_length_ = source_->length() - 1; 169 source_length_ = source_->length();
169 170
170 // Optimized fast case where we only have ascii characters. 171 // Optimized fast case where we only have ascii characters.
Lasse Reichstein 2011/06/29 09:27:30 ASCII is an acronym when used in prose.
sandholm 2011/06/29 10:44:39 Done.
171 if (seq_ascii) { 172 if (seq_ascii) {
172 seq_source_ = Handle<SeqAsciiString>::cast(source_); 173 seq_source_ = Handle<SeqAsciiString>::cast(source_);
173 } 174 }
174 175
175 // Set initial position right before the string. 176 // Set initial position right before the string.
176 position_ = -1; 177 position_ = -1;
177 // Advance to the first character (possibly EOS) 178 // Advance to the first character (possibly EOS)
178 AdvanceSkipWhitespace(); 179 AdvanceSkipWhitespace();
179 Handle<Object> result = ParseJsonValue(); 180 Handle<Object> result = ParseJsonValue();
180 if (result.is_null() || c0_ != kEndOfString) { 181 if (result.is_null() || c0_ != kEndOfString) {
(...skipping 222 matching lines...) Expand 10 before | Expand all | Expand 10 after
403 number = StringToDouble(isolate()->unicode_cache(), 404 number = StringToDouble(isolate()->unicode_cache(),
404 result, 405 result,
405 NO_FLAGS, // Hex, octal or trailing junk. 406 NO_FLAGS, // Hex, octal or trailing junk.
406 0.0); 407 0.0);
407 buffer.Dispose(); 408 buffer.Dispose();
408 } 409 }
409 SkipWhitespace(); 410 SkipWhitespace();
410 return isolate()->factory()->NewNumber(number); 411 return isolate()->factory()->NewNumber(number);
411 } 412 }
412 413
414
415 template <typename StringType>
416 inline void SeqStringSet(Handle<StringType> seq_str, int i, uc32 c);
417
418 template <>
419 inline void SeqStringSet(Handle<SeqTwoByteString> seq_str, int i, uc32 c) {
420 seq_str->SeqTwoByteStringSet(i, c);
421 }
422
423 template <>
424 inline void SeqStringSet(Handle<SeqAsciiString> seq_str, int i, uc32 c) {
425 seq_str->SeqAsciiStringSet(i, c);
426 }
427
428 template <typename StringType>
429 inline Handle<StringType> NewRawString(Factory* factory, int length);
430
431 template <>
432 inline Handle<SeqTwoByteString> NewRawString(Factory* factory, int length) {
433 return factory->NewRawTwoByteString(length, NOT_TENURED);
434 }
435
436 template <>
437 inline Handle<SeqAsciiString> NewRawString(Factory* factory, int length) {
438 return factory->NewRawAsciiString(length, NOT_TENURED);
439 }
440
441
442 // Scans the rest of a JSON string starting from position_ and writes
443 // substring(prefix, start, end) along with the scanned characters into a
444 // sequential string of type StringType.
413 template <bool seq_ascii> 445 template <bool seq_ascii>
414 Handle<String> JsonParser<seq_ascii>::SlowScanJsonString(int beg_pos) { 446 template <typename StringType, typename SinkChar>
415 // The currently scanned ascii characters. 447 Handle<String> JsonParser<seq_ascii>::SlowScanJsonString(
416 Handle<String> ascii(isolate()->factory()->NewProperSubString(source_, 448 Handle<String> prefix, int start, int end) {
417 beg_pos, 449 int count = end - start;
418 position_)); 450 int length = Min(count + source_length_ - position_,
419 Handle<String> two_byte = 451 Max(kInitialSpecialStringLength, 2 * count));
420 isolate()->factory()->NewRawTwoByteString(kInitialSpecialStringSize, 452 Handle<StringType> seq_str = NewRawString<StringType>(isolate()->factory(),
421 NOT_TENURED); 453 length);
422 Handle<SeqTwoByteString> seq_two_byte = 454 // Copy prefix into seq_str.
423 Handle<SeqTwoByteString>::cast(two_byte); 455 SinkChar* dest = seq_str->GetChars();
424 456 String::WriteToFlat(*prefix, dest, start, end);
425 int allocation_count = 1;
426 int count = 0;
427 457
428 while (c0_ != '"') { 458 while (c0_ != '"') {
429 // Create new seq string 459 // Create new seq string
Lasse Reichstein 2011/06/29 09:27:30 Move comment to after "if" line, so it only applie
sandholm 2011/06/29 10:44:39 Done.
430 if (count >= kInitialSpecialStringSize * allocation_count) { 460 if (count >= length) {
431 allocation_count = allocation_count * 2; 461 return this->SlowScanJsonString<StringType, SinkChar>(seq_str, 0, count);
432 int new_size = allocation_count * kInitialSpecialStringSize;
433 Handle<String> new_two_byte =
434 isolate()->factory()->NewRawTwoByteString(new_size,
435 NOT_TENURED);
436 uc16* char_start =
437 Handle<SeqTwoByteString>::cast(new_two_byte)->GetChars();
438 String::WriteToFlat(*seq_two_byte, char_start, 0, count);
439 seq_two_byte = Handle<SeqTwoByteString>::cast(new_two_byte);
440 } 462 }
441
442 // Check for control character (0x00-0x1f) or unterminated string (<0). 463 // Check for control character (0x00-0x1f) or unterminated string (<0).
443 if (c0_ < 0x20) return Handle<String>::null(); 464 if (c0_ < 0x20) return Handle<String>::null();
444 if (c0_ != '\\') { 465 if (c0_ != '\\') {
445 seq_two_byte->SeqTwoByteStringSet(count++, c0_); 466 if (sizeof(char) != sizeof(SinkChar) ||
Lasse Reichstein 2011/06/29 09:27:30 Does it lint? (Generally, use kCharSize instead of
sandholm 2011/06/29 10:44:39 Done.
446 Advance(); 467 seq_ascii ||
Lasse Reichstein 2011/06/29 09:27:30 This could use a comment: If the sink can contain
sandholm 2011/06/29 10:44:39 Done.
468 c0_ <= kMaxAsciiCharCode) {
469 SeqStringSet(seq_str, count++, c0_);
470 Advance();
471 } else {
472 // StringType is SeqAsciiString and we just read a non-ascii char.
Lasse Reichstein 2011/06/29 09:27:30 non-ASCII.
sandholm 2011/06/29 10:44:39 Done.
473 return this->SlowScanJsonString<SeqTwoByteString, uc16>(seq_str,
474 0,
475 count);
476 }
447 } else { 477 } else {
448 Advance(); 478 Advance(); // Advance past the \.
449 switch (c0_) { 479 switch (c0_) {
450 case '"': 480 case '"':
451 case '\\': 481 case '\\':
452 case '/': 482 case '/':
453 seq_two_byte->SeqTwoByteStringSet(count++, c0_); 483 SeqStringSet(seq_str, count++, c0_);
454 break; 484 break;
455 case 'b': 485 case 'b':
456 seq_two_byte->SeqTwoByteStringSet(count++, '\x08'); 486 SeqStringSet(seq_str, count++, '\x08');
457 break; 487 break;
458 case 'f': 488 case 'f':
459 seq_two_byte->SeqTwoByteStringSet(count++, '\x0c'); 489 SeqStringSet(seq_str, count++, '\x0c');
460 break; 490 break;
461 case 'n': 491 case 'n':
462 seq_two_byte->SeqTwoByteStringSet(count++, '\x0a'); 492 SeqStringSet(seq_str, count++, '\x0a');
463 break; 493 break;
464 case 'r': 494 case 'r':
465 seq_two_byte->SeqTwoByteStringSet(count++, '\x0d'); 495 SeqStringSet(seq_str, count++, '\x0d');
466 break; 496 break;
467 case 't': 497 case 't':
468 seq_two_byte->SeqTwoByteStringSet(count++, '\x09'); 498 SeqStringSet(seq_str, count++, '\x09');
469 break; 499 break;
470 case 'u': { 500 case 'u': {
471 uc32 value = 0; 501 uc32 value = 0;
472 for (int i = 0; i < 4; i++) { 502 for (int i = 0; i < 4; i++) {
473 Advance(); 503 Advance();
474 int digit = HexValue(c0_); 504 int digit = HexValue(c0_);
475 if (digit < 0) { 505 if (digit < 0) {
476 return Handle<String>::null(); 506 return Handle<String>::null();
477 } 507 }
478 value = value * 16 + digit; 508 value = value * 16 + digit;
479 } 509 }
480 seq_two_byte->SeqTwoByteStringSet(count++, value); 510 if (sizeof(char) != sizeof(SinkChar) || value <= kMaxAsciiCharCode) {
Lasse Reichstein 2011/06/29 09:27:30 sizeof(SinkChar) == kUC16Size
sandholm 2011/06/29 10:44:39 Done.
481 break; 511 SeqStringSet(seq_str, count++, value);
512 break;
513 } else {
514 // StringType is SeqAsciiString and we just read a non-ascii char.
515 position_ -= 6; // Rewind position to \ in \uxxxx.
516 Advance();
517 return this->SlowScanJsonString<SeqTwoByteString, uc16>(seq_str,
518 0,
519 count);
520 }
482 } 521 }
483 default: 522 default:
484 return Handle<String>::null(); 523 return Handle<String>::null();
485 } 524 }
486 Advance(); 525 Advance();
487 } 526 }
488 } 527 }
489 // Advance past the last '"'. 528 // Shrink seq_string length to count.
490 ASSERT_EQ('"', c0_); 529 if (isolate()->heap()->InNewSpace(*seq_str)) {
491 AdvanceSkipWhitespace();
492
493 // Shrink the string to our length.
494 if (isolate()->heap()->InNewSpace(*seq_two_byte)) {
495 isolate()->heap()->new_space()-> 530 isolate()->heap()->new_space()->
496 template ShrinkStringAtAllocationBoundary<SeqTwoByteString>( 531 template ShrinkStringAtAllocationBoundary<StringType>(
Lasse Reichstein 2011/06/29 09:27:30 Do ASSERT that the string is at the allocation bou
sandholm 2011/06/29 10:44:39 That ASSERT is in ShrinkStringAtAllocationBoundary
Lasse Reichstein 2011/06/29 10:53:00 It'll do :) It's probably better to keep it there,
497 *seq_two_byte, count); 532 *seq_str, count);
498 } else { 533 } else {
499 int string_size = SeqTwoByteString::SizeFor(count); 534 int string_size = StringType::SizeFor(count);
500 int allocated_string_size = 535 int allocated_string_size = StringType::SizeFor(length);
501 SeqTwoByteString::SizeFor(kInitialSpecialStringSize * allocation_count);
502 int delta = allocated_string_size - string_size; 536 int delta = allocated_string_size - string_size;
503 Address start_filler_object = seq_two_byte->address() + string_size; 537 Address start_filler_object = seq_str->address() + string_size;
504 seq_two_byte->set_length(count); 538 seq_str->set_length(count);
505 isolate()->heap()->CreateFillerObjectAt(start_filler_object, delta); 539 isolate()->heap()->CreateFillerObjectAt(start_filler_object, delta);
506 } 540 }
507 return isolate()->factory()->NewConsString(ascii, seq_two_byte); 541 ASSERT_EQ('"', c0_);
542 // Advance past the last '"'.
543 AdvanceSkipWhitespace();
544 return seq_str;
508 } 545 }
509 546
547
510 template <bool seq_ascii> 548 template <bool seq_ascii>
511 template <bool is_symbol> 549 template <bool is_symbol>
512 Handle<String> JsonParser<seq_ascii>::ScanJsonString() { 550 Handle<String> JsonParser<seq_ascii>::ScanJsonString() {
513 ASSERT_EQ('"', c0_); 551 ASSERT_EQ('"', c0_);
514 Advance(); 552 Advance();
553 if (c0_ == '"') {
554 AdvanceSkipWhitespace();
555 return Handle<String>(isolate()->heap()->empty_string());
556 }
515 int beg_pos = position_; 557 int beg_pos = position_;
516 // Fast case for ascii only without escape characters. 558 // Fast case for ascii only without escape characters.
517 while (c0_ != '"') { 559 do {
518 // Check for control character (0x00-0x1f) or unterminated string (<0). 560 // Check for control character (0x00-0x1f) or unterminated string (<0).
519 if (c0_ < 0x20) return Handle<String>::null(); 561 if (c0_ < 0x20) return Handle<String>::null();
520 if (c0_ != '\\' && (seq_ascii || c0_ < kMaxAsciiCharCode)) { 562 if (c0_ != '\\' && (seq_ascii || c0_ <= kMaxAsciiCharCode)) {
521 Advance(); 563 Advance();
522 } else { 564 } else {
523 return this->SlowScanJsonString(beg_pos); 565 return this->SlowScanJsonString<SeqAsciiString, char>(source_,
566 beg_pos,
567 position_);
524 } 568 }
569 } while (c0_ != '"');
570 int length = position_ - beg_pos;
571 Handle<String> result;
572 if (seq_ascii && is_symbol) {
573 result = isolate()->factory()->LookupAsciiSymbol(seq_source_,
574 beg_pos,
575 length);
576 } else {
577 result = isolate()->factory()->NewRawAsciiString(length);
578 char* dest = SeqAsciiString::cast(*result)->GetChars();
579 String::WriteToFlat(*source_, dest, beg_pos, position_);
525 } 580 }
526 ASSERT_EQ('"', c0_); 581 ASSERT_EQ('"', c0_);
527 int end_pos = position_;
528 // Advance past the last '"'. 582 // Advance past the last '"'.
529 AdvanceSkipWhitespace(); 583 AdvanceSkipWhitespace();
530 if (seq_ascii && is_symbol) { 584 return result;
531 return isolate()->factory()->LookupAsciiSymbol(seq_source_,
532 beg_pos,
533 end_pos - beg_pos);
534 } else {
535 return isolate()->factory()->NewProperSubString(source_,
536 beg_pos,
537 end_pos);
538 }
539 } 585 }
540 586
541 } } // namespace v8::internal 587 } } // namespace v8::internal
542 588
543 #endif // V8_JSON_PARSER_H_ 589 #endif // V8_JSON_PARSER_H_
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698