base/json/json_reader.cc - Issue 9801007: Improve JSONReader performance by up to 55% by using std::string instead of wstring.

Side by Side Diff: base/json/json_reader.cc

Issue 9801007: Improve JSONReader performance by up to 55% by using std::string instead of wstring. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Address comments Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "base/json/json_reader.h"	5 #include "base/json/json_reader.h"

6	6

7 #include "base/float_util.h"	7 #include "base/float_util.h"

8 #include "base/logging.h"	8 #include "base/logging.h"

9 #include "base/memory/scoped_ptr.h"	9 #include "base/memory/scoped_ptr.h"

10 #include "base/stringprintf.h"	10 #include "base/stringprintf.h"

11 #include "base/string_number_conversions.h"	11 #include "base/string_number_conversions.h"

12 #include "base/string_util.h"	12 #include "base/string_util.h"

	13 #include "base/third_party/icu/icu_utf.h"

13 #include "base/utf_string_conversions.h"	14 #include "base/utf_string_conversions.h"

14 #include "base/values.h"	15 #include "base/values.h"

15	16

16 namespace {	17 namespace {

17	18

18 const wchar_t kNullString[] = L"null";	19 const char kNullString[] = "null";

19 const wchar_t kTrueString[] = L"true";	20 const char kTrueString[] = "true";

20 const wchar_t kFalseString[] = L"false";	21 const char kFalseString[] = "false";

21	22

22 const int kStackLimit = 100;	23 const int kStackLimit = 100;

23	24

24 // A helper method for ParseNumberToken. It reads an int from the end of	25 // A helper method for ParseNumberToken. It reads an int from the end of

25 // token. The method returns false if there is no valid integer at the end of	26 // token. The method returns false if there is no valid integer at the end of

26 // the token.	27 // the token.

27 bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) {	28 bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) {

28 wchar_t first = token.NextChar();	29 char first = token.NextChar();

29 int len = 0;	30 int len = 0;

30	31

31 // Read in more digits.	32 // Read in more digits.

32 wchar_t c = first;	33 char c = first;

33 while ('\0' != c && IsAsciiDigit(c)) {	34 while ('\0' != c && IsAsciiDigit(c)) {

34 ++token.length;	35 ++token.length;

35 ++len;	36 ++len;

36 c = token.NextChar();	37 c = token.NextChar();

37 }	38 }

38 // We need at least 1 digit.	39 // We need at least 1 digit.

39 if (len == 0)	40 if (len == 0)

40 return false;	41 return false;

41	42

42 if (!can_have_leading_zeros && len > 1 && '0' == first)	43 if (!can_have_leading_zeros && len > 1 && '0' == first)

43 return false;	44 return false;

44	45

45 return true;	46 return true;

46 }	47 }

47	48

48 // A helper method for ParseStringToken. It reads \|digits\| hex digits from the	49 // A helper method for ParseStringToken. It reads \|digits\| hex digits from the

49 // token. If the sequence if digits is not valid (contains other characters),	50 // token. If the sequence if digits is not valid (contains other characters),

50 // the method returns false.	51 // the method returns false.

51 bool ReadHexDigits(base::JSONReader::Token& token, int digits) {	52 bool ReadHexDigits(base::JSONReader::Token& token, int digits) {

52 for (int i = 1; i <= digits; ++i) {	53 for (int i = 1; i <= digits; ++i) {

53 wchar_t c = *(token.begin + token.length + i);	54 char c = *(token.begin + token.length + i);

54 if (c == '\0' \|\| !IsHexDigit(c))	55 if (c == '\0' \|\| !IsHexDigit(c))

55 return false;	56 return false;

56 }	57 }

57	58

58 token.length += digits;	59 token.length += digits;

59 return true;	60 return true;

60 }	61 }

61	62

62 } // namespace	63 } // namespace

63	64

(...skipping 12 matching lines...) Expand all Loading...
76 const char* JSONReader::kUnexpectedDataAfterRoot =	77 const char* JSONReader::kUnexpectedDataAfterRoot =

77 "Unexpected data after root element.";	78 "Unexpected data after root element.";

78 const char* JSONReader::kUnsupportedEncoding =	79 const char* JSONReader::kUnsupportedEncoding =

79 "Unsupported encoding. JSON must be UTF-8.";	80 "Unsupported encoding. JSON must be UTF-8.";

80 const char* JSONReader::kUnquotedDictionaryKey =	81 const char* JSONReader::kUnquotedDictionaryKey =

81 "Dictionary keys must be quoted.";	82 "Dictionary keys must be quoted.";

82	83

83 JSONReader::JSONReader()	84 JSONReader::JSONReader()

84 : start_pos_(NULL),	85 : start_pos_(NULL),

85 json_pos_(NULL),	86 json_pos_(NULL),

	87 end_pos_(NULL),

86 stack_depth_(0),	88 stack_depth_(0),

87 allow_trailing_comma_(false),	89 allow_trailing_comma_(false),

88 error_code_(JSON_NO_ERROR),	90 error_code_(JSON_NO_ERROR),

89 error_line_(0),	91 error_line_(0),

90 error_col_(0) {}	92 error_col_(0) {}

91	93

92 // static	94 // static

93 Value* JSONReader::Read(const std::string& json,	95 Value* JSONReader::Read(const std::string& json,

94 bool allow_trailing_comma) {	96 bool allow_trailing_comma) {

95 return ReadAndReturnError(json, allow_trailing_comma, NULL, NULL);	97 return ReadAndReturnError(json, allow_trailing_comma, NULL, NULL);

(...skipping 45 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
141 }	143 }

142	144

143 std::string JSONReader::GetErrorMessage() const {	145 std::string JSONReader::GetErrorMessage() const {

144 return FormatErrorMessage(error_line_, error_col_,	146 return FormatErrorMessage(error_line_, error_col_,

145 ErrorCodeToString(error_code_));	147 ErrorCodeToString(error_code_));

146 }	148 }

147	149

148 Value* JSONReader::JsonToValue(const std::string& json, bool check_root,	150 Value* JSONReader::JsonToValue(const std::string& json, bool check_root,

149 bool allow_trailing_comma) {	151 bool allow_trailing_comma) {

150 // The input must be in UTF-8.	152 // The input must be in UTF-8.

151 if (!IsStringUTF8(json.c_str())) {	153 if (!IsStringUTF8(json.data())) {

152 error_code_ = JSON_UNSUPPORTED_ENCODING;	154 error_code_ = JSON_UNSUPPORTED_ENCODING;

153 return NULL;	155 return NULL;

154 }	156 }

155	157

156 // The conversion from UTF8 to wstring removes null bytes for us	158 start_pos_ = json.data();

157 // (a good thing).	159 end_pos_ = start_pos_ + json.size();

158 std::wstring json_wide(UTF8ToWide(json));

159 start_pos_ = json_wide.c_str();

160	160

161 // When the input JSON string starts with a UTF-8 Byte-Order-Mark	161 // When the input JSON string starts with a UTF-8 Byte-Order-Mark (U+FEFF)

162 // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode	162 // or <0xEF 0xBB 0xBF>, advance the start position to avoid the

163 // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from	163 // JSONReader::BuildValue() function from mis-treating a Unicode BOM as an

164 // mis-treating a Unicode BOM as an invalid character and returning NULL,	164 // invalid character and returning NULL.

165 // skip a converted Unicode BOM if it exists.	165 if (json.size() >= 3 && start_pos_[0] == 0xEF &&

166 if (!json_wide.empty() && start_pos_[0] == 0xFEFF) {	166 start_pos_[1] == 0xBB && start_pos_[2] == 0xBF) {

167 ++start_pos_;	167 start_pos_ += 3;

168 }	168 }

169	169

170 json_pos_ = start_pos_;	170 json_pos_ = start_pos_;

171 allow_trailing_comma_ = allow_trailing_comma;	171 allow_trailing_comma_ = allow_trailing_comma;

172 stack_depth_ = 0;	172 stack_depth_ = 0;

173 error_code_ = JSON_NO_ERROR;	173 error_code_ = JSON_NO_ERROR;

174	174

175 scoped_ptr<Value> root(BuildValue(check_root));	175 scoped_ptr<Value> root(BuildValue(check_root));

176 if (root.get()) {	176 if (root.get()) {

177 if (ParseToken().type == Token::END_OF_INPUT) {	177 if (ParseToken().type == Token::END_OF_INPUT) {

(...skipping 171 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
349 json_pos_ += token.length;	349 json_pos_ += token.length;

350	350

351 --stack_depth_;	351 --stack_depth_;

352 return node.release();	352 return node.release();

353 }	353 }

354	354

355 JSONReader::Token JSONReader::ParseNumberToken() {	355 JSONReader::Token JSONReader::ParseNumberToken() {

356 // We just grab the number here. We validate the size in DecodeNumber.	356 // We just grab the number here. We validate the size in DecodeNumber.

357 // According to RFC4627, a valid number is: [minus] int [frac] [exp]	357 // According to RFC4627, a valid number is: [minus] int [frac] [exp]

358 Token token(Token::NUMBER, json_pos_, 0);	358 Token token(Token::NUMBER, json_pos_, 0);

359 wchar_t c = *json_pos_;	359 char c = *json_pos_;

360 if ('-' == c) {	360 if ('-' == c) {

361 ++token.length;	361 ++token.length;

362 c = token.NextChar();	362 c = token.NextChar();

363 }	363 }

364	364

365 if (!ReadInt(token, false))	365 if (!ReadInt(token, false))

366 return Token::CreateInvalidToken();	366 return Token::CreateInvalidToken();

367	367

368 // Optional fraction part	368 // Optional fraction part

369 c = token.NextChar();	369 c = token.NextChar();

(...skipping 13 matching lines...) Expand all Loading...
383 c = token.NextChar();	383 c = token.NextChar();

384 }	384 }

385 if (!ReadInt(token, true))	385 if (!ReadInt(token, true))

386 return Token::CreateInvalidToken();	386 return Token::CreateInvalidToken();

387 }	387 }

388	388

389 return token;	389 return token;

390 }	390 }

391	391

392 Value* JSONReader::DecodeNumber(const Token& token) {	392 Value* JSONReader::DecodeNumber(const Token& token) {

393 const std::wstring num_string(token.begin, token.length);	393 const std::string num_string(token.begin, token.length);

394	394

395 int num_int;	395 int num_int;

396 if (StringToInt(WideToUTF8(num_string), &num_int))	396 if (StringToInt(num_string, &num_int))

397 return Value::CreateIntegerValue(num_int);	397 return Value::CreateIntegerValue(num_int);

398	398

399 double num_double;	399 double num_double;

400 if (StringToDouble(WideToUTF8(num_string), &num_double) &&	400 if (StringToDouble(num_string, &num_double) && base::IsFinite(num_double))

401 base::IsFinite(num_double))

402 return Value::CreateDoubleValue(num_double);	401 return Value::CreateDoubleValue(num_double);

403	402

404 return NULL;	403 return NULL;

405 }	404 }

406	405

407 JSONReader::Token JSONReader::ParseStringToken() {	406 JSONReader::Token JSONReader::ParseStringToken() {

408 Token token(Token::STRING, json_pos_, 1);	407 Token token(Token::STRING, json_pos_, 1);

409 wchar_t c = token.NextChar();	408 char c = token.NextChar();

410 while ('\0' != c) {	409 while (json_pos_ + token.length < end_pos_) {

411 if ('\\' == c) {	410 if ('\\' == c) {

412 ++token.length;	411 ++token.length;

413 c = token.NextChar();	412 c = token.NextChar();

414 // Make sure the escaped char is valid.	413 // Make sure the escaped char is valid.

415 switch (c) {	414 switch (c) {

416 case 'x':	415 case 'x':

417 if (!ReadHexDigits(token, 2)) {	416 if (!ReadHexDigits(token, 2)) {

418 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);	417 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);

419 return Token::CreateInvalidToken();	418 return Token::CreateInvalidToken();

420 }	419 }

(...skipping 22 matching lines...) Expand all Loading...
443 ++token.length;	442 ++token.length;

444 return token;	443 return token;

445 }	444 }

446 ++token.length;	445 ++token.length;

447 c = token.NextChar();	446 c = token.NextChar();

448 }	447 }

449 return Token::CreateInvalidToken();	448 return Token::CreateInvalidToken();

450 }	449 }

451	450

452 Value* JSONReader::DecodeString(const Token& token) {	451 Value* JSONReader::DecodeString(const Token& token) {

453 std::wstring decoded_str;	452 std::string decoded_str;

454 decoded_str.reserve(token.length - 2);	453 decoded_str.reserve(token.length - 2);

455	454

456 for (int i = 1; i < token.length - 1; ++i) {	455 for (int i = 1; i < token.length - 1; ++i) {

457 wchar_t c = *(token.begin + i);	456 char c = *(token.begin + i);

458 if ('\\' == c) {	457 if ('\\' == c) {

459 ++i;	458 ++i;

460 c = *(token.begin + i);	459 c = *(token.begin + i);

461 switch (c) {	460 switch (c) {

462 case '"':	461 case '"':

463 case '/':	462 case '/':

464 case '\\':	463 case '\\':

465 decoded_str.push_back(c);	464 decoded_str.push_back(c);

466 break;	465 break;

467 case 'b':	466 case 'b':

(...skipping 14 matching lines...) Expand all Loading...
482 case 'v':	481 case 'v':

483 decoded_str.push_back('\v');	482 decoded_str.push_back('\v');

484 break;	483 break;

485	484

486 case 'x':	485 case 'x':

487 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) +	486 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) +

488 HexDigitToInt(*(token.begin + i + 2)));	487 HexDigitToInt(*(token.begin + i + 2)));

489 i += 2;	488 i += 2;

490 break;	489 break;

491 case 'u':	490 case 'u':

492 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 12 ) +	491 if (!ConvertUTF16Units(token, &i, &decoded_str))

493 (HexDigitToInt(*(token.begin + i + 2)) << 8) +	492 return NULL;

494 (HexDigitToInt(*(token.begin + i + 3)) << 4) +

495 HexDigitToInt(*(token.begin + i + 4)));

496 i += 4;

497 break;	493 break;

498	494

499 default:	495 default:

500 // We should only have valid strings at this point. If not,	496 // We should only have valid strings at this point. If not,

501 // ParseStringToken didn't do it's job.	497 // ParseStringToken didn't do it's job.

502 NOTREACHED();	498 NOTREACHED();

503 return NULL;	499 return NULL;

504 }	500 }

505 } else {	501 } else {

506 // Not escaped	502 // Not escaped

507 decoded_str.push_back(c);	503 decoded_str.push_back(c);

508 }	504 }

509 }	505 }

510 return Value::CreateStringValue(WideToUTF16Hack(decoded_str));	506 return Value::CreateStringValue(decoded_str);

	507 }

	508

	509 bool JSONReader::ConvertUTF16Units(const Token& token,

	510 int* i,

	511 std::string* dest_string) {

	512 if (*i + 4 >= token.length)

	513 return false;

	514

	515 // This is a 32-bit field because the shift operations in the

	516 // conversion process below cause MSVC to error about "data loss."

	517 // This only stores UTF-16 code units, though.

	518 // Consume the UTF-16 code unit, which may be a high surrogate.

	519 uint32 code_unit16_high = ReadUTF16Unit(token.begin + *i);

	520 *i += 4;

	521

	522 // If this is a high surrogate, consume the next code unit to get the

	523 // low surrogate.

	524 uint32 code_unit16_low = 0;

	525 if (CBU16_IS_SURROGATE(code_unit16_high)) {

	526 // Make sure this is the high surrogate. If not, it's an encoding

	527 // error.

	528 if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high))

	529 return false;

	530

	531 // Make sure that the token has more characters to consume the

	532 // lower surrogate.

	533 if (*i + 6 >= token.length)

	534 return false;

	535 if (*(++(*i) + token.begin) != '\\' \|\| *(++(*i) + token.begin) != 'u')

	536 return false;

	537

	538 code_unit16_low = ReadUTF16Unit(token.begin + *i);

	539 *i += 4;

	540 if (!CBU16_IS_SURROGATE(code_unit16_low) \|\|

	541 !CBU16_IS_TRAIL(code_unit16_low)) {

	542 return false;

	543 }

	544 } else if (!CBU16_IS_SINGLE(code_unit16_high)) {

	545 // If this is not a code point, it's an encoding error.

	546 return false;

	547 }

	548

	549 // Convert the UTF-16 code units to a code point and then to a UTF-8

	550 // code unit sequence.

	551 char code_point[8] = { 0 };

	552 size_t offset = 0;

	553 if (!code_unit16_low) {

	554 CBU8_APPEND_UNSAFE(code_point, offset, code_unit16_high);

	555 } else {

	556 uint32 code_unit32 = CBU16_GET_SUPPLEMENTARY(code_unit16_high,

	557 code_unit16_low);

	558 offset = 0;

	559 CBU8_APPEND_UNSAFE(code_point, offset, code_unit32);

	560 }

	561 dest_string->append(code_point);

	562 return true;

	563 }

	564

	565 uint32 JSONReader::ReadUTF16Unit(const char* buf) {

	566 return (HexDigitToInt(*(buf + 1)) << 12) +
	brettw 2012/03/22 20:23:23: You didn't fix the general problem, though, which is the current HexDigitToInt doesn't report errors. What about \uzz89 which will be 0x89 which would pass your test.
	Robert Sesek 2012/03/22 20:52:16 (replying to brettw's 2012/03/22 20:23:23 comment): That also caused the parser to fail, but I see your point. Switched to using HexStringToInt()
	567 (HexDigitToInt(*(buf + 2)) << 8) +

	568 (HexDigitToInt(*(buf + 3)) << 4) +

	569 HexDigitToInt(*(buf + 4));

511 }	570 }

512	571

513 JSONReader::Token JSONReader::ParseToken() {	572 JSONReader::Token JSONReader::ParseToken() {

514 EatWhitespaceAndComments();	573 EatWhitespaceAndComments();

515	574

516 Token token(Token::INVALID_TOKEN, 0, 0);	575 Token token(Token::INVALID_TOKEN, 0, 0);

517 switch (*json_pos_) {	576 switch (*json_pos_) {

518 case '\0':	577 case '\0':

519 token.type = Token::END_OF_INPUT;	578 token.type = Token::END_OF_INPUT;

520 break;	579 break;

(...skipping 52 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
573 break;	632 break;

574	633

575 case '"':	634 case '"':

576 token = ParseStringToken();	635 token = ParseStringToken();

577 break;	636 break;

578 }	637 }

579 return token;	638 return token;

580 }	639 }

581	640

582 void JSONReader::EatWhitespaceAndComments() {	641 void JSONReader::EatWhitespaceAndComments() {

583 while ('\0' != *json_pos_) {	642 while (json_pos_ != end_pos_) {

584 switch (*json_pos_) {	643 switch (*json_pos_) {

585 case ' ':	644 case ' ':

586 case '\n':	645 case '\n':

587 case '\r':	646 case '\r':

588 case '\t':	647 case '\t':

589 ++json_pos_;	648 ++json_pos_;

590 break;	649 break;

591 case '/':	650 case '/':

592 // TODO(tc): This isn't in the RFC so it should be a parser flag.	651 // TODO(tc): This isn't in the RFC so it should be a parser flag.

593 if (!EatComment())	652 if (!EatComment())

594 return;	653 return;

595 break;	654 break;

596 default:	655 default:

597 // Not a whitespace char, just exit.	656 // Not a whitespace char, just exit.

598 return;	657 return;

599 }	658 }

600 }	659 }

601 }	660 }

602	661

603 bool JSONReader::EatComment() {	662 bool JSONReader::EatComment() {

604 if ('/' != *json_pos_)	663 if ('/' != *json_pos_)

605 return false;	664 return false;

606	665

607 wchar_t next_char = *(json_pos_ + 1);	666 char next_char = *(json_pos_ + 1);

608 if ('/' == next_char) {	667 if ('/' == next_char) {

609 // Line comment, read until \n or \r	668 // Line comment, read until \n or \r

610 json_pos_ += 2;	669 json_pos_ += 2;

611 while ('\0' != *json_pos_) {	670 while (json_pos_ != end_pos_) {

612 switch (*json_pos_) {	671 switch (*json_pos_) {

613 case '\n':	672 case '\n':

614 case '\r':	673 case '\r':

615 ++json_pos_;	674 ++json_pos_;

616 return true;	675 return true;

617 default:	676 default:

618 ++json_pos_;	677 ++json_pos_;

619 }	678 }

620 }	679 }

621 } else if ('*' == next_char) {	680 } else if ('*' == next_char) {

622 // Block comment, read until */	681 // Block comment, read until */

623 json_pos_ += 2;	682 json_pos_ += 2;

624 while ('\0' != *json_pos_) {	683 while (json_pos_ != end_pos_) {

625 if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {	684 if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {

626 json_pos_ += 2;	685 json_pos_ += 2;

627 return true;	686 return true;

628 }	687 }

629 ++json_pos_;	688 ++json_pos_;

630 }	689 }

631 } else {	690 } else {

632 return false;	691 return false;

633 }	692 }

634 return true;	693 return true;

635 }	694 }

636	695

637 bool JSONReader::NextStringMatch(const wchar_t* str, size_t length) {	696 bool JSONReader::NextStringMatch(const char* str, size_t length) {

638 return wcsncmp(json_pos_, str, length) == 0;	697 return strncmp(json_pos_, str, length) == 0;

639 }	698 }

640	699

641 void JSONReader::SetErrorCode(JsonParseError error,	700 void JSONReader::SetErrorCode(JsonParseError error,

642 const wchar_t* error_pos) {	701 const char* error_pos) {

643 int line_number = 1;	702 int line_number = 1;

644 int column_number = 1;	703 int column_number = 1;

645	704

646 // Figure out the line and column the error occured at.	705 // Figure out the line and column the error occured at.

647 for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) {	706 for (const char* pos = start_pos_; pos != error_pos; ++pos) {

648 if (*pos == '\0') {	707 if (pos > end_pos_) {

649 NOTREACHED();	708 NOTREACHED();

650 return;	709 return;

651 }	710 }

652	711

653 if (*pos == '\n') {	712 if (*pos == '\n') {

654 ++line_number;	713 ++line_number;

655 column_number = 1;	714 column_number = 1;

656 } else {	715 } else {

657 ++column_number;	716 ++column_number;

658 }	717 }

659 }	718 }

660	719

661 error_line_ = line_number;	720 error_line_ = line_number;

662 error_col_ = column_number;	721 error_col_ = column_number;

663 error_code_ = error;	722 error_code_ = error;

664 }	723 }

665	724

666 } // namespace base	725 } // namespace base

OLD	NEW

« no previous file with comments | « base/json/json_reader.h ('k') | base/json/json_reader_unittest.cc » ('j') | no next file with comments »