Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(18)

Side by Side Diff: base/json/json_reader.cc

Issue 9801007: Improve JSONReader performance by up to 55% by using std::string instead of wstring. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Address comments Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « base/json/json_reader.h ('k') | base/json/json_reader_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "base/json/json_reader.h" 5 #include "base/json/json_reader.h"
6 6
7 #include "base/float_util.h" 7 #include "base/float_util.h"
8 #include "base/logging.h" 8 #include "base/logging.h"
9 #include "base/memory/scoped_ptr.h" 9 #include "base/memory/scoped_ptr.h"
10 #include "base/stringprintf.h" 10 #include "base/stringprintf.h"
11 #include "base/string_number_conversions.h" 11 #include "base/string_number_conversions.h"
12 #include "base/string_util.h" 12 #include "base/string_util.h"
13 #include "base/third_party/icu/icu_utf.h"
13 #include "base/utf_string_conversions.h" 14 #include "base/utf_string_conversions.h"
14 #include "base/values.h" 15 #include "base/values.h"
15 16
16 namespace { 17 namespace {
17 18
18 const wchar_t kNullString[] = L"null"; 19 const char kNullString[] = "null";
19 const wchar_t kTrueString[] = L"true"; 20 const char kTrueString[] = "true";
20 const wchar_t kFalseString[] = L"false"; 21 const char kFalseString[] = "false";
21 22
22 const int kStackLimit = 100; 23 const int kStackLimit = 100;
23 24
24 // A helper method for ParseNumberToken. It reads an int from the end of 25 // A helper method for ParseNumberToken. It reads an int from the end of
25 // token. The method returns false if there is no valid integer at the end of 26 // token. The method returns false if there is no valid integer at the end of
26 // the token. 27 // the token.
27 bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) { 28 bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) {
28 wchar_t first = token.NextChar(); 29 char first = token.NextChar();
29 int len = 0; 30 int len = 0;
30 31
31 // Read in more digits. 32 // Read in more digits.
32 wchar_t c = first; 33 char c = first;
33 while ('\0' != c && IsAsciiDigit(c)) { 34 while ('\0' != c && IsAsciiDigit(c)) {
34 ++token.length; 35 ++token.length;
35 ++len; 36 ++len;
36 c = token.NextChar(); 37 c = token.NextChar();
37 } 38 }
38 // We need at least 1 digit. 39 // We need at least 1 digit.
39 if (len == 0) 40 if (len == 0)
40 return false; 41 return false;
41 42
42 if (!can_have_leading_zeros && len > 1 && '0' == first) 43 if (!can_have_leading_zeros && len > 1 && '0' == first)
43 return false; 44 return false;
44 45
45 return true; 46 return true;
46 } 47 }
47 48
48 // A helper method for ParseStringToken. It reads |digits| hex digits from the 49 // A helper method for ParseStringToken. It reads |digits| hex digits from the
49 // token. If the sequence of digits is not valid (contains other characters), 50 // token. If the sequence of digits is not valid (contains other characters),
50 // the method returns false. 51 // the method returns false.
51 bool ReadHexDigits(base::JSONReader::Token& token, int digits) { 52 bool ReadHexDigits(base::JSONReader::Token& token, int digits) {
52 for (int i = 1; i <= digits; ++i) { 53 for (int i = 1; i <= digits; ++i) {
53 wchar_t c = *(token.begin + token.length + i); 54 char c = *(token.begin + token.length + i);
54 if (c == '\0' || !IsHexDigit(c)) 55 if (c == '\0' || !IsHexDigit(c))
55 return false; 56 return false;
56 } 57 }
57 58
58 token.length += digits; 59 token.length += digits;
59 return true; 60 return true;
60 } 61 }
61 62
62 } // namespace 63 } // namespace
63 64
(...skipping 12 matching lines...) Expand all
76 const char* JSONReader::kUnexpectedDataAfterRoot = 77 const char* JSONReader::kUnexpectedDataAfterRoot =
77 "Unexpected data after root element."; 78 "Unexpected data after root element.";
78 const char* JSONReader::kUnsupportedEncoding = 79 const char* JSONReader::kUnsupportedEncoding =
79 "Unsupported encoding. JSON must be UTF-8."; 80 "Unsupported encoding. JSON must be UTF-8.";
80 const char* JSONReader::kUnquotedDictionaryKey = 81 const char* JSONReader::kUnquotedDictionaryKey =
81 "Dictionary keys must be quoted."; 82 "Dictionary keys must be quoted.";
82 83
83 JSONReader::JSONReader() 84 JSONReader::JSONReader()
84 : start_pos_(NULL), 85 : start_pos_(NULL),
85 json_pos_(NULL), 86 json_pos_(NULL),
87 end_pos_(NULL),
86 stack_depth_(0), 88 stack_depth_(0),
87 allow_trailing_comma_(false), 89 allow_trailing_comma_(false),
88 error_code_(JSON_NO_ERROR), 90 error_code_(JSON_NO_ERROR),
89 error_line_(0), 91 error_line_(0),
90 error_col_(0) {} 92 error_col_(0) {}
91 93
92 // static 94 // static
93 Value* JSONReader::Read(const std::string& json, 95 Value* JSONReader::Read(const std::string& json,
94 bool allow_trailing_comma) { 96 bool allow_trailing_comma) {
95 return ReadAndReturnError(json, allow_trailing_comma, NULL, NULL); 97 return ReadAndReturnError(json, allow_trailing_comma, NULL, NULL);
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
141 } 143 }
142 144
143 std::string JSONReader::GetErrorMessage() const { 145 std::string JSONReader::GetErrorMessage() const {
144 return FormatErrorMessage(error_line_, error_col_, 146 return FormatErrorMessage(error_line_, error_col_,
145 ErrorCodeToString(error_code_)); 147 ErrorCodeToString(error_code_));
146 } 148 }
147 149
148 Value* JSONReader::JsonToValue(const std::string& json, bool check_root, 150 Value* JSONReader::JsonToValue(const std::string& json, bool check_root,
149 bool allow_trailing_comma) { 151 bool allow_trailing_comma) {
150 // The input must be in UTF-8. 152 // The input must be in UTF-8.
151 if (!IsStringUTF8(json.c_str())) { 153 if (!IsStringUTF8(json.data())) {
152 error_code_ = JSON_UNSUPPORTED_ENCODING; 154 error_code_ = JSON_UNSUPPORTED_ENCODING;
153 return NULL; 155 return NULL;
154 } 156 }
155 157
156 // The conversion from UTF8 to wstring removes null bytes for us 158 start_pos_ = json.data();
157 // (a good thing). 159 end_pos_ = start_pos_ + json.size();
158 std::wstring json_wide(UTF8ToWide(json));
159 start_pos_ = json_wide.c_str();
160 160
161 // When the input JSON string starts with a UTF-8 Byte-Order-Mark 161 // When the input JSON string starts with a UTF-8 Byte-Order-Mark (U+FEFF)
162 // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode 162 // or <0xEF 0xBB 0xBF>, advance the start position to avoid the
163 // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from 163 // JSONReader::BuildValue() function from mis-treating a Unicode BOM as an
164 // mis-treating a Unicode BOM as an invalid character and returning NULL, 164 // invalid character and returning NULL.
165 // skip a converted Unicode BOM if it exists. 165 if (json.size() >= 3 && start_pos_[0] == 0xEF &&
166 if (!json_wide.empty() && start_pos_[0] == 0xFEFF) { 166 start_pos_[1] == 0xBB && start_pos_[2] == 0xBF) {
167 ++start_pos_; 167 start_pos_ += 3;
168 } 168 }
169 169
170 json_pos_ = start_pos_; 170 json_pos_ = start_pos_;
171 allow_trailing_comma_ = allow_trailing_comma; 171 allow_trailing_comma_ = allow_trailing_comma;
172 stack_depth_ = 0; 172 stack_depth_ = 0;
173 error_code_ = JSON_NO_ERROR; 173 error_code_ = JSON_NO_ERROR;
174 174
175 scoped_ptr<Value> root(BuildValue(check_root)); 175 scoped_ptr<Value> root(BuildValue(check_root));
176 if (root.get()) { 176 if (root.get()) {
177 if (ParseToken().type == Token::END_OF_INPUT) { 177 if (ParseToken().type == Token::END_OF_INPUT) {
(...skipping 171 matching lines...) Expand 10 before | Expand all | Expand 10 after
349 json_pos_ += token.length; 349 json_pos_ += token.length;
350 350
351 --stack_depth_; 351 --stack_depth_;
352 return node.release(); 352 return node.release();
353 } 353 }
354 354
355 JSONReader::Token JSONReader::ParseNumberToken() { 355 JSONReader::Token JSONReader::ParseNumberToken() {
356 // We just grab the number here. We validate the size in DecodeNumber. 356 // We just grab the number here. We validate the size in DecodeNumber.
357 // According to RFC4627, a valid number is: [minus] int [frac] [exp] 357 // According to RFC4627, a valid number is: [minus] int [frac] [exp]
358 Token token(Token::NUMBER, json_pos_, 0); 358 Token token(Token::NUMBER, json_pos_, 0);
359 wchar_t c = *json_pos_; 359 char c = *json_pos_;
360 if ('-' == c) { 360 if ('-' == c) {
361 ++token.length; 361 ++token.length;
362 c = token.NextChar(); 362 c = token.NextChar();
363 } 363 }
364 364
365 if (!ReadInt(token, false)) 365 if (!ReadInt(token, false))
366 return Token::CreateInvalidToken(); 366 return Token::CreateInvalidToken();
367 367
368 // Optional fraction part 368 // Optional fraction part
369 c = token.NextChar(); 369 c = token.NextChar();
(...skipping 13 matching lines...) Expand all
383 c = token.NextChar(); 383 c = token.NextChar();
384 } 384 }
385 if (!ReadInt(token, true)) 385 if (!ReadInt(token, true))
386 return Token::CreateInvalidToken(); 386 return Token::CreateInvalidToken();
387 } 387 }
388 388
389 return token; 389 return token;
390 } 390 }
391 391
392 Value* JSONReader::DecodeNumber(const Token& token) { 392 Value* JSONReader::DecodeNumber(const Token& token) {
393 const std::wstring num_string(token.begin, token.length); 393 const std::string num_string(token.begin, token.length);
394 394
395 int num_int; 395 int num_int;
396 if (StringToInt(WideToUTF8(num_string), &num_int)) 396 if (StringToInt(num_string, &num_int))
397 return Value::CreateIntegerValue(num_int); 397 return Value::CreateIntegerValue(num_int);
398 398
399 double num_double; 399 double num_double;
400 if (StringToDouble(WideToUTF8(num_string), &num_double) && 400 if (StringToDouble(num_string, &num_double) && base::IsFinite(num_double))
401 base::IsFinite(num_double))
402 return Value::CreateDoubleValue(num_double); 401 return Value::CreateDoubleValue(num_double);
403 402
404 return NULL; 403 return NULL;
405 } 404 }
406 405
407 JSONReader::Token JSONReader::ParseStringToken() { 406 JSONReader::Token JSONReader::ParseStringToken() {
408 Token token(Token::STRING, json_pos_, 1); 407 Token token(Token::STRING, json_pos_, 1);
409 wchar_t c = token.NextChar(); 408 char c = token.NextChar();
410 while ('\0' != c) { 409 while (json_pos_ + token.length < end_pos_) {
411 if ('\\' == c) { 410 if ('\\' == c) {
412 ++token.length; 411 ++token.length;
413 c = token.NextChar(); 412 c = token.NextChar();
414 // Make sure the escaped char is valid. 413 // Make sure the escaped char is valid.
415 switch (c) { 414 switch (c) {
416 case 'x': 415 case 'x':
417 if (!ReadHexDigits(token, 2)) { 416 if (!ReadHexDigits(token, 2)) {
418 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length); 417 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);
419 return Token::CreateInvalidToken(); 418 return Token::CreateInvalidToken();
420 } 419 }
(...skipping 22 matching lines...) Expand all
443 ++token.length; 442 ++token.length;
444 return token; 443 return token;
445 } 444 }
446 ++token.length; 445 ++token.length;
447 c = token.NextChar(); 446 c = token.NextChar();
448 } 447 }
449 return Token::CreateInvalidToken(); 448 return Token::CreateInvalidToken();
450 } 449 }
451 450
452 Value* JSONReader::DecodeString(const Token& token) { 451 Value* JSONReader::DecodeString(const Token& token) {
453 std::wstring decoded_str; 452 std::string decoded_str;
454 decoded_str.reserve(token.length - 2); 453 decoded_str.reserve(token.length - 2);
455 454
456 for (int i = 1; i < token.length - 1; ++i) { 455 for (int i = 1; i < token.length - 1; ++i) {
457 wchar_t c = *(token.begin + i); 456 char c = *(token.begin + i);
458 if ('\\' == c) { 457 if ('\\' == c) {
459 ++i; 458 ++i;
460 c = *(token.begin + i); 459 c = *(token.begin + i);
461 switch (c) { 460 switch (c) {
462 case '"': 461 case '"':
463 case '/': 462 case '/':
464 case '\\': 463 case '\\':
465 decoded_str.push_back(c); 464 decoded_str.push_back(c);
466 break; 465 break;
467 case 'b': 466 case 'b':
(...skipping 14 matching lines...) Expand all
482 case 'v': 481 case 'v':
483 decoded_str.push_back('\v'); 482 decoded_str.push_back('\v');
484 break; 483 break;
485 484
486 case 'x': 485 case 'x':
487 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) + 486 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) +
488 HexDigitToInt(*(token.begin + i + 2))); 487 HexDigitToInt(*(token.begin + i + 2)));
489 i += 2; 488 i += 2;
490 break; 489 break;
491 case 'u': 490 case 'u':
492 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 12 ) + 491 if (!ConvertUTF16Units(token, &i, &decoded_str))
493 (HexDigitToInt(*(token.begin + i + 2)) << 8) + 492 return NULL;
494 (HexDigitToInt(*(token.begin + i + 3)) << 4) +
495 HexDigitToInt(*(token.begin + i + 4)));
496 i += 4;
497 break; 493 break;
498 494
499 default: 495 default:
500 // We should only have valid strings at this point. If not, 496 // We should only have valid strings at this point. If not,
501 // ParseStringToken didn't do its job. 497 // ParseStringToken didn't do its job.
502 NOTREACHED(); 498 NOTREACHED();
503 return NULL; 499 return NULL;
504 } 500 }
505 } else { 501 } else {
506 // Not escaped 502 // Not escaped
507 decoded_str.push_back(c); 503 decoded_str.push_back(c);
508 } 504 }
509 } 505 }
510 return Value::CreateStringValue(WideToUTF16Hack(decoded_str)); 506 return Value::CreateStringValue(decoded_str);
507 }
508
509 bool JSONReader::ConvertUTF16Units(const Token& token,
510 int* i,
511 std::string* dest_string) {
512 if (*i + 4 >= token.length)
513 return false;
514
515 // This is a 32-bit field because the shift operations in the
516 // conversion process below cause MSVC to error about "data loss."
517 // This only stores UTF-16 code units, though.
518 // Consume the UTF-16 code unit, which may be a high surrogate.
519 uint32 code_unit16_high = ReadUTF16Unit(token.begin + *i);
520 *i += 4;
521
522 // If this is a high surrogate, consume the next code unit to get the
523 // low surrogate.
524 uint32 code_unit16_low = 0;
525 if (CBU16_IS_SURROGATE(code_unit16_high)) {
526 // Make sure this is the high surrogate. If not, it's an encoding
527 // error.
528 if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high))
529 return false;
530
531 // Make sure that the token has more characters to consume the
532 // lower surrogate.
533 if (*i + 6 >= token.length)
534 return false;
535 if (*(++(*i) + token.begin) != '\\' || *(++(*i) + token.begin) != 'u')
536 return false;
537
538 code_unit16_low = ReadUTF16Unit(token.begin + *i);
539 *i += 4;
540 if (!CBU16_IS_SURROGATE(code_unit16_low) ||
541 !CBU16_IS_TRAIL(code_unit16_low)) {
542 return false;
543 }
544 } else if (!CBU16_IS_SINGLE(code_unit16_high)) {
545 // If this is not a code point, it's an encoding error.
546 return false;
547 }
548
549 // Convert the UTF-16 code units to a code point and then to a UTF-8
550 // code unit sequence.
551 char code_point[8] = { 0 };
552 size_t offset = 0;
553 if (!code_unit16_low) {
554 CBU8_APPEND_UNSAFE(code_point, offset, code_unit16_high);
555 } else {
556 uint32 code_unit32 = CBU16_GET_SUPPLEMENTARY(code_unit16_high,
557 code_unit16_low);
558 offset = 0;
559 CBU8_APPEND_UNSAFE(code_point, offset, code_unit32);
560 }
561 dest_string->append(code_point);
562 return true;
563 }
564
565 uint32 JSONReader::ReadUTF16Unit(const char* buf) {
566 return (HexDigitToInt(*(buf + 1)) << 12) +
brettw 2012/03/22 20:23:23 You didn't fix the general problem, though, which
Robert Sesek 2012/03/22 20:52:16 That also caused the parser to fail, but I see you
567 (HexDigitToInt(*(buf + 2)) << 8) +
568 (HexDigitToInt(*(buf + 3)) << 4) +
569 HexDigitToInt(*(buf + 4));
511 } 570 }
512 571
513 JSONReader::Token JSONReader::ParseToken() { 572 JSONReader::Token JSONReader::ParseToken() {
514 EatWhitespaceAndComments(); 573 EatWhitespaceAndComments();
515 574
516 Token token(Token::INVALID_TOKEN, 0, 0); 575 Token token(Token::INVALID_TOKEN, 0, 0);
517 switch (*json_pos_) { 576 switch (*json_pos_) {
518 case '\0': 577 case '\0':
519 token.type = Token::END_OF_INPUT; 578 token.type = Token::END_OF_INPUT;
520 break; 579 break;
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
573 break; 632 break;
574 633
575 case '"': 634 case '"':
576 token = ParseStringToken(); 635 token = ParseStringToken();
577 break; 636 break;
578 } 637 }
579 return token; 638 return token;
580 } 639 }
581 640
582 void JSONReader::EatWhitespaceAndComments() { 641 void JSONReader::EatWhitespaceAndComments() {
583 while ('\0' != *json_pos_) { 642 while (json_pos_ != end_pos_) {
584 switch (*json_pos_) { 643 switch (*json_pos_) {
585 case ' ': 644 case ' ':
586 case '\n': 645 case '\n':
587 case '\r': 646 case '\r':
588 case '\t': 647 case '\t':
589 ++json_pos_; 648 ++json_pos_;
590 break; 649 break;
591 case '/': 650 case '/':
592 // TODO(tc): This isn't in the RFC so it should be a parser flag. 651 // TODO(tc): This isn't in the RFC so it should be a parser flag.
593 if (!EatComment()) 652 if (!EatComment())
594 return; 653 return;
595 break; 654 break;
596 default: 655 default:
597 // Not a whitespace char, just exit. 656 // Not a whitespace char, just exit.
598 return; 657 return;
599 } 658 }
600 } 659 }
601 } 660 }
602 661
603 bool JSONReader::EatComment() { 662 bool JSONReader::EatComment() {
604 if ('/' != *json_pos_) 663 if ('/' != *json_pos_)
605 return false; 664 return false;
606 665
607 wchar_t next_char = *(json_pos_ + 1); 666 char next_char = *(json_pos_ + 1);
608 if ('/' == next_char) { 667 if ('/' == next_char) {
609 // Line comment, read until \n or \r 668 // Line comment, read until \n or \r
610 json_pos_ += 2; 669 json_pos_ += 2;
611 while ('\0' != *json_pos_) { 670 while (json_pos_ != end_pos_) {
612 switch (*json_pos_) { 671 switch (*json_pos_) {
613 case '\n': 672 case '\n':
614 case '\r': 673 case '\r':
615 ++json_pos_; 674 ++json_pos_;
616 return true; 675 return true;
617 default: 676 default:
618 ++json_pos_; 677 ++json_pos_;
619 } 678 }
620 } 679 }
621 } else if ('*' == next_char) { 680 } else if ('*' == next_char) {
622 // Block comment, read until */ 681 // Block comment, read until */
623 json_pos_ += 2; 682 json_pos_ += 2;
624 while ('\0' != *json_pos_) { 683 while (json_pos_ != end_pos_) {
625 if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) { 684 if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {
626 json_pos_ += 2; 685 json_pos_ += 2;
627 return true; 686 return true;
628 } 687 }
629 ++json_pos_; 688 ++json_pos_;
630 } 689 }
631 } else { 690 } else {
632 return false; 691 return false;
633 } 692 }
634 return true; 693 return true;
635 } 694 }
636 695
637 bool JSONReader::NextStringMatch(const wchar_t* str, size_t length) { 696 bool JSONReader::NextStringMatch(const char* str, size_t length) {
638 return wcsncmp(json_pos_, str, length) == 0; 697 return strncmp(json_pos_, str, length) == 0;
639 } 698 }
640 699
641 void JSONReader::SetErrorCode(JsonParseError error, 700 void JSONReader::SetErrorCode(JsonParseError error,
642 const wchar_t* error_pos) { 701 const char* error_pos) {
643 int line_number = 1; 702 int line_number = 1;
644 int column_number = 1; 703 int column_number = 1;
645 704
646 // Figure out the line and column the error occurred at. 705 // Figure out the line and column the error occurred at.
647 for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) { 706 for (const char* pos = start_pos_; pos != error_pos; ++pos) {
648 if (*pos == '\0') { 707 if (pos > end_pos_) {
649 NOTREACHED(); 708 NOTREACHED();
650 return; 709 return;
651 } 710 }
652 711
653 if (*pos == '\n') { 712 if (*pos == '\n') {
654 ++line_number; 713 ++line_number;
655 column_number = 1; 714 column_number = 1;
656 } else { 715 } else {
657 ++column_number; 716 ++column_number;
658 } 717 }
659 } 718 }
660 719
661 error_line_ = line_number; 720 error_line_ = line_number;
662 error_col_ = column_number; 721 error_col_ = column_number;
663 error_code_ = error; 722 error_code_ = error;
664 } 723 }
665 724
666 } // namespace base 725 } // namespace base
OLDNEW
« no previous file with comments | « base/json/json_reader.h ('k') | base/json/json_reader_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698