base/json/json_parser.cc - Issue 10035042: Rewrite base::JSONReader to be 35-40% faster, depending on the input string.

Side by Side Diff: base/json/json_parser.cc

Issue 10035042: Rewrite base::JSONReader to be 35-40% faster, depending on the input string. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Address comments/fix Win Created 8 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "base/json/json_parser.h"

	6

	7 #include "base/float_util.h"

	8 #include "base/logging.h"

	9 #include "base/memory/scoped_ptr.h"

	10 #include "base/string_number_conversions.h"

	11 #include "base/string_util.h"

	12 #include "base/stringprintf.h"

	13 #include "base/third_party/icu/icu_utf.h"

	14 #include "base/utf_string_conversion_utils.h"

	15 #include "base/utf_string_conversions.h"

	16 #include "base/values.h"

	17

	18 namespace base {

	19 namespace internal {

	20

	21 namespace {

	22

	23 const int kStackMaxDepth = 100;

	24

	25 const int32 kExtendedASCIIStart = 0x80;

	26

	27 // This and the class below are used to own the JSON input string for when

	28 // string tokens are stored as StringPiece instead of std::string. This

	29 // optimization avoids about 2/3rds of string memory copies. The constructor

	30 // takes the input string and swaps its data into the new instance. The real

	31 // root value is also Swap()ed into the new instance.

	32 class DictionaryHiddenRootValue : public base::DictionaryValue {

	33 public:

	34 DictionaryHiddenRootValue(std::string* json, Value* root) {

	35 DCHECK(root->IsType(Value::TYPE_DICTIONARY));

	36 DictionaryValue::Swap(static_cast<DictionaryValue*>(root));

	37 json->swap(json_);

	38 }

	39

	40 virtual void Swap(DictionaryValue* other) OVERRIDE {

	41 DLOG(1) << "Swap()ing a DictionaryValue inefficiently.";
	Mark Mentovai 2012/05/08 20:19:41: DLOG(1) is DLOG(WARNING). You either meant that or DVLOG(1). Throughout.
	Robert Sesek 2012/05/15 16:57:51: I've said it before, but it bears repeating: we have way too many logging macros.
	42

	43 // First deep copy to convert JSONStringValue to std::string and swap that

	44 // copy with \|other\|, which contains the new contents of \|this\|.

	45 scoped_ptr<base::DictionaryValue> copy(DeepCopy());

	46 copy->Swap(other);

	47

	48 // Then erase the contents of the current dictionary and swap in the

	49 // new contents, originally from \|other\|.

	50 Clear();

	51 json_.clear();

	52 DictionaryValue::Swap(copy.get());

	53 }

	54

	55 // Not overriding DictionaryValue::Remove because it just calls through to

	56 // the method below.

	57

	58 virtual bool RemoveWithoutPathExpansion(const std::string& key,

	59 Value** out) OVERRIDE {

	60 // If the caller won't take ownership of the removed value, just call up.

	61 if (!out)

	62 return DictionaryValue::RemoveWithoutPathExpansion(key, out);

	63

	64 DLOG(1) << "Remove()ing from a DictionaryValue inefficiently.";

	65

	66 // Otherwise, remove the value while its still "owned" by this and copy it

	67 // to convert any JSONStringValues to std::string.

	68 Value* out_owned = NULL;

	69 if (!DictionaryValue::RemoveWithoutPathExpansion(key, &out_owned))

	70 return false;

	71

	72 *out = out_owned->DeepCopy();

	73 delete out_owned;

	74

	75 return true;

	76 }

	77

	78 private:

	79 std::string json_;

	80

	81 DISALLOW_COPY_AND_ASSIGN(DictionaryHiddenRootValue);

	82 };

	83

	84 class ListHiddenRootValue : public base::ListValue {

	85 public:

	86 ListHiddenRootValue(std::string* json, Value* root) {

	87 DCHECK(root->IsType(Value::TYPE_LIST));

	88 ListValue::Swap(static_cast<ListValue*>(root));

	89 json->swap(json_);

	90 }

	91

	92 virtual void Swap(ListValue* other) OVERRIDE {

	93 DLOG(1) << "Swap()ing a ListValue inefficiently.";

	94

	95 // First deep copy to convert JSONStringValue to std::string and swap that

	96 // copy with \|other\|, which contains the new contents of \|this\|.

	97 scoped_ptr<base::ListValue> copy(DeepCopy());

	98 copy->Swap(other);

	99

	100 // Then erase the contents of the current list and swap in the new contents,

	101 // originally from \|other\|.

	102 Clear();

	103 json_.clear();

	104 ListValue::Swap(copy.get());

	105 }

	106

	107 virtual bool Remove(size_t index, Value** out) OVERRIDE {

	108 // If the caller won't take ownership of the removed value, just call up.

	109 if (!out)

	110 return ListValue::Remove(index, out);

	111

	112 DLOG(1) << "Remove()ing from a ListValue inefficiently.";

	113

	114 // Otherwise, remove the value while its still "owned" by this and copy it

	115 // to convert any JSONStringValues to std::string.

	116 Value* out_owned = NULL;

	117 if (!ListValue::Remove(index, &out_owned))

	118 return false;

	119

	120 *out = out_owned->DeepCopy();

	121 delete out_owned;

	122

	123 return true;

	124 }

	125

	126 private:

	127 std::string json_;

	128

	129 DISALLOW_COPY_AND_ASSIGN(ListHiddenRootValue);

	130 };

	131

	132 // A variant on StringValue that uses StringPiece instead of copying the string

	133 // into the Value. This can only be stored in a child of hidden root (above),

	134 // otherwise the referenced string will not be guaranteed to outlive it.

	135 class JSONStringValue : public base::Value {

	136 public:

	137 explicit JSONStringValue(const base::StringPiece& piece)

	138 : Value(TYPE_STRING),

	139 string_piece_(piece) {

	140 }

	141

	142 // Value:

	143 bool GetAsString(std::string* out_value) const OVERRIDE {

	144 string_piece_.CopyToString(out_value);

	145 return true;

	146 }

	147 bool GetAsString(string16* out_value) const OVERRIDE {

	148 *out_value = UTF8ToUTF16(string_piece_);

	149 return true;

	150 }

	151 virtual Value* DeepCopy() const OVERRIDE {

	152 return Value::CreateStringValue(string_piece_.as_string());

	153 }

	154 virtual bool Equals(const Value* other) const OVERRIDE {

	155 std::string other_string;

	156 return other->IsType(TYPE_STRING) && other->GetAsString(&other_string) &&

	157 StringPiece(other_string) == string_piece_;

	158 }

	159

	160 private:

	161 // The location in the original input stream.

	162 base::StringPiece string_piece_;

	163

	164 DISALLOW_COPY_AND_ASSIGN(JSONStringValue);

	165 };

	166

	167 // Simple class that checks for maximum recursion/"stack overflow."

	168 class StackMarker {

	169 public:

	170 explicit StackMarker(int* depth) : depth_(depth) {

	171 ++(*depth_);

	172 }
	Mark Mentovai 2012/05/08 20:19:41: You should (D)CHECK here that depth <= kStackMaxDepth to enforce that the callers are using this class properly. If they set a marker but never check IsTooDeep and return, things can continue recursing unchecked.
	Robert Sesek 2012/05/15 16:57:51: Done.
	173 ~StackMarker() {

	174 --(*depth_);

	175 }

	176

	177 bool IsTooDeep() const {

	178 return *depth_ >= kStackMaxDepth;

	179 }

	180

	181 private:

	182 int* const depth_;

	183

	184 DISALLOW_COPY_AND_ASSIGN(StackMarker);

	185 };

	186

	187 } // namespace

	188

	189 JSONParser::JSONParser(int options)

	190 : options_(options),

	191 start_pos_(NULL),

	192 pos_(0),
	tfarina 2012/05/04 00:25:28: nit: just curious why did you choose 0 to initialize pos_ and end_pos_ instead of NULL, but NULL for start_pos_. Seems inconsistent to my taste :/
	Robert Sesek 2012/05/15 16:57:51: Done.
	193 end_pos_(0),

	194 index_(0),

	195 stack_depth_(0),

	196 line_number_(0),

	197 index_last_line_(0),

	198 error_code_(JSONReader::JSON_NO_ERROR),

	199 error_line_(0),

	200 error_column_(0) {

	201 }

	202

	203 JSONParser::~JSONParser() {

	204 }

	205

	206 Value* JSONParser::Parse(const std::string& input) {

	207 // TODO(rsesek): Windows has problems with StringPiece/hidden roots. Fix

	208 // <http://crbug.com/126107> when my Windows box arrives.

	209 #if defined(OS_WIN)

	210 options_ \|= JSON_DETACHABLE_CHILDREN;

	211 #endif

	212

	213 std::string input_copy;

	214 // If the children of a JSON root can be detached, then hidden roots cannot

	215 // be used, so do not bother copying the input because StringPiece will not

	216 // be used anywhere.

	217 if (!(options_ & JSON_DETACHABLE_CHILDREN)) {

	218 input_copy = input;

	219 start_pos_ = input_copy.data();

	220 } else {

	221 start_pos_ = input.data();

	222 }

	223 pos_ = start_pos_;

	224 end_pos_ = start_pos_ + input.length();

	225 index_ = 0;

	226 line_number_ = 1;

	227 index_last_line_ = 0;

	228

	229 error_code_ = JSONReader::JSON_NO_ERROR;

	230 error_line_ = 0;

	231 error_column_ = 0;

	232

	233 // When the input JSON string starts with a UTF-8 Byte-Order-Mark

	234 // <0xEF 0xBB 0xBF>, advance the start position to avoid the

	235 // ParseNextToken function mis-treating a Unicode BOM as an invalid

	236 // character and returning NULL.

	237 if (CanConsume(3) && static_cast<uint8>(*pos_) == 0xEF &&

	238 static_cast<uint8>(*(pos_ + 1)) == 0xBB &&

	239 static_cast<uint8>(*(pos_ + 2)) == 0xBF) {

	240 NextNChars(3);

	241 }

	242

	243 // Parse the first and all subsequent tokens.

	244 scoped_ptr<Value> root(ParseNextToken());

	245 if (!root.get())

	246 return NULL;

	247

	248 // Make sure the input stream is at an end.

	249 if (GetNextToken() != T_END_OF_INPUT) {

	250 if (!CanConsume(1) \|\| (NextChar() && GetNextToken() != T_END_OF_INPUT)) {

	251 ReportError(JSONReader::JSON_UNEXPECTED_DATA_AFTER_ROOT, 1);

	252 return NULL;

	253 }

	254 }

	255

	256 // Dictionaries and lists can contain JSONStringValues, so wrap them in a

	257 // hidden root.

	258 if (!(options_ & JSON_DETACHABLE_CHILDREN)) {

	259 if (root->IsType(Value::TYPE_DICTIONARY)) {

	260 return new DictionaryHiddenRootValue(&input_copy, root.release());

	261 } else if (root->IsType(Value::TYPE_LIST)) {

	262 return new ListHiddenRootValue(&input_copy, root.release());

	263 } else if (root->IsType(Value::TYPE_STRING)) {

	264 // A string type could be a JSONStringValue, but because there's no

	265 // corresponding HiddenRootValue, the memory will be lost. Deep copy to

	266 // preserve it.

	267 return root->DeepCopy();

	268 }

	269 }

	270

	271 // All other values can be returned directly.

	272 return root.release();

	273 }

	274

	275 JSONReader::JsonParseError JSONParser::error_code() const {

	276 return error_code_;

	277 }

	278

	279 std::string JSONParser::GetErrorMessage() const {

	280 return FormatErrorMessage(error_line_, error_column_,

	281 JSONReader::ErrorCodeToString(error_code_));

	282 }

	283

	284 // StringBuilder ///////////////////////////////////////////////////////////////

	285

	286 JSONParser::StringBuilder::StringBuilder()

	287 : pos_(NULL),

	288 length_(0),

	289 string_(NULL) {

	290 }

	291

	292 JSONParser::StringBuilder::StringBuilder(const char* pos)

	293 : pos_(pos),

	294 length_(0),

	295 string_(NULL) {

	296 }

	297

	298 void JSONParser::StringBuilder::Swap(StringBuilder* other) {

	299 std::swap(other->string_, string_);

	300 std::swap(other->pos_, pos_);

	301 std::swap(other->length_, length_);

	302 }

	303

	304 JSONParser::StringBuilder::~StringBuilder() {

	305 delete string_;

	306 }

	307

	308 void JSONParser::StringBuilder::Append(const char& c) {

	309 DCHECK_GE(c, 0);
	Mark Mentovai 2012/05/08 20:19:41: Because of the stupid nature of char, you should be doing these checks with it explicitly signed or unsigned.
	Robert Sesek 2012/05/15 16:57:51: How would you do this?
	310 DCHECK_LT(c, 128);

	311

	312 if (string_)

	313 string_->push_back(c);

	314 else

	315 ++length_;

	316 }

	317

	318 void JSONParser::StringBuilder::AppendString(const std::string& str) {

	319 DCHECK(string_);

	320 string_->append(str);

	321 }

	322

	323 void JSONParser::StringBuilder::Convert() {

	324 if (string_)

	325 return;

	326 string_ = new std::string(pos_, length_);

	327 }

	328

	329 bool JSONParser::StringBuilder::CanBeStringPiece() const {

	330 return !string_;

	331 }

	332

	333 StringPiece JSONParser::StringBuilder::AsStringPiece() {

	334 if (string_)

	335 return StringPiece();

	336 return StringPiece(pos_, length_);

	337 }

	338

	339 const std::string& JSONParser::StringBuilder::AsString() {

	340 if (!string_)

	341 Convert();

	342 return *string_;

	343 }

	344

	345 // JSONParser private //////////////////////////////////////////////////////////

	346

	347 inline bool JSONParser::CanConsume(int length) {

	348 return pos_ + length <= end_pos_;

	349 }

	350

	351 const char* JSONParser::NextChar() {

	352 DCHECK(CanConsume(1));

	353 ++index_;

	354 ++pos_;

	355 return pos_;

	356 }

	357

	358 void JSONParser::NextNChars(int n) {

	359 DCHECK(CanConsume(n));

	360 index_ += n;

	361 pos_ += n;

	362 }

	363

	364 JSONParser::Token JSONParser::GetNextToken() {

	365 EatWhitespaceAndComments();

	366 if (!CanConsume(1))

	367 return T_END_OF_INPUT;

	368

	369 switch (*pos_) {

	370 case '{':

	371 return T_OBJECT_BEGIN;

	372 case '}':

	373 return T_OBJECT_END;

	374 case '[':

	375 return T_ARRAY_BEGIN;

	376 case ']':

	377 return T_ARRAY_END;

	378 case '"':

	379 return T_STRING;

	380 case '0':

	381 case '1':

	382 case '2':

	383 case '3':

	384 case '4':

	385 case '5':

	386 case '6':

	387 case '7':

	388 case '8':

	389 case '9':

	390 case '-':

	391 return T_NUMBER;

	392 case 't':

	393 return T_BOOL_TRUE;

	394 case 'f':

	395 return T_BOOL_FALSE;

	396 case 'n':

	397 return T_NULL;

	398 case ',':

	399 return T_LIST_SEPARATOR;

	400 case ':':

	401 return T_OBJECT_PAIR_SEPARATOR;

	402 default:

	403 return T_INVALID_TOKEN;

	404 }

	405 }

	406

	407 void JSONParser::EatWhitespaceAndComments() {

	408 while (pos_ < end_pos_) {

	409 switch (*pos_) {

	410 case '\r':

	411 case '\n':

	412 index_last_line_ = index_;

	413 ++line_number_;

	414 // Fall through.

	415 case ' ':

	416 case '\t':

	417 NextChar();

	418 break;

	419 case '/':

	420 if (!EatComment())

	421 return;

	422 break;

	423 default:

	424 return;

	425 }

	426 }

	427 }

	428

	429 bool JSONParser::EatComment() {

	430 if (*pos_ != '/' \|\| !CanConsume(1))

	431 return false;

	432

	433 char next_char = *NextChar();

	434 if (next_char == '/') {

	435 // Single line comment, read to newline.

	436 while (CanConsume(1)) {

	437 char next_char = *NextChar();

	438 if (next_char == '\n' \|\| next_char == '\r')

	439 return true;

	440 }

	441 } else if (next_char == '*') {

	442 // Block comment, read until end marker.

	443 while (CanConsume(2)) {

	444 if (*NextChar() == '*' && *NextChar() == '/') {
	Mark Mentovai 2012/05/08 20:19:41: This eats two characters at a time in a loop, so the comment terminator is handled properly in /**/, improperly in /*a*/, properly again in /*ab*/, etc., improper handling for any odd number of characters between /* and */. You want to seek looking just for *, and then verify that what follows it is a /.
	Robert Sesek 2012/05/15 16:57:51: Isn't that what's happening? The operator there is &&. Added a test, and it passes.
	445 // EatWhitespaceAndComments will inspect pos_, which will still be on

	446 // the last / of the comment, so advance once more (which may also be

	447 // end of input).

	448 NextChar();

	449 return true;

	450 }

	451 }
	Mark Mentovai 2012/05/08 20:19:41: If the /* is unterminated and you reach the end of input, this returns false having eaten the comment. The parser is wound to the end of input. What’s the intended behavior? I guess GetNextToken bails out when it sees it’s at the end, but this is probably worth a comment.
	Robert Sesek 2012/05/15 16:57:51: Done.
	452 }

	453

	454 return false;

	455 }

	456

	457 Value* JSONParser::ParseNextToken() {

	458 return ParseToken(GetNextToken());

	459 }

	460

	461 Value* JSONParser::ParseToken(Token token) {

	462 switch (token) {

	463 case T_OBJECT_BEGIN:

	464 return ConsumeDictionary();

	465 case T_ARRAY_BEGIN:

	466 return ConsumeList();

	467 case T_STRING:

	468 return ConsumeString();

	469 case T_NUMBER:

	470 return ConsumeNumber();

	471 case T_BOOL_TRUE:

	472 case T_BOOL_FALSE:

	473 case T_NULL:

	474 return ConsumeLiteral();

	475 default:

	476 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);

	477 return NULL;

	478 }

	479 }

	480

	481 Value* JSONParser::ConsumeDictionary() {

	482 if (*pos_ != '{') {

	483 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);

	484 return NULL;

	485 }

	486

	487 StackMarker depth_check(&stack_depth_);

	488 if (depth_check.IsTooDeep()) {

	489 ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 1);

	490 return NULL;

	491 }

	492

	493 scoped_ptr<DictionaryValue> dict(new DictionaryValue);

	494

	495 NextChar();

	496 Token token = GetNextToken();

	497 while (token != T_OBJECT_END) {

	498 if (token != T_STRING) {

	499 ReportError(JSONReader::JSON_UNQUOTED_DICTIONARY_KEY, 1);

	500 return NULL;

	501 }

	502

	503 // First consume the key.

	504 StringBuilder key;

	505 if (!ConsumeStringRaw(&key)) {

	506 return NULL;

	507 }

	508

	509 // Read the separator.

	510 NextChar();

	511 token = GetNextToken();

	512 if (token != T_OBJECT_PAIR_SEPARATOR) {

	513 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);

	514 return NULL;

	515 }

	516

	517 // The token is the value. Ownership transfers to \|dict\|.

	518 NextChar();

	519 Value* value = ParseNextToken();

	520 if (!value) {

	521 return NULL;

	522 }

	523

	524 dict->SetWithoutPathExpansion(key.AsString(), value);

	525

	526 NextChar();

	527 token = GetNextToken();

	528 if (token == T_LIST_SEPARATOR) {

	529 NextChar();

	530 token = GetNextToken();

	531 if (token == T_OBJECT_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) {

	532 ReportError(JSONReader::JSON_TRAILING_COMMA, 1);

	533 return NULL;

	534 }

	535 } else if (token != T_OBJECT_END) {

	536 ReportError(JSONReader::JSON_SYNTAX_ERROR, 0);

	537 return NULL;

	538 }

	539 }

	540

	541 if (token != T_OBJECT_END)

	542 return NULL;

	543

	544 return dict.release();

	545 }

	546

	547 Value* JSONParser::ConsumeList() {

	548 if (*pos_ != '[') {

	549 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);

	550 return NULL;

	551 }

	552

	553 StackMarker depth_check(&stack_depth_);

	554 if (depth_check.IsTooDeep()) {

	555 ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 1);

	556 return NULL;

	557 }

	558

	559 scoped_ptr<ListValue> list(new ListValue);

	560

	561 NextChar();

	562 Token token = GetNextToken();

	563 while (token != T_ARRAY_END) {

	564 Value* item = ParseToken(token);

	565 if (!item) {

	566 // ReportError from deeper level.

	567 return NULL;

	568 }

	569

	570 list->Append(item);

	571

	572 NextChar();

	573 token = GetNextToken();

	574 if (token == T_LIST_SEPARATOR) {

	575 NextChar();

	576 token = GetNextToken();

	577 if (token == T_ARRAY_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) {

	578 ReportError(JSONReader::JSON_TRAILING_COMMA, 1);

	579 return NULL;

	580 }

	581 } else if (token != T_ARRAY_END) {

	582 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);

	583 return NULL;

	584 }

	585 }

	586

	587 if (token != T_ARRAY_END)

	588 return NULL;

	589

	590 return list.release();

	591 }

	592

	593 Value* JSONParser::ConsumeString() {

	594 StringBuilder string;

	595 if (!ConsumeStringRaw(&string))

	596 return NULL;

	597

	598 // Create the Value representation, either using a hidden root, if configured

	599 // to do so, and the string can be represented by StringPiece.

	600 if (string.CanBeStringPiece() && !(options_ & JSON_DETACHABLE_CHILDREN)) {

	601 return new JSONStringValue(string.AsStringPiece());

	602 } else {

	603 if (string.CanBeStringPiece())

	604 string.Convert();

	605 return new StringValue(string.AsString());

	606 }

	607 }

	608

	609 bool JSONParser::ConsumeStringRaw(StringBuilder* out) {

	610 if (*pos_ != '"') {

	611 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);

	612 return false;

	613 }

	614

	615 // StringBuilder will internally build a StringPiece unless a UTF-16

	616 // conversion occurs, at which point it will perform a copy into a

	617 // std::string.

	618 StringBuilder string(NextChar());

	619

	620 int length = end_pos_ - start_pos_;

	621 int32 next_char = 0;

	622

	623 DCHECK_EQ(*pos_, *(start_pos_ + index_));
	Mark Mentovai 2012/05/08 20:19:41: Why the *s?
	Robert Sesek 2012/05/15 16:57:51: Debugging code removed.
	624

	625 while (CanConsume(1)) {

	626 pos_ = start_pos_ + index_; // CBU8_NEXT is postcrement.

	627 CBU8_NEXT(start_pos_, index_, length, next_char);

	628 if (next_char < 0 \|\| !IsValidCharacter(next_char)) {

	629 ReportError(JSONReader::JSON_UNSUPPORTED_ENCODING, 1);

	630 return false;

	631 }

	632

	633 // If this character is an escape sequence...

	634 if (next_char == '\\') {

	635 // The input string will be adjusted (either by combining the two

	636 // characters of an encoded escape sequence, or with a UTF conversion),

	637 // so using StringPiece isn't possible -- force a conversion.

	638 string.Convert();

	639

	640 if (!CanConsume(1)) {

	641 ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);

	642 return false;

	643 }

	644

	645 switch (*NextChar()) {

	646 // Allowed esape sequences:

	647 case 'x': { // UTF-8 sequence.

	648 if (!CanConsume(2)) {

	649 ReportError(JSONReader::JSON_INVALID_ESCAPE, 1);

	650 return false;

	651 }

	652

	653 int hex_digit = 0;

	654 if (!HexStringToInt(StringPiece(NextChar(), 2), &hex_digit)) {

	655 ReportError(JSONReader::JSON_INVALID_ESCAPE, -1);

	656 return false;

	657 }

	658 NextChar();

	659

	660 if (hex_digit < kExtendedASCIIStart)

	661 string.Append(hex_digit);

	662 else

	663 DecodeUTF8(hex_digit, &string);
	Mark Mentovai 2012/05/08 20:19:41: How is this supposed to work? Why don’t I see it in the RFC? Seems weird and bad and underspecified. Is it just for compatibility with an old and terrible interface? Is anyone actually using it? If not removed, it definitely needs to be documented somewhere.
	Robert Sesek 2012/05/15 16:57:51: Documented. I don't want to remove this now (without knowing what inputs could be sending \x).
	664 break;

	665 }

	666 case 'u': { // UTF-16 sequence.

	667 // UTF units are of the form \uXXXX.

	668 if (!CanConsume(5)) { // 5 being 'u' and four HEX digits.

	669 ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);

	670 return false;

	671 }

	672

	673 // Skip the 'u'.

	674 NextChar();

	675

	676 std::string utf8_units;

	677 if (!DecodeUTF16(&utf8_units)) {

	678 ReportError(JSONReader::JSON_INVALID_ESCAPE, -1);

	679 return false;

	680 }

	681

	682 string.AppendString(utf8_units);

	683 break;

	684 }

	685 case '"':

	686 string.Append('"');

	687 break;

	688 case '\\':

	689 string.Append('\\');

	690 break;

	691 case '/':

	692 string.Append('/');

	693 break;

	694 case 'b':

	695 string.Append('\b');

	696 break;

	697 case 'f':

	698 string.Append('\f');

	699 break;

	700 case 'n':

	701 string.Append('\n');

	702 break;

	703 case 'r':

	704 string.Append('\r');

	705 break;

	706 case 't':

	707 string.Append('\t');

	708 break;

	709 case 'v': // Not listed as valid escape sequence in the RFC.

	710 string.Append('\v');

	711 break;

	712 // All other escape squences are illegal.

	713 default:

	714 ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);

	715 return false;

	716 }

	717 } else if (next_char == '"') {

	718 --index_; // Rewind by one because of CBU8_NEXT.

	719 out->Swap(&string);

	720 return true;

	721 } else {

	722 if (next_char < kExtendedASCIIStart)

	723 string.Append(next_char);

	724 else

	725 DecodeUTF8(next_char, &string);

	726 }

	727 }

	728

	729 ReportError(JSONReader::JSON_SYNTAX_ERROR, 0);

	730 return false;

	731 }

	732

	733 // Entry is at the first X in \uXXXX.

	734 bool JSONParser::DecodeUTF16(std::string* dest_string) {

	735 if (!CanConsume(4))

	736 return false;

	737

	738 // This is a 32-bit field because the shift operations in the

	739 // conversion process below cause MSVC to error about "data loss."

	740 // This only stores UTF-16 code units, though.

	741 // Consume the UTF-16 code unit, which may be a high surrogate.

	742 int code_unit16_high = 0;

	743 if (!HexStringToInt(StringPiece(pos_, 4), &code_unit16_high))

	744 return false;

	745

	746 // Only add 3, not 4, because at the end of this iteration, the parser has

	747 // finished working with the last digit of the UTF sequence, meaning that

	748 // the next spin of the loop will advance to the next byte.

	749 NextNChars(3);

	750

	751 // If this is a high surrogate, consume the next code unit to get the

	752 // low surrogate.

	753 int code_unit16_low = 0;

	754 if (CBU16_IS_SURROGATE(code_unit16_high)) {

	755 // Make sure this is the high surrogate. If not, it's an encoding

	756 // error.

	757 if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high))

	758 return false;

	759

	760 // Make sure that the token has more characters to consume the

	761 // lower surrogate.

	762 if (!CanConsume(6)) // 6 being '\' 'u' and four HEX digits.

	763 return false;

	764 if (*NextChar() != '\\' || *NextChar() != 'u')

	765 return false;

	766

	767 NextChar(); // Read past 'u'.

	768 if (!HexStringToInt(StringPiece(pos_, 4), &code_unit16_low))

	769 return false;

	770

	771 NextNChars(3);

	772

	773 if (!CBU16_IS_SURROGATE(code_unit16_low) \|\|

	774 !CBU16_IS_TRAIL(code_unit16_low)) {
	Mark Mentovai 2012/05/08 20:19:41: CBU16_IS_TRAIL implies CBU16_IS_SURROGATE, you only need to check CBU16_IS_TRAIL. (Optimization might eliminate the additional check. It also might not.) If you want to keep CBU16_IS_SURROGATE, it may be best as a DCHECK.
	Robert Sesek 2012/05/15 16:57:51: Done.
	775 return false;

	776 }

	777 } else if (!CBU16_IS_SINGLE(code_unit16_high)) {
	Mark Mentovai 2012/05/08 20:19:41: CBU16_IS_SINGLE is defined as !CBU16_IS_SURROGATE, so this will never be entered. If you want to do this check, it also might be best as a DCHECK.
	Robert Sesek 2012/05/15 16:57:51: Done.
	778 // If this is not a code point, it's an encoding error.

	779 return false;

	780 }

	781

	782 // Convert the UTF-16 code units to a code point and then to a UTF-8

	783 // code unit sequence.

	784 char code_point[8] = { 0 };

	785 size_t offset = 0;

	786 if (!code_unit16_low) {
	Mark Mentovai 2012/05/08 20:19:41: Rather than rechecking this, why don’t you do it inside the if-if above, immediately after deciding that code_unit16_low is a UTF-16 trail? Then you can avoid having code_unit16 even being exposed at this scope.
	Robert Sesek 2012/05/15 16:57:51: Done.
	787 CBU8_APPEND_UNSAFE(code_point, offset, code_unit16_high);

	788 } else {

	789 uint32 code_unit32 = CBU16_GET_SUPPLEMENTARY(code_unit16_high,

	790 code_unit16_low);

	791 offset = 0;

	792 CBU8_APPEND_UNSAFE(code_point, offset, code_unit32);
	Mark Mentovai 2012/05/08 20:19:41: And the same for this, except you’d put it in the “I know I have a UTF-16 single” block.
	Robert Sesek 2012/05/15 16:57:51: Done.
	793 }

	794 dest_string->append(code_point);

	795 return true;

	796 }

	797

	798 void JSONParser::DecodeUTF8(const int32& point, StringBuilder* dest) {

	799 // Anything outside of the basic ASCII plane will need to be decomposed from

	800 // int32 to a multi-byte sequence.

	801 if (point < kExtendedASCIIStart) {

	802 dest->Append(point);

	803 } else {

	804 char utf8_units[4] = { 0 };

	805 int offset = 0;

	806 CBU8_APPEND_UNSAFE(utf8_units, offset, point);

	807 dest->Convert();

	808 dest->AppendString(utf8_units);

	809 }

	810 }

	811

	812 Value* JSONParser::ConsumeNumber() {

	813 const char* num_start = pos_;

	814 const int start_index = index_;

	815 int end_index = start_index;

	816

	817 if (*pos_ == '-')

	818 NextChar();

	819

	820 if (!ReadInt(false)) {

	821 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);

	822 return NULL;

	823 }

	824 end_index = index_;

	825

	826 // The optional faction part.
	Mark Mentovai 2012/05/08 20:19:41: fraction
	Robert Sesek 2012/05/15 16:57:51: Done.
	827 if (*pos_ == '.') {

	828 if (!CanConsume(1)) {

	829 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);

	830 return NULL;

	831 }

	832 NextChar();

	833 if (!ReadInt(true)) {

	834 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);

	835 return NULL;

	836 }

	837 end_index = index_;

	838 }

	839

	840 // Optional exponent part.

	841 if (*pos_ == 'e' \|\| *pos_ == 'E') {

	842 NextChar();

	843 if (*pos_ == '-' \|\| *pos_ == '+')

	844 NextChar();

	845 if (!ReadInt(true)) {

	846 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);

	847 return NULL;

	848 }

	849 end_index = index_;

	850 }

	851

	852 // ReadInt is greedy because numbers have no easily detectable sentinel,

	853 // so save off where the parser should be on exit (see Consume invariant at

	854 // the top of the header), then make sure the next token is one which is

	855 // valid.

	856 const char* exit_pos = pos_ - 1;

	857 int exit_index = index_ - 1;

	858

	859 switch (GetNextToken()) {

	860 case T_OBJECT_END:

	861 case T_ARRAY_END:

	862 case T_LIST_SEPARATOR:

	863 case T_END_OF_INPUT:

	864 break;

	865 default:

	866 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);

	867 return NULL;

	868 }

	869

	870 pos_ = exit_pos;

	871 index_ = exit_index;

	872

	873 StringPiece num_string(num_start, end_index - start_index);

	874

	875 int num_int;

	876 if (StringToInt(num_string, &num_int))

	877 return Value::CreateIntegerValue(num_int);

	878

	879 double num_double;

	880 if (base::StringToDouble(num_string.as_string(), &num_double) &&

	881 IsFinite(num_double)) {

	882 return Value::CreateDoubleValue(num_double);

	883 }

	884

	885 return NULL;

	886 }

	887

	888 bool JSONParser::ReadInt(bool allow_leading_zeros) {

	889 char first = *pos_;

	890 int len = 0;

	891

	892 char c = first;

	893 while (CanConsume(1) && IsAsciiDigit(c)) {

	894 c = *NextChar();

	895 ++len;

	896 }

	897

	898 if (len == 0)

	899 return false;

	900

	901 if (!allow_leading_zeros && len > 1 && first == '0')

	902 return false;

	903

	904 return true;

	905 }

	906

	907 Value* JSONParser::ConsumeLiteral() {

	908 switch (*pos_) {

	909 case 't':
	Mark Mentovai 2012/05/08 20:19:41 I’d be more comfortable having kTrueLiteral[] = "t I’d be more comfortable having kTrueLiteral[] = "true" so that you can use either arraysize(kTrueLiteral) - 1 or strlen(kTrueLiteral). A constant based on the length of the literal string appears three times per case, and that’s asking for trouble. Then, you’d wrap each entire case in {} so that kTrueLiteral doesn’t exist outside of that scope, guarding against reuse of kTrueLiteral inside the "false" and "null" branches. You shouldn’t have to count characters yourself, even when there are only four of them. Robert Sesek 2012/05/15 16:57:51 Done. Show quoted text On 2012/05/08 20:19:41, Mark Mentovai wrote: > I’d be more comfortable having kTrueLiteral[] = "true" so that you can use > either arraysize(kTrueLiteral) - 1 or strlen(kTrueLiteral). A constant based on > the length of the literal string appears three times per case, and that’s asking > for trouble. > > Then, you’d wrap each entire case in {} so that kTrueLiteral doesn’t exist > outside of that scope, guarding against reuse of kTrueLiteral inside the "false" > and "null" branches. > > You shouldn’t have to count characters yourself, even when there are only four > of them. Done.
	910 if (!CanConsume(3) \|\| !StringsAreEqual(pos_, "true", 4)) {

	911 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);

	912 return NULL;

	913 }

	914 NextNChars(3);

	915 return Value::CreateBooleanValue(true);

	916 case 'f':

	917 if (!CanConsume(4) \|\| !StringsAreEqual(pos_, "false", 5)) {

	918 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);

	919 return NULL;

	920 }

	921 NextNChars(4);

	922 return Value::CreateBooleanValue(false);

	923 case 'n':

	924 if (!CanConsume(3) \|\| !StringsAreEqual(pos_, "null", 4)) {

	925 ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);

	926 return NULL;

	927 }

	928 NextNChars(3);

	929 return Value::CreateNullValue();

	930 default:

	931 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);

	932 return NULL;

	933 }

	934 }

	935

	936 // static

	937 bool JSONParser::StringsAreEqual(const char* one, const char* two, size_t len) {

	938 return strncmp(one, two, len) == 0;

	939 }

	940

	941 void JSONParser::ReportError(JSONReader::JsonParseError code,

	942 int column_adjust) {

	943 error_code_ = code;

	944 error_line_ = line_number_;

	945 error_column_ = index_ - index_last_line_ + column_adjust;

	946 }

	947

	948 // static

	949 std::string JSONParser::FormatErrorMessage(int line, int column,

	950 const std::string& description) {

	951 if (line \|\| column) {
	Mark Mentovai 2012/05/08 20:19:41 Do you ever have !line && column, or the other way Do you ever have !line && column, or the other way around? Robert Sesek 2012/05/15 16:57:51 No, but one could be zero. Show quoted text On 2012/05/08 20:19:41, Mark Mentovai wrote: > Do you ever have !line && column, or the other way around? No, but one could be zero.
	952 return StringPrintf("Line: %i, column: %i, %s",

	953 line, column, description.c_str());

	954 }

	955 return description;

	956 }

	957

	958 } // namespace internal

	959 } // namespace base

OLD	NEW

« base/json/json_parser.h ('K') | « base/json/json_parser.h ('k') | base/json/json_parser_unittest.cc » ('j') | base/json/json_parser_unittest.cc » ('J')