OLD | NEW |
| (Empty) |
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 // | |
5 // A JSON parser. Converts strings of JSON into a Value object (see | |
6 // base/values.h). | |
7 // http://www.ietf.org/rfc/rfc4627.txt?number=4627 | |
8 // | |
9 // Known limitations/deviations from the RFC: | |
10 // - Only knows how to parse ints within the range of a signed 32 bit int and | |
11 // decimal numbers within a double. | |
12 // - Assumes input is encoded as UTF8. The spec says we should allow UTF-16 | |
13 // (BE or LE) and UTF-32 (BE or LE) as well. | |
14 // - We limit nesting to 100 levels to prevent stack overflow (this is allowed | |
15 // by the RFC). | |
16 // - A Unicode FAQ ("http://unicode.org/faq/utf_bom.html") notes that a data | |
17 // stream may start with a Unicode Byte-Order-Mark (U+FEFF), i.e. the input | |
18 // UTF-8 string for the JSONReader::JsonToValue() function may start with a | |
19 // UTF-8 BOM (0xEF, 0xBB, 0xBF). | |
20 // To keep the function from mistaking a UTF-8 BOM for an invalid | |
21 // character, the function skips a Unicode BOM at the beginning of the | |
22 // Unicode string (converted from the input UTF-8 string) before parsing it. | |
23 // | |
24 // TODO(tc): Add a parsing option to relax object keys being wrapped in | |
25 // double quotes. | |
26 // TODO(tc): Add an option to disable comment stripping | |
27 // TODO(aa): Consider making the constructor public and the static Read() method | |
28 // only a convenience for the common uses with more complex configuration going | |
29 // on the instance. | |
30 | |
31 #ifndef BASE_JSON_READER_H_ | |
32 #define BASE_JSON_READER_H_ | |
33 | |
34 #include <string> | |
35 | |
36 #include "base/basictypes.h" | |
37 #include "testing/gtest/include/gtest/gtest_prod.h" | |
38 | |
39 class Value; | |
40 | |
41 class JSONReader { | |
42 public: | |
43 // A struct to hold a JS token. | |
44 class Token { | |
45 public: | |
46 enum Type { | |
47 OBJECT_BEGIN, // { | |
48 OBJECT_END, // } | |
49 ARRAY_BEGIN, // [ | |
50 ARRAY_END, // ] | |
51 STRING, | |
52 NUMBER, | |
53 BOOL_TRUE, // true | |
54 BOOL_FALSE, // false | |
55 NULL_TOKEN, // null | |
56 LIST_SEPARATOR, // , | |
57 OBJECT_PAIR_SEPARATOR, // : | |
58 END_OF_INPUT, | |
59 INVALID_TOKEN, | |
60 }; | |
61 Token(Type t, const wchar_t* b, int len) | |
62 : type(t), begin(b), length(len) {} | |
63 | |
64 Type type; | |
65 | |
66 // A pointer into JSONReader::json_pos_ that's the beginning of this token. | |
67 const wchar_t* begin; | |
68 | |
69 // End should be one char past the end of the token. | |
70 int length; | |
71 | |
72 // Get the character that's one past the end of this token. | |
73 wchar_t NextChar() { | |
74 return *(begin + length); | |
75 } | |
76 }; | |
77 | |
78 // Error messages that can be returned. | |
79 static const char* kBadRootElementType; | |
80 static const char* kInvalidEscape; | |
81 static const char* kSyntaxError; | |
82 static const char* kTrailingComma; | |
83 static const char* kTooMuchNesting; | |
84 static const char* kUnexpectedDataAfterRoot; | |
85 static const char* kUnsupportedEncoding; | |
86 static const char* kUnquotedDictionaryKey; | |
87 | |
88 JSONReader(); | |
89 | |
90 // Reads and parses |json|, returning a Value. The caller owns the returned | |
91 // instance. If |json| is not a properly formed JSON string, returns NULL. | |
92 // If |allow_trailing_comma| is true, we will ignore trailing commas in | |
93 // objects and arrays even though this goes against the RFC. | |
94 static Value* Read(const std::string& json, bool allow_trailing_comma); | |
95 | |
96 // Reads and parses |json| like Read(). |error_message_out| is optional. If | |
97 // specified and NULL is returned, |error_message_out| will be populated with | |
98 // a string describing the error. Otherwise, |error_message_out| is | |
99 // unmodified. | |
100 static Value* ReadAndReturnError(const std::string& json, | |
101 bool allow_trailing_comma, | |
102 std::string* error_message_out); | |
103 | |
104 // Returns the error message if the last call to JsonToValue() failed. If the | |
105 // last call did not fail, returns a valid empty string. | |
106 std::string error_message() { return error_message_; } | |
107 | |
108 // Reads and parses |json|, returning a Value. The caller owns the returned | |
109 // instance. If |json| is not a properly formed JSON string, returns NULL and | |
110 // a detailed error can be retrieved from |error_message()|. | |
111 // If |check_root| is true, we require that the root object be an object or | |
112 // array. Otherwise, it can be any valid JSON type. | |
113 // If |allow_trailing_comma| is true, we will ignore trailing commas in | |
114 // objects and arrays even though this goes against the RFC. | |
115 Value* JsonToValue(const std::string& json, bool check_root, | |
116 bool allow_trailing_comma); | |
117 | |
118 private: | |
119 static std::string FormatErrorMessage(int line, int column, | |
120 const char* description); | |
121 | |
122 DISALLOW_EVIL_CONSTRUCTORS(JSONReader); | |
123 | |
124 FRIEND_TEST(JSONReaderTest, Reading); | |
125 FRIEND_TEST(JSONReaderTest, ErrorMessages); | |
126 | |
127 // Recursively build Value. Returns NULL if we don't have a valid JSON | |
128 // string. If |is_root| is true, we verify that the root element is either | |
129 // an object or an array. | |
130 Value* BuildValue(bool is_root); | |
131 | |
132 // Parses a sequence of characters into a Token::NUMBER. If the sequence of | |
133 // characters is not a valid number, returns a Token::INVALID_TOKEN. Note | |
134 // that DecodeNumber is used to actually convert from a string to an | |
135 // int/double. | |
136 Token ParseNumberToken(); | |
137 | |
138 // Try and convert the substring that token holds into an int or a double. If | |
139 // we can (ie., no overflow), return the value, else return NULL. | |
140 Value* DecodeNumber(const Token& token); | |
141 | |
142 // Parses a sequence of characters into a Token::STRING. If the sequence of | |
143 // characters is not a valid string, returns a Token::INVALID_TOKEN. Note | |
144 // that DecodeString is used to actually decode the escaped string into an | |
145 // actual wstring. | |
146 Token ParseStringToken(); | |
147 | |
148 // Convert the substring into a value string. This should always succeed | |
149 // (otherwise ParseStringToken would have failed). | |
150 Value* DecodeString(const Token& token); | |
151 | |
152 // Grabs the next token in the JSON stream. This does not increment the | |
153 // stream so it can be used to look ahead at the next token. | |
154 Token ParseToken(); | |
155 | |
156 // Increments |json_pos_| past leading whitespace and comments. | |
157 void EatWhitespaceAndComments(); | |
158 | |
159 // If |json_pos_| is at the start of a comment, eat it, otherwise, returns | |
160 // false. | |
161 bool EatComment(); | |
162 | |
163 // Checks if |json_pos_| matches str. | |
164 bool NextStringMatch(const std::wstring& str); | |
165 | |
166 // Creates the error message that will be returned to the caller. The current | |
167 // line and column are determined and added into the final message. | |
168 void SetErrorMessage(const char* description, const wchar_t* error_pos); | |
169 | |
170 // Pointer to the starting position in the input string. | |
171 const wchar_t* start_pos_; | |
172 | |
173 // Pointer to the current position in the input string. | |
174 const wchar_t* json_pos_; | |
175 | |
176 // Used to keep track of how many nested lists/dicts there are. | |
177 int stack_depth_; | |
178 | |
179 // A parser flag that allows trailing commas in objects and arrays. | |
180 bool allow_trailing_comma_; | |
181 | |
182 // Contains the error message for the last call to JsonToValue(), if any. | |
183 std::string error_message_; | |
184 }; | |
185 | |
186 #endif // BASE_JSON_READER_H_ | |
OLD | NEW |