Index: third_party/protobuf/src/google/protobuf/io/tokenizer.cc
===================================================================
--- third_party/protobuf/src/google/protobuf/io/tokenizer.cc (revision 216642)
+++ third_party/protobuf/src/google/protobuf/io/tokenizer.cc (working copy)
@@ -89,8 +89,11 @@
 // exactly pretty.
 #include <google/protobuf/io/tokenizer.h>
+#include <google/protobuf/stubs/common.h>
+#include <google/protobuf/stubs/stringprintf.h>
 #include <google/protobuf/io/zero_copy_stream.h>
 #include <google/protobuf/stubs/strutil.h>
+#include <google/protobuf/stubs/stl_util.h>
 namespace google {
 namespace protobuf {
@@ -118,6 +121,8 @@
 CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' ||
                             c == '\r' || c == '\v' || c == '\f');
+CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' ||
+                                     c == '\r' || c == '\v' || c == '\f');
 CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');
@@ -187,7 +192,8 @@
     read_error_(false),
     line_(0),
     column_(0),
-    token_start_(-1),
+    record_target_(NULL),
+    record_start_(-1),
     allow_f_after_float_(false),
     comment_style_(CPP_COMMENT_STYLE) {
@@ -238,9 +244,9 @@
   }
   // If we're in a token, append the rest of the buffer to it.
-  if (token_start_ >= 0 && token_start_ < buffer_size_) {
-    current_.text.append(buffer_ + token_start_, buffer_size_ - token_start_);
-    token_start_ = 0;
+  if (record_target_ != NULL && record_start_ < buffer_size_) {
+    record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_);
+    record_start_ = 0;
   }
   const void* data = NULL;
@@ -261,23 +267,33 @@
   current_char_ = buffer_[0];
 }
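+// RecordTo() begins copying raw input bytes into *target, starting at the
+// current buffer position; Refresh() keeps the copy going across buffer
+// refills, and StopRecording() appends the final partial chunk.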
+inline void Tokenizer::RecordTo(string* target) {
+  record_target_ = target;
+  record_start_ = buffer_pos_;
+}
+
+inline void Tokenizer::StopRecording() {
+  // Note: The if() is necessary because some STL implementations crash when
+  // you call string::append(NULL, 0), presumably because they are trying to
+  // be helpful by detecting the NULL pointer, even though there's nothing
+  // wrong with reading zero bytes from NULL.
+  if (buffer_pos_ != record_start_) {
+    record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_);
+  }
+  record_target_ = NULL;
+  record_start_ = -1;
+}
+
 inline void Tokenizer::StartToken() {
-  token_start_ = buffer_pos_;
   current_.type = TYPE_START;    // Just for the sake of initializing it.
   current_.text.clear();
   current_.line = line_;
   current_.column = column_;
+  RecordTo(&current_.text);
 }
 inline void Tokenizer::EndToken() {
-  // Note: The if() is necessary because some STL implementations crash when
-  // you call string::append(NULL, 0), presumably because they are trying to
-  // be helpful by detecting the NULL pointer, even though there's nothing
-  // wrong with reading zero bytes from NULL.
-  if (buffer_pos_ != token_start_) {
-    current_.text.append(buffer_ + token_start_, buffer_pos_ - token_start_);
-  }
-  token_start_ = -1;
+  StopRecording();
   current_.end_column = column_;
 }
@@ -353,6 +369,27 @@
             AddError("Expected hex digits for escape sequence.");
           }
           // Possibly followed by another hex digit, but again we don't care.
+        } else if (TryConsume('u')) {
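+          // The escape is only validated here; ParseStringAppend() decodes
+          // \u and \U sequences into UTF-8 later on.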
+          if (!TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>()) {
+            AddError("Expected four hex digits for \\u escape sequence.");
+          }
+        } else if (TryConsume('U')) {
+          // We expect 8 hex digits; but only the range up to 0x10ffff is
+          // legal.
+          if (!TryConsume('0') ||
+              !TryConsume('0') ||
+              !(TryConsume('0') || TryConsume('1')) ||
+              !TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>()) {
+            AddError("Expected eight hex digits up to 10ffff for \\U escape "
+                     "sequence");
+          }
         } else {
           AddError("Invalid escape sequence in string literal.");
         }
@@ -426,26 +463,51 @@
   return is_float ? TYPE_FLOAT : TYPE_INTEGER;
 }
-void Tokenizer::ConsumeLineComment() {
+void Tokenizer::ConsumeLineComment(string* content) {
+  if (content != NULL) RecordTo(content);
+
   while (current_char_ != '\0' && current_char_ != '\n') {
     NextChar();
   }
   TryConsume('\n');
+
+  if (content != NULL) StopRecording();
 }
-void Tokenizer::ConsumeBlockComment() {
+void Tokenizer::ConsumeBlockComment(string* content) {
   int start_line = line_;
   int start_column = column_ - 2;
+  if (content != NULL) RecordTo(content);
+
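+  // A newline ends the current recorded chunk; the leading whitespace and any
+  // single '*' on the following line are then skipped without being recorded,
+  // and the trailing "*/" is stripped, so "/* foo\n * bar */" records
+  // " foo\n bar ".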
   while (true) {
     while (current_char_ != '\0' &&
            current_char_ != '*' &&
-           current_char_ != '/') {
+           current_char_ != '/' &&
+           current_char_ != '\n') {
       NextChar();
     }
-    if (TryConsume('*') && TryConsume('/')) {
+    if (TryConsume('\n')) {
+      if (content != NULL) StopRecording();
+
+      // Consume leading whitespace and asterisk.
+      ConsumeZeroOrMore<WhitespaceNoNewline>();
+      if (TryConsume('*')) {
+        if (TryConsume('/')) {
+          // End of comment.
+          break;
+        }
+      }
+
+      if (content != NULL) RecordTo(content);
+    } else if (TryConsume('*') && TryConsume('/')) {
       // End of comment.
+      if (content != NULL) {
+        StopRecording();
+        // Strip trailing "*/".
+        content->erase(content->size() - 2);
+      }
       break;
     } else if (TryConsume('/') && current_char_ == '*') {
       // Note: We didn't consume the '*' because if there is a '/' after it
@@ -456,42 +518,59 @@
       AddError("End-of-file inside block comment.");
       error_collector_->AddError(
         start_line, start_column, "  Comment started here.");
+      if (content != NULL) StopRecording();
       break;
     }
   }
 }
+Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
+  if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
+    if (TryConsume('/')) {
+      return LINE_COMMENT;
+    } else if (TryConsume('*')) {
+      return BLOCK_COMMENT;
+    } else {
+      // Oops, it was just a slash. Return it.
+      current_.type = TYPE_SYMBOL;
+      current_.text = "/";
+      current_.line = line_;
+      current_.column = column_ - 1;
+      current_.end_column = column_;
+      return SLASH_NOT_COMMENT;
+    }
+  } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
+    return LINE_COMMENT;
+  } else {
+    return NO_COMMENT;
+  }
+}
+
 // -------------------------------------------------------------------
 bool Tokenizer::Next() {
   previous_ = current_;
-  // Did we skip any characters after the last token?
-  bool skipped_stuff = false;
-
   while (!read_error_) {
-    if (TryConsumeOne<Whitespace>()) {
-      ConsumeZeroOrMore<Whitespace>();
+    ConsumeZeroOrMore<Whitespace>();
-    } else if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
-      // Starting a comment?
-      if (TryConsume('/')) {
-        ConsumeLineComment();
-      } else if (TryConsume('*')) {
-        ConsumeBlockComment();
-      } else {
-        // Oops, it was just a slash. Return it.
-        current_.type = TYPE_SYMBOL;
-        current_.text = "/";
-        current_.line = line_;
-        current_.column = column_ - 1;
+    switch (TryConsumeCommentStart()) {
+      case LINE_COMMENT:
+        ConsumeLineComment(NULL);
+        continue;
+      case BLOCK_COMMENT:
+        ConsumeBlockComment(NULL);
+        continue;
+      case SLASH_NOT_COMMENT:
         return true;
-      }
+      case NO_COMMENT:
+        break;
+    }
-    } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
-      ConsumeLineComment();
+    // Check for EOF before continuing.
+    if (read_error_) break;
-    } else if (LookingAt<Unprintable>() || current_char_ == '\0') {
+    if (LookingAt<Unprintable>() || current_char_ == '\0') {
       AddError("Invalid control characters encountered in text.");
       NextChar();
       // Skip more unprintable characters, too. But, remember that '\0' is
@@ -519,7 +598,9 @@
         if (TryConsumeOne<Digit>()) {
           // It's a floating-point number.
-        if (previous_.type == TYPE_IDENTIFIER && !skipped_stuff) {
+        if (previous_.type == TYPE_IDENTIFIER &&
+            current_.line == previous_.line &&
+            current_.column == previous_.end_column) {
             // We don't accept syntax like "blah.123".
             error_collector_->AddError(line_, column_ - 2,
               "Need space between identifier and decimal point.");
@@ -544,8 +625,6 @@
       EndToken();
       return true;
     }
-
-    skipped_stuff = true;
   }
   // EOF
@@ -557,6 +636,195 @@
   return false;
 }
+namespace {
+
+// Helper class for collecting comments and putting them in the right places.
+//
+// This basically just buffers the most recent comment until it can be decided
+// exactly where that comment should be placed. When Flush() is called, the
+// current comment goes into either prev_trailing_comments or detached_comments.
+// When the CommentCollector is destroyed, the last buffered comment goes into
+// next_leading_comments.
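+//
+// For example, given input like:
+//
+//   optional int32 foo = 1;  // ends up in prev_trailing_comments
+//
+//   // becomes a detached comment
+//
+//   // ends up in next_leading_comments
+//   optional int32 bar = 2;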
+class CommentCollector {
+ public:
+  CommentCollector(string* prev_trailing_comments,
+                   vector<string>* detached_comments,
+                   string* next_leading_comments)
+      : prev_trailing_comments_(prev_trailing_comments),
+        detached_comments_(detached_comments),
+        next_leading_comments_(next_leading_comments),
+        has_comment_(false),
+        is_line_comment_(false),
+        can_attach_to_prev_(true) {
+    if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
+    if (detached_comments != NULL) detached_comments->clear();
+    if (next_leading_comments != NULL) next_leading_comments->clear();
+  }
+
+  ~CommentCollector() {
+    // Whatever is in the buffer is a leading comment.
+    if (next_leading_comments_ != NULL && has_comment_) {
+      comment_buffer_.swap(*next_leading_comments_);
+    }
+  }
+
+  // About to read a line comment. Get the comment buffer pointer in order to
+  // read into it.
+  string* GetBufferForLineComment() {
+    // We want to combine with previous line comments, but not block comments.
+    if (has_comment_ && !is_line_comment_) {
+      Flush();
+    }
+    has_comment_ = true;
+    is_line_comment_ = true;
+    return &comment_buffer_;
+  }
+
+  // About to read a block comment. Get the comment buffer pointer in order to
+  // read into it.
+  string* GetBufferForBlockComment() {
+    if (has_comment_) {
+      Flush();
+    }
+    has_comment_ = true;
+    is_line_comment_ = false;
+    return &comment_buffer_;
+  }
+
+  void ClearBuffer() {
+    comment_buffer_.clear();
+    has_comment_ = false;
+  }
+
+  // Called once we know that the comment buffer is complete and is *not*
+  // connected to the next token.
+  void Flush() {
+    if (has_comment_) {
+      if (can_attach_to_prev_) {
+        if (prev_trailing_comments_ != NULL) {
+          prev_trailing_comments_->append(comment_buffer_);
+        }
+        can_attach_to_prev_ = false;
+      } else {
+        if (detached_comments_ != NULL) {
+          detached_comments_->push_back(comment_buffer_);
+        }
+      }
+      ClearBuffer();
+    }
+  }
+
+  void DetachFromPrev() {
+    can_attach_to_prev_ = false;
+  }
+
+ private:
+  string* prev_trailing_comments_;
+  vector<string>* detached_comments_;
+  string* next_leading_comments_;
+
+  string comment_buffer_;
+
+  // True if any comments were read into comment_buffer_. This can be true even
+  // if comment_buffer_ is empty, namely if the comment was "/**/".
+  bool has_comment_;
+
+  // Is the comment in the comment buffer a line comment?
+  bool is_line_comment_;
+
+  // Is it still possible that we could be reading a comment attached to the
+  // previous token?
+  bool can_attach_to_prev_;
+};
+
+}  // namespace
+
+bool Tokenizer::NextWithComments(string* prev_trailing_comments,
+                                 vector<string>* detached_comments,
+                                 string* next_leading_comments) {
+  CommentCollector collector(prev_trailing_comments, detached_comments,
+                             next_leading_comments);
+
+  if (current_.type == TYPE_START) {
+    collector.DetachFromPrev();
+  } else {
+    // A comment appearing on the same line must be attached to the previous
+    // declaration.
+    ConsumeZeroOrMore<WhitespaceNoNewline>();
+    switch (TryConsumeCommentStart()) {
+      case LINE_COMMENT:
+        ConsumeLineComment(collector.GetBufferForLineComment());
+
+        // Don't allow comments on subsequent lines to be attached to a trailing
+        // comment.
+        collector.Flush();
+        break;
+      case BLOCK_COMMENT:
+        ConsumeBlockComment(collector.GetBufferForBlockComment());
+
+        ConsumeZeroOrMore<WhitespaceNoNewline>();
+        if (!TryConsume('\n')) {
+          // Oops, the next token is on the same line. If we recorded a comment
+          // we really have no idea which token it should be attached to.
+          collector.ClearBuffer();
+          return Next();
+        }
+
+        // Don't allow comments on subsequent lines to be attached to a trailing
+        // comment.
+        collector.Flush();
+        break;
+      case SLASH_NOT_COMMENT:
+        return true;
+      case NO_COMMENT:
+        if (!TryConsume('\n')) {
+          // The next token is on the same line. There are no comments.
+          return Next();
+        }
+        break;
+    }
+  }
+
+  // OK, we are now on the line *after* the previous token.
+  while (true) {
+    ConsumeZeroOrMore<WhitespaceNoNewline>();
+
+    switch (TryConsumeCommentStart()) {
+      case LINE_COMMENT:
+        ConsumeLineComment(collector.GetBufferForLineComment());
+        break;
+      case BLOCK_COMMENT:
+        ConsumeBlockComment(collector.GetBufferForBlockComment());
+
+        // Consume the rest of the line so that we don't interpret it as a
+        // blank line the next time around the loop.
+        ConsumeZeroOrMore<WhitespaceNoNewline>();
+        TryConsume('\n');
+        break;
+      case SLASH_NOT_COMMENT:
+        return true;
+      case NO_COMMENT:
+        if (TryConsume('\n')) {
+          // Completely blank line.
+          collector.Flush();
+          collector.DetachFromPrev();
+        } else {
+          bool result = Next();
+          if (!result ||
+              current_.text == "}" ||
+              current_.text == "]" ||
+              current_.text == ")") {
+            // It looks like we're at the end of a scope. In this case it
+            // makes no sense to attach a comment to the following token.
+            collector.Flush();
+          }
+          return result;
+        }
+        break;
+    }
+  }
+}
+
 // -------------------------------------------------------------------
 // Token-parsing helpers. Remember that these don't need to report
 // errors since any errors should already have been reported while
@@ -626,17 +894,138 @@
   return result;
 }
+// Helper to append a Unicode code point to a string as UTF8, without bringing
+// in any external dependencies.
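+//
+// For example, code point 0x20ac (the euro sign) is appended as the three
+// bytes 0xe2 0x82 0xac.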
+static void AppendUTF8(uint32 code_point, string* output) {
+  uint32 tmp = 0;
+  int len = 0;
+  if (code_point <= 0x7f) {
+    tmp = code_point;
+    len = 1;
+  } else if (code_point <= 0x07ff) {
+    tmp = 0x0000c080 |
+        ((code_point & 0x07c0) << 2) |
+        (code_point & 0x003f);
+    len = 2;
+  } else if (code_point <= 0xffff) {
+    tmp = 0x00e08080 |
+        ((code_point & 0xf000) << 4) |
+        ((code_point & 0x0fc0) << 2) |
+        (code_point & 0x003f);
+    len = 3;
+  } else if (code_point <= 0x1fffff) {
+    tmp = 0xf0808080 |
+        ((code_point & 0x1c0000) << 6) |
+        ((code_point & 0x03f000) << 4) |
+        ((code_point & 0x000fc0) << 2) |
+        (code_point & 0x003f);
+    len = 4;
+  } else {
+    // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
+    // normally only defined up to there as well.
+    StringAppendF(output, "\\U%08x", code_point);
+    return;
+  }
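+  // tmp now holds the encoded bytes in its low-order byte positions; ghtonl()
+  // lays them out in big-endian order in memory, so the encoding occupies the
+  // last len bytes of tmp.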
+  tmp = ghtonl(tmp);
+  output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
+}
+
+// Try to read <len> hex digits from ptr, and stuff the numeric result into
+// *result. Returns true if that many digits were successfully consumed.
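+// Only a premature end of the string is detected here; the characters are
+// assumed to already be valid hex digits.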
+static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
+  *result = 0;
+  if (len == 0) return false;
+  for (const char* end = ptr + len; ptr < end; ++ptr) {
+    if (*ptr == '\0') return false;
+    *result = (*result << 4) + DigitValue(*ptr);
+  }
+  return true;
+}
+
+// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
+// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
+// surrogate. These numbers are in a reserved range of Unicode code points, so
+// if we encounter such a pair we know how to parse it and convert it into a
+// single code point.
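+//
+// For example, the escaped pair \ud83d\ude01 assembles into the single code
+// point U+1F601.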
+static const uint32 kMinHeadSurrogate = 0xd800;
+static const uint32 kMaxHeadSurrogate = 0xdc00;
+static const uint32 kMinTrailSurrogate = 0xdc00;
+static const uint32 kMaxTrailSurrogate = 0xe000;
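+// (The kMax* values above are exclusive upper bounds.)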
+
+static inline bool IsHeadSurrogate(uint32 code_point) {
+  return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
+}
+
+static inline bool IsTrailSurrogate(uint32 code_point) {
+  return (code_point >= kMinTrailSurrogate) &&
+      (code_point < kMaxTrailSurrogate);
+}
+
+// Combine a head and trail surrogate into a single Unicode code point.
+static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
+  GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
+  GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
+  return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
+      (trail_surrogate - kMinTrailSurrogate));
+}
+
+// Convert the escape sequence parameter to a number of expected hex digits.
+static inline int UnicodeLength(char key) {
+  if (key == 'u') return 4;
+  if (key == 'U') return 8;
+  return 0;
+}
+
+// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
+// to parse that sequence. On success, returns a pointer to the first char
+// beyond that sequence, and fills in *code_point. On failure, returns ptr
+// itself.
+static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
+  const char* p = ptr;
+  // Fetch the code point.
+  const int len = UnicodeLength(*p++);
+  if (!ReadHexDigits(p, len, code_point))
+    return ptr;
+  p += len;
+
+  // Check if the code point we read is a "head surrogate." If so, then we
+  // expect it to be immediately followed by another code point which is a valid
+  // "trail surrogate," and together they form a UTF-16 pair which decodes into
+  // a single Unicode point. Trail surrogates may only use \u, not \U.
+  if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
+    uint32 trail_surrogate;
+    if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
+        IsTrailSurrogate(trail_surrogate)) {
+      *code_point = AssembleUTF16(*code_point, trail_surrogate);
+      p += 6;
+    }
+    // If this failed, then we just emit the head surrogate as a code point.
+    // It's bogus, but so is the string.
+  }
+
+  return p;
+}
+
+// The text string must begin and end with single or double quote
+// characters.
 void Tokenizer::ParseStringAppend(const string& text, string* output) {
-  // Reminder: text[0] is always the quote character. (If text is
-  // empty, it's invalid, so we'll just return.)
-  if (text.empty()) {
+  // Reminder: text[0] is always a quote character. (If text is
+  // empty, it's invalid, so we'll just return).
+  const size_t text_size = text.size();
+  if (text_size == 0) {
     GOOGLE_LOG(DFATAL)
       << " Tokenizer::ParseStringAppend() passed text that could not"
         " have been tokenized as a string: " << CEscape(text);
     return;
   }
-  output->reserve(output->size() + text.size());
+  // Reserve room for new string. The branch is necessary because if
+  // there is already space available the reserve() call might
+  // downsize the output.
+  const size_t new_len = text_size + output->size();
+  if (new_len > output->capacity()) {
+    output->reserve(new_len);
+  }
   // Loop through the string copying characters to "output" and
   // interpreting escape sequences. Note that any invalid escape
@@ -674,19 +1063,27 @@
         }
         output->push_back(static_cast<char>(code));
+      } else if (*ptr == 'u' || *ptr == 'U') {
+        uint32 unicode;
+        const char* end = FetchUnicodePoint(ptr, &unicode);
+        if (end == ptr) {
+          // Failure: Just dump out what we saw, don't try to parse it.
+          output->push_back(*ptr);
+        } else {
+          AppendUTF8(unicode, output);
+          ptr = end - 1;  // Because we're about to ++ptr.
+        }
       } else {
         // Some other escape code.
        output->push_back(TranslateEscape(*ptr));
       }
-    } else if (*ptr == text[0]) {
-      // Ignore quote matching the starting quote.
+    } else if (*ptr == text[0] && ptr[1] == '\0') {
+      // Ignore final quote matching the starting quote.
     } else {
       output->push_back(*ptr);
     }
   }
-
-  return;
 }
 }  // namespace io