Index: third_party/protobuf/src/google/protobuf/io/tokenizer.cc
===================================================================
--- third_party/protobuf/src/google/protobuf/io/tokenizer.cc (revision 216642)
+++ third_party/protobuf/src/google/protobuf/io/tokenizer.cc (working copy)
@@ -89,8 +89,11 @@
 // exactly pretty.
 #include <google/protobuf/io/tokenizer.h>
+#include <google/protobuf/stubs/common.h>
+#include <google/protobuf/stubs/stringprintf.h>
 #include <google/protobuf/io/zero_copy_stream.h>
 #include <google/protobuf/stubs/strutil.h>
+#include <google/protobuf/stubs/stl_util.h>
 namespace google {
 namespace protobuf {
@@ -118,6 +121,8 @@
 CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' ||
                             c == '\r' || c == '\v' || c == '\f');
+CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' ||
+                                     c == '\r' || c == '\v' || c == '\f');
 CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');
@@ -187,7 +192,8 @@
     read_error_(false),
     line_(0),
     column_(0),
-    token_start_(-1),
+    record_target_(NULL),
+    record_start_(-1),
     allow_f_after_float_(false),
     comment_style_(CPP_COMMENT_STYLE) {
@@ -238,9 +244,9 @@
   }
   // If we're in a token, append the rest of the buffer to it.
-  if (token_start_ >= 0 && token_start_ < buffer_size_) {
-    current_.text.append(buffer_ + token_start_, buffer_size_ - token_start_);
-    token_start_ = 0;
+  if (record_target_ != NULL && record_start_ < buffer_size_) {
+    record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_);
+    record_start_ = 0;
   }
   const void* data = NULL;
@@ -261,23 +267,33 @@
   current_char_ = buffer_[0];
 }
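+// RecordTo() begins copying raw input bytes into *target, starting at the
+// current buffer position; Refresh() keeps the copy going across buffer
+// refills, and StopRecording() appends the final partial chunk.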
+inline void Tokenizer::RecordTo(string* target) {
+  record_target_ = target;
+  record_start_ = buffer_pos_;
+}
+
+inline void Tokenizer::StopRecording() {
+  // Note: The if() is necessary because some STL implementations crash when
+  // you call string::append(NULL, 0), presumably because they are trying to
+  // be helpful by detecting the NULL pointer, even though there's nothing
+  // wrong with reading zero bytes from NULL.
+  if (buffer_pos_ != record_start_) {
+    record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_);
+  }
+  record_target_ = NULL;
+  record_start_ = -1;
+}
+
 inline void Tokenizer::StartToken() {
-  token_start_ = buffer_pos_;
   current_.type = TYPE_START;    // Just for the sake of initializing it.
   current_.text.clear();
   current_.line = line_;
   current_.column = column_;
+  RecordTo(&current_.text);
 }
 inline void Tokenizer::EndToken() {
-  // Note: The if() is necessary because some STL implementations crash when
-  // you call string::append(NULL, 0), presumably because they are trying to
-  // be helpful by detecting the NULL pointer, even though there's nothing
-  // wrong with reading zero bytes from NULL.
-  if (buffer_pos_ != token_start_) {
-    current_.text.append(buffer_ + token_start_, buffer_pos_ - token_start_);
-  }
-  token_start_ = -1;
+  StopRecording();
   current_.end_column = column_;
 }
@@ -353,6 +369,27 @@
             AddError("Expected hex digits for escape sequence.");
           }
           // Possibly followed by another hex digit, but again we don't care.
+        } else if (TryConsume('u')) {
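+          // The escape is only validated here; ParseStringAppend() decodes
+          // \u and \U sequences into UTF-8 later on.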
+          if (!TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>()) {
+            AddError("Expected four hex digits for \\u escape sequence.");
+          }
+        } else if (TryConsume('U')) {
+          // We expect 8 hex digits; but only the range up to 0x10ffff is
+          // legal.
+          if (!TryConsume('0') ||
+              !TryConsume('0') ||
+              !(TryConsume('0') || TryConsume('1')) ||
+              !TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>() ||
+              !TryConsumeOne<HexDigit>()) {
+            AddError("Expected eight hex digits up to 10ffff for \\U escape "
+                     "sequence");
+          }
         } else {
           AddError("Invalid escape sequence in string literal.");
         }
@@ -426,26 +463,51 @@
   return is_float ? TYPE_FLOAT : TYPE_INTEGER;
 }
-void Tokenizer::ConsumeLineComment() {
+void Tokenizer::ConsumeLineComment(string* content) {
+  if (content != NULL) RecordTo(content);
+
   while (current_char_ != '\0' && current_char_ != '\n') {
     NextChar();
   }
   TryConsume('\n');
+
+  if (content != NULL) StopRecording();
 }
-void Tokenizer::ConsumeBlockComment() {
+void Tokenizer::ConsumeBlockComment(string* content) {
   int start_line = line_;
   int start_column = column_ - 2;
+  if (content != NULL) RecordTo(content);
+
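+  // A newline ends the current recorded chunk; the leading whitespace and any
+  // single '*' on the following line are then skipped without being recorded,
+  // and the trailing "*/" is stripped, so "/* foo\n * bar */" records
+  // " foo\n bar ".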
   while (true) {
     while (current_char_ != '\0' &&
            current_char_ != '*' &&
-           current_char_ != '/') {
+           current_char_ != '/' &&
+           current_char_ != '\n') {
       NextChar();
     }
-    if (TryConsume('*') && TryConsume('/')) {
+    if (TryConsume('\n')) {
+      if (content != NULL) StopRecording();
+
+      // Consume leading whitespace and asterisk.
+      ConsumeZeroOrMore<WhitespaceNoNewline>();
+      if (TryConsume('*')) {
+        if (TryConsume('/')) {
+          // End of comment.
+          break;
+        }
+      }
+
+      if (content != NULL) RecordTo(content);
+    } else if (TryConsume('*') && TryConsume('/')) {
       // End of comment.
+      if (content != NULL) {
+        StopRecording();
+        // Strip trailing "*/".
+        content->erase(content->size() - 2);
+      }
       break;
     } else if (TryConsume('/') && current_char_ == '*') {
       // Note: We didn't consume the '*' because if there is a '/' after it
@@ -456,42 +518,59 @@
       AddError("End-of-file inside block comment.");
       error_collector_->AddError(
         start_line, start_column, "  Comment started here.");
+      if (content != NULL) StopRecording();
       break;
     }
   }
 }
+Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
+  if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
+    if (TryConsume('/')) {
+      return LINE_COMMENT;
+    } else if (TryConsume('*')) {
+      return BLOCK_COMMENT;
+    } else {
+      // Oops, it was just a slash. Return it.
+      current_.type = TYPE_SYMBOL;
+      current_.text = "/";
+      current_.line = line_;
+      current_.column = column_ - 1;
+      current_.end_column = column_;
+      return SLASH_NOT_COMMENT;
+    }
+  } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
+    return LINE_COMMENT;
+  } else {
+    return NO_COMMENT;
+  }
+}
+
 // -------------------------------------------------------------------
 bool Tokenizer::Next() {
   previous_ = current_;
-  // Did we skip any characters after the last token?
-  bool skipped_stuff = false;
-
   while (!read_error_) {
-    if (TryConsumeOne<Whitespace>()) {
-      ConsumeZeroOrMore<Whitespace>();
+    ConsumeZeroOrMore<Whitespace>();
-    } else if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
-      // Starting a comment?
-      if (TryConsume('/')) {
-        ConsumeLineComment();
-      } else if (TryConsume('*')) {
-        ConsumeBlockComment();
-      } else {
-        // Oops, it was just a slash. Return it.
-        current_.type = TYPE_SYMBOL;
-        current_.text = "/";
-        current_.line = line_;
-        current_.column = column_ - 1;
+    switch (TryConsumeCommentStart()) {
+      case LINE_COMMENT:
+        ConsumeLineComment(NULL);
+        continue;
+      case BLOCK_COMMENT:
+        ConsumeBlockComment(NULL);
+        continue;
+      case SLASH_NOT_COMMENT:
         return true;
-      }
+      case NO_COMMENT:
+        break;
+    }
-    } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
-      ConsumeLineComment();
+    // Check for EOF before continuing.
+    if (read_error_) break;
-    } else if (LookingAt<Unprintable>() || current_char_ == '\0') {
+    if (LookingAt<Unprintable>() || current_char_ == '\0') {
       AddError("Invalid control characters encountered in text.");
       NextChar();
       // Skip more unprintable characters, too. But, remember that '\0' is
@@ -519,7 +598,9 @@
         if (TryConsumeOne<Digit>()) {
           // It's a floating-point number.
-        if (previous_.type == TYPE_IDENTIFIER && !skipped_stuff) {
+        if (previous_.type == TYPE_IDENTIFIER &&
+            current_.line == previous_.line &&
+            current_.column == previous_.end_column) {
             // We don't accept syntax like "blah.123".
             error_collector_->AddError(line_, column_ - 2,
               "Need space between identifier and decimal point.");
@@ -544,8 +625,6 @@
       EndToken();
       return true;
     }
-
-    skipped_stuff = true;
   }
   // EOF
@@ -557,6 +636,195 @@
   return false;
 }
+namespace {
+
+// Helper class for collecting comments and putting them in the right places.
+//
+// This basically just buffers the most recent comment until it can be decided
+// exactly where that comment should be placed. When Flush() is called, the
+// current comment goes into either prev_trailing_comments or detached_comments.
+// When the CommentCollector is destroyed, the last buffered comment goes into
+// next_leading_comments.
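+//
+// For example, given input like:
+//
+//   optional int32 foo = 1;  // ends up in prev_trailing_comments
+//
+//   // becomes a detached comment
+//
+//   // ends up in next_leading_comments
+//   optional int32 bar = 2;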
+class CommentCollector {
+ public:
+  CommentCollector(string* prev_trailing_comments,
+                   vector<string>* detached_comments,
+                   string* next_leading_comments)
+      : prev_trailing_comments_(prev_trailing_comments),
+        detached_comments_(detached_comments),
+        next_leading_comments_(next_leading_comments),
+        has_comment_(false),
+        is_line_comment_(false),
+        can_attach_to_prev_(true) {
+    if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
+    if (detached_comments != NULL) detached_comments->clear();
+    if (next_leading_comments != NULL) next_leading_comments->clear();
+  }
+
+  ~CommentCollector() {
+    // Whatever is in the buffer is a leading comment.
+    if (next_leading_comments_ != NULL && has_comment_) {
+      comment_buffer_.swap(*next_leading_comments_);
+    }
+  }
+
+  // About to read a line comment. Get the comment buffer pointer in order to
+  // read into it.
+  string* GetBufferForLineComment() {
+    // We want to combine with previous line comments, but not block comments.
+    if (has_comment_ && !is_line_comment_) {
+      Flush();
+    }
+    has_comment_ = true;
+    is_line_comment_ = true;
+    return &comment_buffer_;
+  }
+
+  // About to read a block comment. Get the comment buffer pointer in order to
+  // read into it.
+  string* GetBufferForBlockComment() {
+    if (has_comment_) {
+      Flush();
+    }
+    has_comment_ = true;
+    is_line_comment_ = false;
+    return &comment_buffer_;
+  }
+
+  void ClearBuffer() {
+    comment_buffer_.clear();
+    has_comment_ = false;
+  }
+
+  // Called once we know that the comment buffer is complete and is *not*
+  // connected to the next token.
+  void Flush() {
+    if (has_comment_) {
+      if (can_attach_to_prev_) {
+        if (prev_trailing_comments_ != NULL) {
+          prev_trailing_comments_->append(comment_buffer_);
+        }
+        can_attach_to_prev_ = false;
+      } else {
+        if (detached_comments_ != NULL) {
+          detached_comments_->push_back(comment_buffer_);
+        }
+      }
+      ClearBuffer();
+    }
+  }
+
+  void DetachFromPrev() {
+    can_attach_to_prev_ = false;
+  }
+
+ private:
+  string* prev_trailing_comments_;
+  vector<string>* detached_comments_;
+  string* next_leading_comments_;
+
+  string comment_buffer_;
+
+  // True if any comments were read into comment_buffer_. This can be true even
+  // if comment_buffer_ is empty, namely if the comment was "/**/".
+  bool has_comment_;
+
+  // Is the comment in the comment buffer a line comment?
+  bool is_line_comment_;
+
+  // Is it still possible that we could be reading a comment attached to the
+  // previous token?
+  bool can_attach_to_prev_;
+};
+
+}  // namespace
+
+bool Tokenizer::NextWithComments(string* prev_trailing_comments,
+                                 vector<string>* detached_comments,
+                                 string* next_leading_comments) {
+  CommentCollector collector(prev_trailing_comments, detached_comments,
+                             next_leading_comments);
+
+  if (current_.type == TYPE_START) {
+    collector.DetachFromPrev();
+  } else {
+    // A comment appearing on the same line must be attached to the previous
+    // declaration.
+    ConsumeZeroOrMore<WhitespaceNoNewline>();
+    switch (TryConsumeCommentStart()) {
+      case LINE_COMMENT:
+        ConsumeLineComment(collector.GetBufferForLineComment());
+
+        // Don't allow comments on subsequent lines to be attached to a trailing
+        // comment.
+        collector.Flush();
+        break;
+      case BLOCK_COMMENT:
+        ConsumeBlockComment(collector.GetBufferForBlockComment());
+
+        ConsumeZeroOrMore<WhitespaceNoNewline>();
+        if (!TryConsume('\n')) {
+          // Oops, the next token is on the same line. If we recorded a comment
+          // we really have no idea which token it should be attached to.
+          collector.ClearBuffer();
+          return Next();
+        }
+
+        // Don't allow comments on subsequent lines to be attached to a trailing
+        // comment.
+        collector.Flush();
+        break;
+      case SLASH_NOT_COMMENT:
+        return true;
+      case NO_COMMENT:
+        if (!TryConsume('\n')) {
+          // The next token is on the same line. There are no comments.
+          return Next();
+        }
+        break;
+    }
+  }
+
+  // OK, we are now on the line *after* the previous token.
+  while (true) {
+    ConsumeZeroOrMore<WhitespaceNoNewline>();
+
+    switch (TryConsumeCommentStart()) {
+      case LINE_COMMENT:
+        ConsumeLineComment(collector.GetBufferForLineComment());
+        break;
+      case BLOCK_COMMENT:
+        ConsumeBlockComment(collector.GetBufferForBlockComment());
+
+        // Consume the rest of the line so that we don't interpret it as a
+        // blank line the next time around the loop.
+        ConsumeZeroOrMore<WhitespaceNoNewline>();
+        TryConsume('\n');
+        break;
+      case SLASH_NOT_COMMENT:
+        return true;
+      case NO_COMMENT:
+        if (TryConsume('\n')) {
+          // Completely blank line.
+          collector.Flush();
+          collector.DetachFromPrev();
+        } else {
+          bool result = Next();
+          if (!result ||
+              current_.text == "}" ||
+              current_.text == "]" ||
+              current_.text == ")") {
+            // It looks like we're at the end of a scope. In this case it
+            // makes no sense to attach a comment to the following token.
+            collector.Flush();
+          }
+          return result;
+        }
+        break;
+    }
+  }
+}
+
 // -------------------------------------------------------------------
 // Token-parsing helpers. Remember that these don't need to report
 // errors since any errors should already have been reported while
@@ -626,17 +894,138 @@
   return result;
 }
+// Helper to append a Unicode code point to a string as UTF8, without bringing
+// in any external dependencies.
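+//
+// For example, code point 0x20ac (the euro sign) is appended as the three
+// bytes 0xe2 0x82 0xac.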
+static void AppendUTF8(uint32 code_point, string* output) {
+  uint32 tmp = 0;
+  int len = 0;
+  if (code_point <= 0x7f) {
+    tmp = code_point;
+    len = 1;
+  } else if (code_point <= 0x07ff) {
+    tmp = 0x0000c080 |
+        ((code_point & 0x07c0) << 2) |
+        (code_point & 0x003f);
+    len = 2;
+  } else if (code_point <= 0xffff) {
+    tmp = 0x00e08080 |
+        ((code_point & 0xf000) << 4) |
+        ((code_point & 0x0fc0) << 2) |
+        (code_point & 0x003f);
+    len = 3;
+  } else if (code_point <= 0x1fffff) {
+    tmp = 0xf0808080 |
+        ((code_point & 0x1c0000) << 6) |
+        ((code_point & 0x03f000) << 4) |
+        ((code_point & 0x000fc0) << 2) |
+        (code_point & 0x003f);
+    len = 4;
+  } else {
+    // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
+    // normally only defined up to there as well.
+    StringAppendF(output, "\\U%08x", code_point);
+    return;
+  }
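+  // tmp now holds the encoded bytes in its low-order byte positions; ghtonl()
+  // lays them out in big-endian order in memory, so the encoding occupies the
+  // last len bytes of tmp.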
+  tmp = ghtonl(tmp);
+  output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
+}
+
+// Try to read <len> hex digits from ptr, and stuff the numeric result into
+// *result. Returns true if that many digits were successfully consumed.
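+// Only a premature end of the string is detected here; the characters are
+// assumed to already be valid hex digits.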
+static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
+  *result = 0;
+  if (len == 0) return false;
+  for (const char* end = ptr + len; ptr < end; ++ptr) {
+    if (*ptr == '\0') return false;
+    *result = (*result << 4) + DigitValue(*ptr);
+  }
+  return true;
+}
+
+// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
+// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
+// surrogate. These numbers are in a reserved range of Unicode code points, so
+// if we encounter such a pair we know how to parse it and convert it into a
+// single code point.
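+//
+// For example, the escaped pair \ud83d\ude01 assembles into the single code
+// point U+1F601.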
+static const uint32 kMinHeadSurrogate = 0xd800;
+static const uint32 kMaxHeadSurrogate = 0xdc00;
+static const uint32 kMinTrailSurrogate = 0xdc00;
+static const uint32 kMaxTrailSurrogate = 0xe000;
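+// (The kMax* values above are exclusive upper bounds.)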
+
+static inline bool IsHeadSurrogate(uint32 code_point) {
+  return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
+}
+
+static inline bool IsTrailSurrogate(uint32 code_point) {
+  return (code_point >= kMinTrailSurrogate) &&
+      (code_point < kMaxTrailSurrogate);
+}
+
+// Combine a head and trail surrogate into a single Unicode code point.
+static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
+  GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
+  GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
+  return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
+      (trail_surrogate - kMinTrailSurrogate));
+}
+
+// Convert the escape sequence parameter to a number of expected hex digits.
+static inline int UnicodeLength(char key) {
+  if (key == 'u') return 4;
+  if (key == 'U') return 8;
+  return 0;
+}
+
+// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
+// to parse that sequence. On success, returns a pointer to the first char
+// beyond that sequence, and fills in *code_point. On failure, returns ptr
+// itself.
+static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
+  const char* p = ptr;
+  // Fetch the code point.
+  const int len = UnicodeLength(*p++);
+  if (!ReadHexDigits(p, len, code_point))
+    return ptr;
+  p += len;
+
+  // Check if the code point we read is a "head surrogate." If so, then we
+  // expect it to be immediately followed by another code point which is a valid
+  // "trail surrogate," and together they form a UTF-16 pair which decodes into
+  // a single Unicode point. Trail surrogates may only use \u, not \U.
+  if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
+    uint32 trail_surrogate;
+    if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
+        IsTrailSurrogate(trail_surrogate)) {
+      *code_point = AssembleUTF16(*code_point, trail_surrogate);
+      p += 6;
+    }
+    // If this failed, then we just emit the head surrogate as a code point.
+    // It's bogus, but so is the string.
+  }
+
+  return p;
+}
+
+// The text string must begin and end with single or double quote
+// characters.
 void Tokenizer::ParseStringAppend(const string& text, string* output) {
-  // Reminder: text[0] is always the quote character. (If text is
-  // empty, it's invalid, so we'll just return.)
-  if (text.empty()) {
+  // Reminder: text[0] is always a quote character. (If text is
+  // empty, it's invalid, so we'll just return).
+  const size_t text_size = text.size();
+  if (text_size == 0) {
     GOOGLE_LOG(DFATAL)
       << " Tokenizer::ParseStringAppend() passed text that could not"
         " have been tokenized as a string: " << CEscape(text);
     return;
   }
-  output->reserve(output->size() + text.size());
+  // Reserve room for new string. The branch is necessary because if
+  // there is already space available the reserve() call might
+  // downsize the output.
+  const size_t new_len = text_size + output->size();
+  if (new_len > output->capacity()) {
+    output->reserve(new_len);
+  }
   // Loop through the string copying characters to "output" and
   // interpreting escape sequences. Note that any invalid escape
@@ -674,19 +1063,27 @@
         }
         output->push_back(static_cast<char>(code));
+      } else if (*ptr == 'u' || *ptr == 'U') {
+        uint32 unicode;
+        const char* end = FetchUnicodePoint(ptr, &unicode);
+        if (end == ptr) {
+          // Failure: Just dump out what we saw, don't try to parse it.
+          output->push_back(*ptr);
+        } else {
+          AppendUTF8(unicode, output);
+          ptr = end - 1;  // Because we're about to ++ptr.
+        }
       } else {
         // Some other escape code.
        output->push_back(TranslateEscape(*ptr));
       }
-    } else if (*ptr == text[0]) {
-      // Ignore quote matching the starting quote.
+    } else if (*ptr == text[0] && ptr[1] == '\0') {
+      // Ignore final quote matching the starting quote.
     } else {
       output->push_back(*ptr);
     }
   }
-
-  return;
 }
 }  // namespace io