Index: third_party/protobuf/src/google/protobuf/io/tokenizer_unittest.cc
===================================================================
--- third_party/protobuf/src/google/protobuf/io/tokenizer_unittest.cc (revision 216642)
+++ third_party/protobuf/src/google/protobuf/io/tokenizer_unittest.cc (working copy)
@@ -32,10 +32,11 @@
 // Based on original Protocol Buffers design by
 // Sanjay Ghemawat, Jeff Dean, and others.
-#include <vector>
+#include <limits.h>
 #include <math.h>
-#include <limits.h>
+#include <vector>
+
 #include <google/protobuf/io/tokenizer.h>
 #include <google/protobuf/io/zero_copy_stream_impl.h>
@@ -514,6 +515,222 @@
 
 // -------------------------------------------------------------------
 
+// In each case, the input is expected to have two tokens named "prev" and
+// "next" with comments in between.
+struct DocCommentCase {
+  string input;
+
+  const char* prev_trailing_comments;
+  const char* detached_comments[10];
+  const char* next_leading_comments;
+};
+
+inline ostream& operator<<(ostream& out,
+                           const DocCommentCase& test_case) {
+  return out << CEscape(test_case.input);
+}
+
+DocCommentCase kDocCommentCases[] = {
+  {
+    "prev next",
+
+    "",
+    {},
+    ""
+  },
+
+  {
+    "prev /* ignored */ next",
+
+    "",
+    {},
+    ""
+  },
+
+  {
+    "prev // trailing comment\n"
+    "next",
+
+    " trailing comment\n",
+    {},
+    ""
+  },
+
+  {
+    "prev\n"
+    "// leading comment\n"
+    "// line 2\n"
+    "next",
+
+    "",
+    {},
+    " leading comment\n"
+    " line 2\n"
+  },
+
+  {
+    "prev\n"
+    "// trailing comment\n"
+    "// line 2\n"
+    "\n"
+    "next",
+
+    " trailing comment\n"
+    " line 2\n",
+    {},
+    ""
+  },
+
+  {
+    "prev // trailing comment\n"
+    "// leading comment\n"
+    "// line 2\n"
+    "next",
+
+    " trailing comment\n",
+    {},
+    " leading comment\n"
+    " line 2\n"
+  },
+
+  {
+    "prev /* trailing block comment */\n"
+    "/* leading block comment\n"
+    " * line 2\n"
+    " * line 3 */"
+    "next",
+
+    " trailing block comment ",
+    {},
+    " leading block comment\n"
+    " line 2\n"
+    " line 3 "
+  },
+
+  {
+    "prev\n"
+    "/* trailing block comment\n"
+    " * line 2\n"
+    " * line 3\n"
+    " */\n"
+    "/* leading block comment\n"
+    " * line 2\n"
+    " * line 3 */"
+    "next",
+
+    " trailing block comment\n"
+    " line 2\n"
+    " line 3\n",
+    {},
+    " leading block comment\n"
+    " line 2\n"
+    " line 3 "
+  },
+
+  {
+    "prev\n"
+    "// trailing comment\n"
+    "\n"
+    "// detached comment\n"
+    "// line 2\n"
+    "\n"
+    "// second detached comment\n"
+    "/* third detached comment\n"
+    " * line 2 */\n"
+    "// leading comment\n"
+    "next",
+
+    " trailing comment\n",
+    {
+      " detached comment\n"
+      " line 2\n",
+      " second detached comment\n",
+      " third detached comment\n"
+      " line 2 "
+    },
+    " leading comment\n"
+  },
+
+  {
+    "prev /**/\n"
+    "\n"
+    "// detached comment\n"
+    "\n"
+    "// leading comment\n"
+    "next",
+
+    "",
+    {
+      " detached comment\n"
+    },
+    " leading comment\n"
+  },
+
+  {
+    "prev /**/\n"
+    "// leading comment\n"
+    "next",
+
+    "",
+    {},
+    " leading comment\n"
+  },
+};
+
+TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
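+  // (TEST_2D runs this body once for every pairing of a kDocCommentCases
+  // entry with a kBlockSizes entry, so each case runs at each block size.)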
+  // Set up the tokenizer.
+  TestInputStream input(kDocCommentCases_case.input.data(),
+                        kDocCommentCases_case.input.size(),
+                        kBlockSizes_case);
+  TestErrorCollector error_collector;
+  Tokenizer tokenizer(&input, &error_collector);
+
+  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
+  TestInputStream input2(kDocCommentCases_case.input.data(),
+                         kDocCommentCases_case.input.size(),
+                         kBlockSizes_case);
+  Tokenizer tokenizer2(&input2, &error_collector);
+
+  tokenizer.Next();
+  tokenizer2.Next();
+
+  EXPECT_EQ("prev", tokenizer.current().text);
+  EXPECT_EQ("prev", tokenizer2.current().text);
+
+  string prev_trailing_comments;
+  vector<string> detached_comments;
+  string next_leading_comments;
+  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
+                             &next_leading_comments);
+  tokenizer2.NextWithComments(NULL, NULL, NULL);
+  EXPECT_EQ("next", tokenizer.current().text);
+  EXPECT_EQ("next", tokenizer2.current().text);
+
+  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
+            prev_trailing_comments);
+
+  for (int i = 0; i < detached_comments.size(); i++) {
+    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases_case.detached_comments));
+    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
+    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
+              detached_comments[i]);
+  }
+
+  // Verify that we matched all the detached comments.
+  EXPECT_EQ(NULL,
+      kDocCommentCases_case.detached_comments[detached_comments.size()]);
+
+  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
+            next_leading_comments);
+}
+
+// -------------------------------------------------------------------
+
 // Test parse helpers. It's not really worth setting up a full data-driven
 // test here.
 TEST_F(TokenizerTest, ParseInteger) {
@@ -614,6 +831,30 @@
   Tokenizer::ParseString("'\\", &output);
   EXPECT_EQ("\\", output);
 
+  // Experiment with Unicode escapes. Here are one-, two-, three- and
+  // four-byte Unicode characters.
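+  // ($ is U+0024, one byte in UTF-8; ¢ is U+00A2, two bytes; € is U+20AC,
+  // three bytes; 𤭢 is U+24B62, four bytes.)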
+  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
+  EXPECT_EQ("$¢€𤭢XX", output);
+  // Same thing encoded using UTF-16.
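+  // (0x24b62 - 0x10000 = 0x14b62; the high ten bits 0x052 give 0xd800 +
+  // 0x52 = 0xd852, the low ten bits 0x362 give 0xdc00 + 0x362 = 0xdf62.)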
+  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
+  EXPECT_EQ("$¢€𤭢XX", output);
+  // Here's some broken UTF-16: a head surrogate with no tail surrogate.
+  // We just output this as if it were UTF-8; it's not a defined code point,
+  // but it has a defined encoding.
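+  // (U+D852 is 1101100001010010; packing that into the three-byte pattern
+  // 1110xxxx 10xxxxxx 10xxxxxx yields 0xed 0xa1 0x92.)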
+  Tokenizer::ParseString("'\\ud852XX'", &output);
+  EXPECT_EQ("\xed\xa1\x92XX", output);
+  // Malformed escape: Demons may fly out of the nose.
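+  // (ParseString() consumes the first character as the opening quote, so the
+  // backslash is eaten as the delimiter and "u0" passes through literally.)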
+  Tokenizer::ParseString("\\u0", &output);
+  EXPECT_EQ("u0", output);
+
   // Test invalid strings that will never be tokenized as strings.
 #ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
   EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
@@ -658,6 +899,10 @@
"0:4: String literals cannot cross line boundaries.\n" }, |
{ "'bar\nfoo", true, |
"0:4: String literals cannot cross line boundaries.\n" }, |
+ { "'\\u01' foo", true, |
+ "0:5: Expected four hex digits for \\u escape sequence.\n" }, |
+ { "'\\u01' foo", true, |
+ "0:5: Expected four hex digits for \\u escape sequence.\n" }, |
+ { "'\\uXYZ' foo", true, |
+ "0:3: Expected four hex digits for \\u escape sequence.\n" }, |
// Integer errors. |
{ "123foo", true, |
@@ -734,7 +979,7 @@
   }
 
   // Check that the errors match what was expected.
-  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);
+  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);
 
   // If the error was recoverable, make sure we saw "foo" after it.
   if (kErrorCases_case.recoverable) {
@@ -760,6 +1005,7 @@
   EXPECT_EQ(strlen("foo"), input.ByteCount());
 }
+
 }  // namespace
 }  // namespace io
 }  // namespace protobuf