Index: url/url_canon_icu_unittest.cc |
diff --git a/url/url_canon_icu_unittest.cc b/url/url_canon_icu_unittest.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..83f6083c0b7cccc875eff63599acfe55dae39314 |
--- /dev/null |
+++ b/url/url_canon_icu_unittest.cc |
@@ -0,0 +1,167 @@ |
+// Copyright 2013 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "base/macros.h" |
+#include "testing/gtest/include/gtest/gtest.h" |
+#include "third_party/icu/source/common/unicode/ucnv.h" |
+#include "url/url_canon.h" |
+#include "url/url_canon_icu.h" |
+#include "url/url_canon_stdstring.h" |
+#include "url/url_test_utils.h" |
+ |
+// ARRAYSIZE counts the entries of the test-case tables below. If nothing has |
+// defined it yet, define it as ARRAYSIZE_UNSAFE (NOTE(review): this file |
+// includes base/macros.h, not basictypes.h -- confirm the macro's source). |
+#ifndef ARRAYSIZE |
+#define ARRAYSIZE ARRAYSIZE_UNSAFE |
+#endif |
+ |
+namespace url { |
+ |
+using test_utils::WStringToUTF16; |
+ |
+namespace { |
+ |
+// Wrapper around a UConverter object that manages creation and destruction: |
+// the converter is opened in the constructor and closed in the destructor. |
+class UConvScoper { |
+ public: |
+  // Opens a converter for |charset_name|. The ICU error code is not inspected |
+  // here; callers are expected to check converter() for NULL instead. |
+  explicit UConvScoper(const char* charset_name) { |
+    UErrorCode err = U_ZERO_ERROR; |
+    converter_ = ucnv_open(charset_name, &err); |
+  } |
+ |
+  // Closes the converter if one was successfully opened. |
+  ~UConvScoper() { |
+    if (converter_) |
+      ucnv_close(converter_); |
+  } |
+ |
+  // Returns the converter object, may be NULL. |
+  UConverter* converter() const { return converter_; } |
+ |
+ private: |
+  // Owned by this object; released in the destructor. |
+  UConverter* converter_; |
+ |
+  // Copying would leave two objects closing the same converter. |
+  DISALLOW_COPY_AND_ASSIGN(UConvScoper); |
+}; |
+ |
+// Exercises ICUCharsetConverter::ConvertFromUTF16 against a table of target |
+// encodings, then against inputs whose lengths surround the fixed size of a |
+// RawCanonOutput buffer. |
+TEST(URLCanonIcuTest, ICUCharsetConverter) { |
+  struct ICUCase { |
+    const wchar_t* input; |
+    const char* encoding; |
+    const char* expected; |
+  } icu_cases[] = { |
+      // Plain ASCII and BMP characters to UTF-8. |
+      {L"Hello, world", "utf-8", "Hello, world"}, |
+      {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"}, |
+      // A surrogate pair (non-BMP character) to UTF-8. |
+      {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"}, |
+      // Big5. |
+      {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"}, |
+      // U+06DE (decimal 1758) is expected to come out as the percent-escaped |
+      // numeric entity "&#1758;" because the destination set cannot hold it. |
+      {L"hello\x4f60\x06de\x597dworld", "big5", |
+       "hello\xa7\x41%26%231758%3B\xa6\x6eworld"}, |
+  }; |
+ |
+  for (size_t i = 0; i < ARRAYSIZE(icu_cases); ++i) { |
+    const ICUCase& test_case = icu_cases[i]; |
+ |
+    // Every encoding in the table must be known to ICU. |
+    UConvScoper scoped_conv(test_case.encoding); |
+    ASSERT_TRUE(scoped_conv.converter() != NULL); |
+    ICUCharsetConverter charset_converter(scoped_conv.converter()); |
+ |
+    const base::string16 utf16_input(WStringToUTF16(test_case.input)); |
+ |
+    std::string converted; |
+    StdStringCanonOutput canon_output(&converted); |
+    charset_converter.ConvertFromUTF16(utf16_input.c_str(), |
+                                       static_cast<int>(utf16_input.length()), |
+                                       &canon_output); |
+    canon_output.Complete(); |
+ |
+    EXPECT_STREQ(test_case.expected, converted.c_str()); |
+  } |
+ |
+  // Use input lengths just below, at, and just above the output buffer's |
+  // fixed size to make sure the converter resizes the output as needed. |
+  const int kStaticSize = 16; |
+  UConvScoper utf8_conv("utf-8"); |
+  ASSERT_TRUE(utf8_conv.converter()); |
+  ICUCharsetConverter utf8_converter(utf8_conv.converter()); |
+  for (int len = kStaticSize - 2; len <= kStaticSize + 2; ++len) { |
+    // An input consisting of |len| copies of 'a'. |
+    const base::string16 input(static_cast<size_t>(len), 'a'); |
+ |
+    RawCanonOutput<kStaticSize> output; |
+    utf8_converter.ConvertFromUTF16(input.c_str(), |
+                                    static_cast<int>(input.length()), &output); |
+    EXPECT_EQ(input.length(), static_cast<size_t>(output.length())); |
+  } |
+} |
+ |
+// Checks that CanonicalizeQuery, when handed a charset converter, produces |
+// the expected escaped query for both the 8-bit and the 16-bit form of each |
+// input. |
+TEST(URLCanonIcuTest, QueryWithConverter) { |
+  struct QueryCase { |
+    const char* input8; |
+    const wchar_t* input16; |
+    const char* encoding; |
+    const char* expected; |
+  } query_cases[] = { |
+      // Plain ASCII is expected to be unchanged under each of these encodings. |
+      {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"}, |
+      {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"}, |
+      {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"}, |
+      // Chinese input converted to Chinese encodings. |
+      {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312", |
+       "?q=%C4%E3%BA%C3"}, |
+      {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"}, |
+      // A character that cannot be encoded in the destination character set |
+      // should be escaped. U+FF27 is 65319 in decimal, so the expected output |
+      // unescapes to the numeric entity: "?q=Chinese&#65319;" |
+      {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1", |
+       "?q=Chinese%26%2365319%3B"}, |
+  }; |
+ |
+  for (size_t i = 0; i < ARRAYSIZE(query_cases); ++i) { |
+    const QueryCase& test_case = query_cases[i]; |
+ |
+    // A named encoding must yield a usable converter. |
+    UConvScoper scoped_conv(test_case.encoding); |
+    ASSERT_TRUE(!test_case.encoding || scoped_conv.converter()); |
+    ICUCharsetConverter charset_converter(scoped_conv.converter()); |
+ |
+    Component out_comp; |
+ |
+    // 8-bit input. |
+    if (test_case.input8) { |
+      const Component in_comp(0, static_cast<int>(strlen(test_case.input8))); |
+ |
+      std::string canonical; |
+      StdStringCanonOutput output(&canonical); |
+      CanonicalizeQuery(test_case.input8, in_comp, &charset_converter, &output, |
+                        &out_comp); |
+      output.Complete(); |
+ |
+      EXPECT_EQ(test_case.expected, canonical); |
+    } |
+ |
+    // 16-bit input. |
+    if (test_case.input16) { |
+      const base::string16 wide_input(WStringToUTF16(test_case.input16)); |
+      const Component in_comp(0, static_cast<int>(wide_input.length())); |
+ |
+      std::string canonical; |
+      StdStringCanonOutput output(&canonical); |
+      CanonicalizeQuery(wide_input.c_str(), in_comp, &charset_converter, |
+                        &output, &out_comp); |
+      output.Complete(); |
+ |
+      EXPECT_EQ(test_case.expected, canonical); |
+    } |
+  } |
+ |
+  // Extra test for input with an embedded NUL, canonicalized with a NULL |
+  // converter. The component length (5) covers the NUL and the bytes after it. |
+  std::string canonical; |
+  StdStringCanonOutput output(&canonical); |
+  Component out_comp; |
+  CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp); |
+  output.Complete(); |
+  EXPECT_EQ("?a%20%00z%01", canonical); |
+} |
+ |
+} // namespace |
+ |
+} // namespace url |