Chromium Code Reviews

Unified Diff: base/sys_string_conversions_linux.cc

Issue 149526: Use v8's utf8 <-> wide conversion for Linux sys_string_conversion. (Closed)
Patch Set: Created 11 years, 5 months ago
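
The change below swaps the ICU-backed WideToUTF8/UTF8ToWide calls for UTF-8 <-> UCS-4 helpers adapted from v8, so each wchar_t (UCS-4 on Linux) maps to one to four UTF-8 bytes without going through ICU. A minimal usage sketch of the two public entry points the patch touches, assuming the declarations in base/sys_string_conversions.h; illustrative only, not part of the patch:

// Illustrative only; not part of the patch. Assumes the declarations in
// base/sys_string_conversions.h.
#include <string>

#include "base/sys_string_conversions.h"

std::string RoundTripExample() {
  // U+00E9 (e with acute) becomes the two UTF-8 bytes 0xC3 0xA9.
  const std::wstring wide = L"caf\u00e9";
  const std::string utf8 = base::SysWideToUTF8(wide);   // utf8 == "caf\xC3\xA9"
  const std::wstring back = base::SysUTF8ToWide(utf8);  // back == wide
  // With this patch, SysUTF8ToWide returns an empty string when the input
  // is not valid UTF-8.
  return base::SysWideToUTF8(back);
}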
Index: base/sys_string_conversions_linux.cc
diff --git a/base/sys_string_conversions_linux.cc b/base/sys_string_conversions_linux.cc
index 118f0ac48494e2048079457f1f6b094ab2f21b33..bd530d9330ffe19d111d1a70f8f84f193b01d3c0 100644
--- a/base/sys_string_conversions_linux.cc
+++ b/base/sys_string_conversions_linux.cc
@@ -6,21 +6,166 @@
#include <wchar.h>
+#include "base/basictypes.h"
#include "base/string_piece.h"
#include "base/string_util.h"
namespace base {
+// UTF-8 <-> UCS-4 conversion routines, adapted from v8.
+namespace {
+
+const uint32_t kMaxEncodedSize = 4;
+const uint32_t kMaxOneByteChar = 0x7f;
+const uint32_t kMaxTwoByteChar = 0x7ff;
+const uint32_t kMaxThreeByteChar = 0xffff;
+const uint32_t kMaxFourByteChar = 0x1fffff;
+const uint32_t kBadChar = 0xFFFD;
+
+size_t WriteUnicodeCharAsUTF8(char* str, uint32_t c) {
darin (slow to review) 2009/07/14 16:51:41: this stuff seems like it would be better broken out ...
+  static const int kMask = ~(1 << 6);
+  if (c <= kMaxOneByteChar) {
+    str[0] = c;
+    return 1;
+  } else if (c <= kMaxTwoByteChar) {
+    str[0] = 0xC0 | (c >> 6);
+    str[1] = 0x80 | (c & kMask);
+    return 2;
+  } else if (c <= kMaxThreeByteChar) {
+    str[0] = 0xE0 | (c >> 12);
+    str[1] = 0x80 | ((c >> 6) & kMask);
+    str[2] = 0x80 | (c & kMask);
+    return 3;
+  } else {
+    str[0] = 0xF0 | (c >> 18);
+    str[1] = 0x80 | ((c >> 12) & kMask);
+    str[2] = 0x80 | ((c >> 6) & kMask);
+    str[3] = 0x80 | (c & kMask);
+    return 4;
+  }
+}
+
+size_t UnicodeCharUTF8Length(uint32_t c) {
+  if (c <= kMaxOneByteChar) {
+    return 1;
+  } else if (c <= kMaxTwoByteChar) {
+    return 2;
+  } else if (c <= kMaxThreeByteChar) {
+    return 3;
+  } else {
+    return 4;
+  }
+}
+
+uint32_t UnicodeCharFromUTF8(const uint8_t* str,
+                             size_t length,
+                             size_t* cursor) {
+  if (length <= 0) return kBadChar;
+
+  uint8_t first = str[0];
+  // Characters between U+0000 and U+007F are encoded as a single byte.
+  if (first <= kMaxOneByteChar) {
+    *cursor += 1;
+    return first;
+  }
+
+  // We only get here for non-ASCII characters.
+  if (length == 1) {
+    *cursor += 1;
+    return kBadChar;
+  }
+
+  uint8_t second = str[1] ^ 0x80;
+  if (second & 0xC0) {
+    *cursor += 1;
+    return kBadChar;
+  }
+  if (first < 0xE0) {
+    if (first < 0xC0) {
+      *cursor += 1;
+      return kBadChar;
+    }
+    uint32_t l = ((first << 6) | second) & kMaxTwoByteChar;
+    if (l <= kMaxOneByteChar) {
+      *cursor += 1;
+      return kBadChar;
+    }
+    *cursor += 2;
+    return l;
+  }
+  if (length == 2) {
+    *cursor += 1;
+    return kBadChar;
+  }
+  uint8_t third = str[2] ^ 0x80;
+  if (third & 0xC0) {
+    *cursor += 1;
+    return kBadChar;
+  }
+  if (first < 0xF0) {
+    uint32_t l = ((((first << 6) | second) << 6) | third) & kMaxThreeByteChar;
+    if (l <= kMaxTwoByteChar) {
+      *cursor += 1;
+      return kBadChar;
+    }
+    *cursor += 3;
+    return l;
+  }
+  if (length == 3) {
+    *cursor += 1;
+    return kBadChar;
+  }
+  uint8_t fourth = str[3] ^ 0x80;
+  if (fourth & 0xC0) {
+    *cursor += 1;
+    return kBadChar;
+  }
+  if (first < 0xF8) {
+    uint32_t l = (((((first << 6 | second) << 6) | third) << 6) | fourth) &
+                 kMaxFourByteChar;
+    if (l <= kMaxThreeByteChar) {
+      *cursor += 1;
+      return kBadChar;
+    }
+    *cursor += 4;
+    return l;
+  }
+  *cursor += 1;
+  return kBadChar;
+}
+
+} // namespace
+
std::string SysWideToUTF8(const std::wstring& wide) {
-  // In theory this should be using the system-provided conversion rather
-  // than our ICU, but this will do for now.
-  return WideToUTF8(wide);
+  size_t length = 0;
+  for (size_t i = 0; i < wide.size(); ++i)
+    length += UnicodeCharUTF8Length(wide[i]);
+
+  std::string out(length, 0);
+  size_t out_pos = 0;
+  for (size_t i = 0; i < wide.size(); ++i)
+    out_pos += WriteUnicodeCharAsUTF8(&out[out_pos], wide[i]);
+
+  return out;
}
+
std::wstring SysUTF8ToWide(const StringPiece& utf8) {
-  // In theory this should be using the system-provided conversion rather
-  // than our ICU, but this will do for now.
-  std::wstring out;
-  UTF8ToWide(utf8.data(), utf8.size(), &out);
+  size_t wide_length = 0;
+  for (size_t pos = 0; pos < utf8.size(); ++wide_length) {
+    if (UnicodeCharFromUTF8(
+            reinterpret_cast<const uint8_t*>(utf8.data() + pos),
+            utf8.size() - pos, &pos) == kBadChar) {
+      return std::wstring();  // Failure, invalid conversion.
+    }
+  }
+
+  std::wstring out(wide_length, 0);
+  for (size_t pos = 0, wide_pos = 0; pos < utf8.size(); ++wide_pos) {
+    out[wide_pos] = UnicodeCharFromUTF8(
+        reinterpret_cast<const uint8_t*>(utf8.data() + pos),
+        utf8.size() - pos, &pos);
+  }
+
return out;
}
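
This issue also updates base/sys_string_conversions_unittest.cc (the next file in the patch set). A hypothetical gtest-style sketch of the checks that matter for the new code paths; the test name, include paths, and assertions are illustrative and not copied from the real unittest:

// Hypothetical sketch, not the actual base/sys_string_conversions_unittest.cc.
#include <string>

#include "base/sys_string_conversions.h"
#include "testing/gtest/include/gtest/gtest.h"

TEST(SysStringsLinuxExample, Utf8WideRoundTrip) {
  // U+4E2D is the three UTF-8 bytes 0xE4 0xB8 0xAD.
  const std::string utf8("\xE4\xB8\xAD");
  const std::wstring wide = base::SysUTF8ToWide(utf8);
  ASSERT_EQ(1u, wide.size());
  EXPECT_EQ(static_cast<wchar_t>(0x4E2D), wide[0]);
  EXPECT_EQ(utf8, base::SysWideToUTF8(wide));

  // A lone continuation byte is invalid UTF-8; the patched SysUTF8ToWide
  // signals failure by returning an empty string.
  EXPECT_TRUE(base::SysUTF8ToWide("\x80").empty());
}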