Index: base/sys_string_conversions_linux.cc |
diff --git a/base/sys_string_conversions_linux.cc b/base/sys_string_conversions_linux.cc |
index 118f0ac48494e2048079457f1f6b094ab2f21b33..bd530d9330ffe19d111d1a70f8f84f193b01d3c0 100644 |
--- a/base/sys_string_conversions_linux.cc |
+++ b/base/sys_string_conversions_linux.cc |
@@ -6,21 +6,166 @@ |
#include <wchar.h> |
+#include "base/basictypes.h" |
#include "base/string_piece.h" |
#include "base/string_util.h" |
namespace base { |
+// UTF8 <-> UCS-4 conversion from v8. |
namespace {

// Upper bound on the number of UTF-8 bytes a single code point encodes to.
const uint32_t kMaxEncodedSize = 4;

// Largest value representable in a 1-, 2-, 3- and 4-byte UTF-8 sequence.
const uint32_t kMaxOneByteChar = 0x7f;
const uint32_t kMaxTwoByteChar = 0x7ff;
const uint32_t kMaxThreeByteChar = 0xffff;
const uint32_t kMaxFourByteChar = 0x1fffff;

// U+FFFD REPLACEMENT CHARACTER. Returned by the decoder for malformed input
// and substituted by the encoder for values that are not Unicode scalar
// values.
const uint32_t kBadChar = 0xFFFD;

// Bounds used to recognise values that must never appear in UTF-8.
const uint32_t kMaxCodePoint = 0x10FFFF;
const uint32_t kMinSurrogate = 0xD800;
const uint32_t kMaxSurrogate = 0xDFFF;

// Low six bits: the payload carried by every continuation byte (10xxxxxx).
const uint32_t kContinuationPayloadMask = 0x3f;

// Returns true if |c| is a Unicode scalar value, i.e. it is at most U+10FFFF
// and is not a UTF-16 surrogate.
bool IsValidCodePoint(uint32_t c) {
  return c <= kMaxCodePoint && (c < kMinSurrogate || c > kMaxSurrogate);
}

// Review note (darin, 2009/07/14): "this stuff seems like it would be better
// broken out" -- the remainder of the remark is truncated in the source.

// Encodes |c| as UTF-8 into |str| and returns the number of bytes written
// (1 to 4). |str| must have room for that many bytes; no terminator is
// written. Values that are not Unicode scalar values (surrogates, or anything
// above U+10FFFF) are written as U+FFFD so the output is always well-formed.
// The byte count always equals UnicodeCharUTF8Length(c).
size_t WriteUnicodeCharAsUTF8(char* str, uint32_t c) {
  if (!IsValidCodePoint(c))
    c = kBadChar;

  if (c <= kMaxOneByteChar) {
    str[0] = static_cast<char>(c);
    return 1;
  }
  if (c <= kMaxTwoByteChar) {
    str[0] = static_cast<char>(0xC0 | (c >> 6));
    str[1] = static_cast<char>(0x80 | (c & kContinuationPayloadMask));
    return 2;
  }
  if (c <= kMaxThreeByteChar) {
    str[0] = static_cast<char>(0xE0 | (c >> 12));
    str[1] = static_cast<char>(0x80 | ((c >> 6) & kContinuationPayloadMask));
    str[2] = static_cast<char>(0x80 | (c & kContinuationPayloadMask));
    return 3;
  }
  str[0] = static_cast<char>(0xF0 | (c >> 18));
  str[1] = static_cast<char>(0x80 | ((c >> 12) & kContinuationPayloadMask));
  str[2] = static_cast<char>(0x80 | ((c >> 6) & kContinuationPayloadMask));
  str[3] = static_cast<char>(0x80 | (c & kContinuationPayloadMask));
  return 4;
}

// Returns the number of bytes WriteUnicodeCharAsUTF8() writes for |c|.
// Invalid values are measured as U+FFFD (three bytes), mirroring the encoder.
size_t UnicodeCharUTF8Length(uint32_t c) {
  if (!IsValidCodePoint(c))
    c = kBadChar;

  if (c <= kMaxOneByteChar)
    return 1;
  if (c <= kMaxTwoByteChar)
    return 2;
  if (c <= kMaxThreeByteChar)
    return 3;
  return 4;
}

// Decodes one code point from the |length| bytes starting at |str|.
//
// On success, returns the code point and advances |*cursor| by the number of
// bytes consumed (1 to 4). On malformed input -- a stray continuation byte, a
// truncated sequence, an overlong encoding, an encoded surrogate, a value
// above U+10FFFF, or an invalid lead byte -- returns kBadChar and advances
// |*cursor| by exactly one byte. If |length| is zero, returns kBadChar and
// leaves |*cursor| untouched.
//
// Note that a well-formed EF BF BD also yields kBadChar; it can be told apart
// from an error because it advances |*cursor| by three.
uint32_t UnicodeCharFromUTF8(const uint8_t* str,
                             size_t length,
                             size_t* cursor) {
  if (length == 0) return kBadChar;

  const uint8_t first = str[0];
  // Characters between 0000 and 007F are encoded as a single byte.
  if (first <= kMaxOneByteChar) {
    *cursor += 1;
    return first;
  }

  // Classify the lead byte: how many bytes the sequence spans, the payload
  // bits carried by the lead byte, and the smallest value that legitimately
  // needs a sequence of that size (anything lower is an overlong encoding).
  size_t sequence_length = 0;  // Stays 0 for an invalid lead byte.
  uint32_t value = 0;
  uint32_t min_value = 0;
  if (first < 0xC0) {
    // 10xxxxxx: a continuation byte cannot start a sequence.
  } else if (first < 0xE0) {
    sequence_length = 2;
    value = first & 0x1F;
    min_value = kMaxOneByteChar + 1;
  } else if (first < 0xF0) {
    sequence_length = 3;
    value = first & 0x0F;
    min_value = kMaxTwoByteChar + 1;
  } else if (first < 0xF8) {
    sequence_length = 4;
    value = first & 0x07;
    min_value = kMaxThreeByteChar + 1;
  }

  // The whole sequence must be present, and each trailing byte must be a
  // continuation byte (10xxxxxx).
  bool well_formed = sequence_length != 0 && sequence_length <= length;
  for (size_t i = 1; well_formed && i < sequence_length; ++i) {
    const uint8_t next = str[i];
    well_formed = (next & 0xC0) == 0x80;
    value = (value << 6) | (next & kContinuationPayloadMask);
  }

  if (!well_formed || value < min_value || !IsValidCodePoint(value)) {
    // Skip only the offending lead byte so the caller can resynchronise.
    *cursor += 1;
    return kBadChar;
  }

  *cursor += sequence_length;
  return value;
}

}  // namespace
+ |
std::string SysWideToUTF8(const std::wstring& wide) { |
- // In theory this should be using the system-provided conversion rather |
- // than our ICU, but this will do for now. |
- return WideToUTF8(wide); |
+ size_t length = 0; |
+ for (size_t i = 0; i < wide.size(); ++i) |
+ length += UnicodeCharUTF8Length(wide[i]); |
+ |
+ std::string out(length, 0); |
+ size_t out_pos = 0; |
+ for (size_t i = 0; i < wide.size(); ++i) |
+ out_pos += WriteUnicodeCharAsUTF8(&out[out_pos], wide[i]); |
+ |
+ return out; |
} |
+ |
std::wstring SysUTF8ToWide(const StringPiece& utf8) { |
- // In theory this should be using the system-provided conversion rather |
- // than our ICU, but this will do for now. |
- std::wstring out; |
- UTF8ToWide(utf8.data(), utf8.size(), &out); |
+ size_t wide_length = 0; |
+ for (size_t pos = 0; pos < utf8.size(); ++wide_length) { |
+ if (UnicodeCharFromUTF8( |
+ reinterpret_cast<const uint8_t*>(utf8.data() + pos), |
+ utf8.size() - pos, &pos) == kBadChar) { |
+ return std::wstring(); // Failure, invalid conversion. |
+ } |
+ } |
+ |
+ std::wstring out(wide_length, 0); |
+ for (size_t pos = 0, wide_pos = 0; pos < utf8.size(); ++wide_pos) { |
+ out[wide_pos] = UnicodeCharFromUTF8( |
+ reinterpret_cast<const uint8_t*>(utf8.data() + pos), |
+ utf8.size() - pos, &pos); |
+ } |
+ |
return out; |
} |