Chromium Code Reviews | Index: base/sys_string_conversions_linux.cc |
| diff --git a/base/sys_string_conversions_linux.cc b/base/sys_string_conversions_linux.cc |
| index 118f0ac48494e2048079457f1f6b094ab2f21b33..bd530d9330ffe19d111d1a70f8f84f193b01d3c0 100644 |
| --- a/base/sys_string_conversions_linux.cc |
| +++ b/base/sys_string_conversions_linux.cc |
| @@ -6,21 +6,166 @@ |
| #include <wchar.h> |
| +#include "base/basictypes.h" |
| #include "base/string_piece.h" |
| #include "base/string_util.h" |
| namespace base { |
// UTF-8 <-> UCS-4 conversion, adapted from v8.
namespace {

// Longest possible UTF-8 encoding of a single character, in bytes.
const uint32_t kMaxEncodedSize = 4;
// Largest code points representable in 1, 2, 3 and 4 UTF-8 bytes respectively.
const uint32_t kMaxOneByteChar = 0x7f;
const uint32_t kMaxTwoByteChar = 0x7ff;
const uint32_t kMaxThreeByteChar = 0xffff;
const uint32_t kMaxFourByteChar = 0x1fffff;
// U+FFFD REPLACEMENT CHARACTER, returned by the decoder for malformed input.
const uint32_t kBadChar = 0xFFFD;

// Encodes |c| as UTF-8 into |str|, which must have room for at least
// kMaxEncodedSize (4) bytes, and returns the number of bytes written.
// Values above 0x1fffff are not representable in four bytes; no clamping is
// performed here, so callers are expected to stay within that range.
// NOTE(review, darin 2009/07/14): "this stuff seems like it would be better
// broken ou[t...]" — i.e. shared rather than duplicated in this file
// (comment truncated in the captured review).
size_t WriteUnicodeCharAsUTF8(char* str, uint32_t c) {
  // Each continuation byte is 10xxxxxx and carries six payload bits.
  if (c <= 0x7f) {
    // Plain ASCII: 0xxxxxxx.
    str[0] = static_cast<char>(c);
    return 1;
  }
  if (c <= 0x7ff) {
    // 110xxxxx 10xxxxxx.
    str[0] = static_cast<char>(0xC0 | (c >> 6));
    str[1] = static_cast<char>(0x80 | (c & 0x3F));
    return 2;
  }
  if (c <= 0xffff) {
    // 1110xxxx 10xxxxxx 10xxxxxx.
    str[0] = static_cast<char>(0xE0 | (c >> 12));
    str[1] = static_cast<char>(0x80 | ((c >> 6) & 0x3F));
    str[2] = static_cast<char>(0x80 | (c & 0x3F));
    return 3;
  }
  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
  str[0] = static_cast<char>(0xF0 | (c >> 18));
  str[1] = static_cast<char>(0x80 | ((c >> 12) & 0x3F));
  str[2] = static_cast<char>(0x80 | ((c >> 6) & 0x3F));
  str[3] = static_cast<char>(0x80 | (c & 0x3F));
  return 4;
}
| + |
// Returns how many bytes the UTF-8 encoding of |c| occupies — i.e. how many
// bytes WriteUnicodeCharAsUTF8 would write for the same value.
size_t UnicodeCharUTF8Length(uint32_t c) {
  if (c <= 0x7f)
    return 1;
  if (c <= 0x7ff)
    return 2;
  if (c <= 0xffff)
    return 3;
  return 4;
}
| + |
// Decodes a single code point from the UTF-8 data at |str|, of which
// |length| bytes are available. On a successful decode, advances |*cursor|
// by the number of bytes consumed (1-4) and returns the code point. On a
// malformed, overlong, or truncated sequence, advances |*cursor| by exactly
// one byte (so the caller can resynchronize) and returns kBadChar.
//
// NOTE(review): a well-formed three-byte encoding of U+FFFD also returns
// kBadChar, so callers cannot distinguish that character from an error by
// the return value alone. Also, any value up to kMaxFourByteChar (0x1fffff)
// is accepted; there is no rejection of surrogates or of code points above
// U+10FFFF here.
uint32_t UnicodeCharFromUTF8(const uint8_t* str,
                             size_t length,
                             size_t* cursor) {
  // |length| is unsigned, so this is effectively a length == 0 check. Note
  // that this early-out is the one error path that does NOT advance the
  // cursor; callers here always pass length >= 1.
  if (length <= 0) return kBadChar;

  uint8_t first = str[0];
  // Characters between 0000 and 0007F are encoded as a single character
  if (first <= kMaxOneByteChar) {
    *cursor += 1;
    return first;
  }

  // We only get here for non-ascii characters.
  if (length == 1) {
    // Multi-byte lead with nothing after it: truncated input.
    *cursor += 1;
    return kBadChar;
  }

  // XOR flips the top bit: a valid continuation byte (10xxxxxx) becomes
  // 00xxxxxx, leaving its six payload bits. Anything else keeps a bit
  // inside 0xC0 and is rejected below.
  uint8_t second = str[1] ^ 0x80;
  if (second & 0xC0) {
    *cursor += 1;
    return kBadChar;
  }
  if (first < 0xE0) {  // Two-byte sequence (lead 110xxxxx).
    if (first < 0xC0) {
      // 0x80..0xBF: a bare continuation byte is not a valid lead byte.
      *cursor += 1;
      return kBadChar;
    }
    uint32_t l = ((first << 6) | second) & kMaxTwoByteChar;
    if (l <= kMaxOneByteChar) {
      // Overlong encoding: the value would have fit in one byte.
      *cursor += 1;
      return kBadChar;
    }
    *cursor += 2;
    return l;
  }
  if (length == 2) {  // Need at least three bytes from here on.
    *cursor += 1;
    return kBadChar;
  }
  uint8_t third = str[2] ^ 0x80;
  if (third & 0xC0) {
    *cursor += 1;
    return kBadChar;
  }
  if (first < 0xF0) {  // Three-byte sequence (lead 1110xxxx).
    uint32_t l = ((((first << 6) | second) << 6) | third) & kMaxThreeByteChar;
    if (l <= kMaxTwoByteChar) {
      // Overlong encoding: the value would have fit in two bytes.
      *cursor += 1;
      return kBadChar;
    }
    *cursor += 3;
    return l;
  }
  if (length == 3) {  // Need four bytes from here on.
    *cursor += 1;
    return kBadChar;
  }
  uint8_t fourth = str[3] ^ 0x80;
  if (fourth & 0xC0) {
    *cursor += 1;
    return kBadChar;
  }
  if (first < 0xF8) {  // Four-byte sequence (lead 11110xxx).
    uint32_t l = (((((first << 6 | second) << 6) | third) << 6) | fourth) &
                 kMaxFourByteChar;
    if (l <= kMaxThreeByteChar) {
      // Overlong encoding: the value would have fit in three bytes.
      *cursor += 1;
      return kBadChar;
    }
    *cursor += 4;
    return l;
  }
  // Lead bytes 0xF8..0xFF are never valid UTF-8.
  *cursor += 1;
  return kBadChar;
}
| + |
| +} // namespace |
| + |
| std::string SysWideToUTF8(const std::wstring& wide) { |
| - // In theory this should be using the system-provided conversion rather |
| - // than our ICU, but this will do for now. |
| - return WideToUTF8(wide); |
| + size_t length = 0; |
| + for (size_t i = 0; i < wide.size(); ++i) |
| + length += UnicodeCharUTF8Length(wide[i]); |
| + |
| + std::string out(length, 0); |
| + size_t out_pos = 0; |
| + for (size_t i = 0; i < wide.size(); ++i) |
| + out_pos += WriteUnicodeCharAsUTF8(&out[out_pos], wide[i]); |
| + |
| + return out; |
| } |
| + |
| std::wstring SysUTF8ToWide(const StringPiece& utf8) { |
| - // In theory this should be using the system-provided conversion rather |
| - // than our ICU, but this will do for now. |
| - std::wstring out; |
| - UTF8ToWide(utf8.data(), utf8.size(), &out); |
| + size_t wide_length = 0; |
| + for (size_t pos = 0; pos < utf8.size(); ++wide_length) { |
| + if (UnicodeCharFromUTF8( |
| + reinterpret_cast<const uint8_t*>(utf8.data() + pos), |
| + utf8.size() - pos, &pos) == kBadChar) { |
| + return std::wstring(); // Failure, invalid conversion. |
| + } |
| + } |
| + |
| + std::wstring out(wide_length, 0); |
| + for (size_t pos = 0, wide_pos = 0; pos < utf8.size(); ++wide_pos) { |
| + out[wide_pos] = UnicodeCharFromUTF8( |
| + reinterpret_cast<const uint8_t*>(utf8.data() + pos), |
| + utf8.size() - pos, &pos); |
| + } |
| + |
| return out; |
| } |