base/sys_string_conversions_linux.cc - Issue 149526: Use v8's utf8 <-> wide conversion for Linux sys_string_conversion.

Unified Diff: base/sys_string_conversions_linux.cc

Issue 149526: Use v8's utf8 <-> wide conversion for Linux sys_string_conversion. (Closed)

Patch Set: Created 11 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: base/sys_string_conversions_linux.cc

diff --git a/base/sys_string_conversions_linux.cc b/base/sys_string_conversions_linux.cc

index 118f0ac48494e2048079457f1f6b094ab2f21b33..bd530d9330ffe19d111d1a70f8f84f193b01d3c0 100644

--- a/base/sys_string_conversions_linux.cc

+++ b/base/sys_string_conversions_linux.cc

@@ -6,21 +6,166 @@

#include <wchar.h>

+#include "base/basictypes.h"

#include "base/string_piece.h"

#include "base/string_util.h"

namespace base {

+// UTF8 <-> UCS-4 conversion from v8.

+namespace {

+const uint32_t kMaxEncodedSize = 4;

+const uint32_t kMaxOneByteChar = 0x7f;

+const uint32_t kMaxTwoByteChar = 0x7ff;

+const uint32_t kMaxThreeByteChar = 0xffff;

+const uint32_t kMaxFourByteChar = 0x1fffff;

+const uint32_t kBadChar = 0xFFFD;

+size_t WriteUnicodeCharAsUTF8(char* str, uint32_t c) {

darin (slow to review) 2009/07/14 16:51:41 this stuff seems like it would be better broken ou

+ static const int kMask = ~(1 << 6);

+ if (c <= kMaxOneByteChar) {

+ str[0] = c;

+ return 1;

+ } else if (c <= kMaxTwoByteChar) {

+ str[0] = 0xC0 | (c >> 6);

+ str[1] = 0x80 | (c & kMask);

+ return 2;

+ } else if (c <= kMaxThreeByteChar) {

+ str[0] = 0xE0 | (c >> 12);

+ str[1] = 0x80 | ((c >> 6) & kMask);

+ str[2] = 0x80 | (c & kMask);

+ return 3;

+ } else {

+ str[0] = 0xF0 | (c >> 18);

+ str[1] = 0x80 | ((c >> 12) & kMask);

+ str[2] = 0x80 | ((c >> 6) & kMask);

+ str[3] = 0x80 | (c & kMask);

+ return 4;

+ }

+size_t UnicodeCharUTF8Length(uint32_t c) {

+ if (c <= kMaxOneByteChar) {

+ return 1;

+ } else if (c <= kMaxTwoByteChar) {

+ return 2;

+ } else if (c <= kMaxThreeByteChar) {

+ return 3;

+ } else {

+ return 4;

+ }

+uint32_t UnicodeCharFromUTF8(const uint8_t* str,

+ size_t length,

+ size_t* cursor) {

+ if (length <= 0) return kBadChar;

+ uint8_t first = str[0];

+ // Characters between 0000 and 007F are encoded as a single byte

+ if (first <= kMaxOneByteChar) {

+ *cursor += 1;

+ return first;

+ }

+ // We only get here for non-ASCII characters.

+ if (length == 1) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ uint8_t second = str[1] ^ 0x80;

+ if (second & 0xC0) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ if (first < 0xE0) {

+ if (first < 0xC0) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ uint32_t l = ((first << 6) | second) & kMaxTwoByteChar;

+ if (l <= kMaxOneByteChar) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ *cursor += 2;

+ return l;

+ }

+ if (length == 2) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ uint8_t third = str[2] ^ 0x80;

+ if (third & 0xC0) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ if (first < 0xF0) {

+ uint32_t l = ((((first << 6) | second) << 6) | third) & kMaxThreeByteChar;

+ if (l <= kMaxTwoByteChar) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ *cursor += 3;

+ return l;

+ }

+ if (length == 3) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ uint8_t fourth = str[3] ^ 0x80;

+ if (fourth & 0xC0) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ if (first < 0xF8) {

+ uint32_t l = (((((first << 6 | second) << 6) | third) << 6) | fourth) &

+ kMaxFourByteChar;

+ if (l <= kMaxThreeByteChar) {

+ *cursor += 1;

+ return kBadChar;

+ }

+ *cursor += 4;

+ return l;

+ }

+ *cursor += 1;

+ return kBadChar;

+} // namespace

std::string SysWideToUTF8(const std::wstring& wide) {

- // In theory this should be using the system-provided conversion rather

- // than our ICU, but this will do for now.

- return WideToUTF8(wide);

+ size_t length = 0;

+ for (size_t i = 0; i < wide.size(); ++i)

+ length += UnicodeCharUTF8Length(wide[i]);

+ std::string out(length, 0);

+ size_t out_pos = 0;

+ for (size_t i = 0; i < wide.size(); ++i)

+ out_pos += WriteUnicodeCharAsUTF8(&out[out_pos], wide[i]);

+ return out;

}

std::wstring SysUTF8ToWide(const StringPiece& utf8) {

- // In theory this should be using the system-provided conversion rather

- // than our ICU, but this will do for now.

- std::wstring out;

- UTF8ToWide(utf8.data(), utf8.size(), &out);

+ size_t wide_length = 0;

+ for (size_t pos = 0; pos < utf8.size(); ++wide_length) {

+ if (UnicodeCharFromUTF8(

+ reinterpret_cast<const uint8_t*>(utf8.data() + pos),

+ utf8.size() - pos, &pos) == kBadChar) {

+ return std::wstring(); // Failure, invalid conversion.

+ }

+ std::wstring out(wide_length, 0);

+ for (size_t pos = 0, wide_pos = 0; pos < utf8.size(); ++wide_pos) {

+ out[wide_pos] = UnicodeCharFromUTF8(

+ reinterpret_cast<const uint8_t*>(utf8.data() + pos),

+ utf8.size() - pos, &pos);

+ }

return out;

}

« no previous file with comments | « no previous file | base/sys_string_conversions_unittest.cc » ('j') | no next file with comments »