base/sys_string_conversions_linux.cc - Issue 149526: Use v8's utf8 <-> wide conversion for Linux sys_string_conversion.

Side by Side Diff: base/sys_string_conversions_linux.cc

Issue 149526: Use v8's utf8 <-> wide conversion for Linux sys_string_conversion. (Closed)

Patch Set: Created 11 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "base/sys_string_conversions.h"	5 #include "base/sys_string_conversions.h"

6	6

7 #include <wchar.h>	7 #include <wchar.h>

8	8

	9 #include "base/basictypes.h"

9 #include "base/string_piece.h"	10 #include "base/string_piece.h"

10 #include "base/string_util.h"	11 #include "base/string_util.h"

11	12

12 namespace base {	13 namespace base {

13	14

	15 // UTF8 <-> UCS-4 conversion from v8.

	16 namespace {

	17

	18 const uint32_t kMaxEncodedSize = 4;

	19 const uint32_t kMaxOneByteChar = 0x7f;

	20 const uint32_t kMaxTwoByteChar = 0x7ff;

	21 const uint32_t kMaxThreeByteChar = 0xffff;

	22 const uint32_t kMaxFourByteChar = 0x1fffff;

	23 const uint32_t kBadChar = 0xFFFD;

	24

	25 size_t WriteUnicodeCharAsUTF8(char* str, uint32_t c) {
	darin (slow to review) 2009/07/14 16:51:41 This stuff seems like it would be better broken out as a base/utf8_util.{h,cc} file. Also, why not just use iconv here, if you need to avoid ICU -- do you? Why write all of this code?
	26 static const int kMask = ~(1 << 6);

	27 if (c <= kMaxOneByteChar) {

	28 str[0] = c;

	29 return 1;

	30 } else if (c <= kMaxTwoByteChar) {

	31 str[0] = 0xC0 | (c >> 6);

	32 str[1] = 0x80 | (c & kMask);

	33 return 2;

	34 } else if (c <= kMaxThreeByteChar) {

	35 str[0] = 0xE0 | (c >> 12);

	36 str[1] = 0x80 | ((c >> 6) & kMask);

	37 str[2] = 0x80 | (c & kMask);

	38 return 3;

	39 } else {

	40 str[0] = 0xF0 | (c >> 18);

	41 str[1] = 0x80 | ((c >> 12) & kMask);

	42 str[2] = 0x80 | ((c >> 6) & kMask);

	43 str[3] = 0x80 | (c & kMask);

	44 return 4;

	45 }

	46 }

	47

	48 size_t UnicodeCharUTF8Length(uint32_t c) {

	49 if (c <= kMaxOneByteChar) {

	50 return 1;

	51 } else if (c <= kMaxTwoByteChar) {

	52 return 2;

	53 } else if (c <= kMaxThreeByteChar) {

	54 return 3;

	55 } else {

	56 return 4;

	57 }

	58 }

	59

	60 uint32_t UnicodeCharFromUTF8(const uint8_t* str,

	61 size_t length,

	62 size_t* cursor) {

	63 if (length <= 0) return kBadChar;

	64

	65 uint8_t first = str[0];

	66 // Characters between 0000 and 0007F are encoded as a single character

	67 if (first <= kMaxOneByteChar) {

	68 *cursor += 1;

	69 return first;

	70 }

	71

	72 // We only get here for non-ascii characters.

	73 if (length == 1) {

	74 *cursor += 1;

	75 return kBadChar;

	76 }

	77

	78 uint8_t second = str[1] ^ 0x80;

	79 if (second & 0xC0) {

	80 *cursor += 1;

	81 return kBadChar;

	82 }

	83 if (first < 0xE0) {

	84 if (first < 0xC0) {

	85 *cursor += 1;

	86 return kBadChar;

	87 }

	88 uint32_t l = ((first << 6) | second) & kMaxTwoByteChar;

	89 if (l <= kMaxOneByteChar) {

	90 *cursor += 1;

	91 return kBadChar;

	92 }

	93 *cursor += 2;

	94 return l;

	95 }

	96 if (length == 2) {

	97 *cursor += 1;

	98 return kBadChar;

	99 }

	100 uint8_t third = str[2] ^ 0x80;

	101 if (third & 0xC0) {

	102 *cursor += 1;

	103 return kBadChar;

	104 }

	105 if (first < 0xF0) {

	106 uint32_t l = ((((first << 6) | second) << 6) | third) & kMaxThreeByteChar;

	107 if (l <= kMaxTwoByteChar) {

	108 *cursor += 1;

	109 return kBadChar;

	110 }

	111 *cursor += 3;

	112 return l;

	113 }

	114 if (length == 3) {

	115 *cursor += 1;

	116 return kBadChar;

	117 }

	118 uint8_t fourth = str[3] ^ 0x80;

	119 if (fourth & 0xC0) {

	120 *cursor += 1;

	121 return kBadChar;

	122 }

	123 if (first < 0xF8) {

	124 uint32_t l = (((((first << 6 | second) << 6) | third) << 6) | fourth) &

	125 kMaxFourByteChar;

	126 if (l <= kMaxThreeByteChar) {

	127 *cursor += 1;

	128 return kBadChar;

	129 }

	130 *cursor += 4;

	131 return l;

	132 }

	133 *cursor += 1;

	134 return kBadChar;

	135 }

	136

	137 } // namespace

	138

14 std::string SysWideToUTF8(const std::wstring& wide) {	139 std::string SysWideToUTF8(const std::wstring& wide) {

15 // In theory this should be using the system-provided conversion rather	140 size_t length = 0;

16 // than our ICU, but this will do for now.	141 for (size_t i = 0; i < wide.size(); ++i)

17 return WideToUTF8(wide);	142 length += UnicodeCharUTF8Length(wide[i]);

18 }	143

19 std::wstring SysUTF8ToWide(const StringPiece& utf8) {	144 std::string out(length, 0);

20 // In theory this should be using the system-provided conversion rather	145 size_t out_pos = 0;

21 // than our ICU, but this will do for now.	146 for (size_t i = 0; i < wide.size(); ++i)

22 std::wstring out;	147 out_pos += WriteUnicodeCharAsUTF8(&out[out_pos], wide[i]);

23 UTF8ToWide(utf8.data(), utf8.size(), &out);	148

24 return out;	149 return out;

25 }	150 }

26	151

	152 std::wstring SysUTF8ToWide(const StringPiece& utf8) {

	153 size_t wide_length = 0;

	154 for (size_t pos = 0; pos < utf8.size(); ++wide_length) {

	155 if (UnicodeCharFromUTF8(

	156 reinterpret_cast<const uint8_t*>(utf8.data() + pos),

	157 utf8.size() - pos, &pos) == kBadChar) {

	158 return std::wstring(); // Failure, invalid conversion.

	159 }

	160 }

	161

	162 std::wstring out(wide_length, 0);

	163 for (size_t pos = 0, wide_pos = 0; pos < utf8.size(); ++wide_pos) {

	164 out[wide_pos] = UnicodeCharFromUTF8(

	165 reinterpret_cast<const uint8_t*>(utf8.data() + pos),

	166 utf8.size() - pos, &pos);

	167 }

	168

	169 return out;

	170 }

	171

27 std::string SysWideToNativeMB(const std::wstring& wide) {	172 std::string SysWideToNativeMB(const std::wstring& wide) {

28 mbstate_t ps;	173 mbstate_t ps;

29	174

30 // Calculate the number of multi-byte characters. We walk through the string	175 // Calculate the number of multi-byte characters. We walk through the string

31 // without writing the output, counting the number of multi-byte characters.	176 // without writing the output, counting the number of multi-byte characters.

32 size_t num_out_chars = 0;	177 size_t num_out_chars = 0;

33 memset(&ps, 0, sizeof(ps));	178 memset(&ps, 0, sizeof(ps));

34 for (size_t i = 0; i < wide.size(); ++i) {	179 for (size_t i = 0; i < wide.size(); ++i) {

35 const wchar_t src = wide[i];	180 const wchar_t src = wide[i];

36 // Use a temp buffer since calling wcrtomb with an output of NULL does not	181 // Use a temp buffer since calling wcrtomb with an output of NULL does not

(...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after
135 default:	280 default:

136 i += res;	281 i += res;

137 break;	282 break;

138 }	283 }

139 }	284 }

140	285

141 return out;	286 return out;

142 }	287 }

143	288

144 } // namespace base	289 } // namespace base

OLD	NEW

« no previous file with comments | « no previous file | base/sys_string_conversions_unittest.cc » ('j') | no next file with comments »