Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(250)

Side by Side Diff: base/sys_string_conversions_linux.cc

Issue 149526: Use v8's utf8 <-> wide conversion for Linux sys_string_conversion. (Closed)
Patch Set: Created 11 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | base/sys_string_conversions_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "base/sys_string_conversions.h" 5 #include "base/sys_string_conversions.h"
6 6
7 #include <wchar.h> 7 #include <wchar.h>
8 8
9 #include "base/basictypes.h"
9 #include "base/string_piece.h" 10 #include "base/string_piece.h"
10 #include "base/string_util.h" 11 #include "base/string_util.h"
11 12
12 namespace base { 13 namespace base {
13 14
// UTF-8 <-> UCS-4 conversion helpers, adapted from v8.
//
// Review note (darin, 2009/07/14 16:51:41): "this stuff seems like it would be
// better broken ou" (the remark is truncated in the review page).
namespace {

// Largest number of bytes a single encoded character can occupy.
const uint32_t kMaxEncodedSize = 4;
// Largest code point representable in a 1-, 2-, 3- and 4-byte sequence.
const uint32_t kMaxOneByteChar = 0x7f;
const uint32_t kMaxTwoByteChar = 0x7ff;
const uint32_t kMaxThreeByteChar = 0xffff;
const uint32_t kMaxFourByteChar = 0x1fffff;
// U+FFFD REPLACEMENT CHARACTER; doubles as the decoder's error return value.
const uint32_t kBadChar = 0xFFFD;
// Bounds of the Unicode code space and of the UTF-16 surrogate range, which
// must never appear in UTF-8.
const uint32_t kMaxCodePoint = 0x10ffff;
const uint32_t kMinSurrogate = 0xd800;
const uint32_t kMaxSurrogate = 0xdfff;
// Payload bits carried by a UTF-8 continuation byte (10xxxxxx).
const uint32_t kContinuationMask = 0x3f;

// Returns true if |c| is a Unicode scalar value, i.e. it lies inside the code
// space and is not a surrogate.
bool IsValidCodePoint(uint32_t c) {
  return c <= kMaxCodePoint && (c < kMinSurrogate || c > kMaxSurrogate);
}

// Records a decoding error: skips exactly one input byte so that the caller
// can resynchronize, and yields kBadChar.
uint32_t BadSequence(size_t* cursor) {
  *cursor += 1;
  return kBadChar;
}

// Encodes |c| as UTF-8 into |str| and returns the number of bytes written
// (1 to 4). |str| must have room for that many bytes; use
// UnicodeCharUTF8Length() to size the buffer. No terminator is written.
// Values that are not Unicode scalar values (surrogates, anything above
// U+10FFFF) are written as U+FFFD instead of producing malformed output.
size_t WriteUnicodeCharAsUTF8(char* str, uint32_t c) {
  if (!IsValidCodePoint(c))
    c = kBadChar;

  if (c <= kMaxOneByteChar) {
    str[0] = static_cast<char>(c);
    return 1;
  }
  if (c <= kMaxTwoByteChar) {
    str[0] = static_cast<char>(0xC0 | (c >> 6));
    str[1] = static_cast<char>(0x80 | (c & kContinuationMask));
    return 2;
  }
  if (c <= kMaxThreeByteChar) {
    str[0] = static_cast<char>(0xE0 | (c >> 12));
    str[1] = static_cast<char>(0x80 | ((c >> 6) & kContinuationMask));
    str[2] = static_cast<char>(0x80 | (c & kContinuationMask));
    return 3;
  }
  str[0] = static_cast<char>(0xF0 | (c >> 18));
  str[1] = static_cast<char>(0x80 | ((c >> 12) & kContinuationMask));
  str[2] = static_cast<char>(0x80 | ((c >> 6) & kContinuationMask));
  str[3] = static_cast<char>(0x80 | (c & kContinuationMask));
  return 4;
}

// Returns the number of bytes WriteUnicodeCharAsUTF8() writes for |c|. The two
// functions must stay in agreement, including for invalid values, which are
// sized as the U+FFFD that gets written in their place.
size_t UnicodeCharUTF8Length(uint32_t c) {
  if (!IsValidCodePoint(c))
    c = kBadChar;

  if (c <= kMaxOneByteChar)
    return 1;
  if (c <= kMaxTwoByteChar)
    return 2;
  if (c <= kMaxThreeByteChar)
    return 3;
  return 4;
}

// Decodes one character from the |length| bytes starting at |str|.
//
// On success, returns the code point and advances |*cursor| by the number of
// bytes consumed (1 to 4). On malformed input (stray continuation byte,
// truncated sequence, overlong encoding, surrogate, value above U+10FFFF, or
// an invalid lead byte) returns kBadChar and advances |*cursor| by exactly
// one byte. Because a well-formed U+FFFD also returns kBadChar, callers that
// need to tell the two apart must look at how far |*cursor| moved (3 bytes
// for a real U+FFFD).
//
// If |length| is 0, returns kBadChar and leaves |*cursor| untouched; callers
// looping on the cursor must not call this with an empty range.
uint32_t UnicodeCharFromUTF8(const uint8_t* str,
                             size_t length,
                             size_t* cursor) {
  if (length == 0)
    return kBadChar;

  const uint8_t first = str[0];
  // Characters between 0000 and 007F are encoded as a single byte.
  if (first <= kMaxOneByteChar) {
    *cursor += 1;
    return first;
  }

  // Classify the lead byte: total sequence length, payload bits it carries,
  // and the smallest code point that legitimately needs this many bytes
  // (anything smaller is an overlong encoding).
  size_t sequence_length;
  uint32_t value;
  uint32_t min_value;
  if (first < 0xC0) {
    return BadSequence(cursor);  // Continuation byte with no lead byte.
  } else if (first < 0xE0) {
    sequence_length = 2;
    value = first & 0x1F;
    min_value = kMaxOneByteChar + 1;
  } else if (first < 0xF0) {
    sequence_length = 3;
    value = first & 0x0F;
    min_value = kMaxTwoByteChar + 1;
  } else if (first < 0xF8) {
    sequence_length = 4;
    value = first & 0x07;
    min_value = kMaxThreeByteChar + 1;
  } else {
    return BadSequence(cursor);  // 0xF8..0xFF never start a sequence.
  }

  if (length < sequence_length)
    return BadSequence(cursor);  // Input ends in the middle of a sequence.

  for (size_t i = 1; i < sequence_length; ++i) {
    const uint8_t next = str[i];
    if ((next & 0xC0) != 0x80)
      return BadSequence(cursor);  // Not a 10xxxxxx continuation byte.
    value = (value << 6) | (next & kContinuationMask);
  }

  if (value < min_value || !IsValidCodePoint(value))
    return BadSequence(cursor);

  *cursor += sequence_length;
  return value;
}

}  // namespace
138
14 std::string SysWideToUTF8(const std::wstring& wide) { 139 std::string SysWideToUTF8(const std::wstring& wide) {
15 // In theory this should be using the system-provided conversion rather 140 size_t length = 0;
16 // than our ICU, but this will do for now. 141 for (size_t i = 0; i < wide.size(); ++i)
17 return WideToUTF8(wide); 142 length += UnicodeCharUTF8Length(wide[i]);
18 } 143
19 std::wstring SysUTF8ToWide(const StringPiece& utf8) { 144 std::string out(length, 0);
20 // In theory this should be using the system-provided conversion rather 145 size_t out_pos = 0;
21 // than our ICU, but this will do for now. 146 for (size_t i = 0; i < wide.size(); ++i)
22 std::wstring out; 147 out_pos += WriteUnicodeCharAsUTF8(&out[out_pos], wide[i]);
23 UTF8ToWide(utf8.data(), utf8.size(), &out); 148
24 return out; 149 return out;
25 } 150 }
26 151
152 std::wstring SysUTF8ToWide(const StringPiece& utf8) {
153 size_t wide_length = 0;
154 for (size_t pos = 0; pos < utf8.size(); ++wide_length) {
155 if (UnicodeCharFromUTF8(
156 reinterpret_cast<const uint8_t*>(utf8.data() + pos),
157 utf8.size() - pos, &pos) == kBadChar) {
158 return std::wstring(); // Failure, invalid conversion.
159 }
160 }
161
162 std::wstring out(wide_length, 0);
163 for (size_t pos = 0, wide_pos = 0; pos < utf8.size(); ++wide_pos) {
164 out[wide_pos] = UnicodeCharFromUTF8(
165 reinterpret_cast<const uint8_t*>(utf8.data() + pos),
166 utf8.size() - pos, &pos);
167 }
168
169 return out;
170 }
171
27 std::string SysWideToNativeMB(const std::wstring& wide) { 172 std::string SysWideToNativeMB(const std::wstring& wide) {
28 mbstate_t ps; 173 mbstate_t ps;
29 174
30 // Calculate the number of multi-byte characters. We walk through the string 175 // Calculate the number of multi-byte characters. We walk through the string
31 // without writing the output, counting the number of multi-byte characters. 176 // without writing the output, counting the number of multi-byte characters.
32 size_t num_out_chars = 0; 177 size_t num_out_chars = 0;
33 memset(&ps, 0, sizeof(ps)); 178 memset(&ps, 0, sizeof(ps));
34 for (size_t i = 0; i < wide.size(); ++i) { 179 for (size_t i = 0; i < wide.size(); ++i) {
35 const wchar_t src = wide[i]; 180 const wchar_t src = wide[i];
36 // Use a temp buffer since calling wcrtomb with an output of NULL does not 181 // Use a temp buffer since calling wcrtomb with an output of NULL does not
(...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after
135 default: 280 default:
136 i += res; 281 i += res;
137 break; 282 break;
138 } 283 }
139 } 284 }
140 285
141 return out; 286 return out;
142 } 287 }
143 288
144 } // namespace base 289 } // namespace base
OLDNEW
« no previous file with comments | « no previous file | base/sys_string_conversions_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698