OLD | NEW |
| (Empty) |
1 // Copyright 2016 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "platform/inspector_protocol/InspectorProtocol.h" | |
6 | |
7 #include <algorithm> | |
8 #include <cctype> | |
9 #include <cstdio> | |
10 #include <locale> | |
11 | |
12 namespace blink { | |
13 namespace protocol { | |
14 | |
15 const UChar replacementCharacter = 0xFFFD; | |
16 using UChar32 = uint32_t; | |
17 | |
// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by the
// non-ASCII lead byte |b0|, or 0 when |b0| cannot start a sequence (it is an
// ASCII byte, a continuation byte, or has the form 11111xxx).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    // Work on the unsigned value so the range tests below are independent of
    // whether plain char is signed on this platform.
    const unsigned char lead = static_cast<unsigned char>(b0);

    if (lead < 0xC0) // 0xxxxxxx or 10xxxxxx: not a multi-byte lead byte.
        return 0;
    if (lead < 0xE0) // 110xxxxx
        return 2;
    if (lead < 0xF0) // 1110xxxx
        return 3;
    if (lead < 0xF8) // 11110xxx
        return 4;
    return 0; // 11111xxx
}
30 | |
31 inline int inlineUTF8SequenceLength(char b0) | |
32 { | |
33 return String16::isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); | |
34 } | |
35 | |
// Marker bits for the first byte of a UTF-8 sequence, indexed by the total
// number of bytes in that sequence. After a code point has been split into
// 6-bit groups, the entry for its length is OR-ed into the leading byte.
// Legal UTF-8 uses at most 4 bytes per code point, and convertUTF16ToUTF8
// never writes more than that, so it does not reach the entries for 5 and 6.
static const unsigned char firstByteMark[7] = {
    0x00, // 0: unused
    0x00, // 1 byte:  0xxxxxxx
    0xC0, // 2 bytes: 110xxxxx
    0xE0, // 3 bytes: 1110xxxx
    0xF0, // 4 bytes: 11110xxx
    0xF8, // 5 bytes: not legal UTF-8
    0xFC, // 6 bytes: not legal UTF-8
};
42 | |
// Outcome reported by the UTF-8 <-> UTF-16 conversion routines in this file.
enum ConversionResult {
    // The whole source range was converted.
    conversionOK,
    // The source ended in the middle of a character.
    sourceExhausted,
    // The target buffer had insufficient room for the conversion.
    targetExhausted,
    // The source contained an illegal or malformed sequence.
    sourceIllegal
};
49 | |
50 ConversionResult convertUTF16ToUTF8( | |
51 const UChar** sourceStart, const UChar* sourceEnd, | |
52 char** targetStart, char* targetEnd, bool strict) | |
53 { | |
54 ConversionResult result = conversionOK; | |
55 const UChar* source = *sourceStart; | |
56 char* target = *targetStart; | |
57 while (source < sourceEnd) { | |
58 UChar32 ch; | |
59 unsigned short bytesToWrite = 0; | |
60 const UChar32 byteMask = 0xBF; | |
61 const UChar32 byteMark = 0x80; | |
62 const UChar* oldSource = source; // In case we have to back up because o
f target overflow. | |
63 ch = static_cast<unsigned short>(*source++); | |
64 // If we have a surrogate pair, convert to UChar32 first. | |
65 if (ch >= 0xD800 && ch <= 0xDBFF) { | |
66 // If the 16 bits following the high surrogate are in the source buf
fer... | |
67 if (source < sourceEnd) { | |
68 UChar32 ch2 = static_cast<unsigned short>(*source); | |
69 // If it's a low surrogate, convert to UChar32. | |
70 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | |
71 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; | |
72 ++source; | |
73 } else if (strict) { // it's an unpaired high surrogate | |
74 --source; // return to the illegal value itself | |
75 result = sourceIllegal; | |
76 break; | |
77 } | |
78 } else { // We don't have the 16 bits following the high surrogate. | |
79 --source; // return to the high surrogate | |
80 result = sourceExhausted; | |
81 break; | |
82 } | |
83 } else if (strict) { | |
84 // UTF-16 surrogate values are illegal in UTF-32 | |
85 if (ch >= 0xDC00 && ch <= 0xDFFF) { | |
86 --source; // return to the illegal value itself | |
87 result = sourceIllegal; | |
88 break; | |
89 } | |
90 } | |
91 // Figure out how many bytes the result will require | |
92 if (ch < (UChar32)0x80) { | |
93 bytesToWrite = 1; | |
94 } else if (ch < (UChar32)0x800) { | |
95 bytesToWrite = 2; | |
96 } else if (ch < (UChar32)0x10000) { | |
97 bytesToWrite = 3; | |
98 } else if (ch < (UChar32)0x110000) { | |
99 bytesToWrite = 4; | |
100 } else { | |
101 bytesToWrite = 3; | |
102 ch = replacementCharacter; | |
103 } | |
104 | |
105 target += bytesToWrite; | |
106 if (target > targetEnd) { | |
107 source = oldSource; // Back up source pointer! | |
108 target -= bytesToWrite; | |
109 result = targetExhausted; | |
110 break; | |
111 } | |
112 switch (bytesToWrite) { // note: everything falls through. | |
113 case 4: | |
114 *--target = (char)((ch | byteMark) & byteMask); | |
115 ch >>= 6; | |
116 case 3: | |
117 *--target = (char)((ch | byteMark) & byteMask); | |
118 ch >>= 6; | |
119 case 2: | |
120 *--target = (char)((ch | byteMark) & byteMask); | |
121 ch >>= 6; | |
122 case 1: | |
123 *--target = (char)(ch | firstByteMark[bytesToWrite]); | |
124 } | |
125 target += bytesToWrite; | |
126 } | |
127 *sourceStart = source; | |
128 *targetStart = target; | |
129 return result; | |
130 } | |
131 | |
/**
 * Tests whether a code point lies in the Basic Multilingual Plane
 * (U+0000..U+ffff).
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_BMP(c) (static_cast<uint32_t>(c) <= 0xffff)

/**
 * Tests whether a code point is supplementary (U+10000..U+10ffff).
 * Values below 0x10000 wrap around in the unsigned subtraction and
 * therefore fail the comparison.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_SUPPLEMENTARY(c) (static_cast<uint32_t>((c) - 0x10000) <= 0xfffff)

/**
 * Tests whether a code point is a surrogate (U+d800..U+dfff).
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((c) & 0xfffff800) == 0xd800)

/**
 * Computes the lead surrogate (0xd800..0xdbff) of a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) static_cast<UChar>(((supplementary) >> 10) + 0xd7c0)

/**
 * Computes the trail surrogate (0xdc00..0xdfff) of a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) static_cast<UChar>(((supplementary) & 0x3ff) | 0xdc00)
173 | |
// Checks that the |length| bytes starting at |source| satisfy the byte-range
// rules for a UTF-8 sequence of that length.
//
// This must be called with the length pre-determined by the first byte; the
// function does not check that the lead byte actually announces |length|
// bytes. Lengths outside 1..4 return false, because the Unicode definition
// of UTF-8 goes up to 4-byte sequences.
//
// Rejected in particular: overlong forms (lead bytes 0xC0/0xC1, 0xE0 followed
// by a byte below 0xA0, 0xF0 followed by a byte below 0x90), encoded
// surrogates (0xED followed by a byte above 0x9F), values beyond U+10FFFF
// (0xF4 followed by a byte above 0x8F, or a lead byte above 0xF4), and any
// trailing byte outside 0x80..0xBF.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    const unsigned char lead = source[0];

    // Third and fourth bytes, when present, are plain continuation bytes.
    for (int index = length - 1; index >= 2; --index) {
        const unsigned char trailing = source[index];
        if (trailing < 0x80 || trailing > 0xBF)
            return false;
    }

    if (length >= 2) {
        // The second byte is a continuation byte too, but some lead bytes
        // narrow its range further. Both bounds are always enforced, so a
        // second byte below 0x80 is rejected for every lead byte, including
        // 0xED and 0xF4.
        unsigned char lowest = 0x80;
        unsigned char highest = 0xBF;
        switch (lead) {
        case 0xE0:
            lowest = 0xA0; // Below this would be an overlong 3-byte form.
            break;
        case 0xED:
            highest = 0x9F; // Above this would encode a surrogate.
            break;
        case 0xF0:
            lowest = 0x90; // Below this would be an overlong 4-byte form.
            break;
        case 0xF4:
            highest = 0x8F; // Above this would exceed U+10FFFF.
            break;
        default:
            break;
        }
        const unsigned char second = source[1];
        if (second < lowest || second > highest)
            return false;
    }

    // 0x80..0xBF are continuation bytes and 0xC0/0xC1 only start overlong
    // sequences; none of them may lead.
    if (lead >= 0x80 && lead < 0xC2)
        return false;
    // Lead bytes above 0xF4 would encode values past U+10FFFF.
    if (lead > 0xF4)
        return false;
    return true;
}
226 | |
227 // Magic values subtracted from a buffer value during UTF8 conversion. | |
228 // This table contains as many values as there might be trailing bytes | |
229 // in a UTF-8 sequence. | |
230 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E20
80UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x8
2082080UL) }; | |
231 | |
232 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) | |
233 { | |
234 UChar32 character = 0; | |
235 | |
236 // The cases all fall through. | |
237 switch (length) { | |
238 case 6: | |
239 character += static_cast<unsigned char>(*sequence++); | |
240 character <<= 6; | |
241 case 5: | |
242 character += static_cast<unsigned char>(*sequence++); | |
243 character <<= 6; | |
244 case 4: | |
245 character += static_cast<unsigned char>(*sequence++); | |
246 character <<= 6; | |
247 case 3: | |
248 character += static_cast<unsigned char>(*sequence++); | |
249 character <<= 6; | |
250 case 2: | |
251 character += static_cast<unsigned char>(*sequence++); | |
252 character <<= 6; | |
253 case 1: | |
254 character += static_cast<unsigned char>(*sequence++); | |
255 } | |
256 | |
257 return character - offsetsFromUTF8[length - 1]; | |
258 } | |
259 | |
260 ConversionResult convertUTF8ToUTF16( | |
261 const char** sourceStart, const char* sourceEnd, | |
262 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict) | |
263 { | |
264 ConversionResult result = conversionOK; | |
265 const char* source = *sourceStart; | |
266 UChar* target = *targetStart; | |
267 UChar orAllData = 0; | |
268 while (source < sourceEnd) { | |
269 int utf8SequenceLength = inlineUTF8SequenceLength(*source); | |
270 if (sourceEnd - source < utf8SequenceLength) { | |
271 result = sourceExhausted; | |
272 break; | |
273 } | |
274 // Do this check whether lenient or strict | |
275 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8Seq
uenceLength)) { | |
276 result = sourceIllegal; | |
277 break; | |
278 } | |
279 | |
280 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); | |
281 | |
282 if (target >= targetEnd) { | |
283 source -= utf8SequenceLength; // Back up source pointer! | |
284 result = targetExhausted; | |
285 break; | |
286 } | |
287 | |
288 if (U_IS_BMP(character)) { | |
289 // UTF-16 surrogate values are illegal in UTF-32 | |
290 if (U_IS_SURROGATE(character)) { | |
291 if (strict) { | |
292 source -= utf8SequenceLength; // return to the illegal value
itself | |
293 result = sourceIllegal; | |
294 break; | |
295 } | |
296 *target++ = replacementCharacter; | |
297 orAllData |= replacementCharacter; | |
298 } else { | |
299 *target++ = static_cast<UChar>(character); // normal case | |
300 orAllData |= character; | |
301 } | |
302 } else if (U_IS_SUPPLEMENTARY(character)) { | |
303 // target is a character in range 0xFFFF - 0x10FFFF | |
304 if (target + 1 >= targetEnd) { | |
305 source -= utf8SequenceLength; // Back up source pointer! | |
306 result = targetExhausted; | |
307 break; | |
308 } | |
309 *target++ = U16_LEAD(character); | |
310 *target++ = U16_TRAIL(character); | |
311 orAllData = 0xffff; | |
312 } else { | |
313 if (strict) { | |
314 source -= utf8SequenceLength; // return to the start | |
315 result = sourceIllegal; | |
316 break; // Bail out; shouldn't continue | |
317 } else { | |
318 *target++ = replacementCharacter; | |
319 orAllData |= replacementCharacter; | |
320 } | |
321 } | |
322 } | |
323 *sourceStart = source; | |
324 *targetStart = target; | |
325 | |
326 if (sourceAllASCII) | |
327 *sourceAllASCII = !(orAllData & ~0x7f); | |
328 | |
329 return result; | |
330 } | |
331 | |
332 // Helper to write a three-byte UTF-8 code point to the buffer, caller must chec
k room is available. | |
333 static inline void putUTF8Triple(char*& buffer, UChar ch) | |
334 { | |
335 DCHECK_GE(ch, 0x0800); | |
336 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); | |
337 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); | |
338 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); | |
339 } | |
340 | |
341 String16 String16::fromUTF8(const char* stringStart, size_t length) | |
342 { | |
343 if (!stringStart || !length) | |
344 return String16(); | |
345 | |
346 std::vector<UChar> buffer(length); | |
347 UChar* bufferStart = buffer.data(); | |
348 | |
349 UChar* bufferCurrent = bufferStart; | |
350 const char* stringCurrent = stringStart; | |
351 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent,
bufferCurrent + buffer.size(), 0, true) != conversionOK) | |
352 return String16(); | |
353 | |
354 unsigned utf16Length = bufferCurrent - bufferStart; | |
355 return String16(bufferStart, utf16Length); | |
356 } | |
357 | |
358 std::string String16::utf8() const | |
359 { | |
360 unsigned length = this->length(); | |
361 | |
362 if (!length) | |
363 return std::string(""); | |
364 | |
365 // Allocate a buffer big enough to hold all the characters | |
366 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). | |
367 // Optimization ideas, if we find this function is hot: | |
368 // * We could speculatively create a CStringBuffer to contain 'length' | |
369 // characters, and resize if necessary (i.e. if the buffer contains | |
370 // non-ascii characters). (Alternatively, scan the buffer first for | |
371 // ascii characters, so we know this will be sufficient). | |
372 // * We could allocate a CStringBuffer with an appropriate size to | |
373 // have a good chance of being able to write the string into the | |
374 // buffer without reallocing (say, 1.5 x length). | |
375 if (length > std::numeric_limits<unsigned>::max() / 3) | |
376 return std::string(); | |
377 std::vector<char> bufferVector(length * 3); | |
378 char* buffer = bufferVector.data(); | |
379 const UChar* characters = m_impl.data(); | |
380 | |
381 ConversionResult result = convertUTF16ToUTF8(&characters, characters + lengt
h, &buffer, buffer + bufferVector.size(), false); | |
382 DCHECK(result != targetExhausted); // (length * 3) should be sufficient for
any conversion | |
383 | |
384 // Only produced from strict conversion. | |
385 DCHECK(result != sourceIllegal); | |
386 | |
387 // Check for an unconverted high surrogate. | |
388 if (result == sourceExhausted) { | |
389 // This should be one unpaired high surrogate. Treat it the same | |
390 // was as an unpaired high surrogate would have been handled in | |
391 // the middle of a string with non-strict conversion - which is | |
392 // to say, simply encode it to UTF-8. | |
393 DCHECK((characters + 1) == (m_impl.data() + length)); | |
394 DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF)); | |
395 // There should be room left, since one UChar hasn't been | |
396 // converted. | |
397 DCHECK((buffer + 3) <= (buffer + bufferVector.size())); | |
398 putUTF8Triple(buffer, *characters); | |
399 } | |
400 | |
401 return std::string(bufferVector.data(), buffer - bufferVector.data()); | |
402 } | |
403 | |
404 } // namespace protocol | |
405 } // namespace blink | |
OLD | NEW |