OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. | |
3 * | |
4 * Redistribution and use in source and binary forms, with or without | |
5 * modification, are permitted provided that the following conditions | |
6 * are met: | |
7 * 1. Redistributions of source code must retain the above copyright | |
8 * notice, this list of conditions and the following disclaimer. | |
9 * 2. Redistributions in binary form must reproduce the above copyright | |
10 * notice, this list of conditions and the following disclaimer in the | |
11 * documentation and/or other materials provided with the distribution. | |
12 * | |
13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY | |
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR | |
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
24 */ | |
25 | |
26 #include "wtf/text/TextCodecUTF8.h" | |
27 | |
28 #include "wtf/PtrUtil.h" | |
29 #include "wtf/text/CString.h" | |
30 #include "wtf/text/CharacterNames.h" | |
31 #include "wtf/text/StringBuffer.h" | |
32 #include "wtf/text/TextCodecASCIIFastPath.h" | |
33 #include <memory> | |
34 | |
35 namespace WTF { | |
36 | |
37 using namespace WTF::Unicode; | |
38 | |
// Negative sentinels returned in place of a code point when the input is not
// valid UTF-8. The magnitude of each sentinel is the number of input bytes
// that belong to the malformed prefix and should be skipped by the caller.
const int nonCharacter1 = -1;
const int nonCharacter2 = -2;
const int nonCharacter3 = -3;

// Returns true when |character| is one of the nonCharacter* error sentinels
// rather than a decoded code point.
bool isNonCharacter(int character) {
  return nonCharacter3 <= character && character <= nonCharacter1;
}
48 | |
49 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&, | |
50 const void*) { | |
51 return WTF::wrapUnique(new TextCodecUTF8); | |
52 } | |
53 | |
54 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) { | |
55 registrar("UTF-8", "UTF-8"); | |
56 | |
57 // Additional aliases that originally were present in the encoding | |
58 // table in WebKit on Macintosh, and subsequently added by | |
59 // TextCodecICU. Perhaps we can prove some are not used on the web | |
60 // and remove them. | |
61 registrar("unicode11utf8", "UTF-8"); | |
62 registrar("unicode20utf8", "UTF-8"); | |
63 registrar("utf8", "UTF-8"); | |
64 registrar("x-unicode20utf8", "UTF-8"); | |
65 | |
66 // Additional aliases present in the WHATWG Encoding Standard | |
67 // (http://encoding.spec.whatwg.org/) | |
68 // and Firefox (24), but not in ICU 4.6. | |
69 registrar("unicode-1-1-utf-8", "UTF-8"); | |
70 } | |
71 | |
72 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) { | |
73 registrar("UTF-8", create, 0); | |
74 } | |
75 | |
// Maps the first byte of a UTF-8 sequence to the total number of bytes the
// sequence is expected to occupy (2, 3 or 4). Returns 0 when |firstByte|
// cannot start a multi-byte sequence: ASCII bytes, continuation bytes
// (0x80-0xBF) and bytes above 0xF4.
//
// Continuation bytes must map to 0 so that a stray one is reported as an
// error immediately. If they mapped to 2, a stray continuation byte at the
// end of a chunk would be buffered as an "incomplete sequence" and its error
// would be deferred until more input (or a flush) arrives.
//
// 0xC0 and 0xC1 still map to 2 although they can only start overlong
// encodings; decodeNonASCIISequence() rejects every lead byte below 0xC2.
static inline int nonASCIISequenceLength(uint8_t firstByte) {
  static const uint8_t lengths[256] = {
      // 0x00-0x7F: ASCII.
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x80-0xBF: continuation bytes, never a valid first byte.
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0xC0-0xDF: two-byte sequences.
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      // 0xE0-0xEF: three-byte sequences.
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      // 0xF0-0xF4: four-byte sequences; 0xF5-0xFF: invalid.
      4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  return lengths[firstByte];
}
91 | |
92 static inline int decodeNonASCIISequence(const uint8_t* sequence, | |
93 unsigned length) { | |
94 DCHECK(!isASCII(sequence[0])); | |
95 if (length == 2) { | |
96 DCHECK_LE(sequence[0], 0xDF); | |
97 if (sequence[0] < 0xC2) | |
98 return nonCharacter1; | |
99 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | |
100 return nonCharacter1; | |
101 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; | |
102 } | |
103 if (length == 3) { | |
104 DCHECK_GE(sequence[0], 0xE0); | |
105 DCHECK_LE(sequence[0], 0xEF); | |
106 switch (sequence[0]) { | |
107 case 0xE0: | |
108 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) | |
109 return nonCharacter1; | |
110 break; | |
111 case 0xED: | |
112 if (sequence[1] < 0x80 || sequence[1] > 0x9F) | |
113 return nonCharacter1; | |
114 break; | |
115 default: | |
116 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | |
117 return nonCharacter1; | |
118 } | |
119 if (sequence[2] < 0x80 || sequence[2] > 0xBF) | |
120 return nonCharacter2; | |
121 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - | |
122 0x000E2080; | |
123 } | |
124 DCHECK_EQ(length, 4u); | |
125 DCHECK_GE(sequence[0], 0xF0); | |
126 DCHECK_LE(sequence[0], 0xF4); | |
127 switch (sequence[0]) { | |
128 case 0xF0: | |
129 if (sequence[1] < 0x90 || sequence[1] > 0xBF) | |
130 return nonCharacter1; | |
131 break; | |
132 case 0xF4: | |
133 if (sequence[1] < 0x80 || sequence[1] > 0x8F) | |
134 return nonCharacter1; | |
135 break; | |
136 default: | |
137 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | |
138 return nonCharacter1; | |
139 } | |
140 if (sequence[2] < 0x80 || sequence[2] > 0xBF) | |
141 return nonCharacter2; | |
142 if (sequence[3] < 0x80 || sequence[3] > 0xBF) | |
143 return nonCharacter3; | |
144 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + | |
145 sequence[3]) - | |
146 0x03C82080; | |
147 } | |
148 | |
149 static inline UChar* appendCharacter(UChar* destination, int character) { | |
150 DCHECK(!isNonCharacter(character)); | |
151 DCHECK(!U_IS_SURROGATE(character)); | |
152 if (U_IS_BMP(character)) { | |
153 *destination++ = static_cast<UChar>(character); | |
154 } else { | |
155 *destination++ = U16_LEAD(character); | |
156 *destination++ = U16_TRAIL(character); | |
157 } | |
158 return destination; | |
159 } | |
160 | |
161 void TextCodecUTF8::consumePartialSequenceByte() { | |
162 --m_partialSequenceSize; | |
163 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize); | |
164 } | |
165 | |
166 void TextCodecUTF8::handleError(UChar*& destination, | |
167 bool stopOnError, | |
168 bool& sawError) { | |
169 sawError = true; | |
170 if (stopOnError) | |
171 return; | |
172 // Each error generates a replacement character and consumes one byte. | |
173 *destination++ = replacementCharacter; | |
174 consumePartialSequenceByte(); | |
175 } | |
176 | |
177 template <> | |
178 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, | |
179 const uint8_t*& source, | |
180 const uint8_t* end, | |
181 bool flush, | |
182 bool, | |
183 bool&) { | |
184 DCHECK(m_partialSequenceSize); | |
185 do { | |
186 if (isASCII(m_partialSequence[0])) { | |
187 *destination++ = m_partialSequence[0]; | |
188 consumePartialSequenceByte(); | |
189 continue; | |
190 } | |
191 int count = nonASCIISequenceLength(m_partialSequence[0]); | |
192 if (!count) | |
193 return true; | |
194 | |
195 if (count > m_partialSequenceSize) { | |
196 if (count - m_partialSequenceSize > end - source) { | |
197 if (!flush) { | |
198 // The new data is not enough to complete the sequence, so | |
199 // add it to the existing partial sequence. | |
200 memcpy(m_partialSequence + m_partialSequenceSize, source, | |
201 end - source); | |
202 m_partialSequenceSize += end - source; | |
203 return false; | |
204 } | |
205 // An incomplete partial sequence at the end is an error, but it will | |
206 // create a 16 bit string due to the replacementCharacter. Let the 16 | |
207 // bit path handle the error. | |
208 return true; | |
209 } | |
210 memcpy(m_partialSequence + m_partialSequenceSize, source, | |
211 count - m_partialSequenceSize); | |
212 source += count - m_partialSequenceSize; | |
213 m_partialSequenceSize = count; | |
214 } | |
215 int character = decodeNonASCIISequence(m_partialSequence, count); | |
216 if (character & ~0xff) | |
217 return true; | |
218 | |
219 m_partialSequenceSize -= count; | |
220 *destination++ = static_cast<LChar>(character); | |
221 } while (m_partialSequenceSize); | |
222 | |
223 return false; | |
224 } | |
225 | |
226 template <> | |
227 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, | |
228 const uint8_t*& source, | |
229 const uint8_t* end, | |
230 bool flush, | |
231 bool stopOnError, | |
232 bool& sawError) { | |
233 DCHECK(m_partialSequenceSize); | |
234 do { | |
235 if (isASCII(m_partialSequence[0])) { | |
236 *destination++ = m_partialSequence[0]; | |
237 consumePartialSequenceByte(); | |
238 continue; | |
239 } | |
240 int count = nonASCIISequenceLength(m_partialSequence[0]); | |
241 if (!count) { | |
242 handleError(destination, stopOnError, sawError); | |
243 if (stopOnError) | |
244 return false; | |
245 continue; | |
246 } | |
247 if (count > m_partialSequenceSize) { | |
248 if (count - m_partialSequenceSize > end - source) { | |
249 if (!flush) { | |
250 // The new data is not enough to complete the sequence, so | |
251 // add it to the existing partial sequence. | |
252 memcpy(m_partialSequence + m_partialSequenceSize, source, | |
253 end - source); | |
254 m_partialSequenceSize += end - source; | |
255 return false; | |
256 } | |
257 // An incomplete partial sequence at the end is an error. | |
258 handleError(destination, stopOnError, sawError); | |
259 if (stopOnError) | |
260 return false; | |
261 continue; | |
262 } | |
263 memcpy(m_partialSequence + m_partialSequenceSize, source, | |
264 count - m_partialSequenceSize); | |
265 source += count - m_partialSequenceSize; | |
266 m_partialSequenceSize = count; | |
267 } | |
268 int character = decodeNonASCIISequence(m_partialSequence, count); | |
269 if (isNonCharacter(character)) { | |
270 handleError(destination, stopOnError, sawError); | |
271 if (stopOnError) | |
272 return false; | |
273 continue; | |
274 } | |
275 | |
276 m_partialSequenceSize -= count; | |
277 destination = appendCharacter(destination, character); | |
278 } while (m_partialSequenceSize); | |
279 | |
280 return false; | |
281 } | |
282 | |
283 String TextCodecUTF8::decode(const char* bytes, | |
284 size_t length, | |
285 FlushBehavior flush, | |
286 bool stopOnError, | |
287 bool& sawError) { | |
288 // Each input byte might turn into a character. | |
289 // That includes all bytes in the partial-sequence buffer because | |
290 // each byte in an invalid sequence will turn into a replacement character. | |
291 StringBuffer<LChar> buffer(m_partialSequenceSize + length); | |
292 | |
293 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); | |
294 const uint8_t* end = source + length; | |
295 const uint8_t* alignedEnd = alignToMachineWord(end); | |
296 LChar* destination = buffer.characters(); | |
297 | |
298 do { | |
299 if (m_partialSequenceSize) { | |
300 // Explicitly copy destination and source pointers to avoid taking | |
301 // pointers to the local variables, which may harm code generation by | |
302 // disabling some optimizations in some compilers. | |
303 LChar* destinationForHandlePartialSequence = destination; | |
304 const uint8_t* sourceForHandlePartialSequence = source; | |
305 if (handlePartialSequence(destinationForHandlePartialSequence, | |
306 sourceForHandlePartialSequence, end, flush, | |
307 stopOnError, sawError)) { | |
308 source = sourceForHandlePartialSequence; | |
309 goto upConvertTo16Bit; | |
310 } | |
311 destination = destinationForHandlePartialSequence; | |
312 source = sourceForHandlePartialSequence; | |
313 if (m_partialSequenceSize) | |
314 break; | |
315 } | |
316 | |
317 while (source < end) { | |
318 if (isASCII(*source)) { | |
319 // Fast path for ASCII. Most UTF-8 text will be ASCII. | |
320 if (isAlignedToMachineWord(source)) { | |
321 while (source < alignedEnd) { | |
322 MachineWord chunk = | |
323 *reinterpret_cast_ptr<const MachineWord*>(source); | |
324 if (!isAllASCII<LChar>(chunk)) | |
325 break; | |
326 copyASCIIMachineWord(destination, source); | |
327 source += sizeof(MachineWord); | |
328 destination += sizeof(MachineWord); | |
329 } | |
330 if (source == end) | |
331 break; | |
332 if (!isASCII(*source)) | |
333 continue; | |
334 } | |
335 *destination++ = *source++; | |
336 continue; | |
337 } | |
338 int count = nonASCIISequenceLength(*source); | |
339 int character; | |
340 if (count == 0) { | |
341 character = nonCharacter1; | |
342 } else { | |
343 if (count > end - source) { | |
344 SECURITY_DCHECK(end - source < | |
345 static_cast<ptrdiff_t>(sizeof(m_partialSequence))); | |
346 DCHECK(!m_partialSequenceSize); | |
347 m_partialSequenceSize = end - source; | |
348 memcpy(m_partialSequence, source, m_partialSequenceSize); | |
349 source = end; | |
350 break; | |
351 } | |
352 character = decodeNonASCIISequence(source, count); | |
353 } | |
354 if (isNonCharacter(character)) { | |
355 sawError = true; | |
356 if (stopOnError) | |
357 break; | |
358 | |
359 goto upConvertTo16Bit; | |
360 } | |
361 if (character > 0xff) | |
362 goto upConvertTo16Bit; | |
363 | |
364 source += count; | |
365 *destination++ = static_cast<LChar>(character); | |
366 } | |
367 } while (flush && m_partialSequenceSize); | |
368 | |
369 buffer.shrink(destination - buffer.characters()); | |
370 | |
371 return String::adopt(buffer); | |
372 | |
373 upConvertTo16Bit: | |
374 StringBuffer<UChar> buffer16(m_partialSequenceSize + length); | |
375 | |
376 UChar* destination16 = buffer16.characters(); | |
377 | |
378 // Copy the already converted characters | |
379 for (LChar* converted8 = buffer.characters(); converted8 < destination;) | |
380 *destination16++ = *converted8++; | |
381 | |
382 do { | |
383 if (m_partialSequenceSize) { | |
384 // Explicitly copy destination and source pointers to avoid taking | |
385 // pointers to the local variables, which may harm code generation by | |
386 // disabling some optimizations in some compilers. | |
387 UChar* destinationForHandlePartialSequence = destination16; | |
388 const uint8_t* sourceForHandlePartialSequence = source; | |
389 handlePartialSequence(destinationForHandlePartialSequence, | |
390 sourceForHandlePartialSequence, end, flush, | |
391 stopOnError, sawError); | |
392 destination16 = destinationForHandlePartialSequence; | |
393 source = sourceForHandlePartialSequence; | |
394 if (m_partialSequenceSize) | |
395 break; | |
396 } | |
397 | |
398 while (source < end) { | |
399 if (isASCII(*source)) { | |
400 // Fast path for ASCII. Most UTF-8 text will be ASCII. | |
401 if (isAlignedToMachineWord(source)) { | |
402 while (source < alignedEnd) { | |
403 MachineWord chunk = | |
404 *reinterpret_cast_ptr<const MachineWord*>(source); | |
405 if (!isAllASCII<LChar>(chunk)) | |
406 break; | |
407 copyASCIIMachineWord(destination16, source); | |
408 source += sizeof(MachineWord); | |
409 destination16 += sizeof(MachineWord); | |
410 } | |
411 if (source == end) | |
412 break; | |
413 if (!isASCII(*source)) | |
414 continue; | |
415 } | |
416 *destination16++ = *source++; | |
417 continue; | |
418 } | |
419 int count = nonASCIISequenceLength(*source); | |
420 int character; | |
421 if (count == 0) { | |
422 character = nonCharacter1; | |
423 } else { | |
424 if (count > end - source) { | |
425 SECURITY_DCHECK(end - source < | |
426 static_cast<ptrdiff_t>(sizeof(m_partialSequence))); | |
427 DCHECK(!m_partialSequenceSize); | |
428 m_partialSequenceSize = end - source; | |
429 memcpy(m_partialSequence, source, m_partialSequenceSize); | |
430 source = end; | |
431 break; | |
432 } | |
433 character = decodeNonASCIISequence(source, count); | |
434 } | |
435 if (isNonCharacter(character)) { | |
436 sawError = true; | |
437 if (stopOnError) | |
438 break; | |
439 // Each error generates one replacement character and consumes the | |
440 // 'largest subpart' of the incomplete character. | |
441 // Note that the nonCharacterX constants go from -1..-3 and contain | |
442 // the negative of number of bytes comprising the broken encoding | |
443 // detected. So subtracting c (when isNonCharacter(c)) adds the number | |
444 // of broken bytes. | |
445 *destination16++ = replacementCharacter; | |
446 source -= character; | |
447 continue; | |
448 } | |
449 source += count; | |
450 destination16 = appendCharacter(destination16, character); | |
451 } | |
452 } while (flush && m_partialSequenceSize); | |
453 | |
454 buffer16.shrink(destination16 - buffer16.characters()); | |
455 | |
456 return String::adopt(buffer16); | |
457 } | |
458 | |
459 template <typename CharType> | |
460 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) { | |
461 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. | |
462 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes | |
463 // (3x). | |
464 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes | |
465 // (2x). | |
466 if (length > std::numeric_limits<size_t>::max() / 3) | |
467 CRASH(); | |
468 Vector<uint8_t> bytes(length * 3); | |
469 | |
470 size_t i = 0; | |
471 size_t bytesWritten = 0; | |
472 while (i < length) { | |
473 UChar32 character; | |
474 U16_NEXT(characters, i, length, character); | |
475 // U16_NEXT will simply emit a surrogate code point if an unmatched | |
476 // surrogate is encountered; we must convert it to a | |
477 // U+FFFD (REPLACEMENT CHARACTER) here. | |
478 if (0xD800 <= character && character <= 0xDFFF) | |
479 character = replacementCharacter; | |
480 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); | |
481 } | |
482 | |
483 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); | |
484 } | |
485 | |
486 CString TextCodecUTF8::encode(const UChar* characters, | |
487 size_t length, | |
488 UnencodableHandling) { | |
489 return encodeCommon(characters, length); | |
490 } | |
491 | |
492 CString TextCodecUTF8::encode(const LChar* characters, | |
493 size_t length, | |
494 UnencodableHandling) { | |
495 return encodeCommon(characters, length); | |
496 } | |
497 | |
498 } // namespace WTF | |
OLD | NEW |