Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(890)

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

Issue 2764283002: Move files in wtf/ to platform/wtf/ (Part 10). (Closed)
Patch Set: Rebase. Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #include "wtf/text/TextCodecUTF8.h"
27
28 #include "wtf/PtrUtil.h"
29 #include "wtf/text/CString.h"
30 #include "wtf/text/CharacterNames.h"
31 #include "wtf/text/StringBuffer.h"
32 #include "wtf/text/TextCodecASCIIFastPath.h"
33 #include <memory>
34
35 namespace WTF {
36
37 using namespace WTF::Unicode;
38
// Negative sentinel values used in place of a code point to report malformed
// UTF-8. The digit in each name is the number of input bytes that were found
// to be invalid.
const int nonCharacter1 = -1;
const int nonCharacter2 = -2;
const int nonCharacter3 = -3;

// Returns true when |character| is one of the nonCharacter* sentinels above.
bool isNonCharacter(int character) {
  const bool notBelowSmallest = nonCharacter3 <= character;
  const bool notAboveLargest = character <= nonCharacter1;
  return notBelowSmallest && notAboveLargest;
}
48
49 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&,
50 const void*) {
51 return WTF::wrapUnique(new TextCodecUTF8);
52 }
53
54 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {
55 registrar("UTF-8", "UTF-8");
56
57 // Additional aliases that originally were present in the encoding
58 // table in WebKit on Macintosh, and subsequently added by
59 // TextCodecICU. Perhaps we can prove some are not used on the web
60 // and remove them.
61 registrar("unicode11utf8", "UTF-8");
62 registrar("unicode20utf8", "UTF-8");
63 registrar("utf8", "UTF-8");
64 registrar("x-unicode20utf8", "UTF-8");
65
66 // Additional aliases present in the WHATWG Encoding Standard
67 // (http://encoding.spec.whatwg.org/)
68 // and Firefox (24), but not in ICU 4.6.
69 registrar("unicode-1-1-utf-8", "UTF-8");
70 }
71
72 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) {
73 registrar("UTF-8", create, 0);
74 }
75
// Returns the total length in bytes (2, 3 or 4) of the UTF-8 sequence that
// |firstByte| introduces, or 0 when |firstByte| cannot start a multi-byte
// sequence. ASCII bytes also yield 0; callers handle them before calling.
//
// Continuation bytes (0x80-0xBF) and the lead bytes 0xC0/0xC1, which could
// only encode overlong forms, never start a well-formed sequence, so they map
// to 0. Reporting a length of 2 for them would make a caller wait for a
// second byte (and buffer the first across input chunks) before noticing an
// error that is already certain.
static inline int nonASCIISequenceLength(uint8_t firstByte) {
  static const uint8_t lengths[256] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0
      0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0
      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0
      4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0
  };
  return lengths[firstByte];
}
91
92 static inline int decodeNonASCIISequence(const uint8_t* sequence,
93 unsigned length) {
94 DCHECK(!isASCII(sequence[0]));
95 if (length == 2) {
96 DCHECK_LE(sequence[0], 0xDF);
97 if (sequence[0] < 0xC2)
98 return nonCharacter1;
99 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
100 return nonCharacter1;
101 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
102 }
103 if (length == 3) {
104 DCHECK_GE(sequence[0], 0xE0);
105 DCHECK_LE(sequence[0], 0xEF);
106 switch (sequence[0]) {
107 case 0xE0:
108 if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
109 return nonCharacter1;
110 break;
111 case 0xED:
112 if (sequence[1] < 0x80 || sequence[1] > 0x9F)
113 return nonCharacter1;
114 break;
115 default:
116 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
117 return nonCharacter1;
118 }
119 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
120 return nonCharacter2;
121 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -
122 0x000E2080;
123 }
124 DCHECK_EQ(length, 4u);
125 DCHECK_GE(sequence[0], 0xF0);
126 DCHECK_LE(sequence[0], 0xF4);
127 switch (sequence[0]) {
128 case 0xF0:
129 if (sequence[1] < 0x90 || sequence[1] > 0xBF)
130 return nonCharacter1;
131 break;
132 case 0xF4:
133 if (sequence[1] < 0x80 || sequence[1] > 0x8F)
134 return nonCharacter1;
135 break;
136 default:
137 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
138 return nonCharacter1;
139 }
140 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
141 return nonCharacter2;
142 if (sequence[3] < 0x80 || sequence[3] > 0xBF)
143 return nonCharacter3;
144 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +
145 sequence[3]) -
146 0x03C82080;
147 }
148
149 static inline UChar* appendCharacter(UChar* destination, int character) {
150 DCHECK(!isNonCharacter(character));
151 DCHECK(!U_IS_SURROGATE(character));
152 if (U_IS_BMP(character)) {
153 *destination++ = static_cast<UChar>(character);
154 } else {
155 *destination++ = U16_LEAD(character);
156 *destination++ = U16_TRAIL(character);
157 }
158 return destination;
159 }
160
161 void TextCodecUTF8::consumePartialSequenceByte() {
162 --m_partialSequenceSize;
163 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
164 }
165
166 void TextCodecUTF8::handleError(UChar*& destination,
167 bool stopOnError,
168 bool& sawError) {
169 sawError = true;
170 if (stopOnError)
171 return;
172 // Each error generates a replacement character and consumes one byte.
173 *destination++ = replacementCharacter;
174 consumePartialSequenceByte();
175 }
176
177 template <>
178 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination,
179 const uint8_t*& source,
180 const uint8_t* end,
181 bool flush,
182 bool,
183 bool&) {
184 DCHECK(m_partialSequenceSize);
185 do {
186 if (isASCII(m_partialSequence[0])) {
187 *destination++ = m_partialSequence[0];
188 consumePartialSequenceByte();
189 continue;
190 }
191 int count = nonASCIISequenceLength(m_partialSequence[0]);
192 if (!count)
193 return true;
194
195 if (count > m_partialSequenceSize) {
196 if (count - m_partialSequenceSize > end - source) {
197 if (!flush) {
198 // The new data is not enough to complete the sequence, so
199 // add it to the existing partial sequence.
200 memcpy(m_partialSequence + m_partialSequenceSize, source,
201 end - source);
202 m_partialSequenceSize += end - source;
203 return false;
204 }
205 // An incomplete partial sequence at the end is an error, but it will
206 // create a 16 bit string due to the replacementCharacter. Let the 16
207 // bit path handle the error.
208 return true;
209 }
210 memcpy(m_partialSequence + m_partialSequenceSize, source,
211 count - m_partialSequenceSize);
212 source += count - m_partialSequenceSize;
213 m_partialSequenceSize = count;
214 }
215 int character = decodeNonASCIISequence(m_partialSequence, count);
216 if (character & ~0xff)
217 return true;
218
219 m_partialSequenceSize -= count;
220 *destination++ = static_cast<LChar>(character);
221 } while (m_partialSequenceSize);
222
223 return false;
224 }
225
226 template <>
227 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination,
228 const uint8_t*& source,
229 const uint8_t* end,
230 bool flush,
231 bool stopOnError,
232 bool& sawError) {
233 DCHECK(m_partialSequenceSize);
234 do {
235 if (isASCII(m_partialSequence[0])) {
236 *destination++ = m_partialSequence[0];
237 consumePartialSequenceByte();
238 continue;
239 }
240 int count = nonASCIISequenceLength(m_partialSequence[0]);
241 if (!count) {
242 handleError(destination, stopOnError, sawError);
243 if (stopOnError)
244 return false;
245 continue;
246 }
247 if (count > m_partialSequenceSize) {
248 if (count - m_partialSequenceSize > end - source) {
249 if (!flush) {
250 // The new data is not enough to complete the sequence, so
251 // add it to the existing partial sequence.
252 memcpy(m_partialSequence + m_partialSequenceSize, source,
253 end - source);
254 m_partialSequenceSize += end - source;
255 return false;
256 }
257 // An incomplete partial sequence at the end is an error.
258 handleError(destination, stopOnError, sawError);
259 if (stopOnError)
260 return false;
261 continue;
262 }
263 memcpy(m_partialSequence + m_partialSequenceSize, source,
264 count - m_partialSequenceSize);
265 source += count - m_partialSequenceSize;
266 m_partialSequenceSize = count;
267 }
268 int character = decodeNonASCIISequence(m_partialSequence, count);
269 if (isNonCharacter(character)) {
270 handleError(destination, stopOnError, sawError);
271 if (stopOnError)
272 return false;
273 continue;
274 }
275
276 m_partialSequenceSize -= count;
277 destination = appendCharacter(destination, character);
278 } while (m_partialSequenceSize);
279
280 return false;
281 }
282
// Decodes |length| bytes of UTF-8 starting at |bytes| and returns the result
// as a String.
//
// Output is first written to an 8-bit (LChar) buffer. As soon as a decoded
// value above 0xff is seen, or an error is seen while |stopOnError| is false,
// control jumps to upConvertTo16Bit, the characters produced so far are
// copied into a 16-bit (UChar) buffer, and decoding continues there.
//
// A sequence that is cut off by the end of the input is stored in
// m_partialSequence; bytes already stored there are processed first via
// handlePartialSequence(). The outer loops run again while |flush| is set and
// buffered bytes remain. |sawError| is set to true when malformed input is
// encountered; with |stopOnError| the scanning loop is left at that point.
283 String TextCodecUTF8::decode(const char* bytes,
284 size_t length,
285 FlushBehavior flush,
286 bool stopOnError,
287 bool& sawError) {
288 // Each input byte might turn into a character.
289 // That includes all bytes in the partial-sequence buffer because
290 // each byte in an invalid sequence will turn into a replacement character.
291 StringBuffer<LChar> buffer(m_partialSequenceSize + length);
292
293 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
294 const uint8_t* end = source + length;
295 const uint8_t* alignedEnd = alignToMachineWord(end);
296 LChar* destination = buffer.characters();
297
298 do {
299 if (m_partialSequenceSize) {
300 // Explicitly copy destination and source pointers to avoid taking
301 // pointers to the local variables, which may harm code generation by
302 // disabling some optimizations in some compilers.
303 LChar* destinationForHandlePartialSequence = destination;
304 const uint8_t* sourceForHandlePartialSequence = source;
305 if (handlePartialSequence(destinationForHandlePartialSequence,
306 sourceForHandlePartialSequence, end, flush,
307 stopOnError, sawError)) {
// The 8-bit helper reported that it cannot finish the buffered sequence;
// keep the advanced source position and continue in the 16-bit path.
308 source = sourceForHandlePartialSequence;
309 goto upConvertTo16Bit;
310 }
311 destination = destinationForHandlePartialSequence;
312 source = sourceForHandlePartialSequence;
313 if (m_partialSequenceSize)
314 break;
315 }
316
317 while (source < end) {
318 if (isASCII(*source)) {
319 // Fast path for ASCII. Most UTF-8 text will be ASCII.
320 if (isAlignedToMachineWord(source)) {
321 while (source < alignedEnd) {
322 MachineWord chunk =
323 *reinterpret_cast_ptr<const MachineWord*>(source);
324 if (!isAllASCII<LChar>(chunk))
325 break;
326 copyASCIIMachineWord(destination, source);
327 source += sizeof(MachineWord);
328 destination += sizeof(MachineWord);
329 }
330 if (source == end)
331 break;
332 if (!isASCII(*source))
333 continue;
334 }
335 *destination++ = *source++;
336 continue;
337 }
338 int count = nonASCIISequenceLength(*source);
339 int character;
340 if (count == 0) {
341 character = nonCharacter1;
342 } else {
343 if (count > end - source) {
// The sequence extends past the end of this input: stash the bytes seen so
// far in m_partialSequence and stop scanning.
344 SECURITY_DCHECK(end - source <
345 static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
346 DCHECK(!m_partialSequenceSize);
347 m_partialSequenceSize = end - source;
348 memcpy(m_partialSequence, source, m_partialSequenceSize);
349 source = end;
350 break;
351 }
352 character = decodeNonASCIISequence(source, count);
353 }
354 if (isNonCharacter(character)) {
355 sawError = true;
356 if (stopOnError)
357 break;
358
359 goto upConvertTo16Bit;
360 }
361 if (character > 0xff)
362 goto upConvertTo16Bit;
363
364 source += count;
365 *destination++ = static_cast<LChar>(character);
366 }
367 } while (flush && m_partialSequenceSize);
368
369 buffer.shrink(destination - buffer.characters());
370
371 return String::adopt(buffer);
372
// 16-bit continuation: |source| still points at the first byte that the 8-bit
// pass did not consume, so that byte is examined again below.
373 upConvertTo16Bit:
374 StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
375
376 UChar* destination16 = buffer16.characters();
377
378 // Copy the already converted characters
379 for (LChar* converted8 = buffer.characters(); converted8 < destination;)
380 *destination16++ = *converted8++;
381
382 do {
383 if (m_partialSequenceSize) {
384 // Explicitly copy destination and source pointers to avoid taking
385 // pointers to the local variables, which may harm code generation by
386 // disabling some optimizations in some compilers.
387 UChar* destinationForHandlePartialSequence = destination16;
388 const uint8_t* sourceForHandlePartialSequence = source;
389 handlePartialSequence(destinationForHandlePartialSequence,
390 sourceForHandlePartialSequence, end, flush,
391 stopOnError, sawError);
392 destination16 = destinationForHandlePartialSequence;
393 source = sourceForHandlePartialSequence;
394 if (m_partialSequenceSize)
395 break;
396 }
397
398 while (source < end) {
399 if (isASCII(*source)) {
400 // Fast path for ASCII. Most UTF-8 text will be ASCII.
401 if (isAlignedToMachineWord(source)) {
402 while (source < alignedEnd) {
403 MachineWord chunk =
404 *reinterpret_cast_ptr<const MachineWord*>(source);
405 if (!isAllASCII<LChar>(chunk))
406 break;
407 copyASCIIMachineWord(destination16, source);
408 source += sizeof(MachineWord);
409 destination16 += sizeof(MachineWord);
410 }
411 if (source == end)
412 break;
413 if (!isASCII(*source))
414 continue;
415 }
416 *destination16++ = *source++;
417 continue;
418 }
419 int count = nonASCIISequenceLength(*source);
420 int character;
421 if (count == 0) {
422 character = nonCharacter1;
423 } else {
424 if (count > end - source) {
425 SECURITY_DCHECK(end - source <
426 static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
427 DCHECK(!m_partialSequenceSize);
428 m_partialSequenceSize = end - source;
429 memcpy(m_partialSequence, source, m_partialSequenceSize);
430 source = end;
431 break;
432 }
433 character = decodeNonASCIISequence(source, count);
434 }
435 if (isNonCharacter(character)) {
436 sawError = true;
437 if (stopOnError)
438 break;
439 // Each error generates one replacement character and consumes the
440 // 'largest subpart' of the incomplete character.
441 // Note that the nonCharacterX constants go from -1..-3 and contain
442 // the negative of number of bytes comprising the broken encoding
443 // detected. So subtracting c (when isNonCharacter(c)) adds the number
444 // of broken bytes.
445 *destination16++ = replacementCharacter;
446 source -= character;
447 continue;
448 }
449 source += count;
450 destination16 = appendCharacter(destination16, character);
451 }
452 } while (flush && m_partialSequenceSize);
453
454 buffer16.shrink(destination16 - buffer16.characters());
455
456 return String::adopt(buffer16);
457 }
458
459 template <typename CharType>
460 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) {
461 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
462 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes
463 // (3x).
464 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes
465 // (2x).
466 if (length > std::numeric_limits<size_t>::max() / 3)
467 CRASH();
468 Vector<uint8_t> bytes(length * 3);
469
470 size_t i = 0;
471 size_t bytesWritten = 0;
472 while (i < length) {
473 UChar32 character;
474 U16_NEXT(characters, i, length, character);
475 // U16_NEXT will simply emit a surrogate code point if an unmatched
476 // surrogate is encountered; we must convert it to a
477 // U+FFFD (REPLACEMENT CHARACTER) here.
478 if (0xD800 <= character && character <= 0xDFFF)
479 character = replacementCharacter;
480 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
481 }
482
483 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
484 }
485
486 CString TextCodecUTF8::encode(const UChar* characters,
487 size_t length,
488 UnencodableHandling) {
489 return encodeCommon(characters, length);
490 }
491
492 CString TextCodecUTF8::encode(const LChar* characters,
493 size_t length,
494 UnencodableHandling) {
495 return encodeCommon(characters, length);
496 }
497
498 } // namespace WTF
OLDNEW
« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecUTF8.h ('k') | third_party/WebKit/Source/wtf/text/TextCodecUserDefined.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698