Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(712)

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

Issue 2498653002: Return one U+fffd for longest subpart of incomplete utf-8 character. (Closed)
Patch Set: Remove dead assignments. Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/WebKit/LayoutTests/fast/encoding/char-decoding-invalid-trail.html ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions 5 * modification, are permitted provided that the following conditions
6 * are met: 6 * are met:
7 * 1. Redistributions of source code must retain the above copyright 7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer. 8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright 9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the 10 * notice, this list of conditions and the following disclaimer in the
(...skipping 18 matching lines...) Expand all
29 #include "wtf/text/CString.h" 29 #include "wtf/text/CString.h"
30 #include "wtf/text/CharacterNames.h" 30 #include "wtf/text/CharacterNames.h"
31 #include "wtf/text/StringBuffer.h" 31 #include "wtf/text/StringBuffer.h"
32 #include "wtf/text/TextCodecASCIIFastPath.h" 32 #include "wtf/text/TextCodecASCIIFastPath.h"
33 #include <memory> 33 #include <memory>
34 34
35 namespace WTF { 35 namespace WTF {
36 36
37 using namespace WTF::Unicode; 37 using namespace WTF::Unicode;
38 38
39 const int nonCharacter = -1; 39 // We'll use nonCharacter* constants to signal invalid utf-8.
40 // The number in the name signals how many input bytes were invalid.
41 const int nonCharacter1 = -1;
42 const int nonCharacter2 = -2;
43 const int nonCharacter3 = -3;
44
45 bool isNonCharacter(int character) {
46 return character >= nonCharacter3 && character <= nonCharacter1;
47 }
40 48
41 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&, 49 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&,
42 const void*) { 50 const void*) {
43 return wrapUnique(new TextCodecUTF8); 51 return wrapUnique(new TextCodecUTF8);
44 } 52 }
45 53
46 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) { 54 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {
47 registrar("UTF-8", "UTF-8"); 55 registrar("UTF-8", "UTF-8");
48 56
49 // Additional aliases that originally were present in the encoding 57 // Additional aliases that originally were present in the encoding
(...skipping 30 matching lines...) Expand all
80 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 88 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
81 return lengths[firstByte]; 89 return lengths[firstByte];
82 } 90 }
83 91
84 static inline int decodeNonASCIISequence(const uint8_t* sequence, 92 static inline int decodeNonASCIISequence(const uint8_t* sequence,
85 unsigned length) { 93 unsigned length) {
86 ASSERT(!isASCII(sequence[0])); 94 ASSERT(!isASCII(sequence[0]));
87 if (length == 2) { 95 if (length == 2) {
88 ASSERT(sequence[0] <= 0xDF); 96 ASSERT(sequence[0] <= 0xDF);
89 if (sequence[0] < 0xC2) 97 if (sequence[0] < 0xC2)
90 return nonCharacter; 98 return nonCharacter1;
91 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 99 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
92 return nonCharacter; 100 return nonCharacter1;
93 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; 101 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
94 } 102 }
95 if (length == 3) { 103 if (length == 3) {
96 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); 104 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
97 switch (sequence[0]) { 105 switch (sequence[0]) {
98 case 0xE0: 106 case 0xE0:
99 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) 107 if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
100 return nonCharacter; 108 return nonCharacter1;
101 break; 109 break;
102 case 0xED: 110 case 0xED:
103 if (sequence[1] < 0x80 || sequence[1] > 0x9F) 111 if (sequence[1] < 0x80 || sequence[1] > 0x9F)
104 return nonCharacter; 112 return nonCharacter1;
105 break; 113 break;
106 default: 114 default:
107 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 115 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
108 return nonCharacter; 116 return nonCharacter1;
109 } 117 }
110 if (sequence[2] < 0x80 || sequence[2] > 0xBF) 118 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
111 return nonCharacter; 119 return nonCharacter2;
112 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 120 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -
113 0x000E2080; 121 0x000E2080;
114 } 122 }
115 ASSERT(length == 4); 123 ASSERT(length == 4);
116 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); 124 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
117 switch (sequence[0]) { 125 switch (sequence[0]) {
118 case 0xF0: 126 case 0xF0:
119 if (sequence[1] < 0x90 || sequence[1] > 0xBF) 127 if (sequence[1] < 0x90 || sequence[1] > 0xBF)
120 return nonCharacter; 128 return nonCharacter1;
121 break; 129 break;
122 case 0xF4: 130 case 0xF4:
123 if (sequence[1] < 0x80 || sequence[1] > 0x8F) 131 if (sequence[1] < 0x80 || sequence[1] > 0x8F)
124 return nonCharacter; 132 return nonCharacter1;
125 break; 133 break;
126 default: 134 default:
127 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 135 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
128 return nonCharacter; 136 return nonCharacter1;
129 } 137 }
130 if (sequence[2] < 0x80 || sequence[2] > 0xBF) 138 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
131 return nonCharacter; 139 return nonCharacter2;
132 if (sequence[3] < 0x80 || sequence[3] > 0xBF) 140 if (sequence[3] < 0x80 || sequence[3] > 0xBF)
133 return nonCharacter; 141 return nonCharacter3;
134 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + 142 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +
135 sequence[3]) - 143 sequence[3]) -
136 0x03C82080; 144 0x03C82080;
137 } 145 }
138 146
139 static inline UChar* appendCharacter(UChar* destination, int character) { 147 static inline UChar* appendCharacter(UChar* destination, int character) {
140 ASSERT(character != nonCharacter); 148 DCHECK(!isNonCharacter(character));
141 ASSERT(!U_IS_SURROGATE(character)); 149 DCHECK(!U_IS_SURROGATE(character));
142 if (U_IS_BMP(character)) { 150 if (U_IS_BMP(character)) {
143 *destination++ = static_cast<UChar>(character); 151 *destination++ = static_cast<UChar>(character);
144 } else { 152 } else {
145 *destination++ = U16_LEAD(character); 153 *destination++ = U16_LEAD(character);
146 *destination++ = U16_TRAIL(character); 154 *destination++ = U16_TRAIL(character);
147 } 155 }
148 return destination; 156 return destination;
149 } 157 }
150 158
151 void TextCodecUTF8::consumePartialSequenceByte() { 159 void TextCodecUTF8::consumePartialSequenceByte() {
(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after
249 if (stopOnError) 257 if (stopOnError)
250 return false; 258 return false;
251 continue; 259 continue;
252 } 260 }
253 memcpy(m_partialSequence + m_partialSequenceSize, source, 261 memcpy(m_partialSequence + m_partialSequenceSize, source,
254 count - m_partialSequenceSize); 262 count - m_partialSequenceSize);
255 source += count - m_partialSequenceSize; 263 source += count - m_partialSequenceSize;
256 m_partialSequenceSize = count; 264 m_partialSequenceSize = count;
257 } 265 }
258 int character = decodeNonASCIISequence(m_partialSequence, count); 266 int character = decodeNonASCIISequence(m_partialSequence, count);
259 if (character == nonCharacter) { 267 if (isNonCharacter(character)) {
260 handleError(destination, stopOnError, sawError); 268 handleError(destination, stopOnError, sawError);
261 if (stopOnError) 269 if (stopOnError)
262 return false; 270 return false;
263 continue; 271 continue;
264 } 272 }
265 273
266 m_partialSequenceSize -= count; 274 m_partialSequenceSize -= count;
267 destination = appendCharacter(destination, character); 275 destination = appendCharacter(destination, character);
268 } while (m_partialSequenceSize); 276 } while (m_partialSequenceSize);
269 277
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after
321 break; 329 break;
322 if (!isASCII(*source)) 330 if (!isASCII(*source))
323 continue; 331 continue;
324 } 332 }
325 *destination++ = *source++; 333 *destination++ = *source++;
326 continue; 334 continue;
327 } 335 }
328 int count = nonASCIISequenceLength(*source); 336 int count = nonASCIISequenceLength(*source);
329 int character; 337 int character;
330 if (count == 0) { 338 if (count == 0) {
331 character = nonCharacter; 339 character = nonCharacter1;
332 } else { 340 } else {
333 if (count > end - source) { 341 if (count > end - source) {
334 SECURITY_DCHECK(end - source < 342 SECURITY_DCHECK(end - source <
335 static_cast<ptrdiff_t>(sizeof(m_partialSequence))); 343 static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
336 ASSERT(!m_partialSequenceSize); 344 ASSERT(!m_partialSequenceSize);
337 m_partialSequenceSize = end - source; 345 m_partialSequenceSize = end - source;
338 memcpy(m_partialSequence, source, m_partialSequenceSize); 346 memcpy(m_partialSequence, source, m_partialSequenceSize);
339 source = end; 347 source = end;
340 break; 348 break;
341 } 349 }
342 character = decodeNonASCIISequence(source, count); 350 character = decodeNonASCIISequence(source, count);
343 } 351 }
344 if (character == nonCharacter) { 352 if (isNonCharacter(character)) {
345 sawError = true; 353 sawError = true;
346 if (stopOnError) 354 if (stopOnError)
347 break; 355 break;
348 356
349 goto upConvertTo16Bit; 357 goto upConvertTo16Bit;
350 } 358 }
351 if (character > 0xff) 359 if (character > 0xff)
352 goto upConvertTo16Bit; 360 goto upConvertTo16Bit;
353 361
354 source += count; 362 source += count;
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
402 break; 410 break;
403 if (!isASCII(*source)) 411 if (!isASCII(*source))
404 continue; 412 continue;
405 } 413 }
406 *destination16++ = *source++; 414 *destination16++ = *source++;
407 continue; 415 continue;
408 } 416 }
409 int count = nonASCIISequenceLength(*source); 417 int count = nonASCIISequenceLength(*source);
410 int character; 418 int character;
411 if (count == 0) { 419 if (count == 0) {
412 character = nonCharacter; 420 character = nonCharacter1;
413 } else { 421 } else {
414 if (count > end - source) { 422 if (count > end - source) {
415 SECURITY_DCHECK(end - source < 423 SECURITY_DCHECK(end - source <
416 static_cast<ptrdiff_t>(sizeof(m_partialSequence))); 424 static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
417 ASSERT(!m_partialSequenceSize); 425 ASSERT(!m_partialSequenceSize);
418 m_partialSequenceSize = end - source; 426 m_partialSequenceSize = end - source;
419 memcpy(m_partialSequence, source, m_partialSequenceSize); 427 memcpy(m_partialSequence, source, m_partialSequenceSize);
420 source = end; 428 source = end;
421 break; 429 break;
422 } 430 }
423 character = decodeNonASCIISequence(source, count); 431 character = decodeNonASCIISequence(source, count);
424 } 432 }
425 if (character == nonCharacter) { 433 if (isNonCharacter(character)) {
426 sawError = true; 434 sawError = true;
427 if (stopOnError) 435 if (stopOnError)
428 break; 436 break;
429 // Each error generates a replacement character and consumes one byte. 437 // Each error generates one replacement character and consumes 1-3 bytes.
marja 2016/11/16 09:59:14 Pls fix this comment
vogelheim 2016/11/16 10:35:23 Done.
430 *destination16++ = replacementCharacter; 438 *destination16++ = replacementCharacter;
431 ++source; 439 source -= character;
432 continue; 440 continue;
433 } 441 }
434 source += count; 442 source += count;
435 destination16 = appendCharacter(destination16, character); 443 destination16 = appendCharacter(destination16, character);
436 } 444 }
437 } while (flush && m_partialSequenceSize); 445 } while (flush && m_partialSequenceSize);
438 446
439 buffer16.shrink(destination16 - buffer16.characters()); 447 buffer16.shrink(destination16 - buffer16.characters());
440 448
441 return String::adopt(buffer16); 449 return String::adopt(buffer16);
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
474 return encodeCommon(characters, length); 482 return encodeCommon(characters, length);
475 } 483 }
476 484
477 CString TextCodecUTF8::encode(const LChar* characters, 485 CString TextCodecUTF8::encode(const LChar* characters,
478 size_t length, 486 size_t length,
479 UnencodableHandling) { 487 UnencodableHandling) {
480 return encodeCommon(characters, length); 488 return encodeCommon(characters, length);
481 } 489 }
482 490
483 } // namespace WTF 491 } // namespace WTF
OLDNEW
« no previous file with comments | « third_party/WebKit/LayoutTests/fast/encoding/char-decoding-invalid-trail.html ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698