third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp - Issue 2521943002: Return one U+fffd for longest subpart of incomplete utf-8 character.

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

Issue 2521943002: Return one U+fffd for longest subpart of incomplete utf-8 character. (Closed)

Patch Set: Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.	2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.

3 *	3 *

4 * Redistribution and use in source and binary forms, with or without	4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions	5 * modification, are permitted provided that the following conditions

6 * are met:	6 * are met:

7 * 1. Redistributions of source code must retain the above copyright	7 * 1. Redistributions of source code must retain the above copyright

8 * notice, this list of conditions and the following disclaimer.	8 * notice, this list of conditions and the following disclaimer.

9 * 2. Redistributions in binary form must reproduce the above copyright	9 * 2. Redistributions in binary form must reproduce the above copyright

10 * notice, this list of conditions and the following disclaimer in the	10 * notice, this list of conditions and the following disclaimer in the

(...skipping 18 matching lines...) Expand all Loading...
29 #include "wtf/text/CString.h"	29 #include "wtf/text/CString.h"

30 #include "wtf/text/CharacterNames.h"	30 #include "wtf/text/CharacterNames.h"

31 #include "wtf/text/StringBuffer.h"	31 #include "wtf/text/StringBuffer.h"

32 #include "wtf/text/TextCodecASCIIFastPath.h"	32 #include "wtf/text/TextCodecASCIIFastPath.h"

33 #include <memory>	33 #include <memory>

34	34

35 namespace WTF {	35 namespace WTF {

36	36

37 using namespace WTF::Unicode;	37 using namespace WTF::Unicode;

38	38

39 const int nonCharacter = -1;	39 // We'll use nonCharacter* constants to signal invalid utf-8.

	40 // The number in the name signals how many input bytes were invalid.

	41 const int nonCharacter1 = -1;

	42 const int nonCharacter2 = -2;

	43 const int nonCharacter3 = -3;

	44

	45 bool isNonCharacter(int character) {

	46 return character >= nonCharacter3 && character <= nonCharacter1;

	47 }

40	48

41 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&,	49 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&,

42 const void*) {	50 const void*) {

43 return wrapUnique(new TextCodecUTF8);	51 return wrapUnique(new TextCodecUTF8);

44 }	52 }

45	53

46 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {	54 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {

47 registrar("UTF-8", "UTF-8");	55 registrar("UTF-8", "UTF-8");

48	56

49 // Additional aliases that originally were present in the encoding	57 // Additional aliases that originally were present in the encoding

(...skipping 30 matching lines...) Expand all Loading...
80 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};	88 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

81 return lengths[firstByte];	89 return lengths[firstByte];

82 }	90 }

83	91

84 static inline int decodeNonASCIISequence(const uint8_t* sequence,	92 static inline int decodeNonASCIISequence(const uint8_t* sequence,

85 unsigned length) {	93 unsigned length) {

86 ASSERT(!isASCII(sequence[0]));	94 ASSERT(!isASCII(sequence[0]));

87 if (length == 2) {	95 if (length == 2) {

88 ASSERT(sequence[0] <= 0xDF);	96 ASSERT(sequence[0] <= 0xDF);

89 if (sequence[0] < 0xC2)	97 if (sequence[0] < 0xC2)

90 return nonCharacter;	98 return nonCharacter1;

91 if (sequence[1] < 0x80 || sequence[1] > 0xBF)	99 if (sequence[1] < 0x80 || sequence[1] > 0xBF)

92 return nonCharacter;	100 return nonCharacter1;

93 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;	101 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;

94 }	102 }

95 if (length == 3) {	103 if (length == 3) {

96 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);	104 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);

97 switch (sequence[0]) {	105 switch (sequence[0]) {

98 case 0xE0:	106 case 0xE0:

99 if (sequence[1] < 0xA0 || sequence[1] > 0xBF)	107 if (sequence[1] < 0xA0 || sequence[1] > 0xBF)

100 return nonCharacter;	108 return nonCharacter1;

101 break;	109 break;

102 case 0xED:	110 case 0xED:

103 if (sequence[1] < 0x80 || sequence[1] > 0x9F)	111 if (sequence[1] < 0x80 || sequence[1] > 0x9F)

104 return nonCharacter;	112 return nonCharacter1;

105 break;	113 break;

106 default:	114 default:

107 if (sequence[1] < 0x80 || sequence[1] > 0xBF)	115 if (sequence[1] < 0x80 || sequence[1] > 0xBF)

108 return nonCharacter;	116 return nonCharacter1;

109 }	117 }

110 if (sequence[2] < 0x80 || sequence[2] > 0xBF)	118 if (sequence[2] < 0x80 || sequence[2] > 0xBF)

111 return nonCharacter;	119 return nonCharacter2;

112 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -	120 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -

113 0x000E2080;	121 0x000E2080;

114 }	122 }

115 ASSERT(length == 4);	123 ASSERT(length == 4);

116 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);	124 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);

117 switch (sequence[0]) {	125 switch (sequence[0]) {

118 case 0xF0:	126 case 0xF0:

119 if (sequence[1] < 0x90 || sequence[1] > 0xBF)	127 if (sequence[1] < 0x90 || sequence[1] > 0xBF)

120 return nonCharacter;	128 return nonCharacter1;

121 break;	129 break;

122 case 0xF4:	130 case 0xF4:

123 if (sequence[1] < 0x80 || sequence[1] > 0x8F)	131 if (sequence[1] < 0x80 || sequence[1] > 0x8F)

124 return nonCharacter;	132 return nonCharacter1;

125 break;	133 break;

126 default:	134 default:

127 if (sequence[1] < 0x80 || sequence[1] > 0xBF)	135 if (sequence[1] < 0x80 || sequence[1] > 0xBF)

128 return nonCharacter;	136 return nonCharacter1;

129 }	137 }

130 if (sequence[2] < 0x80 || sequence[2] > 0xBF)	138 if (sequence[2] < 0x80 || sequence[2] > 0xBF)

131 return nonCharacter;	139 return nonCharacter2;

132 if (sequence[3] < 0x80 || sequence[3] > 0xBF)	140 if (sequence[3] < 0x80 || sequence[3] > 0xBF)

133 return nonCharacter;	141 return nonCharacter3;

134 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +	142 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +

135 sequence[3]) -	143 sequence[3]) -

136 0x03C82080;	144 0x03C82080;

137 }	145 }

138	146

139 static inline UChar* appendCharacter(UChar* destination, int character) {	147 static inline UChar* appendCharacter(UChar* destination, int character) {

140 ASSERT(character != nonCharacter);	148 DCHECK(!isNonCharacter(character));

141 ASSERT(!U_IS_SURROGATE(character));	149 DCHECK(!U_IS_SURROGATE(character));

142 if (U_IS_BMP(character)) {	150 if (U_IS_BMP(character)) {

143 *destination++ = static_cast<UChar>(character);	151 *destination++ = static_cast<UChar>(character);

144 } else {	152 } else {

145 *destination++ = U16_LEAD(character);	153 *destination++ = U16_LEAD(character);

146 *destination++ = U16_TRAIL(character);	154 *destination++ = U16_TRAIL(character);

147 }	155 }

148 return destination;	156 return destination;

149 }	157 }

150	158

151 void TextCodecUTF8::consumePartialSequenceByte() {	159 void TextCodecUTF8::consumePartialSequenceByte() {

(...skipping 97 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
249 if (stopOnError)	257 if (stopOnError)

250 return false;	258 return false;

251 continue;	259 continue;

252 }	260 }

253 memcpy(m_partialSequence + m_partialSequenceSize, source,	261 memcpy(m_partialSequence + m_partialSequenceSize, source,

254 count - m_partialSequenceSize);	262 count - m_partialSequenceSize);

255 source += count - m_partialSequenceSize;	263 source += count - m_partialSequenceSize;

256 m_partialSequenceSize = count;	264 m_partialSequenceSize = count;

257 }	265 }

258 int character = decodeNonASCIISequence(m_partialSequence, count);	266 int character = decodeNonASCIISequence(m_partialSequence, count);

259 if (character == nonCharacter) {	267 if (isNonCharacter(character)) {

260 handleError(destination, stopOnError, sawError);	268 handleError(destination, stopOnError, sawError);

261 if (stopOnError)	269 if (stopOnError)

262 return false;	270 return false;

263 continue;	271 continue;

264 }	272 }

265	273

266 m_partialSequenceSize -= count;	274 m_partialSequenceSize -= count;

267 destination = appendCharacter(destination, character);	275 destination = appendCharacter(destination, character);

268 } while (m_partialSequenceSize);	276 } while (m_partialSequenceSize);

269	277

(...skipping 51 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
321 break;	329 break;

322 if (!isASCII(*source))	330 if (!isASCII(*source))

323 continue;	331 continue;

324 }	332 }

325 *destination++ = *source++;	333 *destination++ = *source++;

326 continue;	334 continue;

327 }	335 }

328 int count = nonASCIISequenceLength(*source);	336 int count = nonASCIISequenceLength(*source);

329 int character;	337 int character;

330 if (count == 0) {	338 if (count == 0) {

331 character = nonCharacter;	339 character = nonCharacter1;

332 } else {	340 } else {

333 if (count > end - source) {	341 if (count > end - source) {

334 ASSERT_WITH_SECURITY_IMPLICATION(	342 ASSERT_WITH_SECURITY_IMPLICATION(

335 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));	343 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

336 ASSERT(!m_partialSequenceSize);	344 ASSERT(!m_partialSequenceSize);

337 m_partialSequenceSize = end - source;	345 m_partialSequenceSize = end - source;

338 memcpy(m_partialSequence, source, m_partialSequenceSize);	346 memcpy(m_partialSequence, source, m_partialSequenceSize);

339 source = end;	347 source = end;

340 break;	348 break;

341 }	349 }

342 character = decodeNonASCIISequence(source, count);	350 character = decodeNonASCIISequence(source, count);

343 }	351 }

344 if (character == nonCharacter) {	352 if (isNonCharacter(character)) {

345 sawError = true;	353 sawError = true;

346 if (stopOnError)	354 if (stopOnError)

347 break;	355 break;

348	356

349 goto upConvertTo16Bit;	357 goto upConvertTo16Bit;

350 }	358 }

351 if (character > 0xff)	359 if (character > 0xff)

352 goto upConvertTo16Bit;	360 goto upConvertTo16Bit;

353	361

354 source += count;	362 source += count;

(...skipping 47 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
402 break;	410 break;

403 if (!isASCII(*source))	411 if (!isASCII(*source))

404 continue;	412 continue;

405 }	413 }

406 *destination16++ = *source++;	414 *destination16++ = *source++;

407 continue;	415 continue;

408 }	416 }

409 int count = nonASCIISequenceLength(*source);	417 int count = nonASCIISequenceLength(*source);

410 int character;	418 int character;

411 if (count == 0) {	419 if (count == 0) {

412 character = nonCharacter;	420 character = nonCharacter1;

413 } else {	421 } else {

414 if (count > end - source) {	422 if (count > end - source) {

415 ASSERT_WITH_SECURITY_IMPLICATION(	423 ASSERT_WITH_SECURITY_IMPLICATION(

416 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));	424 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

417 ASSERT(!m_partialSequenceSize);	425 ASSERT(!m_partialSequenceSize);

418 m_partialSequenceSize = end - source;	426 m_partialSequenceSize = end - source;

419 memcpy(m_partialSequence, source, m_partialSequenceSize);	427 memcpy(m_partialSequence, source, m_partialSequenceSize);

420 source = end;	428 source = end;

421 break;	429 break;

422 }	430 }

423 character = decodeNonASCIISequence(source, count);	431 character = decodeNonASCIISequence(source, count);

424 }	432 }

425 if (character == nonCharacter) {	433 if (isNonCharacter(character)) {

426 sawError = true;	434 sawError = true;

427 if (stopOnError)	435 if (stopOnError)

428 break;	436 break;

429 // Each error generates a replacement character and consumes one byte.	437 // Each error generates one replacement character and consumes the

	438 // 'largest subpart' of the incomplete character.

	439 // Note that the nonCharacterX constants go from -1..-3 and contain

	440 // the negative of the number of bytes comprising the broken encoding

	441 // detected. So subtracting c (when isNonCharacter(c)) adds the number

	442 // of broken bytes.

430 *destination16++ = replacementCharacter;	443 *destination16++ = replacementCharacter;

431 ++source;	444 source -= character;

432 continue;	445 continue;

433 }	446 }

434 source += count;	447 source += count;

435 destination16 = appendCharacter(destination16, character);	448 destination16 = appendCharacter(destination16, character);

436 }	449 }

437 } while (flush && m_partialSequenceSize);	450 } while (flush && m_partialSequenceSize);

438	451

439 buffer16.shrink(destination16 - buffer16.characters());	452 buffer16.shrink(destination16 - buffer16.characters());

440	453

441 return String::adopt(buffer16);	454 return String::adopt(buffer16);

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
474 return encodeCommon(characters, length);	487 return encodeCommon(characters, length);

475 }	488 }

476	489

477 CString TextCodecUTF8::encode(const LChar* characters,	490 CString TextCodecUTF8::encode(const LChar* characters,

478 size_t length,	491 size_t length,

479 UnencodableHandling) {	492 UnencodableHandling) {

480 return encodeCommon(characters, length);	493 return encodeCommon(characters, length);

481 }	494 }

482	495

483 } // namespace WTF	496 } // namespace WTF

OLD	NEW

« no previous file with comments | « third_party/WebKit/LayoutTests/fast/encoding/char-decoding-invalid-trail.html ('k') | no next file » | no next file with comments »