third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp - Issue 2498653002: Return one U+fffd for longest subpart of incomplete utf-8 character.

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

Issue 2498653002: Return one U+fffd for longest subpart of incomplete utf-8 character. (Closed)

Patch Set: Updated test expectations. Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.	2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.

3 *	3 *

4 * Redistribution and use in source and binary forms, with or without	4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions	5 * modification, are permitted provided that the following conditions

6 * are met:	6 * are met:

7 * 1. Redistributions of source code must retain the above copyright	7 * 1. Redistributions of source code must retain the above copyright

8 * notice, this list of conditions and the following disclaimer.	8 * notice, this list of conditions and the following disclaimer.

9 * 2. Redistributions in binary form must reproduce the above copyright	9 * 2. Redistributions in binary form must reproduce the above copyright

10 * notice, this list of conditions and the following disclaimer in the	10 * notice, this list of conditions and the following disclaimer in the

(...skipping 18 matching lines...) Expand all Loading...
29 #include "wtf/text/CString.h"	29 #include "wtf/text/CString.h"

30 #include "wtf/text/CharacterNames.h"	30 #include "wtf/text/CharacterNames.h"

31 #include "wtf/text/StringBuffer.h"	31 #include "wtf/text/StringBuffer.h"

32 #include "wtf/text/TextCodecASCIIFastPath.h"	32 #include "wtf/text/TextCodecASCIIFastPath.h"

33 #include <memory>	33 #include <memory>

34	34

35 namespace WTF {	35 namespace WTF {

36	36

37 using namespace WTF::Unicode;	37 using namespace WTF::Unicode;

38	38

39 const int nonCharacter = -1;	39 // We'll use nonCharacter* constants to signal invalid utf-8.

	40 // The number in the name signals how many input bytes were invalid.

	41 const int nonCharacter1 = -1;

	42 const int nonCharacter2 = -2;

	43 const int nonCharacter3 = -3;

	44

	45 bool isNonCharacter(int character) {

	46 return character >= nonCharacter3 && character <= nonCharacter1;

	47 }

40	48

41 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&,	49 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&,

42 const void*) {	50 const void*) {

43 return wrapUnique(new TextCodecUTF8);	51 return wrapUnique(new TextCodecUTF8);

44 }	52 }

45	53

46 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {	54 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {

47 registrar("UTF-8", "UTF-8");	55 registrar("UTF-8", "UTF-8");

48	56

49 // Additional aliases that originally were present in the encoding	57 // Additional aliases that originally were present in the encoding

(...skipping 30 matching lines...) Expand all Loading...
80 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};	88 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

81 return lengths[firstByte];	89 return lengths[firstByte];

82 }	90 }

83	91

84 static inline int decodeNonASCIISequence(const uint8_t* sequence,	92 static inline int decodeNonASCIISequence(const uint8_t* sequence,

85 unsigned length) {	93 unsigned length) {

86 ASSERT(!isASCII(sequence[0]));	94 ASSERT(!isASCII(sequence[0]));

87 if (length == 2) {	95 if (length == 2) {

88 ASSERT(sequence[0] <= 0xDF);	96 ASSERT(sequence[0] <= 0xDF);

89 if (sequence[0] < 0xC2)	97 if (sequence[0] < 0xC2)

90 return nonCharacter;	98 return nonCharacter1;

91 if (sequence[1] < 0x80 || sequence[1] > 0xBF)	99 if (sequence[1] < 0x80 || sequence[1] > 0xBF)

92 return nonCharacter;	100 return nonCharacter1;

93 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;	101 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;

94 }	102 }

95 if (length == 3) {	103 if (length == 3) {

96 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);	104 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);

97 switch (sequence[0]) {	105 switch (sequence[0]) {

98 case 0xE0:	106 case 0xE0:

99 if (sequence[1] < 0xA0 || sequence[1] > 0xBF)	107 if (sequence[1] < 0xA0 || sequence[1] > 0xBF)

100 return nonCharacter;	108 return nonCharacter1;

101 break;	109 break;

102 case 0xED:	110 case 0xED:

103 if (sequence[1] < 0x80 || sequence[1] > 0x9F)	111 if (sequence[1] < 0x80 || sequence[1] > 0x9F)

104 return nonCharacter;	112 return nonCharacter1;

105 break;	113 break;

106 default:	114 default:

107 if (sequence[1] < 0x80 || sequence[1] > 0xBF)	115 if (sequence[1] < 0x80 || sequence[1] > 0xBF)

108 return nonCharacter;	116 return nonCharacter1;

109 }	117 }

110 if (sequence[2] < 0x80 || sequence[2] > 0xBF)	118 if (sequence[2] < 0x80 || sequence[2] > 0xBF)

111 return nonCharacter;	119 return nonCharacter2;

112 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -	120 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -

113 0x000E2080;	121 0x000E2080;

114 }	122 }

115 ASSERT(length == 4);	123 ASSERT(length == 4);

116 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);	124 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);

117 switch (sequence[0]) {	125 switch (sequence[0]) {

118 case 0xF0:	126 case 0xF0:

119 if (sequence[1] < 0x90 || sequence[1] > 0xBF)	127 if (sequence[1] < 0x90 || sequence[1] > 0xBF)

120 return nonCharacter;	128 return nonCharacter1;

121 break;	129 break;

122 case 0xF4:	130 case 0xF4:

123 if (sequence[1] < 0x80 || sequence[1] > 0x8F)	131 if (sequence[1] < 0x80 || sequence[1] > 0x8F)

124 return nonCharacter;	132 return nonCharacter1;

125 break;	133 break;

126 default:	134 default:

127 if (sequence[1] < 0x80 || sequence[1] > 0xBF)	135 if (sequence[1] < 0x80 || sequence[1] > 0xBF)

128 return nonCharacter;	136 return nonCharacter1;

129 }	137 }

130 if (sequence[2] < 0x80 || sequence[2] > 0xBF)	138 if (sequence[2] < 0x80 || sequence[2] > 0xBF)

131 return nonCharacter;	139 return nonCharacter2;

132 if (sequence[3] < 0x80 || sequence[3] > 0xBF)	140 if (sequence[3] < 0x80 || sequence[3] > 0xBF)

133 return nonCharacter;	141 return nonCharacter3;

134 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +	142 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +

135 sequence[3]) -	143 sequence[3]) -

136 0x03C82080;	144 0x03C82080;

137 }	145 }

138	146

139 static inline UChar* appendCharacter(UChar* destination, int character) {	147 static inline UChar* appendCharacter(UChar* destination, int character) {

140 ASSERT(character != nonCharacter);	148 DCHECK(!isNonCharacter(character));

141 ASSERT(!U_IS_SURROGATE(character));	149 DCHECK(!U_IS_SURROGATE(character));

142 if (U_IS_BMP(character)) {	150 if (U_IS_BMP(character)) {

143 *destination++ = static_cast<UChar>(character);	151 *destination++ = static_cast<UChar>(character);

144 } else {	152 } else {

145 *destination++ = U16_LEAD(character);	153 *destination++ = U16_LEAD(character);

146 *destination++ = U16_TRAIL(character);	154 *destination++ = U16_TRAIL(character);

147 }	155 }

148 return destination;	156 return destination;

149 }	157 }

150	158

151 void TextCodecUTF8::consumePartialSequenceByte() {	159 void TextCodecUTF8::consumePartialSequenceByte() {

(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
249 if (stopOnError)	257 if (stopOnError)

250 return false;	258 return false;

251 continue;	259 continue;

252 }	260 }

253 memcpy(m_partialSequence + m_partialSequenceSize, source,	261 memcpy(m_partialSequence + m_partialSequenceSize, source,

254 count - m_partialSequenceSize);	262 count - m_partialSequenceSize);

255 source += count - m_partialSequenceSize;	263 source += count - m_partialSequenceSize;

256 m_partialSequenceSize = count;	264 m_partialSequenceSize = count;

257 }	265 }

258 int character = decodeNonASCIISequence(m_partialSequence, count);	266 int character = decodeNonASCIISequence(m_partialSequence, count);

259 if (character == nonCharacter) {	267 if (isNonCharacter(character)) {

260 handleError(destination, stopOnError, sawError);	268 handleError(destination, stopOnError, sawError);

	269 count = -character;
	marja 2016/11/16 08:41:38: Looks like count doesn't affect anything.
	vogelheim 2016/11/16 09:53:16: On 2016/11/16 08:41:38, marja wrote: > Looks like count doesn't affect anything
	Removed.
261 if (stopOnError)	270 if (stopOnError)

262 return false;	271 return false;

263 continue;	272 continue;

264 }	273 }

265	274

266 m_partialSequenceSize -= count;	275 m_partialSequenceSize -= count;

267 destination = appendCharacter(destination, character);	276 destination = appendCharacter(destination, character);

268 } while (m_partialSequenceSize);	277 } while (m_partialSequenceSize);

269	278

270 return false;	279 return false;

(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
321 break;	330 break;

322 if (!isASCII(*source))	331 if (!isASCII(*source))

323 continue;	332 continue;

324 }	333 }

325 *destination++ = *source++;	334 *destination++ = *source++;

326 continue;	335 continue;

327 }	336 }

328 int count = nonASCIISequenceLength(*source);	337 int count = nonASCIISequenceLength(*source);

329 int character;	338 int character;

330 if (count == 0) {	339 if (count == 0) {

331 character = nonCharacter;	340 character = nonCharacter1;

332 } else {	341 } else {

333 if (count > end - source) {	342 if (count > end - source) {

334 SECURITY_DCHECK(end - source <	343 SECURITY_DCHECK(end - source <

335 static_cast<ptrdiff_t>(sizeof(m_partialSequence)));	344 static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

336 ASSERT(!m_partialSequenceSize);	345 ASSERT(!m_partialSequenceSize);

337 m_partialSequenceSize = end - source;	346 m_partialSequenceSize = end - source;

338 memcpy(m_partialSequence, source, m_partialSequenceSize);	347 memcpy(m_partialSequence, source, m_partialSequenceSize);

339 source = end;	348 source = end;

340 break;	349 break;

341 }	350 }

342 character = decodeNonASCIISequence(source, count);	351 character = decodeNonASCIISequence(source, count);

343 }	352 }

344 if (character == nonCharacter) {	353 if (isNonCharacter(character)) {

	354 count = -character;
	marja 2016/11/16 08:41:38: Ditto.
	vogelheim 2016/11/16 09:53:16: On 2016/11/16 08:41:38, marja wrote: > Ditto
	Removed. (All actual processing of non-ASCII characters is done in upConvertTo16Bit.)
345 sawError = true;	355 sawError = true;

346 if (stopOnError)	356 if (stopOnError)

347 break;	357 break;

348	358

349 goto upConvertTo16Bit;	359 goto upConvertTo16Bit;

350 }	360 }

351 if (character > 0xff)	361 if (character > 0xff)

352 goto upConvertTo16Bit;	362 goto upConvertTo16Bit;

353	363

354 source += count;	364 source += count;

(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
402 break;	412 break;

403 if (!isASCII(*source))	413 if (!isASCII(*source))

404 continue;	414 continue;

405 }	415 }

406 *destination16++ = *source++;	416 *destination16++ = *source++;

407 continue;	417 continue;

408 }	418 }

409 int count = nonASCIISequenceLength(*source);	419 int count = nonASCIISequenceLength(*source);

410 int character;	420 int character;

411 if (count == 0) {	421 if (count == 0) {

412 character = nonCharacter;	422 character = nonCharacter1;

413 } else {	423 } else {

414 if (count > end - source) {	424 if (count > end - source) {

415 SECURITY_DCHECK(end - source <	425 SECURITY_DCHECK(end - source <

416 static_cast<ptrdiff_t>(sizeof(m_partialSequence)));	426 static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

417 ASSERT(!m_partialSequenceSize);	427 ASSERT(!m_partialSequenceSize);

418 m_partialSequenceSize = end - source;	428 m_partialSequenceSize = end - source;

419 memcpy(m_partialSequence, source, m_partialSequenceSize);	429 memcpy(m_partialSequence, source, m_partialSequenceSize);

420 source = end;	430 source = end;

421 break;	431 break;

422 }	432 }

423 character = decodeNonASCIISequence(source, count);	433 character = decodeNonASCIISequence(source, count);

424 }	434 }

425 if (character == nonCharacter) {	435 if (isNonCharacter(character)) {

426 sawError = true;	436 sawError = true;

427 if (stopOnError)	437 if (stopOnError)

428 break;	438 break;

429 // Each error generates a replacement character and consumes one byte.	439 // Each error generates a replacement character and consumes one byte.

430 *destination16++ = replacementCharacter;	440 *destination16++ = replacementCharacter;

431 ++source;	441 source -= character;

432 continue;	442 continue;

433 }	443 }

434 source += count;	444 source += count;

435 destination16 = appendCharacter(destination16, character);	445 destination16 = appendCharacter(destination16, character);

436 }	446 }

437 } while (flush && m_partialSequenceSize);	447 } while (flush && m_partialSequenceSize);

438	448

439 buffer16.shrink(destination16 - buffer16.characters());	449 buffer16.shrink(destination16 - buffer16.characters());

440	450

441 return String::adopt(buffer16);	451 return String::adopt(buffer16);

(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
474 return encodeCommon(characters, length);	484 return encodeCommon(characters, length);

475 }	485 }

476	486

477 CString TextCodecUTF8::encode(const LChar* characters,	487 CString TextCodecUTF8::encode(const LChar* characters,

478 size_t length,	488 size_t length,

479 UnencodableHandling) {	489 UnencodableHandling) {

480 return encodeCommon(characters, length);	490 return encodeCommon(characters, length);

481 }	491 }

482	492

483 } // namespace WTF	493 } // namespace WTF

OLD	NEW

« no previous file with comments | « third_party/WebKit/LayoutTests/fast/encoding/char-decoding-invalid-trail.html ('k') | no next file » | no next file with comments »