third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp - Issue 1611343002: wtf reformat test

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

Issue 1611343002: wtf reformat test Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: pydent Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.	2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.

3 *	3 *

4 * Redistribution and use in source and binary forms, with or without	4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions	5 * modification, are permitted provided that the following conditions

6 * are met:	6 * are met:

7 * 1. Redistributions of source code must retain the above copyright	7 * 1. Redistributions of source code must retain the above copyright

8 * notice, this list of conditions and the following disclaimer.	8 * notice, this list of conditions and the following disclaimer.

9 * 2. Redistributions in binary form must reproduce the above copyright	9 * 2. Redistributions in binary form must reproduce the above copyright

10 * notice, this list of conditions and the following disclaimer in the	10 * notice, this list of conditions and the following disclaimer in the

(...skipping 18 matching lines...) Expand all Loading...
29 #include "wtf/text/CharacterNames.h"	29 #include "wtf/text/CharacterNames.h"

30 #include "wtf/text/StringBuffer.h"	30 #include "wtf/text/StringBuffer.h"

31 #include "wtf/text/TextCodecASCIIFastPath.h"	31 #include "wtf/text/TextCodecASCIIFastPath.h"

32	32

33 namespace WTF {	33 namespace WTF {

34	34

35 using namespace WTF::Unicode;	35 using namespace WTF::Unicode;

36	36

37 const int nonCharacter = -1;	37 const int nonCharacter = -1;

38	38

39 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)	39 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) {

40 {	40 return adoptPtr(new TextCodecUTF8);

41 return adoptPtr(new TextCodecUTF8);	41 }

42 }	42

43	43 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {

44 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)	44 registrar("UTF-8", "UTF-8");

45 {	45

46 registrar("UTF-8", "UTF-8");	46 // Additional aliases that originally were present in the encoding

47	47 // table in WebKit on Macintosh, and subsequently added by

48 // Additional aliases that originally were present in the encoding	48 // TextCodecICU. Perhaps we can prove some are not used on the web

49 // table in WebKit on Macintosh, and subsequently added by	49 // and remove them.

50 // TextCodecICU. Perhaps we can prove some are not used on the web	50 registrar("unicode11utf8", "UTF-8");

51 // and remove them.	51 registrar("unicode20utf8", "UTF-8");

52 registrar("unicode11utf8", "UTF-8");	52 registrar("utf8", "UTF-8");

53 registrar("unicode20utf8", "UTF-8");	53 registrar("x-unicode20utf8", "UTF-8");

54 registrar("utf8", "UTF-8");	54

55 registrar("x-unicode20utf8", "UTF-8");	55 // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)

56	56 // and Firefox (24), but not in ICU 4.6.

57 // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)	57 registrar("unicode-1-1-utf-8", "UTF-8");

58 // and Firefox (24), but not in ICU 4.6.	58 }

59 registrar("unicode-1-1-utf-8", "UTF-8");	59

60 }	60 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) {

61	61 registrar("UTF-8", create, 0);

62 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)	62 }

63 {	63

64 registrar("UTF-8", create, 0);	64 static inline int nonASCIISequenceLength(uint8_t firstByte) {

65 }	65 static const uint8_t lengths[256] = {

66	66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

67 static inline int nonASCIISequenceLength(uint8_t firstByte)	67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

68 {	68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

69 static const uint8_t lengths[256] = {	69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	71 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	72 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	73 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	74 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	75 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

76 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	76 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	77 return lengths[firstByte];

78 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	78 }

79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	79

80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	80 static inline int decodeNonASCIISequence(const uint8_t* sequence,

81 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	81 unsigned length) {

82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	82 ASSERT(!isASCII(sequence[0]));

83 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	83 if (length == 2) {

84 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	84 ASSERT(sequence[0] <= 0xDF);

85 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0	85 if (sequence[0] < 0xC2)

86 };	86 return nonCharacter;

87 return lengths[firstByte];	87 if (sequence[1] < 0x80 || sequence[1] > 0xBF)

88 }	88 return nonCharacter;

89	89 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;

90 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)	90 }

91 {	91 if (length == 3) {

92 ASSERT(!isASCII(sequence[0]));	92 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);

93 if (length == 2) {	93 switch (sequence[0]) {

94 ASSERT(sequence[0] <= 0xDF);	94 case 0xE0:

95 if (sequence[0] < 0xC2)	95 if (sequence[1] < 0xA0 || sequence[1] > 0xBF)

96 return nonCharacter;	96 return nonCharacter;

	97 break;

	98 case 0xED:

	99 if (sequence[1] < 0x80 || sequence[1] > 0x9F)

	100 return nonCharacter;

	101 break;

	102 default:

97 if (sequence[1] < 0x80 || sequence[1] > 0xBF)	103 if (sequence[1] < 0x80 || sequence[1] > 0xBF)

98 return nonCharacter;	104 return nonCharacter;

99 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;	105 }

100 }	106 if (sequence[2] < 0x80 || sequence[2] > 0xBF)

101 if (length == 3) {	107 return nonCharacter;

102 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);	108 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -

103 switch (sequence[0]) {	109 0x000E2080;

104 case 0xE0:	110 }

105 if (sequence[1] < 0xA0 || sequence[1] > 0xBF)	111 ASSERT(length == 4);

106 return nonCharacter;	112 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);

	113 switch (sequence[0]) {

	114 case 0xF0:

	115 if (sequence[1] < 0x90 || sequence[1] > 0xBF)

	116 return nonCharacter;

	117 break;

	118 case 0xF4:

	119 if (sequence[1] < 0x80 || sequence[1] > 0x8F)

	120 return nonCharacter;

	121 break;

	122 default:

	123 if (sequence[1] < 0x80 || sequence[1] > 0xBF)

	124 return nonCharacter;

	125 }

	126 if (sequence[2] < 0x80 || sequence[2] > 0xBF)

	127 return nonCharacter;

	128 if (sequence[3] < 0x80 || sequence[3] > 0xBF)

	129 return nonCharacter;

	130 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +

	131 sequence[3]) -

	132 0x03C82080;

	133 }

	134

	135 static inline UChar* appendCharacter(UChar* destination, int character) {

	136 ASSERT(character != nonCharacter);

	137 ASSERT(!U_IS_SURROGATE(character));

	138 if (U_IS_BMP(character)) {

	139 *destination++ = static_cast<UChar>(character);

	140 } else {

	141 *destination++ = U16_LEAD(character);

	142 *destination++ = U16_TRAIL(character);

	143 }

	144 return destination;

	145 }

	146

	147 void TextCodecUTF8::consumePartialSequenceByte() {

	148 --m_partialSequenceSize;

	149 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);

	150 }

	151

	152 void TextCodecUTF8::handleError(UChar*& destination,

	153 bool stopOnError,

	154 bool& sawError) {

	155 sawError = true;

	156 if (stopOnError)

	157 return;

	158 // Each error generates a replacement character and consumes one byte.

	159 *destination++ = replacementCharacter;

	160 consumePartialSequenceByte();

	161 }

	162

	163 template <>

	164 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination,

	165 const uint8_t*& source,

	166 const uint8_t* end,

	167 bool flush,

	168 bool,

	169 bool&) {

	170 ASSERT(m_partialSequenceSize);

	171 do {

	172 if (isASCII(m_partialSequence[0])) {

	173 *destination++ = m_partialSequence[0];

	174 consumePartialSequenceByte();

	175 continue;

	176 }

	177 int count = nonASCIISequenceLength(m_partialSequence[0]);

	178 if (!count)

	179 return true;

	180

	181 if (count > m_partialSequenceSize) {

	182 if (count - m_partialSequenceSize > end - source) {

	183 if (!flush) {

	184 // The new data is not enough to complete the sequence, so

	185 // add it to the existing partial sequence.

	186 memcpy(m_partialSequence + m_partialSequenceSize, source,

	187 end - source);

	188 m_partialSequenceSize += end - source;

	189 return false;

	190 }

	191 // An incomplete partial sequence at the end is an error, but it will create

	192 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle

	193 // the error.

	194 return true;

	195 }

	196 memcpy(m_partialSequence + m_partialSequenceSize, source,

	197 count - m_partialSequenceSize);

	198 source += count - m_partialSequenceSize;

	199 m_partialSequenceSize = count;

	200 }

	201 int character = decodeNonASCIISequence(m_partialSequence, count);

	202 if (character & ~0xff)

	203 return true;

	204

	205 m_partialSequenceSize -= count;

	206 *destination++ = static_cast<LChar>(character);

	207 } while (m_partialSequenceSize);

	208

	209 return false;

	210 }

	211

	212 template <>

	213 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination,

	214 const uint8_t*& source,

	215 const uint8_t* end,

	216 bool flush,

	217 bool stopOnError,

	218 bool& sawError) {

	219 ASSERT(m_partialSequenceSize);

	220 do {

	221 if (isASCII(m_partialSequence[0])) {

	222 *destination++ = m_partialSequence[0];

	223 consumePartialSequenceByte();

	224 continue;

	225 }

	226 int count = nonASCIISequenceLength(m_partialSequence[0]);

	227 if (!count) {

	228 handleError(destination, stopOnError, sawError);

	229 if (stopOnError)

	230 return false;

	231 continue;

	232 }

	233 if (count > m_partialSequenceSize) {

	234 if (count - m_partialSequenceSize > end - source) {

	235 if (!flush) {

	236 // The new data is not enough to complete the sequence, so

	237 // add it to the existing partial sequence.

	238 memcpy(m_partialSequence + m_partialSequenceSize, source,

	239 end - source);

	240 m_partialSequenceSize += end - source;

	241 return false;

	242 }

	243 // An incomplete partial sequence at the end is an error.

	244 handleError(destination, stopOnError, sawError);

	245 if (stopOnError)

	246 return false;

	247 continue;

	248 }

	249 memcpy(m_partialSequence + m_partialSequenceSize, source,

	250 count - m_partialSequenceSize);

	251 source += count - m_partialSequenceSize;

	252 m_partialSequenceSize = count;

	253 }

	254 int character = decodeNonASCIISequence(m_partialSequence, count);

	255 if (character == nonCharacter) {

	256 handleError(destination, stopOnError, sawError);

	257 if (stopOnError)

	258 return false;

	259 continue;

	260 }

	261

	262 m_partialSequenceSize -= count;

	263 destination = appendCharacter(destination, character);

	264 } while (m_partialSequenceSize);

	265

	266 return false;

	267 }

	268

	269 String TextCodecUTF8::decode(const char* bytes,

	270 size_t length,

	271 FlushBehavior flush,

	272 bool stopOnError,

	273 bool& sawError) {

	274 // Each input byte might turn into a character.

	275 // That includes all bytes in the partial-sequence buffer because

	276 // each byte in an invalid sequence will turn into a replacement character.

	277 StringBuffer<LChar> buffer(m_partialSequenceSize + length);

	278

	279 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);

	280 const uint8_t* end = source + length;

	281 const uint8_t* alignedEnd = alignToMachineWord(end);

	282 LChar* destination = buffer.characters();

	283

	284 do {

	285 if (m_partialSequenceSize) {

	286 // Explicitly copy destination and source pointers to avoid taking pointers to the

	287 // local variables, which may harm code generation by disabling some optimizations

	288 // in some compilers.

	289 LChar* destinationForHandlePartialSequence = destination;

	290 const uint8_t* sourceForHandlePartialSequence = source;

	291 if (handlePartialSequence(destinationForHandlePartialSequence,

	292 sourceForHandlePartialSequence, end, flush,

	293 stopOnError, sawError)) {

	294 source = sourceForHandlePartialSequence;

	295 goto upConvertTo16Bit;

	296 }

	297 destination = destinationForHandlePartialSequence;

	298 source = sourceForHandlePartialSequence;

	299 if (m_partialSequenceSize)

	300 break;

	301 }

	302

	303 while (source < end) {

	304 if (isASCII(*source)) {

	305 // Fast path for ASCII. Most UTF-8 text will be ASCII.

	306 if (isAlignedToMachineWord(source)) {

	307 while (source < alignedEnd) {

	308 MachineWord chunk =

	309 *reinterpret_cast_ptr<const MachineWord*>(source);

	310 if (!isAllASCII<LChar>(chunk))

	311 break;

	312 copyASCIIMachineWord(destination, source);

	313 source += sizeof(MachineWord);

	314 destination += sizeof(MachineWord);

	315 }

	316 if (source == end)

107 break;	317 break;

108 case 0xED:	318 if (!isASCII(*source))

109 if (sequence[1] < 0x80 || sequence[1] > 0x9F)	319 continue;

110 return nonCharacter;	320 }

	321 *destination++ = *source++;

	322 continue;

	323 }

	324 int count = nonASCIISequenceLength(*source);

	325 int character;

	326 if (count == 0) {

	327 character = nonCharacter;

	328 } else {

	329 if (count > end - source) {

	330 ASSERT_WITH_SECURITY_IMPLICATION(

	331 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

	332 ASSERT(!m_partialSequenceSize);

	333 m_partialSequenceSize = end - source;

	334 memcpy(m_partialSequence, source, m_partialSequenceSize);

	335 source = end;

	336 break;

	337 }

	338 character = decodeNonASCIISequence(source, count);

	339 }

	340 if (character == nonCharacter) {

	341 sawError = true;

	342 if (stopOnError)

	343 break;

	344

	345 goto upConvertTo16Bit;

	346 }

	347 if (character > 0xff)

	348 goto upConvertTo16Bit;

	349

	350 source += count;

	351 *destination++ = static_cast<LChar>(character);

	352 }

	353 } while (flush && m_partialSequenceSize);

	354

	355 buffer.shrink(destination - buffer.characters());

	356

	357 return String::adopt(buffer);

	358

	359 upConvertTo16Bit:

	360 StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

	361

	362 UChar* destination16 = buffer16.characters();

	363

	364 // Copy the already converted characters

	365 for (LChar* converted8 = buffer.characters(); converted8 < destination;)

	366 *destination16++ = *converted8++;

	367

	368 do {

	369 if (m_partialSequenceSize) {

	370 // Explicitly copy destination and source pointers to avoid taking pointers to the

	371 // local variables, which may harm code generation by disabling some optimizations

	372 // in some compilers.

	373 UChar* destinationForHandlePartialSequence = destination16;

	374 const uint8_t* sourceForHandlePartialSequence = source;

	375 handlePartialSequence(destinationForHandlePartialSequence,

	376 sourceForHandlePartialSequence, end, flush,

	377 stopOnError, sawError);

	378 destination16 = destinationForHandlePartialSequence;

	379 source = sourceForHandlePartialSequence;

	380 if (m_partialSequenceSize)

	381 break;

	382 }

	383

	384 while (source < end) {

	385 if (isASCII(*source)) {

	386 // Fast path for ASCII. Most UTF-8 text will be ASCII.

	387 if (isAlignedToMachineWord(source)) {

	388 while (source < alignedEnd) {

	389 MachineWord chunk =

	390 *reinterpret_cast_ptr<const MachineWord*>(source);

	391 if (!isAllASCII<LChar>(chunk))

	392 break;

	393 copyASCIIMachineWord(destination16, source);

	394 source += sizeof(MachineWord);

	395 destination16 += sizeof(MachineWord);

	396 }

	397 if (source == end)

111 break;	398 break;

112 default:	399 if (!isASCII(*source))

113 if (sequence[1] < 0x80 || sequence[1] > 0xBF)

114 return nonCharacter;

115 }

116 if (sequence[2] < 0x80 || sequence[2] > 0xBF)

117 return nonCharacter;

118 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;

119 }

120 ASSERT(length == 4);

121 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);

122 switch (sequence[0]) {

123 case 0xF0:

124 if (sequence[1] < 0x90 || sequence[1] > 0xBF)

125 return nonCharacter;

126 break;

127 case 0xF4:

128 if (sequence[1] < 0x80 || sequence[1] > 0x8F)

129 return nonCharacter;

130 break;

131 default:

132 if (sequence[1] < 0x80 || sequence[1] > 0xBF)

133 return nonCharacter;

134 }

135 if (sequence[2] < 0x80 || sequence[2] > 0xBF)

136 return nonCharacter;

137 if (sequence[3] < 0x80 || sequence[3] > 0xBF)

138 return nonCharacter;

139 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;

140 }

141

142 static inline UChar* appendCharacter(UChar* destination, int character)

143 {

144 ASSERT(character != nonCharacter);

145 ASSERT(!U_IS_SURROGATE(character));

146 if (U_IS_BMP(character)) {

147 *destination++ = static_cast<UChar>(character);

148 } else {

149 *destination++ = U16_LEAD(character);

150 *destination++ = U16_TRAIL(character);

151 }

152 return destination;

153 }

154

155 void TextCodecUTF8::consumePartialSequenceByte()

156 {

157 --m_partialSequenceSize;

158 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);

159 }

160

161 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)

162 {

163 sawError = true;

164 if (stopOnError)

165 return;

166 // Each error generates a replacement character and consumes one byte.

167 *destination++ = replacementCharacter;

168 consumePartialSequenceByte();

169 }

170

171 template <>

172 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)

173 {

174 ASSERT(m_partialSequenceSize);

175 do {

176 if (isASCII(m_partialSequence[0])) {

177 *destination++ = m_partialSequence[0];

178 consumePartialSequenceByte();

179 continue;	400 continue;

180 }	401 }

181 int count = nonASCIISequenceLength(m_partialSequence[0]);	402 *destination16++ = *source++;

182 if (!count)	403 continue;

183 return true;	404 }

184	405 int count = nonASCIISequenceLength(*source);

185 if (count > m_partialSequenceSize) {	406 int character;

186 if (count - m_partialSequenceSize > end - source) {	407 if (count == 0) {

187 if (!flush) {	408 character = nonCharacter;

188 // The new data is not enough to complete the sequence, so	409 } else {

189 // add it to the existing partial sequence.	410 if (count > end - source) {

190 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);	411 ASSERT_WITH_SECURITY_IMPLICATION(

191 m_partialSequenceSize += end - source;	412 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

192 return false;	413 ASSERT(!m_partialSequenceSize);

193 }	414 m_partialSequenceSize = end - source;

194 // An incomplete partial sequence at the end is an error, but it will create	415 memcpy(m_partialSequence, source, m_partialSequenceSize);

195 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle	416 source = end;

196 // the error.	417 break;

197 return true;	418 }

198 }	419 character = decodeNonASCIISequence(source, count);

199 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);	420 }

200 source += count - m_partialSequenceSize;	421 if (character == nonCharacter) {

201 m_partialSequenceSize = count;	422 sawError = true;

202 }	423 if (stopOnError)

203 int character = decodeNonASCIISequence(m_partialSequence, count);	424 break;

204 if (character & ~0xff)	425 // Each error generates a replacement character and consumes one byte.

205 return true;	426 *destination16++ = replacementCharacter;

206	427 ++source;

207 m_partialSequenceSize -= count;	428 continue;

208 *destination++ = static_cast<LChar>(character);	429 }

209 } while (m_partialSequenceSize);	430 source += count;

210	431 destination16 = appendCharacter(destination16, character);

211 return false;	432 }

212 }	433 } while (flush && m_partialSequenceSize);

213	434

214 template <>	435 buffer16.shrink(destination16 - buffer16.characters());

215 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)	436

216 {	437 return String::adopt(buffer16);

217 ASSERT(m_partialSequenceSize);	438 }

218 do {	439

219 if (isASCII(m_partialSequence[0])) {	440 template <typename CharType>

220 *destination++ = m_partialSequence[0];	441 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) {

221 consumePartialSequenceByte();	442 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.

222 continue;	443 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3 x).

223 }	444 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2 x).

224 int count = nonASCIISequenceLength(m_partialSequence[0]);	445 if (length > std::numeric_limits<size_t>::max() / 3)

225 if (!count) {	446 CRASH();

226 handleError(destination, stopOnError, sawError);	447 Vector<uint8_t> bytes(length * 3);

227 if (stopOnError)	448

228 return false;	449 size_t i = 0;

229 continue;	450 size_t bytesWritten = 0;

230 }	451 while (i < length) {

231 if (count > m_partialSequenceSize) {	452 UChar32 character;

232 if (count - m_partialSequenceSize > end - source) {	453 U16_NEXT(characters, i, length, character);

233 if (!flush) {	454 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate

234 // The new data is not enough to complete the sequence, so	455 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.

235 // add it to the existing partial sequence.	456 if (0xD800 <= character && character <= 0xDFFF)

236 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);	457 character = replacementCharacter;

237 m_partialSequenceSize += end - source;	458 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);

238 return false;	459 }

239 }	460

240 // An incomplete partial sequence at the end is an error.	461 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);

241 handleError(destination, stopOnError, sawError);	462 }

242 if (stopOnError)	463

243 return false;	464 CString TextCodecUTF8::encode(const UChar* characters,

244 continue;	465 size_t length,

245 }	466 UnencodableHandling) {

246 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);	467 return encodeCommon(characters, length);

247 source += count - m_partialSequenceSize;	468 }

248 m_partialSequenceSize = count;	469

249 }	470 CString TextCodecUTF8::encode(const LChar* characters,

250 int character = decodeNonASCIISequence(m_partialSequence, count);	471 size_t length,

251 if (character == nonCharacter) {	472 UnencodableHandling) {

252 handleError(destination, stopOnError, sawError);	473 return encodeCommon(characters, length);

253 if (stopOnError)	474 }

254 return false;	475

255 continue;	476 } // namespace WTF

256 }

257

258 m_partialSequenceSize -= count;

259 destination = appendCharacter(destination, character);

260 } while (m_partialSequenceSize);

261

262 return false;

263 }

264

265 String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError)

266 {

267 // Each input byte might turn into a character.

268 // That includes all bytes in the partial-sequence buffer because

269 // each byte in an invalid sequence will turn into a replacement character.

270 StringBuffer<LChar> buffer(m_partialSequenceSize + length);

271

272 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);

273 const uint8_t* end = source + length;

274 const uint8_t* alignedEnd = alignToMachineWord(end);

275 LChar* destination = buffer.characters();

276

277 do {

278 if (m_partialSequenceSize) {

279 // Explicitly copy destination and source pointers to avoid taking pointers to the

280 // local variables, which may harm code generation by disabling some optimizations

281 // in some compilers.

282 LChar* destinationForHandlePartialSequence = destination;

283 const uint8_t* sourceForHandlePartialSequence = source;

284 if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {

285 source = sourceForHandlePartialSequence;

286 goto upConvertTo16Bit;

287 }

288 destination = destinationForHandlePartialSequence;

289 source = sourceForHandlePartialSequence;

290 if (m_partialSequenceSize)

291 break;

292 }

293

294 while (source < end) {

295 if (isASCII(*source)) {

296 // Fast path for ASCII. Most UTF-8 text will be ASCII.

297 if (isAlignedToMachineWord(source)) {

298 while (source < alignedEnd) {

299 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

300 if (!isAllASCII<LChar>(chunk))

301 break;

302 copyASCIIMachineWord(destination, source);

303 source += sizeof(MachineWord);

304 destination += sizeof(MachineWord);

305 }

306 if (source == end)

307 break;

308 if (!isASCII(*source))

309 continue;

310 }

311 *destination++ = *source++;

312 continue;

313 }

314 int count = nonASCIISequenceLength(*source);

315 int character;

316 if (count == 0) {

317 character = nonCharacter;

318 } else {

319 if (count > end - source) {

320 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

321 ASSERT(!m_partialSequenceSize);

322 m_partialSequenceSize = end - source;

323 memcpy(m_partialSequence, source, m_partialSequenceSize);

324 source = end;

325 break;

326 }

327 character = decodeNonASCIISequence(source, count);

328 }

329 if (character == nonCharacter) {

330 sawError = true;

331 if (stopOnError)

332 break;

333

334 goto upConvertTo16Bit;

335 }

336 if (character > 0xff)

337 goto upConvertTo16Bit;

338

339 source += count;

340 *destination++ = static_cast<LChar>(character);

341 }

342 } while (flush && m_partialSequenceSize);

343

344 buffer.shrink(destination - buffer.characters());

345

346 return String::adopt(buffer);

347

348 upConvertTo16Bit:

349 StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

350

351 UChar* destination16 = buffer16.characters();

352

353 // Copy the already converted characters

354 for (LChar* converted8 = buffer.characters(); converted8 < destination;)

355 *destination16++ = *converted8++;

356

357 do {

358 if (m_partialSequenceSize) {

359 // Explicitly copy destination and source pointers to avoid taking pointers to the

360 // local variables, which may harm code generation by disabling some optimizations

361 // in some compilers.

362 UChar* destinationForHandlePartialSequence = destination16;

363 const uint8_t* sourceForHandlePartialSequence = source;

364 handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);

365 destination16 = destinationForHandlePartialSequence;

366 source = sourceForHandlePartialSequence;

367 if (m_partialSequenceSize)

368 break;

369 }

370

371 while (source < end) {

372 if (isASCII(*source)) {

373 // Fast path for ASCII. Most UTF-8 text will be ASCII.

374 if (isAlignedToMachineWord(source)) {

375 while (source < alignedEnd) {

376 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

377 if (!isAllASCII<LChar>(chunk))

378 break;

379 copyASCIIMachineWord(destination16, source);

380 source += sizeof(MachineWord);

381 destination16 += sizeof(MachineWord);

382 }

383 if (source == end)

384 break;

385 if (!isASCII(*source))

386 continue;

387 }

388 *destination16++ = *source++;

389 continue;

390 }

391 int count = nonASCIISequenceLength(*source);

392 int character;

393 if (count == 0) {

394 character = nonCharacter;

395 } else {

396 if (count > end - source) {

397 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

398 ASSERT(!m_partialSequenceSize);

399 m_partialSequenceSize = end - source;

400 memcpy(m_partialSequence, source, m_partialSequenceSize);

401 source = end;

402 break;

403 }

404 character = decodeNonASCIISequence(source, count);

405 }

406 if (character == nonCharacter) {

407 sawError = true;

408 if (stopOnError)

409 break;

410 // Each error generates a replacement character and consumes one byte.

411 *destination16++ = replacementCharacter;

412 ++source;

413 continue;

414 }

415 source += count;

416 destination16 = appendCharacter(destination16, character);

417 }

418 } while (flush && m_partialSequenceSize);

419

420 buffer16.shrink(destination16 - buffer16.characters());

421

422 return String::adopt(buffer16);

423 }

424

425 template<typename CharType>

426 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)

427 {

428 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.

429 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).

430 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).

431 if (length > std::numeric_limits<size_t>::max() / 3)

432 CRASH();

433 Vector<uint8_t> bytes(length * 3);

434

435 size_t i = 0;

436 size_t bytesWritten = 0;

437 while (i < length) {

438 UChar32 character;

439 U16_NEXT(characters, i, length, character);

440 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate

441 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.

442 if (0xD800 <= character && character <= 0xDFFF)

443 character = replacementCharacter;

444 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);

445 }

446

447 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);

448 }

449

450 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)

451 {

452 return encodeCommon(characters, length);

453 }

454

455 CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)

456 {

457 return encodeCommon(characters, length);

458 }

459

460 } // namespace WTF

OLD	NEW

« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecUTF8.h ('k') | third_party/WebKit/Source/wtf/text/TextCodecUTF8Test.cpp » ('j') | no next file with comments »