third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp - Issue 1436153002: Apply clang-format with Chromium-style without column limit.

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

Issue 1436153002: Apply clang-format with Chromium-style without column limit. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.	2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.

3 *	3 *

4 * Redistribution and use in source and binary forms, with or without	4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions	5 * modification, are permitted provided that the following conditions

6 * are met:	6 * are met:

7 * 1. Redistributions of source code must retain the above copyright	7 * 1. Redistributions of source code must retain the above copyright

8 * notice, this list of conditions and the following disclaimer.	8 * notice, this list of conditions and the following disclaimer.

9 * 2. Redistributions in binary form must reproduce the above copyright	9 * 2. Redistributions in binary form must reproduce the above copyright

10 * notice, this list of conditions and the following disclaimer in the	10 * notice, this list of conditions and the following disclaimer in the

(...skipping 19 matching lines...) Expand all Loading...
30 #include "wtf/text/CharacterNames.h"	30 #include "wtf/text/CharacterNames.h"

31 #include "wtf/text/StringBuffer.h"	31 #include "wtf/text/StringBuffer.h"

32 #include "wtf/text/TextCodecASCIIFastPath.h"	32 #include "wtf/text/TextCodecASCIIFastPath.h"

33	33

34 namespace WTF {	34 namespace WTF {

35	35

36 using namespace WTF::Unicode;	36 using namespace WTF::Unicode;

37	37

38 const int nonCharacter = -1;	38 const int nonCharacter = -1;

39	39

40 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)	40 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) {

41 {	41 return adoptPtr(new TextCodecUTF8);

42 return adoptPtr(new TextCodecUTF8);	42 }

43 }	43

44	44 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {

45 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)	45 registrar("UTF-8", "UTF-8");

46 {	46

47 registrar("UTF-8", "UTF-8");	47 // Additional aliases that originally were present in the encoding

48	48 // table in WebKit on Macintosh, and subsequently added by

49 // Additional aliases that originally were present in the encoding	49 // TextCodecICU. Perhaps we can prove some are not used on the web

50 // table in WebKit on Macintosh, and subsequently added by	50 // and remove them.

51 // TextCodecICU. Perhaps we can prove some are not used on the web	51 registrar("unicode11utf8", "UTF-8");

52 // and remove them.	52 registrar("unicode20utf8", "UTF-8");

53 registrar("unicode11utf8", "UTF-8");	53 registrar("utf8", "UTF-8");

54 registrar("unicode20utf8", "UTF-8");	54 registrar("x-unicode20utf8", "UTF-8");

55 registrar("utf8", "UTF-8");	55

56 registrar("x-unicode20utf8", "UTF-8");	56 // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)

57	57 // and Firefox (24), but not in ICU 4.6.

58 // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)	58 registrar("unicode-1-1-utf-8", "UTF-8");

59 // and Firefox (24), but not in ICU 4.6.	59 }

60 registrar("unicode-1-1-utf-8", "UTF-8");	60

61 }	61 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) {

62	62 registrar("UTF-8", create, 0);

63 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)	63 }

64 {	64

65 registrar("UTF-8", create, 0);	65 static inline int nonASCIISequenceLength(uint8_t firstByte) {

66 }	66 static const uint8_t lengths[256] = {

67	67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

68 static inline int nonASCIISequenceLength(uint8_t firstByte)	68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

69 {	69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

70 static const uint8_t lengths[256] = {	70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	75 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

76 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	76 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	77 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	78 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

81 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	81 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	82 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

83 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	83 return lengths[firstByte];

84 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	84 }

85 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	85

86 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0	86 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length) {

87 };	87 ASSERT(!isASCII(sequence[0]));

88 return lengths[firstByte];	88 if (length == 2) {

89 }	89 ASSERT(sequence[0] <= 0xDF);

90	90 if (sequence[0] < 0xC2)

91 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)	91 return nonCharacter;

92 {	92 if (sequence[1] < 0x80 \|\| sequence[1] > 0xBF)

93 ASSERT(!isASCII(sequence[0]));	93 return nonCharacter;

94 if (length == 2) {	94 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;

95 ASSERT(sequence[0] <= 0xDF);	95 }

96 if (sequence[0] < 0xC2)	96 if (length == 3) {

97 return nonCharacter;	97 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);

	98 switch (sequence[0]) {

	99 case 0xE0:

	100 if (sequence[1] < 0xA0 \|\| sequence[1] > 0xBF)

	101 return nonCharacter;

	102 break;

	103 case 0xED:

	104 if (sequence[1] < 0x80 \|\| sequence[1] > 0x9F)

	105 return nonCharacter;

	106 break;

	107 default:

98 if (sequence[1] < 0x80 \|\| sequence[1] > 0xBF)	108 if (sequence[1] < 0x80 \|\| sequence[1] > 0xBF)

99 return nonCharacter;	109 return nonCharacter;

100 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;	110 }

101 }	111 if (sequence[2] < 0x80 \|\| sequence[2] > 0xBF)

102 if (length == 3) {	112 return nonCharacter;

103 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);	113 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;

104 switch (sequence[0]) {	114 }

105 case 0xE0:	115 ASSERT(length == 4);

106 if (sequence[1] < 0xA0 \|\| sequence[1] > 0xBF)	116 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);

107 return nonCharacter;	117 switch (sequence[0]) {

	118 case 0xF0:

	119 if (sequence[1] < 0x90 \|\| sequence[1] > 0xBF)

	120 return nonCharacter;

	121 break;

	122 case 0xF4:

	123 if (sequence[1] < 0x80 \|\| sequence[1] > 0x8F)

	124 return nonCharacter;

	125 break;

	126 default:

	127 if (sequence[1] < 0x80 \|\| sequence[1] > 0xBF)

	128 return nonCharacter;

	129 }

	130 if (sequence[2] < 0x80 \|\| sequence[2] > 0xBF)

	131 return nonCharacter;

	132 if (sequence[3] < 0x80 \|\| sequence[3] > 0xBF)

	133 return nonCharacter;

	134 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;

	135 }

	136

	137 static inline UChar* appendCharacter(UChar* destination, int character) {

	138 ASSERT(character != nonCharacter);

	139 ASSERT(!U_IS_SURROGATE(character));

	140 if (U_IS_BMP(character)) {

	141 *destination++ = static_cast<UChar>(character);

	142 } else {

	143 *destination++ = U16_LEAD(character);

	144 *destination++ = U16_TRAIL(character);

	145 }

	146 return destination;

	147 }

	148

	149 void TextCodecUTF8::consumePartialSequenceByte() {

	150 --m_partialSequenceSize;

	151 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);

	152 }

	153

	154 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError) {

	155 sawError = true;

	156 if (stopOnError)

	157 return;

	158 // Each error generates a replacement character and consumes one byte.

	159 *destination++ = replacementCharacter;

	160 consumePartialSequenceByte();

	161 }

	162

	163 template <>

	164 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&) {

	165 ASSERT(m_partialSequenceSize);

	166 do {

	167 if (isASCII(m_partialSequence[0])) {

	168 *destination++ = m_partialSequence[0];

	169 consumePartialSequenceByte();

	170 continue;

	171 }

	172 int count = nonASCIISequenceLength(m_partialSequence[0]);

	173 if (!count)

	174 return true;

	175

	176 if (count > m_partialSequenceSize) {

	177 if (count - m_partialSequenceSize > end - source) {

	178 if (!flush) {

	179 // The new data is not enough to complete the sequence, so

	180 // add it to the existing partial sequence.

	181 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);

	182 m_partialSequenceSize += end - source;

	183 return false;

	184 }

	185 // An incomplete partial sequence at the end is an error, but it will create

	186 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle

	187 // the error.

	188 return true;

	189 }

	190 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);

	191 source += count - m_partialSequenceSize;

	192 m_partialSequenceSize = count;

	193 }

	194 int character = decodeNonASCIISequence(m_partialSequence, count);

	195 if (character & ~0xff)

	196 return true;

	197

	198 m_partialSequenceSize -= count;

	199 *destination++ = static_cast<LChar>(character);

	200 } while (m_partialSequenceSize);

	201

	202 return false;

	203 }

	204

	205 template <>

	206 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError) {

	207 ASSERT(m_partialSequenceSize);

	208 do {

	209 if (isASCII(m_partialSequence[0])) {

	210 *destination++ = m_partialSequence[0];

	211 consumePartialSequenceByte();

	212 continue;

	213 }

	214 int count = nonASCIISequenceLength(m_partialSequence[0]);

	215 if (!count) {

	216 handleError(destination, stopOnError, sawError);

	217 if (stopOnError)

	218 return false;

	219 continue;

	220 }

	221 if (count > m_partialSequenceSize) {

	222 if (count - m_partialSequenceSize > end - source) {

	223 if (!flush) {

	224 // The new data is not enough to complete the sequence, so

	225 // add it to the existing partial sequence.

	226 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);

	227 m_partialSequenceSize += end - source;

	228 return false;

	229 }

	230 // An incomplete partial sequence at the end is an error.

	231 handleError(destination, stopOnError, sawError);

	232 if (stopOnError)

	233 return false;

	234 continue;

	235 }

	236 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);

	237 source += count - m_partialSequenceSize;

	238 m_partialSequenceSize = count;

	239 }

	240 int character = decodeNonASCIISequence(m_partialSequence, count);

	241 if (character == nonCharacter) {

	242 handleError(destination, stopOnError, sawError);

	243 if (stopOnError)

	244 return false;

	245 continue;

	246 }

	247

	248 m_partialSequenceSize -= count;

	249 destination = appendCharacter(destination, character);

	250 } while (m_partialSequenceSize);

	251

	252 return false;

	253 }

	254

	255 String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError) {

	256 // Each input byte might turn into a character.

	257 // That includes all bytes in the partial-sequence buffer because

	258 // each byte in an invalid sequence will turn into a replacement character.

	259 StringBuffer<LChar> buffer(m_partialSequenceSize + length);

	260

	261 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);

	262 const uint8_t* end = source + length;

	263 const uint8_t* alignedEnd = alignToMachineWord(end);

	264 LChar* destination = buffer.characters();

	265

	266 do {

	267 if (m_partialSequenceSize) {

	268 // Explicitly copy destination and source pointers to avoid taking pointers to the

	269 // local variables, which may harm code generation by disabling some optimizations

	270 // in some compilers.

	271 LChar* destinationForHandlePartialSequence = destination;

	272 const uint8_t* sourceForHandlePartialSequence = source;

	273 if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {

	274 source = sourceForHandlePartialSequence;

	275 goto upConvertTo16Bit;

	276 }

	277 destination = destinationForHandlePartialSequence;

	278 source = sourceForHandlePartialSequence;

	279 if (m_partialSequenceSize)

	280 break;

	281 }

	282

	283 while (source < end) {

	284 if (isASCII(*source)) {

	285 // Fast path for ASCII. Most UTF-8 text will be ASCII.

	286 if (isAlignedToMachineWord(source)) {

	287 while (source < alignedEnd) {

	288 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

	289 if (!isAllASCII<LChar>(chunk))

	290 break;

	291 copyASCIIMachineWord(destination, source);

	292 source += sizeof(MachineWord);

	293 destination += sizeof(MachineWord);

	294 }

	295 if (source == end)

108 break;	296 break;

109 case 0xED:	297 if (!isASCII(*source))

110 if (sequence[1] < 0x80 \|\| sequence[1] > 0x9F)	298 continue;

111 return nonCharacter;	299 }

	300 *destination++ = *source++;

	301 continue;

	302 }

	303 int count = nonASCIISequenceLength(*source);

	304 int character;

	305 if (count == 0) {

	306 character = nonCharacter;

	307 } else {

	308 if (count > end - source) {

	309 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

	310 ASSERT(!m_partialSequenceSize);

	311 m_partialSequenceSize = end - source;

	312 memcpy(m_partialSequence, source, m_partialSequenceSize);

	313 source = end;

	314 break;

	315 }

	316 character = decodeNonASCIISequence(source, count);

	317 }

	318 if (character == nonCharacter) {

	319 sawError = true;

	320 if (stopOnError)

	321 break;

	322

	323 goto upConvertTo16Bit;

	324 }

	325 if (character > 0xff)

	326 goto upConvertTo16Bit;

	327

	328 source += count;

	329 *destination++ = static_cast<LChar>(character);

	330 }

	331 } while (flush && m_partialSequenceSize);

	332

	333 buffer.shrink(destination - buffer.characters());

	334

	335 return String::adopt(buffer);

	336

	337 upConvertTo16Bit:

	338 StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

	339

	340 UChar* destination16 = buffer16.characters();

	341

	342 // Copy the already converted characters

	343 for (LChar* converted8 = buffer.characters(); converted8 < destination;)

	344 *destination16++ = *converted8++;

	345

	346 do {

	347 if (m_partialSequenceSize) {

	348 // Explicitly copy destination and source pointers to avoid taking pointers to the

	349 // local variables, which may harm code generation by disabling some optimizations

	350 // in some compilers.

	351 UChar* destinationForHandlePartialSequence = destination16;

	352 const uint8_t* sourceForHandlePartialSequence = source;

	353 handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);

	354 destination16 = destinationForHandlePartialSequence;

	355 source = sourceForHandlePartialSequence;

	356 if (m_partialSequenceSize)

	357 break;

	358 }

	359

	360 while (source < end) {

	361 if (isASCII(*source)) {

	362 // Fast path for ASCII. Most UTF-8 text will be ASCII.

	363 if (isAlignedToMachineWord(source)) {

	364 while (source < alignedEnd) {

	365 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

	366 if (!isAllASCII<LChar>(chunk))

	367 break;

	368 copyASCIIMachineWord(destination16, source);

	369 source += sizeof(MachineWord);

	370 destination16 += sizeof(MachineWord);

	371 }

	372 if (source == end)

112 break;	373 break;

113 default:	374 if (!isASCII(*source))

114 if (sequence[1] < 0x80 \|\| sequence[1] > 0xBF)

115 return nonCharacter;

116 }

117 if (sequence[2] < 0x80 \|\| sequence[2] > 0xBF)

118 return nonCharacter;

119 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;

120 }

121 ASSERT(length == 4);

122 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);

123 switch (sequence[0]) {

124 case 0xF0:

125 if (sequence[1] < 0x90 \|\| sequence[1] > 0xBF)

126 return nonCharacter;

127 break;

128 case 0xF4:

129 if (sequence[1] < 0x80 \|\| sequence[1] > 0x8F)

130 return nonCharacter;

131 break;

132 default:

133 if (sequence[1] < 0x80 \|\| sequence[1] > 0xBF)

134 return nonCharacter;

135 }

136 if (sequence[2] < 0x80 \|\| sequence[2] > 0xBF)

137 return nonCharacter;

138 if (sequence[3] < 0x80 \|\| sequence[3] > 0xBF)

139 return nonCharacter;

140 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;

141 }

142

143 static inline UChar* appendCharacter(UChar* destination, int character)

144 {

145 ASSERT(character != nonCharacter);

146 ASSERT(!U_IS_SURROGATE(character));

147 if (U_IS_BMP(character)) {

148 *destination++ = static_cast<UChar>(character);

149 } else {

150 *destination++ = U16_LEAD(character);

151 *destination++ = U16_TRAIL(character);

152 }

153 return destination;

154 }

155

156 void TextCodecUTF8::consumePartialSequenceByte()

157 {

158 --m_partialSequenceSize;

159 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);

160 }

161

162 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)

163 {

164 sawError = true;

165 if (stopOnError)

166 return;

167 // Each error generates a replacement character and consumes one byte.

168 *destination++ = replacementCharacter;

169 consumePartialSequenceByte();

170 }

171

172 template <>

173 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)

174 {

175 ASSERT(m_partialSequenceSize);

176 do {

177 if (isASCII(m_partialSequence[0])) {

178 *destination++ = m_partialSequence[0];

179 consumePartialSequenceByte();

180 continue;	375 continue;

181 }	376 }

182 int count = nonASCIISequenceLength(m_partialSequence[0]);	377 *destination16++ = *source++;

183 if (!count)	378 continue;

184 return true;	379 }

185	380 int count = nonASCIISequenceLength(*source);

186 if (count > m_partialSequenceSize) {	381 int character;

187 if (count - m_partialSequenceSize > end - source) {	382 if (count == 0) {

188 if (!flush) {	383 character = nonCharacter;

189 // The new data is not enough to complete the sequence, so	384 } else {

190 // add it to the existing partial sequence.	385 if (count > end - source) {

191 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);	386 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

192 m_partialSequenceSize += end - source;	387 ASSERT(!m_partialSequenceSize);

193 return false;	388 m_partialSequenceSize = end - source;

194 }	389 memcpy(m_partialSequence, source, m_partialSequenceSize);

195 // An incomplete partial sequence at the end is an error, but it will create	390 source = end;

196 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle	391 break;

197 // the error.	392 }

198 return true;	393 character = decodeNonASCIISequence(source, count);

199 }	394 }

200 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);	395 if (character == nonCharacter) {

201 source += count - m_partialSequenceSize;	396 sawError = true;

202 m_partialSequenceSize = count;	397 if (stopOnError)

203 }	398 break;

204 int character = decodeNonASCIISequence(m_partialSequence, count);	399 // Each error generates a replacement character and consumes one byte.

205 if (character & ~0xff)	400 *destination16++ = replacementCharacter;

206 return true;	401 ++source;

207	402 continue;

208 m_partialSequenceSize -= count;	403 }

209 *destination++ = static_cast<LChar>(character);	404 source += count;

210 } while (m_partialSequenceSize);	405 destination16 = appendCharacter(destination16, character);

211	406 }

212 return false;	407 } while (flush && m_partialSequenceSize);

213 }	408

214	409 buffer16.shrink(destination16 - buffer16.characters());

215 template <>	410

216 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)	411 return String::adopt(buffer16);

217 {	412 }

218 ASSERT(m_partialSequenceSize);	413

219 do {	414 template <typename CharType>

220 if (isASCII(m_partialSequence[0])) {	415 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) {

221 *destination++ = m_partialSequence[0];	416 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.

222 consumePartialSequenceByte();	417 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).

223 continue;	418 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).

224 }	419 if (length > std::numeric_limits<size_t>::max() / 3)

225 int count = nonASCIISequenceLength(m_partialSequence[0]);	420 CRASH();

226 if (!count) {	421 Vector<uint8_t> bytes(length * 3);

227 handleError(destination, stopOnError, sawError);	422

228 if (stopOnError)	423 size_t i = 0;

229 return false;	424 size_t bytesWritten = 0;

230 continue;	425 while (i < length) {

231 }	426 UChar32 character;

232 if (count > m_partialSequenceSize) {	427 U16_NEXT(characters, i, length, character);

233 if (count - m_partialSequenceSize > end - source) {	428 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate

234 if (!flush) {	429 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.

235 // The new data is not enough to complete the sequence, so	430 if (0xD800 <= character && character <= 0xDFFF)

236 // add it to the existing partial sequence.	431 character = replacementCharacter;

237 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);	432 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);

238 m_partialSequenceSize += end - source;	433 }

239 return false;	434

240 }	435 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);

241 // An incomplete partial sequence at the end is an error.	436 }

242 handleError(destination, stopOnError, sawError);	437

243 if (stopOnError)	438 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling) {

244 return false;	439 return encodeCommon(characters, length);

245 continue;	440 }

246 }	441

247 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);	442 CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling) {

248 source += count - m_partialSequenceSize;	443 return encodeCommon(characters, length);

249 m_partialSequenceSize = count;	444 }

250 }	445

251 int character = decodeNonASCIISequence(m_partialSequence, count);	446 } // namespace WTF

252 if (character == nonCharacter) {

253 handleError(destination, stopOnError, sawError);

254 if (stopOnError)

255 return false;

256 continue;

257 }

258

259 m_partialSequenceSize -= count;

260 destination = appendCharacter(destination, character);

261 } while (m_partialSequenceSize);

262

263 return false;

264 }

265

266 String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError)

267 {

268 // Each input byte might turn into a character.

269 // That includes all bytes in the partial-sequence buffer because

270 // each byte in an invalid sequence will turn into a replacement character.

271 StringBuffer<LChar> buffer(m_partialSequenceSize + length);

272

273 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);

274 const uint8_t* end = source + length;

275 const uint8_t* alignedEnd = alignToMachineWord(end);

276 LChar* destination = buffer.characters();

277

278 do {

279 if (m_partialSequenceSize) {

280 // Explicitly copy destination and source pointers to avoid taking pointers to the

281 // local variables, which may harm code generation by disabling some optimizations

282 // in some compilers.

283 LChar* destinationForHandlePartialSequence = destination;

284 const uint8_t* sourceForHandlePartialSequence = source;

285 if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {

286 source = sourceForHandlePartialSequence;

287 goto upConvertTo16Bit;

288 }

289 destination = destinationForHandlePartialSequence;

290 source = sourceForHandlePartialSequence;

291 if (m_partialSequenceSize)

292 break;

293 }

294

295 while (source < end) {

296 if (isASCII(*source)) {

297 // Fast path for ASCII. Most UTF-8 text will be ASCII.

298 if (isAlignedToMachineWord(source)) {

299 while (source < alignedEnd) {

300 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

301 if (!isAllASCII<LChar>(chunk))

302 break;

303 copyASCIIMachineWord(destination, source);

304 source += sizeof(MachineWord);

305 destination += sizeof(MachineWord);

306 }

307 if (source == end)

308 break;

309 if (!isASCII(*source))

310 continue;

311 }

312 *destination++ = *source++;

313 continue;

314 }

315 int count = nonASCIISequenceLength(*source);

316 int character;

317 if (count == 0) {

318 character = nonCharacter;

319 } else {

320 if (count > end - source) {

321 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

322 ASSERT(!m_partialSequenceSize);

323 m_partialSequenceSize = end - source;

324 memcpy(m_partialSequence, source, m_partialSequenceSize);

325 source = end;

326 break;

327 }

328 character = decodeNonASCIISequence(source, count);

329 }

330 if (character == nonCharacter) {

331 sawError = true;

332 if (stopOnError)

333 break;

334

335 goto upConvertTo16Bit;

336 }

337 if (character > 0xff)

338 goto upConvertTo16Bit;

339

340 source += count;

341 *destination++ = static_cast<LChar>(character);

342 }

343 } while (flush && m_partialSequenceSize);

344

345 buffer.shrink(destination - buffer.characters());

346

347 return String::adopt(buffer);

348

349 upConvertTo16Bit:

350 StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

351

352 UChar* destination16 = buffer16.characters();

353

354 // Copy the already converted characters

355 for (LChar* converted8 = buffer.characters(); converted8 < destination;)

356 *destination16++ = *converted8++;

357

358 do {

359 if (m_partialSequenceSize) {

360 // Explicitly copy destination and source pointers to avoid taking pointers to the

361 // local variables, which may harm code generation by disabling some optimizations

362 // in some compilers.

363 UChar* destinationForHandlePartialSequence = destination16;

364 const uint8_t* sourceForHandlePartialSequence = source;

365 handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);

366 destination16 = destinationForHandlePartialSequence;

367 source = sourceForHandlePartialSequence;

368 if (m_partialSequenceSize)

369 break;

370 }

371

372 while (source < end) {

373 if (isASCII(*source)) {

374 // Fast path for ASCII. Most UTF-8 text will be ASCII.

375 if (isAlignedToMachineWord(source)) {

376 while (source < alignedEnd) {

377 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

378 if (!isAllASCII<LChar>(chunk))

379 break;

380 copyASCIIMachineWord(destination16, source);

381 source += sizeof(MachineWord);

382 destination16 += sizeof(MachineWord);

383 }

384 if (source == end)

385 break;

386 if (!isASCII(*source))

387 continue;

388 }

389 *destination16++ = *source++;

390 continue;

391 }

392 int count = nonASCIISequenceLength(*source);

393 int character;

394 if (count == 0) {

395 character = nonCharacter;

396 } else {

397 if (count > end - source) {

398 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

399 ASSERT(!m_partialSequenceSize);

400 m_partialSequenceSize = end - source;

401 memcpy(m_partialSequence, source, m_partialSequenceSize);

402 source = end;

403 break;

404 }

405 character = decodeNonASCIISequence(source, count);

406 }

407 if (character == nonCharacter) {

408 sawError = true;

409 if (stopOnError)

410 break;

411 // Each error generates a replacement character and consumes one byte.

412 *destination16++ = replacementCharacter;

413 ++source;

414 continue;

415 }

416 source += count;

417 destination16 = appendCharacter(destination16, character);

418 }

419 } while (flush && m_partialSequenceSize);

420

421 buffer16.shrink(destination16 - buffer16.characters());

422

423 return String::adopt(buffer16);

424 }

425

426 template<typename CharType>

427 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)

428 {

429 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.

430 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).

431 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).

432 if (length > std::numeric_limits<size_t>::max() / 3)

433 CRASH();

434 Vector<uint8_t> bytes(length * 3);

435

436 size_t i = 0;

437 size_t bytesWritten = 0;

438 while (i < length) {

439 UChar32 character;

440 U16_NEXT(characters, i, length, character);

441 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate

442 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.

443 if (0xD800 <= character && character <= 0xDFFF)

444 character = replacementCharacter;

445 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);

446 }

447

448 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);

449 }

450

451 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)

452 {

453 return encodeCommon(characters, length);

454 }

455

456 CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)

457 {

458 return encodeCommon(characters, length);

459 }

460

461 } // namespace WTF

OLD	NEW

« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecUTF8.h ('k') | third_party/WebKit/Source/wtf/text/TextCodecUTF8Test.cpp » ('j') | no next file with comments »