third_party/WebKit/Source/wtf/text/UTF8.cpp - Issue 1611343002: wtf reformat test

Side by Side Diff: third_party/WebKit/Source/wtf/text/UTF8.cpp

Issue 1611343002: wtf reformat test Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: pydent Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2007 Apple Inc. All rights reserved.	2 * Copyright (C) 2007 Apple Inc. All rights reserved.

3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>	3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>

4 *	4 *

5 * Redistribution and use in source and binary forms, with or without	5 * Redistribution and use in source and binary forms, with or without

6 * modification, are permitted provided that the following conditions	6 * modification, are permitted provided that the following conditions

7 * are met:	7 * are met:

8 * 1. Redistributions of source code must retain the above copyright	8 * 1. Redistributions of source code must retain the above copyright

9 * notice, this list of conditions and the following disclaimer.	9 * notice, this list of conditions and the following disclaimer.

10 * 2. Redistributions in binary form must reproduce the above copyright	10 * 2. Redistributions in binary form must reproduce the above copyright

(...skipping 15 matching lines...) Expand all Loading...
26	26

27 #include "wtf/text/UTF8.h"	27 #include "wtf/text/UTF8.h"

28	28

29 #include "wtf/ASCIICType.h"	29 #include "wtf/ASCIICType.h"

30 #include "wtf/StringHasher.h"	30 #include "wtf/StringHasher.h"

31 #include "wtf/text/CharacterNames.h"	31 #include "wtf/text/CharacterNames.h"

32	32

33 namespace WTF {	33 namespace WTF {

34 namespace Unicode {	34 namespace Unicode {

35	35

36 inline int inlineUTF8SequenceLengthNonASCII(char b0)	36 inline int inlineUTF8SequenceLengthNonASCII(char b0) {

37 {	37 if ((b0 & 0xC0) != 0xC0)

38 if ((b0 & 0xC0) != 0xC0)

39 return 0;

40 if ((b0 & 0xE0) == 0xC0)

41 return 2;

42 if ((b0 & 0xF0) == 0xE0)

43 return 3;

44 if ((b0 & 0xF8) == 0xF0)

45 return 4;

46 return 0;	38 return 0;

	39 if ((b0 & 0xE0) == 0xC0)

	40 return 2;

	41 if ((b0 & 0xF0) == 0xE0)

	42 return 3;

	43 if ((b0 & 0xF8) == 0xF0)

	44 return 4;

	45 return 0;

47 }	46 }

48	47

49 inline int inlineUTF8SequenceLength(char b0)	48 inline int inlineUTF8SequenceLength(char b0) {

50 {	49 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);

51 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);

52 }	50 }

53	51

54 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed	52 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed

55 // into the first byte, depending on how many bytes follow. There are	53 // into the first byte, depending on how many bytes follow. There are

56 // as many entries in this table as there are UTF-8 sequence types.	54 // as many entries in this table as there are UTF-8 sequence types.

57 // (I.e., one byte sequence, two byte... etc.). Remember that sequences	55 // (I.e., one byte sequence, two byte... etc.). Remember that sequences

58 // for legal UTF-8 will be 4 or fewer bytes total.	56 // for legal UTF-8 will be 4 or fewer bytes total.

59 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };	57 static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0,

60	58 0xF0, 0xF8, 0xFC};

61 ConversionResult convertLatin1ToUTF8(	59

62 const LChar** sourceStart, const LChar* sourceEnd,	60 ConversionResult convertLatin1ToUTF8(const LChar** sourceStart,

63 char** targetStart, char* targetEnd)	61 const LChar* sourceEnd,

64 {	62 char** targetStart,

65 ConversionResult result = conversionOK;	63 char* targetEnd) {

66 const LChar* source = *sourceStart;	64 ConversionResult result = conversionOK;

67 char* target = *targetStart;	65 const LChar* source = *sourceStart;

68 while (source < sourceEnd) {	66 char* target = *targetStart;

69 UChar32 ch;	67 while (source < sourceEnd) {

70 unsigned short bytesToWrite = 0;	68 UChar32 ch;

71 const UChar32 byteMask = 0xBF;	69 unsigned short bytesToWrite = 0;

72 const UChar32 byteMark = 0x80;	70 const UChar32 byteMask = 0xBF;

73 const LChar* oldSource = source; // In case we have to back up because of target overflow.	71 const UChar32 byteMark = 0x80;

74 ch = static_cast<unsigned short>(*source++);	72 const LChar* oldSource =

75	73 source; // In case we have to back up because of target overflow.

76 // Figure out how many bytes the result will require	74 ch = static_cast<unsigned short>(*source++);

77 if (ch < (UChar32)0x80)	75

78 bytesToWrite = 1;	76 // Figure out how many bytes the result will require

79 else	77 if (ch < (UChar32)0x80)

80 bytesToWrite = 2;	78 bytesToWrite = 1;

81	79 else

82 target += bytesToWrite;	80 bytesToWrite = 2;

83 if (target > targetEnd) {	81

84 source = oldSource; // Back up source pointer!	82 target += bytesToWrite;

85 target -= bytesToWrite;	83 if (target > targetEnd) {

86 result = targetExhausted;	84 source = oldSource; // Back up source pointer!

87 break;	85 target -= bytesToWrite;

	86 result = targetExhausted;

	87 break;

	88 }

	89 switch (bytesToWrite) { // note: everything falls through.

	90 case 2:

	91 *--target = (char)((ch \| byteMark) & byteMask);

	92 ch >>= 6;

	93 case 1:

	94 *--target = (char)(ch \| firstByteMark[bytesToWrite]);

	95 }

	96 target += bytesToWrite;

	97 }

	98 *sourceStart = source;

	99 *targetStart = target;

	100 return result;

	101 }

	102

	103 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart,

	104 const UChar* sourceEnd,

	105 char** targetStart,

	106 char* targetEnd,

	107 bool strict) {

	108 ConversionResult result = conversionOK;

	109 const UChar* source = *sourceStart;

	110 char* target = *targetStart;

	111 while (source < sourceEnd) {

	112 UChar32 ch;

	113 unsigned short bytesToWrite = 0;

	114 const UChar32 byteMask = 0xBF;

	115 const UChar32 byteMark = 0x80;

	116 const UChar* oldSource =

	117 source; // In case we have to back up because of target overflow.

	118 ch = static_cast<unsigned short>(*source++);

	119 // If we have a surrogate pair, convert to UChar32 first.

	120 if (ch >= 0xD800 && ch <= 0xDBFF) {

	121 // If the 16 bits following the high surrogate are in the source buffer...

	122 if (source < sourceEnd) {

	123 UChar32 ch2 = static_cast<unsigned short>(*source);

	124 // If it's a low surrogate, convert to UChar32.

	125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {

	126 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;

	127 ++source;

	128 } else if (strict) { // it's an unpaired high surrogate

	129 --source; // return to the illegal value itself

	130 result = sourceIllegal;

	131 break;

88 }	132 }

89 switch (bytesToWrite) { // note: everything falls through.	133 } else { // We don't have the 16 bits following the high surrogate.

90 case 2:	134 --source; // return to the high surrogate

91 *--target = (char)((ch \| byteMark) & byteMask);	135 result = sourceExhausted;

92 ch >>= 6;	136 break;

93 case 1:	137 }

94 *--target = (char)(ch \| firstByteMark[bytesToWrite]);	138 } else if (strict) {

95 }	139 // UTF-16 surrogate values are illegal in UTF-32

96 target += bytesToWrite;	140 if (ch >= 0xDC00 && ch <= 0xDFFF) {

97 }	141 --source; // return to the illegal value itself

98 *sourceStart = source;	142 result = sourceIllegal;

99 *targetStart = target;	143 break;

100 return result;	144 }

101 }	145 }

102	146 // Figure out how many bytes the result will require

103 ConversionResult convertUTF16ToUTF8(	147 if (ch < (UChar32)0x80) {

104 const UChar** sourceStart, const UChar* sourceEnd,	148 bytesToWrite = 1;

105 char** targetStart, char* targetEnd, bool strict)	149 } else if (ch < (UChar32)0x800) {

106 {	150 bytesToWrite = 2;

107 ConversionResult result = conversionOK;	151 } else if (ch < (UChar32)0x10000) {

108 const UChar* source = *sourceStart;	152 bytesToWrite = 3;

109 char* target = *targetStart;	153 } else if (ch < (UChar32)0x110000) {

110 while (source < sourceEnd) {	154 bytesToWrite = 4;

111 UChar32 ch;	155 } else {

112 unsigned short bytesToWrite = 0;	156 bytesToWrite = 3;

113 const UChar32 byteMask = 0xBF;	157 ch = replacementCharacter;

114 const UChar32 byteMark = 0x80;	158 }

115 const UChar* oldSource = source; // In case we have to back up because of target overflow.	159

116 ch = static_cast<unsigned short>(*source++);	160 target += bytesToWrite;

117 // If we have a surrogate pair, convert to UChar32 first.	161 if (target > targetEnd) {

118 if (ch >= 0xD800 && ch <= 0xDBFF) {	162 source = oldSource; // Back up source pointer!

119 // If the 16 bits following the high surrogate are in the source buffer...	163 target -= bytesToWrite;

120 if (source < sourceEnd) {	164 result = targetExhausted;

121 UChar32 ch2 = static_cast<unsigned short>(*source);	165 break;

122 // If it's a low surrogate, convert to UChar32.	166 }

123 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {	167 switch (bytesToWrite) { // note: everything falls through.

124 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;	168 case 4:

125 ++source;	169 *--target = (char)((ch \| byteMark) & byteMask);

126 } else if (strict) { // it's an unpaired high surrogate	170 ch >>= 6;

127 --source; // return to the illegal value itself	171 case 3:

128 result = sourceIllegal;	172 *--target = (char)((ch \| byteMark) & byteMask);

129 break;	173 ch >>= 6;

130 }	174 case 2:

131 } else { // We don't have the 16 bits following the high surrogate.	175 *--target = (char)((ch \| byteMark) & byteMask);

132 --source; // return to the high surrogate	176 ch >>= 6;

133 result = sourceExhausted;	177 case 1:

134 break;	178 *--target = (char)(ch \| firstByteMark[bytesToWrite]);

135 }	179 }

136 } else if (strict) {	180 target += bytesToWrite;

137 // UTF-16 surrogate values are illegal in UTF-32	181 }

138 if (ch >= 0xDC00 && ch <= 0xDFFF) {	182 *sourceStart = source;

139 --source; // return to the illegal value itself	183 *targetStart = target;

140 result = sourceIllegal;	184 return result;

141 break;

142 }

143 }

144 // Figure out how many bytes the result will require

145 if (ch < (UChar32)0x80) {

146 bytesToWrite = 1;

147 } else if (ch < (UChar32)0x800) {

148 bytesToWrite = 2;

149 } else if (ch < (UChar32)0x10000) {

150 bytesToWrite = 3;

151 } else if (ch < (UChar32)0x110000) {

152 bytesToWrite = 4;

153 } else {

154 bytesToWrite = 3;

155 ch = replacementCharacter;

156 }

157

158 target += bytesToWrite;

159 if (target > targetEnd) {

160 source = oldSource; // Back up source pointer!

161 target -= bytesToWrite;

162 result = targetExhausted;

163 break;

164 }

165 switch (bytesToWrite) { // note: everything falls through.

166 case 4:

167 *--target = (char)((ch \| byteMark) & byteMask);

168 ch >>= 6;

169 case 3:

170 *--target = (char)((ch \| byteMark) & byteMask);

171 ch >>= 6;

172 case 2:

173 *--target = (char)((ch \| byteMark) & byteMask);

174 ch >>= 6;

175 case 1:

176 *--target = (char)(ch \| firstByteMark[bytesToWrite]);

177 }

178 target += bytesToWrite;

179 }

180 *sourceStart = source;

181 *targetStart = target;

182 return result;

183 }	185 }

184	186

185 // This must be called with the length pre-determined by the first byte.	187 // This must be called with the length pre-determined by the first byte.

186 // If presented with a length > 4, this returns false. The Unicode	188 // If presented with a length > 4, this returns false. The Unicode

187 // definition of UTF-8 goes up to 4-byte sequences.	189 // definition of UTF-8 goes up to 4-byte sequences.

188 static bool isLegalUTF8(const unsigned char* source, int length)	190 static bool isLegalUTF8(const unsigned char* source, int length) {

189 {	191 unsigned char a;

190 unsigned char a;	192 const unsigned char* srcptr = source + length;

191 const unsigned char* srcptr = source + length;	193 switch (length) {

192 switch (length) {

193 default:	194 default:

194 return false;	195 return false;

195 // Everything else falls through when "true"...	196 // Everything else falls through when "true"...

196 case 4:	197 case 4:

197 if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF)	198 if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF)

	199 return false;

	200 case 3:

	201 if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF)

	202 return false;

	203 case 2:

	204 if ((a = (*--srcptr)) > 0xBF)

	205 return false;

	206

	207 // no fall-through in this inner switch

	208 switch (*source) {

	209 case 0xE0:

	210 if (a < 0xA0)

198 return false;	211 return false;

199 case 3:	212 break;

200 if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF)	213 case 0xED:

	214 if (a > 0x9F)

201 return false;	215 return false;

202 case 2:	216 break;

203 if ((a = (*--srcptr)) > 0xBF)	217 case 0xF0:

	218 if (a < 0x90)

204 return false;	219 return false;

205	220 break;

206 // no fall-through in this inner switch

207 switch (*source) {

208 case 0xE0:

209 if (a < 0xA0)

210 return false;

211 break;

212 case 0xED:

213 if (a > 0x9F)

214 return false;

215 break;

216 case 0xF0:

217 if (a < 0x90)

218 return false;

219 break;

220 case 0xF4:	221 case 0xF4:

221 if (a > 0x8F)	222 if (a > 0x8F)

222 return false;	223 return false;

223 break;	224 break;

224 default:	225 default:

225 if (a < 0x80)	226 if (a < 0x80)

226 return false;	227 return false;

227 }	228 }

228	229

229 case 1:	230 case 1:

230 if (*source >= 0x80 && *source < 0xC2)	231 if (*source >= 0x80 && *source < 0xC2)

231 return false;	232 return false;

232 }	233 }

233 if (*source > 0xF4)	234 if (*source > 0xF4)

234 return false;	235 return false;

235 return true;	236 return true;

236 }	237 }

237	238

238 // Magic values subtracted from a buffer value during UTF8 conversion.	239 // Magic values subtracted from a buffer value during UTF8 conversion.

239 // This table contains as many values as there might be trailing bytes	240 // This table contains as many values as there might be trailing bytes

240 // in a UTF-8 sequence.	241 // in a UTF-8 sequence.

241 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };	242 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL,

242	243 0x00003080UL,

243 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)	244 0x000E2080UL,

244 {	245 0x03C82080UL,

245 UChar32 character = 0;	246 static_cast<UChar32>(0xFA082080UL),

246	247 static_cast<UChar32>(0x82082080UL)};

247 // The cases all fall through.	248

248 switch (length) {	249 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) {

	250 UChar32 character = 0;

	251

	252 // The cases all fall through.

	253 switch (length) {

249 case 6:	254 case 6:

250 character += static_cast<unsigned char>(*sequence++);	255 character += static_cast<unsigned char>(*sequence++);

251 character <<= 6;	256 character <<= 6;

252 case 5:	257 case 5:

253 character += static_cast<unsigned char>(*sequence++);	258 character += static_cast<unsigned char>(*sequence++);

254 character <<= 6;	259 character <<= 6;

255 case 4:	260 case 4:

256 character += static_cast<unsigned char>(*sequence++);	261 character += static_cast<unsigned char>(*sequence++);

257 character <<= 6;	262 character <<= 6;

258 case 3:	263 case 3:

259 character += static_cast<unsigned char>(*sequence++);	264 character += static_cast<unsigned char>(*sequence++);

260 character <<= 6;	265 character <<= 6;

261 case 2:	266 case 2:

262 character += static_cast<unsigned char>(*sequence++);	267 character += static_cast<unsigned char>(*sequence++);

263 character <<= 6;	268 character <<= 6;

264 case 1:	269 case 1:

265 character += static_cast<unsigned char>(*sequence++);	270 character += static_cast<unsigned char>(*sequence++);

266 }	271 }

267	272

268 return character - offsetsFromUTF8[length - 1];	273 return character - offsetsFromUTF8[length - 1];

269 }	274 }

270	275

271 ConversionResult convertUTF8ToUTF16(	276 ConversionResult convertUTF8ToUTF16(const char** sourceStart,

272 const char** sourceStart, const char* sourceEnd,	277 const char* sourceEnd,

273 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)	278 UChar** targetStart,

274 {	279 UChar* targetEnd,

275 ConversionResult result = conversionOK;	280 bool* sourceAllASCII,

276 const char* source = *sourceStart;	281 bool strict) {

277 UChar* target = *targetStart;	282 ConversionResult result = conversionOK;

278 UChar orAllData = 0;	283 const char* source = *sourceStart;

279 while (source < sourceEnd) {	284 UChar* target = *targetStart;

280 int utf8SequenceLength = inlineUTF8SequenceLength(*source);	285 UChar orAllData = 0;

281 if (sourceEnd - source < utf8SequenceLength) {	286 while (source < sourceEnd) {

282 result = sourceExhausted;	287 int utf8SequenceLength = inlineUTF8SequenceLength(*source);

283 break;	288 if (sourceEnd - source < utf8SequenceLength) {

	289 result = sourceExhausted;

	290 break;

	291 }

	292 // Do this check whether lenient or strict

	293 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source),

	294 utf8SequenceLength)) {

	295 result = sourceIllegal;

	296 break;

	297 }

	298

	299 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);

	300

	301 if (target >= targetEnd) {

	302 source -= utf8SequenceLength; // Back up source pointer!

	303 result = targetExhausted;

	304 break;

	305 }

	306

	307 if (U_IS_BMP(character)) {

	308 // UTF-16 surrogate values are illegal in UTF-32

	309 if (U_IS_SURROGATE(character)) {

	310 if (strict) {

	311 source -= utf8SequenceLength; // return to the illegal value itself

	312 result = sourceIllegal;

	313 break;

284 }	314 }

285 // Do this check whether lenient or strict	315 *target++ = replacementCharacter;

286 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {	316 orAllData \|= replacementCharacter;

287 result = sourceIllegal;	317 } else {

288 break;	318 *target++ = static_cast<UChar>(character); // normal case

289 }	319 orAllData \|= character;

290	320 }

291 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);	321 } else if (U_IS_SUPPLEMENTARY(character)) {

292	322 // target is a character in range 0xFFFF - 0x10FFFF

293 if (target >= targetEnd) {	323 if (target + 1 >= targetEnd) {

294 source -= utf8SequenceLength; // Back up source pointer!	324 source -= utf8SequenceLength; // Back up source pointer!

295 result = targetExhausted;	325 result = targetExhausted;

296 break;	326 break;

297 }	327 }

298	328 *target++ = U16_LEAD(character);

299 if (U_IS_BMP(character)) {	329 *target++ = U16_TRAIL(character);

300 // UTF-16 surrogate values are illegal in UTF-32	330 orAllData = 0xffff;

301 if (U_IS_SURROGATE(character)) {	331 } else {

302 if (strict) {	332 if (strict) {

303 source -= utf8SequenceLength; // return to the illegal value itself	333 source -= utf8SequenceLength; // return to the start

304 result = sourceIllegal;	334 result = sourceIllegal;

305 break;	335 break; // Bail out; shouldn't continue

306 }	336 } else {

307 *target++ = replacementCharacter;	337 *target++ = replacementCharacter;

308 orAllData \|= replacementCharacter;	338 orAllData \|= replacementCharacter;

309 } else {	339 }

310 *target++ = static_cast<UChar>(character); // normal case	340 }

311 orAllData \|= character;	341 }

312 }	342 *sourceStart = source;

313 } else if (U_IS_SUPPLEMENTARY(character)) {	343 *targetStart = target;

314 // target is a character in range 0xFFFF - 0x10FFFF	344

315 if (target + 1 >= targetEnd) {	345 if (sourceAllASCII)

316 source -= utf8SequenceLength; // Back up source pointer!	346 *sourceAllASCII = !(orAllData & ~0x7f);

317 result = targetExhausted;	347

318 break;	348 return result;

319 }	349 }

320 *target++ = U16_LEAD(character);	350

321 *target++ = U16_TRAIL(character);	351 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(

322 orAllData = 0xffff;	352 const char* data,

323 } else {	353 const char* dataEnd,

324 if (strict) {	354 unsigned& dataLength,

325 source -= utf8SequenceLength; // return to the start	355 unsigned& utf16Length) {

326 result = sourceIllegal;	356 if (!data)

327 break; // Bail out; shouldn't continue	357 return 0;

328 } else {	358

329 *target++ = replacementCharacter;	359 StringHasher stringHasher;

330 orAllData \|= replacementCharacter;	360 dataLength = 0;

331 }	361 utf16Length = 0;

332 }	362

333 }	363 while (data < dataEnd \|\| (!dataEnd && *data)) {

334 *sourceStart = source;	364 if (isASCII(*data)) {

335 *targetStart = target;	365 stringHasher.addCharacter(*data++);

336	366 dataLength++;

337 if (sourceAllASCII)	367 utf16Length++;

338 *sourceAllASCII = !(orAllData & ~0x7f);	368 continue;

339	369 }

340 return result;	370

341 }	371 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);

342	372 dataLength += utf8SequenceLength;

343 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)	373

344 {	374 if (!dataEnd) {

345 if (!data)	375 for (int i = 1; i < utf8SequenceLength; ++i) {

	376 if (!data[i])

	377 return 0;

	378 }

	379 } else if (dataEnd - data < utf8SequenceLength) {

	380 return 0;

	381 }

	382

	383 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data),

	384 utf8SequenceLength))

	385 return 0;

	386

	387 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);

	388 ASSERT(!isASCII(character));

	389

	390 if (U_IS_BMP(character)) {

	391 // UTF-16 surrogate values are illegal in UTF-32

	392 if (U_IS_SURROGATE(character))

346 return 0;	393 return 0;

347	394 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case

348 StringHasher stringHasher;	395 utf16Length++;

349 dataLength = 0;	396 } else if (U_IS_SUPPLEMENTARY(character)) {

350 utf16Length = 0;	397 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),

351	398 static_cast<UChar>(U16_TRAIL(character)));

352 while (data < dataEnd \|\| (!dataEnd && *data)) {	399 utf16Length += 2;

353 if (isASCII(*data)) {	400 } else {

354 stringHasher.addCharacter(*data++);	401 return 0;

355 dataLength++;	402 }

356 utf16Length++;	403 }

357 continue;	404

358 }	405 return stringHasher.hashWithTop8BitsMasked();

359	406 }

360 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);	407

361 dataLength += utf8SequenceLength;	408 template <typename CharType>

362	409 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a,

363 if (!dataEnd) {	410 const CharType* aEnd,

364 for (int i = 1; i < utf8SequenceLength; ++i) {	411 const char* b,

365 if (!data[i])	412 const char* bEnd) {

366 return 0;	413 while (b < bEnd) {

367 }	414 if (isASCII(*b)) {

368 } else if (dataEnd - data < utf8SequenceLength) {	415 if (*a++ != *b++)

369 return 0;	416 return false;

370 }	417 continue;

371	418 }

372 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))	419

373 return 0;	420 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);

374	421

375 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);	422 if (bEnd - b < utf8SequenceLength)

376 ASSERT(!isASCII(character));	423 return false;

377	424

378 if (U_IS_BMP(character)) {	425 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b),

379 // UTF-16 surrogate values are illegal in UTF-32	426 utf8SequenceLength))

380 if (U_IS_SURROGATE(character))	427 return 0;

381 return 0;	428

382 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case	429 UChar32 character = readUTF8Sequence(b, utf8SequenceLength);

383 utf16Length++;	430 ASSERT(!isASCII(character));

384 } else if (U_IS_SUPPLEMENTARY(character)) {	431

385 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character)));	432 if (U_IS_BMP(character)) {

386 utf16Length += 2;	433 // UTF-16 surrogate values are illegal in UTF-32

387 } else {	434 if (U_IS_SURROGATE(character))

388 return 0;	435 return false;

389 }	436 if (*a++ != character)

390 }	437 return false;

391	438 } else if (U_IS_SUPPLEMENTARY(character)) {

392 return stringHasher.hashWithTop8BitsMasked();	439 if (*a++ != U16_LEAD(character))

393 }	440 return false;

394	441 if (*a++ != U16_TRAIL(character))

395 template<typename CharType>	442 return false;

396 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)	443 } else {

397 {	444 return false;

398 while (b < bEnd) {	445 }

399 if (isASCII(*b)) {	446 }

400 if (*a++ != *b++)	447

401 return false;	448 return a == aEnd;

402 continue;	449 }

403 }	450

404	451 bool equalUTF16WithUTF8(const UChar* a,

405 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);	452 const UChar* aEnd,

406	453 const char* b,

407 if (bEnd - b < utf8SequenceLength)	454 const char* bEnd) {

408 return false;	455 return equalWithUTF8Internal(a, aEnd, b, bEnd);

409	456 }

410 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))	457

411 return 0;	458 bool equalLatin1WithUTF8(const LChar* a,

412	459 const LChar* aEnd,

413 UChar32 character = readUTF8Sequence(b, utf8SequenceLength);	460 const char* b,

414 ASSERT(!isASCII(character));	461 const char* bEnd) {

415	462 return equalWithUTF8Internal(a, aEnd, b, bEnd);

416 if (U_IS_BMP(character)) {	463 }

417 // UTF-16 surrogate values are illegal in UTF-32	464

418 if (U_IS_SURROGATE(character))	465 } // namespace Unicode

419 return false;	466 } // namespace WTF

420 if (*a++ != character)

421 return false;

422 } else if (U_IS_SUPPLEMENTARY(character)) {

423 if (*a++ != U16_LEAD(character))

424 return false;

425 if (*a++ != U16_TRAIL(character))

426 return false;

427 } else {

428 return false;

429 }

430 }

431

432 return a == aEnd;

433 }

434

435 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)

436 {

437 return equalWithUTF8Internal(a, aEnd, b, bEnd);

438 }

439

440 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)

441 {

442 return equalWithUTF8Internal(a, aEnd, b, bEnd);

443 }

444

445 } // namespace Unicode

446 } // namespace WTF

OLD	NEW

« no previous file with comments | « third_party/WebKit/Source/wtf/text/UTF8.h ('k') | third_party/WebKit/Source/wtf/text/Unicode.h » ('j') | no next file with comments »