third_party/WebKit/Source/wtf/text/UTF8.cpp - Issue 1436153002: Apply clang-format with Chromium-style without column limit.

Side by Side Diff: third_party/WebKit/Source/wtf/text/UTF8.cpp

Issue 1436153002: Apply clang-format with Chromium-style without column limit. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2007 Apple Inc. All rights reserved.	2 * Copyright (C) 2007 Apple Inc. All rights reserved.

3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>	3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>

4 *	4 *

5 * Redistribution and use in source and binary forms, with or without	5 * Redistribution and use in source and binary forms, with or without

6 * modification, are permitted provided that the following conditions	6 * modification, are permitted provided that the following conditions

7 * are met:	7 * are met:

8 * 1. Redistributions of source code must retain the above copyright	8 * 1. Redistributions of source code must retain the above copyright

9 * notice, this list of conditions and the following disclaimer.	9 * notice, this list of conditions and the following disclaimer.

10 * 2. Redistributions in binary form must reproduce the above copyright	10 * 2. Redistributions in binary form must reproduce the above copyright

(...skipping 16 matching lines...) Expand all Loading...
27 #include "config.h"	27 #include "config.h"

28 #include "wtf/text/UTF8.h"	28 #include "wtf/text/UTF8.h"

29	29

30 #include "wtf/ASCIICType.h"	30 #include "wtf/ASCIICType.h"

31 #include "wtf/StringHasher.h"	31 #include "wtf/StringHasher.h"

32 #include "wtf/text/CharacterNames.h"	32 #include "wtf/text/CharacterNames.h"

33	33

34 namespace WTF {	34 namespace WTF {

35 namespace Unicode {	35 namespace Unicode {

36	36

37 inline int inlineUTF8SequenceLengthNonASCII(char b0)	37 inline int inlineUTF8SequenceLengthNonASCII(char b0) {

38 {	38 if ((b0 & 0xC0) != 0xC0)

39 if ((b0 & 0xC0) != 0xC0)

40 return 0;

41 if ((b0 & 0xE0) == 0xC0)

42 return 2;

43 if ((b0 & 0xF0) == 0xE0)

44 return 3;

45 if ((b0 & 0xF8) == 0xF0)

46 return 4;

47 return 0;	39 return 0;

	40 if ((b0 & 0xE0) == 0xC0)

	41 return 2;

	42 if ((b0 & 0xF0) == 0xE0)

	43 return 3;

	44 if ((b0 & 0xF8) == 0xF0)

	45 return 4;

	46 return 0;

48 }	47 }

49	48

50 inline int inlineUTF8SequenceLength(char b0)	49 inline int inlineUTF8SequenceLength(char b0) {

51 {	50 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);

52 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);

53 }	51 }

54	52

55 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed	53 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed

56 // into the first byte, depending on how many bytes follow. There are	54 // into the first byte, depending on how many bytes follow. There are

57 // as many entries in this table as there are UTF-8 sequence types.	55 // as many entries in this table as there are UTF-8 sequence types.

58 // (I.e., one byte sequence, two byte... etc.). Remember that sequences	56 // (I.e., one byte sequence, two byte... etc.). Remember that sequences

59 // for legal UTF-8 will be 4 or fewer bytes total.	57 // for legal UTF-8 will be 4 or fewer bytes total.

60 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };	58 static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};

61	59

62 ConversionResult convertLatin1ToUTF8(	60 ConversionResult convertLatin1ToUTF8(

63 const LChar** sourceStart, const LChar* sourceEnd,	61 const LChar** sourceStart,

64 char** targetStart, char* targetEnd)	62 const LChar* sourceEnd,

65 {	63 char** targetStart,

66 ConversionResult result = conversionOK;	64 char* targetEnd) {

67 const LChar* source = *sourceStart;	65 ConversionResult result = conversionOK;

68 char* target = *targetStart;	66 const LChar* source = *sourceStart;

69 while (source < sourceEnd) {	67 char* target = *targetStart;

70 UChar32 ch;	68 while (source < sourceEnd) {

71 unsigned short bytesToWrite = 0;	69 UChar32 ch;

72 const UChar32 byteMask = 0xBF;	70 unsigned short bytesToWrite = 0;

73 const UChar32 byteMark = 0x80;	71 const UChar32 byteMask = 0xBF;

74 const LChar* oldSource = source; // In case we have to back up because of target overflow.	72 const UChar32 byteMark = 0x80;

75 ch = static_cast<unsigned short>(*source++);	73 const LChar* oldSource = source; // In case we have to back up because of target overflow.

76	74 ch = static_cast<unsigned short>(*source++);

77 // Figure out how many bytes the result will require	75

78 if (ch < (UChar32)0x80)	76 // Figure out how many bytes the result will require

79 bytesToWrite = 1;	77 if (ch < (UChar32)0x80)

80 else	78 bytesToWrite = 1;

81 bytesToWrite = 2;	79 else

82	80 bytesToWrite = 2;

83 target += bytesToWrite;	81

84 if (target > targetEnd) {	82 target += bytesToWrite;

85 source = oldSource; // Back up source pointer!	83 if (target > targetEnd) {

86 target -= bytesToWrite;	84 source = oldSource; // Back up source pointer!

87 result = targetExhausted;	85 target -= bytesToWrite;

88 break;	86 result = targetExhausted;

	87 break;

	88 }

	89 switch (bytesToWrite) { // note: everything falls through.

	90 case 2:

	91 *--target = (char)((ch | byteMark) & byteMask);

	92 ch >>= 6;

	93 case 1:

	94 *--target = (char)(ch | firstByteMark[bytesToWrite]);

	95 }

	96 target += bytesToWrite;

	97 }

	98 *sourceStart = source;

	99 *targetStart = target;

	100 return result;

	101 }

	102

	103 ConversionResult convertUTF16ToUTF8(

	104 const UChar** sourceStart,

	105 const UChar* sourceEnd,

	106 char** targetStart,

	107 char* targetEnd,

	108 bool strict) {

	109 ConversionResult result = conversionOK;

	110 const UChar* source = *sourceStart;

	111 char* target = *targetStart;

	112 while (source < sourceEnd) {

	113 UChar32 ch;

	114 unsigned short bytesToWrite = 0;

	115 const UChar32 byteMask = 0xBF;

	116 const UChar32 byteMark = 0x80;

	117 const UChar* oldSource = source; // In case we have to back up because of target overflow.

	118 ch = static_cast<unsigned short>(*source++);

	119 // If we have a surrogate pair, convert to UChar32 first.

	120 if (ch >= 0xD800 && ch <= 0xDBFF) {

	121 // If the 16 bits following the high surrogate are in the source buffer...

	122 if (source < sourceEnd) {

	123 UChar32 ch2 = static_cast<unsigned short>(*source);

	124 // If it's a low surrogate, convert to UChar32.

	125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {

	126 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;

	127 ++source;

	128 } else if (strict) { // it's an unpaired high surrogate

	129 --source; // return to the illegal value itself

	130 result = sourceIllegal;

	131 break;

89 }	132 }

90 switch (bytesToWrite) { // note: everything falls through.	133 } else { // We don't have the 16 bits following the high surrogate.

91 case 2:	134 --source; // return to the high surrogate

92 *--target = (char)((ch | byteMark) & byteMask);	135 result = sourceExhausted;

93 ch >>= 6;	136 break;

94 case 1:	137 }

95 *--target = (char)(ch | firstByteMark[bytesToWrite]);	138 } else if (strict) {

96 }	139 // UTF-16 surrogate values are illegal in UTF-32

97 target += bytesToWrite;	140 if (ch >= 0xDC00 && ch <= 0xDFFF) {

98 }	141 --source; // return to the illegal value itself

99 *sourceStart = source;	142 result = sourceIllegal;

100 *targetStart = target;	143 break;

101 return result;	144 }

102 }	145 }

103	146 // Figure out how many bytes the result will require

104 ConversionResult convertUTF16ToUTF8(	147 if (ch < (UChar32)0x80) {

105 const UChar** sourceStart, const UChar* sourceEnd,	148 bytesToWrite = 1;

106 char** targetStart, char* targetEnd, bool strict)	149 } else if (ch < (UChar32)0x800) {

107 {	150 bytesToWrite = 2;

108 ConversionResult result = conversionOK;	151 } else if (ch < (UChar32)0x10000) {

109 const UChar* source = *sourceStart;	152 bytesToWrite = 3;

110 char* target = *targetStart;	153 } else if (ch < (UChar32)0x110000) {

111 while (source < sourceEnd) {	154 bytesToWrite = 4;

112 UChar32 ch;	155 } else {

113 unsigned short bytesToWrite = 0;	156 bytesToWrite = 3;

114 const UChar32 byteMask = 0xBF;	157 ch = replacementCharacter;

115 const UChar32 byteMark = 0x80;	158 }

116 const UChar* oldSource = source; // In case we have to back up because of target overflow.	159

117 ch = static_cast<unsigned short>(*source++);	160 target += bytesToWrite;

118 // If we have a surrogate pair, convert to UChar32 first.	161 if (target > targetEnd) {

119 if (ch >= 0xD800 && ch <= 0xDBFF) {	162 source = oldSource; // Back up source pointer!

120 // If the 16 bits following the high surrogate are in the source buffer...	163 target -= bytesToWrite;

121 if (source < sourceEnd) {	164 result = targetExhausted;

122 UChar32 ch2 = static_cast<unsigned short>(*source);	165 break;

123 // If it's a low surrogate, convert to UChar32.	166 }

124 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {	167 switch (bytesToWrite) { // note: everything falls through.

125 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;	168 case 4:

126 ++source;	169 *--target = (char)((ch | byteMark) & byteMask);

127 } else if (strict) { // it's an unpaired high surrogate	170 ch >>= 6;

128 --source; // return to the illegal value itself	171 case 3:

129 result = sourceIllegal;	172 *--target = (char)((ch | byteMark) & byteMask);

130 break;	173 ch >>= 6;

131 }	174 case 2:

132 } else { // We don't have the 16 bits following the high surrogate.	175 *--target = (char)((ch | byteMark) & byteMask);

133 --source; // return to the high surrogate	176 ch >>= 6;

134 result = sourceExhausted;	177 case 1:

135 break;	178 *--target = (char)(ch | firstByteMark[bytesToWrite]);

136 }	179 }

137 } else if (strict) {	180 target += bytesToWrite;

138 // UTF-16 surrogate values are illegal in UTF-32	181 }

139 if (ch >= 0xDC00 && ch <= 0xDFFF) {	182 *sourceStart = source;

140 --source; // return to the illegal value itself	183 *targetStart = target;

141 result = sourceIllegal;	184 return result;

142 break;

143 }

144 }

145 // Figure out how many bytes the result will require

146 if (ch < (UChar32)0x80) {

147 bytesToWrite = 1;

148 } else if (ch < (UChar32)0x800) {

149 bytesToWrite = 2;

150 } else if (ch < (UChar32)0x10000) {

151 bytesToWrite = 3;

152 } else if (ch < (UChar32)0x110000) {

153 bytesToWrite = 4;

154 } else {

155 bytesToWrite = 3;

156 ch = replacementCharacter;

157 }

158

159 target += bytesToWrite;

160 if (target > targetEnd) {

161 source = oldSource; // Back up source pointer!

162 target -= bytesToWrite;

163 result = targetExhausted;

164 break;

165 }

166 switch (bytesToWrite) { // note: everything falls through.

167 case 4:

168 *--target = (char)((ch | byteMark) & byteMask);

169 ch >>= 6;

170 case 3:

171 *--target = (char)((ch | byteMark) & byteMask);

172 ch >>= 6;

173 case 2:

174 *--target = (char)((ch | byteMark) & byteMask);

175 ch >>= 6;

176 case 1:

177 *--target = (char)(ch | firstByteMark[bytesToWrite]);

178 }

179 target += bytesToWrite;

180 }

181 *sourceStart = source;

182 *targetStart = target;

183 return result;

184 }	185 }

185	186

186 // This must be called with the length pre-determined by the first byte.	187 // This must be called with the length pre-determined by the first byte.

187 // If presented with a length > 4, this returns false. The Unicode	188 // If presented with a length > 4, this returns false. The Unicode

188 // definition of UTF-8 goes up to 4-byte sequences.	189 // definition of UTF-8 goes up to 4-byte sequences.

189 static bool isLegalUTF8(const unsigned char* source, int length)	190 static bool isLegalUTF8(const unsigned char* source, int length) {

190 {	191 unsigned char a;

191 unsigned char a;	192 const unsigned char* srcptr = source + length;

192 const unsigned char* srcptr = source + length;	193 switch (length) {

193 switch (length) {

194 default:	194 default:

195 return false;	195 return false;

196 // Everything else falls through when "true"...	196 // Everything else falls through when "true"...

197 case 4:	197 case 4:

198 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)	198 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)

199 return false;	199 return false;

200 case 3:	200 case 3:

201 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)	201 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)

202 return false;	202 return false;

203 case 2:	203 case 2:

204 if ((a = (*--srcptr)) > 0xBF)	204 if ((a = (*--srcptr)) > 0xBF)

205 return false;	205 return false;

206	206

207 // no fall-through in this inner switch	207 // no fall-through in this inner switch

208 switch (*source) {	208 switch (*source) {

209 case 0xE0:	209 case 0xE0:

210 if (a < 0xA0)	210 if (a < 0xA0)

211 return false;	211 return false;

212 break;	212 break;

213 case 0xED:	213 case 0xED:

214 if (a > 0x9F)	214 if (a > 0x9F)

215 return false;	215 return false;

216 break;	216 break;

217 case 0xF0:	217 case 0xF0:

218 if (a < 0x90)	218 if (a < 0x90)

219 return false;	219 return false;

220 break;	220 break;

221 case 0xF4:	221 case 0xF4:

222 if (a > 0x8F)	222 if (a > 0x8F)

223 return false;	223 return false;

224 break;	224 break;

225 default:	225 default:

226 if (a < 0x80)	226 if (a < 0x80)

227 return false;	227 return false;

228 }	228 }

229	229

230 case 1:	230 case 1:

231 if (*source >= 0x80 && *source < 0xC2)	231 if (*source >= 0x80 && *source < 0xC2)

232 return false;	232 return false;

233 }	233 }

234 if (*source > 0xF4)	234 if (*source > 0xF4)

235 return false;	235 return false;

236 return true;	236 return true;

237 }	237 }

238	238

239 // Magic values subtracted from a buffer value during UTF8 conversion.	239 // Magic values subtracted from a buffer value during UTF8 conversion.

240 // This table contains as many values as there might be trailing bytes	240 // This table contains as many values as there might be trailing bytes

241 // in a UTF-8 sequence.	241 // in a UTF-8 sequence.

242 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };	242 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL)};

243	243

244 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)	244 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) {

245 {	245 UChar32 character = 0;

246 UChar32 character = 0;	246

247	247 // The cases all fall through.

248 // The cases all fall through.	248 switch (length) {

249 switch (length) {

250 case 6:	249 case 6:

251 character += static_cast<unsigned char>(*sequence++);	250 character += static_cast<unsigned char>(*sequence++);

252 character <<= 6;	251 character <<= 6;

253 case 5:	252 case 5:

254 character += static_cast<unsigned char>(*sequence++);	253 character += static_cast<unsigned char>(*sequence++);

255 character <<= 6;	254 character <<= 6;

256 case 4:	255 case 4:

257 character += static_cast<unsigned char>(*sequence++);	256 character += static_cast<unsigned char>(*sequence++);

258 character <<= 6;	257 character <<= 6;

259 case 3:	258 case 3:

260 character += static_cast<unsigned char>(*sequence++);	259 character += static_cast<unsigned char>(*sequence++);

261 character <<= 6;	260 character <<= 6;

262 case 2:	261 case 2:

263 character += static_cast<unsigned char>(*sequence++);	262 character += static_cast<unsigned char>(*sequence++);

264 character <<= 6;	263 character <<= 6;

265 case 1:	264 case 1:

266 character += static_cast<unsigned char>(*sequence++);	265 character += static_cast<unsigned char>(*sequence++);

267 }	266 }

268	267

269 return character - offsetsFromUTF8[length - 1];	268 return character - offsetsFromUTF8[length - 1];

270 }	269 }

271	270

272 ConversionResult convertUTF8ToUTF16(	271 ConversionResult convertUTF8ToUTF16(

273 const char** sourceStart, const char* sourceEnd,	272 const char** sourceStart,

274 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)	273 const char* sourceEnd,

275 {	274 UChar** targetStart,

276 ConversionResult result = conversionOK;	275 UChar* targetEnd,

277 const char* source = *sourceStart;	276 bool* sourceAllASCII,

278 UChar* target = *targetStart;	277 bool strict) {

279 UChar orAllData = 0;	278 ConversionResult result = conversionOK;

280 while (source < sourceEnd) {	279 const char* source = *sourceStart;

281 int utf8SequenceLength = inlineUTF8SequenceLength(*source);	280 UChar* target = *targetStart;

282 if (sourceEnd - source < utf8SequenceLength) {	281 UChar orAllData = 0;

283 result = sourceExhausted;	282 while (source < sourceEnd) {

284 break;	283 int utf8SequenceLength = inlineUTF8SequenceLength(*source);

	284 if (sourceEnd - source < utf8SequenceLength) {

	285 result = sourceExhausted;

	286 break;

	287 }

	288 // Do this check whether lenient or strict

	289 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {

	290 result = sourceIllegal;

	291 break;

	292 }

	293

	294 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);

	295

	296 if (target >= targetEnd) {

	297 source -= utf8SequenceLength; // Back up source pointer!

	298 result = targetExhausted;

	299 break;

	300 }

	301

	302 if (U_IS_BMP(character)) {

	303 // UTF-16 surrogate values are illegal in UTF-32

	304 if (U_IS_SURROGATE(character)) {

	305 if (strict) {

	306 source -= utf8SequenceLength; // return to the illegal value itself

	307 result = sourceIllegal;

	308 break;

285 }	309 }

286 // Do this check whether lenient or strict	310 *target++ = replacementCharacter;

287 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {	311 orAllData |= replacementCharacter;

288 result = sourceIllegal;	312 } else {

289 break;	313 *target++ = static_cast<UChar>(character); // normal case

290 }	314 orAllData |= character;

291	315 }

292 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);	316 } else if (U_IS_SUPPLEMENTARY(character)) {

293	317 // target is a character in range 0xFFFF - 0x10FFFF

294 if (target >= targetEnd) {	318 if (target + 1 >= targetEnd) {

295 source -= utf8SequenceLength; // Back up source pointer!	319 source -= utf8SequenceLength; // Back up source pointer!

296 result = targetExhausted;	320 result = targetExhausted;

297 break;	321 break;

298 }	322 }

299	323 *target++ = U16_LEAD(character);

300 if (U_IS_BMP(character)) {	324 *target++ = U16_TRAIL(character);

301 // UTF-16 surrogate values are illegal in UTF-32	325 orAllData = 0xffff;

302 if (U_IS_SURROGATE(character)) {	326 } else {

303 if (strict) {	327 if (strict) {

304 source -= utf8SequenceLength; // return to the illegal value itself	328 source -= utf8SequenceLength; // return to the start

305 result = sourceIllegal;	329 result = sourceIllegal;

306 break;	330 break; // Bail out; shouldn't continue

307 }	331 } else {

308 *target++ = replacementCharacter;	332 *target++ = replacementCharacter;

309 orAllData |= replacementCharacter;	333 orAllData |= replacementCharacter;

310 } else {	334 }

311 *target++ = static_cast<UChar>(character); // normal case	335 }

312 orAllData |= character;	336 }

313 }	337 *sourceStart = source;

314 } else if (U_IS_SUPPLEMENTARY(character)) {	338 *targetStart = target;

315 // target is a character in range 0xFFFF - 0x10FFFF	339

316 if (target + 1 >= targetEnd) {	340 if (sourceAllASCII)

317 source -= utf8SequenceLength; // Back up source pointer!	341 *sourceAllASCII = !(orAllData & ~0x7f);

318 result = targetExhausted;	342

319 break;	343 return result;

320 }	344 }

321 *target++ = U16_LEAD(character);	345

322 *target++ = U16_TRAIL(character);	346 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length) {

323 orAllData = 0xffff;	347 if (!data)

324 } else {	348 return 0;

325 if (strict) {	349

326 source -= utf8SequenceLength; // return to the start	350 StringHasher stringHasher;

327 result = sourceIllegal;	351 dataLength = 0;

328 break; // Bail out; shouldn't continue	352 utf16Length = 0;

329 } else {	353

330 *target++ = replacementCharacter;	354 while (data < dataEnd || (!dataEnd && *data)) {

331 orAllData |= replacementCharacter;	355 if (isASCII(*data)) {

332 }	356 stringHasher.addCharacter(*data++);

333 }	357 dataLength++;

334 }	358 utf16Length++;

335 *sourceStart = source;	359 continue;

336 *targetStart = target;	360 }

337	361

338 if (sourceAllASCII)	362 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);

339 *sourceAllASCII = !(orAllData & ~0x7f);	363 dataLength += utf8SequenceLength;

340	364

341 return result;	365 if (!dataEnd) {

342 }	366 for (int i = 1; i < utf8SequenceLength; ++i) {

343	367 if (!data[i])

344 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)	368 return 0;

345 {	369 }

346 if (!data)	370 } else if (dataEnd - data < utf8SequenceLength) {

	371 return 0;

	372 }

	373

	374 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))

	375 return 0;

	376

	377 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);

	378 ASSERT(!isASCII(character));

	379

	380 if (U_IS_BMP(character)) {

	381 // UTF-16 surrogate values are illegal in UTF-32

	382 if (U_IS_SURROGATE(character))

347 return 0;	383 return 0;

348	384 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case

349 StringHasher stringHasher;	385 utf16Length++;

350 dataLength = 0;	386 } else if (U_IS_SUPPLEMENTARY(character)) {

351 utf16Length = 0;	387 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character)));

352	388 utf16Length += 2;

353 while (data < dataEnd || (!dataEnd && *data)) {	389 } else {

354 if (isASCII(*data)) {	390 return 0;

355 stringHasher.addCharacter(*data++);	391 }

356 dataLength++;	392 }

357 utf16Length++;	393

358 continue;	394 return stringHasher.hashWithTop8BitsMasked();

359 }	395 }

360	396

361 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);	397 template <typename CharType>

362 dataLength += utf8SequenceLength;	398 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd) {

363	399 while (b < bEnd) {

364 if (!dataEnd) {	400 if (isASCII(*b)) {

365 for (int i = 1; i < utf8SequenceLength; ++i) {	401 if (*a++ != *b++)

366 if (!data[i])	402 return false;

367 return 0;	403 continue;

368 }	404 }

369 } else if (dataEnd - data < utf8SequenceLength) {	405

370 return 0;	406 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);

371 }	407

372	408 if (bEnd - b < utf8SequenceLength)

373 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))	409 return false;

374 return 0;	410

375	411 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))

376 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);	412 return 0;

377 ASSERT(!isASCII(character));	413

378	414 UChar32 character = readUTF8Sequence(b, utf8SequenceLength);

379 if (U_IS_BMP(character)) {	415 ASSERT(!isASCII(character));

380 // UTF-16 surrogate values are illegal in UTF-32	416

381 if (U_IS_SURROGATE(character))	417 if (U_IS_BMP(character)) {

382 return 0;	418 // UTF-16 surrogate values are illegal in UTF-32

383 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case	419 if (U_IS_SURROGATE(character))

384 utf16Length++;	420 return false;

385 } else if (U_IS_SUPPLEMENTARY(character)) {	421 if (*a++ != character)

386 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character)));	422 return false;

387 utf16Length += 2;	423 } else if (U_IS_SUPPLEMENTARY(character)) {

388 } else {	424 if (*a++ != U16_LEAD(character))

389 return 0;	425 return false;

390 }	426 if (*a++ != U16_TRAIL(character))

391 }	427 return false;

392	428 } else {

393 return stringHasher.hashWithTop8BitsMasked();	429 return false;

394 }	430 }

395	431 }

396 template<typename CharType>	432

397 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)	433 return a == aEnd;

398 {	434 }

399 while (b < bEnd) {	435

400 if (isASCII(*b)) {	436 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd) {

401 if (*a++ != *b++)	402 return false;

402 return false;	438 }

403 continue;	439

404 }	440 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd) {

405	441 return equalWithUTF8Internal(a, aEnd, b, bEnd);

406 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);	442 }

407	443

408 if (bEnd - b < utf8SequenceLength)	444 } // namespace Unicode

409 return false;	445 } // namespace WTF

410

411 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))

412 return 0;

413

414 UChar32 character = readUTF8Sequence(b, utf8SequenceLength);

415 ASSERT(!isASCII(character));

416

417 if (U_IS_BMP(character)) {

418 // UTF-16 surrogate values are illegal in UTF-32

419 if (U_IS_SURROGATE(character))

420 return false;

421 if (*a++ != character)

422 return false;

423 } else if (U_IS_SUPPLEMENTARY(character)) {

424 if (*a++ != U16_LEAD(character))

425 return false;

426 if (*a++ != U16_TRAIL(character))

427 return false;

428 } else {

429 return false;

430 }

431 }

432

433 return a == aEnd;

434 }

435

436 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)

437 {

438 return equalWithUTF8Internal(a, aEnd, b, bEnd);

439 }

440

441 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)

442 {

443 return equalWithUTF8Internal(a, aEnd, b, bEnd);

444 }

445

446 } // namespace Unicode

447 } // namespace WTF

OLD	NEW

« no previous file with comments | « third_party/WebKit/Source/wtf/text/UTF8.h ('k') | third_party/WebKit/Source/wtf/text/Unicode.h » ('j') | no next file with comments »