third_party/WebKit/Source/wtf/text/UTF8.cpp - Issue 1373773002: Fix check-webkit-style errors in Source/wtf/text/.

Side by Side Diff: third_party/WebKit/Source/wtf/text/UTF8.cpp

Issue 1373773002: Fix check-webkit-style errors in Source/wtf/text/. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2007 Apple Inc. All rights reserved.	2 * Copyright (C) 2007 Apple Inc. All rights reserved.

3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>	3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>

4 *	4 *

5 * Redistribution and use in source and binary forms, with or without	5 * Redistribution and use in source and binary forms, with or without

6 * modification, are permitted provided that the following conditions	6 * modification, are permitted provided that the following conditions

7 * are met:	7 * are met:

8 * 1. Redistributions of source code must retain the above copyright	8 * 1. Redistributions of source code must retain the above copyright

9 * notice, this list of conditions and the following disclaimer.	9 * notice, this list of conditions and the following disclaimer.

10 * 2. Redistributions in binary form must reproduce the above copyright	10 * 2. Redistributions in binary form must reproduce the above copyright

(...skipping 37 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
48 }	48 }

49	49

50 inline int inlineUTF8SequenceLength(char b0)	50 inline int inlineUTF8SequenceLength(char b0)

51 {	51 {

52 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);	52 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);

53 }	53 }

54	54

55 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed	55 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed

56 // into the first byte, depending on how many bytes follow. There are	56 // into the first byte, depending on how many bytes follow. There are

57 // as many entries in this table as there are UTF-8 sequence types.	57 // as many entries in this table as there are UTF-8 sequence types.

58 // (I.e., one byte sequence, two byte... etc.). Remember that sequencs	58 // (I.e., one byte sequence, two byte... etc.). Remember that sequences

59 // for legal UTF-8 will be 4 or fewer bytes total.	59 // for legal UTF-8 will be 4 or fewer bytes total.

60 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };	60 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

61	61

62 ConversionResult convertLatin1ToUTF8(	62 ConversionResult convertLatin1ToUTF8(

63 const LChar** sourceStart, const LChar* sourceEnd,	63 const LChar** sourceStart, const LChar* sourceEnd,

64 char** targetStart, char* targetEnd)	64 char** targetStart, char* targetEnd)

65 {	65 {

66 ConversionResult result = conversionOK;	66 ConversionResult result = conversionOK;

67 const LChar* source = *sourceStart;	67 const LChar* source = *sourceStart;

68 char* target = *targetStart;	68 char* target = *targetStart;

69 while (source < sourceEnd) {	69 while (source < sourceEnd) {

70 UChar32 ch;	70 UChar32 ch;

71 unsigned short bytesToWrite = 0;	71 unsigned short bytesToWrite = 0;

72 const UChar32 byteMask = 0xBF;	72 const UChar32 byteMask = 0xBF;

73 const UChar32 byteMark = 0x80;	73 const UChar32 byteMark = 0x80;

74 const LChar* oldSource = source; // In case we have to back up because of target overflow.	74 const LChar* oldSource = source; // In case we have to back up because of target overflow.

(...skipping 82 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
157 }	157 }

158	158

159 target += bytesToWrite;	159 target += bytesToWrite;

160 if (target > targetEnd) {	160 if (target > targetEnd) {

161 source = oldSource; // Back up source pointer!	161 source = oldSource; // Back up source pointer!

162 target -= bytesToWrite;	162 target -= bytesToWrite;

163 result = targetExhausted;	163 result = targetExhausted;

164 break;	164 break;

165 }	165 }

166 switch (bytesToWrite) { // note: everything falls through.	166 switch (bytesToWrite) { // note: everything falls through.

167 case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;	167 case 4:

168 case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;	168 *--target = (char)((ch | byteMark) & byteMask);

169 case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;	169 ch >>= 6;

170 case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]);	170 case 3:

	171 *--target = (char)((ch | byteMark) & byteMask);

	172 ch >>= 6;

	173 case 2:

	174 *--target = (char)((ch | byteMark) & byteMask);

	175 ch >>= 6;

	176 case 1:

	177 *--target = (char)(ch | firstByteMark[bytesToWrite]);

171 }	178 }

172 target += bytesToWrite;	179 target += bytesToWrite;

173 }	180 }

174 *sourceStart = source;	181 *sourceStart = source;

175 *targetStart = target;	182 *targetStart = target;

176 return result;	183 return result;

177 }	184 }

178	185

179 // This must be called with the length pre-determined by the first byte.	186 // This must be called with the length pre-determined by the first byte.

180 // If presented with a length > 4, this returns false. The Unicode	187 // If presented with a length > 4, this returns false. The Unicode

181 // definition of UTF-8 goes up to 4-byte sequences.	188 // definition of UTF-8 goes up to 4-byte sequences.

182 static bool isLegalUTF8(const unsigned char* source, int length)	189 static bool isLegalUTF8(const unsigned char* source, int length)

183 {	190 {

184 unsigned char a;	191 unsigned char a;

185 const unsigned char* srcptr = source + length;	192 const unsigned char* srcptr = source + length;

186 switch (length) {	193 switch (length) {

187 default: return false;	194 default:

188 // Everything else falls through when "true"...	195 return false;

189 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;	196 // Everything else falls through when "true"...

190 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;	197 case 4:

191 case 2: if ((a = (*--srcptr)) > 0xBF) return false;	198 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)

	199 return false;

	200 case 3:

	201 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)

	202 return false;

	203 case 2:

	204 if ((a = (*--srcptr)) > 0xBF)

	205 return false;

192	206

	207 // no fall-through in this inner switch

193 switch (*source) {	208 switch (*source) {

194 // no fall-through in this inner switch	209 case 0xE0:

195 case 0xE0: if (a < 0xA0) return false; break;	210 if (a < 0xA0)

196 case 0xED: if (a > 0x9F) return false; break;	211 return false;

197 case 0xF0: if (a < 0x90) return false; break;	212 break;

198 case 0xF4: if (a > 0x8F) return false; break;	213 case 0xED:

199 default: if (a < 0x80) return false;	214 if (a > 0x9F)

	215 return false;

	216 break;

	217 case 0xF0:

	218 if (a < 0x90)

	219 return false;

	220 break;

	221 case 0xF4:

	222 if (a > 0x8F)

	223 return false;

	224 break;

	225 default:

	226 if (a < 0x80)

	227 return false;

200 }	228 }

201	229

202 case 1: if (*source >= 0x80 && *source < 0xC2) return false;	230 case 1:

	231 if (*source >= 0x80 && *source < 0xC2)

	232 return false;

203 }	233 }

204 if (*source > 0xF4)	234 if (*source > 0xF4)

205 return false;	235 return false;

206 return true;	236 return true;

207 }	237 }

208	238

209 // Magic values subtracted from a buffer value during UTF8 conversion.	239 // Magic values subtracted from a buffer value during UTF8 conversion.

210 // This table contains as many values as there might be trailing bytes	240 // This table contains as many values as there might be trailing bytes

211 // in a UTF-8 sequence.	241 // in a UTF-8 sequence.

212 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };	242 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };

213	243

214 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)	244 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)

215 {	245 {

216 UChar32 character = 0;	246 UChar32 character = 0;

217	247

218 // The cases all fall through.	248 // The cases all fall through.

219 switch (length) {	249 switch (length) {

220 case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6;	250 case 6:

221 case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6;	251 character += static_cast<unsigned char>(*sequence++);

222 case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6;	252 character <<= 6;

223 case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6;	253 case 5:

224 case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6;	254 character += static_cast<unsigned char>(*sequence++);

225 case 1: character += static_cast<unsigned char>(*sequence++);	255 character <<= 6;

	256 case 4:

	257 character += static_cast<unsigned char>(*sequence++);

	258 character <<= 6;

	259 case 3:

	260 character += static_cast<unsigned char>(*sequence++);

	261 character <<= 6;

	262 case 2:

	263 character += static_cast<unsigned char>(*sequence++);

	264 character <<= 6;

	265 case 1:

	266 character += static_cast<unsigned char>(*sequence++);

226 }	267 }

227	268

228 return character - offsetsFromUTF8[length - 1];	269 return character - offsetsFromUTF8[length - 1];

229 }	270 }

230	271

231 ConversionResult convertUTF8ToUTF16(	272 ConversionResult convertUTF8ToUTF16(

232 const char** sourceStart, const char* sourceEnd,	273 const char** sourceStart, const char* sourceEnd,

233 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)	274 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)

234 {	275 {

235 ConversionResult result = conversionOK;	276 ConversionResult result = conversionOK;

(...skipping 20 matching lines...) Expand all Loading...
256 break;	297 break;

257 }	298 }

258	299

259 if (U_IS_BMP(character)) {	300 if (U_IS_BMP(character)) {

260 // UTF-16 surrogate values are illegal in UTF-32	301 // UTF-16 surrogate values are illegal in UTF-32

261 if (U_IS_SURROGATE(character)) {	302 if (U_IS_SURROGATE(character)) {

262 if (strict) {	303 if (strict) {

263 source -= utf8SequenceLength; // return to the illegal value itself	304 source -= utf8SequenceLength; // return to the illegal value itself

264 result = sourceIllegal;	305 result = sourceIllegal;

265 break;	306 break;

266 } else {

267 *target++ = replacementCharacter;

268 orAllData |= replacementCharacter;

269 }	307 }

	308 *target++ = replacementCharacter;

	309 orAllData |= replacementCharacter;

270 } else {	310 } else {

271 *target++ = static_cast<UChar>(character); // normal case	311 *target++ = static_cast<UChar>(character); // normal case

272 orAllData |= character;	312 orAllData |= character;

273 }	313 }

274 } else if (U_IS_SUPPLEMENTARY(character)) {	314 } else if (U_IS_SUPPLEMENTARY(character)) {

275 // target is a character in range 0xFFFF - 0x10FFFF	315 // target is a character in range 0xFFFF - 0x10FFFF

276 if (target + 1 >= targetEnd) {	316 if (target + 1 >= targetEnd) {

277 source -= utf8SequenceLength; // Back up source pointer!	317 source -= utf8SequenceLength; // Back up source pointer!

278 result = targetExhausted;	318 result = targetExhausted;

279 break;	319 break;

(...skipping 39 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
319 }	359 }

320	360

321 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);	361 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);

322 dataLength += utf8SequenceLength;	362 dataLength += utf8SequenceLength;

323	363

324 if (!dataEnd) {	364 if (!dataEnd) {

325 for (int i = 1; i < utf8SequenceLength; ++i) {	365 for (int i = 1; i < utf8SequenceLength; ++i) {

326 if (!data[i])	366 if (!data[i])

327 return 0;	367 return 0;

328 }	368 }

329 } else if (dataEnd - data < utf8SequenceLength)	369 } else if (dataEnd - data < utf8SequenceLength) {

330 return 0;	370 return 0;

	371 }

331	372

332 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))	373 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))

333 return 0;	374 return 0;

334	375

335 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);	376 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);

336 ASSERT(!isASCII(character));	377 ASSERT(!isASCII(character));

337	378

338 if (U_IS_BMP(character)) {	379 if (U_IS_BMP(character)) {

339 // UTF-16 surrogate values are illegal in UTF-32	380 // UTF-16 surrogate values are illegal in UTF-32

340 if (U_IS_SURROGATE(character))	381 if (U_IS_SURROGATE(character))

341 return 0;	382 return 0;

342 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case	383 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case

343 utf16Length++;	384 utf16Length++;

344 } else if (U_IS_SUPPLEMENTARY(character)) {	385 } else if (U_IS_SUPPLEMENTARY(character)) {

345 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),	386 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character)));

346 static_cast<UChar>(U16_TRAIL(character)));

347 utf16Length += 2;	387 utf16Length += 2;

348 } else	388 } else {

349 return 0;	389 return 0;

	390 }

350 }	391 }

351	392

352 return stringHasher.hashWithTop8BitsMasked();	393 return stringHasher.hashWithTop8BitsMasked();

353 }	394 }

354	395

355 template<typename CharType>	396 template<typename CharType>

356 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)	397 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)

357 {	398 {

358 while (b < bEnd) {	399 while (b < bEnd) {

359 if (isASCII(*b)) {	400 if (isASCII(*b)) {

(...skipping 17 matching lines...) Expand all Loading...
377 // UTF-16 surrogate values are illegal in UTF-32	418 // UTF-16 surrogate values are illegal in UTF-32

378 if (U_IS_SURROGATE(character))	419 if (U_IS_SURROGATE(character))

379 return false;	420 return false;

380 if (*a++ != character)	421 if (*a++ != character)

381 return false;	422 return false;

382 } else if (U_IS_SUPPLEMENTARY(character)) {	423 } else if (U_IS_SUPPLEMENTARY(character)) {

383 if (*a++ != U16_LEAD(character))	424 if (*a++ != U16_LEAD(character))

384 return false;	425 return false;

385 if (*a++ != U16_TRAIL(character))	426 if (*a++ != U16_TRAIL(character))

386 return false;	427 return false;

387 } else	428 } else {

388 return false;	429 return false;

	430 }

389 }	431 }

390	432

391 return a == aEnd;	433 return a == aEnd;

392 }	434 }

393	435

394 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)	436 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)

395 {	437 {

396 return equalWithUTF8Internal(a, aEnd, b, bEnd);	438 return equalWithUTF8Internal(a, aEnd, b, bEnd);

397 }	439 }

398	440

399 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)	441 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)

400 {	442 {

401 return equalWithUTF8Internal(a, aEnd, b, bEnd);	443 return equalWithUTF8Internal(a, aEnd, b, bEnd);

402 }	444 }

403	445

404 } // namespace Unicode	446 } // namespace Unicode

405 } // namespace WTF	447 } // namespace WTF

OLD	NEW

« no previous file with comments | « third_party/WebKit/Source/wtf/text/UTF8.h ('k') | third_party/WebKit/Source/wtf/text/Unicode.h » ('j') | no next file with comments »