Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(39)

Side by Side Diff: third_party/WebKit/Source/wtf/text/UTF8.cpp

Issue 1373773002: Fix check-webkit-style errors in Source/wtf/text/. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/WebKit/Source/wtf/text/UTF8.h ('k') | third_party/WebKit/Source/wtf/text/Unicode.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2007 Apple Inc. All rights reserved. 2 * Copyright (C) 2007 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> 3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
4 * 4 *
5 * Redistribution and use in source and binary forms, with or without 5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions 6 * modification, are permitted provided that the following conditions
7 * are met: 7 * are met:
8 * 1. Redistributions of source code must retain the above copyright 8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer. 9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright 10 * 2. Redistributions in binary form must reproduce the above copyright
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
48 } 48 }
49 49
50 inline int inlineUTF8SequenceLength(char b0) 50 inline int inlineUTF8SequenceLength(char b0)
51 { 51 {
52 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); 52 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
53 } 53 }
54 54
55 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 55 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
56 // into the first byte, depending on how many bytes follow. There are 56 // into the first byte, depending on how many bytes follow. There are
57 // as many entries in this table as there are UTF-8 sequence types. 57 // as many entries in this table as there are UTF-8 sequence types.
58 // (I.e., one byte sequence, two byte... etc.). Remember that sequencs 58 // (I.e., one byte sequence, two byte... etc.). Remember that sequences
59 // for *legal* UTF-8 will be 4 or fewer bytes total. 59 // for *legal* UTF-8 will be 4 or fewer bytes total.
60 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 60 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
61 61
62 ConversionResult convertLatin1ToUTF8( 62 ConversionResult convertLatin1ToUTF8(
63 const LChar** sourceStart, const LChar* sourceEnd, 63 const LChar** sourceStart, const LChar* sourceEnd,
64 char** targetStart, char* targetEnd) 64 char** targetStart, char* targetEnd)
65 { 65 {
66 ConversionResult result = conversionOK; 66 ConversionResult result = conversionOK;
67 const LChar* source = *sourceStart; 67 const LChar* source = *sourceStart;
68 char* target = *targetStart; 68 char* target = *targetStart;
69 while (source < sourceEnd) { 69 while (source < sourceEnd) {
70 UChar32 ch; 70 UChar32 ch;
71 unsigned short bytesToWrite = 0; 71 unsigned short bytesToWrite = 0;
72 const UChar32 byteMask = 0xBF; 72 const UChar32 byteMask = 0xBF;
73 const UChar32 byteMark = 0x80; 73 const UChar32 byteMark = 0x80;
74 const LChar* oldSource = source; // In case we have to back up because of target overflow. 74 const LChar* oldSource = source; // In case we have to back up because of target overflow.
(...skipping 82 matching lines...) Expand 10 before | Expand all | Expand 10 after
157 } 157 }
158 158
159 target += bytesToWrite; 159 target += bytesToWrite;
160 if (target > targetEnd) { 160 if (target > targetEnd) {
161 source = oldSource; // Back up source pointer! 161 source = oldSource; // Back up source pointer!
162 target -= bytesToWrite; 162 target -= bytesToWrite;
163 result = targetExhausted; 163 result = targetExhausted;
164 break; 164 break;
165 } 165 }
166 switch (bytesToWrite) { // note: everything falls through. 166 switch (bytesToWrite) { // note: everything falls through.
167 case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; 167 case 4:
168 case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; 168 *--target = (char)((ch | byteMark) & byteMask);
169 case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; 169 ch >>= 6;
170 case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]); 170 case 3:
171 *--target = (char)((ch | byteMark) & byteMask);
172 ch >>= 6;
173 case 2:
174 *--target = (char)((ch | byteMark) & byteMask);
175 ch >>= 6;
176 case 1:
177 *--target = (char)(ch | firstByteMark[bytesToWrite]);
171 } 178 }
172 target += bytesToWrite; 179 target += bytesToWrite;
173 } 180 }
174 *sourceStart = source; 181 *sourceStart = source;
175 *targetStart = target; 182 *targetStart = target;
176 return result; 183 return result;
177 } 184 }
178 185
179 // This must be called with the length pre-determined by the first byte. 186 // This must be called with the length pre-determined by the first byte.
180 // If presented with a length > 4, this returns false. The Unicode 187 // If presented with a length > 4, this returns false. The Unicode
181 // definition of UTF-8 goes up to 4-byte sequences. 188 // definition of UTF-8 goes up to 4-byte sequences.
182 static bool isLegalUTF8(const unsigned char* source, int length) 189 static bool isLegalUTF8(const unsigned char* source, int length)
183 { 190 {
184 unsigned char a; 191 unsigned char a;
185 const unsigned char* srcptr = source + length; 192 const unsigned char* srcptr = source + length;
186 switch (length) { 193 switch (length) {
187 default: return false; 194 default:
188 // Everything else falls through when "true"... 195 return false;
189 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 196 // Everything else falls through when "true"...
190 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 197 case 4:
191 case 2: if ((a = (*--srcptr)) > 0xBF) return false; 198 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
199 return false;
200 case 3:
201 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
202 return false;
203 case 2:
204 if ((a = (*--srcptr)) > 0xBF)
205 return false;
192 206
207 // no fall-through in this inner switch
193 switch (*source) { 208 switch (*source) {
194 // no fall-through in this inner switch 209 case 0xE0:
195 case 0xE0: if (a < 0xA0) return false; break; 210 if (a < 0xA0)
196 case 0xED: if (a > 0x9F) return false; break; 211 return false;
197 case 0xF0: if (a < 0x90) return false; break; 212 break;
198 case 0xF4: if (a > 0x8F) return false; break; 213 case 0xED:
199 default: if (a < 0x80) return false; 214 if (a > 0x9F)
215 return false;
216 break;
217 case 0xF0:
218 if (a < 0x90)
219 return false;
220 break;
221 case 0xF4:
222 if (a > 0x8F)
223 return false;
224 break;
225 default:
226 if (a < 0x80)
227 return false;
200 } 228 }
201 229
202 case 1: if (*source >= 0x80 && *source < 0xC2) return false; 230 case 1:
231 if (*source >= 0x80 && *source < 0xC2)
232 return false;
203 } 233 }
204 if (*source > 0xF4) 234 if (*source > 0xF4)
205 return false; 235 return false;
206 return true; 236 return true;
207 } 237 }
208 238
209 // Magic values subtracted from a buffer value during UTF8 conversion. 239 // Magic values subtracted from a buffer value during UTF8 conversion.
210 // This table contains as many values as there might be trailing bytes 240 // This table contains as many values as there might be trailing bytes
211 // in a UTF-8 sequence. 241 // in a UTF-8 sequence.
212 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) }; 242 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };
213 243
214 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) 244 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
215 { 245 {
216 UChar32 character = 0; 246 UChar32 character = 0;
217 247
218 // The cases all fall through. 248 // The cases all fall through.
219 switch (length) { 249 switch (length) {
220 case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6; 250 case 6:
221 case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6; 251 character += static_cast<unsigned char>(*sequence++);
222 case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6; 252 character <<= 6;
223 case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6; 253 case 5:
224 case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6; 254 character += static_cast<unsigned char>(*sequence++);
225 case 1: character += static_cast<unsigned char>(*sequence++); 255 character <<= 6;
256 case 4:
257 character += static_cast<unsigned char>(*sequence++);
258 character <<= 6;
259 case 3:
260 character += static_cast<unsigned char>(*sequence++);
261 character <<= 6;
262 case 2:
263 character += static_cast<unsigned char>(*sequence++);
264 character <<= 6;
265 case 1:
266 character += static_cast<unsigned char>(*sequence++);
226 } 267 }
227 268
228 return character - offsetsFromUTF8[length - 1]; 269 return character - offsetsFromUTF8[length - 1];
229 } 270 }
230 271
231 ConversionResult convertUTF8ToUTF16( 272 ConversionResult convertUTF8ToUTF16(
232 const char** sourceStart, const char* sourceEnd, 273 const char** sourceStart, const char* sourceEnd,
233 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict) 274 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
234 { 275 {
235 ConversionResult result = conversionOK; 276 ConversionResult result = conversionOK;
(...skipping 20 matching lines...) Expand all
256 break; 297 break;
257 } 298 }
258 299
259 if (U_IS_BMP(character)) { 300 if (U_IS_BMP(character)) {
260 // UTF-16 surrogate values are illegal in UTF-32 301 // UTF-16 surrogate values are illegal in UTF-32
261 if (U_IS_SURROGATE(character)) { 302 if (U_IS_SURROGATE(character)) {
262 if (strict) { 303 if (strict) {
263 source -= utf8SequenceLength; // return to the illegal value itself 304 source -= utf8SequenceLength; // return to the illegal value itself
264 result = sourceIllegal; 305 result = sourceIllegal;
265 break; 306 break;
266 } else {
267 *target++ = replacementCharacter;
268 orAllData |= replacementCharacter;
269 } 307 }
308 *target++ = replacementCharacter;
309 orAllData |= replacementCharacter;
270 } else { 310 } else {
271 *target++ = static_cast<UChar>(character); // normal case 311 *target++ = static_cast<UChar>(character); // normal case
272 orAllData |= character; 312 orAllData |= character;
273 } 313 }
274 } else if (U_IS_SUPPLEMENTARY(character)) { 314 } else if (U_IS_SUPPLEMENTARY(character)) {
275 // target is a character in range 0xFFFF - 0x10FFFF 315 // target is a character in range 0xFFFF - 0x10FFFF
276 if (target + 1 >= targetEnd) { 316 if (target + 1 >= targetEnd) {
277 source -= utf8SequenceLength; // Back up source pointer! 317 source -= utf8SequenceLength; // Back up source pointer!
278 result = targetExhausted; 318 result = targetExhausted;
279 break; 319 break;
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
319 } 359 }
320 360
321 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); 361 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
322 dataLength += utf8SequenceLength; 362 dataLength += utf8SequenceLength;
323 363
324 if (!dataEnd) { 364 if (!dataEnd) {
325 for (int i = 1; i < utf8SequenceLength; ++i) { 365 for (int i = 1; i < utf8SequenceLength; ++i) {
326 if (!data[i]) 366 if (!data[i])
327 return 0; 367 return 0;
328 } 368 }
329 } else if (dataEnd - data < utf8SequenceLength) 369 } else if (dataEnd - data < utf8SequenceLength) {
330 return 0; 370 return 0;
371 }
331 372
332 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength)) 373 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
333 return 0; 374 return 0;
334 375
335 UChar32 character = readUTF8Sequence(data, utf8SequenceLength); 376 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
336 ASSERT(!isASCII(character)); 377 ASSERT(!isASCII(character));
337 378
338 if (U_IS_BMP(character)) { 379 if (U_IS_BMP(character)) {
339 // UTF-16 surrogate values are illegal in UTF-32 380 // UTF-16 surrogate values are illegal in UTF-32
340 if (U_IS_SURROGATE(character)) 381 if (U_IS_SURROGATE(character))
341 return 0; 382 return 0;
342 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case 383 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
343 utf16Length++; 384 utf16Length++;
344 } else if (U_IS_SUPPLEMENTARY(character)) { 385 } else if (U_IS_SUPPLEMENTARY(character)) {
345 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), 386 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character)));
346 static_cast<UChar>(U16_TRAIL(character)));
347 utf16Length += 2; 387 utf16Length += 2;
348 } else 388 } else {
349 return 0; 389 return 0;
390 }
350 } 391 }
351 392
352 return stringHasher.hashWithTop8BitsMasked(); 393 return stringHasher.hashWithTop8BitsMasked();
353 } 394 }
354 395
355 template<typename CharType> 396 template<typename CharType>
356 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd) 397 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)
357 { 398 {
358 while (b < bEnd) { 399 while (b < bEnd) {
359 if (isASCII(*b)) { 400 if (isASCII(*b)) {
(...skipping 17 matching lines...) Expand all
377 // UTF-16 surrogate values are illegal in UTF-32 418 // UTF-16 surrogate values are illegal in UTF-32
378 if (U_IS_SURROGATE(character)) 419 if (U_IS_SURROGATE(character))
379 return false; 420 return false;
380 if (*a++ != character) 421 if (*a++ != character)
381 return false; 422 return false;
382 } else if (U_IS_SUPPLEMENTARY(character)) { 423 } else if (U_IS_SUPPLEMENTARY(character)) {
383 if (*a++ != U16_LEAD(character)) 424 if (*a++ != U16_LEAD(character))
384 return false; 425 return false;
385 if (*a++ != U16_TRAIL(character)) 426 if (*a++ != U16_TRAIL(character))
386 return false; 427 return false;
387 } else 428 } else {
388 return false; 429 return false;
430 }
389 } 431 }
390 432
391 return a == aEnd; 433 return a == aEnd;
392 } 434 }
393 435
394 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd) 436 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)
395 { 437 {
396 return equalWithUTF8Internal(a, aEnd, b, bEnd); 438 return equalWithUTF8Internal(a, aEnd, b, bEnd);
397 } 439 }
398 440
399 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd) 441 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)
400 { 442 {
401 return equalWithUTF8Internal(a, aEnd, b, bEnd); 443 return equalWithUTF8Internal(a, aEnd, b, bEnd);
402 } 444 }
403 445
404 } // namespace Unicode 446 } // namespace Unicode
405 } // namespace WTF 447 } // namespace WTF
OLDNEW
« no previous file with comments | « third_party/WebKit/Source/wtf/text/UTF8.h ('k') | third_party/WebKit/Source/wtf/text/Unicode.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698