Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(527)

Side by Side Diff: third_party/WebKit/Source/wtf/text/UTF8.cpp

Issue 1436153002: Apply clang-format with Chromium-style without column limit. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/WebKit/Source/wtf/text/UTF8.h ('k') | third_party/WebKit/Source/wtf/text/Unicode.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2007 Apple Inc. All rights reserved. 2 * Copyright (C) 2007 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> 3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
4 * 4 *
5 * Redistribution and use in source and binary forms, with or without 5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions 6 * modification, are permitted provided that the following conditions
7 * are met: 7 * are met:
8 * 1. Redistributions of source code must retain the above copyright 8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer. 9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright 10 * 2. Redistributions in binary form must reproduce the above copyright
(...skipping 16 matching lines...) Expand all
27 #include "config.h" 27 #include "config.h"
28 #include "wtf/text/UTF8.h" 28 #include "wtf/text/UTF8.h"
29 29
30 #include "wtf/ASCIICType.h" 30 #include "wtf/ASCIICType.h"
31 #include "wtf/StringHasher.h" 31 #include "wtf/StringHasher.h"
32 #include "wtf/text/CharacterNames.h" 32 #include "wtf/text/CharacterNames.h"
33 33
34 namespace WTF { 34 namespace WTF {
35 namespace Unicode { 35 namespace Unicode {
36 36
37 inline int inlineUTF8SequenceLengthNonASCII(char b0) 37 inline int inlineUTF8SequenceLengthNonASCII(char b0) {
38 { 38 if ((b0 & 0xC0) != 0xC0)
39 if ((b0 & 0xC0) != 0xC0)
40 return 0;
41 if ((b0 & 0xE0) == 0xC0)
42 return 2;
43 if ((b0 & 0xF0) == 0xE0)
44 return 3;
45 if ((b0 & 0xF8) == 0xF0)
46 return 4;
47 return 0; 39 return 0;
40 if ((b0 & 0xE0) == 0xC0)
41 return 2;
42 if ((b0 & 0xF0) == 0xE0)
43 return 3;
44 if ((b0 & 0xF8) == 0xF0)
45 return 4;
46 return 0;
48 } 47 }
49 48
50 inline int inlineUTF8SequenceLength(char b0) 49 inline int inlineUTF8SequenceLength(char b0) {
51 { 50 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
52 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
53 } 51 }
54 52
55 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 53 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
56 // into the first byte, depending on how many bytes follow. There are 54 // into the first byte, depending on how many bytes follow. There are
57 // as many entries in this table as there are UTF-8 sequence types. 55 // as many entries in this table as there are UTF-8 sequence types.
58 // (I.e., one byte sequence, two byte... etc.). Remember that sequences 56 // (I.e., one byte sequence, two byte... etc.). Remember that sequences
59 // for *legal* UTF-8 will be 4 or fewer bytes total. 57 // for *legal* UTF-8 will be 4 or fewer bytes total.
60 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 58 static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
61 59
62 ConversionResult convertLatin1ToUTF8( 60 ConversionResult convertLatin1ToUTF8(
63 const LChar** sourceStart, const LChar* sourceEnd, 61 const LChar** sourceStart,
64 char** targetStart, char* targetEnd) 62 const LChar* sourceEnd,
65 { 63 char** targetStart,
66 ConversionResult result = conversionOK; 64 char* targetEnd) {
67 const LChar* source = *sourceStart; 65 ConversionResult result = conversionOK;
68 char* target = *targetStart; 66 const LChar* source = *sourceStart;
69 while (source < sourceEnd) { 67 char* target = *targetStart;
70 UChar32 ch; 68 while (source < sourceEnd) {
71 unsigned short bytesToWrite = 0; 69 UChar32 ch;
72 const UChar32 byteMask = 0xBF; 70 unsigned short bytesToWrite = 0;
73 const UChar32 byteMark = 0x80; 71 const UChar32 byteMask = 0xBF;
74 const LChar* oldSource = source; // In case we have to back up because of target overflow. 72 const UChar32 byteMark = 0x80;
75 ch = static_cast<unsigned short>(*source++); 73 const LChar* oldSource = source; // In case we have to back up because of target overflow.
76 74 ch = static_cast<unsigned short>(*source++);
77 // Figure out how many bytes the result will require 75
78 if (ch < (UChar32)0x80) 76 // Figure out how many bytes the result will require
79 bytesToWrite = 1; 77 if (ch < (UChar32)0x80)
80 else 78 bytesToWrite = 1;
81 bytesToWrite = 2; 79 else
82 80 bytesToWrite = 2;
83 target += bytesToWrite; 81
84 if (target > targetEnd) { 82 target += bytesToWrite;
85 source = oldSource; // Back up source pointer! 83 if (target > targetEnd) {
86 target -= bytesToWrite; 84 source = oldSource; // Back up source pointer!
87 result = targetExhausted; 85 target -= bytesToWrite;
88 break; 86 result = targetExhausted;
87 break;
88 }
89 switch (bytesToWrite) { // note: everything falls through.
90 case 2:
91 *--target = (char)((ch | byteMark) & byteMask);
92 ch >>= 6;
93 case 1:
94 *--target = (char)(ch | firstByteMark[bytesToWrite]);
95 }
96 target += bytesToWrite;
97 }
98 *sourceStart = source;
99 *targetStart = target;
100 return result;
101 }
102
103 ConversionResult convertUTF16ToUTF8(
104 const UChar** sourceStart,
105 const UChar* sourceEnd,
106 char** targetStart,
107 char* targetEnd,
108 bool strict) {
109 ConversionResult result = conversionOK;
110 const UChar* source = *sourceStart;
111 char* target = *targetStart;
112 while (source < sourceEnd) {
113 UChar32 ch;
114 unsigned short bytesToWrite = 0;
115 const UChar32 byteMask = 0xBF;
116 const UChar32 byteMark = 0x80;
117 const UChar* oldSource = source; // In case we have to back up because of target overflow.
118 ch = static_cast<unsigned short>(*source++);
119 // If we have a surrogate pair, convert to UChar32 first.
120 if (ch >= 0xD800 && ch <= 0xDBFF) {
121 // If the 16 bits following the high surrogate are in the source buffer...
122 if (source < sourceEnd) {
123 UChar32 ch2 = static_cast<unsigned short>(*source);
124 // If it's a low surrogate, convert to UChar32.
125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
126 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
127 ++source;
128 } else if (strict) { // it's an unpaired high surrogate
129 --source; // return to the illegal value itself
130 result = sourceIllegal;
131 break;
89 } 132 }
90 switch (bytesToWrite) { // note: everything falls through. 133 } else { // We don't have the 16 bits following the high surrogate.
91 case 2: 134 --source; // return to the high surrogate
92 *--target = (char)((ch | byteMark) & byteMask); 135 result = sourceExhausted;
93 ch >>= 6; 136 break;
94 case 1: 137 }
95 *--target = (char)(ch | firstByteMark[bytesToWrite]); 138 } else if (strict) {
96 } 139 // UTF-16 surrogate values are illegal in UTF-32
97 target += bytesToWrite; 140 if (ch >= 0xDC00 && ch <= 0xDFFF) {
98 } 141 --source; // return to the illegal value itself
99 *sourceStart = source; 142 result = sourceIllegal;
100 *targetStart = target; 143 break;
101 return result; 144 }
102 } 145 }
103 146 // Figure out how many bytes the result will require
104 ConversionResult convertUTF16ToUTF8( 147 if (ch < (UChar32)0x80) {
105 const UChar** sourceStart, const UChar* sourceEnd, 148 bytesToWrite = 1;
106 char** targetStart, char* targetEnd, bool strict) 149 } else if (ch < (UChar32)0x800) {
107 { 150 bytesToWrite = 2;
108 ConversionResult result = conversionOK; 151 } else if (ch < (UChar32)0x10000) {
109 const UChar* source = *sourceStart; 152 bytesToWrite = 3;
110 char* target = *targetStart; 153 } else if (ch < (UChar32)0x110000) {
111 while (source < sourceEnd) { 154 bytesToWrite = 4;
112 UChar32 ch; 155 } else {
113 unsigned short bytesToWrite = 0; 156 bytesToWrite = 3;
114 const UChar32 byteMask = 0xBF; 157 ch = replacementCharacter;
115 const UChar32 byteMark = 0x80; 158 }
116 const UChar* oldSource = source; // In case we have to back up because of target overflow. 159
117 ch = static_cast<unsigned short>(*source++); 160 target += bytesToWrite;
118 // If we have a surrogate pair, convert to UChar32 first. 161 if (target > targetEnd) {
119 if (ch >= 0xD800 && ch <= 0xDBFF) { 162 source = oldSource; // Back up source pointer!
120 // If the 16 bits following the high surrogate are in the source buffer... 163 target -= bytesToWrite;
121 if (source < sourceEnd) { 164 result = targetExhausted;
122 UChar32 ch2 = static_cast<unsigned short>(*source); 165 break;
123 // If it's a low surrogate, convert to UChar32. 166 }
124 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 167 switch (bytesToWrite) { // note: everything falls through.
125 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; 168 case 4:
126 ++source; 169 *--target = (char)((ch | byteMark) & byteMask);
127 } else if (strict) { // it's an unpaired high surrogate 170 ch >>= 6;
128 --source; // return to the illegal value itself 171 case 3:
129 result = sourceIllegal; 172 *--target = (char)((ch | byteMark) & byteMask);
130 break; 173 ch >>= 6;
131 } 174 case 2:
132 } else { // We don't have the 16 bits following the high surrogate. 175 *--target = (char)((ch | byteMark) & byteMask);
133 --source; // return to the high surrogate 176 ch >>= 6;
134 result = sourceExhausted; 177 case 1:
135 break; 178 *--target = (char)(ch | firstByteMark[bytesToWrite]);
136 } 179 }
137 } else if (strict) { 180 target += bytesToWrite;
138 // UTF-16 surrogate values are illegal in UTF-32 181 }
139 if (ch >= 0xDC00 && ch <= 0xDFFF) { 182 *sourceStart = source;
140 --source; // return to the illegal value itself 183 *targetStart = target;
141 result = sourceIllegal; 184 return result;
142 break;
143 }
144 }
145 // Figure out how many bytes the result will require
146 if (ch < (UChar32)0x80) {
147 bytesToWrite = 1;
148 } else if (ch < (UChar32)0x800) {
149 bytesToWrite = 2;
150 } else if (ch < (UChar32)0x10000) {
151 bytesToWrite = 3;
152 } else if (ch < (UChar32)0x110000) {
153 bytesToWrite = 4;
154 } else {
155 bytesToWrite = 3;
156 ch = replacementCharacter;
157 }
158
159 target += bytesToWrite;
160 if (target > targetEnd) {
161 source = oldSource; // Back up source pointer!
162 target -= bytesToWrite;
163 result = targetExhausted;
164 break;
165 }
166 switch (bytesToWrite) { // note: everything falls through.
167 case 4:
168 *--target = (char)((ch | byteMark) & byteMask);
169 ch >>= 6;
170 case 3:
171 *--target = (char)((ch | byteMark) & byteMask);
172 ch >>= 6;
173 case 2:
174 *--target = (char)((ch | byteMark) & byteMask);
175 ch >>= 6;
176 case 1:
177 *--target = (char)(ch | firstByteMark[bytesToWrite]);
178 }
179 target += bytesToWrite;
180 }
181 *sourceStart = source;
182 *targetStart = target;
183 return result;
184 } 185 }
185 186
186 // This must be called with the length pre-determined by the first byte. 187 // This must be called with the length pre-determined by the first byte.
187 // If presented with a length > 4, this returns false. The Unicode 188 // If presented with a length > 4, this returns false. The Unicode
188 // definition of UTF-8 goes up to 4-byte sequences. 189 // definition of UTF-8 goes up to 4-byte sequences.
189 static bool isLegalUTF8(const unsigned char* source, int length) 190 static bool isLegalUTF8(const unsigned char* source, int length) {
190 { 191 unsigned char a;
191 unsigned char a; 192 const unsigned char* srcptr = source + length;
192 const unsigned char* srcptr = source + length; 193 switch (length) {
193 switch (length) {
194 default: 194 default:
195 return false; 195 return false;
196 // Everything else falls through when "true"... 196 // Everything else falls through when "true"...
197 case 4: 197 case 4:
198 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) 198 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
199 return false; 199 return false;
200 case 3: 200 case 3:
201 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) 201 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
202 return false; 202 return false;
203 case 2: 203 case 2:
204 if ((a = (*--srcptr)) > 0xBF) 204 if ((a = (*--srcptr)) > 0xBF)
205 return false; 205 return false;
206 206
207 // no fall-through in this inner switch 207 // no fall-through in this inner switch
208 switch (*source) { 208 switch (*source) {
209 case 0xE0: 209 case 0xE0:
210 if (a < 0xA0) 210 if (a < 0xA0)
211 return false; 211 return false;
212 break; 212 break;
213 case 0xED: 213 case 0xED:
214 if (a > 0x9F) 214 if (a > 0x9F)
215 return false; 215 return false;
216 break; 216 break;
217 case 0xF0: 217 case 0xF0:
218 if (a < 0x90) 218 if (a < 0x90)
219 return false; 219 return false;
220 break; 220 break;
221 case 0xF4: 221 case 0xF4:
222 if (a > 0x8F) 222 if (a > 0x8F)
223 return false; 223 return false;
224 break; 224 break;
225 default: 225 default:
226 if (a < 0x80) 226 if (a < 0x80)
227 return false; 227 return false;
228 } 228 }
229 229
230 case 1: 230 case 1:
231 if (*source >= 0x80 && *source < 0xC2) 231 if (*source >= 0x80 && *source < 0xC2)
232 return false; 232 return false;
233 } 233 }
234 if (*source > 0xF4) 234 if (*source > 0xF4)
235 return false; 235 return false;
236 return true; 236 return true;
237 } 237 }
238 238
239 // Magic values subtracted from a buffer value during UTF8 conversion. 239 // Magic values subtracted from a buffer value during UTF8 conversion.
240 // This table contains as many values as there might be trailing bytes 240 // This table contains as many values as there might be trailing bytes
241 // in a UTF-8 sequence. 241 // in a UTF-8 sequence.
242 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) }; 242 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL)};
243 243
244 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) 244 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) {
245 { 245 UChar32 character = 0;
246 UChar32 character = 0; 246
247 247 // The cases all fall through.
248 // The cases all fall through. 248 switch (length) {
249 switch (length) {
250 case 6: 249 case 6:
251 character += static_cast<unsigned char>(*sequence++); 250 character += static_cast<unsigned char>(*sequence++);
252 character <<= 6; 251 character <<= 6;
253 case 5: 252 case 5:
254 character += static_cast<unsigned char>(*sequence++); 253 character += static_cast<unsigned char>(*sequence++);
255 character <<= 6; 254 character <<= 6;
256 case 4: 255 case 4:
257 character += static_cast<unsigned char>(*sequence++); 256 character += static_cast<unsigned char>(*sequence++);
258 character <<= 6; 257 character <<= 6;
259 case 3: 258 case 3:
260 character += static_cast<unsigned char>(*sequence++); 259 character += static_cast<unsigned char>(*sequence++);
261 character <<= 6; 260 character <<= 6;
262 case 2: 261 case 2:
263 character += static_cast<unsigned char>(*sequence++); 262 character += static_cast<unsigned char>(*sequence++);
264 character <<= 6; 263 character <<= 6;
265 case 1: 264 case 1:
266 character += static_cast<unsigned char>(*sequence++); 265 character += static_cast<unsigned char>(*sequence++);
267 } 266 }
268 267
269 return character - offsetsFromUTF8[length - 1]; 268 return character - offsetsFromUTF8[length - 1];
270 } 269 }
271 270
272 ConversionResult convertUTF8ToUTF16( 271 ConversionResult convertUTF8ToUTF16(
273 const char** sourceStart, const char* sourceEnd, 272 const char** sourceStart,
274 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict) 273 const char* sourceEnd,
275 { 274 UChar** targetStart,
276 ConversionResult result = conversionOK; 275 UChar* targetEnd,
277 const char* source = *sourceStart; 276 bool* sourceAllASCII,
278 UChar* target = *targetStart; 277 bool strict) {
279 UChar orAllData = 0; 278 ConversionResult result = conversionOK;
280 while (source < sourceEnd) { 279 const char* source = *sourceStart;
281 int utf8SequenceLength = inlineUTF8SequenceLength(*source); 280 UChar* target = *targetStart;
282 if (sourceEnd - source < utf8SequenceLength) { 281 UChar orAllData = 0;
283 result = sourceExhausted; 282 while (source < sourceEnd) {
284 break; 283 int utf8SequenceLength = inlineUTF8SequenceLength(*source);
284 if (sourceEnd - source < utf8SequenceLength) {
285 result = sourceExhausted;
286 break;
287 }
288 // Do this check whether lenient or strict
289 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
290 result = sourceIllegal;
291 break;
292 }
293
294 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
295
296 if (target >= targetEnd) {
297 source -= utf8SequenceLength; // Back up source pointer!
298 result = targetExhausted;
299 break;
300 }
301
302 if (U_IS_BMP(character)) {
303 // UTF-16 surrogate values are illegal in UTF-32
304 if (U_IS_SURROGATE(character)) {
305 if (strict) {
306 source -= utf8SequenceLength; // return to the illegal value itself
307 result = sourceIllegal;
308 break;
285 } 309 }
286 // Do this check whether lenient or strict 310 *target++ = replacementCharacter;
287 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) { 311 orAllData |= replacementCharacter;
288 result = sourceIllegal; 312 } else {
289 break; 313 *target++ = static_cast<UChar>(character); // normal case
290 } 314 orAllData |= character;
291 315 }
292 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); 316 } else if (U_IS_SUPPLEMENTARY(character)) {
293 317 // target is a character in range 0xFFFF - 0x10FFFF
294 if (target >= targetEnd) { 318 if (target + 1 >= targetEnd) {
295 source -= utf8SequenceLength; // Back up source pointer! 319 source -= utf8SequenceLength; // Back up source pointer!
296 result = targetExhausted; 320 result = targetExhausted;
297 break; 321 break;
298 } 322 }
299 323 *target++ = U16_LEAD(character);
300 if (U_IS_BMP(character)) { 324 *target++ = U16_TRAIL(character);
301 // UTF-16 surrogate values are illegal in UTF-32 325 orAllData = 0xffff;
302 if (U_IS_SURROGATE(character)) { 326 } else {
303 if (strict) { 327 if (strict) {
304 source -= utf8SequenceLength; // return to the illegal value itself 328 source -= utf8SequenceLength; // return to the start
305 result = sourceIllegal; 329 result = sourceIllegal;
306 break; 330 break; // Bail out; shouldn't continue
307 } 331 } else {
308 *target++ = replacementCharacter; 332 *target++ = replacementCharacter;
309 orAllData |= replacementCharacter; 333 orAllData |= replacementCharacter;
310 } else { 334 }
311 *target++ = static_cast<UChar>(character); // normal case 335 }
312 orAllData |= character; 336 }
313 } 337 *sourceStart = source;
314 } else if (U_IS_SUPPLEMENTARY(character)) { 338 *targetStart = target;
315 // target is a character in range 0xFFFF - 0x10FFFF 339
316 if (target + 1 >= targetEnd) { 340 if (sourceAllASCII)
317 source -= utf8SequenceLength; // Back up source pointer! 341 *sourceAllASCII = !(orAllData & ~0x7f);
318 result = targetExhausted; 342
319 break; 343 return result;
320 } 344 }
321 *target++ = U16_LEAD(character); 345
322 *target++ = U16_TRAIL(character); 346 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length) {
323 orAllData = 0xffff; 347 if (!data)
324 } else { 348 return 0;
325 if (strict) { 349
326 source -= utf8SequenceLength; // return to the start 350 StringHasher stringHasher;
327 result = sourceIllegal; 351 dataLength = 0;
328 break; // Bail out; shouldn't continue 352 utf16Length = 0;
329 } else { 353
330 *target++ = replacementCharacter; 354 while (data < dataEnd || (!dataEnd && *data)) {
331 orAllData |= replacementCharacter; 355 if (isASCII(*data)) {
332 } 356 stringHasher.addCharacter(*data++);
333 } 357 dataLength++;
334 } 358 utf16Length++;
335 *sourceStart = source; 359 continue;
336 *targetStart = target; 360 }
337 361
338 if (sourceAllASCII) 362 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
339 *sourceAllASCII = !(orAllData & ~0x7f); 363 dataLength += utf8SequenceLength;
340 364
341 return result; 365 if (!dataEnd) {
342 } 366 for (int i = 1; i < utf8SequenceLength; ++i) {
343 367 if (!data[i])
344 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length) 368 return 0;
345 { 369 }
346 if (!data) 370 } else if (dataEnd - data < utf8SequenceLength) {
371 return 0;
372 }
373
374 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
375 return 0;
376
377 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
378 ASSERT(!isASCII(character));
379
380 if (U_IS_BMP(character)) {
381 // UTF-16 surrogate values are illegal in UTF-32
382 if (U_IS_SURROGATE(character))
347 return 0; 383 return 0;
348 384 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
349 StringHasher stringHasher; 385 utf16Length++;
350 dataLength = 0; 386 } else if (U_IS_SUPPLEMENTARY(character)) {
351 utf16Length = 0; 387 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character)));
352 388 utf16Length += 2;
353 while (data < dataEnd || (!dataEnd && *data)) { 389 } else {
354 if (isASCII(*data)) { 390 return 0;
355 stringHasher.addCharacter(*data++); 391 }
356 dataLength++; 392 }
357 utf16Length++; 393
358 continue; 394 return stringHasher.hashWithTop8BitsMasked();
359 } 395 }
360 396
361 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); 397 template <typename CharType>
362 dataLength += utf8SequenceLength; 398 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd) {
363 399 while (b < bEnd) {
364 if (!dataEnd) { 400 if (isASCII(*b)) {
365 for (int i = 1; i < utf8SequenceLength; ++i) { 401 if (*a++ != *b++)
366 if (!data[i]) 402 return false;
367 return 0; 403 continue;
368 } 404 }
369 } else if (dataEnd - data < utf8SequenceLength) { 405
370 return 0; 406 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
371 } 407
372 408 if (bEnd - b < utf8SequenceLength)
373 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength)) 409 return false;
374 return 0; 410
375 411 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
376 UChar32 character = readUTF8Sequence(data, utf8SequenceLength); 412 return 0;
377 ASSERT(!isASCII(character)); 413
378 414 UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
379 if (U_IS_BMP(character)) { 415 ASSERT(!isASCII(character));
380 // UTF-16 surrogate values are illegal in UTF-32 416
381 if (U_IS_SURROGATE(character)) 417 if (U_IS_BMP(character)) {
382 return 0; 418 // UTF-16 surrogate values are illegal in UTF-32
383 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case 419 if (U_IS_SURROGATE(character))
384 utf16Length++; 420 return false;
385 } else if (U_IS_SUPPLEMENTARY(character)) { 421 if (*a++ != character)
386 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character))); 422 return false;
387 utf16Length += 2; 423 } else if (U_IS_SUPPLEMENTARY(character)) {
388 } else { 424 if (*a++ != U16_LEAD(character))
389 return 0; 425 return false;
390 } 426 if (*a++ != U16_TRAIL(character))
391 } 427 return false;
392 428 } else {
393 return stringHasher.hashWithTop8BitsMasked(); 429 return false;
394 } 430 }
395 431 }
396 template<typename CharType> 432
397 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd) 433 return a == aEnd;
398 { 434 }
399 while (b < bEnd) { 435
400 if (isASCII(*b)) { 436 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd) {
401 if (*a++ != *b++) 437 return equalWithUTF8Internal(a, aEnd, b, bEnd);
402 return false; 438 }
403 continue; 439
404 } 440 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd) {
405 441 return equalWithUTF8Internal(a, aEnd, b, bEnd);
406 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b); 442 }
407 443
408 if (bEnd - b < utf8SequenceLength) 444 } // namespace Unicode
409 return false; 445 } // namespace WTF
410
411 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
412 return 0;
413
414 UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
415 ASSERT(!isASCII(character));
416
417 if (U_IS_BMP(character)) {
418 // UTF-16 surrogate values are illegal in UTF-32
419 if (U_IS_SURROGATE(character))
420 return false;
421 if (*a++ != character)
422 return false;
423 } else if (U_IS_SUPPLEMENTARY(character)) {
424 if (*a++ != U16_LEAD(character))
425 return false;
426 if (*a++ != U16_TRAIL(character))
427 return false;
428 } else {
429 return false;
430 }
431 }
432
433 return a == aEnd;
434 }
435
436 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)
437 {
438 return equalWithUTF8Internal(a, aEnd, b, bEnd);
439 }
440
441 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)
442 {
443 return equalWithUTF8Internal(a, aEnd, b, bEnd);
444 }
445
446 } // namespace Unicode
447 } // namespace WTF
OLDNEW
« no previous file with comments | « third_party/WebKit/Source/wtf/text/UTF8.h ('k') | third_party/WebKit/Source/wtf/text/Unicode.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698