Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(33)

Side by Side Diff: third_party/WebKit/Source/wtf/text/UTF8.cpp

Issue 1611343002: wtf reformat test Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: pydent Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/WebKit/Source/wtf/text/UTF8.h ('k') | third_party/WebKit/Source/wtf/text/Unicode.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2007 Apple Inc. All rights reserved. 2 * Copyright (C) 2007 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> 3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
4 * 4 *
5 * Redistribution and use in source and binary forms, with or without 5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions 6 * modification, are permitted provided that the following conditions
7 * are met: 7 * are met:
8 * 1. Redistributions of source code must retain the above copyright 8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer. 9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright 10 * 2. Redistributions in binary form must reproduce the above copyright
(...skipping 15 matching lines...) Expand all
26 26
27 #include "wtf/text/UTF8.h" 27 #include "wtf/text/UTF8.h"
28 28
29 #include "wtf/ASCIICType.h" 29 #include "wtf/ASCIICType.h"
30 #include "wtf/StringHasher.h" 30 #include "wtf/StringHasher.h"
31 #include "wtf/text/CharacterNames.h" 31 #include "wtf/text/CharacterNames.h"
32 32
33 namespace WTF { 33 namespace WTF {
34 namespace Unicode { 34 namespace Unicode {
35 35
36 inline int inlineUTF8SequenceLengthNonASCII(char b0) 36 inline int inlineUTF8SequenceLengthNonASCII(char b0) {
37 { 37 if ((b0 & 0xC0) != 0xC0)
38 if ((b0 & 0xC0) != 0xC0)
39 return 0;
40 if ((b0 & 0xE0) == 0xC0)
41 return 2;
42 if ((b0 & 0xF0) == 0xE0)
43 return 3;
44 if ((b0 & 0xF8) == 0xF0)
45 return 4;
46 return 0; 38 return 0;
39 if ((b0 & 0xE0) == 0xC0)
40 return 2;
41 if ((b0 & 0xF0) == 0xE0)
42 return 3;
43 if ((b0 & 0xF8) == 0xF0)
44 return 4;
45 return 0;
47 } 46 }
48 47
49 inline int inlineUTF8SequenceLength(char b0) 48 inline int inlineUTF8SequenceLength(char b0) {
50 { 49 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
51 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
52 } 50 }
53 51
54 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 52 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
55 // into the first byte, depending on how many bytes follow. There are 53 // into the first byte, depending on how many bytes follow. There are
56 // as many entries in this table as there are UTF-8 sequence types. 54 // as many entries in this table as there are UTF-8 sequence types.
57 // (I.e., one byte sequence, two byte... etc.). Remember that sequences 55 // (I.e., one byte sequence, two byte... etc.). Remember that sequences
58 // for *legal* UTF-8 will be 4 or fewer bytes total. 56 // for *legal* UTF-8 will be 4 or fewer bytes total.
59 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 57 static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0,
60 58 0xF0, 0xF8, 0xFC};
61 ConversionResult convertLatin1ToUTF8( 59
62 const LChar** sourceStart, const LChar* sourceEnd, 60 ConversionResult convertLatin1ToUTF8(const LChar** sourceStart,
63 char** targetStart, char* targetEnd) 61 const LChar* sourceEnd,
64 { 62 char** targetStart,
65 ConversionResult result = conversionOK; 63 char* targetEnd) {
66 const LChar* source = *sourceStart; 64 ConversionResult result = conversionOK;
67 char* target = *targetStart; 65 const LChar* source = *sourceStart;
68 while (source < sourceEnd) { 66 char* target = *targetStart;
69 UChar32 ch; 67 while (source < sourceEnd) {
70 unsigned short bytesToWrite = 0; 68 UChar32 ch;
71 const UChar32 byteMask = 0xBF; 69 unsigned short bytesToWrite = 0;
72 const UChar32 byteMark = 0x80; 70 const UChar32 byteMask = 0xBF;
73 const LChar* oldSource = source; // In case we have to back up because of target overflow. 71 const UChar32 byteMark = 0x80;
74 ch = static_cast<unsigned short>(*source++); 72 const LChar* oldSource =
75 73 source; // In case we have to back up because of target overflow.
76 // Figure out how many bytes the result will require 74 ch = static_cast<unsigned short>(*source++);
77 if (ch < (UChar32)0x80) 75
78 bytesToWrite = 1; 76 // Figure out how many bytes the result will require
79 else 77 if (ch < (UChar32)0x80)
80 bytesToWrite = 2; 78 bytesToWrite = 1;
81 79 else
82 target += bytesToWrite; 80 bytesToWrite = 2;
83 if (target > targetEnd) { 81
84 source = oldSource; // Back up source pointer! 82 target += bytesToWrite;
85 target -= bytesToWrite; 83 if (target > targetEnd) {
86 result = targetExhausted; 84 source = oldSource; // Back up source pointer!
87 break; 85 target -= bytesToWrite;
86 result = targetExhausted;
87 break;
88 }
89 switch (bytesToWrite) { // note: everything falls through.
90 case 2:
91 *--target = (char)((ch | byteMark) & byteMask);
92 ch >>= 6;
93 case 1:
94 *--target = (char)(ch | firstByteMark[bytesToWrite]);
95 }
96 target += bytesToWrite;
97 }
98 *sourceStart = source;
99 *targetStart = target;
100 return result;
101 }
102
103 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart,
104 const UChar* sourceEnd,
105 char** targetStart,
106 char* targetEnd,
107 bool strict) {
108 ConversionResult result = conversionOK;
109 const UChar* source = *sourceStart;
110 char* target = *targetStart;
111 while (source < sourceEnd) {
112 UChar32 ch;
113 unsigned short bytesToWrite = 0;
114 const UChar32 byteMask = 0xBF;
115 const UChar32 byteMark = 0x80;
116 const UChar* oldSource =
117 source; // In case we have to back up because of target overflow.
118 ch = static_cast<unsigned short>(*source++);
119 // If we have a surrogate pair, convert to UChar32 first.
120 if (ch >= 0xD800 && ch <= 0xDBFF) {
121 // If the 16 bits following the high surrogate are in the source buffer...
122 if (source < sourceEnd) {
123 UChar32 ch2 = static_cast<unsigned short>(*source);
124 // If it's a low surrogate, convert to UChar32.
125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
126 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
127 ++source;
128 } else if (strict) { // it's an unpaired high surrogate
129 --source; // return to the illegal value itself
130 result = sourceIllegal;
131 break;
88 } 132 }
89 switch (bytesToWrite) { // note: everything falls through. 133 } else { // We don't have the 16 bits following the high surrogate.
90 case 2: 134 --source; // return to the high surrogate
91 *--target = (char)((ch | byteMark) & byteMask); 135 result = sourceExhausted;
92 ch >>= 6; 136 break;
93 case 1: 137 }
94 *--target = (char)(ch | firstByteMark[bytesToWrite]); 138 } else if (strict) {
95 } 139 // UTF-16 surrogate values are illegal in UTF-32
96 target += bytesToWrite; 140 if (ch >= 0xDC00 && ch <= 0xDFFF) {
97 } 141 --source; // return to the illegal value itself
98 *sourceStart = source; 142 result = sourceIllegal;
99 *targetStart = target; 143 break;
100 return result; 144 }
101 } 145 }
102 146 // Figure out how many bytes the result will require
103 ConversionResult convertUTF16ToUTF8( 147 if (ch < (UChar32)0x80) {
104 const UChar** sourceStart, const UChar* sourceEnd, 148 bytesToWrite = 1;
105 char** targetStart, char* targetEnd, bool strict) 149 } else if (ch < (UChar32)0x800) {
106 { 150 bytesToWrite = 2;
107 ConversionResult result = conversionOK; 151 } else if (ch < (UChar32)0x10000) {
108 const UChar* source = *sourceStart; 152 bytesToWrite = 3;
109 char* target = *targetStart; 153 } else if (ch < (UChar32)0x110000) {
110 while (source < sourceEnd) { 154 bytesToWrite = 4;
111 UChar32 ch; 155 } else {
112 unsigned short bytesToWrite = 0; 156 bytesToWrite = 3;
113 const UChar32 byteMask = 0xBF; 157 ch = replacementCharacter;
114 const UChar32 byteMark = 0x80; 158 }
115 const UChar* oldSource = source; // In case we have to back up because of target overflow. 159
116 ch = static_cast<unsigned short>(*source++); 160 target += bytesToWrite;
117 // If we have a surrogate pair, convert to UChar32 first. 161 if (target > targetEnd) {
118 if (ch >= 0xD800 && ch <= 0xDBFF) { 162 source = oldSource; // Back up source pointer!
119 // If the 16 bits following the high surrogate are in the source buffer... 163 target -= bytesToWrite;
120 if (source < sourceEnd) { 164 result = targetExhausted;
121 UChar32 ch2 = static_cast<unsigned short>(*source); 165 break;
122 // If it's a low surrogate, convert to UChar32. 166 }
123 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 167 switch (bytesToWrite) { // note: everything falls through.
124 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; 168 case 4:
125 ++source; 169 *--target = (char)((ch | byteMark) & byteMask);
126 } else if (strict) { // it's an unpaired high surrogate 170 ch >>= 6;
127 --source; // return to the illegal value itself 171 case 3:
128 result = sourceIllegal; 172 *--target = (char)((ch | byteMark) & byteMask);
129 break; 173 ch >>= 6;
130 } 174 case 2:
131 } else { // We don't have the 16 bits following the high surrogate. 175 *--target = (char)((ch | byteMark) & byteMask);
132 --source; // return to the high surrogate 176 ch >>= 6;
133 result = sourceExhausted; 177 case 1:
134 break; 178 *--target = (char)(ch | firstByteMark[bytesToWrite]);
135 } 179 }
136 } else if (strict) { 180 target += bytesToWrite;
137 // UTF-16 surrogate values are illegal in UTF-32 181 }
138 if (ch >= 0xDC00 && ch <= 0xDFFF) { 182 *sourceStart = source;
139 --source; // return to the illegal value itself 183 *targetStart = target;
140 result = sourceIllegal; 184 return result;
141 break;
142 }
143 }
144 // Figure out how many bytes the result will require
145 if (ch < (UChar32)0x80) {
146 bytesToWrite = 1;
147 } else if (ch < (UChar32)0x800) {
148 bytesToWrite = 2;
149 } else if (ch < (UChar32)0x10000) {
150 bytesToWrite = 3;
151 } else if (ch < (UChar32)0x110000) {
152 bytesToWrite = 4;
153 } else {
154 bytesToWrite = 3;
155 ch = replacementCharacter;
156 }
157
158 target += bytesToWrite;
159 if (target > targetEnd) {
160 source = oldSource; // Back up source pointer!
161 target -= bytesToWrite;
162 result = targetExhausted;
163 break;
164 }
165 switch (bytesToWrite) { // note: everything falls through.
166 case 4:
167 *--target = (char)((ch | byteMark) & byteMask);
168 ch >>= 6;
169 case 3:
170 *--target = (char)((ch | byteMark) & byteMask);
171 ch >>= 6;
172 case 2:
173 *--target = (char)((ch | byteMark) & byteMask);
174 ch >>= 6;
175 case 1:
176 *--target = (char)(ch | firstByteMark[bytesToWrite]);
177 }
178 target += bytesToWrite;
179 }
180 *sourceStart = source;
181 *targetStart = target;
182 return result;
183 } 185 }
184 186
185 // This must be called with the length pre-determined by the first byte. 187 // This must be called with the length pre-determined by the first byte.
186 // If presented with a length > 4, this returns false. The Unicode 188 // If presented with a length > 4, this returns false. The Unicode
187 // definition of UTF-8 goes up to 4-byte sequences. 189 // definition of UTF-8 goes up to 4-byte sequences.
188 static bool isLegalUTF8(const unsigned char* source, int length) 190 static bool isLegalUTF8(const unsigned char* source, int length) {
189 { 191 unsigned char a;
190 unsigned char a; 192 const unsigned char* srcptr = source + length;
191 const unsigned char* srcptr = source + length; 193 switch (length) {
192 switch (length) {
193 default: 194 default:
194 return false; 195 return false;
195 // Everything else falls through when "true"... 196 // Everything else falls through when "true"...
196 case 4: 197 case 4:
197 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) 198 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
199 return false;
200 case 3:
201 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
202 return false;
203 case 2:
204 if ((a = (*--srcptr)) > 0xBF)
205 return false;
206
207 // no fall-through in this inner switch
208 switch (*source) {
209 case 0xE0:
210 if (a < 0xA0)
198 return false; 211 return false;
199 case 3: 212 break;
200 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) 213 case 0xED:
214 if (a > 0x9F)
201 return false; 215 return false;
202 case 2: 216 break;
203 if ((a = (*--srcptr)) > 0xBF) 217 case 0xF0:
218 if (a < 0x90)
204 return false; 219 return false;
205 220 break;
206 // no fall-through in this inner switch
207 switch (*source) {
208 case 0xE0:
209 if (a < 0xA0)
210 return false;
211 break;
212 case 0xED:
213 if (a > 0x9F)
214 return false;
215 break;
216 case 0xF0:
217 if (a < 0x90)
218 return false;
219 break;
220 case 0xF4: 221 case 0xF4:
221 if (a > 0x8F) 222 if (a > 0x8F)
222 return false; 223 return false;
223 break; 224 break;
224 default: 225 default:
225 if (a < 0x80) 226 if (a < 0x80)
226 return false; 227 return false;
227 } 228 }
228 229
229 case 1: 230 case 1:
230 if (*source >= 0x80 && *source < 0xC2) 231 if (*source >= 0x80 && *source < 0xC2)
231 return false; 232 return false;
232 } 233 }
233 if (*source > 0xF4) 234 if (*source > 0xF4)
234 return false; 235 return false;
235 return true; 236 return true;
236 } 237 }
237 238
238 // Magic values subtracted from a buffer value during UTF8 conversion. 239 // Magic values subtracted from a buffer value during UTF8 conversion.
239 // This table contains as many values as there might be trailing bytes 240 // This table contains as many values as there might be trailing bytes
240 // in a UTF-8 sequence. 241 // in a UTF-8 sequence.
241 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) }; 242 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL,
242 243 0x00003080UL,
243 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) 244 0x000E2080UL,
244 { 245 0x03C82080UL,
245 UChar32 character = 0; 246 static_cast<UChar32>(0xFA082080UL),
246 247 static_cast<UChar32>(0x82082080UL)};
247 // The cases all fall through. 248
248 switch (length) { 249 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) {
250 UChar32 character = 0;
251
252 // The cases all fall through.
253 switch (length) {
249 case 6: 254 case 6:
250 character += static_cast<unsigned char>(*sequence++); 255 character += static_cast<unsigned char>(*sequence++);
251 character <<= 6; 256 character <<= 6;
252 case 5: 257 case 5:
253 character += static_cast<unsigned char>(*sequence++); 258 character += static_cast<unsigned char>(*sequence++);
254 character <<= 6; 259 character <<= 6;
255 case 4: 260 case 4:
256 character += static_cast<unsigned char>(*sequence++); 261 character += static_cast<unsigned char>(*sequence++);
257 character <<= 6; 262 character <<= 6;
258 case 3: 263 case 3:
259 character += static_cast<unsigned char>(*sequence++); 264 character += static_cast<unsigned char>(*sequence++);
260 character <<= 6; 265 character <<= 6;
261 case 2: 266 case 2:
262 character += static_cast<unsigned char>(*sequence++); 267 character += static_cast<unsigned char>(*sequence++);
263 character <<= 6; 268 character <<= 6;
264 case 1: 269 case 1:
265 character += static_cast<unsigned char>(*sequence++); 270 character += static_cast<unsigned char>(*sequence++);
266 } 271 }
267 272
268 return character - offsetsFromUTF8[length - 1]; 273 return character - offsetsFromUTF8[length - 1];
269 } 274 }
270 275
271 ConversionResult convertUTF8ToUTF16( 276 ConversionResult convertUTF8ToUTF16(const char** sourceStart,
272 const char** sourceStart, const char* sourceEnd, 277 const char* sourceEnd,
273 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict) 278 UChar** targetStart,
274 { 279 UChar* targetEnd,
275 ConversionResult result = conversionOK; 280 bool* sourceAllASCII,
276 const char* source = *sourceStart; 281 bool strict) {
277 UChar* target = *targetStart; 282 ConversionResult result = conversionOK;
278 UChar orAllData = 0; 283 const char* source = *sourceStart;
279 while (source < sourceEnd) { 284 UChar* target = *targetStart;
280 int utf8SequenceLength = inlineUTF8SequenceLength(*source); 285 UChar orAllData = 0;
281 if (sourceEnd - source < utf8SequenceLength) { 286 while (source < sourceEnd) {
282 result = sourceExhausted; 287 int utf8SequenceLength = inlineUTF8SequenceLength(*source);
283 break; 288 if (sourceEnd - source < utf8SequenceLength) {
289 result = sourceExhausted;
290 break;
291 }
292 // Do this check whether lenient or strict
293 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source),
294 utf8SequenceLength)) {
295 result = sourceIllegal;
296 break;
297 }
298
299 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
300
301 if (target >= targetEnd) {
302 source -= utf8SequenceLength; // Back up source pointer!
303 result = targetExhausted;
304 break;
305 }
306
307 if (U_IS_BMP(character)) {
308 // UTF-16 surrogate values are illegal in UTF-32
309 if (U_IS_SURROGATE(character)) {
310 if (strict) {
311 source -= utf8SequenceLength; // return to the illegal value itself
312 result = sourceIllegal;
313 break;
284 } 314 }
285 // Do this check whether lenient or strict 315 *target++ = replacementCharacter;
286 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) { 316 orAllData |= replacementCharacter;
287 result = sourceIllegal; 317 } else {
288 break; 318 *target++ = static_cast<UChar>(character); // normal case
289 } 319 orAllData |= character;
290 320 }
291 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); 321 } else if (U_IS_SUPPLEMENTARY(character)) {
292 322 // target is a character in range 0xFFFF - 0x10FFFF
293 if (target >= targetEnd) { 323 if (target + 1 >= targetEnd) {
294 source -= utf8SequenceLength; // Back up source pointer! 324 source -= utf8SequenceLength; // Back up source pointer!
295 result = targetExhausted; 325 result = targetExhausted;
296 break; 326 break;
297 } 327 }
298 328 *target++ = U16_LEAD(character);
299 if (U_IS_BMP(character)) { 329 *target++ = U16_TRAIL(character);
300 // UTF-16 surrogate values are illegal in UTF-32 330 orAllData = 0xffff;
301 if (U_IS_SURROGATE(character)) { 331 } else {
302 if (strict) { 332 if (strict) {
303 source -= utf8SequenceLength; // return to the illegal value itself 333 source -= utf8SequenceLength; // return to the start
304 result = sourceIllegal; 334 result = sourceIllegal;
305 break; 335 break; // Bail out; shouldn't continue
306 } 336 } else {
307 *target++ = replacementCharacter; 337 *target++ = replacementCharacter;
308 orAllData |= replacementCharacter; 338 orAllData |= replacementCharacter;
309 } else { 339 }
310 *target++ = static_cast<UChar>(character); // normal case 340 }
311 orAllData |= character; 341 }
312 } 342 *sourceStart = source;
313 } else if (U_IS_SUPPLEMENTARY(character)) { 343 *targetStart = target;
314 // target is a character in range 0xFFFF - 0x10FFFF 344
315 if (target + 1 >= targetEnd) { 345 if (sourceAllASCII)
316 source -= utf8SequenceLength; // Back up source pointer! 346 *sourceAllASCII = !(orAllData & ~0x7f);
317 result = targetExhausted; 347
318 break; 348 return result;
319 } 349 }
320 *target++ = U16_LEAD(character); 350
321 *target++ = U16_TRAIL(character); 351 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(
322 orAllData = 0xffff; 352 const char* data,
323 } else { 353 const char* dataEnd,
324 if (strict) { 354 unsigned& dataLength,
325 source -= utf8SequenceLength; // return to the start 355 unsigned& utf16Length) {
326 result = sourceIllegal; 356 if (!data)
327 break; // Bail out; shouldn't continue 357 return 0;
328 } else { 358
329 *target++ = replacementCharacter; 359 StringHasher stringHasher;
330 orAllData |= replacementCharacter; 360 dataLength = 0;
331 } 361 utf16Length = 0;
332 } 362
333 } 363 while (data < dataEnd || (!dataEnd && *data)) {
334 *sourceStart = source; 364 if (isASCII(*data)) {
335 *targetStart = target; 365 stringHasher.addCharacter(*data++);
336 366 dataLength++;
337 if (sourceAllASCII) 367 utf16Length++;
338 *sourceAllASCII = !(orAllData & ~0x7f); 368 continue;
339 369 }
340 return result; 370
341 } 371 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
342 372 dataLength += utf8SequenceLength;
343 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length) 373
344 { 374 if (!dataEnd) {
345 if (!data) 375 for (int i = 1; i < utf8SequenceLength; ++i) {
376 if (!data[i])
377 return 0;
378 }
379 } else if (dataEnd - data < utf8SequenceLength) {
380 return 0;
381 }
382
383 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data),
384 utf8SequenceLength))
385 return 0;
386
387 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
388 ASSERT(!isASCII(character));
389
390 if (U_IS_BMP(character)) {
391 // UTF-16 surrogate values are illegal in UTF-32
392 if (U_IS_SURROGATE(character))
346 return 0; 393 return 0;
347 394 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
348 StringHasher stringHasher; 395 utf16Length++;
349 dataLength = 0; 396 } else if (U_IS_SUPPLEMENTARY(character)) {
350 utf16Length = 0; 397 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
351 398 static_cast<UChar>(U16_TRAIL(character)));
352 while (data < dataEnd || (!dataEnd && *data)) { 399 utf16Length += 2;
353 if (isASCII(*data)) { 400 } else {
354 stringHasher.addCharacter(*data++); 401 return 0;
355 dataLength++; 402 }
356 utf16Length++; 403 }
357 continue; 404
358 } 405 return stringHasher.hashWithTop8BitsMasked();
359 406 }
360 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); 407
361 dataLength += utf8SequenceLength; 408 template <typename CharType>
362 409 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a,
363 if (!dataEnd) { 410 const CharType* aEnd,
364 for (int i = 1; i < utf8SequenceLength; ++i) { 411 const char* b,
365 if (!data[i]) 412 const char* bEnd) {
366 return 0; 413 while (b < bEnd) {
367 } 414 if (isASCII(*b)) {
368 } else if (dataEnd - data < utf8SequenceLength) { 415 if (*a++ != *b++)
369 return 0; 416 return false;
370 } 417 continue;
371 418 }
372 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength)) 419
373 return 0; 420 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
374 421
375 UChar32 character = readUTF8Sequence(data, utf8SequenceLength); 422 if (bEnd - b < utf8SequenceLength)
376 ASSERT(!isASCII(character)); 423 return false;
377 424
378 if (U_IS_BMP(character)) { 425 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b),
379 // UTF-16 surrogate values are illegal in UTF-32 426 utf8SequenceLength))
380 if (U_IS_SURROGATE(character)) 427 return 0;
381 return 0; 428
382 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case 429 UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
383 utf16Length++; 430 ASSERT(!isASCII(character));
384 } else if (U_IS_SUPPLEMENTARY(character)) { 431
385 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character))); 432 if (U_IS_BMP(character)) {
386 utf16Length += 2; 433 // UTF-16 surrogate values are illegal in UTF-32
387 } else { 434 if (U_IS_SURROGATE(character))
388 return 0; 435 return false;
389 } 436 if (*a++ != character)
390 } 437 return false;
391 438 } else if (U_IS_SUPPLEMENTARY(character)) {
392 return stringHasher.hashWithTop8BitsMasked(); 439 if (*a++ != U16_LEAD(character))
393 } 440 return false;
394 441 if (*a++ != U16_TRAIL(character))
395 template<typename CharType> 442 return false;
396 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd) 443 } else {
397 { 444 return false;
398 while (b < bEnd) { 445 }
399 if (isASCII(*b)) { 446 }
400 if (*a++ != *b++) 447
401 return false; 448 return a == aEnd;
402 continue; 449 }
403 } 450
404 451 bool equalUTF16WithUTF8(const UChar* a,
405 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b); 452 const UChar* aEnd,
406 453 const char* b,
407 if (bEnd - b < utf8SequenceLength) 454 const char* bEnd) {
408 return false; 455 return equalWithUTF8Internal(a, aEnd, b, bEnd);
409 456 }
410 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength)) 457
411 return 0; 458 bool equalLatin1WithUTF8(const LChar* a,
412 459 const LChar* aEnd,
413 UChar32 character = readUTF8Sequence(b, utf8SequenceLength); 460 const char* b,
414 ASSERT(!isASCII(character)); 461 const char* bEnd) {
415 462 return equalWithUTF8Internal(a, aEnd, b, bEnd);
416 if (U_IS_BMP(character)) { 463 }
417 // UTF-16 surrogate values are illegal in UTF-32 464
418 if (U_IS_SURROGATE(character)) 465 } // namespace Unicode
419 return false; 466 } // namespace WTF
420 if (*a++ != character)
421 return false;
422 } else if (U_IS_SUPPLEMENTARY(character)) {
423 if (*a++ != U16_LEAD(character))
424 return false;
425 if (*a++ != U16_TRAIL(character))
426 return false;
427 } else {
428 return false;
429 }
430 }
431
432 return a == aEnd;
433 }
434
435 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)
436 {
437 return equalWithUTF8Internal(a, aEnd, b, bEnd);
438 }
439
440 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)
441 {
442 return equalWithUTF8Internal(a, aEnd, b, bEnd);
443 }
444
445 } // namespace Unicode
446 } // namespace WTF
OLDNEW
« no previous file with comments | « third_party/WebKit/Source/wtf/text/UTF8.h ('k') | third_party/WebKit/Source/wtf/text/Unicode.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698