Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(11)

Side by Side Diff: third_party/WebKit/Source/wtf/text/UTF8.cpp

Issue 1768063002: Introduce String::fromUTF8Lenient() and use it for cache_name in CacheStorage API. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: incorporated jsbell's comment Created 4 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2007 Apple Inc. All rights reserved. 2 * Copyright (C) 2007 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> 3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
4 * 4 *
5 * Redistribution and use in source and binary forms, with or without 5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions 6 * modification, are permitted provided that the following conditions
7 * are met: 7 * are met:
8 * 1. Redistributions of source code must retain the above copyright 8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer. 9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright 10 * 2. Redistributions in binary form must reproduce the above copyright
(...skipping 167 matching lines...) Expand 10 before | Expand all | Expand 10 after
178 target += bytesToWrite; 178 target += bytesToWrite;
179 } 179 }
180 *sourceStart = source; 180 *sourceStart = source;
181 *targetStart = target; 181 *targetStart = target;
182 return result; 182 return result;
183 } 183 }
184 184
185 // This must be called with the length pre-determined by the first byte. 185 // This must be called with the length pre-determined by the first byte.
186 // If presented with a length > 4, this returns false. The Unicode 186 // If presented with a length > 4, this returns false. The Unicode
187 // definition of UTF-8 goes up to 4-byte sequences. 187 // definition of UTF-8 goes up to 4-byte sequences.
188 static bool isLegalUTF8(const unsigned char* source, int length) 188 static bool isLegalUTF8(const unsigned char* source, int length, bool strict = true)
189 { 189 {
190 unsigned char a; 190 unsigned char a;
191 const unsigned char* srcptr = source + length; 191 const unsigned char* srcptr = source + length;
192 switch (length) { 192 switch (length) {
193 default: 193 default:
194 return false; 194 return false;
195 // Everything else falls through when "true"... 195 // Everything else falls through when "true"...
196 case 4: 196 case 4:
197 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) 197 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
198 return false; 198 return false;
199 case 3: 199 case 3:
200 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) 200 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
201 return false; 201 return false;
202 case 2: 202 case 2:
203 if ((a = (*--srcptr)) > 0xBF) 203 if ((a = (*--srcptr)) > 0xBF)
204 return false; 204 return false;
205 205
206 // no fall-through in this inner switch 206 // no fall-through in this inner switch
207 switch (*source) { 207 switch (*source) {
208 case 0xE0: 208 case 0xE0:
209 if (a < 0xA0) 209 if (a < 0xA0)
210 return false; 210 return false;
211 break; 211 break;
212 case 0xED: 212 case 0xED:
213 if (a > 0x9F) 213 // Surrogate values are mapped to [EDA080-EDAFBF] and [EDB080-EDBFBF] in lenient mode.
214 if (strict && a > 0x9F)
214 return false; 215 return false;
215 break; 216 break;
216 case 0xF0: 217 case 0xF0:
217 if (a < 0x90) 218 if (a < 0x90)
218 return false; 219 return false;
219 break; 220 break;
220 case 0xF4: 221 case 0xF4:
221 if (a > 0x8F) 222 if (a > 0x8F)
222 return false; 223 return false;
223 break; 224 break;
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
276 const char* source = *sourceStart; 277 const char* source = *sourceStart;
277 UChar* target = *targetStart; 278 UChar* target = *targetStart;
278 UChar orAllData = 0; 279 UChar orAllData = 0;
279 while (source < sourceEnd) { 280 while (source < sourceEnd) {
280 int utf8SequenceLength = inlineUTF8SequenceLength(*source); 281 int utf8SequenceLength = inlineUTF8SequenceLength(*source);
281 if (sourceEnd - source < utf8SequenceLength) { 282 if (sourceEnd - source < utf8SequenceLength) {
282 result = sourceExhausted; 283 result = sourceExhausted;
283 break; 284 break;
284 } 285 }
285 // Do this check whether lenient or strict 286 // Do this check whether lenient or strict
286 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) { 287 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength, strict)) {
287 result = sourceIllegal; 288 result = sourceIllegal;
288 break; 289 break;
289 } 290 }
290 291
291 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); 292 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
292 293
293 if (target >= targetEnd) { 294 if (target >= targetEnd) {
294 source -= utf8SequenceLength; // Back up source pointer! 295 source -= utf8SequenceLength; // Back up source pointer!
295 result = targetExhausted; 296 result = targetExhausted;
296 break; 297 break;
297 } 298 }
298 299
299 if (U_IS_BMP(character)) { 300 if (U_IS_BMP(character)) {
300 // UTF-16 surrogate values are illegal in UTF-32 301 // UTF-16 surrogate values are illegal in UTF-32
301 if (U_IS_SURROGATE(character)) { 302 if (U_IS_SURROGATE(character)) {
302 if (strict) { 303 if (strict) {
303 source -= utf8SequenceLength; // return to the illegal value itself 304 source -= utf8SequenceLength; // return to the illegal value itself
304 result = sourceIllegal; 305 result = sourceIllegal;
305 break; 306 break;
306 } 307 }
307 *target++ = replacementCharacter; 308 *target++ = static_cast<UChar>(character);
308 orAllData |= replacementCharacter; 309 orAllData |= character;
309 } else { 310 } else {
310 *target++ = static_cast<UChar>(character); // normal case 311 *target++ = static_cast<UChar>(character); // normal case
311 orAllData |= character; 312 orAllData |= character;
312 } 313 }
313 } else if (U_IS_SUPPLEMENTARY(character)) { 314 } else if (U_IS_SUPPLEMENTARY(character)) {
314 // target is a character in range 0xFFFF - 0x10FFFF 315 // target is a character in range 0xFFFF - 0x10FFFF
315 if (target + 1 >= targetEnd) { 316 if (target + 1 >= targetEnd) {
316 source -= utf8SequenceLength; // Back up source pointer! 317 source -= utf8SequenceLength; // Back up source pointer!
317 result = targetExhausted; 318 result = targetExhausted;
318 break; 319 break;
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after
437 return equalWithUTF8Internal(a, aEnd, b, bEnd); 438 return equalWithUTF8Internal(a, aEnd, b, bEnd);
438 } 439 }
439 440
440 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd) 441 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)
441 { 442 {
442 return equalWithUTF8Internal(a, aEnd, b, bEnd); 443 return equalWithUTF8Internal(a, aEnd, b, bEnd);
443 } 444 }
444 445
445 } // namespace Unicode 446 } // namespace Unicode
446 } // namespace WTF 447 } // namespace WTF
OLDNEW
« no previous file with comments | « third_party/WebKit/Source/platform/exported/WebString.cpp ('k') | third_party/WebKit/Source/wtf/text/WTFString.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698