third_party/WebKit/Source/wtf/text/UTF8.cpp - Issue 1768063002: Introduce String::fromUTF8Lenient() and use it for cache_name in CacheStorage API.

Side by Side Diff: third_party/WebKit/Source/wtf/text/UTF8.cpp

Issue 1768063002: Introduce String::fromUTF8Lenient() and use it for cache_name in CacheStorage API. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: incorporated jsbell's comment Created 4 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2007 Apple Inc. All rights reserved.	2 * Copyright (C) 2007 Apple Inc. All rights reserved.

3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>	3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>

4 *	4 *

5 * Redistribution and use in source and binary forms, with or without	5 * Redistribution and use in source and binary forms, with or without

6 * modification, are permitted provided that the following conditions	6 * modification, are permitted provided that the following conditions

7 * are met:	7 * are met:

8 * 1. Redistributions of source code must retain the above copyright	8 * 1. Redistributions of source code must retain the above copyright

9 * notice, this list of conditions and the following disclaimer.	9 * notice, this list of conditions and the following disclaimer.

10 * 2. Redistributions in binary form must reproduce the above copyright	10 * 2. Redistributions in binary form must reproduce the above copyright

(...skipping 167 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
178 target += bytesToWrite;	178 target += bytesToWrite;

179 }	179 }

180 *sourceStart = source;	180 *sourceStart = source;

181 *targetStart = target;	181 *targetStart = target;

182 return result;	182 return result;

183 }	183 }

184	184

185 // This must be called with the length pre-determined by the first byte.	185 // This must be called with the length pre-determined by the first byte.

186 // If presented with a length > 4, this returns false. The Unicode	186 // If presented with a length > 4, this returns false. The Unicode

187 // definition of UTF-8 goes up to 4-byte sequences.	187 // definition of UTF-8 goes up to 4-byte sequences.

188 static bool isLegalUTF8(const unsigned char* source, int length)	188 static bool isLegalUTF8(const unsigned char* source, int length, bool strict = true)

189 {	189 {

190 unsigned char a;	190 unsigned char a;

191 const unsigned char* srcptr = source + length;	191 const unsigned char* srcptr = source + length;

192 switch (length) {	192 switch (length) {

193 default:	193 default:

194 return false;	194 return false;

195 // Everything else falls through when "true"...	195 // Everything else falls through when "true"...

196 case 4:	196 case 4:

197 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)	197 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)

198 return false;	198 return false;

199 case 3:	199 case 3:

200 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)	200 if ((a = (*--srcptr)) < 0x80 || a > 0xBF)

201 return false;	201 return false;

202 case 2:	202 case 2:

203 if ((a = (*--srcptr)) > 0xBF)	203 if ((a = (*--srcptr)) > 0xBF)

204 return false;	204 return false;

205	205

206 // no fall-through in this inner switch	206 // no fall-through in this inner switch

207 switch (*source) {	207 switch (*source) {

208 case 0xE0:	208 case 0xE0:

209 if (a < 0xA0)	209 if (a < 0xA0)

210 return false;	210 return false;

211 break;	211 break;

212 case 0xED:	212 case 0xED:

213 if (a > 0x9F)	213 // Surrogate values are mapped to [EDA080-EDAFBF] and [EDB080-EDBFBF] in lenient mode.

	214 if (strict && a > 0x9F)

214 return false;	215 return false;

215 break;	216 break;

216 case 0xF0:	217 case 0xF0:

217 if (a < 0x90)	218 if (a < 0x90)

218 return false;	219 return false;

219 break;	220 break;

220 case 0xF4:	221 case 0xF4:

221 if (a > 0x8F)	222 if (a > 0x8F)

222 return false;	223 return false;

223 break;	224 break;

(...skipping 52 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
276 const char* source = *sourceStart;	277 const char* source = *sourceStart;

277 UChar* target = *targetStart;	278 UChar* target = *targetStart;

278 UChar orAllData = 0;	279 UChar orAllData = 0;

279 while (source < sourceEnd) {	280 while (source < sourceEnd) {

280 int utf8SequenceLength = inlineUTF8SequenceLength(*source);	281 int utf8SequenceLength = inlineUTF8SequenceLength(*source);

281 if (sourceEnd - source < utf8SequenceLength) {	282 if (sourceEnd - source < utf8SequenceLength) {

282 result = sourceExhausted;	283 result = sourceExhausted;

283 break;	284 break;

284 }	285 }

285 // Do this check whether lenient or strict	286 // Do this check whether lenient or strict

286 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {	287 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength, strict)) {

287 result = sourceIllegal;	288 result = sourceIllegal;

288 break;	289 break;

289 }	290 }

290	291

291 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);	292 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);

292	293

293 if (target >= targetEnd) {	294 if (target >= targetEnd) {

294 source -= utf8SequenceLength; // Back up source pointer!	295 source -= utf8SequenceLength; // Back up source pointer!

295 result = targetExhausted;	296 result = targetExhausted;

296 break;	297 break;

297 }	298 }

298	299

299 if (U_IS_BMP(character)) {	300 if (U_IS_BMP(character)) {

300 // UTF-16 surrogate values are illegal in UTF-32	301 // UTF-16 surrogate values are illegal in UTF-32

301 if (U_IS_SURROGATE(character)) {	302 if (U_IS_SURROGATE(character)) {

302 if (strict) {	303 if (strict) {

303 source -= utf8SequenceLength; // return to the illegal value itself	304 source -= utf8SequenceLength; // return to the illegal value itself

304 result = sourceIllegal;	305 result = sourceIllegal;

305 break;	306 break;

306 }	307 }

307 *target++ = replacementCharacter;	308 *target++ = static_cast<UChar>(character);

308 orAllData |= replacementCharacter;	309 orAllData |= character;

309 } else {	310 } else {

310 *target++ = static_cast<UChar>(character); // normal case	311 *target++ = static_cast<UChar>(character); // normal case

311 orAllData |= character;	312 orAllData |= character;

312 }	313 }

313 } else if (U_IS_SUPPLEMENTARY(character)) {	314 } else if (U_IS_SUPPLEMENTARY(character)) {

314 // target is a character in range 0xFFFF - 0x10FFFF	315 // target is a character in range 0xFFFF - 0x10FFFF

315 if (target + 1 >= targetEnd) {	316 if (target + 1 >= targetEnd) {

316 source -= utf8SequenceLength; // Back up source pointer!	317 source -= utf8SequenceLength; // Back up source pointer!

317 result = targetExhausted;	318 result = targetExhausted;

318 break;	319 break;

(...skipping 118 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
437 return equalWithUTF8Internal(a, aEnd, b, bEnd);	438 return equalWithUTF8Internal(a, aEnd, b, bEnd);

438 }	439 }

439	440

440 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)	441 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)

441 {	442 {

442 return equalWithUTF8Internal(a, aEnd, b, bEnd);	443 return equalWithUTF8Internal(a, aEnd, b, bEnd);

443 }	444 }

444	445

445 } // namespace Unicode	446 } // namespace Unicode

446 } // namespace WTF	447 } // namespace WTF

OLD	NEW

« no previous file with comments | « third_party/WebKit/Source/platform/exported/WebString.cpp ('k') | third_party/WebKit/Source/wtf/text/WTFString.h » ('j') | no next file with comments »