third_party/WebKit/Source/wtf/text/TextCodecICU.cpp - Issue 2373983006: reflow comments in wtf/text

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecICU.cpp

Issue 2373983006: reflow comments in wtf/text (Closed)

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« third_party/WebKit/Source/wtf/text/StringImplCF.cpp ('K') | « third_party/WebKit/Source/wtf/text/TextCodec.h ('k') | third_party/WebKit/Source/wtf/text/TextCodecLatin1.cpp » ('j') | third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2004, 2006, 2007, 2008, 2011 Apple Inc. All rights reserved.	2 * Copyright (C) 2004, 2006, 2007, 2008, 2011 Apple Inc. All rights reserved.

3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>	3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>

4 *	4 *

5 * Redistribution and use in source and binary forms, with or without	5 * Redistribution and use in source and binary forms, with or without

6 * modification, are permitted provided that the following conditions	6 * modification, are permitted provided that the following conditions

7 * are met:	7 * are met:

8 * 1. Redistributions of source code must retain the above copyright	8 * 1. Redistributions of source code must retain the above copyright

9 * notice, this list of conditions and the following disclaimer.	9 * notice, this list of conditions and the following disclaimer.

10 * 2. Redistributions in binary form must reproduce the above copyright	10 * 2. Redistributions in binary form must reproduce the above copyright

(...skipping 70 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
81 // Try IANA to pick up 'windows-12xx' and other names	81 // Try IANA to pick up 'windows-12xx' and other names

82 // which are not preferred MIME names but are widely used.	82 // which are not preferred MIME names but are widely used.

83 standardName = ucnv_getStandardName(name, secondaryStandard, &error);	83 standardName = ucnv_getStandardName(name, secondaryStandard, &error);

84 if (U_FAILURE(error) || !standardName)	84 if (U_FAILURE(error) || !standardName)

85 continue;	85 continue;

86 }	86 }

87	87

88 // A number of these aliases are handled in Chrome's copy of ICU, but	88 // A number of these aliases are handled in Chrome's copy of ICU, but

89 // Chromium can be compiled with the system ICU.	89 // Chromium can be compiled with the system ICU.

90	90

91 // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.	91 // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other

92 // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding	92 // browsers.

93 // for encoding GB_2312-80 and several others. So, we need to override this behavior, too.	93 // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native

	94 // encoding for encoding GB_2312-80 and several others. So, we need to

	95 // override this behavior, too.

94 #if defined(USING_SYSTEM_ICU)	96 #if defined(USING_SYSTEM_ICU)

95 if (!strcmp(standardName, "GB2312") || !strcmp(standardName, "GB_2312-80"))	97 if (!strcmp(standardName, "GB2312") || !strcmp(standardName, "GB_2312-80"))

96 standardName = "GBK";	98 standardName = "GBK";

97 // Similarly, EUC-KR encodings all map to an extended version, but	99 // Similarly, EUC-KR encodings all map to an extended version, but

98 // per HTML5, the canonical name still should be EUC-KR.	100 // per HTML5, the canonical name still should be EUC-KR.

99 else if (!strcmp(standardName, "EUC-KR") ||	101 else if (!strcmp(standardName, "EUC-KR") ||

100 !strcmp(standardName, "KSC_5601") ||	102 !strcmp(standardName, "KSC_5601") ||

101 !strcmp(standardName, "cp1363"))	103 !strcmp(standardName, "cp1363"))

102 standardName = "EUC-KR";	104 standardName = "EUC-KR";

103 // And so on.	105 // And so on.

104 else if (	106 else if (!strcasecmp(standardName, "iso-8859-9"))

105 !strcasecmp(	107 // This name is returned in different case by ICU 3.2 and 3.6.

106 standardName,

107 "iso-8859-9")) // This name is returned in different case by ICU 3.2 and 3.6.

108 standardName = "windows-1254";	108 standardName = "windows-1254";

109 else if (!strcmp(standardName, "TIS-620"))	109 else if (!strcmp(standardName, "TIS-620"))

110 standardName = "windows-874";	110 standardName = "windows-874";

111 #endif	111 #endif

112	112

113 registrar(standardName, standardName);	113 registrar(standardName, standardName);

114	114

115 uint16_t numAliases = ucnv_countAliases(name, &error);	115 uint16_t numAliases = ucnv_countAliases(name, &error);

116 ASSERT(U_SUCCESS(error));	116 ASSERT(U_SUCCESS(error));

117 if (U_SUCCESS(error))	117 if (U_SUCCESS(error))

(...skipping 57 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
175 registrar("ISO8859-6", "ISO-8859-6");	175 registrar("ISO8859-6", "ISO-8859-6");

176 registrar("ISO8859-7", "ISO-8859-7");	176 registrar("ISO8859-7", "ISO-8859-7");

177 registrar("ISO8859-8", "ISO-8859-8");	177 registrar("ISO8859-8", "ISO-8859-8");

178 registrar("ISO8859-8-I", "ISO-8859-8-I");	178 registrar("ISO8859-8-I", "ISO-8859-8-I");

179 registrar("ISO8859-9", "ISO-8859-9");	179 registrar("ISO8859-9", "ISO-8859-9");

180 registrar("ISO8859-10", "ISO-8859-10");	180 registrar("ISO8859-10", "ISO-8859-10");

181 registrar("ISO8859-13", "ISO-8859-13");	181 registrar("ISO8859-13", "ISO-8859-13");

182 registrar("ISO8859-14", "ISO-8859-14");	182 registrar("ISO8859-14", "ISO-8859-14");

183 registrar("ISO8859-15", "ISO-8859-15");	183 registrar("ISO8859-15", "ISO-8859-15");

184 // No need to have an entry for ISO8859-16. ISO-8859-16 has just one label	184 // No need to have an entry for ISO8859-16. ISO-8859-16 has just one label

185 // listed in WHATWG Encoding Living Standard (http://encoding.spec.whatwg.org/).	185 // listed in WHATWG Encoding Living Standard, http://encoding.spec.whatwg.org/

186	186

187 // Additional aliases present in the WHATWG Encoding Standard	187 // Additional aliases present in the WHATWG Encoding Standard

188 // and Firefox (as of Oct 2014), but not in the upstream ICU.	188 // and Firefox (as of Oct 2014), but not in the upstream ICU.

189 // Three entries for windows-1252 need not be listed here because	189 // Three entries for windows-1252 need not be listed here because

190 // TextCodecLatin1 registers them.	190 // TextCodecLatin1 registers them.

191 registrar("csiso58gb231280", "GBK");	191 registrar("csiso58gb231280", "GBK");

192 registrar("csiso88596e", "ISO-8859-6");	192 registrar("csiso88596e", "ISO-8859-6");

193 registrar("csiso88596i", "ISO-8859-6");	193 registrar("csiso88596i", "ISO-8859-6");

194 registrar("csiso88598e", "ISO-8859-8");	194 registrar("csiso88598e", "ISO-8859-8");

195 registrar("gb_2312", "GBK");	195 registrar("gb_2312", "GBK");

(...skipping 176 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
372 int32_t* offsets = nullptr;	372 int32_t* offsets = nullptr;

373 UErrorCode err = U_ZERO_ERROR;	373 UErrorCode err = U_ZERO_ERROR;

374	374

375 do {	375 do {

376 int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit,	376 int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit,

377 offsets, flush != DoNotFlush, err);	377 offsets, flush != DoNotFlush, err);

378 result.append(buffer, ucharsDecoded);	378 result.append(buffer, ucharsDecoded);

379 } while (err == U_BUFFER_OVERFLOW_ERROR);	379 } while (err == U_BUFFER_OVERFLOW_ERROR);

380	380

381 if (U_FAILURE(err)) {	381 if (U_FAILURE(err)) {

382 // flush the converter so it can be reused, and not be bothered by this error.	382 // flush the converter so it can be reused, and not be bothered by this

	383 // error.

383 do {	384 do {

384 decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true,	385 decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true,

385 err);	386 err);

386 } while (source < sourceLimit);	387 } while (source < sourceLimit);

387 sawError = true;	388 sawError = true;

388 }	389 }

389	390

390 #if !defined(USING_SYSTEM_ICU)	391 #if !defined(USING_SYSTEM_ICU)

391 // Chrome's copy of ICU does not have the issue described below.	392 // Chrome's copy of ICU does not have the issue described below.

392 return result.toString();	393 return result.toString();

393 #else	394 #else

394 String resultString = result.toString();	395 String resultString = result.toString();

395	396

396 // <http://bugs.webkit.org/show_bug.cgi?id=17014>	397 // <http://bugs.webkit.org/show_bug.cgi?id=17014>

397 // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5.	398 // Simplified Chinese pages use the code A3A0 to mean "full-width space", but

	399 // ICU decodes it as U+E5E5.

398 if (!strcmp(m_encoding.name(), "GBK")) {	400 if (!strcmp(m_encoding.name(), "GBK")) {

399 if (!strcasecmp(m_encoding.name(), "gb18030"))	401 if (!strcasecmp(m_encoding.name(), "gb18030"))

400 resultString.replace(0xE5E5, ideographicSpaceCharacter);	402 resultString.replace(0xE5E5, ideographicSpaceCharacter);

401 // Make GBK compliant to the encoding spec and align with GB18030	403 // Make GBK compliant to the encoding spec and align with GB18030

402 resultString.replace(0x01F9, 0xE7C8);	404 resultString.replace(0x01F9, 0xE7C8);

403 // FIXME: Once https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3	405 // FIXME: Once https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3

404 // is resolved, add U+1E3F => 0xE7C7.	406 // is resolved, add U+1E3F => 0xE7C7.

405 }	407 }

406	408

407 return resultString;	409 return resultString;

408 #endif	410 #endif

409 }	411 }

410	412

411 #if defined(USING_SYSTEM_ICU)	413 #if defined(USING_SYSTEM_ICU)

412 // U+01F9 and U+1E3F have to be mapped to xA8xBF and xA8xBC per the encoding	414 // U+01F9 and U+1E3F have to be mapped to xA8xBF and xA8xBC per the encoding

413 // spec, but ICU converter does not have them.	415 // spec, but ICU converter does not have them.

414 static UChar fallbackForGBK(UChar32 character) {	416 static UChar fallbackForGBK(UChar32 character) {

415 switch (character) {	417 switch (character) {

416 case 0x01F9:	418 case 0x01F9:

417 return 0xE7C8; // mapped to xA8xBF by ICU.	419 return 0xE7C8; // mapped to xA8xBF by ICU.

418 case 0x1E3F:	420 case 0x1E3F:

419 return 0xE7C7; // mapped to xA8xBC by ICU.	421 return 0xE7C7; // mapped to xA8xBC by ICU.

420 }	422 }

421 return 0;	423 return 0;

422 }	424 }

423 #endif	425 #endif

424	426

425 // Generic helper for writing escaped entities using the specfied UnencodableHandling.	427 // Generic helper for writing escaped entities using the specfied

	428 // UnencodableHandling.

426 static void formatEscapedEntityCallback(const void* context,	429 static void formatEscapedEntityCallback(const void* context,

427 UConverterFromUnicodeArgs* fromUArgs,	430 UConverterFromUnicodeArgs* fromUArgs,

428 const UChar* codeUnits,	431 const UChar* codeUnits,

429 int32_t length,	432 int32_t length,

430 UChar32 codePoint,	433 UChar32 codePoint,

431 UConverterCallbackReason reason,	434 UConverterCallbackReason reason,

432 UErrorCode* err,	435 UErrorCode* err,

433 UnencodableHandling handling) {	436 UnencodableHandling handling) {

434 if (reason == UCNV_UNASSIGNED) {	437 if (reason == UCNV_UNASSIGNED) {

435 *err = U_ZERO_ERROR;	438 *err = U_ZERO_ERROR;

(...skipping 13 matching lines...) Expand all Loading...
449 const UChar* codeUnits,	452 const UChar* codeUnits,

450 int32_t length,	453 int32_t length,

451 UChar32 codePoint,	454 UChar32 codePoint,

452 UConverterCallbackReason reason,	455 UConverterCallbackReason reason,

453 UErrorCode* err) {	456 UErrorCode* err) {

454 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint,	457 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint,

455 reason, err, EntitiesForUnencodables);	458 reason, err, EntitiesForUnencodables);

456 }	459 }

457	460

458 // Invalid character handler when writing escaped entities in CSS encoding for	461 // Invalid character handler when writing escaped entities in CSS encoding for

459 // unrepresentable characters. See the declaration of TextCodec::encode for more.	462 // unrepresentable characters. See the declaration of TextCodec::encode for

	463 // more.

460 static void cssEscapedEntityCallback(const void* context,	464 static void cssEscapedEntityCallback(const void* context,

461 UConverterFromUnicodeArgs* fromUArgs,	465 UConverterFromUnicodeArgs* fromUArgs,

462 const UChar* codeUnits,	466 const UChar* codeUnits,

463 int32_t length,	467 int32_t length,

464 UChar32 codePoint,	468 UChar32 codePoint,

465 UConverterCallbackReason reason,	469 UConverterCallbackReason reason,

466 UErrorCode* err) {	470 UErrorCode* err) {

467 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint,	471 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint,

468 reason, err, CSSEncodedEntitiesForUnencodables);	472 reason, err, CSSEncodedEntitiesForUnencodables);

469 }	473 }

470	474

471 // Invalid character handler when writing escaped entities in HTML/XML encoding for	475 // Invalid character handler when writing escaped entities in HTML/XML encoding

472 // unrepresentable characters. See the declaration of TextCodec::encode for more.	476 // for unrepresentable characters. See the declaration of TextCodec::encode for

	477 // more.

473 static void urlEscapedEntityCallback(const void* context,	478 static void urlEscapedEntityCallback(const void* context,

474 UConverterFromUnicodeArgs* fromUArgs,	479 UConverterFromUnicodeArgs* fromUArgs,

475 const UChar* codeUnits,	480 const UChar* codeUnits,

476 int32_t length,	481 int32_t length,

477 UChar32 codePoint,	482 UChar32 codePoint,

478 UConverterCallbackReason reason,	483 UConverterCallbackReason reason,

479 UErrorCode* err) {	484 UErrorCode* err) {

480 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint,	485 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint,

481 reason, err, URLEncodedEntitiesForUnencodables);	486 reason, err, URLEncodedEntitiesForUnencodables);

482 }	487 }

(...skipping 210 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
693 return encodeCommon(characters, length, handling);	698 return encodeCommon(characters, length, handling);

694 }	699 }

695	700

696 CString TextCodecICU::encode(const LChar* characters,	701 CString TextCodecICU::encode(const LChar* characters,

697 size_t length,	702 size_t length,

698 UnencodableHandling handling) {	703 UnencodableHandling handling) {

699 return encodeCommon(characters, length, handling);	704 return encodeCommon(characters, length, handling);

700 }	705 }

701	706

702 } // namespace WTF	707 } // namespace WTF

OLD	NEW