Index: Source/wtf/text/TextCodecICU.cpp |
diff --git a/Source/wtf/text/TextCodecICU.cpp b/Source/wtf/text/TextCodecICU.cpp |
index 15beef74dc906cd436f93a86feb17a124a202858..f4460b6c1dd9017891bf2ecf0edc053c1c28722d 100644 |
--- a/Source/wtf/text/TextCodecICU.cpp |
+++ b/Source/wtf/text/TextCodecICU.cpp |
@@ -71,16 +71,20 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar) |
for (int32_t i = 0; i < numEncodings; ++i) { |
const char* name = ucnv_getAvailableName(i); |
UErrorCode error = U_ZERO_ERROR; |
- // Try MIME before trying IANA to pick up commonly used names like |
- // 'EUC-JP' instead of horrendously long names like |
- // 'Extended_UNIX_Code_Packed_Format_for_Japanese'. |
- const char* standardName = ucnv_getStandardName(name, "MIME", &error); |
- if (!U_SUCCESS(error) || !standardName) { |
+#if !defined(USING_SYSTEM_ICU) |
+ const char* primaryStandard = "HTML"; |
+ const char* secondaryStandard = "MIME"; |
+#else |
+ const char* primaryStandard = "MIME"; |
+ const char* secondaryStandard = "IANA"; |
+#endif |
+ const char* standardName = ucnv_getStandardName(name, primaryStandard, &error); |
+ if (U_FAILURE(error) || !standardName) { |
error = U_ZERO_ERROR; |
// Try IANA to pick up 'windows-12xx' and other names |
// which are not preferred MIME names but are widely used. |
- standardName = ucnv_getStandardName(name, "IANA", &error); |
- if (!U_SUCCESS(error) || !standardName) |
+ standardName = ucnv_getStandardName(name, secondaryStandard, &error); |
+ if (U_FAILURE(error) || !standardName) |
continue; |
} |
@@ -90,6 +94,7 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar) |
// 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers. |
// 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding |
// for encoding GB_2312-80 and several others. So, we need to override this behavior, too. |
+#if defined(USING_SYSTEM_ICU) |
if (!strcmp(standardName, "GB2312") || !strcmp(standardName, "GB_2312-80")) |
standardName = "GBK"; |
// Similarly, EUC-KR encodings all map to an extended version, but |
@@ -101,6 +106,7 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar) |
standardName = "windows-1254"; |
else if (!strcmp(standardName, "TIS-620")) |
standardName = "windows-874"; |
+#endif |
registrar(standardName, standardName); |
@@ -116,6 +122,12 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar) |
} |
} |
+ // These two entries have to be added here because ICU's converter table |
+ // cannot have both ISO-8859-8-I and ISO-8859-8. |
+ registrar("csISO88598I", "ISO-8859-8-I"); |
+ registrar("logical", "ISO-8859-8-I"); |
+ |
+#if defined(USING_SYSTEM_ICU) |
// Additional alias for MacCyrillic not present in ICU. |
registrar("maccyrillic", "x-mac-cyrillic"); |
@@ -131,9 +143,7 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar) |
registrar("csgb231280", "GBK"); |
registrar("x-euc-cn", "GBK"); |
registrar("x-gbk", "GBK"); |
- registrar("csISO88598I", "ISO-8859-8-I"); |
registrar("koi", "KOI8-R"); |
- registrar("logical", "ISO-8859-8-I"); |
registrar("visual", "ISO-8859-8"); |
registrar("winarabic", "windows-1256"); |
registrar("winbaltic", "windows-1257"); |
@@ -176,8 +186,6 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar) |
// and Firefox (as of Oct 2014), but not in the upstream ICU. |
// Three entries for windows-1252 need not be listed here because |
// TextCodecLatin1 registers them. |
- // FIXME: We may introduce SYSTEM_ICU and enclose this block |
- // with |#if SYSTEM_ICU| because Chromium's ICU has them all. |
registrar("csiso58gb231280", "GBK"); |
registrar("csiso88596e", "ISO-8859-6"); |
registrar("csiso88596i", "ISO-8859-6"); |
@@ -212,6 +220,7 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar) |
registrar("x-cp1256", "windows-1256"); |
registrar("x-cp1257", "windows-1257"); |
registrar("x-cp1258", "windows-1258"); |
+#endif |
} |
void TextCodecICU::registerCodecs(TextCodecRegistrar registrar) |
@@ -237,7 +246,9 @@ void TextCodecICU::registerCodecs(TextCodecRegistrar registrar) |
TextCodecICU::TextCodecICU(const TextEncoding& encoding) |
: m_encoding(encoding) |
, m_converterICU(0) |
+#if defined(USING_SYSTEM_ICU) |
, m_needsGBKFallbacks(false) |
+#endif |
{ |
} |
@@ -261,8 +272,10 @@ void TextCodecICU::createICUConverter() const |
{ |
ASSERT(!m_converterICU); |
+#if defined(USING_SYSTEM_ICU) |
const char* name = m_encoding.name(); |
m_needsGBKFallbacks = name[0] == 'G' && name[1] == 'B' && name[2] == 'K' && !name[3]; |
+#endif |
UErrorCode err; |
@@ -367,32 +380,41 @@ String TextCodecICU::decode(const char* bytes, size_t length, FlushBehavior flus |
sawError = true; |
} |
+#if !defined(USING_SYSTEM_ICU) |
+ // Chrome's copy of ICU does not have the issue described below. |
+ return result.toString(); |
+#else |
String resultString = result.toString(); |
// <http://bugs.webkit.org/show_bug.cgi?id=17014> |
// Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5. |
- if (!strcmp(m_encoding.name(), "GBK") || !strcasecmp(m_encoding.name(), "gb18030")) |
- resultString.replace(0xE5E5, ideographicSpaceCharacter); |
+ if (!strcmp(m_encoding.name(), "GBK")) { |
+ if (!strcasecmp(m_encoding.name(), "gb18030")) |
+ resultString.replace(0xE5E5, ideographicSpaceCharacter); |
+ // Make GBK compliant to the encoding spec and align with GB18030 |
+ resultString.replace(0x01F9, 0xE7C8); |
+ // FIXME: Once https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3 |
+ // is resolved, add U+1E3F => 0xE7C7. |
+ } |
return resultString; |
+#endif |
} |
-// We need to apply these fallbacks ourselves as they are not currently supported by ICU and |
-// they were provided by the old TEC encoding path. Needed to fix <rdar://problem/4708689>. |
+#if defined(USING_SYSTEM_ICU) |
+// U+01F9 and U+1E3F have to be mapped to xA8xBF and xA8xBC per the encoding |
+// spec, but ICU converter does not have them. |
static UChar fallbackForGBK(UChar32 character) |
{ |
switch (character) { |
case 0x01F9: |
- return 0xE7C8; |
+ return 0xE7C8; // mapped to xA8xBF by ICU. |
case 0x1E3F: |
- return 0xE7C7; |
- case 0x22EF: |
- return 0x2026; |
- case 0x301C: |
- return 0xFF5E; |
+ return 0xE7C7; // mapped to xA8xBC by ICU. |
} |
return 0; |
} |
+#endif |
// Invalid character handler when writing escaped entities for unrepresentable |
// characters. See the declaration of TextCodec::encode for more. |
@@ -409,6 +431,7 @@ static void urlEscapedEntityCallback(const void* context, UConverterFromUnicodeA |
UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err); |
} |
+#if defined(USING_SYSTEM_ICU) |
// Substitutes special GBK characters, escaping all other unassigned entities. |
static void gbkCallbackEscape(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, |
UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) |
@@ -452,6 +475,7 @@ static void gbkCallbackSubstitute(const void* context, UConverterFromUnicodeArgs |
} |
UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length, codePoint, reason, err); |
} |
+#endif // USING_SYSTEM_ICU |
class TextCodecInput { |
public: |
@@ -488,13 +512,25 @@ CString TextCodecICU::encodeInternal(const TextCodecInput& input, UnencodableHan |
switch (handling) { |
case QuestionMarksForUnencodables: |
ucnv_setSubstChars(m_converterICU, "?", 1, &err); |
+#if !defined(USING_SYSTEM_ICU) |
+ ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err); |
+#else |
ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err); |
+#endif |
break; |
case EntitiesForUnencodables: |
+#if !defined(USING_SYSTEM_ICU) |
+ ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err); |
+#else |
ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err); |
+#endif |
break; |
case URLEncodedEntitiesForUnencodables: |
+#if !defined(USING_SYSTEM_ICU) |
+ ucnv_setFromUCallBack(m_converterICU, urlEscapedEntityCallback, 0, 0, 0, &err); |
+#else |
ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack : urlEscapedEntityCallback, 0, 0, 0, &err); |
+#endif |
break; |
} |