Source/wtf/text/TextCodecICU.cpp - Issue 1167523003: Define a variable to distinguish system_icu from bundled_icu in Blink

Unified Diff: Source/wtf/text/TextCodecICU.cpp

Issue 1167523003: Define a variable to distinguish system_icu from bundled_icu in Blink (Closed) Base URL: svn://svn.chromium.org/blink/trunk

Patch Set: fix the comment in BUILD.gn Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: Source/wtf/text/TextCodecICU.cpp

diff --git a/Source/wtf/text/TextCodecICU.cpp b/Source/wtf/text/TextCodecICU.cpp

index 15beef74dc906cd436f93a86feb17a124a202858..f4460b6c1dd9017891bf2ecf0edc053c1c28722d 100644

--- a/Source/wtf/text/TextCodecICU.cpp

+++ b/Source/wtf/text/TextCodecICU.cpp

@@ -71,16 +71,20 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)

for (int32_t i = 0; i < numEncodings; ++i) {

const char* name = ucnv_getAvailableName(i);

UErrorCode error = U_ZERO_ERROR;

- // Try MIME before trying IANA to pick up commonly used names like

- // 'EUC-JP' instead of horrendously long names like

- // 'Extended_UNIX_Code_Packed_Format_for_Japanese'.

- const char* standardName = ucnv_getStandardName(name, "MIME", &error);

- if (!U_SUCCESS(error) || !standardName) {

+#if !defined(USING_SYSTEM_ICU)

+ const char* primaryStandard = "HTML";

+ const char* secondaryStandard = "MIME";

+#else

+ const char* primaryStandard = "MIME";

+ const char* secondaryStandard = "IANA";

+#endif

+ const char* standardName = ucnv_getStandardName(name, primaryStandard, &error);

+ if (U_FAILURE(error) || !standardName) {

error = U_ZERO_ERROR;

// Try IANA to pick up 'windows-12xx' and other names

// which are not preferred MIME names but are widely used.

- standardName = ucnv_getStandardName(name, "IANA", &error);

- if (!U_SUCCESS(error) || !standardName)

+ standardName = ucnv_getStandardName(name, secondaryStandard, &error);

+ if (U_FAILURE(error) || !standardName)

continue;

}

@@ -90,6 +94,7 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)

// 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers.

// 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding

// for encoding GB_2312-80 and several others. So, we need to override this behavior, too.

+#if defined(USING_SYSTEM_ICU)

if (!strcmp(standardName, "GB2312") || !strcmp(standardName, "GB_2312-80"))

standardName = "GBK";

// Similarly, EUC-KR encodings all map to an extended version, but

@@ -101,6 +106,7 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)

standardName = "windows-1254";

else if (!strcmp(standardName, "TIS-620"))

standardName = "windows-874";

+#endif

registrar(standardName, standardName);

@@ -116,6 +122,12 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)

}

+ // These two entries have to be added here because ICU's converter table

+ // cannot have both ISO-8859-8-I and ISO-8859-8.

+ registrar("csISO88598I", "ISO-8859-8-I");

+ registrar("logical", "ISO-8859-8-I");

+#if defined(USING_SYSTEM_ICU)

// Additional alias for MacCyrillic not present in ICU.

registrar("maccyrillic", "x-mac-cyrillic");

@@ -131,9 +143,7 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)

registrar("csgb231280", "GBK");

registrar("x-euc-cn", "GBK");

registrar("x-gbk", "GBK");

- registrar("csISO88598I", "ISO-8859-8-I");

registrar("koi", "KOI8-R");

- registrar("logical", "ISO-8859-8-I");

registrar("visual", "ISO-8859-8");

registrar("winarabic", "windows-1256");

registrar("winbaltic", "windows-1257");

@@ -176,8 +186,6 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)

// and Firefox (as of Oct 2014), but not in the upstream ICU.

// Three entries for windows-1252 need not be listed here because

// TextCodecLatin1 registers them.

- // FIXME: We may introduce SYSTEM_ICU and enclose this block

- // with |#if SYSTEM_ICU| because Chromium's ICU has them all.

registrar("csiso58gb231280", "GBK");

registrar("csiso88596e", "ISO-8859-6");

registrar("csiso88596i", "ISO-8859-6");

@@ -212,6 +220,7 @@ void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)

registrar("x-cp1256", "windows-1256");

registrar("x-cp1257", "windows-1257");

registrar("x-cp1258", "windows-1258");

+#endif

}

void TextCodecICU::registerCodecs(TextCodecRegistrar registrar)

@@ -237,7 +246,9 @@ void TextCodecICU::registerCodecs(TextCodecRegistrar registrar)

TextCodecICU::TextCodecICU(const TextEncoding& encoding)

: m_encoding(encoding)

, m_converterICU(0)

+#if defined(USING_SYSTEM_ICU)

, m_needsGBKFallbacks(false)

+#endif

{

}

@@ -261,8 +272,10 @@ void TextCodecICU::createICUConverter() const

{

ASSERT(!m_converterICU);

+#if defined(USING_SYSTEM_ICU)

const char* name = m_encoding.name();

m_needsGBKFallbacks = name[0] == 'G' && name[1] == 'B' && name[2] == 'K' && !name[3];

+#endif

UErrorCode err;

@@ -367,32 +380,41 @@ String TextCodecICU::decode(const char* bytes, size_t length, FlushBehavior flus

sawError = true;

}

+#if !defined(USING_SYSTEM_ICU)

+ // Chrome's copy of ICU does not have the issue described below.

+ return result.toString();

+#else

String resultString = result.toString();

// <http://bugs.webkit.org/show_bug.cgi?id=17014>

// Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5.

- if (!strcmp(m_encoding.name(), "GBK") || !strcasecmp(m_encoding.name(), "gb18030"))

- resultString.replace(0xE5E5, ideographicSpaceCharacter);

+ if (!strcmp(m_encoding.name(), "GBK")) {

+ if (!strcasecmp(m_encoding.name(), "gb18030"))

+ resultString.replace(0xE5E5, ideographicSpaceCharacter);

+ // Make GBK compliant with the encoding spec and align it with GB18030.

+ resultString.replace(0x01F9, 0xE7C8);

+ // FIXME: Once https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3

+ // is resolved, add U+1E3F => 0xE7C7.

+ }

return resultString;

+#endif

}

-// We need to apply these fallbacks ourselves as they are not currently supported by ICU and

-// they were provided by the old TEC encoding path. Needed to fix <rdar://problem/4708689>.

+#if defined(USING_SYSTEM_ICU)

+// U+01F9 and U+1E3F have to be mapped to xA8xBF and xA8xBC per the encoding

+// spec, but the ICU converter does not have these mappings.

static UChar fallbackForGBK(UChar32 character)

{

switch (character) {

case 0x01F9:

- return 0xE7C8;

+ return 0xE7C8; // mapped to xA8xBF by ICU.

case 0x1E3F:

- return 0xE7C7;

- case 0x22EF:

- return 0x2026;

- case 0x301C:

- return 0xFF5E;

+ return 0xE7C7; // mapped to xA8xBC by ICU.

}

return 0;

}

+#endif

// Invalid character handler when writing escaped entities for unrepresentable

// characters. See the declaration of TextCodec::encode for more.

@@ -409,6 +431,7 @@ static void urlEscapedEntityCallback(const void* context, UConverterFromUnicodeA

UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err);

}

+#if defined(USING_SYSTEM_ICU)

// Substitutes special GBK characters, escaping all other unassigned entities.

static void gbkCallbackEscape(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,

UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err)

@@ -452,6 +475,7 @@ static void gbkCallbackSubstitute(const void* context, UConverterFromUnicodeArgs

}

UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length, codePoint, reason, err);

}

+#endif // USING_SYSTEM_ICU

class TextCodecInput {

public:

@@ -488,13 +512,25 @@ CString TextCodecICU::encodeInternal(const TextCodecInput& input, UnencodableHan

switch (handling) {

case QuestionMarksForUnencodables:

ucnv_setSubstChars(m_converterICU, "?", 1, &err);

+#if !defined(USING_SYSTEM_ICU)

+ ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);

+#else

ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);

+#endif

break;

case EntitiesForUnencodables:

+#if !defined(USING_SYSTEM_ICU)

+ ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);

+#else

ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);

+#endif

break;

case URLEncodedEntitiesForUnencodables:

+#if !defined(USING_SYSTEM_ICU)

+ ucnv_setFromUCallBack(m_converterICU, urlEscapedEntityCallback, 0, 0, 0, &err);

+#else

ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack : urlEscapedEntityCallback, 0, 0, 0, &err);

+#endif

break;

}

« no previous file with comments | « Source/wtf/text/TextCodecICU.h ('k') | Source/wtf/unicode/CharacterNames.h » ('j') | no next file with comments »