| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2004, 2006, 2007, 2008, 2011 Apple Inc. All rights reserved. | 2 * Copyright (C) 2004, 2006, 2007, 2008, 2011 Apple Inc. All rights reserved. |
| 3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> | 3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> |
| 4 * | 4 * |
| 5 * Redistribution and use in source and binary forms, with or without | 5 * Redistribution and use in source and binary forms, with or without |
| 6 * modification, are permitted provided that the following conditions | 6 * modification, are permitted provided that the following conditions |
| 7 * are met: | 7 * are met: |
| 8 * 1. Redistributions of source code must retain the above copyright | 8 * 1. Redistributions of source code must retain the above copyright |
| 9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
| 10 * 2. Redistributions in binary form must reproduce the above copyright | 10 * 2. Redistributions in binary form must reproduce the above copyright |
| (...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 81 // Try IANA to pick up 'windows-12xx' and other names | 81 // Try IANA to pick up 'windows-12xx' and other names |
| 82 // which are not preferred MIME names but are widely used. | 82 // which are not preferred MIME names but are widely used. |
| 83 standardName = ucnv_getStandardName(name, secondaryStandard, &error); | 83 standardName = ucnv_getStandardName(name, secondaryStandard, &error); |
| 84 if (U_FAILURE(error) || !standardName) | 84 if (U_FAILURE(error) || !standardName) |
| 85 continue; | 85 continue; |
| 86 } | 86 } |
| 87 | 87 |
| 88 // A number of these aliases are handled in Chrome's copy of ICU, but | 88 // A number of these aliases are handled in Chrome's copy of ICU, but |
| 89 // Chromium can be compiled with the system ICU. | 89 // Chromium can be compiled with the system ICU. |
| 90 | 90 |
| 91 // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other br
owsers. | 91 // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other |
| 92 // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native
encoding | 92 // browsers. |
| 93 // for encoding GB_2312-80 and several others. So, we need to override this b
ehavior, too. | 93 // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native |
| 94 // encoding for encoding GB_2312-80 and several others. So, we need to |
| 95 // override this behavior, too. |
| 94 #if defined(USING_SYSTEM_ICU) | 96 #if defined(USING_SYSTEM_ICU) |
| 95 if (!strcmp(standardName, "GB2312") || !strcmp(standardName, "GB_2312-80")) | 97 if (!strcmp(standardName, "GB2312") || !strcmp(standardName, "GB_2312-80")) |
| 96 standardName = "GBK"; | 98 standardName = "GBK"; |
| 97 // Similarly, EUC-KR encodings all map to an extended version, but | 99 // Similarly, EUC-KR encodings all map to an extended version, but |
| 98 // per HTML5, the canonical name still should be EUC-KR. | 100 // per HTML5, the canonical name still should be EUC-KR. |
| 99 else if (!strcmp(standardName, "EUC-KR") || | 101 else if (!strcmp(standardName, "EUC-KR") || |
| 100 !strcmp(standardName, "KSC_5601") || | 102 !strcmp(standardName, "KSC_5601") || |
| 101 !strcmp(standardName, "cp1363")) | 103 !strcmp(standardName, "cp1363")) |
| 102 standardName = "EUC-KR"; | 104 standardName = "EUC-KR"; |
| 103 // And so on. | 105 // And so on. |
| 104 else if ( | 106 else if (!strcasecmp(standardName, "iso-8859-9")) |
| 105 !strcasecmp( | 107 // This name is returned in different case by ICU 3.2 and 3.6. |
| 106 standardName, | |
| 107 "iso-8859-9")) // This name is returned in different case by ICU 3.
2 and 3.6. | |
| 108 standardName = "windows-1254"; | 108 standardName = "windows-1254"; |
| 109 else if (!strcmp(standardName, "TIS-620")) | 109 else if (!strcmp(standardName, "TIS-620")) |
| 110 standardName = "windows-874"; | 110 standardName = "windows-874"; |
| 111 #endif | 111 #endif |
| 112 | 112 |
| 113 registrar(standardName, standardName); | 113 registrar(standardName, standardName); |
| 114 | 114 |
| 115 uint16_t numAliases = ucnv_countAliases(name, &error); | 115 uint16_t numAliases = ucnv_countAliases(name, &error); |
| 116 ASSERT(U_SUCCESS(error)); | 116 ASSERT(U_SUCCESS(error)); |
| 117 if (U_SUCCESS(error)) | 117 if (U_SUCCESS(error)) |
| (...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 175 registrar("ISO8859-6", "ISO-8859-6"); | 175 registrar("ISO8859-6", "ISO-8859-6"); |
| 176 registrar("ISO8859-7", "ISO-8859-7"); | 176 registrar("ISO8859-7", "ISO-8859-7"); |
| 177 registrar("ISO8859-8", "ISO-8859-8"); | 177 registrar("ISO8859-8", "ISO-8859-8"); |
| 178 registrar("ISO8859-8-I", "ISO-8859-8-I"); | 178 registrar("ISO8859-8-I", "ISO-8859-8-I"); |
| 179 registrar("ISO8859-9", "ISO-8859-9"); | 179 registrar("ISO8859-9", "ISO-8859-9"); |
| 180 registrar("ISO8859-10", "ISO-8859-10"); | 180 registrar("ISO8859-10", "ISO-8859-10"); |
| 181 registrar("ISO8859-13", "ISO-8859-13"); | 181 registrar("ISO8859-13", "ISO-8859-13"); |
| 182 registrar("ISO8859-14", "ISO-8859-14"); | 182 registrar("ISO8859-14", "ISO-8859-14"); |
| 183 registrar("ISO8859-15", "ISO-8859-15"); | 183 registrar("ISO8859-15", "ISO-8859-15"); |
| 184 // No need to have an entry for ISO8859-16. ISO-8859-16 has just one label | 184 // No need to have an entry for ISO8859-16. ISO-8859-16 has just one label |
| 185 // listed in WHATWG Encoding Living Standard (http://encoding.spec.whatwg.org/
). | 185 // listed in WHATWG Encoding Living Standard, http://encoding.spec.whatwg.org/ |
| 186 | 186 |
| 187 // Additional aliases present in the WHATWG Encoding Standard | 187 // Additional aliases present in the WHATWG Encoding Standard |
| 188 // and Firefox (as of Oct 2014), but not in the upstream ICU. | 188 // and Firefox (as of Oct 2014), but not in the upstream ICU. |
| 189 // Three entries for windows-1252 need not be listed here because | 189 // Three entries for windows-1252 need not be listed here because |
| 190 // TextCodecLatin1 registers them. | 190 // TextCodecLatin1 registers them. |
| 191 registrar("csiso58gb231280", "GBK"); | 191 registrar("csiso58gb231280", "GBK"); |
| 192 registrar("csiso88596e", "ISO-8859-6"); | 192 registrar("csiso88596e", "ISO-8859-6"); |
| 193 registrar("csiso88596i", "ISO-8859-6"); | 193 registrar("csiso88596i", "ISO-8859-6"); |
| 194 registrar("csiso88598e", "ISO-8859-8"); | 194 registrar("csiso88598e", "ISO-8859-8"); |
| 195 registrar("gb_2312", "GBK"); | 195 registrar("gb_2312", "GBK"); |
| (...skipping 176 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 372 int32_t* offsets = nullptr; | 372 int32_t* offsets = nullptr; |
| 373 UErrorCode err = U_ZERO_ERROR; | 373 UErrorCode err = U_ZERO_ERROR; |
| 374 | 374 |
| 375 do { | 375 do { |
| 376 int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit, | 376 int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit, |
| 377 offsets, flush != DoNotFlush, err); | 377 offsets, flush != DoNotFlush, err); |
| 378 result.append(buffer, ucharsDecoded); | 378 result.append(buffer, ucharsDecoded); |
| 379 } while (err == U_BUFFER_OVERFLOW_ERROR); | 379 } while (err == U_BUFFER_OVERFLOW_ERROR); |
| 380 | 380 |
| 381 if (U_FAILURE(err)) { | 381 if (U_FAILURE(err)) { |
| 382 // flush the converter so it can be reused, and not be bothered by this erro
r. | 382 // flush the converter so it can be reused, and not be bothered by this |
| 383 // error. |
| 383 do { | 384 do { |
| 384 decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true, | 385 decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true, |
| 385 err); | 386 err); |
| 386 } while (source < sourceLimit); | 387 } while (source < sourceLimit); |
| 387 sawError = true; | 388 sawError = true; |
| 388 } | 389 } |
| 389 | 390 |
| 390 #if !defined(USING_SYSTEM_ICU) | 391 #if !defined(USING_SYSTEM_ICU) |
| 391 // Chrome's copy of ICU does not have the issue described below. | 392 // Chrome's copy of ICU does not have the issue described below. |
| 392 return result.toString(); | 393 return result.toString(); |
| 393 #else | 394 #else |
| 394 String resultString = result.toString(); | 395 String resultString = result.toString(); |
| 395 | 396 |
| 396 // <http://bugs.webkit.org/show_bug.cgi?id=17014> | 397 // <http://bugs.webkit.org/show_bug.cgi?id=17014> |
| 397 // Simplified Chinese pages use the code A3A0 to mean "full-width space", but
ICU decodes it as U+E5E5. | 398 // Simplified Chinese pages use the code A3A0 to mean "full-width space", but |
| 399 // ICU decodes it as U+E5E5. |
| 398 if (!strcmp(m_encoding.name(), "GBK")) { | 400 if (!strcmp(m_encoding.name(), "GBK")) { |
| 399 if (!strcasecmp(m_encoding.name(), "gb18030")) | 401 if (!strcasecmp(m_encoding.name(), "gb18030")) |
| 400 resultString.replace(0xE5E5, ideographicSpaceCharacter); | 402 resultString.replace(0xE5E5, ideographicSpaceCharacter); |
| 401 // Make GBK compliant to the encoding spec and align with GB18030 | 403 // Make GBK compliant to the encoding spec and align with GB18030 |
| 402 resultString.replace(0x01F9, 0xE7C8); | 404 resultString.replace(0x01F9, 0xE7C8); |
| 403 // FIXME: Once https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3 | 405 // FIXME: Once https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3 |
| 404 // is resolved, add U+1E3F => 0xE7C7. | 406 // is resolved, add U+1E3F => 0xE7C7. |
| 405 } | 407 } |
| 406 | 408 |
| 407 return resultString; | 409 return resultString; |
| 408 #endif | 410 #endif |
| 409 } | 411 } |
| 410 | 412 |
| 411 #if defined(USING_SYSTEM_ICU) | 413 #if defined(USING_SYSTEM_ICU) |
| 412 // U+01F9 and U+1E3F have to be mapped to xA8xBF and xA8xBC per the encoding | 414 // U+01F9 and U+1E3F have to be mapped to xA8xBF and xA8xBC per the encoding |
| 413 // spec, but ICU converter does not have them. | 415 // spec, but ICU converter does not have them. |
| 414 static UChar fallbackForGBK(UChar32 character) { | 416 static UChar fallbackForGBK(UChar32 character) { |
| 415 switch (character) { | 417 switch (character) { |
| 416 case 0x01F9: | 418 case 0x01F9: |
| 417 return 0xE7C8; // mapped to xA8xBF by ICU. | 419 return 0xE7C8; // mapped to xA8xBF by ICU. |
| 418 case 0x1E3F: | 420 case 0x1E3F: |
| 419 return 0xE7C7; // mapped to xA8xBC by ICU. | 421 return 0xE7C7; // mapped to xA8xBC by ICU. |
| 420 } | 422 } |
| 421 return 0; | 423 return 0; |
| 422 } | 424 } |
| 423 #endif | 425 #endif |
| 424 | 426 |
| 425 // Generic helper for writing escaped entities using the specified UnencodableHan
dling. | 427 // Generic helper for writing escaped entities using the specified |
| 428 // UnencodableHandling. |
| 426 static void formatEscapedEntityCallback(const void* context, | 429 static void formatEscapedEntityCallback(const void* context, |
| 427 UConverterFromUnicodeArgs* fromUArgs, | 430 UConverterFromUnicodeArgs* fromUArgs, |
| 428 const UChar* codeUnits, | 431 const UChar* codeUnits, |
| 429 int32_t length, | 432 int32_t length, |
| 430 UChar32 codePoint, | 433 UChar32 codePoint, |
| 431 UConverterCallbackReason reason, | 434 UConverterCallbackReason reason, |
| 432 UErrorCode* err, | 435 UErrorCode* err, |
| 433 UnencodableHandling handling) { | 436 UnencodableHandling handling) { |
| 434 if (reason == UCNV_UNASSIGNED) { | 437 if (reason == UCNV_UNASSIGNED) { |
| 435 *err = U_ZERO_ERROR; | 438 *err = U_ZERO_ERROR; |
| (...skipping 13 matching lines...) Expand all Loading... |
| 449 const UChar* codeUnits, | 452 const UChar* codeUnits, |
| 450 int32_t length, | 453 int32_t length, |
| 451 UChar32 codePoint, | 454 UChar32 codePoint, |
| 452 UConverterCallbackReason reason, | 455 UConverterCallbackReason reason, |
| 453 UErrorCode* err) { | 456 UErrorCode* err) { |
| 454 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, | 457 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, |
| 455 reason, err, EntitiesForUnencodables); | 458 reason, err, EntitiesForUnencodables); |
| 456 } | 459 } |
| 457 | 460 |
| 458 // Invalid character handler when writing escaped entities in CSS encoding for | 461 // Invalid character handler when writing escaped entities in CSS encoding for |
| 459 // unrepresentable characters. See the declaration of TextCodec::encode for more
. | 462 // unrepresentable characters. See the declaration of TextCodec::encode for |
| 463 // more. |
| 460 static void cssEscapedEntityCallback(const void* context, | 464 static void cssEscapedEntityCallback(const void* context, |
| 461 UConverterFromUnicodeArgs* fromUArgs, | 465 UConverterFromUnicodeArgs* fromUArgs, |
| 462 const UChar* codeUnits, | 466 const UChar* codeUnits, |
| 463 int32_t length, | 467 int32_t length, |
| 464 UChar32 codePoint, | 468 UChar32 codePoint, |
| 465 UConverterCallbackReason reason, | 469 UConverterCallbackReason reason, |
| 466 UErrorCode* err) { | 470 UErrorCode* err) { |
| 467 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, | 471 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, |
| 468 reason, err, CSSEncodedEntitiesForUnencodables); | 472 reason, err, CSSEncodedEntitiesForUnencodables); |
| 469 } | 473 } |
| 470 | 474 |
| 471 // Invalid character handler when writing escaped entities in URL encoding
for | 475 // Invalid character handler when writing escaped entities in URL encoding |
| 472 // unrepresentable characters. See the declaration of TextCodec::encode for more
. | 476 // for unrepresentable characters. See the declaration of TextCodec::encode for |
| 477 // more. |
| 473 static void urlEscapedEntityCallback(const void* context, | 478 static void urlEscapedEntityCallback(const void* context, |
| 474 UConverterFromUnicodeArgs* fromUArgs, | 479 UConverterFromUnicodeArgs* fromUArgs, |
| 475 const UChar* codeUnits, | 480 const UChar* codeUnits, |
| 476 int32_t length, | 481 int32_t length, |
| 477 UChar32 codePoint, | 482 UChar32 codePoint, |
| 478 UConverterCallbackReason reason, | 483 UConverterCallbackReason reason, |
| 479 UErrorCode* err) { | 484 UErrorCode* err) { |
| 480 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, | 485 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, |
| 481 reason, err, URLEncodedEntitiesForUnencodables); | 486 reason, err, URLEncodedEntitiesForUnencodables); |
| 482 } | 487 } |
| (...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 693 return encodeCommon(characters, length, handling); | 698 return encodeCommon(characters, length, handling); |
| 694 } | 699 } |
| 695 | 700 |
| 696 CString TextCodecICU::encode(const LChar* characters, | 701 CString TextCodecICU::encode(const LChar* characters, |
| 697 size_t length, | 702 size_t length, |
| 698 UnencodableHandling handling) { | 703 UnencodableHandling handling) { |
| 699 return encodeCommon(characters, length, handling); | 704 return encodeCommon(characters, length, handling); |
| 700 } | 705 } |
| 701 | 706 |
| 702 } // namespace WTF | 707 } // namespace WTF |
| OLD | NEW |