OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2004, 2006, 2007, 2008, 2011 Apple Inc. All rights reserved. | 2 * Copyright (C) 2004, 2006, 2007, 2008, 2011 Apple Inc. All rights reserved. |
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> | 3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> |
4 * | 4 * |
5 * Redistribution and use in source and binary forms, with or without | 5 * Redistribution and use in source and binary forms, with or without |
6 * modification, are permitted provided that the following conditions | 6 * modification, are permitted provided that the following conditions |
7 * are met: | 7 * are met: |
8 * 1. Redistributions of source code must retain the above copyright | 8 * 1. Redistributions of source code must retain the above copyright |
9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
10 * 2. Redistributions in binary form must reproduce the above copyright | 10 * 2. Redistributions in binary form must reproduce the above copyright |
(...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
81 // Try IANA to pick up 'windows-12xx' and other names | 81 // Try IANA to pick up 'windows-12xx' and other names |
82 // which are not preferred MIME names but are widely used. | 82 // which are not preferred MIME names but are widely used. |
83 standardName = ucnv_getStandardName(name, secondaryStandard, &error); | 83 standardName = ucnv_getStandardName(name, secondaryStandard, &error); |
84 if (U_FAILURE(error) || !standardName) | 84 if (U_FAILURE(error) || !standardName) |
85 continue; | 85 continue; |
86 } | 86 } |
87 | 87 |
88 // A number of these aliases are handled in Chrome's copy of ICU, but | 88 // A number of these aliases are handled in Chrome's copy of ICU, but |
89 // Chromium can be compiled with the system ICU. | 89 // Chromium can be compiled with the system ICU. |
90 | 90 |
91 // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers. | 91 // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other |
92 // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding | 92 // browsers. |
93 // for encoding GB_2312-80 and several others. So, we need to override this behavior, too. | 93 // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native |
| 94 // encoding for encoding GB_2312-80 and several others. So, we need to |
| 95 // override this behavior, too. |
94 #if defined(USING_SYSTEM_ICU) | 96 #if defined(USING_SYSTEM_ICU) |
95 if (!strcmp(standardName, "GB2312") || !strcmp(standardName, "GB_2312-80")) | 97 if (!strcmp(standardName, "GB2312") || !strcmp(standardName, "GB_2312-80")) |
96 standardName = "GBK"; | 98 standardName = "GBK"; |
97 // Similarly, EUC-KR encodings all map to an extended version, but | 99 // Similarly, EUC-KR encodings all map to an extended version, but |
98 // per HTML5, the canonical name still should be EUC-KR. | 100 // per HTML5, the canonical name still should be EUC-KR. |
99 else if (!strcmp(standardName, "EUC-KR") || | 101 else if (!strcmp(standardName, "EUC-KR") || |
100 !strcmp(standardName, "KSC_5601") || | 102 !strcmp(standardName, "KSC_5601") || |
101 !strcmp(standardName, "cp1363")) | 103 !strcmp(standardName, "cp1363")) |
102 standardName = "EUC-KR"; | 104 standardName = "EUC-KR"; |
103 // And so on. | 105 // And so on. |
104 else if ( | 106 else if (!strcasecmp(standardName, "iso-8859-9")) |
105 !strcasecmp( | 107 // This name is returned in different case by ICU 3.2 and 3.6. |
106 standardName, | |
107 "iso-8859-9")) // This name is returned in different case by ICU 3.2 and 3.6. | |
108 standardName = "windows-1254"; | 108 standardName = "windows-1254"; |
109 else if (!strcmp(standardName, "TIS-620")) | 109 else if (!strcmp(standardName, "TIS-620")) |
110 standardName = "windows-874"; | 110 standardName = "windows-874"; |
111 #endif | 111 #endif |
112 | 112 |
113 registrar(standardName, standardName); | 113 registrar(standardName, standardName); |
114 | 114 |
115 uint16_t numAliases = ucnv_countAliases(name, &error); | 115 uint16_t numAliases = ucnv_countAliases(name, &error); |
116 ASSERT(U_SUCCESS(error)); | 116 ASSERT(U_SUCCESS(error)); |
117 if (U_SUCCESS(error)) | 117 if (U_SUCCESS(error)) |
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
175 registrar("ISO8859-6", "ISO-8859-6"); | 175 registrar("ISO8859-6", "ISO-8859-6"); |
176 registrar("ISO8859-7", "ISO-8859-7"); | 176 registrar("ISO8859-7", "ISO-8859-7"); |
177 registrar("ISO8859-8", "ISO-8859-8"); | 177 registrar("ISO8859-8", "ISO-8859-8"); |
178 registrar("ISO8859-8-I", "ISO-8859-8-I"); | 178 registrar("ISO8859-8-I", "ISO-8859-8-I"); |
179 registrar("ISO8859-9", "ISO-8859-9"); | 179 registrar("ISO8859-9", "ISO-8859-9"); |
180 registrar("ISO8859-10", "ISO-8859-10"); | 180 registrar("ISO8859-10", "ISO-8859-10"); |
181 registrar("ISO8859-13", "ISO-8859-13"); | 181 registrar("ISO8859-13", "ISO-8859-13"); |
182 registrar("ISO8859-14", "ISO-8859-14"); | 182 registrar("ISO8859-14", "ISO-8859-14"); |
183 registrar("ISO8859-15", "ISO-8859-15"); | 183 registrar("ISO8859-15", "ISO-8859-15"); |
184 // No need to have an entry for ISO8859-16. ISO-8859-16 has just one label | 184 // No need to have an entry for ISO8859-16. ISO-8859-16 has just one label |
185 // listed in WHATWG Encoding Living Standard (http://encoding.spec.whatwg.org/). | 185 // listed in WHATWG Encoding Living Standard, http://encoding.spec.whatwg.org/ |
186 | 186 |
187 // Additional aliases present in the WHATWG Encoding Standard | 187 // Additional aliases present in the WHATWG Encoding Standard |
188 // and Firefox (as of Oct 2014), but not in the upstream ICU. | 188 // and Firefox (as of Oct 2014), but not in the upstream ICU. |
189 // Three entries for windows-1252 need not be listed here because | 189 // Three entries for windows-1252 need not be listed here because |
190 // TextCodecLatin1 registers them. | 190 // TextCodecLatin1 registers them. |
191 registrar("csiso58gb231280", "GBK"); | 191 registrar("csiso58gb231280", "GBK"); |
192 registrar("csiso88596e", "ISO-8859-6"); | 192 registrar("csiso88596e", "ISO-8859-6"); |
193 registrar("csiso88596i", "ISO-8859-6"); | 193 registrar("csiso88596i", "ISO-8859-6"); |
194 registrar("csiso88598e", "ISO-8859-8"); | 194 registrar("csiso88598e", "ISO-8859-8"); |
195 registrar("gb_2312", "GBK"); | 195 registrar("gb_2312", "GBK"); |
(...skipping 176 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
372 int32_t* offsets = nullptr; | 372 int32_t* offsets = nullptr; |
373 UErrorCode err = U_ZERO_ERROR; | 373 UErrorCode err = U_ZERO_ERROR; |
374 | 374 |
375 do { | 375 do { |
376 int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit, | 376 int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit, |
377 offsets, flush != DoNotFlush, err); | 377 offsets, flush != DoNotFlush, err); |
378 result.append(buffer, ucharsDecoded); | 378 result.append(buffer, ucharsDecoded); |
379 } while (err == U_BUFFER_OVERFLOW_ERROR); | 379 } while (err == U_BUFFER_OVERFLOW_ERROR); |
380 | 380 |
381 if (U_FAILURE(err)) { | 381 if (U_FAILURE(err)) { |
382 // flush the converter so it can be reused, and not be bothered by this error. | 382 // flush the converter so it can be reused, and not be bothered by this |
| 383 // error. |
383 do { | 384 do { |
384 decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true, | 385 decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true, |
385 err); | 386 err); |
386 } while (source < sourceLimit); | 387 } while (source < sourceLimit); |
387 sawError = true; | 388 sawError = true; |
388 } | 389 } |
389 | 390 |
390 #if !defined(USING_SYSTEM_ICU) | 391 #if !defined(USING_SYSTEM_ICU) |
391 // Chrome's copy of ICU does not have the issue described below. | 392 // Chrome's copy of ICU does not have the issue described below. |
392 return result.toString(); | 393 return result.toString(); |
393 #else | 394 #else |
394 String resultString = result.toString(); | 395 String resultString = result.toString(); |
395 | 396 |
396 // <http://bugs.webkit.org/show_bug.cgi?id=17014> | 397 // <http://bugs.webkit.org/show_bug.cgi?id=17014> |
397 // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5. | 398 // Simplified Chinese pages use the code A3A0 to mean "full-width space", but |
| 399 // ICU decodes it as U+E5E5. |
398 if (!strcmp(m_encoding.name(), "GBK")) { | 400 if (!strcmp(m_encoding.name(), "GBK")) { |
399 if (!strcasecmp(m_encoding.name(), "gb18030")) | 401 if (!strcasecmp(m_encoding.name(), "gb18030")) |
400 resultString.replace(0xE5E5, ideographicSpaceCharacter); | 402 resultString.replace(0xE5E5, ideographicSpaceCharacter); |
401 // Make GBK compliant to the encoding spec and align with GB18030 | 403 // Make GBK compliant to the encoding spec and align with GB18030 |
402 resultString.replace(0x01F9, 0xE7C8); | 404 resultString.replace(0x01F9, 0xE7C8); |
403 // FIXME: Once https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3 | 405 // FIXME: Once https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3 |
404 // is resolved, add U+1E3F => 0xE7C7. | 406 // is resolved, add U+1E3F => 0xE7C7. |
405 } | 407 } |
406 | 408 |
407 return resultString; | 409 return resultString; |
408 #endif | 410 #endif |
409 } | 411 } |
410 | 412 |
411 #if defined(USING_SYSTEM_ICU) | 413 #if defined(USING_SYSTEM_ICU) |
412 // U+01F9 and U+1E3F have to be mapped to xA8xBF and xA8xBC per the encoding | 414 // U+01F9 and U+1E3F have to be mapped to xA8xBF and xA8xBC per the encoding |
413 // spec, but ICU converter does not have them. | 415 // spec, but ICU converter does not have them. |
414 static UChar fallbackForGBK(UChar32 character) { | 416 static UChar fallbackForGBK(UChar32 character) { |
415 switch (character) { | 417 switch (character) { |
416 case 0x01F9: | 418 case 0x01F9: |
417 return 0xE7C8; // mapped to xA8xBF by ICU. | 419 return 0xE7C8; // mapped to xA8xBF by ICU. |
418 case 0x1E3F: | 420 case 0x1E3F: |
419 return 0xE7C7; // mapped to xA8xBC by ICU. | 421 return 0xE7C7; // mapped to xA8xBC by ICU. |
420 } | 422 } |
421 return 0; | 423 return 0; |
422 } | 424 } |
423 #endif | 425 #endif |
424 | 426 |
425 // Generic helper for writing escaped entities using the specified UnencodableHandling. | 427 // Generic helper for writing escaped entities using the specified |
| 428 // UnencodableHandling. |
426 static void formatEscapedEntityCallback(const void* context, | 429 static void formatEscapedEntityCallback(const void* context, |
427 UConverterFromUnicodeArgs* fromUArgs, | 430 UConverterFromUnicodeArgs* fromUArgs, |
428 const UChar* codeUnits, | 431 const UChar* codeUnits, |
429 int32_t length, | 432 int32_t length, |
430 UChar32 codePoint, | 433 UChar32 codePoint, |
431 UConverterCallbackReason reason, | 434 UConverterCallbackReason reason, |
432 UErrorCode* err, | 435 UErrorCode* err, |
433 UnencodableHandling handling) { | 436 UnencodableHandling handling) { |
434 if (reason == UCNV_UNASSIGNED) { | 437 if (reason == UCNV_UNASSIGNED) { |
435 *err = U_ZERO_ERROR; | 438 *err = U_ZERO_ERROR; |
(...skipping 13 matching lines...) Expand all Loading... |
449 const UChar* codeUnits, | 452 const UChar* codeUnits, |
450 int32_t length, | 453 int32_t length, |
451 UChar32 codePoint, | 454 UChar32 codePoint, |
452 UConverterCallbackReason reason, | 455 UConverterCallbackReason reason, |
453 UErrorCode* err) { | 456 UErrorCode* err) { |
454 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, | 457 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, |
455 reason, err, EntitiesForUnencodables); | 458 reason, err, EntitiesForUnencodables); |
456 } | 459 } |
457 | 460 |
458 // Invalid character handler when writing escaped entities in CSS encoding for | 461 // Invalid character handler when writing escaped entities in CSS encoding for |
459 // unrepresentable characters. See the declaration of TextCodec::encode for more. | 462 // unrepresentable characters. See the declaration of TextCodec::encode for |
| 463 // more. |
460 static void cssEscapedEntityCallback(const void* context, | 464 static void cssEscapedEntityCallback(const void* context, |
461 UConverterFromUnicodeArgs* fromUArgs, | 465 UConverterFromUnicodeArgs* fromUArgs, |
462 const UChar* codeUnits, | 466 const UChar* codeUnits, |
463 int32_t length, | 467 int32_t length, |
464 UChar32 codePoint, | 468 UChar32 codePoint, |
465 UConverterCallbackReason reason, | 469 UConverterCallbackReason reason, |
466 UErrorCode* err) { | 470 UErrorCode* err) { |
467 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, | 471 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, |
468 reason, err, CSSEncodedEntitiesForUnencodables); | 472 reason, err, CSSEncodedEntitiesForUnencodables); |
469 } | 473 } |
470 | 474 |
471 // Invalid character handler when writing escaped entities in URL encoding for | 475 // Invalid character handler when writing escaped entities in URL encoding |
472 // unrepresentable characters. See the declaration of TextCodec::encode for more. | 476 // for unrepresentable characters. See the declaration of TextCodec::encode for |
| 477 // more. |
473 static void urlEscapedEntityCallback(const void* context, | 478 static void urlEscapedEntityCallback(const void* context, |
474 UConverterFromUnicodeArgs* fromUArgs, | 479 UConverterFromUnicodeArgs* fromUArgs, |
475 const UChar* codeUnits, | 480 const UChar* codeUnits, |
476 int32_t length, | 481 int32_t length, |
477 UChar32 codePoint, | 482 UChar32 codePoint, |
478 UConverterCallbackReason reason, | 483 UConverterCallbackReason reason, |
479 UErrorCode* err) { | 484 UErrorCode* err) { |
480 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, | 485 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, |
481 reason, err, URLEncodedEntitiesForUnencodables); | 486 reason, err, URLEncodedEntitiesForUnencodables); |
482 } | 487 } |
(...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
693 return encodeCommon(characters, length, handling); | 698 return encodeCommon(characters, length, handling); |
694 } | 699 } |
695 | 700 |
696 CString TextCodecICU::encode(const LChar* characters, | 701 CString TextCodecICU::encode(const LChar* characters, |
697 size_t length, | 702 size_t length, |
698 UnencodableHandling handling) { | 703 UnencodableHandling handling) { |
699 return encodeCommon(characters, length, handling); | 704 return encodeCommon(characters, length, handling); |
700 } | 705 } |
701 | 706 |
702 } // namespace WTF | 707 } // namespace WTF |
OLD | NEW |