third_party/WebKit/WebCore/platform/text/TextEncoding.cpp - Issue 174528: japanese encoding webkit fixes for 3.0 branch

Unified Diff: third_party/WebKit/WebCore/platform/text/TextEncoding.cpp

Issue 174528: japanese encoding webkit fixes for 3.0 branch (Closed) Base URL: svn://chrome-svn/chrome/branches/195/src/

Patch Set: Created 11 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/WebKit/WebCore/platform/text/TextEncoding.cpp

===================================================================

--- third_party/WebKit/WebCore/platform/text/TextEncoding.cpp (revision 23786)

+++ third_party/WebKit/WebCore/platform/text/TextEncoding.cpp (working copy)

@@ -1,271 +1,279 @@

-/*

- *

- * Redistribution and use in source and binary forms, with or without

- * modification, are permitted provided that the following conditions

- * are met:

- * 1. Redistributions of source code must retain the above copyright

- * notice, this list of conditions and the following disclaimer.

- * 2. Redistributions in binary form must reproduce the above copyright

- * notice, this list of conditions and the following disclaimer in the

- * documentation and/or other materials provided with the distribution.

- *

- * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY

- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR

- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR

- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY

- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

- */

-#include "config.h"

-#include "TextEncoding.h"

-#include "CString.h"

-#include "PlatformString.h"

-#include "TextCodec.h"

-#include "TextEncodingRegistry.h"

-#if USE(ICU_UNICODE) || USE(GLIB_ICU_UNICODE_HYBRID)

-#include <unicode/unorm.h>

-#elif USE(QT4_UNICODE)

-#include <QString>

-#endif

-#include <wtf/HashSet.h>

-#include <wtf/OwnPtr.h>

-#include <wtf/StdLibExtras.h>

-namespace WebCore {

-static void addEncodingName(HashSet<const char*>& set, const char* name)

- const char* atomicName = atomicCanonicalTextEncodingName(name);

- if (atomicName)

- set.add(atomicName);

-static const TextEncoding& UTF7Encoding()

- static TextEncoding globalUTF7Encoding("UTF-7");

- return globalUTF7Encoding;

-TextEncoding::TextEncoding(const char* name)

- : m_name(atomicCanonicalTextEncodingName(name))

- , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())

-TextEncoding::TextEncoding(const String& name)

- : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))

- , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())

-String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const

- if (!m_name)

- return String();

- return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);

-CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const

- if (!m_name)

- return CString();

- if (!length)

- return "";

-#if USE(ICU_UNICODE) || USE(GLIB_ICU_UNICODE_HYBRID)

- // FIXME: What's the right place to do normalization?

- // It's a little strange to do it inside the encode function.

- // Perhaps normalization should be an explicit step done before calling encode.

- const UChar* source = characters;

- size_t sourceLength = length;

- Vector<UChar> normalizedCharacters;

- UErrorCode err = U_ZERO_ERROR;

- if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {

- // First try using the length of the original string, since normalization to NFC rarely increases length.

- normalizedCharacters.grow(sourceLength);

- int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);

- if (err == U_BUFFER_OVERFLOW_ERROR) {

- err = U_ZERO_ERROR;

- normalizedCharacters.resize(normalizedLength);

- normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);

- }

- ASSERT(U_SUCCESS(err));

- source = normalizedCharacters.data();

- sourceLength = normalizedLength;

- }

- return newTextCodec(*this)->encode(source, sourceLength, handling);

-#elif USE(QT4_UNICODE)

- QString str(reinterpret_cast<const QChar*>(characters), length);

- str = str.normalized(QString::NormalizationForm_C);

- return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);

-#endif

-const char* TextEncoding::domName() const

- if (noExtendedTextEncodingNameUsed())

- return m_name;

- // We treat EUC-KR as windows-949 (its superset), but need to expose

- // the name 'EUC-KR' because the name 'windows-949' is not recognized by

- // most Korean web servers even though they do use the encoding

- // 'windows-949' with the name 'EUC-KR'.

- // FIXME: This is not thread-safe. At the moment, this function is

- // only accessed in a single thread, but eventually has to be made

- // thread-safe along with usesVisualOrdering().

- static const char* const a = atomicCanonicalTextEncodingName("windows-949");

- if (m_name == a)

- return "EUC-KR";

- return m_name;

-bool TextEncoding::usesVisualOrdering() const

- if (noExtendedTextEncodingNameUsed())

- return false;

- static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");

- return m_name == a;

-bool TextEncoding::isJapanese() const

- if (noExtendedTextEncodingNameUsed())

- return false;

- DEFINE_STATIC_LOCAL(HashSet<const char*>, set, ());

- if (set.isEmpty()) {

- addEncodingName(set, "x-mac-japanese");

- addEncodingName(set, "cp932");

- addEncodingName(set, "JIS_X0201");

- addEncodingName(set, "JIS_X0208-1983");

- addEncodingName(set, "JIS_X0208-1990");

- addEncodingName(set, "JIS_X0212-1990");

- addEncodingName(set, "JIS_C6226-1978");

- addEncodingName(set, "Shift_JIS_X0213-2000");

- addEncodingName(set, "ISO-2022-JP");

- addEncodingName(set, "ISO-2022-JP-2");

- addEncodingName(set, "ISO-2022-JP-1");

- addEncodingName(set, "ISO-2022-JP-3");

- addEncodingName(set, "EUC-JP");

- addEncodingName(set, "Shift_JIS");

- }

- return m_name && set.contains(m_name);

-UChar TextEncoding::backslashAsCurrencySymbol() const

- if (noExtendedTextEncodingNameUsed())

- return '\\';

- // The text encodings below treat backslash as a currency symbol.

- // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.

- static const char* const a = atomicCanonicalTextEncodingName("Shift_JIS_X0213-2000");

- static const char* const b = atomicCanonicalTextEncodingName("EUC-JP");

- return (m_name == a || m_name == b) ? 0x00A5 : '\\';

-bool TextEncoding::isNonByteBasedEncoding() const

- if (noExtendedTextEncodingNameUsed()) {

- return *this == UTF16LittleEndianEncoding()

- || *this == UTF16BigEndianEncoding();

- }

- return *this == UTF16LittleEndianEncoding()

- || *this == UTF16BigEndianEncoding()

- || *this == UTF32BigEndianEncoding()

- || *this == UTF32LittleEndianEncoding();

-bool TextEncoding::isUTF7Encoding() const

- if (noExtendedTextEncodingNameUsed())

- return false;

- return *this == UTF7Encoding();

-const TextEncoding& TextEncoding::closestByteBasedEquivalent() const

- if (isNonByteBasedEncoding())

- return UTF8Encoding();

- return *this;

-// HTML5 specifies that UTF-8 be used in form submission when a form is

-// is a part of a document in UTF-16 probably because UTF-16 is not a

-// byte-based encoding and can contain 0x00. By extension, the same

-// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,

-// but it's fraught with problems and we'd rather steer clear of it.

-const TextEncoding& TextEncoding::encodingForFormSubmission() const

- if (isNonByteBasedEncoding() || isUTF7Encoding())

- return UTF8Encoding();

- return *this;

-const TextEncoding& ASCIIEncoding()

- static TextEncoding globalASCIIEncoding("ASCII");

- return globalASCIIEncoding;

-const TextEncoding& Latin1Encoding()

- static TextEncoding globalLatin1Encoding("Latin-1");

- return globalLatin1Encoding;

-const TextEncoding& UTF16BigEndianEncoding()

- static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");

- return globalUTF16BigEndianEncoding;

-const TextEncoding& UTF16LittleEndianEncoding()

- static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");

- return globalUTF16LittleEndianEncoding;

-const TextEncoding& UTF32BigEndianEncoding()

- static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");

- return globalUTF32BigEndianEncoding;

-const TextEncoding& UTF32LittleEndianEncoding()

- static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");

- return globalUTF32LittleEndianEncoding;

-const TextEncoding& UTF8Encoding()

- static TextEncoding globalUTF8Encoding("UTF-8");

- return globalUTF8Encoding;

-const TextEncoding& WindowsLatin1Encoding()

- static TextEncoding globalWindowsLatin1Encoding("WinLatin-1");

- return globalWindowsLatin1Encoding;

-} // namespace WebCore

+/*

+ *

+ * Redistribution and use in source and binary forms, with or without

+ * modification, are permitted provided that the following conditions

+ * are met:

+ * 1. Redistributions of source code must retain the above copyright

+ * notice, this list of conditions and the following disclaimer.

+ * 2. Redistributions in binary form must reproduce the above copyright

+ * notice, this list of conditions and the following disclaimer in the

+ * documentation and/or other materials provided with the distribution.

+ *

+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY

+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR

+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR

+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY

+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ */

+#include "config.h"

+#include "TextEncoding.h"

+#include "CString.h"

+#include "PlatformString.h"

+#include "TextCodec.h"

+#include "TextEncodingRegistry.h"

+#if USE(ICU_UNICODE) || USE(GLIB_ICU_UNICODE_HYBRID)

+#include <unicode/unorm.h>

+#elif USE(QT4_UNICODE)

+#include <QString>

+#endif

+#include <wtf/HashSet.h>

+#include <wtf/OwnPtr.h>

+#include <wtf/StdLibExtras.h>

+namespace WebCore {

+static void addEncodingName(HashSet<const char*>& set, const char* name)

+ const char* atomicName = atomicCanonicalTextEncodingName(name);

+ if (atomicName)

+ set.add(atomicName);

+static const TextEncoding& UTF7Encoding()

+ static TextEncoding globalUTF7Encoding("UTF-7");

+ return globalUTF7Encoding;

+TextEncoding::TextEncoding(const char* name)

+ : m_name(atomicCanonicalTextEncodingName(name))

+ , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())

+TextEncoding::TextEncoding(const String& name)

+ : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))

+ , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())

+String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const

+ if (!m_name)

+ return String();

+ return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);

+CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const

+ if (!m_name)

+ return CString();

+ if (!length)

+ return "";

+#if USE(ICU_UNICODE) || USE(GLIB_ICU_UNICODE_HYBRID)

+ // FIXME: What's the right place to do normalization?

+ // It's a little strange to do it inside the encode function.

+ // Perhaps normalization should be an explicit step done before calling encode.

+ const UChar* source = characters;

+ size_t sourceLength = length;

+ Vector<UChar> normalizedCharacters;

+ UErrorCode err = U_ZERO_ERROR;

+ if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {

+ // First try using the length of the original string, since normalization to NFC rarely increases length.

+ normalizedCharacters.grow(sourceLength);

+ int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);

+ if (err == U_BUFFER_OVERFLOW_ERROR) {

+ err = U_ZERO_ERROR;

+ normalizedCharacters.resize(normalizedLength);

+ normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);

+ }

+ ASSERT(U_SUCCESS(err));

+ source = normalizedCharacters.data();

+ sourceLength = normalizedLength;

+ }

+ return newTextCodec(*this)->encode(source, sourceLength, handling);

+#elif USE(QT4_UNICODE)

+ QString str(reinterpret_cast<const QChar*>(characters), length);

+ str = str.normalized(QString::NormalizationForm_C);

+ return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);

+#endif

+const char* TextEncoding::domName() const

+ if (noExtendedTextEncodingNameUsed())

+ return m_name;

+ // We treat EUC-KR as windows-949 (its superset), but need to expose

+ // the name 'EUC-KR' because the name 'windows-949' is not recognized by

+ // most Korean web servers even though they do use the encoding

+ // 'windows-949' with the name 'EUC-KR'.

+ // FIXME: This is not thread-safe. At the moment, this function is

+ // only accessed in a single thread, but eventually has to be made

+ // thread-safe along with usesVisualOrdering().

+ static const char* const a = atomicCanonicalTextEncodingName("windows-949");

+ if (m_name == a)

+ return "EUC-KR";

+ return m_name;

+bool TextEncoding::usesVisualOrdering() const

+ if (noExtendedTextEncodingNameUsed())

+ return false;

+ static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");

+ return m_name == a;

+#if !PLATFORM(CHROMIUM)

+bool TextEncoding::isJapanese() const

+ if (noExtendedTextEncodingNameUsed())

+ return false;

+ DEFINE_STATIC_LOCAL(HashSet<const char*>, set, ());

+ if (set.isEmpty()) {

+ addEncodingName(set, "x-mac-japanese");

+ addEncodingName(set, "cp932");

+ addEncodingName(set, "JIS_X0201");

+ addEncodingName(set, "JIS_X0208-1983");

+ addEncodingName(set, "JIS_X0208-1990");

+ addEncodingName(set, "JIS_X0212-1990");

+ addEncodingName(set, "JIS_C6226-1978");

+ addEncodingName(set, "Shift_JIS_X0213-2000");

+ addEncodingName(set, "ISO-2022-JP");

+ addEncodingName(set, "ISO-2022-JP-2");

+ addEncodingName(set, "ISO-2022-JP-1");

+ addEncodingName(set, "ISO-2022-JP-3");

+ addEncodingName(set, "EUC-JP");

+ addEncodingName(set, "Shift_JIS");

+ }

+ return m_name && set.contains(m_name);

+#endif

+UChar TextEncoding::backslashAsCurrencySymbol() const

+#if PLATFORM(CHROMIUM)

+ // Chromium does not want this. The blog article cited is not

+ // a justification for replacing U+005C with U+00A5.

+ return '\\';

+#else

+ if (noExtendedTextEncodingNameUsed())

+ return '\\';

+ // The text encodings below treat backslash as a currency symbol.

+ // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.

+ static const char* const a = atomicCanonicalTextEncodingName("Shift_JIS_X0213-2000");

+ static const char* const b = atomicCanonicalTextEncodingName("EUC-JP");

+ return (m_name == a || m_name == b) ? 0x00A5 : '\\';

+#endif

+bool TextEncoding::isNonByteBasedEncoding() const

+ if (noExtendedTextEncodingNameUsed()) {

+ return *this == UTF16LittleEndianEncoding()

+ || *this == UTF16BigEndianEncoding();

+ }

+ return *this == UTF16LittleEndianEncoding()

+ || *this == UTF16BigEndianEncoding()

+ || *this == UTF32BigEndianEncoding()

+ || *this == UTF32LittleEndianEncoding();

+bool TextEncoding::isUTF7Encoding() const

+ if (noExtendedTextEncodingNameUsed())

+ return false;

+ return *this == UTF7Encoding();

+const TextEncoding& TextEncoding::closestByteBasedEquivalent() const

+ if (isNonByteBasedEncoding())

+ return UTF8Encoding();

+ return *this;

+// HTML5 specifies that UTF-8 be used in form submission when a form is

+// a part of a document in UTF-16 probably because UTF-16 is not a

+// byte-based encoding and can contain 0x00. By extension, the same

+// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,

+// but it's fraught with problems and we'd rather steer clear of it.

+const TextEncoding& TextEncoding::encodingForFormSubmission() const

+ if (isNonByteBasedEncoding() || isUTF7Encoding())

+ return UTF8Encoding();

+ return *this;

+const TextEncoding& ASCIIEncoding()

+ static TextEncoding globalASCIIEncoding("ASCII");

+ return globalASCIIEncoding;

+const TextEncoding& Latin1Encoding()

+ static TextEncoding globalLatin1Encoding("Latin-1");

+ return globalLatin1Encoding;

+const TextEncoding& UTF16BigEndianEncoding()

+ static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");

+ return globalUTF16BigEndianEncoding;

+const TextEncoding& UTF16LittleEndianEncoding()

+ static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");

+ return globalUTF16LittleEndianEncoding;

+const TextEncoding& UTF32BigEndianEncoding()

+ static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");

+ return globalUTF32BigEndianEncoding;

+const TextEncoding& UTF32LittleEndianEncoding()

+ static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");

+ return globalUTF32LittleEndianEncoding;

+const TextEncoding& UTF8Encoding()

+ static TextEncoding globalUTF8Encoding("UTF-8");

+ return globalUTF8Encoding;

+const TextEncoding& WindowsLatin1Encoding()

+ static TextEncoding globalWindowsLatin1Encoding("WinLatin-1");

+ return globalWindowsLatin1Encoding;

+} // namespace WebCore

« no previous file with comments | « third_party/WebKit/WebCore/platform/text/TextEncoding.h ('k') | no next file » | no next file with comments »