Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(813)

Unified Diff: third_party/WebKit/WebCore/platform/text/TextEncoding.cpp

Issue 174528: japanese encoding webkit fixes for 3.0 branch (Closed) Base URL: svn://chrome-svn/chrome/branches/195/src/
Patch Set: Created 11 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/WebKit/WebCore/platform/text/TextEncoding.h ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/WebKit/WebCore/platform/text/TextEncoding.cpp
===================================================================
--- third_party/WebKit/WebCore/platform/text/TextEncoding.cpp (revision 23786)
+++ third_party/WebKit/WebCore/platform/text/TextEncoding.cpp (working copy)
@@ -1,271 +1,279 @@
-/*
- * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
- * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-#include "TextEncoding.h"
-
-#include "CString.h"
-#include "PlatformString.h"
-#include "TextCodec.h"
-#include "TextEncodingRegistry.h"
-#if USE(ICU_UNICODE) || USE(GLIB_ICU_UNICODE_HYBRID)
-#include <unicode/unorm.h>
-#elif USE(QT4_UNICODE)
-#include <QString>
-#endif
-#include <wtf/HashSet.h>
-#include <wtf/OwnPtr.h>
-#include <wtf/StdLibExtras.h>
-
-namespace WebCore {
-
-static void addEncodingName(HashSet<const char*>& set, const char* name)
-{
- const char* atomicName = atomicCanonicalTextEncodingName(name);
- if (atomicName)
- set.add(atomicName);
-}
-
-static const TextEncoding& UTF7Encoding()
-{
- static TextEncoding globalUTF7Encoding("UTF-7");
- return globalUTF7Encoding;
-}
-
-TextEncoding::TextEncoding(const char* name)
- : m_name(atomicCanonicalTextEncodingName(name))
- , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
-{
-}
-
-TextEncoding::TextEncoding(const String& name)
- : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))
- , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
-{
-}
-
-String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
-{
- if (!m_name)
- return String();
-
- return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
-}
-
-CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
-{
- if (!m_name)
- return CString();
-
- if (!length)
- return "";
-
-#if USE(ICU_UNICODE) || USE(GLIB_ICU_UNICODE_HYBRID)
- // FIXME: What's the right place to do normalization?
- // It's a little strange to do it inside the encode function.
- // Perhaps normalization should be an explicit step done before calling encode.
-
- const UChar* source = characters;
- size_t sourceLength = length;
-
- Vector<UChar> normalizedCharacters;
-
- UErrorCode err = U_ZERO_ERROR;
- if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
- // First try using the length of the original string, since normalization to NFC rarely increases length.
- normalizedCharacters.grow(sourceLength);
- int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
- if (err == U_BUFFER_OVERFLOW_ERROR) {
- err = U_ZERO_ERROR;
- normalizedCharacters.resize(normalizedLength);
- normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
- }
- ASSERT(U_SUCCESS(err));
-
- source = normalizedCharacters.data();
- sourceLength = normalizedLength;
- }
- return newTextCodec(*this)->encode(source, sourceLength, handling);
-#elif USE(QT4_UNICODE)
- QString str(reinterpret_cast<const QChar*>(characters), length);
- str = str.normalized(QString::NormalizationForm_C);
- return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
-#endif
-}
-
-const char* TextEncoding::domName() const
-{
- if (noExtendedTextEncodingNameUsed())
- return m_name;
-
- // We treat EUC-KR as windows-949 (its superset), but need to expose
- // the name 'EUC-KR' because the name 'windows-949' is not recognized by
- // most Korean web servers even though they do use the encoding
- // 'windows-949' with the name 'EUC-KR'.
- // FIXME: This is not thread-safe. At the moment, this function is
- // only accessed in a single thread, but eventually has to be made
- // thread-safe along with usesVisualOrdering().
- static const char* const a = atomicCanonicalTextEncodingName("windows-949");
- if (m_name == a)
- return "EUC-KR";
- return m_name;
-}
-
-bool TextEncoding::usesVisualOrdering() const
-{
- if (noExtendedTextEncodingNameUsed())
- return false;
-
- static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
- return m_name == a;
-}
-
-bool TextEncoding::isJapanese() const
-{
- if (noExtendedTextEncodingNameUsed())
- return false;
-
- DEFINE_STATIC_LOCAL(HashSet<const char*>, set, ());
- if (set.isEmpty()) {
- addEncodingName(set, "x-mac-japanese");
- addEncodingName(set, "cp932");
- addEncodingName(set, "JIS_X0201");
- addEncodingName(set, "JIS_X0208-1983");
- addEncodingName(set, "JIS_X0208-1990");
- addEncodingName(set, "JIS_X0212-1990");
- addEncodingName(set, "JIS_C6226-1978");
- addEncodingName(set, "Shift_JIS_X0213-2000");
- addEncodingName(set, "ISO-2022-JP");
- addEncodingName(set, "ISO-2022-JP-2");
- addEncodingName(set, "ISO-2022-JP-1");
- addEncodingName(set, "ISO-2022-JP-3");
- addEncodingName(set, "EUC-JP");
- addEncodingName(set, "Shift_JIS");
- }
- return m_name && set.contains(m_name);
-}
-
-UChar TextEncoding::backslashAsCurrencySymbol() const
-{
- if (noExtendedTextEncodingNameUsed())
- return '\\';
-
- // The text encodings below treat backslash as a currency symbol.
- // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
- static const char* const a = atomicCanonicalTextEncodingName("Shift_JIS_X0213-2000");
- static const char* const b = atomicCanonicalTextEncodingName("EUC-JP");
- return (m_name == a || m_name == b) ? 0x00A5 : '\\';
-}
-
-bool TextEncoding::isNonByteBasedEncoding() const
-{
- if (noExtendedTextEncodingNameUsed()) {
- return *this == UTF16LittleEndianEncoding()
- || *this == UTF16BigEndianEncoding();
- }
-
- return *this == UTF16LittleEndianEncoding()
- || *this == UTF16BigEndianEncoding()
- || *this == UTF32BigEndianEncoding()
- || *this == UTF32LittleEndianEncoding();
-}
-
-bool TextEncoding::isUTF7Encoding() const
-{
- if (noExtendedTextEncodingNameUsed())
- return false;
-
- return *this == UTF7Encoding();
-}
-
-const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
-{
- if (isNonByteBasedEncoding())
- return UTF8Encoding();
- return *this;
-}
-
-// HTML5 specifies that UTF-8 be used in form submission when a form is
-// is a part of a document in UTF-16 probably because UTF-16 is not a
-// byte-based encoding and can contain 0x00. By extension, the same
-// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
-// but it's fraught with problems and we'd rather steer clear of it.
-const TextEncoding& TextEncoding::encodingForFormSubmission() const
-{
- if (isNonByteBasedEncoding() || isUTF7Encoding())
- return UTF8Encoding();
- return *this;
-}
-
-const TextEncoding& ASCIIEncoding()
-{
- static TextEncoding globalASCIIEncoding("ASCII");
- return globalASCIIEncoding;
-}
-
-const TextEncoding& Latin1Encoding()
-{
- static TextEncoding globalLatin1Encoding("Latin-1");
- return globalLatin1Encoding;
-}
-
-const TextEncoding& UTF16BigEndianEncoding()
-{
- static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
- return globalUTF16BigEndianEncoding;
-}
-
-const TextEncoding& UTF16LittleEndianEncoding()
-{
- static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
- return globalUTF16LittleEndianEncoding;
-}
-
-const TextEncoding& UTF32BigEndianEncoding()
-{
- static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
- return globalUTF32BigEndianEncoding;
-}
-
-const TextEncoding& UTF32LittleEndianEncoding()
-{
- static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
- return globalUTF32LittleEndianEncoding;
-}
-
-const TextEncoding& UTF8Encoding()
-{
- static TextEncoding globalUTF8Encoding("UTF-8");
- return globalUTF8Encoding;
-}
-
-const TextEncoding& WindowsLatin1Encoding()
-{
- static TextEncoding globalWindowsLatin1Encoding("WinLatin-1");
- return globalWindowsLatin1Encoding;
-}
-
-} // namespace WebCore
+/*
+ * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
+ * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "TextEncoding.h"
+
+#include "CString.h"
+#include "PlatformString.h"
+#include "TextCodec.h"
+#include "TextEncodingRegistry.h"
+#if USE(ICU_UNICODE) || USE(GLIB_ICU_UNICODE_HYBRID)
+#include <unicode/unorm.h>
+#elif USE(QT4_UNICODE)
+#include <QString>
+#endif
+#include <wtf/HashSet.h>
+#include <wtf/OwnPtr.h>
+#include <wtf/StdLibExtras.h>
+
+namespace WebCore {
+
+static void addEncodingName(HashSet<const char*>& set, const char* name)
+{
+ const char* atomicName = atomicCanonicalTextEncodingName(name);
+ if (atomicName)
+ set.add(atomicName);
+}
+
+static const TextEncoding& UTF7Encoding()
+{
+ static TextEncoding globalUTF7Encoding("UTF-7");
+ return globalUTF7Encoding;
+}
+
+TextEncoding::TextEncoding(const char* name)
+ : m_name(atomicCanonicalTextEncodingName(name))
+ , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
+{
+}
+
+TextEncoding::TextEncoding(const String& name)
+ : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))
+ , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
+{
+}
+
+String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
+{
+ if (!m_name)
+ return String();
+
+ return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
+}
+
+CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
+{
+ if (!m_name)
+ return CString();
+
+ if (!length)
+ return "";
+
+#if USE(ICU_UNICODE) || USE(GLIB_ICU_UNICODE_HYBRID)
+ // FIXME: What's the right place to do normalization?
+ // It's a little strange to do it inside the encode function.
+ // Perhaps normalization should be an explicit step done before calling encode.
+
+ const UChar* source = characters;
+ size_t sourceLength = length;
+
+ Vector<UChar> normalizedCharacters;
+
+ UErrorCode err = U_ZERO_ERROR;
+ if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
+ // First try using the length of the original string, since normalization to NFC rarely increases length.
+ normalizedCharacters.grow(sourceLength);
+ int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
+ if (err == U_BUFFER_OVERFLOW_ERROR) {
+ err = U_ZERO_ERROR;
+ normalizedCharacters.resize(normalizedLength);
+ normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
+ }
+ ASSERT(U_SUCCESS(err));
+
+ source = normalizedCharacters.data();
+ sourceLength = normalizedLength;
+ }
+ return newTextCodec(*this)->encode(source, sourceLength, handling);
+#elif USE(QT4_UNICODE)
+ QString str(reinterpret_cast<const QChar*>(characters), length);
+ str = str.normalized(QString::NormalizationForm_C);
+ return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
+#endif
+}
+
+const char* TextEncoding::domName() const
+{
+ if (noExtendedTextEncodingNameUsed())
+ return m_name;
+
+ // We treat EUC-KR as windows-949 (its superset), but need to expose
+ // the name 'EUC-KR' because the name 'windows-949' is not recognized by
+ // most Korean web servers even though they do use the encoding
+ // 'windows-949' with the name 'EUC-KR'.
+ // FIXME: This is not thread-safe. At the moment, this function is
+ // only accessed in a single thread, but eventually has to be made
+ // thread-safe along with usesVisualOrdering().
+ static const char* const a = atomicCanonicalTextEncodingName("windows-949");
+ if (m_name == a)
+ return "EUC-KR";
+ return m_name;
+}
+
+bool TextEncoding::usesVisualOrdering() const
+{
+ if (noExtendedTextEncodingNameUsed())
+ return false;
+
+ static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
+ return m_name == a;
+}
+
+#if !PLATFORM(CHROMIUM)
+bool TextEncoding::isJapanese() const
+{
+ if (noExtendedTextEncodingNameUsed())
+ return false;
+
+ DEFINE_STATIC_LOCAL(HashSet<const char*>, set, ());
+ if (set.isEmpty()) {
+ addEncodingName(set, "x-mac-japanese");
+ addEncodingName(set, "cp932");
+ addEncodingName(set, "JIS_X0201");
+ addEncodingName(set, "JIS_X0208-1983");
+ addEncodingName(set, "JIS_X0208-1990");
+ addEncodingName(set, "JIS_X0212-1990");
+ addEncodingName(set, "JIS_C6226-1978");
+ addEncodingName(set, "Shift_JIS_X0213-2000");
+ addEncodingName(set, "ISO-2022-JP");
+ addEncodingName(set, "ISO-2022-JP-2");
+ addEncodingName(set, "ISO-2022-JP-1");
+ addEncodingName(set, "ISO-2022-JP-3");
+ addEncodingName(set, "EUC-JP");
+ addEncodingName(set, "Shift_JIS");
+ }
+ return m_name && set.contains(m_name);
+}
+#endif
+
+UChar TextEncoding::backslashAsCurrencySymbol() const
+{
+#if PLATFORM(CHROMIUM)
+ // Chromium always keeps backslash (U+005C) as a backslash. The blog article
+ // cited in the #else branch is not a justification for replacing it with U+00A5.
+ return '\\';
+#else
+ if (noExtendedTextEncodingNameUsed())
+ return '\\';
+
+ // The text encodings below treat backslash as a currency symbol.
+ // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
+ static const char* const a = atomicCanonicalTextEncodingName("Shift_JIS_X0213-2000");
+ static const char* const b = atomicCanonicalTextEncodingName("EUC-JP");
+ return (m_name == a || m_name == b) ? 0x00A5 : '\\';
+#endif
+}
+
+bool TextEncoding::isNonByteBasedEncoding() const
+{
+ if (noExtendedTextEncodingNameUsed()) {
+ return *this == UTF16LittleEndianEncoding()
+ || *this == UTF16BigEndianEncoding();
+ }
+
+ return *this == UTF16LittleEndianEncoding()
+ || *this == UTF16BigEndianEncoding()
+ || *this == UTF32BigEndianEncoding()
+ || *this == UTF32LittleEndianEncoding();
+}
+
+bool TextEncoding::isUTF7Encoding() const
+{
+ if (noExtendedTextEncodingNameUsed())
+ return false;
+
+ return *this == UTF7Encoding();
+}
+
+const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
+{
+ if (isNonByteBasedEncoding())
+ return UTF8Encoding();
+ return *this;
+}
+
+// HTML5 specifies that UTF-8 be used in form submission when a form is
+// a part of a document in UTF-16, probably because UTF-16 is not a
+// byte-based encoding and can contain 0x00. By extension, the same
+// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
+// but it's fraught with problems and we'd rather steer clear of it.
+const TextEncoding& TextEncoding::encodingForFormSubmission() const
+{
+ if (isNonByteBasedEncoding() || isUTF7Encoding())
+ return UTF8Encoding();
+ return *this;
+}
+
+const TextEncoding& ASCIIEncoding()
+{
+ static TextEncoding globalASCIIEncoding("ASCII");
+ return globalASCIIEncoding;
+}
+
+const TextEncoding& Latin1Encoding()
+{
+ static TextEncoding globalLatin1Encoding("Latin-1");
+ return globalLatin1Encoding;
+}
+
+const TextEncoding& UTF16BigEndianEncoding()
+{
+ static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
+ return globalUTF16BigEndianEncoding;
+}
+
+const TextEncoding& UTF16LittleEndianEncoding()
+{
+ static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
+ return globalUTF16LittleEndianEncoding;
+}
+
+const TextEncoding& UTF32BigEndianEncoding()
+{
+ static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
+ return globalUTF32BigEndianEncoding;
+}
+
+const TextEncoding& UTF32LittleEndianEncoding()
+{
+ static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
+ return globalUTF32LittleEndianEncoding;
+}
+
+const TextEncoding& UTF8Encoding()
+{
+ static TextEncoding globalUTF8Encoding("UTF-8");
+ return globalUTF8Encoding;
+}
+
+const TextEncoding& WindowsLatin1Encoding()
+{
+ static TextEncoding globalWindowsLatin1Encoding("WinLatin-1");
+ return globalWindowsLatin1Encoding;
+}
+
+} // namespace WebCore
« no previous file with comments | « third_party/WebKit/WebCore/platform/text/TextEncoding.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698