Index: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp |
diff --git a/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp b/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp |
deleted file mode 100644 |
index 9446bc7989244cc30ade689f399d309e40b38173..0000000000000000000000000000000000000000 |
--- a/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp |
+++ /dev/null |
@@ -1,498 +0,0 @@ |
-/* |
- * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. |
- * |
- * Redistribution and use in source and binary forms, with or without |
- * modification, are permitted provided that the following conditions |
- * are met: |
- * 1. Redistributions of source code must retain the above copyright |
- * notice, this list of conditions and the following disclaimer. |
- * 2. Redistributions in binary form must reproduce the above copyright |
- * notice, this list of conditions and the following disclaimer in the |
- * documentation and/or other materials provided with the distribution. |
- * |
- * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY |
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR |
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
- */ |
- |
-#include "wtf/text/TextCodecUTF8.h" |
- |
-#include "wtf/PtrUtil.h" |
-#include "wtf/text/CString.h" |
-#include "wtf/text/CharacterNames.h" |
-#include "wtf/text/StringBuffer.h" |
-#include "wtf/text/TextCodecASCIIFastPath.h" |
-#include <memory> |
- |
-namespace WTF { |
- |
-using namespace WTF::Unicode; |
- |
-// We'll use nonCharacter* constants to signal invalid utf-8. |
-// The number in the name signals how many input bytes were invalid. |
-const int nonCharacter1 = -1; |
-const int nonCharacter2 = -2; |
-const int nonCharacter3 = -3; |
- |
-bool isNonCharacter(int character) { |
- return character >= nonCharacter3 && character <= nonCharacter1; |
-} |
- |
-std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&, |
- const void*) { |
- return WTF::wrapUnique(new TextCodecUTF8); |
-} |
- |
-void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) { |
- registrar("UTF-8", "UTF-8"); |
- |
- // Additional aliases that originally were present in the encoding |
- // table in WebKit on Macintosh, and subsequently added by |
- // TextCodecICU. Perhaps we can prove some are not used on the web |
- // and remove them. |
- registrar("unicode11utf8", "UTF-8"); |
- registrar("unicode20utf8", "UTF-8"); |
- registrar("utf8", "UTF-8"); |
- registrar("x-unicode20utf8", "UTF-8"); |
- |
- // Additional aliases present in the WHATWG Encoding Standard |
- // (http://encoding.spec.whatwg.org/) |
- // and Firefox (24), but not in ICU 4.6. |
- registrar("unicode-1-1-utf-8", "UTF-8"); |
-} |
- |
-void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) { |
- registrar("UTF-8", create, 0); |
-} |
- |
-static inline int nonASCIISequenceLength(uint8_t firstByte) { |
- static const uint8_t lengths[256] = { |
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
- 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
- 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
- 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
- return lengths[firstByte]; |
-} |
- |
-static inline int decodeNonASCIISequence(const uint8_t* sequence, |
- unsigned length) { |
- DCHECK(!isASCII(sequence[0])); |
- if (length == 2) { |
- DCHECK_LE(sequence[0], 0xDF); |
- if (sequence[0] < 0xC2) |
- return nonCharacter1; |
- if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
- return nonCharacter1; |
- return ((sequence[0] << 6) + sequence[1]) - 0x00003080; |
- } |
- if (length == 3) { |
- DCHECK_GE(sequence[0], 0xE0); |
- DCHECK_LE(sequence[0], 0xEF); |
- switch (sequence[0]) { |
- case 0xE0: |
- if (sequence[1] < 0xA0 || sequence[1] > 0xBF) |
- return nonCharacter1; |
- break; |
- case 0xED: |
- if (sequence[1] < 0x80 || sequence[1] > 0x9F) |
- return nonCharacter1; |
- break; |
- default: |
- if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
- return nonCharacter1; |
- } |
- if (sequence[2] < 0x80 || sequence[2] > 0xBF) |
- return nonCharacter2; |
- return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - |
- 0x000E2080; |
- } |
- DCHECK_EQ(length, 4u); |
- DCHECK_GE(sequence[0], 0xF0); |
- DCHECK_LE(sequence[0], 0xF4); |
- switch (sequence[0]) { |
- case 0xF0: |
- if (sequence[1] < 0x90 || sequence[1] > 0xBF) |
- return nonCharacter1; |
- break; |
- case 0xF4: |
- if (sequence[1] < 0x80 || sequence[1] > 0x8F) |
- return nonCharacter1; |
- break; |
- default: |
- if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
- return nonCharacter1; |
- } |
- if (sequence[2] < 0x80 || sequence[2] > 0xBF) |
- return nonCharacter2; |
- if (sequence[3] < 0x80 || sequence[3] > 0xBF) |
- return nonCharacter3; |
- return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + |
- sequence[3]) - |
- 0x03C82080; |
-} |
- |
-static inline UChar* appendCharacter(UChar* destination, int character) { |
- DCHECK(!isNonCharacter(character)); |
- DCHECK(!U_IS_SURROGATE(character)); |
- if (U_IS_BMP(character)) { |
- *destination++ = static_cast<UChar>(character); |
- } else { |
- *destination++ = U16_LEAD(character); |
- *destination++ = U16_TRAIL(character); |
- } |
- return destination; |
-} |
- |
-void TextCodecUTF8::consumePartialSequenceByte() { |
- --m_partialSequenceSize; |
- memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize); |
-} |
- |
-void TextCodecUTF8::handleError(UChar*& destination, |
- bool stopOnError, |
- bool& sawError) { |
- sawError = true; |
- if (stopOnError) |
- return; |
- // Each error generates a replacement character and consumes one byte. |
- *destination++ = replacementCharacter; |
- consumePartialSequenceByte(); |
-} |
- |
-template <> |
-bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, |
- const uint8_t*& source, |
- const uint8_t* end, |
- bool flush, |
- bool, |
- bool&) { |
- DCHECK(m_partialSequenceSize); |
- do { |
- if (isASCII(m_partialSequence[0])) { |
- *destination++ = m_partialSequence[0]; |
- consumePartialSequenceByte(); |
- continue; |
- } |
- int count = nonASCIISequenceLength(m_partialSequence[0]); |
- if (!count) |
- return true; |
- |
- if (count > m_partialSequenceSize) { |
- if (count - m_partialSequenceSize > end - source) { |
- if (!flush) { |
- // The new data is not enough to complete the sequence, so |
- // add it to the existing partial sequence. |
- memcpy(m_partialSequence + m_partialSequenceSize, source, |
- end - source); |
- m_partialSequenceSize += end - source; |
- return false; |
- } |
- // An incomplete partial sequence at the end is an error, but it will |
- // create a 16 bit string due to the replacementCharacter. Let the 16 |
- // bit path handle the error. |
- return true; |
- } |
- memcpy(m_partialSequence + m_partialSequenceSize, source, |
- count - m_partialSequenceSize); |
- source += count - m_partialSequenceSize; |
- m_partialSequenceSize = count; |
- } |
- int character = decodeNonASCIISequence(m_partialSequence, count); |
- if (character & ~0xff) |
- return true; |
- |
- m_partialSequenceSize -= count; |
- *destination++ = static_cast<LChar>(character); |
- } while (m_partialSequenceSize); |
- |
- return false; |
-} |
- |
-template <> |
-bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, |
- const uint8_t*& source, |
- const uint8_t* end, |
- bool flush, |
- bool stopOnError, |
- bool& sawError) { |
- DCHECK(m_partialSequenceSize); |
- do { |
- if (isASCII(m_partialSequence[0])) { |
- *destination++ = m_partialSequence[0]; |
- consumePartialSequenceByte(); |
- continue; |
- } |
- int count = nonASCIISequenceLength(m_partialSequence[0]); |
- if (!count) { |
- handleError(destination, stopOnError, sawError); |
- if (stopOnError) |
- return false; |
- continue; |
- } |
- if (count > m_partialSequenceSize) { |
- if (count - m_partialSequenceSize > end - source) { |
- if (!flush) { |
- // The new data is not enough to complete the sequence, so |
- // add it to the existing partial sequence. |
- memcpy(m_partialSequence + m_partialSequenceSize, source, |
- end - source); |
- m_partialSequenceSize += end - source; |
- return false; |
- } |
- // An incomplete partial sequence at the end is an error. |
- handleError(destination, stopOnError, sawError); |
- if (stopOnError) |
- return false; |
- continue; |
- } |
- memcpy(m_partialSequence + m_partialSequenceSize, source, |
- count - m_partialSequenceSize); |
- source += count - m_partialSequenceSize; |
- m_partialSequenceSize = count; |
- } |
- int character = decodeNonASCIISequence(m_partialSequence, count); |
- if (isNonCharacter(character)) { |
- handleError(destination, stopOnError, sawError); |
- if (stopOnError) |
- return false; |
- continue; |
- } |
- |
- m_partialSequenceSize -= count; |
- destination = appendCharacter(destination, character); |
- } while (m_partialSequenceSize); |
- |
- return false; |
-} |
- |
-String TextCodecUTF8::decode(const char* bytes, |
- size_t length, |
- FlushBehavior flush, |
- bool stopOnError, |
- bool& sawError) { |
- // Each input byte might turn into a character. |
- // That includes all bytes in the partial-sequence buffer because |
- // each byte in an invalid sequence will turn into a replacement character. |
- StringBuffer<LChar> buffer(m_partialSequenceSize + length); |
- |
- const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); |
- const uint8_t* end = source + length; |
- const uint8_t* alignedEnd = alignToMachineWord(end); |
- LChar* destination = buffer.characters(); |
- |
- do { |
- if (m_partialSequenceSize) { |
- // Explicitly copy destination and source pointers to avoid taking |
- // pointers to the local variables, which may harm code generation by |
- // disabling some optimizations in some compilers. |
- LChar* destinationForHandlePartialSequence = destination; |
- const uint8_t* sourceForHandlePartialSequence = source; |
- if (handlePartialSequence(destinationForHandlePartialSequence, |
- sourceForHandlePartialSequence, end, flush, |
- stopOnError, sawError)) { |
- source = sourceForHandlePartialSequence; |
- goto upConvertTo16Bit; |
- } |
- destination = destinationForHandlePartialSequence; |
- source = sourceForHandlePartialSequence; |
- if (m_partialSequenceSize) |
- break; |
- } |
- |
- while (source < end) { |
- if (isASCII(*source)) { |
- // Fast path for ASCII. Most UTF-8 text will be ASCII. |
- if (isAlignedToMachineWord(source)) { |
- while (source < alignedEnd) { |
- MachineWord chunk = |
- *reinterpret_cast_ptr<const MachineWord*>(source); |
- if (!isAllASCII<LChar>(chunk)) |
- break; |
- copyASCIIMachineWord(destination, source); |
- source += sizeof(MachineWord); |
- destination += sizeof(MachineWord); |
- } |
- if (source == end) |
- break; |
- if (!isASCII(*source)) |
- continue; |
- } |
- *destination++ = *source++; |
- continue; |
- } |
- int count = nonASCIISequenceLength(*source); |
- int character; |
- if (count == 0) { |
- character = nonCharacter1; |
- } else { |
- if (count > end - source) { |
- SECURITY_DCHECK(end - source < |
- static_cast<ptrdiff_t>(sizeof(m_partialSequence))); |
- DCHECK(!m_partialSequenceSize); |
- m_partialSequenceSize = end - source; |
- memcpy(m_partialSequence, source, m_partialSequenceSize); |
- source = end; |
- break; |
- } |
- character = decodeNonASCIISequence(source, count); |
- } |
- if (isNonCharacter(character)) { |
- sawError = true; |
- if (stopOnError) |
- break; |
- |
- goto upConvertTo16Bit; |
- } |
- if (character > 0xff) |
- goto upConvertTo16Bit; |
- |
- source += count; |
- *destination++ = static_cast<LChar>(character); |
- } |
- } while (flush && m_partialSequenceSize); |
- |
- buffer.shrink(destination - buffer.characters()); |
- |
- return String::adopt(buffer); |
- |
-upConvertTo16Bit: |
- StringBuffer<UChar> buffer16(m_partialSequenceSize + length); |
- |
- UChar* destination16 = buffer16.characters(); |
- |
- // Copy the already converted characters |
- for (LChar* converted8 = buffer.characters(); converted8 < destination;) |
- *destination16++ = *converted8++; |
- |
- do { |
- if (m_partialSequenceSize) { |
- // Explicitly copy destination and source pointers to avoid taking |
- // pointers to the local variables, which may harm code generation by |
- // disabling some optimizations in some compilers. |
- UChar* destinationForHandlePartialSequence = destination16; |
- const uint8_t* sourceForHandlePartialSequence = source; |
- handlePartialSequence(destinationForHandlePartialSequence, |
- sourceForHandlePartialSequence, end, flush, |
- stopOnError, sawError); |
- destination16 = destinationForHandlePartialSequence; |
- source = sourceForHandlePartialSequence; |
- if (m_partialSequenceSize) |
- break; |
- } |
- |
- while (source < end) { |
- if (isASCII(*source)) { |
- // Fast path for ASCII. Most UTF-8 text will be ASCII. |
- if (isAlignedToMachineWord(source)) { |
- while (source < alignedEnd) { |
- MachineWord chunk = |
- *reinterpret_cast_ptr<const MachineWord*>(source); |
- if (!isAllASCII<LChar>(chunk)) |
- break; |
- copyASCIIMachineWord(destination16, source); |
- source += sizeof(MachineWord); |
- destination16 += sizeof(MachineWord); |
- } |
- if (source == end) |
- break; |
- if (!isASCII(*source)) |
- continue; |
- } |
- *destination16++ = *source++; |
- continue; |
- } |
- int count = nonASCIISequenceLength(*source); |
- int character; |
- if (count == 0) { |
- character = nonCharacter1; |
- } else { |
- if (count > end - source) { |
- SECURITY_DCHECK(end - source < |
- static_cast<ptrdiff_t>(sizeof(m_partialSequence))); |
- DCHECK(!m_partialSequenceSize); |
- m_partialSequenceSize = end - source; |
- memcpy(m_partialSequence, source, m_partialSequenceSize); |
- source = end; |
- break; |
- } |
- character = decodeNonASCIISequence(source, count); |
- } |
- if (isNonCharacter(character)) { |
- sawError = true; |
- if (stopOnError) |
- break; |
- // Each error generates one replacement character and consumes the |
- // 'largest subpart' of the incomplete character. |
- // Note that the nonCharacterX constants go from -1..-3 and contain |
- // the negative of number of bytes comprising the broken encoding |
- // detected. So subtracting c (when isNonCharacter(c)) adds the number |
- // of broken bytes. |
- *destination16++ = replacementCharacter; |
- source -= character; |
- continue; |
- } |
- source += count; |
- destination16 = appendCharacter(destination16, character); |
- } |
- } while (flush && m_partialSequenceSize); |
- |
- buffer16.shrink(destination16 - buffer16.characters()); |
- |
- return String::adopt(buffer16); |
-} |
- |
-template <typename CharType> |
-CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) { |
- // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. |
- // BMP characters take only one UTF-16 code unit and can take up to 3 bytes |
- // (3x). |
- // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes |
- // (2x). |
- if (length > std::numeric_limits<size_t>::max() / 3) |
- CRASH(); |
- Vector<uint8_t> bytes(length * 3); |
- |
- size_t i = 0; |
- size_t bytesWritten = 0; |
- while (i < length) { |
- UChar32 character; |
- U16_NEXT(characters, i, length, character); |
- // U16_NEXT will simply emit a surrogate code point if an unmatched |
- // surrogate is encountered; we must convert it to a |
- // U+FFFD (REPLACEMENT CHARACTER) here. |
- if (0xD800 <= character && character <= 0xDFFF) |
- character = replacementCharacter; |
- U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); |
- } |
- |
- return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); |
-} |
- |
-CString TextCodecUTF8::encode(const UChar* characters, |
- size_t length, |
- UnencodableHandling) { |
- return encodeCommon(characters, length); |
-} |
- |
-CString TextCodecUTF8::encode(const LChar* characters, |
- size_t length, |
- UnencodableHandling) { |
- return encodeCommon(characters, length); |
-} |
- |
-} // namespace WTF |