Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(41)

Unified Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

Issue 2764283002: Move files in wtf/ to platform/wtf/ (Part 10). (Closed)
Patch Set: Rebase. Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp
diff --git a/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp b/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp
deleted file mode 100644
index 9446bc7989244cc30ade689f399d309e40b38173..0000000000000000000000000000000000000000
--- a/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "wtf/text/TextCodecUTF8.h"
-
-#include "wtf/PtrUtil.h"
-#include "wtf/text/CString.h"
-#include "wtf/text/CharacterNames.h"
-#include "wtf/text/StringBuffer.h"
-#include "wtf/text/TextCodecASCIIFastPath.h"
-#include <memory>
-
-namespace WTF {
-
-using namespace WTF::Unicode;
-
-// We'll use nonCharacter* constants to signal invalid utf-8.
-// The number in the name signals how many input bytes were invalid.
-const int nonCharacter1 = -1;
-const int nonCharacter2 = -2;
-const int nonCharacter3 = -3;
-
-bool isNonCharacter(int character) {
- return character >= nonCharacter3 && character <= nonCharacter1;
-}
-
-std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&,
- const void*) {
- return WTF::wrapUnique(new TextCodecUTF8);
-}
-
-void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {
- registrar("UTF-8", "UTF-8");
-
- // Additional aliases that originally were present in the encoding
- // table in WebKit on Macintosh, and subsequently added by
- // TextCodecICU. Perhaps we can prove some are not used on the web
- // and remove them.
- registrar("unicode11utf8", "UTF-8");
- registrar("unicode20utf8", "UTF-8");
- registrar("utf8", "UTF-8");
- registrar("x-unicode20utf8", "UTF-8");
-
- // Additional aliases present in the WHATWG Encoding Standard
- // (http://encoding.spec.whatwg.org/)
- // and Firefox (24), but not in ICU 4.6.
- registrar("unicode-1-1-utf-8", "UTF-8");
-}
-
-void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) {
- registrar("UTF-8", create, 0);
-}
-
-static inline int nonASCIISequenceLength(uint8_t firstByte) {
- static const uint8_t lengths[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
- return lengths[firstByte];
-}
-
-static inline int decodeNonASCIISequence(const uint8_t* sequence,
- unsigned length) {
- DCHECK(!isASCII(sequence[0]));
- if (length == 2) {
- DCHECK_LE(sequence[0], 0xDF);
- if (sequence[0] < 0xC2)
- return nonCharacter1;
- if (sequence[1] < 0x80 || sequence[1] > 0xBF)
- return nonCharacter1;
- return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
- }
- if (length == 3) {
- DCHECK_GE(sequence[0], 0xE0);
- DCHECK_LE(sequence[0], 0xEF);
- switch (sequence[0]) {
- case 0xE0:
- if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
- return nonCharacter1;
- break;
- case 0xED:
- if (sequence[1] < 0x80 || sequence[1] > 0x9F)
- return nonCharacter1;
- break;
- default:
- if (sequence[1] < 0x80 || sequence[1] > 0xBF)
- return nonCharacter1;
- }
- if (sequence[2] < 0x80 || sequence[2] > 0xBF)
- return nonCharacter2;
- return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -
- 0x000E2080;
- }
- DCHECK_EQ(length, 4u);
- DCHECK_GE(sequence[0], 0xF0);
- DCHECK_LE(sequence[0], 0xF4);
- switch (sequence[0]) {
- case 0xF0:
- if (sequence[1] < 0x90 || sequence[1] > 0xBF)
- return nonCharacter1;
- break;
- case 0xF4:
- if (sequence[1] < 0x80 || sequence[1] > 0x8F)
- return nonCharacter1;
- break;
- default:
- if (sequence[1] < 0x80 || sequence[1] > 0xBF)
- return nonCharacter1;
- }
- if (sequence[2] < 0x80 || sequence[2] > 0xBF)
- return nonCharacter2;
- if (sequence[3] < 0x80 || sequence[3] > 0xBF)
- return nonCharacter3;
- return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +
- sequence[3]) -
- 0x03C82080;
-}
-
-static inline UChar* appendCharacter(UChar* destination, int character) {
- DCHECK(!isNonCharacter(character));
- DCHECK(!U_IS_SURROGATE(character));
- if (U_IS_BMP(character)) {
- *destination++ = static_cast<UChar>(character);
- } else {
- *destination++ = U16_LEAD(character);
- *destination++ = U16_TRAIL(character);
- }
- return destination;
-}
-
-void TextCodecUTF8::consumePartialSequenceByte() {
- --m_partialSequenceSize;
- memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
-}
-
-void TextCodecUTF8::handleError(UChar*& destination,
- bool stopOnError,
- bool& sawError) {
- sawError = true;
- if (stopOnError)
- return;
- // Each error generates a replacement character and consumes one byte.
- *destination++ = replacementCharacter;
- consumePartialSequenceByte();
-}
-
-template <>
-bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination,
- const uint8_t*& source,
- const uint8_t* end,
- bool flush,
- bool,
- bool&) {
- DCHECK(m_partialSequenceSize);
- do {
- if (isASCII(m_partialSequence[0])) {
- *destination++ = m_partialSequence[0];
- consumePartialSequenceByte();
- continue;
- }
- int count = nonASCIISequenceLength(m_partialSequence[0]);
- if (!count)
- return true;
-
- if (count > m_partialSequenceSize) {
- if (count - m_partialSequenceSize > end - source) {
- if (!flush) {
- // The new data is not enough to complete the sequence, so
- // add it to the existing partial sequence.
- memcpy(m_partialSequence + m_partialSequenceSize, source,
- end - source);
- m_partialSequenceSize += end - source;
- return false;
- }
- // An incomplete partial sequence at the end is an error, but it will
- // create a 16 bit string due to the replacementCharacter. Let the 16
- // bit path handle the error.
- return true;
- }
- memcpy(m_partialSequence + m_partialSequenceSize, source,
- count - m_partialSequenceSize);
- source += count - m_partialSequenceSize;
- m_partialSequenceSize = count;
- }
- int character = decodeNonASCIISequence(m_partialSequence, count);
- if (character & ~0xff)
- return true;
-
- m_partialSequenceSize -= count;
- *destination++ = static_cast<LChar>(character);
- } while (m_partialSequenceSize);
-
- return false;
-}
-
-template <>
-bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination,
- const uint8_t*& source,
- const uint8_t* end,
- bool flush,
- bool stopOnError,
- bool& sawError) {
- DCHECK(m_partialSequenceSize);
- do {
- if (isASCII(m_partialSequence[0])) {
- *destination++ = m_partialSequence[0];
- consumePartialSequenceByte();
- continue;
- }
- int count = nonASCIISequenceLength(m_partialSequence[0]);
- if (!count) {
- handleError(destination, stopOnError, sawError);
- if (stopOnError)
- return false;
- continue;
- }
- if (count > m_partialSequenceSize) {
- if (count - m_partialSequenceSize > end - source) {
- if (!flush) {
- // The new data is not enough to complete the sequence, so
- // add it to the existing partial sequence.
- memcpy(m_partialSequence + m_partialSequenceSize, source,
- end - source);
- m_partialSequenceSize += end - source;
- return false;
- }
- // An incomplete partial sequence at the end is an error.
- handleError(destination, stopOnError, sawError);
- if (stopOnError)
- return false;
- continue;
- }
- memcpy(m_partialSequence + m_partialSequenceSize, source,
- count - m_partialSequenceSize);
- source += count - m_partialSequenceSize;
- m_partialSequenceSize = count;
- }
- int character = decodeNonASCIISequence(m_partialSequence, count);
- if (isNonCharacter(character)) {
- handleError(destination, stopOnError, sawError);
- if (stopOnError)
- return false;
- continue;
- }
-
- m_partialSequenceSize -= count;
- destination = appendCharacter(destination, character);
- } while (m_partialSequenceSize);
-
- return false;
-}
-
-String TextCodecUTF8::decode(const char* bytes,
- size_t length,
- FlushBehavior flush,
- bool stopOnError,
- bool& sawError) {
- // Each input byte might turn into a character.
- // That includes all bytes in the partial-sequence buffer because
- // each byte in an invalid sequence will turn into a replacement character.
- StringBuffer<LChar> buffer(m_partialSequenceSize + length);
-
- const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
- const uint8_t* end = source + length;
- const uint8_t* alignedEnd = alignToMachineWord(end);
- LChar* destination = buffer.characters();
-
- do {
- if (m_partialSequenceSize) {
- // Explicitly copy destination and source pointers to avoid taking
- // pointers to the local variables, which may harm code generation by
- // disabling some optimizations in some compilers.
- LChar* destinationForHandlePartialSequence = destination;
- const uint8_t* sourceForHandlePartialSequence = source;
- if (handlePartialSequence(destinationForHandlePartialSequence,
- sourceForHandlePartialSequence, end, flush,
- stopOnError, sawError)) {
- source = sourceForHandlePartialSequence;
- goto upConvertTo16Bit;
- }
- destination = destinationForHandlePartialSequence;
- source = sourceForHandlePartialSequence;
- if (m_partialSequenceSize)
- break;
- }
-
- while (source < end) {
- if (isASCII(*source)) {
- // Fast path for ASCII. Most UTF-8 text will be ASCII.
- if (isAlignedToMachineWord(source)) {
- while (source < alignedEnd) {
- MachineWord chunk =
- *reinterpret_cast_ptr<const MachineWord*>(source);
- if (!isAllASCII<LChar>(chunk))
- break;
- copyASCIIMachineWord(destination, source);
- source += sizeof(MachineWord);
- destination += sizeof(MachineWord);
- }
- if (source == end)
- break;
- if (!isASCII(*source))
- continue;
- }
- *destination++ = *source++;
- continue;
- }
- int count = nonASCIISequenceLength(*source);
- int character;
- if (count == 0) {
- character = nonCharacter1;
- } else {
- if (count > end - source) {
- SECURITY_DCHECK(end - source <
- static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
- DCHECK(!m_partialSequenceSize);
- m_partialSequenceSize = end - source;
- memcpy(m_partialSequence, source, m_partialSequenceSize);
- source = end;
- break;
- }
- character = decodeNonASCIISequence(source, count);
- }
- if (isNonCharacter(character)) {
- sawError = true;
- if (stopOnError)
- break;
-
- goto upConvertTo16Bit;
- }
- if (character > 0xff)
- goto upConvertTo16Bit;
-
- source += count;
- *destination++ = static_cast<LChar>(character);
- }
- } while (flush && m_partialSequenceSize);
-
- buffer.shrink(destination - buffer.characters());
-
- return String::adopt(buffer);
-
-upConvertTo16Bit:
- StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
-
- UChar* destination16 = buffer16.characters();
-
- // Copy the already converted characters
- for (LChar* converted8 = buffer.characters(); converted8 < destination;)
- *destination16++ = *converted8++;
-
- do {
- if (m_partialSequenceSize) {
- // Explicitly copy destination and source pointers to avoid taking
- // pointers to the local variables, which may harm code generation by
- // disabling some optimizations in some compilers.
- UChar* destinationForHandlePartialSequence = destination16;
- const uint8_t* sourceForHandlePartialSequence = source;
- handlePartialSequence(destinationForHandlePartialSequence,
- sourceForHandlePartialSequence, end, flush,
- stopOnError, sawError);
- destination16 = destinationForHandlePartialSequence;
- source = sourceForHandlePartialSequence;
- if (m_partialSequenceSize)
- break;
- }
-
- while (source < end) {
- if (isASCII(*source)) {
- // Fast path for ASCII. Most UTF-8 text will be ASCII.
- if (isAlignedToMachineWord(source)) {
- while (source < alignedEnd) {
- MachineWord chunk =
- *reinterpret_cast_ptr<const MachineWord*>(source);
- if (!isAllASCII<LChar>(chunk))
- break;
- copyASCIIMachineWord(destination16, source);
- source += sizeof(MachineWord);
- destination16 += sizeof(MachineWord);
- }
- if (source == end)
- break;
- if (!isASCII(*source))
- continue;
- }
- *destination16++ = *source++;
- continue;
- }
- int count = nonASCIISequenceLength(*source);
- int character;
- if (count == 0) {
- character = nonCharacter1;
- } else {
- if (count > end - source) {
- SECURITY_DCHECK(end - source <
- static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
- DCHECK(!m_partialSequenceSize);
- m_partialSequenceSize = end - source;
- memcpy(m_partialSequence, source, m_partialSequenceSize);
- source = end;
- break;
- }
- character = decodeNonASCIISequence(source, count);
- }
- if (isNonCharacter(character)) {
- sawError = true;
- if (stopOnError)
- break;
- // Each error generates one replacement character and consumes the
- // 'largest subpart' of the incomplete character.
- // Note that the nonCharacterX constants go from -1..-3 and contain
- // the negative of number of bytes comprising the broken encoding
- // detected. So subtracting c (when isNonCharacter(c)) adds the number
- // of broken bytes.
- *destination16++ = replacementCharacter;
- source -= character;
- continue;
- }
- source += count;
- destination16 = appendCharacter(destination16, character);
- }
- } while (flush && m_partialSequenceSize);
-
- buffer16.shrink(destination16 - buffer16.characters());
-
- return String::adopt(buffer16);
-}
-
-template <typename CharType>
-CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) {
- // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
- // BMP characters take only one UTF-16 code unit and can take up to 3 bytes
- // (3x).
- // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes
- // (2x).
- if (length > std::numeric_limits<size_t>::max() / 3)
- CRASH();
- Vector<uint8_t> bytes(length * 3);
-
- size_t i = 0;
- size_t bytesWritten = 0;
- while (i < length) {
- UChar32 character;
- U16_NEXT(characters, i, length, character);
- // U16_NEXT will simply emit a surrogate code point if an unmatched
- // surrogate is encountered; we must convert it to a
- // U+FFFD (REPLACEMENT CHARACTER) here.
- if (0xD800 <= character && character <= 0xDFFF)
- character = replacementCharacter;
- U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
- }
-
- return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
-}
-
-CString TextCodecUTF8::encode(const UChar* characters,
- size_t length,
- UnencodableHandling) {
- return encodeCommon(characters, length);
-}
-
-CString TextCodecUTF8::encode(const LChar* characters,
- size_t length,
- UnencodableHandling) {
- return encodeCommon(characters, length);
-}
-
-} // namespace WTF
« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecUTF8.h ('k') | third_party/WebKit/Source/wtf/text/TextCodecUserDefined.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698