| Index: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp
|
| diff --git a/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp b/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp
|
| index bc4f0ed74e35c936f915023482b3b7d98ab0103a..ea75fd7d1608f6b0f1d8ec23a588c6545473142d 100644
|
| --- a/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp
|
| +++ b/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp
|
| @@ -36,425 +36,441 @@ using namespace WTF::Unicode;
|
|
|
| const int nonCharacter = -1;
|
|
|
| -PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
|
| -{
|
| - return adoptPtr(new TextCodecUTF8);
|
| +PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) {
|
| + return adoptPtr(new TextCodecUTF8);
|
| }
|
|
|
| -void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
|
| -{
|
| - registrar("UTF-8", "UTF-8");
|
| -
|
| - // Additional aliases that originally were present in the encoding
|
| - // table in WebKit on Macintosh, and subsequently added by
|
| - // TextCodecICU. Perhaps we can prove some are not used on the web
|
| - // and remove them.
|
| - registrar("unicode11utf8", "UTF-8");
|
| - registrar("unicode20utf8", "UTF-8");
|
| - registrar("utf8", "UTF-8");
|
| - registrar("x-unicode20utf8", "UTF-8");
|
| -
|
| - // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
|
| - // and Firefox (24), but not in ICU 4.6.
|
| - registrar("unicode-1-1-utf-8", "UTF-8");
|
| +void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {
|
| + registrar("UTF-8", "UTF-8");
|
| +
|
| + // Additional aliases that originally were present in the encoding
|
| + // table in WebKit on Macintosh, and subsequently added by
|
| + // TextCodecICU. Perhaps we can prove some are not used on the web
|
| + // and remove them.
|
| + registrar("unicode11utf8", "UTF-8");
|
| + registrar("unicode20utf8", "UTF-8");
|
| + registrar("utf8", "UTF-8");
|
| + registrar("x-unicode20utf8", "UTF-8");
|
| +
|
| + // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
|
| + // and Firefox (24), but not in ICU 4.6.
|
| + registrar("unicode-1-1-utf-8", "UTF-8");
|
| }
|
|
|
| -void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
|
| -{
|
| - registrar("UTF-8", create, 0);
|
| +void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) {
|
| + registrar("UTF-8", create, 0);
|
| }
|
|
|
| -static inline int nonASCIISequenceLength(uint8_t firstByte)
|
| -{
|
| - static const uint8_t lengths[256] = {
|
| - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
| - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
| - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
| - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
| - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
| - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
| - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
| - 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
| - };
|
| - return lengths[firstByte];
|
| +static inline int nonASCIISequenceLength(uint8_t firstByte) {
|
| + static const uint8_t lengths[256] = {
|
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
| + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
| + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
| + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
| + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
| + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
| + return lengths[firstByte];
|
| }
|
|
|
| -static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
|
| -{
|
| - ASSERT(!isASCII(sequence[0]));
|
| - if (length == 2) {
|
| - ASSERT(sequence[0] <= 0xDF);
|
| - if (sequence[0] < 0xC2)
|
| - return nonCharacter;
|
| - if (sequence[1] < 0x80 || sequence[1] > 0xBF)
|
| - return nonCharacter;
|
| - return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
|
| - }
|
| - if (length == 3) {
|
| - ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
|
| - switch (sequence[0]) {
|
| - case 0xE0:
|
| - if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
|
| - return nonCharacter;
|
| - break;
|
| - case 0xED:
|
| - if (sequence[1] < 0x80 || sequence[1] > 0x9F)
|
| - return nonCharacter;
|
| - break;
|
| - default:
|
| - if (sequence[1] < 0x80 || sequence[1] > 0xBF)
|
| - return nonCharacter;
|
| - }
|
| - if (sequence[2] < 0x80 || sequence[2] > 0xBF)
|
| - return nonCharacter;
|
| - return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
|
| - }
|
| - ASSERT(length == 4);
|
| - ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
|
| +static inline int decodeNonASCIISequence(const uint8_t* sequence,
|
| + unsigned length) {
|
| + ASSERT(!isASCII(sequence[0]));
|
| + if (length == 2) {
|
| + ASSERT(sequence[0] <= 0xDF);
|
| + if (sequence[0] < 0xC2)
|
| + return nonCharacter;
|
| + if (sequence[1] < 0x80 || sequence[1] > 0xBF)
|
| + return nonCharacter;
|
| + return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
|
| + }
|
| + if (length == 3) {
|
| + ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
|
| switch (sequence[0]) {
|
| - case 0xF0:
|
| - if (sequence[1] < 0x90 || sequence[1] > 0xBF)
|
| - return nonCharacter;
|
| + case 0xE0:
|
| + if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
|
| + return nonCharacter;
|
| break;
|
| - case 0xF4:
|
| - if (sequence[1] < 0x80 || sequence[1] > 0x8F)
|
| - return nonCharacter;
|
| + case 0xED:
|
| + if (sequence[1] < 0x80 || sequence[1] > 0x9F)
|
| + return nonCharacter;
|
| break;
|
| - default:
|
| + default:
|
| if (sequence[1] < 0x80 || sequence[1] > 0xBF)
|
| - return nonCharacter;
|
| + return nonCharacter;
|
| }
|
| if (sequence[2] < 0x80 || sequence[2] > 0xBF)
|
| + return nonCharacter;
|
| + return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -
|
| + 0x000E2080;
|
| + }
|
| + ASSERT(length == 4);
|
| + ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
|
| + switch (sequence[0]) {
|
| + case 0xF0:
|
| + if (sequence[1] < 0x90 || sequence[1] > 0xBF)
|
| + return nonCharacter;
|
| + break;
|
| + case 0xF4:
|
| + if (sequence[1] < 0x80 || sequence[1] > 0x8F)
|
| return nonCharacter;
|
| - if (sequence[3] < 0x80 || sequence[3] > 0xBF)
|
| + break;
|
| + default:
|
| + if (sequence[1] < 0x80 || sequence[1] > 0xBF)
|
| return nonCharacter;
|
| - return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
|
| + }
|
| + if (sequence[2] < 0x80 || sequence[2] > 0xBF)
|
| + return nonCharacter;
|
| + if (sequence[3] < 0x80 || sequence[3] > 0xBF)
|
| + return nonCharacter;
|
| + return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +
|
| + sequence[3]) -
|
| + 0x03C82080;
|
| }
|
|
|
| -static inline UChar* appendCharacter(UChar* destination, int character)
|
| -{
|
| - ASSERT(character != nonCharacter);
|
| - ASSERT(!U_IS_SURROGATE(character));
|
| - if (U_IS_BMP(character)) {
|
| - *destination++ = static_cast<UChar>(character);
|
| - } else {
|
| - *destination++ = U16_LEAD(character);
|
| - *destination++ = U16_TRAIL(character);
|
| - }
|
| - return destination;
|
| +static inline UChar* appendCharacter(UChar* destination, int character) {
|
| + ASSERT(character != nonCharacter);
|
| + ASSERT(!U_IS_SURROGATE(character));
|
| + if (U_IS_BMP(character)) {
|
| + *destination++ = static_cast<UChar>(character);
|
| + } else {
|
| + *destination++ = U16_LEAD(character);
|
| + *destination++ = U16_TRAIL(character);
|
| + }
|
| + return destination;
|
| }
|
|
|
| -void TextCodecUTF8::consumePartialSequenceByte()
|
| -{
|
| - --m_partialSequenceSize;
|
| - memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
|
| +void TextCodecUTF8::consumePartialSequenceByte() {
|
| + --m_partialSequenceSize;
|
| + memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
|
| }
|
|
|
| -void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
|
| -{
|
| - sawError = true;
|
| - if (stopOnError)
|
| - return;
|
| - // Each error generates a replacement character and consumes one byte.
|
| - *destination++ = replacementCharacter;
|
| - consumePartialSequenceByte();
|
| +void TextCodecUTF8::handleError(UChar*& destination,
|
| + bool stopOnError,
|
| + bool& sawError) {
|
| + sawError = true;
|
| + if (stopOnError)
|
| + return;
|
| + // Each error generates a replacement character and consumes one byte.
|
| + *destination++ = replacementCharacter;
|
| + consumePartialSequenceByte();
|
| }
|
|
|
| template <>
|
| -bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
|
| -{
|
| - ASSERT(m_partialSequenceSize);
|
| - do {
|
| - if (isASCII(m_partialSequence[0])) {
|
| - *destination++ = m_partialSequence[0];
|
| - consumePartialSequenceByte();
|
| - continue;
|
| - }
|
| - int count = nonASCIISequenceLength(m_partialSequence[0]);
|
| - if (!count)
|
| - return true;
|
| -
|
| - if (count > m_partialSequenceSize) {
|
| - if (count - m_partialSequenceSize > end - source) {
|
| - if (!flush) {
|
| - // The new data is not enough to complete the sequence, so
|
| - // add it to the existing partial sequence.
|
| - memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
|
| - m_partialSequenceSize += end - source;
|
| - return false;
|
| - }
|
| - // An incomplete partial sequence at the end is an error, but it will create
|
| - // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
|
| - // the error.
|
| - return true;
|
| - }
|
| - memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
|
| - source += count - m_partialSequenceSize;
|
| - m_partialSequenceSize = count;
|
| +bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination,
|
| + const uint8_t*& source,
|
| + const uint8_t* end,
|
| + bool flush,
|
| + bool,
|
| + bool&) {
|
| + ASSERT(m_partialSequenceSize);
|
| + do {
|
| + if (isASCII(m_partialSequence[0])) {
|
| + *destination++ = m_partialSequence[0];
|
| + consumePartialSequenceByte();
|
| + continue;
|
| + }
|
| + int count = nonASCIISequenceLength(m_partialSequence[0]);
|
| + if (!count)
|
| + return true;
|
| +
|
| + if (count > m_partialSequenceSize) {
|
| + if (count - m_partialSequenceSize > end - source) {
|
| + if (!flush) {
|
| + // The new data is not enough to complete the sequence, so
|
| + // add it to the existing partial sequence.
|
| + memcpy(m_partialSequence + m_partialSequenceSize, source,
|
| + end - source);
|
| + m_partialSequenceSize += end - source;
|
| + return false;
|
| }
|
| - int character = decodeNonASCIISequence(m_partialSequence, count);
|
| - if (character & ~0xff)
|
| - return true;
|
| + // An incomplete partial sequence at the end is an error, but it will create
|
| + // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
|
| + // the error.
|
| + return true;
|
| + }
|
| + memcpy(m_partialSequence + m_partialSequenceSize, source,
|
| + count - m_partialSequenceSize);
|
| + source += count - m_partialSequenceSize;
|
| + m_partialSequenceSize = count;
|
| + }
|
| + int character = decodeNonASCIISequence(m_partialSequence, count);
|
| + if (character & ~0xff)
|
| + return true;
|
|
|
| - m_partialSequenceSize -= count;
|
| - *destination++ = static_cast<LChar>(character);
|
| - } while (m_partialSequenceSize);
|
| + m_partialSequenceSize -= count;
|
| + *destination++ = static_cast<LChar>(character);
|
| + } while (m_partialSequenceSize);
|
|
|
| - return false;
|
| + return false;
|
| }
|
|
|
| template <>
|
| -bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
|
| -{
|
| - ASSERT(m_partialSequenceSize);
|
| - do {
|
| - if (isASCII(m_partialSequence[0])) {
|
| - *destination++ = m_partialSequence[0];
|
| - consumePartialSequenceByte();
|
| - continue;
|
| - }
|
| - int count = nonASCIISequenceLength(m_partialSequence[0]);
|
| - if (!count) {
|
| - handleError(destination, stopOnError, sawError);
|
| - if (stopOnError)
|
| - return false;
|
| - continue;
|
| - }
|
| - if (count > m_partialSequenceSize) {
|
| - if (count - m_partialSequenceSize > end - source) {
|
| - if (!flush) {
|
| - // The new data is not enough to complete the sequence, so
|
| - // add it to the existing partial sequence.
|
| - memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
|
| - m_partialSequenceSize += end - source;
|
| - return false;
|
| - }
|
| - // An incomplete partial sequence at the end is an error.
|
| - handleError(destination, stopOnError, sawError);
|
| - if (stopOnError)
|
| - return false;
|
| - continue;
|
| - }
|
| - memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
|
| - source += count - m_partialSequenceSize;
|
| - m_partialSequenceSize = count;
|
| - }
|
| - int character = decodeNonASCIISequence(m_partialSequence, count);
|
| - if (character == nonCharacter) {
|
| - handleError(destination, stopOnError, sawError);
|
| - if (stopOnError)
|
| - return false;
|
| - continue;
|
| +bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination,
|
| + const uint8_t*& source,
|
| + const uint8_t* end,
|
| + bool flush,
|
| + bool stopOnError,
|
| + bool& sawError) {
|
| + ASSERT(m_partialSequenceSize);
|
| + do {
|
| + if (isASCII(m_partialSequence[0])) {
|
| + *destination++ = m_partialSequence[0];
|
| + consumePartialSequenceByte();
|
| + continue;
|
| + }
|
| + int count = nonASCIISequenceLength(m_partialSequence[0]);
|
| + if (!count) {
|
| + handleError(destination, stopOnError, sawError);
|
| + if (stopOnError)
|
| + return false;
|
| + continue;
|
| + }
|
| + if (count > m_partialSequenceSize) {
|
| + if (count - m_partialSequenceSize > end - source) {
|
| + if (!flush) {
|
| + // The new data is not enough to complete the sequence, so
|
| + // add it to the existing partial sequence.
|
| + memcpy(m_partialSequence + m_partialSequenceSize, source,
|
| + end - source);
|
| + m_partialSequenceSize += end - source;
|
| + return false;
|
| }
|
| + // An incomplete partial sequence at the end is an error.
|
| + handleError(destination, stopOnError, sawError);
|
| + if (stopOnError)
|
| + return false;
|
| + continue;
|
| + }
|
| + memcpy(m_partialSequence + m_partialSequenceSize, source,
|
| + count - m_partialSequenceSize);
|
| + source += count - m_partialSequenceSize;
|
| + m_partialSequenceSize = count;
|
| + }
|
| + int character = decodeNonASCIISequence(m_partialSequence, count);
|
| + if (character == nonCharacter) {
|
| + handleError(destination, stopOnError, sawError);
|
| + if (stopOnError)
|
| + return false;
|
| + continue;
|
| + }
|
|
|
| - m_partialSequenceSize -= count;
|
| - destination = appendCharacter(destination, character);
|
| - } while (m_partialSequenceSize);
|
| + m_partialSequenceSize -= count;
|
| + destination = appendCharacter(destination, character);
|
| + } while (m_partialSequenceSize);
|
|
|
| - return false;
|
| + return false;
|
| }
|
|
|
| -String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError)
|
| -{
|
| - // Each input byte might turn into a character.
|
| - // That includes all bytes in the partial-sequence buffer because
|
| - // each byte in an invalid sequence will turn into a replacement character.
|
| - StringBuffer<LChar> buffer(m_partialSequenceSize + length);
|
| -
|
| - const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
|
| - const uint8_t* end = source + length;
|
| - const uint8_t* alignedEnd = alignToMachineWord(end);
|
| - LChar* destination = buffer.characters();
|
| -
|
| - do {
|
| - if (m_partialSequenceSize) {
|
| - // Explicitly copy destination and source pointers to avoid taking pointers to the
|
| - // local variables, which may harm code generation by disabling some optimizations
|
| - // in some compilers.
|
| - LChar* destinationForHandlePartialSequence = destination;
|
| - const uint8_t* sourceForHandlePartialSequence = source;
|
| - if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
|
| - source = sourceForHandlePartialSequence;
|
| - goto upConvertTo16Bit;
|
| - }
|
| - destination = destinationForHandlePartialSequence;
|
| - source = sourceForHandlePartialSequence;
|
| - if (m_partialSequenceSize)
|
| - break;
|
| - }
|
| +String TextCodecUTF8::decode(const char* bytes,
|
| + size_t length,
|
| + FlushBehavior flush,
|
| + bool stopOnError,
|
| + bool& sawError) {
|
| + // Each input byte might turn into a character.
|
| + // That includes all bytes in the partial-sequence buffer because
|
| + // each byte in an invalid sequence will turn into a replacement character.
|
| + StringBuffer<LChar> buffer(m_partialSequenceSize + length);
|
| +
|
| + const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
|
| + const uint8_t* end = source + length;
|
| + const uint8_t* alignedEnd = alignToMachineWord(end);
|
| + LChar* destination = buffer.characters();
|
| +
|
| + do {
|
| + if (m_partialSequenceSize) {
|
| + // Explicitly copy destination and source pointers to avoid taking pointers to the
|
| + // local variables, which may harm code generation by disabling some optimizations
|
| + // in some compilers.
|
| + LChar* destinationForHandlePartialSequence = destination;
|
| + const uint8_t* sourceForHandlePartialSequence = source;
|
| + if (handlePartialSequence(destinationForHandlePartialSequence,
|
| + sourceForHandlePartialSequence, end, flush,
|
| + stopOnError, sawError)) {
|
| + source = sourceForHandlePartialSequence;
|
| + goto upConvertTo16Bit;
|
| + }
|
| + destination = destinationForHandlePartialSequence;
|
| + source = sourceForHandlePartialSequence;
|
| + if (m_partialSequenceSize)
|
| + break;
|
| + }
|
|
|
| - while (source < end) {
|
| - if (isASCII(*source)) {
|
| - // Fast path for ASCII. Most UTF-8 text will be ASCII.
|
| - if (isAlignedToMachineWord(source)) {
|
| - while (source < alignedEnd) {
|
| - MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
|
| - if (!isAllASCII<LChar>(chunk))
|
| - break;
|
| - copyASCIIMachineWord(destination, source);
|
| - source += sizeof(MachineWord);
|
| - destination += sizeof(MachineWord);
|
| - }
|
| - if (source == end)
|
| - break;
|
| - if (!isASCII(*source))
|
| - continue;
|
| - }
|
| - *destination++ = *source++;
|
| - continue;
|
| - }
|
| - int count = nonASCIISequenceLength(*source);
|
| - int character;
|
| - if (count == 0) {
|
| - character = nonCharacter;
|
| - } else {
|
| - if (count > end - source) {
|
| - ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
|
| - ASSERT(!m_partialSequenceSize);
|
| - m_partialSequenceSize = end - source;
|
| - memcpy(m_partialSequence, source, m_partialSequenceSize);
|
| - source = end;
|
| - break;
|
| - }
|
| - character = decodeNonASCIISequence(source, count);
|
| - }
|
| - if (character == nonCharacter) {
|
| - sawError = true;
|
| - if (stopOnError)
|
| - break;
|
| -
|
| - goto upConvertTo16Bit;
|
| - }
|
| - if (character > 0xff)
|
| - goto upConvertTo16Bit;
|
| -
|
| - source += count;
|
| - *destination++ = static_cast<LChar>(character);
|
| + while (source < end) {
|
| + if (isASCII(*source)) {
|
| + // Fast path for ASCII. Most UTF-8 text will be ASCII.
|
| + if (isAlignedToMachineWord(source)) {
|
| + while (source < alignedEnd) {
|
| + MachineWord chunk =
|
| + *reinterpret_cast_ptr<const MachineWord*>(source);
|
| + if (!isAllASCII<LChar>(chunk))
|
| + break;
|
| + copyASCIIMachineWord(destination, source);
|
| + source += sizeof(MachineWord);
|
| + destination += sizeof(MachineWord);
|
| + }
|
| + if (source == end)
|
| + break;
|
| + if (!isASCII(*source))
|
| + continue;
|
| }
|
| - } while (flush && m_partialSequenceSize);
|
| + *destination++ = *source++;
|
| + continue;
|
| + }
|
| + int count = nonASCIISequenceLength(*source);
|
| + int character;
|
| + if (count == 0) {
|
| + character = nonCharacter;
|
| + } else {
|
| + if (count > end - source) {
|
| + ASSERT_WITH_SECURITY_IMPLICATION(
|
| + end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
|
| + ASSERT(!m_partialSequenceSize);
|
| + m_partialSequenceSize = end - source;
|
| + memcpy(m_partialSequence, source, m_partialSequenceSize);
|
| + source = end;
|
| + break;
|
| + }
|
| + character = decodeNonASCIISequence(source, count);
|
| + }
|
| + if (character == nonCharacter) {
|
| + sawError = true;
|
| + if (stopOnError)
|
| + break;
|
| +
|
| + goto upConvertTo16Bit;
|
| + }
|
| + if (character > 0xff)
|
| + goto upConvertTo16Bit;
|
| +
|
| + source += count;
|
| + *destination++ = static_cast<LChar>(character);
|
| + }
|
| + } while (flush && m_partialSequenceSize);
|
|
|
| - buffer.shrink(destination - buffer.characters());
|
| + buffer.shrink(destination - buffer.characters());
|
|
|
| - return String::adopt(buffer);
|
| + return String::adopt(buffer);
|
|
|
| upConvertTo16Bit:
|
| - StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
|
| -
|
| - UChar* destination16 = buffer16.characters();
|
| -
|
| - // Copy the already converted characters
|
| - for (LChar* converted8 = buffer.characters(); converted8 < destination;)
|
| - *destination16++ = *converted8++;
|
| -
|
| - do {
|
| - if (m_partialSequenceSize) {
|
| - // Explicitly copy destination and source pointers to avoid taking pointers to the
|
| - // local variables, which may harm code generation by disabling some optimizations
|
| - // in some compilers.
|
| - UChar* destinationForHandlePartialSequence = destination16;
|
| - const uint8_t* sourceForHandlePartialSequence = source;
|
| - handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
|
| - destination16 = destinationForHandlePartialSequence;
|
| - source = sourceForHandlePartialSequence;
|
| - if (m_partialSequenceSize)
|
| - break;
|
| - }
|
| + StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
|
| +
|
| + UChar* destination16 = buffer16.characters();
|
| +
|
| + // Copy the already converted characters
|
| + for (LChar* converted8 = buffer.characters(); converted8 < destination;)
|
| + *destination16++ = *converted8++;
|
| +
|
| + do {
|
| + if (m_partialSequenceSize) {
|
| + // Explicitly copy destination and source pointers to avoid taking pointers to the
|
| + // local variables, which may harm code generation by disabling some optimizations
|
| + // in some compilers.
|
| + UChar* destinationForHandlePartialSequence = destination16;
|
| + const uint8_t* sourceForHandlePartialSequence = source;
|
| + handlePartialSequence(destinationForHandlePartialSequence,
|
| + sourceForHandlePartialSequence, end, flush,
|
| + stopOnError, sawError);
|
| + destination16 = destinationForHandlePartialSequence;
|
| + source = sourceForHandlePartialSequence;
|
| + if (m_partialSequenceSize)
|
| + break;
|
| + }
|
|
|
| - while (source < end) {
|
| - if (isASCII(*source)) {
|
| - // Fast path for ASCII. Most UTF-8 text will be ASCII.
|
| - if (isAlignedToMachineWord(source)) {
|
| - while (source < alignedEnd) {
|
| - MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
|
| - if (!isAllASCII<LChar>(chunk))
|
| - break;
|
| - copyASCIIMachineWord(destination16, source);
|
| - source += sizeof(MachineWord);
|
| - destination16 += sizeof(MachineWord);
|
| - }
|
| - if (source == end)
|
| - break;
|
| - if (!isASCII(*source))
|
| - continue;
|
| - }
|
| - *destination16++ = *source++;
|
| - continue;
|
| - }
|
| - int count = nonASCIISequenceLength(*source);
|
| - int character;
|
| - if (count == 0) {
|
| - character = nonCharacter;
|
| - } else {
|
| - if (count > end - source) {
|
| - ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
|
| - ASSERT(!m_partialSequenceSize);
|
| - m_partialSequenceSize = end - source;
|
| - memcpy(m_partialSequence, source, m_partialSequenceSize);
|
| - source = end;
|
| - break;
|
| - }
|
| - character = decodeNonASCIISequence(source, count);
|
| - }
|
| - if (character == nonCharacter) {
|
| - sawError = true;
|
| - if (stopOnError)
|
| - break;
|
| - // Each error generates a replacement character and consumes one byte.
|
| - *destination16++ = replacementCharacter;
|
| - ++source;
|
| - continue;
|
| - }
|
| - source += count;
|
| - destination16 = appendCharacter(destination16, character);
|
| + while (source < end) {
|
| + if (isASCII(*source)) {
|
| + // Fast path for ASCII. Most UTF-8 text will be ASCII.
|
| + if (isAlignedToMachineWord(source)) {
|
| + while (source < alignedEnd) {
|
| + MachineWord chunk =
|
| + *reinterpret_cast_ptr<const MachineWord*>(source);
|
| + if (!isAllASCII<LChar>(chunk))
|
| + break;
|
| + copyASCIIMachineWord(destination16, source);
|
| + source += sizeof(MachineWord);
|
| + destination16 += sizeof(MachineWord);
|
| + }
|
| + if (source == end)
|
| + break;
|
| + if (!isASCII(*source))
|
| + continue;
|
| + }
|
| + *destination16++ = *source++;
|
| + continue;
|
| + }
|
| + int count = nonASCIISequenceLength(*source);
|
| + int character;
|
| + if (count == 0) {
|
| + character = nonCharacter;
|
| + } else {
|
| + if (count > end - source) {
|
| + ASSERT_WITH_SECURITY_IMPLICATION(
|
| + end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
|
| + ASSERT(!m_partialSequenceSize);
|
| + m_partialSequenceSize = end - source;
|
| + memcpy(m_partialSequence, source, m_partialSequenceSize);
|
| + source = end;
|
| + break;
|
| }
|
| - } while (flush && m_partialSequenceSize);
|
| + character = decodeNonASCIISequence(source, count);
|
| + }
|
| + if (character == nonCharacter) {
|
| + sawError = true;
|
| + if (stopOnError)
|
| + break;
|
| + // Each error generates a replacement character and consumes one byte.
|
| + *destination16++ = replacementCharacter;
|
| + ++source;
|
| + continue;
|
| + }
|
| + source += count;
|
| + destination16 = appendCharacter(destination16, character);
|
| + }
|
| + } while (flush && m_partialSequenceSize);
|
|
|
| - buffer16.shrink(destination16 - buffer16.characters());
|
| + buffer16.shrink(destination16 - buffer16.characters());
|
|
|
| - return String::adopt(buffer16);
|
| + return String::adopt(buffer16);
|
| }
|
|
|
| -template<typename CharType>
|
| -CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
|
| -{
|
| - // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
|
| - // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
|
| - // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
|
| - if (length > std::numeric_limits<size_t>::max() / 3)
|
| - CRASH();
|
| - Vector<uint8_t> bytes(length * 3);
|
| -
|
| - size_t i = 0;
|
| - size_t bytesWritten = 0;
|
| - while (i < length) {
|
| - UChar32 character;
|
| - U16_NEXT(characters, i, length, character);
|
| - // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
|
| - // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
|
| - if (0xD800 <= character && character <= 0xDFFF)
|
| - character = replacementCharacter;
|
| - U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
|
| - }
|
| -
|
| - return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
|
| +template <typename CharType>
|
| +CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) {
|
| + // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
|
| + // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
|
| + // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
|
| + if (length > std::numeric_limits<size_t>::max() / 3)
|
| + CRASH();
|
| + Vector<uint8_t> bytes(length * 3);
|
| +
|
| + size_t i = 0;
|
| + size_t bytesWritten = 0;
|
| + while (i < length) {
|
| + UChar32 character;
|
| + U16_NEXT(characters, i, length, character);
|
| + // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
|
| + // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
|
| + if (0xD800 <= character && character <= 0xDFFF)
|
| + character = replacementCharacter;
|
| + U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
|
| + }
|
| +
|
| + return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
|
| }
|
|
|
| -CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
|
| -{
|
| - return encodeCommon(characters, length);
|
| +CString TextCodecUTF8::encode(const UChar* characters,
|
| + size_t length,
|
| + UnencodableHandling) {
|
| + return encodeCommon(characters, length);
|
| }
|
|
|
| -CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)
|
| -{
|
| - return encodeCommon(characters, length);
|
| +CString TextCodecUTF8::encode(const LChar* characters,
|
| + size_t length,
|
| + UnencodableHandling) {
|
| + return encodeCommon(characters, length);
|
| }
|
|
|
| -} // namespace WTF
|
| +} // namespace WTF
|
|
|