third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp - Issue 1611343002: wtf reformat test

Unified Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

Issue 1611343002: wtf reformat test | Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: pydent | Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecUTF8.h ('k') | third_party/WebKit/Source/wtf/text/TextCodecUTF8Test.cpp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

diff --git a/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp b/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

index bc4f0ed74e35c936f915023482b3b7d98ab0103a..ea75fd7d1608f6b0f1d8ec23a588c6545473142d 100644

--- a/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

+++ b/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

@@ -36,425 +36,441 @@ using namespace WTF::Unicode;

const int nonCharacter = -1;

-PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)

- return adoptPtr(new TextCodecUTF8);

+PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) {

+ return adoptPtr(new TextCodecUTF8);

}

-void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)

- registrar("UTF-8", "UTF-8");

- // Additional aliases that originally were present in the encoding

- // table in WebKit on Macintosh, and subsequently added by

- // TextCodecICU. Perhaps we can prove some are not used on the web

- // and remove them.

- registrar("unicode11utf8", "UTF-8");

- registrar("unicode20utf8", "UTF-8");

- registrar("utf8", "UTF-8");

- registrar("x-unicode20utf8", "UTF-8");

- // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)

- // and Firefox (24), but not in ICU 4.6.

- registrar("unicode-1-1-utf-8", "UTF-8");

+void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {

+ registrar("UTF-8", "UTF-8");

+ // Additional aliases that originally were present in the encoding

+ // table in WebKit on Macintosh, and subsequently added by

+ // TextCodecICU. Perhaps we can prove some are not used on the web

+ // and remove them.

+ registrar("unicode11utf8", "UTF-8");

+ registrar("unicode20utf8", "UTF-8");

+ registrar("utf8", "UTF-8");

+ registrar("x-unicode20utf8", "UTF-8");

+ // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)

+ // and Firefox (24), but not in ICU 4.6.

+ registrar("unicode-1-1-utf-8", "UTF-8");

}

-void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)

- registrar("UTF-8", create, 0);

+void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) {

+ registrar("UTF-8", create, 0);

}

-static inline int nonASCIISequenceLength(uint8_t firstByte)

- static const uint8_t lengths[256] = {

- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

- 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

- };

- return lengths[firstByte];

+static inline int nonASCIISequenceLength(uint8_t firstByte) {

+ static const uint8_t lengths[256] = {

+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

+ 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

+ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

+ return lengths[firstByte];

}

-static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)

- ASSERT(!isASCII(sequence[0]));

- if (length == 2) {

- ASSERT(sequence[0] <= 0xDF);

- if (sequence[0] < 0xC2)

- return nonCharacter;

- if (sequence[1] < 0x80 || sequence[1] > 0xBF)

- return nonCharacter;

- return ((sequence[0] << 6) + sequence[1]) - 0x00003080;

- }

- if (length == 3) {

- ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);

- switch (sequence[0]) {

- case 0xE0:

- if (sequence[1] < 0xA0 || sequence[1] > 0xBF)

- return nonCharacter;

- break;

- case 0xED:

- if (sequence[1] < 0x80 || sequence[1] > 0x9F)

- return nonCharacter;

- break;

- default:

- if (sequence[1] < 0x80 || sequence[1] > 0xBF)

- return nonCharacter;

- }

- if (sequence[2] < 0x80 || sequence[2] > 0xBF)

- return nonCharacter;

- return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;

- }

- ASSERT(length == 4);

- ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);

+static inline int decodeNonASCIISequence(const uint8_t* sequence,

+ unsigned length) {

+ ASSERT(!isASCII(sequence[0]));

+ if (length == 2) {

+ ASSERT(sequence[0] <= 0xDF);

+ if (sequence[0] < 0xC2)

+ return nonCharacter;

+ if (sequence[1] < 0x80 || sequence[1] > 0xBF)

+ return nonCharacter;

+ return ((sequence[0] << 6) + sequence[1]) - 0x00003080;

+ }

+ if (length == 3) {

+ ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);

switch (sequence[0]) {

- case 0xF0:

- if (sequence[1] < 0x90 || sequence[1] > 0xBF)

- return nonCharacter;

+ case 0xE0:

+ if (sequence[1] < 0xA0 || sequence[1] > 0xBF)

+ return nonCharacter;

break;

- case 0xF4:

- if (sequence[1] < 0x80 || sequence[1] > 0x8F)

- return nonCharacter;

+ case 0xED:

+ if (sequence[1] < 0x80 || sequence[1] > 0x9F)

+ return nonCharacter;

break;

- default:

+ default:

if (sequence[1] < 0x80 || sequence[1] > 0xBF)

- return nonCharacter;

+ return nonCharacter;

}

if (sequence[2] < 0x80 || sequence[2] > 0xBF)

+ return nonCharacter;

+ return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -

+ 0x000E2080;

+ }

+ ASSERT(length == 4);

+ ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);

+ switch (sequence[0]) {

+ case 0xF0:

+ if (sequence[1] < 0x90 || sequence[1] > 0xBF)

+ return nonCharacter;

+ break;

+ case 0xF4:

+ if (sequence[1] < 0x80 || sequence[1] > 0x8F)

return nonCharacter;

- if (sequence[3] < 0x80 || sequence[3] > 0xBF)

+ break;

+ default:

+ if (sequence[1] < 0x80 || sequence[1] > 0xBF)

return nonCharacter;

- return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;

+ }

+ if (sequence[2] < 0x80 || sequence[2] > 0xBF)

+ return nonCharacter;

+ if (sequence[3] < 0x80 || sequence[3] > 0xBF)

+ return nonCharacter;

+ return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +

+ sequence[3]) -

+ 0x03C82080;

}

-static inline UChar* appendCharacter(UChar* destination, int character)

- ASSERT(character != nonCharacter);

- ASSERT(!U_IS_SURROGATE(character));

- if (U_IS_BMP(character)) {

- *destination++ = static_cast<UChar>(character);

- } else {

- *destination++ = U16_LEAD(character);

- *destination++ = U16_TRAIL(character);

- }

- return destination;

+static inline UChar* appendCharacter(UChar* destination, int character) {

+ ASSERT(character != nonCharacter);

+ ASSERT(!U_IS_SURROGATE(character));

+ if (U_IS_BMP(character)) {

+ *destination++ = static_cast<UChar>(character);

+ } else {

+ *destination++ = U16_LEAD(character);

+ *destination++ = U16_TRAIL(character);

+ }

+ return destination;

}

-void TextCodecUTF8::consumePartialSequenceByte()

- --m_partialSequenceSize;

- memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);

+void TextCodecUTF8::consumePartialSequenceByte() {

+ --m_partialSequenceSize;

+ memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);

}

-void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)

- sawError = true;

- if (stopOnError)

- return;

- // Each error generates a replacement character and consumes one byte.

- *destination++ = replacementCharacter;

- consumePartialSequenceByte();

+void TextCodecUTF8::handleError(UChar*& destination,

+ bool stopOnError,

+ bool& sawError) {

+ sawError = true;

+ if (stopOnError)

+ return;

+ // Each error generates a replacement character and consumes one byte.

+ *destination++ = replacementCharacter;

+ consumePartialSequenceByte();

}

template <>

-bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)

- ASSERT(m_partialSequenceSize);

- do {

- if (isASCII(m_partialSequence[0])) {

- *destination++ = m_partialSequence[0];

- consumePartialSequenceByte();

- continue;

- }

- int count = nonASCIISequenceLength(m_partialSequence[0]);

- if (!count)

- return true;

- if (count > m_partialSequenceSize) {

- if (count - m_partialSequenceSize > end - source) {

- if (!flush) {

- // The new data is not enough to complete the sequence, so

- // add it to the existing partial sequence.

- memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);

- m_partialSequenceSize += end - source;

- return false;

- }

- // An incomplete partial sequence at the end is an error, but it will create

- // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle

- // the error.

- return true;

- }

- memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);

- source += count - m_partialSequenceSize;

- m_partialSequenceSize = count;

+bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination,

+ const uint8_t*& source,

+ const uint8_t* end,

+ bool flush,

+ bool,

+ bool&) {

+ ASSERT(m_partialSequenceSize);

+ do {

+ if (isASCII(m_partialSequence[0])) {

+ *destination++ = m_partialSequence[0];

+ consumePartialSequenceByte();

+ continue;

+ }

+ int count = nonASCIISequenceLength(m_partialSequence[0]);

+ if (!count)

+ return true;

+ if (count > m_partialSequenceSize) {

+ if (count - m_partialSequenceSize > end - source) {

+ if (!flush) {

+ // The new data is not enough to complete the sequence, so

+ // add it to the existing partial sequence.

+ memcpy(m_partialSequence + m_partialSequenceSize, source,

+ end - source);

+ m_partialSequenceSize += end - source;

+ return false;

}

- int character = decodeNonASCIISequence(m_partialSequence, count);

- if (character & ~0xff)

- return true;

+ // An incomplete partial sequence at the end is an error, but it will create

+ // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle

+ // the error.

+ return true;

+ }

+ memcpy(m_partialSequence + m_partialSequenceSize, source,

+ count - m_partialSequenceSize);

+ source += count - m_partialSequenceSize;

+ m_partialSequenceSize = count;

+ }

+ int character = decodeNonASCIISequence(m_partialSequence, count);

+ if (character & ~0xff)

+ return true;

- m_partialSequenceSize -= count;

- *destination++ = static_cast<LChar>(character);

- } while (m_partialSequenceSize);

+ m_partialSequenceSize -= count;

+ *destination++ = static_cast<LChar>(character);

+ } while (m_partialSequenceSize);

- return false;

+ return false;

}

template <>

-bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)

- ASSERT(m_partialSequenceSize);

- do {

- if (isASCII(m_partialSequence[0])) {

- *destination++ = m_partialSequence[0];

- consumePartialSequenceByte();

- continue;

- }

- int count = nonASCIISequenceLength(m_partialSequence[0]);

- if (!count) {

- handleError(destination, stopOnError, sawError);

- if (stopOnError)

- return false;

- continue;

- }

- if (count > m_partialSequenceSize) {

- if (count - m_partialSequenceSize > end - source) {

- if (!flush) {

- // The new data is not enough to complete the sequence, so

- // add it to the existing partial sequence.

- memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);

- m_partialSequenceSize += end - source;

- return false;

- }

- // An incomplete partial sequence at the end is an error.

- handleError(destination, stopOnError, sawError);

- if (stopOnError)

- return false;

- continue;

- }

- memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);

- source += count - m_partialSequenceSize;

- m_partialSequenceSize = count;

- }

- int character = decodeNonASCIISequence(m_partialSequence, count);

- if (character == nonCharacter) {

- handleError(destination, stopOnError, sawError);

- if (stopOnError)

- return false;

- continue;

+bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination,

+ const uint8_t*& source,

+ const uint8_t* end,

+ bool flush,

+ bool stopOnError,

+ bool& sawError) {

+ ASSERT(m_partialSequenceSize);

+ do {

+ if (isASCII(m_partialSequence[0])) {

+ *destination++ = m_partialSequence[0];

+ consumePartialSequenceByte();

+ continue;

+ }

+ int count = nonASCIISequenceLength(m_partialSequence[0]);

+ if (!count) {

+ handleError(destination, stopOnError, sawError);

+ if (stopOnError)

+ return false;

+ continue;

+ }

+ if (count > m_partialSequenceSize) {

+ if (count - m_partialSequenceSize > end - source) {

+ if (!flush) {

+ // The new data is not enough to complete the sequence, so

+ // add it to the existing partial sequence.

+ memcpy(m_partialSequence + m_partialSequenceSize, source,

+ end - source);

+ m_partialSequenceSize += end - source;

+ return false;

}

+ // An incomplete partial sequence at the end is an error.

+ handleError(destination, stopOnError, sawError);

+ if (stopOnError)

+ return false;

+ continue;

+ }

+ memcpy(m_partialSequence + m_partialSequenceSize, source,

+ count - m_partialSequenceSize);

+ source += count - m_partialSequenceSize;

+ m_partialSequenceSize = count;

+ }

+ int character = decodeNonASCIISequence(m_partialSequence, count);

+ if (character == nonCharacter) {

+ handleError(destination, stopOnError, sawError);

+ if (stopOnError)

+ return false;

+ continue;

+ }

- m_partialSequenceSize -= count;

- destination = appendCharacter(destination, character);

- } while (m_partialSequenceSize);

+ m_partialSequenceSize -= count;

+ destination = appendCharacter(destination, character);

+ } while (m_partialSequenceSize);

- return false;

+ return false;

}

-String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError)

- // Each input byte might turn into a character.

- // That includes all bytes in the partial-sequence buffer because

- // each byte in an invalid sequence will turn into a replacement character.

- StringBuffer<LChar> buffer(m_partialSequenceSize + length);

- const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);

- const uint8_t* end = source + length;

- const uint8_t* alignedEnd = alignToMachineWord(end);

- LChar* destination = buffer.characters();

- do {

- if (m_partialSequenceSize) {

- // Explicitly copy destination and source pointers to avoid taking pointers to the

- // local variables, which may harm code generation by disabling some optimizations

- // in some compilers.

- LChar* destinationForHandlePartialSequence = destination;

- const uint8_t* sourceForHandlePartialSequence = source;

- if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {

- source = sourceForHandlePartialSequence;

- goto upConvertTo16Bit;

- }

- destination = destinationForHandlePartialSequence;

- source = sourceForHandlePartialSequence;

- if (m_partialSequenceSize)

- break;

- }

+String TextCodecUTF8::decode(const char* bytes,

+ size_t length,

+ FlushBehavior flush,

+ bool stopOnError,

+ bool& sawError) {

+ // Each input byte might turn into a character.

+ // That includes all bytes in the partial-sequence buffer because

+ // each byte in an invalid sequence will turn into a replacement character.

+ StringBuffer<LChar> buffer(m_partialSequenceSize + length);

+ const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);

+ const uint8_t* end = source + length;

+ const uint8_t* alignedEnd = alignToMachineWord(end);

+ LChar* destination = buffer.characters();

+ do {

+ if (m_partialSequenceSize) {

+ // Explicitly copy destination and source pointers to avoid taking pointers to the

+ // local variables, which may harm code generation by disabling some optimizations

+ // in some compilers.

+ LChar* destinationForHandlePartialSequence = destination;

+ const uint8_t* sourceForHandlePartialSequence = source;

+ if (handlePartialSequence(destinationForHandlePartialSequence,

+ sourceForHandlePartialSequence, end, flush,

+ stopOnError, sawError)) {

+ source = sourceForHandlePartialSequence;

+ goto upConvertTo16Bit;

+ }

+ destination = destinationForHandlePartialSequence;

+ source = sourceForHandlePartialSequence;

+ if (m_partialSequenceSize)

+ break;

+ }

- while (source < end) {

- if (isASCII(*source)) {

- // Fast path for ASCII. Most UTF-8 text will be ASCII.

- if (isAlignedToMachineWord(source)) {

- while (source < alignedEnd) {

- MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

- if (!isAllASCII<LChar>(chunk))

- break;

- copyASCIIMachineWord(destination, source);

- source += sizeof(MachineWord);

- destination += sizeof(MachineWord);

- }

- if (source == end)

- break;

- if (!isASCII(*source))

- continue;

- }

- *destination++ = *source++;

- continue;

- }

- int count = nonASCIISequenceLength(*source);

- int character;

- if (count == 0) {

- character = nonCharacter;

- } else {

- if (count > end - source) {

- ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

- ASSERT(!m_partialSequenceSize);

- m_partialSequenceSize = end - source;

- memcpy(m_partialSequence, source, m_partialSequenceSize);

- source = end;

- break;

- }

- character = decodeNonASCIISequence(source, count);

- }

- if (character == nonCharacter) {

- sawError = true;

- if (stopOnError)

- break;

- goto upConvertTo16Bit;

- }

- if (character > 0xff)

- goto upConvertTo16Bit;

- source += count;

- *destination++ = static_cast<LChar>(character);

+ while (source < end) {

+ if (isASCII(*source)) {

+ // Fast path for ASCII. Most UTF-8 text will be ASCII.

+ if (isAlignedToMachineWord(source)) {

+ while (source < alignedEnd) {

+ MachineWord chunk =

+ *reinterpret_cast_ptr<const MachineWord*>(source);

+ if (!isAllASCII<LChar>(chunk))

+ break;

+ copyASCIIMachineWord(destination, source);

+ source += sizeof(MachineWord);

+ destination += sizeof(MachineWord);

+ }

+ if (source == end)

+ break;

+ if (!isASCII(*source))

+ continue;

}

- } while (flush && m_partialSequenceSize);

+ *destination++ = *source++;

+ continue;

+ }

+ int count = nonASCIISequenceLength(*source);

+ int character;

+ if (count == 0) {

+ character = nonCharacter;

+ } else {

+ if (count > end - source) {

+ ASSERT_WITH_SECURITY_IMPLICATION(

+ end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

+ ASSERT(!m_partialSequenceSize);

+ m_partialSequenceSize = end - source;

+ memcpy(m_partialSequence, source, m_partialSequenceSize);

+ source = end;

+ break;

+ }

+ character = decodeNonASCIISequence(source, count);

+ }

+ if (character == nonCharacter) {

+ sawError = true;

+ if (stopOnError)

+ break;

+ goto upConvertTo16Bit;

+ }

+ if (character > 0xff)

+ goto upConvertTo16Bit;

+ source += count;

+ *destination++ = static_cast<LChar>(character);

+ }

+ } while (flush && m_partialSequenceSize);

- buffer.shrink(destination - buffer.characters());

+ buffer.shrink(destination - buffer.characters());

- return String::adopt(buffer);

+ return String::adopt(buffer);

upConvertTo16Bit:

- StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

- UChar* destination16 = buffer16.characters();

- // Copy the already converted characters

- for (LChar* converted8 = buffer.characters(); converted8 < destination;)

- *destination16++ = *converted8++;

- do {

- if (m_partialSequenceSize) {

- // Explicitly copy destination and source pointers to avoid taking pointers to the

- // local variables, which may harm code generation by disabling some optimizations

- // in some compilers.

- UChar* destinationForHandlePartialSequence = destination16;

- const uint8_t* sourceForHandlePartialSequence = source;

- handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);

- destination16 = destinationForHandlePartialSequence;

- source = sourceForHandlePartialSequence;

- if (m_partialSequenceSize)

- break;

- }

+ StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

+ UChar* destination16 = buffer16.characters();

+ // Copy the already converted characters

+ for (LChar* converted8 = buffer.characters(); converted8 < destination;)

+ *destination16++ = *converted8++;

+ do {

+ if (m_partialSequenceSize) {

+ // Explicitly copy destination and source pointers to avoid taking pointers to the

+ // local variables, which may harm code generation by disabling some optimizations

+ // in some compilers.

+ UChar* destinationForHandlePartialSequence = destination16;

+ const uint8_t* sourceForHandlePartialSequence = source;

+ handlePartialSequence(destinationForHandlePartialSequence,

+ sourceForHandlePartialSequence, end, flush,

+ stopOnError, sawError);

+ destination16 = destinationForHandlePartialSequence;

+ source = sourceForHandlePartialSequence;

+ if (m_partialSequenceSize)

+ break;

+ }

- while (source < end) {

- if (isASCII(*source)) {

- // Fast path for ASCII. Most UTF-8 text will be ASCII.

- if (isAlignedToMachineWord(source)) {

- while (source < alignedEnd) {

- MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

- if (!isAllASCII<LChar>(chunk))

- break;

- copyASCIIMachineWord(destination16, source);

- source += sizeof(MachineWord);

- destination16 += sizeof(MachineWord);

- }

- if (source == end)

- break;

- if (!isASCII(*source))

- continue;

- }

- *destination16++ = *source++;

- continue;

- }

- int count = nonASCIISequenceLength(*source);

- int character;

- if (count == 0) {

- character = nonCharacter;

- } else {

- if (count > end - source) {

- ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

- ASSERT(!m_partialSequenceSize);

- m_partialSequenceSize = end - source;

- memcpy(m_partialSequence, source, m_partialSequenceSize);

- source = end;

- break;

- }

- character = decodeNonASCIISequence(source, count);

- }

- if (character == nonCharacter) {

- sawError = true;

- if (stopOnError)

- break;

- // Each error generates a replacement character and consumes one byte.

- *destination16++ = replacementCharacter;

- ++source;

- continue;

- }

- source += count;

- destination16 = appendCharacter(destination16, character);

+ while (source < end) {

+ if (isASCII(*source)) {

+ // Fast path for ASCII. Most UTF-8 text will be ASCII.

+ if (isAlignedToMachineWord(source)) {

+ while (source < alignedEnd) {

+ MachineWord chunk =

+ *reinterpret_cast_ptr<const MachineWord*>(source);

+ if (!isAllASCII<LChar>(chunk))

+ break;

+ copyASCIIMachineWord(destination16, source);

+ source += sizeof(MachineWord);

+ destination16 += sizeof(MachineWord);

+ }

+ if (source == end)

+ break;

+ if (!isASCII(*source))

+ continue;

+ }

+ *destination16++ = *source++;

+ continue;

+ }

+ int count = nonASCIISequenceLength(*source);

+ int character;

+ if (count == 0) {

+ character = nonCharacter;

+ } else {

+ if (count > end - source) {

+ ASSERT_WITH_SECURITY_IMPLICATION(

+ end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));

+ ASSERT(!m_partialSequenceSize);

+ m_partialSequenceSize = end - source;

+ memcpy(m_partialSequence, source, m_partialSequenceSize);

+ source = end;

+ break;

}

- } while (flush && m_partialSequenceSize);

+ character = decodeNonASCIISequence(source, count);

+ }

+ if (character == nonCharacter) {

+ sawError = true;

+ if (stopOnError)

+ break;

+ // Each error generates a replacement character and consumes one byte.

+ *destination16++ = replacementCharacter;

+ ++source;

+ continue;

+ }

+ source += count;

+ destination16 = appendCharacter(destination16, character);

+ }

+ } while (flush && m_partialSequenceSize);

- buffer16.shrink(destination16 - buffer16.characters());

+ buffer16.shrink(destination16 - buffer16.characters());

- return String::adopt(buffer16);

+ return String::adopt(buffer16);

}

-template<typename CharType>

-CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)

- // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.

- // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).

- // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).

- if (length > std::numeric_limits<size_t>::max() / 3)

- CRASH();

- Vector<uint8_t> bytes(length * 3);

- size_t i = 0;

- size_t bytesWritten = 0;

- while (i < length) {

- UChar32 character;

- U16_NEXT(characters, i, length, character);

- // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate

- // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.

- if (0xD800 <= character && character <= 0xDFFF)

- character = replacementCharacter;

- U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);

- }

- return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);

+template <typename CharType>

+CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) {

+ // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.

+ // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).

+ // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).

+ if (length > std::numeric_limits<size_t>::max() / 3)

+ CRASH();

+ Vector<uint8_t> bytes(length * 3);

+ size_t i = 0;

+ size_t bytesWritten = 0;

+ while (i < length) {

+ UChar32 character;

+ U16_NEXT(characters, i, length, character);

+ // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate

+ // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.

+ if (0xD800 <= character && character <= 0xDFFF)

+ character = replacementCharacter;

+ U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);

+ }

+ return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);

}

-CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)

- return encodeCommon(characters, length);

+CString TextCodecUTF8::encode(const UChar* characters,

+ size_t length,

+ UnencodableHandling) {

+ return encodeCommon(characters, length);

}

-CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)

- return encodeCommon(characters, length);

+CString TextCodecUTF8::encode(const LChar* characters,

+ size_t length,

+ UnencodableHandling) {

+ return encodeCommon(characters, length);

}

-} // namespace WTF

+} // namespace WTF

« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecUTF8.h ('k') | third_party/WebKit/Source/wtf/text/TextCodecUTF8Test.cpp » ('j') | no next file with comments »