Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1464)

Unified Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

Issue 1611343002: wtf reformat test Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: pydent Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp
diff --git a/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp b/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp
index bc4f0ed74e35c936f915023482b3b7d98ab0103a..ea75fd7d1608f6b0f1d8ec23a588c6545473142d 100644
--- a/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp
+++ b/third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp
@@ -36,425 +36,441 @@ using namespace WTF::Unicode;
const int nonCharacter = -1;
-PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
-{
- return adoptPtr(new TextCodecUTF8);
+PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) {
+ return adoptPtr(new TextCodecUTF8);
}
-void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
-{
- registrar("UTF-8", "UTF-8");
-
- // Additional aliases that originally were present in the encoding
- // table in WebKit on Macintosh, and subsequently added by
- // TextCodecICU. Perhaps we can prove some are not used on the web
- // and remove them.
- registrar("unicode11utf8", "UTF-8");
- registrar("unicode20utf8", "UTF-8");
- registrar("utf8", "UTF-8");
- registrar("x-unicode20utf8", "UTF-8");
-
- // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
- // and Firefox (24), but not in ICU 4.6.
- registrar("unicode-1-1-utf-8", "UTF-8");
+void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {
+ registrar("UTF-8", "UTF-8");
+
+ // Additional aliases that originally were present in the encoding
+ // table in WebKit on Macintosh, and subsequently added by
+ // TextCodecICU. Perhaps we can prove some are not used on the web
+ // and remove them.
+ registrar("unicode11utf8", "UTF-8");
+ registrar("unicode20utf8", "UTF-8");
+ registrar("utf8", "UTF-8");
+ registrar("x-unicode20utf8", "UTF-8");
+
+ // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
+ // and Firefox (24), but not in ICU 4.6.
+ registrar("unicode-1-1-utf-8", "UTF-8");
}
-void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
-{
- registrar("UTF-8", create, 0);
+void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) {
+ registrar("UTF-8", create, 0);
}
-static inline int nonASCIISequenceLength(uint8_t firstByte)
-{
- static const uint8_t lengths[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- };
- return lengths[firstByte];
+static inline int nonASCIISequenceLength(uint8_t firstByte) {
+ static const uint8_t lengths[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ return lengths[firstByte];
}
-static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
-{
- ASSERT(!isASCII(sequence[0]));
- if (length == 2) {
- ASSERT(sequence[0] <= 0xDF);
- if (sequence[0] < 0xC2)
- return nonCharacter;
- if (sequence[1] < 0x80 || sequence[1] > 0xBF)
- return nonCharacter;
- return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
- }
- if (length == 3) {
- ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
- switch (sequence[0]) {
- case 0xE0:
- if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
- return nonCharacter;
- break;
- case 0xED:
- if (sequence[1] < 0x80 || sequence[1] > 0x9F)
- return nonCharacter;
- break;
- default:
- if (sequence[1] < 0x80 || sequence[1] > 0xBF)
- return nonCharacter;
- }
- if (sequence[2] < 0x80 || sequence[2] > 0xBF)
- return nonCharacter;
- return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
- }
- ASSERT(length == 4);
- ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
+static inline int decodeNonASCIISequence(const uint8_t* sequence,
+ unsigned length) {
+ ASSERT(!isASCII(sequence[0]));
+ if (length == 2) {
+ ASSERT(sequence[0] <= 0xDF);
+ if (sequence[0] < 0xC2)
+ return nonCharacter;
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+ return nonCharacter;
+ return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
+ }
+ if (length == 3) {
+ ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
switch (sequence[0]) {
- case 0xF0:
- if (sequence[1] < 0x90 || sequence[1] > 0xBF)
- return nonCharacter;
+ case 0xE0:
+ if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
+ return nonCharacter;
break;
- case 0xF4:
- if (sequence[1] < 0x80 || sequence[1] > 0x8F)
- return nonCharacter;
+ case 0xED:
+ if (sequence[1] < 0x80 || sequence[1] > 0x9F)
+ return nonCharacter;
break;
- default:
+ default:
if (sequence[1] < 0x80 || sequence[1] > 0xBF)
- return nonCharacter;
+ return nonCharacter;
}
if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+ return nonCharacter;
+ return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -
+ 0x000E2080;
+ }
+ ASSERT(length == 4);
+ ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
+ switch (sequence[0]) {
+ case 0xF0:
+ if (sequence[1] < 0x90 || sequence[1] > 0xBF)
+ return nonCharacter;
+ break;
+ case 0xF4:
+ if (sequence[1] < 0x80 || sequence[1] > 0x8F)
return nonCharacter;
- if (sequence[3] < 0x80 || sequence[3] > 0xBF)
+ break;
+ default:
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF)
return nonCharacter;
- return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
+ }
+ if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+ return nonCharacter;
+ if (sequence[3] < 0x80 || sequence[3] > 0xBF)
+ return nonCharacter;
+ return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +
+ sequence[3]) -
+ 0x03C82080;
}
-static inline UChar* appendCharacter(UChar* destination, int character)
-{
- ASSERT(character != nonCharacter);
- ASSERT(!U_IS_SURROGATE(character));
- if (U_IS_BMP(character)) {
- *destination++ = static_cast<UChar>(character);
- } else {
- *destination++ = U16_LEAD(character);
- *destination++ = U16_TRAIL(character);
- }
- return destination;
+static inline UChar* appendCharacter(UChar* destination, int character) {
+ ASSERT(character != nonCharacter);
+ ASSERT(!U_IS_SURROGATE(character));
+ if (U_IS_BMP(character)) {
+ *destination++ = static_cast<UChar>(character);
+ } else {
+ *destination++ = U16_LEAD(character);
+ *destination++ = U16_TRAIL(character);
+ }
+ return destination;
}
-void TextCodecUTF8::consumePartialSequenceByte()
-{
- --m_partialSequenceSize;
- memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
+void TextCodecUTF8::consumePartialSequenceByte() {
+ --m_partialSequenceSize;
+ memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
}
-void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
-{
- sawError = true;
- if (stopOnError)
- return;
- // Each error generates a replacement character and consumes one byte.
- *destination++ = replacementCharacter;
- consumePartialSequenceByte();
+void TextCodecUTF8::handleError(UChar*& destination,
+ bool stopOnError,
+ bool& sawError) {
+ sawError = true;
+ if (stopOnError)
+ return;
+ // Each error generates a replacement character and consumes one byte.
+ *destination++ = replacementCharacter;
+ consumePartialSequenceByte();
}
template <>
-bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
-{
- ASSERT(m_partialSequenceSize);
- do {
- if (isASCII(m_partialSequence[0])) {
- *destination++ = m_partialSequence[0];
- consumePartialSequenceByte();
- continue;
- }
- int count = nonASCIISequenceLength(m_partialSequence[0]);
- if (!count)
- return true;
-
- if (count > m_partialSequenceSize) {
- if (count - m_partialSequenceSize > end - source) {
- if (!flush) {
- // The new data is not enough to complete the sequence, so
- // add it to the existing partial sequence.
- memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
- m_partialSequenceSize += end - source;
- return false;
- }
- // An incomplete partial sequence at the end is an error, but it will create
- // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
- // the error.
- return true;
- }
- memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
- source += count - m_partialSequenceSize;
- m_partialSequenceSize = count;
+bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination,
+ const uint8_t*& source,
+ const uint8_t* end,
+ bool flush,
+ bool,
+ bool&) {
+ ASSERT(m_partialSequenceSize);
+ do {
+ if (isASCII(m_partialSequence[0])) {
+ *destination++ = m_partialSequence[0];
+ consumePartialSequenceByte();
+ continue;
+ }
+ int count = nonASCIISequenceLength(m_partialSequence[0]);
+ if (!count)
+ return true;
+
+ if (count > m_partialSequenceSize) {
+ if (count - m_partialSequenceSize > end - source) {
+ if (!flush) {
+ // The new data is not enough to complete the sequence, so
+ // add it to the existing partial sequence.
+ memcpy(m_partialSequence + m_partialSequenceSize, source,
+ end - source);
+ m_partialSequenceSize += end - source;
+ return false;
}
- int character = decodeNonASCIISequence(m_partialSequence, count);
- if (character & ~0xff)
- return true;
+ // An incomplete partial sequence at the end is an error, but it will create
+ // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
+ // the error.
+ return true;
+ }
+ memcpy(m_partialSequence + m_partialSequenceSize, source,
+ count - m_partialSequenceSize);
+ source += count - m_partialSequenceSize;
+ m_partialSequenceSize = count;
+ }
+ int character = decodeNonASCIISequence(m_partialSequence, count);
+ if (character & ~0xff)
+ return true;
- m_partialSequenceSize -= count;
- *destination++ = static_cast<LChar>(character);
- } while (m_partialSequenceSize);
+ m_partialSequenceSize -= count;
+ *destination++ = static_cast<LChar>(character);
+ } while (m_partialSequenceSize);
- return false;
+ return false;
}
template <>
-bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
-{
- ASSERT(m_partialSequenceSize);
- do {
- if (isASCII(m_partialSequence[0])) {
- *destination++ = m_partialSequence[0];
- consumePartialSequenceByte();
- continue;
- }
- int count = nonASCIISequenceLength(m_partialSequence[0]);
- if (!count) {
- handleError(destination, stopOnError, sawError);
- if (stopOnError)
- return false;
- continue;
- }
- if (count > m_partialSequenceSize) {
- if (count - m_partialSequenceSize > end - source) {
- if (!flush) {
- // The new data is not enough to complete the sequence, so
- // add it to the existing partial sequence.
- memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
- m_partialSequenceSize += end - source;
- return false;
- }
- // An incomplete partial sequence at the end is an error.
- handleError(destination, stopOnError, sawError);
- if (stopOnError)
- return false;
- continue;
- }
- memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
- source += count - m_partialSequenceSize;
- m_partialSequenceSize = count;
- }
- int character = decodeNonASCIISequence(m_partialSequence, count);
- if (character == nonCharacter) {
- handleError(destination, stopOnError, sawError);
- if (stopOnError)
- return false;
- continue;
+bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination,
+ const uint8_t*& source,
+ const uint8_t* end,
+ bool flush,
+ bool stopOnError,
+ bool& sawError) {
+ ASSERT(m_partialSequenceSize);
+ do {
+ if (isASCII(m_partialSequence[0])) {
+ *destination++ = m_partialSequence[0];
+ consumePartialSequenceByte();
+ continue;
+ }
+ int count = nonASCIISequenceLength(m_partialSequence[0]);
+ if (!count) {
+ handleError(destination, stopOnError, sawError);
+ if (stopOnError)
+ return false;
+ continue;
+ }
+ if (count > m_partialSequenceSize) {
+ if (count - m_partialSequenceSize > end - source) {
+ if (!flush) {
+ // The new data is not enough to complete the sequence, so
+ // add it to the existing partial sequence.
+ memcpy(m_partialSequence + m_partialSequenceSize, source,
+ end - source);
+ m_partialSequenceSize += end - source;
+ return false;
}
+ // An incomplete partial sequence at the end is an error.
+ handleError(destination, stopOnError, sawError);
+ if (stopOnError)
+ return false;
+ continue;
+ }
+ memcpy(m_partialSequence + m_partialSequenceSize, source,
+ count - m_partialSequenceSize);
+ source += count - m_partialSequenceSize;
+ m_partialSequenceSize = count;
+ }
+ int character = decodeNonASCIISequence(m_partialSequence, count);
+ if (character == nonCharacter) {
+ handleError(destination, stopOnError, sawError);
+ if (stopOnError)
+ return false;
+ continue;
+ }
- m_partialSequenceSize -= count;
- destination = appendCharacter(destination, character);
- } while (m_partialSequenceSize);
+ m_partialSequenceSize -= count;
+ destination = appendCharacter(destination, character);
+ } while (m_partialSequenceSize);
- return false;
+ return false;
}
-String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flush, bool stopOnError, bool& sawError)
-{
- // Each input byte might turn into a character.
- // That includes all bytes in the partial-sequence buffer because
- // each byte in an invalid sequence will turn into a replacement character.
- StringBuffer<LChar> buffer(m_partialSequenceSize + length);
-
- const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
- const uint8_t* end = source + length;
- const uint8_t* alignedEnd = alignToMachineWord(end);
- LChar* destination = buffer.characters();
-
- do {
- if (m_partialSequenceSize) {
- // Explicitly copy destination and source pointers to avoid taking pointers to the
- // local variables, which may harm code generation by disabling some optimizations
- // in some compilers.
- LChar* destinationForHandlePartialSequence = destination;
- const uint8_t* sourceForHandlePartialSequence = source;
- if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
- source = sourceForHandlePartialSequence;
- goto upConvertTo16Bit;
- }
- destination = destinationForHandlePartialSequence;
- source = sourceForHandlePartialSequence;
- if (m_partialSequenceSize)
- break;
- }
+String TextCodecUTF8::decode(const char* bytes,
+ size_t length,
+ FlushBehavior flush,
+ bool stopOnError,
+ bool& sawError) {
+ // Each input byte might turn into a character.
+ // That includes all bytes in the partial-sequence buffer because
+ // each byte in an invalid sequence will turn into a replacement character.
+ StringBuffer<LChar> buffer(m_partialSequenceSize + length);
+
+ const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
+ const uint8_t* end = source + length;
+ const uint8_t* alignedEnd = alignToMachineWord(end);
+ LChar* destination = buffer.characters();
+
+ do {
+ if (m_partialSequenceSize) {
+ // Explicitly copy destination and source pointers to avoid taking pointers to the
+ // local variables, which may harm code generation by disabling some optimizations
+ // in some compilers.
+ LChar* destinationForHandlePartialSequence = destination;
+ const uint8_t* sourceForHandlePartialSequence = source;
+ if (handlePartialSequence(destinationForHandlePartialSequence,
+ sourceForHandlePartialSequence, end, flush,
+ stopOnError, sawError)) {
+ source = sourceForHandlePartialSequence;
+ goto upConvertTo16Bit;
+ }
+ destination = destinationForHandlePartialSequence;
+ source = sourceForHandlePartialSequence;
+ if (m_partialSequenceSize)
+ break;
+ }
- while (source < end) {
- if (isASCII(*source)) {
- // Fast path for ASCII. Most UTF-8 text will be ASCII.
- if (isAlignedToMachineWord(source)) {
- while (source < alignedEnd) {
- MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
- if (!isAllASCII<LChar>(chunk))
- break;
- copyASCIIMachineWord(destination, source);
- source += sizeof(MachineWord);
- destination += sizeof(MachineWord);
- }
- if (source == end)
- break;
- if (!isASCII(*source))
- continue;
- }
- *destination++ = *source++;
- continue;
- }
- int count = nonASCIISequenceLength(*source);
- int character;
- if (count == 0) {
- character = nonCharacter;
- } else {
- if (count > end - source) {
- ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
- ASSERT(!m_partialSequenceSize);
- m_partialSequenceSize = end - source;
- memcpy(m_partialSequence, source, m_partialSequenceSize);
- source = end;
- break;
- }
- character = decodeNonASCIISequence(source, count);
- }
- if (character == nonCharacter) {
- sawError = true;
- if (stopOnError)
- break;
-
- goto upConvertTo16Bit;
- }
- if (character > 0xff)
- goto upConvertTo16Bit;
-
- source += count;
- *destination++ = static_cast<LChar>(character);
+ while (source < end) {
+ if (isASCII(*source)) {
+ // Fast path for ASCII. Most UTF-8 text will be ASCII.
+ if (isAlignedToMachineWord(source)) {
+ while (source < alignedEnd) {
+ MachineWord chunk =
+ *reinterpret_cast_ptr<const MachineWord*>(source);
+ if (!isAllASCII<LChar>(chunk))
+ break;
+ copyASCIIMachineWord(destination, source);
+ source += sizeof(MachineWord);
+ destination += sizeof(MachineWord);
+ }
+ if (source == end)
+ break;
+ if (!isASCII(*source))
+ continue;
}
- } while (flush && m_partialSequenceSize);
+ *destination++ = *source++;
+ continue;
+ }
+ int count = nonASCIISequenceLength(*source);
+ int character;
+ if (count == 0) {
+ character = nonCharacter;
+ } else {
+ if (count > end - source) {
+ ASSERT_WITH_SECURITY_IMPLICATION(
+ end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
+ ASSERT(!m_partialSequenceSize);
+ m_partialSequenceSize = end - source;
+ memcpy(m_partialSequence, source, m_partialSequenceSize);
+ source = end;
+ break;
+ }
+ character = decodeNonASCIISequence(source, count);
+ }
+ if (character == nonCharacter) {
+ sawError = true;
+ if (stopOnError)
+ break;
+
+ goto upConvertTo16Bit;
+ }
+ if (character > 0xff)
+ goto upConvertTo16Bit;
+
+ source += count;
+ *destination++ = static_cast<LChar>(character);
+ }
+ } while (flush && m_partialSequenceSize);
- buffer.shrink(destination - buffer.characters());
+ buffer.shrink(destination - buffer.characters());
- return String::adopt(buffer);
+ return String::adopt(buffer);
upConvertTo16Bit:
- StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
-
- UChar* destination16 = buffer16.characters();
-
- // Copy the already converted characters
- for (LChar* converted8 = buffer.characters(); converted8 < destination;)
- *destination16++ = *converted8++;
-
- do {
- if (m_partialSequenceSize) {
- // Explicitly copy destination and source pointers to avoid taking pointers to the
- // local variables, which may harm code generation by disabling some optimizations
- // in some compilers.
- UChar* destinationForHandlePartialSequence = destination16;
- const uint8_t* sourceForHandlePartialSequence = source;
- handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
- destination16 = destinationForHandlePartialSequence;
- source = sourceForHandlePartialSequence;
- if (m_partialSequenceSize)
- break;
- }
+ StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
+
+ UChar* destination16 = buffer16.characters();
+
+ // Copy the already converted characters
+ for (LChar* converted8 = buffer.characters(); converted8 < destination;)
+ *destination16++ = *converted8++;
+
+ do {
+ if (m_partialSequenceSize) {
+ // Explicitly copy destination and source pointers to avoid taking pointers to the
+ // local variables, which may harm code generation by disabling some optimizations
+ // in some compilers.
+ UChar* destinationForHandlePartialSequence = destination16;
+ const uint8_t* sourceForHandlePartialSequence = source;
+ handlePartialSequence(destinationForHandlePartialSequence,
+ sourceForHandlePartialSequence, end, flush,
+ stopOnError, sawError);
+ destination16 = destinationForHandlePartialSequence;
+ source = sourceForHandlePartialSequence;
+ if (m_partialSequenceSize)
+ break;
+ }
- while (source < end) {
- if (isASCII(*source)) {
- // Fast path for ASCII. Most UTF-8 text will be ASCII.
- if (isAlignedToMachineWord(source)) {
- while (source < alignedEnd) {
- MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
- if (!isAllASCII<LChar>(chunk))
- break;
- copyASCIIMachineWord(destination16, source);
- source += sizeof(MachineWord);
- destination16 += sizeof(MachineWord);
- }
- if (source == end)
- break;
- if (!isASCII(*source))
- continue;
- }
- *destination16++ = *source++;
- continue;
- }
- int count = nonASCIISequenceLength(*source);
- int character;
- if (count == 0) {
- character = nonCharacter;
- } else {
- if (count > end - source) {
- ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
- ASSERT(!m_partialSequenceSize);
- m_partialSequenceSize = end - source;
- memcpy(m_partialSequence, source, m_partialSequenceSize);
- source = end;
- break;
- }
- character = decodeNonASCIISequence(source, count);
- }
- if (character == nonCharacter) {
- sawError = true;
- if (stopOnError)
- break;
- // Each error generates a replacement character and consumes one byte.
- *destination16++ = replacementCharacter;
- ++source;
- continue;
- }
- source += count;
- destination16 = appendCharacter(destination16, character);
+ while (source < end) {
+ if (isASCII(*source)) {
+ // Fast path for ASCII. Most UTF-8 text will be ASCII.
+ if (isAlignedToMachineWord(source)) {
+ while (source < alignedEnd) {
+ MachineWord chunk =
+ *reinterpret_cast_ptr<const MachineWord*>(source);
+ if (!isAllASCII<LChar>(chunk))
+ break;
+ copyASCIIMachineWord(destination16, source);
+ source += sizeof(MachineWord);
+ destination16 += sizeof(MachineWord);
+ }
+ if (source == end)
+ break;
+ if (!isASCII(*source))
+ continue;
+ }
+ *destination16++ = *source++;
+ continue;
+ }
+ int count = nonASCIISequenceLength(*source);
+ int character;
+ if (count == 0) {
+ character = nonCharacter;
+ } else {
+ if (count > end - source) {
+ ASSERT_WITH_SECURITY_IMPLICATION(
+ end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
+ ASSERT(!m_partialSequenceSize);
+ m_partialSequenceSize = end - source;
+ memcpy(m_partialSequence, source, m_partialSequenceSize);
+ source = end;
+ break;
}
- } while (flush && m_partialSequenceSize);
+ character = decodeNonASCIISequence(source, count);
+ }
+ if (character == nonCharacter) {
+ sawError = true;
+ if (stopOnError)
+ break;
+ // Each error generates a replacement character and consumes one byte.
+ *destination16++ = replacementCharacter;
+ ++source;
+ continue;
+ }
+ source += count;
+ destination16 = appendCharacter(destination16, character);
+ }
+ } while (flush && m_partialSequenceSize);
- buffer16.shrink(destination16 - buffer16.characters());
+ buffer16.shrink(destination16 - buffer16.characters());
- return String::adopt(buffer16);
+ return String::adopt(buffer16);
}
-template<typename CharType>
-CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
-{
- // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
- // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
- // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
- if (length > std::numeric_limits<size_t>::max() / 3)
- CRASH();
- Vector<uint8_t> bytes(length * 3);
-
- size_t i = 0;
- size_t bytesWritten = 0;
- while (i < length) {
- UChar32 character;
- U16_NEXT(characters, i, length, character);
- // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
- // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
- if (0xD800 <= character && character <= 0xDFFF)
- character = replacementCharacter;
- U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
- }
-
- return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
+template <typename CharType>
+CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) {
+ // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
+ // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
+ // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
+ if (length > std::numeric_limits<size_t>::max() / 3)
+ CRASH();
+ Vector<uint8_t> bytes(length * 3);
+
+ size_t i = 0;
+ size_t bytesWritten = 0;
+ while (i < length) {
+ UChar32 character;
+ U16_NEXT(characters, i, length, character);
+ // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
+ // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
+ if (0xD800 <= character && character <= 0xDFFF)
+ character = replacementCharacter;
+ U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
+ }
+
+ return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
}
-CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
-{
- return encodeCommon(characters, length);
+CString TextCodecUTF8::encode(const UChar* characters,
+ size_t length,
+ UnencodableHandling) {
+ return encodeCommon(characters, length);
}
-CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)
-{
- return encodeCommon(characters, length);
+CString TextCodecUTF8::encode(const LChar* characters,
+ size_t length,
+ UnencodableHandling) {
+ return encodeCommon(characters, length);
}
-} // namespace WTF
+} // namespace WTF
« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecUTF8.h ('k') | third_party/WebKit/Source/wtf/text/TextCodecUTF8Test.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698