third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp - Issue 2379333003: UTF-16 Decoder: Convert unpaired surrogates to replacement characters

Unified Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp

Issue 2379333003: UTF-16 Decoder: Convert unpaired surrogates to replacement characters (Closed)

Patch Set: Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp

diff --git a/third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp b/third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp

index f3e2c12b61ea5457757fcdd718dd896a57f32497..a55087f6f586e0b63b54d661e01428f73c57ba79 100644

--- a/third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp

+++ b/third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp

@@ -73,58 +73,72 @@ String TextCodecUTF16::decode(const char* bytes, size_t length, FlushBehavior fl

const bool reallyFlush = flush != DoNotFlush && flush != FetchEOF;

if (!length) {

- if (!reallyFlush || !m_haveBufferedByte)

- return String();

- sawError = true;

- return String(&replacementCharacter, 1);

+ if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {

+ m_haveLeadByte = m_haveLeadSurrogate = false;

+ sawError = true;

+ return String(&replacementCharacter, 1);

+ }

+ return String();

}

- // FIXME: This should generate an error if there is an unpaired surrogate.

const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);

- size_t numBytes = length + m_haveBufferedByte;

- size_t numCharsIn = numBytes / 2;

- size_t numCharsOut = ((numBytes & 1) && reallyFlush) ? numCharsIn + 1 : numCharsIn;

+ const size_t numBytes = length + m_haveLeadByte;

+ const bool willHaveExtraByte = numBytes & 1;

+ const size_t numCharsIn = numBytes / 2;

+ const size_t maxCharsOut = numCharsIn + (m_haveLeadSurrogate ? 1 : 0) + (reallyFlush && willHaveExtraByte ? 1 : 0);

- StringBuffer<UChar> buffer(numCharsOut);

+ StringBuffer<UChar> buffer(maxCharsOut);

UChar* q = buffer.characters();

- if (m_haveBufferedByte) {

+ for (size_t i = 0; i < numCharsIn; ++i) {

UChar c;

- if (m_littleEndian)

- c = m_bufferedByte | (p[0] << 8);

- else

- c = (m_bufferedByte << 8) | p[0];

- *q++ = c;

- m_haveBufferedByte = false;

- p += 1;

- numCharsIn -= 1;

- }

- if (m_littleEndian) {

- for (size_t i = 0; i < numCharsIn; ++i) {

- UChar c = p[0] | (p[1] << 8);

+ if (m_haveLeadByte) {

+ c = m_littleEndian ? (m_leadByte | (p[0] << 8)) : ((m_leadByte << 8) | p[0]);

+ m_haveLeadByte = false;

+ ++p;

+ } else {

+ c = m_littleEndian ? (p[0] | (p[1] << 8)) : ((p[0] << 8) | p[1]);

p += 2;

- *q++ = c;

}

- } else {

- for (size_t i = 0; i < numCharsIn; ++i) {

- UChar c = (p[0] << 8) | p[1];

- p += 2;

+ // TODO(jsbell): If necessary for performance, m_haveLeadByte handling

+ // can be pulled out and this loop split into distinct cases for

+ // big/little endian. The logic from here to the end of the loop is

+ // constant with respect to m_haveLeadByte and m_littleEndian.

+ if (m_haveLeadSurrogate && U_IS_TRAIL(c)) {

+ *q++ = m_leadSurrogate;

+ m_haveLeadSurrogate = false;

*q++ = c;

+ } else {

+ if (m_haveLeadSurrogate) {

+ m_haveLeadSurrogate = false;

+ sawError = true;

+ *q++ = replacementCharacter;

+ }

+ if (U_IS_LEAD(c)) {

+ m_haveLeadSurrogate = true;

+ m_leadSurrogate = c;

+ } else if (U_IS_TRAIL(c)) {

+ sawError = true;

+ *q++ = replacementCharacter;

+ } else {

+ *q++ = c;

+ }

}

- if (numBytes & 1) {

- ASSERT(!m_haveBufferedByte);

+ if (willHaveExtraByte) {

+ DCHECK(!m_haveLeadByte);

foolip 2016/09/30 22:59:41 I think it's the m_haveLeadByte=false in the loop

jsbell 2016/09/30 23:52:13 Yes.

+ m_haveLeadByte = true;

+ m_leadByte = p[0];

+ }

- if (reallyFlush) {

- sawError = true;

- *q++ = replacementCharacter;

- } else {

- m_haveBufferedByte = true;

- m_bufferedByte = p[0];

- }

+ if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {

+ m_haveLeadByte = m_haveLeadSurrogate = false;

+ sawError = true;

+ *q++ = replacementCharacter;

}

buffer.shrink(q - buffer.characters());

« third_party/WebKit/LayoutTests/fast/encoding/utf-16-lone-surrogates.html ('K') | « third_party/WebKit/Source/wtf/text/TextCodecUTF16.h ('k') | no next file » | no next file with comments »