third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp - Issue 2379333003: UTF-16 Decoder: Convert unpaired surrogates to replacement characters

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp

Issue 2379333003: UTF-16 Decoder: Convert unpaired surrogates to replacement characters (Closed)

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2004, 2006, 2008, 2010 Apple Inc. All rights reserved.	2 * Copyright (C) 2004, 2006, 2008, 2010 Apple Inc. All rights reserved.

3 *	3 *

4 * Redistribution and use in source and binary forms, with or without	4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions	5 * modification, are permitted provided that the following conditions

6 * are met:	6 * are met:

7 * 1. Redistributions of source code must retain the above copyright	7 * 1. Redistributions of source code must retain the above copyright

8 * notice, this list of conditions and the following disclaimer.	8 * notice, this list of conditions and the following disclaimer.

9 * 2. Redistributions in binary form must reproduce the above copyright	9 * 2. Redistributions in binary form must reproduce the above copyright

10 * notice, this list of conditions and the following disclaimer in the	10 * notice, this list of conditions and the following disclaimer in the

(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
66 registrar("UTF-16LE", newStreamingTextDecoderUTF16LE, 0);	66 registrar("UTF-16LE", newStreamingTextDecoderUTF16LE, 0);

67 registrar("UTF-16BE", newStreamingTextDecoderUTF16BE, 0);	67 registrar("UTF-16BE", newStreamingTextDecoderUTF16BE, 0);

68 }	68 }

69	69

70 String TextCodecUTF16::decode(const char* bytes, size_t length, FlushBehavior flush, bool, bool& sawError)	70 String TextCodecUTF16::decode(const char* bytes, size_t length, FlushBehavior flush, bool, bool& sawError)

71 {	71 {

72 // For compatibility reasons, ignore flush from fetch EOF.	72 // For compatibility reasons, ignore flush from fetch EOF.

73 const bool reallyFlush = flush != DoNotFlush && flush != FetchEOF;	73 const bool reallyFlush = flush != DoNotFlush && flush != FetchEOF;

74	74

75 if (!length) {	75 if (!length) {

76 if (!reallyFlush || !m_haveBufferedByte)	76 if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {

77 return String();	77 m_haveLeadByte = m_haveLeadSurrogate = false;

78 sawError = true;	78 sawError = true;

79 return String(&replacementCharacter, 1);	79 return String(&replacementCharacter, 1);

	80 }

	81 return String();

80 }	82 }

81	83

82 // FIXME: This should generate an error if there is an unpaired surrogate.	84 const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);

	85 const size_t numBytes = length + m_haveLeadByte;

	86 const bool willHaveExtraByte = numBytes & 1;

	87 const size_t numCharsIn = numBytes / 2;

	88 const size_t maxCharsOut = numCharsIn + (m_haveLeadSurrogate ? 1 : 0) + (reallyFlush && willHaveExtraByte ? 1 : 0);

83	89

84 const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);	90 StringBuffer<UChar> buffer(maxCharsOut);

85 size_t numBytes = length + m_haveBufferedByte;

86 size_t numCharsIn = numBytes / 2;

87 size_t numCharsOut = ((numBytes & 1) && reallyFlush) ? numCharsIn + 1 : numCharsIn;

88

89 StringBuffer<UChar> buffer(numCharsOut);

90 UChar* q = buffer.characters();	91 UChar* q = buffer.characters();

91	92

92 if (m_haveBufferedByte) {	93 for (size_t i = 0; i < numCharsIn; ++i) {

93 UChar c;	94 UChar c;

94 if (m_littleEndian)	95 if (m_haveLeadByte) {

95 c = m_bufferedByte | (p[0] << 8);	96 c = m_littleEndian ? (m_leadByte | (p[0] << 8)) : ((m_leadByte << 8) | p[0]);

96 else	97 m_haveLeadByte = false;

97 c = (m_bufferedByte << 8) | p[0];	98 ++p;

98 *q++ = c;	99 } else {

99 m_haveBufferedByte = false;	100 c = m_littleEndian ? (p[0] | (p[1] << 8)) : ((p[0] << 8) | p[1]);

100 p += 1;	101 p += 2;

101 numCharsIn -= 1;	102 }

102 }

103	103

104 if (m_littleEndian) {	104 // TODO(jsbell): If necessary for performance, m_haveLeadByte handling

105 for (size_t i = 0; i < numCharsIn; ++i) {	105 // can be pulled out and this loop split into distinct cases for

106 UChar c = p[0] | (p[1] << 8);	106 // big/little endian. The logic from here to the end of the loop is

107 p += 2;	107 // constant with respect to m_haveLeadByte and m_littleEndian.

	108

	109 if (m_haveLeadSurrogate && U_IS_TRAIL(c)) {

	110 *q++ = m_leadSurrogate;

	111 m_haveLeadSurrogate = false;

108 *q++ = c;	112 *q++ = c;

109 }	113 } else {

110 } else {	114 if (m_haveLeadSurrogate) {

111 for (size_t i = 0; i < numCharsIn; ++i) {	115 m_haveLeadSurrogate = false;

112 UChar c = (p[0] << 8) | p[1];	116 sawError = true;

113 p += 2;	117 *q++ = replacementCharacter;

114 *q++ = c;	118 }

	119

	120 if (U_IS_LEAD(c)) {

	121 m_haveLeadSurrogate = true;

	122 m_leadSurrogate = c;

	123 } else if (U_IS_TRAIL(c)) {

	124 sawError = true;

	125 *q++ = replacementCharacter;

	126 } else {

	127 *q++ = c;

	128 }

115 }	129 }

116 }	130 }

117	131

118 if (numBytes & 1) {	132 if (willHaveExtraByte) {

119 ASSERT(!m_haveBufferedByte);	133 DCHECK(!m_haveLeadByte);
	foolip 2016/09/30 22:59:41: I think it's the m_haveLeadByte=false in the loop that makes this hold even if called with length=2 and m_haveLeadByte=true, but doesn't that hold even if !willHaveExtraByte, i.e. can this DCHECK be outside the if statement? jsbell 2016/09/30 23:52:13 (in reply to foolip's question "can this DCHECK be outside the if statement?"): Yes.
	134 m_haveLeadByte = true;

	135 m_leadByte = p[0];

	136 }

120	137

121 if (reallyFlush) {	138 if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {

122 sawError = true;	139 m_haveLeadByte = m_haveLeadSurrogate = false;

123 *q++ = replacementCharacter;	140 sawError = true;

124 } else {	141 *q++ = replacementCharacter;

125 m_haveBufferedByte = true;

126 m_bufferedByte = p[0];

127 }

128 }	142 }

129	143

130 buffer.shrink(q - buffer.characters());	144 buffer.shrink(q - buffer.characters());

131	145

132 return String::adopt(buffer);	146 return String::adopt(buffer);

133 }	147 }

134	148

135 CString TextCodecUTF16::encode(const UChar* characters, size_t length, UnencodableHandling)	149 CString TextCodecUTF16::encode(const UChar* characters, size_t length, UnencodableHandling)

136 {	150 {

137 // We need to be sure we can double the length without overflowing.	151 // We need to be sure we can double the length without overflowing.

(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
181 for (size_t i = 0; i < length; ++i) {	195 for (size_t i = 0; i < length; ++i) {

182 bytes[i * 2] = 0;	196 bytes[i * 2] = 0;

183 bytes[i * 2 + 1] = characters[i];	197 bytes[i * 2 + 1] = characters[i];

184 }	198 }

185 }	199 }

186	200

187 return result;	201 return result;

188 }	202 }

189	203

190 } // namespace WTF	204 } // namespace WTF

OLD	NEW

« third_party/WebKit/LayoutTests/fast/encoding/utf-16-lone-surrogates.html ('K') | « third_party/WebKit/Source/wtf/text/TextCodecUTF16.h ('k') | no next file » | no next file with comments »