third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp - Issue 2379333003: UTF-16 Decoder: Convert unpaired surrogates to replacement characters

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp

Issue 2379333003: UTF-16 Decoder: Convert unpaired surrogates to replacement characters (Closed)

Patch Set: Rebase, switch test to testharness Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2004, 2006, 2008, 2010 Apple Inc. All rights reserved.	2 * Copyright (C) 2004, 2006, 2008, 2010 Apple Inc. All rights reserved.

3 *	3 *

4 * Redistribution and use in source and binary forms, with or without	4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions	5 * modification, are permitted provided that the following conditions

6 * are met:	6 * are met:

7 * 1. Redistributions of source code must retain the above copyright	7 * 1. Redistributions of source code must retain the above copyright

8 * notice, this list of conditions and the following disclaimer.	8 * notice, this list of conditions and the following disclaimer.

9 * 2. Redistributions in binary form must reproduce the above copyright	9 * 2. Redistributions in binary form must reproduce the above copyright

10 * notice, this list of conditions and the following disclaimer in the	10 * notice, this list of conditions and the following disclaimer in the

(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
69	69

70 String TextCodecUTF16::decode(const char* bytes,	70 String TextCodecUTF16::decode(const char* bytes,

71 size_t length,	71 size_t length,

72 FlushBehavior flush,	72 FlushBehavior flush,

73 bool,	73 bool,

74 bool& sawError) {	74 bool& sawError) {

75 // For compatibility reasons, ignore flush from fetch EOF.	75 // For compatibility reasons, ignore flush from fetch EOF.

76 const bool reallyFlush = flush != DoNotFlush && flush != FetchEOF;	76 const bool reallyFlush = flush != DoNotFlush && flush != FetchEOF;

77	77

78 if (!length) {	78 if (!length) {

79 if (!reallyFlush || !m_haveBufferedByte)	79 if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {

80 return String();	80 m_haveLeadByte = m_haveLeadSurrogate = false;

81 sawError = true;	81 sawError = true;

82 return String(&replacementCharacter, 1);	82 return String(&replacementCharacter, 1);

	83 }

	84 return String();

83 }	85 }

84	86

85 // FIXME: This should generate an error if there is an unpaired surrogate.	87 const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);

	88 const size_t numBytes = length + m_haveLeadByte;

	89 const bool willHaveExtraByte = numBytes & 1;

	90 const size_t numCharsIn = numBytes / 2;

	91 const size_t maxCharsOut = numCharsIn + (m_haveLeadSurrogate ? 1 : 0) +

	92 (reallyFlush && willHaveExtraByte ? 1 : 0);

86	93

87 const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);	94 StringBuffer<UChar> buffer(maxCharsOut);

88 size_t numBytes = length + m_haveBufferedByte;

89 size_t numCharsIn = numBytes / 2;

90 size_t numCharsOut =

91 ((numBytes & 1) && reallyFlush) ? numCharsIn + 1 : numCharsIn;

92

93 StringBuffer<UChar> buffer(numCharsOut);

94 UChar* q = buffer.characters();	95 UChar* q = buffer.characters();

95	96

96 if (m_haveBufferedByte) {	97 for (size_t i = 0; i < numCharsIn; ++i) {

97 UChar c;	98 UChar c;

98 if (m_littleEndian)	99 if (m_haveLeadByte) {

99 c = m_bufferedByte | (p[0] << 8);	100 c = m_littleEndian ? (m_leadByte | (p[0] << 8))

100 else	101 : ((m_leadByte << 8) | p[0]);

101 c = (m_bufferedByte << 8) | p[0];	102 m_haveLeadByte = false;

102 *q++ = c;	103 ++p;

103 m_haveBufferedByte = false;	104 } else {

104 p += 1;	105 c = m_littleEndian ? (p[0] | (p[1] << 8)) : ((p[0] << 8) | p[1]);

105 numCharsIn -= 1;	106 p += 2;

106 }	107 }

107	108

108 if (m_littleEndian) {	109 // TODO(jsbell): If necessary for performance, m_haveLeadByte handling

109 for (size_t i = 0; i < numCharsIn; ++i) {	110 // can be pulled out and this loop split into distinct cases for

110 UChar c = p[0] | (p[1] << 8);	111 // big/little endian. The logic from here to the end of the loop is

111 p += 2;	112 // constant with respect to m_haveLeadByte and m_littleEndian.

	113

	114 if (m_haveLeadSurrogate && U_IS_TRAIL(c)) {

	115 *q++ = m_leadSurrogate;

	116 m_haveLeadSurrogate = false;

112 *q++ = c;	117 *q++ = c;

113 }	118 } else {

114 } else {	119 if (m_haveLeadSurrogate) {

115 for (size_t i = 0; i < numCharsIn; ++i) {	120 m_haveLeadSurrogate = false;

116 UChar c = (p[0] << 8) | p[1];	121 sawError = true;

117 p += 2;	122 *q++ = replacementCharacter;

118 *q++ = c;	123 }

	124

	125 if (U_IS_LEAD(c)) {

	126 m_haveLeadSurrogate = true;

	127 m_leadSurrogate = c;

	128 } else if (U_IS_TRAIL(c)) {

	129 sawError = true;

	130 *q++ = replacementCharacter;

	131 } else {

	132 *q++ = c;

	133 }

119 }	134 }

120 }	135 }

121	136

122 if (numBytes & 1) {	137 DCHECK(!m_haveLeadByte);

123 ASSERT(!m_haveBufferedByte);	138 if (willHaveExtraByte) {

	139 m_haveLeadByte = true;

	140 m_leadByte = p[0];

	141 }

124	142

125 if (reallyFlush) {	143 if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {

126 sawError = true;	144 m_haveLeadByte = m_haveLeadSurrogate = false;

127 *q++ = replacementCharacter;	145 sawError = true;

128 } else {	146 *q++ = replacementCharacter;

129 m_haveBufferedByte = true;

130 m_bufferedByte = p[0];

131 }

132 }	147 }

133	148

134 buffer.shrink(q - buffer.characters());	149 buffer.shrink(q - buffer.characters());

135	150

136 return String::adopt(buffer);	151 return String::adopt(buffer);

137 }	152 }

138	153

139 CString TextCodecUTF16::encode(const UChar* characters,	154 CString TextCodecUTF16::encode(const UChar* characters,

140 size_t length,	155 size_t length,

141 UnencodableHandling) {	156 UnencodableHandling) {

(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
188 for (size_t i = 0; i < length; ++i) {	203 for (size_t i = 0; i < length; ++i) {

189 bytes[i * 2] = 0;	204 bytes[i * 2] = 0;

190 bytes[i * 2 + 1] = characters[i];	205 bytes[i * 2 + 1] = characters[i];

191 }	206 }

192 }	207 }

193	208

194 return result;	209 return result;

195 }	210 }

196	211

197 } // namespace WTF	212 } // namespace WTF

OLD	NEW

« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecUTF16.h ('k') | no next file » | no next file with comments »