Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(160)

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp

Issue 2379333003: UTF-16 Decoder: Convert unpaired surrogates to replacement characters (Closed)
Patch Set: Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2004, 2006, 2008, 2010 Apple Inc. All rights reserved. 2 * Copyright (C) 2004, 2006, 2008, 2010 Apple Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions 5 * modification, are permitted provided that the following conditions
6 * are met: 6 * are met:
7 * 1. Redistributions of source code must retain the above copyright 7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer. 8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright 9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the 10 * notice, this list of conditions and the following disclaimer in the
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
66 registrar("UTF-16LE", newStreamingTextDecoderUTF16LE, 0); 66 registrar("UTF-16LE", newStreamingTextDecoderUTF16LE, 0);
67 registrar("UTF-16BE", newStreamingTextDecoderUTF16BE, 0); 67 registrar("UTF-16BE", newStreamingTextDecoderUTF16BE, 0);
68 } 68 }
69 69
70 String TextCodecUTF16::decode(const char* bytes, size_t length, FlushBehavior flush, bool, bool& sawError) 70 String TextCodecUTF16::decode(const char* bytes, size_t length, FlushBehavior flush, bool, bool& sawError)
71 { 71 {
72 // For compatibility reasons, ignore flush from fetch EOF. 72 // For compatibility reasons, ignore flush from fetch EOF.
73 const bool reallyFlush = flush != DoNotFlush && flush != FetchEOF; 73 const bool reallyFlush = flush != DoNotFlush && flush != FetchEOF;
74 74
75 if (!length) { 75 if (!length) {
76 if (!reallyFlush || !m_haveBufferedByte) 76 if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {
77 return String(); 77 m_haveLeadByte = m_haveLeadSurrogate = false;
78 sawError = true; 78 sawError = true;
79 return String(&replacementCharacter, 1); 79 return String(&replacementCharacter, 1);
80 }
81 return String();
80 } 82 }
81 83
82 // FIXME: This should generate an error if there is an unpaired surrogate. 84 const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);
85 const size_t numBytes = length + m_haveLeadByte;
86 const bool willHaveExtraByte = numBytes & 1;
87 const size_t numCharsIn = numBytes / 2;
88 const size_t maxCharsOut = numCharsIn + (m_haveLeadSurrogate ? 1 : 0) + (reallyFlush && willHaveExtraByte ? 1 : 0);
83 89
84 const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes); 90 StringBuffer<UChar> buffer(maxCharsOut);
85 size_t numBytes = length + m_haveBufferedByte;
86 size_t numCharsIn = numBytes / 2;
87 size_t numCharsOut = ((numBytes & 1) && reallyFlush) ? numCharsIn + 1 : numCharsIn;
88
89 StringBuffer<UChar> buffer(numCharsOut);
90 UChar* q = buffer.characters(); 91 UChar* q = buffer.characters();
91 92
92 if (m_haveBufferedByte) { 93 for (size_t i = 0; i < numCharsIn; ++i) {
93 UChar c; 94 UChar c;
94 if (m_littleEndian) 95 if (m_haveLeadByte) {
95 c = m_bufferedByte | (p[0] << 8); 96 c = m_littleEndian ? (m_leadByte | (p[0] << 8)) : ((m_leadByte << 8) | p[0]);
96 else 97 m_haveLeadByte = false;
97 c = (m_bufferedByte << 8) | p[0]; 98 ++p;
98 *q++ = c; 99 } else {
99 m_haveBufferedByte = false; 100 c = m_littleEndian ? (p[0] | (p[1] << 8)) : ((p[0] << 8) | p[1]);
100 p += 1; 101 p += 2;
101 numCharsIn -= 1; 102 }
102 }
103 103
104 if (m_littleEndian) { 104 // TODO(jsbell): If necessary for performance, m_haveLeadByte handling
105 for (size_t i = 0; i < numCharsIn; ++i) { 105 // can be pulled out and this loop split into distinct cases for
106 UChar c = p[0] | (p[1] << 8); 106 // big/little endian. The logic from here to the end of the loop is
107 p += 2; 107 // constant with respect to m_haveLeadByte and m_littleEndian.
108
109 if (m_haveLeadSurrogate && U_IS_TRAIL(c)) {
110 *q++ = m_leadSurrogate;
111 m_haveLeadSurrogate = false;
108 *q++ = c; 112 *q++ = c;
109 } 113 } else {
110 } else { 114 if (m_haveLeadSurrogate) {
111 for (size_t i = 0; i < numCharsIn; ++i) { 115 m_haveLeadSurrogate = false;
112 UChar c = (p[0] << 8) | p[1]; 116 sawError = true;
113 p += 2; 117 *q++ = replacementCharacter;
114 *q++ = c; 118 }
119
120 if (U_IS_LEAD(c)) {
121 m_haveLeadSurrogate = true;
122 m_leadSurrogate = c;
123 } else if (U_IS_TRAIL(c)) {
124 sawError = true;
125 *q++ = replacementCharacter;
126 } else {
127 *q++ = c;
128 }
115 } 129 }
116 } 130 }
117 131
118 if (numBytes & 1) { 132 if (willHaveExtraByte) {
119 ASSERT(!m_haveBufferedByte); 133 DCHECK(!m_haveLeadByte);
foolip 2016/09/30 22:59:41 I think it's the m_haveLeadByte=false in the loop
jsbell 2016/09/30 23:52:13 Yes.
134 m_haveLeadByte = true;
135 m_leadByte = p[0];
136 }
120 137
121 if (reallyFlush) { 138 if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {
122 sawError = true; 139 m_haveLeadByte = m_haveLeadSurrogate = false;
123 *q++ = replacementCharacter; 140 sawError = true;
124 } else { 141 *q++ = replacementCharacter;
125 m_haveBufferedByte = true;
126 m_bufferedByte = p[0];
127 }
128 } 142 }
129 143
130 buffer.shrink(q - buffer.characters()); 144 buffer.shrink(q - buffer.characters());
131 145
132 return String::adopt(buffer); 146 return String::adopt(buffer);
133 } 147 }
134 148
135 CString TextCodecUTF16::encode(const UChar* characters, size_t length, UnencodableHandling) 149 CString TextCodecUTF16::encode(const UChar* characters, size_t length, UnencodableHandling)
136 { 150 {
137 // We need to be sure we can double the length without overflowing. 151 // We need to be sure we can double the length without overflowing.
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
181 for (size_t i = 0; i < length; ++i) { 195 for (size_t i = 0; i < length; ++i) {
182 bytes[i * 2] = 0; 196 bytes[i * 2] = 0;
183 bytes[i * 2 + 1] = characters[i]; 197 bytes[i * 2 + 1] = characters[i];
184 } 198 }
185 } 199 }
186 200
187 return result; 201 return result;
188 } 202 }
189 203
190 } // namespace WTF 204 } // namespace WTF
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698