Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(38)

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF16.cpp

Issue 2379333003: UTF-16 Decoder: Convert unpaired surrogates to replacement characters (Closed)
Patch Set: Rebase, switch test to testharness Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecUTF16.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright (C) 2004, 2006, 2008, 2010 Apple Inc. All rights reserved. 2 * Copyright (C) 2004, 2006, 2008, 2010 Apple Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions 5 * modification, are permitted provided that the following conditions
6 * are met: 6 * are met:
7 * 1. Redistributions of source code must retain the above copyright 7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer. 8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright 9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the 10 * notice, this list of conditions and the following disclaimer in the
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after
69 69
70 String TextCodecUTF16::decode(const char* bytes, 70 String TextCodecUTF16::decode(const char* bytes,
71 size_t length, 71 size_t length,
72 FlushBehavior flush, 72 FlushBehavior flush,
73 bool, 73 bool,
74 bool& sawError) { 74 bool& sawError) {
75 // For compatibility reasons, ignore flush from fetch EOF. 75 // For compatibility reasons, ignore flush from fetch EOF.
76 const bool reallyFlush = flush != DoNotFlush && flush != FetchEOF; 76 const bool reallyFlush = flush != DoNotFlush && flush != FetchEOF;
77 77
78 if (!length) { 78 if (!length) {
79 if (!reallyFlush || !m_haveBufferedByte) 79 if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {
80 return String(); 80 m_haveLeadByte = m_haveLeadSurrogate = false;
81 sawError = true; 81 sawError = true;
82 return String(&replacementCharacter, 1); 82 return String(&replacementCharacter, 1);
83 }
84 return String();
83 } 85 }
84 86
85 // FIXME: This should generate an error if there is an unpaired surrogate. 87 const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);
88 const size_t numBytes = length + m_haveLeadByte;
89 const bool willHaveExtraByte = numBytes & 1;
90 const size_t numCharsIn = numBytes / 2;
91 const size_t maxCharsOut = numCharsIn + (m_haveLeadSurrogate ? 1 : 0) +
92 (reallyFlush && willHaveExtraByte ? 1 : 0);
86 93
87 const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes); 94 StringBuffer<UChar> buffer(maxCharsOut);
88 size_t numBytes = length + m_haveBufferedByte;
89 size_t numCharsIn = numBytes / 2;
90 size_t numCharsOut =
91 ((numBytes & 1) && reallyFlush) ? numCharsIn + 1 : numCharsIn;
92
93 StringBuffer<UChar> buffer(numCharsOut);
94 UChar* q = buffer.characters(); 95 UChar* q = buffer.characters();
95 96
96 if (m_haveBufferedByte) { 97 for (size_t i = 0; i < numCharsIn; ++i) {
97 UChar c; 98 UChar c;
98 if (m_littleEndian) 99 if (m_haveLeadByte) {
99 c = m_bufferedByte | (p[0] << 8); 100 c = m_littleEndian ? (m_leadByte | (p[0] << 8))
100 else 101 : ((m_leadByte << 8) | p[0]);
101 c = (m_bufferedByte << 8) | p[0]; 102 m_haveLeadByte = false;
102 *q++ = c; 103 ++p;
103 m_haveBufferedByte = false; 104 } else {
104 p += 1; 105 c = m_littleEndian ? (p[0] | (p[1] << 8)) : ((p[0] << 8) | p[1]);
105 numCharsIn -= 1; 106 p += 2;
106 } 107 }
107 108
108 if (m_littleEndian) { 109 // TODO(jsbell): If necessary for performance, m_haveLeadByte handling
109 for (size_t i = 0; i < numCharsIn; ++i) { 110 // can be pulled out and this loop split into distinct cases for
110 UChar c = p[0] | (p[1] << 8); 111 // big/little endian. The logic from here to the end of the loop is
111 p += 2; 112 // constant with respect to m_haveLeadByte and m_littleEndian.
113
114 if (m_haveLeadSurrogate && U_IS_TRAIL(c)) {
115 *q++ = m_leadSurrogate;
116 m_haveLeadSurrogate = false;
112 *q++ = c; 117 *q++ = c;
113 } 118 } else {
114 } else { 119 if (m_haveLeadSurrogate) {
115 for (size_t i = 0; i < numCharsIn; ++i) { 120 m_haveLeadSurrogate = false;
116 UChar c = (p[0] << 8) | p[1]; 121 sawError = true;
117 p += 2; 122 *q++ = replacementCharacter;
118 *q++ = c; 123 }
124
125 if (U_IS_LEAD(c)) {
126 m_haveLeadSurrogate = true;
127 m_leadSurrogate = c;
128 } else if (U_IS_TRAIL(c)) {
129 sawError = true;
130 *q++ = replacementCharacter;
131 } else {
132 *q++ = c;
133 }
119 } 134 }
120 } 135 }
121 136
122 if (numBytes & 1) { 137 DCHECK(!m_haveLeadByte);
123 ASSERT(!m_haveBufferedByte); 138 if (willHaveExtraByte) {
139 m_haveLeadByte = true;
140 m_leadByte = p[0];
141 }
124 142
125 if (reallyFlush) { 143 if (reallyFlush && (m_haveLeadByte || m_haveLeadSurrogate)) {
126 sawError = true; 144 m_haveLeadByte = m_haveLeadSurrogate = false;
127 *q++ = replacementCharacter; 145 sawError = true;
128 } else { 146 *q++ = replacementCharacter;
129 m_haveBufferedByte = true;
130 m_bufferedByte = p[0];
131 }
132 } 147 }
133 148
134 buffer.shrink(q - buffer.characters()); 149 buffer.shrink(q - buffer.characters());
135 150
136 return String::adopt(buffer); 151 return String::adopt(buffer);
137 } 152 }
138 153
139 CString TextCodecUTF16::encode(const UChar* characters, 154 CString TextCodecUTF16::encode(const UChar* characters,
140 size_t length, 155 size_t length,
141 UnencodableHandling) { 156 UnencodableHandling) {
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
188 for (size_t i = 0; i < length; ++i) { 203 for (size_t i = 0; i < length; ++i) {
189 bytes[i * 2] = 0; 204 bytes[i * 2] = 0;
190 bytes[i * 2 + 1] = characters[i]; 205 bytes[i * 2 + 1] = characters[i];
191 } 206 }
192 } 207 }
193 208
194 return result; 209 return result;
195 } 210 }
196 211
197 } // namespace WTF 212 } // namespace WTF
OLD | NEW
« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecUTF16.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698