Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(279)

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

Issue 1436153002: Apply clang-format with Chromium-style without column limit. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions 5 * modification, are permitted provided that the following conditions
6 * are met: 6 * are met:
7 * 1. Redistributions of source code must retain the above copyright 7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer. 8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright 9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the 10 * notice, this list of conditions and the following disclaimer in the
(...skipping 19 matching lines...) Expand all
30 #include "wtf/text/CharacterNames.h" 30 #include "wtf/text/CharacterNames.h"
31 #include "wtf/text/StringBuffer.h" 31 #include "wtf/text/StringBuffer.h"
32 #include "wtf/text/TextCodecASCIIFastPath.h" 32 #include "wtf/text/TextCodecASCIIFastPath.h"
33 33
34 namespace WTF { 34 namespace WTF {
35 35
36 using namespace WTF::Unicode; 36 using namespace WTF::Unicode;
37 37
38 const int nonCharacter = -1; 38 const int nonCharacter = -1;
39 39
40 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) 40 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) {
41 { 41 return adoptPtr(new TextCodecUTF8);
42 return adoptPtr(new TextCodecUTF8); 42 }
43 } 43
44 44 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {
45 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) 45 registrar("UTF-8", "UTF-8");
46 { 46
47 registrar("UTF-8", "UTF-8"); 47 // Additional aliases that originally were present in the encoding
48 48 // table in WebKit on Macintosh, and subsequently added by
49 // Additional aliases that originally were present in the encoding 49 // TextCodecICU. Perhaps we can prove some are not used on the web
50 // table in WebKit on Macintosh, and subsequently added by 50 // and remove them.
51 // TextCodecICU. Perhaps we can prove some are not used on the web 51 registrar("unicode11utf8", "UTF-8");
52 // and remove them. 52 registrar("unicode20utf8", "UTF-8");
53 registrar("unicode11utf8", "UTF-8"); 53 registrar("utf8", "UTF-8");
54 registrar("unicode20utf8", "UTF-8"); 54 registrar("x-unicode20utf8", "UTF-8");
55 registrar("utf8", "UTF-8"); 55
56 registrar("x-unicode20utf8", "UTF-8"); 56 // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
57 57 // and Firefox (24), but not in ICU 4.6.
58 // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/) 58 registrar("unicode-1-1-utf-8", "UTF-8");
59 // and Firefox (24), but not in ICU 4.6. 59 }
60 registrar("unicode-1-1-utf-8", "UTF-8"); 60
61 } 61 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) {
62 62 registrar("UTF-8", create, 0);
63 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) 63 }
64 { 64
65 registrar("UTF-8", create, 0); 65 static inline int nonASCIISequenceLength(uint8_t firstByte) {
66 } 66 static const uint8_t lengths[256] = {
67 67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68 static inline int nonASCIISequenceLength(uint8_t firstByte) 68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69 { 69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70 static const uint8_t lengths[256] = { 70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 75 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
76 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 76 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 77 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 78 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
81 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 81 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 82 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
83 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 83 return lengths[firstByte];
84 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 84 }
85 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 85
86 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 86 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned lengt h) {
87 }; 87 ASSERT(!isASCII(sequence[0]));
88 return lengths[firstByte]; 88 if (length == 2) {
89 } 89 ASSERT(sequence[0] <= 0xDF);
90 90 if (sequence[0] < 0xC2)
91 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned lengt h) 91 return nonCharacter;
92 { 92 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
93 ASSERT(!isASCII(sequence[0])); 93 return nonCharacter;
94 if (length == 2) { 94 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
95 ASSERT(sequence[0] <= 0xDF); 95 }
96 if (sequence[0] < 0xC2) 96 if (length == 3) {
97 return nonCharacter; 97 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
98 switch (sequence[0]) {
99 case 0xE0:
100 if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
101 return nonCharacter;
102 break;
103 case 0xED:
104 if (sequence[1] < 0x80 || sequence[1] > 0x9F)
105 return nonCharacter;
106 break;
107 default:
98 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 108 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
99 return nonCharacter; 109 return nonCharacter;
100 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; 110 }
101 } 111 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
102 if (length == 3) { 112 return nonCharacter;
103 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); 113 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080 ;
104 switch (sequence[0]) { 114 }
105 case 0xE0: 115 ASSERT(length == 4);
106 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) 116 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
107 return nonCharacter; 117 switch (sequence[0]) {
118 case 0xF0:
119 if (sequence[1] < 0x90 || sequence[1] > 0xBF)
120 return nonCharacter;
121 break;
122 case 0xF4:
123 if (sequence[1] < 0x80 || sequence[1] > 0x8F)
124 return nonCharacter;
125 break;
126 default:
127 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
128 return nonCharacter;
129 }
130 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
131 return nonCharacter;
132 if (sequence[3] < 0x80 || sequence[3] > 0xBF)
133 return nonCharacter;
134 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + seque nce[3]) - 0x03C82080;
135 }
136
137 static inline UChar* appendCharacter(UChar* destination, int character) {
138 ASSERT(character != nonCharacter);
139 ASSERT(!U_IS_SURROGATE(character));
140 if (U_IS_BMP(character)) {
141 *destination++ = static_cast<UChar>(character);
142 } else {
143 *destination++ = U16_LEAD(character);
144 *destination++ = U16_TRAIL(character);
145 }
146 return destination;
147 }
148
149 void TextCodecUTF8::consumePartialSequenceByte() {
150 --m_partialSequenceSize;
151 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
152 }
153
154 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& saw Error) {
155 sawError = true;
156 if (stopOnError)
157 return;
158 // Each error generates a replacement character and consumes one byte.
159 *destination++ = replacementCharacter;
160 consumePartialSequenceByte();
161 }
162
163 template <>
164 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint 8_t*& source, const uint8_t* end, bool flush, bool, bool&) {
165 ASSERT(m_partialSequenceSize);
166 do {
167 if (isASCII(m_partialSequence[0])) {
168 *destination++ = m_partialSequence[0];
169 consumePartialSequenceByte();
170 continue;
171 }
172 int count = nonASCIISequenceLength(m_partialSequence[0]);
173 if (!count)
174 return true;
175
176 if (count > m_partialSequenceSize) {
177 if (count - m_partialSequenceSize > end - source) {
178 if (!flush) {
179 // The new data is not enough to complete the sequence, so
180 // add it to the existing partial sequence.
181 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source );
182 m_partialSequenceSize += end - source;
183 return false;
184 }
185 // An incomplete partial sequence at the end is an error, but it will create
186 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
187 // the error.
188 return true;
189 }
190 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partia lSequenceSize);
191 source += count - m_partialSequenceSize;
192 m_partialSequenceSize = count;
193 }
194 int character = decodeNonASCIISequence(m_partialSequence, count);
195 if (character & ~0xff)
196 return true;
197
198 m_partialSequenceSize -= count;
199 *destination++ = static_cast<LChar>(character);
200 } while (m_partialSequenceSize);
201
202 return false;
203 }
204
205 template <>
206 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint 8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError) {
207 ASSERT(m_partialSequenceSize);
208 do {
209 if (isASCII(m_partialSequence[0])) {
210 *destination++ = m_partialSequence[0];
211 consumePartialSequenceByte();
212 continue;
213 }
214 int count = nonASCIISequenceLength(m_partialSequence[0]);
215 if (!count) {
216 handleError(destination, stopOnError, sawError);
217 if (stopOnError)
218 return false;
219 continue;
220 }
221 if (count > m_partialSequenceSize) {
222 if (count - m_partialSequenceSize > end - source) {
223 if (!flush) {
224 // The new data is not enough to complete the sequence, so
225 // add it to the existing partial sequence.
226 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source );
227 m_partialSequenceSize += end - source;
228 return false;
229 }
230 // An incomplete partial sequence at the end is an error.
231 handleError(destination, stopOnError, sawError);
232 if (stopOnError)
233 return false;
234 continue;
235 }
236 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partia lSequenceSize);
237 source += count - m_partialSequenceSize;
238 m_partialSequenceSize = count;
239 }
240 int character = decodeNonASCIISequence(m_partialSequence, count);
241 if (character == nonCharacter) {
242 handleError(destination, stopOnError, sawError);
243 if (stopOnError)
244 return false;
245 continue;
246 }
247
248 m_partialSequenceSize -= count;
249 destination = appendCharacter(destination, character);
250 } while (m_partialSequenceSize);
251
252 return false;
253 }
254
255 String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flu sh, bool stopOnError, bool& sawError) {
256 // Each input byte might turn into a character.
257 // That includes all bytes in the partial-sequence buffer because
258 // each byte in an invalid sequence will turn into a replacement character.
259 StringBuffer<LChar> buffer(m_partialSequenceSize + length);
260
261 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
262 const uint8_t* end = source + length;
263 const uint8_t* alignedEnd = alignToMachineWord(end);
264 LChar* destination = buffer.characters();
265
266 do {
267 if (m_partialSequenceSize) {
268 // Explicitly copy destination and source pointers to avoid taking pointers to the
269 // local variables, which may harm code generation by disabling some optimizations
270 // in some compilers.
271 LChar* destinationForHandlePartialSequence = destination;
272 const uint8_t* sourceForHandlePartialSequence = source;
273 if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHa ndlePartialSequence, end, flush, stopOnError, sawError)) {
274 source = sourceForHandlePartialSequence;
275 goto upConvertTo16Bit;
276 }
277 destination = destinationForHandlePartialSequence;
278 source = sourceForHandlePartialSequence;
279 if (m_partialSequenceSize)
280 break;
281 }
282
283 while (source < end) {
284 if (isASCII(*source)) {
285 // Fast path for ASCII. Most UTF-8 text will be ASCII.
286 if (isAlignedToMachineWord(source)) {
287 while (source < alignedEnd) {
288 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source );
289 if (!isAllASCII<LChar>(chunk))
290 break;
291 copyASCIIMachineWord(destination, source);
292 source += sizeof(MachineWord);
293 destination += sizeof(MachineWord);
294 }
295 if (source == end)
108 break; 296 break;
109 case 0xED: 297 if (!isASCII(*source))
110 if (sequence[1] < 0x80 || sequence[1] > 0x9F) 298 continue;
111 return nonCharacter; 299 }
300 *destination++ = *source++;
301 continue;
302 }
303 int count = nonASCIISequenceLength(*source);
304 int character;
305 if (count == 0) {
306 character = nonCharacter;
307 } else {
308 if (count > end - source) {
309 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t> (sizeof(m_partialSequence)));
310 ASSERT(!m_partialSequenceSize);
311 m_partialSequenceSize = end - source;
312 memcpy(m_partialSequence, source, m_partialSequenceSize);
313 source = end;
314 break;
315 }
316 character = decodeNonASCIISequence(source, count);
317 }
318 if (character == nonCharacter) {
319 sawError = true;
320 if (stopOnError)
321 break;
322
323 goto upConvertTo16Bit;
324 }
325 if (character > 0xff)
326 goto upConvertTo16Bit;
327
328 source += count;
329 *destination++ = static_cast<LChar>(character);
330 }
331 } while (flush && m_partialSequenceSize);
332
333 buffer.shrink(destination - buffer.characters());
334
335 return String::adopt(buffer);
336
337 upConvertTo16Bit:
338 StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
339
340 UChar* destination16 = buffer16.characters();
341
342 // Copy the already converted characters
343 for (LChar* converted8 = buffer.characters(); converted8 < destination;)
344 *destination16++ = *converted8++;
345
346 do {
347 if (m_partialSequenceSize) {
348 // Explicitly copy destination and source pointers to avoid taking pointers to the
349 // local variables, which may harm code generation by disabling some optimizations
350 // in some compilers.
351 UChar* destinationForHandlePartialSequence = destination16;
352 const uint8_t* sourceForHandlePartialSequence = source;
353 handlePartialSequence(destinationForHandlePartialSequence, sourceForHandle PartialSequence, end, flush, stopOnError, sawError);
354 destination16 = destinationForHandlePartialSequence;
355 source = sourceForHandlePartialSequence;
356 if (m_partialSequenceSize)
357 break;
358 }
359
360 while (source < end) {
361 if (isASCII(*source)) {
362 // Fast path for ASCII. Most UTF-8 text will be ASCII.
363 if (isAlignedToMachineWord(source)) {
364 while (source < alignedEnd) {
365 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source );
366 if (!isAllASCII<LChar>(chunk))
367 break;
368 copyASCIIMachineWord(destination16, source);
369 source += sizeof(MachineWord);
370 destination16 += sizeof(MachineWord);
371 }
372 if (source == end)
112 break; 373 break;
113 default: 374 if (!isASCII(*source))
114 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
115 return nonCharacter;
116 }
117 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
118 return nonCharacter;
119 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E 2080;
120 }
121 ASSERT(length == 4);
122 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
123 switch (sequence[0]) {
124 case 0xF0:
125 if (sequence[1] < 0x90 || sequence[1] > 0xBF)
126 return nonCharacter;
127 break;
128 case 0xF4:
129 if (sequence[1] < 0x80 || sequence[1] > 0x8F)
130 return nonCharacter;
131 break;
132 default:
133 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
134 return nonCharacter;
135 }
136 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
137 return nonCharacter;
138 if (sequence[3] < 0x80 || sequence[3] > 0xBF)
139 return nonCharacter;
140 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + seq uence[3]) - 0x03C82080;
141 }
142
143 static inline UChar* appendCharacter(UChar* destination, int character)
144 {
145 ASSERT(character != nonCharacter);
146 ASSERT(!U_IS_SURROGATE(character));
147 if (U_IS_BMP(character)) {
148 *destination++ = static_cast<UChar>(character);
149 } else {
150 *destination++ = U16_LEAD(character);
151 *destination++ = U16_TRAIL(character);
152 }
153 return destination;
154 }
155
156 void TextCodecUTF8::consumePartialSequenceByte()
157 {
158 --m_partialSequenceSize;
159 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
160 }
161
162 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& saw Error)
163 {
164 sawError = true;
165 if (stopOnError)
166 return;
167 // Each error generates a replacement character and consumes one byte.
168 *destination++ = replacementCharacter;
169 consumePartialSequenceByte();
170 }
171
172 template <>
173 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint 8_t*& source, const uint8_t* end, bool flush, bool, bool&)
174 {
175 ASSERT(m_partialSequenceSize);
176 do {
177 if (isASCII(m_partialSequence[0])) {
178 *destination++ = m_partialSequence[0];
179 consumePartialSequenceByte();
180 continue; 375 continue;
181 } 376 }
182 int count = nonASCIISequenceLength(m_partialSequence[0]); 377 *destination16++ = *source++;
183 if (!count) 378 continue;
184 return true; 379 }
185 380 int count = nonASCIISequenceLength(*source);
186 if (count > m_partialSequenceSize) { 381 int character;
187 if (count - m_partialSequenceSize > end - source) { 382 if (count == 0) {
188 if (!flush) { 383 character = nonCharacter;
189 // The new data is not enough to complete the sequence, so 384 } else {
190 // add it to the existing partial sequence. 385 if (count > end - source) {
191 memcpy(m_partialSequence + m_partialSequenceSize, source, en d - source); 386 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t> (sizeof(m_partialSequence)));
192 m_partialSequenceSize += end - source; 387 ASSERT(!m_partialSequenceSize);
193 return false; 388 m_partialSequenceSize = end - source;
194 } 389 memcpy(m_partialSequence, source, m_partialSequenceSize);
195 // An incomplete partial sequence at the end is an error, but it will create 390 source = end;
196 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle 391 break;
197 // the error. 392 }
198 return true; 393 character = decodeNonASCIISequence(source, count);
199 } 394 }
200 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_ partialSequenceSize); 395 if (character == nonCharacter) {
201 source += count - m_partialSequenceSize; 396 sawError = true;
202 m_partialSequenceSize = count; 397 if (stopOnError)
203 } 398 break;
204 int character = decodeNonASCIISequence(m_partialSequence, count); 399 // Each error generates a replacement character and consumes one byte.
205 if (character & ~0xff) 400 *destination16++ = replacementCharacter;
206 return true; 401 ++source;
207 402 continue;
208 m_partialSequenceSize -= count; 403 }
209 *destination++ = static_cast<LChar>(character); 404 source += count;
210 } while (m_partialSequenceSize); 405 destination16 = appendCharacter(destination16, character);
211 406 }
212 return false; 407 } while (flush && m_partialSequenceSize);
213 } 408
214 409 buffer16.shrink(destination16 - buffer16.characters());
215 template <> 410
216 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint 8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError) 411 return String::adopt(buffer16);
217 { 412 }
218 ASSERT(m_partialSequenceSize); 413
219 do { 414 template <typename CharType>
220 if (isASCII(m_partialSequence[0])) { 415 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) {
221 *destination++ = m_partialSequence[0]; 416 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
222 consumePartialSequenceByte(); 417 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
223 continue; 418 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
224 } 419 if (length > std::numeric_limits<size_t>::max() / 3)
225 int count = nonASCIISequenceLength(m_partialSequence[0]); 420 CRASH();
226 if (!count) { 421 Vector<uint8_t> bytes(length * 3);
227 handleError(destination, stopOnError, sawError); 422
228 if (stopOnError) 423 size_t i = 0;
229 return false; 424 size_t bytesWritten = 0;
230 continue; 425 while (i < length) {
231 } 426 UChar32 character;
232 if (count > m_partialSequenceSize) { 427 U16_NEXT(characters, i, length, character);
233 if (count - m_partialSequenceSize > end - source) { 428 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
234 if (!flush) { 429 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
235 // The new data is not enough to complete the sequence, so 430 if (0xD800 <= character && character <= 0xDFFF)
236 // add it to the existing partial sequence. 431 character = replacementCharacter;
237 memcpy(m_partialSequence + m_partialSequenceSize, source, en d - source); 432 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
238 m_partialSequenceSize += end - source; 433 }
239 return false; 434
240 } 435 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
241 // An incomplete partial sequence at the end is an error. 436 }
242 handleError(destination, stopOnError, sawError); 437
243 if (stopOnError) 438 CString TextCodecUTF8::encode(const UChar* characters, size_t length, Unencodabl eHandling) {
244 return false; 439 return encodeCommon(characters, length);
245 continue; 440 }
246 } 441
247 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_ partialSequenceSize); 442 CString TextCodecUTF8::encode(const LChar* characters, size_t length, Unencodabl eHandling) {
248 source += count - m_partialSequenceSize; 443 return encodeCommon(characters, length);
249 m_partialSequenceSize = count; 444 }
250 } 445
251 int character = decodeNonASCIISequence(m_partialSequence, count); 446 } // namespace WTF
252 if (character == nonCharacter) {
253 handleError(destination, stopOnError, sawError);
254 if (stopOnError)
255 return false;
256 continue;
257 }
258
259 m_partialSequenceSize -= count;
260 destination = appendCharacter(destination, character);
261 } while (m_partialSequenceSize);
262
263 return false;
264 }
265
266 String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flu sh, bool stopOnError, bool& sawError)
267 {
268 // Each input byte might turn into a character.
269 // That includes all bytes in the partial-sequence buffer because
270 // each byte in an invalid sequence will turn into a replacement character.
271 StringBuffer<LChar> buffer(m_partialSequenceSize + length);
272
273 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
274 const uint8_t* end = source + length;
275 const uint8_t* alignedEnd = alignToMachineWord(end);
276 LChar* destination = buffer.characters();
277
278 do {
279 if (m_partialSequenceSize) {
280 // Explicitly copy destination and source pointers to avoid taking pointers to the
281 // local variables, which may harm code generation by disabling some optimizations
282 // in some compilers.
283 LChar* destinationForHandlePartialSequence = destination;
284 const uint8_t* sourceForHandlePartialSequence = source;
285 if (handlePartialSequence(destinationForHandlePartialSequence, sourc eForHandlePartialSequence, end, flush, stopOnError, sawError)) {
286 source = sourceForHandlePartialSequence;
287 goto upConvertTo16Bit;
288 }
289 destination = destinationForHandlePartialSequence;
290 source = sourceForHandlePartialSequence;
291 if (m_partialSequenceSize)
292 break;
293 }
294
295 while (source < end) {
296 if (isASCII(*source)) {
297 // Fast path for ASCII. Most UTF-8 text will be ASCII.
298 if (isAlignedToMachineWord(source)) {
299 while (source < alignedEnd) {
300 MachineWord chunk = *reinterpret_cast_ptr<const MachineW ord*>(source);
301 if (!isAllASCII<LChar>(chunk))
302 break;
303 copyASCIIMachineWord(destination, source);
304 source += sizeof(MachineWord);
305 destination += sizeof(MachineWord);
306 }
307 if (source == end)
308 break;
309 if (!isASCII(*source))
310 continue;
311 }
312 *destination++ = *source++;
313 continue;
314 }
315 int count = nonASCIISequenceLength(*source);
316 int character;
317 if (count == 0) {
318 character = nonCharacter;
319 } else {
320 if (count > end - source) {
321 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast< ptrdiff_t>(sizeof(m_partialSequence)));
322 ASSERT(!m_partialSequenceSize);
323 m_partialSequenceSize = end - source;
324 memcpy(m_partialSequence, source, m_partialSequenceSize);
325 source = end;
326 break;
327 }
328 character = decodeNonASCIISequence(source, count);
329 }
330 if (character == nonCharacter) {
331 sawError = true;
332 if (stopOnError)
333 break;
334
335 goto upConvertTo16Bit;
336 }
337 if (character > 0xff)
338 goto upConvertTo16Bit;
339
340 source += count;
341 *destination++ = static_cast<LChar>(character);
342 }
343 } while (flush && m_partialSequenceSize);
344
345 buffer.shrink(destination - buffer.characters());
346
347 return String::adopt(buffer);
348
349 upConvertTo16Bit:
350 StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
351
352 UChar* destination16 = buffer16.characters();
353
354 // Copy the already converted characters
355 for (LChar* converted8 = buffer.characters(); converted8 < destination;)
356 *destination16++ = *converted8++;
357
358 do {
359 if (m_partialSequenceSize) {
360 // Explicitly copy destination and source pointers to avoid taking pointers to the
361 // local variables, which may harm code generation by disabling some optimizations
362 // in some compilers.
363 UChar* destinationForHandlePartialSequence = destination16;
364 const uint8_t* sourceForHandlePartialSequence = source;
365 handlePartialSequence(destinationForHandlePartialSequence, sourceFor HandlePartialSequence, end, flush, stopOnError, sawError);
366 destination16 = destinationForHandlePartialSequence;
367 source = sourceForHandlePartialSequence;
368 if (m_partialSequenceSize)
369 break;
370 }
371
372 while (source < end) {
373 if (isASCII(*source)) {
374 // Fast path for ASCII. Most UTF-8 text will be ASCII.
375 if (isAlignedToMachineWord(source)) {
376 while (source < alignedEnd) {
377 MachineWord chunk = *reinterpret_cast_ptr<const MachineW ord*>(source);
378 if (!isAllASCII<LChar>(chunk))
379 break;
380 copyASCIIMachineWord(destination16, source);
381 source += sizeof(MachineWord);
382 destination16 += sizeof(MachineWord);
383 }
384 if (source == end)
385 break;
386 if (!isASCII(*source))
387 continue;
388 }
389 *destination16++ = *source++;
390 continue;
391 }
392 int count = nonASCIISequenceLength(*source);
393 int character;
394 if (count == 0) {
395 character = nonCharacter;
396 } else {
397 if (count > end - source) {
398 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast< ptrdiff_t>(sizeof(m_partialSequence)));
399 ASSERT(!m_partialSequenceSize);
400 m_partialSequenceSize = end - source;
401 memcpy(m_partialSequence, source, m_partialSequenceSize);
402 source = end;
403 break;
404 }
405 character = decodeNonASCIISequence(source, count);
406 }
407 if (character == nonCharacter) {
408 sawError = true;
409 if (stopOnError)
410 break;
411 // Each error generates a replacement character and consumes one byte.
412 *destination16++ = replacementCharacter;
413 ++source;
414 continue;
415 }
416 source += count;
417 destination16 = appendCharacter(destination16, character);
418 }
419 } while (flush && m_partialSequenceSize);
420
421 buffer16.shrink(destination16 - buffer16.characters());
422
423 return String::adopt(buffer16);
424 }
425
426 template<typename CharType>
427 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
428 {
429 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
430 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
431 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
432 if (length > std::numeric_limits<size_t>::max() / 3)
433 CRASH();
434 Vector<uint8_t> bytes(length * 3);
435
436 size_t i = 0;
437 size_t bytesWritten = 0;
438 while (i < length) {
439 UChar32 character;
440 U16_NEXT(characters, i, length, character);
441 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
442 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
443 if (0xD800 <= character && character <= 0xDFFF)
444 character = replacementCharacter;
445 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
446 }
447
448 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
449 }
450
451 CString TextCodecUTF8::encode(const UChar* characters, size_t length, Unencodabl eHandling)
452 {
453 return encodeCommon(characters, length);
454 }
455
456 CString TextCodecUTF8::encode(const LChar* characters, size_t length, Unencodabl eHandling)
457 {
458 return encodeCommon(characters, length);
459 }
460
461 } // namespace WTF
OLDNEW
« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecUTF8.h ('k') | third_party/WebKit/Source/wtf/text/TextCodecUTF8Test.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698