Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1099)

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecUTF8.cpp

Issue 1611343002: wtf reformat test Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: pydent Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions 5 * modification, are permitted provided that the following conditions
6 * are met: 6 * are met:
7 * 1. Redistributions of source code must retain the above copyright 7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer. 8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright 9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the 10 * notice, this list of conditions and the following disclaimer in the
(...skipping 18 matching lines...) Expand all
29 #include "wtf/text/CharacterNames.h" 29 #include "wtf/text/CharacterNames.h"
30 #include "wtf/text/StringBuffer.h" 30 #include "wtf/text/StringBuffer.h"
31 #include "wtf/text/TextCodecASCIIFastPath.h" 31 #include "wtf/text/TextCodecASCIIFastPath.h"
32 32
33 namespace WTF { 33 namespace WTF {
34 34
35 using namespace WTF::Unicode; 35 using namespace WTF::Unicode;
36 36
37 const int nonCharacter = -1; 37 const int nonCharacter = -1;
38 38
39 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) 39 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) {
40 { 40 return adoptPtr(new TextCodecUTF8);
41 return adoptPtr(new TextCodecUTF8); 41 }
42 } 42
43 43 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) {
44 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) 44 registrar("UTF-8", "UTF-8");
45 { 45
46 registrar("UTF-8", "UTF-8"); 46 // Additional aliases that originally were present in the encoding
47 47 // table in WebKit on Macintosh, and subsequently added by
48 // Additional aliases that originally were present in the encoding 48 // TextCodecICU. Perhaps we can prove some are not used on the web
49 // table in WebKit on Macintosh, and subsequently added by 49 // and remove them.
50 // TextCodecICU. Perhaps we can prove some are not used on the web 50 registrar("unicode11utf8", "UTF-8");
51 // and remove them. 51 registrar("unicode20utf8", "UTF-8");
52 registrar("unicode11utf8", "UTF-8"); 52 registrar("utf8", "UTF-8");
53 registrar("unicode20utf8", "UTF-8"); 53 registrar("x-unicode20utf8", "UTF-8");
54 registrar("utf8", "UTF-8"); 54
55 registrar("x-unicode20utf8", "UTF-8"); 55 // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
56 56 // and Firefox (24), but not in ICU 4.6.
57 // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/) 57 registrar("unicode-1-1-utf-8", "UTF-8");
58 // and Firefox (24), but not in ICU 4.6. 58 }
59 registrar("unicode-1-1-utf-8", "UTF-8"); 59
60 } 60 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) {
61 61 registrar("UTF-8", create, 0);
62 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) 62 }
63 { 63
64 registrar("UTF-8", create, 0); 64 static inline int nonASCIISequenceLength(uint8_t firstByte) {
65 } 65 static const uint8_t lengths[256] = {
66 66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67 static inline int nonASCIISequenceLength(uint8_t firstByte) 67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68 { 68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69 static const uint8_t lengths[256] = { 69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 71 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 72 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 74 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 75 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
76 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 76 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 77 return lengths[firstByte];
78 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 78 }
79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 79
80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 80 static inline int decodeNonASCIISequence(const uint8_t* sequence,
81 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 81 unsigned length) {
82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 82 ASSERT(!isASCII(sequence[0]));
83 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 83 if (length == 2) {
84 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 84 ASSERT(sequence[0] <= 0xDF);
85 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 85 if (sequence[0] < 0xC2)
86 }; 86 return nonCharacter;
87 return lengths[firstByte]; 87 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
88 } 88 return nonCharacter;
89 89 return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
90 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned lengt h) 90 }
91 { 91 if (length == 3) {
92 ASSERT(!isASCII(sequence[0])); 92 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
93 if (length == 2) { 93 switch (sequence[0]) {
94 ASSERT(sequence[0] <= 0xDF); 94 case 0xE0:
95 if (sequence[0] < 0xC2) 95 if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
96 return nonCharacter; 96 return nonCharacter;
97 break;
98 case 0xED:
99 if (sequence[1] < 0x80 || sequence[1] > 0x9F)
100 return nonCharacter;
101 break;
102 default:
97 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 103 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
98 return nonCharacter; 104 return nonCharacter;
99 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; 105 }
100 } 106 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
101 if (length == 3) { 107 return nonCharacter;
102 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); 108 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) -
103 switch (sequence[0]) { 109 0x000E2080;
104 case 0xE0: 110 }
105 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) 111 ASSERT(length == 4);
106 return nonCharacter; 112 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
113 switch (sequence[0]) {
114 case 0xF0:
115 if (sequence[1] < 0x90 || sequence[1] > 0xBF)
116 return nonCharacter;
117 break;
118 case 0xF4:
119 if (sequence[1] < 0x80 || sequence[1] > 0x8F)
120 return nonCharacter;
121 break;
122 default:
123 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
124 return nonCharacter;
125 }
126 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
127 return nonCharacter;
128 if (sequence[3] < 0x80 || sequence[3] > 0xBF)
129 return nonCharacter;
130 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) +
131 sequence[3]) -
132 0x03C82080;
133 }
134
135 static inline UChar* appendCharacter(UChar* destination, int character) {
136 ASSERT(character != nonCharacter);
137 ASSERT(!U_IS_SURROGATE(character));
138 if (U_IS_BMP(character)) {
139 *destination++ = static_cast<UChar>(character);
140 } else {
141 *destination++ = U16_LEAD(character);
142 *destination++ = U16_TRAIL(character);
143 }
144 return destination;
145 }
146
147 void TextCodecUTF8::consumePartialSequenceByte() {
148 --m_partialSequenceSize;
149 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
150 }
151
152 void TextCodecUTF8::handleError(UChar*& destination,
153 bool stopOnError,
154 bool& sawError) {
155 sawError = true;
156 if (stopOnError)
157 return;
158 // Each error generates a replacement character and consumes one byte.
159 *destination++ = replacementCharacter;
160 consumePartialSequenceByte();
161 }
162
163 template <>
164 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination,
165 const uint8_t*& source,
166 const uint8_t* end,
167 bool flush,
168 bool,
169 bool&) {
170 ASSERT(m_partialSequenceSize);
171 do {
172 if (isASCII(m_partialSequence[0])) {
173 *destination++ = m_partialSequence[0];
174 consumePartialSequenceByte();
175 continue;
176 }
177 int count = nonASCIISequenceLength(m_partialSequence[0]);
178 if (!count)
179 return true;
180
181 if (count > m_partialSequenceSize) {
182 if (count - m_partialSequenceSize > end - source) {
183 if (!flush) {
184 // The new data is not enough to complete the sequence, so
185 // add it to the existing partial sequence.
186 memcpy(m_partialSequence + m_partialSequenceSize, source,
187 end - source);
188 m_partialSequenceSize += end - source;
189 return false;
190 }
191 // An incomplete partial sequence at the end is an error, but it will create
192 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
193 // the error.
194 return true;
195 }
196 memcpy(m_partialSequence + m_partialSequenceSize, source,
197 count - m_partialSequenceSize);
198 source += count - m_partialSequenceSize;
199 m_partialSequenceSize = count;
200 }
201 int character = decodeNonASCIISequence(m_partialSequence, count);
202 if (character & ~0xff)
203 return true;
204
205 m_partialSequenceSize -= count;
206 *destination++ = static_cast<LChar>(character);
207 } while (m_partialSequenceSize);
208
209 return false;
210 }
211
212 template <>
213 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination,
214 const uint8_t*& source,
215 const uint8_t* end,
216 bool flush,
217 bool stopOnError,
218 bool& sawError) {
219 ASSERT(m_partialSequenceSize);
220 do {
221 if (isASCII(m_partialSequence[0])) {
222 *destination++ = m_partialSequence[0];
223 consumePartialSequenceByte();
224 continue;
225 }
226 int count = nonASCIISequenceLength(m_partialSequence[0]);
227 if (!count) {
228 handleError(destination, stopOnError, sawError);
229 if (stopOnError)
230 return false;
231 continue;
232 }
233 if (count > m_partialSequenceSize) {
234 if (count - m_partialSequenceSize > end - source) {
235 if (!flush) {
236 // The new data is not enough to complete the sequence, so
237 // add it to the existing partial sequence.
238 memcpy(m_partialSequence + m_partialSequenceSize, source,
239 end - source);
240 m_partialSequenceSize += end - source;
241 return false;
242 }
243 // An incomplete partial sequence at the end is an error.
244 handleError(destination, stopOnError, sawError);
245 if (stopOnError)
246 return false;
247 continue;
248 }
249 memcpy(m_partialSequence + m_partialSequenceSize, source,
250 count - m_partialSequenceSize);
251 source += count - m_partialSequenceSize;
252 m_partialSequenceSize = count;
253 }
254 int character = decodeNonASCIISequence(m_partialSequence, count);
255 if (character == nonCharacter) {
256 handleError(destination, stopOnError, sawError);
257 if (stopOnError)
258 return false;
259 continue;
260 }
261
262 m_partialSequenceSize -= count;
263 destination = appendCharacter(destination, character);
264 } while (m_partialSequenceSize);
265
266 return false;
267 }
268
269 String TextCodecUTF8::decode(const char* bytes,
270 size_t length,
271 FlushBehavior flush,
272 bool stopOnError,
273 bool& sawError) {
274 // Each input byte might turn into a character.
275 // That includes all bytes in the partial-sequence buffer because
276 // each byte in an invalid sequence will turn into a replacement character.
277 StringBuffer<LChar> buffer(m_partialSequenceSize + length);
278
279 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
280 const uint8_t* end = source + length;
281 const uint8_t* alignedEnd = alignToMachineWord(end);
282 LChar* destination = buffer.characters();
283
284 do {
285 if (m_partialSequenceSize) {
286 // Explicitly copy destination and source pointers to avoid taking pointers to the
287 // local variables, which may harm code generation by disabling some optimizations
288 // in some compilers.
289 LChar* destinationForHandlePartialSequence = destination;
290 const uint8_t* sourceForHandlePartialSequence = source;
291 if (handlePartialSequence(destinationForHandlePartialSequence,
292 sourceForHandlePartialSequence, end, flush,
293 stopOnError, sawError)) {
294 source = sourceForHandlePartialSequence;
295 goto upConvertTo16Bit;
296 }
297 destination = destinationForHandlePartialSequence;
298 source = sourceForHandlePartialSequence;
299 if (m_partialSequenceSize)
300 break;
301 }
302
303 while (source < end) {
304 if (isASCII(*source)) {
305 // Fast path for ASCII. Most UTF-8 text will be ASCII.
306 if (isAlignedToMachineWord(source)) {
307 while (source < alignedEnd) {
308 MachineWord chunk =
309 *reinterpret_cast_ptr<const MachineWord*>(source);
310 if (!isAllASCII<LChar>(chunk))
311 break;
312 copyASCIIMachineWord(destination, source);
313 source += sizeof(MachineWord);
314 destination += sizeof(MachineWord);
315 }
316 if (source == end)
107 break; 317 break;
108 case 0xED: 318 if (!isASCII(*source))
109 if (sequence[1] < 0x80 || sequence[1] > 0x9F) 319 continue;
110 return nonCharacter; 320 }
321 *destination++ = *source++;
322 continue;
323 }
324 int count = nonASCIISequenceLength(*source);
325 int character;
326 if (count == 0) {
327 character = nonCharacter;
328 } else {
329 if (count > end - source) {
330 ASSERT_WITH_SECURITY_IMPLICATION(
331 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
332 ASSERT(!m_partialSequenceSize);
333 m_partialSequenceSize = end - source;
334 memcpy(m_partialSequence, source, m_partialSequenceSize);
335 source = end;
336 break;
337 }
338 character = decodeNonASCIISequence(source, count);
339 }
340 if (character == nonCharacter) {
341 sawError = true;
342 if (stopOnError)
343 break;
344
345 goto upConvertTo16Bit;
346 }
347 if (character > 0xff)
348 goto upConvertTo16Bit;
349
350 source += count;
351 *destination++ = static_cast<LChar>(character);
352 }
353 } while (flush && m_partialSequenceSize);
354
355 buffer.shrink(destination - buffer.characters());
356
357 return String::adopt(buffer);
358
359 upConvertTo16Bit:
360 StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
361
362 UChar* destination16 = buffer16.characters();
363
364 // Copy the already converted characters
365 for (LChar* converted8 = buffer.characters(); converted8 < destination;)
366 *destination16++ = *converted8++;
367
368 do {
369 if (m_partialSequenceSize) {
370 // Explicitly copy destination and source pointers to avoid taking pointers to the
371 // local variables, which may harm code generation by disabling some optimizations
372 // in some compilers.
373 UChar* destinationForHandlePartialSequence = destination16;
374 const uint8_t* sourceForHandlePartialSequence = source;
375 handlePartialSequence(destinationForHandlePartialSequence,
376 sourceForHandlePartialSequence, end, flush,
377 stopOnError, sawError);
378 destination16 = destinationForHandlePartialSequence;
379 source = sourceForHandlePartialSequence;
380 if (m_partialSequenceSize)
381 break;
382 }
383
384 while (source < end) {
385 if (isASCII(*source)) {
386 // Fast path for ASCII. Most UTF-8 text will be ASCII.
387 if (isAlignedToMachineWord(source)) {
388 while (source < alignedEnd) {
389 MachineWord chunk =
390 *reinterpret_cast_ptr<const MachineWord*>(source);
391 if (!isAllASCII<LChar>(chunk))
392 break;
393 copyASCIIMachineWord(destination16, source);
394 source += sizeof(MachineWord);
395 destination16 += sizeof(MachineWord);
396 }
397 if (source == end)
111 break; 398 break;
112 default: 399 if (!isASCII(*source))
113 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
114 return nonCharacter;
115 }
116 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
117 return nonCharacter;
118 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E 2080;
119 }
120 ASSERT(length == 4);
121 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
122 switch (sequence[0]) {
123 case 0xF0:
124 if (sequence[1] < 0x90 || sequence[1] > 0xBF)
125 return nonCharacter;
126 break;
127 case 0xF4:
128 if (sequence[1] < 0x80 || sequence[1] > 0x8F)
129 return nonCharacter;
130 break;
131 default:
132 if (sequence[1] < 0x80 || sequence[1] > 0xBF)
133 return nonCharacter;
134 }
135 if (sequence[2] < 0x80 || sequence[2] > 0xBF)
136 return nonCharacter;
137 if (sequence[3] < 0x80 || sequence[3] > 0xBF)
138 return nonCharacter;
139 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + seq uence[3]) - 0x03C82080;
140 }
141
142 static inline UChar* appendCharacter(UChar* destination, int character)
143 {
144 ASSERT(character != nonCharacter);
145 ASSERT(!U_IS_SURROGATE(character));
146 if (U_IS_BMP(character)) {
147 *destination++ = static_cast<UChar>(character);
148 } else {
149 *destination++ = U16_LEAD(character);
150 *destination++ = U16_TRAIL(character);
151 }
152 return destination;
153 }
154
155 void TextCodecUTF8::consumePartialSequenceByte()
156 {
157 --m_partialSequenceSize;
158 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
159 }
160
161 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& saw Error)
162 {
163 sawError = true;
164 if (stopOnError)
165 return;
166 // Each error generates a replacement character and consumes one byte.
167 *destination++ = replacementCharacter;
168 consumePartialSequenceByte();
169 }
170
171 template <>
172 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint 8_t*& source, const uint8_t* end, bool flush, bool, bool&)
173 {
174 ASSERT(m_partialSequenceSize);
175 do {
176 if (isASCII(m_partialSequence[0])) {
177 *destination++ = m_partialSequence[0];
178 consumePartialSequenceByte();
179 continue; 400 continue;
180 } 401 }
181 int count = nonASCIISequenceLength(m_partialSequence[0]); 402 *destination16++ = *source++;
182 if (!count) 403 continue;
183 return true; 404 }
184 405 int count = nonASCIISequenceLength(*source);
185 if (count > m_partialSequenceSize) { 406 int character;
186 if (count - m_partialSequenceSize > end - source) { 407 if (count == 0) {
187 if (!flush) { 408 character = nonCharacter;
188 // The new data is not enough to complete the sequence, so 409 } else {
189 // add it to the existing partial sequence. 410 if (count > end - source) {
190 memcpy(m_partialSequence + m_partialSequenceSize, source, en d - source); 411 ASSERT_WITH_SECURITY_IMPLICATION(
191 m_partialSequenceSize += end - source; 412 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
192 return false; 413 ASSERT(!m_partialSequenceSize);
193 } 414 m_partialSequenceSize = end - source;
194 // An incomplete partial sequence at the end is an error, but it will create 415 memcpy(m_partialSequence, source, m_partialSequenceSize);
195 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle 416 source = end;
196 // the error. 417 break;
197 return true; 418 }
198 } 419 character = decodeNonASCIISequence(source, count);
199 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_ partialSequenceSize); 420 }
200 source += count - m_partialSequenceSize; 421 if (character == nonCharacter) {
201 m_partialSequenceSize = count; 422 sawError = true;
202 } 423 if (stopOnError)
203 int character = decodeNonASCIISequence(m_partialSequence, count); 424 break;
204 if (character & ~0xff) 425 // Each error generates a replacement character and consumes one byte.
205 return true; 426 *destination16++ = replacementCharacter;
206 427 ++source;
207 m_partialSequenceSize -= count; 428 continue;
208 *destination++ = static_cast<LChar>(character); 429 }
209 } while (m_partialSequenceSize); 430 source += count;
210 431 destination16 = appendCharacter(destination16, character);
211 return false; 432 }
212 } 433 } while (flush && m_partialSequenceSize);
213 434
214 template <> 435 buffer16.shrink(destination16 - buffer16.characters());
215 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint 8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError) 436
216 { 437 return String::adopt(buffer16);
217 ASSERT(m_partialSequenceSize); 438 }
218 do { 439
219 if (isASCII(m_partialSequence[0])) { 440 template <typename CharType>
220 *destination++ = m_partialSequence[0]; 441 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) {
221 consumePartialSequenceByte(); 442 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
222 continue; 443 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
223 } 444 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
224 int count = nonASCIISequenceLength(m_partialSequence[0]); 445 if (length > std::numeric_limits<size_t>::max() / 3)
225 if (!count) { 446 CRASH();
226 handleError(destination, stopOnError, sawError); 447 Vector<uint8_t> bytes(length * 3);
227 if (stopOnError) 448
228 return false; 449 size_t i = 0;
229 continue; 450 size_t bytesWritten = 0;
230 } 451 while (i < length) {
231 if (count > m_partialSequenceSize) { 452 UChar32 character;
232 if (count - m_partialSequenceSize > end - source) { 453 U16_NEXT(characters, i, length, character);
233 if (!flush) { 454 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
234 // The new data is not enough to complete the sequence, so 455 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
235 // add it to the existing partial sequence. 456 if (0xD800 <= character && character <= 0xDFFF)
236 memcpy(m_partialSequence + m_partialSequenceSize, source, en d - source); 457 character = replacementCharacter;
237 m_partialSequenceSize += end - source; 458 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
238 return false; 459 }
239 } 460
240 // An incomplete partial sequence at the end is an error. 461 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
241 handleError(destination, stopOnError, sawError); 462 }
242 if (stopOnError) 463
243 return false; 464 CString TextCodecUTF8::encode(const UChar* characters,
244 continue; 465 size_t length,
245 } 466 UnencodableHandling) {
246 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_ partialSequenceSize); 467 return encodeCommon(characters, length);
247 source += count - m_partialSequenceSize; 468 }
248 m_partialSequenceSize = count; 469
249 } 470 CString TextCodecUTF8::encode(const LChar* characters,
250 int character = decodeNonASCIISequence(m_partialSequence, count); 471 size_t length,
251 if (character == nonCharacter) { 472 UnencodableHandling) {
252 handleError(destination, stopOnError, sawError); 473 return encodeCommon(characters, length);
253 if (stopOnError) 474 }
254 return false; 475
255 continue; 476 } // namespace WTF
256 }
257
258 m_partialSequenceSize -= count;
259 destination = appendCharacter(destination, character);
260 } while (m_partialSequenceSize);
261
262 return false;
263 }
264
265 String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flu sh, bool stopOnError, bool& sawError)
266 {
267 // Each input byte might turn into a character.
268 // That includes all bytes in the partial-sequence buffer because
269 // each byte in an invalid sequence will turn into a replacement character.
270 StringBuffer<LChar> buffer(m_partialSequenceSize + length);
271
272 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
273 const uint8_t* end = source + length;
274 const uint8_t* alignedEnd = alignToMachineWord(end);
275 LChar* destination = buffer.characters();
276
277 do {
278 if (m_partialSequenceSize) {
279 // Explicitly copy destination and source pointers to avoid taking pointers to the
280 // local variables, which may harm code generation by disabling some optimizations
281 // in some compilers.
282 LChar* destinationForHandlePartialSequence = destination;
283 const uint8_t* sourceForHandlePartialSequence = source;
284 if (handlePartialSequence(destinationForHandlePartialSequence, sourc eForHandlePartialSequence, end, flush, stopOnError, sawError)) {
285 source = sourceForHandlePartialSequence;
286 goto upConvertTo16Bit;
287 }
288 destination = destinationForHandlePartialSequence;
289 source = sourceForHandlePartialSequence;
290 if (m_partialSequenceSize)
291 break;
292 }
293
294 while (source < end) {
295 if (isASCII(*source)) {
296 // Fast path for ASCII. Most UTF-8 text will be ASCII.
297 if (isAlignedToMachineWord(source)) {
298 while (source < alignedEnd) {
299 MachineWord chunk = *reinterpret_cast_ptr<const MachineW ord*>(source);
300 if (!isAllASCII<LChar>(chunk))
301 break;
302 copyASCIIMachineWord(destination, source);
303 source += sizeof(MachineWord);
304 destination += sizeof(MachineWord);
305 }
306 if (source == end)
307 break;
308 if (!isASCII(*source))
309 continue;
310 }
311 *destination++ = *source++;
312 continue;
313 }
314 int count = nonASCIISequenceLength(*source);
315 int character;
316 if (count == 0) {
317 character = nonCharacter;
318 } else {
319 if (count > end - source) {
320 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast< ptrdiff_t>(sizeof(m_partialSequence)));
321 ASSERT(!m_partialSequenceSize);
322 m_partialSequenceSize = end - source;
323 memcpy(m_partialSequence, source, m_partialSequenceSize);
324 source = end;
325 break;
326 }
327 character = decodeNonASCIISequence(source, count);
328 }
329 if (character == nonCharacter) {
330 sawError = true;
331 if (stopOnError)
332 break;
333
334 goto upConvertTo16Bit;
335 }
336 if (character > 0xff)
337 goto upConvertTo16Bit;
338
339 source += count;
340 *destination++ = static_cast<LChar>(character);
341 }
342 } while (flush && m_partialSequenceSize);
343
344 buffer.shrink(destination - buffer.characters());
345
346 return String::adopt(buffer);
347
348 upConvertTo16Bit:
349 StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
350
351 UChar* destination16 = buffer16.characters();
352
353 // Copy the already converted characters
354 for (LChar* converted8 = buffer.characters(); converted8 < destination;)
355 *destination16++ = *converted8++;
356
357 do {
358 if (m_partialSequenceSize) {
359 // Explicitly copy destination and source pointers to avoid taking pointers to the
360 // local variables, which may harm code generation by disabling some optimizations
361 // in some compilers.
362 UChar* destinationForHandlePartialSequence = destination16;
363 const uint8_t* sourceForHandlePartialSequence = source;
364 handlePartialSequence(destinationForHandlePartialSequence, sourceFor HandlePartialSequence, end, flush, stopOnError, sawError);
365 destination16 = destinationForHandlePartialSequence;
366 source = sourceForHandlePartialSequence;
367 if (m_partialSequenceSize)
368 break;
369 }
370
371 while (source < end) {
372 if (isASCII(*source)) {
373 // Fast path for ASCII. Most UTF-8 text will be ASCII.
374 if (isAlignedToMachineWord(source)) {
375 while (source < alignedEnd) {
376 MachineWord chunk = *reinterpret_cast_ptr<const MachineW ord*>(source);
377 if (!isAllASCII<LChar>(chunk))
378 break;
379 copyASCIIMachineWord(destination16, source);
380 source += sizeof(MachineWord);
381 destination16 += sizeof(MachineWord);
382 }
383 if (source == end)
384 break;
385 if (!isASCII(*source))
386 continue;
387 }
388 *destination16++ = *source++;
389 continue;
390 }
391 int count = nonASCIISequenceLength(*source);
392 int character;
393 if (count == 0) {
394 character = nonCharacter;
395 } else {
396 if (count > end - source) {
397 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast< ptrdiff_t>(sizeof(m_partialSequence)));
398 ASSERT(!m_partialSequenceSize);
399 m_partialSequenceSize = end - source;
400 memcpy(m_partialSequence, source, m_partialSequenceSize);
401 source = end;
402 break;
403 }
404 character = decodeNonASCIISequence(source, count);
405 }
406 if (character == nonCharacter) {
407 sawError = true;
408 if (stopOnError)
409 break;
410 // Each error generates a replacement character and consumes one byte.
411 *destination16++ = replacementCharacter;
412 ++source;
413 continue;
414 }
415 source += count;
416 destination16 = appendCharacter(destination16, character);
417 }
418 } while (flush && m_partialSequenceSize);
419
420 buffer16.shrink(destination16 - buffer16.characters());
421
422 return String::adopt(buffer16);
423 }
424
425 template<typename CharType>
426 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
427 {
428 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
429 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
430 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
431 if (length > std::numeric_limits<size_t>::max() / 3)
432 CRASH();
433 Vector<uint8_t> bytes(length * 3);
434
435 size_t i = 0;
436 size_t bytesWritten = 0;
437 while (i < length) {
438 UChar32 character;
439 U16_NEXT(characters, i, length, character);
440 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
441 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
442 if (0xD800 <= character && character <= 0xDFFF)
443 character = replacementCharacter;
444 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
445 }
446
447 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
448 }
449
450 CString TextCodecUTF8::encode(const UChar* characters, size_t length, Unencodabl eHandling)
451 {
452 return encodeCommon(characters, length);
453 }
454
455 CString TextCodecUTF8::encode(const LChar* characters, size_t length, Unencodabl eHandling)
456 {
457 return encodeCommon(characters, length);
458 }
459
460 } // namespace WTF
OLDNEW
« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecUTF8.h ('k') | third_party/WebKit/Source/wtf/text/TextCodecUTF8Test.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698