| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. | 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
| 6 * are met: | 6 * are met: |
| 7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
| 8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
| 9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
| 10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
| (...skipping 19 matching lines...) Expand all Loading... |
| 30 #include "wtf/text/CharacterNames.h" | 30 #include "wtf/text/CharacterNames.h" |
| 31 #include "wtf/text/StringBuffer.h" | 31 #include "wtf/text/StringBuffer.h" |
| 32 #include "wtf/text/TextCodecASCIIFastPath.h" | 32 #include "wtf/text/TextCodecASCIIFastPath.h" |
| 33 | 33 |
| 34 namespace WTF { | 34 namespace WTF { |
| 35 | 35 |
| 36 using namespace WTF::Unicode; | 36 using namespace WTF::Unicode; |
| 37 | 37 |
| 38 const int nonCharacter = -1; | 38 const int nonCharacter = -1; |
| 39 | 39 |
| 40 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) | 40 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) { |
| 41 { | 41 return adoptPtr(new TextCodecUTF8); |
| 42 return adoptPtr(new TextCodecUTF8); | 42 } |
| 43 } | 43 |
| 44 | 44 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) { |
| 45 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) | 45 registrar("UTF-8", "UTF-8"); |
| 46 { | 46 |
| 47 registrar("UTF-8", "UTF-8"); | 47 // Additional aliases that originally were present in the encoding |
| 48 | 48 // table in WebKit on Macintosh, and subsequently added by |
| 49 // Additional aliases that originally were present in the encoding | 49 // TextCodecICU. Perhaps we can prove some are not used on the web |
| 50 // table in WebKit on Macintosh, and subsequently added by | 50 // and remove them. |
| 51 // TextCodecICU. Perhaps we can prove some are not used on the web | 51 registrar("unicode11utf8", "UTF-8"); |
| 52 // and remove them. | 52 registrar("unicode20utf8", "UTF-8"); |
| 53 registrar("unicode11utf8", "UTF-8"); | 53 registrar("utf8", "UTF-8"); |
| 54 registrar("unicode20utf8", "UTF-8"); | 54 registrar("x-unicode20utf8", "UTF-8"); |
| 55 registrar("utf8", "UTF-8"); | 55 |
| 56 registrar("x-unicode20utf8", "UTF-8"); | 56 // Additional aliases present in the WHATWG Encoding Standard (http://encoding
.spec.whatwg.org/) |
| 57 | 57 // and Firefox (24), but not in ICU 4.6. |
| 58 // Additional aliases present in the WHATWG Encoding Standard (http://encodi
ng.spec.whatwg.org/) | 58 registrar("unicode-1-1-utf-8", "UTF-8"); |
| 59 // and Firefox (24), but not in ICU 4.6. | 59 } |
| 60 registrar("unicode-1-1-utf-8", "UTF-8"); | 60 |
| 61 } | 61 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) { |
| 62 | 62 registrar("UTF-8", create, 0); |
| 63 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) | 63 } |
| 64 { | 64 |
| 65 registrar("UTF-8", create, 0); | 65 static inline int nonASCIISequenceLength(uint8_t firstByte) { |
| 66 } | 66 static const uint8_t lengths[256] = { |
| 67 | 67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 68 static inline int nonASCIISequenceLength(uint8_t firstByte) | 68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 69 { | 69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 70 static const uint8_t lengths[256] = { | 70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 75 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 76 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 76 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 77 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 78 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 81 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 81 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
| 82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 82 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
| 83 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 83 return lengths[firstByte]; |
| 84 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 84 } |
| 85 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | 85 |
| 86 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | 86 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned lengt
h) { |
| 87 }; | 87 ASSERT(!isASCII(sequence[0])); |
| 88 return lengths[firstByte]; | 88 if (length == 2) { |
| 89 } | 89 ASSERT(sequence[0] <= 0xDF); |
| 90 | 90 if (sequence[0] < 0xC2) |
| 91 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned lengt
h) | 91 return nonCharacter; |
| 92 { | 92 if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
| 93 ASSERT(!isASCII(sequence[0])); | 93 return nonCharacter; |
| 94 if (length == 2) { | 94 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; |
| 95 ASSERT(sequence[0] <= 0xDF); | 95 } |
| 96 if (sequence[0] < 0xC2) | 96 if (length == 3) { |
| 97 return nonCharacter; | 97 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); |
| 98 switch (sequence[0]) { |
| 99 case 0xE0: |
| 100 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) |
| 101 return nonCharacter; |
| 102 break; |
| 103 case 0xED: |
| 104 if (sequence[1] < 0x80 || sequence[1] > 0x9F) |
| 105 return nonCharacter; |
| 106 break; |
| 107 default: |
| 98 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | 108 if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
| 99 return nonCharacter; | 109 return nonCharacter; |
| 100 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; | 110 } |
| 101 } | 111 if (sequence[2] < 0x80 || sequence[2] > 0xBF) |
| 102 if (length == 3) { | 112 return nonCharacter; |
| 103 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); | 113 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080
; |
| 104 switch (sequence[0]) { | 114 } |
| 105 case 0xE0: | 115 ASSERT(length == 4); |
| 106 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) | 116 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); |
| 107 return nonCharacter; | 117 switch (sequence[0]) { |
| 118 case 0xF0: |
| 119 if (sequence[1] < 0x90 || sequence[1] > 0xBF) |
| 120 return nonCharacter; |
| 121 break; |
| 122 case 0xF4: |
| 123 if (sequence[1] < 0x80 || sequence[1] > 0x8F) |
| 124 return nonCharacter; |
| 125 break; |
| 126 default: |
| 127 if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
| 128 return nonCharacter; |
| 129 } |
| 130 if (sequence[2] < 0x80 || sequence[2] > 0xBF) |
| 131 return nonCharacter; |
| 132 if (sequence[3] < 0x80 || sequence[3] > 0xBF) |
| 133 return nonCharacter; |
| 134 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + seque
nce[3]) - 0x03C82080; |
| 135 } |
| 136 |
| 137 static inline UChar* appendCharacter(UChar* destination, int character) { |
| 138 ASSERT(character != nonCharacter); |
| 139 ASSERT(!U_IS_SURROGATE(character)); |
| 140 if (U_IS_BMP(character)) { |
| 141 *destination++ = static_cast<UChar>(character); |
| 142 } else { |
| 143 *destination++ = U16_LEAD(character); |
| 144 *destination++ = U16_TRAIL(character); |
| 145 } |
| 146 return destination; |
| 147 } |
| 148 |
| 149 void TextCodecUTF8::consumePartialSequenceByte() { |
| 150 --m_partialSequenceSize; |
| 151 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize); |
| 152 } |
| 153 |
| 154 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& saw
Error) { |
| 155 sawError = true; |
| 156 if (stopOnError) |
| 157 return; |
| 158 // Each error generates a replacement character and consumes one byte. |
| 159 *destination++ = replacementCharacter; |
| 160 consumePartialSequenceByte(); |
| 161 } |
| 162 |
| 163 template <> |
| 164 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint
8_t*& source, const uint8_t* end, bool flush, bool, bool&) { |
| 165 ASSERT(m_partialSequenceSize); |
| 166 do { |
| 167 if (isASCII(m_partialSequence[0])) { |
| 168 *destination++ = m_partialSequence[0]; |
| 169 consumePartialSequenceByte(); |
| 170 continue; |
| 171 } |
| 172 int count = nonASCIISequenceLength(m_partialSequence[0]); |
| 173 if (!count) |
| 174 return true; |
| 175 |
| 176 if (count > m_partialSequenceSize) { |
| 177 if (count - m_partialSequenceSize > end - source) { |
| 178 if (!flush) { |
| 179 // The new data is not enough to complete the sequence, so |
| 180 // add it to the existing partial sequence. |
| 181 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source
); |
| 182 m_partialSequenceSize += end - source; |
| 183 return false; |
| 184 } |
| 185 // An incomplete partial sequence at the end is an error, but it will cr
eate |
| 186 // a 16 bit string due to the replacementCharacter. Let the 16 bit path
handle |
| 187 // the error. |
| 188 return true; |
| 189 } |
| 190 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partia
lSequenceSize); |
| 191 source += count - m_partialSequenceSize; |
| 192 m_partialSequenceSize = count; |
| 193 } |
| 194 int character = decodeNonASCIISequence(m_partialSequence, count); |
| 195 if (character & ~0xff) |
| 196 return true; |
| 197 |
| 198 m_partialSequenceSize -= count; |
| 199 *destination++ = static_cast<LChar>(character); |
| 200 } while (m_partialSequenceSize); |
| 201 |
| 202 return false; |
| 203 } |
| 204 |
| 205 template <> |
| 206 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint
8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
{ |
| 207 ASSERT(m_partialSequenceSize); |
| 208 do { |
| 209 if (isASCII(m_partialSequence[0])) { |
| 210 *destination++ = m_partialSequence[0]; |
| 211 consumePartialSequenceByte(); |
| 212 continue; |
| 213 } |
| 214 int count = nonASCIISequenceLength(m_partialSequence[0]); |
| 215 if (!count) { |
| 216 handleError(destination, stopOnError, sawError); |
| 217 if (stopOnError) |
| 218 return false; |
| 219 continue; |
| 220 } |
| 221 if (count > m_partialSequenceSize) { |
| 222 if (count - m_partialSequenceSize > end - source) { |
| 223 if (!flush) { |
| 224 // The new data is not enough to complete the sequence, so |
| 225 // add it to the existing partial sequence. |
| 226 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source
); |
| 227 m_partialSequenceSize += end - source; |
| 228 return false; |
| 229 } |
| 230 // An incomplete partial sequence at the end is an error. |
| 231 handleError(destination, stopOnError, sawError); |
| 232 if (stopOnError) |
| 233 return false; |
| 234 continue; |
| 235 } |
| 236 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partia
lSequenceSize); |
| 237 source += count - m_partialSequenceSize; |
| 238 m_partialSequenceSize = count; |
| 239 } |
| 240 int character = decodeNonASCIISequence(m_partialSequence, count); |
| 241 if (character == nonCharacter) { |
| 242 handleError(destination, stopOnError, sawError); |
| 243 if (stopOnError) |
| 244 return false; |
| 245 continue; |
| 246 } |
| 247 |
| 248 m_partialSequenceSize -= count; |
| 249 destination = appendCharacter(destination, character); |
| 250 } while (m_partialSequenceSize); |
| 251 |
| 252 return false; |
| 253 } |
| 254 |
| 255 String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flu
sh, bool stopOnError, bool& sawError) { |
| 256 // Each input byte might turn into a character. |
| 257 // That includes all bytes in the partial-sequence buffer because |
| 258 // each byte in an invalid sequence will turn into a replacement character. |
| 259 StringBuffer<LChar> buffer(m_partialSequenceSize + length); |
| 260 |
| 261 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); |
| 262 const uint8_t* end = source + length; |
| 263 const uint8_t* alignedEnd = alignToMachineWord(end); |
| 264 LChar* destination = buffer.characters(); |
| 265 |
| 266 do { |
| 267 if (m_partialSequenceSize) { |
| 268 // Explicitly copy destination and source pointers to avoid taking pointer
s to the |
| 269 // local variables, which may harm code generation by disabling some optim
izations |
| 270 // in some compilers. |
| 271 LChar* destinationForHandlePartialSequence = destination; |
| 272 const uint8_t* sourceForHandlePartialSequence = source; |
| 273 if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHa
ndlePartialSequence, end, flush, stopOnError, sawError)) { |
| 274 source = sourceForHandlePartialSequence; |
| 275 goto upConvertTo16Bit; |
| 276 } |
| 277 destination = destinationForHandlePartialSequence; |
| 278 source = sourceForHandlePartialSequence; |
| 279 if (m_partialSequenceSize) |
| 280 break; |
| 281 } |
| 282 |
| 283 while (source < end) { |
| 284 if (isASCII(*source)) { |
| 285 // Fast path for ASCII. Most UTF-8 text will be ASCII. |
| 286 if (isAlignedToMachineWord(source)) { |
| 287 while (source < alignedEnd) { |
| 288 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source
); |
| 289 if (!isAllASCII<LChar>(chunk)) |
| 290 break; |
| 291 copyASCIIMachineWord(destination, source); |
| 292 source += sizeof(MachineWord); |
| 293 destination += sizeof(MachineWord); |
| 294 } |
| 295 if (source == end) |
| 108 break; | 296 break; |
| 109 case 0xED: | 297 if (!isASCII(*source)) |
| 110 if (sequence[1] < 0x80 || sequence[1] > 0x9F) | 298 continue; |
| 111 return nonCharacter; | 299 } |
| 300 *destination++ = *source++; |
| 301 continue; |
| 302 } |
| 303 int count = nonASCIISequenceLength(*source); |
| 304 int character; |
| 305 if (count == 0) { |
| 306 character = nonCharacter; |
| 307 } else { |
| 308 if (count > end - source) { |
| 309 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>
(sizeof(m_partialSequence))); |
| 310 ASSERT(!m_partialSequenceSize); |
| 311 m_partialSequenceSize = end - source; |
| 312 memcpy(m_partialSequence, source, m_partialSequenceSize); |
| 313 source = end; |
| 314 break; |
| 315 } |
| 316 character = decodeNonASCIISequence(source, count); |
| 317 } |
| 318 if (character == nonCharacter) { |
| 319 sawError = true; |
| 320 if (stopOnError) |
| 321 break; |
| 322 |
| 323 goto upConvertTo16Bit; |
| 324 } |
| 325 if (character > 0xff) |
| 326 goto upConvertTo16Bit; |
| 327 |
| 328 source += count; |
| 329 *destination++ = static_cast<LChar>(character); |
| 330 } |
| 331 } while (flush && m_partialSequenceSize); |
| 332 |
| 333 buffer.shrink(destination - buffer.characters()); |
| 334 |
| 335 return String::adopt(buffer); |
| 336 |
| 337 upConvertTo16Bit: |
| 338 StringBuffer<UChar> buffer16(m_partialSequenceSize + length); |
| 339 |
| 340 UChar* destination16 = buffer16.characters(); |
| 341 |
| 342 // Copy the already converted characters |
| 343 for (LChar* converted8 = buffer.characters(); converted8 < destination;) |
| 344 *destination16++ = *converted8++; |
| 345 |
| 346 do { |
| 347 if (m_partialSequenceSize) { |
| 348 // Explicitly copy destination and source pointers to avoid taking pointer
s to the |
| 349 // local variables, which may harm code generation by disabling some optim
izations |
| 350 // in some compilers. |
| 351 UChar* destinationForHandlePartialSequence = destination16; |
| 352 const uint8_t* sourceForHandlePartialSequence = source; |
| 353 handlePartialSequence(destinationForHandlePartialSequence, sourceForHandle
PartialSequence, end, flush, stopOnError, sawError); |
| 354 destination16 = destinationForHandlePartialSequence; |
| 355 source = sourceForHandlePartialSequence; |
| 356 if (m_partialSequenceSize) |
| 357 break; |
| 358 } |
| 359 |
| 360 while (source < end) { |
| 361 if (isASCII(*source)) { |
| 362 // Fast path for ASCII. Most UTF-8 text will be ASCII. |
| 363 if (isAlignedToMachineWord(source)) { |
| 364 while (source < alignedEnd) { |
| 365 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source
); |
| 366 if (!isAllASCII<LChar>(chunk)) |
| 367 break; |
| 368 copyASCIIMachineWord(destination16, source); |
| 369 source += sizeof(MachineWord); |
| 370 destination16 += sizeof(MachineWord); |
| 371 } |
| 372 if (source == end) |
| 112 break; | 373 break; |
| 113 default: | 374 if (!isASCII(*source)) |
| 114 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | |
| 115 return nonCharacter; | |
| 116 } | |
| 117 if (sequence[2] < 0x80 || sequence[2] > 0xBF) | |
| 118 return nonCharacter; | |
| 119 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E
2080; | |
| 120 } | |
| 121 ASSERT(length == 4); | |
| 122 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); | |
| 123 switch (sequence[0]) { | |
| 124 case 0xF0: | |
| 125 if (sequence[1] < 0x90 || sequence[1] > 0xBF) | |
| 126 return nonCharacter; | |
| 127 break; | |
| 128 case 0xF4: | |
| 129 if (sequence[1] < 0x80 || sequence[1] > 0x8F) | |
| 130 return nonCharacter; | |
| 131 break; | |
| 132 default: | |
| 133 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | |
| 134 return nonCharacter; | |
| 135 } | |
| 136 if (sequence[2] < 0x80 || sequence[2] > 0xBF) | |
| 137 return nonCharacter; | |
| 138 if (sequence[3] < 0x80 || sequence[3] > 0xBF) | |
| 139 return nonCharacter; | |
| 140 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + seq
uence[3]) - 0x03C82080; | |
| 141 } | |
| 142 | |
| 143 static inline UChar* appendCharacter(UChar* destination, int character) | |
| 144 { | |
| 145 ASSERT(character != nonCharacter); | |
| 146 ASSERT(!U_IS_SURROGATE(character)); | |
| 147 if (U_IS_BMP(character)) { | |
| 148 *destination++ = static_cast<UChar>(character); | |
| 149 } else { | |
| 150 *destination++ = U16_LEAD(character); | |
| 151 *destination++ = U16_TRAIL(character); | |
| 152 } | |
| 153 return destination; | |
| 154 } | |
| 155 | |
| 156 void TextCodecUTF8::consumePartialSequenceByte() | |
| 157 { | |
| 158 --m_partialSequenceSize; | |
| 159 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize); | |
| 160 } | |
| 161 | |
| 162 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& saw
Error) | |
| 163 { | |
| 164 sawError = true; | |
| 165 if (stopOnError) | |
| 166 return; | |
| 167 // Each error generates a replacement character and consumes one byte. | |
| 168 *destination++ = replacementCharacter; | |
| 169 consumePartialSequenceByte(); | |
| 170 } | |
| 171 | |
| 172 template <> | |
| 173 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint
8_t*& source, const uint8_t* end, bool flush, bool, bool&) | |
| 174 { | |
| 175 ASSERT(m_partialSequenceSize); | |
| 176 do { | |
| 177 if (isASCII(m_partialSequence[0])) { | |
| 178 *destination++ = m_partialSequence[0]; | |
| 179 consumePartialSequenceByte(); | |
| 180 continue; | 375 continue; |
| 181 } | 376 } |
| 182 int count = nonASCIISequenceLength(m_partialSequence[0]); | 377 *destination16++ = *source++; |
| 183 if (!count) | 378 continue; |
| 184 return true; | 379 } |
| 185 | 380 int count = nonASCIISequenceLength(*source); |
| 186 if (count > m_partialSequenceSize) { | 381 int character; |
| 187 if (count - m_partialSequenceSize > end - source) { | 382 if (count == 0) { |
| 188 if (!flush) { | 383 character = nonCharacter; |
| 189 // The new data is not enough to complete the sequence, so | 384 } else { |
| 190 // add it to the existing partial sequence. | 385 if (count > end - source) { |
| 191 memcpy(m_partialSequence + m_partialSequenceSize, source, en
d - source); | 386 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>
(sizeof(m_partialSequence))); |
| 192 m_partialSequenceSize += end - source; | 387 ASSERT(!m_partialSequenceSize); |
| 193 return false; | 388 m_partialSequenceSize = end - source; |
| 194 } | 389 memcpy(m_partialSequence, source, m_partialSequenceSize); |
| 195 // An incomplete partial sequence at the end is an error, but it
will create | 390 source = end; |
| 196 // a 16 bit string due to the replacementCharacter. Let the 16 b
it path handle | 391 break; |
| 197 // the error. | 392 } |
| 198 return true; | 393 character = decodeNonASCIISequence(source, count); |
| 199 } | 394 } |
| 200 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_
partialSequenceSize); | 395 if (character == nonCharacter) { |
| 201 source += count - m_partialSequenceSize; | 396 sawError = true; |
| 202 m_partialSequenceSize = count; | 397 if (stopOnError) |
| 203 } | 398 break; |
| 204 int character = decodeNonASCIISequence(m_partialSequence, count); | 399 // Each error generates a replacement character and consumes one byte. |
| 205 if (character & ~0xff) | 400 *destination16++ = replacementCharacter; |
| 206 return true; | 401 ++source; |
| 207 | 402 continue; |
| 208 m_partialSequenceSize -= count; | 403 } |
| 209 *destination++ = static_cast<LChar>(character); | 404 source += count; |
| 210 } while (m_partialSequenceSize); | 405 destination16 = appendCharacter(destination16, character); |
| 211 | 406 } |
| 212 return false; | 407 } while (flush && m_partialSequenceSize); |
| 213 } | 408 |
| 214 | 409 buffer16.shrink(destination16 - buffer16.characters()); |
| 215 template <> | 410 |
| 216 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint
8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError) | 411 return String::adopt(buffer16); |
| 217 { | 412 } |
| 218 ASSERT(m_partialSequenceSize); | 413 |
| 219 do { | 414 template <typename CharType> |
| 220 if (isASCII(m_partialSequence[0])) { | 415 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) { |
| 221 *destination++ = m_partialSequence[0]; | 416 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. |
| 222 consumePartialSequenceByte(); | 417 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3
x). |
| 223 continue; | 418 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2
x). |
| 224 } | 419 if (length > std::numeric_limits<size_t>::max() / 3) |
| 225 int count = nonASCIISequenceLength(m_partialSequence[0]); | 420 CRASH(); |
| 226 if (!count) { | 421 Vector<uint8_t> bytes(length * 3); |
| 227 handleError(destination, stopOnError, sawError); | 422 |
| 228 if (stopOnError) | 423 size_t i = 0; |
| 229 return false; | 424 size_t bytesWritten = 0; |
| 230 continue; | 425 while (i < length) { |
| 231 } | 426 UChar32 character; |
| 232 if (count > m_partialSequenceSize) { | 427 U16_NEXT(characters, i, length, character); |
| 233 if (count - m_partialSequenceSize > end - source) { | 428 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogat
e |
| 234 if (!flush) { | 429 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) he
re. |
| 235 // The new data is not enough to complete the sequence, so | 430 if (0xD800 <= character && character <= 0xDFFF) |
| 236 // add it to the existing partial sequence. | 431 character = replacementCharacter; |
| 237 memcpy(m_partialSequence + m_partialSequenceSize, source, en
d - source); | 432 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); |
| 238 m_partialSequenceSize += end - source; | 433 } |
| 239 return false; | 434 |
| 240 } | 435 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); |
| 241 // An incomplete partial sequence at the end is an error. | 436 } |
| 242 handleError(destination, stopOnError, sawError); | 437 |
| 243 if (stopOnError) | 438 CString TextCodecUTF8::encode(const UChar* characters, size_t length, Unencodabl
eHandling) { |
| 244 return false; | 439 return encodeCommon(characters, length); |
| 245 continue; | 440 } |
| 246 } | 441 |
| 247 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_
partialSequenceSize); | 442 CString TextCodecUTF8::encode(const LChar* characters, size_t length, Unencodabl
eHandling) { |
| 248 source += count - m_partialSequenceSize; | 443 return encodeCommon(characters, length); |
| 249 m_partialSequenceSize = count; | 444 } |
| 250 } | 445 |
| 251 int character = decodeNonASCIISequence(m_partialSequence, count); | 446 } // namespace WTF |
| 252 if (character == nonCharacter) { | |
| 253 handleError(destination, stopOnError, sawError); | |
| 254 if (stopOnError) | |
| 255 return false; | |
| 256 continue; | |
| 257 } | |
| 258 | |
| 259 m_partialSequenceSize -= count; | |
| 260 destination = appendCharacter(destination, character); | |
| 261 } while (m_partialSequenceSize); | |
| 262 | |
| 263 return false; | |
| 264 } | |
| 265 | |
| 266 String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flu
sh, bool stopOnError, bool& sawError) | |
| 267 { | |
| 268 // Each input byte might turn into a character. | |
| 269 // That includes all bytes in the partial-sequence buffer because | |
| 270 // each byte in an invalid sequence will turn into a replacement character. | |
| 271 StringBuffer<LChar> buffer(m_partialSequenceSize + length); | |
| 272 | |
| 273 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); | |
| 274 const uint8_t* end = source + length; | |
| 275 const uint8_t* alignedEnd = alignToMachineWord(end); | |
| 276 LChar* destination = buffer.characters(); | |
| 277 | |
| 278 do { | |
| 279 if (m_partialSequenceSize) { | |
| 280 // Explicitly copy destination and source pointers to avoid taking p
ointers to the | |
| 281 // local variables, which may harm code generation by disabling some
optimizations | |
| 282 // in some compilers. | |
| 283 LChar* destinationForHandlePartialSequence = destination; | |
| 284 const uint8_t* sourceForHandlePartialSequence = source; | |
| 285 if (handlePartialSequence(destinationForHandlePartialSequence, sourc
eForHandlePartialSequence, end, flush, stopOnError, sawError)) { | |
| 286 source = sourceForHandlePartialSequence; | |
| 287 goto upConvertTo16Bit; | |
| 288 } | |
| 289 destination = destinationForHandlePartialSequence; | |
| 290 source = sourceForHandlePartialSequence; | |
| 291 if (m_partialSequenceSize) | |
| 292 break; | |
| 293 } | |
| 294 | |
| 295 while (source < end) { | |
| 296 if (isASCII(*source)) { | |
| 297 // Fast path for ASCII. Most UTF-8 text will be ASCII. | |
| 298 if (isAlignedToMachineWord(source)) { | |
| 299 while (source < alignedEnd) { | |
| 300 MachineWord chunk = *reinterpret_cast_ptr<const MachineW
ord*>(source); | |
| 301 if (!isAllASCII<LChar>(chunk)) | |
| 302 break; | |
| 303 copyASCIIMachineWord(destination, source); | |
| 304 source += sizeof(MachineWord); | |
| 305 destination += sizeof(MachineWord); | |
| 306 } | |
| 307 if (source == end) | |
| 308 break; | |
| 309 if (!isASCII(*source)) | |
| 310 continue; | |
| 311 } | |
| 312 *destination++ = *source++; | |
| 313 continue; | |
| 314 } | |
| 315 int count = nonASCIISequenceLength(*source); | |
| 316 int character; | |
| 317 if (count == 0) { | |
| 318 character = nonCharacter; | |
| 319 } else { | |
| 320 if (count > end - source) { | |
| 321 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<
ptrdiff_t>(sizeof(m_partialSequence))); | |
| 322 ASSERT(!m_partialSequenceSize); | |
| 323 m_partialSequenceSize = end - source; | |
| 324 memcpy(m_partialSequence, source, m_partialSequenceSize); | |
| 325 source = end; | |
| 326 break; | |
| 327 } | |
| 328 character = decodeNonASCIISequence(source, count); | |
| 329 } | |
| 330 if (character == nonCharacter) { | |
| 331 sawError = true; | |
| 332 if (stopOnError) | |
| 333 break; | |
| 334 | |
| 335 goto upConvertTo16Bit; | |
| 336 } | |
| 337 if (character > 0xff) | |
| 338 goto upConvertTo16Bit; | |
| 339 | |
| 340 source += count; | |
| 341 *destination++ = static_cast<LChar>(character); | |
| 342 } | |
| 343 } while (flush && m_partialSequenceSize); | |
| 344 | |
| 345 buffer.shrink(destination - buffer.characters()); | |
| 346 | |
| 347 return String::adopt(buffer); | |
| 348 | |
| 349 upConvertTo16Bit: | |
| 350 StringBuffer<UChar> buffer16(m_partialSequenceSize + length); | |
| 351 | |
| 352 UChar* destination16 = buffer16.characters(); | |
| 353 | |
| 354 // Copy the already converted characters | |
| 355 for (LChar* converted8 = buffer.characters(); converted8 < destination;) | |
| 356 *destination16++ = *converted8++; | |
| 357 | |
| 358 do { | |
| 359 if (m_partialSequenceSize) { | |
| 360 // Explicitly copy destination and source pointers to avoid taking p
ointers to the | |
| 361 // local variables, which may harm code generation by disabling some
optimizations | |
| 362 // in some compilers. | |
| 363 UChar* destinationForHandlePartialSequence = destination16; | |
| 364 const uint8_t* sourceForHandlePartialSequence = source; | |
| 365 handlePartialSequence(destinationForHandlePartialSequence, sourceFor
HandlePartialSequence, end, flush, stopOnError, sawError); | |
| 366 destination16 = destinationForHandlePartialSequence; | |
| 367 source = sourceForHandlePartialSequence; | |
| 368 if (m_partialSequenceSize) | |
| 369 break; | |
| 370 } | |
| 371 | |
| 372 while (source < end) { | |
| 373 if (isASCII(*source)) { | |
| 374 // Fast path for ASCII. Most UTF-8 text will be ASCII. | |
| 375 if (isAlignedToMachineWord(source)) { | |
| 376 while (source < alignedEnd) { | |
| 377 MachineWord chunk = *reinterpret_cast_ptr<const MachineW
ord*>(source); | |
| 378 if (!isAllASCII<LChar>(chunk)) | |
| 379 break; | |
| 380 copyASCIIMachineWord(destination16, source); | |
| 381 source += sizeof(MachineWord); | |
| 382 destination16 += sizeof(MachineWord); | |
| 383 } | |
| 384 if (source == end) | |
| 385 break; | |
| 386 if (!isASCII(*source)) | |
| 387 continue; | |
| 388 } | |
| 389 *destination16++ = *source++; | |
| 390 continue; | |
| 391 } | |
| 392 int count = nonASCIISequenceLength(*source); | |
| 393 int character; | |
| 394 if (count == 0) { | |
| 395 character = nonCharacter; | |
| 396 } else { | |
| 397 if (count > end - source) { | |
| 398 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<
ptrdiff_t>(sizeof(m_partialSequence))); | |
| 399 ASSERT(!m_partialSequenceSize); | |
| 400 m_partialSequenceSize = end - source; | |
| 401 memcpy(m_partialSequence, source, m_partialSequenceSize); | |
| 402 source = end; | |
| 403 break; | |
| 404 } | |
| 405 character = decodeNonASCIISequence(source, count); | |
| 406 } | |
| 407 if (character == nonCharacter) { | |
| 408 sawError = true; | |
| 409 if (stopOnError) | |
| 410 break; | |
| 411 // Each error generates a replacement character and consumes one
byte. | |
| 412 *destination16++ = replacementCharacter; | |
| 413 ++source; | |
| 414 continue; | |
| 415 } | |
| 416 source += count; | |
| 417 destination16 = appendCharacter(destination16, character); | |
| 418 } | |
| 419 } while (flush && m_partialSequenceSize); | |
| 420 | |
| 421 buffer16.shrink(destination16 - buffer16.characters()); | |
| 422 | |
| 423 return String::adopt(buffer16); | |
| 424 } | |
| 425 | |
| 426 template<typename CharType> | |
| 427 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) | |
| 428 { | |
| 429 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. | |
| 430 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes
(3x). | |
| 431 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes
(2x). | |
| 432 if (length > std::numeric_limits<size_t>::max() / 3) | |
| 433 CRASH(); | |
| 434 Vector<uint8_t> bytes(length * 3); | |
| 435 | |
| 436 size_t i = 0; | |
| 437 size_t bytesWritten = 0; | |
| 438 while (i < length) { | |
| 439 UChar32 character; | |
| 440 U16_NEXT(characters, i, length, character); | |
| 441 // U16_NEXT will simply emit a surrogate code point if an unmatched surr
ogate | |
| 442 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER
) here. | |
| 443 if (0xD800 <= character && character <= 0xDFFF) | |
| 444 character = replacementCharacter; | |
| 445 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); | |
| 446 } | |
| 447 | |
| 448 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); | |
| 449 } | |
| 450 | |
| 451 CString TextCodecUTF8::encode(const UChar* characters, size_t length, Unencodabl
eHandling) | |
| 452 { | |
| 453 return encodeCommon(characters, length); | |
| 454 } | |
| 455 | |
| 456 CString TextCodecUTF8::encode(const LChar* characters, size_t length, Unencodabl
eHandling) | |
| 457 { | |
| 458 return encodeCommon(characters, length); | |
| 459 } | |
| 460 | |
| 461 } // namespace WTF | |
| OLD | NEW |