| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. | 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
| 6 * are met: | 6 * are met: |
| 7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
| 8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
| 9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
| 10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
| (...skipping 18 matching lines...) Expand all Loading... |
| 29 #include "wtf/text/CharacterNames.h" | 29 #include "wtf/text/CharacterNames.h" |
| 30 #include "wtf/text/StringBuffer.h" | 30 #include "wtf/text/StringBuffer.h" |
| 31 #include "wtf/text/TextCodecASCIIFastPath.h" | 31 #include "wtf/text/TextCodecASCIIFastPath.h" |
| 32 | 32 |
| 33 namespace WTF { | 33 namespace WTF { |
| 34 | 34 |
| 35 using namespace WTF::Unicode; | 35 using namespace WTF::Unicode; |
| 36 | 36 |
| 37 const int nonCharacter = -1; | 37 const int nonCharacter = -1; |
| 38 | 38 |
| 39 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) | 39 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) { |
| 40 { | 40 return adoptPtr(new TextCodecUTF8); |
| 41 return adoptPtr(new TextCodecUTF8); | 41 } |
| 42 } | 42 |
| 43 | 43 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) { |
| 44 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) | 44 registrar("UTF-8", "UTF-8"); |
| 45 { | 45 |
| 46 registrar("UTF-8", "UTF-8"); | 46 // Additional aliases that originally were present in the encoding |
| 47 | 47 // table in WebKit on Macintosh, and subsequently added by |
| 48 // Additional aliases that originally were present in the encoding | 48 // TextCodecICU. Perhaps we can prove some are not used on the web |
| 49 // table in WebKit on Macintosh, and subsequently added by | 49 // and remove them. |
| 50 // TextCodecICU. Perhaps we can prove some are not used on the web | 50 registrar("unicode11utf8", "UTF-8"); |
| 51 // and remove them. | 51 registrar("unicode20utf8", "UTF-8"); |
| 52 registrar("unicode11utf8", "UTF-8"); | 52 registrar("utf8", "UTF-8"); |
| 53 registrar("unicode20utf8", "UTF-8"); | 53 registrar("x-unicode20utf8", "UTF-8"); |
| 54 registrar("utf8", "UTF-8"); | 54 |
| 55 registrar("x-unicode20utf8", "UTF-8"); | 55 // Additional aliases present in the WHATWG Encoding Standard (http://encoding
.spec.whatwg.org/) |
| 56 | 56 // and Firefox (24), but not in ICU 4.6. |
| 57 // Additional aliases present in the WHATWG Encoding Standard (http://encodi
ng.spec.whatwg.org/) | 57 registrar("unicode-1-1-utf-8", "UTF-8"); |
| 58 // and Firefox (24), but not in ICU 4.6. | 58 } |
| 59 registrar("unicode-1-1-utf-8", "UTF-8"); | 59 |
| 60 } | 60 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) { |
| 61 | 61 registrar("UTF-8", create, 0); |
| 62 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) | 62 } |
| 63 { | 63 |
| 64 registrar("UTF-8", create, 0); | 64 static inline int nonASCIISequenceLength(uint8_t firstByte) { |
| 65 } | 65 static const uint8_t lengths[256] = { |
| 66 | 66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 67 static inline int nonASCIISequenceLength(uint8_t firstByte) | 67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 68 { | 68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 69 static const uint8_t lengths[256] = { | 69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 71 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 72 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 73 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 74 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 75 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
| 76 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 76 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
| 77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 77 return lengths[firstByte]; |
| 78 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 78 } |
| 79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 79 |
| 80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 80 static inline int decodeNonASCIISequence(const uint8_t* sequence, |
| 81 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 81 unsigned length) { |
| 82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 82 ASSERT(!isASCII(sequence[0])); |
| 83 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 83 if (length == 2) { |
| 84 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | 84 ASSERT(sequence[0] <= 0xDF); |
| 85 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | 85 if (sequence[0] < 0xC2) |
| 86 }; | 86 return nonCharacter; |
| 87 return lengths[firstByte]; | 87 if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
| 88 } | 88 return nonCharacter; |
| 89 | 89 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; |
| 90 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned lengt
h) | 90 } |
| 91 { | 91 if (length == 3) { |
| 92 ASSERT(!isASCII(sequence[0])); | 92 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); |
| 93 if (length == 2) { | 93 switch (sequence[0]) { |
| 94 ASSERT(sequence[0] <= 0xDF); | 94 case 0xE0: |
| 95 if (sequence[0] < 0xC2) | 95 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) |
| 96 return nonCharacter; | 96 return nonCharacter; |
| 97 break; |
| 98 case 0xED: |
| 99 if (sequence[1] < 0x80 || sequence[1] > 0x9F) |
| 100 return nonCharacter; |
| 101 break; |
| 102 default: |
| 97 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | 103 if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
| 98 return nonCharacter; | 104 return nonCharacter; |
| 99 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; | 105 } |
| 100 } | 106 if (sequence[2] < 0x80 || sequence[2] > 0xBF) |
| 101 if (length == 3) { | 107 return nonCharacter; |
| 102 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); | 108 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - |
| 103 switch (sequence[0]) { | 109 0x000E2080; |
| 104 case 0xE0: | 110 } |
| 105 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) | 111 ASSERT(length == 4); |
| 106 return nonCharacter; | 112 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); |
| 113 switch (sequence[0]) { |
| 114 case 0xF0: |
| 115 if (sequence[1] < 0x90 || sequence[1] > 0xBF) |
| 116 return nonCharacter; |
| 117 break; |
| 118 case 0xF4: |
| 119 if (sequence[1] < 0x80 || sequence[1] > 0x8F) |
| 120 return nonCharacter; |
| 121 break; |
| 122 default: |
| 123 if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
| 124 return nonCharacter; |
| 125 } |
| 126 if (sequence[2] < 0x80 || sequence[2] > 0xBF) |
| 127 return nonCharacter; |
| 128 if (sequence[3] < 0x80 || sequence[3] > 0xBF) |
| 129 return nonCharacter; |
| 130 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + |
| 131 sequence[3]) - |
| 132 0x03C82080; |
| 133 } |
| 134 |
| 135 static inline UChar* appendCharacter(UChar* destination, int character) { |
| 136 ASSERT(character != nonCharacter); |
| 137 ASSERT(!U_IS_SURROGATE(character)); |
| 138 if (U_IS_BMP(character)) { |
| 139 *destination++ = static_cast<UChar>(character); |
| 140 } else { |
| 141 *destination++ = U16_LEAD(character); |
| 142 *destination++ = U16_TRAIL(character); |
| 143 } |
| 144 return destination; |
| 145 } |
| 146 |
| 147 void TextCodecUTF8::consumePartialSequenceByte() { |
| 148 --m_partialSequenceSize; |
| 149 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize); |
| 150 } |
| 151 |
| 152 void TextCodecUTF8::handleError(UChar*& destination, |
| 153 bool stopOnError, |
| 154 bool& sawError) { |
| 155 sawError = true; |
| 156 if (stopOnError) |
| 157 return; |
| 158 // Each error generates a replacement character and consumes one byte. |
| 159 *destination++ = replacementCharacter; |
| 160 consumePartialSequenceByte(); |
| 161 } |
| 162 |
| 163 template <> |
| 164 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, |
| 165 const uint8_t*& source, |
| 166 const uint8_t* end, |
| 167 bool flush, |
| 168 bool, |
| 169 bool&) { |
| 170 ASSERT(m_partialSequenceSize); |
| 171 do { |
| 172 if (isASCII(m_partialSequence[0])) { |
| 173 *destination++ = m_partialSequence[0]; |
| 174 consumePartialSequenceByte(); |
| 175 continue; |
| 176 } |
| 177 int count = nonASCIISequenceLength(m_partialSequence[0]); |
| 178 if (!count) |
| 179 return true; |
| 180 |
| 181 if (count > m_partialSequenceSize) { |
| 182 if (count - m_partialSequenceSize > end - source) { |
| 183 if (!flush) { |
| 184 // The new data is not enough to complete the sequence, so |
| 185 // add it to the existing partial sequence. |
| 186 memcpy(m_partialSequence + m_partialSequenceSize, source, |
| 187 end - source); |
| 188 m_partialSequenceSize += end - source; |
| 189 return false; |
| 190 } |
| 191 // An incomplete partial sequence at the end is an error, but it will cr
eate |
| 192 // a 16 bit string due to the replacementCharacter. Let the 16 bit path
handle |
| 193 // the error. |
| 194 return true; |
| 195 } |
| 196 memcpy(m_partialSequence + m_partialSequenceSize, source, |
| 197 count - m_partialSequenceSize); |
| 198 source += count - m_partialSequenceSize; |
| 199 m_partialSequenceSize = count; |
| 200 } |
| 201 int character = decodeNonASCIISequence(m_partialSequence, count); |
| 202 if (character & ~0xff) |
| 203 return true; |
| 204 |
| 205 m_partialSequenceSize -= count; |
| 206 *destination++ = static_cast<LChar>(character); |
| 207 } while (m_partialSequenceSize); |
| 208 |
| 209 return false; |
| 210 } |
| 211 |
| 212 template <> |
| 213 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, |
| 214 const uint8_t*& source, |
| 215 const uint8_t* end, |
| 216 bool flush, |
| 217 bool stopOnError, |
| 218 bool& sawError) { |
| 219 ASSERT(m_partialSequenceSize); |
| 220 do { |
| 221 if (isASCII(m_partialSequence[0])) { |
| 222 *destination++ = m_partialSequence[0]; |
| 223 consumePartialSequenceByte(); |
| 224 continue; |
| 225 } |
| 226 int count = nonASCIISequenceLength(m_partialSequence[0]); |
| 227 if (!count) { |
| 228 handleError(destination, stopOnError, sawError); |
| 229 if (stopOnError) |
| 230 return false; |
| 231 continue; |
| 232 } |
| 233 if (count > m_partialSequenceSize) { |
| 234 if (count - m_partialSequenceSize > end - source) { |
| 235 if (!flush) { |
| 236 // The new data is not enough to complete the sequence, so |
| 237 // add it to the existing partial sequence. |
| 238 memcpy(m_partialSequence + m_partialSequenceSize, source, |
| 239 end - source); |
| 240 m_partialSequenceSize += end - source; |
| 241 return false; |
| 242 } |
| 243 // An incomplete partial sequence at the end is an error. |
| 244 handleError(destination, stopOnError, sawError); |
| 245 if (stopOnError) |
| 246 return false; |
| 247 continue; |
| 248 } |
| 249 memcpy(m_partialSequence + m_partialSequenceSize, source, |
| 250 count - m_partialSequenceSize); |
| 251 source += count - m_partialSequenceSize; |
| 252 m_partialSequenceSize = count; |
| 253 } |
| 254 int character = decodeNonASCIISequence(m_partialSequence, count); |
| 255 if (character == nonCharacter) { |
| 256 handleError(destination, stopOnError, sawError); |
| 257 if (stopOnError) |
| 258 return false; |
| 259 continue; |
| 260 } |
| 261 |
| 262 m_partialSequenceSize -= count; |
| 263 destination = appendCharacter(destination, character); |
| 264 } while (m_partialSequenceSize); |
| 265 |
| 266 return false; |
| 267 } |
| 268 |
| 269 String TextCodecUTF8::decode(const char* bytes, |
| 270 size_t length, |
| 271 FlushBehavior flush, |
| 272 bool stopOnError, |
| 273 bool& sawError) { |
| 274 // Each input byte might turn into a character. |
| 275 // That includes all bytes in the partial-sequence buffer because |
| 276 // each byte in an invalid sequence will turn into a replacement character. |
| 277 StringBuffer<LChar> buffer(m_partialSequenceSize + length); |
| 278 |
| 279 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); |
| 280 const uint8_t* end = source + length; |
| 281 const uint8_t* alignedEnd = alignToMachineWord(end); |
| 282 LChar* destination = buffer.characters(); |
| 283 |
| 284 do { |
| 285 if (m_partialSequenceSize) { |
| 286 // Explicitly copy destination and source pointers to avoid taking pointer
s to the |
| 287 // local variables, which may harm code generation by disabling some optim
izations |
| 288 // in some compilers. |
| 289 LChar* destinationForHandlePartialSequence = destination; |
| 290 const uint8_t* sourceForHandlePartialSequence = source; |
| 291 if (handlePartialSequence(destinationForHandlePartialSequence, |
| 292 sourceForHandlePartialSequence, end, flush, |
| 293 stopOnError, sawError)) { |
| 294 source = sourceForHandlePartialSequence; |
| 295 goto upConvertTo16Bit; |
| 296 } |
| 297 destination = destinationForHandlePartialSequence; |
| 298 source = sourceForHandlePartialSequence; |
| 299 if (m_partialSequenceSize) |
| 300 break; |
| 301 } |
| 302 |
| 303 while (source < end) { |
| 304 if (isASCII(*source)) { |
| 305 // Fast path for ASCII. Most UTF-8 text will be ASCII. |
| 306 if (isAlignedToMachineWord(source)) { |
| 307 while (source < alignedEnd) { |
| 308 MachineWord chunk = |
| 309 *reinterpret_cast_ptr<const MachineWord*>(source); |
| 310 if (!isAllASCII<LChar>(chunk)) |
| 311 break; |
| 312 copyASCIIMachineWord(destination, source); |
| 313 source += sizeof(MachineWord); |
| 314 destination += sizeof(MachineWord); |
| 315 } |
| 316 if (source == end) |
| 107 break; | 317 break; |
| 108 case 0xED: | 318 if (!isASCII(*source)) |
| 109 if (sequence[1] < 0x80 || sequence[1] > 0x9F) | 319 continue; |
| 110 return nonCharacter; | 320 } |
| 321 *destination++ = *source++; |
| 322 continue; |
| 323 } |
| 324 int count = nonASCIISequenceLength(*source); |
| 325 int character; |
| 326 if (count == 0) { |
| 327 character = nonCharacter; |
| 328 } else { |
| 329 if (count > end - source) { |
| 330 ASSERT_WITH_SECURITY_IMPLICATION( |
| 331 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); |
| 332 ASSERT(!m_partialSequenceSize); |
| 333 m_partialSequenceSize = end - source; |
| 334 memcpy(m_partialSequence, source, m_partialSequenceSize); |
| 335 source = end; |
| 336 break; |
| 337 } |
| 338 character = decodeNonASCIISequence(source, count); |
| 339 } |
| 340 if (character == nonCharacter) { |
| 341 sawError = true; |
| 342 if (stopOnError) |
| 343 break; |
| 344 |
| 345 goto upConvertTo16Bit; |
| 346 } |
| 347 if (character > 0xff) |
| 348 goto upConvertTo16Bit; |
| 349 |
| 350 source += count; |
| 351 *destination++ = static_cast<LChar>(character); |
| 352 } |
| 353 } while (flush && m_partialSequenceSize); |
| 354 |
| 355 buffer.shrink(destination - buffer.characters()); |
| 356 |
| 357 return String::adopt(buffer); |
| 358 |
| 359 upConvertTo16Bit: |
| 360 StringBuffer<UChar> buffer16(m_partialSequenceSize + length); |
| 361 |
| 362 UChar* destination16 = buffer16.characters(); |
| 363 |
| 364 // Copy the already converted characters |
| 365 for (LChar* converted8 = buffer.characters(); converted8 < destination;) |
| 366 *destination16++ = *converted8++; |
| 367 |
| 368 do { |
| 369 if (m_partialSequenceSize) { |
| 370 // Explicitly copy destination and source pointers to avoid taking pointer
s to the |
| 371 // local variables, which may harm code generation by disabling some optim
izations |
| 372 // in some compilers. |
| 373 UChar* destinationForHandlePartialSequence = destination16; |
| 374 const uint8_t* sourceForHandlePartialSequence = source; |
| 375 handlePartialSequence(destinationForHandlePartialSequence, |
| 376 sourceForHandlePartialSequence, end, flush, |
| 377 stopOnError, sawError); |
| 378 destination16 = destinationForHandlePartialSequence; |
| 379 source = sourceForHandlePartialSequence; |
| 380 if (m_partialSequenceSize) |
| 381 break; |
| 382 } |
| 383 |
| 384 while (source < end) { |
| 385 if (isASCII(*source)) { |
| 386 // Fast path for ASCII. Most UTF-8 text will be ASCII. |
| 387 if (isAlignedToMachineWord(source)) { |
| 388 while (source < alignedEnd) { |
| 389 MachineWord chunk = |
| 390 *reinterpret_cast_ptr<const MachineWord*>(source); |
| 391 if (!isAllASCII<LChar>(chunk)) |
| 392 break; |
| 393 copyASCIIMachineWord(destination16, source); |
| 394 source += sizeof(MachineWord); |
| 395 destination16 += sizeof(MachineWord); |
| 396 } |
| 397 if (source == end) |
| 111 break; | 398 break; |
| 112 default: | 399 if (!isASCII(*source)) |
| 113 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | |
| 114 return nonCharacter; | |
| 115 } | |
| 116 if (sequence[2] < 0x80 || sequence[2] > 0xBF) | |
| 117 return nonCharacter; | |
| 118 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E
2080; | |
| 119 } | |
| 120 ASSERT(length == 4); | |
| 121 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); | |
| 122 switch (sequence[0]) { | |
| 123 case 0xF0: | |
| 124 if (sequence[1] < 0x90 || sequence[1] > 0xBF) | |
| 125 return nonCharacter; | |
| 126 break; | |
| 127 case 0xF4: | |
| 128 if (sequence[1] < 0x80 || sequence[1] > 0x8F) | |
| 129 return nonCharacter; | |
| 130 break; | |
| 131 default: | |
| 132 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | |
| 133 return nonCharacter; | |
| 134 } | |
| 135 if (sequence[2] < 0x80 || sequence[2] > 0xBF) | |
| 136 return nonCharacter; | |
| 137 if (sequence[3] < 0x80 || sequence[3] > 0xBF) | |
| 138 return nonCharacter; | |
| 139 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + seq
uence[3]) - 0x03C82080; | |
| 140 } | |
| 141 | |
| 142 static inline UChar* appendCharacter(UChar* destination, int character) | |
| 143 { | |
| 144 ASSERT(character != nonCharacter); | |
| 145 ASSERT(!U_IS_SURROGATE(character)); | |
| 146 if (U_IS_BMP(character)) { | |
| 147 *destination++ = static_cast<UChar>(character); | |
| 148 } else { | |
| 149 *destination++ = U16_LEAD(character); | |
| 150 *destination++ = U16_TRAIL(character); | |
| 151 } | |
| 152 return destination; | |
| 153 } | |
| 154 | |
| 155 void TextCodecUTF8::consumePartialSequenceByte() | |
| 156 { | |
| 157 --m_partialSequenceSize; | |
| 158 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize); | |
| 159 } | |
| 160 | |
| 161 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& saw
Error) | |
| 162 { | |
| 163 sawError = true; | |
| 164 if (stopOnError) | |
| 165 return; | |
| 166 // Each error generates a replacement character and consumes one byte. | |
| 167 *destination++ = replacementCharacter; | |
| 168 consumePartialSequenceByte(); | |
| 169 } | |
| 170 | |
| 171 template <> | |
| 172 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint
8_t*& source, const uint8_t* end, bool flush, bool, bool&) | |
| 173 { | |
| 174 ASSERT(m_partialSequenceSize); | |
| 175 do { | |
| 176 if (isASCII(m_partialSequence[0])) { | |
| 177 *destination++ = m_partialSequence[0]; | |
| 178 consumePartialSequenceByte(); | |
| 179 continue; | 400 continue; |
| 180 } | 401 } |
| 181 int count = nonASCIISequenceLength(m_partialSequence[0]); | 402 *destination16++ = *source++; |
| 182 if (!count) | 403 continue; |
| 183 return true; | 404 } |
| 184 | 405 int count = nonASCIISequenceLength(*source); |
| 185 if (count > m_partialSequenceSize) { | 406 int character; |
| 186 if (count - m_partialSequenceSize > end - source) { | 407 if (count == 0) { |
| 187 if (!flush) { | 408 character = nonCharacter; |
| 188 // The new data is not enough to complete the sequence, so | 409 } else { |
| 189 // add it to the existing partial sequence. | 410 if (count > end - source) { |
| 190 memcpy(m_partialSequence + m_partialSequenceSize, source, en
d - source); | 411 ASSERT_WITH_SECURITY_IMPLICATION( |
| 191 m_partialSequenceSize += end - source; | 412 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); |
| 192 return false; | 413 ASSERT(!m_partialSequenceSize); |
| 193 } | 414 m_partialSequenceSize = end - source; |
| 194 // An incomplete partial sequence at the end is an error, but it
will create | 415 memcpy(m_partialSequence, source, m_partialSequenceSize); |
| 195 // a 16 bit string due to the replacementCharacter. Let the 16 b
it path handle | 416 source = end; |
| 196 // the error. | 417 break; |
| 197 return true; | 418 } |
| 198 } | 419 character = decodeNonASCIISequence(source, count); |
| 199 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_
partialSequenceSize); | 420 } |
| 200 source += count - m_partialSequenceSize; | 421 if (character == nonCharacter) { |
| 201 m_partialSequenceSize = count; | 422 sawError = true; |
| 202 } | 423 if (stopOnError) |
| 203 int character = decodeNonASCIISequence(m_partialSequence, count); | 424 break; |
| 204 if (character & ~0xff) | 425 // Each error generates a replacement character and consumes one byte. |
| 205 return true; | 426 *destination16++ = replacementCharacter; |
| 206 | 427 ++source; |
| 207 m_partialSequenceSize -= count; | 428 continue; |
| 208 *destination++ = static_cast<LChar>(character); | 429 } |
| 209 } while (m_partialSequenceSize); | 430 source += count; |
| 210 | 431 destination16 = appendCharacter(destination16, character); |
| 211 return false; | 432 } |
| 212 } | 433 } while (flush && m_partialSequenceSize); |
| 213 | 434 |
| 214 template <> | 435 buffer16.shrink(destination16 - buffer16.characters()); |
| 215 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint
8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError) | 436 |
| 216 { | 437 return String::adopt(buffer16); |
| 217 ASSERT(m_partialSequenceSize); | 438 } |
| 218 do { | 439 |
| 219 if (isASCII(m_partialSequence[0])) { | 440 template <typename CharType> |
| 220 *destination++ = m_partialSequence[0]; | 441 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) { |
| 221 consumePartialSequenceByte(); | 442 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. |
| 222 continue; | 443 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3
x). |
| 223 } | 444 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2
x). |
| 224 int count = nonASCIISequenceLength(m_partialSequence[0]); | 445 if (length > std::numeric_limits<size_t>::max() / 3) |
| 225 if (!count) { | 446 CRASH(); |
| 226 handleError(destination, stopOnError, sawError); | 447 Vector<uint8_t> bytes(length * 3); |
| 227 if (stopOnError) | 448 |
| 228 return false; | 449 size_t i = 0; |
| 229 continue; | 450 size_t bytesWritten = 0; |
| 230 } | 451 while (i < length) { |
| 231 if (count > m_partialSequenceSize) { | 452 UChar32 character; |
| 232 if (count - m_partialSequenceSize > end - source) { | 453 U16_NEXT(characters, i, length, character); |
| 233 if (!flush) { | 454 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogat
e |
| 234 // The new data is not enough to complete the sequence, so | 455 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) he
re. |
| 235 // add it to the existing partial sequence. | 456 if (0xD800 <= character && character <= 0xDFFF) |
| 236 memcpy(m_partialSequence + m_partialSequenceSize, source, en
d - source); | 457 character = replacementCharacter; |
| 237 m_partialSequenceSize += end - source; | 458 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); |
| 238 return false; | 459 } |
| 239 } | 460 |
| 240 // An incomplete partial sequence at the end is an error. | 461 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); |
| 241 handleError(destination, stopOnError, sawError); | 462 } |
| 242 if (stopOnError) | 463 |
| 243 return false; | 464 CString TextCodecUTF8::encode(const UChar* characters, |
| 244 continue; | 465 size_t length, |
| 245 } | 466 UnencodableHandling) { |
| 246 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_
partialSequenceSize); | 467 return encodeCommon(characters, length); |
| 247 source += count - m_partialSequenceSize; | 468 } |
| 248 m_partialSequenceSize = count; | 469 |
| 249 } | 470 CString TextCodecUTF8::encode(const LChar* characters, |
| 250 int character = decodeNonASCIISequence(m_partialSequence, count); | 471 size_t length, |
| 251 if (character == nonCharacter) { | 472 UnencodableHandling) { |
| 252 handleError(destination, stopOnError, sawError); | 473 return encodeCommon(characters, length); |
| 253 if (stopOnError) | 474 } |
| 254 return false; | 475 |
| 255 continue; | 476 } // namespace WTF |
| 256 } | |
| 257 | |
| 258 m_partialSequenceSize -= count; | |
| 259 destination = appendCharacter(destination, character); | |
| 260 } while (m_partialSequenceSize); | |
| 261 | |
| 262 return false; | |
| 263 } | |
| 264 | |
| 265 String TextCodecUTF8::decode(const char* bytes, size_t length, FlushBehavior flu
sh, bool stopOnError, bool& sawError) | |
| 266 { | |
| 267 // Each input byte might turn into a character. | |
| 268 // That includes all bytes in the partial-sequence buffer because | |
| 269 // each byte in an invalid sequence will turn into a replacement character. | |
| 270 StringBuffer<LChar> buffer(m_partialSequenceSize + length); | |
| 271 | |
| 272 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); | |
| 273 const uint8_t* end = source + length; | |
| 274 const uint8_t* alignedEnd = alignToMachineWord(end); | |
| 275 LChar* destination = buffer.characters(); | |
| 276 | |
| 277 do { | |
| 278 if (m_partialSequenceSize) { | |
| 279 // Explicitly copy destination and source pointers to avoid taking p
ointers to the | |
| 280 // local variables, which may harm code generation by disabling some
optimizations | |
| 281 // in some compilers. | |
| 282 LChar* destinationForHandlePartialSequence = destination; | |
| 283 const uint8_t* sourceForHandlePartialSequence = source; | |
| 284 if (handlePartialSequence(destinationForHandlePartialSequence, sourc
eForHandlePartialSequence, end, flush, stopOnError, sawError)) { | |
| 285 source = sourceForHandlePartialSequence; | |
| 286 goto upConvertTo16Bit; | |
| 287 } | |
| 288 destination = destinationForHandlePartialSequence; | |
| 289 source = sourceForHandlePartialSequence; | |
| 290 if (m_partialSequenceSize) | |
| 291 break; | |
| 292 } | |
| 293 | |
| 294 while (source < end) { | |
| 295 if (isASCII(*source)) { | |
| 296 // Fast path for ASCII. Most UTF-8 text will be ASCII. | |
| 297 if (isAlignedToMachineWord(source)) { | |
| 298 while (source < alignedEnd) { | |
| 299 MachineWord chunk = *reinterpret_cast_ptr<const MachineW
ord*>(source); | |
| 300 if (!isAllASCII<LChar>(chunk)) | |
| 301 break; | |
| 302 copyASCIIMachineWord(destination, source); | |
| 303 source += sizeof(MachineWord); | |
| 304 destination += sizeof(MachineWord); | |
| 305 } | |
| 306 if (source == end) | |
| 307 break; | |
| 308 if (!isASCII(*source)) | |
| 309 continue; | |
| 310 } | |
| 311 *destination++ = *source++; | |
| 312 continue; | |
| 313 } | |
| 314 int count = nonASCIISequenceLength(*source); | |
| 315 int character; | |
| 316 if (count == 0) { | |
| 317 character = nonCharacter; | |
| 318 } else { | |
| 319 if (count > end - source) { | |
| 320 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<
ptrdiff_t>(sizeof(m_partialSequence))); | |
| 321 ASSERT(!m_partialSequenceSize); | |
| 322 m_partialSequenceSize = end - source; | |
| 323 memcpy(m_partialSequence, source, m_partialSequenceSize); | |
| 324 source = end; | |
| 325 break; | |
| 326 } | |
| 327 character = decodeNonASCIISequence(source, count); | |
| 328 } | |
| 329 if (character == nonCharacter) { | |
| 330 sawError = true; | |
| 331 if (stopOnError) | |
| 332 break; | |
| 333 | |
| 334 goto upConvertTo16Bit; | |
| 335 } | |
| 336 if (character > 0xff) | |
| 337 goto upConvertTo16Bit; | |
| 338 | |
| 339 source += count; | |
| 340 *destination++ = static_cast<LChar>(character); | |
| 341 } | |
| 342 } while (flush && m_partialSequenceSize); | |
| 343 | |
| 344 buffer.shrink(destination - buffer.characters()); | |
| 345 | |
| 346 return String::adopt(buffer); | |
| 347 | |
| 348 upConvertTo16Bit: | |
| 349 StringBuffer<UChar> buffer16(m_partialSequenceSize + length); | |
| 350 | |
| 351 UChar* destination16 = buffer16.characters(); | |
| 352 | |
| 353 // Copy the already converted characters | |
| 354 for (LChar* converted8 = buffer.characters(); converted8 < destination;) | |
| 355 *destination16++ = *converted8++; | |
| 356 | |
| 357 do { | |
| 358 if (m_partialSequenceSize) { | |
| 359 // Explicitly copy destination and source pointers to avoid taking p
ointers to the | |
| 360 // local variables, which may harm code generation by disabling some
optimizations | |
| 361 // in some compilers. | |
| 362 UChar* destinationForHandlePartialSequence = destination16; | |
| 363 const uint8_t* sourceForHandlePartialSequence = source; | |
| 364 handlePartialSequence(destinationForHandlePartialSequence, sourceFor
HandlePartialSequence, end, flush, stopOnError, sawError); | |
| 365 destination16 = destinationForHandlePartialSequence; | |
| 366 source = sourceForHandlePartialSequence; | |
| 367 if (m_partialSequenceSize) | |
| 368 break; | |
| 369 } | |
| 370 | |
| 371 while (source < end) { | |
| 372 if (isASCII(*source)) { | |
| 373 // Fast path for ASCII. Most UTF-8 text will be ASCII. | |
| 374 if (isAlignedToMachineWord(source)) { | |
| 375 while (source < alignedEnd) { | |
| 376 MachineWord chunk = *reinterpret_cast_ptr<const MachineW
ord*>(source); | |
| 377 if (!isAllASCII<LChar>(chunk)) | |
| 378 break; | |
| 379 copyASCIIMachineWord(destination16, source); | |
| 380 source += sizeof(MachineWord); | |
| 381 destination16 += sizeof(MachineWord); | |
| 382 } | |
| 383 if (source == end) | |
| 384 break; | |
| 385 if (!isASCII(*source)) | |
| 386 continue; | |
| 387 } | |
| 388 *destination16++ = *source++; | |
| 389 continue; | |
| 390 } | |
| 391 int count = nonASCIISequenceLength(*source); | |
| 392 int character; | |
| 393 if (count == 0) { | |
| 394 character = nonCharacter; | |
| 395 } else { | |
| 396 if (count > end - source) { | |
| 397 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<
ptrdiff_t>(sizeof(m_partialSequence))); | |
| 398 ASSERT(!m_partialSequenceSize); | |
| 399 m_partialSequenceSize = end - source; | |
| 400 memcpy(m_partialSequence, source, m_partialSequenceSize); | |
| 401 source = end; | |
| 402 break; | |
| 403 } | |
| 404 character = decodeNonASCIISequence(source, count); | |
| 405 } | |
| 406 if (character == nonCharacter) { | |
| 407 sawError = true; | |
| 408 if (stopOnError) | |
| 409 break; | |
| 410 // Each error generates a replacement character and consumes one
byte. | |
| 411 *destination16++ = replacementCharacter; | |
| 412 ++source; | |
| 413 continue; | |
| 414 } | |
| 415 source += count; | |
| 416 destination16 = appendCharacter(destination16, character); | |
| 417 } | |
| 418 } while (flush && m_partialSequenceSize); | |
| 419 | |
| 420 buffer16.shrink(destination16 - buffer16.characters()); | |
| 421 | |
| 422 return String::adopt(buffer16); | |
| 423 } | |
| 424 | |
| 425 template<typename CharType> | |
| 426 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) | |
| 427 { | |
| 428 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. | |
| 429 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes
(3x). | |
| 430 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes
(2x). | |
| 431 if (length > std::numeric_limits<size_t>::max() / 3) | |
| 432 CRASH(); | |
| 433 Vector<uint8_t> bytes(length * 3); | |
| 434 | |
| 435 size_t i = 0; | |
| 436 size_t bytesWritten = 0; | |
| 437 while (i < length) { | |
| 438 UChar32 character; | |
| 439 U16_NEXT(characters, i, length, character); | |
| 440 // U16_NEXT will simply emit a surrogate code point if an unmatched surr
ogate | |
| 441 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER
) here. | |
| 442 if (0xD800 <= character && character <= 0xDFFF) | |
| 443 character = replacementCharacter; | |
| 444 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); | |
| 445 } | |
| 446 | |
| 447 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); | |
| 448 } | |
| 449 | |
| 450 CString TextCodecUTF8::encode(const UChar* characters, size_t length, Unencodabl
eHandling) | |
| 451 { | |
| 452 return encodeCommon(characters, length); | |
| 453 } | |
| 454 | |
| 455 CString TextCodecUTF8::encode(const LChar* characters, size_t length, Unencodabl
eHandling) | |
| 456 { | |
| 457 return encodeCommon(characters, length); | |
| 458 } | |
| 459 | |
| 460 } // namespace WTF | |
| OLD | NEW |