| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. | 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
| 6 * are met: | 6 * are met: |
| 7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
| 8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
| 9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
| 10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
| (...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 48 | 48 |
| 49 // Additional aliases that originally were present in the encoding | 49 // Additional aliases that originally were present in the encoding |
| 50 // table in WebKit on Macintosh, and subsequently added by | 50 // table in WebKit on Macintosh, and subsequently added by |
| 51 // TextCodecICU. Perhaps we can prove some are not used on the web | 51 // TextCodecICU. Perhaps we can prove some are not used on the web |
| 52 // and remove them. | 52 // and remove them. |
| 53 registrar("unicode11utf8", "UTF-8"); | 53 registrar("unicode11utf8", "UTF-8"); |
| 54 registrar("unicode20utf8", "UTF-8"); | 54 registrar("unicode20utf8", "UTF-8"); |
| 55 registrar("utf8", "UTF-8"); | 55 registrar("utf8", "UTF-8"); |
| 56 registrar("x-unicode20utf8", "UTF-8"); | 56 registrar("x-unicode20utf8", "UTF-8"); |
| 57 | 57 |
| 58 // Additional aliases present in the WHATWG Encoding Standard (http://encoding
.spec.whatwg.org/) | 58 // Additional aliases present in the WHATWG Encoding Standard |
| 59 // (http://encoding.spec.whatwg.org/) |
| 59 // and Firefox (24), but not in ICU 4.6. | 60 // and Firefox (24), but not in ICU 4.6. |
| 60 registrar("unicode-1-1-utf-8", "UTF-8"); | 61 registrar("unicode-1-1-utf-8", "UTF-8"); |
| 61 } | 62 } |
| 62 | 63 |
| 63 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) { | 64 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) { |
| 64 registrar("UTF-8", create, 0); | 65 registrar("UTF-8", create, 0); |
| 65 } | 66 } |
| 66 | 67 |
| 67 static inline int nonASCIISequenceLength(uint8_t firstByte) { | 68 static inline int nonASCIISequenceLength(uint8_t firstByte) { |
| 68 static const uint8_t lengths[256] = { | 69 static const uint8_t lengths[256] = { |
| (...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 184 if (count > m_partialSequenceSize) { | 185 if (count > m_partialSequenceSize) { |
| 185 if (count - m_partialSequenceSize > end - source) { | 186 if (count - m_partialSequenceSize > end - source) { |
| 186 if (!flush) { | 187 if (!flush) { |
| 187 // The new data is not enough to complete the sequence, so | 188 // The new data is not enough to complete the sequence, so |
| 188 // add it to the existing partial sequence. | 189 // add it to the existing partial sequence. |
| 189 memcpy(m_partialSequence + m_partialSequenceSize, source, | 190 memcpy(m_partialSequence + m_partialSequenceSize, source, |
| 190 end - source); | 191 end - source); |
| 191 m_partialSequenceSize += end - source; | 192 m_partialSequenceSize += end - source; |
| 192 return false; | 193 return false; |
| 193 } | 194 } |
| 194 // An incomplete partial sequence at the end is an error, but it will cr
eate | 195 // An incomplete partial sequence at the end is an error, but it will |
| 195 // a 16 bit string due to the replacementCharacter. Let the 16 bit path
handle | 196 // create a 16 bit string due to the replacementCharacter. Let the 16 |
| 196 // the error. | 197 // bit path handle the error. |
| 197 return true; | 198 return true; |
| 198 } | 199 } |
| 199 memcpy(m_partialSequence + m_partialSequenceSize, source, | 200 memcpy(m_partialSequence + m_partialSequenceSize, source, |
| 200 count - m_partialSequenceSize); | 201 count - m_partialSequenceSize); |
| 201 source += count - m_partialSequenceSize; | 202 source += count - m_partialSequenceSize; |
| 202 m_partialSequenceSize = count; | 203 m_partialSequenceSize = count; |
| 203 } | 204 } |
| 204 int character = decodeNonASCIISequence(m_partialSequence, count); | 205 int character = decodeNonASCIISequence(m_partialSequence, count); |
| 205 if (character & ~0xff) | 206 if (character & ~0xff) |
| 206 return true; | 207 return true; |
| (...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 279 // each byte in an invalid sequence will turn into a replacement character. | 280 // each byte in an invalid sequence will turn into a replacement character. |
| 280 StringBuffer<LChar> buffer(m_partialSequenceSize + length); | 281 StringBuffer<LChar> buffer(m_partialSequenceSize + length); |
| 281 | 282 |
| 282 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); | 283 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); |
| 283 const uint8_t* end = source + length; | 284 const uint8_t* end = source + length; |
| 284 const uint8_t* alignedEnd = alignToMachineWord(end); | 285 const uint8_t* alignedEnd = alignToMachineWord(end); |
| 285 LChar* destination = buffer.characters(); | 286 LChar* destination = buffer.characters(); |
| 286 | 287 |
| 287 do { | 288 do { |
| 288 if (m_partialSequenceSize) { | 289 if (m_partialSequenceSize) { |
| 289 // Explicitly copy destination and source pointers to avoid taking pointer
s to the | 290 // Explicitly copy destination and source pointers to avoid taking |
| 290 // local variables, which may harm code generation by disabling some optim
izations | 291 // pointers to the local variables, which may harm code generation by |
| 291 // in some compilers. | 292 // disabling some optimizations in some compilers. |
| 292 LChar* destinationForHandlePartialSequence = destination; | 293 LChar* destinationForHandlePartialSequence = destination; |
| 293 const uint8_t* sourceForHandlePartialSequence = source; | 294 const uint8_t* sourceForHandlePartialSequence = source; |
| 294 if (handlePartialSequence(destinationForHandlePartialSequence, | 295 if (handlePartialSequence(destinationForHandlePartialSequence, |
| 295 sourceForHandlePartialSequence, end, flush, | 296 sourceForHandlePartialSequence, end, flush, |
| 296 stopOnError, sawError)) { | 297 stopOnError, sawError)) { |
| 297 source = sourceForHandlePartialSequence; | 298 source = sourceForHandlePartialSequence; |
| 298 goto upConvertTo16Bit; | 299 goto upConvertTo16Bit; |
| 299 } | 300 } |
| 300 destination = destinationForHandlePartialSequence; | 301 destination = destinationForHandlePartialSequence; |
| 301 source = sourceForHandlePartialSequence; | 302 source = sourceForHandlePartialSequence; |
| (...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 363 StringBuffer<UChar> buffer16(m_partialSequenceSize + length); | 364 StringBuffer<UChar> buffer16(m_partialSequenceSize + length); |
| 364 | 365 |
| 365 UChar* destination16 = buffer16.characters(); | 366 UChar* destination16 = buffer16.characters(); |
| 366 | 367 |
| 367 // Copy the already converted characters | 368 // Copy the already converted characters |
| 368 for (LChar* converted8 = buffer.characters(); converted8 < destination;) | 369 for (LChar* converted8 = buffer.characters(); converted8 < destination;) |
| 369 *destination16++ = *converted8++; | 370 *destination16++ = *converted8++; |
| 370 | 371 |
| 371 do { | 372 do { |
| 372 if (m_partialSequenceSize) { | 373 if (m_partialSequenceSize) { |
| 373 // Explicitly copy destination and source pointers to avoid taking pointer
s to the | 374 // Explicitly copy destination and source pointers to avoid taking |
| 374 // local variables, which may harm code generation by disabling some optim
izations | 375 // pointers to the local variables, which may harm code generation by |
| 375 // in some compilers. | 376 // disabling some optimizations in some compilers. |
| 376 UChar* destinationForHandlePartialSequence = destination16; | 377 UChar* destinationForHandlePartialSequence = destination16; |
| 377 const uint8_t* sourceForHandlePartialSequence = source; | 378 const uint8_t* sourceForHandlePartialSequence = source; |
| 378 handlePartialSequence(destinationForHandlePartialSequence, | 379 handlePartialSequence(destinationForHandlePartialSequence, |
| 379 sourceForHandlePartialSequence, end, flush, | 380 sourceForHandlePartialSequence, end, flush, |
| 380 stopOnError, sawError); | 381 stopOnError, sawError); |
| 381 destination16 = destinationForHandlePartialSequence; | 382 destination16 = destinationForHandlePartialSequence; |
| 382 source = sourceForHandlePartialSequence; | 383 source = sourceForHandlePartialSequence; |
| 383 if (m_partialSequenceSize) | 384 if (m_partialSequenceSize) |
| 384 break; | 385 break; |
| 385 } | 386 } |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 436 } while (flush && m_partialSequenceSize); | 437 } while (flush && m_partialSequenceSize); |
| 437 | 438 |
| 438 buffer16.shrink(destination16 - buffer16.characters()); | 439 buffer16.shrink(destination16 - buffer16.characters()); |
| 439 | 440 |
| 440 return String::adopt(buffer16); | 441 return String::adopt(buffer16); |
| 441 } | 442 } |
| 442 | 443 |
| 443 template <typename CharType> | 444 template <typename CharType> |
| 444 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) { | 445 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) { |
| 445 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. | 446 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. |
| 446 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3
x). | 447 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes |
| 447 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2
x). | 448 // (3x). |
| 449 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes |
| 450 // (2x). |
| 448 if (length > std::numeric_limits<size_t>::max() / 3) | 451 if (length > std::numeric_limits<size_t>::max() / 3) |
| 449 CRASH(); | 452 CRASH(); |
| 450 Vector<uint8_t> bytes(length * 3); | 453 Vector<uint8_t> bytes(length * 3); |
| 451 | 454 |
| 452 size_t i = 0; | 455 size_t i = 0; |
| 453 size_t bytesWritten = 0; | 456 size_t bytesWritten = 0; |
| 454 while (i < length) { | 457 while (i < length) { |
| 455 UChar32 character; | 458 UChar32 character; |
| 456 U16_NEXT(characters, i, length, character); | 459 U16_NEXT(characters, i, length, character); |
| 457 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogat
e | 460 // U16_NEXT will simply emit a surrogate code point if an unmatched |
| 458 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) he
re. | 461 // surrogate is encountered; we must convert it to a |
| 462 // U+FFFD (REPLACEMENT CHARACTER) here. |
| 459 if (0xD800 <= character && character <= 0xDFFF) | 463 if (0xD800 <= character && character <= 0xDFFF) |
| 460 character = replacementCharacter; | 464 character = replacementCharacter; |
| 461 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); | 465 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); |
| 462 } | 466 } |
| 463 | 467 |
| 464 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); | 468 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); |
| 465 } | 469 } |
| 466 | 470 |
| 467 CString TextCodecUTF8::encode(const UChar* characters, | 471 CString TextCodecUTF8::encode(const UChar* characters, |
| 468 size_t length, | 472 size_t length, |
| 469 UnencodableHandling) { | 473 UnencodableHandling) { |
| 470 return encodeCommon(characters, length); | 474 return encodeCommon(characters, length); |
| 471 } | 475 } |
| 472 | 476 |
| 473 CString TextCodecUTF8::encode(const LChar* characters, | 477 CString TextCodecUTF8::encode(const LChar* characters, |
| 474 size_t length, | 478 size_t length, |
| 475 UnencodableHandling) { | 479 UnencodableHandling) { |
| 476 return encodeCommon(characters, length); | 480 return encodeCommon(characters, length); |
| 477 } | 481 } |
| 478 | 482 |
| 479 } // namespace WTF | 483 } // namespace WTF |
| OLD | NEW |