OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. | 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. |
3 * | 3 * |
4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
6 * are met: | 6 * are met: |
7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
48 | 48 |
49 // Additional aliases that originally were present in the encoding | 49 // Additional aliases that originally were present in the encoding |
50 // table in WebKit on Macintosh, and subsequently added by | 50 // table in WebKit on Macintosh, and subsequently added by |
51 // TextCodecICU. Perhaps we can prove some are not used on the web | 51 // TextCodecICU. Perhaps we can prove some are not used on the web |
52 // and remove them. | 52 // and remove them. |
53 registrar("unicode11utf8", "UTF-8"); | 53 registrar("unicode11utf8", "UTF-8"); |
54 registrar("unicode20utf8", "UTF-8"); | 54 registrar("unicode20utf8", "UTF-8"); |
55 registrar("utf8", "UTF-8"); | 55 registrar("utf8", "UTF-8"); |
56 registrar("x-unicode20utf8", "UTF-8"); | 56 registrar("x-unicode20utf8", "UTF-8"); |
57 | 57 |
58 // Additional aliases present in the WHATWG Encoding Standard (http://encoding
.spec.whatwg.org/) | 58 // Additional aliases present in the WHATWG Encoding Standard |
| 59 // (http://encoding.spec.whatwg.org/) |
59 // and Firefox (24), but not in ICU 4.6. | 60 // and Firefox (24), but not in ICU 4.6. |
60 registrar("unicode-1-1-utf-8", "UTF-8"); | 61 registrar("unicode-1-1-utf-8", "UTF-8"); |
61 } | 62 } |
62 | 63 |
63 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) { | 64 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) { |
64 registrar("UTF-8", create, 0); | 65 registrar("UTF-8", create, 0); |
65 } | 66 } |
66 | 67 |
67 static inline int nonASCIISequenceLength(uint8_t firstByte) { | 68 static inline int nonASCIISequenceLength(uint8_t firstByte) { |
68 static const uint8_t lengths[256] = { | 69 static const uint8_t lengths[256] = { |
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
184 if (count > m_partialSequenceSize) { | 185 if (count > m_partialSequenceSize) { |
185 if (count - m_partialSequenceSize > end - source) { | 186 if (count - m_partialSequenceSize > end - source) { |
186 if (!flush) { | 187 if (!flush) { |
187 // The new data is not enough to complete the sequence, so | 188 // The new data is not enough to complete the sequence, so |
188 // add it to the existing partial sequence. | 189 // add it to the existing partial sequence. |
189 memcpy(m_partialSequence + m_partialSequenceSize, source, | 190 memcpy(m_partialSequence + m_partialSequenceSize, source, |
190 end - source); | 191 end - source); |
191 m_partialSequenceSize += end - source; | 192 m_partialSequenceSize += end - source; |
192 return false; | 193 return false; |
193 } | 194 } |
194 // An incomplete partial sequence at the end is an error, but it will cr
eate | 195 // An incomplete partial sequence at the end is an error, but it will |
195 // a 16 bit string due to the replacementCharacter. Let the 16 bit path
handle | 196 // create a 16 bit string due to the replacementCharacter. Let the 16 |
196 // the error. | 197 // bit path handle the error. |
197 return true; | 198 return true; |
198 } | 199 } |
199 memcpy(m_partialSequence + m_partialSequenceSize, source, | 200 memcpy(m_partialSequence + m_partialSequenceSize, source, |
200 count - m_partialSequenceSize); | 201 count - m_partialSequenceSize); |
201 source += count - m_partialSequenceSize; | 202 source += count - m_partialSequenceSize; |
202 m_partialSequenceSize = count; | 203 m_partialSequenceSize = count; |
203 } | 204 } |
204 int character = decodeNonASCIISequence(m_partialSequence, count); | 205 int character = decodeNonASCIISequence(m_partialSequence, count); |
205 if (character & ~0xff) | 206 if (character & ~0xff) |
206 return true; | 207 return true; |
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
279 // each byte in an invalid sequence will turn into a replacement character. | 280 // each byte in an invalid sequence will turn into a replacement character. |
280 StringBuffer<LChar> buffer(m_partialSequenceSize + length); | 281 StringBuffer<LChar> buffer(m_partialSequenceSize + length); |
281 | 282 |
282 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); | 283 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); |
283 const uint8_t* end = source + length; | 284 const uint8_t* end = source + length; |
284 const uint8_t* alignedEnd = alignToMachineWord(end); | 285 const uint8_t* alignedEnd = alignToMachineWord(end); |
285 LChar* destination = buffer.characters(); | 286 LChar* destination = buffer.characters(); |
286 | 287 |
287 do { | 288 do { |
288 if (m_partialSequenceSize) { | 289 if (m_partialSequenceSize) { |
289 // Explicitly copy destination and source pointers to avoid taking pointer
s to the | 290 // Explicitly copy destination and source pointers to avoid taking |
290 // local variables, which may harm code generation by disabling some optim
izations | 291 // pointers to the local variables, which may harm code generation by |
291 // in some compilers. | 292 // disabling some optimizations in some compilers. |
292 LChar* destinationForHandlePartialSequence = destination; | 293 LChar* destinationForHandlePartialSequence = destination; |
293 const uint8_t* sourceForHandlePartialSequence = source; | 294 const uint8_t* sourceForHandlePartialSequence = source; |
294 if (handlePartialSequence(destinationForHandlePartialSequence, | 295 if (handlePartialSequence(destinationForHandlePartialSequence, |
295 sourceForHandlePartialSequence, end, flush, | 296 sourceForHandlePartialSequence, end, flush, |
296 stopOnError, sawError)) { | 297 stopOnError, sawError)) { |
297 source = sourceForHandlePartialSequence; | 298 source = sourceForHandlePartialSequence; |
298 goto upConvertTo16Bit; | 299 goto upConvertTo16Bit; |
299 } | 300 } |
300 destination = destinationForHandlePartialSequence; | 301 destination = destinationForHandlePartialSequence; |
301 source = sourceForHandlePartialSequence; | 302 source = sourceForHandlePartialSequence; |
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
363 StringBuffer<UChar> buffer16(m_partialSequenceSize + length); | 364 StringBuffer<UChar> buffer16(m_partialSequenceSize + length); |
364 | 365 |
365 UChar* destination16 = buffer16.characters(); | 366 UChar* destination16 = buffer16.characters(); |
366 | 367 |
367 // Copy the already converted characters | 368 // Copy the already converted characters |
368 for (LChar* converted8 = buffer.characters(); converted8 < destination;) | 369 for (LChar* converted8 = buffer.characters(); converted8 < destination;) |
369 *destination16++ = *converted8++; | 370 *destination16++ = *converted8++; |
370 | 371 |
371 do { | 372 do { |
372 if (m_partialSequenceSize) { | 373 if (m_partialSequenceSize) { |
373 // Explicitly copy destination and source pointers to avoid taking pointer
s to the | 374 // Explicitly copy destination and source pointers to avoid taking |
374 // local variables, which may harm code generation by disabling some optim
izations | 375 // pointers to the local variables, which may harm code generation by |
375 // in some compilers. | 376 // disabling some optimizations in some compilers. |
376 UChar* destinationForHandlePartialSequence = destination16; | 377 UChar* destinationForHandlePartialSequence = destination16; |
377 const uint8_t* sourceForHandlePartialSequence = source; | 378 const uint8_t* sourceForHandlePartialSequence = source; |
378 handlePartialSequence(destinationForHandlePartialSequence, | 379 handlePartialSequence(destinationForHandlePartialSequence, |
379 sourceForHandlePartialSequence, end, flush, | 380 sourceForHandlePartialSequence, end, flush, |
380 stopOnError, sawError); | 381 stopOnError, sawError); |
381 destination16 = destinationForHandlePartialSequence; | 382 destination16 = destinationForHandlePartialSequence; |
382 source = sourceForHandlePartialSequence; | 383 source = sourceForHandlePartialSequence; |
383 if (m_partialSequenceSize) | 384 if (m_partialSequenceSize) |
384 break; | 385 break; |
385 } | 386 } |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
436 } while (flush && m_partialSequenceSize); | 437 } while (flush && m_partialSequenceSize); |
437 | 438 |
438 buffer16.shrink(destination16 - buffer16.characters()); | 439 buffer16.shrink(destination16 - buffer16.characters()); |
439 | 440 |
440 return String::adopt(buffer16); | 441 return String::adopt(buffer16); |
441 } | 442 } |
442 | 443 |
443 template <typename CharType> | 444 template <typename CharType> |
444 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) { | 445 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length) { |
445 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. | 446 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. |
446 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3
x). | 447 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes |
447 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2
x). | 448 // (3x). |
| 449 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes |
| 450 // (2x). |
448 if (length > std::numeric_limits<size_t>::max() / 3) | 451 if (length > std::numeric_limits<size_t>::max() / 3) |
449 CRASH(); | 452 CRASH(); |
450 Vector<uint8_t> bytes(length * 3); | 453 Vector<uint8_t> bytes(length * 3); |
451 | 454 |
452 size_t i = 0; | 455 size_t i = 0; |
453 size_t bytesWritten = 0; | 456 size_t bytesWritten = 0; |
454 while (i < length) { | 457 while (i < length) { |
455 UChar32 character; | 458 UChar32 character; |
456 U16_NEXT(characters, i, length, character); | 459 U16_NEXT(characters, i, length, character); |
457 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogat
e | 460 // U16_NEXT will simply emit a surrogate code point if an unmatched |
458 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) he
re. | 461 // surrogate is encountered; we must convert it to a |
| 462 // U+FFFD (REPLACEMENT CHARACTER) here. |
459 if (0xD800 <= character && character <= 0xDFFF) | 463 if (0xD800 <= character && character <= 0xDFFF) |
460 character = replacementCharacter; | 464 character = replacementCharacter; |
461 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); | 465 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); |
462 } | 466 } |
463 | 467 |
464 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); | 468 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); |
465 } | 469 } |
466 | 470 |
467 CString TextCodecUTF8::encode(const UChar* characters, | 471 CString TextCodecUTF8::encode(const UChar* characters, |
468 size_t length, | 472 size_t length, |
469 UnencodableHandling) { | 473 UnencodableHandling) { |
470 return encodeCommon(characters, length); | 474 return encodeCommon(characters, length); |
471 } | 475 } |
472 | 476 |
473 CString TextCodecUTF8::encode(const LChar* characters, | 477 CString TextCodecUTF8::encode(const LChar* characters, |
474 size_t length, | 478 size_t length, |
475 UnencodableHandling) { | 479 UnencodableHandling) { |
476 return encodeCommon(characters, length); | 480 return encodeCommon(characters, length); |
477 } | 481 } |
478 | 482 |
479 } // namespace WTF | 483 } // namespace WTF |
OLD | NEW |