| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. | 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
| 6 * are met: | 6 * are met: |
| 7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
| 8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
| 9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
| 10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
| (...skipping 18 matching lines...) Expand all Loading... |
| 29 #include "wtf/text/CString.h" | 29 #include "wtf/text/CString.h" |
| 30 #include "wtf/text/CharacterNames.h" | 30 #include "wtf/text/CharacterNames.h" |
| 31 #include "wtf/text/StringBuffer.h" | 31 #include "wtf/text/StringBuffer.h" |
| 32 #include "wtf/text/TextCodecASCIIFastPath.h" | 32 #include "wtf/text/TextCodecASCIIFastPath.h" |
| 33 #include <memory> | 33 #include <memory> |
| 34 | 34 |
| 35 namespace WTF { | 35 namespace WTF { |
| 36 | 36 |
| 37 using namespace WTF::Unicode; | 37 using namespace WTF::Unicode; |
| 38 | 38 |
| 39 const int nonCharacter = -1; | 39 // We'll use nonCharacter* constants to signal invalid utf-8. |
| 40 // The number in the name signals how many input bytes were invalid. |
| 41 const int nonCharacter1 = -1; |
| 42 const int nonCharacter2 = -2; |
| 43 const int nonCharacter3 = -3; |
| 44 |
| 45 bool isNonCharacter(int character) { |
| 46 return character >= nonCharacter3 && character <= nonCharacter1; |
| 47 } |
| 40 | 48 |
| 41 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&, | 49 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&, |
| 42 const void*) { | 50 const void*) { |
| 43 return wrapUnique(new TextCodecUTF8); | 51 return wrapUnique(new TextCodecUTF8); |
| 44 } | 52 } |
| 45 | 53 |
| 46 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) { | 54 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) { |
| 47 registrar("UTF-8", "UTF-8"); | 55 registrar("UTF-8", "UTF-8"); |
| 48 | 56 |
| 49 // Additional aliases that originally were present in the encoding | 57 // Additional aliases that originally were present in the encoding |
| (...skipping 30 matching lines...) Expand all Loading... |
| 80 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; | 88 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
| 81 return lengths[firstByte]; | 89 return lengths[firstByte]; |
| 82 } | 90 } |
| 83 | 91 |
| 84 static inline int decodeNonASCIISequence(const uint8_t* sequence, | 92 static inline int decodeNonASCIISequence(const uint8_t* sequence, |
| 85 unsigned length) { | 93 unsigned length) { |
| 86 ASSERT(!isASCII(sequence[0])); | 94 ASSERT(!isASCII(sequence[0])); |
| 87 if (length == 2) { | 95 if (length == 2) { |
| 88 ASSERT(sequence[0] <= 0xDF); | 96 ASSERT(sequence[0] <= 0xDF); |
| 89 if (sequence[0] < 0xC2) | 97 if (sequence[0] < 0xC2) |
| 90 return nonCharacter; | 98 return nonCharacter1; |
| 91 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | 99 if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
| 92 return nonCharacter; | 100 return nonCharacter1; |
| 93 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; | 101 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; |
| 94 } | 102 } |
| 95 if (length == 3) { | 103 if (length == 3) { |
| 96 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); | 104 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); |
| 97 switch (sequence[0]) { | 105 switch (sequence[0]) { |
| 98 case 0xE0: | 106 case 0xE0: |
| 99 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) | 107 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) |
| 100 return nonCharacter; | 108 return nonCharacter1; |
| 101 break; | 109 break; |
| 102 case 0xED: | 110 case 0xED: |
| 103 if (sequence[1] < 0x80 || sequence[1] > 0x9F) | 111 if (sequence[1] < 0x80 || sequence[1] > 0x9F) |
| 104 return nonCharacter; | 112 return nonCharacter1; |
| 105 break; | 113 break; |
| 106 default: | 114 default: |
| 107 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | 115 if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
| 108 return nonCharacter; | 116 return nonCharacter1; |
| 109 } | 117 } |
| 110 if (sequence[2] < 0x80 || sequence[2] > 0xBF) | 118 if (sequence[2] < 0x80 || sequence[2] > 0xBF) |
| 111 return nonCharacter; | 119 return nonCharacter2; |
| 112 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - | 120 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - |
| 113 0x000E2080; | 121 0x000E2080; |
| 114 } | 122 } |
| 115 ASSERT(length == 4); | 123 ASSERT(length == 4); |
| 116 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); | 124 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); |
| 117 switch (sequence[0]) { | 125 switch (sequence[0]) { |
| 118 case 0xF0: | 126 case 0xF0: |
| 119 if (sequence[1] < 0x90 || sequence[1] > 0xBF) | 127 if (sequence[1] < 0x90 || sequence[1] > 0xBF) |
| 120 return nonCharacter; | 128 return nonCharacter1; |
| 121 break; | 129 break; |
| 122 case 0xF4: | 130 case 0xF4: |
| 123 if (sequence[1] < 0x80 || sequence[1] > 0x8F) | 131 if (sequence[1] < 0x80 || sequence[1] > 0x8F) |
| 124 return nonCharacter; | 132 return nonCharacter1; |
| 125 break; | 133 break; |
| 126 default: | 134 default: |
| 127 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | 135 if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
| 128 return nonCharacter; | 136 return nonCharacter1; |
| 129 } | 137 } |
| 130 if (sequence[2] < 0x80 || sequence[2] > 0xBF) | 138 if (sequence[2] < 0x80 || sequence[2] > 0xBF) |
| 131 return nonCharacter; | 139 return nonCharacter2; |
| 132 if (sequence[3] < 0x80 || sequence[3] > 0xBF) | 140 if (sequence[3] < 0x80 || sequence[3] > 0xBF) |
| 133 return nonCharacter; | 141 return nonCharacter3; |
| 134 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + | 142 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + |
| 135 sequence[3]) - | 143 sequence[3]) - |
| 136 0x03C82080; | 144 0x03C82080; |
| 137 } | 145 } |
| 138 | 146 |
| 139 static inline UChar* appendCharacter(UChar* destination, int character) { | 147 static inline UChar* appendCharacter(UChar* destination, int character) { |
| 140 ASSERT(character != nonCharacter); | 148 DCHECK(!isNonCharacter(character)); |
| 141 ASSERT(!U_IS_SURROGATE(character)); | 149 DCHECK(!U_IS_SURROGATE(character)); |
| 142 if (U_IS_BMP(character)) { | 150 if (U_IS_BMP(character)) { |
| 143 *destination++ = static_cast<UChar>(character); | 151 *destination++ = static_cast<UChar>(character); |
| 144 } else { | 152 } else { |
| 145 *destination++ = U16_LEAD(character); | 153 *destination++ = U16_LEAD(character); |
| 146 *destination++ = U16_TRAIL(character); | 154 *destination++ = U16_TRAIL(character); |
| 147 } | 155 } |
| 148 return destination; | 156 return destination; |
| 149 } | 157 } |
| 150 | 158 |
| 151 void TextCodecUTF8::consumePartialSequenceByte() { | 159 void TextCodecUTF8::consumePartialSequenceByte() { |
| (...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 249 if (stopOnError) | 257 if (stopOnError) |
| 250 return false; | 258 return false; |
| 251 continue; | 259 continue; |
| 252 } | 260 } |
| 253 memcpy(m_partialSequence + m_partialSequenceSize, source, | 261 memcpy(m_partialSequence + m_partialSequenceSize, source, |
| 254 count - m_partialSequenceSize); | 262 count - m_partialSequenceSize); |
| 255 source += count - m_partialSequenceSize; | 263 source += count - m_partialSequenceSize; |
| 256 m_partialSequenceSize = count; | 264 m_partialSequenceSize = count; |
| 257 } | 265 } |
| 258 int character = decodeNonASCIISequence(m_partialSequence, count); | 266 int character = decodeNonASCIISequence(m_partialSequence, count); |
| 259 if (character == nonCharacter) { | 267 if (isNonCharacter(character)) { |
| 260 handleError(destination, stopOnError, sawError); | 268 handleError(destination, stopOnError, sawError); |
| 261 if (stopOnError) | 269 if (stopOnError) |
| 262 return false; | 270 return false; |
| 263 continue; | 271 continue; |
| 264 } | 272 } |
| 265 | 273 |
| 266 m_partialSequenceSize -= count; | 274 m_partialSequenceSize -= count; |
| 267 destination = appendCharacter(destination, character); | 275 destination = appendCharacter(destination, character); |
| 268 } while (m_partialSequenceSize); | 276 } while (m_partialSequenceSize); |
| 269 | 277 |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 321 break; | 329 break; |
| 322 if (!isASCII(*source)) | 330 if (!isASCII(*source)) |
| 323 continue; | 331 continue; |
| 324 } | 332 } |
| 325 *destination++ = *source++; | 333 *destination++ = *source++; |
| 326 continue; | 334 continue; |
| 327 } | 335 } |
| 328 int count = nonASCIISequenceLength(*source); | 336 int count = nonASCIISequenceLength(*source); |
| 329 int character; | 337 int character; |
| 330 if (count == 0) { | 338 if (count == 0) { |
| 331 character = nonCharacter; | 339 character = nonCharacter1; |
| 332 } else { | 340 } else { |
| 333 if (count > end - source) { | 341 if (count > end - source) { |
| 334 ASSERT_WITH_SECURITY_IMPLICATION( | 342 ASSERT_WITH_SECURITY_IMPLICATION( |
| 335 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); | 343 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); |
| 336 ASSERT(!m_partialSequenceSize); | 344 ASSERT(!m_partialSequenceSize); |
| 337 m_partialSequenceSize = end - source; | 345 m_partialSequenceSize = end - source; |
| 338 memcpy(m_partialSequence, source, m_partialSequenceSize); | 346 memcpy(m_partialSequence, source, m_partialSequenceSize); |
| 339 source = end; | 347 source = end; |
| 340 break; | 348 break; |
| 341 } | 349 } |
| 342 character = decodeNonASCIISequence(source, count); | 350 character = decodeNonASCIISequence(source, count); |
| 343 } | 351 } |
| 344 if (character == nonCharacter) { | 352 if (isNonCharacter(character)) { |
| 345 sawError = true; | 353 sawError = true; |
| 346 if (stopOnError) | 354 if (stopOnError) |
| 347 break; | 355 break; |
| 348 | 356 |
| 349 goto upConvertTo16Bit; | 357 goto upConvertTo16Bit; |
| 350 } | 358 } |
| 351 if (character > 0xff) | 359 if (character > 0xff) |
| 352 goto upConvertTo16Bit; | 360 goto upConvertTo16Bit; |
| 353 | 361 |
| 354 source += count; | 362 source += count; |
| (...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 402 break; | 410 break; |
| 403 if (!isASCII(*source)) | 411 if (!isASCII(*source)) |
| 404 continue; | 412 continue; |
| 405 } | 413 } |
| 406 *destination16++ = *source++; | 414 *destination16++ = *source++; |
| 407 continue; | 415 continue; |
| 408 } | 416 } |
| 409 int count = nonASCIISequenceLength(*source); | 417 int count = nonASCIISequenceLength(*source); |
| 410 int character; | 418 int character; |
| 411 if (count == 0) { | 419 if (count == 0) { |
| 412 character = nonCharacter; | 420 character = nonCharacter1; |
| 413 } else { | 421 } else { |
| 414 if (count > end - source) { | 422 if (count > end - source) { |
| 415 ASSERT_WITH_SECURITY_IMPLICATION( | 423 ASSERT_WITH_SECURITY_IMPLICATION( |
| 416 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); | 424 end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); |
| 417 ASSERT(!m_partialSequenceSize); | 425 ASSERT(!m_partialSequenceSize); |
| 418 m_partialSequenceSize = end - source; | 426 m_partialSequenceSize = end - source; |
| 419 memcpy(m_partialSequence, source, m_partialSequenceSize); | 427 memcpy(m_partialSequence, source, m_partialSequenceSize); |
| 420 source = end; | 428 source = end; |
| 421 break; | 429 break; |
| 422 } | 430 } |
| 423 character = decodeNonASCIISequence(source, count); | 431 character = decodeNonASCIISequence(source, count); |
| 424 } | 432 } |
| 425 if (character == nonCharacter) { | 433 if (isNonCharacter(character)) { |
| 426 sawError = true; | 434 sawError = true; |
| 427 if (stopOnError) | 435 if (stopOnError) |
| 428 break; | 436 break; |
| 429 // Each error generates a replacement character and consumes one byte. | 437 // Each error generates one replacement character and consumes the |
| 438 // 'largest subpart' of the incomplete character. |
| 439 // Note that the nonCharacterX constants go from -1..-3 and contain |
| 440 // the negative of number of bytes comprising the broken encoding |
| 441 // detected. So subtracting c (when isNonCharacter(c)) adds the number |
| 442 // of broken bytes. |
| 430 *destination16++ = replacementCharacter; | 443 *destination16++ = replacementCharacter; |
| 431 ++source; | 444 source -= character; |
| 432 continue; | 445 continue; |
| 433 } | 446 } |
| 434 source += count; | 447 source += count; |
| 435 destination16 = appendCharacter(destination16, character); | 448 destination16 = appendCharacter(destination16, character); |
| 436 } | 449 } |
| 437 } while (flush && m_partialSequenceSize); | 450 } while (flush && m_partialSequenceSize); |
| 438 | 451 |
| 439 buffer16.shrink(destination16 - buffer16.characters()); | 452 buffer16.shrink(destination16 - buffer16.characters()); |
| 440 | 453 |
| 441 return String::adopt(buffer16); | 454 return String::adopt(buffer16); |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 474 return encodeCommon(characters, length); | 487 return encodeCommon(characters, length); |
| 475 } | 488 } |
| 476 | 489 |
| 477 CString TextCodecUTF8::encode(const LChar* characters, | 490 CString TextCodecUTF8::encode(const LChar* characters, |
| 478 size_t length, | 491 size_t length, |
| 479 UnencodableHandling) { | 492 UnencodableHandling) { |
| 480 return encodeCommon(characters, length); | 493 return encodeCommon(characters, length); |
| 481 } | 494 } |
| 482 | 495 |
| 483 } // namespace WTF | 496 } // namespace WTF |
| OLD | NEW |