Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. | 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
| 6 * are met: | 6 * are met: |
| 7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
| 8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
| 9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
| 10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
| (...skipping 18 matching lines...) Expand all Loading... | |
| 29 #include "wtf/text/CString.h" | 29 #include "wtf/text/CString.h" |
| 30 #include "wtf/text/CharacterNames.h" | 30 #include "wtf/text/CharacterNames.h" |
| 31 #include "wtf/text/StringBuffer.h" | 31 #include "wtf/text/StringBuffer.h" |
| 32 #include "wtf/text/TextCodecASCIIFastPath.h" | 32 #include "wtf/text/TextCodecASCIIFastPath.h" |
| 33 #include <memory> | 33 #include <memory> |
| 34 | 34 |
| 35 namespace WTF { | 35 namespace WTF { |
| 36 | 36 |
| 37 using namespace WTF::Unicode; | 37 using namespace WTF::Unicode; |
| 38 | 38 |
| 39 const int nonCharacter = -1; | 39 // We'll use nonCharacter* constants to signal invalid utf-8. |
| 40 // The number in the name signals how many input bytes were invalid. | |
| 41 const int nonCharacter1 = -1; | |
| 42 const int nonCharacter2 = -2; | |
| 43 const int nonCharacter3 = -3; | |
| 44 | |
| 45 bool isNonCharacter(int character) { | |
| 46 return character >= nonCharacter3 && character <= nonCharacter1; | |
| 47 } | |
| 40 | 48 |
| 41 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&, | 49 std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&, |
| 42 const void*) { | 50 const void*) { |
| 43 return wrapUnique(new TextCodecUTF8); | 51 return wrapUnique(new TextCodecUTF8); |
| 44 } | 52 } |
| 45 | 53 |
| 46 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) { | 54 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) { |
| 47 registrar("UTF-8", "UTF-8"); | 55 registrar("UTF-8", "UTF-8"); |
| 48 | 56 |
| 49 // Additional aliases that originally were present in the encoding | 57 // Additional aliases that originally were present in the encoding |
| (...skipping 30 matching lines...) Expand all Loading... | |
| 80 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; | 88 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
| 81 return lengths[firstByte]; | 89 return lengths[firstByte]; |
| 82 } | 90 } |
| 83 | 91 |
| 84 static inline int decodeNonASCIISequence(const uint8_t* sequence, | 92 static inline int decodeNonASCIISequence(const uint8_t* sequence, |
| 85 unsigned length) { | 93 unsigned length) { |
| 86 ASSERT(!isASCII(sequence[0])); | 94 ASSERT(!isASCII(sequence[0])); |
| 87 if (length == 2) { | 95 if (length == 2) { |
| 88 ASSERT(sequence[0] <= 0xDF); | 96 ASSERT(sequence[0] <= 0xDF); |
| 89 if (sequence[0] < 0xC2) | 97 if (sequence[0] < 0xC2) |
| 90 return nonCharacter; | 98 return nonCharacter1; |
| 91 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | 99 if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
| 92 return nonCharacter; | 100 return nonCharacter1; |
| 93 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; | 101 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; |
| 94 } | 102 } |
| 95 if (length == 3) { | 103 if (length == 3) { |
| 96 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); | 104 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); |
| 97 switch (sequence[0]) { | 105 switch (sequence[0]) { |
| 98 case 0xE0: | 106 case 0xE0: |
| 99 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) | 107 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) |
| 100 return nonCharacter; | 108 return nonCharacter1; |
| 101 break; | 109 break; |
| 102 case 0xED: | 110 case 0xED: |
| 103 if (sequence[1] < 0x80 || sequence[1] > 0x9F) | 111 if (sequence[1] < 0x80 || sequence[1] > 0x9F) |
| 104 return nonCharacter; | 112 return nonCharacter1; |
| 105 break; | 113 break; |
| 106 default: | 114 default: |
| 107 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | 115 if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
| 108 return nonCharacter; | 116 return nonCharacter1; |
| 109 } | 117 } |
| 110 if (sequence[2] < 0x80 || sequence[2] > 0xBF) | 118 if (sequence[2] < 0x80 || sequence[2] > 0xBF) |
| 111 return nonCharacter; | 119 return nonCharacter2; |
| 112 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - | 120 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - |
| 113 0x000E2080; | 121 0x000E2080; |
| 114 } | 122 } |
| 115 ASSERT(length == 4); | 123 ASSERT(length == 4); |
| 116 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); | 124 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); |
| 117 switch (sequence[0]) { | 125 switch (sequence[0]) { |
| 118 case 0xF0: | 126 case 0xF0: |
| 119 if (sequence[1] < 0x90 || sequence[1] > 0xBF) | 127 if (sequence[1] < 0x90 || sequence[1] > 0xBF) |
| 120 return nonCharacter; | 128 return nonCharacter1; |
| 121 break; | 129 break; |
| 122 case 0xF4: | 130 case 0xF4: |
| 123 if (sequence[1] < 0x80 || sequence[1] > 0x8F) | 131 if (sequence[1] < 0x80 || sequence[1] > 0x8F) |
| 124 return nonCharacter; | 132 return nonCharacter1; |
| 125 break; | 133 break; |
| 126 default: | 134 default: |
| 127 if (sequence[1] < 0x80 || sequence[1] > 0xBF) | 135 if (sequence[1] < 0x80 || sequence[1] > 0xBF) |
| 128 return nonCharacter; | 136 return nonCharacter1; |
| 129 } | 137 } |
| 130 if (sequence[2] < 0x80 || sequence[2] > 0xBF) | 138 if (sequence[2] < 0x80 || sequence[2] > 0xBF) |
| 131 return nonCharacter; | 139 return nonCharacter2; |
| 132 if (sequence[3] < 0x80 || sequence[3] > 0xBF) | 140 if (sequence[3] < 0x80 || sequence[3] > 0xBF) |
| 133 return nonCharacter; | 141 return nonCharacter3; |
| 134 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + | 142 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + |
| 135 sequence[3]) - | 143 sequence[3]) - |
| 136 0x03C82080; | 144 0x03C82080; |
| 137 } | 145 } |
| 138 | 146 |
| 139 static inline UChar* appendCharacter(UChar* destination, int character) { | 147 static inline UChar* appendCharacter(UChar* destination, int character) { |
| 140 ASSERT(character != nonCharacter); | 148 DCHECK(!isNonCharacter(character)); |
| 141 ASSERT(!U_IS_SURROGATE(character)); | 149 DCHECK(!U_IS_SURROGATE(character)); |
| 142 if (U_IS_BMP(character)) { | 150 if (U_IS_BMP(character)) { |
| 143 *destination++ = static_cast<UChar>(character); | 151 *destination++ = static_cast<UChar>(character); |
| 144 } else { | 152 } else { |
| 145 *destination++ = U16_LEAD(character); | 153 *destination++ = U16_LEAD(character); |
| 146 *destination++ = U16_TRAIL(character); | 154 *destination++ = U16_TRAIL(character); |
| 147 } | 155 } |
| 148 return destination; | 156 return destination; |
| 149 } | 157 } |
| 150 | 158 |
| 151 void TextCodecUTF8::consumePartialSequenceByte() { | 159 void TextCodecUTF8::consumePartialSequenceByte() { |
| (...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 249 if (stopOnError) | 257 if (stopOnError) |
| 250 return false; | 258 return false; |
| 251 continue; | 259 continue; |
| 252 } | 260 } |
| 253 memcpy(m_partialSequence + m_partialSequenceSize, source, | 261 memcpy(m_partialSequence + m_partialSequenceSize, source, |
| 254 count - m_partialSequenceSize); | 262 count - m_partialSequenceSize); |
| 255 source += count - m_partialSequenceSize; | 263 source += count - m_partialSequenceSize; |
| 256 m_partialSequenceSize = count; | 264 m_partialSequenceSize = count; |
| 257 } | 265 } |
| 258 int character = decodeNonASCIISequence(m_partialSequence, count); | 266 int character = decodeNonASCIISequence(m_partialSequence, count); |
| 259 if (character == nonCharacter) { | 267 if (isNonCharacter(character)) { |
| 260 handleError(destination, stopOnError, sawError); | 268 handleError(destination, stopOnError, sawError); |
| 269 count = -character; | |
|
marja
2016/11/16 08:41:38
Looks like count doesn't affect anything
vogelheim
2016/11/16 09:53:16
Removed.
| |
| 261 if (stopOnError) | 270 if (stopOnError) |
| 262 return false; | 271 return false; |
| 263 continue; | 272 continue; |
| 264 } | 273 } |
| 265 | 274 |
| 266 m_partialSequenceSize -= count; | 275 m_partialSequenceSize -= count; |
| 267 destination = appendCharacter(destination, character); | 276 destination = appendCharacter(destination, character); |
| 268 } while (m_partialSequenceSize); | 277 } while (m_partialSequenceSize); |
| 269 | 278 |
| 270 return false; | 279 return false; |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 321 break; | 330 break; |
| 322 if (!isASCII(*source)) | 331 if (!isASCII(*source)) |
| 323 continue; | 332 continue; |
| 324 } | 333 } |
| 325 *destination++ = *source++; | 334 *destination++ = *source++; |
| 326 continue; | 335 continue; |
| 327 } | 336 } |
| 328 int count = nonASCIISequenceLength(*source); | 337 int count = nonASCIISequenceLength(*source); |
| 329 int character; | 338 int character; |
| 330 if (count == 0) { | 339 if (count == 0) { |
| 331 character = nonCharacter; | 340 character = nonCharacter1; |
| 332 } else { | 341 } else { |
| 333 if (count > end - source) { | 342 if (count > end - source) { |
| 334 SECURITY_DCHECK(end - source < | 343 SECURITY_DCHECK(end - source < |
| 335 static_cast<ptrdiff_t>(sizeof(m_partialSequence))); | 344 static_cast<ptrdiff_t>(sizeof(m_partialSequence))); |
| 336 ASSERT(!m_partialSequenceSize); | 345 ASSERT(!m_partialSequenceSize); |
| 337 m_partialSequenceSize = end - source; | 346 m_partialSequenceSize = end - source; |
| 338 memcpy(m_partialSequence, source, m_partialSequenceSize); | 347 memcpy(m_partialSequence, source, m_partialSequenceSize); |
| 339 source = end; | 348 source = end; |
| 340 break; | 349 break; |
| 341 } | 350 } |
| 342 character = decodeNonASCIISequence(source, count); | 351 character = decodeNonASCIISequence(source, count); |
| 343 } | 352 } |
| 344 if (character == nonCharacter) { | 353 if (isNonCharacter(character)) { |
| 354 count = -character; | |
|
marja
2016/11/16 08:41:38
Ditto
vogelheim
2016/11/16 09:53:16
Removed. (All actual processing of non-ASCII chara
| |
| 345 sawError = true; | 355 sawError = true; |
| 346 if (stopOnError) | 356 if (stopOnError) |
| 347 break; | 357 break; |
| 348 | 358 |
| 349 goto upConvertTo16Bit; | 359 goto upConvertTo16Bit; |
| 350 } | 360 } |
| 351 if (character > 0xff) | 361 if (character > 0xff) |
| 352 goto upConvertTo16Bit; | 362 goto upConvertTo16Bit; |
| 353 | 363 |
| 354 source += count; | 364 source += count; |
| (...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 402 break; | 412 break; |
| 403 if (!isASCII(*source)) | 413 if (!isASCII(*source)) |
| 404 continue; | 414 continue; |
| 405 } | 415 } |
| 406 *destination16++ = *source++; | 416 *destination16++ = *source++; |
| 407 continue; | 417 continue; |
| 408 } | 418 } |
| 409 int count = nonASCIISequenceLength(*source); | 419 int count = nonASCIISequenceLength(*source); |
| 410 int character; | 420 int character; |
| 411 if (count == 0) { | 421 if (count == 0) { |
| 412 character = nonCharacter; | 422 character = nonCharacter1; |
| 413 } else { | 423 } else { |
| 414 if (count > end - source) { | 424 if (count > end - source) { |
| 415 SECURITY_DCHECK(end - source < | 425 SECURITY_DCHECK(end - source < |
| 416 static_cast<ptrdiff_t>(sizeof(m_partialSequence))); | 426 static_cast<ptrdiff_t>(sizeof(m_partialSequence))); |
| 417 ASSERT(!m_partialSequenceSize); | 427 ASSERT(!m_partialSequenceSize); |
| 418 m_partialSequenceSize = end - source; | 428 m_partialSequenceSize = end - source; |
| 419 memcpy(m_partialSequence, source, m_partialSequenceSize); | 429 memcpy(m_partialSequence, source, m_partialSequenceSize); |
| 420 source = end; | 430 source = end; |
| 421 break; | 431 break; |
| 422 } | 432 } |
| 423 character = decodeNonASCIISequence(source, count); | 433 character = decodeNonASCIISequence(source, count); |
| 424 } | 434 } |
| 425 if (character == nonCharacter) { | 435 if (isNonCharacter(character)) { |
| 426 sawError = true; | 436 sawError = true; |
| 427 if (stopOnError) | 437 if (stopOnError) |
| 428 break; | 438 break; |
| 429 // Each error generates a replacement character and consumes one byte. | 439 // Each error generates a replacement character and consumes 1-3 bytes (the negated nonCharacter* value). |
| 430 *destination16++ = replacementCharacter; | 440 *destination16++ = replacementCharacter; |
| 431 ++source; | 441 source -= character; |
| 432 continue; | 442 continue; |
| 433 } | 443 } |
| 434 source += count; | 444 source += count; |
| 435 destination16 = appendCharacter(destination16, character); | 445 destination16 = appendCharacter(destination16, character); |
| 436 } | 446 } |
| 437 } while (flush && m_partialSequenceSize); | 447 } while (flush && m_partialSequenceSize); |
| 438 | 448 |
| 439 buffer16.shrink(destination16 - buffer16.characters()); | 449 buffer16.shrink(destination16 - buffer16.characters()); |
| 440 | 450 |
| 441 return String::adopt(buffer16); | 451 return String::adopt(buffer16); |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 474 return encodeCommon(characters, length); | 484 return encodeCommon(characters, length); |
| 475 } | 485 } |
| 476 | 486 |
| 477 CString TextCodecUTF8::encode(const LChar* characters, | 487 CString TextCodecUTF8::encode(const LChar* characters, |
| 478 size_t length, | 488 size_t length, |
| 479 UnencodableHandling) { | 489 UnencodableHandling) { |
| 480 return encodeCommon(characters, length); | 490 return encodeCommon(characters, length); |
| 481 } | 491 } |
| 482 | 492 |
| 483 } // namespace WTF | 493 } // namespace WTF |
| OLD | NEW |