| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2007 Apple Inc. All rights reserved. | 2 * Copyright (C) 2007 Apple Inc. All rights reserved. |
| 3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> | 3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> |
| 4 * | 4 * |
| 5 * Redistribution and use in source and binary forms, with or without | 5 * Redistribution and use in source and binary forms, with or without |
| 6 * modification, are permitted provided that the following conditions | 6 * modification, are permitted provided that the following conditions |
| 7 * are met: | 7 * are met: |
| 8 * 1. Redistributions of source code must retain the above copyright | 8 * 1. Redistributions of source code must retain the above copyright |
| 9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
| 10 * 2. Redistributions in binary form must reproduce the above copyright | 10 * 2. Redistributions in binary form must reproduce the above copyright |
| (...skipping 15 matching lines...) Expand all Loading... |
| 26 | 26 |
| 27 #include "wtf/text/UTF8.h" | 27 #include "wtf/text/UTF8.h" |
| 28 | 28 |
| 29 #include "wtf/ASCIICType.h" | 29 #include "wtf/ASCIICType.h" |
| 30 #include "wtf/StringHasher.h" | 30 #include "wtf/StringHasher.h" |
| 31 #include "wtf/text/CharacterNames.h" | 31 #include "wtf/text/CharacterNames.h" |
| 32 | 32 |
| 33 namespace WTF { | 33 namespace WTF { |
| 34 namespace Unicode { | 34 namespace Unicode { |
| 35 | 35 |
| 36 inline int inlineUTF8SequenceLengthNonASCII(char b0) | 36 inline int inlineUTF8SequenceLengthNonASCII(char b0) { |
| 37 { | 37 if ((b0 & 0xC0) != 0xC0) |
| 38 if ((b0 & 0xC0) != 0xC0) | |
| 39 return 0; | |
| 40 if ((b0 & 0xE0) == 0xC0) | |
| 41 return 2; | |
| 42 if ((b0 & 0xF0) == 0xE0) | |
| 43 return 3; | |
| 44 if ((b0 & 0xF8) == 0xF0) | |
| 45 return 4; | |
| 46 return 0; | 38 return 0; |
| 39 if ((b0 & 0xE0) == 0xC0) |
| 40 return 2; |
| 41 if ((b0 & 0xF0) == 0xE0) |
| 42 return 3; |
| 43 if ((b0 & 0xF8) == 0xF0) |
| 44 return 4; |
| 45 return 0; |
| 47 } | 46 } |
| 48 | 47 |
| 49 inline int inlineUTF8SequenceLength(char b0) | 48 inline int inlineUTF8SequenceLength(char b0) { |
| 50 { | 49 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); |
| 51 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); | |
| 52 } | 50 } |
| 53 | 51 |
| 54 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed | 52 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed |
| 55 // into the first byte, depending on how many bytes follow. There are | 53 // into the first byte, depending on how many bytes follow. There are |
| 56 // as many entries in this table as there are UTF-8 sequence types. | 54 // as many entries in this table as there are UTF-8 sequence types. |
| 57 // (I.e., one byte sequence, two byte... etc.). Remember that sequences | 55 // (I.e., one byte sequence, two byte... etc.). Remember that sequences |
| 58 // for *legal* UTF-8 will be 4 or fewer bytes total. | 56 // for *legal* UTF-8 will be 4 or fewer bytes total. |
| 59 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0x
F8, 0xFC }; | 57 static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, |
| 60 | 58 0xF0, 0xF8, 0xFC}; |
| 61 ConversionResult convertLatin1ToUTF8( | 59 |
| 62 const LChar** sourceStart, const LChar* sourceEnd, | 60 ConversionResult convertLatin1ToUTF8(const LChar** sourceStart, |
| 63 char** targetStart, char* targetEnd) | 61 const LChar* sourceEnd, |
| 64 { | 62 char** targetStart, |
| 65 ConversionResult result = conversionOK; | 63 char* targetEnd) { |
| 66 const LChar* source = *sourceStart; | 64 ConversionResult result = conversionOK; |
| 67 char* target = *targetStart; | 65 const LChar* source = *sourceStart; |
| 68 while (source < sourceEnd) { | 66 char* target = *targetStart; |
| 69 UChar32 ch; | 67 while (source < sourceEnd) { |
| 70 unsigned short bytesToWrite = 0; | 68 UChar32 ch; |
| 71 const UChar32 byteMask = 0xBF; | 69 unsigned short bytesToWrite = 0; |
| 72 const UChar32 byteMark = 0x80; | 70 const UChar32 byteMask = 0xBF; |
| 73 const LChar* oldSource = source; // In case we have to back up because o
f target overflow. | 71 const UChar32 byteMark = 0x80; |
| 74 ch = static_cast<unsigned short>(*source++); | 72 const LChar* oldSource = |
| 75 | 73 source; // In case we have to back up because of target overflow. |
| 76 // Figure out how many bytes the result will require | 74 ch = static_cast<unsigned short>(*source++); |
| 77 if (ch < (UChar32)0x80) | 75 |
| 78 bytesToWrite = 1; | 76 // Figure out how many bytes the result will require |
| 79 else | 77 if (ch < (UChar32)0x80) |
| 80 bytesToWrite = 2; | 78 bytesToWrite = 1; |
| 81 | 79 else |
| 82 target += bytesToWrite; | 80 bytesToWrite = 2; |
| 83 if (target > targetEnd) { | 81 |
| 84 source = oldSource; // Back up source pointer! | 82 target += bytesToWrite; |
| 85 target -= bytesToWrite; | 83 if (target > targetEnd) { |
| 86 result = targetExhausted; | 84 source = oldSource; // Back up source pointer! |
| 87 break; | 85 target -= bytesToWrite; |
| 86 result = targetExhausted; |
| 87 break; |
| 88 } |
| 89 switch (bytesToWrite) { // note: everything falls through. |
| 90 case 2: |
| 91 *--target = (char)((ch | byteMark) & byteMask); |
| 92 ch >>= 6; |
| 93 case 1: |
| 94 *--target = (char)(ch | firstByteMark[bytesToWrite]); |
| 95 } |
| 96 target += bytesToWrite; |
| 97 } |
| 98 *sourceStart = source; |
| 99 *targetStart = target; |
| 100 return result; |
| 101 } |
| 102 |
| 103 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart, |
| 104 const UChar* sourceEnd, |
| 105 char** targetStart, |
| 106 char* targetEnd, |
| 107 bool strict) { |
| 108 ConversionResult result = conversionOK; |
| 109 const UChar* source = *sourceStart; |
| 110 char* target = *targetStart; |
| 111 while (source < sourceEnd) { |
| 112 UChar32 ch; |
| 113 unsigned short bytesToWrite = 0; |
| 114 const UChar32 byteMask = 0xBF; |
| 115 const UChar32 byteMark = 0x80; |
| 116 const UChar* oldSource = |
| 117 source; // In case we have to back up because of target overflow. |
| 118 ch = static_cast<unsigned short>(*source++); |
| 119 // If we have a surrogate pair, convert to UChar32 first. |
| 120 if (ch >= 0xD800 && ch <= 0xDBFF) { |
| 121 // If the 16 bits following the high surrogate are in the source buffer... |
| 122 if (source < sourceEnd) { |
| 123 UChar32 ch2 = static_cast<unsigned short>(*source); |
| 124 // If it's a low surrogate, convert to UChar32. |
| 125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { |
| 126 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; |
| 127 ++source; |
| 128 } else if (strict) { // it's an unpaired high surrogate |
| 129 --source; // return to the illegal value itself |
| 130 result = sourceIllegal; |
| 131 break; |
| 88 } | 132 } |
| 89 switch (bytesToWrite) { // note: everything falls through. | 133 } else { // We don't have the 16 bits following the high surrogate. |
| 90 case 2: | 134 --source; // return to the high surrogate |
| 91 *--target = (char)((ch | byteMark) & byteMask); | 135 result = sourceExhausted; |
| 92 ch >>= 6; | 136 break; |
| 93 case 1: | 137 } |
| 94 *--target = (char)(ch | firstByteMark[bytesToWrite]); | 138 } else if (strict) { |
| 95 } | 139 // UTF-16 surrogate values are illegal in UTF-32 |
| 96 target += bytesToWrite; | 140 if (ch >= 0xDC00 && ch <= 0xDFFF) { |
| 97 } | 141 --source; // return to the illegal value itself |
| 98 *sourceStart = source; | 142 result = sourceIllegal; |
| 99 *targetStart = target; | 143 break; |
| 100 return result; | 144 } |
| 101 } | 145 } |
| 102 | 146 // Figure out how many bytes the result will require |
| 103 ConversionResult convertUTF16ToUTF8( | 147 if (ch < (UChar32)0x80) { |
| 104 const UChar** sourceStart, const UChar* sourceEnd, | 148 bytesToWrite = 1; |
| 105 char** targetStart, char* targetEnd, bool strict) | 149 } else if (ch < (UChar32)0x800) { |
| 106 { | 150 bytesToWrite = 2; |
| 107 ConversionResult result = conversionOK; | 151 } else if (ch < (UChar32)0x10000) { |
| 108 const UChar* source = *sourceStart; | 152 bytesToWrite = 3; |
| 109 char* target = *targetStart; | 153 } else if (ch < (UChar32)0x110000) { |
| 110 while (source < sourceEnd) { | 154 bytesToWrite = 4; |
| 111 UChar32 ch; | 155 } else { |
| 112 unsigned short bytesToWrite = 0; | 156 bytesToWrite = 3; |
| 113 const UChar32 byteMask = 0xBF; | 157 ch = replacementCharacter; |
| 114 const UChar32 byteMark = 0x80; | 158 } |
| 115 const UChar* oldSource = source; // In case we have to back up because o
f target overflow. | 159 |
| 116 ch = static_cast<unsigned short>(*source++); | 160 target += bytesToWrite; |
| 117 // If we have a surrogate pair, convert to UChar32 first. | 161 if (target > targetEnd) { |
| 118 if (ch >= 0xD800 && ch <= 0xDBFF) { | 162 source = oldSource; // Back up source pointer! |
| 119 // If the 16 bits following the high surrogate are in the source buf
fer... | 163 target -= bytesToWrite; |
| 120 if (source < sourceEnd) { | 164 result = targetExhausted; |
| 121 UChar32 ch2 = static_cast<unsigned short>(*source); | 165 break; |
| 122 // If it's a low surrogate, convert to UChar32. | 166 } |
| 123 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | 167 switch (bytesToWrite) { // note: everything falls through. |
| 124 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; | 168 case 4: |
| 125 ++source; | 169 *--target = (char)((ch | byteMark) & byteMask); |
| 126 } else if (strict) { // it's an unpaired high surrogate | 170 ch >>= 6; |
| 127 --source; // return to the illegal value itself | 171 case 3: |
| 128 result = sourceIllegal; | 172 *--target = (char)((ch | byteMark) & byteMask); |
| 129 break; | 173 ch >>= 6; |
| 130 } | 174 case 2: |
| 131 } else { // We don't have the 16 bits following the high surrogate. | 175 *--target = (char)((ch | byteMark) & byteMask); |
| 132 --source; // return to the high surrogate | 176 ch >>= 6; |
| 133 result = sourceExhausted; | 177 case 1: |
| 134 break; | 178 *--target = (char)(ch | firstByteMark[bytesToWrite]); |
| 135 } | 179 } |
| 136 } else if (strict) { | 180 target += bytesToWrite; |
| 137 // UTF-16 surrogate values are illegal in UTF-32 | 181 } |
| 138 if (ch >= 0xDC00 && ch <= 0xDFFF) { | 182 *sourceStart = source; |
| 139 --source; // return to the illegal value itself | 183 *targetStart = target; |
| 140 result = sourceIllegal; | 184 return result; |
| 141 break; | |
| 142 } | |
| 143 } | |
| 144 // Figure out how many bytes the result will require | |
| 145 if (ch < (UChar32)0x80) { | |
| 146 bytesToWrite = 1; | |
| 147 } else if (ch < (UChar32)0x800) { | |
| 148 bytesToWrite = 2; | |
| 149 } else if (ch < (UChar32)0x10000) { | |
| 150 bytesToWrite = 3; | |
| 151 } else if (ch < (UChar32)0x110000) { | |
| 152 bytesToWrite = 4; | |
| 153 } else { | |
| 154 bytesToWrite = 3; | |
| 155 ch = replacementCharacter; | |
| 156 } | |
| 157 | |
| 158 target += bytesToWrite; | |
| 159 if (target > targetEnd) { | |
| 160 source = oldSource; // Back up source pointer! | |
| 161 target -= bytesToWrite; | |
| 162 result = targetExhausted; | |
| 163 break; | |
| 164 } | |
| 165 switch (bytesToWrite) { // note: everything falls through. | |
| 166 case 4: | |
| 167 *--target = (char)((ch | byteMark) & byteMask); | |
| 168 ch >>= 6; | |
| 169 case 3: | |
| 170 *--target = (char)((ch | byteMark) & byteMask); | |
| 171 ch >>= 6; | |
| 172 case 2: | |
| 173 *--target = (char)((ch | byteMark) & byteMask); | |
| 174 ch >>= 6; | |
| 175 case 1: | |
| 176 *--target = (char)(ch | firstByteMark[bytesToWrite]); | |
| 177 } | |
| 178 target += bytesToWrite; | |
| 179 } | |
| 180 *sourceStart = source; | |
| 181 *targetStart = target; | |
| 182 return result; | |
| 183 } | 185 } |
| 184 | 186 |
| 185 // This must be called with the length pre-determined by the first byte. | 187 // This must be called with the length pre-determined by the first byte. |
| 186 // If presented with a length > 4, this returns false. The Unicode | 188 // If presented with a length > 4, this returns false. The Unicode |
| 187 // definition of UTF-8 goes up to 4-byte sequences. | 189 // definition of UTF-8 goes up to 4-byte sequences. |
| 188 static bool isLegalUTF8(const unsigned char* source, int length) | 190 static bool isLegalUTF8(const unsigned char* source, int length) { |
| 189 { | 191 unsigned char a; |
| 190 unsigned char a; | 192 const unsigned char* srcptr = source + length; |
| 191 const unsigned char* srcptr = source + length; | 193 switch (length) { |
| 192 switch (length) { | |
| 193 default: | 194 default: |
| 194 return false; | 195 return false; |
| 195 // Everything else falls through when "true"... | 196 // Everything else falls through when "true"... |
| 196 case 4: | 197 case 4: |
| 197 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) | 198 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) |
| 199 return false; |
| 200 case 3: |
| 201 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) |
| 202 return false; |
| 203 case 2: |
| 204 if ((a = (*--srcptr)) > 0xBF) |
| 205 return false; |
| 206 |
| 207 // no fall-through in this inner switch |
| 208 switch (*source) { |
| 209 case 0xE0: |
| 210 if (a < 0xA0) |
| 198 return false; | 211 return false; |
| 199 case 3: | 212 break; |
| 200 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) | 213 case 0xED: |
| 214 if (a > 0x9F) |
| 201 return false; | 215 return false; |
| 202 case 2: | 216 break; |
| 203 if ((a = (*--srcptr)) > 0xBF) | 217 case 0xF0: |
| 218 if (a < 0x90) |
| 204 return false; | 219 return false; |
| 205 | 220 break; |
| 206 // no fall-through in this inner switch | |
| 207 switch (*source) { | |
| 208 case 0xE0: | |
| 209 if (a < 0xA0) | |
| 210 return false; | |
| 211 break; | |
| 212 case 0xED: | |
| 213 if (a > 0x9F) | |
| 214 return false; | |
| 215 break; | |
| 216 case 0xF0: | |
| 217 if (a < 0x90) | |
| 218 return false; | |
| 219 break; | |
| 220 case 0xF4: | 221 case 0xF4: |
| 221 if (a > 0x8F) | 222 if (a > 0x8F) |
| 222 return false; | 223 return false; |
| 223 break; | 224 break; |
| 224 default: | 225 default: |
| 225 if (a < 0x80) | 226 if (a < 0x80) |
| 226 return false; | 227 return false; |
| 227 } | 228 } |
| 228 | 229 |
| 229 case 1: | 230 case 1: |
| 230 if (*source >= 0x80 && *source < 0xC2) | 231 if (*source >= 0x80 && *source < 0xC2) |
| 231 return false; | 232 return false; |
| 232 } | 233 } |
| 233 if (*source > 0xF4) | 234 if (*source > 0xF4) |
| 234 return false; | 235 return false; |
| 235 return true; | 236 return true; |
| 236 } | 237 } |
| 237 | 238 |
| 238 // Magic values subtracted from a buffer value during UTF8 conversion. | 239 // Magic values subtracted from a buffer value during UTF8 conversion. |
| 239 // This table contains as many values as there might be trailing bytes | 240 // This table contains as many values as there might be trailing bytes |
| 240 // in a UTF-8 sequence. | 241 // in a UTF-8 sequence. |
| 241 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E20
80UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x8
2082080UL) }; | 242 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL, |
| 242 | 243 0x00003080UL, |
| 243 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) | 244 0x000E2080UL, |
| 244 { | 245 0x03C82080UL, |
| 245 UChar32 character = 0; | 246 static_cast<UChar32>(0xFA082080UL), |
| 246 | 247 static_cast<UChar32>(0x82082080UL)}; |
| 247 // The cases all fall through. | 248 |
| 248 switch (length) { | 249 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) { |
| 250 UChar32 character = 0; |
| 251 |
| 252 // The cases all fall through. |
| 253 switch (length) { |
| 249 case 6: | 254 case 6: |
| 250 character += static_cast<unsigned char>(*sequence++); | 255 character += static_cast<unsigned char>(*sequence++); |
| 251 character <<= 6; | 256 character <<= 6; |
| 252 case 5: | 257 case 5: |
| 253 character += static_cast<unsigned char>(*sequence++); | 258 character += static_cast<unsigned char>(*sequence++); |
| 254 character <<= 6; | 259 character <<= 6; |
| 255 case 4: | 260 case 4: |
| 256 character += static_cast<unsigned char>(*sequence++); | 261 character += static_cast<unsigned char>(*sequence++); |
| 257 character <<= 6; | 262 character <<= 6; |
| 258 case 3: | 263 case 3: |
| 259 character += static_cast<unsigned char>(*sequence++); | 264 character += static_cast<unsigned char>(*sequence++); |
| 260 character <<= 6; | 265 character <<= 6; |
| 261 case 2: | 266 case 2: |
| 262 character += static_cast<unsigned char>(*sequence++); | 267 character += static_cast<unsigned char>(*sequence++); |
| 263 character <<= 6; | 268 character <<= 6; |
| 264 case 1: | 269 case 1: |
| 265 character += static_cast<unsigned char>(*sequence++); | 270 character += static_cast<unsigned char>(*sequence++); |
| 266 } | 271 } |
| 267 | 272 |
| 268 return character - offsetsFromUTF8[length - 1]; | 273 return character - offsetsFromUTF8[length - 1]; |
| 269 } | 274 } |
| 270 | 275 |
| 271 ConversionResult convertUTF8ToUTF16( | 276 ConversionResult convertUTF8ToUTF16(const char** sourceStart, |
| 272 const char** sourceStart, const char* sourceEnd, | 277 const char* sourceEnd, |
| 273 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict) | 278 UChar** targetStart, |
| 274 { | 279 UChar* targetEnd, |
| 275 ConversionResult result = conversionOK; | 280 bool* sourceAllASCII, |
| 276 const char* source = *sourceStart; | 281 bool strict) { |
| 277 UChar* target = *targetStart; | 282 ConversionResult result = conversionOK; |
| 278 UChar orAllData = 0; | 283 const char* source = *sourceStart; |
| 279 while (source < sourceEnd) { | 284 UChar* target = *targetStart; |
| 280 int utf8SequenceLength = inlineUTF8SequenceLength(*source); | 285 UChar orAllData = 0; |
| 281 if (sourceEnd - source < utf8SequenceLength) { | 286 while (source < sourceEnd) { |
| 282 result = sourceExhausted; | 287 int utf8SequenceLength = inlineUTF8SequenceLength(*source); |
| 283 break; | 288 if (sourceEnd - source < utf8SequenceLength) { |
| 289 result = sourceExhausted; |
| 290 break; |
| 291 } |
| 292 // Do this check whether lenient or strict |
| 293 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), |
| 294 utf8SequenceLength)) { |
| 295 result = sourceIllegal; |
| 296 break; |
| 297 } |
| 298 |
| 299 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); |
| 300 |
| 301 if (target >= targetEnd) { |
| 302 source -= utf8SequenceLength; // Back up source pointer! |
| 303 result = targetExhausted; |
| 304 break; |
| 305 } |
| 306 |
| 307 if (U_IS_BMP(character)) { |
| 308 // UTF-16 surrogate values are illegal in UTF-32 |
| 309 if (U_IS_SURROGATE(character)) { |
| 310 if (strict) { |
| 311 source -= utf8SequenceLength; // return to the illegal value itself |
| 312 result = sourceIllegal; |
| 313 break; |
| 284 } | 314 } |
| 285 // Do this check whether lenient or strict | 315 *target++ = replacementCharacter; |
| 286 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8Seq
uenceLength)) { | 316 orAllData |= replacementCharacter; |
| 287 result = sourceIllegal; | 317 } else { |
| 288 break; | 318 *target++ = static_cast<UChar>(character); // normal case |
| 289 } | 319 orAllData |= character; |
| 290 | 320 } |
| 291 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); | 321 } else if (U_IS_SUPPLEMENTARY(character)) { |
| 292 | 322 // target is a character in range 0xFFFF - 0x10FFFF |
| 293 if (target >= targetEnd) { | 323 if (target + 1 >= targetEnd) { |
| 294 source -= utf8SequenceLength; // Back up source pointer! | 324 source -= utf8SequenceLength; // Back up source pointer! |
| 295 result = targetExhausted; | 325 result = targetExhausted; |
| 296 break; | 326 break; |
| 297 } | 327 } |
| 298 | 328 *target++ = U16_LEAD(character); |
| 299 if (U_IS_BMP(character)) { | 329 *target++ = U16_TRAIL(character); |
| 300 // UTF-16 surrogate values are illegal in UTF-32 | 330 orAllData = 0xffff; |
| 301 if (U_IS_SURROGATE(character)) { | 331 } else { |
| 302 if (strict) { | 332 if (strict) { |
| 303 source -= utf8SequenceLength; // return to the illegal value
itself | 333 source -= utf8SequenceLength; // return to the start |
| 304 result = sourceIllegal; | 334 result = sourceIllegal; |
| 305 break; | 335 break; // Bail out; shouldn't continue |
| 306 } | 336 } else { |
| 307 *target++ = replacementCharacter; | 337 *target++ = replacementCharacter; |
| 308 orAllData |= replacementCharacter; | 338 orAllData |= replacementCharacter; |
| 309 } else { | 339 } |
| 310 *target++ = static_cast<UChar>(character); // normal case | 340 } |
| 311 orAllData |= character; | 341 } |
| 312 } | 342 *sourceStart = source; |
| 313 } else if (U_IS_SUPPLEMENTARY(character)) { | 343 *targetStart = target; |
| 314 // target is a character in range 0xFFFF - 0x10FFFF | 344 |
| 315 if (target + 1 >= targetEnd) { | 345 if (sourceAllASCII) |
| 316 source -= utf8SequenceLength; // Back up source pointer! | 346 *sourceAllASCII = !(orAllData & ~0x7f); |
| 317 result = targetExhausted; | 347 |
| 318 break; | 348 return result; |
| 319 } | 349 } |
| 320 *target++ = U16_LEAD(character); | 350 |
| 321 *target++ = U16_TRAIL(character); | 351 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits( |
| 322 orAllData = 0xffff; | 352 const char* data, |
| 323 } else { | 353 const char* dataEnd, |
| 324 if (strict) { | 354 unsigned& dataLength, |
| 325 source -= utf8SequenceLength; // return to the start | 355 unsigned& utf16Length) { |
| 326 result = sourceIllegal; | 356 if (!data) |
| 327 break; // Bail out; shouldn't continue | 357 return 0; |
| 328 } else { | 358 |
| 329 *target++ = replacementCharacter; | 359 StringHasher stringHasher; |
| 330 orAllData |= replacementCharacter; | 360 dataLength = 0; |
| 331 } | 361 utf16Length = 0; |
| 332 } | 362 |
| 333 } | 363 while (data < dataEnd || (!dataEnd && *data)) { |
| 334 *sourceStart = source; | 364 if (isASCII(*data)) { |
| 335 *targetStart = target; | 365 stringHasher.addCharacter(*data++); |
| 336 | 366 dataLength++; |
| 337 if (sourceAllASCII) | 367 utf16Length++; |
| 338 *sourceAllASCII = !(orAllData & ~0x7f); | 368 continue; |
| 339 | 369 } |
| 340 return result; | 370 |
| 341 } | 371 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); |
| 342 | 372 dataLength += utf8SequenceLength; |
| 343 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, c
onst char* dataEnd, unsigned& dataLength, unsigned& utf16Length) | 373 |
| 344 { | 374 if (!dataEnd) { |
| 345 if (!data) | 375 for (int i = 1; i < utf8SequenceLength; ++i) { |
| 376 if (!data[i]) |
| 377 return 0; |
| 378 } |
| 379 } else if (dataEnd - data < utf8SequenceLength) { |
| 380 return 0; |
| 381 } |
| 382 |
| 383 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), |
| 384 utf8SequenceLength)) |
| 385 return 0; |
| 386 |
| 387 UChar32 character = readUTF8Sequence(data, utf8SequenceLength); |
| 388 ASSERT(!isASCII(character)); |
| 389 |
| 390 if (U_IS_BMP(character)) { |
| 391 // UTF-16 surrogate values are illegal in UTF-32 |
| 392 if (U_IS_SURROGATE(character)) |
| 346 return 0; | 393 return 0; |
| 347 | 394 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case |
| 348 StringHasher stringHasher; | 395 utf16Length++; |
| 349 dataLength = 0; | 396 } else if (U_IS_SUPPLEMENTARY(character)) { |
| 350 utf16Length = 0; | 397 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), |
| 351 | 398 static_cast<UChar>(U16_TRAIL(character))); |
| 352 while (data < dataEnd || (!dataEnd && *data)) { | 399 utf16Length += 2; |
| 353 if (isASCII(*data)) { | 400 } else { |
| 354 stringHasher.addCharacter(*data++); | 401 return 0; |
| 355 dataLength++; | 402 } |
| 356 utf16Length++; | 403 } |
| 357 continue; | 404 |
| 358 } | 405 return stringHasher.hashWithTop8BitsMasked(); |
| 359 | 406 } |
| 360 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); | 407 |
| 361 dataLength += utf8SequenceLength; | 408 template <typename CharType> |
| 362 | 409 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, |
| 363 if (!dataEnd) { | 410 const CharType* aEnd, |
| 364 for (int i = 1; i < utf8SequenceLength; ++i) { | 411 const char* b, |
| 365 if (!data[i]) | 412 const char* bEnd) { |
| 366 return 0; | 413 while (b < bEnd) { |
| 367 } | 414 if (isASCII(*b)) { |
| 368 } else if (dataEnd - data < utf8SequenceLength) { | 415 if (*a++ != *b++) |
| 369 return 0; | 416 return false; |
| 370 } | 417 continue; |
| 371 | 418 } |
| 372 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8Seque
nceLength)) | 419 |
| 373 return 0; | 420 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b); |
| 374 | 421 |
| 375 UChar32 character = readUTF8Sequence(data, utf8SequenceLength); | 422 if (bEnd - b < utf8SequenceLength) |
| 376 ASSERT(!isASCII(character)); | 423 return false; |
| 377 | 424 |
| 378 if (U_IS_BMP(character)) { | 425 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), |
| 379 // UTF-16 surrogate values are illegal in UTF-32 | 426 utf8SequenceLength)) |
| 380 if (U_IS_SURROGATE(character)) | 427 return 0; |
| 381 return 0; | 428 |
| 382 stringHasher.addCharacter(static_cast<UChar>(character)); // normal
case | 429 UChar32 character = readUTF8Sequence(b, utf8SequenceLength); |
| 383 utf16Length++; | 430 ASSERT(!isASCII(character)); |
| 384 } else if (U_IS_SUPPLEMENTARY(character)) { | 431 |
| 385 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
static_cast<UChar>(U16_TRAIL(character))); | 432 if (U_IS_BMP(character)) { |
| 386 utf16Length += 2; | 433 // UTF-16 surrogate values are illegal in UTF-32 |
| 387 } else { | 434 if (U_IS_SURROGATE(character)) |
| 388 return 0; | 435 return false; |
| 389 } | 436 if (*a++ != character) |
| 390 } | 437 return false; |
| 391 | 438 } else if (U_IS_SUPPLEMENTARY(character)) { |
| 392 return stringHasher.hashWithTop8BitsMasked(); | 439 if (*a++ != U16_LEAD(character)) |
| 393 } | 440 return false; |
| 394 | 441 if (*a++ != U16_TRAIL(character)) |
| 395 template<typename CharType> | 442 return false; |
| 396 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd
, const char* b, const char* bEnd) | 443 } else { |
| 397 { | 444 return false; |
| 398 while (b < bEnd) { | 445 } |
| 399 if (isASCII(*b)) { | 446 } |
| 400 if (*a++ != *b++) | 447 |
| 401 return false; | 448 return a == aEnd; |
| 402 continue; | 449 } |
| 403 } | 450 |
| 404 | 451 bool equalUTF16WithUTF8(const UChar* a, |
| 405 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b); | 452 const UChar* aEnd, |
| 406 | 453 const char* b, |
| 407 if (bEnd - b < utf8SequenceLength) | 454 const char* bEnd) { |
| 408 return false; | 455 return equalWithUTF8Internal(a, aEnd, b, bEnd); |
| 409 | 456 } |
| 410 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8Sequence
Length)) | 457 |
| 411 return 0; | 458 bool equalLatin1WithUTF8(const LChar* a, |
| 412 | 459 const LChar* aEnd, |
| 413 UChar32 character = readUTF8Sequence(b, utf8SequenceLength); | 460 const char* b, |
| 414 ASSERT(!isASCII(character)); | 461 const char* bEnd) { |
| 415 | 462 return equalWithUTF8Internal(a, aEnd, b, bEnd); |
| 416 if (U_IS_BMP(character)) { | 463 } |
| 417 // UTF-16 surrogate values are illegal in UTF-32 | 464 |
| 418 if (U_IS_SURROGATE(character)) | 465 } // namespace Unicode |
| 419 return false; | 466 } // namespace WTF |
| 420 if (*a++ != character) | |
| 421 return false; | |
| 422 } else if (U_IS_SUPPLEMENTARY(character)) { | |
| 423 if (*a++ != U16_LEAD(character)) | |
| 424 return false; | |
| 425 if (*a++ != U16_TRAIL(character)) | |
| 426 return false; | |
| 427 } else { | |
| 428 return false; | |
| 429 } | |
| 430 } | |
| 431 | |
| 432 return a == aEnd; | |
| 433 } | |
| 434 | |
| 435 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const
char* bEnd) | |
| 436 { | |
| 437 return equalWithUTF8Internal(a, aEnd, b, bEnd); | |
| 438 } | |
| 439 | |
| 440 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const
char* bEnd) | |
| 441 { | |
| 442 return equalWithUTF8Internal(a, aEnd, b, bEnd); | |
| 443 } | |
| 444 | |
| 445 } // namespace Unicode | |
| 446 } // namespace WTF | |
| OLD | NEW |