OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2007 Apple Inc. All rights reserved. | 2 * Copyright (C) 2007 Apple Inc. All rights reserved. |
3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> | 3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> |
4 * | 4 * |
5 * Redistribution and use in source and binary forms, with or without | 5 * Redistribution and use in source and binary forms, with or without |
6 * modification, are permitted provided that the following conditions | 6 * modification, are permitted provided that the following conditions |
7 * are met: | 7 * are met: |
8 * 1. Redistributions of source code must retain the above copyright | 8 * 1. Redistributions of source code must retain the above copyright |
9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
10 * 2. Redistributions in binary form must reproduce the above copyright | 10 * 2. Redistributions in binary form must reproduce the above copyright |
(...skipping 167 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
178 target += bytesToWrite; | 178 target += bytesToWrite; |
179 } | 179 } |
180 *sourceStart = source; | 180 *sourceStart = source; |
181 *targetStart = target; | 181 *targetStart = target; |
182 return result; | 182 return result; |
183 } | 183 } |
184 | 184 |
185 // This must be called with the length pre-determined by the first byte. | 185 // This must be called with the length pre-determined by the first byte. |
186 // If presented with a length > 4, this returns false. The Unicode | 186 // If presented with a length > 4, this returns false. The Unicode |
187 // definition of UTF-8 goes up to 4-byte sequences. | 187 // definition of UTF-8 goes up to 4-byte sequences. |
188 static bool isLegalUTF8(const unsigned char* source, int length) | 188 static bool isLegalUTF8(const unsigned char* source, int length, bool strict = true) |
189 { | 189 { |
190 unsigned char a; | 190 unsigned char a; |
191 const unsigned char* srcptr = source + length; | 191 const unsigned char* srcptr = source + length; |
192 switch (length) { | 192 switch (length) { |
193 default: | 193 default: |
194 return false; | 194 return false; |
195 // Everything else falls through when "true"... | 195 // Everything else falls through when "true"... |
196 case 4: | 196 case 4: |
197 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) | 197 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) |
198 return false; | 198 return false; |
199 case 3: | 199 case 3: |
200 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) | 200 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) |
201 return false; | 201 return false; |
202 case 2: | 202 case 2: |
203 if ((a = (*--srcptr)) > 0xBF) | 203 if ((a = (*--srcptr)) > 0xBF) |
204 return false; | 204 return false; |
205 | 205 |
206 // no fall-through in this inner switch | 206 // no fall-through in this inner switch |
207 switch (*source) { | 207 switch (*source) { |
208 case 0xE0: | 208 case 0xE0: |
209 if (a < 0xA0) | 209 if (a < 0xA0) |
210 return false; | 210 return false; |
211 break; | 211 break; |
212 case 0xED: | 212 case 0xED: |
213 if (a > 0x9F) | 213 // Surrogate values are mapped to [EDA080-EDAFBF] and [EDB080-EDBFBF] in lenient mode. |
| 214 if (strict && a > 0x9F) |
214 return false; | 215 return false; |
215 break; | 216 break; |
216 case 0xF0: | 217 case 0xF0: |
217 if (a < 0x90) | 218 if (a < 0x90) |
218 return false; | 219 return false; |
219 break; | 220 break; |
220 case 0xF4: | 221 case 0xF4: |
221 if (a > 0x8F) | 222 if (a > 0x8F) |
222 return false; | 223 return false; |
223 break; | 224 break; |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
276 const char* source = *sourceStart; | 277 const char* source = *sourceStart; |
277 UChar* target = *targetStart; | 278 UChar* target = *targetStart; |
278 UChar orAllData = 0; | 279 UChar orAllData = 0; |
279 while (source < sourceEnd) { | 280 while (source < sourceEnd) { |
280 int utf8SequenceLength = inlineUTF8SequenceLength(*source); | 281 int utf8SequenceLength = inlineUTF8SequenceLength(*source); |
281 if (sourceEnd - source < utf8SequenceLength) { | 282 if (sourceEnd - source < utf8SequenceLength) { |
282 result = sourceExhausted; | 283 result = sourceExhausted; |
283 break; | 284 break; |
284 } | 285 } |
285 // Do this check whether lenient or strict | 286 // Do this check whether lenient or strict |
286 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) { | 287 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength, strict)) { |
287 result = sourceIllegal; | 288 result = sourceIllegal; |
288 break; | 289 break; |
289 } | 290 } |
290 | 291 |
291 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); | 292 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); |
292 | 293 |
293 if (target >= targetEnd) { | 294 if (target >= targetEnd) { |
294 source -= utf8SequenceLength; // Back up source pointer! | 295 source -= utf8SequenceLength; // Back up source pointer! |
295 result = targetExhausted; | 296 result = targetExhausted; |
296 break; | 297 break; |
297 } | 298 } |
298 | 299 |
299 if (U_IS_BMP(character)) { | 300 if (U_IS_BMP(character)) { |
300 // UTF-16 surrogate values are illegal in UTF-32 | 301 // UTF-16 surrogate values are illegal in UTF-32 |
301 if (U_IS_SURROGATE(character)) { | 302 if (U_IS_SURROGATE(character)) { |
302 if (strict) { | 303 if (strict) { |
303 source -= utf8SequenceLength; // return to the illegal value itself | 304 source -= utf8SequenceLength; // return to the illegal value itself |
304 result = sourceIllegal; | 305 result = sourceIllegal; |
305 break; | 306 break; |
306 } | 307 } |
307 *target++ = replacementCharacter; | 308 *target++ = static_cast<UChar>(character); |
308 orAllData |= replacementCharacter; | 309 orAllData |= character; |
309 } else { | 310 } else { |
310 *target++ = static_cast<UChar>(character); // normal case | 311 *target++ = static_cast<UChar>(character); // normal case |
311 orAllData |= character; | 312 orAllData |= character; |
312 } | 313 } |
313 } else if (U_IS_SUPPLEMENTARY(character)) { | 314 } else if (U_IS_SUPPLEMENTARY(character)) { |
314 // target is a character in range 0xFFFF - 0x10FFFF | 315 // target is a character in range 0xFFFF - 0x10FFFF |
315 if (target + 1 >= targetEnd) { | 316 if (target + 1 >= targetEnd) { |
316 source -= utf8SequenceLength; // Back up source pointer! | 317 source -= utf8SequenceLength; // Back up source pointer! |
317 result = targetExhausted; | 318 result = targetExhausted; |
318 break; | 319 break; |
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
437 return equalWithUTF8Internal(a, aEnd, b, bEnd); | 438 return equalWithUTF8Internal(a, aEnd, b, bEnd); |
438 } | 439 } |
439 | 440 |
440 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd) | 441 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd) |
441 { | 442 { |
442 return equalWithUTF8Internal(a, aEnd, b, bEnd); | 443 return equalWithUTF8Internal(a, aEnd, b, bEnd); |
443 } | 444 } |
444 | 445 |
445 } // namespace Unicode | 446 } // namespace Unicode |
446 } // namespace WTF | 447 } // namespace WTF |
OLD | NEW |