Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(408)

Side by Side Diff: Source/WTF/wtf/unicode/UTF8.cpp

Issue 14238015: Move Source/WTF/wtf to Source/wtf (Closed) Base URL: svn://svn.chromium.org/blink/trunk
Patch Set: Created 7 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 /*
2 * Copyright (C) 2007 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "UTF8.h"
29
30 #include "ASCIICType.h"
31 #include <wtf/StringHasher.h>
32 #include <wtf/unicode/CharacterNames.h>
33
34 namespace WTF {
35 namespace Unicode {
36
// Returns the total length in bytes (2, 3 or 4) of the UTF-8 sequence that
// starts with the lead byte |b0|. Returns 0 when |b0| is below 0xC0 (it does
// not have both top bits set) or at/above 0xF8 (it would announce a sequence
// longer than four bytes).
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    // Compare the unsigned byte value so the result does not depend on
    // whether plain char is signed.
    const unsigned lead = static_cast<unsigned char>(b0);
    if (lead < 0xC0)
        return 0;
    if (lead < 0xE0)
        return 2; // 110xxxxx
    if (lead < 0xF0)
        return 3; // 1110xxxx
    if (lead < 0xF8)
        return 4; // 11110xxx
    return 0;
}
49
50 inline int inlineUTF8SequenceLength(char b0)
51 {
52 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
53 }
54
55 int UTF8SequenceLength(char b0)
56 {
57 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
58 }
59
60 int decodeUTF8Sequence(const char* sequence)
61 {
62 // Handle 0-byte sequences (never valid).
63 const unsigned char b0 = sequence[0];
64 const int length = inlineUTF8SequenceLength(b0);
65 if (length == 0)
66 return -1;
67
68 // Handle 1-byte sequences (plain ASCII).
69 const unsigned char b1 = sequence[1];
70 if (length == 1) {
71 if (b1)
72 return -1;
73 return b0;
74 }
75
76 // Handle 2-byte sequences.
77 if ((b1 & 0xC0) != 0x80)
78 return -1;
79 const unsigned char b2 = sequence[2];
80 if (length == 2) {
81 if (b2)
82 return -1;
83 const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
84 if (c < 0x80)
85 return -1;
86 return c;
87 }
88
89 // Handle 3-byte sequences.
90 if ((b2 & 0xC0) != 0x80)
91 return -1;
92 const unsigned char b3 = sequence[3];
93 if (length == 3) {
94 if (b3)
95 return -1;
96 const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
97 if (c < 0x800)
98 return -1;
99 // UTF-16 surrogates should never appear in UTF-8 data.
100 if (c >= 0xD800 && c <= 0xDFFF)
101 return -1;
102 return c;
103 }
104
105 // Handle 4-byte sequences.
106 if ((b3 & 0xC0) != 0x80)
107 return -1;
108 const unsigned char b4 = sequence[4];
109 if (length == 4) {
110 if (b4)
111 return -1;
112 const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
113 if (c < 0x10000 || c > 0x10FFFF)
114 return -1;
115 return c;
116 }
117
118 return -1;
119 }
120
// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte. The table is indexed by the total number of bytes in
// the sequence (index 0 is unused; index 1 is a plain one-byte sequence,
// index 2 a two-byte sequence, and so on). Remember that sequences
// for *legal* UTF-8 will be 4 or fewer bytes total.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
127
128 ConversionResult convertLatin1ToUTF8(
129 const LChar** sourceStart, const LChar* sou rceEnd,
130 char** targetStart, char* targetEnd)
131 {
132 ConversionResult result = conversionOK;
133 const LChar* source = *sourceStart;
134 char* target = *targetStart;
135 while (source < sourceEnd) {
136 UChar32 ch;
137 unsigned short bytesToWrite = 0;
138 const UChar32 byteMask = 0xBF;
139 const UChar32 byteMark = 0x80;
140 const LChar* oldSource = source; // In case we have to back up because o f target overflow.
141 ch = static_cast<unsigned short>(*source++);
142
143 // Figure out how many bytes the result will require
144 if (ch < (UChar32)0x80)
145 bytesToWrite = 1;
146 else
147 bytesToWrite = 2;
148
149 target += bytesToWrite;
150 if (target > targetEnd) {
151 source = oldSource; // Back up source pointer!
152 target -= bytesToWrite;
153 result = targetExhausted;
154 break;
155 }
156 switch (bytesToWrite) { // note: everything falls through.
157 case 2:
158 *--target = (char)((ch | byteMark) & byteMask);
159 ch >>= 6;
160 case 1:
161 *--target = (char)(ch | firstByteMark[bytesToWrite]);
162 }
163 target += bytesToWrite;
164 }
165 *sourceStart = source;
166 *targetStart = target;
167 return result;
168 }
169
170 ConversionResult convertUTF16ToUTF8(
171 const UChar** sourceStart, const UChar* sourceEnd,
172 char** targetStart, char* targetEnd, bool strict)
173 {
174 ConversionResult result = conversionOK;
175 const UChar* source = *sourceStart;
176 char* target = *targetStart;
177 while (source < sourceEnd) {
178 UChar32 ch;
179 unsigned short bytesToWrite = 0;
180 const UChar32 byteMask = 0xBF;
181 const UChar32 byteMark = 0x80;
182 const UChar* oldSource = source; // In case we have to back up because o f target overflow.
183 ch = static_cast<unsigned short>(*source++);
184 // If we have a surrogate pair, convert to UChar32 first.
185 if (ch >= 0xD800 && ch <= 0xDBFF) {
186 // If the 16 bits following the high surrogate are in the source buf fer...
187 if (source < sourceEnd) {
188 UChar32 ch2 = static_cast<unsigned short>(*source);
189 // If it's a low surrogate, convert to UChar32.
190 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
191 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
192 ++source;
193 } else if (strict) { // it's an unpaired high surrogate
194 --source; // return to the illegal value itself
195 result = sourceIllegal;
196 break;
197 }
198 } else { // We don't have the 16 bits following the high surrogate.
199 --source; // return to the high surrogate
200 result = sourceExhausted;
201 break;
202 }
203 } else if (strict) {
204 // UTF-16 surrogate values are illegal in UTF-32
205 if (ch >= 0xDC00 && ch <= 0xDFFF) {
206 --source; // return to the illegal value itself
207 result = sourceIllegal;
208 break;
209 }
210 }
211 // Figure out how many bytes the result will require
212 if (ch < (UChar32)0x80) {
213 bytesToWrite = 1;
214 } else if (ch < (UChar32)0x800) {
215 bytesToWrite = 2;
216 } else if (ch < (UChar32)0x10000) {
217 bytesToWrite = 3;
218 } else if (ch < (UChar32)0x110000) {
219 bytesToWrite = 4;
220 } else {
221 bytesToWrite = 3;
222 ch = replacementCharacter;
223 }
224
225 target += bytesToWrite;
226 if (target > targetEnd) {
227 source = oldSource; // Back up source pointer!
228 target -= bytesToWrite;
229 result = targetExhausted;
230 break;
231 }
232 switch (bytesToWrite) { // note: everything falls through.
233 case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
234 case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
235 case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
236 case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]);
237 }
238 target += bytesToWrite;
239 }
240 *sourceStart = source;
241 *targetStart = target;
242 return result;
243 }
244
// Returns true if the |length| bytes starting at |source| form one
// well-formed UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte.
// If presented with a length > 4 (or < 1), this returns false. The Unicode
// definition of UTF-8 goes up to 4-byte sequences.
//
// The trailing bytes are checked from last to first; the second byte gets
// extra, lead-dependent range checks that reject overlong encodings
// (E0, F0), UTF-16 surrogates (ED) and values above 0x10FFFF (F4).
static bool isLegalUTF8(const unsigned char* source, int length)
{
    unsigned char a;
    const unsigned char* srcptr = source + length;
    switch (length) {
    default:
        return false;
    // Everything else falls through when "true"...
    case 4:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
            return false;
        // fall through
    case 3:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
            return false;
        // fall through
    case 2:
        // The second byte must always be a continuation byte. Checking the
        // lower bound here (not only in the inner default case) keeps leads
        // 0xED and 0xF4 from accepting a second byte below 0x80.
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
            return false;

        switch (*source) {
        // no fall-through in this inner switch
        case 0xE0:
            if (a < 0xA0)
                return false;
            break;
        case 0xED:
            if (a > 0x9F)
                return false;
            break;
        case 0xF0:
            if (a < 0x90)
                return false;
            break;
        case 0xF4:
            if (a > 0x8F)
                return false;
            break;
        default:
            break;
        }
        // fall through
    case 1:
        // 0x80-0xBF are continuation bytes and 0xC0/0xC1 only start overlong
        // encodings; none of them may lead a sequence.
        if (*source >= 0x80 && *source < 0xC2)
            return false;
    }
    if (*source > 0xF4)
        return false;
    return true;
}
274
275 // Magic values subtracted from a buffer value during UTF8 conversion.
276 // This table contains as many values as there might be trailing bytes
277 // in a UTF-8 sequence.
278 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E20 80UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x8 2082080UL) };
279
280 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
281 {
282 UChar32 character = 0;
283
284 // The cases all fall through.
285 switch (length) {
286 case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6;
287 case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6;
288 case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6;
289 case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6;
290 case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6;
291 case 1: character += static_cast<unsigned char>(*sequence++);
292 }
293
294 return character - offsetsFromUTF8[length - 1];
295 }
296
297 ConversionResult convertUTF8ToUTF16(
298 const char** sourceStart, const char* sourceEnd,
299 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
300 {
301 ConversionResult result = conversionOK;
302 const char* source = *sourceStart;
303 UChar* target = *targetStart;
304 UChar orAllData = 0;
305 while (source < sourceEnd) {
306 int utf8SequenceLength = inlineUTF8SequenceLength(*source);
307 if (sourceEnd - source < utf8SequenceLength) {
308 result = sourceExhausted;
309 break;
310 }
311 // Do this check whether lenient or strict
312 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8Seq uenceLength)) {
313 result = sourceIllegal;
314 break;
315 }
316
317 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
318
319 if (target >= targetEnd) {
320 source -= utf8SequenceLength; // Back up source pointer!
321 result = targetExhausted;
322 break;
323 }
324
325 if (U_IS_BMP(character)) {
326 // UTF-16 surrogate values are illegal in UTF-32
327 if (U_IS_SURROGATE(character)) {
328 if (strict) {
329 source -= utf8SequenceLength; // return to the illegal value itself
330 result = sourceIllegal;
331 break;
332 } else {
333 *target++ = replacementCharacter;
334 orAllData |= replacementCharacter;
335 }
336 } else {
337 *target++ = character; // normal case
338 orAllData |= character;
339 }
340 } else if (U_IS_SUPPLEMENTARY(character)) {
341 // target is a character in range 0xFFFF - 0x10FFFF
342 if (target + 1 >= targetEnd) {
343 source -= utf8SequenceLength; // Back up source pointer!
344 result = targetExhausted;
345 break;
346 }
347 *target++ = U16_LEAD(character);
348 *target++ = U16_TRAIL(character);
349 orAllData = 0xffff;
350 } else {
351 if (strict) {
352 source -= utf8SequenceLength; // return to the start
353 result = sourceIllegal;
354 break; // Bail out; shouldn't continue
355 } else {
356 *target++ = replacementCharacter;
357 orAllData |= replacementCharacter;
358 }
359 }
360 }
361 *sourceStart = source;
362 *targetStart = target;
363
364 if (sourceAllASCII)
365 *sourceAllASCII = !(orAllData & ~0x7f);
366
367 return result;
368 }
369
370 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, c onst char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
371 {
372 if (!data)
373 return 0;
374
375 StringHasher stringHasher;
376 dataLength = 0;
377 utf16Length = 0;
378
379 while (data < dataEnd || (!dataEnd && *data)) {
380 if (isASCII(*data)) {
381 stringHasher.addCharacter(*data++);
382 dataLength++;
383 utf16Length++;
384 continue;
385 }
386
387 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
388 dataLength += utf8SequenceLength;
389
390 if (!dataEnd) {
391 for (int i = 1; i < utf8SequenceLength; ++i) {
392 if (!data[i])
393 return 0;
394 }
395 } else if (dataEnd - data < utf8SequenceLength)
396 return 0;
397
398 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8Seque nceLength))
399 return 0;
400
401 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
402 ASSERT(!isASCII(character));
403
404 if (U_IS_BMP(character)) {
405 // UTF-16 surrogate values are illegal in UTF-32
406 if (U_IS_SURROGATE(character))
407 return 0;
408 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
409 utf16Length++;
410 } else if (U_IS_SUPPLEMENTARY(character)) {
411 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
412 static_cast<UChar>(U16_TRAIL(character))) ;
413 utf16Length += 2;
414 } else
415 return 0;
416 }
417
418 return stringHasher.hashWithTop8BitsMasked();
419 }
420
421 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)
422 {
423 while (b < bEnd) {
424 if (isASCII(*b)) {
425 if (*a++ != *b++)
426 return false;
427 continue;
428 }
429
430 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
431
432 if (bEnd - b < utf8SequenceLength)
433 return false;
434
435 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8Sequence Length))
436 return 0;
437
438 UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
439 ASSERT(!isASCII(character));
440
441 if (U_IS_BMP(character)) {
442 // UTF-16 surrogate values are illegal in UTF-32
443 if (U_IS_SURROGATE(character))
444 return false;
445 if (*a++ != character)
446 return false;
447 } else if (U_IS_SUPPLEMENTARY(character)) {
448 if (*a++ != U16_LEAD(character))
449 return false;
450 if (*a++ != U16_TRAIL(character))
451 return false;
452 } else
453 return false;
454 }
455
456 return a == aEnd;
457 }
458
459 } // namespace Unicode
460 } // namespace WTF
OLDNEW
« no previous file with comments | « Source/WTF/wtf/unicode/UTF8.h ('k') | Source/WTF/wtf/unicode/Unicode.h » ('j') | Source/config.h » ('J')

Powered by Google App Engine
This is Rietveld 408576698