Source/WTF/wtf/unicode/UTF8.cpp - Issue 14238015: Move Source/WTF/wtf to Source/wtf

Side by Side Diff: Source/WTF/wtf/unicode/UTF8.cpp

Issue 14238015: Move Source/WTF/wtf to Source/wtf (Closed) Base URL: svn://svn.chromium.org/blink/trunk

Patch Set: Created 7 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 /*

2 * Copyright (C) 2007 Apple Inc. All rights reserved.

3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>

4 *

5 * Redistribution and use in source and binary forms, with or without

6 * modification, are permitted provided that the following conditions

7 * are met:

8 * 1. Redistributions of source code must retain the above copyright

9 * notice, this list of conditions and the following disclaimer.

10 * 2. Redistributions in binary form must reproduce the above copyright

11 * notice, this list of conditions and the following disclaimer in the

12 * documentation and/or other materials provided with the distribution.

13 *

14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY

15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR

17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR

18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY

22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

25 */

26

27 #include "config.h"

28 #include "UTF8.h"

29

30 #include "ASCIICType.h"

31 #include <wtf/StringHasher.h>

32 #include <wtf/unicode/CharacterNames.h>

33

34 namespace WTF {

35 namespace Unicode {

36

// Classifies a lead byte by its high bits and returns the total length of the
// UTF-8 sequence it introduces: 2 for 110xxxxx, 3 for 1110xxxx, 4 for
// 11110xxx. Every other bit pattern (including continuation bytes 10xxxxxx
// and bytes with the top bit clear) yields 0.
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    const unsigned lead = static_cast<unsigned char>(b0);

    // Both top bits must be set for a multi-byte lead byte.
    if ((lead >> 6) != 0x3)
        return 0;
    if ((lead >> 5) == 0x6) // 110xxxxx
        return 2;
    if ((lead >> 4) == 0xE) // 1110xxxx
        return 3;
    if ((lead >> 3) == 0x1E) // 11110xxx
        return 4;
    return 0;
}

49

50 inline int inlineUTF8SequenceLength(char b0)

51 {

52 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);

53 }

54

55 int UTF8SequenceLength(char b0)

56 {

57 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);

58 }

59

60 int decodeUTF8Sequence(const char* sequence)

61 {

62 // Handle 0-byte sequences (never valid).

63 const unsigned char b0 = sequence[0];

64 const int length = inlineUTF8SequenceLength(b0);

65 if (length == 0)

66 return -1;

67

68 // Handle 1-byte sequences (plain ASCII).

69 const unsigned char b1 = sequence[1];

70 if (length == 1) {

71 if (b1)

72 return -1;

73 return b0;

74 }

75

76 // Handle 2-byte sequences.

77 if ((b1 & 0xC0) != 0x80)

78 return -1;

79 const unsigned char b2 = sequence[2];

80 if (length == 2) {

81 if (b2)

82 return -1;

83 const int c = ((b0 & 0x1F) << 6) \| (b1 & 0x3F);

84 if (c < 0x80)

85 return -1;

86 return c;

87 }

88

89 // Handle 3-byte sequences.

90 if ((b2 & 0xC0) != 0x80)

91 return -1;

92 const unsigned char b3 = sequence[3];

93 if (length == 3) {

94 if (b3)

95 return -1;

96 const int c = ((b0 & 0xF) << 12) \| ((b1 & 0x3F) << 6) \| (b2 & 0x3F);

97 if (c < 0x800)

98 return -1;

99 // UTF-16 surrogates should never appear in UTF-8 data.

100 if (c >= 0xD800 && c <= 0xDFFF)

101 return -1;

102 return c;

103 }

104

105 // Handle 4-byte sequences.

106 if ((b3 & 0xC0) != 0x80)

107 return -1;

108 const unsigned char b4 = sequence[4];

109 if (length == 4) {

110 if (b4)

111 return -1;

112 const int c = ((b0 & 0x7) << 18) \| ((b1 & 0x3F) << 12) \| ((b2 & 0x3F) << 6) \| (b3 & 0x3F);

113 if (c < 0x10000 \|\| c > 0x10FFFF)

114 return -1;

115 return c;

116 }

117

118 return -1;

119 }

120

// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes follow. The table is
// indexed by the total number of bytes in the sequence (index 0 is unused),
// so there is one entry per UTF-8 sequence type (one-byte sequence,
// two-byte... etc.). Remember that sequences for legal UTF-8 will be
// 4 or fewer bytes total.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

127

128 ConversionResult convertLatin1ToUTF8(

129 const LChar** sourceStart, const LChar* sou rceEnd,

130 char** targetStart, char* targetEnd)

131 {

132 ConversionResult result = conversionOK;

133 const LChar* source = *sourceStart;

134 char* target = *targetStart;

135 while (source < sourceEnd) {

136 UChar32 ch;

137 unsigned short bytesToWrite = 0;

138 const UChar32 byteMask = 0xBF;

139 const UChar32 byteMark = 0x80;

140 const LChar* oldSource = source; // In case we have to back up because o f target overflow.

141 ch = static_cast<unsigned short>(*source++);

142

143 // Figure out how many bytes the result will require

144 if (ch < (UChar32)0x80)

145 bytesToWrite = 1;

146 else

147 bytesToWrite = 2;

148

149 target += bytesToWrite;

150 if (target > targetEnd) {

151 source = oldSource; // Back up source pointer!

152 target -= bytesToWrite;

153 result = targetExhausted;

154 break;

155 }

156 switch (bytesToWrite) { // note: everything falls through.

157 case 2:

158 *--target = (char)((ch \| byteMark) & byteMask);

159 ch >>= 6;

160 case 1:

161 *--target = (char)(ch \| firstByteMark[bytesToWrite]);

162 }

163 target += bytesToWrite;

164 }

165 *sourceStart = source;

166 *targetStart = target;

167 return result;

168 }

169

170 ConversionResult convertUTF16ToUTF8(

171 const UChar** sourceStart, const UChar* sourceEnd,

172 char** targetStart, char* targetEnd, bool strict)

173 {

174 ConversionResult result = conversionOK;

175 const UChar* source = *sourceStart;

176 char* target = *targetStart;

177 while (source < sourceEnd) {

178 UChar32 ch;

179 unsigned short bytesToWrite = 0;

180 const UChar32 byteMask = 0xBF;

181 const UChar32 byteMark = 0x80;

182 const UChar* oldSource = source; // In case we have to back up because o f target overflow.

183 ch = static_cast<unsigned short>(*source++);

184 // If we have a surrogate pair, convert to UChar32 first.

185 if (ch >= 0xD800 && ch <= 0xDBFF) {

186 // If the 16 bits following the high surrogate are in the source buf fer...

187 if (source < sourceEnd) {

188 UChar32 ch2 = static_cast<unsigned short>(*source);

189 // If it's a low surrogate, convert to UChar32.

190 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {

191 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;

192 ++source;

193 } else if (strict) { // it's an unpaired high surrogate

194 --source; // return to the illegal value itself

195 result = sourceIllegal;

196 break;

197 }

198 } else { // We don't have the 16 bits following the high surrogate.

199 --source; // return to the high surrogate

200 result = sourceExhausted;

201 break;

202 }

203 } else if (strict) {

204 // UTF-16 surrogate values are illegal in UTF-32

205 if (ch >= 0xDC00 && ch <= 0xDFFF) {

206 --source; // return to the illegal value itself

207 result = sourceIllegal;

208 break;

209 }

210 }

211 // Figure out how many bytes the result will require

212 if (ch < (UChar32)0x80) {

213 bytesToWrite = 1;

214 } else if (ch < (UChar32)0x800) {

215 bytesToWrite = 2;

216 } else if (ch < (UChar32)0x10000) {

217 bytesToWrite = 3;

218 } else if (ch < (UChar32)0x110000) {

219 bytesToWrite = 4;

220 } else {

221 bytesToWrite = 3;

222 ch = replacementCharacter;

223 }

224

225 target += bytesToWrite;

226 if (target > targetEnd) {

227 source = oldSource; // Back up source pointer!

228 target -= bytesToWrite;

229 result = targetExhausted;

230 break;

231 }

232 switch (bytesToWrite) { // note: everything falls through.

233 case 4: *--target = (char)((ch \| byteMark) & byteMask); ch >>= 6;

234 case 3: *--target = (char)((ch \| byteMark) & byteMask); ch >>= 6;

235 case 2: *--target = (char)((ch \| byteMark) & byteMask); ch >>= 6;

236 case 1: *--target = (char)(ch \| firstByteMark[bytesToWrite]);

237 }

238 target += bytesToWrite;

239 }

240 *sourceStart = source;

241 *targetStart = target;

242 return result;

243 }

244

// Checks whether the |length| bytes at |source| form a well-formed UTF-8
// sequence.
//
// This must be called with the length pre-determined by the first byte; the
// function does not verify that the lead byte announces |length| bytes.
// If presented with a length < 1 or > 4, this returns false without reading
// any byte. The Unicode definition of UTF-8 goes up to 4-byte sequences.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    const unsigned char lead = source[0];

    // Third and fourth bytes must be plain continuation bytes (10xxxxxx).
    for (int i = length - 1; i >= 2; --i) {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    if (length >= 2) {
        // The second byte must be a continuation byte for every lead byte...
        const unsigned char second = source[1];
        if (second < 0x80 || second > 0xBF)
            return false;

        // ...and a few lead bytes narrow its range further.
        switch (lead) {
        case 0xE0: // below 0xA0 would be an overlong 3-byte form
            if (second < 0xA0)
                return false;
            break;
        case 0xED: // above 0x9F would encode a UTF-16 surrogate
            if (second > 0x9F)
                return false;
            break;
        case 0xF0: // below 0x90 would be an overlong 4-byte form
            if (second < 0x90)
                return false;
            break;
        case 0xF4: // above 0x8F would exceed 0x10FFFF
            if (second > 0x8F)
                return false;
            break;
        default:
            break;
        }
    }

    // Continuation bytes and the overlong lead bytes 0xC0/0xC1 can never
    // start a sequence.
    if (lead >= 0x80 && lead < 0xC2)
        return false;

    // Lead bytes above 0xF4 would encode values beyond 0x10FFFF.
    return lead <= 0xF4;
}

274

275 // Magic values subtracted from a buffer value during UTF8 conversion.

276 // This table contains as many values as there might be trailing bytes

277 // in a UTF-8 sequence.

278 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E20 80UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x8 2082080UL) };

279

280 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)

281 {

282 UChar32 character = 0;

283

284 // The cases all fall through.

285 switch (length) {

286 case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6;

287 case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6;

288 case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6;

289 case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6;

290 case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6;

291 case 1: character += static_cast<unsigned char>(*sequence++);

292 }

293

294 return character - offsetsFromUTF8[length - 1];

295 }

296

297 ConversionResult convertUTF8ToUTF16(

298 const char** sourceStart, const char* sourceEnd,

299 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)

300 {

301 ConversionResult result = conversionOK;

302 const char* source = *sourceStart;

303 UChar* target = *targetStart;

304 UChar orAllData = 0;

305 while (source < sourceEnd) {

306 int utf8SequenceLength = inlineUTF8SequenceLength(*source);

307 if (sourceEnd - source < utf8SequenceLength) {

308 result = sourceExhausted;

309 break;

310 }

311 // Do this check whether lenient or strict

312 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8Seq uenceLength)) {

313 result = sourceIllegal;

314 break;

315 }

316

317 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);

318

319 if (target >= targetEnd) {

320 source -= utf8SequenceLength; // Back up source pointer!

321 result = targetExhausted;

322 break;

323 }

324

325 if (U_IS_BMP(character)) {

326 // UTF-16 surrogate values are illegal in UTF-32

327 if (U_IS_SURROGATE(character)) {

328 if (strict) {

329 source -= utf8SequenceLength; // return to the illegal value itself

330 result = sourceIllegal;

331 break;

332 } else {

333 *target++ = replacementCharacter;

334 orAllData \|= replacementCharacter;

335 }

336 } else {

337 *target++ = character; // normal case

338 orAllData \|= character;

339 }

340 } else if (U_IS_SUPPLEMENTARY(character)) {

341 // target is a character in range 0xFFFF - 0x10FFFF

342 if (target + 1 >= targetEnd) {

343 source -= utf8SequenceLength; // Back up source pointer!

344 result = targetExhausted;

345 break;

346 }

347 *target++ = U16_LEAD(character);

348 *target++ = U16_TRAIL(character);

349 orAllData = 0xffff;

350 } else {

351 if (strict) {

352 source -= utf8SequenceLength; // return to the start

353 result = sourceIllegal;

354 break; // Bail out; shouldn't continue

355 } else {

356 *target++ = replacementCharacter;

357 orAllData \|= replacementCharacter;

358 }

359 }

360 }

361 *sourceStart = source;

362 *targetStart = target;

363

364 if (sourceAllASCII)

365 *sourceAllASCII = !(orAllData & ~0x7f);

366

367 return result;

368 }

369

370 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, c onst char* dataEnd, unsigned& dataLength, unsigned& utf16Length)

371 {

372 if (!data)

373 return 0;

374

375 StringHasher stringHasher;

376 dataLength = 0;

377 utf16Length = 0;

378

379 while (data < dataEnd \|\| (!dataEnd && *data)) {

380 if (isASCII(*data)) {

381 stringHasher.addCharacter(*data++);

382 dataLength++;

383 utf16Length++;

384 continue;

385 }

386

387 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);

388 dataLength += utf8SequenceLength;

389

390 if (!dataEnd) {

391 for (int i = 1; i < utf8SequenceLength; ++i) {

392 if (!data[i])

393 return 0;

394 }

395 } else if (dataEnd - data < utf8SequenceLength)

396 return 0;

397

398 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8Seque nceLength))

399 return 0;

400

401 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);

402 ASSERT(!isASCII(character));

403

404 if (U_IS_BMP(character)) {

405 // UTF-16 surrogate values are illegal in UTF-32

406 if (U_IS_SURROGATE(character))

407 return 0;

408 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case

409 utf16Length++;

410 } else if (U_IS_SUPPLEMENTARY(character)) {

411 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),

412 static_cast<UChar>(U16_TRAIL(character))) ;

413 utf16Length += 2;

414 } else

415 return 0;

416 }

417

418 return stringHasher.hashWithTop8BitsMasked();

419 }

420

421 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)

422 {

423 while (b < bEnd) {

424 if (isASCII(*b)) {

425 if (a++ != b++)

426 return false;

427 continue;

428 }

429

430 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);

431

432 if (bEnd - b < utf8SequenceLength)

433 return false;

434

435 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8Sequence Length))

436 return 0;

437

438 UChar32 character = readUTF8Sequence(b, utf8SequenceLength);

439 ASSERT(!isASCII(character));

440

441 if (U_IS_BMP(character)) {

442 // UTF-16 surrogate values are illegal in UTF-32

443 if (U_IS_SURROGATE(character))

444 return false;

445 if (*a++ != character)

446 return false;

447 } else if (U_IS_SUPPLEMENTARY(character)) {

448 if (*a++ != U16_LEAD(character))

449 return false;

450 if (*a++ != U16_TRAIL(character))

451 return false;

452 } else

453 return false;

454 }

455

456 return a == aEnd;

457 }

458

459 } // namespace Unicode

460 } // namespace WTF

OLD	NEW

« no previous file with comments | « Source/WTF/wtf/unicode/UTF8.h ('k') | Source/WTF/wtf/unicode/Unicode.h » ('j') | Source/config.h » ('J')