third_party/WebKit/Source/wtf/text/UTF8.cpp - Issue 2764243002: Move files in wtf/ to platform/wtf/ (Part 9).

Side by Side Diff: third_party/WebKit/Source/wtf/text/UTF8.cpp

Issue 2764243002: Move files in wtf/ to platform/wtf/ (Part 9). (Closed)

Patch Set: Rebase. Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /*

2 * Copyright (C) 2007 Apple Inc. All rights reserved.

3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>

4 *

5 * Redistribution and use in source and binary forms, with or without

6 * modification, are permitted provided that the following conditions

7 * are met:

8 * 1. Redistributions of source code must retain the above copyright

9 * notice, this list of conditions and the following disclaimer.

10 * 2. Redistributions in binary form must reproduce the above copyright

11 * notice, this list of conditions and the following disclaimer in the

12 * documentation and/or other materials provided with the distribution.

13 *

14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY

15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR

17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR

18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY

22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

25 */

26

27 #include "wtf/text/UTF8.h"

28

29 #include "wtf/ASCIICType.h"

30 #include "wtf/StringHasher.h"

31 #include "wtf/text/CharacterNames.h"

32

33 namespace WTF {

34 namespace Unicode {

35

// Maps a UTF-8 lead byte to the total length of the sequence it introduces.
// Returns 2, 3 or 4 for lead bytes of the form 110xxxxx, 1110xxxx and
// 11110xxx respectively, and 0 for every other byte value (anything below
// 0xC0, and 0xF8 or above).
inline int inlineUTF8SequenceLengthNonASCII(char b0) {
  // Work on the unsigned byte value so the ranges read naturally regardless
  // of whether plain char is signed.
  const unsigned lead = static_cast<unsigned char>(b0);
  if (lead < 0xC0)
    return 0;  // Top two bits are not both set: cannot start a sequence.
  if (lead < 0xE0)
    return 2;  // 110xxxxx
  if (lead < 0xF0)
    return 3;  // 1110xxxx
  if (lead < 0xF8)
    return 4;  // 11110xxx
  return 0;    // 11111xxx: no sequence length is assigned to this byte.
}

47

48 inline int inlineUTF8SequenceLength(char b0) {

49 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);

50 }

51

// Marker bits OR-ed into the first byte of an encoded sequence, indexed by
// the total number of bytes in that sequence. The table has one entry per
// UTF-8 sequence type (one byte, two bytes, ...), although legal UTF-8 never
// uses more than 4 bytes. Index 0 is a placeholder.
static const unsigned char firstByteMark[7] = {
    0x00,  // [0] placeholder
    0x00,  // [1] single byte: no marker bits
    0xC0,  // [2] 110xxxxx
    0xE0,  // [3] 1110xxxx
    0xF0,  // [4] 11110xxx
    0xF8,  // [5] 111110xx
    0xFC,  // [6] 1111110x
};

59

60 ConversionResult convertLatin1ToUTF8(const LChar** sourceStart,

61 const LChar* sourceEnd,

62 char** targetStart,

63 char* targetEnd) {

64 ConversionResult result = conversionOK;

65 const LChar* source = *sourceStart;

66 char* target = *targetStart;

67 while (source < sourceEnd) {

68 UChar32 ch;

69 unsigned short bytesToWrite = 0;

70 const UChar32 byteMask = 0xBF;

71 const UChar32 byteMark = 0x80;

72 const LChar* oldSource =

73 source; // In case we have to back up because of target overflow.

74 ch = static_cast<unsigned short>(*source++);

75

76 // Figure out how many bytes the result will require

77 if (ch < (UChar32)0x80)

78 bytesToWrite = 1;

79 else

80 bytesToWrite = 2;

81

82 target += bytesToWrite;

83 if (target > targetEnd) {

84 source = oldSource; // Back up source pointer!

85 target -= bytesToWrite;

86 result = targetExhausted;

87 break;

88 }

89 switch (bytesToWrite) { // note: everything falls through.

90 case 2:

91 *--target = (char)((ch \| byteMark) & byteMask);

92 ch >>= 6;

93 case 1:

94 *--target = (char)(ch \| firstByteMark[bytesToWrite]);

95 }

96 target += bytesToWrite;

97 }

98 *sourceStart = source;

99 *targetStart = target;

100 return result;

101 }

102

103 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart,

104 const UChar* sourceEnd,

105 char** targetStart,

106 char* targetEnd,

107 bool strict) {

108 ConversionResult result = conversionOK;

109 const UChar* source = *sourceStart;

110 char* target = *targetStart;

111 while (source < sourceEnd) {

112 UChar32 ch;

113 unsigned short bytesToWrite = 0;

114 const UChar32 byteMask = 0xBF;

115 const UChar32 byteMark = 0x80;

116 const UChar* oldSource =

117 source; // In case we have to back up because of target overflow.

118 ch = static_cast<unsigned short>(*source++);

119 // If we have a surrogate pair, convert to UChar32 first.

120 if (ch >= 0xD800 && ch <= 0xDBFF) {

121 // If the 16 bits following the high surrogate are in the source buffer...

122 if (source < sourceEnd) {

123 UChar32 ch2 = static_cast<unsigned short>(*source);

124 // If it's a low surrogate, convert to UChar32.

125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {

126 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;

127 ++source;

128 } else if (strict) { // it's an unpaired high surrogate

129 --source; // return to the illegal value itself

130 result = sourceIllegal;

131 break;

132 }

133 } else { // We don't have the 16 bits following the high surrogate.

134 --source; // return to the high surrogate

135 result = sourceExhausted;

136 break;

137 }

138 } else if (strict) {

139 // UTF-16 surrogate values are illegal in UTF-32

140 if (ch >= 0xDC00 && ch <= 0xDFFF) {

141 --source; // return to the illegal value itself

142 result = sourceIllegal;

143 break;

144 }

145 }

146 // Figure out how many bytes the result will require

147 if (ch < (UChar32)0x80) {

148 bytesToWrite = 1;

149 } else if (ch < (UChar32)0x800) {

150 bytesToWrite = 2;

151 } else if (ch < (UChar32)0x10000) {

152 bytesToWrite = 3;

153 } else if (ch < (UChar32)0x110000) {

154 bytesToWrite = 4;

155 } else {

156 bytesToWrite = 3;

157 ch = replacementCharacter;

158 }

159

160 target += bytesToWrite;

161 if (target > targetEnd) {

162 source = oldSource; // Back up source pointer!

163 target -= bytesToWrite;

164 result = targetExhausted;

165 break;

166 }

167 switch (bytesToWrite) { // note: everything falls through.

168 case 4:

169 *--target = (char)((ch \| byteMark) & byteMask);

170 ch >>= 6;

171 case 3:

172 *--target = (char)((ch \| byteMark) & byteMask);

173 ch >>= 6;

174 case 2:

175 *--target = (char)((ch \| byteMark) & byteMask);

176 ch >>= 6;

177 case 1:

178 *--target = (char)(ch \| firstByteMark[bytesToWrite]);

179 }

180 target += bytesToWrite;

181 }

182 *sourceStart = source;

183 *targetStart = target;

184 return result;

185 }

186

// Checks whether the |length| bytes starting at |source| form one well-formed
// UTF-8 sequence.
//
// |length| must be the length pre-determined from the first byte. Any length
// outside 1..4 yields false: the Unicode definition of UTF-8 goes up to
// 4-byte sequences only.
//
// Bytes are validated from the last one back towards the first; each case
// deliberately falls through into the next once its own byte is accepted.
static bool isLegalUTF8(const unsigned char* source, int length) {
  unsigned char a;
  const unsigned char* srcptr = source + length;
  switch (length) {
    default:
      return false;
    // Everything else falls through when "true"...
    case 4:
      // Fourth byte must be a continuation byte (0x80..0xBF).
      if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
        return false;
      // fall through
    case 3:
      // Third byte must be a continuation byte (0x80..0xBF).
      if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
        return false;
      // fall through
    case 2:
      // Second byte: upper bound here, lower bound depends on the lead byte.
      if ((a = (*--srcptr)) > 0xBF)
        return false;

      // no fall-through in this inner switch
      switch (*source) {
        case 0xE0:  // Second byte below 0xA0 would be an overlong 3-byte form.
          if (a < 0xA0)
            return false;
          break;
        case 0xED:  // Second byte above 0x9F would encode U+D800..U+DFFF.
          if (a > 0x9F)
            return false;
          break;
        case 0xF0:  // Second byte below 0x90 would be an overlong 4-byte form.
          if (a < 0x90)
            return false;
          break;
        case 0xF4:  // Second byte above 0x8F would exceed U+10FFFF.
          if (a > 0x8F)
            return false;
          break;
        default:
          if (a < 0x80)
            return false;
      }
      // fall through
    case 1:
      // The lead byte itself: 0x80..0xBF are continuation bytes and
      // 0xC0/0xC1 can only start overlong 2-byte forms. This must inspect the
      // byte value (*source), not the pointer.
      if (*source >= 0x80 && *source < 0xC2)
        return false;
  }
  // Lead bytes above 0xF4 would encode values beyond U+10FFFF.
  if (*source > 0xF4)
    return false;
  return true;
}

238

239 // Magic values subtracted from a buffer value during UTF8 conversion.

240 // This table contains as many values as there might be trailing bytes

241 // in a UTF-8 sequence.

242 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL,

243 0x00003080UL,

244 0x000E2080UL,

245 0x03C82080UL,

246 static_cast<UChar32>(0xFA082080UL),

247 static_cast<UChar32>(0x82082080UL)};

248

249 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length) {

250 UChar32 character = 0;

251

252 // The cases all fall through.

253 switch (length) {

254 case 6:

255 character += static_cast<unsigned char>(*sequence++);

256 character <<= 6;

257 case 5:

258 character += static_cast<unsigned char>(*sequence++);

259 character <<= 6;

260 case 4:

261 character += static_cast<unsigned char>(*sequence++);

262 character <<= 6;

263 case 3:

264 character += static_cast<unsigned char>(*sequence++);

265 character <<= 6;

266 case 2:

267 character += static_cast<unsigned char>(*sequence++);

268 character <<= 6;

269 case 1:

270 character += static_cast<unsigned char>(*sequence++);

271 }

272

273 return character - offsetsFromUTF8[length - 1];

274 }

275

276 ConversionResult convertUTF8ToUTF16(const char** sourceStart,

277 const char* sourceEnd,

278 UChar** targetStart,

279 UChar* targetEnd,

280 bool* sourceAllASCII,

281 bool strict) {

282 ConversionResult result = conversionOK;

283 const char* source = *sourceStart;

284 UChar* target = *targetStart;

285 UChar orAllData = 0;

286 while (source < sourceEnd) {

287 int utf8SequenceLength = inlineUTF8SequenceLength(*source);

288 if (sourceEnd - source < utf8SequenceLength) {

289 result = sourceExhausted;

290 break;

291 }

292 // Do this check whether lenient or strict

293 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source),

294 utf8SequenceLength)) {

295 result = sourceIllegal;

296 break;

297 }

298

299 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);

300

301 if (target >= targetEnd) {

302 source -= utf8SequenceLength; // Back up source pointer!

303 result = targetExhausted;

304 break;

305 }

306

307 if (U_IS_BMP(character)) {

308 // UTF-16 surrogate values are illegal in UTF-32

309 if (U_IS_SURROGATE(character)) {

310 if (strict) {

311 source -= utf8SequenceLength; // return to the illegal value itself

312 result = sourceIllegal;

313 break;

314 }

315 *target++ = replacementCharacter;

316 orAllData \|= replacementCharacter;

317 } else {

318 *target++ = static_cast<UChar>(character); // normal case

319 orAllData \|= character;

320 }

321 } else if (U_IS_SUPPLEMENTARY(character)) {

322 // target is a character in range 0xFFFF - 0x10FFFF

323 if (target + 1 >= targetEnd) {

324 source -= utf8SequenceLength; // Back up source pointer!

325 result = targetExhausted;

326 break;

327 }

328 *target++ = U16_LEAD(character);

329 *target++ = U16_TRAIL(character);

330 orAllData = 0xffff;

331 } else {

332 if (strict) {

333 source -= utf8SequenceLength; // return to the start

334 result = sourceIllegal;

335 break; // Bail out; shouldn't continue

336 } else {

337 *target++ = replacementCharacter;

338 orAllData \|= replacementCharacter;

339 }

340 }

341 }

342 *sourceStart = source;

343 *targetStart = target;

344

345 if (sourceAllASCII)

346 *sourceAllASCII = !(orAllData & ~0x7f);

347

348 return result;

349 }

350

351 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(

352 const char* data,

353 const char* dataEnd,

354 unsigned& dataLength,

355 unsigned& utf16Length) {

356 if (!data)

357 return 0;

358

359 StringHasher stringHasher;

360 dataLength = 0;

361 utf16Length = 0;

362

363 while (data < dataEnd \|\| (!dataEnd && *data)) {

364 if (isASCII(*data)) {

365 stringHasher.addCharacter(*data++);

366 dataLength++;

367 utf16Length++;

368 continue;

369 }

370

371 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);

372 dataLength += utf8SequenceLength;

373

374 if (!dataEnd) {

375 for (int i = 1; i < utf8SequenceLength; ++i) {

376 if (!data[i])

377 return 0;

378 }

379 } else if (dataEnd - data < utf8SequenceLength) {

380 return 0;

381 }

382

383 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data),

384 utf8SequenceLength))

385 return 0;

386

387 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);

388 DCHECK(!isASCII(character));

389

390 if (U_IS_BMP(character)) {

391 // UTF-16 surrogate values are illegal in UTF-32

392 if (U_IS_SURROGATE(character))

393 return 0;

394 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case

395 utf16Length++;

396 } else if (U_IS_SUPPLEMENTARY(character)) {

397 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),

398 static_cast<UChar>(U16_TRAIL(character)));

399 utf16Length += 2;

400 } else {

401 return 0;

402 }

403 }

404

405 return stringHasher.hashWithTop8BitsMasked();

406 }

407

408 template <typename CharType>

409 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a,

410 const CharType* aEnd,

411 const char* b,

412 const char* bEnd) {

413 while (b < bEnd) {

414 if (isASCII(*b)) {

415 if (a++ != b++)

416 return false;

417 continue;

418 }

419

420 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);

421

422 if (bEnd - b < utf8SequenceLength)

423 return false;

424

425 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b),

426 utf8SequenceLength))

427 return 0;

428

429 UChar32 character = readUTF8Sequence(b, utf8SequenceLength);

430 DCHECK(!isASCII(character));

431

432 if (U_IS_BMP(character)) {

433 // UTF-16 surrogate values are illegal in UTF-32

434 if (U_IS_SURROGATE(character))

435 return false;

436 if (*a++ != character)

437 return false;

438 } else if (U_IS_SUPPLEMENTARY(character)) {

439 if (*a++ != U16_LEAD(character))

440 return false;

441 if (*a++ != U16_TRAIL(character))

442 return false;

443 } else {

444 return false;

445 }

446 }

447

448 return a == aEnd;

449 }

450

451 bool equalUTF16WithUTF8(const UChar* a,

452 const UChar* aEnd,

453 const char* b,

454 const char* bEnd) {

455 return equalWithUTF8Internal(a, aEnd, b, bEnd);

456 }

457

458 bool equalLatin1WithUTF8(const LChar* a,

459 const LChar* aEnd,

460 const char* b,

461 const char* bEnd) {

462 return equalWithUTF8Internal(a, aEnd, b, bEnd);

463 }

464

465 } // namespace Unicode

466 } // namespace WTF

OLD	NEW

« no previous file with comments | « third_party/WebKit/Source/wtf/text/UTF8.h ('k') | third_party/WebKit/Source/wtf/text/WTFString.h » ('j') | no next file with comments »