third_party/WebKit/Source/wtf/text/TextCodecICU.cpp - Issue 2764283002: Move files in wtf/ to platform/wtf/ (Part 10).

Side by Side Diff: third_party/WebKit/Source/wtf/text/TextCodecICU.cpp

Issue 2764283002: Move files in wtf/ to platform/wtf/ (Part 10). (Closed)

Patch Set: Rebase. Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /*

2 * Copyright (C) 2004, 2006, 2007, 2008, 2011 Apple Inc. All rights reserved.

3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>

4 *

5 * Redistribution and use in source and binary forms, with or without

6 * modification, are permitted provided that the following conditions

7 * are met:

8 * 1. Redistributions of source code must retain the above copyright

9 * notice, this list of conditions and the following disclaimer.

10 * 2. Redistributions in binary form must reproduce the above copyright

11 * notice, this list of conditions and the following disclaimer in the

12 * documentation and/or other materials provided with the distribution.

13 *

14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY

15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR

17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR

18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY

22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

25 */

26

27 #include "wtf/text/TextCodecICU.h"

28

29 #include "wtf/Assertions.h"

30 #include "wtf/PtrUtil.h"

31 #include "wtf/StringExtras.h"

32 #include "wtf/Threading.h"

33 #include "wtf/WTFThreadData.h"

34 #include "wtf/text/CString.h"

35 #include "wtf/text/CharacterNames.h"

36 #include "wtf/text/StringBuilder.h"

37 #include <memory>

38 #include <unicode/ucnv.h>

39 #include <unicode/ucnv_cb.h>

40

41 namespace WTF {

42

// Capacity, in elements, of the on-stack scratch buffers that decode() and
// encodeInternal() fill on each conversion pass.
43 const size_t ConversionBufferSize = 16384;

44

45 ICUConverterWrapper::~ICUConverterWrapper() {

46 if (converter)

47 ucnv_close(converter);

48 }

49

// Returns a mutable reference to the `converter` slot of the
// ICUConverterWrapper obtained from wtfThreadData(). Callers read the slot to
// reuse a parked converter and write it to park or clear one.
// NOTE(review): presumably this cache is per-thread -- confirm in
// WTFThreadData.
50 static UConverter*& cachedConverterICU() {

51 return wtfThreadData().cachedConverterICU().converter;

52 }

53

54 std::unique_ptr<TextCodec> TextCodecICU::create(const TextEncoding& encoding,

55 const void*) {

56 return WTF::wrapUnique(new TextCodecICU(encoding));

57 }

58

59 void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar) {

60 // We register Hebrew with logical ordering using a separate name.

61 // Otherwise, this would share the same canonical name as the

62 // visual ordering case, and then TextEncoding could not tell them

63 // apart; ICU treats these names as synonyms.

64 registrar("ISO-8859-8-I", "ISO-8859-8-I");

65

66 int32_t numEncodings = ucnv_countAvailable();

67 for (int32_t i = 0; i < numEncodings; ++i) {

68 const char* name = ucnv_getAvailableName(i);

69 UErrorCode error = U_ZERO_ERROR;

70 #if !defined(USING_SYSTEM_ICU)

71 const char* primaryStandard = "HTML";

72 const char* secondaryStandard = "MIME";

73 #else

74 const char* primaryStandard = "MIME";

75 const char* secondaryStandard = "IANA";

76 #endif

77 const char* standardName =

78 ucnv_getStandardName(name, primaryStandard, &error);

79 if (U_FAILURE(error) \|\| !standardName) {

80 error = U_ZERO_ERROR;

81 // Try IANA to pick up 'windows-12xx' and other names

82 // which are not preferred MIME names but are widely used.

83 standardName = ucnv_getStandardName(name, secondaryStandard, &error);

84 if (U_FAILURE(error) \|\| !standardName)

85 continue;

86 }

87

88 // A number of these aliases are handled in Chrome's copy of ICU, but

89 // Chromium can be compiled with the system ICU.

90

91 // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other

92 // browsers.

93 // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native

94 // encoding for encoding GB_2312-80 and several others. So, we need to

95 // override this behavior, too.

96 #if defined(USING_SYSTEM_ICU)

97 if (!strcmp(standardName, "GB2312") \|\| !strcmp(standardName, "GB_2312-80"))

98 standardName = "GBK";

99 // Similarly, EUC-KR encodings all map to an extended version, but

100 // per HTML5, the canonical name still should be EUC-KR.

101 else if (!strcmp(standardName, "EUC-KR") \|\|

102 !strcmp(standardName, "KSC_5601") \|\|

103 !strcmp(standardName, "cp1363"))

104 standardName = "EUC-KR";

105 // And so on.

106 else if (!strcasecmp(standardName, "iso-8859-9"))

107 // This name is returned in different case by ICU 3.2 and 3.6.

108 standardName = "windows-1254";

109 else if (!strcmp(standardName, "TIS-620"))

110 standardName = "windows-874";

111 #endif

112

113 registrar(standardName, standardName);

114

115 uint16_t numAliases = ucnv_countAliases(name, &error);

116 DCHECK(U_SUCCESS(error));

117 if (U_SUCCESS(error))

118 for (uint16_t j = 0; j < numAliases; ++j) {

119 error = U_ZERO_ERROR;

120 const char* alias = ucnv_getAlias(name, j, &error);

121 DCHECK(U_SUCCESS(error));

122 if (U_SUCCESS(error) && alias != standardName)

123 registrar(alias, standardName);

124 }

125 }

126

127 // These two entries have to be added here because ICU's converter table

128 // cannot have both ISO-8859-8-I and ISO-8859-8.

129 registrar("csISO88598I", "ISO-8859-8-I");

130 registrar("logical", "ISO-8859-8-I");

131

132 #if defined(USING_SYSTEM_ICU)

133 // Additional alias for MacCyrillic not present in ICU.

134 registrar("maccyrillic", "x-mac-cyrillic");

135

136 // Additional aliases that historically were present in the encoding

137 // table in WebKit on Macintosh that don't seem to be present in ICU.

138 // Perhaps we can prove these are not used on the web and remove them.

139 // Or perhaps we can get them added to ICU.

140 registrar("x-mac-roman", "macintosh");

141 registrar("x-mac-ukrainian", "x-mac-cyrillic");

142 registrar("cn-big5", "Big5");

143 registrar("x-x-big5", "Big5");

144 registrar("cn-gb", "GBK");

145 registrar("csgb231280", "GBK");

146 registrar("x-euc-cn", "GBK");

147 registrar("x-gbk", "GBK");

148 registrar("koi", "KOI8-R");

149 registrar("visual", "ISO-8859-8");

150 registrar("winarabic", "windows-1256");

151 registrar("winbaltic", "windows-1257");

152 registrar("wincyrillic", "windows-1251");

153 registrar("iso-8859-11", "windows-874");

154 registrar("iso8859-11", "windows-874");

155 registrar("dos-874", "windows-874");

156 registrar("wingreek", "windows-1253");

157 registrar("winhebrew", "windows-1255");

158 registrar("winlatin2", "windows-1250");

159 registrar("winturkish", "windows-1254");

160 registrar("winvietnamese", "windows-1258");

161 registrar("x-cp1250", "windows-1250");

162 registrar("x-cp1251", "windows-1251");

163 registrar("x-euc", "EUC-JP");

164 registrar("x-windows-949", "EUC-KR");

165 registrar("KSC5601", "EUC-KR");

166 registrar("x-uhc", "EUC-KR");

167 registrar("shift-jis", "Shift_JIS");

168

169 // Alternative spelling of ISO encoding names.

170 registrar("ISO8859-1", "ISO-8859-1");

171 registrar("ISO8859-2", "ISO-8859-2");

172 registrar("ISO8859-3", "ISO-8859-3");

173 registrar("ISO8859-4", "ISO-8859-4");

174 registrar("ISO8859-5", "ISO-8859-5");

175 registrar("ISO8859-6", "ISO-8859-6");

176 registrar("ISO8859-7", "ISO-8859-7");

177 registrar("ISO8859-8", "ISO-8859-8");

178 registrar("ISO8859-8-I", "ISO-8859-8-I");

179 registrar("ISO8859-9", "ISO-8859-9");

180 registrar("ISO8859-10", "ISO-8859-10");

181 registrar("ISO8859-13", "ISO-8859-13");

182 registrar("ISO8859-14", "ISO-8859-14");

183 registrar("ISO8859-15", "ISO-8859-15");

184 // No need to have an entry for ISO8859-16. ISO-8859-16 has just one label

185 // listed in WHATWG Encoding Living Standard, http://encoding.spec.whatwg.org/

186

187 // Additional aliases present in the WHATWG Encoding Standard

188 // and Firefox (as of Oct 2014), but not in the upstream ICU.

189 // Three entries for windows-1252 need not be listed here because

190 // TextCodecLatin1 registers them.

191 registrar("csiso58gb231280", "GBK");

192 registrar("csiso88596e", "ISO-8859-6");

193 registrar("csiso88596i", "ISO-8859-6");

194 registrar("csiso88598e", "ISO-8859-8");

195 registrar("gb_2312", "GBK");

196 registrar("iso88592", "ISO-8859-2");

197 registrar("iso88593", "ISO-8859-3");

198 registrar("iso88594", "ISO-8859-4");

199 registrar("iso88595", "ISO-8859-5");

200 registrar("iso88596", "ISO-8859-6");

201 registrar("iso88597", "ISO-8859-7");

202 registrar("iso88598", "ISO-8859-8");

203 registrar("iso88599", "windows-1254");

204 registrar("iso885910", "ISO-8859-10");

205 registrar("iso885911", "windows-874");

206 registrar("iso885913", "ISO-8859-13");

207 registrar("iso885914", "ISO-8859-14");

208 registrar("iso885915", "ISO-8859-15");

209 registrar("iso_8859-2", "ISO-8859-2");

210 registrar("iso_8859-3", "ISO-8859-3");

211 registrar("iso_8859-4", "ISO-8859-4");

212 registrar("iso_8859-5", "ISO-8859-5");

213 registrar("iso_8859-6", "ISO-8859-6");

214 registrar("iso_8859-7", "ISO-8859-7");

215 registrar("iso_8859-8", "ISO-8859-8");

216 registrar("iso_8859-9", "windows-1254");

217 registrar("iso_8859-15", "ISO-8859-15");

218 registrar("koi8_r", "KOI8-R");

219 registrar("x-cp1253", "windows-1253");

220 registrar("x-cp1254", "windows-1254");

221 registrar("x-cp1255", "windows-1255");

222 registrar("x-cp1256", "windows-1256");

223 registrar("x-cp1257", "windows-1257");

224 registrar("x-cp1258", "windows-1258");

225 #endif

226 }

227

228 void TextCodecICU::registerCodecs(TextCodecRegistrar registrar) {

229 // See comment above in registerEncodingNames.

230 registrar("ISO-8859-8-I", create, 0);

231

232 int32_t numEncodings = ucnv_countAvailable();

233 for (int32_t i = 0; i < numEncodings; ++i) {

234 const char* name = ucnv_getAvailableName(i);

235 UErrorCode error = U_ZERO_ERROR;

236 const char* standardName = ucnv_getStandardName(name, "MIME", &error);

237 if (!U_SUCCESS(error) \|\| !standardName) {

238 error = U_ZERO_ERROR;

239 standardName = ucnv_getStandardName(name, "IANA", &error);

240 if (!U_SUCCESS(error) \|\| !standardName)

241 continue;

242 }

243 registrar(standardName, create, 0);

244 }

245 }

246

// Constructs a codec for |encoding| without opening an ICU converter; the
// converter pointer starts out null and is filled in later by
// createICUConverter(). When built with USING_SYSTEM_ICU, the GBK fallback
// flag also starts out false.
247 TextCodecICU::TextCodecICU(const TextEncoding& encoding)

248 : m_encoding(encoding),

249 m_converterICU(0)

250 #if defined(USING_SYSTEM_ICU)

251 ,

252 m_needsGBKFallbacks(false)

253 #endif

254 {

255 }

256

// Hands the converter (if any) to releaseICUConverter(), which parks it in the
// converter cache instead of closing it.
257 TextCodecICU::~TextCodecICU() {

258 releaseICUConverter();

259 }

260

261 void TextCodecICU::releaseICUConverter() const {

262 if (m_converterICU) {

263 UConverter*& cachedConverter = cachedConverterICU();

264 if (cachedConverter)

265 ucnv_close(cachedConverter);

266 cachedConverter = m_converterICU;

267 m_converterICU = 0;

268 }

269 }

270

271 void TextCodecICU::createICUConverter() const {

272 DCHECK(!m_converterICU);

273

274 #if defined(USING_SYSTEM_ICU)

275 const char* name = m_encoding.name();

276 m_needsGBKFallbacks =

277 name[0] == 'G' && name[1] == 'B' && name[2] == 'K' && !name[3];

278 #endif

279

280 UErrorCode err;

281

282 UConverter*& cachedConverter = cachedConverterICU();

283 if (cachedConverter) {

284 err = U_ZERO_ERROR;

285 const char* cachedName = ucnv_getName(cachedConverter, &err);

286 if (U_SUCCESS(err) && m_encoding == cachedName) {

287 m_converterICU = cachedConverter;

288 cachedConverter = 0;

289 return;

290 }

291 }

292

293 err = U_ZERO_ERROR;

294 m_converterICU = ucnv_open(m_encoding.name(), &err);

295 DLOG_IF(ERROR, err == U_AMBIGUOUS_ALIAS_WARNING)

296 << "ICU ambiguous alias warning for encoding: " << m_encoding.name();

297 if (m_converterICU)

298 ucnv_setFallback(m_converterICU, TRUE);

299 }

300

301 int TextCodecICU::decodeToBuffer(UChar* target,

302 UChar* targetLimit,

303 const char*& source,

304 const char* sourceLimit,

305 int32_t* offsets,

306 bool flush,

307 UErrorCode& err) {

308 UChar* targetStart = target;

309 err = U_ZERO_ERROR;

310 ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit,

311 offsets, flush, &err);

312 return target - targetStart;

313 }

314

315 class ErrorCallbackSetter final {

316 STACK_ALLOCATED();

317

318 public:

319 ErrorCallbackSetter(UConverter* converter, bool stopOnError)

320 : m_converter(converter), m_shouldStopOnEncodingErrors(stopOnError) {

321 if (m_shouldStopOnEncodingErrors) {

322 UErrorCode err = U_ZERO_ERROR;

323 ucnv_setToUCallBack(m_converter, UCNV_TO_U_CALLBACK_STOP, 0,

324 &m_savedAction, &m_savedContext, &err);

325 DCHECK_EQ(err, U_ZERO_ERROR);

326 }

327 }

328 ~ErrorCallbackSetter() {

329 if (m_shouldStopOnEncodingErrors) {

330 UErrorCode err = U_ZERO_ERROR;

331 const void* oldContext;

332 UConverterToUCallback oldAction;

333 ucnv_setToUCallBack(m_converter, m_savedAction, m_savedContext,

334 &oldAction, &oldContext, &err);

335 DCHECK_EQ(oldAction, UCNV_TO_U_CALLBACK_STOP);

336 DCHECK(!oldContext);

337 DCHECK_EQ(err, U_ZERO_ERROR);

338 }

339 }

340

341 private:

342 UConverter* m_converter;

343 bool m_shouldStopOnEncodingErrors;

344 const void* m_savedContext;

345 UConverterToUCallback m_savedAction;

346 };

347

348 String TextCodecICU::decode(const char* bytes,

349 size_t length,

350 FlushBehavior flush,

351 bool stopOnError,

352 bool& sawError) {

353 // Get a converter for the passed-in encoding.

354 if (!m_converterICU) {

355 createICUConverter();

356 DCHECK(m_converterICU);

357 if (!m_converterICU) {

358 DLOG(ERROR)

359 << "error creating ICU encoder even though encoding was in table";

360 return String();

361 }

362 }

363

364 ErrorCallbackSetter callbackSetter(m_converterICU, stopOnError);

365

366 StringBuilder result;

367

368 UChar buffer[ConversionBufferSize];

369 UChar* bufferLimit = buffer + ConversionBufferSize;

370 const char* source = reinterpret_cast<const char*>(bytes);

371 const char* sourceLimit = source + length;

372 int32_t* offsets = nullptr;

373 UErrorCode err = U_ZERO_ERROR;

374

375 do {

376 int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit,

377 offsets, flush != DoNotFlush, err);

378 result.append(buffer, ucharsDecoded);

379 } while (err == U_BUFFER_OVERFLOW_ERROR);

380

381 if (U_FAILURE(err)) {

382 // flush the converter so it can be reused, and not be bothered by this

383 // error.

384 do {

385 decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true,

386 err);

387 } while (source < sourceLimit);

388 sawError = true;

389 }

390

391 #if !defined(USING_SYSTEM_ICU)

392 // Chrome's copy of ICU does not have the issue described below.

393 return result.toString();

394 #else

395 String resultString = result.toString();

396

397 // <http://bugs.webkit.org/show_bug.cgi?id=17014>

398 // Simplified Chinese pages use the code A3A0 to mean "full-width space", but

399 // ICU decodes it as U+E5E5.

400 if (!strcmp(m_encoding.name(), "GBK")) {

401 if (!strcasecmp(m_encoding.name(), "gb18030"))

402 resultString.replace(0xE5E5, ideographicSpaceCharacter);

403 // Make GBK compliant to the encoding spec and align with GB18030

404 resultString.replace(0x01F9, 0xE7C8);

405 // FIXME: Once https://www.w3.org/Bugs/Public/show_bug.cgi?id=28740#c3

406 // is resolved, add U+1E3F => 0xE7C7.

407 }

408

409 return resultString;

410 #endif

411 }

412

#if defined(USING_SYSTEM_ICU)
// U+01F9 and U+1E3F have to be mapped to xA8xBF and xA8xBC per the encoding
// spec, but the ICU converter does not have them. Returns the code unit that
// ICU does map to the desired bytes, or 0 when |character| needs no
// substitution.
static UChar fallbackForGBK(UChar32 character) {
  if (character == 0x01F9)
    return 0xE7C8;  // mapped to xA8xBF by ICU.
  if (character == 0x1E3F)
    return 0xE7C7;  // mapped to xA8xBC by ICU.
  return 0;
}
#endif

426

427 // Generic helper for writing escaped entities using the specfied

428 // UnencodableHandling.

429 static void formatEscapedEntityCallback(const void* context,

430 UConverterFromUnicodeArgs* fromUArgs,

431 const UChar* codeUnits,

432 int32_t length,

433 UChar32 codePoint,

434 UConverterCallbackReason reason,

435 UErrorCode* err,

436 UnencodableHandling handling) {

437 if (reason == UCNV_UNASSIGNED) {

438 *err = U_ZERO_ERROR;

439

440 UnencodableReplacementArray entity;

441 int entityLen =

442 TextCodec::getUnencodableReplacement(codePoint, handling, entity);

443 ucnv_cbFromUWriteBytes(fromUArgs, entity, entityLen, 0, err);

444 } else {

445 UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length,

446 codePoint, reason, err);

447 }

448 }

449

450 static void numericEntityCallback(const void* context,

451 UConverterFromUnicodeArgs* fromUArgs,

452 const UChar* codeUnits,

453 int32_t length,

454 UChar32 codePoint,

455 UConverterCallbackReason reason,

456 UErrorCode* err) {

457 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint,

458 reason, err, EntitiesForUnencodables);

459 }

460

461 // Invalid character handler when writing escaped entities in CSS encoding for

462 // unrepresentable characters. See the declaration of TextCodec::encode for

463 // more.

464 static void cssEscapedEntityCallback(const void* context,

465 UConverterFromUnicodeArgs* fromUArgs,

466 const UChar* codeUnits,

467 int32_t length,

468 UChar32 codePoint,

469 UConverterCallbackReason reason,

470 UErrorCode* err) {

471 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint,

472 reason, err, CSSEncodedEntitiesForUnencodables);

473 }

474

475 // Invalid character handler when writing escaped entities in HTML/XML encoding

476 // for unrepresentable characters. See the declaration of TextCodec::encode for

477 // more.

478 static void urlEscapedEntityCallback(const void* context,

479 UConverterFromUnicodeArgs* fromUArgs,

480 const UChar* codeUnits,

481 int32_t length,

482 UChar32 codePoint,

483 UConverterCallbackReason reason,

484 UErrorCode* err) {

485 formatEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint,

486 reason, err, URLEncodedEntitiesForUnencodables);

487 }

488

489 #if defined(USING_SYSTEM_ICU)

490 // Substitutes special GBK characters, escaping all other unassigned entities.

491 static void gbkCallbackEscape(const void* context,

492 UConverterFromUnicodeArgs* fromUArgs,

493 const UChar* codeUnits,

494 int32_t length,

495 UChar32 codePoint,

496 UConverterCallbackReason reason,

497 UErrorCode* err) {

498 UChar outChar;

499 if (reason == UCNV_UNASSIGNED && (outChar = fallbackForGBK(codePoint))) {

500 const UChar* source = &outChar;

501 *err = U_ZERO_ERROR;

502 ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err);

503 return;

504 }

505 numericEntityCallback(context, fromUArgs, codeUnits, length, codePoint,

506 reason, err);

507 }

508

509 // Combines both gbkCssEscapedEntityCallback and GBK character substitution.

510 static void gbkCssEscapedEntityCallack(const void* context,

511 UConverterFromUnicodeArgs* fromUArgs,

512 const UChar* codeUnits,

513 int32_t length,

514 UChar32 codePoint,

515 UConverterCallbackReason reason,

516 UErrorCode* err) {

517 if (reason == UCNV_UNASSIGNED) {

518 if (UChar outChar = fallbackForGBK(codePoint)) {

519 const UChar* source = &outChar;

520 *err = U_ZERO_ERROR;

521 ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err);

522 return;

523 }

524 cssEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint,

525 reason, err);

526 return;

527 }

528 UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint,

529 reason, err);

530 }

531

532 // Combines both gbkUrlEscapedEntityCallback and GBK character substitution.

533 static void gbkUrlEscapedEntityCallack(const void* context,

534 UConverterFromUnicodeArgs* fromUArgs,

535 const UChar* codeUnits,

536 int32_t length,

537 UChar32 codePoint,

538 UConverterCallbackReason reason,

539 UErrorCode* err) {

540 if (reason == UCNV_UNASSIGNED) {

541 if (UChar outChar = fallbackForGBK(codePoint)) {

542 const UChar* source = &outChar;

543 *err = U_ZERO_ERROR;

544 ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err);

545 return;

546 }

547 urlEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint,

548 reason, err);

549 return;

550 }

551 UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint,

552 reason, err);

553 }

554

555 static void gbkCallbackSubstitute(const void* context,

556 UConverterFromUnicodeArgs* fromUArgs,

557 const UChar* codeUnits,

558 int32_t length,

559 UChar32 codePoint,

560 UConverterCallbackReason reason,

561 UErrorCode* err) {

562 UChar outChar;

563 if (reason == UCNV_UNASSIGNED && (outChar = fallbackForGBK(codePoint))) {

564 const UChar* source = &outChar;

565 *err = U_ZERO_ERROR;

566 ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err);

567 return;

568 }

569 UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length,

570 codePoint, reason, err);

571 }

572 #endif // USING_SYSTEM_ICU

573

574 class TextCodecInput final {

575 STACK_ALLOCATED();

576

577 public:

578 TextCodecInput(const TextEncoding& encoding,

579 const UChar* characters,

580 size_t length)

581 : m_begin(characters), m_end(characters + length) {}

582

583 TextCodecInput(const TextEncoding& encoding,

584 const LChar* characters,

585 size_t length) {

586 m_buffer.reserveInitialCapacity(length);

587 for (size_t i = 0; i < length; ++i)

588 m_buffer.push_back(characters[i]);

589 m_begin = m_buffer.data();

590 m_end = m_begin + m_buffer.size();

591 }

592

593 const UChar* begin() const { return m_begin; }

594 const UChar* end() const { return m_end; }

595

596 private:

597 const UChar* m_begin;

598 const UChar* m_end;

599 Vector<UChar> m_buffer;

600 };

601

602 CString TextCodecICU::encodeInternal(const TextCodecInput& input,

603 UnencodableHandling handling) {

604 const UChar* source = input.begin();

605 const UChar* end = input.end();

606

607 UErrorCode err = U_ZERO_ERROR;

608

609 switch (handling) {

610 case QuestionMarksForUnencodables:

611 // Non-byte-based encodings (i.e. UTF-16/32) don't need substitutions

612 // since they can encode any code point, and ucnv_setSubstChars would

613 // require a multi-byte substitution anyway.

614 if (!m_encoding.isNonByteBasedEncoding())

615 ucnv_setSubstChars(m_converterICU, "?", 1, &err);

616 #if !defined(USING_SYSTEM_ICU)

617 ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0,

618 0, 0, &err);

619 #else

620 ucnv_setFromUCallBack(

621 m_converterICU, m_needsGBKFallbacks ? gbkCallbackSubstitute

622 : UCNV_FROM_U_CALLBACK_SUBSTITUTE,

623 0, 0, 0, &err);

624 #endif

625 break;

626 case EntitiesForUnencodables:

627 #if !defined(USING_SYSTEM_ICU)

628 ucnv_setFromUCallBack(m_converterICU, numericEntityCallback, 0, 0, 0,

629 &err);

630 #else

631 ucnv_setFromUCallBack(

632 m_converterICU,

633 m_needsGBKFallbacks ? gbkCallbackEscape : numericEntityCallback, 0, 0,

634 0, &err);

635 #endif

636 break;

637 case URLEncodedEntitiesForUnencodables:

638 #if !defined(USING_SYSTEM_ICU)

639 ucnv_setFromUCallBack(m_converterICU, urlEscapedEntityCallback, 0, 0, 0,

640 &err);

641 #else

642 ucnv_setFromUCallBack(m_converterICU,

643 m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack

644 : urlEscapedEntityCallback,

645 0, 0, 0, &err);

646 #endif

647 break;

648 case CSSEncodedEntitiesForUnencodables:

649 #if !defined(USING_SYSTEM_ICU)

650 ucnv_setFromUCallBack(m_converterICU, cssEscapedEntityCallback, 0, 0, 0,

651 &err);

652 #else

653 ucnv_setFromUCallBack(m_converterICU,

654 m_needsGBKFallbacks ? gbkCssEscapedEntityCallack

655 : cssEscapedEntityCallback,

656 0, 0, 0, &err);

657 #endif

658 break;

659 }

660

661 DCHECK(U_SUCCESS(err));

662 if (U_FAILURE(err))

663 return CString();

664

665 Vector<char> result;

666 size_t size = 0;

667 do {

668 char buffer[ConversionBufferSize];

669 char* target = buffer;

670 char* targetLimit = target + ConversionBufferSize;

671 err = U_ZERO_ERROR;

672 ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, end, 0,

673 true, &err);

674 size_t count = target - buffer;

675 result.grow(size + count);

676 memcpy(result.data() + size, buffer, count);

677 size += count;

678 } while (err == U_BUFFER_OVERFLOW_ERROR);

679

680 return CString(result.data(), size);

681 }

682

683 template <typename CharType>

684 CString TextCodecICU::encodeCommon(const CharType* characters,

685 size_t length,

686 UnencodableHandling handling) {

687 if (!length)

688 return "";

689

690 if (!m_converterICU)

691 createICUConverter();

692 if (!m_converterICU)

693 return CString();

694

695 TextCodecInput input(m_encoding, characters, length);

696 return encodeInternal(input, handling);

697 }

698

699 CString TextCodecICU::encode(const UChar* characters,

700 size_t length,

701 UnencodableHandling handling) {

702 return encodeCommon(characters, length, handling);

703 }

704

705 CString TextCodecICU::encode(const LChar* characters,

706 size_t length,

707 UnencodableHandling handling) {

708 return encodeCommon(characters, length, handling);

709 }

710

711 } // namespace WTF

OLD	NEW

« no previous file with comments | « third_party/WebKit/Source/wtf/text/TextCodecICU.h ('k') | third_party/WebKit/Source/wtf/text/TextCodecLatin1.h » ('j') | no next file with comments »