source/common/ucnvmbcs.c - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/common/ucnvmbcs.c

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unusued directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 /*

2 ******************************************************************************

3 *

4 * Copyright (C) 2000-2013, International Business Machines

5 * Corporation and others. All Rights Reserved.

6 *

7 ******************************************************************************

8 * file name: ucnvmbcs.c

9 * encoding: US-ASCII

10 * tab size: 8 (not used)

11 * indentation:4

12 *

13 * created on: 2000jul03

14 * created by: Markus W. Scherer

15 *

16 * The current code in this file replaces the previous implementation

17 * of conversion code from multi-byte codepages to Unicode and back.

18 * This implementation supports the following:

19 * - legacy variable-length codepages with up to 4 bytes per character

20 * - all Unicode code points (up to 0x10ffff)

21 * - efficient distinction of unassigned vs. illegal byte sequences

22 * - it is possible in fromUnicode() to directly deal with simple

23 * stateful encodings (used for EBCDIC_STATEFUL)

24 * - it is possible to convert Unicode code points

25 * to a single zero byte (but not as a fallback except for SBCS)

26 *

27 * Remaining limitations in fromUnicode:

28 * - byte sequences must not have leading zero bytes

29 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte

30 * - limitation to up to 4 bytes per character

31 *

32 * ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these

33 * limitations and adds m:n character mappings and other features.

34 * See ucnv_ext.h for details.

35 *

36 * Change history:

37 *

38 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U, MBCS_STAGE_2_FROM_U,

39 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2

40 * macros to ucnvmbcs.h file

41 */

42

43 #include "unicode/utypes.h"

44

45 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION

46

47 #include "unicode/ucnv.h"

48 #include "unicode/ucnv_cb.h"

49 #include "unicode/udata.h"

50 #include "unicode/uset.h"

51 #include "unicode/utf8.h"

52 #include "unicode/utf16.h"

53 #include "ucnv_bld.h"

54 #include "ucnvmbcs.h"

55 #include "ucnv_ext.h"

56 #include "ucnv_cnv.h"

57 #include "cmemory.h"

58 #include "cstring.h"

59 #include "cmutex.h"

60

/*
 * Compile-time switches that control optimizations according to the platform.
 * A value of 1 enables the corresponding unrolled code path, 0 disables it.
 */
#define MBCS_UNROLL_SINGLE_TO_BMP 1
#define MBCS_UNROLL_SINGLE_FROM_BMP 0

64

65 /*

66 * _MBCSHeader versions 5.3 & 4.3

67 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)

68 *

69 * This version is optional. Version 5 is used for incompatible data format changes.

70 * makeconv will continue to generate version 4 files if possible.

71 *

72 * Changes from version 4:

73 *

74 * The main difference is an additional _MBCSHeader field with

75 * - the length (number of uint32_t) of the _MBCSHeader

76 * - flags for further incompatible data format changes

77 * - flags for further, backward compatible data format changes

78 *

79 * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from

80 * the file and needs to be reconstituted at load time.

81 * This requires a utf8Friendly format with an additional mbcsIndex table for fast

82 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.

83 * (For details about these structures see below, and see ucnvmbcs.h.)

84 *

85 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order

86 * of the Unicode code points. (This requires that the .ucm file has the |0 etc.

87 * precision markers for all mappings.)

88 *

89 * All fallbacks have been moved to the extension table, leaving only roundtrips in the

90 * omitted data that can be reconstituted from the toUnicode data.

91 *

92 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.

93 * With only roundtrip mappings in the base fromUnicode data, this part is fully

94 * redundant with the mbcsIndex and will be reconstituted from that (also using the

95 * stage 1 table which contains the information about how stage 2 was compacted).

96 *

97 * The rest of the stage 2 table, the part for code points above maxFastUChar,

98 * is stored in the file and will be appended to the reconstituted part.

99 *

100 * The entire fromUBytes array is omitted from the file and will be reconstituted.

101 * This is done by enumerating all toUnicode roundtrip mappings, performing

102 * each mapping (using the stage 1 and reconstituted stage 2 tables) and

103 * writing instead of reading the byte values.

104 *

105 * _MBCSHeader version 4.3

106 *

107 * Change from version 4.2:

108 * - Optional utf8Friendly data structures, with 64-entry stage 3 block

109 * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS

110 * files which can be used instead of stages 1 & 2.

111 * Faster lookups for roundtrips from most commonly used characters,

112 * and lookups from UTF-8 byte sequences with a natural bit distribution.

113 * See ucnvmbcs.h for more details.

114 *

115 * Change from version 4.1:

116 * - Added an optional extension table structure at the end of the .cnv file.

117 * It is present if the upper bits of the header flags field contain a non-zero

118 * byte offset to it.

119 * Files that contain only a conversion table and no base table

120 * use the special outputType MBCS_OUTPUT_EXT_ONLY.

121 * These contain the base table name between the MBCS header and the extension

122 * data.

123 *

124 * Change from version 4.0:

125 * - Replace header.reserved with header.fromUBytesLength so that all

126 * fields in the data have length.

127 *

128 * Changes from version 3 (for performance improvements):

129 * - new bit distribution for state table entries

130 * - reordered action codes

131 * - new data structure for single-byte fromUnicode

132 * + stage 2 only contains indexes

133 * + stage 3 stores 16 bits per character with classification bits 15..8

134 * - no multiplier for stage 1 entries

135 * - stage 2 for non-single-byte codepages contains the index and the flags in

136 * one 32-bit value

137 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers

138 *

139 * For more details about old versions of the MBCS data structure, see

140 * the corresponding versions of this file.

141 *

142 * Converting stateless codepage data ---------------------------------------***

143 * (or codepage data with simple states) to Unicode.

144 *

145 * Data structure and algorithm for converting from complex legacy codepages

146 * to Unicode. (Designed before 2000-may-22.)

147 *

148 * The basic idea is that the structure of legacy codepages can be described

149 * with state tables.

150 * When reading a byte stream, each input byte causes a state transition.

151 * Some transitions result in the output of a code point, some result in

152 * "unassigned" or "illegal" output.

153 * This is used here for character conversion.

154 *

155 * The data structure begins with a state table consisting of a row

156 * per state, with 256 entries (columns) per row for each possible input

157 * byte value.

158 * Each entry is 32 bits wide, with two formats distinguished by

159 * the sign bit (bit 31):

160 *

161 * One format for transitional entries (bit 31 not set) for non-final bytes, and

162 * one format for final entries (bit 31 set).

163 * Both formats contain the number of the next state in the same bit

164 * positions.

165 * State 0 is the initial state.

166 *

167 * Most of the time, the offset values of subsequent states are added

168 * up to a scalar value. This value will eventually be the index of

169 * the Unicode code point in a table that follows the state table.

170 * The effect is that the code points for final state table rows

171 * are contiguous. The code points of final state rows follow each other

172 * in the order of the references to those final states by previous

173 * states, etc.

174 *

175 * For some terminal states, the offset is itself the output Unicode

176 * code point (16 bits for a BMP code point or 20 bits for a supplementary

177 * code point (stored as code point minus 0x10000 so that 20 bits are enough).

178 * For others, the code point in the Unicode table is stored with either

179 * one or two code units: one for BMP code points, two for a pair of

180 * surrogates.

181 * All code points for a final state entry take up the same number of code

182 * units, regardless of whether they all actually _use_ the same number

183 * of code units. This is necessary for simple array access.

184 *

185 * An additional feature comes in with what in ICU is called "fallback"

186 * mappings:

187 *

188 * In addition to round-trippable, precise, 1:1 mappings, there are often

189 * mappings defined between similar, though not the same, characters.

190 * Typically, such mappings occur only in fromUnicode mapping tables because

191 * Unicode has a superset repertoire of most other codepages. However, it

192 * is possible to provide such mappings in the toUnicode tables, too.

193 * In this case, the fallback mappings are partly integrated into the

194 * general state tables because the structure of the encoding includes their

195 * byte sequences.

196 * For final entries in an initial state, fallback mappings are stored in

197 * the entry itself like with roundtrip mappings.

198 * For other final entries, they are stored in the code units table if

199 * the entry is for a pair of code units.

200 * For single-unit results in the code units table, there is no space to

201 * alternatively hold a fallback mapping; in this case, the code unit

202 * is stored as U+fffe (unassigned), and the fallback mapping needs to

203 * be looked up by the scalar offset value in a separate table.

204 *

205 * "Unassigned" state entries really mean "structurally unassigned",

206 * i.e., such a byte sequence will never have a mapping result.

207 *

208 * The interpretation of the bits in each entry is as follows:

209 *

210 * Bit 31 not set, not a terminal entry ("transitional"):

211 * 30..24 next state

212 * 23..0 offset delta, to be added up

213 *

214 * Bit 31 set, terminal ("final") entry:

215 * 30..24 next state (regardless of action code)

216 * 23..20 action code:

217 * action codes 0 and 1 result in precise-mapping Unicode code points

218 * 0 valid byte sequence

219 * 19..16 not used, 0

220 * 15..0 16-bit Unicode BMP code point

221 * never U+fffe or U+ffff

222 * 1 valid byte sequence

223 * 19..0 20-bit Unicode supplementary code point

224 * never U+fffe or U+ffff

225 *

226 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points

227 * 2 valid byte sequence (fallback)

228 * 19..16 not used, 0

229 * 15..0 16-bit Unicode BMP code point as fallback result

230 * 3 valid byte sequence (fallback)

231 * 19..0 20-bit Unicode supplementary code point as fallback result

232 *

233 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results

234 * depending on the code units they result in

235 * 4 valid byte sequence

236 * 19..9 not used, 0

237 * 8..0 final offset delta

238 * pointing to one 16-bit code unit which may be

239 * fffe unassigned -- look for a fallback for this offset

240 * ffff illegal

241 * 5 valid byte sequence

242 * 19..9 not used, 0

243 * 8..0 final offset delta

244 * pointing to two 16-bit code units

245 * (typically UTF-16 surrogates)

246 * the result depends on the first code unit as follows:

247 * 0000..d7ff roundtrip BMP code point (1st alone)

248 * d800..dbff roundtrip surrogate pair (1st, 2nd)

249 * dc00..dfff fallback surrogate pair (1st-400, 2nd)

250 * e000 roundtrip BMP code point (2nd alone)

251 * e001 fallback BMP code point (2nd alone)

252 * fffe unassigned

253 * ffff illegal

254 * (the final offset deltas are at most 255 * 2,

255 * times 2 because of storing code unit pairs)

256 *

257 * 6 unassigned byte sequence

258 * 19..16 not used, 0

259 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2)

260 * this does not contain a final offset delta because the main

261 * purpose of this action code is to save scalar offset values;

262 * therefore, fallback values cannot be assigned to byte

263 * sequences that result in this action code

264 * 7 illegal byte sequence

265 * 19..16 not used, 0

266 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2)

267 * 8 state change only

268 * 19..0 not used, 0

269 * useful for state changes in simple stateful encodings,

270 * at Shift-In/Shift-Out codes

271 *

272 *

273 * 9..15 reserved for future use

274 * current implementations will only perform a state change

275 * and ignore bits 19..0

276 *

277 * An encoding with contiguous ranges of unassigned byte sequences, like

278 * Shift-JIS and especially EUC-TW, can be stored efficiently by having

279 * at least two states for the trail bytes:

280 * One trail byte state that results in code points, and one that only

281 * has "unassigned" and "illegal" terminal states.

282 *

283 * Note: partly by accident, this data structure supports simple stateful

284 * encodings without any additional logic.

285 * Currently, only simple Shift-In/Shift-Out schemes are handled with

286 * appropriate state tables (especially EBCDIC_STATEFUL!).

287 *

288 * MBCS version 2 added:

289 * unassigned and illegal action codes have U+fffe and U+ffff

290 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()

291 *

292 * Converting from Unicode to codepage bytes --------------------------------***

293 *

294 * The conversion data structure for fromUnicode is designed for the known

295 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to

296 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is

297 * a roundtrip mapping.

298 *

299 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3

300 * like in the character properties table.

301 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3

302 * with the resulting bytes is at offsetFromUBytes.

303 *

304 * Beginning with version 4, single-byte codepages have a significantly different

305 * trie compared to other codepages.

306 * In all cases, the entry in stage 1 is directly the index of the block of

307 * 64 entries in stage 2.

308 *

309 * Single-byte lookup:

310 *

311 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.

312 * Stage 3 contains one 16-bit word per result:

313 * Bits 15..8 indicate the kind of result:

314 * f roundtrip result

315 * c fallback result from private-use code point

316 * 8 fallback result from other code points

317 * 0 unassigned

318 * Bits 7..0 contain the codepage byte. A zero byte is always possible.

319 *

320 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly

321 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup

322 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.

323 * ASCII code points can be looked up with a linear array access into stage 3.

324 * See maxFastUChar and other details in ucnvmbcs.h.

325 *

326 * Multi-byte lookup:

327 *

328 * Stage 2 contains a 32-bit word for each 16-block in stage 3:

329 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results

330 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)

331 * If this test is false, then a non-zero result will be interpreted as

332 * a fallback mapping.

333 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char)

334 *

335 * Stage 3 contains 2, 3, or 4 bytes per result.

336 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,

337 * while 3 bytes are stored as bytes in big-endian order.

338 * Leading zero bytes are ignored, and the number of bytes is counted.

339 * A zero byte mapping result is possible as a roundtrip result.

340 * For some output types, the actual result is processed from this;

341 * see ucnv_MBCSFromUnicodeWithOffsets().

342 *

343 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),

344 * or (version 3 and up) for BMP-only codepages, it contains 64 entries.

345 *

346 * In version 4.3, a utf8Friendly file contains an mbcsIndex table.

347 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup

348 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.

349 * ASCII code points can be looked up with a linear array access into stage 3.

350 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.

351 *

352 * In version 3, stage 2 blocks may overlap by multiples of the multiplier

353 * for compaction.

354 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)

355 * may overlap by any number of entries.

356 *

357 * MBCS version 2 added:

358 * the converter checks for known output types, which allows

359 * adding new ones without crashing an unaware converter

360 */

361

362 static const UConverterImpl _SBCSUTF8Impl;

363 static const UConverterImpl _DBCSUTF8Impl;

364

365 /* GB 18030 data ------------------------------------------------------------ */

366

/*
 * Helper macros for linear values for GB 18030 four-byte sequences.
 *
 * LINEAR_18030(a, b, c, d) combines the four bytes with the radix sequence
 * 10 / 126 / 10 for b / c / d, so that consecutive four-byte sequences
 * yield consecutive integers.
 */
#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

/* linear value of the four-byte sequence 81 30 81 30 */
#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

/*
 * Linear value of a four-byte sequence packed big-endian into one 32-bit value.
 * The argument is fully parenthesized so that an expression may be passed.
 */
#define LINEAR(x) LINEAR_18030((x)>>24, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff)

/*
 * Some ranges of GB 18030 where both the Unicode code points and the
 * GB four-byte sequences are contiguous and are handled algorithmically by
 * the special callback functions below.
 * The values are start & end of Unicode & GB codes:
 * { first code point, last code point, linear first GB code, linear last GB code }
 *
 * Note that single surrogates are not mapped by GB 18030
 * as of the re-released mapping tables from 2000-nov-30.
 */
static const uint32_t
gb18030Ranges[14][4]={
    {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
    {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
    {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)},
    {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)},
    {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
    {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
    {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
    {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
    {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
    {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
    {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
    {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
    {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
    {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
};

400

/* bit flag for UConverter.options indicating GB 18030 special handling */
#define _MBCS_OPTION_GB18030 0x8000

/* bit flags for UConverter.options indicating KEIS, JEF, JIPS special handling */
#define _MBCS_OPTION_KEIS 0x01000
#define _MBCS_OPTION_JEF  0x02000
#define _MBCS_OPTION_JIPS 0x04000

/* KEIS uses two-byte Shift-Out / Shift-In sequences */
#define KEIS_SO_CHAR_1 0x0A
#define KEIS_SO_CHAR_2 0x42
#define KEIS_SI_CHAR_1 0x0A
#define KEIS_SI_CHAR_2 0x41

/* JEF uses single-byte Shift-Out / Shift-In codes */
#define JEF_SO_CHAR 0x28
#define JEF_SI_CHAR 0x29

/* JIPS uses two-byte Shift-Out / Shift-In sequences */
#define JIPS_SO_CHAR_1 0x1A
#define JIPS_SO_CHAR_2 0x70
#define JIPS_SI_CHAR_1 0x1A
#define JIPS_SI_CHAR_2 0x71

421

/* Selects which shift sequence getSISOBytes() should produce. */
enum SISO_Option {
    SI, /* Shift-In */
    SO  /* Shift-Out */
};
typedef enum SISO_Option SISO_Option;

427

428 static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *val ue) {

429 int32_t SISOLength = 0;

430

431 switch (option) {

432 case SI:

433 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {

434 value[0] = KEIS_SI_CHAR_1;

435 value[1] = KEIS_SI_CHAR_2;

436 SISOLength = 2;

437 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {

438 value[0] = JEF_SI_CHAR;

439 SISOLength = 1;

440 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {

441 value[0] = JIPS_SI_CHAR_1;

442 value[1] = JIPS_SI_CHAR_2;

443 SISOLength = 2;

444 } else {

445 value[0] = UCNV_SI;

446 SISOLength = 1;

447 }

448 break;

449 case SO:

450 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {

451 value[0] = KEIS_SO_CHAR_1;

452 value[1] = KEIS_SO_CHAR_2;

453 SISOLength = 2;

454 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {

455 value[0] = JEF_SO_CHAR;

456 SISOLength = 1;

457 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {

458 value[0] = JIPS_SO_CHAR_1;

459 value[1] = JIPS_SO_CHAR_2;

460 SISOLength = 2;

461 } else {

462 value[0] = UCNV_SO;

463 SISOLength = 1;

464 }

465 break;

466 default:

467 /* Should never happen. */

468 break;

469 }

470

471 return SISOLength;

472 }

473

474 /* Miscellaneous ------------------------------------------------------------ */

475

476 /**

477 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from

478 * consecutive sequences of bytes, starting from the one encoded in value,

479 * to Unicode code points. (Multiple mappings to reduce per-function call overhe ad.)

480 * Does not currently support m:n mappings or reverse fallbacks.

481 * This function will not be called for sequences of bytes with leading zeros.

482 *

483 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()

484 * @param value contains 1..4 bytes of the first byte sequence, right-aligned

485 * @param codePoints resulting Unicode code points, or negative if a byte sequen ce does

486 * not map to anything

487 * @return TRUE to continue enumeration, FALSE to stop

488 */

489 typedef UBool U_CALLCONV

490 UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoint s[32]);

491

492 /* similar to ucnv_MBCSGetNextUChar() but recursive */

493 static UBool

494 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],

495 int32_t state, uint32_t offset,

496 uint32_t value,

497 UConverterEnumToUCallback callback, const void context,

498 UErrorCode *pErrorCode) {

499 UChar32 codePoints[32];

500 const int32_t *row;

501 const uint16_t *unicodeCodeUnits;

502 UChar32 anyCodePoints;

503 int32_t b, limit;

504

505 row=mbcsTable->stateTable[state];

506 unicodeCodeUnits=mbcsTable->unicodeCodeUnits;

507

508 value<<=8;

509 anyCodePoints=-1; /* becomes non-negative if there is a mapping */

510

511 b=(stateProps[state]&0x38)<<2;

512 if(b==0 && stateProps[state]>=0x40) {

513 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */

514 codePoints[0]=U_SENTINEL;

515 b=1;

516 }

517 limit=((stateProps[state]&7)+1)<<5;

518 while(b<limit) {

519 int32_t entry=row[b];

520 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

521 int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);

522 if(stateProps[nextState]>=0) {

523 /* recurse to a state with non-ignorable actions */

524 if(!enumToU(

525 mbcsTable, stateProps, nextState,

526 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),

527 value\|(uint32_t)b,

528 callback, context,

529 pErrorCode)) {

530 return FALSE;

531 }

532 }

533 codePoints[b&0x1f]=U_SENTINEL;

534 } else {

535 UChar32 c;

536 int32_t action;

537

538 /*

539 * An if-else-if chain provides more reliable performance for

540 * the most common cases compared to a switch.

541 */

542 action=MBCS_ENTRY_FINAL_ACTION(entry);

543 if(action==MBCS_STATE_VALID_DIRECT_16) {

544 /* output BMP code point */

545 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

546 } else if(action==MBCS_STATE_VALID_16) {

547 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);

548 c=unicodeCodeUnits[finalOffset];

549 if(c<0xfffe) {

550 /* output BMP code point */

551 } else {

552 c=U_SENTINEL;

553 }

554 } else if(action==MBCS_STATE_VALID_16_PAIR) {

555 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);

556 c=unicodeCodeUnits[finalOffset++];

557 if(c<0xd800) {

558 /* output BMP code point below 0xd800 */

559 } else if(c<=0xdbff) {

560 /* output roundtrip or fallback supplementary code point */

561 c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xd c00);

562 } else if(c==0xe000) {

563 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */

564 c=unicodeCodeUnits[finalOffset];

565 } else {

566 c=U_SENTINEL;

567 }

568 } else if(action==MBCS_STATE_VALID_DIRECT_20) {

569 /* output supplementary code point */

570 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);

571 } else {

572 c=U_SENTINEL;

573 }

574

575 codePoints[b&0x1f]=c;

576 anyCodePoints&=c;

577 }

578 if(((++b)&0x1f)==0) {

579 if(anyCodePoints>=0) {

580 if(!callback(context, value\|(uint32_t)(b-0x20), codePoints)) {

581 return FALSE;

582 }

583 anyCodePoints=-1;

584 }

585 }

586 }

587 return TRUE;

588 }

589

590 /*

591 * Only called if stateProps[state]==-1.

592 * A recursive call may do stateProps[state]\|=0x40 if this state is the target o f an

593 * MBCS_STATE_CHANGE_ONLY.

594 */

595 static int8_t

596 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {

597 const int32_t *row;

598 int32_t min, max, entry, nextState;

599

600 row=stateTable[state];

601 stateProps[state]=0;

602

603 /* find first non-ignorable state */

604 for(min=0;; ++min) {

605 entry=row[min];

606 nextState=MBCS_ENTRY_STATE(entry);

607 if(stateProps[nextState]==-1) {

608 getStateProp(stateTable, stateProps, nextState);

609 }

610 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

611 if(stateProps[nextState]>=0) {

612 break;

613 }

614 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {

615 break;

616 }

617 if(min==0xff) {

618 stateProps[state]=-0x40; /* (int8_t)0xc0 */

619 return stateProps[state];

620 }

621 }

622 stateProps[state]\|=(int8_t)((min>>5)<<3);

623

624 /* find last non-ignorable state */

625 for(max=0xff; min<max; --max) {

626 entry=row[max];

627 nextState=MBCS_ENTRY_STATE(entry);

628 if(stateProps[nextState]==-1) {

629 getStateProp(stateTable, stateProps, nextState);

630 }

631 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

632 if(stateProps[nextState]>=0) {

633 break;

634 }

635 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {

636 break;

637 }

638 }

639 stateProps[state]\|=(int8_t)(max>>5);

640

641 /* recurse further and collect direct-state information */

642 while(min<=max) {

643 entry=row[min];

644 nextState=MBCS_ENTRY_STATE(entry);

645 if(stateProps[nextState]==-1) {

646 getStateProp(stateTable, stateProps, nextState);

647 }

648 if(MBCS_ENTRY_IS_FINAL(entry)) {

649 stateProps[nextState]\|=0x40;

650 if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {

651 stateProps[state]\|=0x40;

652 }

653 }

654 ++min;

655 }

656 return stateProps[state];

657 }

658

659 /*

660 * Internal function enumerating the toUnicode data of an MBCS converter.

661 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U

662 * table, but could also be used for a future ucnv_getUnicodeSet() option

663 * that includes reverse fallbacks (after updating this function's implementatio n).

664 * Currently only handles roundtrip mappings.

665 * Does not currently handle extensions.

666 */

667 static void

668 ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,

669 UConverterEnumToUCallback callback, const void context,

670 UErrorCode *pErrorCode) {

671 /*

672 * Properties for each state, to speed up the enumeration.

673 * Ignorable actions are unassigned/illegal/state-change-only:

674 * They do not lead to mappings.

675 *

676 * Bits 7..6:

677 * 1 direct/initial state (stateful converters have multiple)

678 * 0 non-initial state with transitions or with non-ignorable result actions

679 * -1 final state with only ignorable actions

680 *

681 * Bits 5..3:

682 * The lowest byte value with non-ignorable actions is

683 * value<<5 (rounded down).

684 *

685 * Bits 2..0:

686 * The highest byte value with non-ignorable actions is

687 * (value<<5)&0x1f (rounded up).

688 */

689 int8_t stateProps[MBCS_MAX_STATE_COUNT];

690 int32_t state;

691

692 uprv_memset(stateProps, -1, sizeof(stateProps));

693

694 /* recurse from state 0 and set all stateProps */

695 getStateProp(mbcsTable->stateTable, stateProps, 0);

696

697 for(state=0; state<mbcsTable->countStates; ++state) {

698 /*if(stateProps[state]==-1) {

699 printf("unused/unreachable <icu:state> %d\n", state);

700 }*/

701 if(stateProps[state]>=0x40) {

702 /* start from each direct state */

703 enumToU(

704 mbcsTable, stateProps, state, 0, 0,

705 callback, context,

706 pErrorCode);

707 }

708 }

709 }

710

711 U_CFUNC void

712 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,

713 const USetAdder *sa,

714 UConverterUnicodeSet which,

715 UConverterSetFilter filter,

716 UErrorCode *pErrorCode) {

717 const UConverterMBCSTable *mbcsTable;

718 const uint16_t *table;

719

720 uint32_t st3;

721 uint16_t st1, maxStage1, st2;

722

723 UChar32 c;

724

725 /* enumerate the from-Unicode trie table */

726 mbcsTable=&sharedData->mbcs;

727 table=mbcsTable->fromUnicodeTable;

728 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {

729 maxStage1=0x440;

730 } else {

731 maxStage1=0x40;

732 }

733

734 c=0; /* keep track of the current code point while enumerating */

735

736 if(mbcsTable->outputType==MBCS_OUTPUT_1) {

737 const uint16_t stage2, stage3, *results;

738 uint16_t minValue;

739

740 results=(const uint16_t *)mbcsTable->fromUnicodeBytes;

741

742 /*

743 * Set a threshold variable for selecting which mappings to use.

744 * See ucnv_MBCSSingleFromBMPWithOffsets() and

745 * MBCS_SINGLE_RESULT_FROM_U() for details.

746 */

747 if(which==UCNV_ROUNDTRIP_SET) {

748 /* use only roundtrips */

749 minValue=0xf00;

750 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {

751 /* use all roundtrip and fallback results */

752 minValue=0x800;

753 }

754

755 for(st1=0; st1<maxStage1; ++st1) {

756 st2=table[st1];

757 if(st2>maxStage1) {

758 stage2=table+st2;

759 for(st2=0; st2<64; ++st2) {

760 if((st3=stage2[st2])!=0) {

761 /* read the stage 3 block */

762 stage3=results+st3;

763

764 do {

765 if(*stage3++>=minValue) {

766 sa->add(sa->set, c);

767 }

768 } while((++c&0xf)!=0);

769 } else {

770 c+=16; /* empty stage 3 block */

771 }

772 }

773 } else {

774 c+=1024; /* empty stage 2 block */

775 }

776 }

777 } else {

778 const uint32_t *stage2;

779 const uint8_t stage3, bytes;

780 uint32_t st3Multiplier;

781 uint32_t value;

782 UBool useFallback;

783

784 bytes=mbcsTable->fromUnicodeBytes;

785

786 useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);

787

788 switch(mbcsTable->outputType) {

789 case MBCS_OUTPUT_3:

790 case MBCS_OUTPUT_4_EUC:

791 st3Multiplier=3;

792 break;

793 case MBCS_OUTPUT_4:

794 st3Multiplier=4;

795 break;

796 default:

797 st3Multiplier=2;

798 break;

799 }

800

801 for(st1=0; st1<maxStage1; ++st1) {

802 st2=table[st1];

803 if(st2>(maxStage1>>1)) {

804 stage2=(const uint32_t *)table+st2;

805 for(st2=0; st2<64; ++st2) {

806 if((st3=stage2[st2])!=0) {

807 /* read the stage 3 block */

808 stage3=bytes+st3Multiplier16(uint32_t)(uint16_t)st3;

809

810 /* get the roundtrip flags for the stage 3 block */

811 st3>>=16;

812

813 /*

814 * Add code points for which the roundtrip flag is set,

815 * or which map to non-zero bytes if we use fallbacks.

816 * See ucnv_MBCSFromUnicodeWithOffsets() for details.

817 */

818 switch(filter) {

819 case UCNV_SET_FILTER_NONE:

820 do {

821 if(st3&1) {

822 sa->add(sa->set, c);

823 stage3+=st3Multiplier;

824 } else if(useFallback) {

825 uint8_t b=0;

826 switch(st3Multiplier) {

827 case 4:

828 b\|=*stage3++;

829 case 3: /fall through/

830 b\|=*stage3++;

831 case 2: /fall through/

832 b\|=stage3[0]\|stage3[1];

833 stage3+=2;

834 default:

835 break;

836 }

837 if(b!=0) {

838 sa->add(sa->set, c);

839 }

840 }

841 st3>>=1;

842 } while((++c&0xf)!=0);

843 break;

844 case UCNV_SET_FILTER_DBCS_ONLY:

845 /* Ignore single-byte results (<0x100). */

846 do {

847 if(((st3&1)!=0 \|\| useFallback) && ((const uint1 6_t )stage3)>=0x100) {

848 sa->add(sa->set, c);

849 }

850 st3>>=1;

851 stage3+=2; /* +=st3Multiplier */

852 } while((++c&0xf)!=0);

853 break;

854 case UCNV_SET_FILTER_2022_CN:

855 /* Only add code points that map to CNS 11643 plane s 1 & 2 for non-EXT ISO-2022-CN. */

856 do {

857 if(((st3&1)!=0 \|\| useFallback) && ((value=*stage 3)==0x81 \|\| value==0x82)) {

858 sa->add(sa->set, c);

859 }

860 st3>>=1;

861 stage3+=3; /* +=st3Multiplier */

862 } while((++c&0xf)!=0);

863 break;

864 case UCNV_SET_FILTER_SJIS:

865 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */

866 do {

867 if(((st3&1)!=0 \|\| useFallback) && (value=((cons t uint16_t )stage3))>=0x8140 && value<=0xeffc) {

868 sa->add(sa->set, c);

869 }

870 st3>>=1;

871 stage3+=2; /* +=st3Multiplier */

872 } while((++c&0xf)!=0);

873 break;

874 case UCNV_SET_FILTER_GR94DBCS:

875 /* Only add code points that map to ISO 2022 GR 94 D BCS codes (each byte A1..FE). */

876 do {

877 if( ((st3&1)!=0 \|\| useFallback) &&

878 (uint16_t)((value=((const uint16_t )stage3 )) - 0xa1a1)<=(0xfefe - 0xa1a1) &&

879 (uint8_t)(value-0xa1)<=(0xfe - 0xa1)

880 ) {

881 sa->add(sa->set, c);

882 }

883 st3>>=1;

884 stage3+=2; /* +=st3Multiplier */

885 } while((++c&0xf)!=0);

886 break;

887 case UCNV_SET_FILTER_HZ:

888 /* Only add code points that are suitable for HZ DBC S (lead byte A1..FD). */

889 do {

890 if( ((st3&1)!=0 \|\| useFallback) &&

891 (uint16_t)((value=((const uint16_t )stage3 ))-0xa1a1)<=(0xfdfe - 0xa1a1) &&

892 (uint8_t)(value-0xa1)<=(0xfe - 0xa1)

893 ) {

894 sa->add(sa->set, c);

895 }

896 st3>>=1;

897 stage3+=2; /* +=st3Multiplier */

898 } while((++c&0xf)!=0);

899 break;

900 default:

901 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;

902 return;

903 }

904 } else {

905 c+=16; /* empty stage 3 block */

906 }

907 }

908 } else {

909 c+=1024; /* empty stage 2 block */

910 }

911 }

912 }

913

914 ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);

915 }

916

917 U_CFUNC void

918 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,

919 const USetAdder *sa,

920 UConverterUnicodeSet which,

921 UErrorCode *pErrorCode) {

922 ucnv_MBCSGetFilteredUnicodeSetForUnicode(

923 sharedData, sa, which,

924 sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?

925 UCNV_SET_FILTER_DBCS_ONLY :

926 UCNV_SET_FILTER_NONE,

927 pErrorCode);

928 }

929

930 static void

931 ucnv_MBCSGetUnicodeSet(const UConverter *cnv,

932 const USetAdder *sa,

933 UConverterUnicodeSet which,

934 UErrorCode *pErrorCode) {

935 if(cnv->options&_MBCS_OPTION_GB18030) {

936 sa->addRange(sa->set, 0, 0xd7ff);

937 sa->addRange(sa->set, 0xe000, 0x10ffff);

938 } else {

939 ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode) ;

940 }

941 }

942

943 /* conversion extensions for input not in the main table -------------------- */

944

/*
 * Hardcoded extension handling for GB 18030.
 * For the definitions of the LINEAR macros and gb18030Ranges, see near the
 * beginning of the file.
 *
 * In the future, conversion extensions may handle m:n mappings and delta tables,
 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html
 *
 * If an input character cannot be mapped, then these functions set an error
 * code. The framework will then call the callback function.
 */

955

956 /*

957 * @return if(U_FAILURE) return the code point for cnv->fromUChar32

958 * else return 0 after output has been written to the target

959 */

960 static UChar32

961 _extFromU(UConverter cnv, const UConverterSharedData sharedData,

962 UChar32 cp,

963 const UChar *source, const UChar sourceLimit,

964 uint8_t *target, const uint8_t targetLimit,

965 int32_t **offsets, int32_t sourceIndex,

966 UBool flush,

967 UErrorCode *pErrorCode) {

968 const int32_t *cx;

969

970 cnv->useSubChar1=FALSE;

971

972 if( (cx=sharedData->mbcs.extIndexes)!=NULL &&

973 ucnv_extInitialMatchFromU(

974 cnv, cx,

975 cp, source, sourceLimit,

976 (char *)target, (char )targetLimit,

977 offsets, sourceIndex,

978 flush,

979 pErrorCode)

980 ) {

981 return 0; /* an extension mapping handled the input */

982 }

983

984 /* GB 18030 */

985 if((cnv->options&_MBCS_OPTION_GB18030)!=0) {

986 const uint32_t *range;

987 int32_t i;

988

989 range=gb18030Ranges[0];

990 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i ) {

991 if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {

992 /* found the Unicode code point, output the four-byte sequence f or it */

993 uint32_t linear;

994 char bytes[4];

995

996 /* get the linear value of the first GB 18030 code in this range */

997 linear=range[2]-LINEAR_18030_BASE;

998

999 /* add the offset from the beginning of the range */

1000 linear+=((uint32_t)cp-range[0]);

1001

1002 /* turn this into a four-byte sequence */

1003 bytes[3]=(char)(0x30+linear%10); linear/=10;

1004 bytes[2]=(char)(0x81+linear%126); linear/=126;

1005 bytes[1]=(char)(0x30+linear%10); linear/=10;

1006 bytes[0]=(char)(0x81+linear);

1007

1008 /* output this sequence */

1009 ucnv_fromUWriteBytes(cnv,

1010 bytes, 4, (char *)target, (char )targetLi mit,

1011 offsets, sourceIndex, pErrorCode);

1012 return 0;

1013 }

1014 }

1015 }

1016

1017 /* no mapping */

1018 *pErrorCode=U_INVALID_CHAR_FOUND;

1019 return cp;

1020 }

1021

1022 /*

1023 * Input sequence: cnv->toUBytes[0..length[

1024 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input

1025 * else return 0 after output has been written to the target

1026 */

1027 static int8_t

1028 _extToU(UConverter cnv, const UConverterSharedData sharedData,

1029 int8_t length,

1030 const uint8_t *source, const uint8_t sourceLimit,

1031 UChar *target, const UChar targetLimit,

1032 int32_t **offsets, int32_t sourceIndex,

1033 UBool flush,

1034 UErrorCode *pErrorCode) {

1035 const int32_t *cx;

1036

1037 if( (cx=sharedData->mbcs.extIndexes)!=NULL &&

1038 ucnv_extInitialMatchToU(

1039 cnv, cx,

1040 length, (const char *)source, (const char )sourceLimit,

1041 target, targetLimit,

1042 offsets, sourceIndex,

1043 flush,

1044 pErrorCode)

1045 ) {

1046 return 0; /* an extension mapping handled the input */

1047 }

1048

1049 /* GB 18030 */

1050 if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {

1051 const uint32_t *range;

1052 uint32_t linear;

1053 int32_t i;

1054

1055 linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2] , cnv->toUBytes[3]);

1056 range=gb18030Ranges[0];

1057 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i ) {

1058 if(range[2]<=linear && linear<=range[3]) {

1059 /* found the sequence, output the Unicode code point for it */

1060 *pErrorCode=U_ZERO_ERROR;

1061

1062 /* add the linear difference between the input and start sequenc es to the start code point */

1063 linear=range[0]+(linear-range[2]);

1064

1065 /* output this code point */

1066 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets , sourceIndex, pErrorCode);

1067

1068 return 0;

1069 }

1070 }

1071 }

1072

1073 /* no mapping */

1074 *pErrorCode=U_INVALID_CHAR_FOUND;

1075 return length;

1076 }

1077

1078 /* EBCDIC swap LF<->NL ------------------------------------------------------ */

1079

/*
 * This code modifies a standard EBCDIC<->Unicode mapping table for
 * OS/390 (z/OS) Unix System Services (Open Edition).
 * The difference is in the mapping of the Line Feed and New Line control codes.
 * Standard EBCDIC maps
 *
 *   <U000A> \x25 |0
 *   <U0085> \x15 |0
 *
 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,
 * mapping
 *
 *   <U000A> \x15 |0
 *   <U0085> \x25 |0
 *
 * This code modifies a loaded standard EBCDIC<->Unicode mapping table
 * by copying it into allocated memory and swapping the LF and NL values.
 * This makes it possible to support the same EBCDIC charset in both versions
 * without duplicating the entire installed table.
 */

1100

/* byte values of Line Feed and New Line in standard EBCDIC */
#define EBCDIC_LF 0x25
#define EBCDIC_NL 0x15

/*
 * the same two byte values as stored in Unicode-to-single-byte result tables,
 * i.e., with the roundtrip flag bits (0xf00) set
 */
#define EBCDIC_RT_LF 0xf25
#define EBCDIC_RT_NL 0xf15

/* the Unicode code points for Line Feed and New Line (Next Line) */
#define U_LF 0x0a
#define U_NL 0x85

1112

1113 static UBool

1114 _EBCDICSwapLFNL(UConverterSharedData sharedData, UErrorCode pErrorCode) {

1115 UConverterMBCSTable *mbcsTable;

1116

1117 const uint16_t table, results;

1118 const uint8_t *bytes;

1119

1120 int32_t (*newStateTable)[256];

1121 uint16_t *newResults;

1122 uint8_t *p;

1123 char *name;

1124

1125 uint32_t stage2Entry;

1126 uint32_t size, sizeofFromUBytes;

1127

1128 mbcsTable=&sharedData->mbcs;

1129

1130 table=mbcsTable->fromUnicodeTable;

1131 bytes=mbcsTable->fromUnicodeBytes;

1132 results=(const uint16_t *)bytes;

1133

1134 /*

1135 * Check that this is an EBCDIC table with SBCS portion -

1136 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.

1137 *

1138 * If not, ignore the option. Options are always ignored if they do not appl y.

1139 */

1140 if(!(

1141 (mbcsTable->outputType==MBCS_OUTPUT_1 \|\| mbcsTable->outputType==MBCS_OU TPUT_2_SISO) &&

1142 mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VAL ID_DIRECT_16, U_LF) &&

1143 mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VAL ID_DIRECT_16, U_NL)

1144 )) {

1145 return FALSE;

1146 }

1147

1148 if(mbcsTable->outputType==MBCS_OUTPUT_1) {

1149 if(!(

1150 EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&

1151 EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)

1152 )) {

1153 return FALSE;

1154 }

1155 } else /* MBCS_OUTPUT_2_SISO */ {

1156 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);

1157 if(!(

1158 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&

1159 EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)

1160 )) {

1161 return FALSE;

1162 }

1163

1164 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);

1165 if(!(

1166 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&

1167 EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)

1168 )) {

1169 return FALSE;

1170 }

1171 }

1172

1173 if(mbcsTable->fromUBytesLength>0) {

1174 /*

1175 * We _know_ the number of bytes in the fromUnicodeBytes array

1176 * starting with header.version 4.1.

1177 */

1178 sizeofFromUBytes=mbcsTable->fromUBytesLength;

1179 } else {

1180 /*

1181 * Otherwise:

1182 * There used to be code to enumerate the fromUnicode

1183 * trie and find the highest entry, but it was removed in ICU 3.2

1184 * because it was not tested and caused a low code coverage number.

1185 * See Jitterbug 3674.

1186 * This affects only some .cnv file formats with a header.version

1187 * below 4.1, and only when swaplfnl is requested.

1188 *

1189 * ucnvmbcs.c revision 1.99 is the last one with the

1190 * ucnv_MBCSSizeofFromUBytes() function.

1191 */

1192 *pErrorCode=U_INVALID_FORMAT_ERROR;

1193 return FALSE;

1194 }

1195

1196 /*

1197 * The table has an appropriate format.

1198 * Allocate and build

1199 * - a modified to-Unicode state table

1200 * - a modified from-Unicode output array

1201 * - a converter name string with the swap option appended

1202 */

1203 size=

1204 mbcsTable->countStates*1024+

1205 sizeofFromUBytes+

1206 UCNV_MAX_CONVERTER_NAME_LENGTH+20;

1207 p=(uint8_t *)uprv_malloc(size);

1208 if(p==NULL) {

1209 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;

1210 return FALSE;

1211 }

1212

1213 /* copy and modify the to-Unicode state table */

1214 newStateTable=(int32_t (*)[256])p;

1215 uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*102 4);

1216

1217 newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);

1218 newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);

1219

1220 /* copy and modify the from-Unicode result table */

1221 newResults=(uint16_t *)newStateTable[mbcsTable->countStates];

1222 uprv_memcpy(newResults, bytes, sizeofFromUBytes);

1223

1224 /* conveniently, the table access macros work on the left side of expression s */

1225 if(mbcsTable->outputType==MBCS_OUTPUT_1) {

1226 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;

1227 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;

1228 } else /* MBCS_OUTPUT_2_SISO */ {

1229 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);

1230 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;

1231

1232 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);

1233 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;

1234 }

1235

1236 /* set the canonical converter name */

1237 name=(char *)newResults+sizeofFromUBytes;

1238 uprv_strcpy(name, sharedData->staticData->name);

1239 uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);

1240

1241 /* set the pointers */

1242 umtx_lock(NULL);

1243 if(mbcsTable->swapLFNLStateTable==NULL) {

1244 mbcsTable->swapLFNLStateTable=newStateTable;

1245 mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;

1246 mbcsTable->swapLFNLName=name;

1247

1248 newStateTable=NULL;

1249 }

1250 umtx_unlock(NULL);

1251

1252 /* release the allocated memory if another thread beat us to it */

1253 if(newStateTable!=NULL) {

1254 uprv_free(newStateTable);

1255 }

1256 return TRUE;

1257 }

1258

1259 /* reconstitute omitted fromUnicode data ------------------------------------ */

1260

1261 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() * /

1262 static UBool U_CALLCONV

1263 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32] ) {

1264 UConverterMBCSTable mbcsTable=(UConverterMBCSTable )context;

1265 const uint16_t *table;

1266 uint32_t *stage2;

1267 uint8_t bytes, p;

1268 UChar32 c;

1269 int32_t i, st3;

1270

1271 table=mbcsTable->fromUnicodeTable;

1272 bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;

1273

1274 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */

1275 switch(mbcsTable->outputType) {

1276 case MBCS_OUTPUT_3_EUC:

1277 if(value<=0xffff) {

1278 /* short sequences are stored directly */

1279 /* code set 0 or 1 */

1280 } else if(value<=0x8effff) {

1281 /* code set 2 */

1282 value&=0x7fff;

1283 } else /* first byte is 0x8f */ {

1284 /* code set 3 */

1285 value&=0xff7f;

1286 }

1287 break;

1288 case MBCS_OUTPUT_4_EUC:

1289 if(value<=0xffffff) {

1290 /* short sequences are stored directly */

1291 /* code set 0 or 1 */

1292 } else if(value<=0x8effffff) {

1293 /* code set 2 */

1294 value&=0x7fffff;

1295 } else /* first byte is 0x8f */ {

1296 /* code set 3 */

1297 value&=0xff7fff;

1298 }

1299 break;

1300 default:

1301 break;

1302 }

1303

1304 for(i=0; i<=0x1f; ++value, ++i) {

1305 c=codePoints[i];

1306 if(c<0) {

1307 continue;

1308 }

1309

1310 /* locate the stage 2 & 3 data */

1311 stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);

1312 p=bytes;

1313 st3=(int32_t)(uint16_t)stage216+(c&0xf);

1314

1315 /* write the codepage bytes into stage 3 */

1316 switch(mbcsTable->outputType) {

1317 case MBCS_OUTPUT_3:

1318 case MBCS_OUTPUT_4_EUC:

1319 p+=st3*3;

1320 p[0]=(uint8_t)(value>>16);

1321 p[1]=(uint8_t)(value>>8);

1322 p[2]=(uint8_t)value;

1323 break;

1324 case MBCS_OUTPUT_4:

1325 ((uint32_t *)p)[st3]=value;

1326 break;

1327 default:

1328 /* 2 bytes per character */

1329 ((uint16_t *)p)[st3]=(uint16_t)value;

1330 break;

1331 }

1332

1333 /* set the roundtrip flag */

1334 *stage2\|=(1UL<<(16+(c&0xf)));

1335 }

1336 return TRUE;

1337 }

1338

1339 static void

1340 reconstituteData(UConverterMBCSTable *mbcsTable,

1341 uint32_t stage1Length, uint32_t stage2Length,

1342 uint32_t fullStage2Length, /* lengths are numbers of units, no t bytes */

1343 UErrorCode *pErrorCode) {

1344 uint16_t *stage1;

1345 uint32_t *stage2;

1346 uint32_t dataLength=stage1Length2+fullStage2Length4+mbcsTable->fromUBytesL ength;

1347 mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);

1348 if(mbcsTable->reconstitutedData==NULL) {

1349 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;

1350 return;

1351 }

1352 uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);

1353

1354 /* copy existing data and reroute the pointers */

1355 stage1=(uint16_t *)mbcsTable->reconstitutedData;

1356 uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);

1357

1358 stage2=(uint32_t *)(stage1+stage1Length);

1359 uprv_memcpy(stage2+(fullStage2Length-stage2Length),

1360 mbcsTable->fromUnicodeTable+stage1Length,

1361 stage2Length*4);

1362

1363 mbcsTable->fromUnicodeTable=stage1;

1364 mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length);

1365

1366 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */

1367 stage2=(uint32_t *)stage1;

1368

1369 /* reconstitute the initial part of stage 2 from the mbcsIndex */

1370 {

1371 int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;

1372 int32_t stageUTF8Index=0;

1373 int32_t st1, st2, st3, i;

1374

1375 for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {

1376 st2=stage1[st1];

1377 if(st2!=stage1Length/2) {

1378 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */

1379 for(i=0; i<16; ++i) {

1380 st3=mbcsTable->mbcsIndex[stageUTF8Index++];

1381 if(st3!=0) {

1382 /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */

1383 st3>>=4;

1384 /*

1385 * 4 stage 2 entries point to 4 consecutive stage 3 16-b locks which are

1386 * allocated together as a single 64-block for access fr om the mbcsIndex

1387 */

1388 stage2[st2++]=st3++;

1389 stage2[st2++]=st3++;

1390 stage2[st2++]=st3++;

1391 stage2[st2++]=st3;

1392 } else {

1393 /* no stage 3 block, skip */

1394 st2+=4;

1395 }

1396 }

1397 } else {

1398 /* no stage 2 block, skip */

1399 stageUTF8Index+=16;

1400 }

1401 }

1402 }

1403

1404 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */

1405 ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCod e);

1406 }

1407

1408 /* MBCS setup functions ----------------------------------------------------- */

1409

1410 static void

1411 ucnv_MBCSLoad(UConverterSharedData *sharedData,

1412 UConverterLoadArgs *pArgs,

1413 const uint8_t *raw,

1414 UErrorCode *pErrorCode) {

1415 UDataInfo info;

1416 UConverterMBCSTable *mbcsTable=&sharedData->mbcs;

1417 _MBCSHeader header=(_MBCSHeader )raw;

1418 uint32_t offset;

1419 uint32_t headerLength;

1420 UBool noFromU=FALSE;

1421

1422 if(header->version[0]==4) {

1423 headerLength=MBCS_HEADER_V4_LENGTH;

1424 } else if(header->version[0]==5 && header->version[1]>=3 &&

1425 (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {

1426 headerLength=header->options&MBCS_OPT_LENGTH_MASK;

1427 noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);

1428 } else {

1429 *pErrorCode=U_INVALID_TABLE_FORMAT;

1430 return;

1431 }

1432

1433 mbcsTable->outputType=(uint8_t)header->flags;

1434 if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {

1435 *pErrorCode=U_INVALID_TABLE_FORMAT;

1436 return;

1437 }

1438

1439 /* extension data, header version 4.2 and higher */

1440 offset=header->flags>>8;

1441 if(offset!=0) {

1442 mbcsTable->extIndexes=(const int32_t *)(raw+offset);

1443 }

1444

1445 if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {

1446 UConverterLoadArgs args={ 0 };

1447 UConverterSharedData *baseSharedData;

1448 const int32_t *extIndexes;

1449 const char *baseName;

1450

1451 /* extension-only file, load the base table and set values appropriately */

1452 if((extIndexes=mbcsTable->extIndexes)==NULL) {

1453 /* extension-only file without extension */

1454 *pErrorCode=U_INVALID_TABLE_FORMAT;

1455 return;

1456 }

1457

1458 if(pArgs->nestedLoads!=1) {

1459 /* an extension table must not be loaded as a base table */

1460 *pErrorCode=U_INVALID_TABLE_FILE;

1461 return;

1462 }

1463

1464 /* load the base table */

1465 baseName=(const char )header+headerLength4;

1466 if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {

1467 /* forbid loading this same extension-only file */

1468 *pErrorCode=U_INVALID_TABLE_FORMAT;

1469 return;

1470 }

1471

1472 /* TODO parse package name out of the prefix of the base name in the ext ension .cnv file? */

1473 args.size=sizeof(UConverterLoadArgs);

1474 args.nestedLoads=2;

1475 args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;

1476 args.reserved=pArgs->reserved;

1477 args.options=pArgs->options;

1478 args.pkg=pArgs->pkg;

1479 args.name=baseName;

1480 baseSharedData=ucnv_load(&args, pErrorCode);

1481 if(U_FAILURE(*pErrorCode)) {

1482 return;

1483 }

1484 if( baseSharedData->staticData->conversionType!=UCNV_MBCS \|\|

1485 baseSharedData->mbcs.baseSharedData!=NULL

1486 ) {

1487 ucnv_unload(baseSharedData);

1488 *pErrorCode=U_INVALID_TABLE_FORMAT;

1489 return;

1490 }

1491 if(pArgs->onlyTestIsLoadable) {

1492 /*

1493 * Exit as soon as we know that we can load the converter

1494 * and the format is valid and supported.

1495 * The worst that can happen in the following code is a memory

1496 * allocation error.

1497 */

1498 ucnv_unload(baseSharedData);

1499 return;

1500 }

1501

1502 /* copy the base table data */

1503 uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable ));

1504

1505 /* overwrite values with relevant ones for the extension converter */

1506 mbcsTable->baseSharedData=baseSharedData;

1507 mbcsTable->extIndexes=extIndexes;

1508

1509 /*

1510 * It would be possible to share the swapLFNL data with a base converter ,

1511 * but the generated name would have to be different, and the memory

1512 * would have to be free'd only once.

1513 * It is easier to just create the data for the extension converter

1514 * separately when it is requested.

1515 */

1516 mbcsTable->swapLFNLStateTable=NULL;

1517 mbcsTable->swapLFNLFromUnicodeBytes=NULL;

1518 mbcsTable->swapLFNLName=NULL;

1519

1520 /*

1521 * The reconstitutedData must be deleted only when the base converter

1522 * is unloaded.

1523 */

1524 mbcsTable->reconstitutedData=NULL;

1525

1526 /*

1527 * Set a special, runtime-only outputType if the extension converter

1528 * is a DBCS version of a base converter that also maps single bytes.

1529 */

1530 if( sharedData->staticData->conversionType==UCNV_DBCS \|\|

1531 (sharedData->staticData->conversionType==UCNV_MBCS &&

1532 sharedData->staticData->minBytesPerChar>=2)

1533 ) {

1534 if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {

1535 /* the base converter is SI/SO-stateful */

1536 int32_t entry;

1537

1538 /* get the dbcs state from the state table entry for SO=0x0e */

1539 entry=mbcsTable->stateTable[0][0xe];

1540 if( MBCS_ENTRY_IS_FINAL(entry) &&

1541 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&

1542 MBCS_ENTRY_FINAL_STATE(entry)!=0

1543 ) {

1544 mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(ent ry);

1545

1546 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;

1547 }

1548 } else if(

1549 baseSharedData->staticData->conversionType==UCNV_MBCS &&

1550 baseSharedData->staticData->minBytesPerChar==1 &&

1551 baseSharedData->staticData->maxBytesPerChar==2 &&

1552 mbcsTable->countStates<=127

1553 ) {

1554 /* non-stateful base converter, need to modify the state table * /

1555 int32_t (*newStateTable)[256];

1556 int32_t *state;

1557 int32_t i, count;

1558

1559 /* allocate a new state table and copy the base state table cont ents */

1560 count=mbcsTable->countStates;

1561 newStateTable=(int32_t ()[256])uprv_malloc((count+1)1024);

1562 if(newStateTable==NULL) {

1563 ucnv_unload(baseSharedData);

1564 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;

1565 return;

1566 }

1567

1568 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);

1569

1570 /* change all final single-byte entries to go to a new all-illeg al state */

1571 state=newStateTable[0];

1572 for(i=0; i<256; ++i) {

1573 if(MBCS_ENTRY_IS_FINAL(state[i])) {

1574 state[i]=MBCS_ENTRY_TRANSITION(count, 0);

1575 }

1576 }

1577

1578 /* build the new all-illegal state */

1579 state=newStateTable[count];

1580 for(i=0; i<256; ++i) {

1581 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);

1582 }

1583 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;

1584 mbcsTable->countStates=(uint8_t)(count+1);

1585 mbcsTable->stateTableOwned=TRUE;

1586

1587 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;

1588 }

1589 }

1590

1591 /*

1592 * unlike below for files with base tables, do not get the unicodeMask

1593 * from the sharedData; instead, use the base table's unicodeMask,

1594 * which we copied in the memcpy above;

1595 * this is necessary because the static data unicodeMask, especially

1596 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data

1597 */

1598 } else {

1599 /* conversion file with a base table; an additional extension table is o ptional */

1600 /* make sure that the output type is known */

1601 switch(mbcsTable->outputType) {

1602 case MBCS_OUTPUT_1:

1603 case MBCS_OUTPUT_2:

1604 case MBCS_OUTPUT_3:

1605 case MBCS_OUTPUT_4:

1606 case MBCS_OUTPUT_3_EUC:

1607 case MBCS_OUTPUT_4_EUC:

1608 case MBCS_OUTPUT_2_SISO:

1609 /* OK */

1610 break;

1611 default:

1612 *pErrorCode=U_INVALID_TABLE_FORMAT;

1613 return;

1614 }

1615 if(pArgs->onlyTestIsLoadable) {

1616 /*

1617 * Exit as soon as we know that we can load the converter

1618 * and the format is valid and supported.

1619 * The worst that can happen in the following code is a memory

1620 * allocation error.

1621 */

1622 return;

1623 }

1624

1625 mbcsTable->countStates=(uint8_t)header->countStates;

1626 mbcsTable->countToUFallbacks=header->countToUFallbacks;

1627 mbcsTable->stateTable=(const int32_t ()[256])(raw+headerLength4);

1628 mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable +header->countStates);

1629 mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCode Units);

1630

1631 mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTa ble);

1632 mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUByt es);

1633 mbcsTable->fromUBytesLength=header->fromUBytesLength;

1634

1635 /*

1636 * converter versions 6.1 and up contain a unicodeMask that is

1637 * used here to select the most efficient function implementations

1638 */

1639 info.size=sizeof(UDataInfo);

1640 udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);

1641 if(info.formatVersion[0]>6 \|\| (info.formatVersion[0]==6 && info.formatVe rsion[1]>=1)) {

1642 /* mask off possible future extensions to be safe */

1643 mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask &3);

1644 } else {

1645 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */

1646 mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY\|UCNV_HAS_SURROGATES;

1647 }

1648

1649 /*

1650 * _MBCSHeader.version 4.3 adds utf8Friendly data structures.

1651 * Check for the header version, SBCS vs. MBCS, and for whether the

1652 * data structures are optimized for code points as high as what the

1653 * runtime code is designed for.

1654 * The implementation does not handle mapping tables with entries for

1655 * unpaired surrogates.

1656 */

1657 if( header->version[1]>=3 &&

1658 (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&

1659 (mbcsTable->countStates==1 ?

1660 (header->version[2]>=(SBCS_FAST_MAX>>8)) :

1661 (header->version[2]>=(MBCS_FAST_MAX>>8))

1662 )

1663 ) {

1664 mbcsTable->utf8Friendly=TRUE;

1665

1666 if(mbcsTable->countStates==1) {

1667 /*

1668 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBC S_FAST_MAX or higher.

1669 * Build a table with indexes to each block, to be used instead of

1670 * the regular stage 1/2 table.

1671 */

1672 int32_t i;

1673 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {

1674 mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTabl e->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];

1675 }

1676 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */

1677 mbcsTable->maxFastUChar=SBCS_FAST_MAX;

1678 } else {

1679 /*

1680 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBC S_FAST_MAX or higher.

1681 * The .cnv file is prebuilt with an additional stage table with indexes

1682 * to each block.

1683 */

1684 mbcsTable->mbcsIndex=(const uint16_t *)

1685 (mbcsTable->fromUnicodeBytes+

1686 (noFromU ? 0 : mbcsTable->fromUBytesLength));

1687 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)\|0xff;

1688 }

1689 }

1690

1691 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */

1692 {

1693 uint32_t asciiRoundtrips=0xffffffff;

1694 int32_t i;

1695

1696 for(i=0; i<0x80; ++i) {

1697 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_V ALID_DIRECT_16, i)) {

1698 asciiRoundtrips&=~((uint32_t)1<<(i>>2));

1699 }

1700 }

1701 mbcsTable->asciiRoundtrips=asciiRoundtrips;

1702 }

1703

1704 if(noFromU) {

1705 uint32_t stage1Length=

1706 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?

1707 0x440 : 0x40;

1708 uint32_t stage2Length=

1709 (header->offsetFromUBytes-header->offsetFromUTable)/4-

1710 stage1Length/2;

1711 reconstituteData(mbcsTable, stage1Length, stage2Length, header->full Stage2Length, pErrorCode);

1712 }

1713 }

1714

1715 /* Set the impl pointer here so that it is set for both extension-only and b ase tables. */

1716 if(mbcsTable->utf8Friendly) {

1717 if(mbcsTable->countStates==1) {

1718 sharedData->impl=&_SBCSUTF8Impl;

1719 } else {

1720 if(mbcsTable->outputType==MBCS_OUTPUT_2) {

1721 sharedData->impl=&_DBCSUTF8Impl;

1722 }

1723 }

1724 }

1725

1726 if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY \|\| mbcsTable->outputType==MB CS_OUTPUT_2_SISO) {

1727 /*

1728 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not rou ndtrip.

1729 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength co rrectly.

1730 */

1731 mbcsTable->asciiRoundtrips=0;

1732 }

1733 }

1734

1735 static void

1736 ucnv_MBCSUnload(UConverterSharedData *sharedData) {

1737 UConverterMBCSTable *mbcsTable=&sharedData->mbcs;

1738

1739 if(mbcsTable->swapLFNLStateTable!=NULL) {

1740 uprv_free(mbcsTable->swapLFNLStateTable);

1741 }

1742 if(mbcsTable->stateTableOwned) {

1743 uprv_free((void *)mbcsTable->stateTable);

1744 }

1745 if(mbcsTable->baseSharedData!=NULL) {

1746 ucnv_unload(mbcsTable->baseSharedData);

1747 }

1748 if(mbcsTable->reconstitutedData!=NULL) {

1749 uprv_free(mbcsTable->reconstitutedData);

1750 }

1751 }

1752

1753 static void

1754 ucnv_MBCSOpen(UConverter *cnv,

1755 UConverterLoadArgs *pArgs,

1756 UErrorCode *pErrorCode) {

1757 UConverterMBCSTable *mbcsTable;

1758 const int32_t *extIndexes;

1759 uint8_t outputType;

1760 int8_t maxBytesPerUChar;

1761

1762 if(pArgs->onlyTestIsLoadable) {

1763 return;

1764 }

1765

1766 mbcsTable=&cnv->sharedData->mbcs;

1767 outputType=mbcsTable->outputType;

1768

1769 if(outputType==MBCS_OUTPUT_DBCS_ONLY) {

1770 /* the swaplfnl option does not apply, remove it */

1771 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;

1772 }

1773

1774 if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {

1775 /* do this because double-checked locking is broken */

1776 UBool isCached;

1777

1778 umtx_lock(NULL);

1779 isCached=mbcsTable->swapLFNLStateTable!=NULL;

1780 umtx_unlock(NULL);

1781

1782 if(!isCached) {

1783 if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {

1784 if(U_FAILURE(*pErrorCode)) {

1785 return; /* something went wrong */

1786 }

1787

1788 /* the option does not apply, remove it */

1789 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;

1790 }

1791 }

1792 }

1793

1794 if(uprv_strstr(pArgs->name, "18030")!=NULL) {

1795 if(uprv_strstr(pArgs->name, "gb18030")!=NULL \|\| uprv_strstr(pArgs->name, "GB18030")!=NULL) {

1796 /* set a flag for GB 18030 mode, which changes the callback behavior */

1797 cnv->options\|=_MBCS_OPTION_GB18030;

1798 }

1799 } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) \|\| (uprv_strstr(pArgs->na me, "keis")!=NULL)) {

1800 /* set a flag for KEIS converter, which changes the SI/SO character sequ ence */

1801 cnv->options\|=_MBCS_OPTION_KEIS;

1802 } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) \|\| (uprv_strstr(pArgs->nam e, "jef")!=NULL)) {

1803 /* set a flag for JEF converter, which changes the SI/SO character seque nce */

1804 cnv->options\|=_MBCS_OPTION_JEF;

1805 } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) \|\| (uprv_strstr(pArgs->na me, "jips")!=NULL)) {

1806 /* set a flag for JIPS converter, which changes the SI/SO character sequ ence */

1807 cnv->options\|=_MBCS_OPTION_JIPS;

1808 }

1809

1810 /* fix maxBytesPerUChar depending on outputType and options etc. */

1811 if(outputType==MBCS_OUTPUT_2_SISO) {

1812 cnv->maxBytesPerUChar=3; /* SO+DBCS */

1813 }

1814

1815 extIndexes=mbcsTable->extIndexes;

1816 if(extIndexes!=NULL) {

1817 maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);

1818 if(outputType==MBCS_OUTPUT_2_SISO) {

1819 ++maxBytesPerUChar; /* SO + multiple DBCS */

1820 }

1821

1822 if(maxBytesPerUChar>cnv->maxBytesPerUChar) {

1823 cnv->maxBytesPerUChar=maxBytesPerUChar;

1824 }

1825 }

1826

1827 #if 0

1828 /*

1829 * documentation of UConverter fields used for status

1830 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()

1831 */

1832

1833 /* toUnicode */

1834 cnv->toUnicodeStatus=0; /* offset */

1835 cnv->mode=0; /* state */

1836 cnv->toULength=0; /* byteIndex */

1837

1838 /* fromUnicode */

1839 cnv->fromUChar32=0;

1840 cnv->fromUnicodeStatus=1; /* prevLength */

1841 #endif

1842 }

1843

1844 static const char *

1845 ucnv_MBCSGetName(const UConverter *cnv) {

1846 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNL Name!=NULL) {

1847 return cnv->sharedData->mbcs.swapLFNLName;

1848 } else {

1849 return cnv->sharedData->staticData->name;

1850 }

1851 }

1852

1853 /* MBCS-to-Unicode conversion functions ------------------------------------- */

1854

1855 static UChar32

1856 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {

1857 const _MBCSToUFallback *toUFallbacks;

1858 uint32_t i, start, limit;

1859

1860 limit=mbcsTable->countToUFallbacks;

1861 if(limit>0) {

1862 /* do a binary search for the fallback mapping */

1863 toUFallbacks=mbcsTable->toUFallbacks;

1864 start=0;

1865 while(start<limit-1) {

1866 i=(start+limit)/2;

1867 if(offset<toUFallbacks[i].offset) {

1868 limit=i;

1869 } else {

1870 start=i;

1871 }

1872 }

1873

1874 /* did we really find it? */

1875 if(offset==toUFallbacks[start].offset) {

1876 return toUFallbacks[start].codePoint;

1877 }

1878 }

1879

1880 return 0xfffe;

1881 }

1882

1883 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */

1884 static void

1885 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

1886 UErrorCode *pErrorCode) {

1887 UConverter *cnv;

1888 const uint8_t source, sourceLimit;

1889 UChar *target;

1890 const UChar *targetLimit;

1891 int32_t *offsets;

1892

1893 const int32_t (*stateTable)[256];

1894

1895 int32_t sourceIndex;

1896

1897 int32_t entry;

1898 UChar c;

1899 uint8_t action;

1900

1901 /* set up the local pointers */

1902 cnv=pArgs->converter;

1903 source=(const uint8_t *)pArgs->source;

1904 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

1905 target=pArgs->target;

1906 targetLimit=pArgs->targetLimit;

1907 offsets=pArgs->offsets;

1908

1909 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

1910 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTa ble;

1911 } else {

1912 stateTable=cnv->sharedData->mbcs.stateTable;

1913 }

1914

1915 /* sourceIndex=-1 if the current character began in the previous buffer */

1916 sourceIndex=0;

1917

1918 /* conversion loop */

1919 while(source<sourceLimit) {

1920 /*

1921 * This following test is to see if available input would overflow the o utput.

1922 * It does not catch output of more than one code unit that

1923 * overflows as a result of a surrogate pair or callback output

1924 * from the last source byte.

1925 * Therefore, those situations also test for overflows and will

1926 * then break the loop, too.

1927 */

1928 if(target>=targetLimit) {

1929 /* target is full */

1930 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

1931 break;

1932 }

1933

1934 entry=stateTable[0][*source++];

1935 /* MBCS_ENTRY_IS_FINAL(entry) */

1936

1937 /* test the most common case first */

1938 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {

1939 /* output BMP code point */

1940 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

1941 if(offsets!=NULL) {

1942 *offsets++=sourceIndex;

1943 }

1944

1945 /* normal end of action codes: prepare for a new character */

1946 ++sourceIndex;

1947 continue;

1948 }

1949

1950 /*

1951 * An if-else-if chain provides more reliable performance for

1952 * the most common cases compared to a switch.

1953 */

1954 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

1955 if(action==MBCS_STATE_VALID_DIRECT_20 \|\|

1956 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv) )

1957 ) {

1958 entry=MBCS_ENTRY_FINAL_VALUE(entry);

1959 /* output surrogate pair */

1960 *target++=(UChar)(0xd800\|(UChar)(entry>>10));

1961 if(offsets!=NULL) {

1962 *offsets++=sourceIndex;

1963 }

1964 c=(UChar)(0xdc00\|(UChar)(entry&0x3ff));

1965 if(target<targetLimit) {

1966 *target++=c;

1967 if(offsets!=NULL) {

1968 *offsets++=sourceIndex;

1969 }

1970 } else {

1971 /* target overflow */

1972 cnv->UCharErrorBuffer[0]=c;

1973 cnv->UCharErrorBufferLength=1;

1974 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

1975 break;

1976 }

1977

1978 ++sourceIndex;

1979 continue;

1980 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {

1981 if(UCNV_TO_U_USE_FALLBACK(cnv)) {

1982 /* output BMP code point */

1983 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

1984 if(offsets!=NULL) {

1985 *offsets++=sourceIndex;

1986 }

1987

1988 ++sourceIndex;

1989 continue;

1990 }

1991 } else if(action==MBCS_STATE_UNASSIGNED) {

1992 /* just fall through */

1993 } else if(action==MBCS_STATE_ILLEGAL) {

1994 /* callback(illegal) */

1995 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

1996 } else {

1997 /* reserved, must never occur */

1998 ++sourceIndex;

1999 continue;

2000 }

2001

2002 if(U_FAILURE(*pErrorCode)) {

2003 /* callback(illegal) */

2004 break;

2005 } else /* unassigned sequences indicated with byteIndex>0 */ {

2006 /* try an extension mapping */

2007 pArgs->source=(const char *)source;

2008 cnv->toUBytes[0]=*(source-1);

2009 cnv->toULength=_extToU(cnv, cnv->sharedData,

2010 1, &source, sourceLimit,

2011 &target, targetLimit,

2012 &offsets, sourceIndex,

2013 pArgs->flush,

2014 pErrorCode);

2015 sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);

2016

2017 if(U_FAILURE(*pErrorCode)) {

2018 /* not mappable or buffer overflow */

2019 break;

2020 }

2021 }

2022 }

2023

2024 /* write back the updated pointers */

2025 pArgs->source=(const char *)source;

2026 pArgs->target=target;

2027 pArgs->offsets=offsets;

2028 }

2029

2030 /*

2031 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single -byte, single-state codepages

2032 * that only map to and from the BMP.

2033 * In addition to single-byte optimizations, the offset calculations

2034 * become much easier.

2035 */

2036 static void

2037 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,

2038 UErrorCode *pErrorCode) {

2039 UConverter *cnv;

2040 const uint8_t source, sourceLimit, *lastSource;

2041 UChar *target;

2042 int32_t targetCapacity, length;

2043 int32_t *offsets;

2044

2045 const int32_t (*stateTable)[256];

2046

2047 int32_t sourceIndex;

2048

2049 int32_t entry;

2050 uint8_t action;

2051

2052 /* set up the local pointers */

2053 cnv=pArgs->converter;

2054 source=(const uint8_t *)pArgs->source;

2055 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

2056 target=pArgs->target;

2057 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

2058 offsets=pArgs->offsets;

2059

2060 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

2061 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTa ble;

2062 } else {

2063 stateTable=cnv->sharedData->mbcs.stateTable;

2064 }

2065

2066 /* sourceIndex=-1 if the current character began in the previous buffer */

2067 sourceIndex=0;

2068 lastSource=source;

2069

2070 /*

2071 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter

2072 * for the minimum of the sourceLength and targetCapacity

2073 */

2074 length=(int32_t)(sourceLimit-source);

2075 if(length<targetCapacity) {

2076 targetCapacity=length;

2077 }

2078

2079 #if MBCS_UNROLL_SINGLE_TO_BMP

2080 /* unrolling makes it faster on Pentium III/Windows 2000 */

2081 /* unroll the loop with the most common case */

2082 unrolled:

2083 if(targetCapacity>=16) {

2084 int32_t count, loops, oredEntries;

2085

2086 loops=count=targetCapacity>>4;

2087 do {

2088 oredEntries=entry=stateTable[0][*source++];

2089 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2090 oredEntries\|=entry=stateTable[0][*source++];

2091 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2092 oredEntries\|=entry=stateTable[0][*source++];

2093 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2094 oredEntries\|=entry=stateTable[0][*source++];

2095 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2096 oredEntries\|=entry=stateTable[0][*source++];

2097 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2098 oredEntries\|=entry=stateTable[0][*source++];

2099 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2100 oredEntries\|=entry=stateTable[0][*source++];

2101 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2102 oredEntries\|=entry=stateTable[0][*source++];

2103 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2104 oredEntries\|=entry=stateTable[0][*source++];

2105 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2106 oredEntries\|=entry=stateTable[0][*source++];

2107 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2108 oredEntries\|=entry=stateTable[0][*source++];

2109 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2110 oredEntries\|=entry=stateTable[0][*source++];

2111 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2112 oredEntries\|=entry=stateTable[0][*source++];

2113 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2114 oredEntries\|=entry=stateTable[0][*source++];

2115 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2116 oredEntries\|=entry=stateTable[0][*source++];

2117 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2118 oredEntries\|=entry=stateTable[0][*source++];

2119 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2120

2121 /* were all 16 entries really valid? */

2122 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {

2123 /* no, return to the first of these 16 */

2124 source-=16;

2125 target-=16;

2126 break;

2127 }

2128 } while(--count>0);

2129 count=loops-count;

2130 targetCapacity-=16*count;

2131

2132 if(offsets!=NULL) {

2133 lastSource+=16*count;

2134 while(count>0) {

2135 *offsets++=sourceIndex++;

2136 *offsets++=sourceIndex++;

2137 *offsets++=sourceIndex++;

2138 *offsets++=sourceIndex++;

2139 *offsets++=sourceIndex++;

2140 *offsets++=sourceIndex++;

2141 *offsets++=sourceIndex++;

2142 *offsets++=sourceIndex++;

2143 *offsets++=sourceIndex++;

2144 *offsets++=sourceIndex++;

2145 *offsets++=sourceIndex++;

2146 *offsets++=sourceIndex++;

2147 *offsets++=sourceIndex++;

2148 *offsets++=sourceIndex++;

2149 *offsets++=sourceIndex++;

2150 *offsets++=sourceIndex++;

2151 --count;

2152 }

2153 }

2154 }

2155 #endif

2156

2157 /* conversion loop */

2158 while(targetCapacity > 0 && source < sourceLimit) {

2159 entry=stateTable[0][*source++];

2160 /* MBCS_ENTRY_IS_FINAL(entry) */

2161

2162 /* test the most common case first */

2163 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {

2164 /* output BMP code point */

2165 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2166 --targetCapacity;

2167 continue;

2168 }

2169

2170 /*

2171 * An if-else-if chain provides more reliable performance for

2172 * the most common cases compared to a switch.

2173 */

2174 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

2175 if(action==MBCS_STATE_FALLBACK_DIRECT_16) {

2176 if(UCNV_TO_U_USE_FALLBACK(cnv)) {

2177 /* output BMP code point */

2178 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2179 --targetCapacity;

2180 continue;

2181 }

2182 } else if(action==MBCS_STATE_UNASSIGNED) {

2183 /* just fall through */

2184 } else if(action==MBCS_STATE_ILLEGAL) {

2185 /* callback(illegal) */

2186 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

2187 } else {

2188 /* reserved, must never occur */

2189 continue;

2190 }

2191

2192 /* set offsets since the start or the last extension */

2193 if(offsets!=NULL) {

2194 int32_t count=(int32_t)(source-lastSource);

2195

2196 /* predecrement: do not set the offset for the callback-causing char acter */

2197 while(--count>0) {

2198 *offsets++=sourceIndex++;

2199 }

2200 /* offset and sourceIndex are now set for the current character */

2201 }

2202

2203 if(U_FAILURE(*pErrorCode)) {

2204 /* callback(illegal) */

2205 break;

2206 } else /* unassigned sequences indicated with byteIndex>0 */ {

2207 /* try an extension mapping */

2208 lastSource=source;

2209 cnv->toUBytes[0]=*(source-1);

2210 cnv->toULength=_extToU(cnv, cnv->sharedData,

2211 1, &source, sourceLimit,

2212 &target, pArgs->targetLimit,

2213 &offsets, sourceIndex,

2214 pArgs->flush,

2215 pErrorCode);

2216 sourceIndex+=1+(int32_t)(source-lastSource);

2217

2218 if(U_FAILURE(*pErrorCode)) {

2219 /* not mappable or buffer overflow */

2220 break;

2221 }

2222

2223 /* recalculate the targetCapacity after an extension mapping */

2224 targetCapacity=(int32_t)(pArgs->targetLimit-target);

2225 length=(int32_t)(sourceLimit-source);

2226 if(length<targetCapacity) {

2227 targetCapacity=length;

2228 }

2229 }

2230

2231 #if MBCS_UNROLL_SINGLE_TO_BMP

2232 /* unrolling makes it faster on Pentium III/Windows 2000 */

2233 goto unrolled;

2234 #endif

2235 }

2236

2237 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimi t) {

2238 /* target is full */

2239 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

2240 }

2241

2242 /* set offsets since the start or the last callback */

2243 if(offsets!=NULL) {

2244 size_t count=source-lastSource;

2245 while(count>0) {

2246 *offsets++=sourceIndex++;

2247 --count;

2248 }

2249 }

2250

2251 /* write back the updated pointers */

2252 pArgs->source=(const char *)source;

2253 pArgs->target=target;

2254 pArgs->offsets=offsets;

2255 }

2256

2257 static UBool

2258 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {

2259 const int32_t *row=stateTable[state];

2260 int32_t b, entry;

2261 /* First test for final entries in this state for some commonly valid byte v alues. */

2262 entry=row[0xa1];

2263 if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

2264 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

2265 ) {

2266 return TRUE;

2267 }

2268 entry=row[0x41];

2269 if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

2270 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

2271 ) {

2272 return TRUE;

2273 }

2274 /* Then test for final entries in this state. */

2275 for(b=0; b<=0xff; ++b) {

2276 entry=row[b];

2277 if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

2278 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

2279 ) {

2280 return TRUE;

2281 }

2282 }

2283 /* Then recurse for transition entries. */

2284 for(b=0; b<=0xff; ++b) {

2285 entry=row[b];

2286 if( MBCS_ENTRY_IS_TRANSITION(entry) &&

2287 hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE( entry))

2288 ) {

2289 return TRUE;

2290 }

2291 }

2292 return FALSE;

2293 }

2294

2295 /*

2296 * Is byte b a single/lead byte in this state?

2297 * Recurse for transition states, because here we don't want to say that

2298 * b is a lead byte if all byte sequences that start with b are illegal.

2299 */

2300 static UBool

2301 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly , uint8_t b) {

2302 const int32_t *row=stateTable[state];

2303 int32_t entry=row[b];

2304 if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */

2305 return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STA TE(entry));

2306 } else {

2307 uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

2308 if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {

2309 return FALSE; /* SI/SO are illegal for DBCS-only conversion */

2310 } else {

2311 return action!=MBCS_STATE_ILLEGAL;

2312 }

2313 }

2314 }

2315

2316 U_CFUNC void

2317 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

2318 UErrorCode *pErrorCode) {

2319 UConverter *cnv;

2320 const uint8_t source, sourceLimit;

2321 UChar *target;

2322 const UChar *targetLimit;

2323 int32_t *offsets;

2324

2325 const int32_t (*stateTable)[256];

2326 const uint16_t *unicodeCodeUnits;

2327

2328 uint32_t offset;

2329 uint8_t state;

2330 int8_t byteIndex;

2331 uint8_t *bytes;

2332

2333 int32_t sourceIndex, nextSourceIndex;

2334

2335 int32_t entry;

2336 UChar c;

2337 uint8_t action;

2338

2339 /* use optimized function if possible */

2340 cnv=pArgs->converter;

2341

2342 if(cnv->preToULength>0) {

2343 /*

2344 * pass sourceIndex=-1 because we continue from an earlier buffer

2345 * in the future, this may change with continuous offsets

2346 */

2347 ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);

2348

2349 if(U_FAILURE(*pErrorCode) \|\| cnv->preToULength<0) {

2350 return;

2351 }

2352 }

2353

2354 if(cnv->sharedData->mbcs.countStates==1) {

2355 if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {

2356 ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);

2357 } else {

2358 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);

2359 }

2360 return;

2361 }

2362

2363 /* set up the local pointers */

2364 source=(const uint8_t *)pArgs->source;

2365 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

2366 target=pArgs->target;

2367 targetLimit=pArgs->targetLimit;

2368 offsets=pArgs->offsets;

2369

2370 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

2371 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTa ble;

2372 } else {

2373 stateTable=cnv->sharedData->mbcs.stateTable;

2374 }

2375 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;

2376

2377 /* get the converter state from UConverter */

2378 offset=cnv->toUnicodeStatus;

2379 byteIndex=cnv->toULength;

2380 bytes=cnv->toUBytes;

2381

2382 /*

2383 * if we are in the SBCS state for a DBCS-only converter,

2384 * then load the DBCS state from the MBCS data

2385 * (dbcsOnlyState==0 if it is not a DBCS-only converter)

2386 */

2387 if((state=(uint8_t)(cnv->mode))==0) {

2388 state=cnv->sharedData->mbcs.dbcsOnlyState;

2389 }

2390

2391 /* sourceIndex=-1 if the current character began in the previous buffer */

2392 sourceIndex=byteIndex==0 ? 0 : -1;

2393 nextSourceIndex=0;

2394

2395 /* conversion loop */

2396 while(source<sourceLimit) {

2397 /*

2398 * This following test is to see if available input would overflow the o utput.

2399 * It does not catch output of more than one code unit that

2400 * overflows as a result of a surrogate pair or callback output

2401 * from the last source byte.

2402 * Therefore, those situations also test for overflows and will

2403 * then break the loop, too.

2404 */

2405 if(target>=targetLimit) {

2406 /* target is full */

2407 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

2408 break;

2409 }

2410

2411 if(byteIndex==0) {

2412 /* optimized loop for 1/2-byte input and BMP output */

2413 if(offsets==NULL) {

2414 do {

2415 entry=stateTable[state][*source];

2416 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

2417 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);

2418 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);

2419

2420 ++source;

2421 if( source<sourceLimit &&

2422 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source] ) &&

2423 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&

2424 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16 (entry)])<0xfffe

2425 ) {

2426 ++source;

2427 *target++=c;

2428 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typ ically 0 */

2429 offset=0;

2430 } else {

2431 /* set the state and leave the optimized loop */

2432 bytes[0]=*(source-1);

2433 byteIndex=1;

2434 break;

2435 }

2436 } else {

2437 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {

2438 /* output BMP code point */

2439 ++source;

2440 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2441 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typ ically 0 */

2442 } else {

2443 /* leave the optimized loop */

2444 break;

2445 }

2446 }

2447 } while(source<sourceLimit && target<targetLimit);

2448 } else /* offsets!=NULL */ {

2449 do {

2450 entry=stateTable[state][*source];

2451 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

2452 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);

2453 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);

2454

2455 ++source;

2456 if( source<sourceLimit &&

2457 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source] ) &&

2458 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&

2459 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16 (entry)])<0xfffe

2460 ) {

2461 ++source;

2462 *target++=c;

2463 if(offsets!=NULL) {

2464 *offsets++=sourceIndex;

2465 sourceIndex=(nextSourceIndex+=2);

2466 }

2467 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typ ically 0 */

2468 offset=0;

2469 } else {

2470 /* set the state and leave the optimized loop */

2471 ++nextSourceIndex;

2472 bytes[0]=*(source-1);

2473 byteIndex=1;

2474 break;

2475 }

2476 } else {

2477 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {

2478 /* output BMP code point */

2479 ++source;

2480 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2481 if(offsets!=NULL) {

2482 *offsets++=sourceIndex;

2483 sourceIndex=++nextSourceIndex;

2484 }

2485 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typ ically 0 */

2486 } else {

2487 /* leave the optimized loop */

2488 break;

2489 }

2490 }

2491 } while(source<sourceLimit && target<targetLimit);

2492 }

2493

2494 /*

2495 * these tests and break statements could be put inside the loop

2496 * if C had "break outerLoop" like Java

2497 */

2498 if(source>=sourceLimit) {

2499 break;

2500 }

2501 if(target>=targetLimit) {

2502 /* target is full */

2503 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

2504 break;

2505 }

2506

2507 ++nextSourceIndex;

2508 bytes[byteIndex++]=*source++;

2509 } else /* byteIndex>0 */ {

2510 ++nextSourceIndex;

2511 entry=stateTable[state][bytes[byteIndex++]=*source++];

2512 }

2513

2514 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

2515 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);

2516 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);

2517 continue;

2518 }

2519

2520 /* save the previous state for proper extension mapping with SI/SO-state ful converters */

2521 cnv->mode=state;

2522

2523 /* set the next state early so that we can reuse the entry variable */

2524 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */

2525

2526 /*

2527 * An if-else-if chain provides more reliable performance for

2528 * the most common cases compared to a switch.

2529 */

2530 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

2531 if(action==MBCS_STATE_VALID_16) {

2532 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);

2533 c=unicodeCodeUnits[offset];

2534 if(c<0xfffe) {

2535 /* output BMP code point */

2536 *target++=c;

2537 if(offsets!=NULL) {

2538 *offsets++=sourceIndex;

2539 }

2540 byteIndex=0;

2541 } else if(c==0xfffe) {

2542 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFa llback(&cnv->sharedData->mbcs, offset))!=0xfffe) {

2543 /* output fallback BMP code point */

2544 *target++=(UChar)entry;

2545 if(offsets!=NULL) {

2546 *offsets++=sourceIndex;

2547 }

2548 byteIndex=0;

2549 }

2550 } else {

2551 /* callback(illegal) */

2552 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

2553 }

2554 } else if(action==MBCS_STATE_VALID_DIRECT_16) {

2555 /* output BMP code point */

2556 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2557 if(offsets!=NULL) {

2558 *offsets++=sourceIndex;

2559 }

2560 byteIndex=0;

2561 } else if(action==MBCS_STATE_VALID_16_PAIR) {

2562 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);

2563 c=unicodeCodeUnits[offset++];

2564 if(c<0xd800) {

2565 /* output BMP code point below 0xd800 */

2566 *target++=c;

2567 if(offsets!=NULL) {

2568 *offsets++=sourceIndex;

2569 }

2570 byteIndex=0;

2571 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {

2572 /* output roundtrip or fallback surrogate pair */

2573 *target++=(UChar)(c&0xdbff);

2574 if(offsets!=NULL) {

2575 *offsets++=sourceIndex;

2576 }

2577 byteIndex=0;

2578 if(target<targetLimit) {

2579 *target++=unicodeCodeUnits[offset];

2580 if(offsets!=NULL) {

2581 *offsets++=sourceIndex;

2582 }

2583 } else {

2584 /* target overflow */

2585 cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];

2586 cnv->UCharErrorBufferLength=1;

2587 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

2588

2589 offset=0;

2590 break;

2591 }

2592 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe0 00) {

2593 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */

2594 *target++=unicodeCodeUnits[offset];

2595 if(offsets!=NULL) {

2596 *offsets++=sourceIndex;

2597 }

2598 byteIndex=0;

2599 } else if(c==0xffff) {

2600 /* callback(illegal) */

2601 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

2602 }

2603 } else if(action==MBCS_STATE_VALID_DIRECT_20 \|\|

2604 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBA CK(cnv))

2605 ) {

2606 entry=MBCS_ENTRY_FINAL_VALUE(entry);

2607 /* output surrogate pair */

2608 *target++=(UChar)(0xd800\|(UChar)(entry>>10));

2609 if(offsets!=NULL) {

2610 *offsets++=sourceIndex;

2611 }

2612 byteIndex=0;

2613 c=(UChar)(0xdc00\|(UChar)(entry&0x3ff));

2614 if(target<targetLimit) {

2615 *target++=c;

2616 if(offsets!=NULL) {

2617 *offsets++=sourceIndex;

2618 }

2619 } else {

2620 /* target overflow */

2621 cnv->UCharErrorBuffer[0]=c;

2622 cnv->UCharErrorBufferLength=1;

2623 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

2624

2625 offset=0;

2626 break;

2627 }

2628 } else if(action==MBCS_STATE_CHANGE_ONLY) {

2629 /*

2630 * This serves as a state change without any output.

2631 * It is useful for reading simple stateful encodings,

2632 * for example using just Shift-In/Shift-Out codes.

2633 * The 21 unused bits may later be used for more sophisticated

2634 * state transitions.

2635 */

2636 if(cnv->sharedData->mbcs.dbcsOnlyState==0) {

2637 byteIndex=0;

2638 } else {

2639 /* SI/SO are illegal for DBCS-only conversion */

2640 state=(uint8_t)(cnv->mode); /* restore the previous state */

2641

2642 /* callback(illegal) */

2643 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

2644 }

2645 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {

2646 if(UCNV_TO_U_USE_FALLBACK(cnv)) {

2647 /* output BMP code point */

2648 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2649 if(offsets!=NULL) {

2650 *offsets++=sourceIndex;

2651 }

2652 byteIndex=0;

2653 }

2654 } else if(action==MBCS_STATE_UNASSIGNED) {

2655 /* just fall through */

2656 } else if(action==MBCS_STATE_ILLEGAL) {

2657 /* callback(illegal) */

2658 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

2659 } else {

2660 /* reserved, must never occur */

2661 byteIndex=0;

2662 }

2663

2664 /* end of action codes: prepare for a new character */

2665 offset=0;

2666

2667 if(byteIndex==0) {

2668 sourceIndex=nextSourceIndex;

2669 } else if(U_FAILURE(*pErrorCode)) {

2670 /* callback(illegal) */

2671 if(byteIndex>1) {

2672 /*

2673 * Ticket 5691: consistent illegal sequences:

2674 * - We include at least the first byte in the illegal sequence.

2675 * - If any of the non-initial bytes could be the start of a cha racter,

2676 * we stop the illegal sequence before the first one of those.

2677 */

2678 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0) ;

2679 int8_t i;

2680 for(i=1;

2681 i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly , bytes[i]);

2682 ++i) {}

2683 if(i<byteIndex) {

2684 /* Back out some bytes. */

2685 int8_t backOutDistance=byteIndex-i;

2686 int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);

2687 byteIndex=i; /* length of reported illegal byte sequence */

2688 if(backOutDistance<=bytesFromThisBuffer) {

2689 source-=backOutDistance;

2690 } else {

2691 /* Back out bytes from the previous buffer: Need to repl ay them. */

2692 cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDi stance);

2693 /* preToULength is negative! */

2694 uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);

2695 source=(const uint8_t *)pArgs->source;

2696 }

2697 }

2698 }

2699 break;

2700 } else /* unassigned sequences indicated with byteIndex>0 */ {

2701 /* try an extension mapping */

2702 pArgs->source=(const char *)source;

2703 byteIndex=_extToU(cnv, cnv->sharedData,

2704 byteIndex, &source, sourceLimit,

2705 &target, targetLimit,

2706 &offsets, sourceIndex,

2707 pArgs->flush,

2708 pErrorCode);

2709 sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs ->source);

2710

2711 if(U_FAILURE(*pErrorCode)) {

2712 /* not mappable or buffer overflow */

2713 break;

2714 }

2715 }

2716 }

2717

2718 /* set the converter state back into UConverter */

2719 cnv->toUnicodeStatus=offset;

2720 cnv->mode=state;

2721 cnv->toULength=byteIndex;

2722

2723 /* write back the updated pointers */

2724 pArgs->source=(const char *)source;

2725 pArgs->target=target;

2726 pArgs->offsets=offsets;

2727 }

2728

2729 /*

2730 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single- state codepages.

2731 * We still need a conversion loop in case we find reserved action codes, which are to be ignored.

2732 */

2733 static UChar32

2734 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,

2735 UErrorCode *pErrorCode) {

2736 UConverter *cnv;

2737 const int32_t (*stateTable)[256];

2738 const uint8_t source, sourceLimit;

2739

2740 int32_t entry;

2741 uint8_t action;

2742

2743 /* set up the local pointers */

2744 cnv=pArgs->converter;

2745 source=(const uint8_t *)pArgs->source;

2746 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

2747 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

2748 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTa ble;

2749 } else {

2750 stateTable=cnv->sharedData->mbcs.stateTable;

2751 }

2752

2753 /* conversion loop */

2754 while(source<sourceLimit) {

2755 entry=stateTable[0][*source++];

2756 /* MBCS_ENTRY_IS_FINAL(entry) */

2757

2758 /* write back the updated pointer early so that we can return directly * /

2759 pArgs->source=(const char *)source;

2760

2761 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {

2762 /* output BMP code point */

2763 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2764 }

2765

2766 /*

2767 * An if-else-if chain provides more reliable performance for

2768 * the most common cases compared to a switch.

2769 */

2770 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

2771 if( action==MBCS_STATE_VALID_DIRECT_20 \|\|

2772 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv ))

2773 ) {

2774 /* output supplementary code point */

2775 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);

2776 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {

2777 if(UCNV_TO_U_USE_FALLBACK(cnv)) {

2778 /* output BMP code point */

2779 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2780 }

2781 } else if(action==MBCS_STATE_UNASSIGNED) {

2782 /* just fall through */

2783 } else if(action==MBCS_STATE_ILLEGAL) {

2784 /* callback(illegal) */

2785 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

2786 } else {

2787 /* reserved, must never occur */

2788 continue;

2789 }

2790

2791 if(U_FAILURE(*pErrorCode)) {

2792 /* callback(illegal) */

2793 break;

2794 } else /* unassigned sequence */ {

2795 /* defer to the generic implementation */

2796 pArgs->source=(const char *)source-1;

2797 return UCNV_GET_NEXT_UCHAR_USE_TO_U;

2798 }

2799 }

2800

2801 /* no output because of empty input or only state changes */

2802 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

2803 return 0xffff;

2804 }

2805

2806 /*

2807 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character

2808 * conversion without offset handling.

2809 *

2810 * When a character does not have a mapping to Unicode, then we return to the

2811 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback

2812 * handling.

2813 * We also defer to the generic code in other complicated cases and have them

2814 * ultimately handled by _MBCSToUnicodeWithOffsets() itself.

2815 *

2816 * All normal mappings and errors are handled here.

2817 */

2818 static UChar32

2819 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,

2820 UErrorCode *pErrorCode) {

2821 UConverter *cnv;

2822 const uint8_t source, sourceLimit, *lastSource;

2823

2824 const int32_t (*stateTable)[256];

2825 const uint16_t *unicodeCodeUnits;

2826

2827 uint32_t offset;

2828 uint8_t state;

2829

2830 int32_t entry;

2831 UChar32 c;

2832 uint8_t action;

2833

2834 /* use optimized function if possible */

2835 cnv=pArgs->converter;

2836

2837 if(cnv->preToULength>0) {

2838 /* use the generic code in ucnv_getNextUChar() to continue with a partia l match */

2839 return UCNV_GET_NEXT_UCHAR_USE_TO_U;

2840 }

2841

2842 if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {

2843 /*

2844 * Using the generic ucnv_getNextUChar() code lets us deal correctly

2845 * with the rare case of a codepage that maps single surrogates

2846 * without adding the complexity to this already complicated function he re.

2847 */

2848 return UCNV_GET_NEXT_UCHAR_USE_TO_U;

2849 } else if(cnv->sharedData->mbcs.countStates==1) {

2850 return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);

2851 }

2852

2853 /* set up the local pointers */

2854 source=lastSource=(const uint8_t *)pArgs->source;

2855 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

2856

2857 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

2858 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTa ble;

2859 } else {

2860 stateTable=cnv->sharedData->mbcs.stateTable;

2861 }

2862 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;

2863

2864 /* get the converter state from UConverter */

2865 offset=cnv->toUnicodeStatus;

2866

2867 /*

2868 * if we are in the SBCS state for a DBCS-only converter,

2869 * then load the DBCS state from the MBCS data

2870 * (dbcsOnlyState==0 if it is not a DBCS-only converter)

2871 */

2872 if((state=(uint8_t)(cnv->mode))==0) {

2873 state=cnv->sharedData->mbcs.dbcsOnlyState;

2874 }

2875

2876 /* conversion loop */

2877 c=U_SENTINEL;

2878 while(source<sourceLimit) {

2879 entry=stateTable[state][*source++];

2880 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

2881 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);

2882 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);

2883

2884 /* optimization for 1/2-byte input and BMP output */

2885 if( source<sourceLimit &&

2886 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&

2887 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&

2888 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0x fffe

2889 ) {

2890 ++source;

2891 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */

2892 /* output BMP code point */

2893 break;

2894 }

2895 } else {

2896 /* save the previous state for proper extension mapping with SI/SO-s tateful converters */

2897 cnv->mode=state;

2898

2899 /* set the next state early so that we can reuse the entry variable */

2900 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */

2901

2902 /*

2903 * An if-else-if chain provides more reliable performance for

2904 * the most common cases compared to a switch.

2905 */

2906 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

2907 if(action==MBCS_STATE_VALID_DIRECT_16) {

2908 /* output BMP code point */

2909 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2910 break;

2911 } else if(action==MBCS_STATE_VALID_16) {

2912 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);

2913 c=unicodeCodeUnits[offset];

2914 if(c<0xfffe) {

2915 /* output BMP code point */

2916 break;

2917 } else if(c==0xfffe) {

2918 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&c nv->sharedData->mbcs, offset))!=0xfffe) {

2919 break;

2920 }

2921 } else {

2922 /* callback(illegal) */

2923 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

2924 }

2925 } else if(action==MBCS_STATE_VALID_16_PAIR) {

2926 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);

2927 c=unicodeCodeUnits[offset++];

2928 if(c<0xd800) {

2929 /* output BMP code point below 0xd800 */

2930 break;

2931 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {

2932 /* output roundtrip or fallback supplementary code point */

2933 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);

2934 break;

2935 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c== 0xe000) {

2936 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */

2937 c=unicodeCodeUnits[offset];

2938 break;

2939 } else if(c==0xffff) {

2940 /* callback(illegal) */

2941 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

2942 }

2943 } else if(action==MBCS_STATE_VALID_DIRECT_20 \|\|

2944 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FA LLBACK(cnv))

2945 ) {

2946 /* output supplementary code point */

2947 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);

2948 break;

2949 } else if(action==MBCS_STATE_CHANGE_ONLY) {

2950 /*

2951 * This serves as a state change without any output.

2952 * It is useful for reading simple stateful encodings,

2953 * for example using just Shift-In/Shift-Out codes.

2954 * The 21 unused bits may later be used for more sophisticated

2955 * state transitions.

2956 */

2957 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {

2958 /* SI/SO are illegal for DBCS-only conversion */

2959 state=(uint8_t)(cnv->mode); /* restore the previous state */

2960

2961 /* callback(illegal) */

2962 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

2963 }

2964 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {

2965 if(UCNV_TO_U_USE_FALLBACK(cnv)) {

2966 /* output BMP code point */

2967 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

2968 break;

2969 }

2970 } else if(action==MBCS_STATE_UNASSIGNED) {

2971 /* just fall through */

2972 } else if(action==MBCS_STATE_ILLEGAL) {

2973 /* callback(illegal) */

2974 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

2975 } else {

2976 /* reserved (must never occur), or only state change */

2977 offset=0;

2978 lastSource=source;

2979 continue;

2980 }

2981

2982 /* end of action codes: prepare for a new character */

2983 offset=0;

2984

2985 if(U_FAILURE(*pErrorCode)) {

2986 /* callback(illegal) */

2987 break;

2988 } else /* unassigned sequence */ {

2989 /* defer to the generic implementation */

2990 cnv->toUnicodeStatus=0;

2991 cnv->mode=state;

2992 pArgs->source=(const char *)lastSource;

2993 return UCNV_GET_NEXT_UCHAR_USE_TO_U;

2994 }

2995 }

2996 }

2997

2998 if(c<0) {

2999 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {

3000 /* incomplete character byte sequence */

3001 uint8_t *bytes=cnv->toUBytes;

3002 cnv->toULength=(int8_t)(source-lastSource);

3003 do {

3004 bytes++=lastSource++;

3005 } while(lastSource<source);

3006 *pErrorCode=U_TRUNCATED_CHAR_FOUND;

3007 } else if(U_FAILURE(*pErrorCode)) {

3008 /* callback(illegal) */

3009 /*

3010 * Ticket 5691: consistent illegal sequences:

3011 * - We include at least the first byte in the illegal sequence.

3012 * - If any of the non-initial bytes could be the start of a charact er,

3013 * we stop the illegal sequence before the first one of those.

3014 */

3015 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);

3016 uint8_t *bytes=cnv->toUBytes;

3017 bytes++=lastSource++; /* first byte */

3018 if(lastSource==source) {

3019 cnv->toULength=1;

3020 } else /* lastSource<source: multi-byte character */ {

3021 int8_t i;

3022 for(i=1;

3023 lastSource<source && !isSingleOrLead(stateTable, state, isDB CSOnly, *lastSource);

3024 ++i

3025 ) {

3026 bytes++=lastSource++;

3027 }

3028 cnv->toULength=i;

3029 source=lastSource;

3030 }

3031 } else {

3032 /* no output because of empty input or only state changes */

3033 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

3034 }

3035 c=0xffff;

3036 }

3037

3038 /* set the converter state back into UConverter, ready for a new character * /

3039 cnv->toUnicodeStatus=0;

3040 cnv->mode=state;

3041

3042 /* write back the updated pointer */

3043 pArgs->source=(const char *)source;

3044 return c;

3045 }

3046

#if 0
/*
 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
 * Removal improves code coverage.
 */
/**
 * Single-byte, single-state variant of ucnv_MBCSSimpleGetNextUChar().
 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
 * It does not handle conversion extensions (_extToU()).
 *
 * sharedData:  converter data whose state table row 0 is consulted
 * b:           the input byte
 * useFallback: passed to TO_U_USE_FALLBACK() to decide about fallback mappings
 *
 * Returns the code point, 0xfffe for unassigned (or a fallback that is not
 * permitted), or 0xffff for illegal and reserved entries.
 */
U_CFUNC UChar32
ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
                              uint8_t b, UBool useFallback) {
    const int32_t finalEntry=sharedData->mbcs.stateTable[0][b];
    uint8_t actionCode;
    /* MBCS_ENTRY_IS_FINAL(finalEntry) */

    /* most common case first: direct BMP roundtrip */
    if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(finalEntry)) {
        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(finalEntry);
    }

    /*
     * Separate tests in order of expected frequency rather than a switch,
     * for more reliable performance in the common cases.
     */
    actionCode=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(finalEntry));

    if(action_is_direct_20:
       actionCode==MBCS_STATE_VALID_DIRECT_20) {
        /* roundtrip supplementary code point */
        return 0x10000+MBCS_ENTRY_FINAL_VALUE(finalEntry);
    }
    if(actionCode==MBCS_STATE_FALLBACK_DIRECT_16) {
        /* fallback BMP code point, or "unassigned" if fallbacks are off */
        return TO_U_USE_FALLBACK(useFallback) ?
            (UChar32)(UChar)MBCS_ENTRY_FINAL_VALUE_16(finalEntry) :
            (UChar32)0xfffe;
    }
    if(actionCode==MBCS_STATE_FALLBACK_DIRECT_20) {
        /* fallback supplementary code point, or "unassigned" if fallbacks are off */
        return TO_U_USE_FALLBACK(useFallback) ?
            (UChar32)(0x10000+MBCS_ENTRY_FINAL_VALUE(finalEntry)) :
            (UChar32)0xfffe;
    }
    if(actionCode==MBCS_STATE_UNASSIGNED) {
        return 0xfffe;
    }

    /* MBCS_STATE_ILLEGAL, or a reserved action code that must never occur */
    return 0xffff;
}
#endif

3101

3102 /*

3103 * This is a simple version of _MBCSGetNextUChar() that is used

3104 * by other converter implementations.

3105 * It only returns an "assigned" result if it consumes the entire input.

3106 * It does not use state from the converter, nor error codes.

3107 * It does not handle the EBCDIC swaplfnl option (set in UConverter).

3108 * It handles conversion extensions but not GB 18030.

3109 *

3110 * Return value:

3111 * U+fffe unassigned

3112 * U+ffff illegal

3113 * otherwise the Unicode code point

3114 */

3115 U_CFUNC UChar32

3116 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,

3117 const char *source, int32_t length,

3118 UBool useFallback) {

3119 const int32_t (*stateTable)[256];

3120 const uint16_t *unicodeCodeUnits;

3121

3122 uint32_t offset;

3123 uint8_t state, action;

3124

3125 UChar32 c;

3126 int32_t i, entry;

3127

3128 if(length<=0) {

3129 /* no input at all: "illegal" */

3130 return 0xffff;

3131 }

3132

3133 #if 0

3134 /*

3135 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. ma rkus

3136 * TODO In future releases, verify that this function is never called for SBCS

3137 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.

3138 * Removal improves code coverage.

3139 */

3140 /* use optimized function if possible */

3141 if(sharedData->mbcs.countStates==1) {

3142 if(length==1) {

3143 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*sourc e, useFallback);

3144 } else {

3145 return 0xffff; /* illegal: more than a single byte for an SBCS conve rter */

3146 }

3147 }

3148 #endif

3149

3150 /* set up the local pointers */

3151 stateTable=sharedData->mbcs.stateTable;

3152 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;

3153

3154 /* converter state */

3155 offset=0;

3156 state=sharedData->mbcs.dbcsOnlyState;

3157

3158 /* conversion loop */

3159 for(i=0;;) {

3160 entry=stateTable[state][(uint8_t)source[i++]];

3161 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

3162 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);

3163 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);

3164

3165 if(i==length) {

3166 return 0xffff; /* truncated character */

3167 }

3168 } else {

3169 /*

3170 * An if-else-if chain provides more reliable performance for

3171 * the most common cases compared to a switch.

3172 */

3173 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

3174 if(action==MBCS_STATE_VALID_16) {

3175 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);

3176 c=unicodeCodeUnits[offset];

3177 if(c!=0xfffe) {

3178 /* done */

3179 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {

3180 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);

3181 /* else done with 0xfffe */

3182 }

3183 break;

3184 } else if(action==MBCS_STATE_VALID_DIRECT_16) {

3185 /* output BMP code point */

3186 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

3187 break;

3188 } else if(action==MBCS_STATE_VALID_16_PAIR) {

3189 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);

3190 c=unicodeCodeUnits[offset++];

3191 if(c<0xd800) {

3192 /* output BMP code point below 0xd800 */

3193 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {

3194 /* output roundtrip or fallback supplementary code point */

3195 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x1000 0-0xdc00));

3196 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c== 0xe000) {

3197 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */

3198 c=unicodeCodeUnits[offset];

3199 } else if(c==0xffff) {

3200 return 0xffff;

3201 } else {

3202 c=0xfffe;

3203 }

3204 break;

3205 } else if(action==MBCS_STATE_VALID_DIRECT_20) {

3206 /* output supplementary code point */

3207 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);

3208 break;

3209 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {

3210 if(!TO_U_USE_FALLBACK(useFallback)) {

3211 c=0xfffe;

3212 break;

3213 }

3214 /* output BMP code point */

3215 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

3216 break;

3217 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {

3218 if(!TO_U_USE_FALLBACK(useFallback)) {

3219 c=0xfffe;

3220 break;

3221 }

3222 /* output supplementary code point */

3223 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);

3224 break;

3225 } else if(action==MBCS_STATE_UNASSIGNED) {

3226 c=0xfffe;

3227 break;

3228 }

3229

3230 /*

3231 * forbid MBCS_STATE_CHANGE_ONLY for this function,

3232 * and MBCS_STATE_ILLEGAL and reserved action codes

3233 */

3234 return 0xffff;

3235 }

3236 }

3237

3238 if(i!=length) {

3239 /* illegal for this function: not all input consumed */

3240 return 0xffff;

3241 }

3242

3243 if(c==0xfffe) {

3244 /* try an extension mapping */

3245 const int32_t *cx=sharedData->mbcs.extIndexes;

3246 if(cx!=NULL) {

3247 return ucnv_extSimpleMatchToU(cx, source, length, useFallback);

3248 }

3249 }

3250

3251 return c;

3252 }

3253

3254 /* MBCS-from-Unicode conversion functions ----------------------------------- */

3255

3256 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byt e codepages. */

3257 static void

3258 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

3259 UErrorCode *pErrorCode) {

3260 UConverter *cnv;

3261 const UChar source, sourceLimit;

3262 uint8_t *target;

3263 int32_t targetCapacity;

3264 int32_t *offsets;

3265

3266 const uint16_t *table;

3267 const uint16_t *mbcsIndex;

3268 const uint8_t *bytes;

3269

3270 UChar32 c;

3271

3272 int32_t sourceIndex, nextSourceIndex;

3273

3274 uint32_t stage2Entry;

3275 uint32_t asciiRoundtrips;

3276 uint32_t value;

3277 uint8_t unicodeMask;

3278

3279 /* use optimized function if possible */

3280 cnv=pArgs->converter;

3281 unicodeMask=cnv->sharedData->mbcs.unicodeMask;

3282

3283 /* set up the local pointers */

3284 source=pArgs->source;

3285 sourceLimit=pArgs->sourceLimit;

3286 target=(uint8_t *)pArgs->target;

3287 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

3288 offsets=pArgs->offsets;

3289

3290 table=cnv->sharedData->mbcs.fromUnicodeTable;

3291 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;

3292 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

3293 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;

3294 } else {

3295 bytes=cnv->sharedData->mbcs.fromUnicodeBytes;

3296 }

3297 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

3298

3299 /* get the converter state from UConverter */

3300 c=cnv->fromUChar32;

3301

3302 /* sourceIndex=-1 if the current character began in the previous buffer */

3303 sourceIndex= c==0 ? 0 : -1;

3304 nextSourceIndex=0;

3305

3306 /* conversion loop */

3307 if(c!=0 && targetCapacity>0) {

3308 goto getTrail;

3309 }

3310

3311 while(source<sourceLimit) {

3312 /*

3313 * This following test is to see if available input would overflow the o utput.

3314 * It does not catch output of more than one byte that

3315 * overflows as a result of a multi-byte character or callback output

3316 * from the last source character.

3317 * Therefore, those situations also test for overflows and will

3318 * then break the loop, too.

3319 */

3320 if(targetCapacity>0) {

3321 /*

3322 * Get a correct Unicode code point:

3323 * a single UChar for a BMP code point or

3324 * a matched surrogate pair for a "supplementary code point".

3325 */

3326 c=*source++;

3327 ++nextSourceIndex;

3328 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {

3329 *target++=(uint8_t)c;

3330 if(offsets!=NULL) {

3331 *offsets++=sourceIndex;

3332 sourceIndex=nextSourceIndex;

3333 }

3334 --targetCapacity;

3335 c=0;

3336 continue;

3337 }

3338 /*

3339 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX

3340 * to avoid dealing with surrogates.

3341 * MBCS_FAST_MAX must be >=0xd7ff.

3342 */

3343 if(c<=0xd7ff) {

3344 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)byt es, c);

3345 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */

3346 if(value==0) {

3347 goto unassigned;

3348 }

3349 /* output the value */

3350 } else {

3351 /*

3352 * This also tests if the codepage maps single surrogates.

3353 * If it does, then surrogates are not paired but mapped separat ely.

3354 * Note that in this case unmatched surrogates are not detected.

3355 */

3356 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {

3357 if(U16_IS_SURROGATE_LEAD(c)) {

3358 getTrail:

3359 if(source<sourceLimit) {

3360 /* test the following code unit */

3361 UChar trail=*source;

3362 if(U16_IS_TRAIL(trail)) {

3363 ++source;

3364 ++nextSourceIndex;

3365 c=U16_GET_SUPPLEMENTARY(c, trail);

3366 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {

3367 /* BMP-only codepages are stored without sta ge 1 entries for supplementary code points */

3368 /* callback(unassigned) */

3369 goto unassigned;

3370 }

3371 /* convert this supplementary code point */

3372 /* exit this condition tree */

3373 } else {

3374 /* this is an unmatched lead code unit (1st surr ogate) */

3375 /* callback(illegal) */

3376 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

3377 break;

3378 }

3379 } else {

3380 /* no more input */

3381 break;

3382 }

3383 } else {

3384 /* this is an unmatched trail code unit (2nd surrogate) */

3385 /* callback(illegal) */

3386 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

3387 break;

3388 }

3389 }

3390

3391 /* convert the Unicode code point in c into codepage bytes */

3392 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);

3393

3394 /* get the bytes and the length for the output */

3395 /* MBCS_OUTPUT_2 */

3396 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);

3397

3398 /* is this code point assigned, or do we use fallbacks? */

3399 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \|\|

3400 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))

3401 ) {

3402 /*

3403 * We allow a 0 byte output if the "assigned" bit is set for this entry.

3404 * There is no way with this data structure for fallback out put

3405 * to be a zero byte.

3406 */

3407

3408 unassigned:

3409 /* try an extension mapping */

3410 pArgs->source=source;

3411 c=_extFromU(cnv, cnv->sharedData,

3412 c, &source, sourceLimit,

3413 &target, target+targetCapacity,

3414 &offsets, sourceIndex,

3415 pArgs->flush,

3416 pErrorCode);

3417 nextSourceIndex+=(int32_t)(source-pArgs->source);

3418

3419 if(U_FAILURE(*pErrorCode)) {

3420 /* not mappable or buffer overflow */

3421 break;

3422 } else {

3423 /* a mapping was written to the target, continue */

3424

3425 /* recalculate the targetCapacity after an extension map ping */

3426 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)targ et);

3427

3428 /* normal end of conversion: prepare for a new character */

3429 sourceIndex=nextSourceIndex;

3430 continue;

3431 }

3432 }

3433 }

3434

3435 /* write the output character bytes from value and length */

3436 /* from the first if in the loop we know that targetCapacity>0 */

3437 if(value<=0xff) {

3438 /* this is easy because we know that there is enough space */

3439 *target++=(uint8_t)value;

3440 if(offsets!=NULL) {

3441 *offsets++=sourceIndex;

3442 }

3443 --targetCapacity;

3444 } else /* length==2 */ {

3445 *target++=(uint8_t)(value>>8);

3446 if(2<=targetCapacity) {

3447 *target++=(uint8_t)value;

3448 if(offsets!=NULL) {

3449 *offsets++=sourceIndex;

3450 *offsets++=sourceIndex;

3451 }

3452 targetCapacity-=2;

3453 } else {

3454 if(offsets!=NULL) {

3455 *offsets++=sourceIndex;

3456 }

3457 cnv->charErrorBuffer[0]=(char)value;

3458 cnv->charErrorBufferLength=1;

3459

3460 /* target overflow */

3461 targetCapacity=0;

3462 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

3463 c=0;

3464 break;

3465 }

3466 }

3467

3468 /* normal end of conversion: prepare for a new character */

3469 c=0;

3470 sourceIndex=nextSourceIndex;

3471 continue;

3472 } else {

3473 /* target is full */

3474 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

3475 break;

3476 }

3477 }

3478

3479 /* set the converter state back into UConverter */

3480 cnv->fromUChar32=c;

3481

3482 /* write back the updated pointers */

3483 pArgs->source=source;

3484 pArgs->target=(char *)target;

3485 pArgs->offsets=offsets;

3486 }

3487

3488 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byt e codepages. */

3489 static void

3490 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

3491 UErrorCode *pErrorCode) {

3492 UConverter *cnv;

3493 const UChar source, sourceLimit;

3494 uint8_t *target;

3495 int32_t targetCapacity;

3496 int32_t *offsets;

3497

3498 const uint16_t *table;

3499 const uint16_t *results;

3500

3501 UChar32 c;

3502

3503 int32_t sourceIndex, nextSourceIndex;

3504

3505 uint16_t value, minValue;

3506 UBool hasSupplementary;

3507

3508 /* set up the local pointers */

3509 cnv=pArgs->converter;

3510 source=pArgs->source;

3511 sourceLimit=pArgs->sourceLimit;

3512 target=(uint8_t *)pArgs->target;

3513 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

3514 offsets=pArgs->offsets;

3515

3516 table=cnv->sharedData->mbcs.fromUnicodeTable;

3517 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

3518 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;

3519 } else {

3520 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;

3521 }

3522

3523 if(cnv->useFallback) {

3524 /* use all roundtrip and fallback results */

3525 minValue=0x800;

3526 } else {

3527 /* use only roundtrips and fallbacks from private-use characters */

3528 minValue=0xc00;

3529 }

3530 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEME NTARY);

3531

3532 /* get the converter state from UConverter */

3533 c=cnv->fromUChar32;

3534

3535 /* sourceIndex=-1 if the current character began in the previous buffer */

3536 sourceIndex= c==0 ? 0 : -1;

3537 nextSourceIndex=0;

3538

3539 /* conversion loop */

3540 if(c!=0 && targetCapacity>0) {

3541 goto getTrail;

3542 }

3543

3544 while(source<sourceLimit) {

3545 /*

3546 * This following test is to see if available input would overflow the o utput.

3547 * It does not catch output of more than one byte that

3548 * overflows as a result of a multi-byte character or callback output

3549 * from the last source character.

3550 * Therefore, those situations also test for overflows and will

3551 * then break the loop, too.

3552 */

3553 if(targetCapacity>0) {

3554 /*

3555 * Get a correct Unicode code point:

3556 * a single UChar for a BMP code point or

3557 * a matched surrogate pair for a "supplementary code point".

3558 */

3559 c=*source++;

3560 ++nextSourceIndex;

3561 if(U16_IS_SURROGATE(c)) {

3562 if(U16_IS_SURROGATE_LEAD(c)) {

3563 getTrail:

3564 if(source<sourceLimit) {

3565 /* test the following code unit */

3566 UChar trail=*source;

3567 if(U16_IS_TRAIL(trail)) {

3568 ++source;

3569 ++nextSourceIndex;

3570 c=U16_GET_SUPPLEMENTARY(c, trail);

3571 if(!hasSupplementary) {

3572 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */

3573 /* callback(unassigned) */

3574 goto unassigned;

3575 }

3576 /* convert this supplementary code point */

3577 /* exit this condition tree */

3578 } else {

3579 /* this is an unmatched lead code unit (1st surrogat e) */

3580 /* callback(illegal) */

3581 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

3582 break;

3583 }

3584 } else {

3585 /* no more input */

3586 break;

3587 }

3588 } else {

3589 /* this is an unmatched trail code unit (2nd surrogate) */

3590 /* callback(illegal) */

3591 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

3592 break;

3593 }

3594 }

3595

3596 /* convert the Unicode code point in c into codepage bytes */

3597 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

3598

3599 /* is this code point assigned, or do we use fallbacks? */

3600 if(value>=minValue) {

3601 /* assigned, write the output character bytes from value and len gth */

3602 /* length==1 */

3603 /* this is easy because we know that there is enough space */

3604 *target++=(uint8_t)value;

3605 if(offsets!=NULL) {

3606 *offsets++=sourceIndex;

3607 }

3608 --targetCapacity;

3609

3610 /* normal end of conversion: prepare for a new character */

3611 c=0;

3612 sourceIndex=nextSourceIndex;

3613 } else { /* unassigned */

3614 unassigned:

3615 /* try an extension mapping */

3616 pArgs->source=source;

3617 c=_extFromU(cnv, cnv->sharedData,

3618 c, &source, sourceLimit,

3619 &target, target+targetCapacity,

3620 &offsets, sourceIndex,

3621 pArgs->flush,

3622 pErrorCode);

3623 nextSourceIndex+=(int32_t)(source-pArgs->source);

3624

3625 if(U_FAILURE(*pErrorCode)) {

3626 /* not mappable or buffer overflow */

3627 break;

3628 } else {

3629 /* a mapping was written to the target, continue */

3630

3631 /* recalculate the targetCapacity after an extension mapping */

3632 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);

3633

3634 /* normal end of conversion: prepare for a new character */

3635 sourceIndex=nextSourceIndex;

3636 }

3637 }

3638 } else {

3639 /* target is full */

3640 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

3641 break;

3642 }

3643 }

3644

3645 /* set the converter state back into UConverter */

3646 cnv->fromUChar32=c;

3647

3648 /* write back the updated pointers */

3649 pArgs->source=source;

3650 pArgs->target=(char *)target;

3651 pArgs->offsets=offsets;

3652 }

3653

3654 /*

3655 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages

3656 * that map only to and from the BMP.

3657 * In addition to single-byte/state optimizations, the offset calculations

3658 * become much easier.

3659 * It would be possible to use the sbcsIndex for UTF-8-friendly tables,

3660 * but measurements have shown that this diminishes performance

3661 * in more cases than it improves it.

3662 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches

3663 * for various MBCS and SBCS optimizations.

3664 */

3665 static void

3666 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,

3667 UErrorCode *pErrorCode) {

3668 UConverter *cnv;

3669 const UChar source, sourceLimit, *lastSource;

3670 uint8_t *target;

3671 int32_t targetCapacity, length;

3672 int32_t *offsets;

3673

3674 const uint16_t *table;

3675 const uint16_t *results;

3676

3677 UChar32 c;

3678

3679 int32_t sourceIndex;

3680

3681 uint32_t asciiRoundtrips;

3682 uint16_t value, minValue;

3683

3684 /* set up the local pointers */

3685 cnv=pArgs->converter;

3686 source=pArgs->source;

3687 sourceLimit=pArgs->sourceLimit;

3688 target=(uint8_t *)pArgs->target;

3689 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

3690 offsets=pArgs->offsets;

3691

3692 table=cnv->sharedData->mbcs.fromUnicodeTable;

3693 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

3694 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;

3695 } else {

3696 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;

3697 }

3698 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

3699

3700 if(cnv->useFallback) {

3701 /* use all roundtrip and fallback results */

3702 minValue=0x800;

3703 } else {

3704 /* use only roundtrips and fallbacks from private-use characters */

3705 minValue=0xc00;

3706 }

3707

3708 /* get the converter state from UConverter */

3709 c=cnv->fromUChar32;

3710

3711 /* sourceIndex=-1 if the current character began in the previous buffer */

3712 sourceIndex= c==0 ? 0 : -1;

3713 lastSource=source;

3714

3715 /*

3716 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter

3717 * for the minimum of the sourceLength and targetCapacity

3718 */

3719 length=(int32_t)(sourceLimit-source);

3720 if(length<targetCapacity) {

3721 targetCapacity=length;

3722 }

3723

3724 /* conversion loop */

3725 if(c!=0 && targetCapacity>0) {

3726 goto getTrail;

3727 }

3728

3729 #if MBCS_UNROLL_SINGLE_FROM_BMP

3730 /* unrolling makes it slower on Pentium III/Windows 2000?! */

3731 /* unroll the loop with the most common case */

3732 unrolled:

3733 if(targetCapacity>=4) {

3734 int32_t count, loops;

3735 uint16_t andedValues;

3736

3737 loops=count=targetCapacity>>2;

3738 do {

3739 c=*source++;

3740 andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

3741 *target++=(uint8_t)value;

3742 c=*source++;

3743 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

3744 *target++=(uint8_t)value;

3745 c=*source++;

3746 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

3747 *target++=(uint8_t)value;

3748 c=*source++;

3749 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

3750 *target++=(uint8_t)value;

3751

3752 /* were all 4 entries really valid? */

3753 if(andedValues<minValue) {

3754 /* no, return to the first of these 4 */

3755 source-=4;

3756 target-=4;

3757 break;

3758 }

3759 } while(--count>0);

3760 count=loops-count;

3761 targetCapacity-=4*count;

3762

3763 if(offsets!=NULL) {

3764 lastSource+=4*count;

3765 while(count>0) {

3766 *offsets++=sourceIndex++;

3767 *offsets++=sourceIndex++;

3768 *offsets++=sourceIndex++;

3769 *offsets++=sourceIndex++;

3770 --count;

3771 }

3772 }

3773

3774 c=0;

3775 }

3776 #endif

3777

3778 while(targetCapacity>0) {

3779 /*

3780 * Get a correct Unicode code point:

3781 * a single UChar for a BMP code point or

3782 * a matched surrogate pair for a "supplementary code point".

3783 */

3784 c=*source++;

3785 /*

3786 * Do not immediately check for single surrogates:

3787 * Assume that they are unassigned and check for them in that case.

3788 * This speeds up the conversion of assigned characters.

3789 */

3790 /* convert the Unicode code point in c into codepage bytes */

3791 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {

3792 *target++=(uint8_t)c;

3793 --targetCapacity;

3794 c=0;

3795 continue;

3796 }

3797 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

3798 /* is this code point assigned, or do we use fallbacks? */

3799 if(value>=minValue) {

3800 /* assigned, write the output character bytes from value and length */

3801 /* length==1 */

3802 /* this is easy because we know that there is enough space */

3803 *target++=(uint8_t)value;

3804 --targetCapacity;

3805

3806 /* normal end of conversion: prepare for a new character */

3807 c=0;

3808 continue;

3809 } else if(!U16_IS_SURROGATE(c)) {

3810 /* normal, unassigned BMP character */

3811 } else if(U16_IS_SURROGATE_LEAD(c)) {

3812 getTrail:

3813 if(source<sourceLimit) {

3814 /* test the following code unit */

3815 UChar trail=*source;

3816 if(U16_IS_TRAIL(trail)) {

3817 ++source;

3818 c=U16_GET_SUPPLEMENTARY(c, trail);

3819 /* this codepage does not map supplementary code points */

3820 /* callback(unassigned) */

3821 } else {

3822 /* this is an unmatched lead code unit (1st surrogate) */

3823 /* callback(illegal) */

3824 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

3825 break;

3826 }

3827 } else {

3828 /* no more input */

3829 if (pArgs->flush) {

3830 *pErrorCode=U_TRUNCATED_CHAR_FOUND;

3831 }

3832 break;

3833 }

3834 } else {

3835 /* this is an unmatched trail code unit (2nd surrogate) */

3836 /* callback(illegal) */

3837 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

3838 break;

3839 }

3840

3841 /* c does not have a mapping */

3842

3843 /* get the number of code units for c to correctly advance sourceIndex * /

3844 length=U16_LENGTH(c);

3845

3846 /* set offsets since the start or the last extension */

3847 if(offsets!=NULL) {

3848 int32_t count=(int32_t)(source-lastSource);

3849

3850 /* do not set the offset for this character */

3851 count-=length;

3852

3853 while(count>0) {

3854 *offsets++=sourceIndex++;

3855 --count;

3856 }

3857 /* offsets and sourceIndex are now set for the current character */

3858 }

3859

3860 /* try an extension mapping */

3861 lastSource=source;

3862 c=_extFromU(cnv, cnv->sharedData,

3863 c, &source, sourceLimit,

3864 &target, (const uint8_t *)(pArgs->targetLimit),

3865 &offsets, sourceIndex,

3866 pArgs->flush,

3867 pErrorCode);

3868 sourceIndex+=length+(int32_t)(source-lastSource);

3869 lastSource=source;

3870

3871 if(U_FAILURE(*pErrorCode)) {

3872 /* not mappable or buffer overflow */

3873 break;

3874 } else {

3875 /* a mapping was written to the target, continue */

3876

3877 /* recalculate the targetCapacity after an extension mapping */

3878 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);

3879 length=(int32_t)(sourceLimit-source);

3880 if(length<targetCapacity) {

3881 targetCapacity=length;

3882 }

3883 }

3884

3885 #if MBCS_UNROLL_SINGLE_FROM_BMP

3886 /* unrolling makes it slower on Pentium III/Windows 2000?! */

3887 goto unrolled;

3888 #endif

3889 }

3890

3891 if(U_SUCCESS(pErrorCode) && source<sourceLimit && target>=(uint8_t )pArgs- >targetLimit) {

3892 /* target is full */

3893 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

3894 }

3895

3896 /* set offsets since the start or the last callback */

3897 if(offsets!=NULL) {

3898 size_t count=source-lastSource;

3899 if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {

3900 /*

3901 Caller gave us a partial supplementary character,

3902 which this function couldn't convert in any case.

3903 The callback will handle the offset.

3904 */

3905 count--;

3906 }

3907 while(count>0) {

3908 *offsets++=sourceIndex++;

3909 --count;

3910 }

3911 }

3912

3913 /* set the converter state back into UConverter */

3914 cnv->fromUChar32=c;

3915

3916 /* write back the updated pointers */

3917 pArgs->source=source;

3918 pArgs->target=(char *)target;

3919 pArgs->offsets=offsets;

3920 }

3921

3922 U_CFUNC void

3923 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

3924 UErrorCode *pErrorCode) {

3925 UConverter *cnv;

3926 const UChar source, sourceLimit;

3927 uint8_t *target;

3928 int32_t targetCapacity;

3929 int32_t *offsets;

3930

3931 const uint16_t *table;

3932 const uint16_t *mbcsIndex;

3933 const uint8_t p, bytes;

3934 uint8_t outputType;

3935

3936 UChar32 c;

3937

3938 int32_t prevSourceIndex, sourceIndex, nextSourceIndex;

3939

3940 uint32_t stage2Entry;

3941 uint32_t asciiRoundtrips;

3942 uint32_t value;

3943 /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */

3944 uint8_t siBytes[2] = {0, 0};

3945 uint8_t soBytes[2] = {0, 0};

3946 uint8_t siLength, soLength;

3947 int32_t length = 0, prevLength;

3948 uint8_t unicodeMask;

3949

3950 cnv=pArgs->converter;

3951

3952 if(cnv->preFromUFirstCP>=0) {

3953 /*

3954 * pass sourceIndex=-1 because we continue from an earlier buffer

3955 * in the future, this may change with continuous offsets

3956 */

3957 ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);

3958

3959 if(U_FAILURE(*pErrorCode) \|\| cnv->preFromULength<0) {

3960 return;

3961 }

3962 }

3963

3964 /* use optimized function if possible */

3965 outputType=cnv->sharedData->mbcs.outputType;

3966 unicodeMask=cnv->sharedData->mbcs.unicodeMask;

3967 if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {

3968 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {

3969 ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);

3970 } else {

3971 ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);

3972 }

3973 return;

3974 } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {

3975 ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);

3976 return;

3977 }

3978

3979 /* set up the local pointers */

3980 source=pArgs->source;

3981 sourceLimit=pArgs->sourceLimit;

3982 target=(uint8_t *)pArgs->target;

3983 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

3984 offsets=pArgs->offsets;

3985

3986 table=cnv->sharedData->mbcs.fromUnicodeTable;

3987 if(cnv->sharedData->mbcs.utf8Friendly) {

3988 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;

3989 } else {

3990 mbcsIndex=NULL;

3991 }

3992 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

3993 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;

3994 } else {

3995 bytes=cnv->sharedData->mbcs.fromUnicodeBytes;

3996 }

3997 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

3998

3999 /* get the converter state from UConverter */

4000 c=cnv->fromUChar32;

4001

4002 if(outputType==MBCS_OUTPUT_2_SISO) {

4003 prevLength=cnv->fromUnicodeStatus;

4004 if(prevLength==0) {

4005 /* set the real value */

4006 prevLength=1;

4007 }

4008 } else {

4009 /* prevent fromUnicodeStatus from being set to something non-0 */

4010 prevLength=0;

4011 }

4012

4013 /* sourceIndex=-1 if the current character began in the previous buffer */

4014 prevSourceIndex=-1;

4015 sourceIndex= c==0 ? 0 : -1;

4016 nextSourceIndex=0;

4017

4018 /* Get the SI/SO character for the converter */

4019 siLength = getSISOBytes(SI, cnv->options, siBytes);

4020 soLength = getSISOBytes(SO, cnv->options, soBytes);

4021

4022 /* conversion loop */

4023 /*

4024 * This is another piece of ugly code:

4025 * A goto into the loop if the converter state contains a first surrogate

4026 * from the previous function call.

4027 * It saves me to check in each loop iteration a check of if(c==0)

4028 * and duplicating the trail-surrogate-handling code in the else

4029 * branch of that check.

4030 * I could not find any other way to get around this other than

4031 * using a function call for the conversion and callback, which would

4032 * be even more inefficient.

4033 *

4034 * Markus Scherer 2000-jul-19

4035 */

4036 if(c!=0 && targetCapacity>0) {

4037 goto getTrail;

4038 }

4039

4040 while(source<sourceLimit) {

4041 /*

4042 * This following test is to see if available input would overflow the o utput.

4043 * It does not catch output of more than one byte that

4044 * overflows as a result of a multi-byte character or callback output

4045 * from the last source character.

4046 * Therefore, those situations also test for overflows and will

4047 * then break the loop, too.

4048 */

4049 if(targetCapacity>0) {

4050 /*

4051 * Get a correct Unicode code point:

4052 * a single UChar for a BMP code point or

4053 * a matched surrogate pair for a "supplementary code point".

4054 */

4055 c=*source++;

4056 ++nextSourceIndex;

4057 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {

4058 *target++=(uint8_t)c;

4059 if(offsets!=NULL) {

4060 *offsets++=sourceIndex;

4061 prevSourceIndex=sourceIndex;

4062 sourceIndex=nextSourceIndex;

4063 }

4064 --targetCapacity;

4065 c=0;

4066 continue;

4067 }

4068 /*

4069 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX

4070 * to avoid dealing with surrogates.

4071 * MBCS_FAST_MAX must be >=0xd7ff.

4072 */

4073 if(c<=0xd7ff && mbcsIndex!=NULL) {

4074 value=mbcsIndex[c>>6];

4075

4076 /* get the bytes and the length for the output (copied from belo w and adapted for utf8Friendly data) */

4077 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */

4078 switch(outputType) {

4079 case MBCS_OUTPUT_2:

4080 value=((const uint16_t *)bytes)[value +(c&0x3f)];

4081 if(value<=0xff) {

4082 if(value==0) {

4083 goto unassigned;

4084 } else {

4085 length=1;

4086 }

4087 } else {

4088 length=2;

4089 }

4090 break;

4091 case MBCS_OUTPUT_2_SISO:

4092 /* 1/2-byte stateful with Shift-In/Shift-Out */

4093 /*

4094 * Save the old state in the converter object

4095 * right here, then change the local prevLength state variab le if necessary.

4096 * Then, if this character turns out to be unassigned or a f allback that

4097 * is not taken, the callback code must not save the new sta te in the converter

4098 * because the new state is for a character that is not outp ut.

4099 * However, the callback must still restore the state from t he converter

4100 * in case the callback function changed it for its output.

4101 */

4102 cnv->fromUnicodeStatus=prevLength; /* save the old state */

4103 value=((const uint16_t *)bytes)[value +(c&0x3f)];

4104 if(value<=0xff) {

4105 if(value==0) {

4106 goto unassigned;

4107 } else if(prevLength<=1) {

4108 length=1;

4109 } else {

4110 /* change from double-byte mode to single-byte */

4111 if (siLength == 1) {

4112 value\|=(uint32_t)siBytes[0]<<8;

4113 length = 2;

4114 } else if (siLength == 2) {

4115 value\|=(uint32_t)siBytes[1]<<8;

4116 value\|=(uint32_t)siBytes[0]<<16;

4117 length = 3;

4118 }

4119 prevLength=1;

4120 }

4121 } else {

4122 if(prevLength==2) {

4123 length=2;

4124 } else {

4125 /* change from single-byte mode to double-byte */

4126 if (soLength == 1) {

4127 value\|=(uint32_t)soBytes[0]<<16;

4128 length = 3;

4129 } else if (soLength == 2) {

4130 value\|=(uint32_t)soBytes[1]<<16;

4131 value\|=(uint32_t)soBytes[0]<<24;

4132 length = 4;

4133 }

4134 prevLength=2;

4135 }

4136 }

4137 break;

4138 case MBCS_OUTPUT_DBCS_ONLY:

4139 /* table with single-byte results, but only DBCS mappings us ed */

4140 value=((const uint16_t *)bytes)[value +(c&0x3f)];

4141 if(value<=0xff) {

4142 /* no mapping or SBCS result, not taken for DBCS-only */

4143 goto unassigned;

4144 } else {

4145 length=2;

4146 }

4147 break;

4148 case MBCS_OUTPUT_3:

4149 p=bytes+(value+(c&0x3f))*3;

4150 value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];

4151 if(value<=0xff) {

4152 if(value==0) {

4153 goto unassigned;

4154 } else {

4155 length=1;

4156 }

4157 } else if(value<=0xffff) {

4158 length=2;

4159 } else {

4160 length=3;

4161 }

4162 break;

4163 case MBCS_OUTPUT_4:

4164 value=((const uint32_t *)bytes)[value +(c&0x3f)];

4165 if(value<=0xff) {

4166 if(value==0) {

4167 goto unassigned;

4168 } else {

4169 length=1;

4170 }

4171 } else if(value<=0xffff) {

4172 length=2;

4173 } else if(value<=0xffffff) {

4174 length=3;

4175 } else {

4176 length=4;

4177 }

4178 break;

4179 case MBCS_OUTPUT_3_EUC:

4180 value=((const uint16_t *)bytes)[value +(c&0x3f)];

4181 /* EUC 16-bit fixed-length representation */

4182 if(value<=0xff) {

4183 if(value==0) {

4184 goto unassigned;

4185 } else {

4186 length=1;

4187 }

4188 } else if((value&0x8000)==0) {

4189 value\|=0x8e8000;

4190 length=3;

4191 } else if((value&0x80)==0) {

4192 value\|=0x8f0080;

4193 length=3;

4194 } else {

4195 length=2;

4196 }

4197 break;

4198 case MBCS_OUTPUT_4_EUC:

4199 p=bytes+(value+(c&0x3f))*3;

4200 value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];

4201 /* EUC 16-bit fixed-length representation applied to the fir st two bytes */

4202 if(value<=0xff) {

4203 if(value==0) {

4204 goto unassigned;

4205 } else {

4206 length=1;

4207 }

4208 } else if(value<=0xffff) {

4209 length=2;

4210 } else if((value&0x800000)==0) {

4211 value\|=0x8e800000;

4212 length=4;

4213 } else if((value&0x8000)==0) {

4214 value\|=0x8f008000;

4215 length=4;

4216 } else {

4217 length=3;

4218 }

4219 break;

4220 default:

4221 /* must not occur */

4222 /*

4223 * To avoid compiler warnings that value & length may be

4224 * used without having been initialized, we set them here.

4225 * In reality, this is unreachable code.

4226 * Not having a default branch also causes warnings with

4227 * some compilers.

4228 */

4229 value=0;

4230 length=0;

4231 break;

4232 }

4233 /* output the value */

4234 } else {

4235 /*

4236 * This also tests if the codepage maps single surrogates.

4237 * If it does, then surrogates are not paired but mapped separat ely.

4238 * Note that in this case unmatched surrogates are not detected.

4239 */

4240 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {

4241 if(U16_IS_SURROGATE_LEAD(c)) {

4242 getTrail:

4243 if(source<sourceLimit) {

4244 /* test the following code unit */

4245 UChar trail=*source;

4246 if(U16_IS_TRAIL(trail)) {

4247 ++source;

4248 ++nextSourceIndex;

4249 c=U16_GET_SUPPLEMENTARY(c, trail);

4250 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {

4251 /* BMP-only codepages are stored without sta ge 1 entries for supplementary code points */

4252 cnv->fromUnicodeStatus=prevLength; /* save t he old state */

4253 /* callback(unassigned) */

4254 goto unassigned;

4255 }

4256 /* convert this supplementary code point */

4257 /* exit this condition tree */

4258 } else {

4259 /* this is an unmatched lead code unit (1st surr ogate) */

4260 /* callback(illegal) */

4261 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

4262 break;

4263 }

4264 } else {

4265 /* no more input */

4266 break;

4267 }

4268 } else {

4269 /* this is an unmatched trail code unit (2nd surrogate) */

4270 /* callback(illegal) */

4271 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

4272 break;

4273 }

4274 }

4275

4276 /* convert the Unicode code point in c into codepage bytes */

4277

4278 /*

4279 * The basic lookup is a triple-stage compact array (trie) looku p.

4280 * For details see the beginning of this file.

4281 *

4282 * Single-byte codepages are handled with a different data struc ture

4283 * by _MBCSSingle... functions.

4284 *

4285 * The result consists of a 32-bit value from stage 2 and

4286 * a pointer to as many bytes as are stored per character.

4287 * The pointer points to the character's bytes in stage 3.

4288 * Bits 15..0 of the stage 2 entry contain the stage 3 index

4289 * for that pointer, while bits 31..16 are flags for which of

4290 * the 16 characters in the block are roundtrip-assigned.

4291 *

4292 * For 2-byte and 4-byte codepages, the bytes are stored as uint 16_t

4293 * respectively as uint32_t, in the platform encoding.

4294 * For 3-byte codepages, the bytes are always stored in big-endi an order.

4295 *

4296 * For EUC encodings that use only either 0x8e or 0x8f as the fi rst

4297 * byte of their longest byte sequences, the first two bytes in

4298 * this third stage indicate with their 7th bits whether these b ytes

4299 * are to be written directly or actually need to be preceeded b y

4300 * one of the two Single-Shift codes. With this, the third stage

4301 * stores one byte fewer per character than the actual maximum l ength of

4302 * EUC byte sequences.

4303 *

4304 * Other than that, leading zero bytes are removed and the other

4305 * bytes output. A single zero byte may be output if the "assign ed"

4306 * bit in stage 2 was on.

4307 * The data structure does not support zero byte output as a fal lback,

4308 * and also does not allow output of leading zeros.

4309 */

4310 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);

4311

4312 /* get the bytes and the length for the output */

4313 switch(outputType) {

4314 case MBCS_OUTPUT_2:

4315 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);

4316 if(value<=0xff) {

4317 length=1;

4318 } else {

4319 length=2;

4320 }

4321 break;

4322 case MBCS_OUTPUT_2_SISO:

4323 /* 1/2-byte stateful with Shift-In/Shift-Out */

4324 /*

4325 * Save the old state in the converter object

4326 * right here, then change the local prevLength state variab le if necessary.

4327 * Then, if this character turns out to be unassigned or a f allback that

4328 * is not taken, the callback code must not save the new sta te in the converter

4329 * because the new state is for a character that is not outp ut.

4330 * However, the callback must still restore the state from t he converter

4331 * in case the callback function changed it for its output.

4332 */

4333 cnv->fromUnicodeStatus=prevLength; /* save the old state */

4334 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);

4335 if(value<=0xff) {

4336 if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)= =0) {

4337 /* no mapping, leave value==0 */

4338 length=0;

4339 } else if(prevLength<=1) {

4340 length=1;

4341 } else {

4342 /* change from double-byte mode to single-byte */

4343 if (siLength == 1) {

4344 value\|=(uint32_t)siBytes[0]<<8;

4345 length = 2;

4346 } else if (siLength == 2) {

4347 value\|=(uint32_t)siBytes[1]<<8;

4348 value\|=(uint32_t)siBytes[0]<<16;

4349 length = 3;

4350 }

4351 prevLength=1;

4352 }

4353 } else {

4354 if(prevLength==2) {

4355 length=2;

4356 } else {

4357 /* change from single-byte mode to double-byte */

4358 if (soLength == 1) {

4359 value\|=(uint32_t)soBytes[0]<<16;

4360 length = 3;

4361 } else if (soLength == 2) {

4362 value\|=(uint32_t)soBytes[1]<<16;

4363 value\|=(uint32_t)soBytes[0]<<24;

4364 length = 4;

4365 }

4366 prevLength=2;

4367 }

4368 }

4369 break;

4370 case MBCS_OUTPUT_DBCS_ONLY:

4371 /* table with single-byte results, but only DBCS mappings us ed */

4372 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);

4373 if(value<=0xff) {

4374 /* no mapping or SBCS result, not taken for DBCS-only */

4375 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */

4376 length=0;

4377 } else {

4378 length=2;

4379 }

4380 break;

4381 case MBCS_OUTPUT_3:

4382 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);

4383 value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];

4384 if(value<=0xff) {

4385 length=1;

4386 } else if(value<=0xffff) {

4387 length=2;

4388 } else {

4389 length=3;

4390 }

4391 break;

4392 case MBCS_OUTPUT_4:

4393 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);

4394 if(value<=0xff) {

4395 length=1;

4396 } else if(value<=0xffff) {

4397 length=2;

4398 } else if(value<=0xffffff) {

4399 length=3;

4400 } else {

4401 length=4;

4402 }

4403 break;

4404 case MBCS_OUTPUT_3_EUC:

4405 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);

4406 /* EUC 16-bit fixed-length representation */

4407 if(value<=0xff) {

4408 length=1;

4409 } else if((value&0x8000)==0) {

4410 value\|=0x8e8000;

4411 length=3;

4412 } else if((value&0x80)==0) {

4413 value\|=0x8f0080;

4414 length=3;

4415 } else {

4416 length=2;

4417 }

4418 break;

4419 case MBCS_OUTPUT_4_EUC:

4420 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);

4421 value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];

4422 /* EUC 16-bit fixed-length representation applied to the fir st two bytes */

4423 if(value<=0xff) {

4424 length=1;

4425 } else if(value<=0xffff) {

4426 length=2;

4427 } else if((value&0x800000)==0) {

4428 value\|=0x8e800000;

4429 length=4;

4430 } else if((value&0x8000)==0) {

4431 value\|=0x8f008000;

4432 length=4;

4433 } else {

4434 length=3;

4435 }

4436 break;

4437 default:

4438 /* must not occur */

4439 /*

4440 * To avoid compiler warnings that value & length may be

4441 * used without having been initialized, we set them here.

4442 * In reality, this is unreachable code.

4443 * Not having a default branch also causes warnings with

4444 * some compilers.

4445 */

4446 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip fla gs */

4447 length=0;

4448 break;

4449 }

4450

4451 /* is this code point assigned, or do we use fallbacks? */

4452 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 \|\|

4453 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))

4454 ) {

4455 /*

4456 * We allow a 0 byte output if the "assigned" bit is set for this entry.

4457 * There is no way with this data structure for fallback out put

4458 * to be a zero byte.

4459 */

4460

4461 unassigned:

4462 /* try an extension mapping */

4463 pArgs->source=source;

4464 c=_extFromU(cnv, cnv->sharedData,

4465 c, &source, sourceLimit,

4466 &target, target+targetCapacity,

4467 &offsets, sourceIndex,

4468 pArgs->flush,

4469 pErrorCode);

4470 nextSourceIndex+=(int32_t)(source-pArgs->source);

4471 prevLength=cnv->fromUnicodeStatus; /* restore SISO state */

4472

4473 if(U_FAILURE(*pErrorCode)) {

4474 /* not mappable or buffer overflow */

4475 break;

4476 } else {

4477 /* a mapping was written to the target, continue */

4478

4479 /* recalculate the targetCapacity after an extension map ping */

4480 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)targ et);

4481

4482 /* normal end of conversion: prepare for a new character */

4483 if(offsets!=NULL) {

4484 prevSourceIndex=sourceIndex;

4485 sourceIndex=nextSourceIndex;

4486 }

4487 continue;

4488 }

4489 }

4490 }

4491

4492 /* write the output character bytes from value and length */

4493 /* from the first if in the loop we know that targetCapacity>0 */

4494 if(length<=targetCapacity) {

4495 if(offsets==NULL) {

4496 switch(length) {

4497 /* each branch falls through to the next one */

4498 case 4:

4499 *target++=(uint8_t)(value>>24);

4500 case 3: /fall through/

4501 *target++=(uint8_t)(value>>16);

4502 case 2: /fall through/

4503 *target++=(uint8_t)(value>>8);

4504 case 1: /fall through/

4505 *target++=(uint8_t)value;

4506 default:

4507 /* will never occur */

4508 break;

4509 }

4510 } else {

4511 switch(length) {

4512 /* each branch falls through to the next one */

4513 case 4:

4514 *target++=(uint8_t)(value>>24);

4515 *offsets++=sourceIndex;

4516 case 3: /fall through/

4517 *target++=(uint8_t)(value>>16);

4518 *offsets++=sourceIndex;

4519 case 2: /fall through/

4520 *target++=(uint8_t)(value>>8);

4521 *offsets++=sourceIndex;

4522 case 1: /fall through/

4523 *target++=(uint8_t)value;

4524 *offsets++=sourceIndex;

4525 default:

4526 /* will never occur */

4527 break;

4528 }

4529 }

4530 targetCapacity-=length;

4531 } else {

4532 uint8_t *charErrorBuffer;

4533

4534 /*

4535 * We actually do this backwards here:

4536 * In order to save an intermediate variable, we output

4537 * first to the overflow buffer what does not fit into the

4538 * regular target.

4539 */

4540 /* we know that 1<=targetCapacity<length<=4 */

4541 length-=targetCapacity;

4542 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;

4543 switch(length) {

4544 /* each branch falls through to the next one */

4545 case 3:

4546 *charErrorBuffer++=(uint8_t)(value>>16);

4547 case 2: /fall through/

4548 *charErrorBuffer++=(uint8_t)(value>>8);

4549 case 1: /fall through/

4550 *charErrorBuffer=(uint8_t)value;

4551 default:

4552 /* will never occur */

4553 break;

4554 }

4555 cnv->charErrorBufferLength=(int8_t)length;

4556

4557 /* now output what fits into the regular target */

4558 value>>=8length; / length was reduced by targetCapacity */

4559 switch(targetCapacity) {

4560 /* each branch falls through to the next one */

4561 case 3:

4562 *target++=(uint8_t)(value>>16);

4563 if(offsets!=NULL) {

4564 *offsets++=sourceIndex;

4565 }

4566 case 2: /fall through/

4567 *target++=(uint8_t)(value>>8);

4568 if(offsets!=NULL) {

4569 *offsets++=sourceIndex;

4570 }

4571 case 1: /fall through/

4572 *target++=(uint8_t)value;

4573 if(offsets!=NULL) {

4574 *offsets++=sourceIndex;

4575 }

4576 default:

4577 /* will never occur */

4578 break;

4579 }

4580

4581 /* target overflow */

4582 targetCapacity=0;

4583 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

4584 c=0;

4585 break;

4586 }

4587

4588 /* normal end of conversion: prepare for a new character */

4589 c=0;

4590 if(offsets!=NULL) {

4591 prevSourceIndex=sourceIndex;

4592 sourceIndex=nextSourceIndex;

4593 }

4594 continue;

4595 } else {

4596 /* target is full */

4597 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

4598 break;

4599 }

4600 }

4601

4602 /*

4603 * the end of the input stream and detection of truncated input

4604 * are handled by the framework, but for EBCDIC_STATEFUL conversion

4605 * we need to emit an SI at the very end

4606 *

4607 * conditions:

4608 * successful

4609 * EBCDIC_STATEFUL in DBCS mode

4610 * end of input and no truncated input

4611 */

4612 if( U_SUCCESS(*pErrorCode) &&

4613 outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&

4614 pArgs->flush && source>=sourceLimit && c==0

4615 ) {

4616 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output str eam to SBCS */

4617 if(targetCapacity>0) {

4618 *target++=(uint8_t)siBytes[0];

4619 if (siLength == 2) {

4620 if (targetCapacity<2) {

4621 cnv->charErrorBuffer[0]=(uint8_t)siBytes[1];

4622 cnv->charErrorBufferLength=1;

4623 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

4624 } else {

4625 *target++=(uint8_t)siBytes[1];

4626 }

4627 }

4628 if(offsets!=NULL) {

4629 /* set the last source character's index (sourceIndex points at sourceLimit now) */

4630 *offsets++=prevSourceIndex;

4631 }

4632 } else {

4633 /* target is full */

4634 cnv->charErrorBuffer[0]=(uint8_t)siBytes[0];

4635 if (siLength == 2) {

4636 cnv->charErrorBuffer[1]=(uint8_t)siBytes[1];

4637 }

4638 cnv->charErrorBufferLength=siLength;

4639 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

4640 }

4641 prevLength=1; /* we switched into SBCS */

4642 }

4643

4644 /* set the converter state back into UConverter */

4645 cnv->fromUChar32=c;

4646 cnv->fromUnicodeStatus=prevLength;

4647

4648 /* write back the updated pointers */

4649 pArgs->source=source;

4650 pArgs->target=(char *)target;

4651 pArgs->offsets=offsets;

4652 }

4653

4654 /*

4655 * This is another simple conversion function for internal use by other

4656 * conversion implementations.

4657 * It does not use the converter state nor call callbacks.

4658 * It does not handle the EBCDIC swaplfnl option (set in UConverter).

4659 * It handles conversion extensions but not GB 18030.

4660 *

4661 * It converts one single Unicode code point into codepage bytes, encoded

4662 * as one 32-bit value. The function returns the number of bytes in *pValue:

4663 * 1..4 the number of bytes in *pValue

4664 * 0 unassigned (*pValue undefined)

4665 * -1 illegal (currently not used, *pValue undefined)

4666 *

4667 * *pValue will contain the resulting bytes with the last byte in bits 7..0,

4668 * the second to last byte in bits 15..8, etc.

4669 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.

4670 */

4671 U_CFUNC int32_t

4672 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,

4673 UChar32 c, uint32_t *pValue,

4674 UBool useFallback) {

4675 const int32_t *cx;

4676 const uint16_t *table;

4677 #if 0

4678 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */

4679 const uint8_t *p;

4680 #endif

4681 uint32_t stage2Entry;

4682 uint32_t value;

4683 int32_t length;

4684

4685 /* BMP-only codepages are stored without stage 1 entries for supplementary c ode points */

4686 if(c<=0xffff \|\| (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {

4687 table=sharedData->mbcs.fromUnicodeTable;

4688

4689 /* convert the Unicode code point in c into codepage bytes (same as in _ MBCSFromUnicodeWithOffsets) */

4690 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {

4691 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs. fromUnicodeBytes, c);

4692 /* is this code point assigned, or do we use fallbacks? */

4693 if(useFallback ? value>=0x800 : value>=0xc00) {

4694 *pValue=value&0xff;

4695 return 1;

4696 }

4697 } else /* outputType!=MBCS_OUTPUT_1 */ {

4698 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);

4699

4700 /* get the bytes and the length for the output */

4701 switch(sharedData->mbcs.outputType) {

4702 case MBCS_OUTPUT_2:

4703 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeByte s, stage2Entry, c);

4704 if(value<=0xff) {

4705 length=1;

4706 } else {

4707 length=2;

4708 }

4709 break;

4710 #if 0

4711 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */

4712 case MBCS_OUTPUT_DBCS_ONLY:

4713 /* table with single-byte results, but only DBCS mappings used * /

4714 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeByte s, stage2Entry, c);

4715 if(value<=0xff) {

4716 /* no mapping or SBCS result, not taken for DBCS-only */

4717 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip fla gs */

4718 length=0;

4719 } else {

4720 length=2;

4721 }

4722 break;

4723 case MBCS_OUTPUT_3:

4724 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);

4725 value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];

4726 if(value<=0xff) {

4727 length=1;

4728 } else if(value<=0xffff) {

4729 length=2;

4730 } else {

4731 length=3;

4732 }

4733 break;

4734 case MBCS_OUTPUT_4:

4735 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeByte s, stage2Entry, c);

4736 if(value<=0xff) {

4737 length=1;

4738 } else if(value<=0xffff) {

4739 length=2;

4740 } else if(value<=0xffffff) {

4741 length=3;

4742 } else {

4743 length=4;

4744 }

4745 break;

4746 case MBCS_OUTPUT_3_EUC:

4747 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeByte s, stage2Entry, c);

4748 /* EUC 16-bit fixed-length representation */

4749 if(value<=0xff) {

4750 length=1;

4751 } else if((value&0x8000)==0) {

4752 value\|=0x8e8000;

4753 length=3;

4754 } else if((value&0x80)==0) {

4755 value\|=0x8f0080;

4756 length=3;

4757 } else {

4758 length=2;

4759 }

4760 break;

4761 case MBCS_OUTPUT_4_EUC:

4762 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);

4763 value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];

4764 /* EUC 16-bit fixed-length representation applied to the first t wo bytes */

4765 if(value<=0xff) {

4766 length=1;

4767 } else if(value<=0xffff) {

4768 length=2;

4769 } else if((value&0x800000)==0) {

4770 value\|=0x8e800000;

4771 length=4;

4772 } else if((value&0x8000)==0) {

4773 value\|=0x8f008000;

4774 length=4;

4775 } else {

4776 length=3;

4777 }

4778 break;

4779 #endif

4780 default:

4781 /* must not occur */

4782 return -1;

4783 }

4784

4785 /* is this code point assigned, or do we use fallbacks? */

4786 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \|\|

4787 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)

4788 ) {

4789 /*

4790 * We allow a 0 byte output if the "assigned" bit is set for thi s entry.

4791 * There is no way with this data structure for fallback output

4792 * to be a zero byte.

4793 */

4794 /* assigned */

4795 *pValue=value;

4796 return length;

4797 }

4798 }

4799 }

4800

4801 cx=sharedData->mbcs.extIndexes;

4802 if(cx!=NULL) {

4803 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);

4804 return length>=0 ? length : -length; /* return abs(length); */

4805 }

4806

4807 /* unassigned */

4808 return 0;

4809 }

4810

4811

4812 #if 0

4813 /*

4814 * This function has been moved to ucnv2022.c for inlining.

4815 * This implementation is here only for documentation purposes

4816 */

4817

4818 /**

4819 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages .

4820 * It does not handle the EBCDIC swaplfnl option (set in UConverter).

4821 * It does not handle conversion extensions (_extFromU()).

4822 *

4823 * It returns the codepage byte for the code point, or -1 if it is unassigned.

4824 */

4825 U_CFUNC int32_t

4826 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,

4827 UChar32 c,

4828 UBool useFallback) {

4829 const uint16_t *table;

4830 int32_t value;

4831

4832 /* BMP-only codepages are stored without stage 1 entries for supplementary c ode points */

4833 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {

4834 return -1;

4835 }

4836

4837 /* convert the Unicode code point in c into codepage bytes (same as in _MBCS FromUnicodeWithOffsets) */

4838 table=sharedData->mbcs.fromUnicodeTable;

4839

4840 /* get the byte for the output */

4841 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnic odeBytes, c);

4842 /* is this code point assigned, or do we use fallbacks? */

4843 if(useFallback ? value>=0x800 : value>=0xc00) {

4844 return value&0xff;

4845 } else {

4846 return -1;

4847 }

4848 }

4849 #endif

4850

4851 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */

4852

4853 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */

4854 static const UChar32

4855 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };

4856

4857 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail )<<6+trail... */

4858 static const UChar32

4859 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };

4860

4861 static void

4862 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,

4863 UConverterToUnicodeArgs *pToUArgs,

4864 UErrorCode *pErrorCode) {

4865 UConverter utf8, cnv;

4866 const uint8_t source, sourceLimit;

4867 uint8_t *target;

4868 int32_t targetCapacity;

4869

4870 const uint16_t table, sbcsIndex;

4871 const uint16_t *results;

4872

4873 int8_t oldToULength, toULength, toULimit;

4874

4875 UChar32 c;

4876 uint8_t b, t1, t2;

4877

4878 uint32_t asciiRoundtrips;

4879 uint16_t value, minValue;

4880 UBool hasSupplementary;

4881

4882 /* set up the local pointers */

4883 utf8=pToUArgs->converter;

4884 cnv=pFromUArgs->converter;

4885 source=(uint8_t *)pToUArgs->source;

4886 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;

4887 target=(uint8_t *)pFromUArgs->target;

4888 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

4889

4890 table=cnv->sharedData->mbcs.fromUnicodeTable;

4891 sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;

4892 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

4893 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;

4894 } else {

4895 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;

4896 }

4897 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

4898

4899 if(cnv->useFallback) {

4900 /* use all roundtrip and fallback results */

4901 minValue=0x800;

4902 } else {

4903 /* use only roundtrips and fallbacks from private-use characters */

4904 minValue=0xc00;

4905 }

4906 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEME NTARY);

4907

4908 /* get the converter state from the UTF-8 UConverter */

4909 c=(UChar32)utf8->toUnicodeStatus;

4910 if(c!=0) {

4911 toULength=oldToULength=utf8->toULength;

4912 toULimit=(int8_t)utf8->mode;

4913 } else {

4914 toULength=oldToULength=toULimit=0;

4915 }

4916

4917 /*

4918 * Make sure that the last byte sequence before sourceLimit is complete

4919 * or runs into a lead byte.

4920 * Do not go back into the bytes that will be read for finishing a partial

4921 * sequence from the previous buffer.

4922 * In the conversion loop compare source with sourceLimit only once

4923 * per multi-byte character.

4924 */

4925 {

4926 int32_t i, length;

4927

4928 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);

4929 for(i=0; i<3 && i<length;) {

4930 b=*(sourceLimit-i-1);

4931 if(U8_IS_TRAIL(b)) {

4932 ++i;

4933 } else {

4934 if(i<U8_COUNT_TRAIL_BYTES(b)) {

4935 /* exit the conversion loop before the lead byte if there ar e not enough trail bytes for it */

4936 sourceLimit-=i+1;

4937 }

4938 break;

4939 }

4940 }

4941 }

4942

4943 if(c!=0 && targetCapacity>0) {

4944 utf8->toUnicodeStatus=0;

4945 utf8->toULength=0;

4946 goto moreBytes;

4947 /*

4948 * Note: We could avoid the goto by duplicating some of the moreBytes

4949 * code, but only up to the point of collecting a complete UTF-8

4950 * sequence; then recurse for the toUBytes[toULength]

4951 * and then continue with normal conversion.

4952 *

4953 * If so, move this code to just after initializing the minimum

4954 * set of local variables for reading the UTF-8 input

4955 * (utf8, source, target, limits but not cnv, table, minValue, etc.).

4956 *

4957 * Potential advantages:

4958 * - avoid the goto

4959 * - oldToULength could become a local variable in just those code block s

4960 * that deal with buffer boundaries

4961 * - possibly faster if the goto prevents some compiler optimizations

4962 * (this would need measuring to confirm)

4963 * Disadvantage:

4964 * - code duplication

4965 */

4966 }

4967

4968 /* conversion loop */

4969 while(source<sourceLimit) {

4970 if(targetCapacity>0) {

4971 b=*source++;

4972 if((int8_t)b>=0) {

4973 /* convert ASCII */

4974 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {

4975 *target++=(uint8_t)b;

4976 --targetCapacity;

4977 continue;

4978 } else {

4979 c=b;

4980 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);

4981 }

4982 } else {

4983 if(b<0xe0) {

4984 if( /* handle U+0080..U+07FF inline */

4985 b>=0xc2 &&

4986 (t1=(uint8_t)(*source-0x80)) <= 0x3f

4987 ) {

4988 c=b&0x1f;

4989 ++source;

4990 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);

4991 if(value>=minValue) {

4992 *target++=(uint8_t)value;

4993 --targetCapacity;

4994 continue;

4995 } else {

4996 c=(c<<6)\|t1;

4997 }

4998 } else {

4999 c=-1;

5000 }

5001 } else if(b==0xe0) {

5002 if( /* handle U+0800..U+0FFF inline */

5003 (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&

5004 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f

5005 ) {

5006 c=t1;

5007 source+=2;

5008 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);

5009 if(value>=minValue) {

5010 *target++=(uint8_t)value;

5011 --targetCapacity;

5012 continue;

5013 } else {

5014 c=(c<<6)\|t2;

5015 }

5016 } else {

5017 c=-1;

5018 }

5019 } else {

5020 c=-1;

5021 }

5022

5023 if(c<0) {

5024 /* handle "complicated" and error cases, and continuing part ial characters */

5025 oldToULength=0;

5026 toULength=1;

5027 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;

5028 c=b;

5029 moreBytes:

5030 while(toULength<toULimit) {

5031 /*

5032 * The sourceLimit may have been adjusted before the con version loop

5033 * to stop before a truncated sequence.

5034 * Here we need to use the real limit in case we have tw o truncated

5035 * sequences at the end.

5036 * See ticket #7492.

5037 */

5038 if(source<(uint8_t *)pToUArgs->sourceLimit) {

5039 b=*source;

5040 if(U8_IS_TRAIL(b)) {

5041 ++source;

5042 ++toULength;

5043 c=(c<<6)+b;

5044 } else {

5045 break; /* sequence too short, stop with toULengt h<toULimit */

5046 }

5047 } else {

5048 /* store the partial UTF-8 character, compatible wit h the regular UTF-8 converter */

5049 source-=(toULength-oldToULength);

5050 while(oldToULength<toULength) {

5051 utf8->toUBytes[oldToULength++]=*source++;

5052 }

5053 utf8->toUnicodeStatus=c;

5054 utf8->toULength=toULength;

5055 utf8->mode=toULimit;

5056 pToUArgs->source=(char *)source;

5057 pFromUArgs->target=(char *)target;

5058 return;

5059 }

5060 }

5061

5062 if( toULength==toULimit && /* consumed all trail bytes */

5063 (toULength==3 \|\| toULength==2) && /* BMP */

5064 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] & &

5065 (c<=0xd7ff \|\| 0xe000<=c) /* not a surrogate */

5066 ) {

5067 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

5068 } else if(

5069 toULength==toULimit && toULength==4 &&

5070 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)

5071 ) {

5072 /* supplementary code point */

5073 if(!hasSupplementary) {

5074 /* BMP-only codepages are stored without stage 1 ent ries for supplementary code points */

5075 value=0;

5076 } else {

5077 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

5078 }

5079 } else {

5080 /* error handling: illegal UTF-8 byte sequence */

5081 source-=(toULength-oldToULength);

5082 while(oldToULength<toULength) {

5083 utf8->toUBytes[oldToULength++]=*source++;

5084 }

5085 utf8->toULength=toULength;

5086 pToUArgs->source=(char *)source;

5087 pFromUArgs->target=(char *)target;

5088 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

5089 return;

5090 }

5091 }

5092 }

5093

5094 if(value>=minValue) {

5095 /* output the mapping for c */

5096 *target++=(uint8_t)value;

5097 --targetCapacity;

5098 } else {

5099 /* value<minValue means c is unassigned (unmappable) */

5100 /*

5101 * Try an extension mapping.

5102 * Pass in no source because we don't have UTF-16 input.

5103 * If we have a partial match on c, we will return and revert

5104 * to UTF-8->UTF-16->charset conversion.

5105 */

5106 static const UChar nul=0;

5107 const UChar *noSource=&nul;

5108 c=_extFromU(cnv, cnv->sharedData,

5109 c, &noSource, noSource,

5110 &target, target+targetCapacity,

5111 NULL, -1,

5112 pFromUArgs->flush,

5113 pErrorCode);

5114

5115 if(U_FAILURE(*pErrorCode)) {

5116 /* not mappable or buffer overflow */

5117 cnv->fromUChar32=c;

5118 break;

5119 } else if(cnv->preFromUFirstCP>=0) {

5120 /*

5121 * Partial match, return and revert to pivoting.

5122 * In normal from-UTF-16 conversion, we would just continue

5123 * but then exit the loop because the extension match would

5124 * have consumed the source.

5125 */

5126 *pErrorCode=U_USING_DEFAULT_WARNING;

5127 break;

5128 } else {

5129 /* a mapping was written to the target, continue */

5130

5131 /* recalculate the targetCapacity after an extension mapping */

5132 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)tar get);

5133 }

5134 }

5135 } else {

5136 /* target is full */

5137 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

5138 break;

5139 }

5140 }

5141

5142 /*

5143 * The sourceLimit may have been adjusted before the conversion loop

5144 * to stop before a truncated sequence.

5145 * If so, then collect the truncated sequence now.

5146 */

5147 if(U_SUCCESS(*pErrorCode) &&

5148 cnv->preFromUFirstCP<0 &&

5149 source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {

5150 c=utf8->toUBytes[0]=b=*source++;

5151 toULength=1;

5152 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;

5153 while(source<sourceLimit) {

5154 utf8->toUBytes[toULength++]=b=*source++;

5155 c=(c<<6)+b;

5156 }

5157 utf8->toUnicodeStatus=c;

5158 utf8->toULength=toULength;

5159 utf8->mode=toULimit;

5160 }

5161

5162 /* write back the updated pointers */

5163 pToUArgs->source=(char *)source;

5164 pFromUArgs->target=(char *)target;

5165 }

5166

5167 static void

5168 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,

5169 UConverterToUnicodeArgs *pToUArgs,

5170 UErrorCode *pErrorCode) {

5171 UConverter utf8, cnv;

5172 const uint8_t source, sourceLimit;

5173 uint8_t *target;

5174 int32_t targetCapacity;

5175

5176 const uint16_t table, mbcsIndex;

5177 const uint16_t *results;

5178

5179 int8_t oldToULength, toULength, toULimit;

5180

5181 UChar32 c;

5182 uint8_t b, t1, t2;

5183

5184 uint32_t stage2Entry;

5185 uint32_t asciiRoundtrips;

5186 uint16_t value;

5187 UBool hasSupplementary;

5188

5189 /* set up the local pointers */

5190 utf8=pToUArgs->converter;

5191 cnv=pFromUArgs->converter;

5192 source=(uint8_t *)pToUArgs->source;

5193 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;

5194 target=(uint8_t *)pFromUArgs->target;

5195 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

5196

5197 table=cnv->sharedData->mbcs.fromUnicodeTable;

5198 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;

5199 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

5200 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;

5201 } else {

5202 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;

5203 }

5204 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

5205

5206 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEME NTARY);

5207

5208 /* get the converter state from the UTF-8 UConverter */

5209 c=(UChar32)utf8->toUnicodeStatus;

5210 if(c!=0) {

5211 toULength=oldToULength=utf8->toULength;

5212 toULimit=(int8_t)utf8->mode;

5213 } else {

5214 toULength=oldToULength=toULimit=0;

5215 }

5216

5217 /*

5218 * Make sure that the last byte sequence before sourceLimit is complete

5219 * or runs into a lead byte.

5220 * Do not go back into the bytes that will be read for finishing a partial

5221 * sequence from the previous buffer.

5222 * In the conversion loop compare source with sourceLimit only once

5223 * per multi-byte character.

5224 */

5225 {

5226 int32_t i, length;

5227

5228 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);

5229 for(i=0; i<3 && i<length;) {

5230 b=*(sourceLimit-i-1);

5231 if(U8_IS_TRAIL(b)) {

5232 ++i;

5233 } else {

5234 if(i<U8_COUNT_TRAIL_BYTES(b)) {

5235 /* exit the conversion loop before the lead byte if there ar e not enough trail bytes for it */

5236 sourceLimit-=i+1;

5237 }

5238 break;

5239 }

5240 }

5241 }

5242

5243 if(c!=0 && targetCapacity>0) {

5244 utf8->toUnicodeStatus=0;

5245 utf8->toULength=0;

5246 goto moreBytes;

5247 /* See note in ucnv_SBCSFromUTF8() about this goto. */

5248 }

5249

5250 /* conversion loop */

5251 while(source<sourceLimit) {

5252 if(targetCapacity>0) {

5253 b=*source++;

5254 if((int8_t)b>=0) {

5255 /* convert ASCII */

5256 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {

5257 *target++=b;

5258 --targetCapacity;

5259 continue;

5260 } else {

5261 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);

5262 if(value==0) {

5263 c=b;

5264 goto unassigned;

5265 }

5266 }

5267 } else {

5268 if(b>0xe0) {

5269 if( /* handle U+1000..U+D7FF inline */

5270 (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f) ) \|\|

5271 (b==0xed && (t1 <= 0x1f) )) &&

5272 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f

5273 ) {

5274 c=((b&0xf)<<6)\|t1;

5275 source+=2;

5276 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);

5277 if(value==0) {

5278 c=(c<<6)\|t2;

5279 goto unassigned;

5280 }

5281 } else {

5282 c=-1;

5283 }

5284 } else if(b<0xe0) {

5285 if( /* handle U+0080..U+07FF inline */

5286 b>=0xc2 &&

5287 (t1=(uint8_t)(*source-0x80)) <= 0x3f

5288 ) {

5289 c=b&0x1f;

5290 ++source;

5291 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);

5292 if(value==0) {

5293 c=(c<<6)\|t1;

5294 goto unassigned;

5295 }

5296 } else {

5297 c=-1;

5298 }

5299 } else {

5300 c=-1;

5301 }

5302

5303 if(c<0) {

5304 /* handle "complicated" and error cases, and continuing part ial characters */

5305 oldToULength=0;

5306 toULength=1;

5307 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;

5308 c=b;

5309 moreBytes:

5310 while(toULength<toULimit) {

5311 /*

5312 * The sourceLimit may have been adjusted before the con version loop

5313 * to stop before a truncated sequence.

5314 * Here we need to use the real limit in case we have tw o truncated

5315 * sequences at the end.

5316 * See ticket #7492.

5317 */

5318 if(source<(uint8_t *)pToUArgs->sourceLimit) {

5319 b=*source;

5320 if(U8_IS_TRAIL(b)) {

5321 ++source;

5322 ++toULength;

5323 c=(c<<6)+b;

5324 } else {

5325 break; /* sequence too short, stop with toULengt h<toULimit */

5326 }

5327 } else {

5328 /* store the partial UTF-8 character, compatible wit h the regular UTF-8 converter */

5329 source-=(toULength-oldToULength);

5330 while(oldToULength<toULength) {

5331 utf8->toUBytes[oldToULength++]=*source++;

5332 }

5333 utf8->toUnicodeStatus=c;

5334 utf8->toULength=toULength;

5335 utf8->mode=toULimit;

5336 pToUArgs->source=(char *)source;

5337 pFromUArgs->target=(char *)target;

5338 return;

5339 }

5340 }

5341

5342 if( toULength==toULimit && /* consumed all trail bytes */

5343 (toULength==3 \|\| toULength==2) && /* BMP */

5344 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] & &

5345 (c<=0xd7ff \|\| 0xe000<=c) /* not a surrogate */

5346 ) {

5347 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);

5348 } else if(

5349 toULength==toULimit && toULength==4 &&

5350 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)

5351 ) {

5352 /* supplementary code point */

5353 if(!hasSupplementary) {

5354 /* BMP-only codepages are stored without stage 1 ent ries for supplementary code points */

5355 stage2Entry=0;

5356 } else {

5357 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);

5358 }

5359 } else {

5360 /* error handling: illegal UTF-8 byte sequence */

5361 source-=(toULength-oldToULength);

5362 while(oldToULength<toULength) {

5363 utf8->toUBytes[oldToULength++]=*source++;

5364 }

5365 utf8->toULength=toULength;

5366 pToUArgs->source=(char *)source;

5367 pFromUArgs->target=(char *)target;

5368 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

5369 return;

5370 }

5371

5372 /* get the bytes and the length for the output */

5373 /* MBCS_OUTPUT_2 */

5374 value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);

5375

5376 /* is this code point assigned, or do we use fallbacks? */

5377 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \|\|

5378 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))

5379 ) {

5380 goto unassigned;

5381 }

5382 }

5383 }

5384

5385 /* write the output character bytes from value and length */

5386 /* from the first if in the loop we know that targetCapacity>0 */

5387 if(value<=0xff) {

5388 /* this is easy because we know that there is enough space */

5389 *target++=(uint8_t)value;

5390 --targetCapacity;

5391 } else /* length==2 */ {

5392 *target++=(uint8_t)(value>>8);

5393 if(2<=targetCapacity) {

5394 *target++=(uint8_t)value;

5395 targetCapacity-=2;

5396 } else {

5397 cnv->charErrorBuffer[0]=(char)value;

5398 cnv->charErrorBufferLength=1;

5399

5400 /* target overflow */

5401 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

5402 break;

5403 }

5404 }

5405 continue;

5406

5407 unassigned:

5408 {

5409 /*

5410 * Try an extension mapping.

5411 * Pass in no source because we don't have UTF-16 input.

5412 * If we have a partial match on c, we will return and revert

5413 * to UTF-8->UTF-16->charset conversion.

5414 */

5415 static const UChar nul=0;

5416 const UChar *noSource=&nul;

5417 c=_extFromU(cnv, cnv->sharedData,

5418 c, &noSource, noSource,

5419 &target, target+targetCapacity,

5420 NULL, -1,

5421 pFromUArgs->flush,

5422 pErrorCode);

5423

5424 if(U_FAILURE(*pErrorCode)) {

5425 /* not mappable or buffer overflow */

5426 cnv->fromUChar32=c;

5427 break;

5428 } else if(cnv->preFromUFirstCP>=0) {

5429 /*

5430 * Partial match, return and revert to pivoting.

5431 * In normal from-UTF-16 conversion, we would just continue

5432 * but then exit the loop because the extension match would

5433 * have consumed the source.

5434 */

5435 *pErrorCode=U_USING_DEFAULT_WARNING;

5436 break;

5437 } else {

5438 /* a mapping was written to the target, continue */

5439

5440 /* recalculate the targetCapacity after an extension mapping */

5441 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)tar get);

5442 continue;

5443 }

5444 }

5445 } else {

5446 /* target is full */

5447 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

5448 break;

5449 }

5450 }

5451

5452 /*

5453 * The sourceLimit may have been adjusted before the conversion loop

5454 * to stop before a truncated sequence.

5455 * If so, then collect the truncated sequence now.

5456 */

5457 if(U_SUCCESS(*pErrorCode) &&

5458 cnv->preFromUFirstCP<0 &&

5459 source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {

5460 c=utf8->toUBytes[0]=b=*source++;

5461 toULength=1;

5462 toULimit=U8_COUNT_TRAIL_BYTES(b)+1;

5463 while(source<sourceLimit) {

5464 utf8->toUBytes[toULength++]=b=*source++;

5465 c=(c<<6)+b;

5466 }

5467 utf8->toUnicodeStatus=c;

5468 utf8->toULength=toULength;

5469 utf8->mode=toULimit;

5470 }

5471

5472 /* write back the updated pointers */

5473 pToUArgs->source=(char *)source;

5474 pFromUArgs->target=(char *)target;

5475 }

5476

5477 /* miscellaneous ------------------------------------------------------------ */

5478

5479 static void

5480 ucnv_MBCSGetStarters(const UConverter* cnv,

5481 UBool starters[256],

5482 UErrorCode *pErrorCode) {

5483 const int32_t *state0;

5484 int i;

5485

5486 state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState] ;

5487 for(i=0; i<256; ++i) {

5488 /* all bytes that cause a state transition from state 0 are lead bytes * /

5489 starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);

5490 }

5491 }

5492

5493 /*

5494 * This is an internal function that allows other converter implementations

5495 * to check whether a byte is a lead byte.

5496 */

5497 U_CFUNC UBool

5498 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {

5499 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8 _t)byte]);

5500 }

5501

5502 static void

5503 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,

5504 int32_t offsetIndex,

5505 UErrorCode *pErrorCode) {

5506 UConverter *cnv=pArgs->converter;

5507 char p, subchar;

5508 char buffer[4];

5509 int32_t length;

5510

5511 /* first, select between subChar and subChar1 */

5512 if( cnv->subChar1!=0 &&

5513 (cnv->sharedData->mbcs.extIndexes!=NULL ?

5514 cnv->useSubChar1 :

5515 (cnv->invalidUCharBuffer[0]<=0xff))

5516 ) {

5517 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */

5518 subchar=(char *)&cnv->subChar1;

5519 length=1;

5520 } else {

5521 /* select subChar in all other cases */

5522 subchar=(char *)cnv->subChars;

5523 length=cnv->subCharLen;

5524 }

5525

5526 /* reset the selector for the next code point */

5527 cnv->useSubChar1=FALSE;

5528

5529 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {

5530 p=buffer;

5531

5532 /* fromUnicodeStatus contains prevLength */

5533 switch(length) {

5534 case 1:

5535 if(cnv->fromUnicodeStatus==2) {

5536 /* DBCS mode and SBCS sub char: change to SBCS */

5537 cnv->fromUnicodeStatus=1;

5538 *p++=UCNV_SI;

5539 }

5540 *p++=subchar[0];

5541 break;

5542 case 2:

5543 if(cnv->fromUnicodeStatus<=1) {

5544 /* SBCS mode and DBCS sub char: change to DBCS */

5545 cnv->fromUnicodeStatus=2;

5546 *p++=UCNV_SO;

5547 }

5548 *p++=subchar[0];

5549 *p++=subchar[1];

5550 break;

5551 default:

5552 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

5553 return;

5554 }

5555 subchar=buffer;

5556 length=(int32_t)(p-buffer);

5557 }

5558

5559 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);

5560 }

5561

5562 U_CFUNC UConverterType

5563 ucnv_MBCSGetType(const UConverter* converter) {

5564 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */

5565 if(converter->sharedData->mbcs.countStates==1) {

5566 return (UConverterType)UCNV_SBCS;

5567 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {

5568 return (UConverterType)UCNV_EBCDIC_STATEFUL;

5569 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter ->sharedData->staticData->maxBytesPerChar==2) {

5570 return (UConverterType)UCNV_DBCS;

5571 }

5572 return (UConverterType)UCNV_MBCS;

5573 }

5574

5575 static const UConverterImpl _SBCSUTF8Impl={

5576 UCNV_MBCS,

5577

5578 ucnv_MBCSLoad,

5579 ucnv_MBCSUnload,

5580

5581 ucnv_MBCSOpen,

5582 NULL,

5583 NULL,

5584

5585 ucnv_MBCSToUnicodeWithOffsets,

5586 ucnv_MBCSToUnicodeWithOffsets,

5587 ucnv_MBCSFromUnicodeWithOffsets,

5588 ucnv_MBCSFromUnicodeWithOffsets,

5589 ucnv_MBCSGetNextUChar,

5590

5591 ucnv_MBCSGetStarters,

5592 ucnv_MBCSGetName,

5593 ucnv_MBCSWriteSub,

5594 NULL,

5595 ucnv_MBCSGetUnicodeSet,

5596

5597 NULL,

5598 ucnv_SBCSFromUTF8

5599 };

5600

5601 static const UConverterImpl _DBCSUTF8Impl={

5602 UCNV_MBCS,

5603

5604 ucnv_MBCSLoad,

5605 ucnv_MBCSUnload,

5606

5607 ucnv_MBCSOpen,

5608 NULL,

5609 NULL,

5610

5611 ucnv_MBCSToUnicodeWithOffsets,

5612 ucnv_MBCSToUnicodeWithOffsets,

5613 ucnv_MBCSFromUnicodeWithOffsets,

5614 ucnv_MBCSFromUnicodeWithOffsets,

5615 ucnv_MBCSGetNextUChar,

5616

5617 ucnv_MBCSGetStarters,

5618 ucnv_MBCSGetName,

5619 ucnv_MBCSWriteSub,

5620 NULL,

5621 ucnv_MBCSGetUnicodeSet,

5622

5623 NULL,

5624 ucnv_DBCSFromUTF8

5625 };

5626

5627 static const UConverterImpl _MBCSImpl={

5628 UCNV_MBCS,

5629

5630 ucnv_MBCSLoad,

5631 ucnv_MBCSUnload,

5632

5633 ucnv_MBCSOpen,

5634 NULL,

5635 NULL,

5636

5637 ucnv_MBCSToUnicodeWithOffsets,

5638 ucnv_MBCSToUnicodeWithOffsets,

5639 ucnv_MBCSFromUnicodeWithOffsets,

5640 ucnv_MBCSFromUnicodeWithOffsets,

5641 ucnv_MBCSGetNextUChar,

5642

5643 ucnv_MBCSGetStarters,

5644 ucnv_MBCSGetName,

5645 ucnv_MBCSWriteSub,

5646 NULL,

5647 ucnv_MBCSGetUnicodeSet

5648 };

5649

5650

5651 /* Static data is in tools/makeconv/ucnvstat.c for data-based

5652 * converters. Be sure to update it as well.

5653 */

5654

5655 const UConverterSharedData _MBCSData={

5656 sizeof(UConverterSharedData), 1,

5657 NULL, NULL, NULL, FALSE, &_MBCSImpl,

5658 0

5659 };

5660

5661 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

OLD	NEW

« no previous file with comments | « source/common/ucnvisci.c ('k') | source/common/ucnvmbcs.cpp » ('j') | no next file with comments »