icu46/source/common/ucnvmbcs.c - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/ucnvmbcs.c

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 ******************************************************************************

	3 *

	4 * Copyright (C) 2000-2010, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 ******************************************************************************

	8 * file name: ucnvmbcs.c

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created on: 2000jul03

	14 * created by: Markus W. Scherer

	15 *

	16 * The current code in this file replaces the previous implementation

	17 * of conversion code from multi-byte codepages to Unicode and back.

	18 * This implementation supports the following:

	19 * - legacy variable-length codepages with up to 4 bytes per character

	20 * - all Unicode code points (up to 0x10ffff)

	21 * - efficient distinction of unassigned vs. illegal byte sequences

	22 * - it is possible in fromUnicode() to directly deal with simple

	23 * stateful encodings (used for EBCDIC_STATEFUL)

	24 * - it is possible to convert Unicode code points

	25 * to a single zero byte (but not as a fallback except for SBCS)

	26 *

	27 * Remaining limitations in fromUnicode:

	28 * - byte sequences must not have leading zero bytes

	29 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte

	30 * - limitation to up to 4 bytes per character

	31 *

	32 * ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these

	33 * limitations and adds m:n character mappings and other features.

	34 * See ucnv_ext.h for details.

	35 *

	36 * Change history:

	37 *

	38 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U, MBCS_STAGE_2_FROM_U,

	39 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2

	40 * macros to ucnvmbcs.h file

	41 */

	42

	43 #include "unicode/utypes.h"

	44

	45 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION

	46

	47 #include "unicode/ucnv.h"

	48 #include "unicode/ucnv_cb.h"

	49 #include "unicode/udata.h"

	50 #include "unicode/uset.h"

	51 #include "ucnv_bld.h"

	52 #include "ucnvmbcs.h"

	53 #include "ucnv_ext.h"

	54 #include "ucnv_cnv.h"

	55 #include "umutex.h"

	56 #include "cmemory.h"

	57 #include "cstring.h"

	58

	/*
	 * Control optimizations according to the platform.
	 * Each switch is a 0/1 compile-time flag.
	 * NOTE(review): the code that tests these flags is not visible in this part
	 * of the file; presumably they select unrolled single-byte conversion loops.
	 */
	#define MBCS_UNROLL_SINGLE_TO_BMP 1
	#define MBCS_UNROLL_SINGLE_FROM_BMP 0

	62

	63 /*

	64 * _MBCSHeader versions 5.3 & 4.3

	65 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.)

	66 *

	67 * This version is optional. Version 5 is used for incompatible data format changes.

	68 * makeconv will continue to generate version 4 files if possible.

	69 *

	70 * Changes from version 4:

	71 *

	72 * The main difference is an additional _MBCSHeader field with

	73 * - the length (number of uint32_t) of the _MBCSHeader

	74 * - flags for further incompatible data format changes

	75 * - flags for further, backward compatible data format changes

	76 *

	77 * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from

	78 * the file and needs to be reconstituted at load time.

	79 * This requires a utf8Friendly format with an additional mbcsIndex table for fast

	80 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.

	81 * (For details about these structures see below, and see ucnvmbcs.h.)

	82 *

	83 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order

	84 * of the Unicode code points. (This requires that the .ucm file has the |0 etc.

	85 * precision markers for all mappings.)

	86 *

	87 * All fallbacks have been moved to the extension table, leaving only roundtrips in the

	88 * omitted data that can be reconstituted from the toUnicode data.

	89 *

	90 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.

	91 * With only roundtrip mappings in the base fromUnicode data, this part is fully

	92 * redundant with the mbcsIndex and will be reconstituted from that (also using the

	93 * stage 1 table which contains the information about how stage 2 was compacted).

	94 *

	95 * The rest of the stage 2 table, the part for code points above maxFastUChar,

	96 * is stored in the file and will be appended to the reconstituted part.

	97 *

	98 * The entire fromUBytes array is omitted from the file and will be reconstituted.

	99 * This is done by enumerating all toUnicode roundtrip mappings, performing

	100 * each mapping (using the stage 1 and reconstituted stage 2 tables) and

	101 * writing instead of reading the byte values.

	102 *

	103 * _MBCSHeader version 4.3

	104 *

	105 * Change from version 4.2:

	106 * - Optional utf8Friendly data structures, with 64-entry stage 3 block

	107 * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS

	108 * files which can be used instead of stages 1 & 2.

	109 * Faster lookups for roundtrips from most commonly used characters,

	110 * and lookups from UTF-8 byte sequences with a natural bit distribution.

	111 * See ucnvmbcs.h for more details.

	112 *

	113 * Change from version 4.1:

	114 * - Added an optional extension table structure at the end of the .cnv file.

	115 * It is present if the upper bits of the header flags field contain a non-zero

	116 * byte offset to it.

	117 * Files that contain only a conversion table and no base table

	118 * use the special outputType MBCS_OUTPUT_EXT_ONLY.

	119 * These contain the base table name between the MBCS header and the extension

	120 * data.

	121 *

	122 * Change from version 4.0:

	123 * - Replace header.reserved with header.fromUBytesLength so that all

	124 * fields in the data have length.

	125 *

	126 * Changes from version 3 (for performance improvements):

	127 * - new bit distribution for state table entries

	128 * - reordered action codes

	129 * - new data structure for single-byte fromUnicode

	130 * + stage 2 only contains indexes

	131 * + stage 3 stores 16 bits per character with classification bits 15..8

	132 * - no multiplier for stage 1 entries

	133 * - stage 2 for non-single-byte codepages contains the index and the flags in

	134 * one 32-bit value

	135 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers

	136 *

	137 * For more details about old versions of the MBCS data structure, see

	138 * the corresponding versions of this file.

	139 *

	140 * Converting stateless codepage data ---------------------------------------***

	141 * (or codepage data with simple states) to Unicode.

	142 *

	143 * Data structure and algorithm for converting from complex legacy codepages

	144 * to Unicode. (Designed before 2000-may-22.)

	145 *

	146 * The basic idea is that the structure of legacy codepages can be described

	147 * with state tables.

	148 * When reading a byte stream, each input byte causes a state transition.

	149 * Some transitions result in the output of a code point, some result in

	150 * "unassigned" or "illegal" output.

	151 * This is used here for character conversion.

	152 *

	153 * The data structure begins with a state table consisting of a row

	154 * per state, with 256 entries (columns) per row for each possible input

	155 * byte value.

	156 * Each entry is 32 bits wide, with two formats distinguished by

	157 * the sign bit (bit 31):

	158 *

	159 * One format for transitional entries (bit 31 not set) for non-final bytes, and

	160 * one format for final entries (bit 31 set).

	161 * Both formats contain the number of the next state in the same bit

	162 * positions.

	163 * State 0 is the initial state.

	164 *

	165 * Most of the time, the offset values of subsequent states are added

	166 * up to a scalar value. This value will eventually be the index of

	167 * the Unicode code point in a table that follows the state table.

	168 * The effect is that the code points for final state table rows

	169 * are contiguous. The code points of final state rows follow each other

	170 * in the order of the references to those final states by previous

	171 * states, etc.

	172 *

	173 * For some terminal states, the offset is itself the output Unicode

	174 * code point (16 bits for a BMP code point or 20 bits for a supplementary

	175 * code point (stored as code point minus 0x10000 so that 20 bits are enough).

	176 * For others, the code point in the Unicode table is stored with either

	177 * one or two code units: one for BMP code points, two for a pair of

	178 * surrogates.

	179 * All code points for a final state entry take up the same number of code

	180 * units, regardless of whether they all actually _use_ the same number

	181 * of code units. This is necessary for simple array access.

	182 *

	183 * An additional feature comes in with what in ICU is called "fallback"

	184 * mappings:

	185 *

	186 * In addition to round-trippable, precise, 1:1 mappings, there are often

	187 * mappings defined between similar, though not the same, characters.

	188 * Typically, such mappings occur only in fromUnicode mapping tables because

	189 * Unicode has a superset repertoire of most other codepages. However, it

	190 * is possible to provide such mappings in the toUnicode tables, too.

	191 * In this case, the fallback mappings are partly integrated into the

	192 * general state tables because the structure of the encoding includes their

	193 * byte sequences.

	194 * For final entries in an initial state, fallback mappings are stored in

	195 * the entry itself like with roundtrip mappings.

	196 * For other final entries, they are stored in the code units table if

	197 * the entry is for a pair of code units.

	198 * For single-unit results in the code units table, there is no space to

	199 * alternatively hold a fallback mapping; in this case, the code unit

	200 * is stored as U+fffe (unassigned), and the fallback mapping needs to

	201 * be looked up by the scalar offset value in a separate table.

	202 *

	203 * "Unassigned" state entries really mean "structurally unassigned",

	204 * i.e., such a byte sequence will never have a mapping result.

	205 *

	206 * The interpretation of the bits in each entry is as follows:

	207 *

	208 * Bit 31 not set, not a terminal entry ("transitional"):

	209 * 30..24 next state

	210 * 23..0 offset delta, to be added up

	211 *

	212 * Bit 31 set, terminal ("final") entry:

	213 * 30..24 next state (regardless of action code)

	214 * 23..20 action code:

	215 * action codes 0 and 1 result in precise-mapping Unicode code points

	216 * 0 valid byte sequence

	217 * 19..16 not used, 0

	218 * 15..0 16-bit Unicode BMP code point

	219 * never U+fffe or U+ffff

	220 * 1 valid byte sequence

	221 * 19..0 20-bit Unicode supplementary code point

	222 * never U+fffe or U+ffff

	223 *

	224 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points

	225 * 2 valid byte sequence (fallback)

	226 * 19..16 not used, 0

	227 * 15..0 16-bit Unicode BMP code point as fallback result

	228 * 3 valid byte sequence (fallback)

	229 * 19..0 20-bit Unicode supplementary code point as fallback result

	230 *

	231 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results

	232 * depending on the code units they result in

	233 * 4 valid byte sequence

	234 * 19..9 not used, 0

	235 * 8..0 final offset delta

	236 * pointing to one 16-bit code unit which may be

	237 * fffe unassigned -- look for a fallback for this offset

	238 * ffff illegal

	239 * 5 valid byte sequence

	240 * 19..9 not used, 0

	241 * 8..0 final offset delta

	242 * pointing to two 16-bit code units

	243 * (typically UTF-16 surrogates)

	244 * the result depends on the first code unit as follows:

	245 * 0000..d7ff roundtrip BMP code point (1st alone)

	246 * d800..dbff roundtrip surrogate pair (1st, 2nd)

	247 * dc00..dfff fallback surrogate pair (1st-400, 2nd)

	248 * e000 roundtrip BMP code point (2nd alone)

	249 * e001 fallback BMP code point (2nd alone)

	250 * fffe unassigned

	251 * ffff illegal

	252 * (the final offset deltas are at most 255 * 2,

	253 * times 2 because of storing code unit pairs)

	254 *

	255 * 6 unassigned byte sequence

	256 * 19..16 not used, 0

	257 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2)

	258 * this does not contain a final offset delta because the main

	259 * purpose of this action code is to save scalar offset values;

	260 * therefore, fallback values cannot be assigned to byte

	261 * sequences that result in this action code

	262 * 7 illegal byte sequence

	263 * 19..16 not used, 0

	264 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2)

	265 * 8 state change only

	266 * 19..0 not used, 0

	267 * useful for state changes in simple stateful encodings,

	268 * at Shift-In/Shift-Out codes

	269 *

	270 *

	271 * 9..15 reserved for future use

	272 * current implementations will only perform a state change

	273 * and ignore bits 19..0

	274 *

	275 * An encoding with contiguous ranges of unassigned byte sequences, like

	276 * Shift-JIS and especially EUC-TW, can be stored efficiently by having

	277 * at least two states for the trail bytes:

	278 * One trail byte state that results in code points, and one that only

	279 * has "unassigned" and "illegal" terminal states.

	280 *

	281 * Note: partly by accident, this data structure supports simple stateful

	282 * encodings without any additional logic.

	283 * Currently, only simple Shift-In/Shift-Out schemes are handled with

	284 * appropriate state tables (especially EBCDIC_STATEFUL!).

	285 *

	286 * MBCS version 2 added:

	287 * unassigned and illegal action codes have U+fffe and U+ffff

	288 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP()

	289 *

	290 * Converting from Unicode to codepage bytes --------------------------------***

	291 *

	292 * The conversion data structure for fromUnicode is designed for the known

	293 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to

	294 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is

	295 * a roundtrip mapping.

	296 *

	297 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3

	298 * like in the character properties table.

	299 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3

	300 * with the resulting bytes is at offsetFromUBytes.

	301 *

	302 * Beginning with version 4, single-byte codepages have a significantly different

	303 * trie compared to other codepages.

	304 * In all cases, the entry in stage 1 is directly the index of the block of

	305 * 64 entries in stage 2.

	306 *

	307 * Single-byte lookup:

	308 *

	309 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3.

	310 * Stage 3 contains one 16-bit word per result:

	311 * Bits 15..8 indicate the kind of result:

	312 * f roundtrip result

	313 * c fallback result from private-use code point

	314 * 8 fallback result from other code points

	315 * 0 unassigned

	316 * Bits 7..0 contain the codepage byte. A zero byte is always possible.

	317 *

	318 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly

	319 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup

	320 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.

	321 * ASCII code points can be looked up with a linear array access into stage 3.

	322 * See maxFastUChar and other details in ucnvmbcs.h.

	323 *

	324 * Multi-byte lookup:

	325 *

	326 * Stage 2 contains a 32-bit word for each 16-block in stage 3:

	327 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results

	328 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)

	329 * If this test is false, then a non-zero result will be interpreted as

	330 * a fallback mapping.

	331 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char)

	332 *

	333 * Stage 3 contains 2, 3, or 4 bytes per result.

	334 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness,

	335 * while 3 bytes are stored as bytes in big-endian order.

	336 * Leading zero bytes are ignored, and the number of bytes is counted.

	337 * A zero byte mapping result is possible as a roundtrip result.

	338 * For some output types, the actual result is processed from this;

	339 * see ucnv_MBCSFromUnicodeWithOffsets().

	340 *

	341 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10),

	342 * or (version 3 and up) for BMP-only codepages, it contains 64 entries.

	343 *

	344 * In version 4.3, a utf8Friendly file contains an mbcsIndex table.

	345 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup

	346 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3.

	347 * ASCII code points can be looked up with a linear array access into stage 3.

	348 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h.

	349 *

	350 * In version 3, stage 2 blocks may overlap by multiples of the multiplier

	351 * for compaction.

	352 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks)

	353 * may overlap by any number of entries.

	354 *

	355 * MBCS version 2 added:

	356 * the converter checks for known output types, which allows

	357 * adding new ones without crashing an unaware converter

	358 */

	359

	360 static const UConverterImpl _SBCSUTF8Impl;

	361 static const UConverterImpl _DBCSUTF8Impl;

	362

	363 /* GB 18030 data ------------------------------------------------------------ */

	364

	/*
	 * Helper macros for linear values for GB 18030 four-byte sequences.
	 *
	 * LINEAR_18030(a, b, c, d) folds the four bytes into one integer using the
	 * weights 12600, 1260, 10 and 1, so that consecutive four-byte sequences
	 * get consecutive linear values and ranges can be measured by subtraction.
	 * LINEAR(x) does the same for a sequence packed big-endian into one 32-bit
	 * value (first byte in bits 31..24).
	 */
	#define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d))

	#define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30)

	#define LINEAR(x) LINEAR_18030((x)>>24, ((x)>>16)&0xff, ((x)>>8)&0xff, (x)&0xff)

	/*
	 * Some ranges of GB 18030 where both the Unicode code points and the
	 * GB four-byte sequences are contiguous and are handled algorithmically by
	 * the special callback functions below.
	 * The values are start & end of Unicode & GB codes:
	 *   [0] first Unicode code point   [1] last Unicode code point
	 *   [2] linear value of first GB sequence   [3] linear value of last GB sequence
	 * For every row, [1]-[0] equals [3]-[2].
	 *
	 * Note that single surrogates are not mapped by GB 18030
	 * as of the re-released mapping tables from 2000-nov-30.
	 */
	static const uint32_t
	gb18030Ranges[13][4]={
	    {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)},
	    {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)},
	    {0x0452, 0x200F, LINEAR(0x8130D330), LINEAR(0x8136A531)},
	    {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)},
	    {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)},
	    {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)},
	    {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)},
	    {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)},
	    {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)},
	    {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)},
	    {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)},
	    {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)},
	    {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)}
	};

	397

	/* bit flag for UConverter.options indicating GB 18030 special handling */
	#define _MBCS_OPTION_GB18030 0x8000

	/* bit flags for UConverter.options indicating KEIS, JEF, JIPS special handling */
	#define _MBCS_OPTION_KEIS 0x01000
	#define _MBCS_OPTION_JEF 0x02000
	#define _MBCS_OPTION_JIPS 0x04000

	/* KEIS: two-byte Shift-Out and Shift-In sequences */
	#define KEIS_SO_CHAR_1 0x0A
	#define KEIS_SO_CHAR_2 0x42
	#define KEIS_SI_CHAR_1 0x0A
	#define KEIS_SI_CHAR_2 0x41

	/* JEF: single-byte Shift-Out and Shift-In codes */
	#define JEF_SO_CHAR 0x28
	#define JEF_SI_CHAR 0x29

	/* JIPS: two-byte Shift-Out and Shift-In sequences */
	#define JIPS_SO_CHAR_1 0x1A
	#define JIPS_SO_CHAR_2 0x70
	#define JIPS_SI_CHAR_1 0x1A
	#define JIPS_SI_CHAR_2 0x71

	418

	/* Selects which shift sequence getSISOBytes() produces: Shift-In or Shift-Out. */
	enum SISO_Option {
	    SI,
	    SO
	};
	typedef enum SISO_Option SISO_Option;

	424

	425 static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *val ue) {

	426 int32_t SISOLength = 0;

	427

	428 switch (option) {

	429 case SI:

	430 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {

	431 value[0] = KEIS_SI_CHAR_1;

	432 value[1] = KEIS_SI_CHAR_2;

	433 SISOLength = 2;

	434 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {

	435 value[0] = JEF_SI_CHAR;

	436 SISOLength = 1;

	437 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {

	438 value[0] = JIPS_SI_CHAR_1;

	439 value[1] = JIPS_SI_CHAR_2;

	440 SISOLength = 2;

	441 } else {

	442 value[0] = UCNV_SI;

	443 SISOLength = 1;

	444 }

	445 break;

	446 case SO:

	447 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) {

	448 value[0] = KEIS_SO_CHAR_1;

	449 value[1] = KEIS_SO_CHAR_2;

	450 SISOLength = 2;

	451 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) {

	452 value[0] = JEF_SO_CHAR;

	453 SISOLength = 1;

	454 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) {

	455 value[0] = JIPS_SO_CHAR_1;

	456 value[1] = JIPS_SO_CHAR_2;

	457 SISOLength = 2;

	458 } else {

	459 value[0] = UCNV_SO;

	460 SISOLength = 1;

	461 }

	462 break;

	463 default:

	464 /* Should never happen. */

	465 break;

	466 }

	467

	468 return SISOLength;

	469 }

	470

	471 /* Miscellaneous ------------------------------------------------------------ */

	472

	473 /**

	474 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from

	475 * consecutive sequences of bytes, starting from the one encoded in value,

	476 * to Unicode code points. (Multiple mappings to reduce per-function call overhe ad.)

	477 * Does not currently support m:n mappings or reverse fallbacks.

	478 * This function will not be called for sequences of bytes with leading zeros.

	479 *

	480 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()

	481 * @param value contains 1..4 bytes of the first byte sequence, right-aligned

	482 * @param codePoints resulting Unicode code points, or negative if a byte sequen ce does

	483 * not map to anything

	484 * @return TRUE to continue enumeration, FALSE to stop

	485 */

	486 typedef UBool U_CALLCONV

	487 UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoint s[32]);

	488

	489 /* similar to ucnv_MBCSGetNextUChar() but recursive */

	490 static UBool

	491 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],

	492 int32_t state, uint32_t offset,

	493 uint32_t value,

	494 UConverterEnumToUCallback callback, const void context,

	495 UErrorCode *pErrorCode) {

	496 UChar32 codePoints[32];

	497 const int32_t *row;

	498 const uint16_t *unicodeCodeUnits;

	499 UChar32 anyCodePoints;

	500 int32_t b, limit;

	501

	502 row=mbcsTable->stateTable[state];

	503 unicodeCodeUnits=mbcsTable->unicodeCodeUnits;

	504

	505 value<<=8;

	506 anyCodePoints=-1; /* becomes non-negative if there is a mapping */

	507

	508 b=(stateProps[state]&0x38)<<2;

	509 if(b==0 && stateProps[state]>=0x40) {

	510 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */

	511 codePoints[0]=U_SENTINEL;

	512 b=1;

	513 }

	514 limit=((stateProps[state]&7)+1)<<5;

	515 while(b<limit) {

	516 int32_t entry=row[b];

	517 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

	518 int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);

	519 if(stateProps[nextState]>=0) {

	520 /* recurse to a state with non-ignorable actions */

	521 if(!enumToU(

	522 mbcsTable, stateProps, nextState,

	523 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),

	524 value\|(uint32_t)b,

	525 callback, context,

	526 pErrorCode)) {

	527 return FALSE;

	528 }

	529 }

	530 codePoints[b&0x1f]=U_SENTINEL;

	531 } else {

	532 UChar32 c;

	533 int32_t action;

	534

	535 /*

	536 * An if-else-if chain provides more reliable performance for

	537 * the most common cases compared to a switch.

	538 */

	539 action=MBCS_ENTRY_FINAL_ACTION(entry);

	540 if(action==MBCS_STATE_VALID_DIRECT_16) {

	541 /* output BMP code point */

	542 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	543 } else if(action==MBCS_STATE_VALID_16) {

	544 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);

	545 c=unicodeCodeUnits[finalOffset];

	546 if(c<0xfffe) {

	547 /* output BMP code point */

	548 } else {

	549 c=U_SENTINEL;

	550 }

	551 } else if(action==MBCS_STATE_VALID_16_PAIR) {

	552 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);

	553 c=unicodeCodeUnits[finalOffset++];

	554 if(c<0xd800) {

	555 /* output BMP code point below 0xd800 */

	556 } else if(c<=0xdbff) {

	557 /* output roundtrip or fallback supplementary code point */

	558 c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xd c00);

	559 } else if(c==0xe000) {

	560 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */

	561 c=unicodeCodeUnits[finalOffset];

	562 } else {

	563 c=U_SENTINEL;

	564 }

	565 } else if(action==MBCS_STATE_VALID_DIRECT_20) {

	566 /* output supplementary code point */

	567 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);

	568 } else {

	569 c=U_SENTINEL;

	570 }

	571

	572 codePoints[b&0x1f]=c;

	573 anyCodePoints&=c;

	574 }

	575 if(((++b)&0x1f)==0) {

	576 if(anyCodePoints>=0) {

	577 if(!callback(context, value\|(uint32_t)(b-0x20), codePoints)) {

	578 return FALSE;

	579 }

	580 anyCodePoints=-1;

	581 }

	582 }

	583 }

	584 return TRUE;

	585 }

	586

	587 /*

	588 * Only called if stateProps[state]==-1.

	589 * A recursive call may do stateProps[state]\|=0x40 if this state is the target o f an

	590 * MBCS_STATE_CHANGE_ONLY.

	591 */

	592 static int8_t

	593 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {

	594 const int32_t *row;

	595 int32_t min, max, entry, nextState;

	596

	597 row=stateTable[state];

	598 stateProps[state]=0;

	599

	600 /* find first non-ignorable state */

	601 for(min=0;; ++min) {

	602 entry=row[min];

	603 nextState=MBCS_ENTRY_STATE(entry);

	604 if(stateProps[nextState]==-1) {

	605 getStateProp(stateTable, stateProps, nextState);

	606 }

	607 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

	608 if(stateProps[nextState]>=0) {

	609 break;

	610 }

	611 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {

	612 break;

	613 }

	614 if(min==0xff) {

	615 stateProps[state]=-0x40; /* (int8_t)0xc0 */

	616 return stateProps[state];

	617 }

	618 }

	619 stateProps[state]\|=(int8_t)((min>>5)<<3);

	620

	621 /* find last non-ignorable state */

	622 for(max=0xff; min<max; --max) {

	623 entry=row[max];

	624 nextState=MBCS_ENTRY_STATE(entry);

	625 if(stateProps[nextState]==-1) {

	626 getStateProp(stateTable, stateProps, nextState);

	627 }

	628 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

	629 if(stateProps[nextState]>=0) {

	630 break;

	631 }

	632 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {

	633 break;

	634 }

	635 }

	636 stateProps[state]\|=(int8_t)(max>>5);

	637

	638 /* recurse further and collect direct-state information */

	639 while(min<=max) {

	640 entry=row[min];

	641 nextState=MBCS_ENTRY_STATE(entry);

	642 if(stateProps[nextState]==-1) {

	643 getStateProp(stateTable, stateProps, nextState);

	644 }

	645 if(MBCS_ENTRY_IS_FINAL(entry)) {

	646 stateProps[nextState]\|=0x40;

	647 if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {

	648 stateProps[state]\|=0x40;

	649 }

	650 }

	651 ++min;

	652 }

	653 return stateProps[state];

	654 }

	655

	656 /*

	657 * Internal function enumerating the toUnicode data of an MBCS converter.

	658 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U

	659 * table, but could also be used for a future ucnv_getUnicodeSet() option

	660 * that includes reverse fallbacks (after updating this function's implementatio n).

	661 * Currently only handles roundtrip mappings.

	662 * Does not currently handle extensions.

	663 */

	664 static void

	665 ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,

	666 UConverterEnumToUCallback callback, const void context,

	667 UErrorCode *pErrorCode) {

	668 /*

	669 * Properties for each state, to speed up the enumeration.

	670 * Ignorable actions are unassigned/illegal/state-change-only:

	671 * They do not lead to mappings.

	672 *

	673 * Bits 7..6:

	674 * 1 direct/initial state (stateful converters have multiple)

	675 * 0 non-initial state with transitions or with non-ignorable result actions

	676 * -1 final state with only ignorable actions

	677 *

	678 * Bits 5..3:

	679 * The lowest byte value with non-ignorable actions is

	680 * value<<5 (rounded down).

	681 *

	682 * Bits 2..0:

	683 * The highest byte value with non-ignorable actions is

	684 * (value<<5)&0x1f (rounded up).

	685 */

	686 int8_t stateProps[MBCS_MAX_STATE_COUNT];

	687 int32_t state;

	688

	689 uprv_memset(stateProps, -1, sizeof(stateProps));

	690

	691 /* recurse from state 0 and set all stateProps */

	692 getStateProp(mbcsTable->stateTable, stateProps, 0);

	693

	694 for(state=0; state<mbcsTable->countStates; ++state) {

	695 /*if(stateProps[state]==-1) {

	696 printf("unused/unreachable <icu:state> %d\n", state);

	697 }*/

	698 if(stateProps[state]>=0x40) {

	699 /* start from each direct state */

	700 enumToU(

	701 mbcsTable, stateProps, state, 0, 0,

	702 callback, context,

	703 pErrorCode);

	704 }

	705 }

	706 }

	707

	708 U_CFUNC void

	709 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,

	710 const USetAdder *sa,

	711 UConverterUnicodeSet which,

	712 UConverterSetFilter filter,

	713 UErrorCode *pErrorCode) {

	714 const UConverterMBCSTable *mbcsTable;

	715 const uint16_t *table;

	716

	717 uint32_t st3;

	718 uint16_t st1, maxStage1, st2;

	719

	720 UChar32 c;

	721

	722 /* enumerate the from-Unicode trie table */

	723 mbcsTable=&sharedData->mbcs;

	724 table=mbcsTable->fromUnicodeTable;

	725 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {

	726 maxStage1=0x440;

	727 } else {

	728 maxStage1=0x40;

	729 }

	730

	731 c=0; /* keep track of the current code point while enumerating */

	732

	733 if(mbcsTable->outputType==MBCS_OUTPUT_1) {

	734 const uint16_t stage2, stage3, *results;

	735 uint16_t minValue;

	736

	737 results=(const uint16_t *)mbcsTable->fromUnicodeBytes;

	738

	739 /*

	740 * Set a threshold variable for selecting which mappings to use.

	741 * See ucnv_MBCSSingleFromBMPWithOffsets() and

	742 * MBCS_SINGLE_RESULT_FROM_U() for details.

	743 */

	744 if(which==UCNV_ROUNDTRIP_SET) {

	745 /* use only roundtrips */

	746 minValue=0xf00;

	747 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {

	748 /* use all roundtrip and fallback results */

	749 minValue=0x800;

	750 }

	751

	752 for(st1=0; st1<maxStage1; ++st1) {

	753 st2=table[st1];

	754 if(st2>maxStage1) {

	755 stage2=table+st2;

	756 for(st2=0; st2<64; ++st2) {

	757 if((st3=stage2[st2])!=0) {

	758 /* read the stage 3 block */

	759 stage3=results+st3;

	760

	761 do {

	762 if(*stage3++>=minValue) {

	763 sa->add(sa->set, c);

	764 }

	765 } while((++c&0xf)!=0);

	766 } else {

	767 c+=16; /* empty stage 3 block */

	768 }

	769 }

	770 } else {

	771 c+=1024; /* empty stage 2 block */

	772 }

	773 }

	774 } else {

	775 const uint32_t *stage2;

	776 const uint8_t stage3, bytes;

	777 uint32_t st3Multiplier;

	778 uint32_t value;

	779 UBool useFallback;

	780

	781 bytes=mbcsTable->fromUnicodeBytes;

	782

	783 useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);

	784

	785 switch(mbcsTable->outputType) {

	786 case MBCS_OUTPUT_3:

	787 case MBCS_OUTPUT_4_EUC:

	788 st3Multiplier=3;

	789 break;

	790 case MBCS_OUTPUT_4:

	791 st3Multiplier=4;

	792 break;

	793 default:

	794 st3Multiplier=2;

	795 break;

	796 }

	797

	798 for(st1=0; st1<maxStage1; ++st1) {

	799 st2=table[st1];

	800 if(st2>(maxStage1>>1)) {

	801 stage2=(const uint32_t *)table+st2;

	802 for(st2=0; st2<64; ++st2) {

	803 if((st3=stage2[st2])!=0) {

	804 /* read the stage 3 block */

	805 stage3=bytes+st3Multiplier16(uint32_t)(uint16_t)st3;

	806

	807 /* get the roundtrip flags for the stage 3 block */

	808 st3>>=16;

	809

	810 /*

	811 * Add code points for which the roundtrip flag is set,

	812 * or which map to non-zero bytes if we use fallbacks.

	813 * See ucnv_MBCSFromUnicodeWithOffsets() for details.

	814 */

	815 switch(filter) {

	816 case UCNV_SET_FILTER_NONE:

	817 do {

	818 if(st3&1) {

	819 sa->add(sa->set, c);

	820 stage3+=st3Multiplier;

	821 } else if(useFallback) {

	822 uint8_t b=0;

	823 switch(st3Multiplier) {

	824 case 4:

	825 b\|=*stage3++;

	826 case 3:

	827 b\|=*stage3++;

	828 case 2:

	829 b\|=stage3[0]\|stage3[1];

	830 stage3+=2;

	831 default:

	832 break;

	833 }

	834 if(b!=0) {

	835 sa->add(sa->set, c);

	836 }

	837 }

	838 st3>>=1;

	839 } while((++c&0xf)!=0);

	840 break;

	841 case UCNV_SET_FILTER_DBCS_ONLY:

	842 /* Ignore single-byte results (<0x100). */

	843 do {

	844 if(((st3&1)!=0 \|\| useFallback) && ((const uint1 6_t )stage3)>=0x100) {

	845 sa->add(sa->set, c);

	846 }

	847 st3>>=1;

	848 stage3+=2; /* +=st3Multiplier */

	849 } while((++c&0xf)!=0);

	850 break;

	851 case UCNV_SET_FILTER_2022_CN:

	852 /* Only add code points that map to CNS 11643 plane s 1 & 2 for non-EXT ISO-2022-CN. */

	853 do {

	854 if(((st3&1)!=0 \|\| useFallback) && ((value=*stage 3)==0x81 \|\| value==0x82)) {

	855 sa->add(sa->set, c);

	856 }

	857 st3>>=1;

	858 stage3+=3; /* +=st3Multiplier */

	859 } while((++c&0xf)!=0);

	860 break;

	861 case UCNV_SET_FILTER_SJIS:

	862 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */

	863 do {

	864 if(((st3&1)!=0 \|\| useFallback) && (value=((cons t uint16_t )stage3))>=0x8140 && value<=0xeffc) {

	865 sa->add(sa->set, c);

	866 }

	867 st3>>=1;

	868 stage3+=2; /* +=st3Multiplier */

	869 } while((++c&0xf)!=0);

	870 break;

	871 case UCNV_SET_FILTER_GR94DBCS:

	872 /* Only add code points that map to ISO 2022 GR 94 D BCS codes (each byte A1..FE). */

	873 do {

	874 if( ((st3&1)!=0 \|\| useFallback) &&

	875 (uint16_t)((value=((const uint16_t )stage3 )) - 0xa1a1)<=(0xfefe - 0xa1a1) &&

	876 (uint8_t)(value-0xa1)<=(0xfe - 0xa1)

	877 ) {

	878 sa->add(sa->set, c);

	879 }

	880 st3>>=1;

	881 stage3+=2; /* +=st3Multiplier */

	882 } while((++c&0xf)!=0);

	883 break;

	884 case UCNV_SET_FILTER_HZ:

	885 /* Only add code points that are suitable for HZ DBC S (lead byte A1..FD). */

	886 do {

	887 if( ((st3&1)!=0 \|\| useFallback) &&

	888 (uint16_t)((value=((const uint16_t )stage3 ))-0xa1a1)<=(0xfdfe - 0xa1a1) &&

	889 (uint8_t)(value-0xa1)<=(0xfe - 0xa1)

	890 ) {

	891 sa->add(sa->set, c);

	892 }

	893 st3>>=1;

	894 stage3+=2; /* +=st3Multiplier */

	895 } while((++c&0xf)!=0);

	896 break;

	897 default:

	898 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;

	899 return;

	900 }

	901 } else {

	902 c+=16; /* empty stage 3 block */

	903 }

	904 }

	905 } else {

	906 c+=1024; /* empty stage 2 block */

	907 }

	908 }

	909 }

	910

	911 ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);

	912 }

	913

	914 U_CFUNC void

	915 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,

	916 const USetAdder *sa,

	917 UConverterUnicodeSet which,

	918 UErrorCode *pErrorCode) {

	919 ucnv_MBCSGetFilteredUnicodeSetForUnicode(

	920 sharedData, sa, which,

	921 sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?

	922 UCNV_SET_FILTER_DBCS_ONLY :

	923 UCNV_SET_FILTER_NONE,

	924 pErrorCode);

	925 }

	926

	927 static void

	928 ucnv_MBCSGetUnicodeSet(const UConverter *cnv,

	929 const USetAdder *sa,

	930 UConverterUnicodeSet which,

	931 UErrorCode *pErrorCode) {

	932 if(cnv->options&_MBCS_OPTION_GB18030) {

	933 sa->addRange(sa->set, 0, 0xd7ff);

	934 sa->addRange(sa->set, 0xe000, 0x10ffff);

	935 } else {

	936 ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode) ;

	937 }

	938 }

	939

	940 /* conversion extensions for input not in the main table -------------------- */

	941

	942 /*

	943 * Hardcoded extension handling for GB 18030.

	944 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file.

	945 *

	946 * In the future, conversion extensions may handle m:n mappings and delta tables,

	947 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html

	948 *

	949 * If an input character cannot be mapped, then these functions set an error

	950 * code. The framework will then call the callback function.

	951 */

	952

	953 /*

	954 * @return if(U_FAILURE) return the code point for cnv->fromUChar32

	955 * else return 0 after output has been written to the target

	956 */

	957 static UChar32

	958 _extFromU(UConverter cnv, const UConverterSharedData sharedData,

	959 UChar32 cp,

	960 const UChar *source, const UChar sourceLimit,

	961 uint8_t *target, const uint8_t targetLimit,

	962 int32_t **offsets, int32_t sourceIndex,

	963 UBool flush,

	964 UErrorCode *pErrorCode) {

	965 const int32_t *cx;

	966

	967 cnv->useSubChar1=FALSE;

	968

	969 if( (cx=sharedData->mbcs.extIndexes)!=NULL &&

	970 ucnv_extInitialMatchFromU(

	971 cnv, cx,

	972 cp, source, sourceLimit,

	973 (char *)target, (char )targetLimit,

	974 offsets, sourceIndex,

	975 flush,

	976 pErrorCode)

	977 ) {

	978 return 0; /* an extension mapping handled the input */

	979 }

	980

	981 /* GB 18030 */

	982 if((cnv->options&_MBCS_OPTION_GB18030)!=0) {

	983 const uint32_t *range;

	984 int32_t i;

	985

	986 range=gb18030Ranges[0];

	987 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i ) {

	988 if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) {

	989 /* found the Unicode code point, output the four-byte sequence f or it */

	990 uint32_t linear;

	991 char bytes[4];

	992

	993 /* get the linear value of the first GB 18030 code in this range */

	994 linear=range[2]-LINEAR_18030_BASE;

	995

	996 /* add the offset from the beginning of the range */

	997 linear+=((uint32_t)cp-range[0]);

	998

	999 /* turn this into a four-byte sequence */

	1000 bytes[3]=(char)(0x30+linear%10); linear/=10;

	1001 bytes[2]=(char)(0x81+linear%126); linear/=126;

	1002 bytes[1]=(char)(0x30+linear%10); linear/=10;

	1003 bytes[0]=(char)(0x81+linear);

	1004

	1005 /* output this sequence */

	1006 ucnv_fromUWriteBytes(cnv,

	1007 bytes, 4, (char *)target, (char )targetLi mit,

	1008 offsets, sourceIndex, pErrorCode);

	1009 return 0;

	1010 }

	1011 }

	1012 }

	1013

	1014 /* no mapping */

	1015 *pErrorCode=U_INVALID_CHAR_FOUND;

	1016 return cp;

	1017 }

	1018

	1019 /*

	1020 * Input sequence: cnv->toUBytes[0..length[

	1021 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input

	1022 * else return 0 after output has been written to the target

	1023 */

	1024 static int8_t

	1025 _extToU(UConverter cnv, const UConverterSharedData sharedData,

	1026 int8_t length,

	1027 const uint8_t *source, const uint8_t sourceLimit,

	1028 UChar *target, const UChar targetLimit,

	1029 int32_t **offsets, int32_t sourceIndex,

	1030 UBool flush,

	1031 UErrorCode *pErrorCode) {

	1032 const int32_t *cx;

	1033

	1034 if( (cx=sharedData->mbcs.extIndexes)!=NULL &&

	1035 ucnv_extInitialMatchToU(

	1036 cnv, cx,

	1037 length, (const char *)source, (const char )sourceLimit,

	1038 target, targetLimit,

	1039 offsets, sourceIndex,

	1040 flush,

	1041 pErrorCode)

	1042 ) {

	1043 return 0; /* an extension mapping handled the input */

	1044 }

	1045

	1046 /* GB 18030 */

	1047 if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {

	1048 const uint32_t *range;

	1049 uint32_t linear;

	1050 int32_t i;

	1051

	1052 linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2] , cnv->toUBytes[3]);

	1053 range=gb18030Ranges[0];

	1054 for(i=0; i<sizeof(gb18030Ranges)/sizeof(gb18030Ranges[0]); range+=4, ++i ) {

	1055 if(range[2]<=linear && linear<=range[3]) {

	1056 /* found the sequence, output the Unicode code point for it */

	1057 *pErrorCode=U_ZERO_ERROR;

	1058

	1059 /* add the linear difference between the input and start sequenc es to the start code point */

	1060 linear=range[0]+(linear-range[2]);

	1061

	1062 /* output this code point */

	1063 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets , sourceIndex, pErrorCode);

	1064

	1065 return 0;

	1066 }

	1067 }

	1068 }

	1069

	1070 /* no mapping */

	1071 *pErrorCode=U_INVALID_CHAR_FOUND;

	1072 return length;

	1073 }

	1074

	1075 /* EBCDIC swap LF<->NL ------------------------------------------------------ */

	1076

	1077 /*

	1078 * This code modifies a standard EBCDIC<->Unicode mapping table for

	1079 * OS/390 (z/OS) Unix System Services (Open Edition).

	1080 * The difference is in the mapping of Line Feed and New Line control codes:

	1081 * Standard EBCDIC maps

	1082 *

	1083 * <U000A> \x25 |0

	1084 * <U0085> \x15 |0

	1085 *

	1086 * but OS/390 USS EBCDIC swaps the control codes for LF and NL,

	1087 * mapping

	1088 *

	1089 * <U000A> \x15 |0

	1090 * <U0085> \x25 |0

	1091 *

	1092 * This code modifies a loaded standard EBCDIC<->Unicode mapping table

	1093 * by copying it into allocated memory and swapping the LF and NL values.

	1094 * It allows to support the same EBCDIC charset in both versions without

	1095 * duplicating the entire installed table.

	1096 */

	1097

	1098 /* standard EBCDIC codes */

	1099 #define EBCDIC_LF 0x25

	1100 #define EBCDIC_NL 0x15

	1101

	1102 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */

	1103 #define EBCDIC_RT_LF 0xf25

	1104 #define EBCDIC_RT_NL 0xf15

	1105

	1106 /* Unicode code points */

	1107 #define U_LF 0x0a

	1108 #define U_NL 0x85

	1109

	1110 static UBool

	1111 _EBCDICSwapLFNL(UConverterSharedData sharedData, UErrorCode pErrorCode) {

	1112 UConverterMBCSTable *mbcsTable;

	1113

	1114 const uint16_t table, results;

	1115 const uint8_t *bytes;

	1116

	1117 int32_t (*newStateTable)[256];

	1118 uint16_t *newResults;

	1119 uint8_t *p;

	1120 char *name;

	1121

	1122 uint32_t stage2Entry;

	1123 uint32_t size, sizeofFromUBytes;

	1124

	1125 mbcsTable=&sharedData->mbcs;

	1126

	1127 table=mbcsTable->fromUnicodeTable;

	1128 bytes=mbcsTable->fromUnicodeBytes;

	1129 results=(const uint16_t *)bytes;

	1130

	1131 /*

	1132 * Check that this is an EBCDIC table with SBCS portion -

	1133 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings.

	1134 *

	1135 * If not, ignore the option. Options are always ignored if they do not appl y.

	1136 */

	1137 if(!(

	1138 (mbcsTable->outputType==MBCS_OUTPUT_1 \|\| mbcsTable->outputType==MBCS_OU TPUT_2_SISO) &&

	1139 mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VAL ID_DIRECT_16, U_LF) &&

	1140 mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VAL ID_DIRECT_16, U_NL)

	1141 )) {

	1142 return FALSE;

	1143 }

	1144

	1145 if(mbcsTable->outputType==MBCS_OUTPUT_1) {

	1146 if(!(

	1147 EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) &&

	1148 EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL)

	1149 )) {

	1150 return FALSE;

	1151 }

	1152 } else /* MBCS_OUTPUT_2_SISO */ {

	1153 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);

	1154 if(!(

	1155 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 &&

	1156 EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF)

	1157 )) {

	1158 return FALSE;

	1159 }

	1160

	1161 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);

	1162 if(!(

	1163 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 &&

	1164 EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL)

	1165 )) {

	1166 return FALSE;

	1167 }

	1168 }

	1169

	1170 if(mbcsTable->fromUBytesLength>0) {

	1171 /*

	1172 * We _know_ the number of bytes in the fromUnicodeBytes array

	1173 * starting with header.version 4.1.

	1174 */

	1175 sizeofFromUBytes=mbcsTable->fromUBytesLength;

	1176 } else {

	1177 /*

	1178 * Otherwise:

	1179 * There used to be code to enumerate the fromUnicode

	1180 * trie and find the highest entry, but it was removed in ICU 3.2

	1181 * because it was not tested and caused a low code coverage number.

	1182 * See Jitterbug 3674.

	1183 * This affects only some .cnv file formats with a header.version

	1184 * below 4.1, and only when swaplfnl is requested.

	1185 *

	1186 * ucnvmbcs.c revision 1.99 is the last one with the

	1187 * ucnv_MBCSSizeofFromUBytes() function.

	1188 */

	1189 *pErrorCode=U_INVALID_FORMAT_ERROR;

	1190 return FALSE;

	1191 }

	1192

	1193 /*

	1194 * The table has an appropriate format.

	1195 * Allocate and build

	1196 * - a modified to-Unicode state table

	1197 * - a modified from-Unicode output array

	1198 * - a converter name string with the swap option appended

	1199 */

	1200 size=

	1201 mbcsTable->countStates*1024+

	1202 sizeofFromUBytes+

	1203 UCNV_MAX_CONVERTER_NAME_LENGTH+20;

	1204 p=(uint8_t *)uprv_malloc(size);

	1205 if(p==NULL) {

	1206 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;

	1207 return FALSE;

	1208 }

	1209

	1210 /* copy and modify the to-Unicode state table */

	1211 newStateTable=(int32_t (*)[256])p;

	1212 uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*102 4);

	1213

	1214 newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL);

	1215 newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF);

	1216

	1217 /* copy and modify the from-Unicode result table */

	1218 newResults=(uint16_t *)newStateTable[mbcsTable->countStates];

	1219 uprv_memcpy(newResults, bytes, sizeofFromUBytes);

	1220

	1221 /* conveniently, the table access macros work on the left side of expression s */

	1222 if(mbcsTable->outputType==MBCS_OUTPUT_1) {

	1223 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL;

	1224 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF;

	1225 } else /* MBCS_OUTPUT_2_SISO */ {

	1226 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF);

	1227 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL;

	1228

	1229 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL);

	1230 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF;

	1231 }

	1232

	1233 /* set the canonical converter name */

	1234 name=(char *)newResults+sizeofFromUBytes;

	1235 uprv_strcpy(name, sharedData->staticData->name);

	1236 uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING);

	1237

	1238 /* set the pointers */

	1239 umtx_lock(NULL);

	1240 if(mbcsTable->swapLFNLStateTable==NULL) {

	1241 mbcsTable->swapLFNLStateTable=newStateTable;

	1242 mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults;

	1243 mbcsTable->swapLFNLName=name;

	1244

	1245 newStateTable=NULL;

	1246 }

	1247 umtx_unlock(NULL);

	1248

	1249 /* release the allocated memory if another thread beat us to it */

	1250 if(newStateTable!=NULL) {

	1251 uprv_free(newStateTable);

	1252 }

	1253 return TRUE;

	1254 }

	1255

	1256 /* reconstitute omitted fromUnicode data ------------------------------------ */

	1257

	1258 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() * /

	1259 static UBool U_CALLCONV

	1260 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32] ) {

	1261 UConverterMBCSTable mbcsTable=(UConverterMBCSTable )context;

	1262 const uint16_t *table;

	1263 uint32_t *stage2;

	1264 uint8_t bytes, p;

	1265 UChar32 c;

	1266 int32_t i, st3;

	1267

	1268 table=mbcsTable->fromUnicodeTable;

	1269 bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;

	1270

	1271 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */

	1272 switch(mbcsTable->outputType) {

	1273 case MBCS_OUTPUT_3_EUC:

	1274 if(value<=0xffff) {

	1275 /* short sequences are stored directly */

	1276 /* code set 0 or 1 */

	1277 } else if(value<=0x8effff) {

	1278 /* code set 2 */

	1279 value&=0x7fff;

	1280 } else /* first byte is 0x8f */ {

	1281 /* code set 3 */

	1282 value&=0xff7f;

	1283 }

	1284 break;

	1285 case MBCS_OUTPUT_4_EUC:

	1286 if(value<=0xffffff) {

	1287 /* short sequences are stored directly */

	1288 /* code set 0 or 1 */

	1289 } else if(value<=0x8effffff) {

	1290 /* code set 2 */

	1291 value&=0x7fffff;

	1292 } else /* first byte is 0x8f */ {

	1293 /* code set 3 */

	1294 value&=0xff7fff;

	1295 }

	1296 break;

	1297 default:

	1298 break;

	1299 }

	1300

	1301 for(i=0; i<=0x1f; ++value, ++i) {

	1302 c=codePoints[i];

	1303 if(c<0) {

	1304 continue;

	1305 }

	1306

	1307 /* locate the stage 2 & 3 data */

	1308 stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);

	1309 p=bytes;

	1310 st3=(int32_t)(uint16_t)stage216+(c&0xf);

	1311

	1312 /* write the codepage bytes into stage 3 */

	1313 switch(mbcsTable->outputType) {

	1314 case MBCS_OUTPUT_3:

	1315 case MBCS_OUTPUT_4_EUC:

	1316 p+=st3*3;

	1317 p[0]=(uint8_t)(value>>16);

	1318 p[1]=(uint8_t)(value>>8);

	1319 p[2]=(uint8_t)value;

	1320 break;

	1321 case MBCS_OUTPUT_4:

	1322 ((uint32_t *)p)[st3]=value;

	1323 break;

	1324 default:

	1325 /* 2 bytes per character */

	1326 ((uint16_t *)p)[st3]=(uint16_t)value;

	1327 break;

	1328 }

	1329

	1330 /* set the roundtrip flag */

	1331 *stage2\|=(1UL<<(16+(c&0xf)));

	1332 }

	1333 return TRUE;

	1334 }

	1335

	1336 static void

	1337 reconstituteData(UConverterMBCSTable *mbcsTable,

	1338 uint32_t stage1Length, uint32_t stage2Length,

	1339 uint32_t fullStage2Length, /* lengths are numbers of units, no t bytes */

	1340 UErrorCode *pErrorCode) {

	1341 uint16_t *stage1;

	1342 uint32_t *stage2;

	1343 uint8_t *bytes;

	1344 uint32_t dataLength=stage1Length2+fullStage2Length4+mbcsTable->fromUBytesL ength;

	1345 mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);

	1346 if(mbcsTable->reconstitutedData==NULL) {

	1347 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;

	1348 return;

	1349 }

	1350 uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);

	1351

	1352 /* copy existing data and reroute the pointers */

	1353 stage1=(uint16_t *)mbcsTable->reconstitutedData;

	1354 uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);

	1355

	1356 stage2=(uint32_t *)(stage1+stage1Length);

	1357 uprv_memcpy(stage2+(fullStage2Length-stage2Length),

	1358 mbcsTable->fromUnicodeTable+stage1Length,

	1359 stage2Length*4);

	1360

	1361 mbcsTable->fromUnicodeTable=stage1;

	1362 mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length);

	1363

	1364 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */

	1365 stage2=(uint32_t *)stage1;

	1366

	1367 /* reconstitute the initial part of stage 2 from the mbcsIndex */

	1368 {

	1369 int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;

	1370 int32_t stageUTF8Index=0;

	1371 int32_t st1, st2, st3, i;

	1372

	1373 for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {

	1374 st2=stage1[st1];

	1375 if(st2!=stage1Length/2) {

	1376 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */

	1377 for(i=0; i<16; ++i) {

	1378 st3=mbcsTable->mbcsIndex[stageUTF8Index++];

	1379 if(st3!=0) {

	1380 /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */

	1381 st3>>=4;

	1382 /*

	1383 * 4 stage 2 entries point to 4 consecutive stage 3 16-b locks which are

	1384 * allocated together as a single 64-block for access fr om the mbcsIndex

	1385 */

	1386 stage2[st2++]=st3++;

	1387 stage2[st2++]=st3++;

	1388 stage2[st2++]=st3++;

	1389 stage2[st2++]=st3;

	1390 } else {

	1391 /* no stage 3 block, skip */

	1392 st2+=4;

	1393 }

	1394 }

	1395 } else {

	1396 /* no stage 2 block, skip */

	1397 stageUTF8Index+=16;

	1398 }

	1399 }

	1400 }

	1401

	1402 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */

	1403 ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCod e);

	1404 }

	1405

	1406 /* MBCS setup functions ----------------------------------------------------- */

	1407

	1408 static void

	1409 ucnv_MBCSLoad(UConverterSharedData *sharedData,

	1410 UConverterLoadArgs *pArgs,

	1411 const uint8_t *raw,

	1412 UErrorCode *pErrorCode) {

	1413 UDataInfo info;

	1414 UConverterMBCSTable *mbcsTable=&sharedData->mbcs;

	1415 _MBCSHeader header=(_MBCSHeader )raw;

	1416 uint32_t offset;

	1417 uint32_t headerLength;

	1418 UBool noFromU=FALSE;

	1419

	1420 if(header->version[0]==4) {

	1421 headerLength=MBCS_HEADER_V4_LENGTH;

	1422 } else if(header->version[0]==5 && header->version[1]>=3 &&

	1423 (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {

	1424 headerLength=header->options&MBCS_OPT_LENGTH_MASK;

	1425 noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);

	1426 } else {

	1427 *pErrorCode=U_INVALID_TABLE_FORMAT;

	1428 return;

	1429 }

	1430

	1431 mbcsTable->outputType=(uint8_t)header->flags;

	1432 if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {

	1433 *pErrorCode=U_INVALID_TABLE_FORMAT;

	1434 return;

	1435 }

	1436

	1437 /* extension data, header version 4.2 and higher */

	1438 offset=header->flags>>8;

	1439 if(offset!=0) {

	1440 mbcsTable->extIndexes=(const int32_t *)(raw+offset);

	1441 }

	1442

	1443 if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) {

	1444 UConverterLoadArgs args={ 0 };

	1445 UConverterSharedData *baseSharedData;

	1446 const int32_t *extIndexes;

	1447 const char *baseName;

	1448

	1449 /* extension-only file, load the base table and set values appropriately */

	1450 if((extIndexes=mbcsTable->extIndexes)==NULL) {

	1451 /* extension-only file without extension */

	1452 *pErrorCode=U_INVALID_TABLE_FORMAT;

	1453 return;

	1454 }

	1455

	1456 if(pArgs->nestedLoads!=1) {

	1457 /* an extension table must not be loaded as a base table */

	1458 *pErrorCode=U_INVALID_TABLE_FILE;

	1459 return;

	1460 }

	1461

	1462 /* load the base table */

	1463 baseName=(const char )header+headerLength4;

	1464 if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {

	1465 /* forbid loading this same extension-only file */

	1466 *pErrorCode=U_INVALID_TABLE_FORMAT;

	1467 return;

	1468 }

	1469

	1470 /* TODO parse package name out of the prefix of the base name in the ext ension .cnv file? */

	1471 args.size=sizeof(UConverterLoadArgs);

	1472 args.nestedLoads=2;

	1473 args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable;

	1474 args.reserved=pArgs->reserved;

	1475 args.options=pArgs->options;

	1476 args.pkg=pArgs->pkg;

	1477 args.name=baseName;

	1478 baseSharedData=ucnv_load(&args, pErrorCode);

	1479 if(U_FAILURE(*pErrorCode)) {

	1480 return;

	1481 }

	1482 if( baseSharedData->staticData->conversionType!=UCNV_MBCS \|\|

	1483 baseSharedData->mbcs.baseSharedData!=NULL

	1484 ) {

	1485 ucnv_unload(baseSharedData);

	1486 *pErrorCode=U_INVALID_TABLE_FORMAT;

	1487 return;

	1488 }

	1489 if(pArgs->onlyTestIsLoadable) {

	1490 /*

	1491 * Exit as soon as we know that we can load the converter

	1492 * and the format is valid and supported.

	1493 * The worst that can happen in the following code is a memory

	1494 * allocation error.

	1495 */

	1496 ucnv_unload(baseSharedData);

	1497 return;

	1498 }

	1499

	1500 /* copy the base table data */

	1501 uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable ));

	1502

	1503 /* overwrite values with relevant ones for the extension converter */

	1504 mbcsTable->baseSharedData=baseSharedData;

	1505 mbcsTable->extIndexes=extIndexes;

	1506

	1507 /*

	1508 * It would be possible to share the swapLFNL data with a base converter ,

	1509 * but the generated name would have to be different, and the memory

	1510 * would have to be free'd only once.

	1511 * It is easier to just create the data for the extension converter

	1512 * separately when it is requested.

	1513 */

	1514 mbcsTable->swapLFNLStateTable=NULL;

	1515 mbcsTable->swapLFNLFromUnicodeBytes=NULL;

	1516 mbcsTable->swapLFNLName=NULL;

	1517

	1518 /*

	1519 * The reconstitutedData must be deleted only when the base converter

	1520 * is unloaded.

	1521 */

	1522 mbcsTable->reconstitutedData=NULL;

	1523

	1524 /*

	1525 * Set a special, runtime-only outputType if the extension converter

	1526 * is a DBCS version of a base converter that also maps single bytes.

	1527 */

	1528 if( sharedData->staticData->conversionType==UCNV_DBCS \|\|

	1529 (sharedData->staticData->conversionType==UCNV_MBCS &&

	1530 sharedData->staticData->minBytesPerChar>=2)

	1531 ) {

	1532 if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) {

	1533 /* the base converter is SI/SO-stateful */

	1534 int32_t entry;

	1535

	1536 /* get the dbcs state from the state table entry for SO=0x0e */

	1537 entry=mbcsTable->stateTable[0][0xe];

	1538 if( MBCS_ENTRY_IS_FINAL(entry) &&

	1539 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY &&

	1540 MBCS_ENTRY_FINAL_STATE(entry)!=0

	1541 ) {

	1542 mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(ent ry);

	1543

	1544 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;

	1545 }

	1546 } else if(

	1547 baseSharedData->staticData->conversionType==UCNV_MBCS &&

	1548 baseSharedData->staticData->minBytesPerChar==1 &&

	1549 baseSharedData->staticData->maxBytesPerChar==2 &&

	1550 mbcsTable->countStates<=127

	1551 ) {

	1552 /* non-stateful base converter, need to modify the state table * /

	1553 int32_t (*newStateTable)[256];

	1554 int32_t *state;

	1555 int32_t i, count;

	1556

	1557 /* allocate a new state table and copy the base state table cont ents */

	1558 count=mbcsTable->countStates;

	1559 newStateTable=(int32_t ()[256])uprv_malloc((count+1)1024);

	1560 if(newStateTable==NULL) {

	1561 ucnv_unload(baseSharedData);

	1562 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;

	1563 return;

	1564 }

	1565

	1566 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024);

	1567

	1568 /* change all final single-byte entries to go to a new all-illeg al state */

	1569 state=newStateTable[0];

	1570 for(i=0; i<256; ++i) {

	1571 if(MBCS_ENTRY_IS_FINAL(state[i])) {

	1572 state[i]=MBCS_ENTRY_TRANSITION(count, 0);

	1573 }

	1574 }

	1575

	1576 /* build the new all-illegal state */

	1577 state=newStateTable[count];

	1578 for(i=0; i<256; ++i) {

	1579 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0);

	1580 }

	1581 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable;

	1582 mbcsTable->countStates=(uint8_t)(count+1);

	1583 mbcsTable->stateTableOwned=TRUE;

	1584

	1585 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY;

	1586 }

	1587 }

	1588

	1589 /*

	1590 * unlike below for files with base tables, do not get the unicodeMask

	1591 * from the sharedData; instead, use the base table's unicodeMask,

	1592 * which we copied in the memcpy above;

	1593 * this is necessary because the static data unicodeMask, especially

	1594 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data

	1595 */

	1596 } else {

	1597 /* conversion file with a base table; an additional extension table is o ptional */

	1598 /* make sure that the output type is known */

	1599 switch(mbcsTable->outputType) {

	1600 case MBCS_OUTPUT_1:

	1601 case MBCS_OUTPUT_2:

	1602 case MBCS_OUTPUT_3:

	1603 case MBCS_OUTPUT_4:

	1604 case MBCS_OUTPUT_3_EUC:

	1605 case MBCS_OUTPUT_4_EUC:

	1606 case MBCS_OUTPUT_2_SISO:

	1607 /* OK */

	1608 break;

	1609 default:

	1610 *pErrorCode=U_INVALID_TABLE_FORMAT;

	1611 return;

	1612 }

	1613 if(pArgs->onlyTestIsLoadable) {

	1614 /*

	1615 * Exit as soon as we know that we can load the converter

	1616 * and the format is valid and supported.

	1617 * The worst that can happen in the following code is a memory

	1618 * allocation error.

	1619 */

	1620 return;

	1621 }

	1622

	1623 mbcsTable->countStates=(uint8_t)header->countStates;

	1624 mbcsTable->countToUFallbacks=header->countToUFallbacks;

	1625 mbcsTable->stateTable=(const int32_t ()[256])(raw+headerLength4);

	1626 mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable +header->countStates);

	1627 mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCode Units);

	1628

	1629 mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTa ble);

	1630 mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUByt es);

	1631 mbcsTable->fromUBytesLength=header->fromUBytesLength;

	1632

	1633 /*

	1634 * converter versions 6.1 and up contain a unicodeMask that is

	1635 * used here to select the most efficient function implementations

	1636 */

	1637 info.size=sizeof(UDataInfo);

	1638 udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);

	1639 if(info.formatVersion[0]>6 \|\| (info.formatVersion[0]==6 && info.formatVe rsion[1]>=1)) {

	1640 /* mask off possible future extensions to be safe */

	1641 mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask &3);

	1642 } else {

	1643 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */

	1644 mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY\|UCNV_HAS_SURROGATES;

	1645 }

	1646

	1647 /*

	1648 * _MBCSHeader.version 4.3 adds utf8Friendly data structures.

	1649 * Check for the header version, SBCS vs. MBCS, and for whether the

	1650 * data structures are optimized for code points as high as what the

	1651 * runtime code is designed for.

	1652 * The implementation does not handle mapping tables with entries for

	1653 * unpaired surrogates.

	1654 */

	1655 if( header->version[1]>=3 &&

	1656 (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 &&

	1657 (mbcsTable->countStates==1 ?

	1658 (header->version[2]>=(SBCS_FAST_MAX>>8)) :

	1659 (header->version[2]>=(MBCS_FAST_MAX>>8))

	1660 )

	1661 ) {

	1662 mbcsTable->utf8Friendly=TRUE;

	1663

	1664 if(mbcsTable->countStates==1) {

	1665 /*

	1666 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBC S_FAST_MAX or higher.

	1667 * Build a table with indexes to each block, to be used instead of

	1668 * the regular stage 1/2 table.

	1669 */

	1670 int32_t i;

	1671 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) {

	1672 mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTabl e->fromUnicodeTable[i>>4]+((i<<2)&0x3c)];

	1673 }

	1674 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */

	1675 mbcsTable->maxFastUChar=SBCS_FAST_MAX;

	1676 } else {

	1677 /*

	1678 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBC S_FAST_MAX or higher.

	1679 * The .cnv file is prebuilt with an additional stage table with indexes

	1680 * to each block.

	1681 */

	1682 mbcsTable->mbcsIndex=(const uint16_t *)

	1683 (mbcsTable->fromUnicodeBytes+

	1684 (noFromU ? 0 : mbcsTable->fromUBytesLength));

	1685 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)\|0xff;

	1686 }

	1687 }

	1688

	1689 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */

	1690 {

	1691 uint32_t asciiRoundtrips=0xffffffff;

	1692 int32_t i;

	1693

	1694 for(i=0; i<0x80; ++i) {

	1695 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_V ALID_DIRECT_16, i)) {

	1696 asciiRoundtrips&=~((uint32_t)1<<(i>>2));

	1697 }

	1698 }

	1699 mbcsTable->asciiRoundtrips=asciiRoundtrips;

	1700 }

	1701

	1702 if(noFromU) {

	1703 uint32_t stage1Length=

	1704 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?

	1705 0x440 : 0x40;

	1706 uint32_t stage2Length=

	1707 (header->offsetFromUBytes-header->offsetFromUTable)/4-

	1708 stage1Length/2;

	1709 reconstituteData(mbcsTable, stage1Length, stage2Length, header->full Stage2Length, pErrorCode);

	1710 }

	1711 }

	1712

	1713 /* Set the impl pointer here so that it is set for both extension-only and b ase tables. */

	1714 if(mbcsTable->utf8Friendly) {

	1715 if(mbcsTable->countStates==1) {

	1716 sharedData->impl=&_SBCSUTF8Impl;

	1717 } else {

	1718 if(mbcsTable->outputType==MBCS_OUTPUT_2) {

	1719 sharedData->impl=&_DBCSUTF8Impl;

	1720 }

	1721 }

	1722 }

	1723

	1724 if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY \|\| mbcsTable->outputType==MB CS_OUTPUT_2_SISO) {

	1725 /*

	1726 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not rou ndtrip.

	1727 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength co rrectly.

	1728 */

	1729 mbcsTable->asciiRoundtrips=0;

	1730 }

	1731 }

	1732

	1733 static void

	1734 ucnv_MBCSUnload(UConverterSharedData *sharedData) {

	1735 UConverterMBCSTable *mbcsTable=&sharedData->mbcs;

	1736

	1737 if(mbcsTable->swapLFNLStateTable!=NULL) {

	1738 uprv_free(mbcsTable->swapLFNLStateTable);

	1739 }

	1740 if(mbcsTable->stateTableOwned) {

	1741 uprv_free((void *)mbcsTable->stateTable);

	1742 }

	1743 if(mbcsTable->baseSharedData!=NULL) {

	1744 ucnv_unload(mbcsTable->baseSharedData);

	1745 }

	1746 if(mbcsTable->reconstitutedData!=NULL) {

	1747 uprv_free(mbcsTable->reconstitutedData);

	1748 }

	1749 }

	1750

	1751 static void

	1752 ucnv_MBCSOpen(UConverter *cnv,

	1753 UConverterLoadArgs *pArgs,

	1754 UErrorCode *pErrorCode) {

	1755 UConverterMBCSTable *mbcsTable;

	1756 const int32_t *extIndexes;

	1757 uint8_t outputType;

	1758 int8_t maxBytesPerUChar;

	1759

	1760 if(pArgs->onlyTestIsLoadable) {

	1761 return;

	1762 }

	1763

	1764 mbcsTable=&cnv->sharedData->mbcs;

	1765 outputType=mbcsTable->outputType;

	1766

	1767 if(outputType==MBCS_OUTPUT_DBCS_ONLY) {

	1768 /* the swaplfnl option does not apply, remove it */

	1769 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;

	1770 }

	1771

	1772 if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) {

	1773 /* do this because double-checked locking is broken */

	1774 UBool isCached;

	1775

	1776 umtx_lock(NULL);

	1777 isCached=mbcsTable->swapLFNLStateTable!=NULL;

	1778 umtx_unlock(NULL);

	1779

	1780 if(!isCached) {

	1781 if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) {

	1782 if(U_FAILURE(*pErrorCode)) {

	1783 return; /* something went wrong */

	1784 }

	1785

	1786 /* the option does not apply, remove it */

	1787 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL;

	1788 }

	1789 }

	1790 }

	1791

	1792 if(uprv_strstr(pArgs->name, "18030")!=NULL) {

	1793 if(uprv_strstr(pArgs->name, "gb18030")!=NULL \|\| uprv_strstr(pArgs->name, "GB18030")!=NULL) {

	1794 /* set a flag for GB 18030 mode, which changes the callback behavior */

	1795 cnv->options\|=_MBCS_OPTION_GB18030;

	1796 }

	1797 } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) \|\| (uprv_strstr(pArgs->na me, "keis")!=NULL)) {

	1798 /* set a flag for KEIS converter, which changes the SI/SO character sequ ence */

	1799 cnv->options\|=_MBCS_OPTION_KEIS;

	1800 } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) \|\| (uprv_strstr(pArgs->nam e, "jef")!=NULL)) {

	1801 /* set a flag for JEF converter, which changes the SI/SO character seque nce */

	1802 cnv->options\|=_MBCS_OPTION_JEF;

	1803 } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) \|\| (uprv_strstr(pArgs->na me, "jips")!=NULL)) {

	1804 /* set a flag for JIPS converter, which changes the SI/SO character sequ ence */

	1805 cnv->options\|=_MBCS_OPTION_JIPS;

	1806 }

	1807

	1808 /* fix maxBytesPerUChar depending on outputType and options etc. */

	1809 if(outputType==MBCS_OUTPUT_2_SISO) {

	1810 cnv->maxBytesPerUChar=3; /* SO+DBCS */

	1811 }

	1812

	1813 extIndexes=mbcsTable->extIndexes;

	1814 if(extIndexes!=NULL) {

	1815 maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes);

	1816 if(outputType==MBCS_OUTPUT_2_SISO) {

	1817 ++maxBytesPerUChar; /* SO + multiple DBCS */

	1818 }

	1819

	1820 if(maxBytesPerUChar>cnv->maxBytesPerUChar) {

	1821 cnv->maxBytesPerUChar=maxBytesPerUChar;

	1822 }

	1823 }

	1824

	1825 #if 0

	1826 /*

	1827 * documentation of UConverter fields used for status

	1828 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset()

	1829 */

	1830

	1831 /* toUnicode */

	1832 cnv->toUnicodeStatus=0; /* offset */

	1833 cnv->mode=0; /* state */

	1834 cnv->toULength=0; /* byteIndex */

	1835

	1836 /* fromUnicode */

	1837 cnv->fromUChar32=0;

	1838 cnv->fromUnicodeStatus=1; /* prevLength */

	1839 #endif

	1840 }

	1841

	1842 static const char *

	1843 ucnv_MBCSGetName(const UConverter *cnv) {

	1844 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNL Name!=NULL) {

	1845 return cnv->sharedData->mbcs.swapLFNLName;

	1846 } else {

	1847 return cnv->sharedData->staticData->name;

	1848 }

	1849 }

	1850

	1851 /* MBCS-to-Unicode conversion functions ------------------------------------- */

	1852

	1853 static UChar32

	1854 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) {

	1855 const _MBCSToUFallback *toUFallbacks;

	1856 uint32_t i, start, limit;

	1857

	1858 limit=mbcsTable->countToUFallbacks;

	1859 if(limit>0) {

	1860 /* do a binary search for the fallback mapping */

	1861 toUFallbacks=mbcsTable->toUFallbacks;

	1862 start=0;

	1863 while(start<limit-1) {

	1864 i=(start+limit)/2;

	1865 if(offset<toUFallbacks[i].offset) {

	1866 limit=i;

	1867 } else {

	1868 start=i;

	1869 }

	1870 }

	1871

	1872 /* did we really find it? */

	1873 if(offset==toUFallbacks[start].offset) {

	1874 return toUFallbacks[start].codePoint;

	1875 }

	1876 }

	1877

	1878 return 0xfffe;

	1879 }

	1880

	1881 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */

	1882 static void

	1883 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

	1884 UErrorCode *pErrorCode) {

	1885 UConverter *cnv;

	1886 const uint8_t source, sourceLimit;

	1887 UChar *target;

	1888 const UChar *targetLimit;

	1889 int32_t *offsets;

	1890

	1891 const int32_t (*stateTable)[256];

	1892

	1893 int32_t sourceIndex;

	1894

	1895 int32_t entry;

	1896 UChar c;

	1897 uint8_t action;

	1898

	1899 /* set up the local pointers */

	1900 cnv=pArgs->converter;

	1901 source=(const uint8_t *)pArgs->source;

	1902 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

	1903 target=pArgs->target;

	1904 targetLimit=pArgs->targetLimit;

	1905 offsets=pArgs->offsets;

	1906

	1907 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

	1908 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTa ble;

	1909 } else {

	1910 stateTable=cnv->sharedData->mbcs.stateTable;

	1911 }

	1912

	1913 /* sourceIndex=-1 if the current character began in the previous buffer */

	1914 sourceIndex=0;

	1915

	1916 /* conversion loop */

	1917 while(source<sourceLimit) {

	1918 /*

	1919 * This following test is to see if available input would overflow the o utput.

	1920 * It does not catch output of more than one code unit that

	1921 * overflows as a result of a surrogate pair or callback output

	1922 * from the last source byte.

	1923 * Therefore, those situations also test for overflows and will

	1924 * then break the loop, too.

	1925 */

	1926 if(target>=targetLimit) {

	1927 /* target is full */

	1928 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	1929 break;

	1930 }

	1931

	1932 entry=stateTable[0][*source++];

	1933 /* MBCS_ENTRY_IS_FINAL(entry) */

	1934

	1935 /* test the most common case first */

	1936 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {

	1937 /* output BMP code point */

	1938 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	1939 if(offsets!=NULL) {

	1940 *offsets++=sourceIndex;

	1941 }

	1942

	1943 /* normal end of action codes: prepare for a new character */

	1944 ++sourceIndex;

	1945 continue;

	1946 }

	1947

	1948 /*

	1949 * An if-else-if chain provides more reliable performance for

	1950 * the most common cases compared to a switch.

	1951 */

	1952 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

	1953 if(action==MBCS_STATE_VALID_DIRECT_20 \|\|

	1954 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv) )

	1955 ) {

	1956 entry=MBCS_ENTRY_FINAL_VALUE(entry);

	1957 /* output surrogate pair */

	1958 *target++=(UChar)(0xd800\|(UChar)(entry>>10));

	1959 if(offsets!=NULL) {

	1960 *offsets++=sourceIndex;

	1961 }

	1962 c=(UChar)(0xdc00\|(UChar)(entry&0x3ff));

	1963 if(target<targetLimit) {

	1964 *target++=c;

	1965 if(offsets!=NULL) {

	1966 *offsets++=sourceIndex;

	1967 }

	1968 } else {

	1969 /* target overflow */

	1970 cnv->UCharErrorBuffer[0]=c;

	1971 cnv->UCharErrorBufferLength=1;

	1972 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	1973 break;

	1974 }

	1975

	1976 ++sourceIndex;

	1977 continue;

	1978 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {

	1979 if(UCNV_TO_U_USE_FALLBACK(cnv)) {

	1980 /* output BMP code point */

	1981 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	1982 if(offsets!=NULL) {

	1983 *offsets++=sourceIndex;

	1984 }

	1985

	1986 ++sourceIndex;

	1987 continue;

	1988 }

	1989 } else if(action==MBCS_STATE_UNASSIGNED) {

	1990 /* just fall through */

	1991 } else if(action==MBCS_STATE_ILLEGAL) {

	1992 /* callback(illegal) */

	1993 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	1994 } else {

	1995 /* reserved, must never occur */

	1996 ++sourceIndex;

	1997 continue;

	1998 }

	1999

	2000 if(U_FAILURE(*pErrorCode)) {

	2001 /* callback(illegal) */

	2002 break;

	2003 } else /* unassigned sequences indicated with byteIndex>0 */ {

	2004 /* try an extension mapping */

	2005 pArgs->source=(const char *)source;

	2006 cnv->toUBytes[0]=*(source-1);

	2007 cnv->toULength=_extToU(cnv, cnv->sharedData,

	2008 1, &source, sourceLimit,

	2009 &target, targetLimit,

	2010 &offsets, sourceIndex,

	2011 pArgs->flush,

	2012 pErrorCode);

	2013 sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);

	2014

	2015 if(U_FAILURE(*pErrorCode)) {

	2016 /* not mappable or buffer overflow */

	2017 break;

	2018 }

	2019 }

	2020 }

	2021

	2022 /* write back the updated pointers */

	2023 pArgs->source=(const char *)source;

	2024 pArgs->target=target;

	2025 pArgs->offsets=offsets;

	2026 }

	2027

	2028 /*

	2029 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single -byte, single-state codepages

	2030 * that only map to and from the BMP.

	2031 * In addition to single-byte optimizations, the offset calculations

	2032 * become much easier.

	2033 */

	2034 static void

	2035 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,

	2036 UErrorCode *pErrorCode) {

	2037 UConverter *cnv;

	2038 const uint8_t source, sourceLimit, *lastSource;

	2039 UChar *target;

	2040 int32_t targetCapacity, length;

	2041 int32_t *offsets;

	2042

	2043 const int32_t (*stateTable)[256];

	2044

	2045 int32_t sourceIndex;

	2046

	2047 int32_t entry;

	2048 uint8_t action;

	2049

	2050 /* set up the local pointers */

	2051 cnv=pArgs->converter;

	2052 source=(const uint8_t *)pArgs->source;

	2053 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

	2054 target=pArgs->target;

	2055 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

	2056 offsets=pArgs->offsets;

	2057

	2058 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

	2059 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTa ble;

	2060 } else {

	2061 stateTable=cnv->sharedData->mbcs.stateTable;

	2062 }

	2063

	2064 /* sourceIndex=-1 if the current character began in the previous buffer */

	2065 sourceIndex=0;

	2066 lastSource=source;

	2067

	2068 /*

	2069 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter

	2070 * for the minimum of the sourceLength and targetCapacity

	2071 */

	2072 length=(int32_t)(sourceLimit-source);

	2073 if(length<targetCapacity) {

	2074 targetCapacity=length;

	2075 }

	2076

	2077 #if MBCS_UNROLL_SINGLE_TO_BMP

	2078 /* unrolling makes it faster on Pentium III/Windows 2000 */

	2079 /* unroll the loop with the most common case */

	2080 unrolled:

	2081 if(targetCapacity>=16) {

	2082 int32_t count, loops, oredEntries;

	2083

	2084 loops=count=targetCapacity>>4;

	2085 do {

	2086 oredEntries=entry=stateTable[0][*source++];

	2087 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2088 oredEntries\|=entry=stateTable[0][*source++];

	2089 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2090 oredEntries\|=entry=stateTable[0][*source++];

	2091 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2092 oredEntries\|=entry=stateTable[0][*source++];

	2093 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2094 oredEntries\|=entry=stateTable[0][*source++];

	2095 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2096 oredEntries\|=entry=stateTable[0][*source++];

	2097 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2098 oredEntries\|=entry=stateTable[0][*source++];

	2099 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2100 oredEntries\|=entry=stateTable[0][*source++];

	2101 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2102 oredEntries\|=entry=stateTable[0][*source++];

	2103 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2104 oredEntries\|=entry=stateTable[0][*source++];

	2105 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2106 oredEntries\|=entry=stateTable[0][*source++];

	2107 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2108 oredEntries\|=entry=stateTable[0][*source++];

	2109 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2110 oredEntries\|=entry=stateTable[0][*source++];

	2111 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2112 oredEntries\|=entry=stateTable[0][*source++];

	2113 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2114 oredEntries\|=entry=stateTable[0][*source++];

	2115 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2116 oredEntries\|=entry=stateTable[0][*source++];

	2117 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2118

	2119 /* were all 16 entries really valid? */

	2120 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) {

	2121 /* no, return to the first of these 16 */

	2122 source-=16;

	2123 target-=16;

	2124 break;

	2125 }

	2126 } while(--count>0);

	2127 count=loops-count;

	2128 targetCapacity-=16*count;

	2129

	2130 if(offsets!=NULL) {

	2131 lastSource+=16*count;

	2132 while(count>0) {

	2133 *offsets++=sourceIndex++;

	2134 *offsets++=sourceIndex++;

	2135 *offsets++=sourceIndex++;

	2136 *offsets++=sourceIndex++;

	2137 *offsets++=sourceIndex++;

	2138 *offsets++=sourceIndex++;

	2139 *offsets++=sourceIndex++;

	2140 *offsets++=sourceIndex++;

	2141 *offsets++=sourceIndex++;

	2142 *offsets++=sourceIndex++;

	2143 *offsets++=sourceIndex++;

	2144 *offsets++=sourceIndex++;

	2145 *offsets++=sourceIndex++;

	2146 *offsets++=sourceIndex++;

	2147 *offsets++=sourceIndex++;

	2148 *offsets++=sourceIndex++;

	2149 --count;

	2150 }

	2151 }

	2152 }

	2153 #endif

	2154

	2155 /* conversion loop */

	2156 while(targetCapacity > 0 && source < sourceLimit) {

	2157 entry=stateTable[0][*source++];

	2158 /* MBCS_ENTRY_IS_FINAL(entry) */

	2159

	2160 /* test the most common case first */

	2161 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {

	2162 /* output BMP code point */

	2163 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2164 --targetCapacity;

	2165 continue;

	2166 }

	2167

	2168 /*

	2169 * An if-else-if chain provides more reliable performance for

	2170 * the most common cases compared to a switch.

	2171 */

	2172 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

	2173 if(action==MBCS_STATE_FALLBACK_DIRECT_16) {

	2174 if(UCNV_TO_U_USE_FALLBACK(cnv)) {

	2175 /* output BMP code point */

	2176 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2177 --targetCapacity;

	2178 continue;

	2179 }

	2180 } else if(action==MBCS_STATE_UNASSIGNED) {

	2181 /* just fall through */

	2182 } else if(action==MBCS_STATE_ILLEGAL) {

	2183 /* callback(illegal) */

	2184 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	2185 } else {

	2186 /* reserved, must never occur */

	2187 continue;

	2188 }

	2189

	2190 /* set offsets since the start or the last extension */

	2191 if(offsets!=NULL) {

	2192 int32_t count=(int32_t)(source-lastSource);

	2193

	2194 /* predecrement: do not set the offset for the callback-causing char acter */

	2195 while(--count>0) {

	2196 *offsets++=sourceIndex++;

	2197 }

	2198 /* offset and sourceIndex are now set for the current character */

	2199 }

	2200

	2201 if(U_FAILURE(*pErrorCode)) {

	2202 /* callback(illegal) */

	2203 break;

	2204 } else /* unassigned sequences indicated with byteIndex>0 */ {

	2205 /* try an extension mapping */

	2206 lastSource=source;

	2207 cnv->toUBytes[0]=*(source-1);

	2208 cnv->toULength=_extToU(cnv, cnv->sharedData,

	2209 1, &source, sourceLimit,

	2210 &target, pArgs->targetLimit,

	2211 &offsets, sourceIndex,

	2212 pArgs->flush,

	2213 pErrorCode);

	2214 sourceIndex+=1+(int32_t)(source-lastSource);

	2215

	2216 if(U_FAILURE(*pErrorCode)) {

	2217 /* not mappable or buffer overflow */

	2218 break;

	2219 }

	2220

	2221 /* recalculate the targetCapacity after an extension mapping */

	2222 targetCapacity=(int32_t)(pArgs->targetLimit-target);

	2223 length=(int32_t)(sourceLimit-source);

	2224 if(length<targetCapacity) {

	2225 targetCapacity=length;

	2226 }

	2227 }

	2228

	2229 #if MBCS_UNROLL_SINGLE_TO_BMP

	2230 /* unrolling makes it faster on Pentium III/Windows 2000 */

	2231 goto unrolled;

	2232 #endif

	2233 }

	2234

	2235 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimi t) {

	2236 /* target is full */

	2237 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	2238 }

	2239

	2240 /* set offsets since the start or the last callback */

	2241 if(offsets!=NULL) {

	2242 size_t count=source-lastSource;

	2243 while(count>0) {

	2244 *offsets++=sourceIndex++;

	2245 --count;

	2246 }

	2247 }

	2248

	2249 /* write back the updated pointers */

	2250 pArgs->source=(const char *)source;

	2251 pArgs->target=target;

	2252 pArgs->offsets=offsets;

	2253 }

	2254

	2255 static UBool

	2256 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {

	2257 const int32_t *row=stateTable[state];

	2258 int32_t b, entry;

	2259 /* First test for final entries in this state for some commonly valid byte v alues. */

	2260 entry=row[0xa1];

	2261 if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

	2262 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

	2263 ) {

	2264 return TRUE;

	2265 }

	2266 entry=row[0x41];

	2267 if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

	2268 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

	2269 ) {

	2270 return TRUE;

	2271 }

	2272 /* Then test for final entries in this state. */

	2273 for(b=0; b<=0xff; ++b) {

	2274 entry=row[b];

	2275 if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

	2276 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

	2277 ) {

	2278 return TRUE;

	2279 }

	2280 }

	2281 /* Then recurse for transition entries. */

	2282 for(b=0; b<=0xff; ++b) {

	2283 entry=row[b];

	2284 if( MBCS_ENTRY_IS_TRANSITION(entry) &&

	2285 hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE( entry))

	2286 ) {

	2287 return TRUE;

	2288 }

	2289 }

	2290 return FALSE;

	2291 }

	2292

	2293 /*

	2294 * Is byte b a single/lead byte in this state?

	2295 * Recurse for transition states, because here we don't want to say that

	2296 * b is a lead byte if all byte sequences that start with b are illegal.

	2297 */

	2298 static UBool

	2299 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly , uint8_t b) {

	2300 const int32_t *row=stateTable[state];

	2301 int32_t entry=row[b];

	2302 if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */

	2303 return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STA TE(entry));

	2304 } else {

	2305 uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

	2306 if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {

	2307 return FALSE; /* SI/SO are illegal for DBCS-only conversion */

	2308 } else {

	2309 return action!=MBCS_STATE_ILLEGAL;

	2310 }

	2311 }

	2312 }

	2313

	2314 U_CFUNC void

	2315 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

	2316 UErrorCode *pErrorCode) {

	2317 UConverter *cnv;

	2318 const uint8_t source, sourceLimit;

	2319 UChar *target;

	2320 const UChar *targetLimit;

	2321 int32_t *offsets;

	2322

	2323 const int32_t (*stateTable)[256];

	2324 const uint16_t *unicodeCodeUnits;

	2325

	2326 uint32_t offset;

	2327 uint8_t state;

	2328 int8_t byteIndex;

	2329 uint8_t *bytes;

	2330

	2331 int32_t sourceIndex, nextSourceIndex;

	2332

	2333 int32_t entry;

	2334 UChar c;

	2335 uint8_t action;

	2336

	2337 /* use optimized function if possible */

	2338 cnv=pArgs->converter;

	2339

	2340 if(cnv->preToULength>0) {

	2341 /*

	2342 * pass sourceIndex=-1 because we continue from an earlier buffer

	2343 * in the future, this may change with continuous offsets

	2344 */

	2345 ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);

	2346

	2347 if(U_FAILURE(*pErrorCode) \|\| cnv->preToULength<0) {

	2348 return;

	2349 }

	2350 }

	2351

	2352 if(cnv->sharedData->mbcs.countStates==1) {

	2353 if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {

	2354 ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);

	2355 } else {

	2356 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);

	2357 }

	2358 return;

	2359 }

	2360

	2361 /* set up the local pointers */

	2362 source=(const uint8_t *)pArgs->source;

	2363 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

	2364 target=pArgs->target;

	2365 targetLimit=pArgs->targetLimit;

	2366 offsets=pArgs->offsets;

	2367

	2368 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

	2369 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTa ble;

	2370 } else {

	2371 stateTable=cnv->sharedData->mbcs.stateTable;

	2372 }

	2373 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;

	2374

	2375 /* get the converter state from UConverter */

	2376 offset=cnv->toUnicodeStatus;

	2377 byteIndex=cnv->toULength;

	2378 bytes=cnv->toUBytes;

	2379

	2380 /*

	2381 * if we are in the SBCS state for a DBCS-only converter,

	2382 * then load the DBCS state from the MBCS data

	2383 * (dbcsOnlyState==0 if it is not a DBCS-only converter)

	2384 */

	2385 if((state=(uint8_t)(cnv->mode))==0) {

	2386 state=cnv->sharedData->mbcs.dbcsOnlyState;

	2387 }

	2388

	2389 /* sourceIndex=-1 if the current character began in the previous buffer */

	2390 sourceIndex=byteIndex==0 ? 0 : -1;

	2391 nextSourceIndex=0;

	2392

	2393 /* conversion loop */

	2394 while(source<sourceLimit) {

	2395 /*

	2396 * This following test is to see if available input would overflow the o utput.

	2397 * It does not catch output of more than one code unit that

	2398 * overflows as a result of a surrogate pair or callback output

	2399 * from the last source byte.

	2400 * Therefore, those situations also test for overflows and will

	2401 * then break the loop, too.

	2402 */

	2403 if(target>=targetLimit) {

	2404 /* target is full */

	2405 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	2406 break;

	2407 }

	2408

	2409 if(byteIndex==0) {

	2410 /* optimized loop for 1/2-byte input and BMP output */

	2411 if(offsets==NULL) {

	2412 do {

	2413 entry=stateTable[state][*source];

	2414 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

	2415 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);

	2416 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);

	2417

	2418 ++source;

	2419 if( source<sourceLimit &&

	2420 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source] ) &&

	2421 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&

	2422 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16 (entry)])<0xfffe

	2423 ) {

	2424 ++source;

	2425 *target++=c;

	2426 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typ ically 0 */

	2427 offset=0;

	2428 } else {

	2429 /* set the state and leave the optimized loop */

	2430 bytes[0]=*(source-1);

	2431 byteIndex=1;

	2432 break;

	2433 }

	2434 } else {

	2435 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {

	2436 /* output BMP code point */

	2437 ++source;

	2438 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2439 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typ ically 0 */

	2440 } else {

	2441 /* leave the optimized loop */

	2442 break;

	2443 }

	2444 }

	2445 } while(source<sourceLimit && target<targetLimit);

	2446 } else /* offsets!=NULL */ {

	2447 do {

	2448 entry=stateTable[state][*source];

	2449 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

	2450 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);

	2451 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);

	2452

	2453 ++source;

	2454 if( source<sourceLimit &&

	2455 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source] ) &&

	2456 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&

	2457 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16 (entry)])<0xfffe

	2458 ) {

	2459 ++source;

	2460 *target++=c;

	2461 if(offsets!=NULL) {

	2462 *offsets++=sourceIndex;

	2463 sourceIndex=(nextSourceIndex+=2);

	2464 }

	2465 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typ ically 0 */

	2466 offset=0;

	2467 } else {

	2468 /* set the state and leave the optimized loop */

	2469 ++nextSourceIndex;

	2470 bytes[0]=*(source-1);

	2471 byteIndex=1;

	2472 break;

	2473 }

	2474 } else {

	2475 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {

	2476 /* output BMP code point */

	2477 ++source;

	2478 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2479 if(offsets!=NULL) {

	2480 *offsets++=sourceIndex;

	2481 sourceIndex=++nextSourceIndex;

	2482 }

	2483 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typ ically 0 */

	2484 } else {

	2485 /* leave the optimized loop */

	2486 break;

	2487 }

	2488 }

	2489 } while(source<sourceLimit && target<targetLimit);

	2490 }

	2491

	2492 /*

	2493 * these tests and break statements could be put inside the loop

	2494 * if C had "break outerLoop" like Java

	2495 */

	2496 if(source>=sourceLimit) {

	2497 break;

	2498 }

	2499 if(target>=targetLimit) {

	2500 /* target is full */

	2501 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	2502 break;

	2503 }

	2504

	2505 ++nextSourceIndex;

	2506 bytes[byteIndex++]=*source++;

	2507 } else /* byteIndex>0 */ {

	2508 ++nextSourceIndex;

	2509 entry=stateTable[state][bytes[byteIndex++]=*source++];

	2510 }

	2511

	2512 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

	2513 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);

	2514 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);

	2515 continue;

	2516 }

	2517

	2518 /* save the previous state for proper extension mapping with SI/SO-state ful converters */

	2519 cnv->mode=state;

	2520

	2521 /* set the next state early so that we can reuse the entry variable */

	2522 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */

	2523

	2524 /*

	2525 * An if-else-if chain provides more reliable performance for

	2526 * the most common cases compared to a switch.

	2527 */

	2528 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

	2529 if(action==MBCS_STATE_VALID_16) {

	2530 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);

	2531 c=unicodeCodeUnits[offset];

	2532 if(c<0xfffe) {

	2533 /* output BMP code point */

	2534 *target++=c;

	2535 if(offsets!=NULL) {

	2536 *offsets++=sourceIndex;

	2537 }

	2538 byteIndex=0;

	2539 } else if(c==0xfffe) {

	2540 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFa llback(&cnv->sharedData->mbcs, offset))!=0xfffe) {

	2541 /* output fallback BMP code point */

	2542 *target++=(UChar)entry;

	2543 if(offsets!=NULL) {

	2544 *offsets++=sourceIndex;

	2545 }

	2546 byteIndex=0;

	2547 }

	2548 } else {

	2549 /* callback(illegal) */

	2550 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	2551 }

	2552 } else if(action==MBCS_STATE_VALID_DIRECT_16) {

	2553 /* output BMP code point */

	2554 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2555 if(offsets!=NULL) {

	2556 *offsets++=sourceIndex;

	2557 }

	2558 byteIndex=0;

	2559 } else if(action==MBCS_STATE_VALID_16_PAIR) {

	2560 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);

	2561 c=unicodeCodeUnits[offset++];

	2562 if(c<0xd800) {

	2563 /* output BMP code point below 0xd800 */

	2564 *target++=c;

	2565 if(offsets!=NULL) {

	2566 *offsets++=sourceIndex;

	2567 }

	2568 byteIndex=0;

	2569 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {

	2570 /* output roundtrip or fallback surrogate pair */

	2571 *target++=(UChar)(c&0xdbff);

	2572 if(offsets!=NULL) {

	2573 *offsets++=sourceIndex;

	2574 }

	2575 byteIndex=0;

	2576 if(target<targetLimit) {

	2577 *target++=unicodeCodeUnits[offset];

	2578 if(offsets!=NULL) {

	2579 *offsets++=sourceIndex;

	2580 }

	2581 } else {

	2582 /* target overflow */

	2583 cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset];

	2584 cnv->UCharErrorBufferLength=1;

	2585 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	2586

	2587 offset=0;

	2588 break;

	2589 }

	2590 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe0 00) {

	2591 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */

	2592 *target++=unicodeCodeUnits[offset];

	2593 if(offsets!=NULL) {

	2594 *offsets++=sourceIndex;

	2595 }

	2596 byteIndex=0;

	2597 } else if(c==0xffff) {

	2598 /* callback(illegal) */

	2599 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	2600 }

	2601 } else if(action==MBCS_STATE_VALID_DIRECT_20 \|\|

	2602 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBA CK(cnv))

	2603 ) {

	2604 entry=MBCS_ENTRY_FINAL_VALUE(entry);

	2605 /* output surrogate pair */

	2606 *target++=(UChar)(0xd800\|(UChar)(entry>>10));

	2607 if(offsets!=NULL) {

	2608 *offsets++=sourceIndex;

	2609 }

	2610 byteIndex=0;

	2611 c=(UChar)(0xdc00\|(UChar)(entry&0x3ff));

	2612 if(target<targetLimit) {

	2613 *target++=c;

	2614 if(offsets!=NULL) {

	2615 *offsets++=sourceIndex;

	2616 }

	2617 } else {

	2618 /* target overflow */

	2619 cnv->UCharErrorBuffer[0]=c;

	2620 cnv->UCharErrorBufferLength=1;

	2621 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	2622

	2623 offset=0;

	2624 break;

	2625 }

	2626 } else if(action==MBCS_STATE_CHANGE_ONLY) {

	2627 /*

	2628 * This serves as a state change without any output.

	2629 * It is useful for reading simple stateful encodings,

	2630 * for example using just Shift-In/Shift-Out codes.

	2631 * The 21 unused bits may later be used for more sophisticated

	2632 * state transitions.

	2633 */

	2634 if(cnv->sharedData->mbcs.dbcsOnlyState==0) {

	2635 byteIndex=0;

	2636 } else {

	2637 /* SI/SO are illegal for DBCS-only conversion */

	2638 state=(uint8_t)(cnv->mode); /* restore the previous state */

	2639

	2640 /* callback(illegal) */

	2641 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	2642 }

	2643 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {

	2644 if(UCNV_TO_U_USE_FALLBACK(cnv)) {

	2645 /* output BMP code point */

	2646 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2647 if(offsets!=NULL) {

	2648 *offsets++=sourceIndex;

	2649 }

	2650 byteIndex=0;

	2651 }

	2652 } else if(action==MBCS_STATE_UNASSIGNED) {

	2653 /* just fall through */

	2654 } else if(action==MBCS_STATE_ILLEGAL) {

	2655 /* callback(illegal) */

	2656 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	2657 } else {

	2658 /* reserved, must never occur */

	2659 byteIndex=0;

	2660 }

	2661

	2662 /* end of action codes: prepare for a new character */

	2663 offset=0;

	2664

	2665 if(byteIndex==0) {

	2666 sourceIndex=nextSourceIndex;

	2667 } else if(U_FAILURE(*pErrorCode)) {

	2668 /* callback(illegal) */

	2669 if(byteIndex>1) {

	2670 /*

	2671 * Ticket 5691: consistent illegal sequences:

	2672 * - We include at least the first byte in the illegal sequence.

	2673 * - If any of the non-initial bytes could be the start of a cha racter,

	2674 * we stop the illegal sequence before the first one of those.

	2675 */

	2676 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0) ;

	2677 int8_t i;

	2678 for(i=1;

	2679 i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly , bytes[i]);

	2680 ++i) {}

	2681 if(i<byteIndex) {

	2682 /* Back out some bytes. */

	2683 int8_t backOutDistance=byteIndex-i;

	2684 int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);

	2685 byteIndex=i; /* length of reported illegal byte sequence */

	2686 if(backOutDistance<=bytesFromThisBuffer) {

	2687 source-=backOutDistance;

	2688 } else {

	2689 /* Back out bytes from the previous buffer: Need to repl ay them. */

	2690 cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDi stance);

	2691 /* preToULength is negative! */

	2692 uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);

	2693 source=(const uint8_t *)pArgs->source;

	2694 }

	2695 }

	2696 }

	2697 break;

	2698 } else /* unassigned sequences indicated with byteIndex>0 */ {

	2699 /* try an extension mapping */

	2700 pArgs->source=(const char *)source;

	2701 byteIndex=_extToU(cnv, cnv->sharedData,

	2702 byteIndex, &source, sourceLimit,

	2703 &target, targetLimit,

	2704 &offsets, sourceIndex,

	2705 pArgs->flush,

	2706 pErrorCode);

	2707 sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs ->source);

	2708

	2709 if(U_FAILURE(*pErrorCode)) {

	2710 /* not mappable or buffer overflow */

	2711 break;

	2712 }

	2713 }

	2714 }

	2715

	2716 /* set the converter state back into UConverter */

	2717 cnv->toUnicodeStatus=offset;

	2718 cnv->mode=state;

	2719 cnv->toULength=byteIndex;

	2720

	2721 /* write back the updated pointers */

	2722 pArgs->source=(const char *)source;

	2723 pArgs->target=target;

	2724 pArgs->offsets=offsets;

	2725 }

	2726

	2727 /*

	2728 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single- state codepages.

	2729 * We still need a conversion loop in case we find reserved action codes, which are to be ignored.

	2730 */

	2731 static UChar32

	2732 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,

	2733 UErrorCode *pErrorCode) {

	2734 UConverter *cnv;

	2735 const int32_t (*stateTable)[256];

	2736 const uint8_t source, sourceLimit;

	2737

	2738 int32_t entry;

	2739 uint8_t action;

	2740

	2741 /* set up the local pointers */

	2742 cnv=pArgs->converter;

	2743 source=(const uint8_t *)pArgs->source;

	2744 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

	2745 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

	2746 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTa ble;

	2747 } else {

	2748 stateTable=cnv->sharedData->mbcs.stateTable;

	2749 }

	2750

	2751 /* conversion loop */

	2752 while(source<sourceLimit) {

	2753 entry=stateTable[0][*source++];

	2754 /* MBCS_ENTRY_IS_FINAL(entry) */

	2755

	2756 /* write back the updated pointer early so that we can return directly * /

	2757 pArgs->source=(const char *)source;

	2758

	2759 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {

	2760 /* output BMP code point */

	2761 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2762 }

	2763

	2764 /*

	2765 * An if-else-if chain provides more reliable performance for

	2766 * the most common cases compared to a switch.

	2767 */

	2768 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

	2769 if( action==MBCS_STATE_VALID_DIRECT_20 \|\|

	2770 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv ))

	2771 ) {

	2772 /* output supplementary code point */

	2773 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);

	2774 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {

	2775 if(UCNV_TO_U_USE_FALLBACK(cnv)) {

	2776 /* output BMP code point */

	2777 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2778 }

	2779 } else if(action==MBCS_STATE_UNASSIGNED) {

	2780 /* just fall through */

	2781 } else if(action==MBCS_STATE_ILLEGAL) {

	2782 /* callback(illegal) */

	2783 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	2784 } else {

	2785 /* reserved, must never occur */

	2786 continue;

	2787 }

	2788

	2789 if(U_FAILURE(*pErrorCode)) {

	2790 /* callback(illegal) */

	2791 break;

	2792 } else /* unassigned sequence */ {

	2793 /* defer to the generic implementation */

	2794 pArgs->source=(const char *)source-1;

	2795 return UCNV_GET_NEXT_UCHAR_USE_TO_U;

	2796 }

	2797 }

	2798

	2799 /* no output because of empty input or only state changes */

	2800 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

	2801 return 0xffff;

	2802 }

	2803

	2804 /*

	2805 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character

	2806 * conversion without offset handling.

	2807 *

	2808 * When a character does not have a mapping to Unicode, then we return to the

	2809 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback

	2810 * handling.

	2811 * We also defer to the generic code in other complicated cases and have them

	2812 * ultimately handled by _MBCSToUnicodeWithOffsets() itself.

	2813 *

	2814 * All normal mappings and errors are handled here.

	2815 */

	2816 static UChar32

	2817 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,

	2818 UErrorCode *pErrorCode) {

	2819 UConverter *cnv;

	2820 const uint8_t source, sourceLimit, *lastSource;

	2821

	2822 const int32_t (*stateTable)[256];

	2823 const uint16_t *unicodeCodeUnits;

	2824

	2825 uint32_t offset;

	2826 uint8_t state;

	2827

	2828 int32_t entry;

	2829 UChar32 c;

	2830 uint8_t action;

	2831

	2832 /* use optimized function if possible */

	2833 cnv=pArgs->converter;

	2834

	2835 if(cnv->preToULength>0) {

	2836 /* use the generic code in ucnv_getNextUChar() to continue with a partia l match */

	2837 return UCNV_GET_NEXT_UCHAR_USE_TO_U;

	2838 }

	2839

	2840 if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {

	2841 /*

	2842 * Using the generic ucnv_getNextUChar() code lets us deal correctly

	2843 * with the rare case of a codepage that maps single surrogates

	2844 * without adding the complexity to this already complicated function he re.

	2845 */

	2846 return UCNV_GET_NEXT_UCHAR_USE_TO_U;

	2847 } else if(cnv->sharedData->mbcs.countStates==1) {

	2848 return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode);

	2849 }

	2850

	2851 /* set up the local pointers */

	2852 source=lastSource=(const uint8_t *)pArgs->source;

	2853 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

	2854

	2855 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

	2856 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTa ble;

	2857 } else {

	2858 stateTable=cnv->sharedData->mbcs.stateTable;

	2859 }

	2860 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits;

	2861

	2862 /* get the converter state from UConverter */

	2863 offset=cnv->toUnicodeStatus;

	2864

	2865 /*

	2866 * if we are in the SBCS state for a DBCS-only converter,

	2867 * then load the DBCS state from the MBCS data

	2868 * (dbcsOnlyState==0 if it is not a DBCS-only converter)

	2869 */

	2870 if((state=(uint8_t)(cnv->mode))==0) {

	2871 state=cnv->sharedData->mbcs.dbcsOnlyState;

	2872 }

	2873

	2874 /* conversion loop */

	2875 c=U_SENTINEL;

	2876 while(source<sourceLimit) {

	2877 entry=stateTable[state][*source++];

	2878 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

	2879 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);

	2880 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);

	2881

	2882 /* optimization for 1/2-byte input and BMP output */

	2883 if( source<sourceLimit &&

	2884 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&

	2885 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&

	2886 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0x fffe

	2887 ) {

	2888 ++source;

	2889 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */

	2890 /* output BMP code point */

	2891 break;

	2892 }

	2893 } else {

	2894 /* save the previous state for proper extension mapping with SI/SO-s tateful converters */

	2895 cnv->mode=state;

	2896

	2897 /* set the next state early so that we can reuse the entry variable */

	2898 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */

	2899

	2900 /*

	2901 * An if-else-if chain provides more reliable performance for

	2902 * the most common cases compared to a switch.

	2903 */

	2904 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

	2905 if(action==MBCS_STATE_VALID_DIRECT_16) {

	2906 /* output BMP code point */

	2907 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2908 break;

	2909 } else if(action==MBCS_STATE_VALID_16) {

	2910 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);

	2911 c=unicodeCodeUnits[offset];

	2912 if(c<0xfffe) {

	2913 /* output BMP code point */

	2914 break;

	2915 } else if(c==0xfffe) {

	2916 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&c nv->sharedData->mbcs, offset))!=0xfffe) {

	2917 break;

	2918 }

	2919 } else {

	2920 /* callback(illegal) */

	2921 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	2922 }

	2923 } else if(action==MBCS_STATE_VALID_16_PAIR) {

	2924 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);

	2925 c=unicodeCodeUnits[offset++];

	2926 if(c<0xd800) {

	2927 /* output BMP code point below 0xd800 */

	2928 break;

	2929 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {

	2930 /* output roundtrip or fallback supplementary code point */

	2931 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);

	2932 break;

	2933 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c== 0xe000) {

	2934 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */

	2935 c=unicodeCodeUnits[offset];

	2936 break;

	2937 } else if(c==0xffff) {

	2938 /* callback(illegal) */

	2939 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	2940 }

	2941 } else if(action==MBCS_STATE_VALID_DIRECT_20 \|\|

	2942 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FA LLBACK(cnv))

	2943 ) {

	2944 /* output supplementary code point */

	2945 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);

	2946 break;

	2947 } else if(action==MBCS_STATE_CHANGE_ONLY) {

	2948 /*

	2949 * This serves as a state change without any output.

	2950 * It is useful for reading simple stateful encodings,

	2951 * for example using just Shift-In/Shift-Out codes.

	2952 * The 21 unused bits may later be used for more sophisticated

	2953 * state transitions.

	2954 */

	2955 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) {

	2956 /* SI/SO are illegal for DBCS-only conversion */

	2957 state=(uint8_t)(cnv->mode); /* restore the previous state */

	2958

	2959 /* callback(illegal) */

	2960 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	2961 }

	2962 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {

	2963 if(UCNV_TO_U_USE_FALLBACK(cnv)) {

	2964 /* output BMP code point */

	2965 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	2966 break;

	2967 }

	2968 } else if(action==MBCS_STATE_UNASSIGNED) {

	2969 /* just fall through */

	2970 } else if(action==MBCS_STATE_ILLEGAL) {

	2971 /* callback(illegal) */

	2972 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	2973 } else {

	2974 /* reserved (must never occur), or only state change */

	2975 offset=0;

	2976 lastSource=source;

	2977 continue;

	2978 }

	2979

	2980 /* end of action codes: prepare for a new character */

	2981 offset=0;

	2982

	2983 if(U_FAILURE(*pErrorCode)) {

	2984 /* callback(illegal) */

	2985 break;

	2986 } else /* unassigned sequence */ {

	2987 /* defer to the generic implementation */

	2988 cnv->toUnicodeStatus=0;

	2989 cnv->mode=state;

	2990 pArgs->source=(const char *)lastSource;

	2991 return UCNV_GET_NEXT_UCHAR_USE_TO_U;

	2992 }

	2993 }

	2994 }

	2995

	2996 if(c<0) {

	2997 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {

	2998 /* incomplete character byte sequence */

	2999 uint8_t *bytes=cnv->toUBytes;

	3000 cnv->toULength=(int8_t)(source-lastSource);

	3001 do {

	3002 bytes++=lastSource++;

	3003 } while(lastSource<source);

	3004 *pErrorCode=U_TRUNCATED_CHAR_FOUND;

	3005 } else if(U_FAILURE(*pErrorCode)) {

	3006 /* callback(illegal) */

	3007 /*

	3008 * Ticket 5691: consistent illegal sequences:

	3009 * - We include at least the first byte in the illegal sequence.

	3010 * - If any of the non-initial bytes could be the start of a charact er,

	3011 * we stop the illegal sequence before the first one of those.

	3012 */

	3013 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);

	3014 uint8_t *bytes=cnv->toUBytes;

	3015 bytes++=lastSource++; /* first byte */

	3016 if(lastSource==source) {

	3017 cnv->toULength=1;

	3018 } else /* lastSource<source: multi-byte character */ {

	3019 int8_t i;

	3020 for(i=1;

	3021 lastSource<source && !isSingleOrLead(stateTable, state, isDB CSOnly, *lastSource);

	3022 ++i

	3023 ) {

	3024 bytes++=lastSource++;

	3025 }

	3026 cnv->toULength=i;

	3027 source=lastSource;

	3028 }

	3029 } else {

	3030 /* no output because of empty input or only state changes */

	3031 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

	3032 }

	3033 c=0xffff;

	3034 }

	3035

	3036 /* set the converter state back into UConverter, ready for a new character * /

	3037 cnv->toUnicodeStatus=0;

	3038 cnv->mode=state;

	3039

	3040 /* write back the updated pointer */

	3041 pArgs->source=(const char *)source;

	3042 return c;

	3043 }

	3044

	#if 0
	/*
	 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
	 * Removal improves code coverage.
	 */
	/**
	 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
	 * It does not handle the EBCDIC swaplfnl option (set in UConverter).
	 * It does not handle conversion extensions (_extToU()).
	 *
	 * Looks up byte b in row 0 of the state table and returns
	 * the mapped code point, 0xfffe for unassigned (or an unused fallback),
	 * or 0xffff for illegal and reserved action codes.
	 */
	U_CFUNC UChar32
	ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
	                                  uint8_t b, UBool useFallback) {
	    const int32_t stateEntry=sharedData->mbcs.stateTable[0][b];
	    uint8_t finalAction;
	    /* MBCS_ENTRY_IS_FINAL(stateEntry) */

	    /* most common case first: roundtrip BMP code point */
	    if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(stateEntry)) {
	        return (UChar)MBCS_ENTRY_FINAL_VALUE_16(stateEntry);
	    }

	    /*
	     * Every case returns, so the tests are written as a sequence of guards,
	     * in the same order as before.
	     */
	    finalAction=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(stateEntry));

	    /* roundtrip supplementary code point */
	    if(finalAction==MBCS_STATE_VALID_DIRECT_20) {
	        return 0x10000+MBCS_ENTRY_FINAL_VALUE(stateEntry);
	    }

	    /* fallback BMP code point, reported as unassigned when fallbacks are off */
	    if(finalAction==MBCS_STATE_FALLBACK_DIRECT_16) {
	        if(TO_U_USE_FALLBACK(useFallback)) {
	            return (UChar)MBCS_ENTRY_FINAL_VALUE_16(stateEntry);
	        }
	        return 0xfffe;
	    }

	    /* fallback supplementary code point, reported as unassigned when fallbacks are off */
	    if(finalAction==MBCS_STATE_FALLBACK_DIRECT_20) {
	        if(TO_U_USE_FALLBACK(useFallback)) {
	            return 0x10000+MBCS_ENTRY_FINAL_VALUE(stateEntry);
	        }
	        return 0xfffe;
	    }

	    if(finalAction==MBCS_STATE_UNASSIGNED) {
	        return 0xfffe;
	    }

	    /* MBCS_STATE_ILLEGAL, or reserved action codes which must never occur */
	    return 0xffff;
	}
	#endif

	3099

	3100 /*

	3101 * This is a simple version of _MBCSGetNextUChar() that is used

	3102 * by other converter implementations.

	3103 * It only returns an "assigned" result if it consumes the entire input.

	3104 * It does not use state from the converter, nor error codes.

	3105 * It does not handle the EBCDIC swaplfnl option (set in UConverter).

	3106 * It handles conversion extensions but not GB 18030.

	3107 *

	3108 * Return value:

	3109 * U+fffe unassigned

	3110 * U+ffff illegal

	3111 * otherwise the Unicode code point

	3112 */

	3113 U_CFUNC UChar32

	3114 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,

	3115 const char *source, int32_t length,

	3116 UBool useFallback) {

	3117 const int32_t (*stateTable)[256];

	3118 const uint16_t *unicodeCodeUnits;

	3119

	3120 uint32_t offset;

	3121 uint8_t state, action;

	3122

	3123 UChar32 c;

	3124 int32_t i, entry;

	3125

	3126 if(length<=0) {

	3127 /* no input at all: "illegal" */

	3128 return 0xffff;

	3129 }

	3130

	3131 #if 0

	3132 /*

	3133 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. ma rkus

	3134 * TODO In future releases, verify that this function is never called for SBCS

	3135 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true.

	3136 * Removal improves code coverage.

	3137 */

	3138 /* use optimized function if possible */

	3139 if(sharedData->mbcs.countStates==1) {

	3140 if(length==1) {

	3141 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*sourc e, useFallback);

	3142 } else {

	3143 return 0xffff; /* illegal: more than a single byte for an SBCS conve rter */

	3144 }

	3145 }

	3146 #endif

	3147

	3148 /* set up the local pointers */

	3149 stateTable=sharedData->mbcs.stateTable;

	3150 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits;

	3151

	3152 /* converter state */

	3153 offset=0;

	3154 state=sharedData->mbcs.dbcsOnlyState;

	3155

	3156 /* conversion loop */

	3157 for(i=0;;) {

	3158 entry=stateTable[state][(uint8_t)source[i++]];

	3159 if(MBCS_ENTRY_IS_TRANSITION(entry)) {

	3160 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);

	3161 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);

	3162

	3163 if(i==length) {

	3164 return 0xffff; /* truncated character */

	3165 }

	3166 } else {

	3167 /*

	3168 * An if-else-if chain provides more reliable performance for

	3169 * the most common cases compared to a switch.

	3170 */

	3171 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

	3172 if(action==MBCS_STATE_VALID_16) {

	3173 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);

	3174 c=unicodeCodeUnits[offset];

	3175 if(c!=0xfffe) {

	3176 /* done */

	3177 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) {

	3178 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset);

	3179 /* else done with 0xfffe */

	3180 }

	3181 break;

	3182 } else if(action==MBCS_STATE_VALID_DIRECT_16) {

	3183 /* output BMP code point */

	3184 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	3185 break;

	3186 } else if(action==MBCS_STATE_VALID_16_PAIR) {

	3187 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);

	3188 c=unicodeCodeUnits[offset++];

	3189 if(c<0xd800) {

	3190 /* output BMP code point below 0xd800 */

	3191 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {

	3192 /* output roundtrip or fallback supplementary code point */

	3193 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x1000 0-0xdc00));

	3194 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c== 0xe000) {

	3195 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */

	3196 c=unicodeCodeUnits[offset];

	3197 } else if(c==0xffff) {

	3198 return 0xffff;

	3199 } else {

	3200 c=0xfffe;

	3201 }

	3202 break;

	3203 } else if(action==MBCS_STATE_VALID_DIRECT_20) {

	3204 /* output supplementary code point */

	3205 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);

	3206 break;

	3207 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {

	3208 if(!TO_U_USE_FALLBACK(useFallback)) {

	3209 c=0xfffe;

	3210 break;

	3211 }

	3212 /* output BMP code point */

	3213 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);

	3214 break;

	3215 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {

	3216 if(!TO_U_USE_FALLBACK(useFallback)) {

	3217 c=0xfffe;

	3218 break;

	3219 }

	3220 /* output supplementary code point */

	3221 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);

	3222 break;

	3223 } else if(action==MBCS_STATE_UNASSIGNED) {

	3224 c=0xfffe;

	3225 break;

	3226 }

	3227

	3228 /*

	3229 * forbid MBCS_STATE_CHANGE_ONLY for this function,

	3230 * and MBCS_STATE_ILLEGAL and reserved action codes

	3231 */

	3232 return 0xffff;

	3233 }

	3234 }

	3235

	3236 if(i!=length) {

	3237 /* illegal for this function: not all input consumed */

	3238 return 0xffff;

	3239 }

	3240

	3241 if(c==0xfffe) {

	3242 /* try an extension mapping */

	3243 const int32_t *cx=sharedData->mbcs.extIndexes;

	3244 if(cx!=NULL) {

	3245 return ucnv_extSimpleMatchToU(cx, source, length, useFallback);

	3246 }

	3247 }

	3248

	3249 return c;

	3250 }

	3251

	3252 /* MBCS-from-Unicode conversion functions ----------------------------------- */

	3253

	3254 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byt e codepages. */

	3255 static void

	3256 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

	3257 UErrorCode *pErrorCode) {

	3258 UConverter *cnv;

	3259 const UChar source, sourceLimit;

	3260 uint8_t *target;

	3261 int32_t targetCapacity;

	3262 int32_t *offsets;

	3263

	3264 const uint16_t *table;

	3265 const uint16_t *mbcsIndex;

	3266 const uint8_t *bytes;

	3267

	3268 UChar32 c;

	3269

	3270 int32_t sourceIndex, nextSourceIndex;

	3271

	3272 uint32_t stage2Entry;

	3273 uint32_t asciiRoundtrips;

	3274 uint32_t value;

	3275 uint8_t unicodeMask;

	3276

	3277 /* use optimized function if possible */

	3278 cnv=pArgs->converter;

	3279 unicodeMask=cnv->sharedData->mbcs.unicodeMask;

	3280

	3281 /* set up the local pointers */

	3282 source=pArgs->source;

	3283 sourceLimit=pArgs->sourceLimit;

	3284 target=(uint8_t *)pArgs->target;

	3285 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

	3286 offsets=pArgs->offsets;

	3287

	3288 table=cnv->sharedData->mbcs.fromUnicodeTable;

	3289 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;

	3290 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

	3291 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;

	3292 } else {

	3293 bytes=cnv->sharedData->mbcs.fromUnicodeBytes;

	3294 }

	3295 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

	3296

	3297 /* get the converter state from UConverter */

	3298 c=cnv->fromUChar32;

	3299

	3300 /* sourceIndex=-1 if the current character began in the previous buffer */

	3301 sourceIndex= c==0 ? 0 : -1;

	3302 nextSourceIndex=0;

	3303

	3304 /* conversion loop */

	3305 if(c!=0 && targetCapacity>0) {

	3306 goto getTrail;

	3307 }

	3308

	3309 while(source<sourceLimit) {

	3310 /*

	3311 * This following test is to see if available input would overflow the o utput.

	3312 * It does not catch output of more than one byte that

	3313 * overflows as a result of a multi-byte character or callback output

	3314 * from the last source character.

	3315 * Therefore, those situations also test for overflows and will

	3316 * then break the loop, too.

	3317 */

	3318 if(targetCapacity>0) {

	3319 /*

	3320 * Get a correct Unicode code point:

	3321 * a single UChar for a BMP code point or

	3322 * a matched surrogate pair for a "supplementary code point".

	3323 */

	3324 c=*source++;

	3325 ++nextSourceIndex;

	3326 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {

	3327 *target++=(uint8_t)c;

	3328 if(offsets!=NULL) {

	3329 *offsets++=sourceIndex;

	3330 sourceIndex=nextSourceIndex;

	3331 }

	3332 --targetCapacity;

	3333 c=0;

	3334 continue;

	3335 }

	3336 /*

	3337 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX

	3338 * to avoid dealing with surrogates.

	3339 * MBCS_FAST_MAX must be >=0xd7ff.

	3340 */

	3341 if(c<=0xd7ff) {

	3342 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)byt es, c);

	3343 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */

	3344 if(value==0) {

	3345 goto unassigned;

	3346 }

	3347 /* output the value */

	3348 } else {

	3349 /*

	3350 * This also tests if the codepage maps single surrogates.

	3351 * If it does, then surrogates are not paired but mapped separat ely.

	3352 * Note that in this case unmatched surrogates are not detected.

	3353 */

	3354 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {

	3355 if(UTF_IS_SURROGATE_FIRST(c)) {

	3356 getTrail:

	3357 if(source<sourceLimit) {

	3358 /* test the following code unit */

	3359 UChar trail=*source;

	3360 if(UTF_IS_SECOND_SURROGATE(trail)) {

	3361 ++source;

	3362 ++nextSourceIndex;

	3363 c=UTF16_GET_PAIR_VALUE(c, trail);

	3364 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {

	3365 /* BMP-only codepages are stored without sta ge 1 entries for supplementary code points */

	3366 /* callback(unassigned) */

	3367 goto unassigned;

	3368 }

	3369 /* convert this supplementary code point */

	3370 /* exit this condition tree */

	3371 } else {

	3372 /* this is an unmatched lead code unit (1st surr ogate) */

	3373 /* callback(illegal) */

	3374 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	3375 break;

	3376 }

	3377 } else {

	3378 /* no more input */

	3379 break;

	3380 }

	3381 } else {

	3382 /* this is an unmatched trail code unit (2nd surrogate) */

	3383 /* callback(illegal) */

	3384 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	3385 break;

	3386 }

	3387 }

	3388

	3389 /* convert the Unicode code point in c into codepage bytes */

	3390 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);

	3391

	3392 /* get the bytes and the length for the output */

	3393 /* MBCS_OUTPUT_2 */

	3394 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);

	3395

	3396 /* is this code point assigned, or do we use fallbacks? */

	3397 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \|\|

	3398 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))

	3399 ) {

	3400 /*

	3401 * We allow a 0 byte output if the "assigned" bit is set for this entry.

	3402 * There is no way with this data structure for fallback out put

	3403 * to be a zero byte.

	3404 */

	3405

	3406 unassigned:

	3407 /* try an extension mapping */

	3408 pArgs->source=source;

	3409 c=_extFromU(cnv, cnv->sharedData,

	3410 c, &source, sourceLimit,

	3411 &target, target+targetCapacity,

	3412 &offsets, sourceIndex,

	3413 pArgs->flush,

	3414 pErrorCode);

	3415 nextSourceIndex+=(int32_t)(source-pArgs->source);

	3416

	3417 if(U_FAILURE(*pErrorCode)) {

	3418 /* not mappable or buffer overflow */

	3419 break;

	3420 } else {

	3421 /* a mapping was written to the target, continue */

	3422

	3423 /* recalculate the targetCapacity after an extension map ping */

	3424 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)targ et);

	3425

	3426 /* normal end of conversion: prepare for a new character */

	3427 sourceIndex=nextSourceIndex;

	3428 continue;

	3429 }

	3430 }

	3431 }

	3432

	3433 /* write the output character bytes from value and length */

	3434 /* from the first if in the loop we know that targetCapacity>0 */

	3435 if(value<=0xff) {

	3436 /* this is easy because we know that there is enough space */

	3437 *target++=(uint8_t)value;

	3438 if(offsets!=NULL) {

	3439 *offsets++=sourceIndex;

	3440 }

	3441 --targetCapacity;

	3442 } else /* length==2 */ {

	3443 *target++=(uint8_t)(value>>8);

	3444 if(2<=targetCapacity) {

	3445 *target++=(uint8_t)value;

	3446 if(offsets!=NULL) {

	3447 *offsets++=sourceIndex;

	3448 *offsets++=sourceIndex;

	3449 }

	3450 targetCapacity-=2;

	3451 } else {

	3452 if(offsets!=NULL) {

	3453 *offsets++=sourceIndex;

	3454 }

	3455 cnv->charErrorBuffer[0]=(char)value;

	3456 cnv->charErrorBufferLength=1;

	3457

	3458 /* target overflow */

	3459 targetCapacity=0;

	3460 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	3461 c=0;

	3462 break;

	3463 }

	3464 }

	3465

	3466 /* normal end of conversion: prepare for a new character */

	3467 c=0;

	3468 sourceIndex=nextSourceIndex;

	3469 continue;

	3470 } else {

	3471 /* target is full */

	3472 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	3473 break;

	3474 }

	3475 }

	3476

	3477 /* set the converter state back into UConverter */

	3478 cnv->fromUChar32=c;

	3479

	3480 /* write back the updated pointers */

	3481 pArgs->source=source;

	3482 pArgs->target=(char *)target;

	3483 pArgs->offsets=offsets;

	3484 }

	3485

	3486 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byt e codepages. */

	3487 static void

	3488 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

	3489 UErrorCode *pErrorCode) {

	3490 UConverter *cnv;

	3491 const UChar source, sourceLimit;

	3492 uint8_t *target;

	3493 int32_t targetCapacity;

	3494 int32_t *offsets;

	3495

	3496 const uint16_t *table;

	3497 const uint16_t *results;

	3498

	3499 UChar32 c;

	3500

	3501 int32_t sourceIndex, nextSourceIndex;

	3502

	3503 uint16_t value, minValue;

	3504 UBool hasSupplementary;

	3505

	3506 /* set up the local pointers */

	3507 cnv=pArgs->converter;

	3508 source=pArgs->source;

	3509 sourceLimit=pArgs->sourceLimit;

	3510 target=(uint8_t *)pArgs->target;

	3511 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

	3512 offsets=pArgs->offsets;

	3513

	3514 table=cnv->sharedData->mbcs.fromUnicodeTable;

	3515 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

	3516 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;

	3517 } else {

	3518 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;

	3519 }

	3520

	3521 if(cnv->useFallback) {

	3522 /* use all roundtrip and fallback results */

	3523 minValue=0x800;

	3524 } else {

	3525 /* use only roundtrips and fallbacks from private-use characters */

	3526 minValue=0xc00;

	3527 }

	3528 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEME NTARY);

	3529

	3530 /* get the converter state from UConverter */

	3531 c=cnv->fromUChar32;

	3532

	3533 /* sourceIndex=-1 if the current character began in the previous buffer */

	3534 sourceIndex= c==0 ? 0 : -1;

	3535 nextSourceIndex=0;

	3536

	3537 /* conversion loop */

	3538 if(c!=0 && targetCapacity>0) {

	3539 goto getTrail;

	3540 }

	3541

	3542 while(source<sourceLimit) {

	3543 /*

	3544 * This following test is to see if available input would overflow the o utput.

	3545 * It does not catch output of more than one byte that

	3546 * overflows as a result of a multi-byte character or callback output

	3547 * from the last source character.

	3548 * Therefore, those situations also test for overflows and will

	3549 * then break the loop, too.

	3550 */

	3551 if(targetCapacity>0) {

	3552 /*

	3553 * Get a correct Unicode code point:

	3554 * a single UChar for a BMP code point or

	3555 * a matched surrogate pair for a "supplementary code point".

	3556 */

	3557 c=*source++;

	3558 ++nextSourceIndex;

	3559 if(UTF_IS_SURROGATE(c)) {

	3560 if(UTF_IS_SURROGATE_FIRST(c)) {

	3561 getTrail:

	3562 if(source<sourceLimit) {

	3563 /* test the following code unit */

	3564 UChar trail=*source;

	3565 if(UTF_IS_SECOND_SURROGATE(trail)) {

	3566 ++source;

	3567 ++nextSourceIndex;

	3568 c=UTF16_GET_PAIR_VALUE(c, trail);

	3569 if(!hasSupplementary) {

	3570 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */

	3571 /* callback(unassigned) */

	3572 goto unassigned;

	3573 }

	3574 /* convert this supplementary code point */

	3575 /* exit this condition tree */

	3576 } else {

	3577 /* this is an unmatched lead code unit (1st surrogat e) */

	3578 /* callback(illegal) */

	3579 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	3580 break;

	3581 }

	3582 } else {

	3583 /* no more input */

	3584 break;

	3585 }

	3586 } else {

	3587 /* this is an unmatched trail code unit (2nd surrogate) */

	3588 /* callback(illegal) */

	3589 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	3590 break;

	3591 }

	3592 }

	3593

	3594 /* convert the Unicode code point in c into codepage bytes */

	3595 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

	3596

	3597 /* is this code point assigned, or do we use fallbacks? */

	3598 if(value>=minValue) {

	3599 /* assigned, write the output character bytes from value and len gth */

	3600 /* length==1 */

	3601 /* this is easy because we know that there is enough space */

	3602 *target++=(uint8_t)value;

	3603 if(offsets!=NULL) {

	3604 *offsets++=sourceIndex;

	3605 }

	3606 --targetCapacity;

	3607

	3608 /* normal end of conversion: prepare for a new character */

	3609 c=0;

	3610 sourceIndex=nextSourceIndex;

	3611 } else { /* unassigned */

	3612 unassigned:

	3613 /* try an extension mapping */

	3614 pArgs->source=source;

	3615 c=_extFromU(cnv, cnv->sharedData,

	3616 c, &source, sourceLimit,

	3617 &target, target+targetCapacity,

	3618 &offsets, sourceIndex,

	3619 pArgs->flush,

	3620 pErrorCode);

	3621 nextSourceIndex+=(int32_t)(source-pArgs->source);

	3622

	3623 if(U_FAILURE(*pErrorCode)) {

	3624 /* not mappable or buffer overflow */

	3625 break;

	3626 } else {

	3627 /* a mapping was written to the target, continue */

	3628

	3629 /* recalculate the targetCapacity after an extension mapping */

	3630 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);

	3631

	3632 /* normal end of conversion: prepare for a new character */

	3633 sourceIndex=nextSourceIndex;

	3634 }

	3635 }

	3636 } else {

	3637 /* target is full */

	3638 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	3639 break;

	3640 }

	3641 }

	3642

	3643 /* set the converter state back into UConverter */

	3644 cnv->fromUChar32=c;

	3645

	3646 /* write back the updated pointers */

	3647 pArgs->source=source;

	3648 pArgs->target=(char *)target;

	3649 pArgs->offsets=offsets;

	3650 }

	3651

	3652 /*

	3653 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages

	3654 * that map only to and from the BMP.

	3655 * In addition to single-byte/state optimizations, the offset calculations

	3656 * become much easier.

	3657 * It would be possible to use the sbcsIndex for UTF-8-friendly tables,

	3658 * but measurements have shown that this diminishes performance

	3659 * in more cases than it improves it.

	3660 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches

	3661 * for various MBCS and SBCS optimizations.

	3662 */

	3663 static void

	3664 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,

	3665 UErrorCode *pErrorCode) {

	3666 UConverter *cnv;

	3667 const UChar source, sourceLimit, *lastSource;

	3668 uint8_t *target;

	3669 int32_t targetCapacity, length;

	3670 int32_t *offsets;

	3671

	3672 const uint16_t *table;

	3673 const uint16_t *results;

	3674

	3675 UChar32 c;

	3676

	3677 int32_t sourceIndex;

	3678

	3679 uint32_t asciiRoundtrips;

	3680 uint16_t value, minValue;

	3681

	3682 /* set up the local pointers */

	3683 cnv=pArgs->converter;

	3684 source=pArgs->source;

	3685 sourceLimit=pArgs->sourceLimit;

	3686 target=(uint8_t *)pArgs->target;

	3687 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

	3688 offsets=pArgs->offsets;

	3689

	3690 table=cnv->sharedData->mbcs.fromUnicodeTable;

	3691 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

	3692 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;

	3693 } else {

	3694 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;

	3695 }

	3696 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

	3697

	3698 if(cnv->useFallback) {

	3699 /* use all roundtrip and fallback results */

	3700 minValue=0x800;

	3701 } else {

	3702 /* use only roundtrips and fallbacks from private-use characters */

	3703 minValue=0xc00;

	3704 }

	3705

	3706 /* get the converter state from UConverter */

	3707 c=cnv->fromUChar32;

	3708

	3709 /* sourceIndex=-1 if the current character began in the previous buffer */

	3710 sourceIndex= c==0 ? 0 : -1;

	3711 lastSource=source;

	3712

	3713 /*

	3714 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter

	3715 * for the minimum of the sourceLength and targetCapacity

	3716 */

	3717 length=(int32_t)(sourceLimit-source);

	3718 if(length<targetCapacity) {

	3719 targetCapacity=length;

	3720 }

	3721

	3722 /* conversion loop */

	3723 if(c!=0 && targetCapacity>0) {

	3724 goto getTrail;

	3725 }

	3726

	3727 #if MBCS_UNROLL_SINGLE_FROM_BMP

	3728 /* unrolling makes it slower on Pentium III/Windows 2000?! */

	3729 /* unroll the loop with the most common case */

	3730 unrolled:

	3731 if(targetCapacity>=4) {

	3732 int32_t count, loops;

	3733 uint16_t andedValues;

	3734

	3735 loops=count=targetCapacity>>2;

	3736 do {

	3737 c=*source++;

	3738 andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

	3739 *target++=(uint8_t)value;

	3740 c=*source++;

	3741 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

	3742 *target++=(uint8_t)value;

	3743 c=*source++;

	3744 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

	3745 *target++=(uint8_t)value;

	3746 c=*source++;

	3747 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

	3748 *target++=(uint8_t)value;

	3749

	3750 /* were all 4 entries really valid? */

	3751 if(andedValues<minValue) {

	3752 /* no, return to the first of these 4 */

	3753 source-=4;

	3754 target-=4;

	3755 break;

	3756 }

	3757 } while(--count>0);

	3758 count=loops-count;

	3759 targetCapacity-=4*count;

	3760

	3761 if(offsets!=NULL) {

	3762 lastSource+=4*count;

	3763 while(count>0) {

	3764 *offsets++=sourceIndex++;

	3765 *offsets++=sourceIndex++;

	3766 *offsets++=sourceIndex++;

	3767 *offsets++=sourceIndex++;

	3768 --count;

	3769 }

	3770 }

	3771

	3772 c=0;

	3773 }

	3774 #endif

	3775

	3776 while(targetCapacity>0) {

	3777 /*

	3778 * Get a correct Unicode code point:

	3779 * a single UChar for a BMP code point or

	3780 * a matched surrogate pair for a "supplementary code point".

	3781 */

	3782 c=*source++;

	3783 /*

	3784 * Do not immediately check for single surrogates:

	3785 * Assume that they are unassigned and check for them in that case.

	3786 * This speeds up the conversion of assigned characters.

	3787 */

	3788 /* convert the Unicode code point in c into codepage bytes */

	3789 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {

	3790 *target++=(uint8_t)c;

	3791 --targetCapacity;

	3792 c=0;

	3793 continue;

	3794 }

	3795 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

	3796 /* is this code point assigned, or do we use fallbacks? */

	3797 if(value>=minValue) {

	3798 /* assigned, write the output character bytes from value and length */

	3799 /* length==1 */

	3800 /* this is easy because we know that there is enough space */

	3801 *target++=(uint8_t)value;

	3802 --targetCapacity;

	3803

	3804 /* normal end of conversion: prepare for a new character */

	3805 c=0;

	3806 continue;

	3807 } else if(!UTF_IS_SURROGATE(c)) {

	3808 /* normal, unassigned BMP character */

	3809 } else if(UTF_IS_SURROGATE_FIRST(c)) {

	3810 getTrail:

	3811 if(source<sourceLimit) {

	3812 /* test the following code unit */

	3813 UChar trail=*source;

	3814 if(UTF_IS_SECOND_SURROGATE(trail)) {

	3815 ++source;

	3816 c=UTF16_GET_PAIR_VALUE(c, trail);

	3817 /* this codepage does not map supplementary code points */

	3818 /* callback(unassigned) */

	3819 } else {

	3820 /* this is an unmatched lead code unit (1st surrogate) */

	3821 /* callback(illegal) */

	3822 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	3823 break;

	3824 }

	3825 } else {

	3826 /* no more input */

	3827 if (pArgs->flush) {

	3828 *pErrorCode=U_TRUNCATED_CHAR_FOUND;

	3829 }

	3830 break;

	3831 }

	3832 } else {

	3833 /* this is an unmatched trail code unit (2nd surrogate) */

	3834 /* callback(illegal) */

	3835 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	3836 break;

	3837 }

	3838

	3839 /* c does not have a mapping */

	3840

	3841 /* get the number of code units for c to correctly advance sourceIndex * /

	3842 length=U16_LENGTH(c);

	3843

	3844 /* set offsets since the start or the last extension */

	3845 if(offsets!=NULL) {

	3846 int32_t count=(int32_t)(source-lastSource);

	3847

	3848 /* do not set the offset for this character */

	3849 count-=length;

	3850

	3851 while(count>0) {

	3852 *offsets++=sourceIndex++;

	3853 --count;

	3854 }

	3855 /* offsets and sourceIndex are now set for the current character */

	3856 }

	3857

	3858 /* try an extension mapping */

	3859 lastSource=source;

	3860 c=_extFromU(cnv, cnv->sharedData,

	3861 c, &source, sourceLimit,

	3862 &target, (const uint8_t *)(pArgs->targetLimit),

	3863 &offsets, sourceIndex,

	3864 pArgs->flush,

	3865 pErrorCode);

	3866 sourceIndex+=length+(int32_t)(source-lastSource);

	3867 lastSource=source;

	3868

	3869 if(U_FAILURE(*pErrorCode)) {

	3870 /* not mappable or buffer overflow */

	3871 break;

	3872 } else {

	3873 /* a mapping was written to the target, continue */

	3874

	3875 /* recalculate the targetCapacity after an extension mapping */

	3876 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target);

	3877 length=(int32_t)(sourceLimit-source);

	3878 if(length<targetCapacity) {

	3879 targetCapacity=length;

	3880 }

	3881 }

	3882

	3883 #if MBCS_UNROLL_SINGLE_FROM_BMP

	3884 /* unrolling makes it slower on Pentium III/Windows 2000?! */

	3885 goto unrolled;

	3886 #endif

	3887 }

	3888

	3889 if(U_SUCCESS(pErrorCode) && source<sourceLimit && target>=(uint8_t )pArgs- >targetLimit) {

	3890 /* target is full */

	3891 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	3892 }

	3893

	3894 /* set offsets since the start or the last callback */

	3895 if(offsets!=NULL) {

	3896 size_t count=source-lastSource;

	3897 if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) {

	3898 /*

	3899 Caller gave us a partial supplementary character,

	3900 which this function couldn't convert in any case.

	3901 The callback will handle the offset.

	3902 */

	3903 count--;

	3904 }

	3905 while(count>0) {

	3906 *offsets++=sourceIndex++;

	3907 --count;

	3908 }

	3909 }

	3910

	3911 /* set the converter state back into UConverter */

	3912 cnv->fromUChar32=c;

	3913

	3914 /* write back the updated pointers */

	3915 pArgs->source=source;

	3916 pArgs->target=(char *)target;

	3917 pArgs->offsets=offsets;

	3918 }

	3919

	3920 U_CFUNC void

	3921 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

	3922 UErrorCode *pErrorCode) {

	3923 UConverter *cnv;

	3924 const UChar source, sourceLimit;

	3925 uint8_t *target;

	3926 int32_t targetCapacity;

	3927 int32_t *offsets;

	3928

	3929 const uint16_t *table;

	3930 const uint16_t *mbcsIndex;

	3931 const uint8_t p, bytes;

	3932 uint8_t outputType;

	3933

	3934 UChar32 c;

	3935

	3936 int32_t prevSourceIndex, sourceIndex, nextSourceIndex;

	3937

	3938 uint32_t stage2Entry;

	3939 uint32_t asciiRoundtrips;

	3940 uint32_t value;

	3941 uint8_t si_value[2] = {0, 0};

	3942 uint8_t so_value[2] = {0, 0};

	3943 uint8_t si_value_length, so_value_length;

	3944 int32_t length = 0, prevLength;

	3945 uint8_t unicodeMask;

	3946

	3947 cnv=pArgs->converter;

	3948

	3949 if(cnv->preFromUFirstCP>=0) {

	3950 /*

	3951 * pass sourceIndex=-1 because we continue from an earlier buffer

	3952 * in the future, this may change with continuous offsets

	3953 */

	3954 ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);

	3955

	3956 if(U_FAILURE(*pErrorCode) \|\| cnv->preFromULength<0) {

	3957 return;

	3958 }

	3959 }

	3960

	3961 /* use optimized function if possible */

	3962 outputType=cnv->sharedData->mbcs.outputType;

	3963 unicodeMask=cnv->sharedData->mbcs.unicodeMask;

	3964 if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {

	3965 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {

	3966 ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);

	3967 } else {

	3968 ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);

	3969 }

	3970 return;

	3971 } else if(outputType==MBCS_OUTPUT_2 && cnv->sharedData->mbcs.utf8Friendly) {

	3972 ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode);

	3973 return;

	3974 }

	3975

	3976 /* set up the local pointers */

	3977 source=pArgs->source;

	3978 sourceLimit=pArgs->sourceLimit;

	3979 target=(uint8_t *)pArgs->target;

	3980 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

	3981 offsets=pArgs->offsets;

	3982

	3983 table=cnv->sharedData->mbcs.fromUnicodeTable;

	3984 if(cnv->sharedData->mbcs.utf8Friendly) {

	3985 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;

	3986 } else {

	3987 mbcsIndex=NULL;

	3988 }

	3989 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

	3990 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;

	3991 } else {

	3992 bytes=cnv->sharedData->mbcs.fromUnicodeBytes;

	3993 }

	3994 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

	3995

	3996 /* get the converter state from UConverter */

	3997 c=cnv->fromUChar32;

	3998

	3999 if(outputType==MBCS_OUTPUT_2_SISO) {

	4000 prevLength=cnv->fromUnicodeStatus;

	4001 if(prevLength==0) {

	4002 /* set the real value */

	4003 prevLength=1;

	4004 }

	4005 } else {

	4006 /* prevent fromUnicodeStatus from being set to something non-0 */

	4007 prevLength=0;

	4008 }

	4009

	4010 /* sourceIndex=-1 if the current character began in the previous buffer */

	4011 prevSourceIndex=-1;

	4012 sourceIndex= c==0 ? 0 : -1;

	4013 nextSourceIndex=0;

	4014

	4015 /* Get the SI/SO character for the converter */

	4016 si_value_length = getSISOBytes(SI, cnv->options, si_value);

	4017 so_value_length = getSISOBytes(SO, cnv->options, so_value);

	4018

	4019 /* conversion loop */

	4020 /*

	4021 * This is another piece of ugly code:

	4022 * A goto into the loop if the converter state contains a first surrogate

	4023 * from the previous function call.

	4024 * It saves me to check in each loop iteration a check of if(c==0)

	4025 * and duplicating the trail-surrogate-handling code in the else

	4026 * branch of that check.

	4027 * I could not find any other way to get around this other than

	4028 * using a function call for the conversion and callback, which would

	4029 * be even more inefficient.

	4030 *

	4031 * Markus Scherer 2000-jul-19

	4032 */

	4033 if(c!=0 && targetCapacity>0) {

	4034 goto getTrail;

	4035 }

	4036

	4037 while(source<sourceLimit) {

	4038 /*

	4039 * This following test is to see if available input would overflow the o utput.

	4040 * It does not catch output of more than one byte that

	4041 * overflows as a result of a multi-byte character or callback output

	4042 * from the last source character.

	4043 * Therefore, those situations also test for overflows and will

	4044 * then break the loop, too.

	4045 */

	4046 if(targetCapacity>0) {

	4047 /*

	4048 * Get a correct Unicode code point:

	4049 * a single UChar for a BMP code point or

	4050 * a matched surrogate pair for a "supplementary code point".

	4051 */

	4052 c=*source++;

	4053 ++nextSourceIndex;

	4054 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {

	4055 *target++=(uint8_t)c;

	4056 if(offsets!=NULL) {

	4057 *offsets++=sourceIndex;

	4058 prevSourceIndex=sourceIndex;

	4059 sourceIndex=nextSourceIndex;

	4060 }

	4061 --targetCapacity;

	4062 c=0;

	4063 continue;

	4064 }

	4065 /*

	4066 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX

	4067 * to avoid dealing with surrogates.

	4068 * MBCS_FAST_MAX must be >=0xd7ff.

	4069 */

	4070 if(c<=0xd7ff && mbcsIndex!=NULL) {

	4071 value=mbcsIndex[c>>6];

	4072

	4073 /* get the bytes and the length for the output (copied from belo w and adapted for utf8Friendly data) */

	4074 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */

	4075 switch(outputType) {

	4076 case MBCS_OUTPUT_2:

	4077 value=((const uint16_t *)bytes)[value +(c&0x3f)];

	4078 if(value<=0xff) {

	4079 if(value==0) {

	4080 goto unassigned;

	4081 } else {

	4082 length=1;

	4083 }

	4084 } else {

	4085 length=2;

	4086 }

	4087 break;

	4088 case MBCS_OUTPUT_2_SISO:

	4089 /* 1/2-byte stateful with Shift-In/Shift-Out */

	4090 /*

	4091 * Save the old state in the converter object

	4092 * right here, then change the local prevLength state variab le if necessary.

	4093 * Then, if this character turns out to be unassigned or a f allback that

	4094 * is not taken, the callback code must not save the new sta te in the converter

	4095 * because the new state is for a character that is not outp ut.

	4096 * However, the callback must still restore the state from t he converter

	4097 * in case the callback function changed it for its output.

	4098 */

	4099 cnv->fromUnicodeStatus=prevLength; /* save the old state */

	4100 value=((const uint16_t *)bytes)[value +(c&0x3f)];

	4101 if(value<=0xff) {

	4102 if(value==0) {

	4103 goto unassigned;

	4104 } else if(prevLength<=1) {

	4105 length=1;

	4106 } else {

	4107 /* change from double-byte mode to single-byte */

	4108 if (si_value_length == 1) {

	4109 value\|=(uint32_t)si_value[0]<<8;

	4110 length = 2;

	4111 } else if (si_value_length == 2) {

	4112 value\|=(uint32_t)si_value[1]<<8;

	4113 value\|=(uint32_t)si_value[0]<<16;

	4114 length = 3;

	4115 }

	4116 prevLength=1;

	4117 }

	4118 } else {

	4119 if(prevLength==2) {

	4120 length=2;

	4121 } else {

	4122 /* change from single-byte mode to double-byte */

	4123 if (so_value_length == 1) {

	4124 value\|=(uint32_t)so_value[0]<<16;

	4125 length = 3;

	4126 } else if (so_value_length == 2) {

	4127 value\|=(uint32_t)so_value[1]<<16;

	4128 value\|=(uint32_t)so_value[0]<<24;

	4129 length = 4;

	4130 }

	4131 prevLength=2;

	4132 }

	4133 }

	4134 break;

	4135 case MBCS_OUTPUT_DBCS_ONLY:

	4136 /* table with single-byte results, but only DBCS mappings us ed */

	4137 value=((const uint16_t *)bytes)[value +(c&0x3f)];

	4138 if(value<=0xff) {

	4139 /* no mapping or SBCS result, not taken for DBCS-only */

	4140 goto unassigned;

	4141 } else {

	4142 length=2;

	4143 }

	4144 break;

	4145 case MBCS_OUTPUT_3:

	4146 p=bytes+(value+(c&0x3f))*3;

	4147 value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];

	4148 if(value<=0xff) {

	4149 if(value==0) {

	4150 goto unassigned;

	4151 } else {

	4152 length=1;

	4153 }

	4154 } else if(value<=0xffff) {

	4155 length=2;

	4156 } else {

	4157 length=3;

	4158 }

	4159 break;

	4160 case MBCS_OUTPUT_4:

	4161 value=((const uint32_t *)bytes)[value +(c&0x3f)];

	4162 if(value<=0xff) {

	4163 if(value==0) {

	4164 goto unassigned;

	4165 } else {

	4166 length=1;

	4167 }

	4168 } else if(value<=0xffff) {

	4169 length=2;

	4170 } else if(value<=0xffffff) {

	4171 length=3;

	4172 } else {

	4173 length=4;

	4174 }

	4175 break;

	4176 case MBCS_OUTPUT_3_EUC:

	4177 value=((const uint16_t *)bytes)[value +(c&0x3f)];

	4178 /* EUC 16-bit fixed-length representation */

	4179 if(value<=0xff) {

	4180 if(value==0) {

	4181 goto unassigned;

	4182 } else {

	4183 length=1;

	4184 }

	4185 } else if((value&0x8000)==0) {

	4186 value\|=0x8e8000;

	4187 length=3;

	4188 } else if((value&0x80)==0) {

	4189 value\|=0x8f0080;

	4190 length=3;

	4191 } else {

	4192 length=2;

	4193 }

	4194 break;

	4195 case MBCS_OUTPUT_4_EUC:

	4196 p=bytes+(value+(c&0x3f))*3;

	4197 value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];

	4198 /* EUC 16-bit fixed-length representation applied to the fir st two bytes */

	4199 if(value<=0xff) {

	4200 if(value==0) {

	4201 goto unassigned;

	4202 } else {

	4203 length=1;

	4204 }

	4205 } else if(value<=0xffff) {

	4206 length=2;

	4207 } else if((value&0x800000)==0) {

	4208 value\|=0x8e800000;

	4209 length=4;

	4210 } else if((value&0x8000)==0) {

	4211 value\|=0x8f008000;

	4212 length=4;

	4213 } else {

	4214 length=3;

	4215 }

	4216 break;

	4217 default:

	4218 /* must not occur */

	4219 /*

	4220 * To avoid compiler warnings that value & length may be

	4221 * used without having been initialized, we set them here.

	4222 * In reality, this is unreachable code.

	4223 * Not having a default branch also causes warnings with

	4224 * some compilers.

	4225 */

	4226 value=0;

	4227 length=0;

	4228 break;

	4229 }

	4230 /* output the value */

	4231 } else {

	4232 /*

	4233 * This also tests if the codepage maps single surrogates.

	4234 * If it does, then surrogates are not paired but mapped separat ely.

	4235 * Note that in this case unmatched surrogates are not detected.

	4236 */

	4237 if(UTF_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) {

	4238 if(UTF_IS_SURROGATE_FIRST(c)) {

	4239 getTrail:

	4240 if(source<sourceLimit) {

	4241 /* test the following code unit */

	4242 UChar trail=*source;

	4243 if(UTF_IS_SECOND_SURROGATE(trail)) {

	4244 ++source;

	4245 ++nextSourceIndex;

	4246 c=UTF16_GET_PAIR_VALUE(c, trail);

	4247 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {

	4248 /* BMP-only codepages are stored without sta ge 1 entries for supplementary code points */

	4249 cnv->fromUnicodeStatus=prevLength; /* save t he old state */

	4250 /* callback(unassigned) */

	4251 goto unassigned;

	4252 }

	4253 /* convert this supplementary code point */

	4254 /* exit this condition tree */

	4255 } else {

	4256 /* this is an unmatched lead code unit (1st surr ogate) */

	4257 /* callback(illegal) */

	4258 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	4259 break;

	4260 }

	4261 } else {

	4262 /* no more input */

	4263 break;

	4264 }

	4265 } else {

	4266 /* this is an unmatched trail code unit (2nd surrogate) */

	4267 /* callback(illegal) */

	4268 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	4269 break;

	4270 }

	4271 }

	4272

	4273 /* convert the Unicode code point in c into codepage bytes */

	4274

	4275 /*

	4276 * The basic lookup is a triple-stage compact array (trie) looku p.

	4277 * For details see the beginning of this file.

	4278 *

	4279 * Single-byte codepages are handled with a different data struc ture

	4280 * by _MBCSSingle... functions.

	4281 *

	4282 * The result consists of a 32-bit value from stage 2 and

	4283 * a pointer to as many bytes as are stored per character.

	4284 * The pointer points to the character's bytes in stage 3.

	4285 * Bits 15..0 of the stage 2 entry contain the stage 3 index

	4286 * for that pointer, while bits 31..16 are flags for which of

	4287 * the 16 characters in the block are roundtrip-assigned.

	4288 *

	4289 * For 2-byte and 4-byte codepages, the bytes are stored as uint 16_t

	4290 * respectively as uint32_t, in the platform encoding.

	4291 * For 3-byte codepages, the bytes are always stored in big-endi an order.

	4292 *

	4293 * For EUC encodings that use only either 0x8e or 0x8f as the fi rst

	4294 * byte of their longest byte sequences, the first two bytes in

	4295 * this third stage indicate with their 7th bits whether these b ytes

	4296 * are to be written directly or actually need to be preceeded b y

	4297 * one of the two Single-Shift codes. With this, the third stage

	4298 * stores one byte fewer per character than the actual maximum l ength of

	4299 * EUC byte sequences.

	4300 *

	4301 * Other than that, leading zero bytes are removed and the other

	4302 * bytes output. A single zero byte may be output if the "assign ed"

	4303 * bit in stage 2 was on.

	4304 * The data structure does not support zero byte output as a fal lback,

	4305 * and also does not allow output of leading zeros.

	4306 */

	4307 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);

	4308

	4309 /* get the bytes and the length for the output */

	4310 switch(outputType) {

	4311 case MBCS_OUTPUT_2:

	4312 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);

	4313 if(value<=0xff) {

	4314 length=1;

	4315 } else {

	4316 length=2;

	4317 }

	4318 break;

	4319 case MBCS_OUTPUT_2_SISO:

	4320 /* 1/2-byte stateful with Shift-In/Shift-Out */

	4321 /*

	4322 * Save the old state in the converter object

	4323 * right here, then change the local prevLength state variab le if necessary.

	4324 * Then, if this character turns out to be unassigned or a f allback that

	4325 * is not taken, the callback code must not save the new sta te in the converter

	4326 * because the new state is for a character that is not outp ut.

	4327 * However, the callback must still restore the state from t he converter

	4328 * in case the callback function changed it for its output.

	4329 */

	4330 cnv->fromUnicodeStatus=prevLength; /* save the old state */

	4331 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);

	4332 if(value<=0xff) {

	4333 if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)= =0) {

	4334 /* no mapping, leave value==0 */

	4335 length=0;

	4336 } else if(prevLength<=1) {

	4337 length=1;

	4338 } else {

	4339 /* change from double-byte mode to single-byte */

	4340 if (si_value_length == 1) {

	4341 value\|=(uint32_t)si_value[0]<<8;

	4342 length = 2;

	4343 } else if (si_value_length == 2) {

	4344 value\|=(uint32_t)si_value[1]<<8;

	4345 value\|=(uint32_t)si_value[0]<<16;

	4346 length = 3;

	4347 }

	4348 prevLength=1;

	4349 }

	4350 } else {

	4351 if(prevLength==2) {

	4352 length=2;

	4353 } else {

	4354 /* change from single-byte mode to double-byte */

	4355 if (so_value_length == 1) {

	4356 value\|=(uint32_t)so_value[0]<<16;

	4357 length = 3;

	4358 } else if (so_value_length == 2) {

	4359 value\|=(uint32_t)so_value[1]<<16;

	4360 value\|=(uint32_t)so_value[0]<<24;

	4361 length = 4;

	4362 }

	4363 prevLength=2;

	4364 }

	4365 }

	4366 break;

	4367 case MBCS_OUTPUT_DBCS_ONLY:

	4368 /* table with single-byte results, but only DBCS mappings us ed */

	4369 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);

	4370 if(value<=0xff) {

	4371 /* no mapping or SBCS result, not taken for DBCS-only */

	4372 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */

	4373 length=0;

	4374 } else {

	4375 length=2;

	4376 }

	4377 break;

	4378 case MBCS_OUTPUT_3:

	4379 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);

	4380 value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];

	4381 if(value<=0xff) {

	4382 length=1;

	4383 } else if(value<=0xffff) {

	4384 length=2;

	4385 } else {

	4386 length=3;

	4387 }

	4388 break;

	4389 case MBCS_OUTPUT_4:

	4390 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c);

	4391 if(value<=0xff) {

	4392 length=1;

	4393 } else if(value<=0xffff) {

	4394 length=2;

	4395 } else if(value<=0xffffff) {

	4396 length=3;

	4397 } else {

	4398 length=4;

	4399 }

	4400 break;

	4401 case MBCS_OUTPUT_3_EUC:

	4402 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c);

	4403 /* EUC 16-bit fixed-length representation */

	4404 if(value<=0xff) {

	4405 length=1;

	4406 } else if((value&0x8000)==0) {

	4407 value\|=0x8e8000;

	4408 length=3;

	4409 } else if((value&0x80)==0) {

	4410 value\|=0x8f0080;

	4411 length=3;

	4412 } else {

	4413 length=2;

	4414 }

	4415 break;

	4416 case MBCS_OUTPUT_4_EUC:

	4417 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);

	4418 value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];

	4419 /* EUC 16-bit fixed-length representation applied to the fir st two bytes */

	4420 if(value<=0xff) {

	4421 length=1;

	4422 } else if(value<=0xffff) {

	4423 length=2;

	4424 } else if((value&0x800000)==0) {

	4425 value\|=0x8e800000;

	4426 length=4;

	4427 } else if((value&0x8000)==0) {

	4428 value\|=0x8f008000;

	4429 length=4;

	4430 } else {

	4431 length=3;

	4432 }

	4433 break;

	4434 default:

	4435 /* must not occur */

	4436 /*

	4437 * To avoid compiler warnings that value & length may be

	4438 * used without having been initialized, we set them here.

	4439 * In reality, this is unreachable code.

	4440 * Not having a default branch also causes warnings with

	4441 * some compilers.

	4442 */

	4443 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip fla gs */

	4444 length=0;

	4445 break;

	4446 }

	4447

	4448 /* is this code point assigned, or do we use fallbacks? */

	4449 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 \|\|

	4450 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))

	4451 ) {

	4452 /*

	4453 * We allow a 0 byte output if the "assigned" bit is set for this entry.

	4454 * There is no way with this data structure for fallback out put

	4455 * to be a zero byte.

	4456 */

	4457

	4458 unassigned:

	4459 /* try an extension mapping */

	4460 pArgs->source=source;

	4461 c=_extFromU(cnv, cnv->sharedData,

	4462 c, &source, sourceLimit,

	4463 &target, target+targetCapacity,

	4464 &offsets, sourceIndex,

	4465 pArgs->flush,

	4466 pErrorCode);

	4467 nextSourceIndex+=(int32_t)(source-pArgs->source);

	4468 prevLength=cnv->fromUnicodeStatus; /* restore SISO state */

	4469

	4470 if(U_FAILURE(*pErrorCode)) {

	4471 /* not mappable or buffer overflow */

	4472 break;

	4473 } else {

	4474 /* a mapping was written to the target, continue */

	4475

	4476 /* recalculate the targetCapacity after an extension map ping */

	4477 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)targ et);

	4478

	4479 /* normal end of conversion: prepare for a new character */

	4480 if(offsets!=NULL) {

	4481 prevSourceIndex=sourceIndex;

	4482 sourceIndex=nextSourceIndex;

	4483 }

	4484 continue;

	4485 }

	4486 }

	4487 }

	4488

	4489 /* write the output character bytes from value and length */

	4490 /* from the first if in the loop we know that targetCapacity>0 */

	4491 if(length<=targetCapacity) {

	4492 if(offsets==NULL) {

	4493 switch(length) {

	4494 /* each branch falls through to the next one */

	4495 case 4:

	4496 *target++=(uint8_t)(value>>24);

	4497 case 3:

	4498 *target++=(uint8_t)(value>>16);

	4499 case 2:

	4500 *target++=(uint8_t)(value>>8);

	4501 case 1:

	4502 *target++=(uint8_t)value;

	4503 default:

	4504 /* will never occur */

	4505 break;

	4506 }

	4507 } else {

	4508 switch(length) {

	4509 /* each branch falls through to the next one */

	4510 case 4:

	4511 *target++=(uint8_t)(value>>24);

	4512 *offsets++=sourceIndex;

	4513 case 3:

	4514 *target++=(uint8_t)(value>>16);

	4515 *offsets++=sourceIndex;

	4516 case 2:

	4517 *target++=(uint8_t)(value>>8);

	4518 *offsets++=sourceIndex;

	4519 case 1:

	4520 *target++=(uint8_t)value;

	4521 *offsets++=sourceIndex;

	4522 default:

	4523 /* will never occur */

	4524 break;

	4525 }

	4526 }

	4527 targetCapacity-=length;

	4528 } else {

	4529 uint8_t *charErrorBuffer;

	4530

	4531 /*

	4532 * We actually do this backwards here:

	4533 * In order to save an intermediate variable, we output

	4534 * first to the overflow buffer what does not fit into the

	4535 * regular target.

	4536 */

	4537 /* we know that 1<=targetCapacity<length<=4 */

	4538 length-=targetCapacity;

	4539 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;

	4540 switch(length) {

	4541 /* each branch falls through to the next one */

	4542 case 3:

	4543 *charErrorBuffer++=(uint8_t)(value>>16);

	4544 case 2:

	4545 *charErrorBuffer++=(uint8_t)(value>>8);

	4546 case 1:

	4547 *charErrorBuffer=(uint8_t)value;

	4548 default:

	4549 /* will never occur */

	4550 break;

	4551 }

	4552 cnv->charErrorBufferLength=(int8_t)length;

	4553

	4554 /* now output what fits into the regular target */

	4555 value>>=8length; / length was reduced by targetCapacity */

	4556 switch(targetCapacity) {

	4557 /* each branch falls through to the next one */

	4558 case 3:

	4559 *target++=(uint8_t)(value>>16);

	4560 if(offsets!=NULL) {

	4561 *offsets++=sourceIndex;

	4562 }

	4563 case 2:

	4564 *target++=(uint8_t)(value>>8);

	4565 if(offsets!=NULL) {

	4566 *offsets++=sourceIndex;

	4567 }

	4568 case 1:

	4569 *target++=(uint8_t)value;

	4570 if(offsets!=NULL) {

	4571 *offsets++=sourceIndex;

	4572 }

	4573 default:

	4574 /* will never occur */

	4575 break;

	4576 }

	4577

	4578 /* target overflow */

	4579 targetCapacity=0;

	4580 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	4581 c=0;

	4582 break;

	4583 }

	4584

	4585 /* normal end of conversion: prepare for a new character */

	4586 c=0;

	4587 if(offsets!=NULL) {

	4588 prevSourceIndex=sourceIndex;

	4589 sourceIndex=nextSourceIndex;

	4590 }

	4591 continue;

	4592 } else {

	4593 /* target is full */

	4594 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	4595 break;

	4596 }

	4597 }

	4598

	4599 /*

	4600 * the end of the input stream and detection of truncated input

	4601 * are handled by the framework, but for EBCDIC_STATEFUL conversion

	4602 * we need to emit an SI at the very end

	4603 *

	4604 * conditions:

	4605 * successful

	4606 * EBCDIC_STATEFUL in DBCS mode

	4607 * end of input and no truncated input

	4608 */

	4609 if( U_SUCCESS(*pErrorCode) &&

	4610 outputType==MBCS_OUTPUT_2_SISO && prevLength==2 &&

	4611 pArgs->flush && source>=sourceLimit && c==0

	4612 ) {

	4613 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output str eam to SBCS */

	4614 if(targetCapacity>0) {

	4615 *target++=(uint8_t)si_value[0];

	4616 if (si_value_length == 2) {

	4617 if (targetCapacity<2) {

	4618 cnv->charErrorBuffer[0]=(uint8_t)si_value[1];

	4619 cnv->charErrorBufferLength=1;

	4620 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	4621 } else {

	4622 *target++=(uint8_t)si_value[1];

	4623 }

	4624 }

	4625 if(offsets!=NULL) {

	4626 /* set the last source character's index (sourceIndex points at sourceLimit now) */

	4627 *offsets++=prevSourceIndex;

	4628 }

	4629 } else {

	4630 /* target is full */

	4631 cnv->charErrorBuffer[0]=(uint8_t)si_value[0];

	4632 if (si_value_length == 2) {

	4633 cnv->charErrorBuffer[1]=(uint8_t)si_value[1];

	4634 }

	4635 cnv->charErrorBufferLength=si_value_length;

	4636 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	4637 }

	4638 prevLength=1; /* we switched into SBCS */

	4639 }

	4640

	4641 /* set the converter state back into UConverter */

	4642 cnv->fromUChar32=c;

	4643 cnv->fromUnicodeStatus=prevLength;

	4644

	4645 /* write back the updated pointers */

	4646 pArgs->source=source;

	4647 pArgs->target=(char *)target;

	4648 pArgs->offsets=offsets;

	4649 }

	4650

	4651 /*

	4652 * This is another simple conversion function for internal use by other

	4653 * conversion implementations.

	4654 * It does not use the converter state nor call callbacks.

	4655 * It does not handle the EBCDIC swaplfnl option (set in UConverter).

	4656 * It handles conversion extensions but not GB 18030.

	4657 *

	4658 * It converts one single Unicode code point into codepage bytes, encoded

	4659 * as one 32-bit value. The function returns the number of bytes in *pValue:

	4660 * 1..4 the number of bytes in *pValue

	4661 * 0 unassigned (*pValue undefined)

	4662 * -1 illegal (currently not used, *pValue undefined)

	4663 *

	4664 * *pValue will contain the resulting bytes with the last byte in bits 7..0,

	4665 * the second to last byte in bits 15..8, etc.

	4666 * Currently, the function assumes but does not check that 0<=c<=0x10ffff.

	4667 */

	4668 U_CFUNC int32_t

	4669 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData,

	4670 UChar32 c, uint32_t *pValue,

	4671 UBool useFallback) {

	4672 const int32_t *cx;

	4673 const uint16_t *table;

	4674 #if 0

	4675 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */

	4676 const uint8_t *p;

	4677 #endif

	4678 uint32_t stage2Entry;

	4679 uint32_t value;

	4680 int32_t length;

	4681

	4682 /* BMP-only codepages are stored without stage 1 entries for supplementary c ode points */

	4683 if(c<=0xffff \|\| (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {

	4684 table=sharedData->mbcs.fromUnicodeTable;

	4685

	4686 /* convert the Unicode code point in c into codepage bytes (same as in _ MBCSFromUnicodeWithOffsets) */

	4687 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) {

	4688 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs. fromUnicodeBytes, c);

	4689 /* is this code point assigned, or do we use fallbacks? */

	4690 if(useFallback ? value>=0x800 : value>=0xc00) {

	4691 *pValue=value&0xff;

	4692 return 1;

	4693 }

	4694 } else /* outputType!=MBCS_OUTPUT_1 */ {

	4695 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);

	4696

	4697 /* get the bytes and the length for the output */

	4698 switch(sharedData->mbcs.outputType) {

	4699 case MBCS_OUTPUT_2:

	4700 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeByte s, stage2Entry, c);

	4701 if(value<=0xff) {

	4702 length=1;

	4703 } else {

	4704 length=2;

	4705 }

	4706 break;

	4707 #if 0

	4708 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */

	4709 case MBCS_OUTPUT_DBCS_ONLY:

	4710 /* table with single-byte results, but only DBCS mappings used * /

	4711 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeByte s, stage2Entry, c);

	4712 if(value<=0xff) {

	4713 /* no mapping or SBCS result, not taken for DBCS-only */

	4714 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip fla gs */

	4715 length=0;

	4716 } else {

	4717 length=2;

	4718 }

	4719 break;

	4720 case MBCS_OUTPUT_3:

	4721 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);

	4722 value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];

	4723 if(value<=0xff) {

	4724 length=1;

	4725 } else if(value<=0xffff) {

	4726 length=2;

	4727 } else {

	4728 length=3;

	4729 }

	4730 break;

	4731 case MBCS_OUTPUT_4:

	4732 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeByte s, stage2Entry, c);

	4733 if(value<=0xff) {

	4734 length=1;

	4735 } else if(value<=0xffff) {

	4736 length=2;

	4737 } else if(value<=0xffffff) {

	4738 length=3;

	4739 } else {

	4740 length=4;

	4741 }

	4742 break;

	4743 case MBCS_OUTPUT_3_EUC:

	4744 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeByte s, stage2Entry, c);

	4745 /* EUC 16-bit fixed-length representation */

	4746 if(value<=0xff) {

	4747 length=1;

	4748 } else if((value&0x8000)==0) {

	4749 value\|=0x8e8000;

	4750 length=3;

	4751 } else if((value&0x80)==0) {

	4752 value\|=0x8f0080;

	4753 length=3;

	4754 } else {

	4755 length=2;

	4756 }

	4757 break;

	4758 case MBCS_OUTPUT_4_EUC:

	4759 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);

	4760 value=((uint32_t)*p<<16)\|((uint32_t)p[1]<<8)\|p[2];

	4761 /* EUC 16-bit fixed-length representation applied to the first t wo bytes */

	4762 if(value<=0xff) {

	4763 length=1;

	4764 } else if(value<=0xffff) {

	4765 length=2;

	4766 } else if((value&0x800000)==0) {

	4767 value\|=0x8e800000;

	4768 length=4;

	4769 } else if((value&0x8000)==0) {

	4770 value\|=0x8f008000;

	4771 length=4;

	4772 } else {

	4773 length=3;

	4774 }

	4775 break;

	4776 #endif

	4777 default:

	4778 /* must not occur */

	4779 return -1;

	4780 }

	4781

	4782 /* is this code point assigned, or do we use fallbacks? */

	4783 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \|\|

	4784 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0)

	4785 ) {

	4786 /*

	4787 * We allow a 0 byte output if the "assigned" bit is set for thi s entry.

	4788 * There is no way with this data structure for fallback output

	4789 * to be a zero byte.

	4790 */

	4791 /* assigned */

	4792 *pValue=value;

	4793 return length;

	4794 }

	4795 }

	4796 }

	4797

	4798 cx=sharedData->mbcs.extIndexes;

	4799 if(cx!=NULL) {

	4800 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback);

	4801 return length>=0 ? length : -length; /* return abs(length); */

	4802 }

	4803

	4804 /* unassigned */

	4805 return 0;

	4806 }

	4807

	4808

	4809 #if 0

	4810 /*

	4811 * This function has been moved to ucnv2022.c for inlining.

	4812 * This implementation is here only for documentation purposes

	4813 */

	4814

	4815 /**

	4816 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages .

	4817 * It does not handle the EBCDIC swaplfnl option (set in UConverter).

	4818 * It does not handle conversion extensions (_extFromU()).

	4819 *

	4820 * It returns the codepage byte for the code point, or -1 if it is unassigned.

	4821 */

	4822 U_CFUNC int32_t

	4823 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData,

	4824 UChar32 c,

	4825 UBool useFallback) {

	4826 const uint16_t *table;

	4827 int32_t value;

	4828

	4829 /* BMP-only codepages are stored without stage 1 entries for supplementary c ode points */

	4830 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {

	4831 return -1;

	4832 }

	4833

	4834 /* convert the Unicode code point in c into codepage bytes (same as in _MBCS FromUnicodeWithOffsets) */

	4835 table=sharedData->mbcs.fromUnicodeTable;

	4836

	4837 /* get the byte for the output */

	4838 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnic odeBytes, c);

	4839 /* is this code point assigned, or do we use fallbacks? */

	4840 if(useFallback ? value>=0x800 : value>=0xc00) {

	4841 return value&0xff;

	4842 } else {

	4843 return -1;

	4844 }

	4845 }

	4846 #endif

	4847

	4848 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */

	4849

	4850 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */

	4851 static const UChar32

	4852 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };

	4853

	4854 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail )<<6+trail... */

	4855 static const UChar32

	4856 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };

	4857

	4858 static void

	4859 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,

	4860 UConverterToUnicodeArgs *pToUArgs,

	4861 UErrorCode *pErrorCode) {

	4862 UConverter utf8, cnv;

	4863 const uint8_t source, sourceLimit;

	4864 uint8_t *target;

	4865 int32_t targetCapacity;

	4866

	4867 const uint16_t table, sbcsIndex;

	4868 const uint16_t *results;

	4869

	4870 int8_t oldToULength, toULength, toULimit;

	4871

	4872 UChar32 c;

	4873 uint8_t b, t1, t2;

	4874

	4875 uint32_t asciiRoundtrips;

	4876 uint16_t value, minValue;

	4877 UBool hasSupplementary;

	4878

	4879 /* set up the local pointers */

	4880 utf8=pToUArgs->converter;

	4881 cnv=pFromUArgs->converter;

	4882 source=(uint8_t *)pToUArgs->source;

	4883 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;

	4884 target=(uint8_t *)pFromUArgs->target;

	4885 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

	4886

	4887 table=cnv->sharedData->mbcs.fromUnicodeTable;

	4888 sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;

	4889 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

	4890 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;

	4891 } else {

	4892 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;

	4893 }

	4894 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

	4895

	4896 if(cnv->useFallback) {

	4897 /* use all roundtrip and fallback results */

	4898 minValue=0x800;

	4899 } else {

	4900 /* use only roundtrips and fallbacks from private-use characters */

	4901 minValue=0xc00;

	4902 }

	4903 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEME NTARY);

	4904

	4905 /* get the converter state from the UTF-8 UConverter */

	4906 c=(UChar32)utf8->toUnicodeStatus;

	4907 if(c!=0) {

	4908 toULength=oldToULength=utf8->toULength;

	4909 toULimit=(int8_t)utf8->mode;

	4910 } else {

	4911 toULength=oldToULength=toULimit=0;

	4912 }

	4913

	4914 /*

	4915 * Make sure that the last byte sequence before sourceLimit is complete

	4916 * or runs into a lead byte.

	4917 * Do not go back into the bytes that will be read for finishing a partial

	4918 * sequence from the previous buffer.

	4919 * In the conversion loop compare source with sourceLimit only once

	4920 * per multi-byte character.

	4921 */

	4922 {

	4923 int32_t i, length;

	4924

	4925 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);

	4926 for(i=0; i<3 && i<length;) {

	4927 b=*(sourceLimit-i-1);

	4928 if(U8_IS_TRAIL(b)) {

	4929 ++i;

	4930 } else {

	4931 if(i<utf8_countTrailBytes[b]) {

	4932 /* exit the conversion loop before the lead byte if there ar e not enough trail bytes for it */

	4933 sourceLimit-=i+1;

	4934 }

	4935 break;

	4936 }

	4937 }

	4938 }

	4939

	4940 if(c!=0 && targetCapacity>0) {

	4941 utf8->toUnicodeStatus=0;

	4942 utf8->toULength=0;

	4943 goto moreBytes;

	4944 /*

	4945 * Note: We could avoid the goto by duplicating some of the moreBytes

	4946 * code, but only up to the point of collecting a complete UTF-8

	4947 * sequence; then recurse for the toUBytes[toULength]

	4948 * and then continue with normal conversion.

	4949 *

	4950 * If so, move this code to just after initializing the minimum

	4951 * set of local variables for reading the UTF-8 input

	4952 * (utf8, source, target, limits but not cnv, table, minValue, etc.).

	4953 *

	4954 * Potential advantages:

	4955 * - avoid the goto

	4956 * - oldToULength could become a local variable in just those code block s

	4957 * that deal with buffer boundaries

	4958 * - possibly faster if the goto prevents some compiler optimizations

	4959 * (this would need measuring to confirm)

	4960 * Disadvantage:

	4961 * - code duplication

	4962 */

	4963 }

	4964

	4965 /* conversion loop */

	4966 while(source<sourceLimit) {

	4967 if(targetCapacity>0) {

	4968 b=*source++;

	4969 if((int8_t)b>=0) {

	4970 /* convert ASCII */

	4971 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {

	4972 *target++=(uint8_t)b;

	4973 --targetCapacity;

	4974 continue;

	4975 } else {

	4976 c=b;

	4977 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);

	4978 }

	4979 } else {

	4980 if(b<0xe0) {

	4981 if( /* handle U+0080..U+07FF inline */

	4982 b>=0xc2 &&

	4983 (t1=(uint8_t)(*source-0x80)) <= 0x3f

	4984 ) {

	4985 c=b&0x1f;

	4986 ++source;

	4987 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);

	4988 if(value>=minValue) {

	4989 *target++=(uint8_t)value;

	4990 --targetCapacity;

	4991 continue;

	4992 } else {

	4993 c=(c<<6)\|t1;

	4994 }

	4995 } else {

	4996 c=-1;

	4997 }

	4998 } else if(b==0xe0) {

	4999 if( /* handle U+0800..U+0FFF inline */

	5000 (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&

	5001 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f

	5002 ) {

	5003 c=t1;

	5004 source+=2;

	5005 value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);

	5006 if(value>=minValue) {

	5007 *target++=(uint8_t)value;

	5008 --targetCapacity;

	5009 continue;

	5010 } else {

	5011 c=(c<<6)\|t2;

	5012 }

	5013 } else {

	5014 c=-1;

	5015 }

	5016 } else {

	5017 c=-1;

	5018 }

	5019

	5020 if(c<0) {

	5021 /* handle "complicated" and error cases, and continuing part ial characters */

	5022 oldToULength=0;

	5023 toULength=1;

	5024 toULimit=utf8_countTrailBytes[b]+1;

	5025 c=b;

	5026 moreBytes:

	5027 while(toULength<toULimit) {

	5028 /*

	5029 * The sourceLimit may have been adjusted before the con version loop

	5030 * to stop before a truncated sequence.

	5031 * Here we need to use the real limit in case we have tw o truncated

	5032 * sequences at the end.

	5033 * See ticket #7492.

	5034 */

	5035 if(source<(uint8_t *)pToUArgs->sourceLimit) {

	5036 b=*source;

	5037 if(U8_IS_TRAIL(b)) {

	5038 ++source;

	5039 ++toULength;

	5040 c=(c<<6)+b;

	5041 } else {

	5042 break; /* sequence too short, stop with toULengt h<toULimit */

	5043 }

	5044 } else {

	5045 /* store the partial UTF-8 character, compatible wit h the regular UTF-8 converter */

	5046 source-=(toULength-oldToULength);

	5047 while(oldToULength<toULength) {

	5048 utf8->toUBytes[oldToULength++]=*source++;

	5049 }

	5050 utf8->toUnicodeStatus=c;

	5051 utf8->toULength=toULength;

	5052 utf8->mode=toULimit;

	5053 pToUArgs->source=(char *)source;

	5054 pFromUArgs->target=(char *)target;

	5055 return;

	5056 }

	5057 }

	5058

	5059 if( toULength==toULimit && /* consumed all trail bytes */

	5060 (toULength==3 \|\| toULength==2) && /* BMP */

	5061 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] & &

	5062 (c<=0xd7ff \|\| 0xe000<=c) /* not a surrogate */

	5063 ) {

	5064 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

	5065 } else if(

	5066 toULength==toULimit && toULength==4 &&

	5067 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)

	5068 ) {

	5069 /* supplementary code point */

	5070 if(!hasSupplementary) {

	5071 /* BMP-only codepages are stored without stage 1 ent ries for supplementary code points */

	5072 value=0;

	5073 } else {

	5074 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);

	5075 }

	5076 } else {

	5077 /* error handling: illegal UTF-8 byte sequence */

	5078 source-=(toULength-oldToULength);

	5079 while(oldToULength<toULength) {

	5080 utf8->toUBytes[oldToULength++]=*source++;

	5081 }

	5082 utf8->toULength=toULength;

	5083 pToUArgs->source=(char *)source;

	5084 pFromUArgs->target=(char *)target;

	5085 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	5086 return;

	5087 }

	5088 }

	5089 }

	5090

	5091 if(value>=minValue) {

	5092 /* output the mapping for c */

	5093 *target++=(uint8_t)value;

	5094 --targetCapacity;

	5095 } else {

	5096 /* value<minValue means c is unassigned (unmappable) */

	5097 /*

	5098 * Try an extension mapping.

	5099 * Pass in no source because we don't have UTF-16 input.

	5100 * If we have a partial match on c, we will return and revert

	5101 * to UTF-8->UTF-16->charset conversion.

	5102 */

	5103 static const UChar nul=0;

	5104 const UChar *noSource=&nul;

	5105 c=_extFromU(cnv, cnv->sharedData,

	5106 c, &noSource, noSource,

	5107 &target, target+targetCapacity,

	5108 NULL, -1,

	5109 pFromUArgs->flush,

	5110 pErrorCode);

	5111

	5112 if(U_FAILURE(*pErrorCode)) {

	5113 /* not mappable or buffer overflow */

	5114 cnv->fromUChar32=c;

	5115 break;

	5116 } else if(cnv->preFromUFirstCP>=0) {

	5117 /*

	5118 * Partial match, return and revert to pivoting.

	5119 * In normal from-UTF-16 conversion, we would just continue

	5120 * but then exit the loop because the extension match would

	5121 * have consumed the source.

	5122 */

	5123 break;

	5124 } else {

	5125 /* a mapping was written to the target, continue */

	5126

	5127 /* recalculate the targetCapacity after an extension mapping */

	5128 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)tar get);

	5129 }

	5130 }

	5131 } else {

	5132 /* target is full */

	5133 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	5134 break;

	5135 }

	5136 }

	5137

	5138 /*

	5139 * The sourceLimit may have been adjusted before the conversion loop

	5140 * to stop before a truncated sequence.

	5141 * If so, then collect the truncated sequence now.

	5142 */

	5143 if(U_SUCCESS(pErrorCode) && source<(sourceLimit=(uint8_t )pToUArgs->source Limit)) {

	5144 c=utf8->toUBytes[0]=b=*source++;

	5145 toULength=1;

	5146 toULimit=utf8_countTrailBytes[b]+1;

	5147 while(source<sourceLimit) {

	5148 utf8->toUBytes[toULength++]=b=*source++;

	5149 c=(c<<6)+b;

	5150 }

	5151 utf8->toUnicodeStatus=c;

	5152 utf8->toULength=toULength;

	5153 utf8->mode=toULimit;

	5154 }

	5155

	5156 /* write back the updated pointers */

	5157 pToUArgs->source=(char *)source;

	5158 pFromUArgs->target=(char *)target;

	5159 }

	5160

	5161 static void

	5162 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,

	5163 UConverterToUnicodeArgs *pToUArgs,

	5164 UErrorCode *pErrorCode) {

	5165 UConverter utf8, cnv;

	5166 const uint8_t source, sourceLimit;

	5167 uint8_t *target;

	5168 int32_t targetCapacity;

	5169

	5170 const uint16_t table, mbcsIndex;

	5171 const uint16_t *results;

	5172

	5173 int8_t oldToULength, toULength, toULimit;

	5174

	5175 UChar32 c;

	5176 uint8_t b, t1, t2;

	5177

	5178 uint32_t stage2Entry;

	5179 uint32_t asciiRoundtrips;

	5180 uint16_t value, minValue;

	5181 UBool hasSupplementary;

	5182

	5183 /* set up the local pointers */

	5184 utf8=pToUArgs->converter;

	5185 cnv=pFromUArgs->converter;

	5186 source=(uint8_t *)pToUArgs->source;

	5187 sourceLimit=(uint8_t *)pToUArgs->sourceLimit;

	5188 target=(uint8_t *)pFromUArgs->target;

	5189 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

	5190

	5191 table=cnv->sharedData->mbcs.fromUnicodeTable;

	5192 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;

	5193 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {

	5194 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;

	5195 } else {

	5196 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;

	5197 }

	5198 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

	5199

	5200 if(cnv->useFallback) {

	5201 /* use all roundtrip and fallback results */

	5202 minValue=0x800;

	5203 } else {

	5204 /* use only roundtrips and fallbacks from private-use characters */

	5205 minValue=0xc00;

	5206 }

	5207 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEME NTARY);

	5208

	5209 /* get the converter state from the UTF-8 UConverter */

	5210 c=(UChar32)utf8->toUnicodeStatus;

	5211 if(c!=0) {

	5212 toULength=oldToULength=utf8->toULength;

	5213 toULimit=(int8_t)utf8->mode;

	5214 } else {

	5215 toULength=oldToULength=toULimit=0;

	5216 }

	5217

	5218 /*

	5219 * Make sure that the last byte sequence before sourceLimit is complete

	5220 * or runs into a lead byte.

	5221 * Do not go back into the bytes that will be read for finishing a partial

	5222 * sequence from the previous buffer.

	5223 * In the conversion loop compare source with sourceLimit only once

	5224 * per multi-byte character.

	5225 */

	5226 {

	5227 int32_t i, length;

	5228

	5229 length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);

	5230 for(i=0; i<3 && i<length;) {

	5231 b=*(sourceLimit-i-1);

	5232 if(U8_IS_TRAIL(b)) {

	5233 ++i;

	5234 } else {

	5235 if(i<utf8_countTrailBytes[b]) {

	5236 /* exit the conversion loop before the lead byte if there ar e not enough trail bytes for it */

	5237 sourceLimit-=i+1;

	5238 }

	5239 break;

	5240 }

	5241 }

	5242 }

	5243

	5244 if(c!=0 && targetCapacity>0) {

	5245 utf8->toUnicodeStatus=0;

	5246 utf8->toULength=0;

	5247 goto moreBytes;

	5248 /* See note in ucnv_SBCSFromUTF8() about this goto. */

	5249 }

	5250

	5251 /* conversion loop */

	5252 while(source<sourceLimit) {

	5253 if(targetCapacity>0) {

	5254 b=*source++;

	5255 if((int8_t)b>=0) {

	5256 /* convert ASCII */

	5257 if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {

	5258 *target++=b;

	5259 --targetCapacity;

	5260 continue;

	5261 } else {

	5262 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);

	5263 if(value==0) {

	5264 c=b;

	5265 goto unassigned;

	5266 }

	5267 }

	5268 } else {

	5269 if(b>0xe0) {

	5270 if( /* handle U+1000..U+D7FF inline */

	5271 (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f) ) \|\|

	5272 (b==0xed && (t1 <= 0x1f) )) &&

	5273 (t2=(uint8_t)(source[1]-0x80)) <= 0x3f

	5274 ) {

	5275 c=((b&0xf)<<6)\|t1;

	5276 source+=2;

	5277 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);

	5278 if(value==0) {

	5279 c=(c<<6)\|t2;

	5280 goto unassigned;

	5281 }

	5282 } else {

	5283 c=-1;

	5284 }

	5285 } else if(b<0xe0) {

	5286 if( /* handle U+0080..U+07FF inline */

	5287 b>=0xc2 &&

	5288 (t1=(uint8_t)(*source-0x80)) <= 0x3f

	5289 ) {

	5290 c=b&0x1f;

	5291 ++source;

	5292 value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);

	5293 if(value==0) {

	5294 c=(c<<6)\|t1;

	5295 goto unassigned;

	5296 }

	5297 } else {

	5298 c=-1;

	5299 }

	5300 } else {

	5301 c=-1;

	5302 }

	5303

	5304 if(c<0) {

	5305 /* handle "complicated" and error cases, and continuing part ial characters */

	5306 oldToULength=0;

	5307 toULength=1;

	5308 toULimit=utf8_countTrailBytes[b]+1;

	5309 c=b;

	5310 moreBytes:

	5311 while(toULength<toULimit) {

	5312 /*

	5313 * The sourceLimit may have been adjusted before the con version loop

	5314 * to stop before a truncated sequence.

	5315 * Here we need to use the real limit in case we have tw o truncated

	5316 * sequences at the end.

	5317 * See ticket #7492.

	5318 */

	5319 if(source<(uint8_t *)pToUArgs->sourceLimit) {

	5320 b=*source;

	5321 if(U8_IS_TRAIL(b)) {

	5322 ++source;

	5323 ++toULength;

	5324 c=(c<<6)+b;

	5325 } else {

	5326 break; /* sequence too short, stop with toULengt h<toULimit */

	5327 }

	5328 } else {

	5329 /* store the partial UTF-8 character, compatible wit h the regular UTF-8 converter */

	5330 source-=(toULength-oldToULength);

	5331 while(oldToULength<toULength) {

	5332 utf8->toUBytes[oldToULength++]=*source++;

	5333 }

	5334 utf8->toUnicodeStatus=c;

	5335 utf8->toULength=toULength;

	5336 utf8->mode=toULimit;

	5337 pToUArgs->source=(char *)source;

	5338 pFromUArgs->target=(char *)target;

	5339 return;

	5340 }

	5341 }

	5342

	5343 if( toULength==toULimit && /* consumed all trail bytes */

	5344 (toULength==3 \|\| toULength==2) && /* BMP */

	5345 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] & &

	5346 (c<=0xd7ff \|\| 0xe000<=c) /* not a surrogate */

	5347 ) {

	5348 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);

	5349 } else if(

	5350 toULength==toULimit && toULength==4 &&

	5351 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)

	5352 ) {

	5353 /* supplementary code point */

	5354 if(!hasSupplementary) {

	5355 /* BMP-only codepages are stored without stage 1 ent ries for supplementary code points */

	5356 stage2Entry=0;

	5357 } else {

	5358 stage2Entry=MBCS_STAGE_2_FROM_U(table, c);

	5359 }

	5360 } else {

	5361 /* error handling: illegal UTF-8 byte sequence */

	5362 source-=(toULength-oldToULength);

	5363 while(oldToULength<toULength) {

	5364 utf8->toUBytes[oldToULength++]=*source++;

	5365 }

	5366 utf8->toULength=toULength;

	5367 pToUArgs->source=(char *)source;

	5368 pFromUArgs->target=(char *)target;

	5369 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	5370 return;

	5371 }

	5372

	5373 /* get the bytes and the length for the output */

	5374 /* MBCS_OUTPUT_2 */

	5375 value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);

	5376

	5377 /* is this code point assigned, or do we use fallbacks? */

	5378 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) \|\|

	5379 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))

	5380 ) {

	5381 goto unassigned;

	5382 }

	5383 }

	5384 }

	5385

	5386 /* write the output character bytes from value and length */

	5387 /* from the first if in the loop we know that targetCapacity>0 */

	5388 if(value<=0xff) {

	5389 /* this is easy because we know that there is enough space */

	5390 *target++=(uint8_t)value;

	5391 --targetCapacity;

	5392 } else /* length==2 */ {

	5393 *target++=(uint8_t)(value>>8);

	5394 if(2<=targetCapacity) {

	5395 *target++=(uint8_t)value;

	5396 targetCapacity-=2;

	5397 } else {

	5398 cnv->charErrorBuffer[0]=(char)value;

	5399 cnv->charErrorBufferLength=1;

	5400

	5401 /* target overflow */

	5402 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	5403 break;

	5404 }

	5405 }

	5406 continue;

	5407

	5408 unassigned:

	5409 {

	5410 /*

	5411 * Try an extension mapping.

	5412 * Pass in no source because we don't have UTF-16 input.

	5413 * If we have a partial match on c, we will return and revert

	5414 * to UTF-8->UTF-16->charset conversion.

	5415 */

	5416 static const UChar nul=0;

	5417 const UChar *noSource=&nul;

	5418 c=_extFromU(cnv, cnv->sharedData,

	5419 c, &noSource, noSource,

	5420 &target, target+targetCapacity,

	5421 NULL, -1,

	5422 pFromUArgs->flush,

	5423 pErrorCode);

	5424

	5425 if(U_FAILURE(*pErrorCode)) {

	5426 /* not mappable or buffer overflow */

	5427 cnv->fromUChar32=c;

	5428 break;

	5429 } else if(cnv->preFromUFirstCP>=0) {

	5430 /*

	5431 * Partial match, return and revert to pivoting.

	5432 * In normal from-UTF-16 conversion, we would just continue

	5433 * but then exit the loop because the extension match would

	5434 * have consumed the source.

	5435 */

	5436 break;

	5437 } else {

	5438 /* a mapping was written to the target, continue */

	5439

	5440 /* recalculate the targetCapacity after an extension mapping */

	5441 targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)tar get);

	5442 continue;

	5443 }

	5444 }

	5445 } else {

	5446 /* target is full */

	5447 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	5448 break;

	5449 }

	5450 }

	5451

	5452 /*

	5453 * The sourceLimit may have been adjusted before the conversion loop

	5454 * to stop before a truncated sequence.

	5455 * If so, then collect the truncated sequence now.

	5456 */

	5457 if(U_SUCCESS(pErrorCode) && source<(sourceLimit=(uint8_t )pToUArgs->source Limit)) {

	5458 c=utf8->toUBytes[0]=b=*source++;

	5459 toULength=1;

	5460 toULimit=utf8_countTrailBytes[b]+1;

	5461 while(source<sourceLimit) {

	5462 utf8->toUBytes[toULength++]=b=*source++;

	5463 c=(c<<6)+b;

	5464 }

	5465 utf8->toUnicodeStatus=c;

	5466 utf8->toULength=toULength;

	5467 utf8->mode=toULimit;

	5468 }

	5469

	5470 /* write back the updated pointers */

	5471 pToUArgs->source=(char *)source;

	5472 pFromUArgs->target=(char *)target;

	5473 }

	5474

	5475 /* miscellaneous ------------------------------------------------------------ */

	5476

	5477 static void

	5478 ucnv_MBCSGetStarters(const UConverter* cnv,

	5479 UBool starters[256],

	5480 UErrorCode *pErrorCode) {

	5481 const int32_t *state0;

	5482 int i;

	5483

	5484 state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState] ;

	5485 for(i=0; i<256; ++i) {

	5486 /* all bytes that cause a state transition from state 0 are lead bytes * /

	5487 starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);

	5488 }

	5489 }

	5490

	5491 /*

	5492 * This is an internal function that allows other converter implementations

	5493 * to check whether a byte is a lead byte.

	5494 */

	5495 U_CFUNC UBool

	5496 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) {

	5497 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8 _t)byte]);

	5498 }

	5499

	5500 static void

	5501 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,

	5502 int32_t offsetIndex,

	5503 UErrorCode *pErrorCode) {

	5504 UConverter *cnv=pArgs->converter;

	5505 char p, subchar;

	5506 char buffer[4];

	5507 int32_t length;

	5508

	5509 /* first, select between subChar and subChar1 */

	5510 if( cnv->subChar1!=0 &&

	5511 (cnv->sharedData->mbcs.extIndexes!=NULL ?

	5512 cnv->useSubChar1 :

	5513 (cnv->invalidUCharBuffer[0]<=0xff))

	5514 ) {

	5515 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */

	5516 subchar=(char *)&cnv->subChar1;

	5517 length=1;

	5518 } else {

	5519 /* select subChar in all other cases */

	5520 subchar=(char *)cnv->subChars;

	5521 length=cnv->subCharLen;

	5522 }

	5523

	5524 /* reset the selector for the next code point */

	5525 cnv->useSubChar1=FALSE;

	5526

	5527 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) {

	5528 p=buffer;

	5529

	5530 /* fromUnicodeStatus contains prevLength */

	5531 switch(length) {

	5532 case 1:

	5533 if(cnv->fromUnicodeStatus==2) {

	5534 /* DBCS mode and SBCS sub char: change to SBCS */

	5535 cnv->fromUnicodeStatus=1;

	5536 *p++=UCNV_SI;

	5537 }

	5538 *p++=subchar[0];

	5539 break;

	5540 case 2:

	5541 if(cnv->fromUnicodeStatus<=1) {

	5542 /* SBCS mode and DBCS sub char: change to DBCS */

	5543 cnv->fromUnicodeStatus=2;

	5544 *p++=UCNV_SO;

	5545 }

	5546 *p++=subchar[0];

	5547 *p++=subchar[1];

	5548 break;

	5549 default:

	5550 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

	5551 return;

	5552 }

	5553 subchar=buffer;

	5554 length=(int32_t)(p-buffer);

	5555 }

	5556

	5557 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode);

	5558 }

	5559

	5560 U_CFUNC UConverterType

	5561 ucnv_MBCSGetType(const UConverter* converter) {

	5562 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by MBCS, but here we cheat a little */

	5563 if(converter->sharedData->mbcs.countStates==1) {

	5564 return (UConverterType)UCNV_SBCS;

	5565 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) {

	5566 return (UConverterType)UCNV_EBCDIC_STATEFUL;

	5567 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter ->sharedData->staticData->maxBytesPerChar==2) {

	5568 return (UConverterType)UCNV_DBCS;

	5569 }

	5570 return (UConverterType)UCNV_MBCS;

	5571 }

	5572

	5573 static const UConverterImpl _SBCSUTF8Impl={

	5574 UCNV_MBCS,

	5575

	5576 ucnv_MBCSLoad,

	5577 ucnv_MBCSUnload,

	5578

	5579 ucnv_MBCSOpen,

	5580 NULL,

	5581 NULL,

	5582

	5583 ucnv_MBCSToUnicodeWithOffsets,

	5584 ucnv_MBCSToUnicodeWithOffsets,

	5585 ucnv_MBCSFromUnicodeWithOffsets,

	5586 ucnv_MBCSFromUnicodeWithOffsets,

	5587 ucnv_MBCSGetNextUChar,

	5588

	5589 ucnv_MBCSGetStarters,

	5590 ucnv_MBCSGetName,

	5591 ucnv_MBCSWriteSub,

	5592 NULL,

	5593 ucnv_MBCSGetUnicodeSet,

	5594

	5595 NULL,

	5596 ucnv_SBCSFromUTF8

	5597 };

	5598

	5599 static const UConverterImpl _DBCSUTF8Impl={

	5600 UCNV_MBCS,

	5601

	5602 ucnv_MBCSLoad,

	5603 ucnv_MBCSUnload,

	5604

	5605 ucnv_MBCSOpen,

	5606 NULL,

	5607 NULL,

	5608

	5609 ucnv_MBCSToUnicodeWithOffsets,

	5610 ucnv_MBCSToUnicodeWithOffsets,

	5611 ucnv_MBCSFromUnicodeWithOffsets,

	5612 ucnv_MBCSFromUnicodeWithOffsets,

	5613 ucnv_MBCSGetNextUChar,

	5614

	5615 ucnv_MBCSGetStarters,

	5616 ucnv_MBCSGetName,

	5617 ucnv_MBCSWriteSub,

	5618 NULL,

	5619 ucnv_MBCSGetUnicodeSet,

	5620

	5621 NULL,

	5622 ucnv_DBCSFromUTF8

	5623 };

	5624

	5625 static const UConverterImpl _MBCSImpl={

	5626 UCNV_MBCS,

	5627

	5628 ucnv_MBCSLoad,

	5629 ucnv_MBCSUnload,

	5630

	5631 ucnv_MBCSOpen,

	5632 NULL,

	5633 NULL,

	5634

	5635 ucnv_MBCSToUnicodeWithOffsets,

	5636 ucnv_MBCSToUnicodeWithOffsets,

	5637 ucnv_MBCSFromUnicodeWithOffsets,

	5638 ucnv_MBCSFromUnicodeWithOffsets,

	5639 ucnv_MBCSGetNextUChar,

	5640

	5641 ucnv_MBCSGetStarters,

	5642 ucnv_MBCSGetName,

	5643 ucnv_MBCSWriteSub,

	5644 NULL,

	5645 ucnv_MBCSGetUnicodeSet

	5646 };

	5647

	5648

	5649 /* Static data is in tools/makeconv/ucnvstat.c for data-based

	5650 * converters. Be sure to update it as well.

	5651 */

	5652

	5653 const UConverterSharedData _MBCSData={

	5654 sizeof(UConverterSharedData), 1,

	5655 NULL, NULL, NULL, FALSE, &_MBCSImpl,

	5656 0

	5657 };

	5658

	5659 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

OLD	NEW

« no previous file with comments | « icu46/source/common/ucnvmbcs.h ('k') | icu46/source/common/ucnvscsu.c » ('j') | no next file with comments »