icu46/source/common/ucnvbocu.c - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/ucnvbocu.c

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 ******************************************************************************

	3 *

	4 * Copyright (C) 2002-2005, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 ******************************************************************************

	8 * file name: ucnvbocu.c

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created on: 2002mar27

	14 * created by: Markus W. Scherer

	15 *

	16 * This is an implementation of the Binary Ordered Compression for Unicode,

	17 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/

	18 */

	19

	20 #include "unicode/utypes.h"

	21

	22 #if !UCONFIG_NO_CONVERSION

	23

	24 #include "unicode/ucnv.h"

	25 #include "unicode/ucnv_cb.h"

	26 #include "ucnv_bld.h"

	27 #include "ucnv_cnv.h"

	28

	29 /* BOCU-1 constants and macros ---------------------------------------------- */

	30

	31 /*

	32 * BOCU-1 encodes the code points of a Unicode string as

	33 * a sequence of byte-encoded differences (slope detection),

	34 * preserving lexical order.

	35 *

	36 * Optimize the difference-taking for runs of Unicode text within

	37 * small scripts:

	38 *

	39 * Most small scripts are allocated within aligned 128-blocks of Unicode

	40 * code points. Lexical order is preserved if the "previous code point" state

	41 * is always moved into the middle of such a block.

	42 *

	43 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul

	44 * areas into the middle of those areas.

	45 *

	46 * C0 control codes and space are encoded with their US-ASCII bytes.

	47 * "prev" is reset for C0 controls but not for space.

	48 */

	49

	50 /* initial value for "prev": middle of the ASCII range */

	51 #define BOCU1_ASCII_PREV 0x40

	52

	53 /* bounding byte values for differences */

	54 #define BOCU1_MIN 0x21

	55 #define BOCU1_MIDDLE 0x90

	56 #define BOCU1_MAX_LEAD 0xfe

	57 #define BOCU1_MAX_TRAIL 0xff

	58 #define BOCU1_RESET 0xff

	59

	60 /* number of lead bytes */

	61 #define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)

	62

	63 /* adjust trail byte counts for the use of some C0 control byte values */

	64 #define BOCU1_TRAIL_CONTROLS_COUNT 20

	65 #define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

	66

	67 /* number of trail bytes */

	68 #define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTR OLS_COUNT)

	69

	70 /*

	71 * number of positive and negative single-byte codes

	72 * (counting 0==BOCU1_MIDDLE among the positive ones)

	73 */

	74 #define BOCU1_SINGLE 64

	75

	76 /* number of lead bytes for positive and negative 2/3/4-byte sequences */

	77 #define BOCU1_LEAD_2 43

	78 #define BOCU1_LEAD_3 3

	79 #define BOCU1_LEAD_4 1

	80

	81 /* The difference value range for single-byters. */

	82 #define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)

	83 #define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)

	84

	85 /* The difference value range for double-byters. */

	86 #define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

	87 #define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

	88

	89 /* The difference value range for 3-byters. */

	90 #define BOCU1_REACH_POS_3 \

	91 (BOCU1_REACH_POS_2+BOCU1_LEAD_3BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT)

	92

	93 #define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3BOCU1_TRAIL_COUNTBO CU1_TRAIL_COUNT)

	94

	95 /* The lead byte start values. */

	96 #define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)

	97 #define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)

	98 #define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)

	99 /* ==BOCU1_MAX_LEAD */

	100

	101 #define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)

	102 #define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)

	103 #define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)

	104 /* ==BOCU1_MIN+1 */

	105

	106 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */

	107 #define BOCU1_LENGTH_FROM_LEAD(lead) \

	108 ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \

	109 (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \

	110 (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

	111

	112 /* The length of a byte sequence, according to its packed form. */

	113 #define BOCU1_LENGTH_FROM_PACKED(packed) \

	114 ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)

	115

	116 /*

	117 * 12 commonly used C0 control codes (and space) are only used to encode

	118 * themselves directly,

	119 * which makes BOCU-1 MIME-usable and reasonably safe for

	120 * ASCII-oriented software.

	121 *

	122 * These controls are

	123 * 0 NUL

	124 *

	125 * 7 BEL

	126 * 8 BS

	127 *

	128 * 9 TAB

	129 * a LF

	130 * b VT

	131 * c FF

	132 * d CR

	133 *

	134 * e SO

	135 * f SI

	136 *

	137 * 1a SUB

	138 * 1b ESC

	139 *

	140 * The other 20 C0 controls are also encoded directly (to preserve order)

	141 * but are also used as trail bytes in difference encoding

	142 * (for better compression).

	143 */

	144 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAI L_BYTE_OFFSET : bocu1TrailToByte[t])

	145

	146 /*

	147 * Byte value map for control codes,

	148 * from external byte values 0x00..0x20

	149 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.

	150 * External byte values that are illegal as trail bytes are mapped to -1.

	151 */

	152 static const int8_t

	153 bocu1ByteToTrail[BOCU1_MIN]={

	154 /* 0 1 2 3 4 5 6 7 */

	155 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,

	156

	157 /* 8 9 a b c d e f */

	158 -1, -1, -1, -1, -1, -1, -1, -1,

	159

	160 /* 10 11 12 13 14 15 16 17 */

	161 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,

	162

	163 /* 18 19 1a 1b 1c 1d 1e 1f */

	164 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,

	165

	166 /* 20 */

	167 -1

	168 };

	169

	170 /*

	171 * Byte value map for control codes,

	172 * from trail byte values 0..19 (0..0x13) as used in the difference calculation

	173 * to external byte values 0x00..0x20.

	174 */

	175 static const int8_t

	176 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={

	177 /* 0 1 2 3 4 5 6 7 */

	178 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,

	179

	180 /* 8 9 a b c d e f */

	181 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,

	182

	183 /* 10 11 12 13 */

	184 0x1c, 0x1d, 0x1e, 0x1f

	185 };

	186

	187 /**

	188 * Integer division and modulo with negative numerators

	189 * yields negative modulo results and quotients that are one more than

	190 * what we need here.

	191 * This macro adjust the results so that the modulo-value m is always >=0.

	192 *

	193 * For positive n, the if() condition is always FALSE.

	194 *

	195 * @param n Number to be split into quotient and rest.

	196 * Will be modified to contain the quotient.

	197 * @param d Divisor.

	198 * @param m Output variable for the rest (modulo result).

	199 */

	200 #define NEGDIVMOD(n, d, m) { \

	201 (m)=(n)%(d); \

	202 (n)/=(d); \

	203 if((m)<0) { \

	204 --(n); \

	205 (m)+=(d); \

	206 } \

	207 }

	208

	209 /* BOCU-1 implementation functions ------------------------------------------ */

	210

	211 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)

	212

	213 /**

	214 * Compute the next "previous" value for differencing

	215 * from the current code point.

	216 *

	217 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)

	218 * @return "previous code point" state value

	219 */

	220 static U_INLINE int32_t

	221 bocu1Prev(int32_t c) {

	222 /* compute new prev */

	223 if(/* 0x3040<=c && */ c<=0x309f) {

	224 /* Hiragana is not 128-aligned */

	225 return 0x3070;

	226 } else if(0x4e00<=c && c<=0x9fa5) {

	227 /* CJK Unihan */

	228 return 0x4e00-BOCU1_REACH_NEG_2;

	229 } else if(0xac00<=c /* && c<=0xd7a3 */) {

	230 /* Korean Hangul */

	231 return (0xd7a3+0xac00)/2;

	232 } else {

	233 /* mostly small scripts */

	234 return BOCU1_SIMPLE_PREV(c);

	235 }

	236 }

	237

	238 /** Fast version of bocu1Prev() for most scripts. */

	239 #define BOCU1_PREV(c) ((c)<0x3040 \|\| (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Pr ev(c))

	240

	241 /*

	242 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.

	243 * The UConverter fields are used as follows:

	244 *

	245 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)

	246 *

	247 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)

	248 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)

	249 */

	250

	251 /* BOCU-1-from-Unicode conversion functions --------------------------------- */

	252

	253 /**

	254 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes

	255 * and return a packed integer with them.

	256 *

	257 * The encoding favors small absolut differences with short encodings

	258 * to compress runs of same-script characters.

	259 *

	260 * Optimized version with unrolled loops and fewer floating-point operations

	261 * than the standard packDiff().

	262 *

	263 * @param diff difference value -0x10ffff..0x10ffff

	264 * @return

	265 * 0x010000zz for 1-byte sequence zz

	266 * 0x0200yyzz for 2-byte sequence yy zz

	267 * 0x03xxyyzz for 3-byte sequence xx yy zz

	268 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)

	269 */

	270 static int32_t

	271 packDiff(int32_t diff) {

	272 int32_t result, m;

	273

	274 if(diff>=BOCU1_REACH_NEG_1) {

	275 /* mostly positive differences, and single-byte negative ones */

	276 #if 0 /* single-byte case handled in macros, see below */

	277 if(diff<=BOCU1_REACH_POS_1) {

	278 /* single byte */

	279 return 0x01000000\|(BOCU1_MIDDLE+diff);

	280 } else

	281 #endif

	282 if(diff<=BOCU1_REACH_POS_2) {

	283 /* two bytes */

	284 diff-=BOCU1_REACH_POS_1+1;

	285 result=0x02000000;

	286

	287 m=diff%BOCU1_TRAIL_COUNT;

	288 diff/=BOCU1_TRAIL_COUNT;

	289 result\|=BOCU1_TRAIL_TO_BYTE(m);

	290

	291 result\|=(BOCU1_START_POS_2+diff)<<8;

	292 } else if(diff<=BOCU1_REACH_POS_3) {

	293 /* three bytes */

	294 diff-=BOCU1_REACH_POS_2+1;

	295 result=0x03000000;

	296

	297 m=diff%BOCU1_TRAIL_COUNT;

	298 diff/=BOCU1_TRAIL_COUNT;

	299 result\|=BOCU1_TRAIL_TO_BYTE(m);

	300

	301 m=diff%BOCU1_TRAIL_COUNT;

	302 diff/=BOCU1_TRAIL_COUNT;

	303 result\|=BOCU1_TRAIL_TO_BYTE(m)<<8;

	304

	305 result\|=(BOCU1_START_POS_3+diff)<<16;

	306 } else {

	307 /* four bytes */

	308 diff-=BOCU1_REACH_POS_3+1;

	309

	310 m=diff%BOCU1_TRAIL_COUNT;

	311 diff/=BOCU1_TRAIL_COUNT;

	312 result=BOCU1_TRAIL_TO_BYTE(m);

	313

	314 m=diff%BOCU1_TRAIL_COUNT;

	315 diff/=BOCU1_TRAIL_COUNT;

	316 result\|=BOCU1_TRAIL_TO_BYTE(m)<<8;

	317

	318 /*

	319 * We know that / and % would deliver quotient 0 and rest=diff.

	320 * Avoid division and modulo for performance.

	321 */

	322 result\|=BOCU1_TRAIL_TO_BYTE(diff)<<16;

	323

	324 result\|=((uint32_t)BOCU1_START_POS_4)<<24;

	325 }

	326 } else {

	327 /* two- to four-byte negative differences */

	328 if(diff>=BOCU1_REACH_NEG_2) {

	329 /* two bytes */

	330 diff-=BOCU1_REACH_NEG_1;

	331 result=0x02000000;

	332

	333 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

	334 result\|=BOCU1_TRAIL_TO_BYTE(m);

	335

	336 result\|=(BOCU1_START_NEG_2+diff)<<8;

	337 } else if(diff>=BOCU1_REACH_NEG_3) {

	338 /* three bytes */

	339 diff-=BOCU1_REACH_NEG_2;

	340 result=0x03000000;

	341

	342 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

	343 result\|=BOCU1_TRAIL_TO_BYTE(m);

	344

	345 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

	346 result\|=BOCU1_TRAIL_TO_BYTE(m)<<8;

	347

	348 result\|=(BOCU1_START_NEG_3+diff)<<16;

	349 } else {

	350 /* four bytes */

	351 diff-=BOCU1_REACH_NEG_3;

	352

	353 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

	354 result=BOCU1_TRAIL_TO_BYTE(m);

	355

	356 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

	357 result\|=BOCU1_TRAIL_TO_BYTE(m)<<8;

	358

	359 /*

	360 * We know that NEGDIVMOD would deliver

	361 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.

	362 * Avoid division and modulo for performance.

	363 */

	364 m=diff+BOCU1_TRAIL_COUNT;

	365 result\|=BOCU1_TRAIL_TO_BYTE(m)<<16;

	366

	367 result\|=BOCU1_MIN<<24;

	368 }

	369 }

	370 return result;

	371 }

	372

	373 /* Faster versions of packDiff() for single-byte-encoded diff values. */

	374

	375 /** Is a diff value encodable in a single byte? */

	376 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_P OS_1)

	377

	378 /** Encode a diff value in a single byte. */

	379 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

	380

	381 /** Is a diff value encodable in two bytes? */

	382 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_P OS_2)

	383

	384 static void

	385 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

	386 UErrorCode *pErrorCode) {

	387 UConverter *cnv;

	388 const UChar source, sourceLimit;

	389 uint8_t *target;

	390 int32_t targetCapacity;

	391 int32_t *offsets;

	392

	393 int32_t prev, c, diff;

	394

	395 int32_t sourceIndex, nextSourceIndex;

	396

	397 U_ALIGN_CODE(16)

	398

	399 /* set up the local pointers */

	400 cnv=pArgs->converter;

	401 source=pArgs->source;

	402 sourceLimit=pArgs->sourceLimit;

	403 target=(uint8_t *)pArgs->target;

	404 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

	405 offsets=pArgs->offsets;

	406

	407 /* get the converter state from UConverter */

	408 c=cnv->fromUChar32;

	409 prev=(int32_t)cnv->fromUnicodeStatus;

	410 if(prev==0) {

	411 prev=BOCU1_ASCII_PREV;

	412 }

	413

	414 /* sourceIndex=-1 if the current character began in the previous buffer */

	415 sourceIndex= c==0 ? 0 : -1;

	416 nextSourceIndex=0;

	417

	418 /* conversion loop */

	419 if(c!=0 && targetCapacity>0) {

	420 goto getTrail;

	421 }

	422

	423 fastSingle:

	424 /* fast loop for single-byte differences */

	425 /* use only one loop counter variable, targetCapacity, not also source */

	426 diff=(int32_t)(sourceLimit-source);

	427 if(targetCapacity>diff) {

	428 targetCapacity=diff;

	429 }

	430 while(targetCapacity>0 && (c=*source)<0x3000) {

	431 if(c<=0x20) {

	432 if(c!=0x20) {

	433 prev=BOCU1_ASCII_PREV;

	434 }

	435 *target++=(uint8_t)c;

	436 *offsets++=nextSourceIndex++;

	437 ++source;

	438 --targetCapacity;

	439 } else {

	440 diff=c-prev;

	441 if(DIFF_IS_SINGLE(diff)) {

	442 prev=BOCU1_SIMPLE_PREV(c);

	443 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);

	444 *offsets++=nextSourceIndex++;

	445 ++source;

	446 --targetCapacity;

	447 } else {

	448 break;

	449 }

	450 }

	451 }

	452 /* restore real values */

	453 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);

	454 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter * /

	455

	456 /* regular loop for all cases */

	457 while(source<sourceLimit) {

	458 if(targetCapacity>0) {

	459 c=*source++;

	460 ++nextSourceIndex;

	461

	462 if(c<=0x20) {

	463 /*

	464 * ISO C0 control & space:

	465 * Encode directly for MIME compatibility,

	466 * and reset state except for space, to not disrupt compression.

	467 */

	468 if(c!=0x20) {

	469 prev=BOCU1_ASCII_PREV;

	470 }

	471 *target++=(uint8_t)c;

	472 *offsets++=sourceIndex;

	473 --targetCapacity;

	474

	475 sourceIndex=nextSourceIndex;

	476 continue;

	477 }

	478

	479 if(UTF_IS_LEAD(c)) {

	480 getTrail:

	481 if(source<sourceLimit) {

	482 /* test the following code unit */

	483 UChar trail=*source;

	484 if(UTF_IS_SECOND_SURROGATE(trail)) {

	485 ++source;

	486 ++nextSourceIndex;

	487 c=UTF16_GET_PAIR_VALUE(c, trail);

	488 }

	489 } else {

	490 /* no more input */

	491 c=-c; /* negative lead surrogate as "incomplete" indicator t o avoid c=0 everywhere else */

	492 break;

	493 }

	494 }

	495

	496 /*

	497 * all other Unicode code points c==U+0021..U+10ffff

	498 * are encoded with the difference c-prev

	499 *

	500 * a new prev is computed from c,

	501 * placed in the middle of a 0x80-block (for most small scripts) or

	502 * in the middle of the Unihan and Hangul blocks

	503 * to statistically minimize the following difference

	504 */

	505 diff=c-prev;

	506 prev=BOCU1_PREV(c);

	507 if(DIFF_IS_SINGLE(diff)) {

	508 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);

	509 *offsets++=sourceIndex;

	510 --targetCapacity;

	511 sourceIndex=nextSourceIndex;

	512 if(c<0x3000) {

	513 goto fastSingle;

	514 }

	515 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {

	516 /* optimize 2-byte case */

	517 int32_t m;

	518

	519 if(diff>=0) {

	520 diff-=BOCU1_REACH_POS_1+1;

	521 m=diff%BOCU1_TRAIL_COUNT;

	522 diff/=BOCU1_TRAIL_COUNT;

	523 diff+=BOCU1_START_POS_2;

	524 } else {

	525 diff-=BOCU1_REACH_NEG_1;

	526 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

	527 diff+=BOCU1_START_NEG_2;

	528 }

	529 *target++=(uint8_t)diff;

	530 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);

	531 *offsets++=sourceIndex;

	532 *offsets++=sourceIndex;

	533 targetCapacity-=2;

	534 sourceIndex=nextSourceIndex;

	535 } else {

	536 int32_t length; /* will be 2..4 */

	537

	538 diff=packDiff(diff);

	539 length=BOCU1_LENGTH_FROM_PACKED(diff);

	540

	541 /* write the output character bytes from diff and length */

	542 /* from the first if in the loop we know that targetCapacity>0 * /

	543 if(length<=targetCapacity) {

	544 switch(length) {

	545 /* each branch falls through to the next one */

	546 case 4:

	547 *target++=(uint8_t)(diff>>24);

	548 *offsets++=sourceIndex;

	549 case 3:

	550 *target++=(uint8_t)(diff>>16);

	551 *offsets++=sourceIndex;

	552 case 2:

	553 *target++=(uint8_t)(diff>>8);

	554 *offsets++=sourceIndex;

	555 /* case 1: handled above */

	556 *target++=(uint8_t)diff;

	557 *offsets++=sourceIndex;

	558 default:

	559 /* will never occur */

	560 break;

	561 }

	562 targetCapacity-=length;

	563 sourceIndex=nextSourceIndex;

	564 } else {

	565 uint8_t *charErrorBuffer;

	566

	567 /*

	568 * We actually do this backwards here:

	569 * In order to save an intermediate variable, we output

	570 * first to the overflow buffer what does not fit into the

	571 * regular target.

	572 */

	573 /* we know that 1<=targetCapacity<length<=4 */

	574 length-=targetCapacity;

	575 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;

	576 switch(length) {

	577 /* each branch falls through to the next one */

	578 case 3:

	579 *charErrorBuffer++=(uint8_t)(diff>>16);

	580 case 2:

	581 *charErrorBuffer++=(uint8_t)(diff>>8);

	582 case 1:

	583 *charErrorBuffer=(uint8_t)diff;

	584 default:

	585 /* will never occur */

	586 break;

	587 }

	588 cnv->charErrorBufferLength=(int8_t)length;

	589

	590 /* now output what fits into the regular target */

	591 diff>>=8length; / length was reduced by targetCapacity */

	592 switch(targetCapacity) {

	593 /* each branch falls through to the next one */

	594 case 3:

	595 *target++=(uint8_t)(diff>>16);

	596 *offsets++=sourceIndex;

	597 case 2:

	598 *target++=(uint8_t)(diff>>8);

	599 *offsets++=sourceIndex;

	600 case 1:

	601 *target++=(uint8_t)diff;

	602 *offsets++=sourceIndex;

	603 default:

	604 /* will never occur */

	605 break;

	606 }

	607

	608 /* target overflow */

	609 targetCapacity=0;

	610 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	611 break;

	612 }

	613 }

	614 } else {

	615 /* target is full */

	616 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	617 break;

	618 }

	619 }

	620

	621 /* set the converter state back into UConverter */

	622 cnv->fromUChar32= c<0 ? -c : 0;

	623 cnv->fromUnicodeStatus=(uint32_t)prev;

	624

	625 /* write back the updated pointers */

	626 pArgs->source=source;

	627 pArgs->target=(char *)target;

	628 pArgs->offsets=offsets;

	629 }

	630

	631 /*

	632 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.

	633 * If a change is made in the original function, then either

	634 * change this function the same way or

	635 * re-copy the original function and remove the variables

	636 * offsets, sourceIndex, and nextSourceIndex.

	637 */

	638 static void

	639 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,

	640 UErrorCode *pErrorCode) {

	641 UConverter *cnv;

	642 const UChar source, sourceLimit;

	643 uint8_t *target;

	644 int32_t targetCapacity;

	645

	646 int32_t prev, c, diff;

	647

	648 /* set up the local pointers */

	649 cnv=pArgs->converter;

	650 source=pArgs->source;

	651 sourceLimit=pArgs->sourceLimit;

	652 target=(uint8_t *)pArgs->target;

	653 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

	654

	655 /* get the converter state from UConverter */

	656 c=cnv->fromUChar32;

	657 prev=(int32_t)cnv->fromUnicodeStatus;

	658 if(prev==0) {

	659 prev=BOCU1_ASCII_PREV;

	660 }

	661

	662 /* conversion loop */

	663 if(c!=0 && targetCapacity>0) {

	664 goto getTrail;

	665 }

	666

	667 fastSingle:

	668 /* fast loop for single-byte differences */

	669 /* use only one loop counter variable, targetCapacity, not also source */

	670 diff=(int32_t)(sourceLimit-source);

	671 if(targetCapacity>diff) {

	672 targetCapacity=diff;

	673 }

	674 while(targetCapacity>0 && (c=*source)<0x3000) {

	675 if(c<=0x20) {

	676 if(c!=0x20) {

	677 prev=BOCU1_ASCII_PREV;

	678 }

	679 *target++=(uint8_t)c;

	680 } else {

	681 diff=c-prev;

	682 if(DIFF_IS_SINGLE(diff)) {

	683 prev=BOCU1_SIMPLE_PREV(c);

	684 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);

	685 } else {

	686 break;

	687 }

	688 }

	689 ++source;

	690 --targetCapacity;

	691 }

	692 /* restore real values */

	693 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);

	694

	695 /* regular loop for all cases */

	696 while(source<sourceLimit) {

	697 if(targetCapacity>0) {

	698 c=*source++;

	699

	700 if(c<=0x20) {

	701 /*

	702 * ISO C0 control & space:

	703 * Encode directly for MIME compatibility,

	704 * and reset state except for space, to not disrupt compression.

	705 */

	706 if(c!=0x20) {

	707 prev=BOCU1_ASCII_PREV;

	708 }

	709 *target++=(uint8_t)c;

	710 --targetCapacity;

	711 continue;

	712 }

	713

	714 if(UTF_IS_LEAD(c)) {

	715 getTrail:

	716 if(source<sourceLimit) {

	717 /* test the following code unit */

	718 UChar trail=*source;

	719 if(UTF_IS_SECOND_SURROGATE(trail)) {

	720 ++source;

	721 c=UTF16_GET_PAIR_VALUE(c, trail);

	722 }

	723 } else {

	724 /* no more input */

	725 c=-c; /* negative lead surrogate as "incomplete" indicator t o avoid c=0 everywhere else */

	726 break;

	727 }

	728 }

	729

	730 /*

	731 * all other Unicode code points c==U+0021..U+10ffff

	732 * are encoded with the difference c-prev

	733 *

	734 * a new prev is computed from c,

	735 * placed in the middle of a 0x80-block (for most small scripts) or

	736 * in the middle of the Unihan and Hangul blocks

	737 * to statistically minimize the following difference

	738 */

	739 diff=c-prev;

	740 prev=BOCU1_PREV(c);

	741 if(DIFF_IS_SINGLE(diff)) {

	742 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);

	743 --targetCapacity;

	744 if(c<0x3000) {

	745 goto fastSingle;

	746 }

	747 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {

	748 /* optimize 2-byte case */

	749 int32_t m;

	750

	751 if(diff>=0) {

	752 diff-=BOCU1_REACH_POS_1+1;

	753 m=diff%BOCU1_TRAIL_COUNT;

	754 diff/=BOCU1_TRAIL_COUNT;

	755 diff+=BOCU1_START_POS_2;

	756 } else {

	757 diff-=BOCU1_REACH_NEG_1;

	758 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

	759 diff+=BOCU1_START_NEG_2;

	760 }

	761 *target++=(uint8_t)diff;

	762 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);

	763 targetCapacity-=2;

	764 } else {

	765 int32_t length; /* will be 2..4 */

	766

	767 diff=packDiff(diff);

	768 length=BOCU1_LENGTH_FROM_PACKED(diff);

	769

	770 /* write the output character bytes from diff and length */

	771 /* from the first if in the loop we know that targetCapacity>0 * /

	772 if(length<=targetCapacity) {

	773 switch(length) {

	774 /* each branch falls through to the next one */

	775 case 4:

	776 *target++=(uint8_t)(diff>>24);

	777 case 3:

	778 *target++=(uint8_t)(diff>>16);

	779 /* case 2: handled above */

	780 *target++=(uint8_t)(diff>>8);

	781 /* case 1: handled above */

	782 *target++=(uint8_t)diff;

	783 default:

	784 /* will never occur */

	785 break;

	786 }

	787 targetCapacity-=length;

	788 } else {

	789 uint8_t *charErrorBuffer;

	790

	791 /*

	792 * We actually do this backwards here:

	793 * In order to save an intermediate variable, we output

	794 * first to the overflow buffer what does not fit into the

	795 * regular target.

	796 */

	797 /* we know that 1<=targetCapacity<length<=4 */

	798 length-=targetCapacity;

	799 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;

	800 switch(length) {

	801 /* each branch falls through to the next one */

	802 case 3:

	803 *charErrorBuffer++=(uint8_t)(diff>>16);

	804 case 2:

	805 *charErrorBuffer++=(uint8_t)(diff>>8);

	806 case 1:

	807 *charErrorBuffer=(uint8_t)diff;

	808 default:

	809 /* will never occur */

	810 break;

	811 }

	812 cnv->charErrorBufferLength=(int8_t)length;

	813

	814 /* now output what fits into the regular target */

	815 diff>>=8length; / length was reduced by targetCapacity */

	816 switch(targetCapacity) {

	817 /* each branch falls through to the next one */

	818 case 3:

	819 *target++=(uint8_t)(diff>>16);

	820 case 2:

	821 *target++=(uint8_t)(diff>>8);

	822 case 1:

	823 *target++=(uint8_t)diff;

	824 default:

	825 /* will never occur */

	826 break;

	827 }

	828

	829 /* target overflow */

	830 targetCapacity=0;

	831 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	832 break;

	833 }

	834 }

	835 } else {

	836 /* target is full */

	837 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	838 break;

	839 }

	840 }

	841

	842 /* set the converter state back into UConverter */

	843 cnv->fromUChar32= c<0 ? -c : 0;

	844 cnv->fromUnicodeStatus=(uint32_t)prev;

	845

	846 /* write back the updated pointers */

	847 pArgs->source=source;

	848 pArgs->target=(char *)target;

	849 }

	850

	851 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */

	852

	853 /**

	854 * Function for BOCU-1 decoder; handles multi-byte lead bytes.

	855 *

	856 * @param b lead byte;

	857 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEA D

	858 * @return (diff<<2)\|count

	859 */

	860 static U_INLINE int32_t

	861 decodeBocu1LeadByte(int32_t b) {

	862 int32_t diff, count;

	863

	864 if(b>=BOCU1_START_NEG_2) {

	865 /* positive difference */

	866 if(b<BOCU1_START_POS_3) {

	867 /* two bytes */

	868 diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_PO S_1+1;

	869 count=1;

	870 } else if(b<BOCU1_START_POS_4) {

	871 /* three bytes */

	872 diff=((int32_t)b-BOCU1_START_POS_3)BOCU1_TRAIL_COUNTBOCU1_TRAIL_CO UNT+BOCU1_REACH_POS_2+1;

	873 count=2;

	874 } else {

	875 /* four bytes */

	876 diff=BOCU1_REACH_POS_3+1;

	877 count=3;

	878 }

	879 } else {

	880 /* negative difference */

	881 if(b>=BOCU1_START_NEG_3) {

	882 /* two bytes */

	883 diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NE G_1;

	884 count=1;

	885 } else if(b>BOCU1_MIN) {

	886 /* three bytes */

	887 diff=((int32_t)b-BOCU1_START_NEG_3)BOCU1_TRAIL_COUNTBOCU1_TRAIL_CO UNT+BOCU1_REACH_NEG_2;

	888 count=2;

	889 } else {

	890 /* four bytes */

	891 diff=-BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT+BOCU1_RE ACH_NEG_3;

	892 count=3;

	893 }

	894 }

	895

	896 /* return the state for decoding the trail byte(s) */

	897 return (diff<<2)\|count;

	898 }

	899

	900 /**

	901 * Function for BOCU-1 decoder; handles multi-byte trail bytes.

	902 *

	903 * @param count number of remaining trail bytes including this one

	904 * @param b trail byte

	905 * @return new delta for diff including b - <0 indicates an error

	906 *

	907 * @see decodeBocu1

	908 */

	909 static U_INLINE int32_t

	910 decodeBocu1TrailByte(int32_t count, int32_t b) {

	911 if(b<=0x20) {

	912 /* skip some C0 controls and make the trail byte range contiguous */

	913 b=bocu1ByteToTrail[b];

	914 /* b<0 for an illegal trail byte value will result in return<0 below */

	915 #if BOCU1_MAX_TRAIL<0xff

	916 } else if(b>BOCU1_MAX_TRAIL) {

	917 return -99;

	918 #endif

	919 } else {

	920 b-=BOCU1_TRAIL_BYTE_OFFSET;

	921 }

	922

	923 /* add trail byte into difference and decrement count */

	924 if(count==1) {

	925 return b;

	926 } else if(count==2) {

	927 return b*BOCU1_TRAIL_COUNT;

	928 } else /* count==3 */ {

	929 return b(BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT);

	930 }

	931 }

	932

	933 static void

	934 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

	935 UErrorCode *pErrorCode) {

	936 UConverter *cnv;

	937 const uint8_t source, sourceLimit;

	938 UChar *target;

	939 const UChar *targetLimit;

	940 int32_t *offsets;

	941

	942 int32_t prev, count, diff, c;

	943

	944 int8_t byteIndex;

	945 uint8_t *bytes;

	946

	947 int32_t sourceIndex, nextSourceIndex;

	948

	949 /* set up the local pointers */

	950 cnv=pArgs->converter;

	951 source=(const uint8_t *)pArgs->source;

	952 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

	953 target=pArgs->target;

	954 targetLimit=pArgs->targetLimit;

	955 offsets=pArgs->offsets;

	956

	957 /* get the converter state from UConverter */

	958 prev=(int32_t)cnv->toUnicodeStatus;

	959 if(prev==0) {

	960 prev=BOCU1_ASCII_PREV;

	961 }

	962 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULeng th==0 */

	963 count=diff&3;

	964 diff>>=2;

	965

	966 byteIndex=cnv->toULength;

	967 bytes=cnv->toUBytes;

	968

	969 /* sourceIndex=-1 if the current character began in the previous buffer */

	970 sourceIndex=byteIndex==0 ? 0 : -1;

	971 nextSourceIndex=0;

	972

	973 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */

	974 if(count>0 && byteIndex>0 && target<targetLimit) {

	975 goto getTrail;

	976 }

	977

	978 fastSingle:

	979 /* fast loop for single-byte differences */

	980 /* use count as the only loop counter variable */

	981 diff=(int32_t)(sourceLimit-source);

	982 count=(int32_t)(pArgs->targetLimit-target);

	983 if(count>diff) {

	984 count=diff;

	985 }

	986 while(count>0) {

	987 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {

	988 c=prev+(c-BOCU1_MIDDLE);

	989 if(c<0x3000) {

	990 *target++=(UChar)c;

	991 *offsets++=nextSourceIndex++;

	992 prev=BOCU1_SIMPLE_PREV(c);

	993 } else {

	994 break;

	995 }

	996 } else if(c<=0x20) {

	997 if(c!=0x20) {

	998 prev=BOCU1_ASCII_PREV;

	999 }

	1000 *target++=(UChar)c;

	1001 *offsets++=nextSourceIndex++;

	1002 } else {

	1003 break;

	1004 }

	1005 ++source;

	1006 --count;

	1007 }

	1008 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter * /

	1009

	1010 /* decode a sequence of single and lead bytes */

	1011 while(source<sourceLimit) {

	1012 if(target>=targetLimit) {

	1013 /* target is full */

	1014 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	1015 break;

	1016 }

	1017

	1018 ++nextSourceIndex;

	1019 c=*source++;

	1020 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {

	1021 /* Write a code point directly from a single-byte difference. */

	1022 c=prev+(c-BOCU1_MIDDLE);

	1023 if(c<0x3000) {

	1024 *target++=(UChar)c;

	1025 *offsets++=sourceIndex;

	1026 prev=BOCU1_SIMPLE_PREV(c);

	1027 sourceIndex=nextSourceIndex;

	1028 goto fastSingle;

	1029 }

	1030 } else if(c<=0x20) {

	1031 /*

	1032 * Direct-encoded C0 control code or space.

	1033 * Reset prev for C0 control codes but not for space.

	1034 */

	1035 if(c!=0x20) {

	1036 prev=BOCU1_ASCII_PREV;

	1037 }

	1038 *target++=(UChar)c;

	1039 *offsets++=sourceIndex;

	1040 sourceIndex=nextSourceIndex;

	1041 continue;

	1042 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLi mit) {

	1043 /* Optimize two-byte case. */

	1044 if(c>=BOCU1_MIDDLE) {

	1045 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REAC H_POS_1+1;

	1046 } else {

	1047 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REAC H_NEG_1;

	1048 }

	1049

	1050 /* trail byte */

	1051 ++nextSourceIndex;

	1052 c=decodeBocu1TrailByte(1, *source++);

	1053 if(c<0 \|\| (uint32_t)(c=prev+diff+c)>0x10ffff) {

	1054 bytes[0]=source[-2];

	1055 bytes[1]=source[-1];

	1056 byteIndex=2;

	1057 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	1058 break;

	1059 }

	1060 } else if(c==BOCU1_RESET) {

	1061 /* only reset the state, no code point */

	1062 prev=BOCU1_ASCII_PREV;

	1063 sourceIndex=nextSourceIndex;

	1064 continue;

	1065 } else {

	1066 /*

	1067 * For multi-byte difference lead bytes, set the decoder state

	1068 * with the partial difference value from the lead byte and

	1069 * with the number of trail bytes.

	1070 */

	1071 bytes[0]=(uint8_t)c;

	1072 byteIndex=1;

	1073

	1074 diff=decodeBocu1LeadByte(c);

	1075 count=diff&3;

	1076 diff>>=2;

	1077 getTrail:

	1078 for(;;) {

	1079 if(source>=sourceLimit) {

	1080 goto endloop;

	1081 }

	1082 ++nextSourceIndex;

	1083 c=bytes[byteIndex++]=*source++;

	1084

	1085 /* trail byte in any position */

	1086 c=decodeBocu1TrailByte(count, c);

	1087 if(c<0) {

	1088 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	1089 goto endloop;

	1090 }

	1091

	1092 diff+=c;

	1093 if(--count==0) {

	1094 /* final trail byte, deliver a code point */

	1095 byteIndex=0;

	1096 c=prev+diff;

	1097 if((uint32_t)c>0x10ffff) {

	1098 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	1099 goto endloop;

	1100 }

	1101 break;

	1102 }

	1103 }

	1104 }

	1105

	1106 /* calculate the next prev and output c */

	1107 prev=BOCU1_PREV(c);

	1108 if(c<=0xffff) {

	1109 *target++=(UChar)c;

	1110 *offsets++=sourceIndex;

	1111 } else {

	1112 /* output surrogate pair */

	1113 *target++=UTF16_LEAD(c);

	1114 if(target<targetLimit) {

	1115 *target++=UTF16_TRAIL(c);

	1116 *offsets++=sourceIndex;

	1117 *offsets++=sourceIndex;

	1118 } else {

	1119 /* target overflow */

	1120 *offsets++=sourceIndex;

	1121 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);

	1122 cnv->UCharErrorBufferLength=1;

	1123 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	1124 break;

	1125 }

	1126 }

	1127 sourceIndex=nextSourceIndex;

	1128 }

	1129 endloop:

	1130

	1131 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {

	1132 /* set the converter state in UConverter to deal with the next character */

	1133 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;

	1134 cnv->mode=0;

	1135 } else {

	1136 /* set the converter state back into UConverter */

	1137 cnv->toUnicodeStatus=(uint32_t)prev;

	1138 cnv->mode=(diff<<2)\|count;

	1139 }

	1140 cnv->toULength=byteIndex;

	1141

	1142 /* write back the updated pointers */

	1143 pArgs->source=(const char *)source;

	1144 pArgs->target=target;

	1145 pArgs->offsets=offsets;

	1146 return;

	1147 }

	1148

	1149 /*

	1150 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.

	1151 * If a change is made in the original function, then either

	1152 * change this function the same way or

	1153 * re-copy the original function and remove the variables

	1154 * offsets, sourceIndex, and nextSourceIndex.

	1155 */

	1156 static void

	1157 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,

	1158 UErrorCode *pErrorCode) {

	1159 UConverter *cnv;

	1160 const uint8_t source, sourceLimit;

	1161 UChar *target;

	1162 const UChar *targetLimit;

	1163

	1164 int32_t prev, count, diff, c;

	1165

	1166 int8_t byteIndex;

	1167 uint8_t *bytes;

	1168

	1169 U_ALIGN_CODE(16)

	1170

	1171 /* set up the local pointers */

	1172 cnv=pArgs->converter;

	1173 source=(const uint8_t *)pArgs->source;

	1174 sourceLimit=(const uint8_t *)pArgs->sourceLimit;

	1175 target=pArgs->target;

	1176 targetLimit=pArgs->targetLimit;

	1177

	1178 /* get the converter state from UConverter */

	1179 prev=(int32_t)cnv->toUnicodeStatus;

	1180 if(prev==0) {

	1181 prev=BOCU1_ASCII_PREV;

	1182 }

	1183 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULeng th==0 */

	1184 count=diff&3;

	1185 diff>>=2;

	1186

	1187 byteIndex=cnv->toULength;

	1188 bytes=cnv->toUBytes;

	1189

	1190 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */

	1191 if(count>0 && byteIndex>0 && target<targetLimit) {

	1192 goto getTrail;

	1193 }

	1194

	1195 fastSingle:

	1196 /* fast loop for single-byte differences */

	1197 /* use count as the only loop counter variable */

	1198 diff=(int32_t)(sourceLimit-source);

	1199 count=(int32_t)(pArgs->targetLimit-target);

	1200 if(count>diff) {

	1201 count=diff;

	1202 }

	1203 while(count>0) {

	1204 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {

	1205 c=prev+(c-BOCU1_MIDDLE);

	1206 if(c<0x3000) {

	1207 *target++=(UChar)c;

	1208 prev=BOCU1_SIMPLE_PREV(c);

	1209 } else {

	1210 break;

	1211 }

	1212 } else if(c<=0x20) {

	1213 if(c!=0x20) {

	1214 prev=BOCU1_ASCII_PREV;

	1215 }

	1216 *target++=(UChar)c;

	1217 } else {

	1218 break;

	1219 }

	1220 ++source;

	1221 --count;

	1222 }

	1223

	1224 /* decode a sequence of single and lead bytes */

	1225 while(source<sourceLimit) {

	1226 if(target>=targetLimit) {

	1227 /* target is full */

	1228 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	1229 break;

	1230 }

	1231

	1232 c=*source++;

	1233 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {

	1234 /* Write a code point directly from a single-byte difference. */

	1235 c=prev+(c-BOCU1_MIDDLE);

	1236 if(c<0x3000) {

	1237 *target++=(UChar)c;

	1238 prev=BOCU1_SIMPLE_PREV(c);

	1239 goto fastSingle;

	1240 }

	1241 } else if(c<=0x20) {

	1242 /*

	1243 * Direct-encoded C0 control code or space.

	1244 * Reset prev for C0 control codes but not for space.

	1245 */

	1246 if(c!=0x20) {

	1247 prev=BOCU1_ASCII_PREV;

	1248 }

	1249 *target++=(UChar)c;

	1250 continue;

	1251 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLi mit) {

	1252 /* Optimize two-byte case. */

	1253 if(c>=BOCU1_MIDDLE) {

	1254 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REAC H_POS_1+1;

	1255 } else {

	1256 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REAC H_NEG_1;

	1257 }

	1258

	1259 /* trail byte */

	1260 c=decodeBocu1TrailByte(1, *source++);

	1261 if(c<0 \|\| (uint32_t)(c=prev+diff+c)>0x10ffff) {

	1262 bytes[0]=source[-2];

	1263 bytes[1]=source[-1];

	1264 byteIndex=2;

	1265 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	1266 break;

	1267 }

	1268 } else if(c==BOCU1_RESET) {

	1269 /* only reset the state, no code point */

	1270 prev=BOCU1_ASCII_PREV;

	1271 continue;

	1272 } else {

	1273 /*

	1274 * For multi-byte difference lead bytes, set the decoder state

	1275 * with the partial difference value from the lead byte and

	1276 * with the number of trail bytes.

	1277 */

	1278 bytes[0]=(uint8_t)c;

	1279 byteIndex=1;

	1280

	1281 diff=decodeBocu1LeadByte(c);

	1282 count=diff&3;

	1283 diff>>=2;

	1284 getTrail:

	1285 for(;;) {

	1286 if(source>=sourceLimit) {

	1287 goto endloop;

	1288 }

	1289 c=bytes[byteIndex++]=*source++;

	1290

	1291 /* trail byte in any position */

	1292 c=decodeBocu1TrailByte(count, c);

	1293 if(c<0) {

	1294 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	1295 goto endloop;

	1296 }

	1297

	1298 diff+=c;

	1299 if(--count==0) {

	1300 /* final trail byte, deliver a code point */

	1301 byteIndex=0;

	1302 c=prev+diff;

	1303 if((uint32_t)c>0x10ffff) {

	1304 *pErrorCode=U_ILLEGAL_CHAR_FOUND;

	1305 goto endloop;

	1306 }

	1307 break;

	1308 }

	1309 }

	1310 }

	1311

	1312 /* calculate the next prev and output c */

	1313 prev=BOCU1_PREV(c);

	1314 if(c<=0xffff) {

	1315 *target++=(UChar)c;

	1316 } else {

	1317 /* output surrogate pair */

	1318 *target++=UTF16_LEAD(c);

	1319 if(target<targetLimit) {

	1320 *target++=UTF16_TRAIL(c);

	1321 } else {

	1322 /* target overflow */

	1323 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);

	1324 cnv->UCharErrorBufferLength=1;

	1325 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

	1326 break;

	1327 }

	1328 }

	1329 }

	1330 endloop:

	1331

	1332 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {

	1333 /* set the converter state in UConverter to deal with the next character */

	1334 cnv->toUnicodeStatus=BOCU1_ASCII_PREV;

	1335 cnv->mode=0;

	1336 } else {

	1337 /* set the converter state back into UConverter */

	1338 cnv->toUnicodeStatus=(uint32_t)prev;

	1339 cnv->mode=(diff<<2)\|count;

	1340 }

	1341 cnv->toULength=byteIndex;

	1342

	1343 /* write back the updated pointers */

	1344 pArgs->source=(const char *)source;

	1345 pArgs->target=target;

	1346 return;

	1347 }

	1348

	1349 /* miscellaneous ------------------------------------------------------------ */

	1350

	1351 static const UConverterImpl _Bocu1Impl={

	1352 UCNV_BOCU1,

	1353

	1354 NULL,

	1355 NULL,

	1356

	1357 NULL,

	1358 NULL,

	1359 NULL,

	1360

	1361 _Bocu1ToUnicode,

	1362 _Bocu1ToUnicodeWithOffsets,

	1363 _Bocu1FromUnicode,

	1364 _Bocu1FromUnicodeWithOffsets,

	1365 NULL,

	1366

	1367 NULL,

	1368 NULL,

	1369 NULL,

	1370 NULL,

	1371 ucnv_getCompleteUnicodeSet

	1372 };

	1373

	1374 static const UConverterStaticData _Bocu1StaticData={

	1375 sizeof(UConverterStaticData),

	1376 "BOCU-1",

	1377 1214, /* CCSID for BOCU-1 */

	1378 UCNV_IBM, UCNV_BOCU1,

	1379 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */

	1380 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */

	1381 FALSE, FALSE,

	1382 0,

	1383 0,

	1384 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

	1385 };

	1386

	1387 const UConverterSharedData _Bocu1Data={

	1388 sizeof(UConverterSharedData), ~((uint32_t)0),

	1389 NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,

	1390 0

	1391 };

	1392

	1393 #endif

OLD	NEW

« no previous file with comments | « icu46/source/common/ucnv_u8.c ('k') | icu46/source/common/ucnvdisp.c » ('j') | no next file with comments »