icu46/source/test/cintltst/bocu1tst.c - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/test/cintltst/bocu1tst.c

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 ******************************************************************************

	3 *

	4 * Copyright (C) 2002-2010, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 ******************************************************************************

	8 * file name: bocu1tst.c

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created on: 2002may27

	14 * created by: Markus W. Scherer

	15 *

	16 * This is the reference implementation of BOCU-1,

	17 * the MIME-friendly form of the Binary Ordered Compression for Unicode,

	18 * taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/

	19 * The files bocu1.h and bocu1.c from the design folder are taken

	20 * verbatim (minus copyright and #include) and copied together into this file.

	21 * The reference code and some of the reference bocu1tst.c

	22 * is modified to run as part of the ICU cintltst

	23 * test framework (minus main(), log_ln() etc. instead of printf()).

	24 *

	25 * This reference implementation is used here to verify

	26 * the ICU BOCU-1 implementation, which is

	27 * adapted for ICU conversion APIs and optimized.

	28 * ### links in design doc to here and to ucnvbocu.c

	29 */

	30

	31 #include "unicode/utypes.h"

	32 #include "unicode/ustring.h"

	33 #include "unicode/ucnv.h"

	34 #include "cmemory.h"

	35 #include "cintltst.h"

	36

	/* Number of elements of a true array; not valid for pointers or array parameters. */
	#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

	38

	39 /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */

	40

	41 /* BOCU-1 constants and macros ---------------------------------------------- */

	42

	43 /*

	44 * BOCU-1 encodes the code points of a Unicode string as

	45 * a sequence of byte-encoded differences (slope detection),

	46 * preserving lexical order.

	47 *

	48 * Optimize the difference-taking for runs of Unicode text within

	49 * small scripts:

	50 *

	51 * Most small scripts are allocated within aligned 128-blocks of Unicode

	52 * code points. Lexical order is preserved if the "previous code point" state

	53 * is always moved into the middle of such a block.

	54 *

	55 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul

	56 * areas into the middle of those areas.

	57 *

	58 * C0 control codes and space are encoded with their US-ASCII bytes.

	59 * "prev" is reset for C0 controls but not for space.

	60 */

	61

	/* initial value for "prev": middle of the ASCII range */
	#define BOCU1_ASCII_PREV        0x40

	/* bounding byte values for differences */
	#define BOCU1_MIN               0x21
	#define BOCU1_MIDDLE            0x90
	#define BOCU1_MAX_LEAD          0xfe

	/* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */
	#define BOCU1_MAX_TRAIL         0xffL
	#define BOCU1_RESET             0xff

	/* number of lead bytes */
	#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

	/* adjust trail byte counts for the use of some C0 control byte values */
	#define BOCU1_TRAIL_CONTROLS_COUNT  20
	#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

	/* number of trail bytes */
	#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

	/*
	 * number of positive and negative single-byte codes
	 * (counting 0==BOCU1_MIDDLE among the positive ones)
	 */
	#define BOCU1_SINGLE            64

	/* number of lead bytes for positive and negative 2/3/4-byte sequences */
	#define BOCU1_LEAD_2            43
	#define BOCU1_LEAD_3            3
	#define BOCU1_LEAD_4            1

	/* The difference value range for single-byters. */
	#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
	#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

	/* The difference value range for double-byters. */
	#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
	#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

	/* The difference value range for 3-byters (two trail bytes, hence TRAIL_COUNT squared). */
	#define BOCU1_REACH_POS_3   \
	    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

	#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

	/* The lead byte start values. */
	#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
	#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
	#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
	     /* ==BOCU1_MAX_LEAD */

	#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
	#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
	#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
	     /* ==BOCU1_MIN+1 */

	/*
	 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
	 * Note: the argument is evaluated several times.
	 */
	#define BOCU1_LENGTH_FROM_LEAD(lead) \
	    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
	     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
	     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

	/*
	 * The length of a byte sequence, according to its packed form:
	 * values below 0x04000000 carry the length (1..3) in the top byte,
	 * everything else is a four-byte sequence whose top byte is the lead byte.
	 */
	#define BOCU1_LENGTH_FROM_PACKED(packed) \
	    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)

	129

	130 /*

	131 * 12 commonly used C0 control codes (and space) are only used to encode

	132 * themselves directly,

	133 * which makes BOCU-1 MIME-usable and reasonably safe for

	134 * ASCII-oriented software.

	135 *

	136 * These controls are

	137 * 0 NUL

	138 *

	139 * 7 BEL

	140 * 8 BS

	141 *

	142 * 9 TAB

	143 * a LF

	144 * b VT

	145 * c FF

	146 * d CR

	147 *

	148 * e SO

	149 * f SI

	150 *

	151 * 1a SUB

	152 * 1b ESC

	153 *

	154 * The other 20 C0 controls are also encoded directly (to preserve order)

	155 * but are also used as trail bytes in difference encoding

	156 * (for better compression).

	157 */

	/*
	 * Map a trail value t from the difference calculation to its external byte:
	 * values below BOCU1_TRAIL_CONTROLS_COUNT are looked up in bocu1TrailToByte[],
	 * all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
	 * Note: t is evaluated more than once.
	 */
	#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])

	159

	160 /*

	161 * Byte value map for control codes,

	162 * from external byte values 0x00..0x20

	163 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.

	164 * External byte values that are illegal as trail bytes are mapped to -1.

	165 */

	166 static const int8_t

	167 bocu1ByteToTrail[BOCU1_MIN]={

	168 /* 0 1 2 3 4 5 6 7 */

	169 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,

	170

	171 /* 8 9 a b c d e f */

	172 -1, -1, -1, -1, -1, -1, -1, -1,

	173

	174 /* 10 11 12 13 14 15 16 17 */

	175 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,

	176

	177 /* 18 19 1a 1b 1c 1d 1e 1f */

	178 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,

	179

	180 /* 20 */

	181 -1

	182 };

	183

	184 /*

	185 * Byte value map for control codes,

	186 * from trail byte values 0..19 (0..0x13) as used in the difference calculation

	187 * to external byte values 0x00..0x20.

	188 */

	189 static const int8_t

	190 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={

	191 /* 0 1 2 3 4 5 6 7 */

	192 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,

	193

	194 /* 8 9 a b c d e f */

	195 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,

	196

	197 /* 10 11 12 13 */

	198 0x1c, 0x1d, 0x1e, 0x1f

	199 };

	200

	/**
	 * Integer division and modulo with negative numerators
	 * yields negative modulo results and quotients that are one more than
	 * what we need here.
	 * This macro adjusts the results so that the modulo-value m is always >=0.
	 *
	 * For positive n, the if() condition is always FALSE.
	 *
	 * The body is wrapped in do { } while(0) so that the macro behaves like a
	 * single statement, including in an unbraced if/else.
	 * n and m are evaluated several times and must be modifiable lvalues.
	 *
	 * @param n Number to be split into quotient and rest.
	 *          Will be modified to contain the quotient.
	 * @param d Divisor.
	 * @param m Output variable for the rest (modulo result).
	 */
	#define NEGDIVMOD(n, d, m) do { \
	    (m)=(n)%(d); \
	    (n)/=(d); \
	    if((m)<0) { \
	        --(n); \
	        (m)+=(d); \
	    } \
	} while(0)

	222

	/*
	 * State for BOCU-1 decoder function.
	 *   prev  - "previous code point" state value used for differencing
	 *   count - number of trail bytes still expected (0 when at a lead byte)
	 *   diff  - partial difference accumulated from the bytes seen so far
	 * The member order matters: the file initializes this struct positionally.
	 */
	struct Bocu1Rx {
	    int32_t prev, count, diff;
	};

	typedef struct Bocu1Rx Bocu1Rx;

	229

	230 /* Function prototypes ------------------------------------------------------ */

	231

	232 /* see bocu1.c */

	233 U_CFUNC int32_t

	234 packDiff(int32_t diff);

	235

	236 U_CFUNC int32_t

	237 encodeBocu1(int32_t *pPrev, int32_t c);

	238

	239 U_CFUNC int32_t

	240 decodeBocu1(Bocu1Rx *pRx, uint8_t b);

	241

	242 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */

	243

	244 /* BOCU-1 implementation functions ------------------------------------------ */

	245

	246 /**

	247 * Compute the next "previous" value for differencing

	248 * from the current code point.

	249 *

	250 * @param c current code point, 0..0x10ffff

	251 * @return "previous code point" state value

	252 */

	253 static U_INLINE int32_t

	254 bocu1Prev(int32_t c) {

	255 /* compute new prev */

	256 if(0x3040<=c && c<=0x309f) {

	257 /* Hiragana is not 128-aligned */

	258 return 0x3070;

	259 } else if(0x4e00<=c && c<=0x9fa5) {

	260 /* CJK Unihan */

	261 return 0x4e00-BOCU1_REACH_NEG_2;

	262 } else if(0xac00<=c && c<=0xd7a3) {

	263 /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compiler s) */

	264 return ((int32_t)0xd7a3+(int32_t)0xac00)/2;

	265 } else {

	266 /* mostly small scripts */

	267 return (c&~0x7f)+BOCU1_ASCII_PREV;

	268 }

	269 }

	270

	271 /**

	272 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes

	273 * and return a packed integer with them.

	274 *

	275 * The encoding favors small absolut differences with short encodings

	276 * to compress runs of same-script characters.

	277 *

	278 * @param diff difference value -0x10ffff..0x10ffff

	279 * @return

	280 * 0x010000zz for 1-byte sequence zz

	281 * 0x0200yyzz for 2-byte sequence yy zz

	282 * 0x03xxyyzz for 3-byte sequence xx yy zz

	283 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)

	284 */

	285 U_CFUNC int32_t

	286 packDiff(int32_t diff) {

	287 int32_t result, m, lead, count, shift;

	288

	289 if(diff>=BOCU1_REACH_NEG_1) {

	290 /* mostly positive differences, and single-byte negative ones */

	291 if(diff<=BOCU1_REACH_POS_1) {

	292 /* single byte */

	293 return 0x01000000\|(BOCU1_MIDDLE+diff);

	294 } else if(diff<=BOCU1_REACH_POS_2) {

	295 /* two bytes */

	296 diff-=BOCU1_REACH_POS_1+1;

	297 lead=BOCU1_START_POS_2;

	298 count=1;

	299 } else if(diff<=BOCU1_REACH_POS_3) {

	300 /* three bytes */

	301 diff-=BOCU1_REACH_POS_2+1;

	302 lead=BOCU1_START_POS_3;

	303 count=2;

	304 } else {

	305 /* four bytes */

	306 diff-=BOCU1_REACH_POS_3+1;

	307 lead=BOCU1_START_POS_4;

	308 count=3;

	309 }

	310 } else {

	311 /* two- and four-byte negative differences */

	312 if(diff>=BOCU1_REACH_NEG_2) {

	313 /* two bytes */

	314 diff-=BOCU1_REACH_NEG_1;

	315 lead=BOCU1_START_NEG_2;

	316 count=1;

	317 } else if(diff>=BOCU1_REACH_NEG_3) {

	318 /* three bytes */

	319 diff-=BOCU1_REACH_NEG_2;

	320 lead=BOCU1_START_NEG_3;

	321 count=2;

	322 } else {

	323 /* four bytes */

	324 diff-=BOCU1_REACH_NEG_3;

	325 lead=BOCU1_START_NEG_4;

	326 count=3;

	327 }

	328 }

	329

	330 /* encode the length of the packed result */

	331 if(count<3) {

	332 result=(count+1)<<24;

	333 } else /* count==3, MSB used for the lead byte */ {

	334 result=0;

	335 }

	336

	337 /* calculate trail bytes like digits in itoa() */

	338 shift=0;

	339 do {

	340 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

	341 result\|=BOCU1_TRAIL_TO_BYTE(m)<<shift;

	342 shift+=8;

	343 } while(--count>0);

	344

	345 /* add lead byte */

	346 result\|=(lead+diff)<<shift;

	347

	348 return result;

	349 }

	350

	351 /**

	352 * BOCU-1 encoder function.

	353 *

	354 * @param pPrev pointer to the integer that holds

	355 * the "previous code point" state;

	356 * the initial value should be 0 which

	357 * encodeBocu1 will set to the actual BOCU-1 initial state value

	358 * @param c the code point to encode

	359 * @return the packed 1/2/3/4-byte encoding, see packDiff(),

	360 * or 0 if an error occurs

	361 *

	362 * @see packDiff

	363 */

	364 U_CFUNC int32_t

	365 encodeBocu1(int32_t *pPrev, int32_t c) {

	366 int32_t prev;

	367

	368 if(pPrev==NULL \|\| c<0 \|\| c>0x10ffff) {

	369 /* illegal argument */

	370 return 0;

	371 }

	372

	373 prev=*pPrev;

	374 if(prev==0) {

	375 /* lenient handling of initial value 0 */

	376 prev=*pPrev=BOCU1_ASCII_PREV;

	377 }

	378

	379 if(c<=0x20) {

	380 /*

	381 * ISO C0 control & space:

	382 * Encode directly for MIME compatibility,

	383 * and reset state except for space, to not disrupt compression.

	384 */

	385 if(c!=0x20) {

	386 *pPrev=BOCU1_ASCII_PREV;

	387 }

	388 return 0x01000000\|c;

	389 }

	390

	391 /*

	392 * all other Unicode code points c==U+0021..U+10ffff

	393 * are encoded with the difference c-prev

	394 *

	395 * a new prev is computed from c,

	396 * placed in the middle of a 0x80-block (for most small scripts) or

	397 * in the middle of the Unihan and Hangul blocks

	398 * to statistically minimize the following difference

	399 */

	400 *pPrev=bocu1Prev(c);

	401 return packDiff(c-prev);

	402 }

	403

	404 /**

	405 * Function for BOCU-1 decoder; handles multi-byte lead bytes.

	406 *

	407 * @param pRx pointer to the decoder state structure

	408 * @param b lead byte;

	409 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LE AD

	410 * @return -1 (state change only)

	411 *

	412 * @see decodeBocu1

	413 */

	414 static int32_t

	415 decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {

	416 int32_t c, count;

	417

	418 if(b>=BOCU1_START_NEG_2) {

	419 /* positive difference */

	420 if(b<BOCU1_START_POS_3) {

	421 /* two bytes */

	422 c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1 +1;

	423 count=1;

	424 } else if(b<BOCU1_START_POS_4) {

	425 /* three bytes */

	426 c=((int32_t)b-BOCU1_START_POS_3)BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT +BOCU1_REACH_POS_2+1;

	427 count=2;

	428 } else {

	429 /* four bytes */

	430 c=BOCU1_REACH_POS_3+1;

	431 count=3;

	432 }

	433 } else {

	434 /* negative difference */

	435 if(b>=BOCU1_START_NEG_3) {

	436 /* two bytes */

	437 c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1 ;

	438 count=1;

	439 } else if(b>BOCU1_MIN) {

	440 /* three bytes */

	441 c=((int32_t)b-BOCU1_START_NEG_3)BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT +BOCU1_REACH_NEG_2;

	442 count=2;

	443 } else {

	444 /* four bytes */

	445 c=-BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT+BOCU1_REACH _NEG_3;

	446 count=3;

	447 }

	448 }

	449

	450 /* set the state for decoding the trail byte(s) */

	451 pRx->diff=c;

	452 pRx->count=count;

	453 return -1;

	454 }

	455

	456 /**

	457 * Function for BOCU-1 decoder; handles multi-byte trail bytes.

	458 *

	459 * @param pRx pointer to the decoder state structure

	460 * @param b trail byte

	461 * @return result value, same as decodeBocu1

	462 *

	463 * @see decodeBocu1

	464 */

	465 static int32_t

	466 decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {

	467 int32_t t, c, count;

	468

	469 if(b<=0x20) {

	470 /* skip some C0 controls and make the trail byte range contiguous */

	471 t=bocu1ByteToTrail[b];

	472 if(t<0) {

	473 /* illegal trail byte value */

	474 pRx->prev=BOCU1_ASCII_PREV;

	475 pRx->count=0;

	476 return -99;

	477 }

	478 #if BOCU1_MAX_TRAIL<0xff

	479 } else if(b>BOCU1_MAX_TRAIL) {

	480 return -99;

	481 #endif

	482 } else {

	483 t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;

	484 }

	485

	486 /* add trail byte into difference and decrement count */

	487 c=pRx->diff;

	488 count=pRx->count;

	489

	490 if(count==1) {

	491 /* final trail byte, deliver a code point */

	492 c=pRx->prev+c+t;

	493 if(0<=c && c<=0x10ffff) {

	494 /* valid code point result */

	495 pRx->prev=bocu1Prev(c);

	496 pRx->count=0;

	497 return c;

	498 } else {

	499 /* illegal code point result */

	500 pRx->prev=BOCU1_ASCII_PREV;

	501 pRx->count=0;

	502 return -99;

	503 }

	504 }

	505

	506 /* intermediate trail byte */

	507 if(count==2) {

	508 pRx->diff=c+t*BOCU1_TRAIL_COUNT;

	509 } else /* count==3 */ {

	510 pRx->diff=c+tBOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT;

	511 }

	512 pRx->count=count-1;

	513 return -1;

	514 }

	515

	516 /**

	517 * BOCU-1 decoder function.

	518 *

	519 * @param pRx pointer to the decoder state structure;

	520 * the initial values should be 0 which

	521 * decodeBocu1 will set to actual initial state values

	522 * @param b an input byte

	523 * @return

	524 * 0..0x10ffff for a result code point

	525 * -1 if only the state changed without code point output

	526 * <-1 if an error occurs

	527 */

	528 U_CFUNC int32_t

	529 decodeBocu1(Bocu1Rx *pRx, uint8_t b) {

	530 int32_t prev, c, count;

	531

	532 if(pRx==NULL) {

	533 /* illegal argument */

	534 return -99;

	535 }

	536

	537 prev=pRx->prev;

	538 if(prev==0) {

	539 /* lenient handling of initial 0 values */

	540 prev=pRx->prev=BOCU1_ASCII_PREV;

	541 count=pRx->count=0;

	542 } else {

	543 count=pRx->count;

	544 }

	545

	546 if(count==0) {

	547 /* byte in lead position */

	548 if(b<=0x20) {

	549 /*

	550 * Direct-encoded C0 control code or space.

	551 * Reset prev for C0 control codes but not for space.

	552 */

	553 if(b!=0x20) {

	554 pRx->prev=BOCU1_ASCII_PREV;

	555 }

	556 return b;

	557 }

	558

	559 /*

	560 * b is a difference lead byte.

	561 *

	562 * Return a code point directly from a single-byte difference.

	563 *

	564 * For multi-byte difference lead bytes, set the decoder state

	565 * with the partial difference value from the lead byte and

	566 * with the number of trail bytes.

	567 *

	568 * For four-byte differences, the signedness also affects the

	569 * first trail byte, which has special handling farther below.

	570 */

	571 if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {

	572 /* single-byte difference */

	573 c=prev+((int32_t)b-BOCU1_MIDDLE);

	574 pRx->prev=bocu1Prev(c);

	575 return c;

	576 } else if(b==BOCU1_RESET) {

	577 /* only reset the state, no code point */

	578 pRx->prev=BOCU1_ASCII_PREV;

	579 return -1;

	580 } else {

	581 return decodeBocu1LeadByte(pRx, b);

	582 }

	583 } else {

	584 /* trail byte in any position */

	585 return decodeBocu1TrailByte(pRx, b);

	586 }

	587 }

	588

	589 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */

	590

	591 /* test code ---------------------------------------------------------------- */

	592

	593 /* test code options */

	594

	/*
	 * ignore comma when processing name lists in testText()
	 * NOTE(review): testText() is not defined in this file as shown and the
	 * macro looks unused here -- presumably left over from the standalone
	 * reference test; confirm before removing.
	 */
	#define TEST_IGNORE_COMMA       1

	597

	/**
	 * Write a packed BOCU-1 byte sequence into a byte array,
	 * without overflow check.
	 * Test function.
	 *
	 * The bytes are written most significant first; the switch cases fall
	 * through on purpose so that a sequence of length n writes its n low bytes.
	 *
	 * @param packed packed BOCU-1 byte sequence, see packDiff()
	 * @param p pointer to byte array
	 * @return number of bytes
	 *
	 * @see packDiff
	 */
	static int32_t
	writePacked(int32_t packed, uint8_t *p) {
	    int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);
	    switch(count) {
	    case 4:
	        *p++=(uint8_t)(packed>>24);
	        /* fall through */
	    case 3:
	        *p++=(uint8_t)(packed>>16);
	        /* fall through */
	    case 2:
	        *p++=(uint8_t)(packed>>8);
	        /* fall through */
	    case 1:
	        *p++=(uint8_t)packed;
	        /* fall through */
	    default:
	        break;
	    }

	    return count;
	}

	627

	628 /**

	629 * Unpack a packed BOCU-1 non-C0/space byte sequence and get

	630 * the difference to initialPrev.

	631 * Used only for round-trip testing of the difference encoding and decoding.

	632 * Test function.

	633 *

	634 * @param initialPrev bogus "previous code point" value to make sure that

	635 * the resulting code point is in the range 0..0x10ffff

	636 * @param packed packed BOCU-1 byte sequence

	637 * @return the difference to initialPrev

	638 *

	639 * @see packDiff

	640 * @see writeDiff

	641 */

	642 static int32_t

	643 unpackDiff(int32_t initialPrev, int32_t packed) {

	644 Bocu1Rx rx={ 0, 0, 0 };

	645 int32_t count;

	646

	647 rx.prev=initialPrev;

	648 count=BOCU1_LENGTH_FROM_PACKED(packed);

	649 switch(count) {

	650 case 4:

	651 decodeBocu1(&rx, (uint8_t)(packed>>24));

	652 case 3:

	653 decodeBocu1(&rx, (uint8_t)(packed>>16));

	654 case 2:

	655 decodeBocu1(&rx, (uint8_t)(packed>>8));

	656 case 1:

	657 /* subtract initial prev */

	658 return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;

	659 default:

	660 return -0x7fffffff;

	661 }

	662 }

	663

	/**
	 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
	 * preserving lexical order.
	 * Also checks for roundtripping of the difference encoding.
	 * Test function.
	 *
	 * @param diff difference value to test, -0x10ffff..0x10ffff
	 * @param p pointer to output byte array
	 * @return p advanced by number of bytes output
	 *
	 * @see unpackDiff
	 */
	static uint8_t *
	writeDiff(int32_t diff, uint8_t *p) {
	    /* generate the difference as a packed value and serialize it */
	    int32_t packed, initialPrev;

	    packed=packDiff(diff);

	    /*
	     * bogus initial "prev" to work around
	     * code point range check in decodeBocu1()
	     */
	    if(diff<=0) {
	        initialPrev=0x10ffff;
	    } else {
	        initialPrev=-1;
	    }

	    if(diff!=unpackDiff(initialPrev, packed)) {
	        /* cast to the types that %ld/%lx expect; int32_t need not be long */
	        log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
	                (long)diff, (unsigned long)(uint32_t)packed,
	                (long)unpackDiff(initialPrev, packed));
	    }
	    return p+writePacked(packed, p);
	}

	699

	700 /**

	701 * Encode a UTF-16 string in BOCU-1.

	702 * Does not check for overflows, but otherwise useful function.

	703 *

	704 * @param s input UTF-16 string

	705 * @param length number of UChar code units in s

	706 * @param p pointer to output byte array

	707 * @return number of bytes output

	708 */

	709 static int32_t

	710 writeString(const UChar s, int32_t length, uint8_t p) {

	711 uint8_t *p0;

	712 int32_t c, prev, i;

	713

	714 prev=0;

	715 p0=p;

	716 i=0;

	717 while(i<length) {

	718 UTF_NEXT_CHAR(s, i, length, c);

	719 p+=writePacked(encodeBocu1(&prev, c), p);

	720 }

	721 return (int32_t)(p-p0);

	722 }

	723

	724 /**

	725 * Decode a BOCU-1 byte sequence to a UTF-16 string.

	726 * Does not check for overflows, but otherwise useful function.

	727 *

	728 * @param p pointer to input BOCU-1 bytes

	729 * @param length number of input bytes

	730 * @param s point to output UTF-16 string array

	731 * @return number of UChar code units output

	732 */

	733 static int32_t

	734 readString(const uint8_t p, int32_t length, UChar s) {

	735 Bocu1Rx rx={ 0, 0, 0 };

	736 int32_t c, i, sLength;

	737

	738 i=sLength=0;

	739 while(i<length) {

	740 c=decodeBocu1(&rx, p[i++]);

	741 if(c<-1) {

	742 log_err("error: readString detects encoding error at string index %l d\n", i);

	743 return -1;

	744 }

	745 if(c>=0) {

	746 UTF_APPEND_CHAR_UNSAFE(s, sLength, c);

	747 }

	748 }

	749 return sLength;

	750 }

	751

	752 static U_INLINE char

	753 hexDigit(uint8_t digit) {

	754 return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);

	755 }

	756

	/**
	 * Pretty-print 0-terminated byte values.
	 * Helper function for test output.
	 *
	 * Each byte is written as a space and two hex digits; output for fewer
	 * than 5 bytes is padded with spaces to the width of 5 bytes.
	 * No overflow check: out needs room for 3 characters per byte
	 * (at least 15) plus the terminating NUL.
	 *
	 * @param bytes 0-terminated byte array to print
	 * @param out output buffer for the 0-terminated text
	 */
	static void
	printBytes(uint8_t *bytes, char *out) {
	    int i;
	    uint8_t b;

	    i=0;
	    while((b=*bytes++)!=0) {
	        *out++=' ';
	        *out++=hexDigit((uint8_t)(b>>4));
	        *out++=hexDigit((uint8_t)(b&0xf));
	        ++i;
	    }
	    /* pad to the width of 5 bytes; no padding if there were 5 or more */
	    i=3*(5-i);
	    while(i>0) {
	        *out++=' ';
	        --i;
	    }
	    *out=0;
	}

	782

	783 /**

	784 * Basic BOCU-1 test function, called when there are no command line arguments.

	785 * Prints some of the #define values and performs round-trip tests of the

	786 * difference encoding and decoding.

	787 */

	788 static void

	789 TestBOCU1RefDiff(void) {

	790 char buf1[80], buf2[80];

	791 uint8_t prev[5], level[5];

	792 int32_t i, cmp, countErrors;

	793

	794 log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_ NEG_1);

	795 log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_ NEG_2);

	796 log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REAC H_NEG_3);

	797

	798 log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_ REACH_NEG_1, BOCU1_REACH_POS_1);

	799 log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_ REACH_NEG_2, BOCU1_REACH_POS_2);

	800 log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU 1_REACH_NEG_3, BOCU1_REACH_POS_3);

	801

	802 log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE);

	803 log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BO CU1_START_NEG_2, BOCU1_START_POS_2);

	804 log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);

	805

	806 /* test packDiff() & unpackDiff() with some specific values */

	807 writeDiff(0, level);

	808 writeDiff(1, level);

	809 writeDiff(65, level);

	810 writeDiff(130, level);

	811 writeDiff(30000, level);

	812 writeDiff(1000000, level);

	813 writeDiff(-65, level);

	814 writeDiff(-130, level);

	815 writeDiff(-30000, level);

	816 writeDiff(-1000000, level);

	817

	818 /* test that each value is smaller than any following one */

	819 countErrors=0;

	820 i=-0x10ffff;

	821 *writeDiff(i, prev)=0;

	822

	823 /* show first number and bytes */

	824 printBytes(prev, buf1);

	825 log_verbose(" wD(%8ld) %s\n", i, buf1);

	826

	827 for(++i; i<=0x10ffff; ++i) {

	828 *writeDiff(i, level)=0;

	829 cmp=strcmp((const char )prev, (const char )level);

	830 if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level )) {

	831 log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDif f(%ld))\n",

	832 level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const cha r *)level), i);

	833 }

	834 if(cmp<0) {

	835 if(i==0 \|\| i==1 \|\| strlen((const char )prev)!=strlen((const char ) level)) {

	836 /*

	837 * if the result is good, then print only if the length changed

	838 * to get little but interesting output

	839 */

	840 printBytes(prev, buf1);

	841 printBytes(level, buf2);

	842 log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1 , i, cmp, buf1, buf2);

	843 }

	844 } else {

	845 ++countErrors;

	846 printBytes(prev, buf1);

	847 printBytes(level, buf2);

	848 log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2);

	849 }

	850 /* remember the previous bytes */

	851 memcpy(prev, level, 4);

	852 }

	853

	854 /* show last number and bytes */

	855 printBytes((uint8_t *)"", buf1);

	856 printBytes(prev, buf2);

	857 log_verbose(" wD(%8ld) %s%s\n", i-1, buf1, b uf2);

	858

	859 if(countErrors==0) {

	860 log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");

	861 } else {

	862 log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);

	863 }

	864

	865 /* output signature byte sequence */

	866 i=0;

	867 writePacked(encodeBocu1(&i, 0xfeff), level);

	868 log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",

	869 level[0], level[1], level[2]);

	870 }

	871

	872 /* cintltst code ------------------------------------------------------------ */

	873

	/* Capacity, in array elements, of each heap buffer allocated by the tests below. */
	static const int32_t DEFAULT_BUFFER_SIZE = 30000;

	875

	876

	877 /* test one string with the ICU and the reference BOCU-1 implementations */

	878 static void

	879 roundtripBOCU1(UConverter bocu1, int32_t number, const UChar text, int32_t len gth) {

	880 UChar roundtripRef, roundtripICU;

	881 char bocu1Ref, bocu1ICU;

	882

	883 int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULeng th;

	884 UErrorCode errorCode;

	885

	886 roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));

	887 roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));

	888 bocu1Ref = malloc(DEFAULT_BUFFER_SIZE);

	889 bocu1ICU = malloc(DEFAULT_BUFFER_SIZE);

	890

	891 /* Unicode -> BOCU-1 */

	892 bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);

	893

	894 errorCode=U_ZERO_ERROR;

	895 bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, l ength, &errorCode);

	896 if(U_FAILURE(errorCode)) {

	897 log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, le ngth, u_errorName(errorCode));

	898 return;

	899 }

	900

	901 if(bocu1RefLength!=bocu1ICULength \|\| 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu 1RefLength)) {

	902 log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, l ength, bocu1RefLength, bocu1ICULength);

	903 return;

	904 }

	905

	906 /* BOCU-1 -> Unicode */

	907 roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtrip Ref);

	908 if(roundtripRefLength<0) {

	909 free(roundtripICU);

	910 return; /* readString() found an error and reported it */

	911 }

	912

	913 roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, b ocu1ICU, bocu1ICULength, &errorCode);

	914 if(U_FAILURE(errorCode)) {

	915 log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, leng th, u_errorName(errorCode));

	916 return;

	917 }

	918

	919 if(length!=roundtripRefLength \|\| 0!=u_memcmp(text, roundtripRef, length)) {

	920 log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);

	921 return;

	922 }

	923 if(roundtripRefLength!=roundtripICULength \|\| 0!=u_memcmp(roundtripRef, round tripICU, roundtripRefLength)) {

	924 log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, round tripRefLength, roundtripICULength);

	925 return;

	926 }

	927 free(roundtripRef);

	928 free(roundtripICU);

	929 free(bocu1Ref);

	930 free(bocu1ICU);

	931 }

	932

	933 static const UChar feff[]={ 0xfeff };

	934 static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };

	935 static const UChar crlf[]={ 0xd, 0xa, 0x20 };

	936 static const UChar nul[]={ 0 };

	937 static const UChar latin[]={ 0xdf, 0xe6 };

	938 static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };

	939 static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };

	940 static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };

	941 static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };

	942 static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatc hed! */

	943 static const UChar plane1[]={ 0xd800, 0xdc00 };

	944 static const UChar plane2[]={ 0xd845, 0xdddd };

	945 static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };

	946 static const UChar plane16[]={ 0xdbff, 0xdfff };

	947 static const UChar c0[]={ 1, 0xe40, 0x20, 9 };

	948

	949 static const struct {

	950 const UChar *s;

	951 int32_t length;

	952 } strings[]={

	953 { feff, LENGTHOF(feff) },

	954 { ascii, LENGTHOF(ascii) },

	955 { crlf, LENGTHOF(crlf) },

	956 { nul, LENGTHOF(nul) },

	957 { latin, LENGTHOF(latin) },

	958 { devanagari, LENGTHOF(devanagari) },

	959 { hiragana, LENGTHOF(hiragana) },

	960 { unihan, LENGTHOF(unihan) },

	961 { hangul, LENGTHOF(hangul) },

	962 { surrogates, LENGTHOF(surrogates) },

	963 { plane1, LENGTHOF(plane1) },

	964 { plane2, LENGTHOF(plane2) },

	965 { plane15, LENGTHOF(plane15) },

	966 { plane16, LENGTHOF(plane16) },

	967 { c0, LENGTHOF(c0) }

	968 };

	969

	970 /*

	971 * Verify that the ICU BOCU-1 implementation produces the same results as

	972 * the reference implementation from the design folder.

	973 * Generate some texts and convert them with both converters, verifying

	974 * identical results and roundtripping.

	975 */

	976 static void

	977 TestBOCU1(void) {

	978 UChar *text;

	979 int32_t i, length;

	980

	981 UConverter *bocu1;

	982 UErrorCode errorCode;

	983

	984 errorCode=U_ZERO_ERROR;

	985 bocu1=ucnv_open("BOCU-1", &errorCode);

	986 if(U_FAILURE(errorCode)) {

	987 log_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(erro rCode));

	988 return;

	989 }

	990

	991 text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));

	992

	993 /* text 1: each of strings[] once */

	994 length=0;

	995 for(i=0; i<LENGTHOF(strings); ++i) {

	996 u_memcpy(text+length, strings[i].s, strings[i].length);

	997 length+=strings[i].length;

	998 }

	999 roundtripBOCU1(bocu1, 1, text, length);

	1000

	1001 /* text 2: each of strings[] twice */

	1002 length=0;

	1003 for(i=0; i<LENGTHOF(strings); ++i) {

	1004 u_memcpy(text+length, strings[i].s, strings[i].length);

	1005 length+=strings[i].length;

	1006 u_memcpy(text+length, strings[i].s, strings[i].length);

	1007 length+=strings[i].length;

	1008 }

	1009 roundtripBOCU1(bocu1, 2, text, length);

	1010

	1011 /* text 3: each of strings[] many times (set step vs. \|strings\| so that all strings are used) */

	1012 length=0;

	1013 for(i=1; length<5000; i+=7) {

	1014 if(i>=LENGTHOF(strings)) {

	1015 i-=LENGTHOF(strings);

	1016 }

	1017 u_memcpy(text+length, strings[i].s, strings[i].length);

	1018 length+=strings[i].length;

	1019 }

	1020 roundtripBOCU1(bocu1, 3, text, length);

	1021

	1022 ucnv_close(bocu1);

	1023 free(text);

	1024 }

	1025

	1026 U_CFUNC void addBOCU1Tests(TestNode** root);

	1027

	1028 U_CFUNC void

	1029 addBOCU1Tests(TestNode** root) {

	1030 addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");

	1031 addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");

	1032 }

OLD	NEW

« no previous file with comments | « icu46/source/test/cintltst/Makefile.in ('k') | icu46/source/test/cintltst/callcoll.h » ('j') | no next file with comments »