source/test/cintltst/bocu1tst.c - Issue 2435373002: Delete source/test

Unified Diff: source/test/cintltst/bocu1tst.c

Issue 2435373002: Delete source/test (Closed)

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: source/test/cintltst/bocu1tst.c

diff --git a/source/test/cintltst/bocu1tst.c b/source/test/cintltst/bocu1tst.c

deleted file mode 100644

index 53a204f4007ca8eca9ce8c48f297cca05c68fec7..0000000000000000000000000000000000000000

--- a/source/test/cintltst/bocu1tst.c

+++ /dev/null

@@ -1,1031 +0,0 @@

-/*

-******************************************************************************

-* file name: bocu1tst.c

-* encoding: US-ASCII

-* tab size: 8 (not used)

-* indentation:4

-* created on: 2002may27

-* created by: Markus W. Scherer

-* This is the reference implementation of BOCU-1,

-* the MIME-friendly form of the Binary Ordered Compression for Unicode,

-* taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/

-* The files bocu1.h and bocu1.c from the design folder are taken

-* verbatim (minus copyright and #include) and copied together into this file.

-* The reference code and some of the reference bocu1tst.c

-* is modified to run as part of the ICU cintltst

-* test framework (minus main(), log_ln() etc. instead of printf()).

-* This reference implementation is used here to verify

-* the ICU BOCU-1 implementation, which is

-* adapted for ICU conversion APIs and optimized.

-* ### links in design doc to here and to ucnvbocu.c

-*/

-#include "unicode/utypes.h"

-#include "unicode/ustring.h"

-#include "unicode/ucnv.h"

-#include "unicode/utf16.h"

-#include "cmemory.h"

-#include "cintltst.h"

-/* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */

-/* BOCU-1 constants and macros ---------------------------------------------- */

-/*

- * BOCU-1 encodes the code points of a Unicode string as

- * a sequence of byte-encoded differences (slope detection),

- * preserving lexical order.

- *

- * Optimize the difference-taking for runs of Unicode text within

- * small scripts:

- *

- * Most small scripts are allocated within aligned 128-blocks of Unicode

- * code points. Lexical order is preserved if the "previous code point" state

- * is always moved into the middle of such a block.

- *

- * Additionally, "prev" is moved from anywhere in the Unihan and Hangul

- * areas into the middle of those areas.

- *

- * C0 control codes and space are encoded with their US-ASCII bytes.

- * "prev" is reset for C0 controls but not for space.

- */

-/* initial value for "prev": middle of the ASCII range */

-#define BOCU1_ASCII_PREV 0x40

-/* bounding byte values for differences */

-#define BOCU1_MIN 0x21

-#define BOCU1_MIDDLE 0x90

-#define BOCU1_MAX_LEAD 0xfe

-/* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */

-#define BOCU1_MAX_TRAIL 0xffL

-#define BOCU1_RESET 0xff

-/* number of lead bytes */

-#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)

-/* adjust trail byte counts for the use of some C0 control byte values */

-#define BOCU1_TRAIL_CONTROLS_COUNT 20

-#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

-/* number of trail bytes */

-#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

-/*

- * number of positive and negative single-byte codes

- * (counting 0==BOCU1_MIDDLE among the positive ones)

- */

-#define BOCU1_SINGLE 64

-/* number of lead bytes for positive and negative 2/3/4-byte sequences */

-#define BOCU1_LEAD_2 43

-#define BOCU1_LEAD_3 3

-#define BOCU1_LEAD_4 1

-/* The difference value range for single-byters. */

-#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)

-#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)

-/* The difference value range for double-byters. */

-#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

-#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

-/* The difference value range for 3-byters. */

-#define BOCU1_REACH_POS_3 \

- (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

-#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

-/* The lead byte start values. */

-#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)

-#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)

-#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)

- /* ==BOCU1_MAX_LEAD */

-#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)

-#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)

-#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)

- /* ==BOCU1_MIN+1 */

-/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */

-#define BOCU1_LENGTH_FROM_LEAD(lead) \

- ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \

- (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \

- (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

-/* The length of a byte sequence, according to its packed form. */

-#define BOCU1_LENGTH_FROM_PACKED(packed) \

- ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)

-/*

- * 12 commonly used C0 control codes (and space) are only used to encode

- * themselves directly,

- * which makes BOCU-1 MIME-usable and reasonably safe for

- * ASCII-oriented software.

- *

- * These controls are

- * 0 NUL

- *

- * 7 BEL

- * 8 BS

- *

- * 9 TAB

- * a LF

- * b VT

- * c FF

- * d CR

- *

- * e SO

- * f SI

- *

- * 1a SUB

- * 1b ESC

- *

- * The other 20 C0 controls are also encoded directly (to preserve order)

- * but are also used as trail bytes in difference encoding

- * (for better compression).

- */

-#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])

-/*

- * Byte value map for control codes,

- * from external byte values 0x00..0x20

- * to trail byte values 0..19 (0..0x13) as used in the difference calculation.

- * External byte values that are illegal as trail bytes are mapped to -1.

- */

-static const int8_t

-bocu1ByteToTrail[BOCU1_MIN]={

-/* 0 1 2 3 4 5 6 7 */

- -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,

-/* 8 9 a b c d e f */

- -1, -1, -1, -1, -1, -1, -1, -1,

-/* 10 11 12 13 14 15 16 17 */

- 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,

-/* 18 19 1a 1b 1c 1d 1e 1f */

- 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,

-/* 20 */

- -1

-};

-/*

- * Byte value map for control codes,

- * from trail byte values 0..19 (0..0x13) as used in the difference calculation

- * to external byte values 0x00..0x20.

- */

-static const int8_t

-bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={

-/* 0 1 2 3 4 5 6 7 */

- 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,

-/* 8 9 a b c d e f */

- 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,

-/* 10 11 12 13 */

- 0x1c, 0x1d, 0x1e, 0x1f

-};

-/**

- * Integer division and modulo with negative numerators

- * yields negative modulo results and quotients that are one more than

- * what we need here.

- * This macro adjusts the results so that the modulo-value m is always >=0.

- *

- * For positive n, the if() condition is always FALSE.

- *

- * @param n Number to be split into quotient and rest.

- * Will be modified to contain the quotient.

- * @param d Divisor.

- * @param m Output variable for the rest (modulo result).

- */

-#define NEGDIVMOD(n, d, m) { \

- (m)=(n)%(d); \

- (n)/=(d); \

- if((m)<0) { \

- --(n); \

- (m)+=(d); \

- } \

-/* State for BOCU-1 decoder function. */

-struct Bocu1Rx {

- int32_t prev, count, diff;

-};

-typedef struct Bocu1Rx Bocu1Rx;

-/* Function prototypes ------------------------------------------------------ */

-/* see bocu1.c */

-U_CFUNC int32_t

-packDiff(int32_t diff);

-U_CFUNC int32_t

-encodeBocu1(int32_t *pPrev, int32_t c);

-U_CFUNC int32_t

-decodeBocu1(Bocu1Rx *pRx, uint8_t b);

-/* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */

-/* BOCU-1 implementation functions ------------------------------------------ */

-/**

- * Compute the next "previous" value for differencing

- * from the current code point.

- *

- * @param c current code point, 0..0x10ffff

- * @return "previous code point" state value

- */

-static int32_t

-bocu1Prev(int32_t c) {

- /* compute new prev */

- if(0x3040<=c && c<=0x309f) {

- /* Hiragana is not 128-aligned */

- return 0x3070;

- } else if(0x4e00<=c && c<=0x9fa5) {

- /* CJK Unihan */

- return 0x4e00-BOCU1_REACH_NEG_2;

- } else if(0xac00<=c && c<=0xd7a3) {

- /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */

- return ((int32_t)0xd7a3+(int32_t)0xac00)/2;

- } else {

- /* mostly small scripts */

- return (c&~0x7f)+BOCU1_ASCII_PREV;

- }

-/**

- * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes

- * and return a packed integer with them.

- *

- * The encoding favors small absolute differences with short encodings

- * to compress runs of same-script characters.

- *

- * @param diff difference value -0x10ffff..0x10ffff

- * @return

- * 0x010000zz for 1-byte sequence zz

- * 0x0200yyzz for 2-byte sequence yy zz

- * 0x03xxyyzz for 3-byte sequence xx yy zz

- * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)

- */

-U_CFUNC int32_t

-packDiff(int32_t diff) {

- int32_t result, m, lead, count, shift;

- if(diff>=BOCU1_REACH_NEG_1) {

- /* mostly positive differences, and single-byte negative ones */

- if(diff<=BOCU1_REACH_POS_1) {

- /* single byte */

- return 0x01000000|(BOCU1_MIDDLE+diff);

- } else if(diff<=BOCU1_REACH_POS_2) {

- /* two bytes */

- diff-=BOCU1_REACH_POS_1+1;

- lead=BOCU1_START_POS_2;

- count=1;

- } else if(diff<=BOCU1_REACH_POS_3) {

- /* three bytes */

- diff-=BOCU1_REACH_POS_2+1;

- lead=BOCU1_START_POS_3;

- count=2;

- } else {

- /* four bytes */

- diff-=BOCU1_REACH_POS_3+1;

- lead=BOCU1_START_POS_4;

- count=3;

- }

- } else {

- /* two-, three- and four-byte negative differences */

- if(diff>=BOCU1_REACH_NEG_2) {

- /* two bytes */

- diff-=BOCU1_REACH_NEG_1;

- lead=BOCU1_START_NEG_2;

- count=1;

- } else if(diff>=BOCU1_REACH_NEG_3) {

- /* three bytes */

- diff-=BOCU1_REACH_NEG_2;

- lead=BOCU1_START_NEG_3;

- count=2;

- } else {

- /* four bytes */

- diff-=BOCU1_REACH_NEG_3;

- lead=BOCU1_START_NEG_4;

- count=3;

- }

- /* encode the length of the packed result */

- if(count<3) {

- result=(count+1)<<24;

- } else /* count==3, MSB used for the lead byte */ {

- result=0;

- }

- /* calculate trail bytes like digits in itoa() */

- shift=0;

- do {

- NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

- result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;

- shift+=8;

- } while(--count>0);

- /* add lead byte */

- result|=(lead+diff)<<shift;

- return result;

-/**

- * BOCU-1 encoder function.

- *

- * @param pPrev pointer to the integer that holds

- * the "previous code point" state;

- * the initial value should be 0 which

- * encodeBocu1 will set to the actual BOCU-1 initial state value

- * @param c the code point to encode

- * @return the packed 1/2/3/4-byte encoding, see packDiff(),

- * or 0 if an error occurs

- *

- * @see packDiff

- */

-U_CFUNC int32_t

-encodeBocu1(int32_t *pPrev, int32_t c) {

- int32_t prev;

- if(pPrev==NULL || c<0 || c>0x10ffff) {

- /* illegal argument */

- return 0;

- }

- prev=*pPrev;

- if(prev==0) {

- /* lenient handling of initial value 0 */

- prev=*pPrev=BOCU1_ASCII_PREV;

- }

- if(c<=0x20) {

- /*

- * ISO C0 control & space:

- * Encode directly for MIME compatibility,

- * and reset state except for space, to not disrupt compression.

- */

- if(c!=0x20) {

- *pPrev=BOCU1_ASCII_PREV;

- }

- return 0x01000000|c;

- }

- /*

- * all other Unicode code points c==U+0021..U+10ffff

- * are encoded with the difference c-prev

- *

- * a new prev is computed from c,

- * placed in the middle of a 0x80-block (for most small scripts) or

- * in the middle of the Unihan and Hangul blocks

- * to statistically minimize the following difference

- */

- *pPrev=bocu1Prev(c);

- return packDiff(c-prev);

-/**

- * Function for BOCU-1 decoder; handles multi-byte lead bytes.

- *

- * @param pRx pointer to the decoder state structure

- * @param b lead byte;

- * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD

- * @return -1 (state change only)

- *

- * @see decodeBocu1

- */

-static int32_t

-decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {

- int32_t c, count;

- if(b>=BOCU1_START_NEG_2) {

- /* positive difference */

- if(b<BOCU1_START_POS_3) {

- /* two bytes */

- c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;

- count=1;

- } else if(b<BOCU1_START_POS_4) {

- /* three bytes */

- c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;

- count=2;

- } else {

- /* four bytes */

- c=BOCU1_REACH_POS_3+1;

- count=3;

- }

- } else {

- /* negative difference */

- if(b>=BOCU1_START_NEG_3) {

- /* two bytes */

- c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;

- count=1;

- } else if(b>BOCU1_MIN) {

- /* three bytes */

- c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;

- count=2;

- } else {

- /* four bytes */

- c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;

- count=3;

- }

- /* set the state for decoding the trail byte(s) */

- pRx->diff=c;

- pRx->count=count;

- return -1;

-/**

- * Function for BOCU-1 decoder; handles multi-byte trail bytes.

- *

- * @param pRx pointer to the decoder state structure

- * @param b trail byte

- * @return result value, same as decodeBocu1

- *

- * @see decodeBocu1

- */

-static int32_t

-decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {

- int32_t t, c, count;

- if(b<=0x20) {

- /* skip some C0 controls and make the trail byte range contiguous */

- t=bocu1ByteToTrail[b];

- if(t<0) {

- /* illegal trail byte value */

- pRx->prev=BOCU1_ASCII_PREV;

- pRx->count=0;

- return -99;

- }

-#if BOCU1_MAX_TRAIL<0xff

- } else if(b>BOCU1_MAX_TRAIL) {

- return -99;

-#endif

- } else {

- t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;

- }

- /* add trail byte into difference and decrement count */

- c=pRx->diff;

- count=pRx->count;

- if(count==1) {

- /* final trail byte, deliver a code point */

- c=pRx->prev+c+t;

- if(0<=c && c<=0x10ffff) {

- /* valid code point result */

- pRx->prev=bocu1Prev(c);

- pRx->count=0;

- return c;

- } else {

- /* illegal code point result */

- pRx->prev=BOCU1_ASCII_PREV;

- pRx->count=0;

- return -99;

- }

- /* intermediate trail byte */

- if(count==2) {

- pRx->diff=c+t*BOCU1_TRAIL_COUNT;

- } else /* count==3 */ {

- pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;

- }

- pRx->count=count-1;

- return -1;

-/**

- * BOCU-1 decoder function.

- *

- * @param pRx pointer to the decoder state structure;

- * the initial values should be 0 which

- * decodeBocu1 will set to actual initial state values

- * @param b an input byte

- * @return

- * 0..0x10ffff for a result code point

- * -1 if only the state changed without code point output

- * <-1 if an error occurs

- */

-U_CFUNC int32_t

-decodeBocu1(Bocu1Rx *pRx, uint8_t b) {

- int32_t prev, c, count;

- if(pRx==NULL) {

- /* illegal argument */

- return -99;

- }

- prev=pRx->prev;

- if(prev==0) {

- /* lenient handling of initial 0 values */

- prev=pRx->prev=BOCU1_ASCII_PREV;

- count=pRx->count=0;

- } else {

- count=pRx->count;

- }

- if(count==0) {

- /* byte in lead position */

- if(b<=0x20) {

- /*

- * Direct-encoded C0 control code or space.

- * Reset prev for C0 control codes but not for space.

- */

- if(b!=0x20) {

- pRx->prev=BOCU1_ASCII_PREV;

- }

- return b;

- }

- /*

- * b is a difference lead byte.

- *

- * Return a code point directly from a single-byte difference.

- *

- * For multi-byte difference lead bytes, set the decoder state

- * with the partial difference value from the lead byte and

- * with the number of trail bytes.

- *

- * For four-byte differences, the signedness also affects the

- * first trail byte, which has special handling farther below.

- */

- if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {

- /* single-byte difference */

- c=prev+((int32_t)b-BOCU1_MIDDLE);

- pRx->prev=bocu1Prev(c);

- return c;

- } else if(b==BOCU1_RESET) {

- /* only reset the state, no code point */

- pRx->prev=BOCU1_ASCII_PREV;

- return -1;

- } else {

- return decodeBocu1LeadByte(pRx, b);

- }

- } else {

- /* trail byte in any position */

- return decodeBocu1TrailByte(pRx, b);

- }

-/* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */

-/* test code ---------------------------------------------------------------- */

-/* test code options */

-/* ignore comma when processing name lists in testText() */

-#define TEST_IGNORE_COMMA 1

-/**

- * Write a packed BOCU-1 byte sequence into a byte array,

- * without overflow check.

- * Test function.

- *

- * @param packed packed BOCU-1 byte sequence, see packDiff()

- * @param p pointer to byte array

- * @return number of bytes

- *

- * @see packDiff

- */

-static int32_t

-writePacked(int32_t packed, uint8_t *p) {

- int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);

- switch(count) {

- case 4:

- *p++=(uint8_t)(packed>>24);

- case 3:

- *p++=(uint8_t)(packed>>16);

- case 2:

- *p++=(uint8_t)(packed>>8);

- case 1:

- *p++=(uint8_t)packed;

- default:

- break;

- }

- return count;

-/**

- * Unpack a packed BOCU-1 non-C0/space byte sequence and get

- * the difference to initialPrev.

- * Used only for round-trip testing of the difference encoding and decoding.

- * Test function.

- *

- * @param initialPrev bogus "previous code point" value to make sure that

- * the resulting code point is in the range 0..0x10ffff

- * @param packed packed BOCU-1 byte sequence

- * @return the difference to initialPrev

- *

- * @see packDiff

- * @see writeDiff

- */

-static int32_t

-unpackDiff(int32_t initialPrev, int32_t packed) {

- Bocu1Rx rx={ 0, 0, 0 };

- int32_t count;

- rx.prev=initialPrev;

- count=BOCU1_LENGTH_FROM_PACKED(packed);

- switch(count) {

- case 4:

- decodeBocu1(&rx, (uint8_t)(packed>>24));

- case 3:

- decodeBocu1(&rx, (uint8_t)(packed>>16));

- case 2:

- decodeBocu1(&rx, (uint8_t)(packed>>8));

- case 1:

- /* subtract initial prev */

- return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;

- default:

- return -0x7fffffff;

- }

-/**

- * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,

- * preserving lexical order.

- * Also checks for roundtripping of the difference encoding.

- * Test function.

- *

- * @param diff difference value to test, -0x10ffff..0x10ffff

- * @param p pointer to output byte array

- * @return p advanced by number of bytes output

- *

- * @see unpackDiff

- */

-static uint8_t *

-writeDiff(int32_t diff, uint8_t *p) {

- /* generate the difference as a packed value and serialize it */

- int32_t packed, initialPrev;

- packed=packDiff(diff);

- /*

- * bogus initial "prev" to work around

- * code point range check in decodeBocu1()

- */

- if(diff<=0) {

- initialPrev=0x10ffff;

- } else {

- initialPrev=-1;

- }

- if(diff!=unpackDiff(initialPrev, packed)) {

- log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",

- diff, packed, unpackDiff(initialPrev, packed));

- }

- return p+writePacked(packed, p);

-/**

- * Encode a UTF-16 string in BOCU-1.

- * Does not check for overflows, but otherwise useful function.

- *

- * @param s input UTF-16 string

- * @param length number of UChar code units in s

- * @param p pointer to output byte array

- * @return number of bytes output

- */

-static int32_t

-writeString(const UChar *s, int32_t length, uint8_t *p) {

- uint8_t *p0;

- int32_t c, prev, i;

- prev=0;

- p0=p;

- i=0;

- while(i<length) {

- U16_NEXT(s, i, length, c);

- p+=writePacked(encodeBocu1(&prev, c), p);

- }

- return (int32_t)(p-p0);

-/**

- * Decode a BOCU-1 byte sequence to a UTF-16 string.

- * Does not check for overflows, but otherwise useful function.

- *

- * @param p pointer to input BOCU-1 bytes

- * @param length number of input bytes

- * @param s pointer to output UTF-16 string array

- * @return number of UChar code units output

- */

-static int32_t

-readString(const uint8_t *p, int32_t length, UChar *s) {

- Bocu1Rx rx={ 0, 0, 0 };

- int32_t c, i, sLength;

- i=sLength=0;

- while(i<length) {

- c=decodeBocu1(&rx, p[i++]);

- if(c<-1) {

- log_err("error: readString detects encoding error at string index %ld\n", i);

- return -1;

- }

- if(c>=0) {

- U16_APPEND_UNSAFE(s, sLength, c);

- }

- return sLength;

-static char

-hexDigit(uint8_t digit) {

- return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);

-/**

- * Pretty-print 0-terminated byte values.

- * Helper function for test output.

- *

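- * @param bytes 0-terminated byte array to print

- * @param out output char buffer; receives a space and two lowercase hex digits per byte, is padded with spaces to the width of 5 bytes, and is 0-terminated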

- */

-static void

-printBytes(uint8_t *bytes, char *out) {

- int i;

- uint8_t b;

- i=0;

- while((b=*bytes++)!=0) {

- *out++=' ';

- *out++=hexDigit((uint8_t)(b>>4));

- *out++=hexDigit((uint8_t)(b&0xf));

- ++i;

- }

- i=3*(5-i);

- while(i>0) {

- *out++=' ';

- --i;

- }

- *out=0;

-/**

- * Basic BOCU-1 test function, called when there are no command line arguments.

- * Prints some of the #define values and performs round-trip tests of the

- * difference encoding and decoding.

- */

-static void

-TestBOCU1RefDiff(void) {

- char buf1[80], buf2[80];

- uint8_t prev[5], level[5];

- int32_t i, cmp, countErrors;

- log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1);

- log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2);

- log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3);

- log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1);

- log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2);

- log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3);

- log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE);

- log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2);

- log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);

- /* test packDiff() & unpackDiff() with some specific values */

- writeDiff(0, level);

- writeDiff(1, level);

- writeDiff(65, level);

- writeDiff(130, level);

- writeDiff(30000, level);

- writeDiff(1000000, level);

- writeDiff(-65, level);

- writeDiff(-130, level);

- writeDiff(-30000, level);

- writeDiff(-1000000, level);

- /* test that each value is smaller than any following one */

- countErrors=0;

- i=-0x10ffff;

- *writeDiff(i, prev)=0;

- /* show first number and bytes */

- printBytes(prev, buf1);

- log_verbose(" wD(%8ld) %s\n", i, buf1);

- for(++i; i<=0x10ffff; ++i) {

- *writeDiff(i, level)=0;

- cmp=strcmp((const char *)prev, (const char *)level);

- if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {

- log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",

- level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i);

- }

- if(cmp<0) {

- if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) {

- /*

- * if the result is good, then print only if the length changed

- * to get little but interesting output

- */

- printBytes(prev, buf1);

- printBytes(level, buf2);

- log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2);

- }

- } else {

- ++countErrors;

- printBytes(prev, buf1);

- printBytes(level, buf2);

- log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2);

- }

- /* remember the previous bytes */

- memcpy(prev, level, 4);

- }

- /* show last number and bytes */

- printBytes((uint8_t *)"", buf1);

- printBytes(prev, buf2);

- log_verbose(" wD(%8ld) %s%s\n", i-1, buf1, buf2);

- if(countErrors==0) {

- log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");

- } else {

- log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);

- }

- /* output signature byte sequence */

- i=0;

- writePacked(encodeBocu1(&i, 0xfeff), level);

- log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",

- level[0], level[1], level[2]);

-/* cintltst code ------------------------------------------------------------ */

-static const int32_t DEFAULT_BUFFER_SIZE = 30000;

-/* test one string with the ICU and the reference BOCU-1 implementations */

-static void

-roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {

- UChar *roundtripRef, *roundtripICU;

- char *bocu1Ref, *bocu1ICU;

- int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;

- UErrorCode errorCode;

- roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));

- roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));

- bocu1Ref = malloc(DEFAULT_BUFFER_SIZE);

- bocu1ICU = malloc(DEFAULT_BUFFER_SIZE);

- /* Unicode -> BOCU-1 */

- bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);

- errorCode=U_ZERO_ERROR;

- bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode);

- if(U_FAILURE(errorCode)) {

- log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));

- goto cleanup;

- }

- if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {

- log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);

- goto cleanup;

- }

- /* BOCU-1 -> Unicode */

- roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);

- if(roundtripRefLength<0) {

- goto cleanup; /* readString() found an error and reported it */

- }

- roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode);

- if(U_FAILURE(errorCode)) {

- log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));

- goto cleanup;

- }

- if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) {

- log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);

- goto cleanup;

- }

- if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {

- log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);

- goto cleanup;

- }

-cleanup:

- free(roundtripRef);

- free(roundtripICU);

- free(bocu1Ref);

- free(bocu1ICU);

-static const UChar feff[]={ 0xfeff };

-static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };

-static const UChar crlf[]={ 0xd, 0xa, 0x20 };

-static const UChar nul[]={ 0 };

-static const UChar latin[]={ 0xdf, 0xe6 };

-static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };

-static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };

-static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };

-static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };

-static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */

-static const UChar plane1[]={ 0xd800, 0xdc00 };

-static const UChar plane2[]={ 0xd845, 0xdddd };

-static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };

-static const UChar plane16[]={ 0xdbff, 0xdfff };

-static const UChar c0[]={ 1, 0xe40, 0x20, 9 };

-static const struct {

- const UChar *s;

- int32_t length;

-} strings[]={

- { feff, UPRV_LENGTHOF(feff) },

- { ascii, UPRV_LENGTHOF(ascii) },

- { crlf, UPRV_LENGTHOF(crlf) },

- { nul, UPRV_LENGTHOF(nul) },

- { latin, UPRV_LENGTHOF(latin) },

- { devanagari, UPRV_LENGTHOF(devanagari) },

- { hiragana, UPRV_LENGTHOF(hiragana) },

- { unihan, UPRV_LENGTHOF(unihan) },

- { hangul, UPRV_LENGTHOF(hangul) },

- { surrogates, UPRV_LENGTHOF(surrogates) },

- { plane1, UPRV_LENGTHOF(plane1) },

- { plane2, UPRV_LENGTHOF(plane2) },

- { plane15, UPRV_LENGTHOF(plane15) },

- { plane16, UPRV_LENGTHOF(plane16) },

- { c0, UPRV_LENGTHOF(c0) }

-};

-/*

- * Verify that the ICU BOCU-1 implementation produces the same results as

- * the reference implementation from the design folder.

- * Generate some texts and convert them with both converters, verifying

- * identical results and roundtripping.

- */

-static void

-TestBOCU1(void) {

- UChar *text;

- int32_t i, length;

- UConverter *bocu1;

- UErrorCode errorCode;

- errorCode=U_ZERO_ERROR;

- bocu1=ucnv_open("BOCU-1", &errorCode);

- if(U_FAILURE(errorCode)) {

- log_data_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));

- return;

- }

- text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));

- /* text 1: each of strings[] once */

- length=0;

- for(i=0; i<UPRV_LENGTHOF(strings); ++i) {

- u_memcpy(text+length, strings[i].s, strings[i].length);

- length+=strings[i].length;

- }

- roundtripBOCU1(bocu1, 1, text, length);

- /* text 2: each of strings[] twice */

- length=0;

- for(i=0; i<UPRV_LENGTHOF(strings); ++i) {

- u_memcpy(text+length, strings[i].s, strings[i].length);

- length+=strings[i].length;

- u_memcpy(text+length, strings[i].s, strings[i].length);

- length+=strings[i].length;

- }

- roundtripBOCU1(bocu1, 2, text, length);

- /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */

- length=0;

- for(i=1; length<5000; i+=7) {

- if(i>=UPRV_LENGTHOF(strings)) {

- i-=UPRV_LENGTHOF(strings);

- }

- u_memcpy(text+length, strings[i].s, strings[i].length);

- length+=strings[i].length;

- }

- roundtripBOCU1(bocu1, 3, text, length);

- ucnv_close(bocu1);

- free(text);

-U_CFUNC void addBOCU1Tests(TestNode** root);

-U_CFUNC void

-addBOCU1Tests(TestNode** root) {

- addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");

- addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");

« no previous file with comments | « source/test/cintltst/Makefile.in ('k') | source/test/cintltst/callcoll.h » ('j') | no next file with comments »