Index: source/test/cintltst/bocu1tst.c |
diff --git a/source/test/cintltst/bocu1tst.c b/source/test/cintltst/bocu1tst.c |
deleted file mode 100644 |
index 53a204f4007ca8eca9ce8c48f297cca05c68fec7..0000000000000000000000000000000000000000 |
--- a/source/test/cintltst/bocu1tst.c |
+++ /dev/null |
@@ -1,1031 +0,0 @@ |
-/* |
-****************************************************************************** |
-* |
-* Copyright (C) 2002-2015, International Business Machines |
-* Corporation and others. All Rights Reserved. |
-* |
-****************************************************************************** |
-* file name: bocu1tst.c |
-* encoding: US-ASCII |
-* tab size: 8 (not used) |
-* indentation:4 |
-* |
-* created on: 2002may27 |
-* created by: Markus W. Scherer |
-* |
-* This is the reference implementation of BOCU-1, |
-* the MIME-friendly form of the Binary Ordered Compression for Unicode, |
-* taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/ |
-* The files bocu1.h and bocu1.c from the design folder are taken |
-* verbatim (minus copyright and #include) and copied together into this file. |
-* The reference code and some of the reference bocu1tst.c |
-* is modified to run as part of the ICU cintltst |
-* test framework (minus main(), log_ln() etc. instead of printf()). |
-* |
-* This reference implementation is used here to verify |
-* the ICU BOCU-1 implementation, which is |
-* adapted for ICU conversion APIs and optimized. |
-* ### links in design doc to here and to ucnvbocu.c |
-*/ |
- |
-#include "unicode/utypes.h" |
-#include "unicode/ustring.h" |
-#include "unicode/ucnv.h" |
-#include "unicode/utf16.h" |
-#include "cmemory.h" |
-#include "cintltst.h" |
- |
-/* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */ |
- |
-/* BOCU-1 constants and macros ---------------------------------------------- */ |
- |
-/* |
- * BOCU-1 encodes the code points of a Unicode string as |
- * a sequence of byte-encoded differences (slope detection), |
- * preserving lexical order. |
- * |
- * Optimize the difference-taking for runs of Unicode text within |
- * small scripts: |
- * |
- * Most small scripts are allocated within aligned 128-blocks of Unicode |
- * code points. Lexical order is preserved if the "previous code point" state |
- * is always moved into the middle of such a block. |
- * |
- * Additionally, "prev" is moved from anywhere in the Unihan and Hangul |
- * areas into the middle of those areas. |
- * |
- * C0 control codes and space are encoded with their US-ASCII bytes. |
- * "prev" is reset for C0 controls but not for space. |
- */ |
- |
-/* initial value for "prev": middle of the ASCII range */ |
-#define BOCU1_ASCII_PREV 0x40 |
- |
-/* bounding byte values for differences */ |
-#define BOCU1_MIN 0x21 |
-#define BOCU1_MIDDLE 0x90 |
-#define BOCU1_MAX_LEAD 0xfe |
- |
-/* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */ |
-#define BOCU1_MAX_TRAIL 0xffL |
-#define BOCU1_RESET 0xff |
- |
-/* number of lead bytes */ |
-#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1) |
- |
-/* adjust trail byte counts for the use of some C0 control byte values */ |
-#define BOCU1_TRAIL_CONTROLS_COUNT 20 |
-#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT) |
- |
-/* number of trail bytes */ |
-#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT) |
- |
-/* |
- * number of positive and negative single-byte codes |
- * (counting 0==BOCU1_MIDDLE among the positive ones) |
- */ |
-#define BOCU1_SINGLE 64 |
- |
-/* number of lead bytes for positive and negative 2/3/4-byte sequences */ |
-#define BOCU1_LEAD_2 43 |
-#define BOCU1_LEAD_3 3 |
-#define BOCU1_LEAD_4 1 |
- |
-/* The difference value range for single-byters. */ |
-#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1) |
-#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE) |
- |
-/* The difference value range for double-byters. */ |
-#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) |
-#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) |
- |
-/* The difference value range for 3-byters. */ |
-#define BOCU1_REACH_POS_3 \ |
- (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) |
- |
-#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) |
- |
-/* The lead byte start values. */ |
-#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1) |
-#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2) |
-#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3) |
- /* ==BOCU1_MAX_LEAD */ |
- |
-#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1) |
-#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2) |
-#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3) |
- /* ==BOCU1_MIN+1 */ |
- |
-/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ |
-#define BOCU1_LENGTH_FROM_LEAD(lead) \ |
- ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \ |
- (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \ |
- (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4) |
- |
-/* The length of a byte sequence, according to its packed form. */ |
-#define BOCU1_LENGTH_FROM_PACKED(packed) \ |
- ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4) |
- |
-/* |
- * 12 commonly used C0 control codes (and space) are only used to encode |
- * themselves directly, |
- * which makes BOCU-1 MIME-usable and reasonably safe for |
- * ASCII-oriented software. |
- * |
- * These controls are |
- * 0 NUL |
- * |
- * 7 BEL |
- * 8 BS |
- * |
- * 9 TAB |
- * a LF |
- * b VT |
- * c FF |
- * d CR |
- * |
- * e SO |
- * f SI |
- * |
- * 1a SUB |
- * 1b ESC |
- * |
- * The other 20 C0 controls are also encoded directly (to preserve order) |
- * but are also used as trail bytes in difference encoding |
- * (for better compression). |
- */ |
-#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) |
- |
-/* |
- * Byte value map for control codes, |
- * from external byte values 0x00..0x20 |
- * to trail byte values 0..19 (0..0x13) as used in the difference calculation. |
- * External byte values that are illegal as trail bytes are mapped to -1. |
- */ |
-static const int8_t |
-bocu1ByteToTrail[BOCU1_MIN]={ |
-/* 0 1 2 3 4 5 6 7 */ |
- -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, |
- |
-/* 8 9 a b c d e f */ |
- -1, -1, -1, -1, -1, -1, -1, -1, |
- |
-/* 10 11 12 13 14 15 16 17 */ |
- 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, |
- |
-/* 18 19 1a 1b 1c 1d 1e 1f */ |
- 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, |
- |
-/* 20 */ |
- -1 |
-}; |
- |
-/* |
- * Byte value map for control codes, |
- * from trail byte values 0..19 (0..0x13) as used in the difference calculation |
- * to external byte values 0x00..0x20. |
- */ |
-static const int8_t |
-bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ |
-/* 0 1 2 3 4 5 6 7 */ |
- 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, |
- |
-/* 8 9 a b c d e f */ |
- 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, |
- |
-/* 10 11 12 13 */ |
- 0x1c, 0x1d, 0x1e, 0x1f |
-}; |
- |
-/** |
- * Integer division and modulo with negative numerators |
- * yields negative modulo results and quotients that are one more than |
- * what we need here. |
- * This macro adjust the results so that the modulo-value m is always >=0. |
- * |
- * For positive n, the if() condition is always FALSE. |
- * |
- * @param n Number to be split into quotient and rest. |
- * Will be modified to contain the quotient. |
- * @param d Divisor. |
- * @param m Output variable for the rest (modulo result). |
- */ |
-#define NEGDIVMOD(n, d, m) { \ |
- (m)=(n)%(d); \ |
- (n)/=(d); \ |
- if((m)<0) { \ |
- --(n); \ |
- (m)+=(d); \ |
- } \ |
-} |
- |
-/* State for BOCU-1 decoder function. */ |
-struct Bocu1Rx { |
- int32_t prev, count, diff; |
-}; |
- |
-typedef struct Bocu1Rx Bocu1Rx; |
- |
-/* Function prototypes ------------------------------------------------------ */ |
- |
-/* see bocu1.c */ |
-U_CFUNC int32_t |
-packDiff(int32_t diff); |
- |
-U_CFUNC int32_t |
-encodeBocu1(int32_t *pPrev, int32_t c); |
- |
-U_CFUNC int32_t |
-decodeBocu1(Bocu1Rx *pRx, uint8_t b); |
- |
-/* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */ |
- |
-/* BOCU-1 implementation functions ------------------------------------------ */ |
- |
-/** |
- * Compute the next "previous" value for differencing |
- * from the current code point. |
- * |
- * @param c current code point, 0..0x10ffff |
- * @return "previous code point" state value |
- */ |
-static int32_t |
-bocu1Prev(int32_t c) { |
- /* compute new prev */ |
- if(0x3040<=c && c<=0x309f) { |
- /* Hiragana is not 128-aligned */ |
- return 0x3070; |
- } else if(0x4e00<=c && c<=0x9fa5) { |
- /* CJK Unihan */ |
- return 0x4e00-BOCU1_REACH_NEG_2; |
- } else if(0xac00<=c && c<=0xd7a3) { |
- /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */ |
- return ((int32_t)0xd7a3+(int32_t)0xac00)/2; |
- } else { |
- /* mostly small scripts */ |
- return (c&~0x7f)+BOCU1_ASCII_PREV; |
- } |
-} |
- |
-/** |
- * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes |
- * and return a packed integer with them. |
- * |
- * The encoding favors small absolut differences with short encodings |
- * to compress runs of same-script characters. |
- * |
- * @param diff difference value -0x10ffff..0x10ffff |
- * @return |
- * 0x010000zz for 1-byte sequence zz |
- * 0x0200yyzz for 2-byte sequence yy zz |
- * 0x03xxyyzz for 3-byte sequence xx yy zz |
- * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) |
- */ |
-U_CFUNC int32_t |
-packDiff(int32_t diff) { |
- int32_t result, m, lead, count, shift; |
- |
- if(diff>=BOCU1_REACH_NEG_1) { |
- /* mostly positive differences, and single-byte negative ones */ |
- if(diff<=BOCU1_REACH_POS_1) { |
- /* single byte */ |
- return 0x01000000|(BOCU1_MIDDLE+diff); |
- } else if(diff<=BOCU1_REACH_POS_2) { |
- /* two bytes */ |
- diff-=BOCU1_REACH_POS_1+1; |
- lead=BOCU1_START_POS_2; |
- count=1; |
- } else if(diff<=BOCU1_REACH_POS_3) { |
- /* three bytes */ |
- diff-=BOCU1_REACH_POS_2+1; |
- lead=BOCU1_START_POS_3; |
- count=2; |
- } else { |
- /* four bytes */ |
- diff-=BOCU1_REACH_POS_3+1; |
- lead=BOCU1_START_POS_4; |
- count=3; |
- } |
- } else { |
- /* two- and four-byte negative differences */ |
- if(diff>=BOCU1_REACH_NEG_2) { |
- /* two bytes */ |
- diff-=BOCU1_REACH_NEG_1; |
- lead=BOCU1_START_NEG_2; |
- count=1; |
- } else if(diff>=BOCU1_REACH_NEG_3) { |
- /* three bytes */ |
- diff-=BOCU1_REACH_NEG_2; |
- lead=BOCU1_START_NEG_3; |
- count=2; |
- } else { |
- /* four bytes */ |
- diff-=BOCU1_REACH_NEG_3; |
- lead=BOCU1_START_NEG_4; |
- count=3; |
- } |
- } |
- |
- /* encode the length of the packed result */ |
- if(count<3) { |
- result=(count+1)<<24; |
- } else /* count==3, MSB used for the lead byte */ { |
- result=0; |
- } |
- |
- /* calculate trail bytes like digits in itoa() */ |
- shift=0; |
- do { |
- NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); |
- result|=BOCU1_TRAIL_TO_BYTE(m)<<shift; |
- shift+=8; |
- } while(--count>0); |
- |
- /* add lead byte */ |
- result|=(lead+diff)<<shift; |
- |
- return result; |
-} |
- |
-/** |
- * BOCU-1 encoder function. |
- * |
- * @param pPrev pointer to the integer that holds |
- * the "previous code point" state; |
- * the initial value should be 0 which |
- * encodeBocu1 will set to the actual BOCU-1 initial state value |
- * @param c the code point to encode |
- * @return the packed 1/2/3/4-byte encoding, see packDiff(), |
- * or 0 if an error occurs |
- * |
- * @see packDiff |
- */ |
-U_CFUNC int32_t |
-encodeBocu1(int32_t *pPrev, int32_t c) { |
- int32_t prev; |
- |
- if(pPrev==NULL || c<0 || c>0x10ffff) { |
- /* illegal argument */ |
- return 0; |
- } |
- |
- prev=*pPrev; |
- if(prev==0) { |
- /* lenient handling of initial value 0 */ |
- prev=*pPrev=BOCU1_ASCII_PREV; |
- } |
- |
- if(c<=0x20) { |
- /* |
- * ISO C0 control & space: |
- * Encode directly for MIME compatibility, |
- * and reset state except for space, to not disrupt compression. |
- */ |
- if(c!=0x20) { |
- *pPrev=BOCU1_ASCII_PREV; |
- } |
- return 0x01000000|c; |
- } |
- |
- /* |
- * all other Unicode code points c==U+0021..U+10ffff |
- * are encoded with the difference c-prev |
- * |
- * a new prev is computed from c, |
- * placed in the middle of a 0x80-block (for most small scripts) or |
- * in the middle of the Unihan and Hangul blocks |
- * to statistically minimize the following difference |
- */ |
- *pPrev=bocu1Prev(c); |
- return packDiff(c-prev); |
-} |
- |
-/** |
- * Function for BOCU-1 decoder; handles multi-byte lead bytes. |
- * |
- * @param pRx pointer to the decoder state structure |
- * @param b lead byte; |
- * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD |
- * @return -1 (state change only) |
- * |
- * @see decodeBocu1 |
- */ |
-static int32_t |
-decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) { |
- int32_t c, count; |
- |
- if(b>=BOCU1_START_NEG_2) { |
- /* positive difference */ |
- if(b<BOCU1_START_POS_3) { |
- /* two bytes */ |
- c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; |
- count=1; |
- } else if(b<BOCU1_START_POS_4) { |
- /* three bytes */ |
- c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; |
- count=2; |
- } else { |
- /* four bytes */ |
- c=BOCU1_REACH_POS_3+1; |
- count=3; |
- } |
- } else { |
- /* negative difference */ |
- if(b>=BOCU1_START_NEG_3) { |
- /* two bytes */ |
- c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; |
- count=1; |
- } else if(b>BOCU1_MIN) { |
- /* three bytes */ |
- c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; |
- count=2; |
- } else { |
- /* four bytes */ |
- c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; |
- count=3; |
- } |
- } |
- |
- /* set the state for decoding the trail byte(s) */ |
- pRx->diff=c; |
- pRx->count=count; |
- return -1; |
-} |
- |
-/** |
- * Function for BOCU-1 decoder; handles multi-byte trail bytes. |
- * |
- * @param pRx pointer to the decoder state structure |
- * @param b trail byte |
- * @return result value, same as decodeBocu1 |
- * |
- * @see decodeBocu1 |
- */ |
-static int32_t |
-decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) { |
- int32_t t, c, count; |
- |
- if(b<=0x20) { |
- /* skip some C0 controls and make the trail byte range contiguous */ |
- t=bocu1ByteToTrail[b]; |
- if(t<0) { |
- /* illegal trail byte value */ |
- pRx->prev=BOCU1_ASCII_PREV; |
- pRx->count=0; |
- return -99; |
- } |
-#if BOCU1_MAX_TRAIL<0xff |
- } else if(b>BOCU1_MAX_TRAIL) { |
- return -99; |
-#endif |
- } else { |
- t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET; |
- } |
- |
- /* add trail byte into difference and decrement count */ |
- c=pRx->diff; |
- count=pRx->count; |
- |
- if(count==1) { |
- /* final trail byte, deliver a code point */ |
- c=pRx->prev+c+t; |
- if(0<=c && c<=0x10ffff) { |
- /* valid code point result */ |
- pRx->prev=bocu1Prev(c); |
- pRx->count=0; |
- return c; |
- } else { |
- /* illegal code point result */ |
- pRx->prev=BOCU1_ASCII_PREV; |
- pRx->count=0; |
- return -99; |
- } |
- } |
- |
- /* intermediate trail byte */ |
- if(count==2) { |
- pRx->diff=c+t*BOCU1_TRAIL_COUNT; |
- } else /* count==3 */ { |
- pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT; |
- } |
- pRx->count=count-1; |
- return -1; |
-} |
- |
-/** |
- * BOCU-1 decoder function. |
- * |
- * @param pRx pointer to the decoder state structure; |
- * the initial values should be 0 which |
- * decodeBocu1 will set to actual initial state values |
- * @param b an input byte |
- * @return |
- * 0..0x10ffff for a result code point |
- * -1 if only the state changed without code point output |
- * <-1 if an error occurs |
- */ |
-U_CFUNC int32_t |
-decodeBocu1(Bocu1Rx *pRx, uint8_t b) { |
- int32_t prev, c, count; |
- |
- if(pRx==NULL) { |
- /* illegal argument */ |
- return -99; |
- } |
- |
- prev=pRx->prev; |
- if(prev==0) { |
- /* lenient handling of initial 0 values */ |
- prev=pRx->prev=BOCU1_ASCII_PREV; |
- count=pRx->count=0; |
- } else { |
- count=pRx->count; |
- } |
- |
- if(count==0) { |
- /* byte in lead position */ |
- if(b<=0x20) { |
- /* |
- * Direct-encoded C0 control code or space. |
- * Reset prev for C0 control codes but not for space. |
- */ |
- if(b!=0x20) { |
- pRx->prev=BOCU1_ASCII_PREV; |
- } |
- return b; |
- } |
- |
- /* |
- * b is a difference lead byte. |
- * |
- * Return a code point directly from a single-byte difference. |
- * |
- * For multi-byte difference lead bytes, set the decoder state |
- * with the partial difference value from the lead byte and |
- * with the number of trail bytes. |
- * |
- * For four-byte differences, the signedness also affects the |
- * first trail byte, which has special handling farther below. |
- */ |
- if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) { |
- /* single-byte difference */ |
- c=prev+((int32_t)b-BOCU1_MIDDLE); |
- pRx->prev=bocu1Prev(c); |
- return c; |
- } else if(b==BOCU1_RESET) { |
- /* only reset the state, no code point */ |
- pRx->prev=BOCU1_ASCII_PREV; |
- return -1; |
- } else { |
- return decodeBocu1LeadByte(pRx, b); |
- } |
- } else { |
- /* trail byte in any position */ |
- return decodeBocu1TrailByte(pRx, b); |
- } |
-} |
- |
-/* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */ |
- |
-/* test code ---------------------------------------------------------------- */ |
- |
-/* test code options */ |
- |
-/* ignore comma when processing name lists in testText() */ |
-#define TEST_IGNORE_COMMA 1 |
- |
-/** |
- * Write a packed BOCU-1 byte sequence into a byte array, |
- * without overflow check. |
- * Test function. |
- * |
- * @param packed packed BOCU-1 byte sequence, see packDiff() |
- * @param p pointer to byte array |
- * @return number of bytes |
- * |
- * @see packDiff |
- */ |
-static int32_t |
-writePacked(int32_t packed, uint8_t *p) { |
- int32_t count=BOCU1_LENGTH_FROM_PACKED(packed); |
- switch(count) { |
- case 4: |
- *p++=(uint8_t)(packed>>24); |
- case 3: |
- *p++=(uint8_t)(packed>>16); |
- case 2: |
- *p++=(uint8_t)(packed>>8); |
- case 1: |
- *p++=(uint8_t)packed; |
- default: |
- break; |
- } |
- |
- return count; |
-} |
- |
-/** |
- * Unpack a packed BOCU-1 non-C0/space byte sequence and get |
- * the difference to initialPrev. |
- * Used only for round-trip testing of the difference encoding and decoding. |
- * Test function. |
- * |
- * @param initialPrev bogus "previous code point" value to make sure that |
- * the resulting code point is in the range 0..0x10ffff |
- * @param packed packed BOCU-1 byte sequence |
- * @return the difference to initialPrev |
- * |
- * @see packDiff |
- * @see writeDiff |
- */ |
-static int32_t |
-unpackDiff(int32_t initialPrev, int32_t packed) { |
- Bocu1Rx rx={ 0, 0, 0 }; |
- int32_t count; |
- |
- rx.prev=initialPrev; |
- count=BOCU1_LENGTH_FROM_PACKED(packed); |
- switch(count) { |
- case 4: |
- decodeBocu1(&rx, (uint8_t)(packed>>24)); |
- case 3: |
- decodeBocu1(&rx, (uint8_t)(packed>>16)); |
- case 2: |
- decodeBocu1(&rx, (uint8_t)(packed>>8)); |
- case 1: |
- /* subtract initial prev */ |
- return decodeBocu1(&rx, (uint8_t)packed)-initialPrev; |
- default: |
- return -0x7fffffff; |
- } |
-} |
- |
-/** |
- * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes, |
- * preserving lexical order. |
- * Also checks for roundtripping of the difference encoding. |
- * Test function. |
- * |
- * @param diff difference value to test, -0x10ffff..0x10ffff |
- * @param p pointer to output byte array |
- * @return p advanced by number of bytes output |
- * |
- * @see unpackDiff |
- */ |
-static uint8_t * |
-writeDiff(int32_t diff, uint8_t *p) { |
- /* generate the difference as a packed value and serialize it */ |
- int32_t packed, initialPrev; |
- |
- packed=packDiff(diff); |
- |
- /* |
- * bogus initial "prev" to work around |
- * code point range check in decodeBocu1() |
- */ |
- if(diff<=0) { |
- initialPrev=0x10ffff; |
- } else { |
- initialPrev=-1; |
- } |
- |
- if(diff!=unpackDiff(initialPrev, packed)) { |
- log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n", |
- diff, packed, unpackDiff(initialPrev, packed)); |
- } |
- return p+writePacked(packed, p); |
-} |
- |
-/** |
- * Encode a UTF-16 string in BOCU-1. |
- * Does not check for overflows, but otherwise useful function. |
- * |
- * @param s input UTF-16 string |
- * @param length number of UChar code units in s |
- * @param p pointer to output byte array |
- * @return number of bytes output |
- */ |
-static int32_t |
-writeString(const UChar *s, int32_t length, uint8_t *p) { |
- uint8_t *p0; |
- int32_t c, prev, i; |
- |
- prev=0; |
- p0=p; |
- i=0; |
- while(i<length) { |
- U16_NEXT(s, i, length, c); |
- p+=writePacked(encodeBocu1(&prev, c), p); |
- } |
- return (int32_t)(p-p0); |
-} |
- |
-/** |
- * Decode a BOCU-1 byte sequence to a UTF-16 string. |
- * Does not check for overflows, but otherwise useful function. |
- * |
- * @param p pointer to input BOCU-1 bytes |
- * @param length number of input bytes |
- * @param s point to output UTF-16 string array |
- * @return number of UChar code units output |
- */ |
-static int32_t |
-readString(const uint8_t *p, int32_t length, UChar *s) { |
- Bocu1Rx rx={ 0, 0, 0 }; |
- int32_t c, i, sLength; |
- |
- i=sLength=0; |
- while(i<length) { |
- c=decodeBocu1(&rx, p[i++]); |
- if(c<-1) { |
- log_err("error: readString detects encoding error at string index %ld\n", i); |
- return -1; |
- } |
- if(c>=0) { |
- U16_APPEND_UNSAFE(s, sLength, c); |
- } |
- } |
- return sLength; |
-} |
- |
-static char |
-hexDigit(uint8_t digit) { |
- return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit); |
-} |
- |
-/** |
- * Pretty-print 0-terminated byte values. |
- * Helper function for test output. |
- * |
- * @param bytes 0-terminated byte array to print |
- */ |
-static void |
-printBytes(uint8_t *bytes, char *out) { |
- int i; |
- uint8_t b; |
- |
- i=0; |
- while((b=*bytes++)!=0) { |
- *out++=' '; |
- *out++=hexDigit((uint8_t)(b>>4)); |
- *out++=hexDigit((uint8_t)(b&0xf)); |
- ++i; |
- } |
- i=3*(5-i); |
- while(i>0) { |
- *out++=' '; |
- --i; |
- } |
- *out=0; |
-} |
- |
-/** |
- * Basic BOCU-1 test function, called when there are no command line arguments. |
- * Prints some of the #define values and performs round-trip tests of the |
- * difference encoding and decoding. |
- */ |
-static void |
-TestBOCU1RefDiff(void) { |
- char buf1[80], buf2[80]; |
- uint8_t prev[5], level[5]; |
- int32_t i, cmp, countErrors; |
- |
- log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1); |
- log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2); |
- log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3); |
- |
- log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1); |
- log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2); |
- log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3); |
- |
- log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE); |
- log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2); |
- log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3); |
- |
- /* test packDiff() & unpackDiff() with some specific values */ |
- writeDiff(0, level); |
- writeDiff(1, level); |
- writeDiff(65, level); |
- writeDiff(130, level); |
- writeDiff(30000, level); |
- writeDiff(1000000, level); |
- writeDiff(-65, level); |
- writeDiff(-130, level); |
- writeDiff(-30000, level); |
- writeDiff(-1000000, level); |
- |
- /* test that each value is smaller than any following one */ |
- countErrors=0; |
- i=-0x10ffff; |
- *writeDiff(i, prev)=0; |
- |
- /* show first number and bytes */ |
- printBytes(prev, buf1); |
- log_verbose(" wD(%8ld) %s\n", i, buf1); |
- |
- for(++i; i<=0x10ffff; ++i) { |
- *writeDiff(i, level)=0; |
- cmp=strcmp((const char *)prev, (const char *)level); |
- if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) { |
- log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n", |
- level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i); |
- } |
- if(cmp<0) { |
- if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) { |
- /* |
- * if the result is good, then print only if the length changed |
- * to get little but interesting output |
- */ |
- printBytes(prev, buf1); |
- printBytes(level, buf2); |
- log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2); |
- } |
- } else { |
- ++countErrors; |
- printBytes(prev, buf1); |
- printBytes(level, buf2); |
- log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2); |
- } |
- /* remember the previous bytes */ |
- memcpy(prev, level, 4); |
- } |
- |
- /* show last number and bytes */ |
- printBytes((uint8_t *)"", buf1); |
- printBytes(prev, buf2); |
- log_verbose(" wD(%8ld) %s%s\n", i-1, buf1, buf2); |
- |
- if(countErrors==0) { |
- log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n"); |
- } else { |
- log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors); |
- } |
- |
- /* output signature byte sequence */ |
- i=0; |
- writePacked(encodeBocu1(&i, 0xfeff), level); |
- log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n", |
- level[0], level[1], level[2]); |
-} |
- |
-/* cintltst code ------------------------------------------------------------ */ |
- |
-static const int32_t DEFAULT_BUFFER_SIZE = 30000; |
- |
- |
-/* test one string with the ICU and the reference BOCU-1 implementations */ |
-static void |
-roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) { |
- UChar *roundtripRef, *roundtripICU; |
- char *bocu1Ref, *bocu1ICU; |
- |
- int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength; |
- UErrorCode errorCode; |
- |
- roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar)); |
- roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar)); |
- bocu1Ref = malloc(DEFAULT_BUFFER_SIZE); |
- bocu1ICU = malloc(DEFAULT_BUFFER_SIZE); |
- |
- /* Unicode -> BOCU-1 */ |
- bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref); |
- |
- errorCode=U_ZERO_ERROR; |
- bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode); |
- if(U_FAILURE(errorCode)) { |
- log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode)); |
- goto cleanup; |
- } |
- |
- if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) { |
- log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength); |
- goto cleanup; |
- } |
- |
- /* BOCU-1 -> Unicode */ |
- roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef); |
- if(roundtripRefLength<0) { |
- goto cleanup; /* readString() found an error and reported it */ |
- } |
- |
- roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode); |
- if(U_FAILURE(errorCode)) { |
- log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode)); |
- goto cleanup; |
- } |
- |
- if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) { |
- log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength); |
- goto cleanup; |
- } |
- if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) { |
- log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength); |
- goto cleanup; |
- } |
-cleanup: |
- free(roundtripRef); |
- free(roundtripICU); |
- free(bocu1Ref); |
- free(bocu1ICU); |
-} |
- |
-static const UChar feff[]={ 0xfeff }; |
-static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 }; |
-static const UChar crlf[]={ 0xd, 0xa, 0x20 }; |
-static const UChar nul[]={ 0 }; |
-static const UChar latin[]={ 0xdf, 0xe6 }; |
-static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 }; |
-static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 }; |
-static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 }; |
-static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 }; |
-static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */ |
-static const UChar plane1[]={ 0xd800, 0xdc00 }; |
-static const UChar plane2[]={ 0xd845, 0xdddd }; |
-static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 }; |
-static const UChar plane16[]={ 0xdbff, 0xdfff }; |
-static const UChar c0[]={ 1, 0xe40, 0x20, 9 }; |
- |
-static const struct { |
- const UChar *s; |
- int32_t length; |
-} strings[]={ |
- { feff, UPRV_LENGTHOF(feff) }, |
- { ascii, UPRV_LENGTHOF(ascii) }, |
- { crlf, UPRV_LENGTHOF(crlf) }, |
- { nul, UPRV_LENGTHOF(nul) }, |
- { latin, UPRV_LENGTHOF(latin) }, |
- { devanagari, UPRV_LENGTHOF(devanagari) }, |
- { hiragana, UPRV_LENGTHOF(hiragana) }, |
- { unihan, UPRV_LENGTHOF(unihan) }, |
- { hangul, UPRV_LENGTHOF(hangul) }, |
- { surrogates, UPRV_LENGTHOF(surrogates) }, |
- { plane1, UPRV_LENGTHOF(plane1) }, |
- { plane2, UPRV_LENGTHOF(plane2) }, |
- { plane15, UPRV_LENGTHOF(plane15) }, |
- { plane16, UPRV_LENGTHOF(plane16) }, |
- { c0, UPRV_LENGTHOF(c0) } |
-}; |
- |
-/* |
- * Verify that the ICU BOCU-1 implementation produces the same results as |
- * the reference implementation from the design folder. |
- * Generate some texts and convert them with both converters, verifying |
- * identical results and roundtripping. |
- */ |
-static void |
-TestBOCU1(void) { |
- UChar *text; |
- int32_t i, length; |
- |
- UConverter *bocu1; |
- UErrorCode errorCode; |
- |
- errorCode=U_ZERO_ERROR; |
- bocu1=ucnv_open("BOCU-1", &errorCode); |
- if(U_FAILURE(errorCode)) { |
- log_data_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode)); |
- return; |
- } |
- |
- text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar)); |
- |
- /* text 1: each of strings[] once */ |
- length=0; |
- for(i=0; i<UPRV_LENGTHOF(strings); ++i) { |
- u_memcpy(text+length, strings[i].s, strings[i].length); |
- length+=strings[i].length; |
- } |
- roundtripBOCU1(bocu1, 1, text, length); |
- |
- /* text 2: each of strings[] twice */ |
- length=0; |
- for(i=0; i<UPRV_LENGTHOF(strings); ++i) { |
- u_memcpy(text+length, strings[i].s, strings[i].length); |
- length+=strings[i].length; |
- u_memcpy(text+length, strings[i].s, strings[i].length); |
- length+=strings[i].length; |
- } |
- roundtripBOCU1(bocu1, 2, text, length); |
- |
- /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */ |
- length=0; |
- for(i=1; length<5000; i+=7) { |
- if(i>=UPRV_LENGTHOF(strings)) { |
- i-=UPRV_LENGTHOF(strings); |
- } |
- u_memcpy(text+length, strings[i].s, strings[i].length); |
- length+=strings[i].length; |
- } |
- roundtripBOCU1(bocu1, 3, text, length); |
- |
- ucnv_close(bocu1); |
- free(text); |
-} |
- |
-U_CFUNC void addBOCU1Tests(TestNode** root); |
- |
-U_CFUNC void |
-addBOCU1Tests(TestNode** root) { |
- addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff"); |
- addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1"); |
-} |