source/test/cintltst/bocu1tst.c - Issue 2435373002: Delete source/test

Side by Side Diff: source/test/cintltst/bocu1tst.c

Issue 2435373002: Delete source/test (Closed)

Patch Set: Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /*

2 ******************************************************************************

3 *

4 * Copyright (C) 2002-2015, International Business Machines

5 * Corporation and others. All Rights Reserved.

6 *

7 ******************************************************************************

8 * file name: bocu1tst.c

9 * encoding: US-ASCII

10 * tab size: 8 (not used)

11 * indentation:4

12 *

13 * created on: 2002may27

14 * created by: Markus W. Scherer

15 *

16 * This is the reference implementation of BOCU-1,

17 * the MIME-friendly form of the Binary Ordered Compression for Unicode,

18 * taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/

19 * The files bocu1.h and bocu1.c from the design folder are taken

20 * verbatim (minus copyright and #include) and copied together into this file.

21 * The reference code and some of the reference bocu1tst.c

22 * is modified to run as part of the ICU cintltst

23 * test framework (minus main(), log_ln() etc. instead of printf()).

24 *

25 * This reference implementation is used here to verify

26 * the ICU BOCU-1 implementation, which is

27 * adapted for ICU conversion APIs and optimized.

28 * ### links in design doc to here and to ucnvbocu.c

29 */

30

31 #include "unicode/utypes.h"

32 #include "unicode/ustring.h"

33 #include "unicode/ucnv.h"

34 #include "unicode/utf16.h"

35 #include "cmemory.h"

36 #include "cintltst.h"

37

38 /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */

39

40 /* BOCU-1 constants and macros ---------------------------------------------- */

41

42 /*

43 * BOCU-1 encodes the code points of a Unicode string as

44 * a sequence of byte-encoded differences (slope detection),

45 * preserving lexical order.

46 *

47 * Optimize the difference-taking for runs of Unicode text within

48 * small scripts:

49 *

50 * Most small scripts are allocated within aligned 128-blocks of Unicode

51 * code points. Lexical order is preserved if the "previous code point" state

52 * is always moved into the middle of such a block.

53 *

54 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul

55 * areas into the middle of those areas.

56 *

57 * C0 control codes and space are encoded with their US-ASCII bytes.

58 * "prev" is reset for C0 controls but not for space.

59 */

60

/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe

/* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */
#define BOCU1_MAX_TRAIL         0xffL
#define BOCU1_RESET             0xff

/* number of lead bytes */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values. */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form:
 * values below 0x04000000 carry the length (0..3) in their top byte,
 * anything else is a 4-byte sequence whose top byte is the lead byte.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)

128

/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These controls are
 *  0   NUL
 *
 *  7   BEL
 *  8   BS
 *
 *  9   TAB
 *  a   LF
 *  b   VT
 *  c   FF
 *  d   CR
 *
 *  e   SO
 *  f   SI
 *
 * 1a   SUB
 * 1b   ESC
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 *
 * BOCU1_TRAIL_TO_BYTE maps a trail value t to its external byte:
 * values below BOCU1_TRAIL_CONTROLS_COUNT go through the bocu1TrailToByte[]
 * table, all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
 * Note that t is evaluated more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])

158

159 /*

160 * Byte value map for control codes,

161 * from external byte values 0x00..0x20

162 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.

163 * External byte values that are illegal as trail bytes are mapped to -1.

164 */

165 static const int8_t

166 bocu1ByteToTrail[BOCU1_MIN]={

167 /* 0 1 2 3 4 5 6 7 */

168 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,

169

170 /* 8 9 a b c d e f */

171 -1, -1, -1, -1, -1, -1, -1, -1,

172

173 /* 10 11 12 13 14 15 16 17 */

174 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,

175

176 /* 18 19 1a 1b 1c 1d 1e 1f */

177 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,

178

179 /* 20 */

180 -1

181 };

182

183 /*

184 * Byte value map for control codes,

185 * from trail byte values 0..19 (0..0x13) as used in the difference calculation

186 * to external byte values 0x00..0x20.

187 */

188 static const int8_t

189 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={

190 /* 0 1 2 3 4 5 6 7 */

191 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,

192

193 /* 8 9 a b c d e f */

194 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,

195

196 /* 10 11 12 13 */

197 0x1c, 0x1d, 0x1e, 0x1f

198 };

199

/**
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always FALSE.
 *
 * The body is wrapped in do { } while(0) so that the macro behaves like a
 * single statement; write a semicolon after the invocation.
 * n and d are evaluated more than once.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)

221

/*
 * State for BOCU-1 decoder function.
 *   prev  - "previous code point" state used for differencing
 *   count - number of trail bytes still expected (0: next byte is a lead byte)
 *   diff  - partial difference accumulated from the bytes seen so far
 */
struct Bocu1Rx {
    int32_t prev, count, diff;
};

typedef struct Bocu1Rx Bocu1Rx;

228

229 /* Function prototypes ------------------------------------------------------ */

230

231 /* see bocu1.c */

232 U_CFUNC int32_t

233 packDiff(int32_t diff);

234

235 U_CFUNC int32_t

236 encodeBocu1(int32_t *pPrev, int32_t c);

237

238 U_CFUNC int32_t

239 decodeBocu1(Bocu1Rx *pRx, uint8_t b);

240

241 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */

242

243 /* BOCU-1 implementation functions ------------------------------------------ */

244

245 /**

246 * Compute the next "previous" value for differencing

247 * from the current code point.

248 *

249 * @param c current code point, 0..0x10ffff

250 * @return "previous code point" state value

251 */

252 static int32_t

253 bocu1Prev(int32_t c) {

254 /* compute new prev */

255 if(0x3040<=c && c<=0x309f) {

256 /* Hiragana is not 128-aligned */

257 return 0x3070;

258 } else if(0x4e00<=c && c<=0x9fa5) {

259 /* CJK Unihan */

260 return 0x4e00-BOCU1_REACH_NEG_2;

261 } else if(0xac00<=c && c<=0xd7a3) {

262 /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compiler s) */

263 return ((int32_t)0xd7a3+(int32_t)0xac00)/2;

264 } else {

265 /* mostly small scripts */

266 return (c&~0x7f)+BOCU1_ASCII_PREV;

267 }

268 }

269

270 /**

271 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes

272 * and return a packed integer with them.

273 *

274 * The encoding favors small absolut differences with short encodings

275 * to compress runs of same-script characters.

276 *

277 * @param diff difference value -0x10ffff..0x10ffff

278 * @return

279 * 0x010000zz for 1-byte sequence zz

280 * 0x0200yyzz for 2-byte sequence yy zz

281 * 0x03xxyyzz for 3-byte sequence xx yy zz

282 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)

283 */

284 U_CFUNC int32_t

285 packDiff(int32_t diff) {

286 int32_t result, m, lead, count, shift;

287

288 if(diff>=BOCU1_REACH_NEG_1) {

289 /* mostly positive differences, and single-byte negative ones */

290 if(diff<=BOCU1_REACH_POS_1) {

291 /* single byte */

292 return 0x01000000\|(BOCU1_MIDDLE+diff);

293 } else if(diff<=BOCU1_REACH_POS_2) {

294 /* two bytes */

295 diff-=BOCU1_REACH_POS_1+1;

296 lead=BOCU1_START_POS_2;

297 count=1;

298 } else if(diff<=BOCU1_REACH_POS_3) {

299 /* three bytes */

300 diff-=BOCU1_REACH_POS_2+1;

301 lead=BOCU1_START_POS_3;

302 count=2;

303 } else {

304 /* four bytes */

305 diff-=BOCU1_REACH_POS_3+1;

306 lead=BOCU1_START_POS_4;

307 count=3;

308 }

309 } else {

310 /* two- and four-byte negative differences */

311 if(diff>=BOCU1_REACH_NEG_2) {

312 /* two bytes */

313 diff-=BOCU1_REACH_NEG_1;

314 lead=BOCU1_START_NEG_2;

315 count=1;

316 } else if(diff>=BOCU1_REACH_NEG_3) {

317 /* three bytes */

318 diff-=BOCU1_REACH_NEG_2;

319 lead=BOCU1_START_NEG_3;

320 count=2;

321 } else {

322 /* four bytes */

323 diff-=BOCU1_REACH_NEG_3;

324 lead=BOCU1_START_NEG_4;

325 count=3;

326 }

327 }

328

329 /* encode the length of the packed result */

330 if(count<3) {

331 result=(count+1)<<24;

332 } else /* count==3, MSB used for the lead byte */ {

333 result=0;

334 }

335

336 /* calculate trail bytes like digits in itoa() */

337 shift=0;

338 do {

339 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);

340 result\|=BOCU1_TRAIL_TO_BYTE(m)<<shift;

341 shift+=8;

342 } while(--count>0);

343

344 /* add lead byte */

345 result\|=(lead+diff)<<shift;

346

347 return result;

348 }

349

350 /**

351 * BOCU-1 encoder function.

352 *

353 * @param pPrev pointer to the integer that holds

354 * the "previous code point" state;

355 * the initial value should be 0 which

356 * encodeBocu1 will set to the actual BOCU-1 initial state value

357 * @param c the code point to encode

358 * @return the packed 1/2/3/4-byte encoding, see packDiff(),

359 * or 0 if an error occurs

360 *

361 * @see packDiff

362 */

363 U_CFUNC int32_t

364 encodeBocu1(int32_t *pPrev, int32_t c) {

365 int32_t prev;

366

367 if(pPrev==NULL \|\| c<0 \|\| c>0x10ffff) {

368 /* illegal argument */

369 return 0;

370 }

371

372 prev=*pPrev;

373 if(prev==0) {

374 /* lenient handling of initial value 0 */

375 prev=*pPrev=BOCU1_ASCII_PREV;

376 }

377

378 if(c<=0x20) {

379 /*

380 * ISO C0 control & space:

381 * Encode directly for MIME compatibility,

382 * and reset state except for space, to not disrupt compression.

383 */

384 if(c!=0x20) {

385 *pPrev=BOCU1_ASCII_PREV;

386 }

387 return 0x01000000\|c;

388 }

389

390 /*

391 * all other Unicode code points c==U+0021..U+10ffff

392 * are encoded with the difference c-prev

393 *

394 * a new prev is computed from c,

395 * placed in the middle of a 0x80-block (for most small scripts) or

396 * in the middle of the Unihan and Hangul blocks

397 * to statistically minimize the following difference

398 */

399 *pPrev=bocu1Prev(c);

400 return packDiff(c-prev);

401 }

402

403 /**

404 * Function for BOCU-1 decoder; handles multi-byte lead bytes.

405 *

406 * @param pRx pointer to the decoder state structure

407 * @param b lead byte;

408 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LE AD

409 * @return -1 (state change only)

410 *

411 * @see decodeBocu1

412 */

413 static int32_t

414 decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {

415 int32_t c, count;

416

417 if(b>=BOCU1_START_NEG_2) {

418 /* positive difference */

419 if(b<BOCU1_START_POS_3) {

420 /* two bytes */

421 c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1 +1;

422 count=1;

423 } else if(b<BOCU1_START_POS_4) {

424 /* three bytes */

425 c=((int32_t)b-BOCU1_START_POS_3)BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT +BOCU1_REACH_POS_2+1;

426 count=2;

427 } else {

428 /* four bytes */

429 c=BOCU1_REACH_POS_3+1;

430 count=3;

431 }

432 } else {

433 /* negative difference */

434 if(b>=BOCU1_START_NEG_3) {

435 /* two bytes */

436 c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1 ;

437 count=1;

438 } else if(b>BOCU1_MIN) {

439 /* three bytes */

440 c=((int32_t)b-BOCU1_START_NEG_3)BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT +BOCU1_REACH_NEG_2;

441 count=2;

442 } else {

443 /* four bytes */

444 c=-BOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT+BOCU1_REACH _NEG_3;

445 count=3;

446 }

447 }

448

449 /* set the state for decoding the trail byte(s) */

450 pRx->diff=c;

451 pRx->count=count;

452 return -1;

453 }

454

455 /**

456 * Function for BOCU-1 decoder; handles multi-byte trail bytes.

457 *

458 * @param pRx pointer to the decoder state structure

459 * @param b trail byte

460 * @return result value, same as decodeBocu1

461 *

462 * @see decodeBocu1

463 */

464 static int32_t

465 decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {

466 int32_t t, c, count;

467

468 if(b<=0x20) {

469 /* skip some C0 controls and make the trail byte range contiguous */

470 t=bocu1ByteToTrail[b];

471 if(t<0) {

472 /* illegal trail byte value */

473 pRx->prev=BOCU1_ASCII_PREV;

474 pRx->count=0;

475 return -99;

476 }

477 #if BOCU1_MAX_TRAIL<0xff

478 } else if(b>BOCU1_MAX_TRAIL) {

479 return -99;

480 #endif

481 } else {

482 t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;

483 }

484

485 /* add trail byte into difference and decrement count */

486 c=pRx->diff;

487 count=pRx->count;

488

489 if(count==1) {

490 /* final trail byte, deliver a code point */

491 c=pRx->prev+c+t;

492 if(0<=c && c<=0x10ffff) {

493 /* valid code point result */

494 pRx->prev=bocu1Prev(c);

495 pRx->count=0;

496 return c;

497 } else {

498 /* illegal code point result */

499 pRx->prev=BOCU1_ASCII_PREV;

500 pRx->count=0;

501 return -99;

502 }

503 }

504

505 /* intermediate trail byte */

506 if(count==2) {

507 pRx->diff=c+t*BOCU1_TRAIL_COUNT;

508 } else /* count==3 */ {

509 pRx->diff=c+tBOCU1_TRAIL_COUNTBOCU1_TRAIL_COUNT;

510 }

511 pRx->count=count-1;

512 return -1;

513 }

514

515 /**

516 * BOCU-1 decoder function.

517 *

518 * @param pRx pointer to the decoder state structure;

519 * the initial values should be 0 which

520 * decodeBocu1 will set to actual initial state values

521 * @param b an input byte

522 * @return

523 * 0..0x10ffff for a result code point

524 * -1 if only the state changed without code point output

525 * <-1 if an error occurs

526 */

527 U_CFUNC int32_t

528 decodeBocu1(Bocu1Rx *pRx, uint8_t b) {

529 int32_t prev, c, count;

530

531 if(pRx==NULL) {

532 /* illegal argument */

533 return -99;

534 }

535

536 prev=pRx->prev;

537 if(prev==0) {

538 /* lenient handling of initial 0 values */

539 prev=pRx->prev=BOCU1_ASCII_PREV;

540 count=pRx->count=0;

541 } else {

542 count=pRx->count;

543 }

544

545 if(count==0) {

546 /* byte in lead position */

547 if(b<=0x20) {

548 /*

549 * Direct-encoded C0 control code or space.

550 * Reset prev for C0 control codes but not for space.

551 */

552 if(b!=0x20) {

553 pRx->prev=BOCU1_ASCII_PREV;

554 }

555 return b;

556 }

557

558 /*

559 * b is a difference lead byte.

560 *

561 * Return a code point directly from a single-byte difference.

562 *

563 * For multi-byte difference lead bytes, set the decoder state

564 * with the partial difference value from the lead byte and

565 * with the number of trail bytes.

566 *

567 * For four-byte differences, the signedness also affects the

568 * first trail byte, which has special handling farther below.

569 */

570 if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {

571 /* single-byte difference */

572 c=prev+((int32_t)b-BOCU1_MIDDLE);

573 pRx->prev=bocu1Prev(c);

574 return c;

575 } else if(b==BOCU1_RESET) {

576 /* only reset the state, no code point */

577 pRx->prev=BOCU1_ASCII_PREV;

578 return -1;

579 } else {

580 return decodeBocu1LeadByte(pRx, b);

581 }

582 } else {

583 /* trail byte in any position */

584 return decodeBocu1TrailByte(pRx, b);

585 }

586 }

587

588 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */

589

590 /* test code ---------------------------------------------------------------- */

591

592 /* test code options */

593

/* ignore comma when processing name lists in testText() */
#define TEST_IGNORE_COMMA       1

596

/**
 * Write a packed BOCU-1 byte sequence into a byte array,
 * without overflow check.
 * Test function.
 *
 * A packed value whose length field is 0 (such as the error value 0 from
 * encodeBocu1()) writes nothing and returns 0.
 *
 * @param packed packed BOCU-1 byte sequence, see packDiff()
 * @param p pointer to byte array
 * @return number of bytes
 *
 * @see packDiff
 */
static int32_t
writePacked(int32_t packed, uint8_t *p) {
    /* extract bytes from the unsigned bit pattern: packed is negative for lead bytes >=0x80 */
    uint32_t bits=(uint32_t)packed;
    int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);

    /* each case writes its byte and deliberately continues with the next lower one */
    switch(count) {
    case 4:
        *p++=(uint8_t)(bits>>24);
        /* fall through */
    case 3:
        *p++=(uint8_t)(bits>>16);
        /* fall through */
    case 2:
        *p++=(uint8_t)(bits>>8);
        /* fall through */
    case 1:
        *p++=(uint8_t)bits;
        /* fall through */
    default:
        break;
    }

    return count;
}

626

627 /**

628 * Unpack a packed BOCU-1 non-C0/space byte sequence and get

629 * the difference to initialPrev.

630 * Used only for round-trip testing of the difference encoding and decoding.

631 * Test function.

632 *

633 * @param initialPrev bogus "previous code point" value to make sure that

634 * the resulting code point is in the range 0..0x10ffff

635 * @param packed packed BOCU-1 byte sequence

636 * @return the difference to initialPrev

637 *

638 * @see packDiff

639 * @see writeDiff

640 */

641 static int32_t

642 unpackDiff(int32_t initialPrev, int32_t packed) {

643 Bocu1Rx rx={ 0, 0, 0 };

644 int32_t count;

645

646 rx.prev=initialPrev;

647 count=BOCU1_LENGTH_FROM_PACKED(packed);

648 switch(count) {

649 case 4:

650 decodeBocu1(&rx, (uint8_t)(packed>>24));

651 case 3:

652 decodeBocu1(&rx, (uint8_t)(packed>>16));

653 case 2:

654 decodeBocu1(&rx, (uint8_t)(packed>>8));

655 case 1:

656 /* subtract initial prev */

657 return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;

658 default:

659 return -0x7fffffff;

660 }

661 }

662

/**
 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order.
 * Also checks for roundtripping of the difference encoding
 * and reports a mismatch with log_err().
 * Test function.
 *
 * @param diff difference value to test, -0x10ffff..0x10ffff
 * @param p pointer to output byte array
 * @return p advanced by number of bytes output
 *
 * @see unpackDiff
 */
static uint8_t *
writeDiff(int32_t diff, uint8_t *p) {
    /* generate the difference as a packed value and serialize it */
    int32_t packed, initialPrev, roundtrip;

    packed=packDiff(diff);

    /*
     * bogus initial "prev" to work around
     * code point range check in decodeBocu1()
     */
    if(diff<=0) {
        initialPrev=0x10ffff;
    } else {
        initialPrev=-1;
    }

    roundtrip=unpackDiff(initialPrev, packed);
    if(diff!=roundtrip) {
        /* the format uses %ld/%lx, so pass long-sized arguments */
        log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
                (long)diff, (unsigned long)(uint32_t)packed, (long)roundtrip);
    }
    return p+writePacked(packed, p);
}

698

699 /**

700 * Encode a UTF-16 string in BOCU-1.

701 * Does not check for overflows, but otherwise useful function.

702 *

703 * @param s input UTF-16 string

704 * @param length number of UChar code units in s

705 * @param p pointer to output byte array

706 * @return number of bytes output

707 */

708 static int32_t

709 writeString(const UChar s, int32_t length, uint8_t p) {

710 uint8_t *p0;

711 int32_t c, prev, i;

712

713 prev=0;

714 p0=p;

715 i=0;

716 while(i<length) {

717 U16_NEXT(s, i, length, c);

718 p+=writePacked(encodeBocu1(&prev, c), p);

719 }

720 return (int32_t)(p-p0);

721 }

722

723 /**

724 * Decode a BOCU-1 byte sequence to a UTF-16 string.

725 * Does not check for overflows, but otherwise useful function.

726 *

727 * @param p pointer to input BOCU-1 bytes

728 * @param length number of input bytes

729 * @param s point to output UTF-16 string array

730 * @return number of UChar code units output

731 */

732 static int32_t

733 readString(const uint8_t p, int32_t length, UChar s) {

734 Bocu1Rx rx={ 0, 0, 0 };

735 int32_t c, i, sLength;

736

737 i=sLength=0;

738 while(i<length) {

739 c=decodeBocu1(&rx, p[i++]);

740 if(c<-1) {

741 log_err("error: readString detects encoding error at string index %l d\n", i);

742 return -1;

743 }

744 if(c>=0) {

745 U16_APPEND_UNSAFE(s, sLength, c);

746 }

747 }

748 return sLength;

749 }

750

/**
 * Convert a value 0..15 to its lowercase hexadecimal digit character.
 *
 * @param digit value to convert, expected in the range 0..15
 * @return '0'..'9' for 0..9, 'a'..'f' for 10..15
 */
static char
hexDigit(uint8_t digit) {
    return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
}

755

/**
 * Pretty-print 0-terminated byte values.
 * Helper function for test output.
 *
 * Each byte is written as a space followed by two lowercase hex digits.
 * Output for fewer than 5 bytes is padded with spaces to the width of 5
 * bytes; the result is NUL-terminated.
 * There is no bounds check on out: it needs 3 chars per byte, but at least
 * 15, plus the terminator.
 *
 * @param bytes 0-terminated byte array to print
 * @param out output character buffer
 */
static void
printBytes(const uint8_t *bytes, char *out) {
    int i;
    uint8_t b;

    i=0;
    while((b=*bytes++)!=0) {
        *out++=' ';
        *out++=hexDigit((uint8_t)(b>>4));
        *out++=hexDigit((uint8_t)(b&0xf));
        ++i;
    }
    /* pad to the width of 5 printed bytes */
    i=3*(5-i);
    while(i>0) {
        *out++=' ';
        --i;
    }
    *out=0;
}

781

782 /**

783 * Basic BOCU-1 test function, called when there are no command line arguments.

784 * Prints some of the #define values and performs round-trip tests of the

785 * difference encoding and decoding.

786 */

787 static void

788 TestBOCU1RefDiff(void) {

789 char buf1[80], buf2[80];

790 uint8_t prev[5], level[5];

791 int32_t i, cmp, countErrors;

792

793 log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_ NEG_1);

794 log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_ NEG_2);

795 log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REAC H_NEG_3);

796

797 log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_ REACH_NEG_1, BOCU1_REACH_POS_1);

798 log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_ REACH_NEG_2, BOCU1_REACH_POS_2);

799 log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU 1_REACH_NEG_3, BOCU1_REACH_POS_3);

800

801 log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE);

802 log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BO CU1_START_NEG_2, BOCU1_START_POS_2);

803 log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);

804

805 /* test packDiff() & unpackDiff() with some specific values */

806 writeDiff(0, level);

807 writeDiff(1, level);

808 writeDiff(65, level);

809 writeDiff(130, level);

810 writeDiff(30000, level);

811 writeDiff(1000000, level);

812 writeDiff(-65, level);

813 writeDiff(-130, level);

814 writeDiff(-30000, level);

815 writeDiff(-1000000, level);

816

817 /* test that each value is smaller than any following one */

818 countErrors=0;

819 i=-0x10ffff;

820 *writeDiff(i, prev)=0;

821

822 /* show first number and bytes */

823 printBytes(prev, buf1);

824 log_verbose(" wD(%8ld) %s\n", i, buf1);

825

826 for(++i; i<=0x10ffff; ++i) {

827 *writeDiff(i, level)=0;

828 cmp=strcmp((const char )prev, (const char )level);

829 if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level )) {

830 log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDif f(%ld))\n",

831 level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const cha r *)level), i);

832 }

833 if(cmp<0) {

834 if(i==0 \|\| i==1 \|\| strlen((const char )prev)!=strlen((const char ) level)) {

835 /*

836 * if the result is good, then print only if the length changed

837 * to get little but interesting output

838 */

839 printBytes(prev, buf1);

840 printBytes(level, buf2);

841 log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1 , i, cmp, buf1, buf2);

842 }

843 } else {

844 ++countErrors;

845 printBytes(prev, buf1);

846 printBytes(level, buf2);

847 log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2);

848 }

849 /* remember the previous bytes */

850 memcpy(prev, level, 4);

851 }

852

853 /* show last number and bytes */

854 printBytes((uint8_t *)"", buf1);

855 printBytes(prev, buf2);

856 log_verbose(" wD(%8ld) %s%s\n", i-1, buf1, b uf2);

857

858 if(countErrors==0) {

859 log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");

860 } else {

861 log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);

862 }

863

864 /* output signature byte sequence */

865 i=0;

866 writePacked(encodeBocu1(&i, 0xfeff), level);

867 log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",

868 level[0], level[1], level[2]);

869 }

870

871 /* cintltst code ------------------------------------------------------------ */

872

/* capacity, in elements, of the heap buffers used by the round-trip tests */
static const int32_t DEFAULT_BUFFER_SIZE = 30000;

874

875

876 /* test one string with the ICU and the reference BOCU-1 implementations */

877 static void

878 roundtripBOCU1(UConverter bocu1, int32_t number, const UChar text, int32_t len gth) {

879 UChar roundtripRef, roundtripICU;

880 char bocu1Ref, bocu1ICU;

881

882 int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULeng th;

883 UErrorCode errorCode;

884

885 roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));

886 roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));

887 bocu1Ref = malloc(DEFAULT_BUFFER_SIZE);

888 bocu1ICU = malloc(DEFAULT_BUFFER_SIZE);

889

890 /* Unicode -> BOCU-1 */

891 bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);

892

893 errorCode=U_ZERO_ERROR;

894 bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, l ength, &errorCode);

895 if(U_FAILURE(errorCode)) {

896 log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, le ngth, u_errorName(errorCode));

897 goto cleanup;

898 }

899

900 if(bocu1RefLength!=bocu1ICULength \|\| 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu 1RefLength)) {

901 log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, l ength, bocu1RefLength, bocu1ICULength);

902 goto cleanup;

903 }

904

905 /* BOCU-1 -> Unicode */

906 roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtrip Ref);

907 if(roundtripRefLength<0) {

908 goto cleanup; /* readString() found an error and reported it */

909 }

910

911 roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, b ocu1ICU, bocu1ICULength, &errorCode);

912 if(U_FAILURE(errorCode)) {

913 log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, leng th, u_errorName(errorCode));

914 goto cleanup;

915 }

916

917 if(length!=roundtripRefLength \|\| 0!=u_memcmp(text, roundtripRef, length)) {

918 log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);

919 goto cleanup;

920 }

921 if(roundtripRefLength!=roundtripICULength \|\| 0!=u_memcmp(roundtripRef, round tripICU, roundtripRefLength)) {

922 log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, round tripRefLength, roundtripICULength);

923 goto cleanup;

924 }

925 cleanup:

926 free(roundtripRef);

927 free(roundtripICU);

928 free(bocu1Ref);

929 free(bocu1ICU);

930 }

931

932 static const UChar feff[]={ 0xfeff };

933 static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };

934 static const UChar crlf[]={ 0xd, 0xa, 0x20 };

935 static const UChar nul[]={ 0 };

936 static const UChar latin[]={ 0xdf, 0xe6 };

937 static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };

938 static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };

939 static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };

940 static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };

941 static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatc hed! */

942 static const UChar plane1[]={ 0xd800, 0xdc00 };

943 static const UChar plane2[]={ 0xd845, 0xdddd };

944 static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };

945 static const UChar plane16[]={ 0xdbff, 0xdfff };

946 static const UChar c0[]={ 1, 0xe40, 0x20, 9 };

947

948 static const struct {

949 const UChar *s;

950 int32_t length;

951 } strings[]={

952 { feff, UPRV_LENGTHOF(feff) },

953 { ascii, UPRV_LENGTHOF(ascii) },

954 { crlf, UPRV_LENGTHOF(crlf) },

955 { nul, UPRV_LENGTHOF(nul) },

956 { latin, UPRV_LENGTHOF(latin) },

957 { devanagari, UPRV_LENGTHOF(devanagari) },

958 { hiragana, UPRV_LENGTHOF(hiragana) },

959 { unihan, UPRV_LENGTHOF(unihan) },

960 { hangul, UPRV_LENGTHOF(hangul) },

961 { surrogates, UPRV_LENGTHOF(surrogates) },

962 { plane1, UPRV_LENGTHOF(plane1) },

963 { plane2, UPRV_LENGTHOF(plane2) },

964 { plane15, UPRV_LENGTHOF(plane15) },

965 { plane16, UPRV_LENGTHOF(plane16) },

966 { c0, UPRV_LENGTHOF(c0) }

967 };

968

969 /*

970 * Verify that the ICU BOCU-1 implementation produces the same results as

971 * the reference implementation from the design folder.

972 * Generate some texts and convert them with both converters, verifying

973 * identical results and roundtripping.

974 */

975 static void

976 TestBOCU1(void) {

977 UChar *text;

978 int32_t i, length;

979

980 UConverter *bocu1;

981 UErrorCode errorCode;

982

983 errorCode=U_ZERO_ERROR;

984 bocu1=ucnv_open("BOCU-1", &errorCode);

985 if(U_FAILURE(errorCode)) {

986 log_data_err("error: unable to open BOCU-1 converter: %s\n", u_errorName (errorCode));

987 return;

988 }

989

990 text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));

991

992 /* text 1: each of strings[] once */

993 length=0;

994 for(i=0; i<UPRV_LENGTHOF(strings); ++i) {

995 u_memcpy(text+length, strings[i].s, strings[i].length);

996 length+=strings[i].length;

997 }

998 roundtripBOCU1(bocu1, 1, text, length);

999

1000 /* text 2: each of strings[] twice */

1001 length=0;

1002 for(i=0; i<UPRV_LENGTHOF(strings); ++i) {

1003 u_memcpy(text+length, strings[i].s, strings[i].length);

1004 length+=strings[i].length;

1005 u_memcpy(text+length, strings[i].s, strings[i].length);

1006 length+=strings[i].length;

1007 }

1008 roundtripBOCU1(bocu1, 2, text, length);

1009

1010 /* text 3: each of strings[] many times (set step vs. \|strings\| so that all strings are used) */

1011 length=0;

1012 for(i=1; length<5000; i+=7) {

1013 if(i>=UPRV_LENGTHOF(strings)) {

1014 i-=UPRV_LENGTHOF(strings);

1015 }

1016 u_memcpy(text+length, strings[i].s, strings[i].length);

1017 length+=strings[i].length;

1018 }

1019 roundtripBOCU1(bocu1, 3, text, length);

1020

1021 ucnv_close(bocu1);

1022 free(text);

1023 }

1024

1025 U_CFUNC void addBOCU1Tests(TestNode** root);

1026

1027 U_CFUNC void

1028 addBOCU1Tests(TestNode** root) {

1029 addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");

1030 addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");

1031 }

OLD	NEW

« no previous file with comments | « source/test/cintltst/Makefile.in ('k') | source/test/cintltst/callcoll.h » ('j') | no next file with comments »