source/test/perf/collationperf/collperf.cpp - Issue 2435373002: Delete source/test

Side by Side Diff: source/test/perf/collationperf/collperf.cpp

Issue 2435373002: Delete source/test (Closed)

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /********************************************************************

2 * COPYRIGHT:

3 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.

4 *

5 ********************************************************************/

6 /******************************************************************************* *

7 *

8 * File CALLCOLL.C

9 *

10 * Modification History:

11 * Name Description

12 * Andy Heninger First Version

13 *

14 ******************************************************************************** *

15 */

16

17 //

18 // This program tests string collation and sort key generation performance.

19 // Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows CompareString/LCMapString.

20 // A file of names is required as input, one per line. It must be in utf-8 or utf-16 format,

21 // and include a byte order mark. Either LE or BE format is OK.

22 //

23

// Help text describing every command line option of the tool.
// One line per option: the switch (and its argument, if any) followed by a
// description aligned in a common column; continuation lines are indented
// past that column.
const char gUsageString[] =
    "usage:  collperf options...\n"
    "-help                      Display this message.\n"
    "-file file_name            utf-16 format file of names.\n"
    "-locale name               ICU locale to use. Default is en_US\n"
    "-rules file_name           Collation rules file (overrides locale)\n"
    "-langid 0x1234             Windows Language ID number. Default to value for -locale option\n"
    "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win                       Run test using Windows native services. (ICU is default)\n"
    "-unix                      Run test using Unix strxfrm, strcoll services.\n"
    "-uselen                    Use API with string lengths. Default is null-terminated strings\n"
    "-usekeys                   Run tests using sortkeys rather than strcoll\n"
    "-strcmp                    Run tests using u_strcmp rather than strcoll\n"
    "-strcmpCPO                 Run tests using u_strcmpCodePointOrder rather than strcoll\n"
    "-loop nnnn                 Loopcount for test. Adjust for reasonable total running time.\n"
    "-iloop n                   Inner Loop Count. Default = 1. Number of calls to function\n"
    "                              under test at each call point. For measuring test overhead.\n"
    "-terse                     Terse numbers-only output. Intended for use by scripts.\n"
    "-french                    French accent ordering\n"
    "-frenchoff                 No French accent ordering (for use with French locales.)\n"
    "-norm                      Normalizing mode on\n"
    "-shifted                   Shifted mode\n"
    "-lower                     Lower case first\n"
    "-upper                     Upper case first\n"
    "-case                      Enable separate case level\n"
    "-level n                   Sort level, 1 to 5, for Primary, Secondary, Tertiary, Quaternary, Identical\n"
    "-keyhist                   Produce a table sort key size vs. string length\n"
    "-binsearch                 Binary Search timing test\n"
    "-keygen                    Sort Key Generation timing test\n"
    "-qsort                     Quicksort timing test\n"
    "-iter                      Iteration Performance Test\n"
    "-dump                      Display strings, sort keys and CEs.\n"
    ;

57

58

59

60 #include <stdio.h>

61 #include <string.h>

62 #include <stdlib.h>

63 #include <math.h>

64 #include <locale.h>

65 #include <errno.h>

66

67 #include <unicode/utypes.h>

68 #include <unicode/ucol.h>

69 #include <unicode/ucoleitr.h>

70 #include <unicode/uloc.h>

71 #include <unicode/ustring.h>

72 #include <unicode/ures.h>

73 #include <unicode/uchar.h>

74 #include <unicode/ucnv.h>

75 #include <unicode/utf8.h>

76

77 #ifdef WIN32

78 #include <windows.h>

79 #else

80 //

81 // Stubs for Windows API functions when building on UNIXes.

82 //

83 typedef int DWORD;

84 inline int CompareStringW(DWORD, DWORD, UChar , int, UChar , int) {return 0;}

85 #include <sys/time.h>

86 unsigned long timeGetTime() {

87 struct timeval t;

88 gettimeofday(&t, 0);

89 unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares.

90 val += t.tv_usec / 1000;

91 return val;

92 }

93 inline int LCMapStringW(DWORD, DWORD, UChar , int, UChar , int) {return 0;}

94 const int LCMAP_SORTKEY = 0;

95 #define MAKELCID(a,b) 0

96 const int SORT_DEFAULT = 0;

97 #endif

98

99

100

101 //

102 // Command line option variables

103 // These global variables are set according to the options specified

104 // on the command line by the user.

105 char * opt_fName = 0;

106 const char * opt_locale = "en_US";

107 int opt_langid = 0; // Defaults to value corresponding to opt_loc ale.

108 char * opt_rules = 0;

109 UBool opt_help = FALSE;

110 int opt_loopCount = 1;

111 int opt_iLoopCount = 1;

112 UBool opt_terse = FALSE;

113 UBool opt_qsort = FALSE;

114 UBool opt_binsearch = FALSE;

115 UBool opt_icu = TRUE;

116 UBool opt_win = FALSE; // Run with Windows native functions.

117 UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions.

118 UBool opt_uselen = FALSE;

119 UBool opt_usekeys = FALSE;

120 UBool opt_strcmp = FALSE;

121 UBool opt_strcmpCPO = FALSE;

122 UBool opt_norm = FALSE;

123 UBool opt_keygen = FALSE;

124 UBool opt_french = FALSE;

125 UBool opt_frenchoff = FALSE;

126 UBool opt_shifted = FALSE;

127 UBool opt_lower = FALSE;

128 UBool opt_upper = FALSE;

129 UBool opt_case = FALSE;

130 int opt_level = 0;

131 UBool opt_keyhist = FALSE;

132 UBool opt_itertest = FALSE;

133 UBool opt_dump = FALSE;

134

135

136

137 //

138 // Definitions for the command line options

139 //

// One entry of the command line option table.
struct OptSpec {
    const char *name;       // option text as typed by the user, e.g. "-loop"; 0 ends the table
    enum {
        FLAG,               // no argument; pVar is written as a UBool
        NUM,                // integer argument; pVar is written as an int
        STRING              // string argument; pVar is written as a const char *
    }           type;
    void       *pVar;       // address of the variable that receives the value
};

145

146 OptSpec opts[] = {

147 {"-file", OptSpec::STRING, &opt_fName},

148 {"-locale", OptSpec::STRING, &opt_locale},

149 {"-langid", OptSpec::NUM, &opt_langid},

150 {"-rules", OptSpec::STRING, &opt_rules},

151 {"-qsort", OptSpec::FLAG, &opt_qsort},

152 {"-binsearch", OptSpec::FLAG, &opt_binsearch},

153 {"-iter", OptSpec::FLAG, &opt_itertest},

154 {"-win", OptSpec::FLAG, &opt_win},

155 {"-unix", OptSpec::FLAG, &opt_unix},

156 {"-uselen", OptSpec::FLAG, &opt_uselen},

157 {"-usekeys", OptSpec::FLAG, &opt_usekeys},

158 {"-strcmp", OptSpec::FLAG, &opt_strcmp},

159 {"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO},

160 {"-norm", OptSpec::FLAG, &opt_norm},

161 {"-french", OptSpec::FLAG, &opt_french},

162 {"-frenchoff", OptSpec::FLAG, &opt_frenchoff},

163 {"-shifted", OptSpec::FLAG, &opt_shifted},

164 {"-lower", OptSpec::FLAG, &opt_lower},

165 {"-upper", OptSpec::FLAG, &opt_upper},

166 {"-case", OptSpec::FLAG, &opt_case},

167 {"-level", OptSpec::NUM, &opt_level},

168 {"-keyhist", OptSpec::FLAG, &opt_keyhist},

169 {"-keygen", OptSpec::FLAG, &opt_keygen},

170 {"-loop", OptSpec::NUM, &opt_loopCount},

171 {"-iloop", OptSpec::NUM, &opt_iLoopCount},

172 {"-terse", OptSpec::FLAG, &opt_terse},

173 {"-dump", OptSpec::FLAG, &opt_dump},

174 {"-help", OptSpec::FLAG, &opt_help},

175 {"-?", OptSpec::FLAG, &opt_help},

176 {0, OptSpec::FLAG, 0}

177 };

178

179

180 //---------------------------------------------------------------------------

181 //

182 // Global variables pointing to and describing the test file

183 //

184 //---------------------------------------------------------------------------

185

186 //

187 // struct Line

188 //

189 // Each line from the source file (containing a name, presumably) gets

190 // one of these structs.

191 //

192 struct Line {

193 UChar *name;

194 int len;

195 char *winSortKey;

196 char *icuSortKey;

197 char *unixSortKey;

198 char *unixName;

199 };

200

201

202

203 Line *gFileLines; // Ptr to array of Line structs, one per li ne in the file.

204 int gNumFileLines;

205 UCollator *gCol;

206 DWORD gWinLCID;

207

208 Line **gSortedLines;

209 Line **gRandomLines;

210 int gCount;

211

212

213

214 //---------------------------------------------------------------------------

215 //

216 // ProcessOptions() Function to read the command line options.

217 //

218 //---------------------------------------------------------------------------

219 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])

220 {

221 int i;

222 int argNum;

223 const char *pArgName;

224 OptSpec *pOpt;

225

226 for (argNum=1; argNum<argc; argNum++) {

227 pArgName = argv[argNum];

228 for (pOpt = opts; pOpt->name != 0; pOpt++) {

229 if (strcmp(pOpt->name, pArgName) == 0) {

230 switch (pOpt->type) {

231 case OptSpec::FLAG:

232 (UBool )(pOpt->pVar) = TRUE;

233 break;

234 case OptSpec::STRING:

235 argNum ++;

236 if (argNum >= argc) {

237 fprintf(stderr, "value expected for \"%s\" option.\n", p Opt->name);

238 return FALSE;

239 }

240 (const char *)(pOpt->pVar) = argv[argNum];

241 break;

242 case OptSpec::NUM:

243 argNum ++;

244 if (argNum >= argc) {

245 fprintf(stderr, "value expected for \"%s\" option.\n", p Opt->name);

246 return FALSE;

247 }

248 char *endp;

249 i = strtol(argv[argNum], &endp, 0);

250 if (endp == argv[argNum]) {

251 fprintf(stderr, "integer value expected for \"%s\" optio n.\n", pOpt->name);

252 return FALSE;

253 }

254 (int )(pOpt->pVar) = i;

255 }

256 break;

257 }

258 }

259 if (pOpt->name == 0)

260 {

261 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);

262 return FALSE;

263 }

264 }

265 return TRUE;

266 }

267

268 //------------------------------------------------------------------------------ ---------

269 //

270 // Comparison functions for use by qsort.

271 //

272 // Six flavors: ICU or Windows, SortKey or String Compare, Strings with length

273 // or null terminated.  A seventh, UNIXstrcmp, compares with Unix strcoll.

274 //

275 //------------------------------------------------------------------------------ ---------

276 int ICUstrcmpK(const void a, const void b) {

277 gCount++;

278 int t = strcmp(((Line )a)->icuSortKey, ((Line **)b)->icuSortKey);

279 return t;

280 }

281

282

283 int ICUstrcmpL(const void a, const void b) {

284 gCount++;

285 UCollationResult t;

286 t = ucol_strcoll(gCol, ((Line )a)->name, ((Line *)a)->len, ((Line *)b )->name, ((Line **)b)->len);

287 if (t == UCOL_LESS) return -1;

288 if (t == UCOL_GREATER) return +1;

289 return 0;

290 }

291

292

293 int ICUstrcmp(const void a, const void b) {

294 gCount++;

295 UCollationResult t;

296 t = ucol_strcoll(gCol, ((Line )a)->name, -1, ((Line **)b)->name, -1);

297 if (t == UCOL_LESS) return -1;

298 if (t == UCOL_GREATER) return +1;

299 return 0;

300 }

301

302

303 int Winstrcmp(const void a, const void b) {

304 gCount++;

305 int t;

306 t = CompareStringW(gWinLCID, 0, ((Line )a)->name, -1, ((Line **)b)->name , -1);

307 return t-2;

308 }

309

310

311 int UNIXstrcmp(const void a, const void b) {

312 gCount++;

313 int t;

314 t = strcoll(((Line )a)->unixName, ((Line **)b)->unixName);

315 return t;

316 }

317

318

319 int WinstrcmpL(const void a, const void b) {

320 gCount++;

321 int t;

322 t = CompareStringW(gWinLCID, 0, ((Line )a)->name, ((Line *)a)->len, (( Line *)b)->name, ((Line **)b)->len);

323 return t-2;

324 }

325

326

327 int WinstrcmpK(const void a, const void b) {

328 gCount++;

329 int t = strcmp(((Line )a)->winSortKey, ((Line **)b)->winSortKey);

330 return t;

331 }

332

333

334 //------------------------------------------------------------------------------ ---------

335 //

336 // Function for sorting the names (lines) into a random order.

337 // Order is based on a hash of the ICU Sort key for the lines

338 // The randomized order is used as input for the sorting timing tests.

339 //

340 //------------------------------------------------------------------------------ ---------

341 int ICURandomCmp(const void a, const void b) {

342 char ask = ((Line **)a)->icuSortKey;

343 char bsk = ((Line **)b)->icuSortKey;

344 int aVal = 0;

345 int bVal = 0;

346 int retVal;

347 while (*ask != 0) {

348 aVal += aVal37 + ask++;

349 }

350 while (*bsk != 0) {

351 bVal += bVal37 + bsk++;

352 }

353 retVal = -1;

354 if (aVal == bVal) {

355 retVal = 0;

356 }

357 else if (aVal > bVal) {

358 retVal = 1;

359 }

360 return retVal;

361 }

362

363 //------------------------------------------------------------------------------ ---------

364 //

365 // doKeyGen() Key Generation Timing Test

366 //

367 //------------------------------------------------------------------------------ ---------

368 void doKeyGen()

369 {

370 int line;

371 int loops = 0;

372 int iLoop;

373 int t;

374 int len=-1;

375

376 // Adjust loop count to compensate for file size. Should be order n

377 double dLoopCount = double(opt_loopCount) * (1000. / double(gNumFileLines)) ;

378 int adj_loopCount = int(dLoopCount);

379 if (adj_loopCount < 1) adj_loopCount = 1;

380

381

382 unsigned long startTime = timeGetTime();

383

384 if (opt_win) {

385 for (loops=0; loops<adj_loopCount; loops++) {

386 for (line=0; line < gNumFileLines; line++) {

387 if (opt_uselen) {

388 len = gFileLines[line].len;

389 }

390 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {

391 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY,

392 gFileLines[line].name, len,

393 (unsigned short *)gFileLines[line].winSortKey, 5000); // TODO something with length.

394 }

395 }

396 }

397 }

398 else if (opt_icu)

399 {

400 for (loops=0; loops<adj_loopCount; loops++) {

401 for (line=0; line < gNumFileLines; line++) {

402 if (opt_uselen) {

403 len = gFileLines[line].len;

404 }

405 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {

406 t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsig ned char *)gFileLines[line].icuSortKey, 5000);

407 }

408 }

409 }

410 }

411 else if (opt_unix)

412 {

413 for (loops=0; loops<adj_loopCount; loops++) {

414 for (line=0; line < gNumFileLines; line++) {

415 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {

416 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixN ame, 5000);

417 }

418 }

419 }

420 }

421

422 unsigned long elapsedTime = timeGetTime() - startTime;

423 int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*g NumFileLines));

424

425 if (opt_terse == FALSE) {

426 printf("Sort Key Generation: total # of keys = %d\n", loops*gNumFileLin es);

427 printf("Sort Key Generation: time per key = %d ns\n", ns);

428 }

429 else {

430 printf("%d, ", ns);

431 }

432

433 int totalKeyLen = 0;

434 int totalChars = 0;

435 for (line=0; line<gNumFileLines; line++) {

436 totalChars += u_strlen(gFileLines[line].name);

437 if (opt_win) {

438 totalKeyLen += strlen(gFileLines[line].winSortKey);

439 }

440 else if (opt_icu) {

441 totalKeyLen += strlen(gFileLines[line].icuSortKey);

442 }

443 else if (opt_unix) {

444 totalKeyLen += strlen(gFileLines[line].unixSortKey);

445 }

446

447 }

448 if (opt_terse == FALSE) {

449 printf("Key Length / character = %f\n", (float)totalKeyLen / (float)tota lChars);

450 } else {

451 printf("%f, ", (float)totalKeyLen / (float)totalChars);

452 }

453 }

454

455

456

457 //------------------------------------------------------------------------------ ---------

458 //

459 // doBinarySearch() Binary Search timing test. Each name from the list

460 // is looked up in the full sorted list of names.

461 //

462 //------------------------------------------------------------------------------ ---------

463 void doBinarySearch()

464 {

465

466 gCount = 0;

467 int line;

468 int loops = 0;

469 int iLoop = 0;

470 unsigned long elapsedTime = 0;

471

472 // Adjust loop count to compensate for file size. Should be order n (looku ps) * log n (compares/lookup)

473 // Accurate timings do not depend on this being perfect. The correction is just to try to

474 // get total running times of about the right order, so the that user does n't need to

475 // manually adjust the loop count for every different file size.

476 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileL ines) * double(gNumFileLines));

477 if (opt_usekeys) dLoopCount *= 5;

478 int adj_loopCount = int(dLoopCount);

479 if (adj_loopCount < 1) adj_loopCount = 1;

480

481

482 for (;;) { // not really a loop, just allows "break" to work, to simplify

483 // inadvertantly running more than one test through here.

484 if (opt_strcmp \|\| opt_strcmpCPO)

485 {

486 unsigned long startTime = timeGetTime();

487 typedef int32_t (U_EXPORT2 PF)(const UChar , const UChar *);

488 PF pf = u_strcmp;

489 if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}

490 //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;} // Damn the differ ence between int32_t and int

491 // which forces th e use of a cast here.

492

493 int r = 0;

494 for (loops=0; loops<adj_loopCount; loops++) {

495

496 for (line=0; line < gNumFileLines; line++) {

497 int hi = gNumFileLines-1;

498 int lo = 0;

499 int guess = -1;

500 for (;;) {

501 int newGuess = (hi + lo) / 2;

502 if (newGuess == guess)

503 break;

504 guess = newGuess;

505 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {

506 r = (*pf)((gSortedLines[line])->name, (gSortedLines[ guess])->name);

507 }

508 gCount++;

509 if (r== 0)

510 break;

511 if (r < 0)

512 hi = guess;

513 else

514 lo = guess;

515 }

516 }

517 }

518 elapsedTime = timeGetTime() - startTime;

519 break;

520 }

521

522

523 if (opt_icu)

524 {

525 unsigned long startTime = timeGetTime();

526 UCollationResult r = UCOL_EQUAL;

527 for (loops=0; loops<adj_loopCount; loops++) {

528

529 for (line=0; line < gNumFileLines; line++) {

530 int lineLen = -1;

531 int guessLen = -1;

532 if (opt_uselen) {

533 lineLen = (gSortedLines[line])->len;

534 }

535 int hi = gNumFileLines-1;

536 int lo = 0;

537 int guess = -1;

538 for (;;) {

539 int newGuess = (hi + lo) / 2;

540 if (newGuess == guess)

541 break;

542 guess = newGuess;

543 int ri = 0;

544 if (opt_usekeys) {

545 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {

546 ri = strcmp((gSortedLines[line])->icuSortKey, (g SortedLines[guess])->icuSortKey);

547 }

548 gCount++;

549 r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri= =0) {r=UCOL_EQUAL;}

550 }

551 else

552 {

553 if (opt_uselen) {

554 guessLen = (gSortedLines[guess])->len;

555 }

556 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {

557 r = ucol_strcoll(gCol, (gSortedLines[line])->nam e, lineLen, (gSortedLines[guess])->name, guessLen);

558 }

559 gCount++;

560 }

561 if (r== UCOL_EQUAL)

562 break;

563 if (r == UCOL_LESS)

564 hi = guess;

565 else

566 lo = guess;

567 }

568 }

569 }

570 elapsedTime = timeGetTime() - startTime;

571 break;

572 }

573

574 if (opt_win)

575 {

576 unsigned long startTime = timeGetTime();

577 int r = 0;

578 for (loops=0; loops<adj_loopCount; loops++) {

579

580 for (line=0; line < gNumFileLines; line++) {

581 int lineLen = -1;

582 int guessLen = -1;

583 if (opt_uselen) {

584 lineLen = (gSortedLines[line])->len;

585 }

586 int hi = gNumFileLines-1;

587 int lo = 0;

588 int guess = -1;

589 for (;;) {

590 int newGuess = (hi + lo) / 2;

591 if (newGuess == guess)

592 break;

593 guess = newGuess;

594 if (opt_usekeys) {

595 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {

596 r = strcmp((gSortedLines[line])->winSortKey, (gS ortedLines[guess])->winSortKey);

597 }

598 gCount++;

599 r+=2;

600 }

601 else

602 {

603 if (opt_uselen) {

604 guessLen = (gSortedLines[guess])->len;

605 }

606 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {

607 r = CompareStringW(gWinLCID, 0, (gSortedLines[li ne])->name, lineLen, (gSortedLines[guess])->name, guessLen);

608 }

609 if (r == 0) {

610 if (opt_terse == FALSE) {

611 fprintf(stderr, "Error returned from Windows CompareStringW.\n");

612 }

613 exit(-1);

614 }

615 gCount++;

616 }

617 if (r== 2) // strings ==

618 break;

619 if (r == 1) // line < guess

620 hi = guess;

621 else // line > guess

622 lo = guess;

623 }

624 }

625 }

626 elapsedTime = timeGetTime() - startTime;

627 break;

628 }

629

630 if (opt_unix)

631 {

632 unsigned long startTime = timeGetTime();

633 int r = 0;

634 for (loops=0; loops<adj_loopCount; loops++) {

635

636 for (line=0; line < gNumFileLines; line++) {

637 int hi = gNumFileLines-1;

638 int lo = 0;

639 int guess = -1;

640 for (;;) {

641 int newGuess = (hi + lo) / 2;

642 if (newGuess == guess)

643 break;

644 guess = newGuess;

645 if (opt_usekeys) {

646 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {

647 r = strcmp((gSortedLines[line])->unixSortKey, ( gSortedLines[guess])->unixSortKey);

648 }

649 gCount++;

650 }

651 else

652 {

653 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {

654 r = strcoll((gSortedLines[line])->unixName, (gSo rtedLines[guess])->unixName);

655 }

656 errno = 0;

657 if (errno != 0) {

658 fprintf(stderr, "Error %d returned from strcoll. \n", errno);

659 exit(-1);

660 }

661 gCount++;

662 }

663 if (r == 0) // strings ==

664 break;

665 if (r < 0) // line < guess

666 hi = guess;

667 else // line > guess

668 lo = guess;

669 }

670 }

671 }

672 elapsedTime = timeGetTime() - startTime;

673 break;

674 }

675 break;

676 }

677

678 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);

679 if (opt_terse == FALSE) {

680 printf("binary search: total # of string compares = %d\n", gCount);

681 printf("binary search: compares per loop = %d\n", gCount / loops);

682 printf("binary search: time per compare = %d ns\n", ns);

683 } else {

684 printf("%d, ", ns);

685 }

686

687 }

688

689

690

691

692 //------------------------------------------------------------------------------ ---------

693 //

694 // doQSort() The quick sort timing test. Uses the C library qsort function .

695 //

696 //------------------------------------------------------------------------------ ---------

697 void doQSort() {

698 int i;

699 Line *sortBuf = new Line [gNumFileLines];

700

701 // Adjust loop count to compensate for file size. QSort should be n log(n)

702 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileL ines) * double(gNumFileLines));

703 if (opt_usekeys) dLoopCount *= 5;

704 int adj_loopCount = int(dLoopCount);

705 if (adj_loopCount < 1) adj_loopCount = 1;

706

707

708 gCount = 0;

709 unsigned long startTime = timeGetTime();

710 if (opt_win && opt_usekeys) {

711 for (i=0; i<opt_loopCount; i++) {

712 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));

713 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);

714 }

715 }

716

717 else if (opt_win && opt_uselen) {

718 for (i=0; i<adj_loopCount; i++) {

719 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));

720 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);

721 }

722 }

723

724

725 else if (opt_win && !opt_uselen) {

726 for (i=0; i<adj_loopCount; i++) {

727 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));

728 qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);

729 }

730 }

731

732 else if (opt_icu && opt_usekeys) {

733 for (i=0; i<adj_loopCount; i++) {

734 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));

735 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);

736 }

737 }

738

739 else if (opt_icu && opt_uselen) {

740 for (i=0; i<adj_loopCount; i++) {

741 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));

742 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);

743 }

744 }

745

746

747 else if (opt_icu && !opt_uselen) {

748 for (i=0; i<adj_loopCount; i++) {

749 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));

750 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);

751 }

752 }

753

754 else if (opt_unix && !opt_usekeys) {

755 for (i=0; i<adj_loopCount; i++) {

756 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));

757 qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);

758 }

759 }

760

761 unsigned long elapsedTime = timeGetTime() - startTime;

762 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);

763 if (opt_terse == FALSE) {

764 printf("qsort: total # of string compares = %d\n", gCount);

765 printf("qsort: time per compare = %d ns\n", ns);

766 } else {

767 printf("%d, ", ns);

768 }

769 }

770

771

772

773 //------------------------------------------------------------------------------ ---------

774 //

775 // doKeyHist() Output a table of data for

776 // average sort key size vs. string length.

777 //

778 //------------------------------------------------------------------------------ ---------

779 void doKeyHist() {

780 int i;

781 int maxLen = 0;

782

783 // Find the maximum string length

784 for (i=0; i<gNumFileLines; i++) {

785 if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;

786 }

787

788 // Allocate arrays to hold the histogram data

789 int *accumulatedLen = new int[maxLen+1];

790 int *numKeysOfSize = new int[maxLen+1];

791 for (i=0; i<=maxLen; i++) {

792 accumulatedLen[i] = 0;

793 numKeysOfSize[i] = 0;

794 }

795

796 // Fill the arrays...

797 for (i=0; i<gNumFileLines; i++) {

798 int len = gFileLines[i].len;

799 accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);

800 numKeysOfSize[len] += 1;

801 }

802

803 // And write out averages

804 printf("String Length, Avg Key Length, Avg Key Len per char\n");

805 for (i=1; i<=maxLen; i++) {

806 if (numKeysOfSize[i] > 0) {

807 printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysO fSize[i],

808 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));

809 }

810 }

811 delete []accumulatedLen;

812 delete []numKeysOfSize ;

813 }

814

815 //------------------------------------------------------------------------------ ---------

816 //

817 // doForwardIterTest(UBool) Forward iteration test

818 // argument null-terminated string used

819 //

820 //------------------------------------------------------------------------------ ---------

821 void doForwardIterTest(UBool haslen) {

822 int count = 0;

823

824 UErrorCode error = U_ZERO_ERROR;

825 printf("\n\nPerforming forward iteration performance test with ");

826

827 if (haslen) {

828 printf("non-null terminated data -----------\n");

829 }

830 else {

831 printf("null terminated data -----------\n");

832 }

833 printf("performance test on strings from file -----------\n");

834

835 UChar dummytext[] = {0, 0};

836 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);

837 ucol_setText(iter, dummytext, 1, &error);

838

839 gCount = 0;

840 unsigned long startTime = timeGetTime();

841 while (count < opt_loopCount) {

842 int linecount = 0;

843 while (linecount < gNumFileLines) {

844 UChar *str = gFileLines[linecount].name;

845 int strlen = haslen?gFileLines[linecount].len:-1;

846 ucol_setText(iter, str, strlen, &error);

847 while (ucol_next(iter, &error) != UCOL_NULLORDER) {

848 gCount++;

849 }

850

851 linecount ++;

852 }

853 count ++;

854 }

855 unsigned long elapsedTime = timeGetTime() - startTime;

856 printf("elapsedTime %ld\n", elapsedTime);

857

858 // empty loop recalculation

859 count = 0;

860 startTime = timeGetTime();

861 while (count < opt_loopCount) {

862 int linecount = 0;

863 while (linecount < gNumFileLines) {

864 UChar *str = gFileLines[linecount].name;

865 int strlen = haslen?gFileLines[linecount].len:-1;

866 ucol_setText(iter, str, strlen, &error);

867 linecount ++;

868 }

869 count ++;

870 }

871 elapsedTime -= (timeGetTime() - startTime);

872 printf("elapsedTime %ld\n", elapsedTime);

873

874 ucol_closeElements(iter);

875

876 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);

877 printf("Total number of strings compared %d in %d loops\n", gNumFileLines,

878 opt_loopCount);

879 printf("Average time per ucol_next() nano seconds %d\n", ns);

880

881 printf("performance test on skipped-5 concatenated strings from file ------- ----\n");

882

883 UChar *str;

884 int strlen = 0;

885 // appending all the strings

886 int linecount = 0;

887 while (linecount < gNumFileLines) {

888 strlen += haslen?gFileLines[linecount].len:

889 u_strlen(gFileLines[linecount].name);

890 linecount ++;

891 }

892 str = (UChar )malloc(sizeof(UChar) strlen);

893 int strindex = 0;

894 linecount = 0;

895 while (strindex < strlen) {

896 int len = 0;

897 len += haslen?gFileLines[linecount].len:

898 u_strlen(gFileLines[linecount].name);

899 memcpy(str + strindex, gFileLines[linecount].name,

900 sizeof(UChar) * len);

901 strindex += len;

902 linecount ++;

903 }

904

905 printf("Total size of strings %d\n", strlen);

906

907 gCount = 0;

908 count = 0;

909

910 if (!haslen) {

911 strlen = -1;

912 }

913 iter = ucol_openElements(gCol, str, strlen, &error);

914 if (!haslen) {

915 strlen = u_strlen(str);

916 }

917 strlen -= 5; // any left over characters are not iterated,

918 // this is to ensure the backwards and forwards iterators

919 // gets the same position

920 startTime = timeGetTime();

921 while (count < opt_loopCount) {

922 int count5 = 5;

923 strindex = 0;

924 ucol_setOffset(iter, strindex, &error);

925 while (TRUE) {

926 if (ucol_next(iter, &error) == UCOL_NULLORDER) {

927 break;

928 }

929 gCount++;

930 count5 --;

931 if (count5 == 0) {

932 strindex += 10;

933 if (strindex > strlen) {

934 break;

935 }

936 ucol_setOffset(iter, strindex, &error);

937 count5 = 5;

938 }

939 }

940 count ++;

941 }

942

943 elapsedTime = timeGetTime() - startTime;

944 printf("elapsedTime %ld\n", elapsedTime);

945

946 // empty loop recalculation

947 int tempgCount = 0;

948 count = 0;

949 startTime = timeGetTime();

950 while (count < opt_loopCount) {

951 int count5 = 5;

952 strindex = 0;

953 ucol_setOffset(iter, strindex, &error);

954 while (TRUE) {

955 tempgCount ++;

956 count5 --;

957 if (count5 == 0) {

958 strindex += 10;

959 if (strindex > strlen) {

960 break;

961 }

962 ucol_setOffset(iter, strindex, &error);

963 count5 = 5;

964 }

965 }

966 count ++;

967 }

968 elapsedTime -= (timeGetTime() - startTime);

969 printf("elapsedTime %ld\n", elapsedTime);

970

971 ucol_closeElements(iter);

972

973 printf("gCount %d\n", gCount);

974 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);

975 printf("Average time per ucol_next() nano seconds %d\n", ns);

976 }

977

978 //------------------------------------------------------------------------------ ---------

979 //

980 // doBackwardIterTest(UBool) Backwards iteration test

981 // argument null-terminated string used

982 //

983 //------------------------------------------------------------------------------ ---------

984 void doBackwardIterTest(UBool haslen) {

985 int count = 0;

986 UErrorCode error = U_ZERO_ERROR;

987 printf("\n\nPerforming backward iteration performance test with ");

988

989 if (haslen) {

990 printf("non-null terminated data -----------\n");

991 }

992 else {

993 printf("null terminated data -----------\n");

994 }

995

996 printf("performance test on strings from file -----------\n");

997

998 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);

999 UChar dummytext[] = {0, 0};

1000 ucol_setText(iter, dummytext, 1, &error);

1001

1002 gCount = 0;

1003 unsigned long startTime = timeGetTime();

1004 while (count < opt_loopCount) {

1005 int linecount = 0;

1006 while (linecount < gNumFileLines) {

1007 UChar *str = gFileLines[linecount].name;

1008 int strlen = haslen?gFileLines[linecount].len:-1;

1009 ucol_setText(iter, str, strlen, &error);

1010 while (ucol_previous(iter, &error) != UCOL_NULLORDER) {

1011 gCount ++;

1012 }

1013

1014 linecount ++;

1015 }

1016 count ++;

1017 }

1018 unsigned long elapsedTime = timeGetTime() - startTime;

1019

1020 printf("elapsedTime %ld\n", elapsedTime);

1021

1022 // empty loop recalculation

1023 count = 0;

1024 startTime = timeGetTime();

1025 while (count < opt_loopCount) {

1026 int linecount = 0;

1027 while (linecount < gNumFileLines) {

1028 UChar *str = gFileLines[linecount].name;

1029 int strlen = haslen?gFileLines[linecount].len:-1;

1030 ucol_setText(iter, str, strlen, &error);

1031 linecount ++;

1032 }

1033 count ++;

1034 }

1035 elapsedTime -= (timeGetTime() - startTime);

1036

1037 printf("elapsedTime %ld\n", elapsedTime);

1038 ucol_closeElements(iter);

1039

1040 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);

1041 printf("Total number of strings compared %d in %d loops\n", gNumFileLines,

1042 opt_loopCount);

1043 printf("Average time per ucol_previous() nano seconds %d\n", ns);

1044

1045 printf("performance test on skipped-5 concatenated strings from file ------- ----\n");

1046

1047 UChar *str;

1048 int strlen = 0;

1049 // appending all the strings

1050 int linecount = 0;

1051 while (linecount < gNumFileLines) {

1052 strlen += haslen?gFileLines[linecount].len:

1053 u_strlen(gFileLines[linecount].name);

1054 linecount ++;

1055 }

1056 str = (UChar )malloc(sizeof(UChar) strlen);

1057 int strindex = 0;

1058 linecount = 0;

1059 while (strindex < strlen) {

1060 int len = 0;

1061 len += haslen?gFileLines[linecount].len:

1062 u_strlen(gFileLines[linecount].name);

1063 memcpy(str + strindex, gFileLines[linecount].name,

1064 sizeof(UChar) * len);

1065 strindex += len;

1066 linecount ++;

1067 }

1068

1069 printf("Total size of strings %d\n", strlen);

1070

1071 gCount = 0;

1072 count = 0;

1073

1074 if (!haslen) {

1075 strlen = -1;

1076 }

1077

1078 iter = ucol_openElements(gCol, str, strlen, &error);

1079 if (!haslen) {

1080 strlen = u_strlen(str);

1081 }

1082

1083 startTime = timeGetTime();

1084 while (count < opt_loopCount) {

1085 int count5 = 5;

1086 strindex = 5;

1087 ucol_setOffset(iter, strindex, &error);

1088 while (TRUE) {

1089 if (ucol_previous(iter, &error) == UCOL_NULLORDER) {

1090 break;

1091 }

1092 gCount ++;

1093 count5 --;

1094 if (count5 == 0) {

1095 strindex += 10;

1096 if (strindex > strlen) {

1097 break;

1098 }

1099 ucol_setOffset(iter, strindex, &error);

1100 count5 = 5;

1101 }

1102 }

1103 count ++;

1104 }

1105

1106 elapsedTime = timeGetTime() - startTime;

1107 printf("elapsedTime %ld\n", elapsedTime);

1108

1109 // empty loop recalculation

1110 count = 0;

1111 int tempgCount = 0;

1112 startTime = timeGetTime();

1113 while (count < opt_loopCount) {

1114 int count5 = 5;

1115 strindex = 5;

1116 ucol_setOffset(iter, strindex, &error);

1117 while (TRUE) {

1118 tempgCount ++;

1119 count5 --;

1120 if (count5 == 0) {

1121 strindex += 10;

1122 if (strindex > strlen) {

1123 break;

1124 }

1125 ucol_setOffset(iter, strindex, &error);

1126 count5 = 5;

1127 }

1128 }

1129 count ++;

1130 }

1131 elapsedTime -= (timeGetTime() - startTime);

1132 printf("elapsedTime %ld\n", elapsedTime);

1133 ucol_closeElements(iter);

1134

1135 printf("gCount %d\n", gCount);

1136 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);

1137 printf("Average time per ucol_previous() nano seconds %d\n", ns);

1138 }

1139

1140 //------------------------------------------------------------------------------ ---------

1141 //

1142 // doIterTest() Iteration test

1143 //

1144 //------------------------------------------------------------------------------ ---------

1145 void doIterTest() {

1146 doForwardIterTest(opt_uselen);

1147 doBackwardIterTest(opt_uselen);

1148 }

1149

1150

1151 //------------------------------------------------------------------------------ ----------

1152 //

1153 // UnixConvert -- Convert the lines of the file to the encoding for UNIX

1154 // Since it appears that Unicode support is going in the gene ral

1155 // direction of the use of UTF-8 locales, that is the approac h

1156 // that is used here.

1157 //

1158 //------------------------------------------------------------------------------ ----------

1159 void UnixConvert() {

1160 int line;

1161

1162 UConverter *cvrtr; // An ICU code page converter.

1163 UErrorCode status = U_ZERO_ERROR;

1164

1165

1166 cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales f or now.

1167 if (U_FAILURE(status)) {

1168 fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status)) ;

1169 exit(-1);

1170 }

1171

1172 for (line=0; line < gNumFileLines; line++) {

1173 int sizeNeeded = ucnv_fromUChars(cvrtr,

1174 0, // ptr to target buffer.

1175 0, // length of target buffe r.

1176 gFileLines[line].name,

1177 -1, // source is null termin ated

1178 &status);

1179 if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {

1180 //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");

1181 //exit(-1);

1182 }

1183 status = U_ZERO_ERROR;

1184 gFileLines[line].unixName = new char[sizeNeeded+1];

1185 sizeNeeded = ucnv_fromUChars(cvrtr,

1186 gFileLines[line].unixName, // ptr to ta rget buffer.

1187 sizeNeeded+1, // length of target buffe r.

1188 gFileLines[line].name,

1189 -1, // source is null termin ated

1190 &status);

1191 if (U_FAILURE(status)) {

1192 fprintf(stderr, "ICU Conversion Failed.: %d\n", status);

1193 exit(-1);

1194 }

1195 gFileLines[line].unixName[sizeNeeded] = 0;

1196 };

1197 ucnv_close(cvrtr);

1198 }

1199

1200

1201 //------------------------------------------------------------------------------ ----------

1202 //

1203 // class UCharFile Class to hide all the gorp to read a file in

1204 // and produce a stream of UChars.

1205 //

1206 //------------------------------------------------------------------------------ ----------

1207 class UCharFile {

1208 public:

1209 UCharFile(const char *fileName);

1210 ~UCharFile();

1211 UChar get();

1212 UBool eof() {return fEof;};

1213 UBool error() {return fError;};

1214

1215 private:

1216 UCharFile (const UCharFile & /other/) {}; // No co py constructor.

1217 UCharFile & operator = (const UCharFile &/other/) {return *this;}; // No assignment op

1218

1219 FILE *fFile;

1220 const char *fName;

1221 UBool fEof;

1222 UBool fError;

1223 UChar fPending2ndSurrogate;

1224

1225 enum {UTF16LE, UTF16BE, UTF8} fEncoding;

1226 };

1227

1228 UCharFile::UCharFile(const char * fileName) {

1229 fEof = FALSE;

1230 fError = FALSE;

1231 fName = fileName;

1232 fFile = fopen(fName, "rb");

1233 fPending2ndSurrogate = 0;

1234 if (fFile == NULL) {

1235 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);

1236 fError = TRUE;

1237 return;

1238 }

1239 //

1240 // Look for the byte order mark at the start of the file.

1241 //

1242 int BOMC1, BOMC2, BOMC3;

1243 BOMC1 = fgetc(fFile);

1244 BOMC2 = fgetc(fFile);

1245

1246 if (BOMC1 == 0xff && BOMC2 == 0xfe) {

1247 fEncoding = UTF16LE; }

1248 else if (BOMC1 == 0xfe && BOMC2 == 0xff) {

1249 fEncoding = UTF16BE; }

1250 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {

1251 fEncoding = UTF8; }

1252 else

1253 {

1254 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16 , and "

1255 "must include a BOM.\n", fileName);

1256 fError = true;

1257 return;

1258 }

1259 }

1260

1261

1262 UCharFile::~UCharFile() {

1263 fclose(fFile);

1264 }

1265

1266

1267

1268 UChar UCharFile::get() {

1269 UChar c;

1270 switch (fEncoding) {

1271 case UTF16LE:

1272 {

1273 int cL, cH;

1274 cL = fgetc(fFile);

1275 cH = fgetc(fFile);

1276 c = cL \| (cH << 8);

1277 if (cH == EOF) {

1278 c = 0;

1279 fEof = TRUE;

1280 }

1281 break;

1282 }

1283 case UTF16BE:

1284 {

1285 int cL, cH;

1286 cH = fgetc(fFile);

1287 cL = fgetc(fFile);

1288 c = cL \| (cH << 8);

1289 if (cL == EOF) {

1290 c = 0;

1291 fEof = TRUE;

1292 }

1293 break;

1294 }

1295 case UTF8:

1296 {

1297 if (fPending2ndSurrogate != 0) {

1298 c = fPending2ndSurrogate;

1299 fPending2ndSurrogate = 0;

1300 break;

1301 }

1302

1303 int ch = fgetc(fFile); // Note: c and ch are separate cause eof t est doesn't work on UChar type.

1304 if (ch == EOF) {

1305 c = 0;

1306 fEof = TRUE;

1307 break;

1308 }

1309

1310 if (ch <= 0x7f) {

1311 // It's ascii. No further utf-8 conversion.

1312 c = ch;

1313 break;

1314 }

1315

1316 // Figure out the lenght of the char and read the rest of the bytes

1317 // into a temp array.

1318 int nBytes;

1319 if (ch >= 0xF0) {nBytes=4;}

1320 else if (ch >= 0xE0) {nBytes=3;}

1321 else if (ch >= 0xC0) {nBytes=2;}

1322 else {

1323 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");

1324 fError = TRUE;

1325 return 0;

1326 }

1327

1328 unsigned char bytes[10];

1329 bytes[0] = (unsigned char)ch;

1330 int i;

1331 for (i=1; i<nBytes; i++) {

1332 bytes[i] = fgetc(fFile);

1333 if (bytes[i] < 0x80 \|\| bytes[i] >= 0xc0) {

1334 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n ");

1335 fError = TRUE;

1336 return 0;

1337 }

1338 }

1339

1340 // Convert the bytes from the temp array to a Unicode char.

1341 i = 0;

1342 uint32_t cp;

1343 U8_NEXT_UNSAFE(bytes, i, cp);

1344 c = (UChar)cp;

1345

1346 if (cp >= 0x10000) {

1347 // The code point needs to be broken up into a utf-16 surrogate pair.

1348 // Process first half this time through the main loop, and

1349 // remember the other half for the next time through.

1350 UChar utf16Buf[3];

1351 i = 0;

1352 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);

1353 fPending2ndSurrogate = utf16Buf[1];

1354 c = utf16Buf[0];

1355 }

1356 break;

1357 };

1358 default:

1359 c = 0xFFFD; /* Error, unspecified codepage*/

1360 fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");

1361 exit(1);

1362 }

1363 return c;

1364 }

1365

1366 //------------------------------------------------------------------------------ ----------

1367 //

1368 // openRulesCollator - Command line specified a rules file. Read it in

1369 // and open a collator with it.

1370 //

1371 //------------------------------------------------------------------------------ ----------

1372 UCollator *openRulesCollator() {

1373 UCharFile f(opt_rules);

1374 if (f.error()) {

1375 return 0;

1376 }

1377

1378 int bufLen = 10000;

1379 UChar buf = (UChar )malloc(bufLen * sizeof(UChar));

1380 UChar *tmp;

1381 int i = 0;

1382

1383 for(;;) {

1384 buf[i] = f.get();

1385 if (f.eof()) {

1386 break;

1387 }

1388 if (f.error()) {

1389 return 0;

1390 }

1391 i++;

1392 if (i >= bufLen) {

1393 tmp = buf;

1394 bufLen += 10000;

1395 buf = (UChar *)realloc(buf, bufLen);

1396 if (buf == NULL) {

1397 free(tmp);

1398 return 0;

1399 }

1400 }

1401 }

1402 buf[i] = 0;

1403

1404 UErrorCode status = U_ZERO_ERROR;

1405 UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,

1406 UCOL_DEFAULT_STRENGTH, NULL, &status);

1407 if (U_FAILURE(status)) {

1408 fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);

1409 return 0;

1410 }

1411 free(buf);

1412 return coll;

1413 }

1414

1415

1416

1417

1418

1419 //------------------------------------------------------------------------------ ----------

1420 //

1421 // Main -- process command line, read in and pre-process the test file,

1422 // call other functions to do the actual tests.

1423 //

1424 //------------------------------------------------------------------------------ ----------

1425 int main(int argc, const char** argv) {

1426 if (ProcessOptions(argc, argv, opts) != TRUE \|\| opt_help \|\| opt_fName == 0) {

1427 printf(gUsageString);

1428 exit (1);

1429 }

1430

1431 // Make sure that we've only got one API selected.

1432 if (opt_unix \|\| opt_win) opt_icu = FALSE;

1433 if (opt_unix) opt_win = FALSE;

1434

1435 //

1436 // Set up an ICU collator

1437 //

1438 UErrorCode status = U_ZERO_ERROR;

1439

1440 if (opt_rules != 0) {

1441 gCol = openRulesCollator();

1442 if (gCol == 0) {return -1;}

1443 }

1444 else {

1445 gCol = ucol_open(opt_locale, &status);

1446 if (U_FAILURE(status)) {

1447 fprintf(stderr, "Collator creation failed.: %d\n", status);

1448 return -1;

1449 }

1450 }

1451 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {

1452 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale) ;

1453 }

1454 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {

1455 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);

1456 }

1457

1458 if (opt_norm) {

1459 ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);

1460 }

1461 if (opt_french && opt_frenchoff) {

1462 fprintf(stderr, "collperf: Error, specified both -french and -frenchoff options.");

1463 exit(-1);

1464 }

1465 if (opt_french) {

1466 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);

1467 }

1468 if (opt_frenchoff) {

1469 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);

1470 }

1471 if (opt_lower) {

1472 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);

1473 }

1474 if (opt_upper) {

1475 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);

1476 }

1477 if (opt_case) {

1478 ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);

1479 }

1480 if (opt_shifted) {

1481 ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);

1482 }

1483 if (opt_level != 0) {

1484 switch (opt_level) {

1485 case 1:

1486 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);

1487 break;

1488 case 2:

1489 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);

1490 break;

1491 case 3:

1492 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);

1493 break;

1494 case 4:

1495 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);

1496 break;

1497 case 5:

1498 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);

1499 break;

1500 default:

1501 fprintf(stderr, "-level param must be between 1 and 5\n");

1502 exit(-1);

1503 }

1504 }

1505

1506 if (U_FAILURE(status)) {

1507 fprintf(stderr, "Collator attribute setting failed.: %d\n", status);

1508 return -1;

1509 }

1510

1511

1512 //

1513 // Set up a Windows LCID

1514 //

1515 if (opt_langid != 0) {

1516 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);

1517 }

1518 else {

1519 gWinLCID = uloc_getLCID(opt_locale);

1520 }

1521

1522

1523 //

1524 // Set the UNIX locale

1525 //

1526 if (opt_unix) {

1527 if (setlocale(LC_ALL, opt_locale) == 0) {

1528 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);

1529 exit(-1);

1530 }

1531 }

1532

1533 // Read in the input file.

1534 // File assumed to be utf-16.

1535 // Lines go onto heap buffers. Global index array to line starts is creat ed.

1536 // Lines themselves are null terminated.

1537 //

1538

1539 UCharFile f(opt_fName);

1540 if (f.error()) {

1541 exit(-1);

1542 }

1543

1544 const int MAXLINES = 100000;

1545 gFileLines = new Line[MAXLINES];

1546 UChar buf[1024];

1547 int column = 0;

1548

1549 // Read the file, split into lines, and save in memory.

1550 // Loop runs once per utf-16 value from the input file,

1551 // (The number of bytes read from file per loop iteration depends on exte rnal encoding.)

1552 for (;;) {

1553

1554 UChar c = f.get();

1555 if (f.error()){

1556 exit(-1);

1557 }

1558

1559

1560 // We now have a good UTF-16 value in c.

1561

1562 // Watch for CR, LF, EOF; these finish off a line.

1563 if (c == 0xd) {

1564 continue;

1565 }

1566

1567 if (f.eof() \|\| c == 0x0a \|\| c==0x2028) { // Unipad inserts 2028 line se parators!

1568 buf[column++] = 0;

1569 if (column > 1) {

1570 gFileLines[gNumFileLines].name = new UChar[column];

1571 gFileLines[gNumFileLines].len = column-1;

1572 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UCha r));

1573 gNumFileLines++;

1574 column = 0;

1575 if (gNumFileLines >= MAXLINES) {

1576 fprintf(stderr, "File too big. Max number of lines is %d\n" , MAXLINES);

1577 exit(-1);

1578 }

1579

1580 }

1581 if (c == 0xa \|\| c == 0x2028)

1582 continue;

1583 else

1584 break; // EOF

1585 }

1586 buf[column++] = c;

1587 if (column >= 1023)

1588 {

1589 static UBool warnFlag = TRUE;

1590 if (warnFlag) {

1591 fprintf(stderr, "Warning - file line longer than 1023 chars trun cated.\n");

1592 warnFlag = FALSE;

1593 }

1594 column--;

1595 }

1596 }

1597

1598 if (opt_terse == FALSE) {

1599 printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);

1600 }

1601

1602

1603 // Convert the lines to the UNIX encoding.

1604 if (opt_unix) {

1605 UnixConvert();

1606 }

1607

1608 //

1609 // Pre-compute ICU sort keys for the lines of the file.

1610 //

1611 int line;

1612 int32_t t;

1613

1614 for (line=0; line<gNumFileLines; line++) {

1615 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)b uf, sizeof(buf));

1616 gFileLines[line].icuSortKey = new char[t];

1617

1618 if (t > (int32_t)sizeof(buf)) {

1619 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);

1620 }

1621 else

1622 {

1623 memcpy(gFileLines[line].icuSortKey, buf, t);

1624 }

1625 }

1626

1627

1628

1629 //

1630 // Pre-compute Windows sort keys for the lines of the file.

1631 //

1632 for (line=0; line<gNumFileLines; line++) {

1633 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));

1634 gFileLines[line].winSortKey = new char[t];

1635 if (t > (int32_t)sizeof(buf)) {

1636 t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1 , (unsigned short *)(gFileLines[line].winSortKey), t);

1637 }

1638 else

1639 {

1640 memcpy(gFileLines[line].winSortKey, buf, t);

1641 }

1642 }

1643

1644 //

1645 // Pre-compute UNIX sort keys for the lines of the file.

1646 //

1647 if (opt_unix) {

1648 for (line=0; line<gNumFileLines; line++) {

1649 t=strxfrm((char *)buf, gFileLines[line].unixName, sizeof(buf));

1650 gFileLines[line].unixSortKey = new char[t];

1651 if (t > (int32_t)sizeof(buf)) {

1652 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unix Name, sizeof(buf));

1653 }

1654 else

1655 {

1656 memcpy(gFileLines[line].unixSortKey, buf, t);

1657 }

1658 }

1659 }

1660

1661

1662 //

1663 // Dump file lines, CEs, Sort Keys if requested.

1664 //

1665 if (opt_dump) {

1666 int i;

1667 for (line=0; line<gNumFileLines; line++) {

1668 for (i=0;;i++) {

1669 UChar c = gFileLines[line].name[i];

1670 if (c == 0)

1671 break;

1672 if (c < 0x20 \|\| c > 0x7e) {

1673 printf("\\u%.4x", c);

1674 }

1675 else {

1676 printf("%c", c);

1677 }

1678 }

1679 printf("\n");

1680

1681 printf(" CEs: ");

1682 UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line ].name, -1, &status);

1683 int32_t ce;

1684 i = 0;

1685 for (;;) {

1686 ce = ucol_next(CEiter, &status);

1687 if (ce == UCOL_NULLORDER) {

1688 break;

1689 }

1690 printf(" %.8x", ce);

1691 if (++i > 8) {

1692 printf("\n ");

1693 i = 0;

1694 }

1695 }

1696 printf("\n");

1697 ucol_closeElements(CEiter);

1698

1699

1700 printf(" ICU Sort Key: ");

1701 for (i=0; ; i++) {

1702 unsigned char c = gFileLines[line].icuSortKey[i];

1703 printf("%02x ", c);

1704 if (c == 0) {

1705 break;

1706 }

1707 if (i > 0 && i % 20 == 0) {

1708 printf("\n ");

1709 }

1710 }

1711 printf("\n");

1712 }

1713 }

1714

1715

1716 //

1717 // Pre-sort the lines.

1718 //

1719 int i;

1720 gSortedLines = new Line *[gNumFileLines];

1721 for (i=0; i<gNumFileLines; i++) {

1722 gSortedLines[i] = &gFileLines[i];

1723 }

1724

1725 if (opt_win) {

1726 qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);

1727 }

1728 else if (opt_unix) {

1729 qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);

1730 }

1731 else /* ICU */

1732 {

1733 qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);

1734 }

1735

1736

1737 //

1738 // Make up a randomized order, will be used for sorting tests.

1739 //

1740 gRandomLines = new Line *[gNumFileLines];

1741 for (i=0; i<gNumFileLines; i++) {

1742 gRandomLines[i] = &gFileLines[i];

1743 }

1744 qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);

1745

1746

1747

1748

1749 //

1750 // We've got the file read into memory. Go do something with it.

1751 //

1752

1753 if (opt_qsort) doQSort();

1754 if (opt_binsearch) doBinarySearch();

1755 if (opt_keygen) doKeyGen();

1756 if (opt_keyhist) doKeyHist();

1757 if (opt_itertest) doIterTest();

1758

1759 return 0;

1760

1761 }

OLD	NEW

« no previous file with comments | « source/test/perf/collationperf/Makefile.in ('k') | source/test/perf/collationperf/readme.html » ('j') | no next file with comments »