OLD | NEW |
| (Empty) |
1 /******************************************************************** | |
2 * COPYRIGHT: | |
3 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. | |
4 * | |
5 ********************************************************************/ | |
6 /*******************************************************************************
* | |
7 * | |
8 * File CALLCOLL.C | |
9 * | |
10 * Modification History: | |
11 * Name Description | |
12 * Andy Heninger First Version | |
13 * | |
14 ********************************************************************************
* | |
15 */ | |
16 | |
17 // | |
18 // This program tests string collation and sort key generation performance. | |
19 // Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString | |
20 // A file of names is required as input, one per line. It must be in utf-8
or utf-16 format, | |
21 // and include a byte order mark. Either LE or BE format is OK. | |
22 // | |
23 | |
// Usage text describing the command line options of this program.
const char gUsageString[] =
    "usage: collperf options...\n"
    "-help               Display this message.\n"
    "-file file_name     utf-16 format file of names.\n"
    "-locale name        ICU locale to use. Default is en_US\n"
    "-rules file_name    Collation rules file (overrides locale)\n"
    "-langid 0x1234      Windows Language ID number. Default to value for -locale option\n"
    "                    see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win                Run test using Windows native services. (ICU is default)\n"
    "-unix               Run test using Unix strxfrm, strcoll services.\n"
    "-uselen             Use API with string lengths. Default is null-terminated strings\n"
    "-usekeys            Run tests using sortkeys rather than strcoll\n"
    "-strcmp             Run tests using u_strcmp rather than strcoll\n"
    "-strcmpCPO          Run tests using u_strcmpCodePointOrder rather than strcoll\n"
    "-loop nnnn          Loopcount for test. Adjust for reasonable total running time.\n"
    "-iloop n            Inner Loop Count. Default = 1. Number of calls to function\n"
    "                    under test at each call point. For measuring test overhead.\n"
    "-terse              Terse numbers-only output. Intended for use by scripts.\n"
    "-french             French accent ordering\n"
    "-frenchoff          No French accent ordering (for use with French locales.)\n"
    "-norm               Normalizing mode on\n"
    "-shifted            Shifted mode\n"
    "-lower              Lower case first\n"
    "-upper              Upper case first\n"
    "-case               Enable separate case level\n"
    "-level n            Sort level, 1 to 5, for Primary, Secondary, Tertiary, Quaternary, Identical\n"
    "-keyhist            Produce a table sort key size vs. string length\n"
    "-binsearch          Binary Search timing test\n"
    "-keygen             Sort Key Generation timing test\n"
    "-qsort              Quicksort timing test\n"
    "-iter               Iteration Performance Test\n"
    "-dump               Display strings, sort keys and CEs.\n"
    ;
57 | |
58 | |
59 | |
60 #include <stdio.h> | |
61 #include <string.h> | |
62 #include <stdlib.h> | |
63 #include <math.h> | |
64 #include <locale.h> | |
65 #include <errno.h> | |
66 | |
67 #include <unicode/utypes.h> | |
68 #include <unicode/ucol.h> | |
69 #include <unicode/ucoleitr.h> | |
70 #include <unicode/uloc.h> | |
71 #include <unicode/ustring.h> | |
72 #include <unicode/ures.h> | |
73 #include <unicode/uchar.h> | |
74 #include <unicode/ucnv.h> | |
75 #include <unicode/utf8.h> | |
76 | |
77 #ifdef WIN32 | |
78 #include <windows.h> | |
79 #else | |
80 // | |
81 // Stubs for Windows API functions when building on UNIXes. | |
82 // | |
83 typedef int DWORD; | |
84 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;} | |
85 #include <sys/time.h> | |
// Non-Windows replacement for the Windows timeGetTime() call.
// Returns the current gettimeofday() time in milliseconds.  The value may
// wrap around; the callers in this file only use differences of two readings.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, 0);

    // Convert before multiplying: unsigned arithmetic wraps in a well-defined
    // way, whereas overflowing the signed seconds value would be undefined.
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
93 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;} | |
94 const int LCMAP_SORTKEY = 0; | |
95 #define MAKELCID(a,b) 0 | |
96 const int SORT_DEFAULT = 0; | |
97 #endif | |
98 | |
99 | |
100 | |
101 // | |
102 // Command line option variables | |
103 // These global variables are set according to the options specified | |
104 // on the command line by the user. | |
105 char * opt_fName = 0; | |
106 const char * opt_locale = "en_US"; | |
107 int opt_langid = 0; // Defaults to value corresponding to opt_loc
ale. | |
108 char * opt_rules = 0; | |
109 UBool opt_help = FALSE; | |
110 int opt_loopCount = 1; | |
111 int opt_iLoopCount = 1; | |
112 UBool opt_terse = FALSE; | |
113 UBool opt_qsort = FALSE; | |
114 UBool opt_binsearch = FALSE; | |
115 UBool opt_icu = TRUE; | |
116 UBool opt_win = FALSE; // Run with Windows native functions. | |
117 UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. | |
118 UBool opt_uselen = FALSE; | |
119 UBool opt_usekeys = FALSE; | |
120 UBool opt_strcmp = FALSE; | |
121 UBool opt_strcmpCPO = FALSE; | |
122 UBool opt_norm = FALSE; | |
123 UBool opt_keygen = FALSE; | |
124 UBool opt_french = FALSE; | |
125 UBool opt_frenchoff = FALSE; | |
126 UBool opt_shifted = FALSE; | |
127 UBool opt_lower = FALSE; | |
128 UBool opt_upper = FALSE; | |
129 UBool opt_case = FALSE; | |
130 int opt_level = 0; | |
131 UBool opt_keyhist = FALSE; | |
132 UBool opt_itertest = FALSE; | |
133 UBool opt_dump = FALSE; | |
134 | |
135 | |
136 | |
137 // | |
138 // Definitions for the command line options | |
139 // | |
// One entry of the command line option table.
struct OptSpec {
    // Option text exactly as typed on the command line, e.g. "-loop".
    // A null name marks the end of the table.
    const char *name;

    // FLAG   - no value; the target UBool is set to TRUE.
    // NUM    - the next argument is parsed as an integer into an int.
    // STRING - the next argument is stored as a C string pointer.
    enum {FLAG, NUM, STRING} type;

    // Address of the global variable that receives the option value.
    void *pVar;
};
145 | |
146 OptSpec opts[] = { | |
147 {"-file", OptSpec::STRING, &opt_fName}, | |
148 {"-locale", OptSpec::STRING, &opt_locale}, | |
149 {"-langid", OptSpec::NUM, &opt_langid}, | |
150 {"-rules", OptSpec::STRING, &opt_rules}, | |
151 {"-qsort", OptSpec::FLAG, &opt_qsort}, | |
152 {"-binsearch", OptSpec::FLAG, &opt_binsearch}, | |
153 {"-iter", OptSpec::FLAG, &opt_itertest}, | |
154 {"-win", OptSpec::FLAG, &opt_win}, | |
155 {"-unix", OptSpec::FLAG, &opt_unix}, | |
156 {"-uselen", OptSpec::FLAG, &opt_uselen}, | |
157 {"-usekeys", OptSpec::FLAG, &opt_usekeys}, | |
158 {"-strcmp", OptSpec::FLAG, &opt_strcmp}, | |
159 {"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO}, | |
160 {"-norm", OptSpec::FLAG, &opt_norm}, | |
161 {"-french", OptSpec::FLAG, &opt_french}, | |
162 {"-frenchoff", OptSpec::FLAG, &opt_frenchoff}, | |
163 {"-shifted", OptSpec::FLAG, &opt_shifted}, | |
164 {"-lower", OptSpec::FLAG, &opt_lower}, | |
165 {"-upper", OptSpec::FLAG, &opt_upper}, | |
166 {"-case", OptSpec::FLAG, &opt_case}, | |
167 {"-level", OptSpec::NUM, &opt_level}, | |
168 {"-keyhist", OptSpec::FLAG, &opt_keyhist}, | |
169 {"-keygen", OptSpec::FLAG, &opt_keygen}, | |
170 {"-loop", OptSpec::NUM, &opt_loopCount}, | |
171 {"-iloop", OptSpec::NUM, &opt_iLoopCount}, | |
172 {"-terse", OptSpec::FLAG, &opt_terse}, | |
173 {"-dump", OptSpec::FLAG, &opt_dump}, | |
174 {"-help", OptSpec::FLAG, &opt_help}, | |
175 {"-?", OptSpec::FLAG, &opt_help}, | |
176 {0, OptSpec::FLAG, 0} | |
177 }; | |
178 | |
179 | |
180 //--------------------------------------------------------------------------- | |
181 // | |
182 // Global variables pointing to and describing the test file | |
183 // | |
184 //--------------------------------------------------------------------------- | |
185 | |
186 // | |
187 // struct Line | |
188 // | |
189 // Each line from the source file (containing a name, presumably) gets | |
190 // one of these structs. | |
191 // | |
192 struct Line { | |
193 UChar *name; | |
194 int len; | |
195 char *winSortKey; | |
196 char *icuSortKey; | |
197 char *unixSortKey; | |
198 char *unixName; | |
199 }; | |
200 | |
201 | |
202 | |
203 Line *gFileLines; // Ptr to array of Line structs, one per li
ne in the file. | |
204 int gNumFileLines; | |
205 UCollator *gCol; | |
206 DWORD gWinLCID; | |
207 | |
208 Line **gSortedLines; | |
209 Line **gRandomLines; | |
210 int gCount; | |
211 | |
212 | |
213 | |
214 //--------------------------------------------------------------------------- | |
215 // | |
216 // ProcessOptions() Function to read the command line options. | |
217 // | |
218 //--------------------------------------------------------------------------- | |
219 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) | |
220 { | |
221 int i; | |
222 int argNum; | |
223 const char *pArgName; | |
224 OptSpec *pOpt; | |
225 | |
226 for (argNum=1; argNum<argc; argNum++) { | |
227 pArgName = argv[argNum]; | |
228 for (pOpt = opts; pOpt->name != 0; pOpt++) { | |
229 if (strcmp(pOpt->name, pArgName) == 0) { | |
230 switch (pOpt->type) { | |
231 case OptSpec::FLAG: | |
232 *(UBool *)(pOpt->pVar) = TRUE; | |
233 break; | |
234 case OptSpec::STRING: | |
235 argNum ++; | |
236 if (argNum >= argc) { | |
237 fprintf(stderr, "value expected for \"%s\" option.\n", p
Opt->name); | |
238 return FALSE; | |
239 } | |
240 *(const char **)(pOpt->pVar) = argv[argNum]; | |
241 break; | |
242 case OptSpec::NUM: | |
243 argNum ++; | |
244 if (argNum >= argc) { | |
245 fprintf(stderr, "value expected for \"%s\" option.\n", p
Opt->name); | |
246 return FALSE; | |
247 } | |
248 char *endp; | |
249 i = strtol(argv[argNum], &endp, 0); | |
250 if (endp == argv[argNum]) { | |
251 fprintf(stderr, "integer value expected for \"%s\" optio
n.\n", pOpt->name); | |
252 return FALSE; | |
253 } | |
254 *(int *)(pOpt->pVar) = i; | |
255 } | |
256 break; | |
257 } | |
258 } | |
259 if (pOpt->name == 0) | |
260 { | |
261 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); | |
262 return FALSE; | |
263 } | |
264 } | |
265 return TRUE; | |
266 } | |
267 | |
268 //------------------------------------------------------------------------------
--------- | |
269 // | |
270 // Comparison functions for use by qsort. | |
271 // | |
272 // Seven flavors: ICU, Windows or Unix; SortKey or String Compare; Strings with length | |
273 // or null terminated. | |
274 // | |
275 //------------------------------------------------------------------------------
--------- | |
276 int ICUstrcmpK(const void *a, const void *b) { | |
277 gCount++; | |
278 int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey); | |
279 return t; | |
280 } | |
281 | |
282 | |
283 int ICUstrcmpL(const void *a, const void *b) { | |
284 gCount++; | |
285 UCollationResult t; | |
286 t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b
)->name, (*(Line **)b)->len); | |
287 if (t == UCOL_LESS) return -1; | |
288 if (t == UCOL_GREATER) return +1; | |
289 return 0; | |
290 } | |
291 | |
292 | |
293 int ICUstrcmp(const void *a, const void *b) { | |
294 gCount++; | |
295 UCollationResult t; | |
296 t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); | |
297 if (t == UCOL_LESS) return -1; | |
298 if (t == UCOL_GREATER) return +1; | |
299 return 0; | |
300 } | |
301 | |
302 | |
303 int Winstrcmp(const void *a, const void *b) { | |
304 gCount++; | |
305 int t; | |
306 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name
, -1); | |
307 return t-2; | |
308 } | |
309 | |
310 | |
311 int UNIXstrcmp(const void *a, const void *b) { | |
312 gCount++; | |
313 int t; | |
314 t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName); | |
315 return t; | |
316 } | |
317 | |
318 | |
319 int WinstrcmpL(const void *a, const void *b) { | |
320 gCount++; | |
321 int t; | |
322 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(
Line **)b)->name, (*(Line **)b)->len); | |
323 return t-2; | |
324 } | |
325 | |
326 | |
327 int WinstrcmpK(const void *a, const void *b) { | |
328 gCount++; | |
329 int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey); | |
330 return t; | |
331 } | |
332 | |
333 | |
334 //------------------------------------------------------------------------------
--------- | |
335 // | |
336 // Function for sorting the names (lines) into a random order. | |
337 // Order is based on a hash of the ICU Sort key for the lines | |
338 // The randomized order is used as input for the sorting timing tests. | |
339 // | |
340 //------------------------------------------------------------------------------
--------- | |
341 int ICURandomCmp(const void *a, const void *b) { | |
342 char *ask = (*(Line **)a)->icuSortKey; | |
343 char *bsk = (*(Line **)b)->icuSortKey; | |
344 int aVal = 0; | |
345 int bVal = 0; | |
346 int retVal; | |
347 while (*ask != 0) { | |
348 aVal += aVal*37 + *ask++; | |
349 } | |
350 while (*bsk != 0) { | |
351 bVal += bVal*37 + *bsk++; | |
352 } | |
353 retVal = -1; | |
354 if (aVal == bVal) { | |
355 retVal = 0; | |
356 } | |
357 else if (aVal > bVal) { | |
358 retVal = 1; | |
359 } | |
360 return retVal; | |
361 } | |
362 | |
363 //------------------------------------------------------------------------------
--------- | |
364 // | |
365 // doKeyGen() Key Generation Timing Test | |
366 // | |
367 //------------------------------------------------------------------------------
--------- | |
368 void doKeyGen() | |
369 { | |
370 int line; | |
371 int loops = 0; | |
372 int iLoop; | |
373 int t; | |
374 int len=-1; | |
375 | |
376 // Adjust loop count to compensate for file size. Should be order n | |
377 double dLoopCount = double(opt_loopCount) * (1000. / double(gNumFileLines))
; | |
378 int adj_loopCount = int(dLoopCount); | |
379 if (adj_loopCount < 1) adj_loopCount = 1; | |
380 | |
381 | |
382 unsigned long startTime = timeGetTime(); | |
383 | |
384 if (opt_win) { | |
385 for (loops=0; loops<adj_loopCount; loops++) { | |
386 for (line=0; line < gNumFileLines; line++) { | |
387 if (opt_uselen) { | |
388 len = gFileLines[line].len; | |
389 } | |
390 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { | |
391 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, | |
392 gFileLines[line].name, len, | |
393 (unsigned short *)gFileLines[line].winSortKey, 5000);
// TODO something with length. | |
394 } | |
395 } | |
396 } | |
397 } | |
398 else if (opt_icu) | |
399 { | |
400 for (loops=0; loops<adj_loopCount; loops++) { | |
401 for (line=0; line < gNumFileLines; line++) { | |
402 if (opt_uselen) { | |
403 len = gFileLines[line].len; | |
404 } | |
405 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { | |
406 t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsig
ned char *)gFileLines[line].icuSortKey, 5000); | |
407 } | |
408 } | |
409 } | |
410 } | |
411 else if (opt_unix) | |
412 { | |
413 for (loops=0; loops<adj_loopCount; loops++) { | |
414 for (line=0; line < gNumFileLines; line++) { | |
415 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { | |
416 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixN
ame, 5000); | |
417 } | |
418 } | |
419 } | |
420 } | |
421 | |
422 unsigned long elapsedTime = timeGetTime() - startTime; | |
423 int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*g
NumFileLines)); | |
424 | |
425 if (opt_terse == FALSE) { | |
426 printf("Sort Key Generation: total # of keys = %d\n", loops*gNumFileLin
es); | |
427 printf("Sort Key Generation: time per key = %d ns\n", ns); | |
428 } | |
429 else { | |
430 printf("%d, ", ns); | |
431 } | |
432 | |
433 int totalKeyLen = 0; | |
434 int totalChars = 0; | |
435 for (line=0; line<gNumFileLines; line++) { | |
436 totalChars += u_strlen(gFileLines[line].name); | |
437 if (opt_win) { | |
438 totalKeyLen += strlen(gFileLines[line].winSortKey); | |
439 } | |
440 else if (opt_icu) { | |
441 totalKeyLen += strlen(gFileLines[line].icuSortKey); | |
442 } | |
443 else if (opt_unix) { | |
444 totalKeyLen += strlen(gFileLines[line].unixSortKey); | |
445 } | |
446 | |
447 } | |
448 if (opt_terse == FALSE) { | |
449 printf("Key Length / character = %f\n", (float)totalKeyLen / (float)tota
lChars); | |
450 } else { | |
451 printf("%f, ", (float)totalKeyLen / (float)totalChars); | |
452 } | |
453 } | |
454 | |
455 | |
456 | |
457 //------------------------------------------------------------------------------
--------- | |
458 // | |
459 // doBinarySearch() Binary Search timing test. Each name from the list | |
460 // is looked up in the full sorted list of names. | |
461 // | |
462 //------------------------------------------------------------------------------
--------- | |
463 void doBinarySearch() | |
464 { | |
465 | |
466 gCount = 0; | |
467 int line; | |
468 int loops = 0; | |
469 int iLoop = 0; | |
470 unsigned long elapsedTime = 0; | |
471 | |
472 // Adjust loop count to compensate for file size. Should be order n (looku
ps) * log n (compares/lookup) | |
473 // Accurate timings do not depend on this being perfect. The correction is
just to try to | |
474 // get total running times of about the right order, so the that user does
n't need to | |
475 // manually adjust the loop count for every different file size. | |
476 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileL
ines) * double(gNumFileLines)); | |
477 if (opt_usekeys) dLoopCount *= 5; | |
478 int adj_loopCount = int(dLoopCount); | |
479 if (adj_loopCount < 1) adj_loopCount = 1; | |
480 | |
481 | |
482 for (;;) { // not really a loop, just allows "break" to work, to simplify | |
483 // inadvertantly running more than one test through here. | |
484 if (opt_strcmp || opt_strcmpCPO) | |
485 { | |
486 unsigned long startTime = timeGetTime(); | |
487 typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *); | |
488 PF pf = u_strcmp; | |
489 if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;} | |
490 //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;} // Damn the differ
ence between int32_t and int | |
491 // which forces th
e use of a cast here. | |
492 | |
493 int r = 0; | |
494 for (loops=0; loops<adj_loopCount; loops++) { | |
495 | |
496 for (line=0; line < gNumFileLines; line++) { | |
497 int hi = gNumFileLines-1; | |
498 int lo = 0; | |
499 int guess = -1; | |
500 for (;;) { | |
501 int newGuess = (hi + lo) / 2; | |
502 if (newGuess == guess) | |
503 break; | |
504 guess = newGuess; | |
505 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { | |
506 r = (*pf)((gSortedLines[line])->name, (gSortedLines[
guess])->name); | |
507 } | |
508 gCount++; | |
509 if (r== 0) | |
510 break; | |
511 if (r < 0) | |
512 hi = guess; | |
513 else | |
514 lo = guess; | |
515 } | |
516 } | |
517 } | |
518 elapsedTime = timeGetTime() - startTime; | |
519 break; | |
520 } | |
521 | |
522 | |
523 if (opt_icu) | |
524 { | |
525 unsigned long startTime = timeGetTime(); | |
526 UCollationResult r = UCOL_EQUAL; | |
527 for (loops=0; loops<adj_loopCount; loops++) { | |
528 | |
529 for (line=0; line < gNumFileLines; line++) { | |
530 int lineLen = -1; | |
531 int guessLen = -1; | |
532 if (opt_uselen) { | |
533 lineLen = (gSortedLines[line])->len; | |
534 } | |
535 int hi = gNumFileLines-1; | |
536 int lo = 0; | |
537 int guess = -1; | |
538 for (;;) { | |
539 int newGuess = (hi + lo) / 2; | |
540 if (newGuess == guess) | |
541 break; | |
542 guess = newGuess; | |
543 int ri = 0; | |
544 if (opt_usekeys) { | |
545 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { | |
546 ri = strcmp((gSortedLines[line])->icuSortKey, (g
SortedLines[guess])->icuSortKey); | |
547 } | |
548 gCount++; | |
549 r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri=
=0) {r=UCOL_EQUAL;} | |
550 } | |
551 else | |
552 { | |
553 if (opt_uselen) { | |
554 guessLen = (gSortedLines[guess])->len; | |
555 } | |
556 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { | |
557 r = ucol_strcoll(gCol, (gSortedLines[line])->nam
e, lineLen, (gSortedLines[guess])->name, guessLen); | |
558 } | |
559 gCount++; | |
560 } | |
561 if (r== UCOL_EQUAL) | |
562 break; | |
563 if (r == UCOL_LESS) | |
564 hi = guess; | |
565 else | |
566 lo = guess; | |
567 } | |
568 } | |
569 } | |
570 elapsedTime = timeGetTime() - startTime; | |
571 break; | |
572 } | |
573 | |
574 if (opt_win) | |
575 { | |
576 unsigned long startTime = timeGetTime(); | |
577 int r = 0; | |
578 for (loops=0; loops<adj_loopCount; loops++) { | |
579 | |
580 for (line=0; line < gNumFileLines; line++) { | |
581 int lineLen = -1; | |
582 int guessLen = -1; | |
583 if (opt_uselen) { | |
584 lineLen = (gSortedLines[line])->len; | |
585 } | |
586 int hi = gNumFileLines-1; | |
587 int lo = 0; | |
588 int guess = -1; | |
589 for (;;) { | |
590 int newGuess = (hi + lo) / 2; | |
591 if (newGuess == guess) | |
592 break; | |
593 guess = newGuess; | |
594 if (opt_usekeys) { | |
595 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { | |
596 r = strcmp((gSortedLines[line])->winSortKey, (gS
ortedLines[guess])->winSortKey); | |
597 } | |
598 gCount++; | |
599 r+=2; | |
600 } | |
601 else | |
602 { | |
603 if (opt_uselen) { | |
604 guessLen = (gSortedLines[guess])->len; | |
605 } | |
606 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { | |
607 r = CompareStringW(gWinLCID, 0, (gSortedLines[li
ne])->name, lineLen, (gSortedLines[guess])->name, guessLen); | |
608 } | |
609 if (r == 0) { | |
610 if (opt_terse == FALSE) { | |
611 fprintf(stderr, "Error returned from Windows
CompareStringW.\n"); | |
612 } | |
613 exit(-1); | |
614 } | |
615 gCount++; | |
616 } | |
617 if (r== 2) // strings == | |
618 break; | |
619 if (r == 1) // line < guess | |
620 hi = guess; | |
621 else // line > guess | |
622 lo = guess; | |
623 } | |
624 } | |
625 } | |
626 elapsedTime = timeGetTime() - startTime; | |
627 break; | |
628 } | |
629 | |
630 if (opt_unix) | |
631 { | |
632 unsigned long startTime = timeGetTime(); | |
633 int r = 0; | |
634 for (loops=0; loops<adj_loopCount; loops++) { | |
635 | |
636 for (line=0; line < gNumFileLines; line++) { | |
637 int hi = gNumFileLines-1; | |
638 int lo = 0; | |
639 int guess = -1; | |
640 for (;;) { | |
641 int newGuess = (hi + lo) / 2; | |
642 if (newGuess == guess) | |
643 break; | |
644 guess = newGuess; | |
645 if (opt_usekeys) { | |
646 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { | |
647 r = strcmp((gSortedLines[line])->unixSortKey, (
gSortedLines[guess])->unixSortKey); | |
648 } | |
649 gCount++; | |
650 } | |
651 else | |
652 { | |
653 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { | |
654 r = strcoll((gSortedLines[line])->unixName, (gSo
rtedLines[guess])->unixName); | |
655 } | |
656 errno = 0; | |
657 if (errno != 0) { | |
658 fprintf(stderr, "Error %d returned from strcoll.
\n", errno); | |
659 exit(-1); | |
660 } | |
661 gCount++; | |
662 } | |
663 if (r == 0) // strings == | |
664 break; | |
665 if (r < 0) // line < guess | |
666 hi = guess; | |
667 else // line > guess | |
668 lo = guess; | |
669 } | |
670 } | |
671 } | |
672 elapsedTime = timeGetTime() - startTime; | |
673 break; | |
674 } | |
675 break; | |
676 } | |
677 | |
678 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); | |
679 if (opt_terse == FALSE) { | |
680 printf("binary search: total # of string compares = %d\n", gCount); | |
681 printf("binary search: compares per loop = %d\n", gCount / loops); | |
682 printf("binary search: time per compare = %d ns\n", ns); | |
683 } else { | |
684 printf("%d, ", ns); | |
685 } | |
686 | |
687 } | |
688 | |
689 | |
690 | |
691 | |
692 //------------------------------------------------------------------------------
--------- | |
693 // | |
694 // doQSort() The quick sort timing test. Uses the C library qsort function
. | |
695 // | |
696 //------------------------------------------------------------------------------
--------- | |
697 void doQSort() { | |
698 int i; | |
699 Line **sortBuf = new Line *[gNumFileLines]; | |
700 | |
701 // Adjust loop count to compensate for file size. QSort should be n log(n) | |
702 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileL
ines) * double(gNumFileLines)); | |
703 if (opt_usekeys) dLoopCount *= 5; | |
704 int adj_loopCount = int(dLoopCount); | |
705 if (adj_loopCount < 1) adj_loopCount = 1; | |
706 | |
707 | |
708 gCount = 0; | |
709 unsigned long startTime = timeGetTime(); | |
710 if (opt_win && opt_usekeys) { | |
711 for (i=0; i<opt_loopCount; i++) { | |
712 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); | |
713 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK); | |
714 } | |
715 } | |
716 | |
717 else if (opt_win && opt_uselen) { | |
718 for (i=0; i<adj_loopCount; i++) { | |
719 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); | |
720 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL); | |
721 } | |
722 } | |
723 | |
724 | |
725 else if (opt_win && !opt_uselen) { | |
726 for (i=0; i<adj_loopCount; i++) { | |
727 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); | |
728 qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp); | |
729 } | |
730 } | |
731 | |
732 else if (opt_icu && opt_usekeys) { | |
733 for (i=0; i<adj_loopCount; i++) { | |
734 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); | |
735 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK); | |
736 } | |
737 } | |
738 | |
739 else if (opt_icu && opt_uselen) { | |
740 for (i=0; i<adj_loopCount; i++) { | |
741 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); | |
742 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL); | |
743 } | |
744 } | |
745 | |
746 | |
747 else if (opt_icu && !opt_uselen) { | |
748 for (i=0; i<adj_loopCount; i++) { | |
749 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); | |
750 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp); | |
751 } | |
752 } | |
753 | |
754 else if (opt_unix && !opt_usekeys) { | |
755 for (i=0; i<adj_loopCount; i++) { | |
756 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); | |
757 qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp); | |
758 } | |
759 } | |
760 | |
761 unsigned long elapsedTime = timeGetTime() - startTime; | |
762 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); | |
763 if (opt_terse == FALSE) { | |
764 printf("qsort: total # of string compares = %d\n", gCount); | |
765 printf("qsort: time per compare = %d ns\n", ns); | |
766 } else { | |
767 printf("%d, ", ns); | |
768 } | |
769 } | |
770 | |
771 | |
772 | |
773 //------------------------------------------------------------------------------
--------- | |
774 // | |
775 // doKeyHist() Output a table of data for | |
776 // average sort key size vs. string length. | |
777 // | |
778 //------------------------------------------------------------------------------
--------- | |
779 void doKeyHist() { | |
780 int i; | |
781 int maxLen = 0; | |
782 | |
783 // Find the maximum string length | |
784 for (i=0; i<gNumFileLines; i++) { | |
785 if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len; | |
786 } | |
787 | |
788 // Allocate arrays to hold the histogram data | |
789 int *accumulatedLen = new int[maxLen+1]; | |
790 int *numKeysOfSize = new int[maxLen+1]; | |
791 for (i=0; i<=maxLen; i++) { | |
792 accumulatedLen[i] = 0; | |
793 numKeysOfSize[i] = 0; | |
794 } | |
795 | |
796 // Fill the arrays... | |
797 for (i=0; i<gNumFileLines; i++) { | |
798 int len = gFileLines[i].len; | |
799 accumulatedLen[len] += strlen(gFileLines[i].icuSortKey); | |
800 numKeysOfSize[len] += 1; | |
801 } | |
802 | |
803 // And write out averages | |
804 printf("String Length, Avg Key Length, Avg Key Len per char\n"); | |
805 for (i=1; i<=maxLen; i++) { | |
806 if (numKeysOfSize[i] > 0) { | |
807 printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysO
fSize[i], | |
808 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i)); | |
809 } | |
810 } | |
811 delete []accumulatedLen; | |
812 delete []numKeysOfSize ; | |
813 } | |
814 | |
815 //------------------------------------------------------------------------------
--------- | |
816 // | |
817 // doForwardIterTest(UBool) Forward iteration test | |
818 // argument null-terminated string used | |
819 // | |
820 //------------------------------------------------------------------------------
--------- | |
821 void doForwardIterTest(UBool haslen) { | |
822 int count = 0; | |
823 | |
824 UErrorCode error = U_ZERO_ERROR; | |
825 printf("\n\nPerforming forward iteration performance test with "); | |
826 | |
827 if (haslen) { | |
828 printf("non-null terminated data -----------\n"); | |
829 } | |
830 else { | |
831 printf("null terminated data -----------\n"); | |
832 } | |
833 printf("performance test on strings from file -----------\n"); | |
834 | |
835 UChar dummytext[] = {0, 0}; | |
836 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); | |
837 ucol_setText(iter, dummytext, 1, &error); | |
838 | |
839 gCount = 0; | |
840 unsigned long startTime = timeGetTime(); | |
841 while (count < opt_loopCount) { | |
842 int linecount = 0; | |
843 while (linecount < gNumFileLines) { | |
844 UChar *str = gFileLines[linecount].name; | |
845 int strlen = haslen?gFileLines[linecount].len:-1; | |
846 ucol_setText(iter, str, strlen, &error); | |
847 while (ucol_next(iter, &error) != UCOL_NULLORDER) { | |
848 gCount++; | |
849 } | |
850 | |
851 linecount ++; | |
852 } | |
853 count ++; | |
854 } | |
855 unsigned long elapsedTime = timeGetTime() - startTime; | |
856 printf("elapsedTime %ld\n", elapsedTime); | |
857 | |
858 // empty loop recalculation | |
859 count = 0; | |
860 startTime = timeGetTime(); | |
861 while (count < opt_loopCount) { | |
862 int linecount = 0; | |
863 while (linecount < gNumFileLines) { | |
864 UChar *str = gFileLines[linecount].name; | |
865 int strlen = haslen?gFileLines[linecount].len:-1; | |
866 ucol_setText(iter, str, strlen, &error); | |
867 linecount ++; | |
868 } | |
869 count ++; | |
870 } | |
871 elapsedTime -= (timeGetTime() - startTime); | |
872 printf("elapsedTime %ld\n", elapsedTime); | |
873 | |
874 ucol_closeElements(iter); | |
875 | |
876 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); | |
877 printf("Total number of strings compared %d in %d loops\n", gNumFileLines, | |
878 opt_loopCount); | |
879 printf("Average time per ucol_next() nano seconds %d\n", ns); | |
880 | |
881 printf("performance test on skipped-5 concatenated strings from file -------
----\n"); | |
882 | |
883 UChar *str; | |
884 int strlen = 0; | |
885 // appending all the strings | |
886 int linecount = 0; | |
887 while (linecount < gNumFileLines) { | |
888 strlen += haslen?gFileLines[linecount].len: | |
889 u_strlen(gFileLines[linecount].name); | |
890 linecount ++; | |
891 } | |
892 str = (UChar *)malloc(sizeof(UChar) * strlen); | |
893 int strindex = 0; | |
894 linecount = 0; | |
895 while (strindex < strlen) { | |
896 int len = 0; | |
897 len += haslen?gFileLines[linecount].len: | |
898 u_strlen(gFileLines[linecount].name); | |
899 memcpy(str + strindex, gFileLines[linecount].name, | |
900 sizeof(UChar) * len); | |
901 strindex += len; | |
902 linecount ++; | |
903 } | |
904 | |
905 printf("Total size of strings %d\n", strlen); | |
906 | |
907 gCount = 0; | |
908 count = 0; | |
909 | |
910 if (!haslen) { | |
911 strlen = -1; | |
912 } | |
913 iter = ucol_openElements(gCol, str, strlen, &error); | |
914 if (!haslen) { | |
915 strlen = u_strlen(str); | |
916 } | |
917 strlen -= 5; // any left over characters are not iterated, | |
918 // this is to ensure the backwards and forwards iterators | |
919 // gets the same position | |
920 startTime = timeGetTime(); | |
921 while (count < opt_loopCount) { | |
922 int count5 = 5; | |
923 strindex = 0; | |
924 ucol_setOffset(iter, strindex, &error); | |
925 while (TRUE) { | |
926 if (ucol_next(iter, &error) == UCOL_NULLORDER) { | |
927 break; | |
928 } | |
929 gCount++; | |
930 count5 --; | |
931 if (count5 == 0) { | |
932 strindex += 10; | |
933 if (strindex > strlen) { | |
934 break; | |
935 } | |
936 ucol_setOffset(iter, strindex, &error); | |
937 count5 = 5; | |
938 } | |
939 } | |
940 count ++; | |
941 } | |
942 | |
943 elapsedTime = timeGetTime() - startTime; | |
944 printf("elapsedTime %ld\n", elapsedTime); | |
945 | |
946 // empty loop recalculation | |
947 int tempgCount = 0; | |
948 count = 0; | |
949 startTime = timeGetTime(); | |
950 while (count < opt_loopCount) { | |
951 int count5 = 5; | |
952 strindex = 0; | |
953 ucol_setOffset(iter, strindex, &error); | |
954 while (TRUE) { | |
955 tempgCount ++; | |
956 count5 --; | |
957 if (count5 == 0) { | |
958 strindex += 10; | |
959 if (strindex > strlen) { | |
960 break; | |
961 } | |
962 ucol_setOffset(iter, strindex, &error); | |
963 count5 = 5; | |
964 } | |
965 } | |
966 count ++; | |
967 } | |
968 elapsedTime -= (timeGetTime() - startTime); | |
969 printf("elapsedTime %ld\n", elapsedTime); | |
970 | |
971 ucol_closeElements(iter); | |
972 | |
973 printf("gCount %d\n", gCount); | |
974 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); | |
975 printf("Average time per ucol_next() nano seconds %d\n", ns); | |
976 } | |
977 | |
//---------------------------------------------------------------------------------------
//
//   doBackwardIterTest(UBool haslen)   Backwards collation-element iteration test.
//        haslen: TRUE  - pass explicit string lengths to the iterator API
//                FALSE - pass -1 and rely on NUL-terminated strings
//
//---------------------------------------------------------------------------------------
984 void doBackwardIterTest(UBool haslen) { | |
985 int count = 0; | |
986 UErrorCode error = U_ZERO_ERROR; | |
987 printf("\n\nPerforming backward iteration performance test with "); | |
988 | |
989 if (haslen) { | |
990 printf("non-null terminated data -----------\n"); | |
991 } | |
992 else { | |
993 printf("null terminated data -----------\n"); | |
994 } | |
995 | |
996 printf("performance test on strings from file -----------\n"); | |
997 | |
998 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); | |
999 UChar dummytext[] = {0, 0}; | |
1000 ucol_setText(iter, dummytext, 1, &error); | |
1001 | |
1002 gCount = 0; | |
1003 unsigned long startTime = timeGetTime(); | |
1004 while (count < opt_loopCount) { | |
1005 int linecount = 0; | |
1006 while (linecount < gNumFileLines) { | |
1007 UChar *str = gFileLines[linecount].name; | |
1008 int strlen = haslen?gFileLines[linecount].len:-1; | |
1009 ucol_setText(iter, str, strlen, &error); | |
1010 while (ucol_previous(iter, &error) != UCOL_NULLORDER) { | |
1011 gCount ++; | |
1012 } | |
1013 | |
1014 linecount ++; | |
1015 } | |
1016 count ++; | |
1017 } | |
1018 unsigned long elapsedTime = timeGetTime() - startTime; | |
1019 | |
1020 printf("elapsedTime %ld\n", elapsedTime); | |
1021 | |
1022 // empty loop recalculation | |
1023 count = 0; | |
1024 startTime = timeGetTime(); | |
1025 while (count < opt_loopCount) { | |
1026 int linecount = 0; | |
1027 while (linecount < gNumFileLines) { | |
1028 UChar *str = gFileLines[linecount].name; | |
1029 int strlen = haslen?gFileLines[linecount].len:-1; | |
1030 ucol_setText(iter, str, strlen, &error); | |
1031 linecount ++; | |
1032 } | |
1033 count ++; | |
1034 } | |
1035 elapsedTime -= (timeGetTime() - startTime); | |
1036 | |
1037 printf("elapsedTime %ld\n", elapsedTime); | |
1038 ucol_closeElements(iter); | |
1039 | |
1040 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); | |
1041 printf("Total number of strings compared %d in %d loops\n", gNumFileLines, | |
1042 opt_loopCount); | |
1043 printf("Average time per ucol_previous() nano seconds %d\n", ns); | |
1044 | |
1045 printf("performance test on skipped-5 concatenated strings from file -------
----\n"); | |
1046 | |
1047 UChar *str; | |
1048 int strlen = 0; | |
1049 // appending all the strings | |
1050 int linecount = 0; | |
1051 while (linecount < gNumFileLines) { | |
1052 strlen += haslen?gFileLines[linecount].len: | |
1053 u_strlen(gFileLines[linecount].name); | |
1054 linecount ++; | |
1055 } | |
1056 str = (UChar *)malloc(sizeof(UChar) * strlen); | |
1057 int strindex = 0; | |
1058 linecount = 0; | |
1059 while (strindex < strlen) { | |
1060 int len = 0; | |
1061 len += haslen?gFileLines[linecount].len: | |
1062 u_strlen(gFileLines[linecount].name); | |
1063 memcpy(str + strindex, gFileLines[linecount].name, | |
1064 sizeof(UChar) * len); | |
1065 strindex += len; | |
1066 linecount ++; | |
1067 } | |
1068 | |
1069 printf("Total size of strings %d\n", strlen); | |
1070 | |
1071 gCount = 0; | |
1072 count = 0; | |
1073 | |
1074 if (!haslen) { | |
1075 strlen = -1; | |
1076 } | |
1077 | |
1078 iter = ucol_openElements(gCol, str, strlen, &error); | |
1079 if (!haslen) { | |
1080 strlen = u_strlen(str); | |
1081 } | |
1082 | |
1083 startTime = timeGetTime(); | |
1084 while (count < opt_loopCount) { | |
1085 int count5 = 5; | |
1086 strindex = 5; | |
1087 ucol_setOffset(iter, strindex, &error); | |
1088 while (TRUE) { | |
1089 if (ucol_previous(iter, &error) == UCOL_NULLORDER) { | |
1090 break; | |
1091 } | |
1092 gCount ++; | |
1093 count5 --; | |
1094 if (count5 == 0) { | |
1095 strindex += 10; | |
1096 if (strindex > strlen) { | |
1097 break; | |
1098 } | |
1099 ucol_setOffset(iter, strindex, &error); | |
1100 count5 = 5; | |
1101 } | |
1102 } | |
1103 count ++; | |
1104 } | |
1105 | |
1106 elapsedTime = timeGetTime() - startTime; | |
1107 printf("elapsedTime %ld\n", elapsedTime); | |
1108 | |
1109 // empty loop recalculation | |
1110 count = 0; | |
1111 int tempgCount = 0; | |
1112 startTime = timeGetTime(); | |
1113 while (count < opt_loopCount) { | |
1114 int count5 = 5; | |
1115 strindex = 5; | |
1116 ucol_setOffset(iter, strindex, &error); | |
1117 while (TRUE) { | |
1118 tempgCount ++; | |
1119 count5 --; | |
1120 if (count5 == 0) { | |
1121 strindex += 10; | |
1122 if (strindex > strlen) { | |
1123 break; | |
1124 } | |
1125 ucol_setOffset(iter, strindex, &error); | |
1126 count5 = 5; | |
1127 } | |
1128 } | |
1129 count ++; | |
1130 } | |
1131 elapsedTime -= (timeGetTime() - startTime); | |
1132 printf("elapsedTime %ld\n", elapsedTime); | |
1133 ucol_closeElements(iter); | |
1134 | |
1135 printf("gCount %d\n", gCount); | |
1136 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); | |
1137 printf("Average time per ucol_previous() nano seconds %d\n", ns); | |
1138 } | |
1139 | |
1140 //------------------------------------------------------------------------------
--------- | |
1141 // | |
1142 // doIterTest() Iteration test | |
1143 // | |
1144 //------------------------------------------------------------------------------
--------- | |
1145 void doIterTest() { | |
1146 doForwardIterTest(opt_uselen); | |
1147 doBackwardIterTest(opt_uselen); | |
1148 } | |
1149 | |
1150 | |
1151 //------------------------------------------------------------------------------
---------- | |
1152 // | |
1153 // UnixConvert -- Convert the lines of the file to the encoding for UNIX | |
1154 // Since it appears that Unicode support is going in the gene
ral | |
1155 // direction of the use of UTF-8 locales, that is the approac
h | |
1156 // that is used here. | |
1157 // | |
1158 //------------------------------------------------------------------------------
---------- | |
1159 void UnixConvert() { | |
1160 int line; | |
1161 | |
1162 UConverter *cvrtr; // An ICU code page converter. | |
1163 UErrorCode status = U_ZERO_ERROR; | |
1164 | |
1165 | |
1166 cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales f
or now. | |
1167 if (U_FAILURE(status)) { | |
1168 fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status))
; | |
1169 exit(-1); | |
1170 } | |
1171 | |
1172 for (line=0; line < gNumFileLines; line++) { | |
1173 int sizeNeeded = ucnv_fromUChars(cvrtr, | |
1174 0, // ptr to target buffer. | |
1175 0, // length of target buffe
r. | |
1176 gFileLines[line].name, | |
1177 -1, // source is null termin
ated | |
1178 &status); | |
1179 if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) { | |
1180 //fprintf(stderr, "Conversion from Unicode, something is wrong.\n"); | |
1181 //exit(-1); | |
1182 } | |
1183 status = U_ZERO_ERROR; | |
1184 gFileLines[line].unixName = new char[sizeNeeded+1]; | |
1185 sizeNeeded = ucnv_fromUChars(cvrtr, | |
1186 gFileLines[line].unixName, // ptr to ta
rget buffer. | |
1187 sizeNeeded+1, // length of target buffe
r. | |
1188 gFileLines[line].name, | |
1189 -1, // source is null termin
ated | |
1190 &status); | |
1191 if (U_FAILURE(status)) { | |
1192 fprintf(stderr, "ICU Conversion Failed.: %d\n", status); | |
1193 exit(-1); | |
1194 } | |
1195 gFileLines[line].unixName[sizeNeeded] = 0; | |
1196 }; | |
1197 ucnv_close(cvrtr); | |
1198 } | |
1199 | |
1200 | |
1201 //------------------------------------------------------------------------------
---------- | |
1202 // | |
1203 // class UCharFile Class to hide all the gorp to read a file in | |
1204 // and produce a stream of UChars. | |
1205 // | |
1206 //------------------------------------------------------------------------------
---------- | |
1207 class UCharFile { | |
1208 public: | |
1209 UCharFile(const char *fileName); | |
1210 ~UCharFile(); | |
1211 UChar get(); | |
1212 UBool eof() {return fEof;}; | |
1213 UBool error() {return fError;}; | |
1214 | |
1215 private: | |
1216 UCharFile (const UCharFile & /*other*/) {}; // No co
py constructor. | |
1217 UCharFile & operator = (const UCharFile &/*other*/) {return *this;}; // No
assignment op | |
1218 | |
1219 FILE *fFile; | |
1220 const char *fName; | |
1221 UBool fEof; | |
1222 UBool fError; | |
1223 UChar fPending2ndSurrogate; | |
1224 | |
1225 enum {UTF16LE, UTF16BE, UTF8} fEncoding; | |
1226 }; | |
1227 | |
1228 UCharFile::UCharFile(const char * fileName) { | |
1229 fEof = FALSE; | |
1230 fError = FALSE; | |
1231 fName = fileName; | |
1232 fFile = fopen(fName, "rb"); | |
1233 fPending2ndSurrogate = 0; | |
1234 if (fFile == NULL) { | |
1235 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); | |
1236 fError = TRUE; | |
1237 return; | |
1238 } | |
1239 // | |
1240 // Look for the byte order mark at the start of the file. | |
1241 // | |
1242 int BOMC1, BOMC2, BOMC3; | |
1243 BOMC1 = fgetc(fFile); | |
1244 BOMC2 = fgetc(fFile); | |
1245 | |
1246 if (BOMC1 == 0xff && BOMC2 == 0xfe) { | |
1247 fEncoding = UTF16LE; } | |
1248 else if (BOMC1 == 0xfe && BOMC2 == 0xff) { | |
1249 fEncoding = UTF16BE; } | |
1250 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF )
{ | |
1251 fEncoding = UTF8; } | |
1252 else | |
1253 { | |
1254 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16
, and " | |
1255 "must include a BOM.\n", fileName); | |
1256 fError = true; | |
1257 return; | |
1258 } | |
1259 } | |
1260 | |
1261 | |
1262 UCharFile::~UCharFile() { | |
1263 fclose(fFile); | |
1264 } | |
1265 | |
1266 | |
1267 | |
1268 UChar UCharFile::get() { | |
1269 UChar c; | |
1270 switch (fEncoding) { | |
1271 case UTF16LE: | |
1272 { | |
1273 int cL, cH; | |
1274 cL = fgetc(fFile); | |
1275 cH = fgetc(fFile); | |
1276 c = cL | (cH << 8); | |
1277 if (cH == EOF) { | |
1278 c = 0; | |
1279 fEof = TRUE; | |
1280 } | |
1281 break; | |
1282 } | |
1283 case UTF16BE: | |
1284 { | |
1285 int cL, cH; | |
1286 cH = fgetc(fFile); | |
1287 cL = fgetc(fFile); | |
1288 c = cL | (cH << 8); | |
1289 if (cL == EOF) { | |
1290 c = 0; | |
1291 fEof = TRUE; | |
1292 } | |
1293 break; | |
1294 } | |
1295 case UTF8: | |
1296 { | |
1297 if (fPending2ndSurrogate != 0) { | |
1298 c = fPending2ndSurrogate; | |
1299 fPending2ndSurrogate = 0; | |
1300 break; | |
1301 } | |
1302 | |
1303 int ch = fgetc(fFile); // Note: c and ch are separate cause eof t
est doesn't work on UChar type. | |
1304 if (ch == EOF) { | |
1305 c = 0; | |
1306 fEof = TRUE; | |
1307 break; | |
1308 } | |
1309 | |
1310 if (ch <= 0x7f) { | |
1311 // It's ascii. No further utf-8 conversion. | |
1312 c = ch; | |
1313 break; | |
1314 } | |
1315 | |
1316 // Figure out the lenght of the char and read the rest of the bytes | |
1317 // into a temp array. | |
1318 int nBytes; | |
1319 if (ch >= 0xF0) {nBytes=4;} | |
1320 else if (ch >= 0xE0) {nBytes=3;} | |
1321 else if (ch >= 0xC0) {nBytes=2;} | |
1322 else { | |
1323 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); | |
1324 fError = TRUE; | |
1325 return 0; | |
1326 } | |
1327 | |
1328 unsigned char bytes[10]; | |
1329 bytes[0] = (unsigned char)ch; | |
1330 int i; | |
1331 for (i=1; i<nBytes; i++) { | |
1332 bytes[i] = fgetc(fFile); | |
1333 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { | |
1334 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n
"); | |
1335 fError = TRUE; | |
1336 return 0; | |
1337 } | |
1338 } | |
1339 | |
1340 // Convert the bytes from the temp array to a Unicode char. | |
1341 i = 0; | |
1342 uint32_t cp; | |
1343 U8_NEXT_UNSAFE(bytes, i, cp); | |
1344 c = (UChar)cp; | |
1345 | |
1346 if (cp >= 0x10000) { | |
1347 // The code point needs to be broken up into a utf-16 surrogate
pair. | |
1348 // Process first half this time through the main loop, and | |
1349 // remember the other half for the next time through. | |
1350 UChar utf16Buf[3]; | |
1351 i = 0; | |
1352 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); | |
1353 fPending2ndSurrogate = utf16Buf[1]; | |
1354 c = utf16Buf[0]; | |
1355 } | |
1356 break; | |
1357 }; | |
1358 default: | |
1359 c = 0xFFFD; /* Error, unspecified codepage*/ | |
1360 fprintf(stderr, "UCharFile: Error: unknown fEncoding\n"); | |
1361 exit(1); | |
1362 } | |
1363 return c; | |
1364 } | |
1365 | |
1366 //------------------------------------------------------------------------------
---------- | |
1367 // | |
1368 // openRulesCollator - Command line specified a rules file. Read it in | |
1369 // and open a collator with it. | |
1370 // | |
1371 //------------------------------------------------------------------------------
---------- | |
1372 UCollator *openRulesCollator() { | |
1373 UCharFile f(opt_rules); | |
1374 if (f.error()) { | |
1375 return 0; | |
1376 } | |
1377 | |
1378 int bufLen = 10000; | |
1379 UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar)); | |
1380 UChar *tmp; | |
1381 int i = 0; | |
1382 | |
1383 for(;;) { | |
1384 buf[i] = f.get(); | |
1385 if (f.eof()) { | |
1386 break; | |
1387 } | |
1388 if (f.error()) { | |
1389 return 0; | |
1390 } | |
1391 i++; | |
1392 if (i >= bufLen) { | |
1393 tmp = buf; | |
1394 bufLen += 10000; | |
1395 buf = (UChar *)realloc(buf, bufLen); | |
1396 if (buf == NULL) { | |
1397 free(tmp); | |
1398 return 0; | |
1399 } | |
1400 } | |
1401 } | |
1402 buf[i] = 0; | |
1403 | |
1404 UErrorCode status = U_ZERO_ERROR; | |
1405 UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF, | |
1406 UCOL_DEFAULT_STRENGTH, NULL, &status); | |
1407 if (U_FAILURE(status)) { | |
1408 fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status); | |
1409 return 0; | |
1410 } | |
1411 free(buf); | |
1412 return coll; | |
1413 } | |
1414 | |
1415 | |
1416 | |
1417 | |
1418 | |
1419 //------------------------------------------------------------------------------
---------- | |
1420 // | |
1421 // Main -- process command line, read in and pre-process the test file, | |
1422 // call other functions to do the actual tests. | |
1423 // | |
1424 //------------------------------------------------------------------------------
---------- | |
1425 int main(int argc, const char** argv) { | |
1426 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0)
{ | |
1427 printf(gUsageString); | |
1428 exit (1); | |
1429 } | |
1430 | |
1431 // Make sure that we've only got one API selected. | |
1432 if (opt_unix || opt_win) opt_icu = FALSE; | |
1433 if (opt_unix) opt_win = FALSE; | |
1434 | |
1435 // | |
1436 // Set up an ICU collator | |
1437 // | |
1438 UErrorCode status = U_ZERO_ERROR; | |
1439 | |
1440 if (opt_rules != 0) { | |
1441 gCol = openRulesCollator(); | |
1442 if (gCol == 0) {return -1;} | |
1443 } | |
1444 else { | |
1445 gCol = ucol_open(opt_locale, &status); | |
1446 if (U_FAILURE(status)) { | |
1447 fprintf(stderr, "Collator creation failed.: %d\n", status); | |
1448 return -1; | |
1449 } | |
1450 } | |
1451 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { | |
1452 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale)
; | |
1453 } | |
1454 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { | |
1455 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); | |
1456 } | |
1457 | |
1458 if (opt_norm) { | |
1459 ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); | |
1460 } | |
1461 if (opt_french && opt_frenchoff) { | |
1462 fprintf(stderr, "collperf: Error, specified both -french and -frenchoff
options."); | |
1463 exit(-1); | |
1464 } | |
1465 if (opt_french) { | |
1466 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status); | |
1467 } | |
1468 if (opt_frenchoff) { | |
1469 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status); | |
1470 } | |
1471 if (opt_lower) { | |
1472 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status); | |
1473 } | |
1474 if (opt_upper) { | |
1475 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status); | |
1476 } | |
1477 if (opt_case) { | |
1478 ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status); | |
1479 } | |
1480 if (opt_shifted) { | |
1481 ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status); | |
1482 } | |
1483 if (opt_level != 0) { | |
1484 switch (opt_level) { | |
1485 case 1: | |
1486 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status); | |
1487 break; | |
1488 case 2: | |
1489 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status); | |
1490 break; | |
1491 case 3: | |
1492 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status); | |
1493 break; | |
1494 case 4: | |
1495 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status); | |
1496 break; | |
1497 case 5: | |
1498 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status); | |
1499 break; | |
1500 default: | |
1501 fprintf(stderr, "-level param must be between 1 and 5\n"); | |
1502 exit(-1); | |
1503 } | |
1504 } | |
1505 | |
1506 if (U_FAILURE(status)) { | |
1507 fprintf(stderr, "Collator attribute setting failed.: %d\n", status); | |
1508 return -1; | |
1509 } | |
1510 | |
1511 | |
1512 // | |
1513 // Set up a Windows LCID | |
1514 // | |
1515 if (opt_langid != 0) { | |
1516 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); | |
1517 } | |
1518 else { | |
1519 gWinLCID = uloc_getLCID(opt_locale); | |
1520 } | |
1521 | |
1522 | |
1523 // | |
1524 // Set the UNIX locale | |
1525 // | |
1526 if (opt_unix) { | |
1527 if (setlocale(LC_ALL, opt_locale) == 0) { | |
1528 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); | |
1529 exit(-1); | |
1530 } | |
1531 } | |
1532 | |
1533 // Read in the input file. | |
1534 // File assumed to be utf-16. | |
1535 // Lines go onto heap buffers. Global index array to line starts is creat
ed. | |
1536 // Lines themselves are null terminated. | |
1537 // | |
1538 | |
1539 UCharFile f(opt_fName); | |
1540 if (f.error()) { | |
1541 exit(-1); | |
1542 } | |
1543 | |
1544 const int MAXLINES = 100000; | |
1545 gFileLines = new Line[MAXLINES]; | |
1546 UChar buf[1024]; | |
1547 int column = 0; | |
1548 | |
1549 // Read the file, split into lines, and save in memory. | |
1550 // Loop runs once per utf-16 value from the input file, | |
1551 // (The number of bytes read from file per loop iteration depends on exte
rnal encoding.) | |
1552 for (;;) { | |
1553 | |
1554 UChar c = f.get(); | |
1555 if (f.error()){ | |
1556 exit(-1); | |
1557 } | |
1558 | |
1559 | |
1560 // We now have a good UTF-16 value in c. | |
1561 | |
1562 // Watch for CR, LF, EOF; these finish off a line. | |
1563 if (c == 0xd) { | |
1564 continue; | |
1565 } | |
1566 | |
1567 if (f.eof() || c == 0x0a || c==0x2028) { // Unipad inserts 2028 line se
parators! | |
1568 buf[column++] = 0; | |
1569 if (column > 1) { | |
1570 gFileLines[gNumFileLines].name = new UChar[column]; | |
1571 gFileLines[gNumFileLines].len = column-1; | |
1572 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UCha
r)); | |
1573 gNumFileLines++; | |
1574 column = 0; | |
1575 if (gNumFileLines >= MAXLINES) { | |
1576 fprintf(stderr, "File too big. Max number of lines is %d\n"
, MAXLINES); | |
1577 exit(-1); | |
1578 } | |
1579 | |
1580 } | |
1581 if (c == 0xa || c == 0x2028) | |
1582 continue; | |
1583 else | |
1584 break; // EOF | |
1585 } | |
1586 buf[column++] = c; | |
1587 if (column >= 1023) | |
1588 { | |
1589 static UBool warnFlag = TRUE; | |
1590 if (warnFlag) { | |
1591 fprintf(stderr, "Warning - file line longer than 1023 chars trun
cated.\n"); | |
1592 warnFlag = FALSE; | |
1593 } | |
1594 column--; | |
1595 } | |
1596 } | |
1597 | |
1598 if (opt_terse == FALSE) { | |
1599 printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines); | |
1600 } | |
1601 | |
1602 | |
1603 // Convert the lines to the UNIX encoding. | |
1604 if (opt_unix) { | |
1605 UnixConvert(); | |
1606 } | |
1607 | |
1608 // | |
1609 // Pre-compute ICU sort keys for the lines of the file. | |
1610 // | |
1611 int line; | |
1612 int32_t t; | |
1613 | |
1614 for (line=0; line<gNumFileLines; line++) { | |
1615 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)b
uf, sizeof(buf)); | |
1616 gFileLines[line].icuSortKey = new char[t]; | |
1617 | |
1618 if (t > (int32_t)sizeof(buf)) { | |
1619 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char
*)gFileLines[line].icuSortKey , t); | |
1620 } | |
1621 else | |
1622 { | |
1623 memcpy(gFileLines[line].icuSortKey, buf, t); | |
1624 } | |
1625 } | |
1626 | |
1627 | |
1628 | |
1629 // | |
1630 // Pre-compute Windows sort keys for the lines of the file. | |
1631 // | |
1632 for (line=0; line<gNumFileLines; line++) { | |
1633 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf,
sizeof(buf)); | |
1634 gFileLines[line].winSortKey = new char[t]; | |
1635 if (t > (int32_t)sizeof(buf)) { | |
1636 t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1
, (unsigned short *)(gFileLines[line].winSortKey), t); | |
1637 } | |
1638 else | |
1639 { | |
1640 memcpy(gFileLines[line].winSortKey, buf, t); | |
1641 } | |
1642 } | |
1643 | |
1644 // | |
1645 // Pre-compute UNIX sort keys for the lines of the file. | |
1646 // | |
1647 if (opt_unix) { | |
1648 for (line=0; line<gNumFileLines; line++) { | |
1649 t=strxfrm((char *)buf, gFileLines[line].unixName, sizeof(buf)); | |
1650 gFileLines[line].unixSortKey = new char[t]; | |
1651 if (t > (int32_t)sizeof(buf)) { | |
1652 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unix
Name, sizeof(buf)); | |
1653 } | |
1654 else | |
1655 { | |
1656 memcpy(gFileLines[line].unixSortKey, buf, t); | |
1657 } | |
1658 } | |
1659 } | |
1660 | |
1661 | |
1662 // | |
1663 // Dump file lines, CEs, Sort Keys if requested. | |
1664 // | |
1665 if (opt_dump) { | |
1666 int i; | |
1667 for (line=0; line<gNumFileLines; line++) { | |
1668 for (i=0;;i++) { | |
1669 UChar c = gFileLines[line].name[i]; | |
1670 if (c == 0) | |
1671 break; | |
1672 if (c < 0x20 || c > 0x7e) { | |
1673 printf("\\u%.4x", c); | |
1674 } | |
1675 else { | |
1676 printf("%c", c); | |
1677 } | |
1678 } | |
1679 printf("\n"); | |
1680 | |
1681 printf(" CEs: "); | |
1682 UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line
].name, -1, &status); | |
1683 int32_t ce; | |
1684 i = 0; | |
1685 for (;;) { | |
1686 ce = ucol_next(CEiter, &status); | |
1687 if (ce == UCOL_NULLORDER) { | |
1688 break; | |
1689 } | |
1690 printf(" %.8x", ce); | |
1691 if (++i > 8) { | |
1692 printf("\n "); | |
1693 i = 0; | |
1694 } | |
1695 } | |
1696 printf("\n"); | |
1697 ucol_closeElements(CEiter); | |
1698 | |
1699 | |
1700 printf(" ICU Sort Key: "); | |
1701 for (i=0; ; i++) { | |
1702 unsigned char c = gFileLines[line].icuSortKey[i]; | |
1703 printf("%02x ", c); | |
1704 if (c == 0) { | |
1705 break; | |
1706 } | |
1707 if (i > 0 && i % 20 == 0) { | |
1708 printf("\n "); | |
1709 } | |
1710 } | |
1711 printf("\n"); | |
1712 } | |
1713 } | |
1714 | |
1715 | |
1716 // | |
1717 // Pre-sort the lines. | |
1718 // | |
1719 int i; | |
1720 gSortedLines = new Line *[gNumFileLines]; | |
1721 for (i=0; i<gNumFileLines; i++) { | |
1722 gSortedLines[i] = &gFileLines[i]; | |
1723 } | |
1724 | |
1725 if (opt_win) { | |
1726 qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp); | |
1727 } | |
1728 else if (opt_unix) { | |
1729 qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp); | |
1730 } | |
1731 else /* ICU */ | |
1732 { | |
1733 qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp); | |
1734 } | |
1735 | |
1736 | |
1737 // | |
1738 // Make up a randomized order, will be used for sorting tests. | |
1739 // | |
1740 gRandomLines = new Line *[gNumFileLines]; | |
1741 for (i=0; i<gNumFileLines; i++) { | |
1742 gRandomLines[i] = &gFileLines[i]; | |
1743 } | |
1744 qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp); | |
1745 | |
1746 | |
1747 | |
1748 | |
1749 // | |
1750 // We've got the file read into memory. Go do something with it. | |
1751 // | |
1752 | |
1753 if (opt_qsort) doQSort(); | |
1754 if (opt_binsearch) doBinarySearch(); | |
1755 if (opt_keygen) doKeyGen(); | |
1756 if (opt_keyhist) doKeyHist(); | |
1757 if (opt_itertest) doIterTest(); | |
1758 | |
1759 return 0; | |
1760 | |
1761 } | |
OLD | NEW |