Index: source/test/perf/collationperf/collperf.cpp |
diff --git a/source/test/perf/collationperf/collperf.cpp b/source/test/perf/collationperf/collperf.cpp |
deleted file mode 100644 |
index 1ce078176178ee80d2c288593729b3c2a2af3a5a..0000000000000000000000000000000000000000 |
--- a/source/test/perf/collationperf/collperf.cpp |
+++ /dev/null |
@@ -1,1761 +0,0 @@ |
-/******************************************************************** |
- * COPYRIGHT: |
- * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. |
- * |
- ********************************************************************/ |
-/******************************************************************************** |
-* |
-* File CALLCOLL.C |
-* |
-* Modification History: |
-* Name Description |
-* Andy Heninger First Version |
-* |
-********************************************************************************* |
-*/ |
- |
-// |
-// This program tests string collation and sort key generation performance. |
-// Three APIs can be teste: ICU C , Unix strcoll, strxfrm and Windows LCMapString |
-// A file of names is required as input, one per line. It must be in utf-8 or utf-16 format, |
-// and include a byte order mark. Either LE or BE format is OK. |
-// |
- |
-const char gUsageString[] = |
- "usage: collperf options...\n" |
- "-help Display this message.\n" |
- "-file file_name utf-16 format file of names.\n" |
- "-locale name ICU locale to use. Default is en_US\n" |
- "-rules file_name Collation rules file (overrides locale)\n" |
- "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n" |
- " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n" |
- "-win Run test using Windows native services. (ICU is default)\n" |
- "-unix Run test using Unix strxfrm, strcoll services.\n" |
- "-uselen Use API with string lengths. Default is null-terminated strings\n" |
- "-usekeys Run tests using sortkeys rather than strcoll\n" |
- "-strcmp Run tests using u_strcmp rather than strcoll\n" |
- "-strcmpCPO Run tests using u_strcmpCodePointOrder rather than strcoll\n" |
- "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n" |
- "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n" |
- " under test at each call point. For measuring test overhead.\n" |
- "-terse Terse numbers-only output. Intended for use by scripts.\n" |
- "-french French accent ordering\n" |
- "-frenchoff No French accent ordering (for use with French locales.)\n" |
- "-norm Normalizing mode on\n" |
- "-shifted Shifted mode\n" |
- "-lower Lower case first\n" |
- "-upper Upper case first\n" |
- "-case Enable separate case level\n" |
- "-level n Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n" |
- "-keyhist Produce a table sort key size vs. string length\n" |
- "-binsearch Binary Search timing test\n" |
- "-keygen Sort Key Generation timing test\n" |
- "-qsort Quicksort timing test\n" |
- "-iter Iteration Performance Test\n" |
- "-dump Display strings, sort keys and CEs.\n" |
- ; |
- |
- |
- |
-#include <stdio.h> |
-#include <string.h> |
-#include <stdlib.h> |
-#include <math.h> |
-#include <locale.h> |
-#include <errno.h> |
- |
-#include <unicode/utypes.h> |
-#include <unicode/ucol.h> |
-#include <unicode/ucoleitr.h> |
-#include <unicode/uloc.h> |
-#include <unicode/ustring.h> |
-#include <unicode/ures.h> |
-#include <unicode/uchar.h> |
-#include <unicode/ucnv.h> |
-#include <unicode/utf8.h> |
- |
-#ifdef WIN32 |
-#include <windows.h> |
-#else |
-// |
-// Stubs for Windows API functions when building on UNIXes. |
-// |
-typedef int DWORD; |
-inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;} |
-#include <sys/time.h> |
-unsigned long timeGetTime() { |
- struct timeval t; |
- gettimeofday(&t, 0); |
- unsigned long val = t.tv_sec * 1000; // Let it overflow. Who cares. |
- val += t.tv_usec / 1000; |
- return val; |
-} |
-inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;} |
-const int LCMAP_SORTKEY = 0; |
-#define MAKELCID(a,b) 0 |
-const int SORT_DEFAULT = 0; |
-#endif |
- |
- |
- |
-// |
-// Command line option variables |
-// These global variables are set according to the options specified |
-// on the command line by the user. |
-char * opt_fName = 0; |
-const char * opt_locale = "en_US"; |
-int opt_langid = 0; // Defaults to value corresponding to opt_locale. |
-char * opt_rules = 0; |
-UBool opt_help = FALSE; |
-int opt_loopCount = 1; |
-int opt_iLoopCount = 1; |
-UBool opt_terse = FALSE; |
-UBool opt_qsort = FALSE; |
-UBool opt_binsearch = FALSE; |
-UBool opt_icu = TRUE; |
-UBool opt_win = FALSE; // Run with Windows native functions. |
-UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. |
-UBool opt_uselen = FALSE; |
-UBool opt_usekeys = FALSE; |
-UBool opt_strcmp = FALSE; |
-UBool opt_strcmpCPO = FALSE; |
-UBool opt_norm = FALSE; |
-UBool opt_keygen = FALSE; |
-UBool opt_french = FALSE; |
-UBool opt_frenchoff = FALSE; |
-UBool opt_shifted = FALSE; |
-UBool opt_lower = FALSE; |
-UBool opt_upper = FALSE; |
-UBool opt_case = FALSE; |
-int opt_level = 0; |
-UBool opt_keyhist = FALSE; |
-UBool opt_itertest = FALSE; |
-UBool opt_dump = FALSE; |
- |
- |
- |
-// |
-// Definitions for the command line options |
-// |
-struct OptSpec { |
- const char *name; |
- enum {FLAG, NUM, STRING} type; |
- void *pVar; |
-}; |
- |
-OptSpec opts[] = { |
- {"-file", OptSpec::STRING, &opt_fName}, |
- {"-locale", OptSpec::STRING, &opt_locale}, |
- {"-langid", OptSpec::NUM, &opt_langid}, |
- {"-rules", OptSpec::STRING, &opt_rules}, |
- {"-qsort", OptSpec::FLAG, &opt_qsort}, |
- {"-binsearch", OptSpec::FLAG, &opt_binsearch}, |
- {"-iter", OptSpec::FLAG, &opt_itertest}, |
- {"-win", OptSpec::FLAG, &opt_win}, |
- {"-unix", OptSpec::FLAG, &opt_unix}, |
- {"-uselen", OptSpec::FLAG, &opt_uselen}, |
- {"-usekeys", OptSpec::FLAG, &opt_usekeys}, |
- {"-strcmp", OptSpec::FLAG, &opt_strcmp}, |
- {"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO}, |
- {"-norm", OptSpec::FLAG, &opt_norm}, |
- {"-french", OptSpec::FLAG, &opt_french}, |
- {"-frenchoff", OptSpec::FLAG, &opt_frenchoff}, |
- {"-shifted", OptSpec::FLAG, &opt_shifted}, |
- {"-lower", OptSpec::FLAG, &opt_lower}, |
- {"-upper", OptSpec::FLAG, &opt_upper}, |
- {"-case", OptSpec::FLAG, &opt_case}, |
- {"-level", OptSpec::NUM, &opt_level}, |
- {"-keyhist", OptSpec::FLAG, &opt_keyhist}, |
- {"-keygen", OptSpec::FLAG, &opt_keygen}, |
- {"-loop", OptSpec::NUM, &opt_loopCount}, |
- {"-iloop", OptSpec::NUM, &opt_iLoopCount}, |
- {"-terse", OptSpec::FLAG, &opt_terse}, |
- {"-dump", OptSpec::FLAG, &opt_dump}, |
- {"-help", OptSpec::FLAG, &opt_help}, |
- {"-?", OptSpec::FLAG, &opt_help}, |
- {0, OptSpec::FLAG, 0} |
-}; |
- |
- |
-//--------------------------------------------------------------------------- |
-// |
-// Global variables pointing to and describing the test file |
-// |
-//--------------------------------------------------------------------------- |
- |
-// |
-// struct Line |
-// |
-// Each line from the source file (containing a name, presumably) gets |
-// one of these structs. |
-// |
-struct Line { |
- UChar *name; |
- int len; |
- char *winSortKey; |
- char *icuSortKey; |
- char *unixSortKey; |
- char *unixName; |
-}; |
- |
- |
- |
-Line *gFileLines; // Ptr to array of Line structs, one per line in the file. |
-int gNumFileLines; |
-UCollator *gCol; |
-DWORD gWinLCID; |
- |
-Line **gSortedLines; |
-Line **gRandomLines; |
-int gCount; |
- |
- |
- |
-//--------------------------------------------------------------------------- |
-// |
-// ProcessOptions() Function to read the command line options. |
-// |
-//--------------------------------------------------------------------------- |
-UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) |
-{ |
- int i; |
- int argNum; |
- const char *pArgName; |
- OptSpec *pOpt; |
- |
- for (argNum=1; argNum<argc; argNum++) { |
- pArgName = argv[argNum]; |
- for (pOpt = opts; pOpt->name != 0; pOpt++) { |
- if (strcmp(pOpt->name, pArgName) == 0) { |
- switch (pOpt->type) { |
- case OptSpec::FLAG: |
- *(UBool *)(pOpt->pVar) = TRUE; |
- break; |
- case OptSpec::STRING: |
- argNum ++; |
- if (argNum >= argc) { |
- fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); |
- return FALSE; |
- } |
- *(const char **)(pOpt->pVar) = argv[argNum]; |
- break; |
- case OptSpec::NUM: |
- argNum ++; |
- if (argNum >= argc) { |
- fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); |
- return FALSE; |
- } |
- char *endp; |
- i = strtol(argv[argNum], &endp, 0); |
- if (endp == argv[argNum]) { |
- fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); |
- return FALSE; |
- } |
- *(int *)(pOpt->pVar) = i; |
- } |
- break; |
- } |
- } |
- if (pOpt->name == 0) |
- { |
- fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); |
- return FALSE; |
- } |
- } |
-return TRUE; |
-} |
- |
-//--------------------------------------------------------------------------------------- |
-// |
-// Comparison functions for use by qsort. |
-// |
-// Six flavors, ICU or Windows, SortKey or String Compare, Strings with length |
-// or null terminated. |
-// |
-//--------------------------------------------------------------------------------------- |
-int ICUstrcmpK(const void *a, const void *b) { |
- gCount++; |
- int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey); |
- return t; |
-} |
- |
- |
-int ICUstrcmpL(const void *a, const void *b) { |
- gCount++; |
- UCollationResult t; |
- t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len); |
- if (t == UCOL_LESS) return -1; |
- if (t == UCOL_GREATER) return +1; |
- return 0; |
-} |
- |
- |
-int ICUstrcmp(const void *a, const void *b) { |
- gCount++; |
- UCollationResult t; |
- t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); |
- if (t == UCOL_LESS) return -1; |
- if (t == UCOL_GREATER) return +1; |
- return 0; |
-} |
- |
- |
-int Winstrcmp(const void *a, const void *b) { |
- gCount++; |
- int t; |
- t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1); |
- return t-2; |
-} |
- |
- |
-int UNIXstrcmp(const void *a, const void *b) { |
- gCount++; |
- int t; |
- t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName); |
- return t; |
-} |
- |
- |
-int WinstrcmpL(const void *a, const void *b) { |
- gCount++; |
- int t; |
- t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len); |
- return t-2; |
-} |
- |
- |
-int WinstrcmpK(const void *a, const void *b) { |
- gCount++; |
- int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey); |
- return t; |
-} |
- |
- |
-//--------------------------------------------------------------------------------------- |
-// |
-// Function for sorting the names (lines) into a random order. |
-// Order is based on a hash of the ICU Sort key for the lines |
-// The randomized order is used as input for the sorting timing tests. |
-// |
-//--------------------------------------------------------------------------------------- |
-int ICURandomCmp(const void *a, const void *b) { |
- char *ask = (*(Line **)a)->icuSortKey; |
- char *bsk = (*(Line **)b)->icuSortKey; |
- int aVal = 0; |
- int bVal = 0; |
- int retVal; |
- while (*ask != 0) { |
- aVal += aVal*37 + *ask++; |
- } |
- while (*bsk != 0) { |
- bVal += bVal*37 + *bsk++; |
- } |
- retVal = -1; |
- if (aVal == bVal) { |
- retVal = 0; |
- } |
- else if (aVal > bVal) { |
- retVal = 1; |
- } |
- return retVal; |
-} |
- |
-//--------------------------------------------------------------------------------------- |
-// |
-// doKeyGen() Key Generation Timing Test |
-// |
-//--------------------------------------------------------------------------------------- |
-void doKeyGen() |
-{ |
- int line; |
- int loops = 0; |
- int iLoop; |
- int t; |
- int len=-1; |
- |
- // Adjust loop count to compensate for file size. Should be order n |
- double dLoopCount = double(opt_loopCount) * (1000. / double(gNumFileLines)); |
- int adj_loopCount = int(dLoopCount); |
- if (adj_loopCount < 1) adj_loopCount = 1; |
- |
- |
- unsigned long startTime = timeGetTime(); |
- |
- if (opt_win) { |
- for (loops=0; loops<adj_loopCount; loops++) { |
- for (line=0; line < gNumFileLines; line++) { |
- if (opt_uselen) { |
- len = gFileLines[line].len; |
- } |
- for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { |
- t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, |
- gFileLines[line].name, len, |
- (unsigned short *)gFileLines[line].winSortKey, 5000); // TODO something with length. |
- } |
- } |
- } |
- } |
- else if (opt_icu) |
- { |
- for (loops=0; loops<adj_loopCount; loops++) { |
- for (line=0; line < gNumFileLines; line++) { |
- if (opt_uselen) { |
- len = gFileLines[line].len; |
- } |
- for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { |
- t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000); |
- } |
- } |
- } |
- } |
- else if (opt_unix) |
- { |
- for (loops=0; loops<adj_loopCount; loops++) { |
- for (line=0; line < gNumFileLines; line++) { |
- for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { |
- t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000); |
- } |
- } |
- } |
- } |
- |
- unsigned long elapsedTime = timeGetTime() - startTime; |
- int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines)); |
- |
- if (opt_terse == FALSE) { |
- printf("Sort Key Generation: total # of keys = %d\n", loops*gNumFileLines); |
- printf("Sort Key Generation: time per key = %d ns\n", ns); |
- } |
- else { |
- printf("%d, ", ns); |
- } |
- |
- int totalKeyLen = 0; |
- int totalChars = 0; |
- for (line=0; line<gNumFileLines; line++) { |
- totalChars += u_strlen(gFileLines[line].name); |
- if (opt_win) { |
- totalKeyLen += strlen(gFileLines[line].winSortKey); |
- } |
- else if (opt_icu) { |
- totalKeyLen += strlen(gFileLines[line].icuSortKey); |
- } |
- else if (opt_unix) { |
- totalKeyLen += strlen(gFileLines[line].unixSortKey); |
- } |
- |
- } |
- if (opt_terse == FALSE) { |
- printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars); |
- } else { |
- printf("%f, ", (float)totalKeyLen / (float)totalChars); |
- } |
-} |
- |
- |
- |
-//--------------------------------------------------------------------------------------- |
-// |
-// doBinarySearch() Binary Search timing test. Each name from the list |
-// is looked up in the full sorted list of names. |
-// |
-//--------------------------------------------------------------------------------------- |
-void doBinarySearch() |
-{ |
- |
- gCount = 0; |
- int line; |
- int loops = 0; |
- int iLoop = 0; |
- unsigned long elapsedTime = 0; |
- |
- // Adjust loop count to compensate for file size. Should be order n (lookups) * log n (compares/lookup) |
- // Accurate timings do not depend on this being perfect. The correction is just to try to |
- // get total running times of about the right order, so the that user doesn't need to |
- // manually adjust the loop count for every different file size. |
- double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines)); |
- if (opt_usekeys) dLoopCount *= 5; |
- int adj_loopCount = int(dLoopCount); |
- if (adj_loopCount < 1) adj_loopCount = 1; |
- |
- |
- for (;;) { // not really a loop, just allows "break" to work, to simplify |
- // inadvertantly running more than one test through here. |
- if (opt_strcmp || opt_strcmpCPO) |
- { |
- unsigned long startTime = timeGetTime(); |
- typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *); |
- PF pf = u_strcmp; |
- if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;} |
- //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;} // Damn the difference between int32_t and int |
- // which forces the use of a cast here. |
- |
- int r = 0; |
- for (loops=0; loops<adj_loopCount; loops++) { |
- |
- for (line=0; line < gNumFileLines; line++) { |
- int hi = gNumFileLines-1; |
- int lo = 0; |
- int guess = -1; |
- for (;;) { |
- int newGuess = (hi + lo) / 2; |
- if (newGuess == guess) |
- break; |
- guess = newGuess; |
- for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { |
- r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name); |
- } |
- gCount++; |
- if (r== 0) |
- break; |
- if (r < 0) |
- hi = guess; |
- else |
- lo = guess; |
- } |
- } |
- } |
- elapsedTime = timeGetTime() - startTime; |
- break; |
- } |
- |
- |
- if (opt_icu) |
- { |
- unsigned long startTime = timeGetTime(); |
- UCollationResult r = UCOL_EQUAL; |
- for (loops=0; loops<adj_loopCount; loops++) { |
- |
- for (line=0; line < gNumFileLines; line++) { |
- int lineLen = -1; |
- int guessLen = -1; |
- if (opt_uselen) { |
- lineLen = (gSortedLines[line])->len; |
- } |
- int hi = gNumFileLines-1; |
- int lo = 0; |
- int guess = -1; |
- for (;;) { |
- int newGuess = (hi + lo) / 2; |
- if (newGuess == guess) |
- break; |
- guess = newGuess; |
- int ri = 0; |
- if (opt_usekeys) { |
- for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { |
- ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey); |
- } |
- gCount++; |
- r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;} |
- } |
- else |
- { |
- if (opt_uselen) { |
- guessLen = (gSortedLines[guess])->len; |
- } |
- for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { |
- r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen); |
- } |
- gCount++; |
- } |
- if (r== UCOL_EQUAL) |
- break; |
- if (r == UCOL_LESS) |
- hi = guess; |
- else |
- lo = guess; |
- } |
- } |
- } |
- elapsedTime = timeGetTime() - startTime; |
- break; |
- } |
- |
- if (opt_win) |
- { |
- unsigned long startTime = timeGetTime(); |
- int r = 0; |
- for (loops=0; loops<adj_loopCount; loops++) { |
- |
- for (line=0; line < gNumFileLines; line++) { |
- int lineLen = -1; |
- int guessLen = -1; |
- if (opt_uselen) { |
- lineLen = (gSortedLines[line])->len; |
- } |
- int hi = gNumFileLines-1; |
- int lo = 0; |
- int guess = -1; |
- for (;;) { |
- int newGuess = (hi + lo) / 2; |
- if (newGuess == guess) |
- break; |
- guess = newGuess; |
- if (opt_usekeys) { |
- for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { |
- r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey); |
- } |
- gCount++; |
- r+=2; |
- } |
- else |
- { |
- if (opt_uselen) { |
- guessLen = (gSortedLines[guess])->len; |
- } |
- for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { |
- r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen); |
- } |
- if (r == 0) { |
- if (opt_terse == FALSE) { |
- fprintf(stderr, "Error returned from Windows CompareStringW.\n"); |
- } |
- exit(-1); |
- } |
- gCount++; |
- } |
- if (r== 2) // strings == |
- break; |
- if (r == 1) // line < guess |
- hi = guess; |
- else // line > guess |
- lo = guess; |
- } |
- } |
- } |
- elapsedTime = timeGetTime() - startTime; |
- break; |
- } |
- |
- if (opt_unix) |
- { |
- unsigned long startTime = timeGetTime(); |
- int r = 0; |
- for (loops=0; loops<adj_loopCount; loops++) { |
- |
- for (line=0; line < gNumFileLines; line++) { |
- int hi = gNumFileLines-1; |
- int lo = 0; |
- int guess = -1; |
- for (;;) { |
- int newGuess = (hi + lo) / 2; |
- if (newGuess == guess) |
- break; |
- guess = newGuess; |
- if (opt_usekeys) { |
- for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { |
- r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey); |
- } |
- gCount++; |
- } |
- else |
- { |
- for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) { |
- r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName); |
- } |
- errno = 0; |
- if (errno != 0) { |
- fprintf(stderr, "Error %d returned from strcoll.\n", errno); |
- exit(-1); |
- } |
- gCount++; |
- } |
- if (r == 0) // strings == |
- break; |
- if (r < 0) // line < guess |
- hi = guess; |
- else // line > guess |
- lo = guess; |
- } |
- } |
- } |
- elapsedTime = timeGetTime() - startTime; |
- break; |
- } |
- break; |
- } |
- |
- int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); |
- if (opt_terse == FALSE) { |
- printf("binary search: total # of string compares = %d\n", gCount); |
- printf("binary search: compares per loop = %d\n", gCount / loops); |
- printf("binary search: time per compare = %d ns\n", ns); |
- } else { |
- printf("%d, ", ns); |
- } |
- |
-} |
- |
- |
- |
- |
-//--------------------------------------------------------------------------------------- |
-// |
-// doQSort() The quick sort timing test. Uses the C library qsort function. |
-// |
-//--------------------------------------------------------------------------------------- |
-void doQSort() { |
- int i; |
- Line **sortBuf = new Line *[gNumFileLines]; |
- |
- // Adjust loop count to compensate for file size. QSort should be n log(n) |
- double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines)); |
- if (opt_usekeys) dLoopCount *= 5; |
- int adj_loopCount = int(dLoopCount); |
- if (adj_loopCount < 1) adj_loopCount = 1; |
- |
- |
- gCount = 0; |
- unsigned long startTime = timeGetTime(); |
- if (opt_win && opt_usekeys) { |
- for (i=0; i<opt_loopCount; i++) { |
- memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); |
- qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK); |
- } |
- } |
- |
- else if (opt_win && opt_uselen) { |
- for (i=0; i<adj_loopCount; i++) { |
- memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); |
- qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL); |
- } |
- } |
- |
- |
- else if (opt_win && !opt_uselen) { |
- for (i=0; i<adj_loopCount; i++) { |
- memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); |
- qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp); |
- } |
- } |
- |
- else if (opt_icu && opt_usekeys) { |
- for (i=0; i<adj_loopCount; i++) { |
- memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); |
- qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK); |
- } |
- } |
- |
- else if (opt_icu && opt_uselen) { |
- for (i=0; i<adj_loopCount; i++) { |
- memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); |
- qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL); |
- } |
- } |
- |
- |
- else if (opt_icu && !opt_uselen) { |
- for (i=0; i<adj_loopCount; i++) { |
- memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); |
- qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp); |
- } |
- } |
- |
- else if (opt_unix && !opt_usekeys) { |
- for (i=0; i<adj_loopCount; i++) { |
- memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *)); |
- qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp); |
- } |
- } |
- |
- unsigned long elapsedTime = timeGetTime() - startTime; |
- int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); |
- if (opt_terse == FALSE) { |
- printf("qsort: total # of string compares = %d\n", gCount); |
- printf("qsort: time per compare = %d ns\n", ns); |
- } else { |
- printf("%d, ", ns); |
- } |
-} |
- |
- |
- |
-//--------------------------------------------------------------------------------------- |
-// |
-// doKeyHist() Output a table of data for |
-// average sort key size vs. string length. |
-// |
-//--------------------------------------------------------------------------------------- |
-void doKeyHist() { |
- int i; |
- int maxLen = 0; |
- |
- // Find the maximum string length |
- for (i=0; i<gNumFileLines; i++) { |
- if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len; |
- } |
- |
- // Allocate arrays to hold the histogram data |
- int *accumulatedLen = new int[maxLen+1]; |
- int *numKeysOfSize = new int[maxLen+1]; |
- for (i=0; i<=maxLen; i++) { |
- accumulatedLen[i] = 0; |
- numKeysOfSize[i] = 0; |
- } |
- |
- // Fill the arrays... |
- for (i=0; i<gNumFileLines; i++) { |
- int len = gFileLines[i].len; |
- accumulatedLen[len] += strlen(gFileLines[i].icuSortKey); |
- numKeysOfSize[len] += 1; |
- } |
- |
- // And write out averages |
- printf("String Length, Avg Key Length, Avg Key Len per char\n"); |
- for (i=1; i<=maxLen; i++) { |
- if (numKeysOfSize[i] > 0) { |
- printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i], |
- (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i)); |
- } |
- } |
- delete []accumulatedLen; |
- delete []numKeysOfSize ; |
-} |
- |
-//--------------------------------------------------------------------------------------- |
-// |
-// doForwardIterTest(UBool) Forward iteration test |
-// argument null-terminated string used |
-// |
-//--------------------------------------------------------------------------------------- |
-void doForwardIterTest(UBool haslen) { |
- int count = 0; |
- |
- UErrorCode error = U_ZERO_ERROR; |
- printf("\n\nPerforming forward iteration performance test with "); |
- |
- if (haslen) { |
- printf("non-null terminated data -----------\n"); |
- } |
- else { |
- printf("null terminated data -----------\n"); |
- } |
- printf("performance test on strings from file -----------\n"); |
- |
- UChar dummytext[] = {0, 0}; |
- UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); |
- ucol_setText(iter, dummytext, 1, &error); |
- |
- gCount = 0; |
- unsigned long startTime = timeGetTime(); |
- while (count < opt_loopCount) { |
- int linecount = 0; |
- while (linecount < gNumFileLines) { |
- UChar *str = gFileLines[linecount].name; |
- int strlen = haslen?gFileLines[linecount].len:-1; |
- ucol_setText(iter, str, strlen, &error); |
- while (ucol_next(iter, &error) != UCOL_NULLORDER) { |
- gCount++; |
- } |
- |
- linecount ++; |
- } |
- count ++; |
- } |
- unsigned long elapsedTime = timeGetTime() - startTime; |
- printf("elapsedTime %ld\n", elapsedTime); |
- |
- // empty loop recalculation |
- count = 0; |
- startTime = timeGetTime(); |
- while (count < opt_loopCount) { |
- int linecount = 0; |
- while (linecount < gNumFileLines) { |
- UChar *str = gFileLines[linecount].name; |
- int strlen = haslen?gFileLines[linecount].len:-1; |
- ucol_setText(iter, str, strlen, &error); |
- linecount ++; |
- } |
- count ++; |
- } |
- elapsedTime -= (timeGetTime() - startTime); |
- printf("elapsedTime %ld\n", elapsedTime); |
- |
- ucol_closeElements(iter); |
- |
- int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); |
- printf("Total number of strings compared %d in %d loops\n", gNumFileLines, |
- opt_loopCount); |
- printf("Average time per ucol_next() nano seconds %d\n", ns); |
- |
- printf("performance test on skipped-5 concatenated strings from file -----------\n"); |
- |
- UChar *str; |
- int strlen = 0; |
- // appending all the strings |
- int linecount = 0; |
- while (linecount < gNumFileLines) { |
- strlen += haslen?gFileLines[linecount].len: |
- u_strlen(gFileLines[linecount].name); |
- linecount ++; |
- } |
- str = (UChar *)malloc(sizeof(UChar) * strlen); |
- int strindex = 0; |
- linecount = 0; |
- while (strindex < strlen) { |
- int len = 0; |
- len += haslen?gFileLines[linecount].len: |
- u_strlen(gFileLines[linecount].name); |
- memcpy(str + strindex, gFileLines[linecount].name, |
- sizeof(UChar) * len); |
- strindex += len; |
- linecount ++; |
- } |
- |
- printf("Total size of strings %d\n", strlen); |
- |
- gCount = 0; |
- count = 0; |
- |
- if (!haslen) { |
- strlen = -1; |
- } |
- iter = ucol_openElements(gCol, str, strlen, &error); |
- if (!haslen) { |
- strlen = u_strlen(str); |
- } |
- strlen -= 5; // any left over characters are not iterated, |
- // this is to ensure the backwards and forwards iterators |
- // gets the same position |
- startTime = timeGetTime(); |
- while (count < opt_loopCount) { |
- int count5 = 5; |
- strindex = 0; |
- ucol_setOffset(iter, strindex, &error); |
- while (TRUE) { |
- if (ucol_next(iter, &error) == UCOL_NULLORDER) { |
- break; |
- } |
- gCount++; |
- count5 --; |
- if (count5 == 0) { |
- strindex += 10; |
- if (strindex > strlen) { |
- break; |
- } |
- ucol_setOffset(iter, strindex, &error); |
- count5 = 5; |
- } |
- } |
- count ++; |
- } |
- |
- elapsedTime = timeGetTime() - startTime; |
- printf("elapsedTime %ld\n", elapsedTime); |
- |
- // empty loop recalculation |
- int tempgCount = 0; |
- count = 0; |
- startTime = timeGetTime(); |
- while (count < opt_loopCount) { |
- int count5 = 5; |
- strindex = 0; |
- ucol_setOffset(iter, strindex, &error); |
- while (TRUE) { |
- tempgCount ++; |
- count5 --; |
- if (count5 == 0) { |
- strindex += 10; |
- if (strindex > strlen) { |
- break; |
- } |
- ucol_setOffset(iter, strindex, &error); |
- count5 = 5; |
- } |
- } |
- count ++; |
- } |
- elapsedTime -= (timeGetTime() - startTime); |
- printf("elapsedTime %ld\n", elapsedTime); |
- |
- ucol_closeElements(iter); |
- |
- printf("gCount %d\n", gCount); |
- ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); |
- printf("Average time per ucol_next() nano seconds %d\n", ns); |
-} |
- |
-//--------------------------------------------------------------------------------------- |
-// |
-// doBackwardIterTest(UBool) Backwards iteration test |
-// argument null-terminated string used |
-// |
-//--------------------------------------------------------------------------------------- |
-void doBackwardIterTest(UBool haslen) { |
- int count = 0; |
- UErrorCode error = U_ZERO_ERROR; |
- printf("\n\nPerforming backward iteration performance test with "); |
- |
- if (haslen) { |
- printf("non-null terminated data -----------\n"); |
- } |
- else { |
- printf("null terminated data -----------\n"); |
- } |
- |
- printf("performance test on strings from file -----------\n"); |
- |
- UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error); |
- UChar dummytext[] = {0, 0}; |
- ucol_setText(iter, dummytext, 1, &error); |
- |
- gCount = 0; |
- unsigned long startTime = timeGetTime(); |
- while (count < opt_loopCount) { |
- int linecount = 0; |
- while (linecount < gNumFileLines) { |
- UChar *str = gFileLines[linecount].name; |
- int strlen = haslen?gFileLines[linecount].len:-1; |
- ucol_setText(iter, str, strlen, &error); |
- while (ucol_previous(iter, &error) != UCOL_NULLORDER) { |
- gCount ++; |
- } |
- |
- linecount ++; |
- } |
- count ++; |
- } |
- unsigned long elapsedTime = timeGetTime() - startTime; |
- |
- printf("elapsedTime %ld\n", elapsedTime); |
- |
- // empty loop recalculation |
- count = 0; |
- startTime = timeGetTime(); |
- while (count < opt_loopCount) { |
- int linecount = 0; |
- while (linecount < gNumFileLines) { |
- UChar *str = gFileLines[linecount].name; |
- int strlen = haslen?gFileLines[linecount].len:-1; |
- ucol_setText(iter, str, strlen, &error); |
- linecount ++; |
- } |
- count ++; |
- } |
- elapsedTime -= (timeGetTime() - startTime); |
- |
- printf("elapsedTime %ld\n", elapsedTime); |
- ucol_closeElements(iter); |
- |
- int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); |
- printf("Total number of strings compared %d in %d loops\n", gNumFileLines, |
- opt_loopCount); |
- printf("Average time per ucol_previous() nano seconds %d\n", ns); |
- |
- printf("performance test on skipped-5 concatenated strings from file -----------\n"); |
- |
- UChar *str; |
- int strlen = 0; |
- // appending all the strings |
- int linecount = 0; |
- while (linecount < gNumFileLines) { |
- strlen += haslen?gFileLines[linecount].len: |
- u_strlen(gFileLines[linecount].name); |
- linecount ++; |
- } |
- str = (UChar *)malloc(sizeof(UChar) * strlen); |
- int strindex = 0; |
- linecount = 0; |
- while (strindex < strlen) { |
- int len = 0; |
- len += haslen?gFileLines[linecount].len: |
- u_strlen(gFileLines[linecount].name); |
- memcpy(str + strindex, gFileLines[linecount].name, |
- sizeof(UChar) * len); |
- strindex += len; |
- linecount ++; |
- } |
- |
- printf("Total size of strings %d\n", strlen); |
- |
- gCount = 0; |
- count = 0; |
- |
- if (!haslen) { |
- strlen = -1; |
- } |
- |
- iter = ucol_openElements(gCol, str, strlen, &error); |
- if (!haslen) { |
- strlen = u_strlen(str); |
- } |
- |
- startTime = timeGetTime(); |
- while (count < opt_loopCount) { |
- int count5 = 5; |
- strindex = 5; |
- ucol_setOffset(iter, strindex, &error); |
- while (TRUE) { |
- if (ucol_previous(iter, &error) == UCOL_NULLORDER) { |
- break; |
- } |
- gCount ++; |
- count5 --; |
- if (count5 == 0) { |
- strindex += 10; |
- if (strindex > strlen) { |
- break; |
- } |
- ucol_setOffset(iter, strindex, &error); |
- count5 = 5; |
- } |
- } |
- count ++; |
- } |
- |
- elapsedTime = timeGetTime() - startTime; |
- printf("elapsedTime %ld\n", elapsedTime); |
- |
- // empty loop recalculation |
- count = 0; |
- int tempgCount = 0; |
- startTime = timeGetTime(); |
- while (count < opt_loopCount) { |
- int count5 = 5; |
- strindex = 5; |
- ucol_setOffset(iter, strindex, &error); |
- while (TRUE) { |
- tempgCount ++; |
- count5 --; |
- if (count5 == 0) { |
- strindex += 10; |
- if (strindex > strlen) { |
- break; |
- } |
- ucol_setOffset(iter, strindex, &error); |
- count5 = 5; |
- } |
- } |
- count ++; |
- } |
- elapsedTime -= (timeGetTime() - startTime); |
- printf("elapsedTime %ld\n", elapsedTime); |
- ucol_closeElements(iter); |
- |
- printf("gCount %d\n", gCount); |
- ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount); |
- printf("Average time per ucol_previous() nano seconds %d\n", ns); |
-} |
- |
-//--------------------------------------------------------------------------------------- |
-// |
-// doIterTest() Iteration test |
-// |
-//--------------------------------------------------------------------------------------- |
-void doIterTest() { |
- doForwardIterTest(opt_uselen); |
- doBackwardIterTest(opt_uselen); |
-} |
- |
- |
-//---------------------------------------------------------------------------------------- |
-// |
-// UnixConvert -- Convert the lines of the file to the encoding for UNIX |
-// Since it appears that Unicode support is going in the general |
-// direction of the use of UTF-8 locales, that is the approach |
-// that is used here. |
-// |
-//---------------------------------------------------------------------------------------- |
-void UnixConvert() { |
- int line; |
- |
- UConverter *cvrtr; // An ICU code page converter. |
- UErrorCode status = U_ZERO_ERROR; |
- |
- |
- cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now. |
- if (U_FAILURE(status)) { |
- fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status)); |
- exit(-1); |
- } |
- |
- for (line=0; line < gNumFileLines; line++) { |
- int sizeNeeded = ucnv_fromUChars(cvrtr, |
- 0, // ptr to target buffer. |
- 0, // length of target buffer. |
- gFileLines[line].name, |
- -1, // source is null terminated |
- &status); |
- if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) { |
- //fprintf(stderr, "Conversion from Unicode, something is wrong.\n"); |
- //exit(-1); |
- } |
- status = U_ZERO_ERROR; |
- gFileLines[line].unixName = new char[sizeNeeded+1]; |
- sizeNeeded = ucnv_fromUChars(cvrtr, |
- gFileLines[line].unixName, // ptr to target buffer. |
- sizeNeeded+1, // length of target buffer. |
- gFileLines[line].name, |
- -1, // source is null terminated |
- &status); |
- if (U_FAILURE(status)) { |
- fprintf(stderr, "ICU Conversion Failed.: %d\n", status); |
- exit(-1); |
- } |
- gFileLines[line].unixName[sizeNeeded] = 0; |
- }; |
- ucnv_close(cvrtr); |
-} |
- |
- |
-//---------------------------------------------------------------------------------------- |
-// |
-// class UCharFile Class to hide all the gorp to read a file in |
-// and produce a stream of UChars. |
-// |
-//---------------------------------------------------------------------------------------- |
-class UCharFile { |
-public: |
- UCharFile(const char *fileName); |
- ~UCharFile(); |
- UChar get(); |
- UBool eof() {return fEof;}; |
- UBool error() {return fError;}; |
- |
-private: |
- UCharFile (const UCharFile & /*other*/) {}; // No copy constructor. |
- UCharFile & operator = (const UCharFile &/*other*/) {return *this;}; // No assignment op |
- |
- FILE *fFile; |
- const char *fName; |
- UBool fEof; |
- UBool fError; |
- UChar fPending2ndSurrogate; |
- |
- enum {UTF16LE, UTF16BE, UTF8} fEncoding; |
-}; |
- |
-UCharFile::UCharFile(const char * fileName) { |
- fEof = FALSE; |
- fError = FALSE; |
- fName = fileName; |
- fFile = fopen(fName, "rb"); |
- fPending2ndSurrogate = 0; |
- if (fFile == NULL) { |
- fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); |
- fError = TRUE; |
- return; |
- } |
- // |
- // Look for the byte order mark at the start of the file. |
- // |
- int BOMC1, BOMC2, BOMC3; |
- BOMC1 = fgetc(fFile); |
- BOMC2 = fgetc(fFile); |
- |
- if (BOMC1 == 0xff && BOMC2 == 0xfe) { |
- fEncoding = UTF16LE; } |
- else if (BOMC1 == 0xfe && BOMC2 == 0xff) { |
- fEncoding = UTF16BE; } |
- else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { |
- fEncoding = UTF8; } |
- else |
- { |
- fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " |
- "must include a BOM.\n", fileName); |
- fError = true; |
- return; |
- } |
-} |
- |
- |
-UCharFile::~UCharFile() { |
- fclose(fFile); |
-} |
- |
- |
- |
-UChar UCharFile::get() { |
- UChar c; |
- switch (fEncoding) { |
- case UTF16LE: |
- { |
- int cL, cH; |
- cL = fgetc(fFile); |
- cH = fgetc(fFile); |
- c = cL | (cH << 8); |
- if (cH == EOF) { |
- c = 0; |
- fEof = TRUE; |
- } |
- break; |
- } |
- case UTF16BE: |
- { |
- int cL, cH; |
- cH = fgetc(fFile); |
- cL = fgetc(fFile); |
- c = cL | (cH << 8); |
- if (cL == EOF) { |
- c = 0; |
- fEof = TRUE; |
- } |
- break; |
- } |
- case UTF8: |
- { |
- if (fPending2ndSurrogate != 0) { |
- c = fPending2ndSurrogate; |
- fPending2ndSurrogate = 0; |
- break; |
- } |
- |
- int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. |
- if (ch == EOF) { |
- c = 0; |
- fEof = TRUE; |
- break; |
- } |
- |
- if (ch <= 0x7f) { |
- // It's ascii. No further utf-8 conversion. |
- c = ch; |
- break; |
- } |
- |
- // Figure out the lenght of the char and read the rest of the bytes |
- // into a temp array. |
- int nBytes; |
- if (ch >= 0xF0) {nBytes=4;} |
- else if (ch >= 0xE0) {nBytes=3;} |
- else if (ch >= 0xC0) {nBytes=2;} |
- else { |
- fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); |
- fError = TRUE; |
- return 0; |
- } |
- |
- unsigned char bytes[10]; |
- bytes[0] = (unsigned char)ch; |
- int i; |
- for (i=1; i<nBytes; i++) { |
- bytes[i] = fgetc(fFile); |
- if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { |
- fprintf(stderr, "utf-8 encoded file contains corrupt data.\n"); |
- fError = TRUE; |
- return 0; |
- } |
- } |
- |
- // Convert the bytes from the temp array to a Unicode char. |
- i = 0; |
- uint32_t cp; |
- U8_NEXT_UNSAFE(bytes, i, cp); |
- c = (UChar)cp; |
- |
- if (cp >= 0x10000) { |
- // The code point needs to be broken up into a utf-16 surrogate pair. |
- // Process first half this time through the main loop, and |
- // remember the other half for the next time through. |
- UChar utf16Buf[3]; |
- i = 0; |
- UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); |
- fPending2ndSurrogate = utf16Buf[1]; |
- c = utf16Buf[0]; |
- } |
- break; |
- }; |
- default: |
- c = 0xFFFD; /* Error, unspecified codepage*/ |
- fprintf(stderr, "UCharFile: Error: unknown fEncoding\n"); |
- exit(1); |
- } |
- return c; |
-} |
- |
-//---------------------------------------------------------------------------------------- |
-// |
-// openRulesCollator - Command line specified a rules file. Read it in |
-// and open a collator with it. |
-// |
-//---------------------------------------------------------------------------------------- |
-UCollator *openRulesCollator() { |
- UCharFile f(opt_rules); |
- if (f.error()) { |
- return 0; |
- } |
- |
- int bufLen = 10000; |
- UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar)); |
- UChar *tmp; |
- int i = 0; |
- |
- for(;;) { |
- buf[i] = f.get(); |
- if (f.eof()) { |
- break; |
- } |
- if (f.error()) { |
- return 0; |
- } |
- i++; |
- if (i >= bufLen) { |
- tmp = buf; |
- bufLen += 10000; |
- buf = (UChar *)realloc(buf, bufLen); |
- if (buf == NULL) { |
- free(tmp); |
- return 0; |
- } |
- } |
- } |
- buf[i] = 0; |
- |
- UErrorCode status = U_ZERO_ERROR; |
- UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF, |
- UCOL_DEFAULT_STRENGTH, NULL, &status); |
- if (U_FAILURE(status)) { |
- fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status); |
- return 0; |
- } |
- free(buf); |
- return coll; |
-} |
- |
- |
- |
- |
- |
-//---------------------------------------------------------------------------------------- |
-// |
-// Main -- process command line, read in and pre-process the test file, |
-// call other functions to do the actual tests. |
-// |
-//---------------------------------------------------------------------------------------- |
-int main(int argc, const char** argv) { |
- if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) { |
- printf(gUsageString); |
- exit (1); |
- } |
- |
- // Make sure that we've only got one API selected. |
- if (opt_unix || opt_win) opt_icu = FALSE; |
- if (opt_unix) opt_win = FALSE; |
- |
- // |
- // Set up an ICU collator |
- // |
- UErrorCode status = U_ZERO_ERROR; |
- |
- if (opt_rules != 0) { |
- gCol = openRulesCollator(); |
- if (gCol == 0) {return -1;} |
- } |
- else { |
- gCol = ucol_open(opt_locale, &status); |
- if (U_FAILURE(status)) { |
- fprintf(stderr, "Collator creation failed.: %d\n", status); |
- return -1; |
- } |
- } |
- if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { |
- fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); |
- } |
- if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { |
- fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); |
- } |
- |
- if (opt_norm) { |
- ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); |
- } |
- if (opt_french && opt_frenchoff) { |
- fprintf(stderr, "collperf: Error, specified both -french and -frenchoff options."); |
- exit(-1); |
- } |
- if (opt_french) { |
- ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status); |
- } |
- if (opt_frenchoff) { |
- ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status); |
- } |
- if (opt_lower) { |
- ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status); |
- } |
- if (opt_upper) { |
- ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status); |
- } |
- if (opt_case) { |
- ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status); |
- } |
- if (opt_shifted) { |
- ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status); |
- } |
- if (opt_level != 0) { |
- switch (opt_level) { |
- case 1: |
- ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status); |
- break; |
- case 2: |
- ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status); |
- break; |
- case 3: |
- ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status); |
- break; |
- case 4: |
- ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status); |
- break; |
- case 5: |
- ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status); |
- break; |
- default: |
- fprintf(stderr, "-level param must be between 1 and 5\n"); |
- exit(-1); |
- } |
- } |
- |
- if (U_FAILURE(status)) { |
- fprintf(stderr, "Collator attribute setting failed.: %d\n", status); |
- return -1; |
- } |
- |
- |
- // |
- // Set up a Windows LCID |
- // |
- if (opt_langid != 0) { |
- gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); |
- } |
- else { |
- gWinLCID = uloc_getLCID(opt_locale); |
- } |
- |
- |
- // |
- // Set the UNIX locale |
- // |
- if (opt_unix) { |
- if (setlocale(LC_ALL, opt_locale) == 0) { |
- fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); |
- exit(-1); |
- } |
- } |
- |
- // Read in the input file. |
- // File assumed to be utf-16. |
- // Lines go onto heap buffers. Global index array to line starts is created. |
- // Lines themselves are null terminated. |
- // |
- |
- UCharFile f(opt_fName); |
- if (f.error()) { |
- exit(-1); |
- } |
- |
- const int MAXLINES = 100000; |
- gFileLines = new Line[MAXLINES]; |
- UChar buf[1024]; |
- int column = 0; |
- |
- // Read the file, split into lines, and save in memory. |
- // Loop runs once per utf-16 value from the input file, |
- // (The number of bytes read from file per loop iteration depends on external encoding.) |
- for (;;) { |
- |
- UChar c = f.get(); |
- if (f.error()){ |
- exit(-1); |
- } |
- |
- |
- // We now have a good UTF-16 value in c. |
- |
- // Watch for CR, LF, EOF; these finish off a line. |
- if (c == 0xd) { |
- continue; |
- } |
- |
- if (f.eof() || c == 0x0a || c==0x2028) { // Unipad inserts 2028 line separators! |
- buf[column++] = 0; |
- if (column > 1) { |
- gFileLines[gNumFileLines].name = new UChar[column]; |
- gFileLines[gNumFileLines].len = column-1; |
- memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar)); |
- gNumFileLines++; |
- column = 0; |
- if (gNumFileLines >= MAXLINES) { |
- fprintf(stderr, "File too big. Max number of lines is %d\n", MAXLINES); |
- exit(-1); |
- } |
- |
- } |
- if (c == 0xa || c == 0x2028) |
- continue; |
- else |
- break; // EOF |
- } |
- buf[column++] = c; |
- if (column >= 1023) |
- { |
- static UBool warnFlag = TRUE; |
- if (warnFlag) { |
- fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n"); |
- warnFlag = FALSE; |
- } |
- column--; |
- } |
- } |
- |
- if (opt_terse == FALSE) { |
- printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines); |
- } |
- |
- |
- // Convert the lines to the UNIX encoding. |
- if (opt_unix) { |
- UnixConvert(); |
- } |
- |
- // |
- // Pre-compute ICU sort keys for the lines of the file. |
- // |
- int line; |
- int32_t t; |
- |
- for (line=0; line<gNumFileLines; line++) { |
- t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf)); |
- gFileLines[line].icuSortKey = new char[t]; |
- |
- if (t > (int32_t)sizeof(buf)) { |
- t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t); |
- } |
- else |
- { |
- memcpy(gFileLines[line].icuSortKey, buf, t); |
- } |
- } |
- |
- |
- |
- // |
- // Pre-compute Windows sort keys for the lines of the file. |
- // |
- for (line=0; line<gNumFileLines; line++) { |
- t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf)); |
- gFileLines[line].winSortKey = new char[t]; |
- if (t > (int32_t)sizeof(buf)) { |
- t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t); |
- } |
- else |
- { |
- memcpy(gFileLines[line].winSortKey, buf, t); |
- } |
- } |
- |
- // |
- // Pre-compute UNIX sort keys for the lines of the file. |
- // |
- if (opt_unix) { |
- for (line=0; line<gNumFileLines; line++) { |
- t=strxfrm((char *)buf, gFileLines[line].unixName, sizeof(buf)); |
- gFileLines[line].unixSortKey = new char[t]; |
- if (t > (int32_t)sizeof(buf)) { |
- t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, sizeof(buf)); |
- } |
- else |
- { |
- memcpy(gFileLines[line].unixSortKey, buf, t); |
- } |
- } |
- } |
- |
- |
- // |
- // Dump file lines, CEs, Sort Keys if requested. |
- // |
- if (opt_dump) { |
- int i; |
- for (line=0; line<gNumFileLines; line++) { |
- for (i=0;;i++) { |
- UChar c = gFileLines[line].name[i]; |
- if (c == 0) |
- break; |
- if (c < 0x20 || c > 0x7e) { |
- printf("\\u%.4x", c); |
- } |
- else { |
- printf("%c", c); |
- } |
- } |
- printf("\n"); |
- |
- printf(" CEs: "); |
- UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status); |
- int32_t ce; |
- i = 0; |
- for (;;) { |
- ce = ucol_next(CEiter, &status); |
- if (ce == UCOL_NULLORDER) { |
- break; |
- } |
- printf(" %.8x", ce); |
- if (++i > 8) { |
- printf("\n "); |
- i = 0; |
- } |
- } |
- printf("\n"); |
- ucol_closeElements(CEiter); |
- |
- |
- printf(" ICU Sort Key: "); |
- for (i=0; ; i++) { |
- unsigned char c = gFileLines[line].icuSortKey[i]; |
- printf("%02x ", c); |
- if (c == 0) { |
- break; |
- } |
- if (i > 0 && i % 20 == 0) { |
- printf("\n "); |
- } |
- } |
- printf("\n"); |
- } |
- } |
- |
- |
- // |
- // Pre-sort the lines. |
- // |
- int i; |
- gSortedLines = new Line *[gNumFileLines]; |
- for (i=0; i<gNumFileLines; i++) { |
- gSortedLines[i] = &gFileLines[i]; |
- } |
- |
- if (opt_win) { |
- qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp); |
- } |
- else if (opt_unix) { |
- qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp); |
- } |
- else /* ICU */ |
- { |
- qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp); |
- } |
- |
- |
- // |
- // Make up a randomized order, will be used for sorting tests. |
- // |
- gRandomLines = new Line *[gNumFileLines]; |
- for (i=0; i<gNumFileLines; i++) { |
- gRandomLines[i] = &gFileLines[i]; |
- } |
- qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp); |
- |
- |
- |
- |
- // |
- // We've got the file read into memory. Go do something with it. |
- // |
- |
- if (opt_qsort) doQSort(); |
- if (opt_binsearch) doBinarySearch(); |
- if (opt_keygen) doKeyGen(); |
- if (opt_keyhist) doKeyHist(); |
- if (opt_itertest) doIterTest(); |
- |
- return 0; |
- |
-} |