OLD | NEW |
(Empty) | |
| 1 /******************************************************************** |
| 2 * COPYRIGHT: |
| 3 * Copyright (C) 2001-2005 IBM, Inc. All Rights Reserved. |
| 4 * |
| 5 ********************************************************************/ |
| 6 /*******************************************************************************
* |
| 7 * |
| 8 * File ubrkperf.cpp |
| 9 * |
| 10 * Modification History: |
| 11 * Name Description |
| 12 * Vladimir Weinstein First Version, based on collperf |
| 13 * |
| 14 ********************************************************************************
* |
| 15 */ |
| 16 |
| 17 // |
| 18 // This program tests break iterator performance |
| 19 // Currently we test only ICU APIs with the future possibility of testing *
nix & win32 APIs |
| 20 // (if any) |
| 21 // A text file is required as input. It must be in utf-8 or utf-16 format, |
| 22 // and include a byte order mark. Either LE or BE format is OK. |
| 23 // |
| 24 |
// Help text printed by main() when the command line is invalid, when -help
// (or -?) is given, or when no -file option is supplied.
//
// Keep this list in step with the opts[] table: every option documented here
// must be accepted by ProcessOptions(), and vice versa.  (-iloop used to be
// listed but is not in the table, so passing it aborted with "Unrecognized
// option"; -passes and -time are in the table but were not documented.)
const char gUsageString[] =
    "usage: ubrkperf options...\n"
    "-help Display this message.\n"
    "-file file_name utf-16/utf-8 format file.\n"
    "-locale name ICU locale to use. Default is en_US\n"
    "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
    " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix Run test using Unix word breaking services. (currently not working) \n"
    "-mac Run test using MacOSX word breaking services.\n"
    "-uselen Use API with string lengths. Default is null-terminated strings\n"
    "-char Use character break iterator\n"
    "-word Use word break iterator\n"
    "-line Use line break iterator\n"
    "-sentence Use sentence break iterator\n"
    "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
    "-passes n Number of times the selected test is run. Default = 1.\n"
    "-time n Accepted, but currently has no effect.\n"
    "-terse Terse numbers-only output. Intended for use by scripts.\n"
    "-dump Display stuff.\n"
    "-capi Use C APIs instead of C++ APIs (currently not working)\n"
    "-next Do the next test\n"
    "-isBound Do the isBound test\n"
    ;
| 49 |
| 50 |
| 51 #include <stdio.h> |
| 52 #include <string.h> |
| 53 #include <stdlib.h> |
| 54 #include <math.h> |
| 55 #include <locale.h> |
| 56 #include <errno.h> |
| 57 #include <sys/stat.h> |
| 58 |
| 59 #include <unicode/utypes.h> |
| 60 #include <unicode/ucol.h> |
| 61 #include <unicode/ucoleitr.h> |
| 62 #include <unicode/uloc.h> |
| 63 #include <unicode/ustring.h> |
| 64 #include <unicode/ures.h> |
| 65 #include <unicode/uchar.h> |
| 66 #include <unicode/ucnv.h> |
| 67 #include <unicode/utf8.h> |
| 68 |
| 69 #include <unicode/brkiter.h> |
| 70 |
| 71 |
| 72 #ifdef U_WINDOWS |
| 73 #include <windows.h> |
| 74 #else |
| 75 // |
| 76 // Stubs for Windows API functions when building on UNIXes. |
| 77 // |
| 78 #include <sys/time.h> |
// Replacement for the Windows timeGetTime() call on non-Windows builds.
// Returns the current gettimeofday() time expressed in milliseconds.
//
// The value is allowed to wrap around; callers only ever subtract two
// readings.  The seconds field is converted to unsigned long *before* the
// multiplication so that the wrap-around is well-defined unsigned arithmetic
// rather than signed overflow of time_t.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, NULL);
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
| 86 #define MAKELCID(a,b) 0 |
| 87 #endif |
| 88 |
| 89 |
| 90 // |
| 91 // Command line option variables |
| 92 // These global variables are set according to the options specified |
| 93 // on the command line by the user. |
| 94 char * opt_fName = 0; |
| 95 char * opt_locale = "en_US"; |
| 96 int opt_langid = 0; // Defaults to value corresponding to opt_loc
ale. |
| 97 char * opt_rules = 0; |
| 98 UBool opt_help = FALSE; |
| 99 int opt_time = 0; |
| 100 int opt_loopCount = 0; |
| 101 int opt_passesCount= 1; |
| 102 UBool opt_terse = FALSE; |
| 103 UBool opt_icu = TRUE; |
| 104 UBool opt_win = FALSE; // Run with Windows native functions. |
| 105 UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. |
| 106 UBool opt_mac = FALSE; // Run with MacOSX word break services. |
| 107 UBool opt_uselen = FALSE; |
| 108 UBool opt_dump = FALSE; |
| 109 UBool opt_char = FALSE; |
| 110 UBool opt_word = FALSE; |
| 111 UBool opt_line = FALSE; |
| 112 UBool opt_sentence = FALSE; |
| 113 UBool opt_capi = FALSE; |
| 114 |
| 115 UBool opt_next = FALSE; |
| 116 UBool opt_isBound = FALSE; |
| 117 |
| 118 |
| 119 |
| 120 // |
| 121 // Definitions for the command line options |
| 122 // |
| 123 struct OptSpec { |
| 124 const char *name; |
| 125 enum {FLAG, NUM, STRING} type; |
| 126 void *pVar; |
| 127 }; |
| 128 |
| 129 OptSpec opts[] = { |
| 130 {"-file", OptSpec::STRING, &opt_fName}, |
| 131 {"-locale", OptSpec::STRING, &opt_locale}, |
| 132 {"-langid", OptSpec::NUM, &opt_langid}, |
| 133 {"-win", OptSpec::FLAG, &opt_win}, |
| 134 {"-unix", OptSpec::FLAG, &opt_unix}, |
| 135 {"-mac", OptSpec::FLAG, &opt_mac}, |
| 136 {"-uselen", OptSpec::FLAG, &opt_uselen}, |
| 137 {"-loop", OptSpec::NUM, &opt_loopCount}, |
| 138 {"-time", OptSpec::NUM, &opt_time}, |
| 139 {"-passes", OptSpec::NUM, &opt_passesCount}, |
| 140 {"-char", OptSpec::FLAG, &opt_char}, |
| 141 {"-word", OptSpec::FLAG, &opt_word}, |
| 142 {"-line", OptSpec::FLAG, &opt_line}, |
| 143 {"-sentence", OptSpec::FLAG, &opt_sentence}, |
| 144 {"-terse", OptSpec::FLAG, &opt_terse}, |
| 145 {"-dump", OptSpec::FLAG, &opt_dump}, |
| 146 {"-capi", OptSpec::FLAG, &opt_capi}, |
| 147 {"-next", OptSpec::FLAG, &opt_next}, |
| 148 {"-isBound", OptSpec::FLAG, &opt_isBound}, |
| 149 {"-help", OptSpec::FLAG, &opt_help}, |
| 150 {"-?", OptSpec::FLAG, &opt_help}, |
| 151 {0, OptSpec::FLAG, 0} |
| 152 }; |
| 153 |
| 154 |
| 155 //--------------------------------------------------------------------------- |
| 156 // |
| 157 // Global variables pointing to and describing the test file |
| 158 // |
| 159 //--------------------------------------------------------------------------- |
| 160 |
| 161 //DWORD gWinLCID; |
| 162 BreakIterator *brkit = NULL; |
| 163 UChar *text = NULL; |
| 164 int32_t textSize = 0; |
| 165 |
| 166 |
| 167 |
#ifdef U_DARWIN
#include <ApplicationServices/ApplicationServices.h>
enum{
    kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
};
UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
TextBreakLocatorRef breakRef;       // Locator created by createMACBrkIt().
UCTextBreakType macBreakType;       // Break type selected from the command line options.

//
//  Set up a MacOSX text break locator for opt_locale in breakRef, and pick
//  the break type (macBreakType) from -char / -word / -line.  With none of
//  those options the cluster (character) break type is used.
//
//  Any failure is reported on stderr and terminates the program: the test
//  loops cannot make progress without a valid locator and break type.
//
void createMACBrkIt() {
    LocaleRef lref;
    OSStatus status = LocaleRefFromLocaleString(opt_locale, &lref);
    if (status != noErr) {
        fprintf(stderr, "LocaleRefFromLocaleString(%s) failed, status %d\n", opt_locale, (int)status);
        exit(-1);
    }
    status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, &breakRef);
    if (status != noErr) {
        fprintf(stderr, "UCCreateTextBreakLocator failed for %s, status %d\n", opt_locale, (int)status);
        exit(-1);
    }

    if(opt_char == TRUE) {
        macBreakType = kUCTextBreakClusterMask;
    } else if(opt_word == TRUE) {
        macBreakType = kUCTextBreakWordMask;
    } else if(opt_line == TRUE) {
        macBreakType = kUCTextBreakLineMask;
    } else if(opt_sentence == TRUE) {
        // No sentence break type is set up here; refuse rather than run
        // the test with an unset break type.
        fprintf(stderr, "-sentence is not supported together with -mac\n");
        exit(-1);
    } else {
        // default is character iterator
        macBreakType = kUCTextBreakClusterMask;
    }
}
#endif
| 197 |
| 198 void createICUBrkIt() { |
| 199 // |
| 200 // Set up an ICU break iterator |
| 201 // |
| 202 UErrorCode status = U_ZERO_ERROR; |
| 203 if(opt_char == TRUE) { |
| 204 brkit = BreakIterator::createCharacterInstance(opt_locale, status); |
| 205 } else if(opt_word == TRUE) { |
| 206 brkit = BreakIterator::createWordInstance(opt_locale, status); |
| 207 } else if(opt_line == TRUE) { |
| 208 brkit = BreakIterator::createLineInstance(opt_locale, status); |
| 209 } else if(opt_sentence == TRUE) { |
| 210 brkit = BreakIterator::createSentenceInstance(opt_locale, status); |
| 211 } else { |
| 212 // default is character iterator |
| 213 brkit = BreakIterator::createCharacterInstance(opt_locale, status); |
| 214 } |
| 215 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { |
| 216 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); |
| 217 } |
| 218 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { |
| 219 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); |
| 220 } |
| 221 |
| 222 } |
| 223 |
| 224 //--------------------------------------------------------------------------- |
| 225 // |
| 226 // ProcessOptions() Function to read the command line options. |
| 227 // |
| 228 //--------------------------------------------------------------------------- |
| 229 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) |
| 230 { |
| 231 int i; |
| 232 int argNum; |
| 233 const char *pArgName; |
| 234 OptSpec *pOpt; |
| 235 |
| 236 for (argNum=1; argNum<argc; argNum++) { |
| 237 pArgName = argv[argNum]; |
| 238 for (pOpt = opts; pOpt->name != 0; pOpt++) { |
| 239 if (strcmp(pOpt->name, pArgName) == 0) { |
| 240 switch (pOpt->type) { |
| 241 case OptSpec::FLAG: |
| 242 *(UBool *)(pOpt->pVar) = TRUE; |
| 243 break; |
| 244 case OptSpec::STRING: |
| 245 argNum ++; |
| 246 if (argNum >= argc) { |
| 247 fprintf(stderr, "value expected for \"%s\" option.\n", p
Opt->name); |
| 248 return FALSE; |
| 249 } |
| 250 *(const char **)(pOpt->pVar) = argv[argNum]; |
| 251 break; |
| 252 case OptSpec::NUM: |
| 253 argNum ++; |
| 254 if (argNum >= argc) { |
| 255 fprintf(stderr, "value expected for \"%s\" option.\n", p
Opt->name); |
| 256 return FALSE; |
| 257 } |
| 258 char *endp; |
| 259 i = strtol(argv[argNum], &endp, 0); |
| 260 if (endp == argv[argNum]) { |
| 261 fprintf(stderr, "integer value expected for \"%s\" optio
n.\n", pOpt->name); |
| 262 return FALSE; |
| 263 } |
| 264 *(int *)(pOpt->pVar) = i; |
| 265 } |
| 266 break; |
| 267 } |
| 268 } |
| 269 if (pOpt->name == 0) |
| 270 { |
| 271 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); |
| 272 return FALSE; |
| 273 } |
| 274 } |
| 275 return TRUE; |
| 276 } |
| 277 |
| 278 |
| 279 void doForwardTest() { |
| 280 if (opt_terse == FALSE) { |
| 281 printf("Doing the forward test\n"); |
| 282 } |
| 283 int32_t noBreaks = 0; |
| 284 int32_t i = 0; |
| 285 unsigned long startTime = timeGetTime(); |
| 286 unsigned long elapsedTime = 0; |
| 287 if(opt_icu) { |
| 288 createICUBrkIt(); |
| 289 brkit->setText(UnicodeString(text, textSize)); |
| 290 brkit->first(); |
| 291 if (opt_terse == FALSE) { |
| 292 printf("Warmup\n"); |
| 293 } |
| 294 int j; |
| 295 while((j = brkit->next()) != BreakIterator::DONE) { |
| 296 noBreaks++; |
| 297 //fprintf(stderr, "%d ", j); |
| 298 } |
| 299 |
| 300 if (opt_terse == FALSE) { |
| 301 printf("Measure\n"); |
| 302 } |
| 303 startTime = timeGetTime(); |
| 304 for(i = 0; i < opt_loopCount; i++) { |
| 305 brkit->first(); |
| 306 while(brkit->next() != BreakIterator::DONE) { |
| 307 } |
| 308 } |
| 309 |
| 310 elapsedTime = timeGetTime()-startTime; |
| 311 } else if(opt_mac) { |
| 312 #ifdef U_DARWIN |
| 313 createMACBrkIt(); |
| 314 UniChar* filePtr = text; |
| 315 OSStatus status = noErr; |
| 316 UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize; |
| 317 startOffset = 0; |
| 318 //printf("\t---Search forward--\n"); |
| 319 |
| 320 while (startOffset < numUniChars) |
| 321 { |
| 322 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdge
Mask, filePtr, numUniChars, |
| 323 startOffset, &breakOffset); |
| 324 //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed:
startOffset %d, status %d\n", (int)startOffset, (int)status)); |
| 325 //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBr
eak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (i
nt)breakOffset)); |
| 326 |
| 327 // Output break |
| 328 //printf("\t%d\n", (int)breakOffset); |
| 329 |
| 330 // Increment counters |
| 331 noBreaks++; |
| 332 startOffset = breakOffset; |
| 333 } |
| 334 startTime = timeGetTime(); |
| 335 for(i = 0; i < opt_loopCount; i++) { |
| 336 startOffset = 0; |
| 337 |
| 338 while (startOffset < numUniChars) |
| 339 { |
| 340 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEd
geMask, filePtr, numUniChars, |
| 341 startOffset, &breakOffset); |
| 342 // Increment counters |
| 343 startOffset = breakOffset; |
| 344 } |
| 345 } |
| 346 elapsedTime = timeGetTime()-startTime; |
| 347 UCDisposeTextBreakLocator(&breakRef); |
| 348 #endif |
| 349 |
| 350 |
| 351 } |
| 352 |
| 353 |
| 354 if (opt_terse == FALSE) { |
| 355 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCoun
t)); |
| 356 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize))
; |
| 357 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreak
s)); |
| 358 printf("forward break iteration average loop time %d\n", loopTime); |
| 359 printf("number of code units %d average time per code unit %d\n", textSize
, timePerCU); |
| 360 printf("number of breaks %d average time per break %d\n", noBreaks, timePe
rBreak); |
| 361 } else { |
| 362 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); |
| 363 } |
| 364 |
| 365 |
| 366 } |
| 367 |
| 368 void doIsBoundTest() { |
| 369 int32_t noBreaks = 0, hit = 0; |
| 370 int32_t i = 0, j = 0; |
| 371 unsigned long startTime = timeGetTime(); |
| 372 unsigned long elapsedTime = 0; |
| 373 createICUBrkIt(); |
| 374 brkit->setText(UnicodeString(text, textSize)); |
| 375 brkit->first(); |
| 376 for(j = 0; j < textSize; j++) { |
| 377 if(brkit->isBoundary(j)) { |
| 378 noBreaks++; |
| 379 //fprintf(stderr, "%d ", j); |
| 380 } |
| 381 } |
| 382 /* |
| 383 while(brkit->next() != BreakIterator::DONE) { |
| 384 noBreaks++; |
| 385 } |
| 386 */ |
| 387 |
| 388 startTime = timeGetTime(); |
| 389 for(i = 0; i < opt_loopCount; i++) { |
| 390 for(j = 0; j < textSize; j++) { |
| 391 if(brkit->isBoundary(j)) { |
| 392 hit++; |
| 393 } |
| 394 } |
| 395 } |
| 396 |
| 397 elapsedTime = timeGetTime()-startTime; |
| 398 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCoun
t)); |
| 399 if (opt_terse == FALSE) { |
| 400 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize))
; |
| 401 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreak
s)); |
| 402 printf("forward break iteration average loop time %d\n", loopTime); |
| 403 printf("number of code units %d average time per code unit %d\n", textSize
, timePerCU); |
| 404 printf("number of breaks %d average time per break %d\n", noBreaks, timePe
rBreak); |
| 405 } else { |
| 406 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); |
| 407 } |
| 408 } |
| 409 |
| 410 //------------------------------------------------------------------------------
---------- |
| 411 // |
| 412 // UnixConvert -- Convert the lines of the file to the encoding for UNIX |
| 413 // Since it appears that Unicode support is going in the gene
ral |
| 414 // direction of the use of UTF-8 locales, that is the approac
h |
| 415 // that is used here. |
| 416 // |
| 417 //------------------------------------------------------------------------------
---------- |
// Currently a no-op: the whole body is compiled out with "#if 0".
// The disabled code is the per-line UTF-8 conversion carried over for a
// future UNIX test; it refers to gNumFileLines / gFileLines, which this file
// does not define, and is marked "redo for unix".
void UnixConvert() {
#if 0
    int line;

    UConverter *cvrtr;    // An ICU code page converter.
    UErrorCode status = U_ZERO_ERROR;


    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    if (U_FAILURE(status)) {
        fprintf(stderr, "ICU Converter open failed.: %d\n", status);
        exit(-1);
    }
    // redo for unix
    for (line=0; line < gNumFileLines; line++) {
        int sizeNeeded = ucnv_fromUChars(cvrtr,
                                         0,            // ptr to target buffer.
                                         0,            // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           // source is null terminated
                                         &status);
        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                     gFileLines[line].unixName, // ptr to target buffer.
                                     sizeNeeded+1, // length of target buffer.
                                     gFileLines[line].name,
                                     -1,           // source is null terminated
                                     &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    }
    ucnv_close(cvrtr);
#endif
}
| 460 |
| 461 |
| 462 //------------------------------------------------------------------------------
---------- |
| 463 // |
| 464 // class UCharFile Class to hide all the gorp to read a file in |
| 465 // and produce a stream of UChars. |
| 466 // |
| 467 //------------------------------------------------------------------------------
---------- |
| 468 class UCharFile { |
| 469 public: |
| 470 UCharFile(const char *fileName); |
| 471 ~UCharFile(); |
| 472 UChar get(); |
| 473 UBool eof() {return fEof;}; |
| 474 UBool error() {return fError;}; |
| 475 int32_t size() { return fFileSize; }; |
| 476 |
| 477 private: |
| 478 UCharFile (const UCharFile &other) {}; // No copy co
nstructor. |
| 479 UCharFile & operator = (const UCharFile &other) {return *this;}; // No ass
ignment op |
| 480 |
| 481 FILE *fFile; |
| 482 const char *fName; |
| 483 UBool fEof; |
| 484 UBool fError; |
| 485 UChar fPending2ndSurrogate; |
| 486 int32_t fFileSize; |
| 487 |
| 488 enum {UTF16LE, UTF16BE, UTF8} fEncoding; |
| 489 }; |
| 490 |
| 491 UCharFile::UCharFile(const char * fileName) { |
| 492 fEof = FALSE; |
| 493 fError = FALSE; |
| 494 fName = fileName; |
| 495 struct stat buf; |
| 496 int32_t result = stat(fileName, &buf); |
| 497 if(result != 0) { |
| 498 fprintf(stderr, "Error getting info\n"); |
| 499 fFileSize = -1; |
| 500 } else { |
| 501 fFileSize = buf.st_size; |
| 502 } |
| 503 fFile = fopen(fName, "rb"); |
| 504 fPending2ndSurrogate = 0; |
| 505 if (fFile == NULL) { |
| 506 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); |
| 507 fError = TRUE; |
| 508 return; |
| 509 } |
| 510 // |
| 511 // Look for the byte order mark at the start of the file. |
| 512 // |
| 513 int BOMC1, BOMC2, BOMC3; |
| 514 BOMC1 = fgetc(fFile); |
| 515 BOMC2 = fgetc(fFile); |
| 516 |
| 517 if (BOMC1 == 0xff && BOMC2 == 0xfe) { |
| 518 fEncoding = UTF16LE; } |
| 519 else if (BOMC1 == 0xfe && BOMC2 == 0xff) { |
| 520 fEncoding = UTF16BE; } |
| 521 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF )
{ |
| 522 fEncoding = UTF8; } |
| 523 else |
| 524 { |
| 525 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16
, and " |
| 526 "must include a BOM.\n", fileName); |
| 527 fError = true; |
| 528 return; |
| 529 } |
| 530 } |
| 531 |
| 532 |
| 533 UCharFile::~UCharFile() { |
| 534 fclose(fFile); |
| 535 } |
| 536 |
| 537 |
| 538 |
| 539 UChar UCharFile::get() { |
| 540 UChar c; |
| 541 switch (fEncoding) { |
| 542 case UTF16LE: |
| 543 { |
| 544 int cL, cH; |
| 545 cL = fgetc(fFile); |
| 546 cH = fgetc(fFile); |
| 547 c = cL | (cH << 8); |
| 548 if (cH == EOF) { |
| 549 c = 0; |
| 550 fEof = TRUE; |
| 551 } |
| 552 break; |
| 553 } |
| 554 case UTF16BE: |
| 555 { |
| 556 int cL, cH; |
| 557 cH = fgetc(fFile); |
| 558 cL = fgetc(fFile); |
| 559 c = cL | (cH << 8); |
| 560 if (cL == EOF) { |
| 561 c = 0; |
| 562 fEof = TRUE; |
| 563 } |
| 564 break; |
| 565 } |
| 566 case UTF8: |
| 567 { |
| 568 if (fPending2ndSurrogate != 0) { |
| 569 c = fPending2ndSurrogate; |
| 570 fPending2ndSurrogate = 0; |
| 571 break; |
| 572 } |
| 573 |
| 574 int ch = fgetc(fFile); // Note: c and ch are separate cause eof t
est doesn't work on UChar type. |
| 575 if (ch == EOF) { |
| 576 c = 0; |
| 577 fEof = TRUE; |
| 578 break; |
| 579 } |
| 580 |
| 581 if (ch <= 0x7f) { |
| 582 // It's ascii. No further utf-8 conversion. |
| 583 c = ch; |
| 584 break; |
| 585 } |
| 586 |
| 587 // Figure out the lenght of the char and read the rest of the bytes |
| 588 // into a temp array. |
| 589 int nBytes; |
| 590 if (ch >= 0xF0) {nBytes=4;} |
| 591 else if (ch >= 0xE0) {nBytes=3;} |
| 592 else if (ch >= 0xC0) {nBytes=2;} |
| 593 else { |
| 594 fprintf(stderr, "not likely utf-8 encoded file %s contains corru
pt data at offset %d.\n", fName, ftell(fFile)); |
| 595 fError = TRUE; |
| 596 return 0; |
| 597 } |
| 598 |
| 599 unsigned char bytes[10]; |
| 600 bytes[0] = (unsigned char)ch; |
| 601 int i; |
| 602 for (i=1; i<nBytes; i++) { |
| 603 bytes[i] = fgetc(fFile); |
| 604 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { |
| 605 fprintf(stderr, "utf-8 encoded file %s contains corrupt data
at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fNa
me, ftell(fFile), nBytes, i, ch); |
| 606 fError = TRUE; |
| 607 return 0; |
| 608 } |
| 609 } |
| 610 |
| 611 // Convert the bytes from the temp array to a Unicode char. |
| 612 i = 0; |
| 613 uint32_t cp; |
| 614 UTF8_NEXT_CHAR_UNSAFE(bytes, i, cp); |
| 615 c = (UChar)cp; |
| 616 |
| 617 if (cp >= 0x10000) { |
| 618 // The code point needs to be broken up into a utf-16 surrogate
pair. |
| 619 // Process first half this time through the main loop, and |
| 620 // remember the other half for the next time through. |
| 621 UChar utf16Buf[3]; |
| 622 i = 0; |
| 623 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); |
| 624 fPending2ndSurrogate = utf16Buf[1]; |
| 625 c = utf16Buf[0]; |
| 626 } |
| 627 break; |
| 628 }; |
| 629 } |
| 630 return c; |
| 631 } |
| 632 |
| 633 |
| 634 //------------------------------------------------------------------------------
---------- |
| 635 // |
| 636 // Main -- process command line, read in and pre-process the test file, |
| 637 // call other functions to do the actual tests. |
| 638 // |
| 639 //------------------------------------------------------------------------------
---------- |
| 640 int main(int argc, const char** argv) { |
| 641 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0)
{ |
| 642 printf(gUsageString); |
| 643 exit (1); |
| 644 } |
| 645 // Make sure that we've only got one API selected. |
| 646 if (opt_mac || opt_unix || opt_win) opt_icu = FALSE; |
| 647 if (opt_mac || opt_unix) opt_win = FALSE; |
| 648 if (opt_mac) opt_unix = FALSE; |
| 649 |
| 650 UErrorCode status = U_ZERO_ERROR; |
| 651 |
| 652 |
| 653 |
| 654 // |
| 655 // Set up a Windows LCID |
| 656 // |
| 657 /* |
| 658 if (opt_langid != 0) { |
| 659 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); |
| 660 } |
| 661 else { |
| 662 gWinLCID = uloc_getLCID(opt_locale); |
| 663 } |
| 664 */ |
| 665 |
| 666 // |
| 667 // Set the UNIX locale |
| 668 // |
| 669 if (opt_unix) { |
| 670 if (setlocale(LC_ALL, opt_locale) == 0) { |
| 671 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); |
| 672 exit(-1); |
| 673 } |
| 674 } |
| 675 |
| 676 // Read in the input file. |
| 677 // File assumed to be utf-16. |
| 678 // Lines go onto heap buffers. Global index array to line starts is creat
ed. |
| 679 // Lines themselves are null terminated. |
| 680 // |
| 681 |
| 682 UCharFile f(opt_fName); |
| 683 if (f.error()) { |
| 684 exit(-1); |
| 685 } |
| 686 int32_t fileSize = f.size(); |
| 687 const int STARTSIZE = 70000; |
| 688 int32_t bufSize = 0; |
| 689 int32_t charCount = 0; |
| 690 if(fileSize != -1) { |
| 691 text = (UChar *)malloc(fileSize*sizeof(UChar)); |
| 692 bufSize = fileSize; |
| 693 } else { |
| 694 text = (UChar *)malloc(STARTSIZE*sizeof(UChar)); |
| 695 bufSize = STARTSIZE; |
| 696 } |
| 697 if(text == NULL) { |
| 698 fprintf(stderr, "Allocating buffer failed\n"); |
| 699 exit(-1); |
| 700 } |
| 701 |
| 702 |
| 703 // Read the file, split into lines, and save in memory. |
| 704 // Loop runs once per utf-16 value from the input file, |
| 705 // (The number of bytes read from file per loop iteration depends on exte
rnal encoding.) |
| 706 for (;;) { |
| 707 |
| 708 UChar c = f.get(); |
| 709 if(f.eof()) { |
| 710 break; |
| 711 } |
| 712 if (f.error()){ |
| 713 exit(-1); |
| 714 } |
| 715 // We now have a good UTF-16 value in c. |
| 716 text[charCount++] = c; |
| 717 if(charCount == bufSize) { |
| 718 text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar)); |
| 719 if(text == NULL) { |
| 720 fprintf(stderr, "Reallocating buffer failed\n"); |
| 721 exit(-1); |
| 722 } |
| 723 bufSize *= 2; |
| 724 } |
| 725 } |
| 726 |
| 727 |
| 728 if (opt_terse == FALSE) { |
| 729 printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount); |
| 730 } |
| 731 |
| 732 textSize = charCount; |
| 733 |
| 734 |
| 735 |
| 736 |
| 737 // |
| 738 // Dump file contents if requested. |
| 739 // |
| 740 if (opt_dump) { |
| 741 // dump file, etc... possibly |
| 742 } |
| 743 |
| 744 |
| 745 // |
| 746 // We've got the file read into memory. Go do something with it. |
| 747 // |
| 748 int32_t i = 0; |
| 749 for(i = 0; i < opt_passesCount; i++) { |
| 750 if(opt_loopCount != 0) { |
| 751 if(opt_next) { |
| 752 doForwardTest(); |
| 753 } else if(opt_isBound) { |
| 754 doIsBoundTest(); |
| 755 } else { |
| 756 doForwardTest(); |
| 757 } |
| 758 } else if(opt_time != 0) { |
| 759 |
| 760 } |
| 761 } |
| 762 |
| 763 if(text != NULL) { |
| 764 free(text); |
| 765 } |
| 766 if(brkit != NULL) { |
| 767 delete brkit; |
| 768 } |
| 769 |
| 770 return 0; |
| 771 } |
OLD | NEW |