OLD | NEW |
| (Empty) |
1 /******************************************************************** | |
2 * COPYRIGHT: | |
3 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. | |
4 * | |
5 ********************************************************************/ | |
6 /*******************************************************************************
* | |
7 * | |
8 * File ubrkperf.cpp | |
9 * | |
10 * Modification History: | |
11 * Name Description | |
12 * Vladimir Weinstein First Version, based on collperf | |
13 * | |
14 ********************************************************************************
* | |
15 */ | |
16 | |
17 // | |
18 // This program tests break iterator performance | |
19 // Currently we test only ICU APIs with the future possibility of testing *
nix & win32 APIs | |
20 // (if any) | |
21 // A text file is required as input. It must be in utf-8 or utf-16 format, | |
22 // and include a byte order mark. Either LE or BE format is OK. | |
23 // | |
24 | |
// Usage text printed by main() for -help or for an invalid command line.
// The options listed here are the ones accepted by the opts[] table:
// "-passes" is documented because the table accepts it, and the former
// "-iloop" entry was dropped because the table has no such option.
const char gUsageString[] =
    "usage: ubrkperf options...\n"
    "-help Display this message.\n"
    "-file file_name utf-16/utf-8 format file.\n"
    "-locale name ICU locale to use. Default is en_US\n"
    "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
    " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix Run test using Unix word breaking services. (currently not working) \n"
    "-mac Run test using MacOSX word breaking services.\n"
    "-uselen Use API with string lengths. Default is null-terminated strings\n"
    "-char Use character break iterator\n"
    "-word Use word break iterator\n"
    "-line Use line break iterator\n"
    "-sentence Use sentence break iterator\n"
    "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
    "-passes n Number of times the selected test is repeated. Default = 1.\n"
    "-terse Terse numbers-only output. Intended for use by scripts.\n"
    "-dump Display stuff.\n"
    "-capi Use C APIs instead of C++ APIs (currently not working)\n"
    "-next Do the next test\n"
    "-isBound Do the isBound test\n"
    ;
49 | |
50 | |
51 #include <stdio.h> | |
52 #include <string.h> | |
53 #include <stdlib.h> | |
54 #include <math.h> | |
55 #include <locale.h> | |
56 #include <errno.h> | |
57 #include <sys/stat.h> | |
58 | |
59 #include <unicode/utypes.h> | |
60 #include <unicode/ucol.h> | |
61 #include <unicode/ucoleitr.h> | |
62 #include <unicode/uloc.h> | |
63 #include <unicode/ustring.h> | |
64 #include <unicode/ures.h> | |
65 #include <unicode/uchar.h> | |
66 #include <unicode/ucnv.h> | |
67 #include <unicode/utf8.h> | |
68 | |
69 #include <unicode/brkiter.h> | |
70 | |
71 | |
72 #if U_PLATFORM_HAS_WIN32_API | |
73 #include <windows.h> | |
74 #else | |
75 // | |
76 // Stubs for Windows API functions when building on UNIXes. | |
77 // | |
78 #include <sys/time.h> | |
// Millisecond tick counter standing in for the Win32 timeGetTime() on
// non-Windows builds. The value wraps around, so only the difference
// between two readings is meaningful.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, NULL);
    // Convert to unsigned before multiplying: the wrap-around is intended,
    // and unsigned arithmetic makes it well defined (signed overflow is not).
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
86 #define MAKELCID(a,b) 0 | |
87 #endif | |
88 | |
89 | |
90 // | |
91 // Command line option variables | |
92 // These global variables are set according to the options specified | |
93 // on the command line by the user. | |
94 char * opt_fName = 0; | |
95 char * opt_locale = "en_US"; | |
96 int opt_langid = 0; // Defaults to value corresponding to opt_loc
ale. | |
97 char * opt_rules = 0; | |
98 UBool opt_help = FALSE; | |
99 int opt_time = 0; | |
100 int opt_loopCount = 0; | |
101 int opt_passesCount= 1; | |
102 UBool opt_terse = FALSE; | |
103 UBool opt_icu = TRUE; | |
104 UBool opt_win = FALSE; // Run with Windows native functions. | |
105 UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions. | |
106 UBool opt_mac = FALSE; // Run with MacOSX word break services. | |
107 UBool opt_uselen = FALSE; | |
108 UBool opt_dump = FALSE; | |
109 UBool opt_char = FALSE; | |
110 UBool opt_word = FALSE; | |
111 UBool opt_line = FALSE; | |
112 UBool opt_sentence = FALSE; | |
113 UBool opt_capi = FALSE; | |
114 | |
115 UBool opt_next = FALSE; | |
116 UBool opt_isBound = FALSE; | |
117 | |
118 | |
119 | |
120 // | |
121 // Definitions for the command line options | |
122 // | |
// One row of the command line option table: the option's spelling, the
// way its value is parsed, and the variable that receives the value.
struct OptSpec {
    enum Kind {
        FLAG,     // takes no value; the target UBool is set to TRUE
        NUM,      // takes an integer value; the target is an int
        STRING    // takes a string value; the target is a char pointer
    };

    const char *name;   // option text as typed on the command line; 0 ends the table
    Kind        type;   // how ProcessOptions() handles the option
    void       *pVar;   // address of the variable to store into
};
128 | |
129 OptSpec opts[] = { | |
130 {"-file", OptSpec::STRING, &opt_fName}, | |
131 {"-locale", OptSpec::STRING, &opt_locale}, | |
132 {"-langid", OptSpec::NUM, &opt_langid}, | |
133 {"-win", OptSpec::FLAG, &opt_win}, | |
134 {"-unix", OptSpec::FLAG, &opt_unix}, | |
135 {"-mac", OptSpec::FLAG, &opt_mac}, | |
136 {"-uselen", OptSpec::FLAG, &opt_uselen}, | |
137 {"-loop", OptSpec::NUM, &opt_loopCount}, | |
138 {"-time", OptSpec::NUM, &opt_time}, | |
139 {"-passes", OptSpec::NUM, &opt_passesCount}, | |
140 {"-char", OptSpec::FLAG, &opt_char}, | |
141 {"-word", OptSpec::FLAG, &opt_word}, | |
142 {"-line", OptSpec::FLAG, &opt_line}, | |
143 {"-sentence", OptSpec::FLAG, &opt_sentence}, | |
144 {"-terse", OptSpec::FLAG, &opt_terse}, | |
145 {"-dump", OptSpec::FLAG, &opt_dump}, | |
146 {"-capi", OptSpec::FLAG, &opt_capi}, | |
147 {"-next", OptSpec::FLAG, &opt_next}, | |
148 {"-isBound", OptSpec::FLAG, &opt_isBound}, | |
149 {"-help", OptSpec::FLAG, &opt_help}, | |
150 {"-?", OptSpec::FLAG, &opt_help}, | |
151 {0, OptSpec::FLAG, 0} | |
152 }; | |
153 | |
154 | |
155 //--------------------------------------------------------------------------- | |
156 // | |
157 // Global variables pointing to and describing the test file | |
158 // | |
159 //--------------------------------------------------------------------------- | |
160 | |
161 //DWORD gWinLCID; | |
162 BreakIterator *brkit = NULL; | |
163 UChar *text = NULL; | |
164 int32_t textSize = 0; | |
165 | |
166 | |
167 | |
168 #if U_PLATFORM_IS_DARWIN_BASED | |
169 #include <ApplicationServices/ApplicationServices.h> | |
170 enum{ | |
171 kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTex
tBreakLineMask) | |
172 }; | |
173 UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask,
kUCTextBreakWordMask, kUCTextBreakLineMask}; | |
174 TextBreakLocatorRef breakRef; | |
175 UCTextBreakType macBreakType; | |
176 | |
177 void createMACBrkIt() { | |
178 OSStatus status = noErr; | |
179 LocaleRef lref; | |
180 status = LocaleRefFromLocaleString(opt_locale, &lref); | |
181 status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLoca
torRef*)&breakRef); | |
182 if(opt_char == TRUE) { | |
183 macBreakType = kUCTextBreakClusterMask; | |
184 } else if(opt_word == TRUE) { | |
185 macBreakType = kUCTextBreakWordMask; | |
186 } else if(opt_line == TRUE) { | |
187 macBreakType = kUCTextBreakLineMask; | |
188 } else if(opt_sentence == TRUE) { | |
189 // error | |
190 // brkit = BreakIterator::createSentenceInstance(opt_locale, status); | |
191 } else { | |
192 // default is character iterator | |
193 macBreakType = kUCTextBreakClusterMask; | |
194 } | |
195 } | |
196 #endif | |
197 | |
198 void createICUBrkIt() { | |
199 // | |
200 // Set up an ICU break iterator | |
201 // | |
202 UErrorCode status = U_ZERO_ERROR; | |
203 if(opt_char == TRUE) { | |
204 brkit = BreakIterator::createCharacterInstance(opt_locale, status); | |
205 } else if(opt_word == TRUE) { | |
206 brkit = BreakIterator::createWordInstance(opt_locale, status); | |
207 } else if(opt_line == TRUE) { | |
208 brkit = BreakIterator::createLineInstance(opt_locale, status); | |
209 } else if(opt_sentence == TRUE) { | |
210 brkit = BreakIterator::createSentenceInstance(opt_locale, status); | |
211 } else { | |
212 // default is character iterator | |
213 brkit = BreakIterator::createCharacterInstance(opt_locale, status); | |
214 } | |
215 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) { | |
216 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); | |
217 } | |
218 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) { | |
219 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); | |
220 } | |
221 | |
222 } | |
223 | |
224 //--------------------------------------------------------------------------- | |
225 // | |
226 // ProcessOptions() Function to read the command line options. | |
227 // | |
228 //--------------------------------------------------------------------------- | |
229 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) | |
230 { | |
231 int i; | |
232 int argNum; | |
233 const char *pArgName; | |
234 OptSpec *pOpt; | |
235 | |
236 for (argNum=1; argNum<argc; argNum++) { | |
237 pArgName = argv[argNum]; | |
238 for (pOpt = opts; pOpt->name != 0; pOpt++) { | |
239 if (strcmp(pOpt->name, pArgName) == 0) { | |
240 switch (pOpt->type) { | |
241 case OptSpec::FLAG: | |
242 *(UBool *)(pOpt->pVar) = TRUE; | |
243 break; | |
244 case OptSpec::STRING: | |
245 argNum ++; | |
246 if (argNum >= argc) { | |
247 fprintf(stderr, "value expected for \"%s\" option.\n", p
Opt->name); | |
248 return FALSE; | |
249 } | |
250 *(const char **)(pOpt->pVar) = argv[argNum]; | |
251 break; | |
252 case OptSpec::NUM: | |
253 argNum ++; | |
254 if (argNum >= argc) { | |
255 fprintf(stderr, "value expected for \"%s\" option.\n", p
Opt->name); | |
256 return FALSE; | |
257 } | |
258 char *endp; | |
259 i = strtol(argv[argNum], &endp, 0); | |
260 if (endp == argv[argNum]) { | |
261 fprintf(stderr, "integer value expected for \"%s\" optio
n.\n", pOpt->name); | |
262 return FALSE; | |
263 } | |
264 *(int *)(pOpt->pVar) = i; | |
265 } | |
266 break; | |
267 } | |
268 } | |
269 if (pOpt->name == 0) | |
270 { | |
271 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); | |
272 return FALSE; | |
273 } | |
274 } | |
275 return TRUE; | |
276 } | |
277 | |
278 | |
279 void doForwardTest() { | |
280 if (opt_terse == FALSE) { | |
281 printf("Doing the forward test\n"); | |
282 } | |
283 int32_t noBreaks = 0; | |
284 int32_t i = 0; | |
285 unsigned long startTime = timeGetTime(); | |
286 unsigned long elapsedTime = 0; | |
287 if(opt_icu) { | |
288 createICUBrkIt(); | |
289 brkit->setText(UnicodeString(text, textSize)); | |
290 brkit->first(); | |
291 if (opt_terse == FALSE) { | |
292 printf("Warmup\n"); | |
293 } | |
294 int j; | |
295 while((j = brkit->next()) != BreakIterator::DONE) { | |
296 noBreaks++; | |
297 //fprintf(stderr, "%d ", j); | |
298 } | |
299 | |
300 if (opt_terse == FALSE) { | |
301 printf("Measure\n"); | |
302 } | |
303 startTime = timeGetTime(); | |
304 for(i = 0; i < opt_loopCount; i++) { | |
305 brkit->first(); | |
306 while(brkit->next() != BreakIterator::DONE) { | |
307 } | |
308 } | |
309 | |
310 elapsedTime = timeGetTime()-startTime; | |
311 } else if(opt_mac) { | |
312 #if U_PLATFORM_IS_DARWIN_BASED | |
313 createMACBrkIt(); | |
314 UniChar* filePtr = text; | |
315 OSStatus status = noErr; | |
316 UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize; | |
317 startOffset = 0; | |
318 //printf("\t---Search forward--\n"); | |
319 | |
320 while (startOffset < numUniChars) | |
321 { | |
322 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdge
Mask, filePtr, numUniChars, | |
323 startOffset, &breakOffset); | |
324 //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed:
startOffset %d, status %d\n", (int)startOffset, (int)status)); | |
325 //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBr
eak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (i
nt)breakOffset)); | |
326 | |
327 // Output break | |
328 //printf("\t%d\n", (int)breakOffset); | |
329 | |
330 // Increment counters | |
331 noBreaks++; | |
332 startOffset = breakOffset; | |
333 } | |
334 startTime = timeGetTime(); | |
335 for(i = 0; i < opt_loopCount; i++) { | |
336 startOffset = 0; | |
337 | |
338 while (startOffset < numUniChars) | |
339 { | |
340 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEd
geMask, filePtr, numUniChars, | |
341 startOffset, &breakOffset); | |
342 // Increment counters | |
343 startOffset = breakOffset; | |
344 } | |
345 } | |
346 elapsedTime = timeGetTime()-startTime; | |
347 UCDisposeTextBreakLocator(&breakRef); | |
348 #endif | |
349 | |
350 | |
351 } | |
352 | |
353 | |
354 if (opt_terse == FALSE) { | |
355 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCoun
t)); | |
356 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize))
; | |
357 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreak
s)); | |
358 printf("forward break iteration average loop time %d\n", loopTime); | |
359 printf("number of code units %d average time per code unit %d\n", textSize
, timePerCU); | |
360 printf("number of breaks %d average time per break %d\n", noBreaks, timePe
rBreak); | |
361 } else { | |
362 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); | |
363 } | |
364 | |
365 | |
366 } | |
367 | |
368 void doIsBoundTest() { | |
369 int32_t noBreaks = 0, hit = 0; | |
370 int32_t i = 0, j = 0; | |
371 unsigned long startTime = timeGetTime(); | |
372 unsigned long elapsedTime = 0; | |
373 createICUBrkIt(); | |
374 brkit->setText(UnicodeString(text, textSize)); | |
375 brkit->first(); | |
376 for(j = 0; j < textSize; j++) { | |
377 if(brkit->isBoundary(j)) { | |
378 noBreaks++; | |
379 //fprintf(stderr, "%d ", j); | |
380 } | |
381 } | |
382 /* | |
383 while(brkit->next() != BreakIterator::DONE) { | |
384 noBreaks++; | |
385 } | |
386 */ | |
387 | |
388 startTime = timeGetTime(); | |
389 for(i = 0; i < opt_loopCount; i++) { | |
390 for(j = 0; j < textSize; j++) { | |
391 if(brkit->isBoundary(j)) { | |
392 hit++; | |
393 } | |
394 } | |
395 } | |
396 | |
397 elapsedTime = timeGetTime()-startTime; | |
398 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCoun
t)); | |
399 if (opt_terse == FALSE) { | |
400 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize))
; | |
401 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreak
s)); | |
402 printf("forward break iteration average loop time %d\n", loopTime); | |
403 printf("number of code units %d average time per code unit %d\n", textSize
, timePerCU); | |
404 printf("number of breaks %d average time per break %d\n", noBreaks, timePe
rBreak); | |
405 } else { | |
406 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); | |
407 } | |
408 } | |
409 | |
410 //------------------------------------------------------------------------------
---------- | |
411 // | |
412 // UnixConvert -- Convert the lines of the file to the encoding for UNIX | |
413 // Since it appears that Unicode support is going in the gene
ral | |
414 // direction of the use of UTF-8 locales, that is the approac
h | |
415 // that is used here. | |
416 // | |
417 //------------------------------------------------------------------------------
---------- | |
// Placeholder for the conversion step of the -unix code path; does nothing.
//
// The previous body was compiled out with "#if 0". It converted per-line
// data through gFileLines / gNumFileLines, which are not defined in the
// visible part of this file (presumably carried over from collperf, which
// the file header names as the basis for this program). It also passed
// &status to a %d conversion. It has been removed rather than repaired.
void UnixConvert() {
}
460 | |
461 | |
462 //------------------------------------------------------------------------------
---------- | |
463 // | |
464 // class UCharFile Class to hide all the gorp to read a file in | |
465 // and produce a stream of UChars. | |
466 // | |
467 //------------------------------------------------------------------------------
---------- | |
468 class UCharFile { | |
469 public: | |
470 UCharFile(const char *fileName); | |
471 ~UCharFile(); | |
472 UChar get(); | |
473 UBool eof() {return fEof;}; | |
474 UBool error() {return fError;}; | |
475 int32_t size() { return fFileSize; }; | |
476 | |
477 private: | |
478 UCharFile (const UCharFile &other) {}; // No copy co
nstructor. | |
479 UCharFile & operator = (const UCharFile &other) {return *this;}; // No ass
ignment op | |
480 | |
481 FILE *fFile; | |
482 const char *fName; | |
483 UBool fEof; | |
484 UBool fError; | |
485 UChar fPending2ndSurrogate; | |
486 int32_t fFileSize; | |
487 | |
488 enum {UTF16LE, UTF16BE, UTF8} fEncoding; | |
489 }; | |
490 | |
491 UCharFile::UCharFile(const char * fileName) { | |
492 fEof = FALSE; | |
493 fError = FALSE; | |
494 fName = fileName; | |
495 struct stat buf; | |
496 int32_t result = stat(fileName, &buf); | |
497 if(result != 0) { | |
498 fprintf(stderr, "Error getting info\n"); | |
499 fFileSize = -1; | |
500 } else { | |
501 fFileSize = buf.st_size; | |
502 } | |
503 fFile = fopen(fName, "rb"); | |
504 fPending2ndSurrogate = 0; | |
505 if (fFile == NULL) { | |
506 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); | |
507 fError = TRUE; | |
508 return; | |
509 } | |
510 // | |
511 // Look for the byte order mark at the start of the file. | |
512 // | |
513 int BOMC1, BOMC2, BOMC3; | |
514 BOMC1 = fgetc(fFile); | |
515 BOMC2 = fgetc(fFile); | |
516 | |
517 if (BOMC1 == 0xff && BOMC2 == 0xfe) { | |
518 fEncoding = UTF16LE; } | |
519 else if (BOMC1 == 0xfe && BOMC2 == 0xff) { | |
520 fEncoding = UTF16BE; } | |
521 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF )
{ | |
522 fEncoding = UTF8; } | |
523 else | |
524 { | |
525 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16
, and " | |
526 "must include a BOM.\n", fileName); | |
527 fError = true; | |
528 return; | |
529 } | |
530 } | |
531 | |
532 | |
533 UCharFile::~UCharFile() { | |
534 fclose(fFile); | |
535 } | |
536 | |
537 | |
538 | |
539 UChar UCharFile::get() { | |
540 UChar c; | |
541 switch (fEncoding) { | |
542 case UTF16LE: | |
543 { | |
544 int cL, cH; | |
545 cL = fgetc(fFile); | |
546 cH = fgetc(fFile); | |
547 c = cL | (cH << 8); | |
548 if (cH == EOF) { | |
549 c = 0; | |
550 fEof = TRUE; | |
551 } | |
552 break; | |
553 } | |
554 case UTF16BE: | |
555 { | |
556 int cL, cH; | |
557 cH = fgetc(fFile); | |
558 cL = fgetc(fFile); | |
559 c = cL | (cH << 8); | |
560 if (cL == EOF) { | |
561 c = 0; | |
562 fEof = TRUE; | |
563 } | |
564 break; | |
565 } | |
566 case UTF8: | |
567 { | |
568 if (fPending2ndSurrogate != 0) { | |
569 c = fPending2ndSurrogate; | |
570 fPending2ndSurrogate = 0; | |
571 break; | |
572 } | |
573 | |
574 int ch = fgetc(fFile); // Note: c and ch are separate cause eof t
est doesn't work on UChar type. | |
575 if (ch == EOF) { | |
576 c = 0; | |
577 fEof = TRUE; | |
578 break; | |
579 } | |
580 | |
581 if (ch <= 0x7f) { | |
582 // It's ascii. No further utf-8 conversion. | |
583 c = ch; | |
584 break; | |
585 } | |
586 | |
587 // Figure out the lenght of the char and read the rest of the bytes | |
588 // into a temp array. | |
589 int nBytes; | |
590 if (ch >= 0xF0) {nBytes=4;} | |
591 else if (ch >= 0xE0) {nBytes=3;} | |
592 else if (ch >= 0xC0) {nBytes=2;} | |
593 else { | |
594 fprintf(stderr, "not likely utf-8 encoded file %s contains corru
pt data at offset %d.\n", fName, ftell(fFile)); | |
595 fError = TRUE; | |
596 return 0; | |
597 } | |
598 | |
599 unsigned char bytes[10]; | |
600 bytes[0] = (unsigned char)ch; | |
601 int i; | |
602 for (i=1; i<nBytes; i++) { | |
603 bytes[i] = fgetc(fFile); | |
604 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { | |
605 fprintf(stderr, "utf-8 encoded file %s contains corrupt data
at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fNa
me, ftell(fFile), nBytes, i, ch); | |
606 fError = TRUE; | |
607 return 0; | |
608 } | |
609 } | |
610 | |
611 // Convert the bytes from the temp array to a Unicode char. | |
612 i = 0; | |
613 uint32_t cp; | |
614 U8_NEXT_UNSAFE(bytes, i, cp); | |
615 c = (UChar)cp; | |
616 | |
617 if (cp >= 0x10000) { | |
618 // The code point needs to be broken up into a utf-16 surrogate
pair. | |
619 // Process first half this time through the main loop, and | |
620 // remember the other half for the next time through. | |
621 UChar utf16Buf[3]; | |
622 i = 0; | |
623 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); | |
624 fPending2ndSurrogate = utf16Buf[1]; | |
625 c = utf16Buf[0]; | |
626 } | |
627 break; | |
628 }; | |
629 } | |
630 return c; | |
631 } | |
632 | |
633 | |
634 //------------------------------------------------------------------------------
---------- | |
635 // | |
636 // Main -- process command line, read in and pre-process the test file, | |
637 // call other functions to do the actual tests. | |
638 // | |
639 //------------------------------------------------------------------------------
---------- | |
640 int main(int argc, const char** argv) { | |
641 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0)
{ | |
642 printf(gUsageString); | |
643 exit (1); | |
644 } | |
645 // Make sure that we've only got one API selected. | |
646 if (opt_mac || opt_unix || opt_win) opt_icu = FALSE; | |
647 if (opt_mac || opt_unix) opt_win = FALSE; | |
648 if (opt_mac) opt_unix = FALSE; | |
649 | |
650 UErrorCode status = U_ZERO_ERROR; | |
651 | |
652 | |
653 | |
654 // | |
655 // Set up a Windows LCID | |
656 // | |
657 /* | |
658 if (opt_langid != 0) { | |
659 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); | |
660 } | |
661 else { | |
662 gWinLCID = uloc_getLCID(opt_locale); | |
663 } | |
664 */ | |
665 | |
666 // | |
667 // Set the UNIX locale | |
668 // | |
669 if (opt_unix) { | |
670 if (setlocale(LC_ALL, opt_locale) == 0) { | |
671 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); | |
672 exit(-1); | |
673 } | |
674 } | |
675 | |
676 // Read in the input file. | |
677 // File assumed to be utf-16. | |
678 // Lines go onto heap buffers. Global index array to line starts is creat
ed. | |
679 // Lines themselves are null terminated. | |
680 // | |
681 | |
682 UCharFile f(opt_fName); | |
683 if (f.error()) { | |
684 exit(-1); | |
685 } | |
686 int32_t fileSize = f.size(); | |
687 const int STARTSIZE = 70000; | |
688 int32_t bufSize = 0; | |
689 int32_t charCount = 0; | |
690 if(fileSize != -1) { | |
691 text = (UChar *)malloc(fileSize*sizeof(UChar)); | |
692 bufSize = fileSize; | |
693 } else { | |
694 text = (UChar *)malloc(STARTSIZE*sizeof(UChar)); | |
695 bufSize = STARTSIZE; | |
696 } | |
697 if(text == NULL) { | |
698 fprintf(stderr, "Allocating buffer failed\n"); | |
699 exit(-1); | |
700 } | |
701 | |
702 | |
703 // Read the file, split into lines, and save in memory. | |
704 // Loop runs once per utf-16 value from the input file, | |
705 // (The number of bytes read from file per loop iteration depends on exte
rnal encoding.) | |
706 for (;;) { | |
707 | |
708 UChar c = f.get(); | |
709 if(f.eof()) { | |
710 break; | |
711 } | |
712 if (f.error()){ | |
713 exit(-1); | |
714 } | |
715 // We now have a good UTF-16 value in c. | |
716 text[charCount++] = c; | |
717 if(charCount == bufSize) { | |
718 text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar)); | |
719 if(text == NULL) { | |
720 fprintf(stderr, "Reallocating buffer failed\n"); | |
721 exit(-1); | |
722 } | |
723 bufSize *= 2; | |
724 } | |
725 } | |
726 | |
727 | |
728 if (opt_terse == FALSE) { | |
729 printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount); | |
730 } | |
731 | |
732 textSize = charCount; | |
733 | |
734 | |
735 | |
736 | |
737 // | |
738 // Dump file contents if requested. | |
739 // | |
740 if (opt_dump) { | |
741 // dump file, etc... possibly | |
742 } | |
743 | |
744 | |
745 // | |
746 // We've got the file read into memory. Go do something with it. | |
747 // | |
748 int32_t i = 0; | |
749 for(i = 0; i < opt_passesCount; i++) { | |
750 if(opt_loopCount != 0) { | |
751 if(opt_next) { | |
752 doForwardTest(); | |
753 } else if(opt_isBound) { | |
754 doIsBoundTest(); | |
755 } else { | |
756 doForwardTest(); | |
757 } | |
758 } else if(opt_time != 0) { | |
759 | |
760 } | |
761 } | |
762 | |
763 if(text != NULL) { | |
764 free(text); | |
765 } | |
766 if(brkit != NULL) { | |
767 delete brkit; | |
768 } | |
769 | |
770 return 0; | |
771 } | |
OLD | NEW |