OLD | NEW |
| (Empty) |
1 /* | |
2 ****************************************************************************** | |
3 * Copyright (C) 1998-2003, 2006, International Business Machines Corporation * | |
4 * and others. All Rights Reserved. * | |
5 ****************************************************************************** | |
6 */ | |
7 | |
8 #include <errno.h> | |
9 #include <stdio.h> | |
10 #include <string.h> | |
11 | |
12 #include "unicode/utypes.h" | |
13 #include "unicode/uchar.h" | |
14 #include "unicode/uchriter.h" | |
15 #include "unicode/brkiter.h" | |
16 #include "unicode/locid.h" | |
17 #include "unicode/unistr.h" | |
18 #include "unicode/uniset.h" | |
19 #include "unicode/ustring.h" | |
20 | |
21 /* | |
22 * This program takes a Unicode text file containing Thai text with | |
23 * spaces inserted where the word breaks are. It computes a copy of | |
24 * the text without spaces and uses a word instance of a Thai BreakIterator | |
25 * to compute the word breaks. The program reports any differences in the | |
26 * breaks. | |
27 * | |
28 * NOTE: by its very nature, Thai word breaking is not exact, so it is | |
29 * expected that this program will always report some differences. | |
30 */ | |
31 | |
32 /* | |
33 * This class is a break iterator that counts words and spaces. | |
34 */ | |
35 class SpaceBreakIterator | |
36 { | |
37 public: | |
38 // The constructor: | |
39 // text - pointer to an array of UChars to iterate over | |
40 // count - the number of UChars in text | |
41 SpaceBreakIterator(const UChar *text, int32_t count); | |
42 | |
43 // the destructor | |
44 ~SpaceBreakIterator(); | |
45 | |
46 // return next break position | |
47 int32_t next(); | |
48 | |
49 // return current word count | |
50 int32_t getWordCount(); | |
51 | |
52 // return current space count | |
53 int32_t getSpaceCount(); | |
54 | |
55 private: | |
56 // No arg constructor: private so clients can't call it. | |
57 SpaceBreakIterator(); | |
58 | |
59 // The underlying BreakIterator | |
60 BreakIterator *fBreakIter; | |
61 | |
62 // address of the UChar array | |
63 const UChar *fText; | |
64 | |
65 // number of UChars in fText | |
66 int32_t fTextCount; | |
67 | |
68 // current word count | |
69 int32_t fWordCount; | |
70 | |
71 // current space count | |
72 int32_t fSpaceCount; | |
73 | |
74 // UnicodeSet of SA characters | |
75 UnicodeSet fComplexContext; | |
76 | |
77 // true when fBreakIter has returned DONE | |
78 UBool fDone; | |
79 }; | |
80 | |
81 /* | |
82 * This is the main class. It compares word breaks and reports the differences. | |
83 */ | |
84 class ThaiWordbreakTest | |
85 { | |
86 public: | |
87 // The main constructor: | |
88 // spaces - pointer to a UChar array for the text with spaces | |
89 // spaceCount - the number of characters in the spaces array | |
90 // noSpaces - pointer to a UChar array for the text without spaces | |
91 // noSpaceCount - the number of characters in the noSpaces array | |
92 // verbose - report all breaks if true, otherwise just report differenc
es | |
93 ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSp
aces, int32_t noSpaceCount, UBool verbose); | |
94 ~ThaiWordbreakTest(); | |
95 | |
96 // returns the number of breaks that are in the spaces array | |
97 // but aren't found in the noSpaces array | |
98 int32_t getBreaksNotFound(); | |
99 | |
100 // returns the number of breaks which are found in the noSpaces | |
101 // array but aren't in the spaces array | |
102 int32_t getInvalidBreaks(); | |
103 | |
104 // returns the number of words found in the spaces array | |
105 int32_t getWordCount(); | |
106 | |
107 // reads the input Unicode text file: | |
108 // fileName - the path name of the file | |
109 // charCount - set to the number of UChars read from the file | |
110 // returns - the address of the UChar array containing the characters | |
111 static const UChar *readFile(char *fileName, int32_t &charCount); | |
112 | |
113 // removes spaces form the input UChar array: | |
114 // spaces - pointer to the input UChar array | |
115 // count - number of UChars in the spaces array | |
116 // nonSpaceCount - the number of UChars in the result array | |
117 // returns - the address of the UChar array with spaces removed | |
118 static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t
&nonSpaceCount); | |
119 | |
120 private: | |
121 // The no arg constructor - private so clients can't call it | |
122 ThaiWordbreakTest(); | |
123 | |
124 // This does the actual comparison: | |
125 // spaces - the address of the UChar array for the text with spaces | |
126 // spaceCount - the number of UChars in the spaces array | |
127 // noSpaces - the address of the UChar array for the text without spaces | |
128 // noSpaceCount - the number of UChars in the noSpaces array | |
129 // returns - true if all breaks match, FALSE otherwise | |
130 UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount, | |
131 const UChar *noSpaces, int32_t noSpaceCount); | |
132 | |
133 // helper method to report a break in the spaces | |
134 // array that's not found in the noSpaces array | |
135 void breakNotFound(int32_t br); | |
136 | |
137 // helper method to report a break that's found in | |
138 // the noSpaces array that's not in the spaces array | |
139 void foundInvalidBreak(int32_t br); | |
140 | |
141 // count of breaks in the spaces array that | |
142 // aren't found in the noSpaces array | |
143 int32_t fBreaksNotFound; | |
144 | |
145 // count of breaks found in the noSpaces array | |
146 // that aren't in the spaces array | |
147 int32_t fInvalidBreaks; | |
148 | |
149 // number of words found in the spaces array | |
150 int32_t fWordCount; | |
151 | |
152 // report all breaks if true, otherwise just report differences | |
153 UBool fVerbose; | |
154 }; | |
155 | |
156 /* | |
157 * The main constructor: it calls compareWordBreaks and reports any differences | |
158 */ | |
159 ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, | |
160 const UChar *noSpaces, int32_t noSpaceCount
, UBool verbose) | |
161 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose) | |
162 { | |
163 compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount); | |
164 } | |
165 | |
166 /* | |
167 * The no arg constructor | |
168 */ | |
169 ThaiWordbreakTest::ThaiWordbreakTest() | |
170 { | |
171 // nothing | |
172 } | |
173 | |
174 /* | |
175 * The destructor | |
176 */ | |
177 ThaiWordbreakTest::~ThaiWordbreakTest() | |
178 { | |
179 // nothing? | |
180 } | |
181 | |
182 /* | |
183 * returns the number of breaks in the spaces array | |
184 * that aren't found in the noSpaces array | |
185 */ | |
186 inline int32_t ThaiWordbreakTest::getBreaksNotFound() | |
187 { | |
188 return fBreaksNotFound; | |
189 } | |
190 | |
191 /* | |
192 * Returns the number of breaks found in the noSpaces | |
193 * array that aren't in the spaces array | |
194 */ | |
195 inline int32_t ThaiWordbreakTest::getInvalidBreaks() | |
196 { | |
197 return fInvalidBreaks; | |
198 } | |
199 | |
200 /* | |
201 * Returns the number of words found in the spaces array | |
202 */ | |
203 inline int32_t ThaiWordbreakTest::getWordCount() | |
204 { | |
205 return fWordCount; | |
206 } | |
207 | |
208 /* | |
209 * This method does the acutal break comparison and reports the results. | |
210 * It uses a SpaceBreakIterator to iterate over the text with spaces, | |
211 * and a word instance of a Thai BreakIterator to iterate over the text | |
212 * without spaces. | |
213 */ | |
214 UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCou
nt, | |
215 const UChar *noSpaces, int32_t noSpac
eCount) | |
216 { | |
217 UBool result = TRUE; | |
218 Locale thai("th"); | |
219 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, n
oSpaceCount); | |
220 UErrorCode status = U_ZERO_ERROR; | |
221 | |
222 BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status); | |
223 breakIter->adoptText(noSpaceIter); | |
224 | |
225 SpaceBreakIterator spaceIter(spaces, spaceCount); | |
226 | |
227 int32_t nextBreak = 0; | |
228 int32_t nextSpaceBreak = 0; | |
229 int32_t iterCount = 0; | |
230 | |
231 while (TRUE) { | |
232 nextSpaceBreak = spaceIter.next(); | |
233 nextBreak = breakIter->next(); | |
234 | |
235 if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator:
:DONE) { | |
236 if (nextBreak != BreakIterator::DONE) { | |
237 fprintf(stderr, "break iterator didn't end.\n"); | |
238 } else if (nextSpaceBreak != BreakIterator::DONE) { | |
239 fprintf(stderr, "premature break iterator end.\n"); | |
240 } | |
241 | |
242 break; | |
243 } | |
244 | |
245 while (nextSpaceBreak != nextBreak && | |
246 nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterat
or::DONE) { | |
247 if (nextSpaceBreak < nextBreak) { | |
248 breakNotFound(nextSpaceBreak); | |
249 result = FALSE; | |
250 nextSpaceBreak = spaceIter.next(); | |
251 } else if (nextSpaceBreak > nextBreak) { | |
252 foundInvalidBreak(nextBreak); | |
253 result = FALSE; | |
254 nextBreak = breakIter->next(); | |
255 } | |
256 } | |
257 | |
258 if (fVerbose) { | |
259 printf("%d %d\n", nextSpaceBreak, nextBreak); | |
260 } | |
261 } | |
262 | |
263 | |
264 fWordCount = spaceIter.getWordCount(); | |
265 | |
266 delete breakIter; | |
267 | |
268 return result; | |
269 } | |
270 | |
271 /* | |
272 * Report a break that's in the text with spaces but | |
273 * not found in the text without spaces. | |
274 */ | |
275 void ThaiWordbreakTest::breakNotFound(int32_t br) | |
276 { | |
277 if (fVerbose) { | |
278 printf("%d ****\n", br); | |
279 } else { | |
280 fprintf(stderr, "break not found: %d\n", br); | |
281 } | |
282 | |
283 fBreaksNotFound += 1; | |
284 } | |
285 | |
286 /* | |
287 * Report a break that's found in the text without spaces | |
288 * that isn't in the text with spaces. | |
289 */ | |
290 void ThaiWordbreakTest::foundInvalidBreak(int32_t br) | |
291 { | |
292 if (fVerbose) { | |
293 printf("**** %d\n", br); | |
294 } else { | |
295 fprintf(stderr, "found invalid break: %d\n", br); | |
296 } | |
297 | |
298 fInvalidBreaks += 1; | |
299 } | |
300 | |
301 /* | |
302 * Read the text from a file. The text must start with a Unicode Byte | |
303 * Order Mark (BOM) so that we know what order to read the bytes in. | |
304 */ | |
305 const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount) | |
306 { | |
307 FILE *f; | |
308 int32_t fileSize; | |
309 | |
310 UChar *buffer; | |
311 char *bufferChars; | |
312 | |
313 f = fopen(fileName, "rb"); | |
314 | |
315 if( f == NULL ) { | |
316 fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errn
o)); | |
317 return 0; | |
318 } | |
319 | |
320 fseek(f, 0, SEEK_END); | |
321 fileSize = ftell(f); | |
322 | |
323 fseek(f, 0, SEEK_SET); | |
324 bufferChars = new char[fileSize]; | |
325 | |
326 if(bufferChars == 0) { | |
327 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileN
ame, strerror(errno)); | |
328 fclose(f); | |
329 return 0; | |
330 } | |
331 | |
332 fread(bufferChars, sizeof(char), fileSize, f); | |
333 if( ferror(f) ) { | |
334 fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errn
o)); | |
335 fclose(f); | |
336 delete[] bufferChars; | |
337 return 0; | |
338 } | |
339 fclose(f); | |
340 | |
341 UnicodeString myText(bufferChars, fileSize, "UTF-8"); | |
342 | |
343 delete[] bufferChars; | |
344 | |
345 charCount = myText.length(); | |
346 buffer = new UChar[charCount]; | |
347 if(buffer == 0) { | |
348 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileN
ame, strerror(errno)); | |
349 return 0; | |
350 } | |
351 | |
352 myText.extract(1, myText.length(), buffer); | |
353 charCount--; // skip the BOM | |
354 buffer[charCount] = 0; // NULL terminate for easier reading in the debugg
er | |
355 | |
356 return buffer; | |
357 } | |
358 | |
359 /* | |
360 * Remove spaces from the input UChar array. | |
361 * | |
362 * We check explicitly for a Unicode code value of 0x0020 | |
363 * because Unicode::isSpaceChar returns true for CR, LF, etc. | |
364 * | |
365 */ | |
366 const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count,
int32_t &nonSpaceCount) | |
367 { | |
368 int32_t i, out, spaceCount; | |
369 | |
370 spaceCount = 0; | |
371 for (i = 0; i < count; i += 1) { | |
372 if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) { | |
373 spaceCount += 1; | |
374 } | |
375 } | |
376 | |
377 nonSpaceCount = count - spaceCount; | |
378 UChar *noSpaces = new UChar[nonSpaceCount]; | |
379 | |
380 if (noSpaces == 0) { | |
381 fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n
"); | |
382 return 0; | |
383 } | |
384 | |
385 for (out = 0, i = 0; i < count; i += 1) { | |
386 if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) { | |
387 noSpaces[out++] = spaces[i]; | |
388 } | |
389 } | |
390 | |
391 return noSpaces; | |
392 } | |
393 | |
394 /* | |
395 * Generate a text file with spaces in it from a file without. | |
396 */ | |
397 int generateFile(const UChar *chars, int32_t length) { | |
398 Locale root(""); | |
399 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, leng
th); | |
400 UErrorCode status = U_ZERO_ERROR; | |
401 | |
402 UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status)
; | |
403 BreakIterator *breakIter = BreakIterator::createWordInstance(root, status); | |
404 breakIter->adoptText(noSpaceIter); | |
405 char outbuf[1024]; | |
406 int32_t strlength; | |
407 UChar bom = 0xFEFF; | |
408 | |
409 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &statu
s)); | |
410 int32_t prevbreak = 0; | |
411 while (U_SUCCESS(status)) { | |
412 int32_t nextbreak = breakIter->next(); | |
413 if (nextbreak == BreakIterator::DONE) { | |
414 break; | |
415 } | |
416 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prev
break], | |
417 nextbreak-prevbreak, &status)); | |
418 if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1]) | |
419 && complexContext.contains(chars[nextbreak])) { | |
420 printf(" "); | |
421 } | |
422 prevbreak = nextbreak; | |
423 } | |
424 | |
425 if (U_FAILURE(status)) { | |
426 fprintf(stderr, "generate failed: %s\n", u_errorName(status)); | |
427 return status; | |
428 } | |
429 else { | |
430 return 0; | |
431 } | |
432 } | |
433 | |
434 /* | |
435 * The main routine. Read the command line arguments, read the text file, | |
436 * remove the spaces, do the comparison and report the final results | |
437 */ | |
438 int main(int argc, char **argv) | |
439 { | |
440 char *fileName = "space.txt"; | |
441 int arg = 1; | |
442 UBool verbose = FALSE; | |
443 UBool generate = FALSE; | |
444 | |
445 if (argc >= 2 && strcmp(argv[1], "-generate") == 0) { | |
446 generate = TRUE; | |
447 arg += 1; | |
448 } | |
449 | |
450 if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) { | |
451 verbose = TRUE; | |
452 arg += 1; | |
453 } | |
454 | |
455 if (arg == argc - 1) { | |
456 fileName = argv[arg++]; | |
457 } | |
458 | |
459 if (arg != argc) { | |
460 fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]); | |
461 return 1; | |
462 } | |
463 | |
464 int32_t spaceCount, nonSpaceCount; | |
465 const UChar *spaces, *noSpaces; | |
466 | |
467 spaces = ThaiWordbreakTest::readFile(fileName, spaceCount); | |
468 | |
469 if (spaces == 0) { | |
470 return 1; | |
471 } | |
472 | |
473 if (generate) { | |
474 return generateFile(spaces, spaceCount); | |
475 } | |
476 | |
477 noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount
); | |
478 | |
479 if (noSpaces == 0) { | |
480 return 1; | |
481 } | |
482 | |
483 ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose)
; | |
484 | |
485 printf("word count: %d\n", test.getWordCount()); | |
486 printf("breaks not found: %d\n", test.getBreaksNotFound()); | |
487 printf("invalid breaks found: %d\n", test.getInvalidBreaks()); | |
488 | |
489 return 0; | |
490 } | |
491 | |
492 /* | |
493 * The main constructor. Clear all the counts and construct a default | |
494 * word instance of a BreakIterator. | |
495 */ | |
496 SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count) | |
497 : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0)
, fDone(FALSE) | |
498 { | |
499 UCharCharacterIterator *iter = new UCharCharacterIterator(text, count); | |
500 UErrorCode status = U_ZERO_ERROR; | |
501 fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), stat
us); | |
502 Locale root(""); | |
503 | |
504 fBreakIter = BreakIterator::createWordInstance(root, status); | |
505 fBreakIter->adoptText(iter); | |
506 } | |
507 | |
508 SpaceBreakIterator::SpaceBreakIterator() | |
509 { | |
510 // nothing | |
511 } | |
512 | |
513 /* | |
514 * The destructor. delete the underlying BreakIterator | |
515 */ | |
516 SpaceBreakIterator::~SpaceBreakIterator() | |
517 { | |
518 delete fBreakIter; | |
519 } | |
520 | |
521 /* | |
522 * Return the next break, counting words and spaces. | |
523 */ | |
524 int32_t SpaceBreakIterator::next() | |
525 { | |
526 if (fDone) { | |
527 return BreakIterator::DONE; | |
528 } | |
529 | |
530 int32_t nextBreak; | |
531 do { | |
532 nextBreak = fBreakIter->next(); | |
533 | |
534 if (nextBreak == BreakIterator::DONE) { | |
535 fDone = TRUE; | |
536 return BreakIterator::DONE; | |
537 } | |
538 } | |
539 while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1]) | |
540 && fComplexContext.contains(fText[nextBreak])); | |
541 | |
542 int32_t result = nextBreak - fSpaceCount; | |
543 | |
544 if (nextBreak < fTextCount) { | |
545 if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*
/) { | |
546 fSpaceCount += fBreakIter->next() - nextBreak; | |
547 } | |
548 } | |
549 | |
550 fWordCount += 1; | |
551 | |
552 return result; | |
553 } | |
554 | |
555 /* | |
556 * Returns the current space count | |
557 */ | |
558 int32_t SpaceBreakIterator::getSpaceCount() | |
559 { | |
560 return fSpaceCount; | |
561 } | |
562 | |
563 /* | |
564 * Returns the current word count | |
565 */ | |
566 int32_t SpaceBreakIterator::getWordCount() | |
567 { | |
568 return fWordCount; | |
569 } | |
570 | |
571 | |
OLD | NEW |