source/test/thaitest/thaitest.cpp - Issue 2435373002: Delete source/test

Side by Side Diff: source/test/thaitest/thaitest.cpp

Issue 2435373002: Delete source/test (Closed)

Patch Set: Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /*

2 ******************************************************************************

3 * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *

4 * and others. All Rights Reserved. *

5 ******************************************************************************

6 */

7

8 #include <errno.h>

9 #include <stdio.h>

10 #include <string.h>

11

12 #include "unicode/utypes.h"

13 #include "unicode/uchar.h"

14 #include "unicode/uchriter.h"

15 #include "unicode/brkiter.h"

16 #include "unicode/locid.h"

17 #include "unicode/unistr.h"

18 #include "unicode/uniset.h"

19 #include "unicode/ustring.h"

20

21 /*

22 * This program takes a Unicode text file containing Thai text with

23 * spaces inserted where the word breaks are. It computes a copy of

24 * the text without spaces and uses a word instance of a Thai BreakIterator

25 * to compute the word breaks. The program reports any differences in the

26 * breaks.

27 *

28 * NOTE: by its very nature, Thai word breaking is not exact, so it is

29 * expected that this program will always report some differences.

30 */

31

32 /*

33 * This class is a break iterator that counts words and spaces.

34 */

35 class SpaceBreakIterator

36 {

37 public:

38 // The constructor:

39 // text - pointer to an array of UChars to iterate over

40 // count - the number of UChars in text

41 SpaceBreakIterator(const UChar *text, int32_t count);

42

43 // the destructor

44 ~SpaceBreakIterator();

45

46 // return next break position

47 int32_t next();

48

49 // return current word count

50 int32_t getWordCount();

51

52 // return current space count

53 int32_t getSpaceCount();

54

55 private:

56 // No arg constructor: private so clients can't call it.

57 SpaceBreakIterator();

58

59 // The underlying BreakIterator

60 BreakIterator *fBreakIter;

61

62 // address of the UChar array

63 const UChar *fText;

64

65 // number of UChars in fText

66 int32_t fTextCount;

67

68 // current word count

69 int32_t fWordCount;

70

71 // current space count

72 int32_t fSpaceCount;

73

74 // UnicodeSet of SA characters

75 UnicodeSet fComplexContext;

76

77 // true when fBreakIter has returned DONE

78 UBool fDone;

79 };

80

81 /*

82 * This is the main class. It compares word breaks and reports the differences.

83 */

84 class ThaiWordbreakTest

85 {

86 public:

87 // The main constructor:

88 // spaces - pointer to a UChar array for the text with spaces

89 // spaceCount - the number of characters in the spaces array

90 // noSpaces - pointer to a UChar array for the text without spaces

91 // noSpaceCount - the number of characters in the noSpaces array

92 // verbose - report all breaks if true, otherwise just report differenc es

93 ThaiWordbreakTest(const UChar spaces, int32_t spaceCount, const UChar noSp aces, int32_t noSpaceCount, UBool verbose);

94 ~ThaiWordbreakTest();

95

96 // returns the number of breaks that are in the spaces array

97 // but aren't found in the noSpaces array

98 int32_t getBreaksNotFound();

99

100 // returns the number of breaks which are found in the noSpaces

101 // array but aren't in the spaces array

102 int32_t getInvalidBreaks();

103

104 // returns the number of words found in the spaces array

105 int32_t getWordCount();

106

107 // reads the input Unicode text file:

108 // fileName - the path name of the file

109 // charCount - set to the number of UChars read from the file

110 // returns - the address of the UChar array containing the characters

111 static const UChar readFile(char fileName, int32_t &charCount);

112

113 // removes spaces form the input UChar array:

114 // spaces - pointer to the input UChar array

115 // count - number of UChars in the spaces array

116 // nonSpaceCount - the number of UChars in the result array

117 // returns - the address of the UChar array with spaces removed

118 static const UChar crunchSpaces(const UChar spaces, int32_t count, int32_t &nonSpaceCount);

119

120 private:

121 // The no arg constructor - private so clients can't call it

122 ThaiWordbreakTest();

123

124 // This does the actual comparison:

125 // spaces - the address of the UChar array for the text with spaces

126 // spaceCount - the number of UChars in the spaces array

127 // noSpaces - the address of the UChar array for the text without spaces

128 // noSpaceCount - the number of UChars in the noSpaces array

129 // returns - true if all breaks match, FALSE otherwise

130 UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,

131 const UChar *noSpaces, int32_t noSpaceCount);

132

133 // helper method to report a break in the spaces

134 // array that's not found in the noSpaces array

135 void breakNotFound(int32_t br);

136

137 // helper method to report a break that's found in

138 // the noSpaces array that's not in the spaces array

139 void foundInvalidBreak(int32_t br);

140

141 // count of breaks in the spaces array that

142 // aren't found in the noSpaces array

143 int32_t fBreaksNotFound;

144

145 // count of breaks found in the noSpaces array

146 // that aren't in the spaces array

147 int32_t fInvalidBreaks;

148

149 // number of words found in the spaces array

150 int32_t fWordCount;

151

152 // report all breaks if true, otherwise just report differences

153 UBool fVerbose;

154 };

155

156 /*

157 * The main constructor: it calls compareWordBreaks and reports any differences

158 */

159 ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,

160 const UChar *noSpaces, int32_t noSpaceCount , UBool verbose)

161 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)

162 {

163 compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);

164 }

165

166 /*

167 * The no arg constructor

168 */

169 ThaiWordbreakTest::ThaiWordbreakTest()

170 {

171 // nothing

172 }

173

174 /*

175 * The destructor

176 */

177 ThaiWordbreakTest::~ThaiWordbreakTest()

178 {

179 // nothing?

180 }

181

182 /*

183 * returns the number of breaks in the spaces array

184 * that aren't found in the noSpaces array

185 */

186 inline int32_t ThaiWordbreakTest::getBreaksNotFound()

187 {

188 return fBreaksNotFound;

189 }

190

191 /*

192 * Returns the number of breaks found in the noSpaces

193 * array that aren't in the spaces array

194 */

195 inline int32_t ThaiWordbreakTest::getInvalidBreaks()

196 {

197 return fInvalidBreaks;

198 }

199

200 /*

201 * Returns the number of words found in the spaces array

202 */

203 inline int32_t ThaiWordbreakTest::getWordCount()

204 {

205 return fWordCount;

206 }

207

208 /*

209 * This method does the acutal break comparison and reports the results.

210 * It uses a SpaceBreakIterator to iterate over the text with spaces,

211 * and a word instance of a Thai BreakIterator to iterate over the text

212 * without spaces.

213 */

214 UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCou nt,

215 const UChar *noSpaces, int32_t noSpac eCount)

216 {

217 UBool result = TRUE;

218 Locale thai("th");

219 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, n oSpaceCount);

220 UErrorCode status = U_ZERO_ERROR;

221

222 BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);

223 breakIter->adoptText(noSpaceIter);

224

225 SpaceBreakIterator spaceIter(spaces, spaceCount);

226

227 int32_t nextBreak = 0;

228 int32_t nextSpaceBreak = 0;

229 int32_t iterCount = 0;

230

231 while (TRUE) {

232 nextSpaceBreak = spaceIter.next();

233 nextBreak = breakIter->next();

234

235 if (nextSpaceBreak == BreakIterator::DONE \|\| nextBreak == BreakIterator: :DONE) {

236 if (nextBreak != BreakIterator::DONE) {

237 fprintf(stderr, "break iterator didn't end.\n");

238 } else if (nextSpaceBreak != BreakIterator::DONE) {

239 fprintf(stderr, "premature break iterator end.\n");

240 }

241

242 break;

243 }

244

245 while (nextSpaceBreak != nextBreak &&

246 nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterat or::DONE) {

247 if (nextSpaceBreak < nextBreak) {

248 breakNotFound(nextSpaceBreak);

249 result = FALSE;

250 nextSpaceBreak = spaceIter.next();

251 } else if (nextSpaceBreak > nextBreak) {

252 foundInvalidBreak(nextBreak);

253 result = FALSE;

254 nextBreak = breakIter->next();

255 }

256 }

257

258 if (fVerbose) {

259 printf("%d %d\n", nextSpaceBreak, nextBreak);

260 }

261 }

262

263

264 fWordCount = spaceIter.getWordCount();

265

266 delete breakIter;

267

268 return result;

269 }

270

271 /*

272 * Report a break that's in the text with spaces but

273 * not found in the text without spaces.

274 */

275 void ThaiWordbreakTest::breakNotFound(int32_t br)

276 {

277 if (fVerbose) {

278 printf("%d ****\n", br);

279 } else {

280 fprintf(stderr, "break not found: %d\n", br);

281 }

282

283 fBreaksNotFound += 1;

284 }

285

286 /*

287 * Report a break that's found in the text without spaces

288 * that isn't in the text with spaces.

289 */

290 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)

291 {

292 if (fVerbose) {

293 printf("**** %d\n", br);

294 } else {

295 fprintf(stderr, "found invalid break: %d\n", br);

296 }

297

298 fInvalidBreaks += 1;

299 }

300

301 /*

302 * Read the text from a file. The text must start with a Unicode Byte

303 * Order Mark (BOM) so that we know what order to read the bytes in.

304 */

305 const UChar ThaiWordbreakTest::readFile(char fileName, int32_t &charCount)

306 {

307 FILE *f;

308 int32_t fileSize;

309

310 UChar *buffer;

311 char *bufferChars;

312

313 f = fopen(fileName, "rb");

314

315 if( f == NULL ) {

316 fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errn o));

317 return 0;

318 }

319

320 fseek(f, 0, SEEK_END);

321 fileSize = ftell(f);

322

323 fseek(f, 0, SEEK_SET);

324 bufferChars = new char[fileSize];

325

326 if(bufferChars == 0) {

327 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileN ame, strerror(errno));

328 fclose(f);

329 return 0;

330 }

331

332 fread(bufferChars, sizeof(char), fileSize, f);

333 if( ferror(f) ) {

334 fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errn o));

335 fclose(f);

336 delete[] bufferChars;

337 return 0;

338 }

339 fclose(f);

340

341 UnicodeString myText(bufferChars, fileSize, "UTF-8");

342

343 delete[] bufferChars;

344

345 charCount = myText.length();

346 buffer = new UChar[charCount];

347 if(buffer == 0) {

348 fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileN ame, strerror(errno));

349 return 0;

350 }

351

352 myText.extract(1, myText.length(), buffer);

353 charCount--; // skip the BOM

354 buffer[charCount] = 0; // NULL terminate for easier reading in the debugg er

355

356 return buffer;

357 }

358

359 /*

360 * Remove spaces from the input UChar array.

361 *

362 * We check explicitly for a Unicode code value of 0x0020

363 * because Unicode::isSpaceChar returns true for CR, LF, etc.

364 *

365 */

366 const UChar ThaiWordbreakTest::crunchSpaces(const UChar spaces, int32_t count, int32_t &nonSpaceCount)

367 {

368 int32_t i, out, spaceCount;

369

370 spaceCount = 0;

371 for (i = 0; i < count; i += 1) {

372 if (spaces[i] == 0x0020 /Unicode::isSpaceChar(spaces[i])/) {

373 spaceCount += 1;

374 }

375 }

376

377 nonSpaceCount = count - spaceCount;

378 UChar *noSpaces = new UChar[nonSpaceCount];

379

380 if (noSpaces == 0) {

381 fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n ");

382 return 0;

383 }

384

385 for (out = 0, i = 0; i < count; i += 1) {

386 if (spaces[i] != 0x0020 /! Unicode::isSpaceChar(spaces[i])/) {

387 noSpaces[out++] = spaces[i];

388 }

389 }

390

391 return noSpaces;

392 }

393

394 /*

395 * Generate a text file with spaces in it from a file without.

396 */

397 int generateFile(const UChar *chars, int32_t length) {

398 Locale root("");

399 UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, leng th);

400 UErrorCode status = U_ZERO_ERROR;

401

402 UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status) ;

403 BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);

404 breakIter->adoptText(noSpaceIter);

405 char outbuf[1024];

406 int32_t strlength;

407 UChar bom = 0xFEFF;

408

409 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &statu s));

410 int32_t prevbreak = 0;

411 while (U_SUCCESS(status)) {

412 int32_t nextbreak = breakIter->next();

413 if (nextbreak == BreakIterator::DONE) {

414 break;

415 }

416 printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prev break],

417 nextbreak-prevbreak, &status));

418 if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])

419 && complexContext.contains(chars[nextbreak])) {

420 printf(" ");

421 }

422 prevbreak = nextbreak;

423 }

424

425 if (U_FAILURE(status)) {

426 fprintf(stderr, "generate failed: %s\n", u_errorName(status));

427 return status;

428 }

429 else {

430 return 0;

431 }

432 }

433

434 /*

435 * The main routine. Read the command line arguments, read the text file,

436 * remove the spaces, do the comparison and report the final results

437 */

438 int main(int argc, char **argv)

439 {

440 char *fileName = "space.txt";

441 int arg = 1;

442 UBool verbose = FALSE;

443 UBool generate = FALSE;

444

445 if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {

446 generate = TRUE;

447 arg += 1;

448 }

449

450 if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {

451 verbose = TRUE;

452 arg += 1;

453 }

454

455 if (arg == argc - 1) {

456 fileName = argv[arg++];

457 }

458

459 if (arg != argc) {

460 fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);

461 return 1;

462 }

463

464 int32_t spaceCount, nonSpaceCount;

465 const UChar spaces, noSpaces;

466

467 spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);

468

469 if (spaces == 0) {

470 return 1;

471 }

472

473 if (generate) {

474 return generateFile(spaces, spaceCount);

475 }

476

477 noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount );

478

479 if (noSpaces == 0) {

480 return 1;

481 }

482

483 ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose) ;

484

485 printf("word count: %d\n", test.getWordCount());

486 printf("breaks not found: %d\n", test.getBreaksNotFound());

487 printf("invalid breaks found: %d\n", test.getInvalidBreaks());

488

489 return 0;

490 }

491

492 /*

493 * The main constructor. Clear all the counts and construct a default

494 * word instance of a BreakIterator.

495 */

496 SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)

497 : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0) , fDone(FALSE)

498 {

499 UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);

500 UErrorCode status = U_ZERO_ERROR;

501 fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), stat us);

502 Locale root("");

503

504 fBreakIter = BreakIterator::createWordInstance(root, status);

505 fBreakIter->adoptText(iter);

506 }

507

508 SpaceBreakIterator::SpaceBreakIterator()

509 {

510 // nothing

511 }

512

513 /*

514 * The destructor. delete the underlying BreakIterator

515 */

516 SpaceBreakIterator::~SpaceBreakIterator()

517 {

518 delete fBreakIter;

519 }

520

521 /*

522 * Return the next break, counting words and spaces.

523 */

524 int32_t SpaceBreakIterator::next()

525 {

526 if (fDone) {

527 return BreakIterator::DONE;

528 }

529

530 int32_t nextBreak;

531 do {

532 nextBreak = fBreakIter->next();

533

534 if (nextBreak == BreakIterator::DONE) {

535 fDone = TRUE;

536 return BreakIterator::DONE;

537 }

538 }

539 while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])

540 && fComplexContext.contains(fText[nextBreak]));

541

542 int32_t result = nextBreak - fSpaceCount;

543

544 if (nextBreak < fTextCount) {

545 if (fText[nextBreak] == 0x0020 /Unicode::isSpaceChar(fText[nextBreak]) /) {

546 fSpaceCount += fBreakIter->next() - nextBreak;

547 }

548 }

549

550 fWordCount += 1;

551

552 return result;

553 }

554

555 /*

556 * Returns the current space count

557 */

558 int32_t SpaceBreakIterator::getSpaceCount()

559 {

560 return fSpaceCount;

561 }

562

563 /*

564 * Returns the current word count

565 */

566 int32_t SpaceBreakIterator::getWordCount()

567 {

568 return fWordCount;

569 }

570

571

OLD	NEW

« no previous file with comments | « source/test/thaitest/space.txt ('k') | source/test/thaitest/thaitest.dsp » ('j') | no next file with comments »