source/test/intltest/usettest.cpp - Issue 2435373002: Delete source/test

Side by Side Diff: source/test/intltest/usettest.cpp

Issue 2435373002: Delete source/test (Closed)

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /*

2 ********************************************************************************

3 * Copyright (C) 1999-2015 International Business Machines Corporation and

4 * others. All Rights Reserved.

5 ********************************************************************************

6 * Date Name Description

7 * 10/20/99 alan Creation.

8 * 03/22/2000 Madhu Added additional tests

9 ********************************************************************************

10 */

11

12 #include <stdio.h>

13

14 #include <string.h>

15 #include "unicode/utypes.h"

16 #include "usettest.h"

17 #include "unicode/ucnv.h"

18 #include "unicode/uniset.h"

19 #include "unicode/uchar.h"

20 #include "unicode/usetiter.h"

21 #include "unicode/ustring.h"

22 #include "unicode/parsepos.h"

23 #include "unicode/symtable.h"

24 #include "unicode/uversion.h"

25 #include "hash.h"

26

// Reports a data error -- with source file, line and the ICU error name --
// when `status` holds a failure code. Control continues in the caller.
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, u_errorName(status)); \
    } \
}

// Reports a data error, with source file and line, when `expr` is false.
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
}

33

34 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {

35 UnicodeString pat;

36 set.toPattern(pat);

37 return left + UnicodeSetTest::escape(pat);

38 }

39

// Expands to one `case` of the runIndexedTest() switch. The test name is
// always published through `name`; the test itself runs, after a banner is
// logged, only when `exec` is set. The use site supplies the trailing ';'.
#define CASE(id, test) \
    case id: \
        name = #test; \
        if (exec) { \
            logln(#test "---"); \
            logln(); \
            test(); \
        } \
        break

48

49 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {

50 }

51

52 UConverter *UnicodeSetTest::openUTF8Converter() {

53 if(utf8Cnv==NULL) {

54 UErrorCode errorCode=U_ZERO_ERROR;

55 utf8Cnv=ucnv_open("UTF-8", &errorCode);

56 }

57 return utf8Cnv;

58 }

59

60 UnicodeSetTest::~UnicodeSetTest() {

61 ucnv_close(utf8Cnv);

62 }

63

64 void

65 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,

66 const char* &name, char* /par/) {

67 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");

68 switch (index) {

69 CASE(0,TestPatterns);

70 CASE(1,TestAddRemove);

71 CASE(2,TestCategories);

72 CASE(3,TestCloneEqualHash);

73 CASE(4,TestMinimalRep);

74 CASE(5,TestAPI);

75 CASE(6,TestScriptSet);

76 CASE(7,TestPropertySet);

77 CASE(8,TestClone);

78 CASE(9,TestExhaustive);

79 CASE(10,TestToPattern);

80 CASE(11,TestIndexOf);

81 CASE(12,TestStrings);

82 CASE(13,Testj2268);

83 CASE(14,TestCloseOver);

84 CASE(15,TestEscapePattern);

85 CASE(16,TestInvalidCodePoint);

86 CASE(17,TestSymbolTable);

87 CASE(18,TestSurrogate);

88 CASE(19,TestPosixClasses);

89 CASE(20,TestIteration);

90 CASE(21,TestFreezable);

91 CASE(22,TestSpan);

92 CASE(23,TestStringSpan);

93 CASE(24,TestUCAUnsafeBackwards);

94 default: name = ""; break;

95 }

96 }

97

// Marker entry used inside the string arrays handed to expectToPattern().
// NOTE(review): presumably it separates strings that must be in the set from
// strings that must not be -- confirm against expectToPattern().
static const char NOT[] = "%%%%";

99

100 /**

101 * UVector was improperly copying contents

102 * This code will crash this is still true

103 */

104 void UnicodeSetTest::Testj2268() {

105 UnicodeSet t;

106 t.add(UnicodeString("abc"));

107 UnicodeSet test(t);

108 UnicodeString ustrPat;

109 test.toPattern(ustrPat, TRUE);

110 }

111

112 /**

113 * Test toPattern().

114 */

115 void UnicodeSetTest::TestToPattern() {

116 UErrorCode ec = U_ZERO_ERROR;

117

118 // Test that toPattern() round trips with syntax characters and

119 // whitespace.

120 {

121 static const char* OTHER_TOPATTERN_TESTS[] = {

122 "[[:latin:]&[:greek:]]",

123 "[[:latin:]-[:greek:]]",

124 "[:nonspacing mark:]",

125 NULL

126 };

127

128 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {

129 ec = U_ZERO_ERROR;

130 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);

131 if (U_FAILURE(ec)) {

132 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_ TESTS[j] + " - " + UnicodeString(u_errorName(ec)));

133 continue;

134 }

135 checkPat(OTHER_TOPATTERN_TESTS[j], s);

136 }

137

138 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {

139 if ((i <= 0xFF && !u_isalpha(i)) \|\| u_isspace(i)) {

140

141 // check various combinations to make sure they all work.

142 if (i != 0 && !toPatternAux(i, i)){

143 continue;

144 }

145 if (!toPatternAux(0, i)){

146 continue;

147 }

148 if (!toPatternAux(i, 0xFFFF)){

149 continue;

150 }

151 }

152 }

153 }

154

155 // Test pattern behavior of multicharacter strings.

156 {

157 ec = U_ZERO_ERROR;

158 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);

159

160 // This loop isn't a loop. It's here to make the compiler happy.

161 // If you're curious, try removing it and changing the 'break'

162 // statements (except for the last) to goto's.

163 for (;;) {

164 if (U_FAILURE(ec)) break;

165 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};

166 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);

167

168 s->add("ac");

169 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};

170 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);

171

172 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);

173 if (U_FAILURE(ec)) break;

174 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};

175 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3 );

176

177 s->add("[]");

178 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};

179 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}] "), exp4);

180

181 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r} ]"), ec);

182 if (U_FAILURE(ec)) break;

183 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};

184 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4 E01\\u4E02}]"), exp5);

185

186 // j2189

187 s->clear();

188 s->add(UnicodeString("abc", ""));

189 s->add(UnicodeString("abc", ""));

190 const char* exp6[] = {"abc", NOT, "ab", NULL};

191 expectToPattern(*s, "[{abc}]", exp6);

192

193 break;

194 }

195

196 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");

197 delete s;

198 }

199

200 // JB#3400: For 2 character ranges prefer [ab] to [a-b]

201 UnicodeSet s;

202 s.add((UChar)97, (UChar)98); // 'a', 'b'

203 expectToPattern(s, "[ab]", NULL);

204 }

205

206 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {

207

208 // use Integer.toString because Utility.hex doesn't handle ints

209 UnicodeString pat = "";

210 // TODO do these in hex

211 //String source = "0x" + Integer.toString(start,16).toUpperCase();

212 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase( );

213 UnicodeString source;

214 source = source + (uint32_t)start;

215 if (start != end)

216 source = source + ".." + (uint32_t)end;

217 UnicodeSet testSet;

218 testSet.add(start, end);

219 return checkPat(source, testSet);

220 }

221

222 UBool UnicodeSetTest::checkPat(const UnicodeString& source,

223 const UnicodeSet& testSet) {

224 // What we want to make sure of is that a pattern generated

225 // by toPattern(), with or without escaped unprintables, can

226 // be passed back into the UnicodeSet constructor.

227 UnicodeString pat0;

228

229 testSet.toPattern(pat0, TRUE);

230

231 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;

232

233 //String pat1 = unescapeLeniently(pat0);

234 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;

235

236 UnicodeString pat2;

237 testSet.toPattern(pat2, FALSE);

238 if (!checkPat(source, testSet, pat2)) return FALSE;

239

240 //String pat3 = unescapeLeniently(pat2);

241 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;

242

243 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);

244 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);

245 return TRUE;

246 }

247

248 UBool UnicodeSetTest::checkPat(const UnicodeString& source,

249 const UnicodeSet& testSet,

250 const UnicodeString& pat) {

251 UErrorCode ec = U_ZERO_ERROR;

252 UnicodeSet testSet2(pat, ec);

253 if (testSet2 != testSet) {

254 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);

255 return FALSE;

256 }

257 return TRUE;

258 }

259

260 void

261 UnicodeSetTest::TestPatterns(void) {

262 UnicodeSet set;

263 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");

264 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");

265 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");

266 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");

267 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");

268 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");

269

270 // Throw in a test of complement

271 set.complement();

272 UnicodeString exp;

273 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append ((UChar)0xFFFF);

274 expectPairs(set, exp);

275 }

276

277 void

278 UnicodeSetTest::TestCategories(void) {

279 UErrorCode status = U_ZERO_ERROR;

280 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]

281 UnicodeSet set(pat, status);

282 if (U_FAILURE(status)) {

283 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));

284 return;

285 } else {

286 expectContainment(set, pat, "ABC", "abc");

287 }

288

289 UChar32 i;

290 int32_t failures = 0;

291 // Make sure generation of L doesn't pollute cached Lu set

292 // First generate L, then Lu

293 set.applyPattern("[:L:]", status);

294 if (U_FAILURE(status)) { errln("FAIL"); return; }

295 for (i=0; i<0x200; ++i) {

296 UBool l = u_isalpha((UChar)i);

297 if (l != set.contains(i)) {

298 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +

299 set.contains(i));

300 if (++failures == 10) break;

301 }

302 }

303

304 set.applyPattern("[:Lu:]", status);

305 if (U_FAILURE(status)) { errln("FAIL"); return; }

306 for (i=0; i<0x200; ++i) {

307 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);

308 if (lu != set.contains(i)) {

309 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +

310 set.contains(i));

311 if (++failures == 20) break;

312 }

313 }

314 }

315 void

316 UnicodeSetTest::TestCloneEqualHash(void) {

317 UErrorCode status = U_ZERO_ERROR;

318 // set1 and set2 used to be built with the obsolete constructor taking

319 // UCharCategory values; replaced with pattern constructors

320 // markus 20030502

321 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter} "), status); // :Ll: Letter, lowercase

322 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); / / Letter, lowercase

323 if (U_FAILURE(status)){

324 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));

325 return;

326 }

327 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}") , status); //Number, Decimal digit

328 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit

329 if (U_FAILURE(status)){

330 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");

331 return;

332 }

333

334 if (set1 != set1a) {

335 errln("FAIL: category constructor for Ll broken");

336 }

337 if (set2 != set2a) {

338 errln("FAIL: category constructor for Nd broken");

339 }

340 delete set1a;

341 delete set2a;

342

343 logln("Testing copy construction");

344 UnicodeSet set1copy=new UnicodeSet(set1);

345 if(set1 != set1copy \|\| set1 == set2 \|\|

346 getPairs(set1) != getPairs(set1copy) \|\|

347 set1->hashCode() != set1copy->hashCode()){

348 errln("FAIL : Error in copy construction");

349 return;

350 }

351

352 logln("Testing =operator");

353 UnicodeSet set1equal=*set1;

354 UnicodeSet set2equal=*set2;

355 if(set1equal != set1 \|\| set1equal != set1copy \|\| set2equal != *set2 \|\|

356 set2equal == set1 \|\| set2equal == set1copy \|\| set2equal == set1equal){

357 errln("FAIL: Error in =operator");

358 }

359

360 logln("Testing clone()");

361 UnicodeSet set1clone=(UnicodeSet)set1->clone();

362 UnicodeSet set2clone=(UnicodeSet)set2->clone();

363 if(set1clone != set1 \|\| set1clone != set1copy \|\| *set1clone != set1equal \|\|

364 set2clone != set2 \|\| set2clone == set1copy \|\| *set2clone != set2equa l \|\|

365 set2clone == set1 \|\| set2clone == set1equal \|\| set2clone == *set1clo ne){

366 errln("FAIL: Error in clone");

367 }

368

369 logln("Testing hashcode");

370 if(set1->hashCode() != set1equal.hashCode() \|\| set1->hashCode() != set1clone ->hashCode() \|\|

371 set2->hashCode() != set2equal.hashCode() \|\| set2->hashCode() != set2clon e->hashCode() \|\|

372 set1copy->hashCode() != set1equal.hashCode() \|\| set1copy->hashCode() != set1clone->hashCode() \|\|

373 set1->hashCode() == set2->hashCode() \|\| set1copy->hashCode() == set2->h ashCode() \|\|

374 set2->hashCode() == set1clone->hashCode() \|\| set2->hashCode() == set1equ al.hashCode() ){

375 errln("FAIL: Error in hashCode()");

376 }

377

378 delete set1;

379 delete set1copy;

380 delete set2;

381 delete set1clone;

382 delete set2clone;

383

384

385 }

386 void

387 UnicodeSetTest::TestAddRemove(void) {

388 UnicodeSet set; // Construct empty set

389 doAssert(set.isEmpty() == TRUE, "set should be empty");

390 doAssert(set.size() == 0, "size should be 0");

391 set.complement();

392 doAssert(set.size() == 0x110000, "size should be 0x110000");

393 set.clear();

394 set.add(0x0061, 0x007a);

395 expectPairs(set, "az");

396 doAssert(set.isEmpty() == FALSE, "set should not be empty");

397 doAssert(set.size() != 0, "size should not be equal to 0");

398 doAssert(set.size() == 26, "size should be equal to 26");

399 set.remove(0x006d, 0x0070);

400 expectPairs(set, "alqz");

401 doAssert(set.size() == 22, "size should be equal to 22");

402 set.remove(0x0065, 0x0067);

403 expectPairs(set, "adhlqz");

404 doAssert(set.size() == 19, "size should be equal to 19");

405 set.remove(0x0064, 0x0069);

406 expectPairs(set, "acjlqz");

407 doAssert(set.size() == 16, "size should be equal to 16");

408 set.remove(0x0063, 0x0072);

409 expectPairs(set, "absz");

410 doAssert(set.size() == 10, "size should be equal to 10");

411 set.add(0x0066, 0x0071);

412 expectPairs(set, "abfqsz");

413 doAssert(set.size() == 22, "size should be equal to 22");

414 set.remove(0x0061, 0x0067);

415 expectPairs(set, "hqsz");

416 set.remove(0x0061, 0x007a);

417 expectPairs(set, "");

418 doAssert(set.isEmpty() == TRUE, "set should be empty");

419 doAssert(set.size() == 0, "size should be 0");

420 set.add(0x0061);

421 doAssert(set.isEmpty() == FALSE, "set should not be empty");

422 doAssert(set.size() == 1, "size should not be equal to 1");

423 set.add(0x0062);

424 set.add(0x0063);

425 expectPairs(set, "ac");

426 doAssert(set.size() == 3, "size should not be equal to 3");

427 set.add(0x0070);

428 set.add(0x0071);

429 expectPairs(set, "acpq");

430 doAssert(set.size() == 5, "size should not be equal to 5");

431 set.clear();

432 expectPairs(set, "");

433 doAssert(set.isEmpty() == TRUE, "set should be empty");

434 doAssert(set.size() == 0, "size should be 0");

435

436 // Try removing an entire set from another set

437 expectPattern(set, "[c-x]", "cx");

438 UnicodeSet set2;

439 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");

440 set.removeAll(set2);

441 expectPairs(set, "deluxx");

442

443 // Try adding an entire set to another set

444 expectPattern(set, "[jackiemclean]", "aacceein");

445 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");

446 set.addAll(set2);

447 expectPairs(set, "aacehort");

448 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");

449

450 // Try retaining an set of elements contained in another set (intersection)

451 UnicodeSet set3;

452 expectPattern(set3, "[a-c]", "ac");

453 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elemen ts in set3");

454 set3.remove(0x0062);

455 expectPairs(set3, "aacc");

456 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");

457 set.retainAll(set3);

458 expectPairs(set, "aacc");

459 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");

460 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");

461 set.clear();

462 doAssert(set.size() != set3.size(), "set.size() != set3.size()");

463

464 // Test commutativity

465 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");

466 expectPattern(set2, "[jackiemclean]", "aacceein");

467 set.addAll(set2);

468 expectPairs(set, "aacehort");

469 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");

470

471

472

473

474 }

475

476 /**

477 * Make sure minimal representation is maintained.

478 */

479 void UnicodeSetTest::TestMinimalRep() {

480 UErrorCode status = U_ZERO_ERROR;

481 // This is pretty thoroughly tested by checkCanonicalRep()

482 // run against the exhaustive operation results. Use the code

483 // here for debugging specific spot problems.

484

485 // 1 overlap against 2

486 UnicodeSet set("[h-km-q]", status);

487 if (U_FAILURE(status)) { errln("FAIL"); return; }

488 UnicodeSet set2("[i-o]", status);

489 if (U_FAILURE(status)) { errln("FAIL"); return; }

490 set.addAll(set2);

491 expectPairs(set, "hq");

492 // right

493 set.applyPattern("[a-m]", status);

494 if (U_FAILURE(status)) { errln("FAIL"); return; }

495 set2.applyPattern("[e-o]", status);

496 if (U_FAILURE(status)) { errln("FAIL"); return; }

497 set.addAll(set2);

498 expectPairs(set, "ao");

499 // left

500 set.applyPattern("[e-o]", status);

501 if (U_FAILURE(status)) { errln("FAIL"); return; }

502 set2.applyPattern("[a-m]", status);

503 if (U_FAILURE(status)) { errln("FAIL"); return; }

504 set.addAll(set2);

505 expectPairs(set, "ao");

506 // 1 overlap against 3

507 set.applyPattern("[a-eg-mo-w]", status);

508 if (U_FAILURE(status)) { errln("FAIL"); return; }

509 set2.applyPattern("[d-q]", status);

510 if (U_FAILURE(status)) { errln("FAIL"); return; }

511 set.addAll(set2);

512 expectPairs(set, "aw");

513 }

514

515 void UnicodeSetTest::TestAPI() {

516 UErrorCode status = U_ZERO_ERROR;

517 // default ct

518 UnicodeSet set;

519 if (!set.isEmpty() \|\| set.getRangeCount() != 0) {

520 errln((UnicodeString)"FAIL, set should be empty but isn't: " +

521 set);

522 }

523

524 // clear(), isEmpty()

525 set.add(0x0061);

526 if (set.isEmpty()) {

527 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +

528 set);

529 }

530 set.clear();

531 if (!set.isEmpty()) {

532 errln((UnicodeString)"FAIL, set should be empty but isn't: " +

533 set);

534 }

535

536 // size()

537 set.clear();

538 if (set.size() != 0) {

539 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +

540 ": " + set);

541 }

542 set.add(0x0061);

543 if (set.size() != 1) {

544 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +

545 ": " + set);

546 }

547 set.add(0x0031, 0x0039);

548 if (set.size() != 10) {

549 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +

550 ": " + set);

551 }

552

553 // contains(first, last)

554 set.clear();

555 set.applyPattern("[A-Y 1-8 b-d l-y]", status);

556 if (U_FAILURE(status)) { errln("FAIL"); return; }

557 for (int32_t i = 0; i<set.getRangeCount(); ++i) {

558 UChar32 a = set.getRangeStart(i);

559 UChar32 b = set.getRangeEnd(i);

560 if (!set.contains(a, b)) {

561 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + ' -' + (unsigned short)b +

562 " but doesn't: " + set);

563 }

564 if (set.contains((UChar32)(a-1), b)) {

565 errln((UnicodeString)"FAIL, shouldn't contain " +

566 (unsigned short)(a-1) + '-' + (unsigned short)b +

567 " but does: " + set);

568 }

569 if (set.contains(a, (UChar32)(b+1))) {

570 errln((UnicodeString)"FAIL, shouldn't contain " +

571 (unsigned short)a + '-' + (unsigned short)(b+1) +

572 " but does: " + set);

573 }

574 }

575

576 // Ported InversionList test.

577 UnicodeSet a((UChar32)3,(UChar32)10);

578 UnicodeSet b((UChar32)7,(UChar32)15);

579 UnicodeSet c;

580

581 logln((UnicodeString)"a [3-10]: " + a);

582 logln((UnicodeString)"b [7-15]: " + b);

583 c = a;

584 c.addAll(b);

585 UnicodeSet exp((UChar32)3,(UChar32)15);

586 if (c == exp) {

587 logln((UnicodeString)"c.set(a).add(b): " + c);

588 } else {

589 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp) ;

590 }

591 c.complement();

592 exp.set((UChar32)0, (UChar32)2);

593 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);

594 if (c == exp) {

595 logln((UnicodeString)"c.complement(): " + c);

596 } else {

597 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);

598 }

599 c.complement();

600 exp.set((UChar32)3, (UChar32)15);

601 if (c == exp) {

602 logln((UnicodeString)"c.complement(): " + c);

603 } else {

604 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);

605 }

606 c = a;

607 c.complementAll(b);

608 exp.set((UChar32)3,(UChar32)6);

609 exp.add((UChar32)11,(UChar32) 15);

610 if (c == exp) {

611 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);

612 } else {

613 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);

614 }

615

616 exp = c;

617 bitsToSet(setToBits(c), c);

618 if (c == exp) {

619 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);

620 } else {

621 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);

622 }

623

624 // Additional tests for coverage JB#2118

625 //UnicodeSet::complement(class UnicodeString const &)

626 //UnicodeSet::complementAll(class UnicodeString const &)

627 //UnicodeSet::containsNone(class UnicodeSet const &)

628 //UnicodeSet::containsNone(long,long)

629 //UnicodeSet::containsSome(class UnicodeSet const &)

630 //UnicodeSet::containsSome(long,long)

631 //UnicodeSet::removeAll(class UnicodeString const &)

632 //UnicodeSet::retain(long)

633 //UnicodeSet::retainAll(class UnicodeString const &)

634 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)

635 //UnicodeSetIterator::getString(void)

636 set.clear();

637 set.complement("ab");

638 exp.applyPattern("[{ab}]", status);

639 if (U_FAILURE(status)) { errln("FAIL"); return; }

640 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }

641

642 UnicodeSetIterator iset(set);

643 if (!iset.next() \|\| !iset.isString()) {

644 errln("FAIL: UnicodeSetIterator::next/isString");

645 } else if (iset.getString() != "ab") {

646 errln("FAIL: UnicodeSetIterator::getString");

647 }

648

649 set.add((UChar32)0x61, (UChar32)0x7A);

650 set.complementAll("alan");

651 exp.applyPattern("[{ab}b-kmo-z]", status);

652 if (U_FAILURE(status)) { errln("FAIL"); return; }

653 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }

654

655 exp.applyPattern("[a-z]", status);

656 if (U_FAILURE(status)) { errln("FAIL"); return; }

657 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }

658 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }

659 exp.applyPattern("[aln]", status);

660 if (U_FAILURE(status)) { errln("FAIL"); return; }

661 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }

662 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }

663

664 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {

665 errln("FAIL: containsNone(UChar32, UChar32)");

666 }

667 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {

668 errln("FAIL: containsSome(UChar32, UChar32)");

669 }

670 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {

671 errln("FAIL: containsNone(UChar32, UChar32)");

672 }

673 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {

674 errln("FAIL: containsSome(UChar32, UChar32)");

675 }

676

677 set.removeAll("liu");

678 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);

679 if (U_FAILURE(status)) { errln("FAIL"); return; }

680 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }

681

682 set.retainAll("star");

683 exp.applyPattern("[rst]", status);

684 if (U_FAILURE(status)) { errln("FAIL"); return; }

685 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }

686

687 set.retain((UChar32)0x73);

688 exp.applyPattern("[s]", status);

689 if (U_FAILURE(status)) { errln("FAIL"); return; }

690 if (set != exp) { errln("FAIL: retain('s')"); return; }

691

692 uint16_t buf[32];

693 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);

694 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }

695 if (slen != 3 \|\| buf[0] != 2 \|\| buf[1] != 0x73 \|\| buf[2] != 0x74) {

696 errln("FAIL: serialize");

697 return;

698 }

699

700 // Conversions to and from USet

701 UnicodeSet *uniset = &set;

702 USet *uset = uniset->toUSet();

703 TEST_ASSERT((void )uset == (void )uniset);

704 UnicodeSet *setx = UnicodeSet::fromUSet(uset);

705 TEST_ASSERT((void )setx == (void )uset);

706 const UnicodeSet *constSet = uniset;

707 const USet *constUSet = constSet->toUSet();

708 TEST_ASSERT((void )constUSet == (void )constSet);

709 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);

710 TEST_ASSERT((void )constSetx == (void )constUSet);

711

712 // span(UnicodeString) and spanBack(UnicodeString) convenience methods

713 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccc cc");

714 UnicodeSet ac(0x61, 0x63);

715 ac.remove(0x62).freeze();

716 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 \|\|

717 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 \|\|

718 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 \|\|

719 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 \|\|

720 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 \|\|

721 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 \|\|

722 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 \|\|

723 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 \|\|

724 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 \|\|

725 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30

726 ) {

727 errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes ");

728 }

729 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 \|\|

730 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 \|\|

731 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 \|\|

732 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 \|\|

733 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 \|\|

734 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 \|\|

735 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 \|\|

736 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 \|\|

737 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 \|\|

738 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20

739 ) {

740 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start i ndexes");

741 }

742 }

743

744 void UnicodeSetTest::TestIteration() {

745 UErrorCode ec = U_ZERO_ERROR;

746 int i = 0;

747 int outerLoop;

748

749 // 6 code points, 3 ranges, 2 strings, 8 total elements

750 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"

751 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);

752 TEST_ASSERT_SUCCESS(ec);

753 UnicodeSetIterator it(set);

754

755 for (outerLoop=0; outerLoop<3; outerLoop++) {

756 // Run the test multiple times, to check that iterator.reset() is workin g.

757 for (i=0; i<10; i++) {

758 UBool nextv = it.next();

759 UBool isString = it.isString();

760 int32_t codePoint = it.getCodepoint();

761 //int32_t codePointEnd = it.getCodepointEnd();

762 UnicodeString s = it.getString();

763 switch (i) {

764 case 0:

765 TEST_ASSERT(nextv == TRUE);

766 TEST_ASSERT(isString == FALSE);

767 TEST_ASSERT(codePoint==0x61);

768 TEST_ASSERT(s == "a");

769 break;

770 case 1:

771 TEST_ASSERT(nextv == TRUE);

772 TEST_ASSERT(isString == FALSE);

773 TEST_ASSERT(codePoint==0x62);

774 TEST_ASSERT(s == "b");

775 break;

776 case 2:

777 TEST_ASSERT(nextv == TRUE);

778 TEST_ASSERT(isString == FALSE);

779 TEST_ASSERT(codePoint==0x63);

780 TEST_ASSERT(s == "c");

781 break;

782 case 3:

783 TEST_ASSERT(nextv == TRUE);

784 TEST_ASSERT(isString == FALSE);

785 TEST_ASSERT(codePoint==0x79);

786 TEST_ASSERT(s == "y");

787 break;

788 case 4:

789 TEST_ASSERT(nextv == TRUE);

790 TEST_ASSERT(isString == FALSE);

791 TEST_ASSERT(codePoint==0x7a);

792 TEST_ASSERT(s == "z");

793 break;

794 case 5:

795 TEST_ASSERT(nextv == TRUE);

796 TEST_ASSERT(isString == FALSE);

797 TEST_ASSERT(codePoint==0x1abcd);

798 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));

799 break;

800 case 6:

801 TEST_ASSERT(nextv == TRUE);

802 TEST_ASSERT(isString == TRUE);

803 TEST_ASSERT(s == "str1");

804 break;

805 case 7:

806 TEST_ASSERT(nextv == TRUE);

807 TEST_ASSERT(isString == TRUE);

808 TEST_ASSERT(s == "str2");

809 break;

810 case 8:

811 TEST_ASSERT(nextv == FALSE);

812 break;

813 case 9:

814 TEST_ASSERT(nextv == FALSE);

815 break;

816 }

817 }

818 it.reset(); // prepare to run the iteration again.

819 }

820 }

821

822

823

824

825 void UnicodeSetTest::TestStrings() {

826 UErrorCode ec = U_ZERO_ERROR;

827

828 UnicodeSet* testList[] = {

829 UnicodeSet::createFromAll("abc"),

830 new UnicodeSet("[a-c]", ec),

831

832 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),

833 new UnicodeSet("[{ll}{ch}a-z]", ec),

834

835 UnicodeSet::createFrom("ab}c"),

836 new UnicodeSet("[{ab\\}c}]", ec),

837

838 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X ')),

839 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),

840

841 NULL

842 };

843

844 if (U_FAILURE(ec)) {

845 errln("FAIL: couldn't construct test sets");

846 }

847

848 for (int32_t i = 0; testList[i] != NULL; i+=2) {

849 if (U_SUCCESS(ec)) {

850 UnicodeString pat0, pat1;

851 testList[i]->toPattern(pat0, TRUE);

852 testList[i+1]->toPattern(pat1, TRUE);

853 if (testList[i] == testList[i+1]) {

854 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);

855 } else {

856 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);

857 }

858 }

859 delete testList[i];

860 delete testList[i+1];

861 }

862 }

863

864 /**

865 * Test the [:Latin:] syntax.

866 */

867 void UnicodeSetTest::TestScriptSet() {

868 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeSt ring("\\u0391\\u03B1"));

869

870 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString(" \\u0391\\u03B1"), "aA");

871

872 /* Jitterbug 1423 */

873 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsT oUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");

874

875 }

876

877 /**

878 * Test the [:Latin:] syntax.

879 */

880 void UnicodeSetTest::TestPropertySet() {

881 static const char* const DATA[] = {

882 // Pattern, Chars IN, Chars NOT in

883

884 "[:Latin:]",

885 "aA",

886 "\\u0391\\u03B1",

887

888 "[\\p{Greek}]",

889 "\\u0391\\u03B1",

890 "aA",

891

892 "\\P{ GENERAL Category = upper case letter }",

893 "abc",

894 "ABC",

895

896 #if !UCONFIG_NO_NORMALIZATION

897 // Combining class: @since ICU 2.2

898 // Check both symbolic and numeric

899 "\\p{ccc=Nukta}",

900 "\\u0ABC",

901 "abc",

902

903 "\\p{Canonical Combining Class = 11}",

904 "\\u05B1",

905 "\\u05B2",

906

907 "[:c c c = iota subscript :]",

908 "\\u0345",

909 "xyz",

910 #endif

911

912 // Bidi class: @since ICU 2.2

913 "\\p{bidiclass=lefttoright}",

914 "abc",

915 "\\u0671\\u0672",

916

917 // Binary properties: @since ICU 2.2

918 "\\p{ideographic}",

919 "\\u4E0A",

920 "x",

921

922 "[:math=false:]",

923 "q)*(",

924 // weiv: )(and * were removed from math in Unicode 4.0.1

925 //"(*+)",

926 "+<>^",

927

928 // JB#1767 \N{}, \p{ASCII}

929 "[:Ascii:]",

930 "abc\\u0000\\u007F",

931 "\\u0080\\u4E00",

932

933 "[\\N{ latin small letter a }[:name= latin small letter z:]]",

934 "az",

935 "qrs",

936

937 // JB#2015

938 "[:any:]",

939 "a\\U0010FFFF",

940 "",

941

942 "[:nv=0.5:]",

943 "\\u00BD\\u0F2A",

944 "\\u00BC",

945

946 // JB#2653: Age

947 "[:Age=1.1:]",

948 "\\u03D6", // 1.1

949 "\\u03D8\\u03D9", // 3.2

950

951 "[:Age=3.1:]",

952 "\\u1800\\u3400\\U0002f800",

953 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",

954

955 // JB#2350: Case_Sensitive

956 "[:Case Sensitive:]",

957 "A\\u1FFC\\U00010410",

958 ";\\u00B4\\U00010500",

959

960 // JB#2832: C99-compatibility props

961 "[:blank:]",

962 " \\u0009",

963 "1-9A-Z",

964

965 "[:graph:]",

966 "19AZ",

967 " \\u0003\\u0007\\u0009\\u000A\\u000D",

968

969 "[:punct:]",

970 "!@#%&*()[]{}-_\\/;:,.?'\"",

971 "09azAZ",

972

973 "[:xdigit:]",

974 "09afAF",

975 "gG!",

976

977 // Regex compatibility test

978 "[-b]", // leading '-' is literal

979 "-b",

980 "ac",

981

982 "[^-b]", // leading '-' is literal

983 "ac",

984 "-b",

985

986 "[b-]", // trailing '-' is literal

987 "-b",

988 "ac",

989

990 "[^b-]", // trailing '-' is literal

991 "ac",

992 "-b",

993

994 "[a-b-]", // trailing '-' is literal

995 "ab-",

996 "c=",

997

998 "[[a-q]&[p-z]-]", // trailing '-' is literal

999 "pq-",

1000 "or=",

1001

1002 "[\\s\|\\)\|:\|$\|\\>]", // from regex tests

1003 "s\|):$>",

1004 "abc",

1005

1006 "[\\uDC00cd]", // JB#2906: isolated trail at start

1007 "cd\\uDC00",

1008 "ab\\uD800\\U00010000",

1009

1010 "[ab\\uD800]", // JB#2906: isolated trail at start

1011 "ab\\uD800",

1012 "cd\\uDC00\\U00010000",

1013

1014 "[ab\\uD800cd]", // JB#2906: isolated lead in middle

1015 "abcd\\uD800",

1016 "ef\\uDC00\\U00010000",

1017

1018 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle

1019 "abcd\\uDC00",

1020 "ef\\uD800\\U00010000",

1021

1022 #if !UCONFIG_NO_NORMALIZATION

1023 "[:^lccc=0:]", // Lead canonical class

1024 "\\u0300\\u0301",

1025 "abcd\\u00c0\\u00c5",

1026

1027 "[:^tccc=0:]", // Trail canonical class

1028 "\\u0300\\u0301\\u00c0\\u00c5",

1029 "abcd",

1030

1031 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class

1032 "\\u0300\\u0301\\u00c0\\u00c5",

1033 "abcd",

1034

1035 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but end s with a base (none right now)

1036 "",

1037 "abcd\\u0300\\u0301\\u00c0\\u00c5",

1038

1039 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical clas s is zero, but both lead and trail are not

1040 "\\u0F73\\u0F75\\u0F81",

1041 "abcd\\u0300\\u0301\\u00c0\\u00c5",

1042 #endif /* !UCONFIG_NO_NORMALIZATION */

1043

1044 "[:Assigned:]",

1045 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",

1046 "\\u0888\\uFDD3\\uFFFE\\U00050005",

1047

1048 // Script_Extensions, new in Unicode 6.0

1049 "[:scx=Arab:]",

1050 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\ \uFDF3",

1051 "\\u061D\\uFDEF\\uFDFE",

1052

1053 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,

1054 // so scx-sc is missing U+FDF2.

1055 "[[:Script_Extensions=Arabic:]-[:Arab:]]",

1056 "\\u0640\\u064B\\u0650\\u0655",

1057 "\\uFDF2"

1058 };

1059

1060 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);

1061

1062 for (int32_t i=0; i<DATA_LEN; i+=3) {

1063 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeStri ng(DATA[i+1]),

1064 CharsToUnicodeString(DATA[i+2]));

1065 }

1066 }

1067

1068 /**

1069 * Test that Posix style character classes [:digit:], etc.

1070 * have the Unicode definitions from TR 18.

1071 */

1072 void UnicodeSetTest::TestPosixClasses() {

1073 {

1074 UErrorCode status = U_ZERO_ERROR;

1075 UnicodeSet s1("[:alpha:]", status);

1076 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);

1077 TEST_ASSERT_SUCCESS(status);

1078 TEST_ASSERT(s1==s2);

1079 }

1080 {

1081 UErrorCode status = U_ZERO_ERROR;

1082 UnicodeSet s1("[:lower:]", status);

1083 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);

1084 TEST_ASSERT_SUCCESS(status);

1085 TEST_ASSERT(s1==s2);

1086 }

1087 {

1088 UErrorCode status = U_ZERO_ERROR;

1089 UnicodeSet s1("[:upper:]", status);

1090 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);

1091 TEST_ASSERT_SUCCESS(status);

1092 TEST_ASSERT(s1==s2);

1093 }

1094 {

1095 UErrorCode status = U_ZERO_ERROR;

1096 UnicodeSet s1("[:punct:]", status);

1097 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);

1098 TEST_ASSERT_SUCCESS(status);

1099 TEST_ASSERT(s1==s2);

1100 }

1101 {

1102 UErrorCode status = U_ZERO_ERROR;

1103 UnicodeSet s1("[:digit:]", status);

1104 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);

1105 TEST_ASSERT_SUCCESS(status);

1106 TEST_ASSERT(s1==s2);

1107 }

1108 {

1109 UErrorCode status = U_ZERO_ERROR;

1110 UnicodeSet s1("[:xdigit:]", status);

1111 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]") , status);

1112 TEST_ASSERT_SUCCESS(status);

1113 TEST_ASSERT(s1==s2);

1114 }

1115 {

1116 UErrorCode status = U_ZERO_ERROR;

1117 UnicodeSet s1("[:alnum:]", status);

1118 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}] "), status);

1119 TEST_ASSERT_SUCCESS(status);

1120 TEST_ASSERT(s1==s2);

1121 }

1122 {

1123 UErrorCode status = U_ZERO_ERROR;

1124 UnicodeSet s1("[:space:]", status);

1125 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);

1126 TEST_ASSERT_SUCCESS(status);

1127 TEST_ASSERT(s1==s2);

1128 }

1129 {

1130 UErrorCode status = U_ZERO_ERROR;

1131 UnicodeSet s1("[:blank:]", status);

1132 TEST_ASSERT_SUCCESS(status);

1133 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u 000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),

1134 status);

1135 TEST_ASSERT_SUCCESS(status);

1136 TEST_ASSERT(s1==s2);

1137 }

1138 {

1139 UErrorCode status = U_ZERO_ERROR;

1140 UnicodeSet s1("[:cntrl:]", status);

1141 TEST_ASSERT_SUCCESS(status);

1142 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);

1143 TEST_ASSERT_SUCCESS(status);

1144 TEST_ASSERT(s1==s2);

1145 }

1146 {

1147 UErrorCode status = U_ZERO_ERROR;

1148 UnicodeSet s1("[:graph:]", status);

1149 TEST_ASSERT_SUCCESS(status);

1150 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Su rrogate}\\p{Unassigned}]"), status);

1151 TEST_ASSERT_SUCCESS(status);

1152 TEST_ASSERT(s1==s2);

1153 }

1154 {

1155 UErrorCode status = U_ZERO_ERROR;

1156 UnicodeSet s1("[:print:]", status);

1157 TEST_ASSERT_SUCCESS(status);

1158 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]] ") ,status);

1159 TEST_ASSERT_SUCCESS(status);

1160 TEST_ASSERT(s1==s2);

1161 }

1162 }

1163 /**

1164 * Test cloning of UnicodeSet. For C++, we test the copy constructor.

1165 */

1166 void UnicodeSetTest::TestClone() {

1167 UErrorCode ec = U_ZERO_ERROR;

1168 UnicodeSet s("[abcxyz]", ec);

1169 UnicodeSet t(s);

1170 expectContainment(t, "abc", "def");

1171 }

1172

1173 /**

1174 * Test the indexOf() and charAt() methods.

1175 */

1176 void UnicodeSetTest::TestIndexOf() {

1177 UErrorCode ec = U_ZERO_ERROR;

1178 UnicodeSet set("[a-cx-y3578]", ec);

1179 if (U_FAILURE(ec)) {

1180 errln("FAIL: UnicodeSet constructor");

1181 return;

1182 }

1183 for (int32_t i=0; i<set.size(); ++i) {

1184 UChar32 c = set.charAt(i);

1185 if (set.indexOf(c) != i) {

1186 errln("FAIL: charAt(%d) = %X => indexOf() => %d",

1187 i, c, set.indexOf(c));

1188 }

1189 }

1190 UChar32 c = set.charAt(set.size());

1191 if (c != -1) {

1192 errln("FAIL: charAt(<out of range>) = %X", c);

1193 }

1194 int32_t j = set.indexOf((UChar32)0x71/'q'/);

1195 if (j != -1) {

1196 errln((UnicodeString)"FAIL: indexOf('q') = " + j);

1197 }

1198 }

1199

1200 /**

1201 * Test closure API.

1202 */

1203 void UnicodeSetTest::TestCloseOver() {

1204 UErrorCode ec = U_ZERO_ERROR;

1205

1206 char CASE[] = {(char)USET_CASE_INSENSITIVE};

1207 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};

1208 const char* DATA[] = {

1209 // selector, input, output

1210 CASE,

1211 "[aq\\u00DF{Bc}{bC}{Fi}]",

1212 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETT ER SHARP S is new in Unicode 5.1

1213

1214 CASE,

1215 "[\\u01F1]", // 'DZ'

1216 "[\\u01F1\\u01F2\\u01F3]",

1217

1218 CASE,

1219 "[\\u1FB4]",

1220 "[\\u1FB4{\\u03AC\\u03B9}]",

1221

1222 CASE,

1223 "[{F\\uFB01}]",

1224 "[\\uFB03{ffi}]",

1225

1226 CASE, // make sure binary search finds limits

1227 "[a\\uFF3A]",

1228 "[aA\\uFF3A\\uFF5A]",

1229

1230 CASE,

1231 "[a-z]","[A-Za-z\\u017F\\u212A]",

1232 CASE,

1233 "[abc]","[A-Ca-c]",

1234 CASE,

1235 "[ABC]","[A-Ca-c]",

1236

1237 CASE, "[i]", "[iI]",

1238

1239 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I

1240 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot

1241

1242 CASE, "[\\u0131]", "[\\u0131]", // dotless i

1243

1244 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",

1245

1246 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas

1247

1248 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas

1249

1250 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",

1251

1252 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",

1253

1254 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",

1255 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",

1256

1257 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",

1258

1259 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted ta ble

1260

1261 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sort ed table

1262

1263 #if !UCONFIG_NO_FILE_IO

1264 CASE_MAPPINGS,

1265 "[aq\\u00DF{Bc}{bC}{Fi}]",

1266 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",

1267 #endif

1268

1269 CASE_MAPPINGS,

1270 "[\\u01F1]", // 'DZ'

1271 "[\\u01F1\\u01F2\\u01F3]",

1272

1273 CASE_MAPPINGS,

1274 "[a-z]",

1275 "[A-Za-z]",

1276

1277 NULL

1278 };

1279

1280 UnicodeSet s;

1281 UnicodeSet t;

1282 UnicodeString buf;

1283 for (int32_t i=0; DATA[i]!=NULL; i+=3) {

1284 int32_t selector = DATA[i][0];

1285 UnicodeString pat(DATA[i+1], -1, US_INV);

1286 UnicodeString exp(DATA[i+2], -1, US_INV);

1287 s.applyPattern(pat, ec);

1288 s.closeOver(selector);

1289 t.applyPattern(exp, ec);

1290 if (U_FAILURE(ec)) {

1291 errln("FAIL: applyPattern failed");

1292 continue;

1293 }

1294 if (s == t) {

1295 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);

1296 } else {

1297 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +

1298 s.toPattern(buf, TRUE) + ", expected " + exp);

1299 }

1300 }

1301

1302 #if 0

1303 /*

1304 * Unused test code.

1305 * This was used to compare the old implementation (using USET_CASE)

1306 * with the new one (using 0x100 temporarily)

1307 * while transitioning from hardcoded case closure tables in uniset.cpp

1308 * (moved to uniset_props.cpp) to building the data by gencase into ucase.ic u.

1309 * and using ucase.c functions for closure.

1310 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file

1311 *

1312 * Note: The old and new implementation never fully matched because

1313 * the old implementation turned out to not map U+0130 and U+0131 correctly

1314 * (dotted I and dotless i) and because the old implementation's data tables

1315 * were outdated compared to Unicode 4.0.1 at the time of the change to the

1316 * new implementation. (So sigmas and some other characters were not handled

1317 * according to the newer Unicode version.)

1318 */

1319 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;

1320 UnicodeSetIterator si(sens);

1321 UnicodeString str, buf2;

1322 const UnicodeString *pStr;

1323 UChar32 c;

1324 while(si.next()) {

1325 if(!si.isString()) {

1326 c=si.getCodepoint();

1327 s.clear();

1328 s.add(c);

1329

1330 str.setTo(c);

1331 str.foldCase();

1332 sens2.add(str);

1333

1334 t=s;

1335 s.closeOver(USET_CASE);

1336 t.closeOver(0x100);

1337 if(s!=t) {

1338 errln("FAIL: closeOver(U+%04x) differs: ", c);

1339 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.to Pattern(buf2, TRUE));

1340 }

1341 }

1342 }

1343 // remove all code points

1344 // should contain all full case folding mapping strings

1345 sens2.remove(0, 0x10ffff);

1346 si.reset(sens2);

1347 while(si.next()) {

1348 if(si.isString()) {

1349 pStr=&si.getString();

1350 s.clear();

1351 s.add(*pStr);

1352 t=s2=s;

1353 s.closeOver(USET_CASE);

1354 t.closeOver(0x100);

1355 if(s!=t) {

1356 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+ ") differs: ");

1357 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.to Pattern(buf2, TRUE));

1358 }

1359 }

1360 }

1361 #endif

1362

1363 // Test the pattern API

1364 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);

1365 if (U_FAILURE(ec)) {

1366 errln("FAIL: applyPattern failed");

1367 } else {

1368 expectContainment(s, "abcABC", "defDEF");

1369 }

1370 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);

1371 if (U_FAILURE(ec)) {

1372 errln("FAIL: constructor failed");

1373 } else {

1374 expectContainment(v, "defDEF", "abcABC");

1375 }

1376 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);

1377 if (U_FAILURE(ec)) {

1378 errln("FAIL: construct w/case mappings failed");

1379 } else {

1380 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A")) ;

1381 }

1382 }

1383

1384 void UnicodeSetTest::TestEscapePattern() {

1385 const char pattern[] =

1386 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFF D ]";

1387 const char exp[] =

1388 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]" ;

1389 // We test this with two passes; in the second pass we

1390 // pre-unescape the pattern. Since U+200E is Pattern_White_Space,

1391 // this fails -- which is what we expect.

1392 for (int32_t pass=1; pass<=2; ++pass) {

1393 UErrorCode ec = U_ZERO_ERROR;

1394 UnicodeString pat(pattern, -1, US_INV);

1395 if (pass==2) {

1396 pat = pat.unescape();

1397 }

1398 // Pattern is only good for pass 1

1399 UBool isPatternValid = (pass==1);

1400

1401 UnicodeSet set(pat, ec);

1402 if (U_SUCCESS(ec) != isPatternValid){

1403 errln((UnicodeString)"FAIL: applyPattern(" +

1404 escape(pat) + ") => " +

1405 u_errorName(ec));

1406 continue;

1407 }

1408 if (U_FAILURE(ec)) {

1409 continue;

1410 }

1411 if (set.contains((UChar)0x0644)){

1412 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");

1413 }

1414

1415 UnicodeString newpat;

1416 set.toPattern(newpat, TRUE);

1417 if (newpat == UnicodeString(exp, -1, US_INV)) {

1418 logln(escape(pat) + " => " + newpat);

1419 } else {

1420 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);

1421 }

1422

1423 for (int32_t i=0; i<set.getRangeCount(); ++i) {

1424 UnicodeString str("Range ");

1425 str.append((UChar)(0x30 + i))

1426 .append(": ")

1427 .append((UChar32)set.getRangeStart(i))

1428 .append(" - ")

1429 .append((UChar32)set.getRangeEnd(i));

1430 str = str + " (" + set.getRangeStart(i) + " - " +

1431 set.getRangeEnd(i) + ")";

1432 if (set.getRangeStart(i) < 0) {

1433 errln((UnicodeString)"FAIL: " + escape(str));

1434 } else {

1435 logln(escape(str));

1436 }

1437 }

1438 }

1439 }

1440

1441 void UnicodeSetTest::expectRange(const UnicodeString& label,

1442 const UnicodeSet& set,

1443 UChar32 start, UChar32 end) {

1444 UnicodeSet exp(start, end);

1445 UnicodeString pat;

1446 if (set == exp) {

1447 logln(label + " => " + set.toPattern(pat, TRUE));

1448 } else {

1449 UnicodeString xpat;

1450 errln((UnicodeString)"FAIL: " + label + " => " +

1451 set.toPattern(pat, TRUE) +

1452 ", expected " + exp.toPattern(xpat, TRUE));

1453 }

1454 }

1455

1456 void UnicodeSetTest::TestInvalidCodePoint() {

1457

1458 const UChar32 DATA[] = {

1459 // Test range Expected range

1460 0, 0x10FFFF, 0, 0x10FFFF,

1461 (UChar32)-1, 8, 0, 8,

1462 8, 0x110000, 8, 0x10FFFF

1463 };

1464 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);

1465

1466 UnicodeString pat;

1467 int32_t i;

1468

1469 for (i=0; i<DATA_LENGTH; i+=4) {

1470 UChar32 start = DATA[i];

1471 UChar32 end = DATA[i+1];

1472 UChar32 xstart = DATA[i+2];

1473 UChar32 xend = DATA[i+3];

1474

1475 // Try various API using the test code points

1476

1477 UnicodeSet set(start, end);

1478 expectRange((UnicodeString)"ct(" + start + "," + end + ")",

1479 set, xstart, xend);

1480

1481 set.clear();

1482 set.set(start, end);

1483 expectRange((UnicodeString)"set(" + start + "," + end + ")",

1484 set, xstart, xend);

1485

1486 UBool b = set.contains(start);

1487 b = set.contains(start, end);

1488 b = set.containsNone(start, end);

1489 b = set.containsSome(start, end);

1490 (void)b; // Suppress set but not used warning.

1491

1492 /int32_t index = set.indexOf(start);/

1493

1494 set.clear();

1495 set.add(start);

1496 set.add(start, end);

1497 expectRange((UnicodeString)"add(" + start + "," + end + ")",

1498 set, xstart, xend);

1499

1500 set.set(0, 0x10FFFF);

1501 set.retain(start, end);

1502 expectRange((UnicodeString)"retain(" + start + "," + end + ")",

1503 set, xstart, xend);

1504 set.retain(start);

1505

1506 set.set(0, 0x10FFFF);

1507 set.remove(start);

1508 set.remove(start, end);

1509 set.complement();

1510 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",

1511 set, xstart, xend);

1512

1513 set.set(0, 0x10FFFF);

1514 set.complement(start, end);

1515 set.complement();

1516 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",

1517 set, xstart, xend);

1518 set.complement(start);

1519 }

1520

1521 const UChar32 DATA2[] = {

1522 0,

1523 0x10FFFF,

1524 (UChar32)-1,

1525 0x110000

1526 };

1527 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);

1528

1529 for (i=0; i<DATA2_LENGTH; ++i) {

1530 UChar32 c = DATA2[i], end = 0x10FFFF;

1531 UBool valid = (c >= 0 && c <= 0x10FFFF);

1532

1533 UnicodeSet set(0, 0x10FFFF);

1534

1535 // For single-codepoint contains, invalid codepoints are NOT contained

1536 UBool b = set.contains(c);

1537 if (b == valid) {

1538 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +

1539 ") = " + b);

1540 } else {

1541 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +

1542 ") = " + b);

1543 }

1544

1545 // For codepoint range contains, containsNone, and containsSome,

1546 // invalid or empty (start > end) ranges have UNDEFINED behavior.

1547 b = set.contains(c, end);

1548 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +

1549 "," + end + ") = " + b);

1550

1551 b = set.containsNone(c, end);

1552 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +

1553 "," + end + ") = " + b);

1554

1555 b = set.containsSome(c, end);

1556 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +

1557 "," + end + ") = " + b);

1558

1559 int32_t index = set.indexOf(c);

1560 if ((index >= 0) == valid) {

1561 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +

1562 ") = " + index);

1563 } else {

1564 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +

1565 ") = " + index);

1566 }

1567 }

1568 }

1569

1570 // Used by TestSymbolTable

1571 class TokenSymbolTable : public SymbolTable {

1572 public:

1573 Hashtable contents;

1574

1575 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {

1576 contents.setValueDeleter(uprv_deleteUObject);

1577 }

1578

1579 ~TokenSymbolTable() {}

1580

1581 /**

1582 * (Non-SymbolTable API) Add the given variable and value to

1583 * the table. Variable should NOT contain leading '$'.

1584 */

1585 void add(const UnicodeString& var, const UnicodeString& value,

1586 UErrorCode& ec) {

1587 if (U_SUCCESS(ec)) {

1588 contents.put(var, new UnicodeString(value), ec);

1589 }

1590 }

1591

1592 /**

1593 * SymbolTable API

1594 */

1595 virtual const UnicodeString* lookup(const UnicodeString& s) const {

1596 return (const UnicodeString*) contents.get(s);

1597 }

1598

1599 /**

1600 * SymbolTable API

1601 */

1602 virtual const UnicodeFunctor* lookupMatcher(UChar32 /ch/) const {

1603 return NULL;

1604 }

1605

1606 /**

1607 * SymbolTable API

1608 */

1609 virtual UnicodeString parseReference(const UnicodeString& text,

1610 ParsePosition& pos, int32_t limit) cons t {

1611 int32_t start = pos.getIndex();

1612 int32_t i = start;

1613 UnicodeString result;

1614 while (i < limit) {

1615 UChar c = text.charAt(i);

1616 if ((i==start && !u_isIDStart(c)) \|\| !u_isIDPart(c)) {

1617 break;

1618 }

1619 ++i;

1620 }

1621 if (i == start) { // No valid name chars

1622 return result; // Indicate failure with empty string

1623 }

1624 pos.setIndex(i);

1625 text.extractBetween(start, i, result);

1626 return result;

1627 }

1628 };

1629

1630 void UnicodeSetTest::TestSymbolTable() {

1631 // Multiple test cases can be set up here. Each test case

1632 // is terminated by null:

1633 // var, value, var, value,..., input pat., exp. output pat., null

1634 const char* DATA[] = {

1635 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,

1636 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,

1637 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,

1638 NULL

1639 };

1640

1641 for (int32_t i=0; DATA[i]!=NULL; ++i) {

1642 UErrorCode ec = U_ZERO_ERROR;

1643 TokenSymbolTable sym(ec);

1644 if (U_FAILURE(ec)) {

1645 errln("FAIL: couldn't construct TokenSymbolTable");

1646 continue;

1647 }

1648

1649 // Set up variables

1650 while (DATA[i+2] != NULL) {

1651 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);

1652 if (U_FAILURE(ec)) {

1653 errln("FAIL: couldn't add to TokenSymbolTable");

1654 continue;

1655 }

1656 i += 2;

1657 }

1658

1659 // Input pattern and expected output pattern

1660 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = Unico deString(DATA[i+1], -1, US_INV);

1661 i += 2;

1662

1663 ParsePosition pos(0);

1664 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);

1665 if (U_FAILURE(ec)) {

1666 errln("FAIL: couldn't construct UnicodeSet");

1667 continue;

1668 }

1669

1670 // results

1671 if (pos.getIndex() != inpat.length()) {

1672 errln((UnicodeString)"Failed to read to end of string \""

1673 + inpat + "\": read to "

1674 + pos.getIndex() + ", length is "

1675 + inpat.length());

1676 }

1677

1678 UnicodeSet us2(exppat, ec);

1679 if (U_FAILURE(ec)) {

1680 errln("FAIL: couldn't construct expected UnicodeSet");

1681 continue;

1682 }

1683

1684 UnicodeString a, b;

1685 if (us != us2) {

1686 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +

1687 ", expected " + us2.toPattern(b, TRUE));

1688 } else {

1689 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));

1690 }

1691 }

1692 }

1693

1694 void UnicodeSetTest::TestSurrogate() {

1695 const char* DATA[] = {

1696 // These should all behave identically

1697 "[abc\\uD800\\uDC00]",

1698 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java

1699 "[abc\\U00010000]",

1700 0

1701 };

1702 for (int i=0; DATA[i] != 0; ++i) {

1703 UErrorCode ec = U_ZERO_ERROR;

1704 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));

1705 UnicodeString str = UnicodeString(DATA[i], -1, US_INV);

1706 UnicodeSet set(str, ec);

1707 if (U_FAILURE(ec)) {

1708 errln("FAIL: UnicodeSet constructor");

1709 continue;

1710 }

1711 expectContainment(set,

1712 CharsToUnicodeString("abc\\U00010000"),

1713 CharsToUnicodeString("\\uD800;\\uDC00")); // split apa rt surrogate-pair

1714 if (set.size() != 4) {

1715 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +

1716 set.size() + ", expected 4");

1717 }

1718

1719 {

1720 UErrorCode subErr = U_ZERO_ERROR;

1721 checkRoundTrip(set);

1722 checkSerializeRoundTrip(set, subErr);

1723 }

1724 }

1725 }

1726

1727 void UnicodeSetTest::TestExhaustive() {

1728 // exhaustive tests. Simulate UnicodeSets with integers.

1729 // That gives us very solid tests (except for large memory tests).

1730

1731 int32_t limit = 128;

1732

1733 UnicodeSet x, y, z, aa;

1734

1735 for (int32_t i = 0; i < limit; ++i) {

1736 bitsToSet(i, x);

1737 logln((UnicodeString)"Testing " + i + ", " + x);

1738 _testComplement(i, x, y);

1739

1740 UnicodeSet &toTest = bitsToSet(i, aa);

1741

1742 // AS LONG AS WE ARE HERE, check roundtrip

1743 checkRoundTrip(toTest);

1744 UErrorCode ec = U_ZERO_ERROR;

1745 checkSerializeRoundTrip(toTest, ec);

1746

1747 for (int32_t j = 0; j < limit; ++j) {

1748 _testAdd(i,j, x,y,z);

1749 _testXor(i,j, x,y,z);

1750 _testRetain(i,j, x,y,z);

1751 _testRemove(i,j, x,y,z);

1752 }

1753 }

1754 }

1755

1756 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {

1757 bitsToSet(a, x);

1758 z = x;

1759 z.complement();

1760 int32_t c = setToBits(z);

1761 if (c != (~a)) {

1762 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);

1763 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);

1764 }

1765 checkCanonicalRep(z, (UnicodeString)"complement " + a);

1766 }

1767

1768 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y , UnicodeSet& z) {

1769 bitsToSet(a, x);

1770 bitsToSet(b, y);

1771 z = x;

1772 z.addAll(y);

1773 int32_t c = setToBits(z);

1774 if (c != (a \| b)) {

1775 errln((UnicodeString)"FAILED: add: " + x + " \| " + y + " != " + z);

1776 errln((UnicodeString)"FAILED: add: " + a + " \| " + b + " != " + c);

1777 }

1778 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);

1779 }

1780

1781 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet & y, UnicodeSet& z) {

1782 bitsToSet(a, x);

1783 bitsToSet(b, y);

1784 z = x;

1785 z.retainAll(y);

1786 int32_t c = setToBits(z);

1787 if (c != (a & b)) {

1788 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);

1789 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);

1790 }

1791 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);

1792 }

1793

1794 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet & y, UnicodeSet& z) {

1795 bitsToSet(a, x);

1796 bitsToSet(b, y);

1797 z = x;

1798 z.removeAll(y);

1799 int32_t c = setToBits(z);

1800 if (c != (a &~ b)) {

1801 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);

1802 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);

1803 }

1804 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);

1805 }

1806

1807 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y , UnicodeSet& z) {

1808 bitsToSet(a, x);

1809 bitsToSet(b, y);

1810 z = x;

1811 z.complementAll(y);

1812 int32_t c = setToBits(z);

1813 if (c != (a ^ b)) {

1814 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z );

1815 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c );

1816 }

1817 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);

1818 }

1819

1820 /**

1821 * Check that ranges are monotonically increasing and non-

1822 * overlapping.

1823 */

1824 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeStrin g& msg) {

1825 int32_t n = set.getRangeCount();

1826 if (n < 0) {

1827 errln((UnicodeString)"FAIL result of " + msg +

1828 ": range count should be >= 0 but is " +

1829 n /+ " for " + set.toPattern())/);

1830 return;

1831 }

1832 UChar32 last = 0;

1833 for (int32_t i=0; i<n; ++i) {

1834 UChar32 start = set.getRangeStart(i);

1835 UChar32 end = set.getRangeEnd(i);

1836 if (start > end) {

1837 errln((UnicodeString)"FAIL result of " + msg +

1838 ": range " + (i+1) +

1839 " start > end: " + (int)start + ", " + (int)end +

1840 " for " + set);

1841 }

1842 if (i > 0 && start <= last) {

1843 errln((UnicodeString)"FAIL result of " + msg +

1844 ": range " + (i+1) +

1845 " overlaps previous range: " + (int)start + ", " + (int)end +

1846 " for " + set);

1847 }

1848 last = end;

1849 }

1850 }

1851

1852 /**

1853 * Convert a bitmask to a UnicodeSet.

1854 */

1855 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {

1856 result.clear();

1857 for (UChar32 i = 0; i < 32; ++i) {

1858 if ((a & (1<<i)) != 0) {

1859 result.add(i);

1860 }

1861 }

1862 return result;

1863 }

1864

1865 /**

1866 * Convert a UnicodeSet to a bitmask. Only the characters

1867 * U+0000 to U+0020 are represented in the bitmask.

1868 */

1869 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {

1870 int32_t result = 0;

1871 for (int32_t i = 0; i < 32; ++i) {

1872 if (x.contains((UChar32)i)) {

1873 result \|= (1<<i);

1874 }

1875 }

1876 return result;

1877 }

1878

1879 /**

1880 * Return the representation of an inversion list based UnicodeSet

1881 * as a pairs list. Ranges are listed in ascending Unicode order.

1882 * For example, the set [a-zA-M3] is represented as "33AMaz".

1883 */

1884 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {

1885 UnicodeString pairs;

1886 for (int32_t i=0; i<set.getRangeCount(); ++i) {

1887 UChar32 start = set.getRangeStart(i);

1888 UChar32 end = set.getRangeEnd(i);

1889 if (end > 0xFFFF) {

1890 end = 0xFFFF;

1891 i = set.getRangeCount(); // Should be unnecessary

1892 }

1893 pairs.append((UChar)start).append((UChar)end);

1894 }

1895 return pairs;

1896 }

1897

1898 /**

1899 * Basic consistency check for a few items.

1900 * That the iterator works, and that we can create a pattern and

1901 * get the same thing back

1902 */

1903 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {

1904 {

1905 UnicodeSet t(s);

1906 checkEqual(s, t, "copy ct");

1907 }

1908

1909 {

1910 UnicodeSet t(0xabcd, 0xdef0); // dummy contents should be overwritten

1911 t = s;

1912 checkEqual(s, t, "operator=");

1913 }

1914

1915 {

1916 UnicodeSet t;

1917 copyWithIterator(t, s, FALSE);

1918 checkEqual(s, t, "iterator roundtrip");

1919 }

1920

1921 {

1922 UnicodeSet t;

1923 copyWithIterator(t, s, TRUE); // try range

1924 checkEqual(s, t, "iterator roundtrip");

1925 }

1926

1927 {

1928 UnicodeSet t;

1929 UnicodeString pat;

1930 UErrorCode ec = U_ZERO_ERROR;

1931 s.toPattern(pat, FALSE);

1932 t.applyPattern(pat, ec);

1933 if (U_FAILURE(ec)) {

1934 errln("FAIL: toPattern(escapeUnprintable=FALSE), applyPattern - %s", u_errorName(ec));

1935 return;

1936 } else {

1937 checkEqual(s, t, "toPattern(false)");

1938 }

1939 }

1940

1941 {

1942 UnicodeSet t;

1943 UnicodeString pat;

1944 UErrorCode ec = U_ZERO_ERROR;

1945 s.toPattern(pat, TRUE);

1946 t.applyPattern(pat, ec);

1947 if (U_FAILURE(ec)) {

1948 errln("FAIL: toPattern(escapeUnprintable=TRUE), applyPattern - %s", u_errorName(ec));

1949 return;

1950 } else {

1951 checkEqual(s, t, "toPattern(true)");

1952 }

1953 }

1954 }

1955

1956 void UnicodeSetTest::checkSerializeRoundTrip(const UnicodeSet& t, UErrorCode &st atus) {

1957 if(U_FAILURE(status)) return;

1958 int32_t len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapac ity(), status);

1959 if(status == U_BUFFER_OVERFLOW_ERROR) {

1960 status = U_ZERO_ERROR;

1961 serializeBuffer.resize(len);

1962 len = t.serialize(serializeBuffer.getAlias(), serializeBuffer.getCapacity(), status);

1963 // let 2nd error stand

1964 }

1965 if(U_FAILURE(status)) {

1966 errln("checkSerializeRoundTrip: error %s serializing buffer\n", u_errorName( status));

1967 return;

1968 }

1969 UnicodeSet deserialized(serializeBuffer.getAlias(), len, UnicodeSet::kSerializ ed, status);

1970 if(U_FAILURE(status)) {

1971 errln("checkSerializeRoundTrip: error %s deserializing buffer: buf %p len %d , original %d\n", u_errorName(status), serializeBuffer.getAlias(), len, t.getRan geCount());

1972 return;

1973 }

1974

1975 checkEqual(t, deserialized, "Set was unequal when deserialized");

1976 }

1977

1978 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {

1979 t.clear();

1980 UnicodeSetIterator it(s);

1981 if (withRange) {

1982 while (it.nextRange()) {

1983 if (it.isString()) {

1984 t.add(it.getString());

1985 } else {

1986 t.add(it.getCodepoint(), it.getCodepointEnd());

1987 }

1988 }

1989 } else {

1990 while (it.next()) {

1991 if (it.isString()) {

1992 t.add(it.getString());

1993 } else {

1994 t.add(it.getCodepoint());

1995 }

1996 }

1997 }

1998 }

1999

2000 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {

2001 assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t. getRangeCount());

2002 assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());

2003 UnicodeString source; s.toPattern(source, TRUE);

2004 UnicodeString result; t.toPattern(result, TRUE);

2005 if (s != t) {

2006 errln((UnicodeString)"FAIL: " + message

2007 + "; source = " + source

2008 + "; result = " + result

2009 );

2010 return FALSE;

2011 } else {

2012 logln((UnicodeString)"Ok: " + message

2013 + "; source = " + source

2014 + "; result = " + result

2015 );

2016 }

2017 return TRUE;

2018 }

2019

2020 void

2021 UnicodeSetTest::expectContainment(const UnicodeString& pat,

2022 const UnicodeString& charsIn,

2023 const UnicodeString& charsOut) {

2024 UErrorCode ec = U_ZERO_ERROR;

2025 UnicodeSet set(pat, ec);

2026 if (U_FAILURE(ec)) {

2027 dataerrln((UnicodeString)"FAIL: pattern \"" +

2028 pat + "\" => " + u_errorName(ec));

2029 return;

2030 }

2031 expectContainment(set, pat, charsIn, charsOut);

2032 }

2033

2034 void

2035 UnicodeSetTest::expectContainment(const UnicodeSet& set,

2036 const UnicodeString& charsIn,

2037 const UnicodeString& charsOut) {

2038 UnicodeString pat;

2039 set.toPattern(pat);

2040 expectContainment(set, pat, charsIn, charsOut);

2041 }

2042

2043 void

2044 UnicodeSetTest::expectContainment(const UnicodeSet& set,

2045 const UnicodeString& setName,

2046 const UnicodeString& charsIn,

2047 const UnicodeString& charsOut) {

2048 UnicodeString bad;

2049 UChar32 c;

2050 int32_t i;

2051

2052 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {

2053 c = charsIn.char32At(i);

2054 if (!set.contains(c)) {

2055 bad.append(c);

2056 }

2057 }

2058 if (bad.length() > 0) {

2059 errln((UnicodeString)"Fail: set " + setName + " does not contain " + pre ttify(bad) +

2060 ", expected containment of " + prettify(charsIn));

2061 } else {

2062 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(char sIn));

2063 }

2064

2065 bad.truncate(0);

2066 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {

2067 c = charsOut.char32At(i);

2068 if (set.contains(c)) {

2069 bad.append(c);

2070 }

2071 }

2072 if (bad.length() > 0) {

2073 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(ba d) +

2074 ", expected non-containment of " + prettify(charsOut));

2075 } else {

2076 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prett ify(charsOut));

2077 }

2078 }

2079

2080 void

2081 UnicodeSetTest::expectPattern(UnicodeSet& set,

2082 const UnicodeString& pattern,

2083 const UnicodeString& expectedPairs){

2084 UErrorCode status = U_ZERO_ERROR;

2085 set.applyPattern(pattern, status);

2086 if (U_FAILURE(status)) {

2087 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +

2088 "\") failed");

2089 return;

2090 } else {

2091 if (getPairs(set) != expectedPairs ) {

2092 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +

2093 "\") => pairs \"" +

2094 escape(getPairs(set)) + "\", expected \"" +

2095 escape(expectedPairs) + "\"");

2096 } else {

2097 logln(UnicodeString("Ok: applyPattern(\"") + pattern +

2098 "\") => pairs \"" +

2099 escape(getPairs(set)) + "\"");

2100 }

2101 }

2102 // the result of calling set.toPattern(), which is the string representation of

2103 // this set(set), is passed to a UnicodeSet constructor, and tested that it

2104 // will produce another set that is equal to this one.

2105 UnicodeString temppattern;

2106 set.toPattern(temppattern);

2107 UnicodeSet *tempset=new UnicodeSet(temppattern, status);

2108 if (U_FAILURE(status)) {

2109 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));

2110 return;

2111 }

2112 if(tempset != set \|\| getPairs(tempset) != getPairs(set)){

2113 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +

2114 escape(getPairs(set)) + "\""));

2115 } else{

2116 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));

2117 }

2118

2119 delete tempset;

2120

2121 }

2122

2123 void

2124 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expected Pairs) {

2125 if (getPairs(set) != expectedPairs) {

2126 errln(UnicodeString("FAIL: Expected pair list \"") +

2127 escape(expectedPairs) + "\", got \"" +

2128 escape(getPairs(set)) + "\"");

2129 }

2130 }

2131

2132 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,

2133 const UnicodeString& expPat,

2134 const char** expStrings) {

2135 UnicodeString pat;

2136 set.toPattern(pat, TRUE);

2137 if (pat == expPat) {

2138 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");

2139 } else {

2140 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");

2141 return;

2142 }

2143 if (expStrings == NULL) {

2144 return;

2145 }

2146 UBool in = TRUE;

2147 for (int32_t i=0; expStrings[i] != NULL; ++i) {

2148 if (expStrings[i] == NOT) { // sic; pointer comparison

2149 in = FALSE;

2150 continue;

2151 }

2152 UnicodeString s = CharsToUnicodeString(expStrings[i]);

2153 UBool contained = set.contains(s);

2154 if (contained == in) {

2155 logln((UnicodeString)"Ok: " + expPat +

2156 (contained ? " contains {" : " does not contain {") +

2157 escape(expStrings[i]) + "}");

2158 } else {

2159 errln((UnicodeString)"FAIL: " + expPat +

2160 (contained ? " contains {" : " does not contain {") +

2161 escape(expStrings[i]) + "}");

2162 }

2163 }

2164 }

2165

2166 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }

2167

2168 void

2169 UnicodeSetTest::doAssert(UBool condition, const char *message)

2170 {

2171 if (!condition) {

2172 errln(UnicodeString("ERROR : ") + message);

2173 }

2174 }

2175

2176 UnicodeString

2177 UnicodeSetTest::escape(const UnicodeString& s) {

2178 UnicodeString buf;

2179 for (int32_t i=0; i<s.length(); )

2180 {

2181 UChar32 c = s.char32At(i);

2182 if (0x0020 <= c && c <= 0x007F) {

2183 buf += c;

2184 } else {

2185 if (c <= 0xFFFF) {

2186 buf += (UChar)0x5c; buf += (UChar)0x75;

2187 } else {

2188 buf += (UChar)0x5c; buf += (UChar)0x55;

2189 buf += toHexString((c & 0xF0000000) >> 28);

2190 buf += toHexString((c & 0x0F000000) >> 24);

2191 buf += toHexString((c & 0x00F00000) >> 20);

2192 buf += toHexString((c & 0x000F0000) >> 16);

2193 }

2194 buf += toHexString((c & 0xF000) >> 12);

2195 buf += toHexString((c & 0x0F00) >> 8);

2196 buf += toHexString((c & 0x00F0) >> 4);

2197 buf += toHexString(c & 0x000F);

2198 }

2199 i += U16_LENGTH(c);

2200 }

2201 return buf;

2202 }

2203

2204 void UnicodeSetTest::TestFreezable() {

2205 UErrorCode errorCode=U_ZERO_ERROR;

2206 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);

2207 UnicodeSet idSet(idPattern, errorCode);

2208 if(U_FAILURE(errorCode)) {

2209 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_e rrorName(errorCode));

2210 return;

2211 }

2212

2213 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);

2214 UnicodeSet wsSet(wsPattern, errorCode);

2215 if(U_FAILURE(errorCode)) {

2216 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_e rrorName(errorCode));

2217 return;

2218 }

2219

2220 idSet.add(idPattern);

2221 UnicodeSet frozen(idSet);

2222 frozen.freeze();

2223

2224 if(idSet.isFrozen() \|\| !frozen.isFrozen()) {

2225 errln("FAIL: isFrozen() is wrong");

2226 }

2227 if(frozen!=idSet \|\| !(frozen==idSet)) {

2228 errln("FAIL: a copy-constructed frozen set differs from its original");

2229 }

2230

2231 frozen=wsSet;

2232 if(frozen!=idSet \|\| !(frozen==idSet)) {

2233 errln("FAIL: a frozen set was modified by operator=");

2234 }

2235

2236 UnicodeSet frozen2(frozen);

2237 if(frozen2!=frozen \|\| frozen2!=idSet) {

2238 errln("FAIL: a copied frozen set differs from its frozen original");

2239 }

2240 if(!frozen2.isFrozen()) {

2241 errln("FAIL: copy-constructing a frozen set results in a thawed one");

2242 }

2243 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.

2244 if(frozen3.contains(0, 4) \|\| !frozen3.contains(5, 55) \|\| frozen3.contains(56 , 0x10ffff)) {

2245 errln("FAIL: UnicodeSet(5, 55) failed");

2246 }

2247 frozen3=frozen;

2248 if(!frozen3.isFrozen()) {

2249 errln("FAIL: copying a frozen set results in a thawed one");

2250 }

2251

2252 UnicodeSet cloned=(UnicodeSet )frozen.clone();

2253 if(!cloned->isFrozen() \|\| *cloned!=frozen \|\| cloned->containsSome(0xd802, 0x d805)) {

2254 errln("FAIL: clone() failed");

2255 }

2256 cloned->add(0xd802, 0xd805);

2257 if(cloned->containsSome(0xd802, 0xd805)) {

2258 errln("FAIL: unable to modify clone");

2259 }

2260 delete cloned;

2261

2262 UnicodeSet thawed=(UnicodeSet )frozen.cloneAsThawed();

2263 if(thawed->isFrozen() \|\| *thawed!=frozen \|\| thawed->containsSome(0xd802, 0xd 805)) {

2264 errln("FAIL: cloneAsThawed() failed");

2265 }

2266 thawed->add(0xd802, 0xd805);

2267 if(!thawed->contains(0xd802, 0xd805)) {

2268 errln("FAIL: unable to modify thawed clone");

2269 }

2270 delete thawed;

2271

2272 frozen.set(5, 55);

2273 if(frozen!=idSet \|\| !(frozen==idSet)) {

2274 errln("FAIL: UnicodeSet::set() modified a frozen set");

2275 }

2276

2277 frozen.clear();

2278 if(frozen!=idSet \|\| !(frozen==idSet)) {

2279 errln("FAIL: UnicodeSet::clear() modified a frozen set");

2280 }

2281

2282 frozen.closeOver(USET_CASE_INSENSITIVE);

2283 if(frozen!=idSet \|\| !(frozen==idSet)) {

2284 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");

2285 }

2286

2287 frozen.compact();

2288 if(frozen!=idSet \|\| !(frozen==idSet)) {

2289 errln("FAIL: UnicodeSet::compact() modified a frozen set");

2290 }

2291

2292 ParsePosition pos;

2293 frozen.

2294 applyPattern(wsPattern, errorCode).

2295 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).

2296 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).

2297 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).

2298 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), e rrorCode);

2299 if(frozen!=idSet \|\| !(frozen==idSet)) {

2300 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");

2301 }

2302

2303 frozen.

2304 add(0xd800).

2305 add(0xd802, 0xd805).

2306 add(wsPattern).

2307 addAll(idPattern).

2308 addAll(wsSet);

2309 if(frozen!=idSet \|\| !(frozen==idSet)) {

2310 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");

2311 }

2312

2313 frozen.

2314 retain(0x62).

2315 retain(0x64, 0x69).

2316 retainAll(wsPattern).

2317 retainAll(wsSet);

2318 if(frozen!=idSet \|\| !(frozen==idSet)) {

2319 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");

2320 }

2321

2322 frozen.

2323 remove(0x62).

2324 remove(0x64, 0x69).

2325 remove(idPattern).

2326 removeAll(idPattern).

2327 removeAll(idSet);

2328 if(frozen!=idSet \|\| !(frozen==idSet)) {

2329 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");

2330 }

2331

2332 frozen.

2333 complement().

2334 complement(0x62).

2335 complement(0x64, 0x69).

2336 complement(idPattern).

2337 complementAll(idPattern).

2338 complementAll(idSet);

2339 if(frozen!=idSet \|\| !(frozen==idSet)) {

2340 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");

2341 }

2342 }

2343

2344 // Test span() etc. -------------------------------------------------------- ***

2345

2346 // Append the UTF-8 version of the string to t and return the appended UTF-8 len gth.

2347 static int32_t

2348 appendUTF8(const UChar s, int32_t length, char t, int32_t capacity) {

2349 UErrorCode errorCode=U_ZERO_ERROR;

2350 int32_t length8=0;

2351 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);

2352 if(U_SUCCESS(errorCode)) {

2353 return length8;

2354 } else {

2355 // The string contains an unpaired surrogate.

2356 // Ignore this string.

2357 return 0;

2358 }

2359 }

2360

2361 class UnicodeSetWithStringsIterator;

2362

2363 // Make the strings in a UnicodeSet easily accessible.

2364 class UnicodeSetWithStrings {

2365 public:

2366 UnicodeSetWithStrings(const UnicodeSet &normalSet) :

2367 set(normalSet), stringsLength(0), hasSurrogates(FALSE) {

2368 int32_t size=set.size();

2369 if(size>0 && set.charAt(size-1)<0) {

2370 // If a set's last element is not a code point, then it must contain strings.

2371 // Iterate over the set, skip all code point ranges, and cache the s trings.

2372 // Convert them to UTF-8 for spanUTF8().

2373 UnicodeSetIterator iter(set);

2374 const UnicodeString *s;

2375 char *s8=utf8;

2376 int32_t length8, utf8Count=0;

2377 while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {

2378 if(iter.isString()) {

2379 // Store the pointer to the set's string element

2380 // which we happen to know is a stable pointer.

2381 strings[stringsLength]=s=&iter.getString();

2382 utf8Count+=

2383 utf8Lengths[stringsLength]=length8=

2384 appendUTF8(s->getBuffer(), s->length(),

2385 s8, (int32_t)(sizeof(utf8)-utf8Count));

2386 if(length8==0) {

2387 hasSurrogates=TRUE; // Contains unpaired surrogates.

2388 }

2389 s8+=length8;

2390 ++stringsLength;

2391 }

2392 }

2393 }

2394 }

2395

2396 const UnicodeSet &getSet() const {

2397 return set;

2398 }

2399

2400 UBool hasStrings() const {

2401 return (UBool)(stringsLength>0);

2402 }

2403

2404 UBool hasStringsWithSurrogates() const {

2405 return hasSurrogates;

2406 }

2407

2408 private:

2409 friend class UnicodeSetWithStringsIterator;

2410

2411 const UnicodeSet &set;

2412

2413 const UnicodeString *strings[20];

2414 int32_t stringsLength;

2415 UBool hasSurrogates;

2416

2417 char utf8[1024];

2418 int32_t utf8Lengths[20];

2419 };

2420

2421 class UnicodeSetWithStringsIterator {

2422 public:

2423 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :

2424 fSet(set), nextStringIndex(0), nextUTF8Start(0) {

2425 }

2426

2427 void reset() {

2428 nextStringIndex=nextUTF8Start=0;

2429 }

2430

2431 const UnicodeString *nextString() {

2432 if(nextStringIndex<fSet.stringsLength) {

2433 return fSet.strings[nextStringIndex++];

2434 } else {

2435 return NULL;

2436 }

2437 }

2438

2439 // Do not mix with calls to nextString().

2440 const char *nextUTF8(int32_t &length) {

2441 if(nextStringIndex<fSet.stringsLength) {

2442 const char *s8=fSet.utf8+nextUTF8Start;

2443 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];

2444 return s8;

2445 } else {

2446 length=0;

2447 return NULL;

2448 }

2449 }

2450

2451 private:

2452 const UnicodeSetWithStrings &fSet;

2453 int32_t nextStringIndex;

2454 int32_t nextUTF8Start;

2455 };

2456

2457 // Compare 16-bit Unicode strings (which may be malformed UTF-16)

2458 // at code point boundaries.

2459 // That is, each edge of a match must not be in the middle of a surrogate pair.

2460 static inline UBool

2461 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString & t) {

2462 s+=start;

2463 limit-=start;

2464 int32_t length=t.length();

2465 return 0==t.compare(s, length) &&

2466 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&

2467 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]) );

2468 }

2469

2470 // Implement span() with contains() for comparison.

2471 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar * s, int32_t length,

2472 USetSpanCondition spanCondition) {

2473 const UnicodeSet &realSet(set.getSet());

2474 if(!set.hasStrings()) {

2475 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {

2476 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.

2477 }

2478

2479 UChar32 c;

2480 int32_t start=0, prev;

2481 while((prev=start)<length) {

2482 U16_NEXT(s, start, length, c);

2483 if(realSet.contains(c)!=spanCondition) {

2484 break;

2485 }

2486 }

2487 return prev;

2488 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {

2489 UnicodeSetWithStringsIterator iter(set);

2490 UChar32 c;

2491 int32_t start, next;

2492 for(start=next=0; start<length;) {

2493 U16_NEXT(s, next, length, c);

2494 if(realSet.contains(c)) {

2495 break;

2496 }

2497 const UnicodeString *str;

2498 iter.reset();

2499 while((str=iter.nextString())!=NULL) {

2500 if(str->length()<=(length-start) && matches16CPB(s, start, lengt h, *str)) {

2501 // spanNeedsStrings=TRUE;

2502 return start;

2503 }

2504 }

2505 start=next;

2506 }

2507 return start;

2508 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {

2509 UnicodeSetWithStringsIterator iter(set);

2510 UChar32 c;

2511 int32_t start, next, maxSpanLimit=0;

2512 for(start=next=0; start<length;) {

2513 U16_NEXT(s, next, length, c);

2514 if(!realSet.contains(c)) {

2515 next=start; // Do not span this single, not-contained code poin t.

2516 }

2517 const UnicodeString *str;

2518 iter.reset();

2519 while((str=iter.nextString())!=NULL) {

2520 if(str->length()<=(length-start) && matches16CPB(s, start, lengt h, *str)) {

2521 // spanNeedsStrings=TRUE;

2522 int32_t matchLimit=start+str->length();

2523 if(matchLimit==length) {

2524 return length;

2525 }

2526 if(spanCondition==USET_SPAN_CONTAINED) {

2527 // Iterate for the shortest match at each position.

2528 // Recurse for each but the shortest match.

2529 if(next==start) {

2530 next=matchLimit; // First match from start.

2531 } else {

2532 if(matchLimit<next) {

2533 // Remember shortest match from start for iterat ion.

2534 int32_t temp=next;

2535 next=matchLimit;

2536 matchLimit=temp;

2537 }

2538 // Recurse for non-shortest match from start.

2539 int32_t spanLength=containsSpanUTF16(set, s+matchLim it, length-matchLimit,

2540 USET_SPAN_CONTA INED);

2541 if((matchLimit+spanLength)>maxSpanLimit) {

2542 maxSpanLimit=matchLimit+spanLength;

2543 if(maxSpanLimit==length) {

2544 return length;

2545 }

2546 }

2547 }

2548 } else /* spanCondition==USET_SPAN_SIMPLE */ {

2549 if(matchLimit>next) {

2550 // Remember longest match from start.

2551 next=matchLimit;

2552 }

2553 }

2554 }

2555 }

2556 if(next==start) {

2557 break; // No match from start.

2558 }

2559 start=next;

2560 }

2561 if(start>maxSpanLimit) {

2562 return start;

2563 } else {

2564 return maxSpanLimit;

2565 }

2566 }

2567 }

2568

2569 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UCh ar *s, int32_t length,

2570 USetSpanCondition spanCondition) {

2571 if(length==0) {

2572 return 0;

2573 }

2574 const UnicodeSet &realSet(set.getSet());

2575 if(!set.hasStrings()) {

2576 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {

2577 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.

2578 }

2579

2580 UChar32 c;

2581 int32_t prev=length;

2582 do {

2583 U16_PREV(s, 0, length, c);

2584 if(realSet.contains(c)!=spanCondition) {

2585 break;

2586 }

2587 } while((prev=length)>0);

2588 return prev;

2589 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {

2590 UnicodeSetWithStringsIterator iter(set);

2591 UChar32 c;

2592 int32_t prev=length, length0=length;

2593 do {

2594 U16_PREV(s, 0, length, c);

2595 if(realSet.contains(c)) {

2596 break;

2597 }

2598 const UnicodeString *str;

2599 iter.reset();

2600 while((str=iter.nextString())!=NULL) {

2601 if(str->length()<=prev && matches16CPB(s, prev-str->length(), le ngth0, *str)) {

2602 // spanNeedsStrings=TRUE;

2603 return prev;

2604 }

2605 }

2606 } while((prev=length)>0);

2607 return prev;

2608 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {

2609 UnicodeSetWithStringsIterator iter(set);

2610 UChar32 c;

2611 int32_t prev=length, minSpanStart=length, length0=length;

2612 do {

2613 U16_PREV(s, 0, length, c);

2614 if(!realSet.contains(c)) {

2615 length=prev; // Do not span this single, not-contained code poi nt.

2616 }

2617 const UnicodeString *str;

2618 iter.reset();

2619 while((str=iter.nextString())!=NULL) {

2620 if(str->length()<=prev && matches16CPB(s, prev-str->length(), le ngth0, *str)) {

2621 // spanNeedsStrings=TRUE;

2622 int32_t matchStart=prev-str->length();

2623 if(matchStart==0) {

2624 return 0;

2625 }

2626 if(spanCondition==USET_SPAN_CONTAINED) {

2627 // Iterate for the shortest match at each position.

2628 // Recurse for each but the shortest match.

2629 if(length==prev) {

2630 length=matchStart; // First match from prev.

2631 } else {

2632 if(matchStart>length) {

2633 // Remember shortest match from prev for iterati on.

2634 int32_t temp=length;

2635 length=matchStart;

2636 matchStart=temp;

2637 }

2638 // Recurse for non-shortest match from prev.

2639 int32_t spanStart=containsSpanBackUTF16(set, s, matc hStart,

2640 USET_SPAN_CO NTAINED);

2641 if(spanStart<minSpanStart) {

2642 minSpanStart=spanStart;

2643 if(minSpanStart==0) {

2644 return 0;

2645 }

2646 }

2647 }

2648 } else /* spanCondition==USET_SPAN_SIMPLE */ {

2649 if(matchStart<length) {

2650 // Remember longest match from prev.

2651 length=matchStart;

2652 }

2653 }

2654 }

2655 }

2656 if(length==prev) {

2657 break; // No match from prev.

2658 }

2659 } while((prev=length)>0);

2660 if(prev<minSpanStart) {

2661 return prev;

2662 } else {

2663 return minSpanStart;

2664 }

2665 }

2666 }

2667

2668 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,

2669 USetSpanCondition spanCondition) {

2670 const UnicodeSet &realSet(set.getSet());

2671 if(!set.hasStrings()) {

2672 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {

2673 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.

2674 }

2675

2676 UChar32 c;

2677 int32_t start=0, prev;

2678 while((prev=start)<length) {

2679 U8_NEXT_OR_FFFD(s, start, length, c);

2680 if(realSet.contains(c)!=spanCondition) {

2681 break;

2682 }

2683 }

2684 return prev;

2685 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {

2686 UnicodeSetWithStringsIterator iter(set);

2687 UChar32 c;

2688 int32_t start, next;

2689 for(start=next=0; start<length;) {

2690 U8_NEXT_OR_FFFD(s, next, length, c);

2691 if(realSet.contains(c)) {

2692 break;

2693 }

2694 const char *s8;

2695 int32_t length8;

2696 iter.reset();

2697 while((s8=iter.nextUTF8(length8))!=NULL) {

2698 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s 8, length8)) {

2699 // spanNeedsStrings=TRUE;

2700 return start;

2701 }

2702 }

2703 start=next;

2704 }

2705 return start;

2706 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {

2707 UnicodeSetWithStringsIterator iter(set);

2708 UChar32 c;

2709 int32_t start, next, maxSpanLimit=0;

2710 for(start=next=0; start<length;) {

2711 U8_NEXT_OR_FFFD(s, next, length, c);

2712 if(!realSet.contains(c)) {

2713 next=start; // Do not span this single, not-contained code poin t.

2714 }

2715 const char *s8;

2716 int32_t length8;

2717 iter.reset();

2718 while((s8=iter.nextUTF8(length8))!=NULL) {

2719 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s 8, length8)) {

2720 // spanNeedsStrings=TRUE;

2721 int32_t matchLimit=start+length8;

2722 if(matchLimit==length) {

2723 return length;

2724 }

2725 if(spanCondition==USET_SPAN_CONTAINED) {

2726 // Iterate for the shortest match at each position.

2727 // Recurse for each but the shortest match.

2728 if(next==start) {

2729 next=matchLimit; // First match from start.

2730 } else {

2731 if(matchLimit<next) {

2732 // Remember shortest match from start for iterat ion.

2733 int32_t temp=next;

2734 next=matchLimit;

2735 matchLimit=temp;

2736 }

2737 // Recurse for non-shortest match from start.

2738 int32_t spanLength=containsSpanUTF8(set, s+matchLimi t, length-matchLimit,

2739 USET_SPAN_CONTAI NED);

2740 if((matchLimit+spanLength)>maxSpanLimit) {

2741 maxSpanLimit=matchLimit+spanLength;

2742 if(maxSpanLimit==length) {

2743 return length;

2744 }

2745 }

2746 }

2747 } else /* spanCondition==USET_SPAN_SIMPLE */ {

2748 if(matchLimit>next) {

2749 // Remember longest match from start.

2750 next=matchLimit;

2751 }

2752 }

2753 }

2754 }

2755 if(next==start) {

2756 break; // No match from start.

2757 }

2758 start=next;

2759 }

2760 if(start>maxSpanLimit) {

2761 return start;

2762 } else {

2763 return maxSpanLimit;

2764 }

2765 }

2766 }

2767

2768 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,

2769 USetSpanCondition spanCondition) {

2770 if(length==0) {

2771 return 0;

2772 }

2773 const UnicodeSet &realSet(set.getSet());

2774 if(!set.hasStrings()) {

2775 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {

2776 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.

2777 }

2778

2779 UChar32 c;

2780 int32_t prev=length;

2781 do {

2782 U8_PREV_OR_FFFD(s, 0, length, c);

2783 if(realSet.contains(c)!=spanCondition) {

2784 break;

2785 }

2786 } while((prev=length)>0);

2787 return prev;

2788 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {

2789 UnicodeSetWithStringsIterator iter(set);

2790 UChar32 c;

2791 int32_t prev=length;

2792 do {

2793 U8_PREV_OR_FFFD(s, 0, length, c);

2794 if(realSet.contains(c)) {

2795 break;

2796 }

2797 const char *s8;

2798 int32_t length8;

2799 iter.reset();

2800 while((s8=iter.nextUTF8(length8))!=NULL) {

2801 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {

2802 // spanNeedsStrings=TRUE;

2803 return prev;

2804 }

2805 }

2806 } while((prev=length)>0);

2807 return prev;

2808 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {

2809 UnicodeSetWithStringsIterator iter(set);

2810 UChar32 c;

2811 int32_t prev=length, minSpanStart=length;

2812 do {

2813 U8_PREV_OR_FFFD(s, 0, length, c);

2814 if(!realSet.contains(c)) {

2815 length=prev; // Do not span this single, not-contained code poi nt.

2816 }

2817 const char *s8;

2818 int32_t length8;

2819 iter.reset();

2820 while((s8=iter.nextUTF8(length8))!=NULL) {

2821 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {

2822 // spanNeedsStrings=TRUE;

2823 int32_t matchStart=prev-length8;

2824 if(matchStart==0) {

2825 return 0;

2826 }

2827 if(spanCondition==USET_SPAN_CONTAINED) {

2828 // Iterate for the shortest match at each position.

2829 // Recurse for each but the shortest match.

2830 if(length==prev) {

2831 length=matchStart; // First match from prev.

2832 } else {

2833 if(matchStart>length) {

2834 // Remember shortest match from prev for iterati on.

2835 int32_t temp=length;

2836 length=matchStart;

2837 matchStart=temp;

2838 }

2839 // Recurse for non-shortest match from prev.

2840 int32_t spanStart=containsSpanBackUTF8(set, s, match Start,

2841 USET_SPAN_CON TAINED);

2842 if(spanStart<minSpanStart) {

2843 minSpanStart=spanStart;

2844 if(minSpanStart==0) {

2845 return 0;

2846 }

2847 }

2848 }

2849 } else /* spanCondition==USET_SPAN_SIMPLE */ {

2850 if(matchStart<length) {

2851 // Remember longest match from prev.

2852 length=matchStart;

2853 }

2854 }

2855 }

2856 }

2857 if(length==prev) {

2858 break; // No match from prev.

2859 }

2860 } while((prev=length)>0);

2861 if(prev<minSpanStart) {

2862 return prev;

2863 } else {

2864 return minSpanStart;

2865 }

2866 }

2867 }

2868

// Bit flags selecting which spans are performed and compared.
// Each group has two single-bit flags followed by a mask that covers both.
enum {
    // Text encodings.
    SPAN_UTF16          =1,
    SPAN_UTF8           =2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // The set itself and/or its complement.
    SPAN_SET            =4,
    SPAN_COMPLEMENT     =8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // Span directions.
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // Span conditions used for the "contained" spans.
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // Every flag above.
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};

2889

2890 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondit ion, USetSpanCondition contained) {

2891 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_ CONTAINED;

2892 }

2893

2894 static inline int32_t slen(const void *s, UBool isUTF16) {

2895 return isUTF16 ? u_strlen((const UChar )s) : strlen((const char )s);

2896 }

2897

2898 /*

2899 * Count spans on a string with the method according to type and set the span limits.

2900 * The set may be the complement of the original.

2901 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()

2902 * according to the expected number of spans.

2903 * Sets typeName to an empty string if there is no such type.

2904 * Returns -1 if the span option is filtered out.

2905 */

2906 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,

2907 const void *s, int32_t length, UBool isUTF16,

2908 uint32_t whichSpans,

2909 int type, const char *&typeName,

2910 int32_t limits[], int32_t limitsCapacity,

2911 int32_t expectCount) {

2912 const UnicodeSet &realSet(set.getSet());

2913 int32_t start, count;

2914 USetSpanCondition spanCondition, firstSpanCondition, contained;

2915 UBool isForward;

2916

2917 if(type<0 \|\| 7<type) {

2918 typeName="";

2919 return 0;

2920 }

2921

2922 static const char *const typeNames16[]={

2923 "contains", "contains(LM)",

2924 "span", "span(LM)",

2925 "containsBack", "containsBack(LM)",

2926 "spanBack", "spanBack(LM)"

2927 };

2928

2929 static const char *const typeNames8[]={

2930 "containsUTF8", "containsUTF8(LM)",

2931 "spanUTF8", "spanUTF8(LM)",

2932 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented

2933 "spanBackUTF8", "spanBackUTF8(LM)"

2934 };

2935

2936 typeName= isUTF16 ? typeNames16[type] : typeNames8[type];

2937

2938 // filter span options

2939 if(type<=3) {

2940 // span forward

2941 if((whichSpans&SPAN_FWD)==0) {

2942 return -1;

2943 }

2944 isForward=TRUE;

2945 } else {

2946 // span backward

2947 if((whichSpans&SPAN_BACK)==0) {

2948 return -1;

2949 }

2950 isForward=FALSE;

2951 }

2952 if((type&1)==0) {

2953 // use USET_SPAN_CONTAINED

2954 if((whichSpans&SPAN_CONTAINED)==0) {

2955 return -1;

2956 }

2957 contained=USET_SPAN_CONTAINED;

2958 } else {

2959 // use USET_SPAN_SIMPLE

2960 if((whichSpans&SPAN_SIMPLE)==0) {

2961 return -1;

2962 }

2963 contained=USET_SPAN_SIMPLE;

2964 }

2965

2966 // Default first span condition for going forward with an uncomplemented set .

2967 spanCondition=USET_SPAN_NOT_CONTAINED;

2968 if(isComplement) {

2969 spanCondition=invertSpanCondition(spanCondition, contained);

2970 }

2971

2972 // First span condition for span(), used to terminate the spanBack() iterati on.

2973 firstSpanCondition=spanCondition;

2974

2975 // spanBack(): Its initial span condition is span()'s last span condition,

2976 // which is the opposite of span()'s first span condition

2977 // if we expect an even number of spans.

2978 // (The loop inverts spanCondition (expectCount-1) times

2979 // before the expectCount'th span() call.)

2980 // If we do not compare forward and backward directions, then we do not have an

2981 // expectCount and just start with firstSpanCondition.

2982 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {

2983 spanCondition=invertSpanCondition(spanCondition, contained);

2984 }

2985

2986 count=0;

2987 switch(type) {

2988 case 0:

2989 case 1:

2990 start=0;

2991 if(length<0) {

2992 length=slen(s, isUTF16);

2993 }

2994 for(;;) {

2995 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, len gth-start, spanCondition) :

2996 containsSpanUTF8(set, (const char *)s+start, lengt h-start, spanCondition);

2997 if(count<limitsCapacity) {

2998 limits[count]=start;

2999 }

3000 ++count;

3001 if(start>=length) {

3002 break;

3003 }

3004 spanCondition=invertSpanCondition(spanCondition, contained);

3005 }

3006 break;

3007 case 2:

3008 case 3:

3009 start=0;

3010 for(;;) {

3011 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? l ength-start : length, spanCondition) :

3012 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);

3013 if(count<limitsCapacity) {

3014 limits[count]=start;

3015 }

3016 ++count;

3017 if(length>=0 ? start>=length :

3018 isUTF16 ? ((const UChar *)s)[start]==0 :

3019 ((const char *)s)[start]==0

3020 ) {

3021 break;

3022 }

3023 spanCondition=invertSpanCondition(spanCondition, contained);

3024 }

3025 break;

3026 case 4:

3027 case 5:

3028 if(length<0) {

3029 length=slen(s, isUTF16);

3030 }

3031 for(;;) {

3032 ++count;

3033 if(count<=limitsCapacity) {

3034 limits[limitsCapacity-count]=length;

3035 }

3036 length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, lengt h, spanCondition) :

3037 containsSpanBackUTF8(set, (const char *)s, length, spanCondition);

3038 if(length==0 && spanCondition==firstSpanCondition) {

3039 break;

3040 }

3041 spanCondition=invertSpanCondition(spanCondition, contained);

3042 }

3043 if(count<limitsCapacity) {

3044 memmove(limits, limits+(limitsCapacity-count), count*4);

3045 }

3046 break;

3047 case 6:

3048 case 7:

3049 for(;;) {

3050 ++count;

3051 if(count<=limitsCapacity) {

3052 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUT F16);

3053 }

3054 // Note: Length<0 is tested only for the first spanBack().

3055 // If we wanted to keep length<0 for all spanBack()s, we would have to

3056 // temporarily modify the string by placing a NUL where the previous spanBack() stopped.

3057 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCon dition) :

3058 realSet.spanBackUTF8((const char *)s, length, span Condition);

3059 if(length==0 && spanCondition==firstSpanCondition) {

3060 break;

3061 }

3062 spanCondition=invertSpanCondition(spanCondition, contained);

3063 }

3064 if(count<limitsCapacity) {

3065 memmove(limits, limits+(limitsCapacity-count), count*4);

3066 }

3067 break;

3068 default:

3069 typeName="";

3070 return -1;

3071 }

3072

3073 return count;

3074 }

3075

// Sets to be tested. Odd indexes are the complemented sets:
// testSpan() passes (i&1) to getSpans() as the isComplement flag.
enum {
    SLOW,
    SLOW_NOT,
    FAST,
    FAST_NOT,
    SET_COUNT
};

// Set names used in failure messages, indexed by the constants above.
static const char *const setNames[SET_COUNT]={
    "slow",
    "slow.not",
    "fast",
    "fast.not"
};

3091

3092 /*

3093 * Verify that we get the same results whether we look at text with contains(),

3094 * span() or spanBack(), using unfrozen or frozen versions of the set,

3095 * and using the set or its complement (switching the spanConditions accordingly ).

3096 * The latter verifies that

3097 * set.span(spanCondition) == set.complement().span(!spanCondition).

3098 *

3099 * The expectLimits[] are either provided by the caller (with expectCount>=0)

3100 * or returned to the caller (with an input expectCount<0).

3101 */

3102 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],

3103 const void *s, int32_t length, UBool isUTF16,

3104 uint32_t whichSpans,

3105 int32_t expectLimits[], int32_t &expectCount,

3106 const char *testName, int32_t index) {

3107 int32_t limits[500];

3108 int32_t limitsCount;

3109 int i, j;

3110

3111 const char *typeName;

3112 int type;

3113

3114 for(i=0; i<SET_COUNT; ++i) {

3115 if((i&1)==0) {

3116 // Even-numbered sets are original, uncomplemented sets.

3117 if((whichSpans&SPAN_SET)==0) {

3118 continue;

3119 }

3120 } else {

3121 // Odd-numbered sets are complemented.

3122 if((whichSpans&SPAN_COMPLEMENT)==0) {

3123 continue;

3124 }

3125 }

3126 for(type=0;; ++type) {

3127 limitsCount=getSpans(*sets[i], (UBool)(i&1),

3128 s, length, isUTF16,

3129 whichSpans,

3130 type, typeName,

3131 limits, UPRV_LENGTHOF(limits), expectCount);

3132 if(typeName[0]==0) {

3133 break; // All types tried.

3134 }

3135 if(limitsCount<0) {

3136 continue; // Span option filtered out.

3137 }

3138 if(expectCount<0) {

3139 expectCount=limitsCount;

3140 if(limitsCount>UPRV_LENGTHOF(limits)) {

3141 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",

3142 testName, (long)index, setNames[i], typeName, (long)li mitsCount, (long)UPRV_LENGTHOF(limits));

3143 return;

3144 }

3145 memcpy(expectLimits, limits, limitsCount*4);

3146 } else if(limitsCount!=expectCount) {

3147 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",

3148 testName, (long)index, setNames[i], typeName, (long)limits Count, (long)expectCount);

3149 } else {

3150 for(j=0; j<limitsCount; ++j) {

3151 if(limits[j]!=expectLimits[j]) {

3152 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=% ld != %ld",

3153 testName, (long)index, setNames[i], typeName, (lon g)limitsCount,

3154 j, (long)limits[j], (long)expectLimits[j]);

3155 break;

3156 }

3157 }

3158 }

3159 }

3160 }

3161

3162 // Compare span() with containsAll()/containsNone(),

3163 // but only if we have expectLimits[] from the uncomplemented set.

3164 if(isUTF16 && (whichSpans&SPAN_SET)!=0) {

3165 const UChar s16=(const UChar )s;

3166 UnicodeString string;

3167 int32_t prev=0, limit, length;

3168 for(i=0; i<expectCount; ++i) {

3169 limit=expectLimits[i];

3170 length=limit-prev;

3171 if(length>0) {

3172 string.setTo(FALSE, s16+prev, length); // read-only alias

3173 if(i&1) {

3174 if(!sets[SLOW]->getSet().containsAll(string)) {

3175 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE c ontradicts span()",

3176 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);

3177 return;

3178 }

3179 if(!sets[FAST]->getSet().containsAll(string)) {

3180 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE c ontradicts span()",

3181 testName, (long)index, setNames[FAST], (long)prev, (long)limit);

3182 return;

3183 }

3184 } else {

3185 if(!sets[SLOW]->getSet().containsNone(string)) {

3186 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",

3187 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);

3188 return;

3189 }

3190 if(!sets[FAST]->getSet().containsNone(string)) {

3191 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",

3192 testName, (long)index, setNames[FAST], (long)prev, (long)limit);

3193 return;

3194 }

3195 }

3196 }

3197 prev=limit;

3198 }

3199 }

3200 }

3201

3202 // Specifically test either UTF-16 or UTF-8.

3203 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],

3204 const void *s, int32_t length, UBool isUTF16,

3205 uint32_t whichSpans,

3206 const char *testName, int32_t index) {

3207 int32_t expectLimits[500];

3208 int32_t expectCount=-1;

3209 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, te stName, index);

3210 }

3211

3212 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {

3213 UChar c, c2;

3214

3215 if(length>=0) {

3216 while(length>0) {

3217 c=*s++;

3218 --length;

3219 if(0xd800<=c && c<0xe000) {

3220 if(c>=0xdc00 \|\| length==0 \|\| !U16_IS_TRAIL(c2=*s++)) {

3221 return TRUE;

3222 }

3223 --length;

3224 }

3225 }

3226 } else {

3227 while((c=*s++)!=0) {

3228 if(0xd800<=c && c<0xe000) {

3229 if(c>=0xdc00 \|\| !U16_IS_TRAIL(c2=*s++)) {

3230 return TRUE;

3231 }

3232 }

3233 }

3234 }

3235 return FALSE;

3236 }

3237

3238 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,

3239 // unless either UTF is turned off in whichSpans.

3240 // Testing UTF-16 and UTF-8 together requires that surrogate code points

3241 // have the same contains(c) value as U+FFFD.

3242 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],

3243 const UChar *s16, int32_t length16,

3244 uint32_t whichSpans,

3245 const char *testName, int32_t index) {

3246 int32_t expectLimits[500];

3247 int32_t expectCount;

3248

3249 expectCount=-1; // Get expectLimits[] from testSpan().

3250

3251 if((whichSpans&SPAN_UTF16)!=0) {

3252 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCoun t, testName, index);

3253 }

3254 if((whichSpans&SPAN_UTF8)==0) {

3255 return;

3256 }

3257

3258 // Convert s16[] and expectLimits[] to UTF-8.

3259 uint8_t s8[3000];

3260 int32_t offsets[3000];

3261

3262 const UChar *s16Limit=s16+length16;

3263 char t=(char )s8;

3264 char *tLimit=t+sizeof(s8);

3265 int32_t *o=offsets;

3266 UErrorCode errorCode=U_ZERO_ERROR;

3267

3268 // Convert with substitution: Turn unpaired surrogates into U+FFFD.

3269 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, & errorCode);

3270 if(U_FAILURE(errorCode)) {

3271 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",

3272 testName, (long)index, u_errorName(errorCode));

3273 ucnv_resetFromUnicode(utf8Cnv);

3274 return;

3275 }

3276 int32_t length8=(int32_t)(t-(char *)s8);

3277

3278 // Convert expectLimits[].

3279 int32_t i, j, expect;

3280 for(i=j=0; i<expectCount; ++i) {

3281 expect=expectLimits[i];

3282 if(expect==length16) {

3283 expectLimits[i]=length8;

3284 } else {

3285 while(offsets[j]<expect) {

3286 ++j;

3287 }

3288 expectLimits[i]=j;

3289 }

3290 }

3291

3292 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, te stName, index);

3293 }

3294

3295 static UChar32 nextCodePoint(UChar32 c) {

3296 // Skip some large and boring ranges.

3297 switch(c) {

3298 case 0x3441:

3299 return 0x4d7f;

3300 case 0x5100:

3301 return 0x9f00;

3302 case 0xb040:

3303 return 0xd780;

3304 case 0xe041:

3305 return 0xf8fe;

3306 case 0x10100:

3307 return 0x20000;

3308 case 0x20041:

3309 return 0xe0000;

3310 case 0xe0101:

3311 return 0x10fffd;

3312 default:

3313 return c+1;

3314 }

3315 }

3316

3317 // Verify that all implementations represent the same set.

3318 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings sets[4], uint 32_t whichSpans, const char testName) {

3319 // contains(U+FFFD) is inconsistent with contains(some surrogates),

3320 // or the set contains strings with unpaired surrogates which don't translat e to valid UTF-8:

3321 // Skip the UTF-8 part of the test - if the string contains surrogates -

3322 // because it is likely to produce a different result.

3323 UBool inconsistentSurrogates=

3324 (!(sets[0]->getSet().contains(0xfffd) ?

3325 sets[0]->getSet().contains(0xd800, 0xdfff) :

3326 sets[0]->getSet().containsNone(0xd800, 0xdfff)) \|\|

3327 sets[0]->hasStringsWithSurrogates());

3328

3329 UChar s[1000];

3330 int32_t length=0;

3331 uint32_t localWhichSpans;

3332

3333 UChar32 c, first;

3334 for(first=c=0;; c=nextCodePoint(c)) {

3335 if(c>0x10ffff \|\| length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {

3336 localWhichSpans=whichSpans;

3337 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurroga tes) {

3338 localWhichSpans&=~SPAN_UTF8;

3339 }

3340 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);

3341 if(c>0x10ffff) {

3342 break;

3343 }

3344 length=0;

3345 first=c;

3346 }

3347 U16_APPEND_UNSAFE(s, length, c);

3348 }

3349 }

3350

3351 // Test with a particular, interesting string.

3352 // Specify length and try NUL-termination.

3353 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings sets[4], u int32_t whichSpans, const char testName) {

3354 static const UChar s[]={

3355 0x61, 0x62, 0x20, // Latin, space

3356 0x3b1, 0x3b2, 0x3b3, // Greek

3357 0xd900, // lead surrogate

3358 0x3000, 0x30ab, 0x30ad, // wide space, Katakana

3359 0xdc05, // trail surrogate

3360 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul

3361 0xd900, 0xdc05, // unassigned supplementary

3362 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary

3363 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wro ng order, LS

3364 0 // NUL

3365 };

3366

3367 if((whichSpans&SPAN_UTF16)==0) {

3368 return;

3369 }

3370 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);

3371 testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testNam e, 1);

3372 }

3373

3374 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings sets[4], ui nt32_t whichSpans, const char testName) {

3375 static const char s[]={

3376 "abc" // Latin

3377

3378 /* trail byte in lead position */

3379 "\x80"

3380

3381 " " // space

3382

3383 /* truncated multi-byte sequences */

3384 "\xd0"

3385 "\xe0"

3386 "\xe1"

3387 "\xed"

3388 "\xee"

3389 "\xf0"

3390 "\xf1"

3391 "\xf4"

3392 "\xf8"

3393 "\xfc"

3394

3395 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek

3396

3397 /* trail byte in lead position */

3398 "\x80"

3399

3400 "\xe0\x80"

3401 "\xe0\xa0"

3402 "\xe1\x80"

3403 "\xed\x80"

3404 "\xed\xa0"

3405 "\xee\x80"

3406 "\xf0\x80"

3407 "\xf0\x90"

3408 "\xf1\x80"

3409 "\xf4\x80"

3410 "\xf4\x90"

3411 "\xf8\x80"

3412 "\xfc\x80"

3413

3414 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana

3415

3416 /* trail byte in lead position */

3417 "\x80"

3418

3419 "\xf0\x80\x80"

3420 "\xf0\x90\x80"

3421 "\xf1\x80\x80"

3422 "\xf4\x80\x80"

3423 "\xf4\x90\x80"

3424 "\xf8\x80\x80"

3425 "\xfc\x80\x80"

3426

3427 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul

3428

3429 /* trail byte in lead position */

3430 "\x80"

3431

3432 "\xf8\x80\x80\x80"

3433 "\xfc\x80\x80\x80"

3434

3435 "\xF1\x90\x80\x85" // unassigned supplementary

3436

3437 /* trail byte in lead position */

3438 "\x80"

3439

3440 "\xfc\x80\x80\x80\x80"

3441

3442 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary

3443

3444 /* trail byte in lead position */

3445 "\x80"

3446

3447 /* complete sequences but non-shortest forms or out of range etc. */

3448 "\xc0\x80"

3449 "\xe0\x80\x80"

3450 "\xed\xa0\x80"

3451 "\xf0\x80\x80\x80"

3452 "\xf4\x90\x80\x80"

3453 "\xf8\x80\x80\x80\x80"

3454 "\xfc\x80\x80\x80\x80\x80"

3455 "\xfe"

3456 "\xff"

3457

3458 /* trail byte in lead position */

3459 "\x80"

3460

3461 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminate d

3462 };

3463

3464 if((whichSpans&SPAN_UTF8)==0) {

3465 return;

3466 }

3467 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);

3468 testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testN ame, 1);

3469 }

3470

// Take a set of span options and multiply them so that
// each portion only has one of the options a, b and c.
// If b==0, then the set of options is just modified with mask and a.
// If b!=0 and c==0, then the set of options is just modified with mask, a and b.
//
// Each of the first whichSpansCount entries is ANDed with mask and then stored
// up to three times: ORed with a in place, ORed with b at [count+i],
// and ORed with c at [2*count+i].
// The caller must provide room for up to 3*whichSpansCount entries.
// Returns the new number of entries.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    uint32_t s;
    int32_t i;

    for(i=0; i<whichSpansCount; ++i) {
        s=whichSpans[i]&mask;
        whichSpans[i]=s|a;
        if(b!=0) {
            whichSpans[whichSpansCount+i]=s|b;
            if(c!=0) {
                whichSpans[2*whichSpansCount+i]=s|c;
            }
        }
    }
    return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount;
}

3493

// Runs of 63 and 64 'a's / 'b's, used as building blocks for long
// set strings and test strings via string literal concatenation.
// Written as 16-character chunks so that the lengths are easy to check.
#define _63_a \
    "aaaaaaaaaaaaaaaa" "aaaaaaaaaaaaaaaa" \
    "aaaaaaaaaaaaaaaa" "aaaaaaaaaaaaaaa"
#define _64_a \
    "aaaaaaaaaaaaaaaa" "aaaaaaaaaaaaaaaa" \
    "aaaaaaaaaaaaaaaa" "aaaaaaaaaaaaaaaa"
#define _63_b \
    "bbbbbbbbbbbbbbbb" "bbbbbbbbbbbbbbbb" \
    "bbbbbbbbbbbbbbbb" "bbbbbbbbbbbbbbb"
#define _64_b \
    "bbbbbbbbbbbbbbbb" "bbbbbbbbbbbbbbbb" \
    "bbbbbbbbbbbbbbbb" "bbbbbbbbbbbbbbbb"

3498

3499 void UnicodeSetTest::TestSpan() {

3500 // "[...]" is a UnicodeSet pattern.

3501 // "*" performs tests on all Unicode code points and on a selection of

3502 // malformed UTF-8/16 strings.

3503 // "-options" limits the scope of testing for the current set.

3504 // By default, the test verifies that equivalent boundaries are found

3505 // for UTF-16 and UTF-8, going forward and backward,

3506 // alternating USET_SPAN_NOT_CONTAINED with

3507 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.

3508 // Single-character options:

3509 // 8 -- UTF-16 and UTF-8 boundaries may differ.

3510 // Cause: contains(U+FFFD) is inconsistent with contains(some surro gates),

3511 // or the set contains strings with unpaired surrogates

3512 // which do not translate to valid UTF-8.

3513 // c -- set.span() and set.complement().span() boundaries may differ.

3514 // Cause: Set strings are not complemented.

3515 // b -- span() and spanBack() boundaries may differ.

3516 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAI NED)

3517 // and spanBack(USET_SPAN_SIMPLE) are defined to

3518 // match with non-overlapping substrings.

3519 // For example, with a set containing "ab" and "ba",

3520 // span() of "aba" yields boundaries { 0, 2, 3 }

3521 // because the initial "ab" matches from 0 to 2,

3522 // while spanBack() yields boundaries { 0, 1, 3 }

3523 // because the final "ba" matches from 1 to 3.

3524 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.

3525 // Cause: Strings in the set overlap, and a longer match may

3526 // require a sequence including non-longest substrings.

3527 // For example, with a set containing "ab", "abc" and "cd",

3528 // span(contained) of "abcd" spans the entire string

3529 // but span(longest match) only spans the first 3 characters.

3530 // Each "-options" first resets all options and then applies the specified options.

3531 // A "-" without options resets the options.

3532 // The options are also reset for each new set.

3533 // Other strings will be spanned.

3534 static const char *const testdata[]={

3535 "[:ID_Continue:]",

3536 "*",

3537 "[:White_Space:]",

3538 "*",

3539 "[]",

3540 "*",

3541 "[\\u0000-\\U0010FFFF]",

3542 "*",

3543 "[\\u0000\\u0080\\u0800\\U00010000]",

3544 "*",

3545 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",

3546 "*",

3547 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30 ad}]",

3548 "-c",

3549 "*",

3550 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30 ad}]",

3551 "-c",

3552 "*",

3553

3554 // Overlapping strings cause overlapping attempts to match.

3555 "[x{xy}{xya}{axy}{ax}]",

3556 "-cl",

3557

3558 // More repetitions of "xya" would take too long with the recursive

3559 // reference implementation.

3560 // containsAll()=FALSE

3561 // test_string 0x14

3562 "xx"

3563 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.

3564 "xx" // set.complement().span(contained) will stop between th e two 'x'es.

3565 "xyaxyaxyaxya"

3566 "xx"

3567 "xyaxyaxyaxya" // span() ends here.

3568 "aaa",

3569

3570 // containsAll()=TRUE

3571 // test_string 0x15

3572 "xx"

3573 "xyaxyaxyaxya"

3574 "xx"

3575 "xyaxyaxyaxya"

3576 "xx"

3577 "xyaxyaxyaxy",

3578

3579 "-bc",

3580 // test_string 0x17

3581 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }

3582 "-c",

3583 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }

3584 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }

3585 "-",

3586 "byaya", // span() -> { 5 }

3587 "byay", // span() -> { 4 }

3588 "bya", // span() -> { 3 }

3589

3590 // span(longest match) will not span the whole string.

3591 "[a{ab}{bc}]",

3592 "-cl",

3593 // test_string 0x21

3594 "abc",

3595

3596 "[a{ab}{abc}{cd}]",

3597 "-cl",

3598 "acdabcdabccd",

3599

3600 // spanBack(longest match) will not span the whole string.

3601 "[c{ab}{bc}]",

3602 "-cl",

3603 "abc",

3604

3605 "[d{cd}{bcd}{ab}]",

3606 "-cl",

3607 "abbcdabcdabd",

3608

3609 // Test with non-ASCII set strings - test proper handling of surrogate p airs

3610 // and UTF-8 trail bytes.

3611 // Copies of above test sets and strings, but transliterated to have

3612 // different code points with similar trail units.

3613 // Previous: a b c d

3614 // Unicode: 042B 30AB 200AB 204AB

3615 // UTF-16: 042B 30AB D840 DCAB D841 DCAB

3616 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB

3617 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U00020 4AB}]",

3618 "-cl",

3619 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042 B\\u30AB\\U000200AB\\U000200AB\\U000204AB",

3620

3621 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u 042B\\u30AB}]",

3622 "-cl",

3623 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U 000204AB\\u042B\\u30AB\\U000204AB",

3624

3625 // Stress bookkeeping and recursion.

3626 // The following strings are barely doable with the recursive

3627 // reference implementation.

3628 // The not-contained character at the end prevents an early exit from th e span().

3629 "[b{bb}]",

3630 "-c",

3631 // test_string 0x33

3632 "bbbbbbbbbbbbbbbbbbbbbbbb-",

3633 // On complement sets, span() and spanBack() get different results

3634 // because b is not in the complement set and there is an odd number of b's

3635 // in the test string.

3636 "-bc",

3637 "bbbbbbbbbbbbbbbbbbbbbbbbb-",

3638

3639 // Test with set strings with an initial or final code point span

3640 // longer than 254.

3641 "[a{" _64_a _64_a _64_a _64_a "b}"

3642 "{a" _64_b _64_b _64_b _64_b "}]",

3643 "-c",

3644 _64_a _64_a _64_a _63_a "b",

3645 _64_a _64_a _64_a _64_a "b",

3646 _64_a _64_a _64_a _64_a "aaaabbbb",

3647 "a" _64_b _64_b _64_b _63_b,

3648 "a" _64_b _64_b _64_b _64_b,

3649 "aaaabbbb" _64_b _64_b _64_b _64_b,

3650

3651 // Test with strings containing unpaired surrogates.

3652 // They are not representable in UTF-8, and a leading trail surrogate

3653 // and a trailing lead surrogate must not match in the middle of a prope r surrogate pair.

3654 // U+20001 == \\uD840\\uDC01

3655 // U+20400 == \\uD841\\uDC00

3656 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",

3657 "-8cl",

3658 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a \\U00020000\\uDC00a\\uDC00babbb"

3659 };

3660 uint32_t whichSpans[96]={ SPAN_ALL };

3661 int32_t whichSpansCount=1;

3662

3663 UnicodeSet *sets[SET_COUNT]={ NULL };

3664 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };

3665

3666 char testName[1024];

3667 char *testNameLimit=testName;

3668

3669 int32_t i, j;

3670 for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {

3671 const char *s=testdata[i];

3672 if(s[0]=='[') {

3673 // Create new test sets from this pattern.

3674 for(j=0; j<SET_COUNT; ++j) {

3675 delete sets_with_str[j];

3676 delete sets[j];

3677 }

3678 UErrorCode errorCode=U_ZERO_ERROR;

3679 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), e rrorCode);

3680 if(U_FAILURE(errorCode)) {

3681 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_err orName(errorCode));

3682 break;

3683 }

3684 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);

3685 sets[SLOW_NOT]->complement();

3686 // Intermediate set: Test cloning of a frozen set.

3687 UnicodeSet fast=new UnicodeSet(sets[SLOW]);

3688 fast->freeze();

3689 sets[FAST]=(UnicodeSet *)fast->clone();

3690 delete fast;

3691 UnicodeSet fastNot=new UnicodeSet(sets[SLOW_NOT]);

3692 fastNot->freeze();

3693 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();

3694 delete fastNot;

3695

3696 for(j=0; j<SET_COUNT; ++j) {

3697 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);

3698 }

3699

3700 strcpy(testName, s);

3701 testNameLimit=strchr(testName, 0);

3702 *testNameLimit++=':';

3703 *testNameLimit=0;

3704

3705 whichSpans[0]=SPAN_ALL;

3706 whichSpansCount=1;

3707 } else if(s[0]=='-') {

3708 whichSpans[0]=SPAN_ALL;

3709 whichSpansCount=1;

3710

3711 while(*++s!=0) {

3712 switch(*s) {

3713 case 'c':

3714 whichSpansCount=addAlternative(whichSpans, whichSpansCount,

3715 ~SPAN_POLARITY,

3716 SPAN_SET,

3717 SPAN_COMPLEMENT,

3718 0);

3719 break;

3720 case 'b':

3721 whichSpansCount=addAlternative(whichSpans, whichSpansCount,

3722 ~SPAN_DIRS,

3723 SPAN_FWD,

3724 SPAN_BACK,

3725 0);

3726 break;

3727 case 'l':

3728 // test USET_SPAN_CONTAINED FWD & BACK, and separately

3729 // USET_SPAN_SIMPLE only FWD, and separately

3730 // USET_SPAN_SIMPLE only BACK

3731 whichSpansCount=addAlternative(whichSpans, whichSpansCount,

3732 ~(SPAN_DIRS\|SPAN_CONDITION),

3733 SPAN_DIRS\|SPAN_CONTAINED,

3734 SPAN_FWD\|SPAN_SIMPLE,

3735 SPAN_BACK\|SPAN_SIMPLE);

3736 break;

3737 case '8':

3738 whichSpansCount=addAlternative(whichSpans, whichSpansCount,

3739 ~SPAN_UTFS,

3740 SPAN_UTF16,

3741 SPAN_UTF8,

3742 0);

3743 break;

3744 default:

3745 errln("FAIL: unrecognized span set option in \"%s\"", testda ta[i]);

3746 break;

3747 }

3748 }

3749 } else if(0==strcmp(s, "*")) {

3750 strcpy(testNameLimit, "bad_string");

3751 for(j=0; j<whichSpansCount; ++j) {

3752 if(whichSpansCount>1) {

3753 sprintf(testNameLimit+10 /* strlen("bad_string") */,

3754 "%%0x%3x",

3755 whichSpans[j]);

3756 }

3757 testSpanUTF16String(sets_with_str, whichSpans[j], testName);

3758 testSpanUTF8String(sets_with_str, whichSpans[j], testName);

3759 }

3760

3761 strcpy(testNameLimit, "contents");

3762 for(j=0; j<whichSpansCount; ++j) {

3763 if(whichSpansCount>1) {

3764 sprintf(testNameLimit+8 /* strlen("contents") */,

3765 "%%0x%3x",

3766 whichSpans[j]);

3767 }

3768 testSpanContents(sets_with_str, whichSpans[j], testName);

3769 }

3770 } else {

3771 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();

3772 strcpy(testNameLimit, "test_string");

3773 for(j=0; j<whichSpansCount; ++j) {

3774 if(whichSpansCount>1) {

3775 sprintf(testNameLimit+11 /* strlen("test_string") */,

3776 "%%0x%3x",

3777 whichSpans[j]);

3778 }

3779 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.lengt h(), whichSpans[j], testName, i);

3780 }

3781 }

3782 }

3783 for(j=0; j<SET_COUNT; ++j) {

3784 delete sets_with_str[j];

3785 delete sets[j];

3786 }

3787 }

3788

3789 // Test select patterns and strings, and test USET_SPAN_SIMPLE.

3790 void UnicodeSetTest::TestStringSpan() {

3791 static const char *pattern="[x{xy}{xya}{axy}{ax}]";

3792 static const char *const string=

3793 "xx"

3794 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"

3795 "xx"

3796 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"

3797 "xx"

3798 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"

3799 "aaaa";

3800

3801 UErrorCode errorCode=U_ZERO_ERROR;

3802 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);

3803 UnicodeSet set(pattern16, errorCode);

3804 if(U_FAILURE(errorCode)) {

3805 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName (errorCode));

3806 return;

3807 }

3808

3809 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();

3810

3811 if(set.containsAll(string16)) {

3812 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, s tring);

3813 }

3814

3815 // Remove trailing "aaaa".

3816 string16.truncate(string16.length()-4);

3817 if(!set.containsAll(string16)) {

3818 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", patter n, string);

3819 }

3820

3821 string16=UNICODE_STRING_SIMPLE("byayaxya");

3822 const UChar *s16=string16.getBuffer();

3823 int32_t length16=string16.length();

3824 (void)length16; // Suppress set but not used warning.

3825 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 \|\|

3826 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 \|\|

3827 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 \|\|

3828 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 \|\|

3829 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 \|\|

3830 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3

3831 ) {

3832 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pa ttern);

3833 }

3834

3835 pattern="[a{ab}{abc}{cd}]";

3836 pattern16=UnicodeString(pattern, -1, US_INV);

3837 set.applyPattern(pattern16, errorCode);

3838 if(U_FAILURE(errorCode)) {

3839 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName (errorCode));

3840 return;

3841 }

3842 string16=UNICODE_STRING_SIMPLE("acdabcdabccd");

3843 s16=string16.getBuffer();

3844 length16=string16.length();

3845 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 \|\|

3846 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 \|\|

3847 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5

3848 ) {

3849 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);

3850 }

3851

3852 pattern="[d{cd}{bcd}{ab}]";

3853 pattern16=UnicodeString(pattern, -1, US_INV);

3854 set.applyPattern(pattern16, errorCode).freeze();

3855 if(U_FAILURE(errorCode)) {

3856 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName (errorCode));

3857 return;

3858 }

3859 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");

3860 s16=string16.getBuffer();

3861 length16=string16.length();

3862 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 \|\|

3863 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 \|\|

3864 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0

3865 ) {

3866 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wr ong value", pattern);

3867 }

3868 }

3869

3870 /**

3871 * Including collationroot.h fails here with

3872 1>c:\Program Files (x86)\Microsoft SDKs\Windows\v7.0A\include\driverspecs.h(142) : error C2008: '$' : unexpected in macro definition

3873 * .. so, we skip this test on Windows.

3874 *

3875 * the cause is that intltest builds with /Za which disables language extension s - which means

3876 * windows header files can't be used.

3877 */

3878 #if !UCONFIG_NO_COLLATION && !U_PLATFORM_HAS_WIN32_API

3879 #include "collationroot.h"

3880 #include "collationtailoring.h"

3881 #endif

3882

3883 void UnicodeSetTest::TestUCAUnsafeBackwards() {

3884 #if U_PLATFORM_HAS_WIN32_API

3885 infoln("Skipping TestUCAUnsafeBackwards() - can't include collationroot.h on Windows without language extensions!");

3886 #elif !UCONFIG_NO_COLLATION

3887 UErrorCode errorCode = U_ZERO_ERROR;

3888

3889 // Get the unsafeBackwardsSet

3890 const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(erro rCode);

3891 if(U_FAILURE(errorCode)) {

3892 dataerrln("FAIL: %s getting root cache entry", u_errorName(errorCode));

3893 return;

3894 }

3895 //const UVersionInfo &version = rootEntry->tailoring->version;

3896 const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSe t;

3897

3898 checkSerializeRoundTrip(*unsafeBackwardSet, errorCode);

3899

3900 if(!logKnownIssue("11891","UnicodeSet fails to round trip on CollationRoot.. .unsafeBackwards set")) {

3901 // simple test case

3902 // TODO(ticket #11891): Simplify this test function to this simple case. Rename it appropriately.

3903 // TODO(ticket #11891): Port test to Java. Is this a bug there, too?

3904 UnicodeSet surrogates;

3905 surrogates.add(0xd83a); // a lead surrogate

3906 surrogates.add(0xdc00, 0xdfff); // a range of trail surrogates

3907 UnicodeString pat;

3908 surrogates.toPattern(pat, FALSE); // bad: [ 0xd83a, 0xdc00, 0x2d, 0xdff f ]

3909 // TODO: Probably fix either UnicodeSet::_generatePattern() or _appendTo Pat()

3910 // so that at least one type of surrogate code points are escaped,

3911 // or (minimally) so that adjacent lead+trail surrogate code points are escaped.

3912 errorCode = U_ZERO_ERROR;

3913 UnicodeSet s2;

3914 s2.applyPattern(pat, errorCode); // looks like invalid range [ 0x1e800, 0x2d, 0xdfff ]

3915 if(U_FAILURE(errorCode)) {

3916 errln("FAIL: surrogates to/from pattern - %s", u_errorName(errorCode ));

3917 } else {

3918 checkEqual(surrogates, s2, "surrogates to/from pattern");

3919 }

3920 // This occurs in the UCA unsafe-backwards set.

3921 checkRoundTrip(*unsafeBackwardSet);

3922 }

3923 #endif

3924 }

OLD	NEW

« no previous file with comments | « source/test/intltest/usettest.h ('k') | source/test/intltest/ustrtest.h » ('j') | no next file with comments »