icu46/source/test/intltest/usettest.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/test/intltest/usettest.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 ********************************************************************************

	3 * Copyright (C) 1999-2010 International Business Machines Corporation and

	4 * others. All Rights Reserved.

	5 ********************************************************************************

	6 * Date Name Description

	7 * 10/20/99 alan Creation.

	8 * 03/22/2000 Madhu Added additional tests

	9 ********************************************************************************

	10 */

	11

	12 #include <stdio.h>

	13

	14 #include <string.h>

	15 #include "unicode/utypes.h"

	16 #include "usettest.h"

	17 #include "unicode/ucnv.h"

	18 #include "unicode/uniset.h"

	19 #include "unicode/uchar.h"

	20 #include "unicode/usetiter.h"

	21 #include "unicode/ustring.h"

	22 #include "unicode/parsepos.h"

	23 #include "unicode/symtable.h"

	24 #include "unicode/uversion.h"

	25 #include "hash.h"

	26

	// Element count of a statically sized array, cast to int32_t.
	#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

	// Calls dataerrln() with file, line and the ICU error name when `status`
	// holds a failure code; does nothing otherwise.
	#define TEST_ASSERT_SUCCESS(status) { \
	    if (U_FAILURE(status)) { \
	        dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
	                  u_errorName(status)); \
	    } \
	}

	// Calls dataerrln() with file and line when `expr` evaluates to false.
	#define TEST_ASSERT(expr) { \
	    if (!(expr)) { \
	        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
	    } \
	}

	35

	36 UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {

	37 UnicodeString pat;

	38 set.toPattern(pat);

	39 return left + UnicodeSetTest::escape(pat);

	40 }

	41

	// Expands to one `case` label of runIndexedTest(): stores the stringized
	// test name in `name` and, when `exec` is set, logs a header line and calls
	// the test method. The trailing `break` is completed by the caller's `;`.
	#define CASE(id,test) \
	    case id: \
	        name = #test; \
	        if (exec) { \
	            logln(#test "---"); \
	            logln(); \
	            test(); \
	        } \
	        break

	50

	51 UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {

	52 }

	53

	54 UConverter *UnicodeSetTest::openUTF8Converter() {

	55 if(utf8Cnv==NULL) {

	56 UErrorCode errorCode=U_ZERO_ERROR;

	57 utf8Cnv=ucnv_open("UTF-8", &errorCode);

	58 }

	59 return utf8Cnv;

	60 }

	61

	62 UnicodeSetTest::~UnicodeSetTest() {

	63 ucnv_close(utf8Cnv);

	64 }

	65

	66 void

	67 UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,

	68 const char* &name, char* /par/) {

	69 // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");

	70 switch (index) {

	71 CASE(0,TestPatterns);

	72 CASE(1,TestAddRemove);

	73 CASE(2,TestCategories);

	74 CASE(3,TestCloneEqualHash);

	75 CASE(4,TestMinimalRep);

	76 CASE(5,TestAPI);

	77 CASE(6,TestScriptSet);

	78 CASE(7,TestPropertySet);

	79 CASE(8,TestClone);

	80 CASE(9,TestExhaustive);

	81 CASE(10,TestToPattern);

	82 CASE(11,TestIndexOf);

	83 CASE(12,TestStrings);

	84 CASE(13,Testj2268);

	85 CASE(14,TestCloseOver);

	86 CASE(15,TestEscapePattern);

	87 CASE(16,TestInvalidCodePoint);

	88 CASE(17,TestSymbolTable);

	89 CASE(18,TestSurrogate);

	90 CASE(19,TestPosixClasses);

	91 CASE(20,TestIteration);

	92 CASE(21,TestFreezable);

	93 CASE(22,TestSpan);

	94 CASE(23,TestStringSpan);

	95 default: name = ""; break;

	96 }

	97 }

	98

	99 static const char NOT[] = "%%%%";

	100

	101 /**

	102 * UVector was improperly copying contents

	103 * This code will crash this is still true

	104 */

	105 void UnicodeSetTest::Testj2268() {

	106 UnicodeSet t;

	107 t.add(UnicodeString("abc"));

	108 UnicodeSet test(t);

	109 UnicodeString ustrPat;

	110 test.toPattern(ustrPat, TRUE);

	111 }

	112

	113 /**

	114 * Test toPattern().

	115 */

	116 void UnicodeSetTest::TestToPattern() {

	117 UErrorCode ec = U_ZERO_ERROR;

	118

	119 // Test that toPattern() round trips with syntax characters and

	120 // whitespace.

	121 {

	122 static const char* OTHER_TOPATTERN_TESTS[] = {

	123 "[[:latin:]&[:greek:]]",

	124 "[[:latin:]-[:greek:]]",

	125 "[:nonspacing mark:]",

	126 NULL

	127 };

	128

	129 for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {

	130 ec = U_ZERO_ERROR;

	131 UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);

	132 if (U_FAILURE(ec)) {

	133 dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_ TESTS[j] + " - " + UnicodeString(u_errorName(ec)));

	134 continue;

	135 }

	136 checkPat(OTHER_TOPATTERN_TESTS[j], s);

	137 }

	138

	139 for (UChar32 i = 0; i <= 0x10FFFF; ++i) {

	140 if ((i <= 0xFF && !u_isalpha(i)) \|\| u_isspace(i)) {

	141

	142 // check various combinations to make sure they all work.

	143 if (i != 0 && !toPatternAux(i, i)){

	144 continue;

	145 }

	146 if (!toPatternAux(0, i)){

	147 continue;

	148 }

	149 if (!toPatternAux(i, 0xFFFF)){

	150 continue;

	151 }

	152 }

	153 }

	154 }

	155

	156 // Test pattern behavior of multicharacter strings.

	157 {

	158 ec = U_ZERO_ERROR;

	159 UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);

	160

	161 // This loop isn't a loop. It's here to make the compiler happy.

	162 // If you're curious, try removing it and changing the 'break'

	163 // statements (except for the last) to goto's.

	164 for (;;) {

	165 if (U_FAILURE(ec)) break;

	166 const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};

	167 expectToPattern(*s, "[a-z{aa}{ab}]", exp1);

	168

	169 s->add("ac");

	170 const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};

	171 expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);

	172

	173 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);

	174 if (U_FAILURE(ec)) break;

	175 const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};

	176 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3 );

	177

	178 s->add("[]");

	179 const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};

	180 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}] "), exp4);

	181

	182 s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r} ]"), ec);

	183 if (U_FAILURE(ec)) break;

	184 const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};

	185 expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4 E01\\u4E02}]"), exp5);

	186

	187 // j2189

	188 s->clear();

	189 s->add(UnicodeString("abc", ""));

	190 s->add(UnicodeString("abc", ""));

	191 const char* exp6[] = {"abc", NOT, "ab", NULL};

	192 expectToPattern(*s, "[{abc}]", exp6);

	193

	194 break;

	195 }

	196

	197 if (U_FAILURE(ec)) errln("FAIL: pattern parse error");

	198 delete s;

	199 }

	200

	201 // JB#3400: For 2 character ranges prefer [ab] to [a-b]

	202 UnicodeSet s;

	203 s.add((UChar)97, (UChar)98); // 'a', 'b'

	204 expectToPattern(s, "[ab]", NULL);

	205 }

	206

	207 UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {

	208

	209 // use Integer.toString because Utility.hex doesn't handle ints

	210 UnicodeString pat = "";

	211 // TODO do these in hex

	212 //String source = "0x" + Integer.toString(start,16).toUpperCase();

	213 //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase( );

	214 UnicodeString source;

	215 source = source + (uint32_t)start;

	216 if (start != end)

	217 source = source + ".." + (uint32_t)end;

	218 UnicodeSet testSet;

	219 testSet.add(start, end);

	220 return checkPat(source, testSet);

	221 }

	222

	223 UBool UnicodeSetTest::checkPat(const UnicodeString& source,

	224 const UnicodeSet& testSet) {

	225 // What we want to make sure of is that a pattern generated

	226 // by toPattern(), with or without escaped unprintables, can

	227 // be passed back into the UnicodeSet constructor.

	228 UnicodeString pat0;

	229

	230 testSet.toPattern(pat0, TRUE);

	231

	232 if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;

	233

	234 //String pat1 = unescapeLeniently(pat0);

	235 //if (!checkPat(source + " (in code)", testSet, pat1)) return false;

	236

	237 UnicodeString pat2;

	238 testSet.toPattern(pat2, FALSE);

	239 if (!checkPat(source, testSet, pat2)) return FALSE;

	240

	241 //String pat3 = unescapeLeniently(pat2);

	242 // if (!checkPat(source + " (in code)", testSet, pat3)) return false;

	243

	244 //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);

	245 logln((UnicodeString)source + " => " + pat0 + ", " + pat2);

	246 return TRUE;

	247 }

	248

	249 UBool UnicodeSetTest::checkPat(const UnicodeString& source,

	250 const UnicodeSet& testSet,

	251 const UnicodeString& pat) {

	252 UErrorCode ec = U_ZERO_ERROR;

	253 UnicodeSet testSet2(pat, ec);

	254 if (testSet2 != testSet) {

	255 errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);

	256 return FALSE;

	257 }

	258 return TRUE;

	259 }

	260

	261 void

	262 UnicodeSetTest::TestPatterns(void) {

	263 UnicodeSet set;

	264 expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""), "km");

	265 expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""), "aczz");

	266 expectPattern(set, UnicodeString("[a\\-z]", ""), "--aazz");

	267 expectPattern(set, UnicodeString("[-az]", ""), "--aazz");

	268 expectPattern(set, UnicodeString("[az-]", ""), "--aazz");

	269 expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");

	270

	271 // Throw in a test of complement

	272 set.complement();

	273 UnicodeString exp;

	274 exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append ((UChar)0xFFFF);

	275 expectPairs(set, exp);

	276 }

	277

	278 void

	279 UnicodeSetTest::TestCategories(void) {

	280 UErrorCode status = U_ZERO_ERROR;

	281 const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]

	282 UnicodeSet set(pat, status);

	283 if (U_FAILURE(status)) {

	284 dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));

	285 return;

	286 } else {

	287 expectContainment(set, pat, "ABC", "abc");

	288 }

	289

	290 UChar32 i;

	291 int32_t failures = 0;

	292 // Make sure generation of L doesn't pollute cached Lu set

	293 // First generate L, then Lu

	294 set.applyPattern("[:L:]", status);

	295 if (U_FAILURE(status)) { errln("FAIL"); return; }

	296 for (i=0; i<0x200; ++i) {

	297 UBool l = u_isalpha((UChar)i);

	298 if (l != set.contains(i)) {

	299 errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +

	300 set.contains(i));

	301 if (++failures == 10) break;

	302 }

	303 }

	304

	305 set.applyPattern("[:Lu:]", status);

	306 if (U_FAILURE(status)) { errln("FAIL"); return; }

	307 for (i=0; i<0x200; ++i) {

	308 UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);

	309 if (lu != set.contains(i)) {

	310 errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +

	311 set.contains(i));

	312 if (++failures == 20) break;

	313 }

	314 }

	315 }

	316 void

	317 UnicodeSetTest::TestCloneEqualHash(void) {

	318 UErrorCode status = U_ZERO_ERROR;

	319 // set1 and set2 used to be built with the obsolete constructor taking

	320 // UCharCategory values; replaced with pattern constructors

	321 // markus 20030502

	322 UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter} "), status); // :Ll: Letter, lowercase

	323 UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); / / Letter, lowercase

	324 if (U_FAILURE(status)){

	325 dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));

	326 return;

	327 }

	328 UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}") , status); //Number, Decimal digit

	329 UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status); //Number, Decimal digit

	330 if (U_FAILURE(status)){

	331 errln((UnicodeString)"FAIL: Can't construct set with category->Nd");

	332 return;

	333 }

	334

	335 if (set1 != set1a) {

	336 errln("FAIL: category constructor for Ll broken");

	337 }

	338 if (set2 != set2a) {

	339 errln("FAIL: category constructor for Nd broken");

	340 }

	341 delete set1a;

	342 delete set2a;

	343

	344 logln("Testing copy construction");

	345 UnicodeSet set1copy=new UnicodeSet(set1);

	346 if(set1 != set1copy \|\| set1 == set2 \|\|

	347 getPairs(set1) != getPairs(set1copy) \|\|

	348 set1->hashCode() != set1copy->hashCode()){

	349 errln("FAIL : Error in copy construction");

	350 return;

	351 }

	352

	353 logln("Testing =operator");

	354 UnicodeSet set1equal=*set1;

	355 UnicodeSet set2equal=*set2;

	356 if(set1equal != set1 \|\| set1equal != set1copy \|\| set2equal != *set2 \|\|

	357 set2equal == set1 \|\| set2equal == set1copy \|\| set2equal == set1equal){

	358 errln("FAIL: Error in =operator");

	359 }

	360

	361 logln("Testing clone()");

	362 UnicodeSet set1clone=(UnicodeSet)set1->clone();

	363 UnicodeSet set2clone=(UnicodeSet)set2->clone();

	364 if(set1clone != set1 \|\| set1clone != set1copy \|\| *set1clone != set1equal \|\|

	365 set2clone != set2 \|\| set2clone == set1copy \|\| *set2clone != set2equa l \|\|

	366 set2clone == set1 \|\| set2clone == set1equal \|\| set2clone == *set1clo ne){

	367 errln("FAIL: Error in clone");

	368 }

	369

	370 logln("Testing hashcode");

	371 if(set1->hashCode() != set1equal.hashCode() \|\| set1->hashCode() != set1clone ->hashCode() \|\|

	372 set2->hashCode() != set2equal.hashCode() \|\| set2->hashCode() != set2clon e->hashCode() \|\|

	373 set1copy->hashCode() != set1equal.hashCode() \|\| set1copy->hashCode() != set1clone->hashCode() \|\|

	374 set1->hashCode() == set2->hashCode() \|\| set1copy->hashCode() == set2->h ashCode() \|\|

	375 set2->hashCode() == set1clone->hashCode() \|\| set2->hashCode() == set1equ al.hashCode() ){

	376 errln("FAIL: Error in hashCode()");

	377 }

	378

	379 delete set1;

	380 delete set1copy;

	381 delete set2;

	382 delete set1clone;

	383 delete set2clone;

	384

	385

	386 }

	387 void

	388 UnicodeSetTest::TestAddRemove(void) {

	389 UnicodeSet set; // Construct empty set

	390 doAssert(set.isEmpty() == TRUE, "set should be empty");

	391 doAssert(set.size() == 0, "size should be 0");

	392 set.complement();

	393 doAssert(set.size() == 0x110000, "size should be 0x110000");

	394 set.clear();

	395 set.add(0x0061, 0x007a);

	396 expectPairs(set, "az");

	397 doAssert(set.isEmpty() == FALSE, "set should not be empty");

	398 doAssert(set.size() != 0, "size should not be equal to 0");

	399 doAssert(set.size() == 26, "size should be equal to 26");

	400 set.remove(0x006d, 0x0070);

	401 expectPairs(set, "alqz");

	402 doAssert(set.size() == 22, "size should be equal to 22");

	403 set.remove(0x0065, 0x0067);

	404 expectPairs(set, "adhlqz");

	405 doAssert(set.size() == 19, "size should be equal to 19");

	406 set.remove(0x0064, 0x0069);

	407 expectPairs(set, "acjlqz");

	408 doAssert(set.size() == 16, "size should be equal to 16");

	409 set.remove(0x0063, 0x0072);

	410 expectPairs(set, "absz");

	411 doAssert(set.size() == 10, "size should be equal to 10");

	412 set.add(0x0066, 0x0071);

	413 expectPairs(set, "abfqsz");

	414 doAssert(set.size() == 22, "size should be equal to 22");

	415 set.remove(0x0061, 0x0067);

	416 expectPairs(set, "hqsz");

	417 set.remove(0x0061, 0x007a);

	418 expectPairs(set, "");

	419 doAssert(set.isEmpty() == TRUE, "set should be empty");

	420 doAssert(set.size() == 0, "size should be 0");

	421 set.add(0x0061);

	422 doAssert(set.isEmpty() == FALSE, "set should not be empty");

	423 doAssert(set.size() == 1, "size should not be equal to 1");

	424 set.add(0x0062);

	425 set.add(0x0063);

	426 expectPairs(set, "ac");

	427 doAssert(set.size() == 3, "size should not be equal to 3");

	428 set.add(0x0070);

	429 set.add(0x0071);

	430 expectPairs(set, "acpq");

	431 doAssert(set.size() == 5, "size should not be equal to 5");

	432 set.clear();

	433 expectPairs(set, "");

	434 doAssert(set.isEmpty() == TRUE, "set should be empty");

	435 doAssert(set.size() == 0, "size should be 0");

	436

	437 // Try removing an entire set from another set

	438 expectPattern(set, "[c-x]", "cx");

	439 UnicodeSet set2;

	440 expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");

	441 set.removeAll(set2);

	442 expectPairs(set, "deluxx");

	443

	444 // Try adding an entire set to another set

	445 expectPattern(set, "[jackiemclean]", "aacceein");

	446 expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");

	447 set.addAll(set2);

	448 expectPairs(set, "aacehort");

	449 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");

	450

	451 // Try retaining an set of elements contained in another set (intersection)

	452 UnicodeSet set3;

	453 expectPattern(set3, "[a-c]", "ac");

	454 doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elemen ts in set3");

	455 set3.remove(0x0062);

	456 expectPairs(set3, "aacc");

	457 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");

	458 set.retainAll(set3);

	459 expectPairs(set, "aacc");

	460 doAssert(set.size() == set3.size(), "set.size() should be set3.size()");

	461 doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");

	462 set.clear();

	463 doAssert(set.size() != set3.size(), "set.size() != set3.size()");

	464

	465 // Test commutativity

	466 expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");

	467 expectPattern(set2, "[jackiemclean]", "aacceein");

	468 set.addAll(set2);

	469 expectPairs(set, "aacehort");

	470 doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");

	471

	472

	473

	474

	475 }

	476

	477 /**

	478 * Make sure minimal representation is maintained.

	479 */

	480 void UnicodeSetTest::TestMinimalRep() {

	481 UErrorCode status = U_ZERO_ERROR;

	482 // This is pretty thoroughly tested by checkCanonicalRep()

	483 // run against the exhaustive operation results. Use the code

	484 // here for debugging specific spot problems.

	485

	486 // 1 overlap against 2

	487 UnicodeSet set("[h-km-q]", status);

	488 if (U_FAILURE(status)) { errln("FAIL"); return; }

	489 UnicodeSet set2("[i-o]", status);

	490 if (U_FAILURE(status)) { errln("FAIL"); return; }

	491 set.addAll(set2);

	492 expectPairs(set, "hq");

	493 // right

	494 set.applyPattern("[a-m]", status);

	495 if (U_FAILURE(status)) { errln("FAIL"); return; }

	496 set2.applyPattern("[e-o]", status);

	497 if (U_FAILURE(status)) { errln("FAIL"); return; }

	498 set.addAll(set2);

	499 expectPairs(set, "ao");

	500 // left

	501 set.applyPattern("[e-o]", status);

	502 if (U_FAILURE(status)) { errln("FAIL"); return; }

	503 set2.applyPattern("[a-m]", status);

	504 if (U_FAILURE(status)) { errln("FAIL"); return; }

	505 set.addAll(set2);

	506 expectPairs(set, "ao");

	507 // 1 overlap against 3

	508 set.applyPattern("[a-eg-mo-w]", status);

	509 if (U_FAILURE(status)) { errln("FAIL"); return; }

	510 set2.applyPattern("[d-q]", status);

	511 if (U_FAILURE(status)) { errln("FAIL"); return; }

	512 set.addAll(set2);

	513 expectPairs(set, "aw");

	514 }

	515

	516 void UnicodeSetTest::TestAPI() {

	517 UErrorCode status = U_ZERO_ERROR;

	518 // default ct

	519 UnicodeSet set;

	520 if (!set.isEmpty() \|\| set.getRangeCount() != 0) {

	521 errln((UnicodeString)"FAIL, set should be empty but isn't: " +

	522 set);

	523 }

	524

	525 // clear(), isEmpty()

	526 set.add(0x0061);

	527 if (set.isEmpty()) {

	528 errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +

	529 set);

	530 }

	531 set.clear();

	532 if (!set.isEmpty()) {

	533 errln((UnicodeString)"FAIL, set should be empty but isn't: " +

	534 set);

	535 }

	536

	537 // size()

	538 set.clear();

	539 if (set.size() != 0) {

	540 errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +

	541 ": " + set);

	542 }

	543 set.add(0x0061);

	544 if (set.size() != 1) {

	545 errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +

	546 ": " + set);

	547 }

	548 set.add(0x0031, 0x0039);

	549 if (set.size() != 10) {

	550 errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +

	551 ": " + set);

	552 }

	553

	554 // contains(first, last)

	555 set.clear();

	556 set.applyPattern("[A-Y 1-8 b-d l-y]", status);

	557 if (U_FAILURE(status)) { errln("FAIL"); return; }

	558 for (int32_t i = 0; i<set.getRangeCount(); ++i) {

	559 UChar32 a = set.getRangeStart(i);

	560 UChar32 b = set.getRangeEnd(i);

	561 if (!set.contains(a, b)) {

	562 errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + ' -' + (unsigned short)b +

	563 " but doesn't: " + set);

	564 }

	565 if (set.contains((UChar32)(a-1), b)) {

	566 errln((UnicodeString)"FAIL, shouldn't contain " +

	567 (unsigned short)(a-1) + '-' + (unsigned short)b +

	568 " but does: " + set);

	569 }

	570 if (set.contains(a, (UChar32)(b+1))) {

	571 errln((UnicodeString)"FAIL, shouldn't contain " +

	572 (unsigned short)a + '-' + (unsigned short)(b+1) +

	573 " but does: " + set);

	574 }

	575 }

	576

	577 // Ported InversionList test.

	578 UnicodeSet a((UChar32)3,(UChar32)10);

	579 UnicodeSet b((UChar32)7,(UChar32)15);

	580 UnicodeSet c;

	581

	582 logln((UnicodeString)"a [3-10]: " + a);

	583 logln((UnicodeString)"b [7-15]: " + b);

	584 c = a;

	585 c.addAll(b);

	586 UnicodeSet exp((UChar32)3,(UChar32)15);

	587 if (c == exp) {

	588 logln((UnicodeString)"c.set(a).add(b): " + c);

	589 } else {

	590 errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp) ;

	591 }

	592 c.complement();

	593 exp.set((UChar32)0, (UChar32)2);

	594 exp.add((UChar32)16, UnicodeSet::MAX_VALUE);

	595 if (c == exp) {

	596 logln((UnicodeString)"c.complement(): " + c);

	597 } else {

	598 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);

	599 }

	600 c.complement();

	601 exp.set((UChar32)3, (UChar32)15);

	602 if (c == exp) {

	603 logln((UnicodeString)"c.complement(): " + c);

	604 } else {

	605 errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);

	606 }

	607 c = a;

	608 c.complementAll(b);

	609 exp.set((UChar32)3,(UChar32)6);

	610 exp.add((UChar32)11,(UChar32) 15);

	611 if (c == exp) {

	612 logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);

	613 } else {

	614 errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);

	615 }

	616

	617 exp = c;

	618 bitsToSet(setToBits(c), c);

	619 if (c == exp) {

	620 logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);

	621 } else {

	622 errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);

	623 }

	624

	625 // Additional tests for coverage JB#2118

	626 //UnicodeSet::complement(class UnicodeString const &)

	627 //UnicodeSet::complementAll(class UnicodeString const &)

	628 //UnicodeSet::containsNone(class UnicodeSet const &)

	629 //UnicodeSet::containsNone(long,long)

	630 //UnicodeSet::containsSome(class UnicodeSet const &)

	631 //UnicodeSet::containsSome(long,long)

	632 //UnicodeSet::removeAll(class UnicodeString const &)

	633 //UnicodeSet::retain(long)

	634 //UnicodeSet::retainAll(class UnicodeString const &)

	635 //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)

	636 //UnicodeSetIterator::getString(void)

	637 set.clear();

	638 set.complement("ab");

	639 exp.applyPattern("[{ab}]", status);

	640 if (U_FAILURE(status)) { errln("FAIL"); return; }

	641 if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }

	642

	643 UnicodeSetIterator iset(set);

	644 if (!iset.next() \|\| !iset.isString()) {

	645 errln("FAIL: UnicodeSetIterator::next/isString");

	646 } else if (iset.getString() != "ab") {

	647 errln("FAIL: UnicodeSetIterator::getString");

	648 }

	649

	650 set.add((UChar32)0x61, (UChar32)0x7A);

	651 set.complementAll("alan");

	652 exp.applyPattern("[{ab}b-kmo-z]", status);

	653 if (U_FAILURE(status)) { errln("FAIL"); return; }

	654 if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }

	655

	656 exp.applyPattern("[a-z]", status);

	657 if (U_FAILURE(status)) { errln("FAIL"); return; }

	658 if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }

	659 if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }

	660 exp.applyPattern("[aln]", status);

	661 if (U_FAILURE(status)) { errln("FAIL"); return; }

	662 if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }

	663 if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }

	664

	665 if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {

	666 errln("FAIL: containsNone(UChar32, UChar32)");

	667 }

	668 if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {

	669 errln("FAIL: containsSome(UChar32, UChar32)");

	670 }

	671 if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {

	672 errln("FAIL: containsNone(UChar32, UChar32)");

	673 }

	674 if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {

	675 errln("FAIL: containsSome(UChar32, UChar32)");

	676 }

	677

	678 set.removeAll("liu");

	679 exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);

	680 if (U_FAILURE(status)) { errln("FAIL"); return; }

	681 if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }

	682

	683 set.retainAll("star");

	684 exp.applyPattern("[rst]", status);

	685 if (U_FAILURE(status)) { errln("FAIL"); return; }

	686 if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }

	687

	688 set.retain((UChar32)0x73);

	689 exp.applyPattern("[s]", status);

	690 if (U_FAILURE(status)) { errln("FAIL"); return; }

	691 if (set != exp) { errln("FAIL: retain('s')"); return; }

	692

	693 uint16_t buf[32];

	694 int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);

	695 if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }

	696 if (slen != 3 \|\| buf[0] != 2 \|\| buf[1] != 0x73 \|\| buf[2] != 0x74) {

	697 errln("FAIL: serialize");

	698 return;

	699 }

	700

	701 // Conversions to and from USet

	702 UnicodeSet *uniset = &set;

	703 USet *uset = uniset->toUSet();

	704 TEST_ASSERT((void )uset == (void )uniset);

	705 UnicodeSet *setx = UnicodeSet::fromUSet(uset);

	706 TEST_ASSERT((void )setx == (void )uset);

	707 const UnicodeSet *constSet = uniset;

	708 const USet *constUSet = constSet->toUSet();

	709 TEST_ASSERT((void )constUSet == (void )constSet);

	710 const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);

	711 TEST_ASSERT((void )constSetx == (void )constUSet);

	712

	713 // span(UnicodeString) and spanBack(UnicodeString) convenience methods

	714 UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccc cc");

	715 UnicodeSet ac(0x61, 0x63);

	716 ac.remove(0x62).freeze();

	717 if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 \|\|

	718 ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 \|\|

	719 ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 \|\|

	720 ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 \|\|

	721 ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 \|\|

	722 ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 \|\|

	723 ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 \|\|

	724 ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 \|\|

	725 ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 \|\|

	726 ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30

	727 ) {

	728 errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes ");

	729 }

	730 if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 \|\|

	731 ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 \|\|

	732 ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 \|\|

	733 ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 \|\|

	734 ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 \|\|

	735 ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 \|\|

	736 ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 \|\|

	737 ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 \|\|

	738 ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 \|\|

	739 ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20

	740 ) {

	741 errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start i ndexes");

	742 }

	743 }

	744

	745 void UnicodeSetTest::TestIteration() {

	746 UErrorCode ec = U_ZERO_ERROR;

	747 int i = 0;

	748 int outerLoop;

	749

	750 // 6 code points, 3 ranges, 2 strings, 8 total elements

	751 // Iteration will access them in sorted order - a, b, c, y, z, U0001abcd, "str1", "str2"

	752 UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);

	753 TEST_ASSERT_SUCCESS(ec);

	754 UnicodeSetIterator it(set);

	755

	756 for (outerLoop=0; outerLoop<3; outerLoop++) {

	757 // Run the test multiple times, to check that iterator.reset() is workin g.

	758 for (i=0; i<10; i++) {

	759 UBool nextv = it.next();

	760 UBool isString = it.isString();

	761 int32_t codePoint = it.getCodepoint();

	762 //int32_t codePointEnd = it.getCodepointEnd();

	763 UnicodeString s = it.getString();

	764 switch (i) {

	765 case 0:

	766 TEST_ASSERT(nextv == TRUE);

	767 TEST_ASSERT(isString == FALSE);

	768 TEST_ASSERT(codePoint==0x61);

	769 TEST_ASSERT(s == "a");

	770 break;

	771 case 1:

	772 TEST_ASSERT(nextv == TRUE);

	773 TEST_ASSERT(isString == FALSE);

	774 TEST_ASSERT(codePoint==0x62);

	775 TEST_ASSERT(s == "b");

	776 break;

	777 case 2:

	778 TEST_ASSERT(nextv == TRUE);

	779 TEST_ASSERT(isString == FALSE);

	780 TEST_ASSERT(codePoint==0x63);

	781 TEST_ASSERT(s == "c");

	782 break;

	783 case 3:

	784 TEST_ASSERT(nextv == TRUE);

	785 TEST_ASSERT(isString == FALSE);

	786 TEST_ASSERT(codePoint==0x79);

	787 TEST_ASSERT(s == "y");

	788 break;

	789 case 4:

	790 TEST_ASSERT(nextv == TRUE);

	791 TEST_ASSERT(isString == FALSE);

	792 TEST_ASSERT(codePoint==0x7a);

	793 TEST_ASSERT(s == "z");

	794 break;

	795 case 5:

	796 TEST_ASSERT(nextv == TRUE);

	797 TEST_ASSERT(isString == FALSE);

	798 TEST_ASSERT(codePoint==0x1abcd);

	799 TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));

	800 break;

	801 case 6:

	802 TEST_ASSERT(nextv == TRUE);

	803 TEST_ASSERT(isString == TRUE);

	804 TEST_ASSERT(s == "str1");

	805 break;

	806 case 7:

	807 TEST_ASSERT(nextv == TRUE);

	808 TEST_ASSERT(isString == TRUE);

	809 TEST_ASSERT(s == "str2");

	810 break;

	811 case 8:

	812 TEST_ASSERT(nextv == FALSE);

	813 break;

	814 case 9:

	815 TEST_ASSERT(nextv == FALSE);

	816 break;

	817 }

	818 }

	819 it.reset(); // prepare to run the iteration again.

	820 }

	821 }

	822

	823

	824

	825

	826 void UnicodeSetTest::TestStrings() {

	827 UErrorCode ec = U_ZERO_ERROR;

	828

	829 UnicodeSet* testList[] = {

	830 UnicodeSet::createFromAll("abc"),

	831 new UnicodeSet("[a-c]", ec),

	832

	833 &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),

	834 new UnicodeSet("[{ll}{ch}a-z]", ec),

	835

	836 UnicodeSet::createFrom("ab}c"),

	837 new UnicodeSet("[{ab\\}c}]", ec),

	838

	839 &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X ')),

	840 new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),

	841

	842 NULL

	843 };

	844

	845 if (U_FAILURE(ec)) {

	846 errln("FAIL: couldn't construct test sets");

	847 }

	848

	849 for (int32_t i = 0; testList[i] != NULL; i+=2) {

	850 if (U_SUCCESS(ec)) {

	851 UnicodeString pat0, pat1;

	852 testList[i]->toPattern(pat0, TRUE);

	853 testList[i+1]->toPattern(pat1, TRUE);

	854 if (testList[i] == testList[i+1]) {

	855 logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);

	856 } else {

	857 logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);

	858 }

	859 }

	860 delete testList[i];

	861 delete testList[i+1];

	862 }

	863 }

	864

	865 /**

	866 * Test the [:Latin:] syntax.

	867 */

	868 void UnicodeSetTest::TestScriptSet() {

	869 expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeSt ring("\\u0391\\u03B1"));

	870

	871 expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString(" \\u0391\\u03B1"), "aA");

	872

	873 /* Jitterbug 1423 */

	874 expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsT oUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");

	875

	876 }

	877

	878 /**

	879 * Test the [:Latin:] syntax.

	880 */

	881 void UnicodeSetTest::TestPropertySet() {

	882 static const char* const DATA[] = {

	883 // Pattern, Chars IN, Chars NOT in

	884

	885 "[:Latin:]",

	886 "aA",

	887 "\\u0391\\u03B1",

	888

	889 "[\\p{Greek}]",

	890 "\\u0391\\u03B1",

	891 "aA",

	892

	893 "\\P{ GENERAL Category = upper case letter }",

	894 "abc",

	895 "ABC",

	896

	897 #if !UCONFIG_NO_NORMALIZATION

	898 // Combining class: @since ICU 2.2

	899 // Check both symbolic and numeric

	900 "\\p{ccc=Nukta}",

	901 "\\u0ABC",

	902 "abc",

	903

	904 "\\p{Canonical Combining Class = 11}",

	905 "\\u05B1",

	906 "\\u05B2",

	907

	908 "[:c c c = iota subscript :]",

	909 "\\u0345",

	910 "xyz",

	911 #endif

	912

	913 // Bidi class: @since ICU 2.2

	914 "\\p{bidiclass=lefttoright}",

	915 "abc",

	916 "\\u0671\\u0672",

	917

	918 // Binary properties: @since ICU 2.2

	919 "\\p{ideographic}",

	920 "\\u4E0A",

	921 "x",

	922

	923 "[:math=false:]",

	924 "q)*(",

	925 // weiv: )(and * were removed from math in Unicode 4.0.1

	926 //"(*+)",

	927 "+<>^",

	928

	929 // JB#1767 \N{}, \p{ASCII}

	930 "[:Ascii:]",

	931 "abc\\u0000\\u007F",

	932 "\\u0080\\u4E00",

	933

	934 "[\\N{ latin small letter a }[:name= latin small letter z:]]",

	935 "az",

	936 "qrs",

	937

	938 // JB#2015

	939 "[:any:]",

	940 "a\\U0010FFFF",

	941 "",

	942

	943 "[:nv=0.5:]",

	944 "\\u00BD\\u0F2A",

	945 "\\u00BC",

	946

	947 // JB#2653: Age

	948 "[:Age=1.1:]",

	949 "\\u03D6", // 1.1

	950 "\\u03D8\\u03D9", // 3.2

	951

	952 "[:Age=3.1:]",

	953 "\\u1800\\u3400\\U0002f800",

	954 "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",

	955

	956 // JB#2350: Case_Sensitive

	957 "[:Case Sensitive:]",

	958 "A\\u1FFC\\U00010410",

	959 ";\\u00B4\\U00010500",

	960

	961 // JB#2832: C99-compatibility props

	962 "[:blank:]",

	963 " \\u0009",

	964 "1-9A-Z",

	965

	966 "[:graph:]",

	967 "19AZ",

	968 " \\u0003\\u0007\\u0009\\u000A\\u000D",

	969

	970 "[:punct:]",

	971 "!@#%&*()[]{}-_\\/;:,.?'\"",

	972 "09azAZ",

	973

	974 "[:xdigit:]",

	975 "09afAF",

	976 "gG!",

	977

	978 // Regex compatibility test

	979 "[-b]", // leading '-' is literal

	980 "-b",

	981 "ac",

	982

	983 "[^-b]", // leading '-' is literal

	984 "ac",

	985 "-b",

	986

	987 "[b-]", // trailing '-' is literal

	988 "-b",

	989 "ac",

	990

	991 "[^b-]", // trailing '-' is literal

	992 "ac",

	993 "-b",

	994

	995 "[a-b-]", // trailing '-' is literal

	996 "ab-",

	997 "c=",

	998

	999 "[[a-q]&[p-z]-]", // trailing '-' is literal

	1000 "pq-",

	1001 "or=",

	1002

	1003 "[\\s\|\\)\|:\|$\|\\>]", // from regex tests

	1004 "s\|):$>",

	1005 "abc",

	1006

	1007 "[\\uDC00cd]", // JB#2906: isolated trail at start

	1008 "cd\\uDC00",

	1009 "ab\\uD800\\U00010000",

	1010

	1011 "[ab\\uD800]", // JB#2906: isolated trail at start

	1012 "ab\\uD800",

	1013 "cd\\uDC00\\U00010000",

	1014

	1015 "[ab\\uD800cd]", // JB#2906: isolated lead in middle

	1016 "abcd\\uD800",

	1017 "ef\\uDC00\\U00010000",

	1018

	1019 "[ab\\uDC00cd]", // JB#2906: isolated trail in middle

	1020 "abcd\\uDC00",

	1021 "ef\\uD800\\U00010000",

	1022

	1023 #if !UCONFIG_NO_NORMALIZATION

	1024 "[:^lccc=0:]", // Lead canonical class

	1025 "\\u0300\\u0301",

	1026 "abcd\\u00c0\\u00c5",

	1027

	1028 "[:^tccc=0:]", // Trail canonical class

	1029 "\\u0300\\u0301\\u00c0\\u00c5",

	1030 "abcd",

	1031

	1032 "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class

	1033 "\\u0300\\u0301\\u00c0\\u00c5",

	1034 "abcd",

	1035

	1036 "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but end s with a base (none right now)

	1037 "",

	1038 "abcd\\u0300\\u0301\\u00c0\\u00c5",

	1039

	1040 "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical clas s is zero, but both lead and trail are not

	1041 "\\u0F73\\u0F75\\u0F81",

	1042 "abcd\\u0300\\u0301\\u00c0\\u00c5",

	1043 #endif /* !UCONFIG_NO_NORMALIZATION */

	1044

	1045 "[:Assigned:]",

	1046 "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",

	1047 "\\u0888\\uFDD3\\uFFFE\\U00050005",

	1048

	1049 // Script_Extensions, new in Unicode 6.0

	1050 "[:scx=Arab:]",

	1051 "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\ \uFDF3",

	1052 "\\u061D\\u065F\\uFDEF\\uFDFE",

	1053

	1054 // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,

	1055 // so scx-sc is missing U+FDF2.

	1056 "[[:Script_Extensions=Arabic:]-[:Arab:]]",

	1057 "\\u0640\\u064B\\u0650\\u0655\\uFDFD",

	1058 "\\uFDF2"

	1059 };

	1060

	1061 static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);

	1062

	1063 for (int32_t i=0; i<DATA_LEN; i+=3) {

	1064 expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeStri ng(DATA[i+1]),

	1065 CharsToUnicodeString(DATA[i+2]));

	1066 }

	1067 }

	1068

	1069 /**

	1070 * Test that Posix style character classes [:digit:], etc.

	1071 * have the Unicode definitions from TR 18.

	1072 */

	1073 void UnicodeSetTest::TestPosixClasses() {

	1074 {

	1075 UErrorCode status = U_ZERO_ERROR;

	1076 UnicodeSet s1("[:alpha:]", status);

	1077 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);

	1078 TEST_ASSERT_SUCCESS(status);

	1079 TEST_ASSERT(s1==s2);

	1080 }

	1081 {

	1082 UErrorCode status = U_ZERO_ERROR;

	1083 UnicodeSet s1("[:lower:]", status);

	1084 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);

	1085 TEST_ASSERT_SUCCESS(status);

	1086 TEST_ASSERT(s1==s2);

	1087 }

	1088 {

	1089 UErrorCode status = U_ZERO_ERROR;

	1090 UnicodeSet s1("[:upper:]", status);

	1091 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);

	1092 TEST_ASSERT_SUCCESS(status);

	1093 TEST_ASSERT(s1==s2);

	1094 }

	1095 {

	1096 UErrorCode status = U_ZERO_ERROR;

	1097 UnicodeSet s1("[:punct:]", status);

	1098 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);

	1099 TEST_ASSERT_SUCCESS(status);

	1100 TEST_ASSERT(s1==s2);

	1101 }

	1102 {

	1103 UErrorCode status = U_ZERO_ERROR;

	1104 UnicodeSet s1("[:digit:]", status);

	1105 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);

	1106 TEST_ASSERT_SUCCESS(status);

	1107 TEST_ASSERT(s1==s2);

	1108 }

	1109 {

	1110 UErrorCode status = U_ZERO_ERROR;

	1111 UnicodeSet s1("[:xdigit:]", status);

	1112 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]") , status);

	1113 TEST_ASSERT_SUCCESS(status);

	1114 TEST_ASSERT(s1==s2);

	1115 }

	1116 {

	1117 UErrorCode status = U_ZERO_ERROR;

	1118 UnicodeSet s1("[:alnum:]", status);

	1119 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}] "), status);

	1120 TEST_ASSERT_SUCCESS(status);

	1121 TEST_ASSERT(s1==s2);

	1122 }

	1123 {

	1124 UErrorCode status = U_ZERO_ERROR;

	1125 UnicodeSet s1("[:space:]", status);

	1126 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);

	1127 TEST_ASSERT_SUCCESS(status);

	1128 TEST_ASSERT(s1==s2);

	1129 }

	1130 {

	1131 UErrorCode status = U_ZERO_ERROR;

	1132 UnicodeSet s1("[:blank:]", status);

	1133 TEST_ASSERT_SUCCESS(status);

	1134 UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u 000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),

	1135 status);

	1136 TEST_ASSERT_SUCCESS(status);

	1137 TEST_ASSERT(s1==s2);

	1138 }

	1139 {

	1140 UErrorCode status = U_ZERO_ERROR;

	1141 UnicodeSet s1("[:cntrl:]", status);

	1142 TEST_ASSERT_SUCCESS(status);

	1143 UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);

	1144 TEST_ASSERT_SUCCESS(status);

	1145 TEST_ASSERT(s1==s2);

	1146 }

	1147 {

	1148 UErrorCode status = U_ZERO_ERROR;

	1149 UnicodeSet s1("[:graph:]", status);

	1150 TEST_ASSERT_SUCCESS(status);

	1151 UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Su rrogate}\\p{Unassigned}]"), status);

	1152 TEST_ASSERT_SUCCESS(status);

	1153 TEST_ASSERT(s1==s2);

	1154 }

	1155 {

	1156 UErrorCode status = U_ZERO_ERROR;

	1157 UnicodeSet s1("[:print:]", status);

	1158 TEST_ASSERT_SUCCESS(status);

	1159 UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]] ") ,status);

	1160 TEST_ASSERT_SUCCESS(status);

	1161 TEST_ASSERT(s1==s2);

	1162 }

	1163 }

	1164 /**

	1165 * Test cloning of UnicodeSet. For C++, we test the copy constructor.

	1166 */

	1167 void UnicodeSetTest::TestClone() {

	1168 UErrorCode ec = U_ZERO_ERROR;

	1169 UnicodeSet s("[abcxyz]", ec);

	1170 UnicodeSet t(s);

	1171 expectContainment(t, "abc", "def");

	1172 }

	1173

	1174 /**

	1175 * Test the indexOf() and charAt() methods.

	1176 */

	1177 void UnicodeSetTest::TestIndexOf() {

	1178 UErrorCode ec = U_ZERO_ERROR;

	1179 UnicodeSet set("[a-cx-y3578]", ec);

	1180 if (U_FAILURE(ec)) {

	1181 errln("FAIL: UnicodeSet constructor");

	1182 return;

	1183 }

	1184 for (int32_t i=0; i<set.size(); ++i) {

	1185 UChar32 c = set.charAt(i);

	1186 if (set.indexOf(c) != i) {

	1187 errln("FAIL: charAt(%d) = %X => indexOf() => %d",

	1188 i, c, set.indexOf(c));

	1189 }

	1190 }

	1191 UChar32 c = set.charAt(set.size());

	1192 if (c != -1) {

	1193 errln("FAIL: charAt(<out of range>) = %X", c);

	1194 }

	1195 int32_t j = set.indexOf((UChar32)0x71/'q'/);

	1196 if (j != -1) {

	1197 errln((UnicodeString)"FAIL: indexOf('q') = " + j);

	1198 }

	1199 }

	1200

	1201 /**

	1202 * Test closure API.

	1203 */

	1204 void UnicodeSetTest::TestCloseOver() {

	1205 UErrorCode ec = U_ZERO_ERROR;

	1206

	1207 char CASE[] = {(char)USET_CASE_INSENSITIVE};

	1208 char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};

	1209 const char* DATA[] = {

	1210 // selector, input, output

	1211 CASE,

	1212 "[aq\\u00DF{Bc}{bC}{Fi}]",

	1213 "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]", // U+1E9E LATIN CAPITAL LETT ER SHARP S is new in Unicode 5.1

	1214

	1215 CASE,

	1216 "[\\u01F1]", // 'DZ'

	1217 "[\\u01F1\\u01F2\\u01F3]",

	1218

	1219 CASE,

	1220 "[\\u1FB4]",

	1221 "[\\u1FB4{\\u03AC\\u03B9}]",

	1222

	1223 CASE,

	1224 "[{F\\uFB01}]",

	1225 "[\\uFB03{ffi}]",

	1226

	1227 CASE, // make sure binary search finds limits

	1228 "[a\\uFF3A]",

	1229 "[aA\\uFF3A\\uFF5A]",

	1230

	1231 CASE,

	1232 "[a-z]","[A-Za-z\\u017F\\u212A]",

	1233 CASE,

	1234 "[abc]","[A-Ca-c]",

	1235 CASE,

	1236 "[ABC]","[A-Ca-c]",

	1237

	1238 CASE, "[i]", "[iI]",

	1239

	1240 CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I

	1241 CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot

	1242

	1243 CASE, "[\\u0131]", "[\\u0131]", // dotless i

	1244

	1245 CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",

	1246

	1247 CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas

	1248

	1249 CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas

	1250

	1251 CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",

	1252

	1253 CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",

	1254

	1255 CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",

	1256 CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",

	1257

	1258 CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",

	1259

	1260 CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted ta ble

	1261

	1262 CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sort ed table

	1263

	1264 #if !UCONFIG_NO_FILE_IO

	1265 CASE_MAPPINGS,

	1266 "[aq\\u00DF{Bc}{bC}{Fi}]",

	1267 "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",

	1268 #endif

	1269

	1270 CASE_MAPPINGS,

	1271 "[\\u01F1]", // 'DZ'

	1272 "[\\u01F1\\u01F2\\u01F3]",

	1273

	1274 CASE_MAPPINGS,

	1275 "[a-z]",

	1276 "[A-Za-z]",

	1277

	1278 NULL

	1279 };

	1280

	1281 UnicodeSet s;

	1282 UnicodeSet t;

	1283 UnicodeString buf;

	1284 for (int32_t i=0; DATA[i]!=NULL; i+=3) {

	1285 int32_t selector = DATA[i][0];

	1286 UnicodeString pat(DATA[i+1], -1, US_INV);

	1287 UnicodeString exp(DATA[i+2], -1, US_INV);

	1288 s.applyPattern(pat, ec);

	1289 s.closeOver(selector);

	1290 t.applyPattern(exp, ec);

	1291 if (U_FAILURE(ec)) {

	1292 errln("FAIL: applyPattern failed");

	1293 continue;

	1294 }

	1295 if (s == t) {

	1296 logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);

	1297 } else {

	1298 dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +

	1299 s.toPattern(buf, TRUE) + ", expected " + exp);

	1300 }

	1301 }

	1302

	1303 #if 0

	1304 /*

	1305 * Unused test code.

	1306 * This was used to compare the old implementation (using USET_CASE)

	1307 * with the new one (using 0x100 temporarily)

	1308 * while transitioning from hardcoded case closure tables in uniset.cpp

	1309 * (moved to uniset_props.cpp) to building the data by gencase into ucase.ic u.

	1310 * and using ucase.c functions for closure.

	1311 * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file

	1312 *

	1313 * Note: The old and new implementation never fully matched because

	1314 * the old implementation turned out to not map U+0130 and U+0131 correctly

	1315 * (dotted I and dotless i) and because the old implementation's data tables

	1316 * were outdated compared to Unicode 4.0.1 at the time of the change to the

	1317 * new implementation. (So sigmas and some other characters were not handled

	1318 * according to the newer Unicode version.)

	1319 */

	1320 UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;

	1321 UnicodeSetIterator si(sens);

	1322 UnicodeString str, buf2;

	1323 const UnicodeString *pStr;

	1324 UChar32 c;

	1325 while(si.next()) {

	1326 if(!si.isString()) {

	1327 c=si.getCodepoint();

	1328 s.clear();

	1329 s.add(c);

	1330

	1331 str.setTo(c);

	1332 str.foldCase();

	1333 sens2.add(str);

	1334

	1335 t=s;

	1336 s.closeOver(USET_CASE);

	1337 t.closeOver(0x100);

	1338 if(s!=t) {

	1339 errln("FAIL: closeOver(U+%04x) differs: ", c);

	1340 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.to Pattern(buf2, TRUE));

	1341 }

	1342 }

	1343 }

	1344 // remove all code points

	1345 // should contain all full case folding mapping strings

	1346 sens2.remove(0, 0x10ffff);

	1347 si.reset(sens2);

	1348 while(si.next()) {

	1349 if(si.isString()) {

	1350 pStr=&si.getString();

	1351 s.clear();

	1352 s.add(*pStr);

	1353 t=s2=s;

	1354 s.closeOver(USET_CASE);

	1355 t.closeOver(0x100);

	1356 if(s!=t) {

	1357 errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+ ") differs: ");

	1358 errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.to Pattern(buf2, TRUE));

	1359 }

	1360 }

	1361 }

	1362 #endif

	1363

	1364 // Test the pattern API

	1365 s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);

	1366 if (U_FAILURE(ec)) {

	1367 errln("FAIL: applyPattern failed");

	1368 } else {

	1369 expectContainment(s, "abcABC", "defDEF");

	1370 }

	1371 UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);

	1372 if (U_FAILURE(ec)) {

	1373 errln("FAIL: constructor failed");

	1374 } else {

	1375 expectContainment(v, "defDEF", "abcABC");

	1376 }

	1377 UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);

	1378 if (U_FAILURE(ec)) {

	1379 errln("FAIL: construct w/case mappings failed");

	1380 } else {

	1381 expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A")) ;

	1382 }

	1383 }

	1384

	1385 void UnicodeSetTest::TestEscapePattern() {

	1386 const char pattern[] =

	1387 "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFF D ]";

	1388 const char exp[] =

	1389 "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]" ;

	1390 // We test this with two passes; in the second pass we

	1391 // pre-unescape the pattern. Since U+200E is rule whitespace,

	1392 // this fails -- which is what we expect.

	1393 for (int32_t pass=1; pass<=2; ++pass) {

	1394 UErrorCode ec = U_ZERO_ERROR;

	1395 UnicodeString pat(pattern, -1, US_INV);

	1396 if (pass==2) {

	1397 pat = pat.unescape();

	1398 }

	1399 // Pattern is only good for pass 1

	1400 UBool isPatternValid = (pass==1);

	1401

	1402 UnicodeSet set(pat, ec);

	1403 if (U_SUCCESS(ec) != isPatternValid){

	1404 errln((UnicodeString)"FAIL: applyPattern(" +

	1405 escape(pat) + ") => " +

	1406 u_errorName(ec));

	1407 continue;

	1408 }

	1409 if (U_FAILURE(ec)) {

	1410 continue;

	1411 }

	1412 if (set.contains((UChar)0x0644)){

	1413 errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");

	1414 }

	1415

	1416 UnicodeString newpat;

	1417 set.toPattern(newpat, TRUE);

	1418 if (newpat == UnicodeString(exp, -1, US_INV)) {

	1419 logln(escape(pat) + " => " + newpat);

	1420 } else {

	1421 errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);

	1422 }

	1423

	1424 for (int32_t i=0; i<set.getRangeCount(); ++i) {

	1425 UnicodeString str("Range ");

	1426 str.append((UChar)(0x30 + i))

	1427 .append(": ")

	1428 .append((UChar32)set.getRangeStart(i))

	1429 .append(" - ")

	1430 .append((UChar32)set.getRangeEnd(i));

	1431 str = str + " (" + set.getRangeStart(i) + " - " +

	1432 set.getRangeEnd(i) + ")";

	1433 if (set.getRangeStart(i) < 0) {

	1434 errln((UnicodeString)"FAIL: " + escape(str));

	1435 } else {

	1436 logln(escape(str));

	1437 }

	1438 }

	1439 }

	1440 }

	1441

	1442 void UnicodeSetTest::expectRange(const UnicodeString& label,

	1443 const UnicodeSet& set,

	1444 UChar32 start, UChar32 end) {

	1445 UnicodeSet exp(start, end);

	1446 UnicodeString pat;

	1447 if (set == exp) {

	1448 logln(label + " => " + set.toPattern(pat, TRUE));

	1449 } else {

	1450 UnicodeString xpat;

	1451 errln((UnicodeString)"FAIL: " + label + " => " +

	1452 set.toPattern(pat, TRUE) +

	1453 ", expected " + exp.toPattern(xpat, TRUE));

	1454 }

	1455 }

	1456

	1457 void UnicodeSetTest::TestInvalidCodePoint() {

	1458

	1459 const UChar32 DATA[] = {

	1460 // Test range Expected range

	1461 0, 0x10FFFF, 0, 0x10FFFF,

	1462 (UChar32)-1, 8, 0, 8,

	1463 8, 0x110000, 8, 0x10FFFF

	1464 };

	1465 const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);

	1466

	1467 UnicodeString pat;

	1468 int32_t i;

	1469

	1470 for (i=0; i<DATA_LENGTH; i+=4) {

	1471 UChar32 start = DATA[i];

	1472 UChar32 end = DATA[i+1];

	1473 UChar32 xstart = DATA[i+2];

	1474 UChar32 xend = DATA[i+3];

	1475

	1476 // Try various API using the test code points

	1477

	1478 UnicodeSet set(start, end);

	1479 expectRange((UnicodeString)"ct(" + start + "," + end + ")",

	1480 set, xstart, xend);

	1481

	1482 set.clear();

	1483 set.set(start, end);

	1484 expectRange((UnicodeString)"set(" + start + "," + end + ")",

	1485 set, xstart, xend);

	1486

	1487 UBool b = set.contains(start);

	1488 b = set.contains(start, end);

	1489 b = set.containsNone(start, end);

	1490 b = set.containsSome(start, end);

	1491

	1492 /int32_t index = set.indexOf(start);/

	1493

	1494 set.clear();

	1495 set.add(start);

	1496 set.add(start, end);

	1497 expectRange((UnicodeString)"add(" + start + "," + end + ")",

	1498 set, xstart, xend);

	1499

	1500 set.set(0, 0x10FFFF);

	1501 set.retain(start, end);

	1502 expectRange((UnicodeString)"retain(" + start + "," + end + ")",

	1503 set, xstart, xend);

	1504 set.retain(start);

	1505

	1506 set.set(0, 0x10FFFF);

	1507 set.remove(start);

	1508 set.remove(start, end);

	1509 set.complement();

	1510 expectRange((UnicodeString)"!remove(" + start + "," + end + ")",

	1511 set, xstart, xend);

	1512

	1513 set.set(0, 0x10FFFF);

	1514 set.complement(start, end);

	1515 set.complement();

	1516 expectRange((UnicodeString)"!complement(" + start + "," + end + ")",

	1517 set, xstart, xend);

	1518 set.complement(start);

	1519 }

	1520

	1521 const UChar32 DATA2[] = {

	1522 0,

	1523 0x10FFFF,

	1524 (UChar32)-1,

	1525 0x110000

	1526 };

	1527 const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);

	1528

	1529 for (i=0; i<DATA2_LENGTH; ++i) {

	1530 UChar32 c = DATA2[i], end = 0x10FFFF;

	1531 UBool valid = (c >= 0 && c <= 0x10FFFF);

	1532

	1533 UnicodeSet set(0, 0x10FFFF);

	1534

	1535 // For single-codepoint contains, invalid codepoints are NOT contained

	1536 UBool b = set.contains(c);

	1537 if (b == valid) {

	1538 logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +

	1539 ") = " + b);

	1540 } else {

	1541 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +

	1542 ") = " + b);

	1543 }

	1544

	1545 // For codepoint range contains, containsNone, and containsSome,

	1546 // invalid or empty (start > end) ranges have UNDEFINED behavior.

	1547 b = set.contains(c, end);

	1548 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +

	1549 "," + end + ") = " + b);

	1550

	1551 b = set.containsNone(c, end);

	1552 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +

	1553 "," + end + ") = " + b);

	1554

	1555 b = set.containsSome(c, end);

	1556 logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +

	1557 "," + end + ") = " + b);

	1558

	1559 int32_t index = set.indexOf(c);

	1560 if ((index >= 0) == valid) {

	1561 logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +

	1562 ") = " + index);

	1563 } else {

	1564 errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +

	1565 ") = " + index);

	1566 }

	1567 }

	1568 }

	1569

	1570 // Used by TestSymbolTable

	1571 class TokenSymbolTable : public SymbolTable {

	1572 public:

	1573 Hashtable contents;

	1574

	1575 TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {

	1576 contents.setValueDeleter(uhash_deleteUnicodeString);

	1577 }

	1578

	1579 ~TokenSymbolTable() {}

	1580

	1581 /**

	1582 * (Non-SymbolTable API) Add the given variable and value to

	1583 * the table. Variable should NOT contain leading '$'.

	1584 */

	1585 void add(const UnicodeString& var, const UnicodeString& value,

	1586 UErrorCode& ec) {

	1587 if (U_SUCCESS(ec)) {

	1588 contents.put(var, new UnicodeString(value), ec);

	1589 }

	1590 }

	1591

	1592 /**

	1593 * SymbolTable API

	1594 */

	1595 virtual const UnicodeString* lookup(const UnicodeString& s) const {

	1596 return (const UnicodeString*) contents.get(s);

	1597 }

	1598

	1599 /**

	1600 * SymbolTable API

	1601 */

	1602 virtual const UnicodeFunctor* lookupMatcher(UChar32 /ch/) const {

	1603 return NULL;

	1604 }

	1605

	1606 /**

	1607 * SymbolTable API

	1608 */

	1609 virtual UnicodeString parseReference(const UnicodeString& text,

	1610 ParsePosition& pos, int32_t limit) cons t {

	1611 int32_t start = pos.getIndex();

	1612 int32_t i = start;

	1613 UnicodeString result;

	1614 while (i < limit) {

	1615 UChar c = text.charAt(i);

	1616 if ((i==start && !u_isIDStart(c)) \|\| !u_isIDPart(c)) {

	1617 break;

	1618 }

	1619 ++i;

	1620 }

	1621 if (i == start) { // No valid name chars

	1622 return result; // Indicate failure with empty string

	1623 }

	1624 pos.setIndex(i);

	1625 text.extractBetween(start, i, result);

	1626 return result;

	1627 }

	1628 };

	1629

	1630 void UnicodeSetTest::TestSymbolTable() {

	1631 // Multiple test cases can be set up here. Each test case

	1632 // is terminated by null:

	1633 // var, value, var, value,..., input pat., exp. output pat., null

	1634 const char* DATA[] = {

	1635 "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,

	1636 "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,

	1637 "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,

	1638 NULL

	1639 };

	1640

	1641 for (int32_t i=0; DATA[i]!=NULL; ++i) {

	1642 UErrorCode ec = U_ZERO_ERROR;

	1643 TokenSymbolTable sym(ec);

	1644 if (U_FAILURE(ec)) {

	1645 errln("FAIL: couldn't construct TokenSymbolTable");

	1646 continue;

	1647 }

	1648

	1649 // Set up variables

	1650 while (DATA[i+2] != NULL) {

	1651 sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);

	1652 if (U_FAILURE(ec)) {

	1653 errln("FAIL: couldn't add to TokenSymbolTable");

	1654 continue;

	1655 }

	1656 i += 2;

	1657 }

	1658

	1659 // Input pattern and expected output pattern

	1660 UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = Unico deString(DATA[i+1], -1, US_INV);

	1661 i += 2;

	1662

	1663 ParsePosition pos(0);

	1664 UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);

	1665 if (U_FAILURE(ec)) {

	1666 errln("FAIL: couldn't construct UnicodeSet");

	1667 continue;

	1668 }

	1669

	1670 // results

	1671 if (pos.getIndex() != inpat.length()) {

	1672 errln((UnicodeString)"Failed to read to end of string \""

	1673 + inpat + "\": read to "

	1674 + pos.getIndex() + ", length is "

	1675 + inpat.length());

	1676 }

	1677

	1678 UnicodeSet us2(exppat, ec);

	1679 if (U_FAILURE(ec)) {

	1680 errln("FAIL: couldn't construct expected UnicodeSet");

	1681 continue;

	1682 }

	1683

	1684 UnicodeString a, b;

	1685 if (us != us2) {

	1686 errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +

	1687 ", expected " + us2.toPattern(b, TRUE));

	1688 } else {

	1689 logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));

	1690 }

	1691 }

	1692 }

	1693

	1694 void UnicodeSetTest::TestSurrogate() {

	1695 const char* DATA[] = {

	1696 // These should all behave identically

	1697 "[abc\\uD800\\uDC00]",

	1698 // "[abc\uD800\uDC00]", // Can't do this on C -- only Java

	1699 "[abc\\U00010000]",

	1700 0

	1701 };

	1702 for (int i=0; DATA[i] != 0; ++i) {

	1703 UErrorCode ec = U_ZERO_ERROR;

	1704 logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));

	1705 UnicodeString str = UnicodeString(DATA[i], -1, US_INV);

	1706 UnicodeSet set(str, ec);

	1707 if (U_FAILURE(ec)) {

	1708 errln("FAIL: UnicodeSet constructor");

	1709 continue;

	1710 }

	1711 expectContainment(set,

	1712 CharsToUnicodeString("abc\\U00010000"),

	1713 CharsToUnicodeString("\\uD800;\\uDC00")); // split apa rt surrogate-pair

	1714 if (set.size() != 4) {

	1715 errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +

	1716 set.size() + ", expected 4");

	1717 }

	1718 }

	1719 }

	1720

	1721 void UnicodeSetTest::TestExhaustive() {

	1722 // exhaustive tests. Simulate UnicodeSets with integers.

	1723 // That gives us very solid tests (except for large memory tests).

	1724

	1725 int32_t limit = 128;

	1726

	1727 UnicodeSet x, y, z, aa;

	1728

	1729 for (int32_t i = 0; i < limit; ++i) {

	1730 bitsToSet(i, x);

	1731 logln((UnicodeString)"Testing " + i + ", " + x);

	1732 _testComplement(i, x, y);

	1733

	1734 // AS LONG AS WE ARE HERE, check roundtrip

	1735 checkRoundTrip(bitsToSet(i, aa));

	1736

	1737 for (int32_t j = 0; j < limit; ++j) {

	1738 _testAdd(i,j, x,y,z);

	1739 _testXor(i,j, x,y,z);

	1740 _testRetain(i,j, x,y,z);

	1741 _testRemove(i,j, x,y,z);

	1742 }

	1743 }

	1744 }

	1745

	1746 void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {

	1747 bitsToSet(a, x);

	1748 z = x;

	1749 z.complement();

	1750 int32_t c = setToBits(z);

	1751 if (c != (~a)) {

	1752 errln((UnicodeString)"FAILED: add: ~" + x + " != " + z);

	1753 errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);

	1754 }

	1755 checkCanonicalRep(z, (UnicodeString)"complement " + a);

	1756 }

	1757

	1758 void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y , UnicodeSet& z) {

	1759 bitsToSet(a, x);

	1760 bitsToSet(b, y);

	1761 z = x;

	1762 z.addAll(y);

	1763 int32_t c = setToBits(z);

	1764 if (c != (a \| b)) {

	1765 errln((UnicodeString)"FAILED: add: " + x + " \| " + y + " != " + z);

	1766 errln((UnicodeString)"FAILED: add: " + a + " \| " + b + " != " + c);

	1767 }

	1768 checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);

	1769 }

	1770

	1771 void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet & y, UnicodeSet& z) {

	1772 bitsToSet(a, x);

	1773 bitsToSet(b, y);

	1774 z = x;

	1775 z.retainAll(y);

	1776 int32_t c = setToBits(z);

	1777 if (c != (a & b)) {

	1778 errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);

	1779 errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);

	1780 }

	1781 checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);

	1782 }

	1783

	1784 void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet & y, UnicodeSet& z) {

	1785 bitsToSet(a, x);

	1786 bitsToSet(b, y);

	1787 z = x;

	1788 z.removeAll(y);

	1789 int32_t c = setToBits(z);

	1790 if (c != (a &~ b)) {

	1791 errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);

	1792 errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);

	1793 }

	1794 checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);

	1795 }

	1796

	1797 void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y , UnicodeSet& z) {

	1798 bitsToSet(a, x);

	1799 bitsToSet(b, y);

	1800 z = x;

	1801 z.complementAll(y);

	1802 int32_t c = setToBits(z);

	1803 if (c != (a ^ b)) {

	1804 errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z );

	1805 errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c );

	1806 }

	1807 checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);

	1808 }

	1809

	1810 /**

	1811 * Check that ranges are monotonically increasing and non-

	1812 * overlapping.

	1813 */

	1814 void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeStrin g& msg) {

	1815 int32_t n = set.getRangeCount();

	1816 if (n < 0) {

	1817 errln((UnicodeString)"FAIL result of " + msg +

	1818 ": range count should be >= 0 but is " +

	1819 n /+ " for " + set.toPattern())/);

	1820 return;

	1821 }

	1822 UChar32 last = 0;

	1823 for (int32_t i=0; i<n; ++i) {

	1824 UChar32 start = set.getRangeStart(i);

	1825 UChar32 end = set.getRangeEnd(i);

	1826 if (start > end) {

	1827 errln((UnicodeString)"FAIL result of " + msg +

	1828 ": range " + (i+1) +

	1829 " start > end: " + (int)start + ", " + (int)end +

	1830 " for " + set);

	1831 }

	1832 if (i > 0 && start <= last) {

	1833 errln((UnicodeString)"FAIL result of " + msg +

	1834 ": range " + (i+1) +

	1835 " overlaps previous range: " + (int)start + ", " + (int)end +

	1836 " for " + set);

	1837 }

	1838 last = end;

	1839 }

	1840 }

	1841

	1842 /**

	1843 * Convert a bitmask to a UnicodeSet.

	1844 */

	1845 UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {

	1846 result.clear();

	1847 for (UChar32 i = 0; i < 32; ++i) {

	1848 if ((a & (1<<i)) != 0) {

	1849 result.add(i);

	1850 }

	1851 }

	1852 return result;

	1853 }

	1854

	1855 /**

	1856 * Convert a UnicodeSet to a bitmask. Only the characters

	1857 * U+0000 to U+0020 are represented in the bitmask.

	1858 */

	1859 int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {

	1860 int32_t result = 0;

	1861 for (int32_t i = 0; i < 32; ++i) {

	1862 if (x.contains((UChar32)i)) {

	1863 result \|= (1<<i);

	1864 }

	1865 }

	1866 return result;

	1867 }

	1868

	1869 /**

	1870 * Return the representation of an inversion list based UnicodeSet

	1871 * as a pairs list. Ranges are listed in ascending Unicode order.

	1872 * For example, the set [a-zA-M3] is represented as "33AMaz".

	1873 */

	1874 UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {

	1875 UnicodeString pairs;

	1876 for (int32_t i=0; i<set.getRangeCount(); ++i) {

	1877 UChar32 start = set.getRangeStart(i);

	1878 UChar32 end = set.getRangeEnd(i);

	1879 if (end > 0xFFFF) {

	1880 end = 0xFFFF;

	1881 i = set.getRangeCount(); // Should be unnecessary

	1882 }

	1883 pairs.append((UChar)start).append((UChar)end);

	1884 }

	1885 return pairs;

	1886 }

	1887

	1888 /**

	1889 * Basic consistency check for a few items.

	1890 * That the iterator works, and that we can create a pattern and

	1891 * get the same thing back

	1892 */

	1893 void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {

	1894 UErrorCode ec = U_ZERO_ERROR;

	1895

	1896 UnicodeSet t(s);

	1897 checkEqual(s, t, "copy ct");

	1898

	1899 t = s;

	1900 checkEqual(s, t, "operator=");

	1901

	1902 copyWithIterator(t, s, FALSE);

	1903 checkEqual(s, t, "iterator roundtrip");

	1904

	1905 copyWithIterator(t, s, TRUE); // try range

	1906 checkEqual(s, t, "iterator roundtrip");

	1907

	1908 UnicodeString pat; s.toPattern(pat, FALSE);

	1909 t.applyPattern(pat, ec);

	1910 if (U_FAILURE(ec)) {

	1911 errln("FAIL: applyPattern");

	1912 return;

	1913 } else {

	1914 checkEqual(s, t, "toPattern(false)");

	1915 }

	1916

	1917 s.toPattern(pat, TRUE);

	1918 t.applyPattern(pat, ec);

	1919 if (U_FAILURE(ec)) {

	1920 errln("FAIL: applyPattern");

	1921 return;

	1922 } else {

	1923 checkEqual(s, t, "toPattern(true)");

	1924 }

	1925 }

	1926

	1927 void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {

	1928 t.clear();

	1929 UnicodeSetIterator it(s);

	1930 if (withRange) {

	1931 while (it.nextRange()) {

	1932 if (it.isString()) {

	1933 t.add(it.getString());

	1934 } else {

	1935 t.add(it.getCodepoint(), it.getCodepointEnd());

	1936 }

	1937 }

	1938 } else {

	1939 while (it.next()) {

	1940 if (it.isString()) {

	1941 t.add(it.getString());

	1942 } else {

	1943 t.add(it.getCodepoint());

	1944 }

	1945 }

	1946 }

	1947 }

	1948

	1949 UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {

	1950 UnicodeString source; s.toPattern(source, TRUE);

	1951 UnicodeString result; t.toPattern(result, TRUE);

	1952 if (s != t) {

	1953 errln((UnicodeString)"FAIL: " + message

	1954 + "; source = " + source

	1955 + "; result = " + result

	1956 );

	1957 return FALSE;

	1958 } else {

	1959 logln((UnicodeString)"Ok: " + message

	1960 + "; source = " + source

	1961 + "; result = " + result

	1962 );

	1963 }

	1964 return TRUE;

	1965 }

	1966

	1967 void

	1968 UnicodeSetTest::expectContainment(const UnicodeString& pat,

	1969 const UnicodeString& charsIn,

	1970 const UnicodeString& charsOut) {

	1971 UErrorCode ec = U_ZERO_ERROR;

	1972 UnicodeSet set(pat, ec);

	1973 if (U_FAILURE(ec)) {

	1974 dataerrln((UnicodeString)"FAIL: pattern \"" +

	1975 pat + "\" => " + u_errorName(ec));

	1976 return;

	1977 }

	1978 expectContainment(set, pat, charsIn, charsOut);

	1979 }

	1980

	1981 void

	1982 UnicodeSetTest::expectContainment(const UnicodeSet& set,

	1983 const UnicodeString& charsIn,

	1984 const UnicodeString& charsOut) {

	1985 UnicodeString pat;

	1986 set.toPattern(pat);

	1987 expectContainment(set, pat, charsIn, charsOut);

	1988 }

	1989

	1990 void

	1991 UnicodeSetTest::expectContainment(const UnicodeSet& set,

	1992 const UnicodeString& setName,

	1993 const UnicodeString& charsIn,

	1994 const UnicodeString& charsOut) {

	1995 UnicodeString bad;

	1996 UChar32 c;

	1997 int32_t i;

	1998

	1999 for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {

	2000 c = charsIn.char32At(i);

	2001 if (!set.contains(c)) {

	2002 bad.append(c);

	2003 }

	2004 }

	2005 if (bad.length() > 0) {

	2006 errln((UnicodeString)"Fail: set " + setName + " does not contain " + pre ttify(bad) +

	2007 ", expected containment of " + prettify(charsIn));

	2008 } else {

	2009 logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(char sIn));

	2010 }

	2011

	2012 bad.truncate(0);

	2013 for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {

	2014 c = charsOut.char32At(i);

	2015 if (set.contains(c)) {

	2016 bad.append(c);

	2017 }

	2018 }

	2019 if (bad.length() > 0) {

	2020 errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(ba d) +

	2021 ", expected non-containment of " + prettify(charsOut));

	2022 } else {

	2023 logln((UnicodeString)"Ok: set " + setName + " does not contain " + prett ify(charsOut));

	2024 }

	2025 }

	2026

	2027 void

	2028 UnicodeSetTest::expectPattern(UnicodeSet& set,

	2029 const UnicodeString& pattern,

	2030 const UnicodeString& expectedPairs){

	2031 UErrorCode status = U_ZERO_ERROR;

	2032 set.applyPattern(pattern, status);

	2033 if (U_FAILURE(status)) {

	2034 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +

	2035 "\") failed");

	2036 return;

	2037 } else {

	2038 if (getPairs(set) != expectedPairs ) {

	2039 errln(UnicodeString("FAIL: applyPattern(\"") + pattern +

	2040 "\") => pairs \"" +

	2041 escape(getPairs(set)) + "\", expected \"" +

	2042 escape(expectedPairs) + "\"");

	2043 } else {

	2044 logln(UnicodeString("Ok: applyPattern(\"") + pattern +

	2045 "\") => pairs \"" +

	2046 escape(getPairs(set)) + "\"");

	2047 }

	2048 }

	2049 // the result of calling set.toPattern(), which is the string representation of

	2050 // this set(set), is passed to a UnicodeSet constructor, and tested that it

	2051 // will produce another set that is equal to this one.

	2052 UnicodeString temppattern;

	2053 set.toPattern(temppattern);

	2054 UnicodeSet *tempset=new UnicodeSet(temppattern, status);

	2055 if (U_FAILURE(status)) {

	2056 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));

	2057 return;

	2058 }

	2059 if(tempset != set \|\| getPairs(tempset) != getPairs(set)){

	2060 errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +

	2061 escape(getPairs(set)) + "\""));

	2062 } else{

	2063 logln(UnicodeString("Ok: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));

	2064 }

	2065

	2066 delete tempset;

	2067

	2068 }

	2069

	2070 void

	2071 UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expected Pairs) {

	2072 if (getPairs(set) != expectedPairs) {

	2073 errln(UnicodeString("FAIL: Expected pair list \"") +

	2074 escape(expectedPairs) + "\", got \"" +

	2075 escape(getPairs(set)) + "\"");

	2076 }

	2077 }

	2078

	2079 void UnicodeSetTest::expectToPattern(const UnicodeSet& set,

	2080 const UnicodeString& expPat,

	2081 const char** expStrings) {

	2082 UnicodeString pat;

	2083 set.toPattern(pat, TRUE);

	2084 if (pat == expPat) {

	2085 logln((UnicodeString)"Ok: toPattern() => \"" + pat + "\"");

	2086 } else {

	2087 errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");

	2088 return;

	2089 }

	2090 if (expStrings == NULL) {

	2091 return;

	2092 }

	2093 UBool in = TRUE;

	2094 for (int32_t i=0; expStrings[i] != NULL; ++i) {

	2095 if (expStrings[i] == NOT) { // sic; pointer comparison

	2096 in = FALSE;

	2097 continue;

	2098 }

	2099 UnicodeString s = CharsToUnicodeString(expStrings[i]);

	2100 UBool contained = set.contains(s);

	2101 if (contained == in) {

	2102 logln((UnicodeString)"Ok: " + expPat +

	2103 (contained ? " contains {" : " does not contain {") +

	2104 escape(expStrings[i]) + "}");

	2105 } else {

	2106 errln((UnicodeString)"FAIL: " + expPat +

	2107 (contained ? " contains {" : " does not contain {") +

	2108 escape(expStrings[i]) + "}");

	2109 }

	2110 }

	2111 }

	2112

	2113 static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }

	2114

	2115 void

	2116 UnicodeSetTest::doAssert(UBool condition, const char *message)

	2117 {

	2118 if (!condition) {

	2119 errln(UnicodeString("ERROR : ") + message);

	2120 }

	2121 }

	2122

	2123 UnicodeString

	2124 UnicodeSetTest::escape(const UnicodeString& s) {

	2125 UnicodeString buf;

	2126 for (int32_t i=0; i<s.length(); )

	2127 {

	2128 UChar32 c = s.char32At(i);

	2129 if (0x0020 <= c && c <= 0x007F) {

	2130 buf += c;

	2131 } else {

	2132 if (c <= 0xFFFF) {

	2133 buf += (UChar)0x5c; buf += (UChar)0x75;

	2134 } else {

	2135 buf += (UChar)0x5c; buf += (UChar)0x55;

	2136 buf += toHexString((c & 0xF0000000) >> 28);

	2137 buf += toHexString((c & 0x0F000000) >> 24);

	2138 buf += toHexString((c & 0x00F00000) >> 20);

	2139 buf += toHexString((c & 0x000F0000) >> 16);

	2140 }

	2141 buf += toHexString((c & 0xF000) >> 12);

	2142 buf += toHexString((c & 0x0F00) >> 8);

	2143 buf += toHexString((c & 0x00F0) >> 4);

	2144 buf += toHexString(c & 0x000F);

	2145 }

	2146 i += U16_LENGTH(c);

	2147 }

	2148 return buf;

	2149 }

	2150

	2151 void UnicodeSetTest::TestFreezable() {

	2152 UErrorCode errorCode=U_ZERO_ERROR;

	2153 UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);

	2154 UnicodeSet idSet(idPattern, errorCode);

	2155 if(U_FAILURE(errorCode)) {

	2156 dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_e rrorName(errorCode));

	2157 return;

	2158 }

	2159

	2160 UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);

	2161 UnicodeSet wsSet(wsPattern, errorCode);

	2162 if(U_FAILURE(errorCode)) {

	2163 dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_e rrorName(errorCode));

	2164 return;

	2165 }

	2166

	2167 idSet.add(idPattern);

	2168 UnicodeSet frozen(idSet);

	2169 frozen.freeze();

	2170

	2171 if(idSet.isFrozen() \|\| !frozen.isFrozen()) {

	2172 errln("FAIL: isFrozen() is wrong");

	2173 }

	2174 if(frozen!=idSet \|\| !(frozen==idSet)) {

	2175 errln("FAIL: a copy-constructed frozen set differs from its original");

	2176 }

	2177

	2178 frozen=wsSet;

	2179 if(frozen!=idSet \|\| !(frozen==idSet)) {

	2180 errln("FAIL: a frozen set was modified by operator=");

	2181 }

	2182

	2183 UnicodeSet frozen2(frozen);

	2184 if(frozen2!=frozen \|\| frozen2!=idSet) {

	2185 errln("FAIL: a copied frozen set differs from its frozen original");

	2186 }

	2187 if(!frozen2.isFrozen()) {

	2188 errln("FAIL: copy-constructing a frozen set results in a thawed one");

	2189 }

	2190 UnicodeSet frozen3(5, 55); // Set to some values to really test assignment below, not copy construction.

	2191 if(frozen3.contains(0, 4) \|\| !frozen3.contains(5, 55) \|\| frozen3.contains(56 , 0x10ffff)) {

	2192 errln("FAIL: UnicodeSet(5, 55) failed");

	2193 }

	2194 frozen3=frozen;

	2195 if(!frozen3.isFrozen()) {

	2196 errln("FAIL: copying a frozen set results in a thawed one");

	2197 }

	2198

	2199 UnicodeSet cloned=(UnicodeSet )frozen.clone();

	2200 if(!cloned->isFrozen() \|\| *cloned!=frozen \|\| cloned->containsSome(0xd802, 0x d805)) {

	2201 errln("FAIL: clone() failed");

	2202 }

	2203 cloned->add(0xd802, 0xd805);

	2204 if(cloned->containsSome(0xd802, 0xd805)) {

	2205 errln("FAIL: unable to modify clone");

	2206 }

	2207 delete cloned;

	2208

	2209 UnicodeSet thawed=(UnicodeSet )frozen.cloneAsThawed();

	2210 if(thawed->isFrozen() \|\| *thawed!=frozen \|\| thawed->containsSome(0xd802, 0xd 805)) {

	2211 errln("FAIL: cloneAsThawed() failed");

	2212 }

	2213 thawed->add(0xd802, 0xd805);

	2214 if(!thawed->contains(0xd802, 0xd805)) {

	2215 errln("FAIL: unable to modify thawed clone");

	2216 }

	2217 delete thawed;

	2218

	2219 frozen.set(5, 55);

	2220 if(frozen!=idSet \|\| !(frozen==idSet)) {

	2221 errln("FAIL: UnicodeSet::set() modified a frozen set");

	2222 }

	2223

	2224 frozen.clear();

	2225 if(frozen!=idSet \|\| !(frozen==idSet)) {

	2226 errln("FAIL: UnicodeSet::clear() modified a frozen set");

	2227 }

	2228

	2229 frozen.closeOver(USET_CASE_INSENSITIVE);

	2230 if(frozen!=idSet \|\| !(frozen==idSet)) {

	2231 errln("FAIL: UnicodeSet::closeOver() modified a frozen set");

	2232 }

	2233

	2234 frozen.compact();

	2235 if(frozen!=idSet \|\| !(frozen==idSet)) {

	2236 errln("FAIL: UnicodeSet::compact() modified a frozen set");

	2237 }

	2238

	2239 ParsePosition pos;

	2240 frozen.

	2241 applyPattern(wsPattern, errorCode).

	2242 applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).

	2243 applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).

	2244 applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).

	2245 applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), e rrorCode);

	2246 if(frozen!=idSet \|\| !(frozen==idSet)) {

	2247 errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");

	2248 }

	2249

	2250 frozen.

	2251 add(0xd800).

	2252 add(0xd802, 0xd805).

	2253 add(wsPattern).

	2254 addAll(idPattern).

	2255 addAll(wsSet);

	2256 if(frozen!=idSet \|\| !(frozen==idSet)) {

	2257 errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");

	2258 }

	2259

	2260 frozen.

	2261 retain(0x62).

	2262 retain(0x64, 0x69).

	2263 retainAll(wsPattern).

	2264 retainAll(wsSet);

	2265 if(frozen!=idSet \|\| !(frozen==idSet)) {

	2266 errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");

	2267 }

	2268

	2269 frozen.

	2270 remove(0x62).

	2271 remove(0x64, 0x69).

	2272 remove(idPattern).

	2273 removeAll(idPattern).

	2274 removeAll(idSet);

	2275 if(frozen!=idSet \|\| !(frozen==idSet)) {

	2276 errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");

	2277 }

	2278

	2279 frozen.

	2280 complement().

	2281 complement(0x62).

	2282 complement(0x64, 0x69).

	2283 complement(idPattern).

	2284 complementAll(idPattern).

	2285 complementAll(idSet);

	2286 if(frozen!=idSet \|\| !(frozen==idSet)) {

	2287 errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");

	2288 }

	2289 }

	2290

	2291 // Test span() etc. -------------------------------------------------------- ***

	2292

	2293 // Append the UTF-8 version of the string to t and return the appended UTF-8 len gth.

	2294 static int32_t

	2295 appendUTF8(const UChar s, int32_t length, char t, int32_t capacity) {

	2296 UErrorCode errorCode=U_ZERO_ERROR;

	2297 int32_t length8=0;

	2298 u_strToUTF8(t, capacity, &length8, s, length, &errorCode);

	2299 if(U_SUCCESS(errorCode)) {

	2300 return length8;

	2301 } else {

	2302 // The string contains an unpaired surrogate.

	2303 // Ignore this string.

	2304 return 0;

	2305 }

	2306 }

	2307

	2308 class UnicodeSetWithStringsIterator;

	2309

	2310 // Make the strings in a UnicodeSet easily accessible.

	2311 class UnicodeSetWithStrings {

	2312 public:

	2313 UnicodeSetWithStrings(const UnicodeSet &normalSet) :

	2314 set(normalSet), stringsLength(0), hasSurrogates(FALSE) {

	2315 int32_t size=set.size();

	2316 if(size>0 && set.charAt(size-1)<0) {

	2317 // If a set's last element is not a code point, then it must contain strings.

	2318 // Iterate over the set, skip all code point ranges, and cache the s trings.

	2319 // Convert them to UTF-8 for spanUTF8().

	2320 UnicodeSetIterator iter(set);

	2321 const UnicodeString *s;

	2322 char *s8=utf8;

	2323 int32_t length8, utf8Count=0;

	2324 while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {

	2325 if(iter.isString()) {

	2326 // Store the pointer to the set's string element

	2327 // which we happen to know is a stable pointer.

	2328 strings[stringsLength]=s=&iter.getString();

	2329 utf8Count+=

	2330 utf8Lengths[stringsLength]=length8=

	2331 appendUTF8(s->getBuffer(), s->length(),

	2332 s8, (int32_t)(sizeof(utf8)-utf8Count));

	2333 if(length8==0) {

	2334 hasSurrogates=TRUE; // Contains unpaired surrogates.

	2335 }

	2336 s8+=length8;

	2337 ++stringsLength;

	2338 }

	2339 }

	2340 }

	2341 }

	2342

	2343 const UnicodeSet &getSet() const {

	2344 return set;

	2345 }

	2346

	2347 UBool hasStrings() const {

	2348 return (UBool)(stringsLength>0);

	2349 }

	2350

	2351 UBool hasStringsWithSurrogates() const {

	2352 return hasSurrogates;

	2353 }

	2354

	2355 private:

	2356 friend class UnicodeSetWithStringsIterator;

	2357

	2358 const UnicodeSet &set;

	2359

	2360 const UnicodeString *strings[20];

	2361 int32_t stringsLength;

	2362 UBool hasSurrogates;

	2363

	2364 char utf8[1024];

	2365 int32_t utf8Lengths[20];

	2366

	2367 int32_t nextStringIndex;

	2368 int32_t nextUTF8Start;

	2369 };

	2370

	2371 class UnicodeSetWithStringsIterator {

	2372 public:

	2373 UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :

	2374 fSet(set), nextStringIndex(0), nextUTF8Start(0) {

	2375 }

	2376

	2377 void reset() {

	2378 nextStringIndex=nextUTF8Start=0;

	2379 }

	2380

	2381 const UnicodeString *nextString() {

	2382 if(nextStringIndex<fSet.stringsLength) {

	2383 return fSet.strings[nextStringIndex++];

	2384 } else {

	2385 return NULL;

	2386 }

	2387 }

	2388

	2389 // Do not mix with calls to nextString().

	2390 const char *nextUTF8(int32_t &length) {

	2391 if(nextStringIndex<fSet.stringsLength) {

	2392 const char *s8=fSet.utf8+nextUTF8Start;

	2393 nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];

	2394 return s8;

	2395 } else {

	2396 length=0;

	2397 return NULL;

	2398 }

	2399 }

	2400

	2401 private:

	2402 const UnicodeSetWithStrings &fSet;

	2403 int32_t nextStringIndex;

	2404 int32_t nextUTF8Start;

	2405 };

	2406

	2407 // Compare 16-bit Unicode strings (which may be malformed UTF-16)

	2408 // at code point boundaries.

	2409 // That is, each edge of a match must not be in the middle of a surrogate pair.

	2410 static inline UBool

	2411 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString & t) {

	2412 s+=start;

	2413 limit-=start;

	2414 int32_t length=t.length();

	2415 return 0==t.compare(s, length) &&

	2416 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&

	2417 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]) );

	2418 }

	2419

	2420 // Implement span() with contains() for comparison.

	2421 static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar * s, int32_t length,

	2422 USetSpanCondition spanCondition) {

	2423 const UnicodeSet &realSet(set.getSet());

	2424 if(!set.hasStrings()) {

	2425 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {

	2426 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.

	2427 }

	2428

	2429 UChar32 c;

	2430 int32_t start=0, prev;

	2431 while((prev=start)<length) {

	2432 U16_NEXT(s, start, length, c);

	2433 if(realSet.contains(c)!=spanCondition) {

	2434 break;

	2435 }

	2436 }

	2437 return prev;

	2438 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {

	2439 UnicodeSetWithStringsIterator iter(set);

	2440 UChar32 c;

	2441 int32_t start, next;

	2442 for(start=next=0; start<length;) {

	2443 U16_NEXT(s, next, length, c);

	2444 if(realSet.contains(c)) {

	2445 break;

	2446 }

	2447 const UnicodeString *str;

	2448 iter.reset();

	2449 while((str=iter.nextString())!=NULL) {

	2450 if(str->length()<=(length-start) && matches16CPB(s, start, lengt h, *str)) {

	2451 // spanNeedsStrings=TRUE;

	2452 return start;

	2453 }

	2454 }

	2455 start=next;

	2456 }

	2457 return start;

	2458 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {

	2459 UnicodeSetWithStringsIterator iter(set);

	2460 UChar32 c;

	2461 int32_t start, next, maxSpanLimit=0;

	2462 for(start=next=0; start<length;) {

	2463 U16_NEXT(s, next, length, c);

	2464 if(!realSet.contains(c)) {

	2465 next=start; // Do not span this single, not-contained code poin t.

	2466 }

	2467 const UnicodeString *str;

	2468 iter.reset();

	2469 while((str=iter.nextString())!=NULL) {

	2470 if(str->length()<=(length-start) && matches16CPB(s, start, lengt h, *str)) {

	2471 // spanNeedsStrings=TRUE;

	2472 int32_t matchLimit=start+str->length();

	2473 if(matchLimit==length) {

	2474 return length;

	2475 }

	2476 if(spanCondition==USET_SPAN_CONTAINED) {

	2477 // Iterate for the shortest match at each position.

	2478 // Recurse for each but the shortest match.

	2479 if(next==start) {

	2480 next=matchLimit; // First match from start.

	2481 } else {

	2482 if(matchLimit<next) {

	2483 // Remember shortest match from start for iterat ion.

	2484 int32_t temp=next;

	2485 next=matchLimit;

	2486 matchLimit=temp;

	2487 }

	2488 // Recurse for non-shortest match from start.

	2489 int32_t spanLength=containsSpanUTF16(set, s+matchLim it, length-matchLimit,

	2490 USET_SPAN_CONTA INED);

	2491 if((matchLimit+spanLength)>maxSpanLimit) {

	2492 maxSpanLimit=matchLimit+spanLength;

	2493 if(maxSpanLimit==length) {

	2494 return length;

	2495 }

	2496 }

	2497 }

	2498 } else /* spanCondition==USET_SPAN_SIMPLE */ {

	2499 if(matchLimit>next) {

	2500 // Remember longest match from start.

	2501 next=matchLimit;

	2502 }

	2503 }

	2504 }

	2505 }

	2506 if(next==start) {

	2507 break; // No match from start.

	2508 }

	2509 start=next;

	2510 }

	2511 if(start>maxSpanLimit) {

	2512 return start;

	2513 } else {

	2514 return maxSpanLimit;

	2515 }

	2516 }

	2517 }

	2518

	2519 static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UCh ar *s, int32_t length,

	2520 USetSpanCondition spanCondition) {

	2521 if(length==0) {

	2522 return 0;

	2523 }

	2524 const UnicodeSet &realSet(set.getSet());

	2525 if(!set.hasStrings()) {

	2526 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {

	2527 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.

	2528 }

	2529

	2530 UChar32 c;

	2531 int32_t prev=length;

	2532 do {

	2533 U16_PREV(s, 0, length, c);

	2534 if(realSet.contains(c)!=spanCondition) {

	2535 break;

	2536 }

	2537 } while((prev=length)>0);

	2538 return prev;

	2539 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {

	2540 UnicodeSetWithStringsIterator iter(set);

	2541 UChar32 c;

	2542 int32_t prev=length, length0=length;

	2543 do {

	2544 U16_PREV(s, 0, length, c);

	2545 if(realSet.contains(c)) {

	2546 break;

	2547 }

	2548 const UnicodeString *str;

	2549 iter.reset();

	2550 while((str=iter.nextString())!=NULL) {

	2551 if(str->length()<=prev && matches16CPB(s, prev-str->length(), le ngth0, *str)) {

	2552 // spanNeedsStrings=TRUE;

	2553 return prev;

	2554 }

	2555 }

	2556 } while((prev=length)>0);

	2557 return prev;

	2558 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {

	2559 UnicodeSetWithStringsIterator iter(set);

	2560 UChar32 c;

	2561 int32_t prev=length, minSpanStart=length, length0=length;

	2562 do {

	2563 U16_PREV(s, 0, length, c);

	2564 if(!realSet.contains(c)) {

	2565 length=prev; // Do not span this single, not-contained code poi nt.

	2566 }

	2567 const UnicodeString *str;

	2568 iter.reset();

	2569 while((str=iter.nextString())!=NULL) {

	2570 if(str->length()<=prev && matches16CPB(s, prev-str->length(), le ngth0, *str)) {

	2571 // spanNeedsStrings=TRUE;

	2572 int32_t matchStart=prev-str->length();

	2573 if(matchStart==0) {

	2574 return 0;

	2575 }

	2576 if(spanCondition==USET_SPAN_CONTAINED) {

	2577 // Iterate for the shortest match at each position.

	2578 // Recurse for each but the shortest match.

	2579 if(length==prev) {

	2580 length=matchStart; // First match from prev.

	2581 } else {

	2582 if(matchStart>length) {

	2583 // Remember shortest match from prev for iterati on.

	2584 int32_t temp=length;

	2585 length=matchStart;

	2586 matchStart=temp;

	2587 }

	2588 // Recurse for non-shortest match from prev.

	2589 int32_t spanStart=containsSpanBackUTF16(set, s, matc hStart,

	2590 USET_SPAN_CO NTAINED);

	2591 if(spanStart<minSpanStart) {

	2592 minSpanStart=spanStart;

	2593 if(minSpanStart==0) {

	2594 return 0;

	2595 }

	2596 }

	2597 }

	2598 } else /* spanCondition==USET_SPAN_SIMPLE */ {

	2599 if(matchStart<length) {

	2600 // Remember longest match from prev.

	2601 length=matchStart;

	2602 }

	2603 }

	2604 }

	2605 }

	2606 if(length==prev) {

	2607 break; // No match from prev.

	2608 }

	2609 } while((prev=length)>0);

	2610 if(prev<minSpanStart) {

	2611 return prev;

	2612 } else {

	2613 return minSpanStart;

	2614 }

	2615 }

	2616 }

	2617

	2618 static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,

	2619 USetSpanCondition spanCondition) {

	2620 const UnicodeSet &realSet(set.getSet());

	2621 if(!set.hasStrings()) {

	2622 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {

	2623 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.

	2624 }

	2625

	2626 UChar32 c;

	2627 int32_t start=0, prev;

	2628 while((prev=start)<length) {

	2629 U8_NEXT(s, start, length, c);

	2630 if(c<0) {

	2631 c=0xfffd;

	2632 }

	2633 if(realSet.contains(c)!=spanCondition) {

	2634 break;

	2635 }

	2636 }

	2637 return prev;

	2638 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {

	2639 UnicodeSetWithStringsIterator iter(set);

	2640 UChar32 c;

	2641 int32_t start, next;

	2642 for(start=next=0; start<length;) {

	2643 U8_NEXT(s, next, length, c);

	2644 if(c<0) {

	2645 c=0xfffd;

	2646 }

	2647 if(realSet.contains(c)) {

	2648 break;

	2649 }

	2650 const char *s8;

	2651 int32_t length8;

	2652 iter.reset();

	2653 while((s8=iter.nextUTF8(length8))!=NULL) {

	2654 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s 8, length8)) {

	2655 // spanNeedsStrings=TRUE;

	2656 return start;

	2657 }

	2658 }

	2659 start=next;

	2660 }

	2661 return start;

	2662 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {

	2663 UnicodeSetWithStringsIterator iter(set);

	2664 UChar32 c;

	2665 int32_t start, next, maxSpanLimit=0;

	2666 for(start=next=0; start<length;) {

	2667 U8_NEXT(s, next, length, c);

	2668 if(c<0) {

	2669 c=0xfffd;

	2670 }

	2671 if(!realSet.contains(c)) {

	2672 next=start; // Do not span this single, not-contained code poin t.

	2673 }

	2674 const char *s8;

	2675 int32_t length8;

	2676 iter.reset();

	2677 while((s8=iter.nextUTF8(length8))!=NULL) {

	2678 if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s 8, length8)) {

	2679 // spanNeedsStrings=TRUE;

	2680 int32_t matchLimit=start+length8;

	2681 if(matchLimit==length) {

	2682 return length;

	2683 }

	2684 if(spanCondition==USET_SPAN_CONTAINED) {

	2685 // Iterate for the shortest match at each position.

	2686 // Recurse for each but the shortest match.

	2687 if(next==start) {

	2688 next=matchLimit; // First match from start.

	2689 } else {

	2690 if(matchLimit<next) {

	2691 // Remember shortest match from start for iterat ion.

	2692 int32_t temp=next;

	2693 next=matchLimit;

	2694 matchLimit=temp;

	2695 }

	2696 // Recurse for non-shortest match from start.

	2697 int32_t spanLength=containsSpanUTF8(set, s+matchLimi t, length-matchLimit,

	2698 USET_SPAN_CONTAI NED);

	2699 if((matchLimit+spanLength)>maxSpanLimit) {

	2700 maxSpanLimit=matchLimit+spanLength;

	2701 if(maxSpanLimit==length) {

	2702 return length;

	2703 }

	2704 }

	2705 }

	2706 } else /* spanCondition==USET_SPAN_SIMPLE */ {

	2707 if(matchLimit>next) {

	2708 // Remember longest match from start.

	2709 next=matchLimit;

	2710 }

	2711 }

	2712 }

	2713 }

	2714 if(next==start) {

	2715 break; // No match from start.

	2716 }

	2717 start=next;

	2718 }

	2719 if(start>maxSpanLimit) {

	2720 return start;

	2721 } else {

	2722 return maxSpanLimit;

	2723 }

	2724 }

	2725 }

	2726

	2727 static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,

	2728 USetSpanCondition spanCondition) {

	2729 if(length==0) {

	2730 return 0;

	2731 }

	2732 const UnicodeSet &realSet(set.getSet());

	2733 if(!set.hasStrings()) {

	2734 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {

	2735 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.

	2736 }

	2737

	2738 UChar32 c;

	2739 int32_t prev=length;

	2740 do {

	2741 U8_PREV(s, 0, length, c);

	2742 if(c<0) {

	2743 c=0xfffd;

	2744 }

	2745 if(realSet.contains(c)!=spanCondition) {

	2746 break;

	2747 }

	2748 } while((prev=length)>0);

	2749 return prev;

	2750 } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {

	2751 UnicodeSetWithStringsIterator iter(set);

	2752 UChar32 c;

	2753 int32_t prev=length;

	2754 do {

	2755 U8_PREV(s, 0, length, c);

	2756 if(c<0) {

	2757 c=0xfffd;

	2758 }

	2759 if(realSet.contains(c)) {

	2760 break;

	2761 }

	2762 const char *s8;

	2763 int32_t length8;

	2764 iter.reset();

	2765 while((s8=iter.nextUTF8(length8))!=NULL) {

	2766 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {

	2767 // spanNeedsStrings=TRUE;

	2768 return prev;

	2769 }

	2770 }

	2771 } while((prev=length)>0);

	2772 return prev;

	2773 } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {

	2774 UnicodeSetWithStringsIterator iter(set);

	2775 UChar32 c;

	2776 int32_t prev=length, minSpanStart=length;

	2777 do {

	2778 U8_PREV(s, 0, length, c);

	2779 if(c<0) {

	2780 c=0xfffd;

	2781 }

	2782 if(!realSet.contains(c)) {

	2783 length=prev; // Do not span this single, not-contained code poi nt.

	2784 }

	2785 const char *s8;

	2786 int32_t length8;

	2787 iter.reset();

	2788 while((s8=iter.nextUTF8(length8))!=NULL) {

	2789 if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {

	2790 // spanNeedsStrings=TRUE;

	2791 int32_t matchStart=prev-length8;

	2792 if(matchStart==0) {

	2793 return 0;

	2794 }

	2795 if(spanCondition==USET_SPAN_CONTAINED) {

	2796 // Iterate for the shortest match at each position.

	2797 // Recurse for each but the shortest match.

	2798 if(length==prev) {

	2799 length=matchStart; // First match from prev.

	2800 } else {

	2801 if(matchStart>length) {

	2802 // Remember shortest match from prev for iterati on.

	2803 int32_t temp=length;

	2804 length=matchStart;

	2805 matchStart=temp;

	2806 }

	2807 // Recurse for non-shortest match from prev.

	2808 int32_t spanStart=containsSpanBackUTF8(set, s, match Start,

	2809 USET_SPAN_CON TAINED);

	2810 if(spanStart<minSpanStart) {

	2811 minSpanStart=spanStart;

	2812 if(minSpanStart==0) {

	2813 return 0;

	2814 }

	2815 }

	2816 }

	2817 } else /* spanCondition==USET_SPAN_SIMPLE */ {

	2818 if(matchStart<length) {

	2819 // Remember longest match from prev.

	2820 length=matchStart;

	2821 }

	2822 }

	2823 }

	2824 }

	2825 if(length==prev) {

	2826 break; // No match from prev.

	2827 }

	2828 } while((prev=length)>0);

	2829 if(prev<minSpanStart) {

	2830 return prev;

	2831 } else {

	2832 return minSpanStart;

	2833 }

	2834 }

	2835 }

	2836

	// spans to be performed and compared
	// Bit flags in four groups; the third name of each group is the
	// union (mask) of the two flags before it.
	enum {
	    // string encodings
	    SPAN_UTF16          =1,
	    SPAN_UTF8           =2,
	    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

	    // original set vs. its complement
	    SPAN_SET            =4,
	    SPAN_COMPLEMENT     =8,
	    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

	    // directions
	    SPAN_FWD            =0x10,
	    SPAN_BACK           =0x20,
	    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

	    // span conditions
	    SPAN_CONTAINED      =0x100,
	    SPAN_SIMPLE         =0x200,
	    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

	    // everything
	    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
	};

	2857

	2858 static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondit ion, USetSpanCondition contained) {

	2859 return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_ CONTAINED;

	2860 }

	2861

	2862 static inline int32_t slen(const void *s, UBool isUTF16) {

	2863 return isUTF16 ? u_strlen((const UChar )s) : strlen((const char )s);

	2864 }

	2865

	2866 /*

	2867 * Count spans on a string with the method according to type and set the span li mits.

	2868 * The set may be the complement of the original.

	2869 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()

	2870 * according to the expected number of spans.

	2871 * Sets typeName to an empty string if there is no such type.

	2872 * Returns -1 if the span option is filtered out.

	2873 */

	2874 static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,

	2875 const void *s, int32_t length, UBool isUTF16,

	2876 uint32_t whichSpans,

	2877 int type, const char *&typeName,

	2878 int32_t limits[], int32_t limitsCapacity,

	2879 int32_t expectCount) {

	2880 const UnicodeSet &realSet(set.getSet());

	2881 int32_t start, count;

	2882 USetSpanCondition spanCondition, firstSpanCondition, contained;

	2883 UBool isForward;

	2884

	2885 if(type<0 \|\| 7<type) {

	2886 typeName="";

	2887 return 0;

	2888 }

	2889

	2890 static const char *const typeNames16[]={

	2891 "contains", "contains(LM)",

	2892 "span", "span(LM)",

	2893 "containsBack", "containsBack(LM)",

	2894 "spanBack", "spanBack(LM)"

	2895 };

	2896

	2897 static const char *const typeNames8[]={

	2898 "containsUTF8", "containsUTF8(LM)",

	2899 "spanUTF8", "spanUTF8(LM)",

	2900 "containsBackUTF8", "containsBackUTF8(LM)", // not implemented

	2901 "spanBackUTF8", "spanBackUTF8(LM)"

	2902 };

	2903

	2904 typeName= isUTF16 ? typeNames16[type] : typeNames8[type];

	2905

	2906 // filter span options

	2907 if(type<=3) {

	2908 // span forward

	2909 if((whichSpans&SPAN_FWD)==0) {

	2910 return -1;

	2911 }

	2912 isForward=TRUE;

	2913 } else {

	2914 // span backward

	2915 if((whichSpans&SPAN_BACK)==0) {

	2916 return -1;

	2917 }

	2918 isForward=FALSE;

	2919 }

	2920 if((type&1)==0) {

	2921 // use USET_SPAN_CONTAINED

	2922 if((whichSpans&SPAN_CONTAINED)==0) {

	2923 return -1;

	2924 }

	2925 contained=USET_SPAN_CONTAINED;

	2926 } else {

	2927 // use USET_SPAN_SIMPLE

	2928 if((whichSpans&SPAN_SIMPLE)==0) {

	2929 return -1;

	2930 }

	2931 contained=USET_SPAN_SIMPLE;

	2932 }

	2933

	2934 // Default first span condition for going forward with an uncomplemented set .

	2935 spanCondition=USET_SPAN_NOT_CONTAINED;

	2936 if(isComplement) {

	2937 spanCondition=invertSpanCondition(spanCondition, contained);

	2938 }

	2939

	2940 // First span condition for span(), used to terminate the spanBack() iterati on.

	2941 firstSpanCondition=spanCondition;

	2942

	2943 // spanBack(): Its initial span condition is span()'s last span condition,

	2944 // which is the opposite of span()'s first span condition

	2945 // if we expect an even number of spans.

	2946 // (The loop inverts spanCondition (expectCount-1) times

	2947 // before the expectCount'th span() call.)

	2948 // If we do not compare forward and backward directions, then we do not have an

	2949 // expectCount and just start with firstSpanCondition.

	2950 if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {

	2951 spanCondition=invertSpanCondition(spanCondition, contained);

	2952 }

	2953

	2954 count=0;

	2955 switch(type) {

	2956 case 0:

	2957 case 1:

	2958 start=0;

	2959 if(length<0) {

	2960 length=slen(s, isUTF16);

	2961 }

	2962 for(;;) {

	2963 start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, len gth-start, spanCondition) :

	2964 containsSpanUTF8(set, (const char *)s+start, lengt h-start, spanCondition);

	2965 if(count<limitsCapacity) {

	2966 limits[count]=start;

	2967 }

	2968 ++count;

	2969 if(start>=length) {

	2970 break;

	2971 }

	2972 spanCondition=invertSpanCondition(spanCondition, contained);

	2973 }

	2974 break;

	2975 case 2:

	2976 case 3:

	2977 start=0;

	2978 for(;;) {

	2979 start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? l ength-start : length, spanCondition) :

	2980 realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);

	2981 if(count<limitsCapacity) {

	2982 limits[count]=start;

	2983 }

	2984 ++count;

	2985 if(length>=0 ? start>=length :

	2986 isUTF16 ? ((const UChar *)s)[start]==0 :

	2987 ((const char *)s)[start]==0

	2988 ) {

	2989 break;

	2990 }

	2991 spanCondition=invertSpanCondition(spanCondition, contained);

	2992 }

	2993 break;

	2994 case 4:

	2995 case 5:

	2996 if(length<0) {

	2997 length=slen(s, isUTF16);

	2998 }

	2999 for(;;) {

	3000 ++count;

	3001 if(count<=limitsCapacity) {

	3002 limits[limitsCapacity-count]=length;

	3003 }

	3004 length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, lengt h, spanCondition) :

	3005 containsSpanBackUTF8(set, (const char *)s, length, spanCondition);

	3006 if(length==0 && spanCondition==firstSpanCondition) {

	3007 break;

	3008 }

	3009 spanCondition=invertSpanCondition(spanCondition, contained);

	3010 }

	3011 if(count<limitsCapacity) {

	3012 memmove(limits, limits+(limitsCapacity-count), count*4);

	3013 }

	3014 break;

	3015 case 6:

	3016 case 7:

	3017 for(;;) {

	3018 ++count;

	3019 if(count<=limitsCapacity) {

	3020 limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUT F16);

	3021 }

	3022 // Note: Length<0 is tested only for the first spanBack().

	3023 // If we wanted to keep length<0 for all spanBack()s, we would have to

	3024 // temporarily modify the string by placing a NUL where the previous spanBack() stopped.

	3025 length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCon dition) :

	3026 realSet.spanBackUTF8((const char *)s, length, span Condition);

	3027 if(length==0 && spanCondition==firstSpanCondition) {

	3028 break;

	3029 }

	3030 spanCondition=invertSpanCondition(spanCondition, contained);

	3031 }

	3032 if(count<limitsCapacity) {

	3033 memmove(limits, limits+(limitsCapacity-count), count*4);

	3034 }

	3035 break;

	3036 default:

	3037 typeName="";

	3038 return -1;

	3039 }

	3040

	3041 return count;

	3042 }

	3043

	// sets to be tested; odd index=isComplement
	// The values are used as indexes into arrays of SET_COUNT elements.
	enum {
	    SLOW,
	    SLOW_NOT,
	    FAST,
	    FAST_NOT,
	    SET_COUNT
	};

	3052

	3053 static const char *const setNames[SET_COUNT]={

	3054 "slow",

	3055 "slow.not",

	3056 "fast",

	3057 "fast.not"

	3058 };

	3059

	3060 /*

	3061 * Verify that we get the same results whether we look at text with contains(),

	3062 * span() or spanBack(), using unfrozen or frozen versions of the set,

	3063 * and using the set or its complement (switching the spanConditions accordingly ).

	3064 * The latter verifies that

	3065 * set.span(spanCondition) == set.complement().span(!spanCondition).

	3066 *

	3067 * The expectLimits[] are either provided by the caller (with expectCount>=0)

	3068 * or returned to the caller (with an input expectCount<0).

	3069 */

	3070 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],

	3071 const void *s, int32_t length, UBool isUTF16,

	3072 uint32_t whichSpans,

	3073 int32_t expectLimits[], int32_t &expectCount,

	3074 const char *testName, int32_t index) {

	3075 int32_t limits[500];

	3076 int32_t limitsCount;

	3077 int i, j;

	3078

	3079 const char *typeName;

	3080 int type;

	3081

	3082 for(i=0; i<SET_COUNT; ++i) {

	3083 if((i&1)==0) {

	3084 // Even-numbered sets are original, uncomplemented sets.

	3085 if((whichSpans&SPAN_SET)==0) {

	3086 continue;

	3087 }

	3088 } else {

	3089 // Odd-numbered sets are complemented.

	3090 if((whichSpans&SPAN_COMPLEMENT)==0) {

	3091 continue;

	3092 }

	3093 }

	3094 for(type=0;; ++type) {

	3095 limitsCount=getSpans(*sets[i], (UBool)(i&1),

	3096 s, length, isUTF16,

	3097 whichSpans,

	3098 type, typeName,

	3099 limits, LENGTHOF(limits), expectCount);

	3100 if(typeName[0]==0) {

	3101 break; // All types tried.

	3102 }

	3103 if(limitsCount<0) {

	3104 continue; // Span option filtered out.

	3105 }

	3106 if(expectCount<0) {

	3107 expectCount=limitsCount;

	3108 if(limitsCount>LENGTHOF(limits)) {

	3109 errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",

	3110 testName, (long)index, setNames[i], typeName, (long)li mitsCount, (long)LENGTHOF(limits));

	3111 return;

	3112 }

	3113 memcpy(expectLimits, limits, limitsCount*4);

	3114 } else if(limitsCount!=expectCount) {

	3115 errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",

	3116 testName, (long)index, setNames[i], typeName, (long)limits Count, (long)expectCount);

	3117 } else {

	3118 for(j=0; j<limitsCount; ++j) {

	3119 if(limits[j]!=expectLimits[j]) {

	3120 errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=% ld != %ld",

	3121 testName, (long)index, setNames[i], typeName, (lon g)limitsCount,

	3122 j, (long)limits[j], (long)expectLimits[j]);

	3123 break;

	3124 }

	3125 }

	3126 }

	3127 }

	3128 }

	3129

	3130 // Compare span() with containsAll()/containsNone(),

	3131 // but only if we have expectLimits[] from the uncomplemented set.

	3132 if(isUTF16 && (whichSpans&SPAN_SET)!=0) {

	3133 const UChar s16=(const UChar )s;

	3134 UnicodeString string;

	3135 int32_t prev=0, limit, length;

	3136 for(i=0; i<expectCount; ++i) {

	3137 limit=expectLimits[i];

	3138 length=limit-prev;

	3139 if(length>0) {

	3140 string.setTo(FALSE, s16+prev, length); // read-only alias

	3141 if(i&1) {

	3142 if(!sets[SLOW]->getSet().containsAll(string)) {

	3143 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE c ontradicts span()",

	3144 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);

	3145 return;

	3146 }

	3147 if(!sets[FAST]->getSet().containsAll(string)) {

	3148 errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE c ontradicts span()",

	3149 testName, (long)index, setNames[FAST], (long)prev, (long)limit);

	3150 return;

	3151 }

	3152 } else {

	3153 if(!sets[SLOW]->getSet().containsNone(string)) {

	3154 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",

	3155 testName, (long)index, setNames[SLOW], (long)prev, (long)limit);

	3156 return;

	3157 }

	3158 if(!sets[FAST]->getSet().containsNone(string)) {

	3159 errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",

	3160 testName, (long)index, setNames[FAST], (long)prev, (long)limit);

	3161 return;

	3162 }

	3163 }

	3164 }

	3165 prev=limit;

	3166 }

	3167 }

	3168 }

	3169

	3170 // Specifically test either UTF-16 or UTF-8.

	3171 void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],

	3172 const void *s, int32_t length, UBool isUTF16,

	3173 uint32_t whichSpans,

	3174 const char *testName, int32_t index) {

	3175 int32_t expectLimits[500];

	3176 int32_t expectCount=-1;

	3177 testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, te stName, index);

	3178 }

	3179

	3180 UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {

	3181 UChar c, c2;

	3182

	3183 if(length>=0) {

	3184 while(length>0) {

	3185 c=*s++;

	3186 --length;

	3187 if(0xd800<=c && c<0xe000) {

	3188 if(c>=0xdc00 \|\| length==0 \|\| !U16_IS_TRAIL(c2=*s++)) {

	3189 return TRUE;

	3190 }

	3191 --length;

	3192 }

	3193 }

	3194 } else {

	3195 while((c=*s++)!=0) {

	3196 if(0xd800<=c && c<0xe000) {

	3197 if(c>=0xdc00 \|\| !U16_IS_TRAIL(c2=*s++)) {

	3198 return TRUE;

	3199 }

	3200 }

	3201 }

	3202 }

	3203 return FALSE;

	3204 }

	3205

	3206 // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,

	3207 // unless either UTF is turned off in whichSpans.

	3208 // Testing UTF-16 and UTF-8 together requires that surrogate code points

	3209 // have the same contains(c) value as U+FFFD.

	3210 void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],

	3211 const UChar *s16, int32_t length16,

	3212 uint32_t whichSpans,

	3213 const char *testName, int32_t index) {

	3214 int32_t expectLimits[500];

	3215 int32_t expectCount;

	3216

	3217 expectCount=-1; // Get expectLimits[] from testSpan().

	3218

	3219 if((whichSpans&SPAN_UTF16)!=0) {

	3220 testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCoun t, testName, index);

	3221 }

	3222 if((whichSpans&SPAN_UTF8)==0) {

	3223 return;

	3224 }

	3225

	3226 // Convert s16[] and expectLimits[] to UTF-8.

	3227 uint8_t s8[3000];

	3228 int32_t offsets[3000];

	3229

	3230 const UChar *s16Limit=s16+length16;

	3231 char t=(char )s8;

	3232 char *tLimit=t+sizeof(s8);

	3233 int32_t *o=offsets;

	3234 UErrorCode errorCode=U_ZERO_ERROR;

	3235

	3236 // Convert with substitution: Turn unpaired surrogates into U+FFFD.

	3237 ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, & errorCode);

	3238 if(U_FAILURE(errorCode)) {

	3239 errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",

	3240 testName, (long)index, u_errorName(errorCode));

	3241 ucnv_resetFromUnicode(utf8Cnv);

	3242 return;

	3243 }

	3244 int32_t length8=(int32_t)(t-(char *)s8);

	3245

	3246 // Convert expectLimits[].

	3247 int32_t i, j, expect;

	3248 for(i=j=0; i<expectCount; ++i) {

	3249 expect=expectLimits[i];

	3250 if(expect==length16) {

	3251 expectLimits[i]=length8;

	3252 } else {

	3253 while(offsets[j]<expect) {

	3254 ++j;

	3255 }

	3256 expectLimits[i]=j;

	3257 }

	3258 }

	3259

	3260 testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, te stName, index);

	3261 }

	3262

	3263 static UChar32 nextCodePoint(UChar32 c) {

	3264 // Skip some large and boring ranges.

	3265 switch(c) {

	3266 case 0x3441:

	3267 return 0x4d7f;

	3268 case 0x5100:

	3269 return 0x9f00;

	3270 case 0xb040:

	3271 return 0xd780;

	3272 case 0xe041:

	3273 return 0xf8fe;

	3274 case 0x10100:

	3275 return 0x20000;

	3276 case 0x20041:

	3277 return 0xe0000;

	3278 case 0xe0101:

	3279 return 0x10fffd;

	3280 default:

	3281 return c+1;

	3282 }

	3283 }

	3284

	3285 // Verify that all implementations represent the same set.

	3286 void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings sets[4], uint 32_t whichSpans, const char testName) {

	3287 // contains(U+FFFD) is inconsistent with contains(some surrogates),

	3288 // or the set contains strings with unpaired surrogates which don't translat e to valid UTF-8:

	3289 // Skip the UTF-8 part of the test - if the string contains surrogates -

	3290 // because it is likely to produce a different result.

	3291 UBool inconsistentSurrogates=

	3292 (!(sets[0]->getSet().contains(0xfffd) ?

	3293 sets[0]->getSet().contains(0xd800, 0xdfff) :

	3294 sets[0]->getSet().containsNone(0xd800, 0xdfff)) \|\|

	3295 sets[0]->hasStringsWithSurrogates());

	3296

	3297 UChar s[1000];

	3298 int32_t length=0;

	3299 uint32_t localWhichSpans;

	3300

	3301 UChar32 c, first;

	3302 for(first=c=0;; c=nextCodePoint(c)) {

	3303 if(c>0x10ffff \|\| length>(LENGTHOF(s)-U16_MAX_LENGTH)) {

	3304 localWhichSpans=whichSpans;

	3305 if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurroga tes) {

	3306 localWhichSpans&=~SPAN_UTF8;

	3307 }

	3308 testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);

	3309 if(c>0x10ffff) {

	3310 break;

	3311 }

	3312 length=0;

	3313 first=c;

	3314 }

	3315 U16_APPEND_UNSAFE(s, length, c);

	3316 }

	3317 }

	3318

	3319 // Test with a particular, interesting string.

	3320 // Specify length and try NUL-termination.

	3321 void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings sets[4], u int32_t whichSpans, const char testName) {

	3322 static const UChar s[]={

	3323 0x61, 0x62, 0x20, // Latin, space

	3324 0x3b1, 0x3b2, 0x3b3, // Greek

	3325 0xd900, // lead surrogate

	3326 0x3000, 0x30ab, 0x30ad, // wide space, Katakana

	3327 0xdc05, // trail surrogate

	3328 0xa0, 0xac00, 0xd7a3, // nbsp, Hangul

	3329 0xd900, 0xdc05, // unassigned supplementary

	3330 0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary

	3331 0xd7a4, 0xdc05, 0xd900, 0x2028, // unassigned, surrogates in wro ng order, LS

	3332 0 // NUL

	3333 };

	3334

	3335 if((whichSpans&SPAN_UTF16)==0) {

	3336 return;

	3337 }

	3338 testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);

	3339 testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1) ;

	3340 }

	3341

	3342 void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings sets[4], ui nt32_t whichSpans, const char testName) {

	3343 static const char s[]={

	3344 "abc" // Latin

	3345

	3346 /* trail byte in lead position */

	3347 "\x80"

	3348

	3349 " " // space

	3350

	3351 /* truncated multi-byte sequences */

	3352 "\xd0"

	3353 "\xe0"

	3354 "\xe1"

	3355 "\xed"

	3356 "\xee"

	3357 "\xf0"

	3358 "\xf1"

	3359 "\xf4"

	3360 "\xf8"

	3361 "\xfc"

	3362

	3363 "\xCE\xB1\xCE\xB2\xCE\xB3" // Greek

	3364

	3365 /* trail byte in lead position */

	3366 "\x80"

	3367

	3368 "\xe0\x80"

	3369 "\xe0\xa0"

	3370 "\xe1\x80"

	3371 "\xed\x80"

	3372 "\xed\xa0"

	3373 "\xee\x80"

	3374 "\xf0\x80"

	3375 "\xf0\x90"

	3376 "\xf1\x80"

	3377 "\xf4\x80"

	3378 "\xf4\x90"

	3379 "\xf8\x80"

	3380 "\xfc\x80"

	3381

	3382 "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD" // wide space, Katakana

	3383

	3384 /* trail byte in lead position */

	3385 "\x80"

	3386

	3387 "\xf0\x80\x80"

	3388 "\xf0\x90\x80"

	3389 "\xf1\x80\x80"

	3390 "\xf4\x80\x80"

	3391 "\xf4\x90\x80"

	3392 "\xf8\x80\x80"

	3393 "\xfc\x80\x80"

	3394

	3395 "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3" // nbsp, Hangul

	3396

	3397 /* trail byte in lead position */

	3398 "\x80"

	3399

	3400 "\xf8\x80\x80\x80"

	3401 "\xfc\x80\x80\x80"

	3402

	3403 "\xF1\x90\x80\x85" // unassigned supplementary

	3404

	3405 /* trail byte in lead position */

	3406 "\x80"

	3407

	3408 "\xfc\x80\x80\x80\x80"

	3409

	3410 "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE" // Han supplementary

	3411

	3412 /* trail byte in lead position */

	3413 "\x80"

	3414

	3415 /* complete sequences but non-shortest forms or out of range etc. */

	3416 "\xc0\x80"

	3417 "\xe0\x80\x80"

	3418 "\xed\xa0\x80"

	3419 "\xf0\x80\x80\x80"

	3420 "\xf4\x90\x80\x80"

	3421 "\xf8\x80\x80\x80\x80"

	3422 "\xfc\x80\x80\x80\x80\x80"

	3423 "\xfe"

	3424 "\xff"

	3425

	3426 /* trail byte in lead position */

	3427 "\x80"

	3428

	3429 "\xED\x9E\xA4\xE2\x80\xA8" // unassigned, LS, NUL-terminate d

	3430 };

	3431

	3432 if((whichSpans&SPAN_UTF8)==0) {

	3433 return;

	3434 }

	3435 testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);

	3436 testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);

	3437 }

	3438

	// Take a set of span options and multiply them so that
	// each portion only has one of the options a, b and c.
	// If b==0, then the set of options is just modified with mask and a.
	// If b!=0 and c==0, then the set of options is just modified with mask, a and b.
	//
	// Each of the first whichSpansCount entries is reduced with mask and then
	// stored up to three times: with a in place, with b one whichSpansCount
	// further on, and with c two whichSpansCount further on.
	// The caller must provide room for the resulting number of entries.
	// Returns the new number of entries.
	static int32_t
	addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
	               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
	    uint32_t s;
	    int32_t i;

	    for(i=0; i<whichSpansCount; ++i) {
	        s=whichSpans[i]&mask;
	        whichSpans[i]=s|a;
	        if(b!=0) {
	            whichSpans[whichSpansCount+i]=s|b;
	            if(c!=0) {
	                whichSpans[2*whichSpansCount+i]=s|c;
	            }
	        }
	    }
	    return b==0 ? whichSpansCount : c==0 ? 2*whichSpansCount : 3*whichSpansCount;
	}

	3461

	// String literals of 63 or 64 repetitions of 'a' or 'b'.
	// They are composed from 16-character pieces (written as four groups of four)
	// so that the lengths can be checked by eye.
	#define _16_a "aaaa" "aaaa" "aaaa" "aaaa"
	#define _16_b "bbbb" "bbbb" "bbbb" "bbbb"
	#define _63_a _16_a _16_a _16_a "aaaa" "aaaa" "aaaa" "aaa"
	#define _64_a _16_a _16_a _16_a _16_a
	#define _63_b _16_b _16_b _16_b "bbbb" "bbbb" "bbbb" "bbb"
	#define _64_b _16_b _16_b _16_b _16_b

	3466

	3467 void UnicodeSetTest::TestSpan() {

	3468 // "[...]" is a UnicodeSet pattern.

	3469 // "*" performs tests on all Unicode code points and on a selection of

	3470 // malformed UTF-8/16 strings.

	3471 // "-options" limits the scope of testing for the current set.

	3472 // By default, the test verifies that equivalent boundaries are found

	3473 // for UTF-16 and UTF-8, going forward and backward,

	3474 // alternating USET_SPAN_NOT_CONTAINED with

	3475 // either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.

	3476 // Single-character options:

	3477 // 8 -- UTF-16 and UTF-8 boundaries may differ.

	3478 // Cause: contains(U+FFFD) is inconsistent with contains(some surro gates),

	3479 // or the set contains strings with unpaired surrogates

	3480 // which do not translate to valid UTF-8.

	3481 // c -- set.span() and set.complement().span() boundaries may differ.

	3482 // Cause: Set strings are not complemented.

	3483 // b -- span() and spanBack() boundaries may differ.

	3484 // Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAI NED)

	3485 // and spanBack(USET_SPAN_SIMPLE) are defined to

	3486 // match with non-overlapping substrings.

	3487 // For example, with a set containing "ab" and "ba",

	3488 // span() of "aba" yields boundaries { 0, 2, 3 }

	3489 // because the initial "ab" matches from 0 to 2,

	3490 // while spanBack() yields boundaries { 0, 1, 3 }

	3491 // because the final "ba" matches from 1 to 3.

	3492 // l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.

	3493 // Cause: Strings in the set overlap, and a longer match may

	3494 // require a sequence including non-longest substrings.

	3495 // For example, with a set containing "ab", "abc" and "cd",

	3496 // span(contained) of "abcd" spans the entire string

	3497 // but span(longest match) only spans the first 3 characters.

	3498 // Each "-options" first resets all options and then applies the specified options.

	3499 // A "-" without options resets the options.

	3500 // The options are also reset for each new set.

	3501 // Other strings will be spanned.

	3502 static const char *const testdata[]={

	3503 "[:ID_Continue:]",

	3504 "*",

	3505 "[:White_Space:]",

	3506 "*",

	3507 "[]",

	3508 "*",

	3509 "[\\u0000-\\U0010FFFF]",

	3510 "*",

	3511 "[\\u0000\\u0080\\u0800\\U00010000]",

	3512 "*",

	3513 "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",

	3514 "*",

	3515 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30 ad}]",

	3516 "-c",

	3517 "*",

	3518 "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30 ad}]",

	3519 "-c",

	3520 "*",

	3521

	3522 // Overlapping strings cause overlapping attempts to match.

	3523 "[x{xy}{xya}{axy}{ax}]",

	3524 "-cl",

	3525

	3526 // More repetitions of "xya" would take too long with the recursive

	3527 // reference implementation.

	3528 // containsAll()=FALSE

	3529 // test_string 0x14

	3530 "xx"

	3531 "xyaxyaxyaxya" // set.complement().span(longest match) will stop here.

	3532 "xx" // set.complement().span(contained) will stop between th e two 'x'es.

	3533 "xyaxyaxyaxya"

	3534 "xx"

	3535 "xyaxyaxyaxya" // span() ends here.

	3536 "aaa",

	3537

	3538 // containsAll()=TRUE

	3539 // test_string 0x15

	3540 "xx"

	3541 "xyaxyaxyaxya"

	3542 "xx"

	3543 "xyaxyaxyaxya"

	3544 "xx"

	3545 "xyaxyaxyaxy",

	3546

	3547 "-bc",

	3548 // test_string 0x17

	3549 "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }

	3550 "-c",

	3551 "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }

	3552 "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }

	3553 "-",

	3554 "byaya", // span() -> { 5 }

	3555 "byay", // span() -> { 4 }

	3556 "bya", // span() -> { 3 }

	3557

	3558 // span(longest match) will not span the whole string.

	3559 "[a{ab}{bc}]",

	3560 "-cl",

	3561 // test_string 0x21

	3562 "abc",

	3563

	3564 "[a{ab}{abc}{cd}]",

	3565 "-cl",

	3566 "acdabcdabccd",

	3567

	3568 // spanBack(longest match) will not span the whole string.

	3569 "[c{ab}{bc}]",

	3570 "-cl",

	3571 "abc",

	3572

	3573 "[d{cd}{bcd}{ab}]",

	3574 "-cl",

	3575 "abbcdabcdabd",

	3576

	3577 // Test with non-ASCII set strings - test proper handling of surrogate p airs

	3578 // and UTF-8 trail bytes.

	3579 // Copies of above test sets and strings, but transliterated to have

	3580 // different code points with similar trail units.

	3581 // Previous: a b c d

	3582 // Unicode: 042B 30AB 200AB 204AB

	3583 // UTF-16: 042B 30AB D840 DCAB D841 DCAB

	3584 // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB

	3585 "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U00020 4AB}]",

	3586 "-cl",

	3587 "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042 B\\u30AB\\U000200AB\\U000200AB\\U000204AB",

	3588

	3589 "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u 042B\\u30AB}]",

	3590 "-cl",

	3591 "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U 000204AB\\u042B\\u30AB\\U000204AB",

	3592

	3593 // Stress bookkeeping and recursion.

	3594 // The following strings are barely doable with the recursive

	3595 // reference implementation.

	3596 // The not-contained character at the end prevents an early exit from th e span().

	3597 "[b{bb}]",

	3598 "-c",

	3599 // test_string 0x33

	3600 "bbbbbbbbbbbbbbbbbbbbbbbb-",

	3601 // On complement sets, span() and spanBack() get different results

	3602 // because b is not in the complement set and there is an odd number of b's

	3603 // in the test string.

	3604 "-bc",

	3605 "bbbbbbbbbbbbbbbbbbbbbbbbb-",

	3606

	3607 // Test with set strings with an initial or final code point span

	3608 // longer than 254.

	3609 "[a{" _64_a _64_a _64_a _64_a "b}"

	3610 "{a" _64_b _64_b _64_b _64_b "}]",

	3611 "-c",

	3612 _64_a _64_a _64_a _63_a "b",

	3613 _64_a _64_a _64_a _64_a "b",

	3614 _64_a _64_a _64_a _64_a "aaaabbbb",

	3615 "a" _64_b _64_b _64_b _63_b,

	3616 "a" _64_b _64_b _64_b _64_b,

	3617 "aaaabbbb" _64_b _64_b _64_b _64_b,

	3618

	3619 // Test with strings containing unpaired surrogates.

	3620 // They are not representable in UTF-8, and a leading trail surrogate

	3621 // and a trailing lead surrogate must not match in the middle of a prope r surrogate pair.

	3622 // U+20001 == \\uD840\\uDC01

	3623 // U+20400 == \\uD841\\uDC00

	3624 "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",

	3625 "-8cl",

	3626 "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a \\U00020000\\uDC00a\\uDC00babbb"

	3627 };

	3628 uint32_t whichSpans[96]={ SPAN_ALL };

	3629 int32_t whichSpansCount=1;

	3630

	3631 UnicodeSet *sets[SET_COUNT]={ NULL };

	3632 const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };

	3633

	3634 char testName[1024];

	3635 char *testNameLimit=testName;

	3636

	3637 int32_t i, j;

	3638 for(i=0; i<LENGTHOF(testdata); ++i) {

	3639 const char *s=testdata[i];

	3640 if(s[0]=='[') {

	3641 // Create new test sets from this pattern.

	3642 for(j=0; j<SET_COUNT; ++j) {

	3643 delete sets_with_str[j];

	3644 delete sets[j];

	3645 }

	3646 UErrorCode errorCode=U_ZERO_ERROR;

	3647 sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), e rrorCode);

	3648 if(U_FAILURE(errorCode)) {

	3649 dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_err orName(errorCode));

	3650 break;

	3651 }

	3652 sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);

	3653 sets[SLOW_NOT]->complement();

	3654 // Intermediate set: Test cloning of a frozen set.

	3655 UnicodeSet fast=new UnicodeSet(sets[SLOW]);

	3656 fast->freeze();

	3657 sets[FAST]=(UnicodeSet *)fast->clone();

	3658 delete fast;

	3659 UnicodeSet fastNot=new UnicodeSet(sets[SLOW_NOT]);

	3660 fastNot->freeze();

	3661 sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();

	3662 delete fastNot;

	3663

	3664 for(j=0; j<SET_COUNT; ++j) {

	3665 sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);

	3666 }

	3667

	3668 strcpy(testName, s);

	3669 testNameLimit=strchr(testName, 0);

	3670 *testNameLimit++=':';

	3671 *testNameLimit=0;

	3672

	3673 whichSpans[0]=SPAN_ALL;

	3674 whichSpansCount=1;

	3675 } else if(s[0]=='-') {

	3676 whichSpans[0]=SPAN_ALL;

	3677 whichSpansCount=1;

	3678

	3679 while(*++s!=0) {

	3680 switch(*s) {

	3681 case 'c':

	3682 whichSpansCount=addAlternative(whichSpans, whichSpansCount,

	3683 ~SPAN_POLARITY,

	3684 SPAN_SET,

	3685 SPAN_COMPLEMENT,

	3686 0);

	3687 break;

	3688 case 'b':

	3689 whichSpansCount=addAlternative(whichSpans, whichSpansCount,

	3690 ~SPAN_DIRS,

	3691 SPAN_FWD,

	3692 SPAN_BACK,

	3693 0);

	3694 break;

	3695 case 'l':

	3696 // test USET_SPAN_CONTAINED FWD & BACK, and separately

	3697 // USET_SPAN_SIMPLE only FWD, and separately

	3698 // USET_SPAN_SIMPLE only BACK

	3699 whichSpansCount=addAlternative(whichSpans, whichSpansCount,

	3700 ~(SPAN_DIRS\|SPAN_CONDITION),

	3701 SPAN_DIRS\|SPAN_CONTAINED,

	3702 SPAN_FWD\|SPAN_SIMPLE,

	3703 SPAN_BACK\|SPAN_SIMPLE);

	3704 break;

	3705 case '8':

	3706 whichSpansCount=addAlternative(whichSpans, whichSpansCount,

	3707 ~SPAN_UTFS,

	3708 SPAN_UTF16,

	3709 SPAN_UTF8,

	3710 0);

	3711 break;

	3712 default:

	3713 errln("FAIL: unrecognized span set option in \"%s\"", testda ta[i]);

	3714 break;

	3715 }

	3716 }

	3717 } else if(0==strcmp(s, "*")) {

	3718 strcpy(testNameLimit, "bad_string");

	3719 for(j=0; j<whichSpansCount; ++j) {

	3720 if(whichSpansCount>1) {

	3721 sprintf(testNameLimit+10 /* strlen("bad_string") */,

	3722 "%%0x%3x",

	3723 whichSpans[j]);

	3724 }

	3725 testSpanUTF16String(sets_with_str, whichSpans[j], testName);

	3726 testSpanUTF8String(sets_with_str, whichSpans[j], testName);

	3727 }

	3728

	3729 strcpy(testNameLimit, "contents");

	3730 for(j=0; j<whichSpansCount; ++j) {

	3731 if(whichSpansCount>1) {

	3732 sprintf(testNameLimit+8 /* strlen("contents") */,

	3733 "%%0x%3x",

	3734 whichSpans[j]);

	3735 }

	3736 testSpanContents(sets_with_str, whichSpans[j], testName);

	3737 }

	3738 } else {

	3739 UnicodeString string=UnicodeString(s, -1, US_INV).unescape();

	3740 strcpy(testNameLimit, "test_string");

	3741 for(j=0; j<whichSpansCount; ++j) {

	3742 if(whichSpansCount>1) {

	3743 sprintf(testNameLimit+11 /* strlen("test_string") */,

	3744 "%%0x%3x",

	3745 whichSpans[j]);

	3746 }

	3747 testSpanBothUTFs(sets_with_str, string.getBuffer(), string.lengt h(), whichSpans[j], testName, i);

	3748 }

	3749 }

	3750 }

	3751 for(j=0; j<SET_COUNT; ++j) {

	3752 delete sets_with_str[j];

	3753 delete sets[j];

	3754 }

	3755 }

	3756

	3757 // Test select patterns and strings, and test USET_SPAN_SIMPLE.

	3758 void UnicodeSetTest::TestStringSpan() {

	3759 static const char *pattern="[x{xy}{xya}{axy}{ax}]";

	3760 static const char *const string=

	3761 "xx"

	3762 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"

	3763 "xx"

	3764 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"

	3765 "xx"

	3766 "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"

	3767 "aaaa";

	3768

	3769 UErrorCode errorCode=U_ZERO_ERROR;

	3770 UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);

	3771 UnicodeSet set(pattern16, errorCode);

	3772 if(U_FAILURE(errorCode)) {

	3773 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName (errorCode));

	3774 return;

	3775 }

	3776

	3777 UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();

	3778

	3779 if(set.containsAll(string16)) {

	3780 errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, s tring);

	3781 }

	3782

	3783 // Remove trailing "aaaa".

	3784 string16.truncate(string16.length()-4);

	3785 if(!set.containsAll(string16)) {

	3786 errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", patter n, string);

	3787 }

	3788

	3789 string16=UNICODE_STRING_SIMPLE("byayaxya");

	3790 const UChar *s16=string16.getBuffer();

	3791 int32_t length16=string16.length();

	3792 if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 \|\|

	3793 set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 \|\|

	3794 set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 \|\|

	3795 set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 \|\|

	3796 set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 \|\|

	3797 set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3

	3798 ) {

	3799 errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pa ttern);

	3800 }

	3801

	3802 pattern="[a{ab}{abc}{cd}]";

	3803 pattern16=UnicodeString(pattern, -1, US_INV);

	3804 set.applyPattern(pattern16, errorCode);

	3805 if(U_FAILURE(errorCode)) {

	3806 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName (errorCode));

	3807 return;

	3808 }

	3809 string16=UNICODE_STRING_SIMPLE("acdabcdabccd");

	3810 s16=string16.getBuffer();

	3811 length16=string16.length();

	3812 if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 \|\|

	3813 set.span(s16, 12, USET_SPAN_SIMPLE)!=6 \|\|

	3814 set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5

	3815 ) {

	3816 errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);

	3817 }

	3818

	3819 pattern="[d{cd}{bcd}{ab}]";

	3820 pattern16=UnicodeString(pattern, -1, US_INV);

	3821 set.applyPattern(pattern16, errorCode).freeze();

	3822 if(U_FAILURE(errorCode)) {

	3823 errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName (errorCode));

	3824 return;

	3825 }

	3826 string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");

	3827 s16=string16.getBuffer();

	3828 length16=string16.length();

	3829 if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 \|\|

	3830 set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 \|\|

	3831 set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0

	3832 ) {

	3833 errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wr ong value", pattern);

	3834 }

	3835 }

OLD	NEW

« no previous file with comments | « icu46/source/test/intltest/usettest.h ('k') | icu46/source/test/intltest/ustrtest.h » ('j') | no next file with comments »