| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 ********************************************************************** | |
| 3 * Copyright (C) 2005-2014, International Business Machines | |
| 4 * Corporation and others. All Rights Reserved. | |
| 5 ********************************************************************** | |
| 6 */ | |
| 7 | |
| 8 #include "unicode/utypes.h" | |
| 9 | |
| 10 #if !UCONFIG_NO_COLLATION | |
| 11 | |
| 12 #include "cmemory.h" | |
| 13 #include "cstring.h" | |
| 14 #include "usrchimp.h" | |
| 15 | |
| 16 #include "unicode/coll.h" | |
| 17 #include "unicode/tblcoll.h" | |
| 18 #include "unicode/usearch.h" | |
| 19 #include "unicode/uset.h" | |
| 20 #include "unicode/ustring.h" | |
| 21 | |
| 22 #include "unicode/coleitr.h" | |
| 23 #include "unicode/regex.h" // TODO: make conditional on regexp being buil
t. | |
| 24 | |
| 25 #include "colldata.h" | |
| 26 #include "ssearch.h" | |
| 27 #include "xmlparser.h" | |
| 28 | |
| 29 #include <stdio.h> // for sprintf | |
| 30 | |
// Identifier of the test case currently being processed. searchTest() fills
// it in from the XML "id" attribute; the assertion macros below print it.
char testId[100];

// Report a failure, tagged with file, line and the current test ID, when the
// condition x is false. Execution continues afterwards.
// NOTE: these macros expand to a plain braced block (not do/while(0)) because
// existing call sites use them both with and without a trailing semicolon.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}

// Like TEST_ASSERT, but prints the caller-supplied message m via dataerrln()
// and returns from the enclosing (void) function on failure.
#define TEST_ASSERT_M(x, m) {if (!(x)) { \
    dataerrln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}

// Report (via dataerrln) when errcode holds a failure value. Does not return.
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
    dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
        __FILE__, __LINE__, testId, u_errorName(errcode));}}

// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that expressions such as *p expand as intended.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Typed wrappers around uprv_malloc()/uprv_free(). The whole NEW_ARRAY
// expansion is parenthesized so it can be used safely inside larger expressions.
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
| 46 | |
| 47 //--------------------------------------------------------------------------- | |
| 48 // | |
| 49 // Test class boilerplate | |
| 50 // | |
| 51 //--------------------------------------------------------------------------- | |
| 52 SSearchTest::SSearchTest() | |
| 53 { | |
| 54 } | |
| 55 | |
| 56 SSearchTest::~SSearchTest() | |
| 57 { | |
| 58 } | |
| 59 | |
| 60 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name,
char *params ) | |
| 61 { | |
| 62 if (exec) logln("TestSuite SSearchTest: "); | |
| 63 switch (index) { | |
| 64 #if !UCONFIG_NO_BREAK_ITERATION | |
| 65 case 0: name = "searchTest"; | |
| 66 if (exec) searchTest(); | |
| 67 break; | |
| 68 | |
| 69 case 1: name = "offsetTest"; | |
| 70 if (exec) offsetTest(); | |
| 71 break; | |
| 72 | |
| 73 case 2: name = "monkeyTest"; | |
| 74 if (exec) monkeyTest(params); | |
| 75 break; | |
| 76 | |
| 77 case 3: name = "sharpSTest"; | |
| 78 if (exec) sharpSTest(); | |
| 79 break; | |
| 80 | |
| 81 case 4: name = "goodSuffixTest"; | |
| 82 if (exec) goodSuffixTest(); | |
| 83 break; | |
| 84 | |
| 85 case 5: name = "searchTime"; | |
| 86 if (exec) searchTime(); | |
| 87 break; | |
| 88 #endif | |
| 89 default: name = ""; | |
| 90 break; //needed to end loop | |
| 91 } | |
| 92 } | |
| 93 | |
| 94 | |
| 95 #if !UCONFIG_NO_BREAK_ITERATION | |
| 96 | |
| 97 #define PATH_BUFFER_SIZE 2048 | |
| 98 const char *SSearchTest::getPath(char buffer[2048], const char *filename) { | |
| 99 UErrorCode status = U_ZERO_ERROR; | |
| 100 const char *testDataDirectory = IntlTest::getSourceTestData(status); | |
| 101 | |
| 102 if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >=
PATH_BUFFER_SIZE) { | |
| 103 errln("ERROR: getPath() failed - %s", u_errorName(status)); | |
| 104 return NULL; | |
| 105 } | |
| 106 | |
| 107 strcpy(buffer, testDataDirectory); | |
| 108 strcat(buffer, filename); | |
| 109 return buffer; | |
| 110 } | |
| 111 | |
| 112 | |
| 113 void SSearchTest::searchTest() | |
| 114 { | |
| 115 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO | |
| 116 UErrorCode status = U_ZERO_ERROR; | |
| 117 char path[PATH_BUFFER_SIZE]; | |
| 118 const char *testFilePath = getPath(path, "ssearch.xml"); | |
| 119 | |
| 120 if (testFilePath == NULL) { | |
| 121 return; /* Couldn't get path: error message already output. */ | |
| 122 } | |
| 123 | |
| 124 LocalPointer<UXMLParser> parser(UXMLParser::createParser(status)); | |
| 125 TEST_ASSERT_SUCCESS(status); | |
| 126 LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status)); | |
| 127 TEST_ASSERT_SUCCESS(status); | |
| 128 if (U_FAILURE(status)) { | |
| 129 return; | |
| 130 } | |
| 131 | |
| 132 const UnicodeString *debugTestCase = root->getAttribute("debug"); | |
| 133 if (debugTestCase != NULL) { | |
| 134 // setenv("USEARCH_DEBUG", "1", 1); | |
| 135 } | |
| 136 | |
| 137 | |
| 138 const UXMLElement *testCase; | |
| 139 int32_t tc = 0; | |
| 140 | |
| 141 while((testCase = root->nextChildElement(tc)) != NULL) { | |
| 142 | |
| 143 if (testCase->getTagName().compare("test-case") != 0) { | |
| 144 errln("ssearch, unrecognized XML Element in test file"); | |
| 145 continue; | |
| 146 } | |
| 147 const UnicodeString *id = testCase->getAttribute("id"); | |
| 148 *testId = 0; | |
| 149 if (id != NULL) { | |
| 150 id->extract(0, id->length(), testId, sizeof(testId), US_INV); | |
| 151 } | |
| 152 | |
| 153 // If debugging test case has been specified and this is not it, skip to
next. | |
| 154 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { | |
| 155 continue; | |
| 156 } | |
| 157 // | |
| 158 // Get the requested collation strength. | |
| 159 // Default is tertiary if the XML attribute is missing from the test
case. | |
| 160 // | |
| 161 const UnicodeString *strength = testCase->getAttribute("strength"); | |
| 162 UColAttributeValue collatorStrength = UCOL_PRIMARY; | |
| 163 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} | |
| 164 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} | |
| 165 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} | |
| 166 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} | |
| 167 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} | |
| 168 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} | |
| 169 else { | |
| 170 // Bogus value supplied for strength. Shouldn't happen, even from | |
| 171 // typos, if the XML source has been validated. | |
| 172 // This assert is a little deceiving in that strength can be | |
| 173 // any of the allowed values, not just TERTIARY, but it will | |
| 174 // do the job of getting the error output. | |
| 175 TEST_ASSERT(*strength=="TERTIARY") | |
| 176 } | |
| 177 | |
| 178 // | |
| 179 // Get the collator normalization flag. Default is UCOL_OFF. | |
| 180 // | |
| 181 UColAttributeValue normalize = UCOL_OFF; | |
| 182 const UnicodeString *norm = testCase->getAttribute("norm"); | |
| 183 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); | |
| 184 if (norm!=NULL && *norm=="ON") { | |
| 185 normalize = UCOL_ON; | |
| 186 } | |
| 187 | |
| 188 // | |
| 189 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. | |
| 190 // | |
| 191 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; | |
| 192 const UnicodeString *alt = testCase->getAttribute("alternate_handling"); | |
| 193 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"
); | |
| 194 if (alt != NULL && *alt == "SHIFTED") { | |
| 195 alternateHandling = UCOL_SHIFTED; | |
| 196 } | |
| 197 | |
| 198 const UnicodeString defLocale("en"); | |
| 199 char clocale[100]; | |
| 200 const UnicodeString *locale = testCase->getAttribute("locale"); | |
| 201 if (locale == NULL || locale->length()==0) { | |
| 202 locale = &defLocale; | |
| 203 }; | |
| 204 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); | |
| 205 | |
| 206 | |
| 207 UnicodeString text; | |
| 208 UnicodeString target; | |
| 209 UnicodeString pattern; | |
| 210 int32_t expectedMatchStart = -1; | |
| 211 int32_t expectedMatchLimit = -1; | |
| 212 const UXMLElement *n; | |
| 213 int32_t nodeCount = 0; | |
| 214 | |
| 215 n = testCase->getChildElement("pattern"); | |
| 216 TEST_ASSERT(n != NULL); | |
| 217 if (n==NULL) { | |
| 218 continue; | |
| 219 } | |
| 220 text = n->getText(FALSE); | |
| 221 text = text.unescape(); | |
| 222 pattern.append(text); | |
| 223 nodeCount++; | |
| 224 | |
| 225 n = testCase->getChildElement("pre"); | |
| 226 if (n!=NULL) { | |
| 227 text = n->getText(FALSE); | |
| 228 text = text.unescape(); | |
| 229 target.append(text); | |
| 230 nodeCount++; | |
| 231 } | |
| 232 | |
| 233 n = testCase->getChildElement("m"); | |
| 234 if (n!=NULL) { | |
| 235 expectedMatchStart = target.length(); | |
| 236 text = n->getText(FALSE); | |
| 237 text = text.unescape(); | |
| 238 target.append(text); | |
| 239 expectedMatchLimit = target.length(); | |
| 240 nodeCount++; | |
| 241 } | |
| 242 | |
| 243 n = testCase->getChildElement("post"); | |
| 244 if (n!=NULL) { | |
| 245 text = n->getText(FALSE); | |
| 246 text = text.unescape(); | |
| 247 target.append(text); | |
| 248 nodeCount++; | |
| 249 } | |
| 250 | |
| 251 // Check that there weren't extra things in the XML | |
| 252 TEST_ASSERT(nodeCount == testCase->countChildren()); | |
| 253 | |
| 254 // Open a collator and StringSearch based on the parameters | |
| 255 // obtained from the XML. | |
| 256 // | |
| 257 status = U_ZERO_ERROR; | |
| 258 LocalUCollatorPointer collator(ucol_open(clocale, &status)); | |
| 259 ucol_setStrength(collator.getAlias(), collatorStrength); | |
| 260 ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normaliz
e, &status); | |
| 261 ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternat
eHandling, &status); | |
| 262 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer
(), pattern.length(), | |
| 263 target.getBuffer(
), target.length(), | |
| 264 collator.getAlias
(), | |
| 265 NULL, // the
break iterator | |
| 266 &status)); | |
| 267 | |
| 268 TEST_ASSERT_SUCCESS(status); | |
| 269 if (U_FAILURE(status)) { | |
| 270 continue; | |
| 271 } | |
| 272 | |
| 273 int32_t foundStart = 0; | |
| 274 int32_t foundLimit = 0; | |
| 275 UBool foundMatch; | |
| 276 | |
| 277 // | |
| 278 // Do the search, check the match result against the expected results. | |
| 279 // | |
| 280 foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit,
&status); | |
| 281 TEST_ASSERT_SUCCESS(status); | |
| 282 if ((foundMatch && expectedMatchStart<0) || | |
| 283 (foundStart != expectedMatchStart) || | |
| 284 (foundLimit != expectedMatchLimit)) { | |
| 285 TEST_ASSERT(FALSE); // ouput generic error position | |
| 286 infoln("Found, expected match start = %d, %d \n" | |
| 287 "Found, expected match limit = %d, %d", | |
| 288 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); | |
| 289 } | |
| 290 | |
| 291 // In case there are other matches... | |
| 292 // (should we only do this if the test case passed?) | |
| 293 while (foundMatch) { | |
| 294 expectedMatchStart = foundStart; | |
| 295 expectedMatchLimit = foundLimit; | |
| 296 | |
| 297 foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart,
&foundLimit, &status); | |
| 298 } | |
| 299 | |
| 300 uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.l
ength(), | |
| 301 target.getBuffer(), target.length(), | |
| 302 collator.getAlias(), | |
| 303 NULL, | |
| 304 &status)); | |
| 305 | |
| 306 // | |
| 307 // Do the backwards search, check the match result against the expected
results. | |
| 308 // | |
| 309 foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &fo
undStart, &foundLimit, &status); | |
| 310 TEST_ASSERT_SUCCESS(status); | |
| 311 if ((foundMatch && expectedMatchStart<0) || | |
| 312 (foundStart != expectedMatchStart) || | |
| 313 (foundLimit != expectedMatchLimit)) { | |
| 314 TEST_ASSERT(FALSE); // ouput generic error position | |
| 315 infoln("Found, expected backwards match start = %d, %d \n" | |
| 316 "Found, expected backwards match limit = %d, %d", | |
| 317 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); | |
| 318 } | |
| 319 } | |
| 320 #endif | |
| 321 } | |
| 322 | |
// One collation element together with the pair of iterator offsets that
// bracket it in the source text.
struct Order
{
    int32_t order;       // the collation element value
    int32_t lowOffset;   // lower of the two bracketing offsets
    int32_t highOffset;  // higher of the two bracketing offsets
};
| 329 | |
| 330 class OrderList | |
| 331 { | |
| 332 public: | |
| 333 OrderList(); | |
| 334 OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset
= 0); | |
| 335 ~OrderList(); | |
| 336 | |
| 337 int32_t size(void) const; | |
| 338 void add(int32_t order, int32_t low, int32_t high); | |
| 339 const Order *get(int32_t index) const; | |
| 340 int32_t getLowOffset(int32_t index) const; | |
| 341 int32_t getHighOffset(int32_t index) const; | |
| 342 int32_t getOrder(int32_t index) const; | |
| 343 void reverse(void); | |
| 344 UBool compare(const OrderList &other) const; | |
| 345 UBool matchesAt(int32_t offset, const OrderList &other) const; | |
| 346 | |
| 347 private: | |
| 348 Order *list; | |
| 349 int32_t listMax; | |
| 350 int32_t listSize; | |
| 351 }; | |
| 352 | |
| 353 OrderList::OrderList() | |
| 354 : list(NULL), listMax(16), listSize(0) | |
| 355 { | |
| 356 list = new Order[listMax]; | |
| 357 } | |
| 358 | |
| 359 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t strin
gOffset) | |
| 360 : list(NULL), listMax(16), listSize(0) | |
| 361 { | |
| 362 UErrorCode status = U_ZERO_ERROR; | |
| 363 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), stri
ng.length(), &status); | |
| 364 uint32_t strengthMask = 0; | |
| 365 int32_t order, low, high; | |
| 366 | |
| 367 switch (ucol_getStrength(coll)) | |
| 368 { | |
| 369 default: | |
| 370 strengthMask |= UCOL_TERTIARYORDERMASK; | |
| 371 /* fall through */ | |
| 372 | |
| 373 case UCOL_SECONDARY: | |
| 374 strengthMask |= UCOL_SECONDARYORDERMASK; | |
| 375 /* fall through */ | |
| 376 | |
| 377 case UCOL_PRIMARY: | |
| 378 strengthMask |= UCOL_PRIMARYORDERMASK; | |
| 379 } | |
| 380 | |
| 381 list = new Order[listMax]; | |
| 382 | |
| 383 ucol_setOffset(elems, stringOffset, &status); | |
| 384 | |
| 385 do { | |
| 386 low = ucol_getOffset(elems); | |
| 387 order = ucol_next(elems, &status); | |
| 388 high = ucol_getOffset(elems); | |
| 389 | |
| 390 if (order != UCOL_NULLORDER) { | |
| 391 order &= strengthMask; | |
| 392 } | |
| 393 | |
| 394 if (order != UCOL_IGNORABLE) { | |
| 395 add(order, low, high); | |
| 396 } | |
| 397 } while (order != UCOL_NULLORDER); | |
| 398 | |
| 399 ucol_closeElements(elems); | |
| 400 } | |
| 401 | |
| 402 OrderList::~OrderList() | |
| 403 { | |
| 404 delete[] list; | |
| 405 } | |
| 406 | |
| 407 void OrderList::add(int32_t order, int32_t low, int32_t high) | |
| 408 { | |
| 409 if (listSize >= listMax) { | |
| 410 listMax *= 2; | |
| 411 | |
| 412 Order *newList = new Order[listMax]; | |
| 413 | |
| 414 uprv_memcpy(newList, list, listSize * sizeof(Order)); | |
| 415 delete[] list; | |
| 416 list = newList; | |
| 417 } | |
| 418 | |
| 419 list[listSize].order = order; | |
| 420 list[listSize].lowOffset = low; | |
| 421 list[listSize].highOffset = high; | |
| 422 | |
| 423 listSize += 1; | |
| 424 } | |
| 425 | |
| 426 const Order *OrderList::get(int32_t index) const | |
| 427 { | |
| 428 if (index >= listSize) { | |
| 429 return NULL; | |
| 430 } | |
| 431 | |
| 432 return &list[index]; | |
| 433 } | |
| 434 | |
| 435 int32_t OrderList::getLowOffset(int32_t index) const | |
| 436 { | |
| 437 const Order *order = get(index); | |
| 438 | |
| 439 if (order != NULL) { | |
| 440 return order->lowOffset; | |
| 441 } | |
| 442 | |
| 443 return -1; | |
| 444 } | |
| 445 | |
| 446 int32_t OrderList::getHighOffset(int32_t index) const | |
| 447 { | |
| 448 const Order *order = get(index); | |
| 449 | |
| 450 if (order != NULL) { | |
| 451 return order->highOffset; | |
| 452 } | |
| 453 | |
| 454 return -1; | |
| 455 } | |
| 456 | |
| 457 int32_t OrderList::getOrder(int32_t index) const | |
| 458 { | |
| 459 const Order *order = get(index); | |
| 460 | |
| 461 if (order != NULL) { | |
| 462 return order->order; | |
| 463 } | |
| 464 | |
| 465 return UCOL_NULLORDER; | |
| 466 } | |
| 467 | |
| 468 int32_t OrderList::size() const | |
| 469 { | |
| 470 return listSize; | |
| 471 } | |
| 472 | |
| 473 void OrderList::reverse() | |
| 474 { | |
| 475 for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) { | |
| 476 Order swap = list[b]; | |
| 477 | |
| 478 list[b] = list[f]; | |
| 479 list[f] = swap; | |
| 480 } | |
| 481 } | |
| 482 | |
| 483 UBool OrderList::compare(const OrderList &other) const | |
| 484 { | |
| 485 if (listSize != other.listSize) { | |
| 486 return FALSE; | |
| 487 } | |
| 488 | |
| 489 for(int32_t i = 0; i < listSize; i += 1) { | |
| 490 if (list[i].order != other.list[i].order || | |
| 491 list[i].lowOffset != other.list[i].lowOffset || | |
| 492 list[i].highOffset != other.list[i].highOffset) { | |
| 493 return FALSE; | |
| 494 } | |
| 495 } | |
| 496 | |
| 497 return TRUE; | |
| 498 } | |
| 499 | |
| 500 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const | |
| 501 { | |
| 502 // NOTE: sizes include the NULLORDER, which we don't want to compare. | |
| 503 int32_t otherSize = other.size() - 1; | |
| 504 | |
| 505 if (listSize - 1 - offset < otherSize) { | |
| 506 return FALSE; | |
| 507 } | |
| 508 | |
| 509 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { | |
| 510 if (getOrder(i) != other.getOrder(j)) { | |
| 511 return FALSE; | |
| 512 } | |
| 513 } | |
| 514 | |
| 515 return TRUE; | |
| 516 } | |
| 517 | |
| 518 static char *printOffsets(char *buffer, OrderList &list) | |
| 519 { | |
| 520 int32_t size = list.size(); | |
| 521 char *s = buffer; | |
| 522 | |
| 523 for(int32_t i = 0; i < size; i += 1) { | |
| 524 const Order *order = list.get(i); | |
| 525 | |
| 526 if (i != 0) { | |
| 527 s += sprintf(s, ", "); | |
| 528 } | |
| 529 | |
| 530 s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset); | |
| 531 } | |
| 532 | |
| 533 return buffer; | |
| 534 } | |
| 535 | |
| 536 static char *printOrders(char *buffer, OrderList &list) | |
| 537 { | |
| 538 int32_t size = list.size(); | |
| 539 char *s = buffer; | |
| 540 | |
| 541 for(int32_t i = 0; i < size; i += 1) { | |
| 542 const Order *order = list.get(i); | |
| 543 | |
| 544 if (i != 0) { | |
| 545 s += sprintf(s, ", "); | |
| 546 } | |
| 547 | |
| 548 s += sprintf(s, "%8.8X", order->order); | |
| 549 } | |
| 550 | |
| 551 return buffer; | |
| 552 } | |
| 553 | |
| 554 void SSearchTest::offsetTest() | |
| 555 { | |
| 556 const char *test[] = { | |
| 557 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous | |
| 558 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71. | |
| 559 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0", | |
| 560 | |
| 561 "\\ua191\\u16ef\\u2036\\u017a", | |
| 562 | |
| 563 #if 0 | |
| 564 // This results in a complex interaction between contraction, | |
| 565 // expansion and normalization that confuses the backwards offset fixups
. | |
| 566 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", | |
| 567 #endif | |
| 568 | |
| 569 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", | |
| 570 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3", | |
| 571 | |
| 572 "\\u02FE\\u02FF" | |
| 573 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\
\u030A\\u030B\\u030C\\u030D\\u030E\\u030F" | |
| 574 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\
\u031A\\u031B\\u031C\\u031D\\u031E\\u031F" | |
| 575 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\
\u032A\\u032B\\u032C\\u032D\\u032E\\u032F" | |
| 576 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\
\u033A\\u033B\\u033C\\u033D\\u033E\\u033F" | |
| 577 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\
\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081 | |
| 578 | |
| 579 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // cu
rrently not working, see #8081 | |
| 580 "a\\u02FF\\u0301\\u0316", // currently not working, see #8081 | |
| 581 "a\\u02FF\\u0316\\u0301", | |
| 582 "a\\u0430\\u0301\\u0316", | |
| 583 "a\\u0430\\u0316\\u0301", | |
| 584 "abc\\u0E41\\u0301\\u0316", | |
| 585 "abc\\u0E41\\u0316\\u0301", | |
| 586 "\\u0E41\\u0301\\u0316", | |
| 587 "\\u0E41\\u0316\\u0301", | |
| 588 "a\\u0301\\u0316", | |
| 589 "a\\u0316\\u0301", | |
| 590 "\\uAC52\\uAC53", | |
| 591 "\\u34CA\\u34CB", | |
| 592 "\\u11ED\\u11EE", | |
| 593 "\\u30C3\\u30D0", | |
| 594 "p\\u00E9ch\\u00E9", | |
| 595 "a\\u0301\\u0325", | |
| 596 "a\\u0300\\u0325", | |
| 597 "a\\u0325\\u0300", | |
| 598 "A\\u0323\\u0300B", | |
| 599 "A\\u0300\\u0323B", | |
| 600 "A\\u0301\\u0323B", | |
| 601 "A\\u0302\\u0301\\u0323B", | |
| 602 "abc", | |
| 603 "ab\\u0300c", | |
| 604 "ab\\u0300\\u0323c", | |
| 605 " \\uD800\\uDC00\\uDC00", | |
| 606 "a\\uD800\\uDC00\\uDC00", | |
| 607 "A\\u0301\\u0301", | |
| 608 "A\\u0301\\u0323", | |
| 609 "A\\u0301\\u0323B", | |
| 610 "B\\u0301\\u0323C", | |
| 611 "A\\u0300\\u0323B", | |
| 612 "\\u0301A\\u0301\\u0301", | |
| 613 "abcd\\r\\u0301", | |
| 614 "p\\u00EAche", | |
| 615 "pe\\u0302che", | |
| 616 }; | |
| 617 | |
| 618 int32_t testCount = ARRAY_SIZE(test); | |
| 619 UErrorCode status = U_ZERO_ERROR; | |
| 620 RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Loca
le::getEnglish(), status); | |
| 621 if (U_FAILURE(status)) { | |
| 622 errcheckln(status, "Failed to create collator in offsetTest! - %s", u_er
rorName(status)); | |
| 623 return; | |
| 624 } | |
| 625 char buffer[4096]; // A bit of a hack... just happens to be long enough for
all the test cases... | |
| 626 // We could allocate one that's the right size by (CE_co
unt * 10) + 2 | |
| 627 // 10 chars is enough room for 8 hex digits plus ", ". 2
extra chars for "[" and "]" | |
| 628 | |
| 629 col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); | |
| 630 | |
| 631 for(int32_t i = 0; i < testCount; i += 1) { | |
| 632 UnicodeString ts = CharsToUnicodeString(test[i]); | |
| 633 CollationElementIterator *iter = col->createCollationElementIterator(ts)
; | |
| 634 OrderList forwardList; | |
| 635 OrderList backwardList; | |
| 636 int32_t order, low, high; | |
| 637 | |
| 638 do { | |
| 639 low = iter->getOffset(); | |
| 640 order = iter->next(status); | |
| 641 high = iter->getOffset(); | |
| 642 | |
| 643 forwardList.add(order, low, high); | |
| 644 } while (order != CollationElementIterator::NULLORDER); | |
| 645 | |
| 646 iter->reset(); | |
| 647 iter->setOffset(ts.length(), status); | |
| 648 | |
| 649 backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(),
iter->getOffset()); | |
| 650 | |
| 651 do { | |
| 652 high = iter->getOffset(); | |
| 653 order = iter->previous(status); | |
| 654 low = iter->getOffset(); | |
| 655 | |
| 656 if (order == CollationElementIterator::NULLORDER) { | |
| 657 break; | |
| 658 } | |
| 659 | |
| 660 backwardList.add(order, low, high); | |
| 661 } while (TRUE); | |
| 662 | |
| 663 backwardList.reverse(); | |
| 664 | |
| 665 if (forwardList.compare(backwardList)) { | |
| 666 logln("Works with \"%s\"", test[i]); | |
| 667 logln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); | |
| 668 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); | |
| 669 | |
| 670 logln("Forward CEs: [%s]", printOrders(buffer, forwardList)); | |
| 671 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList)); | |
| 672 | |
| 673 logln(); | |
| 674 } else { | |
| 675 errln("Fails with \"%s\"", test[i]); | |
| 676 infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); | |
| 677 infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList))
; | |
| 678 | |
| 679 infoln("Forward CEs: [%s]", printOrders(buffer, forwardList)); | |
| 680 infoln("Backward CEs: [%s]", printOrders(buffer, backwardList)); | |
| 681 | |
| 682 infoln(); | |
| 683 } | |
| 684 delete iter; | |
| 685 } | |
| 686 delete col; | |
| 687 } | |
| 688 | |
#if 0
// Currently compiled out.
// Appends an escaped form of `string` to `buffer` and returns `buffer`:
// characters in the range U+0020..U+007F are copied as they are (a backslash
// is doubled), everything else is written as \uXXXX or \UXXXXXXXX.
static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
{
    const int32_t length = string.length();
    int32_t pos = 0;

    while (pos < length) {
        const UChar32 ch = string.char32At(pos);
        const UBool copyable = (ch >= 0x0020 && ch <= 0x007F);

        if (!copyable) {
            char cbuffer[12];
            const char *format = (ch <= 0xFFFFL) ? "\\u%4.4X" : "\\U%8.8X";

            sprintf(cbuffer, format, ch);
            buffer.append(cbuffer);
        } else if (ch == 0x005C) {
            buffer.append("\\\\");
        } else {
            buffer.append(ch);
        }

        // A supplementary code point takes two code units in the string.
        pos += (ch >= 0x10000L) ? 2 : 1;
    }

    return buffer;
}
#endif
| 721 | |
| 722 void SSearchTest::sharpSTest() | |
| 723 { | |
| 724 UErrorCode status = U_ZERO_ERROR; | |
| 725 UCollator *coll = NULL; | |
| 726 UnicodeString lp = "fuss"; | |
| 727 UnicodeString sp = "fu\\u00DF"; | |
| 728 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball",
"12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", | |
| 729 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", | |
| 730 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "
12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu
\\u00DF", "1234fu\\u00DF"}; | |
| 731 int32_t start = -1, end = -1; | |
| 732 | |
| 733 coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); | |
| 734 TEST_ASSERT_SUCCESS(status); | |
| 735 | |
| 736 UnicodeString lpUnescaped = lp.unescape(); | |
| 737 UnicodeString spUnescaped = sp.unescape(); | |
| 738 | |
| 739 LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBu
ffer(), lpUnescaped.length(), | |
| 740 lpUnescaped.getBuffer
(), lpUnescaped.length(), // actual test data will be set later | |
| 741 coll, | |
| 742 NULL, // the brea
k iterator | |
| 743 &status)); | |
| 744 | |
| 745 LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getB
uffer(), spUnescaped.length(), | |
| 746 spUnescaped.getBuffer
(), spUnescaped.length(), // actual test data will be set later | |
| 747 coll, | |
| 748 NULL, // the brea
k iterator | |
| 749 &status)); | |
| 750 TEST_ASSERT_SUCCESS(status); | |
| 751 | |
| 752 for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) { | |
| 753 UBool bFound; | |
| 754 UnicodeString target = targets[t].unescape(); | |
| 755 | |
| 756 start = end = -1; | |
| 757 usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(),
&status); | |
| 758 bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status); | |
| 759 TEST_ASSERT_SUCCESS(status); | |
| 760 if (bFound) { | |
| 761 logln("Test %d: found long pattern at [%d, %d].", t, start, end); | |
| 762 } else { | |
| 763 dataerrln("Test %d: did not find long pattern.", t); | |
| 764 } | |
| 765 | |
| 766 usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length()
, &status); | |
| 767 bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status); | |
| 768 TEST_ASSERT_SUCCESS(status); | |
| 769 if (bFound) { | |
| 770 logln("Test %d: found long pattern at [%d, %d].", t, start, end); | |
| 771 } else { | |
| 772 dataerrln("Test %d: did not find long pattern.", t); | |
| 773 } | |
| 774 } | |
| 775 | |
| 776 ucol_close(coll); | |
| 777 } | |
| 778 | |
| 779 void SSearchTest::goodSuffixTest() | |
| 780 { | |
| 781 UErrorCode status = U_ZERO_ERROR; | |
| 782 UCollator *coll = NULL; | |
| 783 UnicodeString pat = /*"gcagagag"*/ "fxeld"; | |
| 784 UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld"; | |
| 785 int32_t start = -1, end = -1; | |
| 786 UBool bFound; | |
| 787 | |
| 788 coll = ucol_open(NULL, &status); | |
| 789 TEST_ASSERT_SUCCESS(status); | |
| 790 | |
| 791 LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.l
ength(), | |
| 792 target.getBuffer(), ta
rget.length(), | |
| 793 coll, | |
| 794 NULL, // the break
iterator | |
| 795 &status)); | |
| 796 TEST_ASSERT_SUCCESS(status); | |
| 797 | |
| 798 bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status); | |
| 799 TEST_ASSERT_SUCCESS(status); | |
| 800 if (bFound) { | |
| 801 logln("Found pattern at [%d, %d].", start, end); | |
| 802 } else { | |
| 803 dataerrln("Did not find pattern."); | |
| 804 } | |
| 805 | |
| 806 ucol_close(coll); | |
| 807 } | |
| 808 | |
| 809 // | |
| 810 // searchTime() A quick and dirty performance test for string search. | |
| 811 // Probably doesn't really belong as part of intltest, but it | |
| 812 // does check that the search succeeds, and gets the right resu
lt, | |
| 813 // so it serves as a functionality test also. | |
| 814 // | |
| 815 // To run as a perf test, up the loop count, select by commenti
ng | |
| 816 // and uncommenting in the code the operation to be measured, | |
| 817 // rebuild, and measure the running time of this test alone. | |
| 818 // | |
| 819 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearc
hTest/searchTime | |
| 820 // | |
| 821 void SSearchTest::searchTime() { | |
| 822 static const char *longishText = | |
| 823 "Whylom, as olde stories tellen us,\n" | |
| 824 "Ther was a duk that highte Theseus:\n" | |
| 825 "Of Athenes he was lord and governour,\n" | |
| 826 "And in his tyme swich a conquerour,\n" | |
| 827 "That gretter was ther noon under the sonne.\n" | |
| 828 "Ful many a riche contree hadde he wonne;\n" | |
| 829 "What with his wisdom and his chivalrye,\n" | |
| 830 "He conquered al the regne of Femenye,\n" | |
| 831 "That whylom was y-cleped Scithia;\n" | |
| 832 "And weddede the quene Ipolita,\n" | |
| 833 "And broghte hir hoom with him in his contree\n" | |
| 834 "With muchel glorie and greet solempnitee,\n" | |
| 835 "And eek hir yonge suster Emelye.\n" | |
| 836 "And thus with victorie and with melodye\n" | |
| 837 "Lete I this noble duk to Athenes ryde,\n" | |
| 838 "And al his hoost, in armes, him bisyde.\n" | |
| 839 "And certes, if it nere to long to here,\n" | |
| 840 "I wolde han told yow fully the manere,\n" | |
| 841 "How wonnen was the regne of Femenye\n" | |
| 842 "By Theseus, and by his chivalrye;\n" | |
| 843 "And of the grete bataille for the nones\n" | |
| 844 "Bitwixen Athen's and Amazones;\n" | |
| 845 "And how asseged was Ipolita,\n" | |
| 846 "The faire hardy quene of Scithia;\n" | |
| 847 "And of the feste that was at hir weddinge,\n" | |
| 848 "And of the tempest at hir hoom-cominge;\n" | |
| 849 "But al that thing I moot as now forbere.\n" | |
| 850 "I have, God woot, a large feeld to ere,\n" | |
| 851 "And wayke been the oxen in my plough.\n" | |
| 852 "The remenant of the tale is long y-nough.\n" | |
| 853 "I wol nat letten eek noon of this route;\n" | |
| 854 "Lat every felawe telle his tale aboute,\n" | |
| 855 "And lat see now who shal the soper winne;\n" | |
| 856 "And ther I lefte, I wol ageyn biginne.\n" | |
| 857 "This duk, of whom I make mencioun,\n" | |
| 858 "When he was come almost unto the toun,\n" | |
| 859 "In al his wele and in his moste pryde,\n" | |
| 860 "He was war, as he caste his eye asyde,\n" | |
| 861 "Wher that ther kneled in the hye weye\n" | |
| 862 "A companye of ladies, tweye and tweye,\n" | |
| 863 "Ech after other, clad in clothes blake; \n" | |
| 864 "But swich a cry and swich a wo they make,\n" | |
| 865 "That in this world nis creature livinge,\n" | |
| 866 "That herde swich another weymentinge;\n" | |
| 867 "And of this cry they nolde never stenten,\n" | |
| 868 "Til they the reynes of his brydel henten.\n" | |
| 869 "'What folk ben ye, that at myn hoomcominge\n" | |
| 870 "Perturben so my feste with cryinge'?\n" | |
| 871 "Quod Theseus, 'have ye so greet envye\n" | |
| 872 "Of myn honour, that thus compleyne and crye? \n" | |
| 873 "Or who hath yow misboden, or offended?\n" | |
| 874 "And telleth me if it may been amended;\n" | |
| 875 "And why that ye ben clothed thus in blak'?\n" | |
| 876 "The eldest lady of hem alle spak,\n" | |
| 877 "When she hadde swowned with a deedly chere,\n" | |
| 878 "That it was routhe for to seen and here,\n" | |
| 879 "And seyde: 'Lord, to whom Fortune hath yiven\n" | |
| 880 "Victorie, and as a conquerour to liven,\n" | |
| 881 "Noght greveth us your glorie and your honour;\n" | |
| 882 "But we biseken mercy and socour.\n" | |
| 883 "Have mercy on our wo and our distresse.\n" | |
| 884 "Som drope of pitee, thurgh thy gentilesse,\n" | |
| 885 "Up-on us wrecched wommen lat thou falle.\n" | |
| 886 "For certes, lord, ther nis noon of us alle,\n" | |
| 887 "That she nath been a duchesse or a quene;\n" | |
| 888 "Now be we caitifs, as it is wel sene:\n" | |
| 889 "Thanked be Fortune, and hir false wheel,\n" | |
| 890 "That noon estat assureth to be weel.\n" | |
| 891 "And certes, lord, t'abyden your presence,\n" | |
| 892 "Here in the temple of the goddesse Clemence\n" | |
| 893 "We han ben waytinge al this fourtenight;\n" | |
| 894 "Now help us, lord, sith it is in thy might.\n" | |
| 895 "I wrecche, which that wepe and waille thus,\n" | |
| 896 "Was whylom wyf to king Capaneus,\n" | |
| 897 "That starf at Thebes, cursed be that day!\n" | |
| 898 "And alle we, that been in this array,\n" | |
| 899 "And maken al this lamentacioun,\n" | |
| 900 "We losten alle our housbondes at that toun,\n" | |
| 901 "Whyl that the sege ther-aboute lay.\n" | |
| 902 "And yet now th'olde Creon, weylaway!\n" | |
| 903 "The lord is now of Thebes the citee, \n" | |
| 904 "Fulfild of ire and of iniquitee,\n" | |
| 905 "He, for despyt, and for his tirannye,\n" | |
| 906 "To do the dede bodyes vileinye,\n" | |
| 907 "Of alle our lordes, whiche that ben slawe,\n" | |
| 908 "Hath alle the bodyes on an heep y-drawe,\n" | |
| 909 "And wol nat suffren hem, by noon assent,\n" | |
| 910 "Neither to been y-buried nor y-brent,\n" | |
| 911 "But maketh houndes ete hem in despyt. zet'\n"; | |
| 912 | |
| 913 const char *cPattern = "maketh houndes ete hem"; | |
| 914 //const char *cPattern = "Whylom"; | |
| 915 //const char *cPattern = "zet"; | |
| 916 const char *testId = "searchTime()"; // for error macros. | |
| 917 UnicodeString target = longishText; | |
| 918 UErrorCode status = U_ZERO_ERROR; | |
| 919 | |
| 920 | |
| 921 LocalUCollatorPointer collator(ucol_open("en", &status)); | |
| 922 //ucol_setStrength(collator.getAlias(), collatorStrength); | |
| 923 //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize,
&status); | |
| 924 UnicodeString uPattern = cPattern; | |
| 925 LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(),
uPattern.length(), | |
| 926 target.getBuffer(), t
arget.length(), | |
| 927 collator.getAlias(), | |
| 928 NULL, // the brea
k iterator | |
| 929 &status)); | |
| 930 TEST_ASSERT_SUCCESS(status); | |
| 931 | |
| 932 // int32_t foundStart; | |
| 933 // int32_t foundEnd; | |
| 934 UBool found; | |
| 935 | |
| 936 // Find the match position usgin strstr | |
| 937 const char *pm = strstr(longishText, cPattern); | |
| 938 TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr"); | |
| 939 int32_t refMatchPos = (int32_t)(pm - longishText); | |
| 940 int32_t icuMatchPos; | |
| 941 int32_t icuMatchEnd; | |
| 942 usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status); | |
| 943 TEST_ASSERT_SUCCESS(status); | |
| 944 TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different mat
ch positions."); | |
| 945 | |
| 946 int32_t i; | |
| 947 // int32_t j=0; | |
| 948 | |
| 949 // Try loopcounts around 100000 to some millions, depending on the operation
, | |
| 950 // to get runtimes of at least several seconds. | |
| 951 for (i=0; i<10000; i++) { | |
| 952 found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &s
tatus); | |
| 953 (void)found; // Suppress set but not used warning. | |
| 954 //TEST_ASSERT_SUCCESS(status); | |
| 955 //TEST_ASSERT(found); | |
| 956 | |
| 957 // usearch_setOffset(uss.getAlias(), 0, &status); | |
| 958 // icuMatchPos = usearch_next(uss.getAlias(), &status); | |
| 959 | |
| 960 // The i+j stuff is to confuse the optimizer and get it to actually lea
ve the | |
| 961 // call to strstr in place. | |
| 962 //pm = strstr(longishText+j, cPattern); | |
| 963 //j = (j + i)%5; | |
| 964 } | |
| 965 | |
| 966 //printf("%ld, %d\n", pm-longishText, j); | |
| 967 } | |
| 968 | |
| 969 //------------------------------------------------------------------------------
---------- | |
| 970 // | |
| 971 // Random Numbers. Similar to standard lib rand() and srand() | |
| 972 // Not using library to | |
| 973 // 1. Get same results on all platforms. | |
| 974 // 2. Get access to current seed, to more easily reproduce
failures. | |
| 975 // | |
| 976 //------------------------------------------------------------------------------
--------- | |
// State of the private pseudo-random generator. Kept at file scope so that
// callers can read the current value and assign a new one.
static uint32_t m_seed = 1;

// Advances the generator by one linear-congruential step (unsigned 32-bit
// wrap-around arithmetic) and returns bits 16..30 of the new state, i.e. a
// value in the range [0, 32767].
static uint32_t m_rand()
{
    const uint32_t next = m_seed * 1103515245 + 12345;

    m_seed = next;

    // Same as (next / 65536) % 32768 for an unsigned value.
    return (uint32_t)((next >> 16) & 0x7FFF);
}
| 984 | |
| 985 class Monkey | |
| 986 { | |
| 987 public: | |
| 988 virtual void append(UnicodeString &test, UnicodeString &alternate) = 0; | |
| 989 | |
| 990 protected: | |
| 991 Monkey(); | |
| 992 virtual ~Monkey(); | |
| 993 }; | |
| 994 | |
| 995 Monkey::Monkey() | |
| 996 { | |
| 997 // ook? | |
| 998 } | |
| 999 | |
| 1000 Monkey::~Monkey() | |
| 1001 { | |
| 1002 // ook? | |
| 1003 } | |
| 1004 | |
| 1005 class SetMonkey : public Monkey | |
| 1006 { | |
| 1007 public: | |
| 1008 SetMonkey(const USet *theSet); | |
| 1009 ~SetMonkey(); | |
| 1010 | |
| 1011 virtual void append(UnicodeString &test, UnicodeString &alternate); | |
| 1012 | |
| 1013 private: | |
| 1014 const USet *set; | |
| 1015 }; | |
| 1016 | |
| 1017 SetMonkey::SetMonkey(const USet *theSet) | |
| 1018 : Monkey(), set(theSet) | |
| 1019 { | |
| 1020 // ook? | |
| 1021 } | |
| 1022 | |
| 1023 SetMonkey::~SetMonkey() | |
| 1024 { | |
| 1025 //ook... | |
| 1026 } | |
| 1027 | |
| 1028 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate) | |
| 1029 { | |
| 1030 int32_t size = uset_size(set); | |
| 1031 int32_t index = m_rand() % size; | |
| 1032 UChar32 ch = uset_charAt(set, index); | |
| 1033 UnicodeString str(ch); | |
| 1034 | |
| 1035 test.append(str); | |
| 1036 alternate.append(str); // flip case, or some junk? | |
| 1037 } | |
| 1038 | |
| 1039 class StringSetMonkey : public Monkey | |
| 1040 { | |
| 1041 public: | |
| 1042 StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCol
lData); | |
| 1043 ~StringSetMonkey(); | |
| 1044 | |
| 1045 void append(UnicodeString &testCase, UnicodeString &alternate); | |
| 1046 | |
| 1047 private: | |
| 1048 UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeStr
ing &alternate); | |
| 1049 | |
| 1050 const USet *set; | |
| 1051 UCollator *coll; | |
| 1052 CollData *collData; | |
| 1053 }; | |
| 1054 | |
| 1055 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, Col
lData *theCollData) | |
| 1056 : Monkey(), set(theSet), coll(theCollator), collData(theCollData) | |
| 1057 { | |
| 1058 // ook. | |
| 1059 } | |
| 1060 | |
| 1061 StringSetMonkey::~StringSetMonkey() | |
| 1062 { | |
| 1063 // ook? | |
| 1064 } | |
| 1065 | |
| 1066 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate) | |
| 1067 { | |
| 1068 int32_t itemCount = uset_getItemCount(set), len = 0; | |
| 1069 int32_t index = m_rand() % itemCount; | |
| 1070 UChar32 rangeStart = 0, rangeEnd = 0; | |
| 1071 UChar buffer[16]; | |
| 1072 UErrorCode err = U_ZERO_ERROR; | |
| 1073 | |
| 1074 len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err); | |
| 1075 | |
| 1076 if (len == 0) { | |
| 1077 int32_t offset = m_rand() % (rangeEnd - rangeStart + 1); | |
| 1078 UChar32 ch = rangeStart + offset; | |
| 1079 UnicodeString str(ch); | |
| 1080 | |
| 1081 testCase.append(str); | |
| 1082 generateAlternative(str, alternate); | |
| 1083 } else if (len > 0) { | |
| 1084 // should check that len < 16... | |
| 1085 UnicodeString str(buffer, len); | |
| 1086 | |
| 1087 testCase.append(str); | |
| 1088 generateAlternative(str, alternate); | |
| 1089 } else { | |
| 1090 // shouldn't happen... | |
| 1091 } | |
| 1092 } | |
| 1093 | |
| 1094 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCas
e, UnicodeString &alternate) | |
| 1095 { | |
| 1096 // find out shortest string for the longest sequence of ces. | |
| 1097 // needs to be refined to use dynamic programming, but will be roughly right | |
| 1098 UErrorCode status = U_ZERO_ERROR; | |
| 1099 CEList ceList(coll, testCase, status); | |
| 1100 UnicodeString alt; | |
| 1101 int32_t offset = 0; | |
| 1102 | |
| 1103 if (ceList.size() == 0) { | |
| 1104 return alternate.append(testCase); | |
| 1105 } | |
| 1106 | |
| 1107 while (offset < ceList.size()) { | |
| 1108 int32_t ce = ceList.get(offset); | |
| 1109 const StringList *strings = collData->getStringList(ce); | |
| 1110 | |
| 1111 if (strings == NULL) { | |
| 1112 return alternate.append(testCase); | |
| 1113 } | |
| 1114 | |
| 1115 int32_t stringCount = strings->size(); | |
| 1116 int32_t tries = 0; | |
| 1117 | |
| 1118 // find random string that generates the same CEList | |
| 1119 const CEList *ceList2 = NULL; | |
| 1120 const UnicodeString *string = NULL; | |
| 1121 UBool matches = FALSE; | |
| 1122 | |
| 1123 do { | |
| 1124 int32_t s = m_rand() % stringCount; | |
| 1125 | |
| 1126 if (tries++ > stringCount) { | |
| 1127 alternate.append(testCase); | |
| 1128 return alternate; | |
| 1129 } | |
| 1130 | |
| 1131 string = strings->get(s); | |
| 1132 ceList2 = collData->getCEList(string); | |
| 1133 matches = ceList.matchesAt(offset, ceList2); | |
| 1134 | |
| 1135 if (! matches) { | |
| 1136 collData->freeCEList((CEList *) ceList2); | |
| 1137 } | |
| 1138 } while (! matches); | |
| 1139 | |
| 1140 alt.append(*string); | |
| 1141 offset += ceList2->size(); | |
| 1142 collData->freeCEList(ceList2); | |
| 1143 } | |
| 1144 | |
| 1145 const CEList altCEs(coll, alt, status); | |
| 1146 | |
| 1147 if (ceList.matchesAt(0, &altCEs)) { | |
| 1148 return alternate.append(alt); | |
| 1149 } | |
| 1150 | |
| 1151 return alternate.append(testCase); | |
| 1152 } | |
| 1153 | |
| 1154 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyC
ount, UnicodeString &testCase, UnicodeString &alternate) | |
| 1155 { | |
| 1156 int32_t pieces = (m_rand() % 4) + 1; | |
| 1157 UErrorCode status = U_ZERO_ERROR; | |
| 1158 UBool matches; | |
| 1159 | |
| 1160 do { | |
| 1161 testCase.remove(); | |
| 1162 alternate.remove(); | |
| 1163 monkeys[0]->append(testCase, alternate); | |
| 1164 | |
| 1165 for(int32_t piece = 0; piece < pieces; piece += 1) { | |
| 1166 int32_t monkey = m_rand() % monkeyCount; | |
| 1167 | |
| 1168 monkeys[monkey]->append(testCase, alternate); | |
| 1169 } | |
| 1170 | |
| 1171 const CEList ceTest(coll, testCase, status); | |
| 1172 const CEList ceAlt(coll, alternate, status); | |
| 1173 | |
| 1174 matches = ceTest.matchesAt(0, &ceAlt); | |
| 1175 } while (! matches); | |
| 1176 } | |
| 1177 | |
| 1178 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t
offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd) | |
| 1179 { | |
| 1180 UErrorCode status = U_ZERO_ERROR; | |
| 1181 OrderList targetOrders(coll, target, offset); | |
| 1182 OrderList patternOrders(coll, pattern); | |
| 1183 int32_t targetSize = targetOrders.size() - 1; | |
| 1184 int32_t patternSize = patternOrders.size() - 1; | |
| 1185 UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocale
ByType(coll, ULOC_VALID_LOCALE, &status), | |
| 1186 target.getBuffer(), target.len
gth(), &status); | |
| 1187 | |
| 1188 if (patternSize == 0) { | |
| 1189 // Searching for an empty pattern always fails | |
| 1190 matchStart = matchEnd = -1; | |
| 1191 ubrk_close(charBreakIterator); | |
| 1192 return FALSE; | |
| 1193 } | |
| 1194 | |
| 1195 matchStart = matchEnd = -1; | |
| 1196 | |
| 1197 for(int32_t i = 0; i < targetSize; i += 1) { | |
| 1198 if (targetOrders.matchesAt(i, patternOrders)) { | |
| 1199 int32_t start = targetOrders.getLowOffset(i); | |
| 1200 int32_t maxLimit = targetOrders.getLowOffset(i + patternSize); | |
| 1201 int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1); | |
| 1202 | |
| 1203 // if the low and high offsets of the first CE in | |
| 1204 // the match are the same, it means that the match | |
| 1205 // starts in the middle of an expansion - all but | |
| 1206 // the first CE of the expansion will have the offset | |
| 1207 // of the following character. | |
| 1208 if (start == targetOrders.getHighOffset(i)) { | |
| 1209 continue; | |
| 1210 } | |
| 1211 | |
| 1212 // Make sure match starts on a grapheme boundary | |
| 1213 if (! ubrk_isBoundary(charBreakIterator, start)) { | |
| 1214 continue; | |
| 1215 } | |
| 1216 | |
| 1217 // If the low and high offsets of the CE after the match | |
| 1218 // are the same, it means that the match ends in the middle | |
| 1219 // of an expansion sequence. | |
| 1220 if (maxLimit == targetOrders.getHighOffset(i + patternSize) && | |
| 1221 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) { | |
| 1222 continue; | |
| 1223 } | |
| 1224 | |
| 1225 int32_t mend = maxLimit; | |
| 1226 | |
| 1227 // Find the first grapheme break after the character index | |
| 1228 // of the last CE in the match. If it's after character index | |
| 1229 // that's after the last CE in the match, use that index | |
| 1230 // as the end of the match. | |
| 1231 if (minLimit < maxLimit) { | |
| 1232 // When the last CE's low index is same with its high index, the
CE is likely | |
| 1233 // a part of expansion. In this case, the index is located just
after the | |
| 1234 // character corresponding to the CEs compared above. If the ind
ex is right | |
| 1235 // at the break boundary, move the position to the next boundary
will result | |
| 1236 // incorrect match length when there are ignorable characters ex
ist between | |
| 1237 // the position and the next character produces CE(s). See ticke
t#8482. | |
| 1238 if (minLimit == targetOrders.getHighOffset(i + patternSize - 1)
&& ubrk_isBoundary(charBreakIterator, minLimit)) { | |
| 1239 mend = minLimit; | |
| 1240 } else { | |
| 1241 int32_t nba = ubrk_following(charBreakIterator, minLimit); | |
| 1242 | |
| 1243 if (nba >= targetOrders.getHighOffset(i + patternSize - 1))
{ | |
| 1244 mend = nba; | |
| 1245 } | |
| 1246 } | |
| 1247 } | |
| 1248 | |
| 1249 if (mend > maxLimit) { | |
| 1250 continue; | |
| 1251 } | |
| 1252 | |
| 1253 if (! ubrk_isBoundary(charBreakIterator, mend)) { | |
| 1254 continue; | |
| 1255 } | |
| 1256 | |
| 1257 matchStart = start; | |
| 1258 matchEnd = mend; | |
| 1259 | |
| 1260 ubrk_close(charBreakIterator); | |
| 1261 return TRUE; | |
| 1262 } | |
| 1263 } | |
| 1264 | |
| 1265 ubrk_close(charBreakIterator); | |
| 1266 return FALSE; | |
| 1267 } | |
| 1268 | |
| 1269 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
| 1270 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t d
efaultVal) { | |
| 1271 int32_t val = defaultVal; | |
| 1272 | |
| 1273 name.append(" *= *(-?\\d+)"); | |
| 1274 | |
| 1275 UErrorCode status = U_ZERO_ERROR; | |
| 1276 RegexMatcher m(name, params, 0, status); | |
| 1277 | |
| 1278 if (m.find()) { | |
| 1279 // The param exists. Convert the string to an int. | |
| 1280 char valString[100]; | |
| 1281 int32_t paramLength = m.end(1, status) - m.start(1, status); | |
| 1282 | |
| 1283 if (paramLength >= (int32_t)(sizeof(valString)-1)) { | |
| 1284 paramLength = (int32_t)(sizeof(valString)-2); | |
| 1285 } | |
| 1286 | |
| 1287 params.extract(m.start(1, status), paramLength, valString, sizeof(valStr
ing)); | |
| 1288 val = uprv_strtol(valString, NULL, 10); | |
| 1289 | |
| 1290 // Delete this parameter from the params string. | |
| 1291 m.reset(); | |
| 1292 params = m.replaceFirst("", status); | |
| 1293 } | |
| 1294 | |
| 1295 //U_ASSERT(U_SUCCESS(status)); | |
| 1296 if (! U_SUCCESS(status)) { | |
| 1297 val = defaultVal; | |
| 1298 } | |
| 1299 | |
| 1300 return val; | |
| 1301 } | |
| 1302 #endif | |
| 1303 | |
| 1304 #if !UCONFIG_NO_COLLATION | |
| 1305 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCa
se, const UnicodeString &pattern, const UnicodeString &altPattern, | |
| 1306 const char *name, const char *strength, uint
32_t seed) | |
| 1307 { | |
| 1308 UErrorCode status = U_ZERO_ERROR; | |
| 1309 int32_t actualStart = -1, actualEnd = -1; | |
| 1310 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + alt
Pattern.length(); | |
| 1311 int32_t expectedStart = -1, expectedEnd = -1; | |
| 1312 int32_t notFoundCount = 0; | |
| 1313 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(),
pattern.length(), | |
| 1314 testCase.getBuffer(),
testCase.length(), | |
| 1315 coll, | |
| 1316 NULL, // the brea
k iterator | |
| 1317 &status)); | |
| 1318 | |
| 1319 // **** TODO: find *all* matches, not just first one **** | |
| 1320 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); | |
| 1321 | |
| 1322 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); | |
| 1323 | |
| 1324 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expe
ctedEnd)) { | |
| 1325 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d,
%d]\n" | |
| 1326 " strength=%s seed=%d", | |
| 1327 name, expectedStart, expectedEnd, actualStart, actualEnd, strength
, seed); | |
| 1328 } | |
| 1329 | |
| 1330 if (expectedStart == -1 && actualStart == -1) { | |
| 1331 notFoundCount += 1; | |
| 1332 } | |
| 1333 | |
| 1334 // **** TODO: find *all* matches, not just first one **** | |
| 1335 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); | |
| 1336 | |
| 1337 usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length
(), &status); | |
| 1338 | |
| 1339 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); | |
| 1340 | |
| 1341 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expe
ctedEnd)) { | |
| 1342 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [
%d, %d]\n" | |
| 1343 " strength=%s seed=%d", | |
| 1344 name, expectedStart, expectedEnd, actualStart, actualEnd, strength
, seed); | |
| 1345 } | |
| 1346 | |
| 1347 if (expectedStart == -1 && actualStart == -1) { | |
| 1348 notFoundCount += 1; | |
| 1349 } | |
| 1350 | |
| 1351 return notFoundCount; | |
| 1352 } | |
| 1353 #endif | |
| 1354 | |
| 1355 void SSearchTest::monkeyTest(char *params) | |
| 1356 { | |
| 1357 // ook! | |
| 1358 UErrorCode status = U_ZERO_ERROR; | |
| 1359 //UCollator *coll = ucol_open(NULL, &status); | |
| 1360 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); | |
| 1361 | |
| 1362 if (U_FAILURE(status)) { | |
| 1363 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_er
rorName(status)); | |
| 1364 return; | |
| 1365 } | |
| 1366 | |
| 1367 CollData *monkeyData = new CollData(coll, status); | |
| 1368 | |
| 1369 USet *expansions = uset_openEmpty(); | |
| 1370 USet *contractions = uset_openEmpty(); | |
| 1371 | |
| 1372 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &st
atus); | |
| 1373 | |
| 1374 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39)
; | |
| 1375 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39)
; | |
| 1376 USet *letters = uset_openPattern(letter_pattern, 39, &status); | |
| 1377 SetMonkey letterMonkey(letters); | |
| 1378 StringSetMonkey contractionMonkey(contractions, coll, monkeyData); | |
| 1379 StringSetMonkey expansionMonkey(expansions, coll, monkeyData); | |
| 1380 UnicodeString testCase; | |
| 1381 UnicodeString alternate; | |
| 1382 UnicodeString pattern, altPattern; | |
| 1383 UnicodeString prefix, altPrefix; | |
| 1384 UnicodeString suffix, altSuffix; | |
| 1385 | |
| 1386 Monkey *monkeys[] = { | |
| 1387 &letterMonkey, | |
| 1388 &contractionMonkey, | |
| 1389 &expansionMonkey, | |
| 1390 &contractionMonkey, | |
| 1391 &expansionMonkey, | |
| 1392 &contractionMonkey, | |
| 1393 &expansionMonkey, | |
| 1394 &contractionMonkey, | |
| 1395 &expansionMonkey}; | |
| 1396 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); | |
| 1397 // int32_t nonMatchCount = 0; | |
| 1398 | |
| 1399 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIAR
Y}; | |
| 1400 const char *strengthNames[] = {"primary", "secondary", "tertiary"}; | |
| 1401 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); | |
| 1402 int32_t loopCount = quick? 1000 : 10000; | |
| 1403 int32_t firstStrength = 0; | |
| 1404 int32_t lastStrength = strengthCount - 1; //*/ 0; | |
| 1405 | |
| 1406 if (params != NULL) { | |
| 1407 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
| 1408 UnicodeString p(params); | |
| 1409 | |
| 1410 loopCount = getIntParam("loop", p, loopCount); | |
| 1411 m_seed = getIntParam("seed", p, m_seed); | |
| 1412 | |
| 1413 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, st
atus); | |
| 1414 if (m.find()) { | |
| 1415 UnicodeString breakType = m.group(1, status); | |
| 1416 | |
| 1417 for (int32_t s = 0; s < strengthCount; s += 1) { | |
| 1418 if (breakType == strengthNames[s]) { | |
| 1419 firstStrength = lastStrength = s; | |
| 1420 break; | |
| 1421 } | |
| 1422 } | |
| 1423 | |
| 1424 m.reset(); | |
| 1425 p = m.replaceFirst("", status); | |
| 1426 } | |
| 1427 | |
| 1428 if (RegexMatcher("\\S", p, 0, status).find()) { | |
| 1429 // Each option is stripped out of the option string as it is process
ed. | |
| 1430 // All options have been checked. The option string should have bee
n completely emptied.. | |
| 1431 char buf[100]; | |
| 1432 p.extract(buf, sizeof(buf), NULL, status); | |
| 1433 buf[sizeof(buf)-1] = 0; | |
| 1434 errln("Unrecognized or extra parameter: %s\n", buf); | |
| 1435 return; | |
| 1436 } | |
| 1437 #else | |
| 1438 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring
parameters."); | |
| 1439 #endif | |
| 1440 } | |
| 1441 | |
| 1442 for(int32_t s = firstStrength; s <= lastStrength; s += 1) { | |
| 1443 int32_t notFoundCount = 0; | |
| 1444 | |
| 1445 logln("Setting strength to %s.", strengthNames[s]); | |
| 1446 ucol_setStrength(coll, strengths[s]); | |
| 1447 | |
| 1448 // TODO: try alternate prefix and suffix too? | |
| 1449 // TODO: alternates are only equal at primary strength. Is this OK? | |
| 1450 for(int32_t t = 0; t < loopCount; t += 1) { | |
| 1451 uint32_t seed = m_seed; | |
| 1452 // int32_t nmc = 0; | |
| 1453 | |
| 1454 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); | |
| 1455 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); | |
| 1456 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); | |
| 1457 | |
| 1458 // pattern | |
| 1459 notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern,
"pattern", strengthNames[s], seed); | |
| 1460 | |
| 1461 testCase.remove(); | |
| 1462 testCase.append(prefix); | |
| 1463 testCase.append(/*alt*/pattern); | |
| 1464 | |
| 1465 // prefix + pattern | |
| 1466 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern,
"prefix + pattern", strengthNames[s], seed); | |
| 1467 | |
| 1468 testCase.append(suffix); | |
| 1469 | |
| 1470 // prefix + pattern + suffix | |
| 1471 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern,
"prefix + pattern + suffix", strengthNames[s], seed); | |
| 1472 | |
| 1473 testCase.remove(); | |
| 1474 testCase.append(pattern); | |
| 1475 testCase.append(suffix); | |
| 1476 | |
| 1477 // pattern + suffix | |
| 1478 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern,
"pattern + suffix", strengthNames[s], seed); | |
| 1479 } | |
| 1480 | |
| 1481 logln("For strength %s the not found count is %d.", strengthNames[s], not
FoundCount); | |
| 1482 } | |
| 1483 | |
| 1484 uset_close(contractions); | |
| 1485 uset_close(expansions); | |
| 1486 uset_close(letters); | |
| 1487 delete monkeyData; | |
| 1488 | |
| 1489 ucol_close(coll); | |
| 1490 } | |
| 1491 | |
| 1492 #endif | |
| 1493 | |
| 1494 #endif | |
| OLD | NEW |