OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2005-2010, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 */ |
| 7 |
| 8 |
| 9 #include "unicode/utypes.h" |
| 10 |
| 11 #if !UCONFIG_NO_COLLATION |
| 12 |
| 13 #include "unicode/unistr.h" |
| 14 #include "unicode/putil.h" |
| 15 #include "unicode/usearch.h" |
| 16 |
| 17 #include "cmemory.h" |
| 18 #include "unicode/coll.h" |
| 19 #include "unicode/tblcoll.h" |
| 20 #include "unicode/coleitr.h" |
| 21 #include "unicode/ucoleitr.h" |
| 22 |
| 23 #include "unicode/regex.h" // TODO: make conditional on regexp being buil
t. |
| 24 |
| 25 #include "unicode/uniset.h" |
| 26 #include "unicode/uset.h" |
| 27 #include "unicode/ustring.h" |
| 28 #include "hash.h" |
| 29 #include "uhash.h" |
| 30 #include "ucol_imp.h" |
| 31 |
| 32 #include "intltest.h" |
| 33 #include "ssearch.h" |
| 34 |
| 35 #include "unicode/colldata.h" |
| 36 #include "unicode/bmsearch.h" |
| 37 #include "unicode/bms.h" |
| 38 |
| 39 #include "xmlparser.h" |
| 40 #include "ucbuf.h" |
| 41 |
| 42 #include <stdlib.h> |
| 43 #include <string.h> |
| 44 #include <stdio.h> |
| 45 |
| 46 char testId[100]; |
| 47 |
| 48 #define TEST_ASSERT(x) {if (!(x)) { \ |
| 49 errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, t
estId);}} |
| 50 |
| 51 #define TEST_ASSERT_M(x, m) {if (!(x)) { \ |
| 52 errln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return
;}} |
| 53 |
| 54 #define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \ |
| 55 dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \ |
| 56 __FILE__, __LINE__, testId, u_errorName(errcode));}} |
| 57 |
| 58 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) |
| 59 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) |
| 60 #define DELETE_ARRAY(array) uprv_free((void *) (array)) |
| 61 |
| 62 //--------------------------------------------------------------------------- |
| 63 // |
| 64 // Test class boilerplate |
| 65 // |
| 66 //--------------------------------------------------------------------------- |
| 67 SSearchTest::SSearchTest() |
| 68 { |
| 69 } |
| 70 |
| 71 SSearchTest::~SSearchTest() |
| 72 { |
| 73 } |
| 74 |
| 75 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name,
char *params ) |
| 76 { |
| 77 if (exec) logln("TestSuite SSearchTest: "); |
| 78 switch (index) { |
| 79 #if !UCONFIG_NO_BREAK_ITERATION |
| 80 case 0: name = "searchTest"; |
| 81 if (exec) searchTest(); |
| 82 break; |
| 83 |
| 84 case 1: name = "offsetTest"; |
| 85 if (exec) offsetTest(); |
| 86 break; |
| 87 |
| 88 case 2: name = "monkeyTest"; |
| 89 if (exec) monkeyTest(params); |
| 90 break; |
| 91 |
| 92 case 3: name = "bmMonkeyTest"; |
| 93 if (exec) bmMonkeyTest(params); |
| 94 break; |
| 95 |
| 96 case 4: name = "boyerMooreTest"; |
| 97 if (exec) boyerMooreTest(); |
| 98 break; |
| 99 |
| 100 case 5: name = "goodSuffixTest"; |
| 101 if (exec) goodSuffixTest(); |
| 102 break; |
| 103 |
| 104 case 6: name = "searchTime"; |
| 105 if (exec) searchTime(); |
| 106 break; |
| 107 |
| 108 case 7: name = "bmsTest"; |
| 109 if (exec) bmsTest(); |
| 110 break; |
| 111 |
| 112 case 8: name = "bmSearchTest"; |
| 113 if (exec) bmSearchTest(); |
| 114 break; |
| 115 |
| 116 case 9: name = "udhrTest"; |
| 117 if (exec) udhrTest(); |
| 118 break; |
| 119 case 10: name = "stringListTest"; |
| 120 if (exec) stringListTest(); |
| 121 break; |
| 122 #endif |
| 123 default: name = ""; |
| 124 break; //needed to end loop |
| 125 } |
| 126 } |
| 127 |
| 128 |
| 129 #if !UCONFIG_NO_BREAK_ITERATION |
| 130 |
| 131 #define PATH_BUFFER_SIZE 2048 |
| 132 const char *SSearchTest::getPath(char buffer[2048], const char *filename) { |
| 133 UErrorCode status = U_ZERO_ERROR; |
| 134 const char *testDataDirectory = IntlTest::getSourceTestData(status); |
| 135 |
| 136 if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >=
PATH_BUFFER_SIZE) { |
| 137 errln("ERROR: getPath() failed - %s", u_errorName(status)); |
| 138 return NULL; |
| 139 } |
| 140 |
| 141 strcpy(buffer, testDataDirectory); |
| 142 strcat(buffer, filename); |
| 143 return buffer; |
| 144 } |
| 145 |
| 146 |
| 147 void SSearchTest::searchTest() |
| 148 { |
| 149 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO |
| 150 UErrorCode status = U_ZERO_ERROR; |
| 151 char path[PATH_BUFFER_SIZE]; |
| 152 const char *testFilePath = getPath(path, "ssearch.xml"); |
| 153 |
| 154 if (testFilePath == NULL) { |
| 155 return; /* Couldn't get path: error message already output. */ |
| 156 } |
| 157 |
| 158 LocalPointer<UXMLParser> parser(UXMLParser::createParser(status)); |
| 159 TEST_ASSERT_SUCCESS(status); |
| 160 LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status)); |
| 161 TEST_ASSERT_SUCCESS(status); |
| 162 if (U_FAILURE(status)) { |
| 163 return; |
| 164 } |
| 165 |
| 166 const UnicodeString *debugTestCase = root->getAttribute("debug"); |
| 167 if (debugTestCase != NULL) { |
| 168 // setenv("USEARCH_DEBUG", "1", 1); |
| 169 } |
| 170 |
| 171 |
| 172 const UXMLElement *testCase; |
| 173 int32_t tc = 0; |
| 174 |
| 175 while((testCase = root->nextChildElement(tc)) != NULL) { |
| 176 |
| 177 if (testCase->getTagName().compare("test-case") != 0) { |
| 178 errln("ssearch, unrecognized XML Element in test file"); |
| 179 continue; |
| 180 } |
| 181 const UnicodeString *id = testCase->getAttribute("id"); |
| 182 *testId = 0; |
| 183 if (id != NULL) { |
| 184 id->extract(0, id->length(), testId, sizeof(testId), US_INV); |
| 185 } |
| 186 |
| 187 // If debugging test case has been specified and this is not it, skip to
next. |
| 188 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { |
| 189 continue; |
| 190 } |
| 191 // |
| 192 // Get the requested collation strength. |
| 193 // Default is tertiary if the XML attribute is missing from the test
case. |
| 194 // |
| 195 const UnicodeString *strength = testCase->getAttribute("strength"); |
| 196 UColAttributeValue collatorStrength = UCOL_PRIMARY; |
| 197 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} |
| 198 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} |
| 199 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} |
| 200 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} |
| 201 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} |
| 202 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} |
| 203 else { |
| 204 // Bogus value supplied for strength. Shouldn't happen, even from |
| 205 // typos, if the XML source has been validated. |
| 206 // This assert is a little deceiving in that strength can be |
| 207 // any of the allowed values, not just TERTIARY, but it will |
| 208 // do the job of getting the error output. |
| 209 TEST_ASSERT(*strength=="TERTIARY") |
| 210 } |
| 211 |
| 212 // |
| 213 // Get the collator normalization flag. Default is UCOL_OFF. |
| 214 // |
| 215 UColAttributeValue normalize = UCOL_OFF; |
| 216 const UnicodeString *norm = testCase->getAttribute("norm"); |
| 217 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); |
| 218 if (norm!=NULL && *norm=="ON") { |
| 219 normalize = UCOL_ON; |
| 220 } |
| 221 |
| 222 // |
| 223 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. |
| 224 // |
| 225 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; |
| 226 const UnicodeString *alt = testCase->getAttribute("alternate_handling"); |
| 227 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"
); |
| 228 if (alt != NULL && *alt == "SHIFTED") { |
| 229 alternateHandling = UCOL_SHIFTED; |
| 230 } |
| 231 |
| 232 const UnicodeString defLocale("en"); |
| 233 char clocale[100]; |
| 234 const UnicodeString *locale = testCase->getAttribute("locale"); |
| 235 if (locale == NULL || locale->length()==0) { |
| 236 locale = &defLocale; |
| 237 }; |
| 238 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); |
| 239 |
| 240 |
| 241 UnicodeString text; |
| 242 UnicodeString target; |
| 243 UnicodeString pattern; |
| 244 int32_t expectedMatchStart = -1; |
| 245 int32_t expectedMatchLimit = -1; |
| 246 const UXMLElement *n; |
| 247 int32_t nodeCount = 0; |
| 248 |
| 249 n = testCase->getChildElement("pattern"); |
| 250 TEST_ASSERT(n != NULL); |
| 251 if (n==NULL) { |
| 252 continue; |
| 253 } |
| 254 text = n->getText(FALSE); |
| 255 text = text.unescape(); |
| 256 pattern.append(text); |
| 257 nodeCount++; |
| 258 |
| 259 n = testCase->getChildElement("pre"); |
| 260 if (n!=NULL) { |
| 261 text = n->getText(FALSE); |
| 262 text = text.unescape(); |
| 263 target.append(text); |
| 264 nodeCount++; |
| 265 } |
| 266 |
| 267 n = testCase->getChildElement("m"); |
| 268 if (n!=NULL) { |
| 269 expectedMatchStart = target.length(); |
| 270 text = n->getText(FALSE); |
| 271 text = text.unescape(); |
| 272 target.append(text); |
| 273 expectedMatchLimit = target.length(); |
| 274 nodeCount++; |
| 275 } |
| 276 |
| 277 n = testCase->getChildElement("post"); |
| 278 if (n!=NULL) { |
| 279 text = n->getText(FALSE); |
| 280 text = text.unescape(); |
| 281 target.append(text); |
| 282 nodeCount++; |
| 283 } |
| 284 |
| 285 // Check that there weren't extra things in the XML |
| 286 TEST_ASSERT(nodeCount == testCase->countChildren()); |
| 287 |
| 288 // Open a collator and StringSearch based on the parameters |
| 289 // obtained from the XML. |
| 290 // |
| 291 status = U_ZERO_ERROR; |
| 292 LocalUCollatorPointer collator(ucol_open(clocale, &status)); |
| 293 ucol_setStrength(collator.getAlias(), collatorStrength); |
| 294 ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normaliz
e, &status); |
| 295 ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternat
eHandling, &status); |
| 296 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer
(), pattern.length(), |
| 297 target.getBuffer(
), target.length(), |
| 298 collator.getAlias
(), |
| 299 NULL, // the
break iterator |
| 300 &status)); |
| 301 |
| 302 TEST_ASSERT_SUCCESS(status); |
| 303 if (U_FAILURE(status)) { |
| 304 continue; |
| 305 } |
| 306 |
| 307 int32_t foundStart = 0; |
| 308 int32_t foundLimit = 0; |
| 309 UBool foundMatch; |
| 310 |
| 311 // |
| 312 // Do the search, check the match result against the expected results. |
| 313 // |
| 314 foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit,
&status); |
| 315 TEST_ASSERT_SUCCESS(status); |
| 316 if ((foundMatch && expectedMatchStart<0) || |
| 317 (foundStart != expectedMatchStart) || |
| 318 (foundLimit != expectedMatchLimit)) { |
| 319 TEST_ASSERT(FALSE); // ouput generic error position |
| 320 infoln("Found, expected match start = %d, %d \n" |
| 321 "Found, expected match limit = %d, %d", |
| 322 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); |
| 323 } |
| 324 |
| 325 // In case there are other matches... |
| 326 // (should we only do this if the test case passed?) |
| 327 while (foundMatch) { |
| 328 expectedMatchStart = foundStart; |
| 329 expectedMatchLimit = foundLimit; |
| 330 |
| 331 foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart,
&foundLimit, &status); |
| 332 } |
| 333 |
| 334 uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.l
ength(), |
| 335 target.getBuffer(), target.length(), |
| 336 collator.getAlias(), |
| 337 NULL, |
| 338 &status)); |
| 339 |
| 340 // |
| 341 // Do the backwards search, check the match result against the expected
results. |
| 342 // |
| 343 foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &fo
undStart, &foundLimit, &status); |
| 344 TEST_ASSERT_SUCCESS(status); |
| 345 if ((foundMatch && expectedMatchStart<0) || |
| 346 (foundStart != expectedMatchStart) || |
| 347 (foundLimit != expectedMatchLimit)) { |
| 348 TEST_ASSERT(FALSE); // ouput generic error position |
| 349 infoln("Found, expected backwards match start = %d, %d \n" |
| 350 "Found, expected backwards match limit = %d, %d", |
| 351 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); |
| 352 } |
| 353 } |
| 354 #endif |
| 355 } |
| 356 |
| 357 struct UdhrTestCase |
| 358 { |
| 359 const char *locale; |
| 360 const char *file; |
| 361 }; |
| 362 |
| 363 void SSearchTest::udhrTest() |
| 364 { |
| 365 UErrorCode status = U_ZERO_ERROR; |
| 366 char path[PATH_BUFFER_SIZE]; |
| 367 const char *udhrPath = getPath(path, "udhr"); |
| 368 |
| 369 if (udhrPath == NULL) { |
| 370 // couldn't get path: error message already output... |
| 371 return; |
| 372 } |
| 373 |
| 374 UdhrTestCase testCases[] = { |
| 375 {"en", "udhr_eng.txt"}, |
| 376 {"de", "udhr_deu_1996.txt"}, |
| 377 {"fr", "udhr_fra.txt"}, |
| 378 {"ru", "udhr_rus.txt"}, |
| 379 {"th", "udhr_tha.txt"}, |
| 380 {"ja", "udhr_jpn.txt"}, |
| 381 {"ko", "udhr_kor.txt"}, |
| 382 {"zh", "udhr_cmn_hans.txt"}, |
| 383 {"zh_Hant", "udhr_cmn_hant.txt"} |
| 384 }; |
| 385 |
| 386 int32_t testCount = ARRAY_SIZE(testCases); |
| 387 |
| 388 for (int32_t t = 0; t < testCount; t += 1) { |
| 389 int32_t len = 0; |
| 390 char *resolvedFileName = NULL; |
| 391 const char *encoding = NULL; |
| 392 UCHARBUF *ucharBuf = NULL; |
| 393 |
| 394 ucbuf_resolveFileName(udhrPath, testCases[t].file, NULL, &len, &status); |
| 395 resolvedFileName = NEW_ARRAY(char, len); |
| 396 |
| 397 if(resolvedFileName == NULL){ |
| 398 continue; |
| 399 } |
| 400 |
| 401 if(status == U_BUFFER_OVERFLOW_ERROR){ |
| 402 status = U_ZERO_ERROR; |
| 403 } |
| 404 |
| 405 ucbuf_resolveFileName(udhrPath, testCases[t].file, resolvedFileName, &le
n, &status); |
| 406 ucharBuf = ucbuf_open(resolvedFileName, &encoding, TRUE, FALSE, &status)
; |
| 407 |
| 408 DELETE_ARRAY(resolvedFileName); |
| 409 |
| 410 if(U_FAILURE(status)){ |
| 411 infoln("Could not open the input file %s. Test skipped\n", testCases
[t].file); |
| 412 continue; |
| 413 } |
| 414 |
| 415 int32_t targetLen = 0; |
| 416 const UChar *target = ucbuf_getBuffer(ucharBuf, &targetLen, &status); |
| 417 |
| 418 /* The first line of the file contains the pattern */ |
| 419 int32_t start = 0, end = 0, plen = 0; |
| 420 |
| 421 for(end = start; ; end += 1) { |
| 422 UChar ch = target[end]; |
| 423 |
| 424 if (ch == 0x000A || ch == 0x000D || ch == 0x2028) { |
| 425 break; |
| 426 } |
| 427 } |
| 428 |
| 429 plen = end - start; |
| 430 |
| 431 UChar *pattern = NEW_ARRAY(UChar, plen); |
| 432 for (int32_t i = 0; i < plen; i += 1) { |
| 433 pattern[i] = target[start++]; |
| 434 } |
| 435 |
| 436 int32_t offset = 0; |
| 437 UCollator *coll = ucol_open(testCases[t].locale, &status); |
| 438 UCD *ucd = NULL; |
| 439 BMS *bms = NULL; |
| 440 |
| 441 if (U_FAILURE(status)) { |
| 442 errln("Could not open collator for %s", testCases[t].locale); |
| 443 goto delete_collator; |
| 444 } |
| 445 |
| 446 ucd = ucd_open(coll, &status); |
| 447 |
| 448 if (U_FAILURE(status)) { |
| 449 errln("Could not open CollData object for %s", testCases[t].locale); |
| 450 goto delete_ucd; |
| 451 } |
| 452 |
| 453 bms = bms_open(ucd, pattern, plen, target, targetLen, &status); |
| 454 |
| 455 if (U_FAILURE(status)) { |
| 456 errln("Could not open search object for %s", testCases[t].locale); |
| 457 goto delete_bms; |
| 458 } |
| 459 |
| 460 start = end = -1; |
| 461 while (bms_search(bms, offset, &start, &end)) { |
| 462 offset = end; |
| 463 } |
| 464 |
| 465 if (offset == 0) { |
| 466 errln("Could not find pattern - locale: %s, file: %s ", testCases[t]
.locale, testCases[t].file); |
| 467 } |
| 468 |
| 469 delete_bms: |
| 470 bms_close(bms); |
| 471 |
| 472 delete_ucd: |
| 473 ucd_close(ucd); |
| 474 |
| 475 delete_collator: |
| 476 ucol_close(coll); |
| 477 |
| 478 DELETE_ARRAY(pattern); |
| 479 ucbuf_close(ucharBuf); |
| 480 } |
| 481 |
| 482 ucd_flushCache(); |
| 483 } |
| 484 |
| 485 void SSearchTest::bmSearchTest() |
| 486 { |
| 487 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 488 UErrorCode status = U_ZERO_ERROR; |
| 489 char path[PATH_BUFFER_SIZE]; |
| 490 const char *testFilePath = getPath(path, "ssearch.xml"); |
| 491 |
| 492 if (testFilePath == NULL) { |
| 493 return; /* Couldn't get path: error message already output. */ |
| 494 } |
| 495 |
| 496 UXMLParser *parser = UXMLParser::createParser(status); |
| 497 TEST_ASSERT_SUCCESS(status); |
| 498 UXMLElement *root = parser->parseFile(testFilePath, status); |
| 499 TEST_ASSERT_SUCCESS(status); |
| 500 if (U_FAILURE(status)) { |
| 501 return; |
| 502 } |
| 503 |
| 504 const UnicodeString *debugTestCase = root->getAttribute("debug"); |
| 505 if (debugTestCase != NULL) { |
| 506 // setenv("USEARCH_DEBUG", "1", 1); |
| 507 } |
| 508 |
| 509 |
| 510 const UXMLElement *testCase; |
| 511 int32_t tc = 0; |
| 512 |
| 513 while((testCase = root->nextChildElement(tc)) != NULL) { |
| 514 |
| 515 if (testCase->getTagName().compare("test-case") != 0) { |
| 516 errln("ssearch, unrecognized XML Element in test file"); |
| 517 continue; |
| 518 } |
| 519 const UnicodeString *id = testCase->getAttribute("id"); |
| 520 *testId = 0; |
| 521 if (id != NULL) { |
| 522 id->extract(0, id->length(), testId, sizeof(testId), US_INV); |
| 523 } |
| 524 |
| 525 // If debugging test case has been specified and this is not it, skip to
next. |
| 526 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { |
| 527 continue; |
| 528 } |
| 529 // |
| 530 // Get the requested collation strength. |
| 531 // Default is tertiary if the XML attribute is missing from the test
case. |
| 532 // |
| 533 const UnicodeString *strength = testCase->getAttribute("strength"); |
| 534 UColAttributeValue collatorStrength = UCOL_PRIMARY; |
| 535 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} |
| 536 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} |
| 537 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} |
| 538 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} |
| 539 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} |
| 540 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} |
| 541 else { |
| 542 // Bogus value supplied for strength. Shouldn't happen, even from |
| 543 // typos, if the XML source has been validated. |
| 544 // This assert is a little deceiving in that strength can be |
| 545 // any of the allowed values, not just TERTIARY, but it will |
| 546 // do the job of getting the error output. |
| 547 TEST_ASSERT(*strength=="TERTIARY") |
| 548 } |
| 549 |
| 550 // |
| 551 // Get the collator normalization flag. Default is UCOL_OFF. |
| 552 // |
| 553 UColAttributeValue normalize = UCOL_OFF; |
| 554 const UnicodeString *norm = testCase->getAttribute("norm"); |
| 555 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); |
| 556 if (norm!=NULL && *norm=="ON") { |
| 557 normalize = UCOL_ON; |
| 558 } |
| 559 |
| 560 // |
| 561 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. |
| 562 // |
| 563 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; |
| 564 const UnicodeString *alt = testCase->getAttribute("alternate_handling"); |
| 565 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"
); |
| 566 if (alt != NULL && *alt == "SHIFTED") { |
| 567 alternateHandling = UCOL_SHIFTED; |
| 568 } |
| 569 |
| 570 const UnicodeString defLocale("en"); |
| 571 char clocale[100]; |
| 572 const UnicodeString *locale = testCase->getAttribute("locale"); |
| 573 if (locale == NULL || locale->length()==0) { |
| 574 locale = &defLocale; |
| 575 }; |
| 576 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); |
| 577 |
| 578 |
| 579 UnicodeString text; |
| 580 UnicodeString target; |
| 581 UnicodeString pattern; |
| 582 int32_t expectedMatchStart = -1; |
| 583 int32_t expectedMatchLimit = -1; |
| 584 const UXMLElement *n; |
| 585 int32_t nodeCount = 0; |
| 586 |
| 587 n = testCase->getChildElement("pattern"); |
| 588 TEST_ASSERT(n != NULL); |
| 589 if (n==NULL) { |
| 590 continue; |
| 591 } |
| 592 text = n->getText(FALSE); |
| 593 text = text.unescape(); |
| 594 pattern.append(text); |
| 595 nodeCount++; |
| 596 |
| 597 n = testCase->getChildElement("pre"); |
| 598 if (n!=NULL) { |
| 599 text = n->getText(FALSE); |
| 600 text = text.unescape(); |
| 601 target.append(text); |
| 602 nodeCount++; |
| 603 } |
| 604 |
| 605 n = testCase->getChildElement("m"); |
| 606 if (n!=NULL) { |
| 607 expectedMatchStart = target.length(); |
| 608 text = n->getText(FALSE); |
| 609 text = text.unescape(); |
| 610 target.append(text); |
| 611 expectedMatchLimit = target.length(); |
| 612 nodeCount++; |
| 613 } |
| 614 |
| 615 n = testCase->getChildElement("post"); |
| 616 if (n!=NULL) { |
| 617 text = n->getText(FALSE); |
| 618 text = text.unescape(); |
| 619 target.append(text); |
| 620 nodeCount++; |
| 621 } |
| 622 |
| 623 // Check that there weren't extra things in the XML |
| 624 TEST_ASSERT(nodeCount == testCase->countChildren()); |
| 625 |
| 626 // Open a collator and StringSearch based on the parameters |
| 627 // obtained from the XML. |
| 628 // |
| 629 status = U_ZERO_ERROR; |
| 630 UCollator *collator = ucol_open(clocale, &status); |
| 631 ucol_setStrength(collator, collatorStrength); |
| 632 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, normalize, &status)
; |
| 633 ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, alternateHandling,
&status); |
| 634 UCD *ucd = ucd_open(collator, &status); |
| 635 BMS *bms = bms_open(ucd, pattern.getBuffer(), pattern.length(), target.g
etBuffer(), target.length(), &status); |
| 636 |
| 637 TEST_ASSERT_SUCCESS(status); |
| 638 if (U_FAILURE(status)) { |
| 639 bms_close(bms); |
| 640 ucd_close(ucd); |
| 641 ucol_close(collator); |
| 642 continue; |
| 643 } |
| 644 |
| 645 int32_t foundStart = 0; |
| 646 int32_t foundLimit = 0; |
| 647 UBool foundMatch; |
| 648 |
| 649 // |
| 650 // Do the search, check the match result against the expected results. |
| 651 // |
| 652 foundMatch = bms_search(bms, 0, &foundStart, &foundLimit); |
| 653 //TEST_ASSERT_SUCCESS(status); |
| 654 if ((foundMatch && expectedMatchStart < 0) || |
| 655 (foundStart != expectedMatchStart) || |
| 656 (foundLimit != expectedMatchLimit)) { |
| 657 TEST_ASSERT(FALSE); // ouput generic error position |
| 658 infoln("Found, expected match start = %d, %d \n" |
| 659 "Found, expected match limit = %d, %d", |
| 660 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); |
| 661 } |
| 662 |
| 663 bms_close(bms); |
| 664 ucd_close(ucd); |
| 665 ucol_close(collator); |
| 666 } |
| 667 |
| 668 ucd_flushCache(); |
| 669 delete root; |
| 670 delete parser; |
| 671 #endif |
| 672 } |
| 673 |
| 674 struct Order |
| 675 { |
| 676 int32_t order; |
| 677 int32_t lowOffset; |
| 678 int32_t highOffset; |
| 679 }; |
| 680 |
| 681 class OrderList |
| 682 { |
| 683 public: |
| 684 OrderList(); |
| 685 OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset
= 0); |
| 686 ~OrderList(); |
| 687 |
| 688 int32_t size(void) const; |
| 689 void add(int32_t order, int32_t low, int32_t high); |
| 690 const Order *get(int32_t index) const; |
| 691 int32_t getLowOffset(int32_t index) const; |
| 692 int32_t getHighOffset(int32_t index) const; |
| 693 int32_t getOrder(int32_t index) const; |
| 694 void reverse(void); |
| 695 UBool compare(const OrderList &other) const; |
| 696 UBool matchesAt(int32_t offset, const OrderList &other) const; |
| 697 |
| 698 private: |
| 699 Order *list; |
| 700 int32_t listMax; |
| 701 int32_t listSize; |
| 702 }; |
| 703 |
| 704 OrderList::OrderList() |
| 705 : list(NULL), listMax(16), listSize(0) |
| 706 { |
| 707 list = new Order[listMax]; |
| 708 } |
| 709 |
| 710 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t strin
gOffset) |
| 711 : list(NULL), listMax(16), listSize(0) |
| 712 { |
| 713 UErrorCode status = U_ZERO_ERROR; |
| 714 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), stri
ng.length(), &status); |
| 715 uint32_t strengthMask = 0; |
| 716 int32_t order, low, high; |
| 717 |
| 718 switch (ucol_getStrength(coll)) |
| 719 { |
| 720 default: |
| 721 strengthMask |= UCOL_TERTIARYORDERMASK; |
| 722 /* fall through */ |
| 723 |
| 724 case UCOL_SECONDARY: |
| 725 strengthMask |= UCOL_SECONDARYORDERMASK; |
| 726 /* fall through */ |
| 727 |
| 728 case UCOL_PRIMARY: |
| 729 strengthMask |= UCOL_PRIMARYORDERMASK; |
| 730 } |
| 731 |
| 732 list = new Order[listMax]; |
| 733 |
| 734 ucol_setOffset(elems, stringOffset, &status); |
| 735 |
| 736 do { |
| 737 low = ucol_getOffset(elems); |
| 738 order = ucol_next(elems, &status); |
| 739 high = ucol_getOffset(elems); |
| 740 |
| 741 if (order != UCOL_NULLORDER) { |
| 742 order &= strengthMask; |
| 743 } |
| 744 |
| 745 if (order != UCOL_IGNORABLE) { |
| 746 add(order, low, high); |
| 747 } |
| 748 } while (order != UCOL_NULLORDER); |
| 749 |
| 750 ucol_closeElements(elems); |
| 751 } |
| 752 |
| 753 OrderList::~OrderList() |
| 754 { |
| 755 delete[] list; |
| 756 } |
| 757 |
| 758 void OrderList::add(int32_t order, int32_t low, int32_t high) |
| 759 { |
| 760 if (listSize >= listMax) { |
| 761 listMax *= 2; |
| 762 |
| 763 Order *newList = new Order[listMax]; |
| 764 |
| 765 uprv_memcpy(newList, list, listSize * sizeof(Order)); |
| 766 delete[] list; |
| 767 list = newList; |
| 768 } |
| 769 |
| 770 list[listSize].order = order; |
| 771 list[listSize].lowOffset = low; |
| 772 list[listSize].highOffset = high; |
| 773 |
| 774 listSize += 1; |
| 775 } |
| 776 |
| 777 const Order *OrderList::get(int32_t index) const |
| 778 { |
| 779 if (index >= listSize) { |
| 780 return NULL; |
| 781 } |
| 782 |
| 783 return &list[index]; |
| 784 } |
| 785 |
| 786 int32_t OrderList::getLowOffset(int32_t index) const |
| 787 { |
| 788 const Order *order = get(index); |
| 789 |
| 790 if (order != NULL) { |
| 791 return order->lowOffset; |
| 792 } |
| 793 |
| 794 return -1; |
| 795 } |
| 796 |
| 797 int32_t OrderList::getHighOffset(int32_t index) const |
| 798 { |
| 799 const Order *order = get(index); |
| 800 |
| 801 if (order != NULL) { |
| 802 return order->highOffset; |
| 803 } |
| 804 |
| 805 return -1; |
| 806 } |
| 807 |
| 808 int32_t OrderList::getOrder(int32_t index) const |
| 809 { |
| 810 const Order *order = get(index); |
| 811 |
| 812 if (order != NULL) { |
| 813 return order->order; |
| 814 } |
| 815 |
| 816 return UCOL_NULLORDER; |
| 817 } |
| 818 |
| 819 int32_t OrderList::size() const |
| 820 { |
| 821 return listSize; |
| 822 } |
| 823 |
| 824 void OrderList::reverse() |
| 825 { |
| 826 for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) { |
| 827 Order swap = list[b]; |
| 828 |
| 829 list[b] = list[f]; |
| 830 list[f] = swap; |
| 831 } |
| 832 } |
| 833 |
| 834 UBool OrderList::compare(const OrderList &other) const |
| 835 { |
| 836 if (listSize != other.listSize) { |
| 837 return FALSE; |
| 838 } |
| 839 |
| 840 for(int32_t i = 0; i < listSize; i += 1) { |
| 841 if (list[i].order != other.list[i].order || |
| 842 list[i].lowOffset != other.list[i].lowOffset || |
| 843 list[i].highOffset != other.list[i].highOffset) { |
| 844 return FALSE; |
| 845 } |
| 846 } |
| 847 |
| 848 return TRUE; |
| 849 } |
| 850 |
| 851 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const |
| 852 { |
| 853 // NOTE: sizes include the NULLORDER, which we don't want to compare. |
| 854 int32_t otherSize = other.size() - 1; |
| 855 |
| 856 if (listSize - 1 - offset < otherSize) { |
| 857 return FALSE; |
| 858 } |
| 859 |
| 860 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { |
| 861 if (getOrder(i) != other.getOrder(j)) { |
| 862 return FALSE; |
| 863 } |
| 864 } |
| 865 |
| 866 return TRUE; |
| 867 } |
| 868 |
| 869 static char *printOffsets(char *buffer, OrderList &list) |
| 870 { |
| 871 int32_t size = list.size(); |
| 872 char *s = buffer; |
| 873 |
| 874 for(int32_t i = 0; i < size; i += 1) { |
| 875 const Order *order = list.get(i); |
| 876 |
| 877 if (i != 0) { |
| 878 s += sprintf(s, ", "); |
| 879 } |
| 880 |
| 881 s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset); |
| 882 } |
| 883 |
| 884 return buffer; |
| 885 } |
| 886 |
| 887 static char *printOrders(char *buffer, OrderList &list) |
| 888 { |
| 889 int32_t size = list.size(); |
| 890 char *s = buffer; |
| 891 |
| 892 for(int32_t i = 0; i < size; i += 1) { |
| 893 const Order *order = list.get(i); |
| 894 |
| 895 if (i != 0) { |
| 896 s += sprintf(s, ", "); |
| 897 } |
| 898 |
| 899 s += sprintf(s, "%8.8X", order->order); |
| 900 } |
| 901 |
| 902 return buffer; |
| 903 } |
| 904 |
| 905 void SSearchTest::offsetTest() |
| 906 { |
| 907 static const UVersionInfo icu47 = { 4, 7, 0, 0 }; |
| 908 const char *test[] = { |
| 909 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous |
| 910 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71. |
| 911 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0", |
| 912 |
| 913 "\\ua191\\u16ef\\u2036\\u017a", |
| 914 |
| 915 #if 0 |
| 916 // This results in a complex interaction between contraction, |
| 917 // expansion and normalization that confuses the backwards offset fixups
. |
| 918 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", |
| 919 #endif |
| 920 |
| 921 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", |
| 922 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3", |
| 923 |
| 924 "\\u02FE\\u02FF" |
| 925 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\
\u030A\\u030B\\u030C\\u030D\\u030E\\u030F" |
| 926 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\
\u031A\\u031B\\u031C\\u031D\\u031E\\u031F" |
| 927 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\
\u032A\\u032B\\u032C\\u032D\\u032E\\u032F" |
| 928 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\
\u033A\\u033B\\u033C\\u033D\\u033E\\u033F" |
| 929 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\
\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081 |
| 930 |
| 931 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // cu
rrently not working, see #8081 |
| 932 "a\\u02FF\\u0301\\u0316", // currently not working, see #8081 |
| 933 "a\\u02FF\\u0316\\u0301", |
| 934 "a\\u0430\\u0301\\u0316", |
| 935 "a\\u0430\\u0316\\u0301", |
| 936 "abc\\u0E41\\u0301\\u0316", |
| 937 "abc\\u0E41\\u0316\\u0301", |
| 938 "\\u0E41\\u0301\\u0316", |
| 939 "\\u0E41\\u0316\\u0301", |
| 940 "a\\u0301\\u0316", |
| 941 "a\\u0316\\u0301", |
| 942 "\\uAC52\\uAC53", |
| 943 "\\u34CA\\u34CB", |
| 944 "\\u11ED\\u11EE", |
| 945 "\\u30C3\\u30D0", |
| 946 "p\\u00E9ch\\u00E9", |
| 947 "a\\u0301\\u0325", |
| 948 "a\\u0300\\u0325", |
| 949 "a\\u0325\\u0300", |
| 950 "A\\u0323\\u0300B", |
| 951 "A\\u0300\\u0323B", |
| 952 "A\\u0301\\u0323B", |
| 953 "A\\u0302\\u0301\\u0323B", |
| 954 "abc", |
| 955 "ab\\u0300c", |
| 956 "ab\\u0300\\u0323c", |
| 957 " \\uD800\\uDC00\\uDC00", |
| 958 "a\\uD800\\uDC00\\uDC00", |
| 959 "A\\u0301\\u0301", |
| 960 "A\\u0301\\u0323", |
| 961 "A\\u0301\\u0323B", |
| 962 "B\\u0301\\u0323C", |
| 963 "A\\u0300\\u0323B", |
| 964 "\\u0301A\\u0301\\u0301", |
| 965 "abcd\\r\\u0301", |
| 966 "p\\u00EAche", |
| 967 "pe\\u0302che", |
| 968 }; |
| 969 |
| 970 int32_t testCount = ARRAY_SIZE(test); |
| 971 UErrorCode status = U_ZERO_ERROR; |
| 972 RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Loca
le::getEnglish(), status); |
| 973 if (U_FAILURE(status)) { |
| 974 errcheckln(status, "Failed to create collator in offsetTest! - %s", u_er
rorName(status)); |
| 975 return; |
| 976 } |
| 977 char buffer[4096]; // A bit of a hack... just happens to be long enough for
all the test cases... |
| 978 // We could allocate one that's the right size by (CE_co
unt * 10) + 2 |
| 979 // 10 chars is enough room for 8 hex digits plus ", ". 2
extra chars for "[" and "]" |
| 980 |
| 981 col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); |
| 982 |
| 983 for(int32_t i = 0; i < testCount; i += 1) { |
| 984 if (!isICUVersionAtLeast(icu47) && i>=4 && i<=6) { |
| 985 continue; // timebomb until ticket #8080 is resolved |
| 986 } |
| 987 UnicodeString ts = CharsToUnicodeString(test[i]); |
| 988 CollationElementIterator *iter = col->createCollationElementIterator(ts)
; |
| 989 OrderList forwardList; |
| 990 OrderList backwardList; |
| 991 int32_t order, low, high; |
| 992 |
| 993 do { |
| 994 low = iter->getOffset(); |
| 995 order = iter->next(status); |
| 996 high = iter->getOffset(); |
| 997 |
| 998 forwardList.add(order, low, high); |
| 999 } while (order != CollationElementIterator::NULLORDER); |
| 1000 |
| 1001 iter->reset(); |
| 1002 iter->setOffset(ts.length(), status); |
| 1003 |
| 1004 backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(),
iter->getOffset()); |
| 1005 |
| 1006 do { |
| 1007 high = iter->getOffset(); |
| 1008 order = iter->previous(status); |
| 1009 low = iter->getOffset(); |
| 1010 |
| 1011 if (order == CollationElementIterator::NULLORDER) { |
| 1012 break; |
| 1013 } |
| 1014 |
| 1015 backwardList.add(order, low, high); |
| 1016 } while (TRUE); |
| 1017 |
| 1018 backwardList.reverse(); |
| 1019 |
| 1020 if (forwardList.compare(backwardList)) { |
| 1021 logln("Works with \"%s\"", test[i]); |
| 1022 logln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); |
| 1023 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); |
| 1024 |
| 1025 logln("Forward CEs: [%s]", printOrders(buffer, forwardList)); |
| 1026 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList)); |
| 1027 |
| 1028 logln(); |
| 1029 } else { |
| 1030 errln("Fails with \"%s\"", test[i]); |
| 1031 infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); |
| 1032 infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList))
; |
| 1033 |
| 1034 infoln("Forward CEs: [%s]", printOrders(buffer, forwardList)); |
| 1035 infoln("Backward CEs: [%s]", printOrders(buffer, backwardList)); |
| 1036 |
| 1037 infoln(); |
| 1038 } |
| 1039 delete iter; |
| 1040 } |
| 1041 delete col; |
| 1042 } |
| 1043 |
| 1044 #if 0 |
| 1045 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer) |
| 1046 { |
| 1047 for(int32_t i = 0; i < string.length(); i += 1) { |
| 1048 UChar32 ch = string.char32At(i); |
| 1049 |
| 1050 if (ch >= 0x0020 && ch <= 0x007F) { |
| 1051 if (ch == 0x005C) { |
| 1052 buffer.append("\\\\"); |
| 1053 } else { |
| 1054 buffer.append(ch); |
| 1055 } |
| 1056 } else { |
| 1057 char cbuffer[12]; |
| 1058 |
| 1059 if (ch <= 0xFFFFL) { |
| 1060 sprintf(cbuffer, "\\u%4.4X", ch); |
| 1061 } else { |
| 1062 sprintf(cbuffer, "\\U%8.8X", ch); |
| 1063 } |
| 1064 |
| 1065 buffer.append(cbuffer); |
| 1066 } |
| 1067 |
| 1068 if (ch >= 0x10000L) { |
| 1069 i += 1; |
| 1070 } |
| 1071 } |
| 1072 |
| 1073 return buffer; |
| 1074 } |
| 1075 #endif |
| 1076 |
| 1077 #if 1 |
| 1078 |
| 1079 struct PCE |
| 1080 { |
| 1081 uint64_t ce; |
| 1082 int32_t lowOffset; |
| 1083 int32_t highOffset; |
| 1084 }; |
| 1085 |
| 1086 class PCEList |
| 1087 { |
| 1088 public: |
| 1089 PCEList(UCollator *coll, const UnicodeString &string); |
| 1090 ~PCEList(); |
| 1091 |
| 1092 int32_t size() const; |
| 1093 |
| 1094 const PCE *get(int32_t index) const; |
| 1095 |
| 1096 int32_t getLowOffset(int32_t index) const; |
| 1097 int32_t getHighOffset(int32_t index) const; |
| 1098 uint64_t getOrder(int32_t index) const; |
| 1099 |
| 1100 UBool matchesAt(int32_t offset, const PCEList &other) const; |
| 1101 |
| 1102 uint64_t operator[](int32_t index) const; |
| 1103 |
| 1104 private: |
| 1105 void add(uint64_t ce, int32_t low, int32_t high); |
| 1106 |
| 1107 PCE *list; |
| 1108 int32_t listMax; |
| 1109 int32_t listSize; |
| 1110 }; |
| 1111 |
| 1112 PCEList::PCEList(UCollator *coll, const UnicodeString &string) |
| 1113 { |
| 1114 UErrorCode status = U_ZERO_ERROR; |
| 1115 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), stri
ng.length(), &status); |
| 1116 uint64_t order; |
| 1117 int32_t low, high; |
| 1118 |
| 1119 list = new PCE[listMax]; |
| 1120 |
| 1121 ucol_setOffset(elems, 0, &status); |
| 1122 |
| 1123 do { |
| 1124 order = ucol_nextProcessed(elems, &low, &high, &status); |
| 1125 add(order, low, high); |
| 1126 } while (order != UCOL_PROCESSED_NULLORDER); |
| 1127 |
| 1128 ucol_closeElements(elems); |
| 1129 } |
| 1130 |
| 1131 PCEList::~PCEList() |
| 1132 { |
| 1133 delete[] list; |
| 1134 } |
| 1135 |
| 1136 void PCEList::add(uint64_t order, int32_t low, int32_t high) |
| 1137 { |
| 1138 if (listSize >= listMax) { |
| 1139 listMax *= 2; |
| 1140 |
| 1141 PCE *newList = new PCE[listMax]; |
| 1142 |
| 1143 uprv_memcpy(newList, list, listSize * sizeof(Order)); |
| 1144 delete[] list; |
| 1145 list = newList; |
| 1146 } |
| 1147 |
| 1148 list[listSize].ce = order; |
| 1149 list[listSize].lowOffset = low; |
| 1150 list[listSize].highOffset = high; |
| 1151 |
| 1152 listSize += 1; |
| 1153 } |
| 1154 |
| 1155 const PCE *PCEList::get(int32_t index) const |
| 1156 { |
| 1157 if (index >= listSize) { |
| 1158 return NULL; |
| 1159 } |
| 1160 |
| 1161 return &list[index]; |
| 1162 } |
| 1163 |
| 1164 int32_t PCEList::getLowOffset(int32_t index) const |
| 1165 { |
| 1166 const PCE *pce = get(index); |
| 1167 |
| 1168 if (pce != NULL) { |
| 1169 return pce->lowOffset; |
| 1170 } |
| 1171 |
| 1172 return -1; |
| 1173 } |
| 1174 |
| 1175 int32_t PCEList::getHighOffset(int32_t index) const |
| 1176 { |
| 1177 const PCE *pce = get(index); |
| 1178 |
| 1179 if (pce != NULL) { |
| 1180 return pce->highOffset; |
| 1181 } |
| 1182 |
| 1183 return -1; |
| 1184 } |
| 1185 |
| 1186 uint64_t PCEList::getOrder(int32_t index) const |
| 1187 { |
| 1188 const PCE *pce = get(index); |
| 1189 |
| 1190 if (pce != NULL) { |
| 1191 return pce->ce; |
| 1192 } |
| 1193 |
| 1194 return UCOL_PROCESSED_NULLORDER; |
| 1195 } |
| 1196 |
| 1197 int32_t PCEList::size() const |
| 1198 { |
| 1199 return listSize; |
| 1200 } |
| 1201 |
| 1202 UBool PCEList::matchesAt(int32_t offset, const PCEList &other) const |
| 1203 { |
| 1204 // NOTE: sizes include the NULLORDER, which we don't want to compare. |
| 1205 int32_t otherSize = other.size() - 1; |
| 1206 |
| 1207 if (listSize - 1 - offset < otherSize) { |
| 1208 return FALSE; |
| 1209 } |
| 1210 |
| 1211 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { |
| 1212 if (getOrder(i) != other.getOrder(j)) { |
| 1213 return FALSE; |
| 1214 } |
| 1215 } |
| 1216 |
| 1217 return TRUE; |
| 1218 } |
| 1219 |
| 1220 uint64_t PCEList::operator[](int32_t index) const |
| 1221 { |
| 1222 return getOrder(index); |
| 1223 } |
| 1224 |
| 1225 void SSearchTest::boyerMooreTest() |
| 1226 { |
| 1227 UErrorCode status = U_ZERO_ERROR; |
| 1228 UCollator *coll = NULL; |
| 1229 CollData *data = NULL; |
| 1230 const CEList* ce = NULL; |
| 1231 const CEList* ce1 = NULL; |
| 1232 UnicodeString lp = "fuss"; |
| 1233 UnicodeString sp = "fu\\u00DF"; |
| 1234 BoyerMooreSearch *longPattern = NULL; |
| 1235 BoyerMooreSearch *shortPattern = NULL; |
| 1236 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball",
"12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", |
| 1237 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", |
| 1238 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "
12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu
\\u00DF", "1234fu\\u00DF"}; |
| 1239 int32_t start = -1, end = -1; |
| 1240 |
| 1241 coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); |
| 1242 if (U_FAILURE(status)) { |
| 1243 errcheckln(status, "Could not open collator. - %s", u_errorName(status))
; |
| 1244 return; |
| 1245 } |
| 1246 |
| 1247 data = CollData::open(coll, status); |
| 1248 if (U_FAILURE(status)) { |
| 1249 errln("Could not open CollData object."); |
| 1250 goto close_data; |
| 1251 } |
| 1252 |
| 1253 data->getDynamicClassID(); |
| 1254 if (U_FAILURE(status)) { |
| 1255 errln("Could not get dynamic class ID of CollData."); |
| 1256 goto close_patterns; |
| 1257 } |
| 1258 |
| 1259 data->getStaticClassID(); |
| 1260 if (U_FAILURE(status)) { |
| 1261 errln("Could not get static class ID of CollData."); |
| 1262 goto close_patterns; |
| 1263 } |
| 1264 |
| 1265 longPattern = new BoyerMooreSearch(data, lp.unescape(), NULL, status); |
| 1266 shortPattern = new BoyerMooreSearch(data, sp.unescape(), NULL, status); |
| 1267 if (U_FAILURE(status)) { |
| 1268 errln("Could not create pattern objects."); |
| 1269 goto close_patterns; |
| 1270 } |
| 1271 |
| 1272 longPattern->getBadCharacterTable(); |
| 1273 shortPattern->getBadCharacterTable(); |
| 1274 if (U_FAILURE(status)) { |
| 1275 errln("Could not get bad character table."); |
| 1276 goto close_patterns; |
| 1277 } |
| 1278 |
| 1279 longPattern->getGoodSuffixTable(); |
| 1280 shortPattern->getGoodSuffixTable(); |
| 1281 if (U_FAILURE(status)) { |
| 1282 errln("Could not get good suffix table."); |
| 1283 goto close_patterns; |
| 1284 } |
| 1285 |
| 1286 longPattern->getDynamicClassID(); |
| 1287 shortPattern->getDynamicClassID(); |
| 1288 if (U_FAILURE(status)) { |
| 1289 errln("Could not get dynamic class ID of BoyerMooreSearch."); |
| 1290 goto close_patterns; |
| 1291 } |
| 1292 |
| 1293 longPattern->getStaticClassID(); |
| 1294 shortPattern->getStaticClassID(); |
| 1295 if (U_FAILURE(status)) { |
| 1296 errln("Could not get static class ID of BoyerMooreSearch."); |
| 1297 goto close_patterns; |
| 1298 } |
| 1299 |
| 1300 longPattern->getData(); |
| 1301 shortPattern->getData(); |
| 1302 if (U_FAILURE(status)) { |
| 1303 errln("Could not get collate data."); |
| 1304 goto close_patterns; |
| 1305 } |
| 1306 |
| 1307 ce = longPattern->getPatternCEs(); |
| 1308 ce1 = shortPattern->getPatternCEs(); |
| 1309 if (U_FAILURE(status)) { |
| 1310 errln("Could not get pattern CEs."); |
| 1311 goto close_patterns; |
| 1312 } |
| 1313 |
| 1314 ce->getDynamicClassID(); |
| 1315 ce1->getDynamicClassID(); |
| 1316 if (U_FAILURE(status)) { |
| 1317 errln("Could not get dynamic class ID of CEList."); |
| 1318 goto close_patterns; |
| 1319 } |
| 1320 |
| 1321 ce->getStaticClassID(); |
| 1322 ce1->getStaticClassID(); |
| 1323 if (U_FAILURE(status)) { |
| 1324 errln("Could not get static class ID of CEList."); |
| 1325 goto close_patterns; |
| 1326 } |
| 1327 |
| 1328 if(data->minLengthInChars(ce,0) != 3){ |
| 1329 errln("Minimal Length in Characters for 'data' with 'ce' was suppose to
give 3."); |
| 1330 goto close_patterns; |
| 1331 } |
| 1332 |
| 1333 if(data->minLengthInChars(ce1,0) != 3){ |
| 1334 errln("Minimal Length in Characters for 'data' with 'ce1' was suppose to
give 3."); |
| 1335 goto close_patterns; |
| 1336 } |
| 1337 |
| 1338 for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) { |
| 1339 UnicodeString target = targets[t].unescape(); |
| 1340 |
| 1341 longPattern->setTargetString(&target, status); |
| 1342 if (longPattern->search(0, start, end)) { |
| 1343 logln("Test %d: found long pattern at [%d, %d].", t, start, end); |
| 1344 } else { |
| 1345 errln("Test %d: did not find long pattern.", t); |
| 1346 } |
| 1347 |
| 1348 shortPattern->setTargetString(&target, status); |
| 1349 if (shortPattern->search(0, start, end)) { |
| 1350 logln("Test %d: found short pattern at [%d, %d].", t, start, end); |
| 1351 } else { |
| 1352 errln("Test %d: did not find short pattern.", t); |
| 1353 } |
| 1354 |
| 1355 if(longPattern->empty()){ |
| 1356 errln("Test %d: Long pattern should not have been empty."); |
| 1357 } |
| 1358 |
| 1359 if(shortPattern->empty()){ |
| 1360 errln("Test %d: Short pattern should not have been empty."); |
| 1361 } |
| 1362 } |
| 1363 |
| 1364 close_patterns: |
| 1365 delete shortPattern; |
| 1366 delete longPattern; |
| 1367 |
| 1368 close_data: |
| 1369 CollData::close(data); |
| 1370 ucol_close(coll); |
| 1371 } |
| 1372 |
| 1373 void SSearchTest::bmsTest() |
| 1374 { |
| 1375 UErrorCode status = U_ZERO_ERROR; |
| 1376 UCollator *coll = NULL; |
| 1377 UCD *data = NULL; |
| 1378 UnicodeString lp = "fuss"; |
| 1379 UnicodeString lpu = lp.unescape(); |
| 1380 UnicodeString sp = "fu\\u00DF"; |
| 1381 UnicodeString spu = sp.unescape(); |
| 1382 BMS *longPattern = NULL; |
| 1383 BMS *shortPattern = NULL; |
| 1384 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball",
"12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", |
| 1385 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", |
| 1386 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "
12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu
\\u00DF", "1234fu\\u00DF"}; |
| 1387 int32_t start = -1, end = -1; |
| 1388 |
| 1389 coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); |
| 1390 if (U_FAILURE(status)) { |
| 1391 errcheckln(status, "Could not open collator. - %s", u_errorName(status))
; |
| 1392 return; |
| 1393 } |
| 1394 |
| 1395 data = ucd_open(coll, &status); |
| 1396 if (U_FAILURE(status)) { |
| 1397 errln("Could not open CollData object."); |
| 1398 goto close_data; |
| 1399 } |
| 1400 |
| 1401 longPattern = bms_open(data, lpu.getBuffer(), lpu.length(), NULL, 0, &status
); |
| 1402 shortPattern = bms_open(data, spu.getBuffer(), spu.length(), NULL, 0, &statu
s); |
| 1403 if (U_FAILURE(status)) { |
| 1404 errln("Couldn't open pattern objects."); |
| 1405 goto close_patterns; |
| 1406 } |
| 1407 |
| 1408 for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) { |
| 1409 UnicodeString target = targets[t].unescape(); |
| 1410 |
| 1411 bms_setTargetString(longPattern, target.getBuffer(), target.length(), &s
tatus); |
| 1412 if (bms_search(longPattern, 0, &start, &end)) { |
| 1413 logln("Test %d: found long pattern at [%d, %d].", t, start, end); |
| 1414 } else { |
| 1415 errln("Test %d: did not find long pattern.", t); |
| 1416 } |
| 1417 |
| 1418 bms_setTargetString(shortPattern, target.getBuffer(), target.length(), &
status); |
| 1419 if (bms_search(shortPattern, 0, &start, &end)) { |
| 1420 logln("Test %d: found short pattern at [%d, %d].", t, start, end); |
| 1421 } else { |
| 1422 errln("Test %d: did not find short pattern.", t); |
| 1423 } |
| 1424 } |
| 1425 |
| 1426 /* Add better coverage for bms code. */ |
| 1427 if(bms_empty(longPattern)) { |
| 1428 errln("FAIL: longgPattern is empty."); |
| 1429 } |
| 1430 |
| 1431 if (!bms_getData(longPattern)) { |
| 1432 errln("FAIL: bms_getData returned NULL."); |
| 1433 } |
| 1434 |
| 1435 if (!ucd_getCollator(data)) { |
| 1436 errln("FAIL: ucd_getCollator returned NULL."); |
| 1437 } |
| 1438 |
| 1439 close_patterns: |
| 1440 bms_close(shortPattern); |
| 1441 bms_close(longPattern); |
| 1442 |
| 1443 close_data: |
| 1444 ucd_close(data); |
| 1445 ucd_freeCache(); |
| 1446 ucol_close(coll); |
| 1447 } |
| 1448 |
| 1449 void SSearchTest::goodSuffixTest() |
| 1450 { |
| 1451 UErrorCode status = U_ZERO_ERROR; |
| 1452 UCollator *coll = NULL; |
| 1453 CollData *data = NULL; |
| 1454 UnicodeString pat = /*"gcagagag"*/ "fxeld"; |
| 1455 UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld"; |
| 1456 BoyerMooreSearch *pattern = NULL; |
| 1457 int32_t start = -1, end = -1; |
| 1458 |
| 1459 coll = ucol_open(NULL, &status); |
| 1460 if (U_FAILURE(status)) { |
| 1461 errcheckln(status, "Couldn't open collator. - %s", u_errorName(status)); |
| 1462 return; |
| 1463 } |
| 1464 |
| 1465 data = CollData::open(coll, status); |
| 1466 if (U_FAILURE(status)) { |
| 1467 errln("Couldn't open CollData object."); |
| 1468 goto close_data; |
| 1469 } |
| 1470 |
| 1471 pattern = new BoyerMooreSearch(data, pat, &target, status); |
| 1472 if (U_FAILURE(status)) { |
| 1473 errln("Couldn't open pattern object."); |
| 1474 goto close_pattern; |
| 1475 } |
| 1476 |
| 1477 if (pattern->search(0, start, end)) { |
| 1478 logln("Found pattern at [%d, %d].", start, end); |
| 1479 } else { |
| 1480 errln("Did not find pattern."); |
| 1481 } |
| 1482 |
| 1483 close_pattern: |
| 1484 delete pattern; |
| 1485 |
| 1486 close_data: |
| 1487 CollData::close(data); |
| 1488 ucol_close(coll); |
| 1489 } |
| 1490 |
| 1491 // |
| 1492 // searchTime() A quick and dirty performance test for string search. |
| 1493 // Probably doesn't really belong as part of intltest, but it |
| 1494 // does check that the search succeeds, and gets the right resu
lt, |
| 1495 // so it serves as a functionality test also. |
| 1496 // |
| 1497 // To run as a perf test, up the loop count, select by commenti
ng |
| 1498 // and uncommenting in the code the operation to be measured, |
| 1499 // rebuild, and measure the running time of this test alone. |
| 1500 // |
| 1501 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearc
hTest/searchTime |
| 1502 // |
| 1503 void SSearchTest::searchTime() { |
| 1504 static const char *longishText = |
| 1505 "Whylom, as olde stories tellen us,\n" |
| 1506 "Ther was a duk that highte Theseus:\n" |
| 1507 "Of Athenes he was lord and governour,\n" |
| 1508 "And in his tyme swich a conquerour,\n" |
| 1509 "That gretter was ther noon under the sonne.\n" |
| 1510 "Ful many a riche contree hadde he wonne;\n" |
| 1511 "What with his wisdom and his chivalrye,\n" |
| 1512 "He conquered al the regne of Femenye,\n" |
| 1513 "That whylom was y-cleped Scithia;\n" |
| 1514 "And weddede the quene Ipolita,\n" |
| 1515 "And broghte hir hoom with him in his contree\n" |
| 1516 "With muchel glorie and greet solempnitee,\n" |
| 1517 "And eek hir yonge suster Emelye.\n" |
| 1518 "And thus with victorie and with melodye\n" |
| 1519 "Lete I this noble duk to Athenes ryde,\n" |
| 1520 "And al his hoost, in armes, him bisyde.\n" |
| 1521 "And certes, if it nere to long to here,\n" |
| 1522 "I wolde han told yow fully the manere,\n" |
| 1523 "How wonnen was the regne of Femenye\n" |
| 1524 "By Theseus, and by his chivalrye;\n" |
| 1525 "And of the grete bataille for the nones\n" |
| 1526 "Bitwixen Athen's and Amazones;\n" |
| 1527 "And how asseged was Ipolita,\n" |
| 1528 "The faire hardy quene of Scithia;\n" |
| 1529 "And of the feste that was at hir weddinge,\n" |
| 1530 "And of the tempest at hir hoom-cominge;\n" |
| 1531 "But al that thing I moot as now forbere.\n" |
| 1532 "I have, God woot, a large feeld to ere,\n" |
| 1533 "And wayke been the oxen in my plough.\n" |
| 1534 "The remenant of the tale is long y-nough.\n" |
| 1535 "I wol nat letten eek noon of this route;\n" |
| 1536 "Lat every felawe telle his tale aboute,\n" |
| 1537 "And lat see now who shal the soper winne;\n" |
| 1538 "And ther I lefte, I wol ageyn biginne.\n" |
| 1539 "This duk, of whom I make mencioun,\n" |
| 1540 "When he was come almost unto the toun,\n" |
| 1541 "In al his wele and in his moste pryde,\n" |
| 1542 "He was war, as he caste his eye asyde,\n" |
| 1543 "Wher that ther kneled in the hye weye\n" |
| 1544 "A companye of ladies, tweye and tweye,\n" |
| 1545 "Ech after other, clad in clothes blake; \n" |
| 1546 "But swich a cry and swich a wo they make,\n" |
| 1547 "That in this world nis creature livinge,\n" |
| 1548 "That herde swich another weymentinge;\n" |
| 1549 "And of this cry they nolde never stenten,\n" |
| 1550 "Til they the reynes of his brydel henten.\n" |
| 1551 "'What folk ben ye, that at myn hoomcominge\n" |
| 1552 "Perturben so my feste with cryinge'?\n" |
| 1553 "Quod Theseus, 'have ye so greet envye\n" |
| 1554 "Of myn honour, that thus compleyne and crye? \n" |
| 1555 "Or who hath yow misboden, or offended?\n" |
| 1556 "And telleth me if it may been amended;\n" |
| 1557 "And why that ye ben clothed thus in blak'?\n" |
| 1558 "The eldest lady of hem alle spak,\n" |
| 1559 "When she hadde swowned with a deedly chere,\n" |
| 1560 "That it was routhe for to seen and here,\n" |
| 1561 "And seyde: 'Lord, to whom Fortune hath yiven\n" |
| 1562 "Victorie, and as a conquerour to liven,\n" |
| 1563 "Noght greveth us your glorie and your honour;\n" |
| 1564 "But we biseken mercy and socour.\n" |
| 1565 "Have mercy on our wo and our distresse.\n" |
| 1566 "Som drope of pitee, thurgh thy gentilesse,\n" |
| 1567 "Up-on us wrecched wommen lat thou falle.\n" |
| 1568 "For certes, lord, ther nis noon of us alle,\n" |
| 1569 "That she nath been a duchesse or a quene;\n" |
| 1570 "Now be we caitifs, as it is wel sene:\n" |
| 1571 "Thanked be Fortune, and hir false wheel,\n" |
| 1572 "That noon estat assureth to be weel.\n" |
| 1573 "And certes, lord, t'abyden your presence,\n" |
| 1574 "Here in the temple of the goddesse Clemence\n" |
| 1575 "We han ben waytinge al this fourtenight;\n" |
| 1576 "Now help us, lord, sith it is in thy might.\n" |
| 1577 "I wrecche, which that wepe and waille thus,\n" |
| 1578 "Was whylom wyf to king Capaneus,\n" |
| 1579 "That starf at Thebes, cursed be that day!\n" |
| 1580 "And alle we, that been in this array,\n" |
| 1581 "And maken al this lamentacioun,\n" |
| 1582 "We losten alle our housbondes at that toun,\n" |
| 1583 "Whyl that the sege ther-aboute lay.\n" |
| 1584 "And yet now th'olde Creon, weylaway!\n" |
| 1585 "The lord is now of Thebes the citee, \n" |
| 1586 "Fulfild of ire and of iniquitee,\n" |
| 1587 "He, for despyt, and for his tirannye,\n" |
| 1588 "To do the dede bodyes vileinye,\n" |
| 1589 "Of alle our lordes, whiche that ben slawe,\n" |
| 1590 "Hath alle the bodyes on an heep y-drawe,\n" |
| 1591 "And wol nat suffren hem, by noon assent,\n" |
| 1592 "Neither to been y-buried nor y-brent,\n" |
| 1593 "But maketh houndes ete hem in despyt. zet'\n"; |
| 1594 |
| 1595 #define TEST_BOYER_MOORE 1 |
| 1596 const char *cPattern = "maketh houndes ete hem"; |
| 1597 //const char *cPattern = "Whylom"; |
| 1598 //const char *cPattern = "zet"; |
| 1599 const char *testId = "searchTime()"; // for error macros. |
| 1600 UnicodeString target = longishText; |
| 1601 UErrorCode status = U_ZERO_ERROR; |
| 1602 |
| 1603 |
| 1604 LocalUCollatorPointer collator(ucol_open("en", &status)); |
| 1605 CollData *data = CollData::open(collator.getAlias(), status); |
| 1606 if (U_FAILURE(status) || collator.isNull() || data == NULL) { |
| 1607 errcheckln(status, "Unable to open UCollator or CollData. - %s", u_error
Name(status)); |
| 1608 return; |
| 1609 } |
| 1610 //ucol_setStrength(collator.getAlias(), collatorStrength); |
| 1611 //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize,
&status); |
| 1612 UnicodeString uPattern = cPattern; |
| 1613 #ifndef TEST_BOYER_MOORE |
| 1614 LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(),
uPattern.length(), |
| 1615 target.getBuffer(), t
arget.length(), |
| 1616 collator.getAlias(), |
| 1617 NULL, // the brea
k iterator |
| 1618 &status)); |
| 1619 TEST_ASSERT_SUCCESS(status); |
| 1620 #else |
| 1621 BoyerMooreSearch bms(data, uPattern, &target, status); |
| 1622 TEST_ASSERT_SUCCESS(status); |
| 1623 #endif |
| 1624 |
| 1625 // int32_t foundStart; |
| 1626 // int32_t foundEnd; |
| 1627 UBool found; |
| 1628 |
| 1629 // Find the match position usgin strstr |
| 1630 const char *pm = strstr(longishText, cPattern); |
| 1631 TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr"); |
| 1632 int32_t refMatchPos = (int32_t)(pm - longishText); |
| 1633 int32_t icuMatchPos; |
| 1634 int32_t icuMatchEnd; |
| 1635 #ifndef TEST_BOYER_MOORE |
| 1636 usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status); |
| 1637 TEST_ASSERT_SUCCESS(status); |
| 1638 #else |
| 1639 found = bms.search(0, icuMatchPos, icuMatchEnd); |
| 1640 #endif |
| 1641 TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different mat
ch positions."); |
| 1642 |
| 1643 int32_t i; |
| 1644 int32_t j=0; |
| 1645 |
| 1646 // Try loopcounts around 100000 to some millions, depending on the operation
, |
| 1647 // to get runtimes of at least several seconds. |
| 1648 for (i=0; i<10000; i++) { |
| 1649 #ifndef TEST_BOYER_MOORE |
| 1650 found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &s
tatus); |
| 1651 #else |
| 1652 found = bms.search(0, icuMatchPos, icuMatchEnd); |
| 1653 #endif |
| 1654 //TEST_ASSERT_SUCCESS(status); |
| 1655 //TEST_ASSERT(found); |
| 1656 |
| 1657 // usearch_setOffset(uss.getAlias(), 0, &status); |
| 1658 // icuMatchPos = usearch_next(uss.getAlias(), &status); |
| 1659 |
| 1660 // The i+j stuff is to confuse the optimizer and get it to actually lea
ve the |
| 1661 // call to strstr in place. |
| 1662 //pm = strstr(longishText+j, cPattern); |
| 1663 //j = (j + i)%5; |
| 1664 } |
| 1665 |
| 1666 printf("%ld, %d\n", pm-longishText, j); |
| 1667 #ifdef TEST_BOYER_MOORE |
| 1668 CollData::close(data); |
| 1669 #endif |
| 1670 } |
| 1671 #endif |
| 1672 |
| 1673 //------------------------------------------------------------------------------
---------- |
| 1674 // |
| 1675 // Random Numbers. Similar to standard lib rand() and srand() |
| 1676 // Not using library to |
| 1677 // 1. Get same results on all platforms. |
| 1678 // 2. Get access to current seed, to more easily reproduce
failures. |
| 1679 // |
| 1680 //------------------------------------------------------------------------------
--------- |
| 1681 static uint32_t m_seed = 1; |
| 1682 |
| 1683 static uint32_t m_rand() |
| 1684 { |
| 1685 m_seed = m_seed * 1103515245 + 12345; |
| 1686 return (uint32_t)(m_seed/65536) % 32768; |
| 1687 } |
| 1688 |
| 1689 class Monkey |
| 1690 { |
| 1691 public: |
| 1692 virtual void append(UnicodeString &test, UnicodeString &alternate) = 0; |
| 1693 |
| 1694 protected: |
| 1695 Monkey(); |
| 1696 virtual ~Monkey(); |
| 1697 }; |
| 1698 |
| 1699 Monkey::Monkey() |
| 1700 { |
| 1701 // ook? |
| 1702 } |
| 1703 |
| 1704 Monkey::~Monkey() |
| 1705 { |
| 1706 // ook? |
| 1707 } |
| 1708 |
| 1709 class SetMonkey : public Monkey |
| 1710 { |
| 1711 public: |
| 1712 SetMonkey(const USet *theSet); |
| 1713 ~SetMonkey(); |
| 1714 |
| 1715 virtual void append(UnicodeString &test, UnicodeString &alternate); |
| 1716 |
| 1717 private: |
| 1718 const USet *set; |
| 1719 }; |
| 1720 |
| 1721 SetMonkey::SetMonkey(const USet *theSet) |
| 1722 : Monkey(), set(theSet) |
| 1723 { |
| 1724 // ook? |
| 1725 } |
| 1726 |
| 1727 SetMonkey::~SetMonkey() |
| 1728 { |
| 1729 //ook... |
| 1730 } |
| 1731 |
| 1732 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate) |
| 1733 { |
| 1734 int32_t size = uset_size(set); |
| 1735 int32_t index = m_rand() % size; |
| 1736 UChar32 ch = uset_charAt(set, index); |
| 1737 UnicodeString str(ch); |
| 1738 |
| 1739 test.append(str); |
| 1740 alternate.append(str); // flip case, or some junk? |
| 1741 } |
| 1742 |
| 1743 class StringSetMonkey : public Monkey |
| 1744 { |
| 1745 public: |
| 1746 StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCol
lData); |
| 1747 ~StringSetMonkey(); |
| 1748 |
| 1749 void append(UnicodeString &testCase, UnicodeString &alternate); |
| 1750 |
| 1751 private: |
| 1752 UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeStr
ing &alternate); |
| 1753 |
| 1754 const USet *set; |
| 1755 UCollator *coll; |
| 1756 CollData *collData; |
| 1757 }; |
| 1758 |
| 1759 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, Col
lData *theCollData) |
| 1760 : Monkey(), set(theSet), coll(theCollator), collData(theCollData) |
| 1761 { |
| 1762 // ook. |
| 1763 } |
| 1764 |
| 1765 StringSetMonkey::~StringSetMonkey() |
| 1766 { |
| 1767 // ook? |
| 1768 } |
| 1769 |
| 1770 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate) |
| 1771 { |
| 1772 int32_t itemCount = uset_getItemCount(set), len = 0; |
| 1773 int32_t index = m_rand() % itemCount; |
| 1774 UChar32 rangeStart = 0, rangeEnd = 0; |
| 1775 UChar buffer[16]; |
| 1776 UErrorCode err = U_ZERO_ERROR; |
| 1777 |
| 1778 len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err); |
| 1779 |
| 1780 if (len == 0) { |
| 1781 int32_t offset = m_rand() % (rangeEnd - rangeStart + 1); |
| 1782 UChar32 ch = rangeStart + offset; |
| 1783 UnicodeString str(ch); |
| 1784 |
| 1785 testCase.append(str); |
| 1786 generateAlternative(str, alternate); |
| 1787 } else if (len > 0) { |
| 1788 // should check that len < 16... |
| 1789 UnicodeString str(buffer, len); |
| 1790 |
| 1791 testCase.append(str); |
| 1792 generateAlternative(str, alternate); |
| 1793 } else { |
| 1794 // shouldn't happen... |
| 1795 } |
| 1796 } |
| 1797 |
| 1798 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCas
e, UnicodeString &alternate) |
| 1799 { |
| 1800 // find out shortest string for the longest sequence of ces. |
| 1801 // needs to be refined to use dynamic programming, but will be roughly right |
| 1802 UErrorCode status = U_ZERO_ERROR; |
| 1803 CEList ceList(coll, testCase, status); |
| 1804 UnicodeString alt; |
| 1805 int32_t offset = 0; |
| 1806 |
| 1807 if (ceList.size() == 0) { |
| 1808 return alternate.append(testCase); |
| 1809 } |
| 1810 |
| 1811 while (offset < ceList.size()) { |
| 1812 int32_t ce = ceList.get(offset); |
| 1813 const StringList *strings = collData->getStringList(ce); |
| 1814 |
| 1815 if (strings == NULL) { |
| 1816 return alternate.append(testCase); |
| 1817 } |
| 1818 |
| 1819 int32_t stringCount = strings->size(); |
| 1820 int32_t tries = 0; |
| 1821 |
| 1822 // find random string that generates the same CEList |
| 1823 const CEList *ceList2 = NULL; |
| 1824 const UnicodeString *string = NULL; |
| 1825 UBool matches = FALSE; |
| 1826 |
| 1827 do { |
| 1828 int32_t s = m_rand() % stringCount; |
| 1829 |
| 1830 if (tries++ > stringCount) { |
| 1831 alternate.append(testCase); |
| 1832 return alternate; |
| 1833 } |
| 1834 |
| 1835 string = strings->get(s); |
| 1836 ceList2 = collData->getCEList(string); |
| 1837 matches = ceList.matchesAt(offset, ceList2); |
| 1838 |
| 1839 if (! matches) { |
| 1840 collData->freeCEList((CEList *) ceList2); |
| 1841 } |
| 1842 } while (! matches); |
| 1843 |
| 1844 alt.append(*string); |
| 1845 offset += ceList2->size(); |
| 1846 collData->freeCEList(ceList2); |
| 1847 } |
| 1848 |
| 1849 const CEList altCEs(coll, alt, status); |
| 1850 |
| 1851 if (ceList.matchesAt(0, &altCEs)) { |
| 1852 return alternate.append(alt); |
| 1853 } |
| 1854 |
| 1855 return alternate.append(testCase); |
| 1856 } |
| 1857 |
| 1858 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyC
ount, UnicodeString &testCase, UnicodeString &alternate) |
| 1859 { |
| 1860 int32_t pieces = (m_rand() % 4) + 1; |
| 1861 UErrorCode status = U_ZERO_ERROR; |
| 1862 UBool matches; |
| 1863 |
| 1864 do { |
| 1865 testCase.remove(); |
| 1866 alternate.remove(); |
| 1867 monkeys[0]->append(testCase, alternate); |
| 1868 |
| 1869 for(int32_t piece = 0; piece < pieces; piece += 1) { |
| 1870 int32_t monkey = m_rand() % monkeyCount; |
| 1871 |
| 1872 monkeys[monkey]->append(testCase, alternate); |
| 1873 } |
| 1874 |
| 1875 const CEList ceTest(coll, testCase, status); |
| 1876 const CEList ceAlt(coll, alternate, status); |
| 1877 |
| 1878 matches = ceTest.matchesAt(0, &ceAlt); |
| 1879 } while (! matches); |
| 1880 } |
| 1881 |
| 1882 // |
| 1883 // Find the next acceptable boundary following the specified starting index |
| 1884 // in the target text being searched. |
| 1885 // TODO: refine what is an acceptable boundary. For the moment, |
| 1886 // choose the next position not within a combining sequence. |
| 1887 // |
| 1888 #if 0 |
| 1889 static int32_t nextBoundaryAfter(const UnicodeString &string, int32_t startIndex
) { |
| 1890 const UChar *text = string.getBuffer(); |
| 1891 int32_t textLen = string.length(); |
| 1892 |
| 1893 if (startIndex >= textLen) { |
| 1894 return startIndex; |
| 1895 } |
| 1896 |
| 1897 UChar32 c; |
| 1898 int32_t i = startIndex; |
| 1899 |
| 1900 U16_NEXT(text, i, textLen, c); |
| 1901 |
| 1902 // If we are on a control character, stop without looking for combining mark
s. |
| 1903 // Control characters do not combine. |
| 1904 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
| 1905 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_C
R) { |
| 1906 return i; |
| 1907 } |
| 1908 |
| 1909 // The initial character was not a control, and can thus accept trailing |
| 1910 // combining characters. Advance over however many of them there are. |
| 1911 int32_t indexOfLastCharChecked; |
| 1912 |
| 1913 for (;;) { |
| 1914 indexOfLastCharChecked = i; |
| 1915 |
| 1916 if (i>=textLen) { |
| 1917 break; |
| 1918 } |
| 1919 |
| 1920 U16_NEXT(text, i, textLen, c); |
| 1921 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
| 1922 |
| 1923 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { |
| 1924 break; |
| 1925 } |
| 1926 } |
| 1927 |
| 1928 return indexOfLastCharChecked; |
| 1929 } |
| 1930 #endif |
| 1931 |
| 1932 #if 0 |
| 1933 static UBool isInCombiningSequence(const UnicodeString &string, int32_t index) { |
| 1934 const UChar *text = string.getBuffer(); |
| 1935 int32_t textLen = string.length(); |
| 1936 |
| 1937 if (index>=textLen || index<=0) { |
| 1938 return FALSE; |
| 1939 } |
| 1940 |
| 1941 // If the character at the current index is not a GRAPHEME_EXTEND |
| 1942 // then we can not be within a combining sequence. |
| 1943 UChar32 c; |
| 1944 U16_GET(text, 0, index, textLen, c); |
| 1945 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
| 1946 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { |
| 1947 return FALSE; |
| 1948 } |
| 1949 |
| 1950 // We are at a combining mark. If the preceding character is anything |
| 1951 // except a CONTROL, CR or LF, we are in a combining sequence. |
| 1952 U16_PREV(text, 0, index, c); |
| 1953 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); |
| 1954 |
| 1955 return !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_
GCB_CR); |
| 1956 } |
| 1957 #endif |
| 1958 |
| 1959 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t
offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd) |
| 1960 { |
| 1961 UErrorCode status = U_ZERO_ERROR; |
| 1962 OrderList targetOrders(coll, target, offset); |
| 1963 OrderList patternOrders(coll, pattern); |
| 1964 int32_t targetSize = targetOrders.size() - 1; |
| 1965 int32_t patternSize = patternOrders.size() - 1; |
| 1966 UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocale
ByType(coll, ULOC_VALID_LOCALE, &status), |
| 1967 target.getBuffer(), target.len
gth(), &status); |
| 1968 |
| 1969 if (patternSize == 0) { |
| 1970 // Searching for an empty pattern always fails |
| 1971 matchStart = matchEnd = -1; |
| 1972 ubrk_close(charBreakIterator); |
| 1973 return FALSE; |
| 1974 } |
| 1975 |
| 1976 matchStart = matchEnd = -1; |
| 1977 |
| 1978 for(int32_t i = 0; i < targetSize; i += 1) { |
| 1979 if (targetOrders.matchesAt(i, patternOrders)) { |
| 1980 int32_t start = targetOrders.getLowOffset(i); |
| 1981 int32_t maxLimit = targetOrders.getLowOffset(i + patternSize); |
| 1982 int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1); |
| 1983 |
| 1984 // if the low and high offsets of the first CE in |
| 1985 // the match are the same, it means that the match |
| 1986 // starts in the middle of an expansion - all but |
| 1987 // the first CE of the expansion will have the offset |
| 1988 // of the following character. |
| 1989 if (start == targetOrders.getHighOffset(i)) { |
| 1990 continue; |
| 1991 } |
| 1992 |
| 1993 // Make sure match starts on a grapheme boundary |
| 1994 if (! ubrk_isBoundary(charBreakIterator, start)) { |
| 1995 continue; |
| 1996 } |
| 1997 |
| 1998 // If the low and high offsets of the CE after the match |
| 1999 // are the same, it means that the match ends in the middle |
| 2000 // of an expansion sequence. |
| 2001 if (maxLimit == targetOrders.getHighOffset(i + patternSize) && |
| 2002 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) { |
| 2003 continue; |
| 2004 } |
| 2005 |
| 2006 int32_t mend = maxLimit; |
| 2007 |
| 2008 // Find the first grapheme break after the character index |
| 2009 // of the last CE in the match. If it's after character index |
| 2010 // that's after the last CE in the match, use that index |
| 2011 // as the end of the match. |
| 2012 if (minLimit < maxLimit) { |
| 2013 int32_t nba = ubrk_following(charBreakIterator, minLimit); |
| 2014 |
| 2015 if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) { |
| 2016 mend = nba; |
| 2017 } |
| 2018 } |
| 2019 |
| 2020 if (mend > maxLimit) { |
| 2021 continue; |
| 2022 } |
| 2023 |
| 2024 if (! ubrk_isBoundary(charBreakIterator, mend)) { |
| 2025 continue; |
| 2026 } |
| 2027 |
| 2028 matchStart = start; |
| 2029 matchEnd = mend; |
| 2030 |
| 2031 ubrk_close(charBreakIterator); |
| 2032 return TRUE; |
| 2033 } |
| 2034 } |
| 2035 |
| 2036 ubrk_close(charBreakIterator); |
| 2037 return FALSE; |
| 2038 } |
| 2039 |
| 2040 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 2041 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t d
efaultVal) { |
| 2042 int32_t val = defaultVal; |
| 2043 |
| 2044 name.append(" *= *(-?\\d+)"); |
| 2045 |
| 2046 UErrorCode status = U_ZERO_ERROR; |
| 2047 RegexMatcher m(name, params, 0, status); |
| 2048 |
| 2049 if (m.find()) { |
| 2050 // The param exists. Convert the string to an int. |
| 2051 char valString[100]; |
| 2052 int32_t paramLength = m.end(1, status) - m.start(1, status); |
| 2053 |
| 2054 if (paramLength >= (int32_t)(sizeof(valString)-1)) { |
| 2055 paramLength = (int32_t)(sizeof(valString)-2); |
| 2056 } |
| 2057 |
| 2058 params.extract(m.start(1, status), paramLength, valString, sizeof(valStr
ing)); |
| 2059 val = strtol(valString, NULL, 10); |
| 2060 |
| 2061 // Delete this parameter from the params string. |
| 2062 m.reset(); |
| 2063 params = m.replaceFirst("", status); |
| 2064 } |
| 2065 |
| 2066 //U_ASSERT(U_SUCCESS(status)); |
| 2067 if (! U_SUCCESS(status)) { |
| 2068 val = defaultVal; |
| 2069 } |
| 2070 |
| 2071 return val; |
| 2072 } |
| 2073 #endif |
| 2074 |
| 2075 #if !UCONFIG_NO_COLLATION |
| 2076 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCa
se, const UnicodeString &pattern, const UnicodeString &altPattern, |
| 2077 const char *name, const char *strength, uint
32_t seed) |
| 2078 { |
| 2079 UErrorCode status = U_ZERO_ERROR; |
| 2080 int32_t actualStart = -1, actualEnd = -1; |
| 2081 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + alt
Pattern.length(); |
| 2082 int32_t expectedStart = -1, expectedEnd = -1; |
| 2083 int32_t notFoundCount = 0; |
| 2084 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(),
pattern.length(), |
| 2085 testCase.getBuffer(),
testCase.length(), |
| 2086 coll, |
| 2087 NULL, // the brea
k iterator |
| 2088 &status)); |
| 2089 |
| 2090 // **** TODO: find *all* matches, not just first one **** |
| 2091 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); |
| 2092 |
| 2093 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); |
| 2094 |
| 2095 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expe
ctedEnd)) { |
| 2096 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d,
%d]\n" |
| 2097 " strength=%s seed=%d", |
| 2098 name, expectedStart, expectedEnd, actualStart, actualEnd, strength
, seed); |
| 2099 } |
| 2100 |
| 2101 if (expectedStart == -1 && actualStart == -1) { |
| 2102 notFoundCount += 1; |
| 2103 } |
| 2104 |
| 2105 // **** TODO: find *all* matches, not just first one **** |
| 2106 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); |
| 2107 |
| 2108 usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length
(), &status); |
| 2109 |
| 2110 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); |
| 2111 |
| 2112 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expe
ctedEnd)) { |
| 2113 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [
%d, %d]\n" |
| 2114 " strength=%s seed=%d", |
| 2115 name, expectedStart, expectedEnd, actualStart, actualEnd, strength
, seed); |
| 2116 } |
| 2117 |
| 2118 if (expectedStart == -1 && actualStart == -1) { |
| 2119 notFoundCount += 1; |
| 2120 } |
| 2121 |
| 2122 return notFoundCount; |
| 2123 } |
| 2124 |
| 2125 static void hexForUnicodeString(const UnicodeString &ustr, char * cbuf, int32_t
cbuflen) |
| 2126 { |
| 2127 int32_t ustri, ustrlen = ustr.length(); |
| 2128 |
| 2129 for (ustri = 0; ustri < ustrlen; ++ustri) { |
| 2130 if (cbuflen >= 9 /* format width for single code unit(5) + terminating e
llipsis(3) + null(1) */) { |
| 2131 int len = sprintf(cbuf, " %04X", ustr.charAt(ustri)); |
| 2132 cbuflen -= len; |
| 2133 cbuf += len; |
| 2134 } else { |
| 2135 if (cbuflen >= 4 /* terminating ellipsis(3) + null(1) */) { |
| 2136 sprintf(cbuf, "..."); |
| 2137 } else if (cbuflen >= 1) { |
| 2138 cbuf = 0; |
| 2139 } |
| 2140 break; |
| 2141 } |
| 2142 } |
| 2143 } |
| 2144 |
| 2145 int32_t SSearchTest::bmMonkeyTestCase(UCollator *coll, const UnicodeString &test
Case, const UnicodeString &pattern, const UnicodeString &altPattern, |
| 2146 BoyerMooreSearch *bms, BoyerMooreSearch *abm
s, |
| 2147 const char *name, const char *strength, uint
32_t seed) |
| 2148 { |
| 2149 UErrorCode status = U_ZERO_ERROR; |
| 2150 int32_t actualStart = -1, actualEnd = -1; |
| 2151 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + alt
Pattern.length(); |
| 2152 int32_t expectedStart = -1, expectedEnd = -1; |
| 2153 int32_t notFoundCount = 0; |
| 2154 char hexbuf[128]; |
| 2155 |
| 2156 // **** TODO: find *all* matches, not just first one **** |
| 2157 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); |
| 2158 |
| 2159 bms->setTargetString(&testCase, status); |
| 2160 bms->search(0, actualStart, actualEnd); |
| 2161 |
| 2162 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expe
ctedEnd)) { |
| 2163 hexForUnicodeString(pattern, hexbuf, sizeof(hexbuf)); |
| 2164 errln("Boyer-Moore Search for <pattern> in <%s> failed: expected [%d, %d
], got [%d, %d]\n" |
| 2165 " strength=%s seed=%d <pattern>: %s", |
| 2166 name, expectedStart, expectedEnd, actualStart, actualEnd, strength
, seed, hexbuf); |
| 2167 } |
| 2168 |
| 2169 if (expectedStart == -1 && actualStart == -1) { |
| 2170 notFoundCount += 1; |
| 2171 } |
| 2172 |
| 2173 // **** TODO: find *all* matches, not just first one **** |
| 2174 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); |
| 2175 |
| 2176 abms->setTargetString(&testCase, status); |
| 2177 abms->search(0, actualStart, actualEnd); |
| 2178 |
| 2179 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expe
ctedEnd)) { |
| 2180 hexForUnicodeString(altPattern, hexbuf, sizeof(hexbuf)); |
| 2181 errln("Boyer-Moore Search for <alt_pattern> in <%s> failed: expected [%d
, %d], got [%d, %d]\n" |
| 2182 " strength=%s seed=%d <pattern>: %s", |
| 2183 name, expectedStart, expectedEnd, actualStart, actualEnd, strength
, seed, hexbuf); |
| 2184 } |
| 2185 |
| 2186 if (expectedStart == -1 && actualStart == -1) { |
| 2187 notFoundCount += 1; |
| 2188 } |
| 2189 |
| 2190 |
| 2191 return notFoundCount; |
| 2192 } |
| 2193 #endif |
| 2194 |
| 2195 void SSearchTest::monkeyTest(char *params) |
| 2196 { |
| 2197 // ook! |
| 2198 UErrorCode status = U_ZERO_ERROR; |
| 2199 //UCollator *coll = ucol_open(NULL, &status); |
| 2200 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); |
| 2201 |
| 2202 if (U_FAILURE(status)) { |
| 2203 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_er
rorName(status)); |
| 2204 return; |
| 2205 } |
| 2206 |
| 2207 CollData *monkeyData = CollData::open(coll, status); |
| 2208 |
| 2209 USet *expansions = uset_openEmpty(); |
| 2210 USet *contractions = uset_openEmpty(); |
| 2211 |
| 2212 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &st
atus); |
| 2213 |
| 2214 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39)
; |
| 2215 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39)
; |
| 2216 USet *letters = uset_openPattern(letter_pattern, 39, &status); |
| 2217 SetMonkey letterMonkey(letters); |
| 2218 StringSetMonkey contractionMonkey(contractions, coll, monkeyData); |
| 2219 StringSetMonkey expansionMonkey(expansions, coll, monkeyData); |
| 2220 UnicodeString testCase; |
| 2221 UnicodeString alternate; |
| 2222 UnicodeString pattern, altPattern; |
| 2223 UnicodeString prefix, altPrefix; |
| 2224 UnicodeString suffix, altSuffix; |
| 2225 |
| 2226 Monkey *monkeys[] = { |
| 2227 &letterMonkey, |
| 2228 &contractionMonkey, |
| 2229 &expansionMonkey, |
| 2230 &contractionMonkey, |
| 2231 &expansionMonkey, |
| 2232 &contractionMonkey, |
| 2233 &expansionMonkey, |
| 2234 &contractionMonkey, |
| 2235 &expansionMonkey}; |
| 2236 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); |
| 2237 // int32_t nonMatchCount = 0; |
| 2238 |
| 2239 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIAR
Y}; |
| 2240 const char *strengthNames[] = {"primary", "secondary", "tertiary"}; |
| 2241 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); |
| 2242 int32_t loopCount = quick? 1000 : 10000; |
| 2243 int32_t firstStrength = 0; |
| 2244 int32_t lastStrength = strengthCount - 1; //*/ 0; |
| 2245 |
| 2246 if (params != NULL) { |
| 2247 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 2248 UnicodeString p(params); |
| 2249 |
| 2250 loopCount = getIntParam("loop", p, loopCount); |
| 2251 m_seed = getIntParam("seed", p, m_seed); |
| 2252 |
| 2253 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, st
atus); |
| 2254 if (m.find()) { |
| 2255 UnicodeString breakType = m.group(1, status); |
| 2256 |
| 2257 for (int32_t s = 0; s < strengthCount; s += 1) { |
| 2258 if (breakType == strengthNames[s]) { |
| 2259 firstStrength = lastStrength = s; |
| 2260 break; |
| 2261 } |
| 2262 } |
| 2263 |
| 2264 m.reset(); |
| 2265 p = m.replaceFirst("", status); |
| 2266 } |
| 2267 |
| 2268 if (RegexMatcher("\\S", p, 0, status).find()) { |
| 2269 // Each option is stripped out of the option string as it is process
ed. |
| 2270 // All options have been checked. The option string should have bee
n completely emptied.. |
| 2271 char buf[100]; |
| 2272 p.extract(buf, sizeof(buf), NULL, status); |
| 2273 buf[sizeof(buf)-1] = 0; |
| 2274 errln("Unrecognized or extra parameter: %s\n", buf); |
| 2275 return; |
| 2276 } |
| 2277 #else |
| 2278 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring
parameters."); |
| 2279 #endif |
| 2280 } |
| 2281 |
| 2282 for(int32_t s = firstStrength; s <= lastStrength; s += 1) { |
| 2283 int32_t notFoundCount = 0; |
| 2284 |
| 2285 logln("Setting strength to %s.", strengthNames[s]); |
| 2286 ucol_setStrength(coll, strengths[s]); |
| 2287 |
| 2288 // TODO: try alternate prefix and suffix too? |
| 2289 // TODO: alterntaes are only equal at primary strength. Is this OK? |
| 2290 for(int32_t t = 0; t < loopCount; t += 1) { |
| 2291 uint32_t seed = m_seed; |
| 2292 // int32_t nmc = 0; |
| 2293 |
| 2294 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); |
| 2295 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); |
| 2296 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); |
| 2297 |
| 2298 // pattern |
| 2299 notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern,
"pattern", strengthNames[s], seed); |
| 2300 |
| 2301 testCase.remove(); |
| 2302 testCase.append(prefix); |
| 2303 testCase.append(/*alt*/pattern); |
| 2304 |
| 2305 // prefix + pattern |
| 2306 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern,
"prefix + pattern", strengthNames[s], seed); |
| 2307 |
| 2308 testCase.append(suffix); |
| 2309 |
| 2310 // prefix + pattern + suffix |
| 2311 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern,
"prefix + pattern + suffix", strengthNames[s], seed); |
| 2312 |
| 2313 testCase.remove(); |
| 2314 testCase.append(pattern); |
| 2315 testCase.append(suffix); |
| 2316 |
| 2317 // pattern + suffix |
| 2318 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern,
"pattern + suffix", strengthNames[s], seed); |
| 2319 } |
| 2320 |
| 2321 logln("For strength %s the not found count is %d.", strengthNames[s], not
FoundCount); |
| 2322 } |
| 2323 |
| 2324 uset_close(contractions); |
| 2325 uset_close(expansions); |
| 2326 uset_close(letters); |
| 2327 |
| 2328 CollData::close(monkeyData); |
| 2329 |
| 2330 ucol_close(coll); |
| 2331 } |
| 2332 |
| 2333 void SSearchTest::bmMonkeyTest(char *params) |
| 2334 { |
| 2335 static const UVersionInfo icu47 = { 4, 7, 0, 0 }; // for timebomb |
| 2336 static const UChar skipChars[] = { 0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0
xAAB5, 0xAAB6, 0xAAB9, 0xAABB, 0xAABC, 0 }; // for timebomb |
| 2337 // ook! |
| 2338 UErrorCode status = U_ZERO_ERROR; |
| 2339 UCollator *coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); |
| 2340 |
| 2341 if (U_FAILURE(status)) { |
| 2342 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_er
rorName(status)); |
| 2343 return; |
| 2344 } |
| 2345 |
| 2346 CollData *monkeyData = CollData::open(coll, status); |
| 2347 |
| 2348 USet *expansions = uset_openEmpty(); |
| 2349 USet *contractions = uset_openEmpty(); |
| 2350 |
| 2351 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &st
atus); |
| 2352 |
| 2353 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39)
; |
| 2354 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39)
; |
| 2355 USet *letters = uset_openPattern(letter_pattern, 39, &status); |
| 2356 SetMonkey letterMonkey(letters); |
| 2357 StringSetMonkey contractionMonkey(contractions, coll, monkeyData); |
| 2358 StringSetMonkey expansionMonkey(expansions, coll, monkeyData); |
| 2359 UnicodeString testCase; |
| 2360 UnicodeString alternate; |
| 2361 UnicodeString pattern, altPattern; |
| 2362 UnicodeString prefix, altPrefix; |
| 2363 UnicodeString suffix, altSuffix; |
| 2364 |
| 2365 Monkey *monkeys[] = { |
| 2366 &letterMonkey, |
| 2367 &contractionMonkey, |
| 2368 &expansionMonkey, |
| 2369 &contractionMonkey, |
| 2370 &expansionMonkey, |
| 2371 &contractionMonkey, |
| 2372 &expansionMonkey, |
| 2373 &contractionMonkey, |
| 2374 &expansionMonkey}; |
| 2375 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); |
| 2376 // int32_t nonMatchCount = 0; |
| 2377 |
| 2378 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIAR
Y}; |
| 2379 const char *strengthNames[] = {"primary", "secondary", "tertiary"}; |
| 2380 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); |
| 2381 int32_t loopCount = quick? 1000 : 10000; |
| 2382 int32_t firstStrength = 0; |
| 2383 int32_t lastStrength = strengthCount - 1; //*/ 0; |
| 2384 |
| 2385 if (params != NULL) { |
| 2386 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 2387 UnicodeString p(params); |
| 2388 |
| 2389 loopCount = getIntParam("loop", p, loopCount); |
| 2390 m_seed = getIntParam("seed", p, m_seed); |
| 2391 |
| 2392 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, st
atus); |
| 2393 if (m.find()) { |
| 2394 UnicodeString breakType = m.group(1, status); |
| 2395 |
| 2396 for (int32_t s = 0; s < strengthCount; s += 1) { |
| 2397 if (breakType == strengthNames[s]) { |
| 2398 firstStrength = lastStrength = s; |
| 2399 break; |
| 2400 } |
| 2401 } |
| 2402 |
| 2403 m.reset(); |
| 2404 p = m.replaceFirst("", status); |
| 2405 } |
| 2406 |
| 2407 if (RegexMatcher("\\S", p, 0, status).find()) { |
| 2408 // Each option is stripped out of the option string as it is process
ed. |
| 2409 // All options have been checked. The option string should have bee
n completely emptied.. |
| 2410 char buf[100]; |
| 2411 p.extract(buf, sizeof(buf), NULL, status); |
| 2412 buf[sizeof(buf)-1] = 0; |
| 2413 errln("Unrecognized or extra parameter: %s\n", buf); |
| 2414 return; |
| 2415 } |
| 2416 #else |
| 2417 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring
parameters."); |
| 2418 #endif |
| 2419 } |
| 2420 |
| 2421 for(int32_t s = firstStrength; s <= lastStrength; s += 1) { |
| 2422 int32_t notFoundCount = 0; |
| 2423 |
| 2424 logln("Setting strength to %s.", strengthNames[s]); |
| 2425 ucol_setStrength(coll, strengths[s]); |
| 2426 |
| 2427 CollData *data = CollData::open(coll, status); |
| 2428 |
| 2429 UnicodeString skipString(skipChars); // for timebomb |
| 2430 UnicodeSet* skipSet = UnicodeSet::createFromAll(skipString); // for time
bomb |
| 2431 // TODO: try alternate prefix and suffix too? |
| 2432 // TODO: alterntaes are only equal at primary strength. Is this OK? |
| 2433 for(int32_t t = 0; t < loopCount; t += 1) { |
| 2434 uint32_t seed = m_seed; |
| 2435 // int32_t nmc = 0; |
| 2436 |
| 2437 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); |
| 2438 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); |
| 2439 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); |
| 2440 |
| 2441 if (!isICUVersionAtLeast(icu47) && skipSet->containsSome(pattern)) { |
| 2442 continue; // timebomb until ticket #8080 is resolved |
| 2443 } |
| 2444 |
| 2445 BoyerMooreSearch pat(data, pattern, NULL, status); |
| 2446 BoyerMooreSearch alt(data, altPattern, NULL, status); |
| 2447 |
| 2448 // **** need a better way to deal with this **** |
| 2449 #if 0 |
| 2450 if (pat.empty() || |
| 2451 alt.empty()) { |
| 2452 continue; |
| 2453 } |
| 2454 #endif |
| 2455 |
| 2456 // pattern |
| 2457 notFoundCount += bmMonkeyTestCase(coll, pattern, pattern, altPattern
, &pat, &alt, "pattern", strengthNames[s], seed); |
| 2458 |
| 2459 testCase.remove(); |
| 2460 testCase.append(prefix); |
| 2461 testCase.append(/*alt*/pattern); |
| 2462 |
| 2463 // prefix + pattern |
| 2464 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPatter
n, &pat, &alt, "prefix + pattern", strengthNames[s], seed); |
| 2465 |
| 2466 testCase.append(suffix); |
| 2467 |
| 2468 // prefix + pattern + suffix |
| 2469 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPatter
n, &pat, &alt, "prefix + pattern + suffix", strengthNames[s], seed); |
| 2470 |
| 2471 testCase.remove(); |
| 2472 testCase.append(pattern); |
| 2473 testCase.append(suffix); |
| 2474 |
| 2475 // pattern + suffix |
| 2476 notFoundCount += bmMonkeyTestCase(coll, testCase, pattern, altPatter
n, &pat, &alt, "pattern + suffix", strengthNames[s], seed); |
| 2477 } |
| 2478 delete skipSet; // for timebomb |
| 2479 |
| 2480 CollData::close(data); |
| 2481 |
| 2482 logln("For strength %s the not found count is %d.", strengthNames[s], no
tFoundCount); |
| 2483 } |
| 2484 |
| 2485 uset_close(contractions); |
| 2486 uset_close(expansions); |
| 2487 uset_close(letters); |
| 2488 |
| 2489 CollData::close(monkeyData); |
| 2490 |
| 2491 ucol_close(coll); |
| 2492 } |
| 2493 |
| 2494 void SSearchTest::stringListTest(){ |
| 2495 UErrorCode status = U_ZERO_ERROR; |
| 2496 StringList *sl = new StringList(status); |
| 2497 if(U_FAILURE(status)){ |
| 2498 errln("ERROR: stringListTest: Could not start StringList"); |
| 2499 } |
| 2500 |
| 2501 const UChar chars[] = { |
| 2502 0x0000 |
| 2503 }; |
| 2504 sl->add(chars, (int32_t) 0, status); |
| 2505 if(U_FAILURE(status)){ |
| 2506 errln("ERROR: stringListTest: StringList::add"); |
| 2507 } |
| 2508 |
| 2509 if(sl->getDynamicClassID() != StringList::getStaticClassID()){ |
| 2510 errln("ERROR: stringListTest: getDynamicClassID and getStaticClassID doe
s not match"); |
| 2511 } |
| 2512 delete sl; |
| 2513 } |
| 2514 |
| 2515 #endif |
| 2516 |
| 2517 #endif |
OLD | NEW |