OLD | NEW |
| (Empty) |
1 /* | |
2 ********************************************************************** | |
3 * Copyright (C) 2005-2014, International Business Machines | |
4 * Corporation and others. All Rights Reserved. | |
5 ********************************************************************** | |
6 */ | |
7 | |
8 #include "unicode/utypes.h" | |
9 | |
10 #if !UCONFIG_NO_COLLATION | |
11 | |
12 #include "cmemory.h" | |
13 #include "cstring.h" | |
14 #include "usrchimp.h" | |
15 | |
16 #include "unicode/coll.h" | |
17 #include "unicode/tblcoll.h" | |
18 #include "unicode/usearch.h" | |
19 #include "unicode/uset.h" | |
20 #include "unicode/ustring.h" | |
21 | |
22 #include "unicode/coleitr.h" | |
23 #include "unicode/regex.h" // TODO: make conditional on regexp being buil
t. | |
24 | |
25 #include "colldata.h" | |
26 #include "ssearch.h" | |
27 #include "xmlparser.h" | |
28 | |
29 #include <stdio.h> // for sprintf | |
30 | |
// Identifier of the test case currently being run. The assertion macros
// below include it in their failure messages.
char testId[100];

// Reports a failure (file, line and current test ID) when x is false.
// Execution continues after the report.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}

// Reports a failure with the message m when x is false, and then returns
// from the enclosing function (which must therefore return void).
#define TEST_ASSERT_M(x, m) {if (!(x)) { \
    dataerrln("Failure in file %s, line %d. \"%s\"", __FILE__, __LINE__, m);return;}}

// Reports a failure (file, line, test ID and error name) when errcode is a
// failure code. Execution continues after the report.
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
    dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
        __FILE__, __LINE__, testId, u_errorName(errcode));}}

// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that any array-valued expression can be passed.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocates uninitialized storage for count objects of the given type.
// The whole expansion is parenthesized so it can be used inside larger
// expressions.
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))

// Releases storage obtained with NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
46 | |
47 //--------------------------------------------------------------------------- | |
48 // | |
49 // Test class boilerplate | |
50 // | |
51 //--------------------------------------------------------------------------- | |
52 SSearchTest::SSearchTest() | |
53 { | |
54 } | |
55 | |
56 SSearchTest::~SSearchTest() | |
57 { | |
58 } | |
59 | |
60 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name,
char *params ) | |
61 { | |
62 if (exec) logln("TestSuite SSearchTest: "); | |
63 switch (index) { | |
64 #if !UCONFIG_NO_BREAK_ITERATION | |
65 case 0: name = "searchTest"; | |
66 if (exec) searchTest(); | |
67 break; | |
68 | |
69 case 1: name = "offsetTest"; | |
70 if (exec) offsetTest(); | |
71 break; | |
72 | |
73 case 2: name = "monkeyTest"; | |
74 if (exec) monkeyTest(params); | |
75 break; | |
76 | |
77 case 3: name = "sharpSTest"; | |
78 if (exec) sharpSTest(); | |
79 break; | |
80 | |
81 case 4: name = "goodSuffixTest"; | |
82 if (exec) goodSuffixTest(); | |
83 break; | |
84 | |
85 case 5: name = "searchTime"; | |
86 if (exec) searchTime(); | |
87 break; | |
88 #endif | |
89 default: name = ""; | |
90 break; //needed to end loop | |
91 } | |
92 } | |
93 | |
94 | |
95 #if !UCONFIG_NO_BREAK_ITERATION | |
96 | |
97 #define PATH_BUFFER_SIZE 2048 | |
98 const char *SSearchTest::getPath(char buffer[2048], const char *filename) { | |
99 UErrorCode status = U_ZERO_ERROR; | |
100 const char *testDataDirectory = IntlTest::getSourceTestData(status); | |
101 | |
102 if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >=
PATH_BUFFER_SIZE) { | |
103 errln("ERROR: getPath() failed - %s", u_errorName(status)); | |
104 return NULL; | |
105 } | |
106 | |
107 strcpy(buffer, testDataDirectory); | |
108 strcat(buffer, filename); | |
109 return buffer; | |
110 } | |
111 | |
112 | |
113 void SSearchTest::searchTest() | |
114 { | |
115 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO | |
116 UErrorCode status = U_ZERO_ERROR; | |
117 char path[PATH_BUFFER_SIZE]; | |
118 const char *testFilePath = getPath(path, "ssearch.xml"); | |
119 | |
120 if (testFilePath == NULL) { | |
121 return; /* Couldn't get path: error message already output. */ | |
122 } | |
123 | |
124 LocalPointer<UXMLParser> parser(UXMLParser::createParser(status)); | |
125 TEST_ASSERT_SUCCESS(status); | |
126 LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status)); | |
127 TEST_ASSERT_SUCCESS(status); | |
128 if (U_FAILURE(status)) { | |
129 return; | |
130 } | |
131 | |
132 const UnicodeString *debugTestCase = root->getAttribute("debug"); | |
133 if (debugTestCase != NULL) { | |
134 // setenv("USEARCH_DEBUG", "1", 1); | |
135 } | |
136 | |
137 | |
138 const UXMLElement *testCase; | |
139 int32_t tc = 0; | |
140 | |
141 while((testCase = root->nextChildElement(tc)) != NULL) { | |
142 | |
143 if (testCase->getTagName().compare("test-case") != 0) { | |
144 errln("ssearch, unrecognized XML Element in test file"); | |
145 continue; | |
146 } | |
147 const UnicodeString *id = testCase->getAttribute("id"); | |
148 *testId = 0; | |
149 if (id != NULL) { | |
150 id->extract(0, id->length(), testId, sizeof(testId), US_INV); | |
151 } | |
152 | |
153 // If debugging test case has been specified and this is not it, skip to
next. | |
154 if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) { | |
155 continue; | |
156 } | |
157 // | |
158 // Get the requested collation strength. | |
159 // Default is tertiary if the XML attribute is missing from the test
case. | |
160 // | |
161 const UnicodeString *strength = testCase->getAttribute("strength"); | |
162 UColAttributeValue collatorStrength = UCOL_PRIMARY; | |
163 if (strength==NULL) { collatorStrength = UCOL_TERTIARY;} | |
164 else if (*strength=="PRIMARY") { collatorStrength = UCOL_PRIMARY;} | |
165 else if (*strength=="SECONDARY") { collatorStrength = UCOL_SECONDARY;} | |
166 else if (*strength=="TERTIARY") { collatorStrength = UCOL_TERTIARY;} | |
167 else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;} | |
168 else if (*strength=="IDENTICAL") { collatorStrength = UCOL_IDENTICAL;} | |
169 else { | |
170 // Bogus value supplied for strength. Shouldn't happen, even from | |
171 // typos, if the XML source has been validated. | |
172 // This assert is a little deceiving in that strength can be | |
173 // any of the allowed values, not just TERTIARY, but it will | |
174 // do the job of getting the error output. | |
175 TEST_ASSERT(*strength=="TERTIARY") | |
176 } | |
177 | |
178 // | |
179 // Get the collator normalization flag. Default is UCOL_OFF. | |
180 // | |
181 UColAttributeValue normalize = UCOL_OFF; | |
182 const UnicodeString *norm = testCase->getAttribute("norm"); | |
183 TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF"); | |
184 if (norm!=NULL && *norm=="ON") { | |
185 normalize = UCOL_ON; | |
186 } | |
187 | |
188 // | |
189 // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE. | |
190 // | |
191 UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE; | |
192 const UnicodeString *alt = testCase->getAttribute("alternate_handling"); | |
193 TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE"
); | |
194 if (alt != NULL && *alt == "SHIFTED") { | |
195 alternateHandling = UCOL_SHIFTED; | |
196 } | |
197 | |
198 const UnicodeString defLocale("en"); | |
199 char clocale[100]; | |
200 const UnicodeString *locale = testCase->getAttribute("locale"); | |
201 if (locale == NULL || locale->length()==0) { | |
202 locale = &defLocale; | |
203 }; | |
204 locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL); | |
205 | |
206 | |
207 UnicodeString text; | |
208 UnicodeString target; | |
209 UnicodeString pattern; | |
210 int32_t expectedMatchStart = -1; | |
211 int32_t expectedMatchLimit = -1; | |
212 const UXMLElement *n; | |
213 int32_t nodeCount = 0; | |
214 | |
215 n = testCase->getChildElement("pattern"); | |
216 TEST_ASSERT(n != NULL); | |
217 if (n==NULL) { | |
218 continue; | |
219 } | |
220 text = n->getText(FALSE); | |
221 text = text.unescape(); | |
222 pattern.append(text); | |
223 nodeCount++; | |
224 | |
225 n = testCase->getChildElement("pre"); | |
226 if (n!=NULL) { | |
227 text = n->getText(FALSE); | |
228 text = text.unescape(); | |
229 target.append(text); | |
230 nodeCount++; | |
231 } | |
232 | |
233 n = testCase->getChildElement("m"); | |
234 if (n!=NULL) { | |
235 expectedMatchStart = target.length(); | |
236 text = n->getText(FALSE); | |
237 text = text.unescape(); | |
238 target.append(text); | |
239 expectedMatchLimit = target.length(); | |
240 nodeCount++; | |
241 } | |
242 | |
243 n = testCase->getChildElement("post"); | |
244 if (n!=NULL) { | |
245 text = n->getText(FALSE); | |
246 text = text.unescape(); | |
247 target.append(text); | |
248 nodeCount++; | |
249 } | |
250 | |
251 // Check that there weren't extra things in the XML | |
252 TEST_ASSERT(nodeCount == testCase->countChildren()); | |
253 | |
254 // Open a collator and StringSearch based on the parameters | |
255 // obtained from the XML. | |
256 // | |
257 status = U_ZERO_ERROR; | |
258 LocalUCollatorPointer collator(ucol_open(clocale, &status)); | |
259 ucol_setStrength(collator.getAlias(), collatorStrength); | |
260 ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normaliz
e, &status); | |
261 ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternat
eHandling, &status); | |
262 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer
(), pattern.length(), | |
263 target.getBuffer(
), target.length(), | |
264 collator.getAlias
(), | |
265 NULL, // the
break iterator | |
266 &status)); | |
267 | |
268 TEST_ASSERT_SUCCESS(status); | |
269 if (U_FAILURE(status)) { | |
270 continue; | |
271 } | |
272 | |
273 int32_t foundStart = 0; | |
274 int32_t foundLimit = 0; | |
275 UBool foundMatch; | |
276 | |
277 // | |
278 // Do the search, check the match result against the expected results. | |
279 // | |
280 foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit,
&status); | |
281 TEST_ASSERT_SUCCESS(status); | |
282 if ((foundMatch && expectedMatchStart<0) || | |
283 (foundStart != expectedMatchStart) || | |
284 (foundLimit != expectedMatchLimit)) { | |
285 TEST_ASSERT(FALSE); // ouput generic error position | |
286 infoln("Found, expected match start = %d, %d \n" | |
287 "Found, expected match limit = %d, %d", | |
288 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); | |
289 } | |
290 | |
291 // In case there are other matches... | |
292 // (should we only do this if the test case passed?) | |
293 while (foundMatch) { | |
294 expectedMatchStart = foundStart; | |
295 expectedMatchLimit = foundLimit; | |
296 | |
297 foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart,
&foundLimit, &status); | |
298 } | |
299 | |
300 uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.l
ength(), | |
301 target.getBuffer(), target.length(), | |
302 collator.getAlias(), | |
303 NULL, | |
304 &status)); | |
305 | |
306 // | |
307 // Do the backwards search, check the match result against the expected
results. | |
308 // | |
309 foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &fo
undStart, &foundLimit, &status); | |
310 TEST_ASSERT_SUCCESS(status); | |
311 if ((foundMatch && expectedMatchStart<0) || | |
312 (foundStart != expectedMatchStart) || | |
313 (foundLimit != expectedMatchLimit)) { | |
314 TEST_ASSERT(FALSE); // ouput generic error position | |
315 infoln("Found, expected backwards match start = %d, %d \n" | |
316 "Found, expected backwards match limit = %d, %d", | |
317 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit); | |
318 } | |
319 } | |
320 #endif | |
321 } | |
322 | |
// One collation element together with the iterator offsets recorded on
// either side of it.
struct Order
{
    int32_t order;       // the collation element value
    int32_t lowOffset;   // iterator offset at the low end of the element
    int32_t highOffset;  // iterator offset at the high end of the element
};
329 | |
330 class OrderList | |
331 { | |
332 public: | |
333 OrderList(); | |
334 OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset
= 0); | |
335 ~OrderList(); | |
336 | |
337 int32_t size(void) const; | |
338 void add(int32_t order, int32_t low, int32_t high); | |
339 const Order *get(int32_t index) const; | |
340 int32_t getLowOffset(int32_t index) const; | |
341 int32_t getHighOffset(int32_t index) const; | |
342 int32_t getOrder(int32_t index) const; | |
343 void reverse(void); | |
344 UBool compare(const OrderList &other) const; | |
345 UBool matchesAt(int32_t offset, const OrderList &other) const; | |
346 | |
347 private: | |
348 Order *list; | |
349 int32_t listMax; | |
350 int32_t listSize; | |
351 }; | |
352 | |
353 OrderList::OrderList() | |
354 : list(NULL), listMax(16), listSize(0) | |
355 { | |
356 list = new Order[listMax]; | |
357 } | |
358 | |
359 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t strin
gOffset) | |
360 : list(NULL), listMax(16), listSize(0) | |
361 { | |
362 UErrorCode status = U_ZERO_ERROR; | |
363 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), stri
ng.length(), &status); | |
364 uint32_t strengthMask = 0; | |
365 int32_t order, low, high; | |
366 | |
367 switch (ucol_getStrength(coll)) | |
368 { | |
369 default: | |
370 strengthMask |= UCOL_TERTIARYORDERMASK; | |
371 /* fall through */ | |
372 | |
373 case UCOL_SECONDARY: | |
374 strengthMask |= UCOL_SECONDARYORDERMASK; | |
375 /* fall through */ | |
376 | |
377 case UCOL_PRIMARY: | |
378 strengthMask |= UCOL_PRIMARYORDERMASK; | |
379 } | |
380 | |
381 list = new Order[listMax]; | |
382 | |
383 ucol_setOffset(elems, stringOffset, &status); | |
384 | |
385 do { | |
386 low = ucol_getOffset(elems); | |
387 order = ucol_next(elems, &status); | |
388 high = ucol_getOffset(elems); | |
389 | |
390 if (order != UCOL_NULLORDER) { | |
391 order &= strengthMask; | |
392 } | |
393 | |
394 if (order != UCOL_IGNORABLE) { | |
395 add(order, low, high); | |
396 } | |
397 } while (order != UCOL_NULLORDER); | |
398 | |
399 ucol_closeElements(elems); | |
400 } | |
401 | |
402 OrderList::~OrderList() | |
403 { | |
404 delete[] list; | |
405 } | |
406 | |
407 void OrderList::add(int32_t order, int32_t low, int32_t high) | |
408 { | |
409 if (listSize >= listMax) { | |
410 listMax *= 2; | |
411 | |
412 Order *newList = new Order[listMax]; | |
413 | |
414 uprv_memcpy(newList, list, listSize * sizeof(Order)); | |
415 delete[] list; | |
416 list = newList; | |
417 } | |
418 | |
419 list[listSize].order = order; | |
420 list[listSize].lowOffset = low; | |
421 list[listSize].highOffset = high; | |
422 | |
423 listSize += 1; | |
424 } | |
425 | |
426 const Order *OrderList::get(int32_t index) const | |
427 { | |
428 if (index >= listSize) { | |
429 return NULL; | |
430 } | |
431 | |
432 return &list[index]; | |
433 } | |
434 | |
435 int32_t OrderList::getLowOffset(int32_t index) const | |
436 { | |
437 const Order *order = get(index); | |
438 | |
439 if (order != NULL) { | |
440 return order->lowOffset; | |
441 } | |
442 | |
443 return -1; | |
444 } | |
445 | |
446 int32_t OrderList::getHighOffset(int32_t index) const | |
447 { | |
448 const Order *order = get(index); | |
449 | |
450 if (order != NULL) { | |
451 return order->highOffset; | |
452 } | |
453 | |
454 return -1; | |
455 } | |
456 | |
457 int32_t OrderList::getOrder(int32_t index) const | |
458 { | |
459 const Order *order = get(index); | |
460 | |
461 if (order != NULL) { | |
462 return order->order; | |
463 } | |
464 | |
465 return UCOL_NULLORDER; | |
466 } | |
467 | |
468 int32_t OrderList::size() const | |
469 { | |
470 return listSize; | |
471 } | |
472 | |
473 void OrderList::reverse() | |
474 { | |
475 for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) { | |
476 Order swap = list[b]; | |
477 | |
478 list[b] = list[f]; | |
479 list[f] = swap; | |
480 } | |
481 } | |
482 | |
483 UBool OrderList::compare(const OrderList &other) const | |
484 { | |
485 if (listSize != other.listSize) { | |
486 return FALSE; | |
487 } | |
488 | |
489 for(int32_t i = 0; i < listSize; i += 1) { | |
490 if (list[i].order != other.list[i].order || | |
491 list[i].lowOffset != other.list[i].lowOffset || | |
492 list[i].highOffset != other.list[i].highOffset) { | |
493 return FALSE; | |
494 } | |
495 } | |
496 | |
497 return TRUE; | |
498 } | |
499 | |
500 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const | |
501 { | |
502 // NOTE: sizes include the NULLORDER, which we don't want to compare. | |
503 int32_t otherSize = other.size() - 1; | |
504 | |
505 if (listSize - 1 - offset < otherSize) { | |
506 return FALSE; | |
507 } | |
508 | |
509 for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) { | |
510 if (getOrder(i) != other.getOrder(j)) { | |
511 return FALSE; | |
512 } | |
513 } | |
514 | |
515 return TRUE; | |
516 } | |
517 | |
518 static char *printOffsets(char *buffer, OrderList &list) | |
519 { | |
520 int32_t size = list.size(); | |
521 char *s = buffer; | |
522 | |
523 for(int32_t i = 0; i < size; i += 1) { | |
524 const Order *order = list.get(i); | |
525 | |
526 if (i != 0) { | |
527 s += sprintf(s, ", "); | |
528 } | |
529 | |
530 s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset); | |
531 } | |
532 | |
533 return buffer; | |
534 } | |
535 | |
536 static char *printOrders(char *buffer, OrderList &list) | |
537 { | |
538 int32_t size = list.size(); | |
539 char *s = buffer; | |
540 | |
541 for(int32_t i = 0; i < size; i += 1) { | |
542 const Order *order = list.get(i); | |
543 | |
544 if (i != 0) { | |
545 s += sprintf(s, ", "); | |
546 } | |
547 | |
548 s += sprintf(s, "%8.8X", order->order); | |
549 } | |
550 | |
551 return buffer; | |
552 } | |
553 | |
554 void SSearchTest::offsetTest() | |
555 { | |
556 const char *test[] = { | |
557 // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous | |
558 // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71. | |
559 "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0", | |
560 | |
561 "\\ua191\\u16ef\\u2036\\u017a", | |
562 | |
563 #if 0 | |
564 // This results in a complex interaction between contraction, | |
565 // expansion and normalization that confuses the backwards offset fixups
. | |
566 "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", | |
567 #endif | |
568 | |
569 "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85", | |
570 "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3", | |
571 | |
572 "\\u02FE\\u02FF" | |
573 "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\
\u030A\\u030B\\u030C\\u030D\\u030E\\u030F" | |
574 "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\
\u031A\\u031B\\u031C\\u031D\\u031E\\u031F" | |
575 "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\
\u032A\\u032B\\u032C\\u032D\\u032E\\u032F" | |
576 "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\
\u033A\\u033B\\u033C\\u033D\\u033E\\u033F" | |
577 "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\
\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081 | |
578 | |
579 "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // cu
rrently not working, see #8081 | |
580 "a\\u02FF\\u0301\\u0316", // currently not working, see #8081 | |
581 "a\\u02FF\\u0316\\u0301", | |
582 "a\\u0430\\u0301\\u0316", | |
583 "a\\u0430\\u0316\\u0301", | |
584 "abc\\u0E41\\u0301\\u0316", | |
585 "abc\\u0E41\\u0316\\u0301", | |
586 "\\u0E41\\u0301\\u0316", | |
587 "\\u0E41\\u0316\\u0301", | |
588 "a\\u0301\\u0316", | |
589 "a\\u0316\\u0301", | |
590 "\\uAC52\\uAC53", | |
591 "\\u34CA\\u34CB", | |
592 "\\u11ED\\u11EE", | |
593 "\\u30C3\\u30D0", | |
594 "p\\u00E9ch\\u00E9", | |
595 "a\\u0301\\u0325", | |
596 "a\\u0300\\u0325", | |
597 "a\\u0325\\u0300", | |
598 "A\\u0323\\u0300B", | |
599 "A\\u0300\\u0323B", | |
600 "A\\u0301\\u0323B", | |
601 "A\\u0302\\u0301\\u0323B", | |
602 "abc", | |
603 "ab\\u0300c", | |
604 "ab\\u0300\\u0323c", | |
605 " \\uD800\\uDC00\\uDC00", | |
606 "a\\uD800\\uDC00\\uDC00", | |
607 "A\\u0301\\u0301", | |
608 "A\\u0301\\u0323", | |
609 "A\\u0301\\u0323B", | |
610 "B\\u0301\\u0323C", | |
611 "A\\u0300\\u0323B", | |
612 "\\u0301A\\u0301\\u0301", | |
613 "abcd\\r\\u0301", | |
614 "p\\u00EAche", | |
615 "pe\\u0302che", | |
616 }; | |
617 | |
618 int32_t testCount = ARRAY_SIZE(test); | |
619 UErrorCode status = U_ZERO_ERROR; | |
620 RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Loca
le::getEnglish(), status); | |
621 if (U_FAILURE(status)) { | |
622 errcheckln(status, "Failed to create collator in offsetTest! - %s", u_er
rorName(status)); | |
623 return; | |
624 } | |
625 char buffer[4096]; // A bit of a hack... just happens to be long enough for
all the test cases... | |
626 // We could allocate one that's the right size by (CE_co
unt * 10) + 2 | |
627 // 10 chars is enough room for 8 hex digits plus ", ". 2
extra chars for "[" and "]" | |
628 | |
629 col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status); | |
630 | |
631 for(int32_t i = 0; i < testCount; i += 1) { | |
632 UnicodeString ts = CharsToUnicodeString(test[i]); | |
633 CollationElementIterator *iter = col->createCollationElementIterator(ts)
; | |
634 OrderList forwardList; | |
635 OrderList backwardList; | |
636 int32_t order, low, high; | |
637 | |
638 do { | |
639 low = iter->getOffset(); | |
640 order = iter->next(status); | |
641 high = iter->getOffset(); | |
642 | |
643 forwardList.add(order, low, high); | |
644 } while (order != CollationElementIterator::NULLORDER); | |
645 | |
646 iter->reset(); | |
647 iter->setOffset(ts.length(), status); | |
648 | |
649 backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(),
iter->getOffset()); | |
650 | |
651 do { | |
652 high = iter->getOffset(); | |
653 order = iter->previous(status); | |
654 low = iter->getOffset(); | |
655 | |
656 if (order == CollationElementIterator::NULLORDER) { | |
657 break; | |
658 } | |
659 | |
660 backwardList.add(order, low, high); | |
661 } while (TRUE); | |
662 | |
663 backwardList.reverse(); | |
664 | |
665 if (forwardList.compare(backwardList)) { | |
666 logln("Works with \"%s\"", test[i]); | |
667 logln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); | |
668 // logln("Backward offsets: [%s]", printOffsets(buffer, backwardList)); | |
669 | |
670 logln("Forward CEs: [%s]", printOrders(buffer, forwardList)); | |
671 // logln("Backward CEs: [%s]", printOrders(buffer, backwardList)); | |
672 | |
673 logln(); | |
674 } else { | |
675 errln("Fails with \"%s\"", test[i]); | |
676 infoln("Forward offsets: [%s]", printOffsets(buffer, forwardList)); | |
677 infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList))
; | |
678 | |
679 infoln("Forward CEs: [%s]", printOrders(buffer, forwardList)); | |
680 infoln("Backward CEs: [%s]", printOrders(buffer, backwardList)); | |
681 | |
682 infoln(); | |
683 } | |
684 delete iter; | |
685 } | |
686 delete col; | |
687 } | |
688 | |
#if 0
// Appends an escaped form of `string` to `buffer` and returns `buffer`.
// Code points from U+0020 to U+007F are appended as they are, except that a
// backslash is doubled. Every other code point is written as \uXXXX, or as
// \UXXXXXXXX when it is above U+FFFF.
static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
{
    int32_t i = 0;

    while (i < string.length()) {
        const UChar32 ch = string.char32At(i);
        const UBool printable = ch >= 0x0020 && ch <= 0x007F;

        if (!printable) {
            char cbuffer[12];

            if (ch > 0xFFFFL) {
                sprintf(cbuffer, "\\U%8.8X", ch);
            } else {
                sprintf(cbuffer, "\\u%4.4X", ch);
            }

            buffer.append(cbuffer);
        } else if (ch == 0x005C) {
            buffer.append("\\\\");
        } else {
            buffer.append(ch);
        }

        // A code point at or above U+10000 takes two code units in the string.
        i += (ch >= 0x10000L) ? 2 : 1;
    }

    return buffer;
}
#endif
721 | |
722 void SSearchTest::sharpSTest() | |
723 { | |
724 UErrorCode status = U_ZERO_ERROR; | |
725 UCollator *coll = NULL; | |
726 UnicodeString lp = "fuss"; | |
727 UnicodeString sp = "fu\\u00DF"; | |
728 UnicodeString targets[] = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball",
"12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball", | |
729 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF", | |
730 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "
12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu
\\u00DF", "1234fu\\u00DF"}; | |
731 int32_t start = -1, end = -1; | |
732 | |
733 coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status); | |
734 TEST_ASSERT_SUCCESS(status); | |
735 | |
736 UnicodeString lpUnescaped = lp.unescape(); | |
737 UnicodeString spUnescaped = sp.unescape(); | |
738 | |
739 LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBu
ffer(), lpUnescaped.length(), | |
740 lpUnescaped.getBuffer
(), lpUnescaped.length(), // actual test data will be set later | |
741 coll, | |
742 NULL, // the brea
k iterator | |
743 &status)); | |
744 | |
745 LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getB
uffer(), spUnescaped.length(), | |
746 spUnescaped.getBuffer
(), spUnescaped.length(), // actual test data will be set later | |
747 coll, | |
748 NULL, // the brea
k iterator | |
749 &status)); | |
750 TEST_ASSERT_SUCCESS(status); | |
751 | |
752 for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) { | |
753 UBool bFound; | |
754 UnicodeString target = targets[t].unescape(); | |
755 | |
756 start = end = -1; | |
757 usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(),
&status); | |
758 bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status); | |
759 TEST_ASSERT_SUCCESS(status); | |
760 if (bFound) { | |
761 logln("Test %d: found long pattern at [%d, %d].", t, start, end); | |
762 } else { | |
763 dataerrln("Test %d: did not find long pattern.", t); | |
764 } | |
765 | |
766 usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length()
, &status); | |
767 bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status); | |
768 TEST_ASSERT_SUCCESS(status); | |
769 if (bFound) { | |
770 logln("Test %d: found long pattern at [%d, %d].", t, start, end); | |
771 } else { | |
772 dataerrln("Test %d: did not find long pattern.", t); | |
773 } | |
774 } | |
775 | |
776 ucol_close(coll); | |
777 } | |
778 | |
779 void SSearchTest::goodSuffixTest() | |
780 { | |
781 UErrorCode status = U_ZERO_ERROR; | |
782 UCollator *coll = NULL; | |
783 UnicodeString pat = /*"gcagagag"*/ "fxeld"; | |
784 UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld"; | |
785 int32_t start = -1, end = -1; | |
786 UBool bFound; | |
787 | |
788 coll = ucol_open(NULL, &status); | |
789 TEST_ASSERT_SUCCESS(status); | |
790 | |
791 LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.l
ength(), | |
792 target.getBuffer(), ta
rget.length(), | |
793 coll, | |
794 NULL, // the break
iterator | |
795 &status)); | |
796 TEST_ASSERT_SUCCESS(status); | |
797 | |
798 bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status); | |
799 TEST_ASSERT_SUCCESS(status); | |
800 if (bFound) { | |
801 logln("Found pattern at [%d, %d].", start, end); | |
802 } else { | |
803 dataerrln("Did not find pattern."); | |
804 } | |
805 | |
806 ucol_close(coll); | |
807 } | |
808 | |
809 // | |
810 // searchTime() A quick and dirty performance test for string search. | |
811 // Probably doesn't really belong as part of intltest, but it | |
812 // does check that the search succeeds, and gets the right resu
lt, | |
813 // so it serves as a functionality test also. | |
814 // | |
815 // To run as a perf test, up the loop count, select by commenti
ng | |
816 // and uncommenting in the code the operation to be measured, | |
817 // rebuild, and measure the running time of this test alone. | |
818 // | |
819 // time LD_LIBRARY_PATH=whatever ./intltest collate/SSearc
hTest/searchTime | |
820 // | |
821 void SSearchTest::searchTime() { | |
822 static const char *longishText = | |
823 "Whylom, as olde stories tellen us,\n" | |
824 "Ther was a duk that highte Theseus:\n" | |
825 "Of Athenes he was lord and governour,\n" | |
826 "And in his tyme swich a conquerour,\n" | |
827 "That gretter was ther noon under the sonne.\n" | |
828 "Ful many a riche contree hadde he wonne;\n" | |
829 "What with his wisdom and his chivalrye,\n" | |
830 "He conquered al the regne of Femenye,\n" | |
831 "That whylom was y-cleped Scithia;\n" | |
832 "And weddede the quene Ipolita,\n" | |
833 "And broghte hir hoom with him in his contree\n" | |
834 "With muchel glorie and greet solempnitee,\n" | |
835 "And eek hir yonge suster Emelye.\n" | |
836 "And thus with victorie and with melodye\n" | |
837 "Lete I this noble duk to Athenes ryde,\n" | |
838 "And al his hoost, in armes, him bisyde.\n" | |
839 "And certes, if it nere to long to here,\n" | |
840 "I wolde han told yow fully the manere,\n" | |
841 "How wonnen was the regne of Femenye\n" | |
842 "By Theseus, and by his chivalrye;\n" | |
843 "And of the grete bataille for the nones\n" | |
844 "Bitwixen Athen's and Amazones;\n" | |
845 "And how asseged was Ipolita,\n" | |
846 "The faire hardy quene of Scithia;\n" | |
847 "And of the feste that was at hir weddinge,\n" | |
848 "And of the tempest at hir hoom-cominge;\n" | |
849 "But al that thing I moot as now forbere.\n" | |
850 "I have, God woot, a large feeld to ere,\n" | |
851 "And wayke been the oxen in my plough.\n" | |
852 "The remenant of the tale is long y-nough.\n" | |
853 "I wol nat letten eek noon of this route;\n" | |
854 "Lat every felawe telle his tale aboute,\n" | |
855 "And lat see now who shal the soper winne;\n" | |
856 "And ther I lefte, I wol ageyn biginne.\n" | |
857 "This duk, of whom I make mencioun,\n" | |
858 "When he was come almost unto the toun,\n" | |
859 "In al his wele and in his moste pryde,\n" | |
860 "He was war, as he caste his eye asyde,\n" | |
861 "Wher that ther kneled in the hye weye\n" | |
862 "A companye of ladies, tweye and tweye,\n" | |
863 "Ech after other, clad in clothes blake; \n" | |
864 "But swich a cry and swich a wo they make,\n" | |
865 "That in this world nis creature livinge,\n" | |
866 "That herde swich another weymentinge;\n" | |
867 "And of this cry they nolde never stenten,\n" | |
868 "Til they the reynes of his brydel henten.\n" | |
869 "'What folk ben ye, that at myn hoomcominge\n" | |
870 "Perturben so my feste with cryinge'?\n" | |
871 "Quod Theseus, 'have ye so greet envye\n" | |
872 "Of myn honour, that thus compleyne and crye? \n" | |
873 "Or who hath yow misboden, or offended?\n" | |
874 "And telleth me if it may been amended;\n" | |
875 "And why that ye ben clothed thus in blak'?\n" | |
876 "The eldest lady of hem alle spak,\n" | |
877 "When she hadde swowned with a deedly chere,\n" | |
878 "That it was routhe for to seen and here,\n" | |
879 "And seyde: 'Lord, to whom Fortune hath yiven\n" | |
880 "Victorie, and as a conquerour to liven,\n" | |
881 "Noght greveth us your glorie and your honour;\n" | |
882 "But we biseken mercy and socour.\n" | |
883 "Have mercy on our wo and our distresse.\n" | |
884 "Som drope of pitee, thurgh thy gentilesse,\n" | |
885 "Up-on us wrecched wommen lat thou falle.\n" | |
886 "For certes, lord, ther nis noon of us alle,\n" | |
887 "That she nath been a duchesse or a quene;\n" | |
888 "Now be we caitifs, as it is wel sene:\n" | |
889 "Thanked be Fortune, and hir false wheel,\n" | |
890 "That noon estat assureth to be weel.\n" | |
891 "And certes, lord, t'abyden your presence,\n" | |
892 "Here in the temple of the goddesse Clemence\n" | |
893 "We han ben waytinge al this fourtenight;\n" | |
894 "Now help us, lord, sith it is in thy might.\n" | |
895 "I wrecche, which that wepe and waille thus,\n" | |
896 "Was whylom wyf to king Capaneus,\n" | |
897 "That starf at Thebes, cursed be that day!\n" | |
898 "And alle we, that been in this array,\n" | |
899 "And maken al this lamentacioun,\n" | |
900 "We losten alle our housbondes at that toun,\n" | |
901 "Whyl that the sege ther-aboute lay.\n" | |
902 "And yet now th'olde Creon, weylaway!\n" | |
903 "The lord is now of Thebes the citee, \n" | |
904 "Fulfild of ire and of iniquitee,\n" | |
905 "He, for despyt, and for his tirannye,\n" | |
906 "To do the dede bodyes vileinye,\n" | |
907 "Of alle our lordes, whiche that ben slawe,\n" | |
908 "Hath alle the bodyes on an heep y-drawe,\n" | |
909 "And wol nat suffren hem, by noon assent,\n" | |
910 "Neither to been y-buried nor y-brent,\n" | |
911 "But maketh houndes ete hem in despyt. zet'\n"; | |
912 | |
913 const char *cPattern = "maketh houndes ete hem"; | |
914 //const char *cPattern = "Whylom"; | |
915 //const char *cPattern = "zet"; | |
916 const char *testId = "searchTime()"; // for error macros. | |
917 UnicodeString target = longishText; | |
918 UErrorCode status = U_ZERO_ERROR; | |
919 | |
920 | |
921 LocalUCollatorPointer collator(ucol_open("en", &status)); | |
922 //ucol_setStrength(collator.getAlias(), collatorStrength); | |
923 //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize,
&status); | |
924 UnicodeString uPattern = cPattern; | |
925 LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(),
uPattern.length(), | |
926 target.getBuffer(), t
arget.length(), | |
927 collator.getAlias(), | |
928 NULL, // the brea
k iterator | |
929 &status)); | |
930 TEST_ASSERT_SUCCESS(status); | |
931 | |
932 // int32_t foundStart; | |
933 // int32_t foundEnd; | |
934 UBool found; | |
935 | |
936 // Find the match position using strstr | |
937 const char *pm = strstr(longishText, cPattern); | |
938 TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr"); | |
939 int32_t refMatchPos = (int32_t)(pm - longishText); | |
940 int32_t icuMatchPos; | |
941 int32_t icuMatchEnd; | |
942 usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status); | |
943 TEST_ASSERT_SUCCESS(status); | |
944 TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different mat
ch positions."); | |
945 | |
946 int32_t i; | |
947 // int32_t j=0; | |
948 | |
949 // Try loopcounts around 100000 to some millions, depending on the operation
, | |
950 // to get runtimes of at least several seconds. | |
951 for (i=0; i<10000; i++) { | |
952 found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &s
tatus); | |
953 (void)found; // Suppress set but not used warning. | |
954 //TEST_ASSERT_SUCCESS(status); | |
955 //TEST_ASSERT(found); | |
956 | |
957 // usearch_setOffset(uss.getAlias(), 0, &status); | |
958 // icuMatchPos = usearch_next(uss.getAlias(), &status); | |
959 | |
960 // The i+j stuff is to confuse the optimizer and get it to actually lea
ve the | |
961 // call to strstr in place. | |
962 //pm = strstr(longishText+j, cPattern); | |
963 //j = (j + i)%5; | |
964 } | |
965 | |
966 //printf("%ld, %d\n", pm-longishText, j); | |
967 } | |
968 | |
//----------------------------------------------------------------------------------------
//
//   Pseudo-random numbers, in the spirit of the standard library's rand() / srand().
//   A private generator is used instead of the library one so that
//         1.  every platform produces the same sequence, and
//         2.  the current seed can be read, which makes failures easier to reproduce.
//
//----------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advances the linear congruential generator one step and returns a value
// in the range [0, 32767], taken from bits 16..30 of the new seed.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = m_seed * multiplier + increment;

    return (m_seed >> 16) & 0x7FFF;
}
984 | |
985 class Monkey | |
986 { | |
987 public: | |
988 virtual void append(UnicodeString &test, UnicodeString &alternate) = 0; | |
989 | |
990 protected: | |
991 Monkey(); | |
992 virtual ~Monkey(); | |
993 }; | |
994 | |
995 Monkey::Monkey() | |
996 { | |
997 // ook? | |
998 } | |
999 | |
1000 Monkey::~Monkey() | |
1001 { | |
1002 // ook? | |
1003 } | |
1004 | |
1005 class SetMonkey : public Monkey | |
1006 { | |
1007 public: | |
1008 SetMonkey(const USet *theSet); | |
1009 ~SetMonkey(); | |
1010 | |
1011 virtual void append(UnicodeString &test, UnicodeString &alternate); | |
1012 | |
1013 private: | |
1014 const USet *set; | |
1015 }; | |
1016 | |
1017 SetMonkey::SetMonkey(const USet *theSet) | |
1018 : Monkey(), set(theSet) | |
1019 { | |
1020 // ook? | |
1021 } | |
1022 | |
1023 SetMonkey::~SetMonkey() | |
1024 { | |
1025 //ook... | |
1026 } | |
1027 | |
1028 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate) | |
1029 { | |
1030 int32_t size = uset_size(set); | |
1031 int32_t index = m_rand() % size; | |
1032 UChar32 ch = uset_charAt(set, index); | |
1033 UnicodeString str(ch); | |
1034 | |
1035 test.append(str); | |
1036 alternate.append(str); // flip case, or some junk? | |
1037 } | |
1038 | |
1039 class StringSetMonkey : public Monkey | |
1040 { | |
1041 public: | |
1042 StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCol
lData); | |
1043 ~StringSetMonkey(); | |
1044 | |
1045 void append(UnicodeString &testCase, UnicodeString &alternate); | |
1046 | |
1047 private: | |
1048 UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeStr
ing &alternate); | |
1049 | |
1050 const USet *set; | |
1051 UCollator *coll; | |
1052 CollData *collData; | |
1053 }; | |
1054 | |
1055 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, Col
lData *theCollData) | |
1056 : Monkey(), set(theSet), coll(theCollator), collData(theCollData) | |
1057 { | |
1058 // ook. | |
1059 } | |
1060 | |
1061 StringSetMonkey::~StringSetMonkey() | |
1062 { | |
1063 // ook? | |
1064 } | |
1065 | |
1066 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate) | |
1067 { | |
1068 int32_t itemCount = uset_getItemCount(set), len = 0; | |
1069 int32_t index = m_rand() % itemCount; | |
1070 UChar32 rangeStart = 0, rangeEnd = 0; | |
1071 UChar buffer[16]; | |
1072 UErrorCode err = U_ZERO_ERROR; | |
1073 | |
1074 len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err); | |
1075 | |
1076 if (len == 0) { | |
1077 int32_t offset = m_rand() % (rangeEnd - rangeStart + 1); | |
1078 UChar32 ch = rangeStart + offset; | |
1079 UnicodeString str(ch); | |
1080 | |
1081 testCase.append(str); | |
1082 generateAlternative(str, alternate); | |
1083 } else if (len > 0) { | |
1084 // should check that len < 16... | |
1085 UnicodeString str(buffer, len); | |
1086 | |
1087 testCase.append(str); | |
1088 generateAlternative(str, alternate); | |
1089 } else { | |
1090 // shouldn't happen... | |
1091 } | |
1092 } | |
1093 | |
1094 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCas
e, UnicodeString &alternate) | |
1095 { | |
1096 // find out shortest string for the longest sequence of ces. | |
1097 // needs to be refined to use dynamic programming, but will be roughly right | |
1098 UErrorCode status = U_ZERO_ERROR; | |
1099 CEList ceList(coll, testCase, status); | |
1100 UnicodeString alt; | |
1101 int32_t offset = 0; | |
1102 | |
1103 if (ceList.size() == 0) { | |
1104 return alternate.append(testCase); | |
1105 } | |
1106 | |
1107 while (offset < ceList.size()) { | |
1108 int32_t ce = ceList.get(offset); | |
1109 const StringList *strings = collData->getStringList(ce); | |
1110 | |
1111 if (strings == NULL) { | |
1112 return alternate.append(testCase); | |
1113 } | |
1114 | |
1115 int32_t stringCount = strings->size(); | |
1116 int32_t tries = 0; | |
1117 | |
1118 // find random string that generates the same CEList | |
1119 const CEList *ceList2 = NULL; | |
1120 const UnicodeString *string = NULL; | |
1121 UBool matches = FALSE; | |
1122 | |
1123 do { | |
1124 int32_t s = m_rand() % stringCount; | |
1125 | |
1126 if (tries++ > stringCount) { | |
1127 alternate.append(testCase); | |
1128 return alternate; | |
1129 } | |
1130 | |
1131 string = strings->get(s); | |
1132 ceList2 = collData->getCEList(string); | |
1133 matches = ceList.matchesAt(offset, ceList2); | |
1134 | |
1135 if (! matches) { | |
1136 collData->freeCEList((CEList *) ceList2); | |
1137 } | |
1138 } while (! matches); | |
1139 | |
1140 alt.append(*string); | |
1141 offset += ceList2->size(); | |
1142 collData->freeCEList(ceList2); | |
1143 } | |
1144 | |
1145 const CEList altCEs(coll, alt, status); | |
1146 | |
1147 if (ceList.matchesAt(0, &altCEs)) { | |
1148 return alternate.append(alt); | |
1149 } | |
1150 | |
1151 return alternate.append(testCase); | |
1152 } | |
1153 | |
1154 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyC
ount, UnicodeString &testCase, UnicodeString &alternate) | |
1155 { | |
1156 int32_t pieces = (m_rand() % 4) + 1; | |
1157 UErrorCode status = U_ZERO_ERROR; | |
1158 UBool matches; | |
1159 | |
1160 do { | |
1161 testCase.remove(); | |
1162 alternate.remove(); | |
1163 monkeys[0]->append(testCase, alternate); | |
1164 | |
1165 for(int32_t piece = 0; piece < pieces; piece += 1) { | |
1166 int32_t monkey = m_rand() % monkeyCount; | |
1167 | |
1168 monkeys[monkey]->append(testCase, alternate); | |
1169 } | |
1170 | |
1171 const CEList ceTest(coll, testCase, status); | |
1172 const CEList ceAlt(coll, alternate, status); | |
1173 | |
1174 matches = ceTest.matchesAt(0, &ceAlt); | |
1175 } while (! matches); | |
1176 } | |
1177 | |
1178 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t
offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd) | |
1179 { | |
1180 UErrorCode status = U_ZERO_ERROR; | |
1181 OrderList targetOrders(coll, target, offset); | |
1182 OrderList patternOrders(coll, pattern); | |
1183 int32_t targetSize = targetOrders.size() - 1; | |
1184 int32_t patternSize = patternOrders.size() - 1; | |
1185 UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocale
ByType(coll, ULOC_VALID_LOCALE, &status), | |
1186 target.getBuffer(), target.len
gth(), &status); | |
1187 | |
1188 if (patternSize == 0) { | |
1189 // Searching for an empty pattern always fails | |
1190 matchStart = matchEnd = -1; | |
1191 ubrk_close(charBreakIterator); | |
1192 return FALSE; | |
1193 } | |
1194 | |
1195 matchStart = matchEnd = -1; | |
1196 | |
1197 for(int32_t i = 0; i < targetSize; i += 1) { | |
1198 if (targetOrders.matchesAt(i, patternOrders)) { | |
1199 int32_t start = targetOrders.getLowOffset(i); | |
1200 int32_t maxLimit = targetOrders.getLowOffset(i + patternSize); | |
1201 int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1); | |
1202 | |
1203 // if the low and high offsets of the first CE in | |
1204 // the match are the same, it means that the match | |
1205 // starts in the middle of an expansion - all but | |
1206 // the first CE of the expansion will have the offset | |
1207 // of the following character. | |
1208 if (start == targetOrders.getHighOffset(i)) { | |
1209 continue; | |
1210 } | |
1211 | |
1212 // Make sure match starts on a grapheme boundary | |
1213 if (! ubrk_isBoundary(charBreakIterator, start)) { | |
1214 continue; | |
1215 } | |
1216 | |
1217 // If the low and high offsets of the CE after the match | |
1218 // are the same, it means that the match ends in the middle | |
1219 // of an expansion sequence. | |
1220 if (maxLimit == targetOrders.getHighOffset(i + patternSize) && | |
1221 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) { | |
1222 continue; | |
1223 } | |
1224 | |
1225 int32_t mend = maxLimit; | |
1226 | |
1227 // Find the first grapheme break after the character index | |
1228 // of the last CE in the match. If it's after character index | |
1229 // that's after the last CE in the match, use that index | |
1230 // as the end of the match. | |
1231 if (minLimit < maxLimit) { | |
1232 // When the last CE's low index is same with its high index, the
CE is likely | |
1233 // a part of expansion. In this case, the index is located just
after the | |
1234 // character corresponding to the CEs compared above. If the ind
ex is right | |
1235 // at the break boundary, move the position to the next boundary
will result | |
1236 // incorrect match length when there are ignorable characters ex
ist between | |
1237 // the position and the next character produces CE(s). See ticke
t#8482. | |
1238 if (minLimit == targetOrders.getHighOffset(i + patternSize - 1)
&& ubrk_isBoundary(charBreakIterator, minLimit)) { | |
1239 mend = minLimit; | |
1240 } else { | |
1241 int32_t nba = ubrk_following(charBreakIterator, minLimit); | |
1242 | |
1243 if (nba >= targetOrders.getHighOffset(i + patternSize - 1))
{ | |
1244 mend = nba; | |
1245 } | |
1246 } | |
1247 } | |
1248 | |
1249 if (mend > maxLimit) { | |
1250 continue; | |
1251 } | |
1252 | |
1253 if (! ubrk_isBoundary(charBreakIterator, mend)) { | |
1254 continue; | |
1255 } | |
1256 | |
1257 matchStart = start; | |
1258 matchEnd = mend; | |
1259 | |
1260 ubrk_close(charBreakIterator); | |
1261 return TRUE; | |
1262 } | |
1263 } | |
1264 | |
1265 ubrk_close(charBreakIterator); | |
1266 return FALSE; | |
1267 } | |
1268 | |
1269 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
1270 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t d
efaultVal) { | |
1271 int32_t val = defaultVal; | |
1272 | |
1273 name.append(" *= *(-?\\d+)"); | |
1274 | |
1275 UErrorCode status = U_ZERO_ERROR; | |
1276 RegexMatcher m(name, params, 0, status); | |
1277 | |
1278 if (m.find()) { | |
1279 // The param exists. Convert the string to an int. | |
1280 char valString[100]; | |
1281 int32_t paramLength = m.end(1, status) - m.start(1, status); | |
1282 | |
1283 if (paramLength >= (int32_t)(sizeof(valString)-1)) { | |
1284 paramLength = (int32_t)(sizeof(valString)-2); | |
1285 } | |
1286 | |
1287 params.extract(m.start(1, status), paramLength, valString, sizeof(valStr
ing)); | |
1288 val = uprv_strtol(valString, NULL, 10); | |
1289 | |
1290 // Delete this parameter from the params string. | |
1291 m.reset(); | |
1292 params = m.replaceFirst("", status); | |
1293 } | |
1294 | |
1295 //U_ASSERT(U_SUCCESS(status)); | |
1296 if (! U_SUCCESS(status)) { | |
1297 val = defaultVal; | |
1298 } | |
1299 | |
1300 return val; | |
1301 } | |
1302 #endif | |
1303 | |
1304 #if !UCONFIG_NO_COLLATION | |
1305 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCa
se, const UnicodeString &pattern, const UnicodeString &altPattern, | |
1306 const char *name, const char *strength, uint
32_t seed) | |
1307 { | |
1308 UErrorCode status = U_ZERO_ERROR; | |
1309 int32_t actualStart = -1, actualEnd = -1; | |
1310 //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + alt
Pattern.length(); | |
1311 int32_t expectedStart = -1, expectedEnd = -1; | |
1312 int32_t notFoundCount = 0; | |
1313 LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(),
pattern.length(), | |
1314 testCase.getBuffer(),
testCase.length(), | |
1315 coll, | |
1316 NULL, // the brea
k iterator | |
1317 &status)); | |
1318 | |
1319 // **** TODO: find *all* matches, not just first one **** | |
1320 simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd); | |
1321 | |
1322 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); | |
1323 | |
1324 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expe
ctedEnd)) { | |
1325 errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d,
%d]\n" | |
1326 " strength=%s seed=%d", | |
1327 name, expectedStart, expectedEnd, actualStart, actualEnd, strength
, seed); | |
1328 } | |
1329 | |
1330 if (expectedStart == -1 && actualStart == -1) { | |
1331 notFoundCount += 1; | |
1332 } | |
1333 | |
1334 // **** TODO: find *all* matches, not just first one **** | |
1335 simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd); | |
1336 | |
1337 usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length
(), &status); | |
1338 | |
1339 usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status); | |
1340 | |
1341 if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expe
ctedEnd)) { | |
1342 errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [
%d, %d]\n" | |
1343 " strength=%s seed=%d", | |
1344 name, expectedStart, expectedEnd, actualStart, actualEnd, strength
, seed); | |
1345 } | |
1346 | |
1347 if (expectedStart == -1 && actualStart == -1) { | |
1348 notFoundCount += 1; | |
1349 } | |
1350 | |
1351 return notFoundCount; | |
1352 } | |
1353 #endif | |
1354 | |
1355 void SSearchTest::monkeyTest(char *params) | |
1356 { | |
1357 // ook! | |
1358 UErrorCode status = U_ZERO_ERROR; | |
1359 //UCollator *coll = ucol_open(NULL, &status); | |
1360 UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status); | |
1361 | |
1362 if (U_FAILURE(status)) { | |
1363 errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_er
rorName(status)); | |
1364 return; | |
1365 } | |
1366 | |
1367 CollData *monkeyData = new CollData(coll, status); | |
1368 | |
1369 USet *expansions = uset_openEmpty(); | |
1370 USet *contractions = uset_openEmpty(); | |
1371 | |
1372 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &st
atus); | |
1373 | |
1374 U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39)
; | |
1375 U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39)
; | |
1376 USet *letters = uset_openPattern(letter_pattern, 39, &status); | |
1377 SetMonkey letterMonkey(letters); | |
1378 StringSetMonkey contractionMonkey(contractions, coll, monkeyData); | |
1379 StringSetMonkey expansionMonkey(expansions, coll, monkeyData); | |
1380 UnicodeString testCase; | |
1381 UnicodeString alternate; | |
1382 UnicodeString pattern, altPattern; | |
1383 UnicodeString prefix, altPrefix; | |
1384 UnicodeString suffix, altSuffix; | |
1385 | |
1386 Monkey *monkeys[] = { | |
1387 &letterMonkey, | |
1388 &contractionMonkey, | |
1389 &expansionMonkey, | |
1390 &contractionMonkey, | |
1391 &expansionMonkey, | |
1392 &contractionMonkey, | |
1393 &expansionMonkey, | |
1394 &contractionMonkey, | |
1395 &expansionMonkey}; | |
1396 int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]); | |
1397 // int32_t nonMatchCount = 0; | |
1398 | |
1399 UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIAR
Y}; | |
1400 const char *strengthNames[] = {"primary", "secondary", "tertiary"}; | |
1401 int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]); | |
1402 int32_t loopCount = quick? 1000 : 10000; | |
1403 int32_t firstStrength = 0; | |
1404 int32_t lastStrength = strengthCount - 1; //*/ 0; | |
1405 | |
1406 if (params != NULL) { | |
1407 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
1408 UnicodeString p(params); | |
1409 | |
1410 loopCount = getIntParam("loop", p, loopCount); | |
1411 m_seed = getIntParam("seed", p, m_seed); | |
1412 | |
1413 RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, st
atus); | |
1414 if (m.find()) { | |
1415 UnicodeString breakType = m.group(1, status); | |
1416 | |
1417 for (int32_t s = 0; s < strengthCount; s += 1) { | |
1418 if (breakType == strengthNames[s]) { | |
1419 firstStrength = lastStrength = s; | |
1420 break; | |
1421 } | |
1422 } | |
1423 | |
1424 m.reset(); | |
1425 p = m.replaceFirst("", status); | |
1426 } | |
1427 | |
1428 if (RegexMatcher("\\S", p, 0, status).find()) { | |
1429 // Each option is stripped out of the option string as it is process
ed. | |
1430 // All options have been checked. The option string should have bee
n completely emptied.. | |
1431 char buf[100]; | |
1432 p.extract(buf, sizeof(buf), NULL, status); | |
1433 buf[sizeof(buf)-1] = 0; | |
1434 errln("Unrecognized or extra parameter: %s\n", buf); | |
1435 return; | |
1436 } | |
1437 #else | |
1438 infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring
parameters."); | |
1439 #endif | |
1440 } | |
1441 | |
1442 for(int32_t s = firstStrength; s <= lastStrength; s += 1) { | |
1443 int32_t notFoundCount = 0; | |
1444 | |
1445 logln("Setting strength to %s.", strengthNames[s]); | |
1446 ucol_setStrength(coll, strengths[s]); | |
1447 | |
1448 // TODO: try alternate prefix and suffix too? | |
1449 // TODO: alternates are only equal at primary strength. Is this OK? | |
1450 for(int32_t t = 0; t < loopCount; t += 1) { | |
1451 uint32_t seed = m_seed; | |
1452 // int32_t nmc = 0; | |
1453 | |
1454 generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern); | |
1455 generateTestCase(coll, monkeys, monkeyCount, prefix, altPrefix); | |
1456 generateTestCase(coll, monkeys, monkeyCount, suffix, altSuffix); | |
1457 | |
1458 // pattern | |
1459 notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern,
"pattern", strengthNames[s], seed); | |
1460 | |
1461 testCase.remove(); | |
1462 testCase.append(prefix); | |
1463 testCase.append(/*alt*/pattern); | |
1464 | |
1465 // prefix + pattern | |
1466 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern,
"prefix + pattern", strengthNames[s], seed); | |
1467 | |
1468 testCase.append(suffix); | |
1469 | |
1470 // prefix + pattern + suffix | |
1471 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern,
"prefix + pattern + suffix", strengthNames[s], seed); | |
1472 | |
1473 testCase.remove(); | |
1474 testCase.append(pattern); | |
1475 testCase.append(suffix); | |
1476 | |
1477 // pattern + suffix | |
1478 notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern,
"pattern + suffix", strengthNames[s], seed); | |
1479 } | |
1480 | |
1481 logln("For strength %s the not found count is %d.", strengthNames[s], not
FoundCount); | |
1482 } | |
1483 | |
1484 uset_close(contractions); | |
1485 uset_close(expansions); | |
1486 uset_close(letters); | |
1487 delete monkeyData; | |
1488 | |
1489 ucol_close(coll); | |
1490 } | |
1491 | |
1492 #endif | |
1493 | |
1494 #endif | |
OLD | NEW |