OLD | NEW |
1 /******************************************************************** | 1 /******************************************************************** |
2 * COPYRIGHT: | 2 * COPYRIGHT: |
3 * Copyright (c) 2002-2014, International Business Machines Corporation and | 3 * Copyright (c) 2002-2015, International Business Machines Corporation and |
4 * others. All Rights Reserved. | 4 * others. All Rights Reserved. |
5 ********************************************************************/ | 5 ********************************************************************/ |
6 | 6 |
7 // | 7 // |
8 // regextst.cpp | 8 // regextst.cpp |
9 // | 9 // |
10 // ICU Regular Expressions test, part of intltest. | 10 // ICU Regular Expressions test, part of intltest. |
11 // | 11 // |
12 | 12 |
13 /* | 13 /* |
14 NOTE!! | 14 NOTE!! |
15 | 15 |
16 PLEASE be careful about ASCII assumptions in this test. | 16 PLEASE be careful about ASCII assumptions in this test. |
17 This test is one of the worst repeat offenders. | 17 This test is one of the worst repeat offenders. |
18 If you have questions, contact someone on the ICU PMC | 18 If you have questions, contact someone on the ICU PMC |
19 who has access to an EBCDIC system. | 19 who has access to an EBCDIC system. |
20 | 20 |
21 */ | 21 */ |
22 | 22 |
23 #include "intltest.h" | 23 #include "intltest.h" |
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | 24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
25 | 25 |
| 26 #include <stdlib.h> |
| 27 #include <stdio.h> |
| 28 #include <string.h> |
| 29 |
26 #include "unicode/localpointer.h" | 30 #include "unicode/localpointer.h" |
27 #include "unicode/regex.h" | 31 #include "unicode/regex.h" |
28 #include "unicode/uchar.h" | 32 #include "unicode/uchar.h" |
29 #include "unicode/ucnv.h" | 33 #include "unicode/ucnv.h" |
30 #include "unicode/uniset.h" | 34 #include "unicode/uniset.h" |
31 #include "unicode/uregex.h" | 35 #include "unicode/uregex.h" |
32 #include "unicode/usetiter.h" | 36 #include "unicode/usetiter.h" |
33 #include "unicode/ustring.h" | 37 #include "unicode/ustring.h" |
| 38 #include "unicode/utext.h" |
| 39 |
34 #include "regextst.h" | 40 #include "regextst.h" |
35 #include "regexcmp.h" | 41 #include "regexcmp.h" |
36 #include "uvector.h" | 42 #include "uvector.h" |
37 #include "util.h" | 43 #include "util.h" |
38 #include <stdlib.h> | 44 #include "cmemory.h" |
39 #include <string.h> | |
40 #include <stdio.h> | |
41 #include "cstring.h" | 45 #include "cstring.h" |
42 #include "uinvchar.h" | 46 #include "uinvchar.h" |
43 | 47 |
44 #define SUPPORT_MUTATING_INPUT_STRING 0 | 48 #define SUPPORT_MUTATING_INPUT_STRING 0 |
45 | 49 |
46 //--------------------------------------------------------------------------- | 50 //--------------------------------------------------------------------------- |
47 // | 51 // |
48 // Test class boilerplate | 52 // Test class boilerplate |
49 // | 53 // |
50 //--------------------------------------------------------------------------- | 54 //--------------------------------------------------------------------------- |
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
140 break; | 144 break; |
141 case 23: name = "TestCaseInsensitiveStarters"; | 145 case 23: name = "TestCaseInsensitiveStarters"; |
142 if (exec) TestCaseInsensitiveStarters(); | 146 if (exec) TestCaseInsensitiveStarters(); |
143 break; | 147 break; |
144 case 24: name = "TestBug11049"; | 148 case 24: name = "TestBug11049"; |
145 if (exec) TestBug11049(); | 149 if (exec) TestBug11049(); |
146 break; | 150 break; |
147 case 25: name = "TestBug11371"; | 151 case 25: name = "TestBug11371"; |
148 if (exec) TestBug11371(); | 152 if (exec) TestBug11371(); |
149 break; | 153 break; |
| 154 case 26: name = "TestBug11480"; |
| 155 if (exec) TestBug11480(); |
| 156 break; |
| 157 case 27: name = "NamedCapture"; |
| 158 if (exec) NamedCapture(); |
| 159 break; |
| 160 case 28: name = "NamedCaptureLimits"; |
| 161 if (exec) NamedCaptureLimits(); |
| 162 break; |
150 default: name = ""; | 163 default: name = ""; |
151 break; //needed to end loop | 164 break; //needed to end loop |
152 } | 165 } |
153 } | 166 } |
154 | 167 |
155 | 168 |
156 | 169 |
157 /** | 170 /** |
158 * Calls utext_openUTF8 after, potentially, converting invariant text from the c
ompilation codepage | 171 * Calls utext_openUTF8 after, potentially, converting invariant text from the c
ompilation codepage |
159 * into ASCII. | 172 * into ASCII. |
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
232 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr)
;\ | 245 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr)
;\ |
233 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=
%s, got %s", \ | 246 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=
%s, got %s", \ |
234 __LINE__, u_errorName(errcode), u_errorName(status));};} | 247 __LINE__, u_errorName(errcode), u_errorName(status));};} |
235 | 248 |
236 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \ | 249 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \ |
237 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), stat
us); }} | 250 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), stat
us); }} |
238 | 251 |
239 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ | 252 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ |
240 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} | 253 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} |
241 | 254 |
242 #define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTes
t failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToA
ssertBuf(ustr),inv);};} | 255 // expected: const char * , restricted to invariant characters. |
| 256 // actual: const UnicodeString & |
| 257 #define REGEX_ASSERT_UNISTR(expected, actual) { \ |
| 258 if (UnicodeString(expected, -1, US_INV) != (actual)) { \ |
| 259 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",
\ |
| 260 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};} |
243 | 261 |
244 | 262 |
245 static UBool testUTextEqual(UText *uta, UText *utb) { | 263 static UBool testUTextEqual(UText *uta, UText *utb) { |
246 UChar32 ca = 0; | 264 UChar32 ca = 0; |
247 UChar32 cb = 0; | 265 UChar32 cb = 0; |
248 utext_setNativeIndex(uta, 0); | 266 utext_setNativeIndex(uta, 0); |
249 utext_setNativeIndex(utb, 0); | 267 utext_setNativeIndex(utb, 0); |
250 do { | 268 do { |
251 ca = utext_next32(uta); | 269 ca = utext_next32(uta); |
252 cb = utext_next32(utb); | 270 cb = utext_next32(utb); |
(...skipping 1163 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1416 REGEX_CHECK_STATUS; | 1434 REGEX_CHECK_STATUS; |
1417 dest = matcher2->replaceFirst("$1$1", status); | 1435 dest = matcher2->replaceFirst("$1$1", status); |
1418 REGEX_CHECK_STATUS; | 1436 REGEX_CHECK_STATUS; |
1419 REGEX_ASSERT(dest == "bcbcdefg"); | 1437 REGEX_ASSERT(dest == "bcbcdefg"); |
1420 | 1438 |
1421 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1
."), status); | 1439 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1
."), status); |
1422 REGEX_CHECK_STATUS; | 1440 REGEX_CHECK_STATUS; |
1423 REGEX_ASSERT(dest == "The value of $1 is bc.defg"); | 1441 REGEX_ASSERT(dest == "The value of $1 is bc.defg"); |
1424 | 1442 |
1425 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); | 1443 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); |
1426 REGEX_CHECK_STATUS; | 1444 REGEX_ASSERT(U_FAILURE(status)); |
1427 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg"); | 1445 status = U_ZERO_ERROR; |
1428 | 1446 |
1429 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U
0001D7CF."); | 1447 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U
0001D7CF."); |
1430 replacement = replacement.unescape(); | 1448 replacement = replacement.unescape(); |
1431 dest = matcher2->replaceFirst(replacement, status); | 1449 dest = matcher2->replaceFirst(replacement, status); |
1432 REGEX_CHECK_STATUS; | 1450 REGEX_CHECK_STATUS; |
1433 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); | 1451 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); |
1434 | 1452 |
1435 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",st
atus), U_INDEX_OUTOFBOUNDS_ERROR); | 1453 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",st
atus), U_INDEX_OUTOFBOUNDS_ERROR); |
1436 | 1454 |
1437 | 1455 |
(...skipping 605 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2043 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); | 2061 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); |
2044 utext_close(result); | 2062 utext_close(result); |
2045 result = matcher->group(0, &destText, group_len, status); | 2063 result = matcher->group(0, &destText, group_len, status); |
2046 REGEX_CHECK_STATUS; | 2064 REGEX_CHECK_STATUS; |
2047 REGEX_ASSERT(result == &destText); | 2065 REGEX_ASSERT(result == &destText); |
2048 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); | 2066 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); |
2049 // destText is now immutable, reopen it | 2067 // destText is now immutable, reopen it |
2050 utext_close(&destText); | 2068 utext_close(&destText); |
2051 utext_openUnicodeString(&destText, &dest, &status); | 2069 utext_openUnicodeString(&destText, &dest, &status); |
2052 | 2070 |
2053 result = matcher->group(0, NULL, status); | 2071 int64_t length; |
| 2072 result = matcher->group(0, NULL, length, status); |
2054 REGEX_CHECK_STATUS; | 2073 REGEX_CHECK_STATUS; |
2055 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); | 2074 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); |
2056 utext_close(result); | 2075 utext_close(result); |
2057 result = matcher->group(0, &destText, status); | 2076 result = matcher->group(0, &destText, length, status); |
2058 REGEX_CHECK_STATUS; | 2077 REGEX_CHECK_STATUS; |
2059 REGEX_ASSERT(result == &destText); | 2078 REGEX_ASSERT(result == &destText); |
2060 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); | 2079 REGEX_ASSERT(utext_getNativeIndex(result) == 0); |
| 2080 REGEX_ASSERT(length == 10); |
| 2081 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
2061 | 2082 |
2062 result = matcher->group(1, NULL, status); | 2083 // Capture Group 1 == "234567" |
| 2084 result = matcher->group(1, NULL, length, status); |
2063 REGEX_CHECK_STATUS; | 2085 REGEX_CHECK_STATUS; |
2064 const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 };
/* 234567 */ | 2086 REGEX_ASSERT(utext_getNativeIndex(result) == 2); |
2065 REGEX_ASSERT_UTEXT_UTF8(str_234567, result); | 2087 REGEX_ASSERT(length == 6); |
| 2088 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
2066 utext_close(result); | 2089 utext_close(result); |
2067 result = matcher->group(1, &destText, status); | 2090 |
| 2091 result = matcher->group(1, &destText, length, status); |
2068 REGEX_CHECK_STATUS; | 2092 REGEX_CHECK_STATUS; |
2069 REGEX_ASSERT(result == &destText); | 2093 REGEX_ASSERT(result == &destText); |
2070 REGEX_ASSERT_UTEXT_UTF8(str_234567, result); | 2094 REGEX_ASSERT(utext_getNativeIndex(result) == 2); |
| 2095 REGEX_ASSERT(length == 6); |
| 2096 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
| 2097 utext_close(result); |
2071 | 2098 |
2072 result = matcher->group(2, NULL, status); | 2099 // Capture Group 2 == "45" |
| 2100 result = matcher->group(2, NULL, length, status); |
2073 REGEX_CHECK_STATUS; | 2101 REGEX_CHECK_STATUS; |
2074 const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */ | 2102 REGEX_ASSERT(utext_getNativeIndex(result) == 4); |
2075 REGEX_ASSERT_UTEXT_UTF8(str_45, result); | 2103 REGEX_ASSERT(length == 2); |
| 2104 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
2076 utext_close(result); | 2105 utext_close(result); |
2077 result = matcher->group(2, &destText, status); | 2106 |
| 2107 result = matcher->group(2, &destText, length, status); |
2078 REGEX_CHECK_STATUS; | 2108 REGEX_CHECK_STATUS; |
2079 REGEX_ASSERT(result == &destText); | 2109 REGEX_ASSERT(result == &destText); |
2080 REGEX_ASSERT_UTEXT_UTF8(str_45, result); | 2110 REGEX_ASSERT(utext_getNativeIndex(result) == 4); |
| 2111 REGEX_ASSERT(length == 2); |
| 2112 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
| 2113 utext_close(result); |
2081 | 2114 |
2082 result = matcher->group(3, NULL, status); | 2115 // Capture Group 3 == "89" |
| 2116 result = matcher->group(3, NULL, length, status); |
2083 REGEX_CHECK_STATUS; | 2117 REGEX_CHECK_STATUS; |
2084 const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */ | 2118 REGEX_ASSERT(utext_getNativeIndex(result) == 8); |
2085 REGEX_ASSERT_UTEXT_UTF8(str_89, result); | 2119 REGEX_ASSERT(length == 2); |
| 2120 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
2086 utext_close(result); | 2121 utext_close(result); |
2087 result = matcher->group(3, &destText, status); | 2122 |
| 2123 result = matcher->group(3, &destText, length, status); |
2088 REGEX_CHECK_STATUS; | 2124 REGEX_CHECK_STATUS; |
2089 REGEX_ASSERT(result == &destText); | 2125 REGEX_ASSERT(result == &destText); |
2090 REGEX_ASSERT_UTEXT_UTF8(str_89, result); | 2126 REGEX_ASSERT(utext_getNativeIndex(result) == 8); |
| 2127 REGEX_ASSERT(length == 2); |
| 2128 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); |
| 2129 utext_close(result); |
2091 | 2130 |
| 2131 // Capture Group number out of range. |
| 2132 status = U_ZERO_ERROR; |
2092 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR)
; | 2133 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR)
; |
| 2134 status = U_ZERO_ERROR; |
2093 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR)
; | 2135 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR)
; |
| 2136 status = U_ZERO_ERROR; |
2094 matcher->reset(); | 2137 matcher->reset(); |
2095 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); | 2138 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); |
2096 | 2139 |
2097 delete matcher; | 2140 delete matcher; |
2098 delete pat; | 2141 delete pat; |
2099 | 2142 |
2100 utext_close(&destText); | 2143 utext_close(&destText); |
2101 utext_close(&input); | 2144 utext_close(&input); |
2102 utext_close(&re); | 2145 utext_close(&re); |
2103 } | 2146 } |
(...skipping 491 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2595 REGEX_CHECK_STATUS; | 2638 REGEX_CHECK_STATUS; |
2596 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61,
0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0
x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg *
/ | 2639 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61,
0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0
x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg *
/ |
2597 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); | 2640 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); |
2598 utext_close(result); | 2641 utext_close(result); |
2599 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status)
; | 2642 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status)
; |
2600 result = matcher2->replaceFirst(&replText, &destText, status); | 2643 result = matcher2->replaceFirst(&replText, &destText, status); |
2601 REGEX_CHECK_STATUS; | 2644 REGEX_CHECK_STATUS; |
2602 REGEX_ASSERT(result == &destText); | 2645 REGEX_ASSERT(result == &destText); |
2603 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); | 2646 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); |
2604 | 2647 |
2605 const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x6
9, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f,
0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0
x00 }; /* $ by itself, no group number $$$ */ | 2648 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x2
0, 0x69, 0x74, 0x73, 0x65, 0x6c, |
| 2649 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70,
0x20, 0x6e, 0x75, 0x6d, 0x62, |
| 2650 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /*
\$ by itself, no group number \$\$\$ */ |
2606 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status); | 2651 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status); |
2607 result = matcher2->replaceFirst(&replText, NULL, status); | 2652 result = matcher2->replaceFirst(&replText, NULL, status); |
2608 REGEX_CHECK_STATUS; | 2653 REGEX_CHECK_STATUS; |
2609 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20,
0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0
x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x2
4, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */ | 2654 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20,
0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0
x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x2
4, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */ |
2610 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); | 2655 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); |
2611 utext_close(result); | 2656 utext_close(result); |
2612 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status)
; | 2657 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status)
; |
2613 result = matcher2->replaceFirst(&replText, &destText, status); | 2658 result = matcher2->replaceFirst(&replText, &destText, status); |
2614 REGEX_CHECK_STATUS; | 2659 REGEX_CHECK_STATUS; |
2615 REGEX_ASSERT(result == &destText); | 2660 REGEX_ASSERT(result == &destText); |
(...skipping 446 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3062 REGEX_ASSERT(n==5); | 3107 REGEX_ASSERT(n==5); |
3063 REGEX_ASSERT(fields[0]=="1"); | 3108 REGEX_ASSERT(fields[0]=="1"); |
3064 REGEX_ASSERT(fields[1]=="-"); | 3109 REGEX_ASSERT(fields[1]=="-"); |
3065 REGEX_ASSERT(fields[2]=="10"); | 3110 REGEX_ASSERT(fields[2]=="10"); |
3066 REGEX_ASSERT(fields[3]==","); | 3111 REGEX_ASSERT(fields[3]==","); |
3067 REGEX_ASSERT(fields[4]=="20"); | 3112 REGEX_ASSERT(fields[4]=="20"); |
3068 delete pat1; | 3113 delete pat1; |
3069 | 3114 |
3070 | 3115 |
3071 // | 3116 // |
| 3117 // split of a UText based string, with library allocating output UTexts. |
| 3118 // |
| 3119 { |
| 3120 status = U_ZERO_ERROR; |
| 3121 RegexMatcher matcher(UnicodeString("(:)"), 0, status); |
| 3122 UnicodeString stringToSplit("first:second:third"); |
| 3123 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &stat
us); |
| 3124 REGEX_CHECK_STATUS; |
| 3125 |
| 3126 UText *splits[10] = {NULL}; |
| 3127 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(spl
its), status); |
| 3128 REGEX_CHECK_STATUS; |
| 3129 REGEX_ASSERT(numFields == 5); |
| 3130 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]); |
| 3131 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]); |
| 3132 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]); |
| 3133 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]); |
| 3134 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]); |
| 3135 REGEX_ASSERT(splits[5] == NULL); |
| 3136 |
| 3137 for (int i=0; i<UPRV_LENGTHOF(splits); i++) { |
| 3138 if (splits[i]) { |
| 3139 utext_close(splits[i]); |
| 3140 splits[i] = NULL; |
| 3141 } |
| 3142 } |
| 3143 utext_close(textToSplit); |
| 3144 } |
| 3145 |
| 3146 |
| 3147 // |
3072 // RegexPattern::pattern() and patternText() | 3148 // RegexPattern::pattern() and patternText() |
3073 // | 3149 // |
3074 pat1 = new RegexPattern(); | 3150 pat1 = new RegexPattern(); |
3075 REGEX_ASSERT(pat1->pattern() == ""); | 3151 REGEX_ASSERT(pat1->pattern() == ""); |
3076 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status)); | 3152 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status)); |
3077 delete pat1; | 3153 delete pat1; |
3078 const char *helloWorldInvariant = "(Hello, world)*"; | 3154 const char *helloWorldInvariant = "(Hello, world)*"; |
3079 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status); | 3155 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status); |
3080 pat1 = RegexPattern::compile(&re1, pe, status); | 3156 pat1 = RegexPattern::compile(&re1, pe, status); |
3081 REGEX_CHECK_STATUS; | 3157 REGEX_CHECK_STATUS; |
3082 REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*"); | 3158 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern()); |
3083 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); | 3159 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); |
3084 delete pat1; | 3160 delete pat1; |
3085 | 3161 |
3086 utext_close(&re1); | 3162 utext_close(&re1); |
3087 } | 3163 } |
3088 | 3164 |
3089 | 3165 |
3090 //--------------------------------------------------------------------------- | 3166 //--------------------------------------------------------------------------- |
3091 // | 3167 // |
3092 // Extended A more thorough check for features of regex patterns | 3168 // Extended A more thorough check for features of regex patterns |
(...skipping 684 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3777 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); | 3853 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); |
3778 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); | 3854 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); |
3779 | 3855 |
3780 // Mal-formed {min,max} quantifiers | 3856 // Mal-formed {min,max} quantifiers |
3781 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL); | 3857 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL); |
3782 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN); | 3858 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN); |
3783 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL); | 3859 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL); |
3784 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); | 3860 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); |
3785 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); | 3861 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); |
3786 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); | 3862 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); |
3787 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Ov
erflows int during scan | 3863 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Ov
erflows int during scan |
3788 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Ov
erflows regex binary format | 3864 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Ov
erflows regex binary format |
3789 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); | 3865 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); |
3790 | 3866 |
3791 // Ticket 5389 | 3867 // Ticket 5389 |
3792 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); | 3868 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); |
3793 | 3869 |
3794 // Invalid Back Reference \0 | 3870 // Invalid Back Reference \0 |
3795 // For ICU 3.8 and earlier | 3871 // For ICU 3.8 and earlier |
3796 // For ICU versions newer than 3.8, \0 introduces an octal escape. | 3872 // For ICU versions newer than 3.8, \0 introduces an octal escape. |
3797 // | 3873 // |
(...skipping 1001 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4799 REGEX_ASSERT(cbInfo.numCalls > 0); | 4875 REGEX_ASSERT(cbInfo.numCalls > 0); |
4800 | 4876 |
4801 // A longer running match that the callback function will abort. | 4877 // A longer running match that the callback function will abort. |
4802 status = U_ZERO_ERROR; | 4878 status = U_ZERO_ERROR; |
4803 cbInfo.reset(4); | 4879 cbInfo.reset(4); |
4804 s = "aaaaaaaaaaaaaaaaaaaaaaab"; | 4880 s = "aaaaaaaaaaaaaaaaaaaaaaab"; |
4805 matcher.reset(s); | 4881 matcher.reset(s); |
4806 REGEX_ASSERT(matcher.matches(status)==FALSE); | 4882 REGEX_ASSERT(matcher.matches(status)==FALSE); |
4807 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); | 4883 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); |
4808 REGEX_ASSERT(cbInfo.numCalls == 4); | 4884 REGEX_ASSERT(cbInfo.numCalls == 4); |
| 4885 |
| 4886 // A longer running find that the callback function will abort. |
| 4887 status = U_ZERO_ERROR; |
| 4888 cbInfo.reset(4); |
| 4889 s = "aaaaaaaaaaaaaaaaaaaaaaab"; |
| 4890 matcher.reset(s); |
| 4891 REGEX_ASSERT(matcher.find(status)==FALSE); |
| 4892 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); |
| 4893 REGEX_ASSERT(cbInfo.numCalls == 4); |
4809 } | 4894 } |
4810 | 4895 |
4811 | 4896 |
4812 } | 4897 } |
4813 | 4898 |
4814 | 4899 |
4815 // | 4900 // |
4816 // FindProgressCallbacks() Test the find "progress" callback function. | 4901 // FindProgressCallbacks() Test the find "progress" callback function. |
4817 // When set, the find progress callback will be invoked during
a find operation | 4902 // When set, the find progress callback will be invoked during
a find operation |
4818 // after each return from a match attempt, giving the applicati
on the opportunity | 4903 // after each return from a match attempt, giving the applicati
on the opportunity |
(...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4988 utext_close(&text2); | 5073 utext_close(&text2); |
4989 } | 5074 } |
4990 | 5075 |
4991 /* | 5076 /* |
4992 * group() | 5077 * group() |
4993 */ | 5078 */ |
4994 { | 5079 { |
4995 UChar text1[80]; | 5080 UChar text1[80]; |
4996 UText *actual; | 5081 UText *actual; |
4997 UBool result; | 5082 UBool result; |
4998 u_uastrncpy(text1, "noise abc interior def, and this is off the end", s
izeof(text1)/2); | 5083 int64_t length = 0; |
| 5084 |
| 5085 u_uastrncpy(text1, "noise abc interior def, and this is off the end", U
PRV_LENGTHOF(text1)); |
| 5086 // 012345678901234567890123456789012345678901234567 |
| 5087 // 0 1 2 3 4 |
4999 | 5088 |
5000 status = U_ZERO_ERROR; | 5089 status = U_ZERO_ERROR; |
5001 re = uregex_openC("abc(.*?)def", 0, NULL, &status); | 5090 re = uregex_openC("abc(.*?)def", 0, NULL, &status); |
5002 REGEX_CHECK_STATUS; | 5091 REGEX_CHECK_STATUS; |
5003 | 5092 |
5004 uregex_setText(re, text1, -1, &status); | 5093 uregex_setText(re, text1, -1, &status); |
5005 result = uregex_find(re, 0, &status); | 5094 result = uregex_find(re, 0, &status); |
5006 REGEX_ASSERT(result==TRUE); | 5095 REGEX_ASSERT(result==TRUE); |
5007 | 5096 |
5008 /* Capture Group 0, the full match. Should succeed. */ | 5097 /* Capture Group 0, the full match. Should succeed. "abc interior def"
*/ |
5009 status = U_ZERO_ERROR; | 5098 status = U_ZERO_ERROR; |
5010 actual = uregex_groupUTextDeep(re, 0, &bufferText, &status); | 5099 actual = uregex_groupUText(re, 0, &bufferText, &length, &status); |
5011 REGEX_CHECK_STATUS; | 5100 REGEX_CHECK_STATUS; |
5012 REGEX_ASSERT(actual == &bufferText); | 5101 REGEX_ASSERT(actual == &bufferText); |
5013 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual); | 5102 REGEX_ASSERT(utext_getNativeIndex(actual) == 6); |
| 5103 REGEX_ASSERT(length == 16); |
| 5104 REGEX_ASSERT(utext_nativeLength(actual) == 47); |
5014 | 5105 |
5015 /* Capture group #1. Should succeed. */ | 5106 /* Capture group #1. Should succeed, matching " interior ". */ |
5016 status = U_ZERO_ERROR; | 5107 status = U_ZERO_ERROR; |
5017 actual = uregex_groupUTextDeep(re, 1, &bufferText, &status); | 5108 actual = uregex_groupUText(re, 1, &bufferText, &length, &status); |
5018 REGEX_CHECK_STATUS; | 5109 REGEX_CHECK_STATUS; |
5019 REGEX_ASSERT(actual == &bufferText); | 5110 REGEX_ASSERT(actual == &bufferText); |
5020 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual); | 5111 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " inte
rior " |
| 5112 REGEX_ASSERT(length == 10); |
| 5113 REGEX_ASSERT(utext_nativeLength(actual) == 47); |
5021 | 5114 |
5022 /* Capture group out of range. Error. */ | 5115 /* Capture group out of range. Error. */ |
5023 status = U_ZERO_ERROR; | 5116 status = U_ZERO_ERROR; |
5024 actual = uregex_groupUTextDeep(re, 2, &bufferText, &status); | 5117 actual = uregex_groupUText(re, 2, &bufferText, &length, &status); |
5025 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); | 5118 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
5026 REGEX_ASSERT(actual == &bufferText); | 5119 REGEX_ASSERT(actual == &bufferText); |
5027 | |
5028 uregex_close(re); | 5120 uregex_close(re); |
5029 | 5121 |
5030 } | 5122 } |
5031 | 5123 |
5032 /* | 5124 /* |
5033 * replaceFirst() | 5125 * replaceFirst() |
5034 */ | 5126 */ |
5035 { | 5127 { |
5036 UChar text1[80]; | 5128 UChar text1[80]; |
5037 UChar text2[80]; | 5129 UChar text2[80]; |
5038 UText replText = UTEXT_INITIALIZER; | 5130 UText replText = UTEXT_INITIALIZER; |
5039 UText *result; | 5131 UText *result; |
| 5132 status = U_ZERO_ERROR; |
| 5133 utext_openUnicodeString(&bufferText, &buffer, &status); |
5040 | 5134 |
5041 status = U_ZERO_ERROR; | 5135 status = U_ZERO_ERROR; |
5042 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); | 5136 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1)); |
5043 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); | 5137 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)); |
5044 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); | 5138 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); |
5045 | 5139 |
5046 re = uregex_openC("x(.*?)x", 0, NULL, &status); | 5140 re = uregex_openC("x(.*?)x", 0, NULL, &status); |
5047 REGEX_CHECK_STATUS; | 5141 REGEX_CHECK_STATUS; |
5048 | 5142 |
5049 /* Normal case, with match */ | 5143 /* Normal case, with match */ |
5050 uregex_setText(re, text1, -1, &status); | 5144 uregex_setText(re, text1, -1, &status); |
| 5145 REGEX_CHECK_STATUS; |
5051 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0,
&status); | 5146 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0,
&status); |
| 5147 REGEX_CHECK_STATUS; |
5052 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); | 5148 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); |
5053 REGEX_CHECK_STATUS; | 5149 REGEX_CHECK_STATUS; |
5054 REGEX_ASSERT(result == &bufferText); | 5150 REGEX_ASSERT(result == &bufferText); |
5055 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result); | 5151 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result); |
5056 | 5152 |
5057 /* No match. Text should copy to output with no changes. */ | 5153 /* No match. Text should copy to output with no changes. */ |
5058 uregex_setText(re, text2, -1, &status); | 5154 uregex_setText(re, text2, -1, &status); |
5059 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0,
&status); | 5155 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0,
&status); |
5060 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); | 5156 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); |
5061 REGEX_CHECK_STATUS; | 5157 REGEX_CHECK_STATUS; |
5062 REGEX_ASSERT(result == &bufferText); | 5158 REGEX_ASSERT(result == &bufferText); |
5063 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); | 5159 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); |
5064 | 5160 |
5065 /* Unicode escapes */ | 5161 /* Unicode escapes */ |
5066 uregex_setText(re, text1, -1, &status); | 5162 uregex_setText(re, text1, -1, &status); |
5067 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a"
, -1, &status); | 5163 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\
a", -1, &status); |
5068 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0,
&status); | 5164 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0,
&status); |
5069 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); | 5165 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); |
5070 REGEX_CHECK_STATUS; | 5166 REGEX_CHECK_STATUS; |
5071 REGEX_ASSERT(result == &bufferText); | 5167 REGEX_ASSERT(result == &bufferText); |
5072 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result); | 5168 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result); |
5073 | 5169 |
5074 uregex_close(re); | 5170 uregex_close(re); |
5075 utext_close(&replText); | 5171 utext_close(&replText); |
5076 } | 5172 } |
5077 | 5173 |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5116 | 5212 |
5117 /* | 5213 /* |
5118 * splitUText() uses the C++ API directly, and the UnicodeString version us
es mutable UTexts, | 5214 * splitUText() uses the C++ API directly, and the UnicodeString version us
es mutable UTexts, |
5119 * so we don't need to test it here. | 5215 * so we don't need to test it here. |
5120 */ | 5216 */ |
5121 | 5217 |
5122 utext_close(&bufferText); | 5218 utext_close(&bufferText); |
5123 utext_close(&patternText); | 5219 utext_close(&patternText); |
5124 } | 5220 } |
5125 | 5221 |
| 5222 |
| 5223 //-------------------------------------------------------------- |
| 5224 // |
| 5225 // NamedCapture Check basic named capture group functionality |
| 5226 // |
| 5227 //-------------------------------------------------------------- |
| 5228 void RegexTest::NamedCapture() { |
| 5229 UErrorCode status = U_ZERO_ERROR; |
| 5230 RegexPattern *pat = RegexPattern::compile(UnicodeString( |
| 5231 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, stat
us); |
| 5232 REGEX_CHECK_STATUS; |
| 5233 int32_t group = pat->groupNumberFromName("five", -1, status); |
| 5234 REGEX_CHECK_STATUS; |
| 5235 REGEX_ASSERT(5 == group); |
| 5236 group = pat->groupNumberFromName("three", -1, status); |
| 5237 REGEX_CHECK_STATUS; |
| 5238 REGEX_ASSERT(3 == group); |
| 5239 |
| 5240 status = U_ZERO_ERROR; |
| 5241 group = pat->groupNumberFromName(UnicodeString("six"), status); |
| 5242 REGEX_CHECK_STATUS; |
| 5243 REGEX_ASSERT(6 == group); |
| 5244 |
| 5245 status = U_ZERO_ERROR; |
| 5246 group = pat->groupNumberFromName(UnicodeString("nosuch"), status); |
| 5247 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 5248 |
| 5249 status = U_ZERO_ERROR; |
| 5250 |
| 5251 // After copying a pattern, named capture should still work in the copy. |
| 5252 RegexPattern *copiedPat = new RegexPattern(*pat); |
| 5253 REGEX_ASSERT(*copiedPat == *pat); |
| 5254 delete pat; pat = NULL; // Delete original, copy should have no references
back to it. |
| 5255 |
| 5256 group = copiedPat->groupNumberFromName("five", -1, status); |
| 5257 REGEX_CHECK_STATUS; |
| 5258 REGEX_ASSERT(5 == group); |
| 5259 group = copiedPat->groupNumberFromName("three", -1, status); |
| 5260 REGEX_CHECK_STATUS; |
| 5261 REGEX_ASSERT(3 == group); |
| 5262 delete copiedPat; |
| 5263 |
| 5264 // ReplaceAll with named capture group. |
| 5265 status = U_ZERO_ERROR; |
| 5266 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>"); |
| 5267 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0
, status); |
| 5268 REGEX_CHECK_STATUS; |
| 5269 // m.pattern().dumpPattern(); |
| 5270 UnicodeString replacedText = m->replaceAll("'${mid}'", status); |
| 5271 REGEX_CHECK_STATUS; |
| 5272 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'")
== replacedText); |
| 5273 delete m; |
| 5274 |
| 5275 // ReplaceAll, allowed capture group numbers. |
| 5276 text = UnicodeString("abcmxyz"); |
| 5277 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status); |
| 5278 REGEX_CHECK_STATUS; |
| 5279 |
| 5280 status = U_ZERO_ERROR; |
| 5281 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0,
full match, is allowed. |
| 5282 REGEX_CHECK_STATUS; |
| 5283 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText); |
| 5284 |
| 5285 status = U_ZERO_ERROR; |
| 5286 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group
1 by number. |
| 5287 REGEX_CHECK_STATUS; |
| 5288 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText); |
| 5289 |
| 5290 status = U_ZERO_ERROR; |
| 5291 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group
1 by name. |
| 5292 REGEX_CHECK_STATUS; |
| 5293 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText); |
| 5294 |
| 5295 status = U_ZERO_ERROR; |
| 5296 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2. |
| 5297 REGEX_CHECK_STATUS; |
| 5298 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText); |
| 5299 |
| 5300 status = U_ZERO_ERROR; |
| 5301 replacedText = m->replaceAll(UnicodeString("<$3>"), status); |
| 5302 REGEX_CHECK_STATUS; |
| 5303 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText); |
| 5304 |
| 5305 status = U_ZERO_ERROR; |
| 5306 replacedText = m->replaceAll(UnicodeString("<$4>"), status); |
| 5307 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| 5308 |
| 5309 status = U_ZERO_ERROR; |
| 5310 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group
0, leading 0, |
| 5311 REGEX_CHECK_STATUS; // tr
ailing out-of-range 4 passes through. |
| 5312 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText); |
| 5313 |
| 5314 status = U_ZERO_ERROR; |
| 5315 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consu
me leading zeroes. Don't consume digits |
| 5316 REGEX_CHECK_STATUS; // tha
t push group num out of range. |
| 5317 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // Thi
s is group 1. |
| 5318 |
| 5319 status = U_ZERO_ERROR; |
| 5320 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status); |
| 5321 REGEX_CHECK_STATUS; |
| 5322 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText); |
| 5323 |
| 5324 status = U_ZERO_ERROR; |
| 5325 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status); |
| 5326 REGEX_CHECK_STATUS; |
| 5327 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText); |
| 5328 |
| 5329 status = U_ZERO_ERROR; |
| 5330 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status); |
| 5331 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 5332 |
| 5333 status = U_ZERO_ERROR; |
| 5334 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status); |
| 5335 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 5336 |
| 5337 status = U_ZERO_ERROR; |
| 5338 replacedText = m->replaceAll(UnicodeString("<${one"), status); |
| 5339 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 5340 |
| 5341 status = U_ZERO_ERROR; |
| 5342 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status)
; |
| 5343 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 5344 |
| 5345 delete m; |
| 5346 |
| 5347 // Repeat the above replaceAll() tests using the plain C API, which |
| 5348 // has a separate implementation internally. |
| 5349 // TODO: factor out the test data. |
| 5350 |
| 5351 status = U_ZERO_ERROR; |
| 5352 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status)
; |
| 5353 REGEX_CHECK_STATUS; |
| 5354 text = UnicodeString("abcmxyz"); |
| 5355 uregex_setText(re, text.getBuffer(), text.length(), &status); |
| 5356 REGEX_CHECK_STATUS; |
| 5357 |
| 5358 UChar resultBuf[100]; |
| 5359 int32_t resultLength; |
| 5360 UnicodeString repl; |
| 5361 |
| 5362 status = U_ZERO_ERROR; |
| 5363 repl = UnicodeString("<$0>"); |
| 5364 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5365 REGEX_CHECK_STATUS; |
| 5366 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLe
ngth)); |
| 5367 |
| 5368 status = U_ZERO_ERROR; |
| 5369 repl = UnicodeString("<$1>"); |
| 5370 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5371 REGEX_CHECK_STATUS; |
| 5372 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength
)); |
| 5373 |
| 5374 status = U_ZERO_ERROR; |
| 5375 repl = UnicodeString("<${one}>"); |
| 5376 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5377 REGEX_CHECK_STATUS; |
| 5378 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength
)); |
| 5379 |
| 5380 status = U_ZERO_ERROR; |
| 5381 repl = UnicodeString("<$2>"); |
| 5382 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5383 REGEX_CHECK_STATUS; |
| 5384 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength
)); |
| 5385 |
| 5386 status = U_ZERO_ERROR; |
| 5387 repl = UnicodeString("<$3>"); |
| 5388 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5389 REGEX_CHECK_STATUS; |
| 5390 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength
)); |
| 5391 |
| 5392 status = U_ZERO_ERROR; |
| 5393 repl = UnicodeString("<$4>"); |
| 5394 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5395 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); |
| 5396 |
| 5397 status = U_ZERO_ERROR; |
| 5398 repl = UnicodeString("<$04>"); |
| 5399 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5400 REGEX_CHECK_STATUS; |
| 5401 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultL
ength)); |
| 5402 |
| 5403 status = U_ZERO_ERROR; |
| 5404 repl = UnicodeString("<$000016>"); |
| 5405 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5406 REGEX_CHECK_STATUS; |
| 5407 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLengt
h)); |
| 5408 |
| 5409 status = U_ZERO_ERROR; |
| 5410 repl = UnicodeString("<$3$2$1${one}>"); |
| 5411 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5412 REGEX_CHECK_STATUS; |
| 5413 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLen
gth)); |
| 5414 |
| 5415 status = U_ZERO_ERROR; |
| 5416 repl = UnicodeString("$3$2$1${one}"); |
| 5417 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5418 REGEX_CHECK_STATUS; |
| 5419 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLengt
h)); |
| 5420 |
| 5421 status = U_ZERO_ERROR; |
| 5422 repl = UnicodeString("<${noSuchName}>"); |
| 5423 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5424 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 5425 |
| 5426 status = U_ZERO_ERROR; |
| 5427 repl = UnicodeString("<${invalid-name}>"); |
| 5428 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5429 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 5430 |
| 5431 status = U_ZERO_ERROR; |
| 5432 repl = UnicodeString("<${one"); |
| 5433 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5434 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 5435 |
| 5436 status = U_ZERO_ERROR; |
| 5437 repl = UnicodeString("$not a capture group"); |
| 5438 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result
Buf, UPRV_LENGTHOF(resultBuf), &status); |
| 5439 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); |
| 5440 |
| 5441 uregex_close(re); |
| 5442 } |
| 5443 |
| 5444 //-------------------------------------------------------------- |
| 5445 // |
| 5446 // NamedCaptureLimits Patterns with huge numbers of named capture groups. |
| 5447 // The point is not so much what the exact limit is, |
| 5448 // but that a largish number doesn't hit bad non-linear pe
rformance, |
| 5449 // and that exceeding the limit fails cleanly. |
| 5450 // |
| 5451 //-------------------------------------------------------------- |
| 5452 void RegexTest::NamedCaptureLimits() { |
| 5453 if (quick) { |
| 5454 logln("Skipping test. Runs in exhuastive mode only."); |
| 5455 return; |
| 5456 } |
| 5457 const int32_t goodLimit = 1000000; // Pattern w this many groups builds
successfully. |
| 5458 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, f
ails to compile. |
| 5459 char nnbuf[100]; |
| 5460 UnicodeString pattern; |
| 5461 int32_t nn; |
| 5462 |
| 5463 for (nn=1; nn<goodLimit; nn++) { |
| 5464 sprintf(nnbuf, "(?<nn%d>)", nn); |
| 5465 pattern.append(UnicodeString(nnbuf, -1, US_INV)); |
| 5466 } |
| 5467 UErrorCode status = U_ZERO_ERROR; |
| 5468 RegexPattern *pat = RegexPattern::compile(pattern, 0, status); |
| 5469 REGEX_CHECK_STATUS; |
| 5470 for (nn=1; nn<goodLimit; nn++) { |
| 5471 sprintf(nnbuf, "nn%d", nn); |
| 5472 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status); |
| 5473 REGEX_ASSERT(nn == groupNum); |
| 5474 if (nn != groupNum) { |
| 5475 break; |
| 5476 } |
| 5477 } |
| 5478 delete pat; |
| 5479 |
| 5480 pattern.remove(); |
| 5481 for (nn=1; nn<failLimit; nn++) { |
| 5482 sprintf(nnbuf, "(?<nn%d>)", nn); |
| 5483 pattern.append(UnicodeString(nnbuf, -1, US_INV)); |
| 5484 } |
| 5485 status = U_ZERO_ERROR; |
| 5486 pat = RegexPattern::compile(pattern, 0, status); |
| 5487 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG); |
| 5488 delete pat; |
| 5489 } |
| 5490 |
| 5491 |
5126 //-------------------------------------------------------------- | 5492 //-------------------------------------------------------------- |
5127 // | 5493 // |
5128 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher
. | 5494 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher
. |
5129 // | 5495 // |
5130 //--------------------------------------------------------------- | 5496 //--------------------------------------------------------------- |
5131 void RegexTest::Bug7651() { | 5497 void RegexTest::Bug7651() { |
5132 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\
u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z
0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\
uFFFF])|\\$[A-Za-z]+)"); | 5498 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\
u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z
0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\
uFFFF])|\\$[A-Za-z]+)"); |
5133 // The following should exceed the default operator stack depth in the matc
her, i.e. force the matcher to malloc instead of using fSmallData. | 5499 // The following should exceed the default operator stack depth in the matc
her, i.e. force the matcher to malloc instead of using fSmallData. |
5134 // It will cause a segfault if RegexMatcher tries to use fSmallData instead
of malloc'ing the memory needed (see init2) for the pattern operator stack allo
cation. | 5500 // It will cause a segfault if RegexMatcher tries to use fSmallData instead
of malloc'ing the memory needed (see init2) for the pattern operator stack allo
cation. |
5135 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u
0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![
A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u0
0f8-\\u00ff]*|\\$[A-Za-z]+)"); | 5501 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u
0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![
A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u0
0f8-\\u00ff]*|\\$[A-Za-z]+)"); |
(...skipping 271 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5407 patternString.append(UnicodeString("stuff and things dont you know, thes
e are a few of my favorite strings\n")); | 5773 patternString.append(UnicodeString("stuff and things dont you know, thes
e are a few of my favorite strings\n")); |
5408 } | 5774 } |
5409 patternString.append(UnicodeString("X? trailing string")); | 5775 patternString.append(UnicodeString("X? trailing string")); |
5410 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString,
0, status)); | 5776 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString,
0, status)); |
5411 if (status != U_REGEX_PATTERN_TOO_BIG) { | 5777 if (status != U_REGEX_PATTERN_TOO_BIG) { |
5412 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.
", | 5778 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.
", |
5413 __FILE__, __LINE__, u_errorName(status)); | 5779 __FILE__, __LINE__, u_errorName(status)); |
5414 } | 5780 } |
5415 } | 5781 } |
5416 | 5782 |
| 5783 void RegexTest::TestBug11480() { |
| 5784 // C API, get capture group of a group that does not participate in the matc
h. |
| 5785 // (Returns a zero length string, with nul termination, |
| 5786 // indistinguishable from a group with a zero length match.) |
| 5787 |
| 5788 UErrorCode status = U_ZERO_ERROR; |
| 5789 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status); |
| 5790 REGEX_CHECK_STATUS; |
| 5791 UnicodeString text = UNICODE_STRING_SIMPLE("A"); |
| 5792 uregex_setText(re, text.getBuffer(), text.length(), &status); |
| 5793 REGEX_CHECK_STATUS; |
| 5794 REGEX_ASSERT(uregex_lookingAt(re, 0, &status)); |
| 5795 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13}; |
| 5796 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status); |
| 5797 REGEX_ASSERT(length == 0); |
| 5798 REGEX_ASSERT(buf[0] == 13); |
| 5799 REGEX_ASSERT(buf[1] == 0); |
| 5800 REGEX_ASSERT(buf[2] == 13); |
| 5801 uregex_close(re); |
| 5802 |
| 5803 // UText C++ API, length of match is 0 for non-participating matches. |
| 5804 UText ut = UTEXT_INITIALIZER; |
| 5805 utext_openUnicodeString(&ut, &text, &status); |
| 5806 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status); |
| 5807 REGEX_CHECK_STATUS; |
| 5808 matcher.reset(&ut); |
| 5809 REGEX_ASSERT(matcher.lookingAt(0, status)); |
| 5810 |
| 5811 // UText C++ API, Capture group 1 matches "A", position 0, length 1. |
| 5812 int64_t groupLen = -666; |
| 5813 UText group = UTEXT_INITIALIZER; |
| 5814 matcher.group(1, &group, groupLen, status); |
| 5815 REGEX_CHECK_STATUS; |
| 5816 REGEX_ASSERT(groupLen == 1); |
| 5817 REGEX_ASSERT(utext_getNativeIndex(&group) == 0); |
| 5818 |
| 5819 // Capture group 2, the (B), does not participate in the match. |
| 5820 matcher.group(2, &group, groupLen, status); |
| 5821 REGEX_CHECK_STATUS; |
| 5822 REGEX_ASSERT(groupLen == 0); |
| 5823 REGEX_ASSERT(matcher.start(2, status) == -1); |
| 5824 REGEX_CHECK_STATUS; |
| 5825 } |
| 5826 |
| 5827 |
5417 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ | 5828 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ |
5418 | |
OLD | NEW |