Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(21)

Side by Side Diff: source/test/intltest/regextst.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/test/intltest/regextst.h ('k') | source/test/intltest/regiontst.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /******************************************************************** 1 /********************************************************************
2 * COPYRIGHT: 2 * COPYRIGHT:
3 * Copyright (c) 2002-2014, International Business Machines Corporation and 3 * Copyright (c) 2002-2015, International Business Machines Corporation and
4 * others. All Rights Reserved. 4 * others. All Rights Reserved.
5 ********************************************************************/ 5 ********************************************************************/
6 6
7 // 7 //
8 // regextst.cpp 8 // regextst.cpp
9 // 9 //
10 // ICU Regular Expressions test, part of intltest. 10 // ICU Regular Expressions test, part of intltest.
11 // 11 //
12 12
13 /* 13 /*
14 NOTE!! 14 NOTE!!
15 15
16 PLEASE be careful about ASCII assumptions in this test. 16 PLEASE be careful about ASCII assumptions in this test.
17 This test is one of the worst repeat offenders. 17 This test is one of the worst repeat offenders.
18 If you have questions, contact someone on the ICU PMC 18 If you have questions, contact someone on the ICU PMC
19 who has access to an EBCDIC system. 19 who has access to an EBCDIC system.
20 20
21 */ 21 */
22 22
23 #include "intltest.h" 23 #include "intltest.h"
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
25 25
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29
26 #include "unicode/localpointer.h" 30 #include "unicode/localpointer.h"
27 #include "unicode/regex.h" 31 #include "unicode/regex.h"
28 #include "unicode/uchar.h" 32 #include "unicode/uchar.h"
29 #include "unicode/ucnv.h" 33 #include "unicode/ucnv.h"
30 #include "unicode/uniset.h" 34 #include "unicode/uniset.h"
31 #include "unicode/uregex.h" 35 #include "unicode/uregex.h"
32 #include "unicode/usetiter.h" 36 #include "unicode/usetiter.h"
33 #include "unicode/ustring.h" 37 #include "unicode/ustring.h"
38 #include "unicode/utext.h"
39
34 #include "regextst.h" 40 #include "regextst.h"
35 #include "regexcmp.h" 41 #include "regexcmp.h"
36 #include "uvector.h" 42 #include "uvector.h"
37 #include "util.h" 43 #include "util.h"
38 #include <stdlib.h> 44 #include "cmemory.h"
39 #include <string.h>
40 #include <stdio.h>
41 #include "cstring.h" 45 #include "cstring.h"
42 #include "uinvchar.h" 46 #include "uinvchar.h"
43 47
44 #define SUPPORT_MUTATING_INPUT_STRING 0 48 #define SUPPORT_MUTATING_INPUT_STRING 0
45 49
46 //--------------------------------------------------------------------------- 50 //---------------------------------------------------------------------------
47 // 51 //
48 // Test class boilerplate 52 // Test class boilerplate
49 // 53 //
50 //--------------------------------------------------------------------------- 54 //---------------------------------------------------------------------------
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after
140 break; 144 break;
141 case 23: name = "TestCaseInsensitiveStarters"; 145 case 23: name = "TestCaseInsensitiveStarters";
142 if (exec) TestCaseInsensitiveStarters(); 146 if (exec) TestCaseInsensitiveStarters();
143 break; 147 break;
144 case 24: name = "TestBug11049"; 148 case 24: name = "TestBug11049";
145 if (exec) TestBug11049(); 149 if (exec) TestBug11049();
146 break; 150 break;
147 case 25: name = "TestBug11371"; 151 case 25: name = "TestBug11371";
148 if (exec) TestBug11371(); 152 if (exec) TestBug11371();
149 break; 153 break;
154 case 26: name = "TestBug11480";
155 if (exec) TestBug11480();
156 break;
157 case 27: name = "NamedCapture";
158 if (exec) NamedCapture();
159 break;
160 case 28: name = "NamedCaptureLimits";
161 if (exec) NamedCaptureLimits();
162 break;
150 default: name = ""; 163 default: name = "";
151 break; //needed to end loop 164 break; //needed to end loop
152 } 165 }
153 } 166 }
154 167
155 168
156 169
157 /** 170 /**
158 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage 171 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
159 * into ASCII. 172 * into ASCII.
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after
232 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr) ;\ 245 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr) ;\
233 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status= %s, got %s", \ 246 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status= %s, got %s", \
234 __LINE__, u_errorName(errcode), u_errorName(status));};} 247 __LINE__, u_errorName(errcode), u_errorName(status));};}
235 248
236 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \ 249 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
237 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), stat us); }} 250 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), stat us); }}
238 251
239 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ 252 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
240 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} 253 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
241 254
242 #define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTes t failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToA ssertBuf(ustr),inv);};} 255 // expected: const char * , restricted to invariant characters.
256 // actual: const UnicodeString &
257 #define REGEX_ASSERT_UNISTR(expected, actual) { \
258 if (UnicodeString(expected, -1, US_INV) != (actual)) { \
259 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
260 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
243 261
244 262
245 static UBool testUTextEqual(UText *uta, UText *utb) { 263 static UBool testUTextEqual(UText *uta, UText *utb) {
246 UChar32 ca = 0; 264 UChar32 ca = 0;
247 UChar32 cb = 0; 265 UChar32 cb = 0;
248 utext_setNativeIndex(uta, 0); 266 utext_setNativeIndex(uta, 0);
249 utext_setNativeIndex(utb, 0); 267 utext_setNativeIndex(utb, 0);
250 do { 268 do {
251 ca = utext_next32(uta); 269 ca = utext_next32(uta);
252 cb = utext_next32(utb); 270 cb = utext_next32(utb);
(...skipping 1163 matching lines...) Expand 10 before | Expand all | Expand 10 after
1416 REGEX_CHECK_STATUS; 1434 REGEX_CHECK_STATUS;
1417 dest = matcher2->replaceFirst("$1$1", status); 1435 dest = matcher2->replaceFirst("$1$1", status);
1418 REGEX_CHECK_STATUS; 1436 REGEX_CHECK_STATUS;
1419 REGEX_ASSERT(dest == "bcbcdefg"); 1437 REGEX_ASSERT(dest == "bcbcdefg");
1420 1438
1421 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1 ."), status); 1439 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1 ."), status);
1422 REGEX_CHECK_STATUS; 1440 REGEX_CHECK_STATUS;
1423 REGEX_ASSERT(dest == "The value of $1 is bc.defg"); 1441 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1424 1442
1425 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); 1443 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1426 REGEX_CHECK_STATUS; 1444 REGEX_ASSERT(U_FAILURE(status));
1427 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg"); 1445 status = U_ZERO_ERROR;
1428 1446
1429 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U 0001D7CF."); 1447 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U 0001D7CF.");
1430 replacement = replacement.unescape(); 1448 replacement = replacement.unescape();
1431 dest = matcher2->replaceFirst(replacement, status); 1449 dest = matcher2->replaceFirst(replacement, status);
1432 REGEX_CHECK_STATUS; 1450 REGEX_CHECK_STATUS;
1433 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); 1451 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1434 1452
1435 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",st atus), U_INDEX_OUTOFBOUNDS_ERROR); 1453 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",st atus), U_INDEX_OUTOFBOUNDS_ERROR);
1436 1454
1437 1455
(...skipping 605 matching lines...) Expand 10 before | Expand all | Expand 10 after
2043 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2061 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2044 utext_close(result); 2062 utext_close(result);
2045 result = matcher->group(0, &destText, group_len, status); 2063 result = matcher->group(0, &destText, group_len, status);
2046 REGEX_CHECK_STATUS; 2064 REGEX_CHECK_STATUS;
2047 REGEX_ASSERT(result == &destText); 2065 REGEX_ASSERT(result == &destText);
2048 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2066 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2049 // destText is now immutable, reopen it 2067 // destText is now immutable, reopen it
2050 utext_close(&destText); 2068 utext_close(&destText);
2051 utext_openUnicodeString(&destText, &dest, &status); 2069 utext_openUnicodeString(&destText, &dest, &status);
2052 2070
2053 result = matcher->group(0, NULL, status); 2071 int64_t length;
2072 result = matcher->group(0, NULL, length, status);
2054 REGEX_CHECK_STATUS; 2073 REGEX_CHECK_STATUS;
2055 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2074 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2056 utext_close(result); 2075 utext_close(result);
2057 result = matcher->group(0, &destText, status); 2076 result = matcher->group(0, &destText, length, status);
2058 REGEX_CHECK_STATUS; 2077 REGEX_CHECK_STATUS;
2059 REGEX_ASSERT(result == &destText); 2078 REGEX_ASSERT(result == &destText);
2060 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2079 REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2080 REGEX_ASSERT(length == 10);
2081 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2061 2082
2062 result = matcher->group(1, NULL, status); 2083 // Capture Group 1 == "234567"
2084 result = matcher->group(1, NULL, length, status);
2063 REGEX_CHECK_STATUS; 2085 REGEX_CHECK_STATUS;
2064 const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */ 2086 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2065 REGEX_ASSERT_UTEXT_UTF8(str_234567, result); 2087 REGEX_ASSERT(length == 6);
2088 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2066 utext_close(result); 2089 utext_close(result);
2067 result = matcher->group(1, &destText, status); 2090
2091 result = matcher->group(1, &destText, length, status);
2068 REGEX_CHECK_STATUS; 2092 REGEX_CHECK_STATUS;
2069 REGEX_ASSERT(result == &destText); 2093 REGEX_ASSERT(result == &destText);
2070 REGEX_ASSERT_UTEXT_UTF8(str_234567, result); 2094 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2095 REGEX_ASSERT(length == 6);
2096 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2097 utext_close(result);
2071 2098
2072 result = matcher->group(2, NULL, status); 2099 // Capture Group 2 == "45"
2100 result = matcher->group(2, NULL, length, status);
2073 REGEX_CHECK_STATUS; 2101 REGEX_CHECK_STATUS;
2074 const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */ 2102 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2075 REGEX_ASSERT_UTEXT_UTF8(str_45, result); 2103 REGEX_ASSERT(length == 2);
2104 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2076 utext_close(result); 2105 utext_close(result);
2077 result = matcher->group(2, &destText, status); 2106
2107 result = matcher->group(2, &destText, length, status);
2078 REGEX_CHECK_STATUS; 2108 REGEX_CHECK_STATUS;
2079 REGEX_ASSERT(result == &destText); 2109 REGEX_ASSERT(result == &destText);
2080 REGEX_ASSERT_UTEXT_UTF8(str_45, result); 2110 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2111 REGEX_ASSERT(length == 2);
2112 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2113 utext_close(result);
2081 2114
2082 result = matcher->group(3, NULL, status); 2115 // Capture Group 3 == "89"
2116 result = matcher->group(3, NULL, length, status);
2083 REGEX_CHECK_STATUS; 2117 REGEX_CHECK_STATUS;
2084 const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */ 2118 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2085 REGEX_ASSERT_UTEXT_UTF8(str_89, result); 2119 REGEX_ASSERT(length == 2);
2120 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2086 utext_close(result); 2121 utext_close(result);
2087 result = matcher->group(3, &destText, status); 2122
2123 result = matcher->group(3, &destText, length, status);
2088 REGEX_CHECK_STATUS; 2124 REGEX_CHECK_STATUS;
2089 REGEX_ASSERT(result == &destText); 2125 REGEX_ASSERT(result == &destText);
2090 REGEX_ASSERT_UTEXT_UTF8(str_89, result); 2126 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2127 REGEX_ASSERT(length == 2);
2128 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2129 utext_close(result);
2091 2130
2131 // Capture Group number out of range.
2132 status = U_ZERO_ERROR;
2092 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR) ; 2133 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR) ;
2134 status = U_ZERO_ERROR;
2093 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR) ; 2135 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR) ;
2136 status = U_ZERO_ERROR;
2094 matcher->reset(); 2137 matcher->reset();
2095 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 2138 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2096 2139
2097 delete matcher; 2140 delete matcher;
2098 delete pat; 2141 delete pat;
2099 2142
2100 utext_close(&destText); 2143 utext_close(&destText);
2101 utext_close(&input); 2144 utext_close(&input);
2102 utext_close(&re); 2145 utext_close(&re);
2103 } 2146 }
(...skipping 491 matching lines...) Expand 10 before | Expand all | Expand 10 after
2595 REGEX_CHECK_STATUS; 2638 REGEX_CHECK_STATUS;
2596 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0 x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg * / 2639 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0 x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg * /
2597 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); 2640 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2598 utext_close(result); 2641 utext_close(result);
2599 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ; 2642 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;
2600 result = matcher2->replaceFirst(&replText, &destText, status); 2643 result = matcher2->replaceFirst(&replText, &destText, status);
2601 REGEX_CHECK_STATUS; 2644 REGEX_CHECK_STATUS;
2602 REGEX_ASSERT(result == &destText); 2645 REGEX_ASSERT(result == &destText);
2603 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); 2646 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2604 2647
2605 const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x6 9, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0 x00 }; /* $ by itself, no group number $$$ */ 2648 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x2 0, 0x69, 0x74, 0x73, 0x65, 0x6c,
2649 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2650 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2606 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status); 2651 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2607 result = matcher2->replaceFirst(&replText, NULL, status); 2652 result = matcher2->replaceFirst(&replText, NULL, status);
2608 REGEX_CHECK_STATUS; 2653 REGEX_CHECK_STATUS;
2609 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0 x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x2 4, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */ 2654 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0 x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x2 4, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2610 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); 2655 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2611 utext_close(result); 2656 utext_close(result);
2612 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ; 2657 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status) ;
2613 result = matcher2->replaceFirst(&replText, &destText, status); 2658 result = matcher2->replaceFirst(&replText, &destText, status);
2614 REGEX_CHECK_STATUS; 2659 REGEX_CHECK_STATUS;
2615 REGEX_ASSERT(result == &destText); 2660 REGEX_ASSERT(result == &destText);
(...skipping 446 matching lines...) Expand 10 before | Expand all | Expand 10 after
3062 REGEX_ASSERT(n==5); 3107 REGEX_ASSERT(n==5);
3063 REGEX_ASSERT(fields[0]=="1"); 3108 REGEX_ASSERT(fields[0]=="1");
3064 REGEX_ASSERT(fields[1]=="-"); 3109 REGEX_ASSERT(fields[1]=="-");
3065 REGEX_ASSERT(fields[2]=="10"); 3110 REGEX_ASSERT(fields[2]=="10");
3066 REGEX_ASSERT(fields[3]==","); 3111 REGEX_ASSERT(fields[3]==",");
3067 REGEX_ASSERT(fields[4]=="20"); 3112 REGEX_ASSERT(fields[4]=="20");
3068 delete pat1; 3113 delete pat1;
3069 3114
3070 3115
3071 // 3116 //
3117 // split of a UText based string, with library allocating output UTexts.
3118 //
3119 {
3120 status = U_ZERO_ERROR;
3121 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3122 UnicodeString stringToSplit("first:second:third");
3123 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &stat us);
3124 REGEX_CHECK_STATUS;
3125
3126 UText *splits[10] = {NULL};
3127 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(spl its), status);
3128 REGEX_CHECK_STATUS;
3129 REGEX_ASSERT(numFields == 5);
3130 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3131 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3132 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3133 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3134 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3135 REGEX_ASSERT(splits[5] == NULL);
3136
3137 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3138 if (splits[i]) {
3139 utext_close(splits[i]);
3140 splits[i] = NULL;
3141 }
3142 }
3143 utext_close(textToSplit);
3144 }
3145
3146
3147 //
3072 // RegexPattern::pattern() and patternText() 3148 // RegexPattern::pattern() and patternText()
3073 // 3149 //
3074 pat1 = new RegexPattern(); 3150 pat1 = new RegexPattern();
3075 REGEX_ASSERT(pat1->pattern() == ""); 3151 REGEX_ASSERT(pat1->pattern() == "");
3076 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status)); 3152 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3077 delete pat1; 3153 delete pat1;
3078 const char *helloWorldInvariant = "(Hello, world)*"; 3154 const char *helloWorldInvariant = "(Hello, world)*";
3079 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status); 3155 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3080 pat1 = RegexPattern::compile(&re1, pe, status); 3156 pat1 = RegexPattern::compile(&re1, pe, status);
3081 REGEX_CHECK_STATUS; 3157 REGEX_CHECK_STATUS;
3082 REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*"); 3158 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3083 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); 3159 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3084 delete pat1; 3160 delete pat1;
3085 3161
3086 utext_close(&re1); 3162 utext_close(&re1);
3087 } 3163 }
3088 3164
3089 3165
3090 //--------------------------------------------------------------------------- 3166 //---------------------------------------------------------------------------
3091 // 3167 //
3092 // Extended A more thorough check for features of regex patterns 3168 // Extended A more thorough check for features of regex patterns
(...skipping 684 matching lines...) Expand 10 before | Expand all | Expand 10 after
3777 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); 3853 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3778 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); 3854 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3779 3855
3780 // Mal-formed {min,max} quantifiers 3856 // Mal-formed {min,max} quantifiers
3781 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL); 3857 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3782 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN); 3858 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3783 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL); 3859 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3784 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); 3860 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3785 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); 3861 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3786 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); 3862 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3787 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan 3863 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3788 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format 3864 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3789 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); 3865 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3790 3866
3791 // Ticket 5389 3867 // Ticket 5389
3792 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); 3868 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3793 3869
3794 // Invalid Back Reference \0 3870 // Invalid Back Reference \0
3795 // For ICU 3.8 and earlier 3871 // For ICU 3.8 and earlier
3796 // For ICU versions newer than 3.8, \0 introduces an octal escape. 3872 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3797 // 3873 //
(...skipping 1001 matching lines...) Expand 10 before | Expand all | Expand 10 after
4799 REGEX_ASSERT(cbInfo.numCalls > 0); 4875 REGEX_ASSERT(cbInfo.numCalls > 0);
4800 4876
4801 // A longer running match that the callback function will abort. 4877 // A longer running match that the callback function will abort.
4802 status = U_ZERO_ERROR; 4878 status = U_ZERO_ERROR;
4803 cbInfo.reset(4); 4879 cbInfo.reset(4);
4804 s = "aaaaaaaaaaaaaaaaaaaaaaab"; 4880 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4805 matcher.reset(s); 4881 matcher.reset(s);
4806 REGEX_ASSERT(matcher.matches(status)==FALSE); 4882 REGEX_ASSERT(matcher.matches(status)==FALSE);
4807 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4883 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4808 REGEX_ASSERT(cbInfo.numCalls == 4); 4884 REGEX_ASSERT(cbInfo.numCalls == 4);
4885
4886 // A longer running find that the callback function will abort.
4887 status = U_ZERO_ERROR;
4888 cbInfo.reset(4);
4889 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4890 matcher.reset(s);
4891 REGEX_ASSERT(matcher.find(status)==FALSE);
4892 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4893 REGEX_ASSERT(cbInfo.numCalls == 4);
4809 } 4894 }
4810 4895
4811 4896
4812 } 4897 }
4813 4898
4814 4899
4815 // 4900 //
4816 // FindProgressCallbacks() Test the find "progress" callback function. 4901 // FindProgressCallbacks() Test the find "progress" callback function.
4817 // When set, the find progress callback will be invoked during a find operations 4902 // When set, the find progress callback will be invoked during a find operations
4818 // after each return from a match attempt, giving the application the opportunity 4903 // after each return from a match attempt, giving the application the opportunity
(...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after
4988 utext_close(&text2); 5073 utext_close(&text2);
4989 } 5074 }
4990 5075
4991 /* 5076 /*
4992 * group() 5077 * group()
4993 */ 5078 */
4994 { 5079 {
4995 UChar text1[80]; 5080 UChar text1[80];
4996 UText *actual; 5081 UText *actual;
4997 UBool result; 5082 UBool result;
4998 u_uastrncpy(text1, "noise abc interior def, and this is off the end", s izeof(text1)/2); 5083 int64_t length = 0;
5084
5085 u_uastrncpy(text1, "noise abc interior def, and this is off the end", U PRV_LENGTHOF(text1));
5086 // 012345678901234567890123456789012345678901234567
5087 // 0 1 2 3 4
4999 5088
5000 status = U_ZERO_ERROR; 5089 status = U_ZERO_ERROR;
5001 re = uregex_openC("abc(.*?)def", 0, NULL, &status); 5090 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5002 REGEX_CHECK_STATUS; 5091 REGEX_CHECK_STATUS;
5003 5092
5004 uregex_setText(re, text1, -1, &status); 5093 uregex_setText(re, text1, -1, &status);
5005 result = uregex_find(re, 0, &status); 5094 result = uregex_find(re, 0, &status);
5006 REGEX_ASSERT(result==TRUE); 5095 REGEX_ASSERT(result==TRUE);
5007 5096
5008 /* Capture Group 0, the full match. Should succeed. */ 5097 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5009 status = U_ZERO_ERROR; 5098 status = U_ZERO_ERROR;
5010 actual = uregex_groupUTextDeep(re, 0, &bufferText, &status); 5099 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5011 REGEX_CHECK_STATUS; 5100 REGEX_CHECK_STATUS;
5012 REGEX_ASSERT(actual == &bufferText); 5101 REGEX_ASSERT(actual == &bufferText);
5013 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual); 5102 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5103 REGEX_ASSERT(length == 16);
5104 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5014 5105
5015 /* Capture group #1. Should succeed. */ 5106 /* Capture group #1. Should succeed, matching " interior ". */
5016 status = U_ZERO_ERROR; 5107 status = U_ZERO_ERROR;
5017 actual = uregex_groupUTextDeep(re, 1, &bufferText, &status); 5108 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5018 REGEX_CHECK_STATUS; 5109 REGEX_CHECK_STATUS;
5019 REGEX_ASSERT(actual == &bufferText); 5110 REGEX_ASSERT(actual == &bufferText);
5020 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual); 5111 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
5112 REGEX_ASSERT(length == 10);
5113 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5021 5114
5022 /* Capture group out of range. Error. */ 5115 /* Capture group out of range. Error. */
5023 status = U_ZERO_ERROR; 5116 status = U_ZERO_ERROR;
5024 actual = uregex_groupUTextDeep(re, 2, &bufferText, &status); 5117 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5025 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5118 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5026 REGEX_ASSERT(actual == &bufferText); 5119 REGEX_ASSERT(actual == &bufferText);
5027
5028 uregex_close(re); 5120 uregex_close(re);
5029 5121
5030 } 5122 }
5031 5123
5032 /* 5124 /*
5033 * replaceFirst() 5125 * replaceFirst()
5034 */ 5126 */
5035 { 5127 {
5036 UChar text1[80]; 5128 UChar text1[80];
5037 UChar text2[80]; 5129 UChar text2[80];
5038 UText replText = UTEXT_INITIALIZER; 5130 UText replText = UTEXT_INITIALIZER;
5039 UText *result; 5131 UText *result;
5132 status = U_ZERO_ERROR;
5133 utext_openUnicodeString(&bufferText, &buffer, &status);
5040 5134
5041 status = U_ZERO_ERROR; 5135 status = U_ZERO_ERROR;
5042 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 5136 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
5043 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 5137 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
5044 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5138 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5045 5139
5046 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5140 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5047 REGEX_CHECK_STATUS; 5141 REGEX_CHECK_STATUS;
5048 5142
5049 /* Normal case, with match */ 5143 /* Normal case, with match */
5050 uregex_setText(re, text1, -1, &status); 5144 uregex_setText(re, text1, -1, &status);
5145 REGEX_CHECK_STATUS;
5051 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5146 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5147 REGEX_CHECK_STATUS;
5052 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5148 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5053 REGEX_CHECK_STATUS; 5149 REGEX_CHECK_STATUS;
5054 REGEX_ASSERT(result == &bufferText); 5150 REGEX_ASSERT(result == &bufferText);
5055 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result); 5151 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5056 5152
5057 /* No match. Text should copy to output with no changes. */ 5153 /* No match. Text should copy to output with no changes. */
5058 uregex_setText(re, text2, -1, &status); 5154 uregex_setText(re, text2, -1, &status);
5059 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5155 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5060 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5156 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5061 REGEX_CHECK_STATUS; 5157 REGEX_CHECK_STATUS;
5062 REGEX_ASSERT(result == &bufferText); 5158 REGEX_ASSERT(result == &bufferText);
5063 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5159 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5064 5160
5065 /* Unicode escapes */ 5161 /* Unicode escapes */
5066 uregex_setText(re, text1, -1, &status); 5162 uregex_setText(re, text1, -1, &status);
5067 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a" , -1, &status); 5163 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\ a", -1, &status);
5068 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5164 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5069 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5165 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5070 REGEX_CHECK_STATUS; 5166 REGEX_CHECK_STATUS;
5071 REGEX_ASSERT(result == &bufferText); 5167 REGEX_ASSERT(result == &bufferText);
5072 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result); 5168 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5073 5169
5074 uregex_close(re); 5170 uregex_close(re);
5075 utext_close(&replText); 5171 utext_close(&replText);
5076 } 5172 }
5077 5173
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
5116 5212
5117 /* 5213 /*
5118 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts, 5214 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5119 * so we don't need to test it here. 5215 * so we don't need to test it here.
5120 */ 5216 */
5121 5217
5122 utext_close(&bufferText); 5218 utext_close(&bufferText);
5123 utext_close(&patternText); 5219 utext_close(&patternText);
5124 } 5220 }
5125 5221
5222
5223 //--------------------------------------------------------------
5224 //
5225 // NamedCapture Check basic named capture group functionality
5226 //
5227 //--------------------------------------------------------------
5228 void RegexTest::NamedCapture() {
5229 UErrorCode status = U_ZERO_ERROR;
5230 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5231 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, stat us);
5232 REGEX_CHECK_STATUS;
5233 int32_t group = pat->groupNumberFromName("five", -1, status);
5234 REGEX_CHECK_STATUS;
5235 REGEX_ASSERT(5 == group);
5236 group = pat->groupNumberFromName("three", -1, status);
5237 REGEX_CHECK_STATUS;
5238 REGEX_ASSERT(3 == group);
5239
5240 status = U_ZERO_ERROR;
5241 group = pat->groupNumberFromName(UnicodeString("six"), status);
5242 REGEX_CHECK_STATUS;
5243 REGEX_ASSERT(6 == group);
5244
5245 status = U_ZERO_ERROR;
5246 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5247 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5248
5249 status = U_ZERO_ERROR;
5250
5251 // After copying a pattern, named capture should still work in the copy.
5252 RegexPattern *copiedPat = new RegexPattern(*pat);
5253 REGEX_ASSERT(*copiedPat == *pat);
5254 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5255
5256 group = copiedPat->groupNumberFromName("five", -1, status);
5257 REGEX_CHECK_STATUS;
5258 REGEX_ASSERT(5 == group);
5259 group = copiedPat->groupNumberFromName("three", -1, status);
5260 REGEX_CHECK_STATUS;
5261 REGEX_ASSERT(3 == group);
5262 delete copiedPat;
5263
5264 // ReplaceAll with named capture group.
5265 status = U_ZERO_ERROR;
5266 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5267 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0 , status);
5268 REGEX_CHECK_STATUS;
5269 // m.pattern().dumpPattern();
5270 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5271 REGEX_CHECK_STATUS;
5272 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5273 delete m;
5274
5275 // ReplaceAll, allowed capture group numbers.
5276 text = UnicodeString("abcmxyz");
5277 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5278 REGEX_CHECK_STATUS;
5279
5280 status = U_ZERO_ERROR;
5281 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5282 REGEX_CHECK_STATUS;
5283 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5284
5285 status = U_ZERO_ERROR;
5286 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5287 REGEX_CHECK_STATUS;
5288 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5289
5290 status = U_ZERO_ERROR;
5291 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5292 REGEX_CHECK_STATUS;
5293 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5294
5295 status = U_ZERO_ERROR;
5296 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5297 REGEX_CHECK_STATUS;
5298 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5299
5300 status = U_ZERO_ERROR;
5301 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5302 REGEX_CHECK_STATUS;
5303 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5304
5305 status = U_ZERO_ERROR;
5306 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5307 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5308
5309 status = U_ZERO_ERROR;
5310 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5311 REGEX_CHECK_STATUS; // tr ailing out-of-range 4 passes through.
5312 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5313
5314 status = U_ZERO_ERROR;
5315 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consu me leading zeroes. Don't consume digits
5316 REGEX_CHECK_STATUS; // tha t push group num out of range.
5317 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // Thi s is group 1.
5318
5319 status = U_ZERO_ERROR;
5320 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5321 REGEX_CHECK_STATUS;
5322 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5323
5324 status = U_ZERO_ERROR;
5325 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5326 REGEX_CHECK_STATUS;
5327 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5328
5329 status = U_ZERO_ERROR;
5330 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5331 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5332
5333 status = U_ZERO_ERROR;
5334 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5335 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5336
5337 status = U_ZERO_ERROR;
5338 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5339 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5340
5341 status = U_ZERO_ERROR;
5342 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status) ;
5343 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5344
5345 delete m;
5346
5347 // Repeat the above replaceAll() tests using the plain C API, which
5348 // has a separate implementation internally.
5349 // TODO: factor out the test data.
5350
5351 status = U_ZERO_ERROR;
5352 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status) ;
5353 REGEX_CHECK_STATUS;
5354 text = UnicodeString("abcmxyz");
5355 uregex_setText(re, text.getBuffer(), text.length(), &status);
5356 REGEX_CHECK_STATUS;
5357
5358 UChar resultBuf[100];
5359 int32_t resultLength;
5360 UnicodeString repl;
5361
5362 status = U_ZERO_ERROR;
5363 repl = UnicodeString("<$0>");
5364 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5365 REGEX_CHECK_STATUS;
5366 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLe ngth));
5367
5368 status = U_ZERO_ERROR;
5369 repl = UnicodeString("<$1>");
5370 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5371 REGEX_CHECK_STATUS;
5372 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength ));
5373
5374 status = U_ZERO_ERROR;
5375 repl = UnicodeString("<${one}>");
5376 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5377 REGEX_CHECK_STATUS;
5378 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength ));
5379
5380 status = U_ZERO_ERROR;
5381 repl = UnicodeString("<$2>");
5382 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5383 REGEX_CHECK_STATUS;
5384 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength ));
5385
5386 status = U_ZERO_ERROR;
5387 repl = UnicodeString("<$3>");
5388 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5389 REGEX_CHECK_STATUS;
5390 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength ));
5391
5392 status = U_ZERO_ERROR;
5393 repl = UnicodeString("<$4>");
5394 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5395 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5396
5397 status = U_ZERO_ERROR;
5398 repl = UnicodeString("<$04>");
5399 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5400 REGEX_CHECK_STATUS;
5401 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultL ength));
5402
5403 status = U_ZERO_ERROR;
5404 repl = UnicodeString("<$000016>");
5405 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5406 REGEX_CHECK_STATUS;
5407 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLengt h));
5408
5409 status = U_ZERO_ERROR;
5410 repl = UnicodeString("<$3$2$1${one}>");
5411 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5412 REGEX_CHECK_STATUS;
5413 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLen gth));
5414
5415 status = U_ZERO_ERROR;
5416 repl = UnicodeString("$3$2$1${one}");
5417 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5418 REGEX_CHECK_STATUS;
5419 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLengt h));
5420
5421 status = U_ZERO_ERROR;
5422 repl = UnicodeString("<${noSuchName}>");
5423 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5424 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5425
5426 status = U_ZERO_ERROR;
5427 repl = UnicodeString("<${invalid-name}>");
5428 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5429 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5430
5431 status = U_ZERO_ERROR;
5432 repl = UnicodeString("<${one");
5433 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5434 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5435
5436 status = U_ZERO_ERROR;
5437 repl = UnicodeString("$not a capture group");
5438 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), result Buf, UPRV_LENGTHOF(resultBuf), &status);
5439 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5440
5441 uregex_close(re);
5442 }
5443
5444 //--------------------------------------------------------------
5445 //
5446 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5447 // The point is not so much what the exact limit is,
5448 // but that a largish number doesn't hit bad non-linear performance,
5449 // and that exceeding the limit fails cleanly.
5450 //
5451 //--------------------------------------------------------------
5452 void RegexTest::NamedCaptureLimits() {
5453 if (quick) {
5454 logln("Skipping test. Runs in exhuastive mode only.");
5455 return;
5456 }
5457 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
5458 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, f ails to compile.
5459 char nnbuf[100];
5460 UnicodeString pattern;
5461 int32_t nn;
5462
5463 for (nn=1; nn<goodLimit; nn++) {
5464 sprintf(nnbuf, "(?<nn%d>)", nn);
5465 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5466 }
5467 UErrorCode status = U_ZERO_ERROR;
5468 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5469 REGEX_CHECK_STATUS;
5470 for (nn=1; nn<goodLimit; nn++) {
5471 sprintf(nnbuf, "nn%d", nn);
5472 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5473 REGEX_ASSERT(nn == groupNum);
5474 if (nn != groupNum) {
5475 break;
5476 }
5477 }
5478 delete pat;
5479
5480 pattern.remove();
5481 for (nn=1; nn<failLimit; nn++) {
5482 sprintf(nnbuf, "(?<nn%d>)", nn);
5483 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5484 }
5485 status = U_ZERO_ERROR;
5486 pat = RegexPattern::compile(pattern, 0, status);
5487 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5488 delete pat;
5489 }
5490
5491
5126 //-------------------------------------------------------------- 5492 //--------------------------------------------------------------
5127 // 5493 //
5128 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher . 5494 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher .
5129 // 5495 //
5130 //--------------------------------------------------------------- 5496 //---------------------------------------------------------------
5131 void RegexTest::Bug7651() { 5497 void RegexTest::Bug7651() {
5132 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\ u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z 0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\ uFFFF])|\\$[A-Za-z]+)"); 5498 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\ u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z 0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\ uFFFF])|\\$[A-Za-z]+)");
5133 // The following should exceed the default operator stack depth in the matc her, i.e. force the matcher to malloc instead of using fSmallData. 5499 // The following should exceed the default operator stack depth in the matc her, i.e. force the matcher to malloc instead of using fSmallData.
5134 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allo cation. 5500 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allo cation.
5135 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u 0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![ A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u0 0f8-\\u00ff]*|\\$[A-Za-z]+)"); 5501 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u 0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![ A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u0 0f8-\\u00ff]*|\\$[A-Za-z]+)");
(...skipping 271 matching lines...) Expand 10 before | Expand all | Expand 10 after
5407 patternString.append(UnicodeString("stuff and things dont you know, thes e are a few of my favorite strings\n")); 5773 patternString.append(UnicodeString("stuff and things dont you know, thes e are a few of my favorite strings\n"));
5408 } 5774 }
5409 patternString.append(UnicodeString("X? trailing string")); 5775 patternString.append(UnicodeString("X? trailing string"));
5410 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status)); 5776 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5411 if (status != U_REGEX_PATTERN_TOO_BIG) { 5777 if (status != U_REGEX_PATTERN_TOO_BIG) {
5412 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s. ", 5778 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s. ",
5413 __FILE__, __LINE__, u_errorName(status)); 5779 __FILE__, __LINE__, u_errorName(status));
5414 } 5780 }
5415 } 5781 }
5416 5782
5783 void RegexTest::TestBug11480() {
5784 // C API, get capture group of a group that does not participate in the matc h.
5785 // (Returns a zero length string, with nul termination,
5786 // indistinguishable from a group with a zero length match.)
5787
5788 UErrorCode status = U_ZERO_ERROR;
5789 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5790 REGEX_CHECK_STATUS;
5791 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5792 uregex_setText(re, text.getBuffer(), text.length(), &status);
5793 REGEX_CHECK_STATUS;
5794 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5795 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5796 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5797 REGEX_ASSERT(length == 0);
5798 REGEX_ASSERT(buf[0] == 13);
5799 REGEX_ASSERT(buf[1] == 0);
5800 REGEX_ASSERT(buf[2] == 13);
5801 uregex_close(re);
5802
5803 // UText C++ API, length of match is 0 for non-participating matches.
5804 UText ut = UTEXT_INITIALIZER;
5805 utext_openUnicodeString(&ut, &text, &status);
5806 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5807 REGEX_CHECK_STATUS;
5808 matcher.reset(&ut);
5809 REGEX_ASSERT(matcher.lookingAt(0, status));
5810
5811 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5812 int64_t groupLen = -666;
5813 UText group = UTEXT_INITIALIZER;
5814 matcher.group(1, &group, groupLen, status);
5815 REGEX_CHECK_STATUS;
5816 REGEX_ASSERT(groupLen == 1);
5817 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5818
5819 // Capture group 2, the (B), does not participate in the match.
5820 matcher.group(2, &group, groupLen, status);
5821 REGEX_CHECK_STATUS;
5822 REGEX_ASSERT(groupLen == 0);
5823 REGEX_ASSERT(matcher.start(2, status) == -1);
5824 REGEX_CHECK_STATUS;
5825 }
5826
5827
5417 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 5828 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
5418
OLDNEW
« no previous file with comments | « source/test/intltest/regextst.h ('k') | source/test/intltest/regiontst.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698