OLD | NEW |
| (Empty) |
1 diff --git a/source/common/unicode/utypes.h b/source/common/unicode/utypes.h | |
2 index 704089c..1824625 100644 | |
3 --- a/source/common/unicode/utypes.h | |
4 +++ b/source/common/unicode/utypes.h | |
5 @@ -305,7 +305,7 @@ typedef double UDate; | |
6 #define U_IO_API | |
7 #define U_TOOLUTIL_API | |
8 #elif defined(U_COMMON_IMPLEMENTATION) | |
9 -#define U_DATA_API U_IMPORT | |
10 +#define U_DATA_API U_EXPORT | |
11 #define U_COMMON_API U_EXPORT | |
12 #define U_I18N_API U_IMPORT | |
13 #define U_LAYOUT_API U_IMPORT | |
14 @@ -647,6 +647,7 @@ typedef enum UErrorCode { | |
15 U_REGEX_STACK_OVERFLOW, /**< Regular expression backtrack sta
ck overflow. */ | |
16 U_REGEX_TIME_OUT, /**< Maximum allowed match time excee
ded */ | |
17 U_REGEX_STOPPED_BY_CALLER, /**< Matching operation aborted by us
er callback fn. */ | |
18 + U_REGEX_PATTERN_TOO_BIG, /**< Pattern exceeds limits on size o
r complexity. @draft ICU 55 */ | |
19 U_REGEX_ERROR_LIMIT, /**< This must always be the last val
ue to indicate the limit for regexp errors */ | |
20 | |
21 /* | |
22 diff --git a/source/common/utypes.c b/source/common/utypes.c | |
23 index c28e727..32b6d88 100644 | |
24 --- a/source/common/utypes.c | |
25 +++ b/source/common/utypes.c | |
26 @@ -1,7 +1,7 @@ | |
27 /* | |
28 ****************************************************************************** | |
29 * | |
30 -* Copyright (C) 1997-2011, International Business Machines | |
31 +* Copyright (C) 1997-2014, International Business Machines | |
32 * Corporation and others. All Rights Reserved. | |
33 * | |
34 ****************************************************************************** | |
35 @@ -165,7 +165,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START]
= { | |
36 "U_REGEX_INVALID_RANGE", | |
37 "U_REGEX_STACK_OVERFLOW", | |
38 "U_REGEX_TIME_OUT", | |
39 - "U_REGEX_STOPPED_BY_CALLER" | |
40 + "U_REGEX_STOPPED_BY_CALLER", | |
41 + "U_REGEX_PATTERN_TOO_BIG" | |
42 }; | |
43 | |
44 static const char * const | |
45 diff --git a/source/i18n/regexcmp.cpp b/source/i18n/regexcmp.cpp | |
46 index 0816eec..0c2196f 100644 | |
47 --- a/source/i18n/regexcmp.cpp | |
48 +++ b/source/i18n/regexcmp.cpp | |
49 @@ -301,7 +301,7 @@ void RegexCompile::compile( | |
50 // present in the saved state: the input string position (int64_t) and | |
51 // the position in the compiled pattern. | |
52 // | |
53 - fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; | |
54 + allocateStackData(RESTACKFRAME_HDRCOUNT); | |
55 | |
56 // | |
57 // Optimization pass 1: NOPs, back-references, and case-folding | |
58 @@ -367,9 +367,9 @@ UBool RegexCompile::doParseActions(int32_t action) | |
59 // the start of an ( grouping. | |
60 //4 NOP Resreved, will be replaced by a save if there are | |
61 // OR | operators at the top level | |
62 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus
); | |
63 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); | |
64 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); | |
65 + appendOp(URX_STATE_SAVE, 2); | |
66 + appendOp(URX_JMP, 3); | |
67 + appendOp(URX_FAIL, 0); | |
68 | |
69 // Standard open nonCapture paren action emits the two NOPs and | |
70 // sets up the paren stack frame. | |
71 @@ -392,7 +392,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
72 } | |
73 | |
74 // add the END operation to the compiled pattern. | |
75 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); | |
76 + appendOp(URX_END, 0); | |
77 | |
78 // Terminate the pattern compilation state machine. | |
79 returnVal = FALSE; | |
80 @@ -414,14 +414,13 @@ UBool RegexCompile::doParseActions(int32_t action) | |
81 int32_t savePosition = fParenStack.popi(); | |
82 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition
); | |
83 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserve
d location | |
84 - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); | |
85 + op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); | |
86 fRXPat->fCompiledPat->setElementAt(op, savePosition); | |
87 | |
88 // Append an JMP operation into the compiled pattern. The operand
for | |
89 // the JMP will eventually be the location following the ')' for t
he | |
90 // group. This will be patched in later, when the ')' is encounte
red. | |
91 - op = URX_BUILD(URX_JMP, 0); | |
92 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
93 + appendOp(URX_JMP, 0); | |
94 | |
95 // Push the position of the newly added JMP op onto the parentheses
stack. | |
96 // This registers if for fixup when this block's close paren is enc
ountered. | |
97 @@ -430,7 +429,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
98 // Append a NOP to the compiled pattern. This is the slot reserved | |
99 // for a SAVE in the event that there is yet another '|' followin
g | |
100 // this one. | |
101 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
102 + appendOp(URX_NOP, 0); | |
103 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | |
104 } | |
105 break; | |
106 @@ -456,12 +455,10 @@ UBool RegexCompile::doParseActions(int32_t action) | |
107 // END_CAPTURE is encountered. | |
108 { | |
109 fixLiterals(); | |
110 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
111 - int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots
in match stack frame. | |
112 - fRXPat->fFrameSize += 3; | |
113 - int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); | |
114 - fRXPat->fCompiledPat->addElement(cop, *fStatus); | |
115 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
116 + appendOp(URX_NOP, 0); | |
117 + int32_t varsLoc = allocateStackData(3); // Reserve three slots
in match stack frame. | |
118 + appendOp(URX_START_CAPTURE, varsLoc); | |
119 + appendOp(URX_NOP, 0); | |
120 | |
121 // On the Parentheses stack, start a new frame and add the postions | |
122 // of the two NOPs. Depending on what follows in the pattern, th
e | |
123 @@ -486,8 +483,8 @@ UBool RegexCompile::doParseActions(int32_t action) | |
124 // is an '|' alternation within the parens. | |
125 { | |
126 fixLiterals(); | |
127 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
128 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
129 + appendOp(URX_NOP, 0); | |
130 + appendOp(URX_NOP, 0); | |
131 | |
132 // On the Parentheses stack, start a new frame and add the postions | |
133 // of the two NOPs. | |
134 @@ -509,12 +506,10 @@ UBool RegexCompile::doParseActions(int32_t action) | |
135 // is an '|' alternation within the parens. | |
136 { | |
137 fixLiterals(); | |
138 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
139 - int32_t varLoc = fRXPat->fDataSize; // Reserve a data locati
on for saving the | |
140 - fRXPat->fDataSize += 1; // state stack ptr. | |
141 - int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); | |
142 - fRXPat->fCompiledPat->addElement(stoOp, *fStatus); | |
143 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
144 + appendOp(URX_NOP, 0); | |
145 + int32_t varLoc = allocateData(1); // Reserve a data location fo
r saving the state stack ptr. | |
146 + appendOp(URX_STO_SP, varLoc); | |
147 + appendOp(URX_NOP, 0); | |
148 | |
149 // On the Parentheses stack, start a new frame and add the postions | |
150 // of the two NOPs. Depending on what follows in the pattern, th
e | |
151 @@ -557,26 +552,14 @@ UBool RegexCompile::doParseActions(int32_t action) | |
152 // Two data slots are reserved, for saving the stack ptr and the input
position. | |
153 { | |
154 fixLiterals(); | |
155 - int32_t dataLoc = fRXPat->fDataSize; | |
156 - fRXPat->fDataSize += 2; | |
157 - int32_t op = URX_BUILD(URX_LA_START, dataLoc); | |
158 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
159 - | |
160 - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); | |
161 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
162 - | |
163 - op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); | |
164 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
165 - | |
166 - op = URX_BUILD(URX_LA_END, dataLoc); | |
167 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
168 - | |
169 - op = URX_BUILD(URX_BACKTRACK, 0); | |
170 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
171 - | |
172 - op = URX_BUILD(URX_NOP, 0); | |
173 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
174 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
175 + int32_t dataLoc = allocateData(2); | |
176 + appendOp(URX_LA_START, dataLoc); | |
177 + appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); | |
178 + appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3); | |
179 + appendOp(URX_LA_END, dataLoc); | |
180 + appendOp(URX_BACKTRACK, 0); | |
181 + appendOp(URX_NOP, 0); | |
182 + appendOp(URX_NOP, 0); | |
183 | |
184 // On the Parentheses stack, start a new frame and add the postions | |
185 // of the NOPs. | |
186 @@ -601,16 +584,10 @@ UBool RegexCompile::doParseActions(int32_t action) | |
187 // an alternate (transparent) re
gion. | |
188 { | |
189 fixLiterals(); | |
190 - int32_t dataLoc = fRXPat->fDataSize; | |
191 - fRXPat->fDataSize += 2; | |
192 - int32_t op = URX_BUILD(URX_LA_START, dataLoc); | |
193 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
194 - | |
195 - op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patch
ed later. | |
196 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
197 - | |
198 - op = URX_BUILD(URX_NOP, 0); | |
199 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
200 + int32_t dataLoc = allocateData(2); | |
201 + appendOp(URX_LA_START, dataLoc); | |
202 + appendOp(URX_STATE_SAVE, 0); // dest address will be patched lat
er. | |
203 + appendOp(URX_NOP, 0); | |
204 | |
205 // On the Parentheses stack, start a new frame and add the postions | |
206 // of the StateSave and NOP. | |
207 @@ -648,23 +625,19 @@ UBool RegexCompile::doParseActions(int32_t action) | |
208 fixLiterals(); | |
209 | |
210 // Allocate data space | |
211 - int32_t dataLoc = fRXPat->fDataSize; | |
212 - fRXPat->fDataSize += 4; | |
213 + int32_t dataLoc = allocateData(4); | |
214 | |
215 // Emit URX_LB_START | |
216 - int32_t op = URX_BUILD(URX_LB_START, dataLoc); | |
217 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
218 + appendOp(URX_LB_START, dataLoc); | |
219 | |
220 // Emit URX_LB_CONT | |
221 - op = URX_BUILD(URX_LB_CONT, dataLoc); | |
222 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
223 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLengt
h. To be filled later. | |
224 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLengt
h. To be filled later. | |
225 + appendOp(URX_LB_CONT, dataLoc); | |
226 + appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled l
ater. | |
227 + appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled l
ater. | |
228 | |
229 - // Emit the NOP | |
230 - op = URX_BUILD(URX_NOP, 0); | |
231 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
232 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
233 + // Emit the NOPs | |
234 + appendOp(URX_NOP, 0); | |
235 + appendOp(URX_NOP, 0); | |
236 | |
237 // On the Parentheses stack, start a new frame and add the postions | |
238 // of the URX_LB_CONT and the NOP. | |
239 @@ -704,24 +677,20 @@ UBool RegexCompile::doParseActions(int32_t action) | |
240 fixLiterals(); | |
241 | |
242 // Allocate data space | |
243 - int32_t dataLoc = fRXPat->fDataSize; | |
244 - fRXPat->fDataSize += 4; | |
245 + int32_t dataLoc = allocateData(4); | |
246 | |
247 // Emit URX_LB_START | |
248 - int32_t op = URX_BUILD(URX_LB_START, dataLoc); | |
249 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
250 + appendOp(URX_LB_START, dataLoc); | |
251 | |
252 // Emit URX_LBN_CONT | |
253 - op = URX_BUILD(URX_LBN_CONT, dataLoc); | |
254 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
255 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLengt
h. To be filled later. | |
256 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLengt
h. To be filled later. | |
257 - fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc.
To be filled later. | |
258 + appendOp(URX_LBN_CONT, dataLoc); | |
259 + appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled l
ater. | |
260 + appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled l
ater. | |
261 + appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled l
ater. | |
262 | |
263 - // Emit the NOP | |
264 - op = URX_BUILD(URX_NOP, 0); | |
265 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
266 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
267 + // Emit the NOPs | |
268 + appendOp(URX_NOP, 0); | |
269 + appendOp(URX_NOP, 0); | |
270 | |
271 // On the Parentheses stack, start a new frame and add the postions | |
272 // of the URX_LB_CONT and the NOP. | |
273 @@ -791,12 +760,9 @@ UBool RegexCompile::doParseActions(int32_t action) | |
274 | |
275 if (URX_TYPE(repeatedOp) == URX_SETREF) { | |
276 // Emit optimized code for [char set]+ | |
277 - int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeated
Op)); | |
278 - fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); | |
279 - frameLoc = fRXPat->fFrameSize; | |
280 - fRXPat->fFrameSize++; | |
281 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | |
282 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
283 + appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); | |
284 + frameLoc = allocateStackData(1); | |
285 + appendOp(URX_LOOP_C, frameLoc); | |
286 break; | |
287 } | |
288 | |
289 @@ -804,7 +770,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
290 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || | |
291 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { | |
292 // Emit Optimized code for .+ operations. | |
293 - int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | |
294 + int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); | |
295 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | |
296 // URX_LOOP_DOT_I operand is a flag indicating ". match
es any" mode. | |
297 loopOpI |= 1; | |
298 @@ -812,11 +778,9 @@ UBool RegexCompile::doParseActions(int32_t action) | |
299 if (fModeFlags & UREGEX_UNIX_LINES) { | |
300 loopOpI |= 2; | |
301 } | |
302 - fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); | |
303 - frameLoc = fRXPat->fFrameSize; | |
304 - fRXPat->fFrameSize++; | |
305 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | |
306 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
307 + appendOp(loopOpI); | |
308 + frameLoc = allocateStackData(1); | |
309 + appendOp(URX_LOOP_C, frameLoc); | |
310 break; | |
311 } | |
312 | |
313 @@ -830,18 +794,15 @@ UBool RegexCompile::doParseActions(int32_t action) | |
314 // Zero length match is possible. | |
315 // Emit the code sequence that can handle it. | |
316 insertOp(topLoc); | |
317 - frameLoc = fRXPat->fFrameSize; | |
318 - fRXPat->fFrameSize++; | |
319 + frameLoc = allocateStackData(1); | |
320 | |
321 - int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); | |
322 + int32_t op = buildOp(URX_STO_INP_LOC, frameLoc); | |
323 fRXPat->fCompiledPat->setElementAt(op, topLoc); | |
324 | |
325 - op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); | |
326 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
327 + appendOp(URX_JMP_SAV_X, topLoc+1); | |
328 } else { | |
329 // Simpler code when the repeated body must match something non
-empty | |
330 - int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); | |
331 - fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); | |
332 + appendOp(URX_JMP_SAV, topLoc); | |
333 } | |
334 } | |
335 break; | |
336 @@ -853,8 +814,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
337 // 3. ... | |
338 { | |
339 int32_t topLoc = blockTopLoc(FALSE); | |
340 - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); | |
341 - fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); | |
342 + appendOp(URX_STATE_SAVE, topLoc); | |
343 } | |
344 break; | |
345 | |
346 @@ -868,7 +828,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
347 // Insert the state save into the compiled pattern, and we're done. | |
348 { | |
349 int32_t saveStateLoc = blockTopLoc(TRUE); | |
350 - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompile
dPat->size()); | |
351 + int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledP
at->size()); | |
352 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); | |
353 } | |
354 break; | |
355 @@ -887,14 +847,12 @@ UBool RegexCompile::doParseActions(int32_t action) | |
356 int32_t jmp1_loc = blockTopLoc(TRUE); | |
357 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); | |
358 | |
359 - int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); | |
360 + int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1); | |
361 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); | |
362 | |
363 - int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); | |
364 - fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); | |
365 + appendOp(URX_JMP, jmp2_loc+2); | |
366 | |
367 - int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); | |
368 - fRXPat->fCompiledPat->addElement(save_op, *fStatus); | |
369 + appendOp(URX_STATE_SAVE, jmp1_loc+1); | |
370 } | |
371 break; | |
372 | |
373 @@ -934,12 +892,10 @@ UBool RegexCompile::doParseActions(int32_t action) | |
374 | |
375 if (URX_TYPE(repeatedOp) == URX_SETREF) { | |
376 // Emit optimized code for a [char set]* | |
377 - int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeated
Op)); | |
378 + int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp
)); | |
379 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | |
380 - dataLoc = fRXPat->fFrameSize; | |
381 - fRXPat->fFrameSize++; | |
382 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | |
383 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
384 + dataLoc = allocateStackData(1); | |
385 + appendOp(URX_LOOP_C, dataLoc); | |
386 break; | |
387 } | |
388 | |
389 @@ -947,7 +903,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
390 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || | |
391 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { | |
392 // Emit Optimized code for .* operations. | |
393 - int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | |
394 + int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); | |
395 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | |
396 // URX_LOOP_DOT_I operand is a flag indicating . matche
s any mode. | |
397 loopOpI |= 1; | |
398 @@ -956,10 +912,8 @@ UBool RegexCompile::doParseActions(int32_t action) | |
399 loopOpI |= 2; | |
400 } | |
401 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | |
402 - dataLoc = fRXPat->fFrameSize; | |
403 - fRXPat->fFrameSize++; | |
404 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | |
405 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
406 + dataLoc = allocateStackData(1); | |
407 + appendOp(URX_LOOP_C, dataLoc); | |
408 break; | |
409 } | |
410 } | |
411 @@ -968,30 +922,29 @@ UBool RegexCompile::doParseActions(int32_t action) | |
412 // The optimizations did not apply. | |
413 | |
414 int32_t saveStateLoc = blockTopLoc(TRUE); | |
415 - int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); | |
416 + int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1); | |
417 | |
418 // Check for minimum match length of zero, which requires | |
419 // extra loop-breaking code. | |
420 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) ==
0) { | |
421 insertOp(saveStateLoc); | |
422 - dataLoc = fRXPat->fFrameSize; | |
423 - fRXPat->fFrameSize++; | |
424 + dataLoc = allocateStackData(1); | |
425 | |
426 - int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); | |
427 + int32_t op = buildOp(URX_STO_INP_LOC, dataLoc); | |
428 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); | |
429 - jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); | |
430 + jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2); | |
431 } | |
432 | |
433 // Locate the position in the compiled pattern where the match will
continue | |
434 // after completing the *. (4 or 5 in the comment above) | |
435 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | |
436 | |
437 - // Put together the save state op store it into the compiled code. | |
438 - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); | |
439 + // Put together the save state op and store it into the compiled co
de. | |
440 + int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc); | |
441 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); | |
442 | |
443 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pat
tern. | |
444 - fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); | |
445 + appendOp(jmpOp); | |
446 } | |
447 break; | |
448 | |
449 @@ -1005,10 +958,9 @@ UBool RegexCompile::doParseActions(int32_t action) | |
450 { | |
451 int32_t jmpLoc = blockTopLoc(TRUE); // loc
1. | |
452 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc
3. | |
453 - int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); | |
454 - int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); | |
455 + int32_t jmpOp = buildOp(URX_JMP, saveLoc); | |
456 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); | |
457 - fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); | |
458 + appendOp(URX_STATE_SAVE, jmpLoc+1); | |
459 } | |
460 break; | |
461 | |
462 @@ -1077,9 +1029,9 @@ UBool RegexCompile::doParseActions(int32_t action) | |
463 | |
464 // First the STO_SP before the start of the loop | |
465 insertOp(topLoc); | |
466 - int32_t varLoc = fRXPat->fDataSize; // Reserve a data locati
on for saving the | |
467 - fRXPat->fDataSize += 1; // state stack ptr. | |
468 - int32_t op = URX_BUILD(URX_STO_SP, varLoc); | |
469 + | |
470 + int32_t varLoc = allocateData(1); // Reserve a data location for
saving the | |
471 + int32_t op = buildOp(URX_STO_SP, varLoc); | |
472 fRXPat->fCompiledPat->setElementAt(op, topLoc); | |
473 | |
474 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); | |
475 @@ -1088,8 +1040,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
476 fRXPat->fCompiledPat->push(loopOp, *fStatus); | |
477 | |
478 // Then the LD_SP after the end of the loop | |
479 - op = URX_BUILD(URX_LD_SP, varLoc); | |
480 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
481 + appendOp(URX_LD_SP, varLoc); | |
482 } | |
483 | |
484 break; | |
485 @@ -1125,55 +1076,49 @@ UBool RegexCompile::doParseActions(int32_t action) | |
486 // scanned a ".", match any single character. | |
487 { | |
488 fixLiterals(FALSE); | |
489 - int32_t op; | |
490 if (fModeFlags & UREGEX_DOTALL) { | |
491 - op = URX_BUILD(URX_DOTANY_ALL, 0); | |
492 + appendOp(URX_DOTANY_ALL, 0); | |
493 } else if (fModeFlags & UREGEX_UNIX_LINES) { | |
494 - op = URX_BUILD(URX_DOTANY_UNIX, 0); | |
495 + appendOp(URX_DOTANY_UNIX, 0); | |
496 } else { | |
497 - op = URX_BUILD(URX_DOTANY, 0); | |
498 + appendOp(URX_DOTANY, 0); | |
499 } | |
500 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
501 } | |
502 break; | |
503 | |
504 case doCaret: | |
505 { | |
506 fixLiterals(FALSE); | |
507 - int32_t op = 0; | |
508 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UR
EGEX_UNIX_LINES) == 0) { | |
509 - op = URX_CARET; | |
510 + appendOp(URX_CARET, 0); | |
511 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UR
EGEX_UNIX_LINES) == 0) { | |
512 - op = URX_CARET_M; | |
513 + appendOp(URX_CARET_M, 0); | |
514 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UR
EGEX_UNIX_LINES) != 0) { | |
515 - op = URX_CARET; // Only testing true start of input. | |
516 + appendOp(URX_CARET, 0); // Only testing true start of input. | |
517 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UR
EGEX_UNIX_LINES) != 0) { | |
518 - op = URX_CARET_M_UNIX; | |
519 + appendOp(URX_CARET_M_UNIX, 0); | |
520 } | |
521 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | |
522 } | |
523 break; | |
524 | |
525 case doDollar: | |
526 { | |
527 fixLiterals(FALSE); | |
528 - int32_t op = 0; | |
529 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UR
EGEX_UNIX_LINES) == 0) { | |
530 - op = URX_DOLLAR; | |
531 + appendOp(URX_DOLLAR, 0); | |
532 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UR
EGEX_UNIX_LINES) == 0) { | |
533 - op = URX_DOLLAR_M; | |
534 + appendOp(URX_DOLLAR_M, 0); | |
535 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UR
EGEX_UNIX_LINES) != 0) { | |
536 - op = URX_DOLLAR_D; | |
537 + appendOp(URX_DOLLAR_D, 0); | |
538 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UR
EGEX_UNIX_LINES) != 0) { | |
539 - op = URX_DOLLAR_MD; | |
540 + appendOp(URX_DOLLAR_MD, 0); | |
541 } | |
542 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | |
543 } | |
544 break; | |
545 | |
546 case doBackslashA: | |
547 fixLiterals(FALSE); | |
548 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); | |
549 + appendOp(URX_CARET, 0); | |
550 break; | |
551 | |
552 case doBackslashB: | |
553 @@ -1185,7 +1130,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
554 #endif | |
555 fixLiterals(FALSE); | |
556 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BA
CKSLASH_B; | |
557 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); | |
558 + appendOp(op, 1); | |
559 } | |
560 break; | |
561 | |
562 @@ -1198,63 +1143,59 @@ UBool RegexCompile::doParseActions(int32_t action) | |
563 #endif | |
564 fixLiterals(FALSE); | |
565 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BA
CKSLASH_B; | |
566 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | |
567 + appendOp(op, 0); | |
568 } | |
569 break; | |
570 | |
571 case doBackslashD: | |
572 fixLiterals(FALSE); | |
573 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatu
s); | |
574 + appendOp(URX_BACKSLASH_D, 1); | |
575 break; | |
576 | |
577 case doBackslashd: | |
578 fixLiterals(FALSE); | |
579 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatu
s); | |
580 + appendOp(URX_BACKSLASH_D, 0); | |
581 break; | |
582 | |
583 case doBackslashG: | |
584 fixLiterals(FALSE); | |
585 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatu
s); | |
586 + appendOp(URX_BACKSLASH_G, 0); | |
587 break; | |
588 | |
589 case doBackslashS: | |
590 fixLiterals(FALSE); | |
591 - fRXPat->fCompiledPat->addElement( | |
592 - URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); | |
593 + appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); | |
594 break; | |
595 | |
596 case doBackslashs: | |
597 fixLiterals(FALSE); | |
598 - fRXPat->fCompiledPat->addElement( | |
599 - URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); | |
600 + appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); | |
601 break; | |
602 | |
603 case doBackslashW: | |
604 fixLiterals(FALSE); | |
605 - fRXPat->fCompiledPat->addElement( | |
606 - URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); | |
607 + appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); | |
608 break; | |
609 | |
610 case doBackslashw: | |
611 fixLiterals(FALSE); | |
612 - fRXPat->fCompiledPat->addElement( | |
613 - URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); | |
614 + appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); | |
615 break; | |
616 | |
617 case doBackslashX: | |
618 fixLiterals(FALSE); | |
619 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatu
s); | |
620 + appendOp(URX_BACKSLASH_X, 0); | |
621 break; | |
622 | |
623 | |
624 case doBackslashZ: | |
625 fixLiterals(FALSE); | |
626 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); | |
627 + appendOp(URX_DOLLAR, 0); | |
628 break; | |
629 | |
630 case doBackslashz: | |
631 fixLiterals(FALSE); | |
632 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatu
s); | |
633 + appendOp(URX_BACKSLASH_Z, 0); | |
634 break; | |
635 | |
636 case doEscapeError: | |
637 @@ -1314,13 +1255,11 @@ UBool RegexCompile::doParseActions(int32_t action) | |
638 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal
escape sequence, | |
639 // and shouldn't enter this code path a
t all. | |
640 fixLiterals(FALSE); | |
641 - int32_t op; | |
642 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | |
643 - op = URX_BUILD(URX_BACKREF_I, groupNum); | |
644 + appendOp(URX_BACKREF_I, groupNum); | |
645 } else { | |
646 - op = URX_BUILD(URX_BACKREF, groupNum); | |
647 + appendOp(URX_BACKREF, groupNum); | |
648 } | |
649 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
650 } | |
651 break; | |
652 | |
653 @@ -1341,22 +1280,18 @@ UBool RegexCompile::doParseActions(int32_t action) | |
654 { | |
655 // Emit the STO_SP | |
656 int32_t topLoc = blockTopLoc(TRUE); | |
657 - int32_t stoLoc = fRXPat->fDataSize; | |
658 - fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. | |
659 - int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | |
660 + int32_t stoLoc = allocateData(1); // Reserve the data location f
or storing save stack ptr. | |
661 + int32_t op = buildOp(URX_STO_SP, stoLoc); | |
662 fRXPat->fCompiledPat->setElementAt(op, topLoc); | |
663 | |
664 // Emit the STATE_SAVE | |
665 - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); | |
666 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
667 + appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); | |
668 | |
669 // Emit the JMP | |
670 - op = URX_BUILD(URX_JMP, topLoc+1); | |
671 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
672 + appendOp(URX_JMP, topLoc+1); | |
673 | |
674 // Emit the LD_SP | |
675 - op = URX_BUILD(URX_LD_SP, stoLoc); | |
676 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
677 + appendOp(URX_LD_SP, stoLoc); | |
678 } | |
679 break; | |
680 | |
681 @@ -1376,23 +1311,20 @@ UBool RegexCompile::doParseActions(int32_t action) | |
682 insertOp(topLoc); | |
683 | |
684 // emit STO_SP loc | |
685 - int32_t stoLoc = fRXPat->fDataSize; | |
686 - fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. | |
687 - int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | |
688 + int32_t stoLoc = allocateData(1); // Reserve the data location
for storing save stack ptr. | |
689 + int32_t op = buildOp(URX_STO_SP, stoLoc); | |
690 fRXPat->fCompiledPat->setElementAt(op, topLoc); | |
691 | |
692 // Emit the SAVE_STATE 5 | |
693 int32_t L7 = fRXPat->fCompiledPat->size()+1; | |
694 - op = URX_BUILD(URX_STATE_SAVE, L7); | |
695 + op = buildOp(URX_STATE_SAVE, L7); | |
696 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | |
697 | |
698 // Append the JMP operation. | |
699 - op = URX_BUILD(URX_JMP, topLoc+1); | |
700 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
701 + appendOp(URX_JMP, topLoc+1); | |
702 | |
703 // Emit the LD_SP loc | |
704 - op = URX_BUILD(URX_LD_SP, stoLoc); | |
705 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
706 + appendOp(URX_LD_SP, stoLoc); | |
707 } | |
708 break; | |
709 | |
710 @@ -1411,19 +1343,17 @@ UBool RegexCompile::doParseActions(int32_t action) | |
711 insertOp(topLoc); | |
712 | |
713 // Emit the STO_SP | |
714 - int32_t stoLoc = fRXPat->fDataSize; | |
715 - fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. | |
716 - int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | |
717 + int32_t stoLoc = allocateData(1); // Reserve the data location
for storing save stack ptr. | |
718 + int32_t op = buildOp(URX_STO_SP, stoLoc); | |
719 fRXPat->fCompiledPat->setElementAt(op, topLoc); | |
720 | |
721 // Emit the SAVE_STATE | |
722 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | |
723 - op = URX_BUILD(URX_STATE_SAVE, continueLoc); | |
724 + op = buildOp(URX_STATE_SAVE, continueLoc); | |
725 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | |
726 | |
727 // Emit the LD_SP | |
728 - op = URX_BUILD(URX_LD_SP, stoLoc); | |
729 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
730 + appendOp(URX_LD_SP, stoLoc); | |
731 } | |
732 break; | |
733 | |
734 @@ -1480,8 +1410,8 @@ UBool RegexCompile::doParseActions(int32_t action) | |
735 // is an '|' alternation within the parens. | |
736 { | |
737 fixLiterals(FALSE); | |
738 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
739 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
740 + appendOp(URX_NOP, 0); | |
741 + appendOp(URX_NOP, 0); | |
742 | |
743 // On the Parentheses stack, start a new frame and add the postions | |
744 // of the two NOPs (a normal non-capturing () frame, except for t
he | |
745 @@ -1818,7 +1748,6 @@ void RegexCompile::literalChar(UChar32 c) { | |
746 // | |
747 //-----------------------------------------------------------------------------
- | |
748 void RegexCompile::fixLiterals(UBool split) { | |
749 - int32_t op = 0; // An op from/for the compiled patte
rn. | |
750 | |
751 // If no literal characters have been scanned but not yet had code generate
d | |
752 // for them, nothing needs to be done. | |
753 @@ -1857,23 +1786,23 @@ void RegexCompile::fixLiterals(UBool split) { | |
754 // Single character, emit a URX_ONECHAR op to match it. | |
755 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && | |
756 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { | |
757 - op = URX_BUILD(URX_ONECHAR_I, lastCodePoint); | |
758 + appendOp(URX_ONECHAR_I, lastCodePoint); | |
759 } else { | |
760 - op = URX_BUILD(URX_ONECHAR, lastCodePoint); | |
761 + appendOp(URX_ONECHAR, lastCodePoint); | |
762 } | |
763 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
764 } else { | |
765 // Two or more chars, emit a URX_STRING to match them. | |
766 + if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length(
) > 0x00ffffff) { | |
767 + error(U_REGEX_PATTERN_TOO_BIG); | |
768 + } | |
769 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | |
770 - op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length()); | |
771 + appendOp(URX_STRING_I, fRXPat->fLiteralText.length()); | |
772 } else { | |
773 // TODO here: add optimization to split case sensitive strings of
length two | |
774 // into two single char ops, for efficiency. | |
775 - op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length()); | |
776 + appendOp(URX_STRING, fRXPat->fLiteralText.length()); | |
777 } | |
778 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
779 - op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length()); | |
780 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
781 + appendOp(URX_STRING_LEN, fLiteralChars.length()); | |
782 | |
783 // Add this string into the accumulated strings of the compiled pattern
. | |
784 fRXPat->fLiteralText.append(fLiteralChars); | |
785 @@ -1883,8 +1812,58 @@ void RegexCompile::fixLiterals(UBool split) { | |
786 } | |
787 | |
788 | |
789 +int32_t RegexCompile::buildOp(int32_t type, int32_t val) { | |
790 + if (U_FAILURE(*fStatus)) { | |
791 + return 0; | |
792 + } | |
793 + if (type < 0 || type > 255) { | |
794 + U_ASSERT(FALSE); | |
795 + error(U_REGEX_INTERNAL_ERROR); | |
796 + type = URX_RESERVED_OP; | |
797 + } | |
798 + if (val > 0x00ffffff) { | |
799 + U_ASSERT(FALSE); | |
800 + error(U_REGEX_INTERNAL_ERROR); | |
801 + val = 0; | |
802 + } | |
803 + if (val < 0) { | |
804 + if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) { | |
805 + U_ASSERT(FALSE); | |
806 + error(U_REGEX_INTERNAL_ERROR); | |
807 + return -1; | |
808 + } | |
809 + if (URX_TYPE(val) != 0xff) { | |
810 + U_ASSERT(FALSE); | |
811 + error(U_REGEX_INTERNAL_ERROR); | |
812 + return -1; | |
813 + } | |
814 + type = URX_RESERVED_OP_N; | |
815 + } | |
816 + return (type << 24) | val; | |
817 +} | |
818 | |
819 | |
820 +//-----------------------------------------------------------------------------
- | |
821 +// | |
822 +// appendOp() Append a new instruction onto the compiled pattern | |
823 +// Includes error checking, limiting the size of the | |
824 +// pattern to lengths that can be represented in the | |
825 +// 24 bit operand field of an instruction. | |
826 +// | |
827 +//-----------------------------------------------------------------------------
- | |
828 +void RegexCompile::appendOp(int32_t op) { | |
829 + if (U_FAILURE(*fStatus)) { | |
830 + return; | |
831 + } | |
832 + fRXPat->fCompiledPat->addElement(op, *fStatus); | |
833 + if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) { | |
834 + error(U_REGEX_PATTERN_TOO_BIG); | |
835 + } | |
836 +} | |
837 + | |
838 +void RegexCompile::appendOp(int32_t type, int32_t val) { | |
839 + appendOp(buildOp(type, val)); | |
840 +} | |
841 | |
842 | |
843 //-----------------------------------------------------------------------------
- | |
844 @@ -1900,7 +1879,7 @@ void RegexCompile::insertOp(int32_t where) { | |
845 UVector64 *code = fRXPat->fCompiledPat; | |
846 U_ASSERT(where>0 && where < code->size()); | |
847 | |
848 - int32_t nop = URX_BUILD(URX_NOP, 0); | |
849 + int32_t nop = buildOp(URX_NOP, 0); | |
850 code->insertElementAt(nop, where, *fStatus); | |
851 | |
852 // Walk through the pattern, looking for any ops with targets that | |
853 @@ -1921,7 +1900,7 @@ void RegexCompile::insertOp(int32_t where) { | |
854 // Target location for this opcode is after the insertion point and | |
855 // needs to be incremented to adjust for the insertion. | |
856 opValue++; | |
857 - op = URX_BUILD(opType, opValue); | |
858 + op = buildOp(opType, opValue); | |
859 code->setElementAt(op, loc); | |
860 } | |
861 } | |
862 @@ -1946,6 +1925,58 @@ void RegexCompile::insertOp(int32_t where) { | |
863 } | |
864 | |
865 | |
866 +//-----------------------------------------------------------------------------
- | |
867 +// | |
868 +// allocateData() Allocate storage in the matcher's static data area. | |
869 +// Return the index for the newly allocated data. | |
870 +// The storage won't actually exist until we are runnin
g a match | |
871 +// operation, but the storage indexes are inserted into
various | |
872 +// opcodes while compiling the pattern. | |
873 +// | |
874 +//-----------------------------------------------------------------------------
- | |
875 +int32_t RegexCompile::allocateData(int32_t size) { | |
876 + if (U_FAILURE(*fStatus)) { | |
877 + return 0; | |
878 + } | |
879 + if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) { | |
880 + error(U_REGEX_INTERNAL_ERROR); | |
881 + return 0; | |
882 + } | |
883 + int32_t dataIndex = fRXPat->fDataSize; | |
884 + fRXPat->fDataSize += size; | |
885 + if (fRXPat->fDataSize >= 0x00fffff0) { | |
886 + error(U_REGEX_INTERNAL_ERROR); | |
887 + } | |
888 + return dataIndex; | |
889 +} | |
890 + | |
891 + | |
892 +//-----------------------------------------------------------------------------
- | |
893 +// | |
894 +// allocateStackData() Allocate space in the back-tracking stack frame. | |
895 +// Return the index for the newly allocated data. | |
896 +// The frame indexes are inserted into various | |
897 +// opcodes while compiling the pattern, meaning that fr
ame | |
898 +// size must be restricted to the size that will fit | |
899 +// as an operand (24 bits). | |
900 +// | |
901 +//-----------------------------------------------------------------------------
- | |
902 +int32_t RegexCompile::allocateStackData(int32_t size) { | |
903 + if (U_FAILURE(*fStatus)) { | |
904 + return 0; | |
905 + } | |
906 + if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) { | |
907 + error(U_REGEX_INTERNAL_ERROR); | |
908 + return 0; | |
909 + } | |
910 + int32_t dataIndex = fRXPat->fFrameSize; | |
911 + fRXPat->fFrameSize += size; | |
912 + if (fRXPat->fFrameSize >= 0x00fffff0) { | |
913 + error(U_REGEX_PATTERN_TOO_BIG); | |
914 + } | |
915 + return dataIndex; | |
916 +} | |
917 + | |
918 | |
919 //-----------------------------------------------------------------------------
- | |
920 // | |
921 @@ -1988,7 +2019,7 @@ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { | |
922 theLoc--; | |
923 } | |
924 if (reserveLoc) { | |
925 - int32_t nop = URX_BUILD(URX_NOP, 0); | |
926 + int32_t nop = buildOp(URX_NOP, 0); | |
927 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); | |
928 } | |
929 } | |
930 @@ -2063,8 +2094,7 @@ void RegexCompile::handleCloseParen() { | |
931 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); | |
932 | |
933 int32_t frameVarLocation = URX_VAL(captureOp); | |
934 - int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocatio
n); | |
935 - fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); | |
936 + appendOp(URX_END_CAPTURE, frameVarLocation); | |
937 } | |
938 break; | |
939 case atomic: | |
940 @@ -2075,8 +2105,7 @@ void RegexCompile::handleCloseParen() { | |
941 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen+1); | |
942 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); | |
943 int32_t stoLoc = URX_VAL(stoOp); | |
944 - int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); | |
945 - fRXPat->fCompiledPat->addElement(ldOp, *fStatus); | |
946 + appendOp(URX_LD_SP, stoLoc); | |
947 } | |
948 break; | |
949 | |
950 @@ -2085,8 +2114,7 @@ void RegexCompile::handleCloseParen() { | |
951 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen-5); | |
952 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | |
953 int32_t dataLoc = URX_VAL(startOp); | |
954 - int32_t op = URX_BUILD(URX_LA_END, dataLoc); | |
955 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
956 + appendOp(URX_LA_END, dataLoc); | |
957 } | |
958 break; | |
959 | |
960 @@ -2096,19 +2124,16 @@ void RegexCompile::handleCloseParen() { | |
961 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen-1); | |
962 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | |
963 int32_t dataLoc = URX_VAL(startOp); | |
964 - int32_t op = URX_BUILD(URX_LA_END, dataLoc); | |
965 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
966 - op = URX_BUILD(URX_BACKTRACK, 0); | |
967 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
968 - op = URX_BUILD(URX_LA_END, dataLoc); | |
969 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
970 + appendOp(URX_LA_END, dataLoc); | |
971 + appendOp(URX_BACKTRACK, 0); | |
972 + appendOp(URX_LA_END, dataLoc); | |
973 | |
974 // Patch the URX_SAVE near the top of the block. | |
975 // The destination of the SAVE is the final LA_END that was just ad
ded. | |
976 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen); | |
977 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); | |
978 int32_t dest = fRXPat->fCompiledPat->size()-1; | |
979 - saveOp = URX_BUILD(URX_STATE_SAVE, dest); | |
980 + saveOp = buildOp(URX_STATE_SAVE, dest); | |
981 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); | |
982 } | |
983 break; | |
984 @@ -2121,10 +2146,8 @@ void RegexCompile::handleCloseParen() { | |
985 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen-4); | |
986 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | |
987 int32_t dataLoc = URX_VAL(startOp); | |
988 - int32_t op = URX_BUILD(URX_LB_END, dataLoc); | |
989 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
990 - op = URX_BUILD(URX_LA_END, dataLoc); | |
991 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
992 + appendOp(URX_LB_END, dataLoc); | |
993 + appendOp(URX_LA_END, dataLoc); | |
994 | |
995 // Determine the min and max bounds for the length of the | |
996 // string that the pattern can match. | |
997 @@ -2132,6 +2155,10 @@ void RegexCompile::handleCloseParen() { | |
998 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | |
999 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | |
1000 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | |
1001 + if (URX_TYPE(maxML) != 0) { | |
1002 + error(U_REGEX_LOOK_BEHIND_LIMIT); | |
1003 + break; | |
1004 + } | |
1005 if (maxML == INT32_MAX) { | |
1006 error(U_REGEX_LOOK_BEHIND_LIMIT); | |
1007 break; | |
1008 @@ -2156,8 +2183,7 @@ void RegexCompile::handleCloseParen() { | |
1009 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen-5); | |
1010 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | |
1011 int32_t dataLoc = URX_VAL(startOp); | |
1012 - int32_t op = URX_BUILD(URX_LBN_END, dataLoc); | |
1013 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1014 + appendOp(URX_LBN_END, dataLoc); | |
1015 | |
1016 // Determine the min and max bounds for the length of the | |
1017 // string that the pattern can match. | |
1018 @@ -2165,6 +2191,10 @@ void RegexCompile::handleCloseParen() { | |
1019 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | |
1020 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | |
1021 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | |
1022 + if (URX_TYPE(maxML) != 0) { | |
1023 + error(U_REGEX_LOOK_BEHIND_LIMIT); | |
1024 + break; | |
1025 + } | |
1026 if (maxML == INT32_MAX) { | |
1027 error(U_REGEX_LOOK_BEHIND_LIMIT); | |
1028 break; | |
1029 @@ -2178,7 +2208,7 @@ void RegexCompile::handleCloseParen() { | |
1030 | |
1031 // Insert the pattern location to continue at after a successful ma
tch | |
1032 // as the last operand of the URX_LBN_CONT | |
1033 - op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); | |
1034 + int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size())
; | |
1035 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); | |
1036 } | |
1037 break; | |
1038 @@ -2219,7 +2249,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet) | |
1039 case 0: | |
1040 { | |
1041 // Set of no elements. Always fails to match. | |
1042 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fSta
tus); | |
1043 + appendOp(URX_BACKTRACK, 0); | |
1044 delete theSet; | |
1045 } | |
1046 break; | |
1047 @@ -2240,8 +2270,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet) | |
1048 // Put it into the compiled pattern as a set. | |
1049 int32_t setNumber = fRXPat->fSets->size(); | |
1050 fRXPat->fSets->addElement(theSet, *fStatus); | |
1051 - int32_t setOp = URX_BUILD(URX_SETREF, setNumber); | |
1052 - fRXPat->fCompiledPat->addElement(setOp, *fStatus); | |
1053 + appendOp(URX_SETREF, setNumber); | |
1054 } | |
1055 } | |
1056 } | |
1057 @@ -2280,13 +2309,10 @@ void RegexCompile::compileInterval(int32_t InitOp
, int32_t LoopOp) | |
1058 // counterLoc --> Loop counter | |
1059 // +1 --> Input index (for breaking non-progressing loops
) | |
1060 // (Only present if unbounded upper limit on loop) | |
1061 - int32_t counterLoc = fRXPat->fFrameSize; | |
1062 - fRXPat->fFrameSize++; | |
1063 - if (fIntervalUpper < 0) { | |
1064 - fRXPat->fFrameSize++; | |
1065 - } | |
1066 + int32_t dataSize = fIntervalUpper < 0 ? 2 : 1; | |
1067 + int32_t counterLoc = allocateStackData(dataSize); | |
1068 | |
1069 - int32_t op = URX_BUILD(InitOp, counterLoc); | |
1070 + int32_t op = buildOp(InitOp, counterLoc); | |
1071 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); | |
1072 | |
1073 // The second operand of CTR_INIT is the location following the end of the
loop. | |
1074 @@ -2294,7 +2320,7 @@ void RegexCompile::compileInterval(int32_t InitOp,
int32_t LoopOp) | |
1075 // compilation of something later on causes the code to grow and the targ
et | |
1076 // position to move. | |
1077 int32_t loopEnd = fRXPat->fCompiledPat->size(); | |
1078 - op = URX_BUILD(URX_RELOC_OPRND, loopEnd); | |
1079 + op = buildOp(URX_RELOC_OPRND, loopEnd); | |
1080 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); | |
1081 | |
1082 // Followed by the min and max counts. | |
1083 @@ -2303,8 +2329,7 @@ void RegexCompile::compileInterval(int32_t InitOp,
int32_t LoopOp) | |
1084 | |
1085 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. | |
1086 // Goes at end of the block being looped over, so just append to the code
so far. | |
1087 - op = URX_BUILD(LoopOp, topOfBlock); | |
1088 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1089 + appendOp(LoopOp, topOfBlock); | |
1090 | |
1091 if ((fIntervalLow & 0xff000000) != 0 || | |
1092 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { | |
1093 @@ -2328,7 +2353,15 @@ UBool RegexCompile::compileInlineInterval() { | |
1094 int32_t topOfBlock = blockTopLoc(FALSE); | |
1095 if (fIntervalUpper == 0) { | |
1096 // Pathological case. Attempt no matches, as if the block doesn't exis
t. | |
1097 + // Discard the generated code for the block. | |
1098 + // If the block included parens, discard the info pertaining to them as
well. | |
1099 fRXPat->fCompiledPat->setSize(topOfBlock); | |
1100 + if (fMatchOpenParen >= topOfBlock) { | |
1101 + fMatchOpenParen = -1; | |
1102 + } | |
1103 + if (fMatchCloseParen >= topOfBlock) { | |
1104 + fMatchCloseParen = -1; | |
1105 + } | |
1106 return TRUE; | |
1107 } | |
1108 | |
1109 @@ -2349,7 +2382,7 @@ UBool RegexCompile::compileInlineInterval() { | |
1110 // | |
1111 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 | |
1112 + fIntervalUpper + (fIntervalUpper-fIntervalLow
); | |
1113 - int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); | |
1114 + int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc); | |
1115 if (fIntervalLow == 0) { | |
1116 insertOp(topOfBlock); | |
1117 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); | |
1118 @@ -2362,13 +2395,10 @@ UBool RegexCompile::compileInlineInterval() { | |
1119 // it was put there when it was originally encountered. | |
1120 int32_t i; | |
1121 for (i=1; i<fIntervalUpper; i++ ) { | |
1122 - if (i == fIntervalLow) { | |
1123 - fRXPat->fCompiledPat->addElement(saveOp, *fStatus); | |
1124 - } | |
1125 - if (i > fIntervalLow) { | |
1126 - fRXPat->fCompiledPat->addElement(saveOp, *fStatus); | |
1127 + if (i >= fIntervalLow) { | |
1128 + appendOp(saveOp); | |
1129 } | |
1130 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
1131 + appendOp(op); | |
1132 } | |
1133 return TRUE; | |
1134 } | |
1135 @@ -3587,7 +3617,7 @@ void RegexCompile::stripNOPs() { | |
1136 int32_t operandAddress = URX_VAL(op); | |
1137 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); | |
1138 int32_t fixedOperandAddress = operandAddress - deltas.elementAt
i(operandAddress); | |
1139 - op = URX_BUILD(opType, fixedOperandAddress); | |
1140 + op = buildOp(opType, fixedOperandAddress); | |
1141 fRXPat->fCompiledPat->setElementAt(op, dst); | |
1142 dst++; | |
1143 break; | |
1144 @@ -3602,7 +3632,7 @@ void RegexCompile::stripNOPs() { | |
1145 break; | |
1146 } | |
1147 where = fRXPat->fGroupMap->elementAti(where-1); | |
1148 - op = URX_BUILD(opType, where); | |
1149 + op = buildOp(opType, where); | |
1150 fRXPat->fCompiledPat->setElementAt(op, dst); | |
1151 dst++; | |
1152 | |
1153 @@ -3954,7 +3984,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) { | |
1154 //-----------------------------------------------------------------------------
- | |
1155 // | |
1156 // scanNamedChar | |
1157 - // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. | |
1158 +// Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. | |
1159 // | |
1160 // The scan position will be at the 'N'. On return | |
1161 // the scan position should be just after the '}' | |
1162 diff --git a/source/i18n/regexcmp.h b/source/i18n/regexcmp.h | |
1163 index debdf45..c3cc7db 100644 | |
1164 --- a/source/i18n/regexcmp.h | |
1165 +++ b/source/i18n/regexcmp.h | |
1166 @@ -104,6 +104,13 @@ private: | |
1167 void fixLiterals(UBool split=FALSE); // Generate code for pendi
ng literal characters. | |
1168 void insertOp(int32_t where); // Open up a slot for a ne
w op in the | |
1169 // generated code at the
specified location. | |
1170 + void appendOp(int32_t op); // Append a new op to the
compiled pattern. | |
1171 + void appendOp(int32_t type, int32_t val); // Build & append a new op
to the compiled pattern. | |
1172 + int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode i
nstruction. | |
1173 + int32_t allocateData(int32_t size); // Allocate space in the m
atcher data area. | |
1174 + // Return index of the n
ewly allocated data. | |
1175 + int32_t allocateStackData(int32_t size); // Allocate space in the m
atch back-track stack frame. | |
1176 + // Return offset index i
n the frame. | |
1177 int32_t minMatchLength(int32_t start, | |
1178 int32_t end); | |
1179 int32_t maxMatchLength(int32_t start, | |
1180 @@ -187,7 +194,9 @@ private: | |
1181 int32_t fMatchOpenParen; // The position in the com
piled pattern | |
1182 // of the slot reserved
for a state save | |
1183 // at the start of the m
ost recently processed | |
1184 - // parenthesized block. | |
1185 + // parenthesized block.
Updated when processing | |
1186 + // a close to the locati
on for the corresponding open. | |
1187 + | |
1188 int32_t fMatchCloseParen; // The position in the pat
tern of the first | |
1189 // location after the mo
st recently processed | |
1190 // parenthesized block. | |
1191 diff --git a/source/i18n/regeximp.h b/source/i18n/regeximp.h | |
1192 index bdf8403..fdd9c76 100644 | |
1193 --- a/source/i18n/regeximp.h | |
1194 +++ b/source/i18n/regeximp.h | |
1195 @@ -1,5 +1,5 @@ | |
1196 // | |
1197 -// Copyright (C) 2002-2013 International Business Machines Corporation | |
1198 +// Copyright (C) 2002-2014 International Business Machines Corporation | |
1199 // and others. All rights reserved. | |
1200 // | |
1201 // file: regeximp.h | |
1202 @@ -241,7 +241,6 @@ enum { | |
1203 // | |
1204 // Convenience macros for assembling and disassembling a compiled operation. | |
1205 // | |
1206 -#define URX_BUILD(type, val) (int32_t)((type << 24) | (val)) | |
1207 #define URX_TYPE(x) ((uint32_t)(x) >> 24) | |
1208 #define URX_VAL(x) ((x) & 0xffffff) | |
1209 | |
1210 diff --git a/source/test/intltest/regextst.cpp b/source/test/intltest/regextst.c
pp | |
1211 index ca2fd21..f440c26 100644 | |
1212 --- a/source/test/intltest/regextst.cpp | |
1213 +++ b/source/test/intltest/regextst.cpp | |
1214 @@ -144,6 +144,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, c
onst char* &name, ch | |
1215 case 24: name = "TestBug11049"; | |
1216 if (exec) TestBug11049(); | |
1217 break; | |
1218 + case 25: name = "TestBug11371"; | |
1219 + if (exec) TestBug11371(); | |
1220 + break; | |
1221 default: name = ""; | |
1222 break; //needed to end loop | |
1223 } | |
1224 @@ -5367,6 +5370,49 @@ void RegexTest::TestCase11049(const char *pattern, const
char *data, UBool expec | |
1225 } | |
1226 | |
1227 | |
1228 +void RegexTest::TestBug11371() { | |
1229 + if (quick) { | |
1230 + logln("Skipping test. Runs in exhuastive mode only."); | |
1231 + return; | |
1232 + } | |
1233 + UErrorCode status = U_ZERO_ERROR; | |
1234 + UnicodeString patternString; | |
1235 + | |
1236 + for (int i=0; i<8000000; i++) { | |
1237 + patternString.append(UnicodeString("()")); | |
1238 + } | |
1239 + LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString,
0, status)); | |
1240 + if (status != U_REGEX_PATTERN_TOO_BIG) { | |
1241 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s
.", | |
1242 + __FILE__, __LINE__, u_errorName(status)); | |
1243 + } | |
1244 + | |
1245 + status = U_ZERO_ERROR; | |
1246 + patternString = "("; | |
1247 + for (int i=0; i<20000000; i++) { | |
1248 + patternString.append(UnicodeString("A++")); | |
1249 + } | |
1250 + patternString.append(UnicodeString("){0}B++")); | |
1251 + LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString
, 0, status)); | |
1252 + if (status != U_REGEX_PATTERN_TOO_BIG) { | |
1253 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s
.", | |
1254 + __FILE__, __LINE__, u_errorName(status)); | |
1255 + } | |
1256 + | |
1257 + // Pattern with too much string data, such that string indexes overflow ope
rand data field size | |
1258 + // in compiled instruction. | |
1259 + status = U_ZERO_ERROR; | |
1260 + patternString = ""; | |
1261 + while (patternString.length() < 0x00ffffff) { | |
1262 + patternString.append(UnicodeString("stuff and things dont you know, the
se are a few of my favorite strings\n")); | |
1263 + } | |
1264 + patternString.append(UnicodeString("X? trailing string")); | |
1265 + LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString
, 0, status)); | |
1266 + if (status != U_REGEX_PATTERN_TOO_BIG) { | |
1267 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s
.", | |
1268 + __FILE__, __LINE__, u_errorName(status)); | |
1269 + } | |
1270 +} | |
1271 | |
1272 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ | |
1273 | |
1274 diff --git a/source/test/intltest/regextst.h b/source/test/intltest/regextst.h | |
1275 index 28e2121..38cc4ef 100644 | |
1276 --- a/source/test/intltest/regextst.h | |
1277 +++ b/source/test/intltest/regextst.h | |
1278 @@ -50,6 +50,7 @@ public: | |
1279 virtual void Bug10459(); | |
1280 virtual void TestCaseInsensitiveStarters(); | |
1281 virtual void TestBug11049(); | |
1282 + virtual void TestBug11371(); | |
1283 | |
1284 // The following functions are internal to the regexp tests. | |
1285 virtual void assertUText(const char *expected, UText *actual, const char *f
ile, int line); | |
1286 diff --git a/source/test/testdata/regextst.txt b/source/test/testdata/regextst.t
xt | |
1287 index 4d2e7f6..d642e8b 100644 | |
1288 --- a/source/test/testdata/regextst.txt | |
1289 +++ b/source/test/testdata/regextst.txt | |
1290 @@ -1201,6 +1201,24 @@ | |
1291 "A|B|\U00012345" "hello <0>\U00012345</0>" | |
1292 "A|B|\U00010000" "hello \ud800" | |
1293 | |
1294 +# Bug 11369 | |
1295 +# Incorrect optimization of patterns with a zero length quantifier {0} | |
1296 + | |
1297 +"(.|b)(|b){0}\$(?#xxx){3}(?>\D*)" "AAAAABBBBBCCCCCDDDDEEEEE" | |
1298 +"(|b)ab(c)" "<0><1></1>ab<2>c</2></0>" | |
1299 +"(|b){0}a{3}(D*)" "<0>aaa<2></2></0>" | |
1300 +"(|b){0,1}a{3}(D*)" "<0><1></1>aaa<2></2></0>" | |
1301 +"((|b){0})a{3}(D*)" "<0><1></1>aaa<3></3></0>" | |
1302 + | |
1303 +# Bug 11370 | |
1304 +# Max match length computation of look-behind expression gives result that is
too big to fit in the | |
1305 +# in the 24 bit operand portion of the compiled code. Expressions should fail
to compile | |
1306 +# (Look-behind match length must be bounded. This case is treated as unbounde
d, an error.) | |
1307 + | |
1308 +"(?<!(0123456789a){10000000})x" E "no match" | |
1309 +"(?<!\\ubeaf(\\ubeaf{11000}){11000})" E "no match" | |
1310 + | |
1311 + | |
1312 # Random debugging, Temporary | |
1313 # | |
1314 | |
OLD | NEW |