Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(60)

Side by Side Diff: patches/regex.patch

Issue 2442923002: ICU update to 58 part 2 (Closed)
Patch Set: apply more patches and updates; almost ready to roll Created 4 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « patches/measure_format.patch ('k') | patches/regexcmp.patch » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 diff --git a/source/common/unicode/utypes.h b/source/common/unicode/utypes.h
2 index 704089c..1824625 100644
3 --- a/source/common/unicode/utypes.h
4 +++ b/source/common/unicode/utypes.h
5 @@ -305,7 +305,7 @@ typedef double UDate;
6 #define U_IO_API
7 #define U_TOOLUTIL_API
8 #elif defined(U_COMMON_IMPLEMENTATION)
9 -#define U_DATA_API U_IMPORT
10 +#define U_DATA_API U_EXPORT
11 #define U_COMMON_API U_EXPORT
12 #define U_I18N_API U_IMPORT
13 #define U_LAYOUT_API U_IMPORT
14 @@ -647,6 +647,7 @@ typedef enum UErrorCode {
15 U_REGEX_STACK_OVERFLOW, /**< Regular expression backtrack stack overflow. */
16 U_REGEX_TIME_OUT, /**< Maximum allowed match time exceeded */
17 U_REGEX_STOPPED_BY_CALLER, /**< Matching operation aborted by user callback fn. */
18 + U_REGEX_PATTERN_TOO_BIG, /**< Pattern exceeds limits on size or complexity. @draft ICU 55 */
19 U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
20
21 /*
22 diff --git a/source/common/utypes.c b/source/common/utypes.c
23 index c28e727..32b6d88 100644
24 --- a/source/common/utypes.c
25 +++ b/source/common/utypes.c
26 @@ -1,7 +1,7 @@
27 /*
28 ******************************************************************************
29 *
30 -* Copyright (C) 1997-2011, International Business Machines
31 +* Copyright (C) 1997-2014, International Business Machines
32 * Corporation and others. All Rights Reserved.
33 *
34 ******************************************************************************
35 @@ -165,7 +165,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
36 "U_REGEX_INVALID_RANGE",
37 "U_REGEX_STACK_OVERFLOW",
38 "U_REGEX_TIME_OUT",
39 - "U_REGEX_STOPPED_BY_CALLER"
40 + "U_REGEX_STOPPED_BY_CALLER",
41 + "U_REGEX_PATTERN_TOO_BIG"
42 };
43
44 static const char * const
45 diff --git a/source/i18n/regexcmp.cpp b/source/i18n/regexcmp.cpp
46 index 0816eec..0c2196f 100644
47 --- a/source/i18n/regexcmp.cpp
48 +++ b/source/i18n/regexcmp.cpp
49 @@ -301,7 +301,7 @@ void RegexCompile::compile(
50 // present in the saved state: the input string position (int64_t) and
51 // the position in the compiled pattern.
52 //
53 - fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT;
54 + allocateStackData(RESTACKFRAME_HDRCOUNT);
55
56 //
57 // Optimization pass 1: NOPs, back-references, and case-folding
58 @@ -367,9 +367,9 @@ UBool RegexCompile::doParseActions(int32_t action)
59 // the start of an ( grouping.
60 //4 NOP Resreved, will be replaced by a save if there are
61 // OR | operators at the top level
62 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus );
63 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus);
64 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
65 + appendOp(URX_STATE_SAVE, 2);
66 + appendOp(URX_JMP, 3);
67 + appendOp(URX_FAIL, 0);
68
69 // Standard open nonCapture paren action emits the two NOPs and
70 // sets up the paren stack frame.
71 @@ -392,7 +392,7 @@ UBool RegexCompile::doParseActions(int32_t action)
72 }
73
74 // add the END operation to the compiled pattern.
75 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
76 + appendOp(URX_END, 0);
77
78 // Terminate the pattern compilation state machine.
79 returnVal = FALSE;
80 @@ -414,14 +414,13 @@ UBool RegexCompile::doParseActions(int32_t action)
81 int32_t savePosition = fParenStack.popi();
82 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition );
83 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserve d location
84 - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
85 + op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
86 fRXPat->fCompiledPat->setElementAt(op, savePosition);
87
88 // Append an JMP operation into the compiled pattern. The operand for
89 // the JMP will eventually be the location following the ')' for the
90 // group. This will be patched in later, when the ')' is encountered.
91 - op = URX_BUILD(URX_JMP, 0);
92 - fRXPat->fCompiledPat->addElement(op, *fStatus);
93 + appendOp(URX_JMP, 0);
94
95 // Push the position of the newly added JMP op onto the parentheses stack.
96 // This registers if for fixup when this block's close paren is encountered.
97 @@ -430,7 +429,7 @@ UBool RegexCompile::doParseActions(int32_t action)
98 // Append a NOP to the compiled pattern. This is the slot reserved
99 // for a SAVE in the event that there is yet another '|' following
100 // this one.
101 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
102 + appendOp(URX_NOP, 0);
103 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
104 }
105 break;
106 @@ -456,12 +455,10 @@ UBool RegexCompile::doParseActions(int32_t action)
107 // END_CAPTURE is encountered.
108 {
109 fixLiterals();
110 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
111 - int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame.
112 - fRXPat->fFrameSize += 3;
113 - int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc);
114 - fRXPat->fCompiledPat->addElement(cop, *fStatus);
115 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
116 + appendOp(URX_NOP, 0);
117 + int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame.
118 + appendOp(URX_START_CAPTURE, varsLoc);
119 + appendOp(URX_NOP, 0);
120
121 // On the Parentheses stack, start a new frame and add the postions
122 // of the two NOPs. Depending on what follows in the pattern, th e
123 @@ -486,8 +483,8 @@ UBool RegexCompile::doParseActions(int32_t action)
124 // is an '|' alternation within the parens.
125 {
126 fixLiterals();
127 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
128 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
129 + appendOp(URX_NOP, 0);
130 + appendOp(URX_NOP, 0);
131
132 // On the Parentheses stack, start a new frame and add the postions
133 // of the two NOPs.
134 @@ -509,12 +506,10 @@ UBool RegexCompile::doParseActions(int32_t action)
135 // is an '|' alternation within the parens.
136 {
137 fixLiterals();
138 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
139 - int32_t varLoc = fRXPat->fDataSize; // Reserve a data locati on for saving the
140 - fRXPat->fDataSize += 1; // state stack ptr.
141 - int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc);
142 - fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
143 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
144 + appendOp(URX_NOP, 0);
145 + int32_t varLoc = allocateData(1); // Reserve a data location fo r saving the state stack ptr.
146 + appendOp(URX_STO_SP, varLoc);
147 + appendOp(URX_NOP, 0);
148
149 // On the Parentheses stack, start a new frame and add the postions
150 // of the two NOPs. Depending on what follows in the pattern, th e
151 @@ -557,26 +552,14 @@ UBool RegexCompile::doParseActions(int32_t action)
152 // Two data slots are reserved, for saving the stack ptr and the input position.
153 {
154 fixLiterals();
155 - int32_t dataLoc = fRXPat->fDataSize;
156 - fRXPat->fDataSize += 2;
157 - int32_t op = URX_BUILD(URX_LA_START, dataLoc);
158 - fRXPat->fCompiledPat->addElement(op, *fStatus);
159 -
160 - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
161 - fRXPat->fCompiledPat->addElement(op, *fStatus);
162 -
163 - op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
164 - fRXPat->fCompiledPat->addElement(op, *fStatus);
165 -
166 - op = URX_BUILD(URX_LA_END, dataLoc);
167 - fRXPat->fCompiledPat->addElement(op, *fStatus);
168 -
169 - op = URX_BUILD(URX_BACKTRACK, 0);
170 - fRXPat->fCompiledPat->addElement(op, *fStatus);
171 -
172 - op = URX_BUILD(URX_NOP, 0);
173 - fRXPat->fCompiledPat->addElement(op, *fStatus);
174 - fRXPat->fCompiledPat->addElement(op, *fStatus);
175 + int32_t dataLoc = allocateData(2);
176 + appendOp(URX_LA_START, dataLoc);
177 + appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
178 + appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
179 + appendOp(URX_LA_END, dataLoc);
180 + appendOp(URX_BACKTRACK, 0);
181 + appendOp(URX_NOP, 0);
182 + appendOp(URX_NOP, 0);
183
184 // On the Parentheses stack, start a new frame and add the postions
185 // of the NOPs.
186 @@ -601,16 +584,10 @@ UBool RegexCompile::doParseActions(int32_t action)
187 // an alternate (transparent) re gion.
188 {
189 fixLiterals();
190 - int32_t dataLoc = fRXPat->fDataSize;
191 - fRXPat->fDataSize += 2;
192 - int32_t op = URX_BUILD(URX_LA_START, dataLoc);
193 - fRXPat->fCompiledPat->addElement(op, *fStatus);
194 -
195 - op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patch ed later.
196 - fRXPat->fCompiledPat->addElement(op, *fStatus);
197 -
198 - op = URX_BUILD(URX_NOP, 0);
199 - fRXPat->fCompiledPat->addElement(op, *fStatus);
200 + int32_t dataLoc = allocateData(2);
201 + appendOp(URX_LA_START, dataLoc);
202 + appendOp(URX_STATE_SAVE, 0); // dest address will be patched lat er.
203 + appendOp(URX_NOP, 0);
204
205 // On the Parentheses stack, start a new frame and add the postions
206 // of the StateSave and NOP.
207 @@ -648,23 +625,19 @@ UBool RegexCompile::doParseActions(int32_t action)
208 fixLiterals();
209
210 // Allocate data space
211 - int32_t dataLoc = fRXPat->fDataSize;
212 - fRXPat->fDataSize += 4;
213 + int32_t dataLoc = allocateData(4);
214
215 // Emit URX_LB_START
216 - int32_t op = URX_BUILD(URX_LB_START, dataLoc);
217 - fRXPat->fCompiledPat->addElement(op, *fStatus);
218 + appendOp(URX_LB_START, dataLoc);
219
220 // Emit URX_LB_CONT
221 - op = URX_BUILD(URX_LB_CONT, dataLoc);
222 - fRXPat->fCompiledPat->addElement(op, *fStatus);
223 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
224 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
225 + appendOp(URX_LB_CONT, dataLoc);
226 + appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
227 + appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
228
229 - // Emit the NOP
230 - op = URX_BUILD(URX_NOP, 0);
231 - fRXPat->fCompiledPat->addElement(op, *fStatus);
232 - fRXPat->fCompiledPat->addElement(op, *fStatus);
233 + // Emit the NOPs
234 + appendOp(URX_NOP, 0);
235 + appendOp(URX_NOP, 0);
236
237 // On the Parentheses stack, start a new frame and add the postions
238 // of the URX_LB_CONT and the NOP.
239 @@ -704,24 +677,20 @@ UBool RegexCompile::doParseActions(int32_t action)
240 fixLiterals();
241
242 // Allocate data space
243 - int32_t dataLoc = fRXPat->fDataSize;
244 - fRXPat->fDataSize += 4;
245 + int32_t dataLoc = allocateData(4);
246
247 // Emit URX_LB_START
248 - int32_t op = URX_BUILD(URX_LB_START, dataLoc);
249 - fRXPat->fCompiledPat->addElement(op, *fStatus);
250 + appendOp(URX_LB_START, dataLoc);
251
252 // Emit URX_LBN_CONT
253 - op = URX_BUILD(URX_LBN_CONT, dataLoc);
254 - fRXPat->fCompiledPat->addElement(op, *fStatus);
255 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
256 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
257 - fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later.
258 + appendOp(URX_LBN_CONT, dataLoc);
259 + appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
260 + appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
261 + appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later.
262
263 - // Emit the NOP
264 - op = URX_BUILD(URX_NOP, 0);
265 - fRXPat->fCompiledPat->addElement(op, *fStatus);
266 - fRXPat->fCompiledPat->addElement(op, *fStatus);
267 + // Emit the NOPs
268 + appendOp(URX_NOP, 0);
269 + appendOp(URX_NOP, 0);
270
271 // On the Parentheses stack, start a new frame and add the postions
272 // of the URX_LB_CONT and the NOP.
273 @@ -791,12 +760,9 @@ UBool RegexCompile::doParseActions(int32_t action)
274
275 if (URX_TYPE(repeatedOp) == URX_SETREF) {
276 // Emit optimized code for [char set]+
277 - int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
278 - fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
279 - frameLoc = fRXPat->fFrameSize;
280 - fRXPat->fFrameSize++;
281 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
282 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
283 + appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
284 + frameLoc = allocateStackData(1);
285 + appendOp(URX_LOOP_C, frameLoc);
286 break;
287 }
288
289 @@ -804,7 +770,7 @@ UBool RegexCompile::doParseActions(int32_t action)
290 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
291 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
292 // Emit Optimized code for .+ operations.
293 - int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
294 + int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
295 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
296 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
297 loopOpI |= 1;
298 @@ -812,11 +778,9 @@ UBool RegexCompile::doParseActions(int32_t action)
299 if (fModeFlags & UREGEX_UNIX_LINES) {
300 loopOpI |= 2;
301 }
302 - fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
303 - frameLoc = fRXPat->fFrameSize;
304 - fRXPat->fFrameSize++;
305 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
306 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
307 + appendOp(loopOpI);
308 + frameLoc = allocateStackData(1);
309 + appendOp(URX_LOOP_C, frameLoc);
310 break;
311 }
312
313 @@ -830,18 +794,15 @@ UBool RegexCompile::doParseActions(int32_t action)
314 // Zero length match is possible.
315 // Emit the code sequence that can handle it.
316 insertOp(topLoc);
317 - frameLoc = fRXPat->fFrameSize;
318 - fRXPat->fFrameSize++;
319 + frameLoc = allocateStackData(1);
320
321 - int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
322 + int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
323 fRXPat->fCompiledPat->setElementAt(op, topLoc);
324
325 - op = URX_BUILD(URX_JMP_SAV_X, topLoc+1);
326 - fRXPat->fCompiledPat->addElement(op, *fStatus);
327 + appendOp(URX_JMP_SAV_X, topLoc+1);
328 } else {
329 // Simpler code when the repeated body must match something non -empty
330 - int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc);
331 - fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
332 + appendOp(URX_JMP_SAV, topLoc);
333 }
334 }
335 break;
336 @@ -853,8 +814,7 @@ UBool RegexCompile::doParseActions(int32_t action)
337 // 3. ...
338 {
339 int32_t topLoc = blockTopLoc(FALSE);
340 - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
341 - fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
342 + appendOp(URX_STATE_SAVE, topLoc);
343 }
344 break;
345
346 @@ -868,7 +828,7 @@ UBool RegexCompile::doParseActions(int32_t action)
347 // Insert the state save into the compiled pattern, and we're done.
348 {
349 int32_t saveStateLoc = blockTopLoc(TRUE);
350 - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
351 + int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
352 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
353 }
354 break;
355 @@ -887,14 +847,12 @@ UBool RegexCompile::doParseActions(int32_t action)
356 int32_t jmp1_loc = blockTopLoc(TRUE);
357 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
358
359 - int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1);
360 + int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1);
361 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
362
363 - int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2);
364 - fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);
365 + appendOp(URX_JMP, jmp2_loc+2);
366
367 - int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
368 - fRXPat->fCompiledPat->addElement(save_op, *fStatus);
369 + appendOp(URX_STATE_SAVE, jmp1_loc+1);
370 }
371 break;
372
373 @@ -934,12 +892,10 @@ UBool RegexCompile::doParseActions(int32_t action)
374
375 if (URX_TYPE(repeatedOp) == URX_SETREF) {
376 // Emit optimized code for a [char set]*
377 - int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
378 + int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
379 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
380 - dataLoc = fRXPat->fFrameSize;
381 - fRXPat->fFrameSize++;
382 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
383 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
384 + dataLoc = allocateStackData(1);
385 + appendOp(URX_LOOP_C, dataLoc);
386 break;
387 }
388
389 @@ -947,7 +903,7 @@ UBool RegexCompile::doParseActions(int32_t action)
390 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
391 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
392 // Emit Optimized code for .* operations.
393 - int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
394 + int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
395 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
396 // URX_LOOP_DOT_I operand is a flag indicating . matche s any mode.
397 loopOpI |= 1;
398 @@ -956,10 +912,8 @@ UBool RegexCompile::doParseActions(int32_t action)
399 loopOpI |= 2;
400 }
401 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
402 - dataLoc = fRXPat->fFrameSize;
403 - fRXPat->fFrameSize++;
404 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
405 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
406 + dataLoc = allocateStackData(1);
407 + appendOp(URX_LOOP_C, dataLoc);
408 break;
409 }
410 }
411 @@ -968,30 +922,29 @@ UBool RegexCompile::doParseActions(int32_t action)
412 // The optimizations did not apply.
413
414 int32_t saveStateLoc = blockTopLoc(TRUE);
415 - int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1);
416 + int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1);
417
418 // Check for minimum match length of zero, which requires
419 // extra loop-breaking code.
420 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
421 insertOp(saveStateLoc);
422 - dataLoc = fRXPat->fFrameSize;
423 - fRXPat->fFrameSize++;
424 + dataLoc = allocateStackData(1);
425
426 - int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
427 + int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
428 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
429 - jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2);
430 + jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
431 }
432
433 // Locate the position in the compiled pattern where the match will continue
434 // after completing the *. (4 or 5 in the comment above)
435 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
436
437 - // Put together the save state op store it into the compiled code.
438 - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
439 + // Put together the save state op and store it into the compiled code.
440 + int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
441 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
442
443 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
444 - fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
445 + appendOp(jmpOp);
446 }
447 break;
448
449 @@ -1005,10 +958,9 @@ UBool RegexCompile::doParseActions(int32_t action)
450 {
451 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
452 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
453 - int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc);
454 - int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
455 + int32_t jmpOp = buildOp(URX_JMP, saveLoc);
456 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
457 - fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
458 + appendOp(URX_STATE_SAVE, jmpLoc+1);
459 }
460 break;
461
462 @@ -1077,9 +1029,9 @@ UBool RegexCompile::doParseActions(int32_t action)
463
464 // First the STO_SP before the start of the loop
465 insertOp(topLoc);
466 - int32_t varLoc = fRXPat->fDataSize; // Reserve a data locati on for saving the
467 - fRXPat->fDataSize += 1; // state stack ptr.
468 - int32_t op = URX_BUILD(URX_STO_SP, varLoc);
469 +
470 + int32_t varLoc = allocateData(1); // Reserve a data location for saving the
471 + int32_t op = buildOp(URX_STO_SP, varLoc);
472 fRXPat->fCompiledPat->setElementAt(op, topLoc);
473
474 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
475 @@ -1088,8 +1040,7 @@ UBool RegexCompile::doParseActions(int32_t action)
476 fRXPat->fCompiledPat->push(loopOp, *fStatus);
477
478 // Then the LD_SP after the end of the loop
479 - op = URX_BUILD(URX_LD_SP, varLoc);
480 - fRXPat->fCompiledPat->addElement(op, *fStatus);
481 + appendOp(URX_LD_SP, varLoc);
482 }
483
484 break;
485 @@ -1125,55 +1076,49 @@ UBool RegexCompile::doParseActions(int32_t action)
486 // scanned a ".", match any single character.
487 {
488 fixLiterals(FALSE);
489 - int32_t op;
490 if (fModeFlags & UREGEX_DOTALL) {
491 - op = URX_BUILD(URX_DOTANY_ALL, 0);
492 + appendOp(URX_DOTANY_ALL, 0);
493 } else if (fModeFlags & UREGEX_UNIX_LINES) {
494 - op = URX_BUILD(URX_DOTANY_UNIX, 0);
495 + appendOp(URX_DOTANY_UNIX, 0);
496 } else {
497 - op = URX_BUILD(URX_DOTANY, 0);
498 + appendOp(URX_DOTANY, 0);
499 }
500 - fRXPat->fCompiledPat->addElement(op, *fStatus);
501 }
502 break;
503
504 case doCaret:
505 {
506 fixLiterals(FALSE);
507 - int32_t op = 0;
508 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
509 - op = URX_CARET;
510 + appendOp(URX_CARET, 0);
511 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
512 - op = URX_CARET_M;
513 + appendOp(URX_CARET_M, 0);
514 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
515 - op = URX_CARET; // Only testing true start of input.
516 + appendOp(URX_CARET, 0); // Only testing true start of input.
517 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
518 - op = URX_CARET_M_UNIX;
519 + appendOp(URX_CARET_M_UNIX, 0);
520 }
521 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
522 }
523 break;
524
525 case doDollar:
526 {
527 fixLiterals(FALSE);
528 - int32_t op = 0;
529 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
530 - op = URX_DOLLAR;
531 + appendOp(URX_DOLLAR, 0);
532 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
533 - op = URX_DOLLAR_M;
534 + appendOp(URX_DOLLAR_M, 0);
535 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
536 - op = URX_DOLLAR_D;
537 + appendOp(URX_DOLLAR_D, 0);
538 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
539 - op = URX_DOLLAR_MD;
540 + appendOp(URX_DOLLAR_MD, 0);
541 }
542 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
543 }
544 break;
545
546 case doBackslashA:
547 fixLiterals(FALSE);
548 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
549 + appendOp(URX_CARET, 0);
550 break;
551
552 case doBackslashB:
553 @@ -1185,7 +1130,7 @@ UBool RegexCompile::doParseActions(int32_t action)
554 #endif
555 fixLiterals(FALSE);
556 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
557 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus);
558 + appendOp(op, 1);
559 }
560 break;
561
562 @@ -1198,63 +1143,59 @@ UBool RegexCompile::doParseActions(int32_t action)
563 #endif
564 fixLiterals(FALSE);
565 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
566 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
567 + appendOp(op, 0);
568 }
569 break;
570
571 case doBackslashD:
572 fixLiterals(FALSE);
573 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
574 + appendOp(URX_BACKSLASH_D, 1);
575 break;
576
577 case doBackslashd:
578 fixLiterals(FALSE);
579 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
580 + appendOp(URX_BACKSLASH_D, 0);
581 break;
582
583 case doBackslashG:
584 fixLiterals(FALSE);
585 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
586 + appendOp(URX_BACKSLASH_G, 0);
587 break;
588
589 case doBackslashS:
590 fixLiterals(FALSE);
591 - fRXPat->fCompiledPat->addElement(
592 - URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
593 + appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
594 break;
595
596 case doBackslashs:
597 fixLiterals(FALSE);
598 - fRXPat->fCompiledPat->addElement(
599 - URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
600 + appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
601 break;
602
603 case doBackslashW:
604 fixLiterals(FALSE);
605 - fRXPat->fCompiledPat->addElement(
606 - URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
607 + appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
608 break;
609
610 case doBackslashw:
611 fixLiterals(FALSE);
612 - fRXPat->fCompiledPat->addElement(
613 - URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
614 + appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
615 break;
616
617 case doBackslashX:
618 fixLiterals(FALSE);
619 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatu s);
620 + appendOp(URX_BACKSLASH_X, 0);
621 break;
622
623
624 case doBackslashZ:
625 fixLiterals(FALSE);
626 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
627 + appendOp(URX_DOLLAR, 0);
628 break;
629
630 case doBackslashz:
631 fixLiterals(FALSE);
632 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatu s);
633 + appendOp(URX_BACKSLASH_Z, 0);
634 break;
635
636 case doEscapeError:
637 @@ -1314,13 +1255,11 @@ UBool RegexCompile::doParseActions(int32_t action)
638 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence,
639 // and shouldn't enter this code path a t all.
640 fixLiterals(FALSE);
641 - int32_t op;
642 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
643 - op = URX_BUILD(URX_BACKREF_I, groupNum);
644 + appendOp(URX_BACKREF_I, groupNum);
645 } else {
646 - op = URX_BUILD(URX_BACKREF, groupNum);
647 + appendOp(URX_BACKREF, groupNum);
648 }
649 - fRXPat->fCompiledPat->addElement(op, *fStatus);
650 }
651 break;
652
653 @@ -1341,22 +1280,18 @@ UBool RegexCompile::doParseActions(int32_t action)
654 {
655 // Emit the STO_SP
656 int32_t topLoc = blockTopLoc(TRUE);
657 - int32_t stoLoc = fRXPat->fDataSize;
658 - fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
659 - int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
660 + int32_t stoLoc = allocateData(1); // Reserve the data location f or storing save stack ptr.
661 + int32_t op = buildOp(URX_STO_SP, stoLoc);
662 fRXPat->fCompiledPat->setElementAt(op, topLoc);
663
664 // Emit the STATE_SAVE
665 - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
666 - fRXPat->fCompiledPat->addElement(op, *fStatus);
667 + appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
668
669 // Emit the JMP
670 - op = URX_BUILD(URX_JMP, topLoc+1);
671 - fRXPat->fCompiledPat->addElement(op, *fStatus);
672 + appendOp(URX_JMP, topLoc+1);
673
674 // Emit the LD_SP
675 - op = URX_BUILD(URX_LD_SP, stoLoc);
676 - fRXPat->fCompiledPat->addElement(op, *fStatus);
677 + appendOp(URX_LD_SP, stoLoc);
678 }
679 break;
680
681 @@ -1376,23 +1311,20 @@ UBool RegexCompile::doParseActions(int32_t action)
682 insertOp(topLoc);
683
684 // emit STO_SP loc
685 - int32_t stoLoc = fRXPat->fDataSize;
686 - fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
687 - int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
688 + int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
689 + int32_t op = buildOp(URX_STO_SP, stoLoc);
690 fRXPat->fCompiledPat->setElementAt(op, topLoc);
691
692 // Emit the SAVE_STATE 5
693 int32_t L7 = fRXPat->fCompiledPat->size()+1;
694 - op = URX_BUILD(URX_STATE_SAVE, L7);
695 + op = buildOp(URX_STATE_SAVE, L7);
696 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
697
698 // Append the JMP operation.
699 - op = URX_BUILD(URX_JMP, topLoc+1);
700 - fRXPat->fCompiledPat->addElement(op, *fStatus);
701 + appendOp(URX_JMP, topLoc+1);
702
703 // Emit the LD_SP loc
704 - op = URX_BUILD(URX_LD_SP, stoLoc);
705 - fRXPat->fCompiledPat->addElement(op, *fStatus);
706 + appendOp(URX_LD_SP, stoLoc);
707 }
708 break;
709
710 @@ -1411,19 +1343,17 @@ UBool RegexCompile::doParseActions(int32_t action)
711 insertOp(topLoc);
712
713 // Emit the STO_SP
714 - int32_t stoLoc = fRXPat->fDataSize;
715 - fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr.
716 - int32_t op = URX_BUILD(URX_STO_SP, stoLoc);
717 + int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
718 + int32_t op = buildOp(URX_STO_SP, stoLoc);
719 fRXPat->fCompiledPat->setElementAt(op, topLoc);
720
721 // Emit the SAVE_STATE
722 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
723 - op = URX_BUILD(URX_STATE_SAVE, continueLoc);
724 + op = buildOp(URX_STATE_SAVE, continueLoc);
725 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
726
727 // Emit the LD_SP
728 - op = URX_BUILD(URX_LD_SP, stoLoc);
729 - fRXPat->fCompiledPat->addElement(op, *fStatus);
730 + appendOp(URX_LD_SP, stoLoc);
731 }
732 break;
733
734 @@ -1480,8 +1410,8 @@ UBool RegexCompile::doParseActions(int32_t action)
735 // is an '|' alternation within the parens.
736 {
737 fixLiterals(FALSE);
738 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
739 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
740 + appendOp(URX_NOP, 0);
741 + appendOp(URX_NOP, 0);
742
743 // On the Parentheses stack, start a new frame and add the postions
744 // of the two NOPs (a normal non-capturing () frame, except for t he
745 @@ -1818,7 +1748,6 @@ void RegexCompile::literalChar(UChar32 c) {
746 //
747 //----------------------------------------------------------------------------- -
748 void RegexCompile::fixLiterals(UBool split) {
749 - int32_t op = 0; // An op from/for the compiled pattern.
750
751 // If no literal characters have been scanned but not yet had code generated
752 // for them, nothing needs to be done.
753 @@ -1857,23 +1786,23 @@ void RegexCompile::fixLiterals(UBool split) {
754 // Single character, emit a URX_ONECHAR op to match it.
755 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
756 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
757 - op = URX_BUILD(URX_ONECHAR_I, lastCodePoint);
758 + appendOp(URX_ONECHAR_I, lastCodePoint);
759 } else {
760 - op = URX_BUILD(URX_ONECHAR, lastCodePoint);
761 + appendOp(URX_ONECHAR, lastCodePoint);
762 }
763 - fRXPat->fCompiledPat->addElement(op, *fStatus);
764 } else {
765 // Two or more chars, emit a URX_STRING to match them.
766 + if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
767 + error(U_REGEX_PATTERN_TOO_BIG);
768 + }
769 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
770 - op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length());
771 + appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
772 } else {
773 // TODO here: add optimization to split case sensitive strings of length two
774 // into two single char ops, for efficiency.
775 - op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length());
776 + appendOp(URX_STRING, fRXPat->fLiteralText.length());
777 }
778 - fRXPat->fCompiledPat->addElement(op, *fStatus);
779 - op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length());
780 - fRXPat->fCompiledPat->addElement(op, *fStatus);
781 + appendOp(URX_STRING_LEN, fLiteralChars.length());
782
783 // Add this string into the accumulated strings of the compiled pattern .
784 fRXPat->fLiteralText.append(fLiteralChars);
785 @@ -1883,8 +1812,58 @@ void RegexCompile::fixLiterals(UBool split) {
786 }
787
788
789 +int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
790 + if (U_FAILURE(*fStatus)) {
791 + return 0;
792 + }
793 + if (type < 0 || type > 255) {
794 + U_ASSERT(FALSE);
795 + error(U_REGEX_INTERNAL_ERROR);
796 + type = URX_RESERVED_OP;
797 + }
798 + if (val > 0x00ffffff) {
799 + U_ASSERT(FALSE);
800 + error(U_REGEX_INTERNAL_ERROR);
801 + val = 0;
802 + }
803 + if (val < 0) {
804 + if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
805 + U_ASSERT(FALSE);
806 + error(U_REGEX_INTERNAL_ERROR);
807 + return -1;
808 + }
809 + if (URX_TYPE(val) != 0xff) {
810 + U_ASSERT(FALSE);
811 + error(U_REGEX_INTERNAL_ERROR);
812 + return -1;
813 + }
814 + type = URX_RESERVED_OP_N;
815 + }
816 + return (type << 24) | val;
817 +}
818
819
820 +//----------------------------------------------------------------------------- -
821 +//
822 +// appendOp() Append a new instruction onto the compiled pattern
823 +// Includes error checking, limiting the size of the
824 +// pattern to lengths that can be represented in the
825 +// 24 bit operand field of an instruction.
826 +//
827 +//----------------------------------------------------------------------------- -
828 +void RegexCompile::appendOp(int32_t op) {
829 + if (U_FAILURE(*fStatus)) {
830 + return;
831 + }
832 + fRXPat->fCompiledPat->addElement(op, *fStatus);
833 + if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
834 + error(U_REGEX_PATTERN_TOO_BIG);
835 + }
836 +}
837 +
838 +void RegexCompile::appendOp(int32_t type, int32_t val) {
839 + appendOp(buildOp(type, val));
840 +}
841
842
843 //----------------------------------------------------------------------------- -
844 @@ -1900,7 +1879,7 @@ void RegexCompile::insertOp(int32_t where) {
845 UVector64 *code = fRXPat->fCompiledPat;
846 U_ASSERT(where>0 && where < code->size());
847
848 - int32_t nop = URX_BUILD(URX_NOP, 0);
849 + int32_t nop = buildOp(URX_NOP, 0);
850 code->insertElementAt(nop, where, *fStatus);
851
852 // Walk through the pattern, looking for any ops with targets that
853 @@ -1921,7 +1900,7 @@ void RegexCompile::insertOp(int32_t where) {
854 // Target location for this opcode is after the insertion point and
855 // needs to be incremented to adjust for the insertion.
856 opValue++;
857 - op = URX_BUILD(opType, opValue);
858 + op = buildOp(opType, opValue);
859 code->setElementAt(op, loc);
860 }
861 }
862 @@ -1946,6 +1925,58 @@ void RegexCompile::insertOp(int32_t where) {
863 }
864
865
866 +//----------------------------------------------------------------------------- -
867 +//
868 +// allocateData() Allocate storage in the matcher's static data area.
869 +// Return the index for the newly allocated data.
870 +// The storage won't actually exist until we are running a match
871 +// operation, but the storage indexes are inserted into various
872 +// opcodes while compiling the pattern.
873 +//
874 +//----------------------------------------------------------------------------- -
875 +int32_t RegexCompile::allocateData(int32_t size) {
876 + if (U_FAILURE(*fStatus)) {
877 + return 0;
878 + }
879 + if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
880 + error(U_REGEX_INTERNAL_ERROR);
881 + return 0;
882 + }
883 + int32_t dataIndex = fRXPat->fDataSize;
884 + fRXPat->fDataSize += size;
885 + if (fRXPat->fDataSize >= 0x00fffff0) {
886 + error(U_REGEX_INTERNAL_ERROR);
887 + }
888 + return dataIndex;
889 +}
890 +
891 +
892 +//----------------------------------------------------------------------------- -
893 +//
894 +// allocateStackData() Allocate space in the back-tracking stack frame.
895 +// Return the index for the newly allocated data.
896 +// The frame indexes are inserted into various
897 +// opcodes while compiling the pattern, meaning that frame
898 +// size must be restricted to the size that will fit
899 +// as an operand (24 bits).
900 +//
901 +//----------------------------------------------------------------------------- -
902 +int32_t RegexCompile::allocateStackData(int32_t size) {
903 + if (U_FAILURE(*fStatus)) {
904 + return 0;
905 + }
906 + if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
907 + error(U_REGEX_INTERNAL_ERROR);
908 + return 0;
909 + }
910 + int32_t dataIndex = fRXPat->fFrameSize;
911 + fRXPat->fFrameSize += size;
912 + if (fRXPat->fFrameSize >= 0x00fffff0) {
913 + error(U_REGEX_PATTERN_TOO_BIG);
914 + }
915 + return dataIndex;
916 +}
917 +
918
919 //----------------------------------------------------------------------------- -
920 //
921 @@ -1988,7 +2019,7 @@ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
922 theLoc--;
923 }
924 if (reserveLoc) {
925 - int32_t nop = URX_BUILD(URX_NOP, 0);
926 + int32_t nop = buildOp(URX_NOP, 0);
927 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
928 }
929 }
930 @@ -2063,8 +2094,7 @@ void RegexCompile::handleCloseParen() {
931 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
932
933 int32_t frameVarLocation = URX_VAL(captureOp);
934 - int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocatio n);
935 - fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
936 + appendOp(URX_END_CAPTURE, frameVarLocation);
937 }
938 break;
939 case atomic:
940 @@ -2075,8 +2105,7 @@ void RegexCompile::handleCloseParen() {
941 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO penParen+1);
942 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
943 int32_t stoLoc = URX_VAL(stoOp);
944 - int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc);
945 - fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
946 + appendOp(URX_LD_SP, stoLoc);
947 }
948 break;
949
950 @@ -2085,8 +2114,7 @@ void RegexCompile::handleCloseParen() {
951 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch OpenParen-5);
952 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
953 int32_t dataLoc = URX_VAL(startOp);
954 - int32_t op = URX_BUILD(URX_LA_END, dataLoc);
955 - fRXPat->fCompiledPat->addElement(op, *fStatus);
956 + appendOp(URX_LA_END, dataLoc);
957 }
958 break;
959
960 @@ -2096,19 +2124,16 @@ void RegexCompile::handleCloseParen() {
961 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch OpenParen-1);
962 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
963 int32_t dataLoc = URX_VAL(startOp);
964 - int32_t op = URX_BUILD(URX_LA_END, dataLoc);
965 - fRXPat->fCompiledPat->addElement(op, *fStatus);
966 - op = URX_BUILD(URX_BACKTRACK, 0);
967 - fRXPat->fCompiledPat->addElement(op, *fStatus);
968 - op = URX_BUILD(URX_LA_END, dataLoc);
969 - fRXPat->fCompiledPat->addElement(op, *fStatus);
970 + appendOp(URX_LA_END, dataLoc);
971 + appendOp(URX_BACKTRACK, 0);
972 + appendOp(URX_LA_END, dataLoc);
973
974 // Patch the URX_SAVE near the top of the block.
975 // The destination of the SAVE is the final LA_END that was just added.
976 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch OpenParen);
977 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
978 int32_t dest = fRXPat->fCompiledPat->size()-1;
979 - saveOp = URX_BUILD(URX_STATE_SAVE, dest);
980 + saveOp = buildOp(URX_STATE_SAVE, dest);
981 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
982 }
983 break;
984 @@ -2121,10 +2146,8 @@ void RegexCompile::handleCloseParen() {
985 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch OpenParen-4);
986 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
987 int32_t dataLoc = URX_VAL(startOp);
988 - int32_t op = URX_BUILD(URX_LB_END, dataLoc);
989 - fRXPat->fCompiledPat->addElement(op, *fStatus);
990 - op = URX_BUILD(URX_LA_END, dataLoc);
991 - fRXPat->fCompiledPat->addElement(op, *fStatus);
992 + appendOp(URX_LB_END, dataLoc);
993 + appendOp(URX_LA_END, dataLoc);
994
995 // Determine the min and max bounds for the length of the
996 // string that the pattern can match.
997 @@ -2132,6 +2155,10 @@ void RegexCompile::handleCloseParen() {
998 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
999 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
1000 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
1001 + if (URX_TYPE(maxML) != 0) {
1002 + error(U_REGEX_LOOK_BEHIND_LIMIT);
1003 + break;
1004 + }
1005 if (maxML == INT32_MAX) {
1006 error(U_REGEX_LOOK_BEHIND_LIMIT);
1007 break;
1008 @@ -2156,8 +2183,7 @@ void RegexCompile::handleCloseParen() {
1009 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch OpenParen-5);
1010 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
1011 int32_t dataLoc = URX_VAL(startOp);
1012 - int32_t op = URX_BUILD(URX_LBN_END, dataLoc);
1013 - fRXPat->fCompiledPat->addElement(op, *fStatus);
1014 + appendOp(URX_LBN_END, dataLoc);
1015
1016 // Determine the min and max bounds for the length of the
1017 // string that the pattern can match.
1018 @@ -2165,6 +2191,10 @@ void RegexCompile::handleCloseParen() {
1019 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
1020 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
1021 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
1022 + if (URX_TYPE(maxML) != 0) {
1023 + error(U_REGEX_LOOK_BEHIND_LIMIT);
1024 + break;
1025 + }
1026 if (maxML == INT32_MAX) {
1027 error(U_REGEX_LOOK_BEHIND_LIMIT);
1028 break;
1029 @@ -2178,7 +2208,7 @@ void RegexCompile::handleCloseParen() {
1030
1031 // Insert the pattern location to continue at after a successful match
1032 // as the last operand of the URX_LBN_CONT
1033 - op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
1034 + int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()) ;
1035 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
1036 }
1037 break;
1038 @@ -2219,7 +2249,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
1039 case 0:
1040 {
1041 // Set of no elements. Always fails to match.
1042 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fSta tus);
1043 + appendOp(URX_BACKTRACK, 0);
1044 delete theSet;
1045 }
1046 break;
1047 @@ -2240,8 +2270,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
1048 // Put it into the compiled pattern as a set.
1049 int32_t setNumber = fRXPat->fSets->size();
1050 fRXPat->fSets->addElement(theSet, *fStatus);
1051 - int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
1052 - fRXPat->fCompiledPat->addElement(setOp, *fStatus);
1053 + appendOp(URX_SETREF, setNumber);
1054 }
1055 }
1056 }
1057 @@ -2280,13 +2309,10 @@ void RegexCompile::compileInterval(int32_t InitOp , int32_t LoopOp)
1058 // counterLoc --> Loop counter
1059 // +1 --> Input index (for breaking non-progressing loops)
1060 // (Only present if unbounded upper limit on loop)
1061 - int32_t counterLoc = fRXPat->fFrameSize;
1062 - fRXPat->fFrameSize++;
1063 - if (fIntervalUpper < 0) {
1064 - fRXPat->fFrameSize++;
1065 - }
1066 + int32_t dataSize = fIntervalUpper < 0 ? 2 : 1;
1067 + int32_t counterLoc = allocateStackData(dataSize);
1068
1069 - int32_t op = URX_BUILD(InitOp, counterLoc);
1070 + int32_t op = buildOp(InitOp, counterLoc);
1071 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
1072
1073 // The second operand of CTR_INIT is the location following the end of the loop.
1074 @@ -2294,7 +2320,7 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
1075 // compilation of something later on causes the code to grow and the target
1076 // position to move.
1077 int32_t loopEnd = fRXPat->fCompiledPat->size();
1078 - op = URX_BUILD(URX_RELOC_OPRND, loopEnd);
1079 + op = buildOp(URX_RELOC_OPRND, loopEnd);
1080 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
1081
1082 // Followed by the min and max counts.
1083 @@ -2303,8 +2329,7 @@ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
1084
1085 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
1086 // Goes at end of the block being looped over, so just append to the code so far.
1087 - op = URX_BUILD(LoopOp, topOfBlock);
1088 - fRXPat->fCompiledPat->addElement(op, *fStatus);
1089 + appendOp(LoopOp, topOfBlock);
1090
1091 if ((fIntervalLow & 0xff000000) != 0 ||
1092 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
1093 @@ -2328,7 +2353,15 @@ UBool RegexCompile::compileInlineInterval() {
1094 int32_t topOfBlock = blockTopLoc(FALSE);
1095 if (fIntervalUpper == 0) {
1096 // Pathological case. Attempt no matches, as if the block doesn't exist.
1097 + // Discard the generated code for the block.
1098 + // If the block included parens, discard the info pertaining to them as well.
1099 fRXPat->fCompiledPat->setSize(topOfBlock);
1100 + if (fMatchOpenParen >= topOfBlock) {
1101 + fMatchOpenParen = -1;
1102 + }
1103 + if (fMatchCloseParen >= topOfBlock) {
1104 + fMatchCloseParen = -1;
1105 + }
1106 return TRUE;
1107 }
1108
1109 @@ -2349,7 +2382,7 @@ UBool RegexCompile::compileInlineInterval() {
1110 //
1111 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
1112 + fIntervalUpper + (fIntervalUpper-fIntervalLow );
1113 - int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc);
1114 + int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
1115 if (fIntervalLow == 0) {
1116 insertOp(topOfBlock);
1117 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
1118 @@ -2362,13 +2395,10 @@ UBool RegexCompile::compileInlineInterval() {
1119 // it was put there when it was originally encountered.
1120 int32_t i;
1121 for (i=1; i<fIntervalUpper; i++ ) {
1122 - if (i == fIntervalLow) {
1123 - fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
1124 - }
1125 - if (i > fIntervalLow) {
1126 - fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
1127 + if (i >= fIntervalLow) {
1128 + appendOp(saveOp);
1129 }
1130 - fRXPat->fCompiledPat->addElement(op, *fStatus);
1131 + appendOp(op);
1132 }
1133 return TRUE;
1134 }
1135 @@ -3587,7 +3617,7 @@ void RegexCompile::stripNOPs() {
1136 int32_t operandAddress = URX_VAL(op);
1137 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
1138 int32_t fixedOperandAddress = operandAddress - deltas.elementAt i(operandAddress);
1139 - op = URX_BUILD(opType, fixedOperandAddress);
1140 + op = buildOp(opType, fixedOperandAddress);
1141 fRXPat->fCompiledPat->setElementAt(op, dst);
1142 dst++;
1143 break;
1144 @@ -3602,7 +3632,7 @@ void RegexCompile::stripNOPs() {
1145 break;
1146 }
1147 where = fRXPat->fGroupMap->elementAti(where-1);
1148 - op = URX_BUILD(opType, where);
1149 + op = buildOp(opType, where);
1150 fRXPat->fCompiledPat->setElementAt(op, dst);
1151 dst++;
1152
1153 @@ -3954,7 +3984,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
1154 //----------------------------------------------------------------------------- -
1155 //
1156 // scanNamedChar
1157 - // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
1158 +// Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
1159 //
1160 // The scan position will be at the 'N'. On return
1161 // the scan position should be just after the '}'
1162 diff --git a/source/i18n/regexcmp.h b/source/i18n/regexcmp.h
1163 index debdf45..c3cc7db 100644
1164 --- a/source/i18n/regexcmp.h
1165 +++ b/source/i18n/regexcmp.h
1166 @@ -104,6 +104,13 @@ private:
1167 void fixLiterals(UBool split=FALSE); // Generate code for pendi ng literal characters.
1168 void insertOp(int32_t where); // Open up a slot for a ne w op in the
1169 // generated code at the specified location.
1170 + void appendOp(int32_t op); // Append a new op to the compiled pattern.
1171 + void appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern.
1172 + int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode i nstruction.
1173 + int32_t allocateData(int32_t size); // Allocate space in the m atcher data area.
1174 + // Return index of the newly allocated data.
1175 + int32_t allocateStackData(int32_t size); // Allocate space in the m atch back-track stack frame.
1176 + // Return offset index in the frame.
1177 int32_t minMatchLength(int32_t start,
1178 int32_t end);
1179 int32_t maxMatchLength(int32_t start,
1180 @@ -187,7 +194,9 @@ private:
1181 int32_t fMatchOpenParen; // The position in the com piled pattern
1182 // of the slot reserved for a state save
1183 // at the start of the most recently processed
1184 - // parenthesized block.
1185 + // parenthesized block. Updated when processing
1186 + // a close to the location for the corresponding open.
1187 +
1188 int32_t fMatchCloseParen; // The position in the pat tern of the first
1189 // location after the most recently processed
1190 // parenthesized block.
1191 diff --git a/source/i18n/regeximp.h b/source/i18n/regeximp.h
1192 index bdf8403..fdd9c76 100644
1193 --- a/source/i18n/regeximp.h
1194 +++ b/source/i18n/regeximp.h
1195 @@ -1,5 +1,5 @@
1196 //
1197 -// Copyright (C) 2002-2013 International Business Machines Corporation
1198 +// Copyright (C) 2002-2014 International Business Machines Corporation
1199 // and others. All rights reserved.
1200 //
1201 // file: regeximp.h
1202 @@ -241,7 +241,6 @@ enum {
1203 //
1204 // Convenience macros for assembling and disassembling a compiled operation.
1205 //
1206 -#define URX_BUILD(type, val) (int32_t)((type << 24) | (val))
1207 #define URX_TYPE(x) ((uint32_t)(x) >> 24)
1208 #define URX_VAL(x) ((x) & 0xffffff)
1209
1210 diff --git a/source/test/intltest/regextst.cpp b/source/test/intltest/regextst.c pp
1211 index ca2fd21..f440c26 100644
1212 --- a/source/test/intltest/regextst.cpp
1213 +++ b/source/test/intltest/regextst.cpp
1214 @@ -144,6 +144,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, c onst char* &name, ch
1215 case 24: name = "TestBug11049";
1216 if (exec) TestBug11049();
1217 break;
1218 + case 25: name = "TestBug11371";
1219 + if (exec) TestBug11371();
1220 + break;
1221 default: name = "";
1222 break; //needed to end loop
1223 }
1224 @@ -5367,6 +5370,49 @@ void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expec
1225 }
1226
1227
1228 +void RegexTest::TestBug11371() {
1229 + if (quick) {
1230 + logln("Skipping test. Runs in exhuastive mode only.");
1231 + return;
1232 + }
1233 + UErrorCode status = U_ZERO_ERROR;
1234 + UnicodeString patternString;
1235 +
1236 + for (int i=0; i<8000000; i++) {
1237 + patternString.append(UnicodeString("()"));
1238 + }
1239 + LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
1240 + if (status != U_REGEX_PATTERN_TOO_BIG) {
1241 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s .",
1242 + __FILE__, __LINE__, u_errorName(status));
1243 + }
1244 +
1245 + status = U_ZERO_ERROR;
1246 + patternString = "(";
1247 + for (int i=0; i<20000000; i++) {
1248 + patternString.append(UnicodeString("A++"));
1249 + }
1250 + patternString.append(UnicodeString("){0}B++"));
1251 + LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString , 0, status));
1252 + if (status != U_REGEX_PATTERN_TOO_BIG) {
1253 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s .",
1254 + __FILE__, __LINE__, u_errorName(status));
1255 + }
1256 +
1257 + // Pattern with too much string data, such that string indexes overflow operand data field size
1258 + // in compiled instruction.
1259 + status = U_ZERO_ERROR;
1260 + patternString = "";
1261 + while (patternString.length() < 0x00ffffff) {
1262 + patternString.append(UnicodeString("stuff and things dont you know, the se are a few of my favorite strings\n"));
1263 + }
1264 + patternString.append(UnicodeString("X? trailing string"));
1265 + LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString , 0, status));
1266 + if (status != U_REGEX_PATTERN_TOO_BIG) {
1267 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s .",
1268 + __FILE__, __LINE__, u_errorName(status));
1269 + }
1270 +}
1271
1272 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
1273
1274 diff --git a/source/test/intltest/regextst.h b/source/test/intltest/regextst.h
1275 index 28e2121..38cc4ef 100644
1276 --- a/source/test/intltest/regextst.h
1277 +++ b/source/test/intltest/regextst.h
1278 @@ -50,6 +50,7 @@ public:
1279 virtual void Bug10459();
1280 virtual void TestCaseInsensitiveStarters();
1281 virtual void TestBug11049();
1282 + virtual void TestBug11371();
1283
1284 // The following functions are internal to the regexp tests.
1285 virtual void assertUText(const char *expected, UText *actual, const char *f ile, int line);
1286 diff --git a/source/test/testdata/regextst.txt b/source/test/testdata/regextst.t xt
1287 index 4d2e7f6..d642e8b 100644
1288 --- a/source/test/testdata/regextst.txt
1289 +++ b/source/test/testdata/regextst.txt
1290 @@ -1201,6 +1201,24 @@
1291 "A|B|\U00012345" "hello <0>\U00012345</0>"
1292 "A|B|\U00010000" "hello \ud800"
1293
1294 +# Bug 11369
1295 +# Incorrect optimization of patterns with a zero length quantifier {0}
1296 +
1297 +"(.|b)(|b){0}\$(?#xxx){3}(?>\D*)" "AAAAABBBBBCCCCCDDDDEEEEE"
1298 +"(|b)ab(c)" "<0><1></1>ab<2>c</2></0>"
1299 +"(|b){0}a{3}(D*)" "<0>aaa<2></2></0>"
1300 +"(|b){0,1}a{3}(D*)" "<0><1></1>aaa<2></2></0>"
1301 +"((|b){0})a{3}(D*)" "<0><1></1>aaa<3></3></0>"
1302 +
1303 +# Bug 11370
1304 +# Max match length computation of look-behind expression gives result that is too big to fit in the
1305 +# 24 bit operand portion of the compiled code. Expressions should fail to compile
1306 +# (Look-behind match length must be bounded. This case is treated as unbounded, an error.)
1307 +
1308 +"(?<!(0123456789a){10000000})x" E "no match"
1309 +"(?<!\\ubeaf(\\ubeaf{11000}){11000})" E "no match"
1310 +
1311 +
1312 # Random debugging, Temporary
1313 #
1314
OLDNEW
« no previous file with comments | « patches/measure_format.patch ('k') | patches/regexcmp.patch » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698