OLD | NEW |
1 Index: source/i18n/regexcmp.h | 1 diff --git a/source/common/unicode/utypes.h b/source/common/unicode/utypes.h |
2 =================================================================== | 2 index 704089c..1824625 100644 |
3 --- source/i18n/regexcmp.h (revision 292476) | 3 --- a/source/common/unicode/utypes.h |
4 +++ source/i18n/regexcmp.h (working copy) | 4 +++ b/source/common/unicode/utypes.h |
5 @@ -182,7 +182,9 @@ | 5 @@ -305,7 +305,7 @@ typedef double UDate; |
6 int32_t fMatchOpenParen; // The position in the com
piled pattern | 6 #define U_IO_API |
7 // of the slot reserved
for a state save | 7 #define U_TOOLUTIL_API |
8 // at the start of the m
ost recently processed | 8 #elif defined(U_COMMON_IMPLEMENTATION) |
9 - // parenthesized block. | 9 -#define U_DATA_API U_IMPORT |
10 + // parenthesized block.
Updated when processing | 10 +#define U_DATA_API U_EXPORT |
11 + // a close to the locati
on for the corresponding open. | 11 #define U_COMMON_API U_EXPORT |
| 12 #define U_I18N_API U_IMPORT |
| 13 #define U_LAYOUT_API U_IMPORT |
| 14 @@ -647,6 +647,7 @@ typedef enum UErrorCode { |
| 15 U_REGEX_STACK_OVERFLOW, /**< Regular expression backtrack sta
ck overflow. */ |
| 16 U_REGEX_TIME_OUT, /**< Maximum allowed match time excee
ded */ |
| 17 U_REGEX_STOPPED_BY_CALLER, /**< Matching operation aborted by us
er callback fn. */ |
| 18 + U_REGEX_PATTERN_TOO_BIG, /**< Pattern exceeds limits on size o
r complexity. @draft ICU 55 */ |
| 19 U_REGEX_ERROR_LIMIT, /**< This must always be the last val
ue to indicate the limit for regexp errors */ |
| 20 |
| 21 /* |
| 22 diff --git a/source/common/utypes.c b/source/common/utypes.c |
| 23 index c28e727..32b6d88 100644 |
| 24 --- a/source/common/utypes.c |
| 25 +++ b/source/common/utypes.c |
| 26 @@ -1,7 +1,7 @@ |
| 27 /* |
| 28 ****************************************************************************** |
| 29 * |
| 30 -* Copyright (C) 1997-2011, International Business Machines |
| 31 +* Copyright (C) 1997-2014, International Business Machines |
| 32 * Corporation and others. All Rights Reserved. |
| 33 * |
| 34 ****************************************************************************** |
| 35 @@ -165,7 +165,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START]
= { |
| 36 "U_REGEX_INVALID_RANGE", |
| 37 "U_REGEX_STACK_OVERFLOW", |
| 38 "U_REGEX_TIME_OUT", |
| 39 - "U_REGEX_STOPPED_BY_CALLER" |
| 40 + "U_REGEX_STOPPED_BY_CALLER", |
| 41 + "U_REGEX_PATTERN_TOO_BIG" |
| 42 }; |
| 43 |
| 44 static const char * const |
| 45 diff --git a/source/i18n/regexcmp.cpp b/source/i18n/regexcmp.cpp |
| 46 index 0816eec..0c2196f 100644 |
| 47 --- a/source/i18n/regexcmp.cpp |
| 48 +++ b/source/i18n/regexcmp.cpp |
| 49 @@ -301,7 +301,7 @@ void RegexCompile::compile( |
| 50 // present in the saved state: the input string position (int64_t) and |
| 51 // the position in the compiled pattern. |
| 52 // |
| 53 - fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; |
| 54 + allocateStackData(RESTACKFRAME_HDRCOUNT); |
| 55 |
| 56 // |
| 57 // Optimization pass 1: NOPs, back-references, and case-folding |
| 58 @@ -367,9 +367,9 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 59 // the start of an ( grouping. |
| 60 //4 NOP Resreved, will be replaced by a save if there are |
| 61 // OR | operators at the top level |
| 62 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus
); |
| 63 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); |
| 64 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); |
| 65 + appendOp(URX_STATE_SAVE, 2); |
| 66 + appendOp(URX_JMP, 3); |
| 67 + appendOp(URX_FAIL, 0); |
| 68 |
| 69 // Standard open nonCapture paren action emits the two NOPs and |
| 70 // sets up the paren stack frame. |
| 71 @@ -392,7 +392,7 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 72 } |
| 73 |
| 74 // add the END operation to the compiled pattern. |
| 75 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); |
| 76 + appendOp(URX_END, 0); |
| 77 |
| 78 // Terminate the pattern compilation state machine. |
| 79 returnVal = FALSE; |
| 80 @@ -414,14 +414,13 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 81 int32_t savePosition = fParenStack.popi(); |
| 82 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition
); |
| 83 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserve
d location |
| 84 - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); |
| 85 + op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); |
| 86 fRXPat->fCompiledPat->setElementAt(op, savePosition); |
| 87 |
| 88 // Append an JMP operation into the compiled pattern. The operand
for |
| 89 // the JMP will eventually be the location following the ')' for t
he |
| 90 // group. This will be patched in later, when the ')' is encounte
red. |
| 91 - op = URX_BUILD(URX_JMP, 0); |
| 92 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 93 + appendOp(URX_JMP, 0); |
| 94 |
| 95 // Push the position of the newly added JMP op onto the parentheses
stack. |
| 96 // This registers if for fixup when this block's close paren is enc
ountered. |
| 97 @@ -430,7 +429,7 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 98 // Append a NOP to the compiled pattern. This is the slot reserved |
| 99 // for a SAVE in the event that there is yet another '|' followin
g |
| 100 // this one. |
| 101 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 102 + appendOp(URX_NOP, 0); |
| 103 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); |
| 104 } |
| 105 break; |
| 106 @@ -456,12 +455,10 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 107 // END_CAPTURE is encountered. |
| 108 { |
| 109 fixLiterals(); |
| 110 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 111 - int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots
in match stack frame. |
| 112 - fRXPat->fFrameSize += 3; |
| 113 - int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); |
| 114 - fRXPat->fCompiledPat->addElement(cop, *fStatus); |
| 115 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 116 + appendOp(URX_NOP, 0); |
| 117 + int32_t varsLoc = allocateStackData(3); // Reserve three slots
in match stack frame. |
| 118 + appendOp(URX_START_CAPTURE, varsLoc); |
| 119 + appendOp(URX_NOP, 0); |
| 120 |
| 121 // On the Parentheses stack, start a new frame and add the postions |
| 122 // of the two NOPs. Depending on what follows in the pattern, th
e |
| 123 @@ -486,8 +483,8 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 124 // is an '|' alternation within the parens. |
| 125 { |
| 126 fixLiterals(); |
| 127 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 128 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 129 + appendOp(URX_NOP, 0); |
| 130 + appendOp(URX_NOP, 0); |
| 131 |
| 132 // On the Parentheses stack, start a new frame and add the postions |
| 133 // of the two NOPs. |
| 134 @@ -509,12 +506,10 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 135 // is an '|' alternation within the parens. |
| 136 { |
| 137 fixLiterals(); |
| 138 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 139 - int32_t varLoc = fRXPat->fDataSize; // Reserve a data locati
on for saving the |
| 140 - fRXPat->fDataSize += 1; // state stack ptr. |
| 141 - int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); |
| 142 - fRXPat->fCompiledPat->addElement(stoOp, *fStatus); |
| 143 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 144 + appendOp(URX_NOP, 0); |
| 145 + int32_t varLoc = allocateData(1); // Reserve a data location fo
r saving the state stack ptr. |
| 146 + appendOp(URX_STO_SP, varLoc); |
| 147 + appendOp(URX_NOP, 0); |
| 148 |
| 149 // On the Parentheses stack, start a new frame and add the postions |
| 150 // of the two NOPs. Depending on what follows in the pattern, th
e |
| 151 @@ -557,26 +552,14 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 152 // Two data slots are reserved, for saving the stack ptr and the input
position. |
| 153 { |
| 154 fixLiterals(); |
| 155 - int32_t dataLoc = fRXPat->fDataSize; |
| 156 - fRXPat->fDataSize += 2; |
| 157 - int32_t op = URX_BUILD(URX_LA_START, dataLoc); |
| 158 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 159 - |
| 160 - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); |
| 161 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 162 - |
| 163 - op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); |
| 164 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 165 - |
| 166 - op = URX_BUILD(URX_LA_END, dataLoc); |
| 167 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 168 - |
| 169 - op = URX_BUILD(URX_BACKTRACK, 0); |
| 170 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 171 - |
| 172 - op = URX_BUILD(URX_NOP, 0); |
| 173 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 174 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 175 + int32_t dataLoc = allocateData(2); |
| 176 + appendOp(URX_LA_START, dataLoc); |
| 177 + appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); |
| 178 + appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3); |
| 179 + appendOp(URX_LA_END, dataLoc); |
| 180 + appendOp(URX_BACKTRACK, 0); |
| 181 + appendOp(URX_NOP, 0); |
| 182 + appendOp(URX_NOP, 0); |
| 183 |
| 184 // On the Parentheses stack, start a new frame and add the postions |
| 185 // of the NOPs. |
| 186 @@ -601,16 +584,10 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 187 // an alternate (transparent) re
gion. |
| 188 { |
| 189 fixLiterals(); |
| 190 - int32_t dataLoc = fRXPat->fDataSize; |
| 191 - fRXPat->fDataSize += 2; |
| 192 - int32_t op = URX_BUILD(URX_LA_START, dataLoc); |
| 193 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 194 - |
| 195 - op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patch
ed later. |
| 196 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 197 - |
| 198 - op = URX_BUILD(URX_NOP, 0); |
| 199 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 200 + int32_t dataLoc = allocateData(2); |
| 201 + appendOp(URX_LA_START, dataLoc); |
| 202 + appendOp(URX_STATE_SAVE, 0); // dest address will be patched lat
er. |
| 203 + appendOp(URX_NOP, 0); |
| 204 |
| 205 // On the Parentheses stack, start a new frame and add the postions |
| 206 // of the StateSave and NOP. |
| 207 @@ -648,23 +625,19 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 208 fixLiterals(); |
| 209 |
| 210 // Allocate data space |
| 211 - int32_t dataLoc = fRXPat->fDataSize; |
| 212 - fRXPat->fDataSize += 4; |
| 213 + int32_t dataLoc = allocateData(4); |
| 214 |
| 215 // Emit URX_LB_START |
| 216 - int32_t op = URX_BUILD(URX_LB_START, dataLoc); |
| 217 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 218 + appendOp(URX_LB_START, dataLoc); |
| 219 |
| 220 // Emit URX_LB_CONT |
| 221 - op = URX_BUILD(URX_LB_CONT, dataLoc); |
| 222 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 223 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLengt
h. To be filled later. |
| 224 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLengt
h. To be filled later. |
| 225 + appendOp(URX_LB_CONT, dataLoc); |
| 226 + appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled l
ater. |
| 227 + appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled l
ater. |
| 228 |
| 229 - // Emit the NOP |
| 230 - op = URX_BUILD(URX_NOP, 0); |
| 231 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 232 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 233 + // Emit the NOPs |
| 234 + appendOp(URX_NOP, 0); |
| 235 + appendOp(URX_NOP, 0); |
| 236 |
| 237 // On the Parentheses stack, start a new frame and add the postions |
| 238 // of the URX_LB_CONT and the NOP. |
| 239 @@ -704,24 +677,20 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 240 fixLiterals(); |
| 241 |
| 242 // Allocate data space |
| 243 - int32_t dataLoc = fRXPat->fDataSize; |
| 244 - fRXPat->fDataSize += 4; |
| 245 + int32_t dataLoc = allocateData(4); |
| 246 |
| 247 // Emit URX_LB_START |
| 248 - int32_t op = URX_BUILD(URX_LB_START, dataLoc); |
| 249 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 250 + appendOp(URX_LB_START, dataLoc); |
| 251 |
| 252 // Emit URX_LBN_CONT |
| 253 - op = URX_BUILD(URX_LBN_CONT, dataLoc); |
| 254 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 255 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLengt
h. To be filled later. |
| 256 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLengt
h. To be filled later. |
| 257 - fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc.
To be filled later. |
| 258 + appendOp(URX_LBN_CONT, dataLoc); |
| 259 + appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled l
ater. |
| 260 + appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled l
ater. |
| 261 + appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled l
ater. |
| 262 |
| 263 - // Emit the NOP |
| 264 - op = URX_BUILD(URX_NOP, 0); |
| 265 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 266 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 267 + // Emit the NOPs |
| 268 + appendOp(URX_NOP, 0); |
| 269 + appendOp(URX_NOP, 0); |
| 270 |
| 271 // On the Parentheses stack, start a new frame and add the postions |
| 272 // of the URX_LB_CONT and the NOP. |
| 273 @@ -791,12 +760,9 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 274 |
| 275 if (URX_TYPE(repeatedOp) == URX_SETREF) { |
| 276 // Emit optimized code for [char set]+ |
| 277 - int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeated
Op)); |
| 278 - fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); |
| 279 - frameLoc = fRXPat->fFrameSize; |
| 280 - fRXPat->fFrameSize++; |
| 281 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); |
| 282 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
| 283 + appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); |
| 284 + frameLoc = allocateStackData(1); |
| 285 + appendOp(URX_LOOP_C, frameLoc); |
| 286 break; |
| 287 } |
| 288 |
| 289 @@ -804,7 +770,7 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 290 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || |
| 291 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { |
| 292 // Emit Optimized code for .+ operations. |
| 293 - int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); |
| 294 + int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); |
| 295 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { |
| 296 // URX_LOOP_DOT_I operand is a flag indicating ". match
es any" mode. |
| 297 loopOpI |= 1; |
| 298 @@ -812,11 +778,9 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 299 if (fModeFlags & UREGEX_UNIX_LINES) { |
| 300 loopOpI |= 2; |
| 301 } |
| 302 - fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); |
| 303 - frameLoc = fRXPat->fFrameSize; |
| 304 - fRXPat->fFrameSize++; |
| 305 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); |
| 306 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
| 307 + appendOp(loopOpI); |
| 308 + frameLoc = allocateStackData(1); |
| 309 + appendOp(URX_LOOP_C, frameLoc); |
| 310 break; |
| 311 } |
| 312 |
| 313 @@ -830,18 +794,15 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 314 // Zero length match is possible. |
| 315 // Emit the code sequence that can handle it. |
| 316 insertOp(topLoc); |
| 317 - frameLoc = fRXPat->fFrameSize; |
| 318 - fRXPat->fFrameSize++; |
| 319 + frameLoc = allocateStackData(1); |
| 320 |
| 321 - int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); |
| 322 + int32_t op = buildOp(URX_STO_INP_LOC, frameLoc); |
| 323 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 324 |
| 325 - op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); |
| 326 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 327 + appendOp(URX_JMP_SAV_X, topLoc+1); |
| 328 } else { |
| 329 // Simpler code when the repeated body must match something non
-empty |
| 330 - int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); |
| 331 - fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); |
| 332 + appendOp(URX_JMP_SAV, topLoc); |
| 333 } |
| 334 } |
| 335 break; |
| 336 @@ -853,8 +814,7 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 337 // 3. ... |
| 338 { |
| 339 int32_t topLoc = blockTopLoc(FALSE); |
| 340 - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); |
| 341 - fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); |
| 342 + appendOp(URX_STATE_SAVE, topLoc); |
| 343 } |
| 344 break; |
| 345 |
| 346 @@ -868,7 +828,7 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 347 // Insert the state save into the compiled pattern, and we're done. |
| 348 { |
| 349 int32_t saveStateLoc = blockTopLoc(TRUE); |
| 350 - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompile
dPat->size()); |
| 351 + int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledP
at->size()); |
| 352 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); |
| 353 } |
| 354 break; |
| 355 @@ -887,14 +847,12 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 356 int32_t jmp1_loc = blockTopLoc(TRUE); |
| 357 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); |
| 358 |
| 359 - int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); |
| 360 + int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1); |
| 361 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); |
| 362 |
| 363 - int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); |
| 364 - fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); |
| 365 + appendOp(URX_JMP, jmp2_loc+2); |
| 366 |
| 367 - int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); |
| 368 - fRXPat->fCompiledPat->addElement(save_op, *fStatus); |
| 369 + appendOp(URX_STATE_SAVE, jmp1_loc+1); |
| 370 } |
| 371 break; |
| 372 |
| 373 @@ -934,12 +892,10 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 374 |
| 375 if (URX_TYPE(repeatedOp) == URX_SETREF) { |
| 376 // Emit optimized code for a [char set]* |
| 377 - int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeated
Op)); |
| 378 + int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp
)); |
| 379 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); |
| 380 - dataLoc = fRXPat->fFrameSize; |
| 381 - fRXPat->fFrameSize++; |
| 382 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); |
| 383 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
| 384 + dataLoc = allocateStackData(1); |
| 385 + appendOp(URX_LOOP_C, dataLoc); |
| 386 break; |
| 387 } |
| 388 |
| 389 @@ -947,7 +903,7 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 390 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || |
| 391 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { |
| 392 // Emit Optimized code for .* operations. |
| 393 - int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); |
| 394 + int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); |
| 395 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { |
| 396 // URX_LOOP_DOT_I operand is a flag indicating . matche
s any mode. |
| 397 loopOpI |= 1; |
| 398 @@ -956,10 +912,8 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 399 loopOpI |= 2; |
| 400 } |
| 401 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); |
| 402 - dataLoc = fRXPat->fFrameSize; |
| 403 - fRXPat->fFrameSize++; |
| 404 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); |
| 405 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); |
| 406 + dataLoc = allocateStackData(1); |
| 407 + appendOp(URX_LOOP_C, dataLoc); |
| 408 break; |
| 409 } |
| 410 } |
| 411 @@ -968,30 +922,29 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 412 // The optimizations did not apply. |
| 413 |
| 414 int32_t saveStateLoc = blockTopLoc(TRUE); |
| 415 - int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); |
| 416 + int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1); |
| 417 |
| 418 // Check for minimum match length of zero, which requires |
| 419 // extra loop-breaking code. |
| 420 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) ==
0) { |
| 421 insertOp(saveStateLoc); |
| 422 - dataLoc = fRXPat->fFrameSize; |
| 423 - fRXPat->fFrameSize++; |
| 424 + dataLoc = allocateStackData(1); |
| 425 |
| 426 - int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); |
| 427 + int32_t op = buildOp(URX_STO_INP_LOC, dataLoc); |
| 428 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); |
| 429 - jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); |
| 430 + jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2); |
| 431 } |
| 432 |
| 433 // Locate the position in the compiled pattern where the match will
continue |
| 434 // after completing the *. (4 or 5 in the comment above) |
| 435 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
| 436 |
| 437 - // Put together the save state op store it into the compiled code. |
| 438 - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); |
| 439 + // Put together the save state op and store it into the compiled co
de. |
| 440 + int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc); |
| 441 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); |
| 442 |
| 443 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pat
tern. |
| 444 - fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); |
| 445 + appendOp(jmpOp); |
| 446 } |
| 447 break; |
| 448 |
| 449 @@ -1005,10 +958,9 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 450 { |
| 451 int32_t jmpLoc = blockTopLoc(TRUE); // loc
1. |
| 452 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc
3. |
| 453 - int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); |
| 454 - int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); |
| 455 + int32_t jmpOp = buildOp(URX_JMP, saveLoc); |
| 456 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); |
| 457 - fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); |
| 458 + appendOp(URX_STATE_SAVE, jmpLoc+1); |
| 459 } |
| 460 break; |
| 461 |
| 462 @@ -1077,9 +1029,9 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 463 |
| 464 // First the STO_SP before the start of the loop |
| 465 insertOp(topLoc); |
| 466 - int32_t varLoc = fRXPat->fDataSize; // Reserve a data locati
on for saving the |
| 467 - fRXPat->fDataSize += 1; // state stack ptr. |
| 468 - int32_t op = URX_BUILD(URX_STO_SP, varLoc); |
12 + | 469 + |
13 int32_t fMatchCloseParen; // The position in the pat
tern of the first | 470 + int32_t varLoc = allocateData(1); // Reserve a data location for
saving the |
14 // location after the mo
st recently processed | 471 + int32_t op = buildOp(URX_STO_SP, varLoc); |
15 // parenthesized block. | 472 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
16 Index: source/i18n/regexcmp.cpp | 473 |
17 =================================================================== | 474 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); |
18 --- source/i18n/regexcmp.cpp (revision 292476) | 475 @@ -1088,8 +1040,7 @@ UBool RegexCompile::doParseActions(int32_t action) |
19 +++ source/i18n/regexcmp.cpp (working copy) | 476 fRXPat->fCompiledPat->push(loopOp, *fStatus); |
20 @@ -2133,6 +2133,10 @@ | 477 |
| 478 // Then the LD_SP after the end of the loop |
| 479 - op = URX_BUILD(URX_LD_SP, varLoc); |
| 480 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 481 + appendOp(URX_LD_SP, varLoc); |
| 482 } |
| 483 |
| 484 break; |
| 485 @@ -1125,55 +1076,49 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 486 // scanned a ".", match any single character. |
| 487 { |
| 488 fixLiterals(FALSE); |
| 489 - int32_t op; |
| 490 if (fModeFlags & UREGEX_DOTALL) { |
| 491 - op = URX_BUILD(URX_DOTANY_ALL, 0); |
| 492 + appendOp(URX_DOTANY_ALL, 0); |
| 493 } else if (fModeFlags & UREGEX_UNIX_LINES) { |
| 494 - op = URX_BUILD(URX_DOTANY_UNIX, 0); |
| 495 + appendOp(URX_DOTANY_UNIX, 0); |
| 496 } else { |
| 497 - op = URX_BUILD(URX_DOTANY, 0); |
| 498 + appendOp(URX_DOTANY, 0); |
| 499 } |
| 500 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 501 } |
| 502 break; |
| 503 |
| 504 case doCaret: |
| 505 { |
| 506 fixLiterals(FALSE); |
| 507 - int32_t op = 0; |
| 508 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UR
EGEX_UNIX_LINES) == 0) { |
| 509 - op = URX_CARET; |
| 510 + appendOp(URX_CARET, 0); |
| 511 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UR
EGEX_UNIX_LINES) == 0) { |
| 512 - op = URX_CARET_M; |
| 513 + appendOp(URX_CARET_M, 0); |
| 514 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UR
EGEX_UNIX_LINES) != 0) { |
| 515 - op = URX_CARET; // Only testing true start of input. |
| 516 + appendOp(URX_CARET, 0); // Only testing true start of input. |
| 517 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UR
EGEX_UNIX_LINES) != 0) { |
| 518 - op = URX_CARET_M_UNIX; |
| 519 + appendOp(URX_CARET_M_UNIX, 0); |
| 520 } |
| 521 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); |
| 522 } |
| 523 break; |
| 524 |
| 525 case doDollar: |
| 526 { |
| 527 fixLiterals(FALSE); |
| 528 - int32_t op = 0; |
| 529 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UR
EGEX_UNIX_LINES) == 0) { |
| 530 - op = URX_DOLLAR; |
| 531 + appendOp(URX_DOLLAR, 0); |
| 532 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UR
EGEX_UNIX_LINES) == 0) { |
| 533 - op = URX_DOLLAR_M; |
| 534 + appendOp(URX_DOLLAR_M, 0); |
| 535 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UR
EGEX_UNIX_LINES) != 0) { |
| 536 - op = URX_DOLLAR_D; |
| 537 + appendOp(URX_DOLLAR_D, 0); |
| 538 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UR
EGEX_UNIX_LINES) != 0) { |
| 539 - op = URX_DOLLAR_MD; |
| 540 + appendOp(URX_DOLLAR_MD, 0); |
| 541 } |
| 542 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); |
| 543 } |
| 544 break; |
| 545 |
| 546 case doBackslashA: |
| 547 fixLiterals(FALSE); |
| 548 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); |
| 549 + appendOp(URX_CARET, 0); |
| 550 break; |
| 551 |
| 552 case doBackslashB: |
| 553 @@ -1185,7 +1130,7 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 554 #endif |
| 555 fixLiterals(FALSE); |
| 556 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BA
CKSLASH_B; |
| 557 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); |
| 558 + appendOp(op, 1); |
| 559 } |
| 560 break; |
| 561 |
| 562 @@ -1198,63 +1143,59 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 563 #endif |
| 564 fixLiterals(FALSE); |
| 565 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BA
CKSLASH_B; |
| 566 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); |
| 567 + appendOp(op, 0); |
| 568 } |
| 569 break; |
| 570 |
| 571 case doBackslashD: |
| 572 fixLiterals(FALSE); |
| 573 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatu
s); |
| 574 + appendOp(URX_BACKSLASH_D, 1); |
| 575 break; |
| 576 |
| 577 case doBackslashd: |
| 578 fixLiterals(FALSE); |
| 579 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatu
s); |
| 580 + appendOp(URX_BACKSLASH_D, 0); |
| 581 break; |
| 582 |
| 583 case doBackslashG: |
| 584 fixLiterals(FALSE); |
| 585 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatu
s); |
| 586 + appendOp(URX_BACKSLASH_G, 0); |
| 587 break; |
| 588 |
| 589 case doBackslashS: |
| 590 fixLiterals(FALSE); |
| 591 - fRXPat->fCompiledPat->addElement( |
| 592 - URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); |
| 593 + appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); |
| 594 break; |
| 595 |
| 596 case doBackslashs: |
| 597 fixLiterals(FALSE); |
| 598 - fRXPat->fCompiledPat->addElement( |
| 599 - URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); |
| 600 + appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); |
| 601 break; |
| 602 |
| 603 case doBackslashW: |
| 604 fixLiterals(FALSE); |
| 605 - fRXPat->fCompiledPat->addElement( |
| 606 - URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); |
| 607 + appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); |
| 608 break; |
| 609 |
| 610 case doBackslashw: |
| 611 fixLiterals(FALSE); |
| 612 - fRXPat->fCompiledPat->addElement( |
| 613 - URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); |
| 614 + appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); |
| 615 break; |
| 616 |
| 617 case doBackslashX: |
| 618 fixLiterals(FALSE); |
| 619 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatu
s); |
| 620 + appendOp(URX_BACKSLASH_X, 0); |
| 621 break; |
| 622 |
| 623 |
| 624 case doBackslashZ: |
| 625 fixLiterals(FALSE); |
| 626 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); |
| 627 + appendOp(URX_DOLLAR, 0); |
| 628 break; |
| 629 |
| 630 case doBackslashz: |
| 631 fixLiterals(FALSE); |
| 632 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatu
s); |
| 633 + appendOp(URX_BACKSLASH_Z, 0); |
| 634 break; |
| 635 |
| 636 case doEscapeError: |
| 637 @@ -1314,13 +1255,11 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 638 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal
escape sequence, |
| 639 // and shouldn't enter this code path a
t all. |
| 640 fixLiterals(FALSE); |
| 641 - int32_t op; |
| 642 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 643 - op = URX_BUILD(URX_BACKREF_I, groupNum); |
| 644 + appendOp(URX_BACKREF_I, groupNum); |
| 645 } else { |
| 646 - op = URX_BUILD(URX_BACKREF, groupNum); |
| 647 + appendOp(URX_BACKREF, groupNum); |
| 648 } |
| 649 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 650 } |
| 651 break; |
| 652 |
| 653 @@ -1341,22 +1280,18 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 654 { |
| 655 // Emit the STO_SP |
| 656 int32_t topLoc = blockTopLoc(TRUE); |
| 657 - int32_t stoLoc = fRXPat->fDataSize; |
| 658 - fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. |
| 659 - int32_t op = URX_BUILD(URX_STO_SP, stoLoc); |
| 660 + int32_t stoLoc = allocateData(1); // Reserve the data location f
or storing save stack ptr. |
| 661 + int32_t op = buildOp(URX_STO_SP, stoLoc); |
| 662 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 663 |
| 664 // Emit the STATE_SAVE |
| 665 - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); |
| 666 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 667 + appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); |
| 668 |
| 669 // Emit the JMP |
| 670 - op = URX_BUILD(URX_JMP, topLoc+1); |
| 671 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 672 + appendOp(URX_JMP, topLoc+1); |
| 673 |
| 674 // Emit the LD_SP |
| 675 - op = URX_BUILD(URX_LD_SP, stoLoc); |
| 676 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 677 + appendOp(URX_LD_SP, stoLoc); |
| 678 } |
| 679 break; |
| 680 |
| 681 @@ -1376,23 +1311,20 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 682 insertOp(topLoc); |
| 683 |
| 684 // emit STO_SP loc |
| 685 - int32_t stoLoc = fRXPat->fDataSize; |
| 686 - fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. |
| 687 - int32_t op = URX_BUILD(URX_STO_SP, stoLoc); |
| 688 + int32_t stoLoc = allocateData(1); // Reserve the data location
for storing save stack ptr. |
| 689 + int32_t op = buildOp(URX_STO_SP, stoLoc); |
| 690 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 691 |
| 692 // Emit the SAVE_STATE 5 |
| 693 int32_t L7 = fRXPat->fCompiledPat->size()+1; |
| 694 - op = URX_BUILD(URX_STATE_SAVE, L7); |
| 695 + op = buildOp(URX_STATE_SAVE, L7); |
| 696 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); |
| 697 |
| 698 // Append the JMP operation. |
| 699 - op = URX_BUILD(URX_JMP, topLoc+1); |
| 700 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 701 + appendOp(URX_JMP, topLoc+1); |
| 702 |
| 703 // Emit the LD_SP loc |
| 704 - op = URX_BUILD(URX_LD_SP, stoLoc); |
| 705 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 706 + appendOp(URX_LD_SP, stoLoc); |
| 707 } |
| 708 break; |
| 709 |
| 710 @@ -1411,19 +1343,17 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 711 insertOp(topLoc); |
| 712 |
| 713 // Emit the STO_SP |
| 714 - int32_t stoLoc = fRXPat->fDataSize; |
| 715 - fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. |
| 716 - int32_t op = URX_BUILD(URX_STO_SP, stoLoc); |
| 717 + int32_t stoLoc = allocateData(1); // Reserve the data location
for storing save stack ptr. |
| 718 + int32_t op = buildOp(URX_STO_SP, stoLoc); |
| 719 fRXPat->fCompiledPat->setElementAt(op, topLoc); |
| 720 |
| 721 // Emit the SAVE_STATE |
| 722 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; |
| 723 - op = URX_BUILD(URX_STATE_SAVE, continueLoc); |
| 724 + op = buildOp(URX_STATE_SAVE, continueLoc); |
| 725 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); |
| 726 |
| 727 // Emit the LD_SP |
| 728 - op = URX_BUILD(URX_LD_SP, stoLoc); |
| 729 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 730 + appendOp(URX_LD_SP, stoLoc); |
| 731 } |
| 732 break; |
| 733 |
| 734 @@ -1480,8 +1410,8 @@ UBool RegexCompile::doParseActions(int32_t action) |
| 735 // is an '|' alternation within the parens. |
| 736 { |
| 737 fixLiterals(FALSE); |
| 738 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 739 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); |
| 740 + appendOp(URX_NOP, 0); |
| 741 + appendOp(URX_NOP, 0); |
| 742 |
| 743 // On the Parentheses stack, start a new frame and add the postions |
| 744 // of the two NOPs (a normal non-capturing () frame, except for t
he |
| 745 @@ -1818,7 +1748,6 @@ void RegexCompile::literalChar(UChar32 c) { |
| 746 // |
| 747 //-----------------------------------------------------------------------------
- |
| 748 void RegexCompile::fixLiterals(UBool split) { |
| 749 - int32_t op = 0; // An op from/for the compiled patte
rn. |
| 750 |
| 751 // If no literal characters have been scanned but not yet had code generate
d |
| 752 // for them, nothing needs to be done. |
| 753 @@ -1857,23 +1786,23 @@ void RegexCompile::fixLiterals(UBool split) { |
| 754 // Single character, emit a URX_ONECHAR op to match it. |
| 755 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && |
| 756 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { |
| 757 - op = URX_BUILD(URX_ONECHAR_I, lastCodePoint); |
| 758 + appendOp(URX_ONECHAR_I, lastCodePoint); |
| 759 } else { |
| 760 - op = URX_BUILD(URX_ONECHAR, lastCodePoint); |
| 761 + appendOp(URX_ONECHAR, lastCodePoint); |
| 762 } |
| 763 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 764 } else { |
| 765 // Two or more chars, emit a URX_STRING to match them. |
| 766 + if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length(
) > 0x00ffffff) { |
| 767 + error(U_REGEX_PATTERN_TOO_BIG); |
| 768 + } |
| 769 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { |
| 770 - op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length()); |
| 771 + appendOp(URX_STRING_I, fRXPat->fLiteralText.length()); |
| 772 } else { |
| 773 // TODO here: add optimization to split case sensitive strings of
length two |
| 774 // into two single char ops, for efficiency. |
| 775 - op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length()); |
| 776 + appendOp(URX_STRING, fRXPat->fLiteralText.length()); |
| 777 } |
| 778 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 779 - op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length()); |
| 780 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 781 + appendOp(URX_STRING_LEN, fLiteralChars.length()); |
| 782 |
| 783 // Add this string into the accumulated strings of the compiled pattern
. |
| 784 fRXPat->fLiteralText.append(fLiteralChars); |
| 785 @@ -1883,8 +1812,58 @@ void RegexCompile::fixLiterals(UBool split) { |
| 786 } |
| 787 |
| 788 |
| 789 +int32_t RegexCompile::buildOp(int32_t type, int32_t val) { |
| 790 + if (U_FAILURE(*fStatus)) { |
| 791 + return 0; |
| 792 + } |
| 793 + if (type < 0 || type > 255) { |
| 794 + U_ASSERT(FALSE); |
| 795 + error(U_REGEX_INTERNAL_ERROR); |
| 796 + type = URX_RESERVED_OP; |
| 797 + } |
| 798 + if (val > 0x00ffffff) { |
| 799 + U_ASSERT(FALSE); |
| 800 + error(U_REGEX_INTERNAL_ERROR); |
| 801 + val = 0; |
| 802 + } |
| 803 + if (val < 0) { |
| 804 + if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) { |
| 805 + U_ASSERT(FALSE); |
| 806 + error(U_REGEX_INTERNAL_ERROR); |
| 807 + return -1; |
| 808 + } |
| 809 + if (URX_TYPE(val) != 0xff) { |
| 810 + U_ASSERT(FALSE); |
| 811 + error(U_REGEX_INTERNAL_ERROR); |
| 812 + return -1; |
| 813 + } |
| 814 + type = URX_RESERVED_OP_N; |
| 815 + } |
| 816 + return (type << 24) | val; |
| 817 +} |
| 818 |
| 819 |
| 820 +//-----------------------------------------------------------------------------
- |
| 821 +// |
| 822 +// appendOp() Append a new instruction onto the compiled pattern |
| 823 +// Includes error checking, limiting the size of the |
| 824 +// pattern to lengths that can be represented in the |
| 825 +// 24 bit operand field of an instruction. |
| 826 +// |
| 827 +//-----------------------------------------------------------------------------
- |
| 828 +void RegexCompile::appendOp(int32_t op) { |
| 829 + if (U_FAILURE(*fStatus)) { |
| 830 + return; |
| 831 + } |
| 832 + fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 833 + if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) { |
| 834 + error(U_REGEX_PATTERN_TOO_BIG); |
| 835 + } |
| 836 +} |
| 837 + |
| 838 +void RegexCompile::appendOp(int32_t type, int32_t val) { |
| 839 + appendOp(buildOp(type, val)); |
| 840 +} |
| 841 |
| 842 |
| 843 //-----------------------------------------------------------------------------
- |
| 844 @@ -1900,7 +1879,7 @@ void RegexCompile::insertOp(int32_t where) { |
| 845 UVector64 *code = fRXPat->fCompiledPat; |
| 846 U_ASSERT(where>0 && where < code->size()); |
| 847 |
| 848 - int32_t nop = URX_BUILD(URX_NOP, 0); |
| 849 + int32_t nop = buildOp(URX_NOP, 0); |
| 850 code->insertElementAt(nop, where, *fStatus); |
| 851 |
| 852 // Walk through the pattern, looking for any ops with targets that |
| 853 @@ -1921,7 +1900,7 @@ void RegexCompile::insertOp(int32_t where) { |
| 854 // Target location for this opcode is after the insertion point and |
| 855 // needs to be incremented to adjust for the insertion. |
| 856 opValue++; |
| 857 - op = URX_BUILD(opType, opValue); |
| 858 + op = buildOp(opType, opValue); |
| 859 code->setElementAt(op, loc); |
| 860 } |
| 861 } |
| 862 @@ -1946,6 +1925,58 @@ void RegexCompile::insertOp(int32_t where) { |
| 863 } |
| 864 |
| 865 |
| 866 +//-----------------------------------------------------------------------------
- |
| 867 +// |
| 868 +// allocateData() Allocate storage in the matcher's static data area. |
| 869 +// Return the index for the newly allocated data. |
| 870 +// The storage won't actually exist until we are runnin
g a match |
| 871 +// operation, but the storage indexes are inserted into
various |
| 872 +// opcodes while compiling the pattern. |
| 873 +// |
| 874 +//-----------------------------------------------------------------------------
- |
| 875 +int32_t RegexCompile::allocateData(int32_t size) { |
| 876 + if (U_FAILURE(*fStatus)) { |
| 877 + return 0; |
| 878 + } |
| 879 + if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) { |
| 880 + error(U_REGEX_INTERNAL_ERROR); |
| 881 + return 0; |
| 882 + } |
| 883 + int32_t dataIndex = fRXPat->fDataSize; |
| 884 + fRXPat->fDataSize += size; |
| 885 + if (fRXPat->fDataSize >= 0x00fffff0) { |
| 886 + error(U_REGEX_INTERNAL_ERROR); |
| 887 + } |
| 888 + return dataIndex; |
| 889 +} |
| 890 + |
| 891 + |
| 892 +//-----------------------------------------------------------------------------
- |
| 893 +// |
| 894 +// allocateStackData() Allocate space in the back-tracking stack frame. |
| 895 +// Return the index for the newly allocated data. |
| 896 +// The frame indexes are inserted into various |
| 897 +// opcodes while compiling the pattern, meaning that fr
ame |
| 898 +// size must be restricted to the size that will fit |
| 899 +// as an operand (24 bits). |
| 900 +// |
| 901 +//-----------------------------------------------------------------------------
- |
| 902 +int32_t RegexCompile::allocateStackData(int32_t size) { |
| 903 + if (U_FAILURE(*fStatus)) { |
| 904 + return 0; |
| 905 + } |
| 906 + if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) { |
| 907 + error(U_REGEX_INTERNAL_ERROR); |
| 908 + return 0; |
| 909 + } |
| 910 + int32_t dataIndex = fRXPat->fFrameSize; |
| 911 + fRXPat->fFrameSize += size; |
| 912 + if (fRXPat->fFrameSize >= 0x00fffff0) { |
| 913 + error(U_REGEX_PATTERN_TOO_BIG); |
| 914 + } |
| 915 + return dataIndex; |
| 916 +} |
| 917 + |
| 918 |
| 919 //-----------------------------------------------------------------------------
- |
| 920 // |
| 921 @@ -1988,7 +2019,7 @@ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { |
| 922 theLoc--; |
| 923 } |
| 924 if (reserveLoc) { |
| 925 - int32_t nop = URX_BUILD(URX_NOP, 0); |
| 926 + int32_t nop = buildOp(URX_NOP, 0); |
| 927 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); |
| 928 } |
| 929 } |
| 930 @@ -2063,8 +2094,7 @@ void RegexCompile::handleCloseParen() { |
| 931 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); |
| 932 |
| 933 int32_t frameVarLocation = URX_VAL(captureOp); |
| 934 - int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocatio
n); |
| 935 - fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); |
| 936 + appendOp(URX_END_CAPTURE, frameVarLocation); |
| 937 } |
| 938 break; |
| 939 case atomic: |
| 940 @@ -2075,8 +2105,7 @@ void RegexCompile::handleCloseParen() { |
| 941 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen+1); |
| 942 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); |
| 943 int32_t stoLoc = URX_VAL(stoOp); |
| 944 - int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); |
| 945 - fRXPat->fCompiledPat->addElement(ldOp, *fStatus); |
| 946 + appendOp(URX_LD_SP, stoLoc); |
| 947 } |
| 948 break; |
| 949 |
| 950 @@ -2085,8 +2114,7 @@ void RegexCompile::handleCloseParen() { |
| 951 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen-5); |
| 952 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); |
| 953 int32_t dataLoc = URX_VAL(startOp); |
| 954 - int32_t op = URX_BUILD(URX_LA_END, dataLoc); |
| 955 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 956 + appendOp(URX_LA_END, dataLoc); |
| 957 } |
| 958 break; |
| 959 |
| 960 @@ -2096,19 +2124,16 @@ void RegexCompile::handleCloseParen() { |
| 961 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen-1); |
| 962 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); |
| 963 int32_t dataLoc = URX_VAL(startOp); |
| 964 - int32_t op = URX_BUILD(URX_LA_END, dataLoc); |
| 965 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 966 - op = URX_BUILD(URX_BACKTRACK, 0); |
| 967 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 968 - op = URX_BUILD(URX_LA_END, dataLoc); |
| 969 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 970 + appendOp(URX_LA_END, dataLoc); |
| 971 + appendOp(URX_BACKTRACK, 0); |
| 972 + appendOp(URX_LA_END, dataLoc); |
| 973 |
| 974 // Patch the URX_SAVE near the top of the block. |
| 975 // The destination of the SAVE is the final LA_END that was just ad
ded. |
| 976 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen); |
| 977 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); |
| 978 int32_t dest = fRXPat->fCompiledPat->size()-1; |
| 979 - saveOp = URX_BUILD(URX_STATE_SAVE, dest); |
| 980 + saveOp = buildOp(URX_STATE_SAVE, dest); |
| 981 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); |
| 982 } |
| 983 break; |
| 984 @@ -2121,10 +2146,8 @@ void RegexCompile::handleCloseParen() { |
| 985 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen-4); |
| 986 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); |
| 987 int32_t dataLoc = URX_VAL(startOp); |
| 988 - int32_t op = URX_BUILD(URX_LB_END, dataLoc); |
| 989 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 990 - op = URX_BUILD(URX_LA_END, dataLoc); |
| 991 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 992 + appendOp(URX_LB_END, dataLoc); |
| 993 + appendOp(URX_LA_END, dataLoc); |
| 994 |
| 995 // Determine the min and max bounds for the length of the |
| 996 // string that the pattern can match. |
| 997 @@ -2132,6 +2155,10 @@ void RegexCompile::handleCloseParen() { |
21 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | 998 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; |
22 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | 999 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); |
23 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | 1000 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); |
24 + if (URX_TYPE(maxML) != 0) { | 1001 + if (URX_TYPE(maxML) != 0) { |
25 + error(U_REGEX_LOOK_BEHIND_LIMIT); | 1002 + error(U_REGEX_LOOK_BEHIND_LIMIT); |
26 + break; | 1003 + break; |
27 + } | 1004 + } |
28 if (maxML == INT32_MAX) { | 1005 if (maxML == INT32_MAX) { |
29 error(U_REGEX_LOOK_BEHIND_LIMIT); | 1006 error(U_REGEX_LOOK_BEHIND_LIMIT); |
30 break; | 1007 break; |
31 @@ -2166,6 +2170,10 @@ | 1008 @@ -2156,8 +2183,7 @@ void RegexCompile::handleCloseParen() { |
| 1009 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen-5); |
| 1010 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); |
| 1011 int32_t dataLoc = URX_VAL(startOp); |
| 1012 - int32_t op = URX_BUILD(URX_LBN_END, dataLoc); |
| 1013 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1014 + appendOp(URX_LBN_END, dataLoc); |
| 1015 |
| 1016 // Determine the min and max bounds for the length of the |
| 1017 // string that the pattern can match. |
| 1018 @@ -2165,6 +2191,10 @@ void RegexCompile::handleCloseParen() { |
32 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | 1019 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; |
33 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | 1020 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); |
34 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | 1021 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); |
35 + if (URX_TYPE(maxML) != 0) { | 1022 + if (URX_TYPE(maxML) != 0) { |
36 + error(U_REGEX_LOOK_BEHIND_LIMIT); | 1023 + error(U_REGEX_LOOK_BEHIND_LIMIT); |
37 + break; | 1024 + break; |
38 + } | 1025 + } |
39 if (maxML == INT32_MAX) { | 1026 if (maxML == INT32_MAX) { |
40 error(U_REGEX_LOOK_BEHIND_LIMIT); | 1027 error(U_REGEX_LOOK_BEHIND_LIMIT); |
41 break; | 1028 break; |
42 @@ -2329,7 +2337,15 @@ | 1029 @@ -2178,7 +2208,7 @@ void RegexCompile::handleCloseParen() { |
| 1030 |
| 1031 // Insert the pattern location to continue at after a successful ma
tch |
| 1032 // as the last operand of the URX_LBN_CONT |
| 1033 - op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); |
| 1034 + int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size())
; |
| 1035 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); |
| 1036 } |
| 1037 break; |
| 1038 @@ -2219,7 +2249,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet) |
| 1039 case 0: |
| 1040 { |
| 1041 // Set of no elements. Always fails to match. |
| 1042 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fSta
tus); |
| 1043 + appendOp(URX_BACKTRACK, 0); |
| 1044 delete theSet; |
| 1045 } |
| 1046 break; |
| 1047 @@ -2240,8 +2270,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet) |
| 1048 // Put it into the compiled pattern as a set. |
| 1049 int32_t setNumber = fRXPat->fSets->size(); |
| 1050 fRXPat->fSets->addElement(theSet, *fStatus); |
| 1051 - int32_t setOp = URX_BUILD(URX_SETREF, setNumber); |
| 1052 - fRXPat->fCompiledPat->addElement(setOp, *fStatus); |
| 1053 + appendOp(URX_SETREF, setNumber); |
| 1054 } |
| 1055 } |
| 1056 } |
| 1057 @@ -2280,13 +2309,10 @@ void RegexCompile::compileInterval(int32_t InitOp
, int32_t LoopOp) |
| 1058 // counterLoc --> Loop counter |
| 1059 // +1 --> Input index (for breaking non-progressing loops
) |
| 1060 // (Only present if unbounded upper limit on loop) |
| 1061 - int32_t counterLoc = fRXPat->fFrameSize; |
| 1062 - fRXPat->fFrameSize++; |
| 1063 - if (fIntervalUpper < 0) { |
| 1064 - fRXPat->fFrameSize++; |
| 1065 - } |
| 1066 + int32_t dataSize = fIntervalUpper < 0 ? 2 : 1; |
| 1067 + int32_t counterLoc = allocateStackData(dataSize); |
| 1068 |
| 1069 - int32_t op = URX_BUILD(InitOp, counterLoc); |
| 1070 + int32_t op = buildOp(InitOp, counterLoc); |
| 1071 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); |
| 1072 |
| 1073 // The second operand of CTR_INIT is the location following the end of the
loop. |
| 1074 @@ -2294,7 +2320,7 @@ void RegexCompile::compileInterval(int32_t InitOp,
int32_t LoopOp) |
| 1075 // compilation of something later on causes the code to grow and the targ
et |
| 1076 // position to move. |
| 1077 int32_t loopEnd = fRXPat->fCompiledPat->size(); |
| 1078 - op = URX_BUILD(URX_RELOC_OPRND, loopEnd); |
| 1079 + op = buildOp(URX_RELOC_OPRND, loopEnd); |
| 1080 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); |
| 1081 |
| 1082 // Followed by the min and max counts. |
| 1083 @@ -2303,8 +2329,7 @@ void RegexCompile::compileInterval(int32_t InitOp,
int32_t LoopOp) |
| 1084 |
| 1085 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. |
| 1086 // Goes at end of the block being looped over, so just append to the code
so far. |
| 1087 - op = URX_BUILD(LoopOp, topOfBlock); |
| 1088 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1089 + appendOp(LoopOp, topOfBlock); |
| 1090 |
| 1091 if ((fIntervalLow & 0xff000000) != 0 || |
| 1092 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { |
| 1093 @@ -2328,7 +2353,15 @@ UBool RegexCompile::compileInlineInterval() { |
43 int32_t topOfBlock = blockTopLoc(FALSE); | 1094 int32_t topOfBlock = blockTopLoc(FALSE); |
44 if (fIntervalUpper == 0) { | 1095 if (fIntervalUpper == 0) { |
45 // Pathological case. Attempt no matches, as if the block doesn't exis
t. | 1096 // Pathological case. Attempt no matches, as if the block doesn't exis
t. |
46 + // Discard the generated code for the block. | 1097 + // Discard the generated code for the block. |
47 + // If the block included parens, discard the info pertaining to them as
well. | 1098 + // If the block included parens, discard the info pertaining to them as
well. |
48 fRXPat->fCompiledPat->setSize(topOfBlock); | 1099 fRXPat->fCompiledPat->setSize(topOfBlock); |
49 + if (fMatchOpenParen >= topOfBlock) { | 1100 + if (fMatchOpenParen >= topOfBlock) { |
50 + fMatchOpenParen = -1; | 1101 + fMatchOpenParen = -1; |
51 + } | 1102 + } |
52 + if (fMatchCloseParen >= topOfBlock) { | 1103 + if (fMatchCloseParen >= topOfBlock) { |
53 + fMatchCloseParen = -1; | 1104 + fMatchCloseParen = -1; |
54 + } | 1105 + } |
55 return TRUE; | 1106 return TRUE; |
56 } | 1107 } |
57 | 1108 |
58 Index: source/test/testdata/regextst.txt | 1109 @@ -2349,7 +2382,7 @@ UBool RegexCompile::compileInlineInterval() { |
59 =================================================================== | 1110 // |
60 --- source/test/testdata/regextst.txt» (revision 292476) | 1111 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 |
61 +++ source/test/testdata/regextst.txt» (working copy) | 1112 + fIntervalUpper + (fIntervalUpper-fIntervalLow
); |
62 @@ -1173,6 +1173,24 @@ | 1113 - int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); |
63 "(?<=(?:){11})bc" "<0>bc</0>" # Empty (?:) expression. | 1114 + int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc); |
| 1115 if (fIntervalLow == 0) { |
| 1116 insertOp(topOfBlock); |
| 1117 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); |
| 1118 @@ -2362,13 +2395,10 @@ UBool RegexCompile::compileInlineInterval() { |
| 1119 // it was put there when it was originally encountered. |
| 1120 int32_t i; |
| 1121 for (i=1; i<fIntervalUpper; i++ ) { |
| 1122 - if (i == fIntervalLow) { |
| 1123 - fRXPat->fCompiledPat->addElement(saveOp, *fStatus); |
| 1124 - } |
| 1125 - if (i > fIntervalLow) { |
| 1126 - fRXPat->fCompiledPat->addElement(saveOp, *fStatus); |
| 1127 + if (i >= fIntervalLow) { |
| 1128 + appendOp(saveOp); |
| 1129 } |
| 1130 - fRXPat->fCompiledPat->addElement(op, *fStatus); |
| 1131 + appendOp(op); |
| 1132 } |
| 1133 return TRUE; |
| 1134 } |
| 1135 @@ -3587,7 +3617,7 @@ void RegexCompile::stripNOPs() { |
| 1136 int32_t operandAddress = URX_VAL(op); |
| 1137 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); |
| 1138 int32_t fixedOperandAddress = operandAddress - deltas.elementAt
i(operandAddress); |
| 1139 - op = URX_BUILD(opType, fixedOperandAddress); |
| 1140 + op = buildOp(opType, fixedOperandAddress); |
| 1141 fRXPat->fCompiledPat->setElementAt(op, dst); |
| 1142 dst++; |
| 1143 break; |
| 1144 @@ -3602,7 +3632,7 @@ void RegexCompile::stripNOPs() { |
| 1145 break; |
| 1146 } |
| 1147 where = fRXPat->fGroupMap->elementAti(where-1); |
| 1148 - op = URX_BUILD(opType, where); |
| 1149 + op = buildOp(opType, where); |
| 1150 fRXPat->fCompiledPat->setElementAt(op, dst); |
| 1151 dst++; |
64 | 1152 |
| 1153 @@ -3954,7 +3984,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) { |
| 1154 //-----------------------------------------------------------------------------
- |
| 1155 // |
| 1156 // scanNamedChar |
| 1157 - // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. |
| 1158 +// Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. |
| 1159 // |
| 1160 // The scan position will be at the 'N'. On return |
| 1161 // the scan position should be just after the '}' |
| 1162 diff --git a/source/i18n/regexcmp.h b/source/i18n/regexcmp.h |
| 1163 index debdf45..c3cc7db 100644 |
| 1164 --- a/source/i18n/regexcmp.h |
| 1165 +++ b/source/i18n/regexcmp.h |
| 1166 @@ -104,6 +104,13 @@ private: |
| 1167 void fixLiterals(UBool split=FALSE); // Generate code for pendi
ng literal characters. |
| 1168 void insertOp(int32_t where); // Open up a slot for a ne
w op in the |
| 1169 // generated code at the
specified location. |
| 1170 + void appendOp(int32_t op); // Append a new op to the
compiled pattern. |
| 1171 + void appendOp(int32_t type, int32_t val); // Build & append a new op
to the compiled pattern. |
| 1172 + int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode i
nstruction. |
| 1173 + int32_t allocateData(int32_t size); // Allocate space in the m
atcher data area. |
| 1174 + // Return index of the n
ewly allocated data. |
| 1175 + int32_t allocateStackData(int32_t size); // Allocate space in the m
atch back-track stack frame. |
| 1176 + // Return offset index i
n the frame. |
| 1177 int32_t minMatchLength(int32_t start, |
| 1178 int32_t end); |
| 1179 int32_t maxMatchLength(int32_t start, |
| 1180 @@ -187,7 +194,9 @@ private: |
| 1181 int32_t fMatchOpenParen; // The position in the com
piled pattern |
| 1182 // of the slot reserved
for a state save |
| 1183 // at the start of the m
ost recently processed |
| 1184 - // parenthesized block. |
| 1185 + // parenthesized block.
Updated when processing |
| 1186 + // a close to the locati
on for the corresponding open. |
| 1187 + |
| 1188 int32_t fMatchCloseParen; // The position in the pat
tern of the first |
| 1189 // location after the mo
st recently processed |
| 1190 // parenthesized block. |
| 1191 diff --git a/source/i18n/regeximp.h b/source/i18n/regeximp.h |
| 1192 index bdf8403..fdd9c76 100644 |
| 1193 --- a/source/i18n/regeximp.h |
| 1194 +++ b/source/i18n/regeximp.h |
| 1195 @@ -1,5 +1,5 @@ |
| 1196 // |
| 1197 -// Copyright (C) 2002-2013 International Business Machines Corporation |
| 1198 +// Copyright (C) 2002-2014 International Business Machines Corporation |
| 1199 // and others. All rights reserved. |
| 1200 // |
| 1201 // file: regeximp.h |
| 1202 @@ -241,7 +241,6 @@ enum { |
| 1203 // |
| 1204 // Convenience macros for assembling and disassembling a compiled operation. |
| 1205 // |
| 1206 -#define URX_BUILD(type, val) (int32_t)((type << 24) | (val)) |
| 1207 #define URX_TYPE(x) ((uint32_t)(x) >> 24) |
| 1208 #define URX_VAL(x) ((x) & 0xffffff) |
| 1209 |
| 1210 diff --git a/source/test/intltest/regextst.cpp b/source/test/intltest/regextst.c
pp |
| 1211 index ca2fd21..f440c26 100644 |
| 1212 --- a/source/test/intltest/regextst.cpp |
| 1213 +++ b/source/test/intltest/regextst.cpp |
| 1214 @@ -144,6 +144,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, c
onst char* &name, ch |
| 1215 case 24: name = "TestBug11049"; |
| 1216 if (exec) TestBug11049(); |
| 1217 break; |
| 1218 + case 25: name = "TestBug11371"; |
| 1219 + if (exec) TestBug11371(); |
| 1220 + break; |
| 1221 default: name = ""; |
| 1222 break; //needed to end loop |
| 1223 } |
| 1224 @@ -5367,6 +5370,49 @@ void RegexTest::TestCase11049(const char *pattern, const
char *data, UBool expec |
| 1225 } |
| 1226 |
| 1227 |
| 1228 +void RegexTest::TestBug11371() { |
| 1229 + if (quick) { |
| 1230 + logln("Skipping test. Runs in exhuastive mode only."); |
| 1231 + return; |
| 1232 + } |
| 1233 + UErrorCode status = U_ZERO_ERROR; |
| 1234 + UnicodeString patternString; |
| 1235 + |
| 1236 + for (int i=0; i<8000000; i++) { |
| 1237 + patternString.append(UnicodeString("()")); |
| 1238 + } |
| 1239 + LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString,
0, status)); |
| 1240 + if (status != U_REGEX_PATTERN_TOO_BIG) { |
| 1241 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s
.", |
| 1242 + __FILE__, __LINE__, u_errorName(status)); |
| 1243 + } |
| 1244 + |
| 1245 + status = U_ZERO_ERROR; |
| 1246 + patternString = "("; |
| 1247 + for (int i=0; i<20000000; i++) { |
| 1248 + patternString.append(UnicodeString("A++")); |
| 1249 + } |
| 1250 + patternString.append(UnicodeString("){0}B++")); |
| 1251 + LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString
, 0, status)); |
| 1252 + if (status != U_REGEX_PATTERN_TOO_BIG) { |
| 1253 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s
.", |
| 1254 + __FILE__, __LINE__, u_errorName(status)); |
| 1255 + } |
| 1256 + |
| 1257 + // Pattern with too much string data, such that string indexes overflow ope
rand data field size |
| 1258 + // in compiled instruction. |
| 1259 + status = U_ZERO_ERROR; |
| 1260 + patternString = ""; |
| 1261 + while (patternString.length() < 0x00ffffff) { |
| 1262 + patternString.append(UnicodeString("stuff and things dont you know, the
se are a few of my favorite strings\n")); |
| 1263 + } |
| 1264 + patternString.append(UnicodeString("X? trailing string")); |
| 1265 + LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString
, 0, status)); |
| 1266 + if (status != U_REGEX_PATTERN_TOO_BIG) { |
| 1267 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s
.", |
| 1268 + __FILE__, __LINE__, u_errorName(status)); |
| 1269 + } |
| 1270 +} |
| 1271 |
| 1272 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ |
| 1273 |
| 1274 diff --git a/source/test/intltest/regextst.h b/source/test/intltest/regextst.h |
| 1275 index 28e2121..38cc4ef 100644 |
| 1276 --- a/source/test/intltest/regextst.h |
| 1277 +++ b/source/test/intltest/regextst.h |
| 1278 @@ -50,6 +50,7 @@ public: |
| 1279 virtual void Bug10459(); |
| 1280 virtual void TestCaseInsensitiveStarters(); |
| 1281 virtual void TestBug11049(); |
| 1282 + virtual void TestBug11371(); |
| 1283 |
| 1284 // The following functions are internal to the regexp tests. |
| 1285 virtual void assertUText(const char *expected, UText *actual, const char *f
ile, int line); |
| 1286 diff --git a/source/test/testdata/regextst.txt b/source/test/testdata/regextst.t
xt |
| 1287 index 4d2e7f6..d642e8b 100644 |
| 1288 --- a/source/test/testdata/regextst.txt |
| 1289 +++ b/source/test/testdata/regextst.txt |
| 1290 @@ -1201,6 +1201,24 @@ |
| 1291 "A|B|\U00012345" "hello <0>\U00012345</0>" |
| 1292 "A|B|\U00010000" "hello \ud800" |
65 | 1293 |
66 +# Bug 11369 | 1294 +# Bug 11369 |
67 +# Incorrect optimization of patterns with a zero length quantifier {0} | 1295 +# Incorrect optimization of patterns with a zero length quantifier {0} |
68 + | 1296 + |
69 +"(.|b)(|b){0}\$(?#xxx){3}(?>\D*)" "AAAAABBBBBCCCCCDDDDEEEEE" | 1297 +"(.|b)(|b){0}\$(?#xxx){3}(?>\D*)" "AAAAABBBBBCCCCCDDDDEEEEE" |
70 +"(|b)ab(c)" "<0><1></1>ab<2>c</2></0>" | 1298 +"(|b)ab(c)" "<0><1></1>ab<2>c</2></0>" |
71 +"(|b){0}a{3}(D*)" "<0>aaa<2></2></0>" | 1299 +"(|b){0}a{3}(D*)" "<0>aaa<2></2></0>" |
72 +"(|b){0,1}a{3}(D*)" "<0><1></1>aaa<2></2></0>" | 1300 +"(|b){0,1}a{3}(D*)" "<0><1></1>aaa<2></2></0>" |
73 +"((|b){0})a{3}(D*)" "<0><1></1>aaa<3></3></0>" | 1301 +"((|b){0})a{3}(D*)" "<0><1></1>aaa<3></3></0>" |
74 + | 1302 + |
75 +# Bug 11370 | 1303 +# Bug 11370 |
76 +# Max match length computation of look-behind expression gives result that is
too big to fit in the | 1304 +# Max match length computation of look-behind expression gives result that is
too big to fit in the |
77 +# in the 24 bit operand portion of the compiled code. Expressions should fail
to compile | 1305 +# in the 24 bit operand portion of the compiled code. Expressions should fail
to compile |
78 +# (Look-behind match length must be bounded. This case is treated as unbounde
d, an error.) | 1306 +# (Look-behind match length must be bounded. This case is treated as unbounde
d, an error.) |
79 + | 1307 + |
80 +"(?<!(0123456789a){10000000})x" E "no match" | 1308 +"(?<!(0123456789a){10000000})x" E "no match" |
81 +"(?<!\\ubeaf(\\ubeaf{11000}){11000})" E "no match" | 1309 +"(?<!\\ubeaf(\\ubeaf{11000}){11000})" E "no match" |
82 + | 1310 + |
83 + | 1311 + |
84 # Random debugging, Temporary | 1312 # Random debugging, Temporary |
85 # | 1313 # |
86 #"^(?:a?b?)*$"» "a--" | 1314 |
OLD | NEW |