| OLD | NEW |
| (Empty) |
| 1 diff --git a/source/common/unicode/utypes.h b/source/common/unicode/utypes.h | |
| 2 index 704089c..1824625 100644 | |
| 3 --- a/source/common/unicode/utypes.h | |
| 4 +++ b/source/common/unicode/utypes.h | |
| 5 @@ -305,7 +305,7 @@ typedef double UDate; | |
| 6 #define U_IO_API | |
| 7 #define U_TOOLUTIL_API | |
| 8 #elif defined(U_COMMON_IMPLEMENTATION) | |
| 9 -#define U_DATA_API U_IMPORT | |
| 10 +#define U_DATA_API U_EXPORT | |
| 11 #define U_COMMON_API U_EXPORT | |
| 12 #define U_I18N_API U_IMPORT | |
| 13 #define U_LAYOUT_API U_IMPORT | |
| 14 @@ -647,6 +647,7 @@ typedef enum UErrorCode { | |
| 15 U_REGEX_STACK_OVERFLOW, /**< Regular expression backtrack sta
ck overflow. */ | |
| 16 U_REGEX_TIME_OUT, /**< Maximum allowed match time excee
ded */ | |
| 17 U_REGEX_STOPPED_BY_CALLER, /**< Matching operation aborted by us
er callback fn. */ | |
| 18 + U_REGEX_PATTERN_TOO_BIG, /**< Pattern exceeds limits on size o
r complexity. @draft ICU 55 */ | |
| 19 U_REGEX_ERROR_LIMIT, /**< This must always be the last val
ue to indicate the limit for regexp errors */ | |
| 20 | |
| 21 /* | |
| 22 diff --git a/source/common/utypes.c b/source/common/utypes.c | |
| 23 index c28e727..32b6d88 100644 | |
| 24 --- a/source/common/utypes.c | |
| 25 +++ b/source/common/utypes.c | |
| 26 @@ -1,7 +1,7 @@ | |
| 27 /* | |
| 28 ****************************************************************************** | |
| 29 * | |
| 30 -* Copyright (C) 1997-2011, International Business Machines | |
| 31 +* Copyright (C) 1997-2014, International Business Machines | |
| 32 * Corporation and others. All Rights Reserved. | |
| 33 * | |
| 34 ****************************************************************************** | |
| 35 @@ -165,7 +165,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START]
= { | |
| 36 "U_REGEX_INVALID_RANGE", | |
| 37 "U_REGEX_STACK_OVERFLOW", | |
| 38 "U_REGEX_TIME_OUT", | |
| 39 - "U_REGEX_STOPPED_BY_CALLER" | |
| 40 + "U_REGEX_STOPPED_BY_CALLER", | |
| 41 + "U_REGEX_PATTERN_TOO_BIG" | |
| 42 }; | |
| 43 | |
| 44 static const char * const | |
| 45 diff --git a/source/i18n/regexcmp.cpp b/source/i18n/regexcmp.cpp | |
| 46 index 0816eec..0c2196f 100644 | |
| 47 --- a/source/i18n/regexcmp.cpp | |
| 48 +++ b/source/i18n/regexcmp.cpp | |
| 49 @@ -301,7 +301,7 @@ void RegexCompile::compile( | |
| 50 // present in the saved state: the input string position (int64_t) and | |
| 51 // the position in the compiled pattern. | |
| 52 // | |
| 53 - fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; | |
| 54 + allocateStackData(RESTACKFRAME_HDRCOUNT); | |
| 55 | |
| 56 // | |
| 57 // Optimization pass 1: NOPs, back-references, and case-folding | |
| 58 @@ -367,9 +367,9 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 59 // the start of an ( grouping. | |
| 60 //4 NOP Resreved, will be replaced by a save if there are | |
| 61 // OR | operators at the top level | |
| 62 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus
); | |
| 63 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); | |
| 64 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); | |
| 65 + appendOp(URX_STATE_SAVE, 2); | |
| 66 + appendOp(URX_JMP, 3); | |
| 67 + appendOp(URX_FAIL, 0); | |
| 68 | |
| 69 // Standard open nonCapture paren action emits the two NOPs and | |
| 70 // sets up the paren stack frame. | |
| 71 @@ -392,7 +392,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 72 } | |
| 73 | |
| 74 // add the END operation to the compiled pattern. | |
| 75 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); | |
| 76 + appendOp(URX_END, 0); | |
| 77 | |
| 78 // Terminate the pattern compilation state machine. | |
| 79 returnVal = FALSE; | |
| 80 @@ -414,14 +414,13 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 81 int32_t savePosition = fParenStack.popi(); | |
| 82 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition
); | |
| 83 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserve
d location | |
| 84 - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); | |
| 85 + op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); | |
| 86 fRXPat->fCompiledPat->setElementAt(op, savePosition); | |
| 87 | |
| 88 // Append an JMP operation into the compiled pattern. The operand
for | |
| 89 // the JMP will eventually be the location following the ')' for t
he | |
| 90 // group. This will be patched in later, when the ')' is encounte
red. | |
| 91 - op = URX_BUILD(URX_JMP, 0); | |
| 92 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 93 + appendOp(URX_JMP, 0); | |
| 94 | |
| 95 // Push the position of the newly added JMP op onto the parentheses
stack. | |
| 96 // This registers if for fixup when this block's close paren is enc
ountered. | |
| 97 @@ -430,7 +429,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 98 // Append a NOP to the compiled pattern. This is the slot reserved | |
| 99 // for a SAVE in the event that there is yet another '|' followin
g | |
| 100 // this one. | |
| 101 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
| 102 + appendOp(URX_NOP, 0); | |
| 103 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); | |
| 104 } | |
| 105 break; | |
| 106 @@ -456,12 +455,10 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 107 // END_CAPTURE is encountered. | |
| 108 { | |
| 109 fixLiterals(); | |
| 110 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
| 111 - int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots
in match stack frame. | |
| 112 - fRXPat->fFrameSize += 3; | |
| 113 - int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); | |
| 114 - fRXPat->fCompiledPat->addElement(cop, *fStatus); | |
| 115 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
| 116 + appendOp(URX_NOP, 0); | |
| 117 + int32_t varsLoc = allocateStackData(3); // Reserve three slots
in match stack frame. | |
| 118 + appendOp(URX_START_CAPTURE, varsLoc); | |
| 119 + appendOp(URX_NOP, 0); | |
| 120 | |
| 121 // On the Parentheses stack, start a new frame and add the postions | |
| 122 // of the two NOPs. Depending on what follows in the pattern, th
e | |
| 123 @@ -486,8 +483,8 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 124 // is an '|' alternation within the parens. | |
| 125 { | |
| 126 fixLiterals(); | |
| 127 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
| 128 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
| 129 + appendOp(URX_NOP, 0); | |
| 130 + appendOp(URX_NOP, 0); | |
| 131 | |
| 132 // On the Parentheses stack, start a new frame and add the postions | |
| 133 // of the two NOPs. | |
| 134 @@ -509,12 +506,10 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 135 // is an '|' alternation within the parens. | |
| 136 { | |
| 137 fixLiterals(); | |
| 138 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
| 139 - int32_t varLoc = fRXPat->fDataSize; // Reserve a data locati
on for saving the | |
| 140 - fRXPat->fDataSize += 1; // state stack ptr. | |
| 141 - int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); | |
| 142 - fRXPat->fCompiledPat->addElement(stoOp, *fStatus); | |
| 143 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
| 144 + appendOp(URX_NOP, 0); | |
| 145 + int32_t varLoc = allocateData(1); // Reserve a data location fo
r saving the state stack ptr. | |
| 146 + appendOp(URX_STO_SP, varLoc); | |
| 147 + appendOp(URX_NOP, 0); | |
| 148 | |
| 149 // On the Parentheses stack, start a new frame and add the postions | |
| 150 // of the two NOPs. Depending on what follows in the pattern, th
e | |
| 151 @@ -557,26 +552,14 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 152 // Two data slots are reserved, for saving the stack ptr and the input
position. | |
| 153 { | |
| 154 fixLiterals(); | |
| 155 - int32_t dataLoc = fRXPat->fDataSize; | |
| 156 - fRXPat->fDataSize += 2; | |
| 157 - int32_t op = URX_BUILD(URX_LA_START, dataLoc); | |
| 158 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 159 - | |
| 160 - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); | |
| 161 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 162 - | |
| 163 - op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); | |
| 164 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 165 - | |
| 166 - op = URX_BUILD(URX_LA_END, dataLoc); | |
| 167 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 168 - | |
| 169 - op = URX_BUILD(URX_BACKTRACK, 0); | |
| 170 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 171 - | |
| 172 - op = URX_BUILD(URX_NOP, 0); | |
| 173 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 174 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 175 + int32_t dataLoc = allocateData(2); | |
| 176 + appendOp(URX_LA_START, dataLoc); | |
| 177 + appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); | |
| 178 + appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3); | |
| 179 + appendOp(URX_LA_END, dataLoc); | |
| 180 + appendOp(URX_BACKTRACK, 0); | |
| 181 + appendOp(URX_NOP, 0); | |
| 182 + appendOp(URX_NOP, 0); | |
| 183 | |
| 184 // On the Parentheses stack, start a new frame and add the postions | |
| 185 // of the NOPs. | |
| 186 @@ -601,16 +584,10 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 187 // an alternate (transparent) re
gion. | |
| 188 { | |
| 189 fixLiterals(); | |
| 190 - int32_t dataLoc = fRXPat->fDataSize; | |
| 191 - fRXPat->fDataSize += 2; | |
| 192 - int32_t op = URX_BUILD(URX_LA_START, dataLoc); | |
| 193 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 194 - | |
| 195 - op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patch
ed later. | |
| 196 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 197 - | |
| 198 - op = URX_BUILD(URX_NOP, 0); | |
| 199 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 200 + int32_t dataLoc = allocateData(2); | |
| 201 + appendOp(URX_LA_START, dataLoc); | |
| 202 + appendOp(URX_STATE_SAVE, 0); // dest address will be patched lat
er. | |
| 203 + appendOp(URX_NOP, 0); | |
| 204 | |
| 205 // On the Parentheses stack, start a new frame and add the postions | |
| 206 // of the StateSave and NOP. | |
| 207 @@ -648,23 +625,19 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 208 fixLiterals(); | |
| 209 | |
| 210 // Allocate data space | |
| 211 - int32_t dataLoc = fRXPat->fDataSize; | |
| 212 - fRXPat->fDataSize += 4; | |
| 213 + int32_t dataLoc = allocateData(4); | |
| 214 | |
| 215 // Emit URX_LB_START | |
| 216 - int32_t op = URX_BUILD(URX_LB_START, dataLoc); | |
| 217 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 218 + appendOp(URX_LB_START, dataLoc); | |
| 219 | |
| 220 // Emit URX_LB_CONT | |
| 221 - op = URX_BUILD(URX_LB_CONT, dataLoc); | |
| 222 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 223 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLengt
h. To be filled later. | |
| 224 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLengt
h. To be filled later. | |
| 225 + appendOp(URX_LB_CONT, dataLoc); | |
| 226 + appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled l
ater. | |
| 227 + appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled l
ater. | |
| 228 | |
| 229 - // Emit the NOP | |
| 230 - op = URX_BUILD(URX_NOP, 0); | |
| 231 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 232 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 233 + // Emit the NOPs | |
| 234 + appendOp(URX_NOP, 0); | |
| 235 + appendOp(URX_NOP, 0); | |
| 236 | |
| 237 // On the Parentheses stack, start a new frame and add the postions | |
| 238 // of the URX_LB_CONT and the NOP. | |
| 239 @@ -704,24 +677,20 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 240 fixLiterals(); | |
| 241 | |
| 242 // Allocate data space | |
| 243 - int32_t dataLoc = fRXPat->fDataSize; | |
| 244 - fRXPat->fDataSize += 4; | |
| 245 + int32_t dataLoc = allocateData(4); | |
| 246 | |
| 247 // Emit URX_LB_START | |
| 248 - int32_t op = URX_BUILD(URX_LB_START, dataLoc); | |
| 249 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 250 + appendOp(URX_LB_START, dataLoc); | |
| 251 | |
| 252 // Emit URX_LBN_CONT | |
| 253 - op = URX_BUILD(URX_LBN_CONT, dataLoc); | |
| 254 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 255 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLengt
h. To be filled later. | |
| 256 - fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLengt
h. To be filled later. | |
| 257 - fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc.
To be filled later. | |
| 258 + appendOp(URX_LBN_CONT, dataLoc); | |
| 259 + appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled l
ater. | |
| 260 + appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled l
ater. | |
| 261 + appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled l
ater. | |
| 262 | |
| 263 - // Emit the NOP | |
| 264 - op = URX_BUILD(URX_NOP, 0); | |
| 265 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 266 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 267 + // Emit the NOPs | |
| 268 + appendOp(URX_NOP, 0); | |
| 269 + appendOp(URX_NOP, 0); | |
| 270 | |
| 271 // On the Parentheses stack, start a new frame and add the postions | |
| 272 // of the URX_LB_CONT and the NOP. | |
| 273 @@ -791,12 +760,9 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 274 | |
| 275 if (URX_TYPE(repeatedOp) == URX_SETREF) { | |
| 276 // Emit optimized code for [char set]+ | |
| 277 - int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeated
Op)); | |
| 278 - fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); | |
| 279 - frameLoc = fRXPat->fFrameSize; | |
| 280 - fRXPat->fFrameSize++; | |
| 281 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | |
| 282 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
| 283 + appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); | |
| 284 + frameLoc = allocateStackData(1); | |
| 285 + appendOp(URX_LOOP_C, frameLoc); | |
| 286 break; | |
| 287 } | |
| 288 | |
| 289 @@ -804,7 +770,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 290 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || | |
| 291 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { | |
| 292 // Emit Optimized code for .+ operations. | |
| 293 - int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | |
| 294 + int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); | |
| 295 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | |
| 296 // URX_LOOP_DOT_I operand is a flag indicating ". match
es any" mode. | |
| 297 loopOpI |= 1; | |
| 298 @@ -812,11 +778,9 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 299 if (fModeFlags & UREGEX_UNIX_LINES) { | |
| 300 loopOpI |= 2; | |
| 301 } | |
| 302 - fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); | |
| 303 - frameLoc = fRXPat->fFrameSize; | |
| 304 - fRXPat->fFrameSize++; | |
| 305 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); | |
| 306 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
| 307 + appendOp(loopOpI); | |
| 308 + frameLoc = allocateStackData(1); | |
| 309 + appendOp(URX_LOOP_C, frameLoc); | |
| 310 break; | |
| 311 } | |
| 312 | |
| 313 @@ -830,18 +794,15 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 314 // Zero length match is possible. | |
| 315 // Emit the code sequence that can handle it. | |
| 316 insertOp(topLoc); | |
| 317 - frameLoc = fRXPat->fFrameSize; | |
| 318 - fRXPat->fFrameSize++; | |
| 319 + frameLoc = allocateStackData(1); | |
| 320 | |
| 321 - int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); | |
| 322 + int32_t op = buildOp(URX_STO_INP_LOC, frameLoc); | |
| 323 fRXPat->fCompiledPat->setElementAt(op, topLoc); | |
| 324 | |
| 325 - op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); | |
| 326 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 327 + appendOp(URX_JMP_SAV_X, topLoc+1); | |
| 328 } else { | |
| 329 // Simpler code when the repeated body must match something non
-empty | |
| 330 - int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); | |
| 331 - fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); | |
| 332 + appendOp(URX_JMP_SAV, topLoc); | |
| 333 } | |
| 334 } | |
| 335 break; | |
| 336 @@ -853,8 +814,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 337 // 3. ... | |
| 338 { | |
| 339 int32_t topLoc = blockTopLoc(FALSE); | |
| 340 - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); | |
| 341 - fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); | |
| 342 + appendOp(URX_STATE_SAVE, topLoc); | |
| 343 } | |
| 344 break; | |
| 345 | |
| 346 @@ -868,7 +828,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 347 // Insert the state save into the compiled pattern, and we're done. | |
| 348 { | |
| 349 int32_t saveStateLoc = blockTopLoc(TRUE); | |
| 350 - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompile
dPat->size()); | |
| 351 + int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledP
at->size()); | |
| 352 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); | |
| 353 } | |
| 354 break; | |
| 355 @@ -887,14 +847,12 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 356 int32_t jmp1_loc = blockTopLoc(TRUE); | |
| 357 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); | |
| 358 | |
| 359 - int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); | |
| 360 + int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1); | |
| 361 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); | |
| 362 | |
| 363 - int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); | |
| 364 - fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); | |
| 365 + appendOp(URX_JMP, jmp2_loc+2); | |
| 366 | |
| 367 - int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); | |
| 368 - fRXPat->fCompiledPat->addElement(save_op, *fStatus); | |
| 369 + appendOp(URX_STATE_SAVE, jmp1_loc+1); | |
| 370 } | |
| 371 break; | |
| 372 | |
| 373 @@ -934,12 +892,10 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 374 | |
| 375 if (URX_TYPE(repeatedOp) == URX_SETREF) { | |
| 376 // Emit optimized code for a [char set]* | |
| 377 - int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeated
Op)); | |
| 378 + int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp
)); | |
| 379 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | |
| 380 - dataLoc = fRXPat->fFrameSize; | |
| 381 - fRXPat->fFrameSize++; | |
| 382 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | |
| 383 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
| 384 + dataLoc = allocateStackData(1); | |
| 385 + appendOp(URX_LOOP_C, dataLoc); | |
| 386 break; | |
| 387 } | |
| 388 | |
| 389 @@ -947,7 +903,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 390 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || | |
| 391 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { | |
| 392 // Emit Optimized code for .* operations. | |
| 393 - int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); | |
| 394 + int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); | |
| 395 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { | |
| 396 // URX_LOOP_DOT_I operand is a flag indicating . matche
s any mode. | |
| 397 loopOpI |= 1; | |
| 398 @@ -956,10 +912,8 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 399 loopOpI |= 2; | |
| 400 } | |
| 401 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); | |
| 402 - dataLoc = fRXPat->fFrameSize; | |
| 403 - fRXPat->fFrameSize++; | |
| 404 - int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); | |
| 405 - fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); | |
| 406 + dataLoc = allocateStackData(1); | |
| 407 + appendOp(URX_LOOP_C, dataLoc); | |
| 408 break; | |
| 409 } | |
| 410 } | |
| 411 @@ -968,30 +922,29 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 412 // The optimizations did not apply. | |
| 413 | |
| 414 int32_t saveStateLoc = blockTopLoc(TRUE); | |
| 415 - int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); | |
| 416 + int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1); | |
| 417 | |
| 418 // Check for minimum match length of zero, which requires | |
| 419 // extra loop-breaking code. | |
| 420 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) ==
0) { | |
| 421 insertOp(saveStateLoc); | |
| 422 - dataLoc = fRXPat->fFrameSize; | |
| 423 - fRXPat->fFrameSize++; | |
| 424 + dataLoc = allocateStackData(1); | |
| 425 | |
| 426 - int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); | |
| 427 + int32_t op = buildOp(URX_STO_INP_LOC, dataLoc); | |
| 428 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); | |
| 429 - jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); | |
| 430 + jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2); | |
| 431 } | |
| 432 | |
| 433 // Locate the position in the compiled pattern where the match will
continue | |
| 434 // after completing the *. (4 or 5 in the comment above) | |
| 435 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | |
| 436 | |
| 437 - // Put together the save state op store it into the compiled code. | |
| 438 - int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); | |
| 439 + // Put together the save state op and store it into the compiled co
de. | |
| 440 + int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc); | |
| 441 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); | |
| 442 | |
| 443 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pat
tern. | |
| 444 - fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); | |
| 445 + appendOp(jmpOp); | |
| 446 } | |
| 447 break; | |
| 448 | |
| 449 @@ -1005,10 +958,9 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 450 { | |
| 451 int32_t jmpLoc = blockTopLoc(TRUE); // loc
1. | |
| 452 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc
3. | |
| 453 - int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); | |
| 454 - int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); | |
| 455 + int32_t jmpOp = buildOp(URX_JMP, saveLoc); | |
| 456 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); | |
| 457 - fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); | |
| 458 + appendOp(URX_STATE_SAVE, jmpLoc+1); | |
| 459 } | |
| 460 break; | |
| 461 | |
| 462 @@ -1077,9 +1029,9 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 463 | |
| 464 // First the STO_SP before the start of the loop | |
| 465 insertOp(topLoc); | |
| 466 - int32_t varLoc = fRXPat->fDataSize; // Reserve a data locati
on for saving the | |
| 467 - fRXPat->fDataSize += 1; // state stack ptr. | |
| 468 - int32_t op = URX_BUILD(URX_STO_SP, varLoc); | |
| 469 + | |
| 470 + int32_t varLoc = allocateData(1); // Reserve a data location for
saving the state stack ptr. | |
| 471 + int32_t op = buildOp(URX_STO_SP, varLoc); | |
| 472 fRXPat->fCompiledPat->setElementAt(op, topLoc); | |
| 473 | |
| 474 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); | |
| 475 @@ -1088,8 +1040,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 476 fRXPat->fCompiledPat->push(loopOp, *fStatus); | |
| 477 | |
| 478 // Then the LD_SP after the end of the loop | |
| 479 - op = URX_BUILD(URX_LD_SP, varLoc); | |
| 480 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 481 + appendOp(URX_LD_SP, varLoc); | |
| 482 } | |
| 483 | |
| 484 break; | |
| 485 @@ -1125,55 +1076,49 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 486 // scanned a ".", match any single character. | |
| 487 { | |
| 488 fixLiterals(FALSE); | |
| 489 - int32_t op; | |
| 490 if (fModeFlags & UREGEX_DOTALL) { | |
| 491 - op = URX_BUILD(URX_DOTANY_ALL, 0); | |
| 492 + appendOp(URX_DOTANY_ALL, 0); | |
| 493 } else if (fModeFlags & UREGEX_UNIX_LINES) { | |
| 494 - op = URX_BUILD(URX_DOTANY_UNIX, 0); | |
| 495 + appendOp(URX_DOTANY_UNIX, 0); | |
| 496 } else { | |
| 497 - op = URX_BUILD(URX_DOTANY, 0); | |
| 498 + appendOp(URX_DOTANY, 0); | |
| 499 } | |
| 500 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 501 } | |
| 502 break; | |
| 503 | |
| 504 case doCaret: | |
| 505 { | |
| 506 fixLiterals(FALSE); | |
| 507 - int32_t op = 0; | |
| 508 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UR
EGEX_UNIX_LINES) == 0) { | |
| 509 - op = URX_CARET; | |
| 510 + appendOp(URX_CARET, 0); | |
| 511 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UR
EGEX_UNIX_LINES) == 0) { | |
| 512 - op = URX_CARET_M; | |
| 513 + appendOp(URX_CARET_M, 0); | |
| 514 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UR
EGEX_UNIX_LINES) != 0) { | |
| 515 - op = URX_CARET; // Only testing true start of input. | |
| 516 + appendOp(URX_CARET, 0); // Only testing true start of input. | |
| 517 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UR
EGEX_UNIX_LINES) != 0) { | |
| 518 - op = URX_CARET_M_UNIX; | |
| 519 + appendOp(URX_CARET_M_UNIX, 0); | |
| 520 } | |
| 521 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | |
| 522 } | |
| 523 break; | |
| 524 | |
| 525 case doDollar: | |
| 526 { | |
| 527 fixLiterals(FALSE); | |
| 528 - int32_t op = 0; | |
| 529 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UR
EGEX_UNIX_LINES) == 0) { | |
| 530 - op = URX_DOLLAR; | |
| 531 + appendOp(URX_DOLLAR, 0); | |
| 532 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UR
EGEX_UNIX_LINES) == 0) { | |
| 533 - op = URX_DOLLAR_M; | |
| 534 + appendOp(URX_DOLLAR_M, 0); | |
| 535 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UR
EGEX_UNIX_LINES) != 0) { | |
| 536 - op = URX_DOLLAR_D; | |
| 537 + appendOp(URX_DOLLAR_D, 0); | |
| 538 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UR
EGEX_UNIX_LINES) != 0) { | |
| 539 - op = URX_DOLLAR_MD; | |
| 540 + appendOp(URX_DOLLAR_MD, 0); | |
| 541 } | |
| 542 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | |
| 543 } | |
| 544 break; | |
| 545 | |
| 546 case doBackslashA: | |
| 547 fixLiterals(FALSE); | |
| 548 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); | |
| 549 + appendOp(URX_CARET, 0); | |
| 550 break; | |
| 551 | |
| 552 case doBackslashB: | |
| 553 @@ -1185,7 +1130,7 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 554 #endif | |
| 555 fixLiterals(FALSE); | |
| 556 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BA
CKSLASH_B; | |
| 557 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); | |
| 558 + appendOp(op, 1); | |
| 559 } | |
| 560 break; | |
| 561 | |
| 562 @@ -1198,63 +1143,59 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 563 #endif | |
| 564 fixLiterals(FALSE); | |
| 565 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BA
CKSLASH_B; | |
| 566 - fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); | |
| 567 + appendOp(op, 0); | |
| 568 } | |
| 569 break; | |
| 570 | |
| 571 case doBackslashD: | |
| 572 fixLiterals(FALSE); | |
| 573 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatu
s); | |
| 574 + appendOp(URX_BACKSLASH_D, 1); | |
| 575 break; | |
| 576 | |
| 577 case doBackslashd: | |
| 578 fixLiterals(FALSE); | |
| 579 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatu
s); | |
| 580 + appendOp(URX_BACKSLASH_D, 0); | |
| 581 break; | |
| 582 | |
| 583 case doBackslashG: | |
| 584 fixLiterals(FALSE); | |
| 585 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatu
s); | |
| 586 + appendOp(URX_BACKSLASH_G, 0); | |
| 587 break; | |
| 588 | |
| 589 case doBackslashS: | |
| 590 fixLiterals(FALSE); | |
| 591 - fRXPat->fCompiledPat->addElement( | |
| 592 - URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); | |
| 593 + appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); | |
| 594 break; | |
| 595 | |
| 596 case doBackslashs: | |
| 597 fixLiterals(FALSE); | |
| 598 - fRXPat->fCompiledPat->addElement( | |
| 599 - URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); | |
| 600 + appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); | |
| 601 break; | |
| 602 | |
| 603 case doBackslashW: | |
| 604 fixLiterals(FALSE); | |
| 605 - fRXPat->fCompiledPat->addElement( | |
| 606 - URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); | |
| 607 + appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); | |
| 608 break; | |
| 609 | |
| 610 case doBackslashw: | |
| 611 fixLiterals(FALSE); | |
| 612 - fRXPat->fCompiledPat->addElement( | |
| 613 - URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); | |
| 614 + appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); | |
| 615 break; | |
| 616 | |
| 617 case doBackslashX: | |
| 618 fixLiterals(FALSE); | |
| 619 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatu
s); | |
| 620 + appendOp(URX_BACKSLASH_X, 0); | |
| 621 break; | |
| 622 | |
| 623 | |
| 624 case doBackslashZ: | |
| 625 fixLiterals(FALSE); | |
| 626 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); | |
| 627 + appendOp(URX_DOLLAR, 0); | |
| 628 break; | |
| 629 | |
| 630 case doBackslashz: | |
| 631 fixLiterals(FALSE); | |
| 632 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatu
s); | |
| 633 + appendOp(URX_BACKSLASH_Z, 0); | |
| 634 break; | |
| 635 | |
| 636 case doEscapeError: | |
| 637 @@ -1314,13 +1255,11 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 638 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal
escape sequence, | |
| 639 // and shouldn't enter this code path a
t all. | |
| 640 fixLiterals(FALSE); | |
| 641 - int32_t op; | |
| 642 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | |
| 643 - op = URX_BUILD(URX_BACKREF_I, groupNum); | |
| 644 + appendOp(URX_BACKREF_I, groupNum); | |
| 645 } else { | |
| 646 - op = URX_BUILD(URX_BACKREF, groupNum); | |
| 647 + appendOp(URX_BACKREF, groupNum); | |
| 648 } | |
| 649 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 650 } | |
| 651 break; | |
| 652 | |
| 653 @@ -1341,22 +1280,18 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 654 { | |
| 655 // Emit the STO_SP | |
| 656 int32_t topLoc = blockTopLoc(TRUE); | |
| 657 - int32_t stoLoc = fRXPat->fDataSize; | |
| 658 - fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. | |
| 659 - int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | |
| 660 + int32_t stoLoc = allocateData(1); // Reserve the data location f
or storing save stack ptr. | |
| 661 + int32_t op = buildOp(URX_STO_SP, stoLoc); | |
| 662 fRXPat->fCompiledPat->setElementAt(op, topLoc); | |
| 663 | |
| 664 // Emit the STATE_SAVE | |
| 665 - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); | |
| 666 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 667 + appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); | |
| 668 | |
| 669 // Emit the JMP | |
| 670 - op = URX_BUILD(URX_JMP, topLoc+1); | |
| 671 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 672 + appendOp(URX_JMP, topLoc+1); | |
| 673 | |
| 674 // Emit the LD_SP | |
| 675 - op = URX_BUILD(URX_LD_SP, stoLoc); | |
| 676 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 677 + appendOp(URX_LD_SP, stoLoc); | |
| 678 } | |
| 679 break; | |
| 680 | |
| 681 @@ -1376,23 +1311,20 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 682 insertOp(topLoc); | |
| 683 | |
| 684 // emit STO_SP loc | |
| 685 - int32_t stoLoc = fRXPat->fDataSize; | |
| 686 - fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. | |
| 687 - int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | |
| 688 + int32_t stoLoc = allocateData(1); // Reserve the data location
for storing save stack ptr. | |
| 689 + int32_t op = buildOp(URX_STO_SP, stoLoc); | |
| 690 fRXPat->fCompiledPat->setElementAt(op, topLoc); | |
| 691 | |
| 692 // Emit the SAVE_STATE 5 | |
| 693 int32_t L7 = fRXPat->fCompiledPat->size()+1; | |
| 694 - op = URX_BUILD(URX_STATE_SAVE, L7); | |
| 695 + op = buildOp(URX_STATE_SAVE, L7); | |
| 696 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | |
| 697 | |
| 698 // Append the JMP operation. | |
| 699 - op = URX_BUILD(URX_JMP, topLoc+1); | |
| 700 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 701 + appendOp(URX_JMP, topLoc+1); | |
| 702 | |
| 703 // Emit the LD_SP loc | |
| 704 - op = URX_BUILD(URX_LD_SP, stoLoc); | |
| 705 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 706 + appendOp(URX_LD_SP, stoLoc); | |
| 707 } | |
| 708 break; | |
| 709 | |
| 710 @@ -1411,19 +1343,17 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 711 insertOp(topLoc); | |
| 712 | |
| 713 // Emit the STO_SP | |
| 714 - int32_t stoLoc = fRXPat->fDataSize; | |
| 715 - fRXPat->fDataSize++; // Reserve the data location for storing
save stack ptr. | |
| 716 - int32_t op = URX_BUILD(URX_STO_SP, stoLoc); | |
| 717 + int32_t stoLoc = allocateData(1); // Reserve the data location
for storing save stack ptr. | |
| 718 + int32_t op = buildOp(URX_STO_SP, stoLoc); | |
| 719 fRXPat->fCompiledPat->setElementAt(op, topLoc); | |
| 720 | |
| 721 // Emit the SAVE_STATE | |
| 722 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; | |
| 723 - op = URX_BUILD(URX_STATE_SAVE, continueLoc); | |
| 724 + op = buildOp(URX_STATE_SAVE, continueLoc); | |
| 725 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); | |
| 726 | |
| 727 // Emit the LD_SP | |
| 728 - op = URX_BUILD(URX_LD_SP, stoLoc); | |
| 729 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 730 + appendOp(URX_LD_SP, stoLoc); | |
| 731 } | |
| 732 break; | |
| 733 | |
| 734 @@ -1480,8 +1410,8 @@ UBool RegexCompile::doParseActions(int32_t action) | |
| 735 // is an '|' alternation within the parens. | |
| 736 { | |
| 737 fixLiterals(FALSE); | |
| 738 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
| 739 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); | |
| 740 + appendOp(URX_NOP, 0); | |
| 741 + appendOp(URX_NOP, 0); | |
| 742 | |
| 743 // On the Parentheses stack, start a new frame and add the postions | |
| 744 // of the two NOPs (a normal non-capturing () frame, except for t
he | |
| 745 @@ -1818,7 +1748,6 @@ void RegexCompile::literalChar(UChar32 c) { | |
| 746 // | |
| 747 //-----------------------------------------------------------------------------
- | |
| 748 void RegexCompile::fixLiterals(UBool split) { | |
| 749 - int32_t op = 0; // An op from/for the compiled patte
rn. | |
| 750 | |
| 751 // If no literal characters have been scanned but not yet had code generate
d | |
| 752 // for them, nothing needs to be done. | |
| 753 @@ -1857,23 +1786,23 @@ void RegexCompile::fixLiterals(UBool split) { | |
| 754 // Single character, emit a URX_ONECHAR op to match it. | |
| 755 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && | |
| 756 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { | |
| 757 - op = URX_BUILD(URX_ONECHAR_I, lastCodePoint); | |
| 758 + appendOp(URX_ONECHAR_I, lastCodePoint); | |
| 759 } else { | |
| 760 - op = URX_BUILD(URX_ONECHAR, lastCodePoint); | |
| 761 + appendOp(URX_ONECHAR, lastCodePoint); | |
| 762 } | |
| 763 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 764 } else { | |
| 765 // Two or more chars, emit a URX_STRING to match them. | |
| 766 + if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length(
) > 0x00ffffff) { | |
| 767 + error(U_REGEX_PATTERN_TOO_BIG); | |
| 768 + } | |
| 769 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { | |
| 770 - op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length()); | |
| 771 + appendOp(URX_STRING_I, fRXPat->fLiteralText.length()); | |
| 772 } else { | |
| 773 // TODO here: add optimization to split case sensitive strings of
length two | |
| 774 // into two single char ops, for efficiency. | |
| 775 - op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length()); | |
| 776 + appendOp(URX_STRING, fRXPat->fLiteralText.length()); | |
| 777 } | |
| 778 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 779 - op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length()); | |
| 780 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 781 + appendOp(URX_STRING_LEN, fLiteralChars.length()); | |
| 782 | |
| 783 // Add this string into the accumulated strings of the compiled pattern
. | |
| 784 fRXPat->fLiteralText.append(fLiteralChars); | |
| 785 @@ -1883,8 +1812,58 @@ void RegexCompile::fixLiterals(UBool split) { | |
| 786 } | |
| 787 | |
| 788 | |
| 789 +int32_t RegexCompile::buildOp(int32_t type, int32_t val) { | |
| 790 + if (U_FAILURE(*fStatus)) { | |
| 791 + return 0; | |
| 792 + } | |
| 793 + if (type < 0 || type > 255) { | |
| 794 + U_ASSERT(FALSE); | |
| 795 + error(U_REGEX_INTERNAL_ERROR); | |
| 796 + type = URX_RESERVED_OP; | |
| 797 + } | |
| 798 + if (val > 0x00ffffff) { | |
| 799 + U_ASSERT(FALSE); | |
| 800 + error(U_REGEX_INTERNAL_ERROR); | |
| 801 + val = 0; | |
| 802 + } | |
| 803 + if (val < 0) { | |
| 804 + if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) { | |
| 805 + U_ASSERT(FALSE); | |
| 806 + error(U_REGEX_INTERNAL_ERROR); | |
| 807 + return -1; | |
| 808 + } | |
| 809 + if (URX_TYPE(val) != 0xff) { | |
| 810 + U_ASSERT(FALSE); | |
| 811 + error(U_REGEX_INTERNAL_ERROR); | |
| 812 + return -1; | |
| 813 + } | |
| 814 + type = URX_RESERVED_OP_N; | |
| 815 + } | |
| 816 + return (type << 24) | val; | |
| 817 +} | |
| 818 | |
| 819 | |
| 820 +//-----------------------------------------------------------------------------
- | |
| 821 +// | |
| 822 +// appendOp() Append a new instruction onto the compiled pattern | |
| 823 +// Includes error checking, limiting the size of the | |
| 824 +// pattern to lengths that can be represented in the | |
| 825 +// 24 bit operand field of an instruction. | |
| 826 +// | |
| 827 +//-----------------------------------------------------------------------------
- | |
| 828 +void RegexCompile::appendOp(int32_t op) { | |
| 829 + if (U_FAILURE(*fStatus)) { | |
| 830 + return; | |
| 831 + } | |
| 832 + fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 833 + if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) { | |
| 834 + error(U_REGEX_PATTERN_TOO_BIG); | |
| 835 + } | |
| 836 +} | |
| 837 + | |
| 838 +void RegexCompile::appendOp(int32_t type, int32_t val) { | |
| 839 + appendOp(buildOp(type, val)); | |
| 840 +} | |
| 841 | |
| 842 | |
| 843 //-----------------------------------------------------------------------------
- | |
| 844 @@ -1900,7 +1879,7 @@ void RegexCompile::insertOp(int32_t where) { | |
| 845 UVector64 *code = fRXPat->fCompiledPat; | |
| 846 U_ASSERT(where>0 && where < code->size()); | |
| 847 | |
| 848 - int32_t nop = URX_BUILD(URX_NOP, 0); | |
| 849 + int32_t nop = buildOp(URX_NOP, 0); | |
| 850 code->insertElementAt(nop, where, *fStatus); | |
| 851 | |
| 852 // Walk through the pattern, looking for any ops with targets that | |
| 853 @@ -1921,7 +1900,7 @@ void RegexCompile::insertOp(int32_t where) { | |
| 854 // Target location for this opcode is after the insertion point and | |
| 855 // needs to be incremented to adjust for the insertion. | |
| 856 opValue++; | |
| 857 - op = URX_BUILD(opType, opValue); | |
| 858 + op = buildOp(opType, opValue); | |
| 859 code->setElementAt(op, loc); | |
| 860 } | |
| 861 } | |
| 862 @@ -1946,6 +1925,58 @@ void RegexCompile::insertOp(int32_t where) { | |
| 863 } | |
| 864 | |
| 865 | |
| 866 +//-----------------------------------------------------------------------------
- | |
| 867 +// | |
| 868 +// allocateData() Allocate storage in the matcher's static data area. | |
| 869 +// Return the index for the newly allocated data. | |
| 870 +// The storage won't actually exist until we are runnin
g a match | |
| 871 +// operation, but the storage indexes are inserted into
various | |
| 872 +// opcodes while compiling the pattern. | |
| 873 +// | |
| 874 +//-----------------------------------------------------------------------------
- | |
| 875 +int32_t RegexCompile::allocateData(int32_t size) { | |
| 876 + if (U_FAILURE(*fStatus)) { | |
| 877 + return 0; | |
| 878 + } | |
| 879 + if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) { | |
| 880 + error(U_REGEX_INTERNAL_ERROR); | |
| 881 + return 0; | |
| 882 + } | |
| 883 + int32_t dataIndex = fRXPat->fDataSize; | |
| 884 + fRXPat->fDataSize += size; | |
| 885 + if (fRXPat->fDataSize >= 0x00fffff0) { | |
| 886 + error(U_REGEX_INTERNAL_ERROR); | |
| 887 + } | |
| 888 + return dataIndex; | |
| 889 +} | |
| 890 + | |
| 891 + | |
| 892 +//-----------------------------------------------------------------------------
- | |
| 893 +// | |
| 894 +// allocateStackData() Allocate space in the back-tracking stack frame. | |
| 895 +// Return the index for the newly allocated data. | |
| 896 +// The frame indexes are inserted into various | |
| 897 +// opcodes while compiling the pattern, meaning that fr
ame | |
| 898 +// size must be restricted to the size that will fit | |
| 899 +// as an operand (24 bits). | |
| 900 +// | |
| 901 +//-----------------------------------------------------------------------------
- | |
| 902 +int32_t RegexCompile::allocateStackData(int32_t size) { | |
| 903 + if (U_FAILURE(*fStatus)) { | |
| 904 + return 0; | |
| 905 + } | |
| 906 + if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) { | |
| 907 + error(U_REGEX_INTERNAL_ERROR); | |
| 908 + return 0; | |
| 909 + } | |
| 910 + int32_t dataIndex = fRXPat->fFrameSize; | |
| 911 + fRXPat->fFrameSize += size; | |
| 912 + if (fRXPat->fFrameSize >= 0x00fffff0) { | |
| 913 + error(U_REGEX_PATTERN_TOO_BIG); | |
| 914 + } | |
| 915 + return dataIndex; | |
| 916 +} | |
| 917 + | |
| 918 | |
| 919 //-----------------------------------------------------------------------------
- | |
| 920 // | |
| 921 @@ -1988,7 +2019,7 @@ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { | |
| 922 theLoc--; | |
| 923 } | |
| 924 if (reserveLoc) { | |
| 925 - int32_t nop = URX_BUILD(URX_NOP, 0); | |
| 926 + int32_t nop = buildOp(URX_NOP, 0); | |
| 927 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); | |
| 928 } | |
| 929 } | |
| 930 @@ -2063,8 +2094,7 @@ void RegexCompile::handleCloseParen() { | |
| 931 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); | |
| 932 | |
| 933 int32_t frameVarLocation = URX_VAL(captureOp); | |
| 934 - int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocatio
n); | |
| 935 - fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); | |
| 936 + appendOp(URX_END_CAPTURE, frameVarLocation); | |
| 937 } | |
| 938 break; | |
| 939 case atomic: | |
| 940 @@ -2075,8 +2105,7 @@ void RegexCompile::handleCloseParen() { | |
| 941 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchO
penParen+1); | |
| 942 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); | |
| 943 int32_t stoLoc = URX_VAL(stoOp); | |
| 944 - int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); | |
| 945 - fRXPat->fCompiledPat->addElement(ldOp, *fStatus); | |
| 946 + appendOp(URX_LD_SP, stoLoc); | |
| 947 } | |
| 948 break; | |
| 949 | |
| 950 @@ -2085,8 +2114,7 @@ void RegexCompile::handleCloseParen() { | |
| 951 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen-5); | |
| 952 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | |
| 953 int32_t dataLoc = URX_VAL(startOp); | |
| 954 - int32_t op = URX_BUILD(URX_LA_END, dataLoc); | |
| 955 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 956 + appendOp(URX_LA_END, dataLoc); | |
| 957 } | |
| 958 break; | |
| 959 | |
| 960 @@ -2096,19 +2124,16 @@ void RegexCompile::handleCloseParen() { | |
| 961 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen-1); | |
| 962 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); | |
| 963 int32_t dataLoc = URX_VAL(startOp); | |
| 964 - int32_t op = URX_BUILD(URX_LA_END, dataLoc); | |
| 965 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 966 - op = URX_BUILD(URX_BACKTRACK, 0); | |
| 967 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 968 - op = URX_BUILD(URX_LA_END, dataLoc); | |
| 969 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 970 + appendOp(URX_LA_END, dataLoc); | |
| 971 + appendOp(URX_BACKTRACK, 0); | |
| 972 + appendOp(URX_LA_END, dataLoc); | |
| 973 | |
| 974 // Patch the URX_SAVE near the top of the block. | |
| 975 // The destination of the SAVE is the final LA_END that was just ad
ded. | |
| 976 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen); | |
| 977 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); | |
| 978 int32_t dest = fRXPat->fCompiledPat->size()-1; | |
| 979 - saveOp = URX_BUILD(URX_STATE_SAVE, dest); | |
| 980 + saveOp = buildOp(URX_STATE_SAVE, dest); | |
| 981 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); | |
| 982 } | |
| 983 break; | |
| 984 @@ -2121,10 +2146,8 @@ void RegexCompile::handleCloseParen() { | |
| 985 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen-4); | |
| 986 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | |
| 987 int32_t dataLoc = URX_VAL(startOp); | |
| 988 - int32_t op = URX_BUILD(URX_LB_END, dataLoc); | |
| 989 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 990 - op = URX_BUILD(URX_LA_END, dataLoc); | |
| 991 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 992 + appendOp(URX_LB_END, dataLoc); | |
| 993 + appendOp(URX_LA_END, dataLoc); | |
| 994 | |
| 995 // Determine the min and max bounds for the length of the | |
| 996 // string that the pattern can match. | |
| 997 @@ -2132,6 +2155,10 @@ void RegexCompile::handleCloseParen() { | |
| 998 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | |
| 999 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | |
| 1000 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | |
| 1001 + if (URX_TYPE(maxML) != 0) { | |
| 1002 + error(U_REGEX_LOOK_BEHIND_LIMIT); | |
| 1003 + break; | |
| 1004 + } | |
| 1005 if (maxML == INT32_MAX) { | |
| 1006 error(U_REGEX_LOOK_BEHIND_LIMIT); | |
| 1007 break; | |
| 1008 @@ -2156,8 +2183,7 @@ void RegexCompile::handleCloseParen() { | |
| 1009 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatch
OpenParen-5); | |
| 1010 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); | |
| 1011 int32_t dataLoc = URX_VAL(startOp); | |
| 1012 - int32_t op = URX_BUILD(URX_LBN_END, dataLoc); | |
| 1013 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1014 + appendOp(URX_LBN_END, dataLoc); | |
| 1015 | |
| 1016 // Determine the min and max bounds for the length of the | |
| 1017 // string that the pattern can match. | |
| 1018 @@ -2165,6 +2191,10 @@ void RegexCompile::handleCloseParen() { | |
| 1019 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; | |
| 1020 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); | |
| 1021 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); | |
| 1022 + if (URX_TYPE(maxML) != 0) { | |
| 1023 + error(U_REGEX_LOOK_BEHIND_LIMIT); | |
| 1024 + break; | |
| 1025 + } | |
| 1026 if (maxML == INT32_MAX) { | |
| 1027 error(U_REGEX_LOOK_BEHIND_LIMIT); | |
| 1028 break; | |
| 1029 @@ -2178,7 +2208,7 @@ void RegexCompile::handleCloseParen() { | |
| 1030 | |
| 1031 // Insert the pattern location to continue at after a successful ma
tch | |
| 1032 // as the last operand of the URX_LBN_CONT | |
| 1033 - op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); | |
| 1034 + int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size())
; | |
| 1035 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); | |
| 1036 } | |
| 1037 break; | |
| 1038 @@ -2219,7 +2249,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet) | |
| 1039 case 0: | |
| 1040 { | |
| 1041 // Set of no elements. Always fails to match. | |
| 1042 - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fSta
tus); | |
| 1043 + appendOp(URX_BACKTRACK, 0); | |
| 1044 delete theSet; | |
| 1045 } | |
| 1046 break; | |
| 1047 @@ -2240,8 +2270,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet) | |
| 1048 // Put it into the compiled pattern as a set. | |
| 1049 int32_t setNumber = fRXPat->fSets->size(); | |
| 1050 fRXPat->fSets->addElement(theSet, *fStatus); | |
| 1051 - int32_t setOp = URX_BUILD(URX_SETREF, setNumber); | |
| 1052 - fRXPat->fCompiledPat->addElement(setOp, *fStatus); | |
| 1053 + appendOp(URX_SETREF, setNumber); | |
| 1054 } | |
| 1055 } | |
| 1056 } | |
| 1057 @@ -2280,13 +2309,10 @@ void RegexCompile::compileInterval(int32_t InitOp
, int32_t LoopOp) | |
| 1058 // counterLoc --> Loop counter | |
| 1059 // +1 --> Input index (for breaking non-progressing loops
) | |
| 1060 // (Only present if unbounded upper limit on loop) | |
| 1061 - int32_t counterLoc = fRXPat->fFrameSize; | |
| 1062 - fRXPat->fFrameSize++; | |
| 1063 - if (fIntervalUpper < 0) { | |
| 1064 - fRXPat->fFrameSize++; | |
| 1065 - } | |
| 1066 + int32_t dataSize = fIntervalUpper < 0 ? 2 : 1; | |
| 1067 + int32_t counterLoc = allocateStackData(dataSize); | |
| 1068 | |
| 1069 - int32_t op = URX_BUILD(InitOp, counterLoc); | |
| 1070 + int32_t op = buildOp(InitOp, counterLoc); | |
| 1071 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); | |
| 1072 | |
| 1073 // The second operand of CTR_INIT is the location following the end of the
loop. | |
| 1074 @@ -2294,7 +2320,7 @@ void RegexCompile::compileInterval(int32_t InitOp,
int32_t LoopOp) | |
| 1075 // compilation of something later on causes the code to grow and the targ
et | |
| 1076 // position to move. | |
| 1077 int32_t loopEnd = fRXPat->fCompiledPat->size(); | |
| 1078 - op = URX_BUILD(URX_RELOC_OPRND, loopEnd); | |
| 1079 + op = buildOp(URX_RELOC_OPRND, loopEnd); | |
| 1080 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); | |
| 1081 | |
| 1082 // Followed by the min and max counts. | |
| 1083 @@ -2303,8 +2329,7 @@ void RegexCompile::compileInterval(int32_t InitOp,
int32_t LoopOp) | |
| 1084 | |
| 1085 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. | |
| 1086 // Goes at end of the block being looped over, so just append to the code
so far. | |
| 1087 - op = URX_BUILD(LoopOp, topOfBlock); | |
| 1088 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1089 + appendOp(LoopOp, topOfBlock); | |
| 1090 | |
| 1091 if ((fIntervalLow & 0xff000000) != 0 || | |
| 1092 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { | |
| 1093 @@ -2328,7 +2353,15 @@ UBool RegexCompile::compileInlineInterval() { | |
| 1094 int32_t topOfBlock = blockTopLoc(FALSE); | |
| 1095 if (fIntervalUpper == 0) { | |
| 1096 // Pathological case. Attempt no matches, as if the block doesn't exis
t. | |
| 1097 + // Discard the generated code for the block. | |
| 1098 + // If the block included parens, discard the info pertaining to them as
well. | |
| 1099 fRXPat->fCompiledPat->setSize(topOfBlock); | |
| 1100 + if (fMatchOpenParen >= topOfBlock) { | |
| 1101 + fMatchOpenParen = -1; | |
| 1102 + } | |
| 1103 + if (fMatchCloseParen >= topOfBlock) { | |
| 1104 + fMatchCloseParen = -1; | |
| 1105 + } | |
| 1106 return TRUE; | |
| 1107 } | |
| 1108 | |
| 1109 @@ -2349,7 +2382,7 @@ UBool RegexCompile::compileInlineInterval() { | |
| 1110 // | |
| 1111 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 | |
| 1112 + fIntervalUpper + (fIntervalUpper-fIntervalLow
); | |
| 1113 - int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); | |
| 1114 + int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc); | |
| 1115 if (fIntervalLow == 0) { | |
| 1116 insertOp(topOfBlock); | |
| 1117 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); | |
| 1118 @@ -2362,13 +2395,10 @@ UBool RegexCompile::compileInlineInterval() { | |
| 1119 // it was put there when it was originally encountered. | |
| 1120 int32_t i; | |
| 1121 for (i=1; i<fIntervalUpper; i++ ) { | |
| 1122 - if (i == fIntervalLow) { | |
| 1123 - fRXPat->fCompiledPat->addElement(saveOp, *fStatus); | |
| 1124 - } | |
| 1125 - if (i > fIntervalLow) { | |
| 1126 - fRXPat->fCompiledPat->addElement(saveOp, *fStatus); | |
| 1127 + if (i >= fIntervalLow) { | |
| 1128 + appendOp(saveOp); | |
| 1129 } | |
| 1130 - fRXPat->fCompiledPat->addElement(op, *fStatus); | |
| 1131 + appendOp(op); | |
| 1132 } | |
| 1133 return TRUE; | |
| 1134 } | |
| 1135 @@ -3587,7 +3617,7 @@ void RegexCompile::stripNOPs() { | |
| 1136 int32_t operandAddress = URX_VAL(op); | |
| 1137 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); | |
| 1138 int32_t fixedOperandAddress = operandAddress - deltas.elementAt
i(operandAddress); | |
| 1139 - op = URX_BUILD(opType, fixedOperandAddress); | |
| 1140 + op = buildOp(opType, fixedOperandAddress); | |
| 1141 fRXPat->fCompiledPat->setElementAt(op, dst); | |
| 1142 dst++; | |
| 1143 break; | |
| 1144 @@ -3602,7 +3632,7 @@ void RegexCompile::stripNOPs() { | |
| 1145 break; | |
| 1146 } | |
| 1147 where = fRXPat->fGroupMap->elementAti(where-1); | |
| 1148 - op = URX_BUILD(opType, where); | |
| 1149 + op = buildOp(opType, where); | |
| 1150 fRXPat->fCompiledPat->setElementAt(op, dst); | |
| 1151 dst++; | |
| 1152 | |
| 1153 @@ -3954,7 +3984,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) { | |
| 1154 //-----------------------------------------------------------------------------
- | |
| 1155 // | |
| 1156 // scanNamedChar | |
| 1157 - // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. | |
| 1158 +// Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. | |
| 1159 // | |
| 1160 // The scan position will be at the 'N'. On return | |
| 1161 // the scan position should be just after the '}' | |
| 1162 diff --git a/source/i18n/regexcmp.h b/source/i18n/regexcmp.h | |
| 1163 index debdf45..c3cc7db 100644 | |
| 1164 --- a/source/i18n/regexcmp.h | |
| 1165 +++ b/source/i18n/regexcmp.h | |
| 1166 @@ -104,6 +104,13 @@ private: | |
| 1167 void fixLiterals(UBool split=FALSE); // Generate code for pendi
ng literal characters. | |
| 1168 void insertOp(int32_t where); // Open up a slot for a ne
w op in the | |
| 1169 // generated code at the
specified location. | |
| 1170 + void appendOp(int32_t op); // Append a new op to the
compiled pattern. | |
| 1171 + void appendOp(int32_t type, int32_t val); // Build & append a new op
to the compiled pattern. | |
| 1172 + int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode i
nstruction. | |
| 1173 + int32_t allocateData(int32_t size); // Allocate space in the m
atcher data area. | |
| 1174 + // Return index of the n
ewly allocated data. | |
| 1175 + int32_t allocateStackData(int32_t size); // Allocate space in the m
atch back-track stack frame. | |
| 1176 + // Return offset index i
n the frame. | |
| 1177 int32_t minMatchLength(int32_t start, | |
| 1178 int32_t end); | |
| 1179 int32_t maxMatchLength(int32_t start, | |
| 1180 @@ -187,7 +194,9 @@ private: | |
| 1181 int32_t fMatchOpenParen; // The position in the com
piled pattern | |
| 1182 // of the slot reserved
for a state save | |
| 1183 // at the start of the m
ost recently processed | |
| 1184 - // parenthesized block. | |
| 1185 + // parenthesized block.
Updated when processing | |
| 1186 + // a close to the locati
on for the corresponding open. | |
| 1187 + | |
| 1188 int32_t fMatchCloseParen; // The position in the pat
tern of the first | |
| 1189 // location after the mo
st recently processed | |
| 1190 // parenthesized block. | |
| 1191 diff --git a/source/i18n/regeximp.h b/source/i18n/regeximp.h | |
| 1192 index bdf8403..fdd9c76 100644 | |
| 1193 --- a/source/i18n/regeximp.h | |
| 1194 +++ b/source/i18n/regeximp.h | |
| 1195 @@ -1,5 +1,5 @@ | |
| 1196 // | |
| 1197 -// Copyright (C) 2002-2013 International Business Machines Corporation | |
| 1198 +// Copyright (C) 2002-2014 International Business Machines Corporation | |
| 1199 // and others. All rights reserved. | |
| 1200 // | |
| 1201 // file: regeximp.h | |
| 1202 @@ -241,7 +241,6 @@ enum { | |
| 1203 // | |
| 1204 // Convenience macros for assembling and disassembling a compiled operation. | |
| 1205 // | |
| 1206 -#define URX_BUILD(type, val) (int32_t)((type << 24) | (val)) | |
| 1207 #define URX_TYPE(x) ((uint32_t)(x) >> 24) | |
| 1208 #define URX_VAL(x) ((x) & 0xffffff) | |
| 1209 | |
| 1210 diff --git a/source/test/intltest/regextst.cpp b/source/test/intltest/regextst.c
pp | |
| 1211 index ca2fd21..f440c26 100644 | |
| 1212 --- a/source/test/intltest/regextst.cpp | |
| 1213 +++ b/source/test/intltest/regextst.cpp | |
| 1214 @@ -144,6 +144,9 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, c
onst char* &name, ch | |
| 1215 case 24: name = "TestBug11049"; | |
| 1216 if (exec) TestBug11049(); | |
| 1217 break; | |
| 1218 + case 25: name = "TestBug11371"; | |
| 1219 + if (exec) TestBug11371(); | |
| 1220 + break; | |
| 1221 default: name = ""; | |
| 1222 break; //needed to end loop | |
| 1223 } | |
| 1224 @@ -5367,6 +5370,49 @@ void RegexTest::TestCase11049(const char *pattern, const
char *data, UBool expec | |
| 1225 } | |
| 1226 | |
| 1227 | |
| 1228 +void RegexTest::TestBug11371() { | |
| 1229 + if (quick) { | |
| 1230 + logln("Skipping test. Runs in exhuastive mode only."); | |
| 1231 + return; | |
| 1232 + } | |
| 1233 + UErrorCode status = U_ZERO_ERROR; | |
| 1234 + UnicodeString patternString; | |
| 1235 + | |
| 1236 + for (int i=0; i<8000000; i++) { | |
| 1237 + patternString.append(UnicodeString("()")); | |
| 1238 + } | |
| 1239 + LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString,
0, status)); | |
| 1240 + if (status != U_REGEX_PATTERN_TOO_BIG) { | |
| 1241 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s
.", | |
| 1242 + __FILE__, __LINE__, u_errorName(status)); | |
| 1243 + } | |
| 1244 + | |
| 1245 + status = U_ZERO_ERROR; | |
| 1246 + patternString = "("; | |
| 1247 + for (int i=0; i<20000000; i++) { | |
| 1248 + patternString.append(UnicodeString("A++")); | |
| 1249 + } | |
| 1250 + patternString.append(UnicodeString("){0}B++")); | |
| 1251 + LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString
, 0, status)); | |
| 1252 + if (status != U_REGEX_PATTERN_TOO_BIG) { | |
| 1253 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s
.", | |
| 1254 + __FILE__, __LINE__, u_errorName(status)); | |
| 1255 + } | |
| 1256 + | |
| 1257 + // Pattern with too much string data, such that string indexes overflow ope
rand data field size | |
| 1258 + // in compiled instruction. | |
| 1259 + status = U_ZERO_ERROR; | |
| 1260 + patternString = ""; | |
| 1261 + while (patternString.length() < 0x00ffffff) { | |
| 1262 + patternString.append(UnicodeString("stuff and things dont you know, the
se are a few of my favorite strings\n")); | |
| 1263 + } | |
| 1264 + patternString.append(UnicodeString("X? trailing string")); | |
| 1265 + LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString
, 0, status)); | |
| 1266 + if (status != U_REGEX_PATTERN_TOO_BIG) { | |
| 1267 + errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s
.", | |
| 1268 + __FILE__, __LINE__, u_errorName(status)); | |
| 1269 + } | |
| 1270 +} | |
| 1271 | |
| 1272 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ | |
| 1273 | |
| 1274 diff --git a/source/test/intltest/regextst.h b/source/test/intltest/regextst.h | |
| 1275 index 28e2121..38cc4ef 100644 | |
| 1276 --- a/source/test/intltest/regextst.h | |
| 1277 +++ b/source/test/intltest/regextst.h | |
| 1278 @@ -50,6 +50,7 @@ public: | |
| 1279 virtual void Bug10459(); | |
| 1280 virtual void TestCaseInsensitiveStarters(); | |
| 1281 virtual void TestBug11049(); | |
| 1282 + virtual void TestBug11371(); | |
| 1283 | |
| 1284 // The following functions are internal to the regexp tests. | |
| 1285 virtual void assertUText(const char *expected, UText *actual, const char *f
ile, int line); | |
| 1286 diff --git a/source/test/testdata/regextst.txt b/source/test/testdata/regextst.t
xt | |
| 1287 index 4d2e7f6..d642e8b 100644 | |
| 1288 --- a/source/test/testdata/regextst.txt | |
| 1289 +++ b/source/test/testdata/regextst.txt | |
| 1290 @@ -1201,6 +1201,24 @@ | |
| 1291 "A|B|\U00012345" "hello <0>\U00012345</0>" | |
| 1292 "A|B|\U00010000" "hello \ud800" | |
| 1293 | |
| 1294 +# Bug 11369 | |
| 1295 +# Incorrect optimization of patterns with a zero length quantifier {0} | |
| 1296 + | |
| 1297 +"(.|b)(|b){0}\$(?#xxx){3}(?>\D*)" "AAAAABBBBBCCCCCDDDDEEEEE" | |
| 1298 +"(|b)ab(c)" "<0><1></1>ab<2>c</2></0>" | |
| 1299 +"(|b){0}a{3}(D*)" "<0>aaa<2></2></0>" | |
| 1300 +"(|b){0,1}a{3}(D*)" "<0><1></1>aaa<2></2></0>" | |
| 1301 +"((|b){0})a{3}(D*)" "<0><1></1>aaa<3></3></0>" | |
| 1302 + | |
| 1303 +# Bug 11370 | |
| 1304 +# Max match length computation of look-behind expression gives result that is
too big to fit in the | |
| 1305 +# 24 bit operand portion of the compiled code. Expressions should fail
to compile | |
| 1306 +# (Look-behind match length must be bounded. This case is treated as unbounde
d, an error.) | |
| 1307 + | |
| 1308 +"(?<!(0123456789a){10000000})x" E "no match" | |
| 1309 +"(?<!\\ubeaf(\\ubeaf{11000}){11000})" E "no match" | |
| 1310 + | |
| 1311 + | |
| 1312 # Random debugging, Temporary | |
| 1313 # | |
| 1314 | |
| OLD | NEW |