OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2004-2010, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * file name: regex.cpp |
| 7 */ |
| 8 |
| 9 #include "unicode/utypes.h" |
| 10 |
| 11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 12 |
| 13 #include "unicode/regex.h" |
| 14 #include "unicode/uregex.h" |
| 15 #include "unicode/unistr.h" |
| 16 #include "unicode/ustring.h" |
| 17 #include "unicode/uchar.h" |
| 18 #include "unicode/uobject.h" |
| 19 #include "umutex.h" |
| 20 #include "uassert.h" |
| 21 #include "cmemory.h" |
| 22 |
| 23 #include "regextxt.h" |
| 24 |
| 25 #include <stdio.h> |
| 26 |
| 27 U_NAMESPACE_BEGIN |
| 28 |
// Evaluates to (len - idx) when that difference is positive, otherwise to 0.
// Each argument is expanded more than once, so pass side-effect-free expressions.
#define REMAINING_CAPACITY(idx,len) ((((len) - (idx)) > 0) ? ((len) - (idx)) : 0)
| 30 |
| 31 struct RegularExpression: public UMemory { |
| 32 public: |
| 33 RegularExpression(); |
| 34 ~RegularExpression(); |
| 35 int32_t fMagic; |
| 36 RegexPattern *fPat; |
| 37 int32_t *fPatRefCount; |
| 38 UChar *fPatString; |
| 39 int32_t fPatStringLen; |
| 40 RegexMatcher *fMatcher; |
| 41 const UChar *fText; // Text from setText() |
| 42 int32_t fTextLength; // Length provided by user with setText(),
which |
| 43 // may be -1. |
| 44 UBool fOwnsText; |
| 45 }; |
| 46 |
| 47 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII |
| 48 |
| 49 RegularExpression::RegularExpression() { |
| 50 fMagic = REXP_MAGIC; |
| 51 fPat = NULL; |
| 52 fPatRefCount = NULL; |
| 53 fPatString = NULL; |
| 54 fPatStringLen = 0; |
| 55 fMatcher = NULL; |
| 56 fText = NULL; |
| 57 fTextLength = 0; |
| 58 fOwnsText = FALSE; |
| 59 } |
| 60 |
| 61 RegularExpression::~RegularExpression() { |
| 62 delete fMatcher; |
| 63 fMatcher = NULL; |
| 64 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { |
| 65 delete fPat; |
| 66 uprv_free(fPatString); |
| 67 uprv_free(fPatRefCount); |
| 68 } |
| 69 if (fOwnsText && fText!=NULL) { |
| 70 uprv_free((void *)fText); |
| 71 } |
| 72 fMagic = 0; |
| 73 } |
| 74 |
| 75 U_NAMESPACE_END |
| 76 |
| 77 U_NAMESPACE_USE |
| 78 |
| 79 //------------------------------------------------------------------------------
---------- |
| 80 // |
| 81 // validateRE Do boilerplate style checks on API function parameters. |
| 82 // Return TRUE if they look OK. |
| 83 //------------------------------------------------------------------------------
---------- |
| 84 static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool r
equiresText = TRUE) { |
| 85 if (U_FAILURE(*status)) { |
| 86 return FALSE; |
| 87 } |
| 88 if (re == NULL || re->fMagic != REXP_MAGIC) { |
| 89 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 90 return FALSE; |
| 91 } |
| 92 // !!! Not sure how to update this with the new UText backing, which is stor
ed in re->fMatcher anyway |
| 93 if (requiresText && re->fText == NULL && !re->fOwnsText) { |
| 94 *status = U_REGEX_INVALID_STATE; |
| 95 return FALSE; |
| 96 } |
| 97 return TRUE; |
| 98 } |
| 99 |
| 100 //------------------------------------------------------------------------------
---------- |
| 101 // |
| 102 // uregex_open |
| 103 // |
| 104 //------------------------------------------------------------------------------
---------- |
| 105 U_CAPI URegularExpression * U_EXPORT2 |
| 106 uregex_open( const UChar *pattern, |
| 107 int32_t patternLength, |
| 108 uint32_t flags, |
| 109 UParseError *pe, |
| 110 UErrorCode *status) { |
| 111 |
| 112 if (U_FAILURE(*status)) { |
| 113 return NULL; |
| 114 } |
| 115 if (pattern == NULL || patternLength < -1 || patternLength == 0) { |
| 116 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 117 return NULL; |
| 118 } |
| 119 int32_t actualPatLen = patternLength; |
| 120 if (actualPatLen == -1) { |
| 121 actualPatLen = u_strlen(pattern); |
| 122 } |
| 123 |
| 124 RegularExpression *re = new RegularExpression; |
| 125 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); |
| 126 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLe
n+1)); |
| 127 if (re == NULL || refC == NULL || patBuf == NULL) { |
| 128 *status = U_MEMORY_ALLOCATION_ERROR; |
| 129 delete re; |
| 130 uprv_free(refC); |
| 131 uprv_free(patBuf); |
| 132 return NULL; |
| 133 } |
| 134 re->fPatRefCount = refC; |
| 135 *re->fPatRefCount = 1; |
| 136 |
| 137 // |
| 138 // Make a copy of the pattern string, so we can return it later if asked. |
| 139 // For compiling the pattern, we will use a UText wrapper around |
| 140 // this local copy, to avoid making even more copies. |
| 141 // |
| 142 re->fPatString = patBuf; |
| 143 re->fPatStringLen = patternLength; |
| 144 u_memcpy(patBuf, pattern, actualPatLen); |
| 145 patBuf[actualPatLen] = 0; |
| 146 |
| 147 UText patText = UTEXT_INITIALIZER; |
| 148 utext_openUChars(&patText, patBuf, patternLength, status); |
| 149 |
| 150 // |
| 151 // Compile the pattern |
| 152 // |
| 153 if (pe != NULL) { |
| 154 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); |
| 155 } else { |
| 156 re->fPat = RegexPattern::compile(&patText, flags, *status); |
| 157 } |
| 158 utext_close(&patText); |
| 159 |
| 160 if (U_FAILURE(*status)) { |
| 161 goto ErrorExit; |
| 162 } |
| 163 |
| 164 // |
| 165 // Create the matcher object |
| 166 // |
| 167 re->fMatcher = re->fPat->matcher(*status); |
| 168 if (U_SUCCESS(*status)) { |
| 169 return (URegularExpression*)re; |
| 170 } |
| 171 |
| 172 ErrorExit: |
| 173 delete re; |
| 174 return NULL; |
| 175 |
| 176 } |
| 177 |
| 178 //------------------------------------------------------------------------------
---------- |
| 179 // |
| 180 // uregex_openUText |
| 181 // |
| 182 //------------------------------------------------------------------------------
---------- |
| 183 U_CAPI URegularExpression * U_EXPORT2 |
| 184 uregex_openUText(UText *pattern, |
| 185 uint32_t flags, |
| 186 UParseError *pe, |
| 187 UErrorCode *status) { |
| 188 |
| 189 if (U_FAILURE(*status)) { |
| 190 return NULL; |
| 191 } |
| 192 if (pattern == NULL) { |
| 193 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 194 return NULL; |
| 195 } |
| 196 |
| 197 int64_t patternNativeLength = utext_nativeLength(pattern); |
| 198 |
| 199 if (patternNativeLength == 0) { |
| 200 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 201 return NULL; |
| 202 } |
| 203 |
| 204 RegularExpression *re = new RegularExpression; |
| 205 |
| 206 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 207 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NUL
L, 0, &lengthStatus); |
| 208 |
| 209 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); |
| 210 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Le
ngth+1)); |
| 211 if (re == NULL || refC == NULL || patBuf == NULL) { |
| 212 *status = U_MEMORY_ALLOCATION_ERROR; |
| 213 delete re; |
| 214 uprv_free(refC); |
| 215 uprv_free(patBuf); |
| 216 return NULL; |
| 217 } |
| 218 re->fPatRefCount = refC; |
| 219 *re->fPatRefCount = 1; |
| 220 |
| 221 // |
| 222 // Make a copy of the pattern string, so we can return it later if asked. |
| 223 // For compiling the pattern, we will use a read-only UText wrapper |
| 224 // around this local copy, to avoid making even more copies. |
| 225 // |
| 226 re->fPatString = patBuf; |
| 227 re->fPatStringLen = pattern16Length; |
| 228 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, st
atus); |
| 229 |
| 230 UText patText = UTEXT_INITIALIZER; |
| 231 utext_openUChars(&patText, patBuf, pattern16Length, status); |
| 232 |
| 233 // |
| 234 // Compile the pattern |
| 235 // |
| 236 if (pe != NULL) { |
| 237 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status); |
| 238 } else { |
| 239 re->fPat = RegexPattern::compile(&patText, flags, *status); |
| 240 } |
| 241 utext_close(&patText); |
| 242 |
| 243 if (U_FAILURE(*status)) { |
| 244 goto ErrorExit; |
| 245 } |
| 246 |
| 247 // |
| 248 // Create the matcher object |
| 249 // |
| 250 re->fMatcher = re->fPat->matcher(*status); |
| 251 if (U_SUCCESS(*status)) { |
| 252 return (URegularExpression*)re; |
| 253 } |
| 254 |
| 255 ErrorExit: |
| 256 delete re; |
| 257 return NULL; |
| 258 |
| 259 } |
| 260 |
| 261 //------------------------------------------------------------------------------
---------- |
| 262 // |
| 263 // uregex_close |
| 264 // |
| 265 //------------------------------------------------------------------------------
---------- |
| 266 U_CAPI void U_EXPORT2 |
| 267 uregex_close(URegularExpression *re2) { |
| 268 RegularExpression *re = (RegularExpression*)re2; |
| 269 UErrorCode status = U_ZERO_ERROR; |
| 270 if (validateRE(re, &status, FALSE) == FALSE) { |
| 271 return; |
| 272 } |
| 273 delete re; |
| 274 } |
| 275 |
| 276 |
| 277 //------------------------------------------------------------------------------
---------- |
| 278 // |
| 279 // uregex_clone |
| 280 // |
| 281 //------------------------------------------------------------------------------
---------- |
| 282 U_CAPI URegularExpression * U_EXPORT2 |
| 283 uregex_clone(const URegularExpression *source2, UErrorCode *status) { |
| 284 RegularExpression *source = (RegularExpression*)source2; |
| 285 if (validateRE(source, status, FALSE) == FALSE) { |
| 286 return NULL; |
| 287 } |
| 288 |
| 289 RegularExpression *clone = new RegularExpression; |
| 290 if (clone == NULL) { |
| 291 *status = U_MEMORY_ALLOCATION_ERROR; |
| 292 return NULL; |
| 293 } |
| 294 |
| 295 clone->fMatcher = source->fPat->matcher(*status); |
| 296 if (U_FAILURE(*status)) { |
| 297 delete clone; |
| 298 return NULL; |
| 299 } |
| 300 |
| 301 clone->fPat = source->fPat; |
| 302 clone->fPatRefCount = source->fPatRefCount; |
| 303 clone->fPatString = source->fPatString; |
| 304 clone->fPatStringLen = source->fPatStringLen; |
| 305 umtx_atomic_inc(source->fPatRefCount); |
| 306 // Note: fText is not cloned. |
| 307 |
| 308 return (URegularExpression*)clone; |
| 309 } |
| 310 |
| 311 |
| 312 |
| 313 |
| 314 //------------------------------------------------------------------------------ |
| 315 // |
| 316 // uregex_pattern |
| 317 // |
| 318 //------------------------------------------------------------------------------ |
| 319 U_CAPI const UChar * U_EXPORT2 |
| 320 uregex_pattern(const URegularExpression *regexp2, |
| 321 int32_t *patLength, |
| 322 UErrorCode *status) { |
| 323 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 324 |
| 325 if (validateRE(regexp, status, FALSE) == FALSE) { |
| 326 return NULL; |
| 327 } |
| 328 if (patLength != NULL) { |
| 329 *patLength = regexp->fPatStringLen; |
| 330 } |
| 331 return regexp->fPatString; |
| 332 } |
| 333 |
| 334 |
| 335 //------------------------------------------------------------------------------ |
| 336 // |
| 337 // uregex_patternUText |
| 338 // |
| 339 //------------------------------------------------------------------------------ |
| 340 U_CAPI UText * U_EXPORT2 |
| 341 uregex_patternUText(const URegularExpression *regexp2, |
| 342 UErrorCode *status) { |
| 343 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 344 return regexp->fPat->patternText(*status); |
| 345 } |
| 346 |
| 347 |
| 348 //------------------------------------------------------------------------------ |
| 349 // |
| 350 // uregex_flags |
| 351 // |
| 352 //------------------------------------------------------------------------------ |
| 353 U_CAPI int32_t U_EXPORT2 |
| 354 uregex_flags(const URegularExpression *regexp2, UErrorCode *status) { |
| 355 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 356 if (validateRE(regexp, status, FALSE) == FALSE) { |
| 357 return 0; |
| 358 } |
| 359 int32_t flags = regexp->fPat->flags(); |
| 360 return flags; |
| 361 } |
| 362 |
| 363 |
| 364 //------------------------------------------------------------------------------ |
| 365 // |
| 366 // uregex_setText |
| 367 // |
| 368 //------------------------------------------------------------------------------ |
| 369 U_CAPI void U_EXPORT2 |
| 370 uregex_setText(URegularExpression *regexp2, |
| 371 const UChar *text, |
| 372 int32_t textLength, |
| 373 UErrorCode *status) { |
| 374 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 375 if (validateRE(regexp, status, FALSE) == FALSE) { |
| 376 return; |
| 377 } |
| 378 if (text == NULL || textLength < -1) { |
| 379 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 380 return; |
| 381 } |
| 382 |
| 383 if (regexp->fOwnsText && regexp->fText != NULL) { |
| 384 uprv_free((void *)regexp->fText); |
| 385 } |
| 386 |
| 387 regexp->fText = text; |
| 388 regexp->fTextLength = textLength; |
| 389 regexp->fOwnsText = FALSE; |
| 390 |
| 391 UText input = UTEXT_INITIALIZER; |
| 392 utext_openUChars(&input, text, textLength, status); |
| 393 regexp->fMatcher->reset(&input); |
| 394 utext_close(&input); // reset() made a shallow clone, so we don't need this
copy |
| 395 } |
| 396 |
| 397 |
| 398 //------------------------------------------------------------------------------ |
| 399 // |
| 400 // uregex_setUText |
| 401 // |
| 402 //------------------------------------------------------------------------------ |
| 403 U_CAPI void U_EXPORT2 |
| 404 uregex_setUText(URegularExpression *regexp2, |
| 405 UText *text, |
| 406 UErrorCode *status) { |
| 407 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 408 if (validateRE(regexp, status, FALSE) == FALSE) { |
| 409 return; |
| 410 } |
| 411 if (text == NULL) { |
| 412 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 413 return; |
| 414 } |
| 415 |
| 416 if (regexp->fOwnsText && regexp->fText != NULL) { |
| 417 uprv_free((void *)regexp->fText); |
| 418 } |
| 419 |
| 420 regexp->fText = NULL; // only fill it in on request |
| 421 regexp->fTextLength = -1; |
| 422 regexp->fOwnsText = TRUE; |
| 423 regexp->fMatcher->reset(text); |
| 424 } |
| 425 |
| 426 |
| 427 |
| 428 //------------------------------------------------------------------------------ |
| 429 // |
| 430 // uregex_getText |
| 431 // |
| 432 //------------------------------------------------------------------------------ |
| 433 U_CAPI const UChar * U_EXPORT2 |
| 434 uregex_getText(URegularExpression *regexp2, |
| 435 int32_t *textLength, |
| 436 UErrorCode *status) { |
| 437 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 438 if (validateRE(regexp, status, FALSE) == FALSE) { |
| 439 return NULL; |
| 440 } |
| 441 |
| 442 if (regexp->fText == NULL) { |
| 443 // need to fill in the text |
| 444 UText *inputText = regexp->fMatcher->inputText(); |
| 445 int64_t inputNativeLength = utext_nativeLength(inputText); |
| 446 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) { |
| 447 regexp->fText = inputText->chunkContents; |
| 448 regexp->fTextLength = (int32_t)inputNativeLength; |
| 449 regexp->fOwnsText = FALSE; // because the UText owns it |
| 450 } else { |
| 451 UErrorCode lengthStatus = U_ZERO_ERROR; |
| 452 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength,
NULL, 0, &lengthStatus); // buffer overflow error |
| 453 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTex
tLength+1)); |
| 454 |
| 455 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->f
TextLength+1, status); |
| 456 regexp->fText = inputChars; |
| 457 regexp->fOwnsText = TRUE; // should already be set but just in case |
| 458 } |
| 459 } |
| 460 |
| 461 if (textLength != NULL) { |
| 462 *textLength = regexp->fTextLength; |
| 463 } |
| 464 return regexp->fText; |
| 465 } |
| 466 |
| 467 |
| 468 //------------------------------------------------------------------------------ |
| 469 // |
| 470 // uregex_getUText |
| 471 // |
| 472 //------------------------------------------------------------------------------ |
| 473 U_CAPI UText * U_EXPORT2 |
| 474 uregex_getUText(URegularExpression *regexp2, |
| 475 UText *dest, |
| 476 UErrorCode *status) { |
| 477 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 478 if (validateRE(regexp, status, FALSE) == FALSE) { |
| 479 return dest; |
| 480 } |
| 481 return regexp->fMatcher->getInput(dest, *status); |
| 482 } |
| 483 |
| 484 |
| 485 //------------------------------------------------------------------------------ |
| 486 // |
| 487 // uregex_matches |
| 488 // |
| 489 //------------------------------------------------------------------------------ |
| 490 U_CAPI UBool U_EXPORT2 |
| 491 uregex_matches(URegularExpression *regexp2, |
| 492 int32_t startIndex, |
| 493 UErrorCode *status) { |
| 494 return uregex_matches64( regexp2, (int64_t)startIndex, status); |
| 495 } |
| 496 |
| 497 U_CAPI UBool U_EXPORT2 |
| 498 uregex_matches64(URegularExpression *regexp2, |
| 499 int64_t startIndex, |
| 500 UErrorCode *status) { |
| 501 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 502 UBool result = FALSE; |
| 503 if (validateRE(regexp, status) == FALSE) { |
| 504 return result; |
| 505 } |
| 506 if (startIndex == -1) { |
| 507 result = regexp->fMatcher->matches(*status); |
| 508 } else { |
| 509 result = regexp->fMatcher->matches(startIndex, *status); |
| 510 } |
| 511 return result; |
| 512 } |
| 513 |
| 514 |
| 515 //------------------------------------------------------------------------------ |
| 516 // |
| 517 // uregex_lookingAt |
| 518 // |
| 519 //------------------------------------------------------------------------------ |
| 520 U_CAPI UBool U_EXPORT2 |
| 521 uregex_lookingAt(URegularExpression *regexp2, |
| 522 int32_t startIndex, |
| 523 UErrorCode *status) { |
| 524 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status); |
| 525 } |
| 526 |
| 527 U_CAPI UBool U_EXPORT2 |
| 528 uregex_lookingAt64(URegularExpression *regexp2, |
| 529 int64_t startIndex, |
| 530 UErrorCode *status) { |
| 531 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 532 UBool result = FALSE; |
| 533 if (validateRE(regexp, status) == FALSE) { |
| 534 return result; |
| 535 } |
| 536 if (startIndex == -1) { |
| 537 result = regexp->fMatcher->lookingAt(*status); |
| 538 } else { |
| 539 result = regexp->fMatcher->lookingAt(startIndex, *status); |
| 540 } |
| 541 return result; |
| 542 } |
| 543 |
| 544 |
| 545 |
| 546 //------------------------------------------------------------------------------ |
| 547 // |
| 548 // uregex_find |
| 549 // |
| 550 //------------------------------------------------------------------------------ |
| 551 U_CAPI UBool U_EXPORT2 |
| 552 uregex_find(URegularExpression *regexp2, |
| 553 int32_t startIndex, |
| 554 UErrorCode *status) { |
| 555 return uregex_find64( regexp2, (int64_t)startIndex, status); |
| 556 } |
| 557 |
| 558 U_CAPI UBool U_EXPORT2 |
| 559 uregex_find64(URegularExpression *regexp2, |
| 560 int64_t startIndex, |
| 561 UErrorCode *status) { |
| 562 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 563 UBool result = FALSE; |
| 564 if (validateRE(regexp, status) == FALSE) { |
| 565 return result; |
| 566 } |
| 567 if (startIndex == -1) { |
| 568 regexp->fMatcher->resetPreserveRegion(); |
| 569 result = regexp->fMatcher->find(); |
| 570 } else { |
| 571 result = regexp->fMatcher->find(startIndex, *status); |
| 572 } |
| 573 return result; |
| 574 } |
| 575 |
| 576 |
| 577 //------------------------------------------------------------------------------ |
| 578 // |
| 579 // uregex_findNext |
| 580 // |
| 581 //------------------------------------------------------------------------------ |
| 582 U_CAPI UBool U_EXPORT2 |
| 583 uregex_findNext(URegularExpression *regexp2, |
| 584 UErrorCode *status) { |
| 585 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 586 if (validateRE(regexp, status) == FALSE) { |
| 587 return FALSE; |
| 588 } |
| 589 UBool result = regexp->fMatcher->find(); |
| 590 return result; |
| 591 } |
| 592 |
| 593 //------------------------------------------------------------------------------ |
| 594 // |
| 595 // uregex_groupCount |
| 596 // |
| 597 //------------------------------------------------------------------------------ |
| 598 U_CAPI int32_t U_EXPORT2 |
| 599 uregex_groupCount(URegularExpression *regexp2, |
| 600 UErrorCode *status) { |
| 601 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 602 if (validateRE(regexp, status, FALSE) == FALSE) { |
| 603 return 0; |
| 604 } |
| 605 int32_t result = regexp->fMatcher->groupCount(); |
| 606 return result; |
| 607 } |
| 608 |
| 609 |
| 610 //------------------------------------------------------------------------------ |
| 611 // |
| 612 // uregex_group |
| 613 // |
| 614 //------------------------------------------------------------------------------ |
| 615 U_CAPI int32_t U_EXPORT2 |
| 616 uregex_group(URegularExpression *regexp2, |
| 617 int32_t groupNum, |
| 618 UChar *dest, |
| 619 int32_t destCapacity, |
| 620 UErrorCode *status) { |
| 621 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 622 if (validateRE(regexp, status) == FALSE) { |
| 623 return 0; |
| 624 } |
| 625 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { |
| 626 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 627 return 0; |
| 628 } |
| 629 |
| 630 if (destCapacity == 0 || regexp->fText != NULL) { |
| 631 // If preflighting or if we already have the text as UChars, |
| 632 // this is a little cheaper than going through uregex_groupUTextDeep() |
| 633 |
| 634 // |
| 635 // Pick up the range of characters from the matcher |
| 636 // |
| 637 int32_t startIx = regexp->fMatcher->start(groupNum, *status); |
| 638 int32_t endIx = regexp->fMatcher->end (groupNum, *status); |
| 639 if (U_FAILURE(*status)) { |
| 640 return 0; |
| 641 } |
| 642 |
| 643 // |
| 644 // Trim length based on buffer capacity |
| 645 // |
| 646 int32_t fullLength = endIx - startIx; |
| 647 int32_t copyLength = fullLength; |
| 648 if (copyLength < destCapacity) { |
| 649 dest[copyLength] = 0; |
| 650 } else if (copyLength == destCapacity) { |
| 651 *status = U_STRING_NOT_TERMINATED_WARNING; |
| 652 } else { |
| 653 copyLength = destCapacity; |
| 654 *status = U_BUFFER_OVERFLOW_ERROR; |
| 655 } |
| 656 |
| 657 // |
| 658 // Copy capture group to user's buffer |
| 659 // |
| 660 if (copyLength > 0) { |
| 661 u_memcpy(dest, ®exp->fText[startIx], copyLength); |
| 662 } |
| 663 return fullLength; |
| 664 } else { |
| 665 UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status
); |
| 666 int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupTex
t), dest, destCapacity, status); |
| 667 utext_close(groupText); |
| 668 return result; |
| 669 } |
| 670 } |
| 671 |
| 672 |
| 673 //------------------------------------------------------------------------------ |
| 674 // |
| 675 // uregex_groupUText |
| 676 // |
| 677 //------------------------------------------------------------------------------ |
| 678 U_CAPI UText * U_EXPORT2 |
| 679 uregex_groupUText(URegularExpression *regexp2, |
| 680 int32_t groupNum, |
| 681 UText *dest, |
| 682 int64_t *groupLength, |
| 683 UErrorCode *status) { |
| 684 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 685 if (validateRE(regexp, status) == FALSE) { |
| 686 UErrorCode emptyTextStatus = U_ZERO_ERROR; |
| 687 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus))
; |
| 688 } |
| 689 |
| 690 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status); |
| 691 } |
| 692 |
| 693 //------------------------------------------------------------------------------ |
| 694 // |
| 695 // uregex_groupUTextDeep |
| 696 // |
| 697 //------------------------------------------------------------------------------ |
| 698 U_CAPI UText * U_EXPORT2 |
| 699 uregex_groupUTextDeep(URegularExpression *regexp2, |
| 700 int32_t groupNum, |
| 701 UText *dest, |
| 702 UErrorCode *status) { |
| 703 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 704 if (validateRE(regexp, status) == FALSE) { |
| 705 UErrorCode emptyTextStatus = U_ZERO_ERROR; |
| 706 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus))
; |
| 707 } |
| 708 |
| 709 if (regexp->fText != NULL) { |
| 710 // |
| 711 // Pick up the range of characters from the matcher |
| 712 // and use our already-extracted characters |
| 713 // |
| 714 int32_t startIx = regexp->fMatcher->start(groupNum, *status); |
| 715 int32_t endIx = regexp->fMatcher->end (groupNum, *status); |
| 716 if (U_FAILURE(*status)) { |
| 717 UErrorCode emptyTextStatus = U_ZERO_ERROR; |
| 718 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStat
us)); |
| 719 } |
| 720 |
| 721 if (dest) { |
| 722 utext_replace(dest, 0, utext_nativeLength(dest), ®exp->fText[star
tIx], endIx - startIx, status); |
| 723 } else { |
| 724 UText groupText = UTEXT_INITIALIZER; |
| 725 utext_openUChars(&groupText, ®exp->fText[startIx], endIx - startI
x, status); |
| 726 dest = utext_clone(NULL, &groupText, TRUE, FALSE, status); |
| 727 utext_close(&groupText); |
| 728 } |
| 729 |
| 730 return dest; |
| 731 } else { |
| 732 return regexp->fMatcher->group(groupNum, dest, *status); |
| 733 } |
| 734 } |
| 735 |
| 736 //------------------------------------------------------------------------------ |
| 737 // |
| 738 // uregex_start |
| 739 // |
| 740 //------------------------------------------------------------------------------ |
| 741 U_CAPI int32_t U_EXPORT2 |
| 742 uregex_start(URegularExpression *regexp2, |
| 743 int32_t groupNum, |
| 744 UErrorCode *status) { |
| 745 return (int32_t)uregex_start64( regexp2, groupNum, status); |
| 746 } |
| 747 |
| 748 U_CAPI int64_t U_EXPORT2 |
| 749 uregex_start64(URegularExpression *regexp2, |
| 750 int32_t groupNum, |
| 751 UErrorCode *status) { |
| 752 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 753 if (validateRE(regexp, status) == FALSE) { |
| 754 return 0; |
| 755 } |
| 756 int32_t result = regexp->fMatcher->start(groupNum, *status); |
| 757 return result; |
| 758 } |
| 759 |
| 760 //------------------------------------------------------------------------------ |
| 761 // |
| 762 // uregex_end |
| 763 // |
| 764 //------------------------------------------------------------------------------ |
| 765 U_CAPI int32_t U_EXPORT2 |
| 766 uregex_end(URegularExpression *regexp2, |
| 767 int32_t groupNum, |
| 768 UErrorCode *status) { |
| 769 return (int32_t)uregex_end64( regexp2, groupNum, status); |
| 770 } |
| 771 |
| 772 U_CAPI int64_t U_EXPORT2 |
| 773 uregex_end64(URegularExpression *regexp2, |
| 774 int32_t groupNum, |
| 775 UErrorCode *status) { |
| 776 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 777 if (validateRE(regexp, status) == FALSE) { |
| 778 return 0; |
| 779 } |
| 780 int32_t result = regexp->fMatcher->end(groupNum, *status); |
| 781 return result; |
| 782 } |
| 783 |
| 784 //------------------------------------------------------------------------------ |
| 785 // |
| 786 // uregex_reset |
| 787 // |
| 788 //------------------------------------------------------------------------------ |
| 789 U_CAPI void U_EXPORT2 |
| 790 uregex_reset(URegularExpression *regexp2, |
| 791 int32_t index, |
| 792 UErrorCode *status) { |
| 793 uregex_reset64( regexp2, (int64_t)index, status); |
| 794 } |
| 795 |
| 796 U_CAPI void U_EXPORT2 |
| 797 uregex_reset64(URegularExpression *regexp2, |
| 798 int64_t index, |
| 799 UErrorCode *status) { |
| 800 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 801 if (validateRE(regexp, status) == FALSE) { |
| 802 return; |
| 803 } |
| 804 regexp->fMatcher->reset(index, *status); |
| 805 } |
| 806 |
| 807 |
| 808 //------------------------------------------------------------------------------ |
| 809 // |
| 810 // uregex_setRegion |
| 811 // |
| 812 //------------------------------------------------------------------------------ |
| 813 U_CAPI void U_EXPORT2 |
| 814 uregex_setRegion(URegularExpression *regexp2, |
| 815 int32_t regionStart, |
| 816 int32_t regionLimit, |
| 817 UErrorCode *status) { |
| 818 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, sta
tus); |
| 819 } |
| 820 |
| 821 U_CAPI void U_EXPORT2 |
| 822 uregex_setRegion64(URegularExpression *regexp2, |
| 823 int64_t regionStart, |
| 824 int64_t regionLimit, |
| 825 UErrorCode *status) { |
| 826 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 827 if (validateRE(regexp, status) == FALSE) { |
| 828 return; |
| 829 } |
| 830 regexp->fMatcher->region(regionStart, regionLimit, *status); |
| 831 } |
| 832 |
| 833 |
| 834 //------------------------------------------------------------------------------ |
| 835 // |
| 836 // uregex_setRegionAndStart |
| 837 // |
| 838 //------------------------------------------------------------------------------ |
| 839 U_DRAFT void U_EXPORT2 |
| 840 uregex_setRegionAndStart(URegularExpression *regexp2, |
| 841 int64_t regionStart, |
| 842 int64_t regionLimit, |
| 843 int64_t startIndex, |
| 844 UErrorCode *status) { |
| 845 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 846 if (validateRE(regexp, status) == FALSE) { |
| 847 return; |
| 848 } |
| 849 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status); |
| 850 } |
| 851 |
| 852 //------------------------------------------------------------------------------ |
| 853 // |
| 854 // uregex_regionStart |
| 855 // |
| 856 //------------------------------------------------------------------------------ |
| 857 U_CAPI int32_t U_EXPORT2 |
| 858 uregex_regionStart(const URegularExpression *regexp2, |
| 859 UErrorCode *status) { |
| 860 return (int32_t)uregex_regionStart64(regexp2, status); |
| 861 } |
| 862 |
| 863 U_CAPI int64_t U_EXPORT2 |
| 864 uregex_regionStart64(const URegularExpression *regexp2, |
| 865 UErrorCode *status) { |
| 866 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 867 if (validateRE(regexp, status) == FALSE) { |
| 868 return 0; |
| 869 } |
| 870 return regexp->fMatcher->regionStart(); |
| 871 } |
| 872 |
| 873 |
| 874 //------------------------------------------------------------------------------ |
| 875 // |
| 876 // uregex_regionEnd |
| 877 // |
| 878 //------------------------------------------------------------------------------ |
| 879 U_CAPI int32_t U_EXPORT2 |
| 880 uregex_regionEnd(const URegularExpression *regexp2, |
| 881 UErrorCode *status) { |
| 882 return (int32_t)uregex_regionEnd64(regexp2, status); |
| 883 } |
| 884 |
| 885 U_CAPI int64_t U_EXPORT2 |
| 886 uregex_regionEnd64(const URegularExpression *regexp2, |
| 887 UErrorCode *status) { |
| 888 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 889 if (validateRE(regexp, status) == FALSE) { |
| 890 return 0; |
| 891 } |
| 892 return regexp->fMatcher->regionEnd(); |
| 893 } |
| 894 |
| 895 |
| 896 //------------------------------------------------------------------------------ |
| 897 // |
| 898 // uregex_hasTransparentBounds |
| 899 // |
| 900 //------------------------------------------------------------------------------ |
| 901 U_CAPI UBool U_EXPORT2 |
| 902 uregex_hasTransparentBounds(const URegularExpression *regexp2, |
| 903 UErrorCode *status) { |
| 904 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 905 if (validateRE(regexp, status) == FALSE) { |
| 906 return FALSE; |
| 907 } |
| 908 return regexp->fMatcher->hasTransparentBounds(); |
| 909 } |
| 910 |
| 911 |
| 912 //------------------------------------------------------------------------------ |
| 913 // |
| 914 // uregex_useTransparentBounds |
| 915 // |
| 916 //------------------------------------------------------------------------------ |
| 917 U_CAPI void U_EXPORT2 |
| 918 uregex_useTransparentBounds(URegularExpression *regexp2, |
| 919 UBool b, |
| 920 UErrorCode *status) { |
| 921 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 922 if (validateRE(regexp, status) == FALSE) { |
| 923 return; |
| 924 } |
| 925 regexp->fMatcher->useTransparentBounds(b); |
| 926 } |
| 927 |
| 928 |
| 929 //------------------------------------------------------------------------------ |
| 930 // |
| 931 // uregex_hasAnchoringBounds |
| 932 // |
| 933 //------------------------------------------------------------------------------ |
| 934 U_CAPI UBool U_EXPORT2 |
| 935 uregex_hasAnchoringBounds(const URegularExpression *regexp2, |
| 936 UErrorCode *status) { |
| 937 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 938 if (validateRE(regexp, status) == FALSE) { |
| 939 return FALSE; |
| 940 } |
| 941 return regexp->fMatcher->hasAnchoringBounds(); |
| 942 } |
| 943 |
| 944 |
| 945 //------------------------------------------------------------------------------ |
| 946 // |
| 947 // uregex_useAnchoringBounds |
| 948 // |
| 949 //------------------------------------------------------------------------------ |
| 950 U_CAPI void U_EXPORT2 |
| 951 uregex_useAnchoringBounds(URegularExpression *regexp2, |
| 952 UBool b, |
| 953 UErrorCode *status) { |
| 954 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 955 if (validateRE(regexp, status) == FALSE) { |
| 956 return; |
| 957 } |
| 958 regexp->fMatcher->useAnchoringBounds(b); |
| 959 } |
| 960 |
| 961 |
| 962 //------------------------------------------------------------------------------ |
| 963 // |
| 964 // uregex_hitEnd |
| 965 // |
| 966 //------------------------------------------------------------------------------ |
| 967 U_CAPI UBool U_EXPORT2 |
| 968 uregex_hitEnd(const URegularExpression *regexp2, |
| 969 UErrorCode *status) { |
| 970 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 971 if (validateRE(regexp, status) == FALSE) { |
| 972 return FALSE; |
| 973 } |
| 974 return regexp->fMatcher->hitEnd(); |
| 975 } |
| 976 |
| 977 |
| 978 //------------------------------------------------------------------------------ |
| 979 // |
| 980 // uregex_requireEnd |
| 981 // |
| 982 //------------------------------------------------------------------------------ |
| 983 U_CAPI UBool U_EXPORT2 |
| 984 uregex_requireEnd(const URegularExpression *regexp2, |
| 985 UErrorCode *status) { |
| 986 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 987 if (validateRE(regexp, status) == FALSE) { |
| 988 return FALSE; |
| 989 } |
| 990 return regexp->fMatcher->requireEnd(); |
| 991 } |
| 992 |
| 993 |
| 994 //------------------------------------------------------------------------------ |
| 995 // |
| 996 // uregex_setTimeLimit |
| 997 // |
| 998 //------------------------------------------------------------------------------ |
| 999 U_CAPI void U_EXPORT2 |
| 1000 uregex_setTimeLimit(URegularExpression *regexp2, |
| 1001 int32_t limit, |
| 1002 UErrorCode *status) { |
| 1003 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1004 if (validateRE(regexp, status)) { |
| 1005 regexp->fMatcher->setTimeLimit(limit, *status); |
| 1006 } |
| 1007 } |
| 1008 |
| 1009 |
| 1010 |
| 1011 //------------------------------------------------------------------------------ |
| 1012 // |
| 1013 // uregex_getTimeLimit |
| 1014 // |
| 1015 //------------------------------------------------------------------------------ |
| 1016 U_CAPI int32_t U_EXPORT2 |
| 1017 uregex_getTimeLimit(const URegularExpression *regexp2, |
| 1018 UErrorCode *status) { |
| 1019 int32_t retVal = 0; |
| 1020 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1021 if (validateRE(regexp, status)) { |
| 1022 retVal = regexp->fMatcher->getTimeLimit(); |
| 1023 } |
| 1024 return retVal; |
| 1025 } |
| 1026 |
| 1027 |
| 1028 |
| 1029 //------------------------------------------------------------------------------ |
| 1030 // |
| 1031 // uregex_setStackLimit |
| 1032 // |
| 1033 //------------------------------------------------------------------------------ |
| 1034 U_CAPI void U_EXPORT2 |
| 1035 uregex_setStackLimit(URegularExpression *regexp2, |
| 1036 int32_t limit, |
| 1037 UErrorCode *status) { |
| 1038 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1039 if (validateRE(regexp, status)) { |
| 1040 regexp->fMatcher->setStackLimit(limit, *status); |
| 1041 } |
| 1042 } |
| 1043 |
| 1044 |
| 1045 |
| 1046 //------------------------------------------------------------------------------ |
| 1047 // |
| 1048 // uregex_getStackLimit |
| 1049 // |
| 1050 //------------------------------------------------------------------------------ |
| 1051 U_CAPI int32_t U_EXPORT2 |
| 1052 uregex_getStackLimit(const URegularExpression *regexp2, |
| 1053 UErrorCode *status) { |
| 1054 int32_t retVal = 0; |
| 1055 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1056 if (validateRE(regexp, status)) { |
| 1057 retVal = regexp->fMatcher->getStackLimit(); |
| 1058 } |
| 1059 return retVal; |
| 1060 } |
| 1061 |
| 1062 |
| 1063 //------------------------------------------------------------------------------ |
| 1064 // |
| 1065 // uregex_setMatchCallback |
| 1066 // |
| 1067 //------------------------------------------------------------------------------ |
| 1068 U_CAPI void U_EXPORT2 |
| 1069 uregex_setMatchCallback(URegularExpression *regexp2, |
| 1070 URegexMatchCallback *callback, |
| 1071 const void *context, |
| 1072 UErrorCode *status) { |
| 1073 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1074 if (validateRE(regexp, status)) { |
| 1075 regexp->fMatcher->setMatchCallback(callback, context, *status); |
| 1076 } |
| 1077 } |
| 1078 |
| 1079 |
| 1080 //------------------------------------------------------------------------------ |
| 1081 // |
| 1082 // uregex_getMatchCallback |
| 1083 // |
| 1084 //------------------------------------------------------------------------------ |
| 1085 U_CAPI void U_EXPORT2 |
| 1086 uregex_getMatchCallback(const URegularExpression *regexp2, |
| 1087 URegexMatchCallback **callback, |
| 1088 const void **context, |
| 1089 UErrorCode *status) { |
| 1090 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1091 if (validateRE(regexp, status)) { |
| 1092 regexp->fMatcher->getMatchCallback(*callback, *context, *status); |
| 1093 } |
| 1094 } |
| 1095 |
| 1096 |
| 1097 //------------------------------------------------------------------------------ |
| 1098 // |
| 1099 // uregex_setFindProgressCallback |
| 1100 // |
| 1101 //------------------------------------------------------------------------------ |
| 1102 U_CAPI void U_EXPORT2 |
| 1103 uregex_setFindProgressCallback(URegularExpression *regexp2, |
| 1104 URegexFindProgressCallback *callback, |
| 1105 const void *context, |
| 1106 UErrorCode *status) { |
| 1107 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1108 if (validateRE(regexp, status)) { |
| 1109 regexp->fMatcher->setFindProgressCallback(callback, context, *status); |
| 1110 } |
| 1111 } |
| 1112 |
| 1113 |
| 1114 //------------------------------------------------------------------------------ |
| 1115 // |
| 1116 // uregex_getFindProgressCallback |
| 1117 // |
| 1118 //------------------------------------------------------------------------------ |
| 1119 U_CAPI void U_EXPORT2 |
| 1120 uregex_getFindProgressCallback(const URegularExpression *regexp2, |
| 1121 URegexFindProgressCallback **callback, |
| 1122 const void **context, |
| 1123 UErrorCode *status) { |
| 1124 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1125 if (validateRE(regexp, status)) { |
| 1126 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status)
; |
| 1127 } |
| 1128 } |
| 1129 |
| 1130 |
| 1131 //------------------------------------------------------------------------------ |
| 1132 // |
| 1133 // uregex_replaceAll |
| 1134 // |
| 1135 //------------------------------------------------------------------------------ |
| 1136 U_CAPI int32_t U_EXPORT2 |
| 1137 uregex_replaceAll(URegularExpression *regexp2, |
| 1138 const UChar *replacementText, |
| 1139 int32_t replacementLength, |
| 1140 UChar *destBuf, |
| 1141 int32_t destCapacity, |
| 1142 UErrorCode *status) { |
| 1143 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1144 if (validateRE(regexp, status) == FALSE) { |
| 1145 return 0; |
| 1146 } |
| 1147 if (replacementText == NULL || replacementLength < -1 || |
| 1148 (destBuf == NULL && destCapacity > 0) || |
| 1149 destCapacity < 0) { |
| 1150 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1151 return 0; |
| 1152 } |
| 1153 |
| 1154 int32_t len = 0; |
| 1155 |
| 1156 uregex_reset(regexp2, 0, status); |
| 1157 |
| 1158 // Note: Seperate error code variables for findNext() and appendReplacement(
) |
| 1159 // are used so that destination buffer overflow errors |
| 1160 // in appendReplacement won't stop findNext() from working. |
| 1161 // appendReplacement() and appendTail() special case incoming buffer |
| 1162 // overflow errors, continuing to return the correct length. |
| 1163 UErrorCode findStatus = *status; |
| 1164 while (uregex_findNext(regexp2, &findStatus)) { |
| 1165 len += uregex_appendReplacement(regexp2, replacementText, replacementLen
gth, |
| 1166 &destBuf, &destCapacity, status); |
| 1167 } |
| 1168 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); |
| 1169 |
| 1170 if (U_FAILURE(findStatus)) { |
| 1171 // If anything went wrong with the findNext(), make that error trump |
| 1172 // whatever may have happened with the append() operations. |
| 1173 // Errors in findNext() are not expected. |
| 1174 *status = findStatus; |
| 1175 } |
| 1176 |
| 1177 return len; |
| 1178 } |
| 1179 |
| 1180 |
| 1181 //------------------------------------------------------------------------------ |
| 1182 // |
| 1183 // uregex_replaceAllUText |
| 1184 // |
| 1185 //------------------------------------------------------------------------------ |
| 1186 U_CAPI UText * U_EXPORT2 |
| 1187 uregex_replaceAllUText(URegularExpression *regexp2, |
| 1188 UText *replacementText, |
| 1189 UText *dest, |
| 1190 UErrorCode *status) { |
| 1191 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1192 if (validateRE(regexp, status) == FALSE) { |
| 1193 return 0; |
| 1194 } |
| 1195 if (replacementText == NULL) { |
| 1196 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1197 return 0; |
| 1198 } |
| 1199 |
| 1200 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status); |
| 1201 return dest; |
| 1202 } |
| 1203 |
| 1204 |
| 1205 //------------------------------------------------------------------------------ |
| 1206 // |
| 1207 // uregex_replaceFirst |
| 1208 // |
| 1209 //------------------------------------------------------------------------------ |
| 1210 U_CAPI int32_t U_EXPORT2 |
| 1211 uregex_replaceFirst(URegularExpression *regexp2, |
| 1212 const UChar *replacementText, |
| 1213 int32_t replacementLength, |
| 1214 UChar *destBuf, |
| 1215 int32_t destCapacity, |
| 1216 UErrorCode *status) { |
| 1217 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1218 if (validateRE(regexp, status) == FALSE) { |
| 1219 return 0; |
| 1220 } |
| 1221 if (replacementText == NULL || replacementLength < -1 || |
| 1222 (destBuf == NULL && destCapacity > 0) || |
| 1223 destCapacity < 0) { |
| 1224 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1225 return 0; |
| 1226 } |
| 1227 |
| 1228 int32_t len = 0; |
| 1229 UBool findSucceeded; |
| 1230 uregex_reset(regexp2, 0, status); |
| 1231 findSucceeded = uregex_find(regexp2, 0, status); |
| 1232 if (findSucceeded) { |
| 1233 len = uregex_appendReplacement(regexp2, replacementText, replacementLeng
th, |
| 1234 &destBuf, &destCapacity, status); |
| 1235 } |
| 1236 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status); |
| 1237 |
| 1238 return len; |
| 1239 } |
| 1240 |
| 1241 |
| 1242 //------------------------------------------------------------------------------ |
| 1243 // |
| 1244 // uregex_replaceFirstUText |
| 1245 // |
| 1246 //------------------------------------------------------------------------------ |
| 1247 U_CAPI UText * U_EXPORT2 |
| 1248 uregex_replaceFirstUText(URegularExpression *regexp2, |
| 1249 UText *replacementText, |
| 1250 UText *dest, |
| 1251 UErrorCode *status) { |
| 1252 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1253 if (validateRE(regexp, status) == FALSE) { |
| 1254 return 0; |
| 1255 } |
| 1256 if (replacementText == NULL) { |
| 1257 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1258 return 0; |
| 1259 } |
| 1260 |
| 1261 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status); |
| 1262 return dest; |
| 1263 } |
| 1264 |
| 1265 |
| 1266 //------------------------------------------------------------------------------ |
| 1267 // |
| 1268 // uregex_appendReplacement |
| 1269 // |
| 1270 //------------------------------------------------------------------------------ |
| 1271 |
| 1272 U_NAMESPACE_BEGIN |
| 1273 // |
| 1274 // Dummy class, because these functions need to be friends of class RegexMatche
r, |
| 1275 // and stand-alone C functions don't work as friends |
| 1276 // |
| 1277 class RegexCImpl { |
| 1278 public: |
| 1279 inline static int32_t appendReplacement(RegularExpression *regexp, |
| 1280 const UChar *replacementText, |
| 1281 int32_t replacementLength, |
| 1282 UChar **destBuf, |
| 1283 int32_t *destCapacity, |
| 1284 UErrorCode *status); |
| 1285 |
| 1286 inline static int32_t appendTail(RegularExpression *regexp, |
| 1287 UChar **destBuf, |
| 1288 int32_t *destCapacity, |
| 1289 UErrorCode *status); |
| 1290 |
| 1291 inline static int32_t split(RegularExpression *regexp, |
| 1292 UChar *destBuf, |
| 1293 int32_t destCapacity, |
| 1294 int32_t *requiredCapacity, |
| 1295 UChar *destFields[], |
| 1296 int32_t destFieldsCapacity, |
| 1297 UErrorCode *status); |
| 1298 }; |
| 1299 |
| 1300 U_NAMESPACE_END |
| 1301 |
| 1302 |
| 1303 |
| 1304 static const UChar BACKSLASH = 0x5c; |
| 1305 static const UChar DOLLARSIGN = 0x24; |
| 1306 |
| 1307 // |
| 1308 // Move a character to an output buffer, with bounds checking on the index. |
| 1309 // Index advances even if capacity is exceeded, for preflight size computat
ions. |
| 1310 // This little sequence is used a LOT. |
| 1311 // |
| 1312 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCap
acity) { |
| 1313 if (*idx < bufCapacity) { |
| 1314 buf[*idx] = c; |
| 1315 } |
| 1316 (*idx)++; |
| 1317 } |
| 1318 |
| 1319 |
| 1320 // |
| 1321 // appendReplacement, the actual implementation. |
| 1322 // |
| 1323 int32_t RegexCImpl::appendReplacement(RegularExpression *regexp, |
| 1324 const UChar *replacementText, |
| 1325 int32_t replacementLength, |
| 1326 UChar **destBuf, |
| 1327 int32_t *destCapacity, |
| 1328 UErrorCode *status) { |
| 1329 |
| 1330 // If we come in with a buffer overflow error, don't suppress the operation. |
| 1331 // A series of appendReplacements, appendTail need to correctly preflight |
| 1332 // the buffer size when an overflow happens somewhere in the middle. |
| 1333 UBool pendingBufferOverflow = FALSE; |
| 1334 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapac
ity == 0) { |
| 1335 pendingBufferOverflow = TRUE; |
| 1336 *status = U_ZERO_ERROR; |
| 1337 } |
| 1338 |
| 1339 // |
| 1340 // Validate all paramters |
| 1341 // |
| 1342 if (validateRE(regexp, status) == FALSE) { |
| 1343 return 0; |
| 1344 } |
| 1345 if (replacementText == NULL || replacementLength < -1 || |
| 1346 destCapacity == NULL || destBuf == NULL || |
| 1347 (*destBuf == NULL && *destCapacity > 0) || |
| 1348 *destCapacity < 0) { |
| 1349 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1350 return 0; |
| 1351 } |
| 1352 |
| 1353 RegexMatcher *m = regexp->fMatcher; |
| 1354 if (m->fMatch == FALSE) { |
| 1355 *status = U_REGEX_INVALID_STATE; |
| 1356 return 0; |
| 1357 } |
| 1358 |
| 1359 UChar *dest = *destBuf; |
| 1360 int32_t capacity = *destCapacity; |
| 1361 int32_t destIdx = 0; |
| 1362 int32_t i; |
| 1363 |
| 1364 // If it wasn't supplied by the caller, get the length of the replacement t
ext. |
| 1365 // TODO: slightly smarter logic in the copy loop could watch for the NUL
on |
| 1366 // the fly and avoid this step. |
| 1367 if (replacementLength == -1) { |
| 1368 replacementLength = u_strlen(replacementText); |
| 1369 } |
| 1370 |
| 1371 // Copy input string from the end of previous match to start of current matc
h |
| 1372 if (regexp->fText != NULL) { |
| 1373 int32_t matchStart; |
| 1374 int32_t lastMatchEnd; |
| 1375 if (UTEXT_USES_U16(m->fInputText)) { |
| 1376 lastMatchEnd = (int32_t)m->fLastMatchEnd; |
| 1377 matchStart = (int32_t)m->fMatchStart; |
| 1378 } else { |
| 1379 // !!!: Would like a better way to do this! |
| 1380 UErrorCode status = U_ZERO_ERROR; |
| 1381 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NUL
L, 0, &status); |
| 1382 status = U_ZERO_ERROR; |
| 1383 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMat
chEnd, m->fMatchStart, NULL, 0, &status); |
| 1384 } |
| 1385 for (i=lastMatchEnd; i<matchStart; i++) { |
| 1386 appendToBuf(regexp->fText[i], &destIdx, dest, capacity); |
| 1387 } |
| 1388 } else { |
| 1389 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore |
| 1390 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart
, |
| 1391 &dest[destIdx], REMAINING_CAPACITY(destIdx, cap
acity), &possibleOverflowError); |
| 1392 } |
| 1393 |
| 1394 |
| 1395 // scan the replacement text, looking for substitutions ($n) and \escapes. |
| 1396 int32_t replIdx = 0; |
| 1397 while (replIdx < replacementLength) { |
| 1398 UChar c = replacementText[replIdx]; |
| 1399 replIdx++; |
| 1400 if (c != DOLLARSIGN && c != BACKSLASH) { |
| 1401 // Common case, no substitution, no escaping, |
| 1402 // just copy the char to the dest buf. |
| 1403 appendToBuf(c, &destIdx, dest, capacity); |
| 1404 continue; |
| 1405 } |
| 1406 |
| 1407 if (c == BACKSLASH) { |
| 1408 // Backslash Escape. Copy the following char out without further ch
ecks. |
| 1409 // Note: Surrogate pairs don't need any special
handling |
| 1410 // The second half wont be a '$' or a '\',
and |
| 1411 // will move to the dest normally on the n
ext |
| 1412 // loop iteration. |
| 1413 if (replIdx >= replacementLength) { |
| 1414 break; |
| 1415 } |
| 1416 c = replacementText[replIdx]; |
| 1417 |
| 1418 if (c==0x55/*U*/ || c==0x75/*u*/) { |
| 1419 // We have a \udddd or \Udddddddd escape sequence. |
| 1420 UChar32 escapedChar = |
| 1421 u_unescapeAt(uregex_ucstr_unescape_charAt, |
| 1422 &replIdx, // Index is updated by unesca
peAt |
| 1423 replacementLength, // Length of replacement text |
| 1424 (void *)replacementText); |
| 1425 |
| 1426 if (escapedChar != (UChar32)0xFFFFFFFF) { |
| 1427 if (escapedChar <= 0xffff) { |
| 1428 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity
); |
| 1429 } else { |
| 1430 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capac
ity); |
| 1431 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capa
city); |
| 1432 } |
| 1433 continue; |
| 1434 } |
| 1435 // Note: if the \u escape was invalid, just fall through and |
| 1436 // treat it as a plain \<anything> escape. |
| 1437 } |
| 1438 |
| 1439 // Plain backslash escape. Just put out the escaped character. |
| 1440 appendToBuf(c, &destIdx, dest, capacity); |
| 1441 |
| 1442 replIdx++; |
| 1443 continue; |
| 1444 } |
| 1445 |
| 1446 |
| 1447 |
| 1448 // We've got a $. Pick up a capture group number if one follows. |
| 1449 // Consume at most the number of digits necessary for the largest captur
e |
| 1450 // number that is valid for this pattern. |
| 1451 |
| 1452 int32_t numDigits = 0; |
| 1453 int32_t groupNum = 0; |
| 1454 UChar32 digitC; |
| 1455 for (;;) { |
| 1456 if (replIdx >= replacementLength) { |
| 1457 break; |
| 1458 } |
| 1459 U16_GET(replacementText, 0, replIdx, replacementLength, digitC); |
| 1460 if (u_isdigit(digitC) == FALSE) { |
| 1461 break; |
| 1462 } |
| 1463 |
| 1464 U16_FWD_1(replacementText, replIdx, replacementLength); |
| 1465 groupNum=groupNum*10 + u_charDigitValue(digitC); |
| 1466 numDigits++; |
| 1467 if (numDigits >= m->fPattern->fMaxCaptureDigits) { |
| 1468 break; |
| 1469 } |
| 1470 } |
| 1471 |
| 1472 |
| 1473 if (numDigits == 0) { |
| 1474 // The $ didn't introduce a group number at all. |
| 1475 // Treat it as just part of the substitution text. |
| 1476 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); |
| 1477 continue; |
| 1478 } |
| 1479 |
| 1480 // Finally, append the capture group data to the destination. |
| 1481 destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[des
tIdx], REMAINING_CAPACITY(destIdx, capacity), status); |
| 1482 if (*status == U_BUFFER_OVERFLOW_ERROR) { |
| 1483 // Ignore buffer overflow when extracting the group. We need to |
| 1484 // continue on to get full size of the untruncated result. We wil
l |
| 1485 // raise our own buffer overflow error at the end. |
| 1486 *status = U_ZERO_ERROR; |
| 1487 } |
| 1488 |
| 1489 if (U_FAILURE(*status)) { |
| 1490 // Can fail if group number is out of range. |
| 1491 break; |
| 1492 } |
| 1493 |
| 1494 } |
| 1495 |
| 1496 // |
| 1497 // Nul Terminate the dest buffer if possible. |
| 1498 // Set the appropriate buffer overflow or not terminated error, if needed. |
| 1499 // |
| 1500 if (destIdx < capacity) { |
| 1501 dest[destIdx] = 0; |
| 1502 } else if (destIdx == *destCapacity) { |
| 1503 *status = U_STRING_NOT_TERMINATED_WARNING; |
| 1504 } else { |
| 1505 *status = U_BUFFER_OVERFLOW_ERROR; |
| 1506 } |
| 1507 |
| 1508 // |
| 1509 // Return an updated dest buffer and capacity to the caller. |
| 1510 // |
| 1511 if (destIdx > 0 && *destCapacity > 0) { |
| 1512 if (destIdx < capacity) { |
| 1513 *destBuf += destIdx; |
| 1514 *destCapacity -= destIdx; |
| 1515 } else { |
| 1516 *destBuf += capacity; |
| 1517 *destCapacity = 0; |
| 1518 } |
| 1519 } |
| 1520 |
| 1521 // If we came in with a buffer overflow, make sure we go out with one also. |
| 1522 // (A zero length match right at the end of the previous match could |
| 1523 // make this function succeed even though a previous call had overflowed
the buf) |
| 1524 if (pendingBufferOverflow && U_SUCCESS(*status)) { |
| 1525 *status = U_BUFFER_OVERFLOW_ERROR; |
| 1526 } |
| 1527 |
| 1528 return destIdx; |
| 1529 } |
| 1530 |
| 1531 // |
| 1532 // appendReplacement the actual API function, |
| 1533 // |
| 1534 U_CAPI int32_t U_EXPORT2 |
| 1535 uregex_appendReplacement(URegularExpression *regexp2, |
| 1536 const UChar *replacementText, |
| 1537 int32_t replacementLength, |
| 1538 UChar **destBuf, |
| 1539 int32_t *destCapacity, |
| 1540 UErrorCode *status) { |
| 1541 |
| 1542 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1543 return RegexCImpl::appendReplacement( |
| 1544 regexp, replacementText, replacementLength,destBuf, destCapacity, status
); |
| 1545 } |
| 1546 |
| 1547 // |
| 1548 // uregex_appendReplacementUText...can just use the normal C++ method |
| 1549 // |
| 1550 U_CAPI void U_EXPORT2 |
| 1551 uregex_appendReplacementUText(URegularExpression *regexp2, |
| 1552 UText *replText, |
| 1553 UText *dest, |
| 1554 UErrorCode *status) { |
| 1555 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1556 regexp->fMatcher->appendReplacement(dest, replText, *status); |
| 1557 } |
| 1558 |
| 1559 |
| 1560 //------------------------------------------------------------------------------ |
| 1561 // |
| 1562 // uregex_appendTail |
| 1563 // |
| 1564 //------------------------------------------------------------------------------ |
| 1565 int32_t RegexCImpl::appendTail(RegularExpression *regexp, |
| 1566 UChar **destBuf, |
| 1567 int32_t *destCapacity, |
| 1568 UErrorCode *status) |
| 1569 { |
| 1570 |
| 1571 // If we come in with a buffer overflow error, don't suppress the operation. |
| 1572 // A series of appendReplacements, appendTail need to correctly preflight |
| 1573 // the buffer size when an overflow happens somewhere in the middle. |
| 1574 UBool pendingBufferOverflow = FALSE; |
| 1575 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapac
ity == 0) { |
| 1576 pendingBufferOverflow = TRUE; |
| 1577 *status = U_ZERO_ERROR; |
| 1578 } |
| 1579 |
| 1580 if (validateRE(regexp, status) == FALSE) { |
| 1581 return 0; |
| 1582 } |
| 1583 |
| 1584 if (destCapacity == NULL || destBuf == NULL || |
| 1585 (*destBuf == NULL && *destCapacity > 0) || |
| 1586 *destCapacity < 0) |
| 1587 { |
| 1588 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1589 return 0; |
| 1590 } |
| 1591 |
| 1592 RegexMatcher *m = regexp->fMatcher; |
| 1593 |
| 1594 int32_t destIdx = 0; |
| 1595 int32_t destCap = *destCapacity; |
| 1596 UChar *dest = *destBuf; |
| 1597 |
| 1598 if (regexp->fText != NULL) { |
| 1599 int32_t srcIdx; |
| 1600 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd); |
| 1601 if (nativeIdx == -1) { |
| 1602 srcIdx = 0; |
| 1603 } else if (UTEXT_USES_U16(m->fInputText)) { |
| 1604 srcIdx = (int32_t)nativeIdx; |
| 1605 } else { |
| 1606 UErrorCode status = U_ZERO_ERROR; |
| 1607 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status
); |
| 1608 } |
| 1609 |
| 1610 for (;;) { |
| 1611 if (srcIdx == regexp->fTextLength) { |
| 1612 break; |
| 1613 } |
| 1614 UChar c = regexp->fText[srcIdx]; |
| 1615 if (c == 0 && regexp->fTextLength == -1) { |
| 1616 regexp->fTextLength = srcIdx; |
| 1617 break; |
| 1618 } |
| 1619 if (destIdx < destCap) { |
| 1620 dest[destIdx] = c; |
| 1621 } else { |
| 1622 // We've overflowed the dest buffer. |
| 1623 // If the total input string length is known, we can |
| 1624 // compute the total buffer size needed without scanning thro
ugh the string. |
| 1625 if (regexp->fTextLength > 0) { |
| 1626 destIdx += (regexp->fTextLength - srcIdx); |
| 1627 break; |
| 1628 } |
| 1629 } |
| 1630 srcIdx++; |
| 1631 destIdx++; |
| 1632 } |
| 1633 } else { |
| 1634 int64_t srcIdx; |
| 1635 if (m->fMatch) { |
| 1636 // The most recent call to find() succeeded. |
| 1637 srcIdx = m->fMatchEnd; |
| 1638 } else { |
| 1639 // The last call to find() on this matcher failed(). |
| 1640 // Look back to the end of the last find() that succeeded for src
index. |
| 1641 srcIdx = m->fLastMatchEnd; |
| 1642 if (srcIdx == -1) { |
| 1643 // There has been no successful match with this matcher. |
| 1644 // We want to copy the whole string. |
| 1645 srcIdx = 0; |
| 1646 } |
| 1647 } |
| 1648 |
| 1649 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, de
stCap, status); |
| 1650 } |
| 1651 |
| 1652 // |
| 1653 // NUL terminate the output string, if possible, otherwise issue the |
| 1654 // appropriate error or warning. |
| 1655 // |
| 1656 if (destIdx < destCap) { |
| 1657 dest[destIdx] = 0; |
| 1658 } else if (destIdx == destCap) { |
| 1659 *status = U_STRING_NOT_TERMINATED_WARNING; |
| 1660 } else { |
| 1661 *status = U_BUFFER_OVERFLOW_ERROR; |
| 1662 } |
| 1663 |
| 1664 // |
| 1665 // Update the user's buffer ptr and capacity vars to reflect the |
| 1666 // amount used. |
| 1667 // |
| 1668 if (destIdx < destCap) { |
| 1669 *destBuf += destIdx; |
| 1670 *destCapacity -= destIdx; |
| 1671 } else { |
| 1672 *destBuf += destCap; |
| 1673 *destCapacity = 0; |
| 1674 } |
| 1675 |
| 1676 if (pendingBufferOverflow && U_SUCCESS(*status)) { |
| 1677 *status = U_BUFFER_OVERFLOW_ERROR; |
| 1678 } |
| 1679 |
| 1680 return destIdx; |
| 1681 } |
| 1682 |
| 1683 |
| 1684 // |
| 1685 // appendTail the actual API function |
| 1686 // |
| 1687 U_CAPI int32_t U_EXPORT2 |
| 1688 uregex_appendTail(URegularExpression *regexp2, |
| 1689 UChar **destBuf, |
| 1690 int32_t *destCapacity, |
| 1691 UErrorCode *status) { |
| 1692 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1693 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); |
| 1694 } |
| 1695 |
| 1696 |
| 1697 // |
| 1698 // uregex_appendTailUText...can just use the normal C++ method |
| 1699 // |
| 1700 U_CAPI UText * U_EXPORT2 |
| 1701 uregex_appendTailUText(URegularExpression *regexp2, |
| 1702 UText *dest, |
| 1703 UErrorCode *status) { |
| 1704 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1705 return regexp->fMatcher->appendTail(dest, *status); |
| 1706 } |
| 1707 |
| 1708 |
| 1709 //------------------------------------------------------------------------------ |
| 1710 // |
| 1711 // copyString Internal utility to copy a string to an output buffer, |
| 1712 // while managing buffer overflow and preflight size |
| 1713 // computation. NUL termination is added to destination, |
| 1714 // and the NUL is counted in the output size. |
| 1715 // |
| 1716 //------------------------------------------------------------------------------ |
| 1717 #if 0 |
| 1718 static void copyString(UChar *destBuffer, // Destination buffer. |
| 1719 int32_t destCapacity, // Total capacity of dest b
uffer |
| 1720 int32_t *destIndex, // Index into dest buffer.
Updated on return. |
| 1721 // Update not clipped to
destCapacity. |
| 1722 const UChar *srcPtr, // Pointer to source string |
| 1723 int32_t srcLen) // Source string len. |
| 1724 { |
| 1725 int32_t si; |
| 1726 int32_t di = *destIndex; |
| 1727 UChar c; |
| 1728 |
| 1729 for (si=0; si<srcLen; si++) { |
| 1730 c = srcPtr[si]; |
| 1731 if (di < destCapacity) { |
| 1732 destBuffer[di] = c; |
| 1733 di++; |
| 1734 } else { |
| 1735 di += srcLen - si; |
| 1736 break; |
| 1737 } |
| 1738 } |
| 1739 if (di<destCapacity) { |
| 1740 destBuffer[di] = 0; |
| 1741 } |
| 1742 di++; |
| 1743 *destIndex = di; |
| 1744 } |
| 1745 #endif |
| 1746 |
| 1747 //------------------------------------------------------------------------------ |
| 1748 // |
| 1749 // uregex_split |
| 1750 // |
| 1751 //------------------------------------------------------------------------------ |
| 1752 int32_t RegexCImpl::split(RegularExpression *regexp, |
| 1753 UChar *destBuf, |
| 1754 int32_t destCapacity, |
| 1755 int32_t *requiredCapacity, |
| 1756 UChar *destFields[], |
| 1757 int32_t destFieldsCapacity, |
| 1758 UErrorCode *status) { |
| 1759 // |
| 1760 // Reset for the input text |
| 1761 // |
| 1762 regexp->fMatcher->reset(); |
| 1763 UText *inputText = regexp->fMatcher->fInputText; |
| 1764 int64_t nextOutputStringStart = 0; |
| 1765 int64_t inputLen = regexp->fMatcher->fInputLength; |
| 1766 if (inputLen == 0) { |
| 1767 return 0; |
| 1768 } |
| 1769 |
| 1770 // |
| 1771 // Loop through the input text, searching for the delimiter pattern |
| 1772 // |
| 1773 int32_t i; // Index of the field being processed. |
| 1774 int32_t destIdx = 0; // Next available position in destBuf; |
| 1775 int32_t numCaptureGroups = regexp->fMatcher->groupCount(); |
| 1776 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow
errors so that the strings are still counted |
| 1777 for (i=0; ; i++) { |
| 1778 if (i>=destFieldsCapacity-1) { |
| 1779 // There are one or zero output strings left. |
| 1780 // Fill the last output string with whatever is left from the input,
then exit the loop. |
| 1781 // ( i will be == destFieldsCapacity if we filled the output array
while processing |
| 1782 // capture groups of the delimiter expression, in which case we w
ill discard the |
| 1783 // last capture group saved in favor of the unprocessed remainder
of the |
| 1784 // input string.) |
| 1785 if (inputLen > nextOutputStringStart) { |
| 1786 if (i != destFieldsCapacity-1) { |
| 1787 // No fields are left. Recycle the last one for holding the
trailing part of |
| 1788 // the input string. |
| 1789 i = destFieldsCapacity-1; |
| 1790 destIdx = (int32_t)(destFields[i] - destFields[0]); |
| 1791 } |
| 1792 |
| 1793 destFields[i] = &destBuf[destIdx]; |
| 1794 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, i
nputLen, |
| 1795 &destBuf[destIdx], REMAINING_CAPACI
TY(destIdx, destCapacity), status); |
| 1796 } |
| 1797 break; |
| 1798 } |
| 1799 |
| 1800 if (regexp->fMatcher->find()) { |
| 1801 // We found another delimiter. Move everything from where we starte
d looking |
| 1802 // up until the start of the delimiter into the next output string. |
| 1803 destFields[i] = &destBuf[destIdx]; |
| 1804 |
| 1805 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regex
p->fMatcher->fMatchStart, |
| 1806 &destBuf[destIdx], REMAINING_CAPACITY(d
estIdx, destCapacity), &tStatus); |
| 1807 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { |
| 1808 tStatus = U_ZERO_ERROR; |
| 1809 } else { |
| 1810 *status = tStatus; |
| 1811 } |
| 1812 nextOutputStringStart = regexp->fMatcher->fMatchEnd; |
| 1813 |
| 1814 // If the delimiter pattern has capturing parentheses, the captured |
| 1815 // text goes out into the next n destination strings. |
| 1816 int32_t groupNum; |
| 1817 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { |
| 1818 // If we've run out of output string slots, bail out. |
| 1819 if (i==destFieldsCapacity-1) { |
| 1820 break; |
| 1821 } |
| 1822 i++; |
| 1823 |
| 1824 // Set up to extract the capture group contents into the dest bu
ffer. |
| 1825 destFields[i] = &destBuf[destIdx]; |
| 1826 tStatus = U_ZERO_ERROR; |
| 1827 int32_t t = uregex_group((URegularExpression*)regexp, groupNum,
destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus); |
| 1828 destIdx += t + 1; // Record the space used in the output stri
ng buffer. |
| 1829 // +1 for the NUL that terminates the stri
ng. |
| 1830 if (tStatus == U_BUFFER_OVERFLOW_ERROR) { |
| 1831 tStatus = U_ZERO_ERROR; |
| 1832 } else { |
| 1833 *status = tStatus; |
| 1834 } |
| 1835 } |
| 1836 |
| 1837 if (nextOutputStringStart == inputLen) { |
| 1838 // The delimiter was at the end of the string. We're done. |
| 1839 break; |
| 1840 } |
| 1841 |
| 1842 } |
| 1843 else |
| 1844 { |
| 1845 // We ran off the end of the input while looking for the next delimi
ter. |
| 1846 // All the remaining text goes into the current output string. |
| 1847 destFields[i] = &destBuf[destIdx]; |
| 1848 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, input
Len, |
| 1849 &destBuf[destIdx], REMAINING_CAPACITY(d
estIdx, destCapacity), status); |
| 1850 break; |
| 1851 } |
| 1852 } |
| 1853 |
| 1854 // Zero out any unused portion of the destFields array |
| 1855 int j; |
| 1856 for (j=i+1; j<destFieldsCapacity; j++) { |
| 1857 destFields[j] = NULL; |
| 1858 } |
| 1859 |
| 1860 if (requiredCapacity != NULL) { |
| 1861 *requiredCapacity = destIdx; |
| 1862 } |
| 1863 if (destIdx > destCapacity) { |
| 1864 *status = U_BUFFER_OVERFLOW_ERROR; |
| 1865 } |
| 1866 return i+1; |
| 1867 } |
| 1868 |
| 1869 // |
| 1870 // uregex_split The actual API function |
| 1871 // |
| 1872 U_CAPI int32_t U_EXPORT2 |
| 1873 uregex_split(URegularExpression *regexp2, |
| 1874 UChar *destBuf, |
| 1875 int32_t destCapacity, |
| 1876 int32_t *requiredCapacity, |
| 1877 UChar *destFields[], |
| 1878 int32_t destFieldsCapacity, |
| 1879 UErrorCode *status) { |
| 1880 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1881 if (validateRE(regexp, status) == FALSE) { |
| 1882 return 0; |
| 1883 } |
| 1884 if ((destBuf == NULL && destCapacity > 0) || |
| 1885 destCapacity < 0 || |
| 1886 destFields == NULL || |
| 1887 destFieldsCapacity < 1 ) { |
| 1888 *status = U_ILLEGAL_ARGUMENT_ERROR; |
| 1889 return 0; |
| 1890 } |
| 1891 |
| 1892 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, de
stFields, destFieldsCapacity, status); |
| 1893 } |
| 1894 |
| 1895 |
| 1896 // |
| 1897 // uregex_splitUText...can just use the normal C++ method |
| 1898 // |
| 1899 U_CAPI int32_t U_EXPORT2 |
| 1900 uregex_splitUText(URegularExpression *regexp2, |
| 1901 UText *destFields[], |
| 1902 int32_t destFieldsCapacity, |
| 1903 UErrorCode *status) { |
| 1904 RegularExpression *regexp = (RegularExpression*)regexp2; |
| 1905 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, de
stFieldsCapacity, *status); |
| 1906 } |
| 1907 |
| 1908 |
| 1909 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 1910 |
OLD | NEW |