OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2004-2010, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 * file name: uregex.h |
| 7 * encoding: US-ASCII |
| 8 * indentation:4 |
| 9 * |
| 10 * created on: 2004mar09 |
| 11 * created by: Andy Heninger |
| 12 * |
| 13 * ICU Regular Expressions, API for C |
| 14 */ |
| 15 |
| 16 /** |
| 17 * \file |
| 18 * \brief C API: Regular Expressions |
| 19 * |
| 20 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> |
| 21 */ |
| 22 |
| 23 #ifndef UREGEX_H |
| 24 #define UREGEX_H |
| 25 |
| 26 #include "unicode/utext.h" |
| 27 #include "unicode/utypes.h" |
| 28 |
| 29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 30 |
| 31 #include "unicode/localpointer.h" |
| 32 #include "unicode/parseerr.h" |
| 33 |
| 34 struct URegularExpression; |
| 35 /** |
| 36 * Structure representing a compiled regular expression, plus the results |
| 37 * of a match operation. |
| 38 * @stable ICU 3.0 |
| 39 */ |
| 40 typedef struct URegularExpression URegularExpression; |
| 41 |
| 42 |
| 43 /** |
| 44 * Constants for Regular Expression Match Modes. |
| 45 * @stable ICU 2.4 |
| 46 */ |
| 47 typedef enum URegexpFlag{ |
| 48 |
| 49 #ifndef U_HIDE_DRAFT_API |
| 50 /** Forces normalization of pattern and strings. |
| 51 Not implemented yet, just a placeholder, hence draft. |
| 52 @draft ICU 2.4 */ |
| 53 UREGEX_CANON_EQ = 128, |
| 54 #endif |
| 55 /** Enable case insensitive matching. @stable ICU 2.4 */ |
| 56 UREGEX_CASE_INSENSITIVE = 2, |
| 57 |
| 58 /** Allow white space and comments within patterns @stable ICU 2.4 */ |
| 59 UREGEX_COMMENTS = 4, |
| 60 |
| 61 /** If set, '.' matches line terminators, otherwise '.' matching stops at
line end. |
| 62 * @stable ICU 2.4 */ |
| 63 UREGEX_DOTALL = 32, |
| 64 |
| 65 /** If set, treat the entire pattern as a literal string. |
| 66 * Metacharacters or escape sequences in the input sequence will be given |
| 67 * no special meaning. Not implemented yet as of ICU 4.4. |
| 68 * |
| 69 * The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact |
| 70 * on matching when used in conjunction with this flag. |
| 71 * The other flags become superfluous. |
| 72 * TODO: say which escapes are still handled; anything Java does |
| 73 * early (\\u) we should still do. |
| 74 * @stable ICU 4.0 |
| 75 */ |
| 76 UREGEX_LITERAL = 16, |
| 77 |
| 78 /** Control behavior of "$" and "^" |
| 79 * If set, recognize line terminators within string, |
| 80 * otherwise, match only at start and end of input string. |
| 81 * @stable ICU 2.4 */ |
| 82 UREGEX_MULTILINE = 8, |
| 83 |
| 84 /** Unix-only line endings. |
| 85 * When this mode is enabled, only \\u000a is recognized as a line ending |
| 86 * in the behavior of ., ^, and $. |
| 87 * @stable ICU 4.0 |
| 88 */ |
| 89 UREGEX_UNIX_LINES = 1, |
| 90 |
| 91 /** Unicode word boundaries. |
| 92 * If set, \b uses the Unicode TR 29 definition of word boundaries. |
| 93 * Warning: Unicode word boundaries are quite different from |
| 94 * traditional regular expression word boundaries. See |
| 95 * http://unicode.org/reports/tr29/#Word_Boundaries |
| 96 * @stable ICU 2.8 |
| 97 */ |
| 98 UREGEX_UWORD = 256, |
| 99 |
| 100 /** Error on Unrecognized backslash escapes. |
| 101 * If set, fail with an error on patterns that contain |
| 102 * backslash-escaped ASCII letters without a known special |
| 103 * meaning. If this flag is not set, these |
| 104 * escaped letters represent themselves. |
| 105 * @stable ICU 4.0 |
| 106 */ |
| 107 UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 |
| 108 |
| 109 } URegexpFlag; |
| 110 |
| 111 /** |
| 112 * Open (compile) an ICU regular expression. Compiles the regular expression
in |
| 113 * string form into an internal representation using the specified match mode
flags. |
| 114 * The resulting regular expression handle can then be used to perform various |
| 115 * matching operations. |
| 116 * |
| 117 * |
| 118 * @param pattern The Regular Expression pattern to be compiled. |
| 119 * @param patternLength The length of the pattern, or -1 if the pattern is |
| 120 * NUL terminated. |
| 121 * @param flags Flags that alter the default matching behavior for |
| 122 * the regular expression, UREGEX_CASE_INSENSITIVE, for |
| 123 * example. For default behavior, set this parameter to
zero. |
| 124 * See <code>enum URegexpFlag</code>. All desired flags |
| 125 * are bitwise-ORed together. |
| 126 * @param pe Receives the position (line and column numbers) of any
syntax |
| 127 * error within the source regular expression string. If
this |
| 128 * information is not wanted, pass NULL for this paramete
r. |
| 129 * @param status Receives errors detected by this function. |
| 130 * @stable ICU 3.0 |
| 131 * |
| 132 */ |
| 133 U_STABLE URegularExpression * U_EXPORT2 |
| 134 uregex_open( const UChar *pattern, |
| 135 int32_t patternLength, |
| 136 uint32_t flags, |
| 137 UParseError *pe, |
| 138 UErrorCode *status); |
| 139 |
| 140 /** |
| 141 * Open (compile) an ICU regular expression. Compiles the regular expression
in |
| 142 * string form into an internal representation using the specified match mode
flags. |
| 143 * The resulting regular expression handle can then be used to perform various |
| 144 * matching operations. |
| 145 * <p> |
| 146 * The contents of the pattern UText will be extracted and saved. Ownership of
the |
| 147 * UText struct itself remains with the caller. This is to match the behavior
of |
| 148 * uregex_open(). |
| 149 * |
| 150 * @param pattern The Regular Expression pattern to be compiled. |
| 151 * @param flags Flags that alter the default matching behavior for |
| 152 * the regular expression, UREGEX_CASE_INSENSITIVE, for |
| 153 * example. For default behavior, set this parameter to
zero. |
| 154 * See <code>enum URegexpFlag</code>. All desired flags |
| 155 * are bitwise-ORed together. |
| 156 * @param pe Receives the position (line and column numbers) of any
syntax |
| 157 * error within the source regular expression string. If
this |
| 158 * information is not wanted, pass NULL for this paramete
r. |
| 159 * @param status Receives errors detected by this function. |
| 160 * |
| 161 * @draft ICU 4.6 |
| 162 */ |
| 163 U_DRAFT URegularExpression * U_EXPORT2 |
| 164 uregex_openUText(UText *pattern, |
| 165 uint32_t flags, |
| 166 UParseError *pe, |
| 167 UErrorCode *status); |
| 168 |
| 169 /** |
| 170 * Open (compile) an ICU regular expression. The resulting regular expression |
| 171 * handle can then be used to perform various matching operations. |
| 172 * <p> |
| 173 * This function is the same as uregex_open, except that the pattern |
| 174 * is supplied as an 8 bit char * string in the default code page. |
| 175 * |
| 176 * @param pattern The Regular Expression pattern to be compiled, |
| 177 * NUL terminated. |
| 178 * @param flags Flags that alter the default matching behavior for |
| 179 * the regular expression, UREGEX_CASE_INSENSITIVE, for |
| 180 * example. For default behavior, set this parameter to
zero. |
| 181 * See <code>enum URegexpFlag</code>. All desired flags |
| 182 * are bitwise-ORed together. |
| 183 * @param pe Receives the position (line and column numbers) of any
syntax |
| 184 * error within the source regular expression string. If
this |
| 185 * information is not wanted, pass NULL for this paramete
r. |
| 186 * @param status Receives errors detected by this function. |
| 187 * @return The URegularExpression object representing the compile
d |
| 188 * pattern. |
| 189 * |
| 190 * @stable ICU 3.0 |
| 191 */ |
| 192 #if !UCONFIG_NO_CONVERSION |
| 193 U_STABLE URegularExpression * U_EXPORT2 |
| 194 uregex_openC( const char *pattern, |
| 195 uint32_t flags, |
| 196 UParseError *pe, |
| 197 UErrorCode *status); |
| 198 #endif |
| 199 |
| 200 |
| 201 |
| 202 /** |
| 203 * Close the regular expression, recovering all resources (memory) it |
| 204 * was holding. |
| 205 * |
| 206 * @param regexp The regular expression to be closed. |
| 207 * @stable ICU 3.0 |
| 208 */ |
| 209 U_STABLE void U_EXPORT2 |
| 210 uregex_close(URegularExpression *regexp); |
| 211 |
| 212 #if U_SHOW_CPLUSPLUS_API |
| 213 |
| 214 U_NAMESPACE_BEGIN |
| 215 |
| 216 /** |
| 217 * \class LocalURegularExpressionPointer |
| 218 * "Smart pointer" class, closes a URegularExpression via uregex_close(). |
| 219 * For most methods see the LocalPointerBase base class. |
| 220 * |
| 221 * @see LocalPointerBase |
| 222 * @see LocalPointer |
| 223 * @stable ICU 4.4 |
| 224 */ |
| 225 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression,
uregex_close); |
| 226 |
| 227 U_NAMESPACE_END |
| 228 |
| 229 #endif |
| 230 |
| 231 /** |
| 232 * Make a copy of a compiled regular expression. Cloning a regular |
| 233 * expression is faster than opening a second instance from the source |
| 234 * form of the expression, and requires less memory. |
| 235 * <p> |
| 236 * Note that the current input string and the position of any matched text |
| 237 * within it are not cloned; only the pattern itself and the |
| 238 * match mode flags are copied. |
| 239 * <p> |
| 240 * Cloning can be particularly useful to threaded applications that perform |
| 241 * multiple match operations in parallel. Each concurrent RE |
| 242 * operation requires its own instance of a URegularExpression. |
| 243 * |
| 244 * @param regexp The compiled regular expression to be cloned. |
| 245 * @param status Receives indication of any errors encountered |
| 246 * @return the cloned copy of the compiled regular expression. |
| 247 * @stable ICU 3.0 |
| 248 */ |
| 249 U_STABLE URegularExpression * U_EXPORT2 |
| 250 uregex_clone(const URegularExpression *regexp, UErrorCode *status); |
| 251 |
| 252 /** |
| 253 * Returns a pointer to the source form of the pattern for this regular express
ion. |
| 254 * This function will work even if the pattern was originally specified as a UT
ext. |
| 255 * |
| 256 * @param regexp The compiled regular expression. |
| 257 * @param patLength This output parameter will be set to the length of the |
| 258 * pattern string. A NULL pointer may be used here if the |
| 259 * pattern length is not needed, as would be the case if |
| 260 * the pattern is known in advance to be a NUL terminated |
| 261 * string. |
| 262 * @param status Receives errors detected by this function. |
| 263 * @return a pointer to the pattern string. The storage for the string is |
| 264 * owned by the regular expression object, and must not be |
| 265 * altered or deleted by the application. The returned string |
| 266 * will remain valid until the regular expression is closed. |
| 267 * @stable ICU 3.0 |
| 268 */ |
| 269 U_STABLE const UChar * U_EXPORT2 |
| 270 uregex_pattern(const URegularExpression *regexp, |
| 271 int32_t *patLength, |
| 272 UErrorCode *status); |
| 273 |
| 274 /** |
| 275 * Returns the source text of the pattern for this regular expression. |
| 276 * This function will work even if the pattern was originally specified as a UC
har string. |
| 277 * |
| 278 * @param regexp The compiled regular expression. |
| 279 * @param status Receives errors detected by this function. |
| 280 * @return the pattern text. The storage for the text is owned by the regular e
xpression |
| 281 * object, and must not be altered or deleted. |
| 282 * |
| 283 * @draft ICU 4.6 |
| 284 */ |
| 285 U_DRAFT UText * U_EXPORT2 |
| 286 uregex_patternUText(const URegularExpression *regexp, |
| 287 UErrorCode *status); |
| 288 |
| 289 |
| 290 /** |
| 291 * Get the match mode flags that were specified when compiling this regular exp
ression. |
| 292 * @param status Receives errors detected by this function. |
| 293 * @param regexp The compiled regular expression. |
| 294 * @return The match mode flags |
| 295 * @see URegexpFlag |
| 296 * @stable ICU 3.0 |
| 297 */ |
| 298 U_STABLE int32_t U_EXPORT2 |
| 299 uregex_flags(const URegularExpression *regexp, |
| 300 UErrorCode *status); |
| 301 |
| 302 |
| 303 /** |
| 304 * Set the subject text string upon which the regular expression will look for
matches. |
| 305 * This function may be called any number of times, allowing the regular |
| 306 * expression pattern to be applied to different strings. |
| 307 * <p> |
| 308 * Regular expression matching operations work directly on the application's |
| 309 * string data. No copy is made. The subject string data must not be |
| 310 * altered after calling this function until after all regular expression |
| 311 * operations involving this string data are completed. |
| 312 * <p> |
| 313 * Zero length strings are permitted. In this case, no subsequent match |
| 314 * operation will dereference the text string pointer. |
| 315 * |
| 316 * @param regexp The compiled regular expression. |
| 317 * @param text The subject text string. |
| 318 * @param textLength The length of the subject text, or -1 if the string |
| 319 * is NUL terminated. |
| 320 * @param status Receives errors detected by this function. |
| 321 * @stable ICU 3.0 |
| 322 */ |
| 323 U_STABLE void U_EXPORT2 |
| 324 uregex_setText(URegularExpression *regexp, |
| 325 const UChar *text, |
| 326 int32_t textLength, |
| 327 UErrorCode *status); |
| 328 |
| 329 |
| 330 /** |
| 331 * Set the subject text string upon which the regular expression will look for
matches. |
| 332 * This function may be called any number of times, allowing the regular |
| 333 * expression pattern to be applied to different strings. |
| 334 * <p> |
| 335 * Regular expression matching operations work directly on the application's |
| 336 * string data; only a shallow clone is made. The subject string data must no
t be |
| 337 * altered after calling this function until after all regular expression |
| 338 * operations involving this string data are completed. |
| 339 * |
| 340 * @param regexp The compiled regular expression. |
| 341 * @param text The subject text string. |
| 342 * @param status Receives errors detected by this function. |
| 343 * |
| 344 * @draft ICU 4.6 |
| 345 */ |
| 346 U_DRAFT void U_EXPORT2 |
| 347 uregex_setUText(URegularExpression *regexp, |
| 348 UText *text, |
| 349 UErrorCode *status); |
| 350 |
| 351 /** |
| 352 * Get the subject text that is currently associated with this |
| 353 * regular expression object. If the input was supplied using uregex_setText
(), |
| 354 * that pointer will be returned. Otherwise, the characters in the input wil
l |
| 355 * be extracted to a buffer and returned. In either case, ownership remains |
| 356 * with the regular expression object. |
| 357 * |
| 358 * This function will work even if the input was originally specified as a UTe
xt. |
| 359 * |
| 360 * @param regexp The compiled regular expression. |
| 361 * @param textLength The length of the string is returned in this output param
eter. |
| 362 * A NULL pointer may be used here if the |
| 363 * text length is not needed, as would be the case if |
| 364 * the text is known in advance to be a NUL terminated |
| 365 * string. |
| 366 * @param status Receives errors detected by this function. |
| 367 * @return Pointer to the subject text string currently associated w
ith |
| 368 * this regular expression. |
| 369 * @stable ICU 3.0 |
| 370 */ |
| 371 U_STABLE const UChar * U_EXPORT2 |
| 372 uregex_getText(URegularExpression *regexp, |
| 373 int32_t *textLength, |
| 374 UErrorCode *status); |
| 375 |
| 376 |
| 377 /** |
| 378 * Get the subject text that is currently associated with this |
| 379 * regular expression object. |
| 380 * |
| 381 * This function will work even if the input was originally specified as a UCh
ar string. |
| 382 * |
| 383 * @param regexp The compiled regular expression. |
| 384 * @param dest A mutable UText in which to store the current input. |
| 385 * If NULL, a new UText will be created as an immutable shal
low clone |
| 386 * of the actual input string. |
| 387 * @param status Receives errors detected by this function. |
| 388 * @return The subject text currently associated with this regular e
xpression. |
| 389 * If a pre-allocated UText was provided, it will always be
used and returned. |
| 390 * |
| 391 * @draft ICU 4.6 |
| 392 */ |
| 393 U_DRAFT UText * U_EXPORT2 |
| 394 uregex_getUText(URegularExpression *regexp, |
| 395 UText *dest, |
| 396 UErrorCode *status); |
| 397 |
| 398 /** |
| 399 * Attempts to match the input string against the pattern. |
| 400 * To succeed, the match must extend to the end of the string, |
| 401 * or cover the complete match region. |
| 402 * |
| 403 * If startIndex >= zero the match operation starts at the specified |
| 404 * index and must extend to the end of the input string. Any region |
| 405 * that has been specified is reset. |
| 406 * |
| 407 * If startIndex == -1 the match must cover the input region, or the entire |
| 408 * input string if no region has been set. This directly corresponds to |
| 409 * Matcher.matches() in Java |
| 410 * |
| 411 * @param regexp The compiled regular expression. |
| 412 * @param startIndex The input string (native) index at which to begin mat
ching, or -1 |
| 413 * to match the input Region. |
| 414 * @param status Receives errors detected by this function. |
| 415 * @return TRUE if there is a match |
| 416 * @stable ICU 3.0 |
| 417 */ |
| 418 U_STABLE UBool U_EXPORT2 |
| 419 uregex_matches(URegularExpression *regexp, |
| 420 int32_t startIndex, |
| 421 UErrorCode *status); |
| 422 |
| 423 /** |
| 424 * 64bit version of uregex_matches. |
| 425 * @draft ICU 4.6 |
| 426 */ |
| 427 U_DRAFT UBool U_EXPORT2 |
| 428 uregex_matches64(URegularExpression *regexp, |
| 429 int64_t startIndex, |
| 430 UErrorCode *status); |
| 431 |
| 432 /** |
| 433 * Attempts to match the input string, starting from the specified index, aga
inst the pattern. |
| 434 * The match may be of any length, and is not required to extend to the end |
| 435 * of the input string. Contrast with uregex_matches(). |
| 436 * |
| 437 * <p>If startIndex is >= 0 any input region that was set for this |
| 438 * URegularExpression is reset before the operation begins. |
| 439 * |
| 440 * <p>If the specified starting index == -1 the match begins at the start of
the input |
| 441 * region, or at the start of the full string if no region has been specified
. |
| 442 * This corresponds directly with Matcher.lookingAt() in Java. |
| 443 * |
| 444 * <p>If the match succeeds then more information can be obtained via the |
| 445 * <code>uregex_start()</code>, <code>uregex_end()</code>, |
| 446 * and <code>uregex_group()</code> functions.</p> |
| 447 * |
| 448 * @param regexp The compiled regular expression. |
| 449 * @param startIndex The input string (native) index at which to begin ma
tching, or |
| 450 * -1 to match the Input Region |
| 451 * @param status A reference to a UErrorCode to receive any errors. |
| 452 * @return TRUE if there is a match. |
| 453 * @stable ICU 3.0 |
| 454 */ |
| 455 U_STABLE UBool U_EXPORT2 |
| 456 uregex_lookingAt(URegularExpression *regexp, |
| 457 int32_t startIndex, |
| 458 UErrorCode *status); |
| 459 |
| 460 /** |
| 461 * 64bit version of uregex_lookingAt. |
| 462 * @draft ICU 4.6 |
| 463 */ |
| 464 U_DRAFT UBool U_EXPORT2 |
| 465 uregex_lookingAt64(URegularExpression *regexp, |
| 466 int64_t startIndex, |
| 467 UErrorCode *status); |
| 468 |
| 469 /** |
| 470 * Find the first matching substring of the input string that matches the pat
tern. |
| 471 * If startIndex is >= zero the search for a match begins at the specified in
dex, |
| 472 * and any match region is reset. This corresponds directly with |
| 473 * Matcher.find(startIndex) in Java. |
| 474 * |
| 475 * If startIndex == -1 the search begins at the start of the input region, |
| 476 * or at the start of the full string if no region has been specified
. |
| 477 * |
| 478 * If a match is found, <code>uregex_start(), uregex_end()</code>, and |
| 479 * <code>uregex_group()</code> will provide more information regarding the ma
tch. |
| 480 * |
| 481 * @param regexp The compiled regular expression. |
| 482 * @param startIndex The position (native) in the input string to begin th
e search, or |
| 483 * -1 to search within the Input Region. |
| 484 * @param status A reference to a UErrorCode to receive any errors. |
| 485 * @return TRUE if a match is found. |
| 486 * @stable ICU 3.0 |
| 487 */ |
| 488 U_STABLE UBool U_EXPORT2 |
| 489 uregex_find(URegularExpression *regexp, |
| 490 int32_t startIndex, |
| 491 UErrorCode *status); |
| 492 |
| 493 /** |
| 494 * 64bit version of uregex_find. |
| 495 * @draft ICU 4.6 |
| 496 */ |
| 497 U_DRAFT UBool U_EXPORT2 |
| 498 uregex_find64(URegularExpression *regexp, |
| 499 int64_t startIndex, |
| 500 UErrorCode *status); |
| 501 |
| 502 /** |
| 503 * Find the next pattern match in the input string. Begin searching |
| 504 * the input at the location following the end of the previous match, |
| 505 * or at the start of the string (or region) if there is no |
| 506 * previous match. If a match is found, <code>uregex_start(), uregex_end()</c
ode>, and |
| 507 * <code>uregex_group()</code> will provide more information regarding the mat
ch. |
| 508 * |
| 509 * @param regexp The compiled regular expression. |
| 510 * @param status A reference to a UErrorCode to receive any errors. |
| 511 * @return TRUE if a match is found. |
| 512 * @see uregex_reset |
| 513 * @stable ICU 3.0 |
| 514 */ |
| 515 U_STABLE UBool U_EXPORT2 |
| 516 uregex_findNext(URegularExpression *regexp, |
| 517 UErrorCode *status); |
| 518 |
| 519 /** |
| 520 * Get the number of capturing groups in this regular expression's pattern. |
| 521 * @param regexp The compiled regular expression. |
| 522 * @param status A reference to a UErrorCode to receive any errors. |
| 523 * @return the number of capture groups |
| 524 * @stable ICU 3.0 |
| 525 */ |
| 526 U_STABLE int32_t U_EXPORT2 |
| 527 uregex_groupCount(URegularExpression *regexp, |
| 528 UErrorCode *status); |
| 529 |
| 530 /** Extract the string for the specified matching expression or subexpression. |
| 531 * Group #0 is the complete string of matched text. |
| 532 * Group #1 is the text matched by the first set of capturing parentheses. |
| 533 * |
| 534 * @param regexp The compiled regular expression. |
| 535 * @param groupNum The capture group to extract. Group 0 is the comple
te |
| 536 * match. The value of this parameter must be |
| 537 * less than or equal to the number of capture groups i
n |
| 538 * the pattern. |
| 539 * @param dest Buffer to receive the matching string data |
| 540 * @param destCapacity Capacity of the dest buffer. |
| 541 * @param status A reference to a UErrorCode to receive any errors. |
| 542 * @return Length of matching data, |
| 543 * or -1 if no applicable match. |
| 544 * @stable ICU 3.0 |
| 545 */ |
| 546 U_STABLE int32_t U_EXPORT2 |
| 547 uregex_group(URegularExpression *regexp, |
| 548 int32_t groupNum, |
| 549 UChar *dest, |
| 550 int32_t destCapacity, |
| 551 UErrorCode *status); |
| 552 |
| 553 |
| 554 /** Returns a shallow immutable clone of the entire input string. The returned
UText current native index |
| 555 * is set to the beginning of the requested capture group. The capture group
length is also |
| 556 * returned via groupLength. |
| 557 * Group #0 is the complete string of matched text. |
| 558 * Group #1 is the text matched by the first set of capturing parentheses. |
| 559 * |
| 560 * @param regexp The compiled regular expression. |
| 561 * @param groupNum The capture group to extract. Group 0 is the comple
te |
| 562 * match. The value of this parameter must be |
| 563 * less than or equal to the number of capture groups i
n |
| 564 * the pattern. |
| 565 * @param dest A mutable UText in which to store the current input. |
| 566 * If NULL, a new UText will be created as an immutable
shallow clone |
| 567 * of the entire input string. |
| 568 * @param groupLength The group length of the desired capture group. |
| 569 * @param status A reference to a UErrorCode to receive any errors. |
| 570 * @return The subject text currently associated with this regu
lar expression. |
| 571 * If a pre-allocated UText was provided, it will alway
s be used and returned. |
| 572 |
| 573 * |
| 574 * @draft ICU 4.6 |
| 575 */ |
| 576 U_DRAFT UText * U_EXPORT2 |
| 577 uregex_groupUText(URegularExpression *regexp, |
| 578 int32_t groupNum, |
| 579 UText *dest, |
| 580 int64_t *groupLength, |
| 581 UErrorCode *status); |
| 582 |
| 583 |
| 584 /** Extract the string for the specified matching expression or subexpression. |
| 585 * Group #0 is the complete string of matched text. |
| 586 * Group #1 is the text matched by the first set of capturing parentheses. |
| 587 * |
| 588 * @param regexp The compiled regular expression. |
| 589 * @param groupNum The capture group to extract. Group 0 is the comple
te |
| 590 * match. The value of this parameter must be |
| 591 * less than or equal to the number of capture groups i
n |
| 592 * the pattern. |
| 593 * @param dest Mutable UText to receive the matching string data. |
| 594 * If NULL, a new UText will be created (which may not
be mutable). |
| 595 * @param status A reference to a UErrorCode to receive any errors. |
| 596 * @return The matching string data. If a pre-allocated UText w
as provided, |
| 597 * it will always be used and returned. |
| 598 * |
| 599 * @internal ICU 4.4 technology preview |
| 600 */ |
| 601 U_INTERNAL UText * U_EXPORT2 |
| 602 uregex_groupUTextDeep(URegularExpression *regexp, |
| 603 int32_t groupNum, |
| 604 UText *dest, |
| 605 UErrorCode *status); |
| 606 |
| 607 /** |
| 608 * Returns the index in the input string of the start of the text matched by
the |
| 609 * specified capture group during the previous match operation. Return -1 if |
| 610 * the capture group was not part of the last match. |
| 611 * Group #0 refers to the complete range of matched text. |
| 612 * Group #1 refers to the text matched by the first set of capturing parenthe
ses. |
| 613 * |
| 614 * @param regexp The compiled regular expression. |
| 615 * @param groupNum The capture group number |
| 616 * @param status A reference to a UErrorCode to receive any errors. |
| 617 * @return the starting (native) position in the input of the t
ext matched |
| 618 * by the specified group. |
| 619 * @stable ICU 3.0 |
| 620 */ |
| 621 U_STABLE int32_t U_EXPORT2 |
| 622 uregex_start(URegularExpression *regexp, |
| 623 int32_t groupNum, |
| 624 UErrorCode *status); |
| 625 |
| 626 /** |
| 627 * 64bit version of uregex_start. |
| 628 * @draft ICU 4.6 |
| 629 */ |
| 630 U_DRAFT int64_t U_EXPORT2 |
| 631 uregex_start64(URegularExpression *regexp, |
| 632 int32_t groupNum, |
| 633 UErrorCode *status); |
| 634 |
| 635 /** |
| 636 * Returns the index in the input string of the position following the end |
| 637 * of the text matched by the specified capture group. |
| 638 * Return -1 if the capture group was not part of the last match. |
| 639 * Group #0 refers to the complete range of matched text. |
| 640 * Group #1 refers to the text matched by the first set of capturing parenthe
ses. |
| 641 * |
| 642 * @param regexp The compiled regular expression. |
| 643 * @param groupNum The capture group number |
| 644 * @param status A reference to a UErrorCode to receive any errors. |
| 645 * @return the (native) index of the position following the las
t matched character. |
| 646 * @stable ICU 3.0 |
| 647 */ |
| 648 U_STABLE int32_t U_EXPORT2 |
| 649 uregex_end(URegularExpression *regexp, |
| 650 int32_t groupNum, |
| 651 UErrorCode *status); |
| 652 |
| 653 /** |
| 654 * 64bit version of uregex_end. |
| 655 * @draft ICU 4.6 |
| 656 */ |
| 657 U_DRAFT int64_t U_EXPORT2 |
| 658 uregex_end64(URegularExpression *regexp, |
| 659 int32_t groupNum, |
| 660 UErrorCode *status); |
| 661 |
| 662 /** |
| 663 * Reset any saved state from the previous match. Has the effect of |
| 664 * causing uregex_findNext to begin at the specified index, and causing |
| 665 * uregex_start(), uregex_end() and uregex_group() to return an error |
| 666 * indicating that there is no match information available. Clears any |
| 667 * match region that may have been set. |
| 668 * |
| 669 * @param regexp The compiled regular expression. |
| 670 * @param index The position (native) in the text at which a |
| 671 * uregex_findNext() should begin searching. |
| 672 * @param status A reference to a UErrorCode to receive any errors. |
| 673 * @stable ICU 3.0 |
| 674 */ |
| 675 U_STABLE void U_EXPORT2 |
| 676 uregex_reset(URegularExpression *regexp, |
| 677 int32_t index, |
| 678 UErrorCode *status); |
| 679 |
| 680 /** |
| 681 * 64bit version of uregex_reset. |
| 682 * @draft ICU 4.6 |
| 683 */ |
| 684 U_DRAFT void U_EXPORT2 |
| 685 uregex_reset64(URegularExpression *regexp, |
| 686 int64_t index, |
| 687 UErrorCode *status); |
| 688 |
| 689 /** Sets the limits of the matching region for this URegularExpression. |
| 690 * The region is the part of the input string that will be considered when matc
hing. |
| 691 * Invoking this method resets any saved state from the previous match, |
| 692 * then sets the region to start at the index specified by the start parameter |
| 693 * and end at the index specified by the end parameter. |
| 694 * |
| 695 * Depending on the transparency and anchoring being used (see useTransparentBo
unds |
| 696 * and useAnchoringBounds), certain constructs such as anchors may behave diffe
rently |
| 697 * at or around the boundaries of the region |
| 698 * |
| 699 * The function will fail if start is greater than limit, or if either index |
| 700 * is less than zero or greater than the length of the string being matched. |
| 701 * |
| 702 * @param regexp The compiled regular expression. |
| 703 * @param regionStart The (native) index to begin searches at. |
| 704 * @param regionLimit The (native) index to end searches at (exclusive). |
| 705 * @param status A pointer to a UErrorCode to receive any errors. |
| 706 * @stable ICU 4.0 |
| 707 */ |
| 708 U_STABLE void U_EXPORT2 |
| 709 uregex_setRegion(URegularExpression *regexp, |
| 710 int32_t regionStart, |
| 711 int32_t regionLimit, |
| 712 UErrorCode *status); |
| 713 |
| 714 /** |
| 715 * 64bit version of uregex_setRegion. |
| 716 * @draft ICU 4.6 |
| 717 */ |
| 718 U_DRAFT void U_EXPORT2 |
| 719 uregex_setRegion64(URegularExpression *regexp, |
| 720 int64_t regionStart, |
| 721 int64_t regionLimit, |
| 722 UErrorCode *status); |
| 723 |
| 724 /** |
| 725 * Variation on uregex_setRegion that sets the region without resetting the start index, |
| 726 * i.e. without resetting the position for subsequent matches. |
| 727 * @draft ICU 4.6 |
| 728 */ |
| 729 U_DRAFT void U_EXPORT2 |
| 730 uregex_setRegionAndStart(URegularExpression *regexp, |
| 731 int64_t regionStart, |
| 732 int64_t regionLimit, |
| 733 int64_t startIndex, |
| 734 UErrorCode *status); |
| 735 |
| 736 /** |
| 737 * Reports the start index of the matching region. Any matches found are limited |
| 738 * to the region bounded by regionStart (inclusive) and regionEnd (exclusive). |
| 739 * |
| 740 * @param regexp The compiled regular expression. |
| 741 * @param status A pointer to a UErrorCode to receive any errors. |
| 742 * @return The starting (native) index of this matcher's region. |
| 743 * @stable ICU 4.0 |
| 744 */ |
| 745 U_STABLE int32_t U_EXPORT2 |
| 746 uregex_regionStart(const URegularExpression *regexp, |
| 747 UErrorCode *status); |
| 748 |
| 749 /** |
| 750 * 64bit version of uregex_regionStart. |
| 751 * @draft ICU 4.6 |
| 752 */ |
| 753 U_DRAFT int64_t U_EXPORT2 |
| 754 uregex_regionStart64(const URegularExpression *regexp, |
| 755 UErrorCode *status); |
| 756 |
| 757 /** |
| 758 * Reports the end index (exclusive) of the matching region for this URegularEx
pression. |
| 759 * Any matches found are limited to the region bounded by regionStart (inclusive) |
| 760 * and regionEnd (exclusive). |
| 761 * |
| 762 * @param regexp The compiled regular expression. |
| 763 * @param status A pointer to a UErrorCode to receive any errors. |
| 764 * @return The ending point (native) of this matcher's region. |
| 765 * @stable ICU 4.0 |
| 766 */ |
| 767 U_STABLE int32_t U_EXPORT2 |
| 768 uregex_regionEnd(const URegularExpression *regexp, |
| 769 UErrorCode *status); |
| 770 |
| 771 /** |
| 772 * 64bit version of uregex_regionEnd. |
| 773 * @draft ICU 4.6 |
| 774 */ |
| 775 U_DRAFT int64_t U_EXPORT2 |
| 776 uregex_regionEnd64(const URegularExpression *regexp, |
| 777 UErrorCode *status); |
| 778 |
| 779 /** |
| 780 * Queries the transparency of region bounds for this URegularExpression. |
| 781 * See useTransparentBounds for a description of transparent and opaque bounds. |
| 782 * By default, matching boundaries are opaque. |
| 783 * |
| 784 * @param regexp The compiled regular expression. |
| 785 * @param status A pointer to a UErrorCode to receive any errors. |
| 786 * @return TRUE if this matcher is using transparent bounds, FALSE if it is not. |
| 787 * @stable ICU 4.0 |
| 788 */ |
| 789 U_STABLE UBool U_EXPORT2 |
| 790 uregex_hasTransparentBounds(const URegularExpression *regexp, |
| 791 UErrorCode *status); |
| 792 |
| 793 |
| 794 /** |
| 795 * Sets the transparency of region bounds for this URegularExpression. |
| 796 * Invoking this function with an argument of TRUE will set matches to use tran
sparent bounds. |
| 797 * If the boolean argument is FALSE, then opaque bounds will be used. |
| 798 * |
| 799 * Using transparent bounds, the boundaries of the matching region are transpar
ent |
| 800 * to lookahead, lookbehind, and boundary matching constructs. Those constructs
can |
| 801 * see text beyond the boundaries of the region while checking for a match. |
| 802 * |
| 803 * With opaque bounds, no text outside of the matching region is visible to loo
kahead, |
| 804 * lookbehind, and boundary matching constructs. |
| 805 * |
| 806 * By default, opaque bounds are used. |
| 807 * |
| 808 * @param regexp The compiled regular expression. |
| 809 * @param b TRUE for transparent bounds; FALSE for opaque bounds |
| 810 * @param status A pointer to a UErrorCode to receive any errors. |
| 811 * @stable ICU 4.0 |
| 812 **/ |
| 813 U_STABLE void U_EXPORT2 |
| 814 uregex_useTransparentBounds(URegularExpression *regexp, |
| 815 UBool b, |
| 816 UErrorCode *status); |
| 817 |
| 818 |
| 819 /** |
| 820 * Return true if this URegularExpression is using anchoring bounds. |
| 821 * By default, anchoring region bounds are used. |
| 822 * |
| 823 * @param regexp The compiled regular expression. |
| 824 * @param status A pointer to a UErrorCode to receive any errors. |
| 825 * @return TRUE if this matcher is using anchoring bounds. |
| 826 * @stable ICU 4.0 |
| 827 */ |
| 828 U_STABLE UBool U_EXPORT2 |
| 829 uregex_hasAnchoringBounds(const URegularExpression *regexp, |
| 830 UErrorCode *status); |
| 831 |
| 832 |
| 833 /** |
| 834 * Set whether this URegularExpression is using Anchoring Bounds for its region
. |
| 835 * With anchoring bounds, pattern anchors such as ^ and $ will match at the sta
rt |
| 836 * and end of the region. Without Anchoring Bounds, anchors will only match at |
| 837 * the positions they would in the complete text. |
| 838 * |
| 839 * Anchoring Bounds are the default for regions. |
| 840 * |
| 841 * @param regexp The compiled regular expression. |
| 842 * @param b TRUE if to enable anchoring bounds; FALSE to disable them. |
| 843 * @param status A pointer to a UErrorCode to receive any errors. |
| 844 * @stable ICU 4.0 |
| 845 */ |
| 846 U_STABLE void U_EXPORT2 |
| 847 uregex_useAnchoringBounds(URegularExpression *regexp, |
| 848 UBool b, |
| 849 UErrorCode *status); |
| 850 |
| 851 /** |
| 852 * Return TRUE if the most recent matching operation touched the |
| 853 * end of the text being processed. In this case, additional input text could |
| 854 * change the results of that match. |
| 855 * |
| 856 * @param regexp The compiled regular expression. |
| 857 * @param status A pointer to a UErrorCode to receive any errors. |
| 858 * @return TRUE if the most recent match hit the end of input |
| 859 * @stable ICU 4.0 |
| 860 */ |
| 861 U_STABLE UBool U_EXPORT2 |
| 862 uregex_hitEnd(const URegularExpression *regexp, |
| 863 UErrorCode *status); |
| 864 |
| 865 /** |
| 866 * Return TRUE if the most recent match succeeded and additional input could cause |
| 867 * it to fail. If this function returns false and a match was found, then more
input |
| 868 * might change the match but the match won't be lost. If a match was not found
, |
| 869 * then requireEnd has no meaning. |
| 870 * |
| 871 * @param regexp The compiled regular expression. |
| 872 * @param status A pointer to a UErrorCode to receive any errors. |
| 873 * @return TRUE if more input could cause the most recent match to no longer m
atch. |
| 874 * @stable ICU 4.0 |
| 875 */ |
| 876 U_STABLE UBool U_EXPORT2 |
| 877 uregex_requireEnd(const URegularExpression *regexp, |
| 878 UErrorCode *status); |
| 879 |
| 880 |
| 881 |
| 882 |
| 883 |
| 884 /** |
| 885 * Replaces every substring of the input that matches the pattern |
| 886 * with the given replacement string. This is a convenience function that |
| 887 * provides a complete find-and-replace-all operation. |
| 888 * |
| 889 * This method scans the input string looking for matches of the pattern. |
| 890 * Input that is not part of any match is copied unchanged to the |
| 891 * destination buffer. Matched regions are replaced in the output |
| 892 * buffer by the replacement string. The replacement string may contain |
| 893 * references to capture groups; these take the form of $1, $2, etc. |
| 894 * |
| 895 * @param regexp The compiled regular expression. |
| 896 * @param replacementText A string containing the replacement text. |
| 897 * @param replacementLength The length of the replacement string, or |
| 898 * -1 if it is NUL terminated. |
| 899 * @param destBuf A (UChar *) buffer that will receive the resu
lt. |
| 900 * @param destCapacity The capacity of the destination buffer. |
| 901 * @param status A reference to a UErrorCode to receive any er
rors. |
| 902 * @return The length of the string resulting from the f
ind |
| 903 * and replace operation. In the event that the |
| 904 * destination capacity is inadequate, the retur
n value |
| 905 * is still the full length of the untruncated s
tring. |
| 906 * @stable ICU 3.0 |
| 907 */ |
| 908 U_STABLE int32_t U_EXPORT2 |
| 909 uregex_replaceAll(URegularExpression *regexp, |
| 910 const UChar *replacementText, |
| 911 int32_t replacementLength, |
| 912 UChar *destBuf, |
| 913 int32_t destCapacity, |
| 914 UErrorCode *status); |
| 915 |
| 916 /** |
| 917 * Replaces every substring of the input that matches the pattern |
| 918 * with the given replacement string. This is a convenience function that |
| 919 * provides a complete find-and-replace-all operation. |
| 920 * |
| 921 * This method scans the input string looking for matches of the pattern. |
| 922 * Input that is not part of any match is copied unchanged to the |
| 923 * destination buffer. Matched regions are replaced in the output |
| 924 * buffer by the replacement string. The replacement string may contain |
| 925 * references to capture groups; these take the form of $1, $2, etc. |
| 926 * |
| 927 * @param regexp The compiled regular expression. |
| 928 * @param replacement A string containing the replacement text. |
| 929 * @param dest A mutable UText that will receive the result. |
| 930 * If NULL, a new UText will be created (which may
not be mutable). |
| 931 * @param status A reference to a UErrorCode to receive any errors
. |
| 932 * @return A UText containing the results of the find and re
place. |
| 933 * If a pre-allocated UText was provided, it will a
lways be used and returned. |
| 934 * |
| 935 * @draft ICU 4.6 |
| 936 */ |
| 937 U_DRAFT UText * U_EXPORT2 |
| 938 uregex_replaceAllUText(URegularExpression *regexp, |
| 939 UText *replacement, |
| 940 UText *dest, |
| 941 UErrorCode *status); |
| 942 |
| 943 /** |
| 944 * Replaces the first substring of the input that matches the pattern |
| 945 * with the given replacement string. This is a convenience function that |
| 946 * provides a complete find-and-replace operation. |
| 947 * |
| 948 * This method scans the input string looking for a match of the pattern. |
| 949 * All input that is not part of the match is copied unchanged to the |
| 950 * destination buffer. The matched region is replaced in the output |
| 951 * buffer by the replacement string. The replacement string may contain |
| 952 * references to capture groups; these take the form of $1, $2, etc. |
| 953 * |
| 954 * @param regexp The compiled regular expression. |
| 955 * @param replacementText A string containing the replacement text. |
| 956 * @param replacementLength The length of the replacement string, or |
| 957 * -1 if it is NUL terminated. |
| 958 * @param destBuf A (UChar *) buffer that will receive the resu
lt. |
| 959 * @param destCapacity The capacity of the destination buffer. |
| 960 * @param status a reference to a UErrorCode to receive any er
rors. |
| 961 * @return The length of the string resulting from the f
ind |
| 962 * and replace operation. In the event that the |
| 963 * destination capacity is inadequate, the retur
n value |
| 964 * is still the full length of the untruncated s
tring. |
| 965 * @stable ICU 3.0 |
| 966 */ |
| 967 U_STABLE int32_t U_EXPORT2 |
| 968 uregex_replaceFirst(URegularExpression *regexp, |
| 969 const UChar *replacementText, |
| 970 int32_t replacementLength, |
| 971 UChar *destBuf, |
| 972 int32_t destCapacity, |
| 973 UErrorCode *status); |
| 974 |
| 975 /** |
| 976 * Replaces the first substring of the input that matches the pattern |
| 977 * with the given replacement string. This is a convenience function that |
| 978 * provides a complete find-and-replace operation. |
| 979 * |
| 980 * This method scans the input string looking for a match of the pattern. |
| 981 * All input that is not part of the match is copied unchanged to the |
| 982 * destination buffer. The matched region is replaced in the output |
| 983 * buffer by the replacement string. The replacement string may contain |
| 984 * references to capture groups; these take the form of $1, $2, etc. |
| 985 * |
| 986 * @param regexp The compiled regular expression. |
| 987 * @param replacement A string containing the replacement text. |
| 988 * @param dest A mutable UText that will receive the result. |
| 989 * If NULL, a new UText will be created (which may
not be mutable). |
| 990 * @param status A reference to a UErrorCode to receive any errors
. |
| 991 * @return A UText containing the results of the find and re
place. |
| 992 * If a pre-allocated UText was provided, it will a
lways be used and returned. |
| 993 * |
| 994 * @draft ICU 4.6 |
| 995 */ |
| 996 U_DRAFT UText * U_EXPORT2 |
| 997 uregex_replaceFirstUText(URegularExpression *regexp, |
| 998 UText *replacement, |
| 999 UText *dest, |
| 1000 UErrorCode *status); |
| 1001 |
| 1002 |
| 1003 /** |
| 1004 * Implements a replace operation intended to be used as part of an |
| 1005 * incremental find-and-replace. |
| 1006 * |
| 1007 * <p>The input string, starting from the end of the previous match and endin
g at |
| 1008 * the start of the current match, is appended to the destination string. Th
en the |
| 1009 * replacement string is appended to the output string, |
| 1010 * including handling any substitutions of captured text.</p> |
| 1011 * |
| 1012 * <p>A note on preflight computation of buffersize and error handling: |
| 1013 * Calls to uregex_appendReplacement() and uregex_appendTail() are |
| 1014 * designed to be chained, one after another, with the destination |
| 1015 * buffer pointer and buffer capacity updated after each in preparation |
| 1016 * for the next. If the destination buffer is exhausted partway through such a |
| 1017 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal |
| 1018 * ICU conventions are for a function to perform no action if it is |
| 1019 * called with an error status, but for this one case, uregex_appendReplacement() |
| 1020 * will operate normally so that buffer size computations will complete |
| 1021 * correctly.</p> |
| 1022 * |
| 1023 * <p>For simple, prepackaged, non-incremental find-and-replace |
| 1024 * operations, see replaceFirst() or replaceAll().</p> |
| 1025 * |
| 1026 * @param regexp The regular expression object. |
| 1027 * @param replacementText The string that will replace the matched portion
of the |
| 1028 * input string as it is copied to the destination buffe
r. |
| 1029 * The replacement text may contain references ($1, for |
| 1030 * example) to capture groups from the match. |
| 1031 * @param replacementLength The length of the replacement text string, |
| 1032 * or -1 if the string is NUL terminated. |
| 1033 * @param destBuf The buffer into which the results of the |
| 1034 * find-and-replace are placed. On return, this pointer |
| 1035 * will be updated to refer to the beginning of the |
| 1036 * unused portion of buffer, leaving it in position for |
| 1037 * a subsequent call to this function. |
| 1038 * @param destCapacity The size of the output buffer. On return, this |
| 1039 * parameter will be updated to reflect the space remain
ing |
| 1040 * unused in the output buffer. |
| 1041 * @param status A reference to a UErrorCode to receive any errors. |
| 1042 * @return The length of the result string. In the event that |
| 1043 * destCapacity is inadequate, the full length of the |
| 1044 * untruncated output string is returned. |
| 1045 * |
| 1046 * @stable ICU 3.0 |
| 1047 * |
| 1048 */ |
| 1049 U_STABLE int32_t U_EXPORT2 |
| 1050 uregex_appendReplacement(URegularExpression *regexp, |
| 1051 const UChar *replacementText, |
| 1052 int32_t replacementLength, |
| 1053 UChar **destBuf, |
| 1054 int32_t *destCapacity, |
| 1055 UErrorCode *status); |
| 1056 |
| 1057 |
| 1058 /** |
| 1059 * Implements a replace operation intended to be used as part of an |
| 1060 * incremental find-and-replace. |
| 1061 * |
| 1062 * <p>The input string, starting from the end of the previous match and endin
g at |
| 1063 * the start of the current match, is appended to the destination string. Th
en the |
| 1064 * replacement string is appended to the output string, |
| 1065 * including handling any substitutions of captured text.</p> |
| 1066 * |
| 1067 * <p>For simple, prepackaged, non-incremental find-and-replace |
| 1068 * operations, see replaceFirst() or replaceAll().</p> |
| 1069 * |
| 1070 * @param regexp The regular expression object. |
| 1071 * @param replacementText The string that will replace the matched portion
of the |
| 1072 * input string as it is copied to the destination buffe
r. |
| 1073 * The replacement text may contain references ($1, for |
| 1074 * example) to capture groups from the match. |
| 1075 * @param dest A mutable UText that will receive the result. Must no
t be NULL. |
| 1076 * @param status A reference to a UErrorCode to receive any errors. |
| 1077 * |
| 1078 * @draft ICU 4.6 |
| 1079 */ |
| 1080 U_DRAFT void U_EXPORT2 |
| 1081 uregex_appendReplacementUText(URegularExpression *regexp, |
| 1082 UText *replacementText, |
| 1083 UText *dest, |
| 1084 UErrorCode *status); |
| 1085 |
| 1086 |
| 1087 /** |
| 1088 * As the final step in a find-and-replace operation, append the remainder |
| 1089 * of the input string, starting at the position following the last match, |
| 1090 * to the destination string. <code>uregex_appendTail()</code> is intended |
| 1091 * to be invoked after one or more invocations of the |
| 1092 * <code>uregex_appendReplacement()</code> function. |
| 1093 * |
| 1094 * @param regexp The regular expression object. This is needed to |
| 1095 * obtain the input string along with the position |
| 1096 * of the last match within it. |
| 1097 * @param destBuf The buffer in which the results of the |
| 1098 * find-and-replace are placed. On return, the pointer |
| 1099 * will be updated to refer to the beginning of the |
| 1100 * unused portion of buffer. |
| 1101 * @param destCapacity The size of the output buffer, On return, this |
| 1102 * value will be updated to reflect the space remaining |
| 1103 * unused in the output buffer. |
| 1104 * @param status A reference to a UErrorCode to receive any errors. |
| 1105 * @return The length of the result string. In the event that |
| 1106 * destCapacity is inadequate, the full length of the |
| 1107 * untruncated output string is returned. |
| 1108 * |
| 1109 * @stable ICU 3.0 |
| 1110 */ |
| 1111 U_STABLE int32_t U_EXPORT2 |
| 1112 uregex_appendTail(URegularExpression *regexp, |
| 1113 UChar **destBuf, |
| 1114 int32_t *destCapacity, |
| 1115 UErrorCode *status); |
| 1116 |
| 1117 |
| 1118 /** |
| 1119 * As the final step in a find-and-replace operation, append the remainder |
| 1120 * of the input string, starting at the position following the last match, |
| 1121 * to the destination string. <code>uregex_appendTailUText()</code> is intended
|
| 1122 * to be invoked after one or more invocations of the |
| 1123 * <code>uregex_appendReplacementUText()</code> function. |
| 1124 * |
| 1125 * @param regexp The regular expression object. This is needed to |
| 1126 * obtain the input string along with the position |
| 1127 * of the last match within it. |
| 1128 * @param dest A mutable UText that will receive the result. Must no
t be NULL. |
| 1129 * @return The destination UText. |
| 1130 * |
| 1131 * @draft ICU 4.6 |
| 1132 */ |
| 1133 U_DRAFT UText * U_EXPORT2 |
| 1134 uregex_appendTailUText(URegularExpression *regexp, |
| 1135 UText *dest, |
| 1136 UErrorCode *status); |
| 1137 |
| 1138 |
| 1139 |
| 1140 /** |
| 1141 * Split a string into fields. Somewhat like split() from Perl. |
| 1142 * The pattern matches identify delimiters that separate the input |
| 1143 * into fields. The input data between the matches becomes the |
| 1144 * fields themselves. |
| 1145 * <p> |
| 1146 * Each of the fields is copied from the input string to the destination |
| 1147 * buffer, and NUL terminated. The position of each field within |
| 1148 * the destination buffer is returned in the destFields array. |
| 1149 * |
| 1150 * Note: another choice for the design of this function would be to not |
| 1151 * copy the resulting fields at all, but to return indexes and |
| 1152 * lengths within the source text. |
| 1153 * Advantages would be |
| 1154 * o Faster. No Copying. |
| 1155 * o Nothing extra needed when field data may contain embedded NU
L chars. |
| 1156 * o Less memory needed if working on large data. |
| 1157 * Disadvantages |
| 1158 * o Less consistent with C++ split, which copies into an |
| 1159 * array of UnicodeStrings. |
| 1160 * o No NUL termination, extracted fields would be less convenien
t |
| 1161 * to use in most cases. |
| 1162 * o Possible problems in the future, when support for Unicode Normalization |
| 1163 * could cause the fields to not correspond exactly to |
| 1164 * a range of the source text. |
| 1165 * |
| 1166 * @param regexp The compiled regular expression. |
| 1167 * @param destBuf A (UChar *) buffer to receive the fields that |
| 1168 * are extracted from the input string. These |
| 1169 * field pointers will refer to positions within the |
| 1170 * destination buffer supplied by the caller. Any |
| 1171 * extra positions within the destFields array will be |
| 1172 * set to NULL. |
| 1173 * @param destCapacity The capacity of the destBuf. |
| 1174 * @param requiredCapacity The actual capacity required of the destBuf. |
| 1175 * If destCapacity is too small, requiredCapacity will
return |
| 1176 * the total capacity required to hold all of the outp
ut, and |
| 1177 * a U_BUFFER_OVERFLOW_ERROR will be returned. |
| 1178 * @param destFields An array to be filled with the position of each |
| 1179 * of the extracted fields within destBuf. |
| 1180 * @param destFieldsCapacity The number of elements in the destFields ar
ray. |
| 1181 * If the number of fields found is less than destFieldsCapacit
y, |
| 1182 * the extra destFields elements are set to zero. |
| 1183 * If destFieldsCapacity is too small, the trailing part of the |
| 1184 * input, including any field delimiters, is treated as if it |
| 1185 * were the last field - it is copied to the destBuf, and |
| 1186 * its position in the destBuf is stored in the last element |
| 1187 * of destFields. This behavior mimics that of Perl. It is no
t |
| 1188 * an error condition, and no error status is returned when all
destField |
| 1189 * positions are used. |
| 1190 * @param status A reference to a UErrorCode to receive any errors. |
| 1191 * @return The number of fields into which the input string was split. |
| 1192 * @stable ICU 3.0 |
| 1193 */ |
| 1194 U_STABLE int32_t U_EXPORT2 |
| 1195 uregex_split( URegularExpression *regexp, |
| 1196 UChar *destBuf, |
| 1197 int32_t destCapacity, |
| 1198 int32_t *requiredCapacity, |
| 1199 UChar *destFields[], |
| 1200 int32_t destFieldsCapacity, |
| 1201 UErrorCode *status); |
| 1202 |
| 1203 |
| 1204 /** |
| 1205 * Split a string into fields. Somewhat like split() from Perl. |
| 1206 * The pattern matches identify delimiters that separate the input |
| 1207 * into fields. The input data between the matches becomes the |
| 1208 * fields themselves. |
| 1209 * <p> |
| 1210 * The behavior of this function is not very closely aligned with uregex_split
(); |
| 1211 * instead, it is based on (and implemented directly on top of) the C++ split
method. |
| 1212 * |
| 1213 * @param regexp The compiled regular expression. |
| 1214 * @param destFields An array of mutable UText structs to receive the resul
ts of the split. |
| 1215 * If a field is NULL, a new UText is allocated to contain the
results for |
| 1216 * that field. This new UText is not guaranteed to be mutable. |
| 1217 * @param destFieldsCapacity The number of elements in the destination array. |
| 1218 * If the number of fields found is less than destFieldsCapacity, the |
| 1219 * extra strings in the destination array are not altered. |
| 1220 * If the number of destination strings is less than the number |
| 1221 * of fields, the trailing part of the input string, including
any |
| 1222 * field delimiters, is placed in the last destination string. |
| 1223 * This behavior mimics that of Perl. It is not an error cond
ition, and no |
| 1224 * error status is returned when all destField positions are us
ed. |
| 1225 * @param status A reference to a UErrorCode to receive any errors. |
| 1226 * @return The number of fields into which the input string was split. |
| 1227 * |
| 1228 * @draft ICU 4.6 |
| 1229 */ |
| 1230 U_DRAFT int32_t U_EXPORT2 |
| 1231 uregex_splitUText(URegularExpression *regexp, |
| 1232 UText *destFields[], |
| 1233 int32_t destFieldsCapacity, |
| 1234 UErrorCode *status); |
| 1235 |
| 1236 |
| 1237 |
| 1238 |
| 1239 /** |
| 1240 * Set a processing time limit for match operations with this URegularExpression
. |
| 1241 * |
| 1242 * Some patterns, when matching certain strings, can run in exponential time. |
| 1243 * For practical purposes, the match operation may appear to be in an |
| 1244 * infinite loop. |
| 1245 * When a limit is set a match operation will fail with an error if the |
| 1246 * limit is exceeded. |
| 1247 * <p> |
| 1248 * The units of the limit are steps of the match engine. |
| 1249 * Correspondence with actual processor time will depend on the speed |
| 1250 * of the processor and the details of the specific pattern, but will |
| 1251 * typically be on the order of milliseconds. |
| 1252 * <p> |
| 1253 * By default, the matching time is not limited. |
| 1254 * <p> |
| 1255 * |
| 1256 * @param regexp The compiled regular expression. |
| 1257 * @param limit The limit value, or 0 for no limit. |
| 1258 * @param status A reference to a UErrorCode to receive any errors. |
| 1259 * @stable ICU 4.0 |
| 1260 */ |
| 1261 U_STABLE void U_EXPORT2 |
| 1262 uregex_setTimeLimit(URegularExpression *regexp, |
| 1263 int32_t limit, |
| 1264 UErrorCode *status); |
| 1265 |
| 1266 /** |
| 1267 * Get the time limit for matches with this URegularExpression. |
| 1268 * A return value of zero indicates that there is no limit. |
| 1269 * |
| 1270 * @param regexp The compiled regular expression. |
| 1271 * @param status A reference to a UErrorCode to receive any errors. |
| 1272 * @return the maximum allowed time for a match, in units of processing steps. |
| 1273 * @stable ICU 4.0 |
| 1274 */ |
| 1275 U_STABLE int32_t U_EXPORT2 |
| 1276 uregex_getTimeLimit(const URegularExpression *regexp, |
| 1277 UErrorCode *status); |
| 1278 |
| 1279 /** |
| 1280 * Set the amount of heap storage available for use by the match backtracking stack. |
| 1281 * <p> |
| 1282 * ICU uses a backtracking regular expression engine, with the backtrack stack |
| 1283 * maintained on the heap. This function sets the limit to the amount of memory |
| 1284 * that can be used for this purpose. A backtracking stack overflow will |
| 1285 * result in an error from the match operation that caused it. |
| 1286 * <p> |
| 1287 * A limit is desirable because a malicious or poorly designed pattern can use |
| 1288 * excessive memory, potentially crashing the process. A limit is enabled |
| 1289 * by default. |
| 1290 * <p> |
| 1291 * @param regexp The compiled regular expression. |
| 1292 * @param limit The maximum size, in bytes, of the matching backtrack stack. |
| 1293 * A value of zero means no limit. |
| 1294 * The limit must be greater than or equal to zero. |
| 1295 * @param status A reference to a UErrorCode to receive any errors. |
| 1296 * |
| 1297 * @stable ICU 4.0 |
| 1298 */ |
| 1299 U_STABLE void U_EXPORT2 |
| 1300 uregex_setStackLimit(URegularExpression *regexp, |
| 1301 int32_t limit, |
| 1302 UErrorCode *status); |
| 1303 |
| 1304 /** |
| 1305 * Get the size of the heap storage available for use by the back tracking stack
. |
| 1306 * |
| 1307 * @return the maximum backtracking stack size, in bytes, or zero if the |
| 1308 * stack size is unlimited. |
| 1309 * @stable ICU 4.0 |
| 1310 */ |
| 1311 U_STABLE int32_t U_EXPORT2 |
| 1312 uregex_getStackLimit(const URegularExpression *regexp, |
| 1313 UErrorCode *status); |
| 1314 |
| 1315 |
| 1316 /** |
| 1317 * Function pointer for a regular expression matching callback function. |
| 1318 * When set, a callback function will be called periodically during matching |
| 1319 * operations. If the call back function returns FALSE, the matching |
| 1320 * operation will be terminated early. |
| 1321 * |
| 1322 * Note: the callback function must not call other functions on this |
| 1323 * URegularExpression. |
| 1324 * |
| 1325 * @param context context pointer. The callback function will be invoked |
| 1326 * with the context specified at the time that |
| 1327 * uregex_setMatchCallback() is called. |
| 1328 * @param steps the accumulated processing time, in match steps, |
| 1329 * for this matching operation. |
| 1330 * @return TRUE to continue the matching operation. |
| 1331 * FALSE to terminate the matching operation. |
| 1332 * @stable ICU 4.0 |
| 1333 */ |
| 1334 U_CDECL_BEGIN |
| 1335 typedef UBool U_CALLCONV URegexMatchCallback ( |
| 1336 const void *context, |
| 1337 int32_t steps); |
| 1338 U_CDECL_END |
| 1339 |
| 1340 /** |
| 1341 * Set a callback function for this URegularExpression. |
| 1342 * During matching operations the function will be called periodically, |
| 1343 * giving the application the opportunity to terminate a long-running |
| 1344 * match. |
| 1345 * |
| 1346 * @param regexp The compiled regular expression. |
| 1347 * @param callback A pointer to the user-supplied callback function. |
| 1348 * @param context User context pointer. The value supplied at the |
| 1349 * time the callback function is set will be saved |
| 1350 * and passed to the callback each time that it is called. |
| 1351 * @param status A reference to a UErrorCode to receive any errors. |
| 1352 * @stable ICU 4.0 |
| 1353 */ |
| 1354 U_STABLE void U_EXPORT2 |
| 1355 uregex_setMatchCallback(URegularExpression *regexp, |
| 1356 URegexMatchCallback *callback, |
| 1357 const void *context, |
| 1358 UErrorCode *status); |
| 1359 |
| 1360 |
| 1361 /** |
| 1362 * Get the callback function for this URegularExpression. |
| 1363 * |
| 1364 * @param regexp The compiled regular expression. |
| 1365 * @param callback Out parameter, receives a pointer to the user-supplied |
| 1366 * callback function. |
| 1367 * @param context Out parameter, receives the user context pointer that |
| 1368 * was set when uregex_setMatchCallback() was called. |
| 1369 * @param status A reference to a UErrorCode to receive any errors. |
| 1370 * @stable ICU 4.0 |
| 1371 */ |
| 1372 U_STABLE void U_EXPORT2 |
| 1373 uregex_getMatchCallback(const URegularExpression *regexp, |
| 1374 URegexMatchCallback **callback, |
| 1375 const void **context, |
| 1376 UErrorCode *status); |
| 1377 |
| 1378 |
| 1379 /** |
| 1380 * Function pointer for a regular expression find callback function. |
| 1381 * |
| 1382 * When set, a callback function will be called during a find operation |
| 1383 * and for operations that depend on find, such as findNext, split and some repl
ace |
| 1384 * operations like replaceFirst. |
| 1385 * The callback will usually be called after each attempt at a match, but this i
s not a |
| 1386 * guarantee that the callback will be invoked at each character. For finds whe
re the |
| 1387 * match engine is invoked at each character, this may be close to true, but les
s likely |
| 1388 * for more optimized loops where the pattern is known to only start, and the ma
tch |
| 1389 * engine invoked, at certain characters. |
| 1390 * When invoked, this callback will specify the index at which a match operation
is about |
| 1391 * to be attempted, giving the application the opportunity to terminate a long-r
unning |
| 1392 * find operation. |
| 1393 * |
| 1394 * If the call back function returns FALSE, the find operation will be terminate
d early. |
| 1395 * |
| 1396 * Note: the callback function must not call other functions on this |
| 1397 * URegularExpression |
| 1398 * |
| 1399 * @param context context pointer. The callback function will be invoked |
| 1400 * with the context specified at the time that |
| 1401 * uregex_setFindProgressCallback() is called. |
| 1402 * @param matchIndex the next index at which a match will be attempted for this |
| 1403 * find operation. If this callback interrupts the search, this
is the |
| 1404 * index at which a find/findNext operation may be re-initiated. |
| 1405 * @return TRUE to continue the matching operation. |
| 1406 * FALSE to terminate the matching operation. |
| 1407 * @draft ICU 4.6 |
| 1408 */ |
| 1409 U_CDECL_BEGIN |
| 1410 typedef UBool U_CALLCONV URegexFindProgressCallback ( |
| 1411 const void *context, |
| 1412 int64_t matchIndex); |
| 1413 U_CDECL_END |
| 1414 |
| 1415 /** |
| 1416 * Set the find progress callback function for this URegularExpression. |
| 1417 * |
| 1418 * @param regexp The compiled regular expression. |
| 1419 * @param callback A pointer to the user-supplied callback function. |
| 1420 * @param context User context pointer. The value supplied at the |
| 1421 * time the callback function is set will be saved |
| 1422 * and passed to the callback each time that it is called. |
| 1423 * @param status A reference to a UErrorCode to receive any errors. |
| 1424 * @draft ICU 4.6 |
| 1425 */ |
| 1426 U_DRAFT void U_EXPORT2 |
| 1427 uregex_setFindProgressCallback(URegularExpression *regexp, |
| 1428 URegexFindProgressCallback *callback, |
| 1429 const void *context, |
| 1430 UErrorCode *status); |
| 1431 |
| 1432 |
| 1433 /** |
| 1434 * Get the find progress callback function for this URegularExpression. |
| 1435 * |
| 1436 * @param regexp The compiled regular expression. |
| 1437 * @param callback Out parameter, receives a pointer to the user-supplied |
| 1438 * callback function. |
| 1439 * @param context Out parameter, receives the user context pointer that |
| 1440 * was set when uregex_setFindProgressCallback() was called
. |
| 1441 * @param status A reference to a UErrorCode to receive any errors. |
| 1442 * @draft ICU 4.6 |
| 1443 */ |
| 1444 U_DRAFT void U_EXPORT2 |
| 1445 uregex_getFindProgressCallback(const URegularExpression *regexp, |
| 1446 URegexFindProgressCallback **callback, |
| 1447 const void **context, |
| 1448 UErrorCode *status); |
| 1449 |
| 1450 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ |
| 1451 #endif /* UREGEX_H */ |
OLD | NEW |