| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 ********************************************************************** | |
| 3 * Copyright (C) 2004-2010, International Business Machines | |
| 4 * Corporation and others. All Rights Reserved. | |
| 5 ********************************************************************** | |
| 6 * file name: uregex.h | |
| 7 * encoding: US-ASCII | |
| 8 * indentation:4 | |
| 9 * | |
| 10 * created on: 2004mar09 | |
| 11 * created by: Andy Heninger | |
| 12 * | |
| 13 * ICU Regular Expressions, API for C | |
| 14 */ | |
| 15 | |
| 16 /** | |
| 17 * \file | |
| 18 * \brief C API: Regular Expressions | |
| 19 * | |
| 20 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> | |
| 21 */ | |
| 22 | |
| 23 #ifndef UREGEX_H | |
| 24 #define UREGEX_H | |
| 25 | |
| 26 #include "unicode/utext.h" | |
| 27 #include "unicode/utypes.h" | |
| 28 | |
| 29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
| 30 | |
| 31 #include "unicode/localpointer.h" | |
| 32 #include "unicode/parseerr.h" | |
| 33 | |
| 34 struct URegularExpression; | |
| 35 /** | |
| 36 * Structure representing a compiled regular expression, plus the results | |
| 37 * of a match operation. | |
| 38 * @stable ICU 3.0 | |
| 39 */ | |
| 40 typedef struct URegularExpression URegularExpression; | |
| 41 | |
| 42 | |
| 43 /** | |
| 44 * Constants for Regular Expression Match Modes. Each value is a distinct bit, so flags can be bitwise-ORed together. | |
| 45 * @stable ICU 2.4 | |
| 46 */ | |
| 47 typedef enum URegexpFlag{ | |
| 48 | |
| 49 #ifndef U_HIDE_DRAFT_API | |
| 50 /** Forces normalization of pattern and strings. | |
| 51 Not implemented yet, just a placeholder, hence draft. | |
| 52 @draft ICU 2.4 */ | |
| 53 UREGEX_CANON_EQ = 128, | |
| 54 #endif | |
| 55 /** Enable case insensitive matching. @stable ICU 2.4 */ | |
| 56 UREGEX_CASE_INSENSITIVE = 2, | |
| 57 | |
| 58 /** Allow white space and comments within patterns @stable ICU 2.4 */ | |
| 59 UREGEX_COMMENTS = 4, | |
| 60 | |
| 61 /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. | |
| 62 * @stable ICU 2.4 */ | |
| 63 UREGEX_DOTALL = 32, | |
| 64 | |
| 65 /** If set, treat the entire pattern as a literal string. | |
| 66 * Metacharacters or escape sequences in the input sequence will be given | |
| 67 * no special meaning. Not implemented yet as of ICU 4.4. | |
| 68 * | |
| 69 * The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact | |
| 70 * on matching when used in conjunction with this flag. | |
| 71 * The other flags become superfluous. | |
| 72 * TODO: say which escapes are still handled; anything Java does | |
| 73 * early (\\u) we should still do. | |
| 74 * @stable ICU 4.0 | |
| 75 */ | |
| 76 UREGEX_LITERAL = 16, | |
| 77 | |
| 78 /** Control behavior of "$" and "^" | |
| 79 * If set, recognize line terminators within string, | |
| 80 * otherwise, match only at start and end of input string. | |
| 81 * @stable ICU 2.4 */ | |
| 82 UREGEX_MULTILINE = 8, | |
| 83 | |
| 84 /** Unix-only line endings. | |
| 85 * When this mode is enabled, only \\u000a is recognized as a line ending | |
| 86 * in the behavior of ., ^, and $. | |
| 87 * @stable ICU 4.0 | |
| 88 */ | |
| 89 UREGEX_UNIX_LINES = 1, | |
| 90 | |
| 91 /** Unicode word boundaries. | |
| 92 * If set, \b uses the Unicode TR 29 definition of word boundaries. | |
| 93 * Warning: Unicode word boundaries are quite different from | |
| 94 * traditional regular expression word boundaries. See | |
| 95 * http://unicode.org/reports/tr29/#Word_Boundaries | |
| 96 * @stable ICU 2.8 | |
| 97 */ | |
| 98 UREGEX_UWORD = 256, | |
| 99 | |
| 100 /** Error on Unrecognized backslash escapes. | |
| 101 * If set, fail with an error on patterns that contain | |
| 102 * backslash-escaped ASCII letters without a known special | |
| 103 * meaning. If this flag is not set, these | |
| 104 * escaped letters represent themselves. | |
| 105 * @stable ICU 4.0 | |
| 106 */ | |
| 107 UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 | |
| 108 | |
| 109 } URegexpFlag; | |
| 110 | |
| 111 /** | |
| 112 * Open (compile) an ICU regular expression. Compiles the regular expression
in | |
| 113 * string form into an internal representation using the specified match mode
flags. | |
| 114 * The resulting regular expression handle can then be used to perform various | |
| 115 * matching operations. | |
| 116 * | |
| 117 * | |
| 118 * @param pattern The Regular Expression pattern to be compiled. | |
| 119 * @param patternLength The length of the pattern, or -1 if the pattern is | |
| 120 * NUL terminated. | |
| 121 * @param flags Flags that alter the default matching behavior for | |
| 122 * the regular expression, UREGEX_CASE_INSENSITIVE, for | |
| 123 * example. For default behavior, set this parameter to
zero. | |
| 124 * See <code>enum URegexpFlag</code>. All desired flags | |
| 125 * are bitwise-ORed together. | |
| 126 * @param pe Receives the position (line and column numbers) of any syntax | |
| 127 * error within the source regular expression string. If this | |
| 128 * information is not wanted, pass NULL for this parameter. | |
| 129 * @param status Receives errors detected by this function. | |
| 130 * @stable ICU 3.0 | |
| 131 * | |
| 132 */ | |
| 133 U_STABLE URegularExpression * U_EXPORT2 | |
| 134 uregex_open( const UChar *pattern, | |
| 135 int32_t patternLength, | |
| 136 uint32_t flags, | |
| 137 UParseError *pe, | |
| 138 UErrorCode *status); | |
| 139 | |
| 140 /** | |
| 141 * Open (compile) an ICU regular expression. Compiles the regular expression
in | |
| 142 * string form into an internal representation using the specified match mode
flags. | |
| 143 * The resulting regular expression handle can then be used to perform various | |
| 144 * matching operations. | |
| 145 * <p> | |
| 146 * The contents of the pattern UText will be extracted and saved. Ownership of
the | |
| 147 * UText struct itself remains with the caller. This is to match the behavior
of | |
| 148 * uregex_open(). | |
| 149 * | |
| 150 * @param pattern The Regular Expression pattern to be compiled. | |
| 151 * @param flags Flags that alter the default matching behavior for | |
| 152 * the regular expression, UREGEX_CASE_INSENSITIVE, for | |
| 153 * example. For default behavior, set this parameter to
zero. | |
| 154 * See <code>enum URegexpFlag</code>. All desired flags | |
| 155 * are bitwise-ORed together. | |
| 156 * @param pe Receives the position (line and column numbers) of any syntax | |
| 157 * error within the source regular expression string. If this | |
| 158 * information is not wanted, pass NULL for this parameter. | |
| 159 * @param status Receives errors detected by this function. | |
| 160 * | |
| 161 * @draft ICU 4.6 | |
| 162 */ | |
| 163 U_DRAFT URegularExpression * U_EXPORT2 | |
| 164 uregex_openUText(UText *pattern, | |
| 165 uint32_t flags, | |
| 166 UParseError *pe, | |
| 167 UErrorCode *status); | |
| 168 | |
| 169 /** | |
| 170 * Open (compile) an ICU regular expression. The resulting regular expression | |
| 171 * handle can then be used to perform various matching operations. | |
| 172 * <p> | |
| 173 * This function is the same as uregex_open, except that the pattern | |
| 174 * is supplied as an 8 bit char * string in the default code page. | |
| 175 * | |
| 176 * @param pattern The Regular Expression pattern to be compiled, | |
| 177 * NUL terminated. | |
| 178 * @param flags Flags that alter the default matching behavior for | |
| 179 * the regular expression, UREGEX_CASE_INSENSITIVE, for | |
| 180 * example. For default behavior, set this parameter to
zero. | |
| 181 * See <code>enum URegexpFlag</code>. All desired flags | |
| 182 * are bitwise-ORed together. | |
| 183 * @param pe Receives the position (line and column numbers) of any syntax | |
| 184 * error within the source regular expression string. If this | |
| 185 * information is not wanted, pass NULL for this parameter. | |
| 186 * @param status Receives errors detected by this function. | |
| 187 * @return The URegularExpression object representing the compiled | |
| 188 * pattern. | |
| 189 * | |
| 190 * @stable ICU 3.0 | |
| 191 */ | |
| 192 #if !UCONFIG_NO_CONVERSION | |
| 193 U_STABLE URegularExpression * U_EXPORT2 | |
| 194 uregex_openC( const char *pattern, | |
| 195 uint32_t flags, | |
| 196 UParseError *pe, | |
| 197 UErrorCode *status); | |
| 198 #endif | |
| 199 | |
| 200 | |
| 201 | |
| 202 /** | |
| 203 * Close the regular expression, recovering all resources (memory) it | |
| 204 * was holding. | |
| 205 * | |
| 206 * @param regexp The regular expression to be closed. | |
| 207 * @stable ICU 3.0 | |
| 208 */ | |
| 209 U_STABLE void U_EXPORT2 | |
| 210 uregex_close(URegularExpression *regexp); | |
| 211 | |
| 212 #if U_SHOW_CPLUSPLUS_API | |
| 213 | |
| 214 U_NAMESPACE_BEGIN | |
| 215 | |
| 216 /** | |
| 217 * \class LocalURegularExpressionPointer | |
| 218 * "Smart pointer" class, closes a URegularExpression via uregex_close(). | |
| 219 * For most methods see the LocalPointerBase base class. | |
| 220 * | |
| 221 * @see LocalPointerBase | |
| 222 * @see LocalPointer | |
| 223 * @stable ICU 4.4 | |
| 224 */ | |
| 225 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression,
uregex_close); | |
| 226 | |
| 227 U_NAMESPACE_END | |
| 228 | |
| 229 #endif | |
| 230 | |
| 231 /** | |
| 232 * Make a copy of a compiled regular expression. Cloning a regular | |
| 233 * expression is faster than opening a second instance from the source | |
| 234 * form of the expression, and requires less memory. | |
| 235 * <p> | |
| 236 * Note that the current input string and the position of any matched text | |
| 237 * within it are not cloned; only the pattern itself and the | |
| 238 * match mode flags are copied. | |
| 239 * <p> | |
| 240 * Cloning can be particularly useful to threaded applications that perform | |
| 241 * multiple match operations in parallel. Each concurrent RE | |
| 242 * operation requires its own instance of a URegularExpression. | |
| 243 * | |
| 244 * @param regexp The compiled regular expression to be cloned. | |
| 245 * @param status Receives indication of any errors encountered | |
| 246 * @return the cloned copy of the compiled regular expression. | |
| 247 * @stable ICU 3.0 | |
| 248 */ | |
| 249 U_STABLE URegularExpression * U_EXPORT2 | |
| 250 uregex_clone(const URegularExpression *regexp, UErrorCode *status); | |
| 251 | |
| 252 /** | |
| 253 * Returns a pointer to the source form of the pattern for this regular express
ion. | |
| 254 * This function will work even if the pattern was originally specified as a UT
ext. | |
| 255 * | |
| 256 * @param regexp The compiled regular expression. | |
| 257 * @param patLength This output parameter will be set to the length of the | |
| 258 * pattern string. A NULL pointer may be used here if the | |
| 259 * pattern length is not needed, as would be the case if | |
| 260 * the pattern is known in advance to be a NUL terminated | |
| 261 * string. | |
| 262 * @param status Receives errors detected by this function. | |
| 263 * @return a pointer to the pattern string. The storage for the string is | |
| 264 * owned by the regular expression object, and must not be | |
| 265 * altered or deleted by the application. The returned string | |
| 266 * will remain valid until the regular expression is closed. | |
| 267 * @stable ICU 3.0 | |
| 268 */ | |
| 269 U_STABLE const UChar * U_EXPORT2 | |
| 270 uregex_pattern(const URegularExpression *regexp, | |
| 271 int32_t *patLength, | |
| 272 UErrorCode *status); | |
| 273 | |
| 274 /** | |
| 275 * Returns the source text of the pattern for this regular expression. | |
| 276 * This function will work even if the pattern was originally specified as a UC
har string. | |
| 277 * | |
| 278 * @param regexp The compiled regular expression. | |
| 279 * @param status Receives errors detected by this function. | |
| 280 * @return the pattern text. The storage for the text is owned by the regular e
xpression | |
| 281 * object, and must not be altered or deleted. | |
| 282 * | |
| 283 * @draft ICU 4.6 | |
| 284 */ | |
| 285 U_DRAFT UText * U_EXPORT2 | |
| 286 uregex_patternUText(const URegularExpression *regexp, | |
| 287 UErrorCode *status); | |
| 288 | |
| 289 | |
| 290 /** | |
| 291 * Get the match mode flags that were specified when compiling this regular exp
ression. | |
| 292 * @param status Receives errors detected by this function. | |
| 293 * @param regexp The compiled regular expression. | |
| 294 * @return The match mode flags | |
| 295 * @see URegexpFlag | |
| 296 * @stable ICU 3.0 | |
| 297 */ | |
| 298 U_STABLE int32_t U_EXPORT2 | |
| 299 uregex_flags(const URegularExpression *regexp, | |
| 300 UErrorCode *status); | |
| 301 | |
| 302 | |
| 303 /** | |
| 304 * Set the subject text string upon which the regular expression will look for
matches. | |
| 305 * This function may be called any number of times, allowing the regular | |
| 306 * expression pattern to be applied to different strings. | |
| 307 * <p> | |
| 308 * Regular expression matching operations work directly on the application's | |
| 309 * string data. No copy is made. The subject string data must not be | |
| 310 * altered after calling this function until after all regular expression | |
| 311 * operations involving this string data are completed. | |
| 312 * <p> | |
| 313 * Zero length strings are permitted. In this case, no subsequent match | |
| 314 * operation will dereference the text string pointer. | |
| 315 * | |
| 316 * @param regexp The compiled regular expression. | |
| 317 * @param text The subject text string. | |
| 318 * @param textLength The length of the subject text, or -1 if the string | |
| 319 * is NUL terminated. | |
| 320 * @param status Receives errors detected by this function. | |
| 321 * @stable ICU 3.0 | |
| 322 */ | |
| 323 U_STABLE void U_EXPORT2 | |
| 324 uregex_setText(URegularExpression *regexp, | |
| 325 const UChar *text, | |
| 326 int32_t textLength, | |
| 327 UErrorCode *status); | |
| 328 | |
| 329 | |
| 330 /** | |
| 331 * Set the subject text string upon which the regular expression will look for
matches. | |
| 332 * This function may be called any number of times, allowing the regular | |
| 333 * expression pattern to be applied to different strings. | |
| 334 * <p> | |
| 335 * Regular expression matching operations work directly on the application's | |
| 336 * string data; only a shallow clone is made. The subject string data must no
t be | |
| 337 * altered after calling this function until after all regular expression | |
| 338 * operations involving this string data are completed. | |
| 339 * | |
| 340 * @param regexp The compiled regular expression. | |
| 341 * @param text The subject text string. | |
| 342 * @param status Receives errors detected by this function. | |
| 343 * | |
| 344 * @draft ICU 4.6 | |
| 345 */ | |
| 346 U_DRAFT void U_EXPORT2 | |
| 347 uregex_setUText(URegularExpression *regexp, | |
| 348 UText *text, | |
| 349 UErrorCode *status); | |
| 350 | |
| 351 /** | |
| 352 * Get the subject text that is currently associated with this | |
| 353 * regular expression object. If the input was supplied using uregex_setText
(), | |
| 354 * that pointer will be returned. Otherwise, the characters in the input wil
l | |
| 355 * be extracted to a buffer and returned. In either case, ownership remains | |
| 356 * with the regular expression object. | |
| 357 * | |
| 358 * This function will work even if the input was originally specified as a UTe
xt. | |
| 359 * | |
| 360 * @param regexp The compiled regular expression. | |
| 361 * @param textLength The length of the string is returned in this output param
eter. | |
| 362 * A NULL pointer may be used here if the | |
| 363 * text length is not needed, as would be the case if | |
| 364 * the text is known in advance to be a NUL terminated | |
| 365 * string. | |
| 366 * @param status Receives errors detected by this function. | |
| 367 * @return Pointer to the subject text string currently associated w
ith | |
| 368 * this regular expression. | |
| 369 * @stable ICU 3.0 | |
| 370 */ | |
| 371 U_STABLE const UChar * U_EXPORT2 | |
| 372 uregex_getText(URegularExpression *regexp, | |
| 373 int32_t *textLength, | |
| 374 UErrorCode *status); | |
| 375 | |
| 376 | |
| 377 /** | |
| 378 * Get the subject text that is currently associated with this | |
| 379 * regular expression object. | |
| 380 * | |
| 381 * This function will work even if the input was originally specified as a UCh
ar string. | |
| 382 * | |
| 383 * @param regexp The compiled regular expression. | |
| 384 * @param dest A mutable UText in which to store the current input. | |
| 385 * If NULL, a new UText will be created as an immutable shal
low clone | |
| 386 * of the actual input string. | |
| 387 * @param status Receives errors detected by this function. | |
| 388 * @return The subject text currently associated with this regular e
xpression. | |
| 389 * If a pre-allocated UText was provided, it will always be
used and returned. | |
| 390 * | |
| 391 * @draft ICU 4.6 | |
| 392 */ | |
| 393 U_DRAFT UText * U_EXPORT2 | |
| 394 uregex_getUText(URegularExpression *regexp, | |
| 395 UText *dest, | |
| 396 UErrorCode *status); | |
| 397 | |
| 398 /** | |
| 399 * Attempts to match the input string against the pattern. | |
| 400 * To succeed, the match must extend to the end of the string, | |
| 401 * or cover the complete match region. | |
| 402 * | |
| 403 * If startIndex >= zero the match operation starts at the specified | |
| 404 * index and must extend to the end of the input string. Any region | |
| 405 * that has been specified is reset. | |
| 406 * | |
| 407 * If startIndex == -1 the match must cover the input region, or the entire | |
| 408 * input string if no region has been set. This directly corresponds to | |
| 409 * Matcher.matches() in Java. | |
| 410 * | |
| 411 * @param regexp The compiled regular expression. | |
| 412 * @param startIndex The input string (native) index at which to begin mat
ching, or -1 | |
| 413 * to match the input Region. | |
| 414 * @param status Receives errors detected by this function. | |
| 415 * @return TRUE if there is a match | |
| 416 * @stable ICU 3.0 | |
| 417 */ | |
| 418 U_STABLE UBool U_EXPORT2 | |
| 419 uregex_matches(URegularExpression *regexp, | |
| 420 int32_t startIndex, | |
| 421 UErrorCode *status); | |
| 422 | |
| 423 /** | |
| 424 * 64bit version of uregex_matches. | |
| 425 * @draft ICU 4.6 | |
| 426 */ | |
| 427 U_DRAFT UBool U_EXPORT2 | |
| 428 uregex_matches64(URegularExpression *regexp, | |
| 429 int64_t startIndex, | |
| 430 UErrorCode *status); | |
| 431 | |
| 432 /** | |
| 433 * Attempts to match the input string, starting from the specified index, aga
inst the pattern. | |
| 434 * The match may be of any length, and is not required to extend to the end | |
| 435 * of the input string. Contrast with uregex_matches(). | |
| 436 * | |
| 437 * <p>If startIndex is >= 0 any input region that was set for this | |
| 438 * URegularExpression is reset before the operation begins. | |
| 439 * | |
| 440 * <p>If the specified starting index == -1 the match begins at the start of
the input | |
| 441 * region, or at the start of the full string if no region has been specified
. | |
| 442 * This corresponds directly with Matcher.lookingAt() in Java. | |
| 443 * | |
| 444 * <p>If the match succeeds then more information can be obtained via the | |
| 445 * <code>uregex_start()</code>, <code>uregex_end()</code>, | |
| 446 * and <code>uregex_group()</code> functions.</p> | |
| 447 * | |
| 448 * @param regexp The compiled regular expression. | |
| 449 * @param startIndex The input string (native) index at which to begin ma
tching, or | |
| 450 * -1 to match the Input Region | |
| 451 * @param status A reference to a UErrorCode to receive any errors. | |
| 452 * @return TRUE if there is a match. | |
| 453 * @stable ICU 3.0 | |
| 454 */ | |
| 455 U_STABLE UBool U_EXPORT2 | |
| 456 uregex_lookingAt(URegularExpression *regexp, | |
| 457 int32_t startIndex, | |
| 458 UErrorCode *status); | |
| 459 | |
| 460 /** | |
| 461 * 64bit version of uregex_lookingAt. | |
| 462 * @draft ICU 4.6 | |
| 463 */ | |
| 464 U_DRAFT UBool U_EXPORT2 | |
| 465 uregex_lookingAt64(URegularExpression *regexp, | |
| 466 int64_t startIndex, | |
| 467 UErrorCode *status); | |
| 468 | |
| 469 /** | |
| 470 * Find the first matching substring of the input string that matches the pat
tern. | |
| 471 * If startIndex is >= zero the search for a match begins at the specified in
dex, | |
| 472 * and any match region is reset. This corresponds directly with | |
| 473 * Matcher.find(startIndex) in Java. | |
| 474 * | |
| 475 * If startIndex == -1 the search begins at the start of the input region, | |
| 476 * or at the start of the full string if no region has been specified
. | |
| 477 * | |
| 478 * If a match is found, <code>uregex_start(), uregex_end()</code>, and | |
| 479 * <code>uregex_group()</code> will provide more information regarding the ma
tch. | |
| 480 * | |
| 481 * @param regexp The compiled regular expression. | |
| 482 * @param startIndex The position (native) in the input string to begin th
e search, or | |
| 483 * -1 to search within the Input Region. | |
| 484 * @param status A reference to a UErrorCode to receive any errors. | |
| 485 * @return TRUE if a match is found. | |
| 486 * @stable ICU 3.0 | |
| 487 */ | |
| 488 U_STABLE UBool U_EXPORT2 | |
| 489 uregex_find(URegularExpression *regexp, | |
| 490 int32_t startIndex, | |
| 491 UErrorCode *status); | |
| 492 | |
| 493 /** | |
| 494 * 64bit version of uregex_find. | |
| 495 * @draft ICU 4.6 | |
| 496 */ | |
| 497 U_DRAFT UBool U_EXPORT2 | |
| 498 uregex_find64(URegularExpression *regexp, | |
| 499 int64_t startIndex, | |
| 500 UErrorCode *status); | |
| 501 | |
| 502 /** | |
| 503 * Find the next pattern match in the input string. Begin searching | |
| 504 * the input at the location following the end of the previous match, | |
| 505 * or at the start of the string (or region) if there is no | |
| 506 * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and | |
| 507 * <code>uregex_group()</code> will provide more information regarding the match. | |
| 508 * | |
| 509 * @param regexp The compiled regular expression. | |
| 510 * @param status A reference to a UErrorCode to receive any errors. | |
| 511 * @return TRUE if a match is found. | |
| 512 * @see uregex_reset | |
| 513 * @stable ICU 3.0 | |
| 514 */ | |
| 515 U_STABLE UBool U_EXPORT2 | |
| 516 uregex_findNext(URegularExpression *regexp, | |
| 517 UErrorCode *status); | |
| 518 | |
| 519 /** | |
| 520 * Get the number of capturing groups in this regular expression's pattern. | |
| 521 * @param regexp The compiled regular expression. | |
| 522 * @param status A reference to a UErrorCode to receive any errors. | |
| 523 * @return the number of capture groups | |
| 524 * @stable ICU 3.0 | |
| 525 */ | |
| 526 U_STABLE int32_t U_EXPORT2 | |
| 527 uregex_groupCount(URegularExpression *regexp, | |
| 528 UErrorCode *status); | |
| 529 | |
| 530 /** Extract the string for the specified matching expression or subexpression. | |
| 531 * Group #0 is the complete string of matched text. | |
| 532 * Group #1 is the text matched by the first set of capturing parentheses. | |
| 533 * | |
| 534 * @param regexp The compiled regular expression. | |
| 535 * @param groupNum The capture group to extract. Group 0 is the comple
te | |
| 536 * match. The value of this parameter must be | |
| 537 * less than or equal to the number of capture groups i
n | |
| 538 * the pattern. | |
| 539 * @param dest Buffer to receive the matching string data | |
| 540 * @param destCapacity Capacity of the dest buffer. | |
| 541 * @param status A reference to a UErrorCode to receive any errors. | |
| 542 * @return Length of matching data, | |
| 543 * or -1 if no applicable match. | |
| 544 * @stable ICU 3.0 | |
| 545 */ | |
| 546 U_STABLE int32_t U_EXPORT2 | |
| 547 uregex_group(URegularExpression *regexp, | |
| 548 int32_t groupNum, | |
| 549 UChar *dest, | |
| 550 int32_t destCapacity, | |
| 551 UErrorCode *status); | |
| 552 | |
| 553 | |
| 554 /** Returns a shallow immutable clone of the entire input string. The returned
UText current native index | |
| 555 * is set to the beginning of the requested capture group. The capture group
length is also | |
| 556 * returned via groupLength. | |
| 557 * Group #0 is the complete string of matched text. | |
| 558 * Group #1 is the text matched by the first set of capturing parentheses. | |
| 559 * | |
| 560 * @param regexp The compiled regular expression. | |
| 561 * @param groupNum The capture group to extract. Group 0 is the comple
te | |
| 562 * match. The value of this parameter must be | |
| 563 * less than or equal to the number of capture groups i
n | |
| 564 * the pattern. | |
| 565 * @param dest A mutable UText in which to store the current input. | |
| 566 * If NULL, a new UText will be created as an immutable
shallow clone | |
| 567 * of the entire input string. | |
| 568 * @param groupLength The group length of the desired capture group. | |
| 569 * @param status A reference to a UErrorCode to receive any errors. | |
| 570 * @return The subject text currently associated with this regu
lar expression. | |
| 571 * If a pre-allocated UText was provided, it will alway
s be used and returned. | |
| 572 | |
| 573 * | |
| 574 * @draft ICU 4.6 | |
| 575 */ | |
| 576 U_DRAFT UText * U_EXPORT2 | |
| 577 uregex_groupUText(URegularExpression *regexp, | |
| 578 int32_t groupNum, | |
| 579 UText *dest, | |
| 580 int64_t *groupLength, | |
| 581 UErrorCode *status); | |
| 582 | |
| 583 | |
| 584 /** Extract the string for the specified matching expression or subexpression. | |
| 585 * Group #0 is the complete string of matched text. | |
| 586 * Group #1 is the text matched by the first set of capturing parentheses. | |
| 587 * | |
| 588 * @param regexp The compiled regular expression. | |
| 589 * @param groupNum The capture group to extract. Group 0 is the comple
te | |
| 590 * match. The value of this parameter must be | |
| 591 * less than or equal to the number of capture groups i
n | |
| 592 * the pattern. | |
| 593 * @param dest Mutable UText to receive the matching string data. | |
| 594 * If NULL, a new UText will be created (which may not
be mutable). | |
| 595 * @param status A reference to a UErrorCode to receive any errors. | |
| 596 * @return The matching string data. If a pre-allocated UText w
as provided, | |
| 597 * it will always be used and returned. | |
| 598 * | |
| 599 * @internal ICU 4.4 technology preview | |
| 600 */ | |
| 601 U_INTERNAL UText * U_EXPORT2 | |
| 602 uregex_groupUTextDeep(URegularExpression *regexp, | |
| 603 int32_t groupNum, | |
| 604 UText *dest, | |
| 605 UErrorCode *status); | |
| 606 | |
| 607 /** | |
| 608 * Returns the index in the input string of the start of the text matched by
the | |
| 609 * specified capture group during the previous match operation. Return -1 if | |
| 610 * the capture group was not part of the last match. | |
| 611 * Group #0 refers to the complete range of matched text. | |
| 612 * Group #1 refers to the text matched by the first set of capturing parenthe
ses. | |
| 613 * | |
| 614 * @param regexp The compiled regular expression. | |
| 615 * @param groupNum The capture group number | |
| 616 * @param status A reference to a UErrorCode to receive any errors. | |
| 617 * @return the starting (native) position in the input of the t
ext matched | |
| 618 * by the specified group. | |
| 619 * @stable ICU 3.0 | |
| 620 */ | |
| 621 U_STABLE int32_t U_EXPORT2 | |
| 622 uregex_start(URegularExpression *regexp, | |
| 623 int32_t groupNum, | |
| 624 UErrorCode *status); | |
| 625 | |
| 626 /** | |
| 627 * 64bit version of uregex_start. | |
| 628 * @draft ICU 4.6 | |
| 629 */ | |
| 630 U_DRAFT int64_t U_EXPORT2 | |
| 631 uregex_start64(URegularExpression *regexp, | |
| 632 int32_t groupNum, | |
| 633 UErrorCode *status); | |
| 634 | |
| 635 /** | |
| 636 * Returns the index in the input string of the position following the end | |
| 637 * of the text matched by the specified capture group. | |
| 638 * Return -1 if the capture group was not part of the last match. | |
| 639 * Group #0 refers to the complete range of matched text. | |
| 640 * Group #1 refers to the text matched by the first set of capturing parenthe
ses. | |
| 641 * | |
| 642 * @param regexp The compiled regular expression. | |
| 643 * @param groupNum The capture group number | |
| 644 * @param status A reference to a UErrorCode to receive any errors. | |
| 645 * @return the (native) index of the position following the las
t matched character. | |
| 646 * @stable ICU 3.0 | |
| 647 */ | |
| 648 U_STABLE int32_t U_EXPORT2 | |
| 649 uregex_end(URegularExpression *regexp, | |
| 650 int32_t groupNum, | |
| 651 UErrorCode *status); | |
| 652 | |
| 653 /** | |
| 654 * 64bit version of uregex_end. | |
| 655 * @draft ICU 4.6 | |
| 656 */ | |
| 657 U_DRAFT int64_t U_EXPORT2 | |
| 658 uregex_end64(URegularExpression *regexp, | |
| 659 int32_t groupNum, | |
| 660 UErrorCode *status); | |
| 661 | |
| 662 /** | |
| 663 * Reset any saved state from the previous match. Has the effect of | |
| 664 * causing uregex_findNext to begin at the specified index, and causing | |
| 665 * uregex_start(), uregex_end() and uregex_group() to return an error | |
| 666 * indicating that there is no match information available. Clears any | |
| 667 * match region that may have been set. | |
| 668 * | |
| 669 * @param regexp The compiled regular expression. | |
| 670 * @param index The position (native) in the text at which a | |
| 671 * uregex_findNext() should begin searching. | |
| 672 * @param status A reference to a UErrorCode to receive any errors. | |
| 673 * @stable ICU 3.0 | |
| 674 */ | |
| 675 U_STABLE void U_EXPORT2 | |
| 676 uregex_reset(URegularExpression *regexp, | |
| 677 int32_t index, | |
| 678 UErrorCode *status); | |
| 679 | |
| 680 /** | |
| 681 * 64bit version of uregex_reset. | |
| 682 * @draft ICU 4.6 | |
| 683 */ | |
| 684 U_DRAFT void U_EXPORT2 | |
| 685 uregex_reset64(URegularExpression *regexp, | |
| 686 int64_t index, | |
| 687 UErrorCode *status); | |
| 688 | |
| 689 /** Sets the limits of the matching region for this URegularExpression. | |
| 690 * The region is the part of the input string that will be considered when matc
hing. | |
| 691 * Invoking this method resets any saved state from the previous match, | |
| 692 * then sets the region to start at the index specified by the start parameter | |
| 693 * and end at the index specified by the end parameter. | |
| 694 * | |
| 695 * Depending on the transparency and anchoring being used (see useTransparentBounds | |
| 696 * and useAnchoringBounds), certain constructs such as anchors may behave differently | |
| 697 * at or around the boundaries of the region. | |
| 698 * | |
| 699 * The function will fail if start is greater than limit, or if either index | |
| 700 * is less than zero or greater than the length of the string being matched. | |
| 701 * | |
| 702 * @param regexp The compiled regular expression. | |
| 703 * @param regionStart The (native) index to begin searches at. | |
| 704 * @param regionLimit The (native) index to end searches at (exclusive). | |
| 705 * @param status A pointer to a UErrorCode to receive any errors. | |
| 706 * @stable ICU 4.0 | |
| 707 */ | |
| 708 U_STABLE void U_EXPORT2 | |
| 709 uregex_setRegion(URegularExpression *regexp, | |
| 710 int32_t regionStart, | |
| 711 int32_t regionLimit, | |
| 712 UErrorCode *status); | |
| 713 | |
| 714 /** | |
| 715 * 64bit version of uregex_setRegion. | |
| 716 * @draft ICU 4.6 | |
| 717 */ | |
| 718 U_DRAFT void U_EXPORT2 | |
| 719 uregex_setRegion64(URegularExpression *regexp, | |
| 720 int64_t regionStart, | |
| 721 int64_t regionLimit, | |
| 722 UErrorCode *status); | |
| 723 | |
| 724 /** | |
| 725 * Variation on uregex_setRegion to set the region without resetting the star
t index, | |
| 726 * i.e. without resetting the position for subsequent matches. | |
| 727 * @draft ICU 4.6 | |
| 728 */ | |
| 729 U_DRAFT void U_EXPORT2 | |
| 730 uregex_setRegionAndStart(URegularExpression *regexp, | |
| 731 int64_t regionStart, | |
| 732 int64_t regionLimit, | |
| 733 int64_t startIndex, | |
| 734 UErrorCode *status); | |
| 735 | |
| 736 /** | |
| 737 * Reports the start index of the matching region. Any matches found are limite
d to | |
| 738 * the region bounded by regionStart (inclusive) and regionEnd (exclusive). | |
| 739 * | |
| 740 * @param regexp The compiled regular expression. | |
| 741 * @param status A pointer to a UErrorCode to receive any errors. | |
| 742 * @return The starting (native) index of this matcher's region. | |
| 743 * @stable ICU 4.0 | |
| 744 */ | |
| 745 U_STABLE int32_t U_EXPORT2 | |
| 746 uregex_regionStart(const URegularExpression *regexp, | |
| 747 UErrorCode *status); | |
| 748 | |
| 749 /** | |
| 750 * 64bit version of uregex_regionStart. | |
| 751 * @draft ICU 4.6 | |
| 752 */ | |
| 753 U_DRAFT int64_t U_EXPORT2 | |
| 754 uregex_regionStart64(const URegularExpression *regexp, | |
| 755 UErrorCode *status); | |
| 756 | |
| 757 /** | |
| 758 * Reports the end index (exclusive) of the matching region for this URegularEx
pression. | |
| 759 * Any matches found are limited to the region bounded by regionStart (inclu
sive) | |
| 760 * and regionEnd (exclusive). | |
| 761 * | |
| 762 * @param regexp The compiled regular expression. | |
| 763 * @param status A pointer to a UErrorCode to receive any errors. | |
| 764 * @return The ending point (native) of this matcher's region. | |
| 765 * @stable ICU 4.0 | |
| 766 */ | |
| 767 U_STABLE int32_t U_EXPORT2 | |
| 768 uregex_regionEnd(const URegularExpression *regexp, | |
| 769 UErrorCode *status); | |
| 770 | |
| 771 /** | |
| 772 * 64bit version of uregex_regionEnd. | |
| 773 * @draft ICU 4.6 | |
| 774 */ | |
| 775 U_DRAFT int64_t U_EXPORT2 | |
| 776 uregex_regionEnd64(const URegularExpression *regexp, | |
| 777 UErrorCode *status); | |
| 778 | |
| 779 /** | |
| 780 * Queries the transparency of region bounds for this URegularExpression. | |
| 781 * See useTransparentBounds for a description of transparent and opaque bounds. | |
| 782 * By default, matching boundaries are opaque. | |
| 783 * | |
| 784 * @param regexp The compiled regular expression. | |
| 785 * @param status A pointer to a UErrorCode to receive any errors. | |
| 786 * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds. | |
| 787 * @stable ICU 4.0 | |
| 788 */ | |
| 789 U_STABLE UBool U_EXPORT2 | |
| 790 uregex_hasTransparentBounds(const URegularExpression *regexp, | |
| 791 UErrorCode *status); | |
| 792 | |
| 793 | |
| 794 /** | |
| 795 * Sets the transparency of region bounds for this URegularExpression. | |
| 796 * Invoking this function with an argument of TRUE will set matches to use tran
sparent bounds. | |
| 797 * If the boolean argument is FALSE, then opaque bounds will be used. | |
| 798 * | |
| 799 * Using transparent bounds, the boundaries of the matching region are transpar
ent | |
| 800 * to lookahead, lookbehind, and boundary matching constructs. Those constructs
can | |
| 801 * see text beyond the boundaries of the region while checking for a match. | |
| 802 * | |
| 803 * With opaque bounds, no text outside of the matching region is visible to loo
kahead, | |
| 804 * lookbehind, and boundary matching constructs. | |
| 805 * | |
| 806 * By default, opaque bounds are used. | |
| 807 * | |
| 808 * @param regexp The compiled regular expression. | |
| 809 * @param b TRUE for transparent bounds; FALSE for opaque bounds | |
| 810 * @param status A pointer to a UErrorCode to receive any errors. | |
| 811 * @stable ICU 4.0 | |
| 812 **/ | |
| 813 U_STABLE void U_EXPORT2 | |
| 814 uregex_useTransparentBounds(URegularExpression *regexp, | |
| 815 UBool b, | |
| 816 UErrorCode *status); | |
| 817 | |
| 818 | |
| 819 /** | |
| 820 * Return true if this URegularExpression is using anchoring bounds. | |
| 821 * By default, anchoring region bounds are used. | |
| 822 * | |
| 823 * @param regexp The compiled regular expression. | |
| 824 * @param status A pointer to a UErrorCode to receive any errors. | |
| 825 * @return TRUE if this matcher is using anchoring bounds. | |
| 826 * @stable ICU 4.0 | |
| 827 */ | |
| 828 U_STABLE UBool U_EXPORT2 | |
| 829 uregex_hasAnchoringBounds(const URegularExpression *regexp, | |
| 830 UErrorCode *status); | |
| 831 | |
| 832 | |
| 833 /** | |
| 834 * Set whether this URegularExpression is using Anchoring Bounds for its region
. | |
| 835 * With anchoring bounds, pattern anchors such as ^ and $ will match at the sta
rt | |
| 836 * and end of the region. Without Anchoring Bounds, anchors will only match at | |
| 837 * the positions they would in the complete text. | |
| 838 * | |
| 839 * Anchoring Bounds are the default for regions. | |
| 840 * | |
| 841 * @param regexp The compiled regular expression. | |
| 842 * @param b TRUE to enable anchoring bounds; FALSE to disable them. | |
| 843 * @param status A pointer to a UErrorCode to receive any errors. | |
| 844 * @stable ICU 4.0 | |
| 845 */ | |
| 846 U_STABLE void U_EXPORT2 | |
| 847 uregex_useAnchoringBounds(URegularExpression *regexp, | |
| 848 UBool b, | |
| 849 UErrorCode *status); | |
| 850 | |
| 851 /** | |
| 852 * Return TRUE if the most recent matching operation touched the | |
| 853 * end of the text being processed. In this case, additional input text could | |
| 854 * change the results of that match. | |
| 855 * | |
| 856 * @param regexp The compiled regular expression. | |
| 857 * @param status A pointer to a UErrorCode to receive any errors. | |
| 858 * @return TRUE if the most recent match hit the end of input | |
| 859 * @stable ICU 4.0 | |
| 860 */ | |
| 861 U_STABLE UBool U_EXPORT2 | |
| 862 uregex_hitEnd(const URegularExpression *regexp, | |
| 863 UErrorCode *status); | |
| 864 | |
| 865 /** | |
| 866 * Return TRUE if the most recent match succeeded and additional input could cause | |
| 867 * it to fail. If this function returns false and a match was found, then more
input | |
| 868 * might change the match but the match won't be lost. If a match was not found
, | |
| 869 * then requireEnd has no meaning. | |
| 870 * | |
| 871 * @param regexp The compiled regular expression. | |
| 872 * @param status A pointer to a UErrorCode to receive any errors. | |
| 873 * @return TRUE if more input could cause the most recent match to no longer m
atch. | |
| 874 * @stable ICU 4.0 | |
| 875 */ | |
| 876 U_STABLE UBool U_EXPORT2 | |
| 877 uregex_requireEnd(const URegularExpression *regexp, | |
| 878 UErrorCode *status); | |
| 879 | |
| 880 | |
| 881 | |
| 882 | |
| 883 | |
| 884 /** | |
| 885 * Replaces every substring of the input that matches the pattern | |
| 886 * with the given replacement string. This is a convenience function that | |
| 887 * provides a complete find-and-replace-all operation. | |
| 888 * | |
| 889 * This method scans the input string looking for matches of the pattern. | |
| 890 * Input that is not part of any match is copied unchanged to the | |
| 891 * destination buffer. Matched regions are replaced in the output | |
| 892 * buffer by the replacement string. The replacement string may contain | |
| 893 * references to capture groups; these take the form of $1, $2, etc. | |
| 894 * | |
| 895 * @param regexp The compiled regular expression. | |
| 896 * @param replacementText A string containing the replacement text. | |
| 897 * @param replacementLength The length of the replacement string, or | |
| 898 * -1 if it is NUL terminated. | |
| 899 * @param destBuf A (UChar *) buffer that will receive the resu
lt. | |
| 900 * @param destCapacity The capacity of the destination buffer. | |
| 901 * @param status A reference to a UErrorCode to receive any er
rors. | |
| 902 * @return The length of the string resulting from the f
ind | |
| 903 * and replace operation. In the event that the | |
| 904 * destination capacity is inadequate, the retur
n value | |
| 905 * is still the full length of the untruncated s
tring. | |
| 906 * @stable ICU 3.0 | |
| 907 */ | |
| 908 U_STABLE int32_t U_EXPORT2 | |
| 909 uregex_replaceAll(URegularExpression *regexp, | |
| 910 const UChar *replacementText, | |
| 911 int32_t replacementLength, | |
| 912 UChar *destBuf, | |
| 913 int32_t destCapacity, | |
| 914 UErrorCode *status); | |
| 915 | |
| 916 /** | |
| 917 * Replaces every substring of the input that matches the pattern | |
| 918 * with the given replacement string. This is a convenience function that | |
| 919 * provides a complete find-and-replace-all operation. | |
| 920 * | |
| 921 * This method scans the input string looking for matches of the pattern. | |
| 922 * Input that is not part of any match is copied unchanged to the | |
| 923 * destination buffer. Matched regions are replaced in the output | |
| 924 * buffer by the replacement string. The replacement string may contain | |
| 925 * references to capture groups; these take the form of $1, $2, etc. | |
| 926 * | |
| 927 * @param regexp The compiled regular expression. | |
| 928 * @param replacement A string containing the replacement text. | |
| 929 * @param dest A mutable UText that will receive the result. | |
| 930 * If NULL, a new UText will be created (which may
not be mutable). | |
| 931 * @param status A reference to a UErrorCode to receive any errors
. | |
| 932 * @return A UText containing the results of the find and re
place. | |
| 933 * If a pre-allocated UText was provided, it will a
lways be used and returned. | |
| 934 * | |
| 935 * @draft ICU 4.6 | |
| 936 */ | |
| 937 U_DRAFT UText * U_EXPORT2 | |
| 938 uregex_replaceAllUText(URegularExpression *regexp, | |
| 939 UText *replacement, | |
| 940 UText *dest, | |
| 941 UErrorCode *status); | |
| 942 | |
| 943 /** | |
| 944 * Replaces the first substring of the input that matches the pattern | |
| 945 * with the given replacement string. This is a convenience function that | |
| 946 * provides a complete find-and-replace operation. | |
| 947 * | |
| 948 * This method scans the input string looking for a match of the pattern. | |
| 949 * All input that is not part of the match is copied unchanged to the | |
| 950 * destination buffer. The matched region is replaced in the output | |
| 951 * buffer by the replacement string. The replacement string may contain | |
| 952 * references to capture groups; these take the form of $1, $2, etc. | |
| 953 * | |
| 954 * @param regexp The compiled regular expression. | |
| 955 * @param replacementText A string containing the replacement text. | |
| 956 * @param replacementLength The length of the replacement string, or | |
| 957 * -1 if it is NUL terminated. | |
| 958 * @param destBuf A (UChar *) buffer that will receive the resu
lt. | |
| 959 * @param destCapacity The capacity of the destination buffer. | |
| 960 * @param status a reference to a UErrorCode to receive any er
rors. | |
| 961 * @return The length of the string resulting from the f
ind | |
| 962 * and replace operation. In the event that the | |
| 963 * destination capacity is inadequate, the retur
n value | |
| 964 * is still the full length of the untruncated s
tring. | |
| 965 * @stable ICU 3.0 | |
| 966 */ | |
| 967 U_STABLE int32_t U_EXPORT2 | |
| 968 uregex_replaceFirst(URegularExpression *regexp, | |
| 969 const UChar *replacementText, | |
| 970 int32_t replacementLength, | |
| 971 UChar *destBuf, | |
| 972 int32_t destCapacity, | |
| 973 UErrorCode *status); | |
| 974 | |
| 975 /** | |
| 976 * Replaces the first substring of the input that matches the pattern | |
| 977 * with the given replacement string. This is a convenience function that | |
| 978 * provides a complete find-and-replace operation. | |
| 979 * | |
| 980 * This method scans the input string looking for a match of the pattern. | |
| 981 * All input that is not part of the match is copied unchanged to the | |
| 982 * destination buffer. The matched region is replaced in the output | |
| 983 * buffer by the replacement string. The replacement string may contain | |
| 984 * references to capture groups; these take the form of $1, $2, etc. | |
| 985 * | |
| 986 * @param regexp The compiled regular expression. | |
| 987 * @param replacement A string containing the replacement text. | |
| 988 * @param dest A mutable UText that will receive the result. | |
| 989 * If NULL, a new UText will be created (which may
not be mutable). | |
| 990 * @param status A reference to a UErrorCode to receive any errors
. | |
| 991 * @return A UText containing the results of the find and re
place. | |
| 992 * If a pre-allocated UText was provided, it will a
lways be used and returned. | |
| 993 * | |
| 994 * @draft ICU 4.6 | |
| 995 */ | |
| 996 U_DRAFT UText * U_EXPORT2 | |
| 997 uregex_replaceFirstUText(URegularExpression *regexp, | |
| 998 UText *replacement, | |
| 999 UText *dest, | |
| 1000 UErrorCode *status); | |
| 1001 | |
| 1002 | |
| 1003 /** | |
| 1004 * Implements a replace operation intended to be used as part of an | |
| 1005 * incremental find-and-replace. | |
| 1006 * | |
| 1007 * <p>The input string, starting from the end of the previous match and endin
g at | |
| 1008 * the start of the current match, is appended to the destination string. Th
en the | |
| 1009 * replacement string is appended to the output string, | |
| 1010 * including handling any substitutions of captured text.</p> | |
| 1011 * | |
| 1012 * <p>A note on preflight computation of buffersize and error handling: | |
| 1013 * Calls to uregex_appendReplacement() and uregex_appendTail() are | |
| 1014 * designed to be chained, one after another, with the destination | |
| 1015 * buffer pointer and buffer capacity updated after each in preparation | |
| 1016 * for the next. If the destination buffer is exhausted partway through s
uch a | |
| 1017 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal | |
| 1018 * ICU conventions are for a function to perform no action if it is | |
| 1019 * called with an error status, but for this one case, uregex_appendReplacemen
t() | |
| 1020 * will operate normally so that buffer size computations will complete | |
| 1021 * correctly. | |
| 1022 * | |
| 1023 * <p>For simple, prepackaged, non-incremental find-and-replace | |
| 1024 * operations, see replaceFirst() or replaceAll().</p> | |
| 1025 * | |
| 1026 * @param regexp The regular expression object. | |
| 1027 * @param replacementText The string that will replace the matched portion
of the | |
| 1028 * input string as it is copied to the destination buffe
r. | |
| 1029 * The replacement text may contain references ($1, for | |
| 1030 * example) to capture groups from the match. | |
| 1031 * @param replacementLength The length of the replacement text string, | |
| 1032 * or -1 if the string is NUL terminated. | |
| 1033 * @param destBuf The buffer into which the results of the | |
| 1034 * find-and-replace are placed. On return, this pointer | |
| 1035 * will be updated to refer to the beginning of the | |
| 1036 * unused portion of buffer, leaving it in position for | |
| 1037 * a subsequent call to this function. | |
| 1038 * @param destCapacity The size of the output buffer. On return, this | |
| 1039 * parameter will be updated to reflect the space remain
ing | |
| 1040 * unused in the output buffer. | |
| 1041 * @param status A reference to a UErrorCode to receive any errors. | |
| 1042 * @return The length of the result string. In the event that | |
| 1043 * destCapacity is inadequate, the full length of the | |
| 1044 * untruncated output string is returned. | |
| 1045 * | |
| 1046 * @stable ICU 3.0 | |
| 1047 * | |
| 1048 */ | |
| 1049 U_STABLE int32_t U_EXPORT2 | |
| 1050 uregex_appendReplacement(URegularExpression *regexp, | |
| 1051 const UChar *replacementText, | |
| 1052 int32_t replacementLength, | |
| 1053 UChar **destBuf, | |
| 1054 int32_t *destCapacity, | |
| 1055 UErrorCode *status); | |
| 1056 | |
| 1057 | |
| 1058 /** | |
| 1059 * Implements a replace operation intended to be used as part of an | |
| 1060 * incremental find-and-replace. | |
| 1061 * | |
| 1062 * <p>The input string, starting from the end of the previous match and endin
g at | |
| 1063 * the start of the current match, is appended to the destination string. Th
en the | |
| 1064 * replacement string is appended to the output string, | |
| 1065 * including handling any substitutions of captured text.</p> | |
| 1066 * | |
| 1067 * <p>For simple, prepackaged, non-incremental find-and-replace | |
| 1068 * operations, see replaceFirst() or replaceAll().</p> | |
| 1069 * | |
| 1070 * @param regexp The regular expression object. | |
| 1071 * @param replacementText The string that will replace the matched portion
of the | |
| 1072 * input string as it is copied to the destination buffe
r. | |
| 1073 * The replacement text may contain references ($1, for | |
| 1074 * example) to capture groups from the match. | |
| 1075 * @param dest A mutable UText that will receive the result. Must no
t be NULL. | |
| 1076 * @param status A reference to a UErrorCode to receive any errors. | |
| 1077 * | |
| 1078 * @draft ICU 4.6 | |
| 1079 */ | |
| 1080 U_DRAFT void U_EXPORT2 | |
| 1081 uregex_appendReplacementUText(URegularExpression *regexp, | |
| 1082 UText *replacementText, | |
| 1083 UText *dest, | |
| 1084 UErrorCode *status); | |
| 1085 | |
| 1086 | |
| 1087 /** | |
| 1088 * As the final step in a find-and-replace operation, append the remainder | |
| 1089 * of the input string, starting at the position following the last match, | |
| 1090 * to the destination string. <code>uregex_appendTail()</code> is intended | |
| 1091 * to be invoked after one or more invocations of the | |
| 1092 * <code>uregex_appendReplacement()</code> function. | |
| 1093 * | |
| 1094 * @param regexp The regular expression object. This is needed to | |
| 1095 * obtain the input string along with the position | |
| 1096 * of the last match within it. | |
| 1097 * @param destBuf The buffer in which the results of the | |
| 1098 * find-and-replace are placed. On return, the pointer | |
| 1099 * will be updated to refer to the beginning of the | |
| 1100 * unused portion of buffer. | |
| 1101 * @param destCapacity The size of the output buffer. On return, this | |
| 1102 * value will be updated to reflect the space remaining | |
| 1103 * unused in the output buffer. | |
| 1104 * @param status A reference to a UErrorCode to receive any errors. | |
| 1105 * @return The length of the result string. In the event that | |
| 1106 * destCapacity is inadequate, the full length of the | |
| 1107 * untruncated output string is returned. | |
| 1108 * | |
| 1109 * @stable ICU 3.0 | |
| 1110 */ | |
| 1111 U_STABLE int32_t U_EXPORT2 | |
| 1112 uregex_appendTail(URegularExpression *regexp, | |
| 1113 UChar **destBuf, | |
| 1114 int32_t *destCapacity, | |
| 1115 UErrorCode *status); | |
| 1116 | |
| 1117 | |
| 1118 /** | |
| 1119 * As the final step in a find-and-replace operation, append the remainder | |
| 1120 * of the input string, starting at the position following the last match, | |
| 1121 * to the destination string. <code>uregex_appendTailUText()</code> is intended
| |
| 1122 * to be invoked after one or more invocations of the | |
| 1123 * <code>uregex_appendReplacementUText()</code> function. | |
| 1124 * | |
| 1125 * @param regexp The regular expression object. This is needed to | |
| 1126 * obtain the input string along with the position | |
| 1127 * of the last match within it. | |
| 1128 * @param dest A mutable UText that will receive the result. Must no
t be NULL. | |
| 1129 * @return The destination UText. | |
| 1130 * | |
| 1131 * @draft ICU 4.6 | |
| 1132 */ | |
| 1133 U_DRAFT UText * U_EXPORT2 | |
| 1134 uregex_appendTailUText(URegularExpression *regexp, | |
| 1135 UText *dest, | |
| 1136 UErrorCode *status); | |
| 1137 | |
| 1138 | |
| 1139 | |
| 1140 /** | |
| 1141 * Split a string into fields. Somewhat like split() from Perl. | |
| 1142 * The pattern matches identify delimiters that separate the input | |
| 1143 * into fields. The input data between the matches becomes the | |
| 1144 * fields themselves. | |
| 1145 * <p> | |
| 1146 * Each of the fields is copied from the input string to the destination | |
| 1147 * buffer, and NUL terminated. The position of each field within | |
| 1148 * the destination buffer is returned in the destFields array. | |
| 1149 * | |
| 1150 * Note: another choice for the design of this function would be to not | |
| 1151 * copy the resulting fields at all, but to return indexes and | |
| 1152 * lengths within the source text. | |
| 1153 * Advantages would be | |
| 1154 * o Faster. No Copying. | |
| 1155 * o Nothing extra needed when field data may contain embedded NU
L chars. | |
| 1156 * o Less memory needed if working on large data. | |
| 1157 * Disadvantages | |
| 1158 * o Less consistent with C++ split, which copies into an | |
| 1159 * array of UnicodeStrings. | |
| 1160 * o No NUL termination, extracted fields would be less convenien
t | |
| 1161 * to use in most cases. | |
| 1162 * o Possible problems in the future, when support for Unicode Normal
ization | |
| 1163 * could cause the fields to not correspond exactly to | |
| 1164 * a range of the source text. | |
| 1165 * | |
| 1166 * @param regexp The compiled regular expression. | |
| 1167 * @param destBuf A (UChar *) buffer to receive the fields that | |
| 1168 * are extracted from the input string. These | |
| 1169 * field pointers will refer to positions within the | |
| 1170 * destination buffer supplied by the caller. Any | |
| 1171 * extra positions within the destFields array will be | |
| 1172 * set to NULL. | |
| 1173 * @param destCapacity The capacity of the destBuf. | |
| 1174 * @param requiredCapacity The actual capacity required of the destBuf. | |
| 1175 * If destCapacity is too small, requiredCapacity will
return | |
| 1176 * the total capacity required to hold all of the outp
ut, and | |
| 1177 * a U_BUFFER_OVERFLOW_ERROR will be returned. | |
| 1178 * @param destFields An array to be filled with the position of each | |
| 1179 * of the extracted fields within destBuf. | |
| 1180 * @param destFieldsCapacity The number of elements in the destFields ar
ray. | |
| 1181 * If the number of fields found is less than destFieldsCapacit
y, | |
| 1182 * the extra destFields elements are set to zero. | |
| 1183 * If destFieldsCapacity is too small, the trailing part of the | |
| 1184 * input, including any field delimiters, is treated as if it | |
| 1185 * were the last field - it is copied to the destBuf, and | |
| 1186 * its position in the destBuf is stored in the last element | |
| 1187 * of destFields. This behavior mimics that of Perl. It is no
t | |
| 1188 * an error condition, and no error status is returned when all
destField | |
| 1189 * positions are used. | |
| 1190 * @param status A reference to a UErrorCode to receive any errors. | |
| 1191 * @return The number of fields into which the input string was split. | |
| 1192 * @stable ICU 3.0 | |
| 1193 */ | |
| 1194 U_STABLE int32_t U_EXPORT2 | |
| 1195 uregex_split( URegularExpression *regexp, | |
| 1196 UChar *destBuf, | |
| 1197 int32_t destCapacity, | |
| 1198 int32_t *requiredCapacity, | |
| 1199 UChar *destFields[], | |
| 1200 int32_t destFieldsCapacity, | |
| 1201 UErrorCode *status); | |
| 1202 | |
| 1203 | |
| 1204 /** | |
| 1205 * Split a string into fields. Somewhat like split() from Perl. | |
| 1206 * The pattern matches identify delimiters that separate the input | |
| 1207 * into fields. The input data between the matches becomes the | |
| 1208 * fields themselves. | |
| 1209 * <p> | |
| 1210 * The behavior of this function is not very closely aligned with uregex_split
(); | |
| 1211 * instead, it is based on (and implemented directly on top of) the C++ split
method. | |
| 1212 * | |
| 1213 * @param regexp The compiled regular expression. | |
| 1214 * @param destFields An array of mutable UText structs to receive the resul
ts of the split. | |
| 1215 * If a field is NULL, a new UText is allocated to contain the
results for | |
| 1216 * that field. This new UText is not guaranteed to be mutable. | |
| 1217 * @param destFieldsCapacity The number of elements in the destination array. | |
| 1218 * If the number of fields found is less than destFieldsCapacity, the | |
| 1219 * extra strings in the destination array are not altered. | |
| 1220 * If the number of destination strings is less than the number | |
| 1221 * of fields, the trailing part of the input string, including
any | |
| 1222 * field delimiters, is placed in the last destination string. | |
| 1223 * This behavior mimics that of Perl. It is not an error cond
ition, and no | |
| 1224 * error status is returned when all destField positions are us
ed. | |
| 1225 * @param status A reference to a UErrorCode to receive any errors. | |
| 1226 * @return The number of fields into which the input string was split. | |
| 1227 * | |
| 1228 * @draft ICU 4.6 | |
| 1229 */ | |
| 1230 U_DRAFT int32_t U_EXPORT2 | |
| 1231 uregex_splitUText(URegularExpression *regexp, | |
| 1232 UText *destFields[], | |
| 1233 int32_t destFieldsCapacity, | |
| 1234 UErrorCode *status); | |
| 1235 | |
| 1236 | |
| 1237 | |
| 1238 | |
| 1239 /** | |
| 1240 * Set a processing time limit for match operations with this URegularExpression
. | |
| 1241 * | |
| 1242 * Some patterns, when matching certain strings, can run in exponential time. | |
| 1243 * For practical purposes, the match operation may appear to be in an | |
| 1244 * infinite loop. | |
| 1245 * When a limit is set a match operation will fail with an error if the | |
| 1246 * limit is exceeded. | |
| 1247 * <p> | |
| 1248 * The units of the limit are steps of the match engine. | |
| 1249 * Correspondence with actual processor time will depend on the speed | |
| 1250 * of the processor and the details of the specific pattern, but will | |
| 1251 * typically be on the order of milliseconds. | |
| 1252 * <p> | |
| 1253 * By default, the matching time is not limited. | |
| 1254 * <p> | |
| 1255 * | |
| 1256 * @param regexp The compiled regular expression. | |
| 1257 * @param limit The limit value, or 0 for no limit. | |
| 1258 * @param status A reference to a UErrorCode to receive any errors. | |
| 1259 * @stable ICU 4.0 | |
| 1260 */ | |
| 1261 U_STABLE void U_EXPORT2 | |
| 1262 uregex_setTimeLimit(URegularExpression *regexp, | |
| 1263 int32_t limit, | |
| 1264 UErrorCode *status); | |
| 1265 | |
| 1266 /** | |
| 1267 * Get the time limit for matches with this URegularExpression. | |
| 1268 * A return value of zero indicates that there is no limit. | |
| 1269 * | |
| 1270 * @param regexp The compiled regular expression. | |
| 1271 * @param status A reference to a UErrorCode to receive any errors. | |
| 1272 * @return the maximum allowed time for a match, in units of processing steps. | |
| 1273 * @stable ICU 4.0 | |
| 1274 */ | |
| 1275 U_STABLE int32_t U_EXPORT2 | |
| 1276 uregex_getTimeLimit(const URegularExpression *regexp, | |
| 1277 UErrorCode *status); | |
| 1278 | |
| 1279 /** | |
| 1280 * Set the amount of heap storage available for use by the match backtracking st
ack. | |
| 1281 * <p> | |
| 1282 * ICU uses a backtracking regular expression engine, with the backtrack stack | |
| 1283 * maintained on the heap. This function sets the limit to the amount of memory | |
| 1284 * that can be used for this purpose. A backtracking stack overflow will | |
| 1285 * result in an error from the match operation that caused it. | |
| 1286 * <p> | |
| 1287 * A limit is desirable because a malicious or poorly designed pattern can use | |
| 1288 * excessive memory, potentially crashing the process. A limit is enabled | |
| 1289 * by default. | |
| 1290 * <p> | |
| 1291 * @param regexp The compiled regular expression. | |
| 1292 * @param limit The maximum size, in bytes, of the matching backtrack st
ack. | |
| 1293 * A value of zero means no limit. | |
| 1294 * The limit must be greater than or equal to zero. | |
| 1295 * @param status A reference to a UErrorCode to receive any errors. | |
| 1296 * | |
| 1297 * @stable ICU 4.0 | |
| 1298 */ | |
| 1299 U_STABLE void U_EXPORT2 | |
| 1300 uregex_setStackLimit(URegularExpression *regexp, | |
| 1301 int32_t limit, | |
| 1302 UErrorCode *status); | |
| 1303 | |
| 1304 /** | |
| 1305 * Get the size of the heap storage available for use by the back tracking stack
. | |
| 1306 * | |
| 1307 * @return the maximum backtracking stack size, in bytes, or zero if the | |
| 1308 * stack size is unlimited. | |
| 1309 * @stable ICU 4.0 | |
| 1310 */ | |
| 1311 U_STABLE int32_t U_EXPORT2 | |
| 1312 uregex_getStackLimit(const URegularExpression *regexp, | |
| 1313 UErrorCode *status); | |
| 1314 | |
| 1315 | |
| 1316 /** | |
| 1317 * Function pointer for a regular expression matching callback function. | |
| 1318 * When set, a callback function will be called periodically during matching | |
| 1319 * operations. If the call back function returns FALSE, the matching | |
| 1320 * operation will be terminated early. | |
| 1321 * | |
| 1322 * Note: the callback function must not call other functions on this | |
| 1323 * URegularExpression. | |
| 1324 * | |
| 1325 * @param context context pointer. The callback function will be invoked | |
| 1326 * with the context specified at the time that | |
| 1327 * uregex_setMatchCallback() is called. | |
| 1328 * @param steps the accumulated processing time, in match steps, | |
| 1329 * for this matching operation. | |
| 1330 * @return TRUE to continue the matching operation. | |
| 1331 * FALSE to terminate the matching operation. | |
| 1332 * @stable ICU 4.0 | |
| 1333 */ | |
| 1334 U_CDECL_BEGIN | |
| 1335 typedef UBool U_CALLCONV URegexMatchCallback ( | |
| 1336 const void *context, | |
| 1337 int32_t steps); | |
| 1338 U_CDECL_END | |
| 1339 | |
| 1340 /** | |
| 1341 * Set a callback function for this URegularExpression. | |
| 1342 * During matching operations the function will be called periodically, | |
| 1343 * giving the application the opportunity to terminate a long-running | |
| 1344 * match. | |
| 1345 * | |
| 1346 * @param regexp The compiled regular expression. | |
| 1347 * @param callback A pointer to the user-supplied callback function. | |
| 1348 * @param context User context pointer. The value supplied at the | |
| 1349 * time the callback function is set will be saved | |
| 1350 * and passed to the callback each time that it is called. | |
| 1351 * @param status A reference to a UErrorCode to receive any errors. | |
| 1352 * @stable ICU 4.0 | |
| 1353 */ | |
| 1354 U_STABLE void U_EXPORT2 | |
| 1355 uregex_setMatchCallback(URegularExpression *regexp, | |
| 1356 URegexMatchCallback *callback, | |
| 1357 const void *context, | |
| 1358 UErrorCode *status); | |
| 1359 | |
| 1360 | |
| 1361 /** | |
| 1362 * Get the callback function for this URegularExpression. | |
| 1363 * | |
| 1364 * @param regexp The compiled regular expression. | |
| 1365 * @param callback Out parameter, receives a pointer to the user-supplied | |
| 1366 * callback function. | |
| 1367 * @param context Out parameter, receives the user context pointer that | |
| 1368 * was set when uregex_setMatchCallback() was called. | |
| 1369 * @param status A reference to a UErrorCode to receive any errors. | |
| 1370 * @stable ICU 4.0 | |
| 1371 */ | |
| 1372 U_STABLE void U_EXPORT2 | |
| 1373 uregex_getMatchCallback(const URegularExpression *regexp, | |
| 1374 URegexMatchCallback **callback, | |
| 1375 const void **context, | |
| 1376 UErrorCode *status); | |
| 1377 | |
| 1378 | |
| 1379 /** | |
| 1380 * Function pointer for a regular expression find callback function. | |
| 1381 * | |
| 1382 * When set, a callback function will be called during a find operation | |
| 1383 * and for operations that depend on find, such as findNext, split and some repl
ace | |
| 1384 * operations like replaceFirst. | |
| 1385 * The callback will usually be called after each attempt at a match, but this i
s not a | |
| 1386 * guarantee that the callback will be invoked at each character. For finds whe
re the | |
| 1387 * match engine is invoked at each character, this may be close to true, but les
s likely | |
| 1388 * for more optimized loops where the pattern is known to only start, and the ma
tch | |
| 1389 * engine invoked, at certain characters. | |
| 1390 * When invoked, this callback will specify the index at which a match operation
is about | |
| 1391 * to be attempted, giving the application the opportunity to terminate a long-r
unning | |
| 1392 * find operation. | |
| 1393 * | |
| 1394 * If the callback function returns FALSE, the find operation will be terminate
d early. | |
| 1395 * | |
| 1396 * Note: the callback function must not call other functions on this | |
| 1397 * URegularExpression. | |
| 1398 * | |
| 1399 * @param context context pointer. The callback function will be invoked | |
| 1400 * with the context specified at the time that | |
| 1401 * uregex_setFindProgressCallback() is called. | |
| 1402 * @param matchIndex the next index at which a match will be attempted
for this | |
| 1403 * find operation. If this callback interrupts the search, this
is the | |
| 1404 * index at which a find/findNext operation may be re-initiated. | |
| 1405 * @return TRUE to continue the find operation. | |
| 1406 * FALSE to terminate the find operation. | |
| 1407 * @draft ICU 4.6 | |
| 1408 */ | |
| 1409 U_CDECL_BEGIN | |
| 1410 typedef UBool U_CALLCONV URegexFindProgressCallback ( | |
| 1411 const void *context, | |
| 1412 int64_t matchIndex); | |
| 1413 U_CDECL_END | |
| 1414 | |
| 1415 /** | |
| 1416 * Set the find progress callback function for this URegularExpression. | |
| 1417 * | |
| 1418 * @param regexp The compiled regular expression. | |
| 1419 * @param callback A pointer to the user-supplied URegexFindProgressCallback function. | |
| 1420 * @param context User context pointer. The value supplied at the | |
| 1421 * time the callback function is set will be saved | |
| 1422 * and passed to the callback each time that it is called. | |
| 1423 * @param status A reference to a UErrorCode to receive any errors. | |
| 1424 * @draft ICU 4.6 | |
| 1425 */ | |
| 1426 U_DRAFT void U_EXPORT2 | |
| 1427 uregex_setFindProgressCallback(URegularExpression *regexp, | |
| 1428 URegexFindProgressCallback *callback, | |
| 1429 const void *context, | |
| 1430 UErrorCode *status); | |
| 1431 | |
| 1432 | |
| 1433 /** | |
| 1434 * Get the find progress callback function for this URegularExpression. | |
| 1435 * | |
| 1436 * @param regexp The compiled regular expression. | |
| 1437 * @param callback Out parameter, receives a pointer to the user-supplied | |
| 1438 * callback function. | |
| 1439 * @param context Out parameter, receives the user context pointer that | |
| 1440 * was set when uregex_setFindProgressCallback() was called
. | |
| 1441 * @param status A reference to a UErrorCode to receive any errors. | |
| 1442 * @draft ICU 4.6 | |
| 1443 */ | |
| 1444 U_DRAFT void U_EXPORT2 | |
| 1445 uregex_getFindProgressCallback(const URegularExpression *regexp, | |
| 1446 URegexFindProgressCallback **callback, | |
| 1447 const void **context, | |
| 1448 UErrorCode *status); | |
| 1449 | |
| 1450 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ | |
| 1451 #endif /* UREGEX_H */ | |
| OLD | NEW |