OLD | NEW |
| (Empty) |
1 /* | |
2 ********************************************************************** | |
3 * Copyright (C) 2004-2010, International Business Machines | |
4 * Corporation and others. All Rights Reserved. | |
5 ********************************************************************** | |
6 * file name: uregex.h | |
7 * encoding: US-ASCII | |
8 * indentation:4 | |
9 * | |
10 * created on: 2004mar09 | |
11 * created by: Andy Heninger | |
12 * | |
13 * ICU Regular Expressions, API for C | |
14 */ | |
15 | |
16 /** | |
17 * \file | |
18 * \brief C API: Regular Expressions | |
19 * | |
20 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.<
/p> | |
21 */ | |
22 | |
23 #ifndef UREGEX_H | |
24 #define UREGEX_H | |
25 | |
26 #include "unicode/utext.h" | |
27 #include "unicode/utypes.h" | |
28 | |
29 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
30 | |
31 #include "unicode/localpointer.h" | |
32 #include "unicode/parseerr.h" | |
33 | |
34 struct URegularExpression; | |
35 /** | |
36 * Structure representing a compiled regular expression, plus the results | |
37 * of a match operation. | |
38 * @stable ICU 3.0 | |
39 */ | |
40 typedef struct URegularExpression URegularExpression; | |
41 | |
42 | |
43 /** | |
44 * Constants for Regular Expression Match Modes. Each value is a distinct bit, so several flags can be combined with bitwise OR. | |
45 * @stable ICU 2.4 | |
46 */ | |
47 typedef enum URegexpFlag{ | |
48 | |
49 #ifndef U_HIDE_DRAFT_API | |
50 /** Forces normalization of pattern and strings. | |
51 Not implemented yet, just a placeholder, hence draft. | |
52 @draft ICU 2.4 */ | |
53 UREGEX_CANON_EQ = 128, | |
54 #endif | |
55 /** Enable case insensitive matching. @stable ICU 2.4 */ | |
56 UREGEX_CASE_INSENSITIVE = 2, | |
57 | |
58 /** Allow white space and comments within patterns @stable ICU 2.4 */ | |
59 UREGEX_COMMENTS = 4, | |
60 | |
61 /** If set, '.' matches line terminators, otherwise '.' matching stops at
line end. | |
62 * @stable ICU 2.4 */ | |
63 UREGEX_DOTALL = 32, | |
64 | |
65 /** If set, treat the entire pattern as a literal string. | |
66 * Metacharacters or escape sequences in the input sequence will be given | |
67 * no special meaning. Not implemented yet as of ICU 4.4. | |
68 * | |
69 * The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact | |
70 * on matching when used in conjunction with this flag. | |
71 * The other flags become superfluous. | |
72 * TODO: say which escapes are still handled; anything Java does | |
73 * early (\\u) we should still do. | |
74 * @stable ICU 4.0 | |
75 */ | |
76 UREGEX_LITERAL = 16, | |
77 | |
78 /** Control behavior of "$" and "^" | |
79 * If set, recognize line terminators within string, | |
80 * otherwise, match only at start and end of input string. | |
81 * @stable ICU 2.4 */ | |
82 UREGEX_MULTILINE = 8, | |
83 | |
84 /** Unix-only line endings. | |
85 * When this mode is enabled, only \\u000a is recognized as a line ending | |
86 * in the behavior of ., ^, and $. | |
87 * @stable ICU 4.0 | |
88 */ | |
89 UREGEX_UNIX_LINES = 1, | |
90 | |
91 /** Unicode word boundaries. | |
92 * If set, \b uses the Unicode TR 29 definition of word boundaries. | |
93 * Warning: Unicode word boundaries are quite different from | |
94 * traditional regular expression word boundaries. See | |
95 * http://unicode.org/reports/tr29/#Word_Boundaries | |
96 * @stable ICU 2.8 | |
97 */ | |
98 UREGEX_UWORD = 256, | |
99 | |
100 /** Error on Unrecognized backslash escapes. | |
101 * If set, fail with an error on patterns that contain | |
102 * backslash-escaped ASCII letters without a known special | |
103 * meaning. If this flag is not set, these | |
104 * escaped letters represent themselves. | |
105 * @stable ICU 4.0 | |
106 */ | |
107 UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 | |
108 | |
109 } URegexpFlag; | |
110 | |
111 /** | |
112 * Open (compile) an ICU regular expression. Compiles the regular expression
in | |
113 * string form into an internal representation using the specified match mode
flags. | |
114 * The resulting regular expression handle can then be used to perform various | |
115 * matching operations. | |
116 * | |
117 * | |
118 * @param pattern The Regular Expression pattern to be compiled. | |
119 * @param patternLength The length of the pattern, or -1 if the pattern is | |
120 * NUL terminated. | |
121 * @param flags Flags that alter the default matching behavior for | |
122 * the regular expression, UREGEX_CASE_INSENSITIVE, for | |
123 * example. For default behavior, set this parameter to
zero. | |
124 * See <code>enum URegexpFlag</code>. All desired flags | |
125 * are bitwise-ORed together. | |
126 * @param pe Receives the position (line and column numbers) of any
syntax | |
127 * error within the source regular expression string. If
this | |
128 * information is not wanted, pass NULL for this paramete
r. | |
129 * @param status Receives errors detected by this function. | |
130 * @stable ICU 3.0 | |
131 * | |
132 */ | |
133 U_STABLE URegularExpression * U_EXPORT2 | |
134 uregex_open( const UChar *pattern, | |
135 int32_t patternLength, | |
136 uint32_t flags, | |
137 UParseError *pe, | |
138 UErrorCode *status); | |
139 | |
140 /** | |
141 * Open (compile) an ICU regular expression. Compiles the regular expression
in | |
142 * string form into an internal representation using the specified match mode
flags. | |
143 * The resulting regular expression handle can then be used to perform various | |
144 * matching operations. | |
145 * <p> | |
146 * The contents of the pattern UText will be extracted and saved. Ownership of
the | |
147 * UText struct itself remains with the caller. This is to match the behavior
of | |
148 * uregex_open(). | |
149 * | |
150 * @param pattern The Regular Expression pattern to be compiled. | |
151 * @param flags Flags that alter the default matching behavior for | |
152 * the regular expression, UREGEX_CASE_INSENSITIVE, for | |
153 * example. For default behavior, set this parameter to
zero. | |
154 * See <code>enum URegexpFlag</code>. All desired flags | |
155 * are bitwise-ORed together. | |
156 * @param pe Receives the position (line and column numbers) of any
syntax | |
157 * error within the source regular expression string. If
this | |
158 * information is not wanted, pass NULL for this paramete
r. | |
159 * @param status Receives errors detected by this function. | |
160 * | |
161 * @draft ICU 4.6 | |
162 */ | |
163 U_DRAFT URegularExpression * U_EXPORT2 | |
164 uregex_openUText(UText *pattern, | |
165 uint32_t flags, | |
166 UParseError *pe, | |
167 UErrorCode *status); | |
168 | |
169 /** | |
170 * Open (compile) an ICU regular expression. The resulting regular expression | |
171 * handle can then be used to perform various matching operations. | |
172 * <p> | |
173 * This function is the same as uregex_open, except that the pattern | |
174 * is supplied as an 8 bit char * string in the default code page. | |
175 * | |
176 * @param pattern The Regular Expression pattern to be compiled, | |
177 * NUL terminated. | |
178 * @param flags Flags that alter the default matching behavior for | |
179 * the regular expression, UREGEX_CASE_INSENSITIVE, for | |
180 * example. For default behavior, set this parameter to
zero. | |
181 * See <code>enum URegexpFlag</code>. All desired flags | |
182 * are bitwise-ORed together. | |
183 * @param pe Receives the position (line and column numbers) of any
syntax | |
184 * error within the source regular expression string. If
this | |
185 * information is not wanted, pass NULL for this paramete
r. | |
186 * @param status Receives errors detected by this function. | |
187 * @return The URegularExpression object representing the compile
d | |
188 * pattern. | |
189 * | |
190 * @stable ICU 3.0 | |
191 */ | |
192 #if !UCONFIG_NO_CONVERSION | |
193 U_STABLE URegularExpression * U_EXPORT2 | |
194 uregex_openC( const char *pattern, | |
195 uint32_t flags, | |
196 UParseError *pe, | |
197 UErrorCode *status); | |
198 #endif | |
199 | |
200 | |
201 | |
202 /** | |
203 * Close the regular expression, recovering all resources (memory) it | |
204 * was holding. | |
205 * | |
206 * @param regexp The regular expression to be closed. | |
207 * @stable ICU 3.0 | |
208 */ | |
209 U_STABLE void U_EXPORT2 | |
210 uregex_close(URegularExpression *regexp); | |
211 | |
212 #if U_SHOW_CPLUSPLUS_API | |
213 | |
214 U_NAMESPACE_BEGIN | |
215 | |
216 /** | |
217 * \class LocalURegularExpressionPointer | |
218 * "Smart pointer" class, closes a URegularExpression via uregex_close(). | |
219 * For most methods see the LocalPointerBase base class. | |
220 * | |
221 * @see LocalPointerBase | |
222 * @see LocalPointer | |
223 * @stable ICU 4.4 | |
224 */ | |
225 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression,
uregex_close); | |
226 | |
227 U_NAMESPACE_END | |
228 | |
229 #endif | |
230 | |
231 /** | |
232 * Make a copy of a compiled regular expression. Cloning a regular | |
233 * expression is faster than opening a second instance from the source | |
234 * form of the expression, and requires less memory. | |
235 * <p> | |
236 * Note that the current input string and the position of any matched text | |
237 * within it are not cloned; only the pattern itself and the | |
238 * match mode flags are copied. | |
239 * <p> | |
240 * Cloning can be particularly useful to threaded applications that perform | |
241 * multiple match operations in parallel. Each concurrent RE | |
242 * operation requires its own instance of a URegularExpression. | |
243 * | |
244 * @param regexp The compiled regular expression to be cloned. | |
245 * @param status Receives indication of any errors encountered | |
246 * @return the cloned copy of the compiled regular expression. | |
247 * @stable ICU 3.0 | |
248 */ | |
249 U_STABLE URegularExpression * U_EXPORT2 | |
250 uregex_clone(const URegularExpression *regexp, UErrorCode *status); | |
251 | |
252 /** | |
253 * Returns a pointer to the source form of the pattern for this regular express
ion. | |
254 * This function will work even if the pattern was originally specified as a UT
ext. | |
255 * | |
256 * @param regexp The compiled regular expression. | |
257 * @param patLength This output parameter will be set to the length of the | |
258 * pattern string. A NULL pointer may be used here if the | |
259 * pattern length is not needed, as would be the case if | |
260 * the pattern is known in advance to be a NUL terminated | |
261 * string. | |
262 * @param status Receives errors detected by this function. | |
263 * @return a pointer to the pattern string. The storage for the string is | |
264 * owned by the regular expression object, and must not be | |
265 * altered or deleted by the application. The returned string | |
266 * will remain valid until the regular expression is closed. | |
267 * @stable ICU 3.0 | |
268 */ | |
269 U_STABLE const UChar * U_EXPORT2 | |
270 uregex_pattern(const URegularExpression *regexp, | |
271 int32_t *patLength, | |
272 UErrorCode *status); | |
273 | |
274 /** | |
275 * Returns the source text of the pattern for this regular expression. | |
276 * This function will work even if the pattern was originally specified as a UC
har string. | |
277 * | |
278 * @param regexp The compiled regular expression. | |
279 * @param status Receives errors detected by this function. | |
280 * @return the pattern text. The storage for the text is owned by the regular e
xpression | |
281 * object, and must not be altered or deleted. | |
282 * | |
283 * @draft ICU 4.6 | |
284 */ | |
285 U_DRAFT UText * U_EXPORT2 | |
286 uregex_patternUText(const URegularExpression *regexp, | |
287 UErrorCode *status); | |
288 | |
289 | |
290 /** | |
291 * Get the match mode flags that were specified when compiling this regular exp
ression. | |
292 * @param status Receives errors detected by this function. | |
293 * @param regexp The compiled regular expression. | |
294 * @return The match mode flags | |
295 * @see URegexpFlag | |
296 * @stable ICU 3.0 | |
297 */ | |
298 U_STABLE int32_t U_EXPORT2 | |
299 uregex_flags(const URegularExpression *regexp, | |
300 UErrorCode *status); | |
301 | |
302 | |
303 /** | |
304 * Set the subject text string upon which the regular expression will look for
matches. | |
305 * This function may be called any number of times, allowing the regular | |
306 * expression pattern to be applied to different strings. | |
307 * <p> | |
308 * Regular expression matching operations work directly on the application's | |
309 * string data. No copy is made. The subject string data must not be | |
310 * altered after calling this function until after all regular expression | |
311 * operations involving this string data are completed. | |
312 * <p> | |
313 * Zero length strings are permitted. In this case, no subsequent match | |
314 * operation will dereference the text string pointer. | |
315 * | |
316 * @param regexp The compiled regular expression. | |
317 * @param text The subject text string. | |
318 * @param textLength The length of the subject text, or -1 if the string | |
319 * is NUL terminated. | |
320 * @param status Receives errors detected by this function. | |
321 * @stable ICU 3.0 | |
322 */ | |
323 U_STABLE void U_EXPORT2 | |
324 uregex_setText(URegularExpression *regexp, | |
325 const UChar *text, | |
326 int32_t textLength, | |
327 UErrorCode *status); | |
328 | |
329 | |
330 /** | |
331 * Set the subject text string upon which the regular expression will look for
matches. | |
332 * This function may be called any number of times, allowing the regular | |
333 * expression pattern to be applied to different strings. | |
334 * <p> | |
335 * Regular expression matching operations work directly on the application's | |
336 * string data; only a shallow clone is made. The subject string data must no
t be | |
337 * altered after calling this function until after all regular expression | |
338 * operations involving this string data are completed. | |
339 * | |
340 * @param regexp The compiled regular expression. | |
341 * @param text The subject text string. | |
342 * @param status Receives errors detected by this function. | |
343 * | |
344 * @draft ICU 4.6 | |
345 */ | |
346 U_DRAFT void U_EXPORT2 | |
347 uregex_setUText(URegularExpression *regexp, | |
348 UText *text, | |
349 UErrorCode *status); | |
350 | |
351 /** | |
352 * Get the subject text that is currently associated with this | |
353 * regular expression object. If the input was supplied using uregex_setText
(), | |
354 * that pointer will be returned. Otherwise, the characters in the input wil
l | |
355 * be extracted to a buffer and returned. In either case, ownership remains | |
356 * with the regular expression object. | |
357 * | |
358 * This function will work even if the input was originally specified as a UTe
xt. | |
359 * | |
360 * @param regexp The compiled regular expression. | |
361 * @param textLength The length of the string is returned in this output param
eter. | |
362 * A NULL pointer may be used here if the | |
363 * text length is not needed, as would be the case if | |
364 * the text is known in advance to be a NUL terminated | |
365 * string. | |
366 * @param status Receives errors detected by this function. | |
367 * @return Pointer to the subject text string currently associated w
ith | |
368 * this regular expression. | |
369 * @stable ICU 3.0 | |
370 */ | |
371 U_STABLE const UChar * U_EXPORT2 | |
372 uregex_getText(URegularExpression *regexp, | |
373 int32_t *textLength, | |
374 UErrorCode *status); | |
375 | |
376 | |
377 /** | |
378 * Get the subject text that is currently associated with this | |
379 * regular expression object. | |
380 * | |
381 * This function will work even if the input was originally specified as a UCh
ar string. | |
382 * | |
383 * @param regexp The compiled regular expression. | |
384 * @param dest A mutable UText in which to store the current input. | |
385 * If NULL, a new UText will be created as an immutable shal
low clone | |
386 * of the actual input string. | |
387 * @param status Receives errors detected by this function. | |
388 * @return The subject text currently associated with this regular e
xpression. | |
389 * If a pre-allocated UText was provided, it will always be
used and returned. | |
390 * | |
391 * @draft ICU 4.6 | |
392 */ | |
393 U_DRAFT UText * U_EXPORT2 | |
394 uregex_getUText(URegularExpression *regexp, | |
395 UText *dest, | |
396 UErrorCode *status); | |
397 | |
398 /** | |
399 * Attempts to match the input string against the pattern. | |
400 * To succeed, the match must extend to the end of the string, | |
401 * or cover the complete match region. | |
402 * | |
403 * If startIndex >= zero the match operation starts at the specified | |
404 * index and must extend to the end of the input string. Any region | |
405 * that has been specified is reset. | |
406 * | |
407 * If startIndex == -1 the match must cover the input region, or the entire | |
408 * input string if no region has been set. This directly corresponds to | |
409 * Matcher.matches() in Java | |
410 * | |
411 * @param regexp The compiled regular expression. | |
412 * @param startIndex The input string (native) index at which to begin mat
ching, or -1 | |
413 * to match the input Region. | |
414 * @param status Receives errors detected by this function. | |
415 * @return TRUE if there is a match | |
416 * @stable ICU 3.0 | |
417 */ | |
418 U_STABLE UBool U_EXPORT2 | |
419 uregex_matches(URegularExpression *regexp, | |
420 int32_t startIndex, | |
421 UErrorCode *status); | |
422 | |
423 /** | |
424 * 64bit version of uregex_matches. | |
425 * @draft ICU 4.6 | |
426 */ | |
427 U_DRAFT UBool U_EXPORT2 | |
428 uregex_matches64(URegularExpression *regexp, | |
429 int64_t startIndex, | |
430 UErrorCode *status); | |
431 | |
432 /** | |
433 * Attempts to match the input string, starting from the specified index, aga
inst the pattern. | |
434 * The match may be of any length, and is not required to extend to the end | |
435 * of the input string. Contrast with uregex_matches(). | |
436 * | |
437 * <p>If startIndex is >= 0 any input region that was set for this | |
438 * URegularExpression is reset before the operation begins. | |
439 * | |
440 * <p>If the specified starting index == -1 the match begins at the start of
the input | |
441 * region, or at the start of the full string if no region has been specified
. | |
442 * This corresponds directly with Matcher.lookingAt() in Java. | |
443 * | |
444 * <p>If the match succeeds then more information can be obtained via the | |
445 * <code>uregex_start()</code>, <code>uregex_end()</code>, | |
446 * and <code>uregex_group()</code> functions.</p> | |
447 * | |
448 * @param regexp The compiled regular expression. | |
449 * @param startIndex The input string (native) index at which to begin ma
tching, or | |
450 * -1 to match the Input Region | |
451 * @param status A reference to a UErrorCode to receive any errors. | |
452 * @return TRUE if there is a match. | |
453 * @stable ICU 3.0 | |
454 */ | |
455 U_STABLE UBool U_EXPORT2 | |
456 uregex_lookingAt(URegularExpression *regexp, | |
457 int32_t startIndex, | |
458 UErrorCode *status); | |
459 | |
460 /** | |
461 * 64bit version of uregex_lookingAt. | |
462 * @draft ICU 4.6 | |
463 */ | |
464 U_DRAFT UBool U_EXPORT2 | |
465 uregex_lookingAt64(URegularExpression *regexp, | |
466 int64_t startIndex, | |
467 UErrorCode *status); | |
468 | |
469 /** | |
470 * Find the first matching substring of the input string that matches the pat
tern. | |
471 * If startIndex is >= zero the search for a match begins at the specified in
dex, | |
472 * and any match region is reset. This corresponds directly with | |
473 * Matcher.find(startIndex) in Java. | |
474 * | |
475 * If startIndex == -1 the search begins at the start of the input region, | |
476 * or at the start of the full string if no region has been specified
. | |
477 * | |
478 * If a match is found, <code>uregex_start(), uregex_end()</code>, and | |
479 * <code>uregex_group()</code> will provide more information regarding the ma
tch. | |
480 * | |
481 * @param regexp The compiled regular expression. | |
482 * @param startIndex The position (native) in the input string to begin th
e search, or | |
483 * -1 to search within the Input Region. | |
484 * @param status A reference to a UErrorCode to receive any errors. | |
485 * @return TRUE if a match is found. | |
486 * @stable ICU 3.0 | |
487 */ | |
488 U_STABLE UBool U_EXPORT2 | |
489 uregex_find(URegularExpression *regexp, | |
490 int32_t startIndex, | |
491 UErrorCode *status); | |
492 | |
493 /** | |
494 * 64bit version of uregex_find. | |
495 * @draft ICU 4.6 | |
496 */ | |
497 U_DRAFT UBool U_EXPORT2 | |
498 uregex_find64(URegularExpression *regexp, | |
499 int64_t startIndex, | |
500 UErrorCode *status); | |
501 | |
502 /** | |
503 * Find the next pattern match in the input string. Begin searching | |
504 * the input at the location following the end of the previous match, | |
505 * or at the start of the string (or region) if there is no | |
506 * previous match. If a match is found, <code>uregex_start(), uregex_end()</c
ode>, and | |
507 * <code>uregex_group()</code> will provide more information regarding the mat
ch. | |
508 * | |
509 * @param regexp The compiled regular expression. | |
510 * @param status A reference to a UErrorCode to receive any errors. | |
511 * @return TRUE if a match is found. | |
512 * @see uregex_reset | |
513 * @stable ICU 3.0 | |
514 */ | |
515 U_STABLE UBool U_EXPORT2 | |
516 uregex_findNext(URegularExpression *regexp, | |
517 UErrorCode *status); | |
518 | |
519 /** | |
520 * Get the number of capturing groups in this regular expression's pattern. | |
521 * @param regexp The compiled regular expression. | |
522 * @param status A reference to a UErrorCode to receive any errors. | |
523 * @return the number of capture groups | |
524 * @stable ICU 3.0 | |
525 */ | |
526 U_STABLE int32_t U_EXPORT2 | |
527 uregex_groupCount(URegularExpression *regexp, | |
528 UErrorCode *status); | |
529 | |
530 /** Extract the string for the specified matching expression or subexpression. | |
531 * Group #0 is the complete string of matched text. | |
532 * Group #1 is the text matched by the first set of capturing parentheses. | |
533 * | |
534 * @param regexp The compiled regular expression. | |
535 * @param groupNum The capture group to extract. Group 0 is the comple
te | |
536 * match. The value of this parameter must be | |
537 * less than or equal to the number of capture groups i
n | |
538 * the pattern. | |
539 * @param dest Buffer to receive the matching string data | |
540 * @param destCapacity Capacity of the dest buffer. | |
541 * @param status A reference to a UErrorCode to receive any errors. | |
542 * @return Length of matching data, | |
543 * or -1 if no applicable match. | |
544 * @stable ICU 3.0 | |
545 */ | |
546 U_STABLE int32_t U_EXPORT2 | |
547 uregex_group(URegularExpression *regexp, | |
548 int32_t groupNum, | |
549 UChar *dest, | |
550 int32_t destCapacity, | |
551 UErrorCode *status); | |
552 | |
553 | |
554 /** Returns a shallow immutable clone of the entire input string. The returned
UText current native index | |
555 * is set to the beginning of the requested capture group. The capture group
length is also | |
556 * returned via groupLength. | |
557 * Group #0 is the complete string of matched text. | |
558 * Group #1 is the text matched by the first set of capturing parentheses. | |
559 * | |
560 * @param regexp The compiled regular expression. | |
561 * @param groupNum The capture group to extract. Group 0 is the comple
te | |
562 * match. The value of this parameter must be | |
563 * less than or equal to the number of capture groups i
n | |
564 * the pattern. | |
565 * @param dest A mutable UText in which to store the current input. | |
566 * If NULL, a new UText will be created as an immutable
shallow clone | |
567 * of the entire input string. | |
568 * @param groupLength The group length of the desired capture group. | |
569 * @param status A reference to a UErrorCode to receive any errors. | |
570 * @return The subject text currently associated with this regu
lar expression. | |
571 * If a pre-allocated UText was provided, it will alway
s be used and returned. | |
572 | |
573 * | |
574 * @draft ICU 4.6 | |
575 */ | |
576 U_DRAFT UText * U_EXPORT2 | |
577 uregex_groupUText(URegularExpression *regexp, | |
578 int32_t groupNum, | |
579 UText *dest, | |
580 int64_t *groupLength, | |
581 UErrorCode *status); | |
582 | |
583 | |
584 /** Extract the string for the specified matching expression or subexpression. | |
585 * Group #0 is the complete string of matched text. | |
586 * Group #1 is the text matched by the first set of capturing parentheses. | |
587 * | |
588 * @param regexp The compiled regular expression. | |
589 * @param groupNum The capture group to extract. Group 0 is the comple
te | |
590 * match. The value of this parameter must be | |
591 * less than or equal to the number of capture groups i
n | |
592 * the pattern. | |
593 * @param dest Mutable UText to receive the matching string data. | |
594 * If NULL, a new UText will be created (which may not
be mutable). | |
595 * @param status A reference to a UErrorCode to receive any errors. | |
596 * @return The matching string data. If a pre-allocated UText w
as provided, | |
597 * it will always be used and returned. | |
598 * | |
599 * @internal ICU 4.4 technology preview | |
600 */ | |
601 U_INTERNAL UText * U_EXPORT2 | |
602 uregex_groupUTextDeep(URegularExpression *regexp, | |
603 int32_t groupNum, | |
604 UText *dest, | |
605 UErrorCode *status); | |
606 | |
607 /** | |
608 * Returns the index in the input string of the start of the text matched by
the | |
609 * specified capture group during the previous match operation. Return -1 if | |
610 * the capture group was not part of the last match. | |
611 * Group #0 refers to the complete range of matched text. | |
612 * Group #1 refers to the text matched by the first set of capturing parenthe
ses. | |
613 * | |
614 * @param regexp The compiled regular expression. | |
615 * @param groupNum The capture group number | |
616 * @param status A reference to a UErrorCode to receive any errors. | |
617 * @return the starting (native) position in the input of the t
ext matched | |
618 * by the specified group. | |
619 * @stable ICU 3.0 | |
620 */ | |
621 U_STABLE int32_t U_EXPORT2 | |
622 uregex_start(URegularExpression *regexp, | |
623 int32_t groupNum, | |
624 UErrorCode *status); | |
625 | |
626 /** | |
627 * 64bit version of uregex_start. | |
628 * @draft ICU 4.6 | |
629 */ | |
630 U_DRAFT int64_t U_EXPORT2 | |
631 uregex_start64(URegularExpression *regexp, | |
632 int32_t groupNum, | |
633 UErrorCode *status); | |
634 | |
635 /** | |
636 * Returns the index in the input string of the position following the end | |
637 * of the text matched by the specified capture group. | |
638 * Return -1 if the capture group was not part of the last match. | |
639 * Group #0 refers to the complete range of matched text. | |
640 * Group #1 refers to the text matched by the first set of capturing parenthe
ses. | |
641 * | |
642 * @param regexp The compiled regular expression. | |
643 * @param groupNum The capture group number | |
644 * @param status A reference to a UErrorCode to receive any errors. | |
645 * @return the (native) index of the position following the las
t matched character. | |
646 * @stable ICU 3.0 | |
647 */ | |
648 U_STABLE int32_t U_EXPORT2 | |
649 uregex_end(URegularExpression *regexp, | |
650 int32_t groupNum, | |
651 UErrorCode *status); | |
652 | |
653 /** | |
654 * 64bit version of uregex_end. | |
655 * @draft ICU 4.6 | |
656 */ | |
657 U_DRAFT int64_t U_EXPORT2 | |
658 uregex_end64(URegularExpression *regexp, | |
659 int32_t groupNum, | |
660 UErrorCode *status); | |
661 | |
662 /** | |
663 * Reset any saved state from the previous match. Has the effect of | |
664 * causing uregex_findNext to begin at the specified index, and causing | |
665 * uregex_start(), uregex_end() and uregex_group() to return an error | |
666 * indicating that there is no match information available. Clears any | |
667 * match region that may have been set. | |
668 * | |
669 * @param regexp The compiled regular expression. | |
670 * @param index The position (native) in the text at which a | |
671 * uregex_findNext() should begin searching. | |
672 * @param status A reference to a UErrorCode to receive any errors. | |
673 * @stable ICU 3.0 | |
674 */ | |
675 U_STABLE void U_EXPORT2 | |
676 uregex_reset(URegularExpression *regexp, | |
677 int32_t index, | |
678 UErrorCode *status); | |
679 | |
680 /** | |
681 * 64bit version of uregex_reset. | |
682 * @draft ICU 4.6 | |
683 */ | |
684 U_DRAFT void U_EXPORT2 | |
685 uregex_reset64(URegularExpression *regexp, | |
686 int64_t index, | |
687 UErrorCode *status); | |
688 | |
689 /** Sets the limits of the matching region for this URegularExpression. | |
690 * The region is the part of the input string that will be considered when matc
hing. | |
691 * Invoking this method resets any saved state from the previous match, | |
692 * then sets the region to start at the index specified by the start parameter | |
693 * and end at the index specified by the end parameter. | |
694 * | |
695 * Depending on the transparency and anchoring being used (see useTransparentBo
unds | |
696 * and useAnchoringBounds), certain constructs such as anchors may behave diffe
rently | |
697 * at or around the boundaries of the region | |
698 * | |
699 * The function will fail if start is greater than limit, or if either index | |
700 * is less than zero or greater than the length of the string being matched. | |
701 * | |
702 * @param regexp The compiled regular expression. | |
703 * @param regionStart The (native) index to begin searches at. | |
704 * @param regionLimit The (native) index to end searches at (exclusive). | |
705 * @param status A pointer to a UErrorCode to receive any errors. | |
706 * @stable ICU 4.0 | |
707 */ | |
708 U_STABLE void U_EXPORT2 | |
709 uregex_setRegion(URegularExpression *regexp, | |
710 int32_t regionStart, | |
711 int32_t regionLimit, | |
712 UErrorCode *status); | |
713 | |
714 /** | |
715 * 64bit version of uregex_setRegion. | |
716 * @draft ICU 4.6 | |
717 */ | |
718 U_DRAFT void U_EXPORT2 | |
719 uregex_setRegion64(URegularExpression *regexp, | |
720 int64_t regionStart, | |
721 int64_t regionLimit, | |
722 UErrorCode *status); | |
723 | |
724 /** | |
725 * Variation on uregex_setRegion that sets the region without resetting | |
726 * the position for subsequent matches; the start index is given by startIndex. | |
727 * @draft ICU 4.6 | |
728 */ | |
729 U_DRAFT void U_EXPORT2 | |
730 uregex_setRegionAndStart(URegularExpression *regexp, | |
731 int64_t regionStart, | |
732 int64_t regionLimit, | |
733 int64_t startIndex, | |
734 UErrorCode *status); | |
735 | |
736 /** | |
737 * Reports the start index of the matching region. Any matches found are limite
d to | |
738 * the region bounded by regionStart (inclusive) and regionEnd (exclusive). | |
739 * | |
740 * @param regexp The compiled regular expression. | |
741 * @param status A pointer to a UErrorCode to receive any errors. | |
742 * @return The starting (native) index of this matcher's region. | |
743 * @stable ICU 4.0 | |
744 */ | |
745 U_STABLE int32_t U_EXPORT2 | |
746 uregex_regionStart(const URegularExpression *regexp, | |
747 UErrorCode *status); | |
748 | |
749 /** | |
750 * 64bit version of uregex_regionStart. | |
751 * @draft ICU 4.6 | |
752 */ | |
753 U_DRAFT int64_t U_EXPORT2 | |
754 uregex_regionStart64(const URegularExpression *regexp, | |
755 UErrorCode *status); | |
756 | |
757 /** | |
758 * Reports the end index (exclusive) of the matching region for this URegularEx
pression. | |
759 * Any matches found are limited to the region bounded by regionStart (inclu
sive) | |
760 * and regionEnd (exclusive). | |
761 * | |
762 * @param regexp The compiled regular expression. | |
763 * @param status A pointer to a UErrorCode to receive any errors. | |
764 * @return The ending point (native) of this matcher's region. | |
765 * @stable ICU 4.0 | |
766 */ | |
767 U_STABLE int32_t U_EXPORT2 | |
768 uregex_regionEnd(const URegularExpression *regexp, | |
769 UErrorCode *status); | |
770 | |
771 /** | |
772 * 64bit version of uregex_regionEnd. | |
773 * @draft ICU 4.6 | |
774 */ | |
775 U_DRAFT int64_t U_EXPORT2 | |
776 uregex_regionEnd64(const URegularExpression *regexp, | |
777 UErrorCode *status); | |
778 | |
779 /** | |
780 * Queries the transparency of region bounds for this URegularExpression. | |
781 * See uregex_useTransparentBounds() for a description of transparent and opaque bounds. | |
782 * By default, matching boundaries are opaque. | |
783 * | |
784 * @param regexp The compiled regular expression. | |
785 * @param status A pointer to a UErrorCode to receive any errors. | |
786 * @return TRUE if this matcher is using transparent bounds, FALSE if it is not. | |
787 * @stable ICU 4.0 | |
788 */ | |
789 U_STABLE UBool U_EXPORT2 | |
790 uregex_hasTransparentBounds(const URegularExpression *regexp, | |
791 UErrorCode *status); | |
792 | |
793 | |
794 /** | |
795 * Sets the transparency of region bounds for this URegularExpression. | |
796 * Invoking this function with an argument of TRUE will set matches to use tran
sparent bounds. | |
797 * If the boolean argument is FALSE, then opaque bounds will be used. | |
798 * | |
799 * Using transparent bounds, the boundaries of the matching region are transpar
ent | |
800 * to lookahead, lookbehind, and boundary matching constructs. Those constructs
can | |
801 * see text beyond the boundaries of the region while checking for a match. | |
802 * | |
803 * With opaque bounds, no text outside of the matching region is visible to loo
kahead, | |
804 * lookbehind, and boundary matching constructs. | |
805 * | |
806 * By default, opaque bounds are used. | |
807 * | |
808 * @param regexp The compiled regular expression. | |
809 * @param b TRUE for transparent bounds; FALSE for opaque bounds | |
810 * @param status A pointer to a UErrorCode to receive any errors. | |
811 * @stable ICU 4.0 | |
812 **/ | |
813 U_STABLE void U_EXPORT2 | |
814 uregex_useTransparentBounds(URegularExpression *regexp, | |
815 UBool b, | |
816 UErrorCode *status); | |
817 | |
818 | |
819 /** | |
820 * Return true if this URegularExpression is using anchoring bounds. | |
821 * By default, anchoring region bounds are used. | |
822 * | |
823 * @param regexp The compiled regular expression. | |
824 * @param status A pointer to a UErrorCode to receive any errors. | |
825 * @return TRUE if this matcher is using anchoring bounds. | |
826 * @stable ICU 4.0 | |
827 */ | |
828 U_STABLE UBool U_EXPORT2 | |
829 uregex_hasAnchoringBounds(const URegularExpression *regexp, | |
830 UErrorCode *status); | |
831 | |
832 | |
833 /** | |
834 * Set whether this URegularExpression is using Anchoring Bounds for its region
. | |
835 * With anchoring bounds, pattern anchors such as ^ and $ will match at the sta
rt | |
836 * and end of the region. Without Anchoring Bounds, anchors will only match at | |
837 * the positions they would in the complete text. | |
838 * | |
839 * Anchoring Bounds are the default for regions. | |
840 * | |
841 * @param regexp The compiled regular expression. | |
842 * @param b TRUE to enable anchoring bounds; FALSE to disable them. | |
843 * @param status A pointer to a UErrorCode to receive any errors. | |
844 * @stable ICU 4.0 | |
845 */ | |
846 U_STABLE void U_EXPORT2 | |
847 uregex_useAnchoringBounds(URegularExpression *regexp, | |
848 UBool b, | |
849 UErrorCode *status); | |
850 | |
851 /** | |
852 * Return TRUE if the most recent matching operation touched the | |
853 * end of the text being processed. In this case, additional input text could | |
854 * change the results of that match. | |
855 * | |
856 * @param regexp The compiled regular expression. | |
857 * @param status A pointer to a UErrorCode to receive any errors. | |
858 * @return TRUE if the most recent match hit the end of input | |
859 * @stable ICU 4.0 | |
860 */ | |
861 U_STABLE UBool U_EXPORT2 | |
862 uregex_hitEnd(const URegularExpression *regexp, | |
863 UErrorCode *status); | |
864 | |
865 /** | |
866 * Return TRUE if the most recent match succeeded and additional input could cause | |
867 * it to fail. If this function returns false and a match was found, then more
input | |
868 * might change the match but the match won't be lost. If a match was not found
, | |
869 * then requireEnd has no meaning. | |
870 * | |
871 * @param regexp The compiled regular expression. | |
872 * @param status A pointer to a UErrorCode to receive any errors. | |
873 * @return TRUE if more input could cause the most recent match to no longer m
atch. | |
874 * @stable ICU 4.0 | |
875 */ | |
876 U_STABLE UBool U_EXPORT2 | |
877 uregex_requireEnd(const URegularExpression *regexp, | |
878 UErrorCode *status); | |
879 | |
880 | |
881 | |
882 | |
883 | |
884 /** | |
885 * Replaces every substring of the input that matches the pattern | |
886 * with the given replacement string. This is a convenience function that | |
887 * provides a complete find-and-replace-all operation. | |
888 * | |
889 * This method scans the input string looking for matches of the pattern. | |
890 * Input that is not part of any match is copied unchanged to the | |
891 * destination buffer. Matched regions are replaced in the output | |
892 * buffer by the replacement string. The replacement string may contain | |
893 * references to capture groups; these take the form of $1, $2, etc. | |
894 * | |
895 * @param regexp The compiled regular expression. | |
896 * @param replacementText A string containing the replacement text. | |
897 * @param replacementLength The length of the replacement string, or | |
898 * -1 if it is NUL terminated. | |
899 * @param destBuf A (UChar *) buffer that will receive the resu
lt. | |
900 * @param destCapacity The capacity of the destination buffer. | |
901 * @param status A reference to a UErrorCode to receive any er
rors. | |
902 * @return The length of the string resulting from the f
ind | |
903 * and replace operation. In the event that the | |
904 * destination capacity is inadequate, the retur
n value | |
905 * is still the full length of the untruncated s
tring. | |
906 * @stable ICU 3.0 | |
907 */ | |
908 U_STABLE int32_t U_EXPORT2 | |
909 uregex_replaceAll(URegularExpression *regexp, | |
910 const UChar *replacementText, | |
911 int32_t replacementLength, | |
912 UChar *destBuf, | |
913 int32_t destCapacity, | |
914 UErrorCode *status); | |
915 | |
916 /** | |
917 * Replaces every substring of the input that matches the pattern | |
918 * with the given replacement string. This is a convenience function that | |
919 * provides a complete find-and-replace-all operation. | |
920 * | |
921 * This method scans the input string looking for matches of the pattern. | |
922 * Input that is not part of any match is copied unchanged to the | |
923 * destination buffer. Matched regions are replaced in the output | |
924 * buffer by the replacement string. The replacement string may contain | |
925 * references to capture groups; these take the form of $1, $2, etc. | |
926 * | |
927 * @param regexp The compiled regular expression. | |
928 * @param replacement A string containing the replacement text. | |
929 * @param dest A mutable UText that will receive the result. | |
930 * If NULL, a new UText will be created (which may
not be mutable). | |
931 * @param status A reference to a UErrorCode to receive any errors
. | |
932 * @return A UText containing the results of the find and re
place. | |
933 * If a pre-allocated UText was provided, it will a
lways be used and returned. | |
934 * | |
935 * @draft ICU 4.6 | |
936 */ | |
937 U_DRAFT UText * U_EXPORT2 | |
938 uregex_replaceAllUText(URegularExpression *regexp, | |
939 UText *replacement, | |
940 UText *dest, | |
941 UErrorCode *status); | |
942 | |
943 /** | |
944 * Replaces the first substring of the input that matches the pattern | |
945 * with the given replacement string. This is a convenience function that | |
946 * provides a complete find-and-replace operation. | |
947 * | |
948 * This method scans the input string looking for a match of the pattern. | |
949 * All input that is not part of the match is copied unchanged to the | |
950 * destination buffer. The matched region is replaced in the output | |
951 * buffer by the replacement string. The replacement string may contain | |
952 * references to capture groups; these take the form of $1, $2, etc. | |
953 * | |
954 * @param regexp The compiled regular expression. | |
955 * @param replacementText A string containing the replacement text. | |
956 * @param replacementLength The length of the replacement string, or | |
957 * -1 if it is NUL terminated. | |
958 * @param destBuf A (UChar *) buffer that will receive the resu
lt. | |
959 * @param destCapacity The capacity of the destination buffer. | |
960 * @param status a reference to a UErrorCode to receive any er
rors. | |
961 * @return The length of the string resulting from the f
ind | |
962 * and replace operation. In the event that the | |
963 * destination capacity is inadequate, the retur
n value | |
964 * is still the full length of the untruncated s
tring. | |
965 * @stable ICU 3.0 | |
966 */ | |
967 U_STABLE int32_t U_EXPORT2 | |
968 uregex_replaceFirst(URegularExpression *regexp, | |
969 const UChar *replacementText, | |
970 int32_t replacementLength, | |
971 UChar *destBuf, | |
972 int32_t destCapacity, | |
973 UErrorCode *status); | |
974 | |
975 /** | |
976 * Replaces the first substring of the input that matches the pattern | |
977 * with the given replacement string. This is a convenience function that | |
978 * provides a complete find-and-replace operation. | |
979 * | |
980 * This method scans the input string looking for a match of the pattern. | |
981 * All input that is not part of the match is copied unchanged to the | |
982 * destination buffer. The matched region is replaced in the output | |
983 * buffer by the replacement string. The replacement string may contain | |
984 * references to capture groups; these take the form of $1, $2, etc. | |
985 * | |
986 * @param regexp The compiled regular expression. | |
987 * @param replacement A string containing the replacement text. | |
988 * @param dest A mutable UText that will receive the result. | |
989 * If NULL, a new UText will be created (which may
not be mutable). | |
990 * @param status A reference to a UErrorCode to receive any errors
. | |
991 * @return A UText containing the results of the find and re
place. | |
992 * If a pre-allocated UText was provided, it will a
lways be used and returned. | |
993 * | |
994 * @draft ICU 4.6 | |
995 */ | |
996 U_DRAFT UText * U_EXPORT2 | |
997 uregex_replaceFirstUText(URegularExpression *regexp, | |
998 UText *replacement, | |
999 UText *dest, | |
1000 UErrorCode *status); | |
1001 | |
1002 | |
1003 /** | |
1004 * Implements a replace operation intended to be used as part of an | |
1005 * incremental find-and-replace. | |
1006 * | |
1007 * <p>The input string, starting from the end of the previous match and endin
g at | |
1008 * the start of the current match, is appended to the destination string. Th
en the | |
1009 * replacement string is appended to the output string, | |
1010 * including handling any substitutions of captured text.</p> | |
1011 * | |
1012 * <p>A note on preflight computation of buffer size and error handling: | |
1013 * Calls to uregex_appendReplacement() and uregex_appendTail() are | |
1014 * designed to be chained, one after another, with the destination | |
1015 * buffer pointer and buffer capacity updated after each in preparation | |
1016 * for the next. If the destination buffer is exhausted partway through such a | |
1017 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal | |
1018 * ICU conventions are for a function to perform no action if it is | |
1019 * called with an error status, but for this one case, uregex_appendReplacement() | |
1020 * will operate normally so that buffer size computations will complete | |
1021 * correctly.</p> | |
1022 * | |
1023 * <p>For simple, prepackaged, non-incremental find-and-replace | |
1024 * operations, see replaceFirst() or replaceAll().</p> | |
1025 * | |
1026 * @param regexp The regular expression object. | |
1027 * @param replacementText The string that will replace the matched portion
of the | |
1028 * input string as it is copied to the destination buffe
r. | |
1029 * The replacement text may contain references ($1, for | |
1030 * example) to capture groups from the match. | |
1031 * @param replacementLength The length of the replacement text string, | |
1032 * or -1 if the string is NUL terminated. | |
1033 * @param destBuf The buffer into which the results of the | |
1034 * find-and-replace are placed. On return, this pointer | |
1035 * will be updated to refer to the beginning of the | |
1036 * unused portion of buffer, leaving it in position for | |
1037 * a subsequent call to this function. | |
1038 * @param destCapacity The size of the output buffer. On return, this | |
1039 * parameter will be updated to reflect the space remain
ing | |
1040 * unused in the output buffer. | |
1041 * @param status A reference to a UErrorCode to receive any errors. | |
1042 * @return The length of the result string. In the event that | |
1043 * destCapacity is inadequate, the full length of the | |
1044 * untruncated output string is returned. | |
1045 * | |
1046 * @stable ICU 3.0 | |
1047 * | |
1048 */ | |
1049 U_STABLE int32_t U_EXPORT2 | |
1050 uregex_appendReplacement(URegularExpression *regexp, | |
1051 const UChar *replacementText, | |
1052 int32_t replacementLength, | |
1053 UChar **destBuf, | |
1054 int32_t *destCapacity, | |
1055 UErrorCode *status); | |
1056 | |
1057 | |
1058 /** | |
1059 * Implements a replace operation intended to be used as part of an | |
1060 * incremental find-and-replace. | |
1061 * | |
1062 * <p>The input string, starting from the end of the previous match and endin
g at | |
1063 * the start of the current match, is appended to the destination string. Th
en the | |
1064 * replacement string is appended to the output string, | |
1065 * including handling any substitutions of captured text.</p> | |
1066 * | |
1067 * <p>For simple, prepackaged, non-incremental find-and-replace | |
1068 * operations, see replaceFirst() or replaceAll().</p> | |
1069 * | |
1070 * @param regexp The regular expression object. | |
1071 * @param replacementText The string that will replace the matched portion
of the | |
1072 * input string as it is copied to the destination buffe
r. | |
1073 * The replacement text may contain references ($1, for | |
1074 * example) to capture groups from the match. | |
1075 * @param dest A mutable UText that will receive the result. Must no
t be NULL. | |
1076 * @param status A reference to a UErrorCode to receive any errors. | |
1077 * | |
1078 * @draft ICU 4.6 | |
1079 */ | |
1080 U_DRAFT void U_EXPORT2 | |
1081 uregex_appendReplacementUText(URegularExpression *regexp, | |
1082 UText *replacementText, | |
1083 UText *dest, | |
1084 UErrorCode *status); | |
1085 | |
1086 | |
1087 /** | |
1088 * As the final step in a find-and-replace operation, append the remainder | |
1089 * of the input string, starting at the position following the last match, | |
1090 * to the destination string. <code>uregex_appendTail()</code> is intended | |
1091 * to be invoked after one or more invocations of the | |
1092 * <code>uregex_appendReplacement()</code> function. | |
1093 * | |
1094 * @param regexp The regular expression object. This is needed to | |
1095 * obtain the input string along with the position | |
1096 * of the last match within it. | |
1097 * @param destBuf The buffer in which the results of the | |
1098 * find-and-replace are placed. On return, the pointer | |
1099 * will be updated to refer to the beginning of the | |
1100 * unused portion of buffer. | |
1101 * @param destCapacity The size of the output buffer. On return, this | |
1102 * value will be updated to reflect the space remaining | |
1103 * unused in the output buffer. | |
1104 * @param status A reference to a UErrorCode to receive any errors. | |
1105 * @return The length of the result string. In the event that | |
1106 * destCapacity is inadequate, the full length of the | |
1107 * untruncated output string is returned. | |
1108 * | |
1109 * @stable ICU 3.0 | |
1110 */ | |
1111 U_STABLE int32_t U_EXPORT2 | |
1112 uregex_appendTail(URegularExpression *regexp, | |
1113 UChar **destBuf, | |
1114 int32_t *destCapacity, | |
1115 UErrorCode *status); | |
1116 | |
1117 | |
1118 /** | |
1119 * As the final step in a find-and-replace operation, append the remainder | |
1120 * of the input string, starting at the position following the last match, | |
1121 * to the destination string. <code>uregex_appendTailUText()</code> is intended
| |
1122 * to be invoked after one or more invocations of the | |
1123 * <code>uregex_appendReplacementUText()</code> function. | |
1124 * | |
1125 * @param regexp The regular expression object. This is needed to | |
1126 * obtain the input string along with the position | |
1127 * of the last match within it. | |
1128 * @param dest A mutable UText that will receive the result. Must no
t be NULL. | |
1129 * @return The destination UText. | |
1130 * | |
1131 * @draft ICU 4.6 | |
1132 */ | |
1133 U_DRAFT UText * U_EXPORT2 | |
1134 uregex_appendTailUText(URegularExpression *regexp, | |
1135 UText *dest, | |
1136 UErrorCode *status); | |
1137 | |
1138 | |
1139 | |
1140 /** | |
1141 * Split a string into fields. Somewhat like split() from Perl. | |
1142 * The pattern matches identify delimiters that separate the input | |
1143 * into fields. The input data between the matches becomes the | |
1144 * fields themselves. | |
1145 * <p> | |
1146 * Each of the fields is copied from the input string to the destination | |
1147 * buffer, and NUL terminated. The position of each field within | |
1148 * the destination buffer is returned in the destFields array. | |
1149 * | |
1150 * Note: another choice for the design of this function would be to not | |
1151 * copy the resulting fields at all, but to return indexes and | |
1152 * lengths within the source text. | |
1153 * Advantages would be | |
1154 * o Faster. No Copying. | |
1155 * o Nothing extra needed when field data may contain embedded NU
L chars. | |
1156 * o Less memory needed if working on large data. | |
1157 * Disadvantages | |
1158 * o Less consistent with C++ split, which copies into an | |
1159 * array of UnicodeStrings. | |
1160 * o No NUL termination, extracted fields would be less convenien
t | |
1161 * to use in most cases. | |
1162 * o Possible problems in the future, when support for Unicode Normalization | |
1163 * could cause the fields to not correspond exactly to | |
1164 * a range of the source text. | |
1165 * | |
1166 * @param regexp The compiled regular expression. | |
1167 * @param destBuf A (UChar *) buffer to receive the fields that | |
1168 * are extracted from the input string. These | |
1169 * field pointers will refer to positions within the | |
1170 * destination buffer supplied by the caller. Any | |
1171 * extra positions within the destFields array will be | |
1172 * set to NULL. | |
1173 * @param destCapacity The capacity of the destBuf. | |
1174 * @param requiredCapacity The actual capacity required of the destBuf. | |
1175 * If destCapacity is too small, requiredCapacity will
return | |
1176 * the total capacity required to hold all of the outp
ut, and | |
1177 * a U_BUFFER_OVERFLOW_ERROR will be returned. | |
1178 * @param destFields An array to be filled with the position of each | |
1179 * of the extracted fields within destBuf. | |
1180 * @param destFieldsCapacity The number of elements in the destFields ar
ray. | |
1181 * If the number of fields found is less than destFieldsCapacit
y, | |
1182 * the extra destFields elements are set to zero. | |
1183 * If destFieldsCapacity is too small, the trailing part of the | |
1184 * input, including any field delimiters, is treated as if it | |
1185 * were the last field - it is copied to the destBuf, and | |
1186 * its position in the destBuf is stored in the last element | |
1187 * of destFields. This behavior mimics that of Perl. It is no
t | |
1188 * an error condition, and no error status is returned when all
destField | |
1189 * positions are used. | |
1190 * @param status A reference to a UErrorCode to receive any errors. | |
1191 * @return The number of fields into which the input string was split. | |
1192 * @stable ICU 3.0 | |
1193 */ | |
1194 U_STABLE int32_t U_EXPORT2 | |
1195 uregex_split( URegularExpression *regexp, | |
1196 UChar *destBuf, | |
1197 int32_t destCapacity, | |
1198 int32_t *requiredCapacity, | |
1199 UChar *destFields[], | |
1200 int32_t destFieldsCapacity, | |
1201 UErrorCode *status); | |
1202 | |
1203 | |
1204 /** | |
1205 * Split a string into fields. Somewhat like split() from Perl. | |
1206 * The pattern matches identify delimiters that separate the input | |
1207 * into fields. The input data between the matches becomes the | |
1208 * fields themselves. | |
1209 * <p> | |
1210 * The behavior of this function is not very closely aligned with uregex_split
(); | |
1211 * instead, it is based on (and implemented directly on top of) the C++ split
method. | |
1212 * | |
1213 * @param regexp The compiled regular expression. | |
1214 * @param destFields An array of mutable UText structs to receive the resul
ts of the split. | |
1215 * If a field is NULL, a new UText is allocated to contain the
results for | |
1216 * that field. This new UText is not guaranteed to be mutable. | |
1217 * @param destFieldsCapacity The number of elements in the destination array. | |
1218 * If the number of fields found is less than destFieldsCapacity, the | |
1219 * extra strings in the destination array are not altered. | |
1220 * If the number of destination strings is less than the number | |
1221 * of fields, the trailing part of the input string, including
any | |
1222 * field delimiters, is placed in the last destination string. | |
1223 * This behavior mimics that of Perl. It is not an error cond
ition, and no | |
1224 * error status is returned when all destField positions are us
ed. | |
1225 * @param status A reference to a UErrorCode to receive any errors. | |
1226 * @return The number of fields into which the input string was split. | |
1227 * | |
1228 * @draft ICU 4.6 | |
1229 */ | |
1230 U_DRAFT int32_t U_EXPORT2 | |
1231 uregex_splitUText(URegularExpression *regexp, | |
1232 UText *destFields[], | |
1233 int32_t destFieldsCapacity, | |
1234 UErrorCode *status); | |
1235 | |
1236 | |
1237 | |
1238 | |
1239 /** | |
1240 * Set a processing time limit for match operations with this URegularExpression
. | |
1241 * | |
1242 * Some patterns, when matching certain strings, can run in exponential time. | |
1243 * For practical purposes, the match operation may appear to be in an | |
1244 * infinite loop. | |
1245 * When a limit is set a match operation will fail with an error if the | |
1246 * limit is exceeded. | |
1247 * <p> | |
1248 * The units of the limit are steps of the match engine. | |
1249 * Correspondence with actual processor time will depend on the speed | |
1250 * of the processor and the details of the specific pattern, but will | |
1251 * typically be on the order of milliseconds. | |
1252 * <p> | |
1253 * By default, the matching time is not limited. | |
1254 * <p> | |
1255 * | |
1256 * @param regexp The compiled regular expression. | |
1257 * @param limit The limit value, or 0 for no limit. | |
1258 * @param status A reference to a UErrorCode to receive any errors. | |
1259 * @stable ICU 4.0 | |
1260 */ | |
1261 U_STABLE void U_EXPORT2 | |
1262 uregex_setTimeLimit(URegularExpression *regexp, | |
1263 int32_t limit, | |
1264 UErrorCode *status); | |
1265 | |
1266 /** | |
1267 * Get the time limit for matches with this URegularExpression. | |
1268 * A return value of zero indicates that there is no limit. | |
1269 * | |
1270 * @param regexp The compiled regular expression. | |
1271 * @param status A reference to a UErrorCode to receive any errors. | |
1272 * @return the maximum allowed time for a match, in units of processing steps. | |
1273 * @stable ICU 4.0 | |
1274 */ | |
1275 U_STABLE int32_t U_EXPORT2 | |
1276 uregex_getTimeLimit(const URegularExpression *regexp, | |
1277 UErrorCode *status); | |
1278 | |
1279 /** | |
1280 * Set the amount of heap storage available for use by the match backtracking stack. | |
1281 * <p> | |
1282 * ICU uses a backtracking regular expression engine, with the backtrack stack | |
1283 * maintained on the heap. This function sets the limit to the amount of memory | |
1284 * that can be used for this purpose. A backtracking stack overflow will | |
1285 * result in an error from the match operation that caused it. | |
1286 * <p> | |
1287 * A limit is desirable because a malicious or poorly designed pattern can use | |
1288 * excessive memory, potentially crashing the process. A limit is enabled | |
1289 * by default. | |
1290 * <p> | |
1291 * @param regexp The compiled regular expression. | |
1292 * @param limit The maximum size, in bytes, of the matching backtrack st
ack. | |
1293 * A value of zero means no limit. | |
1294 * The limit must be greater than or equal to zero. | |
1295 * @param status A reference to a UErrorCode to receive any errors. | |
1296 * | |
1297 * @stable ICU 4.0 | |
1298 */ | |
1299 U_STABLE void U_EXPORT2 | |
1300 uregex_setStackLimit(URegularExpression *regexp, | |
1301 int32_t limit, | |
1302 UErrorCode *status); | |
1303 | |
1304 /** | |
1305 * Get the size of the heap storage available for use by the back tracking stack
. | |
1306 * | |
1307 * @return the maximum backtracking stack size, in bytes, or zero if the | |
1308 * stack size is unlimited. | |
1309 * @stable ICU 4.0 | |
1310 */ | |
1311 U_STABLE int32_t U_EXPORT2 | |
1312 uregex_getStackLimit(const URegularExpression *regexp, | |
1313 UErrorCode *status); | |
1314 | |
1315 | |
1316 /** | |
1317 * Function type for a regular expression matching callback function. | |
1318 * When set, a callback function will be called periodically during matching | |
1319 * operations. If the callback function returns FALSE, the matching | |
1320 * operation will be terminated early. | |
1321 * | |
1322 * Note: the callback function must not call other functions on this | |
1323 * URegularExpression. | |
1324 * | |
1325 * @param context context pointer. The callback function will be invoked | |
1326 * with the context specified at the time that | |
1327 * uregex_setMatchCallback() is called. | |
1328 * @param steps the accumulated processing time, in match steps, | |
1329 * for this matching operation. | |
1330 * @return TRUE to continue the matching operation. | |
1331 * FALSE to terminate the matching operation. | |
1332 * @stable ICU 4.0 | |
1333 */ | |
1334 U_CDECL_BEGIN | |
1335 typedef UBool U_CALLCONV URegexMatchCallback ( | |
1336 const void *context, | |
1337 int32_t steps); | |
1338 U_CDECL_END | |
1339 | |
1340 /** | |
1341 * Set a callback function for this URegularExpression. | |
1342 * During matching operations the function will be called periodically, | |
1343 * giving the application the opportunity to terminate a long-running | |
1344 * match. | |
1345 * | |
1346 * @param regexp The compiled regular expression. | |
1347 * @param callback A pointer to the user-supplied callback function. | |
1348 * @param context User context pointer. The value supplied at the | |
1349 * time the callback function is set will be saved | |
1350 * and passed to the callback each time that it is called. | |
1351 * @param status A reference to a UErrorCode to receive any errors. | |
1352 * @stable ICU 4.0 | |
1353 */ | |
1354 U_STABLE void U_EXPORT2 | |
1355 uregex_setMatchCallback(URegularExpression *regexp, | |
1356 URegexMatchCallback *callback, | |
1357 const void *context, | |
1358 UErrorCode *status); | |
1359 | |
1360 | |
1361 /** | |
1362 * Get the callback function for this URegularExpression. | |
1363 * | |
1364 * @param regexp The compiled regular expression. | |
1365 * @param callback Out parameter, receives a pointer to the user-supplied | |
1366 * callback function. | |
1367 * @param context Out parameter, receives the user context pointer that | |
1368 * was set when uregex_setMatchCallback() was called. | |
1369 * @param status A reference to a UErrorCode to receive any errors. | |
1370 * @stable ICU 4.0 | |
1371 */ | |
1372 U_STABLE void U_EXPORT2 | |
1373 uregex_getMatchCallback(const URegularExpression *regexp, | |
1374 URegexMatchCallback **callback, | |
1375 const void **context, | |
1376 UErrorCode *status); | |
1377 | |
1378 | |
1379 /** | |
1380 * Function type for a regular expression find callback function. | |
1381 * | |
1382 * When set, a callback function will be called during a find operation | |
1383 * and for operations that depend on find, such as findNext, split and some replace | |
1384 * operations like replaceFirst. | |
1385 * The callback will usually be called after each attempt at a match, but this is not a | |
1386 * guarantee that the callback will be invoked at each character. For finds where the | |
1387 * match engine is invoked at each character, this may be close to true, but less likely | |
1388 * for more optimized loops where the pattern is known to only start, and the match | |
1389 * engine invoked, at certain characters. | |
1390 * When invoked, this callback will specify the index at which a match operation is about | |
1391 * to be attempted, giving the application the opportunity to terminate a long-running | |
1392 * find operation. | |
1393 * | |
1394 * If the callback function returns FALSE, the find operation will be terminated early. | |
1395 * | |
1396 * Note: the callback function must not call other functions on this | |
1397 * URegularExpression. | |
1398 * | |
1399 * @param context context pointer. The callback function will be invoked | |
1400 * with the context specified at the time that | |
1401 * uregex_setFindProgressCallback() is called. | |
1402 * @param matchIndex the next index at which a match will be attempted for this | |
1403 * find operation. If this callback interrupts the search, this is the | |
1404 * index at which a find/findNext operation may be re-initiated. | |
1405 * @return TRUE to continue the matching operation. | |
1406 * FALSE to terminate the matching operation. | |
1407 * @draft ICU 4.6 | |
1408 */ | |
1409 U_CDECL_BEGIN | |
1410 typedef UBool U_CALLCONV URegexFindProgressCallback ( | |
1411 const void *context, | |
1412 int64_t matchIndex); | |
1413 U_CDECL_END | |
1414 | |
1415 /** | |
1416 * Set the find progress callback function for this URegularExpression. | |
1417 * | |
1418 * @param regexp The compiled regular expression. | |
1419 * @param callback A pointer to the user-supplied callback function. | |
1420 * @param context User context pointer. The value supplied at the | |
1421 * time the callback function is set will be saved | |
1422 * and passed to the callback each time that it is called. | |
1423 * @param status A reference to a UErrorCode to receive any errors. | |
1424 * @draft ICU 4.6 | |
1425 */ | |
1426 U_DRAFT void U_EXPORT2 | |
1427 uregex_setFindProgressCallback(URegularExpression *regexp, | |
1428 URegexFindProgressCallback *callback, | |
1429 const void *context, | |
1430 UErrorCode *status); | |
1431 | |
1432 | |
1433 /** | |
1434 * Get the find progress callback function for this URegularExpression. | |
1435 * | |
1436 * @param regexp The compiled regular expression. | |
1437 * @param callback Out parameter, receives a pointer to the user-supplied | |
1438 * callback function. | |
1439 * @param context Out parameter, receives the user context pointer that | |
1440 * was set when uregex_setFindProgressCallback() was called
. | |
1441 * @param status A reference to a UErrorCode to receive any errors. | |
1442 * @draft ICU 4.6 | |
1443 */ | |
1444 U_DRAFT void U_EXPORT2 | |
1445 uregex_getFindProgressCallback(const URegularExpression *regexp, | |
1446 URegexFindProgressCallback **callback, | |
1447 const void **context, | |
1448 UErrorCode *status); | |
1449 | |
1450 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ | |
1451 #endif /* UREGEX_H */ | |
OLD | NEW |