OLD | NEW |
| (Empty) |
1 /* | |
2 ********************************************************************** | |
3 * Copyright (C) 2001-2010 IBM and others. All rights reserved. | |
4 ********************************************************************** | |
5 * Date Name Description | |
6 * 06/28/2001 synwee Creation. | |
7 ********************************************************************** | |
8 */ | |
9 #ifndef USEARCH_H | |
10 #define USEARCH_H | |
11 | |
12 #include "unicode/utypes.h" | |
13 | |
14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION | |
15 | |
16 #include "unicode/localpointer.h" | |
17 #include "unicode/ucol.h" | |
18 #include "unicode/ucoleitr.h" | |
19 #include "unicode/ubrk.h" | |
20 | |
21 /** | |
22 * \file | |
23 * \brief C API: StringSearch | |
24 * | |
25 * C Apis for an engine that provides language-sensitive text searching based | |
26 * on the comparison rules defined in a <tt>UCollator</tt> data struct, | |
27 * see <tt>ucol.h</tt>. This ensures that language eccentricity can be | |
28 * handled, e.g. for the German collator, characters ß and SS will be matc
hed | |
29 * if case is chosen to be ignored. | |
30 * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design
/collation/ICU_collation_design.htm"> | |
31 * "ICU Collation Design Document"</a> for more information. | |
32 * <p> | |
33 * The algorithm implemented is a modified form of the Boyer Moore's search. | |
34 * For more information see | |
35 * <a href="http://icu-project.org/docs/papers/efficient_text_searching_in_java.
html"> | |
36 * "Efficient Text Searching in Java"</a>, published in <i>Java Report</i> | |
37 * in February, 1999, for further information on the algorithm. | |
38 * <p> | |
39 * There are 2 match options for selection:<br> | |
40 * Let S' be the sub-string of a text string S between the offsets start and | |
41 * end <start, end>. | |
42 * <br> | |
43 * A pattern string P matches a text string S at the offsets <start, end> | |
44 * if | |
45 * <pre> | |
46 * option 1. Some canonical equivalent of P matches some canonical equivalent | |
47 * of S' | |
48 * option 2. P matches S' and if P starts or ends with a combining mark, | |
49 * there exists no non-ignorable combining mark before or after S' | |
50 * in S respectively. | |
51 * </pre> | |
52 * Option 2. will be the default. | |
53 * <p> | |
54 * This search has APIs similar to that of other text iteration mechanisms | |
55 * such as the break iterators in <tt>ubrk.h</tt>. Using these | |
56 * APIs, it is easy to scan through text looking for all occurrences of | |
57 * a given pattern. This search iterator allows changing of direction by | |
58 * calling a <tt>reset</tt> followed by a <tt>next</tt> or <tt>previous</tt>. | |
59 * Though a direction change can occur without calling <tt>reset</tt> first, | |
60 * this operation comes with some speed penalty. | |
61 * Generally, match results in the forward direction will match the result | |
62 * matches in the backwards direction in the reverse order | |
63 * <p> | |
64 * <tt>usearch.h</tt> provides APIs to specify the starting position | |
65 * within the text string to be searched, e.g. <tt>usearch_setOffset</tt>, | |
66 * <tt>usearch_preceding</tt> and <tt>usearch_following</tt>. Since the | |
67 * starting position will be set as it is specified, please take note that | |
68 * there are some dangerous positions at which the search may render incorrect | |
69 * results: | |
70 * <ul> | |
71 * <li> The midst of a substring that requires normalization. | |
72 * <li> If the following match is to be found, the position should not be the | |
73 * second character which requires to be swapped with the preceding | |
74 * character. Vice versa, if the preceding match is to be found, | |
75 * position to search from should not be the first character which | |
76 * requires to be swapped with the next character. E.g certain Thai and | |
77 * Lao characters require swapping. | |
78 * <li> If a following pattern match is to be found, any position within a | |
79 * contracting sequence except the first will fail. Vice versa if a | |
80 * preceding pattern match is to be found, an invalid starting point | |
81 * would be any character within a contracting sequence except the last. | |
82 * </ul> | |
83 * <p> | |
84 * A breakiterator can be used if only matches at logical breaks are desired. | |
85 * Using a breakiterator will only give you results that exactly match the | |
86 * boundaries given by the breakiterator. For instance the pattern "e" will | |
87 * not be found in the string "\u00e9" if a character break iterator is used. | |
88 * <p> | |
89 * Options are provided to handle overlapping matches. | |
90 * E.g. In English, overlapping matches produce the results 0 and 2 | |
91 * for the pattern "abab" in the text "ababab", whereas mutually | |
92 * exclusive matches only produce the result of 0. | |
93 * <p> | |
94 * Though collator attributes will be taken into consideration while | |
95 * performing matches, there are no APIs here for setting and getting the | |
96 * attributes. These attributes can be set by getting the collator | |
97 * from <tt>usearch_getCollator</tt> and using the APIs in <tt>ucol.h</tt>. | |
98 * Lastly to update String Search to the new collator attributes, | |
99 * usearch_reset() has to be called. | |
100 * <p> | |
101 * Restriction: <br> | |
102 * Currently there are no composite characters that consist of a | |
103 * character with combining class > 0 before a character with combining | |
104 * class == 0. However, if such a character exists in the future, the | |
105 * search mechanism does not guarantee the results for option 1. | |
106 * | |
107 * <p> | |
108 * Example of use:<br> | |
109 * <pre><code> | |
110 * char *tgtstr = "The quick brown fox jumped over the lazy fox"; | |
111 * char *patstr = "fox"; | |
112 * UChar target[64]; | |
113 * UChar pattern[16]; | |
114 * UErrorCode status = U_ZERO_ERROR; | |
115 * u_uastrcpy(target, tgtstr); | |
116 * u_uastrcpy(pattern, patstr); | |
117 * | |
118 * UStringSearch *search = usearch_open(pattern, -1, target, -1, "en_US", | |
119 * NULL, &status); | |
120 * if (U_SUCCESS(status)) { | |
121 * for (int pos = usearch_first(search, &status); | |
122 * pos != USEARCH_DONE; | |
123 * pos = usearch_next(search, &status)) | |
124 * { | |
125 * printf("Found match at %d pos, length is %d\n", pos, | |
126 * usearch_getMatchedLength(search)); | |
127 * } | |
128 * } | |
129 * | |
130 * usearch_close(search); | |
131 * </code></pre> | |
132 * @stable ICU 2.4 | |
133 */ | |
134 | |
135 /** | |
136 * DONE is returned by previous() and next() after all valid matches have | |
137 * been returned, and by first() and last() if there are no matches at all. | |
138 * @stable ICU 2.4 | |
139 */ | |
140 #define USEARCH_DONE -1 | |
141 | |
142 /** | |
143 * Data structure for searching | |
144 * @stable ICU 2.4 | |
145 */ | |
146 struct UStringSearch; | |
147 /** | |
148 * Data structure for searching | |
149 * @stable ICU 2.4 | |
150 */ | |
151 typedef struct UStringSearch UStringSearch; | |
152 | |
153 /** | |
154 * @stable ICU 2.4 | |
155 */ | |
156 typedef enum { | |
157 /** Option for overlapping matches */ | |
158 USEARCH_OVERLAP, | |
159 /** | |
160 * Option for canonical matches. option 1 in header documentation. | |
161 * The default value will be USEARCH_OFF | |
162 */ | |
163 USEARCH_CANONICAL_MATCH, | |
164 /** | |
165 * Option to control how collation elements are compared. | |
166 * The default value will be USEARCH_STANDARD_ELEMENT_COMPARISON. | |
167 * @stable ICU 4.4 | |
168 */ | |
169 USEARCH_ELEMENT_COMPARISON, | |
170 | |
171 USEARCH_ATTRIBUTE_COUNT | |
172 } USearchAttribute; | |
173 | |
174 /** | |
175 * @stable ICU 2.4 | |
176 */ | |
177 typedef enum { | |
178 /** Default value for any USearchAttribute */ | |
179 USEARCH_DEFAULT = -1, | |
180 /** Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH */ | |
181 USEARCH_OFF, | |
182 /** Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH */ | |
183 USEARCH_ON, | |
184 /** | |
185 * Value (default) for USEARCH_ELEMENT_COMPARISON; | |
186 * standard collation element comparison at the specified collator | |
187 * strength. | |
188 * @stable ICU 4.4 | |
189 */ | |
190 USEARCH_STANDARD_ELEMENT_COMPARISON, | |
191 /** | |
192 * Value for USEARCH_ELEMENT_COMPARISON; | |
193 * collation element comparison is modified to effectively provide | |
194 * behavior between the specified strength and strength - 1. Collation | |
195 * elements in the pattern that have the base weight for the specified | |
196 * strength are treated as "wildcards" that match an element with any | |
197 * other weight at that collation level in the searched text. For | |
198 * example, with a secondary-strength English collator, a plain 'e' in | |
199 * the pattern will match a plain e or an e with any diacritic in the | |
200 * searched text, but an e with diacritic in the pattern will only | |
201 * match an e with the same diacritic in the searched text. | |
202 * @stable ICU 4.4 | |
203 */ | |
204 USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD, | |
205 /** | |
206 * Value for USEARCH_ELEMENT_COMPARISON. | |
207 * collation element comparison is modified to effectively provide | |
208 * behavior between the specified strength and strength - 1. Collation | |
209 * elements in either the pattern or the searched text that have the | |
210 * base weight for the specified strength are treated as "wildcards" | |
211 * that match an element with any other weight at that collation level. | |
212 * For example, with a secondary-strength English collator, a plain 'e' | |
213 * in the pattern will match a plain e or an e with any diacritic in the | |
214 * searched text, but an e with diacritic in the pattern will only | |
215 * match an e with the same diacritic or a plain e in the searched text. | |
216 * @stable ICU 4.4 | |
217 */ | |
218 USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD, | |
219 | |
220 USEARCH_ATTRIBUTE_VALUE_COUNT | |
221 } USearchAttributeValue; | |
222 | |
223 /* open and close ------------------------------------------------------ */ | |
224 | |
225 /** | |
226 * Creating a search iterator data struct using the argument locale language | |
227 * rule set. A collator will be created in the process, which will be owned by | |
228 * this search and will be deleted in <tt>usearch_close</tt>. | |
229 * @param pattern for matching | |
230 * @param patternlength length of the pattern, -1 for null-termination | |
231 * @param text text string | |
232 * @param textlength length of the text string, -1 for null-termination | |
233 * @param locale name of locale for the rules to be used | |
234 * @param breakiter A BreakIterator that will be used to restrict the points | |
235 * at which matches are detected. If a match is found, but | |
236 * the match's start or end index is not a boundary as | |
237 * determined by the <tt>BreakIterator</tt>, the match will | |
238 * be rejected and another will be searched for. | |
239 * If this parameter is <tt>NULL</tt>, no break detection is | |
240 * attempted. | |
241 * @param status for errors if it occurs. If pattern or text is NULL, or if | |
242 * patternlength or textlength is 0 then an | |
243 * U_ILLEGAL_ARGUMENT_ERROR is returned. | |
244 * @return search iterator data structure, or NULL if there is an error. | |
245 * @stable ICU 2.4 | |
246 */ | |
247 U_STABLE UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, | |
248 int32_t patternlength, | |
249 const UChar *text, | |
250 int32_t textlength, | |
251 const char *locale, | |
252 UBreakIterator *breakiter, | |
253 UErrorCode *status); | |
254 | |
255 /** | |
256 * Creating a search iterator data struct using the argument collator language | |
257 * rule set. Note, user retains the ownership of this collator, thus the | |
258 * responsibility of deletion lies with the user. | |
259 * NOTE: string search cannot be instantiated from a collator that has | |
260 * collate digits as numbers (CODAN) turned on. | |
261 * @param pattern for matching | |
262 * @param patternlength length of the pattern, -1 for null-termination | |
263 * @param text text string | |
264 * @param textlength length of the text string, -1 for null-termination | |
265 * @param collator used for the language rules | |
266 * @param breakiter A BreakIterator that will be used to restrict the points | |
267 * at which matches are detected. If a match is found, but | |
268 * the match's start or end index is not a boundary as | |
269 * determined by the <tt>BreakIterator</tt>, the match will | |
270 * be rejected and another will be searched for. | |
271 * If this parameter is <tt>NULL</tt>, no break detection is | |
272 * attempted. | |
273 * @param status for errors if it occurs. If collator, pattern or text is NULL, | |
274 * or if patternlength or textlength is 0 then an | |
275 * U_ILLEGAL_ARGUMENT_ERROR is returned. | |
276 * @return search iterator data structure, or NULL if there is an error. | |
277 * @stable ICU 2.4 | |
278 */ | |
279 U_STABLE UStringSearch * U_EXPORT2 usearch_openFromCollator( | |
280 const UChar *pattern, | |
281 int32_t patternlength, | |
282 const UChar *text, | |
283 int32_t textlength, | |
284 const UCollator *collator, | |
285 UBreakIterator *breakiter, | |
286 UErrorCode *status); | |
287 | |
288 /** | |
289 * Destroying and cleaning up the search iterator data struct. | |
290 * If a collator is created in <tt>usearch_open</tt>, it will be destroyed here. | |
291 * @param searchiter data struct to clean up | |
292 * @stable ICU 2.4 | |
293 */ | |
294 U_STABLE void U_EXPORT2 usearch_close(UStringSearch *searchiter); | |
295 | |
296 #if U_SHOW_CPLUSPLUS_API | |
297 | |
298 U_NAMESPACE_BEGIN | |
299 | |
300 /** | |
301 * \class LocalUStringSearchPointer | |
302 * "Smart pointer" class, closes a UStringSearch via usearch_close(). | |
303 * For most methods see the LocalPointerBase base class. | |
304 * | |
305 * @see LocalPointerBase | |
306 * @see LocalPointer | |
307 * @stable ICU 4.4 | |
308 */ | |
309 U_DEFINE_LOCAL_OPEN_POINTER(LocalUStringSearchPointer, UStringSearch, usearch_cl
ose); | |
310 | |
311 U_NAMESPACE_END | |
312 | |
313 #endif | |
314 | |
315 /* get and set methods -------------------------------------------------- */ | |
316 | |
317 /** | |
318 * Sets the current position in the text string which the next search will | |
319 * start from. Clears previous states. | |
320 * This method takes the argument index and sets the position in the text | |
321 * string accordingly without checking if the index is pointing to a | |
322 * valid starting point to begin searching. | |
323 * Search positions that may render incorrect results are highlighted in the | |
324 * header comments | |
325 * @param strsrch search iterator data struct | |
326 * @param position position to start next search from. If position is less | |
327 * than or greater than the text range for searching, | |
328 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned | |
329 * @param status error status if any. | |
330 * @stable ICU 2.4 | |
331 */ | |
332 U_STABLE void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, | |
333 int32_t position, | |
334 UErrorCode *status); | |
335 | |
336 /** | |
337 * Return the current index in the string text being searched. | |
338 * If the iteration has gone past the end of the text (or past the beginning | |
339 * for a backwards search), <tt>USEARCH_DONE</tt> is returned. | |
340 * @param strsrch search iterator data struct | |
341 * @see #USEARCH_DONE | |
342 * @stable ICU 2.4 | |
343 */ | |
344 U_STABLE int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch); | |
345 | |
346 /** | |
347 * Sets the text searching attributes located in the enum USearchAttribute | |
348 * with values from the enum USearchAttributeValue. | |
349 * <tt>USEARCH_DEFAULT</tt> can be used for all attributes for resetting. | |
350 * @param strsrch search iterator data struct | |
351 * @param attribute text attribute to be set | |
352 * @param value text attribute value | |
353 * @param status for errors if it occurs | |
354 * @see #usearch_getAttribute | |
355 * @stable ICU 2.4 | |
356 */ | |
357 U_STABLE void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, | |
358 USearchAttribute attribute, | |
359 USearchAttributeValue value, | |
360 UErrorCode *status); | |
361 | |
362 /** | |
363 * Gets the text searching attributes. | |
364 * @param strsrch search iterator data struct | |
365 * @param attribute text attribute to be retrieved | |
366 * @return text attribute value | |
367 * @see #usearch_setAttribute | |
368 * @stable ICU 2.4 | |
369 */ | |
370 U_STABLE USearchAttributeValue U_EXPORT2 usearch_getAttribute( | |
371 const UStringSearch *strsrch, | |
372 USearchAttribute attribute); | |
373 | |
374 /** | |
375 * Returns the index to the match in the text string that was searched. | |
376 * This call returns a valid result only after a successful call to | |
377 * <tt>usearch_first</tt>, <tt>usearch_next</tt>, <tt>usearch_previous</tt>, | |
378 * or <tt>usearch_last</tt>. | |
379 * Just after construction, or after a searching method returns | |
380 * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>. | |
381 * <p> | |
382 * Use <tt>usearch_getMatchedLength</tt> to get the matched string length. | |
383 * @param strsrch search iterator data struct | |
384 * @return index to a substring within the text string that is being | |
385 * searched. | |
386 * @see #usearch_first | |
387 * @see #usearch_next | |
388 * @see #usearch_previous | |
389 * @see #usearch_last | |
390 * @see #USEARCH_DONE | |
391 * @stable ICU 2.4 | |
392 */ | |
393 U_STABLE int32_t U_EXPORT2 usearch_getMatchedStart( | |
394 const UStringSearch *strsrch); | |
395 | |
396 /** | |
397 * Returns the length of text in the string which matches the search pattern. | |
398 * This call returns a valid result only after a successful call to | |
399 * <tt>usearch_first</tt>, <tt>usearch_next</tt>, <tt>usearch_previous</tt>, | |
400 * or <tt>usearch_last</tt>. | |
401 * Just after construction, or after a searching method returns | |
402 * <tt>USEARCH_DONE</tt>, this method will return 0. | |
403 * @param strsrch search iterator data struct | |
404 * @return The length of the match in the string text, or 0 if there is no | |
405 * match currently. | |
406 * @see #usearch_first | |
407 * @see #usearch_next | |
408 * @see #usearch_previous | |
409 * @see #usearch_last | |
410 * @see #USEARCH_DONE | |
411 * @stable ICU 2.4 | |
412 */ | |
413 U_STABLE int32_t U_EXPORT2 usearch_getMatchedLength( | |
414 const UStringSearch *strsrch); | |
415 | |
416 /** | |
417 * Returns the text that was matched by the most recent call to | |
418 * <tt>usearch_first</tt>, <tt>usearch_next</tt>, <tt>usearch_previous</tt>, | |
419 * or <tt>usearch_last</tt>. | |
420 * If the iterator is not pointing at a valid match (e.g. just after | |
421 * construction or after <tt>USEARCH_DONE</tt> has been returned), returns | |
422 * an empty string. If result is not large enough to store the matched text, | |
423 * result will be filled with the partial text and a U_BUFFER_OVERFLOW_ERROR | |
424 * will be returned in status. result will be null-terminated whenever | |
425 * possible. If the buffer fits the matched text exactly, null-termination | |
426 * is not possible and U_STRING_NOT_TERMINATED_WARNING is set in status. | |
427 * Pre-flighting can be done either with resultCapacity = 0 or with the API | |
428 * <tt>usearch_getMatchedLength</tt>. | |
429 * @param strsrch search iterator data struct | |
430 * @param result UChar buffer to store the matched string | |
431 * @param resultCapacity length of the result buffer | |
432 * @param status error returned if result is not large enough | |
433 * @return exact length of the matched text, not counting the null-termination | |
434 * @see #usearch_first | |
435 * @see #usearch_next | |
436 * @see #usearch_previous | |
437 * @see #usearch_last | |
438 * @see #USEARCH_DONE | |
439 * @stable ICU 2.4 | |
440 */ | |
441 U_STABLE int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, | |
442 UChar *result, | |
443 int32_t resultCapacity, | |
444 UErrorCode *status); | |
445 | |
446 #if !UCONFIG_NO_BREAK_ITERATION | |
447 | |
448 /** | |
449 * Set the BreakIterator that will be used to restrict the points at which | |
450 * matches are detected. | |
451 * @param strsrch search iterator data struct | |
452 * @param breakiter A BreakIterator that will be used to restrict the points | |
453 * at which matches are detected. If a match is found, but | |
454 * the match's start or end index is not a boundary as | |
455 * determined by the <tt>BreakIterator</tt>, the match will | |
456 * be rejected and another will be searched for. | |
457 * If this parameter is <tt>NULL</tt>, no break detection is | |
458 * attempted. | |
459 * @param status for errors if it occurs | |
460 * @see #usearch_getBreakIterator | |
461 * @stable ICU 2.4 | |
462 */ | |
463 U_STABLE void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, | |
464 UBreakIterator *breakiter, | |
465 UErrorCode *status); | |
466 | |
467 /** | |
468 * Returns the BreakIterator that is used to restrict the points at which | |
469 * matches are detected. This will be the same object that was passed to the | |
470 * constructor or to <tt>usearch_setBreakIterator</tt>. Note that | |
471 * <tt>NULL</tt> | |
472 * is a legal value; it means that break detection should not be attempted. | |
473 * @param strsrch search iterator data struct | |
474 * @return break iterator used | |
475 * @see #usearch_setBreakIterator | |
476 * @stable ICU 2.4 | |
477 */ | |
478 U_STABLE const UBreakIterator * U_EXPORT2 usearch_getBreakIterator( | |
479 const UStringSearch *strsrch); | |
480 | |
481 #endif | |
482 | |
483 /** | |
484 * Set the string text to be searched. Text iteration will hence begin at the | |
485 * start of the text string. This method is useful if you want to re-use an | |
486 * iterator to search for the same pattern within a different body of text. | |
487 * @param strsrch search iterator data struct | |
488 * @param text new string to look for match | |
489 * @param textlength length of the new string, -1 for null-termination | |
490 * @param status for errors if it occurs. If text is NULL, or textlength is 0 | |
491 * then an U_ILLEGAL_ARGUMENT_ERROR is returned with no change | |
492 * done to strsrch. | |
493 * @see #usearch_getText | |
494 * @stable ICU 2.4 | |
495 */ | |
496 U_STABLE void U_EXPORT2 usearch_setText( UStringSearch *strsrch, | |
497 const UChar *text, | |
498 int32_t textlength, | |
499 UErrorCode *status); | |
500 | |
501 /** | |
502 * Return the string text to be searched. | |
503 * @param strsrch search iterator data struct | |
504 * @param length returned string text length | |
505 * @return string text | |
506 * @see #usearch_setText | |
507 * @stable ICU 2.4 | |
508 */ | |
509 U_STABLE const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, | |
510 int32_t *length); | |
511 | |
512 /** | |
513 * Gets the collator used for the language rules. | |
514 * <p> | |
515 * Deleting the returned <tt>UCollator</tt> before calling | |
516 * <tt>usearch_close</tt> would cause the string search to fail. | |
517 * <tt>usearch_close</tt> will delete the collator if this search owns it. | |
518 * @param strsrch search iterator data struct | |
519 * @return collator | |
520 * @stable ICU 2.4 | |
521 */ | |
522 U_STABLE UCollator * U_EXPORT2 usearch_getCollator( | |
523 const UStringSearch *strsrch); | |
524 | |
525 /** | |
526 * Sets the collator used for the language rules. User retains the ownership | |
527 * of this collator, thus the responsibility of deletion lies with the user. | |
528 * This method causes internal data such as Boyer-Moore shift tables to | |
529 * be recalculated, but the iterator's position is unchanged. | |
530 * @param strsrch search iterator data struct | |
531 * @param collator to be used | |
532 * @param status for errors if it occurs | |
533 * @stable ICU 2.4 | |
534 */ | |
535 U_STABLE void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, | |
536 const UCollator *collator, | |
537 UErrorCode *status); | |
538 | |
539 /** | |
540 * Sets the pattern used for matching. | |
541 * Internal data like the Boyer Moore table will be recalculated, but the | |
542 * iterator's position is unchanged. | |
543 * @param strsrch search iterator data struct | |
544 * @param pattern new pattern string to be matched | |
545 * @param patternlength pattern length, -1 for null-terminated string | |
546 * @param status for errors if it occurs. If pattern is NULL, or patternlength | |
547 * is 0 then a U_ILLEGAL_ARGUMENT_ERROR is returned with no change | |
548 * done to strsrch. | |
549 * @stable ICU 2.4 | |
550 */ | |
551 U_STABLE void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, | |
552 const UChar *pattern, | |
553 int32_t patternlength, | |
554 UErrorCode *status); | |
555 | |
556 /** | |
557 * Gets the search pattern | |
558 * @param strsrch search iterator data struct | |
559 * @param length return length of the pattern, -1 indicates that the pattern | |
560 * is null-terminated | |
561 * @return pattern string | |
562 * @stable ICU 2.4 | |
563 */ | |
564 U_STABLE const UChar * U_EXPORT2 usearch_getPattern( | |
565 const UStringSearch *strsrch, | |
566 int32_t *length); | |
567 | |
568 /* methods ------------------------------------------------------------- */ | |
569 | |
570 /** | |
571 * Returns the first index at which the string text matches the search | |
572 * pattern. | |
573 * The iterator is adjusted so that its current index (as returned by | |
574 * <tt>usearch_getOffset</tt>) is the match position if one was found. | |
575 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and | |
576 * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. | |
577 * @param strsrch search iterator data struct | |
578 * @param status for errors if it occurs | |
579 * @return The character index of the first match, or | |
580 * <tt>USEARCH_DONE</tt> if there are no matches. | |
581 * @see #usearch_getOffset | |
582 * @see #USEARCH_DONE | |
583 * @stable ICU 2.4 | |
584 */ | |
585 U_STABLE int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, | |
586 UErrorCode *status); | |
587 | |
588 /** | |
589 * Returns the first index greater than <tt>position</tt> at which the string | |
590 * text | |
591 * matches the search pattern. The iterator is adjusted so that its current | |
592 * index (as returned by <tt>usearch_getOffset</tt>) is the match position if | |
593 * one was found. | |
594 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and | |
595 * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. | |
596 * <p> | |
597 * Search positions that may render incorrect results are highlighted in the | |
598 * header comments. If position lies before or after the text range | |
599 * for searching, a U_INDEX_OUTOFBOUNDS_ERROR will be returned. | |
600 * @param strsrch search iterator data struct | |
601 * @param position index position to start the search at | |
602 * @param status for errors if it occurs | |
603 * @return The character index of the first match following <tt>position</tt>, | |
604 * or <tt>USEARCH_DONE</tt> if there are no matches. | |
605 * @see #usearch_getOffset | |
606 * @see #USEARCH_DONE | |
607 * @stable ICU 2.4 | |
608 */ | |
609 U_STABLE int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, | |
610 int32_t position, | |
611 UErrorCode *status); | |
612 | |
613 /** | |
614 * Returns the last index in the target text at which it matches the search | |
615 * pattern. The iterator is adjusted so that its current | |
616 * index (as returned by <tt>usearch_getOffset</tt>) is the match position if | |
617 * one was found. | |
618 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and | |
619 * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. | |
620 * @param strsrch search iterator data struct | |
621 * @param status for errors if it occurs | |
622 * @return The index of the last match, or <tt>USEARCH_DONE</tt> if there | |
623 * are no matches. | |
624 * @see #usearch_getOffset | |
625 * @see #USEARCH_DONE | |
626 * @stable ICU 2.4 | |
627 */ | |
628 U_STABLE int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, | |
629 UErrorCode *status); | |
630 | |
631 /** | |
632 * Returns the first index less than <tt>position</tt> at which the string text | |
633 * matches the search pattern. The iterator is adjusted so that its current | |
634 * index (as returned by <tt>usearch_getOffset</tt>) is the match position if | |
635 * one was found. | |
636 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and | |
637 * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. | |
638 * <p> | |
639 * Search positions that may render incorrect results are highlighted in the | |
640 * header comments. If position lies before or after the text range | |
641 * for searching, a U_INDEX_OUTOFBOUNDS_ERROR will be returned. | |
642 * @param strsrch search iterator data struct | |
643 * @param position index position the search is to begin at | |
644 * @param status for errors if it occurs | |
645 * @return The character index of the first match preceding <tt>position</tt>, | |
646 * or <tt>USEARCH_DONE</tt> if there are no matches. | |
647 * @see #usearch_getOffset | |
648 * @see #USEARCH_DONE | |
649 * @stable ICU 2.4 | |
650 */ | |
651 U_STABLE int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, | |
652 int32_t position, | |
653 UErrorCode *status); | |
654 | |
655 /** | |
656 * Returns the index of the next point at which the string text matches the | |
657 * search pattern, starting from the current position. | |
658 * The iterator is adjusted so that its current | |
659 * index (as returned by <tt>usearch_getOffset</tt>) is the match position if | |
660 * one was found. | |
661 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and | |
662 * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt> | |
663 * @param strsrch search iterator data struct | |
664 * @param status for errors if it occurs | |
665 * @return The index of the next match after the current position, or | |
666 * <tt>USEARCH_DONE</tt> if there are no more matches. | |
667 * @see #usearch_first | |
668 * @see #usearch_getOffset | |
669 * @see #USEARCH_DONE | |
670 * @stable ICU 2.4 | |
671 */ | |
672 U_STABLE int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, | |
673 UErrorCode *status); | |
674 | |
675 /** | |
676 * Returns the index of the previous point at which the string text matches | |
677 * the search pattern, starting at the current position. | |
678 * The iterator is adjusted so that its current | |
679 * index (as returned by <tt>usearch_getOffset</tt>) is the match position if | |
680 * one was found. | |
681 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and | |
682 * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt> | |
683 * @param strsrch search iterator data struct | |
684 * @param status for errors if it occurs | |
685 * @return The index of the previous match before the current position, | |
686 * or <tt>USEARCH_DONE</tt> if there are no more matches. | |
687 * @see #usearch_last | |
688 * @see #usearch_getOffset | |
689 * @see #USEARCH_DONE | |
690 * @stable ICU 2.4 | |
691 */ | |
692 U_STABLE int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, | |
693 UErrorCode *status); | |
694 | |
695 /** | |
696 * Resets the iteration. | |
697 * The search will begin at the start of the text string if a forward iteration | |
698 * is initiated before a backwards iteration. Otherwise, if a backwards | |
699 * iteration is initiated before a forward iteration, the search will begin | |
700 * at the end of the text string. | |
701 * @param strsrch search iterator data struct | |
702 * @see #usearch_first | |
703 * @stable ICU 2.4 | |
704 */ | |
705 U_STABLE void U_EXPORT2 usearch_reset(UStringSearch *strsrch); | |
706 | |
707 /** | |
708 * Simple forward search for the pattern, starting at a specified index, | |
709 * and using a default set of search options. | |
710 * | |
711 * This is an experimental function, and is not an official part of the | |
712 * ICU API. | |
713 * | |
714 * The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZATION_MODE, are honored. | |
715 * | |
716 * The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and | |
717 * any Break Iterator are ignored. | |
718 * | |
719 * Matches obey the following constraints: | |
720 * | |
721 * Characters at the start or end positions of a match that are ignorable | |
722 * for collation are not included as part of the match, unless they | |
723 * are part of a combining sequence, as described below. | |
724 * | |
725 * A match will not include a partial combining sequence. Combining | |
726 * character sequences are considered to be inseparable units, | |
727 * and either match the pattern completely, or are considered to not match | |
728 * at all. Thus, for example, an A followed by a combining accent mark will | |
729 * not be found when searching for a plain (unaccented) A (unless | |
730 * the collation strength has been set to ignore all accents). | |
731 * | |
732 * When beginning a search, the initial starting position, startIdx, | |
733 * is assumed to be an acceptable match boundary with respect to | |
734 * combining characters. A combining sequence that spans across the | |
735 * starting point will not suppress a match beginning at startIdx. | |
736 * | |
737 * Characters that expand to multiple collation elements | |
738 * (German sharp-S becoming 'ss', or the composed forms of accented | |
739 * characters, for example) also must match completely. | |
740 * Searching for a single 's' in a string containing only a sharp-s will | |
741 * find no match. | |
742 * | |
743 * | |
744 * @param strsrch the UStringSearch struct, which references both | |
745 * the text to be searched and the pattern being sought. | |
746 * @param startIdx The index into the text to begin the search. | |
747 * @param matchStart An out parameter, the starting index of the matched text. | |
748 * This parameter may be NULL. | |
749 * A value of -1 will be returned if no match was found. | |
750 * @param matchLimit Out parameter, the index of the first position following the matched text. | |
751 * The matchLimit will be at a suitable position for beginning a subsequent search | |
752 * in the input text. | |
753 * This parameter may be NULL. | |
754 * A value of -1 will be returned if no match was found. | |
755 * | |
756 * @param status Report any errors. Note that no match found is not an error. | |
757 * @return TRUE if a match was found, FALSE otherwise. | |
758 * | |
759 * @internal | |
760 */ | |
761 U_INTERNAL UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, | |
762 int32_t startIdx, | |
763 int32_t *matchStart, | |
764 int32_t *matchLimit, | |
765 UErrorCode *status); | |
766 | |
767 /** | |
768 * Simple backwards search for the pattern, starting at a specified index, | |
769 * and using a default set of search options. | |
770 * | |
771 * This is an experimental function, and is not an official part of the | |
772 * ICU API. | |
773 * | |
774 * The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZATION_MODE, are honored. | |
775 * | |
776 * The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and | |
777 * any Break Iterator are ignored. | |
778 * | |
779 * Matches obey the following constraints: | |
780 * | |
781 * Characters at the start or end positions of a match that are ignorable | |
782 * for collation are not included as part of the match, unless they | |
783 * are part of a combining sequence, as described below. | |
784 * | |
785 * A match will not include a partial combining sequence. Combining | |
786 * character sequences are considered to be inseparable units, | |
787 * and either match the pattern completely, or are considered to not match | |
788 * at all. Thus, for example, an A followed by a combining accent mark will | |
789 * not be found when searching for a plain (unaccented) A (unless | |
790 * the collation strength has been set to ignore all accents). | |
791 * | |
792 * When beginning a search, the initial starting position, startIdx, | |
793 * is assumed to be an acceptable match boundary with respect to | |
794 * combining characters. A combining sequence that spans across the | |
795 * starting point will not suppress a match beginning at startIdx. | |
796 * | |
797 * Characters that expand to multiple collation elements | |
798 * (German sharp-S becoming 'ss', or the composed forms of accented | |
799 * characters, for example) also must match completely. | |
800 * Searching for a single 's' in a string containing only a sharp-s will | |
801 * find no match. | |
802 * | |
803 * | |
804 * @param strsrch the UStringSearch struct, which references both | |
805 * the text to be searched and the pattern being sought. | |
806 * @param startIdx The index into the text to begin the search. | |
807 * @param matchStart An out parameter, the starting index of the matched text. | |
808 * This parameter may be NULL. | |
809 * A value of -1 will be returned if no match was found. | |
810 * @param matchLimit Out parameter, the index of the first position following the matched text. | |
811 * The matchLimit will be at a suitable position for beginning a subsequent search | |
812 * in the input text. | |
813 * This parameter may be NULL. | |
814 * A value of -1 will be returned if no match was found. | |
815 * | |
816 * @param status Report any errors. Note that no match found is not an error. | |
817 * @return TRUE if a match was found, FALSE otherwise. | |
818 * | |
819 * @internal | |
820 */ | |
821 U_INTERNAL UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, | |
822 int32_t startIdx, | |
823 int32_t *matchStart, | |
824 int32_t *matchLimit, | |
825 UErrorCode *status); | |
826 | |
827 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ | |
828 | |
829 #endif | |
OLD | NEW |