OLD | NEW |
1 /* | 1 /* |
2 ********************************************************************** | 2 ********************************************************************** |
3 * Copyright (C) 2001-2011 IBM and others. All rights reserved. | 3 * Copyright (C) 2001-2011,2014 IBM and others. All rights reserved. |
4 ********************************************************************** | 4 ********************************************************************** |
5 * Date Name Description | 5 * Date Name Description |
6 * 06/28/2001 synwee Creation. | 6 * 06/28/2001 synwee Creation. |
7 ********************************************************************** | 7 ********************************************************************** |
8 */ | 8 */ |
9 #ifndef USEARCH_H | 9 #ifndef USEARCH_H |
10 #define USEARCH_H | 10 #define USEARCH_H |
11 | 11 |
12 #include "unicode/utypes.h" | 12 #include "unicode/utypes.h" |
13 | 13 |
14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION | 14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION |
15 | 15 |
16 #include "unicode/localpointer.h" | 16 #include "unicode/localpointer.h" |
17 #include "unicode/ucol.h" | 17 #include "unicode/ucol.h" |
18 #include "unicode/ucoleitr.h" | 18 #include "unicode/ucoleitr.h" |
19 #include "unicode/ubrk.h" | 19 #include "unicode/ubrk.h" |
20 | 20 |
21 /** | 21 /** |
22 * \file | 22 * \file |
23 * \brief C API: StringSearch | 23 * \brief C API: StringSearch |
24 * | 24 * |
25 * C Apis for an engine that provides language-sensitive text searching based | 25 * C Apis for an engine that provides language-sensitive text searching based |
26 * on the comparison rules defined in a <tt>UCollator</tt> data struct, | 26 * on the comparison rules defined in a <tt>UCollator</tt> data struct, |
27 * see <tt>ucol.h</tt>. This ensures that language eccentricity can be | 27 * see <tt>ucol.h</tt>. This ensures that language eccentricity can be |
28 * handled, e.g. for the German collator, characters ß and SS will be matched | 28 * handled, e.g. for the German collator, characters ß and SS will be matched |
29 * if case is chosen to be ignored. | 29 * if case is chosen to be ignored. |
30 * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm"> | 30 * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm"> |
31 * "ICU Collation Design Document"</a> for more information. | 31 * "ICU Collation Design Document"</a> for more information. |
32 * <p> | 32 * <p> |
33 * The algorithm implemented is a modified form of the Boyer Moore's search. | 33 * The implementation may use a linear search or a modified form of the Boyer-Moore |
34 * For more information see | 34 * search; for more information on the latter see |
35 * <a href="http://icu-project.org/docs/papers/efficient_text_searching_in_java.html"> | 35 * <a href="http://icu-project.org/docs/papers/efficient_text_searching_in_java.html"> |
36 * "Efficient Text Searching in Java"</a>, published in <i>Java Report</i> | 36 * "Efficient Text Searching in Java"</a>, published in <i>Java Report</i> |
37 * in February, 1999, for further information on the algorithm. | 37 * in February, 1999. |
38 * <p> | 38 * <p> |
39 * There are 2 match options for selection:<br> | 39 * There are 2 match options for selection:<br> |
40 * Let S' be the sub-string of a text string S between the offsets start and | 40 * Let S' be the sub-string of a text string S between the offsets start and |
41 * end <start, end>. | 41 * end <start, end>. |
42 * <br> | 42 * <br> |
43 * A pattern string P matches a text string S at the offsets <start, end> | 43 * A pattern string P matches a text string S at the offsets <start, end> |
44 * if | 44 * if |
45 * <pre> | 45 * <pre> |
46 * option 1. Some canonical equivalent of P matches some canonical equivalent | 46 * option 1. Some canonical equivalent of P matches some canonical equivalent |
47 * of S' | 47 * of S' |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
84 * A breakiterator can be used if only matches at logical breaks are desired. | 84 * A breakiterator can be used if only matches at logical breaks are desired. |
85 * Using a breakiterator will only give you results that exactly matches the | 85 * Using a breakiterator will only give you results that exactly matches the |
86 * boundaries given by the breakiterator. For instance the pattern "e" will | 86 * boundaries given by the breakiterator. For instance the pattern "e" will |
87 * not be found in the string "\u00e9" if a character break iterator is used. | 87 * not be found in the string "\u00e9" if a character break iterator is used. |
88 * <p> | 88 * <p> |
89 * Options are provided to handle overlapping matches. | 89 * Options are provided to handle overlapping matches. |
90 * E.g. In English, overlapping matches produces the result 0 and 2 | 90 * E.g. In English, overlapping matches produces the result 0 and 2 |
91 * for the pattern "abab" in the text "ababab", where else mutually | 91 * for the pattern "abab" in the text "ababab", where else mutually |
92 * exclusive matches only produce the result of 0. | 92 * exclusive matches only produce the result of 0. |
93 * <p> | 93 * <p> |
| 94 * Options are also provided to implement "asymmetric search" as described in |
| 95 * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search"> |
| 96 * UTS #10 Unicode Collation Algorithm</a>, specifically the USearchAttribute |
| 97 * USEARCH_ELEMENT_COMPARISON and its values. |
| 98 * <p> |
94 * Though collator attributes will be taken into consideration while | 99 * Though collator attributes will be taken into consideration while |
95 * performing matches, there are no APIs here for setting and getting the | 100 * performing matches, there are no APIs here for setting and getting the |
96 * attributes. These attributes can be set by getting the collator | 101 * attributes. These attributes can be set by getting the collator |
97 * from <tt>usearch_getCollator</tt> and using the APIs in <tt>ucol.h</tt>. | 102 * from <tt>usearch_getCollator</tt> and using the APIs in <tt>ucol.h</tt>. |
98 * Lastly to update String Search to the new collator attributes, | 103 * Lastly to update String Search to the new collator attributes, |
99 * usearch_reset() has to be called. | 104 * usearch_reset() has to be called. |
100 * <p> | 105 * <p> |
101 * Restriction: <br> | 106 * Restriction: <br> |
102 * Currently there are no composite characters that consists of a | 107 * Currently there are no composite characters that consists of a |
103 * character with combining class > 0 before a character with combining | 108 * character with combining class > 0 before a character with combining |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
147 /** | 152 /** |
148 * Data structure for searching | 153 * Data structure for searching |
149 * @stable ICU 2.4 | 154 * @stable ICU 2.4 |
150 */ | 155 */ |
151 typedef struct UStringSearch UStringSearch; | 156 typedef struct UStringSearch UStringSearch; |
152 | 157 |
153 /** | 158 /** |
154 * @stable ICU 2.4 | 159 * @stable ICU 2.4 |
155 */ | 160 */ |
156 typedef enum { | 161 typedef enum { |
157 /** Option for overlapping matches */ | 162 /** |
158 USEARCH_OVERLAP, | 163 * Option for overlapping matches |
| 164 * @stable ICU 2.4 |
| 165 */ |
| 166 USEARCH_OVERLAP = 0, |
| 167 #ifndef U_HIDE_DEPRECATED_API |
159 /** | 168 /** |
160 * Option for canonical matches. option 1 in header documentation. | 169 * Option for canonical matches; option 1 in header documentation. |
161 * The default value will be USEARCH_OFF | 170 * The default value will be USEARCH_OFF. |
| 171 * Note: Setting this option to USEARCH_ON currently has no effect on |
| 172 * search behavior, and this option is deprecated. Instead, to control |
| 173 * canonical match behavior, you must set UCOL_NORMALIZATION_MODE |
| 174 * appropriately (to UCOL_OFF or UCOL_ON) in the UCollator used by |
| 175 * the UStringSearch object. |
| 176 * @see usearch_openFromCollator |
| 177 * @see usearch_getCollator |
| 178 * @see usearch_setCollator |
| 179 * @see ucol_getAttribute |
| 180 * @deprecated ICU 53 |
162 */ | 181 */ |
163 USEARCH_CANONICAL_MATCH, | 182 USEARCH_CANONICAL_MATCH = 1, |
| 183 #endif /* U_HIDE_DEPRECATED_API */ |
164 /** | 184 /** |
165 * Option to control how collation elements are compared. | 185 * Option to control how collation elements are compared. |
166 * The default value will be USEARCH_STANDARD_ELEMENT_COMPARISON. | 186 * The default value will be USEARCH_STANDARD_ELEMENT_COMPARISON. |
167 * @stable ICU 4.4 | 187 * @stable ICU 4.4 |
168 */ | 188 */ |
169 USEARCH_ELEMENT_COMPARISON, | 189 USEARCH_ELEMENT_COMPARISON = 2, |
170 | 190 |
171 USEARCH_ATTRIBUTE_COUNT | 191 /** |
| 192 * Count of attribute types |
| 193 * @stable ICU 2.4 |
| 194 */ |
| 195 USEARCH_ATTRIBUTE_COUNT = 3 |
172 } USearchAttribute; | 196 } USearchAttribute; |
173 | 197 |
174 /** | 198 /** |
175 * @stable ICU 2.4 | 199 * @stable ICU 2.4 |
176 */ | 200 */ |
177 typedef enum { | 201 typedef enum { |
178 /** Default value for any USearchAttribute */ | 202 /** |
| 203 * Default value for any USearchAttribute |
| 204 * @stable ICU 2.4 |
| 205 */ |
179 USEARCH_DEFAULT = -1, | 206 USEARCH_DEFAULT = -1, |
180 /** Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH */ | 207 /** |
| 208 * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH |
| 209 * @stable ICU 2.4 |
| 210 */ |
181 USEARCH_OFF, | 211 USEARCH_OFF, |
182 /** Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH */ | 212 /** |
| 213 * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH |
| 214 * @stable ICU 2.4 |
| 215 */ |
183 USEARCH_ON, | 216 USEARCH_ON, |
184 /** | 217 /** |
185 * Value (default) for USEARCH_ELEMENT_COMPARISON; | 218 * Value (default) for USEARCH_ELEMENT_COMPARISON; |
186 * standard collation element comparison at the specified collator | 219 * standard collation element comparison at the specified collator |
187 * strength. | 220 * strength. |
188 * @stable ICU 4.4 | 221 * @stable ICU 4.4 |
189 */ | 222 */ |
190 USEARCH_STANDARD_ELEMENT_COMPARISON, | 223 USEARCH_STANDARD_ELEMENT_COMPARISON, |
191 /** | 224 /** |
192 * Value for USEARCH_ELEMENT_COMPARISON; | 225 * Value for USEARCH_ELEMENT_COMPARISON; |
193 * collation element comparison is modified to effectively provide | 226 * collation element comparison is modified to effectively provide |
194 * behavior between the specified strength and strength - 1. Collation | 227 * behavior between the specified strength and strength - 1. Collation |
195 * elements in the pattern that have the base weight for the specified | 228 * elements in the pattern that have the base weight for the specified |
196 * strength are treated as "wildcards" that match an element with any | 229 * strength are treated as "wildcards" that match an element with any |
197 * other weight at that collation level in the searched text. For | 230 * other weight at that collation level in the searched text. For |
198 * example, with a secondary-strength English collator, a plain 'e' in | 231 * example, with a secondary-strength English collator, a plain 'e' in |
199 * the pattern will match a plain e or an e with any diacritic in the | 232 * the pattern will match a plain e or an e with any diacritic in the |
200 * searched text, but an e with diacritic in the pattern will only | 233 * searched text, but an e with diacritic in the pattern will only |
201 * match an e with the same diacritic in the searched text. | 234 * match an e with the same diacritic in the searched text. |
| 235 * |
| 236 * This supports "asymmetric search" as described in |
| 237 * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search"> |
| 238 * UTS #10 Unicode Collation Algorithm</a>. |
| 239 * |
202 * @stable ICU 4.4 | 240 * @stable ICU 4.4 |
203 */ | 241 */ |
204 USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD, | 242 USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD, |
205 /** | 243 /** |
206 * Value for USEARCH_ELEMENT_COMPARISON. | 244 * Value for USEARCH_ELEMENT_COMPARISON. |
207 * collation element comparison is modified to effectively provide | 245 * collation element comparison is modified to effectively provide |
208 * behavior between the specified strength and strength - 1. Collation | 246 * behavior between the specified strength and strength - 1. Collation |
209 * elements in either the pattern or the searched text that have the | 247 * elements in either the pattern or the searched text that have the |
210 * base weight for the specified strength are treated as "wildcards" | 248 * base weight for the specified strength are treated as "wildcards" |
211 * that match an element with any other weight at that collation level. | 249 * that match an element with any other weight at that collation level. |
212 * For example, with a secondary-strength English collator, a plain 'e' | 250 * For example, with a secondary-strength English collator, a plain 'e' |
213 * in the pattern will match a plain e or an e with any diacritic in the | 251 * in the pattern will match a plain e or an e with any diacritic in the |
214 * searched text, but an e with diacritic in the pattern will only | 252 * searched text, but an e with diacritic in the pattern will only |
215 * match an e with the same diacritic or a plain e in the searched text. | 253 * match an e with the same diacritic or a plain e in the searched text. |
| 254 * |
| 255 * This option is similar to "asymmetric search" as described in |
| 256 * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search"> |
| 257 * UTS #10 Unicode Collation Algorithm</a>, but also allows unmarked |
| 258 * characters in the searched text to match marked or unmarked versions of |
| 259 * that character in the pattern. |
| 260 * |
216 * @stable ICU 4.4 | 261 * @stable ICU 4.4 |
217 */ | 262 */ |
218 USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD, | 263 USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD, |
219 | 264 |
| 265 /** |
| 266 * Count of attribute values |
| 267 * @stable ICU 2.4 |
| 268 */ |
220 USEARCH_ATTRIBUTE_VALUE_COUNT | 269 USEARCH_ATTRIBUTE_VALUE_COUNT |
221 } USearchAttributeValue; | 270 } USearchAttributeValue; |
222 | 271 |
223 /* open and close ------------------------------------------------------ */ | 272 /* open and close ------------------------------------------------------ */ |
224 | 273 |
225 /** | 274 /** |
226 * Creating a search iterator data struct using the argument locale language | 275 * Creating a search iterator data struct using the argument locale language |
227 * rule set. A collator will be created in the process, which will be owned by | 276 * rule set. A collator will be created in the process, which will be owned by |
228 * this search and will be deleted in <tt>usearch_close</tt>. | 277 * this search and will be deleted in <tt>usearch_close</tt>. |
229 * @param pattern for matching | 278 * @param pattern for matching |
(...skipping 597 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
827 U_INTERNAL UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, | 876 U_INTERNAL UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, |
828 int32_t startIdx, | 877 int32_t startIdx, |
829 int32_t *matchStart, | 878 int32_t *matchStart, |
830 int32_t *matchLimit, | 879 int32_t *matchLimit, |
831 UErrorCode *status); | 880 UErrorCode *status); |
832 #endif /* U_HIDE_INTERNAL_API */ | 881 #endif /* U_HIDE_INTERNAL_API */ |
833 | 882 |
834 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ | 883 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ |
835 | 884 |
836 #endif | 885 #endif |
OLD | NEW |