OLD | NEW |
| (Empty) |
1 /* | |
2 ****************************************************************************** | |
3 * Copyright (C) 1996-2010, International Business Machines * | |
4 * Corporation and others. All Rights Reserved. * | |
5 ****************************************************************************** | |
6 */ | |
7 | |
8 /** | |
9 * \file | |
10 * \brief C++ API: Boyer-Moore StringSearch technology preview | |
11 * \internal ICU 4.0.1 technology preview | |
12 */ | |
13 | |
14 #ifndef B_M_SEARCH_H | |
15 #define B_M_SEARCH_H | |
16 | |
17 #include "unicode/utypes.h" | |
18 | |
19 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION | |
20 | |
21 #include "unicode/uobject.h" | |
22 #include "unicode/ucol.h" | |
23 | |
24 #include "unicode/colldata.h" | |
25 | |
26 U_NAMESPACE_BEGIN | |
27 | |
28 class BadCharacterTable; | |
29 class GoodSuffixTable; | |
30 class Target; | |
31 | |
32 /** | |
33 * BoyerMooreSearch | |
34 * | |
35 * This object holds the information needed to do a Collation sensitive Boyer-Moore search. It encapsulates | |
36 * the pattern, the "bad character" and "good suffix" tables, the Collator-based
data needed to compute them, | |
37 * and a reference to the text being searched. | |
38 * | |
39 * To do a search, you first need to get a <code>CollData</code> object by calling <code>CollData::open</code>. | |
40 * Then you construct a <code>BoyerMooreSearch</code> object from the <code>Coll
Data</code> object, the pattern | |
41 * string and the target string. Then you call the <code>search</code> method. H
ere's a code sample: | |
42 * | |
43 * <pre> | |
44 * void boyerMooreExample(UCollator *collator, UnicodeString *pattern, UnicodeSt
ring *target) | |
45 * { | |
46 * UErrorCode status = U_ZERO_ERROR; | |
47 * CollData *collData = CollData::open(collator, status); | |
48 * | |
49 * if (U_FAILURE(status)) { | |
50 * // could not create a CollData object | |
51 * return; | |
52 * } | |
53 * | |
54 * BoyerMooreSearch *search = new BoyerMooreSearch(collData, *pattern, target, status); | |
55 * | |
56 * if (U_FAILURE(status)) { | |
57 * // could not create a BoyerMooreSearch object | |
58 * CollData::close(collData); | |
59 * return; | |
60 * } | |
61 * | |
62 * int32_t offset = 0, start = -1, end = -1; | |
63 * | |
64 * // Find all matches | |
65 * while (search->search(offset, start, end)) { | |
66 * // process the match between start and end | |
67 * ... | |
68 * // advance past the match | |
69 * offset = end; | |
70 * } | |
71 * | |
72 * // at this point, if offset == 0, there were no matches | |
73 * if (offset == 0) { | |
74 * // handle the case of no matches | |
75 * } | |
76 * | |
77 * delete search; | |
78 * CollData::close(collData); | |
79 * | |
80 * // CollData objects are cached, so the call to | |
81 * // CollData::close doesn't delete the object. | |
82 * // Call this if you don't need the object any more. | |
83 * CollData::flushCollDataCache(); | |
84 * } | |
85 * </pre> | |
86 * | |
87 * NOTE: This is a technology preview. The final version of this API may not bear any resemblance to this API. | |
88 * | |
89 * Known limitations: | |
90 * 1) Backwards searching has not been implemented. | |
91 * | |
92 * 2) For Han and Hangul characters, this code ignores any Collation tailorings. In general, | |
93 * this isn't a problem, but in Korean locales, at strength 1, Hangul characters are tailored | |
94 * to be equal to Han characters with the same pronunciation. Because this code ignores | |
95 * tailorings, searching for a Hangul character will not find a Han character and vice versa. | |
96 * | |
97 * 3) In some cases, searching for a pattern that needs to be normalized and e
nds | |
98 * in a discontiguous contraction may fail. The only known cases of this ar
e with | |
99 * the Tibetan script. For example searching for the pattern | |
100 * "\u0F7F\u0F80\u0F81\u0F82\u0F83\u0F84\u0F85" will fail. (This case is artificial. We've | |
101 * been unable to find a practical, real-world example of this failure.) | |
102 * | |
103 * @internal ICU 4.0.1 technology preview | |
104 * | |
105 * @see CollData | |
106 */ | |
107 class U_I18N_API BoyerMooreSearch : public UObject | |
108 { | |
109 public: | |
110 /** | |
111 * Construct a <code>BoyerMooreSearch</code> object. | |
112 * | |
113 * @param theData - A <code>CollData</code> object holding the Collator-sensitive data | |
114 * @param patternString - the string for which to search | |
115 * @param targetString - the string in which to search or <code>NULL</code> if you will | |
116 * set it later by calling <code>setTargetString</code>. | |
117 * @param status - will be set if any errors occur. | |
118 * | |
119 * Note: if on return, status is set to an error code, | |
120 * the only safe thing to do with this object is to call | |
121 * the destructor. | |
122 * | |
123 * @internal ICU 4.0.1 technology preview | |
124 */ | |
125 BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, cons
t UnicodeString *targetString, UErrorCode &status); | |
126 | |
127 /** | |
128 * The destructor | |
129 * | |
130 * @internal ICU 4.0.1 technology preview | |
131 */ | |
132 ~BoyerMooreSearch(); | |
133 | |
134 /** | |
135 * Test the pattern to see if it generates any CEs. | |
136 * | |
137 * @return <code>TRUE</code> if the pattern string did not generate any CEs | |
138 * | |
139 * @internal ICU 4.0.1 technology preview | |
140 */ | |
141 UBool empty(); | |
142 | |
143 /** | |
144 * Search for the pattern string in the target string. | |
145 * | |
146 * @param offset - the offset in the target string at which to begin the search | |
147 * @param start - will be set to the starting offset of the match, or -1 if there's no match | |
148 * @param end - will be set to the ending offset of the match, or -1 if there's no match | |
149 * | |
150 * @return <code>TRUE</code> if the match succeeds, <code>FALSE</code> otherwise. | |
151 * | |
152 * @internal ICU 4.0.1 technology preview | |
153 */ | |
154 UBool search(int32_t offset, int32_t &start, int32_t &end); | |
155 | |
156 /** | |
157 * Set the target string for the match. | |
158 * | |
159 * @param targetString - the new target string | |
160 * @param status - will be set if any errors occur. | |
161 * | |
162 * @internal ICU 4.0.1 technology preview | |
163 */ | |
164 void setTargetString(const UnicodeString *targetString, UErrorCode &status); | |
165 | |
166 // NOTE(review): the original author questioned whether the accessors below are still needed. | |
167 /** | |
168 * Return the <code>CollData</code> object used for searching | |
169 * | |
170 * @return the <code>CollData</code> object used for searching | |
171 * | |
172 * @internal ICU 4.0.1 technology preview | |
173 */ | |
174 CollData *getData(); | |
175 | |
176 /** | |
177 * Return the CEs generated by the pattern string. | |
178 * | |
179 * @return a <code>CEList</code> object holding the CEs generated by the pattern string. | |
180 * | |
181 * @internal ICU 4.0.1 technology preview | |
182 */ | |
183 CEList *getPatternCEs(); | |
184 | |
185 /** | |
186 * Return the <code>BadCharacterTable</code> object computed for the pattern string. | |
187 * | |
188 * @return the <code>BadCharacterTable</code> object. | |
189 * | |
190 * @internal ICU 4.0.1 technology preview | |
191 */ | |
192 BadCharacterTable *getBadCharacterTable(); | |
193 | |
194 /** | |
195 * Return the <code>GoodSuffixTable</code> object computed for the pattern string. | |
196 * | |
197 * @return the <code>GoodSuffixTable</code> object computed for the pattern string. | |
198 * | |
199 * @internal ICU 4.0.1 technology preview | |
200 */ | |
201 GoodSuffixTable *getGoodSuffixTable(); | |
202 | |
203 /** | |
204 * UObject glue... | |
205 * @internal ICU 4.0.1 technology preview | |
206 */ | |
207 virtual UClassID getDynamicClassID() const; | |
208 /** | |
209 * UObject glue... | |
210 * @internal ICU 4.0.1 technology preview | |
211 */ | |
212 static UClassID getStaticClassID(); | |
213 | |
214 private: | |
    // Internal search state.
    // NOTE(review): most members are raw pointers, and this class declares a destructor
    // but no copy constructor or assignment operator -- confirm ownership and copy
    // semantics against the implementation before copying instances.
215 CollData *data; | |
216 CEList *patCEs; | |
217 BadCharacterTable *badCharacterTable; | |
218 GoodSuffixTable *goodSuffixTable; | |
219 UnicodeString pattern; | |
220 Target *target; | |
221 }; | |
222 | |
223 U_NAMESPACE_END | |
224 | |
225 #endif // #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION | |
226 #endif // #ifndef B_M_SEARCH_H | |
OLD | NEW |