OLD | NEW |
| (Empty) |
1 /* | |
2 ******************************************************************************* | |
3 * Copyright (C) 1996-2010, International Business Machines Corporation and * | |
4 * others. All Rights Reserved. * | |
5 ******************************************************************************* | |
6 */ | |
7 | |
8 #ifndef CANITER_H | |
9 #define CANITER_H | |
10 | |
11 #include "unicode/utypes.h" | |
12 | |
13 #if !UCONFIG_NO_NORMALIZATION | |
14 | |
15 #include "unicode/uobject.h" | |
16 #include "unicode/unistr.h" | |
17 | |
18 /** | |
19 * \file | |
20 * \brief C++ API: Canonical Iterator | |
21 */ | |
22 | |
23 /** Whether permutation should skip characters with combining class zero. | |
24 * Must be either TRUE or FALSE. This is a compile-time option. | |
25 * @stable ICU 2.4 | |
26 */ | |
27 #ifndef CANITER_SKIP_ZEROES | |
28 #define CANITER_SKIP_ZEROES TRUE | |
29 #endif | |
30 | |
31 U_NAMESPACE_BEGIN | |
32 | |
33 class Hashtable; | |
34 class Normalizer2; | |
35 class Normalizer2Impl; | |
36 | |
37 /** | |
38 * This class allows one to iterate through all the strings that are canonically equivalent to a given | |
39 * string. For example, here are some sample results: | |
40 Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} | |
41 1: \\u0041\\u030A\\u0064\\u0307\\u0327 | |
42 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} | |
43 2: \\u0041\\u030A\\u0064\\u0327\\u0307 | |
44 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} | |
45 3: \\u0041\\u030A\\u1E0B\\u0327 | |
46 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} | |
47 4: \\u0041\\u030A\\u1E11\\u0307 | |
48 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} | |
49 5: \\u00C5\\u0064\\u0307\\u0327 | |
50 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} | |
51 6: \\u00C5\\u0064\\u0327\\u0307 | |
52 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} | |
53 7: \\u00C5\\u1E0B\\u0327 | |
54 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} | |
55 8: \\u00C5\\u1E11\\u0307 | |
56 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} | |
57 9: \\u212B\\u0064\\u0307\\u0327 | |
58 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} | |
59 10: \\u212B\\u0064\\u0327\\u0307 | |
60 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} | |
61 11: \\u212B\\u1E0B\\u0327 | |
62 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} | |
63 12: \\u212B\\u1E11\\u0307 | |
64 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} | |
65 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, | |
66 * since it has not been optimized for that situation. | |
67 * Note, CanonicalIterator is not intended to be subclassed. | |
68 * @author M. Davis | |
69 * @author C++ port by V. Weinstein | |
70 * @stable ICU 2.4 | |
71 */ | |
72 class U_COMMON_API CanonicalIterator : public UObject { | |
73 public: | |
74 /** | |
75 * Constructs a CanonicalIterator object. | |
76 * @param source string to get results for | |
77 * @param status Fill-in parameter which receives the status of this operation. | |
78 * @stable ICU 2.4 | |
79 */ | |
80 CanonicalIterator(const UnicodeString &source, UErrorCode &status); | |
81 | |
82 /** Destructor. | |
83 * Cleans up the pieces. | |
84 * @stable ICU 2.4 | |
85 */ | |
86 virtual ~CanonicalIterator(); | |
87 | |
88 /** | |
89 * Gets the NFD form of the current source we are iterating over. | |
90 * @return the source; NOTE: it is the NFD form of the source. | |
91 * @stable ICU 2.4 | |
92 */ | |
93 UnicodeString getSource(); | |
94 | |
95 /** | |
96 * Resets the iterator so that one can start again from the beginning. | |
97 * @stable ICU 2.4 | |
98 */ | |
99 void reset(); | |
100 | |
101 /** | |
102 * Gets the next canonically equivalent string. | |
103 * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b> | |
104 * @return the next string that is canonically equivalent. A bogus string is returned when | |
105 * the iteration is done. | |
106 * @stable ICU 2.4 | |
107 */ | |
108 UnicodeString next(); | |
109 | |
110 /** | |
111 * Sets a new source for this iterator. Allows object reuse. | |
112 * @param newSource the source string to iterate against. This allows the same iterator to be used | |
113 * while changing the source string, saving object creation. | |
114 * @param status Fill-in parameter which receives the status of this operation. | |
115 * @stable ICU 2.4 | |
116 */ | |
117 void setSource(const UnicodeString &newSource, UErrorCode &status); | |
118 | |
119 /** | |
120 * Dumb recursive implementation of permutation. | |
121 * TODO: optimize | |
122 * @param source the string to find permutations for | |
123 * @param skipZeros whether to skip zeros | |
124 * @param result the results in a set. | |
125 * @param status Fill-in parameter which receives the status of this operation. | |
126 * @internal | |
127 */ | |
128 static void U_EXPORT2 permute(UnicodeString &source, UBool skipZeros, Hashta
ble *result, UErrorCode &status); | |
129 | |
130 /** | |
131 * ICU "poor man's RTTI", returns a UClassID for this class. | |
132 * | |
133 * @stable ICU 2.2 | |
134 */ | |
135 static UClassID U_EXPORT2 getStaticClassID(); | |
136 | |
137 /** | |
138 * ICU "poor man's RTTI", returns a UClassID for the actual class. | |
139 * | |
140 * @stable ICU 2.2 | |
141 */ | |
142 virtual UClassID getDynamicClassID() const; | |
143 | |
144 private: | |
145 // ===================== PRIVATES ============================== | |
146 // private default constructor | |
147 CanonicalIterator(); | |
148 | |
149 | |
150 /** | |
151 * Copy constructor. Private for now. | |
152 * @internal | |
153 */ | |
154 CanonicalIterator(const CanonicalIterator& other); | |
155 | |
156 /** | |
157 * Assignment operator. Private for now. | |
158 * @internal | |
159 */ | |
160 CanonicalIterator& operator=(const CanonicalIterator& other); | |
161 | |
162 // fields | |
163 UnicodeString source; | |
164 UBool done; | |
165 | |
166 // 2-dimensional array that holds the pieces of the string with | |
167 // their different canonically equivalent representations | |
168 UnicodeString **pieces; | |
169 int32_t pieces_length; | |
170 int32_t *pieces_lengths; | |
171 | |
172 // current is used in iterating to combine pieces | |
173 int32_t *current; | |
174 int32_t current_length; | |
175 | |
176 // transient fields | |
177 UnicodeString buffer; | |
178 | |
179 const Normalizer2 &nfd; | |
180 const Normalizer2Impl &nfcImpl; | |
181 | |
182 // we have a segment, in NFD. Find all the strings that are canonically equi
valent to it. | |
183 UnicodeString *getEquivalents(const UnicodeString &segment, int32_t &result_
len, UErrorCode &status); //private String[] getEquivalents(String segment) | |
184 | |
185 //Set getEquivalents2(String segment); | |
186 Hashtable *getEquivalents2(Hashtable *fillinResult, const UChar *segment, in
t32_t segLen, UErrorCode &status); | |
187 //Hashtable *getEquivalents2(const UnicodeString &segment, int32_t segLen, U
ErrorCode &status); | |
188 | |
189 /** | |
190 * See if the decomposition of cp2 (presumably the comp parameter -- TODO confirm) is at segment starting at segmentPos | |
191 * (with canonical rearrangement!) | |
192 * If so, take the remainder, and return the equivalents | |
193 */ | |
194 //Set extract(int comp, String segment, int segmentPos, StringBuffer buffer)
; | |
195 Hashtable *extract(Hashtable *fillinResult, UChar32 comp, const UChar *segme
nt, int32_t segLen, int32_t segmentPos, UErrorCode &status); | |
196 //Hashtable *extract(UChar32 comp, const UnicodeString &segment, int32_t seg
Len, int32_t segmentPos, UErrorCode &status); | |
197 | |
198 void cleanPieces(); | |
199 | |
200 }; | |
201 | |
202 U_NAMESPACE_END | |
203 | |
204 #endif /* #if !UCONFIG_NO_NORMALIZATION */ | |
205 | |
206 #endif | |
OLD | NEW |