OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 1996-2011, International Business Machines Corporation and * | 3 * Copyright (C) 1996-2014, International Business Machines Corporation and |
4 * others. All Rights Reserved. * | 4 * others. All Rights Reserved. |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 */ | 6 */ |
7 | 7 |
8 /* | 8 /* |
9 * File coleitr.cpp | 9 * File coleitr.cpp |
10 * | 10 * |
11 * | |
12 * | |
13 * Created by: Helena Shih | 11 * Created by: Helena Shih |
14 * | 12 * |
15 * Modification History: | 13 * Modification History: |
16 * | 14 * |
17 * Date Name Description | 15 * Date Name Description |
18 * | 16 * |
19 * 6/23/97 helena Adding comments to make code more readable. | 17 * 6/23/97 helena Adding comments to make code more readable. |
20 * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.ja
va | 18 * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.ja
va |
21 * 12/10/99 aliu Ported Thai collation support from Java. | 19 * 12/10/99 aliu Ported Thai collation support from Java. |
22 * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) | 20 * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) |
23 * 02/19/01 swquek Removed CollationElementsIterator() since it is | 21 * 02/19/01 swquek Removed CollationElementIterator() since it is |
24 * private constructor and no calls are made to it | 22 * private constructor and no calls are made to it |
| 23 * 2012-2014 markus Rewritten in C++ again. |
25 */ | 24 */ |
26 | 25 |
27 #include "unicode/utypes.h" | 26 #include "unicode/utypes.h" |
28 | 27 |
29 #if !UCONFIG_NO_COLLATION | 28 #if !UCONFIG_NO_COLLATION |
30 | 29 |
31 #include "unicode/coleitr.h" | 30 #include "unicode/coleitr.h" |
| 31 #include "unicode/tblcoll.h" |
32 #include "unicode/ustring.h" | 32 #include "unicode/ustring.h" |
33 #include "ucol_imp.h" | 33 #include "cmemory.h" |
| 34 #include "collation.h" |
| 35 #include "collationdata.h" |
| 36 #include "collationiterator.h" |
| 37 #include "collationsets.h" |
| 38 #include "collationtailoring.h" |
34 #include "uassert.h" | 39 #include "uassert.h" |
35 #include "cmemory.h" | 40 #include "uhash.h" |
36 | 41 #include "utf16collationiterator.h" |
| 42 #include "uvectr32.h" |
37 | 43 |
38 /* Constants --------------------------------------------------------------- */ | 44 /* Constants --------------------------------------------------------------- */ |
39 | 45 |
40 U_NAMESPACE_BEGIN | 46 U_NAMESPACE_BEGIN |
41 | 47 |
42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) | 48 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) |
43 | 49 |
44 /* CollationElementIterator public constructor/destructor ------------------ */ | 50 /* CollationElementIterator public constructor/destructor ------------------ */ |
45 | 51 |
46 CollationElementIterator::CollationElementIterator( | 52 CollationElementIterator::CollationElementIterator( |
47 const CollationElementIterator& other) | 53 const CollationElementIterator& other) |
48 : UObject(other), isDataOwned_(TRUE) | 54 : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offse
ts_(NULL) { |
49 { | |
50 UErrorCode status = U_ZERO_ERROR; | |
51 m_data_ = ucol_openElements(other.m_data_->iteratordata_.coll, NULL, 0, | |
52 &status); | |
53 | |
54 *this = other; | 55 *this = other; |
55 } | 56 } |
56 | 57 |
57 CollationElementIterator::~CollationElementIterator() | 58 CollationElementIterator::~CollationElementIterator() |
58 { | 59 { |
59 if (isDataOwned_) { | 60 delete iter_; |
60 ucol_closeElements(m_data_); | 61 delete offsets_; |
61 } | |
62 } | 62 } |
63 | 63 |
64 /* CollationElementIterator public methods --------------------------------- */ | 64 /* CollationElementIterator public methods --------------------------------- */ |
65 | 65 |
| 66 namespace { |
| 67 |
| 68 uint32_t getFirstHalf(uint32_t p, uint32_t lower32) { |
| 69 return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xf
f); |
| 70 } |
| 71 uint32_t getSecondHalf(uint32_t p, uint32_t lower32) { |
| 72 return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f); |
| 73 } |
| 74 UBool ceNeedsTwoParts(int64_t ce) { |
| 75 return (ce & INT64_C(0xffff00ff003f)) != 0; |
| 76 } |
| 77 |
| 78 } // namespace |
| 79 |
66 int32_t CollationElementIterator::getOffset() const | 80 int32_t CollationElementIterator::getOffset() const |
67 { | 81 { |
68 return ucol_getOffset(m_data_); | 82 if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) { |
| 83 // CollationIterator::previousCE() decrements the CEs length |
| 84 // while it pops CEs from its internal buffer. |
| 85 int32_t i = iter_->getCEsLength(); |
| 86 if (otherHalf_ != 0) { |
| 87 // Return the trailing CE offset while we are in the middle of a 64-
bit CE. |
| 88 ++i; |
| 89 } |
| 90 U_ASSERT(i < offsets_->size()); |
| 91 return offsets_->elementAti(i); |
| 92 } |
| 93 return iter_->getOffset(); |
69 } | 94 } |
70 | 95 |
71 /** | 96 /** |
72 * Get the ordering priority of the next character in the string. | 97 * Get the ordering priority of the next character in the string. |
73 * @return the next character's ordering. Returns NULLORDER if an error has | 98 * @return the next character's ordering. Returns NULLORDER if an error has |
74 * occured or if the end of string has been reached | 99 * occured or if the end of string has been reached |
75 */ | 100 */ |
76 int32_t CollationElementIterator::next(UErrorCode& status) | 101 int32_t CollationElementIterator::next(UErrorCode& status) |
77 { | 102 { |
78 return ucol_next(m_data_, &status); | 103 if (U_FAILURE(status)) { return NULLORDER; } |
| 104 if (dir_ > 1) { |
| 105 // Continue forward iteration. Test this first. |
| 106 if (otherHalf_ != 0) { |
| 107 uint32_t oh = otherHalf_; |
| 108 otherHalf_ = 0; |
| 109 return oh; |
| 110 } |
| 111 } else if (dir_ == 1) { |
| 112 // next() after setOffset() |
| 113 dir_ = 2; |
| 114 } else if (dir_ == 0) { |
| 115 // The iter_ is already reset to the start of the text. |
| 116 dir_ = 2; |
| 117 } else /* dir_ < 0 */ { |
| 118 // illegal change of direction |
| 119 status = U_INVALID_STATE_ERROR; |
| 120 return NULLORDER; |
| 121 } |
| 122 // No need to keep all CEs in the buffer when we iterate. |
| 123 iter_->clearCEsIfNoneRemaining(); |
| 124 int64_t ce = iter_->nextCE(status); |
| 125 if (ce == Collation::NO_CE) { return NULLORDER; } |
| 126 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits
. |
| 127 uint32_t p = (uint32_t)(ce >> 32); |
| 128 uint32_t lower32 = (uint32_t)ce; |
| 129 uint32_t firstHalf = getFirstHalf(p, lower32); |
| 130 uint32_t secondHalf = getSecondHalf(p, lower32); |
| 131 if (secondHalf != 0) { |
| 132 otherHalf_ = secondHalf | 0xc0; // continuation CE |
| 133 } |
| 134 return firstHalf; |
79 } | 135 } |
80 | 136 |
81 UBool CollationElementIterator::operator!=( | 137 UBool CollationElementIterator::operator!=( |
82 const CollationElementIterator& other) const | 138 const CollationElementIterator& other) const |
83 { | 139 { |
84 return !(*this == other); | 140 return !(*this == other); |
85 } | 141 } |
86 | 142 |
87 UBool CollationElementIterator::operator==( | 143 UBool CollationElementIterator::operator==( |
88 const CollationElementIterator& that) const | 144 const CollationElementIterator& that) const |
89 { | 145 { |
90 if (this == &that || m_data_ == that.m_data_) { | 146 if (this == &that) { |
91 return TRUE; | 147 return TRUE; |
92 } | 148 } |
93 | 149 |
94 // option comparison | 150 return |
95 if (m_data_->iteratordata_.coll != that.m_data_->iteratordata_.coll) | 151 (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && |
96 { | 152 otherHalf_ == that.otherHalf_ && |
97 return FALSE; | 153 normalizeDir() == that.normalizeDir() && |
98 } | 154 string_ == that.string_ && |
99 | 155 *iter_ == *that.iter_; |
100 // the constructor and setText always sets a length | |
101 // and we only compare the string not the contents of the normalization | |
102 // buffer | |
103 int thislength = (int)(m_data_->iteratordata_.endp - m_data_->iteratordata_.
string); | |
104 int thatlength = (int)(that.m_data_->iteratordata_.endp - that.m_data_->iter
atordata_.string); | |
105 | |
106 if (thislength != thatlength) { | |
107 return FALSE; | |
108 } | |
109 | |
110 if (uprv_memcmp(m_data_->iteratordata_.string, | |
111 that.m_data_->iteratordata_.string, | |
112 thislength * U_SIZEOF_UCHAR) != 0) { | |
113 return FALSE; | |
114 } | |
115 if (getOffset() != that.getOffset()) { | |
116 return FALSE; | |
117 } | |
118 | |
119 // checking normalization buffer | |
120 if ((m_data_->iteratordata_.flags & UCOL_ITER_HASLEN) == 0) { | |
121 if ((that.m_data_->iteratordata_.flags & UCOL_ITER_HASLEN) != 0) { | |
122 return FALSE; | |
123 } | |
124 // both are in the normalization buffer | |
125 if (m_data_->iteratordata_.pos | |
126 - m_data_->iteratordata_.writableBuffer.getBuffer() | |
127 != that.m_data_->iteratordata_.pos | |
128 - that.m_data_->iteratordata_.writableBuffer.getBuffer()) { | |
129 // not in the same position in the normalization buffer | |
130 return FALSE; | |
131 } | |
132 } | |
133 else if ((that.m_data_->iteratordata_.flags & UCOL_ITER_HASLEN) == 0) { | |
134 return FALSE; | |
135 } | |
136 // checking ce position | |
137 return (m_data_->iteratordata_.CEpos - m_data_->iteratordata_.CEs) | |
138 == (that.m_data_->iteratordata_.CEpos | |
139 - that.m_data_->iteratordata_.CEs); | |
140 } | 156 } |
141 | 157 |
142 /** | 158 /** |
143 * Get the ordering priority of the previous collation element in the string. | 159 * Get the ordering priority of the previous collation element in the string. |
144 * @param status the error code status. | 160 * @param status the error code status. |
145 * @return the previous element's ordering. Returns NULLORDER if an error has | 161 * @return the previous element's ordering. Returns NULLORDER if an error has |
146 * occured or if the start of string has been reached. | 162 * occured or if the start of string has been reached. |
147 */ | 163 */ |
148 int32_t CollationElementIterator::previous(UErrorCode& status) | 164 int32_t CollationElementIterator::previous(UErrorCode& status) |
149 { | 165 { |
150 return ucol_previous(m_data_, &status); | 166 if (U_FAILURE(status)) { return NULLORDER; } |
| 167 if (dir_ < 0) { |
| 168 // Continue backwards iteration. Test this first. |
| 169 if (otherHalf_ != 0) { |
| 170 uint32_t oh = otherHalf_; |
| 171 otherHalf_ = 0; |
| 172 return oh; |
| 173 } |
| 174 } else if (dir_ == 0) { |
| 175 iter_->resetToOffset(string_.length()); |
| 176 dir_ = -1; |
| 177 } else if (dir_ == 1) { |
| 178 // previous() after setOffset() |
| 179 dir_ = -1; |
| 180 } else /* dir_ > 1 */ { |
| 181 // illegal change of direction |
| 182 status = U_INVALID_STATE_ERROR; |
| 183 return NULLORDER; |
| 184 } |
| 185 if (offsets_ == NULL) { |
| 186 offsets_ = new UVector32(status); |
| 187 if (offsets_ == NULL) { |
| 188 status = U_MEMORY_ALLOCATION_ERROR; |
| 189 return NULLORDER; |
| 190 } |
| 191 } |
| 192 // If we already have expansion CEs, then we also have offsets. |
| 193 // Otherwise remember the trailing offset in case we need to |
| 194 // write offsets for an artificial expansion. |
| 195 int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; |
| 196 int64_t ce = iter_->previousCE(*offsets_, status); |
| 197 if (ce == Collation::NO_CE) { return NULLORDER; } |
| 198 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits
. |
| 199 uint32_t p = (uint32_t)(ce >> 32); |
| 200 uint32_t lower32 = (uint32_t)ce; |
| 201 uint32_t firstHalf = getFirstHalf(p, lower32); |
| 202 uint32_t secondHalf = getSecondHalf(p, lower32); |
| 203 if (secondHalf != 0) { |
| 204 if (offsets_->isEmpty()) { |
| 205 // When we convert a single 64-bit CE into two 32-bit CEs, |
| 206 // we need to make this artificial expansion behave like a normal ex
pansion. |
| 207 // See CollationIterator::previousCE(). |
| 208 offsets_->addElement(iter_->getOffset(), status); |
| 209 offsets_->addElement(limitOffset, status); |
| 210 } |
| 211 otherHalf_ = firstHalf; |
| 212 return secondHalf | 0xc0; // continuation CE |
| 213 } |
| 214 return firstHalf; |
151 } | 215 } |
152 | 216 |
153 /** | 217 /** |
154 * Resets the cursor to the beginning of the string. | 218 * Resets the cursor to the beginning of the string. |
155 */ | 219 */ |
156 void CollationElementIterator::reset() | 220 void CollationElementIterator::reset() |
157 { | 221 { |
158 ucol_reset(m_data_); | 222 iter_ ->resetToOffset(0); |
| 223 otherHalf_ = 0; |
| 224 dir_ = 0; |
159 } | 225 } |
160 | 226 |
161 void CollationElementIterator::setOffset(int32_t newOffset, | 227 void CollationElementIterator::setOffset(int32_t newOffset, |
162 UErrorCode& status) | 228 UErrorCode& status) |
163 { | 229 { |
164 ucol_setOffset(m_data_, newOffset, &status); | 230 if (U_FAILURE(status)) { return; } |
| 231 if (0 < newOffset && newOffset < string_.length()) { |
| 232 int32_t offset = newOffset; |
| 233 do { |
| 234 UChar c = string_.charAt(offset); |
| 235 if (!rbc_->isUnsafe(c) || |
| 236 (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset))
)) { |
| 237 break; |
| 238 } |
| 239 // Back up to before this unsafe character. |
| 240 --offset; |
| 241 } while (offset > 0); |
| 242 if (offset < newOffset) { |
| 243 // We might have backed up more than necessary. |
| 244 // For example, contractions "ch" and "cu" make both 'h' and 'u' uns
afe, |
| 245 // but for text "chu" setOffset(2) should remain at 2 |
| 246 // although we initially back up to offset 0. |
| 247 // Find the last safe offset no greater than newOffset by iterating
forward. |
| 248 int32_t lastSafeOffset = offset; |
| 249 do { |
| 250 iter_->resetToOffset(lastSafeOffset); |
| 251 do { |
| 252 iter_->nextCE(status); |
| 253 if (U_FAILURE(status)) { return; } |
| 254 } while ((offset = iter_->getOffset()) == lastSafeOffset); |
| 255 if (offset <= newOffset) { |
| 256 lastSafeOffset = offset; |
| 257 } |
| 258 } while (offset < newOffset); |
| 259 newOffset = lastSafeOffset; |
| 260 } |
| 261 } |
| 262 iter_->resetToOffset(newOffset); |
| 263 otherHalf_ = 0; |
| 264 dir_ = 1; |
165 } | 265 } |
166 | 266 |
167 /** | 267 /** |
168 * Sets the source to the new source string. | 268 * Sets the source to the new source string. |
169 */ | 269 */ |
170 void CollationElementIterator::setText(const UnicodeString& source, | 270 void CollationElementIterator::setText(const UnicodeString& source, |
171 UErrorCode& status) | 271 UErrorCode& status) |
172 { | 272 { |
173 if (U_FAILURE(status)) { | 273 if (U_FAILURE(status)) { |
174 return; | 274 return; |
175 } | 275 } |
176 | 276 |
177 int32_t length = source.length(); | 277 string_ = source; |
178 UChar *string = NULL; | 278 const UChar *s = string_.getBuffer(); |
179 if (m_data_->isWritable && m_data_->iteratordata_.string != NULL) { | 279 CollationIterator *newIter; |
180 uprv_free((UChar *)m_data_->iteratordata_.string); | 280 UBool numeric = rbc_->settings->isNumeric(); |
| 281 if (rbc_->settings->dontCheckFCD()) { |
| 282 newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + stri
ng_.length()); |
| 283 } else { |
| 284 newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + s
tring_.length()); |
181 } | 285 } |
182 m_data_->isWritable = TRUE; | 286 if (newIter == NULL) { |
183 if (length > 0) { | 287 status = U_MEMORY_ALLOCATION_ERROR; |
184 string = (UChar *)uprv_malloc(U_SIZEOF_UCHAR * length); | 288 return; |
185 /* test for NULL */ | |
186 if (string == NULL) { | |
187 status = U_MEMORY_ALLOCATION_ERROR; | |
188 return; | |
189 } | |
190 u_memcpy(string, source.getBuffer(), length); | |
191 } | 289 } |
192 else { | 290 delete iter_; |
193 string = (UChar *)uprv_malloc(U_SIZEOF_UCHAR); | 291 iter_ = newIter; |
194 /* test for NULL */ | 292 otherHalf_ = 0; |
195 if (string == NULL) { | 293 dir_ = 0; |
196 status = U_MEMORY_ALLOCATION_ERROR; | |
197 return; | |
198 } | |
199 *string = 0; | |
200 } | |
201 /* Free offsetBuffer before initializing it. */ | |
202 ucol_freeOffsetBuffer(&(m_data_->iteratordata_)); | |
203 uprv_init_collIterate(m_data_->iteratordata_.coll, string, length, | |
204 &m_data_->iteratordata_, &status); | |
205 | |
206 m_data_->reset_ = TRUE; | |
207 } | 294 } |
208 | 295 |
209 // Sets the source to the new character iterator. | 296 // Sets the source to the new character iterator. |
210 void CollationElementIterator::setText(CharacterIterator& source, | 297 void CollationElementIterator::setText(CharacterIterator& source, |
211 UErrorCode& status) | 298 UErrorCode& status) |
212 { | 299 { |
213 if (U_FAILURE(status)) | 300 if (U_FAILURE(status)) |
214 return; | 301 return; |
215 | 302 |
216 int32_t length = source.getLength(); | 303 source.getText(string_); |
217 UChar *buffer = NULL; | 304 setText(string_, status); |
218 | |
219 if (length == 0) { | |
220 buffer = (UChar *)uprv_malloc(U_SIZEOF_UCHAR); | |
221 /* test for NULL */ | |
222 if (buffer == NULL) { | |
223 status = U_MEMORY_ALLOCATION_ERROR; | |
224 return; | |
225 } | |
226 *buffer = 0; | |
227 } | |
228 else { | |
229 buffer = (UChar *)uprv_malloc(U_SIZEOF_UCHAR * length); | |
230 /* test for NULL */ | |
231 if (buffer == NULL) { | |
232 status = U_MEMORY_ALLOCATION_ERROR; | |
233 return; | |
234 } | |
235 /* | |
236 Using this constructor will prevent buffer from being removed when | |
237 string gets removed | |
238 */ | |
239 UnicodeString string; | |
240 source.getText(string); | |
241 u_memcpy(buffer, string.getBuffer(), length); | |
242 } | |
243 | |
244 if (m_data_->isWritable && m_data_->iteratordata_.string != NULL) { | |
245 uprv_free((UChar *)m_data_->iteratordata_.string); | |
246 } | |
247 m_data_->isWritable = TRUE; | |
248 /* Free offsetBuffer before initializing it. */ | |
249 ucol_freeOffsetBuffer(&(m_data_->iteratordata_)); | |
250 uprv_init_collIterate(m_data_->iteratordata_.coll, buffer, length, | |
251 &m_data_->iteratordata_, &status); | |
252 m_data_->reset_ = TRUE; | |
253 } | 305 } |
254 | 306 |
255 int32_t CollationElementIterator::strengthOrder(int32_t order) const | 307 int32_t CollationElementIterator::strengthOrder(int32_t order) const |
256 { | 308 { |
257 UCollationStrength s = ucol_getStrength(m_data_->iteratordata_.coll); | 309 UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength(); |
258 // Mask off the unwanted differences. | 310 // Mask off the unwanted differences. |
259 if (s == UCOL_PRIMARY) { | 311 if (s == UCOL_PRIMARY) { |
260 order &= RuleBasedCollator::PRIMARYDIFFERENCEONLY; | 312 order &= 0xffff0000; |
261 } | 313 } |
262 else if (s == UCOL_SECONDARY) { | 314 else if (s == UCOL_SECONDARY) { |
263 order &= RuleBasedCollator::SECONDARYDIFFERENCEONLY; | 315 order &= 0xffffff00; |
264 } | 316 } |
265 | 317 |
266 return order; | 318 return order; |
267 } | 319 } |
268 | 320 |
269 /* CollationElementIterator private constructors/destructors --------------- */ | 321 /* CollationElementIterator private constructors/destructors --------------- */ |
270 | 322 |
271 /** | 323 /** |
272 * This is the "real" constructor for this class; it constructs an iterator | 324 * This is the "real" constructor for this class; it constructs an iterator |
273 * over the source text using the specified collator | 325 * over the source text using the specified collator |
274 */ | 326 */ |
275 CollationElementIterator::CollationElementIterator( | 327 CollationElementIterator::CollationElementIterator( |
276 const UnicodeString& sourceText, | 328 const UnicodeString &source, |
277 const RuleBasedCollator* order, | 329 const RuleBasedCollator *coll, |
278 UErrorCode& status) | 330 UErrorCode &status) |
279 : isDataOwned_(TRUE) | 331 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { |
280 { | 332 setText(source, status); |
281 if (U_FAILURE(status)) { | |
282 return; | |
283 } | |
284 | |
285 int32_t length = sourceText.length(); | |
286 UChar *string = NULL; | |
287 | |
288 if (length > 0) { | |
289 string = (UChar *)uprv_malloc(U_SIZEOF_UCHAR * length); | |
290 /* test for NULL */ | |
291 if (string == NULL) { | |
292 status = U_MEMORY_ALLOCATION_ERROR; | |
293 return; | |
294 } | |
295 /* | |
296 Using this constructor will prevent buffer from being removed when | |
297 string gets removed | |
298 */ | |
299 u_memcpy(string, sourceText.getBuffer(), length); | |
300 } | |
301 else { | |
302 string = (UChar *)uprv_malloc(U_SIZEOF_UCHAR); | |
303 /* test for NULL */ | |
304 if (string == NULL) { | |
305 status = U_MEMORY_ALLOCATION_ERROR; | |
306 return; | |
307 } | |
308 *string = 0; | |
309 } | |
310 m_data_ = ucol_openElements(order->ucollator, string, length, &status); | |
311 | |
312 /* Test for buffer overflows */ | |
313 if (U_FAILURE(status)) { | |
314 return; | |
315 } | |
316 m_data_->isWritable = TRUE; | |
317 } | 333 } |
318 | 334 |
319 /** | 335 /** |
320 * This is the "real" constructor for this class; it constructs an iterator over | 336 * This is the "real" constructor for this class; it constructs an iterator over |
321 * the source text using the specified collator | 337 * the source text using the specified collator |
322 */ | 338 */ |
323 CollationElementIterator::CollationElementIterator( | 339 CollationElementIterator::CollationElementIterator( |
324 const CharacterIterator& sourceText, | 340 const CharacterIterator &source, |
325 const RuleBasedCollator* order, | 341 const RuleBasedCollator *coll, |
326 UErrorCode& status) | 342 UErrorCode &status) |
327 : isDataOwned_(TRUE) | 343 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { |
328 { | 344 // We only call source.getText() which should be const anyway. |
329 if (U_FAILURE(status)) | 345 setText(const_cast<CharacterIterator &>(source), status); |
330 return; | |
331 | |
332 // **** should I just drop this test? **** | |
333 /* | |
334 if ( sourceText.endIndex() != 0 ) | |
335 { | |
336 // A CollationElementIterator is really a two-layered beast. | |
337 // Internally it uses a Normalizer to munge the source text into a form | |
338 // where all "composed" Unicode characters (such as \u00FC) are split in
to a | |
339 // normal character and a combining accent character. | |
340 // Afterward, CollationElementIterator does its own processing to handle | |
341 // expanding and contracting collation sequences, ignorables, and so on. | |
342 | |
343 Normalizer::EMode decomp = order->getStrength() == Collator::IDENTICAL | |
344 ? Normalizer::NO_OP : order->getDecomposition(); | |
345 | |
346 text = new Normalizer(sourceText, decomp); | |
347 if (text == NULL) | |
348 status = U_MEMORY_ALLOCATION_ERROR; | |
349 } | |
350 */ | |
351 int32_t length = sourceText.getLength(); | |
352 UChar *buffer; | |
353 if (length > 0) { | |
354 buffer = (UChar *)uprv_malloc(U_SIZEOF_UCHAR * length); | |
355 /* test for NULL */ | |
356 if (buffer == NULL) { | |
357 status = U_MEMORY_ALLOCATION_ERROR; | |
358 return; | |
359 } | |
360 /* | |
361 Using this constructor will prevent buffer from being removed when | |
362 string gets removed | |
363 */ | |
364 UnicodeString string(buffer, length, length); | |
365 ((CharacterIterator &)sourceText).getText(string); | |
366 const UChar *temp = string.getBuffer(); | |
367 u_memcpy(buffer, temp, length); | |
368 } | |
369 else { | |
370 buffer = (UChar *)uprv_malloc(U_SIZEOF_UCHAR); | |
371 /* test for NULL */ | |
372 if (buffer == NULL) { | |
373 status = U_MEMORY_ALLOCATION_ERROR; | |
374 return; | |
375 } | |
376 *buffer = 0; | |
377 } | |
378 m_data_ = ucol_openElements(order->ucollator, buffer, length, &status); | |
379 | |
380 /* Test for buffer overflows */ | |
381 if (U_FAILURE(status)) { | |
382 return; | |
383 } | |
384 m_data_->isWritable = TRUE; | |
385 } | 346 } |
386 | 347 |
387 /* CollationElementIterator protected methods ----------------------------- */ | 348 /* CollationElementIterator private methods -------------------------------- */ |
388 | 349 |
389 const CollationElementIterator& CollationElementIterator::operator=( | 350 const CollationElementIterator& CollationElementIterator::operator=( |
390 const CollationElementIterator& other) | 351 const CollationElementIterator& other) |
391 { | 352 { |
392 if (this != &other) | 353 if (this == &other) { |
393 { | 354 return *this; |
394 UCollationElements *ucolelem = this->m_data_; | |
395 UCollationElements *otherucolelem = other.m_data_; | |
396 collIterate *coliter = &(ucolelem->iteratordata_); | |
397 collIterate *othercoliter = &(otherucolelem->iteratordata_); | |
398 int length = 0; | |
399 | |
400 // checking only UCOL_ITER_HASLEN is not enough here as we may be in | |
401 // the normalization buffer | |
402 length = (int)(othercoliter->endp - othercoliter->string); | |
403 | |
404 ucolelem->reset_ = otherucolelem->reset_; | |
405 ucolelem->isWritable = TRUE; | |
406 | |
407 /* create a duplicate of string */ | |
408 if (length > 0) { | |
409 coliter->string = (UChar *)uprv_malloc(length * U_SIZEOF_UCHAR); | |
410 if(coliter->string != NULL) { | |
411 uprv_memcpy((UChar *)coliter->string, othercoliter->string, | |
412 length * U_SIZEOF_UCHAR); | |
413 } else { // Error: couldn't allocate memory. No copying should be do
ne | |
414 length = 0; | |
415 } | |
416 } | |
417 else { | |
418 coliter->string = NULL; | |
419 } | |
420 | |
421 /* start and end of string */ | |
422 coliter->endp = coliter->string == NULL ? NULL : coliter->string + lengt
h; | |
423 | |
424 /* handle writable buffer here */ | |
425 | |
426 if (othercoliter->flags & UCOL_ITER_INNORMBUF) { | |
427 coliter->writableBuffer = othercoliter->writableBuffer; | |
428 coliter->writableBuffer.getTerminatedBuffer(); | |
429 } | |
430 | |
431 /* current position */ | |
432 if (othercoliter->pos >= othercoliter->string && | |
433 othercoliter->pos <= othercoliter->endp) | |
434 { | |
435 U_ASSERT(coliter->string != NULL); | |
436 coliter->pos = coliter->string + | |
437 (othercoliter->pos - othercoliter->string); | |
438 } | |
439 else { | |
440 coliter->pos = coliter->writableBuffer.getTerminatedBuffer() + | |
441 (othercoliter->pos - othercoliter->writableBuffer.getBuffer()); | |
442 } | |
443 | |
444 /* CE buffer */ | |
445 int32_t CEsize; | |
446 if (coliter->extendCEs) { | |
447 uprv_memcpy(coliter->CEs, othercoliter->CEs, sizeof(uint32_t) * UCOL
_EXPAND_CE_BUFFER_SIZE); | |
448 CEsize = sizeof(othercoliter->extendCEs); | |
449 if (CEsize > 0) { | |
450 othercoliter->extendCEs = (uint32_t *)uprv_malloc(CEsize); | |
451 uprv_memcpy(coliter->extendCEs, othercoliter->extendCEs, CEsize)
; | |
452 } | |
453 coliter->toReturn = coliter->extendCEs + | |
454 (othercoliter->toReturn - othercoliter->extendCEs); | |
455 coliter->CEpos = coliter->extendCEs + CEsize; | |
456 } else { | |
457 CEsize = (int32_t)(othercoliter->CEpos - othercoliter->CEs); | |
458 if (CEsize > 0) { | |
459 uprv_memcpy(coliter->CEs, othercoliter->CEs, CEsize); | |
460 } | |
461 coliter->toReturn = coliter->CEs + | |
462 (othercoliter->toReturn - othercoliter->CEs); | |
463 coliter->CEpos = coliter->CEs + CEsize; | |
464 } | |
465 | |
466 if (othercoliter->fcdPosition != NULL) { | |
467 U_ASSERT(coliter->string != NULL); | |
468 coliter->fcdPosition = coliter->string + | |
469 (othercoliter->fcdPosition | |
470 - othercoliter->string); | |
471 } | |
472 else { | |
473 coliter->fcdPosition = NULL; | |
474 } | |
475 coliter->flags = othercoliter->flags/*| UCOL_ITER_HASLEN*/; | |
476 coliter->origFlags = othercoliter->origFlags; | |
477 coliter->coll = othercoliter->coll; | |
478 this->isDataOwned_ = TRUE; | |
479 } | 355 } |
480 | 356 |
| 357 CollationIterator *newIter; |
| 358 const FCDUTF16CollationIterator *otherFCDIter = |
| 359 dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); |
| 360 if(otherFCDIter != NULL) { |
| 361 newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer
()); |
| 362 } else { |
| 363 const UTF16CollationIterator *otherIter = |
| 364 dynamic_cast<const UTF16CollationIterator *>(other.iter_); |
| 365 if(otherIter != NULL) { |
| 366 newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()
); |
| 367 } else { |
| 368 newIter = NULL; |
| 369 } |
| 370 } |
| 371 if(newIter != NULL) { |
| 372 delete iter_; |
| 373 iter_ = newIter; |
| 374 rbc_ = other.rbc_; |
| 375 otherHalf_ = other.otherHalf_; |
| 376 dir_ = other.dir_; |
| 377 |
| 378 string_ = other.string_; |
| 379 } |
| 380 if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) { |
| 381 UErrorCode errorCode = U_ZERO_ERROR; |
| 382 if(offsets_ == NULL) { |
| 383 offsets_ = new UVector32(other.offsets_->size(), errorCode); |
| 384 } |
| 385 if(offsets_ != NULL) { |
| 386 offsets_->assign(*other.offsets_, errorCode); |
| 387 } |
| 388 } |
481 return *this; | 389 return *this; |
482 } | 390 } |
483 | 391 |
| 392 namespace { |
| 393 |
| 394 class MaxExpSink : public ContractionsAndExpansions::CESink { |
| 395 public: |
| 396 MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec)
{} |
| 397 virtual ~MaxExpSink(); |
| 398 virtual void handleCE(int64_t /*ce*/) {} |
| 399 virtual void handleExpansion(const int64_t ces[], int32_t length) { |
| 400 if (length <= 1) { |
| 401 // We do not need to add single CEs into the map. |
| 402 return; |
| 403 } |
| 404 int32_t count = 0; // number of CE "halves" |
| 405 for (int32_t i = 0; i < length; ++i) { |
| 406 count += ceNeedsTwoParts(ces[i]) ? 2 : 1; |
| 407 } |
| 408 // last "half" of the last CE |
| 409 int64_t ce = ces[length - 1]; |
| 410 uint32_t p = (uint32_t)(ce >> 32); |
| 411 uint32_t lower32 = (uint32_t)ce; |
| 412 uint32_t lastHalf = getSecondHalf(p, lower32); |
| 413 if (lastHalf == 0) { |
| 414 lastHalf = getFirstHalf(p, lower32); |
| 415 U_ASSERT(lastHalf != 0); |
| 416 } else { |
| 417 lastHalf |= 0xc0; // old-style continuation CE |
| 418 } |
| 419 if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) { |
| 420 uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode); |
| 421 } |
| 422 } |
| 423 |
| 424 private: |
| 425 UHashtable *maxExpansions; |
| 426 UErrorCode &errorCode; |
| 427 }; |
| 428 |
| 429 MaxExpSink::~MaxExpSink() {} |
| 430 |
| 431 } // namespace |
| 432 |
| 433 UHashtable * |
| 434 CollationElementIterator::computeMaxExpansions(const CollationData *data, UError
Code &errorCode) { |
| 435 if (U_FAILURE(errorCode)) { return NULL; } |
| 436 UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, |
| 437 uhash_compareLong, &errorCode); |
| 438 if (U_FAILURE(errorCode)) { return NULL; } |
| 439 MaxExpSink sink(maxExpansions, errorCode); |
| 440 ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode); |
| 441 if (U_FAILURE(errorCode)) { |
| 442 uhash_close(maxExpansions); |
| 443 return NULL; |
| 444 } |
| 445 return maxExpansions; |
| 446 } |
| 447 |
| 448 int32_t |
| 449 CollationElementIterator::getMaxExpansion(int32_t order) const { |
| 450 return getMaxExpansion(rbc_->tailoring->maxExpansions, order); |
| 451 } |
| 452 |
| 453 int32_t |
| 454 CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32
_t order) { |
| 455 if (order == 0) { return 1; } |
| 456 int32_t max; |
| 457 if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0)
{ |
| 458 return max; |
| 459 } |
| 460 if ((order & 0xc0) == 0xc0) { |
| 461 // old-style continuation CE |
| 462 return 2; |
| 463 } else { |
| 464 return 1; |
| 465 } |
| 466 } |
| 467 |
484 U_NAMESPACE_END | 468 U_NAMESPACE_END |
485 | 469 |
486 #endif /* #if !UCONFIG_NO_COLLATION */ | 470 #endif /* #if !UCONFIG_NO_COLLATION */ |
487 | |
488 /* eof */ | |
OLD | NEW |