| OLD | NEW |
| 1 /* | 1 /* |
| 2 ******************************************************************************* | 2 ******************************************************************************* |
| 3 * Copyright (C) 1996-2011, International Business Machines Corporation and * | 3 * Copyright (C) 1996-2014, International Business Machines Corporation and |
| 4 * others. All Rights Reserved. * | 4 * others. All Rights Reserved. |
| 5 ******************************************************************************* | 5 ******************************************************************************* |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 /* | 8 /* |
| 9 * File coleitr.cpp | 9 * File coleitr.cpp |
| 10 * | 10 * |
| 11 * | |
| 12 * | |
| 13 * Created by: Helena Shih | 11 * Created by: Helena Shih |
| 14 * | 12 * |
| 15 * Modification History: | 13 * Modification History: |
| 16 * | 14 * |
| 17 * Date Name Description | 15 * Date Name Description |
| 18 * | 16 * |
| 19 * 6/23/97 helena Adding comments to make code more readable. | 17 * 6/23/97 helena Adding comments to make code more readable. |
| 20 * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java | 18 * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java |
| 21 * 12/10/99 aliu Ported Thai collation support from Java. | 19 * 12/10/99 aliu Ported Thai collation support from Java. |
| 22 * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) | 20 * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) |
| 23 * 02/19/01 swquek Removed CollationElementsIterator() since it is | 21 * 02/19/01 swquek Removed CollationElementIterator() since it is |
| 24 * private constructor and no calls are made to it | 22 * private constructor and no calls are made to it |
| 23 * 2012-2014 markus Rewritten in C++ again. |
| 25 */ | 24 */ |
| 26 | 25 |
| 27 #include "unicode/utypes.h" | 26 #include "unicode/utypes.h" |
| 28 | 27 |
| 29 #if !UCONFIG_NO_COLLATION | 28 #if !UCONFIG_NO_COLLATION |
| 30 | 29 |
| 31 #include "unicode/coleitr.h" | 30 #include "unicode/coleitr.h" |
| 31 #include "unicode/tblcoll.h" |
| 32 #include "unicode/ustring.h" | 32 #include "unicode/ustring.h" |
| 33 #include "ucol_imp.h" | 33 #include "cmemory.h" |
| 34 #include "collation.h" |
| 35 #include "collationdata.h" |
| 36 #include "collationiterator.h" |
| 37 #include "collationsets.h" |
| 38 #include "collationtailoring.h" |
| 34 #include "uassert.h" | 39 #include "uassert.h" |
| 35 #include "cmemory.h" | 40 #include "uhash.h" |
| 36 | 41 #include "utf16collationiterator.h" |
| 42 #include "uvectr32.h" |
| 37 | 43 |
| 38 /* Constants --------------------------------------------------------------- */ | 44 /* Constants --------------------------------------------------------------- */ |
| 39 | 45 |
| 40 U_NAMESPACE_BEGIN | 46 U_NAMESPACE_BEGIN |
| 41 | 47 |
| 42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) | 48 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) |
| 43 | 49 |
| 44 /* CollationElementIterator public constructor/destructor ------------------ */ | 50 /* CollationElementIterator public constructor/destructor ------------------ */ |
| 45 | 51 |
| 46 CollationElementIterator::CollationElementIterator( | 52 CollationElementIterator::CollationElementIterator( |
| 47 const CollationElementIterator& other) | 53 const CollationElementIterator& other) |
| 48 : UObject(other), isDataOwned_(TRUE) | 54 : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offse
ts_(NULL) { |
| 49 { | |
| 50 UErrorCode status = U_ZERO_ERROR; | |
| 51 m_data_ = ucol_openElements(other.m_data_->iteratordata_.coll, NULL, 0, | |
| 52 &status); | |
| 53 | |
| 54 *this = other; | 55 *this = other; |
| 55 } | 56 } |
| 56 | 57 |
| 57 CollationElementIterator::~CollationElementIterator() | 58 CollationElementIterator::~CollationElementIterator() |
| 58 { | 59 { |
| 59 if (isDataOwned_) { | 60 delete iter_; |
| 60 ucol_closeElements(m_data_); | 61 delete offsets_; |
| 61 } | |
| 62 } | 62 } |
| 63 | 63 |
| 64 /* CollationElementIterator public methods --------------------------------- */ | 64 /* CollationElementIterator public methods --------------------------------- */ |
| 65 | 65 |
| 66 namespace { |
| 67 |
| 68 uint32_t getFirstHalf(uint32_t p, uint32_t lower32) { |
| 69 return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xf
f); |
| 70 } |
| 71 uint32_t getSecondHalf(uint32_t p, uint32_t lower32) { |
| 72 return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f); |
| 73 } |
| 74 UBool ceNeedsTwoParts(int64_t ce) { |
| 75 return (ce & INT64_C(0xffff00ff003f)) != 0; |
| 76 } |
| 77 |
| 78 } // namespace |
| 79 |
| 66 int32_t CollationElementIterator::getOffset() const | 80 int32_t CollationElementIterator::getOffset() const |
| 67 { | 81 { |
| 68 return ucol_getOffset(m_data_); | 82 if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) { |
| 83 // CollationIterator::previousCE() decrements the CEs length |
| 84 // while it pops CEs from its internal buffer. |
| 85 int32_t i = iter_->getCEsLength(); |
| 86 if (otherHalf_ != 0) { |
| 87 // Return the trailing CE offset while we are in the middle of a 64-
bit CE. |
| 88 ++i; |
| 89 } |
| 90 U_ASSERT(i < offsets_->size()); |
| 91 return offsets_->elementAti(i); |
| 92 } |
| 93 return iter_->getOffset(); |
| 69 } | 94 } |
| 70 | 95 |
| 71 /** | 96 /** |
| 72 * Get the ordering priority of the next character in the string. | 97 * Get the ordering priority of the next character in the string. |
| 73 * @return the next character's ordering. Returns NULLORDER if an error has | 98 * @return the next character's ordering. Returns NULLORDER if an error has |
| 74 * occurred or if the end of string has been reached | 99 * occurred or if the end of string has been reached |
| 75 */ | 100 */ |
| 76 int32_t CollationElementIterator::next(UErrorCode& status) | 101 int32_t CollationElementIterator::next(UErrorCode& status) |
| 77 { | 102 { |
| 78 return ucol_next(m_data_, &status); | 103 if (U_FAILURE(status)) { return NULLORDER; } |
| 104 if (dir_ > 1) { |
| 105 // Continue forward iteration. Test this first. |
| 106 if (otherHalf_ != 0) { |
| 107 uint32_t oh = otherHalf_; |
| 108 otherHalf_ = 0; |
| 109 return oh; |
| 110 } |
| 111 } else if (dir_ == 1) { |
| 112 // next() after setOffset() |
| 113 dir_ = 2; |
| 114 } else if (dir_ == 0) { |
| 115 // The iter_ is already reset to the start of the text. |
| 116 dir_ = 2; |
| 117 } else /* dir_ < 0 */ { |
| 118 // illegal change of direction |
| 119 status = U_INVALID_STATE_ERROR; |
| 120 return NULLORDER; |
| 121 } |
| 122 // No need to keep all CEs in the buffer when we iterate. |
| 123 iter_->clearCEsIfNoneRemaining(); |
| 124 int64_t ce = iter_->nextCE(status); |
| 125 if (ce == Collation::NO_CE) { return NULLORDER; } |
| 126 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits
. |
| 127 uint32_t p = (uint32_t)(ce >> 32); |
| 128 uint32_t lower32 = (uint32_t)ce; |
| 129 uint32_t firstHalf = getFirstHalf(p, lower32); |
| 130 uint32_t secondHalf = getSecondHalf(p, lower32); |
| 131 if (secondHalf != 0) { |
| 132 otherHalf_ = secondHalf | 0xc0; // continuation CE |
| 133 } |
| 134 return firstHalf; |
| 79 } | 135 } |
| 80 | 136 |
| 81 UBool CollationElementIterator::operator!=( | 137 UBool CollationElementIterator::operator!=( |
| 82 const CollationElementIterator& other) const | 138 const CollationElementIterator& other) const |
| 83 { | 139 { |
| 84 return !(*this == other); | 140 return !(*this == other); |
| 85 } | 141 } |
| 86 | 142 |
| 87 UBool CollationElementIterator::operator==( | 143 UBool CollationElementIterator::operator==( |
| 88 const CollationElementIterator& that) const | 144 const CollationElementIterator& that) const |
| 89 { | 145 { |
| 90 if (this == &that || m_data_ == that.m_data_) { | 146 if (this == &that) { |
| 91 return TRUE; | 147 return TRUE; |
| 92 } | 148 } |
| 93 | 149 |
| 94 // option comparison | 150 return |
| 95 if (m_data_->iteratordata_.coll != that.m_data_->iteratordata_.coll) | 151 (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && |
| 96 { | 152 otherHalf_ == that.otherHalf_ && |
| 97 return FALSE; | 153 normalizeDir() == that.normalizeDir() && |
| 98 } | 154 string_ == that.string_ && |
| 99 | 155 *iter_ == *that.iter_; |
| 100 // the constructor and setText always sets a length | |
| 101 // and we only compare the string not the contents of the normalization | |
| 102 // buffer | |
| 103 int thislength = (int)(m_data_->iteratordata_.endp - m_data_->iteratordata_.
string); | |
| 104 int thatlength = (int)(that.m_data_->iteratordata_.endp - that.m_data_->iter
atordata_.string); | |
| 105 | |
| 106 if (thislength != thatlength) { | |
| 107 return FALSE; | |
| 108 } | |
| 109 | |
| 110 if (uprv_memcmp(m_data_->iteratordata_.string, | |
| 111 that.m_data_->iteratordata_.string, | |
| 112 thislength * U_SIZEOF_UCHAR) != 0) { | |
| 113 return FALSE; | |
| 114 } | |
| 115 if (getOffset() != that.getOffset()) { | |
| 116 return FALSE; | |
| 117 } | |
| 118 | |
| 119 // checking normalization buffer | |
| 120 if ((m_data_->iteratordata_.flags & UCOL_ITER_HASLEN) == 0) { | |
| 121 if ((that.m_data_->iteratordata_.flags & UCOL_ITER_HASLEN) != 0) { | |
| 122 return FALSE; | |
| 123 } | |
| 124 // both are in the normalization buffer | |
| 125 if (m_data_->iteratordata_.pos | |
| 126 - m_data_->iteratordata_.writableBuffer.getBuffer() | |
| 127 != that.m_data_->iteratordata_.pos | |
| 128 - that.m_data_->iteratordata_.writableBuffer.getBuffer()) { | |
| 129 // not in the same position in the normalization buffer | |
| 130 return FALSE; | |
| 131 } | |
| 132 } | |
| 133 else if ((that.m_data_->iteratordata_.flags & UCOL_ITER_HASLEN) == 0) { | |
| 134 return FALSE; | |
| 135 } | |
| 136 // checking ce position | |
| 137 return (m_data_->iteratordata_.CEpos - m_data_->iteratordata_.CEs) | |
| 138 == (that.m_data_->iteratordata_.CEpos | |
| 139 - that.m_data_->iteratordata_.CEs); | |
| 140 } | 156 } |
| 141 | 157 |
| 142 /** | 158 /** |
| 143 * Get the ordering priority of the previous collation element in the string. | 159 * Get the ordering priority of the previous collation element in the string. |
| 144 * @param status the error code status. | 160 * @param status the error code status. |
| 145 * @return the previous element's ordering. Returns NULLORDER if an error has | 161 * @return the previous element's ordering. Returns NULLORDER if an error has |
| 146 * occurred or if the start of string has been reached. | 162 * occurred or if the start of string has been reached. |
| 147 */ | 163 */ |
| 148 int32_t CollationElementIterator::previous(UErrorCode& status) | 164 int32_t CollationElementIterator::previous(UErrorCode& status) |
| 149 { | 165 { |
| 150 return ucol_previous(m_data_, &status); | 166 if (U_FAILURE(status)) { return NULLORDER; } |
| 167 if (dir_ < 0) { |
| 168 // Continue backwards iteration. Test this first. |
| 169 if (otherHalf_ != 0) { |
| 170 uint32_t oh = otherHalf_; |
| 171 otherHalf_ = 0; |
| 172 return oh; |
| 173 } |
| 174 } else if (dir_ == 0) { |
| 175 iter_->resetToOffset(string_.length()); |
| 176 dir_ = -1; |
| 177 } else if (dir_ == 1) { |
| 178 // previous() after setOffset() |
| 179 dir_ = -1; |
| 180 } else /* dir_ > 1 */ { |
| 181 // illegal change of direction |
| 182 status = U_INVALID_STATE_ERROR; |
| 183 return NULLORDER; |
| 184 } |
| 185 if (offsets_ == NULL) { |
| 186 offsets_ = new UVector32(status); |
| 187 if (offsets_ == NULL) { |
| 188 status = U_MEMORY_ALLOCATION_ERROR; |
| 189 return NULLORDER; |
| 190 } |
| 191 } |
| 192 // If we already have expansion CEs, then we also have offsets. |
| 193 // Otherwise remember the trailing offset in case we need to |
| 194 // write offsets for an artificial expansion. |
| 195 int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; |
| 196 int64_t ce = iter_->previousCE(*offsets_, status); |
| 197 if (ce == Collation::NO_CE) { return NULLORDER; } |
| 198 // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits
. |
| 199 uint32_t p = (uint32_t)(ce >> 32); |
| 200 uint32_t lower32 = (uint32_t)ce; |
| 201 uint32_t firstHalf = getFirstHalf(p, lower32); |
| 202 uint32_t secondHalf = getSecondHalf(p, lower32); |
| 203 if (secondHalf != 0) { |
| 204 if (offsets_->isEmpty()) { |
| 205 // When we convert a single 64-bit CE into two 32-bit CEs, |
| 206 // we need to make this artificial expansion behave like a normal ex
pansion. |
| 207 // See CollationIterator::previousCE(). |
| 208 offsets_->addElement(iter_->getOffset(), status); |
| 209 offsets_->addElement(limitOffset, status); |
| 210 } |
| 211 otherHalf_ = firstHalf; |
| 212 return secondHalf | 0xc0; // continuation CE |
| 213 } |
| 214 return firstHalf; |
| 151 } | 215 } |
| 152 | 216 |
| 153 /** | 217 /** |
| 154 * Resets the cursor to the beginning of the string. | 218 * Resets the cursor to the beginning of the string. |
| 155 */ | 219 */ |
| 156 void CollationElementIterator::reset() | 220 void CollationElementIterator::reset() |
| 157 { | 221 { |
| 158 ucol_reset(m_data_); | 222 iter_ ->resetToOffset(0); |
| 223 otherHalf_ = 0; |
| 224 dir_ = 0; |
| 159 } | 225 } |
| 160 | 226 |
| 161 void CollationElementIterator::setOffset(int32_t newOffset, | 227 void CollationElementIterator::setOffset(int32_t newOffset, |
| 162 UErrorCode& status) | 228 UErrorCode& status) |
| 163 { | 229 { |
| 164 ucol_setOffset(m_data_, newOffset, &status); | 230 if (U_FAILURE(status)) { return; } |
| 231 if (0 < newOffset && newOffset < string_.length()) { |
| 232 int32_t offset = newOffset; |
| 233 do { |
| 234 UChar c = string_.charAt(offset); |
| 235 if (!rbc_->isUnsafe(c) || |
| 236 (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset))
)) { |
| 237 break; |
| 238 } |
| 239 // Back up to before this unsafe character. |
| 240 --offset; |
| 241 } while (offset > 0); |
| 242 if (offset < newOffset) { |
| 243 // We might have backed up more than necessary. |
| 244 // For example, contractions "ch" and "cu" make both 'h' and 'u' uns
afe, |
| 245 // but for text "chu" setOffset(2) should remain at 2 |
| 246 // although we initially back up to offset 0. |
| 247 // Find the last safe offset no greater than newOffset by iterating
forward. |
| 248 int32_t lastSafeOffset = offset; |
| 249 do { |
| 250 iter_->resetToOffset(lastSafeOffset); |
| 251 do { |
| 252 iter_->nextCE(status); |
| 253 if (U_FAILURE(status)) { return; } |
| 254 } while ((offset = iter_->getOffset()) == lastSafeOffset); |
| 255 if (offset <= newOffset) { |
| 256 lastSafeOffset = offset; |
| 257 } |
| 258 } while (offset < newOffset); |
| 259 newOffset = lastSafeOffset; |
| 260 } |
| 261 } |
| 262 iter_->resetToOffset(newOffset); |
| 263 otherHalf_ = 0; |
| 264 dir_ = 1; |
| 165 } | 265 } |
| 166 | 266 |
| 167 /** | 267 /** |
| 168 * Sets the source to the new source string. | 268 * Sets the source to the new source string. |
| 169 */ | 269 */ |
| 170 void CollationElementIterator::setText(const UnicodeString& source, | 270 void CollationElementIterator::setText(const UnicodeString& source, |
| 171 UErrorCode& status) | 271 UErrorCode& status) |
| 172 { | 272 { |
| 173 if (U_FAILURE(status)) { | 273 if (U_FAILURE(status)) { |
| 174 return; | 274 return; |
| 175 } | 275 } |
| 176 | 276 |
| 177 int32_t length = source.length(); | 277 string_ = source; |
| 178 UChar *string = NULL; | 278 const UChar *s = string_.getBuffer(); |
| 179 if (m_data_->isWritable && m_data_->iteratordata_.string != NULL) { | 279 CollationIterator *newIter; |
| 180 uprv_free((UChar *)m_data_->iteratordata_.string); | 280 UBool numeric = rbc_->settings->isNumeric(); |
| 281 if (rbc_->settings->dontCheckFCD()) { |
| 282 newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + stri
ng_.length()); |
| 283 } else { |
| 284 newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + s
tring_.length()); |
| 181 } | 285 } |
| 182 m_data_->isWritable = TRUE; | 286 if (newIter == NULL) { |
| 183 if (length > 0) { | 287 status = U_MEMORY_ALLOCATION_ERROR; |
| 184 string = (UChar *)uprv_malloc(U_SIZEOF_UCHAR * length); | 288 return; |
| 185 /* test for NULL */ | |
| 186 if (string == NULL) { | |
| 187 status = U_MEMORY_ALLOCATION_ERROR; | |
| 188 return; | |
| 189 } | |
| 190 u_memcpy(string, source.getBuffer(), length); | |
| 191 } | 289 } |
| 192 else { | 290 delete iter_; |
| 193 string = (UChar *)uprv_malloc(U_SIZEOF_UCHAR); | 291 iter_ = newIter; |
| 194 /* test for NULL */ | 292 otherHalf_ = 0; |
| 195 if (string == NULL) { | 293 dir_ = 0; |
| 196 status = U_MEMORY_ALLOCATION_ERROR; | |
| 197 return; | |
| 198 } | |
| 199 *string = 0; | |
| 200 } | |
| 201 /* Free offsetBuffer before initializing it. */ | |
| 202 ucol_freeOffsetBuffer(&(m_data_->iteratordata_)); | |
| 203 uprv_init_collIterate(m_data_->iteratordata_.coll, string, length, | |
| 204 &m_data_->iteratordata_, &status); | |
| 205 | |
| 206 m_data_->reset_ = TRUE; | |
| 207 } | 294 } |
| 208 | 295 |
| 209 // Sets the source to the new character iterator. | 296 // Sets the source to the new character iterator. |
| 210 void CollationElementIterator::setText(CharacterIterator& source, | 297 void CollationElementIterator::setText(CharacterIterator& source, |
| 211 UErrorCode& status) | 298 UErrorCode& status) |
| 212 { | 299 { |
| 213 if (U_FAILURE(status)) | 300 if (U_FAILURE(status)) |
| 214 return; | 301 return; |
| 215 | 302 |
| 216 int32_t length = source.getLength(); | 303 source.getText(string_); |
| 217 UChar *buffer = NULL; | 304 setText(string_, status); |
| 218 | |
| 219 if (length == 0) { | |
| 220 buffer = (UChar *)uprv_malloc(U_SIZEOF_UCHAR); | |
| 221 /* test for NULL */ | |
| 222 if (buffer == NULL) { | |
| 223 status = U_MEMORY_ALLOCATION_ERROR; | |
| 224 return; | |
| 225 } | |
| 226 *buffer = 0; | |
| 227 } | |
| 228 else { | |
| 229 buffer = (UChar *)uprv_malloc(U_SIZEOF_UCHAR * length); | |
| 230 /* test for NULL */ | |
| 231 if (buffer == NULL) { | |
| 232 status = U_MEMORY_ALLOCATION_ERROR; | |
| 233 return; | |
| 234 } | |
| 235 /* | |
| 236 Using this constructor will prevent buffer from being removed when | |
| 237 string gets removed | |
| 238 */ | |
| 239 UnicodeString string; | |
| 240 source.getText(string); | |
| 241 u_memcpy(buffer, string.getBuffer(), length); | |
| 242 } | |
| 243 | |
| 244 if (m_data_->isWritable && m_data_->iteratordata_.string != NULL) { | |
| 245 uprv_free((UChar *)m_data_->iteratordata_.string); | |
| 246 } | |
| 247 m_data_->isWritable = TRUE; | |
| 248 /* Free offsetBuffer before initializing it. */ | |
| 249 ucol_freeOffsetBuffer(&(m_data_->iteratordata_)); | |
| 250 uprv_init_collIterate(m_data_->iteratordata_.coll, buffer, length, | |
| 251 &m_data_->iteratordata_, &status); | |
| 252 m_data_->reset_ = TRUE; | |
| 253 } | 305 } |
| 254 | 306 |
| 255 int32_t CollationElementIterator::strengthOrder(int32_t order) const | 307 int32_t CollationElementIterator::strengthOrder(int32_t order) const |
| 256 { | 308 { |
| 257 UCollationStrength s = ucol_getStrength(m_data_->iteratordata_.coll); | 309 UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength(); |
| 258 // Mask off the unwanted differences. | 310 // Mask off the unwanted differences. |
| 259 if (s == UCOL_PRIMARY) { | 311 if (s == UCOL_PRIMARY) { |
| 260 order &= RuleBasedCollator::PRIMARYDIFFERENCEONLY; | 312 order &= 0xffff0000; |
| 261 } | 313 } |
| 262 else if (s == UCOL_SECONDARY) { | 314 else if (s == UCOL_SECONDARY) { |
| 263 order &= RuleBasedCollator::SECONDARYDIFFERENCEONLY; | 315 order &= 0xffffff00; |
| 264 } | 316 } |
| 265 | 317 |
| 266 return order; | 318 return order; |
| 267 } | 319 } |
| 268 | 320 |
| 269 /* CollationElementIterator private constructors/destructors --------------- */ | 321 /* CollationElementIterator private constructors/destructors --------------- */ |
| 270 | 322 |
| 271 /** | 323 /** |
| 272 * This is the "real" constructor for this class; it constructs an iterator | 324 * This is the "real" constructor for this class; it constructs an iterator |
| 273 * over the source text using the specified collator | 325 * over the source text using the specified collator |
| 274 */ | 326 */ |
| 275 CollationElementIterator::CollationElementIterator( | 327 CollationElementIterator::CollationElementIterator( |
| 276 const UnicodeString& sourceText, | 328 const UnicodeString &source, |
| 277 const RuleBasedCollator* order, | 329 const RuleBasedCollator *coll, |
| 278 UErrorCode& status) | 330 UErrorCode &status) |
| 279 : isDataOwned_(TRUE) | 331 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { |
| 280 { | 332 setText(source, status); |
| 281 if (U_FAILURE(status)) { | |
| 282 return; | |
| 283 } | |
| 284 | |
| 285 int32_t length = sourceText.length(); | |
| 286 UChar *string = NULL; | |
| 287 | |
| 288 if (length > 0) { | |
| 289 string = (UChar *)uprv_malloc(U_SIZEOF_UCHAR * length); | |
| 290 /* test for NULL */ | |
| 291 if (string == NULL) { | |
| 292 status = U_MEMORY_ALLOCATION_ERROR; | |
| 293 return; | |
| 294 } | |
| 295 /* | |
| 296 Using this constructor will prevent buffer from being removed when | |
| 297 string gets removed | |
| 298 */ | |
| 299 u_memcpy(string, sourceText.getBuffer(), length); | |
| 300 } | |
| 301 else { | |
| 302 string = (UChar *)uprv_malloc(U_SIZEOF_UCHAR); | |
| 303 /* test for NULL */ | |
| 304 if (string == NULL) { | |
| 305 status = U_MEMORY_ALLOCATION_ERROR; | |
| 306 return; | |
| 307 } | |
| 308 *string = 0; | |
| 309 } | |
| 310 m_data_ = ucol_openElements(order->ucollator, string, length, &status); | |
| 311 | |
| 312 /* Test for buffer overflows */ | |
| 313 if (U_FAILURE(status)) { | |
| 314 return; | |
| 315 } | |
| 316 m_data_->isWritable = TRUE; | |
| 317 } | 333 } |
| 318 | 334 |
| 319 /** | 335 /** |
| 320 * This is the "real" constructor for this class; it constructs an iterator over | 336 * This is the "real" constructor for this class; it constructs an iterator over |
| 321 * the source text using the specified collator | 337 * the source text using the specified collator |
| 322 */ | 338 */ |
| 323 CollationElementIterator::CollationElementIterator( | 339 CollationElementIterator::CollationElementIterator( |
| 324 const CharacterIterator& sourceText, | 340 const CharacterIterator &source, |
| 325 const RuleBasedCollator* order, | 341 const RuleBasedCollator *coll, |
| 326 UErrorCode& status) | 342 UErrorCode &status) |
| 327 : isDataOwned_(TRUE) | 343 : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { |
| 328 { | 344 // We only call source.getText() which should be const anyway. |
| 329 if (U_FAILURE(status)) | 345 setText(const_cast<CharacterIterator &>(source), status); |
| 330 return; | |
| 331 | |
| 332 // **** should I just drop this test? **** | |
| 333 /* | |
| 334 if ( sourceText.endIndex() != 0 ) | |
| 335 { | |
| 336 // A CollationElementIterator is really a two-layered beast. | |
| 337 // Internally it uses a Normalizer to munge the source text into a form | |
| 338 // where all "composed" Unicode characters (such as \u00FC) are split in
to a | |
| 339 // normal character and a combining accent character. | |
| 340 // Afterward, CollationElementIterator does its own processing to handle | |
| 341 // expanding and contracting collation sequences, ignorables, and so on. | |
| 342 | |
| 343 Normalizer::EMode decomp = order->getStrength() == Collator::IDENTICAL | |
| 344 ? Normalizer::NO_OP : order->getDecomposition(); | |
| 345 | |
| 346 text = new Normalizer(sourceText, decomp); | |
| 347 if (text == NULL) | |
| 348 status = U_MEMORY_ALLOCATION_ERROR; | |
| 349 } | |
| 350 */ | |
| 351 int32_t length = sourceText.getLength(); | |
| 352 UChar *buffer; | |
| 353 if (length > 0) { | |
| 354 buffer = (UChar *)uprv_malloc(U_SIZEOF_UCHAR * length); | |
| 355 /* test for NULL */ | |
| 356 if (buffer == NULL) { | |
| 357 status = U_MEMORY_ALLOCATION_ERROR; | |
| 358 return; | |
| 359 } | |
| 360 /* | |
| 361 Using this constructor will prevent buffer from being removed when | |
| 362 string gets removed | |
| 363 */ | |
| 364 UnicodeString string(buffer, length, length); | |
| 365 ((CharacterIterator &)sourceText).getText(string); | |
| 366 const UChar *temp = string.getBuffer(); | |
| 367 u_memcpy(buffer, temp, length); | |
| 368 } | |
| 369 else { | |
| 370 buffer = (UChar *)uprv_malloc(U_SIZEOF_UCHAR); | |
| 371 /* test for NULL */ | |
| 372 if (buffer == NULL) { | |
| 373 status = U_MEMORY_ALLOCATION_ERROR; | |
| 374 return; | |
| 375 } | |
| 376 *buffer = 0; | |
| 377 } | |
| 378 m_data_ = ucol_openElements(order->ucollator, buffer, length, &status); | |
| 379 | |
| 380 /* Test for buffer overflows */ | |
| 381 if (U_FAILURE(status)) { | |
| 382 return; | |
| 383 } | |
| 384 m_data_->isWritable = TRUE; | |
| 385 } | 346 } |
| 386 | 347 |
| 387 /* CollationElementIterator protected methods ----------------------------- */ | 348 /* CollationElementIterator private methods -------------------------------- */ |
| 388 | 349 |
| 389 const CollationElementIterator& CollationElementIterator::operator=( | 350 const CollationElementIterator& CollationElementIterator::operator=( |
| 390 const CollationElementIterator& other) | 351 const CollationElementIterator& other) |
| 391 { | 352 { |
| 392 if (this != &other) | 353 if (this == &other) { |
| 393 { | 354 return *this; |
| 394 UCollationElements *ucolelem = this->m_data_; | |
| 395 UCollationElements *otherucolelem = other.m_data_; | |
| 396 collIterate *coliter = &(ucolelem->iteratordata_); | |
| 397 collIterate *othercoliter = &(otherucolelem->iteratordata_); | |
| 398 int length = 0; | |
| 399 | |
| 400 // checking only UCOL_ITER_HASLEN is not enough here as we may be in | |
| 401 // the normalization buffer | |
| 402 length = (int)(othercoliter->endp - othercoliter->string); | |
| 403 | |
| 404 ucolelem->reset_ = otherucolelem->reset_; | |
| 405 ucolelem->isWritable = TRUE; | |
| 406 | |
| 407 /* create a duplicate of string */ | |
| 408 if (length > 0) { | |
| 409 coliter->string = (UChar *)uprv_malloc(length * U_SIZEOF_UCHAR); | |
| 410 if(coliter->string != NULL) { | |
| 411 uprv_memcpy((UChar *)coliter->string, othercoliter->string, | |
| 412 length * U_SIZEOF_UCHAR); | |
| 413 } else { // Error: couldn't allocate memory. No copying should be do
ne | |
| 414 length = 0; | |
| 415 } | |
| 416 } | |
| 417 else { | |
| 418 coliter->string = NULL; | |
| 419 } | |
| 420 | |
| 421 /* start and end of string */ | |
| 422 coliter->endp = coliter->string == NULL ? NULL : coliter->string + lengt
h; | |
| 423 | |
| 424 /* handle writable buffer here */ | |
| 425 | |
| 426 if (othercoliter->flags & UCOL_ITER_INNORMBUF) { | |
| 427 coliter->writableBuffer = othercoliter->writableBuffer; | |
| 428 coliter->writableBuffer.getTerminatedBuffer(); | |
| 429 } | |
| 430 | |
| 431 /* current position */ | |
| 432 if (othercoliter->pos >= othercoliter->string && | |
| 433 othercoliter->pos <= othercoliter->endp) | |
| 434 { | |
| 435 U_ASSERT(coliter->string != NULL); | |
| 436 coliter->pos = coliter->string + | |
| 437 (othercoliter->pos - othercoliter->string); | |
| 438 } | |
| 439 else { | |
| 440 coliter->pos = coliter->writableBuffer.getTerminatedBuffer() + | |
| 441 (othercoliter->pos - othercoliter->writableBuffer.getBuffer()); | |
| 442 } | |
| 443 | |
| 444 /* CE buffer */ | |
| 445 int32_t CEsize; | |
| 446 if (coliter->extendCEs) { | |
| 447 uprv_memcpy(coliter->CEs, othercoliter->CEs, sizeof(uint32_t) * UCOL
_EXPAND_CE_BUFFER_SIZE); | |
| 448 CEsize = sizeof(othercoliter->extendCEs); | |
| 449 if (CEsize > 0) { | |
| 450 othercoliter->extendCEs = (uint32_t *)uprv_malloc(CEsize); | |
| 451 uprv_memcpy(coliter->extendCEs, othercoliter->extendCEs, CEsize)
; | |
| 452 } | |
| 453 coliter->toReturn = coliter->extendCEs + | |
| 454 (othercoliter->toReturn - othercoliter->extendCEs); | |
| 455 coliter->CEpos = coliter->extendCEs + CEsize; | |
| 456 } else { | |
| 457 CEsize = (int32_t)(othercoliter->CEpos - othercoliter->CEs); | |
| 458 if (CEsize > 0) { | |
| 459 uprv_memcpy(coliter->CEs, othercoliter->CEs, CEsize); | |
| 460 } | |
| 461 coliter->toReturn = coliter->CEs + | |
| 462 (othercoliter->toReturn - othercoliter->CEs); | |
| 463 coliter->CEpos = coliter->CEs + CEsize; | |
| 464 } | |
| 465 | |
| 466 if (othercoliter->fcdPosition != NULL) { | |
| 467 U_ASSERT(coliter->string != NULL); | |
| 468 coliter->fcdPosition = coliter->string + | |
| 469 (othercoliter->fcdPosition | |
| 470 - othercoliter->string); | |
| 471 } | |
| 472 else { | |
| 473 coliter->fcdPosition = NULL; | |
| 474 } | |
| 475 coliter->flags = othercoliter->flags/*| UCOL_ITER_HASLEN*/; | |
| 476 coliter->origFlags = othercoliter->origFlags; | |
| 477 coliter->coll = othercoliter->coll; | |
| 478 this->isDataOwned_ = TRUE; | |
| 479 } | 355 } |
| 480 | 356 |
| 357 CollationIterator *newIter; |
| 358 const FCDUTF16CollationIterator *otherFCDIter = |
| 359 dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); |
| 360 if(otherFCDIter != NULL) { |
| 361 newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer
()); |
| 362 } else { |
| 363 const UTF16CollationIterator *otherIter = |
| 364 dynamic_cast<const UTF16CollationIterator *>(other.iter_); |
| 365 if(otherIter != NULL) { |
| 366 newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()
); |
| 367 } else { |
| 368 newIter = NULL; |
| 369 } |
| 370 } |
| 371 if(newIter != NULL) { |
| 372 delete iter_; |
| 373 iter_ = newIter; |
| 374 rbc_ = other.rbc_; |
| 375 otherHalf_ = other.otherHalf_; |
| 376 dir_ = other.dir_; |
| 377 |
| 378 string_ = other.string_; |
| 379 } |
| 380 if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) { |
| 381 UErrorCode errorCode = U_ZERO_ERROR; |
| 382 if(offsets_ == NULL) { |
| 383 offsets_ = new UVector32(other.offsets_->size(), errorCode); |
| 384 } |
| 385 if(offsets_ != NULL) { |
| 386 offsets_->assign(*other.offsets_, errorCode); |
| 387 } |
| 388 } |
| 481 return *this; | 389 return *this; |
| 482 } | 390 } |
| 483 | 391 |
| 392 namespace { |
| 393 |
| 394 class MaxExpSink : public ContractionsAndExpansions::CESink { |
| 395 public: |
| 396 MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec)
{} |
| 397 virtual ~MaxExpSink(); |
| 398 virtual void handleCE(int64_t /*ce*/) {} |
| 399 virtual void handleExpansion(const int64_t ces[], int32_t length) { |
| 400 if (length <= 1) { |
| 401 // We do not need to add single CEs into the map. |
| 402 return; |
| 403 } |
| 404 int32_t count = 0; // number of CE "halves" |
| 405 for (int32_t i = 0; i < length; ++i) { |
| 406 count += ceNeedsTwoParts(ces[i]) ? 2 : 1; |
| 407 } |
| 408 // last "half" of the last CE |
| 409 int64_t ce = ces[length - 1]; |
| 410 uint32_t p = (uint32_t)(ce >> 32); |
| 411 uint32_t lower32 = (uint32_t)ce; |
| 412 uint32_t lastHalf = getSecondHalf(p, lower32); |
| 413 if (lastHalf == 0) { |
| 414 lastHalf = getFirstHalf(p, lower32); |
| 415 U_ASSERT(lastHalf != 0); |
| 416 } else { |
| 417 lastHalf |= 0xc0; // old-style continuation CE |
| 418 } |
| 419 if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) { |
| 420 uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode); |
| 421 } |
| 422 } |
| 423 |
| 424 private: |
| 425 UHashtable *maxExpansions; |
| 426 UErrorCode &errorCode; |
| 427 }; |
| 428 |
| 429 MaxExpSink::~MaxExpSink() {} |
| 430 |
| 431 } // namespace |
| 432 |
| 433 UHashtable * |
| 434 CollationElementIterator::computeMaxExpansions(const CollationData *data, UError
Code &errorCode) { |
| 435 if (U_FAILURE(errorCode)) { return NULL; } |
| 436 UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, |
| 437 uhash_compareLong, &errorCode); |
| 438 if (U_FAILURE(errorCode)) { return NULL; } |
| 439 MaxExpSink sink(maxExpansions, errorCode); |
| 440 ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode); |
| 441 if (U_FAILURE(errorCode)) { |
| 442 uhash_close(maxExpansions); |
| 443 return NULL; |
| 444 } |
| 445 return maxExpansions; |
| 446 } |
| 447 |
| 448 int32_t |
| 449 CollationElementIterator::getMaxExpansion(int32_t order) const { |
| 450 return getMaxExpansion(rbc_->tailoring->maxExpansions, order); |
| 451 } |
| 452 |
| 453 int32_t |
| 454 CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32
_t order) { |
| 455 if (order == 0) { return 1; } |
| 456 int32_t max; |
| 457 if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0)
{ |
| 458 return max; |
| 459 } |
| 460 if ((order & 0xc0) == 0xc0) { |
| 461 // old-style continuation CE |
| 462 return 2; |
| 463 } else { |
| 464 return 1; |
| 465 } |
| 466 } |
| 467 |
| 484 U_NAMESPACE_END | 468 U_NAMESPACE_END |
| 485 | 469 |
| 486 #endif /* #if !UCONFIG_NO_COLLATION */ | 470 #endif /* #if !UCONFIG_NO_COLLATION */ |
| 487 | |
| 488 /* eof */ | |
| OLD | NEW |