OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ************************************************************************* |
| 3 * COPYRIGHT: |
| 4 * Copyright (c) 1996-2010, International Business Machines Corporation and |
| 5 * others. All Rights Reserved. |
| 6 ************************************************************************* |
| 7 */ |
| 8 |
| 9 #include "unicode/utypes.h" |
| 10 |
| 11 #if !UCONFIG_NO_NORMALIZATION |
| 12 |
| 13 #include "unicode/uniset.h" |
| 14 #include "unicode/unistr.h" |
| 15 #include "unicode/chariter.h" |
| 16 #include "unicode/schriter.h" |
| 17 #include "unicode/uchriter.h" |
| 18 #include "unicode/normlzr.h" |
| 19 #include "cmemory.h" |
| 20 #include "normalizer2impl.h" |
| 21 #include "uprops.h" // for uniset_getUnicode32Instance() |
| 22 |
| 23 U_NAMESPACE_BEGIN |
| 24 |
| 25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) |
| 26 |
| 27 //------------------------------------------------------------------------- |
| 28 // Constructors and other boilerplate |
| 29 //------------------------------------------------------------------------- |
| 30 |
| 31 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : |
| 32 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
| 33 text(new StringCharacterIterator(str)), |
| 34 currentIndex(0), nextIndex(0), |
| 35 buffer(), bufferPos(0) |
| 36 { |
| 37 init(); |
| 38 } |
| 39 |
| 40 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode
) : |
| 41 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
| 42 text(new UCharCharacterIterator(str, length)), |
| 43 currentIndex(0), nextIndex(0), |
| 44 buffer(), bufferPos(0) |
| 45 { |
| 46 init(); |
| 47 } |
| 48 |
| 49 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : |
| 50 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
| 51 text(iter.clone()), |
| 52 currentIndex(0), nextIndex(0), |
| 53 buffer(), bufferPos(0) |
| 54 { |
| 55 init(); |
| 56 } |
| 57 |
| 58 Normalizer::Normalizer(const Normalizer ©) : |
| 59 UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOpt
ions(copy.fOptions), |
| 60 text(copy.text->clone()), |
| 61 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), |
| 62 buffer(copy.buffer), bufferPos(copy.bufferPos) |
| 63 { |
| 64 init(); |
| 65 } |
| 66 |
| 67 static const UChar _NUL=0; |
| 68 |
| 69 void |
| 70 Normalizer::init() { |
| 71 UErrorCode errorCode=U_ZERO_ERROR; |
| 72 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); |
| 73 if(fOptions&UNORM_UNICODE_3_2) { |
| 74 delete fFilteredNorm2; |
| 75 fNorm2=fFilteredNorm2= |
| 76 new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorC
ode)); |
| 77 } |
| 78 if(U_FAILURE(errorCode)) { |
| 79 errorCode=U_ZERO_ERROR; |
| 80 fNorm2=Normalizer2Factory::getNoopInstance(errorCode); |
| 81 } |
| 82 } |
| 83 |
| 84 Normalizer::~Normalizer() |
| 85 { |
| 86 delete fFilteredNorm2; |
| 87 delete text; |
| 88 } |
| 89 |
| 90 Normalizer* |
| 91 Normalizer::clone() const |
| 92 { |
| 93 return new Normalizer(*this); |
| 94 } |
| 95 |
| 96 /** |
| 97 * Generates a hash code for this iterator. |
| 98 */ |
| 99 int32_t Normalizer::hashCode() const |
| 100 { |
| 101 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos
+ currentIndex + nextIndex; |
| 102 } |
| 103 |
| 104 UBool Normalizer::operator==(const Normalizer& that) const |
| 105 { |
| 106 return |
| 107 this==&that || |
| 108 (fUMode==that.fUMode && |
| 109 fOptions==that.fOptions && |
| 110 *text==*that.text && |
| 111 buffer==that.buffer && |
| 112 bufferPos==that.bufferPos && |
| 113 nextIndex==that.nextIndex); |
| 114 } |
| 115 |
| 116 //------------------------------------------------------------------------- |
| 117 // Static utility methods |
| 118 //------------------------------------------------------------------------- |
| 119 |
| 120 void U_EXPORT2 |
| 121 Normalizer::normalize(const UnicodeString& source, |
| 122 UNormalizationMode mode, int32_t options, |
| 123 UnicodeString& result, |
| 124 UErrorCode &status) { |
| 125 if(source.isBogus() || U_FAILURE(status)) { |
| 126 result.setToBogus(); |
| 127 if(U_SUCCESS(status)) { |
| 128 status=U_ILLEGAL_ARGUMENT_ERROR; |
| 129 } |
| 130 } else { |
| 131 UnicodeString localDest; |
| 132 UnicodeString *dest; |
| 133 |
| 134 if(&source!=&result) { |
| 135 dest=&result; |
| 136 } else { |
| 137 // the source and result strings are the same object, use a temporar
y one |
| 138 dest=&localDest; |
| 139 } |
| 140 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
| 141 if(U_SUCCESS(status)) { |
| 142 if(options&UNORM_UNICODE_3_2) { |
| 143 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). |
| 144 normalize(source, *dest, status); |
| 145 } else { |
| 146 n2->normalize(source, *dest, status); |
| 147 } |
| 148 } |
| 149 if(dest==&localDest && U_SUCCESS(status)) { |
| 150 result=*dest; |
| 151 } |
| 152 } |
| 153 } |
| 154 |
| 155 void U_EXPORT2 |
| 156 Normalizer::compose(const UnicodeString& source, |
| 157 UBool compat, int32_t options, |
| 158 UnicodeString& result, |
| 159 UErrorCode &status) { |
| 160 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); |
| 161 } |
| 162 |
| 163 void U_EXPORT2 |
| 164 Normalizer::decompose(const UnicodeString& source, |
| 165 UBool compat, int32_t options, |
| 166 UnicodeString& result, |
| 167 UErrorCode &status) { |
| 168 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); |
| 169 } |
| 170 |
| 171 UNormalizationCheckResult |
| 172 Normalizer::quickCheck(const UnicodeString& source, |
| 173 UNormalizationMode mode, int32_t options, |
| 174 UErrorCode &status) { |
| 175 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
| 176 if(U_SUCCESS(status)) { |
| 177 if(options&UNORM_UNICODE_3_2) { |
| 178 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)
). |
| 179 quickCheck(source, status); |
| 180 } else { |
| 181 return n2->quickCheck(source, status); |
| 182 } |
| 183 } else { |
| 184 return UNORM_MAYBE; |
| 185 } |
| 186 } |
| 187 |
| 188 UBool |
| 189 Normalizer::isNormalized(const UnicodeString& source, |
| 190 UNormalizationMode mode, int32_t options, |
| 191 UErrorCode &status) { |
| 192 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
| 193 if(U_SUCCESS(status)) { |
| 194 if(options&UNORM_UNICODE_3_2) { |
| 195 return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)
). |
| 196 isNormalized(source, status); |
| 197 } else { |
| 198 return n2->isNormalized(source, status); |
| 199 } |
| 200 } else { |
| 201 return FALSE; |
| 202 } |
| 203 } |
| 204 |
| 205 UnicodeString & U_EXPORT2 |
| 206 Normalizer::concatenate(UnicodeString &left, UnicodeString &right, |
| 207 UnicodeString &result, |
| 208 UNormalizationMode mode, int32_t options, |
| 209 UErrorCode &errorCode) { |
| 210 if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { |
| 211 result.setToBogus(); |
| 212 if(U_SUCCESS(errorCode)) { |
| 213 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 214 } |
| 215 } else { |
| 216 UnicodeString localDest; |
| 217 UnicodeString *dest; |
| 218 |
| 219 if(&right!=&result) { |
| 220 dest=&result; |
| 221 } else { |
| 222 // the right and result strings are the same object, use a temporary
one |
| 223 dest=&localDest; |
| 224 } |
| 225 *dest=left; |
| 226 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); |
| 227 if(U_SUCCESS(errorCode)) { |
| 228 if(options&UNORM_UNICODE_3_2) { |
| 229 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)
). |
| 230 append(*dest, right, errorCode); |
| 231 } else { |
| 232 n2->append(*dest, right, errorCode); |
| 233 } |
| 234 } |
| 235 if(dest==&localDest && U_SUCCESS(errorCode)) { |
| 236 result=*dest; |
| 237 } |
| 238 } |
| 239 return result; |
| 240 } |
| 241 |
| 242 //------------------------------------------------------------------------- |
| 243 // Iteration API |
| 244 //------------------------------------------------------------------------- |
| 245 |
| 246 /** |
| 247 * Return the current character in the normalized text. |
| 248 */ |
| 249 UChar32 Normalizer::current() { |
| 250 if(bufferPos<buffer.length() || nextNormalize()) { |
| 251 return buffer.char32At(bufferPos); |
| 252 } else { |
| 253 return DONE; |
| 254 } |
| 255 } |
| 256 |
| 257 /** |
| 258 * Return the next character in the normalized text and advance |
| 259 * the iteration position by one. If the end |
| 260 * of the text has already been reached, {@link #DONE} is returned. |
| 261 */ |
| 262 UChar32 Normalizer::next() { |
| 263 if(bufferPos<buffer.length() || nextNormalize()) { |
| 264 UChar32 c=buffer.char32At(bufferPos); |
| 265 bufferPos+=UTF_CHAR_LENGTH(c); |
| 266 return c; |
| 267 } else { |
| 268 return DONE; |
| 269 } |
| 270 } |
| 271 |
| 272 /** |
| 273 * Return the previous character in the normalized text and decrement |
| 274 * the iteration position by one. If the beginning |
| 275 * of the text has already been reached, {@link #DONE} is returned. |
| 276 */ |
| 277 UChar32 Normalizer::previous() { |
| 278 if(bufferPos>0 || previousNormalize()) { |
| 279 UChar32 c=buffer.char32At(bufferPos-1); |
| 280 bufferPos-=UTF_CHAR_LENGTH(c); |
| 281 return c; |
| 282 } else { |
| 283 return DONE; |
| 284 } |
| 285 } |
| 286 |
| 287 void Normalizer::reset() { |
| 288 currentIndex=nextIndex=text->setToStart(); |
| 289 clearBuffer(); |
| 290 } |
| 291 |
| 292 void |
| 293 Normalizer::setIndexOnly(int32_t index) { |
| 294 text->setIndex(index); // pins index |
| 295 currentIndex=nextIndex=text->getIndex(); |
| 296 clearBuffer(); |
| 297 } |
| 298 |
| 299 /** |
| 300 * Return the first character in the normalized text. This resets |
| 301 * the <tt>Normalizer's</tt> position to the beginning of the text. |
| 302 */ |
| 303 UChar32 Normalizer::first() { |
| 304 reset(); |
| 305 return next(); |
| 306 } |
| 307 |
| 308 /** |
| 309 * Return the last character in the normalized text. This resets |
| 310 * the <tt>Normalizer's</tt> position to be just before the |
| 311 * the input text corresponding to that normalized character. |
| 312 */ |
| 313 UChar32 Normalizer::last() { |
| 314 currentIndex=nextIndex=text->setToEnd(); |
| 315 clearBuffer(); |
| 316 return previous(); |
| 317 } |
| 318 |
| 319 /** |
| 320 * Retrieve the current iteration position in the input text that is |
| 321 * being normalized. This method is useful in applications such as |
| 322 * searching, where you need to be able to determine the position in |
| 323 * the input text that corresponds to a given normalized output character. |
| 324 * <p> |
| 325 * <b>Note:</b> This method sets the position in the <em>input</em>, while |
| 326 * {@link #next} and {@link #previous} iterate through characters in the |
| 327 * <em>output</em>. This means that there is not necessarily a one-to-one |
| 328 * correspondence between characters returned by <tt>next</tt> and |
| 329 * <tt>previous</tt> and the indices passed to and returned from |
| 330 * <tt>setIndex</tt> and {@link #getIndex}. |
| 331 * |
| 332 */ |
| 333 int32_t Normalizer::getIndex() const { |
| 334 if(bufferPos<buffer.length()) { |
| 335 return currentIndex; |
| 336 } else { |
| 337 return nextIndex; |
| 338 } |
| 339 } |
| 340 |
| 341 /** |
| 342 * Retrieve the index of the start of the input text. This is the begin index |
| 343 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt
> |
| 344 * over which this <tt>Normalizer</tt> is iterating |
| 345 */ |
| 346 int32_t Normalizer::startIndex() const { |
| 347 return text->startIndex(); |
| 348 } |
| 349 |
| 350 /** |
| 351 * Retrieve the index of the end of the input text. This is the end index |
| 352 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> |
| 353 * over which this <tt>Normalizer</tt> is iterating |
| 354 */ |
| 355 int32_t Normalizer::endIndex() const { |
| 356 return text->endIndex(); |
| 357 } |
| 358 |
| 359 //------------------------------------------------------------------------- |
| 360 // Property access methods |
| 361 //------------------------------------------------------------------------- |
| 362 |
| 363 void |
| 364 Normalizer::setMode(UNormalizationMode newMode) |
| 365 { |
| 366 fUMode = newMode; |
| 367 init(); |
| 368 } |
| 369 |
| 370 UNormalizationMode |
| 371 Normalizer::getUMode() const |
| 372 { |
| 373 return fUMode; |
| 374 } |
| 375 |
| 376 void |
| 377 Normalizer::setOption(int32_t option, |
| 378 UBool value) |
| 379 { |
| 380 if (value) { |
| 381 fOptions |= option; |
| 382 } else { |
| 383 fOptions &= (~option); |
| 384 } |
| 385 init(); |
| 386 } |
| 387 |
| 388 UBool |
| 389 Normalizer::getOption(int32_t option) const |
| 390 { |
| 391 return (fOptions & option) != 0; |
| 392 } |
| 393 |
| 394 /** |
| 395 * Set the input text over which this <tt>Normalizer</tt> will iterate. |
| 396 * The iteration position is set to the beginning of the input text. |
| 397 */ |
| 398 void |
| 399 Normalizer::setText(const UnicodeString& newText, |
| 400 UErrorCode &status) |
| 401 { |
| 402 if (U_FAILURE(status)) { |
| 403 return; |
| 404 } |
| 405 CharacterIterator *newIter = new StringCharacterIterator(newText); |
| 406 if (newIter == NULL) { |
| 407 status = U_MEMORY_ALLOCATION_ERROR; |
| 408 return; |
| 409 } |
| 410 delete text; |
| 411 text = newIter; |
| 412 reset(); |
| 413 } |
| 414 |
| 415 /** |
| 416 * Set the input text over which this <tt>Normalizer</tt> will iterate. |
| 417 * The iteration position is set to the beginning of the string. |
| 418 */ |
| 419 void |
| 420 Normalizer::setText(const CharacterIterator& newText, |
| 421 UErrorCode &status) |
| 422 { |
| 423 if (U_FAILURE(status)) { |
| 424 return; |
| 425 } |
| 426 CharacterIterator *newIter = newText.clone(); |
| 427 if (newIter == NULL) { |
| 428 status = U_MEMORY_ALLOCATION_ERROR; |
| 429 return; |
| 430 } |
| 431 delete text; |
| 432 text = newIter; |
| 433 reset(); |
| 434 } |
| 435 |
| 436 void |
| 437 Normalizer::setText(const UChar* newText, |
| 438 int32_t length, |
| 439 UErrorCode &status) |
| 440 { |
| 441 if (U_FAILURE(status)) { |
| 442 return; |
| 443 } |
| 444 CharacterIterator *newIter = new UCharCharacterIterator(newText, length); |
| 445 if (newIter == NULL) { |
| 446 status = U_MEMORY_ALLOCATION_ERROR; |
| 447 return; |
| 448 } |
| 449 delete text; |
| 450 text = newIter; |
| 451 reset(); |
| 452 } |
| 453 |
| 454 /** |
| 455 * Copies the text under iteration into the UnicodeString referred to by "result
". |
| 456 * @param result Receives a copy of the text under iteration. |
| 457 */ |
| 458 void |
| 459 Normalizer::getText(UnicodeString& result) |
| 460 { |
| 461 text->getText(result); |
| 462 } |
| 463 |
| 464 //------------------------------------------------------------------------- |
| 465 // Private utility methods |
| 466 //------------------------------------------------------------------------- |
| 467 |
| 468 void Normalizer::clearBuffer() { |
| 469 buffer.remove(); |
| 470 bufferPos=0; |
| 471 } |
| 472 |
| 473 UBool |
| 474 Normalizer::nextNormalize() { |
| 475 clearBuffer(); |
| 476 currentIndex=nextIndex; |
| 477 text->setIndex(nextIndex); |
| 478 if(!text->hasNext()) { |
| 479 return FALSE; |
| 480 } |
| 481 // Skip at least one character so we make progress. |
| 482 UnicodeString segment(text->next32PostInc()); |
| 483 while(text->hasNext()) { |
| 484 UChar32 c; |
| 485 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { |
| 486 text->move32(-1, CharacterIterator::kCurrent); |
| 487 break; |
| 488 } |
| 489 segment.append(c); |
| 490 } |
| 491 nextIndex=text->getIndex(); |
| 492 UErrorCode errorCode=U_ZERO_ERROR; |
| 493 fNorm2->normalize(segment, buffer, errorCode); |
| 494 return U_SUCCESS(errorCode) && !buffer.isEmpty(); |
| 495 } |
| 496 |
| 497 UBool |
| 498 Normalizer::previousNormalize() { |
| 499 clearBuffer(); |
| 500 nextIndex=currentIndex; |
| 501 text->setIndex(currentIndex); |
| 502 if(!text->hasPrevious()) { |
| 503 return FALSE; |
| 504 } |
| 505 UnicodeString segment; |
| 506 while(text->hasPrevious()) { |
| 507 UChar32 c=text->previous32(); |
| 508 segment.insert(0, c); |
| 509 if(fNorm2->hasBoundaryBefore(c)) { |
| 510 break; |
| 511 } |
| 512 } |
| 513 currentIndex=text->getIndex(); |
| 514 UErrorCode errorCode=U_ZERO_ERROR; |
| 515 fNorm2->normalize(segment, buffer, errorCode); |
| 516 bufferPos=buffer.length(); |
| 517 return U_SUCCESS(errorCode) && !buffer.isEmpty(); |
| 518 } |
| 519 |
| 520 U_NAMESPACE_END |
| 521 |
| 522 #endif /* #if !UCONFIG_NO_NORMALIZATION */ |
OLD | NEW |