Index: icu46/source/common/normlzr.cpp |
=================================================================== |
--- icu46/source/common/normlzr.cpp (revision 0) |
+++ icu46/source/common/normlzr.cpp (revision 0) |
@@ -0,0 +1,522 @@ |
+/* |
+ ************************************************************************* |
+ * COPYRIGHT: |
+ * Copyright (c) 1996-2010, International Business Machines Corporation and |
+ * others. All Rights Reserved. |
+ ************************************************************************* |
+ */ |
+ |
+#include "unicode/utypes.h" |
+ |
+#if !UCONFIG_NO_NORMALIZATION |
+ |
+#include "unicode/uniset.h" |
+#include "unicode/unistr.h" |
+#include "unicode/chariter.h" |
+#include "unicode/schriter.h" |
+#include "unicode/uchriter.h" |
+#include "unicode/normlzr.h" |
+#include "cmemory.h" |
+#include "normalizer2impl.h" |
+#include "uprops.h" // for uniset_getUnicode32Instance() |
+ |
+U_NAMESPACE_BEGIN |
+ |
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) |
+ |
+//------------------------------------------------------------------------- |
+// Constructors and other boilerplate |
+//------------------------------------------------------------------------- |
+ |
+Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) |
+    : UObject(), fFilteredNorm2(NULL), fNorm2(NULL), |
+      fUMode(mode), fOptions(0), |
+      text(new StringCharacterIterator(str)),  // heap-allocated; deleted in the destructor |
+      currentIndex(0), nextIndex(0), buffer(), bufferPos(0) |
+{ |
+    init();  // selects fNorm2 for the requested mode |
+} |
+ |
+Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) |
+    : UObject(), fFilteredNorm2(NULL), fNorm2(NULL), |
+      fUMode(mode), fOptions(0), |
+      text(new UCharCharacterIterator(str, length)),  // heap-allocated; deleted in the destructor |
+      currentIndex(0), nextIndex(0), buffer(), bufferPos(0) |
+{ |
+    init();  // selects fNorm2 for the requested mode |
+} |
+ |
+Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) |
+    : UObject(), fFilteredNorm2(NULL), fNorm2(NULL), |
+      fUMode(mode), fOptions(0), |
+      text(iter.clone()),  // private clone of the caller's iterator; deleted in the destructor |
+      currentIndex(0), nextIndex(0), buffer(), bufferPos(0) |
+{ |
+    init();  // selects fNorm2 for the requested mode |
+} |
+ |
+Normalizer::Normalizer(const Normalizer &copy) :  // copy constructor |
+    UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions), |
+    text(copy.text->clone()),  // own clone of the source's iterator; each object deletes its own |
+    currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), |
+    buffer(copy.buffer), bufferPos(copy.bufferPos) |
+{ |
+    init();  // sets up this object's own fNorm2/fFilteredNorm2 from the copied mode and options |
+} |
+ |
+static const UChar _NUL=0; /* NOTE(review): appears to be unused in this file -- confirm before removing */ |
+ |
+void Normalizer::init() {  // (re)selects fNorm2 from fUMode/fOptions; never leaves it NULL |
+    UErrorCode errorCode=U_ZERO_ERROR; |
+    fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); |
+    if(U_SUCCESS(errorCode) && (fOptions&UNORM_UNICODE_3_2)!=0) { |
+        const UnicodeSet *unicode32=uniset_getUnicode32Instance(errorCode); |
+        delete fFilteredNorm2;  // rebuilt below, but only from pointers known to be valid |
+        fNorm2=fFilteredNorm2=(U_FAILURE(errorCode) || unicode32==NULL) ? NULL : |
+            new FilteredNormalizer2(*fNorm2, *unicode32); |
+    } |
+    if(U_FAILURE(errorCode) || fNorm2==NULL) {  // NULL also covers a failed allocation above |
+        errorCode=U_ZERO_ERROR; |
+        fNorm2=Normalizer2Factory::getNoopInstance(errorCode); |
+    } |
+} |
+ |
+Normalizer::~Normalizer() { |
+    // Frees the two heap objects this class allocates; fFilteredNorm2 may still be NULL. |
+    delete text; |
+    delete fFilteredNorm2; |
+} |
+ |
+// Returns a new heap-allocated copy of this object, built by the copy constructor. |
+Normalizer* Normalizer::clone() const { |
+    Normalizer *duplicate=new Normalizer(*this); |
+    return duplicate; |
+} |
+ |
+/** |
+ * Hashes the iteration state by summing the text/buffer hash codes, mode, options and positions. |
+ */ |
+int32_t Normalizer::hashCode() const { |
+    const int32_t textAndSettings=text->hashCode() + fUMode + fOptions; |
+    return textAndSettings + buffer.hashCode() + bufferPos + currentIndex + nextIndex; |
+} |
+ |
+// Two normalizers are equal when they are the same object, or when their mode, |
+// options, text iterator, buffer, buffer position and nextIndex all match. |
+// currentIndex is not compared. |
+UBool Normalizer::operator==(const Normalizer& that) const { |
+    if(this==&that) { |
+        return TRUE; |
+    } |
+    if(fUMode!=that.fUMode || fOptions!=that.fOptions) { return FALSE; } |
+    if(!(*text==*that.text) || !(buffer==that.buffer)) { return FALSE; } |
+    return bufferPos==that.bufferPos && nextIndex==that.nextIndex; |
+} |
+ |
+//------------------------------------------------------------------------- |
+// Static utility methods |
+//------------------------------------------------------------------------- |
+ |
+void U_EXPORT2 |
+Normalizer::normalize(const UnicodeString& source, |
+                      UNormalizationMode mode, int32_t options, |
+                      UnicodeString& result, |
+                      UErrorCode &status) { |
+    // Normalizes source into result with the Normalizer2 for mode. When options |
+    // contains UNORM_UNICODE_3_2, that normalizer is wrapped in a |
+    // FilteredNormalizer2 over the uniset_getUnicode32Instance() set. |
+    if(U_FAILURE(status)) { |
+        result.setToBogus(); |
+        return; |
+    } |
+    if(source.isBogus()) { |
+        result.setToBogus(); |
+        status=U_ILLEGAL_ARGUMENT_ERROR; |
+        return; |
+    } |
+    // When source and result are the same object, normalize into a scratch string. |
+    UnicodeString scratch; |
+    const UBool aliased=(&source==&result); |
+    UnicodeString &target=aliased ? scratch : result; |
+    const Normalizer2 *base=Normalizer2Factory::getInstance(mode, status); |
+    if(U_SUCCESS(status)) { |
+        if((options&UNORM_UNICODE_3_2)==0) { |
+            base->normalize(source, target, status); |
+        } else { |
+            FilteredNormalizer2 filtered(*base, *uniset_getUnicode32Instance(status)); |
+            filtered.normalize(source, target, status); |
+        } |
+    } |
+    if(aliased && U_SUCCESS(status)) { |
+        result=scratch; |
+    } |
+} |
+ |
+void U_EXPORT2 |
+Normalizer::compose(const UnicodeString& source, UBool compat, int32_t options, |
+                    UnicodeString& result, UErrorCode &status) { |
+    // Composition is normalize() in UNORM_NFKC (compat) or UNORM_NFC (otherwise) mode. |
+    const UNormalizationMode mode=compat ? UNORM_NFKC : UNORM_NFC; |
+    normalize(source, mode, options, result, status); |
+} |
+ |
+void U_EXPORT2 |
+Normalizer::decompose(const UnicodeString& source, UBool compat, int32_t options, |
+                      UnicodeString& result, UErrorCode &status) { |
+    // Decomposition is normalize() in UNORM_NFKD (compat) or UNORM_NFD (otherwise) mode. |
+    const UNormalizationMode mode=compat ? UNORM_NFKD : UNORM_NFD; |
+    normalize(source, mode, options, result, status); |
+} |
+ |
+UNormalizationCheckResult |
+Normalizer::quickCheck(const UnicodeString& source, |
+                       UNormalizationMode mode, int32_t options, |
+                       UErrorCode &status) { |
+    // Runs quickCheck() on the Normalizer2 for mode (filtered when options has |
+    // UNORM_UNICODE_3_2); UNORM_MAYBE is returned if no normalizer is available. |
+    const Normalizer2 *base=Normalizer2Factory::getInstance(mode, status); |
+    if(U_FAILURE(status)) { |
+        return UNORM_MAYBE; |
+    } |
+    if((options&UNORM_UNICODE_3_2)==0) { |
+        return base->quickCheck(source, status); |
+    } |
+    FilteredNormalizer2 filtered(*base, *uniset_getUnicode32Instance(status)); |
+    return filtered.quickCheck(source, status); |
+} |
+ |
+UBool |
+Normalizer::isNormalized(const UnicodeString& source, |
+                         UNormalizationMode mode, int32_t options, |
+                         UErrorCode &status) { |
+    // Runs isNormalized() on the Normalizer2 for mode (filtered when options has |
+    // UNORM_UNICODE_3_2); FALSE is returned if no normalizer is available. |
+    const Normalizer2 *base=Normalizer2Factory::getInstance(mode, status); |
+    if(U_FAILURE(status)) { |
+        return FALSE; |
+    } |
+    if((options&UNORM_UNICODE_3_2)==0) { |
+        return base->isNormalized(source, status); |
+    } |
+    FilteredNormalizer2 filtered(*base, *uniset_getUnicode32Instance(status)); |
+    return filtered.isNormalized(source, status); |
+} |
+ |
+UnicodeString & U_EXPORT2 |
+Normalizer::concatenate(UnicodeString &left, UnicodeString &right, |
+                        UnicodeString &result, |
+                        UNormalizationMode mode, int32_t options, |
+                        UErrorCode &errorCode) { |
+    // Sets result to left with right appended through the append() of the |
+    // Normalizer2 for mode; result is returned in every case. |
+    if(U_FAILURE(errorCode)) { |
+        result.setToBogus(); |
+        return result; |
+    } |
+    if(left.isBogus() || right.isBogus()) { |
+        result.setToBogus(); |
+        errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
+        return result; |
+    } |
+    // right has to stay intact until it has been appended, so the output is built |
+    // in a scratch string when right and result are the same object. |
+    UnicodeString scratch; |
+    const UBool aliased=(&right==&result); |
+    UnicodeString &target=aliased ? scratch : result; |
+    target=left; |
+    const Normalizer2 *base=Normalizer2Factory::getInstance(mode, errorCode); |
+    if(U_SUCCESS(errorCode)) { |
+        if((options&UNORM_UNICODE_3_2)==0) { |
+            base->append(target, right, errorCode); |
+        } else { |
+            FilteredNormalizer2 filtered(*base, *uniset_getUnicode32Instance(errorCode)); |
+            filtered.append(target, right, errorCode); |
+        } |
+    } |
+    if(aliased && U_SUCCESS(errorCode)) { |
+        result=scratch; |
+    } |
+    return result; |
+} |
+ |
+//------------------------------------------------------------------------- |
+// Iteration API |
+//------------------------------------------------------------------------- |
+ |
+/** |
+ * Returns the code point at the current buffer position without advancing. |
+ * An exhausted buffer is refilled via nextNormalize(); if that fails, DONE. |
+ */ |
+UChar32 Normalizer::current() { |
+    if(bufferPos>=buffer.length() && !nextNormalize()) { |
+        return DONE; |
+    } |
+    return buffer.char32At(bufferPos); |
+} |
+ |
+/** |
+ * Returns the code point at the current position of the normalized text and |
+ * advances past it (by UTF_CHAR_LENGTH of that code point). Returns DONE |
+ * once nextNormalize() cannot supply any further text. |
+ */ |
+UChar32 Normalizer::next() { |
+    if(bufferPos>=buffer.length() && !nextNormalize()) { |
+        return DONE; |
+    } |
+    // Read the code point at bufferPos, then step over all of its code units. |
+    const UChar32 codePoint=buffer.char32At(bufferPos); |
+    bufferPos+=UTF_CHAR_LENGTH(codePoint); |
+    return codePoint; |
+} |
+ |
+/** |
+ * Steps back over the code point that precedes the current position in the |
+ * normalized text and returns it. Returns DONE once previousNormalize() |
+ * cannot supply any earlier text. |
+ */ |
+UChar32 Normalizer::previous() { |
+    if(bufferPos<=0 && !previousNormalize()) { |
+        return DONE; |
+    } |
+    // Read the code point found at bufferPos-1, then step back over all of its code units. |
+    const UChar32 codePoint=buffer.char32At(bufferPos-1); |
+    bufferPos-=UTF_CHAR_LENGTH(codePoint); |
+    return codePoint; |
+} |
+ |
+void Normalizer::reset() { |
+    clearBuffer();  // drop any buffered normalized output |
+    currentIndex=nextIndex=text->setToStart();  // both indexes at the start of the input |
+} |
+ |
+// Moves the input iterator to index without normalizing anything and clears the buffer. |
+void Normalizer::setIndexOnly(int32_t index) { |
+    clearBuffer(); |
+    text->setIndex(index);  // the iterator pins index; the pinned value is read back below |
+    currentIndex=nextIndex=text->getIndex(); |
+} |
+ |
+/** |
+ * Rewinds to the start of the input via reset(), then returns the first normalized code point via next(). |
+ */ |
+UChar32 Normalizer::first() { |
+    reset(); |
+    const UChar32 head=next(); |
+    return head; |
+} |
+ |
+/** |
+ * Moves to the end of the input and returns the last code point of the |
+ * normalized text via previous(). Afterwards the position is just before |
+ * the input text corresponding to that normalized character. |
+ */ |
+UChar32 Normalizer::last() { |
+    clearBuffer(); |
+    currentIndex=nextIndex=text->setToEnd(); |
+    return previous(); |
+} |
+ |
+/** |
+ * Retrieve the current iteration position in the input text that is |
+ * being normalized.  This is useful in applications such as searching, |
+ * where you need to determine the position in the input text that |
+ * corresponds to a given normalized output character. |
+ * <p> |
+ * <b>Note:</b> This method reports a position in the <em>input</em>, while |
+ * {@link #next} and {@link #previous} iterate through characters in the |
+ * <em>output</em>.  This means that there is not necessarily a one-to-one |
+ * correspondence between characters returned by <tt>next</tt> and |
+ * <tt>previous</tt> and the indices passed to and returned from |
+ * <tt>setIndex</tt> and {@link #getIndex}.  While buffered output is still |
+ * unread, currentIndex is returned; otherwise nextIndex. |
+ */ |
+int32_t Normalizer::getIndex() const { |
+    // Unread output remains in the buffer: report currentIndex rather than nextIndex. |
+    if(bufferPos<buffer.length()) { |
+        return currentIndex; |
+    } |
+    return nextIndex; |
+} |
+ |
+/** |
+ * Returns the start index of the input text as reported by the underlying |
+ * <tt>CharacterIterator</tt> (for a <tt>String</tt> input, its start, i.e. 0). |
+ */ |
+int32_t Normalizer::startIndex() const { |
+    const int32_t inputStart=text->startIndex(); |
+    return inputStart; |
+} |
+ |
+/** |
+ * Returns the end index of the input text as reported by the underlying |
+ * <tt>CharacterIterator</tt> (for a <tt>String</tt> input, its length). |
+ */ |
+int32_t Normalizer::endIndex() const { |
+    const int32_t inputEnd=text->endIndex(); |
+    return inputEnd; |
+} |
+ |
+//------------------------------------------------------------------------- |
+// Property access methods |
+//------------------------------------------------------------------------- |
+ |
+// Switches the normalization mode and re-runs init() so that fNorm2 matches it. |
+void Normalizer::setMode(UNormalizationMode newMode) { |
+    fUMode=newMode; |
+    // init() reads fUMode, so it has to run after the assignment. |
+    init(); |
+} |
+ |
+// Returns the normalization mode stored in fUMode. |
+UNormalizationMode Normalizer::getUMode() const { |
+    const UNormalizationMode activeMode=fUMode; |
+    return activeMode; |
+} |
+ |
+// Sets (value true) or clears (value false) the given option bit(s) in fOptions, |
+// then re-runs init() so that fNorm2 reflects the new options. |
+void |
+Normalizer::setOption(int32_t option, |
+                      UBool value) |
+{ |
+    const int32_t withBits=fOptions | option; |
+    const int32_t withoutBits=fOptions & ~option; |
+    fOptions=value ? withBits : withoutBits; |
+    init(); |
+} |
+ |
+// Reports whether any of the bits in option are currently set in fOptions. |
+UBool Normalizer::getOption(int32_t option) const { |
+    const int32_t selected=fOptions & option; |
+    return selected != 0; |
+} |
+ |
+/** |
+ * Replaces the input text with a new iterator over newText and restarts iteration |
+ * at its beginning. If the allocation fails, the old text is kept and status is set. |
+ */ |
+void |
+Normalizer::setText(const UnicodeString& newText, |
+                    UErrorCode &status) |
+{ |
+    if (U_SUCCESS(status)) { |
+        CharacterIterator *replacement = new StringCharacterIterator(newText); |
+        if (replacement != NULL) { |
+            // Swap only once the new iterator exists, so a failure keeps the old text. |
+            delete text; |
+            text = replacement; |
+            reset(); |
+        } else { |
+            status = U_MEMORY_ALLOCATION_ERROR; |
+        } |
+    } |
+} |
+ |
+/** |
+ * Replaces the input text with a clone of newText and restarts iteration at its |
+ * beginning. If cloning yields NULL, the old text is kept and status is set. |
+ */ |
+void |
+Normalizer::setText(const CharacterIterator& newText, |
+                    UErrorCode &status) |
+{ |
+    if (U_SUCCESS(status)) { |
+        CharacterIterator *replacement = newText.clone(); |
+        if (replacement != NULL) { |
+            // Swap only once the clone exists, so a failure keeps the old text. |
+            delete text; |
+            text = replacement; |
+            reset(); |
+        } else { |
+            status = U_MEMORY_ALLOCATION_ERROR; |
+        } |
+    } |
+} |
+ |
+void |
+Normalizer::setText(const UChar* newText, |
+                    int32_t length, |
+                    UErrorCode &status) |
+{ |
+    if (U_SUCCESS(status)) { |
+        CharacterIterator *replacement = new UCharCharacterIterator(newText, length); |
+        if (replacement != NULL) { |
+            // Swap only once the new iterator exists, so a failure keeps the old text. |
+            delete text; |
+            text = replacement; |
+            reset(); |
+        } else { |
+            status = U_MEMORY_ALLOCATION_ERROR; |
+        } |
+    } |
+} |
+ |
+/** |
+ * Fills "result" with the text under iteration by delegating to the text iterator. |
+ * @param result Output string; receives the iterator's text. |
+ */ |
+void |
+Normalizer::getText(UnicodeString& result) { |
+    CharacterIterator &input=*text; |
+    input.getText(result); |
+} |
+ |
+//------------------------------------------------------------------------- |
+// Private utility methods |
+//------------------------------------------------------------------------- |
+ |
+void Normalizer::clearBuffer() { |
+    bufferPos=0;      // read position back to the start |
+    buffer.remove();  // and empty the buffered text |
+} |
+ |
+// Normalizes the input segment that starts at nextIndex into buffer: the first code |
+// point plus everything up to the next code point that has a boundary before it. |
+UBool Normalizer::nextNormalize() { |
+    clearBuffer(); |
+    currentIndex=nextIndex; |
+    text->setIndex(nextIndex); |
+    if(!text->hasNext()) { |
+        return FALSE; |
+    } |
+    UnicodeString segment(text->next32PostInc());  // always take one code point, to make progress |
+    while(text->hasNext()) { |
+        const UChar32 following=text->next32PostInc(); |
+        if(fNorm2->hasBoundaryBefore(following)) { |
+            text->move32(-1, CharacterIterator::kCurrent);  // un-read it; it starts the next segment |
+            break; |
+        } |
+        segment.append(following); |
+    } |
+    nextIndex=text->getIndex(); |
+    UErrorCode errorCode=U_ZERO_ERROR; |
+    fNorm2->normalize(segment, buffer, errorCode); |
+    return U_SUCCESS(errorCode) && !buffer.isEmpty(); |
+} |
+ |
+// Normalizes the input segment that ends at currentIndex into buffer, scanning backwards |
+// until a code point with a normalization boundary before it has been included. |
+UBool Normalizer::previousNormalize() { |
+    clearBuffer(); |
+    nextIndex=currentIndex; |
+    text->setIndex(currentIndex); |
+    if(!text->hasPrevious()) { |
+        return FALSE; |
+    } |
+    UnicodeString segment; |
+    UBool boundaryFound=FALSE; |
+    while(!boundaryFound && text->hasPrevious()) { |
+        const UChar32 preceding=text->previous32(); |
+        segment.insert(0, preceding);  // prepend, because the input is walked backwards |
+        boundaryFound=fNorm2->hasBoundaryBefore(preceding); |
+    } |
+    currentIndex=text->getIndex(); |
+    UErrorCode errorCode=U_ZERO_ERROR; |
+    fNorm2->normalize(segment, buffer, errorCode); |
+    bufferPos=buffer.length();  // backward iteration continues from the end of the buffer |
+    return U_SUCCESS(errorCode) && !buffer.isEmpty(); |
+} |
+ |
+U_NAMESPACE_END |
+ |
+#endif /* #if !UCONFIG_NO_NORMALIZATION */ |
Property changes on: icu46/source/common/normlzr.cpp |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |