Index: source/common/normalizer2impl.cpp |
diff --git a/source/common/normalizer2impl.cpp b/source/common/normalizer2impl.cpp |
index 9c00c1c818d12aff921fe62a82a14b6517ff527b..ec4809c4630995c13a4bedbba3ca49b60be755aa 100644 |
--- a/source/common/normalizer2impl.cpp |
+++ b/source/common/normalizer2impl.cpp |
@@ -1,7 +1,7 @@ |
/* |
******************************************************************************* |
* |
-* Copyright (C) 2009-2013, International Business Machines |
+* Copyright (C) 2009-2014, International Business Machines |
* Corporation and others. All Rights Reserved. |
* |
******************************************************************************* |
@@ -253,50 +253,12 @@ struct CanonIterData : public UMemory { |
}; |
Normalizer2Impl::~Normalizer2Impl() { |
- udata_close(memory); |
- utrie2_close(normTrie); |
delete fCanonIterData; |
} |
-UBool U_CALLCONV |
-Normalizer2Impl::isAcceptable(void *context, |
- const char * /* type */, const char * /*name*/, |
- const UDataInfo *pInfo) { |
- if( |
- pInfo->size>=20 && |
- pInfo->isBigEndian==U_IS_BIG_ENDIAN && |
- pInfo->charsetFamily==U_CHARSET_FAMILY && |
- pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */ |
- pInfo->dataFormat[1]==0x72 && |
- pInfo->dataFormat[2]==0x6d && |
- pInfo->dataFormat[3]==0x32 && |
- pInfo->formatVersion[0]==2 |
- ) { |
- Normalizer2Impl *me=(Normalizer2Impl *)context; |
- uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4); |
- return TRUE; |
- } else { |
- return FALSE; |
- } |
-} |
- |
void |
-Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) { |
- if(U_FAILURE(errorCode)) { |
- return; |
- } |
- memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode); |
- if(U_FAILURE(errorCode)) { |
- return; |
- } |
- const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory); |
- const int32_t *inIndexes=(const int32_t *)inBytes; |
- int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4; |
- if(indexesLength<=IX_MIN_MAYBE_YES) { |
- errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes. |
- return; |
- } |
- |
+Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie, |
+ const uint16_t *inExtraData, const uint8_t *inSmallFCD) { |
minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; |
minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; |
@@ -306,23 +268,12 @@ Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &err |
limitNoNo=inIndexes[IX_LIMIT_NO_NO]; |
minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; |
- int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET]; |
- int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; |
- normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, |
- inBytes+offset, nextOffset-offset, NULL, |
- &errorCode); |
- if(U_FAILURE(errorCode)) { |
- return; |
- } |
+ normTrie=inTrie; |
- offset=nextOffset; |
- nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; |
- maybeYesCompositions=(const uint16_t *)(inBytes+offset); |
+ maybeYesCompositions=inExtraData; |
extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes); |
- // smallFCD: new in formatVersion 2 |
- offset=nextOffset; |
- smallFCD=inBytes+offset; |
+ smallFCD=inSmallFCD; |
// Build tccc180[]. |
// gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. |
@@ -357,9 +308,71 @@ uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, co |
} |
} |
+namespace { |
+ |
+class LcccContext { /* context object handed to utrie2_enum() by addLcccChars(); collects code points whose FCD16 value is >0xff */ |
+public: |
+    LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {} |
+ |
+    void handleRange(UChar32 start, UChar32 end, uint16_t norm16) { /* called once per [start, end] range that shares one norm16 value */ |
+        if(impl.isAlgorithmicNoNo(norm16)) { |
+            // Range of code points with same-norm16-value algorithmic decompositions. |
+            // They might have different non-zero FCD16 values, so each code point is tested on its own. |
+            do { |
+                uint16_t fcd16=impl.getFCD16(start); |
+                if(fcd16>0xff) { set.add(start); } /* NOTE(review): >0xff presumably means a non-zero lead combining class in the upper byte -- confirm against getFCD16() */ |
+            } while(++start<=end); |
+        } else { /* otherwise only the FCD16 of start is looked up and the decision is applied to the whole range */ |
+            uint16_t fcd16=impl.getFCD16(start); |
+            if(fcd16>0xff) { set.add(start, end); } |
+        } |
+    } |
+ |
+private: |
+    const Normalizer2Impl &impl; /* queried for isAlgorithmicNoNo() and getFCD16() */ |
+    UnicodeSet &set; /* caller's set; receives the matching code points */ |
+}; |
+ |
+struct PropertyStartsContext { /* context passed through utrie2_enum() to enumNorm16PropertyStartsRange() */ |
+    PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder) |
+            : impl(ni), sa(adder) {} |
+ |
+    const Normalizer2Impl &impl; /* queried for isAlgorithmicNoNo() and getFCD16() */ |
+    const USetAdder *sa; /* caller-supplied adder: provides add() and the target set */ |
+}; |
+ |
+} // namespace |
+ |
U_CDECL_BEGIN |
static UBool U_CALLCONV |
+enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { /* utrie2_enum() range callback; context is an LcccContext */ |
+    ((LcccContext *)context)->handleRange(start, end, (uint16_t)value); /* the trie value is narrowed to a 16-bit norm16 */ |
+    return TRUE; /* always TRUE; presumably tells utrie2_enum() to keep enumerating -- confirm */ |
+} |
+ |
+static UBool U_CALLCONV |
+enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { |
+    /* utrie2_enum() range callback (context is a PropertyStartsContext): always add the start code point to the USet */ |
+    const PropertyStartsContext *ctx=(const PropertyStartsContext *)context; |
+    const USetAdder *sa=ctx->sa; |
+    sa->add(sa->set, start); |
+    if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) { |
+        // Range of code points with same-norm16-value algorithmic decompositions. |
+        // They might have different non-zero FCD16 values: also add every code point where FCD16 changes. |
+        uint16_t prevFCD16=ctx->impl.getFCD16(start); |
+        while(++start<=end) { |
+            uint16_t fcd16=ctx->impl.getFCD16(start); |
+            if(fcd16!=prevFCD16) { |
+                sa->add(sa->set, start); |
+                prevFCD16=fcd16; |
+            } |
+        } |
+    } |
+    return TRUE; |
+} |
+ |
+static UBool U_CALLCONV |
enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { |
/* add the start code point to the USet */ |
const USetAdder *sa=(const USetAdder *)context; |
@@ -375,9 +388,17 @@ segmentStarterMapper(const void * /*context*/, uint32_t value) { |
U_CDECL_END |
void |
+Normalizer2Impl::addLcccChars(UnicodeSet &set) const { |
+    /* enumerate the ranges of normTrie; LcccContext adds every code point whose FCD16 value is >0xff to set */ |
+    LcccContext context(*this, set); |
+    utrie2_enum(normTrie, NULL, enumLcccRange, &context); |
+} |
+ |
+void |
Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { |
/* add the start code point of each same-value range of each trie */ |
- utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa); |
+ PropertyStartsContext context(*this, sa); |
+ utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context); |
/* add Hangul LV syllables and LV+1 because of skippables */ |
for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) { |
@@ -419,6 +440,38 @@ Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src, |
return src; |
} |
+UnicodeString & |
+Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest, |
+                           UErrorCode &errorCode) const { /* UnicodeString front end for the pointer-based decompose() overload; always returns dest */ |
+    if(U_FAILURE(errorCode)) { /* an incoming failure makes dest bogus and does no work */ |
+        dest.setToBogus(); |
+        return dest; |
+    } |
+    const UChar *sArray=src.getBuffer(); |
+    if(&dest==&src || sArray==NULL) { /* src and dest must be distinct objects, and src must expose a buffer */ |
+        errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
+        dest.setToBogus(); |
+        return dest; |
+    } |
+    decompose(sArray, sArray+src.length(), dest, src.length(), errorCode); /* src.length() is also passed as destLengthEstimate */ |
+    return dest; |
+} |
+ |
+void |
+Normalizer2Impl::decompose(const UChar *src, const UChar *limit, |
+                           UnicodeString &dest, |
+                           int32_t destLengthEstimate, |
+                           UErrorCode &errorCode) const { |
+    if(destLengthEstimate<0 && limit!=NULL) { /* negative estimate: fall back to the source length when limit is known */ |
+        destLengthEstimate=(int32_t)(limit-src); |
+    } |
+    dest.remove(); /* dest is emptied first; note this happens without a prior U_FAILURE(errorCode) check */ |
+    ReorderingBuffer buffer(*this, dest); |
+    if(buffer.init(destLengthEstimate, errorCode)) { /* decompose into the buffer only when init() returns true */ |
+        decompose(src, limit, &buffer, errorCode); |
+    } |
+} |
+ |
// Dual functionality: |
// buffer!=NULL: normalize |
// buffer==NULL: isNormalized/spanQuickCheckYes |