Index: source/test/intltest/colldata.cpp |
diff --git a/source/test/intltest/colldata.cpp b/source/test/intltest/colldata.cpp |
deleted file mode 100644 |
index abbfac2c9fd8086674bdd79832c9a466b4000895..0000000000000000000000000000000000000000 |
--- a/source/test/intltest/colldata.cpp |
+++ /dev/null |
@@ -1,652 +0,0 @@ |
-/* |
- ****************************************************************************** |
- * Copyright (C) 1996-2014, International Business Machines |
- * Corporation and others. All Rights Reserved. |
- ****************************************************************************** |
- */ |
- |
-#include "unicode/utypes.h" |
- |
-#if !UCONFIG_NO_COLLATION |
- |
-#include "unicode/unistr.h" |
-#include "unicode/usearch.h" |
- |
-#include "cmemory.h" |
-#include "unicode/coll.h" |
-#include "unicode/tblcoll.h" |
-#include "unicode/coleitr.h" |
-#include "unicode/ucoleitr.h" |
- |
-#include "unicode/regex.h" // TODO: make conditional on regexp being built. |
- |
-#include "unicode/uniset.h" |
-#include "unicode/uset.h" |
-#include "unicode/usetiter.h" |
-#include "unicode/ustring.h" |
-#include "hash.h" |
-#include "normalizer2impl.h" |
-#include "uhash.h" |
-#include "usrchimp.h" |
-#include "uassert.h" |
- |
-#include "colldata.h" |
- |
-// Element count of a true array (not a pointer). The argument is parenthesized |
-// so that arbitrary expressions can be passed safely. |
-#define ARRAY_SIZE(array) (sizeof(array)/sizeof((array)[0])) |
-// Allocates raw storage for `count` objects of `type` through uprv_malloc. |
-// Callers in this file compare the result against NULL. The expansion is |
-// wrapped in parentheses so it behaves as a single expression. |
-#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type))) |
-// Releases storage through uprv_free. |
-#define DELETE_ARRAY(array) uprv_free((void *) (array)) |
-// Copies `count` elements from `src` to `dst` with uprv_memcpy. |
-#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof((src)[0])) |
- |
-/** |
- * Builds the list of collation elements (CEs) that `coll` produces for `string`. |
- * |
- * Every CE is masked down to the levels selected by the collator's strength. |
- * When alternate handling is UCOL_SHIFTED, a masked CE that is below the |
- * variable top and has a non-zero primary weight is reduced to its primary |
- * weight (strength >= UCOL_QUATERNARY) or replaced by UCOL_IGNORABLE. |
- * Ignorable CEs are not stored. The continuation flag seen on the raw CE is |
- * re-applied after masking. |
- * |
- * @param coll   collator that generates the CEs |
- * @param string text to convert |
- * @param status in/out error code; on failure the list may be empty or partial |
- */ |
-CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status) |
-    : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0) |
-{ |
-    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); |
-    UCollationStrength strength = ucol_getStrength(coll); |
-    UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; |
-    uint32_t variableTop = ucol_getVariableTop(coll, &status); |
-    uint32_t strengthMask = 0; |
-    int32_t order; |
- |
-    if (U_FAILURE(status)) { |
-        // The iterator may have been opened successfully before one of the |
-        // later queries failed; release it instead of leaking it. |
-        if (elems != NULL) { |
-            ucol_closeElements(elems); |
-        } |
- |
-        return; |
-    } |
- |
-    // **** only set flag if string has Han(gul) **** |
-    // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476 |
- |
-    // Accumulate the mask from the weakest selected level down to primary. |
-    switch (strength) |
-    { |
-    default: |
-        strengthMask |= UCOL_TERTIARYORDERMASK; |
-        /* fall through */ |
- |
-    case UCOL_SECONDARY: |
-        strengthMask |= UCOL_SECONDARYORDERMASK; |
-        /* fall through */ |
- |
-    case UCOL_PRIMARY: |
-        strengthMask |= UCOL_PRIMARYORDERMASK; |
-    } |
- |
-    // Start out in the fixed buffer; add() switches to heap storage when it fills up. |
-    ces = ceBuffer; |
- |
-    while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { |
-        // Remember the flag before the mask is applied. |
-        UBool cont = isContinuation(order); |
- |
-        order &= strengthMask; |
- |
-        if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) { |
-            if (strength >= UCOL_QUATERNARY) { |
-                order &= UCOL_PRIMARYORDERMASK; |
-            } else { |
-                order = UCOL_IGNORABLE; |
-            } |
-        } |
- |
-        if (order == UCOL_IGNORABLE) { |
-            continue; |
-        } |
- |
-        if (cont) { |
-            order |= UCOL_CONTINUATION_MARKER; |
-        } |
- |
-        add(order, status); |
-    } |
- |
-    ucol_closeElements(elems); |
-} |
- |
-/** Releases the CE storage unless `ces` still points at `ceBuffer`. */ |
-CEList::~CEList() |
-{ |
-    if (ces == ceBuffer) { |
-        // Nothing was allocated with NEW_ARRAY. |
-        return; |
-    } |
- |
-    DELETE_ARRAY(ces); |
-} |
- |
-/** |
- * Appends one CE to the list, growing the storage by CELIST_BUFFER_SIZE |
- * entries when it is full. |
- * |
- * @param ce     the collation element to append |
- * @param status in/out error code; nothing happens if it already indicates a |
- *               failure, and it is set to U_MEMORY_ALLOCATION_ERROR if the |
- *               storage cannot be grown |
- */ |
-void CEList::add(uint32_t ce, UErrorCode &status) |
-{ |
-    if (U_FAILURE(status)) { |
-        return; |
-    } |
- |
-    if (listSize < listMax) { |
-        // Room left: the common case. |
-        ces[listSize++] = ce; |
-        return; |
-    } |
- |
-    const int32_t grownMax = listMax + CELIST_BUFFER_SIZE; |
-    uint32_t *grown = NEW_ARRAY(uint32_t, grownMax); |
- |
-    if (grown == NULL) { |
-        status = U_MEMORY_ALLOCATION_ERROR; |
-        return; |
-    } |
- |
-    uprv_memcpy(grown, ces, listSize * sizeof(uint32_t)); |
- |
-    // Only storage that came from NEW_ARRAY may be freed. |
-    if (ces != ceBuffer) { |
-        DELETE_ARRAY(ces); |
-    } |
- |
-    ces = grown; |
-    listMax = grownMax; |
-    ces[listSize++] = ce; |
-} |
- |
-/** |
- * Bounds-checked element access. |
- * |
- * @param index position in the list |
- * @return the CE at `index`, or UCOL_NULLORDER (as uint32_t) when `index` is |
- *         outside [0, size()) |
- */ |
-uint32_t CEList::get(int32_t index) const |
-{ |
-    const UBool inRange = index >= 0 && index < listSize; |
- |
-    return inRange ? ces[index] : (uint32_t)UCOL_NULLORDER; |
-} |
- |
-/** |
- * Unchecked element access; the caller must pass an index inside [0, size()). |
- * |
- * @param index position in the list |
- * @return reference to the stored CE |
- */ |
-uint32_t &CEList::operator[](int32_t index) const |
-{ |
-    return *(ces + index); |
-} |
- |
-/** |
- * Tests whether the CEs of `other` occur in this list starting at `offset`. |
- * |
- * @param offset index in this list at which the comparison starts |
- * @param other  list to look for; may be NULL |
- * @return TRUE if every CE of `other` equals the CE at the corresponding |
- *         position of this list; FALSE if `other` is NULL, `offset` is |
- *         negative, `other` does not fit behind `offset`, or a CE differs |
- */ |
-UBool CEList::matchesAt(int32_t offset, const CEList *other) const |
-{ |
-    // A negative offset would pass the size check below and then index |
-    // in front of the storage, so reject it up front. |
-    if (other == NULL || offset < 0) { |
-        return FALSE; |
-    } |
- |
-    const int32_t otherSize = other->size(); |
- |
-    if (listSize - offset < otherSize) { |
-        return FALSE; |
-    } |
- |
-    for (int32_t j = 0; j < otherSize; j += 1) { |
-        if (ces[offset + j] != (*other)[j]) { |
-            return FALSE; |
-        } |
-    } |
- |
-    return TRUE; |
-} |
- |
-/** @return the number of CEs currently stored in the list */ |
-int32_t CEList::size() const |
-{ |
-    return this->listSize; |
-} |
- |
-/** |
- * Creates an empty list with room for STRING_LIST_BUFFER_SIZE strings. |
- * |
- * @param status in/out error code; no storage is allocated if it already |
- *               indicates a failure, and it is set to |
- *               U_MEMORY_ALLOCATION_ERROR if the allocation yields NULL |
- */ |
-StringList::StringList(UErrorCode &status) |
-    : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0) |
-{ |
-    if (U_SUCCESS(status)) { |
-        strings = new UnicodeString [listMax]; |
- |
-        if (strings == NULL) { |
-            status = U_MEMORY_ALLOCATION_ERROR; |
-        } |
-    } |
-} |
- |
-/** Destroys the string array owned by the list. */ |
-StringList::~StringList() |
-{ |
-    delete[] this->strings; |
-} |
- |
-/** |
- * Appends a copy of `*string` to the list, growing the array by |
- * STRING_LIST_BUFFER_SIZE slots when it is full. |
- * |
- * @param string the string to copy into the list |
- * @param status in/out error code; nothing happens if it already indicates a |
- *               failure, and it is set to U_MEMORY_ALLOCATION_ERROR if the |
- *               larger array cannot be allocated |
- */ |
-void StringList::add(const UnicodeString *string, UErrorCode &status) |
-{ |
-    if (U_FAILURE(status)) { |
-        return; |
-    } |
- |
-    const UBool full = listSize >= listMax; |
- |
-    if (full) { |
-        const int32_t grownMax = listMax + STRING_LIST_BUFFER_SIZE; |
-        UnicodeString *grown = new UnicodeString[grownMax]; |
- |
-        if (grown == NULL) { |
-            status = U_MEMORY_ALLOCATION_ERROR; |
-            return; |
-        } |
- |
-        int32_t i = 0; |
-        while (i < listSize) { |
-            grown[i] = strings[i]; |
-            ++i; |
-        } |
- |
-        delete[] strings; |
-        strings = grown; |
-        listMax = grownMax; |
-    } |
- |
-    // Unused slots are default-constructed, i.e. empty, so appending |
-    // to the next free slot amounts to copying the source string. |
-    UnicodeString &slot = strings[listSize]; |
-    listSize += 1; |
-    slot.append(*string); |
-} |
- |
-/** |
- * Appends the string made of the first `count` UChars at `chars`. |
- * |
- * @param chars  source characters |
- * @param count  number of UChars to take |
- * @param status in/out error code, handled by the UnicodeString overload |
- */ |
-void StringList::add(const UChar *chars, int32_t count, UErrorCode &status) |
-{ |
-    const UnicodeString wrapped(chars, count); |
- |
-    this->add(&wrapped, status); |
-} |
- |
-/** |
- * Bounds-checked element access. |
- * |
- * @param index position in the list |
- * @return pointer to the string at `index`, or NULL when `index` is outside |
- *         [0, size()) |
- */ |
-const UnicodeString *StringList::get(int32_t index) const |
-{ |
-    if (index < 0 || index >= listSize) { |
-        return NULL; |
-    } |
- |
-    return strings + index; |
-} |
- |
-/** @return the number of strings currently stored in the list */ |
-int32_t StringList::size() const |
-{ |
-    return this->listSize; |
-} |
- |
- |
-U_CDECL_BEGIN |
-/** |
- * Deleter callback with C linkage: destroys the StringList that `obj` points to. |
- * It is installed as the value deleter of the hash table in CEToStringsMap. |
- */ |
-static void U_CALLCONV |
-deleteStringList(void *obj) |
-{ |
-    delete static_cast<StringList *>(obj); |
-} |
-U_CDECL_END |
- |
-/** |
- * Maps a CE value to the list of strings that were registered under it. |
- * The lists are stored as values of a UHashtable keyed by the CE. |
- */ |
-class CEToStringsMap |
-{ |
-    // Hash table from CE (integer key) to StringList *. |
-    UHashtable *map; |
- |
-    // Stores `stringList` under `ce` in the table. |
-    void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status); |
- |
-public: |
-    CEToStringsMap(UErrorCode &status); |
-    ~CEToStringsMap(); |
- |
-    // Adds `string` to the list registered for `ce`, creating the list on first use. |
-    void put(uint32_t ce, UnicodeString *string, UErrorCode &status); |
- |
-    // Returns the list registered for `ce`, as found by uhash_iget. |
-    StringList *getStringList(uint32_t ce) const; |
-}; |
- |
-/** |
- * Opens the underlying hash table and installs deleteStringList as its |
- * value deleter. |
- * |
- * @param status in/out error code; the table is not opened if it already |
- *               indicates a failure |
- */ |
-CEToStringsMap::CEToStringsMap(UErrorCode &status) |
-    : map(NULL) |
-{ |
-    if (U_FAILURE(status)) { |
-        return; |
-    } |
- |
-    // NOTE(review): the values stored in this table are StringList pointers, yet the |
-    // value comparator is the caseless UnicodeString one - confirm this is intended. |
-    map = uhash_open(uhash_hashLong, uhash_compareLong, |
-                     uhash_compareCaselessUnicodeString, |
-                     &status); |
- |
-    if (U_SUCCESS(status)) { |
-        uhash_setValueDeleter(map, deleteStringList); |
-    } |
-} |
- |
-/** Closes the underlying hash table. */ |
-CEToStringsMap::~CEToStringsMap() |
-{ |
-    uhash_close(this->map); |
-} |
- |
-/** |
- * Adds `string` to the list registered for `ce`, creating and registering |
- * the list first when `ce` has none yet. |
- * |
- * @param ce     CE under which the string is filed |
- * @param string string to add; the list stores a copy |
- * @param status in/out error code; nothing happens if it already indicates a |
- *               failure (the incoming code is preserved), and it is set to |
- *               U_MEMORY_ALLOCATION_ERROR if a new list cannot be allocated |
- */ |
-void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status) |
-{ |
-    if (U_FAILURE(status)) { |
-        return; |
-    } |
- |
-    StringList *strings = getStringList(ce); |
- |
-    if (strings == NULL) { |
-        strings = new StringList(status); |
- |
-        if (strings == NULL) { |
-            status = U_MEMORY_ALLOCATION_ERROR; |
-            return; |
-        } |
- |
-        if (U_FAILURE(status)) { |
-            // The list was never registered, so it has to be freed here. |
-            delete strings; |
-            return; |
-        } |
- |
-        putStringList(ce, strings, status); |
- |
-        if (U_FAILURE(status)) { |
-            // NOTE(review): uhash_iput is expected to dispose of the value through |
-            // the value deleter when it fails - confirm. Either way the list must |
-            // not be touched any more. |
-            return; |
-        } |
-    } |
- |
-    strings->add(string, status); |
-} |
- |
-/** |
- * Looks up the list registered for `ce`. |
- * |
- * @param ce CE used as the integer key |
- * @return the value uhash_iget yields for `ce`, as a StringList pointer |
- */ |
-StringList *CEToStringsMap::getStringList(uint32_t ce) const |
-{ |
-    void *found = uhash_iget(map, ce); |
- |
-    return (StringList *) found; |
-} |
- |
-/** |
- * Stores `stringList` in the hash table under the integer key `ce`. |
- * |
- * @param ce         CE used as the key |
- * @param stringList list to store |
- * @param status     in/out error code passed on to uhash_iput |
- */ |
-void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status) |
-{ |
-    void *value = (void *) stringList; |
- |
-    uhash_iput(map, ce, value, &status); |
-} |
- |
-#define CLONE_COLLATOR |
- |
-/** |
- * Records `text` in `map` under the first CE that `coll` generates for it. |
- * The map is left untouched if the CE list cannot be built. |
- */ |
-static void |
-mapStringByFirstCE(UCollator *coll, CEToStringsMap *map, UnicodeString &text, UErrorCode &status) |
-{ |
-    CEList ceList(coll, text, status); |
- |
-    if (U_SUCCESS(status)) { |
-        map->put(ceList.get(0), &text, status); |
-    } |
-} |
- |
-/** |
- * Builds the lookup data for `collator`: |
- *  - a map from a CE to the characters and strings whose CE list starts with it, |
- *    filled from the assigned non-[:c:] characters plus the collator's contractions |
- *    and expansions, minus the Han/Jamo/Hangul ranges that are handled implicitly; |
- *  - the CE range covered by unified ideographs (minHan, maxHan); |
- *  - the CE boundaries of the conjoining Jamo (jamoLimits). |
- * |
- * @param collator collator to analyze; a clone is kept when CLONE_COLLATOR is defined |
- * @param status   in/out error code; on failure the object is only partly initialized |
- */ |
-CollData::CollData(UCollator *collator, UErrorCode &status) |
-    : coll(NULL), ceToCharsStartingWith(NULL) |
-{ |
-    // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]] |
-    // i.e. other, control, private use, format, surrogate |
-    U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20); |
-    U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20); |
-    USet *charsToTest = uset_openPattern(test_pattern, 20, &status); |
- |
-    // Han ext. A, Han, Jamo, Hangul, Han Ext. B |
-    // i.e. all the characters we handle implicitly |
-    U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); |
-    U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); |
-    USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status); |
- |
-    // Declared before the first jump to `bail` so that no initialization is skipped. |
-    USet *expansions = NULL; |
-    USet *contractions = NULL; |
-    int32_t itemCount; |
- |
-    if (U_FAILURE(status)) { |
-        // One of the two sets may have been opened; `bail` closes whatever exists. |
-        goto bail; |
-    } |
- |
-    expansions = uset_openEmpty(); |
-    contractions = uset_openEmpty(); |
- |
-    if (expansions == NULL || contractions == NULL) { |
-        status = U_MEMORY_ALLOCATION_ERROR; |
-        goto bail; |
-    } |
- |
-    ceToCharsStartingWith = new CEToStringsMap(status); |
- |
-    if (ceToCharsStartingWith == NULL && U_SUCCESS(status)) { |
-        status = U_MEMORY_ALLOCATION_ERROR; |
-    } |
- |
-    if (U_FAILURE(status)) { |
-        goto bail; |
-    } |
- |
-#ifdef CLONE_COLLATOR |
-    coll = ucol_safeClone(collator, NULL, NULL, &status); |
- |
-    if (U_FAILURE(status)) { |
-        goto bail; |
-    } |
-#else |
-    coll = collator; |
-#endif |
- |
-    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); |
- |
-    if (U_FAILURE(status)) { |
-        goto bail; |
-    } |
- |
-    uset_addAll(charsToTest, contractions); |
-    uset_addAll(charsToTest, expansions); |
-    uset_removeAll(charsToTest, charsToRemove); |
- |
-    itemCount = uset_getItemCount(charsToTest); |
-    for(int32_t item = 0; item < itemCount; item += 1) { |
-        UChar32 start = 0, end = 0; |
-        UChar buffer[16]; |
-        int32_t len = uset_getItem(charsToTest, item, &start, &end, |
-                                   buffer, 16, &status); |
- |
-        if (U_FAILURE(status)) { |
-            // In particular `len` may exceed the buffer capacity here, |
-            // so the buffer must not be read. |
-            break; |
-        } |
- |
-        if (len == 0) { |
-            // A code point range: register every code point on its own. |
-            for (UChar32 ch = start; ch <= end && U_SUCCESS(status); ch += 1) { |
-                UnicodeString st(ch); |
- |
-                mapStringByFirstCE(coll, ceToCharsStartingWith, st, status); |
-            } |
-        } else if (len > 0) { |
-            // A multi-character string of the set. |
-            UnicodeString st(buffer, len); |
- |
-            mapStringByFirstCE(coll, ceToCharsStartingWith, st, status); |
-        } else { |
-            // shouldn't happen... |
-        } |
- |
-        if (U_FAILURE(status)) { |
-            break; |
-        } |
-    } |
- |
-bail: |
-    uset_close(contractions); |
-    uset_close(expansions); |
-    uset_close(charsToRemove); |
-    uset_close(charsToTest); |
- |
-    if (U_FAILURE(status)) { |
-        return; |
-    } |
- |
-    UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status); |
-    if (U_FAILURE(status)) { |
-        return; |
-    } |
-    // Collect the first and last code point of every ideograph range. |
-    UnicodeSetIterator hanIter(hanRanges); |
-    UnicodeString hanString; |
-    while(hanIter.nextRange()) { |
-        hanString.append(hanIter.getCodepoint()); |
-        hanString.append(hanIter.getCodepointEnd()); |
-    } |
-    // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9, |
-    // but as of Unicode 6.3 the 11xx block is filled, |
-    // and there are also more Jamo T at U+D7CB..U+D7FB. |
-    // Maybe use [:HST=T:] and look for the end of the last range? |
-    // Maybe use script boundary mappings instead of this code?? |
-    UChar jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF}; |
-    UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges)); |
-    CEList hanList(coll, hanString, status); |
-    CEList jamoList(coll, jamoString, status); |
- |
-    if (U_FAILURE(status)) { |
-        return; |
-    } |
- |
-    // jamoLimits is indexed up to [3] in this file, i.e. it holds four boundaries. |
-    const int32_t jamoLimitCount = 4; |
-    int32_t j = 0; |
- |
-    // Give every boundary a defined value in case fewer than four CEs qualify. |
-    for (int32_t k = 0; k < jamoLimitCount; k += 1) { |
-        jamoLimits[k] = 0; |
-    } |
- |
-    // Take the non-continuation CEs as boundaries, never writing past the fourth one. |
-    for (int32_t c = 0; c < jamoList.size() && j < jamoLimitCount; c += 1) { |
-        uint32_t jce = jamoList[c]; |
- |
-        if (! isContinuation(jce)) { |
-            jamoLimits[j++] = jce; |
-        } |
-    } |
- |
-    // Turn the last boundary into an exclusive upper limit. |
-    jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); |
- |
-    minHan = 0xFFFFFFFF; |
-    maxHan = 0; |
- |
-    // Only every second CE is inspected (see minLengthInChars: Han generate two CEs). |
-    for(int32_t h = 0; h < hanList.size(); h += 2) { |
-        uint32_t han = (uint32_t) hanList[h]; |
- |
-        if (han < minHan) { |
-            minHan = han; |
-        } |
- |
-        if (han > maxHan) { |
-            maxHan = han; |
-        } |
-    } |
- |
-    // Turn the maximum into an exclusive upper limit. |
-    maxHan += (1 << UCOL_PRIMARYORDERSHIFT); |
-} |
- |
-/** |
- * Destroys the CE-to-strings map and, when CLONE_COLLATOR is defined, |
- * closes the collator held by this object. |
- */ |
-CollData::~CollData() |
-{ |
-    delete ceToCharsStartingWith; |
- |
-#ifdef CLONE_COLLATOR |
-    ucol_close(coll); |
-#endif |
-} |
- |
-/** @return the collator held by this object */ |
-UCollator *CollData::getCollator() const |
-{ |
-    return this->coll; |
-} |
- |
-/** |
- * Looks up the strings registered under `ce`. |
- * |
- * @param ce CE to look up |
- * @return whatever the CE-to-strings map returns for `ce` |
- */ |
-const StringList *CollData::getStringList(int32_t ce) const |
-{ |
-    const StringList *found = ceToCharsStartingWith->getStringList(ce); |
- |
-    return found; |
-} |
- |
-/** |
- * Creates the CE list of `*string` for this object's collator. |
- * |
- * @param string text to convert |
- * @return a newly allocated list, to be released with freeCEList(), or NULL |
- *         if building the list failed |
- */ |
-const CEList *CollData::getCEList(const UnicodeString *string) const |
-{ |
-    UErrorCode status = U_ZERO_ERROR; |
-    const CEList *list = new CEList(coll, *string, status); |
- |
-    if (U_SUCCESS(status)) { |
-        return list; |
-    } |
- |
-    delete list; |
- |
-    return NULL; |
-} |
- |
-/** |
- * Destroys a CE list. |
- * |
- * @param list the list to delete |
- */ |
-void CollData::freeCEList(const CEList *list) |
-{ |
-    const CEList *doomed = list; |
- |
-    delete doomed; |
-} |
- |
-/** |
- * Computes the length of the shortest string whose CEs match `ceList` from |
- * `offset` to its end. |
- * |
- * The strings registered under the CE at `offset` are tried first. If none of |
- * them leads to a result, a CE inside the Han range or the Jamo range is |
- * accounted for as a single character. Results are remembered in `history`. |
- * |
- * @param ceList  the CE list to cover |
- * @param offset  index of the first CE to cover |
- * @param history per-offset cache; an entry >= 0 is returned as is |
- * @return the shortest length found, or -1 if the CEs cannot be covered |
- */ |
-int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const |
-{ |
-    // find out shortest string for the longest sequence of ces. |
-    // this can probably be folded with the minLengthCache... |
- |
-    const int32_t cached = history[offset]; |
- |
-    if (cached >= 0) { |
-        return cached; |
-    } |
- |
-    const uint32_t firstCE = ceList->get(offset); |
-    const int32_t ceCount = ceList->size(); |
-    int32_t best = INT32_MAX; |
-    const StringList *candidates = ceToCharsStartingWith->getStringList(firstCE); |
- |
-    if (candidates != NULL) { |
-        const int32_t candidateCount = candidates->size(); |
- |
-        for (int32_t s = 0; s < candidateCount; s += 1) { |
-            const UnicodeString *candidate = candidates->get(s); |
-            UErrorCode status = U_ZERO_ERROR; |
-            const CEList *candidateCEs = new CEList(coll, *candidate, status); |
- |
-            if (U_FAILURE(status)) { |
-                delete candidateCEs; |
-                candidateCEs = NULL; |
-            } |
- |
-            if (ceList->matchesAt(offset, candidateCEs)) { |
-                U_ASSERT(candidateCEs != NULL); |
-                const int32_t next = offset + candidateCEs->size(); |
-                int32_t tail = 0; |
-                UBool deadEnd = FALSE; |
- |
-                if (next < ceCount) { |
-                    tail = minLengthInChars(ceList, next, history); |
- |
-                    // ignore any dead ends |
-                    deadEnd = tail <= 0; |
-                } |
- |
-                if (! deadEnd) { |
-                    const int32_t total = candidate->length() + tail; |
- |
-                    if (best > total) { |
-                        best = total; |
-                    } |
-                } |
-            } |
- |
-            // Single point of release, reached for matches, dead ends and mismatches alike. |
-            delete candidateCEs; |
-        } |
-    } |
- |
-    if (best == INT32_MAX) { |
-        // No matching strings at this offset. See if |
-        // the CE is in a range we can handle manually. |
-        int32_t next; |
- |
-        if (firstCE >= minHan && firstCE < maxHan) { |
-            // all han have implicit orders which |
-            // generate two CEs. |
-            next = offset + 2; |
-        } else if (firstCE >= jamoLimits[0] && firstCE < jamoLimits[3]) { |
-            next = offset; |
- |
-            // **** this loop may not handle archaic Hangul correctly **** |
-            for (int32_t j = 0; next < ceCount && j < 4; j += 1, next += 1) { |
-                const uint32_t jce = ceList->get(next); |
- |
-                // Some Jamo have 24-bit primary order; skip the |
-                // 2nd CE. This should always be OK because if |
-                // we're still in the loop all we've seen are |
-                // a series of Jamo in LVT order. |
-                if (isContinuation(jce)) { |
-                    continue; |
-                } |
- |
-                if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) { |
-                    break; |
-                } |
-            } |
- |
-            if (next == offset) { |
-                // we started with a non-L Jamo... |
-                // just say it comes from a single character |
-                next += 1; |
- |
-                // See if the single Jamo has a 24-bit order. |
-                if (next < ceCount && isContinuation(ceList->get(next))) { |
-                    next += 1; |
-                } |
-            } |
-        } else { |
-            // Can't handle it manually either. Just move on. |
-            return -1; |
-        } |
- |
-        // Both manual cases count as one character plus whatever follows. |
-        int32_t tail = 0; |
- |
-        if (next < ceCount) { |
-            tail = minLengthInChars(ceList, next, history); |
-        } |
- |
-        if (tail < 0) { |
-            return -1; |
-        } |
- |
-        best = 1 + tail; |
-    } |
- |
-    history[offset] = best; |
- |
-    return best; |
-} |
- |
-/** |
- * Computes the length of the shortest string whose CEs match `ceList` from |
- * `offset` to its end, using a temporary per-offset cache. |
- * |
- * @param ceList the CE list to cover |
- * @param offset index of the first CE to cover |
- * @return the shortest length found, or -1 if the CEs cannot be covered, |
- *         `ceList` is NULL, `offset` is outside the list, or the cache |
- *         cannot be allocated |
- */ |
-int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const |
-{ |
-    if (ceList == NULL) { |
-        return -1; |
-    } |
- |
-    int32_t clength = ceList->size(); |
- |
-    // The cache is indexed by offset, so an offset outside the list |
-    // (which includes every offset of an empty list) must be rejected. |
-    if (offset < 0 || offset >= clength) { |
-        return -1; |
-    } |
- |
-    int32_t *history = NEW_ARRAY(int32_t, clength); |
- |
-    if (history == NULL) { |
-        return -1; |
-    } |
- |
-    // -1 marks an offset for which no result has been computed yet. |
-    for (int32_t i = 0; i < clength; i += 1) { |
-        history[i] = -1; |
-    } |
- |
-    int32_t minLength = minLengthInChars(ceList, offset, history); |
- |
-    DELETE_ARRAY(history); |
- |
-    return minLength; |
-} |
- |
-#endif // #if !UCONFIG_NO_COLLATION |