| Index: icu46/source/i18n/colldata.cpp
|
| ===================================================================
|
| --- icu46/source/i18n/colldata.cpp (revision 0)
|
| +++ icu46/source/i18n/colldata.cpp (revision 0)
|
| @@ -0,0 +1,1100 @@
|
| +/*
|
| + ******************************************************************************
|
| + * Copyright (C) 1996-2009, International Business Machines *
|
| + * Corporation and others. All Rights Reserved. *
|
| + ******************************************************************************
|
| + */
|
| +
|
| +#include "unicode/utypes.h"
|
| +
|
| +#if !UCONFIG_NO_COLLATION
|
| +
|
| +#include "unicode/unistr.h"
|
| +#include "unicode/putil.h"
|
| +#include "unicode/usearch.h"
|
| +
|
| +#include "cmemory.h"
|
| +#include "unicode/coll.h"
|
| +#include "unicode/tblcoll.h"
|
| +#include "unicode/coleitr.h"
|
| +#include "unicode/ucoleitr.h"
|
| +
|
| +#include "unicode/regex.h" // TODO: make conditional on regexp being built.
|
| +
|
| +#include "unicode/uniset.h"
|
| +#include "unicode/uset.h"
|
| +#include "unicode/ustring.h"
|
| +#include "hash.h"
|
| +#include "uhash.h"
|
| +#include "ucln_in.h"
|
| +#include "ucol_imp.h"
|
| +#include "umutex.h"
|
| +
|
| +#include "unicode/colldata.h"
|
| +
|
| +U_NAMESPACE_BEGIN
|
| +
|
| +#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) // element count of a true array; meaningless for a pointer
|
| +#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) // raw uprv_malloc storage cast to type *; no constructors run
|
| +#define DELETE_ARRAY(array) uprv_free((void *) (array)) // releases storage through uprv_free; no destructors run
|
| +#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) // bitwise copy of count elements from src to dst
|
| +
|
| +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
|
| +
|
| +#ifdef INSTRUMENT_CELIST // optional statistics, compiled only when INSTRUMENT_CELIST is defined
|
| +int32_t CEList::_active = 0; // incremented in the constructor, decremented in the destructor
|
| +int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // slot 0 counts constructions; add() bumps slot listSize / CELIST_BUFFER_SIZE on each growth
|
| +#endif
|
| +// Collects the collation elements (CEs) of `string` under `coll`, masked to the collator's strength; ignorable CEs are dropped. On failure the list stays empty.
|
| +CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
|
| +    : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
|
| +{
|
| +    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
|
| +    UCollationStrength strength = ucol_getStrength(coll);
|
| +    UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
|
| +    uint32_t variableTop = ucol_getVariableTop(coll, &status);
|
| +    uint32_t strengthMask = 0;
|
| +    int32_t order;
|
| +
|
| +    ces = ceBuffer; // start on the in-object buffer so the list is consistent even if we bail out below
|
| +#ifdef INSTRUMENT_CELIST
|
| +    _active += 1; // counted before the failure check so the destructor's decrement always balances
|
| +    _histogram[0] += 1;
|
| +#endif
|
| +
|
| +    if (U_FAILURE(status)) {
|
| +        if (elems != NULL) { ucol_closeElements(elems); } // the iterator may be open even though a later query failed; don't leak it
|
| +        return;
|
| +    }
|
| +
|
| +    // **** only set flag if string has Han(gul) ****
|
| +    ucol_forceHanImplicit(elems, &status);
|
| +
|
| +    switch (strength)
|
| +    {
|
| +    default:
|
| +        strengthMask |= UCOL_TERTIARYORDERMASK;
|
| +        /* fall through */
|
| +
|
| +    case UCOL_SECONDARY:
|
| +        strengthMask |= UCOL_SECONDARYORDERMASK;
|
| +        /* fall through */
|
| +
|
| +    case UCOL_PRIMARY:
|
| +        strengthMask |= UCOL_PRIMARYORDERMASK;
|
| +    }
|
| +
|
| +    while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
|
| +        UBool cont = isContinuation(order);
|
| +
|
| +        order &= strengthMask;
|
| +
|
| +        if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
|
| +            if (strength >= UCOL_QUATERNARY) {
|
| +                order &= UCOL_PRIMARYORDERMASK;
|
| +            } else {
|
| +                order = UCOL_IGNORABLE;
|
| +            }
|
| +        }
|
| +
|
| +        if (order == UCOL_IGNORABLE) {
|
| +            continue;
|
| +        }
|
| +
|
| +        if (cont) {
|
| +            order |= UCOL_CONTINUATION_MARKER;
|
| +        }
|
| +
|
| +        add(order, status);
|
| +    }
|
| +
|
| +    ucol_closeElements(elems);
|
| +}
|
| +// Destructor: frees the CE array unless it is still the in-object ceBuffer.
|
| +CEList::~CEList()
|
| +{
|
| +#ifdef INSTRUMENT_CELIST
|
| + _active -= 1;
|
| +#endif
|
| +
|
| + if (ces != ceBuffer) {
|
| + DELETE_ARRAY(ces);
|
| + }
|
| +}
|
| +// Appends one CE, growing the backing array by CELIST_BUFFER_SIZE entries when it is full; does nothing if status already reports a failure.
|
| +void CEList::add(uint32_t ce, UErrorCode &status)
|
| +{
|
| +    if (U_FAILURE(status)) {
|
| +        return;
|
| +    }
|
| +
|
| +    if (listSize >= listMax) {
|
| +        int32_t newMax = listMax + CELIST_BUFFER_SIZE;
|
| +
|
| +#ifdef INSTRUMENT_CELIST
|
| +        // _histogram has 10 slots; clamp so that a very long list cannot write past its end.
|
| +        int32_t _h = listSize / CELIST_BUFFER_SIZE;
|
| +        _histogram[_h > 9 ? 9 : _h] += 1;
|
| +#endif
|
| +
|
| +        uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
|
| +        if (newCEs == NULL) {
|
| +            status = U_MEMORY_ALLOCATION_ERROR;
|
| +            return;
|
| +        }
|
| +
|
| +        uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
|
| +        if (ces != ceBuffer) {
|
| +            DELETE_ARRAY(ces);
|
| +        }
|
| +
|
| +        ces = newCEs;
|
| +        listMax = newMax;
|
| +    }
|
| +
|
| +    ces[listSize++] = ce;
|
| +}
|
| +// Bounds-checked read: yields the CE at `index`, or UCOL_NULLORDER when `index` lies outside [0, listSize).
|
| +uint32_t CEList::get(int32_t index) const
|
| +{
|
| +    if (index < 0 || index >= listSize) {
|
| +        return UCOL_NULLORDER;
|
| +    }
|
| +
|
| +    return ces[index];
|
| +}
|
| +// Unchecked element access: no bounds test is made, so the caller must keep index within [0, size()).
|
| +uint32_t &CEList::operator[](int32_t index) const
|
| +{
|
| + return ces[index];
|
| +}
|
| +// Tests whether every CE of `other` occurs in this list starting at `offset`; FALSE for a NULL list or when offset/length do not fit.
|
| +UBool CEList::matchesAt(int32_t offset, const CEList *other) const
|
| +{
|
| +    // A negative offset would satisfy the length test and then index in front of the array, so reject it first.
|
| +    if (other == NULL || offset < 0 || listSize - offset < other->size()) {
|
| +        return FALSE;
|
| +    }
|
| +
|
| +    for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
|
| +        if (ces[i] != (*other)[j]) {
|
| +            return FALSE;
|
| +        }
|
| +    }
|
| +    return TRUE;
|
| +}
|
| +// Returns the number of CEs currently held in the list.
|
| +int32_t CEList::size() const
|
| +{
|
| + return listSize;
|
| +}
|
| +
|
| +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
|
| +
|
| +#ifdef INSTRUMENT_STRING_LIST // optional statistics, compiled only when INSTRUMENT_STRING_LIST is defined
|
| +int32_t StringList::_lists = 0; // incremented once per StringList whose storage was allocated
|
| +int32_t StringList::_strings = 0; // incremented by every add() call that gets past the status check
|
| +int32_t StringList::_histogram[101] = {0}; // growth histogram; add() clamps the slot index to 100
|
| +#endif
|
| +// Creates an empty list with room for STRING_LIST_BUFFER_SIZE strings; sets U_MEMORY_ALLOCATION_ERROR if the array cannot be allocated.
|
| +StringList::StringList(UErrorCode &status)
|
| +    : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
|
| +{
|
| +    // Nothing is allocated when the caller's status already reports a failure.
|
| +    if (U_SUCCESS(status)) {
|
| +        // Every slot is default-constructed, i.e. starts out as an empty string.
|
| +        strings = new UnicodeString [listMax];
|
| +
|
| +        if (strings == NULL) {
|
| +            status = U_MEMORY_ALLOCATION_ERROR;
|
| +        } else {
|
| +            // Statistics count only lists whose storage was actually allocated.
|
| +#ifdef INSTRUMENT_STRING_LIST
|
| +            _lists += 1;
|
| +            _histogram[0] += 1;
|
| +#endif
|
| +        }
|
| +    }
|
| +}
|
| +// Destructor: destroys every UnicodeString slot and frees the array.
|
| +StringList::~StringList()
|
| +{
|
| + delete[] strings;
|
| +}
|
| +// Appends a copy of *string, growing the array by STRING_LIST_BUFFER_SIZE slots when it is full; does nothing if status already reports a failure.
|
| +void StringList::add(const UnicodeString *string, UErrorCode &status)
|
| +{
|
| +    if (U_FAILURE(status)) {
|
| +        return;
|
| +    }
|
| +
|
| +#ifdef INSTRUMENT_STRING_LIST
|
| +    _strings += 1;
|
| +#endif
|
| +
|
| +    if (listSize >= listMax) {
|
| +        int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
|
| +        UnicodeString *newStrings = new UnicodeString[newMax];
|
| +        if (newStrings == NULL) {
|
| +            status = U_MEMORY_ALLOCATION_ERROR;
|
| +            return;
|
| +        }
|
| +
|
| +        // UnicodeString has its own copy semantics and destructor, so the elements must be
|
| +        // copied with operator= (not memcpy) before delete[] destroys the old array below.
|
| +        for (int32_t i = 0; i < listSize; i += 1) {
|
| +            newStrings[i] = strings[i];
|
| +        }
|
| +
|
| +#ifdef INSTRUMENT_STRING_LIST
|
| +        int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
|
| +        _histogram[_h > 100 ? 100 : _h] += 1;
|
| +#endif
|
| +
|
| +        delete[] strings;
|
| +        strings = newStrings;
|
| +        listMax = newMax;
|
| +    }
|
| +
|
| +    // Unused slots are default-constructed empty strings, so append() here is the same as copying *string.
|
| +    strings[listSize++].append(*string);
|
| +}
|
| +// Convenience overload: wraps `count` UChars starting at `chars` in a temporary UnicodeString and adds a copy of it.
|
| +void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
|
| +{
|
| + const UnicodeString string(chars, count);
|
| +
|
| + add(&string, status);
|
| +}
|
| +// Bounds-checked access: pointer to the string at `index`, or NULL when `index` lies outside [0, listSize).
|
| +const UnicodeString *StringList::get(int32_t index) const
|
| +{
|
| +    if (index < 0 || index >= listSize) {
|
| +        return NULL;
|
| +    }
|
| +
|
| +    return strings + index;
|
| +}
|
| +// Returns the number of strings currently held in the list.
|
| +int32_t StringList::size() const
|
| +{
|
| + return listSize;
|
| +}
|
| +
|
| +
|
| +U_CFUNC void deleteStringList(void *obj); // value deleter for the table below; defined after the class
|
| +// Maps a 32-bit CE to the StringList recorded for it, backed by a UHashtable with integer keys.
|
| +class CEToStringsMap : public UMemory
|
| +{
|
| +public:
|
| +
|
| + CEToStringsMap(UErrorCode &status);
|
| + ~CEToStringsMap();
|
| +
|
| + void put(uint32_t ce, UnicodeString *string, UErrorCode &status); // adds a copy of *string to ce's list, creating the list on first use
|
| + StringList *getStringList(uint32_t ce) const; // put() treats a NULL result as "no list stored for ce yet"
|
| +
|
| +private:
|
| +
|
| + void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
|
| + UHashtable *map; // deleteStringList is installed as its value deleter by the constructor
|
| +};
|
| +// Opens the integer-keyed hash table behind the map and installs deleteStringList as its value deleter.
|
| +CEToStringsMap::CEToStringsMap(UErrorCode &status)
|
| +    : map(NULL)
|
| +{
|
| +    // map stays NULL when the caller's status already reports a failure.
|
| +    if (U_SUCCESS(status)) {
|
| +        // NOTE(review): the third argument is a UnicodeString comparator although the values
|
| +        // stored are StringList pointers -- confirm that this comparator is never invoked.
|
| +        map = uhash_open(uhash_hashLong, uhash_compareLong,
|
| +                         uhash_compareCaselessUnicodeString,
|
| +                         &status);
|
| +    }
|
| +
|
| +    // The deleter is installed only on a table that was actually opened.
|
| +    if (U_SUCCESS(status)) {
|
| +        uhash_setValueDeleter(map, deleteStringList);
|
| +    }
|
| +}
|
| +// Destructor: closes the hash table (the value deleter installed by the constructor is deleteStringList).
|
| +CEToStringsMap::~CEToStringsMap()
|
| +{
|
| + uhash_close(map);
|
| +}
|
| +// Adds a copy of *string to the list stored for `ce`, creating and registering that list on first use.
|
| +void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
|
| +{
|
| +    StringList *strings = getStringList(ce);
|
| +
|
| +    if (strings == NULL) {
|
| +        strings = new StringList(status);
|
| +        if (strings == NULL || U_FAILURE(status)) {
|
| +            delete strings; // not in the table yet, so nothing else would ever free a half-built list
|
| +            status = U_MEMORY_ALLOCATION_ERROR;
|
| +            return;
|
| +        }
|
| +        putStringList(ce, strings, status);
|
| +    }
|
| +
|
| +    // After a failed insert the list must not be touched: uhash_iput is expected to have run the value deleter on it (confirm against uhash.c).
|
| +    if (U_SUCCESS(status)) { strings->add(string, status); }
|
| +}
|
| +// Looks up the list stored under `ce` (used as an integer key); put() treats a NULL result as "not present".
|
| +StringList *CEToStringsMap::getStringList(uint32_t ce) const
|
| +{
|
| + return (StringList *) uhash_iget(map, ce);
|
| +}
|
| +// Stores stringList under the integer key `ce`; any error is reported through status.
|
| +void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
|
| +{
|
| + uhash_iput(map, ce, (void *) stringList, &status);
|
| +}
|
| +// Hash-table value deleter: destroys a StringList that was stored as an untyped pointer.
|
| +U_CFUNC void deleteStringList(void *obj)
|
| +{
|
| +    // The table hands the value back as void *; restore the type so that
|
| +    // the StringList destructor runs.
|
| +    delete (StringList *) obj;
|
| +}
|
| +
|
| +U_CFUNC void deleteCEList(void *obj); // value deleter for the table below; defined after the class
|
| +U_CFUNC void deleteUnicodeStringKey(void *obj); // key deleter for the table below; defined after the class
|
| +// Maps a UnicodeString to its CEList, backed by a UHashtable; only used by CollData when CACHE_CELISTS is defined.
|
| +class StringToCEsMap : public UMemory
|
| +{
|
| +public:
|
| + StringToCEsMap(UErrorCode &status);
|
| + ~StringToCEsMap();
|
| +
|
| + void put(const UnicodeString *string, const CEList *ces, UErrorCode &status); // both pointers are handed to the table, which has deleters for keys and values
|
| + const CEList *get(const UnicodeString *string);
|
| + void free(const CEList *list); // NOTE(review): no definition of free() is visible in this file -- confirm it is defined elsewhere or unused
|
| +
|
| +private:
|
| +
|
| +
|
| + UHashtable *map; // deleteCEList / deleteUnicodeStringKey are installed as its deleters by the constructor
|
| +};
|
| +// Opens the UnicodeString-keyed hash table behind the map and installs deleters for both its keys and its values.
|
| +StringToCEsMap::StringToCEsMap(UErrorCode &status)
|
| +    : map(NULL)
|
| +{
|
| +    // map stays NULL when the caller's status already reports a failure.
|
| +    if (U_SUCCESS(status)) {
|
| +        map = uhash_open(uhash_hashUnicodeString,
|
| +                         uhash_compareUnicodeString,
|
| +                         uhash_compareLong,
|
| +                         &status);
|
| +    }
|
| +
|
| +    // Deleters are installed only on a table that was actually opened:
|
| +    // values are released through deleteCEList and keys through
|
| +    // deleteUnicodeStringKey.
|
| +    if (U_SUCCESS(status)) {
|
| +        uhash_setValueDeleter(map, deleteCEList);
|
| +        uhash_setKeyDeleter(map, deleteUnicodeStringKey);
|
| +    }
|
| +}
|
| +// Destructor: closes the hash table (key and value deleters were installed by the constructor).
|
| +StringToCEsMap::~StringToCEsMap()
|
| +{
|
| + uhash_close(map);
|
| +}
|
| +// Stores `ces` under the key `string`; both pointers are passed to the table with their constness cast away.
|
| +void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
|
| +{
|
| + uhash_put(map, (void *) string, (void *) ces, &status);
|
| +}
|
| +// Returns whatever the table holds for `string`, cast to a CEList pointer.
|
| +const CEList *StringToCEsMap::get(const UnicodeString *string)
|
| +{
|
| + return (const CEList *) uhash_get(map, string);
|
| +}
|
| +// Hash-table value deleter: destroys a CEList that was stored as an untyped pointer.
|
| +U_CFUNC void deleteCEList(void *obj)
|
| +{
|
| +    // The table hands the value back as void *; restore the type so that
|
| +    // the CEList destructor runs.
|
| +    delete (CEList *) obj;
|
| +}
|
| +// Hash-table key deleter: destroys a UnicodeString key that was stored as an untyped pointer.
|
| +U_CFUNC void deleteUnicodeStringKey(void *obj)
|
| +{
|
| +    // The table hands the key back as void *; restore the type so that
|
| +    // the UnicodeString destructor runs.
|
| +    delete (UnicodeString *) obj;
|
| +}
|
| +// One slot of the CollData cache: a CollData pointer plus the number of outstanding references to it.
|
| +class CollDataCacheEntry : public UMemory
|
| +{
|
| +public:
|
| + CollDataCacheEntry(CollData *theData);
|
| + ~CollDataCacheEntry();
|
| +
|
| + CollData *data; // deleted by the destructor, i.e. owned by this entry
|
| + int32_t refCount; // starts at 1; changed by CollDataCache::get() and unref()
|
| +};
|
| +// Wraps theData in a new entry whose reference count starts at 1.
|
| +CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
|
| + : data(theData), refCount(1)
|
| +{
|
| + // nothing else to do
|
| +}
|
| +// Destructor: deletes the wrapped CollData regardless of refCount, so the data must not be deleted separately.
|
| +CollDataCacheEntry::~CollDataCacheEntry()
|
| +{
|
| + // check refCount?
|
| + delete data;
|
| +}
|
| +// Cache of CollData objects keyed by the collator's short definition string; the table is accessed under `lock`.
|
| +class CollDataCache : public UMemory
|
| +{
|
| +public:
|
| + CollDataCache(UErrorCode &status);
|
| + ~CollDataCache();
|
| +
|
| + CollData *get(UCollator *collator, UErrorCode &status); // finds or builds the CollData for collator and counts a reference
|
| + void unref(CollData *collData); // drops one reference; the entry stays cached until flush()
|
| +
|
| + void flush(); // removes every entry whose refCount is <= 0
|
| +
|
| +private:
|
| + static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength); // may return a heap buffer instead of keyBuffer
|
| + static void deleteKey(char *key); // frees a heap buffer returned by getKey
|
| +
|
| + UMTX lock; // held around every use of `cache` in the member functions
|
| + UHashtable *cache; // key: char * owned by the CollData; value: CollDataCacheEntry *
|
| +};
|
| +// Key deleter registered on the cache table; deliberately does nothing (see the comment in the body).
|
| +U_CFUNC void deleteChars(void * /*obj*/)
|
| +{
|
| + // char *chars = (char *) obj;
|
| + // All the key strings are owned by the
|
| + // CollData objects and don't need to
|
| + // be freed here.
|
| + //DELETE_ARRAY(chars);
|
| +}
|
| +// Hash-table value deleter: destroys a CollDataCacheEntry that was stored as an untyped pointer.
|
| +U_CFUNC void deleteCollDataCacheEntry(void *obj)
|
| +{
|
| +    // The table hands the value back as void *; restore the type so that
|
| +    // the entry's destructor (which deletes its CollData) runs.
|
| +    delete (CollDataCacheEntry *) obj;
|
| +}
|
| +// Creates the empty cache table, keyed by C strings, and installs its value and key deleters.
|
| +CollDataCache::CollDataCache(UErrorCode &status)
|
| +    : lock(0), cache(NULL)
|
| +{
|
| +    // cache stays NULL when the caller's status already reports a failure.
|
| +    if (U_SUCCESS(status)) {
|
| +        cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
|
| +    }
|
| +
|
| +    // Deleters are installed only on a table that was actually opened.
|
| +    // Entries are destroyed through deleteCollDataCacheEntry; deleteChars
|
| +    // is a no-op because each key string belongs to its CollData.
|
| +    if (U_SUCCESS(status)) {
|
| +        uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
|
| +        uhash_setKeyDeleter(cache, deleteChars);
|
| +    }
|
| +}
|
| +// Destructor: closes the table while holding the lock, then destroys the lock itself.
|
| +CollDataCache::~CollDataCache()
|
| +{
|
| + umtx_lock(&lock);
|
| + uhash_close(cache);
|
| + cache = NULL;
|
| + umtx_unlock(&lock);
|
| +
|
| + umtx_destroy(&lock);
|
| +}
|
| +// Returns the cached CollData for `collator`, building and caching one on first use; each successful call accounts for one reference (see unref()). Returns NULL with status set on failure.
|
| +CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
|
| +{
|
| +    char keyBuffer[KEY_BUFFER_SIZE];
|
| +    int32_t keyLength = KEY_BUFFER_SIZE;
|
| +    char *key = getKey(collator, keyBuffer, &keyLength);
|
| +    CollData *result = NULL, *newData = NULL;
|
| +    CollDataCacheEntry *entry = NULL, *newEntry = NULL;
|
| +
|
| +    umtx_lock(&lock);
|
| +    entry = (CollDataCacheEntry *) uhash_get(cache, key);
|
| +
|
| +    if (entry == NULL) {
|
| +        // Build the new CollData without holding the lock; another thread may insert the same key meanwhile.
|
| +        umtx_unlock(&lock);
|
| +
|
| +        newData = new CollData(collator, key, keyLength, status);
|
| +        newEntry = new CollDataCacheEntry(newData);
|
| +
|
| +        if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
|
| +            if (U_SUCCESS(status)) { status = U_MEMORY_ALLOCATION_ERROR; } // keep a more specific error from the CollData constructor
|
| +            // An entry deletes its CollData in its destructor, so exactly one of the two may be deleted here.
|
| +            if (newEntry != NULL) { delete newEntry; } else { delete newData; }
|
| +            if (key != keyBuffer) { deleteKey(key); }
|
| +            return NULL;
|
| +        }
|
| +
|
| +        umtx_lock(&lock);
|
| +        entry = (CollDataCacheEntry *) uhash_get(cache, key);
|
| +
|
| +        if (entry == NULL) {
|
| +            uhash_put(cache, newData->key, newEntry, &status);
|
| +            umtx_unlock(&lock);
|
| +
|
| +            if (key != keyBuffer) { deleteKey(key); } // the table is keyed by newData's own copy of the key
|
| +
|
| +            // If uhash_put fails it is expected to have run the value deleter on newEntry, which also
|
| +            // deletes newData -- so neither may be deleted again here (confirm against uhash.c).
|
| +            return U_SUCCESS(status) ? newData : NULL;
|
| +        }
|
| +    }
|
| +
|
| +    result = entry->data;
|
| +    entry->refCount += 1;
|
| +    umtx_unlock(&lock);
|
| +
|
| +    if (key != keyBuffer) {
|
| +        deleteKey(key);
|
| +    }
|
| +
|
| +    // Non-NULL only when another thread won the race: discard our unused copy.
|
| +    // Deleting the entry also deletes newData, so newData must not be deleted separately.
|
| +    delete newEntry;
|
| +
|
| +    return result;
|
| +}
|
| +// Drops one reference from collData's cache entry; the entry itself is only removed later by flush().
|
| +void CollDataCache::unref(CollData *collData)
|
| +{
|
| +    umtx_lock(&lock);
|
| +
|
| +    CollDataCacheEntry *cached = (CollDataCacheEntry *) uhash_get(cache, collData->key);
|
| +
|
| +    // A CollData whose key is not in the table is silently ignored.
|
| +    if (cached != NULL) {
|
| +        cached->refCount -= 1;
|
| +    }
|
| +    umtx_unlock(&lock);
|
| +}
|
| +// Writes the collator's short definition string into keyBuffer and returns it; if it does not fit, returns a NEW_ARRAY buffer instead (free it with deleteKey) and updates *keyBufferLength.
|
| +char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
|
| +{
|
| + UErrorCode status = U_ZERO_ERROR; // local only: errors from the calls below are not reported to the caller
|
| + int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
|
| +
|
| + if (len >= *keyBufferLength) {
|
| + *keyBufferLength = (len + 2) & ~1; // round to even length, leaving room for terminating null
|
| + keyBuffer = NEW_ARRAY(char, *keyBufferLength); // NOTE(review): the result is not checked for NULL before it is written through below
|
| + status = U_ZERO_ERROR;
|
| +
|
| + len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
|
| + }
|
| +
|
| + keyBuffer[len] = '\0';
|
| +
|
| + return keyBuffer;
|
| +}
|
| +// Removes every cache entry whose reference count has dropped to zero or below.
|
| +void CollDataCache::flush()
|
| +{
|
| +    int32_t cursor = -1;
|
| +    umtx_lock(&lock);
|
| +    for (;;) {
|
| +        const UHashElement *slot = uhash_nextElement(cache, &cursor);
|
| +        if (slot == NULL) {
|
| +            break;
|
| +        }
|
| +        if (((CollDataCacheEntry *) slot->value.pointer)->refCount <= 0) {
|
| +            uhash_removeElement(cache, slot);
|
| +        }
|
| +    }
|
| +    umtx_unlock(&lock);
|
| +}
|
| +// Frees a key buffer that getKey() allocated with NEW_ARRAY.
|
| +void CollDataCache::deleteKey(char *key)
|
| +{
|
| + DELETE_ARRAY(key);
|
| +}
|
| +// Cleanup callback registered by getCollDataCache() via ucln_i18n_registerCleanup: frees the global cache and reports TRUE.
|
| +U_CDECL_BEGIN
|
| +static UBool coll_data_cleanup(void) {
|
| + CollData::freeCollDataCache();
|
| + return TRUE;
|
| +}
|
| +U_CDECL_END
|
| +
|
| +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
|
| +// Default constructor: has no initializer list and an empty body, so it sets none of the members explicitly.
|
| +CollData::CollData()
|
| +{
|
| + // nothing
|
| +}
|
| +
|
| +#define CLONE_COLLATOR // when defined, CollData works on its own ucol_safeClone() of the collator and closes it in the destructor
|
| +
|
| +//#define CACHE_CELISTS
|
| +CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
|
| + : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
|
| +{ // Builds ceToCharsStartingWith for the collator, then derives jamoLimits and minHan/maxHan; on failure status is set and the object is only partly initialized.
|
| + // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
|
| + // i.e. other, control, private use, format, surrogate
|
| + U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
|
| + U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
|
| + USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
|
| +
|
| + // Han ext. A, Han, Jamo, Hangul, Han Ext. B
|
| + // i.e. all the characters we handle implicitly
|
| + U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
|
| + U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
|
| + USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
|
| +
|
| + if (U_FAILURE(status)) {
|
| + return; // NOTE(review): returns without uset_close() on charsToTest / charsToRemove, one of which may have been opened
|
| + }
|
| +
|
| + USet *expansions = uset_openEmpty();
|
| + USet *contractions = uset_openEmpty();
|
| + int32_t itemCount;
|
| +
|
| +#ifdef CACHE_CELISTS
|
| + charsToCEList = new StringToCEsMap(status);
|
| +
|
| + if (U_FAILURE(status)) {
|
| + goto bail;
|
| + }
|
| +#else
|
| + charsToCEList = NULL;
|
| +#endif
|
| +
|
| + ceToCharsStartingWith = new CEToStringsMap(status); // NOTE(review): the pointer itself is not checked for NULL, only status
|
| +
|
| + if (U_FAILURE(status)) {
|
| + goto bail;
|
| + }
|
| +
|
| + if (cacheKeyLength > KEY_BUFFER_SIZE) {
|
| + key = NEW_ARRAY(char, cacheKeyLength);
|
| +
|
| + if (key == NULL) {
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| + goto bail;
|
| + }
|
| + } else {
|
| + key = keyBuffer;
|
| + }
|
| +
|
| + ARRAY_COPY(key, cacheKey, cacheKeyLength); // private copy of the key, so this object does not depend on the caller's buffer
|
| +
|
| +#ifdef CLONE_COLLATOR
|
| + coll = ucol_safeClone(collator, NULL, NULL, &status);
|
| +
|
| + if (U_FAILURE(status)) {
|
| + goto bail;
|
| + }
|
| +#else
|
| + coll = collator;
|
| +#endif
|
| +
|
| + ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
|
| +
|
| + uset_addAll(charsToTest, contractions);
|
| + uset_addAll(charsToTest, expansions);
|
| + uset_removeAll(charsToTest, charsToRemove);
|
| +
|
| + itemCount = uset_getItemCount(charsToTest);
|
| + for(int32_t item = 0; item < itemCount; item += 1) {
|
| + UChar32 start = 0, end = 0;
|
| + UChar buffer[16];
|
| + int32_t len = uset_getItem(charsToTest, item, &start, &end,
|
| + buffer, 16, &status);
|
| +
|
| + if (len == 0) { // handled as a code point range: every ch in [start, end] is processed
|
| + for (UChar32 ch = start; ch <= end; ch += 1) {
|
| + UnicodeString *st = new UnicodeString(ch);
|
| +
|
| + if (st == NULL) {
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| + break;
|
| + }
|
| +
|
| + CEList *ceList = new CEList(coll, *st, status); // NOTE(review): not checked for NULL before ceList->get(0) below
|
| +
|
| + ceToCharsStartingWith->put(ceList->get(0), st, status); // file the string under its first CE
|
| +
|
| +#ifdef CACHE_CELISTS
|
| + charsToCEList->put(st, ceList, status);
|
| +#else
|
| + delete ceList;
|
| + delete st;
|
| +#endif
|
| + }
|
| + } else if (len > 0) { // handled as a string item: len UChars were written to buffer
|
| + UnicodeString *st = new UnicodeString(buffer, len);
|
| +
|
| + if (st == NULL) {
|
| + status = U_MEMORY_ALLOCATION_ERROR;
|
| + break;
|
| + }
|
| +
|
| + CEList *ceList = new CEList(coll, *st, status); // NOTE(review): not checked for NULL before ceList->get(0) below
|
| +
|
| + ceToCharsStartingWith->put(ceList->get(0), st, status); // file the string under its first CE
|
| +
|
| +#ifdef CACHE_CELISTS
|
| + charsToCEList->put(st, ceList, status);
|
| +#else
|
| + delete ceList;
|
| + delete st;
|
| +#endif
|
| + } else {
|
| + // shouldn't happen...
|
| + }
|
| +
|
| + if (U_FAILURE(status)) {
|
| + break;
|
| + }
|
| + }
|
| +
|
| +bail: // the four temporary sets are closed here on the success path and after every goto bail
|
| + uset_close(contractions);
|
| + uset_close(expansions);
|
| + uset_close(charsToRemove);
|
| + uset_close(charsToTest);
|
| +
|
| + if (U_FAILURE(status)) {
|
| + return;
|
| + }
|
| +
|
| + UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
|
| + UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
|
| + UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
|
| + UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
|
| + UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
|
| + CEList hanList(coll, hanString, status);
|
| + CEList jamoList(coll, jamoString, status);
|
| + int32_t j = 0;
|
| +
|
| + if (U_FAILURE(status)) {
|
| + return;
|
| + }
|
| +
|
| + for (int32_t c = 0; c < jamoList.size(); c += 1) {
|
| + uint32_t jce = jamoList[c];
|
| +
|
| + if (! isContinuation(jce)) {
|
| + jamoLimits[j++] = jce; // NOTE(review): nothing here bounds j -- confirm jamoLimits can hold every non-continuation CE
|
| + }
|
| + }
|
| +
|
| + jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); // NOTE(review): presumably makes the last limit an exclusive upper bound -- confirm
|
| +
|
| + minHan = 0xFFFFFFFF;
|
| + maxHan = 0;
|
| +
|
| + for(int32_t h = 0; h < hanList.size(); h += 2) { // steps by 2: assumes each Han range endpoint yields two CEs -- TODO confirm
|
| + uint32_t han = (uint32_t) hanList[h];
|
| +
|
| + if (han < minHan) {
|
| + minHan = han;
|
| + }
|
| +
|
| + if (han > maxHan) {
|
| + maxHan = han;
|
| + }
|
| + }
|
| +
|
| + maxHan += (1 << UCOL_PRIMARYORDERSHIFT); // minLengthInChars() tests ce < maxHan, i.e. uses this as an exclusive bound
|
| +}
|
| +// Destructor: closes the cloned collator, frees a heap-allocated key, and deletes the lookup table(s).
|
| +CollData::~CollData()
|
| +{
|
| +#ifdef CLONE_COLLATOR
|
| + ucol_close(coll);
|
| +#endif
|
| +
|
| + if (key != keyBuffer) {
|
| + DELETE_ARRAY(key);
|
| + }
|
| +
|
| + delete ceToCharsStartingWith;
|
| +
|
| +#ifdef CACHE_CELISTS
|
| + delete charsToCEList;
|
| +#endif
|
| +}
|
| +// Returns the collator stored in this object (a clone of the constructor argument when CLONE_COLLATOR is defined).
|
| +UCollator *CollData::getCollator() const
|
| +{
|
| + return coll;
|
| +}
|
| +// Returns the list of strings that the constructor filed under `ce` as their first CE.
|
| +const StringList *CollData::getStringList(int32_t ce) const
|
| +{
|
| + return ceToCharsStartingWith->getStringList(ce);
|
| +}
|
| +// Returns the CEList for *string: looked up in the cache with CACHE_CELISTS, otherwise freshly built (NULL if building failed); hand the result back via freeCEList().
|
| +const CEList *CollData::getCEList(const UnicodeString *string) const
|
| +{
|
| +#ifdef CACHE_CELISTS
|
| +    return charsToCEList->get(string);
|
| +#else
|
| +    UErrorCode status = U_ZERO_ERROR;
|
| +    const CEList *fresh = new CEList(coll, *string, status);
|
| +
|
| +    if (U_SUCCESS(status)) {
|
| +        return fresh;
|
| +    }
|
| +
|
| +    delete fresh;
|
| +    return NULL;
|
| +#endif
|
| +}
|
| +// Counterpart of getCEList(): deletes the list unless CACHE_CELISTS is defined, in which case it does nothing.
|
| +void CollData::freeCEList(const CEList *list)
|
| +{
|
| +#ifndef CACHE_CELISTS
|
| + delete list;
|
| +#endif
|
| +}
|
| +// Recursive worker: minimum number of UChars that can produce ceList's CEs from `offset` to the end, memoized in history (negative = not computed yet); returns -1 when no decomposition is found.
|
| +int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
|
| +{
|
| + // find out shortest string for the longest sequence of ces.
|
| + // this can probably be folded with the minLengthCache...
|
| +
|
| + if (history[offset] >= 0) { // already computed for this offset
|
| + return history[offset];
|
| + }
|
| +
|
| + uint32_t ce = ceList->get(offset);
|
| + int32_t maxOffset = ceList->size();
|
| + int32_t shortestLength = INT32_MAX; // INT32_MAX doubles as the "nothing matched yet" marker tested below
|
| + const StringList *strings = ceToCharsStartingWith->getStringList(ce);
|
| +
|
| + if (strings != NULL) {
|
| + int32_t stringCount = strings->size();
|
| +
|
| + for (int32_t s = 0; s < stringCount; s += 1) {
|
| + const UnicodeString *string = strings->get(s);
|
| +#ifdef CACHE_CELISTS
|
| + const CEList *ceList2 = charsToCEList->get(string);
|
| +#else
|
| + UErrorCode status = U_ZERO_ERROR;
|
| + const CEList *ceList2 = new CEList(coll, *string, status);
|
| +
|
| + if (U_FAILURE(status)) {
|
| + delete ceList2;
|
| + ceList2 = NULL;
|
| + }
|
| +#endif
|
| +
|
| + if (ceList->matchesAt(offset, ceList2)) { // matchesAt() returns FALSE for a NULL ceList2, so a failed CEList is simply skipped
|
| + int32_t clength = ceList2->size();
|
| + int32_t slength = string->length();
|
| + int32_t roffset = offset + clength;
|
| + int32_t rlength = 0;
|
| +
|
| + if (roffset < maxOffset) {
|
| + rlength = minLengthInChars(ceList, roffset, history);
|
| +
|
| + if (rlength <= 0) {
|
| + // delete before continue to avoid memory leak.
|
| +#ifndef CACHE_CELISTS
|
| + delete ceList2;
|
| +#endif
|
| + // ignore any dead ends
|
| + continue;
|
| + }
|
| + }
|
| +
|
| + if (shortestLength > slength + rlength) {
|
| + shortestLength = slength + rlength;
|
| + }
|
| + }
|
| +
|
| +#ifndef CACHE_CELISTS
|
| + delete ceList2;
|
| +#endif
|
| + }
|
| + }
|
| +
|
| + if (shortestLength == INT32_MAX) {
|
| + // No matching strings at this offset. See if
|
| + // the CE is in a range we can handle manually.
|
| + if (ce >= minHan && ce < maxHan) {
|
| + // all han have implicit orders which
|
| + // generate two CEs.
|
| + int32_t roffset = offset + 2;
|
| + int32_t rlength = 0;
|
| +
|
| + //history[roffset++] = -1;
|
| + //history[roffset++] = 1;
|
| +
|
| + if (roffset < maxOffset) {
|
| + rlength = minLengthInChars(ceList, roffset, history);
|
| + }
|
| +
|
| + if (rlength < 0) {
|
| + return -1;
|
| + }
|
| +
|
| + shortestLength = 1 + rlength; // one character for the Han itself plus the remainder
|
| + goto have_shortest;
|
| + } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
|
| + int32_t roffset = offset;
|
| + int32_t rlength = 0;
|
| +
|
| + // **** this loop may not handle archaic Hangul correctly ****
|
| + for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
|
| + uint32_t jce = ceList->get(roffset);
|
| +
|
| + // Some Jamo have 24-bit primary order; skip the
|
| + // 2nd CE. This should always be OK because if
|
| + // we're still in the loop all we've seen are
|
| + // a series of Jamo in LVT order.
|
| + if (isContinuation(jce)) {
|
| + continue;
|
| + }
|
| +
|
| + if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
|
| + break;
|
| + }
|
| + }
|
| +
|
| + if (roffset == offset) {
|
| + // we started with a non-L Jamo...
|
| + // just say it comes from a single character
|
| + roffset += 1;
|
| +
|
| + // See if the single Jamo has a 24-bit order.
|
| + if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
|
| + roffset += 1;
|
| + }
|
| + }
|
| +
|
| + if (roffset < maxOffset) {
|
| + rlength = minLengthInChars(ceList, roffset, history);
|
| + }
|
| +
|
| + if (rlength < 0) {
|
| + return -1;
|
| + }
|
| +
|
| + shortestLength = 1 + rlength; // the whole Jamo run is counted as a single character
|
| + goto have_shortest;
|
| + }
|
| +
|
| + // Can't handle it manually either. Just move on.
|
| + return -1; // note: like the other -1 returns, this result is not recorded in history
|
| + }
|
| +
|
| +have_shortest:
|
| + history[offset] = shortestLength;
|
| +
|
| + return shortestLength;
|
| +}
|
| +// Computes the minimum number of UChars that can produce the CEs of ceList from `offset` on; returns -1 when that cannot be determined.
|
| +int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
|
| +{
|
| +    int32_t clength = ceList->size();
|
| +    int32_t *history = (offset >= 0 && offset < clength) ? NEW_ARRAY(int32_t, clength) : NULL;
|
| +
|
| +    // An out-of-range offset would index outside history, and without the array nothing can be memoized.
|
| +    if (history == NULL) {
|
| +        return -1;
|
| +    }
|
| +
|
| +    for (int32_t i = 0; i < clength; i += 1) { history[i] = -1; } // -1 marks "not computed yet"
|
| +    int32_t minLength = minLengthInChars(ceList, offset, history);
|
| +    DELETE_ARRAY(history);
|
| +    return minLength;
|
| +}
|
| +// Returns the shared CollData for `collator` from the global cache (release it with close()); NULL with status set on failure.
|
| +CollData *CollData::open(UCollator *collator, UErrorCode &status)
|
| +{
|
| +    CollDataCache *cache = U_FAILURE(status) ? NULL : getCollDataCache();
|
| +    if (cache == NULL) {
|
| +        // getCollDataCache() returns NULL when the global cache could not be created.
|
| +        if (U_SUCCESS(status)) { status = U_MEMORY_ALLOCATION_ERROR; }
|
| +        return NULL;
|
| +    }
|
| +    return cache->get(collator, status);
|
| +}
|
| +// Releases a reference obtained from open(); a NULL collData is ignored.
|
| +void CollData::close(CollData *collData)
|
| +{
|
| +    CollDataCache *cache = (collData != NULL) ? getCollDataCache() : NULL;
|
| +    // getCollDataCache() returns NULL if the cache could not be created; there is nothing to release then.
|
| +    if (cache != NULL) { cache->unref(collData); }
|
| +}
|
| +
|
| +CollDataCache *CollData::collDataCache = NULL; // the process-wide cache; created on demand below, released by freeCollDataCache()
|
| +// Returns the global cache, creating it and registering coll_data_cleanup on first use; NULL if creation fails.
|
| +CollDataCache *CollData::getCollDataCache()
|
| +{
|
| + UErrorCode status = U_ZERO_ERROR; // local only: the error code is not reported to the caller
|
| + CollDataCache *cache = NULL;
|
| +
|
| + UMTX_CHECK(NULL, collDataCache, cache);
|
| +
|
| + if (cache == NULL) {
|
| + cache = new CollDataCache(status); // built before taking the global mutex; discarded below if another thread got there first
|
| +
|
| + if (U_FAILURE(status)) {
|
| + delete cache;
|
| + return NULL;
|
| + }
|
| +
|
| + umtx_lock(NULL);
|
| + if (collDataCache == NULL) {
|
| + collDataCache = cache;
|
| +
|
| + ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
|
| + }
|
| + umtx_unlock(NULL);
|
| +
|
| + if (collDataCache != cache) { // NOTE(review): collDataCache is read here after the mutex was released -- confirm this is acceptable
|
| + delete cache;
|
| + }
|
| + }
|
| +
|
| + return collDataCache;
|
| +}
|
| +// Detaches the global cache pointer under the global mutex and deletes the cache; called from coll_data_cleanup().
|
| +void CollData::freeCollDataCache()
|
| +{
|
| + CollDataCache *cache = NULL;
|
| +
|
| + UMTX_CHECK(NULL, collDataCache, cache);
|
| +
|
| + if (cache != NULL) {
|
| + umtx_lock(NULL);
|
| + if (collDataCache != NULL) {
|
| + collDataCache = NULL;
|
| + } else {
|
| + cache = NULL; // the global was cleared in the meantime, so there is nothing for us to delete
|
| + }
|
| + umtx_unlock(NULL);
|
| +
|
| + delete cache;
|
| + }
|
| +}
|
| +// Removes unreferenced entries from the global cache, if one exists (see CollDataCache::flush()).
|
| +void CollData::flushCollDataCache()
|
| +{
|
| + CollDataCache *cache = NULL;
|
| +
|
| + UMTX_CHECK(NULL, collDataCache, cache);
|
| +
|
| + // **** this will fail if another thread ****
|
| + // **** deletes the cache at this point ****
|
| + if (cache != NULL) {
|
| + cache->flush();
|
| + }
|
| +}
|
| +
|
| +U_NAMESPACE_END
|
| +
|
| +#endif // #if !UCONFIG_NO_COLLATION
|
|
|
| Property changes on: icu46/source/i18n/colldata.cpp
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|