Index: icu46/source/i18n/csdetect.cpp |
=================================================================== |
--- icu46/source/i18n/csdetect.cpp (revision 0) |
+++ icu46/source/i18n/csdetect.cpp (revision 0) |
@@ -0,0 +1,414 @@ |
+/* |
+ ********************************************************************** |
+ * Copyright (C) 2005-2009, International Business Machines |
+ * Corporation and others. All Rights Reserved. |
+ ********************************************************************** |
+ */ |
+ |
+#include "unicode/utypes.h" |
+ |
+#if !UCONFIG_NO_CONVERSION |
+ |
+#include "unicode/ucsdet.h" |
+ |
+#include "csdetect.h" |
+#include "csmatch.h" |
+#include "uenumimp.h" |
+ |
+#include "cmemory.h" |
+#include "cstring.h" |
+#include "umutex.h" |
+#include "ucln_in.h" |
+#include "uarrsort.h" |
+#include "inputext.h" |
+#include "csrsbcs.h" |
+#include "csrmbcs.h" |
+#include "csrutf8.h" |
+#include "csrucode.h" |
+#include "csr2022.h" |
+ |
+/* Element count of a true array (must not be used on a pointer). |
+ * The argument is parenthesised so that expressions such as *p expand correctly. */ |
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0])) |
+ |
+/* Allocate / release an uninitialised array of `count` objects of `type` |
+ * through uprv_malloc / uprv_free. NEW_ARRAY yields NULL when uprv_malloc does. |
+ * The whole expansion is parenthesised so it can be used inside larger expressions. */ |
+#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type))) |
+#define DELETE_ARRAY(array) uprv_free((void *) (array)) |
+ |
+U_CDECL_BEGIN |
+/* Process-wide table of charset recognizers. It is installed once by |
+ * CharsetDetector::setRecognizers() and destroyed by csdet_cleanup(). |
+ * NULL means "not yet initialised". */ |
+static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL; |
+ |
+/* Number of entries in fCSRecognizers; 0 while the table is not installed. */ |
+static int32_t fCSRecognizers_size = 0; |
+ |
+/* |
+ * Cleanup callback registered with ucln_i18n_registerCleanup(). |
+ * Destroys every recognizer stored in fCSRecognizers, frees the table |
+ * itself and resets both globals to their initial state. |
+ * Always returns TRUE. |
+ */ |
+static UBool U_CALLCONV csdet_cleanup(void) |
+{ |
+    if (fCSRecognizers == NULL) { |
+        // No table installed: nothing to release. |
+        return TRUE; |
+    } |
+ |
+    for (int32_t idx = 0; idx < fCSRecognizers_size; ++idx) { |
+        delete fCSRecognizers[idx]; |
+        fCSRecognizers[idx] = NULL; |
+    } |
+ |
+    DELETE_ARRAY(fCSRecognizers); |
+    fCSRecognizers = NULL; |
+    fCSRecognizers_size = 0; |
+ |
+    return TRUE; |
+} |
+ |
+/* |
+ * Comparator passed to uprv_sortArray() for an array of CharsetMatch pointers. |
+ * `left` and `right` each point at one array element (a CharsetMatch *). |
+ * The operands are deliberately swapped in the subtraction so that the |
+ * array ends up ordered from highest to lowest confidence. |
+ */ |
+static int32_t U_CALLCONV |
+charsetMatchComparator(const void * /*context*/, const void *left, const void *right) |
+{ |
+    U_NAMESPACE_USE |
+ |
+    const CharsetMatch *lhs = *(const CharsetMatch * const *) left; |
+    const CharsetMatch *rhs = *(const CharsetMatch * const *) right; |
+ |
+    // Reversed on purpose: a larger confidence must sort first. |
+    return rhs->getConfidence() - lhs->getConfidence(); |
+} |
+ |
+U_CDECL_END |
+ |
+U_NAMESPACE_BEGIN |
+ |
+/* |
+ * Lazily builds the global recognizer table (fCSRecognizers). |
+ * |
+ * Does nothing if `status` already indicates failure or if the table is |
+ * already installed. Otherwise it creates one instance of every recognizer, |
+ * copies them into a heap array and publishes that array under the global |
+ * mutex. If another thread published a table first, or if any allocation |
+ * failed, everything created here is released again. |
+ * |
+ * @param status Set to U_MEMORY_ALLOCATION_ERROR when a recognizer or the |
+ *               table itself could not be allocated. |
+ */ |
+void CharsetDetector::setRecognizers(UErrorCode &status) |
+{ |
+    UBool needsInit; |
+ |
+    if (U_FAILURE(status)) { |
+        return; |
+    } |
+ |
+    UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit); |
+ |
+    if (!needsInit) { |
+        return; |
+    } |
+ |
+    CharsetRecognizer *tempArray[] = { |
+        new CharsetRecog_UTF8(), |
+ |
+        new CharsetRecog_UTF_16_BE(), |
+        new CharsetRecog_UTF_16_LE(), |
+        new CharsetRecog_UTF_32_BE(), |
+        new CharsetRecog_UTF_32_LE(), |
+ |
+        new CharsetRecog_8859_1_en(), |
+        new CharsetRecog_8859_1_da(), |
+        new CharsetRecog_8859_1_de(), |
+        new CharsetRecog_8859_1_es(), |
+        new CharsetRecog_8859_1_fr(), |
+        new CharsetRecog_8859_1_it(), |
+        new CharsetRecog_8859_1_nl(), |
+        new CharsetRecog_8859_1_no(), |
+        new CharsetRecog_8859_1_pt(), |
+        new CharsetRecog_8859_1_sv(), |
+        new CharsetRecog_8859_2_cs(), |
+        new CharsetRecog_8859_2_hu(), |
+        new CharsetRecog_8859_2_pl(), |
+        new CharsetRecog_8859_2_ro(), |
+        new CharsetRecog_8859_5_ru(), |
+        new CharsetRecog_8859_6_ar(), |
+        new CharsetRecog_8859_7_el(), |
+        new CharsetRecog_8859_8_I_he(), |
+        new CharsetRecog_8859_8_he(), |
+        new CharsetRecog_windows_1251(), |
+        new CharsetRecog_windows_1256(), |
+        new CharsetRecog_KOI8_R(), |
+        new CharsetRecog_8859_9_tr(), |
+        new CharsetRecog_sjis(), |
+        new CharsetRecog_gb_18030(), |
+        new CharsetRecog_euc_jp(), |
+        new CharsetRecog_euc_kr(), |
+        new CharsetRecog_big5(), |
+ |
+        new CharsetRecog_2022JP(), |
+        new CharsetRecog_2022KR(), |
+        new CharsetRecog_2022CN(), |
+ |
+        new CharsetRecog_IBM424_he_rtl(), |
+        new CharsetRecog_IBM424_he_ltr(), |
+        new CharsetRecog_IBM420_ar_rtl(), |
+        new CharsetRecog_IBM420_ar_ltr() |
+    }; |
+    int32_t rCount = ARRAY_SIZE(tempArray); |
+    int32_t r; |
+ |
+    // Check every recognizer before touching the heap array, so that no |
+    // slot of `recognizers` is ever left uninitialised. |
+    for (r = 0; r < rCount; r += 1) { |
+        if (tempArray[r] == NULL) { |
+            status = U_MEMORY_ALLOCATION_ERROR; |
+            break; |
+        } |
+    } |
+ |
+    CharsetRecognizer **recognizers = NULL; |
+ |
+    if (U_SUCCESS(status)) { |
+        recognizers = NEW_ARRAY(CharsetRecognizer *, rCount); |
+ |
+        if (recognizers == NULL) { |
+            status = U_MEMORY_ALLOCATION_ERROR; |
+        } else { |
+            for (r = 0; r < rCount; r += 1) { |
+                recognizers[r] = tempArray[r]; |
+            } |
+        } |
+    } |
+ |
+    // Ownership is decided while the mutex is held and remembered locally, |
+    // so the global is not re-read after the lock has been released. |
+    UBool adopted = FALSE; |
+ |
+    if (U_SUCCESS(status)) { |
+        umtx_lock(NULL); |
+        if (fCSRecognizers == NULL) { |
+            fCSRecognizers_size = rCount; |
+            fCSRecognizers = recognizers; |
+            adopted = TRUE; |
+        } |
+        umtx_unlock(NULL); |
+    } |
+ |
+    if (!adopted) { |
+        // Either an allocation failed or another thread installed its table |
+        // first. Release through tempArray, which always holds every pointer |
+        // that was created; deleting a NULL entry is a no-op. |
+        for (r = 0; r < rCount; r += 1) { |
+            delete tempArray[r]; |
+            tempArray[r] = NULL; |
+        } |
+ |
+        if (recognizers != NULL) { |
+            DELETE_ARRAY(recognizers); |
+        } |
+    } |
+ |
+    ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); |
+} |
+ |
+/* |
+ * Constructs a detector with no input text, tag stripping disabled and one |
+ * preallocated CharsetMatch per known recognizer. |
+ * |
+ * @param status Left untouched on success. Set to U_MEMORY_ALLOCATION_ERROR |
+ *               when the input holder, the result array or one of the |
+ *               CharsetMatch objects cannot be allocated; may also carry a |
+ *               failure reported by InputText or setRecognizers(). |
+ */ |
+CharsetDetector::CharsetDetector(UErrorCode &status) |
+  : textIn(new InputText(status)), resultArray(NULL), |
+    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE) |
+{ |
+    if (U_FAILURE(status)) { |
+        return; |
+    } |
+ |
+    // This file NULL-checks the result of `new` everywhere else; do the same |
+    // here so that later textIn-> calls cannot dereference NULL. |
+    if (textIn == NULL) { |
+        status = U_MEMORY_ALLOCATION_ERROR; |
+        return; |
+    } |
+ |
+    setRecognizers(status); |
+ |
+    if (U_FAILURE(status)) { |
+        return; |
+    } |
+ |
+    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); |
+ |
+    if (resultArray == NULL) { |
+        status = U_MEMORY_ALLOCATION_ERROR; |
+        return; |
+    } |
+ |
+    int32_t i; |
+ |
+    // NULL-fill first: if an allocation below fails part-way through, the |
+    // destructor still sees only valid (possibly NULL) pointers to delete. |
+    for (i = 0; i < fCSRecognizers_size; i += 1) { |
+        resultArray[i] = NULL; |
+    } |
+ |
+    for (i = 0; i < fCSRecognizers_size; i += 1) { |
+        resultArray[i] = new CharsetMatch(); |
+ |
+        if (resultArray[i] == NULL) { |
+            status = U_MEMORY_ALLOCATION_ERROR; |
+            break; |
+        } |
+    } |
+} |
+ |
+/* |
+ * Releases the input holder and, if it was allocated, the result array |
+ * together with every CharsetMatch it holds. |
+ */ |
+CharsetDetector::~CharsetDetector() |
+{ |
+    delete textIn; |
+ |
+    // resultArray stays NULL when construction failed before or at its |
+    // allocation; indexing it in that case would dereference NULL. |
+    if (resultArray != NULL) { |
+        for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { |
+            delete resultArray[i]; |
+        } |
+ |
+        uprv_free(resultArray); |
+    } |
+} |
+ |
+/* |
+ * Hands the input bytes to the InputText holder and marks the cached |
+ * detection results as stale. |
+ * |
+ * @param in  Input bytes, forwarded unchanged to InputText::setText(). |
+ * @param len Length argument, forwarded unchanged. |
+ */ |
+void CharsetDetector::setText(const char *in, int32_t len) |
+{ |
+    textIn->setText(in, len); |
+ |
+    // Force the next detectAll() to re-run the recognizers. |
+    fFreshTextSet = TRUE; |
+} |
+ |
+/* |
+ * Replaces the tag-stripping flag that detectAll() passes to MungeInput() |
+ * and marks the cached detection results as stale. |
+ * |
+ * @param flag New value of the flag. |
+ * @return The value the flag had before this call. |
+ */ |
+UBool CharsetDetector::setStripTagsFlag(UBool flag) |
+{ |
+    const UBool previous = fStripTags; |
+ |
+    fStripTags = flag; |
+    fFreshTextSet = TRUE; |
+ |
+    return previous; |
+} |
+ |
+/* Returns the current value of the tag-stripping flag. */ |
+UBool CharsetDetector::getStripTagsFlag() const |
+{ |
+    return fStripTags; |
+} |
+ |
+/* |
+ * Forwards a declared encoding name to the InputText holder. |
+ * The method is const because only the object textIn points to is modified. |
+ * |
+ * @param encoding Encoding name, passed through unchanged. |
+ * @param len      Length argument, passed through unchanged. |
+ */ |
+void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const |
+{ |
+    textIn->setDeclaredEncoding(encoding, len); |
+} |
+ |
+/* |
+ * Makes sure the global recognizer table exists and returns its size. |
+ * A failure reported by setRecognizers() is not propagated; in that case |
+ * the returned value is simply whatever fCSRecognizers_size currently holds. |
+ */ |
+int32_t CharsetDetector::getDetectableCount() |
+{ |
+    UErrorCode localStatus = U_ZERO_ERROR; |
+ |
+    setRecognizers(localStatus); |
+ |
+    return fCSRecognizers_size; |
+} |
+ |
+/* |
+ * Runs detectAll() and returns only the first (highest-confidence) match. |
+ * |
+ * @param status Passed through to detectAll(). |
+ * @return resultArray[0] when at least one match was found, otherwise NULL. |
+ */ |
+const CharsetMatch *CharsetDetector::detect(UErrorCode &status) |
+{ |
+    int32_t matchCount = 0; |
+ |
+    detectAll(matchCount, status); |
+ |
+    return (matchCount > 0) ? resultArray[0] : NULL; |
+} |
+ |
+/* |
+ * Runs every recognizer against the current input (only when the text or |
+ * the strip-tags flag changed since the last run) and returns the matches |
+ * ordered from highest to lowest confidence, one entry per charset name. |
+ * |
+ * @param maxMatchesFound Receives the number of valid leading entries in |
+ *                        the returned array. Not written when no text is set. |
+ * @param status          Set to U_MISSING_RESOURCE_ERROR when no input text |
+ *                        has been set; also handed to uprv_sortArray(). |
+ * @return The detector's result array, or NULL when no input text is set. |
+ */ |
+const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) |
+{ |
+    if (!textIn->isSet()) { |
+        status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set |
+ |
+        return NULL; |
+    } |
+ |
+    if (fFreshTextSet) { |
+        textIn->MungeInput(fStripTags); |
+ |
+        // Ask every recognizer; keep those reporting a confidence above zero. |
+        resultCount = 0; |
+        for (int32_t r = 0; r < fCSRecognizers_size; ++r) { |
+            CharsetRecognizer *recognizer = fCSRecognizers[r]; |
+            const int32_t confidence = recognizer->match(textIn); |
+ |
+            if (confidence > 0) { |
+                resultArray[resultCount]->set(textIn, recognizer, confidence); |
+                resultCount += 1; |
+            } |
+        } |
+ |
+        // Reset the slots that are not used by this run. |
+        for (int32_t unused = resultCount; unused < fCSRecognizers_size; ++unused) { |
+            resultArray[unused]->set(textIn, 0, 0); |
+        } |
+ |
+        uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); |
+ |
+        // Drop duplicate charset names (the same charset matched for several |
+        // languages). The array is sorted by descending confidence, so the |
+        // first occurrence is the one to keep. The CharsetMatch objects are |
+        // preallocated, so a duplicate is not deleted: it is moved behind the |
+        // valid results and the valid count is reduced by one. |
+        for (int32_t keep = 0; keep < resultCount; ++keep) { |
+            const char *keptName = resultArray[keep]->getName(); |
+            int32_t probe = keep + 1; |
+ |
+            while (probe < resultCount) { |
+                if (uprv_strcmp(keptName, resultArray[probe]->getName()) == 0) { |
+                    CharsetMatch *duplicate = resultArray[probe]; |
+ |
+                    // Close the gap by shifting the tail one slot to the left. |
+                    for (int32_t src = probe + 1; src < resultCount; ++src) { |
+                        resultArray[src - 1] = resultArray[src]; |
+                    } |
+ |
+                    resultCount -= 1; |
+                    resultArray[resultCount] = duplicate; |
+                    // `probe` is not advanced: a new entry now sits at this index. |
+                } else { |
+                    probe += 1; |
+                } |
+            } |
+        } |
+ |
+        fFreshTextSet = FALSE; |
+    } |
+ |
+    maxMatchesFound = resultCount; |
+ |
+    return resultArray; |
+} |
+ |
+/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const |
+{ |
+ if( index > fCSRecognizers_size-1 || index < 0) { |
+ status = U_INDEX_OUTOFBOUNDS_ERROR; |
+ |
+ return 0; |
+ } else { |
+ return fCSRecognizers[index]->getName(); |
+ } |
+}*/ |
+ |
+U_NAMESPACE_END |
+ |
+U_CDECL_BEGIN |
+/* Per-enumeration state stored in UEnumeration::context by |
+ * ucsdet_getAllDetectableCharsets(). */ |
+typedef struct { |
+ /* Index into fCSRecognizers of the next name enumNext() will return. */ |
+ int32_t currIndex; |
+} Context; |
+ |
+ |
+ |
+/* |
+ * Close callback of the charset-name enumeration: frees the Context |
+ * (when present) and then the UEnumeration object itself. |
+ */ |
+static void U_CALLCONV |
+enumClose(UEnumeration *en) { |
+    void *state = en->context; |
+ |
+    if (state != NULL) { |
+        DELETE_ARRAY(state); |
+    } |
+ |
+    DELETE_ARRAY(en); |
+} |
+ |
+/* |
+ * Count callback of the charset-name enumeration: reports the size of the |
+ * global recognizer table. Both arguments are ignored. |
+ */ |
+static int32_t U_CALLCONV |
+enumCount(UEnumeration * /*en*/, UErrorCode * /*status*/) { |
+    return fCSRecognizers_size; |
+} |
+ |
+/* |
+ * Next callback of the charset-name enumeration. |
+ * |
+ * @param en           Enumeration whose context is a Context. |
+ * @param resultLength If not NULL, receives the length of the returned |
+ *                     name, or 0 once the enumeration is exhausted. |
+ * @return The name reported by the recognizer at the current index, after |
+ *         which the index is advanced; NULL when no entries remain. |
+ */ |
+static const char* U_CALLCONV |
+enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { |
+    Context *cursor = (Context *)en->context; |
+ |
+    if (cursor->currIndex >= fCSRecognizers_size) { |
+        // Past the last recognizer: signal the end of the enumeration. |
+        if (resultLength != NULL) { |
+            *resultLength = 0; |
+        } |
+        return NULL; |
+    } |
+ |
+    const char *name = fCSRecognizers[cursor->currIndex]->getName(); |
+    cursor->currIndex += 1; |
+ |
+    if (resultLength != NULL) { |
+        *resultLength = (int32_t)uprv_strlen(name); |
+    } |
+ |
+    return name; |
+} |
+ |
+/* |
+ * Reset callback of the charset-name enumeration: rewinds the cursor so |
+ * that the next enumNext() call starts again at index 0. |
+ */ |
+static void U_CALLCONV |
+enumReset(UEnumeration *en, UErrorCode * /*status*/) { |
+    Context *cursor = (Context *)en->context; |
+ |
+    cursor->currIndex = 0; |
+} |
+ |
+/* Template that ucsdet_getAllDetectableCharsets() copies into every |
+ * enumeration it creates; the copy's `context` member is then pointed at a |
+ * freshly allocated Context. The initializer is positional, so the order of |
+ * the entries must match the member order of UEnumeration (declared in |
+ * uenumimp.h, not visible here). */ |
+static const UEnumeration gCSDetEnumeration = { |
+ NULL, |
+ NULL, |
+ enumClose, |
+ enumCount, |
+ uenum_unextDefault, |
+ enumNext, |
+ enumReset |
+}; |
+ |
+/* |
+ * Creates an enumeration over the names of all charsets the detector can |
+ * recognise. The detector argument is not used. |
+ * |
+ * @param status Must not indicate failure on entry. Set to |
+ *               U_MEMORY_ALLOCATION_ERROR when the enumeration or its |
+ *               context cannot be allocated. |
+ * @return A new enumeration whose close callback (enumClose) frees it, or |
+ *         0 on failure. |
+ */ |
+U_CAPI UEnumeration * U_EXPORT2 |
+ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status) |
+{ |
+    U_NAMESPACE_USE |
+ |
+    if(status == NULL || U_FAILURE(*status)) { |
+        return 0; |
+    } |
+ |
+    /* Initialize recognized charsets. */ |
+    CharsetDetector::getDetectableCount(); |
+ |
+    UEnumeration *en = NEW_ARRAY(UEnumeration, 1); |
+    if(en == NULL) { |
+        *status = U_MEMORY_ALLOCATION_ERROR; |
+        return 0; |
+    } |
+    uprv_memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); |
+ |
+    en->context = (void*)NEW_ARRAY(Context, 1); |
+    if(en->context == NULL) { |
+        /* Do not hand out an enumeration whose callbacks would dereference NULL. */ |
+        *status = U_MEMORY_ALLOCATION_ERROR; |
+        DELETE_ARRAY(en); |
+        return 0; |
+    } |
+    uprv_memset(en->context, 0, sizeof(Context)); |
+ |
+    return en; |
+} |
+U_CDECL_END |
+ |
+#endif |
+ |
Property changes on: icu46/source/i18n/csdetect.cpp |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |