| Index: icu46/source/i18n/csdetect.cpp
|
| ===================================================================
|
| --- icu46/source/i18n/csdetect.cpp (revision 0)
|
| +++ icu46/source/i18n/csdetect.cpp (revision 0)
|
| @@ -0,0 +1,414 @@
|
| +/*
|
| + **********************************************************************
|
| + * Copyright (C) 2005-2009, International Business Machines
|
| + * Corporation and others. All Rights Reserved.
|
| + **********************************************************************
|
| + */
|
| +
|
| +#include "unicode/utypes.h"
|
| +
|
| +#if !UCONFIG_NO_CONVERSION
|
| +
|
| +#include "unicode/ucsdet.h"
|
| +
|
| +#include "csdetect.h"
|
| +#include "csmatch.h"
|
| +#include "uenumimp.h"
|
| +
|
| +#include "cmemory.h"
|
| +#include "cstring.h"
|
| +#include "umutex.h"
|
| +#include "ucln_in.h"
|
| +#include "uarrsort.h"
|
| +#include "inputext.h"
|
| +#include "csrsbcs.h"
|
| +#include "csrmbcs.h"
|
| +#include "csrutf8.h"
|
| +#include "csrucode.h"
|
| +#include "csr2022.h"
|
| +
|
| +#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0])) // element count; valid for true arrays only
|
| +// NEW_ARRAY/DELETE_ARRAY are thin casts around uprv_malloc/uprv_free (raw storage only).
|
| +#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
|
| +#define DELETE_ARRAY(array) uprv_free((void *) (array))
|
| +
|
| +U_CDECL_BEGIN
|
| +static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL; // shared table; NULL until setRecognizers() installs it
|
| +// Entry count of fCSRecognizers; set together with it and reset to 0 by csdet_cleanup().
|
| +static int32_t fCSRecognizers_size = 0;
|
| +// Cleanup callback registered by setRecognizers(): frees each shared recognizer, then the table.
|
| +static UBool U_CALLCONV csdet_cleanup(void)
|
| +{
|
| +    if (fCSRecognizers == NULL) {
|
| +        return TRUE;  // table was never installed; nothing to release
|
| +    }
|
| +    for (int32_t idx = 0; idx < fCSRecognizers_size; ++idx) {
|
| +        delete fCSRecognizers[idx];
|
| +        fCSRecognizers[idx] = NULL;
|
| +    }
|
| +    DELETE_ARRAY(fCSRecognizers);
|
| +    fCSRecognizers = NULL;
|
| +    fCSRecognizers_size = 0;
|
| +
|
| +    return TRUE;
|
| +}
|
| +// Comparator handed to uprv_sortArray: each argument points at a CharsetMatch* array element.
|
| +static int32_t U_CALLCONV
|
| +charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
|
| +{
|
| +    U_NAMESPACE_USE
|
| +
|
| +    const CharsetMatch *lhs = *(const CharsetMatch * const *) left;
|
| +    const CharsetMatch *rhs = *(const CharsetMatch * const *) right;
|
| +
|
| +    // Right minus left (the reverse of an ascending compare) so the best match sorts first.
|
| +    return rhs->getConfidence() - lhs->getConfidence();
|
| +}
|
| +
|
| +U_CDECL_END
|
| +
|
| +U_NAMESPACE_BEGIN
|
| +// Builds the shared recognizer table on first use; on failure sets status and installs nothing.
|
| +void CharsetDetector::setRecognizers(UErrorCode &status)
|
| +{
|
| +    UBool needsInit;
|
| +    CharsetRecognizer **recognizers;
|
| +
|
| +    if (U_FAILURE(status)) {
|
| +        return;
|
| +    }
|
| +
|
| +    UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
|
| +
|
| +    if (needsInit) {
|
| +        CharsetRecognizer *tempArray[] = {
|
| +            new CharsetRecog_UTF8(),
|
| +
|
| +            new CharsetRecog_UTF_16_BE(),
|
| +            new CharsetRecog_UTF_16_LE(),
|
| +            new CharsetRecog_UTF_32_BE(),
|
| +            new CharsetRecog_UTF_32_LE(),
|
| +
|
| +            new CharsetRecog_8859_1_en(),
|
| +            new CharsetRecog_8859_1_da(),
|
| +            new CharsetRecog_8859_1_de(),
|
| +            new CharsetRecog_8859_1_es(),
|
| +            new CharsetRecog_8859_1_fr(),
|
| +            new CharsetRecog_8859_1_it(),
|
| +            new CharsetRecog_8859_1_nl(),
|
| +            new CharsetRecog_8859_1_no(),
|
| +            new CharsetRecog_8859_1_pt(),
|
| +            new CharsetRecog_8859_1_sv(),
|
| +            new CharsetRecog_8859_2_cs(),
|
| +            new CharsetRecog_8859_2_hu(),
|
| +            new CharsetRecog_8859_2_pl(),
|
| +            new CharsetRecog_8859_2_ro(),
|
| +            new CharsetRecog_8859_5_ru(),
|
| +            new CharsetRecog_8859_6_ar(),
|
| +            new CharsetRecog_8859_7_el(),
|
| +            new CharsetRecog_8859_8_I_he(),
|
| +            new CharsetRecog_8859_8_he(),
|
| +            new CharsetRecog_windows_1251(),
|
| +            new CharsetRecog_windows_1256(),
|
| +            new CharsetRecog_KOI8_R(),
|
| +            new CharsetRecog_8859_9_tr(),
|
| +            new CharsetRecog_sjis(),
|
| +            new CharsetRecog_gb_18030(),
|
| +            new CharsetRecog_euc_jp(),
|
| +            new CharsetRecog_euc_kr(),
|
| +            new CharsetRecog_big5(),
|
| +
|
| +            new CharsetRecog_2022JP(),
|
| +            new CharsetRecog_2022KR(),
|
| +            new CharsetRecog_2022CN(),
|
| +
|
| +            new CharsetRecog_IBM424_he_rtl(),
|
| +            new CharsetRecog_IBM424_he_ltr(),
|
| +            new CharsetRecog_IBM420_ar_rtl(),
|
| +            new CharsetRecog_IBM420_ar_ltr()
|
| +        };
|
| +        int32_t rCount = ARRAY_SIZE(tempArray);
|
| +        int32_t r;
|
| +        UBool installed = FALSE;  // set only under the mutex, when our table becomes the shared one
|
| +
|
| +        recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
|
| +        if (recognizers == NULL) {
|
| +            status = U_MEMORY_ALLOCATION_ERROR;
|
| +        }
|
| +
|
| +        // Visit every slot (no early break) so a failed 'new' never leaves uninitialized entries behind.
|
| +        for (r = 0; r < rCount; r += 1) {
|
| +            if (tempArray[r] == NULL) {
|
| +                status = U_MEMORY_ALLOCATION_ERROR;
|
| +            }
|
| +            if (recognizers != NULL) {
|
| +                recognizers[r] = tempArray[r];
|
| +            }
|
| +        }
|
| +
|
| +        if (U_SUCCESS(status)) {
|
| +            umtx_lock(NULL);
|
| +            if (fCSRecognizers == NULL) {
|
| +                fCSRecognizers_size = rCount;
|
| +                fCSRecognizers = recognizers;
|
| +                installed = TRUE;
|
| +            }
|
| +            umtx_unlock(NULL);
|
| +        }
|
| +
|
| +        if (!installed) {
|
| +            // An allocation failed or another thread installed its table first: discard ours.
|
| +            for (r = 0; r < rCount; r += 1) {
|
| +                delete tempArray[r];
|
| +            }
|
| +            if (recognizers != NULL) {
|
| +                DELETE_ARRAY(recognizers);
|
| +            }
|
| +        }
|
| +        ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
|
| +    }
|
| +}
|
| +// Allocates the input holder and one reusable CharsetMatch slot per known recognizer.
|
| +CharsetDetector::CharsetDetector(UErrorCode &status)
|
| +    : textIn(new InputText(status)), resultArray(NULL),
|
| +      resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
|
| +{
|
| +    if (U_SUCCESS(status) && textIn == NULL) {
|
| +        status = U_MEMORY_ALLOCATION_ERROR;  // 'new InputText' itself failed
|
| +    }
|
| +    if (U_FAILURE(status)) {
|
| +        return;
|
| +    }
|
| +    setRecognizers(status);
|
| +    if (U_FAILURE(status)) {
|
| +        return;
|
| +    }
|
| +
|
| +    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
|
| +    if (resultArray == NULL) {
|
| +        status = U_MEMORY_ALLOCATION_ERROR;
|
| +        return;
|
| +    }
|
| +    // Zero every slot first so the destructor can safely delete a partially filled array.
|
| +    uprv_memset(resultArray, 0, sizeof(CharsetMatch *)*fCSRecognizers_size);
|
| +    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
|
| +        resultArray[i] = new CharsetMatch();
|
| +        if (resultArray[i] == NULL) {
|
| +            status = U_MEMORY_ALLOCATION_ERROR;
|
| +            break;
|
| +        }
|
| +    }
|
| +}
|
| +// Releases the input holder, every preallocated CharsetMatch slot, and the slot array.
|
| +CharsetDetector::~CharsetDetector()
|
| +{
|
| +    delete textIn;
|
| +
|
| +    // resultArray is still NULL when the constructor returned early; skip the slots in that case.
|
| +    for(int32_t i = 0; resultArray != NULL && i < fCSRecognizers_size; i += 1) {
|
| +        delete resultArray[i];
|
| +    }
|
| +    uprv_free(resultArray);
|
| +}
|
| +// Hands the input bytes and their length to textIn and flags that detection must be re-run.
|
| +void CharsetDetector::setText(const char *in, int32_t len)
|
| +{
|
| + textIn->setText(in, len);
|
| + fFreshTextSet = TRUE;
|
| +}
|
| +// Stores the new tag-stripping flag, marks cached results stale, and returns the old flag.
|
| +UBool CharsetDetector::setStripTagsFlag(UBool flag)
|
| +{
|
| +    const UBool previous = fStripTags;
|
| +    fStripTags = flag;
|
| +    fFreshTextSet = TRUE;
|
| +    return previous;
|
| +}
|
| +// Returns the current tag-stripping flag (the value detectAll() passes to MungeInput).
|
| +UBool CharsetDetector::getStripTagsFlag() const
|
| +{
|
| + return fStripTags;
|
| +}
|
| +// Forwards the caller-supplied encoding name and its length to textIn.
|
| +void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
|
| +{
|
| + textIn->setDeclaredEncoding(encoding,len);
|
| +}
|
| +// Returns how many recognizers are registered, building the shared table on first use.
|
| +int32_t CharsetDetector::getDetectableCount()
|
| +{
|
| +    // Errors land in a throwaway status; if the table was never built the count is still 0.
|
| +    UErrorCode initStatus = U_ZERO_ERROR;
|
| +    setRecognizers(initStatus);
|
| +
|
| +    return fCSRecognizers_size;
|
| +}
|
| +// Convenience wrapper over detectAll(): returns only the top-ranked match.
|
| +// Yields NULL when detectAll() reports no matches, which includes the case
|
| +// where no input text has been set (detectAll() then sets status itself).
|
| +const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
|
| +{
|
| +    // Stays 0 if detectAll() returns before assigning it.
|
| +    int32_t matchCount = 0;
|
| +
|
| +    detectAll(matchCount, status);
|
| +
|
| +    // detectAll() sorts by descending confidence, so slot 0 holds the best candidate.
|
| +    return (matchCount > 0) ? resultArray[0] : NULL;
|
| +}
|
| +// Runs every recognizer over the current text, but only when the text or options changed;
|
| +// returns the cached array of matches sorted by descending confidence and stores its
|
| +// length in maxMatchesFound. With no text set it returns NULL and sets an error status.
|
| +const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
|
| +{
|
| +    if(!textIn->isSet()) {
|
| +        status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
|
| +        return NULL;
|
| +    }
|
| +
|
| +    if(fFreshTextSet) {
|
| +        int32_t slot;
|
| +
|
| +        textIn->MungeInput(fStripTags);
|
| +
|
| +        // Poll each recognizer; positive confidences are packed at the front of resultArray.
|
| +        resultCount = 0;
|
| +        for (slot = 0; slot < fCSRecognizers_size; slot += 1) {
|
| +            CharsetRecognizer *recognizer = fCSRecognizers[slot];
|
| +            int32_t score = recognizer->match(textIn);
|
| +
|
| +            if (score > 0) {
|
| +                resultArray[resultCount++]->set(textIn, recognizer, score);
|
| +            }
|
| +        }
|
| +
|
| +        // Reset the unused tail slots (no recognizer, zero confidence).
|
| +        for(slot = resultCount; slot < fCSRecognizers_size; slot += 1) {
|
| +            resultArray[slot]->set(textIn, 0, 0);
|
| +        }
|
| +
|
| +        // Highest confidence first (see charsetMatchComparator).
|
| +        uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
|
| +
|
| +        // Collapse entries that share a charset name (they differ only by language).
|
| +        // Brute force: each kept entry is compared against every entry after it.
|
| +        // After the sort the first occurrence has the highest confidence, so it is the
|
| +        // one kept. A duplicate is not freed: its preallocated CharsetMatch object is
|
| +        // moved to just past the shrinking result range so the slot can be reused.
|
| +        for (int32_t keep = 0; keep < resultCount; keep++) {
|
| +            const char *keptName = resultArray[keep]->getName();
|
| +            int32_t probe = keep + 1;
|
| +
|
| +            while (probe < resultCount) {
|
| +                if (uprv_strcmp(keptName, resultArray[probe]->getName()) != 0) {
|
| +                    probe++;  // different charset, move on
|
| +                    continue;
|
| +                }
|
| +
|
| +                // Duplicate at probe: shift the remaining results down and park it at the end.
|
| +                CharsetMatch *duplicate = resultArray[probe];
|
| +                for (int32_t shift = probe; shift < resultCount-1; shift++) {
|
| +                    resultArray[shift] = resultArray[shift+1];
|
| +                }
|
| +                resultCount--;
|
| +                resultArray[resultCount] = duplicate;
|
| +            }
|
| +        }
|
| +
|
| +        // Results stay cached until setText() or setStripTagsFlag() marks them stale again.
|
| +        fFreshTextSet = FALSE;
|
| +    }
|
| +
|
| +    maxMatchesFound = resultCount;
|
| +
|
| +    return resultArray;
|
| +}
|
| +
|
| +/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
|
| +{
|
| + if( index > fCSRecognizers_size-1 || index < 0) {
|
| + status = U_INDEX_OUTOFBOUNDS_ERROR;
|
| +
|
| + return 0;
|
| + } else {
|
| + return fCSRecognizers[index]->getName();
|
| + }
|
| +}*/
|
| +
|
| +U_NAMESPACE_END
|
| +
|
| +U_CDECL_BEGIN
|
| +typedef struct {
|
| + int32_t currIndex; // position in fCSRecognizers of the next name enumNext() hands out
|
| +} Context;
|
| +
|
| +
|
| +// Installed in gCSDetEnumeration: releases the enumeration's Context (if any), then the UEnumeration.
|
| +static void U_CALLCONV
|
| +enumClose(UEnumeration *en) {
|
| + if(en->context != NULL) {
|
| + DELETE_ARRAY(en->context);
|
| + }
|
| + // Both objects are allocated with NEW_ARRAY in ucsdet_getAllDetectableCharsets(), hence DELETE_ARRAY.
|
| + DELETE_ARRAY(en);
|
| +}
|
| +// Installed in gCSDetEnumeration: reports the size of the shared recognizer table; both arguments are ignored.
|
| +static int32_t U_CALLCONV
|
| +enumCount(UEnumeration *, UErrorCode *) {
|
| + return fCSRecognizers_size;
|
| +}
|
| +// Installed in gCSDetEnumeration: returns the recognizer name at the cursor and advances the cursor.
|
| +// Past the end of the table it returns NULL and reports a length of 0.
|
| +static const char* U_CALLCONV
|
| +enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
|
| +    Context *cursor = (Context *)en->context;
|
| +    const UBool exhausted = (UBool)(cursor->currIndex >= fCSRecognizers_size);
|
| +    const char *name = NULL;
|
| +
|
| +    if(!exhausted) {
|
| +        name = fCSRecognizers[cursor->currIndex]->getName();
|
| +        cursor->currIndex++;
|
| +    }
|
| +    if(resultLength != NULL) {
|
| +        *resultLength = exhausted ? 0 : (int32_t)uprv_strlen(name);
|
| +    }
|
| +    return name;
|
| +}
|
| +// Installed in gCSDetEnumeration: rewinds the cursor so the next enumNext() call starts at index 0.
|
| +static void U_CALLCONV
|
| +enumReset(UEnumeration *en, UErrorCode *) {
|
| + ((Context *)en->context)->currIndex = 0;
|
| +}
|
| +// Prototype that ucsdet_getAllDetectableCharsets() copies into every UEnumeration it creates.
|
| +static const UEnumeration gCSDetEnumeration = {
|
| + NULL, // NOTE(review): the two leading NULL members are presumably per-instance fields
|
| + NULL, // (en->context is assigned after the copy) - confirm against uenumimp.h
|
| + enumClose,
|
| + enumCount,
|
| + uenum_unextDefault,
|
| + enumNext,
|
| + enumReset
|
| +};
|
| +// Returns a new enumeration over all recognizer names; on failure returns 0 and sets *status.
|
| +U_CAPI UEnumeration * U_EXPORT2
|
| +ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
|
| +{
|
| +    U_NAMESPACE_USE
|
| +    if(U_FAILURE(*status)) { return 0; }
|
| +    CharsetDetector::getDetectableCount();  /* Initialize recognized charsets. */
|
| +    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
|
| +    Context *context = (en != NULL) ? NEW_ARRAY(Context, 1) : NULL;
|
| +    if(context == NULL) {  // either allocation failed; en may or may not exist
|
| +        if(en != NULL) { DELETE_ARRAY(en); }
|
| +        *status = U_MEMORY_ALLOCATION_ERROR;
|
| +        return 0;
|
| +    }
|
| +    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
|
| +    uprv_memset(context, 0, sizeof(Context));  // cursor starts at index 0
|
| +    en->context = (void*)context;
|
| +    return en;
|
| +}
|
| +U_CDECL_END
|
| +
|
| +#endif
|
| +
|
|
|
| Property changes on: icu46/source/i18n/csdetect.cpp
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|