icu46/source/i18n/csdetect.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/i18n/csdetect.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/i18n/csdetect.cpp

===================================================================

--- icu46/source/i18n/csdetect.cpp (revision 0)

+++ icu46/source/i18n/csdetect.cpp (revision 0)

@@ -0,0 +1,414 @@

+/*

+ **********************************************************************

+ */

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_CONVERSION

+#include "unicode/ucsdet.h"

+#include "csdetect.h"

+#include "csmatch.h"

+#include "uenumimp.h"

+#include "cmemory.h"

+#include "cstring.h"

+#include "umutex.h"

+#include "ucln_in.h"

+#include "uarrsort.h"

+#include "inputext.h"

+#include "csrsbcs.h"

+#include "csrmbcs.h"

+#include "csrutf8.h"

+#include "csrucode.h"

+#include "csr2022.h"

+#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) // Element count of a true array; the argument is not parenthesized, so pass a plain array name only.

+#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) // Raw storage for `count` objects of `type` from uprv_malloc(); no constructors run, result may be NULL.

+#define DELETE_ARRAY(array) uprv_free((void *) (array)) // Releases storage obtained through NEW_ARRAY; no destructors run.

+U_CDECL_BEGIN

+static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL; // Shared table of recognizer instances; installed by CharsetDetector::setRecognizers(), released by csdet_cleanup().

+static int32_t fCSRecognizers_size = 0; // Number of entries in fCSRecognizers; stays 0 while the table is not installed.

+static UBool U_CALLCONV csdet_cleanup(void) // Cleanup hook handed to ucln_i18n_registerCleanup(): tears down the shared recognizer table.

+ if (fCSRecognizers != NULL) {

+ for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {

+ delete fCSRecognizers[r]; // every entry was created with new in setRecognizers()

+ fCSRecognizers[r] = NULL;

+ }

+ DELETE_ARRAY(fCSRecognizers);

+ fCSRecognizers = NULL; // reset both globals so the table reads as "not installed" again

+ fCSRecognizers_size = 0;

+ }

+ return TRUE; // unconditionally reports TRUE

+static int32_t U_CALLCONV // Comparison callback given to uprv_sortArray() in detectAll(); orders CharsetMatch pointers by descending confidence.

+charsetMatchComparator(const void * /*context*/, const void *left, const void *right) // left/right each address an array slot holding a CharsetMatch pointer; the context argument is unused

+ U_NAMESPACE_USE

+ const CharsetMatch **csm_l = (const CharsetMatch **) left;

+ const CharsetMatch **csm_r = (const CharsetMatch **) right;

+ // NOTE: compare is backwards to sort from highest to lowest.

+ return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); // positive when the right element is the more confident one

+U_CDECL_END

+U_NAMESPACE_BEGIN

+void CharsetDetector::setRecognizers(UErrorCode &status) // Builds and installs the shared recognizer table when it is not yet set; does nothing if status already holds a failure, and reports allocation problems as U_MEMORY_ALLOCATION_ERROR.

+ UBool needsInit;

+ CharsetRecognizer **recognizers;

+ if (U_FAILURE(status)) {

+ return;

+ }

+ UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit); // needsInit receives (fCSRecognizers == NULL); presumably sampled under the global mutex - confirm against UMTX_CHECK in umutex.h

+ if (needsInit) {

+ CharsetRecognizer *tempArray[] = { // one instance of every supported recognizer; a failed new is detected below as a NULL entry

+ new CharsetRecog_UTF8(),

+ new CharsetRecog_UTF_16_BE(),

+ new CharsetRecog_UTF_16_LE(),

+ new CharsetRecog_UTF_32_BE(),

+ new CharsetRecog_UTF_32_LE(),

+ new CharsetRecog_8859_1_en(),

+ new CharsetRecog_8859_1_da(),

+ new CharsetRecog_8859_1_de(),

+ new CharsetRecog_8859_1_es(),

+ new CharsetRecog_8859_1_fr(),

+ new CharsetRecog_8859_1_it(),

+ new CharsetRecog_8859_1_nl(),

+ new CharsetRecog_8859_1_no(),

+ new CharsetRecog_8859_1_pt(),

+ new CharsetRecog_8859_1_sv(),

+ new CharsetRecog_8859_2_cs(),

+ new CharsetRecog_8859_2_hu(),

+ new CharsetRecog_8859_2_pl(),

+ new CharsetRecog_8859_2_ro(),

+ new CharsetRecog_8859_5_ru(),

+ new CharsetRecog_8859_6_ar(),

+ new CharsetRecog_8859_7_el(),

+ new CharsetRecog_8859_8_I_he(),

+ new CharsetRecog_8859_8_he(),

+ new CharsetRecog_windows_1251(),

+ new CharsetRecog_windows_1256(),

+ new CharsetRecog_KOI8_R(),

+ new CharsetRecog_8859_9_tr(),

+ new CharsetRecog_sjis(),

+ new CharsetRecog_gb_18030(),

+ new CharsetRecog_euc_jp(),

+ new CharsetRecog_euc_kr(),

+ new CharsetRecog_big5(),

+ new CharsetRecog_2022JP(),

+ new CharsetRecog_2022KR(),

+ new CharsetRecog_2022CN(),

+ new CharsetRecog_IBM424_he_rtl(),

+ new CharsetRecog_IBM424_he_ltr(),

+ new CharsetRecog_IBM420_ar_rtl(),

+ new CharsetRecog_IBM420_ar_ltr()

+ };

+ int32_t rCount = ARRAY_SIZE(tempArray);

+ int32_t r;

+ recognizers = NEW_ARRAY(CharsetRecognizer *, rCount); // heap copy of the table; this is what gets installed in fCSRecognizers

+ if (recognizers == NULL) {

+ status = U_MEMORY_ALLOCATION_ERROR;

+ return; // NOTE(review): the objects already created in tempArray are not deleted on this path - looks like a leak; confirm

+ } else {

+ for (r = 0; r < rCount; r += 1) { // copy the instances across, stopping at the first NULL (failed new)

+ recognizers[r] = tempArray[r];

+ if (recognizers[r] == NULL) {

+ status = U_MEMORY_ALLOCATION_ERROR;

+ break; // NOTE(review): slots after this index are never written, yet the discard loop below walks all rCount slots - confirm

+ }

+ if (U_SUCCESS(status)) {

+ umtx_lock(NULL);

+ if (fCSRecognizers == NULL) { // install only if no table was installed in the meantime

+ fCSRecognizers_size = rCount;

+ fCSRecognizers = recognizers;

+ }

+ umtx_unlock(NULL);

+ }

+ if (fCSRecognizers != recognizers) { // our copy was not installed (failure, or a table is already in place): discard it

+ for (r = 0; r < rCount; r += 1) {

+ delete recognizers[r];

+ recognizers[r] = NULL;

+ }

+ DELETE_ARRAY(recognizers);

+ }

+ recognizers = NULL;

+ ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); // registers csdet_cleanup under UCLN_I18N_CSDET; presumably run at library cleanup - confirm in ucln_in.h

+ }

+/**
+ * Creates a detector with no input text set.
+ *
+ * Allocates the InputText holder, makes sure the shared recognizer table
+ * exists, and preallocates one CharsetMatch per recognizer so detectAll()
+ * never has to allocate.
+ *
+ * @param status  In/out error code. Left untouched on success; set to
+ *                U_MEMORY_ALLOCATION_ERROR when any allocation fails.
+ *                On failure the object is still safe to destroy.
+ */
+CharsetDetector::CharsetDetector(UErrorCode &status)
+  : textIn(new InputText(status)), resultArray(NULL),
+    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
+{
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    // Like the other `new` results checked in this file, textIn may be NULL
+    // on allocation failure; every other method dereferences it.
+    if (textIn == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+
+    setRecognizers(status);
+
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
+
+    if (resultArray == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+
+    // Clear every slot first: the destructor deletes all fCSRecognizers_size
+    // entries, so slots left unfilled by a failure below must be NULL.
+    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
+        resultArray[i] = NULL;
+    }
+
+    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
+        resultArray[i] = new CharsetMatch();
+
+        if (resultArray[i] == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            break;
+        }
+    }
+}

+/**
+ * Destroys the input-text holder and every preallocated CharsetMatch,
+ * then releases the result array itself.
+ */
+CharsetDetector::~CharsetDetector()
+{
+    delete textIn;
+
+    // resultArray is still NULL when the constructor failed before
+    // allocating it; indexing it in that state would dereference NULL.
+    if (resultArray != NULL) {
+        for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
+            delete resultArray[i];
+        }
+
+        uprv_free(resultArray);
+    }
+}

+void CharsetDetector::setText(const char *in, int32_t len) // Hands the input bytes (pointer and length) to textIn and marks cached results stale.

+ textIn->setText(in, len);

+ fFreshTextSet = TRUE; // forces the next detectAll() to run the recognizers again

+UBool CharsetDetector::setStripTagsFlag(UBool flag) // Replaces the strip-tags setting and returns the value it had before the call.

+ UBool temp = fStripTags;

+ fStripTags = flag;

+ fFreshTextSet = TRUE; // the flag is passed to MungeInput() in detectAll(), so cached results are marked stale

+ return temp;

+UBool CharsetDetector::getStripTagsFlag() const // Returns the current strip-tags setting.

+ return fStripTags;

+void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const // Forwards the declared-encoding name and its length to textIn; does not touch fFreshTextSet.

+ textIn->setDeclaredEncoding(encoding,len);

+int32_t CharsetDetector::getDetectableCount() // Ensures the shared recognizer table is built, then returns how many recognizers it holds.

+ UErrorCode status = U_ZERO_ERROR;

+ setRecognizers(status); // NOTE(review): status is local and never inspected, so an initialization failure is only visible through the returned count

+ return fCSRecognizers_size;

+const CharsetMatch *CharsetDetector::detect(UErrorCode &status) // Runs detectAll() and returns its first entry (the highest-confidence match after sorting), or NULL when the match count is 0.

+ int32_t maxMatchesFound = 0;

+ detectAll(maxMatchesFound, status);

+ if(maxMatchesFound > 0) {

+ return resultArray[0];

+ } else {

+ return NULL;

+ }

+const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) // Returns resultArray and stores the number of valid leading entries in maxMatchesFound; recognizers are re-run only when text or flags changed since the last call.

+ if(!textIn->isSet()) {

+ status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set

+ return NULL; // maxMatchesFound is left unmodified on this path

+ } else if(fFreshTextSet) {

+ CharsetRecognizer *csr;

+ int32_t detectResults;

+ int32_t confidence;

+ int32_t i;

+ textIn->MungeInput(fStripTags);

+ // Iterate over all possible charsets, remember all that

+ // give a match quality > 0.

+ resultCount = 0;

+ for (i = 0; i < fCSRecognizers_size; i += 1) {

+ csr = fCSRecognizers[i];

+ detectResults = csr->match(textIn);

+ confidence = detectResults;

+ if (confidence > 0) {

+ resultArray[resultCount++]->set(textIn, csr, confidence); // reuse the next preallocated CharsetMatch

+ }

+ for(i = resultCount; i < fCSRecognizers_size; i += 1) {

+ resultArray[i]->set(textIn, 0, 0); // reset the unused tail entries with a null recognizer and zero confidence

+ }

+ uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); // sorts only the first resultCount entries; NOTE(review): status is not checked before the pass below

+ // Remove duplicate charsets from the results.

+ // Simple minded, brute force approach - check each entry against all that follow.

+ // The first entry of any duplicated set is the one that should be kept because it will

+ // be the one with the highest confidence rating.

+ // (Duplicate matches have different languages, only the charset is the same)

+ // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually

+ // deleted, just reordered, with the unwanted duplicates placed after the good results.

+ int32_t j, k;

+ for (i=0; i<resultCount; i++) {

+ const char *charSetName = resultArray[i]->getName();

+ for (j=i+1; j<resultCount; ) { // j advances only when entry j is kept

+ if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {

+ // Not a duplicate.

+ j++;

+ } else {

+ // Duplicate entry at index j.

+ CharsetMatch *duplicate = resultArray[j];

+ for (k=j; k<resultCount-1; k++) { // shift the following entries down by one

+ resultArray[k] = resultArray[k+1];

+ }

+ resultCount--;

+ resultArray[resultCount] = duplicate; // park the duplicate just past the valid range so the object is kept for reuse

+ }

+ fFreshTextSet = FALSE; // results now match the current text and flags

+ }

+ maxMatchesFound = resultCount;

+ return resultArray;

+/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const

+ if( index > fCSRecognizers_size-1 || index < 0) {

+ status = U_INDEX_OUTOFBOUNDS_ERROR;

+ return 0;

+ } else {

+ return fCSRecognizers[index]->getName();

+ }

+}*/

+U_NAMESPACE_END

+U_CDECL_BEGIN

+typedef struct { // Per-enumeration state; an instance is stored in the enumeration's context pointer.

+ int32_t currIndex; // index into fCSRecognizers of the next name enumNext() returns

+} Context;

+static void U_CALLCONV // Close callback: releases the Context (if any) and then the enumeration object itself.

+enumClose(UEnumeration *en) {

+ if(en->context != NULL) {

+ DELETE_ARRAY(en->context);

+ }

+ DELETE_ARRAY(en); // en was allocated with NEW_ARRAY in ucsdet_getAllDetectableCharsets()

+static int32_t U_CALLCONV // Count callback: reports the size of the shared recognizer table; both arguments are ignored.

+enumCount(UEnumeration *, UErrorCode *) {

+ return fCSRecognizers_size;

+static const char* U_CALLCONV // Next callback: returns the name of the recognizer at the current index and advances, or NULL once the index reaches the table size.

+enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {

+ if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {

+ if(resultLength != NULL) {

+ *resultLength = 0;

+ }

+ return NULL;

+ }

+ const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();

+ if(resultLength != NULL) { // resultLength is optional; when given it receives the name's length from uprv_strlen

+ *resultLength = (int32_t)uprv_strlen(currName);

+ }

+ ((Context *)en->context)->currIndex++;

+ return currName;

+static void U_CALLCONV // Reset callback: rewinds the enumeration to the first recognizer; the error-code argument is ignored.

+enumReset(UEnumeration *en, UErrorCode *) {

+ ((Context *)en->context)->currIndex = 0;

+static const UEnumeration gCSDetEnumeration = { // Template copied into each enumeration created by ucsdet_getAllDetectableCharsets(), which then sets the copy's context.

+ NULL,

+ enumClose,

+ enumCount,

+ uenum_unextDefault, // not defined in this file; presumably the stock UChar-returning next built on the char* next - confirm in uenumimp.h

+ enumNext,

+ enumReset

+};

+/**
+ * Creates an enumeration over the names of all recognizers in the shared
+ * table. The detector argument is unused.
+ *
+ * @param status  In/out error code. Nothing is done if it is NULL or already
+ *                holds a failure. Set to U_MEMORY_ALLOCATION_ERROR when the
+ *                enumeration or its context cannot be allocated.
+ * @return A new enumeration owned by the caller (released through its close
+ *         callback, enumClose), or 0 on failure.
+ */
+U_CAPI UEnumeration * U_EXPORT2
+ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
+{
+    U_NAMESPACE_USE
+
+    if(status == NULL || U_FAILURE(*status)) {
+        return 0;
+    }
+
+    /* Initialize recognized charsets. */
+    CharsetDetector::getDetectableCount();
+
+    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
+    if(en == NULL) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return 0;
+    }
+    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
+
+    Context *enumContext = NEW_ARRAY(Context, 1);
+    if(enumContext == NULL) {
+        /* Do not leak the enumeration shell when its context cannot be allocated. */
+        DELETE_ARRAY(en);
+        *status = U_MEMORY_ALLOCATION_ERROR;
+        return 0;
+    }
+    uprv_memset(enumContext, 0, sizeof(Context));   /* currIndex starts at 0 */
+    en->context = (void*)enumContext;
+
+    return en;
+}

+U_CDECL_END

+#endif

Property changes on: icu46/source/i18n/csdetect.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/i18n/csdetect.h ('k') | icu46/source/i18n/csmatch.h » ('j') | no next file with comments »