icu46/source/i18n/csdetect.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/csdetect.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 2005-2009, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 */

	7

	8 #include "unicode/utypes.h"

	9

	10 #if !UCONFIG_NO_CONVERSION

	11

	12 #include "unicode/ucsdet.h"

	13

	14 #include "csdetect.h"

	15 #include "csmatch.h"

	16 #include "uenumimp.h"

	17

	18 #include "cmemory.h"

	19 #include "cstring.h"

	20 #include "umutex.h"

	21 #include "ucln_in.h"

	22 #include "uarrsort.h"

	23 #include "inputext.h"

	24 #include "csrsbcs.h"

	25 #include "csrmbcs.h"

	26 #include "csrutf8.h"

	27 #include "csrucode.h"

	28 #include "csr2022.h"

	29

	30 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

	31

	32 #define NEW_ARRAY(type,count) (type ) uprv_malloc((count) sizeof(type))

	33 #define DELETE_ARRAY(array) uprv_free((void *) (array))

	34

	35 U_CDECL_BEGIN

	36 static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL;

	37

	38 static int32_t fCSRecognizers_size = 0;

	39

	40 static UBool U_CALLCONV csdet_cleanup(void)

	41 {

	42 if (fCSRecognizers != NULL) {

	43 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {

	44 delete fCSRecognizers[r];

	45 fCSRecognizers[r] = NULL;

	46 }

	47

	48 DELETE_ARRAY(fCSRecognizers);

	49 fCSRecognizers = NULL;

	50 fCSRecognizers_size = 0;

	51 }

	52

	53 return TRUE;

	54 }

	55

	56 static int32_t U_CALLCONV

	57 charsetMatchComparator(const void * /context/, const void left, const void r ight)

	58 {

	59 U_NAMESPACE_USE

	60

	61 const CharsetMatch csm_l = (const CharsetMatch ) left;

	62 const CharsetMatch csm_r = (const CharsetMatch ) right;

	63

	64 // NOTE: compare is backwards to sort from highest to lowest.

	65 return (csm_r)->getConfidence() - (csm_l)->getConfidence();

	66 }

	67

	68 U_CDECL_END

	69

	70 U_NAMESPACE_BEGIN

	71

	72 void CharsetDetector::setRecognizers(UErrorCode &status)

	73 {

	74 UBool needsInit;

	75 CharsetRecognizer **recognizers;

	76

	77 if (U_FAILURE(status)) {

	78 return;

	79 }

	80

	81 UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);

	82

	83 if (needsInit) {

	84 CharsetRecognizer *tempArray[] = {

	85 new CharsetRecog_UTF8(),

	86

	87 new CharsetRecog_UTF_16_BE(),

	88 new CharsetRecog_UTF_16_LE(),

	89 new CharsetRecog_UTF_32_BE(),

	90 new CharsetRecog_UTF_32_LE(),

	91

	92 new CharsetRecog_8859_1_en(),

	93 new CharsetRecog_8859_1_da(),

	94 new CharsetRecog_8859_1_de(),

	95 new CharsetRecog_8859_1_es(),

	96 new CharsetRecog_8859_1_fr(),

	97 new CharsetRecog_8859_1_it(),

	98 new CharsetRecog_8859_1_nl(),

	99 new CharsetRecog_8859_1_no(),

	100 new CharsetRecog_8859_1_pt(),

	101 new CharsetRecog_8859_1_sv(),

	102 new CharsetRecog_8859_2_cs(),

	103 new CharsetRecog_8859_2_hu(),

	104 new CharsetRecog_8859_2_pl(),

	105 new CharsetRecog_8859_2_ro(),

	106 new CharsetRecog_8859_5_ru(),

	107 new CharsetRecog_8859_6_ar(),

	108 new CharsetRecog_8859_7_el(),

	109 new CharsetRecog_8859_8_I_he(),

	110 new CharsetRecog_8859_8_he(),

	111 new CharsetRecog_windows_1251(),

	112 new CharsetRecog_windows_1256(),

	113 new CharsetRecog_KOI8_R(),

	114 new CharsetRecog_8859_9_tr(),

	115 new CharsetRecog_sjis(),

	116 new CharsetRecog_gb_18030(),

	117 new CharsetRecog_euc_jp(),

	118 new CharsetRecog_euc_kr(),

	119 new CharsetRecog_big5(),

	120

	121 new CharsetRecog_2022JP(),

	122 new CharsetRecog_2022KR(),

	123 new CharsetRecog_2022CN(),

	124

	125 new CharsetRecog_IBM424_he_rtl(),

	126 new CharsetRecog_IBM424_he_ltr(),

	127 new CharsetRecog_IBM420_ar_rtl(),

	128 new CharsetRecog_IBM420_ar_ltr()

	129 };

	130 int32_t rCount = ARRAY_SIZE(tempArray);

	131 int32_t r;

	132

	133 recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);

	134

	135 if (recognizers == NULL) {

	136 status = U_MEMORY_ALLOCATION_ERROR;

	137 return;

	138 } else {

	139 for (r = 0; r < rCount; r += 1) {

	140 recognizers[r] = tempArray[r];

	141

	142 if (recognizers[r] == NULL) {

	143 status = U_MEMORY_ALLOCATION_ERROR;

	144 break;

	145 }

	146 }

	147 }

	148

	149 if (U_SUCCESS(status)) {

	150 umtx_lock(NULL);

	151 if (fCSRecognizers == NULL) {

	152 fCSRecognizers_size = rCount;

	153 fCSRecognizers = recognizers;

	154 }

	155 umtx_unlock(NULL);

	156 }

	157

	158 if (fCSRecognizers != recognizers) {

	159 for (r = 0; r < rCount; r += 1) {

	160 delete recognizers[r];

	161 recognizers[r] = NULL;

	162 }

	163

	164 DELETE_ARRAY(recognizers);

	165 }

	166

	167 recognizers = NULL;

	168 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);

	169 }

	170 }

	171

	172 CharsetDetector::CharsetDetector(UErrorCode &status)

	173 : textIn(new InputText(status)), resultArray(NULL),

	174 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)

	175 {

	176 if (U_FAILURE(status)) {

	177 return;

	178 }

	179

	180 setRecognizers(status);

	181

	182 if (U_FAILURE(status)) {

	183 return;

	184 }

	185

	186 resultArray = (CharsetMatch *)uprv_malloc(sizeof(CharsetMatch )*fCSRecogni zers_size);

	187

	188 if (resultArray == NULL) {

	189 status = U_MEMORY_ALLOCATION_ERROR;

	190 return;

	191 }

	192

	193 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {

	194 resultArray[i] = new CharsetMatch();

	195

	196 if (resultArray[i] == NULL) {

	197 status = U_MEMORY_ALLOCATION_ERROR;

	198 break;

	199 }

	200 }

	201 }

	202

	203 CharsetDetector::~CharsetDetector()

	204 {

	205 delete textIn;

	206

	207 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {

	208 delete resultArray[i];

	209 }

	210

	211 uprv_free(resultArray);

	212 }

	213

	214 void CharsetDetector::setText(const char *in, int32_t len)

	215 {

	216 textIn->setText(in, len);

	217 fFreshTextSet = TRUE;

	218 }

	219

	220 UBool CharsetDetector::setStripTagsFlag(UBool flag)

	221 {

	222 UBool temp = fStripTags;

	223 fStripTags = flag;

	224 fFreshTextSet = TRUE;

	225 return temp;

	226 }

	227

	228 UBool CharsetDetector::getStripTagsFlag() const

	229 {

	230 return fStripTags;

	231 }

	232

	233 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) con st

	234 {

	235 textIn->setDeclaredEncoding(encoding,len);

	236 }

	237

	238 int32_t CharsetDetector::getDetectableCount()

	239 {

	240 UErrorCode status = U_ZERO_ERROR;

	241

	242 setRecognizers(status);

	243

	244 return fCSRecognizers_size;

	245 }

	246

	247 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)

	248 {

	249 int32_t maxMatchesFound = 0;

	250

	251 detectAll(maxMatchesFound, status);

	252

	253 if(maxMatchesFound > 0) {

	254 return resultArray[0];

	255 } else {

	256 return NULL;

	257 }

	258 }

	259

	260 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)

	261 {

	262 if(!textIn->isSet()) {

	263 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status co de for input text not set

	264

	265 return NULL;

	266 } else if(fFreshTextSet) {

	267 CharsetRecognizer *csr;

	268 int32_t detectResults;

	269 int32_t confidence;

	270 int32_t i;

	271

	272 textIn->MungeInput(fStripTags);

	273

	274 // Iterate over all possible charsets, remember all that

	275 // give a match quality > 0.

	276 resultCount = 0;

	277 for (i = 0; i < fCSRecognizers_size; i += 1) {

	278 csr = fCSRecognizers[i];

	279 detectResults = csr->match(textIn);

	280 confidence = detectResults;

	281

	282 if (confidence > 0) {

	283 resultArray[resultCount++]->set(textIn, csr, confidence);

	284 }

	285 }

	286

	287 for(i = resultCount; i < fCSRecognizers_size; i += 1) {

	288 resultArray[i]->set(textIn, 0, 0);

	289 }

	290

	291 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetM atchComparator, NULL, TRUE, &status);

	292

	293 // Remove duplicate charsets from the results.

	294 // Simple minded, brute force approach - check each entry against all th at follow.

	295 // The first entry of any duplicated set is the one that should be kept because it will

	296 // be the one with the highest confidence rating.

	297 // (Duplicate matches have different languages, only the charset is th e same)

	298 // Because the resultArray contains preallocated CharsetMatch objects, t hey aren't actually

	299 // deleted, just reordered, with the unwanted duplicates placed after th e good results.

	300 int32_t j, k;

	301 for (i=0; i<resultCount; i++) {

	302 const char *charSetName = resultArray[i]->getName();

	303 for (j=i+1; j<resultCount; ) {

	304 if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {

	305 // Not a duplicate.

	306 j++;

	307 } else {

	308 // Duplicate entry at index j.

	309 CharsetMatch *duplicate = resultArray[j];

	310 for (k=j; k<resultCount-1; k++) {

	311 resultArray[k] = resultArray[k+1];

	312 }

	313 resultCount--;

	314 resultArray[resultCount] = duplicate;

	315 }

	316 }

	317 }

	318

	319 fFreshTextSet = FALSE;

	320 }

	321

	322 maxMatchesFound = resultCount;

	323

	324 return resultArray;

	325 }

	326

/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
{
    if( index > fCSRecognizers_size-1 || index < 0) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;

        return 0;
    } else {
        return fCSRecognizers[index]->getName();
    }
}*/

	337

	338 U_NAMESPACE_END

	339

	340 U_CDECL_BEGIN

	341 typedef struct {

	342 int32_t currIndex;

	343 } Context;

	344

	345

	346

	347 static void U_CALLCONV

	348 enumClose(UEnumeration *en) {

	349 if(en->context != NULL) {

	350 DELETE_ARRAY(en->context);

	351 }

	352

	353 DELETE_ARRAY(en);

	354 }

	355

	356 static int32_t U_CALLCONV

	357 enumCount(UEnumeration , UErrorCode ) {

	358 return fCSRecognizers_size;

	359 }

	360

	361 static const char* U_CALLCONV

	362 enumNext(UEnumeration en, int32_t resultLength, UErrorCode * /status/) {

	363 if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {

	364 if(resultLength != NULL) {

	365 *resultLength = 0;

	366 }

	367 return NULL;

	368 }

	369 const char currName = fCSRecognizers[((Context )en->context)->currIndex]-> getName();

	370 if(resultLength != NULL) {

	371 *resultLength = (int32_t)uprv_strlen(currName);

	372 }

	373 ((Context *)en->context)->currIndex++;

	374

	375 return currName;

	376 }

	377

	378 static void U_CALLCONV

	379 enumReset(UEnumeration en, UErrorCode ) {

	380 ((Context *)en->context)->currIndex = 0;

	381 }

	382

	383 static const UEnumeration gCSDetEnumeration = {

	384 NULL,

	385 NULL,

	386 enumClose,

	387 enumCount,

	388 uenum_unextDefault,

	389 enumNext,

	390 enumReset

	391 };

	392

	393 U_CAPI UEnumeration * U_EXPORT2

	394 ucsdet_getAllDetectableCharsets(const UCharsetDetector * /ucsd/, UErrorCode *s tatus)

	395 {

	396 U_NAMESPACE_USE

	397

	398 if(U_FAILURE(*status)) {

	399 return 0;

	400 }

	401

	402 /* Initialize recognized charsets. */

	403 CharsetDetector::getDetectableCount();

	404

	405 UEnumeration *en = NEW_ARRAY(UEnumeration, 1);

	406 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));

	407 en->context = (void*)NEW_ARRAY(Context, 1);

	408 uprv_memset(en->context, 0, sizeof(Context));

	409 return en;

	410 }

	411 U_CDECL_END

	412

	413 #endif

	414

OLD	NEW

« no previous file with comments | « icu46/source/i18n/csdetect.h ('k') | icu46/source/i18n/csmatch.h » ('j') | no next file with comments »