icu46/source/i18n/csrmbcs.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/csrmbcs.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 2005-2008, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 */

	7

	8 #include "unicode/utypes.h"

	9

	10 #if !UCONFIG_NO_CONVERSION

	11

	12 #include "csrmbcs.h"

	13

	14 #include <math.h>

	15

	16 U_NAMESPACE_BEGIN

	17

	18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

	19

	20 #define min(x,y) (((x)<(y))?(x):(y))

	21

	// Frequently occurring two-byte Shift_JIS code values (lead byte << 8 | trail byte).
	// The table MUST stay sorted in ascending order: it is passed to match_mbcs(),
	// which looks values up with binarySearch().
	static const uint16_t commonChars_sjis[] = {
	// TODO:  This set of data comes from the character frequency-
	//        of-occurrence analysis tool.  The data needs to be moved
	//        into a resource and loaded from there.
	0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
	0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
	0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
	0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
	0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
	0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};

	32

	// Frequently occurring two-byte EUC-JP code values (lead byte << 8 | trail byte).
	// The table MUST stay sorted in ascending order: it is passed to match_mbcs(),
	// which looks values up with binarySearch().
	static const uint16_t commonChars_euc_jp[] = {
	// TODO:  This set of data comes from the character frequency-
	//        of-occurrence analysis tool.  The data needs to be moved
	//        into a resource and loaded from there.
	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
	0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
	0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
	0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
	0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
	0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
	0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
	0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
	0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
	0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};

	47

	// Frequently occurring two-byte EUC-KR code values (lead byte << 8 | trail byte).
	// The table MUST stay sorted in ascending order: it is passed to match_mbcs(),
	// which looks values up with binarySearch().
	static const uint16_t commonChars_euc_kr[] = {
	// TODO:  This set of data comes from the character frequency-
	//        of-occurrence analysis tool.  The data needs to be moved
	//        into a resource and loaded from there.
	0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
	0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
	0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
	0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
	0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
	0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
	0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
	0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
	0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
	0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};

	62

	// Frequently occurring two-byte Big5 code values (lead byte << 8 | trail byte).
	// The table MUST stay sorted in ascending order: it is passed to match_mbcs(),
	// which looks values up with binarySearch().
	static const uint16_t commonChars_big5[] = {
	// TODO:  This set of data comes from the character frequency-
	//        of-occurrence analysis tool.  The data needs to be moved
	//        into a resource and loaded from there.
	0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
	0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
	0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
	0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
	0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
	0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
	0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
	0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
	0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
	0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};

	77

	// Frequently occurring two-byte GB18030 code values (lead byte << 8 | trail byte).
	// The table MUST stay sorted in ascending order: it is passed to match_mbcs(),
	// which looks values up with binarySearch().
	static const uint16_t commonChars_gb_18030[] = {
	// TODO:  This set of data comes from the character frequency-
	//        of-occurrence analysis tool.  The data needs to be moved
	//        into a resource and loaded from there.
	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
	0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
	0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
	0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
	0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
	0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
	0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
	0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
	0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
	0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};

	92

	// Binary search for `value` in `array`, which must hold `len` elements
	// sorted in ascending order.
	//
	// Returns the index of a matching element, or -1 when the value is absent,
	// the table pointer is null, or `len` is not positive.
	static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
	{
	    if (array == 0 || len <= 0) {
	        return -1;
	    }

	    int32_t start = 0;
	    int32_t end = len - 1;

	    while (start <= end) {
	        // Written as start + half-the-gap so the sum cannot overflow,
	        // unlike (start + end) / 2; both forms pick the same midpoint.
	        const int32_t mid = start + (end - start) / 2;

	        if (array[mid] == value) {
	            return mid;
	        }

	        if (array[mid] < value) {
	            start = mid + 1;
	        } else {
	            end = mid - 1;
	        }
	    }

	    return -1;
	}

	114

	115 IteratedChar::IteratedChar() :

	116 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)

	117 {

	118 // nothing else to do.

	119 }

	120

	121 /*void IteratedChar::reset()

	122 {

	123 charValue = 0;

	124 index = -1;

	125 nextIndex = 0;

	126 error = FALSE;

	127 done = FALSE;

	128 }*/

	129

	130 int32_t IteratedChar::nextByte(InputText *det)

	131 {

	132 if (nextIndex >= det->fRawLength) {

	133 done = TRUE;

	134

	135 return -1;

	136 }

	137

	138 return det->fRawInput[nextIndex++];

	139 }

	140

	141 CharsetRecog_mbcs::~CharsetRecog_mbcs()

	142 {

	143 // nothing to do.

	144 }

	145

	146 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars [], int32_t commonCharsLen) {

	147 int32_t singleByteCharCount = 0;

	148 int32_t doubleByteCharCount = 0;

	149 int32_t commonCharCount = 0;

	150 int32_t badCharCount = 0;

	151 int32_t totalCharCount = 0;

	152 int32_t confidence = 0;

	153 IteratedChar iter;

	154

	155 while (nextChar(&iter, det)) {

	156 totalCharCount++;

	157

	158 if (iter.error) {

	159 badCharCount++;

	160 } else {

	161 if (iter.charValue <= 0xFF) {

	162 singleByteCharCount++;

	163 } else {

	164 doubleByteCharCount++;

	165

	166 if (commonChars != 0) {

	167 if (binarySearch(commonChars, commonCharsLen, iter.charValue ) >= 0){

	168 commonCharCount += 1;

	169 }

	170 }

	171 }

	172 }

	173

	174

	175 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {

	176 // Bail out early if the byte data is not matching the encoding sche me.

	177 // break detectBlock;

	178 return confidence;

	179 }

	180 }

	181

	182 if (doubleByteCharCount <= 10 && badCharCount == 0) {

	183 // Not many multi-byte chars.

	184 if (doubleByteCharCount == 0 && totalCharCount < 10) {

	185 // There weren't any multibyte sequences, and there was a low densit y of non-ASCII single bytes.

	186 // We don't have enough data to have any confidence.

	187 // Statistical analysis of single byte non-ASCII charcters would pro bably help here.

	188 confidence = 0;

	189 }

	190 else {

	191 // ASCII or ISO file? It's probably not our encoding,

	192 // but is not incompatible with our encoding, so don't give it a z ero.

	193 confidence = 10;

	194 }

	195

	196 return confidence;

	197 }

	198

	199 //

	200 // No match if there are too many characters that don't fit the encoding sc heme.

	201 // (should we have zero tolerance for these?)

	202 //

	203 if (doubleByteCharCount < 20*badCharCount) {

	204 confidence = 0;

	205

	206 return confidence;

	207 }

	208

	209 if (commonChars == 0) {

	210 // We have no statistics on frequently occuring characters.

	211 // Assess confidence purely on having a reasonable number of

	212 // multi-byte characters (the more the better)

	213 confidence = 30 + doubleByteCharCount - 20*badCharCount;

	214

	215 if (confidence > 100) {

	216 confidence = 100;

	217 }

	218 } else {

	219 //

	220 // Frequency of occurence statistics exist.

	221 //

	222

	223 double maxVal = log10((double)doubleByteCharCount / 4); /(float)?/

	224 double scaleFactor = 90.0 / maxVal;

	225 confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);

	226

	227 confidence = min(confidence, 100);

	228 }

	229

	230 if (confidence < 0) {

	231 confidence = 0;

	232 }

	233

	234 return confidence;

	235 }

	236

	237 CharsetRecog_sjis::~CharsetRecog_sjis()

	238 {

	239 // nothing to do

	240 }

	241

	242 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {

	243 it->index = it->nextIndex;

	244 it->error = FALSE;

	245

	246 int32_t firstByte = it->charValue = it->nextByte(det);

	247

	248 if (firstByte < 0) {

	249 return FALSE;

	250 }

	251

	252 if (firstByte <= 0x7F \|\| (firstByte > 0xA0 && firstByte <= 0xDF)) {

	253 return TRUE;

	254 }

	255

	256 int32_t secondByte = it->nextByte(det);

	257 if (secondByte >= 0) {

	258 it->charValue = (firstByte << 8) \| secondByte;

	259 }

	260 // else we'll handle the error later.

	261

	262 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) \|\| (secondByte >= 0x80 && secondByte <= 0xFE))) {

	263 // Illegal second byte value.

	264 it->error = TRUE;

	265 }

	266

	267 return TRUE;

	268 }

	269

	270 int32_t CharsetRecog_sjis::match(InputText* det)

	271 {

	272 return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));

	273 }

	274

	275 const char *CharsetRecog_sjis::getName() const

	276 {

	277 return "Shift_JIS";

	278 }

	279

	280 const char *CharsetRecog_sjis::getLanguage() const

	281 {

	282 return "ja";

	283 }

	284

	285 CharsetRecog_euc::~CharsetRecog_euc()

	286 {

	287 // nothing to do

	288 }

	289

	290 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {

	291 int32_t firstByte = 0;

	292 int32_t secondByte = 0;

	293 int32_t thirdByte = 0;

	294

	295 it->index = it->nextIndex;

	296 it->error = FALSE;

	297 firstByte = it->charValue = it->nextByte(det);

	298

	299 if (firstByte < 0) {

	300 // Ran off the end of the input data

	301 return FALSE;

	302 }

	303

	304 if (firstByte <= 0x8D) {

	305 // single byte char

	306 return TRUE;

	307 }

	308

	309 secondByte = it->nextByte(det);

	310 if (secondByte >= 0) {

	311 it->charValue = (it->charValue << 8) \| secondByte;

	312 }

	313 // else we'll handle the error later.

	314

	315 if (firstByte >= 0xA1 && firstByte <= 0xFE) {

	316 // Two byte Char

	317 if (secondByte < 0xA1) {

	318 it->error = TRUE;

	319 }

	320

	321 return TRUE;

	322 }

	323

	324 if (firstByte == 0x8E) {

	325 // Code Set 2.

	326 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.

	327 // In EUC-TW, total char size is 4 bytes, three bytes contribute to ch ar value.

	328 // We don't know which we've got.

	329 // Treat it like EUC-JP. If the data really was EUC-TW, the following t wo

	330 // bytes will look like a well formed 2 byte char.

	331 if (secondByte < 0xA1) {

	332 it->error = TRUE;

	333 }

	334

	335 return TRUE;

	336 }

	337

	338 if (firstByte == 0x8F) {

	339 // Code set 3.

	340 // Three byte total char size, two bytes of actual char value.

	341 thirdByte = it->nextByte(det);

	342 it->charValue = (it->charValue << 8) \| thirdByte;

	343

	344 if (thirdByte < 0xa1) {

	345 // Bad second byte or ran off the end of the input data with a non-A SCII first byte.

	346 it->error = TRUE;

	347 }

	348 }

	349

	350 return TRUE;

	351

	352 }

	353

	354 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()

	355 {

	356 // nothing to do

	357 }

	358

	359 const char *CharsetRecog_euc_jp::getName() const

	360 {

	361 return "EUC-JP";

	362 }

	363

	364 const char *CharsetRecog_euc_jp::getLanguage() const

	365 {

	366 return "ja";

	367 }

	368

	369 int32_t CharsetRecog_euc_jp::match(InputText *det)

	370 {

	371 return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));

	372 }

	373

	374 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()

	375 {

	376 // nothing to do

	377 }

	378

	379 const char *CharsetRecog_euc_kr::getName() const

	380 {

	381 return "EUC-KR";

	382 }

	383

	384 const char *CharsetRecog_euc_kr::getLanguage() const

	385 {

	386 return "ko";

	387 }

	388

	389 int32_t CharsetRecog_euc_kr::match(InputText *det)

	390 {

	391 return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));

	392 }

	393

	394 CharsetRecog_big5::~CharsetRecog_big5()

	395 {

	396 // nothing to do

	397 }

	398

	399 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)

	400 {

	401 int32_t firstByte;

	402

	403 it->index = it->nextIndex;

	404 it->error = FALSE;

	405 firstByte = it->charValue = it->nextByte(det);

	406

	407 if (firstByte < 0) {

	408 return FALSE;

	409 }

	410

	411 if (firstByte <= 0x7F \|\| firstByte == 0xFF) {

	412 // single byte character.

	413 return TRUE;

	414 }

	415

	416 int32_t secondByte = it->nextByte(det);

	417 if (secondByte >= 0) {

	418 it->charValue = (it->charValue << 8) \| secondByte;

	419 }

	420 // else we'll handle the error later.

	421

	422 if (secondByte < 0x40 \|\| secondByte == 0x7F \|\| secondByte == 0xFF) {

	423 it->error = TRUE;

	424 }

	425

	426 return TRUE;

	427 }

	428

	429 const char *CharsetRecog_big5::getName() const

	430 {

	431 return "Big5";

	432 }

	433

	434 const char *CharsetRecog_big5::getLanguage() const

	435 {

	436 return "zh";

	437 }

	438

	439 int32_t CharsetRecog_big5::match(InputText *det)

	440 {

	441 return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));

	442 }

	443

	444 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()

	445 {

	446 // nothing to do

	447 }

	448

	449 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {

	450 int32_t firstByte = 0;

	451 int32_t secondByte = 0;

	452 int32_t thirdByte = 0;

	453 int32_t fourthByte = 0;

	454

	455 it->index = it->nextIndex;

	456 it->error = FALSE;

	457 firstByte = it->charValue = it->nextByte(det);

	458

	459 if (firstByte < 0) {

	460 // Ran off the end of the input data

	461 return FALSE;

	462 }

	463

	464 if (firstByte <= 0x80) {

	465 // single byte char

	466 return TRUE;

	467 }

	468

	469 secondByte = it->nextByte(det);

	470 if (secondByte >= 0) {

	471 it->charValue = (it->charValue << 8) \| secondByte;

	472 }

	473 // else we'll handle the error later.

	474

	475 if (firstByte >= 0x81 && firstByte <= 0xFE) {

	476 // Two byte Char

	477 if ((secondByte >= 0x40 && secondByte <= 0x7E) \|\| (secondByte >=80 && se condByte <= 0xFE)) {

	478 return TRUE;

	479 }

	480

	481 // Four byte char

	482 if (secondByte >= 0x30 && secondByte <= 0x39) {

	483 thirdByte = it->nextByte(det);

	484

	485 if (thirdByte >= 0x81 && thirdByte <= 0xFE) {

	486 fourthByte = it->nextByte(det);

	487

	488 if (fourthByte >= 0x30 && fourthByte <= 0x39) {

	489 it->charValue = (it->charValue << 16) \| (thirdByte << 8) \| f ourthByte;

	490

	491 return TRUE;

	492 }

	493 }

	494 }

	495

	496 // Something wasn't valid, or we ran out of data (-1).

	497 it->error = TRUE;

	498 }

	499

	500 return TRUE;

	501 }

	502

	503 const char *CharsetRecog_gb_18030::getName() const

	504 {

	505 return "GB18030";

	506 }

	507

	508 const char *CharsetRecog_gb_18030::getLanguage() const

	509 {

	510 return "zh";

	511 }

	512

	513 int32_t CharsetRecog_gb_18030::match(InputText *det)

	514 {

	515 return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030 ));

	516 }

	517

	518 U_NAMESPACE_END

	519 #endif

OLD	NEW

« no previous file with comments | « icu46/source/i18n/csrmbcs.h ('k') | icu46/source/i18n/csrsbcs.h » ('j') | no next file with comments »