icu46/source/i18n/csr2022.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/csr2022.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 2005-2009, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 */

	7

	8 #include "unicode/utypes.h"

	9

	10 #if !UCONFIG_NO_CONVERSION

	11

	12 #include "cstring.h"

	13

	14 #include "csr2022.h"

	15

	16 U_NAMESPACE_BEGIN

	17

	18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

	19

	20 /**

	21 * Matching function shared among the 2022 detectors JP, CN and KR

	22 * Counts up the number of legal and unrecognized escape sequences in

	23 * the sample of text, and computes a score based on the total number &

	24 * the proportion that fit the encoding.

	25 *

	26 *

	27 * @param text the byte buffer containing text to analyse

	28 * @param textLen the size of the text in bytes.

	29 * @param escapeSequences the byte escape sequences to test for.

	30 * @return match quality, in the range of 0-100.

	31 */

	32 int32_t CharsetRecog_2022::match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length)

	33 {

	34 int32_t i, j;

	35 int32_t escN;

	36 int32_t hits = 0;

	37 int32_t misses = 0;

	38 int32_t shifts = 0;

	39 int32_t quality;

	40

	41 i = 0;

	42 while(i < textLen) {

	43 if(text[i] == 0x1B) {

	44 escN = 0;

	45 while(escN < escapeSequences_length) {

	46 const uint8_t *seq = escapeSequences[escN];

	47 int32_t seq_length = (int32_t)uprv_strlen((const char *) seq);

	48

	49 if (textLen-i >= seq_length) {

	50 j = 1;

	51 while(j < seq_length) {

	52 if(seq[j] != text[i+j]) {

	53 goto checkEscapes;

	54 }

	55

	56 j += 1;

	57 }

	58

	59 hits += 1;

	60 i += seq_length-1;

	61 goto scanInput;

	62 }

	63 // else we ran out of string to compare this time.

	64 checkEscapes:

	65 escN += 1;

	66 }

	67

	68 misses += 1;

	69 }

	70

	71 if( text[i]== 0x0e || text[i] == 0x0f){

	72 shifts += 1;

	73 }

	74

	75 scanInput:

	76 i += 1;

	77 }

	78

	79 if (hits == 0) {

	80 return 0;

	81 }

	82

	83 //

	84 // Initial quality is based on relative proportion of recognized vs.

	85 // unrecognized escape sequences.

	86 // All good: quality = 100;

	87 // half or less good: quality = 0;

	88 // linear in between.

	89 quality = (100*hits - 100*misses) / (hits + misses);

	90

	91 // Back off quality if there were too few escape sequences seen.

	92 // Include shifts in this computation, so that KR does not get penalized

	93 // for having only a single Escape sequence, but many shifts.

	94 if (hits+shifts < 5) {

	95 quality -= (5-(hits+shifts))*10;

	96 }

	97

	98 if (quality < 0) {

	99 quality = 0;

	100 }

	101

	102 return quality;

	103 }

	104

	105

	106 static const uint8_t escapeSequences_2022JP[][5] = {

	107 {0x1b, 0x24, 0x28, 0x43, 0x00}, // KS X 1001:1992

	108 {0x1b, 0x24, 0x28, 0x44, 0x00}, // JIS X 212-1990

	109 {0x1b, 0x24, 0x40, 0x00, 0x00}, // JIS C 6226-1978

	110 {0x1b, 0x24, 0x41, 0x00, 0x00}, // GB 2312-80

	111 {0x1b, 0x24, 0x42, 0x00, 0x00}, // JIS X 208-1983

	112 {0x1b, 0x26, 0x40, 0x00, 0x00}, // JIS X 208 1990, 1997

	113 {0x1b, 0x28, 0x42, 0x00, 0x00}, // ASCII

	114 {0x1b, 0x28, 0x48, 0x00, 0x00}, // JIS-Roman

	115 {0x1b, 0x28, 0x49, 0x00, 0x00}, // Half-width katakana

	116 {0x1b, 0x28, 0x4a, 0x00, 0x00}, // JIS-Roman

	117 {0x1b, 0x2e, 0x41, 0x00, 0x00}, // ISO 8859-1

	118 {0x1b, 0x2e, 0x46, 0x00, 0x00} // ISO 8859-7

	119 };

	120

	121 static const uint8_t escapeSequences_2022KR[][5] = {

	122 {0x1b, 0x24, 0x29, 0x43, 0x00}

	123 };

	124

	125 static const uint8_t escapeSequences_2022CN[][5] = {

	126 {0x1b, 0x24, 0x29, 0x41, 0x00}, // GB 2312-80

	127 {0x1b, 0x24, 0x29, 0x47, 0x00}, // CNS 11643-1992 Plane 1

	128 {0x1b, 0x24, 0x2A, 0x48, 0x00}, // CNS 11643-1992 Plane 2

	129 {0x1b, 0x24, 0x29, 0x45, 0x00}, // ISO-IR-165

	130 {0x1b, 0x24, 0x2B, 0x49, 0x00}, // CNS 11643-1992 Plane 3

	131 {0x1b, 0x24, 0x2B, 0x4A, 0x00}, // CNS 11643-1992 Plane 4

	132 {0x1b, 0x24, 0x2B, 0x4B, 0x00}, // CNS 11643-1992 Plane 5

	133 {0x1b, 0x24, 0x2B, 0x4C, 0x00}, // CNS 11643-1992 Plane 6

	134 {0x1b, 0x24, 0x2B, 0x4D, 0x00}, // CNS 11643-1992 Plane 7

	135 {0x1b, 0x4e, 0x00, 0x00, 0x00}, // SS2

	136 {0x1b, 0x4f, 0x00, 0x00, 0x00}, // SS3

	137 };

	138

	139 const char *CharsetRecog_2022JP::getName() const

	140 {

	141 return "ISO-2022-JP";

	142 }

	143

	144 int32_t CharsetRecog_2022JP::match(InputText *textIn)

	145 {

	146 return match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_2022JP, ARRAY_SIZE(escapeSequences_2022JP));

	147 }

	148

	149 const char *CharsetRecog_2022KR::getName() const

	150 {

	151 return "ISO-2022-KR";

	152 }

	153

	154 int32_t CharsetRecog_2022KR::match(InputText *textIn)

	155 {

	156 return match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_2022KR, ARRAY_SIZE(escapeSequences_2022KR));

	157 }

	158

	159 const char *CharsetRecog_2022CN::getName() const

	160 {

	161 return "ISO-2022-CN";

	162 }

	163

	164 int32_t CharsetRecog_2022CN::match(InputText *textIn)

	165 {

	166 return match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_2022CN, ARRAY_SIZE(escapeSequences_2022CN));

	167 }

	168

	169 CharsetRecog_2022::~CharsetRecog_2022()

	170 {

	171 // nothing to do

	172 }

	173

	174 U_NAMESPACE_END

	175 #endif

OLD	NEW

« no previous file with comments | « icu46/source/i18n/csr2022.h ('k') | icu46/source/i18n/csrecog.h » ('j') | no next file with comments »