icu46/source/i18n/csr2022.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/i18n/csr2022.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/i18n/csr2022.cpp

===================================================================

--- icu46/source/i18n/csr2022.cpp (revision 0)

+++ icu46/source/i18n/csr2022.cpp (revision 0)

@@ -0,0 +1,175 @@

+/*

+ **********************************************************************

+ */

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_CONVERSION

+#include "cstring.h"

+#include "csr2022.h"

+U_NAMESPACE_BEGIN

+#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

+/**

+ * Matching function shared among the 2022 detectors JP, CN and KR

+ * Counts up the number of legal and unrecognized escape sequences in

+ * the sample of text, and computes a score based on the total number &

+ * the proportion that fit the encoding.

+ *

+ * @param text the byte buffer containing text to analyse

+ * @param textLen the size of the text in bytes.

+ * @param escapeSequences the byte escape sequences to test for.

+ * @param escapeSequences_length the number of entries in escapeSequences.

+ * @return match quality, in the range of 0-100.

+ */

+int32_t CharsetRecog_2022::match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length)

+ int32_t i, j;

+ int32_t escN;

+ int32_t hits = 0; // ESC-led sequences that matched an entry of escapeSequences

+ int32_t misses = 0; // ESC bytes for which no entry of escapeSequences matched

+ int32_t shifts = 0; // number of 0x0E / 0x0F bytes seen outside recognized sequences

+ int32_t quality;

+ i = 0;

+ while(i < textLen) {

+ if(text[i] == 0x1B) { // 0x1B is ESC: try every candidate sequence at this position

+ escN = 0;

+ while(escN < escapeSequences_length) {

+ const uint8_t *seq = escapeSequences[escN];

+ int32_t seq_length = (int32_t)uprv_strlen((const char *) seq); // rows in this file's tables are NUL-padded, so this is the sequence's byte count

+ if (textLen-i >= seq_length) { // compare only when the whole sequence fits in the remaining input

+ j = 1; // byte 0 is not compared here; text[i] is already known to be ESC

+ while(j < seq_length) {

+ if(seq[j] != text[i+j]) {

+ goto checkEscapes; // mismatch: move on to the next candidate sequence

+ }

+ j += 1;

+ }

+ hits += 1;

+ i += seq_length-1; // i now indexes the last byte of the match; the increment at scanInput steps past it

+ goto scanInput; // skips the miss and shift accounting below for a recognized sequence

+ }

+ // else we ran out of string to compare this time.

+checkEscapes:

+ escN += 1;

+ }

+ misses += 1; // reached only when no candidate matched this ESC

+ }

+ if( text[i]== 0x0e || text[i] == 0x0f){ // ASCII SO (0x0E) / SI (0x0F)

+ shifts += 1;

+ }

+scanInput:

+ i += 1;

+ }

+ if (hits == 0) { // no recognized escape sequence: report zero quality (also avoids dividing by zero below when there were no ESC bytes)

+ return 0;

+ }

+ //

+ // Initial quality is based on relative proportion of recognized vs.

+ // unrecognized escape sequences.

+ // All good: quality = 100;

+ // half or less good: quality = 0;

+ // linear in between.

+ quality = (100*hits - 100*misses) / (hits + misses); // integer arithmetic; negative when misses > hits, clamped to 0 further down

+ // Back off quality if there were too few escape sequences seen.

+ // Include shifts in this computation, so that KR does not get penalized

+ // for having only a single Escape sequence, but many shifts.

+ if (hits+shifts < 5) {

+ quality -= (5-(hits+shifts))*10; // 10 points off for each hit-or-shift short of 5

+ }

+ if (quality < 0) {

+ quality = 0;

+ }

+ return quality;

+static const uint8_t escapeSequences_2022JP[][5] = { // sequences the ISO-2022-JP recognizer counts as hits; each row starts with ESC (0x1b) and is zero-padded to 5 bytes

+ {0x1b, 0x24, 0x28, 0x43, 0x00}, // KS X 1001:1992

+ {0x1b, 0x24, 0x28, 0x44, 0x00}, // JIS X 0212-1990

+ {0x1b, 0x24, 0x40, 0x00, 0x00}, // JIS C 6226-1978

+ {0x1b, 0x24, 0x41, 0x00, 0x00}, // GB 2312-80

+ {0x1b, 0x24, 0x42, 0x00, 0x00}, // JIS X 0208-1983

+ {0x1b, 0x26, 0x40, 0x00, 0x00}, // JIS X 0208 1990, 1997

+ {0x1b, 0x28, 0x42, 0x00, 0x00}, // ASCII

+ {0x1b, 0x28, 0x48, 0x00, 0x00}, // JIS-Roman

+ {0x1b, 0x28, 0x49, 0x00, 0x00}, // Half-width katakana

+ {0x1b, 0x28, 0x4a, 0x00, 0x00}, // JIS-Roman

+ {0x1b, 0x2e, 0x41, 0x00, 0x00}, // ISO 8859-1

+ {0x1b, 0x2e, 0x46, 0x00, 0x00} // ISO 8859-7

+};

+static const uint8_t escapeSequences_2022KR[][5] = { // the single sequence the ISO-2022-KR recognizer counts as a hit

+ {0x1b, 0x24, 0x29, 0x43, 0x00} // ESC $ ) C in ASCII; NOTE(review): presumably the ISO-2022-KR designator from RFC 1557 - confirm

+};

+static const uint8_t escapeSequences_2022CN[][5] = { // sequences the ISO-2022-CN recognizer counts as hits; each row starts with ESC (0x1b) and is zero-padded to 5 bytes

+ {0x1b, 0x24, 0x29, 0x41, 0x00}, // GB 2312-80

+ {0x1b, 0x24, 0x29, 0x47, 0x00}, // CNS 11643-1992 Plane 1

+ {0x1b, 0x24, 0x2A, 0x48, 0x00}, // CNS 11643-1992 Plane 2

+ {0x1b, 0x24, 0x29, 0x45, 0x00}, // ISO-IR-165

+ {0x1b, 0x24, 0x2B, 0x49, 0x00}, // CNS 11643-1992 Plane 3

+ {0x1b, 0x24, 0x2B, 0x4A, 0x00}, // CNS 11643-1992 Plane 4

+ {0x1b, 0x24, 0x2B, 0x4B, 0x00}, // CNS 11643-1992 Plane 5

+ {0x1b, 0x24, 0x2B, 0x4C, 0x00}, // CNS 11643-1992 Plane 6

+ {0x1b, 0x24, 0x2B, 0x4D, 0x00}, // CNS 11643-1992 Plane 7

+ {0x1b, 0x4e, 0x00, 0x00, 0x00}, // SS2

+ {0x1b, 0x4f, 0x00, 0x00, 0x00}, // SS3

+};

+const char *CharsetRecog_2022JP::getName() const // name reported by the JP recognizer

+ return "ISO-2022-JP";

+int32_t CharsetRecog_2022JP::match(InputText *textIn) // scores textIn's input bytes against the 2022JP escape-sequence table

+ return match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_2022JP, ARRAY_SIZE(escapeSequences_2022JP)); // ARRAY_SIZE yields the table's row count

+const char *CharsetRecog_2022KR::getName() const // name reported by the KR recognizer

+ return "ISO-2022-KR";

+int32_t CharsetRecog_2022KR::match(InputText *textIn) // scores textIn's input bytes against the 2022KR escape-sequence table

+ return match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_2022KR, ARRAY_SIZE(escapeSequences_2022KR)); // ARRAY_SIZE yields the table's row count

+const char *CharsetRecog_2022CN::getName() const // name reported by the CN recognizer

+ return "ISO-2022-CN";

+int32_t CharsetRecog_2022CN::match(InputText *textIn) // scores textIn's input bytes against the 2022CN escape-sequence table

+ return match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_2022CN, ARRAY_SIZE(escapeSequences_2022CN)); // ARRAY_SIZE yields the table's row count

+CharsetRecog_2022::~CharsetRecog_2022() // out-of-line destructor definition for the shared 2022 recognizer base

+ // nothing to do

+U_NAMESPACE_END

+#endif

Property changes on: icu46/source/i18n/csr2022.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/i18n/csr2022.h ('k') | icu46/source/i18n/csrecog.h » ('j') | no next file with comments »