OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2005-2009, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 */ |
| 7 |
| 8 #include "unicode/utypes.h" |
| 9 |
| 10 #if !UCONFIG_NO_CONVERSION |
| 11 |
| 12 #include "cstring.h" |
| 13 |
| 14 #include "csr2022.h" |
| 15 |
| 16 U_NAMESPACE_BEGIN |
| 17 |
// Number of elements in a true array (not a pointer). The argument is
// parenthesized so that expressions more complex than a plain identifier
// still parse as intended.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
| 19 |
| 20 /** |
| 21 * Matching function shared among the 2022 detectors JP, CN and KR |
| 22 * Counts up the number of legal an unrecognized escape sequences in |
| 23 * the sample of text, and computes a score based on the total number & |
| 24 * the proportion that fit the encoding. |
| 25 * |
| 26 * |
| 27 * @param text the byte buffer containing text to analyse |
| 28 * @param textLen the size of the text in the byte. |
| 29 * @param escapeSequences the byte escape sequences to test for. |
| 30 * @return match quality, in the range of 0-100. |
| 31 */ |
| 32 int32_t CharsetRecog_2022::match_2022(const uint8_t *text, int32_t textLen, cons
t uint8_t escapeSequences[][5], int32_t escapeSequences_length) |
| 33 { |
| 34 int32_t i, j; |
| 35 int32_t escN; |
| 36 int32_t hits = 0; |
| 37 int32_t misses = 0; |
| 38 int32_t shifts = 0; |
| 39 int32_t quality; |
| 40 |
| 41 i = 0; |
| 42 while(i < textLen) { |
| 43 if(text[i] == 0x1B) { |
| 44 escN = 0; |
| 45 while(escN < escapeSequences_length) { |
| 46 const uint8_t *seq = escapeSequences[escN]; |
| 47 int32_t seq_length = (int32_t)uprv_strlen((const char *) seq); |
| 48 |
| 49 if (textLen-i >= seq_length) { |
| 50 j = 1; |
| 51 while(j < seq_length) { |
| 52 if(seq[j] != text[i+j]) { |
| 53 goto checkEscapes; |
| 54 } |
| 55 |
| 56 j += 1; |
| 57 } |
| 58 |
| 59 hits += 1; |
| 60 i += seq_length-1; |
| 61 goto scanInput; |
| 62 } |
| 63 // else we ran out of string to compare this time. |
| 64 checkEscapes: |
| 65 escN += 1; |
| 66 } |
| 67 |
| 68 misses += 1; |
| 69 } |
| 70 |
| 71 if( text[i]== 0x0e || text[i] == 0x0f){ |
| 72 shifts += 1; |
| 73 } |
| 74 |
| 75 scanInput: |
| 76 i += 1; |
| 77 } |
| 78 |
| 79 if (hits == 0) { |
| 80 return 0; |
| 81 } |
| 82 |
| 83 // |
| 84 // Initial quality is based on relative proportion of recongized vs. |
| 85 // unrecognized escape sequences. |
| 86 // All good: quality = 100; |
| 87 // half or less good: quality = 0; |
| 88 // linear inbetween. |
| 89 quality = (100*hits - 100*misses) / (hits + misses); |
| 90 |
| 91 // Back off quality if there were too few escape sequences seen. |
| 92 // Include shifts in this computation, so that KR does not get penalized |
| 93 // for having only a single Escape sequence, but many shifts. |
| 94 if (hits+shifts < 5) { |
| 95 quality -= (5-(hits+shifts))*10; |
| 96 } |
| 97 |
| 98 if (quality < 0) { |
| 99 quality = 0; |
| 100 } |
| 101 |
| 102 return quality; |
| 103 } |
| 104 |
| 105 |
// Escape sequences tested by the ISO-2022-JP detector. Every entry begins with
// ESC (0x1b) and is zero-padded to five bytes (bytes not listed are
// zero-initialized), so each entry is also a NUL-terminated string.
static const uint8_t escapeSequences_2022JP[][5] = {
    {0x1b, 0x24, 0x28, 0x43},   // ESC $ ( C   KS X 1001:1992
    {0x1b, 0x24, 0x28, 0x44},   // ESC $ ( D   JIS X 0212-1990
    {0x1b, 0x24, 0x40},         // ESC $ @     JIS C 6226-1978
    {0x1b, 0x24, 0x41},         // ESC $ A     GB 2312-80
    {0x1b, 0x24, 0x42},         // ESC $ B     JIS X 0208-1983
    {0x1b, 0x26, 0x40},         // ESC & @     JIS X 0208 1990, 1997
    {0x1b, 0x28, 0x42},         // ESC ( B     ASCII
    {0x1b, 0x28, 0x48},         // ESC ( H     JIS-Roman
    {0x1b, 0x28, 0x49},         // ESC ( I     Half-width katakana
    {0x1b, 0x28, 0x4a},         // ESC ( J     JIS-Roman
    {0x1b, 0x2e, 0x41},         // ESC . A     ISO 8859-1
    {0x1b, 0x2e, 0x46}          // ESC . F     ISO 8859-7
};
| 120 |
// The single escape sequence tested by the ISO-2022-KR detector, stored as a
// NUL-terminated five-byte entry.
static const uint8_t escapeSequences_2022KR[][5] = {
    {0x1b, 0x24, 0x29, 0x43, 0x00}   // ESC $ ) C
};
| 124 |
// Escape sequences tested by the ISO-2022-CN detector. Every entry begins with
// ESC (0x1b) and is zero-padded to five bytes (bytes not listed are
// zero-initialized), so each entry is also a NUL-terminated string.
static const uint8_t escapeSequences_2022CN[][5] = {
    {0x1b, 0x24, 0x29, 0x41},   // ESC $ ) A   GB 2312-80
    {0x1b, 0x24, 0x29, 0x47},   // ESC $ ) G   CNS 11643-1992 Plane 1
    {0x1b, 0x24, 0x2a, 0x48},   // ESC $ * H   CNS 11643-1992 Plane 2
    {0x1b, 0x24, 0x29, 0x45},   // ESC $ ) E   ISO-IR-165
    {0x1b, 0x24, 0x2b, 0x49},   // ESC $ + I   CNS 11643-1992 Plane 3
    {0x1b, 0x24, 0x2b, 0x4a},   // ESC $ + J   CNS 11643-1992 Plane 4
    {0x1b, 0x24, 0x2b, 0x4b},   // ESC $ + K   CNS 11643-1992 Plane 5
    {0x1b, 0x24, 0x2b, 0x4c},   // ESC $ + L   CNS 11643-1992 Plane 6
    {0x1b, 0x24, 0x2b, 0x4d},   // ESC $ + M   CNS 11643-1992 Plane 7
    {0x1b, 0x4e},               // ESC N       SS2
    {0x1b, 0x4f}                // ESC O       SS3
};
| 138 |
| 139 const char *CharsetRecog_2022JP::getName() const |
| 140 { |
| 141 return "ISO-2022-JP"; |
| 142 } |
| 143 |
| 144 int32_t CharsetRecog_2022JP::match(InputText *textIn) |
| 145 { |
| 146 return match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_20
22JP, ARRAY_SIZE(escapeSequences_2022JP)); |
| 147 } |
| 148 |
| 149 const char *CharsetRecog_2022KR::getName() const |
| 150 { |
| 151 return "ISO-2022-KR"; |
| 152 } |
| 153 |
| 154 int32_t CharsetRecog_2022KR::match(InputText *textIn) |
| 155 { |
| 156 return match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_20
22KR, ARRAY_SIZE(escapeSequences_2022KR)); |
| 157 } |
| 158 |
| 159 const char *CharsetRecog_2022CN::getName() const |
| 160 { |
| 161 return "ISO-2022-CN"; |
| 162 } |
| 163 |
| 164 int32_t CharsetRecog_2022CN::match(InputText *textIn) |
| 165 { |
| 166 return match_2022(textIn->fInputBytes, textIn->fInputLen, escapeSequences_20
22CN, ARRAY_SIZE(escapeSequences_2022CN)); |
| 167 } |
| 168 |
| 169 CharsetRecog_2022::~CharsetRecog_2022() |
| 170 { |
| 171 // nothing to do |
| 172 } |
| 173 |
| 174 U_NAMESPACE_END |
| 175 #endif |
OLD | NEW |