OLD | NEW |
1 /* | 1 /* |
2 ********************************************************************** | 2 ********************************************************************** |
3 * Copyright (C) 2005-2013, International Business Machines | 3 * Copyright (C) 2005-2015, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ********************************************************************** | 5 ********************************************************************** |
6 */ | 6 */ |
7 | 7 |
8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
9 | 9 |
10 #include "cmemory.h" | 10 #include "cmemory.h" |
11 | 11 |
12 #if !UCONFIG_NO_CONVERSION | 12 #if !UCONFIG_NO_CONVERSION |
13 #include "csrsbcs.h" | 13 #include "csrsbcs.h" |
14 #include "csmatch.h" | 14 #include "csmatch.h" |
15 | 15 |
16 #define N_GRAM_SIZE 3 | 16 #define N_GRAM_SIZE 3 |
17 #define N_GRAM_MASK 0xFFFFFF | 17 #define N_GRAM_MASK 0xFFFFFF |
18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) | 18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) |
19 | 19 |
20 U_NAMESPACE_BEGIN | 20 U_NAMESPACE_BEGIN |
21 | 21 |
// NGramParser constructor (identical in the OLD and NEW columns).
// - ngram and byteIndex are zero-initialized in the member initializer list.
// - theNgramList and theCharMap are stored as raw pointers in ngramList and
//   charMap; nothing is copied here, so the caller's arrays must outlive
//   whatever later reads through these members.
// - ngramCount and hitCount are both reset to 0.
22 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap) | 22 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap) |
23 : ngram(0), byteIndex(0) | 23 : ngram(0), byteIndex(0) |
24 { | 24 { |
25 ngramList = theNgramList; | 25 ngramList = theNgramList; |
26 charMap = theCharMap; | 26 charMap = theCharMap; |
27 | 27 |
28 ngramCount = hitCount = 0; | 28 ngramCount = hitCount = 0; |
29 } | 29 } |
30 | 30 |
// NGramParser destructor, defined out of line with an empty body.
// The OLD column is blank for these lines: this definition exists only in
// the NEW revision.
| 31 NGramParser::~NGramParser() |
| 32 { |
| 33 } |
| 34 |
31 /* | 35 /* |
32 * Binary search for value in table, which must have exactly 64 entries. | 36 * Binary search for value in table, which must have exactly 64 entries. |
33 */ | 37 */ |
34 | 38 |
35 int32_t NGramParser::search(const int32_t *table, int32_t value) | 39 int32_t NGramParser::search(const int32_t *table, int32_t value) |
36 { | 40 { |
37 int32_t index = 0; | 41 int32_t index = 0; |
38 | 42 |
39 if (table[index + 32] <= value) { | 43 if (table[index + 32] <= value) { |
40 index += 32; | 44 index += 32; |
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
130 | 134 |
131 // TODO - This is a bit of a hack to take care of a case | 135 // TODO - This is a bit of a hack to take care of a case |
132 // where we were getting a confidence of 135... | 136 // where we were getting a confidence of 135... |
133 if (rawPercent > 0.33) { | 137 if (rawPercent > 0.33) { |
134 return 98; | 138 return 98; |
135 } | 139 } |
136 | 140 |
137 return (int32_t) (rawPercent * 300.0); | 141 return (int32_t) (rawPercent * 300.0); |
138 } | 142 } |
139 | 143 |
140 #if !UCONFIG_NO_NON_HTML5_CONVERSION | 144 #if !UCONFIG_ONLY_HTML_CONVERSION |
// unshapeMap_IBM420: file-local 256-entry byte-to-byte lookup table, laid out
// as 16 rows (high nibble, "0-" .. "F-") by 16 columns (low nibble, "-0" .. "-F").
// - Rows 0- through 3- map every input byte to 0x40.
// - Rows E- and F- are the identity mapping.
// - In rows 4- through D- most bytes map to themselves, but some are folded
//   onto another value (e.g. 0x41 -> 0x40, 0x43 -> 0x42, 0x48 -> 0x47).
// NOTE(review): judging by the name, the folding presumably collapses shaped
// (contextual) IBM420 Arabic forms onto one code per letter - confirm against
// the code that indexes this table (not visible in this excerpt).
141 static const uint8_t unshapeMap_IBM420[] = { | 145 static const uint8_t unshapeMap_IBM420[] = { |
142 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -
B -C -D -E -F */ | 146 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -
B -C -D -E -F */ |
143 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 147 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
144 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 148 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
145 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 149 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
146 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 150 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
147 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x
4B, 0x4C, 0x4D, 0x4E, 0x4F, | 151 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x
4B, 0x4C, 0x4D, 0x4E, 0x4F, |
148 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x
5B, 0x5C, 0x5D, 0x5E, 0x5F, | 152 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x
5B, 0x5C, 0x5D, 0x5E, 0x5F, |
149 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x
6B, 0x6C, 0x6D, 0x6E, 0x6F, | 153 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x
6B, 0x6C, 0x6D, 0x6E, 0x6F, |
150 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x
7B, 0x7C, 0x7D, 0x7E, 0x7F, | 154 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x
7B, 0x7C, 0x7D, 0x7E, 0x7F, |
151 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x
8B, 0x8B, 0x8D, 0x8D, 0x8F, | 155 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x
8B, 0x8B, 0x8D, 0x8D, 0x8F, |
152 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x
9A, 0x9A, 0x9A, 0x9E, 0x9E, | 156 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x
9A, 0x9A, 0x9A, 0x9E, 0x9E, |
153 /* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0x
AB, 0xAB, 0xAD, 0xAD, 0xAF, | 157 /* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0x
AB, 0xAB, 0xAD, 0xAD, 0xAF, |
154 /* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0x
BB, 0xBB, 0xBD, 0xBD, 0xBF, | 158 /* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0x
BB, 0xBB, 0xBD, 0xBD, 0xBF, |
155 /* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0x
BF, 0xCC, 0xBF, 0xCE, 0xCF, | 159 /* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0x
BF, 0xCC, 0xBF, 0xCE, 0xCF, |
156 /* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0x
DA, 0xDC, 0xDC, 0xDC, 0xDF, | 160 /* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0x
DA, 0xDC, 0xDC, 0xDC, 0xDF, |
157 /* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0x
EB, 0xEC, 0xED, 0xEE, 0xEF, | 161 /* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0x
EB, 0xEC, 0xED, 0xEE, 0xEF, |
158 /* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0x
FB, 0xFC, 0xFD, 0xFE, 0xFF, | 162 /* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0x
FB, 0xFC, 0xFD, 0xFE, 0xFF, |
159 }; | 163 }; |
160 | 164 |
// NGramParser_IBM420 constructor (identical in the OLD and NEW columns).
// Forwards theNgramList and theCharMap unchanged to the NGramParser base
// constructor, then initializes this class's own alef member to 0x00.
161 NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_
t *theCharMap):NGramParser(theNgramList, theCharMap) | 165 NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_
t *theCharMap):NGramParser(theNgramList, theCharMap) |
162 { | 166 { |
163 alef = 0x00; | 167 alef = 0x00; |
164 } | 168 } |
165 | 169 |
// NGramParser_IBM420 destructor, defined out of line with an empty body.
// The OLD column is blank on this line: the definition exists only in the
// NEW revision.
| 170 NGramParser_IBM420::~NGramParser_IBM420() {} |
166 | 171 |
167 int32_t NGramParser_IBM420::isLamAlef(int32_t b) | 172 int32_t NGramParser_IBM420::isLamAlef(int32_t b) |
168 { | 173 { |
169 if(b == 0xB2 || b == 0xB3){ | 174 if(b == 0xB2 || b == 0xB3){ |
170 return 0x47; | 175 return 0x47; |
171 }else if(b == 0xB4 || b == 0xB5){ | 176 }else if(b == 0xB4 || b == 0xB5){ |
172 return 0x49; | 177 return 0x49; |
173 }else if(b == 0xB8 || b == 0xB9){ | 178 }else if(b == 0xB8 || b == 0xB9){ |
174 return 0x56; | 179 return 0x56; |
175 }else | 180 }else |
(...skipping 443 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
619 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, | 624 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, |
620 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, | 625 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, |
621 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, | 626 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, |
622 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, | 627 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, |
623 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, | 628 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, |
624 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, | 629 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, |
625 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, | 630 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, |
626 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, | 631 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, |
627 }; | 632 }; |
628 | 633 |
629 #if !UCONFIG_NO_NON_HTML5_CONVERSION | 634 #if !UCONFIG_ONLY_HTML_CONVERSION |
// ngrams_IBM424_he_rtl: file-local table of 64 int32_t values (4 rows of 16),
// listed in strictly ascending order. Every value fits in 24 bits, matching
// N_GRAM_SIZE (3) and N_GRAM_MASK (0xFFFFFF) defined at the top of the file.
// NOTE(review): the 64-entry sorted layout matches the precondition stated on
// NGramParser::search ("must have exactly 64 entries"); presumably this table
// is looked up through it - confirm at the use site, which is not visible here.
// NOTE(review): name suggests trigram frequencies for right-to-left Hebrew in
// IBM424 - inferred from the identifier only.
630 static const int32_t ngrams_IBM424_he_rtl[] = { | 635 static const int32_t ngrams_IBM424_he_rtl[] = { |
631 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x4045
46, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x4056
41, | 636 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x4045
46, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x4056
41, |
632 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x4540
56, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x5140
45, | 637 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x4540
56, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x5140
45, |
633 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x5440
41, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x5540
56, | 638 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x5440
41, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x5540
56, |
634 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x6840
45, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x7140
69, | 639 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x6840
45, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x7140
69, |
635 }; | 640 }; |
636 | 641 |
637 static const int32_t ngrams_IBM424_he_ltr[] = { | 642 static const int32_t ngrams_IBM424_he_ltr[] = { |
638 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x4054
62, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x4071
41, | 643 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x4054
62, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x4071
41, |
639 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x4146
45, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x4540
54, | 644 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x4146
45, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x4540
54, |
(...skipping 512 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1152 return "ru"; | 1157 return "ru"; |
1153 } | 1158 } |
1154 | 1159 |
// CharsetRecog_KOI8_R::match (identical in the OLD and NEW columns).
// Calls match_sbcs on textIn with the ngrams_KOI8_R and charMap_KOI8_R tables
// and stores the returned confidence in *results via results->set(textIn,
// this, confidence). The result is always recorded, even when the confidence
// is not positive.
// Returns true only when the confidence is greater than 0.
1155 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const | 1160 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const |
1156 { | 1161 { |
1157 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); | 1162 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); |
1158 results->set(textIn, this, confidence); | 1163 results->set(textIn, this, confidence); |
1159 return (confidence > 0); | 1164 return (confidence > 0); |
1160 } | 1165 } |
1161 | 1166 |
1162 #if !UCONFIG_NO_NON_HTML5_CONVERSION | 1167 #if !UCONFIG_ONLY_HTML_CONVERSION |
// CharsetRecog_IBM424_he destructor, defined out of line; the body is
// intentionally empty. Sits inside the preprocessor guard opened on the line
// above (named differently in the OLD and NEW columns).
1163 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() | 1168 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() |
1164 { | 1169 { |
1165 // nothing to do | 1170 // nothing to do |
1166 } | 1171 } |
1167 | 1172 |
// CharsetRecog_IBM424_he::getLanguage (identical in the OLD and NEW columns).
// Returns a pointer to the string literal "he"; the caller must not free or
// modify it.
1168 const char *CharsetRecog_IBM424_he::getLanguage() const | 1173 const char *CharsetRecog_IBM424_he::getLanguage() const |
1169 { | 1174 { |
1170 return "he"; | 1175 return "he"; |
1171 } | 1176 } |
1172 | 1177 |
(...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1256 { | 1261 { |
1257 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420
_ar); | 1262 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420
_ar); |
1258 results->set(textIn, this, confidence); | 1263 results->set(textIn, this, confidence); |
1259 return (confidence > 0); | 1264 return (confidence > 0); |
1260 } | 1265 } |
1261 #endif | 1266 #endif |
1262 | 1267 |
1263 U_NAMESPACE_END | 1268 U_NAMESPACE_END |
1264 #endif | 1269 #endif |
1265 | 1270 |
OLD | NEW |