OLD | NEW |
1 /* | 1 /* |
2 ********************************************************************** | 2 ********************************************************************** |
3 * Copyright (C) 2005-2013, International Business Machines | 3 * Copyright (C) 2005-2013, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ********************************************************************** | 5 ********************************************************************** |
6 */ | 6 */ |
7 | 7 |
8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
9 | 9 |
10 #include "cmemory.h" | 10 #include "cmemory.h" |
(...skipping 119 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
130 | 130 |
131 // TODO - This is a bit of a hack to take care of a case | 131 // TODO - This is a bit of a hack to take care of a case |
132 // were we were getting a confidence of 135... | 132 // were we were getting a confidence of 135... |
133 if (rawPercent > 0.33) { | 133 if (rawPercent > 0.33) { |
134 return 98; | 134 return 98; |
135 } | 135 } |
136 | 136 |
137 return (int32_t) (rawPercent * 300.0); | 137 return (int32_t) (rawPercent * 300.0); |
138 } | 138 } |
139 | 139 |
140 #if !UCONFIG_NO_NON_HTML5_CONVERSION | |
141 static const uint8_t unshapeMap_IBM420[] = { | 140 static const uint8_t unshapeMap_IBM420[] = { |
142 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -
B -C -D -E -F */ | 141 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -
B -C -D -E -F */ |
143 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 142 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
144 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 143 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
145 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 144 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
146 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 145 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
147 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x
4B, 0x4C, 0x4D, 0x4E, 0x4F, | 146 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x
4B, 0x4C, 0x4D, 0x4E, 0x4F, |
148 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x
5B, 0x5C, 0x5D, 0x5E, 0x5F, | 147 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x
5B, 0x5C, 0x5D, 0x5E, 0x5F, |
149 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x
6B, 0x6C, 0x6D, 0x6E, 0x6F, | 148 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x
6B, 0x6C, 0x6D, 0x6E, 0x6F, |
150 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x
7B, 0x7C, 0x7D, 0x7E, 0x7F, | 149 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x
7B, 0x7C, 0x7D, 0x7E, 0x7F, |
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
226 if (!(mb == 0x20 && ignoreSpace)) { | 225 if (!(mb == 0x20 && ignoreSpace)) { |
227 addByte(mb); | 226 addByte(mb); |
228 } | 227 } |
229 | 228 |
230 ignoreSpace = (mb == 0x20); | 229 ignoreSpace = (mb == 0x20); |
231 } | 230 } |
232 | 231 |
233 } | 232 } |
234 } | 233 } |
235 } | 234 } |
236 #endif | |
237 | 235 |
238 CharsetRecog_sbcs::CharsetRecog_sbcs() | 236 CharsetRecog_sbcs::CharsetRecog_sbcs() |
239 { | 237 { |
240 // nothing else to do | 238 // nothing else to do |
241 } | 239 } |
242 | 240 |
243 CharsetRecog_sbcs::~CharsetRecog_sbcs() | 241 CharsetRecog_sbcs::~CharsetRecog_sbcs() |
244 { | 242 { |
245 // nothing to do | 243 // nothing to do |
246 } | 244 } |
(...skipping 372 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
619 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, | 617 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, |
620 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, | 618 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, |
621 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, | 619 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, |
622 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, | 620 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, |
623 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, | 621 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, |
624 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, | 622 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, |
625 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, | 623 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, |
626 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, | 624 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, |
627 }; | 625 }; |
628 | 626 |
629 #if !UCONFIG_NO_NON_HTML5_CONVERSION | |
630 static const int32_t ngrams_IBM424_he_rtl[] = { | 627 static const int32_t ngrams_IBM424_he_rtl[] = { |
631 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x4045
46, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x4056
41, | 628 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x4045
46, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x4056
41, |
632 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x4540
56, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x5140
45, | 629 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x4540
56, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x5140
45, |
633 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x5440
41, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x5540
56, | 630 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x5440
41, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x5540
56, |
634 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x6840
45, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x7140
69, | 631 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x6840
45, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x7140
69, |
635 }; | 632 }; |
636 | 633 |
637 static const int32_t ngrams_IBM424_he_ltr[] = { | 634 static const int32_t ngrams_IBM424_he_ltr[] = { |
638 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x4054
62, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x4071
41, | 635 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x4054
62, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x4071
41, |
639 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x4146
45, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x4540
54, | 636 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x4146
45, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x4540
54, |
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
687 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 684 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
688 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x
8B, 0x8C, 0x8D, 0x8E, 0x8F, | 685 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x
8B, 0x8C, 0x8D, 0x8E, 0x8F, |
689 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x
9B, 0x9C, 0x9D, 0x9E, 0x9F, | 686 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x
9B, 0x9C, 0x9D, 0x9E, 0x9F, |
690 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0x
AB, 0xAC, 0xAD, 0xAE, 0xAF, | 687 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0x
AB, 0xAC, 0xAD, 0xAE, 0xAF, |
691 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0x
BB, 0xBC, 0xBD, 0xBE, 0xBF, | 688 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0x
BB, 0xBC, 0xBD, 0xBE, 0xBF, |
692 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x
CB, 0x40, 0xCD, 0x40, 0xCF, | 689 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x
CB, 0x40, 0xCD, 0x40, 0xCF, |
693 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0x
DB, 0xDC, 0xDD, 0xDE, 0xDF, | 690 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0x
DB, 0xDC, 0xDD, 0xDE, 0xDF, |
694 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0x
EB, 0x40, 0xED, 0xEE, 0xEF, | 691 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0x
EB, 0x40, 0xED, 0xEE, 0xEF, |
695 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
FB, 0xFC, 0xFD, 0xFE, 0x40, | 692 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
FB, 0xFC, 0xFD, 0xFE, 0x40, |
696 }; | 693 }; |
697 #endif | |
698 | 694 |
699 //ISO-8859-1,2,5,6,7,8,9 Ngrams | 695 //ISO-8859-1,2,5,6,7,8,9 Ngrams |
700 | 696 |
701 struct NGramsPlusLang { | 697 struct NGramsPlusLang { |
702 const int32_t ngrams[64]; | 698 const int32_t ngrams[64]; |
703 const char * lang; | 699 const char * lang; |
704 }; | 700 }; |
705 | 701 |
706 static const NGramsPlusLang ngrams_8859_1[] = { | 702 static const NGramsPlusLang ngrams_8859_1[] = { |
707 { | 703 { |
(...skipping 444 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1152 return "ru"; | 1148 return "ru"; |
1153 } | 1149 } |
1154 | 1150 |
1155 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const | 1151 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const |
1156 { | 1152 { |
1157 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); | 1153 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); |
1158 results->set(textIn, this, confidence); | 1154 results->set(textIn, this, confidence); |
1159 return (confidence > 0); | 1155 return (confidence > 0); |
1160 } | 1156 } |
1161 | 1157 |
1162 #if !UCONFIG_NO_NON_HTML5_CONVERSION | |
1163 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() | 1158 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() |
1164 { | 1159 { |
1165 // nothing to do | 1160 // nothing to do |
1166 } | 1161 } |
1167 | 1162 |
1168 const char *CharsetRecog_IBM424_he::getLanguage() const | 1163 const char *CharsetRecog_IBM424_he::getLanguage() const |
1169 { | 1164 { |
1170 return "he"; | 1165 return "he"; |
1171 } | 1166 } |
1172 | 1167 |
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1251 { | 1246 { |
1252 return "IBM420_ltr"; | 1247 return "IBM420_ltr"; |
1253 } | 1248 } |
1254 | 1249 |
1255 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results
) const | 1250 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results
) const |
1256 { | 1251 { |
1257 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420
_ar); | 1252 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420
_ar); |
1258 results->set(textIn, this, confidence); | 1253 results->set(textIn, this, confidence); |
1259 return (confidence > 0); | 1254 return (confidence > 0); |
1260 } | 1255 } |
1261 #endif | |
1262 | 1256 |
1263 U_NAMESPACE_END | 1257 U_NAMESPACE_END |
1264 #endif | 1258 #endif |
1265 | 1259 |
OLD | NEW |