OLD | NEW |
1 /* | 1 /* |
2 ********************************************************************** | 2 ********************************************************************** |
3 * Copyright (C) 2005-2013, International Business Machines | 3 * Copyright (C) 2005-2013, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ********************************************************************** | 5 ********************************************************************** |
6 */ | 6 */ |
7 | 7 |
8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
9 | 9 |
10 #include "cmemory.h" | 10 #include "cmemory.h" |
(...skipping 119 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
130 | 130 |
131 // TODO - This is a bit of a hack to take care of a case | 131 // TODO - This is a bit of a hack to take care of a case |
132 // where we were getting a confidence of 135... | 132 // where we were getting a confidence of 135... |
133 if (rawPercent > 0.33) { | 133 if (rawPercent > 0.33) { |
134 return 98; | 134 return 98; |
135 } | 135 } |
136 | 136 |
137 return (int32_t) (rawPercent * 300.0); | 137 return (int32_t) (rawPercent * 300.0); |
138 } | 138 } |
139 | 139 |
| 140 #if !UCONFIG_NO_NON_HTML5_CONVERSION |
140 static const uint8_t unshapeMap_IBM420[] = { | 141 static const uint8_t unshapeMap_IBM420[] = { |
141 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -
B -C -D -E -F */ | 142 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -
B -C -D -E -F */ |
142 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 143 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
143 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 144 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
144 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 145 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
145 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 146 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
146 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x
4B, 0x4C, 0x4D, 0x4E, 0x4F, | 147 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x
4B, 0x4C, 0x4D, 0x4E, 0x4F, |
147 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x
5B, 0x5C, 0x5D, 0x5E, 0x5F, | 148 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x
5B, 0x5C, 0x5D, 0x5E, 0x5F, |
148 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x
6B, 0x6C, 0x6D, 0x6E, 0x6F, | 149 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x
6B, 0x6C, 0x6D, 0x6E, 0x6F, |
149 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x
7B, 0x7C, 0x7D, 0x7E, 0x7F, | 150 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x
7B, 0x7C, 0x7D, 0x7E, 0x7F, |
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
225 if (!(mb == 0x20 && ignoreSpace)) { | 226 if (!(mb == 0x20 && ignoreSpace)) { |
226 addByte(mb); | 227 addByte(mb); |
227 } | 228 } |
228 | 229 |
229 ignoreSpace = (mb == 0x20); | 230 ignoreSpace = (mb == 0x20); |
230 } | 231 } |
231 | 232 |
232 } | 233 } |
233 } | 234 } |
234 } | 235 } |
| 236 #endif |
235 | 237 |
// Default constructor for CharsetRecog_sbcs. The body is intentionally empty:
// no members are initialized here.
236 CharsetRecog_sbcs::CharsetRecog_sbcs() | 238 CharsetRecog_sbcs::CharsetRecog_sbcs() |
237 { | 239 { |
238 // nothing else to do | 240 // nothing else to do |
239 } | 241 } |
240 | 242 |
// Destructor for CharsetRecog_sbcs. The body is empty: nothing is released here.
241 CharsetRecog_sbcs::~CharsetRecog_sbcs() | 243 CharsetRecog_sbcs::~CharsetRecog_sbcs() |
242 { | 244 { |
243 // nothing to do | 245 // nothing to do |
244 } | 246 } |
(...skipping 372 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
617 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, | 619 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, |
618 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, | 620 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, |
619 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, | 621 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, |
620 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, | 622 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, |
621 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, | 623 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, |
622 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, | 624 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, |
623 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, | 625 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, |
624 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, | 626 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, |
625 }; | 627 }; |
626 | 628 |
| 629 #if !UCONFIG_NO_NON_HTML5_CONVERSION |
// N-gram table for the IBM424 Hebrew right-to-left recognizer (per its name).
// Holds 64 int32_t entries, each a three-byte value, listed in ascending order.
// NOTE(review): presumably each entry packs one frequent byte trigram and the
// ordering is relied on by the lookup code -- confirm against the n-gram parser.
627 static const int32_t ngrams_IBM424_he_rtl[] = { | 630 static const int32_t ngrams_IBM424_he_rtl[] = { |
628 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x4045
46, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x4056
41, | 631 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x4045
46, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x4056
41, |
629 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x4540
56, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x5140
45, | 632 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x4540
56, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x5140
45, |
630 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x5440
41, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x5540
56, | 633 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x5440
41, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x5540
56, |
631 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x6840
45, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x7140
69, | 634 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x6840
45, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x7140
69, |
632 }; | 635 }; |
633 | 636 |
634 static const int32_t ngrams_IBM424_he_ltr[] = { | 637 static const int32_t ngrams_IBM424_he_ltr[] = { |
635 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x4054
62, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x4071
41, | 638 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x4054
62, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x4071
41, |
636 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x4146
45, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x4540
54, | 639 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x4146
45, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x4540
54, |
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
684 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 687 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
685 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x
8B, 0x8C, 0x8D, 0x8E, 0x8F, | 688 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x
8B, 0x8C, 0x8D, 0x8E, 0x8F, |
686 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x
9B, 0x9C, 0x9D, 0x9E, 0x9F, | 689 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x
9B, 0x9C, 0x9D, 0x9E, 0x9F, |
687 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0x
AB, 0xAC, 0xAD, 0xAE, 0xAF, | 690 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0x
AB, 0xAC, 0xAD, 0xAE, 0xAF, |
688 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0x
BB, 0xBC, 0xBD, 0xBE, 0xBF, | 691 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0x
BB, 0xBC, 0xBD, 0xBE, 0xBF, |
689 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x
CB, 0x40, 0xCD, 0x40, 0xCF, | 692 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x
CB, 0x40, 0xCD, 0x40, 0xCF, |
690 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0x
DB, 0xDC, 0xDD, 0xDE, 0xDF, | 693 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0x
DB, 0xDC, 0xDD, 0xDE, 0xDF, |
691 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0x
EB, 0x40, 0xED, 0xEE, 0xEF, | 694 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0x
EB, 0x40, 0xED, 0xEE, 0xEF, |
692 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
FB, 0xFC, 0xFD, 0xFE, 0x40, | 695 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
FB, 0xFC, 0xFD, 0xFE, 0x40, |
693 }; | 696 }; |
| 697 #endif |
694 | 698 |
695 //ISO-8859-1,2,5,6,7,8,9 Ngrams | 699 //ISO-8859-1,2,5,6,7,8,9 Ngrams |
696 | 700 |
// Pairs a fixed-size n-gram table with a language string; used as the element
// type of ngrams_8859_1.
697 struct NGramsPlusLang { | 701 struct NGramsPlusLang { |
// Exactly 64 n-gram values.
698 const int32_t ngrams[64]; | 702 const int32_t ngrams[64]; |
// NOTE(review): presumably the language code these n-grams identify -- confirm against the tables that use it.
699 const char * lang; | 703 const char * lang; |
700 }; | 704 }; |
701 | 705 |
702 static const NGramsPlusLang ngrams_8859_1[] = { | 706 static const NGramsPlusLang ngrams_8859_1[] = { |
703 { | 707 { |
(...skipping 444 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1148 return "ru"; | 1152 return "ru"; |
1149 } | 1153 } |
1150 | 1154 |
// Scores textIn as KOI8-R by calling match_sbcs with the ngrams_KOI8_R table
// and the charMap_KOI8_R character map, then stores the confidence in results.
// Returns true only when the confidence is greater than zero; results is
// written either way.
1151 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const | 1155 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const |
1152 { | 1156 { |
1153 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); | 1157 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); |
1154 results->set(textIn, this, confidence); | 1158 results->set(textIn, this, confidence); |
1155 return (confidence > 0); | 1159 return (confidence > 0); |
1156 } | 1160 } |
1157 | 1161 |
| 1162 #if !UCONFIG_NO_NON_HTML5_CONVERSION |
// Destructor for CharsetRecog_IBM424_he. The body is empty: nothing is released here.
1158 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() | 1163 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() |
1159 { | 1164 { |
1160 // nothing to do | 1165 // nothing to do |
1161 } | 1166 } |
1162 | 1167 |
// Returns the constant string "he" as this recognizer's language
// (presumably the language code for Hebrew).
1163 const char *CharsetRecog_IBM424_he::getLanguage() const | 1168 const char *CharsetRecog_IBM424_he::getLanguage() const |
1164 { | 1169 { |
1165 return "he"; | 1170 return "he"; |
1166 } | 1171 } |
1167 | 1172 |
(...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1246 { | 1251 { |
1247 return "IBM420_ltr"; | 1252 return "IBM420_ltr"; |
1248 } | 1253 } |
1249 | 1254 |
// Scores textIn by calling match_sbcs with the ngrams_IBM420_ar_ltr table and
// the charMap_IBM420_ar character map, then stores the confidence in results.
// Returns true only when the confidence is greater than zero; results is
// written either way.
1250 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results
) const | 1255 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results
) const |
1251 { | 1256 { |
1252 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420
_ar); | 1257 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420
_ar); |
1253 results->set(textIn, this, confidence); | 1258 results->set(textIn, this, confidence); |
1254 return (confidence > 0); | 1259 return (confidence > 0); |
1255 } | 1260 } |
| 1261 #endif |
1256 | 1262 |
1257 U_NAMESPACE_END | 1263 U_NAMESPACE_END |
1258 #endif | 1264 #endif |
1259 | 1265 |
OLD | NEW |