| OLD | NEW |
| 1 /* | 1 /* |
| 2 ********************************************************************** | 2 ********************************************************************** |
| 3 * Copyright (C) 2005-2013, International Business Machines | 3 * Copyright (C) 2005-2013, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** | 5 ********************************************************************** |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
| 9 | 9 |
| 10 #include "cmemory.h" | 10 #include "cmemory.h" |
| (...skipping 119 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 130 | 130 |
| 131 // TODO - This is a bit of a hack to take care of a case | 131 // TODO - This is a bit of a hack to take care of a case |
| 132 // were we were getting a confidence of 135... | 132 // were we were getting a confidence of 135... |
| 133 if (rawPercent > 0.33) { | 133 if (rawPercent > 0.33) { |
| 134 return 98; | 134 return 98; |
| 135 } | 135 } |
| 136 | 136 |
| 137 return (int32_t) (rawPercent * 300.0); | 137 return (int32_t) (rawPercent * 300.0); |
| 138 } | 138 } |
| 139 | 139 |
| 140 #if !UCONFIG_NO_NON_HTML5_CONVERSION |
| 140 static const uint8_t unshapeMap_IBM420[] = { | 141 static const uint8_t unshapeMap_IBM420[] = { |
| 141 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -
B -C -D -E -F */ | 142 /* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -
B -C -D -E -F */ |
| 142 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 143 /* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
| 143 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 144 /* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
| 144 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 145 /* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
| 145 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 146 /* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
| 146 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x
4B, 0x4C, 0x4D, 0x4E, 0x4F, | 147 /* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x
4B, 0x4C, 0x4D, 0x4E, 0x4F, |
| 147 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x
5B, 0x5C, 0x5D, 0x5E, 0x5F, | 148 /* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x
5B, 0x5C, 0x5D, 0x5E, 0x5F, |
| 148 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x
6B, 0x6C, 0x6D, 0x6E, 0x6F, | 149 /* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x
6B, 0x6C, 0x6D, 0x6E, 0x6F, |
| 149 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x
7B, 0x7C, 0x7D, 0x7E, 0x7F, | 150 /* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x
7B, 0x7C, 0x7D, 0x7E, 0x7F, |
| (...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 225 if (!(mb == 0x20 && ignoreSpace)) { | 226 if (!(mb == 0x20 && ignoreSpace)) { |
| 226 addByte(mb); | 227 addByte(mb); |
| 227 } | 228 } |
| 228 | 229 |
| 229 ignoreSpace = (mb == 0x20); | 230 ignoreSpace = (mb == 0x20); |
| 230 } | 231 } |
| 231 | 232 |
| 232 } | 233 } |
| 233 } | 234 } |
| 234 } | 235 } |
| 236 #endif |
| 235 | 237 |
| 236 CharsetRecog_sbcs::CharsetRecog_sbcs() | 238 CharsetRecog_sbcs::CharsetRecog_sbcs() |
| 237 { | 239 { |
| 238 // nothing else to do | 240 // nothing else to do |
| 239 } | 241 } |
| 240 | 242 |
| 241 CharsetRecog_sbcs::~CharsetRecog_sbcs() | 243 CharsetRecog_sbcs::~CharsetRecog_sbcs() |
| 242 { | 244 { |
| 243 // nothing to do | 245 // nothing to do |
| 244 } | 246 } |
| (...skipping 372 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 617 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, | 619 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, |
| 618 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, | 620 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, |
| 619 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, | 621 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, |
| 620 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, | 622 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, |
| 621 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, | 623 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, |
| 622 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, | 624 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, |
| 623 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, | 625 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, |
| 624 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, | 626 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, |
| 625 }; | 627 }; |
| 626 | 628 |
| 629 #if !UCONFIG_NO_NON_HTML5_CONVERSION |
| 627 static const int32_t ngrams_IBM424_he_rtl[] = { | 630 static const int32_t ngrams_IBM424_he_rtl[] = { |
| 628 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x4045
46, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x4056
41, | 631 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x4045
46, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x4056
41, |
| 629 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x4540
56, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x5140
45, | 632 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x4540
56, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x5140
45, |
| 630 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x5440
41, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x5540
56, | 633 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x5440
41, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x5540
56, |
| 631 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x6840
45, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x7140
69, | 634 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x6840
45, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x7140
69, |
| 632 }; | 635 }; |
| 633 | 636 |
| 634 static const int32_t ngrams_IBM424_he_ltr[] = { | 637 static const int32_t ngrams_IBM424_he_ltr[] = { |
| 635 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x4054
62, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x4071
41, | 638 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x4054
62, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x4071
41, |
| 636 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x4146
45, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x4540
54, | 639 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x4146
45, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x4540
54, |
| (...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 684 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, | 687 /* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x
40, 0x40, 0x40, 0x40, 0x40, |
| 685 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x
8B, 0x8C, 0x8D, 0x8E, 0x8F, | 688 /* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x
8B, 0x8C, 0x8D, 0x8E, 0x8F, |
| 686 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x
9B, 0x9C, 0x9D, 0x9E, 0x9F, | 689 /* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x
9B, 0x9C, 0x9D, 0x9E, 0x9F, |
| 687 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0x
AB, 0xAC, 0xAD, 0xAE, 0xAF, | 690 /* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0x
AB, 0xAC, 0xAD, 0xAE, 0xAF, |
| 688 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0x
BB, 0xBC, 0xBD, 0xBE, 0xBF, | 691 /* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0x
BB, 0xBC, 0xBD, 0xBE, 0xBF, |
| 689 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x
CB, 0x40, 0xCD, 0x40, 0xCF, | 692 /* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x
CB, 0x40, 0xCD, 0x40, 0xCF, |
| 690 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0x
DB, 0xDC, 0xDD, 0xDE, 0xDF, | 693 /* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0x
DB, 0xDC, 0xDD, 0xDE, 0xDF, |
| 691 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0x
EB, 0x40, 0xED, 0xEE, 0xEF, | 694 /* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0x
EB, 0x40, 0xED, 0xEE, 0xEF, |
| 692 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
FB, 0xFC, 0xFD, 0xFE, 0x40, | 695 /* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x
FB, 0xFC, 0xFD, 0xFE, 0x40, |
| 693 }; | 696 }; |
| 697 #endif |
| 694 | 698 |
| 695 //ISO-8859-1,2,5,6,7,8,9 Ngrams | 699 //ISO-8859-1,2,5,6,7,8,9 Ngrams |
| 696 | 700 |
| 697 struct NGramsPlusLang { | 701 struct NGramsPlusLang { |
| 698 const int32_t ngrams[64]; | 702 const int32_t ngrams[64]; |
| 699 const char * lang; | 703 const char * lang; |
| 700 }; | 704 }; |
| 701 | 705 |
| 702 static const NGramsPlusLang ngrams_8859_1[] = { | 706 static const NGramsPlusLang ngrams_8859_1[] = { |
| 703 { | 707 { |
| (...skipping 444 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1148 return "ru"; | 1152 return "ru"; |
| 1149 } | 1153 } |
| 1150 | 1154 |
| 1151 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const | 1155 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const |
| 1152 { | 1156 { |
| 1153 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); | 1157 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R); |
| 1154 results->set(textIn, this, confidence); | 1158 results->set(textIn, this, confidence); |
| 1155 return (confidence > 0); | 1159 return (confidence > 0); |
| 1156 } | 1160 } |
| 1157 | 1161 |
| 1162 #if !UCONFIG_NO_NON_HTML5_CONVERSION |
| 1158 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() | 1163 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he() |
| 1159 { | 1164 { |
| 1160 // nothing to do | 1165 // nothing to do |
| 1161 } | 1166 } |
| 1162 | 1167 |
| 1163 const char *CharsetRecog_IBM424_he::getLanguage() const | 1168 const char *CharsetRecog_IBM424_he::getLanguage() const |
| 1164 { | 1169 { |
| 1165 return "he"; | 1170 return "he"; |
| 1166 } | 1171 } |
| 1167 | 1172 |
| (...skipping 78 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1246 { | 1251 { |
| 1247 return "IBM420_ltr"; | 1252 return "IBM420_ltr"; |
| 1248 } | 1253 } |
| 1249 | 1254 |
| 1250 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results
) const | 1255 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results
) const |
| 1251 { | 1256 { |
| 1252 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420
_ar); | 1257 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420
_ar); |
| 1253 results->set(textIn, this, confidence); | 1258 results->set(textIn, this, confidence); |
| 1254 return (confidence > 0); | 1259 return (confidence > 0); |
| 1255 } | 1260 } |
| 1261 #endif |
| 1256 | 1262 |
| 1257 U_NAMESPACE_END | 1263 U_NAMESPACE_END |
| 1258 #endif | 1264 #endif |
| 1259 | 1265 |
| OLD | NEW |