OLD | NEW |
1 /* | 1 /* |
2 ********************************************************************** | 2 ********************************************************************** |
3 * Copyright (C) 2005-2013, International Business Machines | 3 * Copyright (C) 2005-2013, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ********************************************************************** | 5 ********************************************************************** |
6 */ | 6 */ |
7 | 7 |
8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
9 | 9 |
10 #if !UCONFIG_NO_CONVERSION | 10 #if !UCONFIG_NO_CONVERSION |
(...skipping 11 matching lines...) Expand all Loading... |
22 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() | 22 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() |
23 { | 23 { |
24 // nothing to do | 24 // nothing to do |
25 } | 25 } |
26 | 26 |
27 const char *CharsetRecog_UTF_16_BE::getName() const | 27 const char *CharsetRecog_UTF_16_BE::getName() const |
28 { | 28 { |
29 return "UTF-16BE"; | 29 return "UTF-16BE"; |
30 } | 30 } |
31 | 31 |
| 32 // UTF-16 confidence calculation. Very simple minded, but better than nothing. |
| 33 // Any 8 bit non-control characters bump the confidence up. These have a zero
high byte, |
| 34 // and are very likely to be UTF-16, although they could also be part of a U
TF-32 code. |
| 35 // NULs are a contra-indication, they will appear commonly if the actual encod
ing is UTF-32. |
| 36 // NULs should be rare in actual text. |
| 37 |
| 38 static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) { |
| 39 if (codeUnit == 0) { |
| 40 confidence -= 10; |
| 41 } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { |
| 42 confidence += 10; |
| 43 } |
| 44 if (confidence < 0) { |
| 45 confidence = 0; |
| 46 } else if (confidence > 100) { |
| 47 confidence = 100; |
| 48 } |
| 49 return confidence; |
| 50 } |
| 51 |
| 52 |
32 UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) co
nst | 53 UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) co
nst |
33 { | 54 { |
34 const uint8_t *input = textIn->fRawInput; | 55 const uint8_t *input = textIn->fRawInput; |
35 int32_t confidence = 0; | 56 int32_t confidence = 10; |
36 int32_t length = textIn->fRawLength; | 57 int32_t length = textIn->fRawLength; |
37 | 58 |
38 if (length >=2 && input[0] == 0xFE && input[1] == 0xFF) { | 59 int32_t bytesToCheck = (length > 30) ? 30 : length; |
39 confidence = 100; | 60 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { |
| 61 UChar codeUnit = (input[charIndex] << 8) | input[charIndex + 1]; |
| 62 if (charIndex == 0 && codeUnit == 0xFEFF) { |
| 63 confidence = 100; |
| 64 break; |
| 65 } |
| 66 confidence = adjustConfidence(codeUnit, confidence); |
| 67 if (confidence == 0 || confidence == 100) { |
| 68 break; |
| 69 } |
40 } | 70 } |
41 | 71 if (bytesToCheck < 4 && confidence < 100) { |
42 // TODO: Do some statastics to check for unsigned UTF-16BE | 72 confidence = 0; |
| 73 } |
43 results->set(textIn, this, confidence); | 74 results->set(textIn, this, confidence); |
44 return (confidence > 0); | 75 return (confidence > 0); |
45 } | 76 } |
46 | 77 |
47 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() | 78 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() |
48 { | 79 { |
49 // nothing to do | 80 // nothing to do |
50 } | 81 } |
51 | 82 |
52 const char *CharsetRecog_UTF_16_LE::getName() const | 83 const char *CharsetRecog_UTF_16_LE::getName() const |
53 { | 84 { |
54 return "UTF-16LE"; | 85 return "UTF-16LE"; |
55 } | 86 } |
56 | 87 |
57 UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) co
nst | 88 UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) co
nst |
58 { | 89 { |
59 const uint8_t *input = textIn->fRawInput; | 90 const uint8_t *input = textIn->fRawInput; |
60 int32_t confidence = 0; | 91 int32_t confidence = 10; |
61 int32_t length = textIn->fRawLength; | 92 int32_t length = textIn->fRawLength; |
62 | 93 |
63 if (length >= 4 && input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00
|| input[3] != 0x00)) { | 94 int32_t bytesToCheck = (length > 30) ? 30 : length; |
64 confidence = 100; | 95 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { |
| 96 UChar codeUnit = input[charIndex] | (input[charIndex + 1] << 8); |
| 97 if (charIndex == 0 && codeUnit == 0xFEFF) { |
| 98 confidence = 100; // UTF-16 BOM |
| 99 if (length >= 4 && input[2] == 0 && input[3] == 0) { |
| 100 confidence = 0; // UTF-32 BOM |
| 101 } |
| 102 break; |
| 103 } |
| 104 confidence = adjustConfidence(codeUnit, confidence); |
| 105 if (confidence == 0 || confidence == 100) { |
| 106 break; |
| 107 } |
65 } | 108 } |
66 | 109 if (bytesToCheck < 4 && confidence < 100) { |
67 // TODO: Do some statastics to check for unsigned UTF-16LE | 110 confidence = 0; |
| 111 } |
68 results->set(textIn, this, confidence); | 112 results->set(textIn, this, confidence); |
69 return (confidence > 0); | 113 return (confidence > 0); |
70 } | 114 } |
71 | 115 |
72 CharsetRecog_UTF_32::~CharsetRecog_UTF_32() | 116 CharsetRecog_UTF_32::~CharsetRecog_UTF_32() |
73 { | 117 { |
74 // nothing to do | 118 // nothing to do |
75 } | 119 } |
76 | 120 |
77 UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const | 121 UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
145 | 189 |
146 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) con
st | 190 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) con
st |
147 { | 191 { |
148 return input[index + 3] << 24 | input[index + 2] << 16 | | 192 return input[index + 3] << 24 | input[index + 2] << 16 | |
149 input[index + 1] << 8 | input[index + 0]; | 193 input[index + 1] << 8 | input[index + 0]; |
150 } | 194 } |
151 | 195 |
152 U_NAMESPACE_END | 196 U_NAMESPACE_END |
153 #endif | 197 #endif |
154 | 198 |
OLD | NEW |