| OLD | NEW |
| 1 /* | 1 /* |
| 2 ********************************************************************** | 2 ********************************************************************** |
| 3 * Copyright (C) 2005-2013, International Business Machines | 3 * Copyright (C) 2005-2013, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** | 5 ********************************************************************** |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
| 9 | 9 |
| 10 #if !UCONFIG_NO_CONVERSION | 10 #if !UCONFIG_NO_CONVERSION |
| (...skipping 11 matching lines...) Expand all Loading... |
| 22 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() | 22 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() |
| 23 { | 23 { |
| 24 // nothing to do | 24 // nothing to do |
| 25 } | 25 } |
| 26 | 26 |
| 27 const char *CharsetRecog_UTF_16_BE::getName() const | 27 const char *CharsetRecog_UTF_16_BE::getName() const |
| 28 { | 28 { |
| 29 return "UTF-16BE"; | 29 return "UTF-16BE"; |
| 30 } | 30 } |
| 31 | 31 |
| 32 // UTF-16 confidence calculation. Very simple minded, but better than nothing. |
| 33 // Any 8 bit non-control characters bump the confidence up. These have a zero high byte, |
| 34 // and are very likely to be UTF-16, although they could also be part of a UTF-32 code. |
| 35 // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32. |
| 36 // NULs should be rare in actual text. |
| 37 |
| 38 static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) { |
| 39 if (codeUnit == 0) { |
| 40 confidence -= 10; |
| 41 } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { |
| 42 confidence += 10; |
| 43 } |
| 44 if (confidence < 0) { |
| 45 confidence = 0; |
| 46 } else if (confidence > 100) { |
| 47 confidence = 100; |
| 48 } |
| 49 return confidence; |
| 50 } |
| 51 |
| 52 |
| 32 UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const | 53 UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const |
| 33 { | 54 { |
| 34 const uint8_t *input = textIn->fRawInput; | 55 const uint8_t *input = textIn->fRawInput; |
| 35 int32_t confidence = 0; | 56 int32_t confidence = 10; |
| 36 int32_t length = textIn->fRawLength; | 57 int32_t length = textIn->fRawLength; |
| 37 | 58 |
| 38 if (length >=2 && input[0] == 0xFE && input[1] == 0xFF) { | 59 int32_t bytesToCheck = (length > 30) ? 30 : length; |
| 39 confidence = 100; | 60 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { |
| 61 UChar codeUnit = (input[charIndex] << 8) | input[charIndex + 1]; |
| 62 if (charIndex == 0 && codeUnit == 0xFEFF) { |
| 63 confidence = 100; |
| 64 break; |
| 65 } |
| 66 confidence = adjustConfidence(codeUnit, confidence); |
| 67 if (confidence == 0 || confidence == 100) { |
| 68 break; |
| 69 } |
| 40 } | 70 } |
| 41 | 71 if (bytesToCheck < 4 && confidence < 100) { |
| 42 // TODO: Do some statastics to check for unsigned UTF-16BE | 72 confidence = 0; |
| 73 } |
| 43 results->set(textIn, this, confidence); | 74 results->set(textIn, this, confidence); |
| 44 return (confidence > 0); | 75 return (confidence > 0); |
| 45 } | 76 } |
| 46 | 77 |
| 47 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() | 78 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() |
| 48 { | 79 { |
| 49 // nothing to do | 80 // nothing to do |
| 50 } | 81 } |
| 51 | 82 |
| 52 const char *CharsetRecog_UTF_16_LE::getName() const | 83 const char *CharsetRecog_UTF_16_LE::getName() const |
| 53 { | 84 { |
| 54 return "UTF-16LE"; | 85 return "UTF-16LE"; |
| 55 } | 86 } |
| 56 | 87 |
| 57 UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const | 88 UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const |
| 58 { | 89 { |
| 59 const uint8_t *input = textIn->fRawInput; | 90 const uint8_t *input = textIn->fRawInput; |
| 60 int32_t confidence = 0; | 91 int32_t confidence = 10; |
| 61 int32_t length = textIn->fRawLength; | 92 int32_t length = textIn->fRawLength; |
| 62 | 93 |
| 63 if (length >= 4 && input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) { | 94 int32_t bytesToCheck = (length > 30) ? 30 : length; |
| 64 confidence = 100; | 95 for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { |
| 96 UChar codeUnit = input[charIndex] | (input[charIndex + 1] << 8); |
| 97 if (charIndex == 0 && codeUnit == 0xFEFF) { |
| 98 confidence = 100; // UTF-16 BOM |
| 99 if (length >= 4 && input[2] == 0 && input[3] == 0) { |
| 100 confidence = 0; // UTF-32 BOM |
| 101 } |
| 102 break; |
| 103 } |
| 104 confidence = adjustConfidence(codeUnit, confidence); |
| 105 if (confidence == 0 || confidence == 100) { |
| 106 break; |
| 107 } |
| 65 } | 108 } |
| 66 | 109 if (bytesToCheck < 4 && confidence < 100) { |
| 67 // TODO: Do some statastics to check for unsigned UTF-16LE | 110 confidence = 0; |
| 111 } |
| 68 results->set(textIn, this, confidence); | 112 results->set(textIn, this, confidence); |
| 69 return (confidence > 0); | 113 return (confidence > 0); |
| 70 } | 114 } |
| 71 | 115 |
| 72 CharsetRecog_UTF_32::~CharsetRecog_UTF_32() | 116 CharsetRecog_UTF_32::~CharsetRecog_UTF_32() |
| 73 { | 117 { |
| 74 // nothing to do | 118 // nothing to do |
| 75 } | 119 } |
| 76 | 120 |
| 77 UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const | 121 UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const |
| (...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 145 | 189 |
| 146 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const | 190 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const |
| 147 { | 191 { |
| 148 return input[index + 3] << 24 | input[index + 2] << 16 | | 192 return input[index + 3] << 24 | input[index + 2] << 16 | |
| 149 input[index + 1] << 8 | input[index + 0]; | 193 input[index + 1] << 8 | input[index + 0]; |
| 150 } | 194 } |
| 151 | 195 |
| 152 U_NAMESPACE_END | 196 U_NAMESPACE_END |
| 153 #endif | 197 #endif |
| 154 | 198 |
| OLD | NEW |