Index: source/i18n/csrucode.cpp |
diff --git a/source/i18n/csrucode.cpp b/source/i18n/csrucode.cpp |
index 21239b7eafd90773482062d2b0e1db4deec4b927..f09834306808107d482aa3dc8b8064a7eb9a5779 100644 |
--- a/source/i18n/csrucode.cpp |
+++ b/source/i18n/csrucode.cpp |
@@ -29,17 +29,48 @@ const char *CharsetRecog_UTF_16_BE::getName() const |
return "UTF-16BE"; |
} |
+// UTF-16 confidence calculation. Very simple-minded, but better than nothing. |
+// Any 8-bit code unit in the range 0x20-0xFF (mostly non-control characters), or a line feed (0x0A), bumps the confidence up. |
+// These have a zero high byte and are very likely to be UTF-16, although they could also be part of a UTF-32 code unit. |
+// NULs are a contraindication: they will appear commonly if the actual encoding is UTF-32. |
+// NULs should be rare in actual text. |
+ |
+static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) { |
+ if (codeUnit == 0) { |
+ confidence -= 10; |
+ } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { |
+ confidence += 10; |
+ } |
+ if (confidence < 0) { |
+ confidence = 0; |
+ } else if (confidence > 100) { |
+ confidence = 100; |
+ } |
+ return confidence; |
+} |
+ |
+ |
UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const |
{ |
const uint8_t *input = textIn->fRawInput; |
- int32_t confidence = 0; |
+ int32_t confidence = 10; |
int32_t length = textIn->fRawLength; |
- if (length >=2 && input[0] == 0xFE && input[1] == 0xFF) { |
- confidence = 100; |
+ int32_t bytesToCheck = (length > 30) ? 30 : length; |
+ for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { |
+ UChar codeUnit = (input[charIndex] << 8) | input[charIndex + 1]; |
+ if (charIndex == 0 && codeUnit == 0xFEFF) { |
+ confidence = 100; |
+ break; |
+ } |
+ confidence = adjustConfidence(codeUnit, confidence); |
+ if (confidence == 0 || confidence == 100) { |
+ break; |
+ } |
+ } |
+ if (bytesToCheck < 4 && confidence < 100) { |
+ confidence = 0; |
} |
- |
- // TODO: Do some statastics to check for unsigned UTF-16BE |
results->set(textIn, this, confidence); |
return (confidence > 0); |
} |
@@ -57,14 +88,27 @@ const char *CharsetRecog_UTF_16_LE::getName() const |
UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const |
{ |
const uint8_t *input = textIn->fRawInput; |
- int32_t confidence = 0; |
+ int32_t confidence = 10; |
int32_t length = textIn->fRawLength; |
- if (length >= 4 && input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) { |
- confidence = 100; |
+ int32_t bytesToCheck = (length > 30) ? 30 : length; |
+ for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { |
+ UChar codeUnit = input[charIndex] | (input[charIndex + 1] << 8); |
+ if (charIndex == 0 && codeUnit == 0xFEFF) { |
+ confidence = 100; // UTF-16 BOM |
+ if (length >= 4 && input[2] == 0 && input[3] == 0) { |
+ confidence = 0; // UTF-32 BOM |
+ } |
+ break; |
+ } |
+ confidence = adjustConfidence(codeUnit, confidence); |
+ if (confidence == 0 || confidence == 100) { |
+ break; |
+ } |
+ } |
+ if (bytesToCheck < 4 && confidence < 100) { |
+ confidence = 0; |
} |
- |
- // TODO: Do some statastics to check for unsigned UTF-16LE |
results->set(textIn, this, confidence); |
return (confidence > 0); |
} |