Index: source/i18n/csrutf8.cpp |
diff --git a/source/i18n/csrutf8.cpp b/source/i18n/csrutf8.cpp |
index 420c66909d4da0732941649e6ab0d50b373da819..b18aa77e79669f3115e862065498796883d30cce 100644 |
--- a/source/i18n/csrutf8.cpp |
+++ b/source/i18n/csrutf8.cpp |
@@ -1,6 +1,6 @@ |
/* |
********************************************************************** |
- * Copyright (C) 2005-2012, International Business Machines |
+ * Copyright (C) 2005-2014, International Business Machines |
* Corporation and others. All Rights Reserved. |
********************************************************************** |
*/ |
@@ -55,12 +55,7 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { |
trailBytes = 3; |
} else { |
numInvalid += 1; |
- |
- if (numInvalid > 5) { |
- break; |
- } |
- |
- trailBytes = 0; |
+ continue; |
} |
// Verify that we've got the right number of trail bytes in the sequence |
@@ -86,7 +81,7 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { |
} |
- // Cook up some sort of confidence score, based on presense of a BOM |
+ // Cook up some sort of confidence score, based on presence of a BOM |
// and the existence of valid and/or invalid multi-byte sequences. |
confidence = 0; |
if (hasBOM && numInvalid == 0) { |
@@ -98,8 +93,9 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { |
} else if (numValid > 0 && numInvalid == 0) { |
confidence = 80; |
} else if (numValid == 0 && numInvalid == 0) { |
- // Plain ASCII. |
- confidence = 10; |
+ // Plain ASCII. Confidence must be > 10 because UTF-8 is more likely than UTF-16, which |
+ // accepts ASCII with confidence = 10. |
+ confidence = 15; |
} else if (numValid > numInvalid*10) { |
// Probably corruput utf-8 data. Valid sequences aren't likely by chance. |
confidence = 25; |