| Index: source/i18n/csrutf8.cpp
|
| diff --git a/source/i18n/csrutf8.cpp b/source/i18n/csrutf8.cpp
|
| index 420c66909d4da0732941649e6ab0d50b373da819..b18aa77e79669f3115e862065498796883d30cce 100644
|
| --- a/source/i18n/csrutf8.cpp
|
| +++ b/source/i18n/csrutf8.cpp
|
| @@ -1,6 +1,6 @@
|
| /*
|
| **********************************************************************
|
| - * Copyright (C) 2005-2012, International Business Machines
|
| + * Copyright (C) 2005-2014, International Business Machines
|
| * Corporation and others. All Rights Reserved.
|
| **********************************************************************
|
| */
|
| @@ -55,12 +55,7 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
|
| trailBytes = 3;
|
| } else {
|
| numInvalid += 1;
|
| -
|
| - if (numInvalid > 5) {
|
| - break;
|
| - }
|
| -
|
| - trailBytes = 0;
|
| + continue;
|
| }
|
|
|
| // Verify that we've got the right number of trail bytes in the sequence
|
| @@ -86,7 +81,7 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
|
|
|
| }
|
|
|
| - // Cook up some sort of confidence score, based on presense of a BOM
|
| + // Cook up some sort of confidence score, based on presence of a BOM
|
| // and the existence of valid and/or invalid multi-byte sequences.
|
| confidence = 0;
|
| if (hasBOM && numInvalid == 0) {
|
| @@ -98,8 +93,9 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
|
| } else if (numValid > 0 && numInvalid == 0) {
|
| confidence = 80;
|
| } else if (numValid == 0 && numInvalid == 0) {
|
| - // Plain ASCII.
|
| - confidence = 10;
|
| + // Plain ASCII. Confidence must be > 10 because UTF-8 is more likely than UTF-16, which
|
| + // accepts ASCII with confidence = 10.
|
| + confidence = 15;
|
| } else if (numValid > numInvalid*10) {
|
| // Probably corruput utf-8 data. Valid sequences aren't likely by chance.
|
| confidence = 25;
|
|
|