OLD | NEW |
1 /* | 1 /* |
2 ********************************************************************** | 2 ********************************************************************** |
3 * Copyright (C) 2005-2012, International Business Machines | 3 * Copyright (C) 2005-2014, International Business Machines |
4 * Corporation and others. All Rights Reserved. | 4 * Corporation and others. All Rights Reserved. |
5 ********************************************************************** | 5 ********************************************************************** |
6 */ | 6 */ |
7 | 7 |
8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
9 | 9 |
10 #if !UCONFIG_NO_CONVERSION | 10 #if !UCONFIG_NO_CONVERSION |
11 | 11 |
12 #include "csrutf8.h" | 12 #include "csrutf8.h" |
13 #include "csmatch.h" | 13 #include "csmatch.h" |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
48 | 48 |
49 // Hi bit on char found. Figure out how long the sequence should be | 49 // Hi bit on char found. Figure out how long the sequence should be |
50 if ((b & 0x0E0) == 0x0C0) { | 50 if ((b & 0x0E0) == 0x0C0) { |
51 trailBytes = 1; | 51 trailBytes = 1; |
52 } else if ((b & 0x0F0) == 0x0E0) { | 52 } else if ((b & 0x0F0) == 0x0E0) { |
53 trailBytes = 2; | 53 trailBytes = 2; |
54 } else if ((b & 0x0F8) == 0xF0) { | 54 } else if ((b & 0x0F8) == 0xF0) { |
55 trailBytes = 3; | 55 trailBytes = 3; |
56 } else { | 56 } else { |
57 numInvalid += 1; | 57 numInvalid += 1; |
58 | 58 continue; |
59 if (numInvalid > 5) { | |
60 break; | |
61 } | |
62 | |
63 trailBytes = 0; | |
64 } | 59 } |
65 | 60 |
66 // Verify that we've got the right number of trail bytes in the sequence | 61 // Verify that we've got the right number of trail bytes in the sequence |
67 for (;;) { | 62 for (;;) { |
68 i += 1; | 63 i += 1; |
69 | 64 |
70 if (i >= input->fRawLength) { | 65 if (i >= input->fRawLength) { |
71 break; | 66 break; |
72 } | 67 } |
73 | 68 |
74 b = inputBytes[i]; | 69 b = inputBytes[i]; |
75 | 70 |
76 if ((b & 0xC0) != 0x080) { | 71 if ((b & 0xC0) != 0x080) { |
77 numInvalid += 1; | 72 numInvalid += 1; |
78 break; | 73 break; |
79 } | 74 } |
80 | 75 |
81 if (--trailBytes == 0) { | 76 if (--trailBytes == 0) { |
82 numValid += 1; | 77 numValid += 1; |
83 break; | 78 break; |
84 } | 79 } |
85 } | 80 } |
86 | 81 |
87 } | 82 } |
88 | 83 |
89 // Cook up some sort of confidence score, based on presense of a BOM | 84 // Cook up some sort of confidence score, based on presence of a BOM |
90 // and the existence of valid and/or invalid multi-byte sequences. | 85 // and the existence of valid and/or invalid multi-byte sequences. |
91 confidence = 0; | 86 confidence = 0; |
92 if (hasBOM && numInvalid == 0) { | 87 if (hasBOM && numInvalid == 0) { |
93 confidence = 100; | 88 confidence = 100; |
94 } else if (hasBOM && numValid > numInvalid*10) { | 89 } else if (hasBOM && numValid > numInvalid*10) { |
95 confidence = 80; | 90 confidence = 80; |
96 } else if (numValid > 3 && numInvalid == 0) { | 91 } else if (numValid > 3 && numInvalid == 0) { |
97 confidence = 100; | 92 confidence = 100; |
98 } else if (numValid > 0 && numInvalid == 0) { | 93 } else if (numValid > 0 && numInvalid == 0) { |
99 confidence = 80; | 94 confidence = 80; |
100 } else if (numValid == 0 && numInvalid == 0) { | 95 } else if (numValid == 0 && numInvalid == 0) { |
101 // Plain ASCII. | 96 // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which |
102 confidence = 10; | 97 // accepts ASCII with confidence = 10. |
| 98 confidence = 15; |
103 } else if (numValid > numInvalid*10) { | 99 } else if (numValid > numInvalid*10) { |
104 // Probably corrupt utf-8 data. Valid sequences aren't likely by chance. | 100 // Probably corrupt utf-8 data. Valid sequences aren't likely by chance. |
105 confidence = 25; | 101 confidence = 25; |
106 } | 102 } |
107 | 103 |
108 results->set(input, this, confidence); | 104 results->set(input, this, confidence); |
109 return (confidence > 0); | 105 return (confidence > 0); |
110 } | 106 } |
111 | 107 |
112 U_NAMESPACE_END | 108 U_NAMESPACE_END |
113 #endif | 109 #endif |
OLD | NEW |