OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2005-2006, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 */ |
| 7 |
| 8 #include "unicode/utypes.h" |
| 9 |
| 10 #if !UCONFIG_NO_CONVERSION |
| 11 |
| 12 #include "csrucode.h" |
| 13 |
| 14 U_NAMESPACE_BEGIN |
| 15 |
| 16 CharsetRecog_Unicode::~CharsetRecog_Unicode() |
| 17 { |
| 18 // nothing to do |
| 19 } |
| 20 |
| 21 CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() |
| 22 { |
| 23 // nothing to do |
| 24 } |
| 25 |
| 26 const char *CharsetRecog_UTF_16_BE::getName() const |
| 27 { |
| 28 return "UTF-16BE"; |
| 29 } |
| 30 |
| 31 int32_t CharsetRecog_UTF_16_BE::match(InputText* textIn) |
| 32 { |
| 33 const uint8_t *input = textIn->fRawInput; |
| 34 |
| 35 if (input[0] == 0xFE && input[1] == 0xFF) { |
| 36 return 100; |
| 37 } |
| 38 |
| 39 // TODO: Do some statastics to check for unsigned UTF-16BE |
| 40 return 0; |
| 41 } |
| 42 |
| 43 CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() |
| 44 { |
| 45 // nothing to do |
| 46 } |
| 47 |
| 48 const char *CharsetRecog_UTF_16_LE::getName() const |
| 49 { |
| 50 return "UTF-16LE"; |
| 51 } |
| 52 |
| 53 int32_t CharsetRecog_UTF_16_LE::match(InputText* textIn) |
| 54 { |
| 55 const uint8_t *input = textIn->fRawInput; |
| 56 |
| 57 if (input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] !=
0x00)) { |
| 58 return 100; |
| 59 } |
| 60 |
| 61 // TODO: Do some statastics to check for unsigned UTF-16LE |
| 62 return 0; |
| 63 } |
| 64 |
| 65 CharsetRecog_UTF_32::~CharsetRecog_UTF_32() |
| 66 { |
| 67 // nothing to do |
| 68 } |
| 69 |
| 70 int32_t CharsetRecog_UTF_32::match(InputText* textIn) |
| 71 { |
| 72 const uint8_t *input = textIn->fRawInput; |
| 73 int32_t limit = (textIn->fRawLength / 4) * 4; |
| 74 int32_t numValid = 0; |
| 75 int32_t numInvalid = 0; |
| 76 bool hasBOM = FALSE; |
| 77 int32_t confidence = 0; |
| 78 |
| 79 if (getChar(input, 0) == 0x0000FEFFUL) { |
| 80 hasBOM = TRUE; |
| 81 } |
| 82 |
| 83 for(int32_t i = 0; i < limit; i += 4) { |
| 84 int32_t ch = getChar(input, i); |
| 85 |
| 86 if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { |
| 87 numInvalid += 1; |
| 88 } else { |
| 89 numValid += 1; |
| 90 } |
| 91 } |
| 92 |
| 93 |
| 94 // Cook up some sort of confidence score, based on presense of a BOM |
| 95 // and the existence of valid and/or invalid multi-byte sequences. |
| 96 if (hasBOM && numInvalid==0) { |
| 97 confidence = 100; |
| 98 } else if (hasBOM && numValid > numInvalid*10) { |
| 99 confidence = 80; |
| 100 } else if (numValid > 3 && numInvalid == 0) { |
| 101 confidence = 100; |
| 102 } else if (numValid > 0 && numInvalid == 0) { |
| 103 confidence = 80; |
| 104 } else if (numValid > numInvalid*10) { |
| 105 // Probably corruput UTF-32BE data. Valid sequences aren't likely by ch
ance. |
| 106 confidence = 25; |
| 107 } |
| 108 |
| 109 return confidence; |
| 110 } |
| 111 |
| 112 CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE() |
| 113 { |
| 114 // nothing to do |
| 115 } |
| 116 |
| 117 const char *CharsetRecog_UTF_32_BE::getName() const |
| 118 { |
| 119 return "UTF-32BE"; |
| 120 } |
| 121 |
| 122 int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) con
st |
| 123 { |
| 124 return input[index + 0] << 24 | input[index + 1] << 16 | |
| 125 input[index + 2] << 8 | input[index + 3]; |
| 126 } |
| 127 |
| 128 CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE() |
| 129 { |
| 130 // nothing to do |
| 131 } |
| 132 |
| 133 const char *CharsetRecog_UTF_32_LE::getName() const |
| 134 { |
| 135 return "UTF-32LE"; |
| 136 } |
| 137 |
| 138 int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) con
st |
| 139 { |
| 140 return input[index + 3] << 24 | input[index + 2] << 16 | |
| 141 input[index + 1] << 8 | input[index + 0]; |
| 142 } |
| 143 |
| 144 U_NAMESPACE_END |
| 145 #endif |
| 146 |
OLD | NEW |