OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2005-2009, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 */ |
| 7 |
| 8 #include "unicode/utypes.h" |
| 9 |
| 10 #if !UCONFIG_NO_CONVERSION |
| 11 |
| 12 #include "inputext.h" |
| 13 |
| 14 #include "cmemory.h" |
| 15 #include "cstring.h" |
| 16 |
| 17 #include <string.h> |
| 18 |
| 19 U_NAMESPACE_BEGIN |
| 20 |
// Capacity, in bytes, of the fInputBytes working buffer.
#define BUFFER_SIZE 8192

// Element count of a true array (not a pointer). The argument is fully
// parenthesized so that arbitrary array-valued expressions can be passed.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Typed wrappers around uprv_malloc / uprv_free. NEW_ARRAY only obtains raw
// storage for `count` objects of `type`; callers check the result for NULL.
// The expansion is wrapped in parentheses so it can be used safely inside a
// larger expression (e.g. followed by a subscript).
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
| 27 |
| 28 InputText::InputText(UErrorCode &status) |
| 29 : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.
Markup will have been |
| 30 // removed if appropriate. |
| 31 fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics fo
r the input text. |
| 32 // Value is percent, not abso
lute. |
| 33 fDeclaredEncoding(0), |
| 34 fRawInput(0), |
| 35 fRawLength(0) |
| 36 { |
| 37 if (fInputBytes == NULL || fByteStats == NULL) { |
| 38 status = U_MEMORY_ALLOCATION_ERROR; |
| 39 } |
| 40 } |
| 41 |
| 42 InputText::~InputText() |
| 43 { |
| 44 DELETE_ARRAY(fDeclaredEncoding); |
| 45 DELETE_ARRAY(fByteStats); |
| 46 DELETE_ARRAY(fInputBytes); |
| 47 } |
| 48 |
| 49 void InputText::setText(const char *in, int32_t len) |
| 50 { |
| 51 fInputLen = 0; |
| 52 fC1Bytes = FALSE; |
| 53 fRawInput = (const uint8_t *) in; |
| 54 fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; |
| 55 } |
| 56 |
| 57 void InputText::setDeclaredEncoding(const char* encoding, int32_t len) |
| 58 { |
| 59 if(encoding) { |
| 60 if (len == -1) { |
| 61 len = (int32_t)uprv_strlen(encoding); |
| 62 } |
| 63 |
| 64 len += 1; // to make place for the \0 at the end. |
| 65 uprv_free(fDeclaredEncoding); |
| 66 fDeclaredEncoding = NEW_ARRAY(char, len); |
| 67 uprv_strncpy(fDeclaredEncoding, encoding, len); |
| 68 } |
| 69 } |
| 70 |
| 71 UBool InputText::isSet() const |
| 72 { |
| 73 return fRawInput != NULL; |
| 74 } |
| 75 |
| 76 /** |
| 77 * MungeInput - after getting a set of raw input data to be analyzed, preprocess |
| 78 * it by removing what appears to be html markup. |
| 79 * |
| 80 * @internal |
| 81 */ |
| 82 void InputText::MungeInput(UBool fStripTags) { |
| 83 int srci = 0; |
| 84 int dsti = 0; |
| 85 uint8_t b; |
| 86 bool inMarkup = FALSE; |
| 87 int32_t openTags = 0; |
| 88 int32_t badTags = 0; |
| 89 |
| 90 // |
| 91 // html / xml markup stripping. |
| 92 // quick and dirty, not 100% accurate, but hopefully good enough, statis
tically. |
| 93 // discard everything within < brackets > |
| 94 // Count how many total '<' and illegal (nested) '<' occur, so we can ma
ke some |
| 95 // guess as to whether the input was actually marked up at all. |
| 96 // TODO: Think about how this interacts with EBCDIC charsets that are detect
ed. |
| 97 if (fStripTags) { |
| 98 for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { |
| 99 b = fRawInput[srci]; |
| 100 |
| 101 if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ |
| 102 if (inMarkup) { |
| 103 badTags += 1; |
| 104 } |
| 105 |
| 106 inMarkup = TRUE; |
| 107 openTags += 1; |
| 108 } |
| 109 |
| 110 if (! inMarkup) { |
| 111 fInputBytes[dsti++] = b; |
| 112 } |
| 113 |
| 114 if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ |
| 115 inMarkup = FALSE; |
| 116 } |
| 117 } |
| 118 |
| 119 fInputLen = dsti; |
| 120 } |
| 121 |
| 122 // |
| 123 // If it looks like this input wasn't marked up, or if it looks like it's |
| 124 // essentially nothing but markup abandon the markup stripping. |
| 125 // Detection will have to work on the unstripped input. |
| 126 // |
| 127 if (openTags<5 || openTags/5 < badTags || |
| 128 (fInputLen < 100 && fRawLength>600)) |
| 129 { |
| 130 int32_t limit = fRawLength; |
| 131 |
| 132 if (limit > BUFFER_SIZE) { |
| 133 limit = BUFFER_SIZE; |
| 134 } |
| 135 |
| 136 for (srci=0; srci<limit; srci++) { |
| 137 fInputBytes[srci] = fRawInput[srci]; |
| 138 } |
| 139 |
| 140 fInputLen = srci; |
| 141 } |
| 142 |
| 143 // |
| 144 // Tally up the byte occurence statistics. |
| 145 // These are available for use by the various detectors. |
| 146 // |
| 147 |
| 148 uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); |
| 149 |
| 150 for (srci = 0; srci < fInputLen; srci += 1) { |
| 151 fByteStats[fInputBytes[srci]] += 1; |
| 152 } |
| 153 |
| 154 for (int32_t i = 0x80; i <= 0x9F; i += 1) { |
| 155 if (fByteStats[i] != 0) { |
| 156 fC1Bytes = TRUE; |
| 157 break; |
| 158 } |
| 159 } |
| 160 } |
| 161 |
| 162 U_NAMESPACE_END |
| 163 #endif |
| 164 |
OLD | NEW |