Index: icu46/source/i18n/inputext.cpp |
=================================================================== |
--- icu46/source/i18n/inputext.cpp (revision 0) |
+++ icu46/source/i18n/inputext.cpp (revision 0) |
@@ -0,0 +1,164 @@ |
+/* |
+ ********************************************************************** |
+ * Copyright (C) 2005-2009, International Business Machines |
+ * Corporation and others. All Rights Reserved. |
+ ********************************************************************** |
+ */ |
+ |
+#include "unicode/utypes.h" |
+ |
+#if !UCONFIG_NO_CONVERSION |
+ |
+#include "inputext.h" |
+ |
+#include "cmemory.h" |
+#include "cstring.h" |
+ |
+#include <string.h> |
+ |
+U_NAMESPACE_BEGIN |
+ |
+#define BUFFER_SIZE 8192   /* capacity, in bytes, of the fInputBytes working buffer */ |
+/* Element count of a true array (not a pointer); the operand is parenthesized for safe expansion. */ |
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0])) |
+/* Raw uprv_malloc/uprv_free helpers: no constructors run, and NEW_ARRAY can yield NULL. */ |
+#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type))) |
+#define DELETE_ARRAY(array) uprv_free((void *) (array)) |
+ |
+/* Allocates the working buffers; on allocation failure sets status to U_MEMORY_ALLOCATION_ERROR. */ |
+InputText::InputText(UErrorCode &status) |
+    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been |
+                                                    //   removed if appropriate. |
+      fByteStats(NEW_ARRAY(int16_t, 256)),          // Occurrence count of each byte value in fInputBytes. |
+      fDeclaredEncoding(0), |
+      fRawInput(0), |
+      fRawLength(0) |
+{ |
+    fInputLen = 0;     // Assigned in the body because the member declaration order is not |
+    fC1Bytes = FALSE;  //   visible here; keeps later reads from seeing indeterminate values. |
+    if (fInputBytes == NULL || fByteStats == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } |
+} |
+ |
+/* Frees the three heap buffers owned by this object; fRawInput is not freed here. */ |
+InputText::~InputText() { |
+    uprv_free((void *) fInputBytes); |
+    uprv_free((void *) fByteStats); |
+    uprv_free((void *) fDeclaredEncoding); |
+} |
+ |
+/* Points at caller-owned bytes (not copied). len == -1: NUL-terminated; a NULL text has length 0. */ |
+void InputText::setText(const char *in, int32_t len) { |
+    fInputLen = 0;      // no munged text exists yet for the new input |
+    fC1Bytes = FALSE; |
+    fRawInput = (const uint8_t *) in; |
+    fRawLength = (in == NULL) ? 0 : (len == -1 ? (int32_t) uprv_strlen(in) : len); |
+} |
+ |
+/* Keeps a private NUL-terminated copy of encoding (len == -1: NUL-terminated input); NULL or len < -1 is ignored. */ |
+void InputText::setDeclaredEncoding(const char* encoding, int32_t len) |
+{ |
+    if (encoding == NULL || len < -1) { return; }   // nothing usable: keep the previous value |
+    if (len == -1) { len = (int32_t)uprv_strlen(encoding); } |
+    uprv_free(fDeclaredEncoding); |
+    fDeclaredEncoding = NEW_ARRAY(char, len + 1);   // +1 to make place for the \0 at the end |
+    if (fDeclaredEncoding != NULL) {                // on allocation failure the member stays NULL |
+        // uprv_strncpy writes no terminator when encoding has no \0 within len bytes. |
+        uprv_strncpy(fDeclaredEncoding, encoding, len); |
+        fDeclaredEncoding[len] = 0; |
+    } |
+} |
+ |
+/* TRUE when fRawInput is non-NULL, i.e. setText() has been given a text pointer. */ |
+UBool InputText::isSet() const { |
+    return (fRawInput == NULL) ? FALSE : TRUE; |
+} |
+ |
+/** |
+* MungeInput - preprocess the raw input given to setText() into fInputBytes. |
+* |
+* When fStripTags is TRUE, everything from an ASCII '<' through the next '>' is |
+* dropped. The stripped text is kept only if the input looks like real markup; |
+* otherwise up to the first BUFFER_SIZE raw bytes are used unchanged. Afterwards |
+* fByteStats holds the occurrence count of each byte value in fInputBytes, and |
+* fC1Bytes tells whether any byte in the range 0x80..0x9F occurs. |
+* |
+* @param fStripTags TRUE to attempt html / xml markup removal. |
+* @internal |
+*/ |
+void InputText::MungeInput(UBool fStripTags) { |
+    int32_t srci = 0; |
+    int32_t dsti = 0; |
+    UBool inMarkup = FALSE; |
+    int32_t openTags = 0; |
+    int32_t badTags = 0; |
+ |
+    // |
+    //  html / xml markup stripping. |
+    //     quick and dirty, not 100% accurate, but hopefully good enough, statistically. |
+    //     discard everything within < brackets > |
+    //     Count how many total '<' and illegal (nested) '<' occur, so we can make some |
+    //     guess as to whether the input was actually marked up at all. |
+    // TODO: Think about how this interacts with EBCDIC charsets that are detected. |
+    if (fStripTags) { |
+        for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { |
+            const uint8_t b = fRawInput[srci]; |
+            if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ |
+                if (inMarkup) { |
+                    badTags += 1;   // a '<' seen before the previous tag was closed |
+                } |
+                inMarkup = TRUE; |
+                openTags += 1; |
+            } |
+ |
+            if (! inMarkup) { |
+                fInputBytes[dsti++] = b; |
+            } |
+ |
+            if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ |
+                inMarkup = FALSE;   // copying resumes with the next byte |
+            } |
+        } |
+ |
+        fInputLen = dsti; |
+    } |
+ |
+    // |
+    //  If it looks like this input wasn't marked up, or if it looks like it's |
+    //    essentially nothing but markup abandon the markup stripping. |
+    //    Detection will have to work on the unstripped input. |
+    // |
+    if (openTags<5 || openTags/5 < badTags || |
+        (fInputLen < 100 && fRawLength>600)) |
+    { |
+        int32_t limit = fRawLength; |
+        if (limit > BUFFER_SIZE) { |
+            limit = BUFFER_SIZE; |
+        } |
+ |
+        for (srci=0; srci<limit; srci++) { |
+            fInputBytes[srci] = fRawInput[srci]; |
+        } |
+ |
+        fInputLen = srci; |
+    } |
+ |
+    // |
+    // Tally up the byte occurrence statistics. |
+    // These are available for use by the various detectors. |
+    // |
+    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); |
+    for (srci = 0; srci < fInputLen; srci += 1) { |
+        fByteStats[fInputBytes[srci]] += 1; |
+    } |
+ |
+    // Recompute the C1 flag from scratch, like fInputLen and fByteStats above, so that a |
+    // TRUE left by an earlier call on this text cannot survive a re-munge. |
+    fC1Bytes = FALSE; |
+    for (int32_t i = 0x80; i <= 0x9F && !fC1Bytes; i += 1) { |
+        fC1Bytes = (UBool)(fByteStats[i] != 0); |
+    } |
+} |
+ |
+U_NAMESPACE_END |
+#endif |
+ |
Property changes on: icu46/source/i18n/inputext.cpp |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |