| Index: icu46/source/i18n/inputext.cpp
|
| ===================================================================
|
| --- icu46/source/i18n/inputext.cpp (revision 0)
|
| +++ icu46/source/i18n/inputext.cpp (revision 0)
|
| @@ -0,0 +1,164 @@
|
| +/*
|
| + **********************************************************************
|
| + * Copyright (C) 2005-2009, International Business Machines
|
| + * Corporation and others. All Rights Reserved.
|
| + **********************************************************************
|
| + */
|
| +
|
| +#include "unicode/utypes.h"
|
| +
|
| +#if !UCONFIG_NO_CONVERSION
|
| +
|
| +#include "inputext.h"
|
| +
|
| +#include "cmemory.h"
|
| +#include "cstring.h"
|
| +
|
| +#include <string.h>
|
| +
|
| +U_NAMESPACE_BEGIN
|
| +
|
| +// Capacity, in bytes, of the fInputBytes buffer: at most this many bytes of input are kept.
|
| +#define BUFFER_SIZE 8192
|
| +
|
| +// Element count of a true array (not a pointer). The argument is parenthesized so that
|
| +// any array-valued expression can be passed safely.
|
| +#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
|
| +
|
| +// Raw (unconstructed) array allocation and release through uprv_malloc()/uprv_free().
|
| +// NEW_ARRAY is fully parenthesized so that the expansion stays a single operand in any expression.
|
| +#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
|
| +#define DELETE_ARRAY(array) uprv_free((void *) (array))
|
| +
|
| +/**
|
| + * Allocates the working buffers and puts every other field into a known state.
|
| + *
|
| + * @param status set to U_MEMORY_ALLOCATION_ERROR if either buffer could not be
|
| + *               allocated; otherwise it is not modified.
|
| + */
|
| +InputText::InputText(UErrorCode &status)
|
| +    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
|
| +                                                    //   removed if appropriate.
|
| +      fByteStats(NEW_ARRAY(int16_t, 256)),          // Byte frequency statistics for the input text:
|
| +                                                    //   one occurrence count per byte value,
|
| +                                                    //   filled in by MungeInput().
|
| +      fDeclaredEncoding(0),
|
| +      fRawInput(0),
|
| +      fRawLength(0)
|
| +{
|
| +    // These two were previously left uninitialized until setText() ran.
|
| +    // They are assigned here, rather than in the initializer list, so that this
|
| +    // code does not depend on the member declaration order in inputext.h.
|
| +    fInputLen = 0;
|
| +    fC1Bytes  = FALSE;
|
| +
|
| +    if (fInputBytes == NULL || fByteStats == NULL) {
|
| +        status = U_MEMORY_ALLOCATION_ERROR;
|
| +    }
|
| +}
|
| +
|
| +/**
|
| + * Releases the three buffers allocated by this object. fRawInput is not
|
| + * released here: setText() only stores that pointer, it never copies the data.
|
| + */
|
| +InputText::~InputText()
|
| +{
|
| +    uprv_free(fInputBytes);
|
| +    uprv_free(fByteStats);
|
| +    uprv_free(fDeclaredEncoding);
|
| +}
|
| +
|
| +/**
|
| + * Points this object at a new block of raw input and resets the state derived
|
| + * from the previous input. The bytes are not copied; only the pointer is kept.
|
| + *
|
| + * @param in  the raw input bytes. May be NULL, in which case the object holds
|
| + *            no text (isSet() returns FALSE) and the length is recorded as 0.
|
| + * @param len number of bytes in in, or -1 if in is NUL-terminated.
|
| + */
|
| +void InputText::setText(const char *in, int32_t len)
|
| +{
|
| +    fInputLen = 0;
|
| +    fC1Bytes  = FALSE;
|
| +    fRawInput = (const uint8_t *) in;
|
| +
|
| +    if (in == NULL) {
|
| +        // Never call uprv_strlen() on NULL, and never record a non-zero length
|
| +        // that MungeInput() would later read through a NULL pointer.
|
| +        fRawLength = 0;
|
| +    } else if (len == -1) {
|
| +        fRawLength = (int32_t)uprv_strlen(in);
|
| +    } else {
|
| +        fRawLength = len;
|
| +    }
|
| +}
|
| +
|
| +/**
|
| + * Records the encoding name declared for the input. A private, NUL-terminated
|
| + * copy of the name is stored in fDeclaredEncoding.
|
| + *
|
| + * A NULL encoding, or a negative length other than -1, leaves the current value
|
| + * unchanged. If the copy cannot be allocated, the previous value is released
|
| + * and fDeclaredEncoding is left NULL.
|
| + *
|
| + * @param encoding the encoding name; it need not be NUL-terminated when len >= 0.
|
| + * @param len      number of chars in encoding, or -1 if it is NUL-terminated.
|
| + */
|
| +void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
|
| +{
|
| +    if (encoding == NULL) {
|
| +        return;
|
| +    }
|
| +
|
| +    if (len == -1) {
|
| +        len = (int32_t)uprv_strlen(encoding);
|
| +    }
|
| +
|
| +    if (len < 0) {
|
| +        // Not a usable length; it would turn into a bogus allocation size below.
|
| +        return;
|
| +    }
|
| +
|
| +    uprv_free(fDeclaredEncoding);
|
| +    fDeclaredEncoding = NEW_ARRAY(char, len + 1);   // + 1 to make place for the \0 at the end.
|
| +
|
| +    if (fDeclaredEncoding != NULL) {
|
| +        // Copy at most len chars so that encoding[len] is never read, then
|
| +        // terminate explicitly: uprv_strncpy() alone does not guarantee a \0.
|
| +        uprv_strncpy(fDeclaredEncoding, encoding, len);
|
| +        fDeclaredEncoding[len] = 0;
|
| +    }
|
| +}
|
| +
|
| +/**
|
| + * Reports whether raw input is present, i.e. whether fRawInput is non-NULL.
|
| + */
|
| +UBool InputText::isSet() const
|
| +{
|
| +    if (fRawInput == NULL) {
|
| +        return FALSE;
|
| +    }
|
| +
|
| +    return TRUE;
|
| +}
|
| +
|
| +/**
|
| +* MungeInput - preprocess the raw input data before it is analyzed.
|
| +*
|
| +* When fStripTags is set, everything between an ASCII '<' and the following '>'
|
| +* is dropped while the raw bytes are copied into fInputBytes. If the input then
|
| +* does not look like it was really marked up, the raw bytes are copied
|
| +* unmodified instead. Either way at most BUFFER_SIZE bytes are kept.
|
| +* Afterwards fByteStats holds the number of occurrences of each byte value, and
|
| +* fC1Bytes is set to TRUE if any byte in the range 0x80..0x9F is present.
|
| +*
|
| +* @param fStripTags whether to attempt to remove what appears to be html markup.
|
| +* @internal
|
| +*/
|
| +void InputText::MungeInput(UBool fStripTags) {
|
| +    int32_t tagCount    = 0;   // every '<' seen
|
| +    int32_t nestedCount = 0;   // '<' seen while already inside a tag (illegal nesting)
|
| +
|
| +    //
|
| +    //  html / xml markup stripping.
|
| +    //     Quick and dirty, not 100% accurate, but hopefully good enough, statistically.
|
| +    //     Everything within < brackets > is discarded.
|
| +    //     The total number of '<' and the number of nested '<' are counted, so that a
|
| +    //     guess can be made as to whether the input was actually marked up at all.
|
| +    //  TODO:  Think about how this interacts with EBCDIC charsets that are detected.
|
| +    if (fStripTags) {
|
| +        bool insideTag = FALSE;
|
| +        int  kept      = 0;
|
| +
|
| +        for (int pos = 0; pos < fRawLength && kept < BUFFER_SIZE; ++pos) {
|
| +            const uint8_t ch = fRawInput[pos];
|
| +
|
| +            if (ch == (uint8_t)0x3C) {   /* ASCII '<' */
|
| +                if (insideTag) {
|
| +                    ++nestedCount;
|
| +                }
|
| +
|
| +                insideTag = TRUE;
|
| +                ++tagCount;
|
| +            }
|
| +
|
| +            // The closing '>' is tested only after this copy, so it is dropped as well.
|
| +            if (!insideTag) {
|
| +                fInputBytes[kept++] = ch;
|
| +            }
|
| +
|
| +            if (ch == (uint8_t)0x3E) {   /* ASCII '>' */
|
| +                insideTag = FALSE;
|
| +            }
|
| +        }
|
| +
|
| +        fInputLen = kept;
|
| +    }
|
| +
|
| +    //
|
| +    //  The stripped text is only used when there were at least 5 tags, no more than
|
| +    //  about one in five of them was nested, and stripping did not reduce a sizeable
|
| +    //  input (more than 600 bytes) to almost nothing (fewer than 100 bytes).
|
| +    //  Otherwise the markup stripping is abandoned and detection has to work on the
|
| +    //  unstripped input.
|
| +    //
|
| +    const bool useStripped = tagCount >= 5 &&
|
| +                             tagCount / 5 >= nestedCount &&
|
| +                             !(fInputLen < 100 && fRawLength > 600);
|
| +
|
| +    if (!useStripped) {
|
| +        const int32_t copyLimit = (fRawLength > BUFFER_SIZE) ? BUFFER_SIZE : fRawLength;
|
| +        int           copied    = 0;
|
| +
|
| +        while (copied < copyLimit) {
|
| +            fInputBytes[copied] = fRawInput[copied];
|
| +            ++copied;
|
| +        }
|
| +
|
| +        fInputLen = copied;
|
| +    }
|
| +
|
| +    //
|
| +    //  Tally up the byte occurrence statistics.
|
| +    //  These are available for use by the various detectors.
|
| +    //
|
| +    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
|
| +
|
| +    for (int idx = 0; idx < fInputLen; ++idx) {
|
| +        fByteStats[fInputBytes[idx]] += 1;
|
| +    }
|
| +
|
| +    // Note whether any byte value in the range 0x80..0x9F occurred at all.
|
| +    for (int32_t c1 = 0x80; c1 <= 0x9F; ++c1) {
|
| +        if (fByteStats[c1] != 0) {
|
| +            fC1Bytes = TRUE;
|
| +            break;
|
| +        }
|
| +    }
|
| +}
|
| +
|
| +U_NAMESPACE_END
|
| +#endif
|
| +
|
|
|
| Property changes on: icu46/source/i18n/inputext.cpp
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|