icu46/source/i18n/inputext.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/i18n/inputext.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/i18n/inputext.cpp

===================================================================

--- icu46/source/i18n/inputext.cpp (revision 0)

+++ icu46/source/i18n/inputext.cpp (revision 0)

@@ -0,0 +1,164 @@

+/*

+ **********************************************************************

+ */

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_CONVERSION

+#include "inputext.h"

+#include "cmemory.h"

+#include "cstring.h"

+#include <string.h>

+U_NAMESPACE_BEGIN

+// Capacity, in bytes, of the fInputBytes working buffer. MungeInput never
+// examines more than this many bytes of processed input.
+#define BUFFER_SIZE 8192
+
+// Number of elements in a true array (not a pointer). The argument is
+// parenthesized so that any array-valued expression can be passed.
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
+
+// Allocate uninitialized storage for `count` objects of `type` through
+// uprv_malloc. The whole expansion is parenthesized so it can be used safely
+// inside larger expressions. Callers must check the result for NULL.
+#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))
+
+// Release storage obtained from NEW_ARRAY.
+#define DELETE_ARRAY(array) uprv_free((void *) (array))

+/**
+ * Construct an InputText with no raw input attached and allocate its two
+ * working buffers.
+ *
+ * @param status Set to U_MEMORY_ALLOCATION_ERROR when either buffer could not
+ *               be allocated; left untouched otherwise.
+ */
+InputText::InputText(UErrorCode &status)
+    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
+                                                     // removed if appropriate.
+      fByteStats(NEW_ARRAY(int16_t, 256)),           // Per-byte-value occurrence tallies for the
+                                                     // input text; filled in by MungeInput.
+      fDeclaredEncoding(0),
+      fRawInput(0),
+      fRawLength(0)
+{
+    // Give the remaining fields a defined value even if setText() is never
+    // called. They are assigned here rather than in the initializer list
+    // because the member declaration order is not visible from this file.
+    fInputLen = 0;
+    fC1Bytes  = FALSE;
+
+    if (fInputBytes == NULL || fByteStats == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+}

+/**
+ * Release the buffers owned by this object: the declared-encoding copy and
+ * the two working arrays allocated by the constructor. fRawInput is not
+ * freed here; setText() only stores the caller's pointer.
+ */
+InputText::~InputText()
+{
+    DELETE_ARRAY(fDeclaredEncoding);
+    DELETE_ARRAY(fByteStats);
+    DELETE_ARRAY(fInputBytes);
+}

+/**
+ * Attach a new block of raw input. Only the pointer is stored (the bytes are
+ * not copied), and the processed-input state is reset; MungeInput() must be
+ * run again before the statistics are meaningful.
+ *
+ * @param in  Raw input bytes. May be NULL, in which case the object is left
+ *            with no input and a length of zero.
+ * @param len Number of bytes in `in`, or -1 if `in` is NUL-terminated.
+ */
+void InputText::setText(const char *in, int32_t len)
+{
+    fInputLen = 0;
+    fC1Bytes  = FALSE;
+    fRawInput = (const uint8_t *) in;
+
+    if (in == NULL) {
+        // Never measure or later read through a NULL pointer, whatever
+        // length the caller passed.
+        fRawLength = 0;
+    } else {
+        fRawLength = len == -1 ? (int32_t)uprv_strlen(in) : len;
+    }
+}

+/**
+ * Store a private, NUL-terminated copy of the encoding name declared for the
+ * input, replacing any previously stored name.
+ *
+ * The call is a no-op (the previous name is kept) when `encoding` is NULL,
+ * when `len` is negative and not -1, or when the copy cannot be allocated.
+ *
+ * @param encoding Encoding name; need not be NUL-terminated when `len` is
+ *                 given explicitly.
+ * @param len      Number of chars of `encoding` to keep, or -1 if `encoding`
+ *                 is NUL-terminated.
+ */
+void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
+{
+    if (encoding == NULL) {
+        return;
+    }
+
+    if (len == -1) {
+        len = (int32_t)uprv_strlen(encoding);
+    }
+
+    if (len < 0) {
+        return;
+    }
+
+    // One extra char to make place for the \0 at the end.
+    char *copy = NEW_ARRAY(char, len + 1);
+
+    if (copy == NULL) {
+        return;
+    }
+
+    // Copy at most `len` chars, then terminate explicitly: strncpy adds no
+    // terminator when the source is at least `len` chars long, and copying
+    // len + 1 chars would read one char past an explicit-length source.
+    uprv_strncpy(copy, encoding, len);
+    copy[len] = 0;
+
+    // Replace the old name only once the new copy is complete.
+    DELETE_ARRAY(fDeclaredEncoding);
+    fDeclaredEncoding = copy;
+}

+/**
+ * @return TRUE when a non-NULL raw input pointer is currently stored in
+ *         fRawInput, FALSE otherwise.
+ */
+UBool InputText::isSet() const
+{
+    return fRawInput != NULL;
+}

+/**
+* MungeInput - after getting a set of raw input data to be analyzed, preprocess
+*              it by removing what appears to be html markup.
+*
+* On return fInputBytes/fInputLen hold at most BUFFER_SIZE bytes of input to
+* examine (stripped or raw), fByteStats holds the number of occurrences of
+* each byte value in that buffer, and fC1Bytes is set to TRUE if any byte in
+* the range 0x80..0x9F occurs. fC1Bytes is never cleared here; setText()
+* resets it.
+*
+* @param fStripTags TRUE to try stripping everything between '<' and '>'
+*                   before gathering statistics.
+* @internal
+*/
+void InputText::MungeInput(UBool fStripTags) {
+    int32_t srci = 0;
+    int32_t dsti = 0;
+    uint8_t b;
+    UBool   inMarkup = FALSE;
+    int32_t openTags = 0;
+    int32_t badTags  = 0;
+
+    //
+    //  html / xml markup stripping.
+    //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
+    //     discard everything within < brackets >
+    //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
+    //     guess as to whether the input was actually marked up at all.
+    // TODO: Think about how this interacts with EBCDIC charsets that are detected.
+    if (fStripTags) {
+        for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
+            b = fRawInput[srci];
+
+            if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
+                if (inMarkup) {
+                    // A '<' inside an already open tag counts as malformed.
+                    badTags += 1;
+                }
+
+                inMarkup = TRUE;
+                openTags += 1;
+            }
+
+            if (! inMarkup) {
+                fInputBytes[dsti++] = b;
+            }
+
+            if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
+                inMarkup = FALSE;
+            }
+        }
+
+        fInputLen = dsti;
+    }
+
+    //
+    //  If it looks like this input wasn't marked up, or if it looks like it's
+    //    essentially nothing but markup abandon the markup stripping.
+    //    Detection will have to work on the unstripped input.
+    //
+    //  When fStripTags is FALSE, openTags is still 0, so this branch is always
+    //    taken and the raw bytes are used as-is.
+    //
+    if (openTags<5 || openTags/5 < badTags ||
+        (fInputLen < 100 && fRawLength>600))
+    {
+        int32_t limit = fRawLength;
+
+        if (limit > BUFFER_SIZE) {
+            limit = BUFFER_SIZE;
+        }
+
+        for (srci=0; srci<limit; srci++) {
+            fInputBytes[srci] = fRawInput[srci];
+        }
+
+        fInputLen = srci;
+    }
+
+    //
+    // Tally up the byte occurrence statistics.
+    // These are available for use by the various detectors.
+    //
+    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
+
+    for (srci = 0; srci < fInputLen; srci += 1) {
+        fByteStats[fInputBytes[srci]] += 1;
+    }
+
+    // Record whether any byte in the range 0x80..0x9F is present.
+    for (int32_t i = 0x80; i <= 0x9F; i += 1) {
+        if (fByteStats[i] != 0) {
+            fC1Bytes = TRUE;
+            break;
+        }
+    }
+}

+U_NAMESPACE_END

+#endif

Property changes on: icu46/source/i18n/inputext.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/i18n/inputext.h ('k') | icu46/source/i18n/islamcal.h » ('j') | no next file with comments »