icu46/source/i18n/inputext.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/inputext.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 2005-2009, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 */

	7

	8 #include "unicode/utypes.h"

	9

	10 #if !UCONFIG_NO_CONVERSION

	11

	12 #include "inputext.h"

	13

	14 #include "cmemory.h"

	15 #include "cstring.h"

	16

	17 #include <string.h>

	18

	19 U_NAMESPACE_BEGIN

	20

	// Capacity, in bytes, of the buffer that holds the (optionally tag-stripped) input.
	#define BUFFER_SIZE 8192

	// Number of elements in a true array (not a pointer).
	#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

	// Allocate an uninitialized array of `count` elements of `type` through uprv_malloc.
	// The result is whatever uprv_malloc returns, cast to `type *`; callers must check it.
	#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))

	// Release an array obtained from NEW_ARRAY.
	#define DELETE_ARRAY(array) uprv_free((void *) (array))

	27

	28 InputText::InputText(UErrorCode &status)

	29 : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been

	30 // removed if appropriate.

	31 fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics fo r the input text.

	32 // Value is percent, not abso lute.

	33 fDeclaredEncoding(0),

	34 fRawInput(0),

	35 fRawLength(0)

	36 {

	37 if (fInputBytes == NULL \|\| fByteStats == NULL) {

	38 status = U_MEMORY_ALLOCATION_ERROR;

	39 }

	40 }

	41

	42 InputText::~InputText()

	43 {

	44 DELETE_ARRAY(fDeclaredEncoding);

	45 DELETE_ARRAY(fByteStats);

	46 DELETE_ARRAY(fInputBytes);

	47 }

	48

	49 void InputText::setText(const char *in, int32_t len)

	50 {

	51 fInputLen = 0;

	52 fC1Bytes = FALSE;

	53 fRawInput = (const uint8_t *) in;

	54 fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;

	55 }

	56

	57 void InputText::setDeclaredEncoding(const char* encoding, int32_t len)

	58 {

	59 if(encoding) {

	60 if (len == -1) {

	61 len = (int32_t)uprv_strlen(encoding);

	62 }

	63

	64 len += 1; // to make place for the \0 at the end.

	65 uprv_free(fDeclaredEncoding);

	66 fDeclaredEncoding = NEW_ARRAY(char, len);

	67 uprv_strncpy(fDeclaredEncoding, encoding, len);

	68 }

	69 }

	70

	71 UBool InputText::isSet() const

	72 {

	73 return fRawInput != NULL;

	74 }

	75

	76 /**

	77 * MungeInput - after getting a set of raw input data to be analyzed, preprocess

	78 * it by removing what appears to be html markup.

	79 *

	80 * @internal

	81 */

	82 void InputText::MungeInput(UBool fStripTags) {

	83 int srci = 0;

	84 int dsti = 0;

	85 uint8_t b;

	86 bool inMarkup = FALSE;

	87 int32_t openTags = 0;

	88 int32_t badTags = 0;

	89

	90 //

	91 // html / xml markup stripping.

	92 // quick and dirty, not 100% accurate, but hopefully good enough, statis tically.

	93 // discard everything within < brackets >

	94 // Count how many total '<' and illegal (nested) '<' occur, so we can ma ke some

	95 // guess as to whether the input was actually marked up at all.

	96 // TODO: Think about how this interacts with EBCDIC charsets that are detect ed.

	97 if (fStripTags) {

	98 for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {

	99 b = fRawInput[srci];

	100

	101 if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */

	102 if (inMarkup) {

	103 badTags += 1;

	104 }

	105

	106 inMarkup = TRUE;

	107 openTags += 1;

	108 }

	109

	110 if (! inMarkup) {

	111 fInputBytes[dsti++] = b;

	112 }

	113

	114 if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */

	115 inMarkup = FALSE;

	116 }

	117 }

	118

	119 fInputLen = dsti;

	120 }

	121

	122 //

	123 // If it looks like this input wasn't marked up, or if it looks like it's

	124 // essentially nothing but markup abandon the markup stripping.

	125 // Detection will have to work on the unstripped input.

	126 //

	127 if (openTags<5 \|\| openTags/5 < badTags \|\|

	128 (fInputLen < 100 && fRawLength>600))

	129 {

	130 int32_t limit = fRawLength;

	131

	132 if (limit > BUFFER_SIZE) {

	133 limit = BUFFER_SIZE;

	134 }

	135

	136 for (srci=0; srci<limit; srci++) {

	137 fInputBytes[srci] = fRawInput[srci];

	138 }

	139

	140 fInputLen = srci;

	141 }

	142

	143 //

	144 // Tally up the byte occurence statistics.

	145 // These are available for use by the various detectors.

	146 //

	147

	148 uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);

	149

	150 for (srci = 0; srci < fInputLen; srci += 1) {

	151 fByteStats[fInputBytes[srci]] += 1;

	152 }

	153

	154 for (int32_t i = 0x80; i <= 0x9F; i += 1) {

	155 if (fByteStats[i] != 0) {

	156 fC1Bytes = TRUE;

	157 break;

	158 }

	159 }

	160 }

	161

	162 U_NAMESPACE_END

	163 #endif

	164

OLD	NEW

« no previous file with comments | « icu46/source/i18n/inputext.h ('k') | icu46/source/i18n/islamcal.h » ('j') | no next file with comments »