icu46/source/i18n/uspoof_wsconf.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/uspoof_wsconf.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 ******************************************************************************

	3 *

	4 * Copyright (C) 2008-2009, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 ******************************************************************************

	8 * file name: uspoof_wsconf.cpp

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created on: 2009Jan05 (refactoring earlier files)

	14 * created by: Andy Heninger

	15 *

	16 * Internal functions for compiling Whole Script confusable source data

	17 * into its binary (runtime) form. The binary data format is described

	18 * in uspoof_impl.h

	19 */

	20

	21 #include "unicode/utypes.h"

	22 #include "unicode/uspoof.h"

	23

	24 #if !UCONFIG_NO_NORMALIZATION

	25

	26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS

	27

	28 #include "unicode/unorm.h"

	29 #include "unicode/uregex.h"

	30 #include "unicode/ustring.h"

	31 #include "cmemory.h"

	32 #include "uspoof_impl.h"

	33 #include "uhash.h"

	34 #include "uvector.h"

	35 #include "uassert.h"

	36 #include "uspoof_wsconf.h"

	37

	38 U_NAMESPACE_USE

	39

	40

	// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
	// Example Lines:
	//   006F          ; Latn; Deva; A # (o) LATIN SMALL LETTER O
	//   0048..0049    ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
	//    |               |     |    |
	//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
	//    |               |     |----------Target script.  We need this.
	//    |               |----------------Src script.  Should match the script of the source
	//    |                                code points.  Beyond checking that, we don't keep it.
	//    |--------------------------------Source code points or range.
	//
	// The expression will match _all_ lines, including erroneous lines.
	// The result of the parse is returned via the contents of the (match) groups.
	//
	// NOTE(review): the ".." range separator below is unescaped, so it accepts any two
	//               characters between the range end points.  Left as-is to keep the set of
	//               accepted inputs unchanged.
	static const char *parseExp =

	        "(?m)"                                         // Multi-line mode
	        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
	        "|^(?:"                                        //   OR
	        "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
	        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
	        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
	        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
	        "[ \\t]*(?:#.*?)?"                             // Trailing comment
	        ")$|"                                          //   OR
	        "^(.*?)$";                                     // An error line.      Group 8.
	                                                       //    Any line not matching the preceding
	                                                       //    parts of the expression will match
	                                                       //    this, and thus be flagged as an error

	69

	70

	71 // Extract a regular expression match group into a char * string.

	72 // The group must contain only invariant characters.

	73 // Used for script names

	74 //

	75 static void extractGroup(

	76 URegularExpression e, int32_t group, char destBuf, int32_t destCapacity, U ErrorCode &status) {

	77

	78 UChar ubuf[50];

	79 ubuf[0] = 0;

	80 destBuf[0] = 0;

	81 int32_t len = uregex_group(e, group, ubuf, 50, &status);

	82 if (U_FAILURE(status) \|\| len == -1 \|\| len >= destCapacity) {

	83 return;

	84 }

	85 UnicodeString s(FALSE, ubuf, len); // Aliasing constructor

	86 s.extract(0, len, destBuf, destCapacity, US_INV);

	87 }

	88

	89

	90

	91 // Build the Whole Script Confusable data

	92 //

	93 // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,

	94 // because everything is local to this one build functio n anyhow,

	95 // OR

	96 // break this function into more reasonably sized pieces , with

	97 // state in WSConfusableDataBuilder.

	98 //

	99 void buildWSConfusableData(SpoofImpl spImpl, const char confusablesWS,

	100 int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)

	101 {

	102 if (U_FAILURE(status)) {

	103 return;

	104 }

	105 URegularExpression *parseRegexp = NULL;

	106 int32_t inputLen = 0;

	107 UChar *input = NULL;

	108 int32_t lineNum = 0;

	109

	110 UVector *scriptSets = NULL;

	111 uint32_t rtScriptSetsCount = 2;

	112

	113 UTrie2 *anyCaseTrie = NULL;

	114 UTrie2 *lowerCaseTrie = NULL;

	115

	116 anyCaseTrie = utrie2_open(0, 0, &status);

	117 lowerCaseTrie = utrie2_open(0, 0, &status);

	118

	119

	120 // The scriptSets vector provides a mapping from TRIE values to the set of s cripts.

	121 //

	122 // Reserved TRIE values:

	123 // 0: Code point has no whole script confusables.

	124 // 1: Code point is of script Common or Inherited.

	125 // These code points do not participate in whole script confusable det ection.

	126 // (This is logically equivalent to saying that they contain confusabl es in

	127 // all scripts)

	128 //

	129 // Because Trie values are indexes into the ScriptSets vector, pre-fill

	130 // vector positions 0 and 1 to avoid conflicts with the reserved values.

	131

	132 scriptSets = new UVector(status);

	133 if (scriptSets == NULL) {

	134 status = U_MEMORY_ALLOCATION_ERROR;

	135 goto cleanup;

	136 }

	137 scriptSets->addElement((void *)NULL, status);

	138 scriptSets->addElement((void *)NULL, status);

	139

	140 // Convert the user input data from UTF-8 to UChar (UTF-16)

	141 u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);

	142 if (status != U_BUFFER_OVERFLOW_ERROR) {

	143 goto cleanup;

	144 }

	145 status = U_ZERO_ERROR;

	146 input = static_cast<UChar >(uprv_malloc((inputLen+1) sizeof(UChar)));

	147 if (input == NULL) {

	148 status = U_MEMORY_ALLOCATION_ERROR;

	149 goto cleanup;

	150 }

	151 u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &sta tus);

	152

	153

	154

	155 parseRegexp = uregex_openC(parseExp, 0, NULL, &status);

	156

	157 // Zap any Byte Order Mark at the start of input. Changing it to a space is benign

	158 // given the syntax of the input.

	159 if (*input == 0xfeff) {

	160 *input = 0x20;

	161 }

	162

	163 // Parse the input, one line per iteration of this loop.

	164 uregex_setText(parseRegexp, input, inputLen, &status);

	165 while (uregex_findNext(parseRegexp, &status)) {

	166 lineNum++;

	167 UChar line[200];

	168 uregex_group(parseRegexp, 0, line, 200, &status);

	169 if (uregex_start(parseRegexp, 1, &status) >= 0) {

	170 // this was a blank or comment line.

	171 continue;

	172 }

	173 if (uregex_start(parseRegexp, 8, &status) >= 0) {

	174 // input file syntax error.

	175 status = U_PARSE_ERROR;

	176 goto cleanup;

	177 }

	178 if (U_FAILURE(status)) {

	179 goto cleanup;

	180 }

	181

	182 // Pick up the start and optional range end code points from the parsed line.

	183 UChar32 startCodePoint = SpoofImpl::ScanHex(

	184 input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp , 2, &status), status);

	185 UChar32 endCodePoint = startCodePoint;

	186 if (uregex_start(parseRegexp, 3, &status) >=0) {

	187 endCodePoint = SpoofImpl::ScanHex(

	188 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRe gexp, 3, &status), status);

	189 }

	190

	191 // Extract the two script names from the source line. We need these in an 8 bit

	192 // default encoding (will be EBCDIC on IBM mainframes) in order to pas s them on

	193 // to the ICU u_getPropertyValueEnum() function. Ugh.

	194 char srcScriptName[20];

	195 char targScriptName[20];

	196 extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), statu s);

	197 extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), sta tus);

	198 UScriptCode srcScript =

	199 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScr iptName));

	200 UScriptCode targScript =

	201 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targSc riptName));

	202 if (U_FAILURE(status)) {

	203 goto cleanup;

	204 }

	205 if (srcScript == USCRIPT_INVALID_CODE \|\| targScript == USCRIPT_INVALID_C ODE) {

	206 status = U_INVALID_FORMAT_ERROR;

	207 goto cleanup;

	208 }

	209

	210 // select the table - (A) any case or (L) lower case only

	211 UTrie2 *table = anyCaseTrie;

	212 if (uregex_start(parseRegexp, 7, &status) >= 0) {

	213 table = lowerCaseTrie;

	214 }

	215

	216 // Build the set of scripts containing confusable characters for

	217 // the code point(s) specified in this input line.

	218 // Sanity check that the script of the source code point is the same

	219 // as the source script indicated in the input file. Failure of this check is

	220 // an error in the input file.

	221 // Include the source script in the set (needed for Mixed Script Confusa ble detection).

	222 //

	223 UChar32 cp;

	224 for (cp=startCodePoint; cp<=endCodePoint; cp++) {

	225 int32_t setIndex = utrie2_get32(table, cp);

	226 BuilderScriptSet *bsset = NULL;

	227 if (setIndex > 0) {

	228 U_ASSERT(setIndex < scriptSets->size());

	229 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(se tIndex));

	230 } else {

	231 bsset = new BuilderScriptSet();

	232 if (bsset == NULL) {

	233 status = U_MEMORY_ALLOCATION_ERROR;

	234 goto cleanup;

	235 }

	236 bsset->codePoint = cp;

	237 bsset->trie = table;

	238 bsset->sset = new ScriptSet();

	239 setIndex = scriptSets->size();

	240 bsset->index = setIndex;

	241 bsset->rindex = 0;

	242 if (bsset->sset == NULL) {

	243 status = U_MEMORY_ALLOCATION_ERROR;

	244 goto cleanup;

	245 }

	246 scriptSets->addElement(bsset, status);

	247 utrie2_set32(table, cp, setIndex, &status);

	248 }

	249 bsset->sset->Union(targScript);

	250 bsset->sset->Union(srcScript);

	251

	252 if (U_FAILURE(status)) {

	253 goto cleanup;

	254 }

	255 UScriptCode cpScript = uscript_getScript(cp, &status);

	256 if (cpScript != srcScript) {

	257 status = U_INVALID_FORMAT_ERROR;

	258 goto cleanup;

	259 }

	260 }

	261 }

	262

	263 // Eliminate duplicate script sets. At this point we have a separate

	264 // script set for every code point that had data in the input file.

	265 //

	266 // We eliminate underlying ScriptSet objects, not the BuildScriptSets that w rap them

	267 //

	268 // printf("Number of scriptSets: %d\n", scriptSets->size());

	269 {

	270 int32_t duplicateCount = 0;

	271 rtScriptSetsCount = 2;

	272 for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {

	273 BuilderScriptSet outerSet = static_cast<BuilderScriptSet >(scriptS ets->elementAt(outeri));

	274 if (outerSet->index != static_cast<uint32_t>(outeri)) {

	275 // This set was already identified as a duplicate.

	276 // It will not be allocated a position in the runtime array of ScriptSets.

	277 continue;

	278 }

	279 outerSet->rindex = rtScriptSetsCount++;

	280 for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {

	281 BuilderScriptSet innerSet = static_cast<BuilderScriptSet >(scr iptSets->elementAt(inneri));

	282 if ((outerSet->sset) == (innerSet->sset) && outerSet->sset != innerSet->sset) {

	283 delete innerSet->sset;

	284 innerSet->scriptSetOwned = FALSE;

	285 innerSet->sset = outerSet->sset;

	286 innerSet->index = outeri;

	287 innerSet->rindex = outerSet->rindex;

	288 duplicateCount++;

	289 }

	290 // But this doesn't get all. We need to fix the TRIE.

	291 }

	292 }

	293 // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);

	294 }

	295

	296

	297

	298 // Update the Trie values to be reflect the run time script indexes (after d uplicate merging).

	299 // (Trie Values 0 and 1 are reserved, and the corresponding slots in scri ptSets

	300 // are unused, which is why the loop index starts at 2.)

	301 {

	302 for (int32_t i=2; i<scriptSets->size(); i++) {

	303 BuilderScriptSet bSet = static_cast<BuilderScriptSet >(scriptSets- >elementAt(i));

	304 if (bSet->rindex != (uint32_t)i) {

	305 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status) ;

	306 }

	307 }

	308 }

	309

	310 // For code points with script==Common or script==Inherited,

	311 // Set the reserved value of 1 into both Tries. These characters do not p articipate

	312 // in Whole Script Confusable detection; this reserved value is the means

	313 // by which they are detected.

	314 {

	315 UnicodeSet ignoreSet;

	316 ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);

	317 UnicodeSet inheritedSet;

	318 inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, stat us);

	319 ignoreSet.addAll(inheritedSet);

	320 for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {

	321 UChar32 rangeStart = ignoreSet.getRangeStart(rn);

	322 UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);

	323 utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &sta tus);

	324 utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &sta tus);

	325 }

	326 }

	327

	328 // Serialize the data to the Spoof Detector

	329 {

	330 utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);

	331 int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);

	332 // printf("Any case Trie size: %d\n", size);

	333 if (status != U_BUFFER_OVERFLOW_ERROR) {

	334 goto cleanup;

	335 }

	336 status = U_ZERO_ERROR;

	337 spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLim it;

	338 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;

	339 spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;

	340 void *where = spImpl->fSpoofData->reserveSpace(size, status);

	341 utrie2_serialize(anyCaseTrie, where, size, &status);

	342

	343 utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);

	344 size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);

	345 // printf("Lower case Trie size: %d\n", size);

	346 if (status != U_BUFFER_OVERFLOW_ERROR) {

	347 goto cleanup;

	348 }

	349 status = U_ZERO_ERROR;

	350 spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemL imit;

	351 spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;

	352 spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;

	353 where = spImpl->fSpoofData->reserveSpace(size, status);

	354 utrie2_serialize(lowerCaseTrie, where, size, &status);

	355

	356 spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimi t;

	357 spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;

	358 ScriptSet rtScriptSets = static_cast<ScriptSet >

	359 (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptS et), status));

	360 uint32_t rindex = 2;

	361 for (int32_t i=2; i<scriptSets->size(); i++) {

	362 BuilderScriptSet bSet = static_cast<BuilderScriptSet >(scriptSets- >elementAt(i));

	363 if (bSet->rindex < rindex) {

	364 // We have already copied this script set to the serialized data .

	365 continue;

	366 }

	367 U_ASSERT(rindex == bSet->rindex);

	368 rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet j ust copies the bits.

	369 rindex++;

	370 }

	371 }

	372

	373 // Open new utrie2s from the serialized data. We don't want to keep the one s

	374 // we just built because we would then have two copies of the data, one in ternal to

	375 // the utries that we have already constructed, and one in the serialized data area.

	376 // An alternative would be to not pre-serialize the Trie data, but that ma kes the

	377 // spoof detector data different, depending on how the detector was constr ucted.

	378 // It's simpler to keep the data always the same.

	379

	380 spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(

	381 UTRIE2_16_VALUE_BITS,

	382 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRa wData->fAnyCaseTrie,

	383 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,

	384 NULL,

	385 &status);

	386

	387 spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(

	388 UTRIE2_16_VALUE_BITS,

	389 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRa wData->fLowerCaseTrie,

	390 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,

	391 NULL,

	392 &status);

	393

	394

	395

	396 cleanup:

	397 if (U_FAILURE(status)) {

	398 pe->line = lineNum;

	399 }

	400 uregex_close(parseRegexp);

	401 uprv_free(input);

	402

	403 int32_t i;

	404 for (i=0; i<scriptSets->size(); i++) {

	405 BuilderScriptSet bsset = static_cast<BuilderScriptSet >(scriptSets->el ementAt(i));

	406 delete bsset;

	407 }

	408 delete scriptSets;

	409 utrie2_close(anyCaseTrie);

	410 utrie2_close(lowerCaseTrie);

	411 return;

	412 }

	413

	414

	415

	416

	417

	418 BuilderScriptSet::BuilderScriptSet() {

	419 codePoint = -1;

	420 trie = NULL;

	421 sset = NULL;

	422 index = 0;

	423 rindex = 0;

	424 scriptSetOwned = TRUE;

	425 }

	426

	427 BuilderScriptSet::~BuilderScriptSet() {

	428 if (scriptSetOwned) {

	429 delete sset;

	430 }

	431 }

	432

	433 #endif

	434 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS

	435

OLD	NEW

« no previous file with comments | « icu46/source/i18n/uspoof_wsconf.h ('k') | icu46/source/i18n/usrchimp.h » ('j') | no next file with comments »