| Index: icu46/source/i18n/uspoof_wsconf.cpp
|
| ===================================================================
|
| --- icu46/source/i18n/uspoof_wsconf.cpp (revision 0)
|
| +++ icu46/source/i18n/uspoof_wsconf.cpp (revision 0)
|
| @@ -0,0 +1,435 @@
|
| +/*
|
| +******************************************************************************
|
| +*
|
| +* Copyright (C) 2008-2009, International Business Machines
|
| +* Corporation and others. All Rights Reserved.
|
| +*
|
| +******************************************************************************
|
| +* file name: uspoof_wsconf.cpp
|
| +* encoding: US-ASCII
|
| +* tab size: 8 (not used)
|
| +* indentation:4
|
| +*
|
| +* created on: 2009Jan05 (refactoring earlier files)
|
| +* created by: Andy Heninger
|
| +*
|
| +* Internal functions for compiling Whole Script confusable source data
|
| +* into its binary (runtime) form. The binary data format is described
|
| +* in uspoof_impl.h
|
| +*/
|
| +
|
| +#include "unicode/utypes.h"
|
| +#include "unicode/uspoof.h"
|
| +
|
| +#if !UCONFIG_NO_NORMALIZATION
|
| +
|
| +#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
| +
|
| +#include "unicode/unorm.h"
|
| +#include "unicode/uregex.h"
|
| +#include "unicode/ustring.h"
|
| +#include "cmemory.h"
|
| +#include "uspoof_impl.h"
|
| +#include "uhash.h"
|
| +#include "uvector.h"
|
| +#include "uassert.h"
|
| +#include "uspoof_wsconf.h"
|
| +
|
| +U_NAMESPACE_USE
|
| +
|
| +
|
| +// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
|
| +// Example Lines:
|
| +//   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
|
| +//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
|
| +//    |               |     |    |
|
| +//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
|
| +//    |               |     |----------Target script.   We need this.
|
| +//    |               |----------------Src script.  Should match the script of the source
|
| +//    |                                code points.  Beyond checking that, we don't keep it.
|
| +//    |--------------------------------Source code points or range.
|
| +//
|
| +// The expression will match _all_ lines, including erroneous lines.
|
| +// The result of the parse is returned via the contents of the (match) groups.
|
| +static const char *parseExp =
|
| +        // The ".." range separator is escaped so that only two literal dots match.
|
| +        "(?m)"                                         // Multi-line mode
|
| +        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
|
| +        "|^(?:"                                        //   OR
|
| +        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
|
| +        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
|
| +        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
|
| +        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
|
| +        "[ \\t]*(?:#.*?)?"                             // Trailing comment
|
| +        ")$|"                                          //   OR
|
| +        "^(.*?)$";                                     // An error line.      Group 8.
|
| +                                                       //    Any line not matching the preceding
|
| +                                                       //    parts of the expression will match
|
| +                                                       //    this, and thus be flagged as an error.
|
| +
|
| +
|
| +// Copy the text of one regular expression match group into a char * buffer.
|
| +// The group must contain only invariant characters.
|
| +// Used for script names.  destBuf is set to the empty string up front, and is
|
| +// left that way if the group cannot be fetched or does not fit in destCapacity.
|
| +static void extractGroup(
|
| +    URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
|
| +    UChar groupText[50];
|
| +    groupText[0] = 0;
|
| +    destBuf[0] = 0;
|
| +    const int32_t groupLen = uregex_group(e, group, groupText, 50, &status);
|
| +    const UBool usable = U_SUCCESS(status) && groupLen != -1 && groupLen < destCapacity;
|
| +    if (usable) {
|
| +        // Aliasing constructor over groupText, then convert as invariant characters.
|
| +        UnicodeString alias(FALSE, groupText, groupLen);
|
| +        alias.extract(0, groupLen, destBuf, destCapacity, US_INV);
|
| +    }
|
| +}
|
| +
|
| +
|
| +
|
| +// Build the Whole Script Confusable data.
|
| +//
|
| +// Parses confusablesWS (UTF-8 text, one entry per line in the format matched by
|
| +// parseExp), builds the any-case and lower-case tries plus the script sets that
|
| +// the trie values index, and serializes the result into spImpl->fSpoofData.
|
| +//
|
| +// On failure, status holds the error and, when pe is not NULL, pe->line is set to
|
| +// the number of input lines scanned so far.  Does nothing if status is already a
|
| +// failure on entry.
|
| +//
|
| +// TODO: Reorganize.  Either get rid of the WSConfusableDataBuilder class, because
|
| +//       everything is local to this one build function anyhow, OR break this function
|
| +//       into more reasonably sized pieces, with state in WSConfusableDataBuilder.
|
| +//
|
| +void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
|
| +          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
|
| +{
|
| +    if (U_FAILURE(status)) {
|
| +        return;
|
| +    }
|
| +    URegularExpression *parseRegexp = NULL;
|
| +    int32_t             inputLen    = 0;
|
| +    UChar              *input       = NULL;
|
| +    int32_t             lineNum     = 0;
|
| +
|
| +    UVector            *scriptSets  = NULL;
|
| +    uint32_t            rtScriptSetsCount = 2;
|
| +
|
| +    UTrie2             *anyCaseTrie = NULL;
|
| +    UTrie2             *lowerCaseTrie = NULL;
|
| +
|
| +    anyCaseTrie = utrie2_open(0, 0, &status);
|
| +    lowerCaseTrie = utrie2_open(0, 0, &status);
|
| +
|
| +    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
|
| +    //
|
| +    // Reserved TRIE values:
|
| +    //   0:  Code point has no whole script confusables.
|
| +    //   1:  Code point is of script Common or Inherited.
|
| +    //       These code points do not participate in whole script confusable detection.
|
| +    //       (This is logically equivalent to saying that they contain confusables in
|
| +    //        all scripts)
|
| +    //
|
| +    // Because Trie values are indexes into the ScriptSets vector, pre-fill
|
| +    // vector positions 0 and 1 to avoid conflicts with the reserved values.
|
| +
|
| +    scriptSets = new UVector(status);
|
| +    if (scriptSets == NULL) {
|
| +        status = U_MEMORY_ALLOCATION_ERROR;
|
| +        goto cleanup;
|
| +    }
|
| +    scriptSets->addElement((void *)NULL, status);
|
| +    scriptSets->addElement((void *)NULL, status);
|
| +
|
| +    // Convert the user input data from UTF-8 to UChar (UTF-16)
|
| +    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
|
| +    if (status != U_BUFFER_OVERFLOW_ERROR) {
|
| +        goto cleanup;
|
| +    }
|
| +    status = U_ZERO_ERROR;
|
| +    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
|
| +    if (input == NULL) {
|
| +        status = U_MEMORY_ALLOCATION_ERROR;
|
| +        goto cleanup;
|
| +    }
|
| +    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
|
| +
|
| +    parseRegexp = uregex_openC(parseExp, 0, NULL, &status);
|
| +
|
| +    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
|
| +    //   given the syntax of the input.
|
| +    if (*input == 0xfeff) {
|
| +        *input = 0x20;
|
| +    }
|
| +
|
| +    // Parse the input, one line per iteration of this loop.
|
| +    uregex_setText(parseRegexp, input, inputLen, &status);
|
| +    while (uregex_findNext(parseRegexp, &status)) {
|
| +        lineNum++;
|
| +        if (uregex_start(parseRegexp, 1, &status) >= 0) {
|
| +            // this was a blank or comment line.
|
| +            continue;
|
| +        }
|
| +        if (uregex_start(parseRegexp, 8, &status) >= 0) {
|
| +            // input file syntax error.
|
| +            status = U_PARSE_ERROR;
|
| +            goto cleanup;
|
| +        }
|
| +        if (U_FAILURE(status)) {
|
| +            goto cleanup;
|
| +        }
|
| +
|
| +        // Pick up the start and optional range end code points from the parsed line.
|
| +        UChar32  startCodePoint = SpoofImpl::ScanHex(
|
| +            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
|
| +        UChar32  endCodePoint = startCodePoint;
|
| +        if (uregex_start(parseRegexp, 3, &status) >=0) {
|
| +            endCodePoint = SpoofImpl::ScanHex(
|
| +                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
|
| +        }
|
| +
|
| +        // Extract the two script names from the source line.  We need these in an 8 bit
|
| +        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
|
| +        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
|
| +        char  srcScriptName[20];
|
| +        char  targScriptName[20];
|
| +        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
|
| +        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
|
| +        UScriptCode srcScript  =
|
| +            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
|
| +        UScriptCode targScript =
|
| +            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
|
| +        if (U_FAILURE(status)) {
|
| +            goto cleanup;
|
| +        }
|
| +        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
|
| +            status = U_INVALID_FORMAT_ERROR;
|
| +            goto cleanup;
|
| +        }
|
| +
|
| +        // select the table - (A) any case or (L) lower case only
|
| +        UTrie2 *table = anyCaseTrie;
|
| +        if (uregex_start(parseRegexp, 7, &status) >= 0) {
|
| +            table = lowerCaseTrie;
|
| +        }
|
| +
|
| +        // Build the set of scripts containing confusable characters for
|
| +        //   the code point(s) specified in this input line.
|
| +        // Sanity check that the script of the source code point is the same
|
| +        //   as the source script indicated in the input file.  Failure of this check is
|
| +        //   an error in the input file.
|
| +        // Include the source script in the set (needed for Mixed Script Confusable detection).
|
| +        //
|
| +        UChar32 cp;
|
| +        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
|
| +            int32_t setIndex = utrie2_get32(table, cp);
|
| +            BuilderScriptSet *bsset = NULL;
|
| +            if (setIndex > 0) {
|
| +                U_ASSERT(setIndex < scriptSets->size());
|
| +                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
|
| +            } else {
|
| +                bsset = new BuilderScriptSet();
|
| +                if (bsset == NULL) {
|
| +                    status = U_MEMORY_ALLOCATION_ERROR;
|
| +                    goto cleanup;
|
| +                }
|
| +                bsset->codePoint = cp;
|
| +                bsset->trie = table;
|
| +                bsset->sset = new ScriptSet();
|
| +                setIndex = scriptSets->size();
|
| +                bsset->index = setIndex;
|
| +                bsset->rindex = 0;
|
| +                if (bsset->sset == NULL) {
|
| +                    status = U_MEMORY_ALLOCATION_ERROR;
|
| +                } else {
|
| +                    scriptSets->addElement(bsset, status);
|
| +                }
|
| +                if (U_FAILURE(status)) {
|
| +                    // bsset never reached scriptSets, so cleanup would not free it.
|
| +                    delete bsset;
|
| +                    goto cleanup;
|
| +                }
|
| +                utrie2_set32(table, cp, setIndex, &status);
|
| +            }
|
| +            bsset->sset->Union(targScript);
|
| +            bsset->sset->Union(srcScript);
|
| +
|
| +            if (U_FAILURE(status)) {
|
| +                goto cleanup;
|
| +            }
|
| +            UScriptCode cpScript = uscript_getScript(cp, &status);
|
| +            if (cpScript != srcScript) {
|
| +                status = U_INVALID_FORMAT_ERROR;
|
| +                goto cleanup;
|
| +            }
|
| +        }
|
| +    }
|
| +
|
| +    // Eliminate duplicate script sets.  At this point we have a separate
|
| +    // script set for every code point that had data in the input file.
|
| +    //
|
| +    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
|
| +    //
|
| +    {
|
| +        rtScriptSetsCount = 2;
|
| +        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
|
| +            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
|
| +            if (outerSet->index != static_cast<uint32_t>(outeri)) {
|
| +                // This set was already identified as a duplicate.
|
| +                //   It will not be allocated a position in the runtime array of ScriptSets.
|
| +                continue;
|
| +            }
|
| +            outerSet->rindex = rtScriptSetsCount++;
|
| +            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
|
| +                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
|
| +                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
|
| +                    delete innerSet->sset;
|
| +                    innerSet->scriptSetOwned = FALSE;
|
| +                    innerSet->sset = outerSet->sset;
|
| +                    innerSet->index = outeri;
|
| +                    innerSet->rindex = outerSet->rindex;
|
| +                }
|
| +                // But this doesn't get all.  We need to fix the TRIE.
|
| +            }
|
| +        }
|
| +    }
|
| +
|
| +    // Update the Trie values to reflect the run time script indexes (after duplicate merging).
|
| +    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
|
| +    //     are unused, which is why the loop index starts at 2.)
|
| +    {
|
| +        for (int32_t i=2; i<scriptSets->size(); i++) {
|
| +            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
| +            if (bSet->rindex != (uint32_t)i) {
|
| +                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
|
| +            }
|
| +        }
|
| +    }
|
| +
|
| +    // For code points with script==Common or script==Inherited,
|
| +    //   Set the reserved value of 1 into both Tries.  These characters do not participate
|
| +    //   in Whole Script Confusable detection; this reserved value is the means
|
| +    //   by which they are detected.
|
| +    {
|
| +        UnicodeSet ignoreSet;
|
| +        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
|
| +        UnicodeSet inheritedSet;
|
| +        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
|
| +        ignoreSet.addAll(inheritedSet);
|
| +        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
|
| +            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
|
| +            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
|
| +            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
|
| +            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
|
| +        }
|
| +    }
|
| +
|
| +    // Serialize the data to the Spoof Detector
|
| +    {
|
| +        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
|
| +        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
|
| +        if (status != U_BUFFER_OVERFLOW_ERROR) {
|
| +            goto cleanup;
|
| +        }
|
| +        status = U_ZERO_ERROR;
|
| +        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
|
| +        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
|
| +        // NOTE(review): fSpoofData now points at anyCaseTrie, which cleanup closes; a failure
|
| +        // before the re-open below would leave that pointer stale -- confirm this is handled.
|
| +        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
|
| +        void *where = spImpl->fSpoofData->reserveSpace(size, status);
|
| +        utrie2_serialize(anyCaseTrie, where, size, &status);
|
| +
|
| +        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
|
| +        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
|
| +        if (status != U_BUFFER_OVERFLOW_ERROR) {
|
| +            goto cleanup;
|
| +        }
|
| +        status = U_ZERO_ERROR;
|
| +        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
|
| +        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
|
| +        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
|
| +        where = spImpl->fSpoofData->reserveSpace(size, status);
|
| +        utrie2_serialize(lowerCaseTrie, where, size, &status);
|
| +
|
| +        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
|
| +        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
|
| +        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
|
| +            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
|
| +        uint32_t rindex = 2;
|
| +        // The loop is skipped on failure: rtScriptSets may not be usable if reserveSpace() failed.
|
| +        for (int32_t i=2; U_SUCCESS(status) && i<scriptSets->size(); i++) {
|
| +            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
| +            if (bSet->rindex < rindex) {
|
| +                // We have already copied this script set to the serialized data.
|
| +                continue;
|
| +            }
|
| +            U_ASSERT(rindex == bSet->rindex);
|
| +            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
|
| +            rindex++;
|
| +        }
|
| +    }
|
| +
|
| +    // Open new utrie2s from the serialized data.  We don't want to keep the ones
|
| +    //   we just built because we would then have two copies of the data, one internal to
|
| +    //   the utries that we have already constructed, and one in the serialized data area.
|
| +    //   An alternative would be to not pre-serialize the Trie data, but that makes the
|
| +    //   spoof detector data different, depending on how the detector was constructed.
|
| +    //   It's simpler to keep the data always the same.
|
| +
|
| +    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
|
| +        UTRIE2_16_VALUE_BITS,
|
| +        (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
|
| +        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
|
| +        NULL,
|
| +        &status);
|
| +
|
| +    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
|
| +        UTRIE2_16_VALUE_BITS,
|
| +        (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
|
| +        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength,
|
| +        NULL,
|
| +        &status);
|
| +
|
| +cleanup:
|
| +    if (U_FAILURE(status) && pe != NULL) {
|
| +        pe->line = lineNum;
|
| +    }
|
| +    uregex_close(parseRegexp);
|
| +    uprv_free(input);
|
| +
|
| +    if (scriptSets != NULL) {
|
| +        for (int32_t i=0; i<scriptSets->size(); i++) {
|
| +            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
| +            delete bsset;
|
| +        }
|
| +        delete scriptSets;
|
| +    }
|
| +    utrie2_close(anyCaseTrie);
|
| +    utrie2_close(lowerCaseTrie);
|
| +    return;
|
| +}
|
| +
|
| +
|
| +
|
| +
|
| +
|
| +BuilderScriptSet::BuilderScriptSet() {
|
| +    // Starts as owner of a not-yet-allocated ScriptSet; the destructor checks scriptSetOwned.
|
| +    scriptSetOwned = TRUE;
|
| +    sset = NULL;
|
| +    trie = NULL;
|
| +    codePoint = -1;
|
| +    index = rindex = 0;
|
| +}
|
| +
|
| +BuilderScriptSet::~BuilderScriptSet() {
|
| +    // Free the ScriptSet only if this wrapper still owns it; deleting NULL is a no-op.
|
| +    ScriptSet *owned = scriptSetOwned ? sset : NULL;
|
| +    delete owned;
|
| +}
|
| +
|
| +#endif
|
| +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
| +
|
|
|
| Property changes on: icu46/source/i18n/uspoof_wsconf.cpp
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|