OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ****************************************************************************** |
| 3 * |
| 4 * Copyright (C) 2008-2009, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ****************************************************************************** |
| 8 * file name: uspoof_wsconf.cpp |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2009Jan05 (refactoring earlier files) |
| 14 * created by: Andy Heninger |
| 15 * |
| 16 * Internal functions for compiling Whole Script confusable source data |
| 17 * into its binary (runtime) form. The binary data format is described |
| 18 * in uspoof_impl.h |
| 19 */ |
| 20 |
| 21 #include "unicode/utypes.h" |
| 22 #include "unicode/uspoof.h" |
| 23 |
| 24 #if !UCONFIG_NO_NORMALIZATION |
| 25 |
| 26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 27 |
| 28 #include "unicode/unorm.h" |
| 29 #include "unicode/uregex.h" |
| 30 #include "unicode/ustring.h" |
| 31 #include "cmemory.h" |
| 32 #include "uspoof_impl.h" |
| 33 #include "uhash.h" |
| 34 #include "uvector.h" |
| 35 #include "uassert.h" |
| 36 #include "uspoof_wsconf.h" |
| 37 |
| 38 U_NAMESPACE_USE |
| 39 |
| 40 |
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F       ; Latn; Deva; A # (o)  LATIN SMALL LETTER O
//   0048..0049 ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |               |     |     |
//    |               |     |     |---- Which table, Any Case or Lower Case (A or L)
//    |               |     |---------- Target script.   We need this.
//    |               |---------------- Src script.  Should match the script of the source
//    |                                 code points.  Beyond checking that, we don't keep it.
//    |-------------------------------- Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups.
//
// The ".." range separator is escaped so that only two literal dots are accepted
// between the start and end code points; an unescaped ".." would match any two
// characters and let malformed ranges through as if they were valid.
static const char *parseExp =

        "(?m)"                                               // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                               // A blank or comment line.  Matches Group 1.
        "|^(?:"                                              //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;"   // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                               // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                               // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                                    // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                                   // Trailing comment
        ")$|"                                                //   OR
        "^(.*?)$";                                           // An error line.      Group 8.
                                                             //    Any line not matching the preceding
                                                             //    parts of the expression will match
                                                             //    this, and thus be flagged as an error
| 69 |
| 70 |
| 71 // Extract a regular expression match group into a char * string. |
| 72 // The group must contain only invariant characters. |
| 73 // Used for script names |
| 74 // |
| 75 static void extractGroup( |
| 76 URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, U
ErrorCode &status) { |
| 77 |
| 78 UChar ubuf[50]; |
| 79 ubuf[0] = 0; |
| 80 destBuf[0] = 0; |
| 81 int32_t len = uregex_group(e, group, ubuf, 50, &status); |
| 82 if (U_FAILURE(status) || len == -1 || len >= destCapacity) { |
| 83 return; |
| 84 } |
| 85 UnicodeString s(FALSE, ubuf, len); // Aliasing constructor |
| 86 s.extract(0, len, destBuf, destCapacity, US_INV); |
| 87 } |
| 88 |
| 89 |
| 90 |
| 91 // Build the Whole Script Confusable data |
| 92 // |
| 93 // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, |
| 94 // because everything is local to this one build functio
n anyhow, |
| 95 // OR |
| 96 // break this function into more reasonably sized pieces
, with |
| 97 // state in WSConfusableDataBuilder. |
| 98 // |
| 99 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, |
| 100 int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) |
| 101 { |
| 102 if (U_FAILURE(status)) { |
| 103 return; |
| 104 } |
| 105 URegularExpression *parseRegexp = NULL; |
| 106 int32_t inputLen = 0; |
| 107 UChar *input = NULL; |
| 108 int32_t lineNum = 0; |
| 109 |
| 110 UVector *scriptSets = NULL; |
| 111 uint32_t rtScriptSetsCount = 2; |
| 112 |
| 113 UTrie2 *anyCaseTrie = NULL; |
| 114 UTrie2 *lowerCaseTrie = NULL; |
| 115 |
| 116 anyCaseTrie = utrie2_open(0, 0, &status); |
| 117 lowerCaseTrie = utrie2_open(0, 0, &status); |
| 118 |
| 119 |
| 120 // The scriptSets vector provides a mapping from TRIE values to the set of s
cripts. |
| 121 // |
| 122 // Reserved TRIE values: |
| 123 // 0: Code point has no whole script confusables. |
| 124 // 1: Code point is of script Common or Inherited. |
| 125 // These code points do not participate in whole script confusable det
ection. |
| 126 // (This is logically equivalent to saying that they contain confusabl
es in |
| 127 // all scripts) |
| 128 // |
| 129 // Because Trie values are indexes into the ScriptSets vector, pre-fill |
| 130 // vector positions 0 and 1 to avoid conflicts with the reserved values. |
| 131 |
| 132 scriptSets = new UVector(status); |
| 133 if (scriptSets == NULL) { |
| 134 status = U_MEMORY_ALLOCATION_ERROR; |
| 135 goto cleanup; |
| 136 } |
| 137 scriptSets->addElement((void *)NULL, status); |
| 138 scriptSets->addElement((void *)NULL, status); |
| 139 |
| 140 // Convert the user input data from UTF-8 to UChar (UTF-16) |
| 141 u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); |
| 142 if (status != U_BUFFER_OVERFLOW_ERROR) { |
| 143 goto cleanup; |
| 144 } |
| 145 status = U_ZERO_ERROR; |
| 146 input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); |
| 147 if (input == NULL) { |
| 148 status = U_MEMORY_ALLOCATION_ERROR; |
| 149 goto cleanup; |
| 150 } |
| 151 u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &sta
tus); |
| 152 |
| 153 |
| 154 |
| 155 parseRegexp = uregex_openC(parseExp, 0, NULL, &status); |
| 156 |
| 157 // Zap any Byte Order Mark at the start of input. Changing it to a space is
benign |
| 158 // given the syntax of the input. |
| 159 if (*input == 0xfeff) { |
| 160 *input = 0x20; |
| 161 } |
| 162 |
| 163 // Parse the input, one line per iteration of this loop. |
| 164 uregex_setText(parseRegexp, input, inputLen, &status); |
| 165 while (uregex_findNext(parseRegexp, &status)) { |
| 166 lineNum++; |
| 167 UChar line[200]; |
| 168 uregex_group(parseRegexp, 0, line, 200, &status); |
| 169 if (uregex_start(parseRegexp, 1, &status) >= 0) { |
| 170 // this was a blank or comment line. |
| 171 continue; |
| 172 } |
| 173 if (uregex_start(parseRegexp, 8, &status) >= 0) { |
| 174 // input file syntax error. |
| 175 status = U_PARSE_ERROR; |
| 176 goto cleanup; |
| 177 } |
| 178 if (U_FAILURE(status)) { |
| 179 goto cleanup; |
| 180 } |
| 181 |
| 182 // Pick up the start and optional range end code points from the parsed
line. |
| 183 UChar32 startCodePoint = SpoofImpl::ScanHex( |
| 184 input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp
, 2, &status), status); |
| 185 UChar32 endCodePoint = startCodePoint; |
| 186 if (uregex_start(parseRegexp, 3, &status) >=0) { |
| 187 endCodePoint = SpoofImpl::ScanHex( |
| 188 input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRe
gexp, 3, &status), status); |
| 189 } |
| 190 |
| 191 // Extract the two script names from the source line. We need these in
an 8 bit |
| 192 // default encoding (will be EBCDIC on IBM mainframes) in order to pas
s them on |
| 193 // to the ICU u_getPropertyValueEnum() function. Ugh. |
| 194 char srcScriptName[20]; |
| 195 char targScriptName[20]; |
| 196 extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), statu
s); |
| 197 extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), sta
tus); |
| 198 UScriptCode srcScript = |
| 199 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScr
iptName)); |
| 200 UScriptCode targScript = |
| 201 static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targSc
riptName)); |
| 202 if (U_FAILURE(status)) { |
| 203 goto cleanup; |
| 204 } |
| 205 if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_C
ODE) { |
| 206 status = U_INVALID_FORMAT_ERROR; |
| 207 goto cleanup; |
| 208 } |
| 209 |
| 210 // select the table - (A) any case or (L) lower case only |
| 211 UTrie2 *table = anyCaseTrie; |
| 212 if (uregex_start(parseRegexp, 7, &status) >= 0) { |
| 213 table = lowerCaseTrie; |
| 214 } |
| 215 |
| 216 // Build the set of scripts containing confusable characters for |
| 217 // the code point(s) specified in this input line. |
| 218 // Sanity check that the script of the source code point is the same |
| 219 // as the source script indicated in the input file. Failure of this
check is |
| 220 // an error in the input file. |
| 221 // Include the source script in the set (needed for Mixed Script Confusa
ble detection). |
| 222 // |
| 223 UChar32 cp; |
| 224 for (cp=startCodePoint; cp<=endCodePoint; cp++) { |
| 225 int32_t setIndex = utrie2_get32(table, cp); |
| 226 BuilderScriptSet *bsset = NULL; |
| 227 if (setIndex > 0) { |
| 228 U_ASSERT(setIndex < scriptSets->size()); |
| 229 bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(se
tIndex)); |
| 230 } else { |
| 231 bsset = new BuilderScriptSet(); |
| 232 if (bsset == NULL) { |
| 233 status = U_MEMORY_ALLOCATION_ERROR; |
| 234 goto cleanup; |
| 235 } |
| 236 bsset->codePoint = cp; |
| 237 bsset->trie = table; |
| 238 bsset->sset = new ScriptSet(); |
| 239 setIndex = scriptSets->size(); |
| 240 bsset->index = setIndex; |
| 241 bsset->rindex = 0; |
| 242 if (bsset->sset == NULL) { |
| 243 status = U_MEMORY_ALLOCATION_ERROR; |
| 244 goto cleanup; |
| 245 } |
| 246 scriptSets->addElement(bsset, status); |
| 247 utrie2_set32(table, cp, setIndex, &status); |
| 248 } |
| 249 bsset->sset->Union(targScript); |
| 250 bsset->sset->Union(srcScript); |
| 251 |
| 252 if (U_FAILURE(status)) { |
| 253 goto cleanup; |
| 254 } |
| 255 UScriptCode cpScript = uscript_getScript(cp, &status); |
| 256 if (cpScript != srcScript) { |
| 257 status = U_INVALID_FORMAT_ERROR; |
| 258 goto cleanup; |
| 259 } |
| 260 } |
| 261 } |
| 262 |
| 263 // Eliminate duplicate script sets. At this point we have a separate |
| 264 // script set for every code point that had data in the input file. |
| 265 // |
| 266 // We eliminate underlying ScriptSet objects, not the BuildScriptSets that w
rap them |
| 267 // |
| 268 // printf("Number of scriptSets: %d\n", scriptSets->size()); |
| 269 { |
| 270 int32_t duplicateCount = 0; |
| 271 rtScriptSetsCount = 2; |
| 272 for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { |
| 273 BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptS
ets->elementAt(outeri)); |
| 274 if (outerSet->index != static_cast<uint32_t>(outeri)) { |
| 275 // This set was already identified as a duplicate. |
| 276 // It will not be allocated a position in the runtime array of
ScriptSets. |
| 277 continue; |
| 278 } |
| 279 outerSet->rindex = rtScriptSetsCount++; |
| 280 for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { |
| 281 BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scr
iptSets->elementAt(inneri)); |
| 282 if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset !=
innerSet->sset) { |
| 283 delete innerSet->sset; |
| 284 innerSet->scriptSetOwned = FALSE; |
| 285 innerSet->sset = outerSet->sset; |
| 286 innerSet->index = outeri; |
| 287 innerSet->rindex = outerSet->rindex; |
| 288 duplicateCount++; |
| 289 } |
| 290 // But this doesn't get all. We need to fix the TRIE. |
| 291 } |
| 292 } |
| 293 // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); |
| 294 } |
| 295 |
| 296 |
| 297 |
| 298 // Update the Trie values to be reflect the run time script indexes (after d
uplicate merging). |
| 299 // (Trie Values 0 and 1 are reserved, and the corresponding slots in scri
ptSets |
| 300 // are unused, which is why the loop index starts at 2.) |
| 301 { |
| 302 for (int32_t i=2; i<scriptSets->size(); i++) { |
| 303 BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets-
>elementAt(i)); |
| 304 if (bSet->rindex != (uint32_t)i) { |
| 305 utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status)
; |
| 306 } |
| 307 } |
| 308 } |
| 309 |
| 310 // For code points with script==Common or script==Inherited, |
| 311 // Set the reserved value of 1 into both Tries. These characters do not p
articipate |
| 312 // in Whole Script Confusable detection; this reserved value is the means |
| 313 // by which they are detected. |
| 314 { |
| 315 UnicodeSet ignoreSet; |
| 316 ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); |
| 317 UnicodeSet inheritedSet; |
| 318 inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, stat
us); |
| 319 ignoreSet.addAll(inheritedSet); |
| 320 for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { |
| 321 UChar32 rangeStart = ignoreSet.getRangeStart(rn); |
| 322 UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); |
| 323 utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &sta
tus); |
| 324 utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &sta
tus); |
| 325 } |
| 326 } |
| 327 |
| 328 // Serialize the data to the Spoof Detector |
| 329 { |
| 330 utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); |
| 331 int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); |
| 332 // printf("Any case Trie size: %d\n", size); |
| 333 if (status != U_BUFFER_OVERFLOW_ERROR) { |
| 334 goto cleanup; |
| 335 } |
| 336 status = U_ZERO_ERROR; |
| 337 spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLim
it; |
| 338 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; |
| 339 spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; |
| 340 void *where = spImpl->fSpoofData->reserveSpace(size, status); |
| 341 utrie2_serialize(anyCaseTrie, where, size, &status); |
| 342 |
| 343 utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); |
| 344 size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); |
| 345 // printf("Lower case Trie size: %d\n", size); |
| 346 if (status != U_BUFFER_OVERFLOW_ERROR) { |
| 347 goto cleanup; |
| 348 } |
| 349 status = U_ZERO_ERROR; |
| 350 spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemL
imit; |
| 351 spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; |
| 352 spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; |
| 353 where = spImpl->fSpoofData->reserveSpace(size, status); |
| 354 utrie2_serialize(lowerCaseTrie, where, size, &status); |
| 355 |
| 356 spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimi
t; |
| 357 spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; |
| 358 ScriptSet *rtScriptSets = static_cast<ScriptSet *> |
| 359 (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptS
et), status)); |
| 360 uint32_t rindex = 2; |
| 361 for (int32_t i=2; i<scriptSets->size(); i++) { |
| 362 BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets-
>elementAt(i)); |
| 363 if (bSet->rindex < rindex) { |
| 364 // We have already copied this script set to the serialized data
. |
| 365 continue; |
| 366 } |
| 367 U_ASSERT(rindex == bSet->rindex); |
| 368 rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet j
ust copies the bits. |
| 369 rindex++; |
| 370 } |
| 371 } |
| 372 |
| 373 // Open new utrie2s from the serialized data. We don't want to keep the one
s |
| 374 // we just built because we would then have two copies of the data, one in
ternal to |
| 375 // the utries that we have already constructed, and one in the serialized
data area. |
| 376 // An alternative would be to not pre-serialize the Trie data, but that ma
kes the |
| 377 // spoof detector data different, depending on how the detector was constr
ucted. |
| 378 // It's simpler to keep the data always the same. |
| 379 |
| 380 spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( |
| 381 UTRIE2_16_VALUE_BITS, |
| 382 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRa
wData->fAnyCaseTrie, |
| 383 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, |
| 384 NULL, |
| 385 &status); |
| 386 |
| 387 spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( |
| 388 UTRIE2_16_VALUE_BITS, |
| 389 (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRa
wData->fLowerCaseTrie, |
| 390 spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, |
| 391 NULL, |
| 392 &status); |
| 393 |
| 394 |
| 395 |
| 396 cleanup: |
| 397 if (U_FAILURE(status)) { |
| 398 pe->line = lineNum; |
| 399 } |
| 400 uregex_close(parseRegexp); |
| 401 uprv_free(input); |
| 402 |
| 403 int32_t i; |
| 404 for (i=0; i<scriptSets->size(); i++) { |
| 405 BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->el
ementAt(i)); |
| 406 delete bsset; |
| 407 } |
| 408 delete scriptSets; |
| 409 utrie2_close(anyCaseTrie); |
| 410 utrie2_close(lowerCaseTrie); |
| 411 return; |
| 412 } |
| 413 |
| 414 |
| 415 |
| 416 |
| 417 |
| 418 BuilderScriptSet::BuilderScriptSet() { |
| 419 codePoint = -1; |
| 420 trie = NULL; |
| 421 sset = NULL; |
| 422 index = 0; |
| 423 rindex = 0; |
| 424 scriptSetOwned = TRUE; |
| 425 } |
| 426 |
| 427 BuilderScriptSet::~BuilderScriptSet() { |
| 428 if (scriptSetOwned) { |
| 429 delete sset; |
| 430 } |
| 431 } |
| 432 |
| 433 #endif |
| 434 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
| 435 |
OLD | NEW |