OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2010, International Business Machines Corporation |
| 4 * and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 */ |
| 7 /** |
| 8 * IntlTestSpoof tests for USpoofDetector |
| 9 */ |
| 10 |
| 11 #include "unicode/utypes.h" |
| 12 |
| 13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_
FILE_IO |
| 14 |
| 15 #include "itspoof.h" |
| 16 #include "unicode/uspoof.h" |
| 17 #include "unicode/unistr.h" |
| 18 #include "unicode/regex.h" |
| 19 #include "unicode/normlzr.h" |
| 20 #include "cstring.h" |
| 21 #include <stdlib.h> |
| 22 #include <stdio.h> |
| 23 |
// Test assertion helpers.
//
// Each macro expands to one do { ... } while (0) statement, so it behaves like
// an ordinary statement (safe in an unbraced if/else) and must be followed by
// a semicolon. The macros call errln() / errcheckln(), so those names must be
// callable where the macro is used.

// Report a failure when "status" holds an error code.
#define TEST_ASSERT_SUCCESS(status) do { if (U_FAILURE(status)) { \
    errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status)); \
} } while (0)

// Report a failure when "expr" is false.
#define TEST_ASSERT(expr) do { if ((expr)==FALSE) { \
    errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr); \
} } while (0)

// Report a failure when the two int-valued operands differ.
// Each operand is evaluated exactly once, so an operand with side effects
// (for example an API call) is not run a second time to format the message.
#define TEST_ASSERT_EQ(a, b) do { \
    const int testAssertLhs_ = (a); \
    const int testAssertRhs_ = (b); \
    if (testAssertLhs_ != testAssertRhs_) { \
        errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
              __FILE__, __LINE__, #a, testAssertLhs_, #b, testAssertRhs_); \
    } \
} while (0)

// Report a failure when the two int-valued operands are equal.
// Each operand is evaluated exactly once (see TEST_ASSERT_EQ).
#define TEST_ASSERT_NE(a, b) do { \
    const int testAssertLhs_ = (a); \
    const int testAssertRhs_ = (b); \
    if (testAssertLhs_ == testAssertRhs_) { \
        errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
              __FILE__, __LINE__, #a, testAssertLhs_, #b, testAssertRhs_); \
    } \
} while (0)
| 37 |
/*
 *  TEST_SETUP / TEST_TEARDOWN
 *      Paired macros holding the boilerplate shared by the test cases.
 *      TEST_SETUP opens a scope, declares "status", opens the USpoofChecker
 *      "sc" and reports an error if the open failed. The code written between
 *      the two macros runs only when the open succeeded.
 *      TEST_TEARDOWN reports any error left in "status", closes "sc" and
 *      closes the scope.
 *      The macros must be used together: each leaves braces for the other
 *      one to balance.
 */
#define TEST_SETUP {  \
    UErrorCode status = U_ZERO_ERROR; \
    USpoofChecker *sc = uspoof_open(&status);  \
    TEST_ASSERT_SUCCESS(status);   \
    if (U_SUCCESS(status)){

#define TEST_TEARDOWN  \
    }  \
    TEST_ASSERT_SUCCESS(status);  \
    uspoof_close(sc);  \
}
| 56 |
| 57 |
| 58 |
| 59 |
| 60 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name
, char* /*par*/ ) |
| 61 { |
| 62 if (exec) logln("TestSuite spoof: "); |
| 63 switch (index) { |
| 64 case 0: |
| 65 name = "TestSpoofAPI"; |
| 66 if (exec) { |
| 67 testSpoofAPI(); |
| 68 } |
| 69 break; |
| 70 case 1: |
| 71 name = "TestSkeleton"; |
| 72 if (exec) { |
| 73 testSkeleton(); |
| 74 } |
| 75 break; |
| 76 case 2: |
| 77 name = "TestAreConfusable"; |
| 78 if (exec) { |
| 79 testAreConfusable(); |
| 80 } |
| 81 break; |
| 82 case 3: |
| 83 name = "TestInvisible"; |
| 84 if (exec) { |
| 85 testInvisible(); |
| 86 } |
| 87 break; |
| 88 case 4: |
| 89 name = "testConfData"; |
| 90 if (exec) { |
| 91 testConfData(); |
| 92 } |
| 93 break; |
| 94 default: name=""; break; |
| 95 } |
| 96 } |
| 97 |
| 98 void IntlTestSpoof::testSpoofAPI() { |
| 99 |
| 100 TEST_SETUP |
| 101 UnicodeString s("xyz"); // Many latin ranges are whole-script confusabl
e with other scripts. |
| 102 // If this test starts failing, consult confusa
blesWholeScript.txt |
| 103 int32_t position = 666; |
| 104 int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &stat
us); |
| 105 TEST_ASSERT_SUCCESS(status); |
| 106 TEST_ASSERT_EQ(0, checkResults); |
| 107 TEST_ASSERT_EQ(666, position); |
| 108 TEST_TEARDOWN; |
| 109 |
| 110 TEST_SETUP |
| 111 UnicodeString s1("cxs"); |
| 112 UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); /
/ Cyrillic "cxs" |
| 113 int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &st
atus); |
| 114 TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONF
USABLE, checkResults); |
| 115 |
| 116 TEST_TEARDOWN; |
| 117 |
| 118 TEST_SETUP |
| 119 UnicodeString s("I1l0O"); |
| 120 UnicodeString dest; |
| 121 UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_C
ASE, s, dest, &status); |
| 122 TEST_ASSERT_SUCCESS(status); |
| 123 TEST_ASSERT(UnicodeString("lllOO") == dest); |
| 124 TEST_ASSERT(&dest == &retStr); |
| 125 TEST_TEARDOWN; |
| 126 } |
| 127 |
| 128 |
| 129 #define CHECK_SKELETON(type, input, expected) { \ |
| 130 checkSkeleton(sc, type, input, expected, __LINE__); \ |
| 131 } |
| 132 |
| 133 |
| 134 // testSkeleton. Spot check a number of confusable skeleton substitutions from
the |
| 135 // Unicode data file confusables.txt |
| 136 // Test cases chosen for substitutions of various lengths, and |
| 137 // membership in different mapping tables. |
| 138 void IntlTestSpoof::testSkeleton() { |
| 139 const uint32_t ML = 0; |
| 140 const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE; |
| 141 const uint32_t MA = USPOOF_ANY_CASE; |
| 142 const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; |
| 143 |
| 144 TEST_SETUP |
| 145 // A long "identifier" that will overflow implementation stack buffers,
forcing heap allocations. |
| 146 CHECK_SKELETON(SL, " A 1ong \\u02b9identifier' that will overflow implem
entation stack buffers, forcing heap allocations." |
| 147 " A 1ong 'identifier' that will overflow implementati
on stack buffers, forcing heap allocations." |
| 148 " A 1ong 'identifier' that will overflow implementati
on stack buffers, forcing heap allocations." |
| 149 " A 1ong 'identifier' that will overflow implementati
on stack buffers, forcing heap allocations.", |
| 150 |
| 151 " A long 'identifier' that vvill overflovv irnplernentation stack
buffers, forcing heap allocations." |
| 152 " A long 'identifier' that vvill overflovv irnplernentation stack
buffers, forcing heap allocations." |
| 153 " A long 'identifier' that vvill overflovv irnplernentation stack
buffers, forcing heap allocations." |
| 154 " A long 'identifier' that vvill overflovv irnplernentation stack
buffers, forcing heap allocations.") |
| 155 |
| 156 // FC5F ; FE74 0651 ; ML #* ARABIC LIGATURE SHADDA WITH KASRATA
N ISOLATED FORM to |
| 157 // ARABIC KASRATAN ISOLATED FORM, ARABIC
SHADDA |
| 158 // This character NFKD normalizes to \u0020 \u064d \u0651, so its con
fusable mapping |
| 159 // is never used in creating a skeleton. |
| 160 CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651"); |
| 161 |
| 162 CHECK_SKELETON(SL, "nochange", "nochange"); |
| 163 CHECK_SKELETON(MA, "love", "love"); |
| 164 CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l |
| 165 CHECK_SKELETON(ML, "OOPS", "OOPS"); |
| 166 CHECK_SKELETON(ML, "00PS", "00PS"); // Digit 0 unchanged in lower case
mode. |
| 167 CHECK_SKELETON(MA, "OOPS", "OOPS"); |
| 168 CHECK_SKELETON(MA, "00PS", "OOPS"); // Digit 0 to letter O in any case
mode only |
| 169 CHECK_SKELETON(SL, "\\u059c", "\\u0301"); |
| 170 CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D"); |
| 171 CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029"); // "(ll)
" |
| 172 CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u062
7\\u0644\\u0647"); |
| 173 |
| 174 // This mapping exists in the ML and MA tables, does not exist in SL, SA |
| 175 //0C83 ; 0C03 ; |
| 176 CHECK_SKELETON(SL, "\\u0C83", "\\u0C83"); |
| 177 CHECK_SKELETON(SA, "\\u0C83", "\\u0C83"); |
| 178 CHECK_SKELETON(ML, "\\u0C83", "\\u0983"); |
| 179 CHECK_SKELETON(MA, "\\u0C83", "\\u0983"); |
| 180 |
| 181 // 0391 ; 0041 ; |
| 182 // This mapping exists only in the MA table. |
| 183 CHECK_SKELETON(MA, "\\u0391", "A"); |
| 184 CHECK_SKELETON(SA, "\\u0391", "\\u0391"); |
| 185 CHECK_SKELETON(ML, "\\u0391", "\\u0391"); |
| 186 CHECK_SKELETON(SL, "\\u0391", "\\u0391"); |
| 187 |
| 188 // 13CF ; 0062 ; |
| 189 // This mapping exists in the ML and MA tables |
| 190 CHECK_SKELETON(ML, "\\u13CF", "b"); |
| 191 CHECK_SKELETON(MA, "\\u13CF", "b"); |
| 192 CHECK_SKELETON(SL, "\\u13CF", "\\u13CF"); |
| 193 CHECK_SKELETON(SA, "\\u13CF", "\\u13CF"); |
| 194 |
| 195 // 0022 ; 0027 0027 ; |
| 196 // all tables. |
| 197 CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027"); |
| 198 CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027"); |
| 199 CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027"); |
| 200 CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027"); |
| 201 |
| 202 TEST_TEARDOWN; |
| 203 } |
| 204 |
| 205 |
| 206 // |
| 207 // Run a single confusable skeleton transformation test case. |
| 208 // |
| 209 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type, |
| 210 const char *input, const char *expected, int32
_t lineNum) { |
| 211 UnicodeString uInput = UnicodeString(input).unescape(); |
| 212 UnicodeString uExpected = UnicodeString(expected).unescape(); |
| 213 |
| 214 UErrorCode status = U_ZERO_ERROR; |
| 215 UnicodeString actual; |
| 216 uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status); |
| 217 if (U_FAILURE(status)) { |
| 218 errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__
, __LINE__, lineNum, |
| 219 u_errorName(status)); |
| 220 return; |
| 221 } |
| 222 if (uExpected != actual) { |
| 223 errln("File %s, Line %d, Test case from line %d, Actual and Expected ske
letons differ.", |
| 224 __FILE__, __LINE__, lineNum); |
| 225 errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString("
\"\n") + |
| 226 UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeStrin
g("\"")); |
| 227 } |
| 228 } |
| 229 |
| 230 void IntlTestSpoof::testAreConfusable() { |
| 231 TEST_SETUP |
| 232 UnicodeString s1("A long string that will overflow stack buffers. A lon
g string that will overflow stack buffers. " |
| 233 "A long string that will overflow stack buffers. A lon
g string that will overflow stack buffers. "); |
| 234 UnicodeString s2("A long string that wi11 overflow stack buffers. A lon
g string that will overflow stack buffers. " |
| 235 "A long string that wi11 overflow stack buffers. A lon
g string that will overflow stack buffers. "); |
| 236 TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnic
odeString(sc, s1, s2, &status)); |
| 237 TEST_ASSERT_SUCCESS(status); |
| 238 |
| 239 TEST_TEARDOWN; |
| 240 } |
| 241 |
| 242 void IntlTestSpoof::testInvisible() { |
| 243 TEST_SETUP |
| 244 UnicodeString s = UnicodeString("abcd\\u0301ef").unescape(); |
| 245 int32_t position = -42; |
| 246 TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status)); |
| 247 TEST_ASSERT_SUCCESS(status); |
| 248 TEST_ASSERT(position == -42); |
| 249 |
| 250 UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescap
e(); |
| 251 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &posi
tion, &status)); |
| 252 TEST_ASSERT_SUCCESS(status); |
| 253 TEST_ASSERT_EQ(7, position); |
| 254 |
| 255 // Tow acute accents, one from the composed a with acute accent, \u00e1, |
| 256 // and one separate. |
| 257 position = -42; |
| 258 UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape(); |
| 259 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &posi
tion, &status)); |
| 260 TEST_ASSERT_SUCCESS(status); |
| 261 TEST_ASSERT_EQ(7, position); |
| 262 TEST_TEARDOWN; |
| 263 } |
| 264 |
| 265 |
| 266 static UnicodeString parseHex(const UnicodeString &in) { |
| 267 // Convert a series of hex numbers in a Unicode String to a string with the |
| 268 // corresponding characters. |
| 269 // The conversion is _really_ annoying. There must be some function to just
do it. |
| 270 UnicodeString result; |
| 271 UChar32 cc = 0; |
| 272 for (int32_t i=0; i<in.length(); i++) { |
| 273 UChar c = in.charAt(i); |
| 274 if (c == 0x20) { // Space |
| 275 if (cc > 0) { |
| 276 result.append(cc); |
| 277 cc = 0; |
| 278 } |
| 279 } else if (c>=0x30 && c<=0x39) { |
| 280 cc = (cc<<4) + (c - 0x30); |
| 281 } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) { |
| 282 cc = (cc<<4) + (c & 0x0f)+9; |
| 283 } |
| 284 // else do something with bad input. |
| 285 } |
| 286 if (cc > 0) { |
| 287 result.append(cc); |
| 288 } |
| 289 return result; |
| 290 } |
| 291 |
| 292 |
| 293 // |
| 294 // Append the hex form of a UChar32 to a UnicodeString. |
| 295 // Used in formatting error messages. |
| 296 // Match the formatting of numbers in confusables.txt |
| 297 // Minimum of 4 digits, no leading zeroes for positions 5 and up. |
| 298 // |
| 299 static void appendHexUChar(UnicodeString &dest, UChar32 c) { |
| 300 UBool doZeroes = FALSE; |
| 301 for (int bitNum=28; bitNum>=0; bitNum-=4) { |
| 302 if (bitNum <= 12) { |
| 303 doZeroes = TRUE; |
| 304 } |
| 305 int hexDigit = (c>>bitNum) & 0x0f; |
| 306 if (hexDigit != 0 || doZeroes) { |
| 307 doZeroes = TRUE; |
| 308 dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x4
1)); |
| 309 } |
| 310 } |
| 311 dest.append((UChar)0x20); |
| 312 } |
| 313 |
| 314 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose); |
| 315 |
| 316 // testConfData - Check each data item from the Unicode confusables.txt file, |
| 317 // verify that it transforms correctly in a skeleton. |
| 318 // |
| 319 void IntlTestSpoof::testConfData() { |
| 320 UErrorCode status = U_ZERO_ERROR; |
| 321 |
| 322 const char *testDataDir = IntlTest::getSourceTestData(status); |
| 323 TEST_ASSERT_SUCCESS(status); |
| 324 char buffer[2000]; |
| 325 uprv_strcpy(buffer, testDataDir); |
| 326 uprv_strcat(buffer, "confusables.txt"); |
| 327 |
| 328 LocalStdioFilePointer f(fopen(buffer, "rb")); |
| 329 if (f.isNull()) { |
| 330 errln("Skipping test spoof/testConfData. File confusables.txt not acces
sible."); |
| 331 return; |
| 332 } |
| 333 fseek(f.getAlias(), 0, SEEK_END); |
| 334 int32_t fileSize = ftell(f.getAlias()); |
| 335 LocalArray<char> fileBuf(new char[fileSize]); |
| 336 fseek(f.getAlias(), 0, SEEK_SET); |
| 337 int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias()); |
| 338 TEST_ASSERT_EQ(amt_read, fileSize); |
| 339 TEST_ASSERT(fileSize>0); |
| 340 if (amt_read != fileSize || fileSize <=0) { |
| 341 return; |
| 342 } |
| 343 UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.g
etAlias(), fileSize)); |
| 344 |
| 345 LocalUSpoofCheckerPointer sc(uspoof_open(&status)); |
| 346 TEST_ASSERT_SUCCESS(status); |
| 347 |
| 348 // Parse lines from the confusables.txt file. Example Line: |
| 349 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH .... |
| 350 // Three fields. The hex fields can contain more than one character, |
| 351 // and each character may be more than 4 digits (for supplemn
tals) |
| 352 // This regular expression matches lines and splits the fields into capture
groups. |
| 353 RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confus
ablesTxt, 0, status); |
| 354 TEST_ASSERT_SUCCESS(status); |
| 355 while (parseLine.find()) { |
| 356 UnicodeString from = parseHex(parseLine.group(1, status)); |
| 357 if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) { |
| 358 // The source character was not NFKD. |
| 359 // Skip this case; the first step in obtaining a skeleton is to NFKD
the input, |
| 360 // so the mapping in this line of confusables.txt will never be app
lied. |
| 361 continue; |
| 362 } |
| 363 |
| 364 UnicodeString rawExpected = parseHex(parseLine.group(2, status)); |
| 365 UnicodeString expected; |
| 366 Normalizer::decompose(rawExpected, TRUE, 0, expected, status); |
| 367 TEST_ASSERT_SUCCESS(status); |
| 368 |
| 369 int32_t skeletonType = 0; |
| 370 UnicodeString tableType = parseLine.group(3, status); |
| 371 TEST_ASSERT_SUCCESS(status); |
| 372 if (tableType.indexOf("SL") >= 0) { |
| 373 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE; |
| 374 } else if (tableType.indexOf("SA") >= 0) { |
| 375 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE; |
| 376 } else if (tableType.indexOf("ML") >= 0) { |
| 377 skeletonType = 0; |
| 378 } else if (tableType.indexOf("MA") >= 0) { |
| 379 skeletonType = USPOOF_ANY_CASE; |
| 380 } |
| 381 |
| 382 UnicodeString actual; |
| 383 uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actua
l, &status); |
| 384 TEST_ASSERT_SUCCESS(status); |
| 385 TEST_ASSERT(actual == expected); |
| 386 if (actual != expected) { |
| 387 errln(parseLine.group(0, status)); |
| 388 UnicodeString line = "Actual: "; |
| 389 int i = 0; |
| 390 while (i < actual.length()) { |
| 391 appendHexUChar(line, actual.char32At(i)); |
| 392 i = actual.moveIndex32(i, 1); |
| 393 } |
| 394 errln(line); |
| 395 } |
| 396 if (U_FAILURE(status)) { |
| 397 break; |
| 398 } |
| 399 } |
| 400 } |
| 401 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS |
| 402 |
OLD | NEW |