Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(95)

Side by Side Diff: icu46/source/test/intltest/itspoof.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/
Patch Set: Created 10 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « icu46/source/test/intltest/itspoof.h ('k') | icu46/source/test/intltest/ittrans.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 /*
2 **********************************************************************
3 * Copyright (C) 2010, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 **********************************************************************
6 */
7 /**
8 * IntlTestSpoof tests for USpoofDetector
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_ FILE_IO
14
15 #include "itspoof.h"
16 #include "unicode/uspoof.h"
17 #include "unicode/unistr.h"
18 #include "unicode/regex.h"
19 #include "unicode/normlzr.h"
20 #include "cstring.h"
21 #include <stdlib.h>
22 #include <stdio.h>
23
//
// Assertion helpers for the tests in this file.
//
// Each macro reports a failure through errln() / errcheckln(), which must be
// callable at the point of use. The report includes the file name and line
// number of the macro invocation. None of the macros returns or aborts, so the
// statements that follow a failed assertion still run.
//

// Report a failure when `status` holds an ICU failure code.
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
    errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}

// Report a failure when `expr` evaluates to FALSE.
#define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
    errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}

// Report a failure when a != b. Both operands are printed with %d.
// NOTE: on failure (a) and (b) are evaluated a second time to build the message,
//       so operands with side effects should be stored in a local first.
#define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
    errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
          __FILE__, __LINE__, #a, (a), #b, (b)); }}

// Report a failure when a == b. Same printing and re-evaluation caveats as TEST_ASSERT_EQ.
#define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \
    errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
          __FILE__, __LINE__, #a, (a), #b, (b)); }}
37
/*
 * TEST_SETUP / TEST_TEARDOWN
 *
 *   Paired macros that wrap the boilerplate every test section needs.
 *   TEST_SETUP opens a new scope, declares "status", opens the spoof checker
 *   "sc", and only enters the test code if the open succeeded.
 *   TEST_TEARDOWN closes that conditional, reports any error left in "status",
 *   closes "sc" and ends the scope.
 *
 *   Arbitrary test code goes between the two macros; they must always be
 *   used together.
 */
#define TEST_SETUP {                                \
    UErrorCode status = U_ZERO_ERROR;               \
    USpoofChecker *sc = uspoof_open(&status);       \
    TEST_ASSERT_SUCCESS(status);                    \
    if (U_SUCCESS(status)) {

#define TEST_TEARDOWN                               \
    }                                               \
    TEST_ASSERT_SUCCESS(status);                    \
    uspoof_close(sc);                               \
}
56
57
58
59
60 void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name , char* /*par*/ )
61 {
62 if (exec) logln("TestSuite spoof: ");
63 switch (index) {
64 case 0:
65 name = "TestSpoofAPI";
66 if (exec) {
67 testSpoofAPI();
68 }
69 break;
70 case 1:
71 name = "TestSkeleton";
72 if (exec) {
73 testSkeleton();
74 }
75 break;
76 case 2:
77 name = "TestAreConfusable";
78 if (exec) {
79 testAreConfusable();
80 }
81 break;
82 case 3:
83 name = "TestInvisible";
84 if (exec) {
85 testInvisible();
86 }
87 break;
88 case 4:
89 name = "testConfData";
90 if (exec) {
91 testConfData();
92 }
93 break;
94 default: name=""; break;
95 }
96 }
97
98 void IntlTestSpoof::testSpoofAPI() {
99
100 TEST_SETUP
101 UnicodeString s("xyz"); // Many latin ranges are whole-script confusabl e with other scripts.
102 // If this test starts failing, consult confusa blesWholeScript.txt
103 int32_t position = 666;
104 int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &stat us);
105 TEST_ASSERT_SUCCESS(status);
106 TEST_ASSERT_EQ(0, checkResults);
107 TEST_ASSERT_EQ(666, position);
108 TEST_TEARDOWN;
109
110 TEST_SETUP
111 UnicodeString s1("cxs");
112 UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape(); / / Cyrillic "cxs"
113 int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &st atus);
114 TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONF USABLE, checkResults);
115
116 TEST_TEARDOWN;
117
118 TEST_SETUP
119 UnicodeString s("I1l0O");
120 UnicodeString dest;
121 UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_C ASE, s, dest, &status);
122 TEST_ASSERT_SUCCESS(status);
123 TEST_ASSERT(UnicodeString("lllOO") == dest);
124 TEST_ASSERT(&dest == &retStr);
125 TEST_TEARDOWN;
126 }
127
128
// Run one skeleton test case through checkSkeleton(), using the spoof checker "sc"
// that is in scope at the point of use, and recording the line number of the test case.
#define CHECK_SKELETON(type, input, expected) { checkSkeleton(sc, type, input, expected, __LINE__); }
132
133
134 // testSkeleton. Spot check a number of confusable skeleton substitutions from the
135 // Unicode data file confusables.txt
136 // Test cases chosen for substitutions of various lengths, and
137 // membership in different mapping tables.
138 void IntlTestSpoof::testSkeleton() {
139 const uint32_t ML = 0;
140 const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
141 const uint32_t MA = USPOOF_ANY_CASE;
142 const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
143
144 TEST_SETUP
145 // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
146 CHECK_SKELETON(SL, " A 1ong \\u02b9identifier' that will overflow implem entation stack buffers, forcing heap allocations."
147 " A 1ong 'identifier' that will overflow implementati on stack buffers, forcing heap allocations."
148 " A 1ong 'identifier' that will overflow implementati on stack buffers, forcing heap allocations."
149 " A 1ong 'identifier' that will overflow implementati on stack buffers, forcing heap allocations.",
150
151 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
152 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
153 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
154 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.")
155
156 // FC5F ; FE74 0651 ; ML #* ARABIC LIGATURE SHADDA WITH KASRATA N ISOLATED FORM to
157 // ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA
158 // This character NFKD normalizes to \u0020 \u064d \u0651, so its con fusable mapping
159 // is never used in creating a skeleton.
160 CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651");
161
162 CHECK_SKELETON(SL, "nochange", "nochange");
163 CHECK_SKELETON(MA, "love", "love");
164 CHECK_SKELETON(MA, "1ove", "love"); // Digit 1 to letter l
165 CHECK_SKELETON(ML, "OOPS", "OOPS");
166 CHECK_SKELETON(ML, "00PS", "00PS"); // Digit 0 unchanged in lower case mode.
167 CHECK_SKELETON(MA, "OOPS", "OOPS");
168 CHECK_SKELETON(MA, "00PS", "OOPS"); // Digit 0 to letter O in any case mode only
169 CHECK_SKELETON(SL, "\\u059c", "\\u0301");
170 CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D");
171 CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u006C\\u006C\\u0029"); // "(ll) "
172 CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u062 7\\u0644\\u0647");
173
174 // This mapping exists in the ML and MA tables, does not exist in SL, SA
175 //0C83 ; 0C03 ;
176 CHECK_SKELETON(SL, "\\u0C83", "\\u0C83");
177 CHECK_SKELETON(SA, "\\u0C83", "\\u0C83");
178 CHECK_SKELETON(ML, "\\u0C83", "\\u0983");
179 CHECK_SKELETON(MA, "\\u0C83", "\\u0983");
180
181 // 0391 ; 0041 ;
182 // This mapping exists only in the MA table.
183 CHECK_SKELETON(MA, "\\u0391", "A");
184 CHECK_SKELETON(SA, "\\u0391", "\\u0391");
185 CHECK_SKELETON(ML, "\\u0391", "\\u0391");
186 CHECK_SKELETON(SL, "\\u0391", "\\u0391");
187
188 // 13CF ; 0062 ;
189 // This mapping exists in the ML and MA tables
190 CHECK_SKELETON(ML, "\\u13CF", "b");
191 CHECK_SKELETON(MA, "\\u13CF", "b");
192 CHECK_SKELETON(SL, "\\u13CF", "\\u13CF");
193 CHECK_SKELETON(SA, "\\u13CF", "\\u13CF");
194
195 // 0022 ; 0027 0027 ;
196 // all tables.
197 CHECK_SKELETON(SL, "\\u0022", "\\u0027\\u0027");
198 CHECK_SKELETON(SA, "\\u0022", "\\u0027\\u0027");
199 CHECK_SKELETON(ML, "\\u0022", "\\u0027\\u0027");
200 CHECK_SKELETON(MA, "\\u0022", "\\u0027\\u0027");
201
202 TEST_TEARDOWN;
203 }
204
205
206 //
207 // Run a single confusable skeleton transformation test case.
208 //
209 void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
210 const char *input, const char *expected, int32 _t lineNum) {
211 UnicodeString uInput = UnicodeString(input).unescape();
212 UnicodeString uExpected = UnicodeString(expected).unescape();
213
214 UErrorCode status = U_ZERO_ERROR;
215 UnicodeString actual;
216 uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status);
217 if (U_FAILURE(status)) {
218 errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__ , __LINE__, lineNum,
219 u_errorName(status));
220 return;
221 }
222 if (uExpected != actual) {
223 errln("File %s, Line %d, Test case from line %d, Actual and Expected ske letons differ.",
224 __FILE__, __LINE__, lineNum);
225 errln(UnicodeString(" Actual Skeleton: \"") + actual + UnicodeString(" \"\n") +
226 UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeStrin g("\""));
227 }
228 }
229
230 void IntlTestSpoof::testAreConfusable() {
231 TEST_SETUP
232 UnicodeString s1("A long string that will overflow stack buffers. A lon g string that will overflow stack buffers. "
233 "A long string that will overflow stack buffers. A lon g string that will overflow stack buffers. ");
234 UnicodeString s2("A long string that wi11 overflow stack buffers. A lon g string that will overflow stack buffers. "
235 "A long string that wi11 overflow stack buffers. A lon g string that will overflow stack buffers. ");
236 TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnic odeString(sc, s1, s2, &status));
237 TEST_ASSERT_SUCCESS(status);
238
239 TEST_TEARDOWN;
240 }
241
242 void IntlTestSpoof::testInvisible() {
243 TEST_SETUP
244 UnicodeString s = UnicodeString("abcd\\u0301ef").unescape();
245 int32_t position = -42;
246 TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
247 TEST_ASSERT_SUCCESS(status);
248 TEST_ASSERT(position == -42);
249
250 UnicodeString s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescap e();
251 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &posi tion, &status));
252 TEST_ASSERT_SUCCESS(status);
253 TEST_ASSERT_EQ(7, position);
254
255 // Tow acute accents, one from the composed a with acute accent, \u00e1,
256 // and one separate.
257 position = -42;
258 UnicodeString s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
259 TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &posi tion, &status));
260 TEST_ASSERT_SUCCESS(status);
261 TEST_ASSERT_EQ(7, position);
262 TEST_TEARDOWN;
263 }
264
265
266 static UnicodeString parseHex(const UnicodeString &in) {
267 // Convert a series of hex numbers in a Unicode String to a string with the
268 // corresponding characters.
269 // The conversion is _really_ annoying. There must be some function to just do it.
270 UnicodeString result;
271 UChar32 cc = 0;
272 for (int32_t i=0; i<in.length(); i++) {
273 UChar c = in.charAt(i);
274 if (c == 0x20) { // Space
275 if (cc > 0) {
276 result.append(cc);
277 cc = 0;
278 }
279 } else if (c>=0x30 && c<=0x39) {
280 cc = (cc<<4) + (c - 0x30);
281 } else if ((c>=0x41 && c<=0x46) || (c>=0x61 && c<=0x66)) {
282 cc = (cc<<4) + (c & 0x0f)+9;
283 }
284 // else do something with bad input.
285 }
286 if (cc > 0) {
287 result.append(cc);
288 }
289 return result;
290 }
291
292
293 //
294 // Append the hex form of a UChar32 to a UnicodeString.
295 // Used in formatting error messages.
296 // Match the formatting of numbers in confusables.txt
297 // Minimum of 4 digits, no leading zeroes for positions 5 and up.
298 //
299 static void appendHexUChar(UnicodeString &dest, UChar32 c) {
300 UBool doZeroes = FALSE;
301 for (int bitNum=28; bitNum>=0; bitNum-=4) {
302 if (bitNum <= 12) {
303 doZeroes = TRUE;
304 }
305 int hexDigit = (c>>bitNum) & 0x0f;
306 if (hexDigit != 0 || doZeroes) {
307 doZeroes = TRUE;
308 dest.append((UChar)(hexDigit<=9? hexDigit + 0x30: hexDigit -10 + 0x4 1));
309 }
310 }
311 dest.append((UChar)0x20);
312 }
313
314 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
315
316 // testConfData - Check each data item from the Unicode confusables.txt file,
317 // verify that it transforms correctly in a skeleton.
318 //
319 void IntlTestSpoof::testConfData() {
320 UErrorCode status = U_ZERO_ERROR;
321
322 const char *testDataDir = IntlTest::getSourceTestData(status);
323 TEST_ASSERT_SUCCESS(status);
324 char buffer[2000];
325 uprv_strcpy(buffer, testDataDir);
326 uprv_strcat(buffer, "confusables.txt");
327
328 LocalStdioFilePointer f(fopen(buffer, "rb"));
329 if (f.isNull()) {
330 errln("Skipping test spoof/testConfData. File confusables.txt not acces sible.");
331 return;
332 }
333 fseek(f.getAlias(), 0, SEEK_END);
334 int32_t fileSize = ftell(f.getAlias());
335 LocalArray<char> fileBuf(new char[fileSize]);
336 fseek(f.getAlias(), 0, SEEK_SET);
337 int32_t amt_read = fread(fileBuf.getAlias(), 1, fileSize, f.getAlias());
338 TEST_ASSERT_EQ(amt_read, fileSize);
339 TEST_ASSERT(fileSize>0);
340 if (amt_read != fileSize || fileSize <=0) {
341 return;
342 }
343 UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf.g etAlias(), fileSize));
344
345 LocalUSpoofCheckerPointer sc(uspoof_open(&status));
346 TEST_ASSERT_SUCCESS(status);
347
348 // Parse lines from the confusables.txt file. Example Line:
349 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH ....
350 // Three fields. The hex fields can contain more than one character,
351 // and each character may be more than 4 digits (for supplemn tals)
352 // This regular expression matches lines and splits the fields into capture groups.
353 RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confus ablesTxt, 0, status);
354 TEST_ASSERT_SUCCESS(status);
355 while (parseLine.find()) {
356 UnicodeString from = parseHex(parseLine.group(1, status));
357 if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) {
358 // The source character was not NFKD.
359 // Skip this case; the first step in obtaining a skeleton is to NFKD the input,
360 // so the mapping in this line of confusables.txt will never be app lied.
361 continue;
362 }
363
364 UnicodeString rawExpected = parseHex(parseLine.group(2, status));
365 UnicodeString expected;
366 Normalizer::decompose(rawExpected, TRUE, 0, expected, status);
367 TEST_ASSERT_SUCCESS(status);
368
369 int32_t skeletonType = 0;
370 UnicodeString tableType = parseLine.group(3, status);
371 TEST_ASSERT_SUCCESS(status);
372 if (tableType.indexOf("SL") >= 0) {
373 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
374 } else if (tableType.indexOf("SA") >= 0) {
375 skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
376 } else if (tableType.indexOf("ML") >= 0) {
377 skeletonType = 0;
378 } else if (tableType.indexOf("MA") >= 0) {
379 skeletonType = USPOOF_ANY_CASE;
380 }
381
382 UnicodeString actual;
383 uspoof_getSkeletonUnicodeString(sc.getAlias(), skeletonType, from, actua l, &status);
384 TEST_ASSERT_SUCCESS(status);
385 TEST_ASSERT(actual == expected);
386 if (actual != expected) {
387 errln(parseLine.group(0, status));
388 UnicodeString line = "Actual: ";
389 int i = 0;
390 while (i < actual.length()) {
391 appendHexUChar(line, actual.char32At(i));
392 i = actual.moveIndex32(i, 1);
393 }
394 errln(line);
395 }
396 if (U_FAILURE(status)) {
397 break;
398 }
399 }
400 }
401 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
402
OLDNEW
« no previous file with comments | « icu46/source/test/intltest/itspoof.h ('k') | icu46/source/test/intltest/ittrans.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698