| OLD | NEW |
| 1 /******************************************************************** | 1 /******************************************************************** |
| 2 * COPYRIGHT: | 2 * COPYRIGHT: |
| 3 * Copyright (c) 1997-2013, International Business Machines Corporation and | 3 * Copyright (c) 1997-2014, International Business Machines Corporation and |
| 4 * others. All Rights Reserved. | 4 * others. All Rights Reserved. |
| 5 ********************************************************************/ | 5 ********************************************************************/ |
| 6 /*******************************************************************************
* | 6 /*******************************************************************************
* |
| 7 * | 7 * |
| 8 * File CITERTST.C | 8 * File CITERTST.C |
| 9 * | 9 * |
| 10 * Modification History: | 10 * Modification History: |
| 11 * Date Name Description | 11 * Date Name Description |
| 12 * Madhu Katragadda Ported for C API | 12 * Madhu Katragadda Ported for C API |
| 13 * 02/19/01 synwee Modified test case for new collation iterator | 13 * 02/19/01 synwee Modified test case for new collation iterator |
| (...skipping 14 matching lines...) Expand all Loading... |
| 28 #include "unicode/ustring.h" | 28 #include "unicode/ustring.h" |
| 29 #include "unicode/putil.h" | 29 #include "unicode/putil.h" |
| 30 #include "callcoll.h" | 30 #include "callcoll.h" |
| 31 #include "cmemory.h" | 31 #include "cmemory.h" |
| 32 #include "cintltst.h" | 32 #include "cintltst.h" |
| 33 #include "citertst.h" | 33 #include "citertst.h" |
| 34 #include "ccolltst.h" | 34 #include "ccolltst.h" |
| 35 #include "filestrm.h" | 35 #include "filestrm.h" |
| 36 #include "cstring.h" | 36 #include "cstring.h" |
| 37 #include "ucol_imp.h" | 37 #include "ucol_imp.h" |
| 38 #include "ucol_tok.h" | |
| 39 #include "uparse.h" | 38 #include "uparse.h" |
| 40 #include <stdio.h> | 39 #include <stdio.h> |
| 41 | 40 |
| 42 extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *); | 41 extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *); |
| 43 | 42 |
| 44 void addCollIterTest(TestNode** root) | 43 void addCollIterTest(TestNode** root) |
| 45 { | 44 { |
| 46 addTest(root, &TestPrevious, "tscoll/citertst/TestPrevious"); | 45 addTest(root, &TestPrevious, "tscoll/citertst/TestPrevious"); |
| 47 addTest(root, &TestOffset, "tscoll/citertst/TestOffset"); | 46 addTest(root, &TestOffset, "tscoll/citertst/TestOffset"); |
| 48 addTest(root, &TestSetText, "tscoll/citertst/TestSetText"); | 47 addTest(root, &TestSetText, "tscoll/citertst/TestSetText"); |
| 49 addTest(root, &TestMaxExpansion, "tscoll/citertst/TestMaxExpansion"); | 48 addTest(root, &TestMaxExpansion, "tscoll/citertst/TestMaxExpansion"); |
| 50 addTest(root, &TestUnicodeChar, "tscoll/citertst/TestUnicodeChar"); | 49 addTest(root, &TestUnicodeChar, "tscoll/citertst/TestUnicodeChar"); |
| 51 addTest(root, &TestNormalizedUnicodeChar, | 50 addTest(root, &TestNormalizedUnicodeChar, |
| 52 "tscoll/citertst/TestNormalizedUnicodeChar"); | 51 "tscoll/citertst/TestNormalizedUnicodeChar"); |
| 53 addTest(root, &TestNormalization, "tscoll/citertst/TestNormalization"); | 52 addTest(root, &TestNormalization, "tscoll/citertst/TestNormalization"); |
| 54 addTest(root, &TestBug672, "tscoll/citertst/TestBug672"); | 53 addTest(root, &TestBug672, "tscoll/citertst/TestBug672"); |
| 55 addTest(root, &TestBug672Normalize, "tscoll/citertst/TestBug672Normalize"); | 54 addTest(root, &TestBug672Normalize, "tscoll/citertst/TestBug672Normalize"); |
| 56 addTest(root, &TestSmallBuffer, "tscoll/citertst/TestSmallBuffer"); | 55 addTest(root, &TestSmallBuffer, "tscoll/citertst/TestSmallBuffer"); |
| 57 addTest(root, &TestCEs, "tscoll/citertst/TestCEs"); | |
| 58 addTest(root, &TestDiscontiguos, "tscoll/citertst/TestDiscontiguos"); | 56 addTest(root, &TestDiscontiguos, "tscoll/citertst/TestDiscontiguos"); |
| 59 addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow")
; | |
| 60 addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity"); | |
| 61 addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity"); | |
| 62 addTest(root, &TestSearchCollatorElements, "tscoll/citertst/TestSearchCollat
orElements"); | 57 addTest(root, &TestSearchCollatorElements, "tscoll/citertst/TestSearchCollat
orElements"); |
| 63 } | 58 } |
| 64 | 59 |
| 65 /* The locales we support */ | 60 /* The locales we support */ |
| 66 | 61 |
| 67 static const char * LOCALES[] = {"en_AU", "en_BE", "en_CA"}; | 62 static const char * LOCALES[] = {"en_AU", "en_BE", "en_CA"}; |
| 68 | 63 |
| 69 static void TestBug672() { | 64 static void TestBug672() { |
| 70 UErrorCode status = U_ZERO_ERROR; | 65 UErrorCode status = U_ZERO_ERROR; |
| 71 UChar pattern[20]; | 66 UChar pattern[20]; |
| (...skipping 684 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 756 { | 751 { |
| 757 log_err("call to iter2->setText(test1) failed. %s\n", myErrorName(status
)); | 752 log_err("call to iter2->setText(test1) failed. %s\n", myErrorName(status
)); |
| 758 } | 753 } |
| 759 else | 754 else |
| 760 { | 755 { |
| 761 assertEqual(iter1, iter2); | 756 assertEqual(iter1, iter2); |
| 762 } | 757 } |
| 763 | 758 |
| 764 /* Now set it to point to a null string with fake length*/ | 759 /* Now set it to point to a null string with fake length*/ |
| 765 ucol_setText(iter2, NULL, 2, &status); | 760 ucol_setText(iter2, NULL, 2, &status); |
| 766 if (U_FAILURE(status)) | 761 if (status != U_ILLEGAL_ARGUMENT_ERROR) |
| 767 { | 762 { |
| 768 log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status)
); | 763 log_err("call to iter2->setText(null, 2) should yield an illegal-argumen
t-error - %s\n", |
| 769 } | 764 myErrorName(status)); |
| 770 else | |
| 771 { | |
| 772 if (ucol_next(iter2, &status) != UCOL_NULLORDER) { | |
| 773 log_err("iter2 with null text expected to return UCOL_NULLORDER\n"); | |
| 774 } | |
| 775 } | 765 } |
| 776 | 766 |
| 777 ucol_closeElements(iter2); | 767 ucol_closeElements(iter2); |
| 778 ucol_closeElements(iter1); | 768 ucol_closeElements(iter1); |
| 779 ucol_close(en_us); | 769 ucol_close(en_us); |
| 780 } | 770 } |
| 781 | 771 |
| 782 /** @bug 4108762 | 772 /** @bug 4108762 |
| 783 * Test for getMaxExpansion() | 773 * Test for getMaxExpansion() |
| 784 */ | 774 */ |
| (...skipping 219 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1004 ucol_next(testiter, &status); | 994 ucol_next(testiter, &status); |
| 1005 ucol_closeElements(testiter); | 995 ucol_closeElements(testiter); |
| 1006 ucol_closeElements(iter); | 996 ucol_closeElements(iter); |
| 1007 ucol_close(coll); | 997 ucol_close(coll); |
| 1008 } else { | 998 } else { |
| 1009 log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(statu
s)); | 999 log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(statu
s)); |
| 1010 } | 1000 } |
| 1011 } | 1001 } |
| 1012 | 1002 |
| 1013 /** | 1003 /** |
| 1014 * Snippets of code from genuca | |
| 1015 */ | |
/*
 * Converts a single hexadecimal digit character ('0'-'9', 'a'-'f', 'A'-'F')
 * to its numeric value 0..15. Any other character yields 0.
 */
static int32_t hex2num(char hex) {
    int32_t digit = 0;

    if ('0' <= hex && hex <= '9') {
        digit = hex - '0';
    } else if ('a' <= hex && hex <= 'f') {
        digit = 10 + (hex - 'a');
    } else if ('A' <= hex && hex <= 'F') {
        digit = 10 + (hex - 'A');
    }
    return digit;
}
| 1027 | |
| 1028 /** | |
| 1029 * Getting codepoints from a string | |
| 1030 * @param str character string contain codepoints seperated by space and ended | |
| 1031 * by a semicolon | |
| 1032 * @param codepoints array for storage, assuming size > 5 | |
| 1033 * @return position at the end of the codepoint section | |
| 1034 */ | |
| 1035 static char *getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) { | |
| 1036 UErrorCode errorCode = U_ZERO_ERROR; | |
| 1037 char *semi = uprv_strchr(str, ';'); | |
| 1038 char *pipe = uprv_strchr(str, '|'); | |
| 1039 char *s; | |
| 1040 *codepoints = 0; | |
| 1041 *contextCPs = 0; | |
| 1042 if(semi == NULL) { | |
| 1043 log_err("expected semicolon after code point string in FractionalUCA.txt
%s\n", str); | |
| 1044 return str; | |
| 1045 } | |
| 1046 if(pipe != NULL) { | |
| 1047 int32_t contextLength; | |
| 1048 *pipe = 0; | |
| 1049 contextLength = u_parseString(str, contextCPs, 99, NULL, &errorCode); | |
| 1050 *pipe = '|'; | |
| 1051 if(U_FAILURE(errorCode)) { | |
| 1052 log_err("error parsing precontext string from FractionalUCA.txt %s\n
", str); | |
| 1053 return str; | |
| 1054 } | |
| 1055 /* prepend the precontext string to the codepoints */ | |
| 1056 u_memcpy(codepoints, contextCPs, contextLength); | |
| 1057 codepoints += contextLength; | |
| 1058 /* start of the code point string */ | |
| 1059 s = pipe + 1; | |
| 1060 } else { | |
| 1061 s = str; | |
| 1062 } | |
| 1063 u_parseString(s, codepoints, 99, NULL, &errorCode); | |
| 1064 if(U_FAILURE(errorCode)) { | |
| 1065 log_err("error parsing code point string from FractionalUCA.txt %s\n", s
tr); | |
| 1066 return str; | |
| 1067 } | |
| 1068 return semi + 1; | |
| 1069 } | |
| 1070 | |
| 1071 /** | |
| 1072 * Sniplets of code from genuca | |
| 1073 */ | |
| 1074 static int32_t | |
| 1075 readElement(char **from, char *to, char separator, UErrorCode *status) | |
| 1076 { | |
| 1077 if (U_SUCCESS(*status)) { | |
| 1078 char buffer[1024]; | |
| 1079 int32_t i = 0; | |
| 1080 while (**from != separator) { | |
| 1081 if (**from != ' ') { | |
| 1082 *(buffer+i++) = **from; | |
| 1083 } | |
| 1084 (*from)++; | |
| 1085 } | |
| 1086 (*from)++; | |
| 1087 *(buffer + i) = 0; | |
| 1088 strcpy(to, buffer); | |
| 1089 return i/2; | |
| 1090 } | |
| 1091 | |
| 1092 return 0; | |
| 1093 } | |
| 1094 | |
| 1095 /** | |
| 1096 * Sniplets of code from genuca | |
| 1097 */ | |
| 1098 static uint32_t | |
| 1099 getSingleCEValue(char *primary, char *secondary, char *tertiary, | |
| 1100 UErrorCode *status) | |
| 1101 { | |
| 1102 if (U_SUCCESS(*status)) { | |
| 1103 uint32_t value = 0; | |
| 1104 char primsave = '\0'; | |
| 1105 char secsave = '\0'; | |
| 1106 char tersave = '\0'; | |
| 1107 char *primend = primary+4; | |
| 1108 char *secend = secondary+2; | |
| 1109 char *terend = tertiary+2; | |
| 1110 uint32_t primvalue; | |
| 1111 uint32_t secvalue; | |
| 1112 uint32_t tervalue; | |
| 1113 | |
| 1114 if (uprv_strlen(primary) > 4) { | |
| 1115 primsave = *primend; | |
| 1116 *primend = '\0'; | |
| 1117 } | |
| 1118 | |
| 1119 if (uprv_strlen(secondary) > 2) { | |
| 1120 secsave = *secend; | |
| 1121 *secend = '\0'; | |
| 1122 } | |
| 1123 | |
| 1124 if (uprv_strlen(tertiary) > 2) { | |
| 1125 tersave = *terend; | |
| 1126 *terend = '\0'; | |
| 1127 } | |
| 1128 | |
| 1129 primvalue = (*primary!='\0')?uprv_strtoul(primary, &primend, 16):0; | |
| 1130 secvalue = (*secondary!='\0')?uprv_strtoul(secondary, &secend, 16):0; | |
| 1131 tervalue = (*tertiary!='\0')?uprv_strtoul(tertiary, &terend, 16):0; | |
| 1132 if(primvalue <= 0xFF) { | |
| 1133 primvalue <<= 8; | |
| 1134 } | |
| 1135 | |
| 1136 value = ((primvalue << UCOL_PRIMARYORDERSHIFT) & UCOL_PRIMARYORDERMASK) | |
| 1137 | ((secvalue << UCOL_SECONDARYORDERSHIFT) & UCOL_SECONDARYORDERMASK) | |
| 1138 | (tervalue & UCOL_TERTIARYORDERMASK); | |
| 1139 | |
| 1140 if(primsave!='\0') { | |
| 1141 *primend = primsave; | |
| 1142 } | |
| 1143 if(secsave!='\0') { | |
| 1144 *secend = secsave; | |
| 1145 } | |
| 1146 if(tersave!='\0') { | |
| 1147 *terend = tersave; | |
| 1148 } | |
| 1149 return value; | |
| 1150 } | |
| 1151 return 0; | |
| 1152 } | |
| 1153 | |
| 1154 /** | |
| 1155 * Getting collation elements generated from a string | |
| 1156 * @param str character string contain collation elements contained in [] and | |
| 1157 * seperated by space | |
| 1158 * @param ce array for storage, assuming size > 20 | |
| 1159 * @param status error status | |
| 1160 * @return position at the end of the codepoint section | |
| 1161 */ | |
| 1162 static char * getCEs(char *str, uint32_t *ces, UErrorCode *status) { | |
| 1163 char *pStartCP = uprv_strchr(str, '['); | |
| 1164 int count = 0; | |
| 1165 char *pEndCP; | |
| 1166 char primary[100]; | |
| 1167 char secondary[100]; | |
| 1168 char tertiary[100]; | |
| 1169 | |
| 1170 while (*pStartCP == '[') { | |
| 1171 uint32_t primarycount = 0; | |
| 1172 uint32_t secondarycount = 0; | |
| 1173 uint32_t tertiarycount = 0; | |
| 1174 uint32_t CEi = 1; | |
| 1175 pEndCP = strchr(pStartCP, ']'); | |
| 1176 if(pEndCP == NULL) { | |
| 1177 break; | |
| 1178 } | |
| 1179 pStartCP ++; | |
| 1180 | |
| 1181 primarycount = readElement(&pStartCP, primary, ',', status); | |
| 1182 secondarycount = readElement(&pStartCP, secondary, ',', status); | |
| 1183 tertiarycount = readElement(&pStartCP, tertiary, ']', status); | |
| 1184 | |
| 1185 /* I want to get the CEs entered right here, including continuation */ | |
| 1186 ces[count ++] = getSingleCEValue(primary, secondary, tertiary, status); | |
| 1187 if (U_FAILURE(*status)) { | |
| 1188 break; | |
| 1189 } | |
| 1190 | |
| 1191 while (2 * CEi < primarycount || CEi < secondarycount || | |
| 1192 CEi < tertiarycount) { | |
| 1193 uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ | |
| 1194 if (2 * CEi < primarycount) { | |
| 1195 value |= ((hex2num(*(primary + 4 * CEi)) & 0xF) << 28); | |
| 1196 value |= ((hex2num(*(primary + 4 * CEi + 1)) & 0xF) << 24); | |
| 1197 } | |
| 1198 | |
| 1199 if (2 * CEi + 1 < primarycount) { | |
| 1200 value |= ((hex2num(*(primary + 4 * CEi + 2)) & 0xF) << 20); | |
| 1201 value |= ((hex2num(*(primary + 4 * CEi + 3)) &0xF) << 16); | |
| 1202 } | |
| 1203 | |
| 1204 if (CEi < secondarycount) { | |
| 1205 value |= ((hex2num(*(secondary + 2 * CEi)) & 0xF) << 12); | |
| 1206 value |= ((hex2num(*(secondary + 2 * CEi + 1)) & 0xF) << 8); | |
| 1207 } | |
| 1208 | |
| 1209 if (CEi < tertiarycount) { | |
| 1210 value |= ((hex2num(*(tertiary + 2 * CEi)) & 0x3) << 4); | |
| 1211 value |= (hex2num(*(tertiary + 2 * CEi + 1)) & 0xF); | |
| 1212 } | |
| 1213 | |
| 1214 CEi ++; | |
| 1215 ces[count ++] = value; | |
| 1216 } | |
| 1217 | |
| 1218 pStartCP = pEndCP + 1; | |
| 1219 } | |
| 1220 ces[count] = 0; | |
| 1221 return pStartCP; | |
| 1222 } | |
| 1223 | |
| 1224 /** | |
| 1225 * Getting the FractionalUCA.txt file stream | |
| 1226 */ | |
| 1227 static FileStream * getFractionalUCA(void) | |
| 1228 { | |
| 1229 char newPath[256]; | |
| 1230 char backupPath[256]; | |
| 1231 FileStream *result = NULL; | |
| 1232 | |
| 1233 /* Look inside ICU_DATA first */ | |
| 1234 uprv_strcpy(newPath, ctest_dataSrcDir()); | |
| 1235 uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING ); | |
| 1236 uprv_strcat(newPath, "FractionalUCA.txt"); | |
| 1237 | |
| 1238 /* As a fallback, try to guess where the source data was located | |
| 1239 * at the time ICU was built, and look there. | |
| 1240 */ | |
| 1241 #if defined (U_TOPSRCDIR) | |
| 1242 strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data"); | |
| 1243 #else | |
| 1244 { | |
| 1245 UErrorCode errorCode = U_ZERO_ERROR; | |
| 1246 strcpy(backupPath, loadTestData(&errorCode)); | |
| 1247 strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_
SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data"); | |
| 1248 } | |
| 1249 #endif | |
| 1250 strcat(backupPath, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "Fractional
UCA.txt"); | |
| 1251 | |
| 1252 result = T_FileStream_open(newPath, "rb"); | |
| 1253 | |
| 1254 if (result == NULL) { | |
| 1255 result = T_FileStream_open(backupPath, "rb"); | |
| 1256 if (result == NULL) { | |
| 1257 log_err("Failed to open either %s or %s\n", newPath, backupPath); | |
| 1258 } | |
| 1259 } | |
| 1260 return result; | |
| 1261 } | |
| 1262 | |
| 1263 /** | |
| 1264 * Testing the CEs returned by the iterator | |
| 1265 */ | |
| 1266 static void TestCEs() { | |
| 1267 FileStream *file = NULL; | |
| 1268 char line[2048]; | |
| 1269 char *str; | |
| 1270 UChar codepoints[10]; | |
| 1271 uint32_t ces[20]; | |
| 1272 UErrorCode status = U_ZERO_ERROR; | |
| 1273 UCollator *coll = ucol_open("", &status); | |
| 1274 uint32_t lineNo = 0; | |
| 1275 UChar contextCPs[5]; | |
| 1276 | |
| 1277 if (U_FAILURE(status)) { | |
| 1278 log_err_status(status, "Error in opening root collator -> %s\n", u_error
Name(status)); | |
| 1279 return; | |
| 1280 } | |
| 1281 | |
| 1282 file = getFractionalUCA(); | |
| 1283 | |
| 1284 if (file == NULL) { | |
| 1285 log_err("*** unable to open input FractionalUCA.txt file ***\n"); | |
| 1286 return; | |
| 1287 } | |
| 1288 | |
| 1289 | |
| 1290 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { | |
| 1291 int count = 0; | |
| 1292 UCollationElements *iter; | |
| 1293 int32_t preContextCeLen=0; | |
| 1294 lineNo++; | |
| 1295 /* skip this line if it is empty or a comment or is a return value | |
| 1296 or start of some variable section */ | |
| 1297 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || | |
| 1298 line[0] == 0x000D || line[0] == '[') { | |
| 1299 continue; | |
| 1300 } | |
| 1301 | |
| 1302 str = getCodePoints(line, codepoints, contextCPs); | |
| 1303 | |
| 1304 /* these are 'fake' codepoints in the fractional UCA, and are used just | |
| 1305 * for positioning of indirect values. They should not go through this | |
| 1306 * test. | |
| 1307 */ | |
| 1308 if(*codepoints == 0xFDD0) { | |
| 1309 continue; | |
| 1310 } | |
| 1311 if (*contextCPs != 0) { | |
| 1312 iter = ucol_openElements(coll, contextCPs, -1, &status); | |
| 1313 if (U_FAILURE(status)) { | |
| 1314 log_err("Error in opening collation elements\n"); | |
| 1315 break; | |
| 1316 } | |
| 1317 while((ces[preContextCeLen] = ucol_next(iter, &status)) != (uint32_t
)UCOL_NULLORDER) { | |
| 1318 preContextCeLen++; | |
| 1319 } | |
| 1320 ucol_closeElements(iter); | |
| 1321 } | |
| 1322 | |
| 1323 getCEs(str, ces+preContextCeLen, &status); | |
| 1324 if (U_FAILURE(status)) { | |
| 1325 log_err("Error in parsing collation elements in FractionalUCA.txt\n"
); | |
| 1326 break; | |
| 1327 } | |
| 1328 iter = ucol_openElements(coll, codepoints, -1, &status); | |
| 1329 if (U_FAILURE(status)) { | |
| 1330 log_err("Error in opening collation elements\n"); | |
| 1331 break; | |
| 1332 } | |
| 1333 for (;;) { | |
| 1334 uint32_t ce = (uint32_t)ucol_next(iter, &status); | |
| 1335 if (ce == 0xFFFFFFFF) { | |
| 1336 ce = 0; | |
| 1337 } | |
| 1338 /* we now unconditionally reorder Thai/Lao prevowels, so this | |
| 1339 * test would fail if we don't skip here. | |
| 1340 */ | |
| 1341 if(UCOL_ISTHAIPREVOWEL(*codepoints) && ce == 0 && count == 0) { | |
| 1342 continue; | |
| 1343 } | |
| 1344 if (ce != ces[count] || U_FAILURE(status)) { | |
| 1345 log_err("Collation elements in FractionalUCA.txt and iterators d
o not match!\n"); | |
| 1346 break; | |
| 1347 } | |
| 1348 if (ces[count] == 0) { | |
| 1349 break; | |
| 1350 } | |
| 1351 count ++; | |
| 1352 } | |
| 1353 ucol_closeElements(iter); | |
| 1354 } | |
| 1355 | |
| 1356 T_FileStream_close(file); | |
| 1357 ucol_close(coll); | |
| 1358 } | |
| 1359 | |
| 1360 /** | |
| 1361 * Testing the discontiguous contractions | 1004 * Testing the discontiguous contractions |
| 1362 */ | 1005 */ |
| 1363 static void TestDiscontiguos() { | 1006 static void TestDiscontiguos() { |
| 1364 const char *rulestr = | 1007 const char *rulestr = |
| 1365 "&z < AB < X\\u0300 < ABC < X\\u0300\\u0315"; | 1008 "&z < AB < X\\u0300 < ABC < X\\u0300\\u0315"; |
| 1366 UChar rule[50]; | 1009 UChar rule[50]; |
| 1367 int rulelen = u_unescape(rulestr, rule, 50); | 1010 int rulelen = u_unescape(rulestr, rule, 50); |
| 1368 const char *src[] = { | 1011 const char *src[] = { |
| 1369 "ADB", "ADBC", "A\\u0315B", "A\\u0315BC", | 1012 "ADB", "ADBC", "A\\u0315B", "A\\u0315BC", |
| 1370 /* base character blocked */ | 1013 /* base character blocked */ |
| (...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1460 } | 1103 } |
| 1461 ucol_reset(iter); | 1104 ucol_reset(iter); |
| 1462 backAndForth(iter); | 1105 backAndForth(iter); |
| 1463 count ++; | 1106 count ++; |
| 1464 } | 1107 } |
| 1465 ucol_closeElements(resultiter); | 1108 ucol_closeElements(resultiter); |
| 1466 ucol_closeElements(iter); | 1109 ucol_closeElements(iter); |
| 1467 ucol_close(coll); | 1110 ucol_close(coll); |
| 1468 } | 1111 } |
| 1469 | 1112 |
| 1470 static void TestCEBufferOverflow() | |
| 1471 { | |
| 1472 UChar str[UCOL_EXPAND_CE_BUFFER_SIZE + 1]; | |
| 1473 UErrorCode status = U_ZERO_ERROR; | |
| 1474 UChar rule[10]; | |
| 1475 UCollator *coll; | |
| 1476 UCollationElements *iter; | |
| 1477 | |
| 1478 u_uastrcpy(rule, "&z < AB"); | |
| 1479 coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH,
NULL,&status); | |
| 1480 if (U_FAILURE(status)) { | |
| 1481 log_err_status(status, "Rule based collator not created for testing ce b
uffer overflow -> %s\n", u_errorName(status)); | |
| 1482 return; | |
| 1483 } | |
| 1484 | |
| 1485 /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic | |
| 1486 test. this will cause an overflow in getPrev */ | |
| 1487 str[0] = 0x0041; /* 'A' */ | |
| 1488 /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/ | |
| 1489 uprv_memset(str + 1, 0xDC, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE); | |
| 1490 str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042; /* 'B' */ | |
| 1491 iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1, | |
| 1492 &status); | |
| 1493 if (ucol_previous(iter, &status) == UCOL_NULLORDER || | |
| 1494 status == U_BUFFER_OVERFLOW_ERROR) { | |
| 1495 log_err("CE buffer should not overflow with long string of trail surroga
tes\n"); | |
| 1496 } | |
| 1497 ucol_closeElements(iter); | |
| 1498 ucol_close(coll); | |
| 1499 } | |
| 1500 | |
| 1501 /** | |
| 1502 * Checking collation element validity. | |
| 1503 */ | |
| 1504 #define MAX_CODEPOINTS_TO_SHOW 10 | |
| 1505 static void showCodepoints(const UChar *codepoints, int length, char * codepoint
Text) { | |
| 1506 int i, lengthToUse = length; | |
| 1507 if (lengthToUse > MAX_CODEPOINTS_TO_SHOW) { | |
| 1508 lengthToUse = MAX_CODEPOINTS_TO_SHOW; | |
| 1509 } | |
| 1510 for (i = 0; i < lengthToUse; ++i) { | |
| 1511 int bytesWritten = sprintf(codepointText, " %04X", *codepoints++); | |
| 1512 if (bytesWritten <= 0) { | |
| 1513 break; | |
| 1514 } | |
| 1515 codepointText += bytesWritten; | |
| 1516 } | |
| 1517 if (i < length) { | |
| 1518 sprintf(codepointText, " ..."); | |
| 1519 } | |
| 1520 } | |
| 1521 | |
| 1522 static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints, | |
| 1523 int length) | |
| 1524 { | |
| 1525 UErrorCode status = U_ZERO_ERROR; | |
| 1526 UCollationElements *iter = ucol_openElements(coll, codepoints, length, | |
| 1527 &status); | |
| 1528 UBool result = FALSE; | |
| 1529 UBool primaryDone = FALSE, secondaryDone = FALSE, tertiaryDone = FALSE; | |
| 1530 const char * collLocale; | |
| 1531 | |
| 1532 if (U_FAILURE(status)) { | |
| 1533 log_err("Error creating iterator for testing validity\n"); | |
| 1534 return FALSE; | |
| 1535 } | |
| 1536 collLocale = ucol_getLocale(coll, ULOC_VALID_LOCALE, &status); | |
| 1537 if (U_FAILURE(status) || collLocale==NULL) { | |
| 1538 status = U_ZERO_ERROR; | |
| 1539 collLocale = "?"; | |
| 1540 } | |
| 1541 | |
| 1542 for (;;) { | |
| 1543 uint32_t ce = ucol_next(iter, &status); | |
| 1544 uint32_t primary, p1, p2, secondary, tertiary; | |
| 1545 if (ce == UCOL_NULLORDER) { | |
| 1546 result = TRUE; | |
| 1547 break; | |
| 1548 } | |
| 1549 if (ce == 0) { | |
| 1550 continue; | |
| 1551 } | |
| 1552 if (ce == 0x02000202) { | |
| 1553 /* special CE for merge-sort character */ | |
| 1554 if (*codepoints == 0xFFFE /* && length == 1 */) { | |
| 1555 /* | |
| 1556 * Note: We should check for length==1 but the token parser appe
ars | |
| 1557 * to give us trailing NUL characters. | |
| 1558 * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTail
oredSet() | |
| 1559 * rather than the internal collation rule p
arser | |
| 1560 */ | |
| 1561 continue; | |
| 1562 } else { | |
| 1563 log_err("Special 02/02/02 weight for code point U+%04X [len %d]
!= U+FFFE\n", | |
| 1564 (int)*codepoints, (int)length); | |
| 1565 break; | |
| 1566 } | |
| 1567 } | |
| 1568 primary = UCOL_PRIMARYORDER(ce); | |
| 1569 p1 = primary >> 8; | |
| 1570 p2 = primary & 0xFF; | |
| 1571 secondary = UCOL_SECONDARYORDER(ce); | |
| 1572 tertiary = UCOL_TERTIARYORDER(ce) & UCOL_REMOVE_CONTINUATION; | |
| 1573 | |
| 1574 if (!isContinuation(ce)) { | |
| 1575 if ((ce & UCOL_REMOVE_CONTINUATION) == 0) { | |
| 1576 log_err("Empty CE %08lX except for case bits\n", (long)ce); | |
| 1577 break; | |
| 1578 } | |
| 1579 if (p1 == 0) { | |
| 1580 if (p2 != 0) { | |
| 1581 log_err("Primary 00 xx in %08lX\n", (long)ce); | |
| 1582 break; | |
| 1583 } | |
| 1584 primaryDone = TRUE; | |
| 1585 } else { | |
| 1586 if (p1 <= 2 || p1 >= 0xF0) { | |
| 1587 /* Primary first bytes F0..FF are specials. */ | |
| 1588 log_err("Primary first byte of %08lX out of range\n", (long)
ce); | |
| 1589 break; | |
| 1590 } | |
| 1591 if (p2 == 0) { | |
| 1592 primaryDone = TRUE; | |
| 1593 } else { | |
| 1594 if (p2 <= 3 || p2 >= 0xFF) { | |
| 1595 /* Primary second bytes 03 and FF are sort key compressi
on terminators. */ | |
| 1596 log_err("Primary second byte of %08lX out of range\n", (
long)ce); | |
| 1597 break; | |
| 1598 } | |
| 1599 primaryDone = FALSE; | |
| 1600 } | |
| 1601 } | |
| 1602 if (secondary == 0) { | |
| 1603 if (primary != 0) { | |
| 1604 log_err("Primary!=0 secondary==0 in %08lX\n", (long)ce); | |
| 1605 break; | |
| 1606 } | |
| 1607 secondaryDone = TRUE; | |
| 1608 } else { | |
| 1609 if (secondary <= 2 || | |
| 1610 (UCOL_BYTE_COMMON < secondary && secondary <= (UCOL_BYTE_COM
MON + 0x80)) | |
| 1611 ) { | |
| 1612 /* Secondary first bytes common+1..+0x80 are used for sort k
ey compression. */ | |
| 1613 log_err("Secondary byte of %08lX out of range\n", (long)ce); | |
| 1614 break; | |
| 1615 } | |
| 1616 secondaryDone = FALSE; | |
| 1617 } | |
| 1618 if (tertiary == 0) { | |
| 1619 /* We know that ce != 0. */ | |
| 1620 log_err("Primary!=0 or secondary!=0 but tertiary==0 in %08lX\n",
(long)ce); | |
| 1621 break; | |
| 1622 } | |
| 1623 if (tertiary <= 2) { | |
| 1624 log_err("Tertiary byte of %08lX out of range\n", (long)ce); | |
| 1625 break; | |
| 1626 } | |
| 1627 tertiaryDone = FALSE; | |
| 1628 } else { | |
| 1629 if ((ce & UCOL_REMOVE_CONTINUATION) == 0) { | |
| 1630 log_err("Empty continuation %08lX\n", (long)ce); | |
| 1631 break; | |
| 1632 } | |
| 1633 if (primaryDone && primary != 0) { | |
| 1634 log_err("Primary was done but continues in %08lX\n", (long)ce); | |
| 1635 break; | |
| 1636 } | |
| 1637 if (p1 == 0) { | |
| 1638 if (p2 != 0) { | |
| 1639 log_err("Primary 00 xx in %08lX\n", (long)ce); | |
| 1640 break; | |
| 1641 } | |
| 1642 primaryDone = TRUE; | |
| 1643 } else { | |
| 1644 if (p1 <= 2) { | |
| 1645 log_err("Primary first byte of %08lX out of range\n", (long)
ce); | |
| 1646 break; | |
| 1647 } | |
| 1648 if (p2 == 0) { | |
| 1649 primaryDone = TRUE; | |
| 1650 } else { | |
| 1651 if (p2 <= 3) { | |
| 1652 log_err("Primary second byte of %08lX out of range\n", (
long)ce); | |
| 1653 break; | |
| 1654 } | |
| 1655 } | |
| 1656 } | |
| 1657 if (secondaryDone && secondary != 0) { | |
| 1658 log_err("Secondary was done but continues in %08lX\n", (long)ce)
; | |
| 1659 break; | |
| 1660 } | |
| 1661 if (secondary == 0) { | |
| 1662 secondaryDone = TRUE; | |
| 1663 } else { | |
| 1664 if (secondary <= 2) { | |
| 1665 log_err("Secondary byte of %08lX out of range\n", (long)ce); | |
| 1666 break; | |
| 1667 } | |
| 1668 } | |
| 1669 if (tertiaryDone && tertiary != 0) { | |
| 1670 log_err("Tertiary was done but continues in %08lX\n", (long)ce); | |
| 1671 break; | |
| 1672 } | |
| 1673 if (tertiary == 0) { | |
| 1674 tertiaryDone = TRUE; | |
| 1675 } else if (tertiary <= 2) { | |
| 1676 log_err("Tertiary byte of %08lX out of range\n", (long)ce); | |
| 1677 break; | |
| 1678 } | |
| 1679 } | |
| 1680 } | |
| 1681 if (!result) { | |
| 1682 char codepointText[5*MAX_CODEPOINTS_TO_SHOW + 5]; | |
| 1683 showCodepoints(codepoints, length, codepointText); | |
| 1684 log_err("Locale: %s Code point string: %s\n", collLocale, codepointText
); | |
| 1685 } | |
| 1686 ucol_closeElements(iter); | |
| 1687 return result; | |
| 1688 } | |
| 1689 | |
| 1690 static const UChar IMPORT[] = { 0x5B, 0x69, 0x6D, 0x70, 0x6F, 0x72, 0x74, 0 };
/* "[import" */ | |
| 1691 | |
| 1692 static void TestCEValidity() | |
| 1693 { | |
| 1694 /* testing UCA collation elements */ | |
| 1695 UErrorCode status = U_ZERO_ERROR; | |
| 1696 /* en_US has no tailorings */ | |
| 1697 UCollator *coll = ucol_open("root", &status); | |
| 1698 /* tailored locales */ | |
| 1699 char locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh
__PINYIN"}; | |
| 1700 const char *loc; | |
| 1701 FileStream *file = NULL; | |
| 1702 char line[2048]; | |
| 1703 UChar codepoints[11]; | |
| 1704 int count = 0; | |
| 1705 int maxCount = 0; | |
| 1706 UChar contextCPs[3]; | |
| 1707 UChar32 c; | |
| 1708 UParseError parseError; | |
| 1709 if (U_FAILURE(status)) { | |
| 1710 log_err_status(status, "en_US collator creation failed -> %s\n", u_error
Name(status)); | |
| 1711 return; | |
| 1712 } | |
| 1713 log_verbose("Testing UCA elements\n"); | |
| 1714 file = getFractionalUCA(); | |
| 1715 if (file == NULL) { | |
| 1716 log_err("Fractional UCA data can not be opened\n"); | |
| 1717 return; | |
| 1718 } | |
| 1719 | |
| 1720 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { | |
| 1721 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || | |
| 1722 line[0] == 0x000D || line[0] == '[') { | |
| 1723 continue; | |
| 1724 } | |
| 1725 | |
| 1726 getCodePoints(line, codepoints, contextCPs); | |
| 1727 checkCEValidity(coll, codepoints, u_strlen(codepoints)); | |
| 1728 } | |
| 1729 | |
| 1730 log_verbose("Testing UCA elements for the whole range of unicode characters\
n"); | |
| 1731 for (c = 0; c <= 0xffff; ++c) { | |
| 1732 if (u_isdefined(c)) { | |
| 1733 codepoints[0] = (UChar)c; | |
| 1734 checkCEValidity(coll, codepoints, 1); | |
| 1735 } | |
| 1736 } | |
| 1737 for (; c <= 0x10ffff; ++c) { | |
| 1738 if (u_isdefined(c)) { | |
| 1739 int32_t i = 0; | |
| 1740 U16_APPEND_UNSAFE(codepoints, i, c); | |
| 1741 checkCEValidity(coll, codepoints, i); | |
| 1742 } | |
| 1743 } | |
| 1744 | |
| 1745 ucol_close(coll); | |
| 1746 | |
| 1747 /* testing tailored collation elements */ | |
| 1748 log_verbose("Testing tailored elements\n"); | |
| 1749 if(getTestOption(QUICK_OPTION)) { | |
| 1750 maxCount = sizeof(locale)/sizeof(locale[0]); | |
| 1751 } else { | |
| 1752 maxCount = uloc_countAvailable(); | |
| 1753 } | |
| 1754 while (count < maxCount) { | |
| 1755 const UChar *rules = NULL, | |
| 1756 *current = NULL; | |
| 1757 UChar *rulesCopy = NULL; | |
| 1758 int32_t ruleLen = 0; | |
| 1759 | |
| 1760 uint32_t chOffset = 0; | |
| 1761 uint32_t chLen = 0; | |
| 1762 uint32_t exOffset = 0; | |
| 1763 uint32_t exLen = 0; | |
| 1764 uint32_t prefixOffset = 0; | |
| 1765 uint32_t prefixLen = 0; | |
| 1766 UBool startOfRules = TRUE; | |
| 1767 UColOptionSet opts; | |
| 1768 | |
| 1769 UColTokenParser src; | |
| 1770 uint32_t strength = 0; | |
| 1771 uint16_t specs = 0; | |
| 1772 | |
| 1773 (void)specs; /* Suppress set but not used warnings. */ | |
| 1774 (void)strength; | |
| 1775 (void)prefixLen; | |
| 1776 (void)prefixOffset; | |
| 1777 (void)exLen; | |
| 1778 (void)exOffset; | |
| 1779 | |
| 1780 if(getTestOption(QUICK_OPTION)) { | |
| 1781 loc = locale[count]; | |
| 1782 } else { | |
| 1783 loc = uloc_getAvailable(count); | |
| 1784 if(!hasCollationElements(loc)) { | |
| 1785 count++; | |
| 1786 continue; | |
| 1787 } | |
| 1788 } | |
| 1789 status = U_ZERO_ERROR; // clear status from previous loop iteration | |
| 1790 | |
| 1791 uprv_memset(&src, 0, sizeof(UColTokenParser)); | |
| 1792 | |
| 1793 log_verbose("Testing CEs for %s\n", loc); | |
| 1794 | |
| 1795 coll = ucol_open(loc, &status); | |
| 1796 if (U_FAILURE(status)) { | |
| 1797 log_err("%s collator creation failed with status %s\n", loc, u_error
Name(status)); | |
| 1798 return; | |
| 1799 } | |
| 1800 | |
| 1801 src.opts = &opts; | |
| 1802 rules = ucol_getRules(coll, &ruleLen); | |
| 1803 | |
| 1804 /* | |
| 1805 * We have not set up the UColTokenParser with a callback function | |
| 1806 * to fetch [import] sub-rules, | |
| 1807 * so skip testing tailorings that import others. | |
| 1808 * TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet(
) | |
| 1809 * rather than the internal collation rule parser | |
| 1810 */ | |
| 1811 if (ruleLen > 0 && u_strstr(rules, IMPORT) == NULL) { | |
| 1812 rulesCopy = (UChar *)uprv_malloc((ruleLen + | |
| 1813 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); | |
| 1814 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar)); | |
| 1815 src.current = src.source = rulesCopy; | |
| 1816 src.end = rulesCopy + ruleLen; | |
| 1817 src.extraCurrent = src.end; | |
| 1818 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; | |
| 1819 | |
| 1820 /* Note that as a result of tickets 7015 or 6912, ucol_tok_parse
NextToken can cause the pointer to | |
| 1821 the rules copy in src.source to get reallocated, freeing the
original pointer in rulesCopy */ | |
| 1822 while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parse
Error,&status)) != NULL && U_SUCCESS(status)) { | |
| 1823 strength = src.parsedToken.strength; | |
| 1824 chOffset = src.parsedToken.charsOffset; | |
| 1825 chLen = src.parsedToken.charsLen; | |
| 1826 exOffset = src.parsedToken.extensionOffset; | |
| 1827 exLen = src.parsedToken.extensionLen; | |
| 1828 prefixOffset = src.parsedToken.prefixOffset; | |
| 1829 prefixLen = src.parsedToken.prefixLen; | |
| 1830 specs = src.parsedToken.flags; | |
| 1831 | |
| 1832 startOfRules = FALSE; | |
| 1833 uprv_memcpy(codepoints, src.source + chOffset, | |
| 1834 chLen * sizeof(UChar)); | |
| 1835 codepoints[chLen] = 0; | |
| 1836 checkCEValidity(coll, codepoints, chLen); | |
| 1837 } | |
| 1838 if (U_FAILURE(status)) { | |
| 1839 log_err("%s collator, ucol_tok_parseNextToken failed with status
%s\n", loc, u_errorName(status)); | |
| 1840 } | |
| 1841 uprv_free(src.source); | |
| 1842 uprv_free(src.reorderCodes); | |
| 1843 } | |
| 1844 | |
| 1845 ucol_close(coll); | |
| 1846 count ++; | |
| 1847 } | |
| 1848 T_FileStream_close(file); | |
| 1849 } | |
| 1850 | |
| 1851 static void printSortKeyError(const UChar *codepoints, int length, | |
| 1852 uint8_t *sortkey, int sklen) | |
| 1853 { | |
| 1854 int count = 0; | |
| 1855 log_err("Sortkey not valid for "); | |
| 1856 while (length > 0) { | |
| 1857 log_err("0x%04x ", *codepoints); | |
| 1858 length --; | |
| 1859 codepoints ++; | |
| 1860 } | |
| 1861 log_err("\nSortkey : "); | |
| 1862 while (count < sklen) { | |
| 1863 log_err("0x%02x ", sortkey[count]); | |
| 1864 count ++; | |
| 1865 } | |
| 1866 log_err("\n"); | |
| 1867 } | |
| 1868 | |
| 1869 /** | |
| 1870 * Checking sort key validity for all levels | |
| 1871 */ | |
| 1872 static UBool checkSortKeyValidity(UCollator *coll, | |
| 1873 const UChar *codepoints, | |
| 1874 int length) | |
| 1875 { | |
| 1876 UErrorCode status = U_ZERO_ERROR; | |
| 1877 UCollationStrength strength[5] = {UCOL_PRIMARY, UCOL_SECONDARY, | |
| 1878 UCOL_TERTIARY, UCOL_QUATERNARY, | |
| 1879 UCOL_IDENTICAL}; | |
| 1880 int strengthlen = 5; | |
| 1881 int strengthIndex = 0; | |
| 1882 int caselevel = 0; | |
| 1883 | |
| 1884 while (caselevel < 1) { | |
| 1885 if (caselevel == 0) { | |
| 1886 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_OFF, &status); | |
| 1887 } | |
| 1888 else { | |
| 1889 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status); | |
| 1890 } | |
| 1891 | |
| 1892 while (strengthIndex < strengthlen) { | |
| 1893 int count01 = 0; | |
| 1894 uint32_t count = 0; | |
| 1895 uint8_t sortkey[128]; | |
| 1896 uint32_t sklen; | |
| 1897 | |
| 1898 ucol_setStrength(coll, strength[strengthIndex]); | |
| 1899 sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128); | |
| 1900 while (sortkey[count] != 0) { | |
| 1901 if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 &
& strengthIndex != 4)) { | |
| 1902 printSortKeyError(codepoints, length, sortkey, sklen); | |
| 1903 return FALSE; | |
| 1904 } | |
| 1905 if (sortkey[count] == 1) { | |
| 1906 count01 ++; | |
| 1907 } | |
| 1908 count ++; | |
| 1909 } | |
| 1910 | |
| 1911 if (count + 1 != sklen || (count01 != strengthIndex + caselevel)) { | |
| 1912 printSortKeyError(codepoints, length, sortkey, sklen); | |
| 1913 return FALSE; | |
| 1914 } | |
| 1915 strengthIndex ++; | |
| 1916 } | |
| 1917 caselevel ++; | |
| 1918 } | |
| 1919 return TRUE; | |
| 1920 } | |
| 1921 | |
| 1922 static void TestSortKeyValidity(void) | |
| 1923 { | |
| 1924 /* testing UCA collation elements */ | |
| 1925 UErrorCode status = U_ZERO_ERROR; | |
| 1926 /* en_US has no tailorings */ | |
| 1927 UCollator *coll = ucol_open("en_US", &status); | |
| 1928 /* tailored locales */ | |
| 1929 char locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"}; | |
| 1930 FileStream *file = NULL; | |
| 1931 char line[2048]; | |
| 1932 UChar codepoints[10]; | |
| 1933 int count = 0; | |
| 1934 UChar contextCPs[5]; | |
| 1935 UParseError parseError; | |
| 1936 if (U_FAILURE(status)) { | |
| 1937 log_err_status(status, "en_US collator creation failed -> %s\n", u_error
Name(status)); | |
| 1938 return; | |
| 1939 } | |
| 1940 log_verbose("Testing UCA elements\n"); | |
| 1941 file = getFractionalUCA(); | |
| 1942 if (file == NULL) { | |
| 1943 log_err("Fractional UCA data can not be opened\n"); | |
| 1944 return; | |
| 1945 } | |
| 1946 | |
| 1947 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { | |
| 1948 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || | |
| 1949 line[0] == 0x000D || line[0] == '[') { | |
| 1950 continue; | |
| 1951 } | |
| 1952 | |
| 1953 getCodePoints(line, codepoints, contextCPs); | |
| 1954 if(codepoints[0] == 0xFFFE) { | |
| 1955 /* Skip special merge-sort character U+FFFE which has otherwise ille
gal 02 weight bytes. */ | |
| 1956 continue; | |
| 1957 } | |
| 1958 checkSortKeyValidity(coll, codepoints, u_strlen(codepoints)); | |
| 1959 } | |
| 1960 | |
| 1961 log_verbose("Testing UCA elements for the whole range of unicode characters\
n"); | |
| 1962 codepoints[0] = 0; | |
| 1963 | |
| 1964 while (codepoints[0] < 0xFFFF) { | |
| 1965 if (u_isdefined((UChar32)codepoints[0])) { | |
| 1966 checkSortKeyValidity(coll, codepoints, 1); | |
| 1967 } | |
| 1968 codepoints[0] ++; | |
| 1969 } | |
| 1970 | |
| 1971 ucol_close(coll); | |
| 1972 | |
| 1973 /* testing tailored collation elements */ | |
| 1974 log_verbose("Testing tailored elements\n"); | |
| 1975 while (count < 5) { | |
| 1976 const UChar *rules = NULL, | |
| 1977 *current = NULL; | |
| 1978 UChar *rulesCopy = NULL; | |
| 1979 int32_t ruleLen = 0; | |
| 1980 | |
| 1981 uint32_t chOffset = 0; | |
| 1982 uint32_t chLen = 0; | |
| 1983 uint32_t exOffset = 0; | |
| 1984 uint32_t exLen = 0; | |
| 1985 uint32_t prefixOffset = 0; | |
| 1986 uint32_t prefixLen = 0; | |
| 1987 UBool startOfRules = TRUE; | |
| 1988 UColOptionSet opts; | |
| 1989 | |
| 1990 UColTokenParser src; | |
| 1991 uint32_t strength = 0; | |
| 1992 uint16_t specs = 0; | |
| 1993 status = U_ZERO_ERROR; // clear status from previous loop iteration | |
| 1994 | |
| 1995 (void)specs; | |
| 1996 (void)strength; | |
| 1997 (void)prefixLen; | |
| 1998 (void)prefixOffset; | |
| 1999 (void)exLen; | |
| 2000 (void)exOffset; | |
| 2001 | |
| 2002 uprv_memset(&src, 0, sizeof(UColTokenParser)); | |
| 2003 | |
| 2004 coll = ucol_open(locale[count], &status); | |
| 2005 if (U_FAILURE(status)) { | |
| 2006 log_err("%s collator creation failed with status %s\n", locale[count
], u_errorName(status)); | |
| 2007 return; | |
| 2008 } | |
| 2009 | |
| 2010 src.opts = &opts; | |
| 2011 rules = ucol_getRules(coll, &ruleLen); | |
| 2012 | |
| 2013 /* | |
| 2014 * We have not set up the UColTokenParser with a callback function | |
| 2015 * to fetch [import] sub-rules, | |
| 2016 * so skip testing tailorings that import others. | |
| 2017 * TODO: Ticket #8047: Change TestSortKeyValidity to use ucol_getTailore
dSet() | |
| 2018 * rather than the internal collation rule parser | |
| 2019 */ | |
| 2020 if (ruleLen > 0 && u_strstr(rules, IMPORT) == NULL) { | |
| 2021 rulesCopy = (UChar *)uprv_malloc((ruleLen + | |
| 2022 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); | |
| 2023 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar)); | |
| 2024 src.current = src.source = rulesCopy; | |
| 2025 src.end = rulesCopy + ruleLen; | |
| 2026 src.extraCurrent = src.end; | |
| 2027 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; | |
| 2028 | |
| 2029 /* Note that as a result of tickets 7015 or 6912, ucol_tok_parse
NextToken can cause the pointer to | |
| 2030 the rules copy in src.source to get reallocated, freeing the
original pointer in rulesCopy */ | |
| 2031 while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseE
rror, &status)) != NULL && U_SUCCESS(status)) { | |
| 2032 strength = src.parsedToken.strength; | |
| 2033 chOffset = src.parsedToken.charsOffset; | |
| 2034 chLen = src.parsedToken.charsLen; | |
| 2035 exOffset = src.parsedToken.extensionOffset; | |
| 2036 exLen = src.parsedToken.extensionLen; | |
| 2037 prefixOffset = src.parsedToken.prefixOffset; | |
| 2038 prefixLen = src.parsedToken.prefixLen; | |
| 2039 specs = src.parsedToken.flags; | |
| 2040 | |
| 2041 startOfRules = FALSE; | |
| 2042 uprv_memcpy(codepoints, src.source + chOffset, | |
| 2043 chLen * sizeof(UChar)); | |
| 2044 codepoints[chLen] = 0; | |
| 2045 if(codepoints[0] == 0xFFFE) { | |
| 2046 /* Skip special merge-sort character U+FFFE which has otherw
ise illegal 02 weight bytes. */ | |
| 2047 continue; | |
| 2048 } | |
| 2049 checkSortKeyValidity(coll, codepoints, chLen); | |
| 2050 } | |
| 2051 if (U_FAILURE(status)) { | |
| 2052 log_err("%s collator, ucol_tok_parseNextToken failed with status
%s\n", locale[count], u_errorName(status)); | |
| 2053 } | |
| 2054 uprv_free(src.source); | |
| 2055 uprv_free(src.reorderCodes); | |
| 2056 } | |
| 2057 | |
| 2058 ucol_close(coll); | |
| 2059 count ++; | |
| 2060 } | |
| 2061 T_FileStream_close(file); | |
| 2062 } | |
| 2063 | |
| 2064 /** | 1113 /** |
| 2065 * TestSearchCollatorElements tests iterator behavior (forwards and backwards) wi
th | 1114 * TestSearchCollatorElements tests iterator behavior (forwards and backwards) wi
th |
| 2066 * normalization on AND jamo tailoring, among other things. | 1115 * normalization on AND jamo tailoring, among other things. |
| 1116 * |
| 1117 * Note: This test is sensitive to changes of the root collator, |
| 1118 * for example whether the ae-ligature maps to three CEs (as in the DUCET) |
| 1119 * or to two CEs (as in the CLDR 24 FractionalUCA.txt). |
| 1120 * It is also sensitive to how those CEs map to the iterator's 32-bit CE encoding
. |
| 1121 * For example, the DUCET's artificial secondary CE in the ae-ligature |
| 1122 * may map to two 32-bit iterator CEs (as it did until ICU 52). |
| 2067 */ | 1123 */ |
| 2068 static const UChar tsceText[] = { /* Nothing in here should be ignorable */ | 1124 static const UChar tsceText[] = { /* Nothing in here should be ignorable */ |
| 2069 0x0020, 0xAC00, /* simple LV Hangul */ | 1125 0x0020, 0xAC00, /* simple LV Hangul */ |
| 2070 0x0020, 0xAC01, /* simple LVT Hangul */ | 1126 0x0020, 0xAC01, /* simple LVT Hangul */ |
| 2071 0x0020, 0xAC0F, /* LVTT, last jamo expands for search */ | 1127 0x0020, 0xAC0F, /* LVTT, last jamo expands for search */ |
| 2072 0x0020, 0xAFFF, /* LLVVVTT, every jamo expands for search */ | 1128 0x0020, 0xAFFF, /* LLVVVTT, every jamo expands for search */ |
| 2073 0x0020, 0x1100, 0x1161, 0x11A8, /* 0xAC01 as conjoining jamo */ | 1129 0x0020, 0x1100, 0x1161, 0x11A8, /* 0xAC01 as conjoining jamo */ |
| 2074 0x0020, 0x3131, 0x314F, 0x3131, /* 0xAC01 as compatibility jamo */ | 1130 0x0020, 0x3131, 0x314F, 0x3131, /* 0xAC01 as compatibility jamo */ |
| 2075 0x0020, 0x1100, 0x1161, 0x11B6, /* 0xAC0F as conjoining jamo; last expands f
or search */ | 1131 0x0020, 0x1100, 0x1161, 0x11B6, /* 0xAC0F as conjoining jamo; last expands f
or search */ |
| 2076 0x0020, 0x1101, 0x1170, 0x11B6, /* 0xAFFF as conjoining jamo; all expand for
search */ | 1132 0x0020, 0x1101, 0x1170, 0x11B6, /* 0xAFFF as conjoining jamo; all expand for
search */ |
| 2077 0x0020, 0x00E6, /* small letter ae, expands */ | 1133 0x0020, 0x00E6, /* small letter ae, expands */ |
| 2078 0x0020, 0x1E4D, /* small letter o with tilde and acute, deco
mposes */ | 1134 0x0020, 0x1E4D, /* small letter o with tilde and acute, deco
mposes */ |
| 2079 0x0020 | 1135 0x0020 |
| 2080 }; | 1136 }; |
| 2081 enum { kLen_tsceText = sizeof(tsceText)/sizeof(tsceText[0]) }; | 1137 enum { kLen_tsceText = sizeof(tsceText)/sizeof(tsceText[0]) }; |
| 2082 | 1138 |
| 2083 static const int32_t rootStandardOffsets[] = { | 1139 static const int32_t rootStandardOffsets[] = { |
| 2084 0, 1,2, | 1140 0, 1,2, |
| 2085 2, 3,4,4, | 1141 2, 3,4,4, |
| 2086 4, 5,6,6, | 1142 4, 5,6,6, |
| 2087 6, 7,8,8, | 1143 6, 7,8,8, |
| 2088 8, 9,10,11, | 1144 8, 9,10,11, |
| 2089 12, 13,14,15, | 1145 12, 13,14,15, |
| 2090 16, 17,18,19, | 1146 16, 17,18,19, |
| 2091 20, 21,22,23, | 1147 20, 21,22,23, |
| 2092 24, 25,26,26,26, | 1148 24, 25,26, /* plus another 1-2 offset=26 if ae-ligature maps to three CEs *
/ |
| 2093 26, 27,28,28, | 1149 26, 27,28,28, |
| 2094 28, | 1150 28, |
| 2095 29 | 1151 29 |
| 2096 }; | 1152 }; |
| 2097 enum { kLen_rootStandardOffsets = sizeof(rootStandardOffsets)/sizeof(rootStandar
dOffsets[0]) }; | 1153 enum { kLen_rootStandardOffsets = sizeof(rootStandardOffsets)/sizeof(rootStandar
dOffsets[0]) }; |
| 2098 | 1154 |
| 2099 static const int32_t rootSearchOffsets[] = { | 1155 static const int32_t rootSearchOffsets[] = { |
| 2100 0, 1,2, | 1156 0, 1,2, |
| 2101 2, 3,4,4, | 1157 2, 3,4,4, |
| 2102 4, 5,6,6,6, | 1158 4, 5,6,6,6, |
| 2103 6, 7,8,8,8,8,8,8, | 1159 6, 7,8,8,8,8,8,8, |
| 2104 8, 9,10,11, | 1160 8, 9,10,11, |
| 2105 12, 13,14,15, | 1161 12, 13,14,15, |
| 2106 16, 17,18,19,20, | 1162 16, 17,18,19,20, |
| 2107 20, 21,22,22,23,23,23,24, | 1163 20, 21,22,22,23,23,23,24, |
| 2108 24, 25,26,26,26, | 1164 24, 25,26, /* plus another 1-2 offset=26 if ae-ligature maps to three CEs *
/ |
| 2109 26, 27,28,28, | 1165 26, 27,28,28, |
| 2110 28, | 1166 28, |
| 2111 29 | 1167 29 |
| 2112 }; | 1168 }; |
| 2113 enum { kLen_rootSearchOffsets = sizeof(rootSearchOffsets)/sizeof(rootSearchOffse
ts[0]) }; | 1169 enum { kLen_rootSearchOffsets = sizeof(rootSearchOffsets)/sizeof(rootSearchOffse
ts[0]) }; |
| 2114 | 1170 |
| 2115 typedef struct { | 1171 typedef struct { |
| 2116 const char * locale; | 1172 const char * locale; |
| 2117 const int32_t * offsets; | 1173 const int32_t * offsets; |
| 2118 int32_t offsetsLen; | 1174 int32_t offsetsLen; |
| (...skipping 16 matching lines...) Expand all Loading... |
| 2135 if ( U_SUCCESS(status) ) { | 1191 if ( U_SUCCESS(status) ) { |
| 2136 int32_t offset, element; | 1192 int32_t offset, element; |
| 2137 const int32_t * nextOffsetPtr; | 1193 const int32_t * nextOffsetPtr; |
| 2138 const int32_t * limitOffsetPtr; | 1194 const int32_t * limitOffsetPtr; |
| 2139 | 1195 |
| 2140 nextOffsetPtr = tsceItemPtr->offsets; | 1196 nextOffsetPtr = tsceItemPtr->offsets; |
| 2141 limitOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen; | 1197 limitOffsetPtr = tsceItemPtr->offsets + tsceItemPtr->offsetsLen; |
| 2142 do { | 1198 do { |
| 2143 offset = ucol_getOffset(uce); | 1199 offset = ucol_getOffset(uce); |
| 2144 element = ucol_next(uce, &status); | 1200 element = ucol_next(uce, &status); |
| 1201 log_verbose("(%s) offset=%2d ce=%08x\n", tsceItemPtr->local
e, offset, element); |
| 2145 if ( element == 0 ) { | 1202 if ( element == 0 ) { |
| 2146 log_err("error, locale %s, ucol_next returned element 0\
n", tsceItemPtr->locale ); | 1203 log_err("error, locale %s, ucol_next returned element 0\
n", tsceItemPtr->locale ); |
| 2147 } | 1204 } |
| 2148 if ( nextOffsetPtr < limitOffsetPtr ) { | 1205 if ( nextOffsetPtr < limitOffsetPtr ) { |
| 2149 if (offset != *nextOffsetPtr) { | 1206 if (offset != *nextOffsetPtr) { |
| 2150 log_err("error, locale %s, expected ucol_next -> uco
l_getOffset %d, got %d\n", | 1207 log_err("error, locale %s, expected ucol_next -> uco
l_getOffset %d, got %d\n", |
| 2151 tsceItemPtr->locale,
*nextOffsetPtr, offset ); | 1208 tsceItemPtr->locale,
*nextOffsetPtr, offset ); |
| 2152 nextOffsetPtr = limitOffsetPtr; | 1209 nextOffsetPtr = limitOffsetPtr; |
| 2153 break; | 1210 break; |
| 2154 } | 1211 } |
| (...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2192 log_err("error, locale %s, ucol_openElements failed: %s\n", tsce
ItemPtr->locale, u_errorName(status) ); | 1249 log_err("error, locale %s, ucol_openElements failed: %s\n", tsce
ItemPtr->locale, u_errorName(status) ); |
| 2193 } | 1250 } |
| 2194 ucol_close(ucol); | 1251 ucol_close(ucol); |
| 2195 } else { | 1252 } else { |
| 2196 log_data_err("error, locale %s, ucol_open failed: %s\n", tsceItemPtr
->locale, u_errorName(status) ); | 1253 log_data_err("error, locale %s, ucol_open failed: %s\n", tsceItemPtr
->locale, u_errorName(status) ); |
| 2197 } | 1254 } |
| 2198 } | 1255 } |
| 2199 } | 1256 } |
| 2200 | 1257 |
| 2201 #endif /* #if !UCONFIG_NO_COLLATION */ | 1258 #endif /* #if !UCONFIG_NO_COLLATION */ |
| OLD | NEW |