OLD | NEW |
1 /******************************************************************** | 1 /******************************************************************** |
2 * COPYRIGHT: | 2 * COPYRIGHT: |
3 * Copyright (c) 1999-2010, International Business Machines Corporation and | 3 * Copyright (c) 1999-2010, International Business Machines Corporation and |
4 * others. All Rights Reserved. | 4 * others. All Rights Reserved. |
5 ********************************************************************/ | 5 ********************************************************************/ |
6 /************************************************************************ | 6 /************************************************************************ |
7 * Date Name Description | 7 * Date Name Description |
8 * 12/15/99 Madhu Creation. | 8 * 12/15/99 Madhu Creation. |
9 * 01/12/2000 Madhu Updated for changed API and added new tests | 9 * 01/12/2000 Madhu Updated for changed API and added new tests |
10 ************************************************************************/ | 10 ************************************************************************/ |
(...skipping 17 matching lines...) Expand all Loading... |
28 #include "unicode/utext.h" | 28 #include "unicode/utext.h" |
29 #include "intltest.h" | 29 #include "intltest.h" |
30 #include "rbbitst.h" | 30 #include "rbbitst.h" |
31 #include <string.h> | 31 #include <string.h> |
32 #include "uvector.h" | 32 #include "uvector.h" |
33 #include "uvectr32.h" | 33 #include "uvectr32.h" |
34 #include "triedict.h" | 34 #include "triedict.h" |
35 #include <string.h> | 35 #include <string.h> |
36 #include <stdio.h> | 36 #include <stdio.h> |
37 #include <stdlib.h> | 37 #include <stdlib.h> |
| 38 #include "unicode/numfmt.h" |
| 39 #include "unicode/uscript.h" |
38 | 40 |
39 #define TEST_ASSERT(x) {if (!(x)) { \ | 41 #define TEST_ASSERT(x) {if (!(x)) { \ |
40 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} | 42 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} |
41 | 43 |
42 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ | 44 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ |
43 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__, u_errorName(errcode));}} | 45 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__
, __LINE__, u_errorName(errcode));}} |
44 | 46 |
45 | 47 |
46 //--------------------------------------------- | 48 //--------------------------------------------- |
47 // runIndexedTest | 49 // runIndexedTest |
(...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
131 case 20: name = "TestTrieDict"; | 133 case 20: name = "TestTrieDict"; |
132 if(exec) TestTrieDict(); break; | 134 if(exec) TestTrieDict(); break; |
133 | 135 |
134 #if !UCONFIG_NO_FILE_IO | 136 #if !UCONFIG_NO_FILE_IO |
135 case 21: name = "TestBug5775"; | 137 case 21: name = "TestBug5775"; |
136 if (exec) TestBug5775(); break; | 138 if (exec) TestBug5775(); break; |
137 case 22: name = "TestThaiBreaks"; | 139 case 22: name = "TestThaiBreaks"; |
138 if (exec) TestThaiBreaks(); break; | 140 if (exec) TestThaiBreaks(); break; |
139 case 23: name = "TestTailoredBreaks"; | 141 case 23: name = "TestTailoredBreaks"; |
140 if (exec) TestTailoredBreaks(); break; | 142 if (exec) TestTailoredBreaks(); break; |
| 143 case 24: name = "TestTrieDictWithValue"; |
| 144 if(exec) TestTrieDictWithValue(); break; |
141 #else | 145 #else |
142 case 21: case 22: case 23: name = "skip"; | 146 case 21: case 22: case 23: case 24: name = "skip"; |
143 break; | 147 break; |
144 #endif | 148 #endif |
145 case 24: name = "TestDictRules"; | 149 case 25: name = "TestDictRules"; |
146 if (exec) TestDictRules(); break; | 150 if (exec) TestDictRules(); break; |
147 case 25: name = "TestBug5532"; | 151 case 25: name = "TestBug5532"; |
148 if (exec) TestBug5532(); break; | 152 if (exec) TestBug5532(); break; |
149 default: name = ""; break; //needed to end loop | 153 default: name = ""; break; //needed to end loop |
150 } | 154 } |
151 } | 155 } |
152 | 156 |
153 | 157 |
154 //--------------------------------------------------------------------------- | 158 //--------------------------------------------------------------------------- |
155 // | 159 // |
(...skipping 444 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
600 startOfSecondWord = bi->following(0); | 604 startOfSecondWord = bi->following(0); |
601 if (startOfSecondWord != 4) { | 605 if (startOfSecondWord != 4) { |
602 errln("Fail at file %s, line %d expected start of word at 4, got %d", | 606 errln("Fail at file %s, line %d expected start of word at 4, got %d", |
603 __FILE__, __LINE__, startOfSecondWord); | 607 __FILE__, __LINE__, startOfSecondWord); |
604 } | 608 } |
605 delete bi; | 609 delete bi; |
606 } | 610 } |
607 | 611 |
608 | 612 |
609 void RBBITest::TestJapaneseWordBreak() { | 613 void RBBITest::TestJapaneseWordBreak() { |
| 614 // TODO: Rewrite this test for a dictionary-based word breaking. |
| 615 #if 0 |
610 UErrorCode status = U_ZERO_ERROR; | 616 UErrorCode status = U_ZERO_ERROR; |
611 BITestData japaneseWordSelection(status); | 617 BITestData japaneseWordSelection(status); |
612 | 618 |
613 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at
start of data | 619 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at
start of data |
614 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2 | 620 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2 |
615 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status);
//5 | 621 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status);
//5 |
616 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7 | 622 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7 |
617 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status);
//10 | 623 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status);
//10 |
618 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11 | 624 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11 |
619 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12 | 625 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12 |
620 | 626 |
621 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createW
ordInstance( | 627 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createW
ordInstance( |
622 Locale("ja"), status); | 628 Locale("ja"), status); |
623 if (U_FAILURE(status)) | 629 if (U_FAILURE(status)) |
624 { | 630 { |
625 errcheckln(status, "Failed to create the BreakIterator for Japanese loca
le in TestJapaneseWordBreak.\n"); | 631 errcheckln(status, "Failed to create the BreakIterator for Japanese loca
le in TestJapaneseWordBreak.\n"); |
626 return; | 632 return; |
627 } | 633 } |
628 | 634 |
629 generalIteratorTest(*e, japaneseWordSelection); | 635 generalIteratorTest(*e, japaneseWordSelection); |
630 delete e; | 636 delete e; |
| 637 #endif |
631 } | 638 } |
632 | 639 |
633 void RBBITest::TestTrieDict() { | 640 void RBBITest::TestTrieDict() { |
634 UErrorCode status = U_ZERO_ERROR; | 641 UErrorCode status = U_ZERO_ERROR; |
635 | 642 |
636 // | 643 // |
637 // Open and read the test data file. | 644 // Open and read the test data file. |
638 // | 645 // |
639 const char *testDataDirectory = IntlTest::getSourceTestData(status); | 646 const char *testDataDirectory = IntlTest::getSourceTestData(status); |
640 char testFileName[1000]; | 647 char testFileName[1000]; |
(...skipping 201 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
842 delete compactDict; | 849 delete compactDict; |
843 delete mutableDict; | 850 delete mutableDict; |
844 delete breaks; | 851 delete breaks; |
845 delete[] testFile; | 852 delete[] testFile; |
846 delete enumer1; | 853 delete enumer1; |
847 delete mutable2; | 854 delete mutable2; |
848 delete cloneEnum; | 855 delete cloneEnum; |
849 delete compact2; | 856 delete compact2; |
850 } | 857 } |
851 | 858 |
| 859 /*TODO: delete later*/ |
| 860 inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){ |
| 861 UErrorCode status = U_ZERO_ERROR; |
| 862 FILE *outfile = fopen(filename,"w"); |
| 863 UConverter *cvt = ucnv_open("UTF-8", &status); |
| 864 if (U_FAILURE(status)) |
| 865 return; |
| 866 if(outfile != NULL){ |
| 867 status = U_ZERO_ERROR; |
| 868 const UnicodeString *word = enumer->snext(status); |
| 869 while (word != NULL && U_SUCCESS(status)) { |
| 870 char u8word[500]; |
| 871 status = U_ZERO_ERROR; |
| 872 ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length(), |
| 873 &status); |
| 874 fprintf(outfile,"%s\n", u8word); |
| 875 status = U_ZERO_ERROR; |
| 876 word = enumer->snext(status); |
| 877 } |
| 878 fclose(outfile); |
| 879 } |
| 880 ucnv_close(cvt); |
| 881 } |
| 882 |
// A very simple helper class to streamline the buffer handling in
// TestTrieDictWithValue.
//
// Provides storage for `size` elements of type T. Requests of up to N
// elements are served from an array embedded in the object; larger requests
// are served from a new[] allocation that the destructor releases.
// Element access is unchecked.
//
// The class owns its heap allocation, so copying is disabled: a copy would
// either delete the same array twice or point into the other object's
// embedded array.
template<class T, size_t N>
class AutoBuffer {
 public:
    // Reserves room for `size` elements; allocates only when size > N.
    explicit AutoBuffer(size_t size) : buffer(stackBuffer) {
        if (size > N) {
            buffer = new T[size];
        }
    }
    ~AutoBuffer() {
        // Only the heap case needs releasing.
        if (buffer != stackBuffer) {
            delete [] buffer;
        }
    }
    // Pointer to the first element, for APIs that take a raw array.
    T* elems() {
        return buffer;
    }
    const T& operator[] (size_t i) const {
        return buffer[i];
    }
    T& operator[] (size_t i) {
        return buffer[i];
    }
 private:
    T stackBuffer[N];
    T* buffer;
    // Declared but intentionally not defined: no default construction,
    // no copying, no assignment.
    AutoBuffer();
    AutoBuffer(const AutoBuffer&);
    AutoBuffer& operator=(const AutoBuffer&);
};
| 910 |
| 911 //---------------------------------------------------------------------------- |
| 912 // |
| 913 // TestTrieDictWithValue Test trie dictionaries with logprob values and |
| 914 // more than 2^16 nodes after compaction. |
| 915 // |
| 916 //---------------------------------------------------------------------------- |
| 917 void RBBITest::TestTrieDictWithValue() { |
| 918 UErrorCode status = U_ZERO_ERROR; |
| 919 |
| 920 // |
| 921 // Open and read the test data file. |
| 922 // |
| 923 const char *testDataDirectory = IntlTest::getSourceTestData(status); |
| 924 const char *filename = "cjdict-truncated.txt"; |
| 925 char testFileName[1000]; |
| 926 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename
) + 10 >= sizeof(testFileName)) { |
| 927 errln("Can't open test data. Path too long."); |
| 928 return; |
| 929 } |
| 930 strcpy(testFileName, testDataDirectory); |
| 931 strcat(testFileName, filename); |
| 932 |
| 933 // Items needing deleting at the end |
| 934 MutableTrieDictionary *mutableDict = NULL; |
| 935 CompactTrieDictionary *compactDict = NULL; |
| 936 UnicodeSet *breaks = NULL; |
| 937 UChar *testFile = NULL; |
| 938 StringEnumeration *enumer1 = NULL; |
| 939 StringEnumeration *enumer2 = NULL; |
| 940 MutableTrieDictionary *mutable2 = NULL; |
| 941 StringEnumeration *cloneEnum = NULL; |
| 942 CompactTrieDictionary *compact2 = NULL; |
| 943 NumberFormat *nf = NULL; |
| 944 UText *originalText = NULL, *cloneText = NULL; |
| 945 |
| 946 const UnicodeString *originalWord = NULL; |
| 947 const UnicodeString *cloneWord = NULL; |
| 948 UChar *current; |
| 949 UChar *word; |
| 950 UChar uc; |
| 951 int32_t wordLen; |
| 952 int32_t wordCount; |
| 953 int32_t testCount; |
| 954 int32_t valueLen; |
| 955 int counter = 0; |
| 956 |
| 957 int len; |
| 958 testFile = ReadAndConvertFile(testFileName, len, NULL, status); |
| 959 if (U_FAILURE(status)) { |
| 960 goto cleanup; /* something went wrong, error already output */ |
| 961 } |
| 962 |
| 963 mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE); |
| 964 if (U_FAILURE(status)) { |
| 965 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status))
; |
| 966 goto cleanup; |
| 967 } |
| 968 |
| 969 breaks = new UnicodeSet; |
| 970 breaks->add(0x000A); // Line Feed |
| 971 breaks->add(0x000D); // Carriage Return |
| 972 breaks->add(0x2028); // Line Separator |
| 973 breaks->add(0x2029); // Paragraph Separator |
| 974 breaks->add(0x0009); // Tab character |
| 975 |
| 976 // Now add each non-comment line of the file as a word. |
| 977 current = testFile; |
| 978 word = current; |
| 979 uc = *current++; |
| 980 wordLen = 0; |
| 981 wordCount = 0; |
| 982 nf = NumberFormat::createInstance(status); |
| 983 |
| 984 while (uc) { |
| 985 UnicodeString ucharValue; |
| 986 valueLen = 0; |
| 987 |
| 988 if (uc == 0x0023) { // #comment line, skip |
| 989 while (uc && !breaks->contains(uc)) { |
| 990 uc = *current++; |
| 991 } |
| 992 } |
| 993 else{ |
| 994 while (uc && !breaks->contains(uc)) { |
| 995 ++wordLen; |
| 996 uc = *current++; |
| 997 } |
| 998 if(uc == 0x0009){ //separator is a tab char, read in num after tab |
| 999 uc = *current++; |
| 1000 while (uc && !breaks->contains(uc)) { |
| 1001 ucharValue.append(uc); |
| 1002 uc = *current++; |
| 1003 } |
| 1004 } |
| 1005 } |
| 1006 if (wordLen > 0) { |
| 1007 Formattable value((int32_t)0); |
| 1008 nf->parse(ucharValue.getTerminatedBuffer(), value, status); |
| 1009 |
| 1010 if(U_FAILURE(status)){ |
| 1011 errln("parsing of value failed when reading in dictionary\n"); |
| 1012 goto cleanup; |
| 1013 } |
| 1014 mutableDict->addWord(word, wordLen, status, value.getLong()); |
| 1015 if (U_FAILURE(status)) { |
| 1016 errln("Could not add word to mutable dictionary; status %s\n", u
_errorName(status)); |
| 1017 goto cleanup; |
| 1018 } |
| 1019 wordCount += 1; |
| 1020 } |
| 1021 |
| 1022 // Find beginning of next line |
| 1023 while (uc && breaks->contains(uc)) { |
| 1024 uc = *current++; |
| 1025 } |
| 1026 word = current-1; |
| 1027 wordLen = 0; |
| 1028 } |
| 1029 |
| 1030 if (wordCount < 50) { |
| 1031 errln("Word count (%d) unreasonably small\n", wordCount); |
| 1032 goto cleanup; |
| 1033 } |
| 1034 |
| 1035 enumer1 = mutableDict->openWords(status); |
| 1036 if (U_FAILURE(status)) { |
| 1037 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(
status)); |
| 1038 goto cleanup; |
| 1039 } |
| 1040 |
| 1041 testCount = 0; |
| 1042 if (wordCount != (testCount = enumer1->count(status))) { |
| 1043 errln("MutableTrieDictionary word count (%d) differs from file word coun
t (%d), with status %s\n", |
| 1044 testCount, wordCount, u_errorName(status)); |
| 1045 goto cleanup; |
| 1046 } |
| 1047 |
| 1048 // Now compact it |
| 1049 compactDict = new CompactTrieDictionary(*mutableDict, status); |
| 1050 if (U_FAILURE(status)) { |
| 1051 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status
)); |
| 1052 goto cleanup; |
| 1053 } |
| 1054 |
| 1055 enumer2 = compactDict->openWords(status); |
| 1056 if (U_FAILURE(status)) { |
| 1057 errln("Could not open compact trie dictionary enumerator: %s\n", u_error
Name(status)); |
| 1058 goto cleanup; |
| 1059 } |
| 1060 |
| 1061 |
| 1062 //delete later |
| 1063 // writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt"); |
| 1064 // writeEnumerationToFile(enumer2, "/home/jchye/compact.txt"); |
| 1065 |
| 1066 enumer1->reset(status); |
| 1067 enumer2->reset(status); |
| 1068 |
| 1069 originalWord = enumer1->snext(status); |
| 1070 cloneWord = enumer2->snext(status); |
| 1071 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { |
| 1072 if (*originalWord != *cloneWord) { |
| 1073 errln("MutableTrieDictionary and CompactTrieDictionary word mismatch
at %d, lengths are %d and %d\n", |
| 1074 counter, originalWord->length(), cloneWord->length()); |
| 1075 goto cleanup; |
| 1076 } |
| 1077 |
| 1078 // check if attached values of the same word in both dictionaries tally |
| 1079 #if 0 |
| 1080 int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()]; |
| 1081 uint16_t values1[originalWord->length()], values2[cloneWord->length()]; |
| 1082 #endif |
| 1083 AutoBuffer<int32_t, 20> lengths1(originalWord->length()); |
| 1084 AutoBuffer<int32_t, 20> lengths2(cloneWord->length()); |
| 1085 AutoBuffer<uint16_t, 20> values1(originalWord->length()); |
| 1086 AutoBuffer<uint16_t, 20> values2(cloneWord->length()); |
| 1087 |
| 1088 originalText = utext_openConstUnicodeString(originalText, originalWord,
&status); |
| 1089 cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status); |
| 1090 |
| 1091 int count1, count2; |
| 1092 mutableDict->matches(originalText, originalWord->length(), lengths1.elem
s(), count1, originalWord->length(), values1.elems()); |
| 1093 compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), c
ount2, cloneWord->length(), values2.elems()); |
| 1094 |
| 1095 if(values1[count1-1] != values2[count2-1]){ |
| 1096 errln("Values of word %d in MutableTrieDictionary and CompactTrieDic
tionary do not match, with values %d and %d\n", |
| 1097 counter, values1[count1-1], values2[count2-1]); |
| 1098 goto cleanup; |
| 1099 } |
| 1100 |
| 1101 counter++; |
| 1102 originalWord = enumer1->snext(status); |
| 1103 cloneWord = enumer2->snext(status); |
| 1104 } |
| 1105 if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) { |
| 1106 errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are th
e same"); |
| 1107 } |
| 1108 |
| 1109 delete enumer1; |
| 1110 enumer1 = NULL; |
| 1111 delete enumer2; |
| 1112 enumer2 = NULL; |
| 1113 |
| 1114 // Now un-compact it |
| 1115 mutable2 = compactDict->cloneMutable(status); |
| 1116 if (U_FAILURE(status)) { |
| 1117 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %
s\n", u_errorName(status)); |
| 1118 goto cleanup; |
| 1119 } |
| 1120 |
| 1121 cloneEnum = mutable2->openWords(status); |
| 1122 if (U_FAILURE(status)) { |
| 1123 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(st
atus)); |
| 1124 goto cleanup; |
| 1125 } |
| 1126 |
| 1127 if (wordCount != (testCount = cloneEnum->count(status))) { |
| 1128 errln("Cloned MutableTrieDictionary word count (%d) differs from file wo
rd count (%d), with status %s\n", |
| 1129 testCount, wordCount, u_errorName(status)); |
| 1130 goto cleanup; |
| 1131 } |
| 1132 |
| 1133 // Compact original dictionary to clone. Note that we can only compare the s
ame kind of |
| 1134 // dictionary as the order of the enumerators is not guaranteed to be the sa
me between |
| 1135 // different kinds |
| 1136 enumer1 = mutableDict->openWords(status); |
| 1137 if (U_FAILURE(status)) { |
| 1138 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorNa
me(status)); |
| 1139 goto cleanup; |
| 1140 } |
| 1141 |
| 1142 counter = 0; |
| 1143 originalWord = enumer1->snext(status); |
| 1144 cloneWord = cloneEnum->snext(status); |
| 1145 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) { |
| 1146 if (*originalWord != *cloneWord) { |
| 1147 errln("Original and cloned MutableTrieDictionary word mismatch\n"); |
| 1148 goto cleanup; |
| 1149 } |
| 1150 |
| 1151 // check if attached values of the same word in both dictionaries tally |
| 1152 AutoBuffer<int32_t, 20> lengths1(originalWord->length()); |
| 1153 AutoBuffer<int32_t, 20> lengths2(cloneWord->length()); |
| 1154 AutoBuffer<uint16_t, 20> values1(originalWord->length()); |
| 1155 AutoBuffer<uint16_t, 20> values2(cloneWord->length()); |
| 1156 originalText = utext_openConstUnicodeString(originalText, originalWord,
&status); |
| 1157 cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status); |
| 1158 |
| 1159 int count1, count2; |
| 1160 mutableDict->matches(originalText, originalWord->length(), lengths1.elem
s(), count1, originalWord->length(), values1.elems()); |
| 1161 mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), coun
t2, cloneWord->length(), values2.elems()); |
| 1162 |
| 1163 if(values1[count1-1] != values2[count2-1]){ |
| 1164 errln("Values of word %d in original and cloned MutableTrieDictionar
y do not match, with values %d and %d\n", |
| 1165 counter, values1[count1-1], values2[count2-1]); |
| 1166 goto cleanup; |
| 1167 } |
| 1168 |
| 1169 counter++; |
| 1170 |
| 1171 originalWord = enumer1->snext(status); |
| 1172 cloneWord = cloneEnum->snext(status); |
| 1173 } |
| 1174 |
| 1175 if (U_FAILURE(status)) { |
| 1176 errln("Enumeration failed: %s\n", u_errorName(status)); |
| 1177 goto cleanup; |
| 1178 } |
| 1179 |
| 1180 if (originalWord != cloneWord) { |
| 1181 errln("Original and cloned MutableTrieDictionary ended enumeration at di
fferent points\n"); |
| 1182 goto cleanup; |
| 1183 } |
| 1184 |
| 1185 // Test the data copying constructor for CompactTrieDict, and the data acces
s APIs. |
| 1186 compact2 = new CompactTrieDictionary(compactDict->data(), status); |
| 1187 if (U_FAILURE(status)) { |
| 1188 errln("CompactTrieDictionary(const void *,...) failed\n"); |
| 1189 goto cleanup; |
| 1190 } |
| 1191 |
| 1192 if (compact2->dataSize() == 0) { |
| 1193 errln("CompactTrieDictionary->dataSize() == 0\n"); |
| 1194 goto cleanup; |
| 1195 } |
| 1196 |
| 1197 // Now count the words via the second dictionary |
| 1198 delete enumer1; |
| 1199 enumer1 = compact2->openWords(status); |
| 1200 if (U_FAILURE(status)) { |
| 1201 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_err
orName(status)); |
| 1202 goto cleanup; |
| 1203 } |
| 1204 |
| 1205 if (wordCount != (testCount = enumer1->count(status))) { |
| 1206 errln("CompactTrieDictionary 2 word count (%d) differs from file word co
unt (%d), with status %s\n", |
| 1207 testCount, wordCount, u_errorName(status)); |
| 1208 goto cleanup; |
| 1209 } |
| 1210 |
| 1211 cleanup: |
| 1212 delete compactDict; |
| 1213 delete mutableDict; |
| 1214 delete breaks; |
| 1215 delete[] testFile; |
| 1216 delete enumer1; |
| 1217 delete mutable2; |
| 1218 delete cloneEnum; |
| 1219 delete compact2; |
| 1220 utext_close(originalText); |
| 1221 utext_close(cloneText); |
| 1222 |
| 1223 |
| 1224 } |
852 | 1225 |
853 //---------------------------------------------------------------------------- | 1226 //---------------------------------------------------------------------------- |
854 // | 1227 // |
855 // generalIteratorTest Given a break iterator and a set of test data, | 1228 // generalIteratorTest Given a break iterator and a set of test data, |
856 // Run the tests and report the results. | 1229 // Run the tests and report the results. |
857 // | 1230 // |
858 //---------------------------------------------------------------------------- | 1231 //---------------------------------------------------------------------------- |
859 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) | 1232 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) |
860 { | 1233 { |
861 | 1234 |
(...skipping 1001 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1863 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" | 2236 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" |
1864 // Words don't include colon or period (cldrbug #1969). | 2237 // Words don't include colon or period (cldrbug #1969). |
1865 static const char posxWordText[] = "Can't have breaks in xx:yy or struct.
field for CS-types."; | 2238 static const char posxWordText[] = "Can't have breaks in xx:yy or struct.
field for CS-types."; |
1866 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24
, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 }; | 2239 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24
, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 }; |
1867 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21,
26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 }; | 2240 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21,
26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 }; |
1868 | 2241 |
1869 // UBreakIteratorType UBRK_WORD, Locale "ja" | 2242 // UBreakIteratorType UBRK_WORD, Locale "ja" |
1870 // Don't break in runs of hiragana or runs of ideograph, where the latter includ
es \u3005 \u3007 \u303B (cldrbug #2009). | 2243 // Don't break in runs of hiragana or runs of ideograph, where the latter includ
es \u3005 \u3007 \u303B (cldrbug #2009). |
1871 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3
007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF" | 2244 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3
007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF" |
1872 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3
005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002"; | 2245 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3
005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002"; |
| 2246 #if 0 |
1873 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17
, 18, 20, 21, 24, 27, 28 }; | 2247 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17
, 18, 20, 21, 24, 27, 28 }; |
1874 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17
, 18, 19, 20, 21, 24, 25, 26, 27, 28 }; | 2248 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17
, 18, 19, 20, 21, 24, 25, 26, 27, 28 }; |
| 2249 #endif |
// There's no separate Japanese word break iterator. Root is the same as Japanese.
// Our dictionary-based iterator has to be tweaked to better handle U+3005,
// U+3007, U+303B and some other cases.
| 2253 static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15
, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; |
| 2254 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15
, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 }; |
1875 | 2255 |
1876 // UBreakIteratorType UBRK_SENTENCE, Locale "el" | 2256 // UBreakIteratorType UBRK_SENTENCE, Locale "el" |
1877 // Add break after Greek question mark (cldrbug #2069). | 2257 // Add break after Greek question mark (cldrbug #2069). |
1878 static const char elSentText[] = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395
\\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. " | 2258 static const char elSentText[] = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395
\\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. " |
1879 "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\
\u03C0, \\u03A1\\u03C2? \\u03A3"; | 2259 "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\
\u03C0, \\u03A1\\u03C2? \\u03A3"; |
1880 static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 }; | 2260 static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 }; |
1881 static const int32_t elSentROffsets[] = { 20, 27, 35, 36 }; | 2261 static const int32_t elSentROffsets[] = { 20, 27, 35, 36 }; |
1882 | 2262 |
1883 // UBreakIteratorType UBRK_CHARACTER, Locale "th" | 2263 // UBreakIteratorType UBRK_CHARACTER, Locale "th" |
1884 // Clusters should not include spacing Thai/Lao vowels (prefix or postfix), exce
pt for [SARA] AM (cldrbug #2161). | 2264 // Clusters should not include spacing Thai/Lao vowels (prefix or postfix), exce
pt for [SARA] AM (cldrbug #2161). |
(...skipping 780 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2665 virtual void setText(const UnicodeString &s); | 3045 virtual void setText(const UnicodeString &s); |
2666 virtual int32_t next(int32_t i); | 3046 virtual int32_t next(int32_t i); |
2667 private: | 3047 private: |
2668 UVector *fSets; | 3048 UVector *fSets; |
2669 | 3049 |
2670 UnicodeSet *fCRSet; | 3050 UnicodeSet *fCRSet; |
2671 UnicodeSet *fLFSet; | 3051 UnicodeSet *fLFSet; |
2672 UnicodeSet *fNewlineSet; | 3052 UnicodeSet *fNewlineSet; |
2673 UnicodeSet *fKatakanaSet; | 3053 UnicodeSet *fKatakanaSet; |
2674 UnicodeSet *fALetterSet; | 3054 UnicodeSet *fALetterSet; |
| 3055 // TODO(jungshik): Do we still need this change? |
| 3056 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt |
2675 UnicodeSet *fMidNumLetSet; | 3057 UnicodeSet *fMidNumLetSet; |
2676 UnicodeSet *fMidLetterSet; | 3058 UnicodeSet *fMidLetterSet; |
2677 UnicodeSet *fMidNumSet; | 3059 UnicodeSet *fMidNumSet; |
2678 UnicodeSet *fNumericSet; | 3060 UnicodeSet *fNumericSet; |
2679 UnicodeSet *fFormatSet; | 3061 UnicodeSet *fFormatSet; |
2680 UnicodeSet *fOtherSet; | 3062 UnicodeSet *fOtherSet; |
2681 UnicodeSet *fExtendSet; | 3063 UnicodeSet *fExtendSet; |
2682 UnicodeSet *fExtendNumLetSet; | 3064 UnicodeSet *fExtendNumLetSet; |
| 3065 UnicodeSet *fDictionaryCjkSet; |
2683 | 3066 |
2684 RegexMatcher *fMatcher; | 3067 RegexMatcher *fMatcher; |
2685 | 3068 |
2686 const UnicodeString *fText; | 3069 const UnicodeString *fText; |
2687 }; | 3070 }; |
2688 | 3071 |
2689 | 3072 |
2690 RBBIWordMonkey::RBBIWordMonkey() | 3073 RBBIWordMonkey::RBBIWordMonkey() |
2691 { | 3074 { |
2692 UErrorCode status = U_ZERO_ERROR; | 3075 UErrorCode status = U_ZERO_ERROR; |
2693 | 3076 |
2694 fSets = new UVector(status); | 3077 fSets = new UVector(status); |
2695 | 3078 |
2696 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = C
R}]"), status); | 3079 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = C
R}]"), status); |
2697 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = L
F}]"), status); | 3080 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = L
F}]"), status); |
2698 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = N
ewline}]"), status); | 3081 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = N
ewline}]"), status); |
2699 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = A
Letter}]"), status); | 3082 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]",
status); |
| 3083 // Exclude Hangul syllables from ALetterSet during testing. |
| 3084 // Leave CJK dictionary characters out from the monkey tests! |
| 3085 #if 0 |
| 3086 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}" |
| 3087 "[\\p{Line_Break = Complex_Context}" |
| 3088 "-\\p{Grapheme_Cluster_Break = Extend}" |
| 3089 "-\\p{Grapheme_Cluster_Break = Control}" |
| 3090 "]]", |
| 3091 status); |
| 3092 #endif |
| 3093 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = A
Letter}]"), status); |
| 3094 fALetterSet->removeAll(*fDictionaryCjkSet); |
2700 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = K
atakana}]"), status); | 3095 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = K
atakana}]"), status); |
2701 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M
idNumLet}]"), status); | 3096 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M
idNumLet}]"), status); |
2702 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M
idLetter}]"), status); | 3097 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M
idLetter}]"), status); |
2703 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M
idNum}]"), status); | 3098 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M
idNum}]"), status); |
2704 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = N
umeric}]"), status); | 3099 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = N
umeric}[\\uff10-\\uff19]]"), status); |
2705 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = F
ormat}]"), status); | 3100 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = F
ormat}]"), status); |
2706 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = E
xtendNumLet}]"), status); | 3101 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = E
xtendNumLet}]"), status); |
2707 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = E
xtend}]"), status); | 3102 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = E
xtend}]"), status); |
2708 | 3103 |
2709 fOtherSet = new UnicodeSet(); | 3104 fOtherSet = new UnicodeSet(); |
2710 if(U_FAILURE(status)) { | 3105 if(U_FAILURE(status)) { |
2711 deferredStatus = status; | 3106 deferredStatus = status; |
2712 return; | 3107 return; |
2713 } | 3108 } |
2714 | 3109 |
2715 fOtherSet->complement(); | 3110 fOtherSet->complement(); |
2716 fOtherSet->removeAll(*fCRSet); | 3111 fOtherSet->removeAll(*fCRSet); |
2717 fOtherSet->removeAll(*fLFSet); | 3112 fOtherSet->removeAll(*fLFSet); |
2718 fOtherSet->removeAll(*fNewlineSet); | 3113 fOtherSet->removeAll(*fNewlineSet); |
2719 fOtherSet->removeAll(*fKatakanaSet); | 3114 fOtherSet->removeAll(*fKatakanaSet); |
2720 fOtherSet->removeAll(*fALetterSet); | 3115 fOtherSet->removeAll(*fALetterSet); |
2721 fOtherSet->removeAll(*fMidLetterSet); | 3116 fOtherSet->removeAll(*fMidLetterSet); |
2722 fOtherSet->removeAll(*fMidNumSet); | 3117 fOtherSet->removeAll(*fMidNumSet); |
2723 fOtherSet->removeAll(*fNumericSet); | 3118 fOtherSet->removeAll(*fNumericSet); |
2724 fOtherSet->removeAll(*fExtendNumLetSet); | 3119 fOtherSet->removeAll(*fExtendNumLetSet); |
2725 fOtherSet->removeAll(*fFormatSet); | 3120 fOtherSet->removeAll(*fFormatSet); |
2726 fOtherSet->removeAll(*fExtendSet); | 3121 fOtherSet->removeAll(*fExtendSet); |
2727 // Inhibit dictionary characters from being tested at all. | 3122 // Inhibit dictionary characters from being tested at all. |
| 3123 fOtherSet->removeAll(*fDictionaryCjkSet); |
2728 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Comp
lex_Context}]"), status)); | 3124 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Comp
lex_Context}]"), status)); |
2729 | 3125 |
2730 fSets->addElement(fCRSet, status); | 3126 fSets->addElement(fCRSet, status); |
2731 fSets->addElement(fLFSet, status); | 3127 fSets->addElement(fLFSet, status); |
2732 fSets->addElement(fNewlineSet, status); | 3128 fSets->addElement(fNewlineSet, status); |
2733 fSets->addElement(fALetterSet, status); | 3129 fSets->addElement(fALetterSet, status); |
2734 fSets->addElement(fKatakanaSet, status); | 3130 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test kat
akana |
2735 fSets->addElement(fMidLetterSet, status); | 3131 fSets->addElement(fMidLetterSet, status); |
2736 fSets->addElement(fMidNumLetSet, status); | 3132 fSets->addElement(fMidNumLetSet, status); |
2737 fSets->addElement(fMidNumSet, status); | 3133 fSets->addElement(fMidNumSet, status); |
2738 fSets->addElement(fNumericSet, status); | 3134 fSets->addElement(fNumericSet, status); |
2739 fSets->addElement(fFormatSet, status); | 3135 fSets->addElement(fFormatSet, status); |
2740 fSets->addElement(fExtendSet, status); | 3136 fSets->addElement(fExtendSet, status); |
2741 fSets->addElement(fOtherSet, status); | 3137 fSets->addElement(fOtherSet, status); |
2742 fSets->addElement(fExtendNumLetSet, status); | 3138 fSets->addElement(fExtendNumLetSet, status); |
2743 | 3139 |
2744 if (U_FAILURE(status)) { | 3140 if (U_FAILURE(status)) { |
(...skipping 1226 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3971 printStringBreaks(ustr, expected, expectedcount); | 4367 printStringBreaks(ustr, expected, expectedcount); |
3972 test->errln("isBoundary() failed. Not expecting boundary at pos
ition %d", j); | 4368 test->errln("isBoundary() failed. Not expecting boundary at pos
ition %d", j); |
3973 return; | 4369 return; |
3974 } | 4370 } |
3975 } | 4371 } |
3976 } | 4372 } |
3977 | 4373 |
3978 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { | 4374 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { |
3979 count --; | 4375 count --; |
3980 if (forward[count] != i) { | 4376 if (forward[count] != i) { |
| 4377 printStringBreaks(ustr, expected, expectedcount); |
3981 test->errln("happy break test previous() failed: expected %d but got
%d", | 4378 test->errln("happy break test previous() failed: expected %d but got
%d", |
3982 forward[count], i); | 4379 forward[count], i); |
3983 break; | 4380 break; |
3984 } | 4381 } |
3985 } | 4382 } |
3986 if (count != 0) { | 4383 if (count != 0) { |
3987 printStringBreaks(ustr, expected, expectedcount); | 4384 printStringBreaks(ustr, expected, expectedcount); |
3988 test->errln("break test previous() failed: missed a match"); | 4385 test->errln("break test previous() failed: missed a match"); |
3989 return; | 4386 return; |
3990 } | 4387 } |
(...skipping 13 matching lines...) Expand all Loading... |
4004 } | 4401 } |
4005 | 4402 |
4006 void RBBITest::TestWordBreaks(void) | 4403 void RBBITest::TestWordBreaks(void) |
4007 { | 4404 { |
4008 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | 4405 #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
4009 | 4406 |
4010 Locale locale("en"); | 4407 Locale locale("en"); |
4011 UErrorCode status = U_ZERO_ERROR; | 4408 UErrorCode status = U_ZERO_ERROR; |
4012 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, statu
s); | 4409 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, statu
s); |
4013 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); | 4410 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); |
| 4411 // Replaced any C+J characters in a row with a random sequence of characters |
| 4412 // of the same length to make our C+J segmentation not get in the way. |
4014 static const char *strlist[] = | 4413 static const char *strlist[] = |
4015 { | 4414 { |
4016 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", | 4415 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", |
4017 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040
\\u003b", | 4416 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040
\\u003b", |
4018 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e
0061\\u003a", | 4417 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e
0061\\u003a", |
4019 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", | 4418 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", |
4020 "\\u90ca\\u3588\\u009c\\u0953\\u194b", | 4419 "\\uac00\\u3588\\u009c\\u0953\\u194b", |
4021 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", | 4420 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", |
4022 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e"
, | 4421 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e"
, |
4023 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", | 4422 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", |
4024 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", | 4423 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", |
4025 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", | 4424 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", |
4026 "\\u2027\\U000e0067\\u0a47\\u00b7", | 4425 "\\u2027\\U000e0067\\u0a47\\u00b7", |
4027 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", | 4426 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", |
4028 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", | 4427 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", |
4029 "\\u0589\\U000e006e\\u0a42\\U000104a5", | 4428 "\\u0589\\U000e006e\\u0a42\\U000104a5", |
4030 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", | 4429 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", |
4031 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", | 4430 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", |
4032 "\\u0027\\u11af\\U000e0057\\u0602", | 4431 "\\u0027\\u11af\\U000e0057\\u0602", |
4033 "\\U0001d7f2\\U000e007\\u0004\\u0589", | 4432 "\\U0001d7f2\\U000e007\\u0004\\u0589", |
4034 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b
", | 4433 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b
", |
4035 "\\U0001d7f2\\U000e007d\\u0004\\u0589", | 4434 "\\U0001d7f2\\U000e007d\\u0004\\u0589", |
4036 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", | 4435 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", |
4037 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", | 4436 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", |
4038 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", | 4437 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", |
4039 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", | 4438 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", |
4040 "\\u0233\\U000e0020\\u0a69\\u0d6a", | 4439 "\\u0233\\U000e0020\\u0a69\\u0d6a", |
4041 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", | 4440 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", |
4042 "\\u58f4\\U000e0049\\u20e7\\u2027", | 4441 "\\u18f4\\U000e0049\\u20e7\\u2027", |
4043 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", | 4442 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", |
4044 "\\ua183\\u102d\\u0bec\\u003a", | 4443 "\\ua183\\u102d\\u0bec\\u003a", |
4045 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", | 4444 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", |
4046 "\\u003a\\u0e57\\u0fad\\u002e", | 4445 "\\u003a\\u0e57\\u0fad\\u002e", |
4047 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", | 4446 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", |
4048 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", | 4447 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", |
4049 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", | 4448 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", |
4050 "\\u003a\\u0664\\u00b7\\u1fba", | 4449 "\\u003a\\u0664\\u00b7\\u1fba", |
4051 "\\u003b\\u0027\\u00b7\\u47a3", | 4450 "\\u003b\\u0027\\u00b7\\u47a3", |
4052 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", | 4451 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", |
4053 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u
0e51\\u1058\\U000e0058\\u00b7\\u0673", | 4452 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u
0e51\\u1058\\U000e0058\\u00b7\\u0673", |
4054 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", | 4453 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", |
4055 }; | 4454 }; |
4056 int loop; | 4455 int loop; |
4057 if (U_FAILURE(status)) { | 4456 if (U_FAILURE(status)) { |
4058 errcheckln(status, "Creation of break iterator failed %s", u_errorName(s
tatus)); | 4457 errcheckln(status, "Creation of break iterator failed %s", u_errorName(s
tatus)); |
4059 return; | 4458 return; |
4060 } | 4459 } |
4061 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { | 4460 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { |
4062 // printf("looping %d\n", loop); | 4461 // printf("looping %d\n", loop); |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4097 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", | 4496 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", |
4098 "\\u0589\\U000e006e\\u0a42\\U000104a5", | 4497 "\\u0589\\U000e006e\\u0a42\\U000104a5", |
4099 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", | 4498 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", |
4100 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", | 4499 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", |
4101 "\\u0027\\u11af\\U000e0057\\u0602", | 4500 "\\u0027\\u11af\\U000e0057\\u0602", |
4102 "\\U0001d7f2\\U000e007\\u0004\\u0589", | 4501 "\\U0001d7f2\\U000e007\\u0004\\u0589", |
4103 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b
", | 4502 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b
", |
4104 "\\U0001d7f2\\U000e007d\\u0004\\u0589", | 4503 "\\U0001d7f2\\U000e007d\\u0004\\u0589", |
4105 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", | 4504 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", |
4106 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", | 4505 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", |
4107 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", | 4506 "\\U000e0065\\u302c\\u09ee\\U000e0068", |
4108 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", | 4507 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", |
4109 "\\u0233\\U000e0020\\u0a69\\u0d6a", | 4508 "\\u0233\\U000e0020\\u0a69\\u0d6a", |
4110 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", | 4509 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", |
4111 "\\u58f4\\U000e0049\\u20e7\\u2027", | 4510 "\\u58f4\\U000e0049\\u20e7\\u2027", |
4112 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", | 4511 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", |
4113 "\\ua183\\u102d\\u0bec\\u003a", | 4512 "\\ua183\\u102d\\u0bec\\u003a", |
4114 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", | 4513 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", |
4115 "\\u003a\\u0e57\\u0fad\\u002e", | 4514 "\\u003a\\u0e57\\u0fad\\u002e", |
4116 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", | 4515 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", |
4117 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", | 4516 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", |
4118 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019"
, | 4517 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019"
, |
4119 "\\u003a\\u0664\\u00b7\\u1fba", | 4518 "\\u003a\\u0664\\u00b7\\u1fba", |
4120 "\\u003b\\u0027\\u00b7\\u47a3", | 4519 "\\u003b\\u0027\\u00b7\\u47a3", |
4121 }; | 4520 }; |
4122 int loop; | 4521 int loop; |
(...skipping 662 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4785 pos = bi->last(); | 5184 pos = bi->last(); |
4786 do { | 5185 do { |
4787 // ruleStatus = bi->getRuleStatus(); | 5186 // ruleStatus = bi->getRuleStatus(); |
4788 printf("%d\t%d\n", pos, ruleStatus); | 5187 printf("%d\t%d\n", pos, ruleStatus); |
4789 pos = bi->previous(); | 5188 pos = bi->previous(); |
4790 } while (pos != BreakIterator::DONE); | 5189 } while (pos != BreakIterator::DONE); |
4791 #endif | 5190 #endif |
4792 } | 5191 } |
4793 | 5192 |
4794 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 5193 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
OLD | NEW |