Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(201)

Side by Side Diff: icu46/source/test/intltest/rbbitst.cpp

Issue 6370014: CJK segmentation patch for ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/
Patch Set: Created 9 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « icu46/source/test/intltest/rbbitst.h ('k') | icu46/source/test/testdata/rbbitst.txt » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /******************************************************************** 1 /********************************************************************
2 * COPYRIGHT: 2 * COPYRIGHT:
3 * Copyright (c) 1999-2010, International Business Machines Corporation and 3 * Copyright (c) 1999-2010, International Business Machines Corporation and
4 * others. All Rights Reserved. 4 * others. All Rights Reserved.
5 ********************************************************************/ 5 ********************************************************************/
6 /************************************************************************ 6 /************************************************************************
7 * Date Name Description 7 * Date Name Description
8 * 12/15/99 Madhu Creation. 8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests 9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/ 10 ************************************************************************/
(...skipping 17 matching lines...) Expand all
28 #include "unicode/utext.h" 28 #include "unicode/utext.h"
29 #include "intltest.h" 29 #include "intltest.h"
30 #include "rbbitst.h" 30 #include "rbbitst.h"
31 #include <string.h> 31 #include <string.h>
32 #include "uvector.h" 32 #include "uvector.h"
33 #include "uvectr32.h" 33 #include "uvectr32.h"
34 #include "triedict.h" 34 #include "triedict.h"
35 #include <string.h> 35 #include <string.h>
36 #include <stdio.h> 36 #include <stdio.h>
37 #include <stdlib.h> 37 #include <stdlib.h>
38 #include "unicode/numfmt.h"
39 #include "unicode/uscript.h"
38 40
39 #define TEST_ASSERT(x) {if (!(x)) { \ 41 #define TEST_ASSERT(x) {if (!(x)) { \
40 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 42 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
41 43
42 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 44 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
43 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__ , __LINE__, u_errorName(errcode));}} 45 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__ , __LINE__, u_errorName(errcode));}}
44 46
45 47
46 //--------------------------------------------- 48 //---------------------------------------------
47 // runIndexedTest 49 // runIndexedTest
(...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after
131 case 20: name = "TestTrieDict"; 133 case 20: name = "TestTrieDict";
132 if(exec) TestTrieDict(); break; 134 if(exec) TestTrieDict(); break;
133 135
134 #if !UCONFIG_NO_FILE_IO 136 #if !UCONFIG_NO_FILE_IO
135 case 21: name = "TestBug5775"; 137 case 21: name = "TestBug5775";
136 if (exec) TestBug5775(); break; 138 if (exec) TestBug5775(); break;
137 case 22: name = "TestThaiBreaks"; 139 case 22: name = "TestThaiBreaks";
138 if (exec) TestThaiBreaks(); break; 140 if (exec) TestThaiBreaks(); break;
139 case 23: name = "TestTailoredBreaks"; 141 case 23: name = "TestTailoredBreaks";
140 if (exec) TestTailoredBreaks(); break; 142 if (exec) TestTailoredBreaks(); break;
143 case 24: name = "TestTrieDictWithValue";
144 if(exec) TestTrieDictWithValue(); break;
141 #else 145 #else
142 case 21: case 22: case 23: name = "skip"; 146 case 21: case 22: case 23: case 24: name = "skip";
143 break; 147 break;
144 #endif 148 #endif
145 case 24: name = "TestDictRules"; 149 case 25: name = "TestDictRules";
146 if (exec) TestDictRules(); break; 150 if (exec) TestDictRules(); break;
147 case 25: name = "TestBug5532"; 151 case 26: name = "TestBug5532";
148 if (exec) TestBug5532(); break; 152 if (exec) TestBug5532(); break;
149 default: name = ""; break; //needed to end loop 153 default: name = ""; break; //needed to end loop
150 } 154 }
151 } 155 }
152 156
153 157
154 //--------------------------------------------------------------------------- 158 //---------------------------------------------------------------------------
155 // 159 //
(...skipping 444 matching lines...) Expand 10 before | Expand all | Expand 10 after
600 startOfSecondWord = bi->following(0); 604 startOfSecondWord = bi->following(0);
601 if (startOfSecondWord != 4) { 605 if (startOfSecondWord != 4) {
602 errln("Fail at file %s, line %d expected start of word at 4, got %d", 606 errln("Fail at file %s, line %d expected start of word at 4, got %d",
603 __FILE__, __LINE__, startOfSecondWord); 607 __FILE__, __LINE__, startOfSecondWord);
604 } 608 }
605 delete bi; 609 delete bi;
606 } 610 }
607 611
608 612
609 void RBBITest::TestJapaneseWordBreak() { 613 void RBBITest::TestJapaneseWordBreak() {
614 // TODO: Rewrite this test for a dictionary-based word breaking.
615 #if 0
610 UErrorCode status = U_ZERO_ERROR; 616 UErrorCode status = U_ZERO_ERROR;
611 BITestData japaneseWordSelection(status); 617 BITestData japaneseWordSelection(status);
612 618
613 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data 619 ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); // Break at start of data
614 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2 620 ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
615 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5 621 ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
616 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7 622 ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
617 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10 623 ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
618 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11 624 ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
619 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12 625 ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
620 626
621 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createW ordInstance( 627 RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createW ordInstance(
622 Locale("ja"), status); 628 Locale("ja"), status);
623 if (U_FAILURE(status)) 629 if (U_FAILURE(status))
624 { 630 {
625 errcheckln(status, "Failed to create the BreakIterator for Japanese loca le in TestJapaneseWordBreak.\n"); 631 errcheckln(status, "Failed to create the BreakIterator for Japanese loca le in TestJapaneseWordBreak.\n");
626 return; 632 return;
627 } 633 }
628 634
629 generalIteratorTest(*e, japaneseWordSelection); 635 generalIteratorTest(*e, japaneseWordSelection);
630 delete e; 636 delete e;
637 #endif
631 } 638 }
632 639
633 void RBBITest::TestTrieDict() { 640 void RBBITest::TestTrieDict() {
634 UErrorCode status = U_ZERO_ERROR; 641 UErrorCode status = U_ZERO_ERROR;
635 642
636 // 643 //
637 // Open and read the test data file. 644 // Open and read the test data file.
638 // 645 //
639 const char *testDataDirectory = IntlTest::getSourceTestData(status); 646 const char *testDataDirectory = IntlTest::getSourceTestData(status);
640 char testFileName[1000]; 647 char testFileName[1000];
(...skipping 201 matching lines...) Expand 10 before | Expand all | Expand 10 after
842 delete compactDict; 849 delete compactDict;
843 delete mutableDict; 850 delete mutableDict;
844 delete breaks; 851 delete breaks;
845 delete[] testFile; 852 delete[] testFile;
846 delete enumer1; 853 delete enumer1;
847 delete mutable2; 854 delete mutable2;
848 delete cloneEnum; 855 delete cloneEnum;
849 delete compact2; 856 delete compact2;
850 } 857 }
851 858
859 /*TODO: delete later*/
860 inline void writeEnumerationToFile(StringEnumeration *enumer, char *filename){
861 UErrorCode status = U_ZERO_ERROR;
862 FILE *outfile = fopen(filename,"w");
863 UConverter *cvt = ucnv_open("UTF-8", &status);
864 if (U_FAILURE(status))
865 return;
866 if(outfile != NULL){
867 status = U_ZERO_ERROR;
868 const UnicodeString *word = enumer->snext(status);
869 while (word != NULL && U_SUCCESS(status)) {
870 char u8word[500];
871 status = U_ZERO_ERROR;
872 ucnv_fromUChars(cvt, u8word, 500, word->getBuffer(), word->length(),
873 &status);
874 fprintf(outfile,"%s\n", u8word);
875 status = U_ZERO_ERROR;
876 word = enumer->snext(status);
877 }
878 fclose(outfile);
879 }
880 ucnv_close(cvt);
881 }
882
// A very simple helper class to streamline the buffer handling in
// TestTrieDictWithValue.
//
// Provides storage for `size` elements of type T: the in-object array of N
// elements is used when size <= N, otherwise an array of `size` elements is
// allocated with new[] and released by the destructor.
//
// Instances cannot be copied or assigned: a copy would either share the heap
// allocation (and delete it twice) or point at the other object's in-object
// array.
template<class T, size_t N>
class AutoBuffer {
 public:
    // Selects the in-object array, or heap storage if `size` exceeds N.
    AutoBuffer(size_t size) : buffer(stackBuffer) {
        if (size > N)
            buffer = new T[size];
    }
    // Frees the heap storage, if any was allocated.
    ~AutoBuffer() {
        if (buffer != stackBuffer)
            delete [] buffer;
    }
    // Pointer to the first element of the active storage.
    T* elems() {
        return buffer;
    }
    // Unchecked element access.
    const T& operator[] (size_t i) const {
        return buffer[i];
    }
    T& operator[] (size_t i) {
        return buffer[i];
    }
 private:
    T stackBuffer[N];
    T* buffer;
    // Not implemented: default construction, copying and assignment are
    // deliberately disabled.
    AutoBuffer();
    AutoBuffer(const AutoBuffer&);
    AutoBuffer& operator=(const AutoBuffer&);
};
910
911 //----------------------------------------------------------------------------
912 //
913 // TestTrieDictWithValue Test trie dictionaries with logprob values and
914 // more than 2^16 nodes after compaction.
915 //
916 //----------------------------------------------------------------------------
917 void RBBITest::TestTrieDictWithValue() {
918 UErrorCode status = U_ZERO_ERROR;
919
920 //
921 // Open and read the test data file.
922 //
923 const char *testDataDirectory = IntlTest::getSourceTestData(status);
924 const char *filename = "cjdict-truncated.txt";
925 char testFileName[1000];
926 if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen(filename ) + 10 >= sizeof(testFileName)) {
927 errln("Can't open test data. Path too long.");
928 return;
929 }
930 strcpy(testFileName, testDataDirectory);
931 strcat(testFileName, filename);
932
933 // Items needing deleting at the end
934 MutableTrieDictionary *mutableDict = NULL;
935 CompactTrieDictionary *compactDict = NULL;
936 UnicodeSet *breaks = NULL;
937 UChar *testFile = NULL;
938 StringEnumeration *enumer1 = NULL;
939 StringEnumeration *enumer2 = NULL;
940 MutableTrieDictionary *mutable2 = NULL;
941 StringEnumeration *cloneEnum = NULL;
942 CompactTrieDictionary *compact2 = NULL;
943 NumberFormat *nf = NULL;
944 UText *originalText = NULL, *cloneText = NULL;
945
946 const UnicodeString *originalWord = NULL;
947 const UnicodeString *cloneWord = NULL;
948 UChar *current;
949 UChar *word;
950 UChar uc;
951 int32_t wordLen;
952 int32_t wordCount;
953 int32_t testCount;
954 int32_t valueLen;
955 int counter = 0;
956
957 int len;
958 testFile = ReadAndConvertFile(testFileName, len, NULL, status);
959 if (U_FAILURE(status)) {
960 goto cleanup; /* something went wrong, error already output */
961 }
962
963 mutableDict = new MutableTrieDictionary(0x0E1C, status, TRUE);
964 if (U_FAILURE(status)) {
965 errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status)) ;
966 goto cleanup;
967 }
968
969 breaks = new UnicodeSet;
970 breaks->add(0x000A); // Line Feed
971 breaks->add(0x000D); // Carriage Return
972 breaks->add(0x2028); // Line Separator
973 breaks->add(0x2029); // Paragraph Separator
974 breaks->add(0x0009); // Tab character
975
976 // Now add each non-comment line of the file as a word.
977 current = testFile;
978 word = current;
979 uc = *current++;
980 wordLen = 0;
981 wordCount = 0;
982 nf = NumberFormat::createInstance(status);
983
984 while (uc) {
985 UnicodeString ucharValue;
986 valueLen = 0;
987
988 if (uc == 0x0023) { // #comment line, skip
989 while (uc && !breaks->contains(uc)) {
990 uc = *current++;
991 }
992 }
993 else{
994 while (uc && !breaks->contains(uc)) {
995 ++wordLen;
996 uc = *current++;
997 }
998 if(uc == 0x0009){ //separator is a tab char, read in num after tab
999 uc = *current++;
1000 while (uc && !breaks->contains(uc)) {
1001 ucharValue.append(uc);
1002 uc = *current++;
1003 }
1004 }
1005 }
1006 if (wordLen > 0) {
1007 Formattable value((int32_t)0);
1008 nf->parse(ucharValue.getTerminatedBuffer(), value, status);
1009
1010 if(U_FAILURE(status)){
1011 errln("parsing of value failed when reading in dictionary\n");
1012 goto cleanup;
1013 }
1014 mutableDict->addWord(word, wordLen, status, value.getLong());
1015 if (U_FAILURE(status)) {
1016 errln("Could not add word to mutable dictionary; status %s\n", u _errorName(status));
1017 goto cleanup;
1018 }
1019 wordCount += 1;
1020 }
1021
1022 // Find beginning of next line
1023 while (uc && breaks->contains(uc)) {
1024 uc = *current++;
1025 }
1026 word = current-1;
1027 wordLen = 0;
1028 }
1029
1030 if (wordCount < 50) {
1031 errln("Word count (%d) unreasonably small\n", wordCount);
1032 goto cleanup;
1033 }
1034
1035 enumer1 = mutableDict->openWords(status);
1036 if (U_FAILURE(status)) {
1037 errln("Could not open mutable dictionary enumerator: %s\n", u_errorName( status));
1038 goto cleanup;
1039 }
1040
1041 testCount = 0;
1042 if (wordCount != (testCount = enumer1->count(status))) {
1043 errln("MutableTrieDictionary word count (%d) differs from file word coun t (%d), with status %s\n",
1044 testCount, wordCount, u_errorName(status));
1045 goto cleanup;
1046 }
1047
1048 // Now compact it
1049 compactDict = new CompactTrieDictionary(*mutableDict, status);
1050 if (U_FAILURE(status)) {
1051 errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status ));
1052 goto cleanup;
1053 }
1054
1055 enumer2 = compactDict->openWords(status);
1056 if (U_FAILURE(status)) {
1057 errln("Could not open compact trie dictionary enumerator: %s\n", u_error Name(status));
1058 goto cleanup;
1059 }
1060
1061
1062 //delete later
1063 // writeEnumerationToFile(enumer1, "/home/jchye/mutable.txt");
1064 // writeEnumerationToFile(enumer2, "/home/jchye/compact.txt");
1065
1066 enumer1->reset(status);
1067 enumer2->reset(status);
1068
1069 originalWord = enumer1->snext(status);
1070 cloneWord = enumer2->snext(status);
1071 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
1072 if (*originalWord != *cloneWord) {
1073 errln("MutableTrieDictionary and CompactTrieDictionary word mismatch at %d, lengths are %d and %d\n",
1074 counter, originalWord->length(), cloneWord->length());
1075 goto cleanup;
1076 }
1077
1078 // check if attached values of the same word in both dictionaries tally
1079 #if 0
1080 int32_t lengths1[originalWord->length()], lengths2[cloneWord->length()];
1081 uint16_t values1[originalWord->length()], values2[cloneWord->length()];
1082 #endif
1083 AutoBuffer<int32_t, 20> lengths1(originalWord->length());
1084 AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
1085 AutoBuffer<uint16_t, 20> values1(originalWord->length());
1086 AutoBuffer<uint16_t, 20> values2(cloneWord->length());
1087
1088 originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
1089 cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
1090
1091 int count1, count2;
1092 mutableDict->matches(originalText, originalWord->length(), lengths1.elem s(), count1, originalWord->length(), values1.elems());
1093 compactDict->matches(cloneText, cloneWord->length(), lengths2.elems(), c ount2, cloneWord->length(), values2.elems());
1094
1095 if(values1[count1-1] != values2[count2-1]){
1096 errln("Values of word %d in MutableTrieDictionary and CompactTrieDic tionary do not match, with values %d and %d\n",
1097 counter, values1[count1-1], values2[count2-1]);
1098 goto cleanup;
1099 }
1100
1101 counter++;
1102 originalWord = enumer1->snext(status);
1103 cloneWord = enumer2->snext(status);
1104 }
1105 if (enumer1->getDynamicClassID() == enumer2->getDynamicClassID()) {
1106 errln("CompactTrieEnumeration and MutableTrieEnumeration ClassIDs are th e same");
1107 }
1108
1109 delete enumer1;
1110 enumer1 = NULL;
1111 delete enumer2;
1112 enumer2 = NULL;
1113
1114 // Now un-compact it
1115 mutable2 = compactDict->cloneMutable(status);
1116 if (U_FAILURE(status)) {
1117 errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: % s\n", u_errorName(status));
1118 goto cleanup;
1119 }
1120
1121 cloneEnum = mutable2->openWords(status);
1122 if (U_FAILURE(status)) {
1123 errln("Could not create cloned mutable enumerator: %s\n", u_errorName(st atus));
1124 goto cleanup;
1125 }
1126
1127 if (wordCount != (testCount = cloneEnum->count(status))) {
1128 errln("Cloned MutableTrieDictionary word count (%d) differs from file wo rd count (%d), with status %s\n",
1129 testCount, wordCount, u_errorName(status));
1130 goto cleanup;
1131 }
1132
1133 // Compact original dictionary to clone. Note that we can only compare the s ame kind of
1134 // dictionary as the order of the enumerators is not guaranteed to be the sa me between
1135 // different kinds
1136 enumer1 = mutableDict->openWords(status);
1137 if (U_FAILURE(status)) {
1138 errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorNa me(status));
1139 goto cleanup;
1140 }
1141
1142 counter = 0;
1143 originalWord = enumer1->snext(status);
1144 cloneWord = cloneEnum->snext(status);
1145 while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
1146 if (*originalWord != *cloneWord) {
1147 errln("Original and cloned MutableTrieDictionary word mismatch\n");
1148 goto cleanup;
1149 }
1150
1151 // check if attached values of the same word in both dictionaries tally
1152 AutoBuffer<int32_t, 20> lengths1(originalWord->length());
1153 AutoBuffer<int32_t, 20> lengths2(cloneWord->length());
1154 AutoBuffer<uint16_t, 20> values1(originalWord->length());
1155 AutoBuffer<uint16_t, 20> values2(cloneWord->length());
1156 originalText = utext_openConstUnicodeString(originalText, originalWord, &status);
1157 cloneText = utext_openConstUnicodeString(cloneText, cloneWord, &status);
1158
1159 int count1, count2;
1160 mutableDict->matches(originalText, originalWord->length(), lengths1.elem s(), count1, originalWord->length(), values1.elems());
1161 mutable2->matches(cloneText, cloneWord->length(), lengths2.elems(), coun t2, cloneWord->length(), values2.elems());
1162
1163 if(values1[count1-1] != values2[count2-1]){
1164 errln("Values of word %d in original and cloned MutableTrieDictionar y do not match, with values %d and %d\n",
1165 counter, values1[count1-1], values2[count2-1]);
1166 goto cleanup;
1167 }
1168
1169 counter++;
1170
1171 originalWord = enumer1->snext(status);
1172 cloneWord = cloneEnum->snext(status);
1173 }
1174
1175 if (U_FAILURE(status)) {
1176 errln("Enumeration failed: %s\n", u_errorName(status));
1177 goto cleanup;
1178 }
1179
1180 if (originalWord != cloneWord) {
1181 errln("Original and cloned MutableTrieDictionary ended enumeration at di fferent points\n");
1182 goto cleanup;
1183 }
1184
1185 // Test the data copying constructor for CompactTrieDict, and the data acces s APIs.
1186 compact2 = new CompactTrieDictionary(compactDict->data(), status);
1187 if (U_FAILURE(status)) {
1188 errln("CompactTrieDictionary(const void *,...) failed\n");
1189 goto cleanup;
1190 }
1191
1192 if (compact2->dataSize() == 0) {
1193 errln("CompactTrieDictionary->dataSize() == 0\n");
1194 goto cleanup;
1195 }
1196
1197 // Now count the words via the second dictionary
1198 delete enumer1;
1199 enumer1 = compact2->openWords(status);
1200 if (U_FAILURE(status)) {
1201 errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_err orName(status));
1202 goto cleanup;
1203 }
1204
1205 if (wordCount != (testCount = enumer1->count(status))) {
1206 errln("CompactTrieDictionary 2 word count (%d) differs from file word co unt (%d), with status %s\n",
1207 testCount, wordCount, u_errorName(status));
1208 goto cleanup;
1209 }
1210
1211 cleanup:
1212 delete compactDict;
1213 delete mutableDict;
1214 delete breaks;
1215 delete[] testFile;
1216 delete enumer1;
1217 delete mutable2;
1218 delete cloneEnum;
1219 delete compact2;
1220 utext_close(originalText);
1221 utext_close(cloneText);
1222
1223
1224 }
852 1225
853 //---------------------------------------------------------------------------- 1226 //----------------------------------------------------------------------------
854 // 1227 //
855 // generalIteratorTest Given a break iterator and a set of test data, 1228 // generalIteratorTest Given a break iterator and a set of test data,
856 // Run the tests and report the results. 1229 // Run the tests and report the results.
857 // 1230 //
858 //---------------------------------------------------------------------------- 1231 //----------------------------------------------------------------------------
859 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 1232 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
860 { 1233 {
861 1234
(...skipping 1001 matching lines...) Expand 10 before | Expand all | Expand 10 after
1863 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" 2236 // UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
1864 // Words don't include colon or period (cldrbug #1969). 2237 // Words don't include colon or period (cldrbug #1969).
1865 static const char posxWordText[] = "Can't have breaks in xx:yy or struct. field for CS-types."; 2238 static const char posxWordText[] = "Can't have breaks in xx:yy or struct. field for CS-types.";
1866 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24 , 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 }; 2239 static const int32_t posxWordTOffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 23, 24 , 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56 };
1867 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 }; 2240 static const int32_t posxWordROffsets[] = { 5, 6, 10, 11, 17, 18, 20, 21, 26, 27, 29, 30, 42, 43, 46, 47, 49, 50, 55, 56 };
1868 2241
1869 // UBreakIteratorType UBRK_WORD, Locale "ja" 2242 // UBreakIteratorType UBRK_WORD, Locale "ja"
1870 // Don't break in runs of hiragana or runs of ideograph, where the latter includ es \u3005 \u3007 \u303B (cldrbug #2009). 2243 // Don't break in runs of hiragana or runs of ideograph, where the latter includ es \u3005 \u3007 \u303B (cldrbug #2009).
1871 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3 007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF" 2244 static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3 007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
1872 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3 005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002"; 2245 "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3 005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
2246 #if 0
1873 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17 , 18, 20, 21, 24, 27, 28 }; 2247 static const int32_t jaWordTOffsets[] = { 2, 3, 7, 8, 14, 17 , 18, 20, 21, 24, 27, 28 };
1874 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17 , 18, 19, 20, 21, 24, 25, 26, 27, 28 }; 2248 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17 , 18, 19, 20, 21, 24, 25, 26, 27, 28 };
2249 #endif
2250 // There's no separate Japanese word break iterator. Root is the same as Japanes e.
2251 // Our dictionary-based iterator has to be tweaked to better handle U+3005,
2252 // U+3007, U+303B and some other cases.
2253 static const int32_t jaWordTOffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15 , 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
2254 static const int32_t jaWordROffsets[] = { 1, 2, 3, 4, 5, 7, 8, 12, 13, 14, 15 , 17, 18, 20, 21, 22, 23, 24, 25, 27, 28 };
1875 2255
1876 // UBreakIteratorType UBRK_SENTENCE, Locale "el" 2256 // UBreakIteratorType UBRK_SENTENCE, Locale "el"
1877 // Add break after Greek question mark (cldrbug #2069). 2257 // Add break after Greek question mark (cldrbug #2069).
1878 static const char elSentText[] = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. " 2258 static const char elSentText[] = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
1879 "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\ \u03C0, \\u03A1\\u03C2? \\u03A3"; 2259 "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\ \u03C0, \\u03A1\\u03C2? \\u03A3";
1880 static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 }; 2260 static const int32_t elSentTOffsets[] = { 8, 14, 20, 27, 35, 36 };
1881 static const int32_t elSentROffsets[] = { 20, 27, 35, 36 }; 2261 static const int32_t elSentROffsets[] = { 20, 27, 35, 36 };
1882 2262
1883 // UBreakIteratorType UBRK_CHARACTER, Locale "th" 2263 // UBreakIteratorType UBRK_CHARACTER, Locale "th"
1884 // Clusters should not include spacing Thai/Lao vowels (prefix or postfix), exce pt for [SARA] AM (cldrbug #2161). 2264 // Clusters should not include spacing Thai/Lao vowels (prefix or postfix), exce pt for [SARA] AM (cldrbug #2161).
(...skipping 780 matching lines...) Expand 10 before | Expand all | Expand 10 after
2665 virtual void setText(const UnicodeString &s); 3045 virtual void setText(const UnicodeString &s);
2666 virtual int32_t next(int32_t i); 3046 virtual int32_t next(int32_t i);
2667 private: 3047 private:
2668 UVector *fSets; 3048 UVector *fSets;
2669 3049
2670 UnicodeSet *fCRSet; 3050 UnicodeSet *fCRSet;
2671 UnicodeSet *fLFSet; 3051 UnicodeSet *fLFSet;
2672 UnicodeSet *fNewlineSet; 3052 UnicodeSet *fNewlineSet;
2673 UnicodeSet *fKatakanaSet; 3053 UnicodeSet *fKatakanaSet;
2674 UnicodeSet *fALetterSet; 3054 UnicodeSet *fALetterSet;
3055 // TODO(jungshik): Do we still need this change?
3056 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
2675 UnicodeSet *fMidNumLetSet; 3057 UnicodeSet *fMidNumLetSet;
2676 UnicodeSet *fMidLetterSet; 3058 UnicodeSet *fMidLetterSet;
2677 UnicodeSet *fMidNumSet; 3059 UnicodeSet *fMidNumSet;
2678 UnicodeSet *fNumericSet; 3060 UnicodeSet *fNumericSet;
2679 UnicodeSet *fFormatSet; 3061 UnicodeSet *fFormatSet;
2680 UnicodeSet *fOtherSet; 3062 UnicodeSet *fOtherSet;
2681 UnicodeSet *fExtendSet; 3063 UnicodeSet *fExtendSet;
2682 UnicodeSet *fExtendNumLetSet; 3064 UnicodeSet *fExtendNumLetSet;
3065 UnicodeSet *fDictionaryCjkSet;
2683 3066
2684 RegexMatcher *fMatcher; 3067 RegexMatcher *fMatcher;
2685 3068
2686 const UnicodeString *fText; 3069 const UnicodeString *fText;
2687 }; 3070 };
2688 3071
2689 3072
2690 RBBIWordMonkey::RBBIWordMonkey() 3073 RBBIWordMonkey::RBBIWordMonkey()
2691 { 3074 {
2692 UErrorCode status = U_ZERO_ERROR; 3075 UErrorCode status = U_ZERO_ERROR;
2693 3076
2694 fSets = new UVector(status); 3077 fSets = new UVector(status);
2695 3078
2696 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = C R}]"), status); 3079 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = C R}]"), status);
2697 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = L F}]"), status); 3080 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = L F}]"), status);
2698 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = N ewline}]"), status); 3081 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = N ewline}]"), status);
2699 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = A Letter}]"), status); 3082 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
3083 // Exclude Hangul syllables from ALetterSet during testing.
3084 // Leave CJK dictionary characters out from the monkey tests!
3085 #if 0
3086 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
3087 "[\\p{Line_Break = Complex_Context}"
3088 "-\\p{Grapheme_Cluster_Break = Extend}"
3089 "-\\p{Grapheme_Cluster_Break = Control}"
3090 "]]",
3091 status);
3092 #endif
3093 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = A Letter}]"), status);
3094 fALetterSet->removeAll(*fDictionaryCjkSet);
2700 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = K atakana}]"), status); 3095 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = K atakana}]"), status);
2701 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M idNumLet}]"), status); 3096 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M idNumLet}]"), status);
2702 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M idLetter}]"), status); 3097 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M idLetter}]"), status);
2703 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M idNum}]"), status); 3098 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = M idNum}]"), status);
2704 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = N umeric}]"), status); 3099 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = N umeric}[\\uff10-\\uff19]]"), status);
2705 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = F ormat}]"), status); 3100 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = F ormat}]"), status);
2706 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = E xtendNumLet}]"), status); 3101 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = E xtendNumLet}]"), status);
2707 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = E xtend}]"), status); 3102 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = E xtend}]"), status);
2708 3103
2709 fOtherSet = new UnicodeSet(); 3104 fOtherSet = new UnicodeSet();
2710 if(U_FAILURE(status)) { 3105 if(U_FAILURE(status)) {
2711 deferredStatus = status; 3106 deferredStatus = status;
2712 return; 3107 return;
2713 } 3108 }
2714 3109
2715 fOtherSet->complement(); 3110 fOtherSet->complement();
2716 fOtherSet->removeAll(*fCRSet); 3111 fOtherSet->removeAll(*fCRSet);
2717 fOtherSet->removeAll(*fLFSet); 3112 fOtherSet->removeAll(*fLFSet);
2718 fOtherSet->removeAll(*fNewlineSet); 3113 fOtherSet->removeAll(*fNewlineSet);
2719 fOtherSet->removeAll(*fKatakanaSet); 3114 fOtherSet->removeAll(*fKatakanaSet);
2720 fOtherSet->removeAll(*fALetterSet); 3115 fOtherSet->removeAll(*fALetterSet);
2721 fOtherSet->removeAll(*fMidLetterSet); 3116 fOtherSet->removeAll(*fMidLetterSet);
2722 fOtherSet->removeAll(*fMidNumSet); 3117 fOtherSet->removeAll(*fMidNumSet);
2723 fOtherSet->removeAll(*fNumericSet); 3118 fOtherSet->removeAll(*fNumericSet);
2724 fOtherSet->removeAll(*fExtendNumLetSet); 3119 fOtherSet->removeAll(*fExtendNumLetSet);
2725 fOtherSet->removeAll(*fFormatSet); 3120 fOtherSet->removeAll(*fFormatSet);
2726 fOtherSet->removeAll(*fExtendSet); 3121 fOtherSet->removeAll(*fExtendSet);
2727 // Inhibit dictionary characters from being tested at all. 3122 // Inhibit dictionary characters from being tested at all.
3123 fOtherSet->removeAll(*fDictionaryCjkSet);
2728 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Comp lex_Context}]"), status)); 3124 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Comp lex_Context}]"), status));
2729 3125
2730 fSets->addElement(fCRSet, status); 3126 fSets->addElement(fCRSet, status);
2731 fSets->addElement(fLFSet, status); 3127 fSets->addElement(fLFSet, status);
2732 fSets->addElement(fNewlineSet, status); 3128 fSets->addElement(fNewlineSet, status);
2733 fSets->addElement(fALetterSet, status); 3129 fSets->addElement(fALetterSet, status);
2734 fSets->addElement(fKatakanaSet, status); 3130 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
2735 fSets->addElement(fMidLetterSet, status); 3131 fSets->addElement(fMidLetterSet, status);
2736 fSets->addElement(fMidNumLetSet, status); 3132 fSets->addElement(fMidNumLetSet, status);
2737 fSets->addElement(fMidNumSet, status); 3133 fSets->addElement(fMidNumSet, status);
2738 fSets->addElement(fNumericSet, status); 3134 fSets->addElement(fNumericSet, status);
2739 fSets->addElement(fFormatSet, status); 3135 fSets->addElement(fFormatSet, status);
2740 fSets->addElement(fExtendSet, status); 3136 fSets->addElement(fExtendSet, status);
2741 fSets->addElement(fOtherSet, status); 3137 fSets->addElement(fOtherSet, status);
2742 fSets->addElement(fExtendNumLetSet, status); 3138 fSets->addElement(fExtendNumLetSet, status);
2743 3139
2744 if (U_FAILURE(status)) { 3140 if (U_FAILURE(status)) {
(...skipping 1226 matching lines...) Expand 10 before | Expand all | Expand 10 after
3971 printStringBreaks(ustr, expected, expectedcount); 4367 printStringBreaks(ustr, expected, expectedcount);
3972 test->errln("isBoundary() failed. Not expecting boundary at position %d", j); 4368 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3973 return; 4369 return;
3974 } 4370 }
3975 } 4371 }
3976 } 4372 }
3977 4373
3978 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 4374 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3979 count --; 4375 count --;
3980 if (forward[count] != i) { 4376 if (forward[count] != i) {
4377 printStringBreaks(ustr, expected, expectedcount);
3981 test->errln("happy break test previous() failed: expected %d but got %d", 4378 test->errln("happy break test previous() failed: expected %d but got %d",
3982 forward[count], i); 4379 forward[count], i);
3983 break; 4380 break;
3984 } 4381 }
3985 } 4382 }
3986 if (count != 0) { 4383 if (count != 0) {
3987 printStringBreaks(ustr, expected, expectedcount); 4384 printStringBreaks(ustr, expected, expectedcount);
3988 test->errln("break test previous() failed: missed a match"); 4385 test->errln("break test previous() failed: missed a match");
3989 return; 4386 return;
3990 } 4387 }
(...skipping 13 matching lines...) Expand all
4004 } 4401 }
4005 4402
4006 void RBBITest::TestWordBreaks(void) 4403 void RBBITest::TestWordBreaks(void)
4007 { 4404 {
4008 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4405 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4009 4406
4010 Locale locale("en"); 4407 Locale locale("en");
4011 UErrorCode status = U_ZERO_ERROR; 4408 UErrorCode status = U_ZERO_ERROR;
4012 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4409 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4013 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4410 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4411 // Replaced any C+J characters in a row with a random sequence of characters
4412 // of the same length to make our C+J segmentation not get in the way.
4014 static const char *strlist[] = 4413 static const char *strlist[] =
4015 { 4414 {
4016 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 4415 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
4017 "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b", 4416 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
4018 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 4417 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
4019 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 4418 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
4020 "\\u90ca\\u3588\\u009c\\u0953\\u194b", 4419 "\\uac00\\u3588\\u009c\\u0953\\u194b",
4021 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 4420 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
4022 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 4421 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
4023 "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", 4422 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
4024 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 4423 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
4025 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 4424 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
4026 "\\u2027\\U000e0067\\u0a47\\u00b7", 4425 "\\u2027\\U000e0067\\u0a47\\u00b7",
4027 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 4426 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
4028 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 4427 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
4029 "\\u0589\\U000e006e\\u0a42\\U000104a5", 4428 "\\u0589\\U000e006e\\u0a42\\U000104a5",
4030 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 4429 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
4031 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 4430 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
4032 "\\u0027\\u11af\\U000e0057\\u0602", 4431 "\\u0027\\u11af\\U000e0057\\u0602",
4033 "\\U0001d7f2\\U000e007\\u0004\\u0589", 4432 "\\U0001d7f2\\U000e007\\u0004\\u0589",
4034 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 4433 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
4035 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 4434 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
4036 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 4435 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
4037 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 4436 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
4038 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 4437 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
4039 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 4438 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
4040 "\\u0233\\U000e0020\\u0a69\\u0d6a", 4439 "\\u0233\\U000e0020\\u0a69\\u0d6a",
4041 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 4440 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
4042 "\\u58f4\\U000e0049\\u20e7\\u2027", 4441 "\\u18f4\\U000e0049\\u20e7\\u2027",
4043 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 4442 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
4044 "\\ua183\\u102d\\u0bec\\u003a", 4443 "\\ua183\\u102d\\u0bec\\u003a",
4045 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 4444 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
4046 "\\u003a\\u0e57\\u0fad\\u002e", 4445 "\\u003a\\u0e57\\u0fad\\u002e",
4047 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 4446 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
4048 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 4447 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
4049 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 4448 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
4050 "\\u003a\\u0664\\u00b7\\u1fba", 4449 "\\u003a\\u0664\\u00b7\\u1fba",
4051 "\\u003b\\u0027\\u00b7\\u47a3", 4450 "\\u003b\\u0027\\u00b7\\u47a3",
4052 "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b", 4451 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
4053 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 4452 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
4054 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 4453 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
4055 }; 4454 };
4056 int loop; 4455 int loop;
4057 if (U_FAILURE(status)) { 4456 if (U_FAILURE(status)) {
4058 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4457 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4059 return; 4458 return;
4060 } 4459 }
4061 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) { 4460 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4062 // printf("looping %d\n", loop); 4461 // printf("looping %d\n", loop);
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
4097 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 4496 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
4098 "\\u0589\\U000e006e\\u0a42\\U000104a5", 4497 "\\u0589\\U000e006e\\u0a42\\U000104a5",
4099 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 4498 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
4100 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 4499 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
4101 "\\u0027\\u11af\\U000e0057\\u0602", 4500 "\\u0027\\u11af\\U000e0057\\u0602",
4102 "\\U0001d7f2\\U000e007\\u0004\\u0589", 4501 "\\U0001d7f2\\U000e007\\u0004\\u0589",
4103 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 4502 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
4104 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 4503 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
4105 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 4504 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
4106 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 4505 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
4107 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 4506 "\\U000e0065\\u302c\\u09ee\\U000e0068",
4108 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 4507 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
4109 "\\u0233\\U000e0020\\u0a69\\u0d6a", 4508 "\\u0233\\U000e0020\\u0a69\\u0d6a",
4110 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 4509 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
4111 "\\u58f4\\U000e0049\\u20e7\\u2027", 4510 "\\u58f4\\U000e0049\\u20e7\\u2027",
4112 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 4511 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
4113 "\\ua183\\u102d\\u0bec\\u003a", 4512 "\\ua183\\u102d\\u0bec\\u003a",
4114 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 4513 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
4115 "\\u003a\\u0e57\\u0fad\\u002e", 4514 "\\u003a\\u0e57\\u0fad\\u002e",
4116 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 4515 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
4117 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 4516 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
4118 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 4517 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
4119 "\\u003a\\u0664\\u00b7\\u1fba", 4518 "\\u003a\\u0664\\u00b7\\u1fba",
4120 "\\u003b\\u0027\\u00b7\\u47a3", 4519 "\\u003b\\u0027\\u00b7\\u47a3",
4121 }; 4520 };
4122 int loop; 4521 int loop;
(...skipping 662 matching lines...) Expand 10 before | Expand all | Expand 10 after
4785 pos = bi->last(); 5184 pos = bi->last();
4786 do { 5185 do {
4787 // ruleStatus = bi->getRuleStatus(); 5186 // ruleStatus = bi->getRuleStatus();
4788 printf("%d\t%d\n", pos, ruleStatus); 5187 printf("%d\t%d\n", pos, ruleStatus);
4789 pos = bi->previous(); 5188 pos = bi->previous();
4790 } while (pos != BreakIterator::DONE); 5189 } while (pos != BreakIterator::DONE);
4791 #endif 5190 #endif
4792 } 5191 }
4793 5192
4794 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 5193 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
OLDNEW
« no previous file with comments | « icu46/source/test/intltest/rbbitst.h ('k') | icu46/source/test/testdata/rbbitst.txt » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698