Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(454)

Side by Side Diff: source/common/dictbe.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: remove unusued directories Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/common/dictbe.h ('k') | source/common/dictionarydata.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /** 1 /**
2 ******************************************************************************* 2 *******************************************************************************
3 * Copyright (C) 2006-2013, International Business Machines Corporation 3 * Copyright (C) 2006-2014, International Business Machines Corporation
4 * and others. All Rights Reserved. 4 * and others. All Rights Reserved.
5 ******************************************************************************* 5 *******************************************************************************
6 */ 6 */
7 7
8 #include "unicode/utypes.h" 8 #include "unicode/utypes.h"
9 9
10 #if !UCONFIG_NO_BREAK_ITERATION 10 #if !UCONFIG_NO_BREAK_ITERATION
11 11
12 #include "brkeng.h" 12 #include "brkeng.h"
13 #include "dictbe.h" 13 #include "dictbe.h"
14 #include "unicode/uniset.h" 14 #include "unicode/uniset.h"
15 #include "unicode/chariter.h" 15 #include "unicode/chariter.h"
16 #include "unicode/ubrk.h" 16 #include "unicode/ubrk.h"
17 #include "uvectr32.h"
17 #include "uvector.h" 18 #include "uvector.h"
18 #include "uassert.h" 19 #include "uassert.h"
19 #include "unicode/normlzr.h" 20 #include "unicode/normlzr.h"
20 #include "cmemory.h" 21 #include "cmemory.h"
21 #include "dictionarydata.h" 22 #include "dictionarydata.h"
22 23
23 U_NAMESPACE_BEGIN 24 U_NAMESPACE_BEGIN
24 25
25 /* 26 /*
26 ****************************************************************** 27 ******************************************************************
(...skipping 15 matching lines...) Expand all
42 int32_t 43 int32_t
43 DictionaryBreakEngine::findBreaks( UText *text, 44 DictionaryBreakEngine::findBreaks( UText *text,
44 int32_t startPos, 45 int32_t startPos,
45 int32_t endPos, 46 int32_t endPos,
46 UBool reverse, 47 UBool reverse,
47 int32_t breakType, 48 int32_t breakType,
48 UStack &foundBreaks ) const { 49 UStack &foundBreaks ) const {
49 int32_t result = 0; 50 int32_t result = 0;
50 51
51 // Find the span of characters included in the set. 52 // Find the span of characters included in the set.
53 // The span to break begins at the current position in the text, and
54 // extends towards the start or end of the text, depending on 'reverse'.
55
52 int32_t start = (int32_t)utext_getNativeIndex(text); 56 int32_t start = (int32_t)utext_getNativeIndex(text);
53 int32_t current; 57 int32_t current;
54 int32_t rangeStart; 58 int32_t rangeStart;
55 int32_t rangeEnd; 59 int32_t rangeEnd;
56 UChar32 c = utext_current32(text); 60 UChar32 c = utext_current32(text);
57 if (reverse) { 61 if (reverse) {
58 UBool isDict = fSet.contains(c); 62 UBool isDict = fSet.contains(c);
59 while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDi ct) { 63 while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDi ct) {
60 c = utext_previous32(text); 64 c = utext_previous32(text);
61 isDict = fSet.contains(c); 65 isDict = fSet.contains(c);
62 } 66 }
63 rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1); 67 if (current < startPos) {
64 rangeEnd = start + 1; 68 rangeStart = startPos;
69 } else {
70 rangeStart = current;
71 if (!isDict) {
72 utext_next32(text);
73 rangeStart = utext_getNativeIndex(text);
74 }
75 }
76 // rangeEnd = start + 1;
77 utext_setNativeIndex(text, start);
78 utext_next32(text);
79 rangeEnd = utext_getNativeIndex(text);
65 } 80 }
66 else { 81 else {
67 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.c ontains(c)) { 82 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.c ontains(c)) {
68 utext_next32(text); // TODO: recast loop for postincrement 83 utext_next32(text); // TODO: recast loop for postincrement
69 c = utext_current32(text); 84 c = utext_current32(text);
70 } 85 }
71 rangeStart = start; 86 rangeStart = start;
72 rangeEnd = current; 87 rangeEnd = current;
73 } 88 }
74 if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes )) { 89 if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes )) {
(...skipping 14 matching lines...) Expand all
89 /* 104 /*
90 ****************************************************************** 105 ******************************************************************
91 * PossibleWord 106 * PossibleWord
92 */ 107 */
93 108
94 // Helper class for improving readability of the Thai/Lao/Khmer word break 109 // Helper class for improving readability of the Thai/Lao/Khmer word break
95 // algorithm. The implementation is completely inline. 110 // algorithm. The implementation is completely inline.
96 111
97 // List size, limited by the maximum number of words in the dictionary 112 // List size, limited by the maximum number of words in the dictionary
98 // that form a nested sequence. 113 // that form a nested sequence.
99 #define POSSIBLE_WORD_LIST_MAX 20 114 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
100 115
101 class PossibleWord { 116 class PossibleWord {
102 private: 117 private:
103 // list of word candidate lengths, in increasing length order 118 // list of word candidate lengths, in increasing length order
104 int32_t lengths[POSSIBLE_WORD_LIST_MAX]; 119 // TODO: bytes would be sufficient for word lengths.
105 int32_t count; // Count of candidates 120 int32_t count; // Count of candidates
106 int32_t prefix; // The longest match with a dictionary word 121 int32_t prefix; // The longest match with a dictionary word
107 int32_t offset; // Offset in the text of these candidates 122 int32_t offset; // Offset in the text of these candidates
108 int mark; // The preferred candidate's offset 123 int32_t mark; // The preferred candidate's offset
109 int current; // The candidate we're currently looking at 124 int32_t current; // The candidate we're currently looking at
125 int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.
126 int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.
110 127
111 public: 128 public:
112 PossibleWord(); 129 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
113 ~PossibleWord(); 130 ~PossibleWord() {};
114 131
115 // Fill the list of candidates if needed, select the longest, and return the number found 132 // Fill the list of candidates if needed, select the longest, and return the number found
116 int candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ); 133 int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
117 134
118 // Select the currently marked candidate, point after it in the text, and invalidate self 135 // Select the currently marked candidate, point after it in the text, and invalidate self
119 int32_t acceptMarked( UText *text ); 136 int32_t acceptMarked( UText *text );
120 137
121 // Back up from the current candidate to the next shorter one; return TRUE if that exists 138 // Back up from the current candidate to the next shorter one; return TRUE if that exists
122 // and point the text after it 139 // and point the text after it
123 UBool backUp( UText *text ); 140 UBool backUp( UText *text );
124 141
125 // Return the longest prefix this candidate location shares with a dictionary word 142 // Return the longest prefix this candidate location shares with a dictionary word
126 int32_t longestPrefix(); 143 // Return value is in code points.
144 int32_t longestPrefix() { return prefix; };
127 145
128 // Mark the current candidate as the one we like 146 // Mark the current candidate as the one we like
129 void markCurrent(); 147 void markCurrent() { mark = current; };
148
149 // Get length in code points of the marked word.
150 int32_t markedCPLength() { return cpLengths[mark]; };
130 }; 151 };
131 152
132 inline
133 PossibleWord::PossibleWord() {
134 offset = -1;
135 }
136 153
137 inline 154 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
138 PossibleWord::~PossibleWord() {
139 }
140
141 inline int
142 PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
143 // TODO: If getIndex is too slow, use offset < 0 and add discardAll() 155 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
144 int32_t start = (int32_t)utext_getNativeIndex(text); 156 int32_t start = (int32_t)utext_getNativeIndex(text);
145 if (start != offset) { 157 if (start != offset) {
146 offset = start; 158 offset = start;
147 prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(leng ths)/sizeof(lengths[0])); 159 count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cu Lengths, cpLengths, NULL, &prefix);
148 // Dictionary leaves text after longest prefix, not longest word. Back up. 160 // Dictionary leaves text after longest prefix, not longest word. Back up.
149 if (count <= 0) { 161 if (count <= 0) {
150 utext_setNativeIndex(text, start); 162 utext_setNativeIndex(text, start);
151 } 163 }
152 } 164 }
153 if (count > 0) { 165 if (count > 0) {
154 utext_setNativeIndex(text, start+lengths[count-1]); 166 utext_setNativeIndex(text, start+cuLengths[count-1]);
155 } 167 }
156 current = count-1; 168 current = count-1;
157 mark = current; 169 mark = current;
158 return count; 170 return count;
159 } 171 }
160 172
161 inline int32_t 173 int32_t
162 PossibleWord::acceptMarked( UText *text ) { 174 PossibleWord::acceptMarked( UText *text ) {
163 utext_setNativeIndex(text, offset + lengths[mark]); 175 utext_setNativeIndex(text, offset + cuLengths[mark]);
164 return lengths[mark]; 176 return cuLengths[mark];
165 } 177 }
166 178
167 inline UBool 179
180 UBool
168 PossibleWord::backUp( UText *text ) { 181 PossibleWord::backUp( UText *text ) {
169 if (current > 0) { 182 if (current > 0) {
170 utext_setNativeIndex(text, offset + lengths[--current]); 183 utext_setNativeIndex(text, offset + cuLengths[--current]);
171 return TRUE; 184 return TRUE;
172 } 185 }
173 return FALSE; 186 return FALSE;
174 } 187 }
175 188
176 inline int32_t
177 PossibleWord::longestPrefix() {
178 return prefix;
179 }
180
181 inline void
182 PossibleWord::markCurrent() {
183 mark = current;
184 }
185
186 /* 189 /*
187 ****************************************************************** 190 ******************************************************************
188 * ThaiBreakEngine 191 * ThaiBreakEngine
189 */ 192 */
190 193
191 // How many words in a row are "good enough"? 194 // How many words in a row are "good enough"?
192 #define THAI_LOOKAHEAD 3 195 static const int32_t THAI_LOOKAHEAD = 3;
193 196
194 // Will not combine a non-word with a preceding dictionary word longer than this 197 // Will not combine a non-word with a preceding dictionary word longer than this
195 #define THAI_ROOT_COMBINE_THRESHOLD 3 198 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
196 199
197 // Will not combine a non-word that shares at least this much prefix with a 200 // Will not combine a non-word that shares at least this much prefix with a
198 // dictionary word, with a preceding word 201 // dictionary word, with a preceding word
199 #define THAI_PREFIX_COMBINE_THRESHOLD 3 202 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
200 203
201 // Ellision character 204 // Ellision character
202 #define THAI_PAIYANNOI 0x0E2F 205 static const int32_t THAI_PAIYANNOI = 0x0E2F;
203 206
204 // Repeat character 207 // Repeat character
205 #define THAI_MAIYAMOK 0x0E46 208 static const int32_t THAI_MAIYAMOK = 0x0E46;
206 209
207 // Minimum word size 210 // Minimum word size
208 #define THAI_MIN_WORD 2 211 static const int32_t THAI_MIN_WORD = 2;
209 212
210 // Minimum number of characters for two words 213 // Minimum number of characters for two words
211 #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2) 214 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
212 215
213 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 216 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
214 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 217 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
215 fDictionary(adoptDictionary) 218 fDictionary(adoptDictionary)
216 { 219 {
217 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]] "), status); 220 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]] "), status);
218 if (U_SUCCESS(status)) { 221 if (U_SUCCESS(status)) {
219 setCharacters(fThaiWordSet); 222 setCharacters(fThaiWordSet);
220 } 223 }
221 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M: ]]"), status); 224 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M: ]]"), status);
(...skipping 15 matching lines...) Expand all
237 240
238 ThaiBreakEngine::~ThaiBreakEngine() { 241 ThaiBreakEngine::~ThaiBreakEngine() {
239 delete fDictionary; 242 delete fDictionary;
240 } 243 }
241 244
242 int32_t 245 int32_t
243 ThaiBreakEngine::divideUpDictionaryRange( UText *text, 246 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
244 int32_t rangeStart, 247 int32_t rangeStart,
245 int32_t rangeEnd, 248 int32_t rangeEnd,
246 UStack &foundBreaks ) const { 249 UStack &foundBreaks ) const {
247 if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) { 250 utext_setNativeIndex(text, rangeStart);
251 utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
252 if (utext_getNativeIndex(text) >= rangeEnd) {
248 return 0; // Not enough characters for two words 253 return 0; // Not enough characters for two words
249 } 254 }
255 utext_setNativeIndex(text, rangeStart);
256
250 257
251 uint32_t wordsFound = 0; 258 uint32_t wordsFound = 0;
252 int32_t wordLength; 259 int32_t cpWordLength = 0; // Word Length in Code Points.
260 int32_t cuWordLength = 0; // Word length in code units (UText native inde xing)
253 int32_t current; 261 int32_t current;
254 UErrorCode status = U_ZERO_ERROR; 262 UErrorCode status = U_ZERO_ERROR;
255 PossibleWord words[THAI_LOOKAHEAD]; 263 PossibleWord words[THAI_LOOKAHEAD];
256 UChar32 uc;
257 264
258 utext_setNativeIndex(text, rangeStart); 265 utext_setNativeIndex(text, rangeStart);
259 266
260 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 267 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
261 wordLength = 0; 268 cpWordLength = 0;
269 cuWordLength = 0;
262 270
263 // Look for candidate words at the current position 271 // Look for candidate words at the current position
264 int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDict ionary, rangeEnd); 272 int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, f Dictionary, rangeEnd);
265 273
266 // If we found exactly one, use that 274 // If we found exactly one, use that
267 if (candidates == 1) { 275 if (candidates == 1) {
268 wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 276 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text) ;
277 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
269 wordsFound += 1; 278 wordsFound += 1;
270 } 279 }
271 // If there was more than one, see which one can take us forward the most words 280 // If there was more than one, see which one can take us forward the most words
272 else if (candidates > 1) { 281 else if (candidates > 1) {
273 // If we're already at the end of the range, we're done 282 // If we're already at the end of the range, we're done
274 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 283 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
275 goto foundBest; 284 goto foundBest;
276 } 285 }
277 do { 286 do {
278 int wordsMatched = 1; 287 int32_t wordsMatched = 1;
279 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fD ictionary, rangeEnd) > 0) { 288 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fD ictionary, rangeEnd) > 0) {
280 if (wordsMatched < 2) { 289 if (wordsMatched < 2) {
281 // Followed by another dictionary word; mark first word as a good candidate 290 // Followed by another dictionary word; mark first word as a good candidate
282 words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 291 words[wordsFound%THAI_LOOKAHEAD].markCurrent();
283 wordsMatched = 2; 292 wordsMatched = 2;
284 } 293 }
285 294
286 // If we're already at the end of the range, we're done 295 // If we're already at the end of the range, we're done
287 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 296 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
288 goto foundBest; 297 goto foundBest;
289 } 298 }
290 299
291 // See if any of the possible second words is followed by a third word 300 // See if any of the possible second words is followed by a third word
292 do { 301 do {
293 // If we find a third word, stop right away 302 // If we find a third word, stop right away
294 if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates( text, fDictionary, rangeEnd)) { 303 if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates( text, fDictionary, rangeEnd)) {
295 words[wordsFound % THAI_LOOKAHEAD].markCurrent(); 304 words[wordsFound % THAI_LOOKAHEAD].markCurrent();
296 goto foundBest; 305 goto foundBest;
297 } 306 }
298 } 307 }
299 while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text) ); 308 while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text) );
300 } 309 }
301 } 310 }
302 while (words[wordsFound % THAI_LOOKAHEAD].backUp(text)); 311 while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
303 foundBest: 312 foundBest:
304 wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 313 // Set UText position to after the accepted word.
314 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text) ;
315 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
305 wordsFound += 1; 316 wordsFound += 1;
306 } 317 }
307 318
308 // We come here after having either found a word or not. We look ahead to the 319 // We come here after having either found a word or not. We look ahead to the
309 // next word. If it's not a dictionary word, we will combine it withe the word we 320 // next word. If it's not a dictionary word, we will combine it with the word we
310 // just found (if there is one), but only if the preceding word does not exceed 321 // just found (if there is one), but only if the preceding word does not exceed
311 // the threshold. 322 // the threshold.
312 // The text iterator should now be positioned at the end of the word we found. 323 // The text iterator should now be positioned at the end of the word we found.
313 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ ROOT_COMBINE_THRESHOLD) { 324
325 UChar32 uc = 0;
326 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < TH AI_ROOT_COMBINE_THRESHOLD) {
314 // if it is a dictionary word, do nothing. If it isn't, then if there is 327 // if it is a dictionary word, do nothing. If it isn't, then if there is
315 // no preceding word, or the non-word shares less than the minimum threshold 328 // no preceding word, or the non-word shares less than the minimum threshold
316 // of characters with a dictionary word, then scan to resynchronize 329 // of characters with a dictionary word, then scan to resynchronize
317 if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 330 if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
318 && (wordLength == 0 331 && (cuWordLength == 0
319 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI _PREFIX_COMBINE_THRESHOLD)) { 332 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI _PREFIX_COMBINE_THRESHOLD)) {
320 // Look for a plausible word boundary 333 // Look for a plausible word boundary
321 //TODO: This section will need a rework for UText. 334 int32_t remaining = rangeEnd - (current+cuWordLength);
322 int32_t remaining = rangeEnd - (current+wordLength); 335 UChar32 pc;
323 UChar32 pc = utext_current32(text);
324 int32_t chars = 0; 336 int32_t chars = 0;
325 for (;;) { 337 for (;;) {
326 utext_next32(text); 338 int32_t pcIndex = utext_getNativeIndex(text);
327 uc = utext_current32(text); 339 pc = utext_next32(text);
328 // TODO: Here we're counting on the fact that the SA languag es are all 340 int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
329 // in the BMP. This should get fixed with the UText rework. 341 chars += pcSize;
330 chars += 1; 342 remaining -= pcSize;
331 if (--remaining <= 0) { 343 if (remaining <= 0) {
332 break; 344 break;
333 } 345 }
346 uc = utext_current32(text);
334 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 347 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
335 // Maybe. See if it's in the dictionary. 348 // Maybe. See if it's in the dictionary.
336 // NOTE: In the original Apple code, checked that the next 349 // NOTE: In the original Apple code, checked that the next
337 // two characters after uc were not 0x0E4C THANTHAKHAT before 350 // two characters after uc were not 0x0E4C THANTHAKHAT before
338 // checking the dictionary. That is just a performance filter, 351 // checking the dictionary. That is just a performance filter,
339 // but it's not clear it's faster than checking the trie. 352 // but it's not clear it's faster than checking the trie.
340 int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD ].candidates(text, fDictionary, rangeEnd); 353 int32_t candidates = words[(wordsFound + 1) % THAI_LOOKA HEAD].candidates(text, fDictionary, rangeEnd);
341 utext_setNativeIndex(text, current + wordLength + chars) ; 354 utext_setNativeIndex(text, current + cuWordLength + char s);
342 if (candidates > 0) { 355 if (candidates > 0) {
343 break; 356 break;
344 } 357 }
345 } 358 }
346 pc = uc;
347 } 359 }
348 360
349 // Bump the word count if there wasn't already one 361 // Bump the word count if there wasn't already one
350 if (wordLength <= 0) { 362 if (cuWordLength <= 0) {
351 wordsFound += 1; 363 wordsFound += 1;
352 } 364 }
353 365
354 // Update the length with the passed-over characters 366 // Update the length with the passed-over characters
355 wordLength += chars; 367 cuWordLength += chars;
356 } 368 }
357 else { 369 else {
358 // Back up to where we were for next iteration 370 // Back up to where we were for next iteration
359 utext_setNativeIndex(text, current+wordLength); 371 utext_setNativeIndex(text, current+cuWordLength);
360 } 372 }
361 } 373 }
362 374
363 // Never stop before a combining mark. 375 // Never stop before a combining mark.
364 int32_t currPos; 376 int32_t currPos;
365 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) { 377 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) {
366 utext_next32(text); 378 utext_next32(text);
367 wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 379 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
368 } 380 }
369 381
370 // Look ahead for possible suffixes if a dictionary word does not follow. 382 // Look ahead for possible suffixes if a dictionary word does not follow.
371 // We do this in code rather than using a rule so that the heuristic 383 // We do this in code rather than using a rule so that the heuristic
372 // resynch continues to function. For example, one of the suffix characters 384 // resynch continues to function. For example, one of the suffix characters
373 // could be a typo in the middle of a word. 385 // could be a typo in the middle of a word.
374 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 386 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
375 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, r angeEnd) <= 0 387 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, r angeEnd) <= 0
376 && fSuffixSet.contains(uc = utext_current32(text))) { 388 && fSuffixSet.contains(uc = utext_current32(text))) {
377 if (uc == THAI_PAIYANNOI) { 389 if (uc == THAI_PAIYANNOI) {
378 if (!fSuffixSet.contains(utext_previous32(text))) { 390 if (!fSuffixSet.contains(utext_previous32(text))) {
379 // Skip over previous end and PAIYANNOI 391 // Skip over previous end and PAIYANNOI
380 utext_next32(text); 392 utext_next32(text);
393 int32_t paiyannoiIndex = utext_getNativeIndex(text);
381 utext_next32(text); 394 utext_next32(text);
382 wordLength += 1; // Add PAIYANNOI to word 395 cuWordLength += utext_getNativeIndex(text) - paiyannoiIn dex; // Add PAIYANNOI to word
383 uc = utext_current32(text); // Fetch next character 396 uc = utext_current32(text); // Fetch next character
384 } 397 }
385 else { 398 else {
386 // Restore prior position 399 // Restore prior position
387 utext_next32(text); 400 utext_next32(text);
388 } 401 }
389 } 402 }
390 if (uc == THAI_MAIYAMOK) { 403 if (uc == THAI_MAIYAMOK) {
391 if (utext_previous32(text) != THAI_MAIYAMOK) { 404 if (utext_previous32(text) != THAI_MAIYAMOK) {
392 // Skip over previous end and MAIYAMOK 405 // Skip over previous end and MAIYAMOK
393 utext_next32(text); 406 utext_next32(text);
407 int32_t maiyamokIndex = utext_getNativeIndex(text);
394 utext_next32(text); 408 utext_next32(text);
395 wordLength += 1; // Add MAIYAMOK to word 409 cuWordLength += utext_getNativeIndex(text) - maiyamokInd ex; // Add MAIYAMOK to word
396 } 410 }
397 else { 411 else {
398 // Restore prior position 412 // Restore prior position
399 utext_next32(text); 413 utext_next32(text);
400 } 414 }
401 } 415 }
402 } 416 }
403 else { 417 else {
404 utext_setNativeIndex(text, current+wordLength); 418 utext_setNativeIndex(text, current+cuWordLength);
405 } 419 }
406 } 420 }
407 421
408 // Did we find a word on this iteration? If so, push it on the break stack 422 // Did we find a word on this iteration? If so, push it on the break stack
409 if (wordLength > 0) { 423 if (cuWordLength > 0) {
410 foundBreaks.push((current+wordLength), status); 424 foundBreaks.push((current+cuWordLength), status);
411 } 425 }
412 } 426 }
413 427
414 // Don't return a break for the end of the dictionary range if there is one there. 428 // Don't return a break for the end of the dictionary range if there is one there.
415 if (foundBreaks.peeki() >= rangeEnd) { 429 if (foundBreaks.peeki() >= rangeEnd) {
416 (void) foundBreaks.popi(); 430 (void) foundBreaks.popi();
417 wordsFound -= 1; 431 wordsFound -= 1;
418 } 432 }
419 433
420 return wordsFound; 434 return wordsFound;
421 } 435 }
422 436
423 /* 437 /*
424 ****************************************************************** 438 ******************************************************************
425 * LaoBreakEngine 439 * LaoBreakEngine
426 */ 440 */
427 441
428 // How many words in a row are "good enough"? 442 // How many words in a row are "good enough"?
429 #define LAO_LOOKAHEAD 3 443 static const int32_t LAO_LOOKAHEAD = 3;
430 444
431 // Will not combine a non-word with a preceding dictionary word longer than this 445 // Will not combine a non-word with a preceding dictionary word longer than this
432 #define LAO_ROOT_COMBINE_THRESHOLD 3 446 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
433 447
434 // Will not combine a non-word that shares at least this much prefix with a 448 // Will not combine a non-word that shares at least this much prefix with a
435 // dictionary word, with a preceding word 449 // dictionary word, with a preceding word
436 #define LAO_PREFIX_COMBINE_THRESHOLD 3 450 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
437 451
438 // Minimum word size 452 // Minimum word size
439 #define LAO_MIN_WORD 2 453 static const int32_t LAO_MIN_WORD = 2;
440 454
441 // Minimum number of characters for two words 455 // Minimum number of characters for two words
442 #define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2) 456 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
443 457
444 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s tatus) 458 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s tatus)
445 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 459 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
446 fDictionary(adoptDictionary) 460 fDictionary(adoptDictionary)
447 { 461 {
448 fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]" ), status); 462 fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]" ), status);
449 if (U_SUCCESS(status)) { 463 if (U_SUCCESS(status)) {
450 setCharacters(fLaoWordSet); 464 setCharacters(fLaoWordSet);
451 } 465 }
452 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M: ]]"), status); 466 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M: ]]"), status);
(...skipping 17 matching lines...) Expand all
470 int32_t 484 int32_t
471 LaoBreakEngine::divideUpDictionaryRange( UText *text, 485 LaoBreakEngine::divideUpDictionaryRange( UText *text,
472 int32_t rangeStart, 486 int32_t rangeStart,
473 int32_t rangeEnd, 487 int32_t rangeEnd,
474 UStack &foundBreaks ) const { 488 UStack &foundBreaks ) const {
475 if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { 489 if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
476 return 0; // Not enough characters for two words 490 return 0; // Not enough characters for two words
477 } 491 }
478 492
479 uint32_t wordsFound = 0; 493 uint32_t wordsFound = 0;
480 int32_t wordLength; 494 int32_t cpWordLength = 0;
495 int32_t cuWordLength = 0;
481 int32_t current; 496 int32_t current;
482 UErrorCode status = U_ZERO_ERROR; 497 UErrorCode status = U_ZERO_ERROR;
483 PossibleWord words[LAO_LOOKAHEAD]; 498 PossibleWord words[LAO_LOOKAHEAD];
484 UChar32 uc;
485 499
486 utext_setNativeIndex(text, rangeStart); 500 utext_setNativeIndex(text, rangeStart);
487 501
488 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 502 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
489 wordLength = 0; 503 cuWordLength = 0;
504 cpWordLength = 0;
490 505
491 // Look for candidate words at the current position 506 // Look for candidate words at the current position
492 int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDicti onary, rangeEnd); 507 int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fD ictionary, rangeEnd);
493 508
494 // If we found exactly one, use that 509 // If we found exactly one, use that
495 if (candidates == 1) { 510 if (candidates == 1) {
496 wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 511 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
512 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
497 wordsFound += 1; 513 wordsFound += 1;
498 } 514 }
499 // If there was more than one, see which one can take us forward the most words 515 // If there was more than one, see which one can take us forward the most words
500 else if (candidates > 1) { 516 else if (candidates > 1) {
501 // If we're already at the end of the range, we're done 517 // If we're already at the end of the range, we're done
502 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 518 if (utext_getNativeIndex(text) >= rangeEnd) {
503 goto foundBest; 519 goto foundBest;
504 } 520 }
505 do { 521 do {
506 int wordsMatched = 1; 522 int32_t wordsMatched = 1;
507 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDi ctionary, rangeEnd) > 0) { 523 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDi ctionary, rangeEnd) > 0) {
508 if (wordsMatched < 2) { 524 if (wordsMatched < 2) {
509 // Followed by another dictionary word; mark first word as a good candidate 525 // Followed by another dictionary word; mark first word as a good candidate
510 words[wordsFound%LAO_LOOKAHEAD].markCurrent(); 526 words[wordsFound%LAO_LOOKAHEAD].markCurrent();
511 wordsMatched = 2; 527 wordsMatched = 2;
512 } 528 }
513 529
514 // If we're already at the end of the range, we're done 530 // If we're already at the end of the range, we're done
515 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 531 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
516 goto foundBest; 532 goto foundBest;
517 } 533 }
518 534
519 // See if any of the possible second words is followed by a third word 535 // See if any of the possible second words is followed by a third word
520 do { 536 do {
521 // If we find a third word, stop right away 537 // If we find a third word, stop right away
522 if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(t ext, fDictionary, rangeEnd)) { 538 if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(t ext, fDictionary, rangeEnd)) {
523 words[wordsFound % LAO_LOOKAHEAD].markCurrent(); 539 words[wordsFound % LAO_LOOKAHEAD].markCurrent();
524 goto foundBest; 540 goto foundBest;
525 } 541 }
526 } 542 }
527 while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text)) ; 543 while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text)) ;
528 } 544 }
529 } 545 }
530 while (words[wordsFound % LAO_LOOKAHEAD].backUp(text)); 546 while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
531 foundBest: 547 foundBest:
532 wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 548 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
549 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
533 wordsFound += 1; 550 wordsFound += 1;
534 } 551 }
535 552
536 // We come here after having either found a word or not. We look ahead to the 553 // We come here after having either found a word or not. We look ahead to the
537 // next word. If it's not a dictionary word, we will combine it with the word we 554 // next word. If it's not a dictionary word, we will combine it with the word we
538 // just found (if there is one), but only if the preceding word does not exceed 555 // just found (if there is one), but only if the preceding word does not exceed
539 // the threshold. 556 // the threshold.
540 // The text iterator should now be positioned at the end of the word we found. 557 // The text iterator should now be positioned at the end of the word we found.
541 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_R OOT_COMBINE_THRESHOLD) { 558 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO _ROOT_COMBINE_THRESHOLD) {
542 // if it is a dictionary word, do nothing. If it isn't, then if there is 559 // if it is a dictionary word, do nothing. If it isn't, then if there is
543 // no preceding word, or the non-word shares less than the minimum threshold 560 // no preceding word, or the non-word shares less than the minimum threshold
544 // of characters with a dictionary word, then scan to resynchronize 561 // of characters with a dictionary word, then scan to resynchronize
545 if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 562 if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
546 && (wordLength == 0 563 && (cuWordLength == 0
547 || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_P REFIX_COMBINE_THRESHOLD)) { 564 || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_P REFIX_COMBINE_THRESHOLD)) {
548 // Look for a plausible word boundary 565 // Look for a plausible word boundary
549 //TODO: This section will need a rework for UText. 566 int32_t remaining = rangeEnd - (current + cuWordLength);
550 int32_t remaining = rangeEnd - (current+wordLength); 567 UChar32 pc;
551 UChar32 pc = utext_current32(text); 568 UChar32 uc;
552 int32_t chars = 0; 569 int32_t chars = 0;
553 for (;;) { 570 for (;;) {
554 utext_next32(text); 571 int32_t pcIndex = utext_getNativeIndex(text);
555 uc = utext_current32(text); 572 pc = utext_next32(text);
556 // TODO: Here we're counting on the fact that the SA languag es are all 573 int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
557 // in the BMP. This should get fixed with the UText rework. 574 chars += pcSize;
558 chars += 1; 575 remaining -= pcSize;
559 if (--remaining <= 0) { 576 if (remaining <= 0) {
560 break; 577 break;
561 } 578 }
579 uc = utext_current32(text);
562 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 580 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
563 // Maybe. See if it's in the dictionary. 581 // Maybe. See if it's in the dictionary.
564 int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD] .candidates(text, fDictionary, rangeEnd); 582 // TODO: this looks iffy; compare with old code.
565 utext_setNativeIndex(text, current + wordLength + chars) ; 583 int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAH EAD].candidates(text, fDictionary, rangeEnd);
584 utext_setNativeIndex(text, current + cuWordLength + char s);
566 if (candidates > 0) { 585 if (candidates > 0) {
567 break; 586 break;
568 } 587 }
569 } 588 }
570 pc = uc;
571 } 589 }
572 590
573 // Bump the word count if there wasn't already one 591 // Bump the word count if there wasn't already one
574 if (wordLength <= 0) { 592 if (cuWordLength <= 0) {
575 wordsFound += 1; 593 wordsFound += 1;
576 } 594 }
577 595
578 // Update the length with the passed-over characters 596 // Update the length with the passed-over characters
579 wordLength += chars; 597 cuWordLength += chars;
580 } 598 }
581 else { 599 else {
582 // Back up to where we were for next iteration 600 // Back up to where we were for next iteration
583 utext_setNativeIndex(text, current+wordLength); 601 utext_setNativeIndex(text, current + cuWordLength);
584 } 602 }
585 } 603 }
586 604
587 // Never stop before a combining mark. 605 // Never stop before a combining mark.
588 int32_t currPos; 606 int32_t currPos;
589 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) { 607 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) {
590 utext_next32(text); 608 utext_next32(text);
591 wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 609 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
592 } 610 }
593 611
595 // Look ahead for possible suffixes if a dictionary word does not follow. 612 // Look ahead for possible suffixes if a dictionary word does not follow.
596 // We do this in code rather than using a rule so that the heuristic 613 // We do this in code rather than using a rule so that the heuristic
597 // resynch continues to function. For example, one of the suffix characters 614 // resynch continues to function. For example, one of the suffix characters
597 // could be a typo in the middle of a word. 615 // could be a typo in the middle of a word.
598 // NOT CURRENTLY APPLICABLE TO LAO 616 // NOT CURRENTLY APPLICABLE TO LAO
599 617
601 // Did we find a word on this iteration? If so, push it on the break stack 618 // Did we find a word on this iteration? If so, push it on the break stack
601 if (wordLength > 0) { 619 if (cuWordLength > 0) {
602 foundBreaks.push((current+wordLength), status); 620 foundBreaks.push((current+cuWordLength), status);
603 } 621 }
604 } 622 }
605 623
624 // Don't return a break for the end of the dictionary range if there is one there.
625 if (foundBreaks.peeki() >= rangeEnd) {
626 (void) foundBreaks.popi();
627 wordsFound -= 1;
628 }
629
630 return wordsFound;
631 }
632
633 /*
634 ******************************************************************
635 * BurmeseBreakEngine
636 */
637
638 // How many words in a row are "good enough"?
639 static const int32_t BURMESE_LOOKAHEAD = 3;
640
641 // Will not combine a non-word with a preceding dictionary word longer than this
642 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
643
644 // Will not combine a non-word that shares at least this much prefix with a
645 // dictionary word, with a preceding word
646 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
647
648 // Minimum word size
649 static const int32_t BURMESE_MIN_WORD = 2;
650
651 // Minimum number of characters for two words
652 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
653
654 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro rCode &status)
655 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
656 fDictionary(adoptDictionary)
657 {
658 fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA :]]"), status);
659 if (U_SUCCESS(status)) {
660 setCharacters(fBurmeseWordSet);
661 }
662 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M: ]]"), status);
663 fMarkSet.add(0x0020);
664 fEndWordSet = fBurmeseWordSet;
665 fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
666
667 // Compact for caching.
668 fMarkSet.compact();
669 fEndWordSet.compact();
670 fBeginWordSet.compact();
671 }
672
673 BurmeseBreakEngine::~BurmeseBreakEngine() {
674 delete fDictionary;
675 }
676
677 int32_t
678 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
679 int32_t rangeStart,
680 int32_t rangeEnd,
681 UStack &foundBreaks ) const {
682 if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
683 return 0; // Not enough characters for two words
684 }
685
686 uint32_t wordsFound = 0;
687 int32_t cpWordLength = 0;
688 int32_t cuWordLength = 0;
689 int32_t current;
690 UErrorCode status = U_ZERO_ERROR;
691 PossibleWord words[BURMESE_LOOKAHEAD];
692
693 utext_setNativeIndex(text, rangeStart);
694
695 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
696 cuWordLength = 0;
697 cpWordLength = 0;
698
699 // Look for candidate words at the current position
700 int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text , fDictionary, rangeEnd);
701
702 // If we found exactly one, use that
703 if (candidates == 1) {
704 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(te xt);
705 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength( );
706 wordsFound += 1;
707 }
708 // If there was more than one, see which one can take us forward the most words
709 else if (candidates > 1) {
710 // If we're already at the end of the range, we're done
711 if (utext_getNativeIndex(text) >= rangeEnd) {
712 goto foundBest;
713 }
714 do {
715 int32_t wordsMatched = 1;
716 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
717 if (wordsMatched < 2) {
718 // Followed by another dictionary word; mark first word as a good candidate
719 words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
720 wordsMatched = 2;
721 }
722
723 // If we're already at the end of the range, we're done
724 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
725 goto foundBest;
726 }
727
728 // See if any of the possible second words is followed by a third word
729 do {
730 // If we find a third word, stop right away
731 if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidat es(text, fDictionary, rangeEnd)) {
732 words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
733 goto foundBest;
734 }
735 }
736 while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(te xt));
737 }
738 }
739 while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
740 foundBest:
741 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(te xt);
742 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength( );
743 wordsFound += 1;
744 }
745
746 // We come here after having either found a word or not. We look ahead to the
747 // next word. If it's not a dictionary word, we will combine it with the word we
748 // just found (if there is one), but only if the preceding word does not exceed
749 // the threshold.
750 // The text iterator should now be positioned at the end of the word we found.
751 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BUR MESE_ROOT_COMBINE_THRESHOLD) {
752 // if it is a dictionary word, do nothing. If it isn't, then if there is
753 // no preceding word, or the non-word shares less than the minimum threshold
754 // of characters with a dictionary word, then scan to resynchronize
755 if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictiona ry, rangeEnd) <= 0
756 && (cuWordLength == 0
757 || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < B URMESE_PREFIX_COMBINE_THRESHOLD)) {
758 // Look for a plausible word boundary
759 int32_t remaining = rangeEnd - (current + cuWordLength);
760 UChar32 pc;
761 UChar32 uc;
762 int32_t chars = 0;
763 for (;;) {
764 int32_t pcIndex = utext_getNativeIndex(text);
765 pc = utext_next32(text);
766 int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
767 chars += pcSize;
768 remaining -= pcSize;
769 if (remaining <= 0) {
770 break;
771 }
772 uc = utext_current32(text);
773 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
774 // Maybe. See if it's in the dictionary.
775 // TODO: this looks iffy; compare with old code.
776 int32_t candidates = words[(wordsFound + 1) % BURMESE_LO OKAHEAD].candidates(text, fDictionary, rangeEnd);
777 utext_setNativeIndex(text, current + cuWordLength + char s);
778 if (candidates > 0) {
779 break;
780 }
781 }
782 }
783
784 // Bump the word count if there wasn't already one
785 if (cuWordLength <= 0) {
786 wordsFound += 1;
787 }
788
789 // Update the length with the passed-over characters
790 cuWordLength += chars;
791 }
792 else {
793 // Back up to where we were for next iteration
794 utext_setNativeIndex(text, current + cuWordLength);
795 }
796 }
797
798 // Never stop before a combining mark.
799 int32_t currPos;
800 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) {
801 utext_next32(text);
802 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
803 }
804
805 // Look ahead for possible suffixes if a dictionary word does not follow.
806 // We do this in code rather than using a rule so that the heuristic
807 // resynch continues to function. For example, one of the suffix characters
808 // could be a typo in the middle of a word.
809 // NOT CURRENTLY APPLICABLE TO BURMESE
810
811 // Did we find a word on this iteration? If so, push it on the break stack
812 if (cuWordLength > 0) {
813 foundBreaks.push((current+cuWordLength), status);
814 }
815 }
816
606 // Don't return a break for the end of the dictionary range if there is one there. 817 // Don't return a break for the end of the dictionary range if there is one there.
607 if (foundBreaks.peeki() >= rangeEnd) { 818 if (foundBreaks.peeki() >= rangeEnd) {
608 (void) foundBreaks.popi(); 819 (void) foundBreaks.popi();
609 wordsFound -= 1; 820 wordsFound -= 1;
610 } 821 }
611 822
612 return wordsFound; 823 return wordsFound;
613 } 824 }
614 825
615 /* 826 /*
616 ****************************************************************** 827 ******************************************************************
617 * KhmerBreakEngine 828 * KhmerBreakEngine
618 */ 829 */
619 830
620 // How many words in a row are "good enough"? 831 // How many words in a row are "good enough"?
621 #define KHMER_LOOKAHEAD 3 832 static const int32_t KHMER_LOOKAHEAD = 3;
622 833
623 // Will not combine a non-word with a preceding dictionary word longer than this 834 // Will not combine a non-word with a preceding dictionary word longer than this
624 #define KHMER_ROOT_COMBINE_THRESHOLD 10 835 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
625 836
626 // Will not combine a non-word that shares at least this much prefix with a 837 // Will not combine a non-word that shares at least this much prefix with a
627 // dictionary word, with a preceding word 838 // dictionary word, with a preceding word
628 #define KHMER_PREFIX_COMBINE_THRESHOLD 5 839 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
629 840
630 // Minimum word size 841 // Minimum word size
631 #define KHMER_MIN_WORD 2 842 static const int32_t KHMER_MIN_WORD = 2;
632 843
633 // Minimum number of characters for two words 844 // Minimum number of characters for two words
634 #define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2) 845 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
635 846
636 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod e &status) 847 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod e &status)
637 : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), 848 : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
638 fDictionary(adoptDictionary) 849 fDictionary(adoptDictionary)
639 { 850 {
640 fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:] ]"), status); 851 fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:] ]"), status);
641 if (U_SUCCESS(status)) { 852 if (U_SUCCESS(status)) {
642 setCharacters(fKhmerWordSet); 853 setCharacters(fKhmerWordSet);
643 } 854 }
644 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M: ]]"), status); 855 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M: ]]"), status);
(...skipping 26 matching lines...) Expand all
671 int32_t 882 int32_t
672 KhmerBreakEngine::divideUpDictionaryRange( UText *text, 883 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
673 int32_t rangeStart, 884 int32_t rangeStart,
674 int32_t rangeEnd, 885 int32_t rangeEnd,
675 UStack &foundBreaks ) const { 886 UStack &foundBreaks ) const {
676 if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { 887 if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
677 return 0; // Not enough characters for two words 888 return 0; // Not enough characters for two words
678 } 889 }
679 890
680 uint32_t wordsFound = 0; 891 uint32_t wordsFound = 0;
681 int32_t wordLength; 892 int32_t cpWordLength = 0;
893 int32_t cuWordLength = 0;
682 int32_t current; 894 int32_t current;
683 UErrorCode status = U_ZERO_ERROR; 895 UErrorCode status = U_ZERO_ERROR;
684 PossibleWord words[KHMER_LOOKAHEAD]; 896 PossibleWord words[KHMER_LOOKAHEAD];
685 UChar32 uc;
686 897
687 utext_setNativeIndex(text, rangeStart); 898 utext_setNativeIndex(text, rangeStart);
688 899
689 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 900 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
690 wordLength = 0; 901 cuWordLength = 0;
902 cpWordLength = 0;
691 903
692 // Look for candidate words at the current position 904 // Look for candidate words at the current position
693 int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDic tionary, rangeEnd); 905 int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
694 906
695 // If we found exactly one, use that 907 // If we found exactly one, use that
696 if (candidates == 1) { 908 if (candidates == 1) {
697 wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text); 909 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text );
910 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
698 wordsFound += 1; 911 wordsFound += 1;
699 } 912 }
700 913
701 // If there was more than one, see which one can take us forward the most words 914 // If there was more than one, see which one can take us forward the most words
702 else if (candidates > 1) { 915 else if (candidates > 1) {
703 // If we're already at the end of the range, we're done 916 // If we're already at the end of the range, we're done
704 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 917 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
705 goto foundBest; 918 goto foundBest;
706 } 919 }
707 do { 920 do {
708 int wordsMatched = 1; 921 int32_t wordsMatched = 1;
709 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, f Dictionary, rangeEnd) > 0) { 922 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, f Dictionary, rangeEnd) > 0) {
710 if (wordsMatched < 2) { 923 if (wordsMatched < 2) {
711 // Followed by another dictionary word; mark first word as a good candidate 924 // Followed by another dictionary word; mark first word as a good candidate
712 words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); 925 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
713 wordsMatched = 2; 926 wordsMatched = 2;
714 } 927 }
715 928
716 // If we're already at the end of the range, we're done 929 // If we're already at the end of the range, we're done
717 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 930 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
718 goto foundBest; 931 goto foundBest;
719 } 932 }
720 933
721 // See if any of the possible second words is followed by a third word 934 // See if any of the possible second words is followed by a third word
722 do { 935 do {
723 // If we find a third word, stop right away 936 // If we find a third word, stop right away
724 if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates (text, fDictionary, rangeEnd)) { 937 if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates (text, fDictionary, rangeEnd)) {
725 words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); 938 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
726 goto foundBest; 939 goto foundBest;
727 } 940 }
728 } 941 }
729 while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text )); 942 while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text ));
730 } 943 }
731 } 944 }
732 while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text)); 945 while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
733 foundBest: 946 foundBest:
734 wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); 947 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text );
948 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
735 wordsFound += 1; 949 wordsFound += 1;
736 } 950 }
737 951
738 // We come here after having either found a word or not. We look ahead to the 952 // We come here after having either found a word or not. We look ahead to the
739 // next word. If it's not a dictionary word, we will combine it with the word we 953 // next word. If it's not a dictionary word, we will combine it with the word we
740 // just found (if there is one), but only if the preceding word does not exceed 954 // just found (if there is one), but only if the preceding word does not exceed
741 // the threshold. 955 // the threshold.
742 // The text iterator should now be positioned at the end of the word we found. 956 // The text iterator should now be positioned at the end of the word we found.
743 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER _ROOT_COMBINE_THRESHOLD) { 957 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHM ER_ROOT_COMBINE_THRESHOLD) {
744 // if it is a dictionary word, do nothing. If it isn't, then if there is 958 // if it is a dictionary word, do nothing. If it isn't, then if there is
745 // no preceding word, or the non-word shares less than the minimum threshold 959 // no preceding word, or the non-word shares less than the minimum threshold
746 // of characters with a dictionary word, then scan to resynchronize 960 // of characters with a dictionary word, then scan to resynchronize
747 if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary , rangeEnd) <= 0 961 if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary , rangeEnd) <= 0
748 && (wordLength == 0 962 && (cuWordLength == 0
749 || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < K HMER_PREFIX_COMBINE_THRESHOLD)) { 963 || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < K HMER_PREFIX_COMBINE_THRESHOLD)) {
750 // Look for a plausible word boundary 964 // Look for a plausible word boundary
751 //TODO: This section will need a rework for UText. 965 int32_t remaining = rangeEnd - (current+cuWordLength);
752 int32_t remaining = rangeEnd - (current+wordLength); 966 UChar32 pc;
753 UChar32 pc = utext_current32(text); 967 UChar32 uc;
754 int32_t chars = 0; 968 int32_t chars = 0;
755 for (;;) { 969 for (;;) {
756 utext_next32(text); 970 int32_t pcIndex = utext_getNativeIndex(text);
757 uc = utext_current32(text); 971 pc = utext_next32(text);
758 // TODO: Here we're counting on the fact that the SA languag es are all 972 int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
759 // in the BMP. This should get fixed with the UText rework. 973 chars += pcSize;
760 chars += 1; 974 remaining -= pcSize;
761 if (--remaining <= 0) { 975 if (remaining <= 0) {
762 break; 976 break;
763 } 977 }
978 uc = utext_current32(text);
764 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 979 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
765 // Maybe. See if it's in the dictionary. 980 // Maybe. See if it's in the dictionary.
766 int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEA D].candidates(text, fDictionary, rangeEnd); 981 int32_t candidates = words[(wordsFound + 1) % KHMER_LOOK AHEAD].candidates(text, fDictionary, rangeEnd);
767 utext_setNativeIndex(text, current+wordLength+chars); 982 utext_setNativeIndex(text, current+cuWordLength+chars);
768 if (candidates > 0) { 983 if (candidates > 0) {
769 break; 984 break;
770 } 985 }
771 } 986 }
772 pc = uc;
773 } 987 }
774 988
775 // Bump the word count if there wasn't already one 989 // Bump the word count if there wasn't already one
776 if (wordLength <= 0) { 990 if (cuWordLength <= 0) {
777 wordsFound += 1; 991 wordsFound += 1;
778 } 992 }
779 993
780 // Update the length with the passed-over characters 994 // Update the length with the passed-over characters
781 wordLength += chars; 995 cuWordLength += chars;
782 } 996 }
783 else { 997 else {
784 // Back up to where we were for next iteration 998 // Back up to where we were for next iteration
785 utext_setNativeIndex(text, current+wordLength); 999 utext_setNativeIndex(text, current+cuWordLength);
786 } 1000 }
787 } 1001 }
788 1002
789 // Never stop before a combining mark. 1003 // Never stop before a combining mark.
790 int32_t currPos; 1004 int32_t currPos;
791 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) { 1005 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) {
792 utext_next32(text); 1006 utext_next32(text);
793 wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 1007 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
794 } 1008 }
795 1009
796 // Look ahead for possible suffixes if a dictionary word does not follow. 1010 // Look ahead for possible suffixes if a dictionary word does not follow.
797 // We do this in code rather than using a rule so that the heuristic 1011 // We do this in code rather than using a rule so that the heuristic
798 // resynch continues to function. For example, one of the suffix characters 1012 // resynch continues to function. For example, one of the suffix characters
799 // could be a typo in the middle of a word. 1013 // could be a typo in the middle of a word.
800 // if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 1014 // if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
801 // if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary , rangeEnd) <= 0 1015 // if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary , rangeEnd) <= 0
802 // && fSuffixSet.contains(uc = utext_current32(text))) { 1016 // && fSuffixSet.contains(uc = utext_current32(text))) {
803 // if (uc == KHMER_PAIYANNOI) { 1017 // if (uc == KHMER_PAIYANNOI) {
(...skipping 21 matching lines...) Expand all
825 // utext_next32(text); 1039 // utext_next32(text);
826 // } 1040 // }
827 // } 1041 // }
828 // } 1042 // }
829 // else { 1043 // else {
830 // utext_setNativeIndex(text, current+wordLength); 1044 // utext_setNativeIndex(text, current+wordLength);
831 // } 1045 // }
832 // } 1046 // }
833 1047
834 // Did we find a word on this iteration? If so, push it on the break stack 1048 // Did we find a word on this iteration? If so, push it on the break stack
835 if (wordLength > 0) { 1049 if (cuWordLength > 0) {
836 foundBreaks.push((current+wordLength), status); 1050 foundBreaks.push((current+cuWordLength), status);
837 } 1051 }
838 } 1052 }
839 1053
840 // Don't return a break for the end of the dictionary range if there is one there. 1054 // Don't return a break for the end of the dictionary range if there is one there.
841 if (foundBreaks.peeki() >= rangeEnd) { 1055 if (foundBreaks.peeki() >= rangeEnd) {
842 (void) foundBreaks.popi(); 1056 (void) foundBreaks.popi();
843 wordsFound -= 1; 1057 wordsFound -= 1;
844 } 1058 }
845 1059
846 return wordsFound; 1060 return wordsFound;
847 } 1061 }
848 1062
849 #if !UCONFIG_NO_NORMALIZATION 1063 #if !UCONFIG_NO_NORMALIZATION
850 /* 1064 /*
851 ****************************************************************** 1065 ******************************************************************
852 * CjkBreakEngine 1066 * CjkBreakEngine
853 */ 1067 */
854 static const uint32_t kuint32max = 0xFFFFFFFF; 1068 static const uint32_t kuint32max = 0xFFFFFFFF;
855 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status) 1069 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
856 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) { 1070 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
857 // Korean dictionary only includes Hangul syllables 1071 // Korean dictionary only includes Hangul syllables
858 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), stat us); 1072 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), stat us);
859 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); 1073 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
860 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\u ff9f]"), status); 1074 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\u ff9f]"), status);
861 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status) ; 1075 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status) ;
1076 nfkcNorm2 = Normalizer2::getNFKCInstance(status);
862 1077
863 if (U_SUCCESS(status)) { 1078 if (U_SUCCESS(status)) {
864 // handle Korean and Japanese/Chinese using different dictionaries 1079 // handle Korean and Japanese/Chinese using different dictionaries
865 if (type == kKorean) { 1080 if (type == kKorean) {
866 setCharacters(fHangulWordSet); 1081 setCharacters(fHangulWordSet);
867 } else { //Chinese and Japanese 1082 } else { //Chinese and Japanese
868 UnicodeSet cjSet; 1083 UnicodeSet cjSet;
869 cjSet.addAll(fHanWordSet); 1084 cjSet.addAll(fHanWordSet);
870 cjSet.addAll(fKatakanaWordSet); 1085 cjSet.addAll(fKatakanaWordSet);
871 cjSet.addAll(fHiraganaWordSet); 1086 cjSet.addAll(fHiraganaWordSet);
872 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MA RK 1087 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MA RK
873 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK 1088 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
874 setCharacters(cjSet); 1089 setCharacters(cjSet);
875 } 1090 }
876 } 1091 }
877 } 1092 }
878 1093
879 CjkBreakEngine::~CjkBreakEngine(){ 1094 CjkBreakEngine::~CjkBreakEngine(){
880 delete fDictionary; 1095 delete fDictionary;
881 } 1096 }
882 1097
883 // The katakanaCost values below are based on the length frequencies of all 1098 // The katakanaCost values below are based on the length frequencies of all
884 // katakana phrases in the dictionary 1099 // katakana phrases in the dictionary
885 static const int kMaxKatakanaLength = 8; 1100 static const int32_t kMaxKatakanaLength = 8;
886 static const int kMaxKatakanaGroupLength = 20; 1101 static const int32_t kMaxKatakanaGroupLength = 20;
887 static const uint32_t maxSnlp = 255; 1102 static const uint32_t maxSnlp = 255;
888 1103
889 static inline uint32_t getKatakanaCost(int wordLength){ 1104 static inline uint32_t getKatakanaCost(int32_t wordLength){
890 //TODO: fill array with actual values from dictionary! 1105 //TODO: fill array with actual values from dictionary!
891 static const uint32_t katakanaCost[kMaxKatakanaLength + 1] 1106 static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
892 = {8192, 984, 408, 240, 204, 252, 300, 372, 480}; 1107 = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
893 return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength]; 1108 return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
894 } 1109 }
895 1110
896 static inline bool isKatakana(uint16_t value) { 1111 static inline bool isKatakana(uint16_t value) {
897 return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) || 1112 return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
898 (value >= 0xFF66u && value <= 0xFF9fu); 1113 (value >= 0xFF66u && value <= 0xFF9fu);
899 } 1114 }
900 1115
901 // A very simple helper class to streamline the buffer handling in
902 // divideUpDictionaryRange.
903 template<class T, size_t N>
904 class AutoBuffer {
905 public:
906 AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
907 if (size > N) {
908 buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
909 capacity = size;
910 }
911 }
912 ~AutoBuffer() {
913 if (buffer != stackBuffer)
914 uprv_free(buffer);
915 }
916 1116
917 T* elems() { 1117 // Function for accessing internal utext flags.
918 return buffer; 1118 // Replicates an internal UText function.
919 }
920 1119
921 const T& operator[] (size_t i) const { 1120 static inline int32_t utext_i32_flag(int32_t bitIndex) {
922 return buffer[i]; 1121 return (int32_t)1 << bitIndex;
923 } 1122 }
924 1123
925 T& operator[] (size_t i) { 1124
926 return buffer[i];
927 }
928
929 // resize without copy
930 void resize(size_t size) {
931 if (size <= capacity)
932 return;
933 if (buffer != stackBuffer)
934 uprv_free(buffer);
935 buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
936 capacity = size;
937 }
938
939 private:
940 T stackBuffer[N];
941 T* buffer;
942 AutoBuffer();
943 size_t capacity;
944 };
945
946
947 /* 1125 /*
948 * @param text A UText representing the text 1126 * @param text A UText representing the text
949 * @param rangeStart The start of the range of dictionary characters 1127 * @param rangeStart The start of the range of dictionary characters
950 * @param rangeEnd The end of the range of dictionary characters 1128 * @param rangeEnd The end of the range of dictionary characters
951 * @param foundBreaks Output of C array of int32_t break positions, or 0 1129 * @param foundBreaks Output of C array of int32_t break positions, or 0
952 * @return The number of breaks found 1130 * @return The number of breaks found
953 */ 1131 */
954 int32_t 1132 int32_t
955 CjkBreakEngine::divideUpDictionaryRange( UText *text, 1133 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
956 int32_t rangeStart, 1134 int32_t rangeStart,
957 int32_t rangeEnd, 1135 int32_t rangeEnd,
958 UStack &foundBreaks ) const { 1136 UStack &foundBreaks ) const {
959 if (rangeStart >= rangeEnd) { 1137 if (rangeStart >= rangeEnd) {
960 return 0; 1138 return 0;
961 } 1139 }
962 1140
963 const size_t defaultInputLength = 80; 1141 // UnicodeString version of input UText, NFKC normalized in necessary.
964 size_t inputLength = rangeEnd - rangeStart; 1142 UnicodeString *inString;
965 // TODO: Replace by UnicodeString.
966 AutoBuffer<UChar, defaultInputLength> charString(inputLength);
967 1143
968 // Normalize the input string and put it in normalizedText. 1144 // inputMap[inStringIndex] = corresponding native index from UText inText.
969 // The map from the indices of the normalized input to the raw 1145 // If NULL then mapping is 1:1
970 // input is kept in charPositions. 1146 UVector32 *inputMap = NULL;
971 UErrorCode status = U_ZERO_ERROR; 1147
972 utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status); 1148 UErrorCode status = U_ZERO_ERROR;
973 if (U_FAILURE(status)) { 1149
974 return 0; 1150
1151 // if UText has the input string as one contiguous UTF-16 chunk
1152 if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1153 inText->chunkNativeStart <= rangeStart &&
1154 inText->chunkNativeLimit >= rangeEnd &&
1155 inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1156
1157 // Input UTtxt is in one contiguous UTF-16 chunk.
1158 // Use Read-only aliasing UnicodeString constructor on it.
1159 inString = new UnicodeString(FALSE,
1160 inText->chunkContents + rangeStart - inText->chunkNativeStart,
1161 rangeEnd - rangeStart);
1162 } else {
1163 // Copy the text from the original inText (UText) to inString (UnicodeString).
1164 // Create a map from UnicodeString indices -> UText offsets.
1165 utext_setNativeIndex(inText, rangeStart);
1166 int32_t limit = rangeEnd;
1167 U_ASSERT(limit <= utext_nativeLength(inText));
1168 if (limit > utext_nativeLength(inText)) {
1169 limit = utext_nativeLength(inText);
1170 }
1171 inString = new UnicodeString;
1172 inputMap = new UVector32(status);
1173 while (utext_getNativeIndex(inText) < limit) {
1174 int32_t nativePosition = utext_getNativeIndex(inText);
1175 UChar32 c = utext_next32(inText);
1176 U_ASSERT(c != U_SENTINEL);
1177 inString->append(c);
1178 while (inputMap->size() < inString->length()) {
1179 inputMap->addElement(nativePosition, status);
1180 }
1181 }
1182 inputMap->addElement(limit, status);
975 } 1183 }
976 1184
977 UnicodeString inputString(charString.elems(), inputLength);
978 // TODO: Use Normalizer2.
979 UNormalizationMode norm_mode = UNORM_NFKC;
980 UBool isNormalized =
981 Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
982 Normalizer::isNormalized(inputString, norm_mode, status);
983 1185
984 // TODO: Replace by UVector32. 1186 if (!nfkcNorm2->isNormalized(*inString, status)) {
985 AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1); 1187 UnicodeString *normalizedInput = new UnicodeString();
986 int numChars = 0; 1188 // normalizedMap[normalizedInput position] == original UText position.
987 UText normalizedText = UTEXT_INITIALIZER; 1189 UVector32 *normalizedMap = new UVector32(status);
988 // Needs to be declared here because normalizedText holds onto its buffer.
989 UnicodeString normalizedString;
990 if (isNormalized) {
991 int32_t index = 0;
992 charPositions[0] = 0;
993 while(index < inputString.length()) {
994 index = inputString.moveIndex32(index, 1);
995 charPositions[++numChars] = index;
996 }
997 utext_openUnicodeString(&normalizedText, &inputString, &status);
998 }
999 else {
1000 Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
1001 if (U_FAILURE(status)) { 1190 if (U_FAILURE(status)) {
1002 return 0; 1191 return 0;
1003 } 1192 }
1004 charPositions.resize(normalizedString.length() + 1); 1193
1005 Normalizer normalizer(charString.elems(), inputLength, norm_mode); 1194 UnicodeString fragment;
1006 int32_t index = 0; 1195 UnicodeString normalizedFragment;
1007 charPositions[0] = 0; 1196 for (int32_t srcI = 0; srcI < inString->length();) { // Once per normalization chunk
1008 while(index < normalizer.endIndex()){ 1197 fragment.remove();
1009 /* UChar32 uc = */ normalizer.next(); 1198 int32_t fragmentStartI = srcI;
1010 charPositions[++numChars] = index = normalizer.getIndex(); 1199 UChar32 c = inString->char32At(srcI);
1200 for (;;) {
1201 fragment.append(c);
1202 srcI = inString->moveIndex32(srcI, 1);
1203 if (srcI == inString->length()) {
1204 break;
1205 }
1206 c = inString->char32At(srcI);
1207 if (nfkcNorm2->hasBoundaryBefore(c)) {
1208 break;
1209 }
1210 }
1211 nfkcNorm2->normalize(fragment, normalizedFragment, status);
1212 normalizedInput->append(normalizedFragment);
1213
1214 // Map every position in the normalized chunk to the start of the chunk
1215 // in the original input.
1216 int32_t fragmentOriginalStart = inputMap? inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1217 while (normalizedMap->size() < normalizedInput->length()) {
1218 normalizedMap->addElement(fragmentOriginalStart, status);
1219 if (U_FAILURE(status)) {
1220 break;
1221 }
1222 }
1011 } 1223 }
1012 utext_openUnicodeString(&normalizedText, &normalizedString, &status); 1224 U_ASSERT(normalizedMap->size() == normalizedInput->length());
1225 int32_t nativeEnd = inputMap? inputMap->elementAti(inString->length()) : inString->length()+rangeStart;
1226 normalizedMap->addElement(nativeEnd, status);
1227
1228 delete inputMap;
1229 inputMap = normalizedMap;
1230 delete inString;
1231 inString = normalizedInput;
1013 } 1232 }
1014 1233
1015 if (U_FAILURE(status)) { 1234 int32_t numCodePts = inString->countChar32();
1016 return 0; 1235 if (numCodePts != inString->length()) {
1236 // There are supplementary characters in the input.
1237 // The dictionary will produce boundary positions in terms of code point indexes,
1238 // not in terms of code unit string indexes.
1239 // Use the inputMap mechanism to take care of this in addition to indexing differences
1240 // from normalization and/or UTF-8 input.
1241 UBool hadExistingMap = (inputMap != NULL);
1242 if (!hadExistingMap) {
1243 inputMap = new UVector32(status);
1244 }
1245 int32_t cpIdx = 0;
1246 for (int32_t cuIdx = 0; ; cuIdx = inString->moveIndex32(cuIdx, 1)) {
1247 U_ASSERT(cuIdx >= cpIdx);
1248 if (hadExistingMap) {
1249 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1250 } else {
1251 inputMap->addElement(cuIdx+rangeStart, status);
1252 }
1253 cpIdx++;
1254 if (cuIdx == inString->length()) {
1255 break;
1256 }
1257 }
1258 }
1259
1260 // bestSnlp[i] is the snlp of the best segmentation of the first i
1261 // code points in the range to be matched.
1262 UVector32 bestSnlp(numCodePts + 1, status);
1263 bestSnlp.addElement(0, status);
1264 for(int32_t i = 1; i <= numCodePts; i++) {
1265 bestSnlp.addElement(kuint32max, status);
1017 } 1266 }
1018 1267
1019 // From this point on, all the indices refer to the indices of
1020 // the normalized input string.
1021 1268
1022 // bestSnlp[i] is the snlp of the best segmentation of the first i 1269 // prev[i] is the index of the last CJK code point in the previous word in
1023 // characters in the range to be matched. 1270 // the best segmentation of the first i characters.
1024 // TODO: Replace by UVector32. 1271 UVector32 prev(numCodePts + 1, status);
1025 AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1); 1272 for(int32_t i = 0; i <= numCodePts; i++){
1026 bestSnlp[0] = 0; 1273 prev.addElement(-1, status);
1027 for(int i = 1; i <= numChars; i++) {
1028 bestSnlp[i] = kuint32max;
1029 } 1274 }
1030 1275
1031 // prev[i] is the index of the last CJK character in the previous word in 1276 const int32_t maxWordSize = 20;
1032 // the best segmentation of the first i characters. 1277 UVector32 values(numCodePts, status);
1033 // TODO: Replace by UVector32. 1278 values.setSize(numCodePts);
1034 AutoBuffer<int, defaultInputLength> prev(numChars + 1); 1279 UVector32 lengths(numCodePts, status);
1035 for(int i = 0; i <= numChars; i++){ 1280 lengths.setSize(numCodePts);
1036 prev[i] = -1;
1037 }
1038 1281
1039 const size_t maxWordSize = 20; 1282 UText fu = UTEXT_INITIALIZER;
1040 // TODO: Replace both with UVector32. 1283 utext_openUnicodeString(&fu, inString, &status);
1041 AutoBuffer<int32_t, maxWordSize> values(numChars);
1042 AutoBuffer<int32_t, maxWordSize> lengths(numChars);
1043 1284
1044 // Dynamic programming to find the best segmentation. 1285 // Dynamic programming to find the best segmentation.
1045 bool is_prev_katakana = false; 1286
1046 for (int32_t i = 0; i < numChars; ++i) { 1287 // In outer loop, i is the code point index,
1047 //utext_setNativeIndex(text, rangeStart + i); 1288 // ix is the corresponding string (code unit) index.
1048 utext_setNativeIndex(&normalizedText, i); 1289 // They differ when the string contains supplementary characters.
1049 if (bestSnlp[i] == kuint32max) 1290 int32_t ix = 0;
1291 for (int32_t i = 0; i < numCodePts; ++i, ix = inString->moveIndex32(ix, 1)) {
1292 if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1050 continue; 1293 continue;
1294 }
1051 1295
1052 int32_t count; 1296 int32_t count;
1053 // limit maximum word length matched to size of current substring 1297 utext_setNativeIndex(&fu, ix);
1054 int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i); 1298 count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1055 1299 NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1056 fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems()); 1300 // Note: lengths is filled with code point lengths
1301 // The NULL parameter is the ignored code unit lengths.
1057 1302
1058 // if there are no single character matches found in the dictionary 1303 // if there are no single character matches found in the dictionary
1059 // starting with this charcter, treat character as a 1-character word 1304 // starting with this charcter, treat character as a 1-character word
1060 // with the highest value possible, i.e. the least likely to occur. 1305 // with the highest value possible, i.e. the least likely to occur.
1061 // Exclude Korean characters from this treatment, as they should be left 1306 // Exclude Korean characters from this treatment, as they should be left
1062 // together by default. 1307 // together by default.
1063 if((count == 0 || lengths[0] != 1) && 1308 if ((count == 0 || lengths.elementAti(0) != 1) &&
1064 !fHangulWordSet.contains(utext_current32(&normalizedText))) { 1309 !fHangulWordSet.contains(inString->char32At(ix))) {
1065 values[count] = maxSnlp; 1310 values.setElementAt(maxSnlp, count); // 255
1066 lengths[count++] = 1; 1311 lengths.setElementAt(1, count++);
1067 } 1312 }
1068 1313
1069 for (int j = 0; j < count; j++) { 1314 for (int32_t j = 0; j < count; j++) {
1070 uint32_t newSnlp = bestSnlp[i] + values[j]; 1315 uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1071 if (newSnlp < bestSnlp[lengths[j] + i]) { 1316 int32_t ln_j_i = lengths.elementAti(j) + i;
1072 bestSnlp[lengths[j] + i] = newSnlp; 1317 if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1073 prev[lengths[j] + i] = i; 1318 bestSnlp.setElementAt(newSnlp, ln_j_i);
1319 prev.setElementAt(i, ln_j_i);
1074 } 1320 }
1075 } 1321 }
1076 1322
1077 // In Japanese, 1323 // In Japanese,
1078 // Katakana word in single character is pretty rare. So we apply 1324 // Katakana word in single character is pretty rare. So we apply
1079 // the following heuristic to Katakana: any continuous run of Katakana 1325 // the following heuristic to Katakana: any continuous run of Katakana
1080 // characters is considered a candidate word with a default cost 1326 // characters is considered a candidate word with a default cost
1081 // specified in the katakanaCost table according to its length. 1327 // specified in the katakanaCost table according to its length.
1082 //utext_setNativeIndex(text, rangeStart + i); 1328
1083 utext_setNativeIndex(&normalizedText, i); 1329 bool is_prev_katakana = false;
1084 bool is_katakana = isKatakana(utext_current32(&normalizedText)); 1330 bool is_katakana = isKatakana(inString->char32At(ix));
1331 int32_t katakanaRunLength = 1;
1085 if (!is_prev_katakana && is_katakana) { 1332 if (!is_prev_katakana && is_katakana) {
1086 int j = i + 1; 1333 int32_t j = inString->moveIndex32(ix, 1);
1087 utext_next32(&normalizedText);
1088 // Find the end of the continuous run of Katakana characters 1334 // Find the end of the continuous run of Katakana characters
1089 while (j < numChars && (j - i) < kMaxKatakanaGroupLength && 1335 while (j < inString->length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1090 isKatakana(utext_current32(&normalizedText))) { 1336 isKatakana(inString->char32At(j))) {
1091 utext_next32(&normalizedText); 1337 j = inString->moveIndex32(j, 1);
1092 ++j; 1338 katakanaRunLength++;
1093 } 1339 }
1094 if ((j - i) < kMaxKatakanaGroupLength) { 1340 if (katakanaRunLength < kMaxKatakanaGroupLength) {
1095 uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i); 1341 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1096 if (newSnlp < bestSnlp[j]) { 1342 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
1097 bestSnlp[j] = newSnlp; 1343 bestSnlp.setElementAt(newSnlp, j);
1098 prev[j] = i; 1344 prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
1099 } 1345 }
1100 } 1346 }
1101 } 1347 }
1102 is_prev_katakana = is_katakana; 1348 is_prev_katakana = is_katakana;
1103 } 1349 }
1350 utext_close(&fu);
1104 1351
1105 // Start pushing the optimal offset index into t_boundary (t for tentative). 1352 // Start pushing the optimal offset index into t_boundary (t for tentative).
1106 // prev[numChars] is guaranteed to be meaningful. 1353 // prev[numCodePts] is guaranteed to be meaningful.
1107 // We'll first push in the reverse order, i.e., 1354 // We'll first push in the reverse order, i.e.,
1108 // t_boundary[0] = numChars, and afterwards do a swap. 1355 // t_boundary[0] = numCodePts, and afterwards do a swap.
1109 // TODO: Replace by UVector32. 1356 UVector32 t_boundary(numCodePts+1, status);
1110 AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
1111 1357
1112 int numBreaks = 0; 1358 int32_t numBreaks = 0;
1113 // No segmentation found, set boundary to end of range 1359 // No segmentation found, set boundary to end of range
1114 if (bestSnlp[numChars] == kuint32max) { 1360 if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1115 t_boundary[numBreaks++] = numChars; 1361 t_boundary.addElement(numCodePts, status);
1362 numBreaks++;
1116 } else { 1363 } else {
1117 for (int i = numChars; i > 0; i = prev[i]) { 1364 for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1118 t_boundary[numBreaks++] = i; 1365 t_boundary.addElement(i, status);
1366 numBreaks++;
1119 } 1367 }
1120 U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0); 1368 U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1121 } 1369 }
1122 1370
1123 // Reverse offset index in t_boundary. 1371 // Add a break for the start of the dictionary range if there is not one
1124 // Don't add a break for the start of the dictionary range if there is one
1125 // there already. 1372 // there already.
1126 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { 1373 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1127 t_boundary[numBreaks++] = 0; 1374 t_boundary.addElement(0, status);
1375 numBreaks++;
1128 } 1376 }
1129 1377
1130 // Now that we're done, convert positions in t_bdry[] (indices in 1378 // Now that we're done, convert positions in t_boundary[] (indices in
1131 // the normalized input string) back to indices in the raw input string 1379 // the normalized input string) back to indices in the original input UText
1132 // while reversing t_bdry and pushing values to foundBreaks. 1380 // while reversing t_boundary and pushing values to foundBreaks.
1133 for (int i = numBreaks-1; i >= 0; i--) { 1381 for (int32_t i = numBreaks-1; i >= 0; i--) {
1134 foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status); 1382 int32_t cpPos = t_boundary.elementAti(i);
1383 int32_t utextPos = inputMap ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1384 // Boundaries are added to foundBreaks output in ascending order.
1385 U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
1386 foundBreaks.push(utextPos, status);
1135 } 1387 }
1136 1388
1137 utext_close(&normalizedText); 1389 delete inString;
1390 delete inputMap;
1138 return numBreaks; 1391 return numBreaks;
1139 } 1392 }
1140 #endif 1393 #endif
1141 1394
1142 U_NAMESPACE_END 1395 U_NAMESPACE_END
1143 1396
1144 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1397 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1145 1398
OLDNEW
« no previous file with comments | « source/common/dictbe.h ('k') | source/common/dictionarydata.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698