source/common/dictbe.cpp - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/common/dictbe.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unusued directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /**	1 /**

2 *******************************************************************************	2 *******************************************************************************

3 * Copyright (C) 2006-2013, International Business Machines Corporation	3 * Copyright (C) 2006-2014, International Business Machines Corporation

4 * and others. All Rights Reserved.	4 * and others. All Rights Reserved.

5 *******************************************************************************	5 *******************************************************************************

6 */	6 */

7	7

8 #include "unicode/utypes.h"	8 #include "unicode/utypes.h"

9	9

10 #if !UCONFIG_NO_BREAK_ITERATION	10 #if !UCONFIG_NO_BREAK_ITERATION

11	11

12 #include "brkeng.h"	12 #include "brkeng.h"

13 #include "dictbe.h"	13 #include "dictbe.h"

14 #include "unicode/uniset.h"	14 #include "unicode/uniset.h"

15 #include "unicode/chariter.h"	15 #include "unicode/chariter.h"

16 #include "unicode/ubrk.h"	16 #include "unicode/ubrk.h"

	17 #include "uvectr32.h"

17 #include "uvector.h"	18 #include "uvector.h"

18 #include "uassert.h"	19 #include "uassert.h"

19 #include "unicode/normlzr.h"	20 #include "unicode/normlzr.h"

20 #include "cmemory.h"	21 #include "cmemory.h"

21 #include "dictionarydata.h"	22 #include "dictionarydata.h"

22	23

23 U_NAMESPACE_BEGIN	24 U_NAMESPACE_BEGIN

24	25

25 /*	26 /*

26 ******************************************************************	27 ******************************************************************

(...skipping 15 matching lines...) Expand all Loading...
42 int32_t	43 int32_t

43 DictionaryBreakEngine::findBreaks( UText *text,	44 DictionaryBreakEngine::findBreaks( UText *text,

44 int32_t startPos,	45 int32_t startPos,

45 int32_t endPos,	46 int32_t endPos,

46 UBool reverse,	47 UBool reverse,

47 int32_t breakType,	48 int32_t breakType,

48 UStack &foundBreaks ) const {	49 UStack &foundBreaks ) const {

49 int32_t result = 0;	50 int32_t result = 0;

50	51

51 // Find the span of characters included in the set.	52 // Find the span of characters included in the set.

	53 // The span to break begins at the current position in the text, and

	54 // extends towards the start or end of the text, depending on 'reverse'.

	55

52 int32_t start = (int32_t)utext_getNativeIndex(text);	56 int32_t start = (int32_t)utext_getNativeIndex(text);

53 int32_t current;	57 int32_t current;

54 int32_t rangeStart;	58 int32_t rangeStart;

55 int32_t rangeEnd;	59 int32_t rangeEnd;

56 UChar32 c = utext_current32(text);	60 UChar32 c = utext_current32(text);

57 if (reverse) {	61 if (reverse) {

58 UBool isDict = fSet.contains(c);	62 UBool isDict = fSet.contains(c);

59 while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {	63 while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {

60 c = utext_previous32(text);	64 c = utext_previous32(text);

61 isDict = fSet.contains(c);	65 isDict = fSet.contains(c);

62 }	66 }

63 rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);	67 if (current < startPos) {

64 rangeEnd = start + 1;	68 rangeStart = startPos;

	69 } else {

	70 rangeStart = current;

	71 if (!isDict) {

	72 utext_next32(text);

	73 rangeStart = utext_getNativeIndex(text);

	74 }

	75 }

	76 // rangeEnd = start + 1;

	77 utext_setNativeIndex(text, start);

	78 utext_next32(text);

	79 rangeEnd = utext_getNativeIndex(text);

65 }	80 }

66 else {	81 else {

67 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {	82 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {

68 utext_next32(text); // TODO: recast loop for postincrement	83 utext_next32(text); // TODO: recast loop for postincrement

69 c = utext_current32(text);	84 c = utext_current32(text);

70 }	85 }

71 rangeStart = start;	86 rangeStart = start;

72 rangeEnd = current;	87 rangeEnd = current;

73 }	88 }

74 if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {	89 if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {

(...skipping 14 matching lines...) Expand all Loading...
89 /*	104 /*

90 ******************************************************************	105 ******************************************************************

91 * PossibleWord	106 * PossibleWord

92 */	107 */

93	108

94 // Helper class for improving readability of the Thai/Lao/Khmer word break	109 // Helper class for improving readability of the Thai/Lao/Khmer word break

95 // algorithm. The implementation is completely inline.	110 // algorithm. The implementation is completely inline.

96	111

97 // List size, limited by the maximum number of words in the dictionary	112 // List size, limited by the maximum number of words in the dictionary

98 // that form a nested sequence.	113 // that form a nested sequence.

99 #define POSSIBLE_WORD_LIST_MAX 20	114 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;

100	115

101 class PossibleWord {	116 class PossibleWord {

102 private:	117 private:

103 // list of word candidate lengths, in increasing length order	118 // list of word candidate lengths, in increasing length order

104 int32_t lengths[POSSIBLE_WORD_LIST_MAX];	119 // TODO: bytes would be sufficient for word lengths.

105 int32_t count; // Count of candidates	120 int32_t count; // Count of candidates

106 int32_t prefix; // The longest match with a dictionary word	121 int32_t prefix; // The longest match with a dictionary word

107 int32_t offset; // Offset in the text of these candidates	122 int32_t offset; // Offset in the text of these candidates

108 int mark; // The preferred candidate's offset	123 int32_t mark; // The preferred candidate's offset

109 int current; // The candidate we're currently looking at	124 int32_t current; // The candidate we're currently looking at

	125 int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.

	126 int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.

110	127

111 public:	128 public:

112 PossibleWord();	129 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};

113 ~PossibleWord();	130 ~PossibleWord() {};

114	131

115 // Fill the list of candidates if needed, select the longest, and return the number found	132 // Fill the list of candidates if needed, select the longest, and return the number found

116 int candidates( UText text, DictionaryMatcher dict, int32_t rangeEnd );	133 int32_t candidates( UText text, DictionaryMatcher dict, int32_t rangeEnd );

117	134

118 // Select the currently marked candidate, point after it in the text, and invalidate self	135 // Select the currently marked candidate, point after it in the text, and invalidate self

119 int32_t acceptMarked( UText *text );	136 int32_t acceptMarked( UText *text );

120	137

121 // Back up from the current candidate to the next shorter one; return TRUE if that exists	138 // Back up from the current candidate to the next shorter one; return TRUE if that exists

122 // and point the text after it	139 // and point the text after it

123 UBool backUp( UText *text );	140 UBool backUp( UText *text );

124	141

125 // Return the longest prefix this candidate location shares with a dictionary word	142 // Return the longest prefix this candidate location shares with a dictionary word

126 int32_t longestPrefix();	143 // Return value is in code points.

	144 int32_t longestPrefix() { return prefix; };

127	145

128 // Mark the current candidate as the one we like	146 // Mark the current candidate as the one we like

129 void markCurrent();	147 void markCurrent() { mark = current; };

	148

	149 // Get length in code points of the marked word.

	150 int32_t markedCPLength() { return cpLengths[mark]; };

130 };	151 };

131	152

132 inline

133 PossibleWord::PossibleWord() {

134 offset = -1;

135 }

136	153

137 inline	154 int32_t PossibleWord::candidates( UText text, DictionaryMatcher dict, int32_t rangeEnd ) {

138 PossibleWord::~PossibleWord() {

139 }

140

141 inline int

142 PossibleWord::candidates( UText text, DictionaryMatcher dict, int32_t rangeEnd ) {

143 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()	155 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()

144 int32_t start = (int32_t)utext_getNativeIndex(text);	156 int32_t start = (int32_t)utext_getNativeIndex(text);

145 if (start != offset) {	157 if (start != offset) {

146 offset = start;	158 offset = start;

147 prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));	159 count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);

148 // Dictionary leaves text after longest prefix, not longest word. Back up.	160 // Dictionary leaves text after longest prefix, not longest word. Back up.

149 if (count <= 0) {	161 if (count <= 0) {

150 utext_setNativeIndex(text, start);	162 utext_setNativeIndex(text, start);

151 }	163 }

152 }	164 }

153 if (count > 0) {	165 if (count > 0) {

154 utext_setNativeIndex(text, start+lengths[count-1]);	166 utext_setNativeIndex(text, start+cuLengths[count-1]);

155 }	167 }

156 current = count-1;	168 current = count-1;

157 mark = current;	169 mark = current;

158 return count;	170 return count;

159 }	171 }

160	172

161 inline int32_t	173 int32_t

162 PossibleWord::acceptMarked( UText *text ) {	174 PossibleWord::acceptMarked( UText *text ) {

163 utext_setNativeIndex(text, offset + lengths[mark]);	175 utext_setNativeIndex(text, offset + cuLengths[mark]);

164 return lengths[mark];	176 return cuLengths[mark];

165 }	177 }

166	178

167 inline UBool	179

	180 UBool

168 PossibleWord::backUp( UText *text ) {	181 PossibleWord::backUp( UText *text ) {

169 if (current > 0) {	182 if (current > 0) {

170 utext_setNativeIndex(text, offset + lengths[--current]);	183 utext_setNativeIndex(text, offset + cuLengths[--current]);

171 return TRUE;	184 return TRUE;

172 }	185 }

173 return FALSE;	186 return FALSE;

174 }	187 }

175	188

176 inline int32_t

177 PossibleWord::longestPrefix() {

178 return prefix;

179 }

180

181 inline void

182 PossibleWord::markCurrent() {

183 mark = current;

184 }

185

186 /*	189 /*

187 ******************************************************************	190 ******************************************************************

188 * ThaiBreakEngine	191 * ThaiBreakEngine

189 */	192 */

190	193

191 // How many words in a row are "good enough"?	194 // How many words in a row are "good enough"?

192 #define THAI_LOOKAHEAD 3	195 static const int32_t THAI_LOOKAHEAD = 3;

193	196

194 // Will not combine a non-word with a preceding dictionary word longer than this	197 // Will not combine a non-word with a preceding dictionary word longer than this

195 #define THAI_ROOT_COMBINE_THRESHOLD 3	198 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;

196	199

197 // Will not combine a non-word that shares at least this much prefix with a	200 // Will not combine a non-word that shares at least this much prefix with a

198 // dictionary word, with a preceding word	201 // dictionary word, with a preceding word

199 #define THAI_PREFIX_COMBINE_THRESHOLD 3	202 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;

200	203

201 // Ellision character	204 // Ellision character

202 #define THAI_PAIYANNOI 0x0E2F	205 static const int32_t THAI_PAIYANNOI = 0x0E2F;

203	206

204 // Repeat character	207 // Repeat character

205 #define THAI_MAIYAMOK 0x0E46	208 static const int32_t THAI_MAIYAMOK = 0x0E46;

206	209

207 // Minimum word size	210 // Minimum word size

208 #define THAI_MIN_WORD 2	211 static const int32_t THAI_MIN_WORD = 2;

209	212

210 // Minimum number of characters for two words	213 // Minimum number of characters for two words

211 #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)	214 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;

212	215

213 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)	216 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)

214 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),	217 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),

215 fDictionary(adoptDictionary)	218 fDictionary(adoptDictionary)

216 {	219 {

217 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);	220 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);

218 if (U_SUCCESS(status)) {	221 if (U_SUCCESS(status)) {

219 setCharacters(fThaiWordSet);	222 setCharacters(fThaiWordSet);

220 }	223 }

221 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);	224 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);

(...skipping 15 matching lines...) Expand all Loading...
237	240

238 ThaiBreakEngine::~ThaiBreakEngine() {	241 ThaiBreakEngine::~ThaiBreakEngine() {

239 delete fDictionary;	242 delete fDictionary;

240 }	243 }

241	244

242 int32_t	245 int32_t

243 ThaiBreakEngine::divideUpDictionaryRange( UText *text,	246 ThaiBreakEngine::divideUpDictionaryRange( UText *text,

244 int32_t rangeStart,	247 int32_t rangeStart,

245 int32_t rangeEnd,	248 int32_t rangeEnd,

246 UStack &foundBreaks ) const {	249 UStack &foundBreaks ) const {

247 if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {	250 utext_setNativeIndex(text, rangeStart);

	251 utext_moveIndex32(text, THAI_MIN_WORD_SPAN);

	252 if (utext_getNativeIndex(text) >= rangeEnd) {

248 return 0; // Not enough characters for two words	253 return 0; // Not enough characters for two words

249 }	254 }

	255 utext_setNativeIndex(text, rangeStart);

	256

250	257

251 uint32_t wordsFound = 0;	258 uint32_t wordsFound = 0;

252 int32_t wordLength;	259 int32_t cpWordLength = 0; // Word Length in Code Points.

	260 int32_t cuWordLength = 0; // Word length in code units (UText native inde xing)

253 int32_t current;	261 int32_t current;

254 UErrorCode status = U_ZERO_ERROR;	262 UErrorCode status = U_ZERO_ERROR;

255 PossibleWord words[THAI_LOOKAHEAD];	263 PossibleWord words[THAI_LOOKAHEAD];

256 UChar32 uc;

257	264

258 utext_setNativeIndex(text, rangeStart);	265 utext_setNativeIndex(text, rangeStart);

259	266

260 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {	267 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {

261 wordLength = 0;	268 cpWordLength = 0;

	269 cuWordLength = 0;

262	270

263 // Look for candidate words at the current position	271 // Look for candidate words at the current position

264 int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);	272 int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

265	273

266 // If we found exactly one, use that	274 // If we found exactly one, use that

267 if (candidates == 1) {	275 if (candidates == 1) {

268 wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);	276 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text) ;

	277 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();

269 wordsFound += 1;	278 wordsFound += 1;

270 }	279 }

271 // If there was more than one, see which one can take us forward the mos t words	280 // If there was more than one, see which one can take us forward the mos t words

272 else if (candidates > 1) {	281 else if (candidates > 1) {

273 // If we're already at the end of the range, we're done	282 // If we're already at the end of the range, we're done

274 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {	283 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

275 goto foundBest;	284 goto foundBest;

276 }	285 }

277 do {	286 do {

278 int wordsMatched = 1;	287 int32_t wordsMatched = 1;

279 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fD ictionary, rangeEnd) > 0) {	288 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fD ictionary, rangeEnd) > 0) {

280 if (wordsMatched < 2) {	289 if (wordsMatched < 2) {

281 // Followed by another dictionary word; mark first word as a good candidate	290 // Followed by another dictionary word; mark first word as a good candidate

282 words[wordsFound%THAI_LOOKAHEAD].markCurrent();	291 words[wordsFound%THAI_LOOKAHEAD].markCurrent();

283 wordsMatched = 2;	292 wordsMatched = 2;

284 }	293 }

285	294

286 // If we're already at the end of the range, we're done	295 // If we're already at the end of the range, we're done

287 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {	296 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

288 goto foundBest;	297 goto foundBest;

289 }	298 }

290	299

291 // See if any of the possible second words is followed by a third word	300 // See if any of the possible second words is followed by a third word

292 do {	301 do {

293 // If we find a third word, stop right away	302 // If we find a third word, stop right away

294 if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates( text, fDictionary, rangeEnd)) {	303 if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates( text, fDictionary, rangeEnd)) {

295 words[wordsFound % THAI_LOOKAHEAD].markCurrent();	304 words[wordsFound % THAI_LOOKAHEAD].markCurrent();

296 goto foundBest;	305 goto foundBest;

297 }	306 }

298 }	307 }

299 while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text) );	308 while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text) );

300 }	309 }

301 }	310 }

302 while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));	311 while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));

303 foundBest:	312 foundBest:

304 wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);	313 // Set UText position to after the accepted word.

	314 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text) ;

	315 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();

305 wordsFound += 1;	316 wordsFound += 1;

306 }	317 }

307	318

308 // We come here after having either found a word or not. We look ahead to the	319 // We come here after having either found a word or not. We look ahead to the

309 // next word. If it's not a dictionary word, we will combine it withe the word we	320 // next word. If it's not a dictionary word, we will combine it with the word we

310 // just found (if there is one), but only if the preceding word does not exceed	321 // just found (if there is one), but only if the preceding word does not exceed

311 // the threshold.	322 // the threshold.

312 // The text iterator should now be positioned at the end of the word we found.	323 // The text iterator should now be positioned at the end of the word we found.

313 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {	324

	325 UChar32 uc = 0;

	326 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < TH AI_ROOT_COMBINE_THRESHOLD) {

314 // if it is a dictionary word, do nothing. If it isn't, then if ther e is	327 // if it is a dictionary word, do nothing. If it isn't, then if ther e is

315 // no preceding word, or the non-word shares less than the minimum t hreshold	328 // no preceding word, or the non-word shares less than the minimum t hreshold

316 // of characters with a dictionary word, then scan to resynchronize	329 // of characters with a dictionary word, then scan to resynchronize

317 if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0	330 if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0

318 && (wordLength == 0	331 && (cuWordLength == 0

319 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {	332 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {

320 // Look for a plausible word boundary	333 // Look for a plausible word boundary

321 //TODO: This section will need a rework for UText.	334 int32_t remaining = rangeEnd - (current+cuWordLength);

322 int32_t remaining = rangeEnd - (current+wordLength);	335 UChar32 pc;

323 UChar32 pc = utext_current32(text);

324 int32_t chars = 0;	336 int32_t chars = 0;

325 for (;;) {	337 for (;;) {

326 utext_next32(text);	338 int32_t pcIndex = utext_getNativeIndex(text);

327 uc = utext_current32(text);	339 pc = utext_next32(text);

328 // TODO: Here we're counting on the fact that the SA languag es are all	340 int32_t pcSize = utext_getNativeIndex(text) - pcIndex;

329 // in the BMP. This should get fixed with the UText rework.	341 chars += pcSize;

330 chars += 1;	342 remaining -= pcSize;

331 if (--remaining <= 0) {	343 if (remaining <= 0) {

332 break;	344 break;

333 }	345 }

	346 uc = utext_current32(text);

334 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {	347 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {

335 // Maybe. See if it's in the dictionary.	348 // Maybe. See if it's in the dictionary.

336 // NOTE: In the original Apple code, checked that the ne xt	349 // NOTE: In the original Apple code, checked that the ne xt

337 // two characters after uc were not 0x0E4C THANTHAKHAT b efore	350 // two characters after uc were not 0x0E4C THANTHAKHAT b efore

338 // checking the dictionary. That is just a performance f ilter,	351 // checking the dictionary. That is just a performance f ilter,

339 // but it's not clear it's faster than checking the trie .	352 // but it's not clear it's faster than checking the trie .

340 int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD ].candidates(text, fDictionary, rangeEnd);	353 int32_t candidates = words[(wordsFound + 1) % THAI_LOOKA HEAD].candidates(text, fDictionary, rangeEnd);

341 utext_setNativeIndex(text, current + wordLength + chars) ;	354 utext_setNativeIndex(text, current + cuWordLength + char s);

342 if (candidates > 0) {	355 if (candidates > 0) {

343 break;	356 break;

344 }	357 }

345 }	358 }

346 pc = uc;

347 }	359 }

348	360

349 // Bump the word count if there wasn't already one	361 // Bump the word count if there wasn't already one

350 if (wordLength <= 0) {	362 if (cuWordLength <= 0) {

351 wordsFound += 1;	363 wordsFound += 1;

352 }	364 }

353	365

354 // Update the length with the passed-over characters	366 // Update the length with the passed-over characters

355 wordLength += chars;	367 cuWordLength += chars;

356 }	368 }

357 else {	369 else {

358 // Back up to where we were for next iteration	370 // Back up to where we were for next iteration

359 utext_setNativeIndex(text, current+wordLength);	371 utext_setNativeIndex(text, current+cuWordLength);

360 }	372 }

361 }	373 }

362	374

363 // Never stop before a combining mark.	375 // Never stop before a combining mark.

364 int32_t currPos;	376 int32_t currPos;

365 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) {	377 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) {

366 utext_next32(text);	378 utext_next32(text);

367 wordLength += (int32_t)utext_getNativeIndex(text) - currPos;	379 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;

368 }	380 }

369	381

370 // Look ahead for possible suffixes if a dictionary word does not follow .	382 // Look ahead for possible suffixes if a dictionary word does not follow .

371 // We do this in code rather than using a rule so that the heuristic	383 // We do this in code rather than using a rule so that the heuristic

372 // resynch continues to function. For example, one of the suffix charact ers	384 // resynch continues to function. For example, one of the suffix charact ers

373 // could be a typo in the middle of a word.	385 // could be a typo in the middle of a word.

374 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {	386 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {

375 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, r angeEnd) <= 0	387 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, r angeEnd) <= 0

376 && fSuffixSet.contains(uc = utext_current32(text))) {	388 && fSuffixSet.contains(uc = utext_current32(text))) {

377 if (uc == THAI_PAIYANNOI) {	389 if (uc == THAI_PAIYANNOI) {

378 if (!fSuffixSet.contains(utext_previous32(text))) {	390 if (!fSuffixSet.contains(utext_previous32(text))) {

379 // Skip over previous end and PAIYANNOI	391 // Skip over previous end and PAIYANNOI

380 utext_next32(text);	392 utext_next32(text);

	393 int32_t paiyannoiIndex = utext_getNativeIndex(text);

381 utext_next32(text);	394 utext_next32(text);

382 wordLength += 1; // Add PAIYANNOI to word	395 cuWordLength += utext_getNativeIndex(text) - paiyannoiIn dex; // Add PAIYANNOI to word

383 uc = utext_current32(text); // Fetch next character	396 uc = utext_current32(text); // Fetch next character

384 }	397 }

385 else {	398 else {

386 // Restore prior position	399 // Restore prior position

387 utext_next32(text);	400 utext_next32(text);

388 }	401 }

389 }	402 }

390 if (uc == THAI_MAIYAMOK) {	403 if (uc == THAI_MAIYAMOK) {

391 if (utext_previous32(text) != THAI_MAIYAMOK) {	404 if (utext_previous32(text) != THAI_MAIYAMOK) {

392 // Skip over previous end and MAIYAMOK	405 // Skip over previous end and MAIYAMOK

393 utext_next32(text);	406 utext_next32(text);

	407 int32_t maiyamokIndex = utext_getNativeIndex(text);

394 utext_next32(text);	408 utext_next32(text);

395 wordLength += 1; // Add MAIYAMOK to word	409 cuWordLength += utext_getNativeIndex(text) - maiyamokInd ex; // Add MAIYAMOK to word

396 }	410 }

397 else {	411 else {

398 // Restore prior position	412 // Restore prior position

399 utext_next32(text);	413 utext_next32(text);

400 }	414 }

401 }	415 }

402 }	416 }

403 else {	417 else {

404 utext_setNativeIndex(text, current+wordLength);	418 utext_setNativeIndex(text, current+cuWordLength);

405 }	419 }

406 }	420 }

407	421

408 // Did we find a word on this iteration? If so, push it on the break stack	422 // Did we find a word on this iteration? If so, push it on the break stack

409 if (wordLength > 0) {	423 if (cuWordLength > 0) {

410 foundBreaks.push((current+wordLength), status);	424 foundBreaks.push((current+cuWordLength), status);

411 }	425 }

412 }	426 }

413	427

414 // Don't return a break for the end of the dictionary range if there is one there.	428 // Don't return a break for the end of the dictionary range if there is one there.

415 if (foundBreaks.peeki() >= rangeEnd) {	429 if (foundBreaks.peeki() >= rangeEnd) {

416 (void) foundBreaks.popi();	430 (void) foundBreaks.popi();

417 wordsFound -= 1;	431 wordsFound -= 1;

418 }	432 }

419	433

420 return wordsFound;	434 return wordsFound;

421 }	435 }

422	436

423 /*	437 /*

424 ******************************************************************	438 ******************************************************************

425 * LaoBreakEngine	439 * LaoBreakEngine

426 */	440 */

427	441

428 // How many words in a row are "good enough"?	442 // How many words in a row are "good enough"?

429 #define LAO_LOOKAHEAD 3	443 static const int32_t LAO_LOOKAHEAD = 3;

430	444

431 // Will not combine a non-word with a preceding dictionary word longer than this	445 // Will not combine a non-word with a preceding dictionary word longer than this

432 #define LAO_ROOT_COMBINE_THRESHOLD 3	446 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;

433	447

434 // Will not combine a non-word that shares at least this much prefix with a	448 // Will not combine a non-word that shares at least this much prefix with a

435 // dictionary word, with a preceding word	449 // dictionary word, with a preceding word

436 #define LAO_PREFIX_COMBINE_THRESHOLD 3	450 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;

437	451

438 // Minimum word size	452 // Minimum word size

439 #define LAO_MIN_WORD 2	453 static const int32_t LAO_MIN_WORD = 2;

440	454

441 // Minimum number of characters for two words	455 // Minimum number of characters for two words

442 #define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2)	456 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;

443	457

444 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s tatus)	458 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s tatus)

445 : DictionaryBreakEngine((1<<UBRK_WORD) \| (1<<UBRK_LINE)),	459 : DictionaryBreakEngine((1<<UBRK_WORD) \| (1<<UBRK_LINE)),

446 fDictionary(adoptDictionary)	460 fDictionary(adoptDictionary)

447 {	461 {

448 fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]" ), status);	462 fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]" ), status);

449 if (U_SUCCESS(status)) {	463 if (U_SUCCESS(status)) {

450 setCharacters(fLaoWordSet);	464 setCharacters(fLaoWordSet);

451 }	465 }

452 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M: ]]"), status);	466 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M: ]]"), status);

(...skipping 17 matching lines...) Expand all Loading...
470 int32_t	484 int32_t

471 LaoBreakEngine::divideUpDictionaryRange( UText *text,	485 LaoBreakEngine::divideUpDictionaryRange( UText *text,

472 int32_t rangeStart,	486 int32_t rangeStart,

473 int32_t rangeEnd,	487 int32_t rangeEnd,

474 UStack &foundBreaks ) const {	488 UStack &foundBreaks ) const {

475 if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {	489 if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {

476 return 0; // Not enough characters for two words	490 return 0; // Not enough characters for two words

477 }	491 }

478	492

479 uint32_t wordsFound = 0;	493 uint32_t wordsFound = 0;

480 int32_t wordLength;	494 int32_t cpWordLength = 0;

	495 int32_t cuWordLength = 0;

481 int32_t current;	496 int32_t current;

482 UErrorCode status = U_ZERO_ERROR;	497 UErrorCode status = U_ZERO_ERROR;

483 PossibleWord words[LAO_LOOKAHEAD];	498 PossibleWord words[LAO_LOOKAHEAD];

484 UChar32 uc;

485	499

486 utext_setNativeIndex(text, rangeStart);	500 utext_setNativeIndex(text, rangeStart);

487	501

488 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {	502 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {

489 wordLength = 0;	503 cuWordLength = 0;

	504 cpWordLength = 0;

490	505

491 // Look for candidate words at the current position	506 // Look for candidate words at the current position

492 int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDicti onary, rangeEnd);	507 int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fD ictionary, rangeEnd);

493	508

494 // If we found exactly one, use that	509 // If we found exactly one, use that

495 if (candidates == 1) {	510 if (candidates == 1) {

496 wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);	511 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);

	512 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();

497 wordsFound += 1;	513 wordsFound += 1;

498 }	514 }

499 // If there was more than one, see which one can take us forward the mos t words	515 // If there was more than one, see which one can take us forward the mos t words

500 else if (candidates > 1) {	516 else if (candidates > 1) {

501 // If we're already at the end of the range, we're done	517 // If we're already at the end of the range, we're done

502 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {	518 if (utext_getNativeIndex(text) >= rangeEnd) {

503 goto foundBest;	519 goto foundBest;

504 }	520 }

505 do {	521 do {

506 int wordsMatched = 1;	522 int32_t wordsMatched = 1;

507 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDi ctionary, rangeEnd) > 0) {	523 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDi ctionary, rangeEnd) > 0) {

508 if (wordsMatched < 2) {	524 if (wordsMatched < 2) {

509 // Followed by another dictionary word; mark first word as a good candidate	525 // Followed by another dictionary word; mark first word as a good candidate

510 words[wordsFound%LAO_LOOKAHEAD].markCurrent();	526 words[wordsFound%LAO_LOOKAHEAD].markCurrent();

511 wordsMatched = 2;	527 wordsMatched = 2;

512 }	528 }

513	529

514 // If we're already at the end of the range, we're done	530 // If we're already at the end of the range, we're done

515 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {	531 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

516 goto foundBest;	532 goto foundBest;

517 }	533 }

518	534

519 // See if any of the possible second words is followed by a third word	535 // See if any of the possible second words is followed by a third word

520 do {	536 do {

521 // If we find a third word, stop right away	537 // If we find a third word, stop right away

522 if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(t ext, fDictionary, rangeEnd)) {	538 if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(t ext, fDictionary, rangeEnd)) {

523 words[wordsFound % LAO_LOOKAHEAD].markCurrent();	539 words[wordsFound % LAO_LOOKAHEAD].markCurrent();

524 goto foundBest;	540 goto foundBest;

525 }	541 }

526 }	542 }

527 while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text)) ;	543 while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text)) ;

528 }	544 }

529 }	545 }

530 while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));	546 while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));

531 foundBest:	547 foundBest:

532 wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);	548 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);

	549 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();

533 wordsFound += 1;	550 wordsFound += 1;

534 }	551 }

535	552

536 // We come here after having either found a word or not. We look ahead t o the	553 // We come here after having either found a word or not. We look ahead t o the

537 // next word. If it's not a dictionary word, we will combine it with the word we	554 // next word. If it's not a dictionary word, we will combine it with the word we

538 // just found (if there is one), but only if the preceding word does not exceed	555 // just found (if there is one), but only if the preceding word does not exceed

539 // the threshold.	556 // the threshold.

540 // The text iterator should now be positioned at the end of the word we found.	557 // The text iterator should now be positioned at the end of the word we found.

541 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_R OOT_COMBINE_THRESHOLD) {	558 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO _ROOT_COMBINE_THRESHOLD) {

542 // if it is a dictionary word, do nothing. If it isn't, then if ther e is	559 // if it is a dictionary word, do nothing. If it isn't, then if ther e is

543 // no preceding word, or the non-word shares less than the minimum t hreshold	560 // no preceding word, or the non-word shares less than the minimum t hreshold

544 // of characters with a dictionary word, then scan to resynchronize	561 // of characters with a dictionary word, then scan to resynchronize

545 if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0	562 if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0

546 && (wordLength == 0	563 && (cuWordLength == 0

547 \|\| words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_P REFIX_COMBINE_THRESHOLD)) {	564 \|\| words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_P REFIX_COMBINE_THRESHOLD)) {

548 // Look for a plausible word boundary	565 // Look for a plausible word boundary

549 //TODO: This section will need a rework for UText.	566 int32_t remaining = rangeEnd - (current + cuWordLength);

550 int32_t remaining = rangeEnd - (current+wordLength);	567 UChar32 pc;

551 UChar32 pc = utext_current32(text);	568 UChar32 uc;

552 int32_t chars = 0;	569 int32_t chars = 0;

553 for (;;) {	570 for (;;) {

554 utext_next32(text);	571 int32_t pcIndex = utext_getNativeIndex(text);

555 uc = utext_current32(text);	572 pc = utext_next32(text);

556 // TODO: Here we're counting on the fact that the SA languag es are all	573 int32_t pcSize = utext_getNativeIndex(text) - pcIndex;

557 // in the BMP. This should get fixed with the UText rework.	574 chars += pcSize;

558 chars += 1;	575 remaining -= pcSize;

559 if (--remaining <= 0) {	576 if (remaining <= 0) {

560 break;	577 break;

561 }	578 }

	579 uc = utext_current32(text);

562 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {	580 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {

563 // Maybe. See if it's in the dictionary.	581 // Maybe. See if it's in the dictionary.

564 int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD] .candidates(text, fDictionary, rangeEnd);	582 // TODO: this looks iffy; compare with old code.

565 utext_setNativeIndex(text, current + wordLength + chars) ;	583 int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAH EAD].candidates(text, fDictionary, rangeEnd);

	584 utext_setNativeIndex(text, current + cuWordLength + char s);

566 if (candidates > 0) {	585 if (candidates > 0) {

567 break;	586 break;

568 }	587 }

569 }	588 }

570 pc = uc;

571 }	589 }

572	590

573 // Bump the word count if there wasn't already one	591 // Bump the word count if there wasn't already one

574 if (wordLength <= 0) {	592 if (cuWordLength <= 0) {

575 wordsFound += 1;	593 wordsFound += 1;

576 }	594 }

577	595

578 // Update the length with the passed-over characters	596 // Update the length with the passed-over characters

579 wordLength += chars;	597 cuWordLength += chars;

580 }	598 }

581 else {	599 else {

582 // Back up to where we were for next iteration	600 // Back up to where we were for next iteration

583 utext_setNativeIndex(text, current+wordLength);	601 utext_setNativeIndex(text, current + cuWordLength);

584 }	602 }

585 }	603 }

586	604

587 // Never stop before a combining mark.	605 // Never stop before a combining mark.

588 int32_t currPos;	606 int32_t currPos;

589 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) {	607 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) {

590 utext_next32(text);	608 utext_next32(text);

591 wordLength += (int32_t)utext_getNativeIndex(text) - currPos;	609 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;

592 }	610 }

593	611

594 // Look ahead for possible suffixes if a dictionary word does not follow .	612 // Look ahead for possible suffixes if a dictionary word does not follow .

595 // We do this in code rather than using a rule so that the heuristic	613 // We do this in code rather than using a rule so that the heuristic

596 // resynch continues to function. For example, one of the suffix charact ers	614 // resynch continues to function. For example, one of the suffix charact ers

597 // could be a typo in the middle of a word.	615 // could be a typo in the middle of a word.

598 // NOT CURRENTLY APPLICABLE TO LAO	616 // NOT CURRENTLY APPLICABLE TO LAO

599	617

600 // Did we find a word on this iteration? If so, push it on the break sta ck	618 // Did we find a word on this iteration? If so, push it on the break sta ck

601 if (wordLength > 0) {	619 if (cuWordLength > 0) {

602 foundBreaks.push((current+wordLength), status);	620 foundBreaks.push((current+cuWordLength), status);

603 }	621 }

604 }	622 }

605	623

	624 // Don't return a break for the end of the dictionary range if there is one there.

	625 if (foundBreaks.peeki() >= rangeEnd) {

	626 (void) foundBreaks.popi();

	627 wordsFound -= 1;

	628 }

	629

	630 return wordsFound;

	631 }

	632

	633 /*

	634 ******************************************************************

	635 * BurmeseBreakEngine

	636 */

	637

	// Tuning constants for BurmeseBreakEngine::divideUpDictionaryRange.
	638 // How many words in a row are "good enough"?

	// Also the size of the PossibleWord array `words[]`, which is indexed modulo this value.
	639 static const int32_t BURMESE_LOOKAHEAD = 3;

	640

	641 // Will not combine a non-word with a preceding dictionary word longer than this

	// Compared against cpWordLength (the value returned by markedCPLength()).
	642 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;

	643

	644 // Will not combine a non-word that shares at least this much prefix with a

	645 // dictionary word, with a preceding word

	// Compared against PossibleWord::longestPrefix().
	646 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;

	647

	648 // Minimum word size

	649 static const int32_t BURMESE_MIN_WORD = 2;

	650

	651 // Minimum number of characters for two words

	// Ranges with (rangeEnd - rangeStart) below this value are not divided at all (0 is returned).
	652 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;

	653

	// Constructs the Burmese dictionary break engine.
	//   adoptDictionary - stored in fDictionary; the destructor deletes it.
	//   status          - passed to every applyPattern() call; setCharacters() is only
	//                     invoked if it still indicates success after the first pattern.
	// The base class is initialised with the bit mask for UBRK_WORD and UBRK_LINE.
	654 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro rCode &status)

	655 : DictionaryBreakEngine((1<<UBRK_WORD) \| (1<<UBRK_LINE)),

	656 fDictionary(adoptDictionary)

	657 {

	// Characters handled by this engine: Myanmar script with LineBreak=SA.
	658 fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA :]]"), status);

	659 if (U_SUCCESS(status)) {

	660 setCharacters(fBurmeseWordSet);

	661 }

	// Mark set: the subset of the above that is also in general category M ...
	662 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M: ]]"), status);

	// ... plus U+0020. NOTE(review): the reason for including U+0020 is not visible here.
	663 fMarkSet.add(0x0020);

	// Every character of the word set is accepted as a possible word end.
	664 fEndWordSet = fBurmeseWordSet;

	665 fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels

	666

	667 // Compact for caching.

	668 fMarkSet.compact();

	669 fEndWordSet.compact();

	670 fBeginWordSet.compact();

	671 }

	672

	// Destroys the engine and deletes the dictionary matcher that the constructor stored in fDictionary.
	673 BurmeseBreakEngine::~BurmeseBreakEngine() {

	674 delete fDictionary;

	675 }

	676

	// Divides the text in [rangeStart, rangeEnd) into words using fDictionary and pushes
	// the native index of the end of each word onto foundBreaks.
	//   text        - UText to scan; its iteration position is moved by this function.
	//   rangeStart  - native index at which scanning starts.
	//   rangeEnd    - native index at which scanning stops.
	//   foundBreaks - receives (current + cuWordLength) for every word found.
	// Returns the number of words found. Returns 0 immediately when the range is shorter
	// than BURMESE_MIN_WORD_SPAN. A trailing break at or beyond rangeEnd is popped again
	// and not counted.
	// cuWordLength is accumulated from differences of utext_getNativeIndex() and is the
	// value used for all positioning; cpWordLength comes from markedCPLength() and is only
	// compared against BURMESE_ROOT_COMBINE_THRESHOLD.
	// NOTE(review): foundBreaks.peeki() is consulted after the loop even if this call
	// pushed nothing, and wordsFound is unsigned, so "wordsFound -= 1" would wrap if it
	// were 0 at that point - confirm that UStack/callers make this impossible.
	677 int32_t

	678 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,

	679 int32_t rangeStart,

	680 int32_t rangeEnd,

	681 UStack &foundBreaks ) const {

	682 if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {

	683 return 0; // Not enough characters for two words

	684 }

	685

	686 uint32_t wordsFound = 0;

	687 int32_t cpWordLength = 0;

	688 int32_t cuWordLength = 0;

	689 int32_t current;

	690 UErrorCode status = U_ZERO_ERROR;

	// Candidate slots, always indexed modulo BURMESE_LOOKAHEAD.
	691 PossibleWord words[BURMESE_LOOKAHEAD];

	692

	693 utext_setNativeIndex(text, rangeStart);

	694

	// Each iteration handles one word starting at `current`; stops on error or at rangeEnd.
	695 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {

	696 cuWordLength = 0;

	697 cpWordLength = 0;

	698

	699 // Look for candidate words at the current position

	700 int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text , fDictionary, rangeEnd);

	701

	702 // If we found exactly one, use that

	703 if (candidates == 1) {

	704 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(te xt);

	705 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength( );

	706 wordsFound += 1;

	707 }

	708 // If there was more than one, see which one can take us forward the most words

	709 else if (candidates > 1) {

	710 // If we're already at the end of the range, we're done

	711 if (utext_getNativeIndex(text) >= rangeEnd) {

	712 goto foundBest;

	713 }

	// Outer loop: try each candidate for the first word (backUp() moves to the next one).
	714 do {

	715 int32_t wordsMatched = 1;

	716 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {

	717 if (wordsMatched < 2) {

	718 // Followed by another dictionary word; mark first word as a good candidate

	719 words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();

	720 wordsMatched = 2;

	721 }

	722

	723 // If we're already at the end of the range, we're done

	724 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

	725 goto foundBest;

	726 }

	727

	728 // See if any of the possible second words is followed by a third word

	729 do {

	730 // If we find a third word, stop right away

	731 if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidat es(text, fDictionary, rangeEnd)) {

	732 words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();

	733 goto foundBest;

	734 }

	735 }

	736 while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(te xt));

	737 }

	738 }

	739 while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));

	// Reached by goto or by exhausting the candidates: accept whichever candidate is marked.
	740 foundBest:

	741 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(te xt);

	742 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength( );

	743 wordsFound += 1;

	744 }

	745

	746 // We come here after having either found a word or not. We look ahead to the

	747 // next word. If it's not a dictionary word, we will combine it with the word we

	748 // just found (if there is one), but only if the preceding word does not exceed

	749 // the threshold.

	750 // The text iterator should now be positioned at the end of the word we found.

	751 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BUR MESE_ROOT_COMBINE_THRESHOLD) {

	752 // if it is a dictionary word, do nothing. If it isn't, then if there is

	753 // no preceding word, or the non-word shares less than the minimum threshold

	754 // of characters with a dictionary word, then scan to resynchronize

	755 if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictiona ry, rangeEnd) <= 0

	756 && (cuWordLength == 0

	757 \|\| words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < B URMESE_PREFIX_COMBINE_THRESHOLD)) {

	758 // Look for a plausible word boundary

	// remaining: native-index units left in the range after the word found so far.
	759 int32_t remaining = rangeEnd - (current + cuWordLength);

	// pc: the character just passed over; uc: the character now under the iterator.
	760 UChar32 pc;

	761 UChar32 uc;

	// chars: native-index units passed over while resynchronizing.
	762 int32_t chars = 0;

	763 for (;;) {

	764 int32_t pcIndex = utext_getNativeIndex(text);

	765 pc = utext_next32(text);

	// Measure the step in native-index units rather than assuming one unit per character.
	766 int32_t pcSize = utext_getNativeIndex(text) - pcIndex;

	767 chars += pcSize;

	768 remaining -= pcSize;

	769 if (remaining <= 0) {

	770 break;

	771 }

	772 uc = utext_current32(text);

	773 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {

	774 // Maybe. See if it's in the dictionary.

	775 // TODO: this looks iffy; compare with old code.

	776 int32_t candidates = words[(wordsFound + 1) % BURMESE_LO OKAHEAD].candidates(text, fDictionary, rangeEnd);

	// Restore the iterator to the probe position after the candidates() call.
	777 utext_setNativeIndex(text, current + cuWordLength + char s);

	778 if (candidates > 0) {

	779 break;

	780 }

	781 }

	782 }

	783

	784 // Bump the word count if there wasn't already one

	785 if (cuWordLength <= 0) {

	786 wordsFound += 1;

	787 }

	788

	789 // Update the length with the passed-over characters

	790 cuWordLength += chars;

	791 }

	792 else {

	793 // Back up to where we were for next iteration

	794 utext_setNativeIndex(text, current + cuWordLength);

	795 }

	796 }

	797

	798 // Never stop before a combining mark.

	799 int32_t currPos;

	800 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) {

	801 utext_next32(text);

	802 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;

	803 }

	804

	805 // Look ahead for possible suffixes if a dictionary word does not follow.

	806 // We do this in code rather than using a rule so that the heuristic

	807 // resynch continues to function. For example, one of the suffix characters

	808 // could be a typo in the middle of a word.

	809 // NOT CURRENTLY APPLICABLE TO BURMESE

	810

	811 // Did we find a word on this iteration? If so, push it on the break stack

	812 if (cuWordLength > 0) {

	813 foundBreaks.push((current+cuWordLength), status);

	814 }

	815 }

	816

606 // Don't return a break for the end of the dictionary range if there is one there.	817 // Don't return a break for the end of the dictionary range if there is one there.

607 if (foundBreaks.peeki() >= rangeEnd) {	818 if (foundBreaks.peeki() >= rangeEnd) {

608 (void) foundBreaks.popi();	819 (void) foundBreaks.popi();

609 wordsFound -= 1;	820 wordsFound -= 1;

610 }	821 }

611	822

612 return wordsFound;	823 return wordsFound;

613 }	824 }

614	825

615 /*	826 /*

616 ******************************************************************	827 ******************************************************************

617 * KhmerBreakEngine	828 * KhmerBreakEngine

618 */	829 */

619	830

// Tuning constants for KhmerBreakEngine::divideUpDictionaryRange.
// NOTE(review): the right-hand (new) revision turns these macros into typed
// static const int32_t constants and also changes two of the values (see below).
620 // How many words in a row are "good enough"?	831 // How many words in a row are "good enough"?

621 #define KHMER_LOOKAHEAD 3	832 static const int32_t KHMER_LOOKAHEAD = 3;

622	833

623 // Will not combine a non-word with a preceding dictionary word longer than this	834 // Will not combine a non-word with a preceding dictionary word longer than this

// NOTE(review): value changes from 10 (old) to 3 (new); the new code compares it
// against cpWordLength where the old code compared it against wordLength.
624 #define KHMER_ROOT_COMBINE_THRESHOLD 10	835 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;

625	836

626 // Will not combine a non-word that shares at least this much prefix with a	837 // Will not combine a non-word that shares at least this much prefix with a

627 // dictionary word, with a preceding word	838 // dictionary word, with a preceding word

// NOTE(review): value changes from 5 (old) to 3 (new); compared against longestPrefix().
628 #define KHMER_PREFIX_COMBINE_THRESHOLD 5	839 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;

629	840

630 // Minimum word size	841 // Minimum word size

631 #define KHMER_MIN_WORD 2	842 static const int32_t KHMER_MIN_WORD = 2;

632	843

633 // Minimum number of characters for two words	844 // Minimum number of characters for two words

// Ranges with (rangeEnd - rangeStart) below this value are not divided (0 is returned).
634 #define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)	845 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;

635	846

636 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod e &status)	847 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod e &status)

637 : DictionaryBreakEngine((1 << UBRK_WORD) \| (1 << UBRK_LINE)),	848 : DictionaryBreakEngine((1 << UBRK_WORD) \| (1 << UBRK_LINE)),

638 fDictionary(adoptDictionary)	849 fDictionary(adoptDictionary)

639 {	850 {

640 fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:] ]"), status);	851 fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:] ]"), status);

641 if (U_SUCCESS(status)) {	852 if (U_SUCCESS(status)) {

642 setCharacters(fKhmerWordSet);	853 setCharacters(fKhmerWordSet);

643 }	854 }

644 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M: ]]"), status);	855 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M: ]]"), status);

(...skipping 26 matching lines...) Expand all Loading...
671 int32_t	882 int32_t

672 KhmerBreakEngine::divideUpDictionaryRange( UText *text,	883 KhmerBreakEngine::divideUpDictionaryRange( UText *text,

673 int32_t rangeStart,	884 int32_t rangeStart,

674 int32_t rangeEnd,	885 int32_t rangeEnd,

675 UStack &foundBreaks ) const {	886 UStack &foundBreaks ) const {

676 if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {	887 if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {

677 return 0; // Not enough characters for two words	888 return 0; // Not enough characters for two words

678 }	889 }

679	890

680 uint32_t wordsFound = 0;	891 uint32_t wordsFound = 0;

681 int32_t wordLength;	892 int32_t cpWordLength = 0;

	893 int32_t cuWordLength = 0;

682 int32_t current;	894 int32_t current;

683 UErrorCode status = U_ZERO_ERROR;	895 UErrorCode status = U_ZERO_ERROR;

684 PossibleWord words[KHMER_LOOKAHEAD];	896 PossibleWord words[KHMER_LOOKAHEAD];

685 UChar32 uc;

686	897

687 utext_setNativeIndex(text, rangeStart);	898 utext_setNativeIndex(text, rangeStart);

688	899

689 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {	900 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {

690 wordLength = 0;	901 cuWordLength = 0;

	902 cpWordLength = 0;

691	903

692 // Look for candidate words at the current position	904 // Look for candidate words at the current position

693 int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDic tionary, rangeEnd);	905 int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

694	906

695 // If we found exactly one, use that	907 // If we found exactly one, use that

696 if (candidates == 1) {	908 if (candidates == 1) {

697 wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);	909 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text );

	910 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();

698 wordsFound += 1;	911 wordsFound += 1;

699 }	912 }

700	913

701 // If there was more than one, see which one can take us forward the mos t words	914 // If there was more than one, see which one can take us forward the mos t words

702 else if (candidates > 1) {	915 else if (candidates > 1) {

703 // If we're already at the end of the range, we're done	916 // If we're already at the end of the range, we're done

704 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {	917 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

705 goto foundBest;	918 goto foundBest;

706 }	919 }

707 do {	920 do {

708 int wordsMatched = 1;	921 int32_t wordsMatched = 1;

709 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, f Dictionary, rangeEnd) > 0) {	922 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, f Dictionary, rangeEnd) > 0) {

710 if (wordsMatched < 2) {	923 if (wordsMatched < 2) {

711 // Followed by another dictionary word; mark first word as a good candidate	924 // Followed by another dictionary word; mark first word as a good candidate

712 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();	925 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();

713 wordsMatched = 2;	926 wordsMatched = 2;

714 }	927 }

715	928

716 // If we're already at the end of the range, we're done	929 // If we're already at the end of the range, we're done

717 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {	930 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

718 goto foundBest;	931 goto foundBest;

719 }	932 }

720	933

721 // See if any of the possible second words is followed by a third word	934 // See if any of the possible second words is followed by a third word

722 do {	935 do {

723 // If we find a third word, stop right away	936 // If we find a third word, stop right away

724 if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates (text, fDictionary, rangeEnd)) {	937 if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates (text, fDictionary, rangeEnd)) {

725 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();	938 words[wordsFound % KHMER_LOOKAHEAD].markCurrent();

726 goto foundBest;	939 goto foundBest;

727 }	940 }

728 }	941 }

729 while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text ));	942 while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text ));

730 }	943 }

731 }	944 }

732 while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));	945 while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));

733 foundBest:	946 foundBest:

734 wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);	947 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text );

	948 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();

735 wordsFound += 1;	949 wordsFound += 1;

736 }	950 }

737	951

738 // We come here after having either found a word or not. We look ahead t o the	952 // We come here after having either found a word or not. We look ahead t o the

739 // next word. If it's not a dictionary word, we will combine it with the word we	953 // next word. If it's not a dictionary word, we will combine it with the word we

740 // just found (if there is one), but only if the preceding word does not exceed	954 // just found (if there is one), but only if the preceding word does not exceed

741 // the threshold.	955 // the threshold.

742 // The text iterator should now be positioned at the end of the word we found.	956 // The text iterator should now be positioned at the end of the word we found.

743 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER _ROOT_COMBINE_THRESHOLD) {	957 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHM ER_ROOT_COMBINE_THRESHOLD) {

744 // if it is a dictionary word, do nothing. If it isn't, then if ther e is	958 // if it is a dictionary word, do nothing. If it isn't, then if ther e is

745 // no preceding word, or the non-word shares less than the minimum t hreshold	959 // no preceding word, or the non-word shares less than the minimum t hreshold

746 // of characters with a dictionary word, then scan to resynchronize	960 // of characters with a dictionary word, then scan to resynchronize

747 if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary , rangeEnd) <= 0	961 if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary , rangeEnd) <= 0

748 && (wordLength == 0	962 && (cuWordLength == 0

749 \|\| words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < K HMER_PREFIX_COMBINE_THRESHOLD)) {	963 \|\| words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < K HMER_PREFIX_COMBINE_THRESHOLD)) {

750 // Look for a plausible word boundary	964 // Look for a plausible word boundary

751 //TODO: This section will need a rework for UText.	965 int32_t remaining = rangeEnd - (current+cuWordLength);

752 int32_t remaining = rangeEnd - (current+wordLength);	966 UChar32 pc;

753 UChar32 pc = utext_current32(text);	967 UChar32 uc;

754 int32_t chars = 0;	968 int32_t chars = 0;

755 for (;;) {	969 for (;;) {

756 utext_next32(text);	970 int32_t pcIndex = utext_getNativeIndex(text);

757 uc = utext_current32(text);	971 pc = utext_next32(text);

758 // TODO: Here we're counting on the fact that the SA languag es are all	972 int32_t pcSize = utext_getNativeIndex(text) - pcIndex;

759 // in the BMP. This should get fixed with the UText rework.	973 chars += pcSize;

760 chars += 1;	974 remaining -= pcSize;

761 if (--remaining <= 0) {	975 if (remaining <= 0) {

762 break;	976 break;

763 }	977 }

	978 uc = utext_current32(text);

764 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {	979 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {

765 // Maybe. See if it's in the dictionary.	980 // Maybe. See if it's in the dictionary.

766 int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEA D].candidates(text, fDictionary, rangeEnd);	981 int32_t candidates = words[(wordsFound + 1) % KHMER_LOOK AHEAD].candidates(text, fDictionary, rangeEnd);

767 utext_setNativeIndex(text, current+wordLength+chars);	982 utext_setNativeIndex(text, current+cuWordLength+chars);

768 if (candidates > 0) {	983 if (candidates > 0) {

769 break;	984 break;

770 }	985 }

771 }	986 }

772 pc = uc;

773 }	987 }

774	988

775 // Bump the word count if there wasn't already one	989 // Bump the word count if there wasn't already one

776 if (wordLength <= 0) {	990 if (cuWordLength <= 0) {

777 wordsFound += 1;	991 wordsFound += 1;

778 }	992 }

779	993

780 // Update the length with the passed-over characters	994 // Update the length with the passed-over characters

781 wordLength += chars;	995 cuWordLength += chars;

782 }	996 }

783 else {	997 else {

784 // Back up to where we were for next iteration	998 // Back up to where we were for next iteration

785 utext_setNativeIndex(text, current+wordLength);	999 utext_setNativeIndex(text, current+cuWordLength);

786 }	1000 }

787 }	1001 }

788	1002

789 // Never stop before a combining mark.	1003 // Never stop before a combining mark.

790 int32_t currPos;	1004 int32_t currPos;

791 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) {	1005 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMa rkSet.contains(utext_current32(text))) {

792 utext_next32(text);	1006 utext_next32(text);

793 wordLength += (int32_t)utext_getNativeIndex(text) - currPos;	1007 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;

794 }	1008 }

795	1009

796 // Look ahead for possible suffixes if a dictionary word does not follow .	1010 // Look ahead for possible suffixes if a dictionary word does not follow .

797 // We do this in code rather than using a rule so that the heuristic	1011 // We do this in code rather than using a rule so that the heuristic

798 // resynch continues to function. For example, one of the suffix charact ers	1012 // resynch continues to function. For example, one of the suffix charact ers

799 // could be a typo in the middle of a word.	1013 // could be a typo in the middle of a word.

800 // if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {	1014 // if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {

801 // if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary , rangeEnd) <= 0	1015 // if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary , rangeEnd) <= 0

802 // && fSuffixSet.contains(uc = utext_current32(text))) {	1016 // && fSuffixSet.contains(uc = utext_current32(text))) {

803 // if (uc == KHMER_PAIYANNOI) {	1017 // if (uc == KHMER_PAIYANNOI) {

(...skipping 21 matching lines...) Expand all Loading...
825 // utext_next32(text);	1039 // utext_next32(text);

826 // }	1040 // }

827 // }	1041 // }

828 // }	1042 // }

829 // else {	1043 // else {

830 // utext_setNativeIndex(text, current+wordLength);	1044 // utext_setNativeIndex(text, current+wordLength);

831 // }	1045 // }

832 // }	1046 // }

833	1047

834 // Did we find a word on this iteration? If so, push it on the break sta ck	1048 // Did we find a word on this iteration? If so, push it on the break sta ck

835 if (wordLength > 0) {	1049 if (cuWordLength > 0) {

836 foundBreaks.push((current+wordLength), status);	1050 foundBreaks.push((current+cuWordLength), status);

837 }	1051 }

838 }	1052 }

839	1053

840 // Don't return a break for the end of the dictionary range if there is one there.	1054 // Don't return a break for the end of the dictionary range if there is one there.

841 if (foundBreaks.peeki() >= rangeEnd) {	1055 if (foundBreaks.peeki() >= rangeEnd) {

842 (void) foundBreaks.popi();	1056 (void) foundBreaks.popi();

843 wordsFound -= 1;	1057 wordsFound -= 1;

844 }	1058 }

845	1059

846 return wordsFound;	1060 return wordsFound;

847 }	1061 }

848	1062

849 #if !UCONFIG_NO_NORMALIZATION	1063 #if !UCONFIG_NO_NORMALIZATION

850 /*	1064 /*

851 ******************************************************************	1065 ******************************************************************

852 * CjkBreakEngine	1066 * CjkBreakEngine

853 */	1067 */

// Largest value representable in a uint32_t (all 32 bits set).
// NOTE(review): its uses are outside this excerpt - presumably a "no value yet" sentinel; confirm.
854 static const uint32_t kuint32max = 0xFFFFFFFF;	1068 static const uint32_t kuint32max = 0xFFFFFFFF;

855 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)	1069 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)

856 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {	1070 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {

857 // Korean dictionary only includes Hangul syllables	1071 // Korean dictionary only includes Hangul syllables

858 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), stat us);	1072 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), stat us);

859 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);	1073 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);

860 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\u ff9f]"), status);	1074 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\u ff9f]"), status);

861 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status) ;	1075 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status) ;

	1076 nfkcNorm2 = Normalizer2::getNFKCInstance(status);

862	1077

863 if (U_SUCCESS(status)) {	1078 if (U_SUCCESS(status)) {

864 // handle Korean and Japanese/Chinese using different dictionaries	1079 // handle Korean and Japanese/Chinese using different dictionaries

865 if (type == kKorean) {	1080 if (type == kKorean) {

866 setCharacters(fHangulWordSet);	1081 setCharacters(fHangulWordSet);

867 } else { //Chinese and Japanese	1082 } else { //Chinese and Japanese

868 UnicodeSet cjSet;	1083 UnicodeSet cjSet;

869 cjSet.addAll(fHanWordSet);	1084 cjSet.addAll(fHanWordSet);

870 cjSet.addAll(fKatakanaWordSet);	1085 cjSet.addAll(fKatakanaWordSet);

871 cjSet.addAll(fHiraganaWordSet);	1086 cjSet.addAll(fHiraganaWordSet);

872 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK	1087 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK

873 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK	1088 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK

874 setCharacters(cjSet);	1089 setCharacters(cjSet);

875 }	1090 }

876 }	1091 }

877 }	1092 }

878	1093

879 CjkBreakEngine::~CjkBreakEngine(){	1094 CjkBreakEngine::~CjkBreakEngine(){

880 delete fDictionary;	1095 delete fDictionary;

881 }	1096 }

882	1097

883 // The katakanaCost values below are based on the length frequencies of all	1098 // The katakanaCost values below are based on the length frequencies of all

884 // katakana phrases in the dictionary	1099 // katakana phrases in the dictionary

885 static const int kMaxKatakanaLength = 8;	1100 static const int32_t kMaxKatakanaLength = 8;

886 static const int kMaxKatakanaGroupLength = 20;	1101 static const int32_t kMaxKatakanaGroupLength = 20;

887 static const uint32_t maxSnlp = 255;	1102 static const uint32_t maxSnlp = 255;

888	1103

889 static inline uint32_t getKatakanaCost(int wordLength){	1104 static inline uint32_t getKatakanaCost(int32_t wordLength){

890 //TODO: fill array with actual values from dictionary!	1105 //TODO: fill array with actual values from dictionary!

891 static const uint32_t katakanaCost[kMaxKatakanaLength + 1]	1106 static const uint32_t katakanaCost[kMaxKatakanaLength + 1]

892 = {8192, 984, 408, 240, 204, 252, 300, 37 2, 480};	1107 = {8192, 984, 408, 240, 204, 252, 300, 37 2, 480};

893 return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];	1108 return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];

894 }	1109 }

895	1110

896 static inline bool isKatakana(uint16_t value) {	1111 static inline bool isKatakana(uint16_t value) {

897 return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) \|\|	1112 return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) \|\|

898 (value >= 0xFF66u && value <= 0xFF9fu);	1113 (value >= 0xFF66u && value <= 0xFF9fu);

899 }	1114 }

900	1115

901 // A very simple helper class to streamline the buffer handling in

902 // divideUpDictionaryRange.

903 template<class T, size_t N>

904 class AutoBuffer {

905 public:

906 AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {

907 if (size > N) {

908 buffer = reinterpret_cast<T>(uprv_malloc(sizeof(T)size));

909 capacity = size;

910 }

911 }

912 ~AutoBuffer() {

913 if (buffer != stackBuffer)

914 uprv_free(buffer);

915 }

916	1116

917 T* elems() {	1117 // Function for accessing internal utext flags.

918 return buffer;	1118 // Replicates an internal UText function.

919 }

920	1119

921 const T& operator[] (size_t i) const {	1120 static inline int32_t utext_i32_flag(int32_t bitIndex) {

922 return buffer[i];	1121 return (int32_t)1 << bitIndex;

923 }	1122 }

924	1123

925 T& operator[] (size_t i) {	1124

926 return buffer[i];

927 }

928

929 // resize without copy

930 void resize(size_t size) {

931 if (size <= capacity)

932 return;

933 if (buffer != stackBuffer)

934 uprv_free(buffer);

935 buffer = reinterpret_cast<T>(uprv_malloc(sizeof(T)size));

936 capacity = size;

937 }

938

939 private:

940 T stackBuffer[N];

941 T* buffer;

942 AutoBuffer();

943 size_t capacity;

944 };

945

946

947 /*	1125 /*

948 * @param text A UText representing the text	1126 * @param text A UText representing the text

949 * @param rangeStart The start of the range of dictionary characters	1127 * @param rangeStart The start of the range of dictionary characters

950 * @param rangeEnd The end of the range of dictionary characters	1128 * @param rangeEnd The end of the range of dictionary characters

951 * @param foundBreaks Output of C array of int32_t break positions, or 0	1129 * @param foundBreaks Output of C array of int32_t break positions, or 0

952 * @return The number of breaks found	1130 * @return The number of breaks found

953 */	1131 */

954 int32_t	1132 int32_t

955 CjkBreakEngine::divideUpDictionaryRange( UText *text,	1133 CjkBreakEngine::divideUpDictionaryRange( UText *inText,

956 int32_t rangeStart,	1134 int32_t rangeStart,

957 int32_t rangeEnd,	1135 int32_t rangeEnd,

958 UStack &foundBreaks ) const {	1136 UStack &foundBreaks ) const {

959 if (rangeStart >= rangeEnd) {	1137 if (rangeStart >= rangeEnd) {

960 return 0;	1138 return 0;

961 }	1139 }

962	1140

963 const size_t defaultInputLength = 80;	1141 // UnicodeString version of input UText, NFKC normalized if necessary.

964 size_t inputLength = rangeEnd - rangeStart;	1142 UnicodeString *inString;

965 // TODO: Replace by UnicodeString.

966 AutoBuffer<UChar, defaultInputLength> charString(inputLength);

967	1143

968 // Normalize the input string and put it in normalizedText.	1144 // inputMap[inStringIndex] = corresponding native index from UText inText.

969 // The map from the indices of the normalized input to the raw	1145 // If NULL then mapping is 1:1

970 // input is kept in charPositions.	1146 UVector32 *inputMap = NULL;

971 UErrorCode status = U_ZERO_ERROR;	1147

972 utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, & status);	1148 UErrorCode status = U_ZERO_ERROR;

973 if (U_FAILURE(status)) {	1149

974 return 0;	1150

	1151 // if UText has the input string as one contiguous UTF-16 chunk

	1152 if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNK S)) &&

	1153 inText->chunkNativeStart <= rangeStart &&

	1154 inText->chunkNativeLimit >= rangeEnd &&

	1155 inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {

	1156

	1157 // Input UText is in one contiguous UTF-16 chunk.

	1158 // Use Read-only aliasing UnicodeString constructor on it.

	1159 inString = new UnicodeString(FALSE,

	1160 inText->chunkContents + rangeStart - inText->chunk NativeStart,

	1161 rangeEnd - rangeStart);

	1162 } else {

	1163 // Copy the text from the original inText (UText) to inString (UnicodeString).

	1164 // Create a map from UnicodeString indices -> UText offsets.

	1165 utext_setNativeIndex(inText, rangeStart);

	1166 int32_t limit = rangeEnd;

	1167 U_ASSERT(limit <= utext_nativeLength(inText));

	1168 if (limit > utext_nativeLength(inText)) {

	1169 limit = utext_nativeLength(inText);

	1170 }

	1171 inString = new UnicodeString;

	1172 inputMap = new UVector32(status);

	1173 while (utext_getNativeIndex(inText) < limit) {

	1174 int32_t nativePosition = utext_getNativeIndex(inText);

	1175 UChar32 c = utext_next32(inText);

	1176 U_ASSERT(c != U_SENTINEL);

	1177 inString->append(c);

	1178 while (inputMap->size() < inString->length()) {

	1179 inputMap->addElement(nativePosition, status);

	1180 }

	1181 }

	1182 inputMap->addElement(limit, status);

975 }	1183 }

976	1184

977 UnicodeString inputString(charString.elems(), inputLength);

978 // TODO: Use Normalizer2.

979 UNormalizationMode norm_mode = UNORM_NFKC;

980 UBool isNormalized =

981 Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES \|\|

982 Normalizer::isNormalized(inputString, norm_mode, status);

983	1185

984 // TODO: Replace by UVector32.	1186 if (!nfkcNorm2->isNormalized(*inString, status)) {

985 AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);	1187 UnicodeString *normalizedInput = new UnicodeString();

986 int numChars = 0;	1188 // normalizedMap[normalizedInput position] == original UText position.

987 UText normalizedText = UTEXT_INITIALIZER;	1189 UVector32 *normalizedMap = new UVector32(status);

988 // Needs to be declared here because normalizedText holds onto its buffer.

989 UnicodeString normalizedString;

990 if (isNormalized) {

991 int32_t index = 0;

992 charPositions[0] = 0;

993 while(index < inputString.length()) {

994 index = inputString.moveIndex32(index, 1);

995 charPositions[++numChars] = index;

996 }

997 utext_openUnicodeString(&normalizedText, &inputString, &status);

998 }

999 else {

1000 Normalizer::normalize(inputString, norm_mode, 0, normalizedString, statu s);

1001 if (U_FAILURE(status)) {	1190 if (U_FAILURE(status)) {

1002 return 0;	1191 return 0;

1003 }	1192 }

1004 charPositions.resize(normalizedString.length() + 1);	1193

1005 Normalizer normalizer(charString.elems(), inputLength, norm_mode);	1194 UnicodeString fragment;

1006 int32_t index = 0;	1195 UnicodeString normalizedFragment;

1007 charPositions[0] = 0;	1196 for (int32_t srcI = 0; srcI < inString->length();) { // Once per normalization chunk

1008 while(index < normalizer.endIndex()){	1197 fragment.remove();

1009 /* UChar32 uc = */ normalizer.next();	1198 int32_t fragmentStartI = srcI;

1010 charPositions[++numChars] = index = normalizer.getIndex();	1199 UChar32 c = inString->char32At(srcI);

	1200 for (;;) {

	1201 fragment.append(c);

	1202 srcI = inString->moveIndex32(srcI, 1);

	1203 if (srcI == inString->length()) {

	1204 break;

	1205 }

	1206 c = inString->char32At(srcI);

	1207 if (nfkcNorm2->hasBoundaryBefore(c)) {

	1208 break;

	1209 }

	1210 }

	1211 nfkcNorm2->normalize(fragment, normalizedFragment, status);

	1212 normalizedInput->append(normalizedFragment);

	1213

	1214 // Map every position in the normalized chunk to the start of the chunk

	1215 // in the original input.

	1216 int32_t fragmentOriginalStart = inputMap? inputMap->elementAti(fragm entStartI) : fragmentStartI+rangeStart;

	1217 while (normalizedMap->size() < normalizedInput->length()) {

	1218 normalizedMap->addElement(fragmentOriginalStart, status);

	1219 if (U_FAILURE(status)) {

	1220 break;

	1221 }

	1222 }

1011 }	1223 }

1012 utext_openUnicodeString(&normalizedText, &normalizedString, &status);	1224 U_ASSERT(normalizedMap->size() == normalizedInput->length());

	1225 int32_t nativeEnd = inputMap? inputMap->elementAti(inString->length()) : inString->length()+rangeStart;

	1226 normalizedMap->addElement(nativeEnd, status);

	1227

	1228 delete inputMap;

	1229 inputMap = normalizedMap;

	1230 delete inString;

	1231 inString = normalizedInput;

1013 }	1232 }

1014	1233

1015 if (U_FAILURE(status)) {	1234 int32_t numCodePts = inString->countChar32();

1016 return 0;	1235 if (numCodePts != inString->length()) {

	1236 // There are supplementary characters in the input.

	1237 // The dictionary will produce boundary positions in terms of code point indexes,

	1238 // not in terms of code unit string indexes.

	1239 // Use the inputMap mechanism to take care of this in addition to indexing differences

	1240 // from normalization and/or UTF-8 input.

	1241 UBool hadExistingMap = (inputMap != NULL);

	1242 if (!hadExistingMap) {

	1243 inputMap = new UVector32(status);

	1244 }

	1245 int32_t cpIdx = 0;

	1246 for (int32_t cuIdx = 0; ; cuIdx = inString->moveIndex32(cuIdx, 1)) {

	1247 U_ASSERT(cuIdx >= cpIdx);

	1248 if (hadExistingMap) {

	1249 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);

	1250 } else {

	1251 inputMap->addElement(cuIdx+rangeStart, status);

	1252 }

	1253 cpIdx++;

	1254 if (cuIdx == inString->length()) {

	1255 break;

	1256 }

	1257 }

	1258 }

	1259

	1260 // bestSnlp[i] is the snlp of the best segmentation of the first i

	1261 // code points in the range to be matched.

	1262 UVector32 bestSnlp(numCodePts + 1, status);

	1263 bestSnlp.addElement(0, status);

	1264 for(int32_t i = 1; i <= numCodePts; i++) {

	1265 bestSnlp.addElement(kuint32max, status);

1017 }	1266 }

1018	1267

1019 // From this point on, all the indices refer to the indices of

1020 // the normalized input string.

1021	1268

1022 // bestSnlp[i] is the snlp of the best segmentation of the first i	1269 // prev[i] is the index of the last CJK code point in the previous word in

1023 // characters in the range to be matched.	1270 // the best segmentation of the first i characters.

1024 // TODO: Replace by UVector32.	1271 UVector32 prev(numCodePts + 1, status);

1025 AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);	1272 for(int32_t i = 0; i <= numCodePts; i++){

1026 bestSnlp[0] = 0;	1273 prev.addElement(-1, status);

1027 for(int i = 1; i <= numChars; i++) {

1028 bestSnlp[i] = kuint32max;

1029 }	1274 }

1030	1275

1031 // prev[i] is the index of the last CJK character in the previous word in	1276 const int32_t maxWordSize = 20;

1032 // the best segmentation of the first i characters.	1277 UVector32 values(numCodePts, status);

1033 // TODO: Replace by UVector32.	1278 values.setSize(numCodePts);

1034 AutoBuffer<int, defaultInputLength> prev(numChars + 1);	1279 UVector32 lengths(numCodePts, status);

1035 for(int i = 0; i <= numChars; i++){	1280 lengths.setSize(numCodePts);

1036 prev[i] = -1;

1037 }

1038	1281

1039 const size_t maxWordSize = 20;	1282 UText fu = UTEXT_INITIALIZER;

1040 // TODO: Replace both with UVector32.	1283 utext_openUnicodeString(&fu, inString, &status);

1041 AutoBuffer<int32_t, maxWordSize> values(numChars);

1042 AutoBuffer<int32_t, maxWordSize> lengths(numChars);

1043	1284

1044 // Dynamic programming to find the best segmentation.	1285 // Dynamic programming to find the best segmentation.

1045 bool is_prev_katakana = false;	1286

1046 for (int32_t i = 0; i < numChars; ++i) {	1287 // In outer loop, i is the code point index,

1047 //utext_setNativeIndex(text, rangeStart + i);	1288 // ix is the corresponding string (code unit) index.

1048 utext_setNativeIndex(&normalizedText, i);	1289 // They differ when the string contains supplementary characters.

1049 if (bestSnlp[i] == kuint32max)	1290 int32_t ix = 0;

	1291 for (int32_t i = 0; i < numCodePts; ++i, ix = inString->moveIndex32(ix, 1) ) {

	1292 if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {

1050 continue;	1293 continue;

	1294 }

1051	1295

1052 int32_t count;	1296 int32_t count;

1053 // limit maximum word length matched to size of current substring	1297 utext_setNativeIndex(&fu, ix);

1054 int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWord Size : (numChars - i);	1298 count = fDictionary->matches(&fu, maxWordSize, numCodePts,

1055	1299 NULL, lengths.getBuffer(), values.getBuffer(), NULL );

1056 fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());	1300 // Note: lengths is filled with code point lengths

	1301 // The NULL parameter is the ignored code unit lengths.

1057	1302

1058 // if there are no single character matches found in the dictionary	1303 // if there are no single character matches found in the dictionary

1059 // starting with this character, treat character as a 1-character word	1304 // starting with this character, treat character as a 1-character word

1060 // with the highest value possible, i.e. the least likely to occur.	1305 // with the highest value possible, i.e. the least likely to occur.

1061 // Exclude Korean characters from this treatment, as they should be left	1306 // Exclude Korean characters from this treatment, as they should be left

1062 // together by default.	1307 // together by default.

1063 if((count == 0 \|\| lengths[0] != 1) &&	1308 if ((count == 0 \|\| lengths.elementAti(0) != 1) &&

1064 !fHangulWordSet.contains(utext_current32(&normalizedText))) {	1309 !fHangulWordSet.contains(inString->char32At(ix))) {

1065 values[count] = maxSnlp;	1310 values.setElementAt(maxSnlp, count); // 255

1066 lengths[count++] = 1;	1311 lengths.setElementAt(1, count++);

1067 }	1312 }

1068	1313

1069 for (int j = 0; j < count; j++) {	1314 for (int32_t j = 0; j < count; j++) {

1070 uint32_t newSnlp = bestSnlp[i] + values[j];	1315 uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)valu es.elementAti(j);

1071 if (newSnlp < bestSnlp[lengths[j] + i]) {	1316 int32_t ln_j_i = lengths.elementAti(j) + i;

1072 bestSnlp[lengths[j] + i] = newSnlp;	1317 if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {

1073 prev[lengths[j] + i] = i;	1318 bestSnlp.setElementAt(newSnlp, ln_j_i);

	1319 prev.setElementAt(i, ln_j_i);

1074 }	1320 }

1075 }	1321 }

1076	1322

1077 // In Japanese,	1323 // In Japanese,

1078 // Katakana word in single character is pretty rare. So we apply	1324 // Katakana word in single character is pretty rare. So we apply

1079 // the following heuristic to Katakana: any continuous run of Katakana	1325 // the following heuristic to Katakana: any continuous run of Katakana

1080 // characters is considered a candidate word with a default cost	1326 // characters is considered a candidate word with a default cost

1081 // specified in the katakanaCost table according to its length.	1327 // specified in the katakanaCost table according to its length.

1082 //utext_setNativeIndex(text, rangeStart + i);	1328

1083 utext_setNativeIndex(&normalizedText, i);	1329 bool is_prev_katakana = false;

1084 bool is_katakana = isKatakana(utext_current32(&normalizedText));	1330 bool is_katakana = isKatakana(inString->char32At(ix));

	1331 int32_t katakanaRunLength = 1;

1085 if (!is_prev_katakana && is_katakana) {	1332 if (!is_prev_katakana && is_katakana) {

1086 int j = i + 1;	1333 int32_t j = inString->moveIndex32(ix, 1);

1087 utext_next32(&normalizedText);

1088 // Find the end of the continuous run of Katakana characters	1334 // Find the end of the continuous run of Katakana characters

1089 while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&	1335 while (j < inString->length() && katakanaRunLength < kMaxKatakanaGro upLength &&

1090 isKatakana(utext_current32(&normalizedText))) {	1336 isKatakana(inString->char32At(j))) {

1091 utext_next32(&normalizedText);	1337 j = inString->moveIndex32(j, 1);

1092 ++j;	1338 katakanaRunLength++;

1093 }	1339 }

1094 if ((j - i) < kMaxKatakanaGroupLength) {	1340 if (katakanaRunLength < kMaxKatakanaGroupLength) {

1095 uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);	1341 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(kata kanaRunLength);

1096 if (newSnlp < bestSnlp[j]) {	1342 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {

1097 bestSnlp[j] = newSnlp;	1343 bestSnlp.setElementAt(newSnlp, j);

1098 prev[j] = i;	1344 prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;

1099 }	1345 }

1100 }	1346 }

1101 }	1347 }

1102 is_prev_katakana = is_katakana;	1348 is_prev_katakana = is_katakana;

1103 }	1349 }

	1350 utext_close(&fu);

1104	1351

1105 // Start pushing the optimal offset index into t_boundary (t for tentative).	1352 // Start pushing the optimal offset index into t_boundary (t for tentative).

1106 // prev[numChars] is guaranteed to be meaningful.	1353 // prev[numCodePts] is guaranteed to be meaningful.

1107 // We'll first push in the reverse order, i.e.,	1354 // We'll first push in the reverse order, i.e.,

1108 // t_boundary[0] = numChars, and afterwards do a swap.	1355 // t_boundary[0] = numCodePts, and afterwards do a swap.

1109 // TODO: Replace by UVector32.	1356 UVector32 t_boundary(numCodePts+1, status);

1110 AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

1111	1357

1112 int numBreaks = 0;	1358 int32_t numBreaks = 0;

1113 // No segmentation found, set boundary to end of range	1359 // No segmentation found, set boundary to end of range

1114 if (bestSnlp[numChars] == kuint32max) {	1360 if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {

1115 t_boundary[numBreaks++] = numChars;	1361 t_boundary.addElement(numCodePts, status);

	1362 numBreaks++;

1116 } else {	1363 } else {

1117 for (int i = numChars; i > 0; i = prev[i]) {	1364 for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {

1118 t_boundary[numBreaks++] = i;	1365 t_boundary.addElement(i, status);

	1366 numBreaks++;

1119 }	1367 }

1120 U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);	1368 U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);

1121 }	1369 }

1122	1370

1123 // Reverse offset index in t_boundary.	1371 // Add a break for the start of the dictionary range if there is not one

1124 // Don't add a break for the start of the dictionary range if there is one

1125 // there already.	1372 // there already.

1126 if (foundBreaks.size() == 0 \|\| foundBreaks.peeki() < rangeStart) {	1373 if (foundBreaks.size() == 0 \|\| foundBreaks.peeki() < rangeStart) {

1127 t_boundary[numBreaks++] = 0;	1374 t_boundary.addElement(0, status);

	1375 numBreaks++;

1128 }	1376 }

1129	1377

1130 // Now that we're done, convert positions in t_bdry[] (indices in	1378 // Now that we're done, convert positions in t_boundary[] (indices in

1131 // the normalized input string) back to indices in the raw input string	1379 // the normalized input string) back to indices in the original input UText

1132 // while reversing t_bdry and pushing values to foundBreaks.	1380 // while reversing t_boundary and pushing values to foundBreaks.

1133 for (int i = numBreaks-1; i >= 0; i--) {	1381 for (int32_t i = numBreaks-1; i >= 0; i--) {

1134 foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);	1382 int32_t cpPos = t_boundary.elementAti(i);

	1383 int32_t utextPos = inputMap ? inputMap->elementAti(cpPos) : cpPos + ran geStart;

	1384 // Boundaries are added to foundBreaks output in ascending order.

	1385 U_ASSERT(foundBreaks.size() == 0 \|\|foundBreaks.peeki() < utextPos);

	1386 foundBreaks.push(utextPos, status);

1135 }	1387 }

1136	1388

1137 utext_close(&normalizedText);	1389 delete inString;

	1390 delete inputMap;

1138 return numBreaks;	1391 return numBreaks;

1139 }	1392 }

1140 #endif	1393 #endif

1141	1394

1142 U_NAMESPACE_END	1395 U_NAMESPACE_END

1143	1396

1144 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */	1397 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

1145	1398

OLD	NEW

« no previous file with comments | « source/common/dictbe.h ('k') | source/common/dictionarydata.h » ('j') | no next file with comments »