| OLD | NEW |
| 1 /** | 1 /** |
| 2 ******************************************************************************* | 2 ******************************************************************************* |
| 3 * Copyright (C) 2006-2014, International Business Machines Corporation | 3 * Copyright (C) 2006-2015, International Business Machines Corporation |
| 4 * and others. All Rights Reserved. | 4 * and others. All Rights Reserved. |
| 5 ******************************************************************************* | 5 ******************************************************************************* |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
| 9 | 9 |
| 10 #if !UCONFIG_NO_BREAK_ITERATION | 10 #if !UCONFIG_NO_BREAK_ITERATION |
| 11 | 11 |
| 12 #include "brkeng.h" | 12 #include "brkeng.h" |
| 13 #include "dictbe.h" | 13 #include "dictbe.h" |
| (...skipping 811 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 825 | 825 |
| 826 /* | 826 /* |
| 827 ****************************************************************** | 827 ****************************************************************** |
| 828 * KhmerBreakEngine | 828 * KhmerBreakEngine |
| 829 */ | 829 */ |
| 830 | 830 |
| 831 // How many words in a row are "good enough"? | 831 // How many words in a row are "good enough"? |
| 832 static const int32_t KHMER_LOOKAHEAD = 3; | 832 static const int32_t KHMER_LOOKAHEAD = 3; |
| 833 | 833 |
| 834 // Will not combine a non-word with a preceding dictionary word longer than this | 834 // Will not combine a non-word with a preceding dictionary word longer than this |
| 835 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 10; | 835 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3; |
| 836 | 836 |
| 837 // Will not combine a non-word that shares at least this much prefix with a | 837 // Will not combine a non-word that shares at least this much prefix with a |
| 838 // dictionary word, with a preceding word | 838 // dictionary word, with a preceding word |
| 839 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 5; | 839 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3; |
| 840 | 840 |
| 841 // Minimum word size | 841 // Minimum word size |
| 842 static const int32_t KHMER_MIN_WORD = 2; | 842 static const int32_t KHMER_MIN_WORD = 2; |
| 843 | 843 |
| 844 // Minimum number of characters for two words | 844 // Minimum number of characters for two words |
| 845 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2; | 845 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2; |
| 846 | 846 |
| 847 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod
e &status) | 847 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod
e &status) |
| 848 : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), | 848 : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), |
| 849 fDictionary(adoptDictionary) | 849 fDictionary(adoptDictionary) |
| (...skipping 281 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1131 */ | 1131 */ |
| 1132 int32_t | 1132 int32_t |
| 1133 CjkBreakEngine::divideUpDictionaryRange( UText *inText, | 1133 CjkBreakEngine::divideUpDictionaryRange( UText *inText, |
| 1134 int32_t rangeStart, | 1134 int32_t rangeStart, |
| 1135 int32_t rangeEnd, | 1135 int32_t rangeEnd, |
| 1136 UStack &foundBreaks ) const { | 1136 UStack &foundBreaks ) const { |
| 1137 if (rangeStart >= rangeEnd) { | 1137 if (rangeStart >= rangeEnd) { |
| 1138 return 0; | 1138 return 0; |
| 1139 } | 1139 } |
| 1140 | 1140 |
| 1141 // UnicodeString version of input UText, NFKC normalized in necessary. | 1141 // UnicodeString version of input UText, NFKC normalized if necessary. |
| 1142 UnicodeString *inString; | 1142 UnicodeString inString; |
| 1143 | 1143 |
| 1144 // inputMap[inStringIndex] = corresponding native index from UText inText. | 1144 // inputMap[inStringIndex] = corresponding native index from UText inText. |
| 1145 // If NULL then mapping is 1:1 | 1145 // If NULL then mapping is 1:1 |
| 1146 UVector32 *inputMap = NULL; | 1146 LocalPointer<UVector32> inputMap; |
| 1147 | 1147 |
| 1148 UErrorCode status = U_ZERO_ERROR; | 1148 UErrorCode status = U_ZERO_ERROR; |
| 1149 | 1149 |
| 1150 | 1150 |
| 1151 // if UText has the input string as one contiguous UTF-16 chunk | 1151 // if UText has the input string as one contiguous UTF-16 chunk |
| 1152 if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNK
S)) && | 1152 if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNK
S)) && |
| 1153 inText->chunkNativeStart <= rangeStart && | 1153 inText->chunkNativeStart <= rangeStart && |
| 1154 inText->chunkNativeLimit >= rangeEnd && | 1154 inText->chunkNativeLimit >= rangeEnd && |
| 1155 inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) { | 1155 inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) { |
| 1156 | 1156 |
| 1157 // Input UTtxt is in one contiguous UTF-16 chunk. | 1157 // Input UText is in one contiguous UTF-16 chunk. |
| 1158 // Use Read-only aliasing UnicodeString constructor on it. | 1158 // Use Read-only aliasing UnicodeString. |
| 1159 inString = new UnicodeString(FALSE, | 1159 inString.setTo(FALSE, |
| 1160 inText->chunkContents + rangeStart - inText->chunk
NativeStart, | 1160 inText->chunkContents + rangeStart - inText->chunkNativeS
tart, |
| 1161 rangeEnd - rangeStart); | 1161 rangeEnd - rangeStart); |
| 1162 } else { | 1162 } else { |
| 1163 // Copy the text from the original inText (UText) to inString (UnicodeSt
ring). | 1163 // Copy the text from the original inText (UText) to inString (UnicodeSt
ring). |
| 1164 // Create a map from UnicodeString indices -> UText offsets. | 1164 // Create a map from UnicodeString indices -> UText offsets. |
| 1165 utext_setNativeIndex(inText, rangeStart); | 1165 utext_setNativeIndex(inText, rangeStart); |
| 1166 int32_t limit = rangeEnd; | 1166 int32_t limit = rangeEnd; |
| 1167 U_ASSERT(limit <= utext_nativeLength(inText)); | 1167 U_ASSERT(limit <= utext_nativeLength(inText)); |
| 1168 if (limit > utext_nativeLength(inText)) { | 1168 if (limit > utext_nativeLength(inText)) { |
| 1169 limit = utext_nativeLength(inText); | 1169 limit = utext_nativeLength(inText); |
| 1170 } | 1170 } |
| 1171 inString = new UnicodeString; | 1171 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status); |
| 1172 inputMap = new UVector32(status); | 1172 if (U_FAILURE(status)) { |
| 1173 return 0; |
| 1174 } |
| 1173 while (utext_getNativeIndex(inText) < limit) { | 1175 while (utext_getNativeIndex(inText) < limit) { |
| 1174 int32_t nativePosition = utext_getNativeIndex(inText); | 1176 int32_t nativePosition = utext_getNativeIndex(inText); |
| 1175 UChar32 c = utext_next32(inText); | 1177 UChar32 c = utext_next32(inText); |
| 1176 U_ASSERT(c != U_SENTINEL); | 1178 U_ASSERT(c != U_SENTINEL); |
| 1177 inString->append(c); | 1179 inString.append(c); |
| 1178 while (inputMap->size() < inString->length()) { | 1180 while (inputMap->size() < inString.length()) { |
| 1179 inputMap->addElement(nativePosition, status); | 1181 inputMap->addElement(nativePosition, status); |
| 1180 } | 1182 } |
| 1181 } | 1183 } |
| 1182 inputMap->addElement(limit, status); | 1184 inputMap->addElement(limit, status); |
| 1183 } | 1185 } |
| 1184 | 1186 |
| 1185 | 1187 |
| 1186 if (!nfkcNorm2->isNormalized(*inString, status)) { | 1188 if (!nfkcNorm2->isNormalized(inString, status)) { |
| 1187 UnicodeString *normalizedInput = new UnicodeString(); | 1189 UnicodeString normalizedInput; |
| 1188 // normalizedMap[normalizedInput position] == original UText position. | 1190 // normalizedMap[normalizedInput position] == original UText position. |
| 1189 UVector32 *normalizedMap = new UVector32(status); | 1191 LocalPointer<UVector32> normalizedMap(new UVector32(status), status); |
| 1190 if (U_FAILURE(status)) { | 1192 if (U_FAILURE(status)) { |
| 1191 return 0; | 1193 return 0; |
| 1192 } | 1194 } |
| 1193 | 1195 |
| 1194 UnicodeString fragment; | 1196 UnicodeString fragment; |
| 1195 UnicodeString normalizedFragment; | 1197 UnicodeString normalizedFragment; |
| 1196 for (int32_t srcI = 0; srcI < inString->length();) { //
Once per normalization chunk | 1198 for (int32_t srcI = 0; srcI < inString.length();) { // Once per normali
zation chunk |
| 1197 fragment.remove(); | 1199 fragment.remove(); |
| 1198 int32_t fragmentStartI = srcI; | 1200 int32_t fragmentStartI = srcI; |
| 1199 UChar32 c = inString->char32At(srcI); | 1201 UChar32 c = inString.char32At(srcI); |
| 1200 for (;;) { | 1202 for (;;) { |
| 1201 fragment.append(c); | 1203 fragment.append(c); |
| 1202 srcI = inString->moveIndex32(srcI, 1); | 1204 srcI = inString.moveIndex32(srcI, 1); |
| 1203 if (srcI == inString->length()) { | 1205 if (srcI == inString.length()) { |
| 1204 break; | 1206 break; |
| 1205 } | 1207 } |
| 1206 c = inString->char32At(srcI); | 1208 c = inString.char32At(srcI); |
| 1207 if (nfkcNorm2->hasBoundaryBefore(c)) { | 1209 if (nfkcNorm2->hasBoundaryBefore(c)) { |
| 1208 break; | 1210 break; |
| 1209 } | 1211 } |
| 1210 } | 1212 } |
| 1211 nfkcNorm2->normalize(fragment, normalizedFragment, status); | 1213 nfkcNorm2->normalize(fragment, normalizedFragment, status); |
| 1212 normalizedInput->append(normalizedFragment); | 1214 normalizedInput.append(normalizedFragment); |
| 1213 | 1215 |
| 1214 // Map every position in the normalized chunk to the start of the ch
unk | 1216 // Map every position in the normalized chunk to the start of the ch
unk |
| 1215 // in the original input. | 1217 // in the original input. |
| 1216 int32_t fragmentOriginalStart = inputMap? inputMap->elementAti(fragm
entStartI) : fragmentStartI+rangeStart; | 1218 int32_t fragmentOriginalStart = inputMap.isValid() ? |
| 1217 while (normalizedMap->size() < normalizedInput->length()) { | 1219 inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeS
tart; |
| 1220 while (normalizedMap->size() < normalizedInput.length()) { |
| 1218 normalizedMap->addElement(fragmentOriginalStart, status); | 1221 normalizedMap->addElement(fragmentOriginalStart, status); |
| 1219 if (U_FAILURE(status)) { | 1222 if (U_FAILURE(status)) { |
| 1220 break; | 1223 break; |
| 1221 } | 1224 } |
| 1222 } | 1225 } |
| 1223 } | 1226 } |
| 1224 U_ASSERT(normalizedMap->size() == normalizedInput->length()); | 1227 U_ASSERT(normalizedMap->size() == normalizedInput.length()); |
| 1225 int32_t nativeEnd = inputMap? inputMap->elementAti(inString->length()) :
inString->length()+rangeStart; | 1228 int32_t nativeEnd = inputMap.isValid() ? |
| 1229 inputMap->elementAti(inString.length()) : inString.length()+rang
eStart; |
| 1226 normalizedMap->addElement(nativeEnd, status); | 1230 normalizedMap->addElement(nativeEnd, status); |
| 1227 | 1231 |
| 1228 delete inputMap; | 1232 inputMap.moveFrom(normalizedMap); |
| 1229 inputMap = normalizedMap; | 1233 inString.moveFrom(normalizedInput); |
| 1230 delete inString; | |
| 1231 inString = normalizedInput; | |
| 1232 } | 1234 } |
| 1233 | 1235 |
| 1234 int32_t numCodePts = inString->countChar32(); | 1236 int32_t numCodePts = inString.countChar32(); |
| 1235 if (numCodePts != inString->length()) { | 1237 if (numCodePts != inString.length()) { |
| 1236 // There are supplementary characters in the input. | 1238 // There are supplementary characters in the input. |
| 1237 // The dictionary will produce boundary positions in terms of code point
indexes, | 1239 // The dictionary will produce boundary positions in terms of code point
indexes, |
| 1238 // not in terms of code unit string indexes. | 1240 // not in terms of code unit string indexes. |
| 1239 // Use the inputMap mechanism to take care of this in addition to indexi
ng differences | 1241 // Use the inputMap mechanism to take care of this in addition to indexi
ng differences |
| 1240 // from normalization and/or UTF-8 input. | 1242 // from normalization and/or UTF-8 input. |
| 1241 UBool hadExistingMap = (inputMap != NULL); | 1243 UBool hadExistingMap = inputMap.isValid(); |
| 1242 if (!hadExistingMap) { | 1244 if (!hadExistingMap) { |
| 1243 inputMap = new UVector32(status); | 1245 inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status
); |
| 1246 if (U_FAILURE(status)) { |
| 1247 return 0; |
| 1248 } |
| 1244 } | 1249 } |
| 1245 int32_t cpIdx = 0; | 1250 int32_t cpIdx = 0; |
| 1246 for (int32_t cuIdx = 0; ; cuIdx = inString->moveIndex32(cuIdx, 1)) { | 1251 for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) { |
| 1247 U_ASSERT(cuIdx >= cpIdx); | 1252 U_ASSERT(cuIdx >= cpIdx); |
| 1248 if (hadExistingMap) { | 1253 if (hadExistingMap) { |
| 1249 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx); | 1254 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx); |
| 1250 } else { | 1255 } else { |
| 1251 inputMap->addElement(cuIdx+rangeStart, status); | 1256 inputMap->addElement(cuIdx+rangeStart, status); |
| 1252 } | 1257 } |
| 1253 cpIdx++; | 1258 cpIdx++; |
| 1254 if (cuIdx == inString->length()) { | 1259 if (cuIdx == inString.length()) { |
| 1255 break; | 1260 break; |
| 1256 } | 1261 } |
| 1257 } | 1262 } |
| 1258 } | 1263 } |
| 1259 | 1264 |
| 1260 // bestSnlp[i] is the snlp of the best segmentation of the first i | 1265 // bestSnlp[i] is the snlp of the best segmentation of the first i |
| 1261 // code points in the range to be matched. | 1266 // code points in the range to be matched. |
| 1262 UVector32 bestSnlp(numCodePts + 1, status); | 1267 UVector32 bestSnlp(numCodePts + 1, status); |
| 1263 bestSnlp.addElement(0, status); | 1268 bestSnlp.addElement(0, status); |
| 1264 for(int32_t i = 1; i <= numCodePts; i++) { | 1269 for(int32_t i = 1; i <= numCodePts; i++) { |
| 1265 bestSnlp.addElement(kuint32max, status); | 1270 bestSnlp.addElement(kuint32max, status); |
| 1266 } | 1271 } |
| 1267 | 1272 |
| 1268 | 1273 |
| 1269 // prev[i] is the index of the last CJK code point in the previous word in | 1274 // prev[i] is the index of the last CJK code point in the previous word in |
| 1270 // the best segmentation of the first i characters. | 1275 // the best segmentation of the first i characters. |
| 1271 UVector32 prev(numCodePts + 1, status); | 1276 UVector32 prev(numCodePts + 1, status); |
| 1272 for(int32_t i = 0; i <= numCodePts; i++){ | 1277 for(int32_t i = 0; i <= numCodePts; i++){ |
| 1273 prev.addElement(-1, status); | 1278 prev.addElement(-1, status); |
| 1274 } | 1279 } |
| 1275 | 1280 |
| 1276 const int32_t maxWordSize = 20; | 1281 const int32_t maxWordSize = 20; |
| 1277 UVector32 values(numCodePts, status); | 1282 UVector32 values(numCodePts, status); |
| 1278 values.setSize(numCodePts); | 1283 values.setSize(numCodePts); |
| 1279 UVector32 lengths(numCodePts, status); | 1284 UVector32 lengths(numCodePts, status); |
| 1280 lengths.setSize(numCodePts); | 1285 lengths.setSize(numCodePts); |
| 1281 | 1286 |
| 1282 UText fu = UTEXT_INITIALIZER; | 1287 UText fu = UTEXT_INITIALIZER; |
| 1283 utext_openUnicodeString(&fu, inString, &status); | 1288 utext_openUnicodeString(&fu, &inString, &status); |
| 1284 | 1289 |
| 1285 // Dynamic programming to find the best segmentation. | 1290 // Dynamic programming to find the best segmentation. |
| 1286 | 1291 |
| 1287 // In outer loop, i is the code point index, | 1292 // In outer loop, i is the code point index, |
| 1288 // ix is the corresponding string (code unit) index. | 1293 // ix is the corresponding string (code unit) index. |
| 1289 // They differ when the string contains supplementary characters. | 1294 // They differ when the string contains supplementary characters. |
| 1290 int32_t ix = 0; | 1295 int32_t ix = 0; |
| 1291 for (int32_t i = 0; i < numCodePts; ++i, ix = inString->moveIndex32(ix, 1)
) { | 1296 for (int32_t i = 0; i < numCodePts; ++i, ix = inString.moveIndex32(ix, 1))
{ |
| 1292 if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) { | 1297 if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) { |
| 1293 continue; | 1298 continue; |
| 1294 } | 1299 } |
| 1295 | 1300 |
| 1296 int32_t count; | 1301 int32_t count; |
| 1297 utext_setNativeIndex(&fu, ix); | 1302 utext_setNativeIndex(&fu, ix); |
| 1298 count = fDictionary->matches(&fu, maxWordSize, numCodePts, | 1303 count = fDictionary->matches(&fu, maxWordSize, numCodePts, |
| 1299 NULL, lengths.getBuffer(), values.getBuffer(), NULL
); | 1304 NULL, lengths.getBuffer(), values.getBuffer(), NULL
); |
| 1300 // Note: lengths is filled with code point lengths | 1305 // Note: lengths is filled with code point lengths |
| 1301 // The NULL parameter is the ignored code uni
t lengths. | 1306 // The NULL parameter is the ignored code uni
t lengths. |
| 1302 | 1307 |
| 1303 // if there are no single character matches found in the dictionary | 1308 // if there are no single character matches found in the dictionary |
| 1304 // starting with this character, treat character as a 1-character word | 1309 // starting with this character, treat character as a 1-character word |
| 1305 // with the highest value possible, i.e. the least likely to occur. | 1310 // with the highest value possible, i.e. the least likely to occur. |
| 1306 // Exclude Korean characters from this treatment, as they should be left | 1311 // Exclude Korean characters from this treatment, as they should be left |
| 1307 // together by default. | 1312 // together by default. |
| 1308 if ((count == 0 || lengths.elementAti(0) != 1) && | 1313 if ((count == 0 || lengths.elementAti(0) != 1) && |
| 1309 !fHangulWordSet.contains(inString->char32At(ix))) { | 1314 !fHangulWordSet.contains(inString.char32At(ix))) { |
| 1310 values.setElementAt(maxSnlp, count); // 255 | 1315 values.setElementAt(maxSnlp, count); // 255 |
| 1311 lengths.setElementAt(1, count++); | 1316 lengths.setElementAt(1, count++); |
| 1312 } | 1317 } |
| 1313 | 1318 |
| 1314 for (int32_t j = 0; j < count; j++) { | 1319 for (int32_t j = 0; j < count; j++) { |
| 1315 uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)valu
es.elementAti(j); | 1320 uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)valu
es.elementAti(j); |
| 1316 int32_t ln_j_i = lengths.elementAti(j) + i; | 1321 int32_t ln_j_i = lengths.elementAti(j) + i; |
| 1317 if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) { | 1322 if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) { |
| 1318 bestSnlp.setElementAt(newSnlp, ln_j_i); | 1323 bestSnlp.setElementAt(newSnlp, ln_j_i); |
| 1319 prev.setElementAt(i, ln_j_i); | 1324 prev.setElementAt(i, ln_j_i); |
| 1320 } | 1325 } |
| 1321 } | 1326 } |
| 1322 | 1327 |
| 1323 // In Japanese, | 1328 // In Japanese, |
| 1324 // Katakana word in single character is pretty rare. So we apply | 1329 // Katakana word in single character is pretty rare. So we apply |
| 1325 // the following heuristic to Katakana: any continuous run of Katakana | 1330 // the following heuristic to Katakana: any continuous run of Katakana |
| 1326 // characters is considered a candidate word with a default cost | 1331 // characters is considered a candidate word with a default cost |
| 1327 // specified in the katakanaCost table according to its length. | 1332 // specified in the katakanaCost table according to its length. |
| 1328 | 1333 |
| 1329 bool is_prev_katakana = false; | 1334 bool is_prev_katakana = false; |
| 1330 bool is_katakana = isKatakana(inString->char32At(ix)); | 1335 bool is_katakana = isKatakana(inString.char32At(ix)); |
| 1331 int32_t katakanaRunLength = 1; | 1336 int32_t katakanaRunLength = 1; |
| 1332 if (!is_prev_katakana && is_katakana) { | 1337 if (!is_prev_katakana && is_katakana) { |
| 1333 int32_t j = inString->moveIndex32(ix, 1); | 1338 int32_t j = inString.moveIndex32(ix, 1); |
| 1334 // Find the end of the continuous run of Katakana characters | 1339 // Find the end of the continuous run of Katakana characters |
| 1335 while (j < inString->length() && katakanaRunLength < kMaxKatakanaGro
upLength && | 1340 while (j < inString.length() && katakanaRunLength < kMaxKatakanaGrou
pLength && |
| 1336 isKatakana(inString->char32At(j))) { | 1341 isKatakana(inString.char32At(j))) { |
| 1337 j = inString->moveIndex32(j, 1); | 1342 j = inString.moveIndex32(j, 1); |
| 1338 katakanaRunLength++; | 1343 katakanaRunLength++; |
| 1339 } | 1344 } |
| 1340 if (katakanaRunLength < kMaxKatakanaGroupLength) { | 1345 if (katakanaRunLength < kMaxKatakanaGroupLength) { |
| 1341 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(kata
kanaRunLength); | 1346 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(kata
kanaRunLength); |
| 1342 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) { | 1347 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) { |
| 1343 bestSnlp.setElementAt(newSnlp, j); | 1348 bestSnlp.setElementAt(newSnlp, j); |
| 1344 prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i; | 1349 prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i; |
| 1345 } | 1350 } |
| 1346 } | 1351 } |
| 1347 } | 1352 } |
| (...skipping 25 matching lines...) Expand all Loading... |
| 1373 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { | 1378 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { |
| 1374 t_boundary.addElement(0, status); | 1379 t_boundary.addElement(0, status); |
| 1375 numBreaks++; | 1380 numBreaks++; |
| 1376 } | 1381 } |
| 1377 | 1382 |
| 1378 // Now that we're done, convert positions in t_boundary[] (indices in | 1383 // Now that we're done, convert positions in t_boundary[] (indices in |
| 1379 // the normalized input string) back to indices in the original input UText | 1384 // the normalized input string) back to indices in the original input UText |
| 1380 // while reversing t_boundary and pushing values to foundBreaks. | 1385 // while reversing t_boundary and pushing values to foundBreaks. |
| 1381 for (int32_t i = numBreaks-1; i >= 0; i--) { | 1386 for (int32_t i = numBreaks-1; i >= 0; i--) { |
| 1382 int32_t cpPos = t_boundary.elementAti(i); | 1387 int32_t cpPos = t_boundary.elementAti(i); |
| 1383 int32_t utextPos = inputMap ? inputMap->elementAti(cpPos) : cpPos + ran
geStart; | 1388 int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : c
pPos + rangeStart; |
| 1384 // Boundaries are added to foundBreaks output in ascending order. | 1389 // Boundaries are added to foundBreaks output in ascending order. |
| 1385 U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos); | 1390 U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos); |
| 1386 foundBreaks.push(utextPos, status); | 1391 foundBreaks.push(utextPos, status); |
| 1387 } | 1392 } |
| 1388 | 1393 |
| 1389 delete inString; | 1394 // inString goes out of scope |
| 1390 delete inputMap; | 1395 // inputMap goes out of scope |
| 1391 return numBreaks; | 1396 return numBreaks; |
| 1392 } | 1397 } |
| 1393 #endif | 1398 #endif |
| 1394 | 1399 |
| 1395 U_NAMESPACE_END | 1400 U_NAMESPACE_END |
| 1396 | 1401 |
| 1397 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 1402 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
| 1398 | 1403 |
| OLD | NEW |