| Index: source/common/dictbe.cpp
|
| diff --git a/source/common/dictbe.cpp b/source/common/dictbe.cpp
|
| index 88e9139ecea133a2aeee2eb082aab6469f696a72..f1c874d4ad16445578ae4b6d4889c6d9b2620844 100644
|
| --- a/source/common/dictbe.cpp
|
| +++ b/source/common/dictbe.cpp
|
| @@ -1,6 +1,6 @@
|
| /**
|
| *******************************************************************************
|
| - * Copyright (C) 2006-2014, International Business Machines Corporation
|
| + * Copyright (C) 2006-2015, International Business Machines Corporation
|
| * and others. All Rights Reserved.
|
| *******************************************************************************
|
| */
|
| @@ -832,11 +832,11 @@ foundBest:
|
| static const int32_t KHMER_LOOKAHEAD = 3;
|
|
|
| // Will not combine a non-word with a preceding dictionary word longer than this
|
| -static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 10;
|
| +static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
|
|
|
| // Will not combine a non-word that shares at least this much prefix with a
|
| // dictionary word, with a preceding word
|
| -static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 5;
|
| +static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
|
|
|
| // Minimum word size
|
| static const int32_t KHMER_MIN_WORD = 2;
|
| @@ -1138,12 +1138,12 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
| return 0;
|
| }
|
|
|
| - // UnicodeString version of input UText, NFKC normalized in necessary.
|
| - UnicodeString *inString;
|
| + // UnicodeString version of input UText, NFKC normalized if necessary.
|
| + UnicodeString inString;
|
|
|
| // inputMap[inStringIndex] = corresponding native index from UText inText.
|
| // If NULL then mapping is 1:1
|
| - UVector32 *inputMap = NULL;
|
| + LocalPointer<UVector32> inputMap;
|
|
|
| UErrorCode status = U_ZERO_ERROR;
|
|
|
| @@ -1153,12 +1153,12 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
| inText->chunkNativeStart <= rangeStart &&
|
| inText->chunkNativeLimit >= rangeEnd &&
|
| inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
|
| -
|
| - // Input UTtxt is in one contiguous UTF-16 chunk.
|
| - // Use Read-only aliasing UnicodeString constructor on it.
|
| - inString = new UnicodeString(FALSE,
|
| - inText->chunkContents + rangeStart - inText->chunkNativeStart,
|
| - rangeEnd - rangeStart);
|
| +
|
| + // Input UText is in one contiguous UTF-16 chunk.
|
| + // Use Read-only aliasing UnicodeString.
|
| + inString.setTo(FALSE,
|
| + inText->chunkContents + rangeStart - inText->chunkNativeStart,
|
| + rangeEnd - rangeStart);
|
| } else {
|
| // Copy the text from the original inText (UText) to inString (UnicodeString).
|
| // Create a map from UnicodeString indices -> UText offsets.
|
| @@ -1168,14 +1168,16 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
| if (limit > utext_nativeLength(inText)) {
|
| limit = utext_nativeLength(inText);
|
| }
|
| - inString = new UnicodeString;
|
| - inputMap = new UVector32(status);
|
| + inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
|
| + if (U_FAILURE(status)) {
|
| + return 0;
|
| + }
|
| while (utext_getNativeIndex(inText) < limit) {
|
| int32_t nativePosition = utext_getNativeIndex(inText);
|
| UChar32 c = utext_next32(inText);
|
| U_ASSERT(c != U_SENTINEL);
|
| - inString->append(c);
|
| - while (inputMap->size() < inString->length()) {
|
| + inString.append(c);
|
| + while (inputMap->size() < inString.length()) {
|
| inputMap->addElement(nativePosition, status);
|
| }
|
| }
|
| @@ -1183,67 +1185,70 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
| }
|
|
|
|
|
| - if (!nfkcNorm2->isNormalized(*inString, status)) {
|
| - UnicodeString *normalizedInput = new UnicodeString();
|
| + if (!nfkcNorm2->isNormalized(inString, status)) {
|
| + UnicodeString normalizedInput;
|
| // normalizedMap[normalizedInput position] == original UText position.
|
| - UVector32 *normalizedMap = new UVector32(status);
|
| + LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
|
| if (U_FAILURE(status)) {
|
| return 0;
|
| }
|
|
|
| UnicodeString fragment;
|
| UnicodeString normalizedFragment;
|
| - for (int32_t srcI = 0; srcI < inString->length();) { // Once per normalization chunk
|
| + for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk
|
| fragment.remove();
|
| int32_t fragmentStartI = srcI;
|
| - UChar32 c = inString->char32At(srcI);
|
| + UChar32 c = inString.char32At(srcI);
|
| for (;;) {
|
| fragment.append(c);
|
| - srcI = inString->moveIndex32(srcI, 1);
|
| - if (srcI == inString->length()) {
|
| + srcI = inString.moveIndex32(srcI, 1);
|
| + if (srcI == inString.length()) {
|
| break;
|
| }
|
| - c = inString->char32At(srcI);
|
| + c = inString.char32At(srcI);
|
| if (nfkcNorm2->hasBoundaryBefore(c)) {
|
| break;
|
| }
|
| }
|
| nfkcNorm2->normalize(fragment, normalizedFragment, status);
|
| - normalizedInput->append(normalizedFragment);
|
| + normalizedInput.append(normalizedFragment);
|
|
|
| // Map every position in the normalized chunk to the start of the chunk
|
| // in the original input.
|
| - int32_t fragmentOriginalStart = inputMap? inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
|
| - while (normalizedMap->size() < normalizedInput->length()) {
|
| + int32_t fragmentOriginalStart = inputMap.isValid() ?
|
| + inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
|
| + while (normalizedMap->size() < normalizedInput.length()) {
|
| normalizedMap->addElement(fragmentOriginalStart, status);
|
| if (U_FAILURE(status)) {
|
| break;
|
| }
|
| }
|
| }
|
| - U_ASSERT(normalizedMap->size() == normalizedInput->length());
|
| - int32_t nativeEnd = inputMap? inputMap->elementAti(inString->length()) : inString->length()+rangeStart;
|
| + U_ASSERT(normalizedMap->size() == normalizedInput.length());
|
| + int32_t nativeEnd = inputMap.isValid() ?
|
| + inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
|
| normalizedMap->addElement(nativeEnd, status);
|
|
|
| - delete inputMap;
|
| - inputMap = normalizedMap;
|
| - delete inString;
|
| - inString = normalizedInput;
|
| + inputMap.moveFrom(normalizedMap);
|
| + inString.moveFrom(normalizedInput);
|
| }
|
|
|
| - int32_t numCodePts = inString->countChar32();
|
| - if (numCodePts != inString->length()) {
|
| + int32_t numCodePts = inString.countChar32();
|
| + if (numCodePts != inString.length()) {
|
| // There are supplementary characters in the input.
|
| // The dictionary will produce boundary positions in terms of code point indexes,
|
| // not in terms of code unit string indexes.
|
| // Use the inputMap mechanism to take care of this in addition to indexing differences
|
| // from normalization and/or UTF-8 input.
|
| - UBool hadExistingMap = (inputMap != NULL);
|
| + UBool hadExistingMap = inputMap.isValid();
|
| if (!hadExistingMap) {
|
| - inputMap = new UVector32(status);
|
| + inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
|
| + if (U_FAILURE(status)) {
|
| + return 0;
|
| + }
|
| }
|
| int32_t cpIdx = 0;
|
| - for (int32_t cuIdx = 0; ; cuIdx = inString->moveIndex32(cuIdx, 1)) {
|
| + for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
|
| U_ASSERT(cuIdx >= cpIdx);
|
| if (hadExistingMap) {
|
| inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
|
| @@ -1251,7 +1256,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
| inputMap->addElement(cuIdx+rangeStart, status);
|
| }
|
| cpIdx++;
|
| - if (cuIdx == inString->length()) {
|
| + if (cuIdx == inString.length()) {
|
| break;
|
| }
|
| }
|
| @@ -1280,7 +1285,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
| lengths.setSize(numCodePts);
|
|
|
| UText fu = UTEXT_INITIALIZER;
|
| - utext_openUnicodeString(&fu, inString, &status);
|
| + utext_openUnicodeString(&fu, &inString, &status);
|
|
|
| // Dynamic programming to find the best segmentation.
|
|
|
| @@ -1288,7 +1293,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
| // ix is the corresponding string (code unit) index.
|
| // They differ when the string contains supplementary characters.
|
| int32_t ix = 0;
|
| - for (int32_t i = 0; i < numCodePts; ++i, ix = inString->moveIndex32(ix, 1)) {
|
| + for (int32_t i = 0; i < numCodePts; ++i, ix = inString.moveIndex32(ix, 1)) {
|
| if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
|
| continue;
|
| }
|
| @@ -1306,7 +1311,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
| // Exclude Korean characters from this treatment, as they should be left
|
| // together by default.
|
| if ((count == 0 || lengths.elementAti(0) != 1) &&
|
| - !fHangulWordSet.contains(inString->char32At(ix))) {
|
| + !fHangulWordSet.contains(inString.char32At(ix))) {
|
| values.setElementAt(maxSnlp, count); // 255
|
| lengths.setElementAt(1, count++);
|
| }
|
| @@ -1327,14 +1332,14 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
| // specified in the katakanaCost table according to its length.
|
|
|
| bool is_prev_katakana = false;
|
| - bool is_katakana = isKatakana(inString->char32At(ix));
|
| + bool is_katakana = isKatakana(inString.char32At(ix));
|
| int32_t katakanaRunLength = 1;
|
| if (!is_prev_katakana && is_katakana) {
|
| - int32_t j = inString->moveIndex32(ix, 1);
|
| + int32_t j = inString.moveIndex32(ix, 1);
|
| // Find the end of the continuous run of Katakana characters
|
| - while (j < inString->length() && katakanaRunLength < kMaxKatakanaGroupLength &&
|
| - isKatakana(inString->char32At(j))) {
|
| - j = inString->moveIndex32(j, 1);
|
| + while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
|
| + isKatakana(inString.char32At(j))) {
|
| + j = inString.moveIndex32(j, 1);
|
| katakanaRunLength++;
|
| }
|
| if (katakanaRunLength < kMaxKatakanaGroupLength) {
|
| @@ -1380,14 +1385,14 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
| // while reversing t_boundary and pushing values to foundBreaks.
|
| for (int32_t i = numBreaks-1; i >= 0; i--) {
|
| int32_t cpPos = t_boundary.elementAti(i);
|
| - int32_t utextPos = inputMap ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
|
| + int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
|
| // Boundaries are added to foundBreaks output in ascending order.
|
| U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
|
| foundBreaks.push(utextPos, status);
|
| }
|
|
|
| - delete inString;
|
| - delete inputMap;
|
| + // inString goes out of scope
|
| + // inputMap goes out of scope
|
| return numBreaks;
|
| }
|
| #endif
|
|
|