Index: icu46/source/common/dictbe.cpp |
=================================================================== |
--- icu46/source/common/dictbe.cpp (revision 68397) |
+++ icu46/source/common/dictbe.cpp (working copy) |
@@ -16,6 +16,9 @@ |
#include "unicode/ubrk.h" |
#include "uvector.h" |
#include "triedict.h" |
+#include "uassert.h" |
+#include "unicode/normlzr.h" |
+#include "cmemory.h" |
U_NAMESPACE_BEGIN |
@@ -422,6 +425,294 @@ |
return wordsFound; |
} |
+/* |
+ ****************************************************************** |
+ * CjkBreakEngine |
+ */ |
+static const uint32_t kuint32max = 0xFFFFFFFF; |
+CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status) |
+: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){ |
+ if (!adoptDictionary->getValued()) { |
+ status = U_ILLEGAL_ARGUMENT_ERROR; |
+ return; |
+ } |
+ |
+ // Korean dictionary only includes Hangul syllables |
+ fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); |
+ fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); |
+ fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); |
+ fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); |
+ |
+ if (U_SUCCESS(status)) { |
+ // handle Korean and Japanese/Chinese using different dictionaries |
+ if (type == kKorean) { |
+ setCharacters(fHangulWordSet); |
+ } else { //Chinese and Japanese |
+ UnicodeSet cjSet; |
+ cjSet.addAll(fHanWordSet); |
+ cjSet.addAll(fKatakanaWordSet); |
+ cjSet.addAll(fHiraganaWordSet); |
+ cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc")); |
+ setCharacters(cjSet); |
+ } |
+ } |
+} |
+ |
+CjkBreakEngine::~CjkBreakEngine(){ |
+ delete fDictionary; |
+} |
+ |
+// The katakanaCost values below are based on the length frequencies of all |
+// katakana phrases in the dictionary |
+static const int kMaxKatakanaLength = 8; |
+static const int kMaxKatakanaGroupLength = 20; |
+static const uint32_t maxSnlp = 255; |
+ |
+static inline uint32_t getKatakanaCost(int wordLength){ |
+ //TODO: fill array with actual values from dictionary! |
+ static const uint32_t katakanaCost[kMaxKatakanaLength + 1] |
+ = {8192, 984, 408, 240, 204, 252, 300, 372, 480}; |
+ return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength]; |
+} |
+ |
+static inline bool isKatakana(uint16_t value) { |
+ return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) || |
+ (value >= 0xFF66u && value <= 0xFF9fu); |
+} |
+ |
+// A very simple helper class to streamline the buffer handling in |
+// divideUpDictionaryRange. |
+template<class T, size_t N> |
+class AutoBuffer { |
+ public: |
+ AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) { |
+ if (size > N) { |
+ buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); |
+ capacity = size; |
+ } |
+ } |
+ ~AutoBuffer() { |
+ if (buffer != stackBuffer) |
+ uprv_free(buffer); |
+ } |
+#if 0 |
+ T* operator& () { |
+ return buffer; |
+ } |
+#endif |
+ T* elems() { |
+ return buffer; |
+ } |
+ const T& operator[] (size_t i) const { |
+ return buffer[i]; |
+ } |
+ T& operator[] (size_t i) { |
+ return buffer[i]; |
+ } |
+ |
+ // resize without copy |
+ void resize(size_t size) { |
+ if (size <= capacity) |
+ return; |
+ if (buffer != stackBuffer) |
+ uprv_free(buffer); |
+ buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); |
+ capacity = size; |
+ } |
+ private: |
+ T stackBuffer[N]; |
+ T* buffer; |
+ AutoBuffer(); |
+ size_t capacity; |
+}; |
+ |
+ |
+/* |
+ * @param text A UText representing the text |
+ * @param rangeStart The start of the range of dictionary characters |
+ * @param rangeEnd The end of the range of dictionary characters |
+ * @param foundBreaks Output of C array of int32_t break positions, or 0 |
+ * @return The number of breaks found |
+ */ |
+int32_t |
+CjkBreakEngine::divideUpDictionaryRange( UText *text, |
+ int32_t rangeStart, |
+ int32_t rangeEnd, |
+ UStack &foundBreaks ) const { |
+ if (rangeStart >= rangeEnd) { |
+ return 0; |
+ } |
+ |
+ const size_t defaultInputLength = 80; |
+ size_t inputLength = rangeEnd - rangeStart; |
+ AutoBuffer<UChar, defaultInputLength> charString(inputLength); |
+ |
+ // Normalize the input string and put it in normalizedText. |
+ // The map from the indices of the normalized input to the raw |
+ // input is kept in charPositions. |
+ UErrorCode status = U_ZERO_ERROR; |
+ utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status); |
+ if (U_FAILURE(status)) |
+ return 0; |
+ |
+ UnicodeString inputString(charString.elems(), inputLength); |
+ UNormalizationMode norm_mode = UNORM_NFKC; |
+ UBool isNormalized = |
+ Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES || |
+ Normalizer::isNormalized(inputString, norm_mode, status); |
+ |
+ AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1); |
+ int numChars = 0; |
+ UText normalizedText = UTEXT_INITIALIZER; |
+ // Needs to be declared here because normalizedText holds onto its buffer. |
+ UnicodeString normalizedString; |
+ if (isNormalized) { |
+ int32_t index = 0; |
+ charPositions[0] = 0; |
+ while(index < inputString.length()) { |
+ index = inputString.moveIndex32(index, 1); |
+ charPositions[++numChars] = index; |
+ } |
+ utext_openUnicodeString(&normalizedText, &inputString, &status); |
+ } |
+ else { |
+ Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status); |
+ if (U_FAILURE(status)) |
+ return 0; |
+ charPositions.resize(normalizedString.length() + 1); |
+ Normalizer normalizer(charString.elems(), inputLength, norm_mode); |
+ int32_t index = 0; |
+ charPositions[0] = 0; |
+ while(index < normalizer.endIndex()){ |
+ UChar32 uc = normalizer.next(); |
+ charPositions[++numChars] = index = normalizer.getIndex(); |
+ } |
+ utext_openUnicodeString(&normalizedText, &normalizedString, &status); |
+ } |
+ |
+ if (U_FAILURE(status)) |
+ return 0; |
+ |
+ // From this point on, all the indices refer to the indices of |
+ // the normalized input string. |
+ |
+ // bestSnlp[i] is the snlp of the best segmentation of the first i |
+ // characters in the range to be matched. |
+ AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1); |
+ bestSnlp[0] = 0; |
+ for(int i=1; i<=numChars; i++){ |
+ bestSnlp[i] = kuint32max; |
+ } |
+ |
+ // prev[i] is the index of the last CJK character in the previous word in |
+ // the best segmentation of the first i characters. |
+ AutoBuffer<int, defaultInputLength> prev(numChars + 1); |
+ for(int i=0; i<=numChars; i++){ |
+ prev[i] = -1; |
+ } |
+ |
+ const size_t maxWordSize = 20; |
+ AutoBuffer<uint16_t, maxWordSize> values(numChars); |
+ AutoBuffer<int32_t, maxWordSize> lengths(numChars); |
+ |
+ // Dynamic programming to find the best segmentation. |
+ bool is_prev_katakana = false; |
+ for (int i = 0; i < numChars; ++i) { |
+ //utext_setNativeIndex(text, rangeStart + i); |
+ utext_setNativeIndex(&normalizedText, i); |
+ if (bestSnlp[i] == kuint32max) |
+ continue; |
+ |
+ int count; |
+ // limit maximum word length matched to size of current substring |
+ int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize: numChars - i; |
+ |
+ fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems()); |
+ |
+ // if there are no single character matches found in the dictionary |
+ // starting with this character, treat character as a 1-character word |
+ // with the highest value possible, i.e. the least likely to occur. |
+ // Exclude Korean characters from this treatment, as they should be left |
+ // together by default. |
+ if((count == 0 || lengths[0] != 1) && |
+ !fHangulWordSet.contains(utext_current32(&normalizedText))){ |
+ values[count] = maxSnlp; |
+ lengths[count++] = 1; |
+ } |
+ |
+ for (int j = 0; j < count; j++){ |
+ //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp); |
+ uint32_t newSnlp = bestSnlp[i] + values[j]; |
+ if (newSnlp < bestSnlp[lengths[j] + i]) { |
+ bestSnlp[lengths[j] + i] = newSnlp; |
+ prev[lengths[j] + i] = i; |
+ } |
+ } |
+ |
+ // In Japanese, |
+ // a Katakana word consisting of a single character is pretty rare. So we apply |
+ // the following heuristic to Katakana: any continuous run of Katakana |
+ // characters is considered a candidate word with a default cost |
+ // specified in the katakanaCost table according to its length. |
+ //utext_setNativeIndex(text, rangeStart + i); |
+ utext_setNativeIndex(&normalizedText, i); |
+ bool is_katakana = isKatakana(utext_current32(&normalizedText)); |
+ if (!is_prev_katakana && is_katakana) { |
+ int j = i + 1; |
+ utext_next32(&normalizedText); |
+ // Find the end of the continuous run of Katakana characters |
+ while (j < numChars && (j - i) < kMaxKatakanaGroupLength && |
+ isKatakana(utext_current32(&normalizedText))) { |
+ utext_next32(&normalizedText); |
+ ++j; |
+ } |
+ if ((j - i) < kMaxKatakanaGroupLength) { |
+ uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i); |
+ if (newSnlp < bestSnlp[j]) { |
+ bestSnlp[j] = newSnlp; |
+ prev[j] = i; |
+ } |
+ } |
+ } |
+ is_prev_katakana = is_katakana; |
+ } |
+ |
+ // Start pushing the optimal offset index into t_boundary (t for tentative). |
+ // prev[numChars] is guaranteed to be meaningful. |
+ // We'll first push in the reverse order, i.e., |
+ // t_boundary[0] = numChars, and afterwards do a swap. |
+ AutoBuffer<int, maxWordSize> t_boundary(numChars + 1); |
+ |
+ int numBreaks = 0; |
+ // No segmentation found, set boundary to end of range |
+ if (bestSnlp[numChars] == kuint32max) { |
+ t_boundary[numBreaks++] = numChars; |
+ } else { |
+ for (int i = numChars; i > 0; i = prev[i]){ |
+ t_boundary[numBreaks++] = i; |
+ |
+ } |
+ U_ASSERT(prev[t_boundary[numBreaks-1]] == 0); |
+ } |
+ |
+ // Reverse offset index in t_boundary. |
+ // Don't add a break for the start of the dictionary range if there is one |
+ // there already. |
+ if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { |
+ t_boundary[numBreaks++] = 0; |
+ } |
+ |
+ // Now that we're done, convert positions in t_boundary[] (indices in |
+ // the normalized input string) back to indices in the raw input string |
+ // while reversing t_boundary and pushing values to foundBreaks. |
+ for (int i = numBreaks-1; i >= 0; i--) { |
+ foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status); |
+ } |
+ |
+ utext_close(&normalizedText); |
+ return numBreaks; |
+} |
+ |
U_NAMESPACE_END |
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |