Chromium Code Reviews

Unified Diff: icu46/source/common/dictbe.cpp

Issue 6370014: CJK segmentation patch for ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/
Patch Set: Created 9 years, 11 months ago
Index: icu46/source/common/dictbe.cpp
===================================================================
--- icu46/source/common/dictbe.cpp (revision 68397)
+++ icu46/source/common/dictbe.cpp (working copy)
@@ -16,6 +16,9 @@
#include "unicode/ubrk.h"
#include "uvector.h"
#include "triedict.h"
+#include "uassert.h"
+#include "unicode/normlzr.h"
+#include "cmemory.h"
U_NAMESPACE_BEGIN
@@ -422,6 +425,294 @@
return wordsFound;
}
+/*
+ ******************************************************************
+ * CjkBreakEngine
+ */
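+// kuint32max marks positions in the dynamic-programming cost table
+// (bestSnlp below) that no candidate segmentation has reached yet.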
+static const uint32_t kuint32max = 0xFFFFFFFF;
+CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, LanguageType type, UErrorCode &status)
+: DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){
+ if (!adoptDictionary->getValued()) {
+ status = U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+
+ // Korean dictionary only includes Hangul syllables
+ fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
+ fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
+ fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
+ fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
+
+ if (U_SUCCESS(status)) {
+ // handle Korean and Japanese/Chinese using different dictionaries
+ if (type == kKorean) {
+ setCharacters(fHangulWordSet);
+ } else { //Chinese and Japanese
+ UnicodeSet cjSet;
+ cjSet.addAll(fHanWordSet);
+ cjSet.addAll(fKatakanaWordSet);
+ cjSet.addAll(fHiraganaWordSet);
+ // Add the prolonged sound marks individually: UNICODE_STRING_SIMPLE does
+ // not process \u escapes, and UnicodeSet::add(const UnicodeString&) would
+ // add the pair as one multi-character element rather than two characters.
+ cjSet.add(0xff70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
+ cjSet.add(0x30fc); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
+ setCharacters(cjSet);
+ }
+ }
+}
+
+CjkBreakEngine::~CjkBreakEngine(){
+ delete fDictionary;
+}
+
+// The katakanaCost values below are based on the length frequencies of all
+// katakana phrases in the dictionary
+static const int kMaxKatakanaLength = 8;
+static const int kMaxKatakanaGroupLength = 20;
+static const uint32_t maxSnlp = 255;
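+// A word's "snlp" is its cost, roughly a scaled negative log probability:
+// the smaller the cost, the more likely the word. maxSnlp is the cost
+// assigned to characters not found in the dictionary at all.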
+
+static inline uint32_t getKatakanaCost(int wordLength){
+ //TODO: fill array with actual values from dictionary!
+ static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
+ = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
+ return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
+}
+
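+// isKatakana() covers fullwidth katakana (U+30A1..U+30FE, excluding
+// U+30FB KATAKANA MIDDLE DOT) and the halfwidth forms (U+FF66..U+FF9F).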
+static inline bool isKatakana(uint16_t value) {
+ return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
+ (value >= 0xFF66u && value <= 0xFF9Fu);
+}
+
+// A very simple helper class to streamline the buffer handling in
+// divideUpDictionaryRange.
+template<class T, size_t N>
+class AutoBuffer {
+ public:
+ AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
+ if (size > N) {
+ buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
+ capacity = size;
+ }
+ }
+ ~AutoBuffer() {
+ if (buffer != stackBuffer)
+ uprv_free(buffer);
+ }
+ T* elems() {
+ return buffer;
+ }
+ const T& operator[] (size_t i) const {
+ return buffer[i];
+ }
+ T& operator[] (size_t i) {
+ return buffer[i];
+ }
+
+ // resize without copy
+ void resize(size_t size) {
+ if (size <= capacity)
+ return;
+ if (buffer != stackBuffer)
+ uprv_free(buffer);
+ buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
+ capacity = size;
+ }
+ private:
+ AutoBuffer(); // default construction (without a size) is disallowed
+ T stackBuffer[N];
+ T* buffer;
+ size_t capacity;
+};
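+// For example, AutoBuffer<UChar, 80> buf(n) keeps its storage on the
+// stack for n <= 80 and switches to uprv_malloc'ed storage for larger
+// inputs, releasing it again in the destructor.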
+
+
+/*
+ * @param text A UText representing the text
+ * @param rangeStart The start of the range of dictionary characters
+ * @param rangeEnd The end of the range of dictionary characters
+ * @param foundBreaks Output UStack; break positions are pushed onto it
+ * @return The number of breaks found
+ */
+int32_t
+CjkBreakEngine::divideUpDictionaryRange( UText *text,
+ int32_t rangeStart,
+ int32_t rangeEnd,
+ UStack &foundBreaks ) const {
+ if (rangeStart >= rangeEnd) {
+ return 0;
+ }
+
+ const size_t defaultInputLength = 80;
+ size_t inputLength = rangeEnd - rangeStart;
+ AutoBuffer<UChar, defaultInputLength> charString(inputLength);
+
+ // Normalize the input string and put it in normalizedText.
+ // The map from the indices of the normalized input to the raw
+ // input is kept in charPositions.
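+ // For example, NFKC composes the halfwidth pair <U+FF76, U+FF9E> into the
+ // single code point U+30AC, so charPositions[1] would map normalized
+ // index 1 back to raw index 2.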
+ UErrorCode status = U_ZERO_ERROR;
+ utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
+ if (U_FAILURE(status))
+ return 0;
+
+ UnicodeString inputString(charString.elems(), inputLength);
+ UNormalizationMode norm_mode = UNORM_NFKC;
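+ // quickCheck() may return UNORM_MAYBE for text that is in fact already
+ // normalized; isNormalized() then performs the definitive (slower) check
+ // before we resort to actually normalizing.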
+ UBool isNormalized =
+ Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
+ Normalizer::isNormalized(inputString, norm_mode, status);
+
+ AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
+ int numChars = 0;
+ UText normalizedText = UTEXT_INITIALIZER;
+ // Needs to be declared here because normalizedText holds onto its buffer.
+ UnicodeString normalizedString;
+ if (isNormalized) {
+ int32_t index = 0;
+ charPositions[0] = 0;
+ while(index < inputString.length()) {
+ index = inputString.moveIndex32(index, 1);
+ charPositions[++numChars] = index;
+ }
+ utext_openUnicodeString(&normalizedText, &inputString, &status);
+ }
+ else {
+ Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
+ if (U_FAILURE(status))
+ return 0;
+ charPositions.resize(normalizedString.length() + 1);
+ Normalizer normalizer(charString.elems(), inputLength, norm_mode);
+ int32_t index = 0;
+ charPositions[0] = 0;
+ while (index < normalizer.endIndex()) {
+ normalizer.next(); // advance one normalized code point; only the raw index is recorded
+ charPositions[++numChars] = index = normalizer.getIndex();
+ }
+ utext_openUnicodeString(&normalizedText, &normalizedString, &status);
+ }
+
+ if (U_FAILURE(status))
+ return 0;
+
+ // From this point on, all the indices refer to the indices of
+ // the normalized input string.
+
+ // bestSnlp[i] is the snlp of the best segmentation of the first i
+ // characters in the range to be matched.
+ AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
+ bestSnlp[0] = 0;
+ for (int i = 1; i <= numChars; i++) {
+ bestSnlp[i] = kuint32max;
+ }
+
+ // prev[i] is the index of the last CJK character in the previous word in
+ // the best segmentation of the first i characters.
+ AutoBuffer<int, defaultInputLength> prev(numChars + 1);
+ for (int i = 0; i <= numChars; i++) {
+ prev[i] = -1;
+ }
+
+ const size_t maxWordSize = 20;
+ AutoBuffer<uint16_t, maxWordSize> values(numChars);
+ AutoBuffer<int32_t, maxWordSize> lengths(numChars);
+
+ // Dynamic programming to find the best segmentation.
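+ // The recurrence: for every reachable position i and every dictionary
+ // word of length l starting at i,
+ // bestSnlp[i + l] = min(bestSnlp[i + l], bestSnlp[i] + cost(word)),
+ // i.e. a Viterbi-style shortest path over the lattice of candidate words.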
+ bool is_prev_katakana = false;
+ for (int i = 0; i < numChars; ++i) {
+ utext_setNativeIndex(&normalizedText, i);
+ if (bestSnlp[i] == kuint32max)
+ continue;
+
+ int count;
+ // limit maximum word length matched to size of current substring
+ int maxSearchLength = (i + maxWordSize < (size_t)numChars) ? maxWordSize : numChars - i;
+
+ fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());
+
+ // If there is no single-character match in the dictionary starting at
+ // this character, treat the character as a 1-character word with the
+ // highest cost possible, i.e. the least likely to occur.
+ // Korean characters are excluded from this treatment, since Hangul
+ // syllables should be kept together by default.
+ if ((count == 0 || lengths[0] != 1) &&
+ !fHangulWordSet.contains(utext_current32(&normalizedText))) {
+ values[count] = maxSnlp;
+ lengths[count++] = 1;
+ }
+
+ for (int j = 0; j < count; j++){
+ uint32_t newSnlp = bestSnlp[i] + values[j];
+ if (newSnlp < bestSnlp[lengths[j] + i]) {
+ bestSnlp[lengths[j] + i] = newSnlp;
+ prev[lengths[j] + i] = i;
+ }
+ }
+
+ // In Japanese, single-character Katakana words are rare, so the
+ // following heuristic is applied to Katakana: any continuous run of
+ // Katakana characters is considered a candidate word with a default
+ // cost taken from the katakanaCost table according to its length.
+ utext_setNativeIndex(&normalizedText, i);
+ bool is_katakana = isKatakana(utext_current32(&normalizedText));
+ if (!is_prev_katakana && is_katakana) {
+ int j = i + 1;
+ utext_next32(&normalizedText);
+ // Find the end of the continuous run of Katakana characters
+ while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
+ isKatakana(utext_current32(&normalizedText))) {
+ utext_next32(&normalizedText);
+ ++j;
+ }
+ if ((j - i) < kMaxKatakanaGroupLength) {
+ uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
+ if (newSnlp < bestSnlp[j]) {
+ bestSnlp[j] = newSnlp;
+ prev[j] = i;
+ }
+ }
+ }
+ is_prev_katakana = is_katakana;
+ }
+
+ // Start pushing the optimal offsets into t_boundary (t for tentative).
+ // prev[numChars] is guaranteed to be meaningful.
+ // The offsets are collected in reverse order, i.e. t_boundary[0] = numChars;
+ // they are reversed again when copied into foundBreaks below.
+ AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
+
+ int numBreaks = 0;
+ // No segmentation found, set boundary to end of range
+ if (bestSnlp[numChars] == kuint32max) {
+ t_boundary[numBreaks++] = numChars;
+ } else {
+ for (int i = numChars; i > 0; i = prev[i]) {
+ t_boundary[numBreaks++] = i;
+ }
+ U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);
+ }
+
+ // Don't add a break for the start of the dictionary range if there is
+ // already one there.
+ if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
+ t_boundary[numBreaks++] = 0;
+ }
+
+ // Now that we're done, convert the positions in t_boundary (indices in
+ // the normalized input string) back to indices in the raw input string
+ // while reversing t_boundary and pushing values to foundBreaks.
+ for (int i = numBreaks-1; i >= 0; i--) {
+ foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
+ }
+
+ utext_close(&normalizedText);
+ return numBreaks;
+}
+
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
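
For context, nothing calls CjkBreakEngine directly: the rule-based break iterator hands each run of characters in the set passed to setCharacters() to divideUpDictionaryRange(). A minimal sketch of how the patched segmentation could be exercised through the public word BreakIterator API (the Japanese sample text and the output format are illustrative only, not part of the patch):

    #include <unicode/brkiter.h>
    #include <unicode/unistr.h>
    #include <cstdio>

    int main() {
        UErrorCode status = U_ZERO_ERROR;
        // Word-break instances route CJK runs through the dictionary engines.
        icu::BreakIterator* bi = icu::BreakIterator::createWordInstance(
            icu::Locale::getJapanese(), status);
        if (U_FAILURE(status)) return 1;

        icu::UnicodeString text = icu::UnicodeString::fromUTF8("今日はいい天気ですね。");
        bi->setText(text);
        // Print every boundary the segmenter reports, as UTF-16 offsets.
        for (int32_t pos = bi->first(); pos != icu::BreakIterator::DONE;
             pos = bi->next()) {
            printf("boundary at %d\n", static_cast<int>(pos));
        }
        delete bi;
        return 0;
    }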