OLD | NEW |
1 /** | 1 /** |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 2006-2008, International Business Machines Corporation and othe
rs. * | 3 * Copyright (C) 2006-2008, International Business Machines Corporation and othe
rs. * |
4 * All Rights Reserved. * | 4 * All Rights Reserved. * |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 */ | 6 */ |
7 | 7 |
8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
9 | 9 |
10 #if !UCONFIG_NO_BREAK_ITERATION | 10 #if !UCONFIG_NO_BREAK_ITERATION |
11 | 11 |
12 #include "brkeng.h" | 12 #include "brkeng.h" |
13 #include "dictbe.h" | 13 #include "dictbe.h" |
14 #include "unicode/uniset.h" | 14 #include "unicode/uniset.h" |
15 #include "unicode/chariter.h" | 15 #include "unicode/chariter.h" |
16 #include "unicode/ubrk.h" | 16 #include "unicode/ubrk.h" |
17 #include "uvector.h" | 17 #include "uvector.h" |
18 #include "triedict.h" | 18 #include "triedict.h" |
| 19 #include "uassert.h" |
| 20 #include "unicode/normlzr.h" |
| 21 #include "cmemory.h" |
19 | 22 |
20 U_NAMESPACE_BEGIN | 23 U_NAMESPACE_BEGIN |
21 | 24 |
22 /* | 25 /* |
23 ****************************************************************** | 26 ****************************************************************** |
24 */ | 27 */ |
25 | 28 |
26 /*DictionaryBreakEngine::DictionaryBreakEngine() { | 29 /*DictionaryBreakEngine::DictionaryBreakEngine() { |
27 fTypes = 0; | 30 fTypes = 0; |
28 }*/ | 31 }*/ |
(...skipping 386 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
415 | 418 |
416 // Don't return a break for the end of the dictionary range if there is one
there. | 419 // Don't return a break for the end of the dictionary range if there is one
there. |
417 if (foundBreaks.peeki() >= rangeEnd) { | 420 if (foundBreaks.peeki() >= rangeEnd) { |
418 (void) foundBreaks.popi(); | 421 (void) foundBreaks.popi(); |
419 wordsFound -= 1; | 422 wordsFound -= 1; |
420 } | 423 } |
421 | 424 |
422 return wordsFound; | 425 return wordsFound; |
423 } | 426 } |
424 | 427 |
| 428 /* |
| 429 ****************************************************************** |
| 430 * CjkBreakEngine |
| 431 */ |
| 432 static const uint32_t kuint32max = 0xFFFFFFFF; |
| 433 CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, Langua
geType type, UErrorCode &status) |
| 434 : DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){ |
| 435 if (!adoptDictionary->getValued()) { |
| 436 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 437 return; |
| 438 } |
| 439 |
| 440 // Korean dictionary only includes Hangul syllables |
| 441 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), stat
us); |
| 442 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); |
| 443 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\u
ff9f]"), status); |
| 444 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status)
; |
| 445 |
| 446 if (U_SUCCESS(status)) { |
| 447 // handle Korean and Japanese/Chinese using different dictionaries |
| 448 if (type == kKorean) { |
| 449 setCharacters(fHangulWordSet); |
| 450 } else { //Chinese and Japanese |
| 451 UnicodeSet cjSet; |
| 452 cjSet.addAll(fHanWordSet); |
| 453 cjSet.addAll(fKatakanaWordSet); |
| 454 cjSet.addAll(fHiraganaWordSet); |
| 455 cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc")); |
| 456 setCharacters(cjSet); |
| 457 } |
| 458 } |
| 459 } |
| 460 |
// Releases the dictionary adopted in the constructor. The dictionary is
// adopted even when construction fails, so deleting unconditionally is safe.
CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;
}
| 464 |
// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int kMaxKatakanaLength = 8;
static const int kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

// Return the default segmentation cost for a Katakana run of the given
// length. Runs longer than the table (kMaxKatakanaLength) get the maximum
// penalty, the same value used for a zero-length run.
static inline uint32_t getKatakanaCost(int wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1] =
        {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    if (wordLength > kMaxKatakanaLength) {
        return 8192;
    }
    return katakanaCost[wordLength];
}
| 477 |
// True if the UTF-16 code unit is Katakana: the main Katakana block
// U+30A1..U+30FE excluding the middle dot U+30FB, or a halfwidth
// Katakana form U+FF66..U+FF9F.
static inline bool isKatakana(uint16_t value) {
    if (value >= 0x30A1u && value <= 0x30FEu) {
        return value != 0x30FBu;
    }
    return value >= 0xFF66u && value <= 0xFF9Fu;
}
| 482 |
| 483 // A very simple helper class to streamline the buffer handling in |
| 484 // divideUpDictionaryRange. |
| 485 template<class T, size_t N> |
| 486 class AutoBuffer { |
| 487 public: |
| 488 AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) { |
| 489 if (size > N) { |
| 490 buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); |
| 491 capacity = size; |
| 492 } |
| 493 } |
| 494 ~AutoBuffer() { |
| 495 if (buffer != stackBuffer) |
| 496 uprv_free(buffer); |
| 497 } |
| 498 #if 0 |
| 499 T* operator& () { |
| 500 return buffer; |
| 501 } |
| 502 #endif |
| 503 T* elems() { |
| 504 return buffer; |
| 505 } |
| 506 const T& operator[] (size_t i) const { |
| 507 return buffer[i]; |
| 508 } |
| 509 T& operator[] (size_t i) { |
| 510 return buffer[i]; |
| 511 } |
| 512 |
| 513 // resize without copy |
| 514 void resize(size_t size) { |
| 515 if (size <= capacity) |
| 516 return; |
| 517 if (buffer != stackBuffer) |
| 518 uprv_free(buffer); |
| 519 buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); |
| 520 capacity = size; |
| 521 } |
| 522 private: |
| 523 T stackBuffer[N]; |
| 524 T* buffer; |
| 525 AutoBuffer(); |
| 526 size_t capacity; |
| 527 }; |
| 528 |
| 529 |
/*
 * Find word boundaries in a run of CJK dictionary characters using
 * dynamic programming over dictionary word costs (snlp values), with a
 * length-based heuristic cost for runs of Katakana.
 *
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output of C array of int32_t break positions, or 0
 * @return The number of breaks found
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *text,
                                         int32_t rangeStart,
                                         int32_t rangeEnd,
                                         UStack &foundBreaks ) const {
    // Empty or inverted range: nothing to segment.
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status))
        return 0;

    UnicodeString inputString(charString.elems(), inputLength);
    // The dictionary is matched against NFKC-normalized text.
    UNormalizationMode norm_mode = UNORM_NFKC;
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // charPositions[i] = index in the raw (un-normalized) UTF-16 input of
    // the i-th normalized character boundary.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        // Already normalized: the position map is just the code-point
        // boundaries of the input itself.
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status))
            return 0;
        // Normalization can lengthen the text; make room for the map.
        charPositions.resize(normalizedString.length() + 1);
        // Iterate the normalizer over the raw input to record, for each
        // normalized character, the corresponding raw-input index.
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            // NOTE(review): uc is unused; next() is called only to
            // advance the normalizer so getIndex() moves forward.
            UChar32 uc = normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status))
        return 0;

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i=1; i<=numChars; i++){
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in
    // the best segmentation of the first i characters.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i=0; i<=numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    AutoBuffer<uint16_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    bool is_prev_katakana = false;
    for (int i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        // Position i is unreachable from any segmentation of the prefix;
        // no word can start here.
        if (bestSnlp[i] == kuint32max)
            continue;

        // Number of dictionary words starting at i — presumably always
        // written by matches() below (out-parameter); confirm it is set
        // on every path, as it is read uninitialized otherwise.
        int count;
        // limit maximum word length matched to size of current substring
        int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : numChars - i;

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary
        // starting with this charcter, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        // (Relies on matches() returning candidates sorted by length, so a
        // 1-character match, if any, is lengths[0].)
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))){
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        // Relax the DP: try extending the best segmentation ending at i
        // with each candidate word starting at i.
        for (int j = 0; j < count; j++){
            //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp);
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        // Only process a run once, from its first character.
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            // Runs of kMaxKatakanaGroupLength or more get no heuristic
            // candidate (they would all share the same saturated cost).
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        // Walk the prev[] chain backwards from the end of the range to 0.
        for (int i = numChars; i > 0; i = prev[i]){
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_bdry[] (indices in
    // the normalized input string) back to indices in the raw input string
    // while reversing t_bdry and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}
| 715 |
425 U_NAMESPACE_END | 716 U_NAMESPACE_END |
426 | 717 |
427 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 718 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
OLD | NEW |