Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(375)

Side by Side Diff: icu46/source/common/dictbe.cpp

Issue 6370014: CJK segmentation patch for ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/
Patch Set: Created 9 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « icu46/source/common/dictbe.h ('k') | icu46/source/common/rbbi.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /** 1 /**
2 ******************************************************************************* 2 *******************************************************************************
3 * Copyright (C) 2006-2008, International Business Machines Corporation and othe rs. * 3 * Copyright (C) 2006-2008, International Business Machines Corporation and othe rs. *
4 * All Rights Reserved. * 4 * All Rights Reserved. *
5 ******************************************************************************* 5 *******************************************************************************
6 */ 6 */
7 7
8 #include "unicode/utypes.h" 8 #include "unicode/utypes.h"
9 9
10 #if !UCONFIG_NO_BREAK_ITERATION 10 #if !UCONFIG_NO_BREAK_ITERATION
11 11
12 #include "brkeng.h" 12 #include "brkeng.h"
13 #include "dictbe.h" 13 #include "dictbe.h"
14 #include "unicode/uniset.h" 14 #include "unicode/uniset.h"
15 #include "unicode/chariter.h" 15 #include "unicode/chariter.h"
16 #include "unicode/ubrk.h" 16 #include "unicode/ubrk.h"
17 #include "uvector.h" 17 #include "uvector.h"
18 #include "triedict.h" 18 #include "triedict.h"
19 #include "uassert.h"
20 #include "unicode/normlzr.h"
21 #include "cmemory.h"
19 22
20 U_NAMESPACE_BEGIN 23 U_NAMESPACE_BEGIN
21 24
22 /* 25 /*
23 ****************************************************************** 26 ******************************************************************
24 */ 27 */
25 28
26 /*DictionaryBreakEngine::DictionaryBreakEngine() { 29 /*DictionaryBreakEngine::DictionaryBreakEngine() {
27 fTypes = 0; 30 fTypes = 0;
28 }*/ 31 }*/
(...skipping 386 matching lines...) Expand 10 before | Expand all | Expand 10 after
415 418
416 // Don't return a break for the end of the dictionary range if there is one there. 419 // Don't return a break for the end of the dictionary range if there is one there.
417 if (foundBreaks.peeki() >= rangeEnd) { 420 if (foundBreaks.peeki() >= rangeEnd) {
418 (void) foundBreaks.popi(); 421 (void) foundBreaks.popi();
419 wordsFound -= 1; 422 wordsFound -= 1;
420 } 423 }
421 424
422 return wordsFound; 425 return wordsFound;
423 } 426 }
424 427
/*
 ******************************************************************
 * CjkBreakEngine
 */

// Largest value a uint32_t can hold.  Used below as the "unreachable"
// sentinel cost in the dynamic-programming segmentation pass.
static const uint32_t kuint32max = 0xFFFFFFFFu;
433 CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, Langua geType type, UErrorCode &status)
434 : DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){
435 if (!adoptDictionary->getValued()) {
436 status = U_ILLEGAL_ARGUMENT_ERROR;
437 return;
438 }
439
440 // Korean dictionary only includes Hangul syllables
441 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), stat us);
442 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
443 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\u ff9f]"), status);
444 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status) ;
445
446 if (U_SUCCESS(status)) {
447 // handle Korean and Japanese/Chinese using different dictionaries
448 if (type == kKorean) {
449 setCharacters(fHangulWordSet);
450 } else { //Chinese and Japanese
451 UnicodeSet cjSet;
452 cjSet.addAll(fHanWordSet);
453 cjSet.addAll(fKatakanaWordSet);
454 cjSet.addAll(fHiraganaWordSet);
455 cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc"));
456 setCharacters(cjSet);
457 }
458 }
459 }
460
461 CjkBreakEngine::~CjkBreakEngine(){
462 delete fDictionary;
463 }
464
// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int kMaxKatakanaLength = 8;
static const int kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

// Returns the default SNLP cost of treating a run of 'wordLength' katakana
// characters as one word.  Out-of-range lengths (including the impossible
// negative case, which previously indexed outside the table) share the
// length-0 penalty instead of a duplicated magic number.
static inline uint32_t getKatakanaCost(int wordLength) {
    // TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1] =
            {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    return (wordLength < 0 || wordLength > kMaxKatakanaLength)
            ? katakanaCost[0] : katakanaCost[wordLength];
}
477
// True if the UTF-16 code unit is a katakana character eligible to join a
// katakana run: fullwidth katakana U+30A1..U+30FE excluding the middle dot
// U+30FB, plus halfwidth forms U+FF66..U+FF9F.
static inline bool isKatakana(uint16_t value) {
    if (value >= 0x30A1u && value <= 0x30FEu) {
        return value != 0x30FBu;   // exclude KATAKANA MIDDLE DOT
    }
    return value >= 0xFF66u && value <= 0xFF9Fu;
}
482
483 // A very simple helper class to streamline the buffer handling in
484 // divideUpDictionaryRange.
485 template<class T, size_t N>
486 class AutoBuffer {
487 public:
488 AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
489 if (size > N) {
490 buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
491 capacity = size;
492 }
493 }
494 ~AutoBuffer() {
495 if (buffer != stackBuffer)
496 uprv_free(buffer);
497 }
498 #if 0
499 T* operator& () {
500 return buffer;
501 }
502 #endif
503 T* elems() {
504 return buffer;
505 }
506 const T& operator[] (size_t i) const {
507 return buffer[i];
508 }
509 T& operator[] (size_t i) {
510 return buffer[i];
511 }
512
513 // resize without copy
514 void resize(size_t size) {
515 if (size <= capacity)
516 return;
517 if (buffer != stackBuffer)
518 uprv_free(buffer);
519 buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
520 capacity = size;
521 }
522 private:
523 T stackBuffer[N];
524 T* buffer;
525 AutoBuffer();
526 size_t capacity;
527 };
528
529
530 /*
531 * @param text A UText representing the text
532 * @param rangeStart The start of the range of dictionary characters
533 * @param rangeEnd The end of the range of dictionary characters
534 * @param foundBreaks Output of C array of int32_t break positions, or 0
535 * @return The number of breaks found
536 */
537 int32_t
538 CjkBreakEngine::divideUpDictionaryRange( UText *text,
539 int32_t rangeStart,
540 int32_t rangeEnd,
541 UStack &foundBreaks ) const {
542 if (rangeStart >= rangeEnd) {
543 return 0;
544 }
545
546 const size_t defaultInputLength = 80;
547 size_t inputLength = rangeEnd - rangeStart;
548 AutoBuffer<UChar, defaultInputLength> charString(inputLength);
549
550 // Normalize the input string and put it in normalizedText.
551 // The map from the indices of the normalized input to the raw
552 // input is kept in charPositions.
553 UErrorCode status = U_ZERO_ERROR;
554 utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, & status);
555 if (U_FAILURE(status))
556 return 0;
557
558 UnicodeString inputString(charString.elems(), inputLength);
559 UNormalizationMode norm_mode = UNORM_NFKC;
560 UBool isNormalized =
561 Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
562 Normalizer::isNormalized(inputString, norm_mode, status);
563
564 AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
565 int numChars = 0;
566 UText normalizedText = UTEXT_INITIALIZER;
567 // Needs to be declared here because normalizedText holds onto its buffer.
568 UnicodeString normalizedString;
569 if (isNormalized) {
570 int32_t index = 0;
571 charPositions[0] = 0;
572 while(index < inputString.length()) {
573 index = inputString.moveIndex32(index, 1);
574 charPositions[++numChars] = index;
575 }
576 utext_openUnicodeString(&normalizedText, &inputString, &status);
577 }
578 else {
579 Normalizer::normalize(inputString, norm_mode, 0, normalizedString, statu s);
580 if (U_FAILURE(status))
581 return 0;
582 charPositions.resize(normalizedString.length() + 1);
583 Normalizer normalizer(charString.elems(), inputLength, norm_mode);
584 int32_t index = 0;
585 charPositions[0] = 0;
586 while(index < normalizer.endIndex()){
587 UChar32 uc = normalizer.next();
588 charPositions[++numChars] = index = normalizer.getIndex();
589 }
590 utext_openUnicodeString(&normalizedText, &normalizedString, &status);
591 }
592
593 if (U_FAILURE(status))
594 return 0;
595
596 // From this point on, all the indices refer to the indices of
597 // the normalized input string.
598
599 // bestSnlp[i] is the snlp of the best segmentation of the first i
600 // characters in the range to be matched.
601 AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
602 bestSnlp[0] = 0;
603 for(int i=1; i<=numChars; i++){
604 bestSnlp[i] = kuint32max;
605 }
606
607 // prev[i] is the index of the last CJK character in the previous word in
608 // the best segmentation of the first i characters.
609 AutoBuffer<int, defaultInputLength> prev(numChars + 1);
610 for(int i=0; i<=numChars; i++){
611 prev[i] = -1;
612 }
613
614 const size_t maxWordSize = 20;
615 AutoBuffer<uint16_t, maxWordSize> values(numChars);
616 AutoBuffer<int32_t, maxWordSize> lengths(numChars);
617
618 // Dynamic programming to find the best segmentation.
619 bool is_prev_katakana = false;
620 for (int i = 0; i < numChars; ++i) {
621 //utext_setNativeIndex(text, rangeStart + i);
622 utext_setNativeIndex(&normalizedText, i);
623 if (bestSnlp[i] == kuint32max)
624 continue;
625
626 int count;
627 // limit maximum word length matched to size of current substring
628 int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : numChars - i;
629
630 fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());
631
632 // if there are no single character matches found in the dictionary
633 // starting with this charcter, treat character as a 1-character word
634 // with the highest value possible, i.e. the least likely to occur.
635 // Exclude Korean characters from this treatment, as they should be left
636 // together by default.
637 if((count == 0 || lengths[0] != 1) &&
638 !fHangulWordSet.contains(utext_current32(&normalizedText))){
639 values[count] = maxSnlp;
640 lengths[count++] = 1;
641 }
642
643 for (int j = 0; j < count; j++){
644 //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp);
645 uint32_t newSnlp = bestSnlp[i] + values[j];
646 if (newSnlp < bestSnlp[lengths[j] + i]) {
647 bestSnlp[lengths[j] + i] = newSnlp;
648 prev[lengths[j] + i] = i;
649 }
650 }
651
652 // In Japanese,
653 // Katakana word in single character is pretty rare. So we apply
654 // the following heuristic to Katakana: any continuous run of Katakana
655 // characters is considered a candidate word with a default cost
656 // specified in the katakanaCost table according to its length.
657 //utext_setNativeIndex(text, rangeStart + i);
658 utext_setNativeIndex(&normalizedText, i);
659 bool is_katakana = isKatakana(utext_current32(&normalizedText));
660 if (!is_prev_katakana && is_katakana) {
661 int j = i + 1;
662 utext_next32(&normalizedText);
663 // Find the end of the continuous run of Katakana characters
664 while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
665 isKatakana(utext_current32(&normalizedText))) {
666 utext_next32(&normalizedText);
667 ++j;
668 }
669 if ((j - i) < kMaxKatakanaGroupLength) {
670 uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
671 if (newSnlp < bestSnlp[j]) {
672 bestSnlp[j] = newSnlp;
673 prev[j] = i;
674 }
675 }
676 }
677 is_prev_katakana = is_katakana;
678 }
679
680 // Start pushing the optimal offset index into t_boundary (t for tentative).
681 // prev[numChars] is guaranteed to be meaningful.
682 // We'll first push in the reverse order, i.e.,
683 // t_boundary[0] = numChars, and afterwards do a swap.
684 AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
685
686 int numBreaks = 0;
687 // No segmentation found, set boundary to end of range
688 if (bestSnlp[numChars] == kuint32max) {
689 t_boundary[numBreaks++] = numChars;
690 } else {
691 for (int i = numChars; i > 0; i = prev[i]){
692 t_boundary[numBreaks++] = i;
693
694 }
695 U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);
696 }
697
698 // Reverse offset index in t_boundary.
699 // Don't add a break for the start of the dictionary range if there is one
700 // there already.
701 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
702 t_boundary[numBreaks++] = 0;
703 }
704
705 // Now that we're done, convert positions in t_bdry[] (indices in
706 // the normalized input string) back to indices in the raw input string
707 // while reversing t_bdry and pushing values to foundBreaks.
708 for (int i = numBreaks-1; i >= 0; i--) {
709 foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
710 }
711
712 utext_close(&normalizedText);
713 return numBreaks;
714 }
715
425 U_NAMESPACE_END 716 U_NAMESPACE_END
426 717
427 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 718 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
OLDNEW
« no previous file with comments | « icu46/source/common/dictbe.h ('k') | icu46/source/common/rbbi.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698