OLD | NEW |
1 /** | 1 /** |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * Copyright (C) 2006-2008, International Business Machines Corporation and othe
rs. * | 3 * Copyright (C) 2006-2008, International Business Machines Corporation and othe
rs. * |
4 * All Rights Reserved. * | 4 * All Rights Reserved. * |
5 ******************************************************************************* | 5 ******************************************************************************* |
6 */ | 6 */ |
7 | 7 |
8 #include "unicode/utypes.h" | 8 #include "unicode/utypes.h" |
9 | 9 |
10 #if !UCONFIG_NO_BREAK_ITERATION | 10 #if !UCONFIG_NO_BREAK_ITERATION |
11 | 11 |
12 #include "brkeng.h" | 12 #include "brkeng.h" |
13 #include "dictbe.h" | 13 #include "dictbe.h" |
14 #include "unicode/uniset.h" | 14 #include "unicode/uniset.h" |
15 #include "unicode/chariter.h" | 15 #include "unicode/chariter.h" |
16 #include "unicode/ubrk.h" | 16 #include "unicode/ubrk.h" |
17 #include "uvector.h" | 17 #include "uvector.h" |
18 #include "triedict.h" | 18 #include "triedict.h" |
| 19 #include "uassert.h" |
| 20 #include "unicode/normlzr.h" |
| 21 #include "cmemory.h" |
19 | 22 |
20 U_NAMESPACE_BEGIN | 23 U_NAMESPACE_BEGIN |
21 | 24 |
22 /* | 25 /* |
23 ****************************************************************** | 26 ****************************************************************** |
24 */ | 27 */ |
25 | 28 |
26 /*DictionaryBreakEngine::DictionaryBreakEngine() { | 29 /*DictionaryBreakEngine::DictionaryBreakEngine() { |
27 fTypes = 0; | 30 fTypes = 0; |
28 }*/ | 31 }*/ |
(...skipping 386 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
415 | 418 |
416 // Don't return a break for the end of the dictionary range if there is one
there. | 419 // Don't return a break for the end of the dictionary range if there is one
there. |
417 if (foundBreaks.peeki() >= rangeEnd) { | 420 if (foundBreaks.peeki() >= rangeEnd) { |
418 (void) foundBreaks.popi(); | 421 (void) foundBreaks.popi(); |
419 wordsFound -= 1; | 422 wordsFound -= 1; |
420 } | 423 } |
421 | 424 |
422 return wordsFound; | 425 return wordsFound; |
423 } | 426 } |
424 | 427 |
| 428 /* |
| 429 ****************************************************************** |
| 430 * CjkBreakEngine |
| 431 */ |
| 432 static const uint32_t kuint32max = 0xFFFFFFFF; |
| 433 CjkBreakEngine::CjkBreakEngine(const TrieWordDictionary *adoptDictionary, Langua
geType type, UErrorCode &status) |
| 434 : DictionaryBreakEngine(1<<UBRK_WORD), fDictionary(adoptDictionary){ |
| 435 if (!adoptDictionary->getValued()) { |
| 436 status = U_ILLEGAL_ARGUMENT_ERROR; |
| 437 return; |
| 438 } |
| 439 |
| 440 // Korean dictionary only includes Hangul syllables |
| 441 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), stat
us); |
| 442 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); |
| 443 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\u
ff9f]"), status); |
| 444 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status)
; |
| 445 |
| 446 if (U_SUCCESS(status)) { |
| 447 // handle Korean and Japanese/Chinese using different dictionaries |
| 448 if (type == kKorean) { |
| 449 setCharacters(fHangulWordSet); |
| 450 } else { //Chinese and Japanese |
| 451 UnicodeSet cjSet; |
| 452 cjSet.addAll(fHanWordSet); |
| 453 cjSet.addAll(fKatakanaWordSet); |
| 454 cjSet.addAll(fHiraganaWordSet); |
| 455 cjSet.add(UNICODE_STRING_SIMPLE("\\uff70\\u30fc")); |
| 456 setCharacters(cjSet); |
| 457 } |
| 458 } |
| 459 } |
| 460 |
// Releases the dictionary adopted in the constructor. The dictionary is
// adopted even when construction fails, so deleting unconditionally is safe.
CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;
}
| 464 |
// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int kMaxKatakanaLength = 8;
static const int kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

// Return the default segmentation cost for a Katakana run of the given
// length. Runs longer than the table (kMaxKatakanaLength) get the maximum
// penalty, the same value used for a zero-length run.
static inline uint32_t getKatakanaCost(int wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1] =
        {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    if (wordLength > kMaxKatakanaLength) {
        return 8192;
    }
    return katakanaCost[wordLength];
}
| 477 |
// True if the UTF-16 code unit is Katakana: the main Katakana block
// U+30A1..U+30FE excluding the middle dot U+30FB, or a halfwidth
// Katakana form U+FF66..U+FF9F.
static inline bool isKatakana(uint16_t value) {
    if (value >= 0x30A1u && value <= 0x30FEu) {
        return value != 0x30FBu;
    }
    return value >= 0xFF66u && value <= 0xFF9Fu;
}
| 482 |
| 483 // A very simple helper class to streamline the buffer handling in |
| 484 // divideUpDictionaryRange. |
| 485 template<class T, size_t N> |
| 486 class AutoBuffer { |
| 487 public: |
| 488 AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) { |
| 489 if (size > N) { |
| 490 buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); |
| 491 capacity = size; |
| 492 } |
| 493 } |
| 494 ~AutoBuffer() { |
| 495 if (buffer != stackBuffer) |
| 496 uprv_free(buffer); |
| 497 } |
| 498 #if 0 |
| 499 T* operator& () { |
| 500 return buffer; |
| 501 } |
| 502 #endif |
| 503 T* elems() { |
| 504 return buffer; |
| 505 } |
| 506 const T& operator[] (size_t i) const { |
| 507 return buffer[i]; |
| 508 } |
| 509 T& operator[] (size_t i) { |
| 510 return buffer[i]; |
| 511 } |
| 512 |
| 513 // resize without copy |
| 514 void resize(size_t size) { |
| 515 if (size <= capacity) |
| 516 return; |
| 517 if (buffer != stackBuffer) |
| 518 uprv_free(buffer); |
| 519 buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size)); |
| 520 capacity = size; |
| 521 } |
| 522 private: |
| 523 T stackBuffer[N]; |
| 524 T* buffer; |
| 525 AutoBuffer(); |
| 526 size_t capacity; |
| 527 }; |
| 528 |
| 529 |
/*
 * Find word boundaries in a run of CJK dictionary characters using
 * dynamic programming over dictionary word costs (snlp values), with a
 * length-based heuristic cost for runs of Katakana.
 *
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output of C array of int32_t break positions, or 0
 * @return The number of breaks found
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *text,
                                         int32_t rangeStart,
                                         int32_t rangeEnd,
                                         UStack &foundBreaks ) const {
    // Empty or inverted range: nothing to segment.
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status))
        return 0;

    UnicodeString inputString(charString.elems(), inputLength);
    // The dictionary is matched against NFKC-normalized text.
    UNormalizationMode norm_mode = UNORM_NFKC;
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // charPositions[i] = index in the raw (un-normalized) UTF-16 input of
    // the i-th normalized character boundary.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        // Already normalized: the position map is just the code-point
        // boundaries of the input itself.
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status))
            return 0;
        // Normalization can lengthen the text; make room for the map.
        charPositions.resize(normalizedString.length() + 1);
        // Iterate the normalizer over the raw input to record, for each
        // normalized character, the corresponding raw-input index.
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            // NOTE(review): uc is unused; next() is called only to
            // advance the normalizer so getIndex() moves forward.
            UChar32 uc = normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status))
        return 0;

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i=1; i<=numChars; i++){
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in
    // the best segmentation of the first i characters.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i=0; i<=numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    AutoBuffer<uint16_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    bool is_prev_katakana = false;
    for (int i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        // Position i is unreachable from any segmentation of the prefix;
        // no word can start here.
        if (bestSnlp[i] == kuint32max)
            continue;

        // Number of dictionary words starting at i — presumably always
        // written by matches() below (out-parameter); confirm it is set
        // on every path, as it is read uninitialized otherwise.
        int count;
        // limit maximum word length matched to size of current substring
        int maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : numChars - i;

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary
        // starting with this charcter, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        // (Relies on matches() returning candidates sorted by length, so a
        // 1-character match, if any, is lengths[0].)
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))){
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        // Relax the DP: try extending the best segmentation ending at i
        // with each candidate word starting at i.
        for (int j = 0; j < count; j++){
            //U_ASSERT(values[j] >= 0 && values[j] <= maxSnlp);
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        // Only process a run once, from its first character.
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            // Runs of kMaxKatakanaGroupLength or more get no heuristic
            // candidate (they would all share the same saturated cost).
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        // Walk the prev[] chain backwards from the end of the range to 0.
        for (int i = numChars; i > 0; i = prev[i]){
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks-1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_bdry[] (indices in
    // the normalized input string) back to indices in the raw input string
    // while reversing t_bdry and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}
| 715 |
425 U_NAMESPACE_END | 716 U_NAMESPACE_END |
426 | 717 |
427 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 718 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
OLD | NEW |