OLD | NEW |
1 /* | 1 /* |
2 *************************************************************************** | 2 *************************************************************************** |
3 * Copyright (C) 1999-2013 International Business Machines Corporation | 3 * Copyright (C) 1999-2014 International Business Machines Corporation |
4 * and others. All rights reserved. | 4 * and others. All rights reserved. |
5 *************************************************************************** | 5 *************************************************************************** |
6 */ | 6 */ |
7 // | 7 // |
8 // file: rbbi.c Contains the implementation of the rule based break iterator | 8 // file: rbbi.c Contains the implementation of the rule based break iterator |
9 // runtime engine and the API implementation for | 9 // runtime engine and the API implementation for |
10 // class RuleBasedBreakIterator | 10 // class RuleBasedBreakIterator |
11 // | 11 // |
12 | 12 |
13 #include "utypeinfo.h" // for 'typeid' to work | 13 #include "utypeinfo.h" // for 'typeid' to work |
(...skipping 497 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
511 // contents as the old. If we can't set to the same position, it doesn't. | 511 // contents as the old. If we can't set to the same position, it doesn't. |
512 // The contents underlying the old utext might be invalid at this point, | 512 // The contents underlying the old utext might be invalid at this point, |
513 // so it's not safe to check directly. | 513 // so it's not safe to check directly. |
514 status = U_ILLEGAL_ARGUMENT_ERROR; | 514 status = U_ILLEGAL_ARGUMENT_ERROR; |
515 } | 515 } |
516 return *this; | 516 return *this; |
517 } | 517 } |
518 | 518 |
519 | 519 |
520 /** | 520 /** |
521 * Sets the current iteration position to the beginning of the text. | 521 * Sets the current iteration position to the beginning of the text, position zero. |
522 * @return The offset of the beginning of the text. | 522 * @return The new iterator position, which is zero. |
523 */ | 523 */ |
524 int32_t RuleBasedBreakIterator::first(void) { | 524 int32_t RuleBasedBreakIterator::first(void) { |
525 reset(); | 525 reset(); |
526 fLastRuleStatusIndex = 0; | 526 fLastRuleStatusIndex = 0; |
527 fLastStatusIndexValid = TRUE; | 527 fLastStatusIndexValid = TRUE; |
528 //if (fText == NULL) | 528 //if (fText == NULL) |
529 // return BreakIterator::DONE; | 529 // return BreakIterator::DONE; |
530 | 530 |
531 utext_setNativeIndex(fText, 0); | 531 utext_setNativeIndex(fText, 0); |
532 return 0; | 532 return 0; |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
585 int32_t pos = fCachedBreakPositions[fPositionInCache]; | 585 int32_t pos = fCachedBreakPositions[fPositionInCache]; |
586 utext_setNativeIndex(fText, pos); | 586 utext_setNativeIndex(fText, pos); |
587 return pos; | 587 return pos; |
588 } | 588 } |
589 else { | 589 else { |
590 reset(); | 590 reset(); |
591 } | 591 } |
592 } | 592 } |
593 | 593 |
594 int32_t startPos = current(); | 594 int32_t startPos = current(); |
| 595 fDictionaryCharCount = 0; |
595 int32_t result = handleNext(fData->fForwardTable); | 596 int32_t result = handleNext(fData->fForwardTable); |
596 if (fDictionaryCharCount > 0) { | 597 if (fDictionaryCharCount > 0) { |
597 result = checkDictionary(startPos, result, FALSE); | 598 result = checkDictionary(startPos, result, FALSE); |
598 } | 599 } |
599 return result; | 600 return result; |
600 } | 601 } |
601 | 602 |
602 /** | 603 /** |
603 * Advances the iterator backwards, to the last boundary preceding this one. | 604 * Advances the iterator backwards, to the last boundary preceding this one. |
604 * @return The position of the last boundary position preceding this one. | 605 * @return The position of the last boundary position preceding this one. |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
639 result = checkDictionary(result, startPos, TRUE); | 640 result = checkDictionary(result, startPos, TRUE); |
640 } | 641 } |
641 return result; | 642 return result; |
642 } | 643 } |
643 | 644 |
644 // old rule syntax | 645 // old rule syntax |
645 // set things up. handlePrevious() will back us up to some valid | 646 // set things up. handlePrevious() will back us up to some valid |
646 // break position before the current position (we back our internal | 647 // break position before the current position (we back our internal |
647 // iterator up one step to prevent handlePrevious() from returning | 648 // iterator up one step to prevent handlePrevious() from returning |
648 // the current position), but not necessarily the last one before | 649 // the current position), but not necessarily the last one before |
649 | |
650 // where we started | 650 // where we started |
651 | 651 |
652 int32_t start = current(); | 652 int32_t start = current(); |
653 | 653 |
654 (void)UTEXT_PREVIOUS32(fText); | 654 (void)UTEXT_PREVIOUS32(fText); |
655 int32_t lastResult = handlePrevious(fData->fReverseTable); | 655 int32_t lastResult = handlePrevious(fData->fReverseTable); |
656 if (lastResult == UBRK_DONE) { | 656 if (lastResult == UBRK_DONE) { |
657 lastResult = 0; | 657 lastResult = 0; |
658 utext_setNativeIndex(fText, 0); | 658 utext_setNativeIndex(fText, 0); |
659 } | 659 } |
(...skipping 12 matching lines...) Expand all Loading... |
672 } | 672 } |
673 lastResult = result; | 673 lastResult = result; |
674 lastTag = fLastRuleStatusIndex; | 674 lastTag = fLastRuleStatusIndex; |
675 breakTagValid = TRUE; | 675 breakTagValid = TRUE; |
676 } | 676 } |
677 | 677 |
678 // fLastBreakTag wants to have the value for section of text preceding | 678 // fLastBreakTag wants to have the value for section of text preceding |
679 // the result position that we are to return (in lastResult.) If | 679 // the result position that we are to return (in lastResult.) If |
680 // the backwards rules overshot and the above loop had to do two or more | 680 // the backwards rules overshot and the above loop had to do two or more |
681 // next()s to move up to the desired return position, we will have a valid | 681 // next()s to move up to the desired return position, we will have a valid |
682 // tag value. But, if handlePrevious() took us to exactly the correct result positon, | 682 // tag value. But, if handlePrevious() took us to exactly the correct result position, |
683 // we wont have a tag value for that position, which is only set by handleNext(). | 683 // we wont have a tag value for that position, which is only set by handleNext(). |
684 | 684 |
685 // set the current iteration position to be the last break position | 685 // Set the current iteration position to be the last break position |
686 // before where we started, and then return that value | 686 // before where we started, and then return that value. |
687 utext_setNativeIndex(fText, lastResult); | 687 utext_setNativeIndex(fText, lastResult); |
688 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() | 688 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() |
689 fLastStatusIndexValid = breakTagValid; | 689 fLastStatusIndexValid = breakTagValid; |
690 | 690 |
691 // No need to check the dictionary; it will have been handled by | 691 // No need to check the dictionary; it will have been handled by |
692 // next() | 692 // next() |
693 | 693 |
694 return lastResult; | 694 return lastResult; |
695 } | 695 } |
696 | 696 |
697 /** | 697 /** |
698 * Sets the iterator to refer to the first boundary position following | 698 * Sets the iterator to refer to the first boundary position following |
699 * the specified position. | 699 * the specified position. |
700 * @offset The position from which to begin searching for a break position. | 700 * @offset The position from which to begin searching for a break position. |
701 * @return The position of the first break after the current position. | 701 * @return The position of the first break after the current position. |
702 */ | 702 */ |
703 int32_t RuleBasedBreakIterator::following(int32_t offset) { | 703 int32_t RuleBasedBreakIterator::following(int32_t offset) { |
| 704 // if the offset passed in is already past the end of the text, |
| 705 // just return DONE; if it's before the beginning, return the |
| 706 // text's starting offset |
| 707 if (fText == NULL || offset >= utext_nativeLength(fText)) { |
| 708 last(); |
| 709 return next(); |
| 710 } |
| 711 else if (offset < 0) { |
| 712 return first(); |
| 713 } |
| 714 |
| 715 // Move requested offset to a code point start. It might be on a trail surrogate, |
| 716 // or on a trail byte if the input is UTF-8. |
| 717 utext_setNativeIndex(fText, offset); |
| 718 offset = utext_getNativeIndex(fText); |
| 719 |
704 // if we have cached break positions and offset is in the range | 720 // if we have cached break positions and offset is in the range |
705 // covered by them, use them | 721 // covered by them, use them |
706 // TODO: could use binary search | 722 // TODO: could use binary search |
707 // TODO: what if offset is outside range, but break is not? | 723 // TODO: what if offset is outside range, but break is not? |
708 if (fCachedBreakPositions != NULL) { | 724 if (fCachedBreakPositions != NULL) { |
709 if (offset >= fCachedBreakPositions[0] | 725 if (offset >= fCachedBreakPositions[0] |
710 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1])
{ | 726 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1])
{ |
711 fPositionInCache = 0; | 727 fPositionInCache = 0; |
712 // We are guaranteed not to leave the array due to range test above | 728 // We are guaranteed not to leave the array due to range test above |
713 while (offset >= fCachedBreakPositions[fPositionInCache]) { | 729 while (offset >= fCachedBreakPositions[fPositionInCache]) { |
714 ++fPositionInCache; | 730 ++fPositionInCache; |
715 } | 731 } |
716 int32_t pos = fCachedBreakPositions[fPositionInCache]; | 732 int32_t pos = fCachedBreakPositions[fPositionInCache]; |
717 utext_setNativeIndex(fText, pos); | 733 utext_setNativeIndex(fText, pos); |
718 return pos; | 734 return pos; |
719 } | 735 } |
720 else { | 736 else { |
721 reset(); | 737 reset(); |
722 } | 738 } |
723 } | 739 } |
724 | 740 |
725 // if the offset passed in is already past the end of the text, | 741 // Set our internal iteration position (temporarily) |
726 // just return DONE; if it's before the beginning, return the | |
727 // text's starting offset | |
728 fLastRuleStatusIndex = 0; | |
729 fLastStatusIndexValid = TRUE; | |
730 if (fText == NULL || offset >= utext_nativeLength(fText)) { | |
731 last(); | |
732 return next(); | |
733 } | |
734 else if (offset < 0) { | |
735 return first(); | |
736 } | |
737 | |
738 // otherwise, set our internal iteration position (temporarily) | |
739 // to the position passed in. If this is the _beginning_ position, | 742 // to the position passed in. If this is the _beginning_ position, |
740 // then we can just use next() to get our return value | 743 // then we can just use next() to get our return value |
741 | 744 |
742 int32_t result = 0; | 745 int32_t result = 0; |
743 | 746 |
744 if (fData->fSafeRevTable != NULL) { | 747 if (fData->fSafeRevTable != NULL) { |
745 // new rule syntax | 748 // new rule syntax |
746 utext_setNativeIndex(fText, offset); | 749 utext_setNativeIndex(fText, offset); |
747 // move forward one codepoint to prepare for moving back to a | 750 // move forward one codepoint to prepare for moving back to a |
748 // safe point. | 751 // safe point. |
749 // this handles offset being between a supplementary character | 752 // this handles offset being between a supplementary character |
| 753 // TODO: is this still needed, with move to code point boundary handled above? |
750 (void)UTEXT_NEXT32(fText); | 754 (void)UTEXT_NEXT32(fText); |
751 // handlePrevious will move most of the time to < 1 boundary away | 755 // handlePrevious will move most of the time to < 1 boundary away |
752 handlePrevious(fData->fSafeRevTable); | 756 handlePrevious(fData->fSafeRevTable); |
753 int32_t result = next(); | 757 int32_t result = next(); |
754 while (result <= offset) { | 758 while (result <= offset) { |
755 result = next(); | 759 result = next(); |
756 } | 760 } |
757 return result; | 761 return result; |
758 } | 762 } |
759 if (fData->fSafeFwdTable != NULL) { | 763 if (fData->fSafeFwdTable != NULL) { |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
802 return result; | 806 return result; |
803 } | 807 } |
804 | 808 |
805 /** | 809 /** |
806 * Sets the iterator to refer to the last boundary position before the | 810 * Sets the iterator to refer to the last boundary position before the |
807 * specified position. | 811 * specified position. |
808 * @offset The position to begin searching for a break from. | 812 * @offset The position to begin searching for a break from. |
809 * @return The position of the last boundary before the starting position. | 813 * @return The position of the last boundary before the starting position. |
810 */ | 814 */ |
811 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { | 815 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { |
| 816 // if the offset passed in is already past the end of the text, |
| 817 // just return DONE; if it's before the beginning, return the |
| 818 // text's starting offset |
| 819 if (fText == NULL || offset > utext_nativeLength(fText)) { |
| 820 return last(); |
| 821 } |
| 822 else if (offset < 0) { |
| 823 return first(); |
| 824 } |
| 825 |
| 826 // Move requested offset to a code point start. It might be on a trail surrogate, |
| 827 // or on a trail byte if the input is UTF-8. |
| 828 utext_setNativeIndex(fText, offset); |
| 829 offset = utext_getNativeIndex(fText); |
| 830 |
812 // if we have cached break positions and offset is in the range | 831 // if we have cached break positions and offset is in the range |
813 // covered by them, use them | 832 // covered by them, use them |
814 if (fCachedBreakPositions != NULL) { | 833 if (fCachedBreakPositions != NULL) { |
815 // TODO: binary search? | 834 // TODO: binary search? |
816 // TODO: What if offset is outside range, but break is not? | 835 // TODO: What if offset is outside range, but break is not? |
817 if (offset > fCachedBreakPositions[0] | 836 if (offset > fCachedBreakPositions[0] |
818 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]
) { | 837 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]
) { |
819 fPositionInCache = 0; | 838 fPositionInCache = 0; |
820 while (fPositionInCache < fNumCachedBreakPositions | 839 while (fPositionInCache < fNumCachedBreakPositions |
821 && offset > fCachedBreakPositions[fPositionInCache]) | 840 && offset > fCachedBreakPositions[fPositionInCache]) |
822 ++fPositionInCache; | 841 ++fPositionInCache; |
823 --fPositionInCache; | 842 --fPositionInCache; |
824 // If we're at the beginning of the cache, need to reevaluate the | 843 // If we're at the beginning of the cache, need to reevaluate the |
825 // rule status | 844 // rule status |
826 if (fPositionInCache <= 0) { | 845 if (fPositionInCache <= 0) { |
827 fLastStatusIndexValid = FALSE; | 846 fLastStatusIndexValid = FALSE; |
828 } | 847 } |
829 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache])
; | 848 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache])
; |
830 return fCachedBreakPositions[fPositionInCache]; | 849 return fCachedBreakPositions[fPositionInCache]; |
831 } | 850 } |
832 else { | 851 else { |
833 reset(); | 852 reset(); |
834 } | 853 } |
835 } | 854 } |
836 | 855 |
837 // if the offset passed in is already past the end of the text, | |
838 // just return DONE; if it's before the beginning, return the | |
839 // text's starting offset | |
840 if (fText == NULL || offset > utext_nativeLength(fText)) { | |
841 // return BreakIterator::DONE; | |
842 return last(); | |
843 } | |
844 else if (offset < 0) { | |
845 return first(); | |
846 } | |
847 | |
848 // if we start by updating the current iteration position to the | 856 // if we start by updating the current iteration position to the |
849 // position specified by the caller, we can just use previous() | 857 // position specified by the caller, we can just use previous() |
850 // to carry out this operation | 858 // to carry out this operation |
851 | 859 |
852 if (fData->fSafeFwdTable != NULL) { | 860 if (fData->fSafeFwdTable != NULL) { |
853 // new rule syntax | 861 // new rule syntax |
854 utext_setNativeIndex(fText, offset); | 862 utext_setNativeIndex(fText, offset); |
855 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); | 863 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
856 if (newOffset != offset) { | 864 if (newOffset != offset) { |
857 // Will come here if specified offset was not a code point boundary AND | 865 // Will come here if specified offset was not a code point boundary AND |
(...skipping 713 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1571 // Reset the old break cache first. | 1579 // Reset the old break cache first. |
1572 reset(); | 1580 reset(); |
1573 | 1581 |
1574 // note: code segment below assumes that dictionary chars are in the | 1582 // note: code segment below assumes that dictionary chars are in the |
1575 // startPos-endPos range | 1583 // startPos-endPos range |
1576 // value returned should be next character in sequence | 1584 // value returned should be next character in sequence |
1577 if ((endPos - startPos) <= 1) { | 1585 if ((endPos - startPos) <= 1) { |
1578 return (reverse ? startPos : endPos); | 1586 return (reverse ? startPos : endPos); |
1579 } | 1587 } |
1580 | 1588 |
1581 // Bug 5532. The dictionary code will crash if the input text is UTF-8 | |
1582 // because native indexes are different from UTF-16 indexes. | |
1583 // Temporary hack: skip dictionary lookup for UTF-8 encoded text. | |
1584 // It wont give the right breaks, but it's better than a crash. | |
1585 // | |
1586 // Check the type of the UText by checking its pFuncs field, which | |
1587 // is UText's function dispatch table. It will be the same for all | |
1588 // UTF-8 UTexts and different for any other UText type. | |
1589 // | |
1590 // We have no other type of UText available with non-UTF-16 native indexes. | |
1591 // This whole check will go away once the dictionary code is fixed. | |
1592 static const void *utext_utf8Funcs; | |
1593 if (utext_utf8Funcs == NULL) { | |
1594 // Cache the UTF-8 UText function pointer value. | |
1595 UErrorCode status = U_ZERO_ERROR; | |
1596 UText tempUText = UTEXT_INITIALIZER; | |
1597 utext_openUTF8(&tempUText, NULL, 0, &status); | |
1598 utext_utf8Funcs = tempUText.pFuncs; | |
1599 utext_close(&tempUText); | |
1600 } | |
1601 if (fText->pFuncs == utext_utf8Funcs) { | |
1602 return (reverse ? startPos : endPos); | |
1603 } | |
1604 | |
1605 // Starting from the starting point, scan towards the proposed result, | 1589 // Starting from the starting point, scan towards the proposed result, |
1606 // looking for the first dictionary character (which may be the one | 1590 // looking for the first dictionary character (which may be the one |
1607 // we're on, if we're starting in the middle of a range). | 1591 // we're on, if we're starting in the middle of a range). |
1608 utext_setNativeIndex(fText, reverse ? endPos : startPos); | 1592 utext_setNativeIndex(fText, reverse ? endPos : startPos); |
1609 if (reverse) { | 1593 if (reverse) { |
1610 UTEXT_PREVIOUS32(fText); | 1594 UTEXT_PREVIOUS32(fText); |
1611 } | 1595 } |
1612 | 1596 |
1613 int32_t rangeStart = startPos; | 1597 int32_t rangeStart = startPos; |
1614 int32_t rangeEnd = endPos; | 1598 int32_t rangeEnd = endPos; |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1696 } | 1680 } |
1697 | 1681 |
1698 // Reload the loop variables for the next go-round | 1682 // Reload the loop variables for the next go-round |
1699 c = utext_current32(fText); | 1683 c = utext_current32(fText); |
1700 UTRIE_GET16(&fData->fTrie, c, category); | 1684 UTRIE_GET16(&fData->fTrie, c, category); |
1701 } | 1685 } |
1702 | 1686 |
1703 // If we found breaks, build a new break cache. The first and last entries must | 1687 // If we found breaks, build a new break cache. The first and last entries must |
1704 // be the original starting and ending position. | 1688 // be the original starting and ending position. |
1705 if (foundBreakCount > 0) { | 1689 if (foundBreakCount > 0) { |
| 1690 U_ASSERT(foundBreakCount == breaks.size()); |
1706 int32_t totalBreaks = foundBreakCount; | 1691 int32_t totalBreaks = foundBreakCount; |
1707 if (startPos < breaks.elementAti(0)) { | 1692 if (startPos < breaks.elementAti(0)) { |
1708 totalBreaks += 1; | 1693 totalBreaks += 1; |
1709 } | 1694 } |
1710 if (endPos > breaks.peeki()) { | 1695 if (endPos > breaks.peeki()) { |
1711 totalBreaks += 1; | 1696 totalBreaks += 1; |
1712 } | 1697 } |
1713 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int3
2_t)); | 1698 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int3
2_t)); |
1714 if (fCachedBreakPositions != NULL) { | 1699 if (fCachedBreakPositions != NULL) { |
1715 int32_t out = 0; | 1700 int32_t out = 0; |
(...skipping 19 matching lines...) Expand all Loading... |
1735 } | 1720 } |
1736 // If the allocation failed, just fall through to the "no breaks found" case. | 1721 // If the allocation failed, just fall through to the "no breaks found" case. |
1737 } | 1722 } |
1738 | 1723 |
1739 // If we get here, there were no language-based breaks. Set the text pointer | 1724 // If we get here, there were no language-based breaks. Set the text pointer |
1740 // to the original proposed break. | 1725 // to the original proposed break. |
1741 utext_setNativeIndex(fText, reverse ? startPos : endPos); | 1726 utext_setNativeIndex(fText, reverse ? startPos : endPos); |
1742 return (reverse ? startPos : endPos); | 1727 return (reverse ? startPos : endPos); |
1743 } | 1728 } |
1744 | 1729 |
1745 // defined in ucln_cmn.h | |
1746 | |
1747 U_NAMESPACE_END | 1730 U_NAMESPACE_END |
1748 | 1731 |
1749 | 1732 |
1750 static icu::UStack *gLanguageBreakFactories = NULL; | 1733 static icu::UStack *gLanguageBreakFactories = NULL; |
1751 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; | 1734 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; |
1752 | 1735 |
1753 /** | 1736 /** |
1754 * Release all static memory held by breakiterator. | 1737 * Release all static memory held by breakiterator. |
1755 */ | 1738 */ |
1756 U_CDECL_BEGIN | 1739 U_CDECL_BEGIN |
(...skipping 125 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1882 }*/ | 1865 }*/ |
1883 | 1866 |
1884 void RuleBasedBreakIterator::setBreakType(int32_t type) { | 1867 void RuleBasedBreakIterator::setBreakType(int32_t type) { |
1885 fBreakType = type; | 1868 fBreakType = type; |
1886 reset(); | 1869 reset(); |
1887 } | 1870 } |
1888 | 1871 |
1889 U_NAMESPACE_END | 1872 U_NAMESPACE_END |
1890 | 1873 |
1891 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | 1874 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
OLD | NEW |