Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(31)

Side by Side Diff: source/common/rbbi.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: remove unused directories Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/common/putilimp.h ('k') | source/common/rbbidata.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 *************************************************************************** 2 ***************************************************************************
3 * Copyright (C) 1999-2013 International Business Machines Corporation 3 * Copyright (C) 1999-2014 International Business Machines Corporation
4 * and others. All rights reserved. 4 * and others. All rights reserved.
5 *************************************************************************** 5 ***************************************************************************
6 */ 6 */
7 // 7 //
8 // file: rbbi.c Contains the implementation of the rule based break iterator 8 // file: rbbi.c Contains the implementation of the rule based break iterator
9 // runtime engine and the API implementation for 9 // runtime engine and the API implementation for
10 // class RuleBasedBreakIterator 10 // class RuleBasedBreakIterator
11 // 11 //
12 12
13 #include "utypeinfo.h" // for 'typeid' to work 13 #include "utypeinfo.h" // for 'typeid' to work
(...skipping 497 matching lines...) Expand 10 before | Expand all | Expand 10 after
511 // contents as the old. If we can't set to the same position, it doesn't. 511 // contents as the old. If we can't set to the same position, it doesn't.
512 // The contents underlying the old utext might be invalid at this point, 512 // The contents underlying the old utext might be invalid at this point,
513 // so it's not safe to check directly. 513 // so it's not safe to check directly.
514 status = U_ILLEGAL_ARGUMENT_ERROR; 514 status = U_ILLEGAL_ARGUMENT_ERROR;
515 } 515 }
516 return *this; 516 return *this;
517 } 517 }
518 518
519 519
520 /** 520 /**
521 * Sets the current iteration position to the beginning of the text. 521 * Sets the current iteration position to the beginning of the text, position zero.
522 * @return The offset of the beginning of the text. 522 * @return The new iterator position, which is zero.
523 */ 523 */
524 int32_t RuleBasedBreakIterator::first(void) { 524 int32_t RuleBasedBreakIterator::first(void) {
525 reset(); 525 reset();
526 fLastRuleStatusIndex = 0; 526 fLastRuleStatusIndex = 0;
527 fLastStatusIndexValid = TRUE; 527 fLastStatusIndexValid = TRUE;
528 //if (fText == NULL) 528 //if (fText == NULL)
529 // return BreakIterator::DONE; 529 // return BreakIterator::DONE;
530 530
531 utext_setNativeIndex(fText, 0); 531 utext_setNativeIndex(fText, 0);
532 return 0; 532 return 0;
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
585 int32_t pos = fCachedBreakPositions[fPositionInCache]; 585 int32_t pos = fCachedBreakPositions[fPositionInCache];
586 utext_setNativeIndex(fText, pos); 586 utext_setNativeIndex(fText, pos);
587 return pos; 587 return pos;
588 } 588 }
589 else { 589 else {
590 reset(); 590 reset();
591 } 591 }
592 } 592 }
593 593
594 int32_t startPos = current(); 594 int32_t startPos = current();
595 fDictionaryCharCount = 0;
595 int32_t result = handleNext(fData->fForwardTable); 596 int32_t result = handleNext(fData->fForwardTable);
596 if (fDictionaryCharCount > 0) { 597 if (fDictionaryCharCount > 0) {
597 result = checkDictionary(startPos, result, FALSE); 598 result = checkDictionary(startPos, result, FALSE);
598 } 599 }
599 return result; 600 return result;
600 } 601 }
601 602
602 /** 603 /**
603 * Advances the iterator backwards, to the last boundary preceding this one. 604 * Advances the iterator backwards, to the last boundary preceding this one.
604 * @return The position of the last boundary position preceding this one. 605 * @return The position of the last boundary position preceding this one.
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
639 result = checkDictionary(result, startPos, TRUE); 640 result = checkDictionary(result, startPos, TRUE);
640 } 641 }
641 return result; 642 return result;
642 } 643 }
643 644
644 // old rule syntax 645 // old rule syntax
645 // set things up. handlePrevious() will back us up to some valid 646 // set things up. handlePrevious() will back us up to some valid
646 // break position before the current position (we back our internal 647 // break position before the current position (we back our internal
647 // iterator up one step to prevent handlePrevious() from returning 648 // iterator up one step to prevent handlePrevious() from returning
648 // the current position), but not necessarily the last one before 649 // the current position), but not necessarily the last one before
649
650 // where we started 650 // where we started
651 651
652 int32_t start = current(); 652 int32_t start = current();
653 653
654 (void)UTEXT_PREVIOUS32(fText); 654 (void)UTEXT_PREVIOUS32(fText);
655 int32_t lastResult = handlePrevious(fData->fReverseTable); 655 int32_t lastResult = handlePrevious(fData->fReverseTable);
656 if (lastResult == UBRK_DONE) { 656 if (lastResult == UBRK_DONE) {
657 lastResult = 0; 657 lastResult = 0;
658 utext_setNativeIndex(fText, 0); 658 utext_setNativeIndex(fText, 0);
659 } 659 }
(...skipping 12 matching lines...) Expand all
672 } 672 }
673 lastResult = result; 673 lastResult = result;
674 lastTag = fLastRuleStatusIndex; 674 lastTag = fLastRuleStatusIndex;
675 breakTagValid = TRUE; 675 breakTagValid = TRUE;
676 } 676 }
677 677
678 // fLastBreakTag wants to have the value for section of text preceding 678 // fLastBreakTag wants to have the value for section of text preceding
679 // the result position that we are to return (in lastResult.) If 679 // the result position that we are to return (in lastResult.) If
680 // the backwards rules overshot and the above loop had to do two or more 680 // the backwards rules overshot and the above loop had to do two or more
681 // next()s to move up to the desired return position, we will have a valid 681 // next()s to move up to the desired return position, we will have a valid
682 // tag value. But, if handlePrevious() took us to exactly the correct result positon, 682 // tag value. But, if handlePrevious() took us to exactly the correct result position,
683 // we wont have a tag value for that position, which is only set by handleNext(). 683 // we wont have a tag value for that position, which is only set by handleNext().
684 684
685 // set the current iteration position to be the last break position 685 // Set the current iteration position to be the last break position
686 // before where we started, and then return that value 686 // before where we started, and then return that value.
687 utext_setNativeIndex(fText, lastResult); 687 utext_setNativeIndex(fText, lastResult);
688 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() 688 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
689 fLastStatusIndexValid = breakTagValid; 689 fLastStatusIndexValid = breakTagValid;
690 690
691 // No need to check the dictionary; it will have been handled by 691 // No need to check the dictionary; it will have been handled by
692 // next() 692 // next()
693 693
694 return lastResult; 694 return lastResult;
695 } 695 }
696 696
697 /** 697 /**
698 * Sets the iterator to refer to the first boundary position following 698 * Sets the iterator to refer to the first boundary position following
699 * the specified position. 699 * the specified position.
700 * @offset The position from which to begin searching for a break position. 700 * @offset The position from which to begin searching for a break position.
701 * @return The position of the first break after the current position. 701 * @return The position of the first break after the current position.
702 */ 702 */
703 int32_t RuleBasedBreakIterator::following(int32_t offset) { 703 int32_t RuleBasedBreakIterator::following(int32_t offset) {
704 // if the offset passed in is already past the end of the text,
705 // just return DONE; if it's before the beginning, return the
706 // text's starting offset
707 if (fText == NULL || offset >= utext_nativeLength(fText)) {
708 last();
709 return next();
710 }
711 else if (offset < 0) {
712 return first();
713 }
714
715 // Move requested offset to a code point start. It might be on a trail surrogate,
716 // or on a trail byte if the input is UTF-8.
717 utext_setNativeIndex(fText, offset);
718 offset = utext_getNativeIndex(fText);
719
704 // if we have cached break positions and offset is in the range 720 // if we have cached break positions and offset is in the range
705 // covered by them, use them 721 // covered by them, use them
706 // TODO: could use binary search 722 // TODO: could use binary search
707 // TODO: what if offset is outside range, but break is not? 723 // TODO: what if offset is outside range, but break is not?
708 if (fCachedBreakPositions != NULL) { 724 if (fCachedBreakPositions != NULL) {
709 if (offset >= fCachedBreakPositions[0] 725 if (offset >= fCachedBreakPositions[0]
710 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 726 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
711 fPositionInCache = 0; 727 fPositionInCache = 0;
712 // We are guaranteed not to leave the array due to range test above 728 // We are guaranteed not to leave the array due to range test above
713 while (offset >= fCachedBreakPositions[fPositionInCache]) { 729 while (offset >= fCachedBreakPositions[fPositionInCache]) {
714 ++fPositionInCache; 730 ++fPositionInCache;
715 } 731 }
716 int32_t pos = fCachedBreakPositions[fPositionInCache]; 732 int32_t pos = fCachedBreakPositions[fPositionInCache];
717 utext_setNativeIndex(fText, pos); 733 utext_setNativeIndex(fText, pos);
718 return pos; 734 return pos;
719 } 735 }
720 else { 736 else {
721 reset(); 737 reset();
722 } 738 }
723 } 739 }
724 740
725 // if the offset passed in is already past the end of the text, 741 // Set our internal iteration position (temporarily)
726 // just return DONE; if it's before the beginning, return the
727 // text's starting offset
728 fLastRuleStatusIndex = 0;
729 fLastStatusIndexValid = TRUE;
730 if (fText == NULL || offset >= utext_nativeLength(fText)) {
731 last();
732 return next();
733 }
734 else if (offset < 0) {
735 return first();
736 }
737
738 // otherwise, set our internal iteration position (temporarily)
739 // to the position passed in. If this is the _beginning_ position, 742 // to the position passed in. If this is the _beginning_ position,
740 // then we can just use next() to get our return value 743 // then we can just use next() to get our return value
741 744
742 int32_t result = 0; 745 int32_t result = 0;
743 746
744 if (fData->fSafeRevTable != NULL) { 747 if (fData->fSafeRevTable != NULL) {
745 // new rule syntax 748 // new rule syntax
746 utext_setNativeIndex(fText, offset); 749 utext_setNativeIndex(fText, offset);
747 // move forward one codepoint to prepare for moving back to a 750 // move forward one codepoint to prepare for moving back to a
748 // safe point. 751 // safe point.
749 // this handles offset being between a supplementary character 752 // this handles offset being between a supplementary character
753 // TODO: is this still needed, with move to code point boundary handled above?
750 (void)UTEXT_NEXT32(fText); 754 (void)UTEXT_NEXT32(fText);
751 // handlePrevious will move most of the time to < 1 boundary away 755 // handlePrevious will move most of the time to < 1 boundary away
752 handlePrevious(fData->fSafeRevTable); 756 handlePrevious(fData->fSafeRevTable);
753 int32_t result = next(); 757 int32_t result = next();
754 while (result <= offset) { 758 while (result <= offset) {
755 result = next(); 759 result = next();
756 } 760 }
757 return result; 761 return result;
758 } 762 }
759 if (fData->fSafeFwdTable != NULL) { 763 if (fData->fSafeFwdTable != NULL) {
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
802 return result; 806 return result;
803 } 807 }
804 808
805 /** 809 /**
806 * Sets the iterator to refer to the last boundary position before the 810 * Sets the iterator to refer to the last boundary position before the
807 * specified position. 811 * specified position.
808 * @offset The position to begin searching for a break from. 812 * @offset The position to begin searching for a break from.
809 * @return The position of the last boundary before the starting position. 813 * @return The position of the last boundary before the starting position.
810 */ 814 */
811 int32_t RuleBasedBreakIterator::preceding(int32_t offset) { 815 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
816 // if the offset passed in is already past the end of the text,
817 // just return DONE; if it's before the beginning, return the
818 // text's starting offset
819 if (fText == NULL || offset > utext_nativeLength(fText)) {
820 return last();
821 }
822 else if (offset < 0) {
823 return first();
824 }
825
826 // Move requested offset to a code point start. It might be on a trail surrogate,
827 // or on a trail byte if the input is UTF-8.
828 utext_setNativeIndex(fText, offset);
829 offset = utext_getNativeIndex(fText);
830
812 // if we have cached break positions and offset is in the range 831 // if we have cached break positions and offset is in the range
813 // covered by them, use them 832 // covered by them, use them
814 if (fCachedBreakPositions != NULL) { 833 if (fCachedBreakPositions != NULL) {
815 // TODO: binary search? 834 // TODO: binary search?
816 // TODO: What if offset is outside range, but break is not? 835 // TODO: What if offset is outside range, but break is not?
817 if (offset > fCachedBreakPositions[0] 836 if (offset > fCachedBreakPositions[0]
818 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1] ) { 837 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1] ) {
819 fPositionInCache = 0; 838 fPositionInCache = 0;
820 while (fPositionInCache < fNumCachedBreakPositions 839 while (fPositionInCache < fNumCachedBreakPositions
821 && offset > fCachedBreakPositions[fPositionInCache]) 840 && offset > fCachedBreakPositions[fPositionInCache])
822 ++fPositionInCache; 841 ++fPositionInCache;
823 --fPositionInCache; 842 --fPositionInCache;
824 // If we're at the beginning of the cache, need to reevaluate the 843 // If we're at the beginning of the cache, need to reevaluate the
825 // rule status 844 // rule status
826 if (fPositionInCache <= 0) { 845 if (fPositionInCache <= 0) {
827 fLastStatusIndexValid = FALSE; 846 fLastStatusIndexValid = FALSE;
828 } 847 }
829 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]) ; 848 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]) ;
830 return fCachedBreakPositions[fPositionInCache]; 849 return fCachedBreakPositions[fPositionInCache];
831 } 850 }
832 else { 851 else {
833 reset(); 852 reset();
834 } 853 }
835 } 854 }
836 855
837 // if the offset passed in is already past the end of the text,
838 // just return DONE; if it's before the beginning, return the
839 // text's starting offset
840 if (fText == NULL || offset > utext_nativeLength(fText)) {
841 // return BreakIterator::DONE;
842 return last();
843 }
844 else if (offset < 0) {
845 return first();
846 }
847
848 // if we start by updating the current iteration position to the 856 // if we start by updating the current iteration position to the
849 // position specified by the caller, we can just use previous() 857 // position specified by the caller, we can just use previous()
850 // to carry out this operation 858 // to carry out this operation
851 859
852 if (fData->fSafeFwdTable != NULL) { 860 if (fData->fSafeFwdTable != NULL) {
853 // new rule syntax 861 // new rule syntax
854 utext_setNativeIndex(fText, offset); 862 utext_setNativeIndex(fText, offset);
855 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 863 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
856 if (newOffset != offset) { 864 if (newOffset != offset) {
857 // Will come here if specified offset was not a code point boundary AND 865 // Will come here if specified offset was not a code point boundary AND
(...skipping 713 matching lines...) Expand 10 before | Expand all | Expand 10 after
1571 // Reset the old break cache first. 1579 // Reset the old break cache first.
1572 reset(); 1580 reset();
1573 1581
1574 // note: code segment below assumes that dictionary chars are in the 1582 // note: code segment below assumes that dictionary chars are in the
1575 // startPos-endPos range 1583 // startPos-endPos range
1576 // value returned should be next character in sequence 1584 // value returned should be next character in sequence
1577 if ((endPos - startPos) <= 1) { 1585 if ((endPos - startPos) <= 1) {
1578 return (reverse ? startPos : endPos); 1586 return (reverse ? startPos : endPos);
1579 } 1587 }
1580 1588
1581 // Bug 5532. The dictionary code will crash if the input text is UTF-8
1582 // because native indexes are different from UTF-16 indexes.
1583 // Temporary hack: skip dictionary lookup for UTF-8 encoded text.
1584 // It wont give the right breaks, but it's better than a crash.
1585 //
1586 // Check the type of the UText by checking its pFuncs field, which
1587 // is UText's function dispatch table. It will be the same for all
1588 // UTF-8 UTexts and different for any other UText type.
1589 //
1590 // We have no other type of UText available with non-UTF-16 native indexing.
1591 // This whole check will go away once the dictionary code is fixed.
1592 static const void *utext_utf8Funcs;
1593 if (utext_utf8Funcs == NULL) {
1594 // Cache the UTF-8 UText function pointer value.
1595 UErrorCode status = U_ZERO_ERROR;
1596 UText tempUText = UTEXT_INITIALIZER;
1597 utext_openUTF8(&tempUText, NULL, 0, &status);
1598 utext_utf8Funcs = tempUText.pFuncs;
1599 utext_close(&tempUText);
1600 }
1601 if (fText->pFuncs == utext_utf8Funcs) {
1602 return (reverse ? startPos : endPos);
1603 }
1604
1605 // Starting from the starting point, scan towards the proposed result, 1589 // Starting from the starting point, scan towards the proposed result,
1606 // looking for the first dictionary character (which may be the one 1590 // looking for the first dictionary character (which may be the one
1607 // we're on, if we're starting in the middle of a range). 1591 // we're on, if we're starting in the middle of a range).
1608 utext_setNativeIndex(fText, reverse ? endPos : startPos); 1592 utext_setNativeIndex(fText, reverse ? endPos : startPos);
1609 if (reverse) { 1593 if (reverse) {
1610 UTEXT_PREVIOUS32(fText); 1594 UTEXT_PREVIOUS32(fText);
1611 } 1595 }
1612 1596
1613 int32_t rangeStart = startPos; 1597 int32_t rangeStart = startPos;
1614 int32_t rangeEnd = endPos; 1598 int32_t rangeEnd = endPos;
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
1696 } 1680 }
1697 1681
1698 // Reload the loop variables for the next go-round 1682 // Reload the loop variables for the next go-round
1699 c = utext_current32(fText); 1683 c = utext_current32(fText);
1700 UTRIE_GET16(&fData->fTrie, c, category); 1684 UTRIE_GET16(&fData->fTrie, c, category);
1701 } 1685 }
1702 1686
1703 // If we found breaks, build a new break cache. The first and last entries must 1687 // If we found breaks, build a new break cache. The first and last entries must
1704 // be the original starting and ending position. 1688 // be the original starting and ending position.
1705 if (foundBreakCount > 0) { 1689 if (foundBreakCount > 0) {
1690 U_ASSERT(foundBreakCount == breaks.size());
1706 int32_t totalBreaks = foundBreakCount; 1691 int32_t totalBreaks = foundBreakCount;
1707 if (startPos < breaks.elementAti(0)) { 1692 if (startPos < breaks.elementAti(0)) {
1708 totalBreaks += 1; 1693 totalBreaks += 1;
1709 } 1694 }
1710 if (endPos > breaks.peeki()) { 1695 if (endPos > breaks.peeki()) {
1711 totalBreaks += 1; 1696 totalBreaks += 1;
1712 } 1697 }
1713 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int3 2_t)); 1698 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int3 2_t));
1714 if (fCachedBreakPositions != NULL) { 1699 if (fCachedBreakPositions != NULL) {
1715 int32_t out = 0; 1700 int32_t out = 0;
(...skipping 19 matching lines...) Expand all
1735 } 1720 }
1736 // If the allocation failed, just fall through to the "no breaks found" case. 1721 // If the allocation failed, just fall through to the "no breaks found" case.
1737 } 1722 }
1738 1723
1739 // If we get here, there were no language-based breaks. Set the text pointer 1724 // If we get here, there were no language-based breaks. Set the text pointer
1740 // to the original proposed break. 1725 // to the original proposed break.
1741 utext_setNativeIndex(fText, reverse ? startPos : endPos); 1726 utext_setNativeIndex(fText, reverse ? startPos : endPos);
1742 return (reverse ? startPos : endPos); 1727 return (reverse ? startPos : endPos);
1743 } 1728 }
1744 1729
1745 // defined in ucln_cmn.h
1746
1747 U_NAMESPACE_END 1730 U_NAMESPACE_END
1748 1731
1749 1732
1750 static icu::UStack *gLanguageBreakFactories = NULL; 1733 static icu::UStack *gLanguageBreakFactories = NULL;
1751 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; 1734 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
1752 1735
1753 /** 1736 /**
1754 * Release all static memory held by breakiterator. 1737 * Release all static memory held by breakiterator.
1755 */ 1738 */
1756 U_CDECL_BEGIN 1739 U_CDECL_BEGIN
(...skipping 125 matching lines...) Expand 10 before | Expand all | Expand 10 after
1882 }*/ 1865 }*/
1883 1866
1884 void RuleBasedBreakIterator::setBreakType(int32_t type) { 1867 void RuleBasedBreakIterator::setBreakType(int32_t type) {
1885 fBreakType = type; 1868 fBreakType = type;
1886 reset(); 1869 reset();
1887 } 1870 }
1888 1871
1889 U_NAMESPACE_END 1872 U_NAMESPACE_END
1890 1873
1891 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1874 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
OLDNEW
« no previous file with comments | « source/common/putilimp.h ('k') | source/common/rbbidata.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698