source/common/rbbi.cpp - Issue 845603002: Update ICU to 54.1 step 1

Side by Side Diff: source/common/rbbi.cpp

Issue 845603002: Update ICU to 54.1 step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master

Patch Set: remove unused directories Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 ***************************************************************************	2 ***************************************************************************

3 * Copyright (C) 1999-2013 International Business Machines Corporation	3 * Copyright (C) 1999-2014 International Business Machines Corporation

4 * and others. All rights reserved.	4 * and others. All rights reserved.

5 ***************************************************************************	5 ***************************************************************************

6 */	6 */

7 //	7 //

8 // file: rbbi.c Contains the implementation of the rule based break iterator	8 // file: rbbi.c Contains the implementation of the rule based break iterator

9 // runtime engine and the API implementation for	9 // runtime engine and the API implementation for

10 // class RuleBasedBreakIterator	10 // class RuleBasedBreakIterator

11 //	11 //

12	12

13 #include "utypeinfo.h" // for 'typeid' to work	13 #include "utypeinfo.h" // for 'typeid' to work

(...skipping 497 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
511 // contents as the old. If we can't set to the same position, it doesn't.	511 // contents as the old. If we can't set to the same position, it doesn't.

512 // The contents underlying the old utext might be invalid at this point,	512 // The contents underlying the old utext might be invalid at this point,

513 // so it's not safe to check directly.	513 // so it's not safe to check directly.

514 status = U_ILLEGAL_ARGUMENT_ERROR;	514 status = U_ILLEGAL_ARGUMENT_ERROR;

515 }	515 }

516 return *this;	516 return *this;

517 }	517 }

518	518

519	519

520 /**	520 /**

521 * Sets the current iteration position to the beginning of the text.	521 * Sets the current iteration position to the beginning of the text, position zero.

522 * @return The offset of the beginning of the text.	522 * @return The new iterator position, which is zero.

523 */	523 */

524 int32_t RuleBasedBreakIterator::first(void) {	524 int32_t RuleBasedBreakIterator::first(void) {

525 reset();	525 reset();

526 fLastRuleStatusIndex = 0;	526 fLastRuleStatusIndex = 0;

527 fLastStatusIndexValid = TRUE;	527 fLastStatusIndexValid = TRUE;

528 //if (fText == NULL)	528 //if (fText == NULL)

529 // return BreakIterator::DONE;	529 // return BreakIterator::DONE;

530	530

531 utext_setNativeIndex(fText, 0);	531 utext_setNativeIndex(fText, 0);

532 return 0;	532 return 0;

(...skipping 52 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
585 int32_t pos = fCachedBreakPositions[fPositionInCache];	585 int32_t pos = fCachedBreakPositions[fPositionInCache];

586 utext_setNativeIndex(fText, pos);	586 utext_setNativeIndex(fText, pos);

587 return pos;	587 return pos;

588 }	588 }

589 else {	589 else {

590 reset();	590 reset();

591 }	591 }

592 }	592 }

593	593

594 int32_t startPos = current();	594 int32_t startPos = current();

	595 fDictionaryCharCount = 0;

595 int32_t result = handleNext(fData->fForwardTable);	596 int32_t result = handleNext(fData->fForwardTable);

596 if (fDictionaryCharCount > 0) {	597 if (fDictionaryCharCount > 0) {

597 result = checkDictionary(startPos, result, FALSE);	598 result = checkDictionary(startPos, result, FALSE);

598 }	599 }

599 return result;	600 return result;

600 }	601 }

601	602

602 /**	603 /**

603 * Advances the iterator backwards, to the last boundary preceding this one.	604 * Advances the iterator backwards, to the last boundary preceding this one.

604 * @return The position of the last boundary position preceding this one.	605 * @return The position of the last boundary position preceding this one.

(...skipping 34 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
639 result = checkDictionary(result, startPos, TRUE);	640 result = checkDictionary(result, startPos, TRUE);

640 }	641 }

641 return result;	642 return result;

642 }	643 }

643	644

644 // old rule syntax	645 // old rule syntax

645 // set things up. handlePrevious() will back us up to some valid	646 // set things up. handlePrevious() will back us up to some valid

646 // break position before the current position (we back our internal	647 // break position before the current position (we back our internal

647 // iterator up one step to prevent handlePrevious() from returning	648 // iterator up one step to prevent handlePrevious() from returning

648 // the current position), but not necessarily the last one before	649 // the current position), but not necessarily the last one before

649

650 // where we started	650 // where we started

651	651

652 int32_t start = current();	652 int32_t start = current();

653	653

654 (void)UTEXT_PREVIOUS32(fText);	654 (void)UTEXT_PREVIOUS32(fText);

655 int32_t lastResult = handlePrevious(fData->fReverseTable);	655 int32_t lastResult = handlePrevious(fData->fReverseTable);

656 if (lastResult == UBRK_DONE) {	656 if (lastResult == UBRK_DONE) {

657 lastResult = 0;	657 lastResult = 0;

658 utext_setNativeIndex(fText, 0);	658 utext_setNativeIndex(fText, 0);

659 }	659 }

(...skipping 12 matching lines...) Expand all Loading...
672 }	672 }

673 lastResult = result;	673 lastResult = result;

674 lastTag = fLastRuleStatusIndex;	674 lastTag = fLastRuleStatusIndex;

675 breakTagValid = TRUE;	675 breakTagValid = TRUE;

676 }	676 }

677	677

678 // fLastBreakTag wants to have the value for section of text preceding	678 // fLastBreakTag wants to have the value for section of text preceding

679 // the result position that we are to return (in lastResult.) If	679 // the result position that we are to return (in lastResult.) If

680 // the backwards rules overshot and the above loop had to do two or more	680 // the backwards rules overshot and the above loop had to do two or more

681 // next()s to move up to the desired return position, we will have a valid	681 // next()s to move up to the desired return position, we will have a valid

682 // tag value. But, if handlePrevious() took us to exactly the correct result positon,	682 // tag value. But, if handlePrevious() took us to exactly the correct result position,

683 // we wont have a tag value for that position, which is only set by handleNext().	683 // we wont have a tag value for that position, which is only set by handleNext().

684	684

685 // set the current iteration position to be the last break position	685 // Set the current iteration position to be the last break position

686 // before where we started, and then return that value	686 // before where we started, and then return that value.

687 utext_setNativeIndex(fText, lastResult);	687 utext_setNativeIndex(fText, lastResult);

688 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()	688 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()

689 fLastStatusIndexValid = breakTagValid;	689 fLastStatusIndexValid = breakTagValid;

690	690

691 // No need to check the dictionary; it will have been handled by	691 // No need to check the dictionary; it will have been handled by

692 // next()	692 // next()

693	693

694 return lastResult;	694 return lastResult;

695 }	695 }

696	696

697 /**	697 /**

698 * Sets the iterator to refer to the first boundary position following	698 * Sets the iterator to refer to the first boundary position following

699 * the specified position.	699 * the specified position.

700 * @offset The position from which to begin searching for a break position.	700 * @offset The position from which to begin searching for a break position.

701 * @return The position of the first break after the current position.	701 * @return The position of the first break after the current position.

702 */	702 */

703 int32_t RuleBasedBreakIterator::following(int32_t offset) {	703 int32_t RuleBasedBreakIterator::following(int32_t offset) {

	704 // if the offset passed in is already past the end of the text,

	705 // just return DONE; if it's before the beginning, return the

	706 // text's starting offset

	707 if (fText == NULL || offset >= utext_nativeLength(fText)) {

	708 last();

	709 return next();

	710 }

	711 else if (offset < 0) {

	712 return first();

	713 }

	714

	715 // Move requested offset to a code point start. It might be on a trail surrogate,

	716 // or on a trail byte if the input is UTF-8.

	717 utext_setNativeIndex(fText, offset);

	718 offset = utext_getNativeIndex(fText);

	719

704 // if we have cached break positions and offset is in the range	720 // if we have cached break positions and offset is in the range

705 // covered by them, use them	721 // covered by them, use them

706 // TODO: could use binary search	722 // TODO: could use binary search

707 // TODO: what if offset is outside range, but break is not?	723 // TODO: what if offset is outside range, but break is not?

708 if (fCachedBreakPositions != NULL) {	724 if (fCachedBreakPositions != NULL) {

709 if (offset >= fCachedBreakPositions[0]	725 if (offset >= fCachedBreakPositions[0]

710 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {	726 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {

711 fPositionInCache = 0;	727 fPositionInCache = 0;

712 // We are guaranteed not to leave the array due to range test above	728 // We are guaranteed not to leave the array due to range test above

713 while (offset >= fCachedBreakPositions[fPositionInCache]) {	729 while (offset >= fCachedBreakPositions[fPositionInCache]) {

714 ++fPositionInCache;	730 ++fPositionInCache;

715 }	731 }

716 int32_t pos = fCachedBreakPositions[fPositionInCache];	732 int32_t pos = fCachedBreakPositions[fPositionInCache];

717 utext_setNativeIndex(fText, pos);	733 utext_setNativeIndex(fText, pos);

718 return pos;	734 return pos;

719 }	735 }

720 else {	736 else {

721 reset();	737 reset();

722 }	738 }

723 }	739 }

724	740

725 // if the offset passed in is already past the end of the text,	741 // Set our internal iteration position (temporarily)

726 // just return DONE; if it's before the beginning, return the

727 // text's starting offset

728 fLastRuleStatusIndex = 0;

729 fLastStatusIndexValid = TRUE;

730 if (fText == NULL || offset >= utext_nativeLength(fText)) {

731 last();

732 return next();

733 }

734 else if (offset < 0) {

735 return first();

736 }

737

738 // otherwise, set our internal iteration position (temporarily)

739 // to the position passed in. If this is the _beginning_ position,	742 // to the position passed in. If this is the _beginning_ position,

740 // then we can just use next() to get our return value	743 // then we can just use next() to get our return value

741	744

742 int32_t result = 0;	745 int32_t result = 0;

743	746

744 if (fData->fSafeRevTable != NULL) {	747 if (fData->fSafeRevTable != NULL) {

745 // new rule syntax	748 // new rule syntax

746 utext_setNativeIndex(fText, offset);	749 utext_setNativeIndex(fText, offset);

747 // move forward one codepoint to prepare for moving back to a	750 // move forward one codepoint to prepare for moving back to a

748 // safe point.	751 // safe point.

749 // this handles offset being between a supplementary character	752 // this handles offset being between a supplementary character

	753 // TODO: is this still needed, with move to code point boundary handled above?

750 (void)UTEXT_NEXT32(fText);	754 (void)UTEXT_NEXT32(fText);

751 // handlePrevious will move most of the time to < 1 boundary away	755 // handlePrevious will move most of the time to < 1 boundary away

752 handlePrevious(fData->fSafeRevTable);	756 handlePrevious(fData->fSafeRevTable);

753 int32_t result = next();	757 int32_t result = next();

754 while (result <= offset) {	758 while (result <= offset) {

755 result = next();	759 result = next();

756 }	760 }

757 return result;	761 return result;

758 }	762 }

759 if (fData->fSafeFwdTable != NULL) {	763 if (fData->fSafeFwdTable != NULL) {

(...skipping 42 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
802 return result;	806 return result;

803 }	807 }

804	808

805 /**	809 /**

806 * Sets the iterator to refer to the last boundary position before the	810 * Sets the iterator to refer to the last boundary position before the

807 * specified position.	811 * specified position.

808 * @offset The position to begin searching for a break from.	812 * @offset The position to begin searching for a break from.

809 * @return The position of the last boundary before the starting position.	813 * @return The position of the last boundary before the starting position.

810 */	814 */

811 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {	815 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {

	816 // if the offset passed in is already past the end of the text,

	817 // just return DONE; if it's before the beginning, return the

	818 // text's starting offset

	819 if (fText == NULL || offset > utext_nativeLength(fText)) {

	820 return last();

	821 }

	822 else if (offset < 0) {

	823 return first();

	824 }

	825

	826 // Move requested offset to a code point start. It might be on a trail surrogate,

	827 // or on a trail byte if the input is UTF-8.

	828 utext_setNativeIndex(fText, offset);

	829 offset = utext_getNativeIndex(fText);

	830

812 // if we have cached break positions and offset is in the range	831 // if we have cached break positions and offset is in the range

813 // covered by them, use them	832 // covered by them, use them

814 if (fCachedBreakPositions != NULL) {	833 if (fCachedBreakPositions != NULL) {

815 // TODO: binary search?	834 // TODO: binary search?

816 // TODO: What if offset is outside range, but break is not?	835 // TODO: What if offset is outside range, but break is not?

817 if (offset > fCachedBreakPositions[0]	836 if (offset > fCachedBreakPositions[0]

818 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {	837 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {

819 fPositionInCache = 0;	838 fPositionInCache = 0;

820 while (fPositionInCache < fNumCachedBreakPositions	839 while (fPositionInCache < fNumCachedBreakPositions

821 && offset > fCachedBreakPositions[fPositionInCache])	840 && offset > fCachedBreakPositions[fPositionInCache])

822 ++fPositionInCache;	841 ++fPositionInCache;

823 --fPositionInCache;	842 --fPositionInCache;

824 // If we're at the beginning of the cache, need to reevaluate the	843 // If we're at the beginning of the cache, need to reevaluate the

825 // rule status	844 // rule status

826 if (fPositionInCache <= 0) {	845 if (fPositionInCache <= 0) {

827 fLastStatusIndexValid = FALSE;	846 fLastStatusIndexValid = FALSE;

828 }	847 }

829 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);	848 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);

830 return fCachedBreakPositions[fPositionInCache];	849 return fCachedBreakPositions[fPositionInCache];

831 }	850 }

832 else {	851 else {

833 reset();	852 reset();

834 }	853 }

835 }	854 }

836	855

837 // if the offset passed in is already past the end of the text,

838 // just return DONE; if it's before the beginning, return the

839 // text's starting offset

840 if (fText == NULL || offset > utext_nativeLength(fText)) {

841 // return BreakIterator::DONE;

842 return last();

843 }

844 else if (offset < 0) {

845 return first();

846 }

847

848 // if we start by updating the current iteration position to the	856 // if we start by updating the current iteration position to the

849 // position specified by the caller, we can just use previous()	857 // position specified by the caller, we can just use previous()

850 // to carry out this operation	858 // to carry out this operation

851	859

852 if (fData->fSafeFwdTable != NULL) {	860 if (fData->fSafeFwdTable != NULL) {

853 // new rule syntax	861 // new rule syntax

854 utext_setNativeIndex(fText, offset);	862 utext_setNativeIndex(fText, offset);

855 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);	863 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);

856 if (newOffset != offset) {	864 if (newOffset != offset) {

857 // Will come here if specified offset was not a code point boundary AND	865 // Will come here if specified offset was not a code point boundary AND

(...skipping 713 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1571 // Reset the old break cache first.	1579 // Reset the old break cache first.

1572 reset();	1580 reset();

1573	1581

1574 // note: code segment below assumes that dictionary chars are in the	1582 // note: code segment below assumes that dictionary chars are in the

1575 // startPos-endPos range	1583 // startPos-endPos range

1576 // value returned should be next character in sequence	1584 // value returned should be next character in sequence

1577 if ((endPos - startPos) <= 1) {	1585 if ((endPos - startPos) <= 1) {

1578 return (reverse ? startPos : endPos);	1586 return (reverse ? startPos : endPos);

1579 }	1587 }

1580	1588

1581 // Bug 5532. The dictionary code will crash if the input text is UTF-8

1582 // because native indexes are different from UTF-16 indexes.

1583 // Temporary hack: skip dictionary lookup for UTF-8 encoded text.

1584 // It wont give the right breaks, but it's better than a crash.

1585 //

1586 // Check the type of the UText by checking its pFuncs field, which

1587 // is UText's function dispatch table. It will be the same for all

1588 // UTF-8 UTexts and different for any other UText type.

1589 //

1590 // We have no other type of UText available with non-UTF-16 native indexing.

1591 // This whole check will go away once the dictionary code is fixed.

1592 static const void *utext_utf8Funcs;

1593 if (utext_utf8Funcs == NULL) {

1594 // Cache the UTF-8 UText function pointer value.

1595 UErrorCode status = U_ZERO_ERROR;

1596 UText tempUText = UTEXT_INITIALIZER;

1597 utext_openUTF8(&tempUText, NULL, 0, &status);

1598 utext_utf8Funcs = tempUText.pFuncs;

1599 utext_close(&tempUText);

1600 }

1601 if (fText->pFuncs == utext_utf8Funcs) {

1602 return (reverse ? startPos : endPos);

1603 }

1604

1605 // Starting from the starting point, scan towards the proposed result,	1589 // Starting from the starting point, scan towards the proposed result,

1606 // looking for the first dictionary character (which may be the one	1590 // looking for the first dictionary character (which may be the one

1607 // we're on, if we're starting in the middle of a range).	1591 // we're on, if we're starting in the middle of a range).

1608 utext_setNativeIndex(fText, reverse ? endPos : startPos);	1592 utext_setNativeIndex(fText, reverse ? endPos : startPos);

1609 if (reverse) {	1593 if (reverse) {

1610 UTEXT_PREVIOUS32(fText);	1594 UTEXT_PREVIOUS32(fText);

1611 }	1595 }

1612	1596

1613 int32_t rangeStart = startPos;	1597 int32_t rangeStart = startPos;

1614 int32_t rangeEnd = endPos;	1598 int32_t rangeEnd = endPos;

(...skipping 81 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1696 }	1680 }

1697	1681

1698 // Reload the loop variables for the next go-round	1682 // Reload the loop variables for the next go-round

1699 c = utext_current32(fText);	1683 c = utext_current32(fText);

1700 UTRIE_GET16(&fData->fTrie, c, category);	1684 UTRIE_GET16(&fData->fTrie, c, category);

1701 }	1685 }

1702	1686

1703 // If we found breaks, build a new break cache. The first and last entries must	1687 // If we found breaks, build a new break cache. The first and last entries must

1704 // be the original starting and ending position.	1688 // be the original starting and ending position.

1705 if (foundBreakCount > 0) {	1689 if (foundBreakCount > 0) {

	1690 U_ASSERT(foundBreakCount == breaks.size());

1706 int32_t totalBreaks = foundBreakCount;	1691 int32_t totalBreaks = foundBreakCount;

1707 if (startPos < breaks.elementAti(0)) {	1692 if (startPos < breaks.elementAti(0)) {

1708 totalBreaks += 1;	1693 totalBreaks += 1;

1709 }	1694 }

1710 if (endPos > breaks.peeki()) {	1695 if (endPos > breaks.peeki()) {

1711 totalBreaks += 1;	1696 totalBreaks += 1;

1712 }	1697 }

1713 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));	1698 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));

1714 if (fCachedBreakPositions != NULL) {	1699 if (fCachedBreakPositions != NULL) {

1715 int32_t out = 0;	1700 int32_t out = 0;

(...skipping 19 matching lines...) Expand all Loading...
1735 }	1720 }

1736 // If the allocation failed, just fall through to the "no breaks found" case.	1721 // If the allocation failed, just fall through to the "no breaks found" case.

1737 }	1722 }

1738	1723

1739 // If we get here, there were no language-based breaks. Set the text pointer	1724 // If we get here, there were no language-based breaks. Set the text pointer

1740 // to the original proposed break.	1725 // to the original proposed break.

1741 utext_setNativeIndex(fText, reverse ? startPos : endPos);	1726 utext_setNativeIndex(fText, reverse ? startPos : endPos);

1742 return (reverse ? startPos : endPos);	1727 return (reverse ? startPos : endPos);

1743 }	1728 }

1744	1729

1745 // defined in ucln_cmn.h

1746

1747 U_NAMESPACE_END	1730 U_NAMESPACE_END

1748	1731

1749	1732

1750 static icu::UStack *gLanguageBreakFactories = NULL;	1733 static icu::UStack *gLanguageBreakFactories = NULL;

1751 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;	1734 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;

1752	1735

1753 /**	1736 /**

1754 * Release all static memory held by breakiterator.	1737 * Release all static memory held by breakiterator.

1755 */	1738 */

1756 U_CDECL_BEGIN	1739 U_CDECL_BEGIN

(...skipping 125 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1882 }*/	1865 }*/

1883	1866

1884 void RuleBasedBreakIterator::setBreakType(int32_t type) {	1867 void RuleBasedBreakIterator::setBreakType(int32_t type) {

1885 fBreakType = type;	1868 fBreakType = type;

1886 reset();	1869 reset();

1887 }	1870 }

1888	1871

1889 U_NAMESPACE_END	1872 U_NAMESPACE_END

1890	1873

1891 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */	1874 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

OLD	NEW

« no previous file with comments | « source/common/putilimp.h ('k') | source/common/rbbidata.h » ('j') | no next file with comments »