Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(294)

Side by Side Diff: source/common/ustrcase.cpp

Issue 1621843002: ICU 56 update step 1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@561
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/common/ustr_imp.h ('k') | source/common/ustrenum.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 ******************************************************************************* 2 *******************************************************************************
3 * 3 *
4 * Copyright (C) 2001-2014, International Business Machines 4 * Copyright (C) 2001-2015, International Business Machines
5 * Corporation and others. All Rights Reserved. 5 * Corporation and others. All Rights Reserved.
6 * 6 *
7 ******************************************************************************* 7 *******************************************************************************
8 * file name: ustrcase.cpp 8 * file name: ustrcase.cpp
9 * encoding: US-ASCII 9 * encoding: US-ASCII
10 * tab size: 8 (not used) 10 * tab size: 8 (not used)
11 * indentation:4 11 * indentation:4
12 * 12 *
13 * created on: 2002feb20 13 * created on: 2002feb20
14 * created by: Markus W. Scherer 14 * created by: Markus W. Scherer
15 * 15 *
16 * Implementation file for string casing C API functions. 16 * Implementation file for string casing C API functions.
17 * Uses functions from uchar.c for basic functionality that requires access 17 * Uses functions from uchar.c for basic functionality that requires access
18 * to the Unicode Character Database (uprops.dat). 18 * to the Unicode Character Database (uprops.dat).
19 */ 19 */
20 20
21 #include "unicode/utypes.h" 21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h" 22 #include "unicode/brkiter.h"
23 #include "unicode/ustring.h" 23 #include "unicode/ustring.h"
24 #include "unicode/ucasemap.h" 24 #include "unicode/ucasemap.h"
25 #include "unicode/ubrk.h" 25 #include "unicode/ubrk.h"
26 #include "unicode/utf.h" 26 #include "unicode/utf.h"
27 #include "unicode/utf16.h" 27 #include "unicode/utf16.h"
28 #include "cmemory.h" 28 #include "cmemory.h"
29 #include "ucase.h" 29 #include "ucase.h"
30 #include "ustr_imp.h" 30 #include "ustr_imp.h"
31 #include "uassert.h"
31 32
32 U_NAMESPACE_USE 33 U_NAMESPACE_USE
33 34
34 /* string casing ------------------------------------------------------------ */ 35 /* string casing ------------------------------------------------------------ */
35 36
36 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */ 37 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
37 static inline int32_t 38 static inline int32_t
38 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, 39 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
39 int32_t result, const UChar *s) { 40 int32_t result, const UChar *s) {
40 UChar32 c; 41 UChar32 c;
(...skipping 415 matching lines...) Expand 10 before | Expand all | Expand 10 after
456 * It makes caseless (but not canonical caseless) matches independent of 457 * It makes caseless (but not canonical caseless) matches independent of
457 * the normalization code. 458 * the normalization code.
458 */ 459 */
459 460
460 /* stack element for previous-level source/decomposition pointers */ 461 /* stack element for previous-level source/decomposition pointers */
461 struct CmpEquivLevel { 462 struct CmpEquivLevel {
462 const UChar *start, *s, *limit; 463 const UChar *start, *s, *limit;
463 }; 464 };
464 typedef struct CmpEquivLevel CmpEquivLevel; 465 typedef struct CmpEquivLevel CmpEquivLevel;
465 466
466 /* internal function */ 467 /**
467 U_CFUNC int32_t 468 * Internal implementation code comparing string with case fold.
468 u_strcmpFold(const UChar *s1, int32_t length1, 469 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch( ).
469 const UChar *s2, int32_t length2, 470 *
470 uint32_t options, 471 * @param s1 input string 1
471 UErrorCode *pErrorCode) { 472 * @param length1 length of string 1, or -1 (NULL terminated)
473 * @param s2 input string 2
474 * @param length2 length of string 2, or -1 (NULL terminated)
475 * @param options compare options
476 * @param matchLen1 (output) length of partial prefix match in s1
477 * @param matchLen2 (output) length of partial prefix match in s2
478 * @param pErrorCode receives error status
479 * @return The result of comparison
480 */
481 static int32_t _cmpFold(
482 const UChar *s1, int32_t length1,
483 const UChar *s2, int32_t length2,
484 uint32_t options,
485 int32_t *matchLen1, int32_t *matchLen2,
486 UErrorCode *pErrorCode) {
487 int32_t cmpRes = 0;
488
472 const UCaseProps *csp; 489 const UCaseProps *csp;
473 490
474 /* current-level start/limit - s1/s2 as current */ 491 /* current-level start/limit - s1/s2 as current */
475 const UChar *start1, *start2, *limit1, *limit2; 492 const UChar *start1, *start2, *limit1, *limit2;
476 493
494 /* points to the original start address */
495 const UChar *org1, *org2;
496
497 /* points to the end of match + 1 */
498 const UChar *m1, *m2;
499
477 /* case folding variables */ 500 /* case folding variables */
478 const UChar *p; 501 const UChar *p;
479 int32_t length; 502 int32_t length;
480 503
481 /* stacks of previous-level start/current/limit */ 504 /* stacks of previous-level start/current/limit */
482 CmpEquivLevel stack1[2], stack2[2]; 505 CmpEquivLevel stack1[2], stack2[2];
483 506
484 /* case folding buffers, only use current-level start/limit */ 507 /* case folding buffers, only use current-level start/limit */
485 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; 508 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
486 509
487 /* track which is the current level per string */ 510 /* track which is the current level per string */
488 int32_t level1, level2; 511 int32_t level1, level2;
489 512
490 /* current code units, and code points for lookups */ 513 /* current code units, and code points for lookups */
491 UChar32 c1, c2, cp1, cp2; 514 UChar32 c1, c2, cp1, cp2;
492 515
493 /* no argument error checking because this itself is not an API */ 516 /* no argument error checking because this itself is not an API */
494 517
495 /* 518 /*
496 * assume that at least the option U_COMPARE_IGNORE_CASE is set 519 * assume that at least the option U_COMPARE_IGNORE_CASE is set
497 * otherwise this function would have to behave exactly as uprv_strCompare() 520 * otherwise this function would have to behave exactly as uprv_strCompare()
498 */ 521 */
499 csp=ucase_getSingleton(); 522 csp=ucase_getSingleton();
500 if(U_FAILURE(*pErrorCode)) { 523 if(U_FAILURE(*pErrorCode)) {
501 return 0; 524 return 0;
502 } 525 }
503 526
504 /* initialize */ 527 /* initialize */
505 start1=s1; 528 if(matchLen1) {
529 U_ASSERT(matchLen2 !=NULL);
530 *matchLen1=0;
531 *matchLen2=0;
532 }
533
534 start1=m1=org1=s1;
506 if(length1==-1) { 535 if(length1==-1) {
507 limit1=NULL; 536 limit1=NULL;
508 } else { 537 } else {
509 limit1=s1+length1; 538 limit1=s1+length1;
510 } 539 }
511 540
512 start2=s2; 541 start2=m2=org2=s2;
513 if(length2==-1) { 542 if(length2==-1) {
514 limit2=NULL; 543 limit2=NULL;
515 } else { 544 } else {
516 limit2=s2+length2; 545 limit2=s2+length2;
517 } 546 }
518 547
519 level1=level2=0; 548 level1=level2=0;
520 c1=c2=-1; 549 c1=c2=-1;
521 550
522 /* comparison loop */ 551 /* comparison loop */
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
570 s2=stack2[level2].s; /*Not uninitialized*/ 599 s2=stack2[level2].s; /*Not uninitialized*/
571 limit2=stack2[level2].limit; /*Not uninitialized*/ 600 limit2=stack2[level2].limit; /*Not uninitialized*/
572 } 601 }
573 } 602 }
574 603
575 /* 604 /*
576 * compare c1 and c2 605 * compare c1 and c2
577 * either variable c1, c2 is -1 only if the corresponding string is fini shed 606 * either variable c1, c2 is -1 only if the corresponding string is fini shed
578 */ 607 */
579 if(c1==c2) { 608 if(c1==c2) {
609 const UChar *next1, *next2;
610
580 if(c1<0) { 611 if(c1<0) {
581 return 0; /* c1==c2==-1 indicating end of strings */ 612 cmpRes=0; /* c1==c2==-1 indicating end of strings */
613 break;
614 }
615
616 /*
617 * Note: Move the match positions in both strings at the same time
618 * only when corresponding code point(s) in the original string s
619 * are fully consumed. For example, when comparing s1="Fust" an d
620 * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matc hes
621 * the first code point in the case-folded data. But the second "s"
622 * has no matching code point in s1, so this implementation ret urns
623 * 2 as the prefix match length ("Fu").
624 */
625 next1=next2=NULL;
626 if(level1==0) {
627 next1=s1;
628 } else if(s1==limit1) {
629 /* Note: This implementation only use a single level of stack.
630 * If this code needs to be changed to use multiple levels
631 * of stacks, the code above should check if the current
632 * code is at the end of all stacks.
633 */
634 U_ASSERT(level1==1);
635
636 /* is s1 at the end of the current stack? */
637 next1=stack1[0].s;
638 }
639
640 if (next1!=NULL) {
641 if(level2==0) {
642 next2=s2;
643 } else if(s2==limit2) {
644 U_ASSERT(level2==1);
645
646 /* is s2 at the end of the current stack? */
647 next2=stack2[0].s;
648 }
649 if(next2!=NULL) {
650 m1=next1;
651 m2=next2;
652 }
582 } 653 }
583 c1=c2=-1; /* make us fetch new code units */ 654 c1=c2=-1; /* make us fetch new code units */
584 continue; 655 continue;
585 } else if(c1<0) { 656 } else if(c1<0) {
586 return -1; /* string 1 ends before string 2 */ 657 cmpRes=-1; /* string 1 ends before string 2 */
658 break;
587 } else if(c2<0) { 659 } else if(c2<0) {
588 return 1; /* string 2 ends before string 1 */ 660 cmpRes=1; /* string 2 ends before string 1 */
661 break;
589 } 662 }
590 /* c1!=c2 && c1>=0 && c2>=0 */ 663 /* c1!=c2 && c1>=0 && c2>=0 */
591 664
592 /* get complete code points for c1, c2 for lookups if either is a surrog ate */ 665 /* get complete code points for c1, c2 for lookups if either is a surrog ate */
593 cp1=c1; 666 cp1=c1;
594 if(U_IS_SURROGATE(c1)) { 667 if(U_IS_SURROGATE(c1)) {
595 UChar c; 668 UChar c;
596 669
597 if(U_IS_SURROGATE_LEAD(c1)) { 670 if(U_IS_SURROGATE_LEAD(c1)) {
598 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { 671 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
637 ++s1; 710 ++s1;
638 } else /* isTrail(c1) */ { 711 } else /* isTrail(c1) */ {
639 /* 712 /*
640 * we got a supplementary code point when hitting its trail surrogate, 713 * we got a supplementary code point when hitting its trail surrogate,
641 * therefore the lead surrogate must have been the same as i n the other string; 714 * therefore the lead surrogate must have been the same as i n the other string;
642 * compare this decomposition with the lead surrogate in the other string 715 * compare this decomposition with the lead surrogate in the other string
643 * remember that this simulates bulk text replacement: 716 * remember that this simulates bulk text replacement:
644 * the decomposition would replace the entire code point 717 * the decomposition would replace the entire code point
645 */ 718 */
646 --s2; 719 --s2;
720 --m2;
647 c2=*(s2-1); 721 c2=*(s2-1);
648 } 722 }
649 } 723 }
650 724
651 /* push current level pointers */ 725 /* push current level pointers */
652 stack1[0].start=start1; 726 stack1[0].start=start1;
653 stack1[0].s=s1; 727 stack1[0].s=s1;
654 stack1[0].limit=limit1; 728 stack1[0].limit=limit1;
655 ++level1; 729 ++level1;
656 730
(...skipping 25 matching lines...) Expand all
682 ++s2; 756 ++s2;
683 } else /* isTrail(c2) */ { 757 } else /* isTrail(c2) */ {
684 /* 758 /*
685 * we got a supplementary code point when hitting its trail surrogate, 759 * we got a supplementary code point when hitting its trail surrogate,
686 * therefore the lead surrogate must have been the same as i n the other string; 760 * therefore the lead surrogate must have been the same as i n the other string;
687 * compare this decomposition with the lead surrogate in the other string 761 * compare this decomposition with the lead surrogate in the other string
688 * remember that this simulates bulk text replacement: 762 * remember that this simulates bulk text replacement:
689 * the decomposition would replace the entire code point 763 * the decomposition would replace the entire code point
690 */ 764 */
691 --s1; 765 --s1;
766 --m2;
692 c1=*(s1-1); 767 c1=*(s1-1);
693 } 768 }
694 } 769 }
695 770
696 /* push current level pointers */ 771 /* push current level pointers */
697 stack2[0].start=start2; 772 stack2[0].start=start2;
698 stack2[0].s=s2; 773 stack2[0].s=s2;
699 stack2[0].limit=limit2; 774 stack2[0].limit=limit2;
700 ++level2; 775 ++level2;
701 776
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
750 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || 825 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
751 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) 826 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
752 ) { 827 ) {
753 /* part of a surrogate pair, leave >=d800 */ 828 /* part of a surrogate pair, leave >=d800 */
754 } else { 829 } else {
755 /* BMP code point - may be surrogate code point - make <d800 */ 830 /* BMP code point - may be surrogate code point - make <d800 */
756 c2-=0x2800; 831 c2-=0x2800;
757 } 832 }
758 } 833 }
759 834
760 return c1-c2; 835 cmpRes=c1-c2;
836 break;
761 } 837 }
838
839 if(matchLen1) {
840 *matchLen1=m1-org1;
841 *matchLen2=m2-org2;
842 }
843 return cmpRes;
844 }
845
846 /* internal function */
847 U_CFUNC int32_t
848 u_strcmpFold(const UChar *s1, int32_t length1,
849 const UChar *s2, int32_t length2,
850 uint32_t options,
851 UErrorCode *pErrorCode) {
852 return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode);
762 } 853 }
763 854
764 /* public API functions */ 855 /* public API functions */
765 856
766 U_CAPI int32_t U_EXPORT2 857 U_CAPI int32_t U_EXPORT2
767 u_strCaseCompare(const UChar *s1, int32_t length1, 858 u_strCaseCompare(const UChar *s1, int32_t length1,
768 const UChar *s2, int32_t length2, 859 const UChar *s2, int32_t length2,
769 uint32_t options, 860 uint32_t options,
770 UErrorCode *pErrorCode) { 861 UErrorCode *pErrorCode) {
771 /* argument checking */ 862 /* argument checking */
(...skipping 25 matching lines...) Expand all
797 &errorCode); 888 &errorCode);
798 } 889 }
799 890
800 U_CAPI int32_t U_EXPORT2 891 U_CAPI int32_t U_EXPORT2
801 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) { 892 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
802 UErrorCode errorCode=U_ZERO_ERROR; 893 UErrorCode errorCode=U_ZERO_ERROR;
803 return u_strcmpFold(s1, n, s2, n, 894 return u_strcmpFold(s1, n, s2, n,
804 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE), 895 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
805 &errorCode); 896 &errorCode);
806 } 897 }
898
899 /* internal API - detect length of shared prefix */
900 U_CAPI void
901 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
902 const UChar *s2, int32_t length2,
903 uint32_t options,
904 int32_t *matchLen1, int32_t *matchLen2,
905 UErrorCode *pErrorCode) {
906 _cmpFold(s1, length1, s2, length2, options,
907 matchLen1, matchLen2, pErrorCode);
908 }
OLDNEW
« no previous file with comments | « source/common/ustr_imp.h ('k') | source/common/ustrenum.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698