OLD | NEW |
1 /* | 1 /* |
2 ******************************************************************************* | 2 ******************************************************************************* |
3 * | 3 * |
4 * Copyright (C) 2001-2014, International Business Machines | 4 * Copyright (C) 2001-2015, International Business Machines |
5 * Corporation and others. All Rights Reserved. | 5 * Corporation and others. All Rights Reserved. |
6 * | 6 * |
7 ******************************************************************************* | 7 ******************************************************************************* |
8 * file name: ustrcase.cpp | 8 * file name: ustrcase.cpp |
9 * encoding: US-ASCII | 9 * encoding: US-ASCII |
10 * tab size: 8 (not used) | 10 * tab size: 8 (not used) |
11 * indentation:4 | 11 * indentation:4 |
12 * | 12 * |
13 * created on: 2002feb20 | 13 * created on: 2002feb20 |
14 * created by: Markus W. Scherer | 14 * created by: Markus W. Scherer |
15 * | 15 * |
16 * Implementation file for string casing C API functions. | 16 * Implementation file for string casing C API functions. |
17 * Uses functions from uchar.c for basic functionality that requires access | 17 * Uses functions from uchar.c for basic functionality that requires access |
18 * to the Unicode Character Database (uprops.dat). | 18 * to the Unicode Character Database (uprops.dat). |
19 */ | 19 */ |
20 | 20 |
21 #include "unicode/utypes.h" | 21 #include "unicode/utypes.h" |
22 #include "unicode/brkiter.h" | 22 #include "unicode/brkiter.h" |
23 #include "unicode/ustring.h" | 23 #include "unicode/ustring.h" |
24 #include "unicode/ucasemap.h" | 24 #include "unicode/ucasemap.h" |
25 #include "unicode/ubrk.h" | 25 #include "unicode/ubrk.h" |
26 #include "unicode/utf.h" | 26 #include "unicode/utf.h" |
27 #include "unicode/utf16.h" | 27 #include "unicode/utf16.h" |
28 #include "cmemory.h" | 28 #include "cmemory.h" |
29 #include "ucase.h" | 29 #include "ucase.h" |
30 #include "ustr_imp.h" | 30 #include "ustr_imp.h" |
| 31 #include "uassert.h" |
31 | 32 |
32 U_NAMESPACE_USE | 33 U_NAMESPACE_USE |
33 | 34 |
34 /* string casing ------------------------------------------------------------ */ | 35 /* string casing ------------------------------------------------------------ */ |
35 | 36 |
36 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */ | 37 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */ |
37 static inline int32_t | 38 static inline int32_t |
38 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, | 39 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, |
39 int32_t result, const UChar *s) { | 40 int32_t result, const UChar *s) { |
40 UChar32 c; | 41 UChar32 c; |
(...skipping 415 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
456 * It makes caseless (but not canonical caseless) matches independent of | 457 * It makes caseless (but not canonical caseless) matches independent of |
457 * the normalization code. | 458 * the normalization code. |
458 */ | 459 */ |
459 | 460 |
460 /* stack element for previous-level source/decomposition pointers */ | 461 /* stack element for previous-level source/decomposition pointers */ |
461 struct CmpEquivLevel { | 462 struct CmpEquivLevel { |
462 const UChar *start, *s, *limit; | 463 const UChar *start, *s, *limit; |
463 }; | 464 }; |
464 typedef struct CmpEquivLevel CmpEquivLevel; | 465 typedef struct CmpEquivLevel CmpEquivLevel; |
465 | 466 |
466 /* internal function */ | 467 /** |
467 U_CFUNC int32_t | 468 * Internal implementation code comparing string with case fold. |
468 u_strcmpFold(const UChar *s1, int32_t length1, | 469 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch(
). |
469 const UChar *s2, int32_t length2, | 470 * |
470 uint32_t options, | 471 * @param s1 input string 1 |
471 UErrorCode *pErrorCode) { | 472 * @param length1 length of string 1, or -1 (NULL terminated) |
| 473 * @param s2 input string 2 |
| 474 * @param length2 length of string 2, or -1 (NULL terminated) |
| 475 * @param options compare options |
| 476 * @param matchLen1 (output) length of partial prefix match in s1 |
| 477 * @param matchLen2 (output) length of partial prefix match in s2 |
| 478 * @param pErrorCode receives error status |
| 479 * @return The result of comparison |
| 480 */ |
| 481 static int32_t _cmpFold( |
| 482 const UChar *s1, int32_t length1, |
| 483 const UChar *s2, int32_t length2, |
| 484 uint32_t options, |
| 485 int32_t *matchLen1, int32_t *matchLen2, |
| 486 UErrorCode *pErrorCode) { |
| 487 int32_t cmpRes = 0; |
| 488 |
472 const UCaseProps *csp; | 489 const UCaseProps *csp; |
473 | 490 |
474 /* current-level start/limit - s1/s2 as current */ | 491 /* current-level start/limit - s1/s2 as current */ |
475 const UChar *start1, *start2, *limit1, *limit2; | 492 const UChar *start1, *start2, *limit1, *limit2; |
476 | 493 |
| 494 /* points to the original start address */ |
| 495 const UChar *org1, *org2; |
| 496 |
| 497 /* points to the end of match + 1 */ |
| 498 const UChar *m1, *m2; |
| 499 |
477 /* case folding variables */ | 500 /* case folding variables */ |
478 const UChar *p; | 501 const UChar *p; |
479 int32_t length; | 502 int32_t length; |
480 | 503 |
481 /* stacks of previous-level start/current/limit */ | 504 /* stacks of previous-level start/current/limit */ |
482 CmpEquivLevel stack1[2], stack2[2]; | 505 CmpEquivLevel stack1[2], stack2[2]; |
483 | 506 |
484 /* case folding buffers, only use current-level start/limit */ | 507 /* case folding buffers, only use current-level start/limit */ |
485 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; | 508 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; |
486 | 509 |
487 /* track which is the current level per string */ | 510 /* track which is the current level per string */ |
488 int32_t level1, level2; | 511 int32_t level1, level2; |
489 | 512 |
490 /* current code units, and code points for lookups */ | 513 /* current code units, and code points for lookups */ |
491 UChar32 c1, c2, cp1, cp2; | 514 UChar32 c1, c2, cp1, cp2; |
492 | 515 |
493 /* no argument error checking because this itself is not an API */ | 516 /* no argument error checking because this itself is not an API */ |
494 | 517 |
495 /* | 518 /* |
496 * assume that at least the option U_COMPARE_IGNORE_CASE is set | 519 * assume that at least the option U_COMPARE_IGNORE_CASE is set |
497 * otherwise this function would have to behave exactly as uprv_strCompare() | 520 * otherwise this function would have to behave exactly as uprv_strCompare() |
498 */ | 521 */ |
499 csp=ucase_getSingleton(); | 522 csp=ucase_getSingleton(); |
500 if(U_FAILURE(*pErrorCode)) { | 523 if(U_FAILURE(*pErrorCode)) { |
501 return 0; | 524 return 0; |
502 } | 525 } |
503 | 526 |
504 /* initialize */ | 527 /* initialize */ |
505 start1=s1; | 528 if(matchLen1) { |
| 529 U_ASSERT(matchLen2 !=NULL); |
| 530 *matchLen1=0; |
| 531 *matchLen2=0; |
| 532 } |
| 533 |
| 534 start1=m1=org1=s1; |
506 if(length1==-1) { | 535 if(length1==-1) { |
507 limit1=NULL; | 536 limit1=NULL; |
508 } else { | 537 } else { |
509 limit1=s1+length1; | 538 limit1=s1+length1; |
510 } | 539 } |
511 | 540 |
512 start2=s2; | 541 start2=m2=org2=s2; |
513 if(length2==-1) { | 542 if(length2==-1) { |
514 limit2=NULL; | 543 limit2=NULL; |
515 } else { | 544 } else { |
516 limit2=s2+length2; | 545 limit2=s2+length2; |
517 } | 546 } |
518 | 547 |
519 level1=level2=0; | 548 level1=level2=0; |
520 c1=c2=-1; | 549 c1=c2=-1; |
521 | 550 |
522 /* comparison loop */ | 551 /* comparison loop */ |
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
570 s2=stack2[level2].s; /*Not uninitialized*/ | 599 s2=stack2[level2].s; /*Not uninitialized*/ |
571 limit2=stack2[level2].limit; /*Not uninitialized*/ | 600 limit2=stack2[level2].limit; /*Not uninitialized*/ |
572 } | 601 } |
573 } | 602 } |
574 | 603 |
575 /* | 604 /* |
576 * compare c1 and c2 | 605 * compare c1 and c2 |
577 * either variable c1, c2 is -1 only if the corresponding string is fini
shed | 606 * either variable c1, c2 is -1 only if the corresponding string is fini
shed |
578 */ | 607 */ |
579 if(c1==c2) { | 608 if(c1==c2) { |
| 609 const UChar *next1, *next2; |
| 610 |
580 if(c1<0) { | 611 if(c1<0) { |
581 return 0; /* c1==c2==-1 indicating end of strings */ | 612 cmpRes=0; /* c1==c2==-1 indicating end of strings */ |
| 613 break; |
| 614 } |
| 615 |
| 616 /* |
| 617 * Note: Move the match positions in both strings at the same time |
| 618 * only when corresponding code point(s) in the original string
s |
| 619 * are fully consumed. For example, when comparing s1="Fust" an
d |
| 620 * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matc
hes |
| 621 * the first code point in the case-folded data. But the second
"s" |
| 622 * has no matching code point in s1, so this implementation ret
urns |
| 623 * 2 as the prefix match length ("Fu"). |
| 624 */ |
| 625 next1=next2=NULL; |
| 626 if(level1==0) { |
| 627 next1=s1; |
| 628 } else if(s1==limit1) { |
| 629 /* Note: This implementation only use a single level of stack. |
| 630 * If this code needs to be changed to use multiple levels |
| 631 * of stacks, the code above should check if the current |
| 632 * code is at the end of all stacks. |
| 633 */ |
| 634 U_ASSERT(level1==1); |
| 635 |
| 636 /* is s1 at the end of the current stack? */ |
| 637 next1=stack1[0].s; |
| 638 } |
| 639 |
| 640 if (next1!=NULL) { |
| 641 if(level2==0) { |
| 642 next2=s2; |
| 643 } else if(s2==limit2) { |
| 644 U_ASSERT(level2==1); |
| 645 |
| 646 /* is s2 at the end of the current stack? */ |
| 647 next2=stack2[0].s; |
| 648 } |
| 649 if(next2!=NULL) { |
| 650 m1=next1; |
| 651 m2=next2; |
| 652 } |
582 } | 653 } |
583 c1=c2=-1; /* make us fetch new code units */ | 654 c1=c2=-1; /* make us fetch new code units */ |
584 continue; | 655 continue; |
585 } else if(c1<0) { | 656 } else if(c1<0) { |
586 return -1; /* string 1 ends before string 2 */ | 657 cmpRes=-1; /* string 1 ends before string 2 */ |
| 658 break; |
587 } else if(c2<0) { | 659 } else if(c2<0) { |
588 return 1; /* string 2 ends before string 1 */ | 660 cmpRes=1; /* string 2 ends before string 1 */ |
| 661 break; |
589 } | 662 } |
590 /* c1!=c2 && c1>=0 && c2>=0 */ | 663 /* c1!=c2 && c1>=0 && c2>=0 */ |
591 | 664 |
592 /* get complete code points for c1, c2 for lookups if either is a surrog
ate */ | 665 /* get complete code points for c1, c2 for lookups if either is a surrog
ate */ |
593 cp1=c1; | 666 cp1=c1; |
594 if(U_IS_SURROGATE(c1)) { | 667 if(U_IS_SURROGATE(c1)) { |
595 UChar c; | 668 UChar c; |
596 | 669 |
597 if(U_IS_SURROGATE_LEAD(c1)) { | 670 if(U_IS_SURROGATE_LEAD(c1)) { |
598 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { | 671 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
637 ++s1; | 710 ++s1; |
638 } else /* isTrail(c1) */ { | 711 } else /* isTrail(c1) */ { |
639 /* | 712 /* |
640 * we got a supplementary code point when hitting its trail
surrogate, | 713 * we got a supplementary code point when hitting its trail
surrogate, |
641 * therefore the lead surrogate must have been the same as i
n the other string; | 714 * therefore the lead surrogate must have been the same as i
n the other string; |
642 * compare this decomposition with the lead surrogate in the
other string | 715 * compare this decomposition with the lead surrogate in the
other string |
643 * remember that this simulates bulk text replacement: | 716 * remember that this simulates bulk text replacement: |
644 * the decomposition would replace the entire code point | 717 * the decomposition would replace the entire code point |
645 */ | 718 */ |
646 --s2; | 719 --s2; |
| 720 --m2; |
647 c2=*(s2-1); | 721 c2=*(s2-1); |
648 } | 722 } |
649 } | 723 } |
650 | 724 |
651 /* push current level pointers */ | 725 /* push current level pointers */ |
652 stack1[0].start=start1; | 726 stack1[0].start=start1; |
653 stack1[0].s=s1; | 727 stack1[0].s=s1; |
654 stack1[0].limit=limit1; | 728 stack1[0].limit=limit1; |
655 ++level1; | 729 ++level1; |
656 | 730 |
(...skipping 25 matching lines...) Expand all Loading... |
682 ++s2; | 756 ++s2; |
683 } else /* isTrail(c2) */ { | 757 } else /* isTrail(c2) */ { |
684 /* | 758 /* |
685 * we got a supplementary code point when hitting its trail
surrogate, | 759 * we got a supplementary code point when hitting its trail
surrogate, |
686 * therefore the lead surrogate must have been the same as i
n the other string; | 760 * therefore the lead surrogate must have been the same as i
n the other string; |
687 * compare this decomposition with the lead surrogate in the
other string | 761 * compare this decomposition with the lead surrogate in the
other string |
688 * remember that this simulates bulk text replacement: | 762 * remember that this simulates bulk text replacement: |
689 * the decomposition would replace the entire code point | 763 * the decomposition would replace the entire code point |
690 */ | 764 */ |
691 --s1; | 765 --s1; |
| 766 --m2; |
692 c1=*(s1-1); | 767 c1=*(s1-1); |
693 } | 768 } |
694 } | 769 } |
695 | 770 |
696 /* push current level pointers */ | 771 /* push current level pointers */ |
697 stack2[0].start=start2; | 772 stack2[0].start=start2; |
698 stack2[0].s=s2; | 773 stack2[0].s=s2; |
699 stack2[0].limit=limit2; | 774 stack2[0].limit=limit2; |
700 ++level2; | 775 ++level2; |
701 | 776 |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
750 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || | 825 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || |
751 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) | 826 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) |
752 ) { | 827 ) { |
753 /* part of a surrogate pair, leave >=d800 */ | 828 /* part of a surrogate pair, leave >=d800 */ |
754 } else { | 829 } else { |
755 /* BMP code point - may be surrogate code point - make <d800 */ | 830 /* BMP code point - may be surrogate code point - make <d800 */ |
756 c2-=0x2800; | 831 c2-=0x2800; |
757 } | 832 } |
758 } | 833 } |
759 | 834 |
760 return c1-c2; | 835 cmpRes=c1-c2; |
| 836 break; |
761 } | 837 } |
| 838 |
| 839 if(matchLen1) { |
| 840 *matchLen1=m1-org1; |
| 841 *matchLen2=m2-org2; |
| 842 } |
| 843 return cmpRes; |
| 844 } |
| 845 |
| 846 /* internal function */ |
| 847 U_CFUNC int32_t |
| 848 u_strcmpFold(const UChar *s1, int32_t length1, |
| 849 const UChar *s2, int32_t length2, |
| 850 uint32_t options, |
| 851 UErrorCode *pErrorCode) { |
| 852 return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode); |
762 } | 853 } |
763 | 854 |
764 /* public API functions */ | 855 /* public API functions */ |
765 | 856 |
766 U_CAPI int32_t U_EXPORT2 | 857 U_CAPI int32_t U_EXPORT2 |
767 u_strCaseCompare(const UChar *s1, int32_t length1, | 858 u_strCaseCompare(const UChar *s1, int32_t length1, |
768 const UChar *s2, int32_t length2, | 859 const UChar *s2, int32_t length2, |
769 uint32_t options, | 860 uint32_t options, |
770 UErrorCode *pErrorCode) { | 861 UErrorCode *pErrorCode) { |
771 /* argument checking */ | 862 /* argument checking */ |
(...skipping 25 matching lines...) Expand all Loading... |
797 &errorCode); | 888 &errorCode); |
798 } | 889 } |
799 | 890 |
800 U_CAPI int32_t U_EXPORT2 | 891 U_CAPI int32_t U_EXPORT2 |
801 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) { | 892 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) { |
802 UErrorCode errorCode=U_ZERO_ERROR; | 893 UErrorCode errorCode=U_ZERO_ERROR; |
803 return u_strcmpFold(s1, n, s2, n, | 894 return u_strcmpFold(s1, n, s2, n, |
804 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE), | 895 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE), |
805 &errorCode); | 896 &errorCode); |
806 } | 897 } |
| 898 |
| 899 /* internal API - detect length of shared prefix */ |
| 900 U_CAPI void |
| 901 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1, |
| 902 const UChar *s2, int32_t length2, |
| 903 uint32_t options, |
| 904 int32_t *matchLen1, int32_t *matchLen2, |
| 905 UErrorCode *pErrorCode) { |
| 906 _cmpFold(s1, length1, s2, length2, options, |
| 907 matchLen1, matchLen2, pErrorCode); |
| 908 } |
OLD | NEW |