src/runtime/runtime-strings.cc - Issue 2533033003: Fix the uppercasing of U+00E7(ç) and U+00F7(÷)

Side by Side Diff: src/runtime/runtime-strings.cc

Issue 2533033003: Fix the uppercasing of U+00E7(ç) and U+00F7(÷) (Closed)

Patch Set: a bit more tweaks + tests Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/runtime/runtime-utils.h"	5 #include "src/runtime/runtime-utils.h"

6	6

7 #include "src/arguments.h"	7 #include "src/arguments.h"

8 #include "src/regexp/jsregexp-inl.h"	8 #include "src/regexp/jsregexp-inl.h"

9 #include "src/string-builder.h"	9 #include "src/string-builder.h"

10 #include "src/string-search.h"	10 #include "src/string-search.h"

(...skipping 661 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
672 return result;	672 return result;

673 } else {	673 } else {

674 // If we didn't actually change anything in doing the conversion	674 // If we didn't actually change anything in doing the conversion

675 // we simply return the result and let the converted string	675 // we simply return the result and let the converted string

676 // become garbage; there is no reason to keep two identical strings	676 // become garbage; there is no reason to keep two identical strings

677 // alive.	677 // alive.

678 return string;	678 return string;

679 }	679 }

680 }	680 }

681	681

682

683 static const uintptr_t kOneInEveryByte = kUintptrAllBitsSet / 0xFF;

684 static const uintptr_t kAsciiMask = kOneInEveryByte << 7;

685

686 // Given a word and two range boundaries returns a word with high bit

687 // set in every byte iff the corresponding input byte was strictly in

688 // the range (m, n). All the other bits in the result are cleared.

689 // This function is only useful when it can be inlined and the

690 // boundaries are statically known.

691 // Requires: all bytes in the input word and the boundaries must be

692 // ASCII (less than 0x7F).

693 static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {

694 // Use strict inequalities since in edge cases the function could be

695 // further simplified.

696 DCHECK(0 < m && m < n);

697 // Has high bit set in every w byte less than n.

698 uintptr_t tmp1 = kOneInEveryByte * (0x7F + n) - w;

699 // Has high bit set in every w byte greater than m.

700 uintptr_t tmp2 = w + kOneInEveryByte * (0x7F - m);

701 return (tmp1 & tmp2 & (kOneInEveryByte * 0x80));

702 }

703

704

705 #ifdef DEBUG

706 static bool CheckFastAsciiConvert(char* dst, const char* src, int length,

707 bool changed, bool is_to_lower) {

708 bool expected_changed = false;

709 for (int i = 0; i < length; i++) {

710 if (dst[i] == src[i]) continue;

711 expected_changed = true;

712 if (is_to_lower) {

713 DCHECK('A' <= src[i] && src[i] <= 'Z');

714 DCHECK(dst[i] == src[i] + ('a' - 'A'));

715 } else {

716 DCHECK('a' <= src[i] && src[i] <= 'z');

717 DCHECK(dst[i] == src[i] - ('a' - 'A'));

718 }

719 }

720 return (expected_changed == changed);

721 }

722 #endif

723

724

725 template <class Converter>

726 static bool FastAsciiConvert(char* dst, const char* src, int length,

727 bool* changed_out) {

728 #ifdef DEBUG

729 char* saved_dst = dst;

730 const char* saved_src = src;

731 #endif

732 DisallowHeapAllocation no_gc;

733 // We rely on the distance between upper and lower case letters

734 // being a known power of 2.

735 DCHECK('a' - 'A' == (1 << 5));

736 // Boundaries for the range of input characters that require conversion.

737 static const char lo = Converter::kIsToLower ? 'A' - 1 : 'a' - 1;

738 static const char hi = Converter::kIsToLower ? 'Z' + 1 : 'z' + 1;

739 bool changed = false;

740 uintptr_t or_acc = 0;

741 const char* const limit = src + length;

742

743 // dst is newly allocated and always aligned.

744 DCHECK(IsAligned(reinterpret_cast<intptr_t>(dst), sizeof(uintptr_t)));

745 // Only attempt processing one word at a time if src is also aligned.

746 if (IsAligned(reinterpret_cast<intptr_t>(src), sizeof(uintptr_t))) {

747 // Process the prefix of the input that requires no conversion one aligned

748 // (machine) word at a time.

749 while (src <= limit - sizeof(uintptr_t)) {

750 const uintptr_t w = *reinterpret_cast<const uintptr_t*>(src);

751 or_acc |= w;

752 if (AsciiRangeMask(w, lo, hi) != 0) {

753 changed = true;

754 break;

755 }

756 *reinterpret_cast<uintptr_t*>(dst) = w;

757 src += sizeof(uintptr_t);

758 dst += sizeof(uintptr_t);

759 }

760 // Process the remainder of the input performing conversion when

761 // required one word at a time.

762 while (src <= limit - sizeof(uintptr_t)) {

763 const uintptr_t w = *reinterpret_cast<const uintptr_t*>(src);

764 or_acc |= w;

765 uintptr_t m = AsciiRangeMask(w, lo, hi);

766 // The mask has high (7th) bit set in every byte that needs

767 // conversion and we know that the distance between cases is

768 // 1 << 5.

769 *reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);

770 src += sizeof(uintptr_t);

771 dst += sizeof(uintptr_t);

772 }

773 }

774 // Process the last few bytes of the input (or the whole input if

775 // unaligned access is not supported).

776 while (src < limit) {

777 char c = *src;

778 or_acc |= c;

779 if (lo < c && c < hi) {

780 c ^= (1 << 5);

781 changed = true;

782 }

783 *dst = c;

784 ++src;

785 ++dst;

786 }

787

788 if ((or_acc & kAsciiMask) != 0) return false;

789

790 DCHECK(CheckFastAsciiConvert(saved_dst, saved_src, length, changed,

791 Converter::kIsToLower));

792

793 *changed_out = changed;

794 return true;

795 }

796

797

798 template <class Converter>	682 template <class Converter>

799 MUST_USE_RESULT static Object* ConvertCase(	683 MUST_USE_RESULT static Object* ConvertCase(

800 Handle<String> s, Isolate* isolate,	684 Handle<String> s, Isolate* isolate,

801 unibrow::Mapping<Converter, 128>* mapping) {	685 unibrow::Mapping<Converter, 128>* mapping) {

802 s = String::Flatten(s);	686 s = String::Flatten(s);

803 int length = s->length();	687 int length = s->length();

804 // Assume that the string is not empty; we need this assumption later	688 // Assume that the string is not empty; we need this assumption later

805 if (length == 0) return *s;	689 if (length == 0) return *s;

806	690

807 // Simpler handling of ASCII strings.	691 // Simpler handling of ASCII strings.

808 //	692 //

809 // NOTE: This assumes that the upper/lower case of an ASCII	693 // NOTE: This assumes that the upper/lower case of an ASCII

810 // character is also ASCII. This is currently the case, but it	694 // character is also ASCII. This is currently the case, but it

811 // might break in the future if we implement more context and locale	695 // might break in the future if we implement more context and locale

812 // dependent upper/lower conversions.	696 // dependent upper/lower conversions.

813 if (s->IsOneByteRepresentationUnderneath()) {	697 if (s->IsOneByteRepresentationUnderneath()) {

814 // Same length as input.	698 // Same length as input.

815 Handle<SeqOneByteString> result =	699 Handle<SeqOneByteString> result =

816 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();	700 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

817 DisallowHeapAllocation no_gc;	701 DisallowHeapAllocation no_gc;

818 String::FlatContent flat_content = s->GetFlatContent();	702 String::FlatContent flat_content = s->GetFlatContent();

819 DCHECK(flat_content.IsFlat());	703 DCHECK(flat_content.IsFlat());

820 bool has_changed_character = false;	704 bool has_changed_character = false;

821 bool is_ascii = FastAsciiConvert<Converter>(	705 bool is_ascii = FastAsciiConvert<Converter::kIsToLower>(

822 reinterpret_cast<char*>(result->GetChars()),	706 reinterpret_cast<char*>(result->GetChars()),

823 reinterpret_cast<const char*>(flat_content.ToOneByteVector().start()),	707 reinterpret_cast<const char*>(flat_content.ToOneByteVector().start()),

824 length, &has_changed_character);	708 length, &has_changed_character);

825 // If not ASCII, we discard the result and take the 2 byte path.	709 // If not ASCII, we discard the result and take the 2 byte path.

826 if (is_ascii) return has_changed_character ? result : s;	710 if (is_ascii) return has_changed_character ? result : s;

827 }	711 }

828	712

829 Handle<SeqString> result; // Same length as input.	713 Handle<SeqString> result; // Same length as input.

830 if (s->IsOneByteRepresentation()) {	714 if (s->IsOneByteRepresentation()) {

831 result = isolate->factory()->NewRawOneByteString(length).ToHandleChecked();	715 result = isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

(...skipping 152 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
984 SealHandleScope shs(isolate);	868 SealHandleScope shs(isolate);

985 DCHECK(args.length() == 2);	869 DCHECK(args.length() == 2);

986 if (!args[0]->IsString()) return isolate->heap()->undefined_value();	870 if (!args[0]->IsString()) return isolate->heap()->undefined_value();

987 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();	871 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();

988 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();	872 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();

989 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);	873 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);

990 }	874 }

991	875

992 } // namespace internal	876 } // namespace internal

993 } // namespace v8	877 } // namespace v8

OLD	NEW

« no previous file with comments | « src/runtime/runtime-i18n.cc ('k') | src/runtime/runtime-utils.h » ('j') | no next file with comments »